torchaudio 2.8.0-cp312-cp312-win_amd64.whl → 2.9.0-cp312-cp312-win_amd64.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release: this version of torchaudio might be problematic.

Files changed (92)
  1. torchaudio/__init__.py +179 -39
  2. torchaudio/_extension/__init__.py +1 -14
  3. torchaudio/_extension/utils.py +0 -47
  4. torchaudio/_internal/module_utils.py +12 -3
  5. torchaudio/_torchcodec.py +73 -85
  6. torchaudio/datasets/cmuarctic.py +1 -1
  7. torchaudio/datasets/utils.py +1 -1
  8. torchaudio/functional/__init__.py +0 -2
  9. torchaudio/functional/_alignment.py +1 -1
  10. torchaudio/functional/filtering.py +70 -55
  11. torchaudio/functional/functional.py +26 -60
  12. torchaudio/lib/_torchaudio.pyd +0 -0
  13. torchaudio/lib/libtorchaudio.pyd +0 -0
  14. torchaudio/models/decoder/__init__.py +14 -2
  15. torchaudio/models/decoder/_ctc_decoder.py +6 -6
  16. torchaudio/models/decoder/_cuda_ctc_decoder.py +1 -1
  17. torchaudio/models/squim/objective.py +2 -2
  18. torchaudio/pipelines/_source_separation_pipeline.py +1 -1
  19. torchaudio/pipelines/_squim_pipeline.py +2 -2
  20. torchaudio/pipelines/_tts/utils.py +1 -1
  21. torchaudio/pipelines/rnnt_pipeline.py +4 -4
  22. torchaudio/transforms/__init__.py +1 -0
  23. torchaudio/transforms/_transforms.py +2 -2
  24. torchaudio/utils/__init__.py +2 -9
  25. torchaudio/utils/download.py +1 -3
  26. torchaudio/version.py +2 -2
  27. {torchaudio-2.8.0.dist-info → torchaudio-2.9.0.dist-info}/METADATA +8 -11
  28. torchaudio-2.9.0.dist-info/RECORD +85 -0
  29. {torchaudio-2.8.0.dist-info → torchaudio-2.9.0.dist-info}/top_level.txt +0 -1
  30. torchaudio/_backend/__init__.py +0 -61
  31. torchaudio/_backend/backend.py +0 -53
  32. torchaudio/_backend/common.py +0 -52
  33. torchaudio/_backend/ffmpeg.py +0 -334
  34. torchaudio/_backend/soundfile.py +0 -54
  35. torchaudio/_backend/soundfile_backend.py +0 -457
  36. torchaudio/_backend/sox.py +0 -91
  37. torchaudio/_backend/utils.py +0 -350
  38. torchaudio/backend/__init__.py +0 -8
  39. torchaudio/backend/_no_backend.py +0 -25
  40. torchaudio/backend/_sox_io_backend.py +0 -294
  41. torchaudio/backend/common.py +0 -13
  42. torchaudio/backend/no_backend.py +0 -14
  43. torchaudio/backend/soundfile_backend.py +0 -14
  44. torchaudio/backend/sox_io_backend.py +0 -14
  45. torchaudio/io/__init__.py +0 -20
  46. torchaudio/io/_effector.py +0 -347
  47. torchaudio/io/_playback.py +0 -72
  48. torchaudio/kaldi_io.py +0 -150
  49. torchaudio/prototype/__init__.py +0 -0
  50. torchaudio/prototype/datasets/__init__.py +0 -4
  51. torchaudio/prototype/datasets/musan.py +0 -68
  52. torchaudio/prototype/functional/__init__.py +0 -26
  53. torchaudio/prototype/functional/_dsp.py +0 -441
  54. torchaudio/prototype/functional/_rir.py +0 -382
  55. torchaudio/prototype/functional/functional.py +0 -193
  56. torchaudio/prototype/models/__init__.py +0 -39
  57. torchaudio/prototype/models/_conformer_wav2vec2.py +0 -801
  58. torchaudio/prototype/models/_emformer_hubert.py +0 -337
  59. torchaudio/prototype/models/conv_emformer.py +0 -529
  60. torchaudio/prototype/models/hifi_gan.py +0 -342
  61. torchaudio/prototype/models/rnnt.py +0 -717
  62. torchaudio/prototype/models/rnnt_decoder.py +0 -402
  63. torchaudio/prototype/pipelines/__init__.py +0 -21
  64. torchaudio/prototype/pipelines/_vggish/__init__.py +0 -7
  65. torchaudio/prototype/pipelines/_vggish/_vggish_impl.py +0 -236
  66. torchaudio/prototype/pipelines/_vggish/_vggish_pipeline.py +0 -83
  67. torchaudio/prototype/pipelines/hifigan_pipeline.py +0 -233
  68. torchaudio/prototype/pipelines/rnnt_pipeline.py +0 -58
  69. torchaudio/prototype/transforms/__init__.py +0 -9
  70. torchaudio/prototype/transforms/_transforms.py +0 -461
  71. torchaudio/sox_effects/__init__.py +0 -10
  72. torchaudio/sox_effects/sox_effects.py +0 -275
  73. torchaudio/utils/ffmpeg_utils.py +0 -11
  74. torchaudio/utils/sox_utils.py +0 -118
  75. torchaudio-2.8.0.dist-info/RECORD +0 -145
  76. torio/__init__.py +0 -8
  77. torio/_extension/__init__.py +0 -13
  78. torio/_extension/utils.py +0 -147
  79. torio/io/__init__.py +0 -9
  80. torio/io/_streaming_media_decoder.py +0 -977
  81. torio/io/_streaming_media_encoder.py +0 -502
  82. torio/lib/__init__.py +0 -0
  83. torio/lib/_torio_ffmpeg4.pyd +0 -0
  84. torio/lib/_torio_ffmpeg5.pyd +0 -0
  85. torio/lib/_torio_ffmpeg6.pyd +0 -0
  86. torio/lib/libtorio_ffmpeg4.pyd +0 -0
  87. torio/lib/libtorio_ffmpeg5.pyd +0 -0
  88. torio/lib/libtorio_ffmpeg6.pyd +0 -0
  89. torio/utils/__init__.py +0 -4
  90. torio/utils/ffmpeg_utils.py +0 -275
  91. {torchaudio-2.8.0.dist-info → torchaudio-2.9.0.dist-info}/WHEEL +0 -0
  92. {torchaudio-2.8.0.dist-info → torchaudio-2.9.0.dist-info}/licenses/LICENSE +0 -0
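
The bulk of this diff is removals: the 2.9.0 wheel drops the `torchaudio/_backend/` and `torchaudio/backend/` dispatch layers, the `torio` FFmpeg bindings, `sox_effects`, `kaldi_io`, and the entire `torchaudio.prototype` namespace, while decoding is consolidated behind the reworked `torchaudio/_torchcodec.py`. As a quick smoke test when upgrading, a minimal sketch (the file path is illustrative; assumes the torchcodec dependency that 2.9.0 uses for decoding is installed):

    import torchaudio

    print(torchaudio.__version__)  # expect a "2.9.0" string for the new wheel

    # Basic loading still works, but is now routed through torchcodec rather
    # than the removed sox/soundfile/FFmpeg backend dispatch.
    waveform, sample_rate = torchaudio.load("sample.wav")  # illustrative path
    print(waveform.shape, sample_rate)  # (channels, frames), sample rate in Hz

The three hunks shown below are the deleted prototype sources whose line counts match items 54, 55, and 56 in the list above.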
torchaudio/prototype/functional/_rir.py
@@ -1,382 +0,0 @@
- import math
- from typing import Optional, Tuple, Union
- from torchaudio._internal.module_utils import dropping_support
-
- import torch
- import torchaudio
- from torch import Tensor
-
-
- def _compute_image_sources(
-     room: torch.Tensor,
-     source: torch.Tensor,
-     max_order: int,
-     absorption: torch.Tensor,
-     scatter: Optional[torch.Tensor] = None,
- ) -> Tuple[Tensor, Tensor]:
-     """Compute image sources in a shoebox-like room.
-
-     Args:
-         room (torch.Tensor): The 1D Tensor that determines the room size. The shape is
-             `(D,)`, where ``D`` is 2 for a 2D room, or 3 for a 3D room.
-         source (torch.Tensor): The coordinate of the sound source. Tensor with dimensions
-             `(D,)`.
-         max_order (int): The maximum number of reflections of the source.
-         absorption (torch.Tensor): The absorption coefficients of wall materials.
-             ``absorption`` is a Tensor with dimensions `(num_band, num_wall)`.
-             The shape options are ``[(1, 4), (1, 6), (7, 4), (7, 6)]``.
-             ``num_band`` is `1` if the coefficients are the same for all frequencies, or `7`
-             if the coefficients differ across frequencies. `7` refers to the default number
-             of octave bands. (See note in `simulate_rir_ism` method.)
-             ``num_wall`` is `4` if the room is a 2D room, representing absorption coefficients
-             of the ``"west"``, ``"east"``, ``"south"``, and ``"north"`` walls, respectively.
-             It is `6` if the room is a 3D room, representing absorption coefficients
-             of the ``"west"``, ``"east"``, ``"south"``, ``"north"``, ``"floor"``, and ``"ceiling"`` walls, respectively.
-         scatter (torch.Tensor): The scattering coefficients of wall materials.
-             The shape of ``scatter`` must match that of ``absorption``. If ``None``, it is not
-             used in image source computation. (Default: ``None``)
-
-     Returns:
-         (torch.Tensor): The coordinates of all image sources within ``max_order`` reflections.
-             Tensor with dimensions `(num_image_source, D)`.
-         (torch.Tensor): The attenuation of the corresponding image sources. Tensor with dimensions
-             `(num_band, num_image_source)`.
-     """
-     if scatter is None:
-         tr = torch.sqrt(1 - absorption)
-     else:
-         tr = torch.sqrt(1 - absorption) * torch.sqrt(1 - scatter)
-
-     ind = torch.arange(-max_order, max_order + 1, device=source.device)
-     if room.shape[0] == 2:
-         XYZ = torch.meshgrid(ind, ind, indexing="ij")
-     else:
-         XYZ = torch.meshgrid(ind, ind, ind, indexing="ij")
-     XYZ = torch.stack([c.reshape((-1,)) for c in XYZ], dim=-1)
-     XYZ = XYZ[XYZ.abs().sum(dim=-1) <= max_order]
-
-     # compute locations of image sources
-     d = room[None, :]
-     s = source[None, :]
-     img_loc = torch.where(XYZ % 2 == 1, d * (XYZ + 1) - s, d * XYZ + s)
-
-     # attenuation
-     exp_lo = abs(torch.floor((XYZ / 2)))
-     exp_hi = abs(torch.floor((XYZ + 1) / 2))
-     t_lo = tr[:, ::2].unsqueeze(1).repeat(1, XYZ.shape[0], 1)  # (num_band, left walls)
-     t_hi = tr[:, 1::2].unsqueeze(1).repeat(1, XYZ.shape[0], 1)  # (num_band, right walls)
-     att = torch.prod((t_lo**exp_lo) * (t_hi**exp_hi), dim=-1)  # (num_band, num_image_source)
-     return img_loc, att
-
-
- def _hann(x: torch.Tensor, T: int):
-     """Compute the Hann window where the values are truncated based on window length.
-     ``torch.hann_window`` can only sample the window function at integer points; this method
-     samples the continuous window function at non-integer points.
-
-     Args:
-         x (torch.Tensor): The fractional component of the time delay Tensor.
-         T (int): The window length of the sinc function.
-
-     Returns:
-         (torch.Tensor): The Hann window Tensor where values outside
-             the sinc window (`T`) are set to zero.
-     """
-     y = torch.where(
-         torch.abs(x) <= T / 2,
-         0.5 * (1 + torch.cos(2 * math.pi * x / T)),
-         x.new_zeros(1),
-     )
-     return y
-
-
- def _frac_delay(delay: torch.Tensor, delay_i: torch.Tensor, delay_filter_length: int):
-     """Compute fractional delay of the impulse response signal.
-
-     Args:
-         delay (torch.Tensor): The time delay Tensor in samples.
-         delay_i (torch.Tensor): The integer part of the delay.
-         delay_filter_length (int): The window length for the sinc function.
-
-     Returns:
-         (torch.Tensor): The impulse response Tensor for all image sources.
-     """
-     if delay_filter_length % 2 != 1:
-         raise ValueError("The filter length must be odd")
-
-     pad = delay_filter_length // 2
-     n = torch.arange(-pad, pad + 1, device=delay.device) + delay_i[..., None]
-     delay = delay[..., None]
-
-     return torch.special.sinc(n - delay) * _hann(n - delay, 2 * pad)
-
-
- def _adjust_coeff(coeffs: Union[float, torch.Tensor], name: str) -> torch.Tensor:
-     """Validate and convert absorption or scattering parameters to a tensor of the appropriate shape.
-
-     Args:
-         coeffs (float or torch.Tensor): The absorption coefficients of wall materials.
-
-             If the dtype is ``float``, the absorption coefficient is identical for all walls and
-             all frequencies.
-
-             If ``coeffs`` is a 1D Tensor, the shape must be `(2*dim,)`,
-             where the values represent absorption coefficients of ``"west"``, ``"east"``,
-             ``"south"``, ``"north"``, ``"floor"``, and ``"ceiling"``, respectively.
-
-             If ``coeffs`` is a 2D Tensor, the shape must be `(7, 2*dim)`,
-             where 7 represents the number of octave bands.
-
-     Returns:
-         (torch.Tensor): The expanded coefficient.
-             The shape is `(1, 6)` for the single octave band case, and
-             `(7, 6)` for the multi octave band case.
-     """
-     num_walls = 6
-     if isinstance(coeffs, float):
-         if coeffs < 0:
-             raise ValueError(f"`{name}` must be non-negative. Found: {coeffs}")
-         return torch.full((1, num_walls), coeffs)
-     if isinstance(coeffs, Tensor):
-         if torch.any(coeffs < 0):
-             raise ValueError(f"`{name}` must be non-negative. Found: {coeffs}")
-         if coeffs.ndim == 1:
-             if coeffs.numel() != num_walls:
-                 raise ValueError(
-                     f"The shape of `{name}` must be ({num_walls},) when it is a 1D Tensor. "
-                     f"Found the shape {coeffs.shape}."
-                 )
-             return coeffs.unsqueeze(0)
-         if coeffs.ndim == 2:
-             if coeffs.shape[1] != num_walls:
-                 raise ValueError(
-                     f"The shape of `{name}` must be (NUM_BANDS, {num_walls}) when it "
-                     f"is a 2D Tensor. Found: {coeffs.shape}."
-                 )
-             return coeffs
-     raise TypeError(f"`{name}` must be float or Tensor.")
-
-
- def _validate_inputs(
-     room: torch.Tensor,
-     source: torch.Tensor,
-     mic_array: torch.Tensor,
- ):
-     """Validate the dimensions of the input arguments.
-
-     Args:
-         room (torch.Tensor): The size of the room: width, length (and height).
-         source (torch.Tensor): Sound source coordinates. Tensor with dimensions `(dim,)`.
-         mic_array (torch.Tensor): Microphone coordinates. Tensor with dimensions `(channel, dim)`.
-     """
-     if not (room.ndim == 1 and room.numel() == 3):
-         raise ValueError(f"`room` must be a 1D Tensor with 3 elements. Found {room.shape}.")
-     if not (source.ndim == 1 and source.numel() == 3):
-         raise ValueError(f"`source` must be a 1D Tensor with 3 elements. Found {source.shape}.")
-     if not (mic_array.ndim == 2 and mic_array.shape[1] == 3):
-         raise ValueError(f"`mic_array` must be a 2D Tensor with shape (num_channels, 3). Found {mic_array.shape}.")
-
-
- @dropping_support
- def simulate_rir_ism(
-     room: torch.Tensor,
-     source: torch.Tensor,
-     mic_array: torch.Tensor,
-     max_order: int,
-     absorption: Union[float, torch.Tensor],
-     output_length: Optional[int] = None,
-     delay_filter_length: int = 81,
-     center_frequency: Optional[torch.Tensor] = None,
-     sound_speed: float = 343.0,
-     sample_rate: float = 16000.0,
- ) -> Tensor:
-     r"""Compute Room Impulse Response (RIR) based on the *image source method* :cite:`allen1979image`.
-     The implementation is based on *pyroomacoustics* :cite:`scheibler2018pyroomacoustics`.
-
-     .. devices:: CPU
-
-     .. properties:: TorchScript
-
-     Args:
-         room (torch.Tensor): Room coordinates. The shape of `room` must be `(3,)`, representing
-             the three dimensions of the room.
-         source (torch.Tensor): Sound source coordinates. Tensor with dimensions `(3,)`.
-         mic_array (torch.Tensor): Microphone coordinates. Tensor with dimensions `(channel, 3)`.
-         max_order (int): The maximum number of reflections of the source.
-         absorption (float or torch.Tensor): The *absorption* :cite:`wiki:Absorption_(acoustics)`
-             coefficients of wall materials for sound energy.
-             If the dtype is ``float``, the absorption coefficient is identical for all walls and
-             all frequencies.
-             If ``absorption`` is a 1D Tensor, the shape must be `(6,)`, where the values represent
-             absorption coefficients of ``"west"``, ``"east"``, ``"south"``, ``"north"``, ``"floor"``,
-             and ``"ceiling"``, respectively.
-             If ``absorption`` is a 2D Tensor, the shape must be `(7, 6)`, where 7 represents the number of octave bands.
-         output_length (int or None, optional): The output length of the simulated RIR signal. If ``None``,
-             the length is defined as
-
-             .. math::
-                 \frac{\text{max\_d} \cdot \text{sample\_rate}}{\text{sound\_speed}} + \text{delay\_filter\_length}
-
-             where ``max_d`` is the maximum distance between image sources and microphones.
-         delay_filter_length (int, optional): The filter length for computing the sinc function. (Default: ``81``)
-         center_frequency (torch.Tensor, optional): The center frequencies of octave bands for multi-band walls.
-             Only used when ``absorption`` is a 2D Tensor.
-         sound_speed (float, optional): The speed of sound. (Default: ``343.0``)
-         sample_rate (float, optional): The sample rate of the generated room impulse response signal.
-             (Default: ``16000.0``)
-
-     Returns:
-         (torch.Tensor): The simulated room impulse response waveform. Tensor with dimensions
-             `(channel, rir_length)`.
-
-     Note:
-         If ``absorption`` is a 2D Tensor and ``center_frequency`` is set to ``None``, the center frequencies
-         of the octave bands are fixed to ``[125.0, 250.0, 500.0, 1000.0, 2000.0, 4000.0, 8000.0]``.
-         Users need to tune the values of ``absorption`` to the corresponding frequencies.
-     """
-     _validate_inputs(room, source, mic_array)
-     absorption = _adjust_coeff(absorption, "absorption")
-     img_location, att = _compute_image_sources(room, source, max_order, absorption)
-
-     # compute distances between image sources and microphones
-     vec = img_location[:, None, :] - mic_array[None, :, :]
-     dist = torch.linalg.norm(vec, dim=-1)  # (image_source, channel)
-
-     img_src_att = att[..., None] / dist[None, ...]  # (band, image_source, channel)
-
-     # separate delays into integer / fractional parts
-     delay = dist * sample_rate / sound_speed  # distance to delay in samples
-     delay_i = torch.ceil(delay)  # integer part
-
-     # compute the short IRs corresponding to each image source
-     irs = img_src_att[..., None] * _frac_delay(delay, delay_i, delay_filter_length)[None, ...]
-
-     rir_length = int(delay_i.max() + irs.shape[-1])
-     rir = torch.ops.torchaudio._simulate_rir(irs, delay_i.type(torch.int32), rir_length)
-
-     # multi-band processing
-     if absorption.shape[0] > 1:
-         if center_frequency is None:
-             center = torch.tensor(
-                 [125.0, 250.0, 500.0, 1000.0, 2000.0, 4000.0, 8000.0], dtype=room.dtype, device=room.device
-             )
-         else:
-             center = center_frequency
-         # n_fft is set to 512 by default.
-         filters = torch.ops.torchaudio._make_rir_filter(center, sample_rate, n_fft=512)
-         rir = torchaudio.functional.fftconvolve(rir, filters.unsqueeze(1).repeat(1, rir.shape[1], 1), mode="same")
-
-     # sum up rir signals of all image sources into one waveform.
-     rir = rir.sum(0)
-
-     if output_length is not None:
-         if output_length > rir.shape[-1]:
-             rir = torch.nn.functional.pad(rir, (0, output_length - rir.shape[-1]), "constant", 0.0)
-         else:
-             rir = rir[..., :output_length]
-
-     return rir
-
-
- @dropping_support
- def ray_tracing(
-     room: torch.Tensor,
-     source: torch.Tensor,
-     mic_array: torch.Tensor,
-     num_rays: int,
-     absorption: Union[float, torch.Tensor] = 0.0,
-     scattering: Union[float, torch.Tensor] = 0.0,
-     mic_radius: float = 0.5,
-     sound_speed: float = 343.0,
-     energy_thres: float = 1e-7,
-     time_thres: float = 10.0,
-     hist_bin_size: float = 0.004,
- ) -> torch.Tensor:
-     r"""Compute an energy histogram via ray tracing.
-
-     The implementation is based on *pyroomacoustics* :cite:`scheibler2018pyroomacoustics`.
-
-     ``num_rays`` rays are cast uniformly in all directions from the source;
-     when a ray intersects a wall, it is reflected and part of its energy is absorbed.
-     It is also scattered (sent directly to the microphone(s)) according to the ``scattering``
-     coefficient.
-     When a ray is close to the microphone, its current energy is recorded in the output
-     histogram for that given time slot.
-
-     .. devices:: CPU
-
-     .. properties:: TorchScript
-
-     Args:
-         room (torch.Tensor): Room coordinates. The shape of `room` must be `(3,)`, representing
-             the three dimensions of the room.
-         source (torch.Tensor): Sound source coordinates. Tensor with dimensions `(3,)`.
-         mic_array (torch.Tensor): Microphone coordinates. Tensor with dimensions `(channel, 3)`.
-         num_rays (int): The number of rays to cast.
-         absorption (float or torch.Tensor, optional): The absorption coefficients of wall materials.
-             (Default: ``0.0``)
-             If the type is ``float``, the absorption coefficient is identical for all walls and
-             all frequencies.
-             If ``absorption`` is a 1D Tensor, the shape must be `(6,)`, representing absorption
-             coefficients of ``"west"``, ``"east"``, ``"south"``, ``"north"``, ``"floor"``, and
-             ``"ceiling"``, respectively.
-             If ``absorption`` is a 2D Tensor, the shape must be `(num_bands, 6)`.
-             ``num_bands`` is the number of frequency bands (usually 7).
-         scattering (float or torch.Tensor, optional): The scattering coefficients of wall materials. (Default: ``0.0``)
-             The shape and type of this parameter are the same as for ``absorption``.
-         mic_radius (float, optional): The radius of the microphone in meters. (Default: ``0.5``)
-         sound_speed (float, optional): The speed of sound in meters per second. (Default: ``343.0``)
-         energy_thres (float, optional): The energy level below which we stop tracing a ray. (Default: ``1e-7``)
-             The initial energy of each ray is ``2 / num_rays``.
-         time_thres (float, optional): The maximal duration for which rays are traced. (Unit: seconds) (Default: ``10.0``)
-         hist_bin_size (float, optional): The size of each bin in the output histogram. (Unit: seconds) (Default: ``0.004``)
-
-     Returns:
-         (torch.Tensor): The 3D histogram(s) where the energy of the traced rays is recorded.
-             Each bin corresponds to a given time slot.
-             The shape is `(channel, num_bands, num_bins)`, where
-             ``num_bins = ceil(time_thres / hist_bin_size)``.
-             If both ``absorption`` and ``scattering`` are floats, then ``num_bands == 1``.
-     """
-     if time_thres < hist_bin_size:
-         raise ValueError(
-             "`time_thres` must be greater than `hist_bin_size`. "
-             f"Found: hist_bin_size={hist_bin_size}, time_thres={time_thres}."
-         )
-
-     if room.dtype != source.dtype or source.dtype != mic_array.dtype:
-         raise ValueError(
-             "dtype of `room`, `source` and `mic_array` must match. "
-             f"Found: `room` ({room.dtype}), `source` ({source.dtype}) and "
-             f"`mic_array` ({mic_array.dtype})"
-         )
-
-     _validate_inputs(room, source, mic_array)
-     absorption = _adjust_coeff(absorption, "absorption").to(room.dtype)
-     scattering = _adjust_coeff(scattering, "scattering").to(room.dtype)
-
-     # Bring absorption and scattering to the same shape
-     if absorption.shape[0] == 1 and scattering.shape[0] > 1:
-         absorption = absorption.expand(scattering.shape)
-     if scattering.shape[0] == 1 and absorption.shape[0] > 1:
-         scattering = scattering.expand(absorption.shape)
-     if absorption.shape != scattering.shape:
-         raise ValueError(
-             "`absorption` and `scattering` must be broadcastable to the same number of bands and walls. "
-             f"Inferred shapes absorption={absorption.shape} and scattering={scattering.shape}"
-         )
-
-     histograms = torch.ops.torchaudio.ray_tracing(
-         room,
-         source,
-         mic_array,
-         num_rays,
-         absorption,
-         scattering,
-         mic_radius,
-         sound_speed,
-         energy_thres,
-         time_thres,
-         hist_bin_size,
-     )
-
-     return histograms
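
Both functions in this file were decorated with `@dropping_support` in 2.8.0 and are deleted outright in 2.9.0. For readers migrating off the removed prototype API, a minimal sketch of how `simulate_rir_ism` was called against a 2.8.0 install; the room geometry and coordinates below are illustrative, not taken from the source:

    import torch
    import torchaudio.prototype.functional as F  # namespace removed in 2.9.0

    room = torch.tensor([8.0, 6.0, 3.0])         # room size: width, length, height (m)
    source = torch.tensor([2.0, 1.5, 1.2])       # sound source position, shape (3,)
    mic_array = torch.tensor([[5.0, 3.0, 1.0]])  # one microphone, shape (channel, 3)

    # Single-band case: one absorption coefficient shared by all walls and frequencies.
    rir = F.simulate_rir_ism(room, source, mic_array, max_order=3, absorption=0.2)
    print(rir.shape)  # (1, rir_length)
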
torchaudio/prototype/functional/functional.py
@@ -1,193 +0,0 @@
- import math
- import warnings
- from typing import Optional
-
- import torch
- from torchaudio.functional.functional import _create_triangular_filterbank
- from torchaudio._internal.module_utils import dropping_support
-
-
- def _hz_to_bark(freqs: float, bark_scale: str = "traunmuller") -> float:
-     r"""Convert Hz to Barks.
-
-     Args:
-         freqs (float): Frequencies in Hz
-         bark_scale (str, optional): Scale to use: ``traunmuller``, ``schroeder`` or ``wang``. (Default: ``traunmuller``)
-
-     Returns:
-         barks (float): Frequency in Barks
-     """
-
-     if bark_scale not in ["schroeder", "traunmuller", "wang"]:
-         raise ValueError('bark_scale should be one of "schroeder", "traunmuller" or "wang".')
-
-     if bark_scale == "wang":
-         return 6.0 * math.asinh(freqs / 600.0)
-     elif bark_scale == "schroeder":
-         return 7.0 * math.asinh(freqs / 650.0)
-     # Traunmuller Bark scale
-     barks = ((26.81 * freqs) / (1960.0 + freqs)) - 0.53
-     # Bark value correction
-     if barks < 2:
-         barks += 0.15 * (2 - barks)
-     elif barks > 20.1:
-         barks += 0.22 * (barks - 20.1)
-
-     return barks
-
-
- def _bark_to_hz(barks: torch.Tensor, bark_scale: str = "traunmuller") -> torch.Tensor:
-     """Convert bark bin numbers to frequencies.
-
-     Args:
-         barks (torch.Tensor): Bark frequencies
-         bark_scale (str, optional): Scale to use: ``traunmuller``, ``schroeder`` or ``wang``. (Default: ``traunmuller``)
-
-     Returns:
-         freqs (torch.Tensor): Barks converted to Hz
-     """
-
-     if bark_scale not in ["schroeder", "traunmuller", "wang"]:
-         raise ValueError('bark_scale should be one of "traunmuller", "schroeder" or "wang".')
-
-     if bark_scale == "wang":
-         return 600.0 * torch.sinh(barks / 6.0)
-     elif bark_scale == "schroeder":
-         return 650.0 * torch.sinh(barks / 7.0)
-     # Bark value correction
-     if any(barks < 2):
-         idx = barks < 2
-         barks[idx] = (barks[idx] - 0.3) / 0.85
-     elif any(barks > 20.1):
-         idx = barks > 20.1
-         barks[idx] = (barks[idx] + 4.422) / 1.22
-
-     # Traunmuller Bark scale
-     freqs = 1960 * ((barks + 0.53) / (26.28 - barks))
-
-     return freqs
-
-
- def _hz_to_octs(freqs, tuning=0.0, bins_per_octave=12):
-     a440 = 440.0 * 2.0 ** (tuning / bins_per_octave)
-     return torch.log2(freqs / (a440 / 16))
-
-
- @dropping_support
- def barkscale_fbanks(
-     n_freqs: int,
-     f_min: float,
-     f_max: float,
-     n_barks: int,
-     sample_rate: int,
-     bark_scale: str = "traunmuller",
- ) -> torch.Tensor:
-     r"""Create a frequency bin conversion matrix.
-
-     .. devices:: CPU
-
-     .. properties:: TorchScript
-
-     .. image:: https://download.pytorch.org/torchaudio/doc-assets/bark_fbanks.png
-        :alt: Visualization of generated filter bank
-
-     Args:
-         n_freqs (int): Number of frequencies to highlight/apply
-         f_min (float): Minimum frequency (Hz)
-         f_max (float): Maximum frequency (Hz)
-         n_barks (int): Number of Bark filterbanks
-         sample_rate (int): Sample rate of the audio waveform
-         bark_scale (str, optional): Scale to use: ``traunmuller``, ``schroeder`` or ``wang``. (Default: ``traunmuller``)
-
-     Returns:
-         torch.Tensor: Triangular filter banks (fb matrix) of size (``n_freqs``, ``n_barks``),
-         i.e. the number of frequencies to highlight/apply by the number of filterbanks.
-         Each column is a filterbank, so that given a matrix A of
-         size (..., ``n_freqs``), the applied result would be
-         ``A * barkscale_fbanks(A.size(-1), ...)``.
-
-     """
-
-     # freq bins
-     all_freqs = torch.linspace(0, sample_rate // 2, n_freqs)
-
-     # calculate bark freq bins
-     m_min = _hz_to_bark(f_min, bark_scale=bark_scale)
-     m_max = _hz_to_bark(f_max, bark_scale=bark_scale)
-
-     m_pts = torch.linspace(m_min, m_max, n_barks + 2)
-     f_pts = _bark_to_hz(m_pts, bark_scale=bark_scale)
-
-     # create filterbank
-     fb = _create_triangular_filterbank(all_freqs, f_pts)
-
-     if (fb.max(dim=0).values == 0.0).any():
-         warnings.warn(
-             "At least one bark filterbank has all zero values. "
-             f"The value for `n_barks` ({n_barks}) may be set too high. "
-             f"Or, the value for `n_freqs` ({n_freqs}) may be set too low."
-         )
-
-     return fb
-
-
- @dropping_support
- def chroma_filterbank(
-     sample_rate: int,
-     n_freqs: int,
-     n_chroma: int,
-     *,
-     tuning: float = 0.0,
-     ctroct: float = 5.0,
-     octwidth: Optional[float] = 2.0,
-     norm: int = 2,
-     base_c: bool = True,
- ):
-     """Create a frequency-to-chroma conversion matrix. Implementation adapted from librosa.
-
-     Args:
-         sample_rate (int): Sample rate.
-         n_freqs (int): Number of input frequencies.
-         n_chroma (int): Number of output chroma.
-         tuning (float, optional): Tuning deviation from A440 in fractions of a chroma bin. (Default: 0.0)
-         ctroct (float, optional): Center of Gaussian dominance window to weight filters by, in octaves. (Default: 5.0)
-         octwidth (float or None, optional): Width of Gaussian dominance window to weight filters by, in octaves.
-             If ``None``, then disable weighting altogether. (Default: 2.0)
-         norm (int, optional): Order of norm to normalize filter bank by. (Default: 2)
-         base_c (bool, optional): If True, then start filter bank at C. Otherwise, start at A. (Default: True)
-
-     Returns:
-         torch.Tensor: Chroma filter bank, with shape `(n_freqs, n_chroma)`.
-     """
-     # Skip redundant upper half of frequency range.
-     freqs = torch.linspace(0, sample_rate // 2, n_freqs)[1:]
-     freq_bins = n_chroma * _hz_to_octs(freqs, bins_per_octave=n_chroma, tuning=tuning)
-     freq_bins = torch.cat((torch.tensor([freq_bins[0] - 1.5 * n_chroma]), freq_bins))
-     freq_bin_widths = torch.cat(
-         (
-             torch.maximum(freq_bins[1:] - freq_bins[:-1], torch.tensor(1.0)),
-             torch.tensor([1]),
-         )
-     )
-
-     # (n_freqs, n_chroma)
-     D = freq_bins.unsqueeze(1) - torch.arange(0, n_chroma)
-
-     n_chroma2 = round(n_chroma / 2)
-
-     # Project to range [-n_chroma/2, n_chroma/2 - 1]
-     D = torch.remainder(D + n_chroma2, n_chroma) - n_chroma2
-
-     fb = torch.exp(-0.5 * (2 * D / torch.tile(freq_bin_widths.unsqueeze(1), (1, n_chroma))) ** 2)
-     fb = torch.nn.functional.normalize(fb, p=norm, dim=1)
-
-     if octwidth is not None:
-         fb *= torch.tile(
-             torch.exp(-0.5 * (((freq_bins.unsqueeze(1) / n_chroma - ctroct) / octwidth) ** 2)),
-             (1, n_chroma),
-         )
-
-     if base_c:
-         fb = torch.roll(fb, -3 * (n_chroma // 12), dims=1)
-
-     return fb
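
As with the RIR utilities, `barkscale_fbanks` and `chroma_filterbank` disappear along with the prototype namespace. A minimal sketch of the 2.8.0-era usage pattern implied by the docstring above (the spectrogram here is random, purely for shape checking):

    import torch
    import torchaudio.prototype.functional as F  # namespace removed in 2.9.0

    n_fft = 1024
    fb = F.barkscale_fbanks(
        n_freqs=n_fft // 2 + 1, f_min=0.0, f_max=8000.0, n_barks=32, sample_rate=16000
    )

    # Apply the (n_freqs, n_barks) filter bank to a magnitude spectrogram of
    # shape (..., n_freqs, time) by contracting over the frequency axis.
    spec = torch.rand(1, n_fft // 2 + 1, 100)  # illustrative spectrogram
    bark_spec = torch.matmul(spec.transpose(-1, -2), fb).transpose(-1, -2)
    print(bark_spec.shape)  # torch.Size([1, 32, 100])
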
torchaudio/prototype/models/__init__.py
@@ -1,39 +0,0 @@
- from torchaudio._internal.module_utils import dropping_const_support
- from ._conformer_wav2vec2 import (
-     conformer_wav2vec2_base,
-     conformer_wav2vec2_model,
-     conformer_wav2vec2_pretrain_base,
-     conformer_wav2vec2_pretrain_large,
-     conformer_wav2vec2_pretrain_model,
-     ConformerWav2Vec2PretrainModel,
- )
- from ._emformer_hubert import emformer_hubert_base, emformer_hubert_model
- from .conv_emformer import ConvEmformer
- from .hifi_gan import hifigan_vocoder, hifigan_vocoder_v1, hifigan_vocoder_v2, hifigan_vocoder_v3, HiFiGANVocoder
- from .rnnt import conformer_rnnt_base, conformer_rnnt_biasing, conformer_rnnt_biasing_base, conformer_rnnt_model
- from .rnnt_decoder import Hypothesis as _Hypothesis, RNNTBeamSearchBiasing
-
- Hypothesis = dropping_const_support(_Hypothesis, name="Hypothesis")
-
- __all__ = [
-     "conformer_rnnt_base",
-     "conformer_rnnt_model",
-     "conformer_rnnt_biasing",
-     "conformer_rnnt_biasing_base",
-     "ConvEmformer",
-     "conformer_wav2vec2_model",
-     "conformer_wav2vec2_base",
-     "conformer_wav2vec2_pretrain_model",
-     "conformer_wav2vec2_pretrain_base",
-     "conformer_wav2vec2_pretrain_large",
-     "ConformerWav2Vec2PretrainModel",
-     "emformer_hubert_base",
-     "emformer_hubert_model",
-     "Hypothesis",
-     "RNNTBeamSearchBiasing",
-     "HiFiGANVocoder",
-     "hifigan_vocoder_v1",
-     "hifigan_vocoder_v2",
-     "hifigan_vocoder_v3",
-     "hifigan_vocoder",
- ]
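
Projects that import from `torchaudio.prototype.models` will fail at import time on this wheel. One defensive pattern, sketched under the assumption that the caller can degrade gracefully when the prototype models are absent:

    # Sketch: tolerate both 2.8.0 (prototype present) and 2.9.0 (prototype removed).
    try:
        from torchaudio.prototype.models import ConvEmformer  # exists up to 2.8.0
    except ImportError:  # torchaudio >= 2.9.0 removed the prototype namespace
        ConvEmformer = None

    if ConvEmformer is None:
        print("ConvEmformer unavailable; pin torchaudio<2.9 or migrate off prototype APIs.")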