torchaudio-2.8.0-cp310-cp310-win_amd64.whl → torchaudio-2.9.0-cp310-cp310-win_amd64.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release: this version of torchaudio might be problematic.
Files changed (92)
  1. torchaudio/__init__.py +179 -39
  2. torchaudio/_extension/__init__.py +1 -14
  3. torchaudio/_extension/utils.py +0 -47
  4. torchaudio/_internal/module_utils.py +12 -3
  5. torchaudio/_torchcodec.py +73 -85
  6. torchaudio/datasets/cmuarctic.py +1 -1
  7. torchaudio/datasets/utils.py +1 -1
  8. torchaudio/functional/__init__.py +0 -2
  9. torchaudio/functional/_alignment.py +1 -1
  10. torchaudio/functional/filtering.py +70 -55
  11. torchaudio/functional/functional.py +26 -60
  12. torchaudio/lib/_torchaudio.pyd +0 -0
  13. torchaudio/lib/libtorchaudio.pyd +0 -0
  14. torchaudio/models/decoder/__init__.py +14 -2
  15. torchaudio/models/decoder/_ctc_decoder.py +6 -6
  16. torchaudio/models/decoder/_cuda_ctc_decoder.py +1 -1
  17. torchaudio/models/squim/objective.py +2 -2
  18. torchaudio/pipelines/_source_separation_pipeline.py +1 -1
  19. torchaudio/pipelines/_squim_pipeline.py +2 -2
  20. torchaudio/pipelines/_tts/utils.py +1 -1
  21. torchaudio/pipelines/rnnt_pipeline.py +4 -4
  22. torchaudio/transforms/__init__.py +1 -0
  23. torchaudio/transforms/_transforms.py +2 -2
  24. torchaudio/utils/__init__.py +2 -9
  25. torchaudio/utils/download.py +1 -3
  26. torchaudio/version.py +2 -2
  27. {torchaudio-2.8.0.dist-info → torchaudio-2.9.0.dist-info}/METADATA +8 -11
  28. torchaudio-2.9.0.dist-info/RECORD +85 -0
  29. {torchaudio-2.8.0.dist-info → torchaudio-2.9.0.dist-info}/top_level.txt +0 -1
  30. torchaudio/_backend/__init__.py +0 -61
  31. torchaudio/_backend/backend.py +0 -53
  32. torchaudio/_backend/common.py +0 -52
  33. torchaudio/_backend/ffmpeg.py +0 -334
  34. torchaudio/_backend/soundfile.py +0 -54
  35. torchaudio/_backend/soundfile_backend.py +0 -457
  36. torchaudio/_backend/sox.py +0 -91
  37. torchaudio/_backend/utils.py +0 -350
  38. torchaudio/backend/__init__.py +0 -8
  39. torchaudio/backend/_no_backend.py +0 -25
  40. torchaudio/backend/_sox_io_backend.py +0 -294
  41. torchaudio/backend/common.py +0 -13
  42. torchaudio/backend/no_backend.py +0 -14
  43. torchaudio/backend/soundfile_backend.py +0 -14
  44. torchaudio/backend/sox_io_backend.py +0 -14
  45. torchaudio/io/__init__.py +0 -20
  46. torchaudio/io/_effector.py +0 -347
  47. torchaudio/io/_playback.py +0 -72
  48. torchaudio/kaldi_io.py +0 -150
  49. torchaudio/prototype/__init__.py +0 -0
  50. torchaudio/prototype/datasets/__init__.py +0 -4
  51. torchaudio/prototype/datasets/musan.py +0 -68
  52. torchaudio/prototype/functional/__init__.py +0 -26
  53. torchaudio/prototype/functional/_dsp.py +0 -441
  54. torchaudio/prototype/functional/_rir.py +0 -382
  55. torchaudio/prototype/functional/functional.py +0 -193
  56. torchaudio/prototype/models/__init__.py +0 -39
  57. torchaudio/prototype/models/_conformer_wav2vec2.py +0 -801
  58. torchaudio/prototype/models/_emformer_hubert.py +0 -337
  59. torchaudio/prototype/models/conv_emformer.py +0 -529
  60. torchaudio/prototype/models/hifi_gan.py +0 -342
  61. torchaudio/prototype/models/rnnt.py +0 -717
  62. torchaudio/prototype/models/rnnt_decoder.py +0 -402
  63. torchaudio/prototype/pipelines/__init__.py +0 -21
  64. torchaudio/prototype/pipelines/_vggish/__init__.py +0 -7
  65. torchaudio/prototype/pipelines/_vggish/_vggish_impl.py +0 -236
  66. torchaudio/prototype/pipelines/_vggish/_vggish_pipeline.py +0 -83
  67. torchaudio/prototype/pipelines/hifigan_pipeline.py +0 -233
  68. torchaudio/prototype/pipelines/rnnt_pipeline.py +0 -58
  69. torchaudio/prototype/transforms/__init__.py +0 -9
  70. torchaudio/prototype/transforms/_transforms.py +0 -461
  71. torchaudio/sox_effects/__init__.py +0 -10
  72. torchaudio/sox_effects/sox_effects.py +0 -275
  73. torchaudio/utils/ffmpeg_utils.py +0 -11
  74. torchaudio/utils/sox_utils.py +0 -118
  75. torchaudio-2.8.0.dist-info/RECORD +0 -145
  76. torio/__init__.py +0 -8
  77. torio/_extension/__init__.py +0 -13
  78. torio/_extension/utils.py +0 -147
  79. torio/io/__init__.py +0 -9
  80. torio/io/_streaming_media_decoder.py +0 -977
  81. torio/io/_streaming_media_encoder.py +0 -502
  82. torio/lib/__init__.py +0 -0
  83. torio/lib/_torio_ffmpeg4.pyd +0 -0
  84. torio/lib/_torio_ffmpeg5.pyd +0 -0
  85. torio/lib/_torio_ffmpeg6.pyd +0 -0
  86. torio/lib/libtorio_ffmpeg4.pyd +0 -0
  87. torio/lib/libtorio_ffmpeg5.pyd +0 -0
  88. torio/lib/libtorio_ffmpeg6.pyd +0 -0
  89. torio/utils/__init__.py +0 -4
  90. torio/utils/ffmpeg_utils.py +0 -275
  91. {torchaudio-2.8.0.dist-info → torchaudio-2.9.0.dist-info}/WHEEL +0 -0
  92. {torchaudio-2.8.0.dist-info → torchaudio-2.9.0.dist-info}/licenses/LICENSE +0 -0
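Most of the removals above (torchaudio/_backend/, torchaudio/backend/, torchaudio/io/, torchaudio/sox_effects/, torchaudio/kaldi_io.py, the torchaudio.prototype subpackages, and the entire torio package) correspond to dropping the SoX/soundfile/FFmpeg dispatch layer, while the changes to torchaudio/__init__.py and torchaudio/_torchcodec.py indicate that audio decoding and encoding are now routed through TorchCodec. A minimal smoke-test sketch of the post-upgrade I/O path, assuming torchaudio.load and torchaudio.save remain the public entry points in 2.9.0 and that the torchcodec package is installed (this diff alone does not confirm either):

# Hedged sketch: basic I/O after the backend removal. Assumes torchaudio 2.9.0
# delegates decoding/encoding to torchcodec, as suggested by the removal of
# torchaudio/_backend/* and the edits to torchaudio/_torchcodec.py.
import torchaudio

waveform, sample_rate = torchaudio.load("speech.wav")      # (channels, frames) tensor plus sample rate
print(waveform.shape, sample_rate)
torchaudio.save("speech_copy.wav", waveform, sample_rate)  # re-encode to WAV

The single hunk reproduced below is the complete removal of torchaudio/prototype/models/_emformer_hubert.py (entry 58 in the list, +0 -337).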
@@ -1,337 +0,0 @@
-from typing import List, Optional, Tuple
-
-import torch
-from torchaudio.models import Wav2Vec2Model
-from torchaudio.models.emformer import Emformer
-from torchaudio.models.rnnt import _TimeReduction
-from torchaudio._internal.module_utils import dropping_support
-
-
-
-class FeatureEncoder(torch.nn.Module):
-    """Extract features from log-mel spectrogram input. Consists of linear layer and time reduction layer.
-
-    Args:
-        input_dim (int): The feature dimension of log-mel spectrogram feature.
-        output_dim (int): The feature dimension after linear layer.
-        use_bias (bool): If ``True``, enable bias parameter in the linear layer.
-        stride (int): Number of frames to merge for the output frame.
-    """
-
-    def __init__(self, input_dim: int, output_dim: int, use_bias: bool, stride: int):
-        super().__init__()
-        self.linear = torch.nn.Linear(input_dim, output_dim, bias=use_bias)
-        self.time_reduction = _TimeReduction(stride)
-
-    def forward(
-        self, input: torch.Tensor, lengths: Optional[torch.Tensor]
-    ) -> Tuple[torch.Tensor, Optional[torch.Tensor]]:
-        """
-        Args:
-            input (torch.Tensor): The log-mel spectrogram input.
-                Tensor with dimensions `(batch, time, input_dim)`.
-            lengths (torch.Tensor or None): Valid length of each input sample.
-                Tensor with dimension `(batch, )`.
-
-        Returns:
-            (torch.Tensor, torch.Tensor or None):
-            torch.Tensor
-                Returned feature Tensor after linear layer and time reduction layer.
-                Tensor with dimensions `(batch, time // stride, output_dim)`.
-            torch.Tensor or None
-                The reduced lengths Tensor.
-        """
-        output = self.linear(input)
-        if lengths is None:
-            B, T, _ = input.shape
-            dummy_lengths = torch.full((B,), T)
-            output, _ = self.time_reduction(output, dummy_lengths)
-        else:
-            output, lengths = self.time_reduction(output, lengths)
-        return output, lengths
-
-
-class EmformerEncoder(torch.nn.Module):
-    """Emformer Encoder class for HuBERT pre-training. Consists of emformer module,
-    linear layer and layer normalization layer.
-
-    Args:
-        emformer (torch.nn.Module):
-            :py:class:`torchaudio.models.Emformer` module that consists of a list of emformer layers.
-        output_linear (torch.nn.Module):
-            Linear layer after emformer module.
-        layer_norm (torch.nn.Module):
-            Apply layer normalization to the output.
-    """
-
-    def __init__(
-        self,
-        emformer: torch.nn.Module,
-        output_linear: torch.nn.Module,
-        layer_norm: torch.nn.Module,
-    ):
-        super().__init__()
-        self.emformer = emformer
-        self.output_linear = output_linear
-        self.layer_norm = layer_norm
-
-    def forward(
-        self,
-        input: torch.Tensor,
-        lengths: Optional[torch.Tensor],
-    ) -> torch.Tensor:
-        """
-        Args:
-            input (torch.Tensor): The input feature for emformer encoder.
-                Tensor with dimensions `(batch, time, feature_dim)`.
-            lengths (torch.Tensor or None): Valid length of each input sample.
-                Tensor with dimension `(batch, )`.
-
-        Returns:
-            torch.Tensor: The feature Tensor after emformer encoder.
-        """
-        if lengths is None:
-            B, T, _ = input.shape
-            dummy_lengths = torch.full((B,), T)
-            output, _ = self.emformer(input, dummy_lengths)
-        else:
-            output, lengths = self.emformer(input, lengths)
-        output = self.output_linear(output)
-        output = self.layer_norm(output)
-        return output
-
-    def extract_features(
-        self,
-        input: torch.Tensor,
-        lengths: Optional[torch.Tensor],
-        num_layers: Optional[int] = None,
-    ) -> List[torch.Tensor]:
-        """Extract output Tensors of the emformer layers.
-
-        Args:
-            input (torch.Tensor): The input feature for emformer encoder.
-                Tensor with dimensions `(batch, time, feature_dim)`.
-            lengths (torch.Tensor or None): Valid length of each input sample.
-                Tensor with dimension `(batch, )`.
-            num_layers (int or None, optional): If not ``None``, returns the first
-                `num_layers` layers of Tensors as the output, otherwise returns the
-                Tensors from all emformer layers.
-
-        Returns:
-            List[torch.Tensor]:
-                Output Tensors of selected emformer layers.
-        """
-        if num_layers is not None:
-            if not 0 < num_layers <= len(self.emformer.emformer_layers):
-                raise ValueError(f"`num_layers` must be between [1, {len(self.emformer.emformer_layers)}]")
-
-        ret: List[torch.Tensor] = []
-
-        input = input.permute(1, 0, 2)
-        right_context = self.emformer._gen_right_context(input)
-        utterance = input[: input.size(0) - self.emformer.right_context_length]
-        attention_mask = self.emformer._gen_attention_mask(utterance)
-        mems = (
-            self.emformer.memory_op(utterance.permute(1, 2, 0)).permute(2, 0, 1)[:-1]
-            if self.emformer.use_mem
-            else torch.empty(0).to(dtype=input.dtype, device=input.device)
-        )
-        output = utterance
-        if lengths is None:
-            B, T, _ = input.shape
-            lengths = torch.full((B,), T)
-        for layer in self.emformer.emformer_layers:
-            output, right_context, mems = layer(output, lengths, right_context, mems, attention_mask)
-            ret.append(output.permute(1, 0, 2))
-            if num_layers is not None and len(ret) >= num_layers:
-                return ret
-        return ret
-
-
-def _get_emformer_feature_extractor(input_dim: int, output_dim: int, use_bias: bool, stride: int) -> FeatureEncoder:
-    """Construct FeatureEncoder for emformer model.
-
-    Args:
-        input_dim (int): The feature dimension of log-mel spectrogram feature.
-        output_dim (int): The feature dimension after linear layer.
-        use_bias (bool): If ``True``, enable bias parameter in the linear layer.
-        stride (int): Number of frames to merge for the output frame.
-
-    Returns:
-        FeatureEncoder: The resulting FeatureEncoder module.
-    """
-    return FeatureEncoder(input_dim, output_dim, use_bias, stride)
-
-
-def _get_emformer_encoder(
-    input_dim: int,
-    output_dim: int,
-    num_heads: int,
-    ffn_dim: int,
-    num_layers: int,
-    segment_length: int,
-    left_context_length: int,
-    right_context_length: int,
-    dropout: float,
-    activation: str,
-    max_memory_size: int,
-    weight_init_scale_strategy: Optional[str],
-    tanh_on_mem: bool,
-) -> EmformerEncoder:
-    """Construct EmformerEncoder for emformer model.
-
-    Args:
-        input_dim (int): The feature dimension of input Tensor.
-        output_dim (int): The feature dimension after EmformerEncoder.
-        num_heads (int): Number of attention heads in each Emformer layer.
-        ffn_dim: (int): Hidden layer dimension of feedforward network.
-        num_layers (int): Number of Emformer layers to instantiate.
-        segment_length (int): Length of each input segment.
-        left_context_length (int): Length of left context.
-        right_context_length (int): Length of right context.
-        dropout (float): Dropout probability.
-        activation (str): Activation function to use in each Emformer layer's
-            feedforward network. Must be one of ("relu", "gelu", "silu").
-        max_memory_size (int): Maximum number of memory elements to use.
-        weight_init_scale_strategy (str or None): Per-layer weight initialization scaling
-            strategy. Must be one of ("depthwise", "constant", ``None``).
-        tanh_on_mem (bool): If ``True``, applies tanh to memory elements.
-
-    Returns:
-        EmformerEncoder: The resulting EmformerEncoder module.
-    """
-    emformer = Emformer(
-        input_dim=input_dim,
-        num_heads=num_heads,
-        ffn_dim=ffn_dim,
-        num_layers=num_layers,
-        segment_length=segment_length,
-        left_context_length=left_context_length,
-        right_context_length=right_context_length,
-        dropout=dropout,
-        activation=activation,
-        max_memory_size=max_memory_size,
-        weight_init_scale_strategy=weight_init_scale_strategy,
-        tanh_on_mem=tanh_on_mem,
-    )
-    output_linear = torch.nn.Linear(input_dim, output_dim)
-    layer_norm = torch.nn.LayerNorm(output_dim)
-    return EmformerEncoder(emformer, output_linear, layer_norm)
-
-
-@dropping_support
-def emformer_hubert_model(
-    extractor_input_dim: int,
-    extractor_output_dim: int,
-    extractor_use_bias: bool,
-    extractor_stride: int,
-    encoder_input_dim: int,
-    encoder_output_dim: int,
-    encoder_num_heads: int,
-    encoder_ffn_dim: int,
-    encoder_num_layers: int,
-    encoder_segment_length: int,
-    encoder_left_context_length: int,
-    encoder_right_context_length: int,
-    encoder_dropout: float,
-    encoder_activation: str,
-    encoder_max_memory_size: int,
-    encoder_weight_init_scale_strategy: Optional[str],
-    encoder_tanh_on_mem: bool,
-    aux_num_out: Optional[int],
-) -> Wav2Vec2Model:
-    """Build a custom Emformer HuBERT model.
-
-    Args:
-        extractor_input_dim (int): The input dimension for feature extractor.
-        extractor_output_dim (int): The output dimension after feature extractor.
-        extractor_use_bias (bool): If ``True``, enable bias parameter in the linear layer of feature extractor.
-        extractor_stride (int): Number of frames to merge for the output frame in feature extractor.
-        encoder_input_dim (int): The input dimension for Emformer layer.
-        encoder_output_dim (int): The output dimension after EmformerEncoder.
-        encoder_num_heads (int): Number of attention heads in each Emformer layer.
-        encoder_ffn_dim (int): Hidden layer dimension of feedforward network in Emformer.
-        encoder_num_layers (int): Number of Emformer layers to instantiate.
-        encoder_segment_length (int): Length of each input segment.
-        encoder_left_context_length (int): Length of left context.
-        encoder_right_context_length (int): Length of right context.
-        encoder_dropout (float): Dropout probability.
-        encoder_activation (str): Activation function to use in each Emformer layer's
-            feedforward network. Must be one of ("relu", "gelu", "silu").
-        encoder_max_memory_size (int): Maximum number of memory elements to use.
-        encoder_weight_init_scale_strategy (str or None): Per-layer weight initialization scaling
-            strategy. Must be one of ("depthwise", "constant", ``None``).
-        encoder_tanh_on_mem (bool): If ``True``, applies tanh to memory elements.
-        aux_num_out (int or None):
-            When provided, attach an extra linear layer on top of encoder, which can be
-            used for fine-tuning.
-
-    Returns:
-        Wav2Vec2Model:
-            The resulting :py:class:`torchaudio.models.Wav2Vec2Model` model
-            with a :py:class:`torchaudio.models.Emformer` encoder.
-    """
-    feature_extractor = _get_emformer_feature_extractor(
-        extractor_input_dim, extractor_output_dim, extractor_use_bias, extractor_stride
-    )
-    emformer = _get_emformer_encoder(
-        encoder_input_dim,
-        encoder_output_dim,
-        encoder_num_heads,
-        encoder_ffn_dim,
-        encoder_num_layers,
-        encoder_segment_length,
-        encoder_left_context_length,
-        encoder_right_context_length,
-        encoder_dropout,
-        encoder_activation,
-        encoder_max_memory_size,
-        encoder_weight_init_scale_strategy,
-        encoder_tanh_on_mem,
-    )
-    aux = None
-    if aux_num_out is not None:
-        aux = torch.nn.Linear(in_features=encoder_output_dim, out_features=aux_num_out)
-    return Wav2Vec2Model(feature_extractor, emformer, aux)
-
-
-@dropping_support
-def emformer_hubert_base(
-    extractor_input_dim: int = 80,
-    extractor_output_dim: int = 128,
-    encoder_dropout: float = 0.1,
-    aux_num_out: Optional[int] = None,
-) -> Wav2Vec2Model:
-    """Build Emformer HuBERT Model with 20 Emformer layers.
-
-    Args:
-        extractor_input_dim (int, optional): The input dimension for feature extractor. (Default: 80)
-        extractor_output_dim (int, optional): The output dimension after feature extractor. (Default: 128)
-        encoder_dropout (float, optional): Dropout probability in Emformer. (Default: 0.1)
-        aux_num_out (int or None, optional): Output dimension of aux layer for fine-tuning. (Default: ``None``)
-
-    Returns:
-        Wav2Vec2Model:
-            The resulting :py:class:`torchaudio.models.Wav2Vec2Model` model
-            with a :py:class:`torchaudio.models.Emformer` encoder.
-    """
-    return emformer_hubert_model(
-        extractor_input_dim=extractor_input_dim,
-        extractor_output_dim=extractor_output_dim,
-        extractor_use_bias=False,
-        extractor_stride=4,
-        encoder_input_dim=512,
-        encoder_output_dim=1024,
-        encoder_num_heads=8,
-        encoder_ffn_dim=2048,
-        encoder_num_layers=20,
-        encoder_segment_length=4,
-        encoder_left_context_length=30,
-        encoder_right_context_length=1,
-        encoder_dropout=encoder_dropout,
-        encoder_activation="gelu",
-        encoder_max_memory_size=0,
-        encoder_weight_init_scale_strategy="depthwise",
-        encoder_tanh_on_mem=True,
-        aux_num_out=aux_num_out,
-    )
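For reference, the factory functions deleted above were still importable from torchaudio.prototype.models on torchaudio 2.8.0 (already flagged with @dropping_support). A usage sketch based only on the signatures and defaults shown in the removed code, valid on 2.8.0 and earlier; the shapes follow the docstrings (80-dim log-mel frames, stride-4 time reduction into a 512-dim, 20-layer Emformer with 1024-dim output):

# Usage sketch for the removed prototype API (torchaudio <= 2.8.0 only).
import torch
from torchaudio.prototype.models import emformer_hubert_base

model = emformer_hubert_base(aux_num_out=None)

features = torch.randn(2, 400, 80)   # (batch, time, 80) log-mel frames
lengths = torch.full((2,), 400)      # valid frames per batch element

output, out_lengths = model(features, lengths)          # Wav2Vec2Model forward
print(output.shape)                  # (batch, reduced_time, 1024); time shrinks via the stride-4 reduction
hidden, _ = model.extract_features(features, lengths)   # per-layer Emformer outputs
print(len(hidden))                   # 20 layers by default

Since the module no longer ships in 2.9.0, code that depends on it has to pin torchaudio<=2.8.0 or vendor the removed file.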