torchaudio-2.7.1-cp311-cp311-win_amd64.whl → torchaudio-2.9.0-cp311-cp311-win_amd64.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release: this version of torchaudio might be problematic.

Files changed (92)
  1. torchaudio/__init__.py +184 -33
  2. torchaudio/_extension/__init__.py +1 -14
  3. torchaudio/_extension/utils.py +0 -47
  4. torchaudio/_internal/module_utils.py +68 -10
  5. torchaudio/_torchcodec.py +340 -0
  6. torchaudio/datasets/cmuarctic.py +1 -1
  7. torchaudio/datasets/utils.py +1 -1
  8. torchaudio/functional/__init__.py +6 -3
  9. torchaudio/functional/_alignment.py +1 -1
  10. torchaudio/functional/filtering.py +70 -55
  11. torchaudio/functional/functional.py +31 -61
  12. torchaudio/lib/_torchaudio.pyd +0 -0
  13. torchaudio/lib/libtorchaudio.pyd +0 -0
  14. torchaudio/models/decoder/__init__.py +19 -1
  15. torchaudio/models/decoder/_ctc_decoder.py +6 -6
  16. torchaudio/models/decoder/_cuda_ctc_decoder.py +1 -1
  17. torchaudio/models/squim/objective.py +2 -2
  18. torchaudio/pipelines/_source_separation_pipeline.py +1 -1
  19. torchaudio/pipelines/_squim_pipeline.py +2 -2
  20. torchaudio/pipelines/_tts/utils.py +3 -1
  21. torchaudio/pipelines/rnnt_pipeline.py +4 -4
  22. torchaudio/transforms/__init__.py +4 -1
  23. torchaudio/transforms/_transforms.py +4 -3
  24. torchaudio/utils/__init__.py +2 -9
  25. torchaudio/utils/download.py +1 -1
  26. torchaudio/version.py +2 -2
  27. {torchaudio-2.7.1.dist-info → torchaudio-2.9.0.dist-info}/METADATA +15 -7
  28. torchaudio-2.9.0.dist-info/RECORD +85 -0
  29. {torchaudio-2.7.1.dist-info → torchaudio-2.9.0.dist-info}/top_level.txt +0 -1
  30. torchaudio/_backend/__init__.py +0 -61
  31. torchaudio/_backend/backend.py +0 -53
  32. torchaudio/_backend/common.py +0 -52
  33. torchaudio/_backend/ffmpeg.py +0 -334
  34. torchaudio/_backend/soundfile.py +0 -54
  35. torchaudio/_backend/soundfile_backend.py +0 -457
  36. torchaudio/_backend/sox.py +0 -91
  37. torchaudio/_backend/utils.py +0 -317
  38. torchaudio/backend/__init__.py +0 -8
  39. torchaudio/backend/_no_backend.py +0 -25
  40. torchaudio/backend/_sox_io_backend.py +0 -294
  41. torchaudio/backend/common.py +0 -13
  42. torchaudio/backend/no_backend.py +0 -14
  43. torchaudio/backend/soundfile_backend.py +0 -14
  44. torchaudio/backend/sox_io_backend.py +0 -14
  45. torchaudio/io/__init__.py +0 -13
  46. torchaudio/io/_effector.py +0 -347
  47. torchaudio/io/_playback.py +0 -72
  48. torchaudio/kaldi_io.py +0 -144
  49. torchaudio/prototype/__init__.py +0 -0
  50. torchaudio/prototype/datasets/__init__.py +0 -4
  51. torchaudio/prototype/datasets/musan.py +0 -67
  52. torchaudio/prototype/functional/__init__.py +0 -26
  53. torchaudio/prototype/functional/_dsp.py +0 -433
  54. torchaudio/prototype/functional/_rir.py +0 -379
  55. torchaudio/prototype/functional/functional.py +0 -190
  56. torchaudio/prototype/models/__init__.py +0 -36
  57. torchaudio/prototype/models/_conformer_wav2vec2.py +0 -794
  58. torchaudio/prototype/models/_emformer_hubert.py +0 -333
  59. torchaudio/prototype/models/conv_emformer.py +0 -525
  60. torchaudio/prototype/models/hifi_gan.py +0 -336
  61. torchaudio/prototype/models/rnnt.py +0 -711
  62. torchaudio/prototype/models/rnnt_decoder.py +0 -399
  63. torchaudio/prototype/pipelines/__init__.py +0 -12
  64. torchaudio/prototype/pipelines/_vggish/__init__.py +0 -3
  65. torchaudio/prototype/pipelines/_vggish/_vggish_impl.py +0 -233
  66. torchaudio/prototype/pipelines/_vggish/_vggish_pipeline.py +0 -82
  67. torchaudio/prototype/pipelines/hifigan_pipeline.py +0 -228
  68. torchaudio/prototype/pipelines/rnnt_pipeline.py +0 -58
  69. torchaudio/prototype/transforms/__init__.py +0 -9
  70. torchaudio/prototype/transforms/_transforms.py +0 -456
  71. torchaudio/sox_effects/__init__.py +0 -10
  72. torchaudio/sox_effects/sox_effects.py +0 -272
  73. torchaudio/utils/ffmpeg_utils.py +0 -11
  74. torchaudio/utils/sox_utils.py +0 -99
  75. torchaudio-2.7.1.dist-info/RECORD +0 -144
  76. torio/__init__.py +0 -8
  77. torio/_extension/__init__.py +0 -13
  78. torio/_extension/utils.py +0 -147
  79. torio/io/__init__.py +0 -9
  80. torio/io/_streaming_media_decoder.py +0 -978
  81. torio/io/_streaming_media_encoder.py +0 -502
  82. torio/lib/__init__.py +0 -0
  83. torio/lib/_torio_ffmpeg4.pyd +0 -0
  84. torio/lib/_torio_ffmpeg5.pyd +0 -0
  85. torio/lib/_torio_ffmpeg6.pyd +0 -0
  86. torio/lib/libtorio_ffmpeg4.pyd +0 -0
  87. torio/lib/libtorio_ffmpeg5.pyd +0 -0
  88. torio/lib/libtorio_ffmpeg6.pyd +0 -0
  89. torio/utils/__init__.py +0 -4
  90. torio/utils/ffmpeg_utils.py +0 -247
  91. {torchaudio-2.7.1.dist-info → torchaudio-2.9.0.dist-info}/WHEEL +0 -0
  92. {torchaudio-2.7.1.dist-info → torchaudio-2.9.0.dist-info}/licenses/LICENSE +0 -0
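
Taken together, the removals above drop the separate I/O backend modules, `torchaudio.io`, `torchaudio.sox_effects`, `torchaudio.kaldi_io`, the `torchaudio.prototype` namespace, and the entire `torio` package, while adding a new torchcodec-based shim (`torchaudio/_torchcodec.py`). As a rough, hedged sketch (module names are taken directly from the 2.7.1 file paths listed above; whether a given import fails outright or raises a dedicated error in 2.9.0 is not verified here), downstream code can probe for the removed modules like this:

```python
# Hedged illustration only: these module paths appear under the "removed"
# entries in the file list above, so imports that worked against the 2.7.1
# wheel are expected to no longer resolve against this 2.9.0 wheel.
import importlib

removed_in_2_9 = [
    "torchaudio.io",            # StreamReader / StreamWriter wrappers
    "torchaudio.sox_effects",   # SoX effect chains
    "torchaudio.kaldi_io",      # Kaldi ark/scp readers
    "torchaudio.prototype",     # prototype models/pipelines/transforms
    "torio",                    # FFmpeg-backed streaming I/O package
]

for name in removed_in_2_9:
    try:
        importlib.import_module(name)
        print(f"{name}: importable")
    except ImportError:
        print(f"{name}: not present in this wheel")
```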
@@ -1,333 +0,0 @@
- from typing import List, Optional, Tuple
-
- import torch
- from torchaudio.models import Wav2Vec2Model
- from torchaudio.models.emformer import Emformer
- from torchaudio.models.rnnt import _TimeReduction
-
-
- class FeatureEncoder(torch.nn.Module):
-     """Extract features from log-mel spectrogram input. Consists of linear layer and time reduction layer.
-
-     Args:
-         input_dim (int): The feature dimension of log-mel spectrogram feature.
-         output_dim (int): The feature dimension after linear layer.
-         use_bias (bool): If ``True``, enable bias parameter in the linear layer.
-         stride (int): Number of frames to merge for the output frame.
-     """
-
-     def __init__(self, input_dim: int, output_dim: int, use_bias: bool, stride: int):
-         super().__init__()
-         self.linear = torch.nn.Linear(input_dim, output_dim, bias=use_bias)
-         self.time_reduction = _TimeReduction(stride)
-
-     def forward(
-         self, input: torch.Tensor, lengths: Optional[torch.Tensor]
-     ) -> Tuple[torch.Tensor, Optional[torch.Tensor]]:
-         """
-         Args:
-             input (torch.Tensor): The log-mel spectrogram input.
-                 Tensor with dimensions `(batch, time, input_dim)`.
-             lengths (torch.Tensor or None): Valid length of each input sample.
-                 Tensor with dimension `(batch, )`.
-
-         Returns:
-             (torch.Tensor, torch.Tensor or None):
-             torch.Tensor
-                 Returned feature Tensor after linear layer and time reduction layer.
-                 Tensor with dimensions `(batch, time // stride, output_dim)`.
-             torch.Tensor or None
-                 The reduced lengths Tensor.
-         """
-         output = self.linear(input)
-         if lengths is None:
-             B, T, _ = input.shape
-             dummy_lengths = torch.full((B,), T)
-             output, _ = self.time_reduction(output, dummy_lengths)
-         else:
-             output, lengths = self.time_reduction(output, lengths)
-         return output, lengths
-
-
- class EmformerEncoder(torch.nn.Module):
-     """Emformer Encoder class for HuBERT pre-training. Consists of emformer module,
-     linear layer and layer normalization layer.
-
-     Args:
-         emformer (torch.nn.Module):
-             :py:class:`torchaudio.models.Emformer` module that consists of a list of emformer layers.
-         output_linear (torch.nn.Module):
-             Linear layer after emformer module.
-         layer_norm (torch.nn.Module):
-             Apply layer normalization to the output.
-     """
-
-     def __init__(
-         self,
-         emformer: torch.nn.Module,
-         output_linear: torch.nn.Module,
-         layer_norm: torch.nn.Module,
-     ):
-         super().__init__()
-         self.emformer = emformer
-         self.output_linear = output_linear
-         self.layer_norm = layer_norm
-
-     def forward(
-         self,
-         input: torch.Tensor,
-         lengths: Optional[torch.Tensor],
-     ) -> torch.Tensor:
-         """
-         Args:
-             input (torch.Tensor): The input feature for emformer encoder.
-                 Tensor with dimensions `(batch, time, feature_dim)`.
-             lengths (torch.Tensor or None): Valid length of each input sample.
-                 Tensor with dimension `(batch, )`.
-
-         Returns:
-             torch.Tensor: The feature Tensor after emformer encoder.
-         """
-         if lengths is None:
-             B, T, _ = input.shape
-             dummy_lengths = torch.full((B,), T)
-             output, _ = self.emformer(input, dummy_lengths)
-         else:
-             output, lengths = self.emformer(input, lengths)
-         output = self.output_linear(output)
-         output = self.layer_norm(output)
-         return output
-
-     def extract_features(
-         self,
-         input: torch.Tensor,
-         lengths: Optional[torch.Tensor],
-         num_layers: Optional[int] = None,
-     ) -> List[torch.Tensor]:
-         """Extract output Tensors of the emformer layers.
-
-         Args:
-             input (torch.Tensor): The input feature for emformer encoder.
-                 Tensor with dimensions `(batch, time, feature_dim)`.
-             lengths (torch.Tensor or None): Valid length of each input sample.
-                 Tensor with dimension `(batch, )`.
-             num_layers (int or None, optional): If not ``None``, returns the first
-                 `num_layers` layers of Tensors as the output, otherwise returns the
-                 Tensors from all emformer layers.
-
-         Returns:
-             List[torch.Tensor]:
-                 Output Tensors of selected emformer layers.
-         """
-         if num_layers is not None:
-             if not 0 < num_layers <= len(self.emformer.emformer_layers):
-                 raise ValueError(f"`num_layers` must be between [1, {len(self.emformer.emformer_layers)}]")
-
-         ret: List[torch.Tensor] = []
-
-         input = input.permute(1, 0, 2)
-         right_context = self.emformer._gen_right_context(input)
-         utterance = input[: input.size(0) - self.emformer.right_context_length]
-         attention_mask = self.emformer._gen_attention_mask(utterance)
-         mems = (
-             self.emformer.memory_op(utterance.permute(1, 2, 0)).permute(2, 0, 1)[:-1]
-             if self.emformer.use_mem
-             else torch.empty(0).to(dtype=input.dtype, device=input.device)
-         )
-         output = utterance
-         if lengths is None:
-             B, T, _ = input.shape
-             lengths = torch.full((B,), T)
-         for layer in self.emformer.emformer_layers:
-             output, right_context, mems = layer(output, lengths, right_context, mems, attention_mask)
-             ret.append(output.permute(1, 0, 2))
-             if num_layers is not None and len(ret) >= num_layers:
-                 return ret
-         return ret
-
-
- def _get_emformer_feature_extractor(input_dim: int, output_dim: int, use_bias: bool, stride: int) -> FeatureEncoder:
-     """Construct FeatureEncoder for emformer model.
-
-     Args:
-         input_dim (int): The feature dimension of log-mel spectrogram feature.
-         output_dim (int): The feature dimension after linear layer.
-         use_bias (bool): If ``True``, enable bias parameter in the linear layer.
-         stride (int): Number of frames to merge for the output frame.
-
-     Returns:
-         FeatureEncoder: The resulting FeatureEncoder module.
-     """
-     return FeatureEncoder(input_dim, output_dim, use_bias, stride)
-
-
- def _get_emformer_encoder(
-     input_dim: int,
-     output_dim: int,
-     num_heads: int,
-     ffn_dim: int,
-     num_layers: int,
-     segment_length: int,
-     left_context_length: int,
-     right_context_length: int,
-     dropout: float,
-     activation: str,
-     max_memory_size: int,
-     weight_init_scale_strategy: Optional[str],
-     tanh_on_mem: bool,
- ) -> EmformerEncoder:
-     """Construct EmformerEncoder for emformer model.
-
-     Args:
-         input_dim (int): The feature dimension of input Tensor.
-         output_dim (int): The feature dimension after EmformerEncoder.
-         num_heads (int): Number of attention heads in each Emformer layer.
-         ffn_dim (int): Hidden layer dimension of feedforward network.
-         num_layers (int): Number of Emformer layers to instantiate.
-         segment_length (int): Length of each input segment.
-         left_context_length (int): Length of left context.
-         right_context_length (int): Length of right context.
-         dropout (float): Dropout probability.
-         activation (str): Activation function to use in each Emformer layer's
-             feedforward network. Must be one of ("relu", "gelu", "silu").
-         max_memory_size (int): Maximum number of memory elements to use.
-         weight_init_scale_strategy (str or None): Per-layer weight initialization scaling
-             strategy. Must be one of ("depthwise", "constant", ``None``).
-         tanh_on_mem (bool): If ``True``, applies tanh to memory elements.
-
-     Returns:
-         EmformerEncoder: The resulting EmformerEncoder module.
-     """
-     emformer = Emformer(
-         input_dim=input_dim,
-         num_heads=num_heads,
-         ffn_dim=ffn_dim,
-         num_layers=num_layers,
-         segment_length=segment_length,
-         left_context_length=left_context_length,
-         right_context_length=right_context_length,
-         dropout=dropout,
-         activation=activation,
-         max_memory_size=max_memory_size,
-         weight_init_scale_strategy=weight_init_scale_strategy,
-         tanh_on_mem=tanh_on_mem,
-     )
-     output_linear = torch.nn.Linear(input_dim, output_dim)
-     layer_norm = torch.nn.LayerNorm(output_dim)
-     return EmformerEncoder(emformer, output_linear, layer_norm)
-
-
- def emformer_hubert_model(
-     extractor_input_dim: int,
-     extractor_output_dim: int,
-     extractor_use_bias: bool,
-     extractor_stride: int,
-     encoder_input_dim: int,
-     encoder_output_dim: int,
-     encoder_num_heads: int,
-     encoder_ffn_dim: int,
-     encoder_num_layers: int,
-     encoder_segment_length: int,
-     encoder_left_context_length: int,
-     encoder_right_context_length: int,
-     encoder_dropout: float,
-     encoder_activation: str,
-     encoder_max_memory_size: int,
-     encoder_weight_init_scale_strategy: Optional[str],
-     encoder_tanh_on_mem: bool,
-     aux_num_out: Optional[int],
- ) -> Wav2Vec2Model:
-     """Build a custom Emformer HuBERT model.
-
-     Args:
-         extractor_input_dim (int): The input dimension for feature extractor.
-         extractor_output_dim (int): The output dimension after feature extractor.
-         extractor_use_bias (bool): If ``True``, enable bias parameter in the linear layer of feature extractor.
-         extractor_stride (int): Number of frames to merge for the output frame in feature extractor.
-         encoder_input_dim (int): The input dimension for Emformer layer.
-         encoder_output_dim (int): The output dimension after EmformerEncoder.
-         encoder_num_heads (int): Number of attention heads in each Emformer layer.
-         encoder_ffn_dim (int): Hidden layer dimension of feedforward network in Emformer.
-         encoder_num_layers (int): Number of Emformer layers to instantiate.
-         encoder_segment_length (int): Length of each input segment.
-         encoder_left_context_length (int): Length of left context.
-         encoder_right_context_length (int): Length of right context.
-         encoder_dropout (float): Dropout probability.
-         encoder_activation (str): Activation function to use in each Emformer layer's
-             feedforward network. Must be one of ("relu", "gelu", "silu").
-         encoder_max_memory_size (int): Maximum number of memory elements to use.
-         encoder_weight_init_scale_strategy (str or None): Per-layer weight initialization scaling
-             strategy. Must be one of ("depthwise", "constant", ``None``).
-         encoder_tanh_on_mem (bool): If ``True``, applies tanh to memory elements.
-         aux_num_out (int or None):
-             When provided, attach an extra linear layer on top of encoder, which can be
-             used for fine-tuning.
-
-     Returns:
-         Wav2Vec2Model:
-             The resulting :py:class:`torchaudio.models.Wav2Vec2Model` model
-             with a :py:class:`torchaudio.models.Emformer` encoder.
-     """
-     feature_extractor = _get_emformer_feature_extractor(
-         extractor_input_dim, extractor_output_dim, extractor_use_bias, extractor_stride
-     )
-     emformer = _get_emformer_encoder(
-         encoder_input_dim,
-         encoder_output_dim,
-         encoder_num_heads,
-         encoder_ffn_dim,
-         encoder_num_layers,
-         encoder_segment_length,
-         encoder_left_context_length,
-         encoder_right_context_length,
-         encoder_dropout,
-         encoder_activation,
-         encoder_max_memory_size,
-         encoder_weight_init_scale_strategy,
-         encoder_tanh_on_mem,
-     )
-     aux = None
-     if aux_num_out is not None:
-         aux = torch.nn.Linear(in_features=encoder_output_dim, out_features=aux_num_out)
-     return Wav2Vec2Model(feature_extractor, emformer, aux)
-
-
- def emformer_hubert_base(
-     extractor_input_dim: int = 80,
-     extractor_output_dim: int = 128,
-     encoder_dropout: float = 0.1,
-     aux_num_out: Optional[int] = None,
- ) -> Wav2Vec2Model:
-     """Build Emformer HuBERT Model with 20 Emformer layers.
-
-     Args:
-         extractor_input_dim (int, optional): The input dimension for feature extractor. (Default: 80)
-         extractor_output_dim (int, optional): The output dimension after feature extractor. (Default: 128)
-         encoder_dropout (float, optional): Dropout probability in Emformer. (Default: 0.1)
-         aux_num_out (int or None, optional): Output dimension of aux layer for fine-tuning. (Default: ``None``)
-
-     Returns:
-         Wav2Vec2Model:
-             The resulting :py:class:`torchaudio.models.Wav2Vec2Model` model
-             with a :py:class:`torchaudio.models.Emformer` encoder.
-     """
-     return emformer_hubert_model(
-         extractor_input_dim=extractor_input_dim,
-         extractor_output_dim=extractor_output_dim,
-         extractor_use_bias=False,
-         extractor_stride=4,
-         encoder_input_dim=512,
-         encoder_output_dim=1024,
-         encoder_num_heads=8,
-         encoder_ffn_dim=2048,
-         encoder_num_layers=20,
-         encoder_segment_length=4,
-         encoder_left_context_length=30,
-         encoder_right_context_length=1,
-         encoder_dropout=encoder_dropout,
-         encoder_activation="gelu",
-         encoder_max_memory_size=0,
-         encoder_weight_init_scale_strategy="depthwise",
-         encoder_tanh_on_mem=True,
-         aux_num_out=aux_num_out,
-     )
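
For context, the removed builders above compose a `FeatureEncoder` (linear layer plus time reduction) and an `EmformerEncoder` into a `torchaudio.models.Wav2Vec2Model`. A minimal usage sketch against the 2.7.1 wheel, where `torchaudio.prototype.models` still exists; the batch size, 400-frame input, and printed shapes are illustrative only, and shapes follow the docstrings in the removed file:

```python
# Minimal sketch assuming torchaudio 2.7.x with the prototype namespace available.
import torch
from torchaudio.prototype.models import emformer_hubert_base

model = emformer_hubert_base()        # defaults: 80-dim log-mel input, 20 Emformer layers
model.eval()

features = torch.randn(2, 400, 80)    # (batch, time, extractor_input_dim)
lengths = torch.tensor([400, 320])    # valid frames per sample

with torch.inference_mode():
    output, out_lengths = model(features, lengths)

# The FeatureEncoder merges frames with stride 4, so the time axis shrinks
# roughly 4x and the encoder output dimension is 1024 per the defaults above.
print(output.shape, out_lengths)      # approx. (2, ~100, 1024), lengths [100, 80]
```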