torchaudio-2.9.1-cp311-cp311-manylinux_2_28_aarch64.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (85)
  1. torchaudio/__init__.py +204 -0
  2. torchaudio/_extension/__init__.py +61 -0
  3. torchaudio/_extension/utils.py +133 -0
  4. torchaudio/_internal/__init__.py +10 -0
  5. torchaudio/_internal/module_utils.py +171 -0
  6. torchaudio/_torchcodec.py +340 -0
  7. torchaudio/compliance/__init__.py +5 -0
  8. torchaudio/compliance/kaldi.py +813 -0
  9. torchaudio/datasets/__init__.py +47 -0
  10. torchaudio/datasets/cmuarctic.py +157 -0
  11. torchaudio/datasets/cmudict.py +186 -0
  12. torchaudio/datasets/commonvoice.py +86 -0
  13. torchaudio/datasets/dr_vctk.py +121 -0
  14. torchaudio/datasets/fluentcommands.py +108 -0
  15. torchaudio/datasets/gtzan.py +1118 -0
  16. torchaudio/datasets/iemocap.py +147 -0
  17. torchaudio/datasets/librilight_limited.py +111 -0
  18. torchaudio/datasets/librimix.py +133 -0
  19. torchaudio/datasets/librispeech.py +174 -0
  20. torchaudio/datasets/librispeech_biasing.py +189 -0
  21. torchaudio/datasets/libritts.py +168 -0
  22. torchaudio/datasets/ljspeech.py +107 -0
  23. torchaudio/datasets/musdb_hq.py +139 -0
  24. torchaudio/datasets/quesst14.py +136 -0
  25. torchaudio/datasets/snips.py +157 -0
  26. torchaudio/datasets/speechcommands.py +183 -0
  27. torchaudio/datasets/tedlium.py +218 -0
  28. torchaudio/datasets/utils.py +54 -0
  29. torchaudio/datasets/vctk.py +143 -0
  30. torchaudio/datasets/voxceleb1.py +309 -0
  31. torchaudio/datasets/yesno.py +89 -0
  32. torchaudio/functional/__init__.py +130 -0
  33. torchaudio/functional/_alignment.py +128 -0
  34. torchaudio/functional/filtering.py +1685 -0
  35. torchaudio/functional/functional.py +2505 -0
  36. torchaudio/lib/__init__.py +0 -0
  37. torchaudio/lib/_torchaudio.so +0 -0
  38. torchaudio/lib/libtorchaudio.so +0 -0
  39. torchaudio/models/__init__.py +85 -0
  40. torchaudio/models/_hdemucs.py +1008 -0
  41. torchaudio/models/conformer.py +293 -0
  42. torchaudio/models/conv_tasnet.py +330 -0
  43. torchaudio/models/decoder/__init__.py +64 -0
  44. torchaudio/models/decoder/_ctc_decoder.py +568 -0
  45. torchaudio/models/decoder/_cuda_ctc_decoder.py +187 -0
  46. torchaudio/models/deepspeech.py +84 -0
  47. torchaudio/models/emformer.py +884 -0
  48. torchaudio/models/rnnt.py +816 -0
  49. torchaudio/models/rnnt_decoder.py +339 -0
  50. torchaudio/models/squim/__init__.py +11 -0
  51. torchaudio/models/squim/objective.py +326 -0
  52. torchaudio/models/squim/subjective.py +150 -0
  53. torchaudio/models/tacotron2.py +1046 -0
  54. torchaudio/models/wav2letter.py +72 -0
  55. torchaudio/models/wav2vec2/__init__.py +45 -0
  56. torchaudio/models/wav2vec2/components.py +1167 -0
  57. torchaudio/models/wav2vec2/model.py +1579 -0
  58. torchaudio/models/wav2vec2/utils/__init__.py +7 -0
  59. torchaudio/models/wav2vec2/utils/import_fairseq.py +213 -0
  60. torchaudio/models/wav2vec2/utils/import_huggingface.py +134 -0
  61. torchaudio/models/wav2vec2/wavlm_attention.py +214 -0
  62. torchaudio/models/wavernn.py +409 -0
  63. torchaudio/pipelines/__init__.py +102 -0
  64. torchaudio/pipelines/_source_separation_pipeline.py +109 -0
  65. torchaudio/pipelines/_squim_pipeline.py +156 -0
  66. torchaudio/pipelines/_tts/__init__.py +16 -0
  67. torchaudio/pipelines/_tts/impl.py +385 -0
  68. torchaudio/pipelines/_tts/interface.py +255 -0
  69. torchaudio/pipelines/_tts/utils.py +230 -0
  70. torchaudio/pipelines/_wav2vec2/__init__.py +0 -0
  71. torchaudio/pipelines/_wav2vec2/aligner.py +87 -0
  72. torchaudio/pipelines/_wav2vec2/impl.py +1699 -0
  73. torchaudio/pipelines/_wav2vec2/utils.py +346 -0
  74. torchaudio/pipelines/rnnt_pipeline.py +380 -0
  75. torchaudio/transforms/__init__.py +78 -0
  76. torchaudio/transforms/_multi_channel.py +467 -0
  77. torchaudio/transforms/_transforms.py +2138 -0
  78. torchaudio/utils/__init__.py +4 -0
  79. torchaudio/utils/download.py +89 -0
  80. torchaudio/version.py +2 -0
  81. torchaudio-2.9.1.dist-info/METADATA +133 -0
  82. torchaudio-2.9.1.dist-info/RECORD +85 -0
  83. torchaudio-2.9.1.dist-info/WHEEL +5 -0
  84. torchaudio-2.9.1.dist-info/licenses/LICENSE +25 -0
  85. torchaudio-2.9.1.dist-info/top_level.txt +1 -0
torchaudio/models/wav2vec2/model.py
@@ -0,0 +1,1579 @@
1
+ import math
2
+ from typing import List, Optional, Tuple
3
+
4
+ import torch
5
+ from torch import Tensor
6
+ from torch.nn import Module
7
+
8
+ from . import components
9
+
10
+
11
+ class Wav2Vec2Model(Module):
12
+ """Acoustic model used in *wav2vec 2.0* :cite:`baevski2020wav2vec`.
13
+
14
+ Note:
15
+ To build the model, please use one of the factory functions.
16
+
17
+ See Also:
18
+ * :class:`torchaudio.pipelines.Wav2Vec2Bundle`: Pretrained models (without fine-tuning)
19
+ * :class:`torchaudio.pipelines.Wav2Vec2ASRBundle`: ASR pipelines with pretrained models.
20
+
21
+ Args:
22
+ feature_extractor (torch.nn.Module):
23
+ Feature extractor that extracts feature vectors from raw audio Tensor.
24
+
25
+ encoder (torch.nn.Module):
26
+ Encoder that converts the audio features into the sequence of probability
27
+ distribution (in negative log-likelihood) over labels.
28
+
29
+ aux (torch.nn.Module or None, optional):
30
+ Auxiliary module. If provided, the output from the encoder is passed to this module.
31
+ """ # noqa: E501
32
+
33
+ def __init__(
34
+ self,
35
+ feature_extractor: Module,
36
+ encoder: Module,
37
+ aux: Optional[Module] = None,
38
+ ):
39
+ super().__init__()
40
+ self.feature_extractor = feature_extractor
41
+ self.encoder = encoder
42
+ self.aux = aux
43
+
44
+ @torch.jit.export
45
+ def extract_features(
46
+ self,
47
+ waveforms: Tensor,
48
+ lengths: Optional[Tensor] = None,
49
+ num_layers: Optional[int] = None,
50
+ ) -> Tuple[List[Tensor], Optional[Tensor]]:
51
+ """Extract feature vectors from raw waveforms
52
+
53
+ This returns the list of outputs from the intermediate layers of
54
+ the transformer block in the encoder.
55
+
56
+ Args:
57
+ waveforms (Tensor): Audio tensor of shape `(batch, frames)`.
58
+ lengths (Tensor or None, optional):
59
+ Indicates the valid length of each audio in the batch.
60
+ Shape: `(batch, )`.
61
+ When ``waveforms`` contains audio clips of different durations,
62
+ providing the ``lengths`` argument lets the model compute
63
+ the corresponding valid output lengths and apply a proper mask in
64
+ the transformer attention layer.
65
+ If ``None``, it is assumed that the entire audio waveform
66
+ length is valid.
67
+ num_layers (int or None, optional):
68
+ If given, limit the number of intermediate layers to go through.
69
+ Providing `1` will stop the computation after going through one
70
+ intermediate layer. If not given, the outputs from all the
71
+ intermediate layers are returned.
72
+
73
+ Returns:
74
+ (List[Tensor], Optional[Tensor]):
75
+ List of Tensors
76
+ Features from requested layers.
77
+ Each Tensor is of shape: `(batch, time frame, feature dimension)`
78
+ Tensor or None
79
+ If ``lengths`` argument was provided, a Tensor of shape `(batch, )`
80
+ is returned.
81
+ It indicates the valid length in time axis of each feature Tensor.
82
+ """
83
+ x, lengths = self.feature_extractor(waveforms, lengths)
84
+ x = self.encoder.extract_features(x, lengths, num_layers)
85
+ return x, lengths
86
+
87
+ def forward(
88
+ self,
89
+ waveforms: Tensor,
90
+ lengths: Optional[Tensor] = None,
91
+ ) -> Tuple[Tensor, Optional[Tensor]]:
92
+ """Compute the sequence of probability distribution over labels.
93
+
94
+ Args:
95
+ waveforms (Tensor): Audio tensor of shape `(batch, frames)`.
96
+ lengths (Tensor or None, optional):
97
+ Indicates the valid length of each audio in the batch.
98
+ Shape: `(batch, )`.
99
+ When ``waveforms`` contains audio clips of different durations,
100
+ providing the ``lengths`` argument lets the model compute
101
+ the corresponding valid output lengths and apply a proper mask in
102
+ the transformer attention layer.
103
+ If ``None``, it is assumed that every audio clip in ``waveforms``
104
+ is valid over its entire length. Default: ``None``.
105
+
106
+ Returns:
107
+ (Tensor, Optional[Tensor]):
108
+ Tensor
109
+ The sequences of probability distribution (in logit) over labels.
110
+ Shape: `(batch, frames, num labels)`.
111
+ Tensor or None
112
+ If ``lengths`` argument was provided, a Tensor of shape `(batch, )`
113
+ is returned.
114
+ It indicates the valid length in time axis of the output Tensor.
115
+ """
116
+ x, lengths = self.feature_extractor(waveforms, lengths)
117
+ x = self.encoder(x, lengths)
118
+ if self.aux is not None:
119
+ x = self.aux(x)
120
+ return x, lengths
121
+
122
+
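A minimal usage sketch for the interface above, assuming the ``wav2vec2_base`` factory function defined later in this file; the 32-label head, clip lengths, and random waveforms are illustrative only:

import torch
from torchaudio.models import wav2vec2_base

# Untrained "base" model with an illustrative 32-label linear head attached.
model = wav2vec2_base(aux_num_out=32)
model.eval()

# Two 16 kHz clips of different durations, zero-padded into one batch.
waveforms = torch.randn(2, 16000)
lengths = torch.tensor([16000, 8000])

with torch.inference_mode():
    # Logits over labels plus the valid output length of each sequence.
    logits, out_lengths = model(waveforms, lengths)
    # Outputs of the first four intermediate transformer layers.
    feats, feat_lengths = model.extract_features(waveforms, lengths, num_layers=4)

print(logits.shape)   # (2, num_frames, 32)
print(out_lengths)    # valid number of frames per clip
print(len(feats))     # 4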
123
+ class HuBERTPretrainModel(Module):
124
+ """HuBERTPretrainModel()
125
+
126
+ HuBERT model used for pretraining in *HuBERT* :cite:`hsu2021hubert`.
127
+
128
+ Note:
129
+ To build the model, please use one of the factory functions.
130
+
131
+ See Also:
132
+ `HuBERT Pre-training and Fine-tuning Recipes
133
+ <https://github.com/pytorch/audio/tree/main/examples/hubert>`__
134
+
135
+ Args:
136
+ wav2vec2 (Wav2Vec2Model):
137
+ Wav2Vec2 encoder that generates the transformer outputs.
138
+
139
+ mask_generator (torch.nn.Module):
140
+ Mask generator that generates the mask for masked prediction during the training.
141
+
142
+ logit_generator (torch.nn.Module):
143
+ Logit generator that predicts the logits of the masked and unmasked inputs.
144
+
145
+ feature_grad_mult (float or None):
146
+ The factor to scale the convolutional feature extraction layer gradients by.
147
+ If ``None``, the gradients of feature extraction layers are not affected.
148
+ The scale factor will not affect the forward pass.
149
+ """
150
+
151
+ def __init__(
152
+ self,
153
+ wav2vec2: Wav2Vec2Model,
154
+ mask_generator: Module,
155
+ logit_generator: Module,
156
+ feature_grad_mult: Optional[float],
157
+ ):
158
+ super().__init__()
159
+ self.wav2vec2 = wav2vec2
160
+ self.mask_generator = mask_generator
161
+ self.logit_generator = logit_generator
162
+ if feature_grad_mult is not None and not 0.0 < feature_grad_mult < 1.0:
163
+ raise ValueError(
164
+ f"The value of `feature_grad_mult` must be ``None``or between (0, 1). Found {feature_grad_mult}"
165
+ )
166
+ self.feature_grad_mult = feature_grad_mult
167
+
168
+ def forward(
169
+ self,
170
+ waveforms: Tensor,
171
+ labels: Tensor,
172
+ audio_lengths: Optional[Tensor] = None,
173
+ ) -> Tuple[Tensor, Tensor, Tensor]:
174
+ """Compute the sequence of probability distribution over labels.
175
+
176
+ Args:
177
+ waveforms (Tensor): Audio tensor of dimension `[batch, frames]`.
178
+ labels (Tensor): Label for pre-training. A Tensor of dimension `[batch, frames]`.
179
+ audio_lengths (Tensor or None, optional):
180
+ Indicates the valid length of each audio in the batch.
181
+ Shape: `[batch, ]`.
182
+ When ``waveforms`` contains audio clips of different durations,
183
+ providing the ``audio_lengths`` argument lets the model compute
184
+ the corresponding valid output lengths and apply a proper mask in
185
+ the transformer attention layer.
186
+ If ``None``, it is assumed that every audio clip in ``waveforms``
187
+ is valid over its entire length. Default: ``None``.
188
+
189
+ Returns:
190
+ (Tensor, Tensor, Tensor):
191
+ Tensor
192
+ The masked sequences of probability distribution (in logit).
193
+ Shape: `(masked_frames, num labels)`.
194
+ Tensor
195
+ The unmasked sequence of probability distribution (in logit).
196
+ Shape: `(unmasked_frames, num labels)`.
197
+ Tensor
198
+ The feature mean value for additional penalty loss.
199
+ Shape: `(1,)`.
200
+ """
201
+ x, lengths = self.wav2vec2.feature_extractor(waveforms, audio_lengths)
202
+ if self.feature_grad_mult is not None and self.feature_grad_mult < 1.0:
203
+ x = components.GradMultiply.apply(x, self.feature_grad_mult)
204
+ features_pen = x.float().pow(2).mean()
205
+ if lengths is not None:
206
+ padding_mask = components._get_padding_mask(x, lengths)
207
+ else:
208
+ padding_mask = None
209
+ x, attention_mask = self.wav2vec2.encoder._preprocess(x, lengths)
210
+ x, mask = self.mask_generator(x, padding_mask)
211
+ x = self.wav2vec2.encoder.transformer(x, attention_mask=attention_mask)
212
+ if x.shape[1] != labels.shape[1]:
213
+ raise ValueError("The length of label must match that of HuBERT model output")
214
+ if padding_mask is not None:
215
+ mask_m = torch.logical_and(~padding_mask, mask)
216
+ mask_u = torch.logical_and(~padding_mask, ~mask_m)
217
+ else:
218
+ mask_m = mask
219
+ mask_u = ~mask_m
220
+
221
+ logit_m, logit_u = self.logit_generator(x, labels, mask_m, mask_u)
222
+
223
+ return logit_m, logit_u, features_pen
224
+
225
+
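A pre-training forward pass might look like the following sketch, assuming the ``hubert_pretrain_base`` factory defined later in this file; the clip lengths, the frame count of 49, and the random pseudo-labels are illustrative (in practice the labels come from clustering acoustic features, as in the HuBERT recipe):

import torch
from torchaudio.models import hubert_pretrain_base

model = hubert_pretrain_base(num_classes=100)
model.train()

waveforms = torch.randn(2, 16000)            # one-second clips at 16 kHz
audio_lengths = torch.tensor([16000, 12800])
labels = torch.randint(0, 100, (2, 49))      # one pseudo-label per output frame

# logit_m / logit_u are logits at masked / unmasked frames;
# features_pen is the scalar feature penalty used as an auxiliary loss term.
logit_m, logit_u, features_pen = model(waveforms, labels, audio_lengths)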
226
+ def wav2vec2_model(
227
+ extractor_mode: str,
228
+ extractor_conv_layer_config: Optional[List[Tuple[int, int, int]]],
229
+ extractor_conv_bias: bool,
230
+ encoder_embed_dim: int,
231
+ encoder_projection_dropout: float,
232
+ encoder_pos_conv_kernel: int,
233
+ encoder_pos_conv_groups: int,
234
+ encoder_num_layers: int,
235
+ encoder_num_heads: int,
236
+ encoder_attention_dropout: float,
237
+ encoder_ff_interm_features: int,
238
+ encoder_ff_interm_dropout: float,
239
+ encoder_dropout: float,
240
+ encoder_layer_norm_first: bool,
241
+ encoder_layer_drop: float,
242
+ aux_num_out: Optional[int],
243
+ ) -> Wav2Vec2Model:
244
+ """Builds custom :class:`~torchaudio.models.Wav2Vec2Model`.
245
+
246
+ Note:
247
+ The "feature extractor" below corresponds to
248
+ `ConvFeatureExtractionModel <https://github.com/pytorch/fairseq/blob/dd3bd3c0497ae9a7ae7364404a6b0a4c501780b3/fairseq/models/wav2vec/wav2vec2.py#L736>`__
249
+ in the original ``fairseq`` implementation.
250
+ This is referred to as "(convolutional) feature encoder" in the *wav2vec 2.0*
251
+ :cite:`baevski2020wav2vec` paper.
252
+
253
+ The "encoder" below corresponds to `TransformerEncoder <https://github.com/pytorch/fairseq/blob/dd3bd3c0497ae9a7ae7364404a6b0a4c501780b3/fairseq/models/wav2vec/wav2vec2.py#L817>`__,
254
+ and this is referred to as "Transformer" in the paper.
255
+
256
+ Args:
257
+ extractor_mode (str): Operation mode of feature extractor.
258
+ Valid values are ``"group_norm"`` or ``"layer_norm"``.
259
+ If ``"group_norm"``, then a single normalization is applied
260
+ in the first convolution block. Otherwise, all the convolution
261
+ blocks will have layer normalization.
262
+
263
+ This option corresponds to ``extractor_mode`` from ``fairseq``.
264
+ extractor_conv_layer_config (list of integer tuples or None):
265
+ Configuration of convolution layers in feature extractor.
266
+ List of convolution configuration,
267
+ i.e. ``[(output_channel, kernel_size, stride), ...]``
268
+
269
+ If ``None`` is provided, then the following default value is used.
270
+
271
+ .. code-block:: python
272
+
273
+ [
274
+ (512, 10, 5),
275
+ (512, 3, 2),
276
+ (512, 3, 2),
277
+ (512, 3, 2),
278
+ (512, 3, 2),
279
+ (512, 2, 2),
280
+ (512, 2, 2),
281
+ ]
282
+
283
+ This option corresponds to ``conv_feature_layers`` from ``fairseq``.
284
+
285
+ extractor_conv_bias (bool):
286
+ Whether to include bias term to each convolution operation.
287
+
288
+ This option corresponds to ``conv_bias`` from ``fairseq``.
289
+
290
+ encoder_embed_dim (int):
291
+ The dimension of embedding in encoder.
292
+
293
+ This option corresponds to ``encoder_embed_dim`` from ``fairseq``.
294
+
295
+ encoder_projection_dropout (float):
296
+ The dropout probability applied after the input feature is projected
297
+ to ``encoder_embed_dim``.
298
+
299
+ This option corresponds to ``dropout_input`` from ``fairseq``.
300
+
301
+ encoder_pos_conv_kernel (int):
302
+ The kernel size of convolutional positional embeddings.
303
+
304
+ This option corresponds to ``conv_pos`` from ``fairseq``.
305
+
306
+ encoder_pos_conv_groups (int):
307
+ The number of groups of convolutional positional embeddings.
308
+
309
+ This option corresponds to ``conv_pos_groups`` from ``fairseq``.
310
+
311
+ encoder_num_layers (int):
312
+ The number of self attention layers in transformer block.
313
+
314
+ This option corresponds to ``encoder_layers`` from ``fairseq``.
315
+
316
+ encoder_num_heads (int):
317
+ The number of heads in self attention layers.
318
+
319
+ This option corresponds to ``encoder_attention_heads`` from ``fairseq``.
320
+
321
+ encoder_attention_dropout (float):
322
+ The dropout probability applied after softmax in self-attention layer.
323
+
324
+ This option corresponds to ``attention_dropout`` from ``fairseq``.
325
+
326
+ encoder_ff_interm_features (int):
327
+ The dimension of hidden features in feed forward layer.
328
+
329
+ This option corresponds to ``encoder_ffn_embed_dim`` from ``fairseq``.
330
+
331
+ encoder_ff_interm_dropout (float):
332
+ The dropout probability applied in feedforward layer.
333
+
334
+ This option corresponds to ``activation_dropout`` from ``fairseq``.
335
+
336
+ encoder_dropout (float):
337
+ The dropout probability applied at the end of feed forward layer.
338
+
339
+ This option corresponds to ``dropout`` from ``fairseq``.
340
+
341
+ encoder_layer_norm_first (bool):
342
+ Control the order of layer norm in transformer layer and each encoder layer.
343
+ If True, in transformer layer, layer norm is applied before features are fed
344
+ to encoder layers. In encoder layer, two layer norms are applied before and after
345
+ self attention.
346
+ If False, in transformer layer, layer norm is applied after features are fed
347
+ to encoder layers. In encoder layer, two layer norms are applied after self
348
+ attention, before and after feed forward.
349
+
350
+ This option corresponds to ``layer_norm_first`` from ``fairseq``.
351
+
352
+ encoder_layer_drop (float):
353
+ Probability to drop each encoder layer during training.
354
+
355
+ This option corresponds to ``layerdrop`` from ``fairseq``.
356
+
357
+ aux_num_out (int or None):
358
+ When provided, attach an extra linear layer on top of encoder, which can be
359
+ used for fine-tuning.
360
+
361
+ Returns:
362
+ Wav2Vec2Model:
363
+ The resulting model.
364
+ """ # noqa: E501
365
+ if extractor_conv_layer_config is None:
366
+ extractor_conv_layer_config = [(512, 10, 5)] + [(512, 3, 2)] * 4 + [(512, 2, 2)] * 2
367
+
368
+ feature_extractor = components._get_feature_extractor(
369
+ extractor_mode, extractor_conv_layer_config, extractor_conv_bias
370
+ )
371
+ encoder = components._get_encoder(
372
+ in_features=extractor_conv_layer_config[-1][0],
373
+ embed_dim=encoder_embed_dim,
374
+ dropout_input=encoder_projection_dropout,
375
+ pos_conv_kernel=encoder_pos_conv_kernel,
376
+ pos_conv_groups=encoder_pos_conv_groups,
377
+ num_layers=encoder_num_layers,
378
+ num_heads=encoder_num_heads,
379
+ attention_dropout=encoder_attention_dropout,
380
+ ff_interm_features=encoder_ff_interm_features,
381
+ ff_interm_dropout=encoder_ff_interm_dropout,
382
+ dropout=encoder_dropout,
383
+ layer_norm_first=encoder_layer_norm_first,
384
+ layer_drop=encoder_layer_drop,
385
+ )
386
+ aux = None
387
+ if aux_num_out is not None:
388
+ aux = torch.nn.Linear(in_features=encoder_embed_dim, out_features=aux_num_out)
389
+ return Wav2Vec2Model(feature_extractor, encoder, aux)
390
+
391
+
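As a rough sanity check on the default ``extractor_conv_layer_config`` above, the number of feature frames produced for a given number of input samples can be estimated from the kernel sizes and strides alone; this standalone sketch only walks the config and does not call into torchaudio:

# Default feature extractor config: (output_channels, kernel_size, stride) per Conv1d layer.
default_config = [(512, 10, 5)] + [(512, 3, 2)] * 4 + [(512, 2, 2)] * 2

def num_output_frames(num_samples: int) -> int:
    length = num_samples
    for _, kernel_size, stride in default_config:
        # Conv1d without padding: floor((L - kernel_size) / stride) + 1
        length = (length - kernel_size) // stride + 1
    return length

print(num_output_frames(16000))  # 49 frames for one second of 16 kHz audio (~20 ms per frame)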
392
+ def wav2vec2_base(
393
+ encoder_projection_dropout: float = 0.1,
394
+ encoder_attention_dropout: float = 0.1,
395
+ encoder_ff_interm_dropout: float = 0.1,
396
+ encoder_dropout: float = 0.1,
397
+ encoder_layer_drop: float = 0.1,
398
+ aux_num_out: Optional[int] = None,
399
+ ) -> Wav2Vec2Model:
400
+ """Builds "base" :class:`~torchaudio.models.Wav2Vec2Model` from *wav2vec 2.0* :cite:`baevski2020wav2vec`
401
+
402
+ Args:
403
+ encoder_projection_dropout (float):
404
+ See :py:func:`wav2vec2_model`.
405
+ encoder_attention_dropout (float):
406
+ See :py:func:`wav2vec2_model`.
407
+ encoder_ff_interm_dropout (float):
408
+ See :py:func:`wav2vec2_model`.
409
+ encoder_dropout (float):
410
+ See :py:func:`wav2vec2_model`.
411
+ encoder_layer_drop (float):
412
+ See :py:func:`wav2vec2_model`.
413
+ aux_num_out (int or None, optional):
414
+ See :py:func:`wav2vec2_model`.
415
+
416
+ Returns:
417
+ Wav2Vec2Model:
418
+ The resulting model.
419
+ """ # noqa: E501
420
+ return wav2vec2_model(
421
+ extractor_mode="group_norm",
422
+ extractor_conv_layer_config=None,
423
+ extractor_conv_bias=False,
424
+ encoder_embed_dim=768,
425
+ encoder_projection_dropout=encoder_projection_dropout,
426
+ encoder_pos_conv_kernel=128,
427
+ encoder_pos_conv_groups=16,
428
+ encoder_num_layers=12,
429
+ encoder_num_heads=12,
430
+ encoder_attention_dropout=encoder_attention_dropout,
431
+ encoder_ff_interm_features=3072,
432
+ encoder_ff_interm_dropout=encoder_ff_interm_dropout,
433
+ encoder_dropout=encoder_dropout,
434
+ encoder_layer_norm_first=False,
435
+ encoder_layer_drop=encoder_layer_drop,
436
+ aux_num_out=aux_num_out,
437
+ )
438
+
439
+
440
+ def wav2vec2_large(
441
+ encoder_projection_dropout: float = 0.1,
442
+ encoder_attention_dropout: float = 0.1,
443
+ encoder_ff_interm_dropout: float = 0.1,
444
+ encoder_dropout: float = 0.1,
445
+ encoder_layer_drop: float = 0.1,
446
+ aux_num_out: Optional[int] = None,
447
+ ) -> Wav2Vec2Model:
448
+ """Builds "large" :class:`~torchaudio.models.Wav2Vec2Model` from *wav2vec 2.0* :cite:`baevski2020wav2vec`
449
+
450
+ Args:
451
+ encoder_projection_dropout (float):
452
+ See :py:func:`wav2vec2_model`.
453
+ encoder_attention_dropout (float):
454
+ See :py:func:`wav2vec2_model`.
455
+ encoder_ff_interm_dropout (float):
456
+ See :py:func:`wav2vec2_model`.
457
+ encoder_dropout (float):
458
+ See :py:func:`wav2vec2_model`.
459
+ encoder_layer_drop (float):
460
+ See :py:func:`wav2vec2_model`.
461
+ aux_num_out (int or None, optional):
462
+ See :py:func:`wav2vec2_model`.
463
+
464
+ Returns:
465
+ Wav2Vec2Model:
466
+ The resulting model.
467
+ """ # noqa: E501
468
+ return wav2vec2_model(
469
+ extractor_mode="group_norm",
470
+ extractor_conv_layer_config=None,
471
+ extractor_conv_bias=False,
472
+ encoder_embed_dim=1024,
473
+ encoder_projection_dropout=encoder_projection_dropout,
474
+ encoder_pos_conv_kernel=128,
475
+ encoder_pos_conv_groups=16,
476
+ encoder_num_layers=24,
477
+ encoder_num_heads=16,
478
+ encoder_attention_dropout=encoder_attention_dropout,
479
+ encoder_ff_interm_features=4096,
480
+ encoder_ff_interm_dropout=encoder_ff_interm_dropout,
481
+ encoder_dropout=encoder_dropout,
482
+ encoder_layer_norm_first=False,
483
+ encoder_layer_drop=encoder_layer_drop,
484
+ aux_num_out=aux_num_out,
485
+ )
486
+
487
+
488
+ def wav2vec2_large_lv60k(
489
+ encoder_projection_dropout: float = 0.1,
490
+ encoder_attention_dropout: float = 0.0,
491
+ encoder_ff_interm_dropout: float = 0.1,
492
+ encoder_dropout: float = 0.0,
493
+ encoder_layer_drop: float = 0.1,
494
+ aux_num_out: Optional[int] = None,
495
+ ) -> Wav2Vec2Model:
496
+ """Builds "large lv-60k" :class:`~torchaudio.models.Wav2Vec2Model` from *wav2vec 2.0* :cite:`baevski2020wav2vec`
497
+
498
+ Args:
499
+ encoder_projection_dropout (float):
500
+ See :py:func:`wav2vec2_model`.
501
+ encoder_attention_dropout (float):
502
+ See :py:func:`wav2vec2_model`.
503
+ encoder_ff_interm_dropout (float):
504
+ See :py:func:`wav2vec2_model`.
505
+ encoder_dropout (float):
506
+ See :py:func:`wav2vec2_model`.
507
+ encoder_layer_drop (float):
508
+ See :py:func:`wav2vec2_model`.
509
+ aux_num_out (int or None, optional):
510
+ See :py:func:`wav2vec2_model`.
511
+
512
+ Returns:
513
+ Wav2Vec2Model:
514
+ The resulting model.
515
+ """ # noqa: E501
516
+ return wav2vec2_model(
517
+ extractor_mode="layer_norm",
518
+ extractor_conv_layer_config=None,
519
+ extractor_conv_bias=True,
520
+ encoder_embed_dim=1024,
521
+ encoder_projection_dropout=encoder_projection_dropout,
522
+ encoder_pos_conv_kernel=128,
523
+ encoder_pos_conv_groups=16,
524
+ encoder_num_layers=24,
525
+ encoder_num_heads=16,
526
+ encoder_attention_dropout=encoder_attention_dropout,
527
+ encoder_ff_interm_features=4096,
528
+ encoder_ff_interm_dropout=encoder_ff_interm_dropout,
529
+ encoder_dropout=encoder_dropout,
530
+ encoder_layer_norm_first=True,
531
+ encoder_layer_drop=encoder_layer_drop,
532
+ aux_num_out=aux_num_out,
533
+ )
534
+
535
+
536
+ def hubert_base(
537
+ encoder_projection_dropout: float = 0.1,
538
+ encoder_attention_dropout: float = 0.1,
539
+ encoder_ff_interm_dropout: float = 0.0,
540
+ encoder_dropout: float = 0.1,
541
+ encoder_layer_drop: float = 0.05,
542
+ aux_num_out: Optional[int] = None,
543
+ ) -> Wav2Vec2Model:
544
+ """Builds "base" :class:`HuBERT <torchaudio.models.Wav2Vec2Model>` from *HuBERT* :cite:`hsu2021hubert`
545
+
546
+ Args:
547
+ encoder_projection_dropout (float):
548
+ See :py:func:`wav2vec2_model`.
549
+ encoder_attention_dropout (float):
550
+ See :py:func:`wav2vec2_model`.
551
+ encoder_ff_interm_dropout (float):
552
+ See :py:func:`wav2vec2_model`.
553
+ encoder_dropout (float):
554
+ See :py:func:`wav2vec2_model`.
555
+ encoder_layer_drop (float):
556
+ See :py:func:`wav2vec2_model`.
557
+ aux_num_out (int or None, optional):
558
+ See :py:func:`wav2vec2_model`.
559
+
560
+ Returns:
561
+ Wav2Vec2Model:
562
+ The resulting model.
563
+ """ # noqa: E501
564
+ return wav2vec2_model(
565
+ extractor_mode="group_norm",
566
+ extractor_conv_layer_config=None,
567
+ extractor_conv_bias=False,
568
+ encoder_embed_dim=768,
569
+ encoder_projection_dropout=encoder_projection_dropout,
570
+ encoder_pos_conv_kernel=128,
571
+ encoder_pos_conv_groups=16,
572
+ encoder_num_layers=12,
573
+ encoder_num_heads=12,
574
+ encoder_attention_dropout=encoder_attention_dropout,
575
+ encoder_ff_interm_features=3072,
576
+ encoder_ff_interm_dropout=encoder_ff_interm_dropout,
577
+ encoder_dropout=encoder_dropout,
578
+ encoder_layer_norm_first=False,
579
+ encoder_layer_drop=encoder_layer_drop,
580
+ aux_num_out=aux_num_out,
581
+ )
582
+
583
+
584
+ def hubert_large(
585
+ encoder_projection_dropout: float = 0.0,
586
+ encoder_attention_dropout: float = 0.0,
587
+ encoder_ff_interm_dropout: float = 0.0,
588
+ encoder_dropout: float = 0.0,
589
+ encoder_layer_drop: float = 0.0,
590
+ aux_num_out: Optional[int] = None,
591
+ ) -> Wav2Vec2Model:
592
+ """Builds "large" :class:`HuBERT <torchaudio.models.Wav2Vec2Model>` from *HuBERT* :cite:`hsu2021hubert`
593
+
594
+ Args:
595
+ encoder_projection_dropout (float):
596
+ See :py:func:`wav2vec2_model`.
597
+ encoder_attention_dropout (float):
598
+ See :py:func:`wav2vec2_model`.
599
+ encoder_ff_interm_dropout (float):
600
+ See :py:func:`wav2vec2_model`.
601
+ encoder_dropout (float):
602
+ See :py:func:`wav2vec2_model`.
603
+ encoder_layer_drop (float):
604
+ See :py:func:`wav2vec2_model`.
605
+ aux_num_out (int or None, optional):
606
+ See :py:func:`wav2vec2_model`.
607
+
608
+ Returns:
609
+ Wav2Vec2Model:
610
+ The resulting model.
611
+ """ # noqa: E501
612
+ return wav2vec2_model(
613
+ extractor_mode="layer_norm",
614
+ extractor_conv_layer_config=None,
615
+ extractor_conv_bias=False,
616
+ encoder_embed_dim=1024,
617
+ encoder_projection_dropout=encoder_projection_dropout,
618
+ encoder_pos_conv_kernel=128,
619
+ encoder_pos_conv_groups=16,
620
+ encoder_num_layers=24,
621
+ encoder_num_heads=16,
622
+ encoder_attention_dropout=encoder_attention_dropout,
623
+ encoder_ff_interm_features=4096,
624
+ encoder_ff_interm_dropout=encoder_ff_interm_dropout,
625
+ encoder_dropout=encoder_dropout,
626
+ encoder_layer_norm_first=True,
627
+ encoder_layer_drop=encoder_layer_drop,
628
+ aux_num_out=aux_num_out,
629
+ )
630
+
631
+
632
+ def hubert_xlarge(
633
+ encoder_projection_dropout: float = 0.0,
634
+ encoder_attention_dropout: float = 0.0,
635
+ encoder_ff_interm_dropout: float = 0.0,
636
+ encoder_dropout: float = 0.0,
637
+ encoder_layer_drop: float = 0.0,
638
+ aux_num_out: Optional[int] = None,
639
+ ) -> Wav2Vec2Model:
640
+ """Builds "extra large" :class:`HuBERT <torchaudio.models.Wav2Vec2Model>` from *HuBERT* :cite:`hsu2021hubert`
641
+
642
+ Args:
643
+ encoder_projection_dropout (float):
644
+ See :py:func:`wav2vec2_model`.
645
+ encoder_attention_dropout (float):
646
+ See :py:func:`wav2vec2_model`.
647
+ encoder_ff_interm_dropout (float):
648
+ See :py:func:`wav2vec2_model`.
649
+ encoder_dropout (float):
650
+ See :py:func:`wav2vec2_model`.
651
+ encoder_layer_drop (float):
652
+ See :py:func:`wav2vec2_model`.
653
+ aux_num_out (int or None, optional):
654
+ See :py:func:`wav2vec2_model`.
655
+
656
+ Returns:
657
+ Wav2Vec2Model:
658
+ The resulting model.
659
+ """ # noqa: E501
660
+ return wav2vec2_model(
661
+ extractor_mode="layer_norm",
662
+ extractor_conv_layer_config=None,
663
+ extractor_conv_bias=False,
664
+ encoder_embed_dim=1280,
665
+ encoder_projection_dropout=encoder_projection_dropout,
666
+ encoder_pos_conv_kernel=128,
667
+ encoder_pos_conv_groups=16,
668
+ encoder_num_layers=48,
669
+ encoder_num_heads=16,
670
+ encoder_attention_dropout=encoder_attention_dropout,
671
+ encoder_ff_interm_features=5120,
672
+ encoder_ff_interm_dropout=encoder_ff_interm_dropout,
673
+ encoder_dropout=encoder_dropout,
674
+ encoder_layer_norm_first=True,
675
+ encoder_layer_drop=encoder_layer_drop,
676
+ aux_num_out=aux_num_out,
677
+ )
678
+
679
+
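The predefined configurations above differ only in their encoder hyperparameters; a quick way to compare their sizes (the counts below are approximate and assume no auxiliary head):

from torchaudio.models import hubert_base, hubert_large

def count_parameters(model) -> int:
    return sum(p.numel() for p in model.parameters())

# Roughly 95M and 317M parameters; hubert_xlarge is about 1B.
print(count_parameters(hubert_base()))
print(count_parameters(hubert_large()))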
680
+ def _init_hubert_pretrain_model(module):
681
+ if isinstance(module, components.ConvLayerBlock):
682
+ torch.nn.init.kaiming_normal_(module.conv.weight)
683
+ elif isinstance(module, components.ConvolutionalPositionalEmbedding):
684
+ # normalize the weight to normal distribution.
685
+ std = math.sqrt(4.0 / (module.embed_dim * module.kernel_size))
686
+ torch.nn.init.normal_(module.conv.weight, mean=0.0, std=std)
687
+ torch.nn.init.constant_(module.conv.bias, 0.0)
688
+ elif isinstance(module, components.SelfAttention):
689
+ # normalize the query, key, value, and out_proj parameters in self attention module.
690
+ torch.nn.init.xavier_uniform_(module.k_proj.weight, gain=1 / math.sqrt(2))
691
+ torch.nn.init.xavier_uniform_(module.v_proj.weight, gain=1 / math.sqrt(2))
692
+ torch.nn.init.xavier_uniform_(module.q_proj.weight, gain=1 / math.sqrt(2))
693
+ torch.nn.init.xavier_uniform_(module.out_proj.weight)
694
+ torch.nn.init.constant_(module.out_proj.bias, 0.0)
695
+ elif isinstance(module, components.Transformer):
696
+ module.apply(components._init_transformer_params)
697
+ else:
698
+ pass
699
+
700
+
701
+ def hubert_pretrain_model(
702
+ extractor_mode: str,
703
+ extractor_conv_layer_config: Optional[List[Tuple[int, int, int]]],
704
+ extractor_conv_bias: bool,
705
+ encoder_embed_dim: int,
706
+ encoder_projection_dropout: float,
707
+ encoder_pos_conv_kernel: int,
708
+ encoder_pos_conv_groups: int,
709
+ encoder_num_layers: int,
710
+ encoder_num_heads: int,
711
+ encoder_attention_dropout: float,
712
+ encoder_ff_interm_features: int,
713
+ encoder_ff_interm_dropout: float,
714
+ encoder_dropout: float,
715
+ encoder_layer_norm_first: bool,
716
+ encoder_layer_drop: float,
717
+ mask_prob: float,
718
+ mask_selection: str,
719
+ mask_other: float,
720
+ mask_length: int,
721
+ no_mask_overlap: bool,
722
+ mask_min_space: int,
723
+ mask_channel_prob: float,
724
+ mask_channel_selection: str,
725
+ mask_channel_other: float,
726
+ mask_channel_length: int,
727
+ no_mask_channel_overlap: bool,
728
+ mask_channel_min_space: int,
729
+ skip_masked: bool,
730
+ skip_nomask: bool,
731
+ num_classes: int,
732
+ final_dim: int,
733
+ feature_grad_mult: Optional[float],
734
+ ) -> HuBERTPretrainModel:
735
+ """Builds custom :class:`HuBERTPretrainModel` for training from scratch
736
+
737
+ Note:
738
+ The "feature extractor" below corresponds to
739
+ `ConvFeatureExtractionModel <https://github.com/pytorch/fairseq/blob/dd3bd3c0497ae9a7ae7364404a6b0a4c501780b3/fairseq/models/wav2vec/wav2vec2.py#L736>`__
740
+ in the original ``fairseq`` implementation.
741
+ This is referred to as "(convolutional) feature encoder" in the *wav2vec 2.0*
742
+ :cite:`baevski2020wav2vec` paper.
743
+
744
+ The "encoder" below corresponds to `TransformerEncoder <https://github.com/pytorch/fairseq/blob/dd3bd3c0497ae9a7ae7364404a6b0a4c501780b3/fairseq/models/wav2vec/wav2vec2.py#L817>`__,
745
+ and this is referred to as "Transformer" in the paper.
746
+
747
+ Args:
748
+ extractor_mode (str): Operation mode of feature extractor.
749
+ Valid values are ``"group_norm"`` or ``"layer_norm"``.
750
+ If ``"group_norm"``, then a single normalization is applied
751
+ in the first convolution block. Otherwise, all the convolution
752
+ blocks will have layer normalization.
753
+
754
+ This option corresponds to ``extractor_mode`` from ``fairseq``.
755
+
756
+ extractor_conv_layer_config (list of integer tuples or None):
757
+ Configuration of convolution layers in feature extractor.
758
+ List of convolution configuration,
759
+ i.e. ``[(output_channel, kernel_size, stride), ...]``
760
+
761
+ If ``None`` is provided, then the following default value is used.
762
+
763
+ .. code-block:: python
764
+
765
+ [
766
+ (512, 10, 5),
767
+ (512, 3, 2),
768
+ (512, 3, 2),
769
+ (512, 3, 2),
770
+ (512, 3, 2),
771
+ (512, 2, 2),
772
+ (512, 2, 2),
773
+ ]
774
+
775
+ This option corresponds to ``conv_feature_layers`` from ``fairseq``.
776
+
777
+ extractor_conv_bias (bool):
778
+ Whether to include bias term to each convolution operation.
779
+
780
+ This option corresponds to ``conv_bias`` from ``fairseq``.
781
+
782
+ encoder_embed_dim (int):
783
+ The dimension of embedding in encoder.
784
+
785
+ This option corresponds to ``encoder_embed_dim`` from ``fairseq``.
786
+
787
+ encoder_projection_dropout (float):
788
+ The dropout probability applied after the input feature is projected
789
+ to ``encoder_embed_dim``.
790
+
791
+ This option corresponds to ``dropout_input`` from ``fairseq``.
792
+
793
+ encoder_pos_conv_kernel (int):
794
+ The kernel size of convolutional positional embeddings.
795
+
796
+ This option corresponds to ``conv_pos`` from ``fairseq``.
797
+
798
+ encoder_pos_conv_groups (int):
799
+ The number of groups of convolutional positional embeddings.
800
+
801
+ This option corresponds to ``conv_pos_groups`` from ``fairseq``.
802
+
803
+ encoder_num_layers (int):
804
+ The number of self attention layers in transformer block.
805
+
806
+ This option corresponds to ``encoder_layers`` from ``fairseq``.
807
+
808
+ encoder_num_heads (int):
809
+ The number of heads in self attention layers.
810
+
811
+ This option corresponds to ``encoder_attention_heads`` from ``fairseq``.
812
+
813
+ encoder_attention_dropout (float):
814
+ The dropout probability applied after softmax in self-attention layer.
815
+
816
+ This option corresponds to ``attention_dropout`` from ``fairseq``.
817
+
818
+ encoder_ff_interm_features (int):
819
+ The dimension of hidden features in feed forward layer.
820
+
821
+ This option corresponds to ``encoder_ffn_embed_dim`` from ``fairseq``.
822
+
823
+ encoder_ff_interm_dropout (float):
824
+ The dropout probability applied in feedforward layer.
825
+
826
+ This option corresponds to ``activation_dropout`` from ``fairseq``.
827
+
828
+ encoder_dropout (float):
829
+ The dropout probability applied at the end of feed forward layer.
830
+
831
+ This option corresponds to ``dropout`` from ``fairseq``.
832
+
833
+ encoder_layer_norm_first (bool):
834
+ Control the order of layer norm in transformer layer and each encoder layer.
835
+ If True, in transformer layer, layer norm is applied before features are fed
836
+ to encoder layers. In encoder layer, two layer norms are applied before and after
837
+ self attention.
838
+ If False, in transformer layer, layer norm is applied after features are fed
839
+ to encoder layers. In encoder layer, two layer norms are applied after self
840
+ attention, before and after feed forward.
841
+
842
+ This option corresponds to ``layer_norm_first`` from ``fairseq``.
843
+
844
+ encoder_layer_drop (float):
845
+ Probability to drop each encoder layer during training.
846
+
847
+ This option corresponds to ``layerdrop`` from ``fairseq``.
848
+
849
+ mask_prob (float):
850
+ Probability for each token to be chosen as the start of a span to be masked. This will be multiplied by the
851
+ number of timesteps divided by the length of the mask span to mask approximately this percentage of all elements.
852
+ However, due to overlaps, the actual number will be smaller (unless ``no_mask_overlap`` is True).
853
+
854
+ This option corresponds to ``mask_prob`` from ``fairseq``.
855
+
856
+ mask_selection (str):
857
+ How to choose the mask length. Options: [``static``, ``uniform``, ``normal``, ``poisson``].
858
+
859
+ This option corresponds to ``mask_selection`` from ``fairseq``.
860
+
861
+ mask_other (float):
862
+ Secondary mask argument (used for more complex distributions).
863
+
864
+ This option corresponds to ``mask_other`` from ``fairseq``.
865
+
866
+ mask_length (int):
867
+ The length of each mask span.
868
+
869
+ This option corresponds to ``mask_length`` from ``fairseq``.
870
+
871
+ no_mask_overlap (bool):
872
+ Whether to allow masks to overlap.
873
+
874
+ This option corresponds to ``no_mask_overlap`` from ``fairseq``.
875
+
876
+ mask_min_space (int):
877
+ Minimum space between spans (if no overlap is enabled).
878
+
879
+ This option corresponds to ``mask_min_space`` from ``fairseq``.
880
+
881
+ mask_channel_prob (float):
882
+ The probability of replacing a feature with 0.
883
+
884
+ This option corresponds to ``mask_channel_prob`` from ``fairseq``.
885
+
886
+ mask_channel_selection (str):
887
+ How to choose the mask length for channel masking. Options: [``static``, ``uniform``, ``normal``, ``poisson``].
888
+
889
+ This option corresponds to ``mask_channel_selection`` from ``fairseq``.
890
+
891
+ mask_channel_other (float):
892
+ Secondary mask argument for channel masking (used for more complex distributions).
893
+
894
+ This option corresponds to ``mask_channel_other`` from ``fairseq``.
895
+
896
+ mask_channel_length (int):
897
+ The length of each mask span applied along the channel axis.
898
+
899
+ This option corresponds to ``mask_channel_length`` from ``fairseq``.
900
+
901
+ no_mask_channel_overlap (bool):
902
+ Whether to allow channel masks to overlap.
903
+
904
+ This option corresponds to ``no_mask_channel_overlap`` from ``fairseq``.
905
+
906
+ mask_channel_min_space (int):
907
+ Minimum space between spans for channel masking (if no overlap is enabled).
908
+
909
+ This option corresponds to ``mask_channel_min_space`` from ``fairseq``.
910
+
911
+ skip_masked (bool):
912
+ If True, skip computing losses over masked frames.
913
+
914
+ This option corresponds to ``skip_masked`` from ``fairseq``.
915
+
916
+ skip_nomask (bool):
917
+ If True, skip computing losses over unmasked frames.
918
+
919
+ This option corresponds to ``skip_nomask`` from ``fairseq``.
920
+
921
+ num_classes (int):
922
+ The number of classes in the labels.
923
+
924
+ final_dim (int):
925
+ Project final representations and targets to `final_dim`.
926
+
927
+ This option corresponds to ``final_dim`` from ``fairseq``.
928
+
929
+ feature_grad_mult (float or None):
930
+ The factor to scale the convolutional feature extraction layer gradients by.
931
+ The scale factor will not affect the forward pass.
932
+
933
+ This option corresponds to ``feature_grad_mult`` from ``fairseq``.
934
+
935
+ Returns:
936
+ HuBERTPretrainModel:
937
+ The resulting model.
938
+ """ # noqa: E501
939
+ if extractor_conv_layer_config is None:
940
+ extractor_conv_layer_config = [(512, 10, 5)] + [(512, 3, 2)] * 4 + [(512, 2, 2)] * 2
941
+
942
+ feature_extractor = components._get_feature_extractor(
943
+ extractor_mode, extractor_conv_layer_config, extractor_conv_bias
944
+ )
945
+ encoder = components._get_encoder(
946
+ in_features=extractor_conv_layer_config[-1][0],
947
+ embed_dim=encoder_embed_dim,
948
+ dropout_input=encoder_projection_dropout,
949
+ pos_conv_kernel=encoder_pos_conv_kernel,
950
+ pos_conv_groups=encoder_pos_conv_groups,
951
+ num_layers=encoder_num_layers,
952
+ num_heads=encoder_num_heads,
953
+ attention_dropout=encoder_attention_dropout,
954
+ ff_interm_features=encoder_ff_interm_features,
955
+ ff_interm_dropout=encoder_ff_interm_dropout,
956
+ dropout=encoder_dropout,
957
+ layer_norm_first=encoder_layer_norm_first,
958
+ layer_drop=encoder_layer_drop,
959
+ )
960
+ wav2vec2 = Wav2Vec2Model(feature_extractor, encoder)
961
+ mask_generator = components.MaskGenerator(
962
+ encoder_embed_dim,
963
+ mask_prob,
964
+ mask_selection,
965
+ mask_other,
966
+ mask_length,
967
+ no_mask_overlap,
968
+ mask_min_space,
969
+ mask_channel_prob,
970
+ mask_channel_selection,
971
+ mask_channel_other,
972
+ mask_channel_length,
973
+ no_mask_channel_overlap,
974
+ mask_channel_min_space,
975
+ )
976
+ logit_generator = components.LogitGenerator(
977
+ encoder_embed_dim,
978
+ num_classes,
979
+ final_dim,
980
+ skip_masked,
981
+ skip_nomask,
982
+ )
983
+ model = HuBERTPretrainModel(
984
+ wav2vec2=wav2vec2,
985
+ mask_generator=mask_generator,
986
+ logit_generator=logit_generator,
987
+ feature_grad_mult=feature_grad_mult,
988
+ )
989
+ # initialize the model for pre-training
990
+ model.apply(_init_hubert_pretrain_model)
991
+ return model
992
+
993
+
994
+ def hubert_pretrain_base(
995
+ encoder_projection_dropout: float = 0.1,
996
+ encoder_attention_dropout: float = 0.1,
997
+ encoder_ff_interm_dropout: float = 0.0,
998
+ encoder_dropout: float = 0.1,
999
+ encoder_layer_drop: float = 0.05,
1000
+ mask_prob: float = 0.8,
1001
+ mask_channel_prob: float = 0.0,
1002
+ mask_channel_length: int = 10,
1003
+ feature_grad_mult: Optional[float] = 0.1,
1004
+ num_classes: int = 100,
1005
+ ) -> HuBERTPretrainModel:
1006
+ """Builds "base" :class:`HuBERTPretrainModel` from *HuBERT* :cite:`hsu2021hubert` for pretraining.
1007
+
1008
+ Args:
1009
+ encoder_projection_dropout (float):
1010
+ See :py:func:`hubert_pretrain_model`.
1011
+ encoder_attention_dropout (float):
1012
+ See :py:func:`hubert_pretrain_model`.
1013
+ encoder_ff_interm_dropout (float):
1014
+ See :py:func:`hubert_pretrain_model`.
1015
+ encoder_dropout (float):
1016
+ See :py:func:`hubert_pretrain_model`.
1017
+ encoder_layer_drop (float):
1018
+ See :py:func:`hubert_pretrain_model`.
1019
+ mask_prob (float):
1020
+ See :py:func:`hubert_pretrain_model`.
1021
+ mask_channel_prob (float):
1022
+ See :py:func:`hubert_pretrain_model`.
1023
+ mask_channel_length (int):
1024
+ See :py:func:`hubert_pretrain_model`.
1025
+ feature_grad_mult (float or None):
1026
+ See :py:func:`hubert_pretrain_model`.
1027
+ num_classes (int, optional):
1028
+ See :py:func:`hubert_pretrain_model`.
1029
+
1030
+ Returns:
1031
+ HuBERTPretrainModel:
1032
+ The resulting model.
1033
+ """ # noqa: E501
1034
+ return hubert_pretrain_model(
1035
+ extractor_mode="group_norm",
1036
+ extractor_conv_layer_config=None,
1037
+ extractor_conv_bias=False,
1038
+ encoder_embed_dim=768,
1039
+ encoder_projection_dropout=encoder_projection_dropout,
1040
+ encoder_pos_conv_kernel=128,
1041
+ encoder_pos_conv_groups=16,
1042
+ encoder_num_layers=12,
1043
+ encoder_num_heads=12,
1044
+ encoder_attention_dropout=encoder_attention_dropout,
1045
+ encoder_ff_interm_features=3072,
1046
+ encoder_ff_interm_dropout=encoder_ff_interm_dropout,
1047
+ encoder_dropout=encoder_dropout,
1048
+ encoder_layer_norm_first=False,
1049
+ encoder_layer_drop=encoder_layer_drop,
1050
+ mask_prob=mask_prob,
1051
+ mask_selection="static",
1052
+ mask_other=0.0,
1053
+ mask_length=10,
1054
+ no_mask_overlap=False,
1055
+ mask_min_space=1,
1056
+ mask_channel_prob=mask_channel_prob,
1057
+ mask_channel_selection="static",
1058
+ mask_channel_other=0.0,
1059
+ mask_channel_length=mask_channel_length,
1060
+ no_mask_channel_overlap=False,
1061
+ mask_channel_min_space=1,
1062
+ skip_masked=False,
1063
+ skip_nomask=False,
1064
+ num_classes=num_classes,
1065
+ final_dim=256,
1066
+ feature_grad_mult=feature_grad_mult,
1067
+ )
1068
+
1069
+
1070
+ def hubert_pretrain_large(
1071
+ encoder_projection_dropout: float = 0.0,
1072
+ encoder_attention_dropout: float = 0.0,
1073
+ encoder_ff_interm_dropout: float = 0.0,
1074
+ encoder_dropout: float = 0.0,
1075
+ encoder_layer_drop: float = 0.0,
1076
+ mask_prob: float = 0.8,
1077
+ mask_channel_prob: float = 0.0,
1078
+ mask_channel_length: int = 10,
1079
+ feature_grad_mult: Optional[float] = None,
1080
+ ) -> HuBERTPretrainModel:
1081
+ """Builds "large" :class:`HuBERTPretrainModel` from *HuBERT* :cite:`hsu2021hubert` for pretraining.
1082
+
1083
+ Args:
1084
+ encoder_projection_dropout (float):
1085
+ See :py:func:`hubert_pretrain_model`.
1086
+ encoder_attention_dropout (float):
1087
+ See :py:func:`hubert_pretrain_model`.
1088
+ encoder_ff_interm_dropout (float):
1089
+ See :py:func:`hubert_pretrain_model`.
1090
+ encoder_dropout (float):
1091
+ See :py:func:`hubert_pretrain_model`.
1092
+ encoder_layer_drop (float):
1093
+ See :py:func:`hubert_pretrain_model`.
1094
+ mask_prob (float):
1095
+ See :py:func:`hubert_pretrain_model`.
1096
+ mask_channel_prob (float):
1097
+ See :py:func:`hubert_pretrain_model`.
1098
+ mask_channel_length (int):
1099
+ See :py:func:`hubert_pretrain_model`.
1100
+ feature_grad_mult (float or None):
1101
+ See :py:func:`hubert_pretrain_model`.
1102
+
1103
+ Returns:
1104
+ HuBERTPretrainModel:
1105
+ The resulting model.
1106
+ """ # noqa: E501
1107
+ return hubert_pretrain_model(
1108
+ extractor_mode="layer_norm",
1109
+ extractor_conv_layer_config=None,
1110
+ extractor_conv_bias=False,
1111
+ encoder_embed_dim=1024,
1112
+ encoder_projection_dropout=encoder_projection_dropout,
1113
+ encoder_pos_conv_kernel=128,
1114
+ encoder_pos_conv_groups=16,
1115
+ encoder_num_layers=24,
1116
+ encoder_num_heads=16,
1117
+ encoder_attention_dropout=encoder_attention_dropout,
1118
+ encoder_ff_interm_features=4096,
1119
+ encoder_ff_interm_dropout=encoder_ff_interm_dropout,
1120
+ encoder_dropout=encoder_dropout,
1121
+ encoder_layer_norm_first=True,
1122
+ encoder_layer_drop=encoder_layer_drop,
1123
+ mask_prob=mask_prob,
1124
+ mask_selection="static",
1125
+ mask_other=0.0,
1126
+ mask_length=10,
1127
+ no_mask_overlap=False,
1128
+ mask_min_space=1,
1129
+ mask_channel_prob=mask_channel_prob,
1130
+ mask_channel_selection="static",
1131
+ mask_channel_other=0.0,
1132
+ mask_channel_length=mask_channel_length,
1133
+ no_mask_channel_overlap=False,
1134
+ mask_channel_min_space=1,
1135
+ skip_masked=False,
1136
+ skip_nomask=False,
1137
+ num_classes=500,
1138
+ final_dim=768,
1139
+ feature_grad_mult=feature_grad_mult,
1140
+ )
1141
+
1142
+
1143
+ def hubert_pretrain_xlarge(
1144
+ encoder_projection_dropout: float = 0.0,
1145
+ encoder_attention_dropout: float = 0.0,
1146
+ encoder_ff_interm_dropout: float = 0.0,
1147
+ encoder_dropout: float = 0.0,
1148
+ encoder_layer_drop: float = 0.0,
1149
+ mask_prob: float = 0.8,
1150
+ mask_channel_prob: float = 0.0,
1151
+ mask_channel_length: int = 10,
1152
+ feature_grad_mult: Optional[float] = None,
1153
+ ) -> HuBERTPretrainModel:
1154
+ """Builds "extra large" :class:`HuBERTPretrainModel` from *HuBERT* :cite:`hsu2021hubert` for pretraining.
1155
+
1156
+ Args:
1157
+ encoder_projection_dropout (float):
1158
+ See :py:func:`hubert_pretrain_model`.
1159
+ encoder_attention_dropout (float):
1160
+ See :py:func:`hubert_pretrain_model`.
1161
+ encoder_ff_interm_dropout (float):
1162
+ See :py:func:`hubert_pretrain_model`.
1163
+ encoder_dropout (float):
1164
+ See :py:func:`hubert_pretrain_model`.
1165
+ encoder_layer_drop (float):
1166
+ See :py:func:`hubert_pretrain_model`.
1167
+ mask_prob (float):
1168
+ See :py:func:`hubert_pretrain_model`.
1169
+ mask_channel_prob (float):
1170
+ See :py:func:`hubert_pretrain_model`.
1171
+ mask_channel_length (int):
1172
+ See :py:func:`hubert_pretrain_model`.
1173
+ feature_grad_mult (float or None):
1174
+ See :py:func:`hubert_pretrain_model`.
1175
+
1176
+ Returns:
1177
+ HuBERTPretrainModel:
1178
+ The resulting model.
1179
+ """ # noqa: E501
1180
+ return hubert_pretrain_model(
1181
+ extractor_mode="layer_norm",
1182
+ extractor_conv_layer_config=None,
1183
+ extractor_conv_bias=False,
1184
+ encoder_embed_dim=1280,
1185
+ encoder_projection_dropout=encoder_projection_dropout,
1186
+ encoder_pos_conv_kernel=128,
1187
+ encoder_pos_conv_groups=16,
1188
+ encoder_num_layers=48,
1189
+ encoder_num_heads=16,
1190
+ encoder_attention_dropout=encoder_attention_dropout,
1191
+ encoder_ff_interm_features=5120,
1192
+ encoder_ff_interm_dropout=encoder_ff_interm_dropout,
1193
+ encoder_dropout=encoder_dropout,
1194
+ encoder_layer_norm_first=True,
1195
+ encoder_layer_drop=encoder_layer_drop,
1196
+ mask_prob=mask_prob,
1197
+ mask_selection="static",
1198
+ mask_other=0.0,
1199
+ mask_length=10,
1200
+ no_mask_overlap=False,
1201
+ mask_min_space=1,
1202
+ mask_channel_prob=mask_channel_prob,
1203
+ mask_channel_selection="static",
1204
+ mask_channel_other=0.0,
1205
+ mask_channel_length=mask_channel_length,
1206
+ no_mask_channel_overlap=False,
1207
+ mask_channel_min_space=1,
1208
+ skip_masked=False,
1209
+ skip_nomask=False,
1210
+ num_classes=500,
1211
+ final_dim=1024,
1212
+ feature_grad_mult=feature_grad_mult,
1213
+ )
1214
+
1215
+
1216
+ def wavlm_model(
1217
+ extractor_mode: str,
1218
+ extractor_conv_layer_config: Optional[List[Tuple[int, int, int]]],
1219
+ extractor_conv_bias: bool,
1220
+ encoder_embed_dim: int,
1221
+ encoder_projection_dropout: float,
1222
+ encoder_pos_conv_kernel: int,
1223
+ encoder_pos_conv_groups: int,
1224
+ encoder_num_layers: int,
1225
+ encoder_num_heads: int,
1226
+ encoder_num_buckets: int,
1227
+ encoder_max_distance: int,
1228
+ encoder_attention_dropout: float,
1229
+ encoder_ff_interm_features: int,
1230
+ encoder_ff_interm_dropout: float,
1231
+ encoder_dropout: float,
1232
+ encoder_layer_norm_first: bool,
1233
+ encoder_layer_drop: float,
1234
+ aux_num_out: Optional[int],
1235
+ ) -> Wav2Vec2Model:
1236
+ """Builds custom WaveLM model :cite:`chen2022wavlm`. The architecture is compatible
1237
+ with Wav2Vec2 model :cite:`baevski2020wav2vec`, and so the output object is
1238
+ :class:`~torchaudio.models.Wav2Vec2Model`. Most of the arguments have the same meaning
1239
+ as in :py:func:`~torchaudio.models.wav2vec2_model` so please refer there for documentation.
1240
+
1241
+ Args:
1242
+ extractor_mode (str): Operation mode of feature extractor.
1243
+ See :py:func:`~torchaudio.models.wav2vec2_model`.
1244
+
1245
+ extractor_conv_layer_config (list of integer tuples or None):
1246
+ See :py:func:`~torchaudio.models.wav2vec2_model`.
1247
+
1248
+ extractor_conv_bias (bool):
1249
+ See :py:func:`~torchaudio.models.wav2vec2_model`.
1250
+
1251
+ encoder_embed_dim (int):
1252
+ See :py:func:`~torchaudio.models.wav2vec2_model`.
1253
+
1254
+ encoder_projection_dropout (float):
1255
+ See :py:func:`~torchaudio.models.wav2vec2_model`.
1256
+
1257
+ encoder_pos_conv_kernel (int):
1258
+ See :py:func:`~torchaudio.models.wav2vec2_model`.
1259
+
1260
+ encoder_pos_conv_groups (int):
1261
+ See :py:func:`~torchaudio.models.wav2vec2_model`.
1262
+
1263
+ encoder_num_layers (int):
1264
+ See :py:func:`~torchaudio.models.wav2vec2_model`.
1265
+
1266
+ encoder_num_heads (int):
1267
+ See :py:func:`~torchaudio.models.wav2vec2_model`.
1268
+
1269
+ encoder_num_buckets (int):
1270
+ Number of buckets for relative position embedding.
1271
+ encoder_max_distance (int):
1272
+ Maximum distance for relative position embedding.
1273
+
1274
+ encoder_attention_dropout (float):
1275
+ See :py:func:`~torchaudio.models.wav2vec2_model`.
1276
+
1277
+ encoder_ff_interm_features (int):
1278
+ See :py:func:`~torchaudio.models.wav2vec2_model`.
1279
+
1280
+ encoder_ff_interm_dropout (float):
1281
+ See :py:func:`~torchaudio.models.wav2vec2_model`.
1282
+
1283
+ encoder_dropout (float):
1284
+ See :py:func:`~torchaudio.models.wav2vec2_model`.
1285
+
1286
+ encoder_layer_norm_first (bool):
1287
+ See :py:func:`~torchaudio.models.wav2vec2_model`.
1288
+
1289
+ encoder_layer_drop (float):
1290
+ See :py:func:`~torchaudio.models.wav2vec2_model`.
1291
+
1292
+ aux_num_out (int or None):
1293
+ See :py:func:`~torchaudio.models.wav2vec2_model`.
1294
+
1295
+ Returns:
1296
+ Wav2Vec2Model:
1297
+ The resulting model.
1298
+ """
1299
+ if extractor_conv_layer_config is None:
1300
+ extractor_conv_layer_config = [(512, 10, 5)] + [(512, 3, 2)] * 4 + [(512, 2, 2)] * 2
1301
+
1302
+ feature_extractor = components._get_feature_extractor(
1303
+ extractor_mode, extractor_conv_layer_config, extractor_conv_bias
1304
+ )
1305
+ encoder = components._get_wavlm_encoder(
1306
+ in_features=extractor_conv_layer_config[-1][0],
1307
+ embed_dim=encoder_embed_dim,
1308
+ dropout_input=encoder_projection_dropout,
1309
+ pos_conv_kernel=encoder_pos_conv_kernel,
1310
+ pos_conv_groups=encoder_pos_conv_groups,
1311
+ num_layers=encoder_num_layers,
1312
+ num_heads=encoder_num_heads,
1313
+ num_buckets=encoder_num_buckets,
1314
+ max_distance=encoder_max_distance,
1315
+ attention_dropout=encoder_attention_dropout,
1316
+ ff_interm_features=encoder_ff_interm_features,
1317
+ ff_interm_dropout=encoder_ff_interm_dropout,
1318
+ dropout=encoder_dropout,
1319
+ layer_norm_first=encoder_layer_norm_first,
1320
+ layer_drop=encoder_layer_drop,
1321
+ )
1322
+ aux = None
1323
+ if aux_num_out is not None:
1324
+ aux = torch.nn.Linear(in_features=encoder_embed_dim, out_features=aux_num_out)
1325
+ return Wav2Vec2Model(feature_extractor, encoder, aux)
1326
+
1327
+
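Because the returned object is a plain :class:`~torchaudio.models.Wav2Vec2Model`, a WavLM built with the ``wavlm_base`` factory below is used exactly like the wav2vec 2.0 models; a minimal sketch with untrained weights and random input:

import torch
from torchaudio.models import wavlm_base

model = wavlm_base().eval()
waveforms = torch.randn(1, 16000)

with torch.inference_mode():
    # One output per transformer layer when num_layers is not given.
    features, _ = model.extract_features(waveforms)

print(len(features), features[0].shape)  # 12 layers, each (1, num_frames, 768)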
1328
+ def wavlm_base(
1329
+ encoder_projection_dropout: float = 0.1,
1330
+ encoder_attention_dropout: float = 0.1,
1331
+ encoder_ff_interm_dropout: float = 0.1,
1332
+ encoder_dropout: float = 0.1,
1333
+ encoder_layer_drop: float = 0.1,
1334
+ aux_num_out: Optional[int] = None,
1335
+ ) -> Wav2Vec2Model:
1336
+ """Builds "base" WaveLM model :cite:`chen2022wavlm`. The architecture is compatible
1337
+ with Wav2Vec2 model :cite:`baevski2020wav2vec`, and so the output class is
1338
+ :class:`~torchaudio.models.Wav2Vec2Model`.
1339
+
1340
+ Args:
1341
+ encoder_projection_dropout (float):
1342
+ See :py:func:`~torchaudio.models.wav2vec2_model`.
1343
+ encoder_attention_dropout (float):
1344
+ See :py:func:`~torchaudio.models.wav2vec2_model`.
1345
+ encoder_ff_interm_dropout (float):
1346
+ See :py:func:`~torchaudio.models.wav2vec2_model`.
1347
+ encoder_dropout (float):
1348
+ See :py:func:`~torchaudio.models.wav2vec2_model`.
1349
+ encoder_layer_drop (float):
1350
+ See :py:func:`~torchaudio.models.wav2vec2_model`.
1351
+ aux_num_out (int, optional):
1352
+ See :py:func:`~torchaudio.models.wav2vec2_model`.
1353
+
1354
+ Returns:
1355
+ Wav2Vec2Model:
1356
+ The resulting model.
1357
+ """
1358
+ return wavlm_model(
1359
+ extractor_mode="group_norm",
1360
+ extractor_conv_layer_config=None,
1361
+ extractor_conv_bias=False,
1362
+ encoder_embed_dim=768,
1363
+ encoder_projection_dropout=encoder_projection_dropout,
1364
+ encoder_pos_conv_kernel=128,
1365
+ encoder_pos_conv_groups=16,
1366
+ encoder_num_layers=12,
1367
+ encoder_num_heads=12,
1368
+ encoder_num_buckets=320,
1369
+ encoder_max_distance=800,
1370
+ encoder_attention_dropout=encoder_attention_dropout,
1371
+ encoder_ff_interm_features=3072,
1372
+ encoder_ff_interm_dropout=encoder_ff_interm_dropout,
1373
+ encoder_dropout=encoder_dropout,
1374
+ encoder_layer_norm_first=False,
1375
+ encoder_layer_drop=encoder_layer_drop,
1376
+ aux_num_out=aux_num_out,
1377
+ )
1378
+
1379
+
1380
+ def wavlm_large(
1381
+ encoder_projection_dropout: float = 0.1,
1382
+ encoder_attention_dropout: float = 0.1,
1383
+ encoder_ff_interm_dropout: float = 0.0,
1384
+ encoder_dropout: float = 0.1,
1385
+ encoder_layer_drop: float = 0.1,
1386
+ aux_num_out: Optional[int] = None,
1387
+ ) -> Wav2Vec2Model:
1388
+ """Builds "large" WaveLM model :cite:`chen2022wavlm`. The architecture is compatible
1389
+ with Wav2Vec2 model :cite:`baevski2020wav2vec`, and so the output class is
1390
+ :class:`~torchaudio.models.Wav2Vec2Model`.
1391
+
1392
+ Args:
1393
+ encoder_projection_dropout (float):
1394
+ See :py:func:`~torchaudio.models.wav2vec2_model`.
1395
+ encoder_attention_dropout (float):
1396
+ See :py:func:`~torchaudio.models.wav2vec2_model`.
1397
+ encoder_ff_interm_dropout (float):
1398
+ See :py:func:`~torchaudio.models.wav2vec2_model`.
1399
+ encoder_dropout (float):
1400
+ See :py:func:`~torchaudio.models.wav2vec2_model`.
1401
+ encoder_layer_drop (float):
1402
+ See :py:func:`~torchaudio.models.wav2vec2_model`.
1403
+ aux_num_out (int, optional):
1404
+ See :py:func:`~torchaudio.models.wav2vec2_model`.
1405
+
1406
+ Returns:
1407
+ Wav2Vec2Model:
1408
+ The resulting model.
1409
+ """
1410
+ return wavlm_model(
1411
+ extractor_mode="layer_norm",
1412
+ extractor_conv_layer_config=None,
1413
+ extractor_conv_bias=False,
1414
+ encoder_embed_dim=1024,
1415
+ encoder_projection_dropout=encoder_projection_dropout,
1416
+ encoder_pos_conv_kernel=128,
1417
+ encoder_pos_conv_groups=16,
1418
+ encoder_num_layers=24,
1419
+ encoder_num_heads=16,
1420
+ encoder_num_buckets=320,
1421
+ encoder_max_distance=800,
1422
+ encoder_attention_dropout=encoder_attention_dropout,
1423
+ encoder_ff_interm_features=4096,
1424
+ encoder_ff_interm_dropout=encoder_ff_interm_dropout,
1425
+ encoder_dropout=encoder_dropout,
1426
+ encoder_layer_norm_first=True,
1427
+ encoder_layer_drop=encoder_layer_drop,
1428
+ aux_num_out=aux_num_out,
1429
+ )
1430
+
1431
+
1432
+ def wav2vec2_xlsr_300m(
1433
+ encoder_projection_dropout: float = 0.0,
1434
+ encoder_attention_dropout: float = 0.0,
1435
+ encoder_ff_interm_dropout: float = 0.0,
1436
+ encoder_dropout: float = 0.0,
1437
+ encoder_layer_drop: float = 0.0,
1438
+ aux_num_out: Optional[int] = None,
1439
+ ) -> Wav2Vec2Model:
1440
+ """Builds XLS-R model :cite:`babu2021xls` with 300 millions of parameters. The architecture is compatible
1441
+ with Wav2Vec2 model :cite:`baevski2020wav2vec`, and so the output class is
1442
+ :class:`~torchaudio.models.Wav2Vec2Model`.
1443
+
1444
+ Args:
1445
+ encoder_projection_dropout (float):
1446
+ See :py:func:`~torchaudio.models.wav2vec2_model`.
1447
+ encoder_attention_dropout (float):
1448
+ See :py:func:`~torchaudio.models.wav2vec2_model`.
1449
+ encoder_ff_interm_dropout (float):
1450
+ See :py:func:`~torchaudio.models.wav2vec2_model`.
1451
+ encoder_dropout (float):
1452
+ See :py:func:`~torchaudio.models.wav2vec2_model`.
1453
+ encoder_layer_drop (float):
1454
+ See :py:func:`~torchaudio.models.wav2vec2_model`.
1455
+ aux_num_out (int, optional):
1456
+ See :py:func:`~torchaudio.models.wav2vec2_model`.
1457
+
1458
+ Returns:
1459
+ Wav2Vec2Model:
1460
+ The resulting model.
1461
+ """
1462
+ return wav2vec2_model(
1463
+ extractor_mode="layer_norm",
1464
+ extractor_conv_layer_config=None,
1465
+ extractor_conv_bias=True,
1466
+ encoder_embed_dim=1024,
1467
+ encoder_projection_dropout=encoder_projection_dropout,
1468
+ encoder_pos_conv_kernel=128,
1469
+ encoder_pos_conv_groups=16,
1470
+ encoder_num_layers=24,
1471
+ encoder_num_heads=16,
1472
+ encoder_attention_dropout=encoder_attention_dropout,
1473
+ encoder_ff_interm_features=4096,
1474
+ encoder_ff_interm_dropout=encoder_ff_interm_dropout,
1475
+ encoder_dropout=encoder_dropout,
1476
+ encoder_layer_norm_first=True,
1477
+ encoder_layer_drop=encoder_layer_drop,
1478
+ aux_num_out=aux_num_out,
1479
+ )
1480
+
1481
+
1482
+ def wav2vec2_xlsr_1b(
1483
+ encoder_projection_dropout: float = 0.1,
1484
+ encoder_attention_dropout: float = 0.0,
1485
+ encoder_ff_interm_dropout: float = 0.0,
1486
+ encoder_dropout: float = 0.0,
1487
+ encoder_layer_drop: float = 0.0,
1488
+ aux_num_out: Optional[int] = None,
1489
+ ) -> Wav2Vec2Model:
1490
+ """Builds XLS-R model :cite:`babu2021xls` with 1 billion of parameters. The architecture is compatible
1491
+ with Wav2Vec2 model :cite:`baevski2020wav2vec`, and so the output class is
1492
+ :class:`~torchaudio.models.Wav2Vec2Model`.
1493
+
1494
+ Args:
1495
+ encoder_projection_dropout (float):
1496
+ See :py:func:`~torchaudio.models.wav2vec2_model`.
1497
+ encoder_attention_dropout (float):
1498
+ See :py:func:`~torchaudio.models.wav2vec2_model`.
1499
+ encoder_ff_interm_dropout (float):
1500
+ See :py:func:`~torchaudio.models.wav2vec2_model`.
1501
+ encoder_dropout (float):
1502
+ See :py:func:`~torchaudio.models.wav2vec2_model`.
1503
+ encoder_layer_drop (float):
1504
+ See :py:func:`~torchaudio.models.wav2vec2_model`.
1505
+ aux_num_out (int, optional):
1506
+ See :py:func:`~torchaudio.models.wav2vec2_model`.
1507
+
1508
+ Returns:
1509
+ Wav2Vec2Model:
1510
+ The resulting model.
1511
+ """
1512
+ return wav2vec2_model(
1513
+ extractor_mode="layer_norm",
1514
+ extractor_conv_layer_config=None,
1515
+ extractor_conv_bias=True,
1516
+ encoder_embed_dim=1280,
1517
+ encoder_projection_dropout=encoder_projection_dropout,
1518
+ encoder_pos_conv_kernel=128,
1519
+ encoder_pos_conv_groups=16,
1520
+ encoder_num_layers=48,
1521
+ encoder_num_heads=16,
1522
+ encoder_attention_dropout=encoder_attention_dropout,
1523
+ encoder_ff_interm_features=5120,
1524
+ encoder_ff_interm_dropout=encoder_ff_interm_dropout,
1525
+ encoder_dropout=encoder_dropout,
1526
+ encoder_layer_norm_first=True,
1527
+ encoder_layer_drop=encoder_layer_drop,
1528
+ aux_num_out=aux_num_out,
1529
+ )
1530
+
1531
+
1532
+ def wav2vec2_xlsr_2b(
1533
+ encoder_projection_dropout: float = 0.1,
1534
+ encoder_attention_dropout: float = 0.0,
1535
+ encoder_ff_interm_dropout: float = 0.0,
1536
+ encoder_dropout: float = 0.0,
1537
+ encoder_layer_drop: float = 0.0,
1538
+ aux_num_out: Optional[int] = None,
1539
+ ) -> Wav2Vec2Model:
1540
+ """Builds XLS-R model :cite:`babu2021xls` with 2 billions of parameters. The architecture is compatible
1541
+ with Wav2Vec2 model :cite:`baevski2020wav2vec`, and so the output class is
1542
+ :class:`~torchaudio.models.Wav2Vec2Model`.
1543
+
1544
+ Args:
1545
+ encoder_projection_dropout (float):
1546
+ See :py:func:`~torchaudio.models.wav2vec2_model`.
1547
+ encoder_attention_dropout (float):
1548
+ See :py:func:`~torchaudio.models.wav2vec2_model`.
1549
+ encoder_ff_interm_dropout (float):
1550
+ See :py:func:`~torchaudio.models.wav2vec2_model`.
1551
+ encoder_dropout (float):
1552
+ See :py:func:`~torchaudio.models.wav2vec2_model`.
1553
+ encoder_layer_drop (float):
1554
+ See :py:func:`~torchaudio.models.wav2vec2_model`.
1555
+ aux_num_out (int, optional):
1556
+ See :py:func:`~torchaudio.models.wav2vec2_model`.
1557
+
1558
+ Returns:
1559
+ Wav2Vec2Model:
1560
+ The resulting model.
1561
+ """
1562
+ return wav2vec2_model(
1563
+ extractor_mode="layer_norm",
1564
+ extractor_conv_layer_config=None,
1565
+ extractor_conv_bias=True,
1566
+ encoder_embed_dim=1920,
1567
+ encoder_projection_dropout=encoder_projection_dropout,
1568
+ encoder_pos_conv_kernel=128,
1569
+ encoder_pos_conv_groups=16,
1570
+ encoder_num_layers=48,
1571
+ encoder_num_heads=16,
1572
+ encoder_attention_dropout=encoder_attention_dropout,
1573
+ encoder_ff_interm_features=7680,
1574
+ encoder_ff_interm_dropout=encoder_ff_interm_dropout,
1575
+ encoder_dropout=encoder_dropout,
1576
+ encoder_layer_norm_first=True,
1577
+ encoder_layer_drop=encoder_layer_drop,
1578
+ aux_num_out=aux_num_out,
1579
+ )
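The XLS-R builders share the same interface as the other factory functions, so attaching a task head for fine-tuning only requires ``aux_num_out``; a sketch with an illustrative 29-symbol character vocabulary (untrained weights, random input):

import torch
from torchaudio.models import wav2vec2_xlsr_300m

# Linear head over 29 output symbols, e.g. for CTC-style character prediction.
model = wav2vec2_xlsr_300m(aux_num_out=29).eval()

with torch.inference_mode():
    emissions, _ = model(torch.randn(1, 16000))

log_probs = torch.log_softmax(emissions, dim=-1)  # suitable input for torch.nn.CTCLoss
print(log_probs.shape)                            # (1, num_frames, 29)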