torchaudio-2.9.1-cp311-cp311-manylinux_2_28_aarch64.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (85)
  1. torchaudio/__init__.py +204 -0
  2. torchaudio/_extension/__init__.py +61 -0
  3. torchaudio/_extension/utils.py +133 -0
  4. torchaudio/_internal/__init__.py +10 -0
  5. torchaudio/_internal/module_utils.py +171 -0
  6. torchaudio/_torchcodec.py +340 -0
  7. torchaudio/compliance/__init__.py +5 -0
  8. torchaudio/compliance/kaldi.py +813 -0
  9. torchaudio/datasets/__init__.py +47 -0
  10. torchaudio/datasets/cmuarctic.py +157 -0
  11. torchaudio/datasets/cmudict.py +186 -0
  12. torchaudio/datasets/commonvoice.py +86 -0
  13. torchaudio/datasets/dr_vctk.py +121 -0
  14. torchaudio/datasets/fluentcommands.py +108 -0
  15. torchaudio/datasets/gtzan.py +1118 -0
  16. torchaudio/datasets/iemocap.py +147 -0
  17. torchaudio/datasets/librilight_limited.py +111 -0
  18. torchaudio/datasets/librimix.py +133 -0
  19. torchaudio/datasets/librispeech.py +174 -0
  20. torchaudio/datasets/librispeech_biasing.py +189 -0
  21. torchaudio/datasets/libritts.py +168 -0
  22. torchaudio/datasets/ljspeech.py +107 -0
  23. torchaudio/datasets/musdb_hq.py +139 -0
  24. torchaudio/datasets/quesst14.py +136 -0
  25. torchaudio/datasets/snips.py +157 -0
  26. torchaudio/datasets/speechcommands.py +183 -0
  27. torchaudio/datasets/tedlium.py +218 -0
  28. torchaudio/datasets/utils.py +54 -0
  29. torchaudio/datasets/vctk.py +143 -0
  30. torchaudio/datasets/voxceleb1.py +309 -0
  31. torchaudio/datasets/yesno.py +89 -0
  32. torchaudio/functional/__init__.py +130 -0
  33. torchaudio/functional/_alignment.py +128 -0
  34. torchaudio/functional/filtering.py +1685 -0
  35. torchaudio/functional/functional.py +2505 -0
  36. torchaudio/lib/__init__.py +0 -0
  37. torchaudio/lib/_torchaudio.so +0 -0
  38. torchaudio/lib/libtorchaudio.so +0 -0
  39. torchaudio/models/__init__.py +85 -0
  40. torchaudio/models/_hdemucs.py +1008 -0
  41. torchaudio/models/conformer.py +293 -0
  42. torchaudio/models/conv_tasnet.py +330 -0
  43. torchaudio/models/decoder/__init__.py +64 -0
  44. torchaudio/models/decoder/_ctc_decoder.py +568 -0
  45. torchaudio/models/decoder/_cuda_ctc_decoder.py +187 -0
  46. torchaudio/models/deepspeech.py +84 -0
  47. torchaudio/models/emformer.py +884 -0
  48. torchaudio/models/rnnt.py +816 -0
  49. torchaudio/models/rnnt_decoder.py +339 -0
  50. torchaudio/models/squim/__init__.py +11 -0
  51. torchaudio/models/squim/objective.py +326 -0
  52. torchaudio/models/squim/subjective.py +150 -0
  53. torchaudio/models/tacotron2.py +1046 -0
  54. torchaudio/models/wav2letter.py +72 -0
  55. torchaudio/models/wav2vec2/__init__.py +45 -0
  56. torchaudio/models/wav2vec2/components.py +1167 -0
  57. torchaudio/models/wav2vec2/model.py +1579 -0
  58. torchaudio/models/wav2vec2/utils/__init__.py +7 -0
  59. torchaudio/models/wav2vec2/utils/import_fairseq.py +213 -0
  60. torchaudio/models/wav2vec2/utils/import_huggingface.py +134 -0
  61. torchaudio/models/wav2vec2/wavlm_attention.py +214 -0
  62. torchaudio/models/wavernn.py +409 -0
  63. torchaudio/pipelines/__init__.py +102 -0
  64. torchaudio/pipelines/_source_separation_pipeline.py +109 -0
  65. torchaudio/pipelines/_squim_pipeline.py +156 -0
  66. torchaudio/pipelines/_tts/__init__.py +16 -0
  67. torchaudio/pipelines/_tts/impl.py +385 -0
  68. torchaudio/pipelines/_tts/interface.py +255 -0
  69. torchaudio/pipelines/_tts/utils.py +230 -0
  70. torchaudio/pipelines/_wav2vec2/__init__.py +0 -0
  71. torchaudio/pipelines/_wav2vec2/aligner.py +87 -0
  72. torchaudio/pipelines/_wav2vec2/impl.py +1699 -0
  73. torchaudio/pipelines/_wav2vec2/utils.py +346 -0
  74. torchaudio/pipelines/rnnt_pipeline.py +380 -0
  75. torchaudio/transforms/__init__.py +78 -0
  76. torchaudio/transforms/_multi_channel.py +467 -0
  77. torchaudio/transforms/_transforms.py +2138 -0
  78. torchaudio/utils/__init__.py +4 -0
  79. torchaudio/utils/download.py +89 -0
  80. torchaudio/version.py +2 -0
  81. torchaudio-2.9.1.dist-info/METADATA +133 -0
  82. torchaudio-2.9.1.dist-info/RECORD +85 -0
  83. torchaudio-2.9.1.dist-info/WHEEL +5 -0
  84. torchaudio-2.9.1.dist-info/licenses/LICENSE +25 -0
  85. torchaudio-2.9.1.dist-info/top_level.txt +1 -0
torchaudio/pipelines/_wav2vec2/impl.py
@@ -0,0 +1,1699 @@
1
+ from dataclasses import dataclass
2
+ from typing import Any, Dict, Optional, Tuple
3
+
4
+ from torch.nn import Module
5
+
6
+ from . import aligner, utils
7
+
8
+
9
+ __all__ = [] # type: ignore
10
+
11
+
12
+ @dataclass
13
+ class Wav2Vec2Bundle:
14
+ """Data class that bundles associated information to use pretrained :py:class:`~torchaudio.models.Wav2Vec2Model`.
15
+
16
+ This class provides interfaces for instantiating the pretrained model along with
17
+ the information necessary to retrieve pretrained weights and additional data
18
+ to be used with the model.
19
+
20
+ Torchaudio library instantiates objects of this class, each of which represents
21
+ a different pretrained model. Client code should access pretrained models via these
22
+ instances.
23
+
24
+ Please see below for the usage and the available values.
25
+
26
+ Example - Feature Extraction
27
+ >>> import torchaudio
28
+ >>>
29
+ >>> bundle = torchaudio.pipelines.HUBERT_BASE
30
+ >>>
31
+ >>> # Build the model and load pretrained weight.
32
+ >>> model = bundle.get_model()
33
+ Downloading:
34
+ 100%|███████████████████████████████| 360M/360M [00:06<00:00, 60.6MB/s]
35
+ >>>
36
+ >>> # Resample audio to the expected sampling rate
37
+ >>> waveform = torchaudio.functional.resample(waveform, sample_rate, bundle.sample_rate)
38
+ >>>
39
+ >>> # Extract acoustic features
40
+ >>> features, _ = model.extract_features(waveform)
41
+ """ # noqa: E501
42
+
43
+ _path: str
44
+ _params: Dict[str, Any]
45
+ _sample_rate: float
46
+ _normalize_waveform: bool
47
+ _model_type: str
48
+
49
+ @property
50
+ def sample_rate(self) -> float:
51
+ """Sample rate of the audio that the model is trained on.
52
+
53
+ :type: float
54
+ """
55
+ return self._sample_rate
56
+
57
+ def _get_state_dict(self, dl_kwargs):
58
+ # Note: This method is overridden in ASR bundle
59
+ return utils._get_state_dict(self._path, dl_kwargs)
60
+
61
+ def get_model(self, *, dl_kwargs=None) -> Module:
62
+ """Construct the model and load the pretrained weight.
63
+
64
+ The weight file is downloaded from the internet and cached with
65
+ :func:`torch.hub.load_state_dict_from_url`
66
+
67
+ Args:
68
+ dl_kwargs (dictionary of keyword arguments): Passed to :func:`torch.hub.load_state_dict_from_url`.
69
+
70
+ Returns:
71
+ Variation of :py:class:`~torchaudio.models.Wav2Vec2Model`.
72
+
73
+ For the models listed below, an additional layer normalization is performed on the input.
74
+
75
+ For all other models, a :py:class:`~torchaudio.models.Wav2Vec2Model` instance is returned.
76
+
77
+ - WAV2VEC2_LARGE_LV60K
78
+ - WAV2VEC2_ASR_LARGE_LV60K_10M
79
+ - WAV2VEC2_ASR_LARGE_LV60K_100H
80
+ - WAV2VEC2_ASR_LARGE_LV60K_960H
81
+ - WAV2VEC2_XLSR53
82
+ - WAV2VEC2_XLSR_300M
83
+ - WAV2VEC2_XLSR_1B
84
+ - WAV2VEC2_XLSR_2B
85
+ - HUBERT_LARGE
86
+ - HUBERT_XLARGE
87
+ - HUBERT_ASR_LARGE
88
+ - HUBERT_ASR_XLARGE
89
+ - WAVLM_LARGE
90
+ """
91
+ model = utils._get_model(self._model_type, self._params)
92
+ state_dict = self._get_state_dict(dl_kwargs)
93
+ model.load_state_dict(state_dict)
94
+ if self._normalize_waveform:
95
+ model = utils._extend_model(model, normalize_waveform=True)
96
+ model.eval()
97
+ return model
98
+
99
+
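A minimal end-to-end sketch of the bundle interface defined above. The file name "speech.wav" is a placeholder, and loading via torchaudio.load is only one way to obtain a (batch, time) float tensor; the dl_kwargs value shown is simply forwarded to torch.hub.load_state_dict_from_url:

    import torch
    import torchaudio

    bundle = torchaudio.pipelines.WAV2VEC2_BASE
    model = bundle.get_model(dl_kwargs={"progress": False})

    # (channels, time) tensor; the channel axis doubles as the batch axis here.
    waveform, sample_rate = torchaudio.load("speech.wav")
    if sample_rate != bundle.sample_rate:
        waveform = torchaudio.functional.resample(waveform, sample_rate, bundle.sample_rate)

    with torch.inference_mode():
        # List of per-layer feature tensors, each of shape (batch, frames, feature_dim).
        features, _ = model.extract_features(waveform)
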
100
+ @dataclass
101
+ class Wav2Vec2ASRBundle(Wav2Vec2Bundle):
102
+ """Data class that bundles associated information to use pretrained
103
+ :py:class:`~torchaudio.models.Wav2Vec2Model`.
104
+
105
+ This class provides interfaces for instantiating the pretrained model along with
106
+ the information necessary to retrieve pretrained weights and additional data
107
+ to be used with the model.
108
+
109
+ Torchaudio library instantiates objects of this class, each of which represents
110
+ a different pretrained model. Client code should access pretrained models via these
111
+ instances.
112
+
113
+ Please see below for the usage and the available values.
114
+
115
+ Example - ASR
116
+ >>> import torchaudio
117
+ >>>
118
+ >>> bundle = torchaudio.pipelines.HUBERT_ASR_LARGE
119
+ >>>
120
+ >>> # Build the model and load pretrained weight.
121
+ >>> model = bundle.get_model()
122
+ Downloading:
123
+ 100%|███████████████████████████████| 1.18G/1.18G [00:17<00:00, 73.8MB/s]
124
+ >>>
125
+ >>> # Check the corresponding labels of the output.
126
+ >>> labels = bundle.get_labels()
127
+ >>> print(labels)
128
+ ('-', '|', 'E', 'T', 'A', 'O', 'N', 'I', 'H', 'S', 'R', 'D', 'L', 'U', 'M', 'W', 'C', 'F', 'G', 'Y', 'P', 'B', 'V', 'K', "'", 'X', 'J', 'Q', 'Z')
129
+ >>>
130
+ >>> # Resample audio to the expected sampling rate
131
+ >>> waveform = torchaudio.functional.resample(waveform, sample_rate, bundle.sample_rate)
132
+ >>>
133
+ >>> # Infer the label probability distribution
134
+ >>> emissions, _ = model(waveform)
135
+ >>>
136
+ >>> # Pass emission to decoder
137
+ >>> # `ctc_decode` is for illustration purpose only
138
+ >>> transcripts = ctc_decode(emissions, labels)
139
+ """ # noqa: E501
140
+
141
+ _labels: Tuple[str, ...]
142
+ _remove_aux_axis: Tuple[int, ...] = (1, 2, 3)
143
+
144
+ def get_labels(
145
+ self,
146
+ *,
147
+ blank: str = "-",
148
+ ) -> Tuple[str, ...]:
149
+ """The output class labels.
150
+
151
+ The first is blank token, and it is customizable.
152
+
153
+ Args:
154
+ blank (str, optional): Blank token. (default: ``'-'``)
155
+
156
+ Returns:
157
+ Tuple[str, ...]:
158
+ For models fine-tuned on ASR, returns the tuple of strings representing
159
+ the output class labels.
160
+
161
+ Example
162
+ >>> from torchaudio.pipelines import HUBERT_ASR_LARGE as bundle
163
+ >>> bundle.get_labels()
164
+ ('-', '|', 'E', 'T', 'A', 'O', 'N', 'I', 'H', 'S', 'R', 'D', 'L', 'U', 'M', 'W', 'C', 'F', 'G', 'Y', 'P', 'B', 'V', 'K', "'", 'X', 'J', 'Q', 'Z')
165
+ """ # noqa: E501
166
+ return (blank, *self._labels)
167
+
168
+ def _get_state_dict(self, dl_kwargs):
169
+ return utils._get_state_dict(self._path, dl_kwargs, self._remove_aux_axis)
170
+
171
+
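The ctc_decode call in the example above is a placeholder. A minimal greedy (best-path) decoder that matches the label set returned by get_labels() could look like the sketch below; proper beam-search decoding is provided separately by torchaudio.models.decoder:

    import torch

    def greedy_ctc_decode(emission: torch.Tensor, labels, blank: str = "-") -> str:
        # emission: (time, num_labels) tensor for a single utterance.
        indices = torch.argmax(emission, dim=-1)
        indices = torch.unique_consecutive(indices)      # collapse repeated predictions
        chars = [labels[i] for i in indices if labels[i] != blank]
        return "".join(chars).replace("|", " ").strip()  # '|' marks word boundaries

    # emissions, _ = model(waveform)                     # (batch, time, num_labels)
    # transcript = greedy_ctc_decode(emissions[0], bundle.get_labels())
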
172
+ WAV2VEC2_BASE = Wav2Vec2Bundle(
173
+ _path="wav2vec2_fairseq_base_ls960.pth",
174
+ _params={
175
+ "extractor_mode": "group_norm",
176
+ "extractor_conv_layer_config": [
177
+ (512, 10, 5),
178
+ (512, 3, 2),
179
+ (512, 3, 2),
180
+ (512, 3, 2),
181
+ (512, 3, 2),
182
+ (512, 2, 2),
183
+ (512, 2, 2),
184
+ ],
185
+ "extractor_conv_bias": False,
186
+ "encoder_embed_dim": 768,
187
+ "encoder_projection_dropout": 0.1,
188
+ "encoder_pos_conv_kernel": 128,
189
+ "encoder_pos_conv_groups": 16,
190
+ "encoder_num_layers": 12,
191
+ "encoder_num_heads": 12,
192
+ "encoder_attention_dropout": 0.1,
193
+ "encoder_ff_interm_features": 3072,
194
+ "encoder_ff_interm_dropout": 0.0,
195
+ "encoder_dropout": 0.1,
196
+ "encoder_layer_norm_first": False,
197
+ "encoder_layer_drop": 0.05,
198
+ "aux_num_out": None,
199
+ },
200
+ _sample_rate=16000,
201
+ _normalize_waveform=False,
202
+ _model_type="Wav2Vec2",
203
+ )
204
+ WAV2VEC2_BASE.__doc__ = """Wav2vec 2.0 model ("base" architecture),
205
+ pre-trained on 960 hours of unlabeled audio from *LibriSpeech* dataset :cite:`7178964`
206
+ (the combination of "train-clean-100", "train-clean-360", and "train-other-500"), not fine-tuned.
207
+
208
+ Originally published by the authors of *wav2vec 2.0* :cite:`baevski2020wav2vec` under MIT License and
209
+ redistributed with the same license.
210
+ [`License <https://github.com/pytorch/fairseq/blob/ce6c9eeae163ac04b79539c78e74f292f29eaa18/LICENSE>`__,
211
+ `Source <https://github.com/pytorch/fairseq/blob/ce6c9eeae163ac04b79539c78e74f292f29eaa18/examples/wav2vec#pre-trained-models>`__]
212
+
213
+ Please refer to :py:class:`torchaudio.pipelines.Wav2Vec2Bundle` for the usage.
214
+ """ # noqa: E501
215
+
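For reference, each bundle's _params dictionary maps one-to-one onto the corresponding model factory in torchaudio.models, so an untrained model with the same architecture can be built by hand. (That utils._get_model dispatches the "Wav2Vec2" model type to torchaudio.models.wav2vec2_model is an assumption here; it is not visible in this file.)

    from torchaudio.models import wav2vec2_model
    from torchaudio.pipelines import WAV2VEC2_BASE

    # Same architecture as WAV2VEC2_BASE.get_model(), but randomly initialized and
    # without downloading weights. _params is a private field, read here only for illustration.
    model = wav2vec2_model(**WAV2VEC2_BASE._params)
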
216
+ WAV2VEC2_ASR_BASE_10M = Wav2Vec2ASRBundle(
217
+ _path="wav2vec2_fairseq_base_ls960_asr_ll10m.pth",
218
+ _params={
219
+ "extractor_mode": "group_norm",
220
+ "extractor_conv_layer_config": [
221
+ (512, 10, 5),
222
+ (512, 3, 2),
223
+ (512, 3, 2),
224
+ (512, 3, 2),
225
+ (512, 3, 2),
226
+ (512, 2, 2),
227
+ (512, 2, 2),
228
+ ],
229
+ "extractor_conv_bias": False,
230
+ "encoder_embed_dim": 768,
231
+ "encoder_projection_dropout": 0.1,
232
+ "encoder_pos_conv_kernel": 128,
233
+ "encoder_pos_conv_groups": 16,
234
+ "encoder_num_layers": 12,
235
+ "encoder_num_heads": 12,
236
+ "encoder_attention_dropout": 0.1,
237
+ "encoder_ff_interm_features": 3072,
238
+ "encoder_ff_interm_dropout": 0.0,
239
+ "encoder_dropout": 0.1,
240
+ "encoder_layer_norm_first": False,
241
+ "encoder_layer_drop": 0.05,
242
+ "aux_num_out": 29,
243
+ },
244
+ _labels=utils._get_en_labels(),
245
+ _sample_rate=16000,
246
+ _normalize_waveform=False,
247
+ _model_type="Wav2Vec2",
248
+ )
249
+ WAV2VEC2_ASR_BASE_10M.__doc__ = """Wav2vec 2.0 model ("base" architecture with an extra linear module),
250
+ pre-trained on 960 hours of unlabeled audio from *LibriSpeech* dataset :cite:`7178964`
251
+ (the combination of "train-clean-100", "train-clean-360", and "train-other-500"), and
252
+ fine-tuned for ASR on 10 minutes of transcribed audio from *Libri-Light* dataset
253
+ :cite:`librilight` ("train-10min" subset).
254
+
255
+ Originally published by the authors of *wav2vec 2.0* :cite:`baevski2020wav2vec` under MIT License and
256
+ redistributed with the same license.
257
+ [`License <https://github.com/pytorch/fairseq/blob/ce6c9eeae163ac04b79539c78e74f292f29eaa18/LICENSE>`__,
258
+ `Source <https://github.com/pytorch/fairseq/blob/ce6c9eeae163ac04b79539c78e74f292f29eaa18/examples/wav2vec#pre-trained-models>`__]
259
+
260
+ Please refer to :py:class:`torchaudio.pipelines.Wav2Vec2ASRBundle` for the usage.
261
+ """ # noqa: E501
262
+
263
+ WAV2VEC2_ASR_BASE_100H = Wav2Vec2ASRBundle(
264
+ "wav2vec2_fairseq_base_ls960_asr_ls100.pth",
265
+ {
266
+ "extractor_mode": "group_norm",
267
+ "extractor_conv_layer_config": [
268
+ (512, 10, 5),
269
+ (512, 3, 2),
270
+ (512, 3, 2),
271
+ (512, 3, 2),
272
+ (512, 3, 2),
273
+ (512, 2, 2),
274
+ (512, 2, 2),
275
+ ],
276
+ "extractor_conv_bias": False,
277
+ "encoder_embed_dim": 768,
278
+ "encoder_projection_dropout": 0.1,
279
+ "encoder_pos_conv_kernel": 128,
280
+ "encoder_pos_conv_groups": 16,
281
+ "encoder_num_layers": 12,
282
+ "encoder_num_heads": 12,
283
+ "encoder_attention_dropout": 0.1,
284
+ "encoder_ff_interm_features": 3072,
285
+ "encoder_ff_interm_dropout": 0.0,
286
+ "encoder_dropout": 0.1,
287
+ "encoder_layer_norm_first": False,
288
+ "encoder_layer_drop": 0.05,
289
+ "aux_num_out": 29,
290
+ },
291
+ _labels=utils._get_en_labels(),
292
+ _sample_rate=16000,
293
+ _normalize_waveform=False,
294
+ _model_type="Wav2Vec2",
295
+ )
296
+
297
+ WAV2VEC2_ASR_BASE_100H.__doc__ = """Wav2vec 2.0 model ("base" architecture with an extra linear module),
298
+ pre-trained on 960 hours of unlabeled audio from *LibriSpeech* dataset :cite:`7178964`
299
+ (the combination of "train-clean-100", "train-clean-360", and "train-other-500"), and
300
+ fine-tuned for ASR on 100 hours of transcribed audio from "train-clean-100" subset.
301
+
302
+ Originally published by the authors of *wav2vec 2.0* :cite:`baevski2020wav2vec` under MIT License and
303
+ redistributed with the same license.
304
+ [`License <https://github.com/pytorch/fairseq/blob/ce6c9eeae163ac04b79539c78e74f292f29eaa18/LICENSE>`__,
305
+ `Source <https://github.com/pytorch/fairseq/blob/ce6c9eeae163ac04b79539c78e74f292f29eaa18/examples/wav2vec#pre-trained-models>`__]
306
+
307
+ Please refer to :py:class:`torchaudio.pipelines.Wav2Vec2ASRBundle` for the usage.
308
+ """ # noqa: E501
309
+
310
+ WAV2VEC2_ASR_BASE_960H = Wav2Vec2ASRBundle(
311
+ "wav2vec2_fairseq_base_ls960_asr_ls960.pth",
312
+ {
313
+ "extractor_mode": "group_norm",
314
+ "extractor_conv_layer_config": [
315
+ (512, 10, 5),
316
+ (512, 3, 2),
317
+ (512, 3, 2),
318
+ (512, 3, 2),
319
+ (512, 3, 2),
320
+ (512, 2, 2),
321
+ (512, 2, 2),
322
+ ],
323
+ "extractor_conv_bias": False,
324
+ "encoder_embed_dim": 768,
325
+ "encoder_projection_dropout": 0.1,
326
+ "encoder_pos_conv_kernel": 128,
327
+ "encoder_pos_conv_groups": 16,
328
+ "encoder_num_layers": 12,
329
+ "encoder_num_heads": 12,
330
+ "encoder_attention_dropout": 0.1,
331
+ "encoder_ff_interm_features": 3072,
332
+ "encoder_ff_interm_dropout": 0.0,
333
+ "encoder_dropout": 0.1,
334
+ "encoder_layer_norm_first": False,
335
+ "encoder_layer_drop": 0.05,
336
+ "aux_num_out": 29,
337
+ },
338
+ _labels=utils._get_en_labels(),
339
+ _sample_rate=16000,
340
+ _normalize_waveform=False,
341
+ _model_type="Wav2Vec2",
342
+ )
343
+ WAV2VEC2_ASR_BASE_960H.__doc__ = """Wav2vec 2.0 model ("base" architecture with an extra linear module),
344
+ pre-trained on 960 hours of unlabeled audio from *LibriSpeech* dataset :cite:`7178964`
345
+ (the combination of "train-clean-100", "train-clean-360", and "train-other-500"), and
346
+ fine-tuned for ASR on the same audio with the corresponding transcripts.
347
+
348
+ Originally published by the authors of *wav2vec 2.0* :cite:`baevski2020wav2vec` under MIT License and
349
+ redistributed with the same license.
350
+ [`License <https://github.com/pytorch/fairseq/blob/ce6c9eeae163ac04b79539c78e74f292f29eaa18/LICENSE>`__,
351
+ `Source <https://github.com/pytorch/fairseq/blob/ce6c9eeae163ac04b79539c78e74f292f29eaa18/examples/wav2vec#pre-trained-models>`__]
352
+
353
+ Please refer to :py:class:`torchaudio.pipelines.Wav2Vec2ASRBundle` for the usage.
354
+ """ # noqa: E501
355
+
356
+ WAV2VEC2_LARGE = Wav2Vec2Bundle(
357
+ "wav2vec2_fairseq_large_ls960.pth",
358
+ {
359
+ "extractor_mode": "group_norm",
360
+ "extractor_conv_layer_config": [
361
+ (512, 10, 5),
362
+ (512, 3, 2),
363
+ (512, 3, 2),
364
+ (512, 3, 2),
365
+ (512, 3, 2),
366
+ (512, 2, 2),
367
+ (512, 2, 2),
368
+ ],
369
+ "extractor_conv_bias": False,
370
+ "encoder_embed_dim": 1024,
371
+ "encoder_projection_dropout": 0.1,
372
+ "encoder_pos_conv_kernel": 128,
373
+ "encoder_pos_conv_groups": 16,
374
+ "encoder_num_layers": 24,
375
+ "encoder_num_heads": 16,
376
+ "encoder_attention_dropout": 0.1,
377
+ "encoder_ff_interm_features": 4096,
378
+ "encoder_ff_interm_dropout": 0.0,
379
+ "encoder_dropout": 0.0,
380
+ "encoder_layer_norm_first": False,
381
+ "encoder_layer_drop": 0.2,
382
+ "aux_num_out": None,
383
+ },
384
+ _sample_rate=16000,
385
+ _normalize_waveform=False,
386
+ _model_type="Wav2Vec2",
387
+ )
388
+ WAV2VEC2_LARGE.__doc__ = """Wav2vec 2.0 model ("large" architecture),
389
+ pre-trained on 960 hours of unlabeled audio from *LibriSpeech* dataset :cite:`7178964`
390
+ (the combination of "train-clean-100", "train-clean-360", and "train-other-500"), not fine-tuned.
391
+
392
+ Originally published by the authors of *wav2vec 2.0* :cite:`baevski2020wav2vec` under MIT License and
393
+ redistributed with the same license.
394
+ [`License <https://github.com/pytorch/fairseq/blob/ce6c9eeae163ac04b79539c78e74f292f29eaa18/LICENSE>`__,
395
+ `Source <https://github.com/pytorch/fairseq/blob/ce6c9eeae163ac04b79539c78e74f292f29eaa18/examples/wav2vec#pre-trained-models>`__]
396
+
397
+ Please refer to :py:class:`torchaudio.pipelines.Wav2Vec2Bundle` for the usage.
398
+ """ # noqa: E501
399
+
400
+ WAV2VEC2_ASR_LARGE_10M = Wav2Vec2ASRBundle(
401
+ "wav2vec2_fairseq_large_ls960_asr_ll10m.pth",
402
+ {
403
+ "extractor_mode": "group_norm",
404
+ "extractor_conv_layer_config": [
405
+ (512, 10, 5),
406
+ (512, 3, 2),
407
+ (512, 3, 2),
408
+ (512, 3, 2),
409
+ (512, 3, 2),
410
+ (512, 2, 2),
411
+ (512, 2, 2),
412
+ ],
413
+ "extractor_conv_bias": False,
414
+ "encoder_embed_dim": 1024,
415
+ "encoder_projection_dropout": 0.1,
416
+ "encoder_pos_conv_kernel": 128,
417
+ "encoder_pos_conv_groups": 16,
418
+ "encoder_num_layers": 24,
419
+ "encoder_num_heads": 16,
420
+ "encoder_attention_dropout": 0.1,
421
+ "encoder_ff_interm_features": 4096,
422
+ "encoder_ff_interm_dropout": 0.0,
423
+ "encoder_dropout": 0.0,
424
+ "encoder_layer_norm_first": False,
425
+ "encoder_layer_drop": 0.2,
426
+ "aux_num_out": 29,
427
+ },
428
+ _labels=utils._get_en_labels(),
429
+ _sample_rate=16000,
430
+ _normalize_waveform=False,
431
+ _model_type="Wav2Vec2",
432
+ )
433
+ WAV2VEC2_ASR_LARGE_10M.__doc__ = """Wav2vec 2.0 model ("large" architecture with an extra linear module),
434
+ pre-trained on 960 hours of unlabeled audio from *LibriSpeech* dataset :cite:`7178964`
435
+ (the combination of "train-clean-100", "train-clean-360", and "train-other-500"), and
436
+ fine-tuned for ASR on 10 minutes of transcribed audio from *Libri-Light* dataset
437
+ :cite:`librilight` ("train-10min" subset).
438
+
439
+ Originally published by the authors of *wav2vec 2.0* :cite:`baevski2020wav2vec` under MIT License and
440
+ redistributed with the same license.
441
+ [`License <https://github.com/pytorch/fairseq/blob/ce6c9eeae163ac04b79539c78e74f292f29eaa18/LICENSE>`__,
442
+ `Source <https://github.com/pytorch/fairseq/blob/ce6c9eeae163ac04b79539c78e74f292f29eaa18/examples/wav2vec#pre-trained-models>`__]
443
+
444
+ Please refer to :py:class:`torchaudio.pipelines.Wav2Vec2ASRBundle` for the usage.
445
+ """ # noqa: E501
446
+
447
+ WAV2VEC2_ASR_LARGE_100H = Wav2Vec2ASRBundle(
448
+ "wav2vec2_fairseq_large_ls960_asr_ls100.pth",
449
+ {
450
+ "extractor_mode": "group_norm",
451
+ "extractor_conv_layer_config": [
452
+ (512, 10, 5),
453
+ (512, 3, 2),
454
+ (512, 3, 2),
455
+ (512, 3, 2),
456
+ (512, 3, 2),
457
+ (512, 2, 2),
458
+ (512, 2, 2),
459
+ ],
460
+ "extractor_conv_bias": False,
461
+ "encoder_embed_dim": 1024,
462
+ "encoder_projection_dropout": 0.1,
463
+ "encoder_pos_conv_kernel": 128,
464
+ "encoder_pos_conv_groups": 16,
465
+ "encoder_num_layers": 24,
466
+ "encoder_num_heads": 16,
467
+ "encoder_attention_dropout": 0.1,
468
+ "encoder_ff_interm_features": 4096,
469
+ "encoder_ff_interm_dropout": 0.0,
470
+ "encoder_dropout": 0.0,
471
+ "encoder_layer_norm_first": False,
472
+ "encoder_layer_drop": 0.2,
473
+ "aux_num_out": 29,
474
+ },
475
+ _labels=utils._get_en_labels(),
476
+ _sample_rate=16000,
477
+ _normalize_waveform=False,
478
+ _model_type="Wav2Vec2",
479
+ )
480
+ WAV2VEC2_ASR_LARGE_100H.__doc__ = """Wav2vec 2.0 model ("large" architecture with an extra linear module),
481
+ pre-trained on 960 hours of unlabeled audio from *LibriSpeech* dataset :cite:`7178964`
482
+ (the combination of "train-clean-100", "train-clean-360", and "train-other-500"), and
483
+ fine-tuned for ASR on 100 hours of transcribed audio from
484
+ the same dataset ("train-clean-100" subset).
485
+
486
+ Originally published by the authors of *wav2vec 2.0* :cite:`baevski2020wav2vec` under MIT License and
487
+ redistributed with the same license.
488
+ [`License <https://github.com/pytorch/fairseq/blob/ce6c9eeae163ac04b79539c78e74f292f29eaa18/LICENSE>`__,
489
+ `Source <https://github.com/pytorch/fairseq/blob/ce6c9eeae163ac04b79539c78e74f292f29eaa18/examples/wav2vec#pre-trained-models>`__]
490
+
491
+ Please refer to :py:class:`torchaudio.pipelines.Wav2Vec2ASRBundle` for the usage.
492
+ """ # noqa: E501
493
+
494
+ WAV2VEC2_ASR_LARGE_960H = Wav2Vec2ASRBundle(
495
+ "wav2vec2_fairseq_large_ls960_asr_ls960.pth",
496
+ {
497
+ "extractor_mode": "group_norm",
498
+ "extractor_conv_layer_config": [
499
+ (512, 10, 5),
500
+ (512, 3, 2),
501
+ (512, 3, 2),
502
+ (512, 3, 2),
503
+ (512, 3, 2),
504
+ (512, 2, 2),
505
+ (512, 2, 2),
506
+ ],
507
+ "extractor_conv_bias": False,
508
+ "encoder_embed_dim": 1024,
509
+ "encoder_projection_dropout": 0.1,
510
+ "encoder_pos_conv_kernel": 128,
511
+ "encoder_pos_conv_groups": 16,
512
+ "encoder_num_layers": 24,
513
+ "encoder_num_heads": 16,
514
+ "encoder_attention_dropout": 0.1,
515
+ "encoder_ff_interm_features": 4096,
516
+ "encoder_ff_interm_dropout": 0.0,
517
+ "encoder_dropout": 0.0,
518
+ "encoder_layer_norm_first": False,
519
+ "encoder_layer_drop": 0.2,
520
+ "aux_num_out": 29,
521
+ },
522
+ _labels=utils._get_en_labels(),
523
+ _sample_rate=16000,
524
+ _normalize_waveform=False,
525
+ _model_type="Wav2Vec2",
526
+ )
527
+ WAV2VEC2_ASR_LARGE_960H.__doc__ = """Wav2vec 2.0 model ("large" architecture with an extra linear module),
528
+ pre-trained on 960 hours of unlabeled audio from *LibriSpeech* dataset :cite:`7178964`
529
+ (the combination of "train-clean-100", "train-clean-360", and "train-other-500"), and
530
+ fine-tuned for ASR on the same audio with the corresponding transcripts.
531
+
532
+ Originally published by the authors of *wav2vec 2.0* :cite:`baevski2020wav2vec` under MIT License and
533
+ redistributed with the same license.
534
+ [`License <https://github.com/pytorch/fairseq/blob/ce6c9eeae163ac04b79539c78e74f292f29eaa18/LICENSE>`__,
535
+ `Source <https://github.com/pytorch/fairseq/blob/ce6c9eeae163ac04b79539c78e74f292f29eaa18/examples/wav2vec#pre-trained-models>`__]
536
+
537
+ Please refer to :py:class:`torchaudio.pipelines.Wav2Vec2ASRBundle` for the usage.
538
+ """ # noqa: E501
539
+
540
+ WAV2VEC2_LARGE_LV60K = Wav2Vec2Bundle(
541
+ "wav2vec2_fairseq_large_lv60k.pth",
542
+ {
543
+ "extractor_mode": "layer_norm",
544
+ "extractor_conv_layer_config": [
545
+ (512, 10, 5),
546
+ (512, 3, 2),
547
+ (512, 3, 2),
548
+ (512, 3, 2),
549
+ (512, 3, 2),
550
+ (512, 2, 2),
551
+ (512, 2, 2),
552
+ ],
553
+ "extractor_conv_bias": True,
554
+ "encoder_embed_dim": 1024,
555
+ "encoder_projection_dropout": 0.1,
556
+ "encoder_pos_conv_kernel": 128,
557
+ "encoder_pos_conv_groups": 16,
558
+ "encoder_num_layers": 24,
559
+ "encoder_num_heads": 16,
560
+ "encoder_attention_dropout": 0.1,
561
+ "encoder_ff_interm_features": 4096,
562
+ "encoder_ff_interm_dropout": 0.0,
563
+ "encoder_dropout": 0.0,
564
+ "encoder_layer_norm_first": True,
565
+ "encoder_layer_drop": 0.0,
566
+ "aux_num_out": None,
567
+ },
568
+ _sample_rate=16000,
569
+ _normalize_waveform=True,
570
+ _model_type="Wav2Vec2",
571
+ )
572
+ WAV2VEC2_LARGE_LV60K.__doc__ = """Wav2vec 2.0 model ("large-lv60k" architecture),
573
+ pre-trained on 60,000 hours of unlabeled audio from *Libri-Light* dataset :cite:`librilight`,
574
+ not fine-tuned.
575
+
576
+ Originally published by the authors of *wav2vec 2.0* :cite:`baevski2020wav2vec` under MIT License and
577
+ redistributed with the same license.
578
+ [`License <https://github.com/pytorch/fairseq/blob/ce6c9eeae163ac04b79539c78e74f292f29eaa18/LICENSE>`__,
579
+ `Source <https://github.com/pytorch/fairseq/blob/ce6c9eeae163ac04b79539c78e74f292f29eaa18/examples/wav2vec#pre-trained-models>`__]
580
+
581
+ Please refer to :py:class:`torchaudio.pipelines.Wav2Vec2Bundle` for the usage.
582
+ """ # noqa: E501
583
+
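WAV2VEC2_LARGE_LV60K is the first bundle above with _normalize_waveform=True; as noted in get_model(), such bundles apply an extra layer normalization to the input waveform. A rough manual equivalent is sketched below (that the normalization is a per-utterance layer norm over the time axis is an assumption about utils._extend_model, not spelled out in this file):

    import torch

    def normalize_waveform(waveform: torch.Tensor) -> torch.Tensor:
        # Per-utterance zero-mean / unit-variance layer norm over the time axis (last dimension).
        return torch.nn.functional.layer_norm(waveform, normalized_shape=waveform.shape[-1:])
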
584
+ WAV2VEC2_ASR_LARGE_LV60K_10M = Wav2Vec2ASRBundle(
585
+ "wav2vec2_fairseq_large_lv60k_asr_ll10m.pth",
586
+ {
587
+ "extractor_mode": "layer_norm",
588
+ "extractor_conv_layer_config": [
589
+ (512, 10, 5),
590
+ (512, 3, 2),
591
+ (512, 3, 2),
592
+ (512, 3, 2),
593
+ (512, 3, 2),
594
+ (512, 2, 2),
595
+ (512, 2, 2),
596
+ ],
597
+ "extractor_conv_bias": True,
598
+ "encoder_embed_dim": 1024,
599
+ "encoder_projection_dropout": 0.1,
600
+ "encoder_pos_conv_kernel": 128,
601
+ "encoder_pos_conv_groups": 16,
602
+ "encoder_num_layers": 24,
603
+ "encoder_num_heads": 16,
604
+ "encoder_attention_dropout": 0.1,
605
+ "encoder_ff_interm_features": 4096,
606
+ "encoder_ff_interm_dropout": 0.0,
607
+ "encoder_dropout": 0.0,
608
+ "encoder_layer_norm_first": True,
609
+ "encoder_layer_drop": 0.0,
610
+ "aux_num_out": 29,
611
+ },
612
+ _labels=utils._get_en_labels(),
613
+ _sample_rate=16000,
614
+ _normalize_waveform=True,
615
+ _model_type="Wav2Vec2",
616
+ )
617
+ WAV2VEC2_ASR_LARGE_LV60K_10M.__doc__ = """Wav2vec 2.0 model ("large-lv60k" architecture with an extra linear module),
618
+ pre-trained on 60,000 hours of unlabeled audio from *Libri-Light* dataset :cite:`librilight`, and
619
+ fine-tuned for ASR on 10 minutes of transcribed audio from the same dataset ("train-10min" subset).
620
+
621
+ Originally published by the authors of *wav2vec 2.0* :cite:`baevski2020wav2vec` under MIT License and
622
+ redistributed with the same license.
623
+ [`License <https://github.com/pytorch/fairseq/blob/ce6c9eeae163ac04b79539c78e74f292f29eaa18/LICENSE>`__,
624
+ `Source <https://github.com/pytorch/fairseq/blob/ce6c9eeae163ac04b79539c78e74f292f29eaa18/examples/wav2vec#pre-trained-models>`__]
625
+
626
+ Please refer to :py:class:`torchaudio.pipelines.Wav2Vec2ASRBundle` for the usage.
627
+ """ # noqa: E501
628
+
629
+ WAV2VEC2_ASR_LARGE_LV60K_100H = Wav2Vec2ASRBundle(
630
+ "wav2vec2_fairseq_large_lv60k_asr_ls100.pth",
631
+ {
632
+ "extractor_mode": "layer_norm",
633
+ "extractor_conv_layer_config": [
634
+ (512, 10, 5),
635
+ (512, 3, 2),
636
+ (512, 3, 2),
637
+ (512, 3, 2),
638
+ (512, 3, 2),
639
+ (512, 2, 2),
640
+ (512, 2, 2),
641
+ ],
642
+ "extractor_conv_bias": True,
643
+ "encoder_embed_dim": 1024,
644
+ "encoder_projection_dropout": 0.1,
645
+ "encoder_pos_conv_kernel": 128,
646
+ "encoder_pos_conv_groups": 16,
647
+ "encoder_num_layers": 24,
648
+ "encoder_num_heads": 16,
649
+ "encoder_attention_dropout": 0.1,
650
+ "encoder_ff_interm_features": 4096,
651
+ "encoder_ff_interm_dropout": 0.0,
652
+ "encoder_dropout": 0.0,
653
+ "encoder_layer_norm_first": True,
654
+ "encoder_layer_drop": 0.0,
655
+ "aux_num_out": 29,
656
+ },
657
+ _labels=utils._get_en_labels(),
658
+ _sample_rate=16000,
659
+ _normalize_waveform=True,
660
+ _model_type="Wav2Vec2",
661
+ )
662
+ WAV2VEC2_ASR_LARGE_LV60K_100H.__doc__ = """Wav2vec 2.0 model ("large-lv60k" architecture with an extra linear module),
663
+ pre-trained on 60,000 hours of unlabeled audio from *Libri-Light* dataset :cite:`librilight`, and
664
+ fine-tuned for ASR on 100 hours of transcribed audio from
665
+ *LibriSpeech* dataset :cite:`7178964` ("train-clean-100" subset).
666
+
667
+ Originally published by the authors of *wav2vec 2.0* :cite:`baevski2020wav2vec` under MIT License and
668
+ redistributed with the same license.
669
+ [`License <https://github.com/pytorch/fairseq/blob/ce6c9eeae163ac04b79539c78e74f292f29eaa18/LICENSE>`__,
670
+ `Source <https://github.com/pytorch/fairseq/blob/ce6c9eeae163ac04b79539c78e74f292f29eaa18/examples/wav2vec#pre-trained-models>`__]
671
+
672
+ Please refer to :py:class:`torchaudio.pipelines.Wav2Vec2ASRBundle` for the usage.
673
+ """ # noqa: E501
674
+
675
+ WAV2VEC2_ASR_LARGE_LV60K_960H = Wav2Vec2ASRBundle(
676
+ "wav2vec2_fairseq_large_lv60k_asr_ls960.pth",
677
+ {
678
+ "extractor_mode": "layer_norm",
679
+ "extractor_conv_layer_config": [
680
+ (512, 10, 5),
681
+ (512, 3, 2),
682
+ (512, 3, 2),
683
+ (512, 3, 2),
684
+ (512, 3, 2),
685
+ (512, 2, 2),
686
+ (512, 2, 2),
687
+ ],
688
+ "extractor_conv_bias": True,
689
+ "encoder_embed_dim": 1024,
690
+ "encoder_projection_dropout": 0.1,
691
+ "encoder_pos_conv_kernel": 128,
692
+ "encoder_pos_conv_groups": 16,
693
+ "encoder_num_layers": 24,
694
+ "encoder_num_heads": 16,
695
+ "encoder_attention_dropout": 0.1,
696
+ "encoder_ff_interm_features": 4096,
697
+ "encoder_ff_interm_dropout": 0.0,
698
+ "encoder_dropout": 0.0,
699
+ "encoder_layer_norm_first": True,
700
+ "encoder_layer_drop": 0.0,
701
+ "aux_num_out": 29,
702
+ },
703
+ _labels=utils._get_en_labels(),
704
+ _sample_rate=16000,
705
+ _normalize_waveform=True,
706
+ _model_type="Wav2Vec2",
707
+ )
708
+ WAV2VEC2_ASR_LARGE_LV60K_960H.__doc__ = """Wav2vec 2.0 model ("large-lv60k" architecture with an extra linear module),
709
+ pre-trained on 60,000 hours of unlabeled audio from *Libri-Light* :cite:`librilight` dataset, and
710
+ fine-tuned for ASR on 960 hours of transcribed audio from *LibriSpeech* dataset :cite:`7178964`
711
+ (the combination of "train-clean-100", "train-clean-360", and "train-other-500").
712
+
713
+ Originally published by the authors of *wav2vec 2.0* :cite:`baevski2020wav2vec` under MIT License and
714
+ redistributed with the same license.
715
+ [`License <https://github.com/pytorch/fairseq/blob/ce6c9eeae163ac04b79539c78e74f292f29eaa18/LICENSE>`__,
716
+ `Source <https://github.com/pytorch/fairseq/blob/ce6c9eeae163ac04b79539c78e74f292f29eaa18/examples/wav2vec#pre-trained-models>`__]
717
+
718
+ Please refer to :py:class:`torchaudio.pipelines.Wav2Vec2ASRBundle` for the usage.
719
+ """ # noqa: E501
720
+
721
+ WAV2VEC2_XLSR53 = Wav2Vec2Bundle(
722
+ "wav2vec2_fairseq_large_xlsr53.pth",
723
+ {
724
+ "extractor_mode": "layer_norm",
725
+ "extractor_conv_layer_config": [
726
+ (512, 10, 5),
727
+ (512, 3, 2),
728
+ (512, 3, 2),
729
+ (512, 3, 2),
730
+ (512, 3, 2),
731
+ (512, 2, 2),
732
+ (512, 2, 2),
733
+ ],
734
+ "extractor_conv_bias": True,
735
+ "encoder_embed_dim": 1024,
736
+ "encoder_projection_dropout": 0.0,
737
+ "encoder_pos_conv_kernel": 128,
738
+ "encoder_pos_conv_groups": 16,
739
+ "encoder_num_layers": 24,
740
+ "encoder_num_heads": 16,
741
+ "encoder_attention_dropout": 0.0,
742
+ "encoder_ff_interm_features": 4096,
743
+ "encoder_ff_interm_dropout": 0.0,
744
+ "encoder_dropout": 0.0,
745
+ "encoder_layer_norm_first": True,
746
+ "encoder_layer_drop": 0.0,
747
+ "aux_num_out": None,
748
+ },
749
+ _sample_rate=16000,
750
+ _normalize_waveform=True,
751
+ _model_type="Wav2Vec2",
752
+ )
753
+ WAV2VEC2_XLSR53.__doc__ = """Wav2vec 2.0 model ("large" architecture),
754
+ pre-trained on 56,000 hours of unlabeled audio from multiple datasets (
755
+ *Multilingual LibriSpeech* :cite:`Pratap_2020`,
756
+ *CommonVoice* :cite:`ardila2020common` and
757
+ *BABEL* :cite:`Gales2014SpeechRA`),
758
+ not fine-tuned.
759
+
760
+ Originally published by the authors of
761
+ *Unsupervised Cross-lingual Representation Learning for Speech Recognition*
762
+ :cite:`conneau2020unsupervised` under MIT License and redistributed with the same license.
763
+ [`License <https://github.com/pytorch/fairseq/blob/ce6c9eeae163ac04b79539c78e74f292f29eaa18/LICENSE>`__,
764
+ `Source <https://github.com/pytorch/fairseq/blob/ce6c9eeae163ac04b79539c78e74f292f29eaa18/examples/wav2vec#pre-trained-models>`__]
765
+
766
+ Please refer to :py:class:`torchaudio.pipelines.Wav2Vec2Bundle` for the usage.
767
+ """ # noqa: E501
768
+
769
+ HUBERT_BASE = Wav2Vec2Bundle(
770
+ "hubert_fairseq_base_ls960.pth",
771
+ {
772
+ "extractor_mode": "group_norm",
773
+ "extractor_conv_layer_config": [
774
+ (512, 10, 5),
775
+ (512, 3, 2),
776
+ (512, 3, 2),
777
+ (512, 3, 2),
778
+ (512, 3, 2),
779
+ (512, 2, 2),
780
+ (512, 2, 2),
781
+ ],
782
+ "extractor_conv_bias": False,
783
+ "encoder_embed_dim": 768,
784
+ "encoder_projection_dropout": 0.1,
785
+ "encoder_pos_conv_kernel": 128,
786
+ "encoder_pos_conv_groups": 16,
787
+ "encoder_num_layers": 12,
788
+ "encoder_num_heads": 12,
789
+ "encoder_attention_dropout": 0.1,
790
+ "encoder_ff_interm_features": 3072,
791
+ "encoder_ff_interm_dropout": 0.0,
792
+ "encoder_dropout": 0.1,
793
+ "encoder_layer_norm_first": False,
794
+ "encoder_layer_drop": 0.05,
795
+ "aux_num_out": None,
796
+ },
797
+ _sample_rate=16000,
798
+ _normalize_waveform=False,
799
+ _model_type="Wav2Vec2",
800
+ )
801
+ HUBERT_BASE.__doc__ = """HuBERT model ("base" architecture),
802
+ pre-trained on 960 hours of unlabeled audio from *LibriSpeech* dataset :cite:`7178964`
803
+ (the combination of "train-clean-100", "train-clean-360", and "train-other-500"), not fine-tuned.
804
+
805
+ Originally published by the authors of *HuBERT* :cite:`hsu2021hubert` under MIT License and
806
+ redistributed with the same license.
807
+ [`License <https://github.com/pytorch/fairseq/blob/ce6c9eeae163ac04b79539c78e74f292f29eaa18/LICENSE>`__,
808
+ `Source <https://github.com/pytorch/fairseq/blob/ce6c9eeae163ac04b79539c78e74f292f29eaa18/examples/hubert#pre-trained-and-fine-tuned-asr-models>`__]
809
+
810
+ Please refer to :py:class:`torchaudio.pipelines.Wav2Vec2Bundle` for the usage.
811
+ """ # noqa: E501
812
+
813
+ HUBERT_LARGE = Wav2Vec2Bundle(
814
+ "hubert_fairseq_large_ll60k.pth",
815
+ {
816
+ "extractor_mode": "layer_norm",
817
+ "extractor_conv_layer_config": [
818
+ (512, 10, 5),
819
+ (512, 3, 2),
820
+ (512, 3, 2),
821
+ (512, 3, 2),
822
+ (512, 3, 2),
823
+ (512, 2, 2),
824
+ (512, 2, 2),
825
+ ],
826
+ "extractor_conv_bias": False,
827
+ "encoder_embed_dim": 1024,
828
+ "encoder_projection_dropout": 0.0,
829
+ "encoder_pos_conv_kernel": 128,
830
+ "encoder_pos_conv_groups": 16,
831
+ "encoder_num_layers": 24,
832
+ "encoder_num_heads": 16,
833
+ "encoder_attention_dropout": 0.0,
834
+ "encoder_ff_interm_features": 4096,
835
+ "encoder_ff_interm_dropout": 0.0,
836
+ "encoder_dropout": 0.0,
837
+ "encoder_layer_norm_first": True,
838
+ "encoder_layer_drop": 0.0,
839
+ "aux_num_out": None,
840
+ },
841
+ _sample_rate=16000,
842
+ _normalize_waveform=True,
843
+ _model_type="Wav2Vec2",
844
+ )
845
+ HUBERT_LARGE.__doc__ = """HuBERT model ("large" architecture),
846
+ pre-trained on 60,000 hours of unlabeled audio from *Libri-Light* dataset :cite:`librilight`,
847
+ not fine-tuned.
848
+
849
+ Originally published by the authors of *HuBERT* :cite:`hsu2021hubert` under MIT License and
850
+ redistributed with the same license.
851
+ [`License <https://github.com/pytorch/fairseq/blob/ce6c9eeae163ac04b79539c78e74f292f29eaa18/LICENSE>`__,
852
+ `Source <https://github.com/pytorch/fairseq/blob/ce6c9eeae163ac04b79539c78e74f292f29eaa18/examples/hubert#pre-trained-and-fine-tuned-asr-models>`__]
853
+
854
+ Please refer to :py:class:`torchaudio.pipelines.Wav2Vec2Bundle` for the usage.
855
+ """ # noqa: E501
856
+
857
+ HUBERT_XLARGE = Wav2Vec2Bundle(
858
+ "hubert_fairseq_xlarge_ll60k.pth",
859
+ {
860
+ "extractor_mode": "layer_norm",
861
+ "extractor_conv_layer_config": [
862
+ (512, 10, 5),
863
+ (512, 3, 2),
864
+ (512, 3, 2),
865
+ (512, 3, 2),
866
+ (512, 3, 2),
867
+ (512, 2, 2),
868
+ (512, 2, 2),
869
+ ],
870
+ "extractor_conv_bias": False,
871
+ "encoder_embed_dim": 1280,
872
+ "encoder_projection_dropout": 0.0,
873
+ "encoder_pos_conv_kernel": 128,
874
+ "encoder_pos_conv_groups": 16,
875
+ "encoder_num_layers": 48,
876
+ "encoder_num_heads": 16,
877
+ "encoder_attention_dropout": 0.0,
878
+ "encoder_ff_interm_features": 5120,
879
+ "encoder_ff_interm_dropout": 0.0,
880
+ "encoder_dropout": 0.0,
881
+ "encoder_layer_norm_first": True,
882
+ "encoder_layer_drop": 0.0,
883
+ "aux_num_out": None,
884
+ },
885
+ _sample_rate=16000,
886
+ _normalize_waveform=True,
887
+ _model_type="Wav2Vec2",
888
+ )
889
+ HUBERT_XLARGE.__doc__ = """HuBERT model ("extra large" architecture),
890
+ pre-trained on 60,000 hours of unlabeled audio from *Libri-Light* dataset :cite:`librilight`,
891
+ not fine-tuned.
892
+
893
+ Originally published by the authors of *HuBERT* :cite:`hsu2021hubert` under MIT License and
894
+ redistributed with the same license.
895
+ [`License <https://github.com/pytorch/fairseq/blob/ce6c9eeae163ac04b79539c78e74f292f29eaa18/LICENSE>`__,
896
+ `Source <https://github.com/pytorch/fairseq/blob/ce6c9eeae163ac04b79539c78e74f292f29eaa18/examples/hubert#pre-trained-and-fine-tuned-asr-models>`__]
897
+
898
+ Please refer to :py:class:`torchaudio.pipelines.Wav2Vec2Bundle` for the usage.
899
+ """ # noqa: E501
900
+
901
+ HUBERT_ASR_LARGE = Wav2Vec2ASRBundle(
902
+ "hubert_fairseq_large_ll60k_asr_ls960.pth",
903
+ {
904
+ "extractor_mode": "layer_norm",
905
+ "extractor_conv_layer_config": [
906
+ (512, 10, 5),
907
+ (512, 3, 2),
908
+ (512, 3, 2),
909
+ (512, 3, 2),
910
+ (512, 3, 2),
911
+ (512, 2, 2),
912
+ (512, 2, 2),
913
+ ],
914
+ "extractor_conv_bias": False,
915
+ "encoder_embed_dim": 1024,
916
+ "encoder_projection_dropout": 0.0,
917
+ "encoder_pos_conv_kernel": 128,
918
+ "encoder_pos_conv_groups": 16,
919
+ "encoder_num_layers": 24,
920
+ "encoder_num_heads": 16,
921
+ "encoder_attention_dropout": 0.0,
922
+ "encoder_ff_interm_features": 4096,
923
+ "encoder_ff_interm_dropout": 0.1,
924
+ "encoder_dropout": 0.0,
925
+ "encoder_layer_norm_first": True,
926
+ "encoder_layer_drop": 0.1,
927
+ "aux_num_out": 29,
928
+ },
929
+ _labels=utils._get_en_labels(),
930
+ _sample_rate=16000,
931
+ _normalize_waveform=True,
932
+ _model_type="Wav2Vec2",
933
+ )
934
+ HUBERT_ASR_LARGE.__doc__ = """HuBERT model ("large" architecture),
935
+ pre-trained on 60,000 hours of unlabeled audio from *Libri-Light* dataset :cite:`librilight`, and
936
+ fine-tuned for ASR on 960 hours of transcribed audio from *LibriSpeech* dataset :cite:`7178964`
937
+ (the combination of "train-clean-100", "train-clean-360", and "train-other-500").
938
+
939
+ Originally published by the authors of *HuBERT* :cite:`hsu2021hubert` under MIT License and
940
+ redistributed with the same license.
941
+ [`License <https://github.com/pytorch/fairseq/blob/ce6c9eeae163ac04b79539c78e74f292f29eaa18/LICENSE>`__,
942
+ `Source <https://github.com/pytorch/fairseq/blob/ce6c9eeae163ac04b79539c78e74f292f29eaa18/examples/hubert#pre-trained-and-fine-tuned-asr-models>`__]
943
+
944
+ Please refer to :py:class:`torchaudio.pipelines.Wav2Vec2ASRBundle` for the usage.
945
+ """ # noqa: E501
946
+
947
+ HUBERT_ASR_XLARGE = Wav2Vec2ASRBundle(
948
+ "hubert_fairseq_xlarge_ll60k_asr_ls960.pth",
949
+ {
950
+ "extractor_mode": "layer_norm",
951
+ "extractor_conv_layer_config": [
952
+ (512, 10, 5),
953
+ (512, 3, 2),
954
+ (512, 3, 2),
955
+ (512, 3, 2),
956
+ (512, 3, 2),
957
+ (512, 2, 2),
958
+ (512, 2, 2),
959
+ ],
960
+ "extractor_conv_bias": False,
961
+ "encoder_embed_dim": 1280,
962
+ "encoder_projection_dropout": 0.0,
963
+ "encoder_pos_conv_kernel": 128,
964
+ "encoder_pos_conv_groups": 16,
965
+ "encoder_num_layers": 48,
966
+ "encoder_num_heads": 16,
967
+ "encoder_attention_dropout": 0.0,
968
+ "encoder_ff_interm_features": 5120,
969
+ "encoder_ff_interm_dropout": 0.1,
970
+ "encoder_dropout": 0.0,
971
+ "encoder_layer_norm_first": True,
972
+ "encoder_layer_drop": 0.1,
973
+ "aux_num_out": 29,
974
+ },
975
+ _labels=utils._get_en_labels(),
976
+ _sample_rate=16000,
977
+ _normalize_waveform=True,
978
+ _model_type="Wav2Vec2",
979
+ )
980
+ HUBERT_ASR_XLARGE.__doc__ = """HuBERT model ("extra large" architecture),
981
+ pre-trained on 60,000 hours of unlabeled audio from
982
+ *Libri-Light* dataset :cite:`librilight`, and
983
+ fine-tuned for ASR on 960 hours of transcribed audio from
984
+ *LibriSpeech* dataset :cite:`7178964`
985
+ (the combination of "train-clean-100", "train-clean-360", and "train-other-500").
986
+
987
+ Originally published by the authors of *HuBERT* :cite:`hsu2021hubert` under MIT License and
988
+ redistributed with the same license.
989
+ [`License <https://github.com/pytorch/fairseq/blob/ce6c9eeae163ac04b79539c78e74f292f29eaa18/LICENSE>`__,
990
+ `Source <https://github.com/pytorch/fairseq/blob/ce6c9eeae163ac04b79539c78e74f292f29eaa18/examples/hubert#pre-trained-and-fine-tuned-asr-models>`__]
991
+
992
+ Please refer to :py:class:`torchaudio.pipelines.Wav2Vec2ASRBundle` for the usage.
993
+ """ # noqa: E501
994
+
995
+
996
+ VOXPOPULI_ASR_BASE_10K_DE = Wav2Vec2ASRBundle(
997
+ "wav2vec2_voxpopuli_base_10k_asr_de.pt",
998
+ {
999
+ "extractor_mode": "group_norm",
1000
+ "extractor_conv_layer_config": [
1001
+ (512, 10, 5),
1002
+ (512, 3, 2),
1003
+ (512, 3, 2),
1004
+ (512, 3, 2),
1005
+ (512, 3, 2),
1006
+ (512, 2, 2),
1007
+ (512, 2, 2),
1008
+ ],
1009
+ "extractor_conv_bias": False,
1010
+ "encoder_embed_dim": 768,
1011
+ "encoder_projection_dropout": 0.0,
1012
+ "encoder_pos_conv_kernel": 128,
1013
+ "encoder_pos_conv_groups": 16,
1014
+ "encoder_num_layers": 12,
1015
+ "encoder_num_heads": 12,
1016
+ "encoder_attention_dropout": 0.0,
1017
+ "encoder_ff_interm_features": 3072,
1018
+ "encoder_ff_interm_dropout": 0.1,
1019
+ "encoder_dropout": 0.0,
1020
+ "encoder_layer_norm_first": False,
1021
+ "encoder_layer_drop": 0.1,
1022
+ "aux_num_out": 32,
1023
+ },
1024
+ _labels=utils._get_de_labels(),
1025
+ _sample_rate=16000,
1026
+ _normalize_waveform=False,
1027
+ _remove_aux_axis=(1, 2, 3, 35),
1028
+ _model_type="Wav2Vec2",
1029
+ )
1030
+ VOXPOPULI_ASR_BASE_10K_DE.__doc__ = """wav2vec 2.0 model ("base" architecture),
1031
+ pre-trained on 10k hours of unlabeled audio from *VoxPopuli* dataset :cite:`voxpopuli`
1032
+ ("10k" subset, consisting of 23 languages), and
1033
+ fine-tuned for ASR on 282 hours of transcribed audio from "de" subset.
1034
+
1035
+ Originally published by the authors of *VoxPopuli* :cite:`voxpopuli` under CC BY-NC 4.0 and
1036
+ redistributed with the same license.
1037
+ [`License <https://github.com/facebookresearch/voxpopuli/tree/160e4d7915bad9f99b2c35b1d3833e51fd30abf2#license>`__,
1038
+ `Source <https://github.com/facebookresearch/voxpopuli/tree/160e4d7915bad9f99b2c35b1d3833e51fd30abf2#asr-and-lm>`__]
1039
+
1040
+ Please refer to :py:class:`torchaudio.pipelines.Wav2Vec2ASRBundle` for the usage.
1041
+ """ # noqa: E501
1042
+
1043
+
1044
+ VOXPOPULI_ASR_BASE_10K_EN = Wav2Vec2ASRBundle(
1045
+ "wav2vec2_voxpopuli_base_10k_asr_en.pt",
1046
+ {
1047
+ "extractor_mode": "group_norm",
1048
+ "extractor_conv_layer_config": [
1049
+ (512, 10, 5),
1050
+ (512, 3, 2),
1051
+ (512, 3, 2),
1052
+ (512, 3, 2),
1053
+ (512, 3, 2),
1054
+ (512, 2, 2),
1055
+ (512, 2, 2),
1056
+ ],
1057
+ "extractor_conv_bias": False,
1058
+ "encoder_embed_dim": 768,
1059
+ "encoder_projection_dropout": 0.0,
1060
+ "encoder_pos_conv_kernel": 128,
1061
+ "encoder_pos_conv_groups": 16,
1062
+ "encoder_num_layers": 12,
1063
+ "encoder_num_heads": 12,
1064
+ "encoder_attention_dropout": 0.0,
1065
+ "encoder_ff_interm_features": 3072,
1066
+ "encoder_ff_interm_dropout": 0.1,
1067
+ "encoder_dropout": 0.0,
1068
+ "encoder_layer_norm_first": False,
1069
+ "encoder_layer_drop": 0.1,
1070
+ "aux_num_out": 28,
1071
+ },
1072
+ _labels=utils._get_vp_en_labels(),
1073
+ _sample_rate=16000,
1074
+ _normalize_waveform=False,
1075
+ _remove_aux_axis=(1, 2, 3, 31),
1076
+ _model_type="Wav2Vec2",
1077
+ )
1078
+ VOXPOPULI_ASR_BASE_10K_EN.__doc__ = """wav2vec 2.0 model ("base" architecture),
1079
+ pre-trained on 10k hours of unlabeled audio from *VoxPopuli* dataset :cite:`voxpopuli`
1080
+ ("10k" subset, consisting of 23 languages), and
1081
+ fine-tuned for ASR on 543 hours of transcribed audio from "en" subset.
1082
+
1083
+ Originally published by the authors of *VoxPopuli* :cite:`voxpopuli` under CC BY-NC 4.0 and
1084
+ redistributed with the same license.
1085
+ [`License <https://github.com/facebookresearch/voxpopuli/tree/160e4d7915bad9f99b2c35b1d3833e51fd30abf2#license>`__,
1086
+ `Source <https://github.com/facebookresearch/voxpopuli/tree/160e4d7915bad9f99b2c35b1d3833e51fd30abf2#asr-and-lm>`__]
1087
+
1088
+ Please refer to :py:class:`torchaudio.pipelines.Wav2Vec2ASRBundle` for the usage.
1089
+ """ # noqa: E501
1090
+
1091
+
1092
+ VOXPOPULI_ASR_BASE_10K_ES = Wav2Vec2ASRBundle(
1093
+ "wav2vec2_voxpopuli_base_10k_asr_es.pt",
1094
+ {
1095
+ "extractor_mode": "group_norm",
1096
+ "extractor_conv_layer_config": [
1097
+ (512, 10, 5),
1098
+ (512, 3, 2),
1099
+ (512, 3, 2),
1100
+ (512, 3, 2),
1101
+ (512, 3, 2),
1102
+ (512, 2, 2),
1103
+ (512, 2, 2),
1104
+ ],
1105
+ "extractor_conv_bias": False,
1106
+ "encoder_embed_dim": 768,
1107
+ "encoder_projection_dropout": 0.0,
1108
+ "encoder_pos_conv_kernel": 128,
1109
+ "encoder_pos_conv_groups": 16,
1110
+ "encoder_num_layers": 12,
1111
+ "encoder_num_heads": 12,
1112
+ "encoder_attention_dropout": 0.0,
1113
+ "encoder_ff_interm_features": 3072,
1114
+ "encoder_ff_interm_dropout": 0.1,
1115
+ "encoder_dropout": 0.0,
1116
+ "encoder_layer_norm_first": False,
1117
+ "encoder_layer_drop": 0.1,
1118
+ "aux_num_out": 35,
1119
+ },
1120
+ _labels=utils._get_es_labels(),
1121
+ _sample_rate=16000,
1122
+ _normalize_waveform=False,
1123
+ _remove_aux_axis=(1, 2, 3, 35),
1124
+ _model_type="Wav2Vec2",
1125
+ )
1126
+ VOXPOPULI_ASR_BASE_10K_ES.__doc__ = """wav2vec 2.0 model ("base" architecture),
1127
+ pre-trained on 10k hours of unlabeled audio from *VoxPopuli* dataset :cite:`voxpopuli`
1128
+ ("10k" subset, consisting of 23 languages), and
1129
+ fine-tuned for ASR on 166 hours of transcribed audio from "es" subset.
1130
+
1131
+ Originally published by the authors of *VoxPopuli* :cite:`voxpopuli` under CC BY-NC 4.0 and
1132
+ redistributed with the same license.
1133
+ [`License <https://github.com/facebookresearch/voxpopuli/tree/160e4d7915bad9f99b2c35b1d3833e51fd30abf2#license>`__,
1134
+ `Source <https://github.com/facebookresearch/voxpopuli/tree/160e4d7915bad9f99b2c35b1d3833e51fd30abf2#asr-and-lm>`__]
1135
+
1136
+ Please refer to :py:class:`torchaudio.pipelines.Wav2Vec2ASRBundle` for the usage.
1137
+ """ # noqa: E501
1138
+
1139
+ VOXPOPULI_ASR_BASE_10K_FR = Wav2Vec2ASRBundle(
1140
+ "wav2vec2_voxpopuli_base_10k_asr_fr.pt",
1141
+ {
1142
+ "extractor_mode": "group_norm",
1143
+ "extractor_conv_layer_config": [
1144
+ (512, 10, 5),
1145
+ (512, 3, 2),
1146
+ (512, 3, 2),
1147
+ (512, 3, 2),
1148
+ (512, 3, 2),
1149
+ (512, 2, 2),
1150
+ (512, 2, 2),
1151
+ ],
1152
+ "extractor_conv_bias": False,
1153
+ "encoder_embed_dim": 768,
1154
+ "encoder_projection_dropout": 0.0,
1155
+ "encoder_pos_conv_kernel": 128,
1156
+ "encoder_pos_conv_groups": 16,
1157
+ "encoder_num_layers": 12,
1158
+ "encoder_num_heads": 12,
1159
+ "encoder_attention_dropout": 0.0,
1160
+ "encoder_ff_interm_features": 3072,
1161
+ "encoder_ff_interm_dropout": 0.1,
1162
+ "encoder_dropout": 0.0,
1163
+ "encoder_layer_norm_first": False,
1164
+ "encoder_layer_drop": 0.1,
1165
+ "aux_num_out": 43,
1166
+ },
1167
+ _labels=utils._get_fr_labels(),
1168
+ _sample_rate=16000,
1169
+ _normalize_waveform=False,
1170
+ _model_type="Wav2Vec2",
1171
+ )
1172
+ VOXPOPULI_ASR_BASE_10K_FR.__doc__ = """wav2vec 2.0 model ("base" architecture),
1173
+ pre-trained on 10k hours of unlabeled audio from *VoxPopuli* dataset :cite:`voxpopuli`
1174
+ ("10k" subset, consisting of 23 languages), and
1175
+ fine-tuned for ASR on 211 hours of transcribed audio from "fr" subset.
1176
+
1177
+ Originally published by the authors of *VoxPopuli* :cite:`voxpopuli` under CC BY-NC 4.0 and
1178
+ redistributed with the same license.
1179
+ [`License <https://github.com/facebookresearch/voxpopuli/tree/160e4d7915bad9f99b2c35b1d3833e51fd30abf2#license>`__,
1180
+ `Source <https://github.com/facebookresearch/voxpopuli/tree/160e4d7915bad9f99b2c35b1d3833e51fd30abf2#asr-and-lm>`__]
1181
+
1182
+ Please refer to :py:class:`torchaudio.pipelines.Wav2Vec2ASRBundle` for the usage.
1183
+ """ # noqa: E501
1184
+
1185
+
1186
+ VOXPOPULI_ASR_BASE_10K_IT = Wav2Vec2ASRBundle(
1187
+ "wav2vec2_voxpopuli_base_10k_asr_it.pt",
1188
+ {
1189
+ "extractor_mode": "group_norm",
1190
+ "extractor_conv_layer_config": [
1191
+ (512, 10, 5),
1192
+ (512, 3, 2),
1193
+ (512, 3, 2),
1194
+ (512, 3, 2),
1195
+ (512, 3, 2),
1196
+ (512, 2, 2),
1197
+ (512, 2, 2),
1198
+ ],
1199
+ "extractor_conv_bias": False,
1200
+ "encoder_embed_dim": 768,
1201
+ "encoder_projection_dropout": 0.0,
1202
+ "encoder_pos_conv_kernel": 128,
1203
+ "encoder_pos_conv_groups": 16,
1204
+ "encoder_num_layers": 12,
1205
+ "encoder_num_heads": 12,
1206
+ "encoder_attention_dropout": 0.0,
1207
+ "encoder_ff_interm_features": 3072,
1208
+ "encoder_ff_interm_dropout": 0.1,
1209
+ "encoder_dropout": 0.0,
1210
+ "encoder_layer_norm_first": False,
1211
+ "encoder_layer_drop": 0.1,
1212
+ "aux_num_out": 37,
1213
+ },
1214
+ _labels=utils._get_it_labels(),
1215
+ _sample_rate=16000,
1216
+ _normalize_waveform=False,
1217
+ _remove_aux_axis=(1, 2, 3),
1218
+ _model_type="Wav2Vec2",
1219
+ )
1220
+ VOXPOPULI_ASR_BASE_10K_IT.__doc__ = """wav2vec 2.0 model ("base" architecture),
1221
+ pre-trained on 10k hours of unlabeled audio from *VoxPopuli* dataset :cite:`voxpopuli`
1222
+ ("10k" subset, consisting of 23 languages), and
1223
+ fine-tuned for ASR on 91 hours of transcribed audio from "it" subset.
1224
+
1225
+ Originally published by the authors of *VoxPopuli* :cite:`voxpopuli` under CC BY-NC 4.0 and
1226
+ redistributed with the same license.
1227
+ [`License <https://github.com/facebookresearch/voxpopuli/tree/160e4d7915bad9f99b2c35b1d3833e51fd30abf2#license>`__,
1228
+ `Source <https://github.com/facebookresearch/voxpopuli/tree/160e4d7915bad9f99b2c35b1d3833e51fd30abf2#asr-and-lm>`__]
1229
+
1230
+ Please refer to :py:class:`torchaudio.pipelines.Wav2Vec2ASRBundle` for the usage.
1231
+ """ # noqa: E501
1232
+
1233
+
1234
+ WAVLM_BASE = Wav2Vec2Bundle(
1235
+ "wavlm_base.pth",
1236
+ {
1237
+ "extractor_mode": "group_norm",
1238
+ "extractor_conv_layer_config": [
1239
+ (512, 10, 5),
1240
+ (512, 3, 2),
1241
+ (512, 3, 2),
1242
+ (512, 3, 2),
1243
+ (512, 3, 2),
1244
+ (512, 2, 2),
1245
+ (512, 2, 2),
1246
+ ],
1247
+ "extractor_conv_bias": False,
1248
+ "encoder_embed_dim": 768,
1249
+ "encoder_projection_dropout": 0.1,
1250
+ "encoder_pos_conv_kernel": 128,
1251
+ "encoder_pos_conv_groups": 16,
1252
+ "encoder_num_layers": 12,
1253
+ "encoder_num_heads": 12,
1254
+ "encoder_max_distance": 800,
1255
+ "encoder_num_buckets": 320,
1256
+ "encoder_attention_dropout": 0.1,
1257
+ "encoder_ff_interm_features": 3072,
1258
+ "encoder_ff_interm_dropout": 0.0,
1259
+ "encoder_dropout": 0.1,
1260
+ "encoder_layer_norm_first": False,
1261
+ "encoder_layer_drop": 0.05,
1262
+ "aux_num_out": None,
1263
+ },
1264
+ _model_type="WavLM",
1265
+ _sample_rate=16000,
1266
+ _normalize_waveform=False,
1267
+ )
1268
+ WAVLM_BASE.__doc__ = """WavLM Base model ("base" architecture),
1269
+ pre-trained on 960 hours of unlabeled audio from *LibriSpeech* dataset :cite:`7178964`, not fine-tuned.
1270
+
1271
+ Originally published by the authors of *WavLM* :cite:`chen2022wavlm` under MIT License and
1272
+ redistributed with the same license.
1273
+ [`License <https://github.com/microsoft/unilm/blob/65f15af2a307ebb64cfb25adf54375b002e6fe8d/LICENSE>`__,
1274
+ `Source <https://github.com/microsoft/unilm/tree/65f15af2a307ebb64cfb25adf54375b002e6fe8d/wavlm#pre-trained-models>`__]
1275
+
1276
+ Please refer to :py:class:`torchaudio.pipelines.Wav2Vec2Bundle` for the usage.
1277
+ """ # noqa: E501
1278
+
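The WavLM bundles set _model_type="WavLM" and introduce the bucketed relative-position parameters encoder_num_buckets and encoder_max_distance. Assuming utils._get_model dispatches this model type to torchaudio.models.wavlm_model, the same architecture can be constructed directly:

    from torchaudio.models import wavlm_model
    from torchaudio.pipelines import WAVLM_BASE

    # Untrained WavLM Base with the configuration listed above; get_model() additionally
    # downloads and loads the pretrained weights.
    model = wavlm_model(**WAVLM_BASE._params)
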
1279
+
1280
+ WAVLM_BASE_PLUS = Wav2Vec2Bundle(
1281
+ "wavlm_base_plus.pth",
1282
+ {
1283
+ "extractor_mode": "group_norm",
1284
+ "extractor_conv_layer_config": [
1285
+ (512, 10, 5),
1286
+ (512, 3, 2),
1287
+ (512, 3, 2),
1288
+ (512, 3, 2),
1289
+ (512, 3, 2),
1290
+ (512, 2, 2),
1291
+ (512, 2, 2),
1292
+ ],
1293
+ "extractor_conv_bias": False,
1294
+ "encoder_embed_dim": 768,
1295
+ "encoder_projection_dropout": 0.1,
1296
+ "encoder_pos_conv_kernel": 128,
1297
+ "encoder_pos_conv_groups": 16,
1298
+ "encoder_num_layers": 12,
1299
+ "encoder_num_heads": 12,
1300
+ "encoder_max_distance": 800,
1301
+ "encoder_num_buckets": 320,
1302
+ "encoder_attention_dropout": 0.1,
1303
+ "encoder_ff_interm_features": 3072,
1304
+ "encoder_ff_interm_dropout": 0.0,
1305
+ "encoder_dropout": 0.1,
1306
+ "encoder_layer_norm_first": False,
1307
+ "encoder_layer_drop": 0.05,
1308
+ "aux_num_out": None,
1309
+ },
1310
+ _model_type="WavLM",
1311
+ _sample_rate=16000,
1312
+ _normalize_waveform=False,
1313
+ )
1314
+ WAVLM_BASE_PLUS.__doc__ = """WavLM Base+ model ("base" architecture),
1315
+ pre-trained on 60,000 hours of Libri-Light dataset :cite:`librilight`, 10,000 hours of GigaSpeech :cite:`GigaSpeech2021`,
1316
+ and 24,000 hours of *VoxPopuli* :cite:`voxpopuli`, not fine-tuned.
1317
+
1318
+ Originally published by the authors of *WavLM* :cite:`chen2022wavlm` under MIT License and
1319
+ redistributed with the same license.
1320
+ [`License <https://github.com/microsoft/unilm/blob/65f15af2a307ebb64cfb25adf54375b002e6fe8d/LICENSE>`__,
1321
+ `Source <https://github.com/microsoft/unilm/tree/65f15af2a307ebb64cfb25adf54375b002e6fe8d/wavlm#pre-trained-models>`__]
1322
+
1323
+ Please refer to :py:class:`torchaudio.pipelines.Wav2Vec2Bundle` for the usage.
1324
+ """ # noqa: E501
+
+
+WAVLM_LARGE = Wav2Vec2Bundle(
+    "wavlm_large.pth",
+    {
+        "extractor_mode": "layer_norm",
+        "extractor_conv_layer_config": [
+            (512, 10, 5),
+            (512, 3, 2),
+            (512, 3, 2),
+            (512, 3, 2),
+            (512, 3, 2),
+            (512, 2, 2),
+            (512, 2, 2),
+        ],
+        "extractor_conv_bias": False,
+        "encoder_embed_dim": 1024,
+        "encoder_projection_dropout": 0.1,
+        "encoder_pos_conv_kernel": 128,
+        "encoder_pos_conv_groups": 16,
+        "encoder_num_layers": 24,
+        "encoder_num_heads": 16,
+        "encoder_max_distance": 800,
+        "encoder_num_buckets": 320,
+        "encoder_attention_dropout": 0.1,
+        "encoder_ff_interm_features": 4096,
+        "encoder_ff_interm_dropout": 0.0,
+        "encoder_dropout": 0.1,
+        "encoder_layer_norm_first": True,
+        "encoder_layer_drop": 0.05,
+        "aux_num_out": None,
+    },
+    _model_type="WavLM",
+    _sample_rate=16000,
+    _normalize_waveform=True,
+)
+WAVLM_LARGE.__doc__ = """WavLM Large model ("large" architecture),
+pre-trained on 60,000 hours of Libri-Light dataset :cite:`librilight`, 10,000 hours of GigaSpeech :cite:`GigaSpeech2021`,
+and 24,000 hours of *VoxPopuli* :cite:`voxpopuli`, not fine-tuned.
+
+Originally published by the authors of *WavLM* :cite:`chen2022wavlm` under MIT License and
+redistributed with the same license.
+[`License <https://github.com/microsoft/unilm/blob/65f15af2a307ebb64cfb25adf54375b002e6fe8d/LICENSE>`__,
+`Source <https://github.com/microsoft/unilm/tree/65f15af2a307ebb64cfb25adf54375b002e6fe8d/wavlm#pre-trained-models>`__]
+
+Please refer to :py:class:`torchaudio.pipelines.Wav2Vec2Bundle` for the usage.
+"""  # noqa: E501
+
+
+WAV2VEC2_XLSR_300M = Wav2Vec2Bundle(
+    "wav2vec2_xlsr_300m.pth",
+    {
+        "extractor_mode": "layer_norm",
+        "extractor_conv_layer_config": [
+            (512, 10, 5),
+            (512, 3, 2),
+            (512, 3, 2),
+            (512, 3, 2),
+            (512, 3, 2),
+            (512, 2, 2),
+            (512, 2, 2),
+        ],
+        "extractor_conv_bias": True,
+        "encoder_embed_dim": 1024,
+        "encoder_projection_dropout": 0.0,
+        "encoder_pos_conv_kernel": 128,
+        "encoder_pos_conv_groups": 16,
+        "encoder_num_layers": 24,
+        "encoder_num_heads": 16,
+        "encoder_attention_dropout": 0.0,
+        "encoder_ff_interm_features": 4096,
+        "encoder_ff_interm_dropout": 0.0,
+        "encoder_dropout": 0.0,
+        "encoder_layer_norm_first": True,
+        "encoder_layer_drop": 0.0,
+        "aux_num_out": None,
+    },
+    _model_type="Wav2Vec2",
+    _sample_rate=16000,
+    _normalize_waveform=True,
+)
+WAV2VEC2_XLSR_300M.__doc__ = """XLS-R model with 300 million parameters,
+pre-trained on 436,000 hours of unlabeled audio from multiple datasets (
+*Multilingual LibriSpeech* :cite:`Pratap_2020`,
+*CommonVoice* :cite:`ardila2020common`,
+*VoxLingua107* :cite:`valk2021voxlingua107`,
+*BABEL* :cite:`Gales2014SpeechRA`, and
+*VoxPopuli* :cite:`voxpopuli`) in 128 languages,
+not fine-tuned.
+
+Originally published by the authors of *XLS-R* :cite:`babu2021xls` under MIT License and
+redistributed with the same license.
+[`License <https://github.com/facebookresearch/fairseq/blob/30c912b73c0f88d41171879b2f03226a171004ef/LICENSE>`__,
+`Source <https://github.com/facebookresearch/fairseq/tree/30c912b73c0f88d41171879b2f03226a171004ef/examples/wav2vec/xlsr#xls-r>`__]
+
+Please refer to :py:class:`torchaudio.pipelines.Wav2Vec2Bundle` for usage details.
+"""  # noqa: E501
+
+
+WAV2VEC2_XLSR_1B = Wav2Vec2Bundle(
+    "wav2vec2_xlsr_1b.pth",
+    {
+        "extractor_mode": "layer_norm",
+        "extractor_conv_layer_config": [
+            (512, 10, 5),
+            (512, 3, 2),
+            (512, 3, 2),
+            (512, 3, 2),
+            (512, 3, 2),
+            (512, 2, 2),
+            (512, 2, 2),
+        ],
+        "extractor_conv_bias": True,
+        "encoder_embed_dim": 1280,
+        "encoder_projection_dropout": 0.1,
+        "encoder_pos_conv_kernel": 128,
+        "encoder_pos_conv_groups": 16,
+        "encoder_num_layers": 48,
+        "encoder_num_heads": 16,
+        "encoder_attention_dropout": 0.0,
+        "encoder_ff_interm_features": 5120,
+        "encoder_ff_interm_dropout": 0.0,
+        "encoder_dropout": 0.0,
+        "encoder_layer_norm_first": True,
+        "encoder_layer_drop": 0.0,
+        "aux_num_out": None,
+    },
+    _model_type="Wav2Vec2",
+    _sample_rate=16000,
+    _normalize_waveform=True,
+)
+WAV2VEC2_XLSR_1B.__doc__ = """XLS-R model with 1 billion parameters,
+pre-trained on 436,000 hours of unlabeled audio from multiple datasets (
+*Multilingual LibriSpeech* :cite:`Pratap_2020`,
+*CommonVoice* :cite:`ardila2020common`,
+*VoxLingua107* :cite:`valk2021voxlingua107`,
+*BABEL* :cite:`Gales2014SpeechRA`, and
+*VoxPopuli* :cite:`voxpopuli`) in 128 languages,
+not fine-tuned.
+
+Originally published by the authors of *XLS-R* :cite:`babu2021xls` under MIT License and
+redistributed with the same license.
+[`License <https://github.com/facebookresearch/fairseq/blob/30c912b73c0f88d41171879b2f03226a171004ef/LICENSE>`__,
+`Source <https://github.com/facebookresearch/fairseq/tree/30c912b73c0f88d41171879b2f03226a171004ef/examples/wav2vec/xlsr#xls-r>`__]
+
+Please refer to :py:class:`torchaudio.pipelines.Wav2Vec2Bundle` for usage details.
+"""  # noqa: E501
+
+WAV2VEC2_XLSR_2B = Wav2Vec2Bundle(
+    "wav2vec2_xlsr_2b.pth",
+    {
+        "extractor_mode": "layer_norm",
+        "extractor_conv_layer_config": [
+            (512, 10, 5),
+            (512, 3, 2),
+            (512, 3, 2),
+            (512, 3, 2),
+            (512, 3, 2),
+            (512, 2, 2),
+            (512, 2, 2),
+        ],
+        "extractor_conv_bias": True,
+        "encoder_embed_dim": 1920,
+        "encoder_projection_dropout": 0.1,
+        "encoder_pos_conv_kernel": 128,
+        "encoder_pos_conv_groups": 16,
+        "encoder_num_layers": 48,
+        "encoder_num_heads": 16,
+        "encoder_attention_dropout": 0.0,
+        "encoder_ff_interm_features": 7680,
+        "encoder_ff_interm_dropout": 0.0,
+        "encoder_dropout": 0.0,
+        "encoder_layer_norm_first": True,
+        "encoder_layer_drop": 0.0,
+        "aux_num_out": None,
+    },
+    _model_type="Wav2Vec2",
+    _sample_rate=16000,
+    _normalize_waveform=True,
+)
+WAV2VEC2_XLSR_2B.__doc__ = """XLS-R model with 2 billion parameters,
+pre-trained on 436,000 hours of unlabeled audio from multiple datasets (
+*Multilingual LibriSpeech* :cite:`Pratap_2020`,
+*CommonVoice* :cite:`ardila2020common`,
+*VoxLingua107* :cite:`valk2021voxlingua107`,
+*BABEL* :cite:`Gales2014SpeechRA`, and
+*VoxPopuli* :cite:`voxpopuli`) in 128 languages,
+not fine-tuned.
+
+Originally published by the authors of *XLS-R* :cite:`babu2021xls` under MIT License and
+redistributed with the same license.
+[`License <https://github.com/facebookresearch/fairseq/blob/30c912b73c0f88d41171879b2f03226a171004ef/LICENSE>`__,
+`Source <https://github.com/facebookresearch/fairseq/tree/30c912b73c0f88d41171879b2f03226a171004ef/examples/wav2vec/xlsr#xls-r>`__]
+
+Please refer to :py:class:`torchaudio.pipelines.Wav2Vec2Bundle` for usage details.
+"""  # noqa: E501
+
+
+@dataclass
+class Wav2Vec2FABundle(Wav2Vec2ASRBundle):
+    """Data class that bundles associated information to use pretrained :py:class:`~torchaudio.models.Wav2Vec2Model` for forced alignment.
+
+    This class provides interfaces for instantiating the pretrained model along with
+    the information necessary to retrieve pretrained weights and additional data
+    to be used with the model.
+
+    Torchaudio library instantiates objects of this class, each of which represents
+    a different pretrained model. Client code should access pretrained models via these
+    instances.
+
+    Please see below for the usage and the available values.
+
+    Example - Feature Extraction
+        >>> import torchaudio
+        >>>
+        >>> bundle = torchaudio.pipelines.MMS_FA
+        >>>
+        >>> # Build the model and load pretrained weight.
+        >>> model = bundle.get_model()
+        Downloading:
+        100%|███████████████████████████████| 1.18G/1.18G [00:05<00:00, 216MB/s]
+        >>>
+        >>> # Resample audio to the expected sampling rate
+        >>> waveform = torchaudio.functional.resample(waveform, sample_rate, bundle.sample_rate)
+        >>>
+        >>> # Estimate the probability of token distribution
+        >>> emission, _ = model(waveform)
+        >>>
+        >>> # Generate frame-wise alignment
+        >>> alignment, scores = torchaudio.functional.forced_align(
+        >>>     emission, targets, input_lengths, target_lengths, blank=0)
+        >>>
+    """  # noqa: E501
1558
+
1559
+ class Tokenizer(aligner.ITokenizer):
1560
+ """Interface of the tokenizer"""
1561
+
1562
+ class Aligner(aligner.IAligner):
1563
+ """Interface of the aligner"""
1564
+
1565
+ def get_labels(self, star: Optional[str] = "*", blank: str = "-") -> Tuple[str, ...]:
1566
+ """Get the labels corresponding to the feature dimension of emission.
1567
+
+        The first entry is the blank token, and it is customizable.
+
+        Args:
+            star (str or None, optional): Change or disable star token. (default: ``"*"``)
+            blank (str, optional): Change the blank token. (default: ``'-'``)
+
+        Returns:
+            Tuple[str, ...]:
+                For models fine-tuned on ASR, returns the tuple of strings representing
+                the output class labels.
+
+        Example
+            >>> from torchaudio.pipelines import MMS_FA as bundle
+            >>> bundle.get_labels()
+            ('-', 'a', 'i', 'e', 'n', 'o', 'u', 't', 's', 'r', 'm', 'k', 'l', 'd', 'g', 'h', 'y', 'b', 'p', 'w', 'c', 'v', 'j', 'z', 'f', "'", 'q', 'x', '*')
+            >>> bundle.get_labels(star=None)
+            ('-', 'a', 'i', 'e', 'n', 'o', 'u', 't', 's', 'r', 'm', 'k', 'l', 'd', 'g', 'h', 'y', 'b', 'p', 'w', 'c', 'v', 'j', 'z', 'f', "'", 'q', 'x')
+        """  # noqa: E501
+        labels = super().get_labels(blank=blank)
+        return labels if star is None else (*labels, star)
+
+    def get_model(self, with_star: bool = True, *, dl_kwargs=None) -> Module:
+        """Construct the model and load the pretrained weight.
+
+        The weight file is downloaded from the internet and cached with
+        :func:`torch.hub.load_state_dict_from_url`.
+
+        Args:
+            with_star (bool, optional): If enabled, the last dimension of the output layer is
+                extended by one, which corresponds to the `star` token.
+            dl_kwargs (dictionary of keyword arguments): Passed to :func:`torch.hub.load_state_dict_from_url`.
+
+        Returns:
+            Variation of :py:class:`~torchaudio.models.Wav2Vec2Model`.
+
+        .. note::
+
+           The model created with this method returns probabilities in the log domain
+           (i.e. :py:func:`torch.nn.functional.log_softmax` is applied), whereas
+           the other Wav2Vec2 models return logits.
+        """
+        model = utils._get_model(self._model_type, self._params)
+        state_dict = utils._get_state_dict(self._path, dl_kwargs, self._remove_aux_axis)
+        model.load_state_dict(state_dict)
+        model = utils._extend_model(
+            model, normalize_waveform=self._normalize_waveform, apply_log_softmax=True, append_star=with_star
+        )
+        model.eval()
+        return model
+
+    def get_dict(self, star: Optional[str] = "*", blank: str = "-") -> Dict[str, int]:
+        """Get the mapping from token to index (in the emission feature dimension).
+
+        Args:
+            star (str or None, optional): Change or disable star token. (default: ``"*"``)
+            blank (str, optional): Change the blank token. (default: ``'-'``)
+
+        Returns:
+            Dict[str, int]:
+                The mapping from each token to its index in the emission feature dimension.
+
+        Example
+            >>> from torchaudio.pipelines import MMS_FA as bundle
+            >>> bundle.get_dict()
+            {'-': 0, 'a': 1, 'i': 2, 'e': 3, 'n': 4, 'o': 5, 'u': 6, 't': 7, 's': 8, 'r': 9, 'm': 10, 'k': 11, 'l': 12, 'd': 13, 'g': 14, 'h': 15, 'y': 16, 'b': 17, 'p': 18, 'w': 19, 'c': 20, 'v': 21, 'j': 22, 'z': 23, 'f': 24, "'": 25, 'q': 26, 'x': 27, '*': 28}
+            >>> bundle.get_dict(star=None)
+            {'-': 0, 'a': 1, 'i': 2, 'e': 3, 'n': 4, 'o': 5, 'u': 6, 't': 7, 's': 8, 'r': 9, 'm': 10, 'k': 11, 'l': 12, 'd': 13, 'g': 14, 'h': 15, 'y': 16, 'b': 17, 'p': 18, 'w': 19, 'c': 20, 'v': 21, 'j': 22, 'z': 23, 'f': 24, "'": 25, 'q': 26, 'x': 27}
+        """  # noqa: E501
+        return {k: i for i, k in enumerate(self.get_labels(star=star, blank=blank))}
+
+    def get_tokenizer(self) -> Tokenizer:
+        """Instantiate a Tokenizer.
+
+        Returns:
+            Tokenizer
+        """
+        return aligner.Tokenizer(self.get_dict())
+
+    def get_aligner(self) -> Aligner:
+        """Instantiate an Aligner.
+
+        Returns:
+            Aligner
+        """
+        return aligner.Aligner(blank=0)
+
+
+MMS_FA = Wav2Vec2FABundle(
+    "https://dl.fbaipublicfiles.com/mms/torchaudio/ctc_alignment_mling_uroman/model.pt",
+    {
+        "extractor_mode": "layer_norm",
+        "extractor_conv_layer_config": [
+            (512, 10, 5),
+            (512, 3, 2),
+            (512, 3, 2),
+            (512, 3, 2),
+            (512, 3, 2),
+            (512, 2, 2),
+            (512, 2, 2),
+        ],
+        "extractor_conv_bias": True,
+        "encoder_embed_dim": 1024,
+        "encoder_projection_dropout": 0.0,
+        "encoder_pos_conv_kernel": 128,
+        "encoder_pos_conv_groups": 16,
+        "encoder_num_layers": 24,
+        "encoder_num_heads": 16,
+        "encoder_attention_dropout": 0.0,
+        "encoder_ff_interm_features": 4096,
+        "encoder_ff_interm_dropout": 0.1,
+        "encoder_dropout": 0.0,
+        "encoder_layer_norm_first": True,
+        "encoder_layer_drop": 0.1,
+        "aux_num_out": 28,
+    },
+    _labels=utils._get_mms_labels(),
+    _sample_rate=16000,
+    _normalize_waveform=True,
+    _model_type="Wav2Vec2",
+)
+MMS_FA.__doc__ = """
+Trained on 31K hours of data in 1,130 languages from *Scaling Speech Technology to 1,000+ Languages* :cite:`pratap2023scaling`.
+
+Published by the authors of *Scaling Speech Technology to 1,000+ Languages* :cite:`pratap2023scaling` under [`CC-BY-NC 4.0 License <https://github.com/facebookresearch/fairseq/tree/100cd91db19bb27277a06a25eb4154c805b10189/examples/mms#license>`__].
+
+Please refer to :py:class:`torchaudio.pipelines.Wav2Vec2FABundle` for usage details.
+
+.. note::
+
+   Unlike other Wav2Vec2 bundles, this model does not have a token for word boundary (like `|`). This makes the post-processing of alignments slightly different.
+"""  # noqa: E501