torchaudio-2.8.0-cp313-cp313t-win_amd64.whl → torchaudio-2.9.0-cp313-cp313t-win_amd64.whl
This diff compares the contents of two publicly released versions of the package, as they appear in their public registry. It is provided for informational purposes only.
Potentially problematic release: this version of torchaudio has been flagged as possibly problematic.
- torchaudio/__init__.py +179 -39
- torchaudio/_extension/__init__.py +1 -14
- torchaudio/_extension/utils.py +0 -47
- torchaudio/_internal/module_utils.py +12 -3
- torchaudio/_torchcodec.py +73 -85
- torchaudio/datasets/cmuarctic.py +1 -1
- torchaudio/datasets/utils.py +1 -1
- torchaudio/functional/__init__.py +0 -2
- torchaudio/functional/_alignment.py +1 -1
- torchaudio/functional/filtering.py +70 -55
- torchaudio/functional/functional.py +26 -60
- torchaudio/lib/_torchaudio.pyd +0 -0
- torchaudio/lib/libtorchaudio.pyd +0 -0
- torchaudio/models/decoder/__init__.py +14 -2
- torchaudio/models/decoder/_ctc_decoder.py +6 -6
- torchaudio/models/decoder/_cuda_ctc_decoder.py +1 -1
- torchaudio/models/squim/objective.py +2 -2
- torchaudio/pipelines/_source_separation_pipeline.py +1 -1
- torchaudio/pipelines/_squim_pipeline.py +2 -2
- torchaudio/pipelines/_tts/utils.py +1 -1
- torchaudio/pipelines/rnnt_pipeline.py +4 -4
- torchaudio/transforms/__init__.py +1 -0
- torchaudio/transforms/_transforms.py +2 -2
- torchaudio/utils/__init__.py +2 -9
- torchaudio/utils/download.py +1 -3
- torchaudio/version.py +2 -2
- {torchaudio-2.8.0.dist-info → torchaudio-2.9.0.dist-info}/METADATA +8 -11
- torchaudio-2.9.0.dist-info/RECORD +85 -0
- {torchaudio-2.8.0.dist-info → torchaudio-2.9.0.dist-info}/top_level.txt +0 -1
- torchaudio/_backend/__init__.py +0 -61
- torchaudio/_backend/backend.py +0 -53
- torchaudio/_backend/common.py +0 -52
- torchaudio/_backend/ffmpeg.py +0 -334
- torchaudio/_backend/soundfile.py +0 -54
- torchaudio/_backend/soundfile_backend.py +0 -457
- torchaudio/_backend/sox.py +0 -91
- torchaudio/_backend/utils.py +0 -350
- torchaudio/backend/__init__.py +0 -8
- torchaudio/backend/_no_backend.py +0 -25
- torchaudio/backend/_sox_io_backend.py +0 -294
- torchaudio/backend/common.py +0 -13
- torchaudio/backend/no_backend.py +0 -14
- torchaudio/backend/soundfile_backend.py +0 -14
- torchaudio/backend/sox_io_backend.py +0 -14
- torchaudio/io/__init__.py +0 -20
- torchaudio/io/_effector.py +0 -347
- torchaudio/io/_playback.py +0 -72
- torchaudio/kaldi_io.py +0 -150
- torchaudio/prototype/__init__.py +0 -0
- torchaudio/prototype/datasets/__init__.py +0 -4
- torchaudio/prototype/datasets/musan.py +0 -68
- torchaudio/prototype/functional/__init__.py +0 -26
- torchaudio/prototype/functional/_dsp.py +0 -441
- torchaudio/prototype/functional/_rir.py +0 -382
- torchaudio/prototype/functional/functional.py +0 -193
- torchaudio/prototype/models/__init__.py +0 -39
- torchaudio/prototype/models/_conformer_wav2vec2.py +0 -801
- torchaudio/prototype/models/_emformer_hubert.py +0 -337
- torchaudio/prototype/models/conv_emformer.py +0 -529
- torchaudio/prototype/models/hifi_gan.py +0 -342
- torchaudio/prototype/models/rnnt.py +0 -717
- torchaudio/prototype/models/rnnt_decoder.py +0 -402
- torchaudio/prototype/pipelines/__init__.py +0 -21
- torchaudio/prototype/pipelines/_vggish/__init__.py +0 -7
- torchaudio/prototype/pipelines/_vggish/_vggish_impl.py +0 -236
- torchaudio/prototype/pipelines/_vggish/_vggish_pipeline.py +0 -83
- torchaudio/prototype/pipelines/hifigan_pipeline.py +0 -233
- torchaudio/prototype/pipelines/rnnt_pipeline.py +0 -58
- torchaudio/prototype/transforms/__init__.py +0 -9
- torchaudio/prototype/transforms/_transforms.py +0 -461
- torchaudio/sox_effects/__init__.py +0 -10
- torchaudio/sox_effects/sox_effects.py +0 -275
- torchaudio/utils/ffmpeg_utils.py +0 -11
- torchaudio/utils/sox_utils.py +0 -118
- torchaudio-2.8.0.dist-info/RECORD +0 -145
- torio/__init__.py +0 -8
- torio/_extension/__init__.py +0 -13
- torio/_extension/utils.py +0 -147
- torio/io/__init__.py +0 -9
- torio/io/_streaming_media_decoder.py +0 -977
- torio/io/_streaming_media_encoder.py +0 -502
- torio/lib/__init__.py +0 -0
- torio/lib/_torio_ffmpeg4.pyd +0 -0
- torio/lib/_torio_ffmpeg5.pyd +0 -0
- torio/lib/_torio_ffmpeg6.pyd +0 -0
- torio/lib/libtorio_ffmpeg4.pyd +0 -0
- torio/lib/libtorio_ffmpeg5.pyd +0 -0
- torio/lib/libtorio_ffmpeg6.pyd +0 -0
- torio/utils/__init__.py +0 -4
- torio/utils/ffmpeg_utils.py +0 -275
- {torchaudio-2.8.0.dist-info → torchaudio-2.9.0.dist-info}/LICENSE +0 -0
- {torchaudio-2.8.0.dist-info → torchaudio-2.9.0.dist-info}/WHEEL +0 -0
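
The bulk of this release is removal: the legacy I/O backends (torchaudio/_backend, torchaudio/backend), torchaudio.io, torchaudio.kaldi_io, torchaudio.sox_effects, the entire torchaudio.prototype namespace, and the bundled torio package are gone from the 2.9.0 wheel. A minimal compatibility sketch, assuming only the deletions listed above (the module names checked are representative examples, not an exhaustive list):

import importlib.util

# Top-level namespaces deleted between 2.8.0 and 2.9.0, per the file list above.
REMOVED = ["torchaudio.prototype", "torchaudio.sox_effects", "torchaudio.kaldi_io", "torio"]

for name in REMOVED:
    if importlib.util.find_spec(name) is None:
        # Present in the 2.8.0 wheel, absent from 2.9.0; pin the previous
        # release (pip install "torchaudio==2.8.0") if you still depend on it.
        print(f"{name}: not available in this torchaudio install")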
torchaudio/prototype/models/_conformer_wav2vec2.py (deleted)

@@ -1,801 +0,0 @@
-from typing import List, Optional, Tuple, Union
-
-import torch
-from torch import nn, Tensor
-from torch.nn import Module, ModuleList
-from torchaudio.models import Wav2Vec2Model
-from torchaudio.models.conformer import ConformerLayer
-from torchaudio.models.rnnt import _TimeReduction
-from torchaudio.models.wav2vec2 import components
-from torchaudio._internal.module_utils import dropping_class_support, dropping_support
-
-
-def _buffered_arange(max) -> Tensor:
-    """Compute arange using a buffered tensor across function calls.
-    Produces same result as torch.arange(end=max).
-
-    Args:
-        max (int): Ending value for arange.
-    """
-    if not hasattr(_buffered_arange, "buf"):
-        _buffered_arange.buf = torch.LongTensor()
-    if max > _buffered_arange.buf.numel():
-        _buffered_arange.buf.resize_(max)
-        torch.arange(max, out=_buffered_arange.buf)
-    return _buffered_arange.buf[:max]
-
-
-def _sample_negatives(input: Tensor, num_negatives: int, cross_sample_negatives: int) -> Tuple[Tensor, Tensor]:
-    """Sample negative examples from masked input.
-
-    Args:
-        input (Tensor): Tensor of dimension `(batch, frame, dim)`.
-        num_negatives (int): Number of negative examples to sample.
-        cross_sample_negatives (int): Number of negative examples to cross sample.
-
-    Returns:
-        (Tensor, Tensor):
-        Tensor
-            The negative samples.
-        Tensor
-            The indices of the negative samples.
-    """
-    if num_negatives == 0 and cross_sample_negatives == 0:
-        return (
-            torch.zeros(0).to(input.device, input.dtype),
-            torch.zeros(0).to(input.device, input.dtype),
-        )
-
-    B, T, D = input.shape
-    input = input.view(-1, D)
-
-    cross_high = T * B
-    high = T
-
-    assert high > 1
-
-    if num_negatives > 0:
-        tszs = _buffered_arange(T).unsqueeze(-1).expand(-1, num_negatives).flatten()
-
-        neg_idxs = torch.randint(low=0, high=high - 1, size=(B, num_negatives * T))
-        neg_idxs[neg_idxs >= tszs] += 1
-
-    if cross_sample_negatives > 0:
-        tszs = _buffered_arange(T).unsqueeze(-1).expand(-1, cross_sample_negatives).flatten()
-
-        cross_neg_idxs = torch.randint(low=0, high=cross_high - 1, size=(B, cross_sample_negatives * T))
-        cross_neg_idxs[cross_neg_idxs >= tszs] += 1
-
-    if num_negatives > 0:
-        neg_idxs = neg_idxs + (torch.arange(B).unsqueeze(1) * high)
-    else:
-        neg_idxs = cross_neg_idxs
-
-    if cross_sample_negatives > 0 and num_negatives > 0:
-        neg_idxs = torch.cat([neg_idxs, cross_neg_idxs], dim=1)
-
-    negs = input[neg_idxs.view(-1)]
-    negs = negs.view(B, T, num_negatives + cross_sample_negatives, D).permute(2, 0, 1, 3)  # NxBxCxT
-
-    return negs, neg_idxs
-
-
-class NegativeSampler(Module):
-    r"""Applies preprocessing to input and then computes negative sampling.
-
-    Args:
-        preprocessor (nn.Module): Transforms input tensor prior to negative sampling.
-        num_negatives (int): Number of negative examples to sample.
-        cross_sample_negatives (int): Number of negative examples to cross sample.
-    """
-
-    def __init__(
-        self,
-        preprocessor: Module,
-        num_negatives: int,
-        cross_sample_negatives: int,
-    ):
-        super().__init__()
-        self.preprocessor = preprocessor
-        self.num_negatives = num_negatives
-        self.cross_sample_negatives = cross_sample_negatives
-
-    def forward(self, input: Tensor) -> Tuple[Tensor, Tensor, Optional[Tensor]]:
-        """
-        Args:
-            input (Tensor): Tensor of dimension `(B, T, D)`.
-
-        Returns:
-            (Tensor, Tensor, Optional[Tensor]):
-            Tensor
-                The input tensor after preprocessing, prior to being sampled.
-            Tensor
-                The negative samples.
-            Tensor
-                The indices of the negative samples.
-        """
-        preprocessed = self.preprocessor(input)
-        negs, neg_idxs = _sample_negatives(preprocessed, self.num_negatives, self.cross_sample_negatives)
-        return preprocessed, negs, neg_idxs
-
-
-class FeatureEncoder(Module):
-    """Feature Encoder class, consisting of time reduction and linear layer.
-
-    Args:
-        stride (int): Number of frames to merge for the output frame.
-        input_dim (int): Input dimension of the tensor.
-        output_dim (int): Output dimension of the tensor.
-    """
-
-    def __init__(self, input_dim: int, output_dim: int, stride: int):
-        super().__init__()
-        self.time_reduction_layer = _TimeReduction(stride=stride)
-        self.linear_layer = nn.Linear(input_dim * stride, output_dim)
-
-    def forward(
-        self,
-        x: Tensor,
-        lengths: Optional[Tensor],
-    ) -> Tuple[Tensor, Optional[Tensor]]:
-        """
-        Args:
-            x (Tensor): Feature Tensor representing log Mel Spectrogram output. shape ``(B, T, D)``.
-            lengths (Tensor or None):
-                Valid length of each input sample. shape: ``(B, )``.
-
-        Returns:
-            (Tensor, Optional[Tensor]):
-                Tensor: output sequence after undergoing time reduction and linear projection.
-                    Shape ``(B, T // stride, D * stride)``.
-                Optional[Tensor]: output lengths of shape ``(B,)`` if lengths parameter is provided,
-                    otherwise `None`.
-        """
-        if lengths is None:
-            B, T, D = x.shape
-            dummy_lengths = torch.full((B,), T)
-            x, _ = self.time_reduction_layer(x, dummy_lengths)
-            x = self.linear_layer(x)
-            return x, None
-
-        x, lengths = self.time_reduction_layer(x, lengths)
-        x = self.linear_layer(x)
-        return x, lengths
-
-
-class ConformerEncoder(Module):
-    """Conformer Encoder class, consisting of feature projection and conformer modules.
-
-    Args:
-        feature_projection (nn.Module):
-            Projects feature to encoder dimension.
-        conformer (nn.ModuleList):
-            List of Conformer layers.
-    """
-
-    def __init__(
-        self,
-        feature_projection: Module,
-        conformer: ModuleList,
-    ):
-        super().__init__()
-        self.feature_projection = feature_projection
-        self.conformer = conformer
-
-    def _preprocess(
-        self,
-        features: Tensor,
-        lengths: Optional[Tensor] = None,
-    ) -> Tuple[Tensor, Optional[Tensor]]:
-        x = self.feature_projection(features)
-        if lengths is not None:
-            mask = components._get_padding_mask(x, lengths)
-        else:
-            mask = None
-        return x, mask
-
-    def _get_intermediate_outputs(
-        self,
-        x: Tensor,
-        mask: Optional[Tensor] = None,
-        num_layers: Optional[int] = None,
-    ) -> List[Tensor]:
-        if num_layers is not None:
-            if not 0 < num_layers <= len(self.conformer):
-                raise ValueError(f"`num_layers` must be between [1, {len(self.conformer)}]")
-
-        ret: List[Tensor] = []
-
-        x = x.transpose(0, 1)
-        for layer in self.conformer:
-            x = layer(x, mask)
-            ret.append(x.transpose(0, 1))
-            if num_layers is not None and len(ret) >= num_layers:
-                return ret
-        return ret
-
-    def forward(
-        self,
-        features: Tensor,
-        lengths: Optional[Tensor] = None,
-    ) -> Tensor:
-        """
-        Args:
-            features (Tensor): Tensor of features of shape ``(B, T, D)``.
-            lengths (Tensor or None, optional): Valid length of each input sample. shape: ``(B, )``.
-
-        Returns:
-            Tensor: result after applying conformer encoder to features.
-        """
-        x, mask = self._preprocess(features, lengths)
-        x = x.transpose(0, 1)
-        for layer in self.conformer:
-            x = layer(x, mask)
-        return x.transpose(0, 1)
-
-    def extract_features(
-        self,
-        features: Tensor,
-        lengths: Optional[Tensor] = None,
-        num_layers: Optional[int] = None,
-    ) -> List[Tensor]:
-        """Returns the list of outputs from the intermediate layers of conformer block in the encoder.
-
-        Args:
-            features (Tensor): Tensor of features of shape ``(B, T, D)``.
-            lengths (Tensor or None, optional): Valid length of each input sample. shape: ``(B, )``.
-
-        Returns:
-            List[Tensor]:
-                Features from requested layers. Each Tensor is of shape: `(batch, time frame, feature dimension)`.
-        """
-        x, masks = self._preprocess(features, lengths)
-        return self._get_intermediate_outputs(x, mask=masks, num_layers=num_layers)
-
-
-@dropping_class_support
-class ConformerWav2Vec2PretrainModel(Module):
-    """Conformer Wav2Vec2 pre-train model for training from scratch.
-
-    Note:
-        To build the model, please use one of the factory functions,
-        :py:func:`conformer_wav2vec2_base` or :py:func:`conformer_wav2vec2_large`
-
-    Args:
-        wav2vec2 (nn.Module):
-            Conformer based Wav2Vec2 model, including feature extractor and conformer encoder components.
-        mask_generator (nn.Module):
-            Mask generator that generates the mask for masked prediction during training.
-        negative_sampler (nn.Module):
-            Negative sampler to apply after masking.
-
-    """
-
-    def __init__(
-        self,
-        wav2vec2: Wav2Vec2Model,
-        mask_generator: Module,
-        negative_sampler: Module,
-    ):
-        super().__init__()
-        self.wav2vec2 = wav2vec2
-        self.mask_generator = mask_generator
-        self.negative_sampler = negative_sampler
-
-    def forward(
-        self,
-        features: Tensor,
-        audio_lengths: Optional[Tensor] = None,
-    ) -> Tuple[Tensor, Optional[Tensor], Tensor, Tensor]:
-        """
-        Args:
-            features (Tensor):
-                Tensor of audio features of shape `(batch, frame, dim)`.
-            audio_lengths (Tensor or None, optional):
-                Tensor of valid length of each valid audio in the batch.
-                shape: `(batch, )` (Default: ``None``)
-
-        Returns:
-            (Tensor, Optional[Tensor], Tensor, Tensor, Tensor, Tensor):
-            Tensor
-                The masked sequences of probability distribution of shape `(batch, frame, dim)`.
-            Tensor or None
-                If ``lengths`` argument was provided, a Tensor of shape `(batch, )` representing
-                valid length in time axis is returned.
-            Tensor
-                The mask indices.
-            Tensor
-                The targets, prior to negative sampling.
-            Tensor
-                The negative samples.
-            Tensor
-                The indices of the negative samples.
-        """
-        x, lengths = self.wav2vec2.feature_extractor(features, audio_lengths)
-
-        if lengths is not None:
-            padding_mask = components._get_padding_mask(x, lengths)
-        else:
-            padding_mask = None
-
-        x = self.wav2vec2.encoder.feature_projection.layer_norm(x)
-        x = self.wav2vec2.encoder.feature_projection.dropout(x)
-
-        # Unmasked feature is used to generate positive and negative samples.
-        unmasked_x = x.clone()
-        # Apply masking to x before passing it to Conformer layers.
-        x, mask_idxs = self.mask_generator(x, padding_mask)
-        # Select the frames from masked indices for negative sampling.
-        unmasked_x = unmasked_x[mask_idxs].view(x.shape[0], -1, x.shape[-1])
-        targets, negs, neg_idxs = self.negative_sampler(unmasked_x)
-
-        x = self.wav2vec2.encoder.feature_projection.projection(x)
-        x = x.transpose(0, 1)
-        for conformer_layer in self.wav2vec2.encoder.conformer:
-            x = conformer_layer(x, padding_mask)
-        x = x.transpose(0, 1)
-
-        return x, lengths, mask_idxs, targets, negs, neg_idxs
-
-
-################################################################################
-def _get_conformer_feature_extractor(
-    input_dim: int,
-    output_dim: int,
-    stride: int,
-) -> FeatureEncoder:
-    """Construct Feature Extractor
-
-    Args:
-        input_dim (int): Input dimension of features.
-        output_dim (int): Output dimension after feature extraction.
-        stride (int): Stride used in Time Reduction layer of feature extractor.
-
-    Returns:
-        FeatureEncoder: The resulting feature extractor.
-    """
-    return FeatureEncoder(input_dim, output_dim, stride)
-
-
-def _get_conformer_encoder(
-    in_features: int,
-    embed_dim: int,
-    dropout_input: float,
-    num_layers: int,
-    num_heads: int,
-    ff_interm_features: int,
-    dropout: float,
-    depthwise_conv_kernel_size: Union[int, List[int]],
-    convolution_first: bool,
-    use_group_norm: bool,
-) -> ConformerEncoder:
-    """Construct Conformer Encoder
-
-    Args:
-        in_features (int): The number of input features.
-        embed_dim (int): The dimension of the embedding in the feature projection.
-        dropout_input (float): The dropout probability applied after the input feature
-            is projected to ``embed_dim``.
-        num_layers (int): Number of Conformer layers in the encoder.
-        num_heads (int): Number of heads in each Conformer layer.
-        ff_interm_features (int): Hidden layer dimension of the feedforward network in
-            each Conformer layer.
-        dropout (float): Dropout probability in each Conformer layer.
-        depthwise_conv_kernel_size (int or List[int]): List of kernel sizes corresponding
-            to each of the Conformer layers. If int is provided, all layers will have the
-            same kernel size.
-        convolution_first (bool): Whether to apply the convolution module ahead of the
-            attention module in each Conformer layer.
-        use_group_norm (bool): Whether to use ``GroupNorm`` rather than ``BatchNorm1d`` in
-            the convolution module in each Conformer layer.
-
-    Returns:
-        ConformerEncoder:
-            The resulting conformer encoder module.
-    """
-    feature_projection = components.FeatureProjection(in_features, embed_dim, dropout_input)
-
-    if type(depthwise_conv_kernel_size) == int:
-        depthwise_conv_kernel_size = [depthwise_conv_kernel_size] * num_layers
-
-    assert len(depthwise_conv_kernel_size) == num_layers
-
-    conformer_layers = []
-    for l in range(num_layers):
-        layer = ConformerLayer(
-            input_dim=embed_dim,
-            ffn_dim=ff_interm_features,
-            num_attention_heads=num_heads,
-            depthwise_conv_kernel_size=depthwise_conv_kernel_size[l],
-            dropout=dropout,
-            use_group_norm=use_group_norm,
-            convolution_first=convolution_first,
-        )
-        conformer_layers.append(layer)
-
-    return ConformerEncoder(feature_projection, ModuleList(conformer_layers))
-
-
-def _get_conformer_negativer_sampler(
-    input_dim: int,
-    output_dim: int,
-    num_negatives: int,
-    cross_sample_negatives: int,
-) -> NegativeSampler:
-    """Build custom NegativeSampler module, including linear layer and negative sampling.
-
-    Args:
-        input_dim (int): Dimension of input after feature extraction.
-        output_dim (int): Dimension of embedding for use in negative sampling. Same as the
-            embedding in the feature projection.
-        num_negatives (int): Number of negatives to sample.
-        cross_sample_negatives (int): Number of cross sampled negatives.
-
-    Returns:
-        NegativeSampler:
-            The resulting negative sampler module.
-    """
-    preprocessor = nn.Linear(input_dim, output_dim)
-    return NegativeSampler(preprocessor, num_negatives, cross_sample_negatives)
-
-
-@dropping_support
-def conformer_wav2vec2_model(
-    extractor_input_dim: int,
-    extractor_output_dim: int,
-    extractor_stride: int,
-    encoder_embed_dim: int,
-    encoder_projection_dropout: float,
-    encoder_num_layers: int,
-    encoder_num_heads: int,
-    encoder_ff_interm_features: int,
-    encoder_depthwise_conv_kernel_size: Union[int, List[int]],
-    encoder_dropout: float,
-    encoder_convolution_first: bool,
-    encoder_use_group_norm: bool,
-) -> Wav2Vec2Model:
-    """Build a custom Conformer Wav2Vec2Model
-
-    Args:
-        extractor_input_dim (int): Input dimension of the features.
-        extractor_output_dim (int): Output dimension after feature extraction.
-        extractor_stride (int): Stride used in time reduction layer of feature extraction.
-        encoder_embed_dim (int): The dimension of the embedding in the feature projection.
-        encoder_projection_dropout (float):
-            The dropout probability applied after the input feature is projected to ``embed_dim``
-        encoder_num_layers (int): Number of Conformer layers in the encoder.
-        encoder_num_heads (int): Number of heads in each Conformer layer.
-        encoder_ff_interm_features (int):
-            Hidden layer dimension of the feedforward network in each Conformer layer.
-        encoder_depthwise_conv_kernel_size (int or List[int]):
-            List of kernel sizes corresponding to each of the Conformer layers.
-            If int is provided, all layers will have the same kernel size.
-        encoder_dropout (float): Dropout probability in each Conformer layer.
-        encoder_convolution_first (bool):
-            Whether to apply the convolution module ahead of the attention module
-            in each Conformer layer.
-        encoder_use_group_norm (bool):
-            Whether to use ``GroupNorm`` rather than ``BatchNorm1d`` in the convolution
-            module in each Conformer layer.
-
-    Returns:
-        Wav2Vec2Model:
-            The resulting wav2vec2 model with a conformer encoder.
-    """
-    feature_extractor = _get_conformer_feature_extractor(
-        extractor_input_dim,
-        extractor_output_dim,
-        extractor_stride,
-    )
-
-    encoder = _get_conformer_encoder(
-        in_features=extractor_output_dim,
-        embed_dim=encoder_embed_dim,
-        dropout_input=encoder_projection_dropout,
-        num_layers=encoder_num_layers,
-        num_heads=encoder_num_heads,
-        ff_interm_features=encoder_ff_interm_features,
-        depthwise_conv_kernel_size=encoder_depthwise_conv_kernel_size,
-        dropout=encoder_dropout,
-        convolution_first=encoder_convolution_first,
-        use_group_norm=encoder_use_group_norm,
-    )
-
-    return Wav2Vec2Model(feature_extractor, encoder)
-
-
-@dropping_support
-def conformer_wav2vec2_base(
-    extractor_input_dim: int = 64,
-    extractor_output_dim: int = 256,
-    encoder_projection_dropout: float = 0.0,
-) -> Wav2Vec2Model:
-    """
-    Build Conformer Wav2Vec2 Model with "small" architecture from
-    *Conformer-Based Self-Supervised Learning for Non-Speech Audio Tasks* :cite:`9746490`
-
-    Args:
-        extractor_input_dim (int, optional): Input dimension of feature extractor. (Default: 64)
-        extractor_output_dim (int, optional): Output dimension of feature extractor. (Default: 256)
-        encoder_projection_dropout (float, optional):
-            Dropout probability applied after feature projection. (Default: 0.0)
-
-    Returns:
-        Wav2Vec2Model:
-            The resulting wav2vec2 model with a conformer encoder and ``base`` configuration.
-    """
-    return conformer_wav2vec2_model(
-        extractor_input_dim=extractor_input_dim,
-        extractor_output_dim=extractor_output_dim,
-        extractor_stride=4,
-        encoder_embed_dim=256,
-        encoder_projection_dropout=encoder_projection_dropout,
-        encoder_num_layers=12,
-        encoder_num_heads=8,
-        encoder_ff_interm_features=1024,
-        encoder_depthwise_conv_kernel_size=[31] + [15] * 11,
-        encoder_dropout=0.1,
-        encoder_convolution_first=True,
-        encoder_use_group_norm=True,
-    )
-
-
-@dropping_support
-def conformer_wav2vec2_pretrain_model(
-    extractor_input_dim: int,
-    extractor_output_dim: int,
-    extractor_stride: int,
-    encoder_embed_dim: int,
-    encoder_projection_dropout: float,
-    encoder_num_layers: int,
-    encoder_num_heads: int,
-    encoder_ff_interm_features: int,
-    encoder_depthwise_conv_kernel_size: int,
-    encoder_dropout: float,
-    encoder_convolution_first: bool,
-    encoder_use_group_norm: bool,
-    mask_prob: float,
-    mask_selection: str,
-    mask_other: float,
-    mask_length: int,
-    no_mask_overlap: bool,
-    mask_min_space: int,
-    mask_channel_prob: float,
-    mask_channel_selection: str,
-    mask_channel_other: float,
-    mask_channel_length: int,
-    no_mask_channel_overlap: bool,
-    mask_channel_min_space: int,
-    num_negatives: int,
-    cross_sample_negatives: int,
-) -> ConformerWav2Vec2PretrainModel:
-    """Build a custom Conformer Wav2Vec2 Model for pre-training
-
-    Args:
-        extractor_input_dim (int): Input dimension of the features.
-        extractor_output_dim (int): Output dimension after feature extraction.
-        extractor_stride (int):
-            Stride used in time reduction layer of feature extraction.
-        encoder_embed_dim (int):
-            The dimension of the embedding in the feature projection.
-        encoder_projection_dropout (float):
-            The dropout probability applied after the input feature is projected to
-            ``embed_dim``
-        encoder_num_layers (int):
-            Number of Conformer layers in the encoder.
-        encoder_num_heads (int):
-            Number of heads in each Conformer layer.
-        encoder_ff_interm_features (int):
-            Hidden layer dimension of the feedforward network in each Conformer layer.
-        encoder_depthwise_conv_kernel_size (int or List[int]):
-            List of kernel sizes corresponding to each of the Conformer layers.
-            If int is provided, all layers will have the same kernel size.
-        encoder_dropout (float):
-            Dropout probability in each Conformer layer.
-        encoder_convolution_first (bool):
-            Whether to apply the convolution module ahead of the attention module
-            in each Conformer layer.
-        encoder_use_group_norm (bool):
-            Whether to use ``GroupNorm`` rather than ``BatchNorm1d`` in the convolution
-            module in each Conformer layer.
-        mask_prob (float):
-            Probability for each token to be chosen as start of the span to be masked.
-        mask_selection (str):
-            How to choose the mask length. Options: [``static``, ``uniform``, ``normal``, ``poisson``].
-        mask_other (float):
-            Secondary mask argument (used for more complex distributions).
-        mask_length (int):
-            The lengths of the mask.
-        no_mask_overlap (bool):
-            Whether to allow masks to overlap.
-        mask_min_space (int):
-            Minimum space between spans (if no overlap is enabled).
-        mask_channel_prob (float):
-            The probability of replacing a feature with 0.
-        mask_channel_selection (str):
-            How to choose the mask length for channel masking.
-            Options: [``static``, ``uniform``, ``normal``, ``poisson``].
-        mask_channel_other (float):
-            Secondary mask argument for channel masking (used for more complex distributions).
-        mask_channel_length (int):
-            Minimum space between spans (if no overlap is enabled) for channel masking.
-        no_mask_channel_overlap (bool):
-            Whether to allow channel masks to overlap.
-        mask_channel_min_space (int):
-            Minimum space between spans for channel masking (if no overlap is enabled).
-        num_negatives (int):
-            Number of negatives to sample.
-        cross_sample_negatives (int):
-            Number of cross sampled negatives.
-
-    Returns:
-        ConformerWav2Vec2PretrainModel:
-            The resulting model.
-    """
-    wav2vec2 = conformer_wav2vec2_model(
-        extractor_input_dim,
-        extractor_output_dim,
-        extractor_stride,
-        encoder_embed_dim,
-        encoder_projection_dropout,
-        encoder_num_layers,
-        encoder_num_heads,
-        encoder_ff_interm_features,
-        encoder_depthwise_conv_kernel_size,
-        encoder_dropout,
-        encoder_convolution_first,
-        encoder_use_group_norm,
-    )
-
-    mask_generator = components.MaskGenerator(
-        extractor_output_dim,
-        mask_prob,
-        mask_selection,
-        mask_other,
-        mask_length,
-        no_mask_overlap,
-        mask_min_space,
-        mask_channel_prob,
-        mask_channel_selection,
-        mask_channel_other,
-        mask_channel_length,
-        no_mask_channel_overlap,
-        mask_channel_min_space,
-    )
-
-    negative_sampler = _get_conformer_negativer_sampler(
-        extractor_output_dim,
-        encoder_embed_dim,
-        num_negatives,
-        cross_sample_negatives,
-    )
-
-    return ConformerWav2Vec2PretrainModel(
-        wav2vec2=wav2vec2,
-        mask_generator=mask_generator,
-        negative_sampler=negative_sampler,
-    )
-
-
-@dropping_support
-def conformer_wav2vec2_pretrain_base(
-    extractor_input_dim: int = 64,
-    extractor_output_dim: int = 256,
-    encoder_projection_dropout: float = 0.0,
-    mask_prob: float = 0.3,
-    mask_length: int = 3,
-    num_negatives: int = 100,
-    cross_sample_negatives: int = 0,
-) -> ConformerWav2Vec2PretrainModel:
-    """Build Conformer Wav2Vec2 Model for pre-training with "small" architecture from
-    *Conformer-Based Self-Supervised Learning for Non-Speech Audio Tasks* :cite:`9746490`
-
-    Args:
-        extractor_input_dim (int, optional): Input dimension of the features. (Default: 64)
-        extractor_output_dim (int, optional): Output dimension after feature extraction. (Default: 256)
-        encoder_projection_dropout (float, optional):
-            The dropout probability applied after the input feature is projected to
-            ``embed_dim``. (Default: 0.0)
-        mask_prob (float, optional):
-            Probability for each token to be chosen as start of the span to be masked. (Default: 0.3)
-        mask_length (int, optional):
-            The lengths of the mask. (Default: 3)
-        num_negatives (int, optional):
-            Number of sampled negatives. (Default: 100)
-        cross_sample_negatives (int, optional):
-            Number of cross sampled negatives. (Default: 0)
-
-    Returns:
-        ConformerWav2Vec2PretrainModel:
-            The resulting model.
-    """
-    return conformer_wav2vec2_pretrain_model(
-        extractor_input_dim=extractor_input_dim,
-        extractor_output_dim=extractor_output_dim,
-        extractor_stride=4,
-        encoder_embed_dim=256,
-        encoder_projection_dropout=encoder_projection_dropout,
-        encoder_num_layers=12,
-        encoder_num_heads=8,
-        encoder_ff_interm_features=1024,
-        encoder_depthwise_conv_kernel_size=[31] + [15] * 11,
-        encoder_dropout=0.1,
-        encoder_convolution_first=True,
-        encoder_use_group_norm=True,
-        mask_prob=mask_prob,
-        mask_selection="static",
-        mask_other=0.0,
-        mask_length=mask_length,
-        no_mask_overlap=False,
-        mask_min_space=0,
-        mask_channel_prob=0,
-        mask_channel_selection="static",
-        mask_channel_other=0,
-        mask_channel_length=10,
-        no_mask_channel_overlap=False,
-        mask_channel_min_space=1,
-        num_negatives=num_negatives,
-        cross_sample_negatives=cross_sample_negatives,
-    )
-
-
-@dropping_support
-def conformer_wav2vec2_pretrain_large(
-    extractor_input_dim: int = 64,
-    extractor_output_dim: int = 256,
-    encoder_projection_dropout: float = 0.0,
-    mask_prob: float = 0.3,
-    mask_length: int = 3,
-    num_negatives: int = 100,
-    cross_sample_negatives: int = 0,
-) -> ConformerWav2Vec2PretrainModel:
-    """Build Conformer Wav2Vec2 Model for pre-training with "large" architecture from
-    *Conformer-Based Self-Supervised Learning for Non-Speech Audio Tasks* :cite:`9746490`
-
-    Args:
-        extractor_input_dim (int, optional): Input dimension of the features. (Default: 64)
-        extractor_output_dim (int, optional): Output dimension after feature extraction. (Default: 256)
-        encoder_projection_dropout (float, optional):
-            The dropout probability applied after the input feature is projected to
-            ``embed_dim``. (Default: 0.0)
-        mask_prob (float, optional):
-            Probability for each token to be chosen as start of the span to be masked. (Default: 0.3)
-        mask_length (int, optional):
-            The lengths of the mask. (Default: 3)
-        num_negatives (int, optional):
-            Number of sampled negatives. (Default: 100)
-        cross_sample_negatives (int, optional):
-            Number of cross sampled negatives. (Default: 0)
-
-    Returns:
-        ConformerWav2Vec2PretrainModel:
-            The resulting model.
-    """
-    return conformer_wav2vec2_pretrain_model(
-        extractor_input_dim=extractor_input_dim,
-        extractor_output_dim=extractor_output_dim,
-        extractor_stride=4,
-        encoder_embed_dim=768,
-        encoder_projection_dropout=encoder_projection_dropout,
-        encoder_num_layers=12,
-        encoder_num_heads=12,
-        encoder_ff_interm_features=1024,
-        encoder_depthwise_conv_kernel_size=[31] + [15] * 11,
-        encoder_dropout=0.1,
-        encoder_convolution_first=True,
-        encoder_use_group_norm=True,
-        mask_prob=mask_prob,
-        mask_selection="static",
-        mask_other=0.0,
-        mask_length=mask_length,
-        no_mask_overlap=False,
-        mask_min_space=0,
-        mask_channel_prob=0,
-        mask_channel_selection="static",
-        mask_channel_other=0,
-        mask_channel_length=10,
-        no_mask_channel_overlap=False,
-        mask_channel_min_space=1,
-        num_negatives=num_negatives,
-        cross_sample_negatives=cross_sample_negatives,
-    )
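
For reference, a minimal usage sketch of the removed factory, assuming torchaudio 2.8.0 (the last release that ships it); shapes follow the docstrings recorded above, and the call emits a deprecation warning in 2.8.0 via the @dropping_support decorator:

import torch
from torchaudio.prototype.models import conformer_wav2vec2_base

model = conformer_wav2vec2_base()   # defaults above: input dim 64, stride 4, embed dim 256
model.eval()

features = torch.randn(2, 400, 64)  # (batch, frames, extractor_input_dim) log-mel features
lengths = torch.tensor([400, 320])  # valid frames per sample

with torch.no_grad():
    output, out_lengths = model(features, lengths)

print(output.shape)   # torch.Size([2, 100, 256]): time axis reduced by the stride of 4
print(out_lengths)    # tensor([100, 80])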