torchaudio-2.9.1-cp311-cp311-manylinux_2_28_aarch64.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- torchaudio/__init__.py +204 -0
- torchaudio/_extension/__init__.py +61 -0
- torchaudio/_extension/utils.py +133 -0
- torchaudio/_internal/__init__.py +10 -0
- torchaudio/_internal/module_utils.py +171 -0
- torchaudio/_torchcodec.py +340 -0
- torchaudio/compliance/__init__.py +5 -0
- torchaudio/compliance/kaldi.py +813 -0
- torchaudio/datasets/__init__.py +47 -0
- torchaudio/datasets/cmuarctic.py +157 -0
- torchaudio/datasets/cmudict.py +186 -0
- torchaudio/datasets/commonvoice.py +86 -0
- torchaudio/datasets/dr_vctk.py +121 -0
- torchaudio/datasets/fluentcommands.py +108 -0
- torchaudio/datasets/gtzan.py +1118 -0
- torchaudio/datasets/iemocap.py +147 -0
- torchaudio/datasets/librilight_limited.py +111 -0
- torchaudio/datasets/librimix.py +133 -0
- torchaudio/datasets/librispeech.py +174 -0
- torchaudio/datasets/librispeech_biasing.py +189 -0
- torchaudio/datasets/libritts.py +168 -0
- torchaudio/datasets/ljspeech.py +107 -0
- torchaudio/datasets/musdb_hq.py +139 -0
- torchaudio/datasets/quesst14.py +136 -0
- torchaudio/datasets/snips.py +157 -0
- torchaudio/datasets/speechcommands.py +183 -0
- torchaudio/datasets/tedlium.py +218 -0
- torchaudio/datasets/utils.py +54 -0
- torchaudio/datasets/vctk.py +143 -0
- torchaudio/datasets/voxceleb1.py +309 -0
- torchaudio/datasets/yesno.py +89 -0
- torchaudio/functional/__init__.py +130 -0
- torchaudio/functional/_alignment.py +128 -0
- torchaudio/functional/filtering.py +1685 -0
- torchaudio/functional/functional.py +2505 -0
- torchaudio/lib/__init__.py +0 -0
- torchaudio/lib/_torchaudio.so +0 -0
- torchaudio/lib/libtorchaudio.so +0 -0
- torchaudio/models/__init__.py +85 -0
- torchaudio/models/_hdemucs.py +1008 -0
- torchaudio/models/conformer.py +293 -0
- torchaudio/models/conv_tasnet.py +330 -0
- torchaudio/models/decoder/__init__.py +64 -0
- torchaudio/models/decoder/_ctc_decoder.py +568 -0
- torchaudio/models/decoder/_cuda_ctc_decoder.py +187 -0
- torchaudio/models/deepspeech.py +84 -0
- torchaudio/models/emformer.py +884 -0
- torchaudio/models/rnnt.py +816 -0
- torchaudio/models/rnnt_decoder.py +339 -0
- torchaudio/models/squim/__init__.py +11 -0
- torchaudio/models/squim/objective.py +326 -0
- torchaudio/models/squim/subjective.py +150 -0
- torchaudio/models/tacotron2.py +1046 -0
- torchaudio/models/wav2letter.py +72 -0
- torchaudio/models/wav2vec2/__init__.py +45 -0
- torchaudio/models/wav2vec2/components.py +1167 -0
- torchaudio/models/wav2vec2/model.py +1579 -0
- torchaudio/models/wav2vec2/utils/__init__.py +7 -0
- torchaudio/models/wav2vec2/utils/import_fairseq.py +213 -0
- torchaudio/models/wav2vec2/utils/import_huggingface.py +134 -0
- torchaudio/models/wav2vec2/wavlm_attention.py +214 -0
- torchaudio/models/wavernn.py +409 -0
- torchaudio/pipelines/__init__.py +102 -0
- torchaudio/pipelines/_source_separation_pipeline.py +109 -0
- torchaudio/pipelines/_squim_pipeline.py +156 -0
- torchaudio/pipelines/_tts/__init__.py +16 -0
- torchaudio/pipelines/_tts/impl.py +385 -0
- torchaudio/pipelines/_tts/interface.py +255 -0
- torchaudio/pipelines/_tts/utils.py +230 -0
- torchaudio/pipelines/_wav2vec2/__init__.py +0 -0
- torchaudio/pipelines/_wav2vec2/aligner.py +87 -0
- torchaudio/pipelines/_wav2vec2/impl.py +1699 -0
- torchaudio/pipelines/_wav2vec2/utils.py +346 -0
- torchaudio/pipelines/rnnt_pipeline.py +380 -0
- torchaudio/transforms/__init__.py +78 -0
- torchaudio/transforms/_multi_channel.py +467 -0
- torchaudio/transforms/_transforms.py +2138 -0
- torchaudio/utils/__init__.py +4 -0
- torchaudio/utils/download.py +89 -0
- torchaudio/version.py +2 -0
- torchaudio-2.9.1.dist-info/METADATA +133 -0
- torchaudio-2.9.1.dist-info/RECORD +85 -0
- torchaudio-2.9.1.dist-info/WHEEL +5 -0
- torchaudio-2.9.1.dist-info/licenses/LICENSE +25 -0
- torchaudio-2.9.1.dist-info/top_level.txt +1 -0
torchaudio/models/wav2letter.py

@@ -0,0 +1,72 @@
+from torch import nn, Tensor
+
+__all__ = [
+    "Wav2Letter",
+]
+
+
+class Wav2Letter(nn.Module):
+    r"""Wav2Letter model architecture from *Wav2Letter: an End-to-End ConvNet-based Speech
+    Recognition System* :cite:`collobert2016wav2letter`.
+
+    See Also:
+        * `Training example <https://github.com/pytorch/audio/tree/release/0.12/examples/pipeline_wav2letter>`__
+
+    Args:
+        num_classes (int, optional): Number of classes to be classified. (Default: ``40``)
+        input_type (str, optional): Wav2Letter can use as input: ``waveform``, ``power_spectrum``
+            or ``mfcc`` (Default: ``waveform``).
+        num_features (int, optional): Number of input features that the network will receive (Default: ``1``).
+    """
+
+    def __init__(self, num_classes: int = 40, input_type: str = "waveform", num_features: int = 1) -> None:
+        super().__init__()
+
+        acoustic_num_features = 250 if input_type == "waveform" else num_features
+        acoustic_model = nn.Sequential(
+            nn.Conv1d(in_channels=acoustic_num_features, out_channels=250, kernel_size=48, stride=2, padding=23),
+            nn.ReLU(inplace=True),
+            nn.Conv1d(in_channels=250, out_channels=250, kernel_size=7, stride=1, padding=3),
+            nn.ReLU(inplace=True),
+            nn.Conv1d(in_channels=250, out_channels=250, kernel_size=7, stride=1, padding=3),
+            nn.ReLU(inplace=True),
+            nn.Conv1d(in_channels=250, out_channels=250, kernel_size=7, stride=1, padding=3),
+            nn.ReLU(inplace=True),
+            nn.Conv1d(in_channels=250, out_channels=250, kernel_size=7, stride=1, padding=3),
+            nn.ReLU(inplace=True),
+            nn.Conv1d(in_channels=250, out_channels=250, kernel_size=7, stride=1, padding=3),
+            nn.ReLU(inplace=True),
+            nn.Conv1d(in_channels=250, out_channels=250, kernel_size=7, stride=1, padding=3),
+            nn.ReLU(inplace=True),
+            nn.Conv1d(in_channels=250, out_channels=250, kernel_size=7, stride=1, padding=3),
+            nn.ReLU(inplace=True),
+            nn.Conv1d(in_channels=250, out_channels=2000, kernel_size=32, stride=1, padding=16),
+            nn.ReLU(inplace=True),
+            nn.Conv1d(in_channels=2000, out_channels=2000, kernel_size=1, stride=1, padding=0),
+            nn.ReLU(inplace=True),
+            nn.Conv1d(in_channels=2000, out_channels=num_classes, kernel_size=1, stride=1, padding=0),
+            nn.ReLU(inplace=True),
+        )
+
+        if input_type == "waveform":
+            waveform_model = nn.Sequential(
+                nn.Conv1d(in_channels=num_features, out_channels=250, kernel_size=250, stride=160, padding=45),
+                nn.ReLU(inplace=True),
+            )
+            self.acoustic_model = nn.Sequential(waveform_model, acoustic_model)
+
+        if input_type in ["power_spectrum", "mfcc"]:
+            self.acoustic_model = acoustic_model
+
+    def forward(self, x: Tensor) -> Tensor:
+        r"""
+        Args:
+            x (torch.Tensor): Tensor of dimension (batch_size, num_features, input_length).
+
+        Returns:
+            Tensor: Predictor tensor of dimension (batch_size, number_of_classes, input_length).
+        """
+
+        x = self.acoustic_model(x)
+        x = nn.functional.log_softmax(x, dim=1)
+        return x
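For orientation, the Wav2Letter class added above can be smoke-tested as follows. This sketch is not part of the diff; the one-second 16 kHz input is an assumed example shape, chosen to match the docstring's (batch_size, num_features, input_length) layout.

    import torch

    from torchaudio.models import Wav2Letter

    # Randomly initialized model in its default "waveform" configuration.
    model = Wav2Letter(num_classes=40, input_type="waveform", num_features=1)
    waveform = torch.randn(1, 1, 16000)  # assumed: one second of 16 kHz audio
    log_probs = model(waveform)          # (batch_size, num_classes, reduced_length)
    print(log_probs.shape)

The strided front-end convolution (kernel_size=250, stride=160) downsamples the raw waveform before the deeper 7-tap convolution stack, which is why the output length is much shorter than the input length.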
torchaudio/models/wav2vec2/__init__.py

@@ -0,0 +1,45 @@
+from . import utils
+from .model import (
+    hubert_base,
+    hubert_large,
+    hubert_pretrain_base,
+    hubert_pretrain_large,
+    hubert_pretrain_model,
+    hubert_pretrain_xlarge,
+    hubert_xlarge,
+    HuBERTPretrainModel,
+    wav2vec2_base,
+    wav2vec2_large,
+    wav2vec2_large_lv60k,
+    wav2vec2_model,
+    wav2vec2_xlsr_1b,
+    wav2vec2_xlsr_2b,
+    wav2vec2_xlsr_300m,
+    Wav2Vec2Model,
+    wavlm_base,
+    wavlm_large,
+    wavlm_model,
+)
+
+__all__ = [
+    "Wav2Vec2Model",
+    "HuBERTPretrainModel",
+    "wavlm_model",
+    "wavlm_base",
+    "wavlm_large",
+    "wav2vec2_model",
+    "wav2vec2_base",
+    "wav2vec2_large",
+    "wav2vec2_large_lv60k",
+    "hubert_base",
+    "hubert_large",
+    "hubert_xlarge",
+    "hubert_pretrain_model",
+    "hubert_pretrain_base",
+    "hubert_pretrain_large",
+    "hubert_pretrain_xlarge",
+    "utils",
+    "wav2vec2_xlsr_300m",
+    "wav2vec2_xlsr_1b",
+    "wav2vec2_xlsr_2b",
+]