torchaudio-2.9.1-cp310-cp310-macosx_11_0_arm64.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- torchaudio/.dylibs/libc++.1.0.dylib +0 -0
- torchaudio/__init__.py +204 -0
- torchaudio/_extension/__init__.py +61 -0
- torchaudio/_extension/utils.py +133 -0
- torchaudio/_internal/__init__.py +10 -0
- torchaudio/_internal/module_utils.py +171 -0
- torchaudio/_torchcodec.py +340 -0
- torchaudio/compliance/__init__.py +5 -0
- torchaudio/compliance/kaldi.py +813 -0
- torchaudio/datasets/__init__.py +47 -0
- torchaudio/datasets/cmuarctic.py +157 -0
- torchaudio/datasets/cmudict.py +186 -0
- torchaudio/datasets/commonvoice.py +86 -0
- torchaudio/datasets/dr_vctk.py +121 -0
- torchaudio/datasets/fluentcommands.py +108 -0
- torchaudio/datasets/gtzan.py +1118 -0
- torchaudio/datasets/iemocap.py +147 -0
- torchaudio/datasets/librilight_limited.py +111 -0
- torchaudio/datasets/librimix.py +133 -0
- torchaudio/datasets/librispeech.py +174 -0
- torchaudio/datasets/librispeech_biasing.py +189 -0
- torchaudio/datasets/libritts.py +168 -0
- torchaudio/datasets/ljspeech.py +107 -0
- torchaudio/datasets/musdb_hq.py +139 -0
- torchaudio/datasets/quesst14.py +136 -0
- torchaudio/datasets/snips.py +157 -0
- torchaudio/datasets/speechcommands.py +183 -0
- torchaudio/datasets/tedlium.py +218 -0
- torchaudio/datasets/utils.py +54 -0
- torchaudio/datasets/vctk.py +143 -0
- torchaudio/datasets/voxceleb1.py +309 -0
- torchaudio/datasets/yesno.py +89 -0
- torchaudio/functional/__init__.py +130 -0
- torchaudio/functional/_alignment.py +128 -0
- torchaudio/functional/filtering.py +1685 -0
- torchaudio/functional/functional.py +2505 -0
- torchaudio/lib/__init__.py +0 -0
- torchaudio/lib/_torchaudio.so +0 -0
- torchaudio/lib/libtorchaudio.so +0 -0
- torchaudio/models/__init__.py +85 -0
- torchaudio/models/_hdemucs.py +1008 -0
- torchaudio/models/conformer.py +293 -0
- torchaudio/models/conv_tasnet.py +330 -0
- torchaudio/models/decoder/__init__.py +64 -0
- torchaudio/models/decoder/_ctc_decoder.py +568 -0
- torchaudio/models/decoder/_cuda_ctc_decoder.py +187 -0
- torchaudio/models/deepspeech.py +84 -0
- torchaudio/models/emformer.py +884 -0
- torchaudio/models/rnnt.py +816 -0
- torchaudio/models/rnnt_decoder.py +339 -0
- torchaudio/models/squim/__init__.py +11 -0
- torchaudio/models/squim/objective.py +326 -0
- torchaudio/models/squim/subjective.py +150 -0
- torchaudio/models/tacotron2.py +1046 -0
- torchaudio/models/wav2letter.py +72 -0
- torchaudio/models/wav2vec2/__init__.py +45 -0
- torchaudio/models/wav2vec2/components.py +1167 -0
- torchaudio/models/wav2vec2/model.py +1579 -0
- torchaudio/models/wav2vec2/utils/__init__.py +7 -0
- torchaudio/models/wav2vec2/utils/import_fairseq.py +213 -0
- torchaudio/models/wav2vec2/utils/import_huggingface.py +134 -0
- torchaudio/models/wav2vec2/wavlm_attention.py +214 -0
- torchaudio/models/wavernn.py +409 -0
- torchaudio/pipelines/__init__.py +102 -0
- torchaudio/pipelines/_source_separation_pipeline.py +109 -0
- torchaudio/pipelines/_squim_pipeline.py +156 -0
- torchaudio/pipelines/_tts/__init__.py +16 -0
- torchaudio/pipelines/_tts/impl.py +385 -0
- torchaudio/pipelines/_tts/interface.py +255 -0
- torchaudio/pipelines/_tts/utils.py +230 -0
- torchaudio/pipelines/_wav2vec2/__init__.py +0 -0
- torchaudio/pipelines/_wav2vec2/aligner.py +87 -0
- torchaudio/pipelines/_wav2vec2/impl.py +1699 -0
- torchaudio/pipelines/_wav2vec2/utils.py +346 -0
- torchaudio/pipelines/rnnt_pipeline.py +380 -0
- torchaudio/transforms/__init__.py +78 -0
- torchaudio/transforms/_multi_channel.py +467 -0
- torchaudio/transforms/_transforms.py +2138 -0
- torchaudio/utils/__init__.py +4 -0
- torchaudio/utils/download.py +89 -0
- torchaudio/version.py +2 -0
- torchaudio-2.9.1.dist-info/METADATA +133 -0
- torchaudio-2.9.1.dist-info/RECORD +86 -0
- torchaudio-2.9.1.dist-info/WHEEL +5 -0
- torchaudio-2.9.1.dist-info/licenses/LICENSE +25 -0
- torchaudio-2.9.1.dist-info/top_level.txt +1 -0
torchaudio/models/decoder/_cuda_ctc_decoder.py
@@ -0,0 +1,187 @@
from __future__ import annotations

import math

from typing import List, NamedTuple, Union

import torch
import torchaudio

torchaudio._extension._load_lib("libctc_prefix_decoder")
import torchaudio.lib.pybind11_prefixctc as cuctc


__all__ = ["CUCTCHypothesis", "CUCTCDecoder", "cuda_ctc_decoder"]


def _get_vocab_list(vocab_file):
    vocab = []
    with open(vocab_file, "r", encoding="utf-8") as f:
        for line in f:
            line = line.strip().split()
            vocab.append(line[0])
    return vocab


class CUCTCHypothesis(NamedTuple):
    r"""Represents a hypothesis generated by the CUDA CTC beam search decoder :class:`CUCTCDecoder`."""

    tokens: List[int]
    """Predicted sequence of token IDs. Shape `(L, )`, where `L` is the length of the output sequence."""

    words: List[str]
    """List of predicted tokens, aligned with the modeling unit."""

    score: float
    """Score corresponding to the hypothesis."""


_DEFAULT_BLANK_SKIP_THRESHOLD = 0.95


class CUCTCDecoder:
    """CUDA CTC beam search decoder.

    .. devices:: CUDA

    Note:
        To build the decoder, please use the factory function :func:`cuda_ctc_decoder`.
    """

    def __init__(
        self,
        vocab_list: List[str],
        blank_id: int = 0,
        beam_size: int = 10,
        nbest: int = 1,
        blank_skip_threshold: float = _DEFAULT_BLANK_SKIP_THRESHOLD,
        cuda_stream: torch.cuda.streams.Stream = None,
    ):
        """
        Args:
            vocab_list (List[str]): list of vocabulary tokens
            blank_id (int): token ID corresponding to blank; only 0 is supported for now. (Default: 0)
            beam_size (int, optional): maximum number of hypotheses to hold after each decode step (Default: 10)
            nbest (int): number of best decodings to return
            blank_skip_threshold (float):
                skip frames if log_prob(blank) > log(blank_skip_threshold), to speed up decoding.
                (Default: 0.95)
            cuda_stream (torch.cuda.streams.Stream): CUDA stream to use (Default: the current default stream)
        """
        if cuda_stream:
            if not isinstance(cuda_stream, torch.cuda.streams.Stream):
                raise AssertionError("cuda_stream must be torch.cuda.streams.Stream")
        cuda_stream_ = cuda_stream.cuda_stream if cuda_stream else torch.cuda.current_stream().cuda_stream
        self.internal_data = cuctc.prefixCTC_alloc(cuda_stream_)
        self.memory = torch.empty(0, dtype=torch.int8, device=torch.device("cuda"))
        if blank_id != 0:
            raise AssertionError("blank_id must be 0")
        self.blank_id = blank_id
        self.vocab_list = vocab_list
        self.space_id = 0
        self.nbest = nbest
        if not (0 <= blank_skip_threshold <= 1):
            raise AssertionError("blank_skip_threshold must be between 0 and 1")
        self.blank_skip_threshold = math.log(blank_skip_threshold)
        self.beam_size = min(beam_size, len(vocab_list))  # beam size must not exceed the vocab size

    def __del__(self):
        if cuctc is not None:
            cuctc.prefixCTC_free(self.internal_data)

    def __call__(self, log_prob: torch.Tensor, encoder_out_lens: torch.Tensor):
        """
        Args:
            log_prob (torch.FloatTensor): GPU tensor of shape `(batch, frame, num_tokens)` storing sequences of
                probability distributions over labels; log_softmax(output of acoustic model).
            encoder_out_lens (torch.Tensor of dtype torch.int32): GPU tensor of shape `(batch, )` storing the
                valid length along the time axis of the output tensor for each batch entry.

        Returns:
            List[List[CUCTCHypothesis]]:
                List of sorted best hypotheses for each audio sequence in the batch.
        """
        if not encoder_out_lens.dtype == torch.int32:
            raise AssertionError("encoder_out_lens must be torch.int32")
        if not log_prob.dtype == torch.float32:
            raise AssertionError("log_prob must be torch.float32")
        if not (log_prob.is_cuda and encoder_out_lens.is_cuda):
            raise AssertionError("inputs must be cuda tensors")
        if not (log_prob.is_contiguous() and encoder_out_lens.is_contiguous()):
            raise AssertionError("input tensors must be contiguous")
        required_size, score_hyps = cuctc.ctc_beam_search_decoder_batch_gpu_v2(
            self.internal_data,
            self.memory.data_ptr(),
            self.memory.size(0),
            log_prob.data_ptr(),
            encoder_out_lens.data_ptr(),
            log_prob.size(),
            log_prob.stride(),
            self.beam_size,
            self.blank_id,
            self.space_id,
            self.blank_skip_threshold,
        )
        if required_size > 0:
            self.memory = torch.empty(required_size, dtype=torch.int8, device=log_prob.device).contiguous()
            _, score_hyps = cuctc.ctc_beam_search_decoder_batch_gpu_v2(
                self.internal_data,
                self.memory.data_ptr(),
                self.memory.size(0),
                log_prob.data_ptr(),
                encoder_out_lens.data_ptr(),
                log_prob.size(),
                log_prob.stride(),
                self.beam_size,
                self.blank_id,
                self.space_id,
                self.blank_skip_threshold,
            )
        batch_size = len(score_hyps)
        hypos = []
        for i in range(batch_size):
            hypos.append(
                [
                    CUCTCHypothesis(
                        tokens=score_hyps[i][j][1],
                        words=[self.vocab_list[word_id] for word_id in score_hyps[i][j][1]],
                        score=score_hyps[i][j][0],
                    )
                    for j in range(self.nbest)
                ]
            )
        return hypos


def cuda_ctc_decoder(
    tokens: Union[str, List[str]],
    nbest: int = 1,
    beam_size: int = 10,
    blank_skip_threshold: float = _DEFAULT_BLANK_SKIP_THRESHOLD,
) -> CUCTCDecoder:
    """Builds an instance of :class:`CUCTCDecoder`.

    Args:
        tokens (str or List[str]): File or list containing valid tokens.
            If using a file, the expected format is for tokens mapping to the same index to be on the same line
        nbest (int): The number of best decodings to return
        beam_size (int, optional): The maximum number of hypotheses to hold after each decode step (Default: 10)
        blank_skip_threshold (float): skip frames if log_prob(blank) > log(blank_skip_threshold), to speed up decoding
            (Default: 0.95)

    Returns:
        CUCTCDecoder: decoder

    Example
        >>> decoder = cuda_ctc_decoder(
        >>>     tokens="tokens.txt",
        >>>     blank_skip_threshold=0.95,
        >>> )
        >>> results = decoder(log_probs, encoder_out_lens)  # List of shape (B, nbest) of Hypotheses
    """
    if isinstance(tokens, str):
        tokens = _get_vocab_list(tokens)

    return CUCTCDecoder(vocab_list=tokens, beam_size=beam_size, nbest=nbest, blank_skip_threshold=blank_skip_threshold)
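For context, a minimal usage sketch of the decoder above. This is not part of the package content: it assumes a CUDA-enabled torchaudio build (the module ships in this macOS arm64 wheel, but the decoder allocates its workspace on a CUDA device), and the token list and emission shapes are illustrative. Note that with the default blank_skip_threshold=0.95, frames whose blank log-probability exceeds log(0.95), roughly -0.051, are skipped.

# Hedged sketch; requires CUDA. Token set and shapes are illustrative assumptions.
import torch
from torchaudio.models.decoder import cuda_ctc_decoder

tokens = ["-", "a", "b", "c"]  # index 0 must be the blank token
decoder = cuda_ctc_decoder(tokens, nbest=1, beam_size=10)

batch, frames = 2, 100
emissions = torch.randn(batch, frames, len(tokens), device="cuda")
# The decoder expects contiguous float32 log-probabilities on a CUDA device
log_probs = torch.nn.functional.log_softmax(emissions, dim=-1).contiguous()
# ...and int32 valid lengths, also on a CUDA device
lengths = torch.full((batch,), frames, dtype=torch.int32, device="cuda")

results = decoder(log_probs, lengths)  # List[List[CUCTCHypothesis]]
best = results[0][0]
print(best.score, "".join(best.words))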
torchaudio/models/deepspeech.py
@@ -0,0 +1,84 @@
import torch

__all__ = ["DeepSpeech"]


class FullyConnected(torch.nn.Module):
    """
    Args:
        n_feature: Number of input features
        n_hidden: Internal hidden unit size.
        dropout: Dropout probability applied after the activation.
        relu_max_clip: Upper bound at which the ReLU activation is clipped. (Default: 20)
    """

    def __init__(self, n_feature: int, n_hidden: int, dropout: float, relu_max_clip: int = 20) -> None:
        super(FullyConnected, self).__init__()
        self.fc = torch.nn.Linear(n_feature, n_hidden, bias=True)
        self.relu_max_clip = relu_max_clip
        self.dropout = dropout

    def forward(self, x: torch.Tensor) -> torch.Tensor:
        x = self.fc(x)
        x = torch.nn.functional.relu(x)
        x = torch.nn.functional.hardtanh(x, 0, self.relu_max_clip)
        if self.dropout:
            x = torch.nn.functional.dropout(x, self.dropout, self.training)
        return x


class DeepSpeech(torch.nn.Module):
    """DeepSpeech architecture introduced in
    *Deep Speech: Scaling up end-to-end speech recognition* :cite:`hannun2014deep`.

    Args:
        n_feature: Number of input features
        n_hidden: Internal hidden unit size.
        n_class: Number of output classes
        dropout: Dropout probability applied in each fully connected layer. (Default: 0.0)
    """

    def __init__(
        self,
        n_feature: int,
        n_hidden: int = 2048,
        n_class: int = 40,
        dropout: float = 0.0,
    ) -> None:
        super(DeepSpeech, self).__init__()
        self.n_hidden = n_hidden
        self.fc1 = FullyConnected(n_feature, n_hidden, dropout)
        self.fc2 = FullyConnected(n_hidden, n_hidden, dropout)
        self.fc3 = FullyConnected(n_hidden, n_hidden, dropout)
        self.bi_rnn = torch.nn.RNN(n_hidden, n_hidden, num_layers=1, nonlinearity="relu", bidirectional=True)
        self.fc4 = FullyConnected(n_hidden, n_hidden, dropout)
        self.out = torch.nn.Linear(n_hidden, n_class)

    def forward(self, x: torch.Tensor) -> torch.Tensor:
        """
        Args:
            x (torch.Tensor): Tensor of dimension (batch, channel, time, feature).
        Returns:
            Tensor: Predictor tensor of dimension (batch, time, class).
        """
        # N x C x T x F
        x = self.fc1(x)
        # N x C x T x H
        x = self.fc2(x)
        # N x C x T x H
        x = self.fc3(x)
        # N x C x T x H
        x = x.squeeze(1)
        # N x T x H
        x = x.transpose(0, 1)
        # T x N x H
        x, _ = self.bi_rnn(x)
        # The fifth (non-recurrent) layer takes both the forward and backward units as inputs
        x = x[:, :, : self.n_hidden] + x[:, :, self.n_hidden :]
        # T x N x H
        x = self.fc4(x)
        # T x N x H
        x = self.out(x)
        # T x N x n_class
        x = x.permute(1, 0, 2)
        # N x T x n_class
        x = torch.nn.functional.log_softmax(x, dim=2)
        # N x T x n_class
        return x
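For context, a minimal forward-pass sketch for the DeepSpeech model above. This is not part of the package content; the batch size, sequence length, and feature dimension are illustrative assumptions.

# Hedged sketch; shapes are illustrative assumptions.
import torch
from torchaudio.models import DeepSpeech

model = DeepSpeech(n_feature=80, n_hidden=2048, n_class=40)
# (batch, channel, time, feature); the single channel is squeezed inside forward()
features = torch.randn(4, 1, 300, 80)
log_probs = model(features)  # (batch, time, class) log-probabilities
print(log_probs.shape)       # torch.Size([4, 300, 40])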