torchaudio-2.9.1-cp311-cp311-manylinux_2_28_aarch64.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (85)
  1. torchaudio/__init__.py +204 -0
  2. torchaudio/_extension/__init__.py +61 -0
  3. torchaudio/_extension/utils.py +133 -0
  4. torchaudio/_internal/__init__.py +10 -0
  5. torchaudio/_internal/module_utils.py +171 -0
  6. torchaudio/_torchcodec.py +340 -0
  7. torchaudio/compliance/__init__.py +5 -0
  8. torchaudio/compliance/kaldi.py +813 -0
  9. torchaudio/datasets/__init__.py +47 -0
  10. torchaudio/datasets/cmuarctic.py +157 -0
  11. torchaudio/datasets/cmudict.py +186 -0
  12. torchaudio/datasets/commonvoice.py +86 -0
  13. torchaudio/datasets/dr_vctk.py +121 -0
  14. torchaudio/datasets/fluentcommands.py +108 -0
  15. torchaudio/datasets/gtzan.py +1118 -0
  16. torchaudio/datasets/iemocap.py +147 -0
  17. torchaudio/datasets/librilight_limited.py +111 -0
  18. torchaudio/datasets/librimix.py +133 -0
  19. torchaudio/datasets/librispeech.py +174 -0
  20. torchaudio/datasets/librispeech_biasing.py +189 -0
  21. torchaudio/datasets/libritts.py +168 -0
  22. torchaudio/datasets/ljspeech.py +107 -0
  23. torchaudio/datasets/musdb_hq.py +139 -0
  24. torchaudio/datasets/quesst14.py +136 -0
  25. torchaudio/datasets/snips.py +157 -0
  26. torchaudio/datasets/speechcommands.py +183 -0
  27. torchaudio/datasets/tedlium.py +218 -0
  28. torchaudio/datasets/utils.py +54 -0
  29. torchaudio/datasets/vctk.py +143 -0
  30. torchaudio/datasets/voxceleb1.py +309 -0
  31. torchaudio/datasets/yesno.py +89 -0
  32. torchaudio/functional/__init__.py +130 -0
  33. torchaudio/functional/_alignment.py +128 -0
  34. torchaudio/functional/filtering.py +1685 -0
  35. torchaudio/functional/functional.py +2505 -0
  36. torchaudio/lib/__init__.py +0 -0
  37. torchaudio/lib/_torchaudio.so +0 -0
  38. torchaudio/lib/libtorchaudio.so +0 -0
  39. torchaudio/models/__init__.py +85 -0
  40. torchaudio/models/_hdemucs.py +1008 -0
  41. torchaudio/models/conformer.py +293 -0
  42. torchaudio/models/conv_tasnet.py +330 -0
  43. torchaudio/models/decoder/__init__.py +64 -0
  44. torchaudio/models/decoder/_ctc_decoder.py +568 -0
  45. torchaudio/models/decoder/_cuda_ctc_decoder.py +187 -0
  46. torchaudio/models/deepspeech.py +84 -0
  47. torchaudio/models/emformer.py +884 -0
  48. torchaudio/models/rnnt.py +816 -0
  49. torchaudio/models/rnnt_decoder.py +339 -0
  50. torchaudio/models/squim/__init__.py +11 -0
  51. torchaudio/models/squim/objective.py +326 -0
  52. torchaudio/models/squim/subjective.py +150 -0
  53. torchaudio/models/tacotron2.py +1046 -0
  54. torchaudio/models/wav2letter.py +72 -0
  55. torchaudio/models/wav2vec2/__init__.py +45 -0
  56. torchaudio/models/wav2vec2/components.py +1167 -0
  57. torchaudio/models/wav2vec2/model.py +1579 -0
  58. torchaudio/models/wav2vec2/utils/__init__.py +7 -0
  59. torchaudio/models/wav2vec2/utils/import_fairseq.py +213 -0
  60. torchaudio/models/wav2vec2/utils/import_huggingface.py +134 -0
  61. torchaudio/models/wav2vec2/wavlm_attention.py +214 -0
  62. torchaudio/models/wavernn.py +409 -0
  63. torchaudio/pipelines/__init__.py +102 -0
  64. torchaudio/pipelines/_source_separation_pipeline.py +109 -0
  65. torchaudio/pipelines/_squim_pipeline.py +156 -0
  66. torchaudio/pipelines/_tts/__init__.py +16 -0
  67. torchaudio/pipelines/_tts/impl.py +385 -0
  68. torchaudio/pipelines/_tts/interface.py +255 -0
  69. torchaudio/pipelines/_tts/utils.py +230 -0
  70. torchaudio/pipelines/_wav2vec2/__init__.py +0 -0
  71. torchaudio/pipelines/_wav2vec2/aligner.py +87 -0
  72. torchaudio/pipelines/_wav2vec2/impl.py +1699 -0
  73. torchaudio/pipelines/_wav2vec2/utils.py +346 -0
  74. torchaudio/pipelines/rnnt_pipeline.py +380 -0
  75. torchaudio/transforms/__init__.py +78 -0
  76. torchaudio/transforms/_multi_channel.py +467 -0
  77. torchaudio/transforms/_transforms.py +2138 -0
  78. torchaudio/utils/__init__.py +4 -0
  79. torchaudio/utils/download.py +89 -0
  80. torchaudio/version.py +2 -0
  81. torchaudio-2.9.1.dist-info/METADATA +133 -0
  82. torchaudio-2.9.1.dist-info/RECORD +85 -0
  83. torchaudio-2.9.1.dist-info/WHEEL +5 -0
  84. torchaudio-2.9.1.dist-info/licenses/LICENSE +25 -0
  85. torchaudio-2.9.1.dist-info/top_level.txt +1 -0
torchaudio/models/decoder/_cuda_ctc_decoder.py
@@ -0,0 +1,187 @@
+ from __future__ import annotations
+
+ import math
+
+ from typing import List, NamedTuple, Optional, Union
+
+ import torch
+ import torchaudio
+
+ torchaudio._extension._load_lib("libctc_prefix_decoder")
+ import torchaudio.lib.pybind11_prefixctc as cuctc
+
+
+ __all__ = ["CUCTCHypothesis", "CUCTCDecoder", "cuda_ctc_decoder"]
+
+
+ def _get_vocab_list(vocab_file):
+     vocab = []
+     with open(vocab_file, "r", encoding="utf-8") as f:
+         for line in f:
+             line = line.strip().split()
+             vocab.append(line[0])
+     return vocab
+
+
+ class CUCTCHypothesis(NamedTuple):
+     r"""Represents a hypothesis generated by the CUDA CTC beam search decoder :class:`CUCTCDecoder`."""
+     tokens: List[int]
+     """Predicted sequence of token IDs. Shape `(L, )`, where `L` is the length of the output sequence"""
+
+     words: List[str]
+     """List of predicted tokens, aligned with the modeling unit.
+     """
+
+     score: float
+     """Score corresponding to the hypothesis"""
+
+
+ _DEFAULT_BLANK_SKIP_THRESHOLD = 0.95
+
+
+ class CUCTCDecoder:
+     """CUDA CTC beam search decoder.
+
+     .. devices:: CUDA
+
+     Note:
+         To build the decoder, please use the factory function :func:`cuda_ctc_decoder`.
+     """
+
+     def __init__(
+         self,
+         vocab_list: List[str],
+         blank_id: int = 0,
+         beam_size: int = 10,
+         nbest: int = 1,
+         blank_skip_threshold: float = _DEFAULT_BLANK_SKIP_THRESHOLD,
+         cuda_stream: Optional[torch.cuda.streams.Stream] = None,
+     ):
+         """
+         Args:
+             vocab_list (List[str]): list of vocabulary tokens
+             blank_id (int): token ID corresponding to blank; only 0 is supported for now. (Default: 0)
+             beam_size (int, optional): max number of hypotheses to hold after each decode step (Default: 10)
+             nbest (int): number of best decodings to return
+             blank_skip_threshold (float):
+                 skip frames if log_prob(blank) > log(blank_skip_threshold), to speed up decoding.
+                 (Default: 0.95)
+             cuda_stream (torch.cuda.streams.Stream, optional): CUDA stream to run decoding on (Default: the current default stream)
+
+         """
+         if cuda_stream:
+             if not isinstance(cuda_stream, torch.cuda.streams.Stream):
+                 raise AssertionError("cuda_stream must be torch.cuda.streams.Stream")
+         cuda_stream_ = cuda_stream.cuda_stream if cuda_stream else torch.cuda.current_stream().cuda_stream
+         self.internal_data = cuctc.prefixCTC_alloc(cuda_stream_)
+         self.memory = torch.empty(0, dtype=torch.int8, device=torch.device("cuda"))
+         if blank_id != 0:
+             raise AssertionError("blank_id must be 0")
+         self.blank_id = blank_id
+         self.vocab_list = vocab_list
+         self.space_id = 0
+         self.nbest = nbest
+         if not (blank_skip_threshold >= 0 and blank_skip_threshold <= 1):
+             raise AssertionError("blank_skip_threshold must be between 0 and 1")
+         self.blank_skip_threshold = math.log(blank_skip_threshold)
+         self.beam_size = min(beam_size, len(vocab_list))  # beam size must not exceed vocab size
+
+     def __del__(self):
+         if cuctc is not None:
+             cuctc.prefixCTC_free(self.internal_data)
+
+     def __call__(self, log_prob: torch.Tensor, encoder_out_lens: torch.Tensor):
+         """
+         Args:
+             log_prob (torch.FloatTensor): GPU tensor of shape `(batch, frame, num_tokens)` storing sequences of
+                 log-probability distributions over labels, i.e. the log_softmax of the acoustic model output.
+             encoder_out_lens (torch.Tensor): GPU int32 tensor of shape `(batch, )` storing the valid length
+                 along the time axis of the output tensor for each batch element.
+
+         Returns:
+             List[List[CUCTCHypothesis]]:
+                 List of sorted best hypotheses for each audio sequence in the batch.
+         """
+         if not encoder_out_lens.dtype == torch.int32:
+             raise AssertionError("encoder_out_lens must be torch.int32")
+         if not log_prob.dtype == torch.float32:
+             raise AssertionError("log_prob must be torch.float32")
+         if not (log_prob.is_cuda and encoder_out_lens.is_cuda):
+             raise AssertionError("inputs must be cuda tensors")
+         if not (log_prob.is_contiguous() and encoder_out_lens.is_contiguous()):
+             raise AssertionError("input tensors must be contiguous")
+         required_size, score_hyps = cuctc.ctc_beam_search_decoder_batch_gpu_v2(
+             self.internal_data,
+             self.memory.data_ptr(),
+             self.memory.size(0),
+             log_prob.data_ptr(),
+             encoder_out_lens.data_ptr(),
+             log_prob.size(),
+             log_prob.stride(),
+             self.beam_size,
+             self.blank_id,
+             self.space_id,
+             self.blank_skip_threshold,
+         )
+         if required_size > 0:
+             self.memory = torch.empty(required_size, dtype=torch.int8, device=log_prob.device).contiguous()
+             _, score_hyps = cuctc.ctc_beam_search_decoder_batch_gpu_v2(
+                 self.internal_data,
+                 self.memory.data_ptr(),
+                 self.memory.size(0),
+                 log_prob.data_ptr(),
+                 encoder_out_lens.data_ptr(),
+                 log_prob.size(),
+                 log_prob.stride(),
+                 self.beam_size,
+                 self.blank_id,
+                 self.space_id,
+                 self.blank_skip_threshold,
+             )
+         batch_size = len(score_hyps)
+         hypos = []
+         for i in range(batch_size):
+             hypos.append(
+                 [
+                     CUCTCHypothesis(
+                         tokens=score_hyps[i][j][1],
+                         words=[self.vocab_list[word_id] for word_id in score_hyps[i][j][1]],
+                         score=score_hyps[i][j][0],
+                     )
+                     for j in range(self.nbest)
+                 ]
+             )
+         return hypos
+
+
+ def cuda_ctc_decoder(
+     tokens: Union[str, List[str]],
+     nbest: int = 1,
+     beam_size: int = 10,
+     blank_skip_threshold: float = _DEFAULT_BLANK_SKIP_THRESHOLD,
+ ) -> CUCTCDecoder:
+     """Builds an instance of :class:`CUCTCDecoder`.
+
+     Args:
+         tokens (str or List[str]): File or list containing valid tokens.
+             If using a file, tokens that map to the same index are expected to be on the same line.
+         nbest (int): The number of best decodings to return
+         beam_size (int, optional): The maximum number of hypotheses to hold after each decode step (Default: 10)
+         blank_id (int): The token ID corresponding to the blank symbol; fixed to 0 and not configurable through this factory function.
+         blank_skip_threshold (float): skip frames if log_prob(blank) > log(blank_skip_threshold), to speed up decoding
+             (Default: 0.95)
+
+     Returns:
+         CUCTCDecoder: decoder
+
+     Example:
+         >>> decoder = cuda_ctc_decoder(
+         ...     tokens="tokens.txt",
+         ...     blank_skip_threshold=0.95,
+         ... )
+         >>> results = decoder(log_probs, encoder_out_lens)  # List of shape (B, nbest) of Hypotheses
+     """
+     if type(tokens) is str:
+         tokens = _get_vocab_list(tokens)
+
+     return CUCTCDecoder(vocab_list=tokens, beam_size=beam_size, nbest=nbest, blank_skip_threshold=blank_skip_threshold)
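
For orientation, here is a minimal usage sketch of the decoder defined above. It is not part of the wheel: the CUDA device, the "tokens.txt" vocabulary file (blank token on the first line, one index per line), and the random emissions standing in for acoustic-model output are all illustrative assumptions.

# Hypothetical usage of cuda_ctc_decoder; requires a CUDA build of torchaudio
# with the prefix-CTC extension available.
import torch
from torchaudio.models.decoder import cuda_ctc_decoder

decoder = cuda_ctc_decoder(tokens="tokens.txt", nbest=3, beam_size=10, blank_skip_threshold=0.95)

batch, frames, num_tokens = 1, 100, 500  # num_tokens must equal the vocabulary size
emissions = torch.randn(batch, frames, num_tokens, device="cuda")
log_probs = torch.nn.functional.log_softmax(emissions, dim=-1).contiguous()  # float32, CUDA, contiguous
lengths = torch.full((batch,), frames, dtype=torch.int32, device="cuda")     # valid frames per batch element

hypos = decoder(log_probs, lengths)  # List[List[CUCTCHypothesis]] of shape (batch, nbest)
best = hypos[0][0]
print(best.score, "".join(best.words))

Note that blank_skip_threshold trades accuracy for speed: with the default of 0.95, any frame whose blank log-probability exceeds log(0.95) ≈ -0.051 is skipped outright.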
torchaudio/models/deepspeech.py
@@ -0,0 +1,84 @@
+ import torch
+
+ __all__ = ["DeepSpeech"]
+
+
+ class FullyConnected(torch.nn.Module):
+     """
+     Args:
+         n_feature: Number of input features.
+         n_hidden: Internal hidden unit size.
+     """
+
+     def __init__(self, n_feature: int, n_hidden: int, dropout: float, relu_max_clip: int = 20) -> None:
+         super(FullyConnected, self).__init__()
+         self.fc = torch.nn.Linear(n_feature, n_hidden, bias=True)
+         self.relu_max_clip = relu_max_clip
+         self.dropout = dropout
+
+     def forward(self, x: torch.Tensor) -> torch.Tensor:
+         x = self.fc(x)
+         x = torch.nn.functional.relu(x)
+         x = torch.nn.functional.hardtanh(x, 0, self.relu_max_clip)
+         if self.dropout:
+             x = torch.nn.functional.dropout(x, self.dropout, self.training)
+         return x
+
+
+ class DeepSpeech(torch.nn.Module):
+     """DeepSpeech architecture introduced in
+     *Deep Speech: Scaling up end-to-end speech recognition* :cite:`hannun2014deep`.
+
+     Args:
+         n_feature: Number of input features.
+         n_hidden: Internal hidden unit size.
+         n_class: Number of output classes.
+     """
+
+     def __init__(
+         self,
+         n_feature: int,
+         n_hidden: int = 2048,
+         n_class: int = 40,
+         dropout: float = 0.0,
+     ) -> None:
+         super(DeepSpeech, self).__init__()
+         self.n_hidden = n_hidden
+         self.fc1 = FullyConnected(n_feature, n_hidden, dropout)
+         self.fc2 = FullyConnected(n_hidden, n_hidden, dropout)
+         self.fc3 = FullyConnected(n_hidden, n_hidden, dropout)
+         self.bi_rnn = torch.nn.RNN(n_hidden, n_hidden, num_layers=1, nonlinearity="relu", bidirectional=True)
+         self.fc4 = FullyConnected(n_hidden, n_hidden, dropout)
+         self.out = torch.nn.Linear(n_hidden, n_class)
+
+     def forward(self, x: torch.Tensor) -> torch.Tensor:
+         """
+         Args:
+             x (torch.Tensor): Tensor of dimension (batch, channel, time, feature).
+         Returns:
+             Tensor: Predictor tensor of dimension (batch, time, class).
+         """
+         # N x C x T x F
+         x = self.fc1(x)
+         # N x C x T x H
+         x = self.fc2(x)
+         # N x C x T x H
+         x = self.fc3(x)
+         # N x C x T x H
+         x = x.squeeze(1)
+         # N x T x H
+         x = x.transpose(0, 1)
+         # T x N x H
+         x, _ = self.bi_rnn(x)
+         # The fifth (non-recurrent) layer takes both the forward and backward units as inputs
+         x = x[:, :, : self.n_hidden] + x[:, :, self.n_hidden :]
+         # T x N x H
+         x = self.fc4(x)
+         # T x N x H
+         x = self.out(x)
+         # T x N x n_class
+         x = x.permute(1, 0, 2)
+         # N x T x n_class
+         x = torch.nn.functional.log_softmax(x, dim=2)
+         # N x T x n_class
+         return x
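
To make the expected tensor shapes concrete, here is a minimal sketch of a forward pass through this model on dummy data; the feature size, sequence length, and class count below are illustrative assumptions, not values shipped with the package.

# Hypothetical forward pass through DeepSpeech; shapes follow the docstring above.
import torch
from torchaudio.models import DeepSpeech

model = DeepSpeech(n_feature=40, n_hidden=2048, n_class=29, dropout=0.0)
x = torch.randn(8, 1, 100, 40)  # (batch, channel, time, feature); channel must be 1 for the squeeze(1) to work
out = model(x)                  # (batch, time, n_class) log-probabilities
assert out.shape == (8, 100, 29)

Because the output is log_softmax over classes per frame, it can be fed directly to torch.nn.CTCLoss (which expects log-probabilities) after transposing to (time, batch, n_class).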