torchaudio-2.9.1-cp311-cp311-manylinux_2_28_aarch64.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- torchaudio/__init__.py +204 -0
- torchaudio/_extension/__init__.py +61 -0
- torchaudio/_extension/utils.py +133 -0
- torchaudio/_internal/__init__.py +10 -0
- torchaudio/_internal/module_utils.py +171 -0
- torchaudio/_torchcodec.py +340 -0
- torchaudio/compliance/__init__.py +5 -0
- torchaudio/compliance/kaldi.py +813 -0
- torchaudio/datasets/__init__.py +47 -0
- torchaudio/datasets/cmuarctic.py +157 -0
- torchaudio/datasets/cmudict.py +186 -0
- torchaudio/datasets/commonvoice.py +86 -0
- torchaudio/datasets/dr_vctk.py +121 -0
- torchaudio/datasets/fluentcommands.py +108 -0
- torchaudio/datasets/gtzan.py +1118 -0
- torchaudio/datasets/iemocap.py +147 -0
- torchaudio/datasets/librilight_limited.py +111 -0
- torchaudio/datasets/librimix.py +133 -0
- torchaudio/datasets/librispeech.py +174 -0
- torchaudio/datasets/librispeech_biasing.py +189 -0
- torchaudio/datasets/libritts.py +168 -0
- torchaudio/datasets/ljspeech.py +107 -0
- torchaudio/datasets/musdb_hq.py +139 -0
- torchaudio/datasets/quesst14.py +136 -0
- torchaudio/datasets/snips.py +157 -0
- torchaudio/datasets/speechcommands.py +183 -0
- torchaudio/datasets/tedlium.py +218 -0
- torchaudio/datasets/utils.py +54 -0
- torchaudio/datasets/vctk.py +143 -0
- torchaudio/datasets/voxceleb1.py +309 -0
- torchaudio/datasets/yesno.py +89 -0
- torchaudio/functional/__init__.py +130 -0
- torchaudio/functional/_alignment.py +128 -0
- torchaudio/functional/filtering.py +1685 -0
- torchaudio/functional/functional.py +2505 -0
- torchaudio/lib/__init__.py +0 -0
- torchaudio/lib/_torchaudio.so +0 -0
- torchaudio/lib/libtorchaudio.so +0 -0
- torchaudio/models/__init__.py +85 -0
- torchaudio/models/_hdemucs.py +1008 -0
- torchaudio/models/conformer.py +293 -0
- torchaudio/models/conv_tasnet.py +330 -0
- torchaudio/models/decoder/__init__.py +64 -0
- torchaudio/models/decoder/_ctc_decoder.py +568 -0
- torchaudio/models/decoder/_cuda_ctc_decoder.py +187 -0
- torchaudio/models/deepspeech.py +84 -0
- torchaudio/models/emformer.py +884 -0
- torchaudio/models/rnnt.py +816 -0
- torchaudio/models/rnnt_decoder.py +339 -0
- torchaudio/models/squim/__init__.py +11 -0
- torchaudio/models/squim/objective.py +326 -0
- torchaudio/models/squim/subjective.py +150 -0
- torchaudio/models/tacotron2.py +1046 -0
- torchaudio/models/wav2letter.py +72 -0
- torchaudio/models/wav2vec2/__init__.py +45 -0
- torchaudio/models/wav2vec2/components.py +1167 -0
- torchaudio/models/wav2vec2/model.py +1579 -0
- torchaudio/models/wav2vec2/utils/__init__.py +7 -0
- torchaudio/models/wav2vec2/utils/import_fairseq.py +213 -0
- torchaudio/models/wav2vec2/utils/import_huggingface.py +134 -0
- torchaudio/models/wav2vec2/wavlm_attention.py +214 -0
- torchaudio/models/wavernn.py +409 -0
- torchaudio/pipelines/__init__.py +102 -0
- torchaudio/pipelines/_source_separation_pipeline.py +109 -0
- torchaudio/pipelines/_squim_pipeline.py +156 -0
- torchaudio/pipelines/_tts/__init__.py +16 -0
- torchaudio/pipelines/_tts/impl.py +385 -0
- torchaudio/pipelines/_tts/interface.py +255 -0
- torchaudio/pipelines/_tts/utils.py +230 -0
- torchaudio/pipelines/_wav2vec2/__init__.py +0 -0
- torchaudio/pipelines/_wav2vec2/aligner.py +87 -0
- torchaudio/pipelines/_wav2vec2/impl.py +1699 -0
- torchaudio/pipelines/_wav2vec2/utils.py +346 -0
- torchaudio/pipelines/rnnt_pipeline.py +380 -0
- torchaudio/transforms/__init__.py +78 -0
- torchaudio/transforms/_multi_channel.py +467 -0
- torchaudio/transforms/_transforms.py +2138 -0
- torchaudio/utils/__init__.py +4 -0
- torchaudio/utils/download.py +89 -0
- torchaudio/version.py +2 -0
- torchaudio-2.9.1.dist-info/METADATA +133 -0
- torchaudio-2.9.1.dist-info/RECORD +85 -0
- torchaudio-2.9.1.dist-info/WHEEL +5 -0
- torchaudio-2.9.1.dist-info/licenses/LICENSE +25 -0
- torchaudio-2.9.1.dist-info/top_level.txt +1 -0
torchaudio/functional/__init__.py
@@ -0,0 +1,130 @@
+from torchaudio._internal.module_utils import dropping_support
+
+from ._alignment import forced_align as _forced_align, merge_tokens, TokenSpan
+from .filtering import (
+    allpass_biquad,
+    band_biquad,
+    bandpass_biquad,
+    bandreject_biquad,
+    bass_biquad,
+    biquad,
+    contrast,
+    dcshift,
+    deemph_biquad,
+    dither,
+    equalizer_biquad,
+    filtfilt,
+    flanger,
+    gain,
+    highpass_biquad,
+    lfilter,
+    lowpass_biquad,
+    overdrive,
+    phaser,
+    riaa_biquad,
+    treble_biquad,
+    vad,
+)
+
+forced_align = dropping_support(_forced_align)
+
+from .functional import (
+    add_noise,
+    amplitude_to_DB,
+    apply_beamforming,
+    compute_deltas,
+    convolve,
+    create_dct,
+    DB_to_amplitude,
+    deemphasis,
+    detect_pitch_frequency,
+    edit_distance,
+    fftconvolve,
+    frechet_distance,
+    griffinlim,
+    inverse_spectrogram,
+    linear_fbanks,
+    loudness,
+    mask_along_axis,
+    mask_along_axis_iid,
+    melscale_fbanks,
+    mu_law_decoding,
+    mu_law_encoding,
+    mvdr_weights_rtf,
+    mvdr_weights_souden,
+    phase_vocoder,
+    pitch_shift,
+    preemphasis,
+    psd,
+    resample,
+    rnnt_loss,
+    rtf_evd,
+    rtf_power,
+    sliding_window_cmn,
+    spectral_centroid,
+    spectrogram,
+    speed,
+)
+
+__all__ = [
+    "amplitude_to_DB",
+    "compute_deltas",
+    "create_dct",
+    "melscale_fbanks",
+    "linear_fbanks",
+    "DB_to_amplitude",
+    "loudness",
+    "detect_pitch_frequency",
+    "griffinlim",
+    "mask_along_axis",
+    "mask_along_axis_iid",
+    "mu_law_encoding",
+    "mu_law_decoding",
+    "phase_vocoder",
+    "sliding_window_cmn",
+    "spectrogram",
+    "inverse_spectrogram",
+    "spectral_centroid",
+    "allpass_biquad",
+    "band_biquad",
+    "bandpass_biquad",
+    "bandreject_biquad",
+    "bass_biquad",
+    "biquad",
+    "contrast",
+    "dither",
+    "dcshift",
+    "deemph_biquad",
+    "equalizer_biquad",
+    "filtfilt",
+    "flanger",
+    "forced_align",
+    "merge_tokens",
+    "TokenSpan",
+    "gain",
+    "highpass_biquad",
+    "lfilter",
+    "lowpass_biquad",
+    "overdrive",
+    "phaser",
+    "riaa_biquad",
+    "treble_biquad",
+    "vad",
+    "resample",
+    "edit_distance",
+    "pitch_shift",
+    "rnnt_loss",
+    "psd",
+    "mvdr_weights_souden",
+    "mvdr_weights_rtf",
+    "rtf_evd",
+    "rtf_power",
+    "apply_beamforming",
+    "fftconvolve",
+    "convolve",
+    "add_noise",
+    "speed",
+    "preemphasis",
+    "deemphasis",
+    "frechet_distance",
+]
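
The hunk above assembles torchaudio's flat functional namespace: the DSP ops are re-exported from .filtering and .functional, and forced_align is re-exported through dropping_support, an internal decorator that, judging by its name and torchaudio's 2.9 deprecation cycle, flags the API as slated to lose support. Below is a minimal usage sketch, not part of the diff; the waveform shape, sample rate, and filter parameters are illustrative assumptions.

    # Illustrative only: exercises two ops re-exported by torchaudio/functional/__init__.py.
    import torch
    import torchaudio.functional as F

    waveform = torch.randn(1, 16000)  # assumed: one second of mono audio at 16 kHz
    filtered = F.lowpass_biquad(waveform, sample_rate=16000, cutoff_freq=4000.0)
    downsampled = F.resample(filtered, orig_freq=16000, new_freq=8000)
    print(downsampled.shape)  # torch.Size([1, 8000])
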
torchaudio/functional/_alignment.py
@@ -0,0 +1,128 @@
+from dataclasses import dataclass
+from typing import List, Optional, Tuple
+
+import torch
+from torch import Tensor
+from torchaudio._extension import fail_if_no_align
+
+__all__ = []
+
+
+@fail_if_no_align
+def forced_align(
+    log_probs: Tensor,
+    targets: Tensor,
+    input_lengths: Optional[Tensor] = None,
+    target_lengths: Optional[Tensor] = None,
+    blank: int = 0,
+) -> Tuple[Tensor, Tensor]:
+    r"""Align a CTC label sequence to an emission.
+
+    .. devices:: CPU CUDA
+
+    .. properties:: TorchScript
+
+    Args:
+        log_probs (Tensor): log probability of CTC emission output.
+            Tensor of shape `(B, T, C)`, where `B` is the batch size, `T` is the input length,
+            `C` is the number of characters in alphabet including blank.
+        targets (Tensor): Target sequence. Tensor of shape `(B, L)`,
+            where `L` is the target length.
+        input_lengths (Tensor or None, optional):
+            Lengths of the inputs (each value must be <= `T`). 1-D Tensor of shape `(B,)`.
+        target_lengths (Tensor or None, optional):
+            Lengths of the targets. 1-D Tensor of shape `(B,)`.
+        blank (int, optional): The index of blank symbol in CTC emission. (Default: 0)
+
+    Returns:
+        Tuple(Tensor, Tensor):
+            Tensor: Label for each time step in the alignment path computed using forced alignment.
+
+            Tensor: Log probability scores of the labels for each time step.
+
+    Note:
+        The sequence length of `log_probs` must satisfy:
+
+
+        .. math::
+            L_{\text{log\_probs}} \ge L_{\text{label}} + N_{\text{repeat}}
+
+        where :math:`N_{\text{repeat}}` is the number of consecutively repeated tokens.
+        For example, in the string `"aabbc"`, the number of repeats is `2`.
+
+    Note:
+        The current version only supports ``batch_size==1``.
+    """
+    if blank in targets:
+        raise ValueError(f"targets Tensor shouldn't contain blank index. Found {targets}.")
+    if torch.max(targets) >= log_probs.shape[-1]:
+        raise ValueError("targets values must be less than the CTC dimension")
+
+    if input_lengths is None:
+        batch_size, length = log_probs.size(0), log_probs.size(1)
+        input_lengths = torch.full((batch_size,), length, dtype=torch.int64, device=log_probs.device)
+    if target_lengths is None:
+        batch_size, length = targets.size(0), targets.size(1)
+        target_lengths = torch.full((batch_size,), length, dtype=torch.int64, device=targets.device)
+
+    # For TorchScript compatibility
+    assert input_lengths is not None
+    assert target_lengths is not None
+
+    paths, scores = torch.ops.torchaudio.forced_align(log_probs, targets, input_lengths, target_lengths, blank)
+    return paths, scores[:, torch.arange(scores.shape[1]), paths[0]]
+
+
+@dataclass
+class TokenSpan:
+    """TokenSpan()
+    Token with time stamps and score. Returned by :py:func:`merge_tokens`.
+    """
+
+    token: int
+    """The token"""
+    start: int
+    """The start time (inclusive) in emission time axis."""
+    end: int
+    """The end time (exclusive) in emission time axis."""
+    score: float
+    """The score of this token."""
+
+    def __len__(self) -> int:
+        """Returns the time span"""
+        return self.end - self.start
+
+
+def merge_tokens(tokens: Tensor, scores: Tensor, blank: int = 0) -> List[TokenSpan]:
+    """Removes repeated tokens and blank tokens from the given CTC token sequence.
+
+    Args:
+        tokens (Tensor): Alignment tokens (unbatched) returned from :py:func:`forced_align`.
+            Shape: `(time, )`.
+        scores (Tensor): Alignment scores (unbatched) returned from :py:func:`forced_align`.
+            Shape: `(time, )`. When computing the token-level score, the given score is averaged
+            across the corresponding time span.
+
+    Returns:
+        list of TokenSpan
+
+    Example:
+        >>> aligned_tokens, scores = forced_align(emission, targets, input_lengths, target_lengths)
+        >>> token_spans = merge_tokens(aligned_tokens[0], scores[0])
+    """
+    if tokens.ndim != 1 or scores.ndim != 1:
+        raise ValueError("`tokens` and `scores` must be 1D Tensor.")
+    if len(tokens) != len(scores):
+        raise ValueError("`tokens` and `scores` must be the same length.")
+
+    diff = torch.diff(
+        tokens, prepend=torch.tensor([-1], device=tokens.device), append=torch.tensor([-1], device=tokens.device)
+    )
+    changes_wo_blank = torch.nonzero((diff != 0)).squeeze().tolist()
+    tokens = tokens.tolist()
+    spans = [
+        TokenSpan(token=token, start=start, end=end, score=scores[start:end].mean().item())
+        for start, end in zip(changes_wo_blank[:-1], changes_wo_blank[1:])
+        if (token := tokens[start]) != blank
+    ]
+    return spans
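
To make the forced_align / merge_tokens API above concrete, here is a minimal sketch expanding the docstring's own example. The emission, alphabet size, and target sequence are assumed toy values, and per the note above only batch_size == 1 is supported; note that the public torchaudio.functional.forced_align is the dropping_support-wrapped variant, so it may also emit a warning.

    import torch
    from torchaudio.functional import forced_align, merge_tokens

    # Assumed toy problem: 50 emission frames over a 10-symbol alphabet (index 0 = blank).
    # T=50 satisfies T >= L + N_repeat (4 + 1 here, since 3 repeats once in the targets).
    log_probs = torch.randn(1, 50, 10).log_softmax(dim=-1)     # (B, T, C), B must be 1
    targets = torch.tensor([[2, 3, 3, 5]], dtype=torch.int32)  # (B, L), no blank index

    # input_lengths / target_lengths default to the full T and L when omitted.
    aligned_tokens, scores = forced_align(log_probs, targets)

    # merge_tokens expects the unbatched (time,) tensors.
    for span in merge_tokens(aligned_tokens[0], scores[0]):
        print(span.token, span.start, span.end, round(span.score, 3))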