torchaudio-2.9.1-cp311-cp311-manylinux_2_28_aarch64.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- torchaudio/__init__.py +204 -0
- torchaudio/_extension/__init__.py +61 -0
- torchaudio/_extension/utils.py +133 -0
- torchaudio/_internal/__init__.py +10 -0
- torchaudio/_internal/module_utils.py +171 -0
- torchaudio/_torchcodec.py +340 -0
- torchaudio/compliance/__init__.py +5 -0
- torchaudio/compliance/kaldi.py +813 -0
- torchaudio/datasets/__init__.py +47 -0
- torchaudio/datasets/cmuarctic.py +157 -0
- torchaudio/datasets/cmudict.py +186 -0
- torchaudio/datasets/commonvoice.py +86 -0
- torchaudio/datasets/dr_vctk.py +121 -0
- torchaudio/datasets/fluentcommands.py +108 -0
- torchaudio/datasets/gtzan.py +1118 -0
- torchaudio/datasets/iemocap.py +147 -0
- torchaudio/datasets/librilight_limited.py +111 -0
- torchaudio/datasets/librimix.py +133 -0
- torchaudio/datasets/librispeech.py +174 -0
- torchaudio/datasets/librispeech_biasing.py +189 -0
- torchaudio/datasets/libritts.py +168 -0
- torchaudio/datasets/ljspeech.py +107 -0
- torchaudio/datasets/musdb_hq.py +139 -0
- torchaudio/datasets/quesst14.py +136 -0
- torchaudio/datasets/snips.py +157 -0
- torchaudio/datasets/speechcommands.py +183 -0
- torchaudio/datasets/tedlium.py +218 -0
- torchaudio/datasets/utils.py +54 -0
- torchaudio/datasets/vctk.py +143 -0
- torchaudio/datasets/voxceleb1.py +309 -0
- torchaudio/datasets/yesno.py +89 -0
- torchaudio/functional/__init__.py +130 -0
- torchaudio/functional/_alignment.py +128 -0
- torchaudio/functional/filtering.py +1685 -0
- torchaudio/functional/functional.py +2505 -0
- torchaudio/lib/__init__.py +0 -0
- torchaudio/lib/_torchaudio.so +0 -0
- torchaudio/lib/libtorchaudio.so +0 -0
- torchaudio/models/__init__.py +85 -0
- torchaudio/models/_hdemucs.py +1008 -0
- torchaudio/models/conformer.py +293 -0
- torchaudio/models/conv_tasnet.py +330 -0
- torchaudio/models/decoder/__init__.py +64 -0
- torchaudio/models/decoder/_ctc_decoder.py +568 -0
- torchaudio/models/decoder/_cuda_ctc_decoder.py +187 -0
- torchaudio/models/deepspeech.py +84 -0
- torchaudio/models/emformer.py +884 -0
- torchaudio/models/rnnt.py +816 -0
- torchaudio/models/rnnt_decoder.py +339 -0
- torchaudio/models/squim/__init__.py +11 -0
- torchaudio/models/squim/objective.py +326 -0
- torchaudio/models/squim/subjective.py +150 -0
- torchaudio/models/tacotron2.py +1046 -0
- torchaudio/models/wav2letter.py +72 -0
- torchaudio/models/wav2vec2/__init__.py +45 -0
- torchaudio/models/wav2vec2/components.py +1167 -0
- torchaudio/models/wav2vec2/model.py +1579 -0
- torchaudio/models/wav2vec2/utils/__init__.py +7 -0
- torchaudio/models/wav2vec2/utils/import_fairseq.py +213 -0
- torchaudio/models/wav2vec2/utils/import_huggingface.py +134 -0
- torchaudio/models/wav2vec2/wavlm_attention.py +214 -0
- torchaudio/models/wavernn.py +409 -0
- torchaudio/pipelines/__init__.py +102 -0
- torchaudio/pipelines/_source_separation_pipeline.py +109 -0
- torchaudio/pipelines/_squim_pipeline.py +156 -0
- torchaudio/pipelines/_tts/__init__.py +16 -0
- torchaudio/pipelines/_tts/impl.py +385 -0
- torchaudio/pipelines/_tts/interface.py +255 -0
- torchaudio/pipelines/_tts/utils.py +230 -0
- torchaudio/pipelines/_wav2vec2/__init__.py +0 -0
- torchaudio/pipelines/_wav2vec2/aligner.py +87 -0
- torchaudio/pipelines/_wav2vec2/impl.py +1699 -0
- torchaudio/pipelines/_wav2vec2/utils.py +346 -0
- torchaudio/pipelines/rnnt_pipeline.py +380 -0
- torchaudio/transforms/__init__.py +78 -0
- torchaudio/transforms/_multi_channel.py +467 -0
- torchaudio/transforms/_transforms.py +2138 -0
- torchaudio/utils/__init__.py +4 -0
- torchaudio/utils/download.py +89 -0
- torchaudio/version.py +2 -0
- torchaudio-2.9.1.dist-info/METADATA +133 -0
- torchaudio-2.9.1.dist-info/RECORD +85 -0
- torchaudio-2.9.1.dist-info/WHEEL +5 -0
- torchaudio-2.9.1.dist-info/licenses/LICENSE +25 -0
- torchaudio-2.9.1.dist-info/top_level.txt +1 -0

torchaudio/datasets/__init__.py
@@ -0,0 +1,47 @@
+from .cmuarctic import CMUARCTIC
+from .cmudict import CMUDict
+from .commonvoice import COMMONVOICE
+from .dr_vctk import DR_VCTK
+from .fluentcommands import FluentSpeechCommands
+from .gtzan import GTZAN
+from .iemocap import IEMOCAP
+from .librilight_limited import LibriLightLimited
+from .librimix import LibriMix
+from .librispeech import LIBRISPEECH
+from .librispeech_biasing import LibriSpeechBiasing
+from .libritts import LIBRITTS
+from .ljspeech import LJSPEECH
+from .musdb_hq import MUSDB_HQ
+from .quesst14 import QUESST14
+from .snips import Snips
+from .speechcommands import SPEECHCOMMANDS
+from .tedlium import TEDLIUM
+from .vctk import VCTK_092
+from .voxceleb1 import VoxCeleb1Identification, VoxCeleb1Verification
+from .yesno import YESNO
+
+
+__all__ = [
+    "COMMONVOICE",
+    "LIBRISPEECH",
+    "LibriSpeechBiasing",
+    "LibriLightLimited",
+    "SPEECHCOMMANDS",
+    "VCTK_092",
+    "DR_VCTK",
+    "YESNO",
+    "LJSPEECH",
+    "GTZAN",
+    "CMUARCTIC",
+    "CMUDict",
+    "LibriMix",
+    "LIBRITTS",
+    "TEDLIUM",
+    "QUESST14",
+    "MUSDB_HQ",
+    "FluentSpeechCommands",
+    "VoxCeleb1Identification",
+    "VoxCeleb1Verification",
+    "IEMOCAP",
+    "Snips",
+]
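
All of the dataset classes re-exported above subclass torch.utils.data.Dataset, so any of them can be wrapped in a standard DataLoader. A minimal sketch, assuming the YESNO corpus (not shown in this diff) is fetched into a hypothetical, pre-existing ./data directory:

from torch.utils.data import DataLoader
from torchaudio.datasets import YESNO

dataset = YESNO("./data", download=True)  # "./data" is an illustrative path that must already exist
loader = DataLoader(dataset, batch_size=1, shuffle=True)
for waveform, sample_rate, labels in loader:
    break  # one (waveform, sample_rate, labels) batch per utterance
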
torchaudio/datasets/cmuarctic.py
@@ -0,0 +1,157 @@
+import csv
+import os
+from pathlib import Path
+from typing import Tuple, Union
+
+import torchaudio
+from torch import Tensor
+from torch.utils.data import Dataset
+from torchaudio._internal import download_url_to_file
+from torchaudio.datasets.utils import _extract_tar
+
+URL = "aew"
+FOLDER_IN_ARCHIVE = "ARCTIC"
+_CHECKSUMS = {
+    "http://festvox.org/cmu_arctic/packed/cmu_us_aew_arctic.tar.bz2": "645cb33c0f0b2ce41384fdd8d3db2c3f5fc15c1e688baeb74d2e08cab18ab406",  # noqa: E501
+    "http://festvox.org/cmu_arctic/packed/cmu_us_ahw_arctic.tar.bz2": "024664adeb892809d646a3efd043625b46b5bfa3e6189b3500b2d0d59dfab06c",  # noqa: E501
+    "http://festvox.org/cmu_arctic/packed/cmu_us_aup_arctic.tar.bz2": "2c55bc3050caa996758869126ad10cf42e1441212111db034b3a45189c18b6fc",  # noqa: E501
+    "http://festvox.org/cmu_arctic/packed/cmu_us_awb_arctic.tar.bz2": "d74a950c9739a65f7bfc4dfa6187f2730fa03de5b8eb3f2da97a51b74df64d3c",  # noqa: E501
+    "http://festvox.org/cmu_arctic/packed/cmu_us_axb_arctic.tar.bz2": "dd65c3d2907d1ee52f86e44f578319159e60f4bf722a9142be01161d84e330ff",  # noqa: E501
+    "http://festvox.org/cmu_arctic/packed/cmu_us_bdl_arctic.tar.bz2": "26b91aaf48b2799b2956792b4632c2f926cd0542f402b5452d5adecb60942904",  # noqa: E501
+    "http://festvox.org/cmu_arctic/packed/cmu_us_clb_arctic.tar.bz2": "3f16dc3f3b97955ea22623efb33b444341013fc660677b2e170efdcc959fa7c6",  # noqa: E501
+    "http://festvox.org/cmu_arctic/packed/cmu_us_eey_arctic.tar.bz2": "8a0ee4e5acbd4b2f61a4fb947c1730ab3adcc9dc50b195981d99391d29928e8a",  # noqa: E501
+    "http://festvox.org/cmu_arctic/packed/cmu_us_fem_arctic.tar.bz2": "3fcff629412b57233589cdb058f730594a62c4f3a75c20de14afe06621ef45e2",  # noqa: E501
+    "http://festvox.org/cmu_arctic/packed/cmu_us_gka_arctic.tar.bz2": "dc82e7967cbd5eddbed33074b0699128dbd4482b41711916d58103707e38c67f",  # noqa: E501
+    "http://festvox.org/cmu_arctic/packed/cmu_us_jmk_arctic.tar.bz2": "3a37c0e1dfc91e734fdbc88b562d9e2ebca621772402cdc693bbc9b09b211d73",  # noqa: E501
+    "http://festvox.org/cmu_arctic/packed/cmu_us_ksp_arctic.tar.bz2": "8029cafce8296f9bed3022c44ef1e7953332b6bf6943c14b929f468122532717",  # noqa: E501
+    "http://festvox.org/cmu_arctic/packed/cmu_us_ljm_arctic.tar.bz2": "b23993765cbf2b9e7bbc3c85b6c56eaf292ac81ee4bb887b638a24d104f921a0",  # noqa: E501
+    "http://festvox.org/cmu_arctic/packed/cmu_us_lnh_arctic.tar.bz2": "4faf34d71aa7112813252fb20c5433e2fdd9a9de55a00701ffcbf05f24a5991a",  # noqa: E501
+    "http://festvox.org/cmu_arctic/packed/cmu_us_rms_arctic.tar.bz2": "c6dc11235629c58441c071a7ba8a2d067903dfefbaabc4056d87da35b72ecda4",  # noqa: E501
+    "http://festvox.org/cmu_arctic/packed/cmu_us_rxr_arctic.tar.bz2": "1fa4271c393e5998d200e56c102ff46fcfea169aaa2148ad9e9469616fbfdd9b",  # noqa: E501
+    "http://festvox.org/cmu_arctic/packed/cmu_us_slp_arctic.tar.bz2": "54345ed55e45c23d419e9a823eef427f1cc93c83a710735ec667d068c916abf1",  # noqa: E501
+    "http://festvox.org/cmu_arctic/packed/cmu_us_slt_arctic.tar.bz2": "7c173297916acf3cc7fcab2713be4c60b27312316765a90934651d367226b4ea",  # noqa: E501
+}
+
+
+def load_cmuarctic_item(line: str, path: str, folder_audio: str, ext_audio: str) -> Tuple[Tensor, int, str, str]:
+
+    utterance_id, transcript = line[0].strip().split(" ", 2)[1:]
+
+    # Remove space, double quote, and single parenthesis from transcript
+    transcript = transcript[1:-3]
+
+    file_audio = os.path.join(path, folder_audio, utterance_id + ext_audio)
+
+    # Load audio
+    waveform, sample_rate = torchaudio.load(file_audio)
+
+    return (waveform, sample_rate, transcript, utterance_id.split("_")[1])
+
+
+class CMUARCTIC(Dataset):
+    """*CMU ARCTIC* :cite:`Kominek03cmuarctic` dataset.
+
+    Args:
+        root (str or Path): Path to the directory where the dataset is found or downloaded.
+        url (str, optional):
+            The URL to download the dataset from or the type of the dataset to download.
+            (default: ``"aew"``)
+            Allowed type values are ``"aew"``, ``"ahw"``, ``"aup"``, ``"awb"``, ``"axb"``, ``"bdl"``,
+            ``"clb"``, ``"eey"``, ``"fem"``, ``"gka"``, ``"jmk"``, ``"ksp"``, ``"ljm"``, ``"lnh"``,
+            ``"rms"``, ``"rxr"``, ``"slp"`` or ``"slt"``.
+        folder_in_archive (str, optional):
+            The top-level directory of the dataset. (default: ``"ARCTIC"``)
+        download (bool, optional):
+            Whether to download the dataset if it is not found at root path. (default: ``False``).
+    """
+
+    _file_text = "txt.done.data"
+    _folder_text = "etc"
+    _ext_audio = ".wav"
+    _folder_audio = "wav"
+
+    def __init__(
+        self, root: Union[str, Path], url: str = URL, folder_in_archive: str = FOLDER_IN_ARCHIVE, download: bool = False
+    ) -> None:
+
+        if url in [
+            "aew",
+            "ahw",
+            "aup",
+            "awb",
+            "axb",
+            "bdl",
+            "clb",
+            "eey",
+            "fem",
+            "gka",
+            "jmk",
+            "ksp",
+            "ljm",
+            "lnh",
+            "rms",
+            "rxr",
+            "slp",
+            "slt",
+        ]:
+
+            url = "cmu_us_" + url + "_arctic"
+            ext_archive = ".tar.bz2"
+            base_url = "http://www.festvox.org/cmu_arctic/packed/"
+
+            url = os.path.join(base_url, url + ext_archive)
+
+        # Get string representation of 'root' in case Path object is passed
+        root = os.fspath(root)
+
+        basename = os.path.basename(url)
+        root = os.path.join(root, folder_in_archive)
+        if not os.path.isdir(root):
+            os.mkdir(root)
+        archive = os.path.join(root, basename)
+
+        basename = basename.split(".")[0]
+
+        self._path = os.path.join(root, basename)
+
+        if download:
+            if not os.path.isdir(self._path):
+                if not os.path.isfile(archive):
+                    checksum = _CHECKSUMS.get(url, None)
+                    download_url_to_file(url, archive, hash_prefix=checksum)
+                _extract_tar(archive)
+        else:
+            if not os.path.exists(self._path):
+                raise RuntimeError(
+                    f"The path {self._path} doesn't exist. "
+                    "Please check the ``root`` path or set `download=True` to download it"
+                )
+        self._text = os.path.join(self._path, self._folder_text, self._file_text)
+
+        with open(self._text, "r") as text:
+            walker = csv.reader(text)
+            self._walker = list(walker)
+
+    def __getitem__(self, n: int) -> Tuple[Tensor, int, str, str]:
+        """Load the n-th sample from the dataset.
+
+        Args:
+            n (int): The index of the sample to be loaded
+
+        Returns:
+            Tuple of the following items;
+
+            Tensor:
+                Waveform
+            int:
+                Sample rate
+            str:
+                Transcript
+            str:
+                Utterance ID
+        """
+        line = self._walker[n]
+        return load_cmuarctic_item(line, self._path, self._folder_audio, self._ext_audio)
+
+    def __len__(self) -> int:
+        return len(self._walker)
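
A minimal usage sketch of the CMUARCTIC class above. The ./data directory and the "slt" speaker are only illustrative choices, not part of the package:

from torchaudio.datasets import CMUARCTIC

# "./data" must already exist; the selected speaker archive is downloaded into it.
dataset = CMUARCTIC("./data", url="slt", download=True)
waveform, sample_rate, transcript, utterance_id = dataset[0]
print(sample_rate, utterance_id, transcript)
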
torchaudio/datasets/cmudict.py
@@ -0,0 +1,186 @@
+import os
+import re
+from pathlib import Path
+from typing import Iterable, List, Tuple, Union
+
+from torch.utils.data import Dataset
+from torchaudio._internal import download_url_to_file
+
+
+_CHECKSUMS = {
+    "http://svn.code.sf.net/p/cmusphinx/code/trunk/cmudict/cmudict-0.7b": "209a8b4cd265013e96f4658632a9878103b0c5abf62b50d4ef3ae1be226b29e4",  # noqa: E501
+    "http://svn.code.sf.net/p/cmusphinx/code/trunk/cmudict/cmudict-0.7b.symbols": "408ccaae803641c6d7b626b6299949320c2dbca96b2220fd3fb17887b023b027",  # noqa: E501
+}
+_PUNCTUATIONS = {
+    "!EXCLAMATION-POINT",
+    '"CLOSE-QUOTE',
+    '"DOUBLE-QUOTE',
+    '"END-OF-QUOTE',
+    '"END-QUOTE',
+    '"IN-QUOTES',
+    '"QUOTE',
+    '"UNQUOTE',
+    "#HASH-MARK",
+    "#POUND-SIGN",
+    "#SHARP-SIGN",
+    "%PERCENT",
+    "&AMPERSAND",
+    "'END-INNER-QUOTE",
+    "'END-QUOTE",
+    "'INNER-QUOTE",
+    "'QUOTE",
+    "'SINGLE-QUOTE",
+    "(BEGIN-PARENS",
+    "(IN-PARENTHESES",
+    "(LEFT-PAREN",
+    "(OPEN-PARENTHESES",
+    "(PAREN",
+    "(PARENS",
+    "(PARENTHESES",
+    ")CLOSE-PAREN",
+    ")CLOSE-PARENTHESES",
+    ")END-PAREN",
+    ")END-PARENS",
+    ")END-PARENTHESES",
+    ")END-THE-PAREN",
+    ")PAREN",
+    ")PARENS",
+    ")RIGHT-PAREN",
+    ")UN-PARENTHESES",
+    "+PLUS",
+    ",COMMA",
+    "--DASH",
+    "-DASH",
+    "-HYPHEN",
+    "...ELLIPSIS",
+    ".DECIMAL",
+    ".DOT",
+    ".FULL-STOP",
+    ".PERIOD",
+    ".POINT",
+    "/SLASH",
+    ":COLON",
+    ";SEMI-COLON",
+    ";SEMI-COLON(1)",
+    "?QUESTION-MARK",
+    "{BRACE",
+    "{LEFT-BRACE",
+    "{OPEN-BRACE",
+    "}CLOSE-BRACE",
+    "}RIGHT-BRACE",
+}
+
+
+def _parse_dictionary(lines: Iterable[str], exclude_punctuations: bool) -> List[str]:
+    _alt_re = re.compile(r"\([0-9]+\)")
+    cmudict: List[Tuple[str, List[str]]] = []
+    for line in lines:
+        if not line or line.startswith(";;;"):  # ignore comments
+            continue
+
+        word, phones = line.strip().split("  ")
+        if word in _PUNCTUATIONS:
+            if exclude_punctuations:
+                continue
+            # !EXCLAMATION-POINT -> !
+            # --DASH -> --
+            # ...ELLIPSIS -> ...
+            if word.startswith("..."):
+                word = "..."
+            elif word.startswith("--"):
+                word = "--"
+            else:
+                word = word[0]
+
+        # if a word have multiple pronunciations, there will be (number) appended to it
+        # for example, DATAPOINTS and DATAPOINTS(1),
+        # the regular expression `_alt_re` removes the '(1)' and change the word DATAPOINTS(1) to DATAPOINTS
+        word = re.sub(_alt_re, "", word)
+        phones = phones.split(" ")
+        cmudict.append((word, phones))
+
+    return cmudict
+
+
+class CMUDict(Dataset):
+    """*CMU Pronouncing Dictionary* :cite:`cmudict` (CMUDict) dataset.
+
+    Args:
+        root (str or Path): Path to the directory where the dataset is found or downloaded.
+        exclude_punctuations (bool, optional):
+            When enabled, exclude the pronounciation of punctuations, such as
+            `!EXCLAMATION-POINT` and `#HASH-MARK`.
+        download (bool, optional):
+            Whether to download the dataset if it is not found at root path. (default: ``False``).
+        url (str, optional):
+            The URL to download the dictionary from.
+            (default: ``"http://svn.code.sf.net/p/cmusphinx/code/trunk/cmudict/cmudict-0.7b"``)
+        url_symbols (str, optional):
+            The URL to download the list of symbols from.
+            (default: ``"http://svn.code.sf.net/p/cmusphinx/code/trunk/cmudict/cmudict-0.7b.symbols"``)
+    """
+
+    def __init__(
+        self,
+        root: Union[str, Path],
+        exclude_punctuations: bool = True,
+        *,
+        download: bool = False,
+        url: str = "http://svn.code.sf.net/p/cmusphinx/code/trunk/cmudict/cmudict-0.7b",
+        url_symbols: str = "http://svn.code.sf.net/p/cmusphinx/code/trunk/cmudict/cmudict-0.7b.symbols",
+    ) -> None:
+
+        self.exclude_punctuations = exclude_punctuations
+
+        self._root_path = Path(root)
+        if not os.path.isdir(self._root_path):
+            raise RuntimeError(f"The root directory does not exist; {root}")
+
+        dict_file = self._root_path / os.path.basename(url)
+        symbol_file = self._root_path / os.path.basename(url_symbols)
+        if not os.path.exists(dict_file):
+            if not download:
+                raise RuntimeError(
+                    "The dictionary file is not found in the following location. "
+                    f"Set `download=True` to download it. {dict_file}"
+                )
+            checksum = _CHECKSUMS.get(url, None)
+            download_url_to_file(url, dict_file, checksum)
+        if not os.path.exists(symbol_file):
+            if not download:
+                raise RuntimeError(
+                    "The symbol file is not found in the following location. "
+                    f"Set `download=True` to download it. {symbol_file}"
+                )
+            checksum = _CHECKSUMS.get(url_symbols, None)
+            download_url_to_file(url_symbols, symbol_file, checksum)
+
+        with open(symbol_file, "r") as text:
+            self._symbols = [line.strip() for line in text.readlines()]
+
+        with open(dict_file, "r", encoding="latin-1") as text:
+            self._dictionary = _parse_dictionary(text.readlines(), exclude_punctuations=self.exclude_punctuations)
+
+    def __getitem__(self, n: int) -> Tuple[str, List[str]]:
+        """Load the n-th sample from the dataset.
+
+        Args:
+            n (int): The index of the sample to be loaded.
+
+        Returns:
+            Tuple of a word and its phonemes
+
+            str:
+                Word
+            List[str]:
+                Phonemes
+        """
+        return self._dictionary[n]
+
+    def __len__(self) -> int:
+        return len(self._dictionary)
+
+    @property
+    def symbols(self) -> List[str]:
+        """list[str]: A list of phonemes symbols, such as ``"AA"``, ``"AE"``, ``"AH"``."""
+        return self._symbols.copy()
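
A minimal usage sketch for the CMUDict class above; ./data is a hypothetical directory that must already exist (the constructor raises otherwise):

from torchaudio.datasets import CMUDict

cmudict = CMUDict("./data", download=True)
word, phonemes = cmudict[0]
print(word, phonemes)
print(cmudict.symbols[:5])  # phoneme symbols such as "AA", "AE", "AH"
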
torchaudio/datasets/commonvoice.py
@@ -0,0 +1,86 @@
+import csv
+import os
+from pathlib import Path
+from typing import Dict, List, Tuple, Union
+
+import torchaudio
+from torch import Tensor
+from torch.utils.data import Dataset
+
+
+def load_commonvoice_item(
+    line: List[str], header: List[str], path: str, folder_audio: str, ext_audio: str
+) -> Tuple[Tensor, int, Dict[str, str]]:
+    # Each line as the following data:
+    # client_id, path, sentence, up_votes, down_votes, age, gender, accent
+
+    if header[1] != "path":
+        raise ValueError(f"expect `header[1]` to be 'path', but got {header[1]}")
+    fileid = line[1]
+    filename = os.path.join(path, folder_audio, fileid)
+    if not filename.endswith(ext_audio):
+        filename += ext_audio
+    waveform, sample_rate = torchaudio.load(filename)
+
+    dic = dict(zip(header, line))
+
+    return waveform, sample_rate, dic
+
+
+class COMMONVOICE(Dataset):
+    """*CommonVoice* :cite:`ardila2020common` dataset.
+
+    Args:
+        root (str or Path): Path to the directory where the dataset is located.
+             (Where the ``tsv`` file is present.)
+        tsv (str, optional):
+            The name of the tsv file used to construct the metadata, such as
+            ``"train.tsv"``, ``"test.tsv"``, ``"dev.tsv"``, ``"invalidated.tsv"``,
+            ``"validated.tsv"`` and ``"other.tsv"``. (default: ``"train.tsv"``)
+    """
+
+    _ext_txt = ".txt"
+    _ext_audio = ".mp3"
+    _folder_audio = "clips"
+
+    def __init__(self, root: Union[str, Path], tsv: str = "train.tsv") -> None:
+
+        # Get string representation of 'root' in case Path object is passed
+        self._path = os.fspath(root)
+        self._tsv = os.path.join(self._path, tsv)
+
+        with open(self._tsv, "r") as tsv_:
+            walker = csv.reader(tsv_, delimiter="\t")
+            self._header = next(walker)
+            self._walker = list(walker)
+
+    def __getitem__(self, n: int) -> Tuple[Tensor, int, Dict[str, str]]:
+        """Load the n-th sample from the dataset.
+
+        Args:
+            n (int): The index of the sample to be loaded
+
+        Returns:
+            Tuple of the following items;
+
+            Tensor:
+                Waveform
+            int:
+                Sample rate
+            Dict[str, str]:
+                Dictionary containing the following items from the corresponding TSV file;
+
+                * ``"client_id"``
+                * ``"path"``
+                * ``"sentence"``
+                * ``"up_votes"``
+                * ``"down_votes"``
+                * ``"age"``
+                * ``"gender"``
+                * ``"accent"``
+        """
+        line = self._walker[n]
+        return load_commonvoice_item(line, self._header, self._path, self._folder_audio, self._ext_audio)
+
+    def __len__(self) -> int:
+        return len(self._walker)
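
COMMONVOICE does not download anything itself; the sketch below assumes a CommonVoice release has already been extracted under a hypothetical ./CommonVoice directory containing train.tsv and the clips folder:

from torchaudio.datasets import COMMONVOICE

dataset = COMMONVOICE("./CommonVoice", tsv="train.tsv")
waveform, sample_rate, metadata = dataset[0]
print(sample_rate, metadata["sentence"], metadata["client_id"])
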
torchaudio/datasets/dr_vctk.py
@@ -0,0 +1,121 @@
+from pathlib import Path
+from typing import Dict, Tuple, Union
+
+import torchaudio
+from torch import Tensor
+from torch.utils.data import Dataset
+from torchaudio._internal import download_url_to_file
+from torchaudio.datasets.utils import _extract_zip
+
+
+_URL = "https://datashare.ed.ac.uk/bitstream/handle/10283/3038/DR-VCTK.zip"
+_CHECKSUM = "781f12f4406ed36ed27ae3bce55da47ba176e2d8bae67319e389e07b2c9bd769"
+_SUPPORTED_SUBSETS = {"train", "test"}
+
+
+class DR_VCTK(Dataset):
+    """*Device Recorded VCTK (Small subset version)* :cite:`Sarfjoo2018DeviceRV` dataset.
+
+    Args:
+        root (str or Path): Root directory where the dataset's top level directory is found.
+        subset (str): The subset to use. Can be one of ``"train"`` and ``"test"``. (default: ``"train"``).
+        download (bool):
+            Whether to download the dataset if it is not found at root path. (default: ``False``).
+        url (str): The URL to download the dataset from.
+            (default: ``"https://datashare.ed.ac.uk/bitstream/handle/10283/3038/DR-VCTK.zip"``)
+    """
+
+    def __init__(
+        self,
+        root: Union[str, Path],
+        subset: str = "train",
+        *,
+        download: bool = False,
+        url: str = _URL,
+    ) -> None:
+        if subset not in _SUPPORTED_SUBSETS:
+            raise RuntimeError(
+                f"The subset '{subset}' does not match any of the supported subsets: {_SUPPORTED_SUBSETS}"
+            )
+
+        root = Path(root).expanduser()
+        archive = root / "DR-VCTK.zip"
+
+        self._subset = subset
+        self._path = root / "DR-VCTK" / "DR-VCTK"
+        self._clean_audio_dir = self._path / f"clean_{self._subset}set_wav_16k"
+        self._noisy_audio_dir = self._path / f"device-recorded_{self._subset}set_wav_16k"
+        self._config_filepath = self._path / "configurations" / f"{self._subset}_ch_log.txt"
+
+        if not self._path.is_dir():
+            if not archive.is_file():
+                if not download:
+                    raise RuntimeError("Dataset not found. Please use `download=True` to download it.")
+                download_url_to_file(url, archive, hash_prefix=_CHECKSUM)
+            _extract_zip(archive, root)
+
+        self._config = self._load_config(self._config_filepath)
+        self._filename_list = sorted(self._config)
+
+    def _load_config(self, filepath: str) -> Dict[str, Tuple[str, int]]:
+        # Skip header
+        skip_rows = 2 if self._subset == "train" else 1
+
+        config = {}
+        with open(filepath) as f:
+            for i, line in enumerate(f):
+                if i < skip_rows or not line:
+                    continue
+                filename, source, channel_id = line.strip().split("\t")
+                config[filename] = (source, int(channel_id))
+        return config
+
+    def _load_dr_vctk_item(self, filename: str) -> Tuple[Tensor, int, Tensor, int, str, str, str, int]:
+        speaker_id, utterance_id = filename.split(".")[0].split("_")
+        source, channel_id = self._config[filename]
+        file_clean_audio = self._clean_audio_dir / filename
+        file_noisy_audio = self._noisy_audio_dir / filename
+        waveform_clean, sample_rate_clean = torchaudio.load(file_clean_audio)
+        waveform_noisy, sample_rate_noisy = torchaudio.load(file_noisy_audio)
+        return (
+            waveform_clean,
+            sample_rate_clean,
+            waveform_noisy,
+            sample_rate_noisy,
+            speaker_id,
+            utterance_id,
+            source,
+            channel_id,
+        )
+
+    def __getitem__(self, n: int) -> Tuple[Tensor, int, Tensor, int, str, str, str, int]:
+        """Load the n-th sample from the dataset.
+
+        Args:
+            n (int): The index of the sample to be loaded
+
+        Returns:
+            Tuple of the following items;
+
+            Tensor:
+                Clean waveform
+            int:
+                Sample rate of the clean waveform
+            Tensor:
+                Noisy waveform
+            int:
+                Sample rate of the noisy waveform
+            str:
+                Speaker ID
+            str:
+                Utterance ID
+            str:
+                Source
+            int:
+                Channel ID
+        """
+        filename = self._filename_list[n]
+        return self._load_dr_vctk_item(filename)
+
+    def __len__(self) -> int:
+        return len(self._filename_list)
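
A minimal usage sketch for the DR_VCTK class above; ./data is a hypothetical, pre-existing root directory into which the archive is downloaded and extracted:

from torchaudio.datasets import DR_VCTK

dataset = DR_VCTK("./data", subset="test", download=True)
(clean, sr_clean, noisy, sr_noisy,
 speaker_id, utterance_id, source, channel_id) = dataset[0]
print(sr_clean, sr_noisy, speaker_id, utterance_id, source, channel_id)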