torchaudio-2.9.0-cp314-cp314-macosx_11_0_arm64.whl
This diff shows the content of publicly available package versions as released to one of the supported registries. It is provided for informational purposes only and reflects the changes between package versions as they appear in their respective public registries.
- torchaudio/.dylibs/libc++.1.0.dylib +0 -0
- torchaudio/__init__.py +204 -0
- torchaudio/_extension/__init__.py +61 -0
- torchaudio/_extension/utils.py +133 -0
- torchaudio/_internal/__init__.py +10 -0
- torchaudio/_internal/module_utils.py +171 -0
- torchaudio/_torchcodec.py +340 -0
- torchaudio/compliance/__init__.py +5 -0
- torchaudio/compliance/kaldi.py +813 -0
- torchaudio/datasets/__init__.py +47 -0
- torchaudio/datasets/cmuarctic.py +157 -0
- torchaudio/datasets/cmudict.py +186 -0
- torchaudio/datasets/commonvoice.py +86 -0
- torchaudio/datasets/dr_vctk.py +121 -0
- torchaudio/datasets/fluentcommands.py +108 -0
- torchaudio/datasets/gtzan.py +1118 -0
- torchaudio/datasets/iemocap.py +147 -0
- torchaudio/datasets/librilight_limited.py +111 -0
- torchaudio/datasets/librimix.py +133 -0
- torchaudio/datasets/librispeech.py +174 -0
- torchaudio/datasets/librispeech_biasing.py +189 -0
- torchaudio/datasets/libritts.py +168 -0
- torchaudio/datasets/ljspeech.py +107 -0
- torchaudio/datasets/musdb_hq.py +139 -0
- torchaudio/datasets/quesst14.py +136 -0
- torchaudio/datasets/snips.py +157 -0
- torchaudio/datasets/speechcommands.py +183 -0
- torchaudio/datasets/tedlium.py +218 -0
- torchaudio/datasets/utils.py +54 -0
- torchaudio/datasets/vctk.py +143 -0
- torchaudio/datasets/voxceleb1.py +309 -0
- torchaudio/datasets/yesno.py +89 -0
- torchaudio/functional/__init__.py +130 -0
- torchaudio/functional/_alignment.py +128 -0
- torchaudio/functional/filtering.py +1685 -0
- torchaudio/functional/functional.py +2505 -0
- torchaudio/lib/__init__.py +0 -0
- torchaudio/lib/_torchaudio.so +0 -0
- torchaudio/lib/libtorchaudio.so +0 -0
- torchaudio/models/__init__.py +85 -0
- torchaudio/models/_hdemucs.py +1008 -0
- torchaudio/models/conformer.py +293 -0
- torchaudio/models/conv_tasnet.py +330 -0
- torchaudio/models/decoder/__init__.py +64 -0
- torchaudio/models/decoder/_ctc_decoder.py +568 -0
- torchaudio/models/decoder/_cuda_ctc_decoder.py +187 -0
- torchaudio/models/deepspeech.py +84 -0
- torchaudio/models/emformer.py +884 -0
- torchaudio/models/rnnt.py +816 -0
- torchaudio/models/rnnt_decoder.py +339 -0
- torchaudio/models/squim/__init__.py +11 -0
- torchaudio/models/squim/objective.py +326 -0
- torchaudio/models/squim/subjective.py +150 -0
- torchaudio/models/tacotron2.py +1046 -0
- torchaudio/models/wav2letter.py +72 -0
- torchaudio/models/wav2vec2/__init__.py +45 -0
- torchaudio/models/wav2vec2/components.py +1167 -0
- torchaudio/models/wav2vec2/model.py +1579 -0
- torchaudio/models/wav2vec2/utils/__init__.py +7 -0
- torchaudio/models/wav2vec2/utils/import_fairseq.py +213 -0
- torchaudio/models/wav2vec2/utils/import_huggingface.py +134 -0
- torchaudio/models/wav2vec2/wavlm_attention.py +214 -0
- torchaudio/models/wavernn.py +409 -0
- torchaudio/pipelines/__init__.py +102 -0
- torchaudio/pipelines/_source_separation_pipeline.py +109 -0
- torchaudio/pipelines/_squim_pipeline.py +156 -0
- torchaudio/pipelines/_tts/__init__.py +16 -0
- torchaudio/pipelines/_tts/impl.py +385 -0
- torchaudio/pipelines/_tts/interface.py +255 -0
- torchaudio/pipelines/_tts/utils.py +230 -0
- torchaudio/pipelines/_wav2vec2/__init__.py +0 -0
- torchaudio/pipelines/_wav2vec2/aligner.py +87 -0
- torchaudio/pipelines/_wav2vec2/impl.py +1699 -0
- torchaudio/pipelines/_wav2vec2/utils.py +346 -0
- torchaudio/pipelines/rnnt_pipeline.py +380 -0
- torchaudio/transforms/__init__.py +78 -0
- torchaudio/transforms/_multi_channel.py +467 -0
- torchaudio/transforms/_transforms.py +2138 -0
- torchaudio/utils/__init__.py +4 -0
- torchaudio/utils/download.py +89 -0
- torchaudio/version.py +2 -0
- torchaudio-2.9.0.dist-info/LICENSE +25 -0
- torchaudio-2.9.0.dist-info/METADATA +122 -0
- torchaudio-2.9.0.dist-info/RECORD +86 -0
- torchaudio-2.9.0.dist-info/WHEEL +5 -0
- torchaudio-2.9.0.dist-info/top_level.txt +1 -0
torchaudio/datasets/quesst14.py
@@ -0,0 +1,136 @@
+import os
+import re
+from pathlib import Path
+from typing import Optional, Tuple, Union
+
+import torch
+from torch.utils.data import Dataset
+from torchaudio._internal import download_url_to_file
+from torchaudio.datasets.utils import _extract_tar, _load_waveform
+
+
+URL = "https://speech.fit.vutbr.cz/files/quesst14Database.tgz"
+SAMPLE_RATE = 8000
+_CHECKSUM = "4f869e06bc066bbe9c5dde31dbd3909a0870d70291110ebbb38878dcbc2fc5e4"
+_LANGUAGES = [
+    "albanian",
+    "basque",
+    "czech",
+    "nnenglish",
+    "romanian",
+    "slovak",
+]
+
+
+class QUESST14(Dataset):
+    """*QUESST14* :cite:`Mir2015QUESST2014EQ` dataset.
+
+    Args:
+        root (str or Path): Root directory where the dataset's top level directory is found.
+        subset (str): Subset of the dataset to use. Options: [``"docs"``, ``"dev"``, ``"eval"``].
+        language (str or None, optional): Language to get dataset for.
+            Options: [``None``, ``albanian``, ``basque``, ``czech``, ``nnenglish``, ``romanian``, ``slovak``].
+            If ``None``, dataset consists of all languages. (default: ``"nnenglish"``)
+        download (bool, optional): Whether to download the dataset if it is not found at root path.
+            (default: ``False``)
+    """
+
+    def __init__(
+        self,
+        root: Union[str, Path],
+        subset: str,
+        language: Optional[str] = "nnenglish",
+        download: bool = False,
+    ) -> None:
+        if subset not in ["docs", "dev", "eval"]:
+            raise ValueError("`subset` must be one of ['docs', 'dev', 'eval']")
+
+        if language is not None and language not in _LANGUAGES:
+            raise ValueError(f"`language` must be None or one of {str(_LANGUAGES)}")
+
+        # Get string representation of 'root'
+        root = os.fspath(root)
+
+        basename = os.path.basename(URL)
+        archive = os.path.join(root, basename)
+
+        basename = basename.rsplit(".", 2)[0]
+        self._path = os.path.join(root, basename)
+
+        if not os.path.isdir(self._path):
+            if not os.path.isfile(archive):
+                if not download:
+                    raise RuntimeError("Dataset not found. Please use `download=True` to download")
+                download_url_to_file(URL, archive, hash_prefix=_CHECKSUM)
+            _extract_tar(archive, root)
+
+        if subset == "docs":
+            self.data = filter_audio_paths(self._path, language, "language_key_utterances.lst")
+        elif subset == "dev":
+            self.data = filter_audio_paths(self._path, language, "language_key_dev.lst")
+        elif subset == "eval":
+            self.data = filter_audio_paths(self._path, language, "language_key_eval.lst")
+
+    def get_metadata(self, n: int) -> Tuple[str, int, str]:
+        """Get metadata for the n-th sample from the dataset. Returns filepath instead of waveform,
+        but otherwise returns the same fields as :py:func:`__getitem__`.
+
+        Args:
+            n (int): The index of the sample to be loaded
+
+        Returns:
+            Tuple of the following items:
+
+            str:
+                Path to audio
+            int:
+                Sample rate
+            str:
+                File name
+        """
+        audio_path = self.data[n]
+        relpath = os.path.relpath(audio_path, self._path)
+        return relpath, SAMPLE_RATE, audio_path.with_suffix("").name
+
+    def __getitem__(self, n: int) -> Tuple[torch.Tensor, int, str]:
+        """Load the n-th sample from the dataset.
+
+        Args:
+            n (int): The index of the sample to be loaded
+
+        Returns:
+            Tuple of the following items:
+
+            Tensor:
+                Waveform
+            int:
+                Sample rate
+            str:
+                File name
+        """
+        metadata = self.get_metadata(n)
+        waveform = _load_waveform(self._path, metadata[0], metadata[1])
+        return (waveform,) + metadata[1:]
+
+    def __len__(self) -> int:
+        return len(self.data)
+
+
+def filter_audio_paths(
+    path: str,
+    language: Optional[str],
+    lst_name: str,
+):
+    """Extract audio paths for the given language."""
+    audio_paths = []
+
+    path = Path(path)
+    with open(path / "scoring" / lst_name) as f:
+        for line in f:
+            audio_path, lang = line.strip().split()
+            if language is not None and lang != language:
+                continue
+            audio_path = re.sub(r"^.*?\/", "", audio_path)
+            audio_paths.append(path / audio_path)
+
+    return audio_paths
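The loader above follows the pattern shared by all dataset classes in this wheel: the constructor resolves (and optionally downloads and extracts) the data, get_metadata returns paths and labels without decoding audio, and __getitem__ decodes via _load_waveform. A minimal usage sketch, where the "./data" root is a placeholder directory, not part of the package:

from torchaudio.datasets import QUESST14

# Downloads and extracts quesst14Database.tgz into ./data on first run.
dataset = QUESST14("./data", subset="docs", language="nnenglish", download=True)
waveform, sample_rate, file_name = dataset[0]  # 8 kHz query-by-example audio
print(waveform.shape, sample_rate, file_name)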
torchaudio/datasets/snips.py
@@ -0,0 +1,157 @@
+import os
+from pathlib import Path
+from typing import List, Optional, Tuple, Union
+
+import torch
+from torch.utils.data import Dataset
+from torchaudio.datasets.utils import _load_waveform
+
+
+_SAMPLE_RATE = 16000
+_SPEAKERS = [
+    "Aditi",
+    "Amy",
+    "Brian",
+    "Emma",
+    "Geraint",
+    "Ivy",
+    "Joanna",
+    "Joey",
+    "Justin",
+    "Kendra",
+    "Kimberly",
+    "Matthew",
+    "Nicole",
+    "Raveena",
+    "Russell",
+    "Salli",
+]
+
+
+def _load_labels(file: Path, subset: str):
+    """Load transcript, IOB, and intent labels for all utterances.
+
+    Args:
+        file (Path): The path to the label file.
+        subset (str): Subset of the dataset to use. Options: [``"train"``, ``"valid"``, ``"test"``].
+
+    Returns:
+        Dictionary of labels, where the key is the filename of the audio,
+        and the label is a Tuple of transcript, Inside–outside–beginning (IOB) label, and intention label.
+    """
+    labels = {}
+    with open(file, "r") as f:
+        for line in f:
+            line = line.strip().split(" ")
+            index = line[0]
+            trans, iob_intent = " ".join(line[1:]).split("\t")
+            trans = " ".join(trans.split(" ")[1:-1])
+            iob = " ".join(iob_intent.split(" ")[1:-1])
+            intent = iob_intent.split(" ")[-1]
+            if subset in index:
+                labels[index] = (trans, iob, intent)
+    return labels
+
+
+class Snips(Dataset):
+    """*Snips* :cite:`coucke2018snips` dataset.
+
+    Args:
+        root (str or Path): Root directory where the dataset's top level directory is found.
+        subset (str): Subset of the dataset to use. Options: [``"train"``, ``"valid"``, ``"test"``].
+        speakers (List[str] or None, optional): The speaker list to include in the dataset. If ``None``,
+            include all speakers in the subset. (Default: ``None``)
+        audio_format (str, optional): The extension of the audios. Options: [``"mp3"``, ``"wav"``].
+            (Default: ``"mp3"``)
+    """
+
+    _trans_file = "all.iob.snips.txt"
+
+    def __init__(
+        self,
+        root: Union[str, Path],
+        subset: str,
+        speakers: Optional[List[str]] = None,
+        audio_format: str = "mp3",
+    ) -> None:
+        if subset not in ["train", "valid", "test"]:
+            raise ValueError('`subset` must be one of ["train", "valid", "test"].')
+        if audio_format not in ["mp3", "wav"]:
+            raise ValueError('`audio_format` must be one of ["mp3", "wav"].')
+
+        root = Path(root)
+        self._path = root / "SNIPS"
+        self.audio_path = self._path / subset
+        if speakers is None:
+            speakers = _SPEAKERS
+
+        if not os.path.isdir(self._path):
+            raise RuntimeError("Dataset not found.")
+
+        self.audio_paths = self.audio_path.glob(f"*.{audio_format}")
+        self.data = []
+        for audio_path in sorted(self.audio_paths):
+            audio_name = str(audio_path.name)
+            speaker = audio_name.split("-")[0]
+            if speaker in speakers:
+                self.data.append(audio_path)
+        transcript_path = self._path / self._trans_file
+        self.labels = _load_labels(transcript_path, subset)
+
+    def get_metadata(self, n: int) -> Tuple[str, int, str, str, str, str]:
+        """Get metadata for the n-th sample from the dataset. Returns filepath instead of waveform,
+        but otherwise returns the same fields as :py:func:`__getitem__`.
+
+        Args:
+            n (int): The index of the sample to be loaded.
+
+        Returns:
+            Tuple of the following items:
+
+            str:
+                Path to audio
+            int:
+                Sample rate
+            str:
+                File name
+            str:
+                Transcription of audio
+            str:
+                Inside–outside–beginning (IOB) label of transcription
+            str:
+                Intention label of the audio.
+        """
+        audio_path = self.data[n]
+        relpath = os.path.relpath(audio_path, self._path)
+        file_name = audio_path.with_suffix("").name
+        transcript, iob, intent = self.labels[file_name]
+        return relpath, _SAMPLE_RATE, file_name, transcript, iob, intent
+
+    def __getitem__(self, n: int) -> Tuple[torch.Tensor, int, str, str, str, str]:
+        """Load the n-th sample from the dataset.
+
+        Args:
+            n (int): The index of the sample to be loaded
+
+        Returns:
+            Tuple of the following items:
+
+            Tensor:
+                Waveform
+            int:
+                Sample rate
+            str:
+                File name
+            str:
+                Transcription of audio
+            str:
+                Inside–outside–beginning (IOB) label of transcription
+            str:
+                Intention label of the audio.
+        """
+        metadata = self.get_metadata(n)
+        waveform = _load_waveform(self._path, metadata[0], metadata[1])
+        return (waveform,) + metadata[1:]
+
+    def __len__(self) -> int:
+        return len(self.data)
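Unlike QUESST14, Snips has no download path: the constructor raises RuntimeError if the SNIPS directory is missing, so the data must be placed under the root beforehand. A sketch, again with a placeholder "./data" root:

from torchaudio.datasets import Snips

# Assumes ./data/SNIPS was obtained separately; this loader never downloads.
dataset = Snips("./data", subset="train", speakers=["Aditi", "Amy"], audio_format="wav")
waveform, sample_rate, file_name, transcript, iob, intent = dataset[0]
print(transcript, intent)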
torchaudio/datasets/speechcommands.py
@@ -0,0 +1,183 @@
+import os
+from pathlib import Path
+from typing import Optional, Tuple, Union
+
+from torch import Tensor
+from torch.utils.data import Dataset
+from torchaudio._internal import download_url_to_file
+from torchaudio.datasets.utils import _extract_tar, _load_waveform
+
+FOLDER_IN_ARCHIVE = "SpeechCommands"
+URL = "speech_commands_v0.02"
+HASH_DIVIDER = "_nohash_"
+EXCEPT_FOLDER = "_background_noise_"
+SAMPLE_RATE = 16000
+_CHECKSUMS = {
+    "http://download.tensorflow.org/data/speech_commands_v0.01.tar.gz": "743935421bb51cccdb6bdd152e04c5c70274e935c82119ad7faeec31780d811d",  # noqa: E501
+    "http://download.tensorflow.org/data/speech_commands_v0.02.tar.gz": "af14739ee7dc311471de98f5f9d2c9191b18aedfe957f4a6ff791c709868ff58",  # noqa: E501
+}
+
+
+def _load_list(root, *filenames):
+    output = []
+    for filename in filenames:
+        filepath = os.path.join(root, filename)
+        with open(filepath) as fileobj:
+            output += [os.path.normpath(os.path.join(root, line.strip())) for line in fileobj]
+    return output
+
+
+def _get_speechcommands_metadata(filepath: str, path: str) -> Tuple[str, int, str, str, int]:
+    relpath = os.path.relpath(filepath, path)
+    reldir, filename = os.path.split(relpath)
+    _, label = os.path.split(reldir)
+    # Besides the officially supported split method for datasets defined by "validation_list.txt"
+    # and "testing_list.txt" over "speech_commands_v0.0x.tar.gz" archives, an alternative split
+    # method referred to in paragraph 2-3 of Section 7.1, references 13 and 14 of the original
+    # paper, and the checksums file from the tensorflow_datasets package [1] is also supported.
+    # Some filenames in those "speech_commands_test_set_v0.0x.tar.gz" archives have the form
+    # "xxx.wav.wav", so the file extension needs to be stripped twice.
+    # [1] https://github.com/tensorflow/datasets/blob/master/tensorflow_datasets/url_checksums/speech_commands.txt
+    speaker, _ = os.path.splitext(filename)
+    speaker, _ = os.path.splitext(speaker)
+
+    speaker_id, utterance_number = speaker.split(HASH_DIVIDER)
+    utterance_number = int(utterance_number)
+
+    return relpath, SAMPLE_RATE, label, speaker_id, utterance_number
+
+
+class SPEECHCOMMANDS(Dataset):
+    """*Speech Commands* :cite:`speechcommandsv2` dataset.
+
+    Args:
+        root (str or Path): Path to the directory where the dataset is found or downloaded.
+        url (str, optional): The URL to download the dataset from,
+            or the type of the dataset to download.
+            Allowed type values are ``"speech_commands_v0.01"`` and ``"speech_commands_v0.02"``
+            (default: ``"speech_commands_v0.02"``)
+        folder_in_archive (str, optional):
+            The top-level directory of the dataset. (default: ``"SpeechCommands"``)
+        download (bool, optional):
+            Whether to download the dataset if it is not found at root path. (default: ``False``).
+        subset (str or None, optional):
+            Select a subset of the dataset [None, "training", "validation", "testing"]. None means
+            the whole dataset. "validation" and "testing" are defined in "validation_list.txt" and
+            "testing_list.txt", respectively, and "training" is the rest. Details for the files
+            "validation_list.txt" and "testing_list.txt" are explained in the README of the dataset
+            and in the introduction of Section 7 of the original paper and its reference 12. The
+            original paper can be found `here <https://arxiv.org/pdf/1804.03209.pdf>`_. (Default: ``None``)
+    """

+
+    def __init__(
+        self,
+        root: Union[str, Path],
+        url: str = URL,
+        folder_in_archive: str = FOLDER_IN_ARCHIVE,
+        download: bool = False,
+        subset: Optional[str] = None,
+    ) -> None:
+
+        if subset is not None and subset not in ["training", "validation", "testing"]:
+            raise ValueError("When `subset` is not None, it must be one of ['training', 'validation', 'testing'].")
+
+        if url in [
+            "speech_commands_v0.01",
+            "speech_commands_v0.02",
+        ]:
+            base_url = "http://download.tensorflow.org/data/"
+            ext_archive = ".tar.gz"
+
+            url = os.path.join(base_url, url + ext_archive)
+
+        # Get string representation of 'root' in case Path object is passed
+        root = os.fspath(root)
+        self._archive = os.path.join(root, folder_in_archive)
+
+        basename = os.path.basename(url)
+        archive = os.path.join(root, basename)
+
+        basename = basename.rsplit(".", 2)[0]
+        folder_in_archive = os.path.join(folder_in_archive, basename)
+
+        self._path = os.path.join(root, folder_in_archive)
+
+        if download:
+            if not os.path.isdir(self._path):
+                if not os.path.isfile(archive):
+                    checksum = _CHECKSUMS.get(url, None)
+                    download_url_to_file(url, archive, hash_prefix=checksum)
+                _extract_tar(archive, self._path)
+        else:
+            if not os.path.exists(self._path):
+                raise RuntimeError(
+                    f"The path {self._path} doesn't exist. "
+                    "Please check the ``root`` path or set `download=True` to download it"
+                )
+
+        if subset == "validation":
+            self._walker = _load_list(self._path, "validation_list.txt")
+        elif subset == "testing":
+            self._walker = _load_list(self._path, "testing_list.txt")
+        elif subset == "training":
+            excludes = set(_load_list(self._path, "validation_list.txt", "testing_list.txt"))
+            walker = sorted(str(p) for p in Path(self._path).glob("*/*.wav"))
+            self._walker = [
+                w
+                for w in walker
+                if HASH_DIVIDER in w and EXCEPT_FOLDER not in w and os.path.normpath(w) not in excludes
+            ]
+        else:
+            walker = sorted(str(p) for p in Path(self._path).glob("*/*.wav"))
+            self._walker = [w for w in walker if HASH_DIVIDER in w and EXCEPT_FOLDER not in w]
+
+    def get_metadata(self, n: int) -> Tuple[str, int, str, str, int]:
+        """Get metadata for the n-th sample from the dataset. Returns filepath instead of waveform,
+        but otherwise returns the same fields as :py:func:`__getitem__`.
+
+        Args:
+            n (int): The index of the sample to be loaded
+
+        Returns:
+            Tuple of the following items:
+
+            str:
+                Path to the audio
+            int:
+                Sample rate
+            str:
+                Label
+            str:
+                Speaker ID
+            int:
+                Utterance number
+        """
+        fileid = self._walker[n]
+        return _get_speechcommands_metadata(fileid, self._archive)
+
+    def __getitem__(self, n: int) -> Tuple[Tensor, int, str, str, int]:
+        """Load the n-th sample from the dataset.
+
+        Args:
+            n (int): The index of the sample to be loaded
+
+        Returns:
+            Tuple of the following items:
+
+            Tensor:
+                Waveform
+            int:
+                Sample rate
+            str:
+                Label
+            str:
+                Speaker ID
+            int:
+                Utterance number
+        """
+        metadata = self.get_metadata(n)
+        waveform = _load_waveform(self._archive, metadata[0], metadata[1])
+        return (waveform,) + metadata[1:]
+
+    def __len__(self) -> int:
+        return len(self._walker)
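The subset logic above keeps the three splits disjoint: "validation" and "testing" come straight from the list files, and "training" is everything else minus the _background_noise_ folder. A sketch of the usual train/test setup (the "./data" root is a placeholder; clips are at most one second at 16 kHz, so batching generally needs a padding collate function):

from torchaudio.datasets import SPEECHCOMMANDS

train_set = SPEECHCOMMANDS("./data", download=True, subset="training")
test_set = SPEECHCOMMANDS("./data", download=True, subset="testing")
waveform, sample_rate, label, speaker_id, utterance_number = train_set[0]
print(len(train_set), len(test_set), label)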
torchaudio/datasets/tedlium.py
@@ -0,0 +1,218 @@
+import os
+from pathlib import Path
+from typing import Tuple, Union
+
+import torchaudio
+from torch import Tensor
+from torch.utils.data import Dataset
+from torchaudio._internal import download_url_to_file
+from torchaudio.datasets.utils import _extract_tar
+
+
+_RELEASE_CONFIGS = {
+    "release1": {
+        "folder_in_archive": "TEDLIUM_release1",
+        "url": "http://www.openslr.org/resources/7/TEDLIUM_release1.tar.gz",
+        "checksum": "30301975fd8c5cac4040c261c0852f57cfa8adbbad2ce78e77e4986957445f27",
+        "data_path": "",
+        "subset": "train",
+        "supported_subsets": ["train", "test", "dev"],
+        "dict": "TEDLIUM.150K.dic",
+    },
+    "release2": {
+        "folder_in_archive": "TEDLIUM_release2",
+        "url": "http://www.openslr.org/resources/19/TEDLIUM_release2.tar.gz",
+        "checksum": "93281b5fcaaae5c88671c9d000b443cb3c7ea3499ad12010b3934ca41a7b9c58",
+        "data_path": "",
+        "subset": "train",
+        "supported_subsets": ["train", "test", "dev"],
+        "dict": "TEDLIUM.152k.dic",
+    },
+    "release3": {
+        "folder_in_archive": "TEDLIUM_release-3",
+        "url": "http://www.openslr.org/resources/51/TEDLIUM_release-3.tgz",
+        "checksum": "ad1e454d14d1ad550bc2564c462d87c7a7ec83d4dc2b9210f22ab4973b9eccdb",
+        "data_path": "data/",
+        "subset": "train",
+        "supported_subsets": ["train", "test", "dev"],
+        "dict": "TEDLIUM.152k.dic",
+    },
+}
+
+
+class TEDLIUM(Dataset):
+    """*Tedlium* :cite:`rousseau2012tedlium` dataset (releases 1, 2 and 3).
+
+    Args:
+        root (str or Path): Path to the directory where the dataset is found or downloaded.
+        release (str, optional): Release version.
+            Allowed values are ``"release1"``, ``"release2"`` or ``"release3"``.
+            (default: ``"release1"``).
+        subset (str, optional): The subset of dataset to use. Valid options are ``"train"``, ``"dev"``,
+            and ``"test"``. Defaults to ``"train"``.
+        download (bool, optional):
+            Whether to download the dataset if it is not found at root path. (default: ``False``).
+        audio_ext (str, optional): extension for audio file (default: ``".sph"``)
+    """
+
+    def __init__(
+        self,
+        root: Union[str, Path],
+        release: str = "release1",
+        subset: str = "train",
+        download: bool = False,
+        audio_ext: str = ".sph",
+    ) -> None:
+        self._ext_audio = audio_ext
+        if release in _RELEASE_CONFIGS.keys():
+            folder_in_archive = _RELEASE_CONFIGS[release]["folder_in_archive"]
+            url = _RELEASE_CONFIGS[release]["url"]
+            subset = subset if subset else _RELEASE_CONFIGS[release]["subset"]
+        else:
+            # Raise error for unsupported release
+            raise RuntimeError(
+                "The release {} does not match any of the supported tedlium releases: {}".format(
+                    release,
+                    _RELEASE_CONFIGS.keys(),
+                )
+            )
+        if subset not in _RELEASE_CONFIGS[release]["supported_subsets"]:
+            # Raise error for unsupported subset
+            raise RuntimeError(
+                "The subset {} does not match any of the supported tedlium subsets: {}".format(
+                    subset,
+                    _RELEASE_CONFIGS[release]["supported_subsets"],
+                )
+            )
+
+        # Get string representation of 'root' in case Path object is passed
+        root = os.fspath(root)
+
+        basename = os.path.basename(url)
+        archive = os.path.join(root, basename)
+
+        basename = basename.split(".")[0]
+
+        if release == "release3":
+            if subset == "train":
+                self._path = os.path.join(root, folder_in_archive, _RELEASE_CONFIGS[release]["data_path"])
+            else:
+                self._path = os.path.join(root, folder_in_archive, "legacy", subset)
+        else:
+            self._path = os.path.join(root, folder_in_archive, _RELEASE_CONFIGS[release]["data_path"], subset)
+
+        if download:
+            if not os.path.isdir(self._path):
+                if not os.path.isfile(archive):
+                    checksum = _RELEASE_CONFIGS[release]["checksum"]
+                    download_url_to_file(url, archive, hash_prefix=checksum)
+                _extract_tar(archive)
+        else:
+            if not os.path.exists(self._path):
+                raise RuntimeError(
+                    f"The path {self._path} doesn't exist. "
+                    "Please check the ``root`` path or set `download=True` to download it"
+                )
+
+        # Create list for all samples
+        self._filelist = []
+        stm_dir = os.path.join(self._path, "stm")
+        for file in sorted(os.listdir(stm_dir)):
+            if file.endswith(".stm"):
+                stm_path = os.path.join(self._path, "stm", file)
+                with open(stm_path) as f:
+                    num_lines = len(f.readlines())
+                file = file.replace(".stm", "")
+                self._filelist.extend((file, line) for line in range(num_lines))
+        # Create dict path for later read
+        self._dict_path = os.path.join(root, folder_in_archive, _RELEASE_CONFIGS[release]["dict"])
+        self._phoneme_dict = None
+
+    def _load_tedlium_item(self, fileid: str, line: int, path: str) -> Tuple[Tensor, int, str, str, str, str]:
+        """Loads a TEDLIUM dataset sample given a file name and corresponding sentence name.
+
+        Args:
+            fileid (str): File id to identify both text and audio files corresponding to the sample
+            line (int): Line identifier for the sample inside the text file
+            path (str): Dataset root path
+
+        Returns:
+            (Tensor, int, str, str, str, str):
+            ``(waveform, sample_rate, transcript, talk_id, speaker_id, identifier)``
+        """
+        transcript_path = os.path.join(path, "stm", fileid)
+        with open(transcript_path + ".stm") as f:
+            transcript = f.readlines()[line]
+            talk_id, _, speaker_id, start_time, end_time, identifier, transcript = transcript.split(" ", 6)
+
+        wave_path = os.path.join(path, "sph", fileid)
+        waveform, sample_rate = self._load_audio(wave_path + self._ext_audio, start_time=start_time, end_time=end_time)
+
+        return (waveform, sample_rate, transcript, talk_id, speaker_id, identifier)
+
+    def _load_audio(self, path: str, start_time: float, end_time: float, sample_rate: int = 16000) -> Tuple[Tensor, int]:
+        """Default load function used in the TEDLIUM dataset. You can override this function to customize
+        how individual sentences are loaded from a full TED talk audio file.
+
+        Args:
+            path (str): Path to audio file
+            start_time (float): Time in seconds where the sample sentence starts
+            end_time (float): Time in seconds where the sample sentence finishes
+            sample_rate (int, optional): Sampling rate
+
+        Returns:
+            (Tensor, int): Audio tensor representation and sample rate
+        """
+        start_time = int(float(start_time) * sample_rate)
+        end_time = int(float(end_time) * sample_rate)
+
+        kwargs = {"frame_offset": start_time, "num_frames": end_time - start_time}
+
+        return torchaudio.load(path, **kwargs)
+
+    def __getitem__(self, n: int) -> Tuple[Tensor, int, str, str, str, str]:
+        """Load the n-th sample from the dataset.
+
+        Args:
+            n (int): The index of the sample to be loaded
+
+        Returns:
+            Tuple of the following items:
+
+            Tensor:
+                Waveform
+            int:
+                Sample rate
+            str:
+                Transcript
+            str:
+                Talk ID
+            str:
+                Speaker ID
+            str:
+                Identifier
+        """
+        fileid, line = self._filelist[n]
+        return self._load_tedlium_item(fileid, line, self._path)
+
+    def __len__(self) -> int:
+        """TEDLIUM dataset custom function overriding the default ``len`` behavior.
+
+        Returns:
+            int: TEDLIUM dataset length
+        """
+        return len(self._filelist)
+
+    @property
+    def phoneme_dict(self):
+        """dict[str, tuple[str]]: Phonemes. Mapping from word to tuple of phonemes.
+        Note that some words have empty phonemes.
+        """
+        # Read phoneme dictionary
+        if not self._phoneme_dict:
+            self._phoneme_dict = {}
+            with open(self._dict_path, "r", encoding="utf-8") as f:
+                for line in f.readlines():
+                    content = line.strip().split()
+                    self._phoneme_dict[content[0]] = tuple(content[1:])  # content[1:] can be empty list
+        return self._phoneme_dict.copy()
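Because _load_audio is an instance method, segment loading can be customized by subclassing, as its docstring suggests. A sketch that resamples every segment to 8 kHz (the subclass name and target rate are illustrative, not part of the package; "./data" is a placeholder root):

import torchaudio
from torchaudio.datasets import TEDLIUM

class ResampledTEDLIUM(TEDLIUM):
    def _load_audio(self, path, start_time, end_time, sample_rate=16000):
        # Load the segment as usual, then resample it before returning.
        waveform, sr = super()._load_audio(path, start_time, end_time, sample_rate)
        return torchaudio.transforms.Resample(sr, 8000)(waveform), 8000

dataset = ResampledTEDLIUM("./data", release="release1", subset="dev", download=True)
waveform, sample_rate, transcript, talk_id, speaker_id, identifier = dataset[0]
print(dataset.phoneme_dict.get("hello"))  # phoneme lookup from the bundled .dic file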