torchaudio-2.9.1-cp311-cp311-manylinux_2_28_aarch64.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- torchaudio/__init__.py +204 -0
- torchaudio/_extension/__init__.py +61 -0
- torchaudio/_extension/utils.py +133 -0
- torchaudio/_internal/__init__.py +10 -0
- torchaudio/_internal/module_utils.py +171 -0
- torchaudio/_torchcodec.py +340 -0
- torchaudio/compliance/__init__.py +5 -0
- torchaudio/compliance/kaldi.py +813 -0
- torchaudio/datasets/__init__.py +47 -0
- torchaudio/datasets/cmuarctic.py +157 -0
- torchaudio/datasets/cmudict.py +186 -0
- torchaudio/datasets/commonvoice.py +86 -0
- torchaudio/datasets/dr_vctk.py +121 -0
- torchaudio/datasets/fluentcommands.py +108 -0
- torchaudio/datasets/gtzan.py +1118 -0
- torchaudio/datasets/iemocap.py +147 -0
- torchaudio/datasets/librilight_limited.py +111 -0
- torchaudio/datasets/librimix.py +133 -0
- torchaudio/datasets/librispeech.py +174 -0
- torchaudio/datasets/librispeech_biasing.py +189 -0
- torchaudio/datasets/libritts.py +168 -0
- torchaudio/datasets/ljspeech.py +107 -0
- torchaudio/datasets/musdb_hq.py +139 -0
- torchaudio/datasets/quesst14.py +136 -0
- torchaudio/datasets/snips.py +157 -0
- torchaudio/datasets/speechcommands.py +183 -0
- torchaudio/datasets/tedlium.py +218 -0
- torchaudio/datasets/utils.py +54 -0
- torchaudio/datasets/vctk.py +143 -0
- torchaudio/datasets/voxceleb1.py +309 -0
- torchaudio/datasets/yesno.py +89 -0
- torchaudio/functional/__init__.py +130 -0
- torchaudio/functional/_alignment.py +128 -0
- torchaudio/functional/filtering.py +1685 -0
- torchaudio/functional/functional.py +2505 -0
- torchaudio/lib/__init__.py +0 -0
- torchaudio/lib/_torchaudio.so +0 -0
- torchaudio/lib/libtorchaudio.so +0 -0
- torchaudio/models/__init__.py +85 -0
- torchaudio/models/_hdemucs.py +1008 -0
- torchaudio/models/conformer.py +293 -0
- torchaudio/models/conv_tasnet.py +330 -0
- torchaudio/models/decoder/__init__.py +64 -0
- torchaudio/models/decoder/_ctc_decoder.py +568 -0
- torchaudio/models/decoder/_cuda_ctc_decoder.py +187 -0
- torchaudio/models/deepspeech.py +84 -0
- torchaudio/models/emformer.py +884 -0
- torchaudio/models/rnnt.py +816 -0
- torchaudio/models/rnnt_decoder.py +339 -0
- torchaudio/models/squim/__init__.py +11 -0
- torchaudio/models/squim/objective.py +326 -0
- torchaudio/models/squim/subjective.py +150 -0
- torchaudio/models/tacotron2.py +1046 -0
- torchaudio/models/wav2letter.py +72 -0
- torchaudio/models/wav2vec2/__init__.py +45 -0
- torchaudio/models/wav2vec2/components.py +1167 -0
- torchaudio/models/wav2vec2/model.py +1579 -0
- torchaudio/models/wav2vec2/utils/__init__.py +7 -0
- torchaudio/models/wav2vec2/utils/import_fairseq.py +213 -0
- torchaudio/models/wav2vec2/utils/import_huggingface.py +134 -0
- torchaudio/models/wav2vec2/wavlm_attention.py +214 -0
- torchaudio/models/wavernn.py +409 -0
- torchaudio/pipelines/__init__.py +102 -0
- torchaudio/pipelines/_source_separation_pipeline.py +109 -0
- torchaudio/pipelines/_squim_pipeline.py +156 -0
- torchaudio/pipelines/_tts/__init__.py +16 -0
- torchaudio/pipelines/_tts/impl.py +385 -0
- torchaudio/pipelines/_tts/interface.py +255 -0
- torchaudio/pipelines/_tts/utils.py +230 -0
- torchaudio/pipelines/_wav2vec2/__init__.py +0 -0
- torchaudio/pipelines/_wav2vec2/aligner.py +87 -0
- torchaudio/pipelines/_wav2vec2/impl.py +1699 -0
- torchaudio/pipelines/_wav2vec2/utils.py +346 -0
- torchaudio/pipelines/rnnt_pipeline.py +380 -0
- torchaudio/transforms/__init__.py +78 -0
- torchaudio/transforms/_multi_channel.py +467 -0
- torchaudio/transforms/_transforms.py +2138 -0
- torchaudio/utils/__init__.py +4 -0
- torchaudio/utils/download.py +89 -0
- torchaudio/version.py +2 -0
- torchaudio-2.9.1.dist-info/METADATA +133 -0
- torchaudio-2.9.1.dist-info/RECORD +85 -0
- torchaudio-2.9.1.dist-info/WHEEL +5 -0
- torchaudio-2.9.1.dist-info/licenses/LICENSE +25 -0
- torchaudio-2.9.1.dist-info/top_level.txt +1 -0

torchaudio/datasets/iemocap.py
@@ -0,0 +1,147 @@
+import os
+import re
+from pathlib import Path
+from typing import Optional, Tuple, Union
+
+from torch import Tensor
+from torch.utils.data import Dataset
+from torchaudio.datasets.utils import _load_waveform
+
+
+_SAMPLE_RATE = 16000
+
+
+def _get_wavs_paths(data_dir):
+    wav_dir = data_dir / "sentences" / "wav"
+    wav_paths = sorted(str(p) for p in wav_dir.glob("*/*.wav"))
+    relative_paths = []
+    for wav_path in wav_paths:
+        start = wav_path.find("Session")
+        wav_path = wav_path[start:]
+        relative_paths.append(wav_path)
+    return relative_paths
+
+
+class IEMOCAP(Dataset):
+    """*IEMOCAP* :cite:`iemocap` dataset.
+
+    Args:
+        root (str or Path): Root directory where the dataset's top level directory is found.
+        sessions (Tuple[int]): Tuple of sessions (1-5) to use. (Default: ``(1, 2, 3, 4, 5)``)
+        utterance_type (str or None, optional): Which type(s) of utterances to include in the dataset.
+            Options: ("scripted", "improvised", ``None``). If ``None``, both scripted and improvised
+            data are used.
+    """
+
+    def __init__(
+        self,
+        root: Union[str, Path],
+        sessions: Tuple[int, ...] = (1, 2, 3, 4, 5),
+        utterance_type: Optional[str] = None,
+    ):
+        root = Path(root)
+        self._path = root / "IEMOCAP"
+
+        if not os.path.isdir(self._path):
+            raise RuntimeError("Dataset not found.")
+
+        if utterance_type not in ["scripted", "improvised", None]:
+            raise ValueError("utterance_type must be one of 'scripted', 'improvised', or None")
+
+        all_data = []
+        self.data = []
+        self.mapping = {}
+
+        for session in sessions:
+            session_name = f"Session{session}"
+            session_dir = self._path / session_name
+
+            # get wav paths
+            wav_paths = _get_wavs_paths(session_dir)
+            for wav_path in wav_paths:
+                wav_stem = str(Path(wav_path).stem)
+                all_data.append(wav_stem)
+
+            # add labels
+            label_dir = session_dir / "dialog" / "EmoEvaluation"
+            query = "*.txt"
+            if utterance_type == "scripted":
+                query = "*script*.txt"
+            elif utterance_type == "improvised":
+                query = "*impro*.txt"
+            label_paths = label_dir.glob(query)
+
+            for label_path in label_paths:
+                with open(label_path, "r") as f:
+                    for line in f:
+                        if not line.startswith("["):
+                            continue
+                        line = re.split("[\t\n]", line)
+                        wav_stem = line[1]
+                        label = line[2]
+                        if wav_stem not in all_data:
+                            continue
+                        if label not in ["neu", "hap", "ang", "sad", "exc", "fru"]:
+                            continue
+                        self.mapping[wav_stem] = {}
+                        self.mapping[wav_stem]["label"] = label
+
+            for wav_path in wav_paths:
+                wav_stem = str(Path(wav_path).stem)
+                if wav_stem in self.mapping:
+                    self.data.append(wav_stem)
+                    self.mapping[wav_stem]["path"] = wav_path
+
+    def get_metadata(self, n: int) -> Tuple[str, int, str, str, str]:
+        """Get metadata for the n-th sample from the dataset. Returns filepath instead of waveform,
+        but otherwise returns the same fields as :py:meth:`__getitem__`.
+
+        Args:
+            n (int): The index of the sample to be loaded
+
+        Returns:
+            Tuple of the following items;
+
+            str:
+                Path to audio
+            int:
+                Sample rate
+            str:
+                File name
+            str:
+                Label (one of ``"neu"``, ``"hap"``, ``"ang"``, ``"sad"``, ``"exc"``, ``"fru"``)
+            str:
+                Speaker
+        """
+        wav_stem = self.data[n]
+        wav_path = self.mapping[wav_stem]["path"]
+        label = self.mapping[wav_stem]["label"]
+        speaker = wav_stem.split("_")[0]
+        return (wav_path, _SAMPLE_RATE, wav_stem, label, speaker)
+
+    def __getitem__(self, n: int) -> Tuple[Tensor, int, str, str, str]:
+        """Load the n-th sample from the dataset.
+
+        Args:
+            n (int): The index of the sample to be loaded
+
+        Returns:
+            Tuple of the following items;
+
+            Tensor:
+                Waveform
+            int:
+                Sample rate
+            str:
+                File name
+            str:
+                Label (one of ``"neu"``, ``"hap"``, ``"ang"``, ``"sad"``, ``"exc"``, ``"fru"``)
+            str:
+                Speaker
+        """
+        metadata = self.get_metadata(n)
+        waveform = _load_waveform(self._path, metadata[0], metadata[1])
+        return (waveform,) + metadata[1:]
+
+    def __len__(self):
+        return len(self.data)
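
The constructor and ``__getitem__`` above are the whole public surface of this dataset. A minimal usage sketch follows; it assumes ``IEMOCAP`` is re-exported from ``torchaudio.datasets`` (consistent with the package layout listed above) and that the IEMOCAP corpus, which torchaudio does not download, has been extracted so that ``<root>/IEMOCAP/Session1`` through ``Session5`` exist. The root path is a placeholder.

    from torchaudio.datasets import IEMOCAP

    # "/data" is a hypothetical root whose IEMOCAP/ directory holds Session1..Session5.
    dataset = IEMOCAP("/data", sessions=(1,), utterance_type="improvised")

    # Each item is (waveform, sample_rate, file_name, label, speaker).
    waveform, sample_rate, file_name, label, speaker = dataset[0]
    print(sample_rate, label, speaker)  # e.g. 16000 fru Ses01F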

torchaudio/datasets/librilight_limited.py
@@ -0,0 +1,111 @@
+import os
+from pathlib import Path
+from typing import List, Tuple, Union
+
+import torchaudio
+from torch import Tensor
+from torch.utils.data import Dataset
+from torchaudio._internal import download_url_to_file
+from torchaudio.datasets.librispeech import _get_librispeech_metadata
+from torchaudio.datasets.utils import _extract_tar
+
+
+_ARCHIVE_NAME = "librispeech_finetuning"
+_URL = "https://dl.fbaipublicfiles.com/librilight/data/librispeech_finetuning.tgz"
+_CHECKSUM = "5d1efdc777b548194d7e09ba89126e2188026df9fd57aa57eb14408d2b2342af"
+_SUBSET_MAP = {"10min": ["1h/0"], "1h": ["1h/*"], "10h": ["1h/*", "9h"]}
+
+
+def _get_fileids_paths(path: Path, folders: List[str], _ext_audio: str) -> List[Tuple[str, str]]:
+    """Get the file names and the corresponding file paths without `speaker_id`
+    and `chapter_id` directories.
+    The format of path is like:
+        {root}/{_ARCHIVE_NAME}/1h/[0-5]/[clean, other] or
+        {root}/{_ARCHIVE_NAME}/9h/[clean, other]
+
+    Args:
+        path (Path): Root path to the dataset.
+        folders (List[str]): Folders that contain the desired audio files.
+        _ext_audio (str): Extension of audio files.
+
+    Returns:
+        List[Tuple[str, str]]:
+            List of tuples where the first element is the relative path to the audio file.
+            The format of relative path is like:
+                1h/[0-5]/[clean, other] or 9h/[clean, other]
+            The second element is the file name without audio extension.
+    """
+
+    path = Path(path)
+    files_paths = []
+    for folder in folders:
+        paths = [p.relative_to(path) for p in path.glob(f"{folder}/*/*/*/*{_ext_audio}")]
+        files_paths += [(str(p.parent.parent.parent), str(p.stem)) for p in paths]  # get subset folder and file name
+    files_paths.sort(key=lambda x: x[0] + x[1])
+    return files_paths
+
+
+class LibriLightLimited(Dataset):
+    """Subset of Libri-light :cite:`librilight` dataset,
+    which was used in HuBERT :cite:`hsu2021hubert` for supervised fine-tuning.
+
+    Args:
+        root (str or Path): Path to the directory where the dataset is found or downloaded.
+        subset (str, optional): The subset to use. Options: [``"10min"``, ``"1h"``, ``"10h"``]
+            (Default: ``"10min"``).
+        download (bool, optional):
+            Whether to download the dataset if it is not found at root path. (default: ``False``).
+    """
+
+    _ext_txt = ".trans.txt"
+    _ext_audio = ".flac"
+
+    def __init__(
+        self,
+        root: Union[str, Path],
+        subset: str = "10min",
+        download: bool = False,
+    ) -> None:
+        if subset not in _SUBSET_MAP:
+            raise ValueError(f"`subset` must be one of {_SUBSET_MAP.keys()}. Found: {subset}")
+        folders = _SUBSET_MAP[subset]
+
+        root = os.fspath(root)
+        self._path = os.path.join(root, _ARCHIVE_NAME)
+        archive = os.path.join(root, f"{_ARCHIVE_NAME}.tgz")
+        if not os.path.isdir(self._path):
+            if not download:
+                raise RuntimeError("Dataset not found. Please use `download=True` to download")
+            if not os.path.isfile(archive):
+                download_url_to_file(_URL, archive, hash_prefix=_CHECKSUM)
+            _extract_tar(archive)
+        self._fileids_paths = _get_fileids_paths(self._path, folders, self._ext_audio)
+
+    def __getitem__(self, n: int) -> Tuple[Tensor, int, str, int, int, int]:
+        """Load the n-th sample from the dataset.
+
+        Args:
+            n (int): The index of the sample to be loaded
+        Returns:
+            Tuple of the following items;
+
+            Tensor:
+                Waveform
+            int:
+                Sample rate
+            str:
+                Transcript
+            int:
+                Speaker ID
+            int:
+                Chapter ID
+            int:
+                Utterance ID
+        """
+        file_path, fileid = self._fileids_paths[n]
+        metadata = _get_librispeech_metadata(fileid, self._path, file_path, self._ext_audio, self._ext_txt)
+        waveform, _ = torchaudio.load(os.path.join(self._path, metadata[0]))
+        return (waveform,) + metadata[1:]
+
+    def __len__(self) -> int:
+        return len(self._fileids_paths)
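
Unlike IEMOCAP above, this dataset can fetch its own archive. A short sketch, again assuming the class is re-exported from ``torchaudio.datasets`` and using a placeholder root:

    from torchaudio.datasets import LibriLightLimited

    # With download=True the librispeech_finetuning archive is fetched,
    # checksum-verified, and extracted under the placeholder root "./data".
    dataset = LibriLightLimited("./data", subset="10min", download=True)
    waveform, sample_rate, transcript, speaker_id, chapter_id, utterance_id = dataset[0]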

torchaudio/datasets/librimix.py
@@ -0,0 +1,133 @@
+import os
+from pathlib import Path
+from typing import List, Tuple, Union
+
+import torch
+from torch.utils.data import Dataset
+from torchaudio.datasets.utils import _load_waveform
+
+_TASKS_TO_MIXTURE = {
+    "sep_clean": "mix_clean",
+    "enh_single": "mix_single",
+    "enh_both": "mix_both",
+    "sep_noisy": "mix_both",
+}
+
+
+class LibriMix(Dataset):
+    r"""*LibriMix* :cite:`cosentino2020librimix` dataset.
+
+    Args:
+        root (str or Path): The path where the directory ``Libri2Mix`` or
+            ``Libri3Mix`` is stored. Not the path of those directories.
+        subset (str, optional): The subset to use. Options: [``"train-360"``, ``"train-100"``,
+            ``"dev"``, and ``"test"``] (Default: ``"train-360"``).
+        num_speakers (int, optional): The number of speakers, which determines the directories
+            to traverse. The Dataset will traverse ``s1`` to ``sN`` directories to collect
+            N source audios. (Default: 2)
+        sample_rate (int, optional): Sample rate of audio files. The ``sample_rate`` determines
+            from which subdirectory the audio files are fetched. If any of the audio files have a
+            different sample rate, a ``ValueError`` is raised. Options: [8000, 16000] (Default: 8000)
+        task (str, optional): The task of LibriMix.
+            Options: [``"enh_single"``, ``"enh_both"``, ``"sep_clean"``, ``"sep_noisy"``]
+            (Default: ``"sep_clean"``)
+        mode (str, optional): The mode when creating the mixture. If set to ``"min"``, the lengths of mixture
+            and sources are the minimum length of all sources. If set to ``"max"``, the lengths of mixture and
+            sources are zero padded to the maximum length of all sources.
+            Options: [``"min"``, ``"max"``]
+            (Default: ``"min"``)
+
+    Note:
+        The LibriMix dataset needs to be manually generated. Please check https://github.com/JorisCos/LibriMix
+    """
+
+    def __init__(
+        self,
+        root: Union[str, Path],
+        subset: str = "train-360",
+        num_speakers: int = 2,
+        sample_rate: int = 8000,
+        task: str = "sep_clean",
+        mode: str = "min",
+    ):
+        self.root = Path(root) / f"Libri{num_speakers}Mix"
+        if not os.path.exists(self.root):
+            raise RuntimeError(
+                f"The path {self.root} doesn't exist. "
+                "Please check the ``root`` path and ``num_speakers`` or download the dataset manually."
+            )
+        if mode not in ["max", "min"]:
+            raise ValueError(f'Expect ``mode`` to be one in ["min", "max"]. Found {mode}.')
+        if sample_rate == 8000:
+            mix_dir = self.root / "wav8k" / mode / subset
+        elif sample_rate == 16000:
+            mix_dir = self.root / "wav16k" / mode / subset
+        else:
+            raise ValueError(f"Unsupported sample rate. Found {sample_rate}.")
+        self.sample_rate = sample_rate
+        self.task = task
+
+        self.mix_dir = mix_dir / _TASKS_TO_MIXTURE[task]
+        if task == "enh_both":
+            self.src_dirs = [(mix_dir / "mix_clean")]
+        else:
+            self.src_dirs = [(mix_dir / f"s{i+1}") for i in range(num_speakers)]
+
+        self.files = [p.name for p in self.mix_dir.glob("*.wav")]
+        self.files.sort()
+
+    def _load_sample(self, key) -> Tuple[int, torch.Tensor, List[torch.Tensor]]:
+        metadata = self.get_metadata(key)
+        mixed = _load_waveform(self.root, metadata[1], metadata[0])
+        srcs = []
+        for i, path_ in enumerate(metadata[2]):
+            src = _load_waveform(self.root, path_, metadata[0])
+            if mixed.shape != src.shape:
+                raise ValueError(f"Different waveform shapes. mixed: {mixed.shape}, src[{i}]: {src.shape}")
+            srcs.append(src)
+        return self.sample_rate, mixed, srcs
+
+    def get_metadata(self, key: int) -> Tuple[int, str, List[str]]:
+        """Get metadata for the n-th sample from the dataset.
+
+        Args:
+            key (int): The index of the sample to be loaded
+
+        Returns:
+            Tuple of the following items;
+
+            int:
+                Sample rate
+            str:
+                Path to mixed audio
+            List of str:
+                List of paths to source audios
+        """
+        filename = self.files[key]
+        mixed_path = os.path.relpath(self.mix_dir / filename, self.root)
+        srcs_paths = []
+        for dir_ in self.src_dirs:
+            src = os.path.relpath(dir_ / filename, self.root)
+            srcs_paths.append(src)
+        return self.sample_rate, mixed_path, srcs_paths
+
+    def __len__(self) -> int:
+        return len(self.files)
+
+    def __getitem__(self, key: int) -> Tuple[int, torch.Tensor, List[torch.Tensor]]:
+        """Load the n-th sample from the dataset.
+
+        Args:
+            key (int): The index of the sample to be loaded
+
+        Returns:
+            Tuple of the following items;
+
+            int:
+                Sample rate
+            Tensor:
+                Mixture waveform
+            List of Tensors:
+                List of source waveforms
+        """
+        return self._load_sample(key)
|
|
|
1
|
+
import os
|
|
2
|
+
from pathlib import Path
|
|
3
|
+
from typing import Tuple, Union
|
|
4
|
+
|
|
5
|
+
from torch import Tensor
|
|
6
|
+
from torch.utils.data import Dataset
|
|
7
|
+
from torchaudio._internal import download_url_to_file
|
|
8
|
+
from torchaudio.datasets.utils import _extract_tar, _load_waveform
|
|
9
|
+
|
|
10
|
+
URL = "train-clean-100"
|
|
11
|
+
FOLDER_IN_ARCHIVE = "LibriSpeech"
|
|
12
|
+
SAMPLE_RATE = 16000
|
|
13
|
+
_DATA_SUBSETS = [
|
|
14
|
+
"dev-clean",
|
|
15
|
+
"dev-other",
|
|
16
|
+
"test-clean",
|
|
17
|
+
"test-other",
|
|
18
|
+
"train-clean-100",
|
|
19
|
+
"train-clean-360",
|
|
20
|
+
"train-other-500",
|
|
21
|
+
]
|
|
22
|
+
_CHECKSUMS = {
|
|
23
|
+
"http://www.openslr.org/resources/12/dev-clean.tar.gz": "76f87d090650617fca0cac8f88b9416e0ebf80350acb97b343a85fa903728ab3", # noqa: E501
|
|
24
|
+
"http://www.openslr.org/resources/12/dev-other.tar.gz": "12661c48e8c3fe1de2c1caa4c3e135193bfb1811584f11f569dd12645aa84365", # noqa: E501
|
|
25
|
+
"http://www.openslr.org/resources/12/test-clean.tar.gz": "39fde525e59672dc6d1551919b1478f724438a95aa55f874b576be21967e6c23", # noqa: E501
|
|
26
|
+
"http://www.openslr.org/resources/12/test-other.tar.gz": "d09c181bba5cf717b3dee7d4d592af11a3ee3a09e08ae025c5506f6ebe961c29", # noqa: E501
|
|
27
|
+
"http://www.openslr.org/resources/12/train-clean-100.tar.gz": "d4ddd1d5a6ab303066f14971d768ee43278a5f2a0aa43dc716b0e64ecbbbf6e2", # noqa: E501
|
|
28
|
+
"http://www.openslr.org/resources/12/train-clean-360.tar.gz": "146a56496217e96c14334a160df97fffedd6e0a04e66b9c5af0d40be3c792ecf", # noqa: E501
|
|
29
|
+
"http://www.openslr.org/resources/12/train-other-500.tar.gz": "ddb22f27f96ec163645d53215559df6aa36515f26e01dd70798188350adcb6d2", # noqa: E501
|
|
30
|
+
}
|
|
31
|
+
|
|
32
|
+
|
|
33
|
+
def _download_librispeech(root, url):
|
|
34
|
+
base_url = "http://www.openslr.org/resources/12/"
|
|
35
|
+
ext_archive = ".tar.gz"
|
|
36
|
+
|
|
37
|
+
filename = url + ext_archive
|
|
38
|
+
archive = os.path.join(root, filename)
|
|
39
|
+
download_url = os.path.join(base_url, filename)
|
|
40
|
+
if not os.path.isfile(archive):
|
|
41
|
+
checksum = _CHECKSUMS.get(download_url, None)
|
|
42
|
+
download_url_to_file(download_url, archive, hash_prefix=checksum)
|
|
43
|
+
_extract_tar(archive)
|
|
44
|
+
|
|
45
|
+
|
|
46
|
+
def _get_librispeech_metadata(
|
|
47
|
+
fileid: str, root: str, folder: str, ext_audio: str, ext_txt: str
|
|
48
|
+
) -> Tuple[str, int, str, int, int, int]:
|
|
49
|
+
speaker_id, chapter_id, utterance_id = fileid.split("-")
|
|
50
|
+
|
|
51
|
+
# Get audio path and sample rate
|
|
52
|
+
fileid_audio = f"{speaker_id}-{chapter_id}-{utterance_id}"
|
|
53
|
+
filepath = os.path.join(folder, speaker_id, chapter_id, f"{fileid_audio}{ext_audio}")
|
|
54
|
+
|
|
55
|
+
# Load text
|
|
56
|
+
file_text = f"{speaker_id}-{chapter_id}{ext_txt}"
|
|
57
|
+
file_text = os.path.join(root, folder, speaker_id, chapter_id, file_text)
|
|
58
|
+
with open(file_text) as ft:
|
|
59
|
+
for line in ft:
|
|
60
|
+
fileid_text, transcript = line.strip().split(" ", 1)
|
|
61
|
+
if fileid_audio == fileid_text:
|
|
62
|
+
break
|
|
63
|
+
else:
|
|
64
|
+
# Translation not found
|
|
65
|
+
raise FileNotFoundError(f"Translation not found for {fileid_audio}")
|
|
66
|
+
|
|
67
|
+
return (
|
|
68
|
+
filepath,
|
|
69
|
+
SAMPLE_RATE,
|
|
70
|
+
transcript,
|
|
71
|
+
int(speaker_id),
|
|
72
|
+
int(chapter_id),
|
|
73
|
+
int(utterance_id),
|
|
74
|
+
)
|
|
75
|
+
|
|
76
|
+
|
|
77
|
+
class LIBRISPEECH(Dataset):
|
|
78
|
+
"""*LibriSpeech* :cite:`7178964` dataset.
|
|
79
|
+
|
|
80
|
+
Args:
|
|
81
|
+
root (str or Path): Path to the directory where the dataset is found or downloaded.
|
|
82
|
+
url (str, optional): The URL to download the dataset from,
|
|
83
|
+
or the type of the dataset to dowload.
|
|
84
|
+
Allowed type values are ``"dev-clean"``, ``"dev-other"``, ``"test-clean"``,
|
|
85
|
+
``"test-other"``, ``"train-clean-100"``, ``"train-clean-360"`` and
|
|
86
|
+
``"train-other-500"``. (default: ``"train-clean-100"``)
|
|
87
|
+
folder_in_archive (str, optional):
|
|
88
|
+
The top-level directory of the dataset. (default: ``"LibriSpeech"``)
|
|
89
|
+
download (bool, optional):
|
|
90
|
+
Whether to download the dataset if it is not found at root path. (default: ``False``).
|
|
91
|
+
"""
|
|
92
|
+
|
|
93
|
+
_ext_txt = ".trans.txt"
|
|
94
|
+
_ext_audio = ".flac"
|
|
95
|
+
|
|
96
|
+
def __init__(
|
|
97
|
+
self,
|
|
98
|
+
root: Union[str, Path],
|
|
99
|
+
url: str = URL,
|
|
100
|
+
folder_in_archive: str = FOLDER_IN_ARCHIVE,
|
|
101
|
+
download: bool = False,
|
|
102
|
+
) -> None:
|
|
103
|
+
self._url = url
|
|
104
|
+
if url not in _DATA_SUBSETS:
|
|
105
|
+
raise ValueError(f"Invalid url '{url}' given; please provide one of {_DATA_SUBSETS}.")
|
|
106
|
+
|
|
107
|
+
root = os.fspath(root)
|
|
108
|
+
self._archive = os.path.join(root, folder_in_archive)
|
|
109
|
+
self._path = os.path.join(root, folder_in_archive, url)
|
|
110
|
+
|
|
111
|
+
if not os.path.isdir(self._path):
|
|
112
|
+
if download:
|
|
113
|
+
_download_librispeech(root, url)
|
|
114
|
+
else:
|
|
115
|
+
raise RuntimeError(
|
|
116
|
+
f"Dataset not found at {self._path}. Please set `download=True` to download the dataset."
|
|
117
|
+
)
|
|
118
|
+
|
|
119
|
+
self._walker = sorted(str(p.stem) for p in Path(self._path).glob("*/*/*" + self._ext_audio))
|
|
120
|
+
|
|
121
|
+
def get_metadata(self, n: int) -> Tuple[str, int, str, int, int, int]:
|
|
122
|
+
"""Get metadata for the n-th sample from the dataset. Returns filepath instead of waveform,
|
|
123
|
+
but otherwise returns the same fields as :py:func:`__getitem__`.
|
|
124
|
+
|
|
125
|
+
Args:
|
|
126
|
+
n (int): The index of the sample to be loaded
|
|
127
|
+
|
|
128
|
+
Returns:
|
|
129
|
+
Tuple of the following items;
|
|
130
|
+
|
|
131
|
+
str:
|
|
132
|
+
Path to audio
|
|
133
|
+
int:
|
|
134
|
+
Sample rate
|
|
135
|
+
str:
|
|
136
|
+
Transcript
|
|
137
|
+
int:
|
|
138
|
+
Speaker ID
|
|
139
|
+
int:
|
|
140
|
+
Chapter ID
|
|
141
|
+
int:
|
|
142
|
+
Utterance ID
|
|
143
|
+
"""
|
|
144
|
+
fileid = self._walker[n]
|
|
145
|
+
return _get_librispeech_metadata(fileid, self._archive, self._url, self._ext_audio, self._ext_txt)
|
|
146
|
+
|
|
147
|
+
def __getitem__(self, n: int) -> Tuple[Tensor, int, str, int, int, int]:
|
|
148
|
+
"""Load the n-th sample from the dataset.
|
|
149
|
+
|
|
150
|
+
Args:
|
|
151
|
+
n (int): The index of the sample to be loaded
|
|
152
|
+
|
|
153
|
+
Returns:
|
|
154
|
+
Tuple of the following items;
|
|
155
|
+
|
|
156
|
+
Tensor:
|
|
157
|
+
Waveform
|
|
158
|
+
int:
|
|
159
|
+
Sample rate
|
|
160
|
+
str:
|
|
161
|
+
Transcript
|
|
162
|
+
int:
|
|
163
|
+
Speaker ID
|
|
164
|
+
int:
|
|
165
|
+
Chapter ID
|
|
166
|
+
int:
|
|
167
|
+
Utterance ID
|
|
168
|
+
"""
|
|
169
|
+
metadata = self.get_metadata(n)
|
|
170
|
+
waveform = _load_waveform(self._archive, metadata[0], metadata[1])
|
|
171
|
+
return (waveform,) + metadata[1:]
|
|
172
|
+
|
|
173
|
+
def __len__(self) -> int:
|
|
174
|
+
return len(self._walker)
|
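
A closing sketch of the download-and-iterate flow defined above, with a placeholder root:

    from torchaudio.datasets import LIBRISPEECH

    # The subset named by `url` is downloaded, checksum-verified against
    # _CHECKSUMS, and extracted under ./data/LibriSpeech/ on first use.
    dataset = LIBRISPEECH("./data", url="dev-clean", download=True)
    waveform, sample_rate, transcript, speaker_id, chapter_id, utterance_id = dataset[0]
    print(len(dataset), transcript[:40])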