torchaudio-2.9.0-cp314-cp314-macosx_11_0_arm64.whl

This diff shows the content of publicly available package versions released to a supported registry. It is provided for informational purposes only and reflects the changes between package versions as they appear in their respective public registries.

Potentially problematic release: this version of torchaudio might be problematic.

Files changed (86)
  1. torchaudio/.dylibs/libc++.1.0.dylib +0 -0
  2. torchaudio/__init__.py +204 -0
  3. torchaudio/_extension/__init__.py +61 -0
  4. torchaudio/_extension/utils.py +133 -0
  5. torchaudio/_internal/__init__.py +10 -0
  6. torchaudio/_internal/module_utils.py +171 -0
  7. torchaudio/_torchcodec.py +340 -0
  8. torchaudio/compliance/__init__.py +5 -0
  9. torchaudio/compliance/kaldi.py +813 -0
  10. torchaudio/datasets/__init__.py +47 -0
  11. torchaudio/datasets/cmuarctic.py +157 -0
  12. torchaudio/datasets/cmudict.py +186 -0
  13. torchaudio/datasets/commonvoice.py +86 -0
  14. torchaudio/datasets/dr_vctk.py +121 -0
  15. torchaudio/datasets/fluentcommands.py +108 -0
  16. torchaudio/datasets/gtzan.py +1118 -0
  17. torchaudio/datasets/iemocap.py +147 -0
  18. torchaudio/datasets/librilight_limited.py +111 -0
  19. torchaudio/datasets/librimix.py +133 -0
  20. torchaudio/datasets/librispeech.py +174 -0
  21. torchaudio/datasets/librispeech_biasing.py +189 -0
  22. torchaudio/datasets/libritts.py +168 -0
  23. torchaudio/datasets/ljspeech.py +107 -0
  24. torchaudio/datasets/musdb_hq.py +139 -0
  25. torchaudio/datasets/quesst14.py +136 -0
  26. torchaudio/datasets/snips.py +157 -0
  27. torchaudio/datasets/speechcommands.py +183 -0
  28. torchaudio/datasets/tedlium.py +218 -0
  29. torchaudio/datasets/utils.py +54 -0
  30. torchaudio/datasets/vctk.py +143 -0
  31. torchaudio/datasets/voxceleb1.py +309 -0
  32. torchaudio/datasets/yesno.py +89 -0
  33. torchaudio/functional/__init__.py +130 -0
  34. torchaudio/functional/_alignment.py +128 -0
  35. torchaudio/functional/filtering.py +1685 -0
  36. torchaudio/functional/functional.py +2505 -0
  37. torchaudio/lib/__init__.py +0 -0
  38. torchaudio/lib/_torchaudio.so +0 -0
  39. torchaudio/lib/libtorchaudio.so +0 -0
  40. torchaudio/models/__init__.py +85 -0
  41. torchaudio/models/_hdemucs.py +1008 -0
  42. torchaudio/models/conformer.py +293 -0
  43. torchaudio/models/conv_tasnet.py +330 -0
  44. torchaudio/models/decoder/__init__.py +64 -0
  45. torchaudio/models/decoder/_ctc_decoder.py +568 -0
  46. torchaudio/models/decoder/_cuda_ctc_decoder.py +187 -0
  47. torchaudio/models/deepspeech.py +84 -0
  48. torchaudio/models/emformer.py +884 -0
  49. torchaudio/models/rnnt.py +816 -0
  50. torchaudio/models/rnnt_decoder.py +339 -0
  51. torchaudio/models/squim/__init__.py +11 -0
  52. torchaudio/models/squim/objective.py +326 -0
  53. torchaudio/models/squim/subjective.py +150 -0
  54. torchaudio/models/tacotron2.py +1046 -0
  55. torchaudio/models/wav2letter.py +72 -0
  56. torchaudio/models/wav2vec2/__init__.py +45 -0
  57. torchaudio/models/wav2vec2/components.py +1167 -0
  58. torchaudio/models/wav2vec2/model.py +1579 -0
  59. torchaudio/models/wav2vec2/utils/__init__.py +7 -0
  60. torchaudio/models/wav2vec2/utils/import_fairseq.py +213 -0
  61. torchaudio/models/wav2vec2/utils/import_huggingface.py +134 -0
  62. torchaudio/models/wav2vec2/wavlm_attention.py +214 -0
  63. torchaudio/models/wavernn.py +409 -0
  64. torchaudio/pipelines/__init__.py +102 -0
  65. torchaudio/pipelines/_source_separation_pipeline.py +109 -0
  66. torchaudio/pipelines/_squim_pipeline.py +156 -0
  67. torchaudio/pipelines/_tts/__init__.py +16 -0
  68. torchaudio/pipelines/_tts/impl.py +385 -0
  69. torchaudio/pipelines/_tts/interface.py +255 -0
  70. torchaudio/pipelines/_tts/utils.py +230 -0
  71. torchaudio/pipelines/_wav2vec2/__init__.py +0 -0
  72. torchaudio/pipelines/_wav2vec2/aligner.py +87 -0
  73. torchaudio/pipelines/_wav2vec2/impl.py +1699 -0
  74. torchaudio/pipelines/_wav2vec2/utils.py +346 -0
  75. torchaudio/pipelines/rnnt_pipeline.py +380 -0
  76. torchaudio/transforms/__init__.py +78 -0
  77. torchaudio/transforms/_multi_channel.py +467 -0
  78. torchaudio/transforms/_transforms.py +2138 -0
  79. torchaudio/utils/__init__.py +4 -0
  80. torchaudio/utils/download.py +89 -0
  81. torchaudio/version.py +2 -0
  82. torchaudio-2.9.0.dist-info/LICENSE +25 -0
  83. torchaudio-2.9.0.dist-info/METADATA +122 -0
  84. torchaudio-2.9.0.dist-info/RECORD +86 -0
  85. torchaudio-2.9.0.dist-info/WHEEL +5 -0
  86. torchaudio-2.9.0.dist-info/top_level.txt +1 -0
torchaudio/datasets/iemocap.py
@@ -0,0 +1,147 @@
+ import os
+ import re
+ from pathlib import Path
+ from typing import Optional, Tuple, Union
+
+ from torch import Tensor
+ from torch.utils.data import Dataset
+ from torchaudio.datasets.utils import _load_waveform
+
+
+ _SAMPLE_RATE = 16000
+
+
+ def _get_wavs_paths(data_dir):
+     wav_dir = data_dir / "sentences" / "wav"
+     wav_paths = sorted(str(p) for p in wav_dir.glob("*/*.wav"))
+     relative_paths = []
+     for wav_path in wav_paths:
+         start = wav_path.find("Session")
+         wav_path = wav_path[start:]
+         relative_paths.append(wav_path)
+     return relative_paths
+
+
+ class IEMOCAP(Dataset):
+     """*IEMOCAP* :cite:`iemocap` dataset.
+
+     Args:
+         root (str or Path): Root directory where the dataset's top level directory is found
+         sessions (Tuple[int]): Tuple of sessions (1-5) to use. (Default: ``(1, 2, 3, 4, 5)``)
+         utterance_type (str or None, optional): Which type(s) of utterances to include in the dataset.
+             Options: ("scripted", "improvised", ``None``). If ``None``, both scripted and improvised
+             data are used.
+     """
+
+     def __init__(
+         self,
+         root: Union[str, Path],
+         sessions: Tuple[int, ...] = (1, 2, 3, 4, 5),
+         utterance_type: Optional[str] = None,
+     ):
+         root = Path(root)
+         self._path = root / "IEMOCAP"
+
+         if not os.path.isdir(self._path):
+             raise RuntimeError("Dataset not found.")
+
+         if utterance_type not in ["scripted", "improvised", None]:
+             raise ValueError("utterance_type must be one of 'scripted', 'improvised', or None")
+
+         all_data = []
+         self.data = []
+         self.mapping = {}
+
+         for session in sessions:
+             session_name = f"Session{session}"
+             session_dir = self._path / session_name
+
+             # get wav paths
+             wav_paths = _get_wavs_paths(session_dir)
+             for wav_path in wav_paths:
+                 wav_stem = str(Path(wav_path).stem)
+                 all_data.append(wav_stem)
+
+             # add labels
+             label_dir = session_dir / "dialog" / "EmoEvaluation"
+             query = "*.txt"
+             if utterance_type == "scripted":
+                 query = "*script*.txt"
+             elif utterance_type == "improvised":
+                 query = "*impro*.txt"
+             label_paths = label_dir.glob(query)
+
+             for label_path in label_paths:
+                 with open(label_path, "r") as f:
+                     for line in f:
+                         if not line.startswith("["):
+                             continue
+                         line = re.split("[\t\n]", line)
+                         wav_stem = line[1]
+                         label = line[2]
+                         if wav_stem not in all_data:
+                             continue
+                         if label not in ["neu", "hap", "ang", "sad", "exc", "fru"]:
+                             continue
+                         self.mapping[wav_stem] = {}
+                         self.mapping[wav_stem]["label"] = label
+
+             for wav_path in wav_paths:
+                 wav_stem = str(Path(wav_path).stem)
+                 if wav_stem in self.mapping:
+                     self.data.append(wav_stem)
+                     self.mapping[wav_stem]["path"] = wav_path
+
+     def get_metadata(self, n: int) -> Tuple[str, int, str, str, str]:
+         """Get metadata for the n-th sample from the dataset. Returns filepath instead of waveform,
+         but otherwise returns the same fields as :py:meth:`__getitem__`.
+
+         Args:
+             n (int): The index of the sample to be loaded
+
+         Returns:
+             Tuple of the following items:
+
+             str:
+                 Path to audio
+             int:
+                 Sample rate
+             str:
+                 File name
+             str:
+                 Label (one of ``"neu"``, ``"hap"``, ``"ang"``, ``"sad"``, ``"exc"``, ``"fru"``)
+             str:
+                 Speaker
+         """
+         wav_stem = self.data[n]
+         wav_path = self.mapping[wav_stem]["path"]
+         label = self.mapping[wav_stem]["label"]
+         speaker = wav_stem.split("_")[0]
+         return (wav_path, _SAMPLE_RATE, wav_stem, label, speaker)
+
+     def __getitem__(self, n: int) -> Tuple[Tensor, int, str, str, str]:
+         """Load the n-th sample from the dataset.
+
+         Args:
+             n (int): The index of the sample to be loaded
+
+         Returns:
+             Tuple of the following items:
+
+             Tensor:
+                 Waveform
+             int:
+                 Sample rate
+             str:
+                 File name
+             str:
+                 Label (one of ``"neu"``, ``"hap"``, ``"ang"``, ``"sad"``, ``"exc"``, ``"fru"``)
+             str:
+                 Speaker
+         """
+         metadata = self.get_metadata(n)
+         waveform = _load_waveform(self._path, metadata[0], metadata[1])
+         return (waveform,) + metadata[1:]
+
+     def __len__(self):
+         return len(self.data)
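
For context, a minimal usage sketch of the IEMOCAP class added above (assuming the IEMOCAP corpus has already been obtained and unpacked under a hypothetical ./data directory, since this dataset offers no download option; the label and speaker values shown are illustrative):

    from torchaudio.datasets import IEMOCAP

    # Raises RuntimeError if ./data/IEMOCAP is missing.
    dataset = IEMOCAP("./data", sessions=(1, 2), utterance_type="improvised")

    waveform, sample_rate, file_name, label, speaker = dataset[0]
    print(sample_rate)      # 16000, fixed by _SAMPLE_RATE
    print(label, speaker)   # e.g. "neu", "Ses01F"
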
torchaudio/datasets/librilight_limited.py
@@ -0,0 +1,111 @@
+ import os
+ from pathlib import Path
+ from typing import List, Tuple, Union
+
+ import torchaudio
+ from torch import Tensor
+ from torch.utils.data import Dataset
+ from torchaudio._internal import download_url_to_file
+ from torchaudio.datasets.librispeech import _get_librispeech_metadata
+ from torchaudio.datasets.utils import _extract_tar
+
+
+ _ARCHIVE_NAME = "librispeech_finetuning"
+ _URL = "https://dl.fbaipublicfiles.com/librilight/data/librispeech_finetuning.tgz"
+ _CHECKSUM = "5d1efdc777b548194d7e09ba89126e2188026df9fd57aa57eb14408d2b2342af"
+ _SUBSET_MAP = {"10min": ["1h/0"], "1h": ["1h/*"], "10h": ["1h/*", "9h"]}
+
+
+ def _get_fileids_paths(path: Path, folders: List[str], _ext_audio: str) -> List[Tuple[str, str]]:
+     """Get the file names and the corresponding file paths without `speaker_id`
+     and `chapter_id` directories.
+     The format of path is like:
+         {root}/{_ARCHIVE_NAME}/1h/[0-5]/[clean, other] or
+         {root}/{_ARCHIVE_NAME}/9h/[clean, other]
+
+     Args:
+         path (Path): Root path to the dataset.
+         folders (List[str]): Folders that contain the desired audio files.
+         _ext_audio (str): Extension of audio files.
+
+     Returns:
+         List[Tuple[str, str]]:
+             List of tuples where the first element is the relative path to the audio file.
+             The format of the relative path is like:
+                 1h/[0-5]/[clean, other] or 9h/[clean, other]
+             The second element is the file name without the audio extension.
+     """
+     path = Path(path)
+     files_paths = []
+     for folder in folders:
+         paths = [p.relative_to(path) for p in path.glob(f"{folder}/*/*/*/*{_ext_audio}")]
+         files_paths += [(str(p.parent.parent.parent), str(p.stem)) for p in paths]  # get subset folder and file name
+     files_paths.sort(key=lambda x: x[0] + x[1])
+     return files_paths
+
+
+ class LibriLightLimited(Dataset):
+     """Subset of Libri-light :cite:`librilight` dataset,
+     which was used in HuBERT :cite:`hsu2021hubert` for supervised fine-tuning.
+
+     Args:
+         root (str or Path): Path to the directory where the dataset is found or downloaded.
+         subset (str, optional): The subset to use. Options: [``"10min"``, ``"1h"``, ``"10h"``]
+             (Default: ``"10min"``).
+         download (bool, optional):
+             Whether to download the dataset if it is not found at root path. (default: ``False``).
+     """
+
+     _ext_txt = ".trans.txt"
+     _ext_audio = ".flac"
+
+     def __init__(
+         self,
+         root: Union[str, Path],
+         subset: str = "10min",
+         download: bool = False,
+     ) -> None:
+         if subset not in _SUBSET_MAP:
+             raise ValueError(f"`subset` must be one of {list(_SUBSET_MAP.keys())}. Found: {subset}")
+         folders = _SUBSET_MAP[subset]
+
+         root = os.fspath(root)
+         self._path = os.path.join(root, _ARCHIVE_NAME)
+         archive = os.path.join(root, f"{_ARCHIVE_NAME}.tgz")
+         if not os.path.isdir(self._path):
+             if not download:
+                 raise RuntimeError("Dataset not found. Please use `download=True` to download.")
+             if not os.path.isfile(archive):
+                 download_url_to_file(_URL, archive, hash_prefix=_CHECKSUM)
+             _extract_tar(archive)
+         self._fileids_paths = _get_fileids_paths(self._path, folders, self._ext_audio)
+
+     def __getitem__(self, n: int) -> Tuple[Tensor, int, str, int, int, int]:
+         """Load the n-th sample from the dataset.
+
+         Args:
+             n (int): The index of the sample to be loaded
+
+         Returns:
+             Tuple of the following items:
+
+             Tensor:
+                 Waveform
+             int:
+                 Sample rate
+             str:
+                 Transcript
+             int:
+                 Speaker ID
+             int:
+                 Chapter ID
+             int:
+                 Utterance ID
+         """
+         file_path, fileid = self._fileids_paths[n]
+         metadata = _get_librispeech_metadata(fileid, self._path, file_path, self._ext_audio, self._ext_txt)
+         waveform, _ = torchaudio.load(os.path.join(self._path, metadata[0]))
+         return (waveform,) + metadata[1:]
+
+     def __len__(self) -> int:
+         return len(self._fileids_paths)
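
A minimal usage sketch of LibriLightLimited (assuming a writable, hypothetical ./data directory; with download=True the subset archive is fetched, verified against _CHECKSUM, and extracted on first use):

    from torchaudio.datasets import LibriLightLimited

    dataset = LibriLightLimited("./data", subset="10min", download=True)
    waveform, sample_rate, transcript, speaker_id, chapter_id, utterance_id = dataset[0]
    print(sample_rate, speaker_id, transcript)
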
torchaudio/datasets/librimix.py
@@ -0,0 +1,133 @@
+ import os
+ from pathlib import Path
+ from typing import List, Tuple, Union
+
+ import torch
+ from torch.utils.data import Dataset
+ from torchaudio.datasets.utils import _load_waveform
+
+ _TASKS_TO_MIXTURE = {
+     "sep_clean": "mix_clean",
+     "enh_single": "mix_single",
+     "enh_both": "mix_both",
+     "sep_noisy": "mix_both",
+ }
+
+
+ class LibriMix(Dataset):
+     r"""*LibriMix* :cite:`cosentino2020librimix` dataset.
+
+     Args:
+         root (str or Path): The path where the directory ``Libri2Mix`` or
+             ``Libri3Mix`` is stored. Not the path of those directories.
+         subset (str, optional): The subset to use. Options: [``"train-360"``, ``"train-100"``,
+             ``"dev"``, and ``"test"``] (Default: ``"train-360"``).
+         num_speakers (int, optional): The number of speakers, which determines the directories
+             to traverse. The dataset will traverse ``s1`` to ``sN`` directories to collect
+             N source audios. (Default: 2)
+         sample_rate (int, optional): Sample rate of audio files. The ``sample_rate`` determines
+             the subdirectory from which the audio files are fetched. If any of the audio has a
+             different sample rate, a ``ValueError`` is raised. Options: [8000, 16000] (Default: 8000)
+         task (str, optional): The task of LibriMix.
+             Options: [``"enh_single"``, ``"enh_both"``, ``"sep_clean"``, ``"sep_noisy"``]
+             (Default: ``"sep_clean"``)
+         mode (str, optional): The mode when creating the mixture. If set to ``"min"``, the lengths of the
+             mixture and sources are the minimum length of all sources. If set to ``"max"``, the lengths
+             of the mixture and sources are zero-padded to the maximum length of all sources.
+             Options: [``"min"``, ``"max"``]
+             (Default: ``"min"``)
+
+     Note:
+         The LibriMix dataset needs to be manually generated. Please check https://github.com/JorisCos/LibriMix
+     """
+
+     def __init__(
+         self,
+         root: Union[str, Path],
+         subset: str = "train-360",
+         num_speakers: int = 2,
+         sample_rate: int = 8000,
+         task: str = "sep_clean",
+         mode: str = "min",
+     ):
+         self.root = Path(root) / f"Libri{num_speakers}Mix"
+         if not os.path.exists(self.root):
+             raise RuntimeError(
+                 f"The path {self.root} doesn't exist. "
+                 "Please check the ``root`` path and ``num_speakers`` or download the dataset manually."
+             )
+         if mode not in ["max", "min"]:
+             raise ValueError(f'Expect ``mode`` to be one of ["min", "max"]. Found {mode}.')
+         if sample_rate == 8000:
+             mix_dir = self.root / "wav8k" / mode / subset
+         elif sample_rate == 16000:
+             mix_dir = self.root / "wav16k" / mode / subset
+         else:
+             raise ValueError(f"Unsupported sample rate. Found {sample_rate}.")
+         self.sample_rate = sample_rate
+         self.task = task
+
+         self.mix_dir = mix_dir / _TASKS_TO_MIXTURE[task]
+         if task == "enh_both":
+             self.src_dirs = [(mix_dir / "mix_clean")]
+         else:
+             self.src_dirs = [(mix_dir / f"s{i + 1}") for i in range(num_speakers)]
+
+         self.files = [p.name for p in self.mix_dir.glob("*.wav")]
+         self.files.sort()
+
+     def _load_sample(self, key) -> Tuple[int, torch.Tensor, List[torch.Tensor]]:
+         metadata = self.get_metadata(key)
+         mixed = _load_waveform(self.root, metadata[1], metadata[0])
+         srcs = []
+         for i, path_ in enumerate(metadata[2]):
+             src = _load_waveform(self.root, path_, metadata[0])
+             if mixed.shape != src.shape:
+                 raise ValueError(f"Different waveform shapes. mixed: {mixed.shape}, src[{i}]: {src.shape}")
+             srcs.append(src)
+         return self.sample_rate, mixed, srcs
+
+     def get_metadata(self, key: int) -> Tuple[int, str, List[str]]:
+         """Get metadata for the n-th sample from the dataset.
+
+         Args:
+             key (int): The index of the sample to be loaded
+
+         Returns:
+             Tuple of the following items:
+
+             int:
+                 Sample rate
+             str:
+                 Path to mixed audio
+             List of str:
+                 List of paths to source audios
+         """
+         filename = self.files[key]
+         mixed_path = os.path.relpath(self.mix_dir / filename, self.root)
+         srcs_paths = []
+         for dir_ in self.src_dirs:
+             src = os.path.relpath(dir_ / filename, self.root)
+             srcs_paths.append(src)
+         return self.sample_rate, mixed_path, srcs_paths
+
+     def __len__(self) -> int:
+         return len(self.files)
+
+     def __getitem__(self, key: int) -> Tuple[int, torch.Tensor, List[torch.Tensor]]:
+         """Load the n-th sample from the dataset.
+
+         Args:
+             key (int): The index of the sample to be loaded
+
+         Returns:
+             Tuple of the following items:
+
+             int:
+                 Sample rate
+             Tensor:
+                 Mixture waveform
+             List of Tensors:
+                 List of source waveforms
+         """
+         return self._load_sample(key)
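
A minimal usage sketch of LibriMix (assuming a Libri2Mix tree has already been generated under a hypothetical ./data directory with the scripts from https://github.com/JorisCos/LibriMix, per the Note in the docstring):

    from torchaudio.datasets import LibriMix

    dataset = LibriMix("./data", subset="dev", num_speakers=2, sample_rate=8000, task="sep_clean")
    sample_rate, mixture, sources = dataset[0]
    # For num_speakers=2 and task="sep_clean", sources holds the s1 and s2 waveforms.
    print(sample_rate, mixture.shape, [s.shape for s in sources])
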
torchaudio/datasets/librispeech.py
@@ -0,0 +1,174 @@
+ import os
+ from pathlib import Path
+ from typing import Tuple, Union
+
+ from torch import Tensor
+ from torch.utils.data import Dataset
+ from torchaudio._internal import download_url_to_file
+ from torchaudio.datasets.utils import _extract_tar, _load_waveform
+
+ URL = "train-clean-100"
+ FOLDER_IN_ARCHIVE = "LibriSpeech"
+ SAMPLE_RATE = 16000
+ _DATA_SUBSETS = [
+     "dev-clean",
+     "dev-other",
+     "test-clean",
+     "test-other",
+     "train-clean-100",
+     "train-clean-360",
+     "train-other-500",
+ ]
+ _CHECKSUMS = {
+     "http://www.openslr.org/resources/12/dev-clean.tar.gz": "76f87d090650617fca0cac8f88b9416e0ebf80350acb97b343a85fa903728ab3",  # noqa: E501
+     "http://www.openslr.org/resources/12/dev-other.tar.gz": "12661c48e8c3fe1de2c1caa4c3e135193bfb1811584f11f569dd12645aa84365",  # noqa: E501
+     "http://www.openslr.org/resources/12/test-clean.tar.gz": "39fde525e59672dc6d1551919b1478f724438a95aa55f874b576be21967e6c23",  # noqa: E501
+     "http://www.openslr.org/resources/12/test-other.tar.gz": "d09c181bba5cf717b3dee7d4d592af11a3ee3a09e08ae025c5506f6ebe961c29",  # noqa: E501
+     "http://www.openslr.org/resources/12/train-clean-100.tar.gz": "d4ddd1d5a6ab303066f14971d768ee43278a5f2a0aa43dc716b0e64ecbbbf6e2",  # noqa: E501
+     "http://www.openslr.org/resources/12/train-clean-360.tar.gz": "146a56496217e96c14334a160df97fffedd6e0a04e66b9c5af0d40be3c792ecf",  # noqa: E501
+     "http://www.openslr.org/resources/12/train-other-500.tar.gz": "ddb22f27f96ec163645d53215559df6aa36515f26e01dd70798188350adcb6d2",  # noqa: E501
+ }
+
+
+ def _download_librispeech(root, url):
+     base_url = "http://www.openslr.org/resources/12/"
+     ext_archive = ".tar.gz"
+
+     filename = url + ext_archive
+     archive = os.path.join(root, filename)
+     download_url = base_url + filename  # plain concatenation; os.path.join is not safe for URLs
+     if not os.path.isfile(archive):
+         checksum = _CHECKSUMS.get(download_url, None)
+         download_url_to_file(download_url, archive, hash_prefix=checksum)
+     _extract_tar(archive)
+
+
+ def _get_librispeech_metadata(
+     fileid: str, root: str, folder: str, ext_audio: str, ext_txt: str
+ ) -> Tuple[str, int, str, int, int, int]:
+     speaker_id, chapter_id, utterance_id = fileid.split("-")
+
+     # Get audio path and sample rate
+     fileid_audio = f"{speaker_id}-{chapter_id}-{utterance_id}"
+     filepath = os.path.join(folder, speaker_id, chapter_id, f"{fileid_audio}{ext_audio}")
+
+     # Load text
+     file_text = f"{speaker_id}-{chapter_id}{ext_txt}"
+     file_text = os.path.join(root, folder, speaker_id, chapter_id, file_text)
+     with open(file_text) as ft:
+         for line in ft:
+             fileid_text, transcript = line.strip().split(" ", 1)
+             if fileid_audio == fileid_text:
+                 break
+         else:
+             # Transcript not found
+             raise FileNotFoundError(f"Transcript not found for {fileid_audio}")
+
+     return (
+         filepath,
+         SAMPLE_RATE,
+         transcript,
+         int(speaker_id),
+         int(chapter_id),
+         int(utterance_id),
+     )
+
+
+ class LIBRISPEECH(Dataset):
+     """*LibriSpeech* :cite:`7178964` dataset.
+
+     Args:
+         root (str or Path): Path to the directory where the dataset is found or downloaded.
+         url (str, optional): The URL to download the dataset from,
+             or the type of the dataset to download.
+             Allowed type values are ``"dev-clean"``, ``"dev-other"``, ``"test-clean"``,
+             ``"test-other"``, ``"train-clean-100"``, ``"train-clean-360"`` and
+             ``"train-other-500"``. (default: ``"train-clean-100"``)
+         folder_in_archive (str, optional):
+             The top-level directory of the dataset. (default: ``"LibriSpeech"``)
+         download (bool, optional):
+             Whether to download the dataset if it is not found at root path. (default: ``False``).
+     """
+
+     _ext_txt = ".trans.txt"
+     _ext_audio = ".flac"
+
+     def __init__(
+         self,
+         root: Union[str, Path],
+         url: str = URL,
+         folder_in_archive: str = FOLDER_IN_ARCHIVE,
+         download: bool = False,
+     ) -> None:
+         self._url = url
+         if url not in _DATA_SUBSETS:
+             raise ValueError(f"Invalid url '{url}' given; please provide one of {_DATA_SUBSETS}.")
+
+         root = os.fspath(root)
+         self._archive = os.path.join(root, folder_in_archive)
+         self._path = os.path.join(root, folder_in_archive, url)
+
+         if not os.path.isdir(self._path):
+             if download:
+                 _download_librispeech(root, url)
+             else:
+                 raise RuntimeError(
+                     f"Dataset not found at {self._path}. Please set `download=True` to download the dataset."
+                 )
+
+         self._walker = sorted(str(p.stem) for p in Path(self._path).glob("*/*/*" + self._ext_audio))
+
+     def get_metadata(self, n: int) -> Tuple[str, int, str, int, int, int]:
+         """Get metadata for the n-th sample from the dataset. Returns filepath instead of waveform,
+         but otherwise returns the same fields as :py:func:`__getitem__`.
+
+         Args:
+             n (int): The index of the sample to be loaded
+
+         Returns:
+             Tuple of the following items:
+
+             str:
+                 Path to audio
+             int:
+                 Sample rate
+             str:
+                 Transcript
+             int:
+                 Speaker ID
+             int:
+                 Chapter ID
+             int:
+                 Utterance ID
+         """
+         fileid = self._walker[n]
+         return _get_librispeech_metadata(fileid, self._archive, self._url, self._ext_audio, self._ext_txt)
+
+     def __getitem__(self, n: int) -> Tuple[Tensor, int, str, int, int, int]:
+         """Load the n-th sample from the dataset.
+
+         Args:
+             n (int): The index of the sample to be loaded
+
+         Returns:
+             Tuple of the following items:
+
+             Tensor:
+                 Waveform
+             int:
+                 Sample rate
+             str:
+                 Transcript
+             int:
+                 Speaker ID
+             int:
+                 Chapter ID
+             int:
+                 Utterance ID
+         """
+         metadata = self.get_metadata(n)
+         waveform = _load_waveform(self._archive, metadata[0], metadata[1])
+         return (waveform,) + metadata[1:]
+
+     def __len__(self) -> int:
+         return len(self._walker)
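
A minimal usage sketch of LIBRISPEECH (assuming a writable, hypothetical ./data directory). get_metadata is useful when only transcripts are needed, since it returns the relative audio path without decoding the FLAC file:

    from torchaudio.datasets import LIBRISPEECH

    dataset = LIBRISPEECH("./data", url="dev-clean", download=True)

    # Full sample: decodes the audio.
    waveform, sample_rate, transcript, speaker_id, chapter_id, utterance_id = dataset[0]

    # Metadata only: the first element is the path relative to the LibriSpeech folder.
    filepath, sample_rate, transcript, *_ = dataset.get_metadata(0)
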