torchaudio 2.9.0__cp314-cp314-macosx_11_0_arm64.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release.


This version of torchaudio might be problematic. Click here for more details.

Files changed (86) hide show
  1. torchaudio/.dylibs/libc++.1.0.dylib +0 -0
  2. torchaudio/__init__.py +204 -0
  3. torchaudio/_extension/__init__.py +61 -0
  4. torchaudio/_extension/utils.py +133 -0
  5. torchaudio/_internal/__init__.py +10 -0
  6. torchaudio/_internal/module_utils.py +171 -0
  7. torchaudio/_torchcodec.py +340 -0
  8. torchaudio/compliance/__init__.py +5 -0
  9. torchaudio/compliance/kaldi.py +813 -0
  10. torchaudio/datasets/__init__.py +47 -0
  11. torchaudio/datasets/cmuarctic.py +157 -0
  12. torchaudio/datasets/cmudict.py +186 -0
  13. torchaudio/datasets/commonvoice.py +86 -0
  14. torchaudio/datasets/dr_vctk.py +121 -0
  15. torchaudio/datasets/fluentcommands.py +108 -0
  16. torchaudio/datasets/gtzan.py +1118 -0
  17. torchaudio/datasets/iemocap.py +147 -0
  18. torchaudio/datasets/librilight_limited.py +111 -0
  19. torchaudio/datasets/librimix.py +133 -0
  20. torchaudio/datasets/librispeech.py +174 -0
  21. torchaudio/datasets/librispeech_biasing.py +189 -0
  22. torchaudio/datasets/libritts.py +168 -0
  23. torchaudio/datasets/ljspeech.py +107 -0
  24. torchaudio/datasets/musdb_hq.py +139 -0
  25. torchaudio/datasets/quesst14.py +136 -0
  26. torchaudio/datasets/snips.py +157 -0
  27. torchaudio/datasets/speechcommands.py +183 -0
  28. torchaudio/datasets/tedlium.py +218 -0
  29. torchaudio/datasets/utils.py +54 -0
  30. torchaudio/datasets/vctk.py +143 -0
  31. torchaudio/datasets/voxceleb1.py +309 -0
  32. torchaudio/datasets/yesno.py +89 -0
  33. torchaudio/functional/__init__.py +130 -0
  34. torchaudio/functional/_alignment.py +128 -0
  35. torchaudio/functional/filtering.py +1685 -0
  36. torchaudio/functional/functional.py +2505 -0
  37. torchaudio/lib/__init__.py +0 -0
  38. torchaudio/lib/_torchaudio.so +0 -0
  39. torchaudio/lib/libtorchaudio.so +0 -0
  40. torchaudio/models/__init__.py +85 -0
  41. torchaudio/models/_hdemucs.py +1008 -0
  42. torchaudio/models/conformer.py +293 -0
  43. torchaudio/models/conv_tasnet.py +330 -0
  44. torchaudio/models/decoder/__init__.py +64 -0
  45. torchaudio/models/decoder/_ctc_decoder.py +568 -0
  46. torchaudio/models/decoder/_cuda_ctc_decoder.py +187 -0
  47. torchaudio/models/deepspeech.py +84 -0
  48. torchaudio/models/emformer.py +884 -0
  49. torchaudio/models/rnnt.py +816 -0
  50. torchaudio/models/rnnt_decoder.py +339 -0
  51. torchaudio/models/squim/__init__.py +11 -0
  52. torchaudio/models/squim/objective.py +326 -0
  53. torchaudio/models/squim/subjective.py +150 -0
  54. torchaudio/models/tacotron2.py +1046 -0
  55. torchaudio/models/wav2letter.py +72 -0
  56. torchaudio/models/wav2vec2/__init__.py +45 -0
  57. torchaudio/models/wav2vec2/components.py +1167 -0
  58. torchaudio/models/wav2vec2/model.py +1579 -0
  59. torchaudio/models/wav2vec2/utils/__init__.py +7 -0
  60. torchaudio/models/wav2vec2/utils/import_fairseq.py +213 -0
  61. torchaudio/models/wav2vec2/utils/import_huggingface.py +134 -0
  62. torchaudio/models/wav2vec2/wavlm_attention.py +214 -0
  63. torchaudio/models/wavernn.py +409 -0
  64. torchaudio/pipelines/__init__.py +102 -0
  65. torchaudio/pipelines/_source_separation_pipeline.py +109 -0
  66. torchaudio/pipelines/_squim_pipeline.py +156 -0
  67. torchaudio/pipelines/_tts/__init__.py +16 -0
  68. torchaudio/pipelines/_tts/impl.py +385 -0
  69. torchaudio/pipelines/_tts/interface.py +255 -0
  70. torchaudio/pipelines/_tts/utils.py +230 -0
  71. torchaudio/pipelines/_wav2vec2/__init__.py +0 -0
  72. torchaudio/pipelines/_wav2vec2/aligner.py +87 -0
  73. torchaudio/pipelines/_wav2vec2/impl.py +1699 -0
  74. torchaudio/pipelines/_wav2vec2/utils.py +346 -0
  75. torchaudio/pipelines/rnnt_pipeline.py +380 -0
  76. torchaudio/transforms/__init__.py +78 -0
  77. torchaudio/transforms/_multi_channel.py +467 -0
  78. torchaudio/transforms/_transforms.py +2138 -0
  79. torchaudio/utils/__init__.py +4 -0
  80. torchaudio/utils/download.py +89 -0
  81. torchaudio/version.py +2 -0
  82. torchaudio-2.9.0.dist-info/LICENSE +25 -0
  83. torchaudio-2.9.0.dist-info/METADATA +122 -0
  84. torchaudio-2.9.0.dist-info/RECORD +86 -0
  85. torchaudio-2.9.0.dist-info/WHEEL +5 -0
  86. torchaudio-2.9.0.dist-info/top_level.txt +1 -0
@@ -0,0 +1,136 @@
1
+ import os
2
+ import re
3
+ from pathlib import Path
4
+ from typing import Optional, Tuple, Union
5
+
6
+ import torch
7
+ from torch.utils.data import Dataset
8
+ from torchaudio._internal import download_url_to_file
9
+ from torchaudio.datasets.utils import _extract_tar, _load_waveform
10
+
11
+
12
# Download location of the QUESST 2014 archive; the archive's basename (minus
# its extension) is also used as the name of the extracted dataset folder.
URL = "https://speech.fit.vutbr.cz/files/quesst14Database.tgz"

# Sample rate reported with every item of the dataset.
SAMPLE_RATE = 8000

# Hash handed to the downloader as ``hash_prefix`` when fetching ``URL``.
_CHECKSUM = "4f869e06bc066bbe9c5dde31dbd3909a0870d70291110ebbb38878dcbc2fc5e4"

# Language keys accepted by the ``language`` argument of ``QUESST14``.
_LANGUAGES = ["albanian", "basque", "czech", "nnenglish", "romanian", "slovak"]
23
+
24
+
25
class QUESST14(Dataset):
    """*QUESST14* :cite:`Mir2015QUESST2014EQ` dataset.

    Args:
        root (str or Path): Root directory where the dataset's top level directory is found
        subset (str): Subset of the dataset to use. Options: [``"docs"``, ``"dev"``, ``"eval"``].
        language (str or None, optional): Language to get dataset for.
            Options: [``None``, ``albanian``, ``basque``, ``czech``, ``nnenglish``, ``romanian``, ``slovak``].
            If ``None``, dataset consists of all languages. (default: ``"nnenglish"``)
        download (bool, optional): Whether to download the dataset if it is not found at root path.
            (default: ``False``)

    Raises:
        ValueError: If ``subset`` or ``language`` is not one of the supported values.
        RuntimeError: If neither the extracted folder nor the archive exists under ``root``
            and ``download`` is ``False``.
    """

    def __init__(
        self,
        root: Union[str, Path],
        subset: str,
        language: Optional[str] = "nnenglish",
        download: bool = False,
    ) -> None:
        if subset not in ["docs", "dev", "eval"]:
            raise ValueError("`subset` must be one of ['docs', 'dev', 'eval']")

        if not (language is None or language in _LANGUAGES):
            raise ValueError(f"`language` must be None or one of {str(_LANGUAGES)}")

        # Work with a plain string even when a Path object was supplied.
        root = os.fspath(root)

        archive_name = os.path.basename(URL)
        archive = os.path.join(root, archive_name)
        # The extracted folder is named after the archive without its extension.
        self._path = os.path.join(root, archive_name.rsplit(".", 2)[0])

        if not os.path.isdir(self._path):
            # Nothing extracted yet: fetch the archive if needed, then unpack it.
            if not os.path.isfile(archive):
                if not download:
                    raise RuntimeError("Dataset not found. Please use `download=True` to download")
                download_url_to_file(URL, archive, hash_prefix=_CHECKSUM)
            _extract_tar(archive, root)

        # Each subset is described by its own language-key list under "scoring".
        key_files = {
            "docs": "language_key_utterances.lst",
            "dev": "language_key_dev.lst",
            "eval": "language_key_eval.lst",
        }
        self.data = filter_audio_paths(self._path, language, key_files[subset])

    def get_metadata(self, n: int) -> Tuple[str, int, str]:
        """Get metadata for the n-th sample from the dataset. Returns filepath instead of waveform,
        but otherwise returns the same fields as :py:func:`__getitem__`.

        Args:
            n (int): The index of the sample to be loaded

        Returns:
            Tuple of the following items;

            str:
                Path to audio, relative to the dataset folder
            int:
                Sample rate
            str:
                File name (without its extension)
        """
        sample = self.data[n]
        file_name = sample.with_suffix("").name
        return os.path.relpath(sample, self._path), SAMPLE_RATE, file_name

    def __getitem__(self, n: int) -> Tuple[torch.Tensor, int, str]:
        """Load the n-th sample from the dataset.

        Args:
            n (int): The index of the sample to be loaded

        Returns:
            Tuple of the following items;

            Tensor:
                Waveform
            int:
                Sample rate
            str:
                File name
        """
        info = self.get_metadata(n)
        relpath, sample_rate = info[0], info[1]
        # Swap the relative path for the loaded audio and keep the remaining fields.
        return (_load_waveform(self._path, relpath, sample_rate),) + info[1:]

    def __len__(self) -> int:
        """Return the number of audio files selected for this subset/language."""
        return len(self.data)
117
+
118
+
119
def filter_audio_paths(
    path: str,
    language: Optional[str],
    lst_name: str,
) -> list:
    """Extract audio paths for the given language.

    Reads ``<path>/scoring/<lst_name>``, where every non-blank line holds an
    audio path followed by a language key, separated by whitespace.

    Args:
        path (str): Dataset folder that contains the ``scoring`` directory.
        language (str or None): Language key to keep. ``None`` keeps every entry.
        lst_name (str): Name of the list file inside ``scoring``.

    Returns:
        list: ``Path`` objects (``path`` joined with the listed audio path after its
        leading directory component has been removed), in file order.

    Raises:
        ValueError: If a non-blank line does not consist of exactly two fields.
    """
    base = Path(path)
    # Non-greedy match of everything up to and including the first "/":
    # the listed paths start with a directory that is not part of ``path``.
    leading_component = re.compile(r"^.*?\/")

    audio_paths = []
    with open(base / "scoring" / lst_name) as f:
        for line in f:
            fields = line.split()
            if not fields:
                # Tolerate blank lines (e.g. a trailing newline at end of file).
                continue
            audio_path, lang = fields
            if language is not None and lang != language:
                continue
            audio_paths.append(base / leading_component.sub("", audio_path))

    return audio_paths
@@ -0,0 +1,157 @@
1
+ import os
2
+ from pathlib import Path
3
+ from typing import List, Optional, Tuple, Union
4
+
5
+ import torch
6
+ from torch.utils.data import Dataset
7
+ from torchaudio.datasets.utils import _load_waveform
8
+
9
+
10
# Sample rate reported with every item of the dataset.
_SAMPLE_RATE = 16000

# Speakers used when the caller does not pass an explicit ``speakers`` list.
# The speaker of an audio file is the part of its name before the first "-".
_SPEAKERS = [
    "Aditi", "Amy", "Brian", "Emma",
    "Geraint", "Ivy", "Joanna", "Joey",
    "Justin", "Kendra", "Kimberly", "Matthew",
    "Nicole", "Raveena", "Russell", "Salli",
]
29
+
30
+
31
def _load_labels(file: Path, subset: str):
    """Load transcript, IOB, and intent labels for all utterances of a subset.

    Every line is expected to look like
    ``<index> <first> <words...> <last>\\t<first> <iob tags...> <intent>``:
    the first and last token of the transcript part are dropped, as is the first
    token of the tag part, whose last token is the intent.

    Args:
        file (Path): The path to the label file.
        subset (str): Subset of the dataset to use. Options: [``"train"``, ``"valid"``, ``"test"``].
            Only utterances whose index contains this string are kept.

    Returns:
        Dictionary of labels, where the key is the filename of the audio,
        and the label is a Tuple of transcript, Inside–outside–beginning (IOB) label, and intention label.
    """
    labels = {}
    with open(file, "r") as handle:
        for raw in handle:
            # The utterance index is everything before the first space.
            index, _, remainder = raw.strip().partition(" ")
            # A tab separates the transcript part from the IOB/intent part.
            words_part, tags_part = remainder.split("\t")
            if subset not in index:
                continue
            words = words_part.split(" ")
            tags = tags_part.split(" ")
            labels[index] = (" ".join(words[1:-1]), " ".join(tags[1:-1]), tags[-1])
    return labels
54
+
55
+
56
class Snips(Dataset):
    """*Snips* :cite:`coucke2018snips` dataset.

    Args:
        root (str or Path): Root directory where the dataset's top level directory is found.
        subset (str): Subset of the dataset to use. Options: [``"train"``, ``"valid"``, ``"test"``].
        speakers (List[str] or None, optional): The speaker list to include in the dataset. If ``None``,
            include all speakers in the subset. (Default: ``None``)
        audio_format (str, optional): The extension of the audios. Options: [``"mp3"``, ``"wav"``].
            (Default: ``"mp3"``)

    Raises:
        ValueError: If ``subset`` or ``audio_format`` is not one of the supported values.
        RuntimeError: If ``<root>/SNIPS`` is not an existing directory.
    """

    # Name of the label file expected directly inside the "SNIPS" folder.
    _trans_file = "all.iob.snips.txt"

    def __init__(
        self,
        root: Union[str, Path],
        subset: str,
        speakers: Optional[List[str]] = None,
        audio_format: str = "mp3",
    ) -> None:
        if subset not in ["train", "valid", "test"]:
            raise ValueError('`subset` must be one of ["train", "valid", "test"].')
        if audio_format not in ["mp3", "wav"]:
            raise ValueError('`audio_format` must be one of ["mp3", "wav"].')

        root = Path(root)
        self._path = root / "SNIPS"
        self.audio_path = self._path / subset
        if speakers is None:
            speakers = _SPEAKERS

        if not os.path.isdir(self._path):
            raise RuntimeError("Dataset not found.")

        # Materialise the glob: a bare generator would be exhausted by the loop
        # below and leave ``audio_paths`` unusable after construction.
        self.audio_paths = sorted(self.audio_path.glob(f"*.{audio_format}"))
        # The speaker is the part of the file name before the first "-".
        self.data = [
            audio_path for audio_path in self.audio_paths if audio_path.name.split("-")[0] in speakers
        ]
        transcript_path = self._path / self._trans_file
        self.labels = _load_labels(transcript_path, subset)

    def get_metadata(self, n: int) -> Tuple[str, int, str, str, str, str]:
        """Get metadata for the n-th sample from the dataset. Returns filepath instead of waveform,
        but otherwise returns the same fields as :py:func:`__getitem__`.

        Args:
            n (int): The index of the sample to be loaded.

        Returns:
            Tuple of the following items:

            str:
                Path to audio, relative to the ``SNIPS`` folder
            int:
                Sample rate
            str:
                File name
            str:
                Transcription of audio
            str:
                Inside–outside–beginning (IOB) label of transcription
            str:
                Intention label of the audio.

        Raises:
            KeyError: If the label file has no entry for the audio file's name.
        """
        audio_path = self.data[n]
        relpath = os.path.relpath(audio_path, self._path)
        # Labels are keyed by the audio file name without its extension.
        file_name = audio_path.with_suffix("").name
        transcript, iob, intent = self.labels[file_name]
        return relpath, _SAMPLE_RATE, file_name, transcript, iob, intent

    def __getitem__(self, n: int) -> Tuple[torch.Tensor, int, str, str, str, str]:
        """Load the n-th sample from the dataset.

        Args:
            n (int): The index of the sample to be loaded

        Returns:
            Tuple of the following items:

            Tensor:
                Waveform
            int:
                Sample rate
            str:
                File name
            str:
                Transcription of audio
            str:
                Inside–outside–beginning (IOB) label of transcription
            str:
                Intention label of the audio.
        """
        metadata = self.get_metadata(n)
        waveform = _load_waveform(self._path, metadata[0], metadata[1])
        # Replace the relative path with the waveform; keep every other field.
        return (waveform,) + metadata[1:]

    def __len__(self) -> int:
        """Return the number of audio files selected for this subset and speaker list."""
        return len(self.data)
@@ -0,0 +1,183 @@
1
+ import os
2
+ from pathlib import Path
3
+ from typing import Optional, Tuple, Union
4
+
5
+ from torch import Tensor
6
+ from torch.utils.data import Dataset
7
+ from torchaudio._internal import download_url_to_file
8
+ from torchaudio.datasets.utils import _extract_tar, _load_waveform
9
+
10
# Default for the ``folder_in_archive`` argument of ``SPEECHCOMMANDS``.
FOLDER_IN_ARCHIVE = "SpeechCommands"

# Default for the ``url`` argument: a dataset type name that is expanded to a
# full download URL by ``SPEECHCOMMANDS``.
URL = "speech_commands_v0.02"

# Separates the speaker id from the utterance number inside a file name.
HASH_DIVIDER = "_nohash_"

# Paths containing this folder name are left out of the sample list.
EXCEPT_FOLDER = "_background_noise_"

# Sample rate reported with every item of the dataset.
SAMPLE_RATE = 16000

# Hashes passed as ``hash_prefix`` to the downloader, keyed by archive URL.
_CHECKSUMS = {
    "http://download.tensorflow.org/data/speech_commands_v0.01.tar.gz": (
        "743935421bb51cccdb6bdd152e04c5c70274e935c82119ad7faeec31780d811d"
    ),
    "http://download.tensorflow.org/data/speech_commands_v0.02.tar.gz": (
        "af14739ee7dc311471de98f5f9d2c9191b18aedfe957f4a6ff791c709868ff58"
    ),
}
19
+
20
+
21
def _load_list(root, *filenames):
    """Collect the paths listed in one or more list files found under ``root``.

    Args:
        root: Directory that holds the list files; listed entries are resolved against it.
        *filenames: Names of the list files, each containing one relative path per line.

    Returns:
        list: Normalised paths (``root`` joined with each stripped line), in the order
        the files were given and the lines appear.
    """
    entries = []
    for name in filenames:
        with open(os.path.join(root, name)) as listing:
            for row in listing:
                entries.append(os.path.normpath(os.path.join(root, row.strip())))
    return entries
28
+
29
+
30
def _get_speechcommands_metadata(filepath: str, path: str) -> Tuple[str, int, str, str, int]:
    """Derive the metadata of one sample from its file path.

    Args:
        filepath (str): Path to the audio file.
        path (str): Directory the returned path is made relative to.

    Returns:
        Tuple of ``(relative path, sample rate, label, speaker id, utterance number)``.
        The label is the name of the directory that directly contains the file; speaker id
        and utterance number are the two parts of the file name around ``HASH_DIVIDER``.

    Raises:
        ValueError: If the file name does not split into exactly two parts at
            ``HASH_DIVIDER``, or the utterance number is not an integer.
    """
    relpath = os.path.relpath(filepath, path)
    label = os.path.basename(os.path.dirname(relpath))

    # Besides the officially supported split defined by "validation_list.txt" and
    # "testing_list.txt" in the "speech_commands_v0.0x.tar.gz" archives, an alternative
    # split is supported as well: the one referred to in paragraphs 2-3 of Section 7.1 and
    # references 13 and 14 of the original paper, and in the checksums file of the
    # tensorflow_datasets package [1]. Some file names in those
    # "speech_commands_test_set_v0.0x.tar.gz" archives have the form "xxx.wav.wav",
    # so the file extension is stripped twice.
    # [1] https://github.com/tensorflow/datasets/blob/master/tensorflow_datasets/url_checksums/speech_commands.txt
    stem = os.path.basename(relpath)
    for _ in range(2):
        stem = os.path.splitext(stem)[0]

    speaker_id, utterance = stem.split(HASH_DIVIDER)
    return relpath, SAMPLE_RATE, label, speaker_id, int(utterance)
48
+
49
+
50
class SPEECHCOMMANDS(Dataset):
    """*Speech Commands* :cite:`speechcommandsv2` dataset.

    Args:
        root (str or Path): Path to the directory where the dataset is found or downloaded.
        url (str, optional): The URL to download the dataset from,
            or the type of the dataset to download.
            Allowed type values are ``"speech_commands_v0.01"`` and ``"speech_commands_v0.02"``
            (default: ``"speech_commands_v0.02"``)
        folder_in_archive (str, optional):
            The top-level directory of the dataset. (default: ``"SpeechCommands"``)
        download (bool, optional):
            Whether to download the dataset if it is not found at root path. (default: ``False``).
        subset (str or None, optional):
            Select a subset of the dataset [None, "training", "validation", "testing"]. None means
            the whole dataset. "validation" and "testing" are defined in "validation_list.txt" and
            "testing_list.txt", respectively, and "training" is the rest. Details for the files
            "validation_list.txt" and "testing_list.txt" are explained in the README of the dataset
            and in the introduction of Section 7 of the original paper and its reference 12. The
            original paper can be found `here <https://arxiv.org/pdf/1804.03209.pdf>`_. (Default: ``None``)

    Raises:
        ValueError: If ``subset`` is neither ``None`` nor one of the supported names.
        RuntimeError: If ``download`` is ``False`` and the dataset folder does not exist.
    """

    def __init__(
        self,
        root: Union[str, Path],
        url: str = URL,
        folder_in_archive: str = FOLDER_IN_ARCHIVE,
        download: bool = False,
        subset: Optional[str] = None,
    ) -> None:

        if not (subset is None or subset in ["training", "validation", "testing"]):
            raise ValueError("When `subset` is not None, it must be one of ['training', 'validation', 'testing'].")

        # A bare dataset type name is expanded to the full archive URL.
        if url in ("speech_commands_v0.01", "speech_commands_v0.02"):
            url = os.path.join("http://download.tensorflow.org/data/", url + ".tar.gz")

        # Get string representation of 'root' in case Path object is passed
        root = os.fspath(root)
        self._archive = os.path.join(root, folder_in_archive)

        archive_name = os.path.basename(url)
        archive = os.path.join(root, archive_name)

        # The data lives in <root>/<folder_in_archive>/<archive name without extension>.
        versioned_folder = os.path.join(folder_in_archive, archive_name.rsplit(".", 2)[0])
        self._path = os.path.join(root, versioned_folder)

        if not download:
            if not os.path.exists(self._path):
                raise RuntimeError(
                    f"The path {self._path} doesn't exist. "
                    "Please check the ``root`` path or set `download=True` to download it"
                )
        elif not os.path.isdir(self._path):
            # Fetch the archive only when it is not already on disk, then unpack it.
            if not os.path.isfile(archive):
                download_url_to_file(url, archive, hash_prefix=_CHECKSUMS.get(url, None))
            _extract_tar(archive, self._path)

        list_files = {"validation": "validation_list.txt", "testing": "testing_list.txt"}
        if subset in list_files:
            self._walker = _load_list(self._path, list_files[subset])
        else:
            # "training" is every sample that is in neither list file;
            # ``None`` excludes nothing.
            if subset == "training":
                excludes = set(_load_list(self._path, "validation_list.txt", "testing_list.txt"))
            else:
                excludes = set()
            candidates = sorted(str(p) for p in Path(self._path).glob("*/*.wav"))
            self._walker = [
                w
                for w in candidates
                if HASH_DIVIDER in w and EXCEPT_FOLDER not in w and os.path.normpath(w) not in excludes
            ]

    def get_metadata(self, n: int) -> Tuple[str, int, str, str, int]:
        """Get metadata for the n-th sample from the dataset. Returns filepath instead of waveform,
        but otherwise returns the same fields as :py:func:`__getitem__`.

        Args:
            n (int): The index of the sample to be loaded

        Returns:
            Tuple of the following items;

            str:
                Path to the audio, relative to ``<root>/<folder_in_archive>``
            int:
                Sample rate
            str:
                Label
            str:
                Speaker ID
            int:
                Utterance number
        """
        return _get_speechcommands_metadata(self._walker[n], self._archive)

    def __getitem__(self, n: int) -> Tuple[Tensor, int, str, str, int]:
        """Load the n-th sample from the dataset.

        Args:
            n (int): The index of the sample to be loaded

        Returns:
            Tuple of the following items;

            Tensor:
                Waveform
            int:
                Sample rate
            str:
                Label
            str:
                Speaker ID
            int:
                Utterance number
        """
        info = self.get_metadata(n)
        relpath, sample_rate = info[0], info[1]
        # Swap the relative path for the loaded audio and keep the remaining fields.
        return (_load_waveform(self._archive, relpath, sample_rate),) + info[1:]

    def __len__(self) -> int:
        """Return the number of samples selected for this subset."""
        return len(self._walker)
@@ -0,0 +1,218 @@
1
+ import os
2
+ from pathlib import Path
3
+ from typing import Tuple, Union
4
+
5
+ import torchaudio
6
+ from torch import Tensor
7
+ from torch.utils.data import Dataset
8
+ from torchaudio._internal import download_url_to_file
9
+ from torchaudio.datasets.utils import _extract_tar
10
+
11
+
12
# Per-release settings used by ``TEDLIUM``:
#   folder_in_archive  - top-level folder of the release under ``root``
#   url                - download location of the release archive
#   checksum           - hash passed as ``hash_prefix`` to the downloader
#   data_path          - sub-folder (may be empty) that holds the data
#   subset             - fallback used when an empty ``subset`` is given
#   supported_subsets  - subset names accepted for the release
#   dict               - file name of the phoneme dictionary
_RELEASE_CONFIGS = {
    "release1": {
        "folder_in_archive": "TEDLIUM_release1",
        "url": "http://www.openslr.org/resources/7/TEDLIUM_release1.tar.gz",
        "checksum": "30301975fd8c5cac4040c261c0852f57cfa8adbbad2ce78e77e4986957445f27",
        "data_path": "",
        "subset": "train",
        "supported_subsets": ["train", "test", "dev"],
        "dict": "TEDLIUM.150K.dic",
    },
    "release2": {
        "folder_in_archive": "TEDLIUM_release2",
        "url": "http://www.openslr.org/resources/19/TEDLIUM_release2.tar.gz",
        "checksum": "93281b5fcaaae5c88671c9d000b443cb3c7ea3499ad12010b3934ca41a7b9c58",
        "data_path": "",
        "subset": "train",
        "supported_subsets": ["train", "test", "dev"],
        "dict": "TEDLIUM.152k.dic",
    },
    # Release 3 is the only one with a non-empty data sub-folder.
    "release3": {
        "folder_in_archive": "TEDLIUM_release-3",
        "url": "http://www.openslr.org/resources/51/TEDLIUM_release-3.tgz",
        "checksum": "ad1e454d14d1ad550bc2564c462d87c7a7ec83d4dc2b9210f22ab4973b9eccdb",
        "data_path": "data/",
        "subset": "train",
        "supported_subsets": ["train", "test", "dev"],
        "dict": "TEDLIUM.152k.dic",
    },
}
41
+
42
+
43
class TEDLIUM(Dataset):
    """*Tedlium* :cite:`rousseau2012tedlium` dataset (releases 1,2 and 3).

    Args:
        root (str or Path): Path to the directory where the dataset is found or downloaded.
        release (str, optional): Release version.
            Allowed values are ``"release1"``, ``"release2"`` or ``"release3"``.
            (default: ``"release1"``).
        subset (str, optional): The subset of dataset to use. Valid options are ``"train"``, ``"dev"``,
            and ``"test"``. Defaults to ``"train"``.
        download (bool, optional):
            Whether to download the dataset if it is not found at root path. (default: ``False``).
        audio_ext (str, optional): extension for audio file (default: ``".sph"``)

    Raises:
        RuntimeError: If ``release`` or ``subset`` is not supported, or if ``download`` is
            ``False`` and the dataset folder does not exist.
    """

    def __init__(
        self,
        root: Union[str, Path],
        release: str = "release1",
        subset: str = "train",
        download: bool = False,
        audio_ext: str = ".sph",
    ) -> None:
        self._ext_audio = audio_ext
        if release not in _RELEASE_CONFIGS:
            raise RuntimeError(
                "The release {} does not match any of the supported tedlium releases{} ".format(
                    release,
                    _RELEASE_CONFIGS.keys(),
                )
            )
        config = _RELEASE_CONFIGS[release]
        folder_in_archive = config["folder_in_archive"]
        url = config["url"]
        # An empty subset falls back to the release's default subset.
        subset = subset if subset else config["subset"]
        if subset not in config["supported_subsets"]:
            raise RuntimeError(
                "The subset {} does not match any of the supported tedlium subsets{} ".format(
                    subset,
                    config["supported_subsets"],
                )
            )

        # Get string representation of 'root' in case Path object is passed
        root = os.fspath(root)
        archive = os.path.join(root, os.path.basename(url))

        if release == "release3":
            # Release 3 keeps "train" in its data folder and the other subsets under "legacy".
            if subset == "train":
                self._path = os.path.join(root, folder_in_archive, config["data_path"])
            else:
                self._path = os.path.join(root, folder_in_archive, "legacy", subset)
        else:
            self._path = os.path.join(root, folder_in_archive, config["data_path"], subset)

        if download:
            if not os.path.isdir(self._path):
                if not os.path.isfile(archive):
                    download_url_to_file(url, archive, hash_prefix=config["checksum"])
                _extract_tar(archive)
        else:
            if not os.path.exists(self._path):
                raise RuntimeError(
                    f"The path {self._path} doesn't exist. "
                    "Please check the ``root`` path or set `download=True` to download it"
                )

        # One sample per line of every ".stm" transcript file: (file id, line index).
        self._filelist = []
        stm_dir = os.path.join(self._path, "stm")
        for file in sorted(os.listdir(stm_dir)):
            if not file.endswith(".stm"):
                continue
            with open(os.path.join(stm_dir, file)) as f:
                num_lines = sum(1 for _ in f)
            # Drop only the trailing extension; ".stm" elsewhere in the name is kept.
            fileid = file[: -len(".stm")]
            self._filelist.extend((fileid, line) for line in range(num_lines))
        # Create dict path for later read
        self._dict_path = os.path.join(root, folder_in_archive, config["dict"])
        self._phoneme_dict = None

    def _load_tedlium_item(self, fileid: str, line: int, path: str) -> Tuple[Tensor, int, str, str, str, str]:
        """Loads a TEDLIUM dataset sample given a file name and corresponding sentence name.

        Args:
            fileid (str): File id to identify both text and audio files corresponding to the sample
            line (int): Line identifier for the sample inside the text file
            path (str): Dataset root path

        Returns:
            (Tensor, int, str, str, str, str):
            ``(waveform, sample_rate, transcript, talk_id, speaker_id, identifier)``.
            All fields except the first two are the raw space-separated pieces of the
            transcript line, so they are strings; ``transcript`` is the rest of the line.
        """
        transcript_path = os.path.join(path, "stm", fileid)
        with open(transcript_path + ".stm") as f:
            transcript = f.readlines()[line]
            # Six leading fields, then the transcript itself (which may contain spaces).
            talk_id, _, speaker_id, start_time, end_time, identifier, transcript = transcript.split(" ", 6)

        wave_path = os.path.join(path, "sph", fileid)
        waveform, sample_rate = self._load_audio(wave_path + self._ext_audio, start_time=start_time, end_time=end_time)

        return (waveform, sample_rate, transcript, talk_id, speaker_id, identifier)

    def _load_audio(self, path: str, start_time: float, end_time: float, sample_rate: int = 16000) -> Tuple[Tensor, int]:
        """Default load function used in TEDLIUM dataset, you can overwrite this function to customize functionality
        and load individual sentences from a full ted audio talk file.

        Args:
            path (str): Path to audio file
            start_time (float): Time in seconds where the sample sentence starts. Anything
                accepted by ``float()`` works; the dataset passes the string from the transcript.
            end_time (float): Time in seconds where the sample sentence finishes (same
                accepted values as ``start_time``)
            sample_rate (int, optional): Sampling rate used to convert the times to frames

        Returns:
            (Tensor, int): Audio tensor representation and sample rate
        """
        # Convert seconds to frame indices.
        start_time = int(float(start_time) * sample_rate)
        end_time = int(float(end_time) * sample_rate)

        kwargs = {"frame_offset": start_time, "num_frames": end_time - start_time}

        return torchaudio.load(path, **kwargs)

    def __getitem__(self, n: int) -> Tuple[Tensor, int, str, str, str, str]:
        """Load the n-th sample from the dataset.

        Args:
            n (int): The index of the sample to be loaded

        Returns:
            Tuple of the following items;

            Tensor:
                Waveform
            int:
                Sample rate
            str:
                Transcript
            str:
                Talk ID
            str:
                Speaker ID
            str:
                Identifier
        """
        fileid, line = self._filelist[n]
        return self._load_tedlium_item(fileid, line, self._path)

    def __len__(self) -> int:
        """TEDLIUM dataset custom function overwriting len default behaviour.

        Returns:
            int: TEDLIUM dataset length
        """
        return len(self._filelist)

    @property
    def phoneme_dict(self):
        """dict[str, tuple[str]]: Phonemes. Mapping from word to tuple of phonemes.
        Note that some words have empty phonemes.

        The dictionary file is read on first access and cached; every access
        returns a shallow copy of the cached mapping.
        """
        if self._phoneme_dict is None:
            # Build into a local first so a failed read never leaves a partial cache.
            phonemes = {}
            with open(self._dict_path, "r", encoding="utf-8") as f:
                for line in f:
                    content = line.split()
                    if not content:
                        # Skip blank lines instead of failing on ``content[0]``.
                        continue
                    phonemes[content[0]] = tuple(content[1:])  # content[1:] can be empty list
            self._phoneme_dict = phonemes
        return self._phoneme_dict.copy()