torchaudio 2.9.1__cp311-cp311-manylinux_2_28_aarch64.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (85)
  1. torchaudio/__init__.py +204 -0
  2. torchaudio/_extension/__init__.py +61 -0
  3. torchaudio/_extension/utils.py +133 -0
  4. torchaudio/_internal/__init__.py +10 -0
  5. torchaudio/_internal/module_utils.py +171 -0
  6. torchaudio/_torchcodec.py +340 -0
  7. torchaudio/compliance/__init__.py +5 -0
  8. torchaudio/compliance/kaldi.py +813 -0
  9. torchaudio/datasets/__init__.py +47 -0
  10. torchaudio/datasets/cmuarctic.py +157 -0
  11. torchaudio/datasets/cmudict.py +186 -0
  12. torchaudio/datasets/commonvoice.py +86 -0
  13. torchaudio/datasets/dr_vctk.py +121 -0
  14. torchaudio/datasets/fluentcommands.py +108 -0
  15. torchaudio/datasets/gtzan.py +1118 -0
  16. torchaudio/datasets/iemocap.py +147 -0
  17. torchaudio/datasets/librilight_limited.py +111 -0
  18. torchaudio/datasets/librimix.py +133 -0
  19. torchaudio/datasets/librispeech.py +174 -0
  20. torchaudio/datasets/librispeech_biasing.py +189 -0
  21. torchaudio/datasets/libritts.py +168 -0
  22. torchaudio/datasets/ljspeech.py +107 -0
  23. torchaudio/datasets/musdb_hq.py +139 -0
  24. torchaudio/datasets/quesst14.py +136 -0
  25. torchaudio/datasets/snips.py +157 -0
  26. torchaudio/datasets/speechcommands.py +183 -0
  27. torchaudio/datasets/tedlium.py +218 -0
  28. torchaudio/datasets/utils.py +54 -0
  29. torchaudio/datasets/vctk.py +143 -0
  30. torchaudio/datasets/voxceleb1.py +309 -0
  31. torchaudio/datasets/yesno.py +89 -0
  32. torchaudio/functional/__init__.py +130 -0
  33. torchaudio/functional/_alignment.py +128 -0
  34. torchaudio/functional/filtering.py +1685 -0
  35. torchaudio/functional/functional.py +2505 -0
  36. torchaudio/lib/__init__.py +0 -0
  37. torchaudio/lib/_torchaudio.so +0 -0
  38. torchaudio/lib/libtorchaudio.so +0 -0
  39. torchaudio/models/__init__.py +85 -0
  40. torchaudio/models/_hdemucs.py +1008 -0
  41. torchaudio/models/conformer.py +293 -0
  42. torchaudio/models/conv_tasnet.py +330 -0
  43. torchaudio/models/decoder/__init__.py +64 -0
  44. torchaudio/models/decoder/_ctc_decoder.py +568 -0
  45. torchaudio/models/decoder/_cuda_ctc_decoder.py +187 -0
  46. torchaudio/models/deepspeech.py +84 -0
  47. torchaudio/models/emformer.py +884 -0
  48. torchaudio/models/rnnt.py +816 -0
  49. torchaudio/models/rnnt_decoder.py +339 -0
  50. torchaudio/models/squim/__init__.py +11 -0
  51. torchaudio/models/squim/objective.py +326 -0
  52. torchaudio/models/squim/subjective.py +150 -0
  53. torchaudio/models/tacotron2.py +1046 -0
  54. torchaudio/models/wav2letter.py +72 -0
  55. torchaudio/models/wav2vec2/__init__.py +45 -0
  56. torchaudio/models/wav2vec2/components.py +1167 -0
  57. torchaudio/models/wav2vec2/model.py +1579 -0
  58. torchaudio/models/wav2vec2/utils/__init__.py +7 -0
  59. torchaudio/models/wav2vec2/utils/import_fairseq.py +213 -0
  60. torchaudio/models/wav2vec2/utils/import_huggingface.py +134 -0
  61. torchaudio/models/wav2vec2/wavlm_attention.py +214 -0
  62. torchaudio/models/wavernn.py +409 -0
  63. torchaudio/pipelines/__init__.py +102 -0
  64. torchaudio/pipelines/_source_separation_pipeline.py +109 -0
  65. torchaudio/pipelines/_squim_pipeline.py +156 -0
  66. torchaudio/pipelines/_tts/__init__.py +16 -0
  67. torchaudio/pipelines/_tts/impl.py +385 -0
  68. torchaudio/pipelines/_tts/interface.py +255 -0
  69. torchaudio/pipelines/_tts/utils.py +230 -0
  70. torchaudio/pipelines/_wav2vec2/__init__.py +0 -0
  71. torchaudio/pipelines/_wav2vec2/aligner.py +87 -0
  72. torchaudio/pipelines/_wav2vec2/impl.py +1699 -0
  73. torchaudio/pipelines/_wav2vec2/utils.py +346 -0
  74. torchaudio/pipelines/rnnt_pipeline.py +380 -0
  75. torchaudio/transforms/__init__.py +78 -0
  76. torchaudio/transforms/_multi_channel.py +467 -0
  77. torchaudio/transforms/_transforms.py +2138 -0
  78. torchaudio/utils/__init__.py +4 -0
  79. torchaudio/utils/download.py +89 -0
  80. torchaudio/version.py +2 -0
  81. torchaudio-2.9.1.dist-info/METADATA +133 -0
  82. torchaudio-2.9.1.dist-info/RECORD +85 -0
  83. torchaudio-2.9.1.dist-info/WHEEL +5 -0
  84. torchaudio-2.9.1.dist-info/licenses/LICENSE +25 -0
  85. torchaudio-2.9.1.dist-info/top_level.txt +1 -0
@@ -0,0 +1,47 @@
1
+ from .cmuarctic import CMUARCTIC
2
+ from .cmudict import CMUDict
3
+ from .commonvoice import COMMONVOICE
4
+ from .dr_vctk import DR_VCTK
5
+ from .fluentcommands import FluentSpeechCommands
6
+ from .gtzan import GTZAN
7
+ from .iemocap import IEMOCAP
8
+ from .librilight_limited import LibriLightLimited
9
+ from .librimix import LibriMix
10
+ from .librispeech import LIBRISPEECH
11
+ from .librispeech_biasing import LibriSpeechBiasing
12
+ from .libritts import LIBRITTS
13
+ from .ljspeech import LJSPEECH
14
+ from .musdb_hq import MUSDB_HQ
15
+ from .quesst14 import QUESST14
16
+ from .snips import Snips
17
+ from .speechcommands import SPEECHCOMMANDS
18
+ from .tedlium import TEDLIUM
19
+ from .vctk import VCTK_092
20
+ from .voxceleb1 import VoxCeleb1Identification, VoxCeleb1Verification
21
+ from .yesno import YESNO
22
+
23
+
24
# Public API of ``torchaudio.datasets``: every entry is a Dataset subclass
# imported above from its own submodule.
__all__ = [
    "COMMONVOICE",
    "LIBRISPEECH",
    "LibriSpeechBiasing",
    "LibriLightLimited",
    "SPEECHCOMMANDS",
    "VCTK_092",
    "DR_VCTK",
    "YESNO",
    "LJSPEECH",
    "GTZAN",
    "CMUARCTIC",
    "CMUDict",
    "LibriMix",
    "LIBRITTS",
    "TEDLIUM",
    "QUESST14",
    "MUSDB_HQ",
    "FluentSpeechCommands",
    "VoxCeleb1Identification",
    "VoxCeleb1Verification",
    "IEMOCAP",
    "Snips",
]
@@ -0,0 +1,157 @@
1
+ import csv
2
+ import os
3
+ from pathlib import Path
4
+ from typing import Tuple, Union
5
+
6
+ import torchaudio
7
+ from torch import Tensor
8
+ from torch.utils.data import Dataset
9
+ from torchaudio._internal import download_url_to_file
10
+ from torchaudio.datasets.utils import _extract_tar
11
+
12
URL = "aew"  # Default speaker subset.
FOLDER_IN_ARCHIVE = "ARCTIC"
# SHA256 checksum (prefix) for each downloadable speaker archive, keyed by URL.
_CHECKSUMS = {
    "http://festvox.org/cmu_arctic/packed/cmu_us_aew_arctic.tar.bz2": "645cb33c0f0b2ce41384fdd8d3db2c3f5fc15c1e688baeb74d2e08cab18ab406",  # noqa: E501
    "http://festvox.org/cmu_arctic/packed/cmu_us_ahw_arctic.tar.bz2": "024664adeb892809d646a3efd043625b46b5bfa3e6189b3500b2d0d59dfab06c",  # noqa: E501
    "http://festvox.org/cmu_arctic/packed/cmu_us_aup_arctic.tar.bz2": "2c55bc3050caa996758869126ad10cf42e1441212111db034b3a45189c18b6fc",  # noqa: E501
    "http://festvox.org/cmu_arctic/packed/cmu_us_awb_arctic.tar.bz2": "d74a950c9739a65f7bfc4dfa6187f2730fa03de5b8eb3f2da97a51b74df64d3c",  # noqa: E501
    "http://festvox.org/cmu_arctic/packed/cmu_us_axb_arctic.tar.bz2": "dd65c3d2907d1ee52f86e44f578319159e60f4bf722a9142be01161d84e330ff",  # noqa: E501
    "http://festvox.org/cmu_arctic/packed/cmu_us_bdl_arctic.tar.bz2": "26b91aaf48b2799b2956792b4632c2f926cd0542f402b5452d5adecb60942904",  # noqa: E501
    "http://festvox.org/cmu_arctic/packed/cmu_us_clb_arctic.tar.bz2": "3f16dc3f3b97955ea22623efb33b444341013fc660677b2e170efdcc959fa7c6",  # noqa: E501
    "http://festvox.org/cmu_arctic/packed/cmu_us_eey_arctic.tar.bz2": "8a0ee4e5acbd4b2f61a4fb947c1730ab3adcc9dc50b195981d99391d29928e8a",  # noqa: E501
    "http://festvox.org/cmu_arctic/packed/cmu_us_fem_arctic.tar.bz2": "3fcff629412b57233589cdb058f730594a62c4f3a75c20de14afe06621ef45e2",  # noqa: E501
    "http://festvox.org/cmu_arctic/packed/cmu_us_gka_arctic.tar.bz2": "dc82e7967cbd5eddbed33074b0699128dbd4482b41711916d58103707e38c67f",  # noqa: E501
    "http://festvox.org/cmu_arctic/packed/cmu_us_jmk_arctic.tar.bz2": "3a37c0e1dfc91e734fdbc88b562d9e2ebca621772402cdc693bbc9b09b211d73",  # noqa: E501
    "http://festvox.org/cmu_arctic/packed/cmu_us_ksp_arctic.tar.bz2": "8029cafce8296f9bed3022c44ef1e7953332b6bf6943c14b929f468122532717",  # noqa: E501
    "http://festvox.org/cmu_arctic/packed/cmu_us_ljm_arctic.tar.bz2": "b23993765cbf2b9e7bbc3c85b6c56eaf292ac81ee4bb887b638a24d104f921a0",  # noqa: E501
    "http://festvox.org/cmu_arctic/packed/cmu_us_lnh_arctic.tar.bz2": "4faf34d71aa7112813252fb20c5433e2fdd9a9de55a00701ffcbf05f24a5991a",  # noqa: E501
    "http://festvox.org/cmu_arctic/packed/cmu_us_rms_arctic.tar.bz2": "c6dc11235629c58441c071a7ba8a2d067903dfefbaabc4056d87da35b72ecda4",  # noqa: E501
    "http://festvox.org/cmu_arctic/packed/cmu_us_rxr_arctic.tar.bz2": "1fa4271c393e5998d200e56c102ff46fcfea169aaa2148ad9e9469616fbfdd9b",  # noqa: E501
    "http://festvox.org/cmu_arctic/packed/cmu_us_slp_arctic.tar.bz2": "54345ed55e45c23d419e9a823eef427f1cc93c83a710735ec667d068c916abf1",  # noqa: E501
    "http://festvox.org/cmu_arctic/packed/cmu_us_slt_arctic.tar.bz2": "7c173297916acf3cc7fcab2713be4c60b27312316765a90934651d367226b4ea",  # noqa: E501
}


def load_cmuarctic_item(line: List[str], path: str, folder_audio: str, ext_audio: str) -> Tuple[Tensor, int, str, str]:
    """Parse one row of ``txt.done.data`` and load the corresponding audio file.

    Args:
        line (List[str]): A row produced by ``csv.reader``; the raw text lives in
            ``line[0]`` and looks like ``( arctic_a0001 "transcript text" )``.
            (The original annotation said ``str``; the value is actually a list.)
        path (str): Root directory of the extracted speaker archive.
        folder_audio (str): Sub-directory containing the audio files.
        ext_audio (str): Audio file extension, e.g. ``".wav"``.

    Returns:
        Tuple of waveform, sample rate, transcript and utterance ID.
    """
    # "( arctic_a0001 "text" )" -> utterance_id="arctic_a0001", transcript='"text" )'
    utterance_id, transcript = line[0].strip().split(" ", 2)[1:]

    # Strip the leading double quote and the trailing '" )' suffix (3 chars).
    transcript = transcript[1:-3]

    file_audio = os.path.join(path, folder_audio, utterance_id + ext_audio)

    # Load audio
    waveform, sample_rate = torchaudio.load(file_audio)

    # "arctic_a0001" -> utterance ID "a0001"
    return (waveform, sample_rate, transcript, utterance_id.split("_")[1])


class CMUARCTIC(Dataset):
    """*CMU ARCTIC* :cite:`Kominek03cmuarctic` dataset.

    Args:
        root (str or Path): Path to the directory where the dataset is found or downloaded.
        url (str, optional):
            The URL to download the dataset from or the type of the dataset to download.
            (default: ``"aew"``)
            Allowed type values are ``"aew"``, ``"ahw"``, ``"aup"``, ``"awb"``, ``"axb"``, ``"bdl"``,
            ``"clb"``, ``"eey"``, ``"fem"``, ``"gka"``, ``"jmk"``, ``"ksp"``, ``"ljm"``, ``"lnh"``,
            ``"rms"``, ``"rxr"``, ``"slp"`` or ``"slt"``.
        folder_in_archive (str, optional):
            The top-level directory of the dataset. (default: ``"ARCTIC"``)
        download (bool, optional):
            Whether to download the dataset if it is not found at root path. (default: ``False``).
    """

    _file_text = "txt.done.data"
    _folder_text = "etc"
    _ext_audio = ".wav"
    _folder_audio = "wav"

    def __init__(
        self, root: Union[str, Path], url: str = URL, folder_in_archive: str = FOLDER_IN_ARCHIVE, download: bool = False
    ) -> None:
        # A bare speaker name is expanded to the full archive URL; anything else
        # is assumed to already be a complete URL.
        if url in [
            "aew",
            "ahw",
            "aup",
            "awb",
            "axb",
            "bdl",
            "clb",
            "eey",
            "fem",
            "gka",
            "jmk",
            "ksp",
            "ljm",
            "lnh",
            "rms",
            "rxr",
            "slp",
            "slt",
        ]:
            url = "cmu_us_" + url + "_arctic"
            ext_archive = ".tar.bz2"
            base_url = "http://www.festvox.org/cmu_arctic/packed/"

            # base_url ends with "/", so this is effectively string concatenation.
            url = os.path.join(base_url, url + ext_archive)

        # Get string representation of 'root' in case Path object is passed
        root = os.fspath(root)

        basename = os.path.basename(url)
        root = os.path.join(root, folder_in_archive)
        # makedirs (not mkdir): the original isdir+mkdir pair raised
        # FileNotFoundError when the parent of ``root`` did not exist and was
        # racy if another process created the directory first.
        os.makedirs(root, exist_ok=True)
        archive = os.path.join(root, basename)

        # "cmu_us_aew_arctic.tar.bz2" -> "cmu_us_aew_arctic"
        basename = basename.split(".")[0]

        self._path = os.path.join(root, basename)

        if download:
            if not os.path.isdir(self._path):
                if not os.path.isfile(archive):
                    checksum = _CHECKSUMS.get(url)
                    download_url_to_file(url, archive, hash_prefix=checksum)
                _extract_tar(archive)
        else:
            if not os.path.exists(self._path):
                raise RuntimeError(
                    f"The path {self._path} doesn't exist. "
                    "Please check the ``root`` path or set `download=True` to download it"
                )

        self._text = os.path.join(self._path, self._folder_text, self._file_text)

        # One row per utterance; each row's raw text is parsed lazily in __getitem__.
        with open(self._text, "r") as text:
            self._walker = list(csv.reader(text))

    def __getitem__(self, n: int) -> Tuple[Tensor, int, str, str]:
        """Load the n-th sample from the dataset.

        Args:
            n (int): The index of the sample to be loaded

        Returns:
            Tuple of the following items;

            Tensor:
                Waveform
            int:
                Sample rate
            str:
                Transcript
            str:
                Utterance ID
        """
        line = self._walker[n]
        return load_cmuarctic_item(line, self._path, self._folder_audio, self._ext_audio)

    def __len__(self) -> int:
        """Return the number of utterances in the dataset."""
        return len(self._walker)
@@ -0,0 +1,186 @@
1
+ import os
2
+ import re
3
+ from pathlib import Path
4
+ from typing import Iterable, List, Tuple, Union
5
+
6
+ from torch.utils.data import Dataset
7
+ from torchaudio._internal import download_url_to_file
8
+
9
+
10
# SHA256 checksum (prefix) for each downloadable file, keyed by URL.
_CHECKSUMS = {
    "http://svn.code.sf.net/p/cmusphinx/code/trunk/cmudict/cmudict-0.7b": "209a8b4cd265013e96f4658632a9878103b0c5abf62b50d4ef3ae1be226b29e4",  # noqa: E501
    "http://svn.code.sf.net/p/cmusphinx/code/trunk/cmudict/cmudict-0.7b.symbols": "408ccaae803641c6d7b626b6299949320c2dbca96b2220fd3fb17887b023b027",  # noqa: E501
}
# Dictionary entries that name a punctuation mark, e.g. "!EXCLAMATION-POINT".
_PUNCTUATIONS = {
    "!EXCLAMATION-POINT",
    '"CLOSE-QUOTE',
    '"DOUBLE-QUOTE',
    '"END-OF-QUOTE',
    '"END-QUOTE',
    '"IN-QUOTES',
    '"QUOTE',
    '"UNQUOTE',
    "#HASH-MARK",
    "#POUND-SIGN",
    "#SHARP-SIGN",
    "%PERCENT",
    "&AMPERSAND",
    "'END-INNER-QUOTE",
    "'END-QUOTE",
    "'INNER-QUOTE",
    "'QUOTE",
    "'SINGLE-QUOTE",
    "(BEGIN-PARENS",
    "(IN-PARENTHESES",
    "(LEFT-PAREN",
    "(OPEN-PARENTHESES",
    "(PAREN",
    "(PARENS",
    "(PARENTHESES",
    ")CLOSE-PAREN",
    ")CLOSE-PARENTHESES",
    ")END-PAREN",
    ")END-PARENS",
    ")END-PARENTHESES",
    ")END-THE-PAREN",
    ")PAREN",
    ")PARENS",
    ")RIGHT-PAREN",
    ")UN-PARENTHESES",
    "+PLUS",
    ",COMMA",
    "--DASH",
    "-DASH",
    "-HYPHEN",
    "...ELLIPSIS",
    ".DECIMAL",
    ".DOT",
    ".FULL-STOP",
    ".PERIOD",
    ".POINT",
    "/SLASH",
    ":COLON",
    ";SEMI-COLON",
    ";SEMI-COLON(1)",
    "?QUESTION-MARK",
    "{BRACE",
    "{LEFT-BRACE",
    "{OPEN-BRACE",
    "}CLOSE-BRACE",
    "}RIGHT-BRACE",
}


def _parse_dictionary(lines: Iterable[str], exclude_punctuations: bool) -> List[Tuple[str, List[str]]]:
    """Parse cmudict-0.7b lines into ``(word, phonemes)`` pairs.

    Args:
        lines (Iterable[str]): Raw dictionary lines; comment lines start with ``;;;``.
        exclude_punctuations (bool): If ``True``, drop punctuation entries
            such as ``!EXCLAMATION-POINT``.

    Returns:
        List[Tuple[str, List[str]]]: One ``(word, phoneme list)`` pair per entry.
        (The original annotation said ``List[str]``, which did not match the
        returned value.)
    """
    _alt_re = re.compile(r"\([0-9]+\)")
    cmudict: List[Tuple[str, List[str]]] = []
    for line in lines:
        if not line or line.startswith(";;;"):  # ignore comments
            continue

        # cmudict-0.7b separates the word from its phonemes with TWO spaces;
        # the phonemes themselves are separated by single spaces. A single-space
        # split would produce three or more fields and fail to unpack.
        word, phones = line.strip().split("  ")
        if word in _PUNCTUATIONS:
            if exclude_punctuations:
                continue
            # !EXCLAMATION-POINT -> !
            # --DASH -> --
            # ...ELLIPSIS -> ...
            if word.startswith("..."):
                word = "..."
            elif word.startswith("--"):
                word = "--"
            else:
                word = word[0]

        # if a word have multiple pronunciations, there will be (number) appended to it
        # for example, DATAPOINTS and DATAPOINTS(1),
        # the regular expression `_alt_re` removes the '(1)' and change the word DATAPOINTS(1) to DATAPOINTS
        word = re.sub(_alt_re, "", word)
        phones = phones.split(" ")
        cmudict.append((word, phones))

    return cmudict


class CMUDict(Dataset):
    """*CMU Pronouncing Dictionary* :cite:`cmudict` (CMUDict) dataset.

    Args:
        root (str or Path): Path to the directory where the dataset is found or downloaded.
        exclude_punctuations (bool, optional):
            When enabled, exclude the pronounciation of punctuations, such as
            `!EXCLAMATION-POINT` and `#HASH-MARK`.
        download (bool, optional):
            Whether to download the dataset if it is not found at root path. (default: ``False``).
        url (str, optional):
            The URL to download the dictionary from.
            (default: ``"http://svn.code.sf.net/p/cmusphinx/code/trunk/cmudict/cmudict-0.7b"``)
        url_symbols (str, optional):
            The URL to download the list of symbols from.
            (default: ``"http://svn.code.sf.net/p/cmusphinx/code/trunk/cmudict/cmudict-0.7b.symbols"``)
    """

    def __init__(
        self,
        root: Union[str, Path],
        exclude_punctuations: bool = True,
        *,
        download: bool = False,
        url: str = "http://svn.code.sf.net/p/cmusphinx/code/trunk/cmudict/cmudict-0.7b",
        url_symbols: str = "http://svn.code.sf.net/p/cmusphinx/code/trunk/cmudict/cmudict-0.7b.symbols",
    ) -> None:

        self.exclude_punctuations = exclude_punctuations

        self._root_path = Path(root)
        if not os.path.isdir(self._root_path):
            raise RuntimeError(f"The root directory does not exist; {root}")

        # The on-disk filenames mirror the final URL path components.
        dict_file = self._root_path / os.path.basename(url)
        symbol_file = self._root_path / os.path.basename(url_symbols)
        if not os.path.exists(dict_file):
            if not download:
                raise RuntimeError(
                    "The dictionary file is not found in the following location. "
                    f"Set `download=True` to download it. {dict_file}"
                )
            checksum = _CHECKSUMS.get(url)
            # hash_prefix passed by keyword for clarity, consistent with the
            # other dataset modules in this package.
            download_url_to_file(url, dict_file, hash_prefix=checksum)
        if not os.path.exists(symbol_file):
            if not download:
                raise RuntimeError(
                    "The symbol file is not found in the following location. "
                    f"Set `download=True` to download it. {symbol_file}"
                )
            checksum = _CHECKSUMS.get(url_symbols)
            download_url_to_file(url_symbols, symbol_file, hash_prefix=checksum)

        # One phoneme symbol per line, e.g. "AA", "AE".
        with open(symbol_file, "r") as text:
            self._symbols = [line.strip() for line in text]

        # The dictionary file is latin-1 encoded (it contains non-ASCII bytes).
        with open(dict_file, "r", encoding="latin-1") as text:
            self._dictionary = _parse_dictionary(text, exclude_punctuations=self.exclude_punctuations)

    def __getitem__(self, n: int) -> Tuple[str, List[str]]:
        """Load the n-th sample from the dataset.

        Args:
            n (int): The index of the sample to be loaded.

        Returns:
            Tuple of a word and its phonemes

            str:
                Word
            List[str]:
                Phonemes
        """
        return self._dictionary[n]

    def __len__(self) -> int:
        """Return the number of dictionary entries."""
        return len(self._dictionary)

    @property
    def symbols(self) -> List[str]:
        """list[str]: A list of phonemes symbols, such as ``"AA"``, ``"AE"``, ``"AH"``."""
        return self._symbols.copy()
@@ -0,0 +1,86 @@
1
+ import csv
2
+ import os
3
+ from pathlib import Path
4
+ from typing import Dict, List, Tuple, Union
5
+
6
+ import torchaudio
7
+ from torch import Tensor
8
+ from torch.utils.data import Dataset
9
+
10
+
11
def load_commonvoice_item(
    line: List[str], header: List[str], path: str, folder_audio: str, ext_audio: str
) -> Tuple[Tensor, int, Dict[str, str]]:
    """Load one CommonVoice sample from a parsed TSV row.

    Each row carries:
    client_id, path, sentence, up_votes, down_votes, age, gender, accent
    """
    # The audio filename must come from the "path" column; bail out early if
    # the header does not have the expected layout.
    if header[1] != "path":
        raise ValueError(f"expect `header[1]` to be 'path', but got {header[1]}")

    audio_name = line[1]
    audio_path = os.path.join(path, folder_audio, audio_name)
    if not audio_path.endswith(ext_audio):
        audio_path = audio_path + ext_audio

    waveform, sample_rate = torchaudio.load(audio_path)

    # Pair every header column with its value for this row.
    metadata = dict(zip(header, line))

    return waveform, sample_rate, metadata
28
+
29
+
30
class COMMONVOICE(Dataset):
    """*CommonVoice* :cite:`ardila2020common` dataset.

    Args:
        root (str or Path): Path to the directory where the dataset is located.
            (Where the ``tsv`` file is present.)
        tsv (str, optional):
            The name of the tsv file used to construct the metadata, such as
            ``"train.tsv"``, ``"test.tsv"``, ``"dev.tsv"``, ``"invalidated.tsv"``,
            ``"validated.tsv"`` and ``"other.tsv"``. (default: ``"train.tsv"``)
    """

    _ext_txt = ".txt"
    _ext_audio = ".mp3"
    _folder_audio = "clips"

    def __init__(self, root: Union[str, Path], tsv: str = "train.tsv") -> None:
        # ``root`` may be a Path object; normalize it to a plain string once.
        self._path = os.fspath(root)
        self._tsv = os.path.join(self._path, tsv)

        # First TSV row is the column header; all remaining rows are samples.
        with open(self._tsv, "r") as metadata_file:
            rows = csv.reader(metadata_file, delimiter="\t")
            self._header = next(rows)
            self._walker = [row for row in rows]

    def __getitem__(self, n: int) -> Tuple[Tensor, int, Dict[str, str]]:
        """Load the n-th sample from the dataset.

        Args:
            n (int): The index of the sample to be loaded

        Returns:
            Tuple of the following items;

            Tensor:
                Waveform
            int:
                Sample rate
            Dict[str, str]:
                Dictionary containing the following items from the corresponding TSV file;

                * ``"client_id"``
                * ``"path"``
                * ``"sentence"``
                * ``"up_votes"``
                * ``"down_votes"``
                * ``"age"``
                * ``"gender"``
                * ``"accent"``
        """
        sample = self._walker[n]
        return load_commonvoice_item(sample, self._header, self._path, self._folder_audio, self._ext_audio)

    def __len__(self) -> int:
        """Return the number of samples listed in the TSV file."""
        return len(self._walker)
@@ -0,0 +1,121 @@
1
+ from pathlib import Path
2
+ from typing import Dict, Tuple, Union
3
+
4
+ import torchaudio
5
+ from torch import Tensor
6
+ from torch.utils.data import Dataset
7
+ from torchaudio._internal import download_url_to_file
8
+ from torchaudio.datasets.utils import _extract_zip
9
+
10
+
11
_URL = "https://datashare.ed.ac.uk/bitstream/handle/10283/3038/DR-VCTK.zip"
# SHA256 checksum (prefix) of the archive above.
_CHECKSUM = "781f12f4406ed36ed27ae3bce55da47ba176e2d8bae67319e389e07b2c9bd769"
_SUPPORTED_SUBSETS = {"train", "test"}


class DR_VCTK(Dataset):
    """*Device Recorded VCTK (Small subset version)* :cite:`Sarfjoo2018DeviceRV` dataset.

    Args:
        root (str or Path): Root directory where the dataset's top level directory is found.
        subset (str): The subset to use. Can be one of ``"train"`` and ``"test"``. (default: ``"train"``).
        download (bool):
            Whether to download the dataset if it is not found at root path. (default: ``False``).
        url (str): The URL to download the dataset from.
            (default: ``"https://datashare.ed.ac.uk/bitstream/handle/10283/3038/DR-VCTK.zip"``)
    """

    def __init__(
        self,
        root: Union[str, Path],
        subset: str = "train",
        *,
        download: bool = False,
        url: str = _URL,
    ) -> None:
        if subset not in _SUPPORTED_SUBSETS:
            raise RuntimeError(
                f"The subset '{subset}' does not match any of the supported subsets: {_SUPPORTED_SUBSETS}"
            )

        base_dir = Path(root).expanduser()
        archive_path = base_dir / "DR-VCTK.zip"

        self._subset = subset
        # The zip extracts to DR-VCTK/DR-VCTK/ with per-subset audio folders
        # and a per-subset channel-log file.
        self._path = base_dir / "DR-VCTK" / "DR-VCTK"
        self._clean_audio_dir = self._path / f"clean_{self._subset}set_wav_16k"
        self._noisy_audio_dir = self._path / f"device-recorded_{self._subset}set_wav_16k"
        self._config_filepath = self._path / "configurations" / f"{self._subset}_ch_log.txt"

        if not self._path.is_dir():
            if not archive_path.is_file():
                if not download:
                    raise RuntimeError("Dataset not found. Please use `download=True` to download it.")
                download_url_to_file(url, archive_path, hash_prefix=_CHECKSUM)
            _extract_zip(archive_path, base_dir)

        self._config = self._load_config(self._config_filepath)
        self._filename_list = sorted(self._config)

    def _load_config(self, filepath: str) -> Dict[str, Tuple[str, int]]:
        """Read the channel-log file into ``{filename: (source, channel_id)}``."""
        # The train log carries a two-line header, the test log a one-line one.
        header_rows = 2 if self._subset == "train" else 1

        mapping: Dict[str, Tuple[str, int]] = {}
        with open(filepath) as log_file:
            for row_index, row in enumerate(log_file):
                if row_index < header_rows or not row:
                    continue
                name, source, channel = row.strip().split("\t")
                mapping[name] = (source, int(channel))
        return mapping

    def _load_dr_vctk_item(self, filename: str) -> Tuple[Tensor, int, Tensor, int, str, str, str, int]:
        """Load the clean/noisy waveform pair and the metadata for ``filename``."""
        # "p226_001.wav" -> speaker "p226", utterance "001"
        stem = filename.split(".")[0]
        speaker_id, utterance_id = stem.split("_")
        source, channel_id = self._config[filename]
        waveform_clean, sample_rate_clean = torchaudio.load(self._clean_audio_dir / filename)
        waveform_noisy, sample_rate_noisy = torchaudio.load(self._noisy_audio_dir / filename)
        return (
            waveform_clean,
            sample_rate_clean,
            waveform_noisy,
            sample_rate_noisy,
            speaker_id,
            utterance_id,
            source,
            channel_id,
        )

    def __getitem__(self, n: int) -> Tuple[Tensor, int, Tensor, int, str, str, str, int]:
        """Load the n-th sample from the dataset.

        Args:
            n (int): The index of the sample to be loaded

        Returns:
            Tuple of the following items;

            Tensor:
                Clean waveform
            int:
                Sample rate of the clean waveform
            Tensor:
                Noisy waveform
            int:
                Sample rate of the noisy waveform
            str:
                Speaker ID
            str:
                Utterance ID
            str:
                Source
            int:
                Channel ID
        """
        return self._load_dr_vctk_item(self._filename_list[n])

    def __len__(self) -> int:
        """Return the number of files listed in the channel log."""
        return len(self._filename_list)