torchaudio 2.0.2__cp39-cp39-win_amd64.whl → 2.1.1__cp39-cp39-win_amd64.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Potentially problematic release.
This version of torchaudio might be problematic. Click here for more details.
- torchaudio/__init__.py +22 -3
- torchaudio/_backend/__init__.py +55 -4
- torchaudio/_backend/backend.py +53 -0
- torchaudio/_backend/common.py +52 -0
- torchaudio/_backend/ffmpeg.py +373 -0
- torchaudio/_backend/soundfile.py +54 -0
- torchaudio/_backend/soundfile_backend.py +457 -0
- torchaudio/_backend/sox.py +91 -0
- torchaudio/_backend/utils.py +81 -323
- torchaudio/_extension/__init__.py +55 -36
- torchaudio/_extension/utils.py +109 -17
- torchaudio/_internal/__init__.py +4 -1
- torchaudio/_internal/module_utils.py +37 -6
- torchaudio/backend/__init__.py +7 -11
- torchaudio/backend/_no_backend.py +24 -0
- torchaudio/backend/_sox_io_backend.py +297 -0
- torchaudio/backend/common.py +12 -52
- torchaudio/backend/no_backend.py +11 -21
- torchaudio/backend/soundfile_backend.py +11 -448
- torchaudio/backend/sox_io_backend.py +11 -435
- torchaudio/backend/utils.py +9 -18
- torchaudio/datasets/__init__.py +2 -0
- torchaudio/datasets/cmuarctic.py +1 -1
- torchaudio/datasets/cmudict.py +61 -62
- torchaudio/datasets/dr_vctk.py +1 -1
- torchaudio/datasets/gtzan.py +1 -1
- torchaudio/datasets/librilight_limited.py +1 -1
- torchaudio/datasets/librispeech.py +1 -1
- torchaudio/datasets/librispeech_biasing.py +189 -0
- torchaudio/datasets/libritts.py +1 -1
- torchaudio/datasets/ljspeech.py +1 -1
- torchaudio/datasets/musdb_hq.py +1 -1
- torchaudio/datasets/quesst14.py +1 -1
- torchaudio/datasets/speechcommands.py +1 -1
- torchaudio/datasets/tedlium.py +1 -1
- torchaudio/datasets/vctk.py +1 -1
- torchaudio/datasets/voxceleb1.py +1 -1
- torchaudio/datasets/yesno.py +1 -1
- torchaudio/functional/__init__.py +6 -2
- torchaudio/functional/_alignment.py +128 -0
- torchaudio/functional/filtering.py +69 -92
- torchaudio/functional/functional.py +99 -148
- torchaudio/io/__init__.py +4 -1
- torchaudio/io/_effector.py +347 -0
- torchaudio/io/_stream_reader.py +158 -90
- torchaudio/io/_stream_writer.py +196 -10
- torchaudio/lib/_torchaudio.pyd +0 -0
- torchaudio/lib/_torchaudio_ffmpeg4.pyd +0 -0
- torchaudio/lib/_torchaudio_ffmpeg5.pyd +0 -0
- torchaudio/lib/_torchaudio_ffmpeg6.pyd +0 -0
- torchaudio/lib/libtorchaudio.pyd +0 -0
- torchaudio/lib/libtorchaudio_ffmpeg4.pyd +0 -0
- torchaudio/lib/libtorchaudio_ffmpeg5.pyd +0 -0
- torchaudio/lib/libtorchaudio_ffmpeg6.pyd +0 -0
- torchaudio/models/__init__.py +14 -0
- torchaudio/models/decoder/__init__.py +22 -7
- torchaudio/models/decoder/_ctc_decoder.py +123 -69
- torchaudio/models/decoder/_cuda_ctc_decoder.py +187 -0
- torchaudio/models/rnnt_decoder.py +10 -14
- torchaudio/models/squim/__init__.py +11 -0
- torchaudio/models/squim/objective.py +326 -0
- torchaudio/models/squim/subjective.py +150 -0
- torchaudio/models/wav2vec2/components.py +6 -10
- torchaudio/pipelines/__init__.py +9 -0
- torchaudio/pipelines/_squim_pipeline.py +176 -0
- torchaudio/pipelines/_wav2vec2/aligner.py +87 -0
- torchaudio/pipelines/_wav2vec2/impl.py +198 -68
- torchaudio/pipelines/_wav2vec2/utils.py +120 -0
- torchaudio/sox_effects/sox_effects.py +7 -30
- torchaudio/transforms/__init__.py +2 -0
- torchaudio/transforms/_transforms.py +99 -54
- torchaudio/utils/download.py +2 -2
- torchaudio/utils/ffmpeg_utils.py +20 -15
- torchaudio/utils/sox_utils.py +8 -9
- torchaudio/version.py +2 -2
- torchaudio-2.1.1.dist-info/METADATA +113 -0
- torchaudio-2.1.1.dist-info/RECORD +115 -0
- {torchaudio-2.0.2.dist-info → torchaudio-2.1.1.dist-info}/WHEEL +1 -1
- torchaudio/io/_compat.py +0 -241
- torchaudio/lib/_torchaudio_ffmpeg.pyd +0 -0
- torchaudio/lib/flashlight_lib_text_decoder.pyd +0 -0
- torchaudio/lib/flashlight_lib_text_dictionary.pyd +0 -0
- torchaudio/lib/libflashlight-text.pyd +0 -0
- torchaudio/lib/libtorchaudio_ffmpeg.pyd +0 -0
- torchaudio-2.0.2.dist-info/METADATA +0 -26
- torchaudio-2.0.2.dist-info/RECORD +0 -98
- {torchaudio-2.0.2.dist-info → torchaudio-2.1.1.dist-info}/LICENSE +0 -0
- {torchaudio-2.0.2.dist-info → torchaudio-2.1.1.dist-info}/top_level.txt +0 -0
torchaudio/_backend/utils.py
CHANGED
|
@@ -1,268 +1,25 @@
|
|
|
1
1
|
import os
|
|
2
|
-
import re
|
|
3
|
-
from abc import ABC, abstractmethod
|
|
4
2
|
from functools import lru_cache
|
|
5
|
-
from typing import BinaryIO, Dict, Optional, Tuple, Union
|
|
3
|
+
from typing import BinaryIO, Dict, Optional, Tuple, Type, Union
|
|
6
4
|
|
|
7
5
|
import torch
|
|
8
|
-
import torchaudio.backend.soundfile_backend as soundfile_backend
|
|
9
|
-
from torchaudio._extension import _FFMPEG_INITIALIZED, _SOX_INITIALIZED
|
|
10
|
-
from torchaudio.backend.common import AudioMetaData
|
|
11
6
|
|
|
12
|
-
|
|
13
|
-
|
|
7
|
+
from torchaudio._extension import _FFMPEG_EXT, _SOX_INITIALIZED
|
|
8
|
+
from torchaudio.io import CodecConfig
|
|
14
9
|
|
|
10
|
+
from . import soundfile_backend
|
|
15
11
|
|
|
16
|
-
|
|
17
|
-
|
|
18
|
-
|
|
19
|
-
|
|
20
|
-
|
|
21
|
-
|
|
22
|
-
@staticmethod
|
|
23
|
-
@abstractmethod
|
|
24
|
-
def load(
|
|
25
|
-
uri: Union[BinaryIO, str, os.PathLike],
|
|
26
|
-
frame_offset: int = 0,
|
|
27
|
-
num_frames: int = -1,
|
|
28
|
-
normalize: bool = True,
|
|
29
|
-
channels_first: bool = True,
|
|
30
|
-
format: Optional[str] = None,
|
|
31
|
-
buffer_size: int = 4096,
|
|
32
|
-
) -> Tuple[torch.Tensor, int]:
|
|
33
|
-
raise NotImplementedError
|
|
34
|
-
|
|
35
|
-
@staticmethod
|
|
36
|
-
@abstractmethod
|
|
37
|
-
def save(
|
|
38
|
-
uri: Union[BinaryIO, str, os.PathLike],
|
|
39
|
-
src: torch.Tensor,
|
|
40
|
-
sample_rate: int,
|
|
41
|
-
channels_first: bool = True,
|
|
42
|
-
format: Optional[str] = None,
|
|
43
|
-
encoding: Optional[str] = None,
|
|
44
|
-
bits_per_sample: Optional[int] = None,
|
|
45
|
-
buffer_size: int = 4096,
|
|
46
|
-
) -> None:
|
|
47
|
-
raise NotImplementedError
|
|
48
|
-
|
|
49
|
-
@staticmethod
|
|
50
|
-
@abstractmethod
|
|
51
|
-
def can_decode(uri: Union[BinaryIO, str, os.PathLike], format: Optional[str]) -> bool:
|
|
52
|
-
raise NotImplementedError
|
|
53
|
-
|
|
54
|
-
@staticmethod
|
|
55
|
-
@abstractmethod
|
|
56
|
-
def can_encode(uri: Union[BinaryIO, str, os.PathLike], format: Optional[str]) -> bool:
|
|
57
|
-
raise NotImplementedError
|
|
58
|
-
|
|
59
|
-
|
|
60
|
-
def _map_encoding(encoding: str) -> str:
|
|
61
|
-
for dst in ["PCM_S", "PCM_U", "PCM_F"]:
|
|
62
|
-
if dst in encoding:
|
|
63
|
-
return dst
|
|
64
|
-
if encoding == "PCM_MULAW":
|
|
65
|
-
return "ULAW"
|
|
66
|
-
elif encoding == "PCM_ALAW":
|
|
67
|
-
return "ALAW"
|
|
68
|
-
return encoding
|
|
69
|
-
|
|
70
|
-
|
|
71
|
-
def _get_bits_per_sample(encoding: str, bits_per_sample: int) -> str:
|
|
72
|
-
if m := re.search(r"PCM_\w(\d+)\w*", encoding):
|
|
73
|
-
return int(m.group(1))
|
|
74
|
-
elif encoding in ["PCM_ALAW", "PCM_MULAW"]:
|
|
75
|
-
return 8
|
|
76
|
-
return bits_per_sample
|
|
77
|
-
|
|
78
|
-
|
|
79
|
-
class FFmpegBackend(Backend):
|
|
80
|
-
@staticmethod
|
|
81
|
-
def info(uri: Union[BinaryIO, str, os.PathLike], format: Optional[str], buffer_size: int = 4096) -> AudioMetaData:
|
|
82
|
-
if hasattr(uri, "read"):
|
|
83
|
-
metadata = info_audio_fileobj(uri, format, buffer_size=buffer_size)
|
|
84
|
-
else:
|
|
85
|
-
metadata = info_audio(os.path.normpath(uri), format)
|
|
86
|
-
metadata.bits_per_sample = _get_bits_per_sample(metadata.encoding, metadata.bits_per_sample)
|
|
87
|
-
metadata.encoding = _map_encoding(metadata.encoding)
|
|
88
|
-
return metadata
|
|
89
|
-
|
|
90
|
-
@staticmethod
|
|
91
|
-
def load(
|
|
92
|
-
uri: Union[BinaryIO, str, os.PathLike],
|
|
93
|
-
frame_offset: int = 0,
|
|
94
|
-
num_frames: int = -1,
|
|
95
|
-
normalize: bool = True,
|
|
96
|
-
channels_first: bool = True,
|
|
97
|
-
format: Optional[str] = None,
|
|
98
|
-
buffer_size: int = 4096,
|
|
99
|
-
) -> Tuple[torch.Tensor, int]:
|
|
100
|
-
if hasattr(uri, "read"):
|
|
101
|
-
return load_audio_fileobj(
|
|
102
|
-
uri,
|
|
103
|
-
frame_offset,
|
|
104
|
-
num_frames,
|
|
105
|
-
normalize,
|
|
106
|
-
channels_first,
|
|
107
|
-
format,
|
|
108
|
-
buffer_size,
|
|
109
|
-
)
|
|
110
|
-
else:
|
|
111
|
-
return load_audio(os.path.normpath(uri), frame_offset, num_frames, normalize, channels_first, format)
|
|
112
|
-
|
|
113
|
-
@staticmethod
|
|
114
|
-
def save(
|
|
115
|
-
uri: Union[BinaryIO, str, os.PathLike],
|
|
116
|
-
src: torch.Tensor,
|
|
117
|
-
sample_rate: int,
|
|
118
|
-
channels_first: bool = True,
|
|
119
|
-
format: Optional[str] = None,
|
|
120
|
-
encoding: Optional[str] = None,
|
|
121
|
-
bits_per_sample: Optional[int] = None,
|
|
122
|
-
buffer_size: int = 4096,
|
|
123
|
-
) -> None:
|
|
124
|
-
save_audio(
|
|
125
|
-
uri,
|
|
126
|
-
src,
|
|
127
|
-
sample_rate,
|
|
128
|
-
channels_first,
|
|
129
|
-
format,
|
|
130
|
-
encoding,
|
|
131
|
-
bits_per_sample,
|
|
132
|
-
buffer_size,
|
|
133
|
-
)
|
|
134
|
-
|
|
135
|
-
@staticmethod
|
|
136
|
-
def can_decode(uri: Union[BinaryIO, str, os.PathLike], format: Optional[str]) -> bool:
|
|
137
|
-
return True
|
|
138
|
-
|
|
139
|
-
@staticmethod
|
|
140
|
-
def can_encode(uri: Union[BinaryIO, str, os.PathLike], format: Optional[str]) -> bool:
|
|
141
|
-
return True
|
|
142
|
-
|
|
143
|
-
|
|
144
|
-
class SoXBackend(Backend):
|
|
145
|
-
@staticmethod
|
|
146
|
-
def info(uri: Union[BinaryIO, str, os.PathLike], format: Optional[str], buffer_size: int = 4096) -> AudioMetaData:
|
|
147
|
-
if hasattr(uri, "read"):
|
|
148
|
-
raise ValueError(
|
|
149
|
-
"SoX backend does not support reading from file-like objects. ",
|
|
150
|
-
"Please use an alternative backend that does support reading from file-like objects, e.g. FFmpeg.",
|
|
151
|
-
)
|
|
152
|
-
else:
|
|
153
|
-
sinfo = torch.ops.torchaudio.sox_io_get_info(uri, format)
|
|
154
|
-
if sinfo:
|
|
155
|
-
return AudioMetaData(*sinfo)
|
|
156
|
-
else:
|
|
157
|
-
raise RuntimeError(f"Failed to fetch metadata for {uri}.")
|
|
158
|
-
|
|
159
|
-
@staticmethod
|
|
160
|
-
def load(
|
|
161
|
-
uri: Union[BinaryIO, str, os.PathLike],
|
|
162
|
-
frame_offset: int = 0,
|
|
163
|
-
num_frames: int = -1,
|
|
164
|
-
normalize: bool = True,
|
|
165
|
-
channels_first: bool = True,
|
|
166
|
-
format: Optional[str] = None,
|
|
167
|
-
buffer_size: int = 4096,
|
|
168
|
-
) -> Tuple[torch.Tensor, int]:
|
|
169
|
-
if hasattr(uri, "read"):
|
|
170
|
-
raise ValueError(
|
|
171
|
-
"SoX backend does not support loading from file-like objects. ",
|
|
172
|
-
"Please use an alternative backend that does support loading from file-like objects, e.g. FFmpeg.",
|
|
173
|
-
)
|
|
174
|
-
else:
|
|
175
|
-
ret = torch.ops.torchaudio.sox_io_load_audio_file(
|
|
176
|
-
uri, frame_offset, num_frames, normalize, channels_first, format
|
|
177
|
-
)
|
|
178
|
-
if not ret:
|
|
179
|
-
raise RuntimeError(f"Failed to load audio from {uri}.")
|
|
180
|
-
return ret
|
|
181
|
-
|
|
182
|
-
@staticmethod
|
|
183
|
-
def save(
|
|
184
|
-
uri: Union[BinaryIO, str, os.PathLike],
|
|
185
|
-
src: torch.Tensor,
|
|
186
|
-
sample_rate: int,
|
|
187
|
-
channels_first: bool = True,
|
|
188
|
-
format: Optional[str] = None,
|
|
189
|
-
encoding: Optional[str] = None,
|
|
190
|
-
bits_per_sample: Optional[int] = None,
|
|
191
|
-
buffer_size: int = 4096,
|
|
192
|
-
) -> None:
|
|
193
|
-
if hasattr(uri, "write"):
|
|
194
|
-
raise ValueError(
|
|
195
|
-
"SoX backend does not support writing to file-like objects. ",
|
|
196
|
-
"Please use an alternative backend that does support writing to file-like objects, e.g. FFmpeg.",
|
|
197
|
-
)
|
|
198
|
-
else:
|
|
199
|
-
torch.ops.torchaudio.sox_io_save_audio_file(
|
|
200
|
-
uri,
|
|
201
|
-
src,
|
|
202
|
-
sample_rate,
|
|
203
|
-
channels_first,
|
|
204
|
-
None,
|
|
205
|
-
format,
|
|
206
|
-
encoding,
|
|
207
|
-
bits_per_sample,
|
|
208
|
-
)
|
|
209
|
-
|
|
210
|
-
@staticmethod
|
|
211
|
-
def can_decode(uri: Union[BinaryIO, str, os.PathLike], format: Optional[str]) -> bool:
|
|
212
|
-
# i.e. not a file-like object.
|
|
213
|
-
return not hasattr(uri, "read")
|
|
214
|
-
|
|
215
|
-
@staticmethod
|
|
216
|
-
def can_encode(uri: Union[BinaryIO, str, os.PathLike], format: Optional[str]) -> bool:
|
|
217
|
-
# i.e. not a file-like object.
|
|
218
|
-
return not hasattr(uri, "write")
|
|
219
|
-
|
|
220
|
-
|
|
221
|
-
class SoundfileBackend(Backend):
|
|
222
|
-
@abstractmethod
|
|
223
|
-
def info(uri: Union[BinaryIO, str, os.PathLike], format: Optional[str], buffer_size: int = 4096) -> AudioMetaData:
|
|
224
|
-
return soundfile_backend.info(uri, format)
|
|
225
|
-
|
|
226
|
-
@abstractmethod
|
|
227
|
-
def load(
|
|
228
|
-
uri: Union[BinaryIO, str, os.PathLike],
|
|
229
|
-
frame_offset: int = 0,
|
|
230
|
-
num_frames: int = -1,
|
|
231
|
-
normalize: bool = True,
|
|
232
|
-
channels_first: bool = True,
|
|
233
|
-
format: Optional[str] = None,
|
|
234
|
-
buffer_size: int = 4096,
|
|
235
|
-
) -> Tuple[torch.Tensor, int]:
|
|
236
|
-
return soundfile_backend.load(uri, frame_offset, num_frames, normalize, channels_first, format)
|
|
237
|
-
|
|
238
|
-
@abstractmethod
|
|
239
|
-
def save(
|
|
240
|
-
uri: Union[BinaryIO, str, os.PathLike],
|
|
241
|
-
src: torch.Tensor,
|
|
242
|
-
sample_rate: int,
|
|
243
|
-
channels_first: bool = True,
|
|
244
|
-
format: Optional[str] = None,
|
|
245
|
-
encoding: Optional[str] = None,
|
|
246
|
-
bits_per_sample: Optional[int] = None,
|
|
247
|
-
buffer_size: int = 4096,
|
|
248
|
-
) -> None:
|
|
249
|
-
soundfile_backend.save(
|
|
250
|
-
uri, src, sample_rate, channels_first, format=format, encoding=encoding, bits_per_sample=bits_per_sample
|
|
251
|
-
)
|
|
252
|
-
|
|
253
|
-
@abstractmethod
|
|
254
|
-
def can_decode(uri, format) -> bool:
|
|
255
|
-
return True
|
|
256
|
-
|
|
257
|
-
@abstractmethod
|
|
258
|
-
def can_encode(uri, format) -> bool:
|
|
259
|
-
return True
|
|
12
|
+
from .backend import Backend
|
|
13
|
+
from .common import AudioMetaData
|
|
14
|
+
from .ffmpeg import FFmpegBackend
|
|
15
|
+
from .soundfile import SoundfileBackend
|
|
16
|
+
from .sox import SoXBackend
|
|
260
17
|
|
|
261
18
|
|
|
262
19
|
@lru_cache(None)
|
|
263
|
-
def get_available_backends() -> Dict[str, Backend]:
|
|
264
|
-
backend_specs = {}
|
|
265
|
-
if
|
|
20
|
+
def get_available_backends() -> Dict[str, Type[Backend]]:
|
|
21
|
+
backend_specs: Dict[str, Type[Backend]] = {}
|
|
22
|
+
if _FFMPEG_EXT is not None:
|
|
266
23
|
backend_specs["ffmpeg"] = FFmpegBackend
|
|
267
24
|
if _SOX_INITIALIZED:
|
|
268
25
|
backend_specs["sox"] = SoXBackend
|
|
@@ -303,19 +60,19 @@ def get_info_func():
|
|
|
303
60
|
) -> AudioMetaData:
|
|
304
61
|
"""Get signal information of an audio file.
|
|
305
62
|
|
|
63
|
+
Note:
|
|
64
|
+
When the input type is file-like object, this function cannot
|
|
65
|
+
get the correct length (``num_samples``) for certain formats,
|
|
66
|
+
such as ``vorbis``.
|
|
67
|
+
In this case, the value of ``num_samples`` is ``0``.
|
|
68
|
+
|
|
306
69
|
Args:
|
|
307
70
|
uri (path-like object or file-like object):
|
|
308
71
|
Source of audio data. The following types are accepted:
|
|
309
72
|
|
|
310
|
-
|
|
311
|
-
|
|
312
|
-
|
|
313
|
-
|
|
314
|
-
Note:
|
|
315
|
-
When the input type is file-like object, this function cannot
|
|
316
|
-
get the correct length (``num_samples``) for certain formats,
|
|
317
|
-
such as ``vorbis``.
|
|
318
|
-
In this case, the value of ``num_samples`` is ``0``.
|
|
73
|
+
* ``path-like``: File path or URL.
|
|
74
|
+
* ``file-like``: Object with ``read(size: int) -> bytes`` method,
|
|
75
|
+
which returns byte string of at most ``size`` length.
|
|
319
76
|
|
|
320
77
|
format (str or None, optional):
|
|
321
78
|
If not ``None``, interpreted as hint that may allow backend to override the detected format.
|
|
@@ -325,12 +82,17 @@ def get_info_func():
|
|
|
325
82
|
Size of buffer to use when processing file-like objects, in bytes. (Default: ``4096``)
|
|
326
83
|
|
|
327
84
|
backend (str or None, optional):
|
|
328
|
-
I/O backend to use.
|
|
329
|
-
|
|
85
|
+
I/O backend to use.
|
|
86
|
+
If ``None``, function selects backend given input and available backends.
|
|
87
|
+
Otherwise, must be one of [``"ffmpeg"``, ``"sox"``, ``"soundfile"``],
|
|
88
|
+
with the corresponding backend available.
|
|
330
89
|
(Default: ``None``)
|
|
331
90
|
|
|
91
|
+
.. seealso::
|
|
92
|
+
:ref:`backend`
|
|
93
|
+
|
|
332
94
|
Returns:
|
|
333
|
-
AudioMetaData
|
|
95
|
+
AudioMetaData
|
|
334
96
|
"""
|
|
335
97
|
backend = dispatcher(uri, format, backend)
|
|
336
98
|
return backend.info(uri, format, buffer_size)
|
|
@@ -362,27 +124,19 @@ def get_load_func():
|
|
|
362
124
|
buffer_size: int = 4096,
|
|
363
125
|
backend: Optional[str] = None,
|
|
364
126
|
) -> Tuple[torch.Tensor, int]:
|
|
365
|
-
"""Load audio data from
|
|
366
|
-
|
|
367
|
-
Note:
|
|
368
|
-
The formats this function can handle depend on backend availability.
|
|
369
|
-
This function is tested on the following formats:
|
|
370
|
-
|
|
371
|
-
* WAV
|
|
372
|
-
|
|
373
|
-
* 32-bit floating-point
|
|
374
|
-
* 32-bit signed integer
|
|
375
|
-
* 24-bit signed integer
|
|
376
|
-
* 16-bit signed integer
|
|
377
|
-
* 8-bit unsigned integer
|
|
378
|
-
|
|
379
|
-
* FLAC
|
|
380
|
-
* OGG/VORBIS
|
|
381
|
-
* SPHERE
|
|
127
|
+
"""Load audio data from source.
|
|
382
128
|
|
|
383
129
|
By default (``normalize=True``, ``channels_first=True``), this function returns Tensor with
|
|
384
130
|
``float32`` dtype, and the shape of `[channel, time]`.
|
|
385
131
|
|
|
132
|
+
Note:
|
|
133
|
+
The formats this function can handle depend on the availability of backends.
|
|
134
|
+
Please use the following functions to fetch the supported formats.
|
|
135
|
+
|
|
136
|
+
- FFmpeg: :py:func:`torchaudio.utils.ffmpeg_utils.get_audio_decoders`
|
|
137
|
+
- Sox: :py:func:`torchaudio.utils.sox_utils.list_read_formats`
|
|
138
|
+
- SoundFile: Refer to `the official document <https://pysoundfile.readthedocs.io/>`__.
|
|
139
|
+
|
|
386
140
|
.. warning::
|
|
387
141
|
|
|
388
142
|
``normalize`` argument does not perform volume normalization.
|
|
@@ -432,9 +186,13 @@ def get_load_func():
|
|
|
432
186
|
Size of buffer to use when processing file-like objects, in bytes. (Default: ``4096``)
|
|
433
187
|
|
|
434
188
|
backend (str or None, optional):
|
|
435
|
-
I/O backend to use.
|
|
436
|
-
|
|
437
|
-
|
|
189
|
+
I/O backend to use.
|
|
190
|
+
If ``None``, function selects backend given input and available backends.
|
|
191
|
+
Otherwise, must be one of [``"ffmpeg"``, ``"sox"``, ``"soundfile"``],
|
|
192
|
+
with the corresponding backend being available. (Default: ``None``)
|
|
193
|
+
|
|
194
|
+
.. seealso::
|
|
195
|
+
:ref:`backend`
|
|
438
196
|
|
|
439
197
|
Returns:
|
|
440
198
|
(torch.Tensor, int): Resulting Tensor and sample rate.
|
|
@@ -472,22 +230,17 @@ def get_save_func():
|
|
|
472
230
|
bits_per_sample: Optional[int] = None,
|
|
473
231
|
buffer_size: int = 4096,
|
|
474
232
|
backend: Optional[str] = None,
|
|
233
|
+
compression: Optional[Union[CodecConfig, float, int]] = None,
|
|
475
234
|
):
|
|
476
235
|
"""Save audio data to file.
|
|
477
236
|
|
|
478
237
|
Note:
|
|
479
238
|
The formats this function can handle depend on the availability of backends.
|
|
480
|
-
|
|
239
|
+
Please use the following functions to fetch the supported formats.
|
|
481
240
|
|
|
482
|
-
|
|
483
|
-
|
|
484
|
-
|
|
485
|
-
* 32-bit signed integer
|
|
486
|
-
* 16-bit signed integer
|
|
487
|
-
* 8-bit unsigned integer
|
|
488
|
-
|
|
489
|
-
* FLAC
|
|
490
|
-
* OGG/VORBIS
|
|
241
|
+
- FFmpeg: :py:func:`torchaudio.utils.ffmpeg_utils.get_audio_encoders`
|
|
242
|
+
- Sox: :py:func:`torchaudio.utils.sox_utils.list_write_formats`
|
|
243
|
+
- SoundFile: Refer to `the official document <https://pysoundfile.readthedocs.io/>`__.
|
|
491
244
|
|
|
492
245
|
Args:
|
|
493
246
|
uri (str or pathlib.Path): Path to audio file.
|
|
@@ -508,11 +261,11 @@ def get_save_func():
|
|
|
508
261
|
This argument is effective only for supported formats, i.e.
|
|
509
262
|
``"wav"`` and ``""flac"```. Valid values are
|
|
510
263
|
|
|
511
|
-
|
|
512
|
-
|
|
513
|
-
|
|
514
|
-
|
|
515
|
-
|
|
264
|
+
- ``"PCM_S"`` (signed integer Linear PCM)
|
|
265
|
+
- ``"PCM_U"`` (unsigned integer Linear PCM)
|
|
266
|
+
- ``"PCM_F"`` (floating point PCM)
|
|
267
|
+
- ``"ULAW"`` (mu-law)
|
|
268
|
+
- ``"ALAW"`` (a-law)
|
|
516
269
|
|
|
517
270
|
bits_per_sample (int or None, optional): Changes the bit depth for the
|
|
518
271
|
supported formats.
|
|
@@ -524,35 +277,40 @@ def get_save_func():
|
|
|
524
277
|
Size of buffer to use when processing file-like objects, in bytes. (Default: ``4096``)
|
|
525
278
|
|
|
526
279
|
backend (str or None, optional):
|
|
527
|
-
I/O backend to use.
|
|
528
|
-
|
|
529
|
-
|
|
280
|
+
I/O backend to use.
|
|
281
|
+
If ``None``, function selects backend given input and available backends.
|
|
282
|
+
Otherwise, must be one of [``"ffmpeg"``, ``"sox"``, ``"soundfile"``],
|
|
283
|
+
with the corresponding backend being available.
|
|
284
|
+
(Default: ``None``)
|
|
530
285
|
|
|
286
|
+
.. seealso::
|
|
287
|
+
:ref:`backend`
|
|
531
288
|
|
|
289
|
+
compression (CodecConfig, float, int, or None, optional):
|
|
290
|
+
Compression configuration to apply.
|
|
532
291
|
|
|
533
|
-
|
|
292
|
+
If the selected backend is FFmpeg, an instance of :py:class:`CodecConfig` must be provided.
|
|
534
293
|
|
|
535
|
-
|
|
536
|
-
|
|
537
|
-
- 32-bit signed integer PCM
|
|
538
|
-
- 24-bit signed integer PCM
|
|
539
|
-
- 16-bit signed integer PCM
|
|
540
|
-
- 8-bit unsigned integer PCM
|
|
541
|
-
- 8-bit mu-law
|
|
542
|
-
- 8-bit a-law
|
|
294
|
+
Otherwise, if the selected backend is SoX, a float or int value corresponding to option ``-C`` of the
|
|
295
|
+
``sox`` command line interface must be provided. For instance:
|
|
543
296
|
|
|
544
|
-
|
|
545
|
-
|
|
546
|
-
|
|
297
|
+
``"mp3"``
|
|
298
|
+
Either bitrate (in ``kbps``) with quality factor, such as ``128.2``, or
|
|
299
|
+
VBR encoding with quality factor such as ``-4.2``. Default: ``-4.5``.
|
|
547
300
|
|
|
548
|
-
|
|
549
|
-
|
|
550
|
-
|
|
301
|
+
``"flac"``
|
|
302
|
+
Whole number from ``0`` to ``8``. ``8`` is default and highest compression.
|
|
303
|
+
|
|
304
|
+
``"ogg"``, ``"vorbis"``
|
|
305
|
+
Number from ``-1`` to ``10``; ``-1`` is the highest compression
|
|
306
|
+
and lowest quality. Default: ``3``.
|
|
307
|
+
|
|
308
|
+
Refer to http://sox.sourceforge.net/soxformat.html for more details.
|
|
551
309
|
|
|
552
|
-
``"ogg"``
|
|
553
|
-
- Doesn't accept changing configuration.
|
|
554
310
|
"""
|
|
555
311
|
backend = dispatcher(uri, format, backend)
|
|
556
|
-
return backend.save(
|
|
312
|
+
return backend.save(
|
|
313
|
+
uri, src, sample_rate, channels_first, format, encoding, bits_per_sample, buffer_size, compression
|
|
314
|
+
)
|
|
557
315
|
|
|
558
316
|
return save
|
|
@@ -2,9 +2,13 @@ import logging
|
|
|
2
2
|
import os
|
|
3
3
|
import sys
|
|
4
4
|
|
|
5
|
-
from torchaudio._internal.module_utils import fail_with_message, is_module_available, no_op
|
|
5
|
+
from torchaudio._internal.module_utils import eval_env, fail_with_message, is_module_available, no_op
|
|
6
6
|
|
|
7
|
-
|
|
7
|
+
try:
|
|
8
|
+
from .fb import _init_ffmpeg
|
|
9
|
+
except ImportError:
|
|
10
|
+
from .utils import _init_ffmpeg
|
|
11
|
+
from .utils import _check_cuda_version, _fail_since_no_ffmpeg, _fail_since_no_sox, _init_dll_path, _init_sox, _load_lib
|
|
8
12
|
|
|
9
13
|
_LG = logging.getLogger(__name__)
|
|
10
14
|
|
|
@@ -14,15 +18,13 @@ _LG = logging.getLogger(__name__)
|
|
|
14
18
|
# Builder uses it for debugging purpose, so we export it.
|
|
15
19
|
# https://github.com/pytorch/builder/blob/e2e4542b8eb0bdf491214451a1a4128bd606cce2/test/smoke_test/smoke_test.py#L80
|
|
16
20
|
__all__ = [
|
|
17
|
-
"fail_if_no_kaldi",
|
|
18
21
|
"fail_if_no_sox",
|
|
19
22
|
"fail_if_no_ffmpeg",
|
|
20
23
|
"_check_cuda_version",
|
|
21
24
|
"_IS_TORCHAUDIO_EXT_AVAILABLE",
|
|
22
|
-
"_IS_KALDI_AVAILABLE",
|
|
23
25
|
"_IS_RIR_AVAILABLE",
|
|
24
26
|
"_SOX_INITIALIZED",
|
|
25
|
-
"
|
|
27
|
+
"_FFMPEG_EXT",
|
|
26
28
|
]
|
|
27
29
|
|
|
28
30
|
|
|
@@ -34,11 +36,11 @@ if os.name == "nt" and (3, 8) <= sys.version_info < (3, 9):
|
|
|
34
36
|
# In case of an error, we do not catch the failure as it suggests there is something
|
|
35
37
|
# wrong with the installation.
|
|
36
38
|
_IS_TORCHAUDIO_EXT_AVAILABLE = is_module_available("torchaudio.lib._torchaudio")
|
|
37
|
-
#
|
|
39
|
+
# RIR features are implemented in _torchaudio extension, but they can be individually
|
|
38
40
|
# turned on/off at build time. Available means that _torchaudio is loaded properly, and
|
|
39
|
-
#
|
|
41
|
+
# RIR features are found there.
|
|
40
42
|
_IS_RIR_AVAILABLE = False
|
|
41
|
-
|
|
43
|
+
_IS_ALIGN_AVAILABLE = False
|
|
42
44
|
if _IS_TORCHAUDIO_EXT_AVAILABLE:
|
|
43
45
|
_load_lib("libtorchaudio")
|
|
44
46
|
|
|
@@ -46,26 +48,45 @@ if _IS_TORCHAUDIO_EXT_AVAILABLE:
|
|
|
46
48
|
|
|
47
49
|
_check_cuda_version()
|
|
48
50
|
_IS_RIR_AVAILABLE = torchaudio.lib._torchaudio.is_rir_available()
|
|
49
|
-
|
|
51
|
+
_IS_ALIGN_AVAILABLE = torchaudio.lib._torchaudio.is_align_available()
|
|
50
52
|
|
|
51
53
|
|
|
52
|
-
#
|
|
53
|
-
#
|
|
54
|
-
# Note: This will be change in the future when sox is dynamically linked.
|
|
55
|
-
# At that point, this initialization should handle the case where
|
|
56
|
-
# sox integration is built but libsox is not found.
|
|
54
|
+
# Initialize libsox-related features
|
|
57
55
|
_SOX_INITIALIZED = False
|
|
58
|
-
if
|
|
59
|
-
|
|
60
|
-
|
|
56
|
+
_USE_SOX = False if os.name == "nt" else eval_env("TORCHAUDIO_USE_SOX", True)
|
|
57
|
+
_SOX_MODULE_AVAILABLE = is_module_available("torchaudio.lib._torchaudio_sox")
|
|
58
|
+
if _USE_SOX and _SOX_MODULE_AVAILABLE:
|
|
59
|
+
try:
|
|
60
|
+
_init_sox()
|
|
61
|
+
_SOX_INITIALIZED = True
|
|
62
|
+
except Exception:
|
|
63
|
+
# The initialization of sox extension will fail if supported sox
|
|
64
|
+
# libraries are not found in the system.
|
|
65
|
+
# Since the rest of the torchaudio works without it, we do not report the
|
|
66
|
+
# error here.
|
|
67
|
+
# The error will be raised when user code attempts to use these features.
|
|
68
|
+
_LG.debug("Failed to initialize sox extension", exc_info=True)
|
|
69
|
+
|
|
70
|
+
|
|
71
|
+
if os.name == "nt":
|
|
72
|
+
fail_if_no_sox = fail_with_message("requires sox extension, which is not supported on Windows.")
|
|
73
|
+
elif not _USE_SOX:
|
|
74
|
+
fail_if_no_sox = fail_with_message("requires sox extension, but it is disabled. (TORCHAUDIO_USE_SOX=0)")
|
|
75
|
+
elif not _SOX_MODULE_AVAILABLE:
|
|
76
|
+
fail_if_no_sox = fail_with_message(
|
|
77
|
+
"requires sox extension, but TorchAudio is not compiled with it. "
|
|
78
|
+
"Please build TorchAudio with libsox support. (BUILD_SOX=1)"
|
|
79
|
+
)
|
|
80
|
+
else:
|
|
81
|
+
fail_if_no_sox = no_op if _SOX_INITIALIZED else _fail_since_no_sox
|
|
61
82
|
|
|
62
83
|
|
|
63
84
|
# Initialize FFmpeg-related features
|
|
64
|
-
|
|
65
|
-
|
|
85
|
+
_FFMPEG_EXT = None
|
|
86
|
+
_USE_FFMPEG = eval_env("TORCHAUDIO_USE_FFMPEG", True)
|
|
87
|
+
if _USE_FFMPEG and _IS_TORCHAUDIO_EXT_AVAILABLE:
|
|
66
88
|
try:
|
|
67
|
-
_init_ffmpeg()
|
|
68
|
-
_FFMPEG_INITIALIZED = True
|
|
89
|
+
_FFMPEG_EXT = _init_ffmpeg()
|
|
69
90
|
except Exception:
|
|
70
91
|
# The initialization of FFmpeg extension will fail if supported FFmpeg
|
|
71
92
|
# libraries are not found in the system.
|
|
@@ -75,22 +96,11 @@ if is_module_available("torchaudio.lib._torchaudio_ffmpeg"):
|
|
|
75
96
|
_LG.debug("Failed to initialize ffmpeg bindings", exc_info=True)
|
|
76
97
|
|
|
77
98
|
|
|
78
|
-
|
|
79
|
-
no_op
|
|
80
|
-
|
|
81
|
-
|
|
82
|
-
"requires kaldi extension, but TorchAudio is not compiled with it. Please build TorchAudio with kaldi support."
|
|
83
|
-
)
|
|
84
|
-
)
|
|
85
|
-
fail_if_no_sox = (
|
|
86
|
-
no_op
|
|
87
|
-
if _SOX_INITIALIZED
|
|
88
|
-
else fail_with_message(
|
|
89
|
-
"requires sox extension, but TorchAudio is not compiled with it. Please build TorchAudio with libsox support."
|
|
90
|
-
)
|
|
91
|
-
)
|
|
99
|
+
if _USE_FFMPEG:
|
|
100
|
+
fail_if_no_ffmpeg = _fail_since_no_ffmpeg if _FFMPEG_EXT is None else no_op
|
|
101
|
+
else:
|
|
102
|
+
fail_if_no_ffmpeg = fail_with_message("requires ffmpeg extension, but it is disabled. (TORCHAUDIO_USE_FFMPEG=0)")
|
|
92
103
|
|
|
93
|
-
fail_if_no_ffmpeg = no_op if _FFMPEG_INITIALIZED else _fail_since_no_ffmpeg
|
|
94
104
|
|
|
95
105
|
fail_if_no_rir = (
|
|
96
106
|
no_op
|
|
@@ -99,3 +109,12 @@ fail_if_no_rir = (
|
|
|
99
109
|
"requires RIR extension, but TorchAudio is not compiled with it. Please build TorchAudio with RIR support."
|
|
100
110
|
)
|
|
101
111
|
)
|
|
112
|
+
|
|
113
|
+
fail_if_no_align = (
|
|
114
|
+
no_op
|
|
115
|
+
if _IS_ALIGN_AVAILABLE
|
|
116
|
+
else fail_with_message(
|
|
117
|
+
"Requires alignment extension, but TorchAudio is not compiled with it. \
|
|
118
|
+
Please build TorchAudio with alignment support."
|
|
119
|
+
)
|
|
120
|
+
)
|