torchaudio 2.0.2__cp311-cp311-manylinux2014_aarch64.whl → 2.1.1__cp311-cp311-manylinux2014_aarch64.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Potentially problematic release.
This version of torchaudio might be problematic. Click here for more details.
- torchaudio/__init__.py +22 -3
- torchaudio/_backend/__init__.py +55 -4
- torchaudio/_backend/backend.py +53 -0
- torchaudio/_backend/common.py +52 -0
- torchaudio/_backend/ffmpeg.py +373 -0
- torchaudio/_backend/soundfile.py +54 -0
- torchaudio/_backend/soundfile_backend.py +457 -0
- torchaudio/_backend/sox.py +91 -0
- torchaudio/_backend/utils.py +81 -323
- torchaudio/_extension/__init__.py +55 -36
- torchaudio/_extension/utils.py +109 -17
- torchaudio/_internal/__init__.py +4 -1
- torchaudio/_internal/module_utils.py +37 -6
- torchaudio/backend/__init__.py +7 -11
- torchaudio/backend/_no_backend.py +24 -0
- torchaudio/backend/_sox_io_backend.py +297 -0
- torchaudio/backend/common.py +12 -52
- torchaudio/backend/no_backend.py +11 -21
- torchaudio/backend/soundfile_backend.py +11 -448
- torchaudio/backend/sox_io_backend.py +11 -435
- torchaudio/backend/utils.py +9 -18
- torchaudio/datasets/__init__.py +2 -0
- torchaudio/datasets/cmuarctic.py +1 -1
- torchaudio/datasets/cmudict.py +61 -62
- torchaudio/datasets/dr_vctk.py +1 -1
- torchaudio/datasets/gtzan.py +1 -1
- torchaudio/datasets/librilight_limited.py +1 -1
- torchaudio/datasets/librispeech.py +1 -1
- torchaudio/datasets/librispeech_biasing.py +189 -0
- torchaudio/datasets/libritts.py +1 -1
- torchaudio/datasets/ljspeech.py +1 -1
- torchaudio/datasets/musdb_hq.py +1 -1
- torchaudio/datasets/quesst14.py +1 -1
- torchaudio/datasets/speechcommands.py +1 -1
- torchaudio/datasets/tedlium.py +1 -1
- torchaudio/datasets/vctk.py +1 -1
- torchaudio/datasets/voxceleb1.py +1 -1
- torchaudio/datasets/yesno.py +1 -1
- torchaudio/functional/__init__.py +6 -2
- torchaudio/functional/_alignment.py +128 -0
- torchaudio/functional/filtering.py +69 -92
- torchaudio/functional/functional.py +99 -148
- torchaudio/io/__init__.py +4 -1
- torchaudio/io/_effector.py +347 -0
- torchaudio/io/_stream_reader.py +158 -90
- torchaudio/io/_stream_writer.py +196 -10
- torchaudio/lib/_torchaudio.so +0 -0
- torchaudio/lib/_torchaudio_ffmpeg4.so +0 -0
- torchaudio/lib/_torchaudio_ffmpeg5.so +0 -0
- torchaudio/lib/_torchaudio_ffmpeg6.so +0 -0
- torchaudio/lib/_torchaudio_sox.so +0 -0
- torchaudio/lib/libtorchaudio.so +0 -0
- torchaudio/lib/libtorchaudio_ffmpeg4.so +0 -0
- torchaudio/lib/libtorchaudio_ffmpeg5.so +0 -0
- torchaudio/lib/libtorchaudio_ffmpeg6.so +0 -0
- torchaudio/lib/libtorchaudio_sox.so +0 -0
- torchaudio/models/__init__.py +14 -0
- torchaudio/models/decoder/__init__.py +22 -7
- torchaudio/models/decoder/_ctc_decoder.py +123 -69
- torchaudio/models/decoder/_cuda_ctc_decoder.py +187 -0
- torchaudio/models/rnnt_decoder.py +10 -14
- torchaudio/models/squim/__init__.py +11 -0
- torchaudio/models/squim/objective.py +326 -0
- torchaudio/models/squim/subjective.py +150 -0
- torchaudio/models/wav2vec2/components.py +6 -10
- torchaudio/pipelines/__init__.py +9 -0
- torchaudio/pipelines/_squim_pipeline.py +176 -0
- torchaudio/pipelines/_wav2vec2/aligner.py +87 -0
- torchaudio/pipelines/_wav2vec2/impl.py +198 -68
- torchaudio/pipelines/_wav2vec2/utils.py +120 -0
- torchaudio/sox_effects/sox_effects.py +7 -30
- torchaudio/transforms/__init__.py +2 -0
- torchaudio/transforms/_transforms.py +99 -54
- torchaudio/utils/download.py +2 -2
- torchaudio/utils/ffmpeg_utils.py +20 -15
- torchaudio/utils/sox_utils.py +8 -9
- torchaudio/version.py +2 -2
- torchaudio-2.1.1.dist-info/METADATA +113 -0
- torchaudio-2.1.1.dist-info/RECORD +117 -0
- {torchaudio-2.0.2.dist-info → torchaudio-2.1.1.dist-info}/WHEEL +1 -1
- torchaudio/io/_compat.py +0 -241
- torchaudio/lib/_torchaudio_ffmpeg.so +0 -0
- torchaudio/lib/flashlight_lib_text_decoder.so +0 -0
- torchaudio/lib/flashlight_lib_text_dictionary.so +0 -0
- torchaudio/lib/libflashlight-text.so +0 -0
- torchaudio/lib/libtorchaudio_ffmpeg.so +0 -0
- torchaudio-2.0.2.dist-info/METADATA +0 -30
- torchaudio-2.0.2.dist-info/RECORD +0 -100
- {torchaudio-2.0.2.dist-info → torchaudio-2.1.1.dist-info}/LICENSE +0 -0
- {torchaudio-2.0.2.dist-info → torchaudio-2.1.1.dist-info}/top_level.txt +0 -0
torchaudio/_extension/utils.py
CHANGED
|
@@ -6,15 +6,18 @@ Anything that depends on external state should happen in __init__.py
|
|
|
6
6
|
"""
|
|
7
7
|
|
|
8
8
|
|
|
9
|
+
import importlib
|
|
10
|
+
import logging
|
|
9
11
|
import os
|
|
12
|
+
import platform
|
|
13
|
+
import warnings
|
|
10
14
|
from functools import wraps
|
|
11
15
|
from pathlib import Path
|
|
12
16
|
|
|
13
17
|
import torch
|
|
14
|
-
|
|
15
18
|
import torchaudio
|
|
16
|
-
from torchaudio._internal.module_utils import is_module_available
|
|
17
19
|
|
|
20
|
+
_LG = logging.getLogger(__name__)
|
|
18
21
|
_LIB_DIR = Path(__file__).parent.parent / "lib"
|
|
19
22
|
|
|
20
23
|
|
|
@@ -67,7 +70,7 @@ def _init_sox():
|
|
|
67
70
|
_load_lib("libtorchaudio_sox")
|
|
68
71
|
import torchaudio.lib._torchaudio_sox # noqa
|
|
69
72
|
|
|
70
|
-
|
|
73
|
+
torchaudio.lib._torchaudio_sox.set_verbosity(0)
|
|
71
74
|
|
|
72
75
|
import atexit
|
|
73
76
|
|
|
@@ -75,22 +78,92 @@ def _init_sox():
|
|
|
75
78
|
atexit.register(torch.ops.torchaudio.sox_effects_shutdown_sox_effects)
|
|
76
79
|
|
|
77
80
|
|
|
78
|
-
def
|
|
79
|
-
|
|
80
|
-
|
|
81
|
-
|
|
82
|
-
|
|
81
|
+
def _try_access_avutil(ffmpeg_ver):
|
|
82
|
+
libname_template = {
|
|
83
|
+
"Linux": "libavutil.so.{ver}",
|
|
84
|
+
"Darwin": "libavutil.{ver}.dylib",
|
|
85
|
+
"Windows": "avutil-{ver}.dll",
|
|
86
|
+
}[platform.system()]
|
|
87
|
+
avutil_ver = {"6": 58, "5": 57, "4": 56}[ffmpeg_ver]
|
|
88
|
+
libavutil = libname_template.format(ver=avutil_ver)
|
|
89
|
+
torchaudio.lib._torchaudio.find_avutil(libavutil)
|
|
90
|
+
|
|
91
|
+
|
|
92
|
+
def _find_versionsed_ffmpeg_extension(ffmpeg_ver: str):
|
|
93
|
+
_LG.debug("Attempting to load FFmpeg version %s.", ffmpeg_ver)
|
|
94
|
+
|
|
95
|
+
library = f"libtorchaudio_ffmpeg{ffmpeg_ver}"
|
|
96
|
+
extension = f"_torchaudio_ffmpeg{ffmpeg_ver}"
|
|
97
|
+
|
|
98
|
+
if not _get_lib_path(extension).exists():
|
|
99
|
+
raise RuntimeError(f"FFmpeg {ffmpeg_ver} extension is not available.")
|
|
100
|
+
|
|
101
|
+
if ffmpeg_ver:
|
|
102
|
+
# A simple check for FFmpeg availability.
|
|
103
|
+
# This is not technically sufficient as other libraries could be missing,
|
|
104
|
+
# but usually this is sufficient.
|
|
105
|
+
#
|
|
106
|
+
# Note: the reason why this check is performed is because I don't know
|
|
107
|
+
# if the next `_load_lib` (which calls `ctypes.CDLL` under the hood),
|
|
108
|
+
# could leak handle to shared libraries of dependencies, in case it fails.
|
|
109
|
+
#
|
|
110
|
+
# i.e. If the `ctypes.CDLL("foo")` fails because one of `foo`'s dependency
|
|
111
|
+
# does not exist while `foo` and some other dependencies exist, is it guaranteed
|
|
112
|
+
# that none of them are kept in memory after the failure?
|
|
113
|
+
_try_access_avutil(ffmpeg_ver)
|
|
114
|
+
|
|
115
|
+
_load_lib(library)
|
|
83
116
|
|
|
84
|
-
|
|
85
|
-
|
|
86
|
-
except OSError as err:
|
|
87
|
-
raise ImportError("FFmpeg libraries are not found. Please install FFmpeg.") from err
|
|
117
|
+
_LG.debug("Found FFmpeg version %s.", ffmpeg_ver)
|
|
118
|
+
return importlib.import_module(f"torchaudio.lib.{extension}")
|
|
88
119
|
|
|
89
|
-
import torchaudio.lib._torchaudio_ffmpeg # noqa
|
|
90
120
|
|
|
91
|
-
|
|
92
|
-
|
|
93
|
-
|
|
121
|
+
_FFMPEG_VERS = ["6", "5", "4", ""]
|
|
122
|
+
|
|
123
|
+
|
|
124
|
+
def _find_ffmpeg_extension(ffmpeg_vers, show_error):
|
|
125
|
+
logger = _LG.error if show_error else _LG.debug
|
|
126
|
+
for ffmpeg_ver in ffmpeg_vers:
|
|
127
|
+
try:
|
|
128
|
+
return _find_versionsed_ffmpeg_extension(ffmpeg_ver)
|
|
129
|
+
except Exception:
|
|
130
|
+
logger("Failed to load FFmpeg %s extension.", ffmpeg_ver, exc_info=True)
|
|
131
|
+
continue
|
|
132
|
+
raise ImportError(f"Failed to intialize FFmpeg extension. Tried versions: {ffmpeg_vers}")
|
|
133
|
+
|
|
134
|
+
|
|
135
|
+
def _find_available_ffmpeg_ext():
|
|
136
|
+
ffmpeg_vers = ["6", "5", "4", ""]
|
|
137
|
+
return [v for v in ffmpeg_vers if _get_lib_path(f"_torchaudio_ffmpeg{v}").exists()]
|
|
138
|
+
|
|
139
|
+
|
|
140
|
+
def _init_ffmpeg(show_error=False):
|
|
141
|
+
ffmpeg_vers = _find_available_ffmpeg_ext()
|
|
142
|
+
if not ffmpeg_vers:
|
|
143
|
+
raise RuntimeError(
|
|
144
|
+
# fmt: off
|
|
145
|
+
"TorchAudio is not built with FFmpeg integration. "
|
|
146
|
+
"Please build torchaudio with USE_FFMPEG=1."
|
|
147
|
+
# fmt: on
|
|
148
|
+
)
|
|
149
|
+
|
|
150
|
+
# User override
|
|
151
|
+
if ffmpeg_ver := os.environ.get("TORCHAUDIO_USE_FFMPEG_VERSION"):
|
|
152
|
+
if ffmpeg_vers == [""]:
|
|
153
|
+
warnings.warn("TorchAudio is built in single FFmpeg mode. TORCHAUDIO_USE_FFMPEG_VERSION is ignored.")
|
|
154
|
+
else:
|
|
155
|
+
if ffmpeg_ver not in ffmpeg_vers:
|
|
156
|
+
raise ValueError(
|
|
157
|
+
f"The FFmpeg version {ffmpeg_ver} (read from TORCHAUDIO_USE_FFMPEG_VERSION) "
|
|
158
|
+
f"is not available. Available versions are {[v for v in ffmpeg_vers if v]}"
|
|
159
|
+
)
|
|
160
|
+
ffmpeg_vers = [ffmpeg_ver]
|
|
161
|
+
|
|
162
|
+
ext = _find_ffmpeg_extension(ffmpeg_vers, show_error)
|
|
163
|
+
ext.init()
|
|
164
|
+
if ext.get_log_level() > 8:
|
|
165
|
+
ext.set_log_level(8)
|
|
166
|
+
return ext
|
|
94
167
|
|
|
95
168
|
|
|
96
169
|
def _init_dll_path():
|
|
@@ -124,6 +197,25 @@ def _check_cuda_version():
|
|
|
124
197
|
return version
|
|
125
198
|
|
|
126
199
|
|
|
200
|
+
def _fail_since_no_sox(func):
|
|
201
|
+
@wraps(func)
|
|
202
|
+
def wrapped(*_args, **_kwargs):
|
|
203
|
+
try:
|
|
204
|
+
# Note:
|
|
205
|
+
# We run _init_sox again just to show users the stacktrace.
|
|
206
|
+
# _init_sox would not succeed here.
|
|
207
|
+
_init_sox()
|
|
208
|
+
except Exception as err:
|
|
209
|
+
raise RuntimeError(
|
|
210
|
+
f"{func.__name__} requires sox extension which is not available. "
|
|
211
|
+
"Please refer to the stacktrace above for how to resolve this."
|
|
212
|
+
) from err
|
|
213
|
+
# This should not happen in normal execution, but just in case.
|
|
214
|
+
return func(*_args, **_kwargs)
|
|
215
|
+
|
|
216
|
+
return wrapped
|
|
217
|
+
|
|
218
|
+
|
|
127
219
|
def _fail_since_no_ffmpeg(func):
|
|
128
220
|
@wraps(func)
|
|
129
221
|
def wrapped(*_args, **_kwargs):
|
|
@@ -131,7 +223,7 @@ def _fail_since_no_ffmpeg(func):
|
|
|
131
223
|
# Note:
|
|
132
224
|
# We run _init_ffmpeg again just to show users the stacktrace.
|
|
133
225
|
# _init_ffmpeg would not succeed here.
|
|
134
|
-
_init_ffmpeg()
|
|
226
|
+
_init_ffmpeg(show_error=True)
|
|
135
227
|
except Exception as err:
|
|
136
228
|
raise RuntimeError(
|
|
137
229
|
f"{func.__name__} requires FFmpeg extension which is not available. "
|
torchaudio/_internal/__init__.py
CHANGED
|
@@ -1,9 +1,29 @@
|
|
|
1
1
|
import importlib.util
|
|
2
|
+
import os
|
|
2
3
|
import warnings
|
|
3
4
|
from functools import wraps
|
|
4
5
|
from typing import Optional
|
|
5
6
|
|
|
6
7
|
|
|
8
|
+
def eval_env(var, default):
|
|
9
|
+
"""Check if environment variable has True-y value"""
|
|
10
|
+
if var not in os.environ:
|
|
11
|
+
return default
|
|
12
|
+
|
|
13
|
+
val = os.environ.get(var, "0")
|
|
14
|
+
trues = ["1", "true", "TRUE", "on", "ON", "yes", "YES"]
|
|
15
|
+
falses = ["0", "false", "FALSE", "off", "OFF", "no", "NO"]
|
|
16
|
+
if val in trues:
|
|
17
|
+
return True
|
|
18
|
+
if val not in falses:
|
|
19
|
+
# fmt: off
|
|
20
|
+
raise RuntimeError(
|
|
21
|
+
f"Unexpected environment variable value `{var}={val}`. "
|
|
22
|
+
f"Expected one of {trues + falses}")
|
|
23
|
+
# fmt: on
|
|
24
|
+
return False
|
|
25
|
+
|
|
26
|
+
|
|
7
27
|
def is_module_available(*modules: str) -> bool:
|
|
8
28
|
r"""Returns if a top-level module with :attr:`name` exists *without*
|
|
9
29
|
importing it. This is generally safer than try-catch block around a
|
|
@@ -40,25 +60,36 @@ def requires_module(*modules: str):
|
|
|
40
60
|
return decorator
|
|
41
61
|
|
|
42
62
|
|
|
43
|
-
def deprecated(direction: str, version: Optional[str] = None):
|
|
63
|
+
def deprecated(direction: str, version: Optional[str] = None, remove: bool = False):
|
|
44
64
|
"""Decorator to add deprecation message
|
|
45
65
|
|
|
46
66
|
Args:
|
|
47
67
|
direction (str): Migration steps to be given to users.
|
|
48
68
|
version (str or int): The version when the object will be removed
|
|
69
|
+
remove (bool): If enabled, append future removal message.
|
|
49
70
|
"""
|
|
50
71
|
|
|
51
72
|
def decorator(func):
|
|
52
73
|
@wraps(func)
|
|
53
74
|
def wrapped(*args, **kwargs):
|
|
54
|
-
message =
|
|
55
|
-
|
|
56
|
-
f'
|
|
57
|
-
f"{direction}"
|
|
58
|
-
)
|
|
75
|
+
message = f"{func.__module__}.{func.__name__} has been deprecated. {direction}"
|
|
76
|
+
if remove:
|
|
77
|
+
message += f' It will be removed from {"future" if version is None else version} release. '
|
|
59
78
|
warnings.warn(message, stacklevel=2)
|
|
60
79
|
return func(*args, **kwargs)
|
|
61
80
|
|
|
81
|
+
message = "This function has been deprecated. "
|
|
82
|
+
if remove:
|
|
83
|
+
message += f'It will be removed from {"future" if version is None else version} release. '
|
|
84
|
+
|
|
85
|
+
wrapped.__doc__ = f"""DEPRECATED: {func.__doc__}
|
|
86
|
+
|
|
87
|
+
.. warning::
|
|
88
|
+
|
|
89
|
+
{message}
|
|
90
|
+
{direction}
|
|
91
|
+
"""
|
|
92
|
+
|
|
62
93
|
return wrapped
|
|
63
94
|
|
|
64
95
|
return decorator
|
torchaudio/backend/__init__.py
CHANGED
|
@@ -1,14 +1,10 @@
|
|
|
1
|
-
#
|
|
2
|
-
|
|
1
|
+
# NOTE:
|
|
2
|
+
# The entire `torchaudio.backend` module is deprecated.
|
|
3
|
+
# New things should be added to `torchaudio._backend`.
|
|
4
|
+
# Only things related to backward compatibility should be placed here.
|
|
3
5
|
|
|
4
|
-
from . import utils
|
|
5
|
-
from .utils import _is_backend_dispatcher_enabled, get_audio_backend, list_audio_backends, set_audio_backend
|
|
6
6
|
|
|
7
|
-
|
|
8
|
-
|
|
7
|
+
from . import common, no_backend, soundfile_backend, sox_io_backend # noqa
|
|
8
|
+
from .utils import _init_backend, get_audio_backend, list_audio_backends, set_audio_backend
|
|
9
9
|
|
|
10
|
-
|
|
11
|
-
torchaudio.load = get_load_func()
|
|
12
|
-
torchaudio.save = get_save_func()
|
|
13
|
-
else:
|
|
14
|
-
utils._init_audio_backend()
|
|
10
|
+
__all__ = ["_init_backend", "get_audio_backend", "list_audio_backends", "set_audio_backend"]
|
|
@@ -0,0 +1,24 @@
|
|
|
1
|
+
from pathlib import Path
|
|
2
|
+
from typing import Callable, Optional, Tuple, Union
|
|
3
|
+
|
|
4
|
+
from torch import Tensor
|
|
5
|
+
|
|
6
|
+
|
|
7
|
+
def load(
|
|
8
|
+
filepath: Union[str, Path],
|
|
9
|
+
out: Optional[Tensor] = None,
|
|
10
|
+
normalization: Union[bool, float, Callable] = True,
|
|
11
|
+
channels_first: bool = True,
|
|
12
|
+
num_frames: int = 0,
|
|
13
|
+
offset: int = 0,
|
|
14
|
+
filetype: Optional[str] = None,
|
|
15
|
+
) -> Tuple[Tensor, int]:
|
|
16
|
+
raise RuntimeError("No audio I/O backend is available.")
|
|
17
|
+
|
|
18
|
+
|
|
19
|
+
def save(filepath: str, src: Tensor, sample_rate: int, precision: int = 16, channels_first: bool = True) -> None:
|
|
20
|
+
raise RuntimeError("No audio I/O backend is available.")
|
|
21
|
+
|
|
22
|
+
|
|
23
|
+
def info(filepath: str) -> None:
|
|
24
|
+
raise RuntimeError("No audio I/O backend is available.")
|
|
@@ -0,0 +1,297 @@
|
|
|
1
|
+
import os
|
|
2
|
+
from typing import Optional, Tuple
|
|
3
|
+
|
|
4
|
+
import torch
|
|
5
|
+
import torchaudio
|
|
6
|
+
from torchaudio import AudioMetaData
|
|
7
|
+
|
|
8
|
+
|
|
9
|
+
@torchaudio._extension.fail_if_no_sox
|
|
10
|
+
def info(
|
|
11
|
+
filepath: str,
|
|
12
|
+
format: Optional[str] = None,
|
|
13
|
+
) -> AudioMetaData:
|
|
14
|
+
"""Get signal information of an audio file.
|
|
15
|
+
|
|
16
|
+
Args:
|
|
17
|
+
filepath (str):
|
|
18
|
+
Source of audio data.
|
|
19
|
+
|
|
20
|
+
format (str or None, optional):
|
|
21
|
+
Override the format detection with the given format.
|
|
22
|
+
Providing the argument might help when libsox can not infer the format
|
|
23
|
+
from header or extension.
|
|
24
|
+
|
|
25
|
+
Returns:
|
|
26
|
+
AudioMetaData: Metadata of the given audio.
|
|
27
|
+
"""
|
|
28
|
+
if not torch.jit.is_scripting():
|
|
29
|
+
if hasattr(filepath, "read"):
|
|
30
|
+
raise RuntimeError("sox_io backend does not support file-like object.")
|
|
31
|
+
filepath = os.fspath(filepath)
|
|
32
|
+
sinfo = torch.ops.torchaudio.sox_io_get_info(filepath, format)
|
|
33
|
+
return AudioMetaData(*sinfo)
|
|
34
|
+
|
|
35
|
+
|
|
36
|
+
@torchaudio._extension.fail_if_no_sox
|
|
37
|
+
def load(
|
|
38
|
+
filepath: str,
|
|
39
|
+
frame_offset: int = 0,
|
|
40
|
+
num_frames: int = -1,
|
|
41
|
+
normalize: bool = True,
|
|
42
|
+
channels_first: bool = True,
|
|
43
|
+
format: Optional[str] = None,
|
|
44
|
+
) -> Tuple[torch.Tensor, int]:
|
|
45
|
+
"""Load audio data from file.
|
|
46
|
+
|
|
47
|
+
Note:
|
|
48
|
+
This function can handle all the codecs that underlying libsox can handle,
|
|
49
|
+
however it is tested on the following formats;
|
|
50
|
+
|
|
51
|
+
* WAV, AMB
|
|
52
|
+
|
|
53
|
+
* 32-bit floating-point
|
|
54
|
+
* 32-bit signed integer
|
|
55
|
+
* 24-bit signed integer
|
|
56
|
+
* 16-bit signed integer
|
|
57
|
+
* 8-bit unsigned integer (WAV only)
|
|
58
|
+
|
|
59
|
+
* MP3
|
|
60
|
+
* FLAC
|
|
61
|
+
* OGG/VORBIS
|
|
62
|
+
* OPUS
|
|
63
|
+
* SPHERE
|
|
64
|
+
* AMR-NB
|
|
65
|
+
|
|
66
|
+
To load ``MP3``, ``FLAC``, ``OGG/VORBIS``, ``OPUS`` and other codecs ``libsox`` does not
|
|
67
|
+
handle natively, your installation of ``torchaudio`` has to be linked to ``libsox``
|
|
68
|
+
and corresponding codec libraries such as ``libmad`` or ``libmp3lame`` etc.
|
|
69
|
+
|
|
70
|
+
By default (``normalize=True``, ``channels_first=True``), this function returns Tensor with
|
|
71
|
+
``float32`` dtype, and the shape of `[channel, time]`.
|
|
72
|
+
|
|
73
|
+
.. warning::
|
|
74
|
+
|
|
75
|
+
``normalize`` argument does not perform volume normalization.
|
|
76
|
+
It only converts the sample type to `torch.float32` from the native sample
|
|
77
|
+
type.
|
|
78
|
+
|
|
79
|
+
When the input format is WAV with integer type, such as 32-bit signed integer, 16-bit
|
|
80
|
+
signed integer, 24-bit signed integer, and 8-bit unsigned integer, by providing ``normalize=False``,
|
|
81
|
+
this function can return integer Tensor, where the samples are expressed within the whole range
|
|
82
|
+
of the corresponding dtype, that is, ``int32`` tensor for 32-bit signed PCM,
|
|
83
|
+
``int16`` for 16-bit signed PCM and ``uint8`` for 8-bit unsigned PCM. Since torch does not
|
|
84
|
+
support ``int24`` dtype, 24-bit signed PCM are converted to ``int32`` tensors.
|
|
85
|
+
|
|
86
|
+
``normalize`` argument has no effect on 32-bit floating-point WAV and other formats, such as
|
|
87
|
+
``flac`` and ``mp3``.
|
|
88
|
+
|
|
89
|
+
For these formats, this function always returns a ``float32`` Tensor.
|
|
90
|
+
|
|
91
|
+
Args:
|
|
92
|
+
filepath (path-like object): Source of audio data.
|
|
93
|
+
frame_offset (int):
|
|
94
|
+
Number of frames to skip before starting to read data.
|
|
95
|
+
num_frames (int, optional):
|
|
96
|
+
Maximum number of frames to read. ``-1`` reads all the remaining samples,
|
|
97
|
+
starting from ``frame_offset``.
|
|
98
|
+
This function may return fewer frames if there are not enough
|
|
99
|
+
frames in the given file.
|
|
100
|
+
normalize (bool, optional):
|
|
101
|
+
When ``True``, this function converts the native sample type to ``float32``.
|
|
102
|
+
Default: ``True``.
|
|
103
|
+
|
|
104
|
+
If input file is integer WAV, giving ``False`` will change the resulting Tensor type to
|
|
105
|
+
integer type.
|
|
106
|
+
This argument has no effect for formats other than integer WAV type.
|
|
107
|
+
|
|
108
|
+
channels_first (bool, optional):
|
|
109
|
+
When True, the returned Tensor has dimension `[channel, time]`.
|
|
110
|
+
Otherwise, the returned Tensor's dimension is `[time, channel]`.
|
|
111
|
+
format (str or None, optional):
|
|
112
|
+
Override the format detection with the given format.
|
|
113
|
+
Providing the argument might help when libsox can not infer the format
|
|
114
|
+
from header or extension.
|
|
115
|
+
|
|
116
|
+
Returns:
|
|
117
|
+
(torch.Tensor, int): Resulting Tensor and sample rate.
|
|
118
|
+
If the input file has integer wav format and ``normalize=False``, then it has
|
|
119
|
+
integer type, else ``float32`` type. If ``channels_first=True``, it has
|
|
120
|
+
`[channel, time]` else `[time, channel]`.
|
|
121
|
+
"""
|
|
122
|
+
if not torch.jit.is_scripting():
|
|
123
|
+
if hasattr(filepath, "read"):
|
|
124
|
+
raise RuntimeError("sox_io backend does not support file-like object.")
|
|
125
|
+
filepath = os.fspath(filepath)
|
|
126
|
+
return torch.ops.torchaudio.sox_io_load_audio_file(
|
|
127
|
+
filepath, frame_offset, num_frames, normalize, channels_first, format
|
|
128
|
+
)
|
|
129
|
+
|
|
130
|
+
|
|
131
|
+
@torchaudio._extension.fail_if_no_sox
|
|
132
|
+
def save(
|
|
133
|
+
filepath: str,
|
|
134
|
+
src: torch.Tensor,
|
|
135
|
+
sample_rate: int,
|
|
136
|
+
channels_first: bool = True,
|
|
137
|
+
compression: Optional[float] = None,
|
|
138
|
+
format: Optional[str] = None,
|
|
139
|
+
encoding: Optional[str] = None,
|
|
140
|
+
bits_per_sample: Optional[int] = None,
|
|
141
|
+
):
|
|
142
|
+
"""Save audio data to file.
|
|
143
|
+
|
|
144
|
+
Args:
|
|
145
|
+
filepath (path-like object): Path to save file.
|
|
146
|
+
src (torch.Tensor): Audio data to save. Must be a 2D tensor.
|
|
147
|
+
sample_rate (int): sampling rate
|
|
148
|
+
channels_first (bool, optional): If ``True``, the given tensor is interpreted as `[channel, time]`,
|
|
149
|
+
otherwise `[time, channel]`.
|
|
150
|
+
compression (float or None, optional): Used for formats other than WAV.
|
|
151
|
+
This corresponds to ``-C`` option of ``sox`` command.
|
|
152
|
+
|
|
153
|
+
``"mp3"``
|
|
154
|
+
Either bitrate (in ``kbps``) with quality factor, such as ``128.2``, or
|
|
155
|
+
VBR encoding with quality factor such as ``-4.2``. Default: ``-4.5``.
|
|
156
|
+
|
|
157
|
+
``"flac"``
|
|
158
|
+
Whole number from ``0`` to ``8``. ``8`` is default and highest compression.
|
|
159
|
+
|
|
160
|
+
``"ogg"``, ``"vorbis"``
|
|
161
|
+
Number from ``-1`` to ``10``; ``-1`` is the highest compression
|
|
162
|
+
and lowest quality. Default: ``3``.
|
|
163
|
+
|
|
164
|
+
See the detail at http://sox.sourceforge.net/soxformat.html.
|
|
165
|
+
format (str or None, optional): Override the audio format.
|
|
166
|
+
When ``filepath`` argument is path-like object, audio format is inferred from
|
|
167
|
+
file extension. If file extension is missing or different, you can specify the
|
|
168
|
+
correct format with this argument.
|
|
169
|
+
|
|
170
|
+
When ``filepath`` argument is file-like object, this argument is required.
|
|
171
|
+
|
|
172
|
+
Valid values are ``"wav"``, ``"mp3"``, ``"ogg"``, ``"vorbis"``, ``"amr-nb"``,
|
|
173
|
+
``"amb"``, ``"flac"``, ``"sph"``, ``"gsm"``, and ``"htk"``.
|
|
174
|
+
|
|
175
|
+
encoding (str or None, optional): Changes the encoding for the supported formats.
|
|
176
|
+
This argument is effective only for supported formats, such as ``"wav"``, ``"amb"``
|
|
177
|
+
and ``"sph"``. Valid values are;
|
|
178
|
+
|
|
179
|
+
- ``"PCM_S"`` (signed integer Linear PCM)
|
|
180
|
+
- ``"PCM_U"`` (unsigned integer Linear PCM)
|
|
181
|
+
- ``"PCM_F"`` (floating point PCM)
|
|
182
|
+
- ``"ULAW"`` (mu-law)
|
|
183
|
+
- ``"ALAW"`` (a-law)
|
|
184
|
+
|
|
185
|
+
Default values
|
|
186
|
+
If not provided, the default value is picked based on ``format`` and ``bits_per_sample``.
|
|
187
|
+
|
|
188
|
+
``"wav"``, ``"amb"``
|
|
189
|
+
- | If both ``encoding`` and ``bits_per_sample`` are not provided, the ``dtype`` of the
|
|
190
|
+
| Tensor is used to determine the default value.
|
|
191
|
+
|
|
192
|
+
- ``"PCM_U"`` if dtype is ``uint8``
|
|
193
|
+
- ``"PCM_S"`` if dtype is ``int16`` or ``int32``
|
|
194
|
+
- ``"PCM_F"`` if dtype is ``float32``
|
|
195
|
+
|
|
196
|
+
- ``"PCM_U"`` if ``bits_per_sample=8``
|
|
197
|
+
- ``"PCM_S"`` otherwise
|
|
198
|
+
|
|
199
|
+
``"sph"`` format;
|
|
200
|
+
- the default value is ``"PCM_S"``
|
|
201
|
+
|
|
202
|
+
bits_per_sample (int or None, optional): Changes the bit depth for the supported formats.
|
|
203
|
+
When ``format`` is one of ``"wav"``, ``"flac"``, ``"sph"``, or ``"amb"``, you can change the
|
|
204
|
+
bit depth. Valid values are ``8``, ``16``, ``32`` and ``64``.
|
|
205
|
+
|
|
206
|
+
Default Value;
|
|
207
|
+
If not provided, the default values are picked based on ``format`` and ``"encoding"``;
|
|
208
|
+
|
|
209
|
+
``"wav"``, ``"amb"``;
|
|
210
|
+
- | If both ``encoding`` and ``bits_per_sample`` are not provided, the ``dtype`` of the
|
|
211
|
+
| Tensor is used.
|
|
212
|
+
|
|
213
|
+
- ``8`` if dtype is ``uint8``
|
|
214
|
+
- ``16`` if dtype is ``int16``
|
|
215
|
+
- ``32`` if dtype is ``int32`` or ``float32``
|
|
216
|
+
|
|
217
|
+
- ``8`` if ``encoding`` is ``"PCM_U"``, ``"ULAW"`` or ``"ALAW"``
|
|
218
|
+
- ``16`` if ``encoding`` is ``"PCM_S"``
|
|
219
|
+
- ``32`` if ``encoding`` is ``"PCM_F"``
|
|
220
|
+
|
|
221
|
+
``"flac"`` format;
|
|
222
|
+
- the default value is ``24``
|
|
223
|
+
|
|
224
|
+
``"sph"`` format;
|
|
225
|
+
- ``16`` if ``encoding`` is ``"PCM_U"``, ``"PCM_S"``, ``"PCM_F"`` or not provided.
|
|
226
|
+
- ``8`` if ``encoding`` is ``"ULAW"`` or ``"ALAW"``
|
|
227
|
+
|
|
228
|
+
``"amb"`` format;
|
|
229
|
+
- ``8`` if ``encoding`` is ``"PCM_U"``, ``"ULAW"`` or ``"ALAW"``
|
|
230
|
+
- ``16`` if ``encoding`` is ``"PCM_S"`` or not provided.
|
|
231
|
+
- ``32`` if ``encoding`` is ``"PCM_F"``
|
|
232
|
+
|
|
233
|
+
Supported formats/encodings/bit depth/compression are;
|
|
234
|
+
|
|
235
|
+
``"wav"``, ``"amb"``
|
|
236
|
+
- 32-bit floating-point PCM
|
|
237
|
+
- 32-bit signed integer PCM
|
|
238
|
+
- 24-bit signed integer PCM
|
|
239
|
+
- 16-bit signed integer PCM
|
|
240
|
+
- 8-bit unsigned integer PCM
|
|
241
|
+
- 8-bit mu-law
|
|
242
|
+
- 8-bit a-law
|
|
243
|
+
|
|
244
|
+
Note: Default encoding/bit depth is determined by the dtype of the input Tensor.
|
|
245
|
+
|
|
246
|
+
``"mp3"``
|
|
247
|
+
Fixed bit rate (such as 128 kbps) and variable bit rate compression.
|
|
248
|
+
Default: VBR with high quality.
|
|
249
|
+
|
|
250
|
+
``"flac"``
|
|
251
|
+
- 8-bit
|
|
252
|
+
- 16-bit
|
|
253
|
+
- 24-bit (default)
|
|
254
|
+
|
|
255
|
+
``"ogg"``, ``"vorbis"``
|
|
256
|
+
- Different quality level. Default: approx. 112kbps
|
|
257
|
+
|
|
258
|
+
``"sph"``
|
|
259
|
+
- 8-bit signed integer PCM
|
|
260
|
+
- 16-bit signed integer PCM
|
|
261
|
+
- 24-bit signed integer PCM
|
|
262
|
+
- 32-bit signed integer PCM (default)
|
|
263
|
+
- 8-bit mu-law
|
|
264
|
+
- 8-bit a-law
|
|
265
|
+
- 16-bit a-law
|
|
266
|
+
- 24-bit a-law
|
|
267
|
+
- 32-bit a-law
|
|
268
|
+
|
|
269
|
+
``"amr-nb"``
|
|
270
|
+
Bitrate ranging from 4.75 kbit/s to 12.2 kbit/s. Default: 4.75 kbit/s
|
|
271
|
+
|
|
272
|
+
``"gsm"``
|
|
273
|
+
Lossy Speech Compression, CPU intensive.
|
|
274
|
+
|
|
275
|
+
``"htk"``
|
|
276
|
+
Uses a default single-channel 16-bit PCM format.
|
|
277
|
+
|
|
278
|
+
Note:
|
|
279
|
+
To save into formats that ``libsox`` does not handle natively, (such as ``"mp3"``,
|
|
280
|
+
``"flac"``, ``"ogg"`` and ``"vorbis"``), your installation of ``torchaudio`` has
|
|
281
|
+
to be linked to ``libsox`` and corresponding codec libraries such as ``libmad``
|
|
282
|
+
or ``libmp3lame`` etc.
|
|
283
|
+
"""
|
|
284
|
+
if not torch.jit.is_scripting():
|
|
285
|
+
if hasattr(filepath, "write"):
|
|
286
|
+
raise RuntimeError("sox_io backend does not handle file-like object.")
|
|
287
|
+
filepath = os.fspath(filepath)
|
|
288
|
+
torch.ops.torchaudio.sox_io_save_audio_file(
|
|
289
|
+
filepath,
|
|
290
|
+
src,
|
|
291
|
+
sample_rate,
|
|
292
|
+
channels_first,
|
|
293
|
+
compression,
|
|
294
|
+
format,
|
|
295
|
+
encoding,
|
|
296
|
+
bits_per_sample,
|
|
297
|
+
)
|
torchaudio/backend/common.py
CHANGED
|
@@ -1,53 +1,13 @@
|
|
|
1
|
-
|
|
2
|
-
|
|
3
|
-
|
|
4
|
-
|
|
5
|
-
|
|
6
|
-
|
|
7
|
-
|
|
8
|
-
|
|
9
|
-
:ivar int num_channels: The number of channels
|
|
10
|
-
:ivar int bits_per_sample: The number of bits per sample. This is 0 for lossy formats,
|
|
11
|
-
or when it cannot be accurately inferred.
|
|
12
|
-
:ivar str encoding: Audio encoding
|
|
13
|
-
The values encoding can take are one of the following:
|
|
14
|
-
|
|
15
|
-
* ``PCM_S``: Signed integer linear PCM
|
|
16
|
-
* ``PCM_U``: Unsigned integer linear PCM
|
|
17
|
-
* ``PCM_F``: Floating point linear PCM
|
|
18
|
-
* ``FLAC``: Flac, Free Lossless Audio Codec
|
|
19
|
-
* ``ULAW``: Mu-law
|
|
20
|
-
* ``ALAW``: A-law
|
|
21
|
-
* ``MP3`` : MP3, MPEG-1 Audio Layer III
|
|
22
|
-
* ``VORBIS``: OGG Vorbis
|
|
23
|
-
* ``AMR_WB``: Adaptive Multi-Rate Wideband
|
|
24
|
-
* ``AMR_NB``: Adaptive Multi-Rate Narrowband
|
|
25
|
-
* ``OPUS``: Opus
|
|
26
|
-
* ``HTK``: Single channel 16-bit PCM
|
|
27
|
-
* ``UNKNOWN`` : None of above
|
|
28
|
-
"""
|
|
29
|
-
|
|
30
|
-
def __init__(
|
|
31
|
-
self,
|
|
32
|
-
sample_rate: int,
|
|
33
|
-
num_frames: int,
|
|
34
|
-
num_channels: int,
|
|
35
|
-
bits_per_sample: int,
|
|
36
|
-
encoding: str,
|
|
37
|
-
):
|
|
38
|
-
self.sample_rate = sample_rate
|
|
39
|
-
self.num_frames = num_frames
|
|
40
|
-
self.num_channels = num_channels
|
|
41
|
-
self.bits_per_sample = bits_per_sample
|
|
42
|
-
self.encoding = encoding
|
|
43
|
-
|
|
44
|
-
def __str__(self):
|
|
45
|
-
return (
|
|
46
|
-
f"AudioMetaData("
|
|
47
|
-
f"sample_rate={self.sample_rate}, "
|
|
48
|
-
f"num_frames={self.num_frames}, "
|
|
49
|
-
f"num_channels={self.num_channels}, "
|
|
50
|
-
f"bits_per_sample={self.bits_per_sample}, "
|
|
51
|
-
f"encoding={self.encoding}"
|
|
52
|
-
f")"
|
|
1
|
+
def __getattr__(name: str):
|
|
2
|
+
import warnings
|
|
3
|
+
|
|
4
|
+
if name == "AudioMetaData":
|
|
5
|
+
warnings.warn(
|
|
6
|
+
"`torchaudio.backend.common.AudioMetaData` has been moved to "
|
|
7
|
+
"`torchaudio.AudioMetaData`. Please update the import path.",
|
|
8
|
+
stacklevel=2,
|
|
53
9
|
)
|
|
10
|
+
from torchaudio._backend.common import AudioMetaData
|
|
11
|
+
|
|
12
|
+
return AudioMetaData
|
|
13
|
+
raise AttributeError(f"module {__name__!r} has no attribute {name!r}")
|