torchaudio 2.8.0__cp312-cp312-win_amd64.whl → 2.9.0__cp312-cp312-win_amd64.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Potentially problematic release.
This version of torchaudio might be problematic. Click here for more details.
- torchaudio/__init__.py +179 -39
- torchaudio/_extension/__init__.py +1 -14
- torchaudio/_extension/utils.py +0 -47
- torchaudio/_internal/module_utils.py +12 -3
- torchaudio/_torchcodec.py +73 -85
- torchaudio/datasets/cmuarctic.py +1 -1
- torchaudio/datasets/utils.py +1 -1
- torchaudio/functional/__init__.py +0 -2
- torchaudio/functional/_alignment.py +1 -1
- torchaudio/functional/filtering.py +70 -55
- torchaudio/functional/functional.py +26 -60
- torchaudio/lib/_torchaudio.pyd +0 -0
- torchaudio/lib/libtorchaudio.pyd +0 -0
- torchaudio/models/decoder/__init__.py +14 -2
- torchaudio/models/decoder/_ctc_decoder.py +6 -6
- torchaudio/models/decoder/_cuda_ctc_decoder.py +1 -1
- torchaudio/models/squim/objective.py +2 -2
- torchaudio/pipelines/_source_separation_pipeline.py +1 -1
- torchaudio/pipelines/_squim_pipeline.py +2 -2
- torchaudio/pipelines/_tts/utils.py +1 -1
- torchaudio/pipelines/rnnt_pipeline.py +4 -4
- torchaudio/transforms/__init__.py +1 -0
- torchaudio/transforms/_transforms.py +2 -2
- torchaudio/utils/__init__.py +2 -9
- torchaudio/utils/download.py +1 -3
- torchaudio/version.py +2 -2
- {torchaudio-2.8.0.dist-info → torchaudio-2.9.0.dist-info}/METADATA +8 -11
- torchaudio-2.9.0.dist-info/RECORD +85 -0
- {torchaudio-2.8.0.dist-info → torchaudio-2.9.0.dist-info}/top_level.txt +0 -1
- torchaudio/_backend/__init__.py +0 -61
- torchaudio/_backend/backend.py +0 -53
- torchaudio/_backend/common.py +0 -52
- torchaudio/_backend/ffmpeg.py +0 -334
- torchaudio/_backend/soundfile.py +0 -54
- torchaudio/_backend/soundfile_backend.py +0 -457
- torchaudio/_backend/sox.py +0 -91
- torchaudio/_backend/utils.py +0 -350
- torchaudio/backend/__init__.py +0 -8
- torchaudio/backend/_no_backend.py +0 -25
- torchaudio/backend/_sox_io_backend.py +0 -294
- torchaudio/backend/common.py +0 -13
- torchaudio/backend/no_backend.py +0 -14
- torchaudio/backend/soundfile_backend.py +0 -14
- torchaudio/backend/sox_io_backend.py +0 -14
- torchaudio/io/__init__.py +0 -20
- torchaudio/io/_effector.py +0 -347
- torchaudio/io/_playback.py +0 -72
- torchaudio/kaldi_io.py +0 -150
- torchaudio/prototype/__init__.py +0 -0
- torchaudio/prototype/datasets/__init__.py +0 -4
- torchaudio/prototype/datasets/musan.py +0 -68
- torchaudio/prototype/functional/__init__.py +0 -26
- torchaudio/prototype/functional/_dsp.py +0 -441
- torchaudio/prototype/functional/_rir.py +0 -382
- torchaudio/prototype/functional/functional.py +0 -193
- torchaudio/prototype/models/__init__.py +0 -39
- torchaudio/prototype/models/_conformer_wav2vec2.py +0 -801
- torchaudio/prototype/models/_emformer_hubert.py +0 -337
- torchaudio/prototype/models/conv_emformer.py +0 -529
- torchaudio/prototype/models/hifi_gan.py +0 -342
- torchaudio/prototype/models/rnnt.py +0 -717
- torchaudio/prototype/models/rnnt_decoder.py +0 -402
- torchaudio/prototype/pipelines/__init__.py +0 -21
- torchaudio/prototype/pipelines/_vggish/__init__.py +0 -7
- torchaudio/prototype/pipelines/_vggish/_vggish_impl.py +0 -236
- torchaudio/prototype/pipelines/_vggish/_vggish_pipeline.py +0 -83
- torchaudio/prototype/pipelines/hifigan_pipeline.py +0 -233
- torchaudio/prototype/pipelines/rnnt_pipeline.py +0 -58
- torchaudio/prototype/transforms/__init__.py +0 -9
- torchaudio/prototype/transforms/_transforms.py +0 -461
- torchaudio/sox_effects/__init__.py +0 -10
- torchaudio/sox_effects/sox_effects.py +0 -275
- torchaudio/utils/ffmpeg_utils.py +0 -11
- torchaudio/utils/sox_utils.py +0 -118
- torchaudio-2.8.0.dist-info/RECORD +0 -145
- torio/__init__.py +0 -8
- torio/_extension/__init__.py +0 -13
- torio/_extension/utils.py +0 -147
- torio/io/__init__.py +0 -9
- torio/io/_streaming_media_decoder.py +0 -977
- torio/io/_streaming_media_encoder.py +0 -502
- torio/lib/__init__.py +0 -0
- torio/lib/_torio_ffmpeg4.pyd +0 -0
- torio/lib/_torio_ffmpeg5.pyd +0 -0
- torio/lib/_torio_ffmpeg6.pyd +0 -0
- torio/lib/libtorio_ffmpeg4.pyd +0 -0
- torio/lib/libtorio_ffmpeg5.pyd +0 -0
- torio/lib/libtorio_ffmpeg6.pyd +0 -0
- torio/utils/__init__.py +0 -4
- torio/utils/ffmpeg_utils.py +0 -275
- {torchaudio-2.8.0.dist-info → torchaudio-2.9.0.dist-info}/WHEEL +0 -0
- {torchaudio-2.8.0.dist-info → torchaudio-2.9.0.dist-info}/licenses/LICENSE +0 -0
|
@@ -1,457 +0,0 @@
|
|
|
1
|
-
"""The new soundfile backend which will become default in 0.8.0 onward"""
|
|
2
|
-
import warnings
|
|
3
|
-
from typing import Optional, Tuple
|
|
4
|
-
|
|
5
|
-
import torch
|
|
6
|
-
from torchaudio._internal import module_utils as _mod_utils
|
|
7
|
-
|
|
8
|
-
from .common import AudioMetaData
|
|
9
|
-
|
|
10
|
-
|
|
11
|
-
# Becomes True only after ``import soundfile`` below has succeeded.
_IS_SOUNDFILE_AVAILABLE = False

# TODO: import soundfile only when it is used.
# ``_requires_soundfile`` is used as a decorator on the public functions of this
# module. It is ``_mod_utils.no_op`` when soundfile imported, otherwise whatever
# ``_mod_utils.fail_with_message`` returns for the explanatory message.
if not _mod_utils.is_module_available("soundfile"):
    _requires_soundfile = _mod_utils.fail_with_message(
        "requires soundfile, but it is not installed. Please install soundfile."
    )
else:
    try:
        import soundfile

        _requires_soundfile = _mod_utils.no_op
        _IS_SOUNDFILE_AVAILABLE = True
    except Exception:
        # The module was reported as available but importing it still failed;
        # any error is caught on purpose so this module stays importable.
        _requires_soundfile = _mod_utils.fail_with_message(
            "requires soundfile, but we failed to import it. Please check the installation of soundfile."
        )
|
|
28
|
-
|
|
29
|
-
|
|
30
|
-
# Mapping from soundfile subtype to number of bits per sample.
# This is mostly heuristic; the value is set to 0 when it is irrelevant
# (lossy formats) or when it can't be inferred.
# For ADPCM (and G72X) subtypes, it's hard to infer the bit depth because it's not part of the standard:
# According to https://en.wikipedia.org/wiki/Adaptive_differential_pulse-code_modulation#In_telephony,
# the default seems to be 8 bits but it can be compressed further to 4 bits.
# The dict is inspired from
# https://github.com/bastibe/python-soundfile/blob/744efb4b01abc72498a96b09115b42a4cabd85e4/soundfile.py#L66-L94
_SUBTYPE_TO_BITS_PER_SAMPLE = {
    "PCM_S8": 8,  # Signed 8 bit data
    "PCM_16": 16,  # Signed 16 bit data
    "PCM_24": 24,  # Signed 24 bit data
    "PCM_32": 32,  # Signed 32 bit data
    "PCM_U8": 8,  # Unsigned 8 bit data (WAV and RAW only)
    "FLOAT": 32,  # 32 bit float data
    "DOUBLE": 64,  # 64 bit float data
    "ULAW": 8,  # U-Law encoded. See https://en.wikipedia.org/wiki/G.711#Types
    "ALAW": 8,  # A-Law encoded. See https://en.wikipedia.org/wiki/G.711#Types
    "IMA_ADPCM": 0,  # IMA ADPCM.
    "MS_ADPCM": 0,  # Microsoft ADPCM.
    "GSM610": 0,  # GSM 6.10 encoding. (Wikipedia says 1.625 bit depth?? https://en.wikipedia.org/wiki/Full_Rate)
    "VOX_ADPCM": 0,  # OKI / Dialogix ADPCM
    "G721_32": 0,  # 32kbs G721 ADPCM encoding.
    "G723_24": 0,  # 24kbs G723 ADPCM encoding.
    "G723_40": 0,  # 40kbs G723 ADPCM encoding.
    "DWVW_12": 12,  # 12 bit Delta Width Variable Word encoding.
    "DWVW_16": 16,  # 16 bit Delta Width Variable Word encoding.
    "DWVW_24": 24,  # 24 bit Delta Width Variable Word encoding.
    "DWVW_N": 0,  # N bit Delta Width Variable Word encoding.
    "DPCM_8": 8,  # 8 bit differential PCM (XI only)
    "DPCM_16": 16,  # 16 bit differential PCM (XI only)
    "VORBIS": 0,  # Xiph Vorbis encoding. (lossy)
    # The two subtypes below are the ones this module itself requests when
    # saving "opus" / "mp3"; listing them keeps files written by this backend
    # from triggering the "unknown subtype" warning when read back.
    "OPUS": 0,  # Opus encoding. (lossy)
    "MPEG_LAYER_III": 0,  # MPEG audio layer III, i.e. MP3. (lossy)
    "ALAC_16": 16,  # Apple Lossless Audio Codec (16 bit).
    "ALAC_20": 20,  # Apple Lossless Audio Codec (20 bit).
    "ALAC_24": 24,  # Apple Lossless Audio Codec (24 bit).
    "ALAC_32": 32,  # Apple Lossless Audio Codec (32 bit).
}


def _get_bit_depth(subtype):
    """Return the bits-per-sample value associated with a soundfile subtype.

    Args:
        subtype: Subtype name as reported by soundfile (e.g. ``"PCM_16"``).

    Returns:
        int: The value stored in ``_SUBTYPE_TO_BITS_PER_SAMPLE``; ``0`` for
        subtypes whose bit depth is irrelevant or cannot be inferred, and also
        ``0`` for subtypes missing from the table.

    Warns:
        UserWarning: When ``subtype`` is not present in the table.
    """
    if subtype not in _SUBTYPE_TO_BITS_PER_SAMPLE:
        warnings.warn(
            f"The {subtype} subtype is unknown to TorchAudio. As a result, the bits_per_sample "
            "attribute will be set to 0. If you are seeing this warning, please "
            "report by opening an issue on github (after checking for existing/closed ones). "
            "You may otherwise ignore this warning."
        )
    return _SUBTYPE_TO_BITS_PER_SAMPLE.get(subtype, 0)
|
|
78
|
-
|
|
79
|
-
|
|
80
|
-
# Encoding label reported for each soundfile subtype. Subtypes that are not
# listed here are reported as "UNKNOWN" by ``_get_encoding``.
_SUBTYPE_TO_ENCODING = {
    **dict.fromkeys(("PCM_S8", "PCM_16", "PCM_24", "PCM_32"), "PCM_S"),
    "PCM_U8": "PCM_U",
    **dict.fromkeys(("FLOAT", "DOUBLE"), "PCM_F"),
    "ULAW": "ULAW",
    "ALAW": "ALAW",
    "VORBIS": "VORBIS",
}


def _get_encoding(format: str, subtype: str):
    """Translate a soundfile (format, subtype) pair into an encoding label.

    Args:
        format: Container format name as reported by soundfile.
        subtype: Subtype name as reported by soundfile.

    Returns:
        str: ``"FLAC"`` whenever ``format`` is ``"FLAC"`` (the subtype is not
        consulted in that case); otherwise the ``_SUBTYPE_TO_ENCODING`` entry
        for ``subtype``, or ``"UNKNOWN"`` when there is none.
    """
    if format != "FLAC":
        return _SUBTYPE_TO_ENCODING.get(subtype, "UNKNOWN")
    return "FLAC"
|
|
98
|
-
|
|
99
|
-
|
|
100
|
-
@_requires_soundfile
def info(filepath: str, format: Optional[str] = None) -> AudioMetaData:
    """Collect the signal metadata of an audio file.

    Note:
        ``filepath`` is deliberately annotated as plain ``str`` although a
        ``pathlib.Path`` object is accepted too. The annotation is kept in
        line with the ``"sox_io"`` backend, whose annotations are restricted
        for TorchScript compiler compatibility.

    Args:
        filepath (path-like object or file-like object):
            Source of audio data.
        format (str or None, optional):
            Not used. PySoundFile does not accept format hint.

    Returns:
        AudioMetaData: meta data of the given audio.
    """
    meta = soundfile.info(filepath)
    # Both values are derived from what soundfile reports for the file.
    bit_depth = _get_bit_depth(meta.subtype)
    codec = _get_encoding(meta.format, meta.subtype)
    return AudioMetaData(
        meta.samplerate,
        meta.frames,
        meta.channels,
        bits_per_sample=bit_depth,
        encoding=codec,
    )
|
|
127
|
-
|
|
128
|
-
|
|
129
|
-
# soundfile subtype -> dtype name handed to ``SoundFile.read`` when ``load`` is
# asked to keep the native sample type of a WAV file (``normalize=False``).
_SUBTYPE2DTYPE = {
    "PCM_S8": "int8",
    "PCM_U8": "uint8",
    "PCM_16": "int16",
    # torch has no 24-bit integer dtype, so 24-bit PCM is read as int32,
    # as the ``load`` docstring promises.
    "PCM_24": "int32",
    "PCM_32": "int32",
    "FLOAT": "float32",
    "DOUBLE": "float64",
}


@_requires_soundfile
def load(
    filepath: str,
    frame_offset: int = 0,
    num_frames: int = -1,
    normalize: bool = True,
    channels_first: bool = True,
    format: Optional[str] = None,
) -> Tuple[torch.Tensor, int]:
    """Load audio data from file.

    Note:
        The formats this function can handle depend on the soundfile installation.
        This function is tested on the following formats;

        * WAV

            * 32-bit floating-point
            * 32-bit signed integer
            * 16-bit signed integer
            * 8-bit unsigned integer

        * FLAC
        * OGG/VORBIS
        * SPHERE

    By default (``normalize=True``, ``channels_first=True``), this function returns Tensor with
    ``float32`` dtype, and the shape of `[channel, time]`.

    .. warning::

       ``normalize`` argument does not perform volume normalization.
       It only converts the sample type to `torch.float32` from the native sample
       type.

       When the input format is WAV with integer type, such as 32-bit signed integer, 16-bit
       signed integer, 24-bit signed integer, and 8-bit unsigned integer, by providing ``normalize=False``,
       this function can return integer Tensor, where the samples are expressed within the whole range
       of the corresponding dtype, that is, ``int32`` tensor for 32-bit signed PCM,
       ``int16`` for 16-bit signed PCM and ``uint8`` for 8-bit unsigned PCM. Since torch does not
       support ``int24`` dtype, 24-bit signed PCM are converted to ``int32`` tensors.

       ``normalize`` argument has no effect on 32-bit floating-point WAV and other formats, such as
       ``flac`` and ``mp3``.

       For these formats, this function always returns ``float32`` Tensor.

    Note:
        ``filepath`` argument is intentionally annotated as ``str`` only, even though it accepts
        ``pathlib.Path`` object as well. This is for the consistency with ``"sox_io"`` backend,
        which has a restriction on type annotation due to TorchScript compiler compatibility.

    Args:
        filepath (path-like object or file-like object):
            Source of audio data.
        frame_offset (int, optional):
            Number of frames to skip before start reading data.
        num_frames (int, optional):
            Maximum number of frames to read. ``-1`` reads all the remaining samples,
            starting from ``frame_offset``.
            This function may return the less number of frames if there is not enough
            frames in the given file.
        normalize (bool, optional):
            When ``True``, this function converts the native sample type to ``float32``.
            Default: ``True``.

            If input file is integer WAV, giving ``False`` will change the resulting Tensor type to
            integer type.
            This argument has no effect for formats other than integer WAV type.

        channels_first (bool, optional):
            When True, the returned Tensor has dimension `[channel, time]`.
            Otherwise, the returned Tensor's dimension is `[time, channel]`.
        format (str or None, optional):
            Not used. PySoundFile does not accept format hint.

    Returns:
        (torch.Tensor, int): Resulting Tensor and sample rate.
        If the input file has integer wav format and normalization is off, then it has
        integer type, else ``float32`` type. If ``channels_first=True``, it has
        `[channel, time]` else `[time, channel]`.

    Raises:
        ValueError: If ``normalize=False`` is requested for a WAV file whose
            subtype has no entry in ``_SUBTYPE2DTYPE``.
    """
    with soundfile.SoundFile(filepath, "r") as file_:
        # Only WAV files can be returned in their native sample type; every
        # other container is always read as float32.
        if file_.format != "WAV" or normalize:
            dtype = "float32"
        elif file_.subtype not in _SUBTYPE2DTYPE:
            raise ValueError(f"Unsupported subtype: {file_.subtype}")
        else:
            dtype = _SUBTYPE2DTYPE[file_.subtype]

        # NOTE(review): ``_prepare_read`` is a non-public SoundFile method; its
        # return value is used below as the number of frames to read.
        frames = file_._prepare_read(frame_offset, None, num_frames)
        # ``always_2d=True`` keeps a channel axis even for mono input.
        waveform = file_.read(frames, dtype, always_2d=True)
        sample_rate = file_.samplerate

    waveform = torch.from_numpy(waveform)
    if channels_first:
        waveform = waveform.t()
    return waveform, sample_rate
|
|
237
|
-
|
|
238
|
-
|
|
239
|
-
def _get_subtype_for_wav(dtype: torch.dtype, encoding: str, bits_per_sample: int):
    """Choose the soundfile subtype used when writing a WAV file.

    Args:
        dtype: dtype of the tensor being saved. It is consulted only when
            neither ``encoding`` nor ``bits_per_sample`` is given.
        encoding: Requested encoding (``"PCM_S"``, ``"PCM_U"``, ``"PCM_F"``,
            ``"ULAW"``, ``"ALAW"``) or a falsy value for "not specified".
        bits_per_sample: Requested bit depth, or ``None`` for "not specified".

    Returns:
        str: soundfile subtype name.

    Raises:
        ValueError: If the dtype, the encoding, or the encoding/bit-depth
            combination is not supported for WAV.
    """
    if not encoding:
        if bits_per_sample:
            # Only the bit depth was given: 8-bit WAV is unsigned, the rest signed.
            return "PCM_U8" if bits_per_sample == 8 else f"PCM_{bits_per_sample}"
        # Nothing was given: derive the subtype from the tensor dtype.
        by_dtype = {
            torch.uint8: "PCM_U8",
            torch.int16: "PCM_16",
            torch.int32: "PCM_32",
            torch.float32: "FLOAT",
            torch.float64: "DOUBLE",
        }
        inferred = by_dtype.get(dtype)
        if inferred:
            return inferred
        raise ValueError(f"Unsupported dtype for wav: {dtype}")

    if encoding == "PCM_S":
        if bits_per_sample == 8:
            raise ValueError("wav does not support 8-bit signed PCM encoding.")
        return f"PCM_{bits_per_sample}" if bits_per_sample else "PCM_32"

    if encoding == "PCM_F":
        if bits_per_sample in (None, 32):
            return "FLOAT"
        if bits_per_sample != 64:
            raise ValueError("wav only supports 32/64-bit float PCM encoding.")
        return "DOUBLE"

    if encoding == "PCM_U":
        if bits_per_sample not in (None, 8):
            raise ValueError("wav only supports 8-bit unsigned PCM encoding.")
        return "PCM_U8"

    if encoding == "ULAW":
        if bits_per_sample not in (None, 8):
            raise ValueError("wav only supports 8-bit mu-law encoding.")
        return "ULAW"

    if encoding == "ALAW":
        if bits_per_sample not in (None, 8):
            raise ValueError("wav only supports 8-bit a-law encoding.")
        return "ALAW"

    raise ValueError(f"wav does not support {encoding}.")
|
|
280
|
-
|
|
281
|
-
|
|
282
|
-
def _get_subtype_for_sphere(encoding: str, bits_per_sample: int):
    """Choose the soundfile subtype used when writing a NIST SPHERE file.

    Args:
        encoding: Requested encoding (``"PCM_S"``, ``"ULAW"``, ``"ALAW"``) or
            ``None`` for the default signed PCM.
        bits_per_sample: Requested bit depth, or ``None`` for "not specified".

    Returns:
        str: soundfile subtype name. Signed PCM defaults to ``"PCM_32"`` when
        no bit depth is given. ``"ALAW"`` is returned for a-law regardless of
        ``bits_per_sample``.

    Raises:
        ValueError: For ``"PCM_U"`` / ``"PCM_F"``, for mu-law with a bit depth
            other than 8, and for any other unrecognised encoding.
    """
    if encoding in (None, "PCM_S"):
        if not bits_per_sample:
            return "PCM_32"
        # Signed 8-bit PCM is spelled "PCM_S8" (see the subtype tables and the
        # flac branch of ``_get_subtype``); "PCM_8" is not a subtype name.
        return "PCM_S8" if bits_per_sample == 8 else f"PCM_{bits_per_sample}"
    if encoding in ("PCM_U", "PCM_F"):
        raise ValueError(f"sph does not support {encoding} encoding.")
    if encoding == "ULAW":
        if bits_per_sample in (None, 8):
            return "ULAW"
        raise ValueError("sph only supports 8-bit for mu-law encoding.")
    if encoding == "ALAW":
        # No bit-depth check here: a-law is accepted with any bits_per_sample.
        return "ALAW"
    raise ValueError(f"sph does not support {encoding}.")
|
|
294
|
-
|
|
295
|
-
|
|
296
|
-
def _get_subtype(dtype: torch.dtype, format: str, encoding: str, bits_per_sample: int):
    """Resolve the soundfile subtype for a save request.

    Args:
        dtype: dtype of the tensor being saved (forwarded to the WAV helper).
        format: Lower-case format / extension name, e.g. ``"wav"``, ``"flac"``.
        encoding: Requested encoding, or ``None``.
        bits_per_sample: Requested bit depth, or ``None``.

    Returns:
        str: soundfile subtype name.

    Raises:
        ValueError: If the format is unsupported or the encoding / bit depth
            is not valid for the format.
    """
    # Formats whose rules live in dedicated helpers.
    if format == "wav":
        return _get_subtype_for_wav(dtype, encoding, bits_per_sample)
    if format == "sph":
        return _get_subtype_for_sphere(encoding, bits_per_sample)

    # Formats with a single fixed subtype.
    if format == "mp3":
        return "MPEG_LAYER_III"
    if format in ("nis", "nist"):
        return "PCM_16"

    if format == "flac":
        if encoding:
            raise ValueError("flac does not support encoding.")
        if not bits_per_sample:
            return "PCM_16"
        if bits_per_sample <= 24:
            return f"PCM_{bits_per_sample}" if bits_per_sample != 8 else "PCM_S8"
        raise ValueError("flac does not support bits_per_sample > 24.")

    if format in ("ogg", "vorbis"):
        if bits_per_sample:
            raise ValueError("ogg/vorbis does not support bits_per_sample.")
        if encoding in (None, "vorbis"):
            return "VORBIS"
        if encoding != "opus":
            raise ValueError(f"Unexpected encoding: {encoding}")
        return "OPUS"

    raise ValueError(f"Unsupported format: {format}")
|
|
322
|
-
|
|
323
|
-
|
|
324
|
-
@_requires_soundfile
def save(
    filepath: str,
    src: torch.Tensor,
    sample_rate: int,
    channels_first: bool = True,
    compression: Optional[float] = None,
    format: Optional[str] = None,
    encoding: Optional[str] = None,
    bits_per_sample: Optional[int] = None,
):
    """Save audio data to file.

    Note:
        The formats this function can handle depend on the soundfile installation.
        This function is tested on the following formats;

        * WAV

            * 32-bit floating-point
            * 32-bit signed integer
            * 16-bit signed integer
            * 8-bit unsigned integer

        * FLAC
        * OGG/VORBIS
        * SPHERE

    Note:
        ``filepath`` argument is intentionally annotated as ``str`` only, even though it accepts
        ``pathlib.Path`` object as well. This is for the consistency with ``"sox_io"`` backend,
        which has a restriction on type annotation due to TorchScript compiler compatibility.

    Args:
        filepath (str or pathlib.Path): Path to audio file.
        src (torch.Tensor): Audio data to save. must be 2D tensor.
        sample_rate (int): sampling rate
        channels_first (bool, optional): If ``True``, the given tensor is interpreted as `[channel, time]`,
            otherwise `[time, channel]`.
        compression (float or None, optional): Not used.
            It is here only for interface compatibility reason with "sox_io" backend.
        format (str or None, optional): Override the audio format.
            When ``filepath`` argument is path-like object, audio format is
            inferred from file extension. If the file extension is missing or
            different, you can specify the correct format with this argument.

            When ``filepath`` argument is file-like object,
            this argument is required.

            Valid values are ``"wav"``, ``"ogg"``, ``"vorbis"``,
            ``"flac"`` and ``"sph"``.
        encoding (str or None, optional): Changes the encoding for supported formats.
            This argument is effective only for supported formats, such as
            ``"wav"``, ``"flac"`` and ``"sph"``. Valid values are;

                - ``"PCM_S"`` (signed integer Linear PCM)
                - ``"PCM_U"`` (unsigned integer Linear PCM)
                - ``"PCM_F"`` (floating point PCM)
                - ``"ULAW"`` (mu-law)
                - ``"ALAW"`` (a-law)

        bits_per_sample (int or None, optional): Changes the bit depth for the
            supported formats.
            When ``format`` is one of ``"wav"``, ``"flac"`` or ``"sph"``,
            you can change the bit depth.
            Valid values are ``8``, ``16``, ``24``, ``32`` and ``64``.

    Supported formats/encodings/bit depth/compression are:

    ``"wav"``
        - 32-bit floating-point PCM
        - 32-bit signed integer PCM
        - 24-bit signed integer PCM
        - 16-bit signed integer PCM
        - 8-bit unsigned integer PCM
        - 8-bit mu-law
        - 8-bit a-law

        Note:
            Default encoding/bit depth is determined by the dtype of
            the input Tensor.

    ``"flac"``
        - 8-bit
        - 16-bit (default)
        - 24-bit

    ``"ogg"``, ``"vorbis"``
        - Doesn't accept changing configuration.

    ``"sph"``
        - 8-bit signed integer PCM
        - 16-bit signed integer PCM
        - 24-bit signed integer PCM
        - 32-bit signed integer PCM (default)
        - 8-bit mu-law
        - 8-bit a-law
        - 16-bit a-law
        - 24-bit a-law
        - 32-bit a-law

    Raises:
        ValueError: If ``src`` is not 2D, ``bits_per_sample`` is not one of the
            valid values, or the format/encoding/bit-depth combination is
            rejected by ``_get_subtype``.
        RuntimeError: If ``filepath`` is a file-like object and ``format`` is
            not given.
    """
    if src.ndim != 2:
        raise ValueError(f"Expected 2D Tensor, got {src.ndim}D.")
    if compression is not None:
        warnings.warn(
            '`save` function of "soundfile" backend does not support "compression" parameter. '
            "The argument is silently ignored."
        )
    if hasattr(filepath, "write"):
        # File-like target: there is no extension to inspect.
        if format is None:
            raise RuntimeError("`format` is required when saving to file object.")
        ext = format.lower()
    else:
        # NOTE(review): for path-like targets the subtype is chosen from the
        # file extension even when ``format`` is given, which looks at odds
        # with the documented "override" behaviour. Left unchanged here to
        # stay backward compatible -- confirm the intended precedence.
        ext = str(filepath).split(".")[-1].lower()

    if bits_per_sample not in (None, 8, 16, 24, 32, 64):
        raise ValueError("Invalid bits_per_sample.")
    if bits_per_sample == 24:
        warnings.warn(
            "Saving audio with 24 bits per sample might warp samples near -1. "
            "Using 16 bits per sample might be able to avoid this."
        )
    subtype = _get_subtype(src.dtype, ext, encoding, bits_per_sample)

    # sph is a extension used in TED-LIUM but soundfile does not recognize it as NIST format,
    # so we extend the extensions manually here.
    # The mapping is applied both when the format was inferred from the
    # extension and when the caller passed one of the aliases explicitly
    # (e.g. ``format="sph"``, which is mandatory for file-like targets);
    # otherwise the unrecognised alias would be forwarded to soundfile as-is.
    nist_aliases = ("nis", "nist", "sph")
    if ext in nist_aliases and (format is None or format.lower() in nist_aliases):
        format = "NIST"

    if channels_first:
        # soundfile.write receives the data as [time, channel].
        src = src.t()

    soundfile.write(file=filepath, data=src, samplerate=sample_rate, subtype=subtype, format=format)
|
torchaudio/_backend/sox.py
DELETED
|
@@ -1,91 +0,0 @@
|
|
|
1
|
-
import os
|
|
2
|
-
from typing import BinaryIO, Optional, Tuple, Union
|
|
3
|
-
|
|
4
|
-
import torch
|
|
5
|
-
import torchaudio
|
|
6
|
-
|
|
7
|
-
from .backend import Backend
|
|
8
|
-
from .common import AudioMetaData
|
|
9
|
-
|
|
10
|
-
# Handle to the SoX extension, obtained from torchaudio's
# ``lazy_import_sox_ext()`` (presumably resolved on first attribute access).
sox_ext = torchaudio._extension.lazy_import_sox_ext()


class SoXBackend(Backend):
    """``Backend`` implementation that delegates to the SoX extension.

    Only path-like sources/destinations are handled; every I/O method rejects
    file-like objects with ``ValueError``. The ``buffer_size`` parameters are
    accepted but not used by any method of this class.
    """

    @staticmethod
    def info(uri: Union[BinaryIO, str, os.PathLike], format: Optional[str], buffer_size: int = 4096) -> AudioMetaData:
        """Return the metadata of the audio at ``uri``.

        Args:
            uri: Path to the audio file. File-like objects are rejected.
            format: Format hint forwarded to ``sox_ext.get_info``.
            buffer_size: Unused.

        Returns:
            AudioMetaData: Built from the values returned by ``sox_ext.get_info``.

        Raises:
            ValueError: If ``uri`` is a file-like object (has ``read``).
            RuntimeError: If the extension returns a falsy result.
        """
        if hasattr(uri, "read"):
            # The two fragments form ONE message (implicit concatenation); a
            # comma between them would hand the exception a 2-tuple instead.
            raise ValueError(
                "SoX backend does not support reading from file-like objects. "
                "Please use an alternative backend that does support reading from file-like objects, e.g. FFmpeg."
            )
        else:
            # ``str(uri)`` matches ``load``/``save`` so os.PathLike is handled alike.
            sinfo = sox_ext.get_info(str(uri), format)
            if sinfo:
                return AudioMetaData(*sinfo)
            else:
                raise RuntimeError(f"Failed to fetch metadata for {uri}.")

    @staticmethod
    def load(
        uri: Union[BinaryIO, str, os.PathLike],
        frame_offset: int = 0,
        num_frames: int = -1,
        normalize: bool = True,
        channels_first: bool = True,
        format: Optional[str] = None,
        buffer_size: int = 4096,
    ) -> Tuple[torch.Tensor, int]:
        """Load audio from ``uri`` through ``sox_ext.load_audio_file``.

        Args:
            uri: Path to the audio file. File-like objects are rejected.
            frame_offset: Forwarded to the extension.
            num_frames: Forwarded to the extension.
            normalize: Forwarded to the extension.
            channels_first: Forwarded to the extension.
            format: Format hint forwarded to the extension.
            buffer_size: Unused.

        Returns:
            Tuple[torch.Tensor, int]: The value returned by the extension.

        Raises:
            ValueError: If ``uri`` is a file-like object (has ``read``).
            RuntimeError: If the extension returns a falsy result.
        """
        if hasattr(uri, "read"):
            raise ValueError(
                "SoX backend does not support loading from file-like objects. "
                "Please use an alternative backend that does support loading from file-like objects, e.g. FFmpeg."
            )
        else:
            ret = sox_ext.load_audio_file(str(uri), frame_offset, num_frames, normalize, channels_first, format)
            if not ret:
                raise RuntimeError(f"Failed to load audio from {uri}.")
            return ret

    @staticmethod
    def save(
        uri: Union[BinaryIO, str, os.PathLike],
        src: torch.Tensor,
        sample_rate: int,
        channels_first: bool = True,
        format: Optional[str] = None,
        encoding: Optional[str] = None,
        bits_per_sample: Optional[int] = None,
        buffer_size: int = 4096,
        compression: Optional[Union[torchaudio.io.CodecConfig, float, int]] = None,
    ) -> None:
        """Save ``src`` to ``uri`` through ``sox_ext.save_audio_file``.

        Args:
            uri: Destination path. File-like objects are rejected.
            src: Audio data, forwarded to the extension.
            sample_rate: Forwarded to the extension.
            channels_first: Forwarded to the extension.
            format: Forwarded to the extension.
            encoding: Forwarded to the extension.
            bits_per_sample: Forwarded to the extension.
            buffer_size: Unused.
            compression: ``None``, ``float`` or ``int``; any other type
                (including ``CodecConfig``) is rejected.

        Raises:
            ValueError: If ``compression`` has an unsupported type, or ``uri``
                is a file-like object (has ``write``).
        """
        if not isinstance(compression, (float, int, type(None))):
            raise ValueError(
                "SoX backend expects non-`None` value for argument `compression` to be of "
                f"type `float` or `int`, but received value of type {type(compression)}"
            )
        if hasattr(uri, "write"):
            raise ValueError(
                "SoX backend does not support writing to file-like objects. "
                "Please use an alternative backend that does support writing to file-like objects, e.g. FFmpeg."
            )
        else:
            sox_ext.save_audio_file(
                str(uri),
                src,
                sample_rate,
                channels_first,
                compression,
                format,
                encoding,
                bits_per_sample,
            )

    @staticmethod
    def can_decode(uri: Union[BinaryIO, str, os.PathLike], format: Optional[str]) -> bool:
        """Return ``True`` unless ``uri`` is a file-like object (has ``read``)."""
        # i.e. not a file-like object.
        return not hasattr(uri, "read")

    @staticmethod
    def can_encode(uri: Union[BinaryIO, str, os.PathLike], format: Optional[str]) -> bool:
        """Return ``True`` unless ``uri`` is a file-like object (has ``write``)."""
        # i.e. not a file-like object.
        return not hasattr(uri, "write")
|