torchaudio 2.8.0__cp313-cp313t-win_amd64.whl → 2.9.0__cp313-cp313t-win_amd64.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Potentially problematic release.
This version of torchaudio might be problematic. Click here for more details.
- torchaudio/__init__.py +179 -39
- torchaudio/_extension/__init__.py +1 -14
- torchaudio/_extension/utils.py +0 -47
- torchaudio/_internal/module_utils.py +12 -3
- torchaudio/_torchcodec.py +73 -85
- torchaudio/datasets/cmuarctic.py +1 -1
- torchaudio/datasets/utils.py +1 -1
- torchaudio/functional/__init__.py +0 -2
- torchaudio/functional/_alignment.py +1 -1
- torchaudio/functional/filtering.py +70 -55
- torchaudio/functional/functional.py +26 -60
- torchaudio/lib/_torchaudio.pyd +0 -0
- torchaudio/lib/libtorchaudio.pyd +0 -0
- torchaudio/models/decoder/__init__.py +14 -2
- torchaudio/models/decoder/_ctc_decoder.py +6 -6
- torchaudio/models/decoder/_cuda_ctc_decoder.py +1 -1
- torchaudio/models/squim/objective.py +2 -2
- torchaudio/pipelines/_source_separation_pipeline.py +1 -1
- torchaudio/pipelines/_squim_pipeline.py +2 -2
- torchaudio/pipelines/_tts/utils.py +1 -1
- torchaudio/pipelines/rnnt_pipeline.py +4 -4
- torchaudio/transforms/__init__.py +1 -0
- torchaudio/transforms/_transforms.py +2 -2
- torchaudio/utils/__init__.py +2 -9
- torchaudio/utils/download.py +1 -3
- torchaudio/version.py +2 -2
- {torchaudio-2.8.0.dist-info → torchaudio-2.9.0.dist-info}/METADATA +8 -11
- torchaudio-2.9.0.dist-info/RECORD +85 -0
- {torchaudio-2.8.0.dist-info → torchaudio-2.9.0.dist-info}/top_level.txt +0 -1
- torchaudio/_backend/__init__.py +0 -61
- torchaudio/_backend/backend.py +0 -53
- torchaudio/_backend/common.py +0 -52
- torchaudio/_backend/ffmpeg.py +0 -334
- torchaudio/_backend/soundfile.py +0 -54
- torchaudio/_backend/soundfile_backend.py +0 -457
- torchaudio/_backend/sox.py +0 -91
- torchaudio/_backend/utils.py +0 -350
- torchaudio/backend/__init__.py +0 -8
- torchaudio/backend/_no_backend.py +0 -25
- torchaudio/backend/_sox_io_backend.py +0 -294
- torchaudio/backend/common.py +0 -13
- torchaudio/backend/no_backend.py +0 -14
- torchaudio/backend/soundfile_backend.py +0 -14
- torchaudio/backend/sox_io_backend.py +0 -14
- torchaudio/io/__init__.py +0 -20
- torchaudio/io/_effector.py +0 -347
- torchaudio/io/_playback.py +0 -72
- torchaudio/kaldi_io.py +0 -150
- torchaudio/prototype/__init__.py +0 -0
- torchaudio/prototype/datasets/__init__.py +0 -4
- torchaudio/prototype/datasets/musan.py +0 -68
- torchaudio/prototype/functional/__init__.py +0 -26
- torchaudio/prototype/functional/_dsp.py +0 -441
- torchaudio/prototype/functional/_rir.py +0 -382
- torchaudio/prototype/functional/functional.py +0 -193
- torchaudio/prototype/models/__init__.py +0 -39
- torchaudio/prototype/models/_conformer_wav2vec2.py +0 -801
- torchaudio/prototype/models/_emformer_hubert.py +0 -337
- torchaudio/prototype/models/conv_emformer.py +0 -529
- torchaudio/prototype/models/hifi_gan.py +0 -342
- torchaudio/prototype/models/rnnt.py +0 -717
- torchaudio/prototype/models/rnnt_decoder.py +0 -402
- torchaudio/prototype/pipelines/__init__.py +0 -21
- torchaudio/prototype/pipelines/_vggish/__init__.py +0 -7
- torchaudio/prototype/pipelines/_vggish/_vggish_impl.py +0 -236
- torchaudio/prototype/pipelines/_vggish/_vggish_pipeline.py +0 -83
- torchaudio/prototype/pipelines/hifigan_pipeline.py +0 -233
- torchaudio/prototype/pipelines/rnnt_pipeline.py +0 -58
- torchaudio/prototype/transforms/__init__.py +0 -9
- torchaudio/prototype/transforms/_transforms.py +0 -461
- torchaudio/sox_effects/__init__.py +0 -10
- torchaudio/sox_effects/sox_effects.py +0 -275
- torchaudio/utils/ffmpeg_utils.py +0 -11
- torchaudio/utils/sox_utils.py +0 -118
- torchaudio-2.8.0.dist-info/RECORD +0 -145
- torio/__init__.py +0 -8
- torio/_extension/__init__.py +0 -13
- torio/_extension/utils.py +0 -147
- torio/io/__init__.py +0 -9
- torio/io/_streaming_media_decoder.py +0 -977
- torio/io/_streaming_media_encoder.py +0 -502
- torio/lib/__init__.py +0 -0
- torio/lib/_torio_ffmpeg4.pyd +0 -0
- torio/lib/_torio_ffmpeg5.pyd +0 -0
- torio/lib/_torio_ffmpeg6.pyd +0 -0
- torio/lib/libtorio_ffmpeg4.pyd +0 -0
- torio/lib/libtorio_ffmpeg5.pyd +0 -0
- torio/lib/libtorio_ffmpeg6.pyd +0 -0
- torio/utils/__init__.py +0 -4
- torio/utils/ffmpeg_utils.py +0 -275
- {torchaudio-2.8.0.dist-info → torchaudio-2.9.0.dist-info}/LICENSE +0 -0
- {torchaudio-2.8.0.dist-info → torchaudio-2.9.0.dist-info}/WHEEL +0 -0
torchaudio/_backend/backend.py
DELETED
|
@@ -1,53 +0,0 @@
|
|
|
1
|
-
import os
|
|
2
|
-
from abc import ABC, abstractmethod
|
|
3
|
-
from typing import BinaryIO, Optional, Tuple, Union
|
|
4
|
-
|
|
5
|
-
from torch import Tensor
|
|
6
|
-
from torchaudio.io import CodecConfig
|
|
7
|
-
|
|
8
|
-
from .common import AudioMetaData
|
|
9
|
-
|
|
10
|
-
|
|
11
|
-
class Backend(ABC):
|
|
12
|
-
@staticmethod
|
|
13
|
-
@abstractmethod
|
|
14
|
-
def info(uri: Union[BinaryIO, str, os.PathLike], format: Optional[str], buffer_size: int = 4096) -> AudioMetaData:
|
|
15
|
-
raise NotImplementedError
|
|
16
|
-
|
|
17
|
-
@staticmethod
|
|
18
|
-
@abstractmethod
|
|
19
|
-
def load(
|
|
20
|
-
uri: Union[BinaryIO, str, os.PathLike],
|
|
21
|
-
frame_offset: int = 0,
|
|
22
|
-
num_frames: int = -1,
|
|
23
|
-
normalize: bool = True,
|
|
24
|
-
channels_first: bool = True,
|
|
25
|
-
format: Optional[str] = None,
|
|
26
|
-
buffer_size: int = 4096,
|
|
27
|
-
) -> Tuple[Tensor, int]:
|
|
28
|
-
raise NotImplementedError
|
|
29
|
-
|
|
30
|
-
@staticmethod
|
|
31
|
-
@abstractmethod
|
|
32
|
-
def save(
|
|
33
|
-
uri: Union[BinaryIO, str, os.PathLike],
|
|
34
|
-
src: Tensor,
|
|
35
|
-
sample_rate: int,
|
|
36
|
-
channels_first: bool = True,
|
|
37
|
-
format: Optional[str] = None,
|
|
38
|
-
encoding: Optional[str] = None,
|
|
39
|
-
bits_per_sample: Optional[int] = None,
|
|
40
|
-
buffer_size: int = 4096,
|
|
41
|
-
compression: Optional[Union[CodecConfig, float, int]] = None,
|
|
42
|
-
) -> None:
|
|
43
|
-
raise NotImplementedError
|
|
44
|
-
|
|
45
|
-
@staticmethod
|
|
46
|
-
@abstractmethod
|
|
47
|
-
def can_decode(uri: Union[BinaryIO, str, os.PathLike], format: Optional[str]) -> bool:
|
|
48
|
-
raise NotImplementedError
|
|
49
|
-
|
|
50
|
-
@staticmethod
|
|
51
|
-
@abstractmethod
|
|
52
|
-
def can_encode(uri: Union[BinaryIO, str, os.PathLike], format: Optional[str]) -> bool:
|
|
53
|
-
raise NotImplementedError
|
torchaudio/_backend/common.py
DELETED
|
@@ -1,52 +0,0 @@
|
|
|
1
|
-
class AudioMetaData:
|
|
2
|
-
"""AudioMetaData()
|
|
3
|
-
|
|
4
|
-
Return type of ``torchaudio.info`` function.
|
|
5
|
-
|
|
6
|
-
:ivar int sample_rate: Sample rate
|
|
7
|
-
:ivar int num_frames: The number of frames
|
|
8
|
-
:ivar int num_channels: The number of channels
|
|
9
|
-
:ivar int bits_per_sample: The number of bits per sample. This is 0 for lossy formats,
|
|
10
|
-
or when it cannot be accurately inferred.
|
|
11
|
-
:ivar str encoding: Audio encoding
|
|
12
|
-
The values encoding can take are one of the following:
|
|
13
|
-
|
|
14
|
-
* ``PCM_S``: Signed integer linear PCM
|
|
15
|
-
* ``PCM_U``: Unsigned integer linear PCM
|
|
16
|
-
* ``PCM_F``: Floating point linear PCM
|
|
17
|
-
* ``FLAC``: Flac, Free Lossless Audio Codec
|
|
18
|
-
* ``ULAW``: Mu-law
|
|
19
|
-
* ``ALAW``: A-law
|
|
20
|
-
* ``MP3`` : MP3, MPEG-1 Audio Layer III
|
|
21
|
-
* ``VORBIS``: OGG Vorbis
|
|
22
|
-
* ``AMR_WB``: Adaptive Multi-Rate Wideband
|
|
23
|
-
* ``AMR_NB``: Adaptive Multi-Rate Narrowband
|
|
24
|
-
* ``OPUS``: Opus
|
|
25
|
-
* ``HTK``: Single channel 16-bit PCM
|
|
26
|
-
* ``UNKNOWN`` : None of above
|
|
27
|
-
"""
|
|
28
|
-
|
|
29
|
-
def __init__(
|
|
30
|
-
self,
|
|
31
|
-
sample_rate: int,
|
|
32
|
-
num_frames: int,
|
|
33
|
-
num_channels: int,
|
|
34
|
-
bits_per_sample: int,
|
|
35
|
-
encoding: str,
|
|
36
|
-
):
|
|
37
|
-
self.sample_rate = sample_rate
|
|
38
|
-
self.num_frames = num_frames
|
|
39
|
-
self.num_channels = num_channels
|
|
40
|
-
self.bits_per_sample = bits_per_sample
|
|
41
|
-
self.encoding = encoding
|
|
42
|
-
|
|
43
|
-
def __str__(self):
|
|
44
|
-
return (
|
|
45
|
-
f"AudioMetaData("
|
|
46
|
-
f"sample_rate={self.sample_rate}, "
|
|
47
|
-
f"num_frames={self.num_frames}, "
|
|
48
|
-
f"num_channels={self.num_channels}, "
|
|
49
|
-
f"bits_per_sample={self.bits_per_sample}, "
|
|
50
|
-
f"encoding={self.encoding}"
|
|
51
|
-
f")"
|
|
52
|
-
)
|
torchaudio/_backend/ffmpeg.py
DELETED
|
@@ -1,334 +0,0 @@
|
|
|
1
|
-
import os
|
|
2
|
-
import re
|
|
3
|
-
import sys
|
|
4
|
-
from typing import BinaryIO, Optional, Tuple, Union
|
|
5
|
-
|
|
6
|
-
import torch
|
|
7
|
-
import torchaudio
|
|
8
|
-
|
|
9
|
-
from .backend import Backend
|
|
10
|
-
from .common import AudioMetaData
|
|
11
|
-
|
|
12
|
-
InputType = Union[BinaryIO, str, os.PathLike]
|
|
13
|
-
|
|
14
|
-
|
|
15
|
-
def info_audio(
|
|
16
|
-
src: InputType,
|
|
17
|
-
format: Optional[str],
|
|
18
|
-
buffer_size: int = 4096,
|
|
19
|
-
) -> AudioMetaData:
|
|
20
|
-
s = torchaudio.io.StreamReader(src, format, None, buffer_size)
|
|
21
|
-
sinfo = s.get_src_stream_info(s.default_audio_stream)
|
|
22
|
-
if sinfo.num_frames == 0:
|
|
23
|
-
waveform = _load_audio(s)
|
|
24
|
-
num_frames = waveform.size(1)
|
|
25
|
-
else:
|
|
26
|
-
num_frames = sinfo.num_frames
|
|
27
|
-
return AudioMetaData(
|
|
28
|
-
int(sinfo.sample_rate),
|
|
29
|
-
num_frames,
|
|
30
|
-
sinfo.num_channels,
|
|
31
|
-
sinfo.bits_per_sample,
|
|
32
|
-
sinfo.codec.upper(),
|
|
33
|
-
)
|
|
34
|
-
|
|
35
|
-
|
|
36
|
-
def _get_load_filter(
|
|
37
|
-
frame_offset: int = 0,
|
|
38
|
-
num_frames: int = -1,
|
|
39
|
-
convert: bool = True,
|
|
40
|
-
) -> Optional[str]:
|
|
41
|
-
if frame_offset < 0:
|
|
42
|
-
raise RuntimeError("Invalid argument: frame_offset must be non-negative. Found: {}".format(frame_offset))
|
|
43
|
-
if num_frames == 0 or num_frames < -1:
|
|
44
|
-
raise RuntimeError("Invalid argument: num_frames must be -1 or greater than 0. Found: {}".format(num_frames))
|
|
45
|
-
|
|
46
|
-
# All default values -> no filter
|
|
47
|
-
if frame_offset == 0 and num_frames == -1 and not convert:
|
|
48
|
-
return None
|
|
49
|
-
# Only convert
|
|
50
|
-
aformat = "aformat=sample_fmts=fltp"
|
|
51
|
-
if frame_offset == 0 and num_frames == -1 and convert:
|
|
52
|
-
return aformat
|
|
53
|
-
# At least one of frame_offset or num_frames has non-default value
|
|
54
|
-
if num_frames > 0:
|
|
55
|
-
atrim = "atrim=start_sample={}:end_sample={}".format(frame_offset, frame_offset + num_frames)
|
|
56
|
-
else:
|
|
57
|
-
atrim = "atrim=start_sample={}".format(frame_offset)
|
|
58
|
-
if not convert:
|
|
59
|
-
return atrim
|
|
60
|
-
return "{},{}".format(atrim, aformat)
|
|
61
|
-
|
|
62
|
-
|
|
63
|
-
def _load_audio(
|
|
64
|
-
s: "torchaudio.io.StreamReader",
|
|
65
|
-
filter: Optional[str] = None,
|
|
66
|
-
channels_first: bool = True,
|
|
67
|
-
) -> torch.Tensor:
|
|
68
|
-
s.add_audio_stream(-1, -1, filter_desc=filter)
|
|
69
|
-
s.process_all_packets()
|
|
70
|
-
chunk = s.pop_chunks()[0]
|
|
71
|
-
if chunk is None:
|
|
72
|
-
raise RuntimeError("Failed to decode audio.")
|
|
73
|
-
waveform = chunk._elem
|
|
74
|
-
return waveform.T if channels_first else waveform
|
|
75
|
-
|
|
76
|
-
|
|
77
|
-
def load_audio(
|
|
78
|
-
src: InputType,
|
|
79
|
-
frame_offset: int = 0,
|
|
80
|
-
num_frames: int = -1,
|
|
81
|
-
convert: bool = True,
|
|
82
|
-
channels_first: bool = True,
|
|
83
|
-
format: Optional[str] = None,
|
|
84
|
-
buffer_size: int = 4096,
|
|
85
|
-
) -> Tuple[torch.Tensor, int]:
|
|
86
|
-
if hasattr(src, "read") and format == "vorbis":
|
|
87
|
-
format = "ogg"
|
|
88
|
-
s = torchaudio.io.StreamReader(src, format, None, buffer_size)
|
|
89
|
-
sample_rate = int(s.get_src_stream_info(s.default_audio_stream).sample_rate)
|
|
90
|
-
filter = _get_load_filter(frame_offset, num_frames, convert)
|
|
91
|
-
waveform = _load_audio(s, filter, channels_first)
|
|
92
|
-
return waveform, sample_rate
|
|
93
|
-
|
|
94
|
-
|
|
95
|
-
def _get_sample_format(dtype: torch.dtype) -> str:
|
|
96
|
-
dtype_to_format = {
|
|
97
|
-
torch.uint8: "u8",
|
|
98
|
-
torch.int16: "s16",
|
|
99
|
-
torch.int32: "s32",
|
|
100
|
-
torch.int64: "s64",
|
|
101
|
-
torch.float32: "flt",
|
|
102
|
-
torch.float64: "dbl",
|
|
103
|
-
}
|
|
104
|
-
format = dtype_to_format.get(dtype)
|
|
105
|
-
if format is None:
|
|
106
|
-
raise ValueError(f"No format found for dtype {dtype}; dtype must be one of {list(dtype_to_format.keys())}.")
|
|
107
|
-
return format
|
|
108
|
-
|
|
109
|
-
|
|
110
|
-
def _native_endianness() -> str:
|
|
111
|
-
if sys.byteorder == "little":
|
|
112
|
-
return "le"
|
|
113
|
-
else:
|
|
114
|
-
return "be"
|
|
115
|
-
|
|
116
|
-
|
|
117
|
-
def _get_encoder_for_wav(encoding: str, bits_per_sample: int) -> str:
|
|
118
|
-
if bits_per_sample not in {None, 8, 16, 24, 32, 64}:
|
|
119
|
-
raise ValueError(f"Invalid bits_per_sample {bits_per_sample} for WAV encoding.")
|
|
120
|
-
endianness = _native_endianness()
|
|
121
|
-
if not encoding:
|
|
122
|
-
if not bits_per_sample:
|
|
123
|
-
# default to PCM S16
|
|
124
|
-
return f"pcm_s16{endianness}"
|
|
125
|
-
if bits_per_sample == 8:
|
|
126
|
-
return "pcm_u8"
|
|
127
|
-
return f"pcm_s{bits_per_sample}{endianness}"
|
|
128
|
-
if encoding == "PCM_S":
|
|
129
|
-
if not bits_per_sample:
|
|
130
|
-
bits_per_sample = 16
|
|
131
|
-
if bits_per_sample == 8:
|
|
132
|
-
raise ValueError("For WAV signed PCM, 8-bit encoding is not supported.")
|
|
133
|
-
return f"pcm_s{bits_per_sample}{endianness}"
|
|
134
|
-
if encoding == "PCM_U":
|
|
135
|
-
if bits_per_sample in (None, 8):
|
|
136
|
-
return "pcm_u8"
|
|
137
|
-
raise ValueError("For WAV unsigned PCM, only 8-bit encoding is supported.")
|
|
138
|
-
if encoding == "PCM_F":
|
|
139
|
-
if not bits_per_sample:
|
|
140
|
-
bits_per_sample = 32
|
|
141
|
-
if bits_per_sample in (32, 64):
|
|
142
|
-
return f"pcm_f{bits_per_sample}{endianness}"
|
|
143
|
-
raise ValueError("For WAV float PCM, only 32- and 64-bit encodings are supported.")
|
|
144
|
-
if encoding == "ULAW":
|
|
145
|
-
if bits_per_sample in (None, 8):
|
|
146
|
-
return "pcm_mulaw"
|
|
147
|
-
raise ValueError("For WAV PCM mu-law, only 8-bit encoding is supported.")
|
|
148
|
-
if encoding == "ALAW":
|
|
149
|
-
if bits_per_sample in (None, 8):
|
|
150
|
-
return "pcm_alaw"
|
|
151
|
-
raise ValueError("For WAV PCM A-law, only 8-bit encoding is supported.")
|
|
152
|
-
raise ValueError(f"WAV encoding {encoding} is not supported.")
|
|
153
|
-
|
|
154
|
-
|
|
155
|
-
def _get_flac_sample_fmt(bps):
|
|
156
|
-
if bps is None or bps == 16:
|
|
157
|
-
return "s16"
|
|
158
|
-
if bps == 24:
|
|
159
|
-
return "s32"
|
|
160
|
-
raise ValueError(f"FLAC only supports bits_per_sample values of 16 and 24 ({bps} specified).")
|
|
161
|
-
|
|
162
|
-
|
|
163
|
-
def _parse_save_args(
|
|
164
|
-
ext: Optional[str],
|
|
165
|
-
format: Optional[str],
|
|
166
|
-
encoding: Optional[str],
|
|
167
|
-
bps: Optional[int],
|
|
168
|
-
):
|
|
169
|
-
# torchaudio's save function accepts the following, which do not map one-to-one
|
|
170
|
-
# to FFmpeg.
|
|
171
|
-
#
|
|
172
|
-
# - format: audio format
|
|
173
|
-
# - bits_per_sample: encoder sample format
|
|
174
|
-
# - encoding: such as PCM_U8.
|
|
175
|
-
#
|
|
176
|
-
# In FFmpeg, format is specified with the following three (and more)
|
|
177
|
-
#
|
|
178
|
-
# - muxer: could be audio format or container format.
|
|
179
|
-
# the one we passed to the constructor of StreamWriter
|
|
180
|
-
# - encoder: the audio encoder used to encode audio
|
|
181
|
-
# - encoder sample format: the format used by encoder to encode audio.
|
|
182
|
-
#
|
|
183
|
-
# If encoder sample format is different from source sample format, StreamWriter
|
|
184
|
-
# will insert a filter automatically.
|
|
185
|
-
#
|
|
186
|
-
def _type(spec):
|
|
187
|
-
# either format is exactly the specified one
|
|
188
|
-
# or extension matches to the spec AND there is no format override.
|
|
189
|
-
return format == spec or (format is None and ext == spec)
|
|
190
|
-
|
|
191
|
-
if _type("wav") or _type("amb"):
|
|
192
|
-
# wav is special because it supports different encoding through encoders
|
|
193
|
-
# each encoder only supports one encoder format
|
|
194
|
-
#
|
|
195
|
-
# amb format is a special case originated from libsox.
|
|
196
|
-
# It is basically a WAV format, with slight modification.
|
|
197
|
-
# https://github.com/chirlu/sox/commit/4a4ea33edbca5972a1ed8933cc3512c7302fa67a#diff-39171191a858add9df87f5f210a34a776ac2c026842ae6db6ce97f5e68836795
|
|
198
|
-
# It is a format so that decoders will recognize it as ambisonic.
|
|
199
|
-
# https://www.ambisonia.com/Members/mleese/file-format-for-b-format/
|
|
200
|
-
# FFmpeg does not recognize amb because it is basically a WAV format.
|
|
201
|
-
muxer = "wav"
|
|
202
|
-
encoder = _get_encoder_for_wav(encoding, bps)
|
|
203
|
-
sample_fmt = None
|
|
204
|
-
elif _type("vorbis"):
|
|
205
|
-
# FFmpeg does not recognize the vorbis extension, whereas libsox did.
|
|
206
|
-
# For the sake of backward compatibility (and simplicity),
|
|
207
|
-
# we support the case where users want to do save("foo.vorbis")
|
|
208
|
-
muxer = "ogg"
|
|
209
|
-
encoder = "vorbis"
|
|
210
|
-
sample_fmt = None
|
|
211
|
-
else:
|
|
212
|
-
muxer = format
|
|
213
|
-
encoder = None
|
|
214
|
-
sample_fmt = None
|
|
215
|
-
if _type("flac"):
|
|
216
|
-
sample_fmt = _get_flac_sample_fmt(bps)
|
|
217
|
-
if _type("ogg"):
|
|
218
|
-
sample_fmt = _get_flac_sample_fmt(bps)
|
|
219
|
-
return muxer, encoder, sample_fmt
|
|
220
|
-
|
|
221
|
-
|
|
222
|
-
def save_audio(
|
|
223
|
-
uri: InputType,
|
|
224
|
-
src: torch.Tensor,
|
|
225
|
-
sample_rate: int,
|
|
226
|
-
channels_first: bool = True,
|
|
227
|
-
format: Optional[str] = None,
|
|
228
|
-
encoding: Optional[str] = None,
|
|
229
|
-
bits_per_sample: Optional[int] = None,
|
|
230
|
-
buffer_size: int = 4096,
|
|
231
|
-
compression: Optional[torchaudio.io.CodecConfig] = None,
|
|
232
|
-
) -> None:
|
|
233
|
-
ext = None
|
|
234
|
-
if hasattr(uri, "write"):
|
|
235
|
-
if format is None:
|
|
236
|
-
raise RuntimeError("'format' is required when saving to file object.")
|
|
237
|
-
else:
|
|
238
|
-
uri = os.path.normpath(uri)
|
|
239
|
-
if tokens := str(uri).split(".")[1:]:
|
|
240
|
-
ext = tokens[-1].lower()
|
|
241
|
-
|
|
242
|
-
muxer, encoder, enc_fmt = _parse_save_args(ext, format, encoding, bits_per_sample)
|
|
243
|
-
|
|
244
|
-
if channels_first:
|
|
245
|
-
src = src.T
|
|
246
|
-
|
|
247
|
-
s = torchaudio.io.StreamWriter(uri, format=muxer, buffer_size=buffer_size)
|
|
248
|
-
s.add_audio_stream(
|
|
249
|
-
sample_rate,
|
|
250
|
-
num_channels=src.size(-1),
|
|
251
|
-
format=_get_sample_format(src.dtype),
|
|
252
|
-
encoder=encoder,
|
|
253
|
-
encoder_format=enc_fmt,
|
|
254
|
-
codec_config=compression,
|
|
255
|
-
)
|
|
256
|
-
with s.open():
|
|
257
|
-
s.write_audio_chunk(0, src)
|
|
258
|
-
|
|
259
|
-
|
|
260
|
-
def _map_encoding(encoding: str) -> str:
|
|
261
|
-
for dst in ["PCM_S", "PCM_U", "PCM_F"]:
|
|
262
|
-
if dst in encoding:
|
|
263
|
-
return dst
|
|
264
|
-
if encoding == "PCM_MULAW":
|
|
265
|
-
return "ULAW"
|
|
266
|
-
elif encoding == "PCM_ALAW":
|
|
267
|
-
return "ALAW"
|
|
268
|
-
return encoding
|
|
269
|
-
|
|
270
|
-
|
|
271
|
-
def _get_bits_per_sample(encoding: str, bits_per_sample: int) -> str:
|
|
272
|
-
if m := re.search(r"PCM_\w(\d+)\w*", encoding):
|
|
273
|
-
return int(m.group(1))
|
|
274
|
-
elif encoding in ["PCM_ALAW", "PCM_MULAW"]:
|
|
275
|
-
return 8
|
|
276
|
-
return bits_per_sample
|
|
277
|
-
|
|
278
|
-
|
|
279
|
-
class FFmpegBackend(Backend):
|
|
280
|
-
@staticmethod
|
|
281
|
-
def info(uri: InputType, format: Optional[str], buffer_size: int = 4096) -> AudioMetaData:
|
|
282
|
-
metadata = info_audio(uri, format, buffer_size)
|
|
283
|
-
metadata.bits_per_sample = _get_bits_per_sample(metadata.encoding, metadata.bits_per_sample)
|
|
284
|
-
metadata.encoding = _map_encoding(metadata.encoding)
|
|
285
|
-
return metadata
|
|
286
|
-
|
|
287
|
-
@staticmethod
|
|
288
|
-
def load(
|
|
289
|
-
uri: InputType,
|
|
290
|
-
frame_offset: int = 0,
|
|
291
|
-
num_frames: int = -1,
|
|
292
|
-
normalize: bool = True,
|
|
293
|
-
channels_first: bool = True,
|
|
294
|
-
format: Optional[str] = None,
|
|
295
|
-
buffer_size: int = 4096,
|
|
296
|
-
) -> Tuple[torch.Tensor, int]:
|
|
297
|
-
return load_audio(uri, frame_offset, num_frames, normalize, channels_first, format)
|
|
298
|
-
|
|
299
|
-
@staticmethod
|
|
300
|
-
def save(
|
|
301
|
-
uri: InputType,
|
|
302
|
-
src: torch.Tensor,
|
|
303
|
-
sample_rate: int,
|
|
304
|
-
channels_first: bool = True,
|
|
305
|
-
format: Optional[str] = None,
|
|
306
|
-
encoding: Optional[str] = None,
|
|
307
|
-
bits_per_sample: Optional[int] = None,
|
|
308
|
-
buffer_size: int = 4096,
|
|
309
|
-
compression: Optional[Union[torchaudio.io.CodecConfig, float, int]] = None,
|
|
310
|
-
) -> None:
|
|
311
|
-
if not isinstance(compression, (torchaudio.io.CodecConfig, type(None))):
|
|
312
|
-
raise ValueError(
|
|
313
|
-
"FFmpeg backend expects non-`None` value for argument `compression` to be of ",
|
|
314
|
-
f"type `torchaudio.io.CodecConfig`, but received value of type {type(compression)}",
|
|
315
|
-
)
|
|
316
|
-
save_audio(
|
|
317
|
-
uri,
|
|
318
|
-
src,
|
|
319
|
-
sample_rate,
|
|
320
|
-
channels_first,
|
|
321
|
-
format,
|
|
322
|
-
encoding,
|
|
323
|
-
bits_per_sample,
|
|
324
|
-
buffer_size,
|
|
325
|
-
compression,
|
|
326
|
-
)
|
|
327
|
-
|
|
328
|
-
@staticmethod
|
|
329
|
-
def can_decode(uri: InputType, format: Optional[str]) -> bool:
|
|
330
|
-
return True
|
|
331
|
-
|
|
332
|
-
@staticmethod
|
|
333
|
-
def can_encode(uri: InputType, format: Optional[str]) -> bool:
|
|
334
|
-
return True
|
torchaudio/_backend/soundfile.py
DELETED
|
@@ -1,54 +0,0 @@
|
|
|
1
|
-
import os
|
|
2
|
-
from typing import BinaryIO, Optional, Tuple, Union
|
|
3
|
-
|
|
4
|
-
import torch
|
|
5
|
-
from torchaudio.io import CodecConfig
|
|
6
|
-
|
|
7
|
-
from . import soundfile_backend
|
|
8
|
-
from .backend import Backend
|
|
9
|
-
from .common import AudioMetaData
|
|
10
|
-
|
|
11
|
-
|
|
12
|
-
class SoundfileBackend(Backend):
|
|
13
|
-
@staticmethod
|
|
14
|
-
def info(uri: Union[BinaryIO, str, os.PathLike], format: Optional[str], buffer_size: int = 4096) -> AudioMetaData:
|
|
15
|
-
return soundfile_backend.info(uri, format)
|
|
16
|
-
|
|
17
|
-
@staticmethod
|
|
18
|
-
def load(
|
|
19
|
-
uri: Union[BinaryIO, str, os.PathLike],
|
|
20
|
-
frame_offset: int = 0,
|
|
21
|
-
num_frames: int = -1,
|
|
22
|
-
normalize: bool = True,
|
|
23
|
-
channels_first: bool = True,
|
|
24
|
-
format: Optional[str] = None,
|
|
25
|
-
buffer_size: int = 4096,
|
|
26
|
-
) -> Tuple[torch.Tensor, int]:
|
|
27
|
-
return soundfile_backend.load(uri, frame_offset, num_frames, normalize, channels_first, format)
|
|
28
|
-
|
|
29
|
-
@staticmethod
|
|
30
|
-
def save(
|
|
31
|
-
uri: Union[BinaryIO, str, os.PathLike],
|
|
32
|
-
src: torch.Tensor,
|
|
33
|
-
sample_rate: int,
|
|
34
|
-
channels_first: bool = True,
|
|
35
|
-
format: Optional[str] = None,
|
|
36
|
-
encoding: Optional[str] = None,
|
|
37
|
-
bits_per_sample: Optional[int] = None,
|
|
38
|
-
buffer_size: int = 4096,
|
|
39
|
-
compression: Optional[Union[CodecConfig, float, int]] = None,
|
|
40
|
-
) -> None:
|
|
41
|
-
if compression:
|
|
42
|
-
raise ValueError("soundfile backend does not support argument `compression`.")
|
|
43
|
-
|
|
44
|
-
soundfile_backend.save(
|
|
45
|
-
uri, src, sample_rate, channels_first, format=format, encoding=encoding, bits_per_sample=bits_per_sample
|
|
46
|
-
)
|
|
47
|
-
|
|
48
|
-
@staticmethod
|
|
49
|
-
def can_decode(uri, format) -> bool:
|
|
50
|
-
return True
|
|
51
|
-
|
|
52
|
-
@staticmethod
|
|
53
|
-
def can_encode(uri, format) -> bool:
|
|
54
|
-
return True
|