torchaudio 2.8.0__cp313-cp313t-win_amd64.whl → 2.9.0__cp313-cp313t-win_amd64.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release.


This version of torchaudio might be problematic. Click here for more details.

Files changed (92) hide show
  1. torchaudio/__init__.py +179 -39
  2. torchaudio/_extension/__init__.py +1 -14
  3. torchaudio/_extension/utils.py +0 -47
  4. torchaudio/_internal/module_utils.py +12 -3
  5. torchaudio/_torchcodec.py +73 -85
  6. torchaudio/datasets/cmuarctic.py +1 -1
  7. torchaudio/datasets/utils.py +1 -1
  8. torchaudio/functional/__init__.py +0 -2
  9. torchaudio/functional/_alignment.py +1 -1
  10. torchaudio/functional/filtering.py +70 -55
  11. torchaudio/functional/functional.py +26 -60
  12. torchaudio/lib/_torchaudio.pyd +0 -0
  13. torchaudio/lib/libtorchaudio.pyd +0 -0
  14. torchaudio/models/decoder/__init__.py +14 -2
  15. torchaudio/models/decoder/_ctc_decoder.py +6 -6
  16. torchaudio/models/decoder/_cuda_ctc_decoder.py +1 -1
  17. torchaudio/models/squim/objective.py +2 -2
  18. torchaudio/pipelines/_source_separation_pipeline.py +1 -1
  19. torchaudio/pipelines/_squim_pipeline.py +2 -2
  20. torchaudio/pipelines/_tts/utils.py +1 -1
  21. torchaudio/pipelines/rnnt_pipeline.py +4 -4
  22. torchaudio/transforms/__init__.py +1 -0
  23. torchaudio/transforms/_transforms.py +2 -2
  24. torchaudio/utils/__init__.py +2 -9
  25. torchaudio/utils/download.py +1 -3
  26. torchaudio/version.py +2 -2
  27. {torchaudio-2.8.0.dist-info → torchaudio-2.9.0.dist-info}/METADATA +8 -11
  28. torchaudio-2.9.0.dist-info/RECORD +85 -0
  29. {torchaudio-2.8.0.dist-info → torchaudio-2.9.0.dist-info}/top_level.txt +0 -1
  30. torchaudio/_backend/__init__.py +0 -61
  31. torchaudio/_backend/backend.py +0 -53
  32. torchaudio/_backend/common.py +0 -52
  33. torchaudio/_backend/ffmpeg.py +0 -334
  34. torchaudio/_backend/soundfile.py +0 -54
  35. torchaudio/_backend/soundfile_backend.py +0 -457
  36. torchaudio/_backend/sox.py +0 -91
  37. torchaudio/_backend/utils.py +0 -350
  38. torchaudio/backend/__init__.py +0 -8
  39. torchaudio/backend/_no_backend.py +0 -25
  40. torchaudio/backend/_sox_io_backend.py +0 -294
  41. torchaudio/backend/common.py +0 -13
  42. torchaudio/backend/no_backend.py +0 -14
  43. torchaudio/backend/soundfile_backend.py +0 -14
  44. torchaudio/backend/sox_io_backend.py +0 -14
  45. torchaudio/io/__init__.py +0 -20
  46. torchaudio/io/_effector.py +0 -347
  47. torchaudio/io/_playback.py +0 -72
  48. torchaudio/kaldi_io.py +0 -150
  49. torchaudio/prototype/__init__.py +0 -0
  50. torchaudio/prototype/datasets/__init__.py +0 -4
  51. torchaudio/prototype/datasets/musan.py +0 -68
  52. torchaudio/prototype/functional/__init__.py +0 -26
  53. torchaudio/prototype/functional/_dsp.py +0 -441
  54. torchaudio/prototype/functional/_rir.py +0 -382
  55. torchaudio/prototype/functional/functional.py +0 -193
  56. torchaudio/prototype/models/__init__.py +0 -39
  57. torchaudio/prototype/models/_conformer_wav2vec2.py +0 -801
  58. torchaudio/prototype/models/_emformer_hubert.py +0 -337
  59. torchaudio/prototype/models/conv_emformer.py +0 -529
  60. torchaudio/prototype/models/hifi_gan.py +0 -342
  61. torchaudio/prototype/models/rnnt.py +0 -717
  62. torchaudio/prototype/models/rnnt_decoder.py +0 -402
  63. torchaudio/prototype/pipelines/__init__.py +0 -21
  64. torchaudio/prototype/pipelines/_vggish/__init__.py +0 -7
  65. torchaudio/prototype/pipelines/_vggish/_vggish_impl.py +0 -236
  66. torchaudio/prototype/pipelines/_vggish/_vggish_pipeline.py +0 -83
  67. torchaudio/prototype/pipelines/hifigan_pipeline.py +0 -233
  68. torchaudio/prototype/pipelines/rnnt_pipeline.py +0 -58
  69. torchaudio/prototype/transforms/__init__.py +0 -9
  70. torchaudio/prototype/transforms/_transforms.py +0 -461
  71. torchaudio/sox_effects/__init__.py +0 -10
  72. torchaudio/sox_effects/sox_effects.py +0 -275
  73. torchaudio/utils/ffmpeg_utils.py +0 -11
  74. torchaudio/utils/sox_utils.py +0 -118
  75. torchaudio-2.8.0.dist-info/RECORD +0 -145
  76. torio/__init__.py +0 -8
  77. torio/_extension/__init__.py +0 -13
  78. torio/_extension/utils.py +0 -147
  79. torio/io/__init__.py +0 -9
  80. torio/io/_streaming_media_decoder.py +0 -977
  81. torio/io/_streaming_media_encoder.py +0 -502
  82. torio/lib/__init__.py +0 -0
  83. torio/lib/_torio_ffmpeg4.pyd +0 -0
  84. torio/lib/_torio_ffmpeg5.pyd +0 -0
  85. torio/lib/_torio_ffmpeg6.pyd +0 -0
  86. torio/lib/libtorio_ffmpeg4.pyd +0 -0
  87. torio/lib/libtorio_ffmpeg5.pyd +0 -0
  88. torio/lib/libtorio_ffmpeg6.pyd +0 -0
  89. torio/utils/__init__.py +0 -4
  90. torio/utils/ffmpeg_utils.py +0 -275
  91. {torchaudio-2.8.0.dist-info → torchaudio-2.9.0.dist-info}/LICENSE +0 -0
  92. {torchaudio-2.8.0.dist-info → torchaudio-2.9.0.dist-info}/WHEEL +0 -0
@@ -1,457 +0,0 @@
1
- """The new soundfile backend which will become default in 0.8.0 onward"""
2
- import warnings
3
- from typing import Optional, Tuple
4
-
5
- import torch
6
- from torchaudio._internal import module_utils as _mod_utils
7
-
8
- from .common import AudioMetaData
9
-
10
-
11
- _IS_SOUNDFILE_AVAILABLE = False
12
-
13
- # TODO: import soundfile only when it is used.
14
- if _mod_utils.is_module_available("soundfile"):
15
- try:
16
- import soundfile
17
-
18
- _requires_soundfile = _mod_utils.no_op
19
- _IS_SOUNDFILE_AVAILABLE = True
20
- except Exception:
21
- _requires_soundfile = _mod_utils.fail_with_message(
22
- "requires soundfile, but we failed to import it. Please check the installation of soundfile."
23
- )
24
- else:
25
- _requires_soundfile = _mod_utils.fail_with_message(
26
- "requires soundfile, but it is not installed. Please install soundfile."
27
- )
28
-
29
-
30
- # Mapping from soundfile subtype to number of bits per sample.
31
- # This is mostly heuristic and the value is set to 0 when it is irrelevant
32
- # (lossy formats) or when it can't be inferred.
33
- # For ADPCM (and G72X) subtypes, it's hard to infer the bit depth because it's not part of the standard:
34
- # According to https://en.wikipedia.org/wiki/Adaptive_differential_pulse-code_modulation#In_telephony,
35
- # the default seems to be 8 bits but it can be compressed further to 4 bits.
36
- # The dict is inspired from
37
- # https://github.com/bastibe/python-soundfile/blob/744efb4b01abc72498a96b09115b42a4cabd85e4/soundfile.py#L66-L94
38
- _SUBTYPE_TO_BITS_PER_SAMPLE = {
39
- "PCM_S8": 8, # Signed 8 bit data
40
- "PCM_16": 16, # Signed 16 bit data
41
- "PCM_24": 24, # Signed 24 bit data
42
- "PCM_32": 32, # Signed 32 bit data
43
- "PCM_U8": 8, # Unsigned 8 bit data (WAV and RAW only)
44
- "FLOAT": 32, # 32 bit float data
45
- "DOUBLE": 64, # 64 bit float data
46
- "ULAW": 8, # U-Law encoded. See https://en.wikipedia.org/wiki/G.711#Types
47
- "ALAW": 8, # A-Law encoded. See https://en.wikipedia.org/wiki/G.711#Types
48
- "IMA_ADPCM": 0, # IMA ADPCM.
49
- "MS_ADPCM": 0, # Microsoft ADPCM.
50
- "GSM610": 0, # GSM 6.10 encoding. (Wikipedia says 1.625 bit depth?? https://en.wikipedia.org/wiki/Full_Rate)
51
- "VOX_ADPCM": 0, # OKI / Dialogix ADPCM
52
- "G721_32": 0, # 32kbs G721 ADPCM encoding.
53
- "G723_24": 0, # 24kbs G723 ADPCM encoding.
54
- "G723_40": 0, # 40kbs G723 ADPCM encoding.
55
- "DWVW_12": 12, # 12 bit Delta Width Variable Word encoding.
56
- "DWVW_16": 16, # 16 bit Delta Width Variable Word encoding.
57
- "DWVW_24": 24, # 24 bit Delta Width Variable Word encoding.
58
- "DWVW_N": 0, # N bit Delta Width Variable Word encoding.
59
- "DPCM_8": 8, # 8 bit differential PCM (XI only)
60
- "DPCM_16": 16, # 16 bit differential PCM (XI only)
61
- "VORBIS": 0, # Xiph Vorbis encoding. (lossy)
62
- "ALAC_16": 16, # Apple Lossless Audio Codec (16 bit).
63
- "ALAC_20": 20, # Apple Lossless Audio Codec (20 bit).
64
- "ALAC_24": 24, # Apple Lossless Audio Codec (24 bit).
65
- "ALAC_32": 32, # Apple Lossless Audio Codec (32 bit).
66
- }
67
-
68
-
69
- def _get_bit_depth(subtype):
70
- if subtype not in _SUBTYPE_TO_BITS_PER_SAMPLE:
71
- warnings.warn(
72
- f"The {subtype} subtype is unknown to TorchAudio. As a result, the bits_per_sample "
73
- "attribute will be set to 0. If you are seeing this warning, please "
74
- "report by opening an issue on github (after checking for existing/closed ones). "
75
- "You may otherwise ignore this warning."
76
- )
77
- return _SUBTYPE_TO_BITS_PER_SAMPLE.get(subtype, 0)
78
-
79
-
80
- _SUBTYPE_TO_ENCODING = {
81
- "PCM_S8": "PCM_S",
82
- "PCM_16": "PCM_S",
83
- "PCM_24": "PCM_S",
84
- "PCM_32": "PCM_S",
85
- "PCM_U8": "PCM_U",
86
- "FLOAT": "PCM_F",
87
- "DOUBLE": "PCM_F",
88
- "ULAW": "ULAW",
89
- "ALAW": "ALAW",
90
- "VORBIS": "VORBIS",
91
- }
92
-
93
-
94
- def _get_encoding(format: str, subtype: str):
95
- if format == "FLAC":
96
- return "FLAC"
97
- return _SUBTYPE_TO_ENCODING.get(subtype, "UNKNOWN")
98
-
99
-
100
- @_requires_soundfile
101
- def info(filepath: str, format: Optional[str] = None) -> AudioMetaData:
102
- """Get signal information of an audio file.
103
-
104
- Note:
105
- ``filepath`` argument is intentionally annotated as ``str`` only, even though it accepts
106
- ``pathlib.Path`` object as well. This is for the consistency with ``"sox_io"`` backend,
107
- which has a restriction on type annotation due to TorchScript compiler compatibility.
108
-
109
- Args:
110
- filepath (path-like object or file-like object):
111
- Source of audio data.
112
- format (str or None, optional):
113
- Not used. PySoundFile does not accept format hint.
114
-
115
- Returns:
116
- AudioMetaData: meta data of the given audio.
117
-
118
- """
119
- sinfo = soundfile.info(filepath)
120
- return AudioMetaData(
121
- sinfo.samplerate,
122
- sinfo.frames,
123
- sinfo.channels,
124
- bits_per_sample=_get_bit_depth(sinfo.subtype),
125
- encoding=_get_encoding(sinfo.format, sinfo.subtype),
126
- )
127
-
128
-
129
- _SUBTYPE2DTYPE = {
130
- "PCM_S8": "int8",
131
- "PCM_U8": "uint8",
132
- "PCM_16": "int16",
133
- "PCM_32": "int32",
134
- "FLOAT": "float32",
135
- "DOUBLE": "float64",
136
- }
137
-
138
-
139
- @_requires_soundfile
140
- def load(
141
- filepath: str,
142
- frame_offset: int = 0,
143
- num_frames: int = -1,
144
- normalize: bool = True,
145
- channels_first: bool = True,
146
- format: Optional[str] = None,
147
- ) -> Tuple[torch.Tensor, int]:
148
- """Load audio data from file.
149
-
150
- Note:
151
- The formats this function can handle depend on the soundfile installation.
152
- This function is tested on the following formats;
153
-
154
- * WAV
155
-
156
- * 32-bit floating-point
157
- * 32-bit signed integer
158
- * 16-bit signed integer
159
- * 8-bit unsigned integer
160
-
161
- * FLAC
162
- * OGG/VORBIS
163
- * SPHERE
164
-
165
- By default (``normalize=True``, ``channels_first=True``), this function returns Tensor with
166
- ``float32`` dtype, and the shape of `[channel, time]`.
167
-
168
- .. warning::
169
-
170
- ``normalize`` argument does not perform volume normalization.
171
- It only converts the sample type to `torch.float32` from the native sample
172
- type.
173
-
174
- When the input format is WAV with integer type, such as 32-bit signed integer, 16-bit
175
- signed integer, 24-bit signed integer, and 8-bit unsigned integer, by providing ``normalize=False``,
176
- this function can return integer Tensor, where the samples are expressed within the whole range
177
- of the corresponding dtype, that is, ``int32`` tensor for 32-bit signed PCM,
178
- ``int16`` for 16-bit signed PCM and ``uint8`` for 8-bit unsigned PCM. Since torch does not
179
- support ``int24`` dtype, 24-bit signed PCM are converted to ``int32`` tensors.
180
-
181
- ``normalize`` argument has no effect on 32-bit floating-point WAV and other formats, such as
182
- ``flac`` and ``mp3``.
183
-
184
- For these formats, this function always returns ``float32`` Tensor with values.
185
-
186
- Note:
187
- ``filepath`` argument is intentionally annotated as ``str`` only, even though it accepts
188
- ``pathlib.Path`` object as well. This is for the consistency with ``"sox_io"`` backend,
189
- which has a restriction on type annotation due to TorchScript compiler compatibility.
190
-
191
- Args:
192
- filepath (path-like object or file-like object):
193
- Source of audio data.
194
- frame_offset (int, optional):
195
- Number of frames to skip before start reading data.
196
- num_frames (int, optional):
197
- Maximum number of frames to read. ``-1`` reads all the remaining samples,
198
- starting from ``frame_offset``.
199
- This function may return fewer frames if there are not enough
199
- frames in the given file.
201
- normalize (bool, optional):
202
- When ``True``, this function converts the native sample type to ``float32``.
203
- Default: ``True``.
204
-
205
- If input file is integer WAV, giving ``False`` will change the resulting Tensor type to
206
- integer type.
207
- This argument has no effect for formats other than integer WAV type.
208
-
209
- channels_first (bool, optional):
210
- When True, the returned Tensor has dimension `[channel, time]`.
211
- Otherwise, the returned Tensor's dimension is `[time, channel]`.
212
- format (str or None, optional):
213
- Not used. PySoundFile does not accept format hint.
214
-
215
- Returns:
216
- (torch.Tensor, int): Resulting Tensor and sample rate.
217
- If the input file has integer wav format and normalization is off, then it has
218
- integer type, else ``float32`` type. If ``channels_first=True``, it has
219
- `[channel, time]` else `[time, channel]`.
220
- """
221
- with soundfile.SoundFile(filepath, "r") as file_:
222
- if file_.format != "WAV" or normalize:
223
- dtype = "float32"
224
- elif file_.subtype not in _SUBTYPE2DTYPE:
225
- raise ValueError(f"Unsupported subtype: {file_.subtype}")
226
- else:
227
- dtype = _SUBTYPE2DTYPE[file_.subtype]
228
-
229
- frames = file_._prepare_read(frame_offset, None, num_frames)
230
- waveform = file_.read(frames, dtype, always_2d=True)
231
- sample_rate = file_.samplerate
232
-
233
- waveform = torch.from_numpy(waveform)
234
- if channels_first:
235
- waveform = waveform.t()
236
- return waveform, sample_rate
237
-
238
-
239
- def _get_subtype_for_wav(dtype: torch.dtype, encoding: str, bits_per_sample: int):
240
- if not encoding:
241
- if not bits_per_sample:
242
- subtype = {
243
- torch.uint8: "PCM_U8",
244
- torch.int16: "PCM_16",
245
- torch.int32: "PCM_32",
246
- torch.float32: "FLOAT",
247
- torch.float64: "DOUBLE",
248
- }.get(dtype)
249
- if not subtype:
250
- raise ValueError(f"Unsupported dtype for wav: {dtype}")
251
- return subtype
252
- if bits_per_sample == 8:
253
- return "PCM_U8"
254
- return f"PCM_{bits_per_sample}"
255
- if encoding == "PCM_S":
256
- if not bits_per_sample:
257
- return "PCM_32"
258
- if bits_per_sample == 8:
259
- raise ValueError("wav does not support 8-bit signed PCM encoding.")
260
- return f"PCM_{bits_per_sample}"
261
- if encoding == "PCM_U":
262
- if bits_per_sample in (None, 8):
263
- return "PCM_U8"
264
- raise ValueError("wav only supports 8-bit unsigned PCM encoding.")
265
- if encoding == "PCM_F":
266
- if bits_per_sample in (None, 32):
267
- return "FLOAT"
268
- if bits_per_sample == 64:
269
- return "DOUBLE"
270
- raise ValueError("wav only supports 32/64-bit float PCM encoding.")
271
- if encoding == "ULAW":
272
- if bits_per_sample in (None, 8):
273
- return "ULAW"
274
- raise ValueError("wav only supports 8-bit mu-law encoding.")
275
- if encoding == "ALAW":
276
- if bits_per_sample in (None, 8):
277
- return "ALAW"
278
- raise ValueError("wav only supports 8-bit a-law encoding.")
279
- raise ValueError(f"wav does not support {encoding}.")
280
-
281
-
282
- def _get_subtype_for_sphere(encoding: str, bits_per_sample: int):
283
- if encoding in (None, "PCM_S"):
284
- return f"PCM_{bits_per_sample}" if bits_per_sample else "PCM_32"
285
- if encoding in ("PCM_U", "PCM_F"):
286
- raise ValueError(f"sph does not support {encoding} encoding.")
287
- if encoding == "ULAW":
288
- if bits_per_sample in (None, 8):
289
- return "ULAW"
290
- raise ValueError("sph only supports 8-bit for mu-law encoding.")
291
- if encoding == "ALAW":
292
- return "ALAW"
293
- raise ValueError(f"sph does not support {encoding}.")
294
-
295
-
296
- def _get_subtype(dtype: torch.dtype, format: str, encoding: str, bits_per_sample: int):
297
- if format == "wav":
298
- return _get_subtype_for_wav(dtype, encoding, bits_per_sample)
299
- if format == "flac":
300
- if encoding:
301
- raise ValueError("flac does not support encoding.")
302
- if not bits_per_sample:
303
- return "PCM_16"
304
- if bits_per_sample > 24:
305
- raise ValueError("flac does not support bits_per_sample > 24.")
306
- return "PCM_S8" if bits_per_sample == 8 else f"PCM_{bits_per_sample}"
307
- if format in ("ogg", "vorbis"):
308
- if bits_per_sample:
309
- raise ValueError("ogg/vorbis does not support bits_per_sample.")
310
- if encoding is None or encoding == "vorbis":
311
- return "VORBIS"
312
- if encoding == "opus":
313
- return "OPUS"
314
- raise ValueError(f"Unexpected encoding: {encoding}")
315
- if format == "mp3":
316
- return "MPEG_LAYER_III"
317
- if format == "sph":
318
- return _get_subtype_for_sphere(encoding, bits_per_sample)
319
- if format in ("nis", "nist"):
320
- return "PCM_16"
321
- raise ValueError(f"Unsupported format: {format}")
322
-
323
-
324
- @_requires_soundfile
325
- def save(
326
- filepath: str,
327
- src: torch.Tensor,
328
- sample_rate: int,
329
- channels_first: bool = True,
330
- compression: Optional[float] = None,
331
- format: Optional[str] = None,
332
- encoding: Optional[str] = None,
333
- bits_per_sample: Optional[int] = None,
334
- ):
335
- """Save audio data to file.
336
-
337
- Note:
338
- The formats this function can handle depend on the soundfile installation.
339
- This function is tested on the following formats;
340
-
341
- * WAV
342
-
343
- * 32-bit floating-point
344
- * 32-bit signed integer
345
- * 16-bit signed integer
346
- * 8-bit unsigned integer
347
-
348
- * FLAC
349
- * OGG/VORBIS
350
- * SPHERE
351
-
352
- Note:
353
- ``filepath`` argument is intentionally annotated as ``str`` only, even though it accepts
354
- ``pathlib.Path`` object as well. This is for the consistency with ``"sox_io"`` backend,
355
- which has a restriction on type annotation due to TorchScript compiler compatibility.
356
-
357
- Args:
358
- filepath (str or pathlib.Path): Path to audio file.
359
- src (torch.Tensor): Audio data to save. must be 2D tensor.
360
- sample_rate (int): sampling rate
361
- channels_first (bool, optional): If ``True``, the given tensor is interpreted as `[channel, time]`,
362
- otherwise `[time, channel]`.
363
- compression (float or None, optional): Not used.
364
- It is here only for interface compatibility reason with "sox_io" backend.
365
- format (str or None, optional): Override the audio format.
366
- When ``filepath`` argument is path-like object, audio format is
367
- inferred from file extension. If the file extension is missing or
368
- different, you can specify the correct format with this argument.
369
-
370
- When ``filepath`` argument is file-like object,
371
- this argument is required.
372
-
373
- Valid values are ``"wav"``, ``"ogg"``, ``"vorbis"``,
374
- ``"flac"`` and ``"sph"``.
375
- encoding (str or None, optional): Changes the encoding for supported formats.
376
- This argument is effective only for supported formats, such as
377
- ``"wav"``, ``"flac"`` and ``"sph"``. Valid values are;
378
-
379
- - ``"PCM_S"`` (signed integer Linear PCM)
380
- - ``"PCM_U"`` (unsigned integer Linear PCM)
381
- - ``"PCM_F"`` (floating point PCM)
382
- - ``"ULAW"`` (mu-law)
383
- - ``"ALAW"`` (a-law)
384
-
385
- bits_per_sample (int or None, optional): Changes the bit depth for the
386
- supported formats.
387
- When ``format`` is one of ``"wav"``, ``"flac"`` or ``"sph"``,
388
- you can change the bit depth.
389
- Valid values are ``8``, ``16``, ``24``, ``32`` and ``64``.
390
-
391
- Supported formats/encodings/bit depth/compression are:
392
-
393
- ``"wav"``
394
- - 32-bit floating-point PCM
395
- - 32-bit signed integer PCM
396
- - 24-bit signed integer PCM
397
- - 16-bit signed integer PCM
398
- - 8-bit unsigned integer PCM
399
- - 8-bit mu-law
400
- - 8-bit a-law
401
-
402
- Note:
403
- Default encoding/bit depth is determined by the dtype of
404
- the input Tensor.
405
-
406
- ``"flac"``
407
- - 8-bit
408
- - 16-bit (default)
409
- - 24-bit
410
-
411
- ``"ogg"``, ``"vorbis"``
412
- - Doesn't accept changing configuration.
413
-
414
- ``"sph"``
415
- - 8-bit signed integer PCM
416
- - 16-bit signed integer PCM
417
- - 24-bit signed integer PCM
418
- - 32-bit signed integer PCM (default)
419
- - 8-bit mu-law
420
- - 8-bit a-law
421
- - 16-bit a-law
422
- - 24-bit a-law
423
- - 32-bit a-law
424
-
425
- """
426
- if src.ndim != 2:
427
- raise ValueError(f"Expected 2D Tensor, got {src.ndim}D.")
428
- if compression is not None:
429
- warnings.warn(
430
- '`save` function of "soundfile" backend does not support "compression" parameter. '
431
- "The argument is silently ignored."
432
- )
433
- if hasattr(filepath, "write"):
434
- if format is None:
435
- raise RuntimeError("`format` is required when saving to file object.")
436
- ext = format.lower()
437
- else:
438
- ext = str(filepath).split(".")[-1].lower()
439
-
440
- if bits_per_sample not in (None, 8, 16, 24, 32, 64):
441
- raise ValueError("Invalid bits_per_sample.")
442
- if bits_per_sample == 24:
443
- warnings.warn(
444
- "Saving audio with 24 bits per sample might warp samples near -1. "
445
- "Using 16 bits per sample might be able to avoid this."
446
- )
447
- subtype = _get_subtype(src.dtype, ext, encoding, bits_per_sample)
448
-
449
- # sph is an extension used in TED-LIUM but soundfile does not recognize it as NIST format,
450
- # so we extend the extensions manually here
451
- if ext in ["nis", "nist", "sph"] and format is None:
452
- format = "NIST"
453
-
454
- if channels_first:
455
- src = src.t()
456
-
457
- soundfile.write(file=filepath, data=src, samplerate=sample_rate, subtype=subtype, format=format)
@@ -1,91 +0,0 @@
1
- import os
2
- from typing import BinaryIO, Optional, Tuple, Union
3
-
4
- import torch
5
- import torchaudio
6
-
7
- from .backend import Backend
8
- from .common import AudioMetaData
9
-
10
- sox_ext = torchaudio._extension.lazy_import_sox_ext()
11
-
12
-
13
- class SoXBackend(Backend):
14
- @staticmethod
15
- def info(uri: Union[BinaryIO, str, os.PathLike], format: Optional[str], buffer_size: int = 4096) -> AudioMetaData:
16
- if hasattr(uri, "read"):
17
- raise ValueError(
18
- "SoX backend does not support reading from file-like objects. ",
19
- "Please use an alternative backend that does support reading from file-like objects, e.g. FFmpeg.",
20
- )
21
- else:
22
- sinfo = sox_ext.get_info(uri, format)
23
- if sinfo:
24
- return AudioMetaData(*sinfo)
25
- else:
26
- raise RuntimeError(f"Failed to fetch metadata for {uri}.")
27
-
28
- @staticmethod
29
- def load(
30
- uri: Union[BinaryIO, str, os.PathLike],
31
- frame_offset: int = 0,
32
- num_frames: int = -1,
33
- normalize: bool = True,
34
- channels_first: bool = True,
35
- format: Optional[str] = None,
36
- buffer_size: int = 4096,
37
- ) -> Tuple[torch.Tensor, int]:
38
- if hasattr(uri, "read"):
39
- raise ValueError(
40
- "SoX backend does not support loading from file-like objects. ",
41
- "Please use an alternative backend that does support loading from file-like objects, e.g. FFmpeg.",
42
- )
43
- else:
44
- ret = sox_ext.load_audio_file(str(uri), frame_offset, num_frames, normalize, channels_first, format)
45
- if not ret:
46
- raise RuntimeError(f"Failed to load audio from {uri}.")
47
- return ret
48
-
49
- @staticmethod
50
- def save(
51
- uri: Union[BinaryIO, str, os.PathLike],
52
- src: torch.Tensor,
53
- sample_rate: int,
54
- channels_first: bool = True,
55
- format: Optional[str] = None,
56
- encoding: Optional[str] = None,
57
- bits_per_sample: Optional[int] = None,
58
- buffer_size: int = 4096,
59
- compression: Optional[Union[torchaudio.io.CodecConfig, float, int]] = None,
60
- ) -> None:
61
- if not isinstance(compression, (float, int, type(None))):
62
- raise ValueError(
63
- "SoX backend expects non-`None` value for argument `compression` to be of ",
64
- f"type `float` or `int`, but received value of type {type(compression)}",
65
- )
66
- if hasattr(uri, "write"):
67
- raise ValueError(
68
- "SoX backend does not support writing to file-like objects. ",
69
- "Please use an alternative backend that does support writing to file-like objects, e.g. FFmpeg.",
70
- )
71
- else:
72
- sox_ext.save_audio_file(
73
- str(uri),
74
- src,
75
- sample_rate,
76
- channels_first,
77
- compression,
78
- format,
79
- encoding,
80
- bits_per_sample,
81
- )
82
-
83
- @staticmethod
84
- def can_decode(uri: Union[BinaryIO, str, os.PathLike], format: Optional[str]) -> bool:
85
- # i.e. not a file-like object.
86
- return not hasattr(uri, "read")
87
-
88
- @staticmethod
89
- def can_encode(uri: Union[BinaryIO, str, os.PathLike], format: Optional[str]) -> bool:
90
- # i.e. not a file-like object.
91
- return not hasattr(uri, "write")