torchaudio 2.0.2__cp311-cp311-manylinux2014_aarch64.whl → 2.1.1__cp311-cp311-manylinux2014_aarch64.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release.


This version of torchaudio might be problematic. Click here for more details.

Files changed (90) hide show
  1. torchaudio/__init__.py +22 -3
  2. torchaudio/_backend/__init__.py +55 -4
  3. torchaudio/_backend/backend.py +53 -0
  4. torchaudio/_backend/common.py +52 -0
  5. torchaudio/_backend/ffmpeg.py +373 -0
  6. torchaudio/_backend/soundfile.py +54 -0
  7. torchaudio/_backend/soundfile_backend.py +457 -0
  8. torchaudio/_backend/sox.py +91 -0
  9. torchaudio/_backend/utils.py +81 -323
  10. torchaudio/_extension/__init__.py +55 -36
  11. torchaudio/_extension/utils.py +109 -17
  12. torchaudio/_internal/__init__.py +4 -1
  13. torchaudio/_internal/module_utils.py +37 -6
  14. torchaudio/backend/__init__.py +7 -11
  15. torchaudio/backend/_no_backend.py +24 -0
  16. torchaudio/backend/_sox_io_backend.py +297 -0
  17. torchaudio/backend/common.py +12 -52
  18. torchaudio/backend/no_backend.py +11 -21
  19. torchaudio/backend/soundfile_backend.py +11 -448
  20. torchaudio/backend/sox_io_backend.py +11 -435
  21. torchaudio/backend/utils.py +9 -18
  22. torchaudio/datasets/__init__.py +2 -0
  23. torchaudio/datasets/cmuarctic.py +1 -1
  24. torchaudio/datasets/cmudict.py +61 -62
  25. torchaudio/datasets/dr_vctk.py +1 -1
  26. torchaudio/datasets/gtzan.py +1 -1
  27. torchaudio/datasets/librilight_limited.py +1 -1
  28. torchaudio/datasets/librispeech.py +1 -1
  29. torchaudio/datasets/librispeech_biasing.py +189 -0
  30. torchaudio/datasets/libritts.py +1 -1
  31. torchaudio/datasets/ljspeech.py +1 -1
  32. torchaudio/datasets/musdb_hq.py +1 -1
  33. torchaudio/datasets/quesst14.py +1 -1
  34. torchaudio/datasets/speechcommands.py +1 -1
  35. torchaudio/datasets/tedlium.py +1 -1
  36. torchaudio/datasets/vctk.py +1 -1
  37. torchaudio/datasets/voxceleb1.py +1 -1
  38. torchaudio/datasets/yesno.py +1 -1
  39. torchaudio/functional/__init__.py +6 -2
  40. torchaudio/functional/_alignment.py +128 -0
  41. torchaudio/functional/filtering.py +69 -92
  42. torchaudio/functional/functional.py +99 -148
  43. torchaudio/io/__init__.py +4 -1
  44. torchaudio/io/_effector.py +347 -0
  45. torchaudio/io/_stream_reader.py +158 -90
  46. torchaudio/io/_stream_writer.py +196 -10
  47. torchaudio/lib/_torchaudio.so +0 -0
  48. torchaudio/lib/_torchaudio_ffmpeg4.so +0 -0
  49. torchaudio/lib/_torchaudio_ffmpeg5.so +0 -0
  50. torchaudio/lib/_torchaudio_ffmpeg6.so +0 -0
  51. torchaudio/lib/_torchaudio_sox.so +0 -0
  52. torchaudio/lib/libtorchaudio.so +0 -0
  53. torchaudio/lib/libtorchaudio_ffmpeg4.so +0 -0
  54. torchaudio/lib/libtorchaudio_ffmpeg5.so +0 -0
  55. torchaudio/lib/libtorchaudio_ffmpeg6.so +0 -0
  56. torchaudio/lib/libtorchaudio_sox.so +0 -0
  57. torchaudio/models/__init__.py +14 -0
  58. torchaudio/models/decoder/__init__.py +22 -7
  59. torchaudio/models/decoder/_ctc_decoder.py +123 -69
  60. torchaudio/models/decoder/_cuda_ctc_decoder.py +187 -0
  61. torchaudio/models/rnnt_decoder.py +10 -14
  62. torchaudio/models/squim/__init__.py +11 -0
  63. torchaudio/models/squim/objective.py +326 -0
  64. torchaudio/models/squim/subjective.py +150 -0
  65. torchaudio/models/wav2vec2/components.py +6 -10
  66. torchaudio/pipelines/__init__.py +9 -0
  67. torchaudio/pipelines/_squim_pipeline.py +176 -0
  68. torchaudio/pipelines/_wav2vec2/aligner.py +87 -0
  69. torchaudio/pipelines/_wav2vec2/impl.py +198 -68
  70. torchaudio/pipelines/_wav2vec2/utils.py +120 -0
  71. torchaudio/sox_effects/sox_effects.py +7 -30
  72. torchaudio/transforms/__init__.py +2 -0
  73. torchaudio/transforms/_transforms.py +99 -54
  74. torchaudio/utils/download.py +2 -2
  75. torchaudio/utils/ffmpeg_utils.py +20 -15
  76. torchaudio/utils/sox_utils.py +8 -9
  77. torchaudio/version.py +2 -2
  78. torchaudio-2.1.1.dist-info/METADATA +113 -0
  79. torchaudio-2.1.1.dist-info/RECORD +117 -0
  80. {torchaudio-2.0.2.dist-info → torchaudio-2.1.1.dist-info}/WHEEL +1 -1
  81. torchaudio/io/_compat.py +0 -241
  82. torchaudio/lib/_torchaudio_ffmpeg.so +0 -0
  83. torchaudio/lib/flashlight_lib_text_decoder.so +0 -0
  84. torchaudio/lib/flashlight_lib_text_dictionary.so +0 -0
  85. torchaudio/lib/libflashlight-text.so +0 -0
  86. torchaudio/lib/libtorchaudio_ffmpeg.so +0 -0
  87. torchaudio-2.0.2.dist-info/METADATA +0 -30
  88. torchaudio-2.0.2.dist-info/RECORD +0 -100
  89. {torchaudio-2.0.2.dist-info → torchaudio-2.1.1.dist-info}/LICENSE +0 -0
  90. {torchaudio-2.0.2.dist-info → torchaudio-2.1.1.dist-info}/top_level.txt +0 -0
@@ -1,268 +1,25 @@
1
1
  import os
2
- import re
3
- from abc import ABC, abstractmethod
4
2
  from functools import lru_cache
5
- from typing import BinaryIO, Dict, Optional, Tuple, Union
3
+ from typing import BinaryIO, Dict, Optional, Tuple, Type, Union
6
4
 
7
5
  import torch
8
- import torchaudio.backend.soundfile_backend as soundfile_backend
9
- from torchaudio._extension import _FFMPEG_INITIALIZED, _SOX_INITIALIZED
10
- from torchaudio.backend.common import AudioMetaData
11
6
 
12
- if _FFMPEG_INITIALIZED:
13
- from torchaudio.io._compat import info_audio, info_audio_fileobj, load_audio, load_audio_fileobj, save_audio
7
+ from torchaudio._extension import _FFMPEG_EXT, _SOX_INITIALIZED
8
+ from torchaudio.io import CodecConfig
14
9
 
10
+ from . import soundfile_backend
15
11
 
16
- class Backend(ABC):
17
- @staticmethod
18
- @abstractmethod
19
- def info(uri: Union[BinaryIO, str, os.PathLike], format: Optional[str], buffer_size: int = 4096) -> AudioMetaData:
20
- raise NotImplementedError
21
-
22
- @staticmethod
23
- @abstractmethod
24
- def load(
25
- uri: Union[BinaryIO, str, os.PathLike],
26
- frame_offset: int = 0,
27
- num_frames: int = -1,
28
- normalize: bool = True,
29
- channels_first: bool = True,
30
- format: Optional[str] = None,
31
- buffer_size: int = 4096,
32
- ) -> Tuple[torch.Tensor, int]:
33
- raise NotImplementedError
34
-
35
- @staticmethod
36
- @abstractmethod
37
- def save(
38
- uri: Union[BinaryIO, str, os.PathLike],
39
- src: torch.Tensor,
40
- sample_rate: int,
41
- channels_first: bool = True,
42
- format: Optional[str] = None,
43
- encoding: Optional[str] = None,
44
- bits_per_sample: Optional[int] = None,
45
- buffer_size: int = 4096,
46
- ) -> None:
47
- raise NotImplementedError
48
-
49
- @staticmethod
50
- @abstractmethod
51
- def can_decode(uri: Union[BinaryIO, str, os.PathLike], format: Optional[str]) -> bool:
52
- raise NotImplementedError
53
-
54
- @staticmethod
55
- @abstractmethod
56
- def can_encode(uri: Union[BinaryIO, str, os.PathLike], format: Optional[str]) -> bool:
57
- raise NotImplementedError
58
-
59
-
60
- def _map_encoding(encoding: str) -> str:
61
- for dst in ["PCM_S", "PCM_U", "PCM_F"]:
62
- if dst in encoding:
63
- return dst
64
- if encoding == "PCM_MULAW":
65
- return "ULAW"
66
- elif encoding == "PCM_ALAW":
67
- return "ALAW"
68
- return encoding
69
-
70
-
71
- def _get_bits_per_sample(encoding: str, bits_per_sample: int) -> str:
72
- if m := re.search(r"PCM_\w(\d+)\w*", encoding):
73
- return int(m.group(1))
74
- elif encoding in ["PCM_ALAW", "PCM_MULAW"]:
75
- return 8
76
- return bits_per_sample
77
-
78
-
79
- class FFmpegBackend(Backend):
80
- @staticmethod
81
- def info(uri: Union[BinaryIO, str, os.PathLike], format: Optional[str], buffer_size: int = 4096) -> AudioMetaData:
82
- if hasattr(uri, "read"):
83
- metadata = info_audio_fileobj(uri, format, buffer_size=buffer_size)
84
- else:
85
- metadata = info_audio(os.path.normpath(uri), format)
86
- metadata.bits_per_sample = _get_bits_per_sample(metadata.encoding, metadata.bits_per_sample)
87
- metadata.encoding = _map_encoding(metadata.encoding)
88
- return metadata
89
-
90
- @staticmethod
91
- def load(
92
- uri: Union[BinaryIO, str, os.PathLike],
93
- frame_offset: int = 0,
94
- num_frames: int = -1,
95
- normalize: bool = True,
96
- channels_first: bool = True,
97
- format: Optional[str] = None,
98
- buffer_size: int = 4096,
99
- ) -> Tuple[torch.Tensor, int]:
100
- if hasattr(uri, "read"):
101
- return load_audio_fileobj(
102
- uri,
103
- frame_offset,
104
- num_frames,
105
- normalize,
106
- channels_first,
107
- format,
108
- buffer_size,
109
- )
110
- else:
111
- return load_audio(os.path.normpath(uri), frame_offset, num_frames, normalize, channels_first, format)
112
-
113
- @staticmethod
114
- def save(
115
- uri: Union[BinaryIO, str, os.PathLike],
116
- src: torch.Tensor,
117
- sample_rate: int,
118
- channels_first: bool = True,
119
- format: Optional[str] = None,
120
- encoding: Optional[str] = None,
121
- bits_per_sample: Optional[int] = None,
122
- buffer_size: int = 4096,
123
- ) -> None:
124
- save_audio(
125
- uri,
126
- src,
127
- sample_rate,
128
- channels_first,
129
- format,
130
- encoding,
131
- bits_per_sample,
132
- buffer_size,
133
- )
134
-
135
- @staticmethod
136
- def can_decode(uri: Union[BinaryIO, str, os.PathLike], format: Optional[str]) -> bool:
137
- return True
138
-
139
- @staticmethod
140
- def can_encode(uri: Union[BinaryIO, str, os.PathLike], format: Optional[str]) -> bool:
141
- return True
142
-
143
-
144
- class SoXBackend(Backend):
145
- @staticmethod
146
- def info(uri: Union[BinaryIO, str, os.PathLike], format: Optional[str], buffer_size: int = 4096) -> AudioMetaData:
147
- if hasattr(uri, "read"):
148
- raise ValueError(
149
- "SoX backend does not support reading from file-like objects. ",
150
- "Please use an alternative backend that does support reading from file-like objects, e.g. FFmpeg.",
151
- )
152
- else:
153
- sinfo = torch.ops.torchaudio.sox_io_get_info(uri, format)
154
- if sinfo:
155
- return AudioMetaData(*sinfo)
156
- else:
157
- raise RuntimeError(f"Failed to fetch metadata for {uri}.")
158
-
159
- @staticmethod
160
- def load(
161
- uri: Union[BinaryIO, str, os.PathLike],
162
- frame_offset: int = 0,
163
- num_frames: int = -1,
164
- normalize: bool = True,
165
- channels_first: bool = True,
166
- format: Optional[str] = None,
167
- buffer_size: int = 4096,
168
- ) -> Tuple[torch.Tensor, int]:
169
- if hasattr(uri, "read"):
170
- raise ValueError(
171
- "SoX backend does not support loading from file-like objects. ",
172
- "Please use an alternative backend that does support loading from file-like objects, e.g. FFmpeg.",
173
- )
174
- else:
175
- ret = torch.ops.torchaudio.sox_io_load_audio_file(
176
- uri, frame_offset, num_frames, normalize, channels_first, format
177
- )
178
- if not ret:
179
- raise RuntimeError(f"Failed to load audio from {uri}.")
180
- return ret
181
-
182
- @staticmethod
183
- def save(
184
- uri: Union[BinaryIO, str, os.PathLike],
185
- src: torch.Tensor,
186
- sample_rate: int,
187
- channels_first: bool = True,
188
- format: Optional[str] = None,
189
- encoding: Optional[str] = None,
190
- bits_per_sample: Optional[int] = None,
191
- buffer_size: int = 4096,
192
- ) -> None:
193
- if hasattr(uri, "write"):
194
- raise ValueError(
195
- "SoX backend does not support writing to file-like objects. ",
196
- "Please use an alternative backend that does support writing to file-like objects, e.g. FFmpeg.",
197
- )
198
- else:
199
- torch.ops.torchaudio.sox_io_save_audio_file(
200
- uri,
201
- src,
202
- sample_rate,
203
- channels_first,
204
- None,
205
- format,
206
- encoding,
207
- bits_per_sample,
208
- )
209
-
210
- @staticmethod
211
- def can_decode(uri: Union[BinaryIO, str, os.PathLike], format: Optional[str]) -> bool:
212
- # i.e. not a file-like object.
213
- return not hasattr(uri, "read")
214
-
215
- @staticmethod
216
- def can_encode(uri: Union[BinaryIO, str, os.PathLike], format: Optional[str]) -> bool:
217
- # i.e. not a file-like object.
218
- return not hasattr(uri, "write")
219
-
220
-
221
- class SoundfileBackend(Backend):
222
- @abstractmethod
223
- def info(uri: Union[BinaryIO, str, os.PathLike], format: Optional[str], buffer_size: int = 4096) -> AudioMetaData:
224
- return soundfile_backend.info(uri, format)
225
-
226
- @abstractmethod
227
- def load(
228
- uri: Union[BinaryIO, str, os.PathLike],
229
- frame_offset: int = 0,
230
- num_frames: int = -1,
231
- normalize: bool = True,
232
- channels_first: bool = True,
233
- format: Optional[str] = None,
234
- buffer_size: int = 4096,
235
- ) -> Tuple[torch.Tensor, int]:
236
- return soundfile_backend.load(uri, frame_offset, num_frames, normalize, channels_first, format)
237
-
238
- @abstractmethod
239
- def save(
240
- uri: Union[BinaryIO, str, os.PathLike],
241
- src: torch.Tensor,
242
- sample_rate: int,
243
- channels_first: bool = True,
244
- format: Optional[str] = None,
245
- encoding: Optional[str] = None,
246
- bits_per_sample: Optional[int] = None,
247
- buffer_size: int = 4096,
248
- ) -> None:
249
- soundfile_backend.save(
250
- uri, src, sample_rate, channels_first, format=format, encoding=encoding, bits_per_sample=bits_per_sample
251
- )
252
-
253
- @abstractmethod
254
- def can_decode(uri, format) -> bool:
255
- return True
256
-
257
- @abstractmethod
258
- def can_encode(uri, format) -> bool:
259
- return True
12
+ from .backend import Backend
13
+ from .common import AudioMetaData
14
+ from .ffmpeg import FFmpegBackend
15
+ from .soundfile import SoundfileBackend
16
+ from .sox import SoXBackend
260
17
 
261
18
 
262
19
  @lru_cache(None)
263
- def get_available_backends() -> Dict[str, Backend]:
264
- backend_specs = {}
265
- if _FFMPEG_INITIALIZED:
20
+ def get_available_backends() -> Dict[str, Type[Backend]]:
21
+ backend_specs: Dict[str, Type[Backend]] = {}
22
+ if _FFMPEG_EXT is not None:
266
23
  backend_specs["ffmpeg"] = FFmpegBackend
267
24
  if _SOX_INITIALIZED:
268
25
  backend_specs["sox"] = SoXBackend
@@ -303,19 +60,19 @@ def get_info_func():
303
60
  ) -> AudioMetaData:
304
61
  """Get signal information of an audio file.
305
62
 
63
+ Note:
64
+ When the input type is file-like object, this function cannot
65
+ get the correct length (``num_samples``) for certain formats,
66
+ such as ``vorbis``.
67
+ In this case, the value of ``num_samples`` is ``0``.
68
+
306
69
  Args:
307
70
  uri (path-like object or file-like object):
308
71
  Source of audio data. The following types are accepted:
309
72
 
310
- * ``path-like``: file path
311
- * ``file-like``: Object with ``read(size: int) -> bytes`` method,
312
- which returns byte string of at most ``size`` length.
313
-
314
- Note:
315
- When the input type is file-like object, this function cannot
316
- get the correct length (``num_samples``) for certain formats,
317
- such as ``vorbis``.
318
- In this case, the value of ``num_samples`` is ``0``.
73
+ * ``path-like``: File path or URL.
74
+ * ``file-like``: Object with ``read(size: int) -> bytes`` method,
75
+ which returns byte string of at most ``size`` length.
319
76
 
320
77
  format (str or None, optional):
321
78
  If not ``None``, interpreted as hint that may allow backend to override the detected format.
@@ -325,12 +82,17 @@ def get_info_func():
325
82
  Size of buffer to use when processing file-like objects, in bytes. (Default: ``4096``)
326
83
 
327
84
  backend (str or None, optional):
328
- I/O backend to use. If ``None``, function selects backend given input and available backends.
329
- Otherwise, must be one of ["ffmpeg", "sox", "soundfile"], with the corresponding backend available.
85
+ I/O backend to use.
86
+ If ``None``, function selects backend given input and available backends.
87
+ Otherwise, must be one of [``"ffmpeg"``, ``"sox"``, ``"soundfile"``],
88
+ with the corresponding backend available.
330
89
  (Default: ``None``)
331
90
 
91
+ .. seealso::
92
+ :ref:`backend`
93
+
332
94
  Returns:
333
- AudioMetaData: Metadata of the given audio.
95
+ AudioMetaData
334
96
  """
335
97
  backend = dispatcher(uri, format, backend)
336
98
  return backend.info(uri, format, buffer_size)
@@ -362,27 +124,19 @@ def get_load_func():
362
124
  buffer_size: int = 4096,
363
125
  backend: Optional[str] = None,
364
126
  ) -> Tuple[torch.Tensor, int]:
365
- """Load audio data from file.
366
-
367
- Note:
368
- The formats this function can handle depend on backend availability.
369
- This function is tested on the following formats:
370
-
371
- * WAV
372
-
373
- * 32-bit floating-point
374
- * 32-bit signed integer
375
- * 24-bit signed integer
376
- * 16-bit signed integer
377
- * 8-bit unsigned integer
378
-
379
- * FLAC
380
- * OGG/VORBIS
381
- * SPHERE
127
+ """Load audio data from source.
382
128
 
383
129
  By default (``normalize=True``, ``channels_first=True``), this function returns Tensor with
384
130
  ``float32`` dtype, and the shape of `[channel, time]`.
385
131
 
132
+ Note:
133
+ The formats this function can handle depend on the availability of backends.
134
+ Please use the following functions to fetch the supported formats.
135
+
136
+ - FFmpeg: :py:func:`torchaudio.utils.ffmpeg_utils.get_audio_decoders`
137
+ - Sox: :py:func:`torchaudio.utils.sox_utils.list_read_formats`
138
+ - SoundFile: Refer to `the official document <https://pysoundfile.readthedocs.io/>`__.
139
+
386
140
  .. warning::
387
141
 
388
142
  ``normalize`` argument does not perform volume normalization.
@@ -432,9 +186,13 @@ def get_load_func():
432
186
  Size of buffer to use when processing file-like objects, in bytes. (Default: ``4096``)
433
187
 
434
188
  backend (str or None, optional):
435
- I/O backend to use. If ``None``, function selects backend given input and available backends.
436
- Otherwise, must be one of ["ffmpeg", "sox", "soundfile"], with the corresponding
437
- backend being available. (Default: ``None``)
189
+ I/O backend to use.
190
+ If ``None``, function selects backend given input and available backends.
191
+ Otherwise, must be one of [``"ffmpeg"``, ``"sox"``, ``"soundfile"``],
192
+ with the corresponding backend being available. (Default: ``None``)
193
+
194
+ .. seealso::
195
+ :ref:`backend`
438
196
 
439
197
  Returns:
440
198
  (torch.Tensor, int): Resulting Tensor and sample rate.
@@ -472,22 +230,17 @@ def get_save_func():
472
230
  bits_per_sample: Optional[int] = None,
473
231
  buffer_size: int = 4096,
474
232
  backend: Optional[str] = None,
233
+ compression: Optional[Union[CodecConfig, float, int]] = None,
475
234
  ):
476
235
  """Save audio data to file.
477
236
 
478
237
  Note:
479
238
  The formats this function can handle depend on the availability of backends.
480
- This function is tested on the following formats:
239
+ Please use the following functions to fetch the supported formats.
481
240
 
482
- * WAV
483
-
484
- * 32-bit floating-point
485
- * 32-bit signed integer
486
- * 16-bit signed integer
487
- * 8-bit unsigned integer
488
-
489
- * FLAC
490
- * OGG/VORBIS
241
+ - FFmpeg: :py:func:`torchaudio.utils.ffmpeg_utils.get_audio_encoders`
242
+ - Sox: :py:func:`torchaudio.utils.sox_utils.list_write_formats`
243
+ - SoundFile: Refer to `the official document <https://pysoundfile.readthedocs.io/>`__.
491
244
 
492
245
  Args:
493
246
  uri (str or pathlib.Path): Path to audio file.
@@ -508,11 +261,11 @@ def get_save_func():
508
261
  This argument is effective only for supported formats, i.e.
509
262
  ``"wav"`` and ``"flac"``. Valid values are
510
263
 
511
- - ``"PCM_S"`` (signed integer Linear PCM)
512
- - ``"PCM_U"`` (unsigned integer Linear PCM)
513
- - ``"PCM_F"`` (floating point PCM)
514
- - ``"ULAW"`` (mu-law)
515
- - ``"ALAW"`` (a-law)
264
+ - ``"PCM_S"`` (signed integer Linear PCM)
265
+ - ``"PCM_U"`` (unsigned integer Linear PCM)
266
+ - ``"PCM_F"`` (floating point PCM)
267
+ - ``"ULAW"`` (mu-law)
268
+ - ``"ALAW"`` (a-law)
516
269
 
517
270
  bits_per_sample (int or None, optional): Changes the bit depth for the
518
271
  supported formats.
@@ -524,35 +277,40 @@ def get_save_func():
524
277
  Size of buffer to use when processing file-like objects, in bytes. (Default: ``4096``)
525
278
 
526
279
  backend (str or None, optional):
527
- I/O backend to use. If ``None``, function selects backend given input and available backends.
528
- Otherwise, must be one of ["ffmpeg", "sox", "soundfile"], with the corresponding
529
- backend being available. (Default: ``None``)
280
+ I/O backend to use.
281
+ If ``None``, function selects backend given input and available backends.
282
+ Otherwise, must be one of [``"ffmpeg"``, ``"sox"``, ``"soundfile"``],
283
+ with the corresponding backend being available.
284
+ (Default: ``None``)
530
285
 
286
+ .. seealso::
287
+ :ref:`backend`
531
288
 
289
+ compression (CodecConfig, float, int, or None, optional):
290
+ Compression configuration to apply.
532
291
 
533
- Supported formats/encodings/bit depth/compression are:
292
+ If the selected backend is FFmpeg, an instance of :py:class:`CodecConfig` must be provided.
534
293
 
535
- ``"wav"``
536
- - 32-bit floating-point PCM
537
- - 32-bit signed integer PCM
538
- - 24-bit signed integer PCM
539
- - 16-bit signed integer PCM
540
- - 8-bit unsigned integer PCM
541
- - 8-bit mu-law
542
- - 8-bit a-law
294
+ Otherwise, if the selected backend is SoX, a float or int value corresponding to option ``-C`` of the
295
+ ``sox`` command line interface must be provided. For instance:
543
296
 
544
- Note:
545
- Default encoding/bit depth is determined by the dtype of
546
- the input Tensor.
297
+ ``"mp3"``
298
+ Either bitrate (in ``kbps``) with quality factor, such as ``128.2``, or
299
+ VBR encoding with quality factor such as ``-4.2``. Default: ``-4.5``.
547
300
 
548
- ``"flac"``
549
- - 16-bit (default)
550
- - 24-bit
301
+ ``"flac"``
302
+ Whole number from ``0`` to ``8``. ``8`` is default and highest compression.
303
+
304
+ ``"ogg"``, ``"vorbis"``
305
+ Number from ``-1`` to ``10``; ``-1`` is the highest compression
306
+ and lowest quality. Default: ``3``.
307
+
308
+ Refer to http://sox.sourceforge.net/soxformat.html for more details.
551
309
 
552
- ``"ogg"``
553
- - Doesn't accept changing configuration.
554
310
  """
555
311
  backend = dispatcher(uri, format, backend)
556
- return backend.save(uri, src, sample_rate, channels_first, format, encoding, bits_per_sample, buffer_size)
312
+ return backend.save(
313
+ uri, src, sample_rate, channels_first, format, encoding, bits_per_sample, buffer_size, compression
314
+ )
557
315
 
558
316
  return save
@@ -2,9 +2,13 @@ import logging
2
2
  import os
3
3
  import sys
4
4
 
5
- from torchaudio._internal.module_utils import fail_with_message, is_module_available, no_op
5
+ from torchaudio._internal.module_utils import eval_env, fail_with_message, is_module_available, no_op
6
6
 
7
- from .utils import _check_cuda_version, _fail_since_no_ffmpeg, _init_dll_path, _init_ffmpeg, _init_sox, _load_lib
7
+ try:
8
+ from .fb import _init_ffmpeg
9
+ except ImportError:
10
+ from .utils import _init_ffmpeg
11
+ from .utils import _check_cuda_version, _fail_since_no_ffmpeg, _fail_since_no_sox, _init_dll_path, _init_sox, _load_lib
8
12
 
9
13
  _LG = logging.getLogger(__name__)
10
14
 
@@ -14,15 +18,13 @@ _LG = logging.getLogger(__name__)
14
18
  # Builder uses it for debugging purpose, so we export it.
15
19
  # https://github.com/pytorch/builder/blob/e2e4542b8eb0bdf491214451a1a4128bd606cce2/test/smoke_test/smoke_test.py#L80
16
20
  __all__ = [
17
- "fail_if_no_kaldi",
18
21
  "fail_if_no_sox",
19
22
  "fail_if_no_ffmpeg",
20
23
  "_check_cuda_version",
21
24
  "_IS_TORCHAUDIO_EXT_AVAILABLE",
22
- "_IS_KALDI_AVAILABLE",
23
25
  "_IS_RIR_AVAILABLE",
24
26
  "_SOX_INITIALIZED",
25
- "_FFMPEG_INITIALIZED",
27
+ "_FFMPEG_EXT",
26
28
  ]
27
29
 
28
30
 
@@ -34,11 +36,11 @@ if os.name == "nt" and (3, 8) <= sys.version_info < (3, 9):
34
36
  # In case of an error, we do not catch the failure as it suggests there is something
35
37
  # wrong with the installation.
36
38
  _IS_TORCHAUDIO_EXT_AVAILABLE = is_module_available("torchaudio.lib._torchaudio")
37
- # Kaldi and RIR features are implemented in _torchaudio extension, but they can be individually
39
+ # RIR features are implemented in _torchaudio extension, but they can be individually
38
40
  # turned on/off at build time. Available means that _torchaudio is loaded properly, and
39
- # Kaldi or RIR features are found there.
41
+ # RIR features are found there.
40
42
  _IS_RIR_AVAILABLE = False
41
- _IS_KALDI_AVAILABLE = False
43
+ _IS_ALIGN_AVAILABLE = False
42
44
  if _IS_TORCHAUDIO_EXT_AVAILABLE:
43
45
  _load_lib("libtorchaudio")
44
46
 
@@ -46,26 +48,45 @@ if _IS_TORCHAUDIO_EXT_AVAILABLE:
46
48
 
47
49
  _check_cuda_version()
48
50
  _IS_RIR_AVAILABLE = torchaudio.lib._torchaudio.is_rir_available()
49
- _IS_KALDI_AVAILABLE = torchaudio.lib._torchaudio.is_kaldi_available()
51
+ _IS_ALIGN_AVAILABLE = torchaudio.lib._torchaudio.is_align_available()
50
52
 
51
53
 
52
- # Similar to libtorchaudio, sox-related features should be importable when present.
53
- #
54
- # Note: This will be changed in the future when sox is dynamically linked.
55
- # At that point, this initialization should handle the case where
56
- # sox integration is built but libsox is not found.
54
+ # Initialize libsox-related features
57
55
  _SOX_INITIALIZED = False
58
- if is_module_available("torchaudio.lib._torchaudio_sox"):
59
- _init_sox()
60
- _SOX_INITIALIZED = True
56
+ _USE_SOX = False if os.name == "nt" else eval_env("TORCHAUDIO_USE_SOX", True)
57
+ _SOX_MODULE_AVAILABLE = is_module_available("torchaudio.lib._torchaudio_sox")
58
+ if _USE_SOX and _SOX_MODULE_AVAILABLE:
59
+ try:
60
+ _init_sox()
61
+ _SOX_INITIALIZED = True
62
+ except Exception:
63
+ # The initialization of sox extension will fail if supported sox
64
+ # libraries are not found in the system.
65
+ # Since the rest of torchaudio works without it, we do not report the
66
+ # error here.
67
+ # The error will be raised when user code attempts to use these features.
68
+ _LG.debug("Failed to initialize sox extension", exc_info=True)
69
+
70
+
71
+ if os.name == "nt":
72
+ fail_if_no_sox = fail_with_message("requires sox extension, which is not supported on Windows.")
73
+ elif not _USE_SOX:
74
+ fail_if_no_sox = fail_with_message("requires sox extension, but it is disabled. (TORCHAUDIO_USE_SOX=0)")
75
+ elif not _SOX_MODULE_AVAILABLE:
76
+ fail_if_no_sox = fail_with_message(
77
+ "requires sox extension, but TorchAudio is not compiled with it. "
78
+ "Please build TorchAudio with libsox support. (BUILD_SOX=1)"
79
+ )
80
+ else:
81
+ fail_if_no_sox = no_op if _SOX_INITIALIZED else _fail_since_no_sox
61
82
 
62
83
 
63
84
  # Initialize FFmpeg-related features
64
- _FFMPEG_INITIALIZED = False
65
- if is_module_available("torchaudio.lib._torchaudio_ffmpeg"):
85
+ _FFMPEG_EXT = None
86
+ _USE_FFMPEG = eval_env("TORCHAUDIO_USE_FFMPEG", True)
87
+ if _USE_FFMPEG and _IS_TORCHAUDIO_EXT_AVAILABLE:
66
88
  try:
67
- _init_ffmpeg()
68
- _FFMPEG_INITIALIZED = True
89
+ _FFMPEG_EXT = _init_ffmpeg()
69
90
  except Exception:
70
91
  # The initialization of FFmpeg extension will fail if supported FFmpeg
71
92
  # libraries are not found in the system.
@@ -75,22 +96,11 @@ if is_module_available("torchaudio.lib._torchaudio_ffmpeg"):
75
96
  _LG.debug("Failed to initialize ffmpeg bindings", exc_info=True)
76
97
 
77
98
 
78
- fail_if_no_kaldi = (
79
- no_op
80
- if _IS_KALDI_AVAILABLE
81
- else fail_with_message(
82
- "requires kaldi extension, but TorchAudio is not compiled with it. Please build TorchAudio with kaldi support."
83
- )
84
- )
85
- fail_if_no_sox = (
86
- no_op
87
- if _SOX_INITIALIZED
88
- else fail_with_message(
89
- "requires sox extension, but TorchAudio is not compiled with it. Please build TorchAudio with libsox support."
90
- )
91
- )
99
+ if _USE_FFMPEG:
100
+ fail_if_no_ffmpeg = _fail_since_no_ffmpeg if _FFMPEG_EXT is None else no_op
101
+ else:
102
+ fail_if_no_ffmpeg = fail_with_message("requires ffmpeg extension, but it is disabled. (TORCHAUDIO_USE_FFMPEG=0)")
92
103
 
93
- fail_if_no_ffmpeg = no_op if _FFMPEG_INITIALIZED else _fail_since_no_ffmpeg
94
104
 
95
105
  fail_if_no_rir = (
96
106
  no_op
@@ -99,3 +109,12 @@ fail_if_no_rir = (
99
109
  "requires RIR extension, but TorchAudio is not compiled with it. Please build TorchAudio with RIR support."
100
110
  )
101
111
  )
112
+
113
+ fail_if_no_align = (
114
+ no_op
115
+ if _IS_ALIGN_AVAILABLE
116
+ else fail_with_message(
117
+ "Requires alignment extension, but TorchAudio is not compiled with it. \
118
+ Please build TorchAudio with alignment support."
119
+ )
120
+ )