PyPI - torchaudio - Versions diffs - 2.0.2__cp39-cp39-win_amd64.whl → 2.1.1__cp39-cp39-win_amd64.whl - Mend

torchaudio 2.0.2__cp39-cp39-win_amd64.whl → 2.1.1__cp39-cp39-win_amd64.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release.

This version of torchaudio might be problematic. Click here for more details.

Files changed (88)

torchaudio/__init__.py +22 -3
torchaudio/_backend/__init__.py +55 -4
torchaudio/_backend/backend.py +53 -0
torchaudio/_backend/common.py +52 -0
torchaudio/_backend/ffmpeg.py +373 -0
torchaudio/_backend/soundfile.py +54 -0
torchaudio/_backend/soundfile_backend.py +457 -0
torchaudio/_backend/sox.py +91 -0
torchaudio/_backend/utils.py +81 -323
torchaudio/_extension/__init__.py +55 -36
torchaudio/_extension/utils.py +109 -17
torchaudio/_internal/__init__.py +4 -1
torchaudio/_internal/module_utils.py +37 -6
torchaudio/backend/__init__.py +7 -11
torchaudio/backend/_no_backend.py +24 -0
torchaudio/backend/_sox_io_backend.py +297 -0
torchaudio/backend/common.py +12 -52
torchaudio/backend/no_backend.py +11 -21
torchaudio/backend/soundfile_backend.py +11 -448
torchaudio/backend/sox_io_backend.py +11 -435
torchaudio/backend/utils.py +9 -18
torchaudio/datasets/__init__.py +2 -0
torchaudio/datasets/cmuarctic.py +1 -1
torchaudio/datasets/cmudict.py +61 -62
torchaudio/datasets/dr_vctk.py +1 -1
torchaudio/datasets/gtzan.py +1 -1
torchaudio/datasets/librilight_limited.py +1 -1
torchaudio/datasets/librispeech.py +1 -1
torchaudio/datasets/librispeech_biasing.py +189 -0
torchaudio/datasets/libritts.py +1 -1
torchaudio/datasets/ljspeech.py +1 -1
torchaudio/datasets/musdb_hq.py +1 -1
torchaudio/datasets/quesst14.py +1 -1
torchaudio/datasets/speechcommands.py +1 -1
torchaudio/datasets/tedlium.py +1 -1
torchaudio/datasets/vctk.py +1 -1
torchaudio/datasets/voxceleb1.py +1 -1
torchaudio/datasets/yesno.py +1 -1
torchaudio/functional/__init__.py +6 -2
torchaudio/functional/_alignment.py +128 -0
torchaudio/functional/filtering.py +69 -92
torchaudio/functional/functional.py +99 -148
torchaudio/io/__init__.py +4 -1
torchaudio/io/_effector.py +347 -0
torchaudio/io/_stream_reader.py +158 -90
torchaudio/io/_stream_writer.py +196 -10
torchaudio/lib/_torchaudio.pyd +0 -0
torchaudio/lib/_torchaudio_ffmpeg4.pyd +0 -0
torchaudio/lib/_torchaudio_ffmpeg5.pyd +0 -0
torchaudio/lib/_torchaudio_ffmpeg6.pyd +0 -0
torchaudio/lib/libtorchaudio.pyd +0 -0
torchaudio/lib/libtorchaudio_ffmpeg4.pyd +0 -0
torchaudio/lib/libtorchaudio_ffmpeg5.pyd +0 -0
torchaudio/lib/libtorchaudio_ffmpeg6.pyd +0 -0
torchaudio/models/__init__.py +14 -0
torchaudio/models/decoder/__init__.py +22 -7
torchaudio/models/decoder/_ctc_decoder.py +123 -69
torchaudio/models/decoder/_cuda_ctc_decoder.py +187 -0
torchaudio/models/rnnt_decoder.py +10 -14
torchaudio/models/squim/__init__.py +11 -0
torchaudio/models/squim/objective.py +326 -0
torchaudio/models/squim/subjective.py +150 -0
torchaudio/models/wav2vec2/components.py +6 -10
torchaudio/pipelines/__init__.py +9 -0
torchaudio/pipelines/_squim_pipeline.py +176 -0
torchaudio/pipelines/_wav2vec2/aligner.py +87 -0
torchaudio/pipelines/_wav2vec2/impl.py +198 -68
torchaudio/pipelines/_wav2vec2/utils.py +120 -0
torchaudio/sox_effects/sox_effects.py +7 -30
torchaudio/transforms/__init__.py +2 -0
torchaudio/transforms/_transforms.py +99 -54
torchaudio/utils/download.py +2 -2
torchaudio/utils/ffmpeg_utils.py +20 -15
torchaudio/utils/sox_utils.py +8 -9
torchaudio/version.py +2 -2
torchaudio-2.1.1.dist-info/METADATA +113 -0
torchaudio-2.1.1.dist-info/RECORD +115 -0
{torchaudio-2.0.2.dist-info → torchaudio-2.1.1.dist-info}/WHEEL +1 -1
torchaudio/io/_compat.py +0 -241
torchaudio/lib/_torchaudio_ffmpeg.pyd +0 -0
torchaudio/lib/flashlight_lib_text_decoder.pyd +0 -0
torchaudio/lib/flashlight_lib_text_dictionary.pyd +0 -0
torchaudio/lib/libflashlight-text.pyd +0 -0
torchaudio/lib/libtorchaudio_ffmpeg.pyd +0 -0
torchaudio-2.0.2.dist-info/METADATA +0 -26
torchaudio-2.0.2.dist-info/RECORD +0 -98
{torchaudio-2.0.2.dist-info → torchaudio-2.1.1.dist-info}/LICENSE +0 -0
{torchaudio-2.0.2.dist-info → torchaudio-2.1.1.dist-info}/top_level.txt +0 -0

torchaudio/_backend/utils.py CHANGED Viewed

@@ -1,268 +1,25 @@
 import os
-import re
-from abc import ABC, abstractmethod
 from functools import lru_cache
-from typing import BinaryIO, Dict, Optional, Tuple, Union
+from typing import BinaryIO, Dict, Optional, Tuple, Type, Union
 import torch
-import torchaudio.backend.soundfile_backend as soundfile_backend
-from torchaudio._extension import _FFMPEG_INITIALIZED, _SOX_INITIALIZED
-from torchaudio.backend.common import AudioMetaData
-if _FFMPEG_INITIALIZED:
-    from torchaudio.io._compat import info_audio, info_audio_fileobj, load_audio, load_audio_fileobj, save_audio
+from torchaudio._extension import _FFMPEG_EXT, _SOX_INITIALIZED
+from torchaudio.io import CodecConfig
+from . import soundfile_backend
-class Backend(ABC):
-    @staticmethod
-    @abstractmethod
-    def info(uri: Union[BinaryIO, str, os.PathLike], format: Optional[str], buffer_size: int = 4096) -> AudioMetaData:
-        raise NotImplementedError
-    @staticmethod
-    @abstractmethod
-    def load(
-        uri: Union[BinaryIO, str, os.PathLike],
-        frame_offset: int = 0,
-        num_frames: int = -1,
-        normalize: bool = True,
-        channels_first: bool = True,
-        format: Optional[str] = None,
-        buffer_size: int = 4096,
-    ) -> Tuple[torch.Tensor, int]:
-        raise NotImplementedError
-    @staticmethod
-    @abstractmethod
-    def save(
-        uri: Union[BinaryIO, str, os.PathLike],
-        src: torch.Tensor,
-        sample_rate: int,
-        channels_first: bool = True,
-        format: Optional[str] = None,
-        encoding: Optional[str] = None,
-        bits_per_sample: Optional[int] = None,
-        buffer_size: int = 4096,
-    ) -> None:
-        raise NotImplementedError
-    @staticmethod
-    @abstractmethod
-    def can_decode(uri: Union[BinaryIO, str, os.PathLike], format: Optional[str]) -> bool:
-        raise NotImplementedError
-    @staticmethod
-    @abstractmethod
-    def can_encode(uri: Union[BinaryIO, str, os.PathLike], format: Optional[str]) -> bool:
-        raise NotImplementedError
-def _map_encoding(encoding: str) -> str:
-    for dst in ["PCM_S", "PCM_U", "PCM_F"]:
-        if dst in encoding:
-            return dst
-    if encoding == "PCM_MULAW":
-        return "ULAW"
-    elif encoding == "PCM_ALAW":
-        return "ALAW"
-    return encoding
-def _get_bits_per_sample(encoding: str, bits_per_sample: int) -> str:
-    if m := re.search(r"PCM_\w(\d+)\w*", encoding):
-        return int(m.group(1))
-    elif encoding in ["PCM_ALAW", "PCM_MULAW"]:
-        return 8
-    return bits_per_sample
-class FFmpegBackend(Backend):
-    @staticmethod
-    def info(uri: Union[BinaryIO, str, os.PathLike], format: Optional[str], buffer_size: int = 4096) -> AudioMetaData:
-        if hasattr(uri, "read"):
-            metadata = info_audio_fileobj(uri, format, buffer_size=buffer_size)
-        else:
-            metadata = info_audio(os.path.normpath(uri), format)
-        metadata.bits_per_sample = _get_bits_per_sample(metadata.encoding, metadata.bits_per_sample)
-        metadata.encoding = _map_encoding(metadata.encoding)
-        return metadata
-    @staticmethod
-    def load(
-        uri: Union[BinaryIO, str, os.PathLike],
-        frame_offset: int = 0,
-        num_frames: int = -1,
-        normalize: bool = True,
-        channels_first: bool = True,
-        format: Optional[str] = None,
-        buffer_size: int = 4096,
-    ) -> Tuple[torch.Tensor, int]:
-        if hasattr(uri, "read"):
-            return load_audio_fileobj(
-                uri,
-                frame_offset,
-                num_frames,
-                normalize,
-                channels_first,
-                format,
-                buffer_size,
-            )
-        else:
-            return load_audio(os.path.normpath(uri), frame_offset, num_frames, normalize, channels_first, format)
-    @staticmethod
-    def save(
-        uri: Union[BinaryIO, str, os.PathLike],
-        src: torch.Tensor,
-        sample_rate: int,
-        channels_first: bool = True,
-        format: Optional[str] = None,
-        encoding: Optional[str] = None,
-        bits_per_sample: Optional[int] = None,
-        buffer_size: int = 4096,
-    ) -> None:
-        save_audio(
-            uri,
-            src,
-            sample_rate,
-            channels_first,
-            format,
-            encoding,
-            bits_per_sample,
-            buffer_size,
-        )
-    @staticmethod
-    def can_decode(uri: Union[BinaryIO, str, os.PathLike], format: Optional[str]) -> bool:
-        return True
-    @staticmethod
-    def can_encode(uri: Union[BinaryIO, str, os.PathLike], format: Optional[str]) -> bool:
-        return True
-class SoXBackend(Backend):
-    @staticmethod
-    def info(uri: Union[BinaryIO, str, os.PathLike], format: Optional[str], buffer_size: int = 4096) -> AudioMetaData:
-        if hasattr(uri, "read"):
-            raise ValueError(
-                "SoX backend does not support reading from file-like objects. ",
-                "Please use an alternative backend that does support reading from file-like objects, e.g. FFmpeg.",
-            )
-        else:
-            sinfo = torch.ops.torchaudio.sox_io_get_info(uri, format)
-            if sinfo:
-                return AudioMetaData(*sinfo)
-            else:
-                raise RuntimeError(f"Failed to fetch metadata for {uri}.")
-    @staticmethod
-    def load(
-        uri: Union[BinaryIO, str, os.PathLike],
-        frame_offset: int = 0,
-        num_frames: int = -1,
-        normalize: bool = True,
-        channels_first: bool = True,
-        format: Optional[str] = None,
-        buffer_size: int = 4096,
-    ) -> Tuple[torch.Tensor, int]:
-        if hasattr(uri, "read"):
-            raise ValueError(
-                "SoX backend does not support loading from file-like objects. ",
-                "Please use an alternative backend that does support loading from file-like objects, e.g. FFmpeg.",
-            )
-        else:
-            ret = torch.ops.torchaudio.sox_io_load_audio_file(
-                uri, frame_offset, num_frames, normalize, channels_first, format
-            )
-            if not ret:
-                raise RuntimeError(f"Failed to load audio from {uri}.")
-            return ret
-    @staticmethod
-    def save(
-        uri: Union[BinaryIO, str, os.PathLike],
-        src: torch.Tensor,
-        sample_rate: int,
-        channels_first: bool = True,
-        format: Optional[str] = None,
-        encoding: Optional[str] = None,
-        bits_per_sample: Optional[int] = None,
-        buffer_size: int = 4096,
-    ) -> None:
-        if hasattr(uri, "write"):
-            raise ValueError(
-                "SoX backend does not support writing to file-like objects. ",
-                "Please use an alternative backend that does support writing to file-like objects, e.g. FFmpeg.",
-            )
-        else:
-            torch.ops.torchaudio.sox_io_save_audio_file(
-                uri,
-                src,
-                sample_rate,
-                channels_first,
-                None,
-                format,
-                encoding,
-                bits_per_sample,
-            )
-    @staticmethod
-    def can_decode(uri: Union[BinaryIO, str, os.PathLike], format: Optional[str]) -> bool:
-        # i.e. not a file-like object.
-        return not hasattr(uri, "read")
-    @staticmethod
-    def can_encode(uri: Union[BinaryIO, str, os.PathLike], format: Optional[str]) -> bool:
-        # i.e. not a file-like object.
-        return not hasattr(uri, "write")
-class SoundfileBackend(Backend):
-    @abstractmethod
-    def info(uri: Union[BinaryIO, str, os.PathLike], format: Optional[str], buffer_size: int = 4096) -> AudioMetaData:
-        return soundfile_backend.info(uri, format)
-    @abstractmethod
-    def load(
-        uri: Union[BinaryIO, str, os.PathLike],
-        frame_offset: int = 0,
-        num_frames: int = -1,
-        normalize: bool = True,
-        channels_first: bool = True,
-        format: Optional[str] = None,
-        buffer_size: int = 4096,
-    ) -> Tuple[torch.Tensor, int]:
-        return soundfile_backend.load(uri, frame_offset, num_frames, normalize, channels_first, format)
-    @abstractmethod
-    def save(
-        uri: Union[BinaryIO, str, os.PathLike],
-        src: torch.Tensor,
-        sample_rate: int,
-        channels_first: bool = True,
-        format: Optional[str] = None,
-        encoding: Optional[str] = None,
-        bits_per_sample: Optional[int] = None,
-        buffer_size: int = 4096,
-    ) -> None:
-        soundfile_backend.save(
-            uri, src, sample_rate, channels_first, format=format, encoding=encoding, bits_per_sample=bits_per_sample
-        )
-    @abstractmethod
-    def can_decode(uri, format) -> bool:
-        return True
-    @abstractmethod
-    def can_encode(uri, format) -> bool:
-        return True
+from .backend import Backend
+from .common import AudioMetaData
+from .ffmpeg import FFmpegBackend
+from .soundfile import SoundfileBackend
+from .sox import SoXBackend
 @lru_cache(None)
-def get_available_backends() -> Dict[str, Backend]:
-    backend_specs = {}
-    if _FFMPEG_INITIALIZED:
+def get_available_backends() -> Dict[str, Type[Backend]]:
+    backend_specs: Dict[str, Type[Backend]] = {}
+    if _FFMPEG_EXT is not None:
         backend_specs["ffmpeg"] = FFmpegBackend
     if _SOX_INITIALIZED:
         backend_specs["sox"] = SoXBackend
@@ -303,19 +60,19 @@ def get_info_func():
     ) -> AudioMetaData:
         """Get signal information of an audio file.
+        Note:
+            When the input type is file-like object, this function cannot
+            get the correct length (``num_samples``) for certain formats,
+            such as ``vorbis``.
+            In this case, the value of ``num_samples`` is ``0``.
         Args:
             uri (path-like object or file-like object):
                 Source of audio data. The following types are accepted:
-                    * ``path-like``: file path
-                    * ``file-like``: Object with ``read(size: int) -> bytes`` method,
-                      which returns byte string of at most ``size`` length.
-                Note:
-                    When the input type is file-like object, this function cannot
-                    get the correct length (``num_samples``) for certain formats,
-                    such as ``vorbis``.
-                    In this case, the value of ``num_samples`` is ``0``.
+                * ``path-like``: File path or URL.
+                * ``file-like``: Object with ``read(size: int) -> bytes`` method,
+                  which returns byte string of at most ``size`` length.
             format (str or None, optional):
                 If not ``None``, interpreted as hint that may allow backend to override the detected format.
@@ -325,12 +82,17 @@ def get_info_func():
                 Size of buffer to use when processing file-like objects, in bytes. (Default: ``4096``)
             backend (str or None, optional):
-                I/O backend to use. If ``None``, function selects backend given input and available backends.
-                Otherwise, must be one of ["ffmpeg", "sox", "soundfile"], with the corresponding backend available.
+                I/O backend to use.
+                If ``None``, function selects backend given input and available backends.
+                Otherwise, must be one of [``"ffmpeg"``, ``"sox"``, ``"soundfile"``],
+                with the corresponding backend available.
                 (Default: ``None``)
+                .. seealso::
+                   :ref:`backend`
         Returns:
-            AudioMetaData: Metadata of the given audio.
+            AudioMetaData
         """
         backend = dispatcher(uri, format, backend)
         return backend.info(uri, format, buffer_size)
@@ -362,27 +124,19 @@ def get_load_func():
         buffer_size: int = 4096,
         backend: Optional[str] = None,
     ) -> Tuple[torch.Tensor, int]:
-        """Load audio data from file.
-        Note:
-            The formats this function can handle depend on backend availability.
-            This function is tested on the following formats:
-            * WAV
-                * 32-bit floating-point
-                * 32-bit signed integer
-                * 24-bit signed integer
-                * 16-bit signed integer
-                * 8-bit unsigned integer
-            * FLAC
-            * OGG/VORBIS
-            * SPHERE
+        """Load audio data from source.
         By default (``normalize=True``, ``channels_first=True``), this function returns Tensor with
         ``float32`` dtype, and the shape of `[channel, time]`.
+        Note:
+            The formats this function can handle depend on the availability of backends.
+            Please use the following functions to fetch the supported formats.
+            - FFmpeg: :py:func:`torchaudio.utils.ffmpeg_utils.get_audio_decoders`
+            - Sox: :py:func:`torchaudio.utils.sox_utils.list_read_formats`
+            - SoundFile: Refer to `the official document <https://pysoundfile.readthedocs.io/>`__.
         .. warning::
             ``normalize`` argument does not perform volume normalization.
@@ -432,9 +186,13 @@ def get_load_func():
                 Size of buffer to use when processing file-like objects, in bytes. (Default: ``4096``)
             backend (str or None, optional):
-                I/O backend to use. If ``None``, function selects backend given input and available backends.
-                Otherwise, must be one of ["ffmpeg", "sox", "soundfile"], with the corresponding
-                backend being available. (Default: ``None``)
+                I/O backend to use.
+                If ``None``, function selects backend given input and available backends.
+                Otherwise, must be one of [``"ffmpeg"``, ``"sox"``, ``"soundfile"``],
+                with the corresponding backend being available. (Default: ``None``)
+                .. seealso::
+                   :ref:`backend`
         Returns:
             (torch.Tensor, int): Resulting Tensor and sample rate.
@@ -472,22 +230,17 @@ def get_save_func():
         bits_per_sample: Optional[int] = None,
         buffer_size: int = 4096,
         backend: Optional[str] = None,
+        compression: Optional[Union[CodecConfig, float, int]] = None,
     ):
         """Save audio data to file.
         Note:
             The formats this function can handle depend on the availability of backends.
-            This function is tested on the following formats:
+            Please use the following functions to fetch the supported formats.
-            * WAV
-                * 32-bit floating-point
-                * 32-bit signed integer
-                * 16-bit signed integer
-                * 8-bit unsigned integer
-            * FLAC
-            * OGG/VORBIS
+            - FFmpeg: :py:func:`torchaudio.utils.ffmpeg_utils.get_audio_encoders`
+            - Sox: :py:func:`torchaudio.utils.sox_utils.list_write_formats`
+            - SoundFile: Refer to `the official document <https://pysoundfile.readthedocs.io/>`__.
         Args:
             uri (str or pathlib.Path): Path to audio file.
@@ -508,11 +261,11 @@ def get_save_func():
                 This argument is effective only for supported formats, i.e.
                 ``"wav"`` and ``"flac"``. Valid values are
-                    - ``"PCM_S"`` (signed integer Linear PCM)
-                    - ``"PCM_U"`` (unsigned integer Linear PCM)
-                    - ``"PCM_F"`` (floating point PCM)
-                    - ``"ULAW"`` (mu-law)
-                    - ``"ALAW"`` (a-law)
+                - ``"PCM_S"`` (signed integer Linear PCM)
+                - ``"PCM_U"`` (unsigned integer Linear PCM)
+                - ``"PCM_F"`` (floating point PCM)
+                - ``"ULAW"`` (mu-law)
+                - ``"ALAW"`` (a-law)
             bits_per_sample (int or None, optional): Changes the bit depth for the
                 supported formats.
@@ -524,35 +277,40 @@ def get_save_func():
                 Size of buffer to use when processing file-like objects, in bytes. (Default: ``4096``)
             backend (str or None, optional):
-                I/O backend to use. If ``None``, function selects backend given input and available backends.
-                Otherwise, must be one of ["ffmpeg", "sox", "soundfile"], with the corresponding
-                backend being available. (Default: ``None``)
+                I/O backend to use.
+                If ``None``, function selects backend given input and available backends.
+                Otherwise, must be one of [``"ffmpeg"``, ``"sox"``, ``"soundfile"``],
+                with the corresponding backend being available.
+                (Default: ``None``)
+                .. seealso::
+                   :ref:`backend`
+            compression (CodecConfig, float, int, or None, optional):
+                Compression configuration to apply.
-        Supported formats/encodings/bit depth/compression are:
+                If the selected backend is FFmpeg, an instance of :py:class:`CodecConfig` must be provided.
-        ``"wav"``
-            - 32-bit floating-point PCM
-            - 32-bit signed integer PCM
-            - 24-bit signed integer PCM
-            - 16-bit signed integer PCM
-            - 8-bit unsigned integer PCM
-            - 8-bit mu-law
-            - 8-bit a-law
+                Otherwise, if the selected backend is SoX, a float or int value corresponding to option ``-C`` of the
+                ``sox`` command line interface must be provided. For instance:
-            Note:
-                Default encoding/bit depth is determined by the dtype of
-                the input Tensor.
+                ``"mp3"``
+                    Either bitrate (in ``kbps``) with quality factor, such as ``128.2``, or
+                    VBR encoding with quality factor such as ``-4.2``. Default: ``-4.5``.
-        ``"flac"``
-            - 16-bit (default)
-            - 24-bit
+                ``"flac"``
+                    Whole number from ``0`` to ``8``. ``8`` is default and highest compression.
+                ``"ogg"``, ``"vorbis"``
+                    Number from ``-1`` to ``10``; ``-1`` is the highest compression
+                    and lowest quality. Default: ``3``.
+                Refer to http://sox.sourceforge.net/soxformat.html for more details.
-        ``"ogg"``
-            - Doesn't accept changing configuration.
         """
         backend = dispatcher(uri, format, backend)
-        return backend.save(uri, src, sample_rate, channels_first, format, encoding, bits_per_sample, buffer_size)
+        return backend.save(
+            uri, src, sample_rate, channels_first, format, encoding, bits_per_sample, buffer_size, compression
+        )
     return save

torchaudio/_extension/__init__.py CHANGED Viewed

@@ -2,9 +2,13 @@ import logging
 import os
 import sys
-from torchaudio._internal.module_utils import fail_with_message, is_module_available, no_op
+from torchaudio._internal.module_utils import eval_env, fail_with_message, is_module_available, no_op
-from .utils import _check_cuda_version, _fail_since_no_ffmpeg, _init_dll_path, _init_ffmpeg, _init_sox, _load_lib
+try:
+    from .fb import _init_ffmpeg
+except ImportError:
+    from .utils import _init_ffmpeg
+from .utils import _check_cuda_version, _fail_since_no_ffmpeg, _fail_since_no_sox, _init_dll_path, _init_sox, _load_lib
 _LG = logging.getLogger(__name__)
@@ -14,15 +18,13 @@ _LG = logging.getLogger(__name__)
 # Builder uses it for debugging purpose, so we export it.
 # https://github.com/pytorch/builder/blob/e2e4542b8eb0bdf491214451a1a4128bd606cce2/test/smoke_test/smoke_test.py#L80
 __all__ = [
-    "fail_if_no_kaldi",
     "fail_if_no_sox",
     "fail_if_no_ffmpeg",
     "_check_cuda_version",
     "_IS_TORCHAUDIO_EXT_AVAILABLE",
-    "_IS_KALDI_AVAILABLE",
     "_IS_RIR_AVAILABLE",
     "_SOX_INITIALIZED",
-    "_FFMPEG_INITIALIZED",
+    "_FFMPEG_EXT",
 ]
@@ -34,11 +36,11 @@ if os.name == "nt" and (3, 8) <= sys.version_info < (3, 9):
 # In case of an error, we do not catch the failure as it suggests there is something
 # wrong with the installation.
 _IS_TORCHAUDIO_EXT_AVAILABLE = is_module_available("torchaudio.lib._torchaudio")
-# Kaldi and RIR features are implemented in _torchaudio extension, but they can be individually
+# RIR features are implemented in _torchaudio extension, but they can be individually
 # turned on/off at build time. Available means that _torchaudio is loaded properly, and
-# Kaldi or RIR features are found there.
+# RIR features are found there.
 _IS_RIR_AVAILABLE = False
-_IS_KALDI_AVAILABLE = False
+_IS_ALIGN_AVAILABLE = False
 if _IS_TORCHAUDIO_EXT_AVAILABLE:
     _load_lib("libtorchaudio")
@@ -46,26 +48,45 @@ if _IS_TORCHAUDIO_EXT_AVAILABLE:
     _check_cuda_version()
     _IS_RIR_AVAILABLE = torchaudio.lib._torchaudio.is_rir_available()
-    _IS_KALDI_AVAILABLE = torchaudio.lib._torchaudio.is_kaldi_available()
+    _IS_ALIGN_AVAILABLE = torchaudio.lib._torchaudio.is_align_available()
-# Similar to libtorchaudio, sox-related features should be importable when present.
-#
-# Note: This will be change in the future when sox is dynamically linked.
-# At that point, this initialization should handle the case where
-# sox integration is built but libsox is not found.
+# Initialize libsox-related features
 _SOX_INITIALIZED = False
-if is_module_available("torchaudio.lib._torchaudio_sox"):
-    _init_sox()
-    _SOX_INITIALIZED = True
+_USE_SOX = False if os.name == "nt" else eval_env("TORCHAUDIO_USE_SOX", True)
+_SOX_MODULE_AVAILABLE = is_module_available("torchaudio.lib._torchaudio_sox")
+if _USE_SOX and _SOX_MODULE_AVAILABLE:
+    try:
+        _init_sox()
+        _SOX_INITIALIZED = True
+    except Exception:
+        # The initialization of sox extension will fail if supported sox
+        # libraries are not found in the system.
+        # Since the rest of the torchaudio works without it, we do not report the
+        # error here.
+        # The error will be raised when user code attempts to use these features.
+        _LG.debug("Failed to initialize sox extension", exc_info=True)
+if os.name == "nt":
+    fail_if_no_sox = fail_with_message("requires sox extension, which is not supported on Windows.")
+elif not _USE_SOX:
+    fail_if_no_sox = fail_with_message("requires sox extension, but it is disabled. (TORCHAUDIO_USE_SOX=0)")
+elif not _SOX_MODULE_AVAILABLE:
+    fail_if_no_sox = fail_with_message(
+        "requires sox extension, but TorchAudio is not compiled with it. "
+        "Please build TorchAudio with libsox support. (BUILD_SOX=1)"
+    )
+else:
+    fail_if_no_sox = no_op if _SOX_INITIALIZED else _fail_since_no_sox
 # Initialize FFmpeg-related features
-_FFMPEG_INITIALIZED = False
-if is_module_available("torchaudio.lib._torchaudio_ffmpeg"):
+_FFMPEG_EXT = None
+_USE_FFMPEG = eval_env("TORCHAUDIO_USE_FFMPEG", True)
+if _USE_FFMPEG and _IS_TORCHAUDIO_EXT_AVAILABLE:
     try:
-        _init_ffmpeg()
-        _FFMPEG_INITIALIZED = True
+        _FFMPEG_EXT = _init_ffmpeg()
     except Exception:
         # The initialization of FFmpeg extension will fail if supported FFmpeg
         # libraries are not found in the system.
@@ -75,22 +96,11 @@ if is_module_available("torchaudio.lib._torchaudio_ffmpeg"):
         _LG.debug("Failed to initialize ffmpeg bindings", exc_info=True)
-fail_if_no_kaldi = (
-    no_op
-    if _IS_KALDI_AVAILABLE
-    else fail_with_message(
-        "requires kaldi extension, but TorchAudio is not compiled with it. Please build TorchAudio with kaldi support."
-    )
-)
-fail_if_no_sox = (
-    no_op
-    if _SOX_INITIALIZED
-    else fail_with_message(
-        "requires sox extension, but TorchAudio is not compiled with it. Please build TorchAudio with libsox support."
-    )
-)
+if _USE_FFMPEG:
+    fail_if_no_ffmpeg = _fail_since_no_ffmpeg if _FFMPEG_EXT is None else no_op
+else:
+    fail_if_no_ffmpeg = fail_with_message("requires ffmpeg extension, but it is disabled. (TORCHAUDIO_USE_FFMPEG=0)")
-fail_if_no_ffmpeg = no_op if _FFMPEG_INITIALIZED else _fail_since_no_ffmpeg
 fail_if_no_rir = (
     no_op
@@ -99,3 +109,12 @@ fail_if_no_rir = (
         "requires RIR extension, but TorchAudio is not compiled with it. Please build TorchAudio with RIR support."
     )
 )
+fail_if_no_align = (
+    no_op
+    if _IS_ALIGN_AVAILABLE
+    else fail_with_message(
+        "Requires alignment extension, but TorchAudio is not compiled with it. \
+        Please build TorchAudio with alignment support."
+    )
+)