torchaudio 2.0.2-cp39-cp39-manylinux1_x86_64.whl → 2.1.1-cp39-cp39-manylinux1_x86_64.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release.

This version of torchaudio might be problematic; details are available on the package's registry page.

Files changed (92)
  1. torchaudio/__init__.py +22 -3
  2. torchaudio/_backend/__init__.py +55 -4
  3. torchaudio/_backend/backend.py +53 -0
  4. torchaudio/_backend/common.py +52 -0
  5. torchaudio/_backend/ffmpeg.py +373 -0
  6. torchaudio/_backend/soundfile.py +54 -0
  7. torchaudio/_backend/soundfile_backend.py +457 -0
  8. torchaudio/_backend/sox.py +91 -0
  9. torchaudio/_backend/utils.py +81 -323
  10. torchaudio/_extension/__init__.py +55 -36
  11. torchaudio/_extension/utils.py +109 -17
  12. torchaudio/_internal/__init__.py +4 -1
  13. torchaudio/_internal/module_utils.py +37 -6
  14. torchaudio/backend/__init__.py +7 -11
  15. torchaudio/backend/_no_backend.py +24 -0
  16. torchaudio/backend/_sox_io_backend.py +297 -0
  17. torchaudio/backend/common.py +12 -52
  18. torchaudio/backend/no_backend.py +11 -21
  19. torchaudio/backend/soundfile_backend.py +11 -448
  20. torchaudio/backend/sox_io_backend.py +11 -435
  21. torchaudio/backend/utils.py +9 -18
  22. torchaudio/datasets/__init__.py +2 -0
  23. torchaudio/datasets/cmuarctic.py +1 -1
  24. torchaudio/datasets/cmudict.py +61 -62
  25. torchaudio/datasets/dr_vctk.py +1 -1
  26. torchaudio/datasets/gtzan.py +1 -1
  27. torchaudio/datasets/librilight_limited.py +1 -1
  28. torchaudio/datasets/librispeech.py +1 -1
  29. torchaudio/datasets/librispeech_biasing.py +189 -0
  30. torchaudio/datasets/libritts.py +1 -1
  31. torchaudio/datasets/ljspeech.py +1 -1
  32. torchaudio/datasets/musdb_hq.py +1 -1
  33. torchaudio/datasets/quesst14.py +1 -1
  34. torchaudio/datasets/speechcommands.py +1 -1
  35. torchaudio/datasets/tedlium.py +1 -1
  36. torchaudio/datasets/vctk.py +1 -1
  37. torchaudio/datasets/voxceleb1.py +1 -1
  38. torchaudio/datasets/yesno.py +1 -1
  39. torchaudio/functional/__init__.py +6 -2
  40. torchaudio/functional/_alignment.py +128 -0
  41. torchaudio/functional/filtering.py +69 -92
  42. torchaudio/functional/functional.py +99 -148
  43. torchaudio/io/__init__.py +4 -1
  44. torchaudio/io/_effector.py +347 -0
  45. torchaudio/io/_stream_reader.py +158 -90
  46. torchaudio/io/_stream_writer.py +196 -10
  47. torchaudio/lib/_torchaudio.so +0 -0
  48. torchaudio/lib/_torchaudio_ffmpeg4.so +0 -0
  49. torchaudio/lib/_torchaudio_ffmpeg5.so +0 -0
  50. torchaudio/lib/_torchaudio_ffmpeg6.so +0 -0
  51. torchaudio/lib/_torchaudio_sox.so +0 -0
  52. torchaudio/lib/libctc_prefix_decoder.so +0 -0
  53. torchaudio/lib/libtorchaudio.so +0 -0
  54. torchaudio/lib/libtorchaudio_ffmpeg4.so +0 -0
  55. torchaudio/lib/libtorchaudio_ffmpeg5.so +0 -0
  56. torchaudio/lib/libtorchaudio_ffmpeg6.so +0 -0
  57. torchaudio/lib/libtorchaudio_sox.so +0 -0
  58. torchaudio/lib/pybind11_prefixctc.so +0 -0
  59. torchaudio/models/__init__.py +14 -0
  60. torchaudio/models/decoder/__init__.py +22 -7
  61. torchaudio/models/decoder/_ctc_decoder.py +123 -69
  62. torchaudio/models/decoder/_cuda_ctc_decoder.py +187 -0
  63. torchaudio/models/rnnt_decoder.py +10 -14
  64. torchaudio/models/squim/__init__.py +11 -0
  65. torchaudio/models/squim/objective.py +326 -0
  66. torchaudio/models/squim/subjective.py +150 -0
  67. torchaudio/models/wav2vec2/components.py +6 -10
  68. torchaudio/pipelines/__init__.py +9 -0
  69. torchaudio/pipelines/_squim_pipeline.py +176 -0
  70. torchaudio/pipelines/_wav2vec2/aligner.py +87 -0
  71. torchaudio/pipelines/_wav2vec2/impl.py +198 -68
  72. torchaudio/pipelines/_wav2vec2/utils.py +120 -0
  73. torchaudio/sox_effects/sox_effects.py +7 -30
  74. torchaudio/transforms/__init__.py +2 -0
  75. torchaudio/transforms/_transforms.py +99 -54
  76. torchaudio/utils/download.py +2 -2
  77. torchaudio/utils/ffmpeg_utils.py +20 -15
  78. torchaudio/utils/sox_utils.py +8 -9
  79. torchaudio/version.py +2 -2
  80. torchaudio-2.1.1.dist-info/METADATA +113 -0
  81. torchaudio-2.1.1.dist-info/RECORD +119 -0
  82. torchaudio/io/_compat.py +0 -241
  83. torchaudio/lib/_torchaudio_ffmpeg.so +0 -0
  84. torchaudio/lib/flashlight_lib_text_decoder.so +0 -0
  85. torchaudio/lib/flashlight_lib_text_dictionary.so +0 -0
  86. torchaudio/lib/libflashlight-text.so +0 -0
  87. torchaudio/lib/libtorchaudio_ffmpeg.so +0 -0
  88. torchaudio-2.0.2.dist-info/METADATA +0 -26
  89. torchaudio-2.0.2.dist-info/RECORD +0 -100
  90. {torchaudio-2.0.2.dist-info → torchaudio-2.1.1.dist-info}/LICENSE +0 -0
  91. {torchaudio-2.0.2.dist-info → torchaudio-2.1.1.dist-info}/WHEEL +0 -0
  92. {torchaudio-2.0.2.dist-info → torchaudio-2.1.1.dist-info}/top_level.txt +0 -0
torchaudio/io/_stream_writer.py
@@ -1,9 +1,45 @@
+from dataclasses import dataclass
 from typing import BinaryIO, Dict, Optional, Union
 
 import torch
 import torchaudio
 
 
+if torchaudio._extension._FFMPEG_EXT is None:
+    ConfigBase = object
+else:
+    ConfigBase = torchaudio._extension._FFMPEG_EXT.CodecConfig
+    _StreamWriter = torchaudio._extension._FFMPEG_EXT.StreamWriter
+    _StreamWriterFileObj = torchaudio._extension._FFMPEG_EXT.StreamWriterFileObj
+
+
+@dataclass
+class CodecConfig(ConfigBase):
+    """Codec configuration."""
+
+    bit_rate: int = -1
+    """Bit rate"""
+
+    compression_level: int = -1
+    """Compression level"""
+
+    qscale: Optional[int] = None
+    """Global quality factor. Enables variable bit rate. Valid values depend on encoder.
+
+    For example: MP3 takes ``0`` - ``9`` (https://trac.ffmpeg.org/wiki/Encode/MP3) while
+    libvorbis takes ``-1`` - ``10``.
+    """
+
+    gop_size: int = -1
+    """The number of pictures in a group of pictures, or 0 for intra_only"""
+
+    max_b_frames: int = -1
+    """Maximum number of B-frames between non-B-frames."""
+
+    def __post_init__(self):
+        super().__init__(self.bit_rate, self.compression_level, self.qscale, self.gop_size, self.max_b_frames)
+
+
 def _format_doc(**kwargs):
     def decorator(obj):
         obj.__doc__ = obj.__doc__.format(**kwargs)
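
The new ``CodecConfig`` dataclass shown above is exported from ``torchaudio.io`` and is passed to ``add_audio_stream``/``add_video_stream`` via the new ``codec_config`` argument. A minimal sketch, not part of the diff; the output path and bit rate are illustrative:

    import torch
    from torchaudio.io import CodecConfig, StreamWriter

    writer = StreamWriter("output.mp3")
    writer.add_audio_stream(
        sample_rate=44100,
        num_channels=2,
        codec_config=CodecConfig(bit_rate=192_000),  # request ~192 kbps
    )
    with writer.open():
        writer.write_audio_chunk(0, torch.zeros(44100, 2))  # one second of silence, float32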
@@ -28,7 +64,29 @@ _encoder_option = """Options passed to encoder.
 To list encoder options for an encoder, you can use
 ``ffmpeg -h encoder=<ENCODER>`` command.
 
-Default: ``None``."""
+Default: ``None``.
+
+|
+
+In addition to encoder-specific options, you can also pass options related
+to multithreading. They are effective only if the encoder supports them.
+If neither of them is provided, StreamWriter defaults to a single thread.
+
+``"threads"``: The number of threads (in str).
+Providing the value ``"0"`` will let FFmpeg decide based on its heuristics.
+
+``"thread_type"``: Which multithreading method to use.
+The valid values are ``"frame"`` or ``"slice"``.
+Note that each encoder supports a different set of methods.
+If not provided, a default value is used.
+
+- ``"frame"``: Encode more than one frame at once.
+  Each thread handles one frame.
+  This will increase encoding delay by one frame per thread.
+- ``"slice"``: Encode more than one part of a single frame at once.
+
+|
+"""
 
 
 _encoder_format = """Format used to encode media.
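
The multithreading keys described above travel through the same ``encoder_option`` dict as encoder-specific options. A hedged sketch; the values are illustrative:

    writer.add_video_stream(
        frame_rate=30,
        width=640,
        height=480,
        encoder_option={"threads": "4", "thread_type": "frame"},  # 4 frame-parallel threads
    )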
@@ -38,13 +96,34 @@ _encoder_format = """Format used to encode media.
 To list supported formats for the encoder, you can use
 ``ffmpeg -h encoder=<ENCODER>`` command.
 
+Default: ``None``.
+
+Note:
+    When the ``encoder_format`` option is not provided, the encoder uses its default format.
+
+    For example, when encoding audio into wav format, 16-bit signed integer is used,
+    and when encoding video into mp4 format (h264 encoder), one of the YUV formats is used.
+
+    This is because, typically, 32-bit or 16-bit floating point is used in audio models but
+    these are not commonly used in audio formats. Similarly, RGB24 is commonly used in vision
+    models, but video formats usually (and better) support YUV formats.
+"""
+
+_codec_config = """Codec configuration. Please refer to :py:class:`CodecConfig` for
+configuration options.
+
 Default: ``None``."""
 
 
+_filter_desc = """Additional processing to apply before encoding the input media.
+"""
+
 _format_common_args = _format_doc(
     encoder=_encoder,
     encoder_option=_encoder_option,
     encoder_format=_encoder_format,
+    codec_config=_codec_config,
+    filter_desc=_filter_desc,
 )
@@ -109,11 +188,10 @@ class StreamWriter:
         format: Optional[str] = None,
         buffer_size: int = 4096,
     ):
-        torch._C._log_api_usage_once("torchaudio.io.StreamWriter")
         if isinstance(dst, str):
-            self._s = torch.classes.torchaudio.ffmpeg_StreamWriter(dst, format)
+            self._s = _StreamWriter(dst, format)
         elif hasattr(dst, "write"):
-            self._s = torchaudio.lib._torchaudio_ffmpeg.StreamWriterFileObj(dst, format, buffer_size)
+            self._s = _StreamWriterFileObj(dst, format, buffer_size)
         else:
             raise ValueError("`dst` must be either a string or a file-like object.")
         self._is_open = False
@@ -124,9 +202,14 @@ class StreamWriter:
         sample_rate: int,
         num_channels: int,
         format: str = "flt",
+        *,
         encoder: Optional[str] = None,
         encoder_option: Optional[Dict[str, str]] = None,
+        encoder_sample_rate: Optional[int] = None,
+        encoder_num_channels: Optional[int] = None,
         encoder_format: Optional[str] = None,
+        codec_config: Optional[CodecConfig] = None,
+        filter_desc: Optional[str] = None,
     ):
         """Add an output audio stream.
 
@@ -151,9 +234,56 @@ class StreamWriter:
 
             encoder_option (dict or None, optional): {encoder_option}
 
+            encoder_sample_rate (int or None, optional): Override the sample rate used at encoding time.
+                Some encoders pose restrictions on the sample rate that can be used for encoding.
+                If the source sample rate is supported by the encoder, it is used;
+                otherwise, a default one is picked.
+
+                For example, the ``"opus"`` encoder only supports 48 kHz, so, when encoding a
+                waveform with the ``"opus"`` encoder, it is always encoded as 48 kHz.
+                Meanwhile ``"mp3"`` (``"libmp3lame"``) supports 44.1, 48, 32, 22.05,
+                24, 16, 11.025, 12 and 8 kHz.
+                If the original sample rate is one of these, then the original sample rate
+                is used; otherwise it will be resampled to a default one (44.1 kHz).
+                When encoding into WAV format, there is no restriction on sample rate,
+                so the original sample rate will be used.
+
+                Providing ``encoder_sample_rate`` will override this behavior and
+                make the encoder attempt to use the provided sample rate.
+                The provided value must be one supported by the encoder.
+
+            encoder_num_channels (int or None, optional): Override the number of channels used for encoding.
+
+                Similar to sample rate, some encoders (such as ``"opus"``,
+                ``"vorbis"`` and ``"g722"``) pose restrictions on
+                the number of channels that can be used for encoding.
+
+                If the original number of channels is supported by the encoder,
+                then it will be used; otherwise, the encoder attempts to
+                remix the channels to one of the supported configurations.
+
+                Providing ``encoder_num_channels`` will override this behavior and
+                make the encoder attempt to use the provided number of channels.
+                The provided value must be one supported by the encoder.
+
             encoder_format (str or None, optional): {encoder_format}
+
+            codec_config (CodecConfig or None, optional): {codec_config}
+
+            filter_desc (str or None, optional): {filter_desc}
         """
-        self._s.add_audio_stream(sample_rate, num_channels, format, encoder, encoder_option, encoder_format)
+        self._s.add_audio_stream(
+            sample_rate,
+            num_channels,
+            format,
+            encoder,
+            encoder_option,
+            encoder_format,
+            encoder_sample_rate,
+            encoder_num_channels,
+            codec_config,
+            filter_desc,
+        )
 
     @_format_common_args
     def add_video_stream(
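
A hedged sketch of the ``encoder_sample_rate`` override documented above (the file name, rates, and filter are illustrative): the source tensors are 22.05 kHz, but libmp3lame is asked to encode at 32 kHz, so the writer resamples on the fly; ``filter_desc`` accepts an FFmpeg filter description applied before encoding.

    writer = StreamWriter("output.mp3")
    writer.add_audio_stream(
        sample_rate=22050,          # rate of the tensors passed to write_audio_chunk
        num_channels=1,
        encoder_sample_rate=32000,  # 32 kHz is in libmp3lame's supported set
        filter_desc="volume=0.8",   # example FFmpeg filter applied before encoding
    )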
@@ -162,9 +292,15 @@ class StreamWriter:
         width: int,
         height: int,
         format: str = "rgb24",
+        *,
         encoder: Optional[str] = None,
         encoder_option: Optional[Dict[str, str]] = None,
+        encoder_frame_rate: Optional[float] = None,
+        encoder_width: Optional[int] = None,
+        encoder_height: Optional[int] = None,
         encoder_format: Optional[str] = None,
+        codec_config: Optional[CodecConfig] = None,
+        filter_desc: Optional[str] = None,
         hw_accel: Optional[str] = None,
     ):
         """Add an output video stream.
@@ -195,8 +331,30 @@ class StreamWriter:
 
             encoder_option (dict or None, optional): {encoder_option}
 
+            encoder_frame_rate (float or None, optional): Override the frame rate used for encoding.
+
+                Some encoders (such as ``"mpeg1"`` and ``"mpeg2"``) pose restrictions on the
+                frame rate that can be used for encoding.
+                In that case, if the source frame rate (provided as ``frame_rate``) is not
+                one of the supported frame rates, then a default one is picked, and the frame rate
+                is changed on-the-fly. Otherwise, the source frame rate is used.
+
+                Providing ``encoder_frame_rate`` will override this behavior and
+                make the encoder attempt to use the provided frame rate.
+                The provided value must be one supported by the encoder.
+
+            encoder_width (int or None, optional): Width of the image used for encoding.
+                This allows changing the image size during encoding.
+
+            encoder_height (int or None, optional): Height of the image used for encoding.
+                This allows changing the image size during encoding.
+
             encoder_format (str or None, optional): {encoder_format}
 
+            codec_config (CodecConfig or None, optional): {codec_config}
+
+            filter_desc (str or None, optional): {filter_desc}
+
             hw_accel (str or None, optional): Enable hardware acceleration.
 
                 When video is encoded on CUDA hardware, for example
@@ -207,7 +365,21 @@ class StreamWriter:
                 If `None`, the video chunk Tensor has to be CPU Tensor.
                 Default: ``None``.
         """
-        self._s.add_video_stream(frame_rate, width, height, format, encoder, encoder_option, encoder_format, hw_accel)
+        self._s.add_video_stream(
+            frame_rate,
+            width,
+            height,
+            format,
+            encoder,
+            encoder_option,
+            encoder_format,
+            encoder_frame_rate,
+            encoder_width,
+            encoder_height,
+            hw_accel,
+            codec_config,
+            filter_desc,
+        )
 
     def set_metadata(self, metadata: Dict[str, str]):
         """Set file-level metadata
@@ -276,17 +448,24 @@ class StreamWriter:
         self._s.close()
         self._is_open = False
 
-    def write_audio_chunk(self, i: int, chunk: torch.Tensor):
+    def write_audio_chunk(self, i: int, chunk: torch.Tensor, pts: Optional[float] = None):
         """Write audio data
 
         Args:
             i (int): Stream index.
             chunk (Tensor): Waveform tensor. Shape: `(frame, channel)`.
                 The ``dtype`` must match what was passed to the :py:meth:`add_audio_stream` method.
+            pts (float or None, optional): If provided, overwrite the presentation timestamp.
+
+                .. note::
+
+                   The provided value is converted to an integer value expressed in the basis of
+                   sample rate. Therefore, it is truncated to the nearest value of
+                   ``n / sample_rate``.
         """
-        self._s.write_audio_chunk(i, chunk)
+        self._s.write_audio_chunk(i, chunk, pts)
 
-    def write_video_chunk(self, i: int, chunk: torch.Tensor):
+    def write_video_chunk(self, i: int, chunk: torch.Tensor, pts: Optional[float] = None):
         """Write video/image data
 
         Args:
@@ -296,8 +475,15 @@ class StreamWriter:
                 The ``dtype`` must be ``torch.uint8``.
                 The shape (height, width and the number of channels) must match
                 what was configured when calling :py:meth:`add_video_stream`.
+            pts (float or None, optional): If provided, overwrite the presentation timestamp.
+
+                .. note::
+
+                   The provided value is converted to an integer value expressed in the basis of
+                   frame rate. Therefore, it is truncated to the nearest value of
+                   ``n / frame_rate``.
         """
-        self._s.write_video_chunk(i, chunk)
+        self._s.write_video_chunk(i, chunk, pts)
 
     def flush(self):
         """Flush the frames from encoders and write the frames to the destination."""
Binary files changed (the torchaudio/lib/*.so shared libraries listed above): contents not shown.
torchaudio/models/__init__.py
@@ -5,6 +5,14 @@ from .deepspeech import DeepSpeech
 from .emformer import Emformer
 from .rnnt import emformer_rnnt_base, emformer_rnnt_model, RNNT
 from .rnnt_decoder import Hypothesis, RNNTBeamSearch
+from .squim import (
+    squim_objective_base,
+    squim_objective_model,
+    squim_subjective_base,
+    squim_subjective_model,
+    SquimObjective,
+    SquimSubjective,
+)
 from .tacotron2 import Tacotron2
 from .wav2letter import Wav2Letter
 from .wav2vec2 import (
@@ -68,4 +76,10 @@ __all__ = [
     "hdemucs_low",
     "hdemucs_medium",
     "hdemucs_high",
+    "squim_objective_base",
+    "squim_objective_model",
+    "squim_subjective_base",
+    "squim_subjective_model",
+    "SquimObjective",
+    "SquimSubjective",
 ]
torchaudio/models/decoder/__init__.py
@@ -1,5 +1,4 @@
-_INITIALIZED = False
-_LAZILY_IMPORTED = [
+_CTC_DECODERS = [
     "CTCHypothesis",
     "CTCDecoder",
     "CTCDecoderLM",
@@ -7,25 +6,41 @@ _LAZILY_IMPORTED = [
     "ctc_decoder",
     "download_pretrained_files",
 ]
+_CUDA_CTC_DECODERS = [
+    "CUCTCDecoder",
+    "CUCTCHypothesis",
+    "cuda_ctc_decoder",
+]
 
 
 def __getattr__(name: str):
-    if name in _LAZILY_IMPORTED:
+    if name in _CTC_DECODERS:
         try:
             from . import _ctc_decoder
-        except AttributeError as err:
+        except Exception as err:
             raise RuntimeError(
-                "CTC decoder requires the decoder extension. Please set BUILD_CTC_DECODER=1 when building from source."
+                "The CTC decoder suite requires the flashlight-text package and, optionally, KenLM. Please install them."
             ) from err
 
         item = getattr(_ctc_decoder, name)
         globals()[name] = item
         return item
+    elif name in _CUDA_CTC_DECODERS:
+        try:
+            from . import _cuda_ctc_decoder
+        except AttributeError as err:
+            raise RuntimeError(
+                "To use the CUCTC decoder, please set BUILD_CUDA_CTC_DECODER=1 when building from source."
+            ) from err
+
+        item = getattr(_cuda_ctc_decoder, name)
+        globals()[name] = item
+        return item
     raise AttributeError(f"module {__name__} has no attribute {name}")
 
 
 def __dir__():
-    return sorted(__all__ + _LAZILY_IMPORTED)
+    return sorted(__all__)
 
 
-__all__ = []
+__all__ = _CTC_DECODERS + _CUDA_CTC_DECODERS
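
The module-level ``__getattr__``/``__dir__`` pair above is the PEP 562 lazy-import pattern: the decoder submodules (and their native dependencies) are imported only when one of the listed names is first accessed. A sketch of what triggers each branch:

    import torchaudio.models.decoder

    # First access runs __getattr__, which imports _ctc_decoder and caches
    # the attribute in the module globals for subsequent lookups.
    factory = torchaudio.models.decoder.ctc_decoder

    # Similarly, this first access imports _cuda_ctc_decoder.
    cuda_factory = torchaudio.models.decoder.cuda_ctc_decoder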
torchaudio/models/decoder/_ctc_decoder.py
@@ -2,69 +2,38 @@ from __future__ import annotations
 
 import itertools as it
 
-import warnings
 from abc import abstractmethod
 from collections import namedtuple
 from typing import Dict, List, NamedTuple, Optional, Tuple, Union
 
 import torch
-import torchaudio
-from torchaudio.utils import download_asset
-
 
-# We prioritize the version from upstream flashlight here.
-# This will allow applications that use the upstream flashlight
-# alongside torchaudio.
-if torchaudio._internal.module_utils.is_module_available("flashlight"):
-    from flashlight.lib.text.decoder import (
-        CriterionType as _CriterionType,
-        LexiconDecoder as _LexiconDecoder,
-        LexiconDecoderOptions as _LexiconDecoderOptions,
-        LexiconFreeDecoder as _LexiconFreeDecoder,
-        LexiconFreeDecoderOptions as _LexiconFreeDecoderOptions,
-        LM as _LM,
-        LMState as _LMState,
-        SmearingMode as _SmearingMode,
-        Trie as _Trie,
-        ZeroLM as _ZeroLM,
-    )
-    from flashlight.lib.text.dictionary import (
-        create_word_dict as _create_word_dict,
-        Dictionary as _Dictionary,
-        load_words as _load_words,
-    )
+from flashlight.lib.text.decoder import (
+    CriterionType as _CriterionType,
+    LexiconDecoder as _LexiconDecoder,
+    LexiconDecoderOptions as _LexiconDecoderOptions,
+    LexiconFreeDecoder as _LexiconFreeDecoder,
+    LexiconFreeDecoderOptions as _LexiconFreeDecoderOptions,
+    LM as _LM,
+    LMState as _LMState,
+    SmearingMode as _SmearingMode,
+    Trie as _Trie,
+    ZeroLM as _ZeroLM,
+)
+from flashlight.lib.text.dictionary import (
+    create_word_dict as _create_word_dict,
+    Dictionary as _Dictionary,
+    load_words as _load_words,
+)
+from torchaudio.utils import download_asset
 
+try:
+    from flashlight.lib.text.decoder.kenlm import KenLM as _KenLM
+except Exception:
     try:
         from flashlight.lib.text.decoder import KenLM as _KenLM
     except Exception:
         _KenLM = None
-else:
-    torchaudio._extension._load_lib("libflashlight-text")
-    from torchaudio.lib.flashlight_lib_text_decoder import (
-        CriterionType as _CriterionType,
-        KenLM as _KenLM,
-        LexiconDecoder as _LexiconDecoder,
-        LexiconDecoderOptions as _LexiconDecoderOptions,
-        LexiconFreeDecoder as _LexiconFreeDecoder,
-        LexiconFreeDecoderOptions as _LexiconFreeDecoderOptions,
-        LM as _LM,
-        LMState as _LMState,
-        SmearingMode as _SmearingMode,
-        Trie as _Trie,
-        ZeroLM as _ZeroLM,
-    )
-    from torchaudio.lib.flashlight_lib_text_dictionary import (
-        create_word_dict as _create_word_dict,
-        Dictionary as _Dictionary,
-        load_words as _load_words,
-    )
-
-    warnings.warn(
-        "The built-in flashlight integration is deprecated, and will be removed in future release. "
-        "Please install flashlight-text. https://pypi.org/project/flashlight-text/ "
-        "For the detail of CTC decoder migration, please see https://github.com/pytorch/audio/issues/3088."
-    )
-
 
 __all__ = [
     "CTCHypothesis",
@@ -292,10 +261,102 @@ class CTCDecoder:
             timesteps.append(i)
         return torch.IntTensor(timesteps)
 
+    def decode_begin(self):
+        """Initialize the internal state of the decoder.
+
+        See :py:meth:`decode_step` for the usage.
+
+        .. note::
+
+           This method is required only when performing online decoding.
+           It is not necessary when performing batch decoding with :py:meth:`__call__`.
+        """
+        self.decoder.decode_begin()
+
+    def decode_end(self):
+        """Finalize the internal state of the decoder.
+
+        See :py:meth:`decode_step` for the usage.
+
+        .. note::
+
+           This method is required only when performing online decoding.
+           It is not necessary when performing batch decoding with :py:meth:`__call__`.
+        """
+        self.decoder.decode_end()
+
+    def decode_step(self, emissions: torch.FloatTensor):
+        """Perform incremental decoding on top of the current internal state.
+
+        .. note::
+
+           This method is required only when performing online decoding.
+           It is not necessary when performing batch decoding with :py:meth:`__call__`.
+
+        Args:
+            emissions (torch.FloatTensor): CPU tensor of shape `(frame, num_tokens)` storing sequences of
+                probability distribution over labels; output of acoustic model.
+
+        Example:
+            >>> decoder = torchaudio.models.decoder.ctc_decoder(...)
+            >>> decoder.decode_begin()
+            >>> decoder.decode_step(emission1)
+            >>> decoder.decode_step(emission2)
+            >>> decoder.decode_end()
+            >>> result = decoder.get_final_hypothesis()
+        """
+        if emissions.dtype != torch.float32:
+            raise ValueError("emissions must be float32.")
+
+        if not emissions.is_cpu:
+            raise RuntimeError("emissions must be a CPU tensor.")
+
+        if not emissions.is_contiguous():
+            raise RuntimeError("emissions must be contiguous.")
+
+        if emissions.ndim != 2:
+            raise RuntimeError(f"emissions must be 2D. Found {emissions.shape}")
+
+        T, N = emissions.size()
+        self.decoder.decode_step(emissions.data_ptr(), T, N)
+
+    def _to_hypo(self, results) -> List[CTCHypothesis]:
+        return [
+            CTCHypothesis(
+                tokens=self._get_tokens(result.tokens),
+                words=[self.word_dict.get_entry(x) for x in result.words if x >= 0],
+                score=result.score,
+                timesteps=self._get_timesteps(result.tokens),
+            )
+            for result in results
+        ]
+
+    def get_final_hypothesis(self) -> List[CTCHypothesis]:
+        """Get the final hypothesis
+
+        Returns:
+            List[CTCHypothesis]:
+                List of sorted best hypotheses.
+
+        .. note::
+
+           This method is required only when performing online decoding.
+           It is not necessary when performing batch decoding with :py:meth:`__call__`.
+        """
+        results = self.decoder.get_all_final_hypothesis()
+        return self._to_hypo(results[: self.nbest])
+
     def __call__(
         self, emissions: torch.FloatTensor, lengths: Optional[torch.Tensor] = None
     ) -> List[List[CTCHypothesis]]:
         """
+        Performs batched offline decoding.
+
+        .. note::
+
+           This method performs offline decoding in one go. To perform incremental decoding,
+           please refer to :py:meth:`decode_step`.
+
         Args:
             emissions (torch.FloatTensor): CPU tensor of shape `(batch, frame, num_tokens)` storing sequences of
                 probability distribution over labels; output of acoustic model.
@@ -310,13 +371,16 @@ class CTCDecoder:
         if emissions.dtype != torch.float32:
             raise ValueError("emissions must be float32.")
 
-        if emissions.is_cuda:
+        if not emissions.is_cpu:
             raise RuntimeError("emissions must be a CPU tensor.")
 
         if not emissions.is_contiguous():
             raise RuntimeError("emissions must be contiguous.")
 
-        if lengths is not None and lengths.is_cuda:
+        if emissions.ndim != 3:
+            raise RuntimeError(f"emissions must be 3D. Found {emissions.shape}")
+
+        if lengths is not None and not lengths.is_cpu:
             raise RuntimeError("lengths must be a CPU tensor.")
 
         B, T, N = emissions.size()
@@ -329,20 +393,7 @@ class CTCDecoder:
         for b in range(B):
             emissions_ptr = emissions.data_ptr() + float_bytes * b * emissions.stride(0)
             results = self.decoder.decode(emissions_ptr, lengths[b], N)
-
-            nbest_results = results[: self.nbest]
-            hypos.append(
-                [
-                    CTCHypothesis(
-                        tokens=self._get_tokens(result.tokens),
-                        words=[self.word_dict.get_entry(x) for x in result.words if x >= 0],
-                        score=result.score,
-                        timesteps=self._get_timesteps(result.tokens),
-                    )
-                    for result in nbest_results
-                ]
-            )
-
+            hypos.append(self._to_hypo(results[: self.nbest]))
         return hypos
 
     def idxs_to_tokens(self, idxs: torch.LongTensor) -> List:
@@ -450,7 +501,10 @@ def ctc_decoder(
 
     if type(lm) == str:
         if _KenLM is None:
-            raise RuntimeError("flashlight is installed, but KenLM is not installed. Please install KenLM.")
+            raise RuntimeError(
+                "flashlight-text is installed, but KenLM is not installed. "
+                "Please refer to https://github.com/kpu/kenlm#python-module for how to install it."
+            )
         lm = _KenLM(lm, word_dict)
     elif lm is None:
         lm = _ZeroLM()