PyPI - torchaudio - Versions diffs - 2.0.2__cp310-cp310-manylinux1_x86_64.whl → 2.1.1__cp310-cp310-manylinux1_x86_64.whl - Mend

torchaudio 2.0.2__cp310-cp310-manylinux1_x86_64.whl → 2.1.1__cp310-cp310-manylinux1_x86_64.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release.

This version of torchaudio might be problematic. Click here for more details.

Files changed (92) hide show

torchaudio/__init__.py +22 -3
torchaudio/_backend/__init__.py +55 -4
torchaudio/_backend/backend.py +53 -0
torchaudio/_backend/common.py +52 -0
torchaudio/_backend/ffmpeg.py +373 -0
torchaudio/_backend/soundfile.py +54 -0
torchaudio/_backend/soundfile_backend.py +457 -0
torchaudio/_backend/sox.py +91 -0
torchaudio/_backend/utils.py +81 -323
torchaudio/_extension/__init__.py +55 -36
torchaudio/_extension/utils.py +109 -17
torchaudio/_internal/__init__.py +4 -1
torchaudio/_internal/module_utils.py +37 -6
torchaudio/backend/__init__.py +7 -11
torchaudio/backend/_no_backend.py +24 -0
torchaudio/backend/_sox_io_backend.py +297 -0
torchaudio/backend/common.py +12 -52
torchaudio/backend/no_backend.py +11 -21
torchaudio/backend/soundfile_backend.py +11 -448
torchaudio/backend/sox_io_backend.py +11 -435
torchaudio/backend/utils.py +9 -18
torchaudio/datasets/__init__.py +2 -0
torchaudio/datasets/cmuarctic.py +1 -1
torchaudio/datasets/cmudict.py +61 -62
torchaudio/datasets/dr_vctk.py +1 -1
torchaudio/datasets/gtzan.py +1 -1
torchaudio/datasets/librilight_limited.py +1 -1
torchaudio/datasets/librispeech.py +1 -1
torchaudio/datasets/librispeech_biasing.py +189 -0
torchaudio/datasets/libritts.py +1 -1
torchaudio/datasets/ljspeech.py +1 -1
torchaudio/datasets/musdb_hq.py +1 -1
torchaudio/datasets/quesst14.py +1 -1
torchaudio/datasets/speechcommands.py +1 -1
torchaudio/datasets/tedlium.py +1 -1
torchaudio/datasets/vctk.py +1 -1
torchaudio/datasets/voxceleb1.py +1 -1
torchaudio/datasets/yesno.py +1 -1
torchaudio/functional/__init__.py +6 -2
torchaudio/functional/_alignment.py +128 -0
torchaudio/functional/filtering.py +69 -92
torchaudio/functional/functional.py +99 -148
torchaudio/io/__init__.py +4 -1
torchaudio/io/_effector.py +347 -0
torchaudio/io/_stream_reader.py +158 -90
torchaudio/io/_stream_writer.py +196 -10
torchaudio/lib/_torchaudio.so +0 -0
torchaudio/lib/_torchaudio_ffmpeg4.so +0 -0
torchaudio/lib/_torchaudio_ffmpeg5.so +0 -0
torchaudio/lib/_torchaudio_ffmpeg6.so +0 -0
torchaudio/lib/_torchaudio_sox.so +0 -0
torchaudio/lib/libctc_prefix_decoder.so +0 -0
torchaudio/lib/libtorchaudio.so +0 -0
torchaudio/lib/libtorchaudio_ffmpeg4.so +0 -0
torchaudio/lib/libtorchaudio_ffmpeg5.so +0 -0
torchaudio/lib/libtorchaudio_ffmpeg6.so +0 -0
torchaudio/lib/libtorchaudio_sox.so +0 -0
torchaudio/lib/pybind11_prefixctc.so +0 -0
torchaudio/models/__init__.py +14 -0
torchaudio/models/decoder/__init__.py +22 -7
torchaudio/models/decoder/_ctc_decoder.py +123 -69
torchaudio/models/decoder/_cuda_ctc_decoder.py +187 -0
torchaudio/models/rnnt_decoder.py +10 -14
torchaudio/models/squim/__init__.py +11 -0
torchaudio/models/squim/objective.py +326 -0
torchaudio/models/squim/subjective.py +150 -0
torchaudio/models/wav2vec2/components.py +6 -10
torchaudio/pipelines/__init__.py +9 -0
torchaudio/pipelines/_squim_pipeline.py +176 -0
torchaudio/pipelines/_wav2vec2/aligner.py +87 -0
torchaudio/pipelines/_wav2vec2/impl.py +198 -68
torchaudio/pipelines/_wav2vec2/utils.py +120 -0
torchaudio/sox_effects/sox_effects.py +7 -30
torchaudio/transforms/__init__.py +2 -0
torchaudio/transforms/_transforms.py +99 -54
torchaudio/utils/download.py +2 -2
torchaudio/utils/ffmpeg_utils.py +20 -15
torchaudio/utils/sox_utils.py +8 -9
torchaudio/version.py +2 -2
torchaudio-2.1.1.dist-info/METADATA +113 -0
torchaudio-2.1.1.dist-info/RECORD +119 -0
torchaudio/io/_compat.py +0 -241
torchaudio/lib/_torchaudio_ffmpeg.so +0 -0
torchaudio/lib/flashlight_lib_text_decoder.so +0 -0
torchaudio/lib/flashlight_lib_text_dictionary.so +0 -0
torchaudio/lib/libflashlight-text.so +0 -0
torchaudio/lib/libtorchaudio_ffmpeg.so +0 -0
torchaudio-2.0.2.dist-info/METADATA +0 -26
torchaudio-2.0.2.dist-info/RECORD +0 -100
{torchaudio-2.0.2.dist-info → torchaudio-2.1.1.dist-info}/LICENSE +0 -0
{torchaudio-2.0.2.dist-info → torchaudio-2.1.1.dist-info}/WHEEL +0 -0
{torchaudio-2.0.2.dist-info → torchaudio-2.1.1.dist-info}/top_level.txt +0 -0

torchaudio/datasets/cmudict.py CHANGED Viewed

@@ -3,78 +3,77 @@ import re
 from pathlib import Path
 from typing import Iterable, List, Tuple, Union
-from torch.hub import download_url_to_file
 from torch.utils.data import Dataset
+from torchaudio._internal import download_url_to_file
 _CHECKSUMS = {
     "http://svn.code.sf.net/p/cmusphinx/code/trunk/cmudict/cmudict-0.7b": "209a8b4cd265013e96f4658632a9878103b0c5abf62b50d4ef3ae1be226b29e4",  # noqa: E501
     "http://svn.code.sf.net/p/cmusphinx/code/trunk/cmudict/cmudict-0.7b.symbols": "408ccaae803641c6d7b626b6299949320c2dbca96b2220fd3fb17887b023b027",  # noqa: E501
 }
-_PUNCTUATIONS = set(
-    [
-        "!EXCLAMATION-POINT",
-        '"CLOSE-QUOTE',
-        '"DOUBLE-QUOTE',
-        '"END-OF-QUOTE',
-        '"END-QUOTE',
-        '"IN-QUOTES',
-        '"QUOTE',
-        '"UNQUOTE',
-        "#HASH-MARK",
-        "#POUND-SIGN",
-        "#SHARP-SIGN",
-        "%PERCENT",
-        "&AMPERSAND",
-        "'END-INNER-QUOTE",
-        "'END-QUOTE",
-        "'INNER-QUOTE",
-        "'QUOTE",
-        "'SINGLE-QUOTE",
-        "(BEGIN-PARENS",
-        "(IN-PARENTHESES",
-        "(LEFT-PAREN",
-        "(OPEN-PARENTHESES",
-        "(PAREN",
-        "(PARENS",
-        "(PARENTHESES",
-        ")CLOSE-PAREN",
-        ")CLOSE-PARENTHESES",
-        ")END-PAREN",
-        ")END-PARENS",
-        ")END-PARENTHESES",
-        ")END-THE-PAREN",
-        ")PAREN",
-        ")PARENS",
-        ")RIGHT-PAREN",
-        ")UN-PARENTHESES",
-        "+PLUS",
-        ",COMMA",
-        "--DASH",
-        "-DASH",
-        "-HYPHEN",
-        "...ELLIPSIS",
-        ".DECIMAL",
-        ".DOT",
-        ".FULL-STOP",
-        ".PERIOD",
-        ".POINT",
-        "/SLASH",
-        ":COLON",
-        ";SEMI-COLON",
-        ";SEMI-COLON(1)",
-        "?QUESTION-MARK",
-        "{BRACE",
-        "{LEFT-BRACE",
-        "{OPEN-BRACE",
-        "}CLOSE-BRACE",
-        "}RIGHT-BRACE",
-    ]
-)
+_PUNCTUATIONS = {
+    "!EXCLAMATION-POINT",
+    '"CLOSE-QUOTE',
+    '"DOUBLE-QUOTE',
+    '"END-OF-QUOTE',
+    '"END-QUOTE',
+    '"IN-QUOTES',
+    '"QUOTE',
+    '"UNQUOTE',
+    "#HASH-MARK",
+    "#POUND-SIGN",
+    "#SHARP-SIGN",
+    "%PERCENT",
+    "&AMPERSAND",
+    "'END-INNER-QUOTE",
+    "'END-QUOTE",
+    "'INNER-QUOTE",
+    "'QUOTE",
+    "'SINGLE-QUOTE",
+    "(BEGIN-PARENS",
+    "(IN-PARENTHESES",
+    "(LEFT-PAREN",
+    "(OPEN-PARENTHESES",
+    "(PAREN",
+    "(PARENS",
+    "(PARENTHESES",
+    ")CLOSE-PAREN",
+    ")CLOSE-PARENTHESES",
+    ")END-PAREN",
+    ")END-PARENS",
+    ")END-PARENTHESES",
+    ")END-THE-PAREN",
+    ")PAREN",
+    ")PARENS",
+    ")RIGHT-PAREN",
+    ")UN-PARENTHESES",
+    "+PLUS",
+    ",COMMA",
+    "--DASH",
+    "-DASH",
+    "-HYPHEN",
+    "...ELLIPSIS",
+    ".DECIMAL",
+    ".DOT",
+    ".FULL-STOP",
+    ".PERIOD",
+    ".POINT",
+    "/SLASH",
+    ":COLON",
+    ";SEMI-COLON",
+    ";SEMI-COLON(1)",
+    "?QUESTION-MARK",
+    "{BRACE",
+    "{LEFT-BRACE",
+    "{OPEN-BRACE",
+    "}CLOSE-BRACE",
+    "}RIGHT-BRACE",
+}
 def _parse_dictionary(lines: Iterable[str], exclude_punctuations: bool) -> List[str]:
     _alt_re = re.compile(r"\([0-9]+\)")
-    cmudict: List[Tuple[str, List[str]]] = list()
+    cmudict: List[Tuple[str, List[str]]] = []
     for line in lines:
         if not line or line.startswith(";;;"):  # ignore comments
             continue

torchaudio/datasets/dr_vctk.py CHANGED Viewed

@@ -3,8 +3,8 @@ from typing import Dict, Tuple, Union
 import torchaudio
 from torch import Tensor
-from torch.hub import download_url_to_file
 from torch.utils.data import Dataset
+from torchaudio._internal import download_url_to_file
 from torchaudio.datasets.utils import _extract_zip

torchaudio/datasets/gtzan.py CHANGED Viewed

@@ -4,8 +4,8 @@ from typing import Optional, Tuple, Union
 import torchaudio
 from torch import Tensor
-from torch.hub import download_url_to_file
 from torch.utils.data import Dataset
+from torchaudio._internal import download_url_to_file
 from torchaudio.datasets.utils import _extract_tar
 # The following lists prefixed with `filtered_` provide a filtered split

torchaudio/datasets/librilight_limited.py CHANGED Viewed

@@ -4,8 +4,8 @@ from typing import List, Tuple, Union
 import torchaudio
 from torch import Tensor
-from torch.hub import download_url_to_file
 from torch.utils.data import Dataset
+from torchaudio._internal import download_url_to_file
 from torchaudio.datasets.librispeech import _get_librispeech_metadata
 from torchaudio.datasets.utils import _extract_tar

torchaudio/datasets/librispeech.py CHANGED Viewed

@@ -3,8 +3,8 @@ from pathlib import Path
 from typing import Tuple, Union
 from torch import Tensor
-from torch.hub import download_url_to_file
 from torch.utils.data import Dataset
+from torchaudio._internal import download_url_to_file
 from torchaudio.datasets.utils import _extract_tar, _load_waveform
 URL = "train-clean-100"

torchaudio/datasets/librispeech_biasing.py ADDED Viewed

@@ -0,0 +1,189 @@
+import os
+from pathlib import Path
+from typing import List, Tuple, Union
+from torch import Tensor
+from torch.utils.data import Dataset
+from torchaudio._internal import download_url_to_file
+from torchaudio.datasets.utils import _extract_tar, _load_waveform
+# Default subset; used as the default value of ``url`` in ``LibriSpeechBiasing.__init__``.
+URL = "train-clean-100"
+# Default value of ``folder_in_archive``: the top-level directory of the dataset under ``root``.
+FOLDER_IN_ARCHIVE = "LibriSpeech"
+# Sample rate reported in the metadata of every utterance.
+SAMPLE_RATE = 16000
+# Subset names accepted as ``url``; any other value raises ``ValueError`` in ``__init__``.
+_DATA_SUBSETS = [
+    "dev-clean",
+    "dev-other",
+    "test-clean",
+    "test-other",
+    "train-clean-100",
+    "train-clean-360",
+    "train-other-500",
+]
+# Maps the download URL of each subset archive to the value passed as ``hash_prefix``
+# to ``download_url_to_file``.
+_CHECKSUMS = {
+    "http://www.openslr.org/resources/12/dev-clean.tar.gz": "76f87d090650617fca0cac8f88b9416e0ebf80350acb97b343a85fa903728ab3",  # noqa: E501
+    "http://www.openslr.org/resources/12/dev-other.tar.gz": "12661c48e8c3fe1de2c1caa4c3e135193bfb1811584f11f569dd12645aa84365",  # noqa: E501
+    "http://www.openslr.org/resources/12/test-clean.tar.gz": "39fde525e59672dc6d1551919b1478f724438a95aa55f874b576be21967e6c23",  # noqa: E501
+    "http://www.openslr.org/resources/12/test-other.tar.gz": "d09c181bba5cf717b3dee7d4d592af11a3ee3a09e08ae025c5506f6ebe961c29",  # noqa: E501
+    "http://www.openslr.org/resources/12/train-clean-100.tar.gz": "d4ddd1d5a6ab303066f14971d768ee43278a5f2a0aa43dc716b0e64ecbbbf6e2",  # noqa: E501
+    "http://www.openslr.org/resources/12/train-clean-360.tar.gz": "146a56496217e96c14334a160df97fffedd6e0a04e66b9c5af0d40be3c792ecf",  # noqa: E501
+    "http://www.openslr.org/resources/12/train-other-500.tar.gz": "ddb22f27f96ec163645d53215559df6aa36515f26e01dd70798188350adcb6d2",  # noqa: E501
+}
+def _download_librispeech(root: str, url: str) -> None:
+    """Download the archive of one LibriSpeech subset into ``root`` and extract it.
+    Args:
+        root (str): Directory in which the ``.tar.gz`` archive is stored.
+        url (str): Name of the subset (e.g. ``"train-clean-100"``); together with the
+            ``.tar.gz`` extension it forms the archive's file name.
+    """
+    base_url = "http://www.openslr.org/resources/12/"
+    ext_archive = ".tar.gz"
+    filename = url + ext_archive
+    archive = os.path.join(root, filename)
+    # URLs always use "/", so build the address by concatenation instead of the
+    # filesystem helper ``os.path.join``; the result must match a key of ``_CHECKSUMS``.
+    download_url = base_url + filename
+    if not os.path.isfile(archive):
+        # An unknown URL yields ``None``, which is forwarded as ``hash_prefix`` unchanged.
+        checksum = _CHECKSUMS.get(download_url)
+        download_url_to_file(download_url, archive, hash_prefix=checksum)
+    # Extraction runs even when the archive was already present on disk.
+    _extract_tar(archive)
+def _get_librispeech_metadata(
+    fileid: str,
+    root: str,
+    folder: str,
+    ext_audio: str,
+    ext_txt: str,
+    blist: Union[List[str], None],
+) -> Tuple[str, int, str, int, int, int, List[str]]:
+    """Collect the metadata of one utterance, including the biasing words it contains.
+    Args:
+        fileid (str): Utterance identifier of the form ``"<speaker>-<chapter>-<utterance>"``.
+        root (str): Directory that contains ``folder``.
+        folder (str): Sub-directory of ``root`` holding the ``<speaker>/<chapter>`` tree.
+        ext_audio (str): Extension appended to the utterance identifier to name the audio file.
+        ext_txt (str): Suffix appended to ``"<speaker>-<chapter>"`` to name the transcript file.
+        blist (list of str or None): Biasing words; ``None`` is treated as an empty list.
+    Returns:
+        Tuple of (audio path relative to ``root``, sample rate, transcript, speaker ID,
+        chapter ID, utterance ID, biasing words found in the transcript). The biasing words
+        are unique and keep their order of first appearance in the transcript.
+    Raises:
+        FileNotFoundError: If the transcript file has no entry for ``fileid``.
+    """
+    # A set keeps the per-word membership test O(1) even for large biasing lists.
+    bias_words = set(blist) if blist else set()
+    speaker_id, chapter_id, utterance_id = fileid.split("-")
+    # Audio path; note that it is not prefixed with ``root``.
+    fileid_audio = f"{speaker_id}-{chapter_id}-{utterance_id}"
+    filepath = os.path.join(folder, speaker_id, chapter_id, f"{fileid_audio}{ext_audio}")
+    # One transcript file per chapter; each line is "<fileid> <transcript>".
+    file_text = f"{speaker_id}-{chapter_id}{ext_txt}"
+    file_text = os.path.join(root, folder, speaker_id, chapter_id, file_text)
+    with open(file_text) as ft:
+        for line in ft:
+            fileid_text, transcript = line.strip().split(" ", 1)
+            if fileid_audio == fileid_text:
+                break
+        else:
+            # Transcript not found
+            raise FileNotFoundError(f"Translation not found for {fileid_audio}")
+    # ``dict.fromkeys`` removes duplicates while preserving first-appearance order.
+    uttblist = list(dict.fromkeys(word for word in transcript.split() if word in bias_words))
+    return (
+        filepath,
+        SAMPLE_RATE,
+        transcript,
+        int(speaker_id),
+        int(chapter_id),
+        int(utterance_id),
+        uttblist,
+    )
+class LibriSpeechBiasing(Dataset):
+    """*LibriSpeech* :cite:`7178964` dataset with prefix-tree construction and biasing support.
+    Args:
+        root (str or Path): Path to the directory where the dataset is found or downloaded.
+        url (str, optional): The subset of the dataset to use (and to download when
+            ``download=True``).
+            Allowed values are ``"dev-clean"``, ``"dev-other"``, ``"test-clean"``,
+            ``"test-other"``, ``"train-clean-100"``, ``"train-clean-360"`` and
+            ``"train-other-500"``. (default: ``"train-clean-100"``)
+        folder_in_archive (str, optional):
+            The top-level directory of the dataset. (default: ``"LibriSpeech"``)
+        download (bool, optional):
+            Whether to download the dataset if it is not found at root path. (default: ``False``).
+        blist (list of str or None, optional):
+            The list of biasing words. ``None`` is treated as an empty list when the
+            per-utterance biasing words are computed. (default: ``None``)
+    Raises:
+        ValueError: If ``url`` is not one of the allowed subset names.
+        RuntimeError: If the subset directory does not exist and ``download`` is ``False``.
+    """
+    _ext_txt = ".trans.txt"
+    _ext_audio = ".flac"
+    def __init__(
+        self,
+        root: Union[str, Path],
+        url: str = URL,
+        folder_in_archive: str = FOLDER_IN_ARCHIVE,
+        download: bool = False,
+        blist: Union[List[str], None] = None,
+    ) -> None:
+        self._url = url
+        if url not in _DATA_SUBSETS:
+            raise ValueError(f"Invalid url '{url}' given; please provide one of {_DATA_SUBSETS}.")
+        root = os.fspath(root)
+        # ``_archive`` is the dataset's top-level directory, ``_path`` the selected subset in it.
+        self._archive = os.path.join(root, folder_in_archive)
+        self._path = os.path.join(root, folder_in_archive, url)
+        if not os.path.isdir(self._path):
+            if download:
+                _download_librispeech(root, url)
+            else:
+                raise RuntimeError(
+                    f"Dataset not found at {self._path}. Please set `download=True` to download the dataset."
+                )
+        # Sorted utterance identifiers (file stems) found under ``<speaker>/<chapter>/``.
+        self._walker = sorted(str(p.stem) for p in Path(self._path).glob("*/*/*" + self._ext_audio))
+        self.blist = blist
+    def get_metadata(self, n: int) -> Tuple[str, int, str, int, int, int, List[str]]:
+        """Get metadata for the n-th sample from the dataset. Returns filepath instead of waveform,
+        but otherwise returns the same fields as :py:func:`__getitem__`.
+        Args:
+            n (int): The index of the sample to be loaded
+        Returns:
+            Tuple of the following items;
+            str:
+                Path to audio
+            int:
+                Sample rate
+            str:
+                Transcript
+            int:
+                Speaker ID
+            int:
+                Chapter ID
+            int:
+                Utterance ID
+            list:
+                List of biasing words in the utterance
+        """
+        fileid = self._walker[n]
+        return _get_librispeech_metadata(fileid, self._archive, self._url, self._ext_audio, self._ext_txt, self.blist)
+    def __getitem__(self, n: int) -> Tuple[Tensor, int, str, int, int, int, List[str]]:
+        """Load the n-th sample from the dataset.
+        Args:
+            n (int): The index of the sample to be loaded
+        Returns:
+            Tuple of the following items;
+            Tensor:
+                Waveform
+            int:
+                Sample rate
+            str:
+                Transcript
+            int:
+                Speaker ID
+            int:
+                Chapter ID
+            int:
+                Utterance ID
+            list:
+                List of biasing words in the utterance
+        """
+        metadata = self.get_metadata(n)
+        # The metadata path is relative to ``_archive``; swap it for the loaded waveform.
+        waveform = _load_waveform(self._archive, metadata[0], metadata[1])
+        return (waveform,) + metadata[1:]
+    def __len__(self) -> int:
+        """Return the number of utterances found in the selected subset."""
+        return len(self._walker)

torchaudio/datasets/libritts.py CHANGED Viewed

@@ -4,8 +4,8 @@ from typing import Tuple, Union
 import torchaudio
 from torch import Tensor
-from torch.hub import download_url_to_file
 from torch.utils.data import Dataset
+from torchaudio._internal import download_url_to_file
 from torchaudio.datasets.utils import _extract_tar
 URL = "train-clean-100"

torchaudio/datasets/ljspeech.py CHANGED Viewed

@@ -5,8 +5,8 @@ from typing import Tuple, Union
 import torchaudio
 from torch import Tensor
-from torch.hub import download_url_to_file
 from torch.utils.data import Dataset
+from torchaudio._internal import download_url_to_file
 from torchaudio.datasets.utils import _extract_tar

torchaudio/datasets/musdb_hq.py CHANGED Viewed

@@ -4,8 +4,8 @@ from typing import List, Optional, Tuple, Union
 import torch
 import torchaudio
-from torch.hub import download_url_to_file
 from torch.utils.data import Dataset
+from torchaudio._internal import download_url_to_file
 from torchaudio.datasets.utils import _extract_zip
 _URL = "https://zenodo.org/record/3338373/files/musdb18hq.zip"

torchaudio/datasets/quesst14.py CHANGED Viewed

@@ -4,8 +4,8 @@ from pathlib import Path
 from typing import Optional, Tuple, Union
 import torch
-from torch.hub import download_url_to_file
 from torch.utils.data import Dataset
+from torchaudio._internal import download_url_to_file
 from torchaudio.datasets.utils import _extract_tar, _load_waveform

torchaudio/datasets/speechcommands.py CHANGED Viewed

@@ -3,8 +3,8 @@ from pathlib import Path
 from typing import Optional, Tuple, Union
 from torch import Tensor
-from torch.hub import download_url_to_file
 from torch.utils.data import Dataset
+from torchaudio._internal import download_url_to_file
 from torchaudio.datasets.utils import _extract_tar, _load_waveform
 FOLDER_IN_ARCHIVE = "SpeechCommands"

torchaudio/datasets/tedlium.py CHANGED Viewed

@@ -4,8 +4,8 @@ from typing import Tuple, Union
 import torchaudio
 from torch import Tensor
-from torch.hub import download_url_to_file
 from torch.utils.data import Dataset
+from torchaudio._internal import download_url_to_file
 from torchaudio.datasets.utils import _extract_tar

torchaudio/datasets/vctk.py CHANGED Viewed

@@ -3,8 +3,8 @@ from typing import Tuple
 import torchaudio
 from torch import Tensor
-from torch.hub import download_url_to_file
 from torch.utils.data import Dataset
+from torchaudio._internal import download_url_to_file
 from torchaudio.datasets.utils import _extract_zip
 URL = "https://datashare.is.ed.ac.uk/bitstream/handle/10283/3443/VCTK-Corpus-0.92.zip"

torchaudio/datasets/voxceleb1.py CHANGED Viewed

@@ -3,8 +3,8 @@ from pathlib import Path
 from typing import List, Tuple, Union
 from torch import Tensor
-from torch.hub import download_url_to_file
 from torch.utils.data import Dataset
+from torchaudio._internal import download_url_to_file
 from torchaudio.datasets.utils import _extract_zip, _load_waveform

torchaudio/datasets/yesno.py CHANGED Viewed

@@ -4,8 +4,8 @@ from typing import List, Tuple, Union
 import torchaudio
 from torch import Tensor
-from torch.hub import download_url_to_file
 from torch.utils.data import Dataset
+from torchaudio._internal import download_url_to_file
 from torchaudio.datasets.utils import _extract_tar

torchaudio/functional/__init__.py CHANGED Viewed

@@ -1,3 +1,4 @@
+from ._alignment import forced_align, merge_tokens, TokenSpan
 from .filtering import (
     allpass_biquad,
     band_biquad,
@@ -28,7 +29,6 @@ from .functional import (
     apply_beamforming,
     apply_codec,
     compute_deltas,
-    compute_kaldi_pitch,
     convolve,
     create_dct,
     DB_to_amplitude,
@@ -36,6 +36,7 @@ from .functional import (
     detect_pitch_frequency,
     edit_distance,
     fftconvolve,
+    frechet_distance,
     griffinlim,
     inverse_spectrogram,
     linear_fbanks,
@@ -64,7 +65,6 @@ from .functional import (
 __all__ = [
     "amplitude_to_DB",
     "compute_deltas",
-    "compute_kaldi_pitch",
     "create_dct",
     "melscale_fbanks",
     "linear_fbanks",
@@ -94,6 +94,9 @@ __all__ = [
     "equalizer_biquad",
     "filtfilt",
     "flanger",
+    "forced_align",
+    "merge_tokens",
+    "TokenSpan",
     "gain",
     "highpass_biquad",
     "lfilter",
@@ -120,4 +123,5 @@ __all__ = [
     "speed",
     "preemphasis",
     "deemphasis",
+    "frechet_distance",
 ]

torchaudio/functional/_alignment.py ADDED Viewed

@@ -0,0 +1,128 @@
+from dataclasses import dataclass
+from typing import List, Optional, Tuple
+import torch
+from torch import Tensor
+from torchaudio._extension import fail_if_no_align
+__all__ = []
+@fail_if_no_align
+def forced_align(
+    log_probs: Tensor,
+    targets: Tensor,
+    input_lengths: Optional[Tensor] = None,
+    target_lengths: Optional[Tensor] = None,
+    blank: int = 0,
+) -> Tuple[Tensor, Tensor]:
+    r"""Align a CTC label sequence to an emission.
+    .. devices:: CPU CUDA
+    .. properties:: TorchScript
+    Args:
+        log_probs (Tensor): log probability of CTC emission output.
+            Tensor of shape `(B, T, C)`. where `B` is the batch size, `T` is the input length,
+            `C` is the number of characters in alphabet including blank.
+        targets (Tensor): Target sequence. Tensor of shape `(B, L)`,
+            where `L` is the target length.
+        input_lengths (Tensor or None, optional):
+            Lengths of the inputs (each value must be <= `T`). 1-D Tensor of shape `(B,)`.
+            If ``None``, every input is assumed to span the full length `T`.
+        target_lengths (Tensor or None, optional):
+            Lengths of the targets. 1-D Tensor of shape `(B,)`.
+            If ``None``, every target is assumed to span the full length `L`.
+        blank (int, optional): The index of blank symbol in CTC emission. (Default: 0)
+    Returns:
+        Tuple(Tensor, Tensor):
+            Tensor: Label for each time step in the alignment path computed using forced alignment.
+            Tensor: Log probability scores of the labels for each time step.
+    Raises:
+        ValueError: If ``targets`` contains the ``blank`` index, or if the largest value in
+            ``targets`` is not smaller than the size of the last dimension of ``log_probs``.
+    Note:
+        The sequence length of `log_probs` must satisfy:
+        .. math::
+            L_{\text{log\_probs}} \ge L_{\text{label}} + N_{\text{repeat}}
+        where :math:`N_{\text{repeat}}` is the number of consecutively repeated tokens.
+        For example, in str `"aabbc"`, the number of repeats is `2`.
+    Note:
+        The current version only supports ``batch_size==1``.
+    """
+    if blank in targets:
+        raise ValueError(f"targets Tensor shouldn't contain blank index. Found {targets}.")
+    if torch.max(targets) >= log_probs.shape[-1]:
+        raise ValueError("targets values must be less than the CTC dimension")
+    # Missing lengths default to the full size of the corresponding dimension.
+    if input_lengths is None:
+        batch_size, length = log_probs.size(0), log_probs.size(1)
+        input_lengths = torch.full((batch_size,), length, dtype=torch.int64, device=log_probs.device)
+    if target_lengths is None:
+        batch_size, length = targets.size(0), targets.size(1)
+        target_lengths = torch.full((batch_size,), length, dtype=torch.int64, device=targets.device)
+    # For TorchScript compatibility
+    assert input_lengths is not None
+    assert target_lengths is not None
+    paths, scores = torch.ops.torchaudio.forced_align(log_probs, targets, input_lengths, target_lengths, blank)
+    return paths, scores
+@dataclass
+class TokenSpan:
+    """TokenSpan()
+    Token with time stamps and score. Returned by :py:func:`merge_tokens`.
+    """
+    token: int
+    """The token"""
+    start: int
+    """The start time (inclusive) in emission time axis."""
+    end: int
+    """The end time (exclusive) in emission time axis."""
+    score: float
+    """The score of this token."""
+    def __len__(self) -> int:
+        """Return the length of the time span, i.e. ``end - start``."""
+        return self.end - self.start
+def merge_tokens(tokens: Tensor, scores: Tensor, blank: int = 0) -> List[TokenSpan]:
+    """Collapse a frame-level CTC token sequence into one span per run of a non-blank token.
+    Args:
+        tokens (Tensor): Alignment tokens (unbatched) returned from :py:func:`forced_align`.
+            Shape: `(time, )`.
+        scores (Tensor): Alignment scores (unbatched) returned from :py:func:`forced_align`.
+            Shape: `(time, )`. The score of a span is the mean of the given scores over
+            the frames the span covers.
+        blank (int, optional): Token value treated as blank; runs of it produce no span.
+            (Default: 0)
+    Returns:
+        list of TokenSpan
+    Raises:
+        ValueError: If ``tokens`` or ``scores`` is not 1D, or if their lengths differ.
+    Example:
+        >>> aligned_tokens, scores = forced_align(emission, targets, input_lengths, target_lengths)
+        >>> token_spans = merge_tokens(aligned_tokens[0], scores[0])
+    """
+    if not (tokens.ndim == 1 and scores.ndim == 1):
+        raise ValueError("`tokens` and `scores` must be 1D Tensor.")
+    if len(tokens) != len(scores):
+        raise ValueError("`tokens` and `scores` must be the same length.")
+    # Pad both ends with -1 so that, for non-negative token values, the first and the
+    # last frame always register as a change.
+    sentinel = torch.tensor([-1], device=tokens.device)
+    frame_delta = torch.diff(tokens, prepend=sentinel, append=sentinel)
+    # Indices at which the token value changes; consecutive pairs delimit one run.
+    boundaries = torch.nonzero(frame_delta != 0).squeeze().tolist()
+    token_ids = tokens.tolist()
+    spans = []
+    for start, end in zip(boundaries[:-1], boundaries[1:]):
+        token = token_ids[start]
+        if token == blank:
+            continue
+        spans.append(TokenSpan(token=token, start=start, end=end, score=scores[start:end].mean().item()))
+    return spans