PyPI - torchaudio - Versions diffs - 2.0.2__cp38-cp38-win_amd64.whl → 2.1.1__cp38-cp38-win_amd64.whl - Mend

torchaudio 2.0.2__cp38-cp38-win_amd64.whl → 2.1.1__cp38-cp38-win_amd64.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release.

This version of torchaudio might be problematic. Click here for more details.

Files changed (88)

torchaudio/__init__.py +22 -3
torchaudio/_backend/__init__.py +55 -4
torchaudio/_backend/backend.py +53 -0
torchaudio/_backend/common.py +52 -0
torchaudio/_backend/ffmpeg.py +373 -0
torchaudio/_backend/soundfile.py +54 -0
torchaudio/_backend/soundfile_backend.py +457 -0
torchaudio/_backend/sox.py +91 -0
torchaudio/_backend/utils.py +81 -323
torchaudio/_extension/__init__.py +55 -36
torchaudio/_extension/utils.py +109 -17
torchaudio/_internal/__init__.py +4 -1
torchaudio/_internal/module_utils.py +37 -6
torchaudio/backend/__init__.py +7 -11
torchaudio/backend/_no_backend.py +24 -0
torchaudio/backend/_sox_io_backend.py +297 -0
torchaudio/backend/common.py +12 -52
torchaudio/backend/no_backend.py +11 -21
torchaudio/backend/soundfile_backend.py +11 -448
torchaudio/backend/sox_io_backend.py +11 -435
torchaudio/backend/utils.py +9 -18
torchaudio/datasets/__init__.py +2 -0
torchaudio/datasets/cmuarctic.py +1 -1
torchaudio/datasets/cmudict.py +61 -62
torchaudio/datasets/dr_vctk.py +1 -1
torchaudio/datasets/gtzan.py +1 -1
torchaudio/datasets/librilight_limited.py +1 -1
torchaudio/datasets/librispeech.py +1 -1
torchaudio/datasets/librispeech_biasing.py +189 -0
torchaudio/datasets/libritts.py +1 -1
torchaudio/datasets/ljspeech.py +1 -1
torchaudio/datasets/musdb_hq.py +1 -1
torchaudio/datasets/quesst14.py +1 -1
torchaudio/datasets/speechcommands.py +1 -1
torchaudio/datasets/tedlium.py +1 -1
torchaudio/datasets/vctk.py +1 -1
torchaudio/datasets/voxceleb1.py +1 -1
torchaudio/datasets/yesno.py +1 -1
torchaudio/functional/__init__.py +6 -2
torchaudio/functional/_alignment.py +128 -0
torchaudio/functional/filtering.py +69 -92
torchaudio/functional/functional.py +99 -148
torchaudio/io/__init__.py +4 -1
torchaudio/io/_effector.py +347 -0
torchaudio/io/_stream_reader.py +158 -90
torchaudio/io/_stream_writer.py +196 -10
torchaudio/lib/_torchaudio.pyd +0 -0
torchaudio/lib/_torchaudio_ffmpeg4.pyd +0 -0
torchaudio/lib/_torchaudio_ffmpeg5.pyd +0 -0
torchaudio/lib/_torchaudio_ffmpeg6.pyd +0 -0
torchaudio/lib/libtorchaudio.pyd +0 -0
torchaudio/lib/libtorchaudio_ffmpeg4.pyd +0 -0
torchaudio/lib/libtorchaudio_ffmpeg5.pyd +0 -0
torchaudio/lib/libtorchaudio_ffmpeg6.pyd +0 -0
torchaudio/models/__init__.py +14 -0
torchaudio/models/decoder/__init__.py +22 -7
torchaudio/models/decoder/_ctc_decoder.py +123 -69
torchaudio/models/decoder/_cuda_ctc_decoder.py +187 -0
torchaudio/models/rnnt_decoder.py +10 -14
torchaudio/models/squim/__init__.py +11 -0
torchaudio/models/squim/objective.py +326 -0
torchaudio/models/squim/subjective.py +150 -0
torchaudio/models/wav2vec2/components.py +6 -10
torchaudio/pipelines/__init__.py +9 -0
torchaudio/pipelines/_squim_pipeline.py +176 -0
torchaudio/pipelines/_wav2vec2/aligner.py +87 -0
torchaudio/pipelines/_wav2vec2/impl.py +198 -68
torchaudio/pipelines/_wav2vec2/utils.py +120 -0
torchaudio/sox_effects/sox_effects.py +7 -30
torchaudio/transforms/__init__.py +2 -0
torchaudio/transforms/_transforms.py +99 -54
torchaudio/utils/download.py +2 -2
torchaudio/utils/ffmpeg_utils.py +20 -15
torchaudio/utils/sox_utils.py +8 -9
torchaudio/version.py +2 -2
torchaudio-2.1.1.dist-info/METADATA +113 -0
torchaudio-2.1.1.dist-info/RECORD +115 -0
{torchaudio-2.0.2.dist-info → torchaudio-2.1.1.dist-info}/WHEEL +1 -1
torchaudio/io/_compat.py +0 -241
torchaudio/lib/_torchaudio_ffmpeg.pyd +0 -0
torchaudio/lib/flashlight_lib_text_decoder.pyd +0 -0
torchaudio/lib/flashlight_lib_text_dictionary.pyd +0 -0
torchaudio/lib/libflashlight-text.pyd +0 -0
torchaudio/lib/libtorchaudio_ffmpeg.pyd +0 -0
torchaudio-2.0.2.dist-info/METADATA +0 -26
torchaudio-2.0.2.dist-info/RECORD +0 -98
{torchaudio-2.0.2.dist-info → torchaudio-2.1.1.dist-info}/LICENSE +0 -0
{torchaudio-2.0.2.dist-info → torchaudio-2.1.1.dist-info}/top_level.txt +0 -0

torchaudio/io/_stream_reader.py CHANGED

@@ -1,12 +1,17 @@
 from __future__ import annotations
 from dataclasses import dataclass
-from typing import BinaryIO, Dict, Iterator, Optional, Tuple, Union
+from typing import BinaryIO, Dict, Iterator, Optional, Tuple, TypeVar, Union
 import torch
 import torchaudio
 from torch.utils._pytree import tree_map
+if torchaudio._extension._FFMPEG_EXT is not None:
+    _StreamReader = torchaudio._extension._FFMPEG_EXT.StreamReader
+    _StreamReaderFileObj = torchaudio._extension._FFMPEG_EXT.StreamReaderFileObj
 __all__ = [
     "StreamReader",
 ]
@@ -103,70 +108,44 @@ class SourceVideoStream(SourceStream):
     """Frame rate."""
-# Indices of SrcInfo returned by low-level `get_src_stream_info`
-# - COMMON
-_MEDIA_TYPE = 0
-_CODEC = 1
-_CODEC_LONG = 2
-_FORMAT = 3
-_BIT_RATE = 4
-_NUM_FRAMES = 5
-_BPS = 6
-_METADATA = 7
-# - AUDIO
-_SAMPLE_RATE = 8
-_NUM_CHANNELS = 9
-# - VIDEO
-_WIDTH = 10
-_HEIGHT = 11
-_FRAME_RATE = 12
 def _parse_si(i):
-    media_type = i[_MEDIA_TYPE]
-    codec_name = i[_CODEC]
-    codec_long_name = i[_CODEC_LONG]
-    fmt = i[_FORMAT]
-    bit_rate = i[_BIT_RATE]
-    num_frames = i[_NUM_FRAMES]
-    bps = i[_BPS]
-    metadata = i[_METADATA]
+    media_type = i.media_type
     if media_type == "audio":
         return SourceAudioStream(
-            media_type=media_type,
-            codec=codec_name,
-            codec_long_name=codec_long_name,
-            format=fmt,
-            bit_rate=bit_rate,
-            num_frames=num_frames,
-            bits_per_sample=bps,
-            metadata=metadata,
-            sample_rate=i[_SAMPLE_RATE],
-            num_channels=i[_NUM_CHANNELS],
+            media_type=i.media_type,
+            codec=i.codec_name,
+            codec_long_name=i.codec_long_name,
+            format=i.format,
+            bit_rate=i.bit_rate,
+            num_frames=i.num_frames,
+            bits_per_sample=i.bits_per_sample,
+            metadata=i.metadata,
+            sample_rate=i.sample_rate,
+            num_channels=i.num_channels,
         )
     if media_type == "video":
         return SourceVideoStream(
-            media_type=media_type,
-            codec=codec_name,
-            codec_long_name=codec_long_name,
-            format=fmt,
-            bit_rate=bit_rate,
-            num_frames=num_frames,
-            bits_per_sample=bps,
-            metadata=metadata,
-            width=i[_WIDTH],
-            height=i[_HEIGHT],
-            frame_rate=i[_FRAME_RATE],
+            media_type=i.media_type,
+            codec=i.codec_name,
+            codec_long_name=i.codec_long_name,
+            format=i.format,
+            bit_rate=i.bit_rate,
+            num_frames=i.num_frames,
+            bits_per_sample=i.bits_per_sample,
+            metadata=i.metadata,
+            width=i.width,
+            height=i.height,
+            frame_rate=i.frame_rate,
         )
     return SourceStream(
-        media_type=media_type,
-        codec=codec_name,
-        codec_long_name=codec_long_name,
+        media_type=i.media_type,
+        codec=i.codec_name,
+        codec_long_name=i.codec_long_name,
         format=None,
         bit_rate=None,
         num_frames=None,
         bits_per_sample=None,
-        metadata=metadata,
+        metadata=i.metadata,
     )
@@ -180,18 +159,93 @@ class OutputStream:
     """Index of the source stream that this output stream is connected."""
     filter_description: str
     """Description of filter graph applied to the source stream."""
+    media_type: str
+    """The type of the stream. ``"audio"`` or ``"video"``."""
+    format: str
+    """Media format. Such as ``"s16"`` and ``"yuv420p"``.
+    Commonly found audio values are;
+    - ``"u8"``, ``"u8p"``: Unsigned 8-bit unsigned interger.
+    - ``"s16"``, ``"s16p"``: 16-bit signed integer.
+    - ``"s32"``, ``"s32p"``: 32-bit signed integer.
+    - ``"flt"``, ``"fltp"``: 32-bit floating-point.
+    .. note::
+       `p` at the end indicates the format is `planar`.
+       Channels are grouped together instead of interspersed in memory."""
+@dataclass
+class OutputAudioStream(OutputStream):
+    """Information about an audio output stream configured with
+    :meth:`~torchaudio.io.StreamReader.add_audio_stream` or
+    :meth:`~torchaudio.io.StreamReader.add_basic_audio_stream`.
+    In addition to the attributes reported by :class:`OutputStream`,
+    the following attributes are reported.
+    """
+    sample_rate: float
+    """Sample rate of the audio."""
+    num_channels: int
+    """Number of channels."""
+@dataclass
+class OutputVideoStream(OutputStream):
+    """Information about a video output stream configured with
+    :meth:`~torchaudio.io.StreamReader.add_video_stream` or
+    :meth:`~torchaudio.io.StreamReader.add_basic_video_stream`.
+    In addition to the attributes reported by :class:`OutputStream`,
+    the following attributes are reported.
+    """
+    width: int
+    """Width of the video frame in pixel."""
+    height: int
+    """Height of the video frame in pixel."""
+    frame_rate: float
+    """Frame rate."""
 def _parse_oi(i):
-    return OutputStream(i[0], i[1])
+    media_type = i.media_type
+    if media_type == "audio":
+        return OutputAudioStream(
+            source_index=i.source_index,
+            filter_description=i.filter_description,
+            media_type=i.media_type,
+            format=i.format,
+            sample_rate=i.sample_rate,
+            num_channels=i.num_channels,
+        )
+    if media_type == "video":
+        return OutputVideoStream(
+            source_index=i.source_index,
+            filter_description=i.filter_description,
+            media_type=i.media_type,
+            format=i.format,
+            width=i.width,
+            height=i.height,
+            frame_rate=i.frame_rate,
+        )
+    raise ValueError(f"Unexpected media_type: {i.media_type}({i})")
-def _get_afilter_desc(sample_rate: Optional[int], fmt: Optional[str]):
+def _get_afilter_desc(sample_rate: Optional[int], fmt: Optional[str], num_channels: Optional[int]):
     descs = []
     if sample_rate is not None:
         descs.append(f"aresample={sample_rate}")
-    if fmt is not None:
-        descs.append(f"aformat=sample_fmts={fmt}")
+    if fmt is not None or num_channels is not None:
+        parts = []
+        if fmt is not None:
+            parts.append(f"sample_fmts={fmt}")
+        if num_channels is not None:
+            parts.append(f"channel_layouts={num_channels}c")
+        descs.append(f"aformat={':'.join(parts)}")
     return ",".join(descs) if descs else None
@@ -381,6 +435,10 @@ _format_video_args = _format_doc(
 )
+InputStreamTypes = TypeVar("InputStream", bound=SourceStream)
+OutputStreamTypes = TypeVar("OutputStream", bound=OutputStream)
 @torchaudio._extension.fail_if_no_ffmpeg
 class StreamReader:
     """Fetch and decode audio/video streams chunk by chunk.
@@ -388,7 +446,7 @@ class StreamReader:
     For the detailed usage of this class, please refer to the tutorial.
     Args:
-        src (str, file-like object or Tensor): The media source.
+        src (str, file-like object): The media source.
             If string-type, it must be a resource indicator that FFmpeg can
             handle. This includes a file path, URL, device identifier or
             filter expression. The supported value depends on the FFmpeg found
@@ -401,9 +459,6 @@ class StreamReader:
             of codec detection. The signagure of `seek` method must be
             `seek(offset: int, whence: int) -> int`.
-            If Tensor, it is interpreted as byte buffer.
-            It must be one-dimensional, of type ``torch.uint8``.
             Please refer to the following for the expected signature and behavior
             of `read` and `seek` method.
@@ -457,20 +512,17 @@ class StreamReader:
     def __init__(
         self,
-        src: Union[str, BinaryIO, torch.Tensor],
+        src: Union[str, BinaryIO],
         format: Optional[str] = None,
         option: Optional[Dict[str, str]] = None,
         buffer_size: int = 4096,
     ):
-        torch._C._log_api_usage_once("torchaudio.io.StreamReader")
         if isinstance(src, str):
-            self._be = torch.classes.torchaudio.ffmpeg_StreamReader(src, format, option)
-        elif isinstance(src, torch.Tensor):
-            self._be = torch.classes.torchaudio.ffmpeg_StreamReaderTensor(src, format, option, buffer_size)
+            self._be = _StreamReader(src, format, option)
         elif hasattr(src, "read"):
-            self._be = torchaudio.lib._torchaudio_ffmpeg.StreamReaderFileObj(src, format, option, buffer_size)
+            self._be = _StreamReaderFileObj(src, format, option, buffer_size)
         else:
-            raise ValueError("`src` must be either string, Tensor or file-like object.")
+            raise ValueError("`src` must be either a string or file-like object.")
         i = self._be.find_best_audio_stream()
         self._default_audio_stream = None if i < 0 else i
@@ -517,28 +569,37 @@ class StreamReader:
         """
         return self._be.get_metadata()
-    def get_src_stream_info(self, i: int) -> Union[SourceStream, SourceAudioStream, SourceVideoStream]:
+    def get_src_stream_info(self, i: int) -> InputStreamTypes:
         """Get the metadata of source stream
         Args:
             i (int): Stream index.
         Returns:
-            Information about the source stream.
-            If the source stream is audio type, then :class:`SourceAudioStream` returned.
-            If it is video type, then :class:`SourceVideoStream` is returned.
-            Otherwise :class:`SourceStream` class is returned.
+            InputStreamTypes:
+                Information about the source stream.
+                If the source stream is audio type, then
+                :class:`~torchaudio.io._stream_reader.SourceAudioStream` is returned.
+                If it is video type, then
+                :class:`~torchaudio.io._stream_reader.SourceVideoStream` is returned.
+                Otherwise :class:`~torchaudio.io._stream_reader.SourceStream` class is returned.
         """
         return _parse_si(self._be.get_src_stream_info(i))
-    def get_out_stream_info(self, i: int) -> OutputStream:
+    def get_out_stream_info(self, i: int) -> OutputStreamTypes:
         """Get the metadata of output stream
         Args:
             i (int): Stream index.
         Returns:
-            OutputStream
+            OutputStreamTypes
+                Information about the output stream.
+                If the output stream is audio type, then
+                :class:`~torchaudio.io._stream_reader.OutputAudioStream` is returned.
+                If it is video type, then
+                :class:`~torchaudio.io._stream_reader.OutputVideoStream` is returned.
         """
-        return _parse_oi(self._be.get_out_stream_info(i))
+        info = self._be.get_out_stream_info(i)
+        return _parse_oi(info)
     def seek(self, timestamp: float, mode: str = "precise"):
         """Seek the stream to the given timestamp [second]
@@ -574,11 +635,13 @@ class StreamReader:
         self,
         frames_per_chunk: int,
         buffer_chunk_size: int = 3,
+        *,
         stream_index: Optional[int] = None,
         decoder: Optional[str] = None,
         decoder_option: Optional[Dict[str, str]] = None,
         format: Optional[str] = "fltp",
         sample_rate: Optional[int] = None,
+        num_channels: Optional[int] = None,
     ):
         """Add output audio stream
@@ -611,14 +674,16 @@ class StreamReader:
                 Default: ``"fltp"``.
             sample_rate (int or None, optional): If provided, resample the audio.
+            num_channels (int, or None, optional): If provided, change the number of channels.
         """
         self.add_audio_stream(
             frames_per_chunk,
             buffer_chunk_size,
-            stream_index,
-            decoder,
-            decoder_option,
-            _get_afilter_desc(sample_rate, format),
+            stream_index=stream_index,
+            decoder=decoder,
+            decoder_option=decoder_option,
+            filter_desc=_get_afilter_desc(sample_rate, format, num_channels),
         )
     @_format_video_args
@@ -626,14 +691,15 @@ class StreamReader:
         self,
         frames_per_chunk: int,
         buffer_chunk_size: int = 3,
+        *,
         stream_index: Optional[int] = None,
         decoder: Optional[str] = None,
         decoder_option: Optional[Dict[str, str]] = None,
-        hw_accel: Optional[str] = None,
         format: Optional[str] = "rgb24",
         frame_rate: Optional[int] = None,
         width: Optional[int] = None,
         height: Optional[int] = None,
+        hw_accel: Optional[str] = None,
     ):
         """Add output video stream
@@ -648,8 +714,6 @@ class StreamReader:
             decoder_option (dict or None, optional): {decoder_option}
-            hw_accel (str or None, optional): {hw_accel}
             format (str, optional): Change the format of image channels. Valid values are,
                 - ``"rgb24"``: 8 bits * 3 channels (R, G, B)
@@ -664,15 +728,17 @@ class StreamReader:
             width (int or None, optional): If provided, change the image width. Unit: Pixel.
             height (int or None, optional): If provided, change the image height. Unit: Pixel.
+            hw_accel (str or None, optional): {hw_accel}
         """
         self.add_video_stream(
             frames_per_chunk,
             buffer_chunk_size,
-            stream_index,
-            decoder,
-            decoder_option,
-            hw_accel,
-            _get_vfilter_desc(frame_rate, width, height, format),
+            stream_index=stream_index,
+            decoder=decoder,
+            decoder_option=decoder_option,
+            filter_desc=_get_vfilter_desc(frame_rate, width, height, format),
+            hw_accel=hw_accel,
         )
     @_format_audio_args
@@ -680,6 +746,7 @@ class StreamReader:
         self,
         frames_per_chunk: int,
         buffer_chunk_size: int = 3,
+        *,
         stream_index: Optional[int] = None,
         decoder: Optional[str] = None,
         decoder_option: Optional[Dict[str, str]] = None,
@@ -721,11 +788,12 @@ class StreamReader:
         self,
         frames_per_chunk: int,
         buffer_chunk_size: int = 3,
+        *,
         stream_index: Optional[int] = None,
         decoder: Optional[str] = None,
         decoder_option: Optional[Dict[str, str]] = None,
-        hw_accel: Optional[str] = None,
         filter_desc: Optional[str] = None,
+        hw_accel: Optional[str] = None,
     ):
         """Add output video stream
@@ -848,7 +916,7 @@ class StreamReader:
             if chunk is None:
                 ret.append(None)
             else:
-                ret.append(ChunkTensor(chunk[0], chunk[1]))
+                ret.append(ChunkTensor(chunk.frames, chunk.pts))
         return ret
     def fill_buffer(self, timeout: Optional[float] = None, backoff: float = 10.0) -> int: