xinference 1.10.0__py3-none-any.whl → 1.11.0__py3-none-any.whl

This diff shows the changes between two publicly released versions of the package, as they appear in their public registry. It is provided for informational purposes only.

Potentially problematic release.

Files changed (328)
  1. xinference/_version.py +3 -3
  2. xinference/api/restful_api.py +473 -31
  3. xinference/client/restful/async_restful_client.py +178 -8
  4. xinference/client/restful/restful_client.py +151 -3
  5. xinference/core/supervisor.py +99 -53
  6. xinference/core/worker.py +10 -0
  7. xinference/deploy/cmdline.py +15 -0
  8. xinference/model/audio/core.py +21 -6
  9. xinference/model/audio/indextts2.py +166 -0
  10. xinference/model/audio/model_spec.json +58 -21
  11. xinference/model/image/model_spec.json +159 -90
  12. xinference/model/image/stable_diffusion/core.py +13 -4
  13. xinference/model/llm/__init__.py +6 -2
  14. xinference/model/llm/llm_family.json +1299 -174
  15. xinference/model/llm/mlx/distributed_models/core.py +41 -0
  16. xinference/model/llm/mlx/distributed_models/qwen2.py +1 -2
  17. xinference/model/llm/sglang/core.py +44 -11
  18. xinference/model/llm/tool_parsers/deepseek_r1_tool_parser.py +94 -32
  19. xinference/model/llm/tool_parsers/qwen_tool_parser.py +29 -4
  20. xinference/model/llm/transformers/chatglm.py +3 -0
  21. xinference/model/llm/transformers/core.py +129 -36
  22. xinference/model/llm/transformers/multimodal/minicpmv45.py +340 -0
  23. xinference/model/llm/transformers/multimodal/qwen2_vl.py +34 -8
  24. xinference/model/llm/transformers/utils.py +23 -0
  25. xinference/model/llm/utils.py +48 -32
  26. xinference/model/llm/vllm/core.py +207 -72
  27. xinference/model/utils.py +74 -31
  28. xinference/thirdparty/audiotools/__init__.py +10 -0
  29. xinference/thirdparty/audiotools/core/__init__.py +4 -0
  30. xinference/thirdparty/audiotools/core/audio_signal.py +1682 -0
  31. xinference/thirdparty/audiotools/core/display.py +194 -0
  32. xinference/thirdparty/audiotools/core/dsp.py +390 -0
  33. xinference/thirdparty/audiotools/core/effects.py +647 -0
  34. xinference/thirdparty/audiotools/core/ffmpeg.py +211 -0
  35. xinference/thirdparty/audiotools/core/loudness.py +320 -0
  36. xinference/thirdparty/audiotools/core/playback.py +252 -0
  37. xinference/thirdparty/audiotools/core/templates/__init__.py +0 -0
  38. xinference/thirdparty/audiotools/core/templates/headers.html +322 -0
  39. xinference/thirdparty/audiotools/core/templates/pandoc.css +407 -0
  40. xinference/thirdparty/audiotools/core/templates/widget.html +52 -0
  41. xinference/thirdparty/audiotools/core/util.py +671 -0
  42. xinference/thirdparty/audiotools/core/whisper.py +97 -0
  43. xinference/thirdparty/audiotools/data/__init__.py +3 -0
  44. xinference/thirdparty/audiotools/data/datasets.py +517 -0
  45. xinference/thirdparty/audiotools/data/preprocess.py +81 -0
  46. xinference/thirdparty/audiotools/data/transforms.py +1592 -0
  47. xinference/thirdparty/audiotools/metrics/__init__.py +6 -0
  48. xinference/thirdparty/audiotools/metrics/distance.py +131 -0
  49. xinference/thirdparty/audiotools/metrics/quality.py +159 -0
  50. xinference/thirdparty/audiotools/metrics/spectral.py +247 -0
  51. xinference/thirdparty/audiotools/ml/__init__.py +5 -0
  52. xinference/thirdparty/audiotools/ml/accelerator.py +184 -0
  53. xinference/thirdparty/audiotools/ml/decorators.py +440 -0
  54. xinference/thirdparty/audiotools/ml/experiment.py +90 -0
  55. xinference/thirdparty/audiotools/ml/layers/__init__.py +2 -0
  56. xinference/thirdparty/audiotools/ml/layers/base.py +328 -0
  57. xinference/thirdparty/audiotools/ml/layers/spectral_gate.py +127 -0
  58. xinference/thirdparty/audiotools/post.py +140 -0
  59. xinference/thirdparty/audiotools/preference.py +600 -0
  60. xinference/thirdparty/fish_speech/fish_speech/text/chn_text_norm/text.py +1 -1
  61. xinference/thirdparty/indextts/BigVGAN/ECAPA_TDNN.py +656 -0
  62. xinference/thirdparty/indextts/BigVGAN/__init__.py +0 -0
  63. xinference/thirdparty/indextts/BigVGAN/activations.py +122 -0
  64. xinference/thirdparty/indextts/BigVGAN/alias_free_activation/__init__.py +0 -0
  65. xinference/thirdparty/indextts/BigVGAN/alias_free_activation/cuda/.gitignore +1 -0
  66. xinference/thirdparty/indextts/BigVGAN/alias_free_activation/cuda/__init__.py +0 -0
  67. xinference/thirdparty/indextts/BigVGAN/alias_free_activation/cuda/activation1d.py +76 -0
  68. xinference/thirdparty/indextts/BigVGAN/alias_free_activation/cuda/anti_alias_activation.cpp +23 -0
  69. xinference/thirdparty/indextts/BigVGAN/alias_free_activation/cuda/anti_alias_activation_cuda.cu +256 -0
  70. xinference/thirdparty/indextts/BigVGAN/alias_free_activation/cuda/compat.h +29 -0
  71. xinference/thirdparty/indextts/BigVGAN/alias_free_activation/cuda/load.py +121 -0
  72. xinference/thirdparty/indextts/BigVGAN/alias_free_activation/cuda/type_shim.h +92 -0
  73. xinference/thirdparty/indextts/BigVGAN/alias_free_activation/torch/__init__.py +6 -0
  74. xinference/thirdparty/indextts/BigVGAN/alias_free_activation/torch/act.py +31 -0
  75. xinference/thirdparty/indextts/BigVGAN/alias_free_activation/torch/filter.py +102 -0
  76. xinference/thirdparty/indextts/BigVGAN/alias_free_activation/torch/resample.py +58 -0
  77. xinference/thirdparty/indextts/BigVGAN/alias_free_torch/__init__.py +6 -0
  78. xinference/thirdparty/indextts/BigVGAN/alias_free_torch/act.py +29 -0
  79. xinference/thirdparty/indextts/BigVGAN/alias_free_torch/filter.py +96 -0
  80. xinference/thirdparty/indextts/BigVGAN/alias_free_torch/resample.py +49 -0
  81. xinference/thirdparty/indextts/BigVGAN/bigvgan.py +534 -0
  82. xinference/thirdparty/indextts/BigVGAN/models.py +451 -0
  83. xinference/thirdparty/indextts/BigVGAN/nnet/CNN.py +546 -0
  84. xinference/thirdparty/indextts/BigVGAN/nnet/__init__.py +0 -0
  85. xinference/thirdparty/indextts/BigVGAN/nnet/linear.py +89 -0
  86. xinference/thirdparty/indextts/BigVGAN/nnet/normalization.py +670 -0
  87. xinference/thirdparty/indextts/BigVGAN/utils.py +101 -0
  88. xinference/thirdparty/indextts/__init__.py +0 -0
  89. xinference/thirdparty/indextts/cli.py +65 -0
  90. xinference/thirdparty/indextts/gpt/__init__.py +0 -0
  91. xinference/thirdparty/indextts/gpt/conformer/__init__.py +0 -0
  92. xinference/thirdparty/indextts/gpt/conformer/attention.py +312 -0
  93. xinference/thirdparty/indextts/gpt/conformer/embedding.py +163 -0
  94. xinference/thirdparty/indextts/gpt/conformer/subsampling.py +348 -0
  95. xinference/thirdparty/indextts/gpt/conformer_encoder.py +520 -0
  96. xinference/thirdparty/indextts/gpt/model.py +713 -0
  97. xinference/thirdparty/indextts/gpt/model_v2.py +747 -0
  98. xinference/thirdparty/indextts/gpt/perceiver.py +317 -0
  99. xinference/thirdparty/indextts/gpt/transformers_beam_search.py +1013 -0
  100. xinference/thirdparty/indextts/gpt/transformers_generation_utils.py +4747 -0
  101. xinference/thirdparty/indextts/gpt/transformers_gpt2.py +1878 -0
  102. xinference/thirdparty/indextts/gpt/transformers_modeling_utils.py +5525 -0
  103. xinference/thirdparty/indextts/infer.py +690 -0
  104. xinference/thirdparty/indextts/infer_v2.py +739 -0
  105. xinference/thirdparty/indextts/s2mel/dac/__init__.py +16 -0
  106. xinference/thirdparty/indextts/s2mel/dac/__main__.py +36 -0
  107. xinference/thirdparty/indextts/s2mel/dac/model/__init__.py +4 -0
  108. xinference/thirdparty/indextts/s2mel/dac/model/base.py +294 -0
  109. xinference/thirdparty/indextts/s2mel/dac/model/dac.py +400 -0
  110. xinference/thirdparty/indextts/s2mel/dac/model/discriminator.py +228 -0
  111. xinference/thirdparty/indextts/s2mel/dac/model/encodec.py +320 -0
  112. xinference/thirdparty/indextts/s2mel/dac/nn/__init__.py +3 -0
  113. xinference/thirdparty/indextts/s2mel/dac/nn/layers.py +33 -0
  114. xinference/thirdparty/indextts/s2mel/dac/nn/loss.py +368 -0
  115. xinference/thirdparty/indextts/s2mel/dac/nn/quantize.py +339 -0
  116. xinference/thirdparty/indextts/s2mel/dac/utils/__init__.py +123 -0
  117. xinference/thirdparty/indextts/s2mel/dac/utils/decode.py +95 -0
  118. xinference/thirdparty/indextts/s2mel/dac/utils/encode.py +94 -0
  119. xinference/thirdparty/indextts/s2mel/hf_utils.py +12 -0
  120. xinference/thirdparty/indextts/s2mel/modules/alias_free_torch/__init__.py +5 -0
  121. xinference/thirdparty/indextts/s2mel/modules/alias_free_torch/act.py +29 -0
  122. xinference/thirdparty/indextts/s2mel/modules/alias_free_torch/filter.py +96 -0
  123. xinference/thirdparty/indextts/s2mel/modules/alias_free_torch/resample.py +57 -0
  124. xinference/thirdparty/indextts/s2mel/modules/audio.py +82 -0
  125. xinference/thirdparty/indextts/s2mel/modules/bigvgan/activations.py +120 -0
  126. xinference/thirdparty/indextts/s2mel/modules/bigvgan/alias_free_activation/cuda/__init__.py +0 -0
  127. xinference/thirdparty/indextts/s2mel/modules/bigvgan/alias_free_activation/cuda/activation1d.py +77 -0
  128. xinference/thirdparty/indextts/s2mel/modules/bigvgan/alias_free_activation/cuda/anti_alias_activation.cpp +23 -0
  129. xinference/thirdparty/indextts/s2mel/modules/bigvgan/alias_free_activation/cuda/anti_alias_activation_cuda.cu +246 -0
  130. xinference/thirdparty/indextts/s2mel/modules/bigvgan/alias_free_activation/cuda/compat.h +29 -0
  131. xinference/thirdparty/indextts/s2mel/modules/bigvgan/alias_free_activation/cuda/load.py +86 -0
  132. xinference/thirdparty/indextts/s2mel/modules/bigvgan/alias_free_activation/cuda/type_shim.h +92 -0
  133. xinference/thirdparty/indextts/s2mel/modules/bigvgan/alias_free_activation/torch/__init__.py +6 -0
  134. xinference/thirdparty/indextts/s2mel/modules/bigvgan/alias_free_activation/torch/act.py +30 -0
  135. xinference/thirdparty/indextts/s2mel/modules/bigvgan/alias_free_activation/torch/filter.py +101 -0
  136. xinference/thirdparty/indextts/s2mel/modules/bigvgan/alias_free_activation/torch/resample.py +58 -0
  137. xinference/thirdparty/indextts/s2mel/modules/bigvgan/bigvgan.py +492 -0
  138. xinference/thirdparty/indextts/s2mel/modules/bigvgan/config.json +63 -0
  139. xinference/thirdparty/indextts/s2mel/modules/bigvgan/env.py +18 -0
  140. xinference/thirdparty/indextts/s2mel/modules/bigvgan/meldataset.py +354 -0
  141. xinference/thirdparty/indextts/s2mel/modules/bigvgan/utils.py +99 -0
  142. xinference/thirdparty/indextts/s2mel/modules/campplus/DTDNN.py +115 -0
  143. xinference/thirdparty/indextts/s2mel/modules/campplus/classifier.py +70 -0
  144. xinference/thirdparty/indextts/s2mel/modules/campplus/layers.py +253 -0
  145. xinference/thirdparty/indextts/s2mel/modules/commons.py +632 -0
  146. xinference/thirdparty/indextts/s2mel/modules/diffusion_transformer.py +257 -0
  147. xinference/thirdparty/indextts/s2mel/modules/encodec.py +292 -0
  148. xinference/thirdparty/indextts/s2mel/modules/flow_matching.py +171 -0
  149. xinference/thirdparty/indextts/s2mel/modules/gpt_fast/generate.py +436 -0
  150. xinference/thirdparty/indextts/s2mel/modules/gpt_fast/model.py +360 -0
  151. xinference/thirdparty/indextts/s2mel/modules/gpt_fast/quantize.py +622 -0
  152. xinference/thirdparty/indextts/s2mel/modules/hifigan/f0_predictor.py +55 -0
  153. xinference/thirdparty/indextts/s2mel/modules/hifigan/generator.py +454 -0
  154. xinference/thirdparty/indextts/s2mel/modules/layers.py +354 -0
  155. xinference/thirdparty/indextts/s2mel/modules/length_regulator.py +141 -0
  156. xinference/thirdparty/indextts/s2mel/modules/openvoice/__init__.py +0 -0
  157. xinference/thirdparty/indextts/s2mel/modules/openvoice/api.py +186 -0
  158. xinference/thirdparty/indextts/s2mel/modules/openvoice/attentions.py +465 -0
  159. xinference/thirdparty/indextts/s2mel/modules/openvoice/checkpoints_v2/converter/config.json +57 -0
  160. xinference/thirdparty/indextts/s2mel/modules/openvoice/commons.py +160 -0
  161. xinference/thirdparty/indextts/s2mel/modules/openvoice/mel_processing.py +183 -0
  162. xinference/thirdparty/indextts/s2mel/modules/openvoice/models.py +499 -0
  163. xinference/thirdparty/indextts/s2mel/modules/openvoice/modules.py +598 -0
  164. xinference/thirdparty/indextts/s2mel/modules/openvoice/openvoice_app.py +275 -0
  165. xinference/thirdparty/indextts/s2mel/modules/openvoice/se_extractor.py +153 -0
  166. xinference/thirdparty/indextts/s2mel/modules/openvoice/transforms.py +209 -0
  167. xinference/thirdparty/indextts/s2mel/modules/openvoice/utils.py +194 -0
  168. xinference/thirdparty/indextts/s2mel/modules/quantize.py +229 -0
  169. xinference/thirdparty/indextts/s2mel/modules/rmvpe.py +631 -0
  170. xinference/thirdparty/indextts/s2mel/modules/vocos/__init__.py +4 -0
  171. xinference/thirdparty/indextts/s2mel/modules/vocos/heads.py +164 -0
  172. xinference/thirdparty/indextts/s2mel/modules/vocos/helpers.py +71 -0
  173. xinference/thirdparty/indextts/s2mel/modules/vocos/loss.py +114 -0
  174. xinference/thirdparty/indextts/s2mel/modules/vocos/models.py +118 -0
  175. xinference/thirdparty/indextts/s2mel/modules/vocos/modules.py +213 -0
  176. xinference/thirdparty/indextts/s2mel/modules/vocos/pretrained.py +51 -0
  177. xinference/thirdparty/indextts/s2mel/modules/vocos/spectral_ops.py +192 -0
  178. xinference/thirdparty/indextts/s2mel/modules/wavenet.py +174 -0
  179. xinference/thirdparty/indextts/s2mel/optimizers.py +96 -0
  180. xinference/thirdparty/indextts/s2mel/wav2vecbert_extract.py +148 -0
  181. xinference/thirdparty/indextts/utils/__init__.py +0 -0
  182. xinference/thirdparty/indextts/utils/arch_util.py +120 -0
  183. xinference/thirdparty/indextts/utils/checkpoint.py +34 -0
  184. xinference/thirdparty/indextts/utils/common.py +121 -0
  185. xinference/thirdparty/indextts/utils/feature_extractors.py +50 -0
  186. xinference/thirdparty/indextts/utils/front.py +536 -0
  187. xinference/thirdparty/indextts/utils/maskgct/models/codec/__init__.py +0 -0
  188. xinference/thirdparty/indextts/utils/maskgct/models/codec/amphion_codec/codec.py +427 -0
  189. xinference/thirdparty/indextts/utils/maskgct/models/codec/amphion_codec/quantize/__init__.py +11 -0
  190. xinference/thirdparty/indextts/utils/maskgct/models/codec/amphion_codec/quantize/factorized_vector_quantize.py +150 -0
  191. xinference/thirdparty/indextts/utils/maskgct/models/codec/amphion_codec/quantize/lookup_free_quantize.py +77 -0
  192. xinference/thirdparty/indextts/utils/maskgct/models/codec/amphion_codec/quantize/residual_vq.py +177 -0
  193. xinference/thirdparty/indextts/utils/maskgct/models/codec/amphion_codec/quantize/vector_quantize.py +401 -0
  194. xinference/thirdparty/indextts/utils/maskgct/models/codec/amphion_codec/vocos.py +881 -0
  195. xinference/thirdparty/indextts/utils/maskgct/models/codec/codec_dataset.py +264 -0
  196. xinference/thirdparty/indextts/utils/maskgct/models/codec/codec_inference.py +515 -0
  197. xinference/thirdparty/indextts/utils/maskgct/models/codec/codec_sampler.py +126 -0
  198. xinference/thirdparty/indextts/utils/maskgct/models/codec/codec_trainer.py +166 -0
  199. xinference/thirdparty/indextts/utils/maskgct/models/codec/facodec/__init__.py +0 -0
  200. xinference/thirdparty/indextts/utils/maskgct/models/codec/facodec/alias_free_torch/__init__.py +5 -0
  201. xinference/thirdparty/indextts/utils/maskgct/models/codec/facodec/alias_free_torch/act.py +29 -0
  202. xinference/thirdparty/indextts/utils/maskgct/models/codec/facodec/alias_free_torch/filter.py +96 -0
  203. xinference/thirdparty/indextts/utils/maskgct/models/codec/facodec/alias_free_torch/resample.py +57 -0
  204. xinference/thirdparty/indextts/utils/maskgct/models/codec/facodec/facodec_dataset.py +98 -0
  205. xinference/thirdparty/indextts/utils/maskgct/models/codec/facodec/facodec_inference.py +137 -0
  206. xinference/thirdparty/indextts/utils/maskgct/models/codec/facodec/facodec_trainer.py +776 -0
  207. xinference/thirdparty/indextts/utils/maskgct/models/codec/facodec/modules/JDC/__init__.py +1 -0
  208. xinference/thirdparty/indextts/utils/maskgct/models/codec/facodec/modules/JDC/bst.t7 +0 -0
  209. xinference/thirdparty/indextts/utils/maskgct/models/codec/facodec/modules/JDC/model.py +219 -0
  210. xinference/thirdparty/indextts/utils/maskgct/models/codec/facodec/modules/attentions.py +437 -0
  211. xinference/thirdparty/indextts/utils/maskgct/models/codec/facodec/modules/commons.py +331 -0
  212. xinference/thirdparty/indextts/utils/maskgct/models/codec/facodec/modules/gradient_reversal.py +35 -0
  213. xinference/thirdparty/indextts/utils/maskgct/models/codec/facodec/modules/layers.py +460 -0
  214. xinference/thirdparty/indextts/utils/maskgct/models/codec/facodec/modules/quantize.py +741 -0
  215. xinference/thirdparty/indextts/utils/maskgct/models/codec/facodec/modules/style_encoder.py +110 -0
  216. xinference/thirdparty/indextts/utils/maskgct/models/codec/facodec/modules/wavenet.py +224 -0
  217. xinference/thirdparty/indextts/utils/maskgct/models/codec/facodec/optimizer.py +104 -0
  218. xinference/thirdparty/indextts/utils/maskgct/models/codec/kmeans/repcodec_model.py +210 -0
  219. xinference/thirdparty/indextts/utils/maskgct/models/codec/kmeans/vocos.py +850 -0
  220. xinference/thirdparty/indextts/utils/maskgct/models/codec/melvqgan/melspec.py +108 -0
  221. xinference/thirdparty/indextts/utils/maskgct/models/codec/ns3_codec/README.md +216 -0
  222. xinference/thirdparty/indextts/utils/maskgct/models/codec/ns3_codec/__init__.py +6 -0
  223. xinference/thirdparty/indextts/utils/maskgct/models/codec/ns3_codec/alias_free_torch/__init__.py +5 -0
  224. xinference/thirdparty/indextts/utils/maskgct/models/codec/ns3_codec/alias_free_torch/act.py +29 -0
  225. xinference/thirdparty/indextts/utils/maskgct/models/codec/ns3_codec/alias_free_torch/filter.py +96 -0
  226. xinference/thirdparty/indextts/utils/maskgct/models/codec/ns3_codec/alias_free_torch/resample.py +57 -0
  227. xinference/thirdparty/indextts/utils/maskgct/models/codec/ns3_codec/facodec.py +1222 -0
  228. xinference/thirdparty/indextts/utils/maskgct/models/codec/ns3_codec/gradient_reversal.py +35 -0
  229. xinference/thirdparty/indextts/utils/maskgct/models/codec/ns3_codec/melspec.py +102 -0
  230. xinference/thirdparty/indextts/utils/maskgct/models/codec/ns3_codec/quantize/__init__.py +7 -0
  231. xinference/thirdparty/indextts/utils/maskgct/models/codec/ns3_codec/quantize/fvq.py +116 -0
  232. xinference/thirdparty/indextts/utils/maskgct/models/codec/ns3_codec/quantize/rvq.py +87 -0
  233. xinference/thirdparty/indextts/utils/maskgct/models/codec/ns3_codec/transformer.py +234 -0
  234. xinference/thirdparty/indextts/utils/maskgct/models/codec/speechtokenizer/model.py +184 -0
  235. xinference/thirdparty/indextts/utils/maskgct/models/codec/speechtokenizer/modules/__init__.py +27 -0
  236. xinference/thirdparty/indextts/utils/maskgct/models/codec/speechtokenizer/modules/conv.py +346 -0
  237. xinference/thirdparty/indextts/utils/maskgct/models/codec/speechtokenizer/modules/lstm.py +46 -0
  238. xinference/thirdparty/indextts/utils/maskgct/models/codec/speechtokenizer/modules/norm.py +37 -0
  239. xinference/thirdparty/indextts/utils/maskgct/models/codec/speechtokenizer/modules/quantization/__init__.py +14 -0
  240. xinference/thirdparty/indextts/utils/maskgct/models/codec/speechtokenizer/modules/quantization/ac.py +317 -0
  241. xinference/thirdparty/indextts/utils/maskgct/models/codec/speechtokenizer/modules/quantization/core_vq.py +388 -0
  242. xinference/thirdparty/indextts/utils/maskgct/models/codec/speechtokenizer/modules/quantization/distrib.py +135 -0
  243. xinference/thirdparty/indextts/utils/maskgct/models/codec/speechtokenizer/modules/quantization/vq.py +125 -0
  244. xinference/thirdparty/indextts/utils/maskgct/models/codec/speechtokenizer/modules/seanet.py +414 -0
  245. xinference/thirdparty/indextts/utils/maskgct/models/codec/vevo/vevo_repcodec.py +592 -0
  246. xinference/thirdparty/indextts/utils/maskgct/models/tts/maskgct/ckpt/wav2vec2bert_stats.pt +0 -0
  247. xinference/thirdparty/indextts/utils/maskgct/models/tts/maskgct/llama_nar.py +650 -0
  248. xinference/thirdparty/indextts/utils/maskgct/models/tts/maskgct/maskgct_s2a.py +503 -0
  249. xinference/thirdparty/indextts/utils/maskgct_utils.py +259 -0
  250. xinference/thirdparty/indextts/utils/text_utils.py +41 -0
  251. xinference/thirdparty/indextts/utils/typical_sampling.py +30 -0
  252. xinference/thirdparty/indextts/utils/utils.py +93 -0
  253. xinference/thirdparty/indextts/utils/webui_utils.py +42 -0
  254. xinference/thirdparty/indextts/utils/xtransformers.py +1247 -0
  255. xinference/thirdparty/indextts/vqvae/__init__.py +0 -0
  256. xinference/thirdparty/indextts/vqvae/xtts_dvae.py +395 -0
  257. xinference/thirdparty/melo/text/chinese_mix.py +2 -2
  258. xinference/types.py +9 -0
  259. xinference/ui/gradio/media_interface.py +66 -8
  260. xinference/ui/web/ui/build/asset-manifest.json +6 -6
  261. xinference/ui/web/ui/build/index.html +1 -1
  262. xinference/ui/web/ui/build/static/css/main.5ea97072.css +2 -0
  263. xinference/ui/web/ui/build/static/css/main.5ea97072.css.map +1 -0
  264. xinference/ui/web/ui/build/static/js/main.45e78536.js +3 -0
  265. xinference/ui/web/ui/build/static/js/{main.1086c759.js.LICENSE.txt → main.45e78536.js.LICENSE.txt} +0 -7
  266. xinference/ui/web/ui/build/static/js/main.45e78536.js.map +1 -0
  267. xinference/ui/web/ui/node_modules/.cache/babel-loader/089c38df5f52348d212ed868dda5c518a42e0c2762caed4175487c0405830c35.json +1 -0
  268. xinference/ui/web/ui/node_modules/.cache/babel-loader/2b6e3a5b6eb2c5c5f2d007e68cd46c372721cd52bf63508adcdb21ecf79241d8.json +1 -0
  269. xinference/ui/web/ui/node_modules/.cache/babel-loader/2d887825fd07a56f872eda4420da25fba0b5b62a23bdcc6c6da1a5281887f618.json +1 -0
  270. xinference/ui/web/ui/node_modules/.cache/babel-loader/4001f9c3e64e73a4f2158826650c174a59d5e3f89ddecddf17cbb6bb688cc4ca.json +1 -0
  271. xinference/ui/web/ui/node_modules/.cache/babel-loader/4a7018a69e6b7f90fc313248c2aa86f2a8f1eb1db120df586047a8023549b44b.json +1 -0
  272. xinference/ui/web/ui/node_modules/.cache/babel-loader/64b12aaa1c1d1bf53820ada8a63769067c0ccc5aab46b32348eb1917ae7f2a11.json +1 -0
  273. xinference/ui/web/ui/node_modules/.cache/babel-loader/7275b67c78ec76ce38a686bb8a576d8c9cecf54e1573614c84859d538efb9be5.json +1 -0
  274. xinference/ui/web/ui/node_modules/.cache/babel-loader/a68b6ee3b31eadc051fb95ce8f8ccb9c2e8b52c60f290dbab545a1917e065282.json +1 -0
  275. xinference/ui/web/ui/node_modules/.cache/babel-loader/ae8771cc37693feb160fa8727231312a0c54ef2d1d1ca893be568cd70016ca7e.json +1 -0
  276. xinference/ui/web/ui/node_modules/.cache/babel-loader/bb4e8722d2d41d87f1fce3661bc8937bffe9448e231fc5f0462630849e851592.json +1 -0
  277. xinference/ui/web/ui/node_modules/.cache/babel-loader/be6aada1ee4adc2bbf65dbe56d17db32bb3b5478be05d6b527805a8ba6cfb2b9.json +1 -0
  278. xinference/ui/web/ui/node_modules/.cache/babel-loader/de91c352653c233cf0cb6674e6e04049a44fd0e1156560de65d5c4620521391e.json +1 -0
  279. xinference/ui/web/ui/node_modules/.cache/babel-loader/e85f7002fc325c83b9c9cd8a1619e5b3ebc701d30e811afc284b88e6ae710cb5.json +1 -0
  280. xinference/ui/web/ui/node_modules/.cache/babel-loader/e8b603c78944bf3d213639078bfe155ff5c0dfa4048a93cbb967cad6a4eb4ff3.json +1 -0
  281. xinference/ui/web/ui/node_modules/.cache/babel-loader/ea2a26361204e70cf1018d6990fb6354bed82b3ac69690391e0f100385e7abb7.json +1 -0
  282. xinference/ui/web/ui/node_modules/.cache/babel-loader/f05535160a508b2a312de546a6de234776c613db276479ea4253c0b1bdeeb7d6.json +1 -0
  283. xinference/ui/web/ui/node_modules/.cache/babel-loader/f09ba9e11106bd59a0de10cc85c55084097729dcab575f43dfcf07375961ed87.json +1 -0
  284. xinference/ui/web/ui/node_modules/.package-lock.json +0 -33
  285. xinference/ui/web/ui/package-lock.json +0 -34
  286. xinference/ui/web/ui/package.json +0 -1
  287. xinference/ui/web/ui/src/locales/en.json +9 -3
  288. xinference/ui/web/ui/src/locales/ja.json +9 -3
  289. xinference/ui/web/ui/src/locales/ko.json +9 -3
  290. xinference/ui/web/ui/src/locales/zh.json +9 -3
  291. {xinference-1.10.0.dist-info → xinference-1.11.0.dist-info}/METADATA +24 -6
  292. {xinference-1.10.0.dist-info → xinference-1.11.0.dist-info}/RECORD +296 -77
  293. xinference/ui/web/ui/build/static/css/main.013f296b.css +0 -2
  294. xinference/ui/web/ui/build/static/css/main.013f296b.css.map +0 -1
  295. xinference/ui/web/ui/build/static/js/main.1086c759.js +0 -3
  296. xinference/ui/web/ui/build/static/js/main.1086c759.js.map +0 -1
  297. xinference/ui/web/ui/node_modules/.cache/babel-loader/0b0f77000cc1b482ca091cfbcae511dfe02f08916971645fad21d0b1234d04a2.json +0 -1
  298. xinference/ui/web/ui/node_modules/.cache/babel-loader/1c5f8ff423a7c9202bea60b15680f04b1e9964b445b0da3f86c6ff70cf24e797.json +0 -1
  299. xinference/ui/web/ui/node_modules/.cache/babel-loader/44ce7993e344980e3ed4f13e8f69237d4a5dfc60e37ca6b54f51f8ee1357bd67.json +0 -1
  300. xinference/ui/web/ui/node_modules/.cache/babel-loader/4aec1cc414ac3ebb3481d3d915e4db597d9127de813291346eacb8554ab170d4.json +0 -1
  301. xinference/ui/web/ui/node_modules/.cache/babel-loader/644cfec52f3c57a6e222ce60f112237a1efefe9835efd9aad857a685f53d8eed.json +0 -1
  302. xinference/ui/web/ui/node_modules/.cache/babel-loader/663436f72af53fe0d72394f56d003fa4e0bba489e5bb4e483fd34b00f84637f7.json +0 -1
  303. xinference/ui/web/ui/node_modules/.cache/babel-loader/69db82ca9bfe27fe417cc6cf2b1716b09be9c6f0cd198530f12bfc60e801bbcf.json +0 -1
  304. xinference/ui/web/ui/node_modules/.cache/babel-loader/85087e27618d740c236bf159f30e0219db443ab55f0997388eed5fde6f9e90cc.json +0 -1
  305. xinference/ui/web/ui/node_modules/.cache/babel-loader/88b07838348864aa86c672be3bbca1e9f58f6f3a2881b32070ec27f4e7b449d1.json +0 -1
  306. xinference/ui/web/ui/node_modules/.cache/babel-loader/8b8cd408ccfbe115acef27ccfa5b233da8597131a2a5712add13e1e4d5d4504b.json +0 -1
  307. xinference/ui/web/ui/node_modules/.cache/babel-loader/a23824fe746b9c6ca5eee9159b5764d1ff1653c1d856288c0f75c742bbb0023b.json +0 -1
  308. xinference/ui/web/ui/node_modules/.cache/babel-loader/a3eb18af328280b139693c9092dff2a0ef8c9a967e6c8956ceee0996611f1984.json +0 -1
  309. xinference/ui/web/ui/node_modules/.cache/babel-loader/bc1aacc65a102db325ca61bcd2f681e1ae22c36a1f1d98a6ff5e4ad49dc7544f.json +0 -1
  310. xinference/ui/web/ui/node_modules/.cache/babel-loader/c682fd521747c19dae437d83ce3235a306ce6b68e24a117bc57c27ebb8d1f1ca.json +0 -1
  311. xinference/ui/web/ui/node_modules/.cache/babel-loader/d5c224be7081f18cba1678b7874a9782eba895df004874ff8f243f94ba79942a.json +0 -1
  312. xinference/ui/web/ui/node_modules/.cache/babel-loader/f7f18bfb539b036a6a342176dd98a85df5057a884a8da978d679f2a0264883d0.json +0 -1
  313. xinference/ui/web/ui/node_modules/clipboard/.babelrc.json +0 -11
  314. xinference/ui/web/ui/node_modules/clipboard/.eslintrc.json +0 -24
  315. xinference/ui/web/ui/node_modules/clipboard/.prettierrc.json +0 -9
  316. xinference/ui/web/ui/node_modules/clipboard/bower.json +0 -18
  317. xinference/ui/web/ui/node_modules/clipboard/composer.json +0 -25
  318. xinference/ui/web/ui/node_modules/clipboard/package.json +0 -63
  319. xinference/ui/web/ui/node_modules/delegate/package.json +0 -31
  320. xinference/ui/web/ui/node_modules/good-listener/bower.json +0 -11
  321. xinference/ui/web/ui/node_modules/good-listener/package.json +0 -35
  322. xinference/ui/web/ui/node_modules/select/bower.json +0 -13
  323. xinference/ui/web/ui/node_modules/select/package.json +0 -29
  324. xinference/ui/web/ui/node_modules/tiny-emitter/package.json +0 -53
  325. {xinference-1.10.0.dist-info → xinference-1.11.0.dist-info}/WHEEL +0 -0
  326. {xinference-1.10.0.dist-info → xinference-1.11.0.dist-info}/entry_points.txt +0 -0
  327. {xinference-1.10.0.dist-info → xinference-1.11.0.dist-info}/licenses/LICENSE +0 -0
  328. {xinference-1.10.0.dist-info → xinference-1.11.0.dist-info}/top_level.txt +0 -0
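The headline additions are the IndexTTS2 text-to-speech model (xinference/model/audio/indextts2.py plus the vendored xinference/thirdparty/indextts tree) and a vendored copy of the audiotools library it depends on, whose core module is shown in full below. As a quick orientation, here is a minimal, hypothetical sketch of driving the new audio model through the standard xinference client; the model name "IndexTTS-2" is an assumption, so check the names registered in xinference/model/audio/model_spec.json for this release.

# Hypothetical usage sketch, not part of the diff: launching the new
# IndexTTS2 audio model through the xinference client API.
from xinference.client import Client

client = Client("http://localhost:9997")
# model_name "IndexTTS-2" is assumed; see model_spec.json for the real id.
model_uid = client.launch_model(model_name="IndexTTS-2", model_type="audio")
model = client.get_model(model_uid)

# TTS-style audio models expose an OpenAI-compatible speech() call that
# returns encoded audio bytes (mp3 by default).
audio = model.speech(input="Hello from xinference 1.11.0.")
with open("hello.mp3", "wb") as f:
    f.write(audio)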
--- /dev/null
+++ b/xinference/thirdparty/audiotools/core/audio_signal.py
@@ -0,0 +1,1682 @@
1
+ import copy
2
+ import functools
3
+ import hashlib
4
+ import math
5
+ import pathlib
6
+ import tempfile
7
+ import typing
8
+ import warnings
9
+ from collections import namedtuple
10
+ from pathlib import Path
11
+
12
+ import julius
13
+ import numpy as np
14
+ import soundfile
15
+ import torch
16
+
17
+ from . import util
18
+ from .display import DisplayMixin
19
+ from .dsp import DSPMixin
20
+ from .effects import EffectMixin
21
+ from .effects import ImpulseResponseMixin
22
+ from .ffmpeg import FFMPEGMixin
23
+ from .loudness import LoudnessMixin
24
+ from .playback import PlayMixin
25
+ from .whisper import WhisperMixin
26
+
27
+
28
+ STFTParams = namedtuple(
29
+ "STFTParams",
30
+ ["window_length", "hop_length", "window_type", "match_stride", "padding_type"],
31
+ )
32
+ """
33
+ STFTParams object is a container that holds STFT parameters - window_length,
34
+ hop_length, and window_type. Not all parameters need to be specified. Ones that
35
+ are not specified will be inferred by the AudioSignal parameters.
36
+
37
+ Parameters
38
+ ----------
39
+ window_length : int, optional
40
+ Window length of STFT, by default ``0.032 * self.sample_rate``.
41
+ hop_length : int, optional
42
+ Hop length of STFT, by default ``window_length // 4``.
43
+ window_type : str, optional
44
+ Type of window to use, by default ``sqrt\\_hann``.
45
+ match_stride : bool, optional
46
+ Whether to match the stride of convolutional layers, by default False
47
+ padding_type : str, optional
48
+ Type of padding to use, by default 'reflect'
49
+ """
50
+ STFTParams.__new__.__defaults__ = (None, None, None, None, None)
51
+
52
+
53
+ class AudioSignal(
54
+ EffectMixin,
55
+ LoudnessMixin,
56
+ PlayMixin,
57
+ ImpulseResponseMixin,
58
+ DSPMixin,
59
+ DisplayMixin,
60
+ FFMPEGMixin,
61
+ WhisperMixin,
62
+ ):
63
+ """This is the core object of this library. Audio is always
64
+ loaded into an AudioSignal, which then enables all the features
65
+ of this library, including audio augmentations, I/O, playback,
66
+ and more.
67
+
68
+ The structure of this object is that the base functionality
69
+ is defined in ``core/audio_signal.py``, while extensions to
70
+ that functionality are defined in the other ``core/*.py``
71
+ files. For example, all the display-based functionality
72
+ (e.g. plot spectrograms, waveforms, write to tensorboard)
73
+ are in ``core/display.py``.
74
+
75
+ Parameters
76
+ ----------
77
+ audio_path_or_array : typing.Union[torch.Tensor, str, Path, np.ndarray]
78
+ Object to create AudioSignal from. Can be a tensor, numpy array,
79
+ or a path to a file. The file is always reshaped to
80
+ sample_rate : int, optional
81
+ Sample rate of the audio. If different from underlying file, resampling is
82
+ performed. If passing in an array or tensor, this must be defined,
83
+ by default None
84
+ stft_params : STFTParams, optional
85
+ Parameters of STFT to use. , by default None
86
+ offset : float, optional
87
+ Offset in seconds to read from file, by default 0
88
+ duration : float, optional
89
+ Duration in seconds to read from file, by default None
90
+ device : str, optional
91
+ Device to load audio onto, by default None
92
+
93
+ Examples
94
+ --------
95
+ Loading an AudioSignal from an array, at a sample rate of
96
+ 44100.
97
+
98
+ >>> signal = AudioSignal(torch.randn(5*44100), 44100)
99
+
100
+ Note, the signal is reshaped to have a batch size, and one
101
+ audio channel:
102
+
103
+ >>> print(signal.shape)
104
+ (1, 1, 44100)
105
+
106
+ You can treat AudioSignals like tensors, and many of the same
107
+ functions you might use on tensors are defined for AudioSignals
108
+ as well:
109
+
110
+ >>> signal.to("cuda")
111
+ >>> signal.cuda()
112
+ >>> signal.clone()
113
+ >>> signal.detach()
114
+
115
+ Indexing AudioSignals returns an AudioSignal:
116
+
117
+ >>> signal[..., 3*44100:4*44100]
118
+
119
+ The above signal is 1 second long, and is also an AudioSignal.
120
+ """
121
+
122
+ def __init__(
123
+ self,
124
+ audio_path_or_array: typing.Union[torch.Tensor, str, Path, np.ndarray],
125
+ sample_rate: int = None,
126
+ stft_params: STFTParams = None,
127
+ offset: float = 0,
128
+ duration: float = None,
129
+ device: str = None,
130
+ ):
131
+ audio_path = None
132
+ audio_array = None
133
+
134
+ if isinstance(audio_path_or_array, str):
135
+ audio_path = audio_path_or_array
136
+ elif isinstance(audio_path_or_array, pathlib.Path):
137
+ audio_path = audio_path_or_array
138
+ elif isinstance(audio_path_or_array, np.ndarray):
139
+ audio_array = audio_path_or_array
140
+ elif torch.is_tensor(audio_path_or_array):
141
+ audio_array = audio_path_or_array
142
+ else:
143
+ raise ValueError(
144
+ "audio_path_or_array must be either a Path, "
145
+ "string, numpy array, or torch Tensor!"
146
+ )
147
+
148
+ self.path_to_file = None
149
+
150
+ self.audio_data = None
151
+ self.sources = None # List of AudioSignal objects.
152
+ self.stft_data = None
153
+ if audio_path is not None:
154
+ self.load_from_file(
155
+ audio_path, offset=offset, duration=duration, device=device
156
+ )
157
+ elif audio_array is not None:
158
+ assert sample_rate is not None, "Must set sample rate!"
159
+ self.load_from_array(audio_array, sample_rate, device=device)
160
+
161
+ self.window = None
162
+ self.stft_params = stft_params
163
+
164
+ self.metadata = {
165
+ "offset": offset,
166
+ "duration": duration,
167
+ }
168
+
169
+ @property
170
+ def path_to_input_file(
171
+ self,
172
+ ):
173
+ """
174
+ Path to input file, if it exists.
175
+ Alias to ``path_to_file`` for backwards compatibility
176
+ """
177
+ return self.path_to_file
178
+
179
+ @classmethod
180
+ def excerpt(
181
+ cls,
182
+ audio_path: typing.Union[str, Path],
183
+ offset: float = None,
184
+ duration: float = None,
185
+ state: typing.Union[np.random.RandomState, int] = None,
186
+ **kwargs,
187
+ ):
188
+ """Randomly draw an excerpt of ``duration`` seconds from an
189
+ audio file specified at ``audio_path``, between ``offset`` seconds
190
+ and end of file. ``state`` can be used to seed the random draw.
191
+
192
+ Parameters
193
+ ----------
194
+ audio_path : typing.Union[str, Path]
195
+ Path to audio file to grab excerpt from.
196
+ offset : float, optional
197
+ Lower bound for the start time, in seconds drawn from
198
+ the file, by default None.
199
+ duration : float, optional
200
+ Duration of excerpt, in seconds, by default None
201
+ state : typing.Union[np.random.RandomState, int], optional
202
+ RandomState or seed of random state, by default None
203
+
204
+ Returns
205
+ -------
206
+ AudioSignal
207
+ AudioSignal containing excerpt.
208
+
209
+ Examples
210
+ --------
211
+ >>> signal = AudioSignal.excerpt("path/to/audio", duration=5)
212
+ """
213
+ info = util.info(audio_path)
214
+ total_duration = info.duration
215
+
216
+ state = util.random_state(state)
217
+ lower_bound = 0 if offset is None else offset
218
+ upper_bound = max(total_duration - duration, 0)
219
+ offset = state.uniform(lower_bound, upper_bound)
220
+
221
+ signal = cls(audio_path, offset=offset, duration=duration, **kwargs)
222
+ signal.metadata["offset"] = offset
223
+ signal.metadata["duration"] = duration
224
+
225
+ return signal
226
+
227
+ @classmethod
228
+ def salient_excerpt(
229
+ cls,
230
+ audio_path: typing.Union[str, Path],
231
+ loudness_cutoff: float = None,
232
+ num_tries: int = 8,
233
+ state: typing.Union[np.random.RandomState, int] = None,
234
+ **kwargs,
235
+ ):
236
+ """Similar to AudioSignal.excerpt, except it extracts excerpts only
237
+ if they are above a specified loudness threshold, which is computed via
238
+ a fast LUFS routine.
239
+
240
+ Parameters
241
+ ----------
242
+ audio_path : typing.Union[str, Path]
243
+ Path to audio file to grab excerpt from.
244
+ loudness_cutoff : float, optional
245
+ Loudness threshold in dB. Typical values are ``-40, -60``,
246
+ etc, by default None
247
+ num_tries : int, optional
248
+ Number of tries to grab an excerpt above the threshold
249
+ before giving up, by default 8.
250
+ state : typing.Union[np.random.RandomState, int], optional
251
+ RandomState or seed of random state, by default None
252
+ kwargs : dict
253
+ Keyword arguments to AudioSignal.excerpt
254
+
255
+ Returns
256
+ -------
257
+ AudioSignal
258
+ AudioSignal containing excerpt.
259
+
260
+
261
+ .. warning::
262
+ if ``num_tries`` is set to None, ``salient_excerpt`` may try forever, which can
263
+ result in an infinite loop if ``audio_path`` does not have
264
+ any loud enough excerpts.
265
+
266
+ Examples
267
+ --------
268
+ >>> signal = AudioSignal.salient_excerpt(
269
+ "path/to/audio",
270
+ loudness_cutoff=-40,
271
+ duration=5
272
+ )
273
+ """
274
+ state = util.random_state(state)
275
+ if loudness_cutoff is None:
276
+ excerpt = cls.excerpt(audio_path, state=state, **kwargs)
277
+ else:
278
+ loudness = -np.inf
279
+ num_try = 0
280
+ while loudness <= loudness_cutoff:
281
+ excerpt = cls.excerpt(audio_path, state=state, **kwargs)
282
+ loudness = excerpt.loudness()
283
+ num_try += 1
284
+ if num_tries is not None and num_try >= num_tries:
285
+ break
286
+ return excerpt
287
+
288
+ @classmethod
289
+ def zeros(
290
+ cls,
291
+ duration: float,
292
+ sample_rate: int,
293
+ num_channels: int = 1,
294
+ batch_size: int = 1,
295
+ **kwargs,
296
+ ):
297
+ """Helper function create an AudioSignal of all zeros.
298
+
299
+ Parameters
300
+ ----------
301
+ duration : float
302
+ Duration of AudioSignal
303
+ sample_rate : int
304
+ Sample rate of AudioSignal
305
+ num_channels : int, optional
306
+ Number of channels, by default 1
307
+ batch_size : int, optional
308
+ Batch size, by default 1
309
+
310
+ Returns
311
+ -------
312
+ AudioSignal
313
+ AudioSignal containing all zeros.
314
+
315
+ Examples
316
+ --------
317
+ Generate 5 seconds of all zeros at a sample rate of 44100.
318
+
319
+ >>> signal = AudioSignal.zeros(5.0, 44100)
320
+ """
321
+ n_samples = int(duration * sample_rate)
322
+ return cls(
323
+ torch.zeros(batch_size, num_channels, n_samples), sample_rate, **kwargs
324
+ )
325
+
326
+ @classmethod
327
+ def wave(
328
+ cls,
329
+ frequency: float,
330
+ duration: float,
331
+ sample_rate: int,
332
+ num_channels: int = 1,
333
+ shape: str = "sine",
334
+ **kwargs,
335
+ ):
336
+ """
337
+ Generate a waveform of a given frequency and shape.
338
+
339
+ Parameters
340
+ ----------
341
+ frequency : float
342
+ Frequency of the waveform
343
+ duration : float
344
+ Duration of the waveform
345
+ sample_rate : int
346
+ Sample rate of the waveform
347
+ num_channels : int, optional
348
+ Number of channels, by default 1
349
+ shape : str, optional
350
+ Shape of the waveform, by default "saw"
351
+ One of "sawtooth", "square", "sine", "triangle"
352
+ kwargs : dict
353
+ Keyword arguments to AudioSignal
354
+ """
355
+ n_samples = int(duration * sample_rate)
356
+ t = torch.linspace(0, duration, n_samples)
357
+ if shape == "sawtooth":
358
+ from scipy.signal import sawtooth
359
+
360
+ wave_data = sawtooth(2 * np.pi * frequency * t, 0.5)
361
+ elif shape == "square":
362
+ from scipy.signal import square
363
+
364
+ wave_data = square(2 * np.pi * frequency * t)
365
+ elif shape == "sine":
366
+ wave_data = np.sin(2 * np.pi * frequency * t)
367
+ elif shape == "triangle":
368
+ from scipy.signal import sawtooth
369
+
370
+ # frequency is doubled by the abs call, so omit the 2 in 2pi
371
+ wave_data = sawtooth(np.pi * frequency * t, 0.5)
372
+ wave_data = -np.abs(wave_data) * 2 + 1
373
+ else:
374
+ raise ValueError(f"Invalid shape {shape}")
375
+
376
+ wave_data = torch.tensor(wave_data, dtype=torch.float32)
377
+ wave_data = wave_data.unsqueeze(0).unsqueeze(0).repeat(1, num_channels, 1)
378
+ return cls(wave_data, sample_rate, **kwargs)
379
+
380
+ @classmethod
381
+ def batch(
382
+ cls,
383
+ audio_signals: list,
384
+ pad_signals: bool = False,
385
+ truncate_signals: bool = False,
386
+ resample: bool = False,
387
+ dim: int = 0,
388
+ ):
389
+ """Creates a batched AudioSignal from a list of AudioSignals.
390
+
391
+ Parameters
392
+ ----------
393
+ audio_signals : list[AudioSignal]
394
+ List of AudioSignal objects
395
+ pad_signals : bool, optional
396
+ Whether to pad signals to length of the maximum length
397
+ AudioSignal in the list, by default False
398
+ truncate_signals : bool, optional
399
+ Whether to truncate signals to length of shortest length
400
+ AudioSignal in the list, by default False
401
+ resample : bool, optional
402
+ Whether to resample AudioSignal to the sample rate of
403
+ the first AudioSignal in the list, by default False
404
+ dim : int, optional
405
+ Dimension along which to batch the signals.
406
+
407
+ Returns
408
+ -------
409
+ AudioSignal
410
+ Batched AudioSignal.
411
+
412
+ Raises
413
+ ------
414
+ RuntimeError
415
+ If not all AudioSignals are the same sample rate, and
416
+ ``resample=False``, an error is raised.
417
+ RuntimeError
418
+ If not all AudioSignals are the same the length, and
419
+ both ``pad_signals=False`` and ``truncate_signals=False``,
420
+ an error is raised.
421
+
422
+ Examples
423
+ --------
424
+ Batching a bunch of random signals:
425
+
426
+ >>> signal_list = [AudioSignal(torch.randn(44100), 44100) for _ in range(10)]
427
+ >>> signal = AudioSignal.batch(signal_list)
428
+ >>> print(signal.shape)
429
+ (10, 1, 44100)
430
+
431
+ """
432
+ signal_lengths = [x.signal_length for x in audio_signals]
433
+ sample_rates = [x.sample_rate for x in audio_signals]
434
+
435
+ if len(set(sample_rates)) != 1:
436
+ if resample:
437
+ for x in audio_signals:
438
+ x.resample(sample_rates[0])
439
+ else:
440
+ raise RuntimeError(
441
+ f"Not all signals had the same sample rate! Got {sample_rates}. "
442
+ f"All signals must have the same sample rate, or resample must be True. "
443
+ )
444
+
445
+ if len(set(signal_lengths)) != 1:
446
+ if pad_signals:
447
+ max_length = max(signal_lengths)
448
+ for x in audio_signals:
449
+ pad_len = max_length - x.signal_length
450
+ x.zero_pad(0, pad_len)
451
+ elif truncate_signals:
452
+ min_length = min(signal_lengths)
453
+ for x in audio_signals:
454
+ x.truncate_samples(min_length)
455
+ else:
456
+ raise RuntimeError(
457
+ f"Not all signals had the same length! Got {signal_lengths}. "
458
+ f"All signals must be the same length, or pad_signals/truncate_signals "
459
+ f"must be True. "
460
+ )
461
+ # Concatenate along the specified dimension (default 0)
462
+ audio_data = torch.cat([x.audio_data for x in audio_signals], dim=dim)
463
+ audio_paths = [x.path_to_file for x in audio_signals]
464
+
465
+ batched_signal = cls(
466
+ audio_data,
467
+ sample_rate=audio_signals[0].sample_rate,
468
+ )
469
+ batched_signal.path_to_file = audio_paths
470
+ return batched_signal
471
+
472
+ # I/O
473
+ def load_from_file(
474
+ self,
475
+ audio_path: typing.Union[str, Path],
476
+ offset: float,
477
+ duration: float,
478
+ device: str = "cpu",
479
+ ):
480
+ """Loads data from file. Used internally when AudioSignal
481
+ is instantiated with a path to a file.
482
+
483
+ Parameters
484
+ ----------
485
+ audio_path : typing.Union[str, Path]
486
+ Path to file
487
+ offset : float
488
+ Offset in seconds
489
+ duration : float
490
+ Duration in seconds
491
+ device : str, optional
492
+ Device to put AudioSignal on, by default "cpu"
493
+
494
+ Returns
495
+ -------
496
+ AudioSignal
497
+ AudioSignal loaded from file
498
+ """
499
+ import librosa
500
+
501
+ data, sample_rate = librosa.load(
502
+ audio_path,
503
+ offset=offset,
504
+ duration=duration,
505
+ sr=None,
506
+ mono=False,
507
+ )
508
+ data = util.ensure_tensor(data)
509
+ if data.shape[-1] == 0:
510
+ raise RuntimeError(
511
+ f"Audio file {audio_path} with offset {offset} and duration {duration} is empty!"
512
+ )
513
+
514
+ if data.ndim < 2:
515
+ data = data.unsqueeze(0)
516
+ if data.ndim < 3:
517
+ data = data.unsqueeze(0)
518
+ self.audio_data = data
519
+
520
+ self.original_signal_length = self.signal_length
521
+
522
+ self.sample_rate = sample_rate
523
+ self.path_to_file = audio_path
524
+ return self.to(device)
525
+
526
+ def load_from_array(
527
+ self,
528
+ audio_array: typing.Union[torch.Tensor, np.ndarray],
529
+ sample_rate: int,
530
+ device: str = "cpu",
531
+ ):
532
+ """Loads data from array, reshaping it to be exactly 3
533
+ dimensions. Used internally when AudioSignal is called
534
+ with a tensor or an array.
535
+
536
+ Parameters
537
+ ----------
538
+ audio_array : typing.Union[torch.Tensor, np.ndarray]
539
+ Array/tensor of audio of samples.
540
+ sample_rate : int
541
+ Sample rate of audio
542
+ device : str, optional
543
+ Device to move audio onto, by default "cpu"
544
+
545
+ Returns
546
+ -------
547
+ AudioSignal
548
+ AudioSignal loaded from array
549
+ """
550
+ audio_data = util.ensure_tensor(audio_array)
551
+
552
+ if audio_data.dtype == torch.double:
553
+ audio_data = audio_data.float()
554
+
555
+ if audio_data.ndim < 2:
556
+ audio_data = audio_data.unsqueeze(0)
557
+ if audio_data.ndim < 3:
558
+ audio_data = audio_data.unsqueeze(0)
559
+ self.audio_data = audio_data
560
+
561
+ self.original_signal_length = self.signal_length
562
+
563
+ self.sample_rate = sample_rate
564
+ return self.to(device)
565
+
566
+ def write(self, audio_path: typing.Union[str, Path]):
567
+ """Writes audio to a file. Only writes the audio
568
+ that is in the very first item of the batch. To write other items
569
+ in the batch, index the signal along the batch dimension
570
+ before writing. After writing, the signal's ``path_to_file``
571
+ attribute is updated to the new path.
572
+
573
+ Parameters
574
+ ----------
575
+ audio_path : typing.Union[str, Path]
576
+ Path to write audio to.
577
+
578
+ Returns
579
+ -------
580
+ AudioSignal
581
+ Returns original AudioSignal, so you can use this in a fluent
582
+ interface.
583
+
584
+ Examples
585
+ --------
586
+ Creating and writing a signal to disk:
587
+
588
+ >>> signal = AudioSignal(torch.randn(10, 1, 44100), 44100)
589
+ >>> signal.write("/tmp/out.wav")
590
+
591
+ Writing a different element of the batch:
592
+
593
+ >>> signal[5].write("/tmp/out.wav")
594
+
595
+ Using this in a fluent interface:
596
+
597
+ >>> signal.write("/tmp/original.wav").low_pass(4000).write("/tmp/lowpass.wav")
598
+
599
+ """
600
+ if self.audio_data[0].abs().max() > 1:
601
+ warnings.warn("Audio amplitude > 1 clipped when saving")
602
+ soundfile.write(str(audio_path), self.audio_data[0].numpy().T, self.sample_rate)
603
+
604
+ self.path_to_file = audio_path
605
+ return self
606
+
607
+ def deepcopy(self):
608
+ """Copies the signal and all of its attributes.
609
+
610
+ Returns
611
+ -------
612
+ AudioSignal
613
+ Deep copy of the audio signal.
614
+ """
615
+ return copy.deepcopy(self)
616
+
617
+ def copy(self):
618
+ """Shallow copy of signal.
619
+
620
+ Returns
621
+ -------
622
+ AudioSignal
623
+ Shallow copy of the audio signal.
624
+ """
625
+ return copy.copy(self)
626
+
627
+ def clone(self):
628
+ """Clones all tensors contained in the AudioSignal,
629
+ and returns a copy of the signal with everything
630
+ cloned. Useful when using AudioSignal within autograd
631
+ computation graphs.
632
+
633
+ Relevant attributes are the stft data, the audio data,
634
+ and the loudness of the file.
635
+
636
+ Returns
637
+ -------
638
+ AudioSignal
639
+ Clone of AudioSignal.
640
+ """
641
+ clone = type(self)(
642
+ self.audio_data.clone(),
643
+ self.sample_rate,
644
+ stft_params=self.stft_params,
645
+ )
646
+ if self.stft_data is not None:
647
+ clone.stft_data = self.stft_data.clone()
648
+ if self._loudness is not None:
649
+ clone._loudness = self._loudness.clone()
650
+ clone.path_to_file = copy.deepcopy(self.path_to_file)
651
+ clone.metadata = copy.deepcopy(self.metadata)
652
+ return clone
653
+
654
+ def detach(self):
655
+ """Detaches tensors contained in AudioSignal.
656
+
657
+ Relevant attributes are the stft data, the audio data,
658
+ and the loudness of the file.
659
+
660
+ Returns
661
+ -------
662
+ AudioSignal
663
+ Same signal, but with all tensors detached.
664
+ """
665
+ if self._loudness is not None:
666
+ self._loudness = self._loudness.detach()
667
+ if self.stft_data is not None:
668
+ self.stft_data = self.stft_data.detach()
669
+
670
+ self.audio_data = self.audio_data.detach()
671
+ return self
672
+
673
+ def hash(self):
674
+ """Writes the audio data to a temporary file, and then
675
+ hashes it using hashlib. Useful for creating a file
676
+ name based on the audio content.
677
+
678
+ Returns
679
+ -------
680
+ str
681
+ Hash of audio data.
682
+
683
+ Examples
684
+ --------
685
+ Creating a signal, and writing it to a unique file name:
686
+
687
+ >>> signal = AudioSignal(torch.randn(44100), 44100)
688
+ >>> hash = signal.hash()
689
+ >>> signal.write(f"{hash}.wav")
690
+
691
+ """
692
+ with tempfile.NamedTemporaryFile(suffix=".wav") as f:
693
+ self.write(f.name)
694
+ h = hashlib.sha256()
695
+ b = bytearray(128 * 1024)
696
+ mv = memoryview(b)
697
+ with open(f.name, "rb", buffering=0) as f:
698
+ for n in iter(lambda: f.readinto(mv), 0):
699
+ h.update(mv[:n])
700
+ file_hash = h.hexdigest()
701
+ return file_hash
702
+
703
+ # Signal operations
704
+ def to_mono(self):
705
+ """Converts audio data to mono audio, by taking the mean
706
+ along the channels dimension.
707
+
708
+ Returns
709
+ -------
710
+ AudioSignal
711
+ AudioSignal with mean of channels.
712
+ """
713
+ self.audio_data = self.audio_data.mean(1, keepdim=True)
714
+ return self
715
+
716
+ def resample(self, sample_rate: int):
717
+ """Resamples the audio, using sinc interpolation. This works on both
718
+ cpu and gpu, and is much faster on gpu.
719
+
720
+ Parameters
721
+ ----------
722
+ sample_rate : int
723
+ Sample rate to resample to.
724
+
725
+ Returns
726
+ -------
727
+ AudioSignal
728
+ Resampled AudioSignal
729
+ """
730
+ if sample_rate == self.sample_rate:
731
+ return self
732
+ self.audio_data = julius.resample_frac(
733
+ self.audio_data, self.sample_rate, sample_rate
734
+ )
735
+ self.sample_rate = sample_rate
736
+ return self
737
+
738
+ # Tensor operations
739
+ def to(self, device: str):
740
+ """Moves all tensors contained in signal to the specified device.
741
+
742
+ Parameters
743
+ ----------
744
+ device : str
745
+ Device to move AudioSignal onto. Typical values are
746
+ "cuda", "cpu", or "cuda:n" to specify the nth gpu.
747
+
748
+ Returns
749
+ -------
750
+ AudioSignal
751
+ AudioSignal with all tensors moved to specified device.
752
+ """
753
+ if self._loudness is not None:
754
+ self._loudness = self._loudness.to(device)
755
+ if self.stft_data is not None:
756
+ self.stft_data = self.stft_data.to(device)
757
+ if self.audio_data is not None:
758
+ self.audio_data = self.audio_data.to(device)
759
+ return self
760
+
761
+ def float(self):
762
+ """Calls ``.float()`` on ``self.audio_data``.
763
+
764
+ Returns
765
+ -------
766
+ AudioSignal
767
+ """
768
+ self.audio_data = self.audio_data.float()
769
+ return self
770
+
771
+ def cpu(self):
772
+ """Moves AudioSignal to cpu.
773
+
774
+ Returns
775
+ -------
776
+ AudioSignal
777
+ """
778
+ return self.to("cpu")
779
+
780
+ def cuda(self): # pragma: no cover
781
+ """Moves AudioSignal to cuda.
782
+
783
+ Returns
784
+ -------
785
+ AudioSignal
786
+ """
787
+ return self.to("cuda")
788
+
789
+ def numpy(self):
790
+ """Detaches ``self.audio_data``, moves to cpu, and converts to numpy.
791
+
792
+ Returns
793
+ -------
794
+ np.ndarray
795
+ Audio data as a numpy array.
796
+ """
797
+ return self.audio_data.detach().cpu().numpy()
798
+
799
+ def zero_pad(self, before: int, after: int):
800
+ """Zero pads the audio_data tensor before and after.
801
+
802
+ Parameters
803
+ ----------
804
+ before : int
805
+ How many zeros to prepend to audio.
806
+ after : int
807
+ How many zeros to append to audio.
808
+
809
+ Returns
810
+ -------
811
+ AudioSignal
812
+ AudioSignal with padding applied.
813
+ """
814
+ self.audio_data = torch.nn.functional.pad(self.audio_data, (before, after))
815
+ return self
816
+
817
+ def zero_pad_to(self, length: int, mode: str = "after"):
818
+ """Pad with zeros to a specified length, either before or after
819
+ the audio data.
820
+
821
+ Parameters
822
+ ----------
823
+ length : int
824
+ Length to pad to
825
+ mode : str, optional
826
+ Whether to prepend or append zeros to signal, by default "after"
827
+
828
+ Returns
829
+ -------
830
+ AudioSignal
831
+ AudioSignal with padding applied.
832
+ """
833
+ if mode == "before":
834
+ self.zero_pad(max(length - self.signal_length, 0), 0)
835
+ elif mode == "after":
836
+ self.zero_pad(0, max(length - self.signal_length, 0))
837
+ return self
838
+
839
+ def trim(self, before: int, after: int):
840
+ """Trims the audio_data tensor before and after.
841
+
842
+ Parameters
843
+ ----------
844
+ before : int
845
+ How many samples to trim from beginning.
846
+ after : int
847
+ How many samples to trim from end.
848
+
849
+ Returns
850
+ -------
851
+ AudioSignal
852
+ AudioSignal with trimming applied.
853
+ """
854
+ if after == 0:
855
+ self.audio_data = self.audio_data[..., before:]
856
+ else:
857
+ self.audio_data = self.audio_data[..., before:-after]
858
+ return self
859
+
860
+ def truncate_samples(self, length_in_samples: int):
861
+ """Truncate signal to specified length.
862
+
863
+ Parameters
864
+ ----------
865
+ length_in_samples : int
866
+ Truncate to this many samples.
867
+
868
+ Returns
869
+ -------
870
+ AudioSignal
871
+ AudioSignal with truncation applied.
872
+ """
873
+ self.audio_data = self.audio_data[..., :length_in_samples]
874
+ return self
875
+
876
+ @property
877
+ def device(self):
878
+ """Get device that AudioSignal is on.
879
+
880
+ Returns
881
+ -------
882
+ torch.device
883
+ Device that AudioSignal is on.
884
+ """
885
+ if self.audio_data is not None:
886
+ device = self.audio_data.device
887
+ elif self.stft_data is not None:
888
+ device = self.stft_data.device
889
+ return device
890
+
891
+ # Properties
892
+ @property
893
+ def audio_data(self):
894
+ """Returns the audio data tensor in the object.
895
+
896
+ Audio data is always of the shape
897
+ (batch_size, num_channels, num_samples). If value has less
898
+ than 3 dims (e.g. is (num_channels, num_samples)), then it will
899
+ be reshaped to (1, num_channels, num_samples) - a batch size of 1.
900
+
901
+ Parameters
902
+ ----------
903
+ data : typing.Union[torch.Tensor, np.ndarray]
904
+ Audio data to set.
905
+
906
+ Returns
907
+ -------
908
+ torch.Tensor
909
+ Audio samples.
910
+ """
911
+ return self._audio_data
912
+
913
+ @audio_data.setter
914
+ def audio_data(self, data: typing.Union[torch.Tensor, np.ndarray]):
915
+ if data is not None:
916
+ assert torch.is_tensor(data), "audio_data should be torch.Tensor"
917
+ assert data.ndim == 3, "audio_data should be 3-dim (B, C, T)"
918
+ self._audio_data = data
919
+ # Old loudness value not guaranteed to be right, reset it.
920
+ self._loudness = None
921
+ return
922
+
923
+ # alias for audio_data
924
+ samples = audio_data
925
+
926
+ @property
927
+ def stft_data(self):
928
+ """Returns the STFT data inside the signal. Shape is
929
+ (batch, channels, frequencies, time).
930
+
931
+ Returns
932
+ -------
933
+ torch.Tensor
934
+ Complex spectrogram data.
935
+ """
936
+ return self._stft_data
937
+
938
+ @stft_data.setter
939
+ def stft_data(self, data: typing.Union[torch.Tensor, np.ndarray]):
940
+ if data is not None:
941
+ assert torch.is_tensor(data) and torch.is_complex(data)
942
+ if self.stft_data is not None and self.stft_data.shape != data.shape:
943
+ warnings.warn("stft_data changed shape")
944
+ self._stft_data = data
945
+ return
946
+
+    @property
+    def batch_size(self):
+        """Batch size of audio signal.
+
+        Returns
+        -------
+        int
+            Batch size of signal.
+        """
+        return self.audio_data.shape[0]
+
+    @property
+    def signal_length(self):
+        """Length of audio signal.
+
+        Returns
+        -------
+        int
+            Length of signal in samples.
+        """
+        return self.audio_data.shape[-1]
+
+    # alias for signal_length
+    length = signal_length
+
+    @property
+    def shape(self):
+        """Shape of audio data.
+
+        Returns
+        -------
+        tuple
+            Shape of audio data.
+        """
+        return self.audio_data.shape
+
+    @property
+    def signal_duration(self):
+        """Length of audio signal in seconds.
+
+        Returns
+        -------
+        float
+            Length of signal in seconds.
+        """
+        return self.signal_length / self.sample_rate
+
+    # alias for signal_duration
+    duration = signal_duration
+
+    @property
+    def num_channels(self):
+        """Number of audio channels.
+
+        Returns
+        -------
+        int
+            Number of audio channels.
+        """
+        return self.audio_data.shape[1]
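+    # A quick sketch tying the shape properties together for a stereo,
+    # one-second signal at 44.1 kHz:
+    #
+    #   >>> signal = AudioSignal(torch.randn(2, 44100), 44100)
+    #   >>> signal.shape
+    #   torch.Size([1, 2, 44100])
+    #   >>> (signal.batch_size, signal.num_channels, signal.length)
+    #   (1, 2, 44100)
+    #   >>> signal.duration  # signal_length / sample_rate
+    #   1.0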
+
+    # STFT
+    @staticmethod
+    @functools.lru_cache(None)
+    def get_window(window_type: str, window_length: int, device: str):
+        """Wrapper around scipy.signal.get_window so one can also get the
+        popular sqrt-hann window. This function caches for efficiency
+        using functools.lru_cache.
+
+        Parameters
+        ----------
+        window_type : str
+            Type of window to get
+        window_length : int
+            Length of the window
+        device : str
+            Device to put window onto.
+
+        Returns
+        -------
+        torch.Tensor
+            Window returned by scipy.signal.get_window, as a tensor.
+        """
+        from scipy import signal
+
+        if window_type == "average":
+            window = np.ones(window_length) / window_length
+        elif window_type == "sqrt_hann":
+            window = np.sqrt(signal.get_window("hann", window_length))
+        else:
+            window = signal.get_window(window_type, window_length)
+        window = torch.from_numpy(window).to(device).float()
+        return window
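+    # Sketch: the "sqrt_hann" branch above is the square root of a hann
+    # window, and lru_cache means repeated calls with the same
+    # (type, length, device) arguments return the same cached tensor:
+    #
+    #   >>> w1 = AudioSignal.get_window("sqrt_hann", 2048, "cpu")
+    #   >>> w2 = AudioSignal.get_window("sqrt_hann", 2048, "cpu")
+    #   >>> w1 is w2
+    #   True
+    #   >>> w1.shape
+    #   torch.Size([2048])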
+
+    @property
+    def stft_params(self):
+        """Returns STFTParams object, which can be re-used for other
+        AudioSignals.
+
+        This property can be set as well. If values are not defined in STFTParams,
+        they are inferred automatically from the signal properties. The default is to use
+        32ms windows, with 8ms hop length, and the hann window.
+
+        Returns
+        -------
+        STFTParams
+            STFT parameters for the AudioSignal.
+
+        Examples
+        --------
+        >>> stft_params = STFTParams(128, 32)
+        >>> signal1 = AudioSignal(torch.randn(44100), 44100, stft_params=stft_params)
+        >>> signal2 = AudioSignal(torch.randn(44100), 44100, stft_params=signal1.stft_params)
+        >>> signal1.stft_params = STFTParams() # Defaults
+        """
+        return self._stft_params
+
+    @stft_params.setter
+    def stft_params(self, value: STFTParams):
+        default_win_len = int(2 ** (np.ceil(np.log2(0.032 * self.sample_rate))))
+        default_hop_len = default_win_len // 4
+        default_win_type = "hann"
+        default_match_stride = False
+        default_padding_type = "reflect"
+
+        default_stft_params = STFTParams(
+            window_length=default_win_len,
+            hop_length=default_hop_len,
+            window_type=default_win_type,
+            match_stride=default_match_stride,
+            padding_type=default_padding_type,
+        )._asdict()
+
+        value = value._asdict() if value else default_stft_params
+
+        for key in default_stft_params:
+            if value[key] is None:
+                value[key] = default_stft_params[key]
+
+        self._stft_params = STFTParams(**value)
+        self.stft_data = None
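+    # The defaults above, worked through for sample_rate == 44100:
+    # 0.032 * 44100 = 1411.2 samples, and 2 ** ceil(log2(1411.2)) = 2048,
+    # so window_length defaults to 2048 and hop_length to 2048 // 4 == 512
+    # (roughly the 32 ms / 8 ms mentioned in the property docstring).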
+
+    def compute_stft_padding(
+        self, window_length: int, hop_length: int, match_stride: bool
+    ):
+        """Compute how the STFT should be padded, based on match_stride.
+
+        Parameters
+        ----------
+        window_length : int
+            Window length of STFT.
+        hop_length : int
+            Hop length of STFT.
+        match_stride : bool
+            Whether or not to match stride, making the STFT have the same alignment as
+            convolutional layers.
+
+        Returns
+        -------
+        tuple
+            Amount to pad on either side of audio.
+        """
+        length = self.signal_length
+
+        if match_stride:
+            assert (
+                hop_length == window_length // 4
+            ), "For match_stride, hop must equal n_fft // 4"
+            right_pad = math.ceil(length / hop_length) * hop_length - length
+            pad = (window_length - hop_length) // 2
+        else:
+            right_pad = 0
+            pad = 0
+
+        return right_pad, pad
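+    # Worked example: for a 1 s, 44.1 kHz signal with window_length=2048,
+    # hop_length=512 and match_stride=True,
+    #   right_pad = ceil(44100 / 512) * 512 - 44100 = 44544 - 44100 = 444
+    #   pad       = (2048 - 512) // 2 = 768
+    # and with match_stride=False both are 0.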
+
+    def stft(
+        self,
+        window_length: int = None,
+        hop_length: int = None,
+        window_type: str = None,
+        match_stride: bool = None,
+        padding_type: str = None,
+    ):
+        """Computes the short-time Fourier transform of the audio data,
+        with specified STFT parameters.
+
+        Parameters
+        ----------
+        window_length : int, optional
+            Window length of STFT, by default ``0.032 * self.sample_rate``.
+        hop_length : int, optional
+            Hop length of STFT, by default ``window_length // 4``.
+        window_type : str, optional
+            Type of window to use, by default ``sqrt_hann``.
+        match_stride : bool, optional
+            Whether to match the stride of convolutional layers, by default False
+        padding_type : str, optional
+            Type of padding to use, by default 'reflect'
+
+        Returns
+        -------
+        torch.Tensor
+            STFT of audio data.
+
+        Examples
+        --------
+        Compute the STFT of an AudioSignal:
+
+        >>> signal = AudioSignal(torch.randn(44100), 44100)
+        >>> signal.stft()
+
+        Vary the window and hop length:
+
+        >>> stft_params = [STFTParams(128, 32), STFTParams(512, 128)]
+        >>> for stft_param in stft_params:
+        >>>     signal.stft_params = stft_param
+        >>>     signal.stft()
+
+        """
+        window_length = (
+            self.stft_params.window_length
+            if window_length is None
+            else int(window_length)
+        )
+        hop_length = (
+            self.stft_params.hop_length if hop_length is None else int(hop_length)
+        )
+        window_type = (
+            self.stft_params.window_type if window_type is None else window_type
+        )
+        match_stride = (
+            self.stft_params.match_stride if match_stride is None else match_stride
+        )
+        padding_type = (
+            self.stft_params.padding_type if padding_type is None else padding_type
+        )
+
+        window = self.get_window(window_type, window_length, self.audio_data.device)
+        window = window.to(self.audio_data.device)
+
+        audio_data = self.audio_data
+        right_pad, pad = self.compute_stft_padding(
+            window_length, hop_length, match_stride
+        )
+        audio_data = torch.nn.functional.pad(
+            audio_data, (pad, pad + right_pad), padding_type
+        )
+        stft_data = torch.stft(
+            audio_data.reshape(-1, audio_data.shape[-1]),
+            n_fft=window_length,
+            hop_length=hop_length,
+            window=window,
+            return_complex=True,
+            center=True,
+        )
+        _, nf, nt = stft_data.shape
+        stft_data = stft_data.reshape(self.batch_size, self.num_channels, nf, nt)
+
+        if match_stride:
+            # Drop first two and last two frames, which are added
+            # because of padding. Now num_frames * hop_length = num_samples.
+            stft_data = stft_data[..., 2:-2]
+        self.stft_data = stft_data
+
+        return stft_data
+
+    def istft(
+        self,
+        window_length: int = None,
+        hop_length: int = None,
+        window_type: str = None,
+        match_stride: bool = None,
+        length: int = None,
+    ):
+        """Computes inverse STFT and sets it to audio_data.
+
+        Parameters
+        ----------
+        window_length : int, optional
+            Window length of STFT, by default ``0.032 * self.sample_rate``.
+        hop_length : int, optional
+            Hop length of STFT, by default ``window_length // 4``.
+        window_type : str, optional
+            Type of window to use, by default ``sqrt_hann``.
+        match_stride : bool, optional
+            Whether to match the stride of convolutional layers, by default False
+        length : int, optional
+            Original length of signal, by default None
+
+        Returns
+        -------
+        AudioSignal
+            AudioSignal with istft applied.
+
+        Raises
+        ------
+        RuntimeError
+            Raises an error if stft was not called prior to istft on the signal,
+            or if stft_data is not set.
+        """
+        if self.stft_data is None:
+            raise RuntimeError("Cannot do inverse STFT without self.stft_data!")
+
+        window_length = (
+            self.stft_params.window_length
+            if window_length is None
+            else int(window_length)
+        )
+        hop_length = (
+            self.stft_params.hop_length if hop_length is None else int(hop_length)
+        )
+        window_type = (
+            self.stft_params.window_type if window_type is None else window_type
+        )
+        match_stride = (
+            self.stft_params.match_stride if match_stride is None else match_stride
+        )
+
+        window = self.get_window(window_type, window_length, self.stft_data.device)
+
+        nb, nch, nf, nt = self.stft_data.shape
+        stft_data = self.stft_data.reshape(nb * nch, nf, nt)
+        right_pad, pad = self.compute_stft_padding(
+            window_length, hop_length, match_stride
+        )
+
+        if length is None:
+            length = self.original_signal_length
+            length = length + 2 * pad + right_pad
+
+        if match_stride:
+            # Zero-pad the STFT on either side, putting back the frames that were
+            # dropped in stft().
+            stft_data = torch.nn.functional.pad(stft_data, (2, 2))
+
+        audio_data = torch.istft(
+            stft_data,
+            n_fft=window_length,
+            hop_length=hop_length,
+            window=window,
+            length=length,
+            center=True,
+        )
+        audio_data = audio_data.reshape(nb, nch, -1)
+        if match_stride:
+            audio_data = audio_data[..., pad : -(pad + right_pad)]
+        self.audio_data = audio_data
+
+        return self
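+    # Round-trip sketch: stft() followed by istft() reconstructs the
+    # original samples up to numerical precision, with the length taken
+    # from original_signal_length when `length` is not passed:
+    #
+    #   >>> signal = AudioSignal(torch.randn(44100), 44100)
+    #   >>> reference = signal.audio_data.clone()
+    #   >>> _ = signal.stft()
+    #   >>> _ = signal.istft()
+    #   >>> torch.allclose(signal.audio_data, reference, atol=1e-5)
+    #   True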
+
+    @staticmethod
+    @functools.lru_cache(None)
+    def get_mel_filters(
+        sr: int, n_fft: int, n_mels: int, fmin: float = 0.0, fmax: float = None
+    ):
+        """Create a Filterbank matrix to combine FFT bins into Mel-frequency bins.
+
+        Parameters
+        ----------
+        sr : int
+            Sample rate of audio
+        n_fft : int
+            Number of FFT bins
+        n_mels : int
+            Number of mels
+        fmin : float, optional
+            Lowest frequency, in Hz, by default 0.0
+        fmax : float, optional
+            Highest frequency, by default None
+
+        Returns
+        -------
+        np.ndarray [shape=(n_mels, 1 + n_fft/2)]
+            Mel transform matrix
+        """
+        from librosa.filters import mel as librosa_mel_fn
+
+        return librosa_mel_fn(
+            sr=sr,
+            n_fft=n_fft,
+            n_mels=n_mels,
+            fmin=fmin,
+            fmax=fmax,
+        )
+
+    def mel_spectrogram(
+        self, n_mels: int = 80, mel_fmin: float = 0.0, mel_fmax: float = None, **kwargs
+    ):
+        """Computes a Mel spectrogram.
+
+        Parameters
+        ----------
+        n_mels : int, optional
+            Number of mels, by default 80
+        mel_fmin : float, optional
+            Lowest frequency, in Hz, by default 0.0
+        mel_fmax : float, optional
+            Highest frequency, by default None
+        kwargs : dict, optional
+            Keyword arguments to self.stft().
+
+        Returns
+        -------
+        torch.Tensor [shape=(batch, channels, mels, time)]
+            Mel spectrogram.
+        """
+        stft = self.stft(**kwargs)
+        magnitude = torch.abs(stft)
+
+        nf = magnitude.shape[2]
+        mel_basis = self.get_mel_filters(
+            sr=self.sample_rate,
+            n_fft=2 * (nf - 1),
+            n_mels=n_mels,
+            fmin=mel_fmin,
+            fmax=mel_fmax,
+        )
+        mel_basis = torch.from_numpy(mel_basis).to(self.device)
+
+        mel_spectrogram = magnitude.transpose(2, -1) @ mel_basis.T
+        mel_spectrogram = mel_spectrogram.transpose(-1, 2)
+        return mel_spectrogram
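+    # Shape sketch: with the default 2048/512 STFT on a mono one-second
+    # 44.1 kHz signal, the output is (batch, channels, n_mels, frames):
+    #
+    #   >>> signal = AudioSignal(torch.randn(44100), 44100)
+    #   >>> signal.mel_spectrogram(n_mels=80).shape
+    #   torch.Size([1, 1, 80, 87])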
+
+    @staticmethod
+    @functools.lru_cache(None)
+    def get_dct(n_mfcc: int, n_mels: int, norm: str = "ortho", device: str = None):
+        """Create a discrete cosine transform (DCT) transformation matrix with shape (``n_mels``, ``n_mfcc``),
+        normalized depending on norm. For more information about dct:
+        http://en.wikipedia.org/wiki/Discrete_cosine_transform#DCT-II
+
+        Parameters
+        ----------
+        n_mfcc : int
+            Number of mfccs
+        n_mels : int
+            Number of mels
+        norm : str
+            Use "ortho" to get an orthogonal matrix, or None; by default "ortho"
+        device : str, optional
+            Device to load the transformation matrix on, by default None
+
+        Returns
+        -------
+        torch.Tensor [shape=(n_mels, n_mfcc)]
+            The dct transformation matrix.
+        """
+        from torchaudio.functional import create_dct
+
+        return create_dct(n_mfcc, n_mels, norm).to(device)
+
+    def mfcc(
+        self, n_mfcc: int = 40, n_mels: int = 80, log_offset: float = 1e-6, **kwargs
+    ):
+        """Computes mel-frequency cepstral coefficients (MFCCs).
+
+        Parameters
+        ----------
+        n_mfcc : int, optional
+            Number of mfccs, by default 40
+        n_mels : int, optional
+            Number of mels, by default 80
+        log_offset : float, optional
+            Small value to prevent numerical issues when trying to compute log(0), by default 1e-6
+        kwargs : dict, optional
+            Keyword arguments to self.mel_spectrogram(), note that some of them will be used for self.stft()
+
+        Returns
+        -------
+        torch.Tensor [shape=(batch, channels, mfccs, time)]
+            MFCCs.
+        """
+
+        mel_spectrogram = self.mel_spectrogram(n_mels, **kwargs)
+        mel_spectrogram = torch.log(mel_spectrogram + log_offset)
+        dct_mat = self.get_dct(n_mfcc, n_mels, "ortho", self.device)
+
+        mfcc = mel_spectrogram.transpose(-1, -2) @ dct_mat
+        mfcc = mfcc.transpose(-1, -2)
+        return mfcc
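+    # MFCC sketch following the same pipeline: a log-mel spectrogram times
+    # the cached DCT matrix, giving (batch, channels, n_mfcc, frames):
+    #
+    #   >>> signal = AudioSignal(torch.randn(44100), 44100)
+    #   >>> signal.mfcc(n_mfcc=40, n_mels=80).shape
+    #   torch.Size([1, 1, 40, 87])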
+
+    @property
+    def magnitude(self):
+        """Computes and returns the absolute value of the STFT, which
+        is the magnitude. This value can also be set to some tensor.
+        When set, ``self.stft_data`` is manipulated so that its magnitude
+        matches what this is set to, modulated by the phase.
+
+        Returns
+        -------
+        torch.Tensor
+            Magnitude of STFT.
+
+        Examples
+        --------
+        >>> signal = AudioSignal(torch.randn(44100), 44100)
+        >>> magnitude = signal.magnitude # Computes stft if not computed
+        >>> magnitude[magnitude < magnitude.mean()] = 0
+        >>> signal.magnitude = magnitude
+        >>> signal.istft()
+        """
+        if self.stft_data is None:
+            self.stft()
+        return torch.abs(self.stft_data)
+
+    @magnitude.setter
+    def magnitude(self, value):
+        self.stft_data = value * torch.exp(1j * self.phase)
+        return
+
+    def log_magnitude(
+        self, ref_value: float = 1.0, amin: float = 1e-5, top_db: float = 80.0
+    ):
+        """Computes the log-magnitude of the spectrogram.
+
+        Parameters
+        ----------
+        ref_value : float, optional
+            The magnitude is scaled relative to ``ref``: ``20 * log10(S / ref)``.
+            Zeros in the output correspond to positions where ``S == ref``,
+            by default 1.0
+        amin : float, optional
+            Minimum threshold for ``S`` and ``ref``, by default 1e-5
+        top_db : float, optional
+            Threshold the output at ``top_db`` below the peak:
+            ``max(10 * log10(S/ref)) - top_db``, by default 80.0
+
+        Returns
+        -------
+        torch.Tensor
+            Log-magnitude spectrogram
+        """
+        magnitude = self.magnitude
+
+        amin = amin**2
+        log_spec = 10.0 * torch.log10(magnitude.pow(2).clamp(min=amin))
+        log_spec -= 10.0 * np.log10(np.maximum(amin, ref_value))
+
+        if top_db is not None:
+            log_spec = torch.maximum(log_spec, log_spec.max() - top_db)
+        return log_spec
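+    # With the defaults (ref_value=1.0, amin=1e-5, top_db=80.0): a magnitude
+    # of 1.0 maps to 10 * log10(1.0**2) = 0 dB, anything below amin is
+    # clamped to 10 * log10(1e-10) = -100 dB, and values more than 80 dB
+    # below the global peak are raised to peak - 80.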
+
+    @property
+    def phase(self):
+        """Computes and returns the phase of the STFT.
+        This value can also be set to some tensor.
+        When set, ``self.stft_data`` is manipulated so that its phase
+        matches what this is set to, modulated by the original magnitude.
+
+        Returns
+        -------
+        torch.Tensor
+            Phase of STFT.
+
+        Examples
+        --------
+        >>> signal = AudioSignal(torch.randn(44100), 44100)
+        >>> phase = signal.phase # Computes stft if not computed
+        >>> phase[phase < phase.mean()] = 0
+        >>> signal.phase = phase
+        >>> signal.istft()
+        """
+        if self.stft_data is None:
+            self.stft()
+        return torch.angle(self.stft_data)
+
+    @phase.setter
+    def phase(self, value):
+        self.stft_data = self.magnitude * torch.exp(1j * value)
+        return
+
+    # Operator overloading
+    def __add__(self, other):
+        new_signal = self.clone()
+        new_signal.audio_data += util._get_value(other)
+        return new_signal
+
+    def __iadd__(self, other):
+        self.audio_data += util._get_value(other)
+        return self
+
+    def __radd__(self, other):
+        return self + other
+
+    def __sub__(self, other):
+        new_signal = self.clone()
+        new_signal.audio_data -= util._get_value(other)
+        return new_signal
+
+    def __isub__(self, other):
+        self.audio_data -= util._get_value(other)
+        return self
+
+    def __mul__(self, other):
+        new_signal = self.clone()
+        new_signal.audio_data *= util._get_value(other)
+        return new_signal
+
+    def __imul__(self, other):
+        self.audio_data *= util._get_value(other)
+        return self
+
+    def __rmul__(self, other):
+        return self * other
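+    # Operator sketch: the binary forms return a new AudioSignal (via
+    # clone()), while the augmented forms mutate in place. Assuming `clean`
+    # and `noise` are AudioSignals with matching shapes:
+    #
+    #   >>> noisy = clean + 0.1 * noise   # new signal; operands unchanged
+    #   >>> clean *= 0.5                  # in-place gain via __imul__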
+
+    # Representation
+    def _info(self):
+        dur = f"{self.signal_duration:0.3f}" if self.signal_duration else "[unknown]"
+        info = {
+            "duration": f"{dur} seconds",
+            "batch_size": self.batch_size,
+            "path": self.path_to_file if self.path_to_file else "path unknown",
+            "sample_rate": self.sample_rate,
+            "num_channels": self.num_channels if self.num_channels else "[unknown]",
+            "audio_data.shape": self.audio_data.shape,
+            "stft_params": self.stft_params,
+            "device": self.device,
+        }
+
+        return info
+
+    def markdown(self):
+        """Produces a markdown representation of AudioSignal, in a markdown table.
+
+        Returns
+        -------
+        str
+            Markdown representation of AudioSignal.
+
+        Examples
+        --------
+        >>> signal = AudioSignal(torch.randn(44100), 44100)
+        >>> print(signal.markdown())
+        | Key | Value
+        |---|---
+        | duration | 1.000 seconds |
+        | batch_size | 1 |
+        | path | path unknown |
+        | sample_rate | 44100 |
+        | num_channels | 1 |
+        | audio_data.shape | torch.Size([1, 1, 44100]) |
+        | stft_params | STFTParams(window_length=2048, hop_length=512, window_type='sqrt_hann', match_stride=False) |
+        | device | cpu |
+        """
+        info = self._info()
+
+        FORMAT = "| Key | Value \n" "|---|--- \n"
+        for k, v in info.items():
+            row = f"| {k} | {v} |\n"
+            FORMAT += row
+        return FORMAT
+
+    def __str__(self):
+        info = self._info()
+
+        desc = ""
+        for k, v in info.items():
+            desc += f"{k}: {v}\n"
+        return desc
+
+    def __rich__(self):
+        from rich.table import Table
+
+        info = self._info()
+
+        table = Table(title=f"{self.__class__.__name__}")
+        table.add_column("Key", style="green")
+        table.add_column("Value", style="cyan")
+
+        for k, v in info.items():
+            table.add_row(k, str(v))
+        return table
+
+    # Comparison
+    def __eq__(self, other):
+        for k, v in list(self.__dict__.items()):
+            if torch.is_tensor(v):
+                if not torch.allclose(v, other.__dict__[k], atol=1e-6):
+                    max_error = (v - other.__dict__[k]).abs().max()
+                    print(f"Max abs error for {k}: {max_error}")
+                    return False
+        return True
+
+    # Indexing
+    def __getitem__(self, key):
+        if torch.is_tensor(key) and key.ndim == 0 and key.item() is True:
+            assert self.batch_size == 1
+            audio_data = self.audio_data
+            _loudness = self._loudness
+            stft_data = self.stft_data
+
+        elif isinstance(key, (bool, int, list, slice, tuple)) or (
+            torch.is_tensor(key) and key.ndim <= 1
+        ):
+            # Indexing only on the batch dimension.
+            # Then let's copy over relevant stuff.
+            # Future work: make this work for time-indexing
+            # as well, using the hop length.
+            audio_data = self.audio_data[key]
+            _loudness = self._loudness[key] if self._loudness is not None else None
+            stft_data = self.stft_data[key] if self.stft_data is not None else None
+
+        sources = None
+
+        copy = type(self)(audio_data, self.sample_rate, stft_params=self.stft_params)
+        copy._loudness = _loudness
+        copy._stft_data = stft_data
+        copy.sources = sources
+
+        return copy
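+    # Indexing sketch: indexing acts on the batch dimension only, carrying
+    # the loudness and STFT caches along to the copy:
+    #
+    #   >>> batch = AudioSignal(torch.randn(4, 1, 44100), 44100)
+    #   >>> first = batch[0]
+    #   >>> first.batch_size
+    #   1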
+
+    def __setitem__(self, key, value):
+        if not isinstance(value, type(self)):
+            self.audio_data[key] = value
+            return
+
+        if torch.is_tensor(key) and key.ndim == 0 and key.item() is True:
+            assert self.batch_size == 1
+            self.audio_data = value.audio_data
+            self._loudness = value._loudness
+            self.stft_data = value.stft_data
+            return
+
+        elif isinstance(key, (bool, int, list, slice, tuple)) or (
+            torch.is_tensor(key) and key.ndim <= 1
+        ):
+            if self.audio_data is not None and value.audio_data is not None:
+                self.audio_data[key] = value.audio_data
+            if self._loudness is not None and value._loudness is not None:
+                self._loudness[key] = value._loudness
+            if self.stft_data is not None and value.stft_data is not None:
+                self.stft_data[key] = value.stft_data
+            return
+
+    def __ne__(self, other):
+        return not self == other