PyPI - xinference - Versions diffs - 1.9.1__py3-none-any.whl → 1.10.1__py3-none-any.whl - Mend - Supply Chain Defender

xinference 1.9.1py3-none-any.whl → 1.10.1py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release.

This version of xinference might be problematic. Click here for more details.

Files changed (334) hide show

xinference/thirdparty/audiotools/core/display.py ADDED Viewed

@@ -0,0 +1,194 @@
+import inspect
+import typing
+from functools import wraps
+from . import util
+def format_figure(func):
+    """Decorator for formatting figures produced by the code below.
+    See :py:func:`audiotools.core.util.format_figure` for more.
+    Parameters
+    ----------
+    func : Callable
+        Plotting function that is decorated by this function.
+    """
+    @wraps(func)
+    def wrapper(*args, **kwargs):
+        f_keys = inspect.signature(util.format_figure).parameters.keys()
+        f_kwargs = {}
+        for k, v in list(kwargs.items()):
+            if k in f_keys:
+                kwargs.pop(k)
+                f_kwargs[k] = v
+        func(*args, **kwargs)
+        util.format_figure(**f_kwargs)
+    return wrapper
+class DisplayMixin:
+    @format_figure
+    def specshow(
+        self,
+        preemphasis: bool = False,
+        x_axis: str = "time",
+        y_axis: str = "linear",
+        n_mels: int = 128,
+        **kwargs,
+    ):
+        """Displays a spectrogram, using ``librosa.display.specshow``.
+        Parameters
+        ----------
+        preemphasis : bool, optional
+            Whether or not to apply preemphasis, which makes high
+            frequency detail easier to see, by default False
+        x_axis : str, optional
+            How to label the x axis, by default "time"
+        y_axis : str, optional
+            How to label the y axis, by default "linear"
+        n_mels : int, optional
+            If displaying a mel spectrogram with ``y_axis = "mel"``,
+            this controls the number of mels, by default 128.
+        kwargs : dict, optional
+            Keyword arguments to :py:func:`audiotools.core.util.format_figure`.
+        """
+        import librosa
+        import librosa.display
+        # Always re-compute the STFT data before showing it, in case
+        # it changed.
+        signal = self.clone()
+        signal.stft_data = None
+        if preemphasis:
+            signal.preemphasis()
+        ref = signal.magnitude.max()
+        log_mag = signal.log_magnitude(ref_value=ref)
+        if y_axis == "mel":
+            log_mag = 20 * signal.mel_spectrogram(n_mels).clamp(1e-5).log10()
+            log_mag -= log_mag.max()
+        librosa.display.specshow(
+            log_mag.numpy()[0].mean(axis=0),
+            x_axis=x_axis,
+            y_axis=y_axis,
+            sr=signal.sample_rate,
+            **kwargs,
+        )
+    @format_figure
+    def waveplot(self, x_axis: str = "time", **kwargs):
+        """Displays a waveform plot, using ``librosa.display.waveshow``.
+        Parameters
+        ----------
+        x_axis : str, optional
+            How to label the x axis, by default "time"
+        kwargs : dict, optional
+            Keyword arguments to :py:func:`audiotools.core.util.format_figure`.
+        """
+        import librosa
+        import librosa.display
+        audio_data = self.audio_data[0].mean(dim=0)
+        audio_data = audio_data.cpu().numpy()
+        plot_fn = "waveshow" if hasattr(librosa.display, "waveshow") else "waveplot"
+        wave_plot_fn = getattr(librosa.display, plot_fn)
+        wave_plot_fn(audio_data, x_axis=x_axis, sr=self.sample_rate, **kwargs)
+    @format_figure
+    def wavespec(self, x_axis: str = "time", **kwargs):
+        """Displays a waveform plot, using ``librosa.display.waveshow``.
+        Parameters
+        ----------
+        x_axis : str, optional
+            How to label the x axis, by default "time"
+        kwargs : dict, optional
+            Keyword arguments to :py:func:`audiotools.core.display.DisplayMixin.specshow`.
+        """
+        import matplotlib.pyplot as plt
+        from matplotlib.gridspec import GridSpec
+        gs = GridSpec(6, 1)
+        plt.subplot(gs[0, :])
+        self.waveplot(x_axis=x_axis)
+        plt.subplot(gs[1:, :])
+        self.specshow(x_axis=x_axis, **kwargs)
+    def write_audio_to_tb(
+        self,
+        tag: str,
+        writer,
+        step: int = None,
+        plot_fn: typing.Union[typing.Callable, str] = "specshow",
+        **kwargs,
+    ):
+        """Writes a signal and its spectrogram to Tensorboard. Will show up
+        under the Audio and Images tab in Tensorboard.
+        Parameters
+        ----------
+        tag : str
+            Tag to write signal to (e.g. ``clean/sample_0.wav``). The image will be
+            written to the corresponding ``.png`` file (e.g. ``clean/sample_0.png``).
+        writer : SummaryWriter
+            A SummaryWriter object from PyTorch library.
+        step : int, optional
+            The step to write the signal to, by default None
+        plot_fn : typing.Union[typing.Callable, str], optional
+            How to create the image. Set to ``None`` to avoid plotting, by default "specshow"
+        kwargs : dict, optional
+            Keyword arguments to :py:func:`audiotools.core.display.DisplayMixin.specshow` or
+            whatever ``plot_fn`` is set to.
+        """
+        import matplotlib.pyplot as plt
+        audio_data = self.audio_data[0, 0].detach().cpu()
+        sample_rate = self.sample_rate
+        writer.add_audio(tag, audio_data, step, sample_rate)
+        if plot_fn is not None:
+            if isinstance(plot_fn, str):
+                plot_fn = getattr(self, plot_fn)
+            fig = plt.figure()
+            plt.clf()
+            plot_fn(**kwargs)
+            writer.add_figure(tag.replace("wav", "png"), fig, step)
+    def save_image(
+        self,
+        image_path: str,
+        plot_fn: typing.Union[typing.Callable, str] = "specshow",
+        **kwargs,
+    ):
+        """Save AudioSignal spectrogram (or whatever ``plot_fn`` is set to) to
+        a specified file.
+        Parameters
+        ----------
+        image_path : str
+            Where to save the file to.
+        plot_fn : typing.Union[typing.Callable, str], optional
+            How to create the image. Set to ``None`` to avoid plotting, by default "specshow"
+        kwargs : dict, optional
+            Keyword arguments to :py:func:`audiotools.core.display.DisplayMixin.specshow` or
+            whatever ``plot_fn`` is set to.
+        """
+        import matplotlib.pyplot as plt
+        if isinstance(plot_fn, str):
+            plot_fn = getattr(self, plot_fn)
+        plt.clf()
+        plot_fn(**kwargs)
+        plt.savefig(image_path, bbox_inches="tight", pad_inches=0)
+        plt.close()

xinference/thirdparty/audiotools/core/dsp.py ADDED Viewed

@@ -0,0 +1,390 @@
+import typing
+import julius
+import numpy as np
+import torch
+from . import util
+class DSPMixin:
+    # Bookkeeping written by ``_preprocess_signal_for_windowing`` and read
+    # back by ``overlap_and_add``; each stays ``None`` until then.
+    _original_batch_size = None
+    _original_num_channels = None
+    _padded_signal_length = None
+    def _preprocess_signal_for_windowing(self, window_duration, hop_duration):
+        self._original_batch_size = self.batch_size
+        self._original_num_channels = self.num_channels
+        window_length = int(window_duration * self.sample_rate)
+        hop_length = int(hop_duration * self.sample_rate)
+        if window_length % hop_length != 0:
+            factor = window_length // hop_length
+            window_length = factor * hop_length
+        self.zero_pad(hop_length, hop_length)
+        self._padded_signal_length = self.signal_length
+        return window_length, hop_length
+    def windows(
+        self, window_duration: float, hop_duration: float, preprocess: bool = True
+    ):
+        """Generator which yields windows of specified duration from signal with a specified
+        hop length.
+        Parameters
+        ----------
+        window_duration : float
+            Duration of every window in seconds.
+        hop_duration : float
+            Hop between windows in seconds.
+        preprocess : bool, optional
+            Whether to preprocess the signal, so that the first sample is in
+            the middle of the first window, by default True
+        Yields
+        ------
+        AudioSignal
+            Each window is returned as an AudioSignal.
+        """
+        if preprocess:
+            window_length, hop_length = self._preprocess_signal_for_windowing(
+                window_duration, hop_duration
+            )
+        self.audio_data = self.audio_data.reshape(-1, 1, self.signal_length)
+        for b in range(self.batch_size):
+            i = 0
+            start_idx = i * hop_length
+            while True:
+                start_idx = i * hop_length
+                i += 1
+                end_idx = start_idx + window_length
+                if end_idx > self.signal_length:
+                    break
+                yield self[b, ..., start_idx:end_idx]
+    def collect_windows(
+        self, window_duration: float, hop_duration: float, preprocess: bool = True
+    ):
+        """Reshapes signal into windows of specified duration from signal with a specified
+        hop length. Window are placed along the batch dimension. Use with
+        :py:func:`audiotools.core.dsp.DSPMixin.overlap_and_add` to reconstruct the
+        original signal.
+        Parameters
+        ----------
+        window_duration : float
+            Duration of every window in seconds.
+        hop_duration : float
+            Hop between windows in seconds.
+        preprocess : bool, optional
+            Whether to preprocess the signal, so that the first sample is in
+            the middle of the first window, by default True
+        Returns
+        -------
+        AudioSignal
+            AudioSignal unfolded with shape ``(nb * nch * num_windows, 1, window_length)``
+        """
+        if preprocess:
+            window_length, hop_length = self._preprocess_signal_for_windowing(
+                window_duration, hop_duration
+            )
+        # self.audio_data: (nb, nch, nt).
+        unfolded = torch.nn.functional.unfold(
+            self.audio_data.reshape(-1, 1, 1, self.signal_length),
+            kernel_size=(1, window_length),
+            stride=(1, hop_length),
+        )
+        # unfolded: (nb * nch, window_length, num_windows).
+        # -> (nb * nch * num_windows, 1, window_length)
+        unfolded = unfolded.permute(0, 2, 1).reshape(-1, 1, window_length)
+        self.audio_data = unfolded
+        return self
+    def overlap_and_add(self, hop_duration: float):
+        """Function which takes a list of windows and overlap adds them into a
+        signal the same length as ``audio_signal``.
+        Parameters
+        ----------
+        hop_duration : float
+            How much to shift for each window
+            (overlap is window_duration - hop_duration) in seconds.
+        Returns
+        -------
+        AudioSignal
+            overlap-and-added signal.
+        """
+        hop_length = int(hop_duration * self.sample_rate)
+        window_length = self.signal_length
+        nb, nch = self._original_batch_size, self._original_num_channels
+        unfolded = self.audio_data.reshape(nb * nch, -1, window_length).permute(0, 2, 1)
+        folded = torch.nn.functional.fold(
+            unfolded,
+            output_size=(1, self._padded_signal_length),
+            kernel_size=(1, window_length),
+            stride=(1, hop_length),
+        )
+        norm = torch.ones_like(unfolded, device=unfolded.device)
+        norm = torch.nn.functional.fold(
+            norm,
+            output_size=(1, self._padded_signal_length),
+            kernel_size=(1, window_length),
+            stride=(1, hop_length),
+        )
+        folded = folded / norm
+        folded = folded.reshape(nb, nch, -1)
+        self.audio_data = folded
+        self.trim(hop_length, hop_length)
+        return self
+    def low_pass(
+        self, cutoffs: typing.Union[torch.Tensor, np.ndarray, float], zeros: int = 51
+    ):
+        """Low-passes the signal in-place. Each item in the batch
+        can have a different low-pass cutoff, if the input
+        to this signal is an array or tensor. If a float, all
+        items are given the same low-pass filter.
+        Parameters
+        ----------
+        cutoffs : typing.Union[torch.Tensor, np.ndarray, float]
+            Cutoff in Hz of low-pass filter.
+        zeros : int, optional
+            Number of taps to use in low-pass filter, by default 51
+        Returns
+        -------
+        AudioSignal
+            Low-passed AudioSignal.
+        """
+        cutoffs = util.ensure_tensor(cutoffs, 2, self.batch_size)
+        cutoffs = cutoffs / self.sample_rate
+        filtered = torch.empty_like(self.audio_data)
+        for i, cutoff in enumerate(cutoffs):
+            lp_filter = julius.LowPassFilter(cutoff.cpu(), zeros=zeros).to(self.device)
+            filtered[i] = lp_filter(self.audio_data[i])
+        self.audio_data = filtered
+        self.stft_data = None
+        return self
+    def high_pass(
+        self, cutoffs: typing.Union[torch.Tensor, np.ndarray, float], zeros: int = 51
+    ):
+        """High-passes the signal in-place. Each item in the batch
+        can have a different high-pass cutoff, if the input
+        to this signal is an array or tensor. If a float, all
+        items are given the same high-pass filter.
+        Parameters
+        ----------
+        cutoffs : typing.Union[torch.Tensor, np.ndarray, float]
+            Cutoff in Hz of high-pass filter.
+        zeros : int, optional
+            Number of taps to use in high-pass filter, by default 51
+        Returns
+        -------
+        AudioSignal
+            High-passed AudioSignal.
+        """
+        cutoffs = util.ensure_tensor(cutoffs, 2, self.batch_size)
+        cutoffs = cutoffs / self.sample_rate
+        filtered = torch.empty_like(self.audio_data)
+        for i, cutoff in enumerate(cutoffs):
+            hp_filter = julius.HighPassFilter(cutoff.cpu(), zeros=zeros).to(self.device)
+            filtered[i] = hp_filter(self.audio_data[i])
+        self.audio_data = filtered
+        self.stft_data = None
+        return self
+    def mask_frequencies(
+        self,
+        fmin_hz: typing.Union[torch.Tensor, np.ndarray, float],
+        fmax_hz: typing.Union[torch.Tensor, np.ndarray, float],
+        val: float = 0.0,
+    ):
+        """Masks frequencies between ``fmin_hz`` and ``fmax_hz``, and fills them
+        with the value specified by ``val``. Useful for implementing SpecAug.
+        The min and max can be different for every item in the batch.
+        Parameters
+        ----------
+        fmin_hz : typing.Union[torch.Tensor, np.ndarray, float]
+            Lower end of band to mask out.
+        fmax_hz : typing.Union[torch.Tensor, np.ndarray, float]
+            Upper end of band to mask out.
+        val : float, optional
+            Value to fill in, by default 0.0
+        Returns
+        -------
+        AudioSignal
+            Signal with ``stft_data`` manipulated. Apply ``.istft()`` to get the
+            masked audio data.
+        """
+        # SpecAug
+        mag, phase = self.magnitude, self.phase
+        fmin_hz = util.ensure_tensor(fmin_hz, ndim=mag.ndim)
+        fmax_hz = util.ensure_tensor(fmax_hz, ndim=mag.ndim)
+        assert torch.all(fmin_hz < fmax_hz)
+        # build mask
+        nbins = mag.shape[-2]
+        bins_hz = torch.linspace(0, self.sample_rate / 2, nbins, device=self.device)
+        bins_hz = bins_hz[None, None, :, None].repeat(
+            self.batch_size, 1, 1, mag.shape[-1]
+        )
+        mask = (fmin_hz <= bins_hz) & (bins_hz < fmax_hz)
+        mask = mask.to(self.device)
+        mag = mag.masked_fill(mask, val)
+        phase = phase.masked_fill(mask, val)
+        self.stft_data = mag * torch.exp(1j * phase)
+        return self
+    def mask_timesteps(
+        self,
+        tmin_s: typing.Union[torch.Tensor, np.ndarray, float],
+        tmax_s: typing.Union[torch.Tensor, np.ndarray, float],
+        val: float = 0.0,
+    ):
+        """Masks timesteps between ``tmin_s`` and ``tmax_s``, and fills them
+        with the value specified by ``val``. Useful for implementing SpecAug.
+        The min and max can be different for every item in the batch.
+        Parameters
+        ----------
+        tmin_s : typing.Union[torch.Tensor, np.ndarray, float]
+            Lower end of timesteps to mask out.
+        tmax_s : typing.Union[torch.Tensor, np.ndarray, float]
+            Upper end of timesteps to mask out.
+        val : float, optional
+            Value to fill in, by default 0.0
+        Returns
+        -------
+        AudioSignal
+            Signal with ``stft_data`` manipulated. Apply ``.istft()`` to get the
+            masked audio data.
+        """
+        # SpecAug
+        mag, phase = self.magnitude, self.phase
+        tmin_s = util.ensure_tensor(tmin_s, ndim=mag.ndim)
+        tmax_s = util.ensure_tensor(tmax_s, ndim=mag.ndim)
+        assert torch.all(tmin_s < tmax_s)
+        # build mask
+        nt = mag.shape[-1]
+        bins_t = torch.linspace(0, self.signal_duration, nt, device=self.device)
+        bins_t = bins_t[None, None, None, :].repeat(
+            self.batch_size, 1, mag.shape[-2], 1
+        )
+        mask = (tmin_s <= bins_t) & (bins_t < tmax_s)
+        mag = mag.masked_fill(mask, val)
+        phase = phase.masked_fill(mask, val)
+        self.stft_data = mag * torch.exp(1j * phase)
+        return self
+    def mask_low_magnitudes(
+        self, db_cutoff: typing.Union[torch.Tensor, np.ndarray, float], val: float = 0.0
+    ):
+        """Mask away magnitudes below a specified threshold, which
+        can be different for every item in the batch.
+        Parameters
+        ----------
+        db_cutoff : typing.Union[torch.Tensor, np.ndarray, float]
+            Decibel value for which things below it will be masked away.
+        val : float, optional
+            Value to fill in for masked portions, by default 0.0
+        Returns
+        -------
+        AudioSignal
+            Signal with ``stft_data`` manipulated. Apply ``.istft()`` to get the
+            masked audio data.
+        """
+        mag = self.magnitude
+        log_mag = self.log_magnitude()
+        db_cutoff = util.ensure_tensor(db_cutoff, ndim=mag.ndim)
+        mask = log_mag < db_cutoff
+        mag = mag.masked_fill(mask, val)
+        self.magnitude = mag
+        return self
+    def shift_phase(self, shift: typing.Union[torch.Tensor, np.ndarray, float]):
+        """Shifts the phase by a constant value.
+        Parameters
+        ----------
+        shift : typing.Union[torch.Tensor, np.ndarray, float]
+            What to shift the phase by.
+        Returns
+        -------
+        AudioSignal
+            Signal with ``stft_data`` manipulated. Apply ``.istft()`` to get the
+            masked audio data.
+        """
+        shift = util.ensure_tensor(shift, ndim=self.phase.ndim)
+        self.phase = self.phase + shift
+        return self
+    def corrupt_phase(self, scale: typing.Union[torch.Tensor, np.ndarray, float]):
+        """Corrupts the phase randomly by some scaled value.
+        Parameters
+        ----------
+        scale : typing.Union[torch.Tensor, np.ndarray, float]
+            Standard deviation of noise to add to the phase.
+        Returns
+        -------
+        AudioSignal
+            Signal with ``stft_data`` manipulated. Apply ``.istft()`` to get the
+            masked audio data.
+        """
+        scale = util.ensure_tensor(scale, ndim=self.phase.ndim)
+        self.phase = self.phase + scale * torch.randn_like(self.phase)
+        return self
+    def preemphasis(self, coef: float = 0.85):
+        """Applies pre-emphasis to audio signal.
+        Parameters
+        ----------
+        coef : float, optional
+            How much pre-emphasis to apply, lower values do less. 0 does nothing.
+            by default 0.85
+        Returns
+        -------
+        AudioSignal
+            Pre-emphasized signal.
+        """
+        kernel = torch.tensor([1, -coef, 0]).view(1, 1, -1).to(self.device)
+        x = self.audio_data.reshape(-1, 1, self.signal_length)
+        x = torch.nn.functional.conv1d(x, kernel, padding=1)
+        self.audio_data = x.reshape(*self.audio_data.shape)
+        return self