PyPI - xinference - Versions diffs - 1.10.0__py3-none-any.whl → 1.11.0__py3-none-any.whl - Mend

xinference 1.10.0__py3-none-any.whl → 1.11.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release.

This version of xinference might be problematic. Click here for more details.

Files changed (328) hide show

xinference/thirdparty/audiotools/metrics/__init__.py ADDED Viewed

@@ -0,0 +1,6 @@
+"""
+Functions for comparing AudioSignal objects to one another.
+"""  # fmt: skip
+from . import distance
+from . import quality
+from . import spectral

xinference/thirdparty/audiotools/metrics/distance.py ADDED Viewed

@@ -0,0 +1,131 @@
+import torch
+from torch import nn
+from .. import AudioSignal
+class L1Loss(nn.L1Loss):
+    """L1 Loss between AudioSignals. Defaults
+    to comparing ``audio_data``, but any
+    attribute of an AudioSignal can be used.
+    Parameters
+    ----------
+    attribute : str, optional
+        Attribute of signal to compare, defaults to ``audio_data``.
+    weight : float, optional
+        Weight of this loss, defaults to 1.0.
+    """
+    def __init__(self, attribute: str = "audio_data", weight: float = 1.0, **kwargs):
+        self.attribute = attribute
+        self.weight = weight
+        super().__init__(**kwargs)
+    def forward(self, x: AudioSignal, y: AudioSignal):
+        """
+        Parameters
+        ----------
+        x : AudioSignal
+            Estimate AudioSignal
+        y : AudioSignal
+            Reference AudioSignal
+        Returns
+        -------
+        torch.Tensor
+            L1 loss between AudioSignal attributes.
+        """
+        if isinstance(x, AudioSignal):
+            x = getattr(x, self.attribute)
+            y = getattr(y, self.attribute)
+        return super().forward(x, y)
+class SISDRLoss(nn.Module):
+    """
+    Computes the Scale-Invariant Source-to-Distortion Ratio between a batch
+    of estimated and reference audio signals or aligned features.
+    Parameters
+    ----------
+    scaling : int, optional
+        Whether to use scale-invariant (True) or
+        signal-to-noise ratio (False), by default True
+    reduction : str, optional
+        How to reduce across the batch (either 'mean',
+        'sum', or none).], by default ' mean'
+    zero_mean : int, optional
+        Zero mean the references and estimates before
+        computing the loss, by default True
+    clip_min : int, optional
+        The minimum possible loss value. Helps network
+        to not focus on making already good examples better, by default None
+    weight : float, optional
+        Weight of this loss, defaults to 1.0.
+    """
+    def __init__(
+        self,
+        scaling: int = True,
+        reduction: str = "mean",
+        zero_mean: int = True,
+        clip_min: int = None,
+        weight: float = 1.0,
+    ):
+        self.scaling = scaling
+        self.reduction = reduction
+        self.zero_mean = zero_mean
+        self.clip_min = clip_min
+        self.weight = weight
+        super().__init__()
+    def forward(self, x: AudioSignal, y: AudioSignal):
+        eps = 1e-8
+        # nb, nc, nt
+        if isinstance(x, AudioSignal):
+            references = x.audio_data
+            estimates = y.audio_data
+        else:
+            references = x
+            estimates = y
+        nb = references.shape[0]
+        references = references.reshape(nb, 1, -1).permute(0, 2, 1)
+        estimates = estimates.reshape(nb, 1, -1).permute(0, 2, 1)
+        # samples now on axis 1
+        if self.zero_mean:
+            mean_reference = references.mean(dim=1, keepdim=True)
+            mean_estimate = estimates.mean(dim=1, keepdim=True)
+        else:
+            mean_reference = 0
+            mean_estimate = 0
+        _references = references - mean_reference
+        _estimates = estimates - mean_estimate
+        references_projection = (_references**2).sum(dim=-2) + eps
+        references_on_estimates = (_estimates * _references).sum(dim=-2) + eps
+        scale = (
+            (references_on_estimates / references_projection).unsqueeze(1)
+            if self.scaling
+            else 1
+        )
+        e_true = scale * _references
+        e_res = _estimates - e_true
+        signal = (e_true**2).sum(dim=1)
+        noise = (e_res**2).sum(dim=1)
+        sdr = -10 * torch.log10(signal / noise + eps)
+        if self.clip_min is not None:
+            sdr = torch.clamp(sdr, min=self.clip_min)
+        if self.reduction == "mean":
+            sdr = sdr.mean()
+        elif self.reduction == "sum":
+            sdr = sdr.sum()
+        return sdr

xinference/thirdparty/audiotools/metrics/quality.py ADDED Viewed

@@ -0,0 +1,159 @@
+import os
+import numpy as np
+import torch
+from .. import AudioSignal
+def stoi(
+    estimates: AudioSignal,
+    references: AudioSignal,
+    extended: int = False,
+):
+    """Short term objective intelligibility
+    Computes the STOI (See [1][2]) of a denoised signal compared to a clean
+    signal, The output is expected to have a monotonic relation with the
+    subjective speech-intelligibility, where a higher score denotes better
+    speech intelligibility. Uses pystoi under the hood.
+    Parameters
+    ----------
+    estimates : AudioSignal
+        Denoised speech
+    references : AudioSignal
+        Clean original speech
+    extended : int, optional
+        Boolean, whether to use the extended STOI described in [3], by default False
+    Returns
+    -------
+    Tensor[float]
+        Short time objective intelligibility measure between clean and
+        denoised speech
+    References
+    ----------
+    1.  C.H.Taal, R.C.Hendriks, R.Heusdens, J.Jensen 'A Short-Time
+        Objective Intelligibility Measure for Time-Frequency Weighted Noisy
+        Speech', ICASSP 2010, Texas, Dallas.
+    2.  C.H.Taal, R.C.Hendriks, R.Heusdens, J.Jensen 'An Algorithm for
+        Intelligibility Prediction of Time-Frequency Weighted Noisy Speech',
+        IEEE Transactions on Audio, Speech, and Language Processing, 2011.
+    3.  Jesper Jensen and Cees H. Taal, 'An Algorithm for Predicting the
+        Intelligibility of Speech Masked by Modulated Noise Maskers',
+        IEEE Transactions on Audio, Speech and Language Processing, 2016.
+    """
+    import pystoi
+    estimates = estimates.clone().to_mono()
+    references = references.clone().to_mono()
+    stois = []
+    for i in range(estimates.batch_size):
+        _stoi = pystoi.stoi(
+            references.audio_data[i, 0].detach().cpu().numpy(),
+            estimates.audio_data[i, 0].detach().cpu().numpy(),
+            references.sample_rate,
+            extended=extended,
+        )
+        stois.append(_stoi)
+    return torch.from_numpy(np.array(stois))
+def pesq(
+    estimates: AudioSignal,
+    references: AudioSignal,
+    mode: str = "wb",
+    target_sr: float = 16000,
+):
+    """_summary_
+    Parameters
+    ----------
+    estimates : AudioSignal
+        Degraded AudioSignal
+    references : AudioSignal
+        Reference AudioSignal
+    mode : str, optional
+        'wb' (wide-band) or 'nb' (narrow-band), by default "wb"
+    target_sr : float, optional
+        Target sample rate, by default 16000
+    Returns
+    -------
+    Tensor[float]
+        PESQ score: P.862.2 Prediction (MOS-LQO)
+    """
+    from pesq import pesq as pesq_fn
+    estimates = estimates.clone().to_mono().resample(target_sr)
+    references = references.clone().to_mono().resample(target_sr)
+    pesqs = []
+    for i in range(estimates.batch_size):
+        _pesq = pesq_fn(
+            estimates.sample_rate,
+            references.audio_data[i, 0].detach().cpu().numpy(),
+            estimates.audio_data[i, 0].detach().cpu().numpy(),
+            mode,
+        )
+        pesqs.append(_pesq)
+    return torch.from_numpy(np.array(pesqs))
+def visqol(
+    estimates: AudioSignal,
+    references: AudioSignal,
+    mode: str = "audio",
+):  # pragma: no cover
+    """ViSQOL score.
+    Parameters
+    ----------
+    estimates : AudioSignal
+        Degraded AudioSignal
+    references : AudioSignal
+        Reference AudioSignal
+    mode : str, optional
+        'audio' or 'speech', by default 'audio'
+    Returns
+    -------
+    Tensor[float]
+        ViSQOL score (MOS-LQO)
+    """
+    from visqol import visqol_lib_py
+    from visqol.pb2 import visqol_config_pb2
+    from visqol.pb2 import similarity_result_pb2
+    config = visqol_config_pb2.VisqolConfig()
+    if mode == "audio":
+        target_sr = 48000
+        config.options.use_speech_scoring = False
+        svr_model_path = "libsvm_nu_svr_model.txt"
+    elif mode == "speech":
+        target_sr = 16000
+        config.options.use_speech_scoring = True
+        svr_model_path = "lattice_tcditugenmeetpackhref_ls2_nl60_lr12_bs2048_learn.005_ep2400_train1_7_raw.tflite"
+    else:
+        raise ValueError(f"Unrecognized mode: {mode}")
+    config.audio.sample_rate = target_sr
+    config.options.svr_model_path = os.path.join(
+        os.path.dirname(visqol_lib_py.__file__), "model", svr_model_path
+    )
+    api = visqol_lib_py.VisqolApi()
+    api.Create(config)
+    estimates = estimates.clone().to_mono().resample(target_sr)
+    references = references.clone().to_mono().resample(target_sr)
+    visqols = []
+    for i in range(estimates.batch_size):
+        _visqol = api.Measure(
+            references.audio_data[i, 0].detach().cpu().numpy().astype(float),
+            estimates.audio_data[i, 0].detach().cpu().numpy().astype(float),
+        )
+        visqols.append(_visqol.moslqo)
+    return torch.from_numpy(np.array(visqols))

xinference/thirdparty/audiotools/metrics/spectral.py ADDED Viewed

@@ -0,0 +1,247 @@
+import typing
+from typing import List
+import numpy as np
+from torch import nn
+from .. import AudioSignal
+from .. import STFTParams
+class MultiScaleSTFTLoss(nn.Module):
+    """Computes the multi-scale STFT loss from [1].
+    Parameters
+    ----------
+    window_lengths : List[int], optional
+        Length of each window of each STFT, by default [2048, 512]
+    loss_fn : typing.Callable, optional
+        How to compare each loss, by default nn.L1Loss()
+    clamp_eps : float, optional
+        Clamp on the log magnitude, below, by default 1e-5
+    mag_weight : float, optional
+        Weight of raw magnitude portion of loss, by default 1.0
+    log_weight : float, optional
+        Weight of log magnitude portion of loss, by default 1.0
+    pow : float, optional
+        Power to raise magnitude to before taking log, by default 2.0
+    weight : float, optional
+        Weight of this loss, by default 1.0
+    match_stride : bool, optional
+        Whether to match the stride of convolutional layers, by default False
+    References
+    ----------
+    1.  Engel, Jesse, Chenjie Gu, and Adam Roberts.
+        "DDSP: Differentiable Digital Signal Processing."
+        International Conference on Learning Representations. 2019.
+    """
+    def __init__(
+        self,
+        window_lengths: List[int] = [2048, 512],
+        loss_fn: typing.Callable = nn.L1Loss(),
+        clamp_eps: float = 1e-5,
+        mag_weight: float = 1.0,
+        log_weight: float = 1.0,
+        pow: float = 2.0,
+        weight: float = 1.0,
+        match_stride: bool = False,
+        window_type: str = None,
+    ):
+        super().__init__()
+        self.stft_params = [
+            STFTParams(
+                window_length=w,
+                hop_length=w // 4,
+                match_stride=match_stride,
+                window_type=window_type,
+            )
+            for w in window_lengths
+        ]
+        self.loss_fn = loss_fn
+        self.log_weight = log_weight
+        self.mag_weight = mag_weight
+        self.clamp_eps = clamp_eps
+        self.weight = weight
+        self.pow = pow
+    def forward(self, x: AudioSignal, y: AudioSignal):
+        """Computes multi-scale STFT between an estimate and a reference
+        signal.
+        Parameters
+        ----------
+        x : AudioSignal
+            Estimate signal
+        y : AudioSignal
+            Reference signal
+        Returns
+        -------
+        torch.Tensor
+            Multi-scale STFT loss.
+        """
+        loss = 0.0
+        for s in self.stft_params:
+            x.stft(s.window_length, s.hop_length, s.window_type)
+            y.stft(s.window_length, s.hop_length, s.window_type)
+            loss += self.log_weight * self.loss_fn(
+                x.magnitude.clamp(self.clamp_eps).pow(self.pow).log10(),
+                y.magnitude.clamp(self.clamp_eps).pow(self.pow).log10(),
+            )
+            loss += self.mag_weight * self.loss_fn(x.magnitude, y.magnitude)
+        return loss
+class MelSpectrogramLoss(nn.Module):
+    """Compute distance between mel spectrograms. Can be used
+    in a multi-scale way.
+    Parameters
+    ----------
+    n_mels : List[int]
+        Number of mels per STFT, by default [150, 80],
+    window_lengths : List[int], optional
+        Length of each window of each STFT, by default [2048, 512]
+    loss_fn : typing.Callable, optional
+        How to compare each loss, by default nn.L1Loss()
+    clamp_eps : float, optional
+        Clamp on the log magnitude, below, by default 1e-5
+    mag_weight : float, optional
+        Weight of raw magnitude portion of loss, by default 1.0
+    log_weight : float, optional
+        Weight of log magnitude portion of loss, by default 1.0
+    pow : float, optional
+        Power to raise magnitude to before taking log, by default 2.0
+    weight : float, optional
+        Weight of this loss, by default 1.0
+    match_stride : bool, optional
+        Whether to match the stride of convolutional layers, by default False
+    """
+    def __init__(
+        self,
+        n_mels: List[int] = [150, 80],
+        window_lengths: List[int] = [2048, 512],
+        loss_fn: typing.Callable = nn.L1Loss(),
+        clamp_eps: float = 1e-5,
+        mag_weight: float = 1.0,
+        log_weight: float = 1.0,
+        pow: float = 2.0,
+        weight: float = 1.0,
+        match_stride: bool = False,
+        mel_fmin: List[float] = [0.0, 0.0],
+        mel_fmax: List[float] = [None, None],
+        window_type: str = None,
+    ):
+        super().__init__()
+        self.stft_params = [
+            STFTParams(
+                window_length=w,
+                hop_length=w // 4,
+                match_stride=match_stride,
+                window_type=window_type,
+            )
+            for w in window_lengths
+        ]
+        self.n_mels = n_mels
+        self.loss_fn = loss_fn
+        self.clamp_eps = clamp_eps
+        self.log_weight = log_weight
+        self.mag_weight = mag_weight
+        self.weight = weight
+        self.mel_fmin = mel_fmin
+        self.mel_fmax = mel_fmax
+        self.pow = pow
+    def forward(self, x: AudioSignal, y: AudioSignal):
+        """Computes mel loss between an estimate and a reference
+        signal.
+        Parameters
+        ----------
+        x : AudioSignal
+            Estimate signal
+        y : AudioSignal
+            Reference signal
+        Returns
+        -------
+        torch.Tensor
+            Mel loss.
+        """
+        loss = 0.0
+        for n_mels, fmin, fmax, s in zip(
+            self.n_mels, self.mel_fmin, self.mel_fmax, self.stft_params
+        ):
+            kwargs = {
+                "window_length": s.window_length,
+                "hop_length": s.hop_length,
+                "window_type": s.window_type,
+            }
+            x_mels = x.mel_spectrogram(n_mels, mel_fmin=fmin, mel_fmax=fmax, **kwargs)
+            y_mels = y.mel_spectrogram(n_mels, mel_fmin=fmin, mel_fmax=fmax, **kwargs)
+            loss += self.log_weight * self.loss_fn(
+                x_mels.clamp(self.clamp_eps).pow(self.pow).log10(),
+                y_mels.clamp(self.clamp_eps).pow(self.pow).log10(),
+            )
+            loss += self.mag_weight * self.loss_fn(x_mels, y_mels)
+        return loss
+class PhaseLoss(nn.Module):
+    """Difference between phase spectrograms.
+    Parameters
+    ----------
+    window_length : int, optional
+        Length of STFT window, by default 2048
+    hop_length : int, optional
+        Hop length of STFT window, by default 512
+    weight : float, optional
+        Weight of loss, by default 1.0
+    """
+    def __init__(
+        self, window_length: int = 2048, hop_length: int = 512, weight: float = 1.0
+    ):
+        super().__init__()
+        self.weight = weight
+        self.stft_params = STFTParams(window_length, hop_length)
+    def forward(self, x: AudioSignal, y: AudioSignal):
+        """Computes phase loss between an estimate and a reference
+        signal.
+        Parameters
+        ----------
+        x : AudioSignal
+            Estimate signal
+        y : AudioSignal
+            Reference signal
+        Returns
+        -------
+        torch.Tensor
+            Phase loss.
+        """
+        s = self.stft_params
+        x.stft(s.window_length, s.hop_length, s.window_type)
+        y.stft(s.window_length, s.hop_length, s.window_type)
+        # Take circular difference
+        diff = x.phase - y.phase
+        diff[diff < -np.pi] += 2 * np.pi
+        diff[diff > np.pi] -= -2 * np.pi
+        # Scale true magnitude to weights in [0, 1]
+        x_min, x_max = x.magnitude.min(), x.magnitude.max()
+        weights = (x.magnitude - x_min) / (x_max - x_min)
+        # Take weighted mean of all phase errors
+        loss = ((weights * diff) ** 2).mean()
+        return loss

xinference/thirdparty/audiotools/ml/__init__.py ADDED Viewed

@@ -0,0 +1,5 @@
+from . import decorators
+from . import layers
+from .accelerator import Accelerator
+from .experiment import Experiment
+from .layers import BaseModel

xinference 1.10.0__py3-none-any.whl → 1.11.0__py3-none-any.whl

Potentially problematic release.

xinference 1.10.0__py3-none-any.whl → 1.11.0__py3-none-any.whl