PyPI - xinference - Versions diffs - 1.10.0__py3-none-any.whl → 1.10.1__py3-none-any.whl - Mend

xinference 1.10.0__py3-none-any.whl → 1.10.1__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release.

This version of xinference might be problematic. Click here for more details.

Files changed (317) hide show

xinference/thirdparty/indextts/utils/maskgct/models/codec/speechtokenizer/model.py ADDED Viewed

@@ -0,0 +1,184 @@
+# Copyright (c) 2023 Amphion.
+#
+# This code is modified from https://github.com/ZhangXInFD/SpeechTokenizer/blob/main/speechtokenizer/model.py
+# Licensed under Apache License 2.0
+from .modules.seanet import SEANetEncoder, SEANetDecoder
+from .modules.quantization import ResidualVectorQuantizer
+import torch.nn as nn
+from einops import rearrange
+import torch
+import numpy as np
+class SpeechTokenizer(nn.Module):
+    """SEANet encoder and decoder with a residual vector quantizer (RVQ) in between.
+    ``forward`` runs encode -> quantize -> decode in one call; ``encode`` and
+    ``decode`` expose the two halves, and ``forward_feature`` returns the
+    per-layer quantized tensors only.
+    """
+    def __init__(self, config):
+        """
+        Parameters
+        ----------
+        config : dict
+            Model config (for example the parsed JSON config file). The keys
+            read here are ``n_filters``, ``dimension``, ``strides``,
+            ``lstm_layers``, ``bidirectional``, ``dilation_base``,
+            ``residual_kernel_size``, ``n_residual_layers``, ``activation``,
+            ``sample_rate``, ``n_q``, ``semantic_dimension`` and
+            ``codebook_size``. They are fetched with ``config.get``, so a
+            missing key is passed on as ``None``.
+        """
+        super().__init__()
+        self.encoder = SEANetEncoder(
+            n_filters=config.get("n_filters"),
+            dimension=config.get("dimension"),
+            ratios=config.get("strides"),
+            lstm=config.get("lstm_layers"),
+            bidirectional=config.get("bidirectional"),
+            dilation_base=config.get("dilation_base"),
+            residual_kernel_size=config.get("residual_kernel_size"),
+            n_residual_layers=config.get("n_residual_layers"),
+            activation=config.get("activation"),
+        )
+        self.sample_rate = config.get("sample_rate")
+        self.n_q = config.get("n_q")
+        # Product of all encoder strides.
+        self.downsample_rate = np.prod(config.get("strides"))
+        # Project features to the semantic dimension only when the two sizes differ.
+        if config.get("dimension") != config.get("semantic_dimension"):
+            self.transform = nn.Linear(
+                config.get("dimension"), config.get("semantic_dimension")
+            )
+        else:
+            self.transform = nn.Identity()
+        self.quantizer = ResidualVectorQuantizer(
+            dimension=config.get("dimension"),
+            n_q=config.get("n_q"),
+            bins=config.get("codebook_size"),
+        )
+        # The decoder mirrors the encoder settings, except that its LSTM is
+        # always unidirectional regardless of the config.
+        self.decoder = SEANetDecoder(
+            n_filters=config.get("n_filters"),
+            dimension=config.get("dimension"),
+            ratios=config.get("strides"),
+            lstm=config.get("lstm_layers"),
+            bidirectional=False,
+            dilation_base=config.get("dilation_base"),
+            residual_kernel_size=config.get("residual_kernel_size"),
+            n_residual_layers=config.get("n_residual_layers"),
+            activation=config.get("activation"),
+        )
+    @classmethod
+    def load_from_checkpoint(cls, config_path: str, ckpt_path: str):
+        """
+        Build a model from a JSON config file and load its weights on CPU.
+        Parameters
+        ----------
+        config_path : str
+            Path of model configuration file (JSON, read as UTF-8).
+        ckpt_path : str
+            Path of model checkpoint (a state dict readable by ``torch.load``).
+        Returns
+        -------
+        model : SpeechTokenizer
+            SpeechTokenizer model.
+        """
+        import json
+        # JSON text is UTF-8; do not depend on the platform's locale encoding.
+        with open(config_path, encoding="utf-8") as f:
+            cfg = json.load(f)
+        model = cls(cfg)
+        # NOTE(review): depending on the installed torch version (its
+        # ``weights_only`` default), torch.load may unpickle arbitrary
+        # objects. Only load checkpoints from a trusted source.
+        params = torch.load(ckpt_path, map_location="cpu")
+        model.load_state_dict(params)
+        return model
+    def forward(self, x: torch.Tensor, n_q: int = None, layers: list = None):
+        """
+        Parameters
+        ----------
+        x : torch.Tensor
+            Input wavs. Shape: (batch, channels, timesteps).
+        n_q : int, optional
+            Number of quantizers in RVQ used to encode. The default is all layers.
+        layers : list[int], optional
+            Layers of RVQ should return quantized result. The default is the first layer.
+        Returns
+        -------
+        o : torch.Tensor
+            Output wavs. Shape: (batch, channels, timesteps).
+        commit_loss : torch.Tensor
+            Commitment loss from residual vector quantizers.
+        feature : torch.Tensor
+            First entry of the quantized results returned for ``layers``,
+            moved to (batch, timesteps, dimension) and passed through
+            ``self.transform``.
+        """
+        # A fresh list per call instead of one mutable default shared by all calls.
+        if layers is None:
+            layers = [0]
+        n_q = n_q if n_q else self.n_q
+        e = self.encoder(x)
+        quantized, _codes, commit_loss, quantized_list = self.quantizer(
+            e, n_q=n_q, layers=layers
+        )
+        feature = rearrange(quantized_list[0], "b d t -> b t d")
+        feature = self.transform(feature)
+        o = self.decoder(quantized)
+        return o, commit_loss, feature
+    def forward_feature(self, x: torch.Tensor, layers: list = None):
+        """
+        Parameters
+        ----------
+        x : torch.Tensor
+            Input wavs. Shape should be (batch, channels, timesteps).
+        layers : list[int], optional
+            Layers of RVQ should return quantized result. The default is all layers.
+        Returns
+        -------
+        quantized_list : list[torch.Tensor]
+            Quantized of required layers.
+        """
+        e = self.encoder(x)
+        # An empty or missing ``layers`` selects every quantizer layer.
+        layers = layers if layers else list(range(self.n_q))
+        _, _, _, quantized_list = self.quantizer(e, layers=layers)
+        return quantized_list
+    def encode(self, x: torch.Tensor, n_q: int = None, st: int = None):
+        """
+        Parameters
+        ----------
+        x : torch.Tensor
+            Input wavs. Shape: (batch, channels, timesteps).
+        n_q : int, optional
+            Number of quantizers in RVQ used to encode. The default is all layers.
+        st : int, optional
+            Start quantizer index in RVQ. The default is 0.
+        Returns
+        -------
+        codes : torch.Tensor
+            Output indices for each quantizer. Shape: (n_q, batch, timesteps)
+        """
+        e = self.encoder(x)
+        if st is None:
+            st = 0
+        n_q = n_q if n_q else self.n_q
+        codes = self.quantizer.encode(e, n_q=n_q, st=st)
+        return codes
+    def decode(self, codes: torch.Tensor, st: int = 0):
+        """
+        Parameters
+        ----------
+        codes : torch.Tensor
+            Indices for each quantizer. Shape: (n_q, batch, timesteps).
+        st : int, optional
+            Start quantizer index in RVQ. The default is 0.
+        Returns
+        -------
+        o : torch.Tensor
+            Reconstruct wavs from codes. Shape: (batch, channels, timesteps)
+        """
+        quantized = self.quantizer.decode(codes, st=st)
+        o = self.decoder(quantized)
+        return o

xinference/thirdparty/indextts/utils/maskgct/models/codec/speechtokenizer/modules/__init__.py ADDED Viewed

@@ -0,0 +1,27 @@
+# Copyright (c) 2023 Amphion.
+#
+# This source code is licensed under the MIT license found in the
+# LICENSE file in the root directory of this source tree.
+# This source file is copied from https://github.com/facebookresearch/encodec
+# Copyright (c) Meta Platforms, Inc. and affiliates.
+# All rights reserved.
+#
+# This source code is licensed under the license found in the
+# LICENSE file in the root directory of this source tree.
+"""Torch modules."""
+# flake8: noqa
+from .conv import (
+    pad1d,
+    unpad1d,
+    NormConv1d,
+    NormConvTranspose1d,
+    NormConv2d,
+    NormConvTranspose2d,
+    SConv1d,
+    SConvTranspose1d,
+)
+from .lstm import SLSTM
+from .seanet import SEANetEncoder, SEANetDecoder

xinference/thirdparty/indextts/utils/maskgct/models/codec/speechtokenizer/modules/conv.py ADDED Viewed

@@ -0,0 +1,346 @@
+# Copyright (c) 2023 Amphion.
+#
+# This source code is licensed under the MIT license found in the
+# LICENSE file in the root directory of this source tree.
+# This source file is copied from https://github.com/facebookresearch/encodec
+# Copyright (c) Meta Platforms, Inc. and affiliates.
+# All rights reserved.
+#
+# This source code is licensed under the license found in the
+# LICENSE file in the root directory of this source tree.
+"""Convolutional layers wrappers and utilities."""
+import math
+import typing as tp
+import warnings
+import torch
+from torch import nn
+from torch.nn import functional as F
+from torch.nn.utils import spectral_norm, weight_norm
+from .norm import ConvLayerNorm
+# Names accepted as the ``norm`` argument by the helpers in this module.
+CONV_NORMALIZATIONS = frozenset(
+    {
+        "none",
+        "weight_norm",
+        "spectral_norm",
+        "time_layer_norm",
+        "layer_norm",
+        "time_group_norm",
+    }
+)
+def apply_parametrization_norm(module: nn.Module, norm: str = "none") -> nn.Module:
+    """Wrap ``module`` with the weight reparametrization selected by ``norm``.
+    ``"weight_norm"`` and ``"spectral_norm"`` apply the matching
+    ``torch.nn.utils`` wrapper; every other accepted name returns ``module``
+    unchanged. ``norm`` must be one of ``CONV_NORMALIZATIONS``.
+    """
+    assert norm in CONV_NORMALIZATIONS
+    if norm not in ("weight_norm", "spectral_norm"):
+        # Membership was checked above, so the remaining names are not
+        # reparametrizations and there is nothing to wrap.
+        return module
+    wrap = weight_norm if norm == "weight_norm" else spectral_norm
+    return wrap(module)
+def get_norm_module(
+    module: nn.Module, causal: bool = False, norm: str = "none", **norm_kwargs
+) -> nn.Module:
+    """Build the normalization layer that follows ``module`` for the given ``norm``.
+    ``"layer_norm"`` gives a ``ConvLayerNorm`` and ``"time_group_norm"`` a
+    single-group ``nn.GroupNorm``, both sized from ``module.out_channels``;
+    any other accepted name gives ``nn.Identity``. ``norm_kwargs`` are
+    forwarded to the normalization constructor.
+    Raises ``ValueError`` when ``causal`` is True and ``norm`` is
+    ``"time_group_norm"``, since GroupNorm cannot be evaluated causally.
+    """
+    assert norm in CONV_NORMALIZATIONS
+    if norm == "time_group_norm":
+        if causal:
+            raise ValueError("GroupNorm doesn't support causal evaluation.")
+        assert isinstance(module, nn.modules.conv._ConvNd)
+        return nn.GroupNorm(1, module.out_channels, **norm_kwargs)
+    if norm == "layer_norm":
+        assert isinstance(module, nn.modules.conv._ConvNd)
+        return ConvLayerNorm(module.out_channels, **norm_kwargs)
+    return nn.Identity()
+def get_extra_padding_for_conv1d(
+    x: torch.Tensor, kernel_size: int, stride: int, padding_total: int = 0
+) -> int:
+    """Return how many samples to append to the last axis of ``x`` so that the
+    final convolution window is full. See `pad_for_conv1d`.
+    """
+    current = x.shape[-1]
+    # Part of the kernel not already covered by the fixed padding.
+    uncovered = kernel_size - padding_total
+    # Fractional frame count; rounding up makes a partial last window count.
+    frames = (current - uncovered) / stride + 1
+    target = (math.ceil(frames) - 1) * stride + uncovered
+    return target - current
+def pad_for_conv1d(
+    x: torch.Tensor, kernel_size: int, stride: int, padding_total: int = 0
+):
+    """Right-pad ``x`` so that the last convolution window is full.
+    The extra samples go at the end of the last axis. Without them, some time
+    steps could be lost when rebuilding an output of the same length, even
+    with regular padding.
+    For instance, with total padding = 4, kernel size = 4, stride = 2:
+        0 0 1 2 3 4 5 0 0   # (0s are padding)
+        1   2   3           # (output frames of a convolution, last 0 is never used)
+        0 0 1 2 3 4 5 0     # (output of tr. conv., but pos. 5 is going to get removed as padding)
+            1 2 3 4         # once you removed padding, we are missing one time step !
+    """
+    return F.pad(
+        x, (0, get_extra_padding_for_conv1d(x, kernel_size, stride, padding_total))
+    )
+def pad1d(
+    x: torch.Tensor,
+    paddings: tp.Tuple[int, int],
+    mode: str = "zero",
+    value: float = 0.0,
+):
+    """Pad the last axis of ``x``; a thin wrapper around ``F.pad``.
+    It adds two things on top of ``F.pad``:
+    * ``mode="zero"`` (the default) is accepted as an alias of ``F.pad``'s
+      ``"constant"`` mode, filled with ``value``. ``F.pad`` itself has no
+      ``"zero"`` mode, so passing it through made the default call fail.
+    * ``mode="reflect"`` works on inputs no longer than the requested padding:
+      the input is first zero-padded on the right just enough for the
+      reflection to be valid, and that helper padding is cut off afterwards.
+    ``paddings`` is ``(left, right)``; both must be non-negative.
+    """
+    length = x.shape[-1]
+    padding_left, padding_right = paddings
+    assert padding_left >= 0 and padding_right >= 0, (padding_left, padding_right)
+    if mode == "zero":
+        mode = "constant"
+    if mode != "reflect":
+        return F.pad(x, paddings, mode, value)
+    max_pad = max(padding_left, padding_right)
+    extra_pad = 0
+    if length <= max_pad:
+        # Reflection needs an input longer than the padding; grow it with zeros.
+        extra_pad = max_pad - length + 1
+        x = F.pad(x, (0, extra_pad))
+    padded = F.pad(x, paddings, mode, value)
+    # Drop the helper zeros that were appended before reflecting.
+    end = padded.shape[-1] - extra_pad
+    return padded[..., :end]
+def unpad1d(x: torch.Tensor, paddings: tp.Tuple[int, int]):
+    """Strip ``paddings = (left, right)`` samples from the last axis of ``x``.
+    Zero padding on either side is handled correctly. Only for 1d!
+    """
+    left, right = paddings
+    assert left >= 0 and right >= 0, (left, right)
+    total = x.shape[-1]
+    assert (left + right) <= total
+    # Use an explicit stop index: a ``-right`` stop would yield an empty
+    # slice when ``right`` is 0.
+    return x[..., left : total - right]
+class NormConv1d(nn.Module):
+    """``nn.Conv1d`` followed by the normalization selected by ``norm``, giving
+    the same interface whichever normalization is used.
+    Positional and remaining keyword arguments are forwarded to ``nn.Conv1d``.
+    """
+    def __init__(
+        self,
+        *args,
+        causal: bool = False,
+        norm: str = "none",
+        norm_kwargs: tp.Dict[str, tp.Any] = {},
+        **kwargs,
+    ):
+        super().__init__()
+        # The shared ``{}`` default is only unpacked below, never mutated.
+        conv = nn.Conv1d(*args, **kwargs)
+        self.conv = apply_parametrization_norm(conv, norm)
+        self.norm = get_norm_module(self.conv, causal, norm, **norm_kwargs)
+        self.norm_type = norm
+    def forward(self, x):
+        """Apply the convolution, then the normalization layer."""
+        return self.norm(self.conv(x))
+class NormConv2d(nn.Module):
+    """``nn.Conv2d`` followed by the normalization selected by ``norm``, giving
+    the same interface whichever normalization is used.
+    Positional and remaining keyword arguments are forwarded to ``nn.Conv2d``.
+    The normalization is always built as non-causal.
+    """
+    def __init__(
+        self,
+        *args,
+        norm: str = "none",
+        norm_kwargs: tp.Dict[str, tp.Any] = {},
+        **kwargs,
+    ):
+        super().__init__()
+        # The shared ``{}`` default is only unpacked below, never mutated.
+        conv = nn.Conv2d(*args, **kwargs)
+        self.conv = apply_parametrization_norm(conv, norm)
+        self.norm = get_norm_module(self.conv, causal=False, norm=norm, **norm_kwargs)
+        self.norm_type = norm
+    def forward(self, x):
+        """Apply the convolution, then the normalization layer."""
+        return self.norm(self.conv(x))
+class NormConvTranspose1d(nn.Module):
+    """``nn.ConvTranspose1d`` followed by the normalization selected by ``norm``,
+    giving the same interface whichever normalization is used.
+    Positional and remaining keyword arguments are forwarded to
+    ``nn.ConvTranspose1d``.
+    """
+    def __init__(
+        self,
+        *args,
+        causal: bool = False,
+        norm: str = "none",
+        norm_kwargs: tp.Dict[str, tp.Any] = {},
+        **kwargs,
+    ):
+        super().__init__()
+        # The shared ``{}`` default is only unpacked below, never mutated.
+        convtr = nn.ConvTranspose1d(*args, **kwargs)
+        self.convtr = apply_parametrization_norm(convtr, norm)
+        self.norm = get_norm_module(self.convtr, causal, norm, **norm_kwargs)
+        self.norm_type = norm
+    def forward(self, x):
+        """Apply the transposed convolution, then the normalization layer."""
+        return self.norm(self.convtr(x))
+class NormConvTranspose2d(nn.Module):
+    """Wrapper around ConvTranspose2d and normalization applied to this conv
+    to provide a uniform interface across normalization approaches.
+    Positional and remaining keyword arguments are forwarded to
+    ``nn.ConvTranspose2d``. The normalization is always built as non-causal.
+    """
+    def __init__(
+        self,
+        *args,
+        norm: str = "none",
+        norm_kwargs: tp.Dict[str, tp.Any] = {},
+        **kwargs,
+    ):
+        super().__init__()
+        self.convtr = apply_parametrization_norm(
+            nn.ConvTranspose2d(*args, **kwargs), norm
+        )
+        self.norm = get_norm_module(self.convtr, causal=False, norm=norm, **norm_kwargs)
+        # Record the selected normalization name, as NormConv1d, NormConv2d
+        # and NormConvTranspose1d do; this wrapper was the only one without it.
+        self.norm_type = norm
+    def forward(self, x):
+        """Apply the transposed convolution, then the normalization layer."""
+        x = self.convtr(x)
+        x = self.norm(x)
+        return x
+class SConv1d(nn.Module):
+    """1-D convolution that handles its own padding (asymmetric or causal)
+    and applies the selected normalization.
+    """
+    def __init__(
+        self,
+        in_channels: int,
+        out_channels: int,
+        kernel_size: int,
+        stride: int = 1,
+        dilation: int = 1,
+        groups: int = 1,
+        bias: bool = True,
+        causal: bool = False,
+        norm: str = "none",
+        norm_kwargs: tp.Dict[str, tp.Any] = {},
+        pad_mode: str = "reflect",
+    ):
+        super().__init__()
+        # Striding and dilating at once is an unusual setup: warn, but allow it.
+        if stride > 1 and dilation > 1:
+            warnings.warn(
+                "SConv1d has been initialized with stride > 1 and dilation > 1"
+                f" (kernel_size={kernel_size} stride={stride}, dilation={dilation})."
+            )
+        self.conv = NormConv1d(
+            in_channels,
+            out_channels,
+            kernel_size,
+            stride,
+            dilation=dilation,
+            groups=groups,
+            bias=bias,
+            causal=causal,
+            norm=norm,
+            norm_kwargs=norm_kwargs,
+        )
+        self.causal = causal
+        self.pad_mode = pad_mode
+    def forward(self, x):
+        """Pad ``x`` on its last axis as required, then run the wrapped conv."""
+        # Unpacking also rejects inputs that are not rank 3.
+        _batch, _channels, _steps = x.shape
+        inner = self.conv.conv
+        kernel_size = inner.kernel_size[0]
+        stride = inner.stride[0]
+        dilation = inner.dilation[0]
+        padding_total = (kernel_size - 1) * dilation - (stride - 1)
+        extra_padding = get_extra_padding_for_conv1d(
+            x, kernel_size, stride, padding_total
+        )
+        if self.causal:
+            # Causal: all fixed padding goes on the left.
+            paddings = (padding_total, extra_padding)
+        else:
+            # Non-causal: split the fixed padding; the left side gets the
+            # larger half when the total is odd.
+            padding_right = padding_total // 2
+            paddings = (padding_total - padding_right, padding_right + extra_padding)
+        return self.conv(pad1d(x, paddings, mode=self.pad_mode))
+class SConvTranspose1d(nn.Module):
+    """1-D transposed convolution that trims its own padding (asymmetric or
+    causal) and applies the selected normalization.
+    """
+    def __init__(
+        self,
+        in_channels: int,
+        out_channels: int,
+        kernel_size: int,
+        stride: int = 1,
+        causal: bool = False,
+        norm: str = "none",
+        trim_right_ratio: float = 1.0,
+        norm_kwargs: tp.Dict[str, tp.Any] = {},
+    ):
+        super().__init__()
+        self.convtr = NormConvTranspose1d(
+            in_channels,
+            out_channels,
+            kernel_size,
+            stride,
+            causal=causal,
+            norm=norm,
+            norm_kwargs=norm_kwargs,
+        )
+        self.causal = causal
+        self.trim_right_ratio = trim_right_ratio
+        assert (
+            self.causal or self.trim_right_ratio == 1.0
+        ), "`trim_right_ratio` != 1.0 only makes sense for causal convolutions"
+        assert 0.0 <= self.trim_right_ratio <= 1.0
+    def forward(self, x):
+        """Run the wrapped transposed conv, then trim the fixed padding."""
+        inner = self.convtr.convtr
+        padding_total = inner.kernel_size[0] - inner.stride[0]
+        y = self.convtr(x)
+        # Only the fixed padding is trimmed here. Extra padding from
+        # `pad_for_conv1d` is left for the caller to remove at the very end,
+        # because removing it here would need the length seen at the matching
+        # encoder layer.
+        if self.causal:
+            # Trim from the right according to the ratio; 1.0 trims everything
+            # from the right.
+            padding_right = math.ceil(padding_total * self.trim_right_ratio)
+        else:
+            # Non-causal: split the trim; the left side gets the larger half
+            # when the total is odd.
+            padding_right = padding_total // 2
+        return unpad1d(y, (padding_total - padding_right, padding_right))

xinference/thirdparty/indextts/utils/maskgct/models/codec/speechtokenizer/modules/lstm.py ADDED Viewed

@@ -0,0 +1,46 @@
+# Copyright (c) 2023 Amphion.
+#
+# This source code is licensed under the MIT license found in the
+# LICENSE file in the root directory of this source tree.
+# This source file is copied from https://github.com/facebookresearch/encodec
+# Copyright (c) Meta Platforms, Inc. and affiliates.
+# All rights reserved.
+#
+# This source code is licensed under the license found in the
+# LICENSE file in the root directory of this source tree.
+"""LSTM layers module."""
+from torch import nn
+class SLSTM(nn.Module):
+    """
+    LSTM that hides the hidden state and the data layout from the caller.
+    Input and output use the convolutional layout: ``forward`` moves the last
+    axis to the front before calling ``nn.LSTM`` and moves it back afterwards.
+    """
+    def __init__(
+        self,
+        dimension: int,
+        num_layers: int = 2,
+        skip: bool = True,
+        bidirectional: bool = False,
+    ):
+        super().__init__()
+        self.skip = skip
+        self.bidirectional = bidirectional
+        self.lstm = nn.LSTM(
+            dimension, dimension, num_layers, bidirectional=bidirectional
+        )
+    def forward(self, x):
+        """Run the LSTM over the last axis of ``x``, adding ``x`` back when ``skip`` is set."""
+        seq_first = x.permute(2, 0, 1)
+        out, _state = self.lstm(seq_first)
+        if self.skip:
+            # A bidirectional LSTM doubles the feature size, so the residual
+            # is tiled twice along the feature axis to match.
+            residual = seq_first.repeat(1, 1, 2) if self.bidirectional else seq_first
+            out = out + residual
+        return out.permute(1, 2, 0)

xinference/thirdparty/indextts/utils/maskgct/models/codec/speechtokenizer/modules/norm.py ADDED Viewed

@@ -0,0 +1,37 @@
+# Copyright (c) 2023 Amphion.
+#
+# This source code is licensed under the MIT license found in the
+# LICENSE file in the root directory of this source tree.
+# This source file is copied from https://github.com/facebookresearch/encodec
+# Copyright (c) Meta Platforms, Inc. and affiliates.
+# All rights reserved.
+#
+# This source code is licensed under the license found in the
+# LICENSE file in the root directory of this source tree.
+"""Normalization modules."""
+import typing as tp
+import einops
+import torch
+from torch import nn
+class ConvLayerNorm(nn.LayerNorm):
+    """
+    Convolution-friendly LayerNorm that moves channels to last dimensions
+    before running the normalization and moves them back to original position right after.
+    """
+    def __init__(
+        self, normalized_shape: tp.Union[int, tp.List[int], torch.Size], **kwargs
+    ):
+        super().__init__(normalized_shape, **kwargs)
+    def forward(self, x):
+        """Normalize ``x`` over its channel axes and return it in the input layout.
+        The method previously ended with a bare ``return``, so every caller
+        received ``None`` instead of the normalized tensor.
+        """
+        # "b ... t -> b t ...": move the last axis next to the batch axis so
+        # that the channel axes end up last, where nn.LayerNorm normalizes.
+        x = x.movedim(-1, 1)
+        x = super().forward(x)
+        # "b t ... -> b ... t": restore the original axis order.
+        x = x.movedim(1, -1)
+        return x

xinference/thirdparty/indextts/utils/maskgct/models/codec/speechtokenizer/modules/quantization/__init__.py ADDED Viewed

@@ -0,0 +1,14 @@
+# Copyright (c) 2023 Amphion.
+#
+# This source code is licensed under the MIT license found in the
+# LICENSE file in the root directory of this source tree.
+# This source file is copied from https://github.com/facebookresearch/encodec
+# Copyright (c) Meta Platforms, Inc. and affiliates.
+# All rights reserved.
+#
+# This source code is licensed under the license found in the
+# LICENSE file in the root directory of this source tree.
+# flake8: noqa
+from .vq import QuantizedResult, ResidualVectorQuantizer

xinference 1.10.0__py3-none-any.whl → 1.10.1__py3-none-any.whl

Potentially problematic release.

xinference 1.10.0__py3-none-any.whl → 1.10.1__py3-none-any.whl