PyPI - xinference - Versions diffs - 1.10.0__py3-none-any.whl → 1.11.0__py3-none-any.whl - Mend

xinference 1.10.0__py3-none-any.whl → 1.11.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release.

This version of xinference might be problematic. Click here for more details.

Files changed (328) hide show

xinference/thirdparty/indextts/utils/maskgct/models/codec/facodec/modules/style_encoder.py ADDED Viewed

@@ -0,0 +1,110 @@
+# Copyright (c) 2023 Amphion.
+#
+# This source code is licensed under the MIT license found in the
+# LICENSE file in the root directory of this source tree.
+# This code is modified from https://github.com/sh-lee-prml/HierSpeechpp/blob/main/ttv_v1/styleencoder.py
+from . import attentions
+from torch import nn
+import torch
+from torch.nn import functional as F
+class Mish(nn.Module):
+    """Mish activation: ``x * tanh(softplus(x))``, applied element-wise."""
+    def __init__(self):
+        super().__init__()
+    def forward(self, x):
+        # The tanh of the softplus acts as a smooth gate on the input itself.
+        gate = torch.tanh(F.softplus(x))
+        return x * gate
+class Conv1dGLU(nn.Module):
+    """
+    Conv1d + GLU (Gated Linear Unit) with a residual connection.
+    For GLU refer to the paper https://arxiv.org/abs/1612.08083.
+
+    The residual addition in ``forward`` needs the gated convolution output
+    to have the same shape as the input, so ``in_channels`` must equal
+    ``out_channels`` and ``kernel_size`` must be odd.
+
+    Args:
+        in_channels: channel count of the input.
+        out_channels: channel count after gating (the conv emits twice this).
+        kernel_size: odd convolution kernel size.
+        dropout: dropout probability applied to the gated output.
+    """
+    def __init__(self, in_channels, out_channels, kernel_size, dropout):
+        super().__init__()
+        self.out_channels = out_channels
+        # Length-preserving padding derived from the kernel size. A fixed
+        # padding of 2 is only right for kernel_size == 5; any other size
+        # changed the sequence length and broke the residual addition.
+        self.conv1 = nn.Conv1d(
+            in_channels,
+            2 * out_channels,
+            kernel_size=kernel_size,
+            padding=kernel_size // 2,
+        )
+        self.dropout = nn.Dropout(dropout)
+    def forward(self, x):
+        residual = x
+        x = self.conv1(x)
+        # First half of the channels is the content, second half the gate.
+        x1, x2 = torch.split(x, split_size_or_sections=self.out_channels, dim=1)
+        x = x1 * torch.sigmoid(x2)
+        return residual + self.dropout(x)
+class StyleEncoder(torch.nn.Module):
+    """Encode a feature sequence into a single style vector.
+
+    Pipeline: two 1x1 conv + Mish layers ("spectral"), two Conv1dGLU layers
+    ("temporal"), one multi-head self-attention layer with a residual
+    connection, a 1x1 conv projection to ``out_dim`` and an average over the
+    last (time) dimension.
+
+    Args:
+        in_dim: number of input channels.
+        hidden_dim: channel width of the intermediate layers.
+        out_dim: number of channels of the returned style vector.
+    """
+    def __init__(self, in_dim=513, hidden_dim=128, out_dim=256):
+        super().__init__()
+        self.in_dim = in_dim  # Linear 513 wav2vec 2.0 1024
+        self.hidden_dim = hidden_dim
+        self.out_dim = out_dim
+        self.kernel_size = 5
+        self.n_head = 2
+        self.dropout = 0.1
+        self.spectral = nn.Sequential(
+            nn.Conv1d(self.in_dim, self.hidden_dim, 1),
+            Mish(),
+            nn.Dropout(self.dropout),
+            nn.Conv1d(self.hidden_dim, self.hidden_dim, 1),
+            Mish(),
+            nn.Dropout(self.dropout),
+        )
+        self.temporal = nn.Sequential(
+            Conv1dGLU(self.hidden_dim, self.hidden_dim, self.kernel_size, self.dropout),
+            Conv1dGLU(self.hidden_dim, self.hidden_dim, self.kernel_size, self.dropout),
+        )
+        self.slf_attn = attentions.MultiHeadAttention(
+            self.hidden_dim,
+            self.hidden_dim,
+            self.n_head,
+            p_dropout=self.dropout,
+            proximal_bias=False,
+            proximal_init=True,
+        )
+        self.atten_drop = nn.Dropout(self.dropout)
+        self.fc = nn.Conv1d(self.hidden_dim, self.out_dim, 1)
+    def forward(self, x, mask=None):
+        """Return the pooled style vector for ``x``.
+
+        Args:
+            x: input features; presumably (batch, in_dim, time) -- TODO confirm.
+            mask: optional validity mask broadcastable against ``x``;
+                presumably (batch, 1, time) -- TODO confirm. ``None`` means
+                every frame is valid.
+        """
+        if mask is None:
+            # The signature advertises mask=None, but multiplying by None
+            # raised TypeError; run the same layers without masking.
+            x = self.spectral(x)
+            x = self.temporal(x)
+            # assumes MultiHeadAttention treats attn_mask=None as
+            # "no masking" -- TODO confirm against attentions.py
+            attn_mask = None
+        else:
+            # spectral
+            x = self.spectral(x) * mask
+            # temporal
+            x = self.temporal(x) * mask
+            # pairwise mask: both positions of a pair have to be valid
+            attn_mask = mask.unsqueeze(2) * mask.unsqueeze(-1)
+        # self-attention
+        y = self.slf_attn(x, x, attn_mask=attn_mask)
+        x = x + self.atten_drop(y)
+        # fc
+        x = self.fc(x)
+        # temporal average pooling
+        w = self.temporal_avg_pool(x, mask=mask)
+        return w
+    def temporal_avg_pool(self, x, mask=None):
+        """Average ``x`` over dim 2; with a mask, divide the sum by ``mask.sum(dim=2)``."""
+        if mask is None:
+            out = torch.mean(x, dim=2)
+        else:
+            # NOTE(review): forward() does not multiply x by mask again after
+            # the attention and fc steps, so this sum covers every frame while
+            # the divisor counts only unmasked ones. Kept unchanged so
+            # existing checkpoints produce the same output.
+            len_ = mask.sum(dim=2)
+            x = x.sum(dim=2)
+            out = torch.div(x, len_)
+        return out

xinference/thirdparty/indextts/utils/maskgct/models/codec/facodec/modules/wavenet.py ADDED Viewed

@@ -0,0 +1,224 @@
+# Copyright (c) 2023 Amphion.
+#
+# This source code is licensed under the MIT license found in the
+# LICENSE file in the root directory of this source tree.
+# This code is modified from https://github.com/sh-lee-prml/HierSpeechpp/blob/main/ttv_v1/modules.py
+import math
+import torch
+from torch import nn
+from torch.nn import functional as F
+from modules.dac.model.encodec import SConv1d
+from . import commons
+LRELU_SLOPE = 0.1
+class LayerNorm(nn.Module):
+    """Layer normalization over dim 1 with a learnable scale and shift.
+
+    Args:
+        channels: size of dim 1 of the input.
+        eps: value added to the variance for numerical stability.
+    """
+    def __init__(self, channels, eps=1e-5):
+        super().__init__()
+        self.channels = channels
+        self.eps = eps
+        self.gamma = nn.Parameter(torch.ones(channels))
+        self.beta = nn.Parameter(torch.zeros(channels))
+    def forward(self, x):
+        # F.layer_norm works on trailing dims, so move dim 1 to the end,
+        # normalize, then move it back.
+        channels_last = x.transpose(1, -1)
+        normed = F.layer_norm(
+            channels_last,
+            (self.channels,),
+            weight=self.gamma,
+            bias=self.beta,
+            eps=self.eps,
+        )
+        return normed.transpose(1, -1)
+class ConvReluNorm(nn.Module):
+    """Stack of Conv1d -> LayerNorm -> ReLU -> Dropout layers with a residual.
+
+    The final 1x1 projection is zero-initialized, so a freshly built module
+    returns ``x * x_mask``. The residual addition in ``forward`` requires
+    ``out_channels == in_channels``.
+
+    Args:
+        in_channels: channels of the input.
+        hidden_channels: channels of every intermediate layer.
+        out_channels: channels produced by the final projection.
+        kernel_size: kernel size of the stacked convolutions.
+        n_layers: number of conv/norm layers; must be at least 1.
+        p_dropout: dropout probability applied after each ReLU.
+    """
+    def __init__(
+        self,
+        in_channels,
+        hidden_channels,
+        out_channels,
+        kernel_size,
+        n_layers,
+        p_dropout,
+    ):
+        super().__init__()
+        self.in_channels = in_channels
+        self.hidden_channels = hidden_channels
+        self.out_channels = out_channels
+        self.kernel_size = kernel_size
+        self.n_layers = n_layers
+        self.p_dropout = p_dropout
+        # The check used to be `n_layers > 1`, contradicting its own message
+        # and rejecting a single-layer stack that works fine.
+        assert n_layers > 0, "Number of layers should be larger than 0."
+        self.conv_layers = nn.ModuleList()
+        self.norm_layers = nn.ModuleList()
+        for layer_idx in range(n_layers):
+            # Only the first convolution sees the input channel count.
+            conv_in = in_channels if layer_idx == 0 else hidden_channels
+            self.conv_layers.append(
+                nn.Conv1d(
+                    conv_in, hidden_channels, kernel_size, padding=kernel_size // 2
+                )
+            )
+            self.norm_layers.append(LayerNorm(hidden_channels))
+        self.relu_drop = nn.Sequential(nn.ReLU(), nn.Dropout(p_dropout))
+        self.proj = nn.Conv1d(hidden_channels, out_channels, 1)
+        # Start as an identity mapping: the residual branch contributes zero.
+        self.proj.weight.data.zero_()
+        self.proj.bias.data.zero_()
+    def forward(self, x, x_mask):
+        """Apply the stack to ``x``; ``x_mask`` is multiplied in before every conv and at the end."""
+        x_org = x
+        for conv, norm in zip(self.conv_layers, self.norm_layers):
+            x = conv(x * x_mask)
+            x = norm(x)
+            x = self.relu_drop(x)
+        x = x_org + self.proj(x)
+        return x * x_mask
+class DDSConv(nn.Module):
+    """
+    Dilated and Depth-Separable Convolution
+
+    Layer ``i`` is a depthwise Conv1d with dilation ``kernel_size**i``
+    followed by a 1x1 Conv1d. Each conv is followed by LayerNorm and GELU,
+    and each layer's output is added back to its input.
+    """
+    def __init__(self, channels, kernel_size, n_layers, p_dropout=0.0):
+        super().__init__()
+        self.channels = channels
+        self.kernel_size = kernel_size
+        self.n_layers = n_layers
+        self.p_dropout = p_dropout
+        self.drop = nn.Dropout(p_dropout)
+        self.convs_sep = nn.ModuleList()
+        self.convs_1x1 = nn.ModuleList()
+        self.norms_1 = nn.ModuleList()
+        self.norms_2 = nn.ModuleList()
+        for layer_idx in range(n_layers):
+            dilation = kernel_size**layer_idx
+            # Padding chosen from the dilated kernel extent.
+            padding = (kernel_size * dilation - dilation) // 2
+            depthwise = nn.Conv1d(
+                channels,
+                channels,
+                kernel_size,
+                groups=channels,
+                dilation=dilation,
+                padding=padding,
+            )
+            self.convs_sep.append(depthwise)
+            self.convs_1x1.append(nn.Conv1d(channels, channels, 1))
+            self.norms_1.append(LayerNorm(channels))
+            self.norms_2.append(LayerNorm(channels))
+    def forward(self, x, x_mask, g=None):
+        """Run all layers on ``x``; ``g``, when given, is added to ``x`` first."""
+        if g is not None:
+            x = x + g
+        layers = zip(self.convs_sep, self.convs_1x1, self.norms_1, self.norms_2)
+        for depthwise, pointwise, norm_a, norm_b in layers:
+            y = F.gelu(norm_a(depthwise(x * x_mask)))
+            y = F.gelu(norm_b(pointwise(y)))
+            x = x + self.drop(y)
+        return x * x_mask
+class WN(torch.nn.Module):
+    """Stack of dilated, gated convolution layers with residual and skip paths.
+
+    Every layer is an ``SConv1d`` built with ``norm="weight_norm"``. When
+    ``gin_channels`` is non-zero, a 1x1 conditioning layer projects ``g`` to
+    ``2 * hidden_channels`` channels per layer.
+
+    Args:
+        hidden_channels: channels of the input and of the returned tensor.
+        kernel_size: odd kernel size of the dilated convolutions.
+        dilation_rate: base of the per-layer dilation, ``dilation_rate**i``.
+        n_layers: number of gated layers.
+        gin_channels: channels of the conditioning input; 0 disables it.
+        p_dropout: dropout probability applied to the gated activations.
+        causal: forwarded to ``SConv1d`` for the dilated and res/skip convs.
+    """
+    def __init__(
+        self,
+        hidden_channels,
+        kernel_size,
+        dilation_rate,
+        n_layers,
+        gin_channels=0,
+        p_dropout=0,
+        causal=False,
+    ):
+        super().__init__()
+        conv_cls = SConv1d
+        assert kernel_size % 2 == 1
+        self.hidden_channels = hidden_channels
+        # NOTE(review): stored as a 1-tuple (trailing comma), possibly by
+        # accident; kept because it is a public attribute.
+        self.kernel_size = (kernel_size,)
+        self.dilation_rate = dilation_rate
+        self.n_layers = n_layers
+        self.gin_channels = gin_channels
+        self.p_dropout = p_dropout
+        self.in_layers = torch.nn.ModuleList()
+        self.res_skip_layers = torch.nn.ModuleList()
+        self.drop = nn.Dropout(p_dropout)
+        if gin_channels != 0:
+            # One projection provides the conditioning slices for all layers.
+            self.cond_layer = conv_cls(
+                gin_channels, 2 * hidden_channels * n_layers, 1, norm="weight_norm"
+            )
+        last_layer = n_layers - 1
+        for layer_idx in range(n_layers):
+            dilation = dilation_rate**layer_idx
+            padding = int((kernel_size * dilation - dilation) / 2)
+            self.in_layers.append(
+                conv_cls(
+                    hidden_channels,
+                    2 * hidden_channels,
+                    kernel_size,
+                    dilation=dilation,
+                    padding=padding,
+                    norm="weight_norm",
+                    causal=causal,
+                )
+            )
+            # The last layer only feeds the skip sum, so it does not need
+            # the residual half of the channels.
+            if layer_idx < last_layer:
+                res_skip_channels = 2 * hidden_channels
+            else:
+                res_skip_channels = hidden_channels
+            self.res_skip_layers.append(
+                conv_cls(
+                    hidden_channels,
+                    res_skip_channels,
+                    1,
+                    norm="weight_norm",
+                    causal=causal,
+                )
+            )
+    def forward(self, x, x_mask, g=None, **kwargs):
+        """Return the masked sum of all skip outputs; extra ``kwargs`` are ignored."""
+        skip_sum = torch.zeros_like(x)
+        n_channels_tensor = torch.IntTensor([self.hidden_channels])
+        if g is not None:
+            g = self.cond_layer(g)
+        cond_width = 2 * self.hidden_channels
+        last_layer = self.n_layers - 1
+        for layer_idx in range(self.n_layers):
+            x_in = self.in_layers[layer_idx](x)
+            if g is None:
+                g_l = torch.zeros_like(x_in)
+            else:
+                # Slice out this layer's share of the conditioning channels.
+                start = layer_idx * cond_width
+                g_l = g[:, start : start + cond_width, :]
+            acts = commons.fused_add_tanh_sigmoid_multiply(x_in, g_l, n_channels_tensor)
+            acts = self.drop(acts)
+            res_skip_acts = self.res_skip_layers[layer_idx](acts)
+            if layer_idx < last_layer:
+                # First half updates the residual stream, second half is skip.
+                x = (x + res_skip_acts[:, : self.hidden_channels, :]) * x_mask
+                skip_sum = skip_sum + res_skip_acts[:, self.hidden_channels :, :]
+            else:
+                skip_sum = skip_sum + res_skip_acts
+        return skip_sum * x_mask
+    def remove_weight_norm(self):
+        """Call ``torch.nn.utils.remove_weight_norm`` on every layer of the stack."""
+        # NOTE(review): the layers are SConv1d instances; confirm that
+        # remove_weight_norm accepts them directly rather than an inner conv.
+        if self.gin_channels != 0:
+            torch.nn.utils.remove_weight_norm(self.cond_layer)
+        for layer in self.in_layers:
+            torch.nn.utils.remove_weight_norm(layer)
+        for layer in self.res_skip_layers:
+            torch.nn.utils.remove_weight_norm(layer)

xinference/thirdparty/indextts/utils/maskgct/models/codec/facodec/optimizer.py ADDED Viewed

@@ -0,0 +1,104 @@
+# Copyright (c) 2023 Amphion.
+#
+# This source code is licensed under the MIT license found in the
+# LICENSE file in the root directory of this source tree.
+import os, sys
+import os.path as osp
+import numpy as np
+import torch
+from torch import nn
+from torch.optim import Optimizer
+from functools import reduce
+from torch.optim import AdamW
+class MultiOptimizer:
+    """Drive several optimizers (and their LR schedulers) as one, keyed by name.
+
+    Args:
+        optimizers: mapping of name -> optimizer; defaults to an empty mapping.
+        schedulers: mapping of name -> scheduler using the same names;
+            defaults to an empty mapping.
+
+    Attributes:
+        keys: optimizer names in insertion order.
+        param_groups: the param groups of all optimizers, concatenated.
+    """
+    def __init__(self, optimizers=None, schedulers=None):
+        # Fresh dicts per instance: a `{}` default is shared between calls.
+        self.optimizers = {} if optimizers is None else optimizers
+        self.schedulers = {} if schedulers is None else schedulers
+        self.keys = list(self.optimizers.keys())
+        # Flattening this way also works with zero optimizers, where
+        # reduce() without an initial value raised TypeError.
+        self.param_groups = [
+            group
+            for optimizer in self.optimizers.values()
+            for group in optimizer.param_groups
+        ]
+    def state_dict(self):
+        """Return ``[(key, optimizer_state_dict), ...]`` in key order."""
+        return [(key, self.optimizers[key].state_dict()) for key in self.keys]
+    def scheduler_state_dict(self):
+        """Return ``[(key, scheduler_state_dict), ...]`` in key order."""
+        return [(key, self.schedulers[key].state_dict()) for key in self.keys]
+    def load_state_dict(self, state_dict):
+        """Best-effort restore: entries that fail to load are reported and skipped."""
+        for key, val in state_dict:
+            try:
+                self.optimizers[key].load_state_dict(val)
+            except Exception:
+                # Deliberately lenient, but no longer swallows
+                # KeyboardInterrupt / SystemExit like a bare `except:` did.
+                print("Unloaded %s" % key)
+    def load_scheduler_state_dict(self, state_dict):
+        """Best-effort restore of scheduler states; failures are reported and skipped."""
+        for key, val in state_dict:
+            try:
+                self.schedulers[key].load_state_dict(val)
+            except Exception:
+                print("Unloaded %s" % key)
+    def step(self, key=None, scaler=None):
+        """Step the optimizer named ``key``, or every optimizer when ``key`` is None."""
+        keys = [key] if key is not None else self.keys
+        for name in keys:
+            self._step(name, scaler)
+    def _step(self, key, scaler=None):
+        if scaler is not None:
+            # NOTE(review): update() runs once per optimizer here; confirm
+            # this is intended when several optimizers share one scaler.
+            scaler.step(self.optimizers[key])
+            scaler.update()
+        else:
+            self.optimizers[key].step()
+    def zero_grad(self, key=None):
+        """Zero the gradients of one optimizer, or of all when ``key`` is None."""
+        if key is not None:
+            self.optimizers[key].zero_grad()
+        else:
+            for name in self.keys:
+                self.optimizers[name].zero_grad()
+    def scheduler(self, *args, key=None):
+        """Advance one scheduler, or all of them when ``key`` is None."""
+        if key is not None:
+            self.schedulers[key].step(*args)
+            return
+        for name in self.keys:
+            scheduler = self.schedulers[name]
+            # torch schedulers have no step_batch(), so the all-keys path
+            # raised AttributeError; still prefer step_batch when a custom
+            # scheduler provides it.
+            advance = getattr(scheduler, "step_batch", scheduler.step)
+            advance(*args)
+def define_scheduler(optimizer, params):
+    """Return an ``ExponentialLR`` for ``optimizer`` with decay ``params["gamma"]``."""
+    decay = params["gamma"]
+    return torch.optim.lr_scheduler.ExponentialLR(optimizer, gamma=decay)
+def build_optimizer(model_dict, scheduler_params_dict, lr, type="AdamW"):
+    """Build one AdamW optimizer and one ExponentialLR scheduler per model.
+
+    Args:
+        model_dict: mapping of name -> module whose parameters are optimized.
+        scheduler_params_dict: accepted for interface compatibility but not
+            read; every scheduler uses a fixed gamma of 0.999996.
+        lr: learning rate shared by all optimizers.
+        type: optimizer type; only "AdamW" is supported.
+
+    Returns:
+        A ``MultiOptimizer`` wrapping the optimizers and schedulers.
+
+    Raises:
+        ValueError: if ``type`` is not "AdamW" and ``model_dict`` is non-empty.
+    """
+    optim = {}
+    for key, model in model_dict.items():
+        if type != "AdamW":
+            raise ValueError("Unknown optimizer type: %s" % type)
+        optim[key] = AdamW(
+            model.parameters(),
+            lr=lr,
+            betas=(0.9, 0.98),
+            eps=1e-9,
+            weight_decay=0.1,
+        )
+    schedulers = {
+        key: torch.optim.lr_scheduler.ExponentialLR(opt, gamma=0.999996)
+        for key, opt in optim.items()
+    }
+    return MultiOptimizer(optim, schedulers)

xinference/thirdparty/indextts/utils/maskgct/models/codec/kmeans/repcodec_model.py ADDED Viewed

@@ -0,0 +1,210 @@
+# Copyright (c) 2024 Amphion.
+#
+# This source code is licensed under the MIT license found in the
+# LICENSE file in the root directory of this source tree.
+from concurrent.futures import ALL_COMPLETED
+import numpy as np
+import torch
+import torch.nn as nn
+from torch.nn import functional as F
+from einops import rearrange, repeat
+from indextts.utils.maskgct.models.codec.amphion_codec.quantize import ResidualVQ
+from indextts.utils.maskgct.models.codec.kmeans.vocos import VocosBackbone
+def init_weights(m):
+    """Initialize Conv1d / Linear modules; meant to be used with ``Module.apply``.
+
+    Weights are drawn from a truncated normal with std 0.02 and biases are
+    set to zero. Modules of any other type are left untouched.
+    """
+    if isinstance(m, (nn.Conv1d, nn.Linear)):
+        nn.init.trunc_normal_(m.weight, std=0.02)
+        # Layers built with bias=False have m.bias set to None, which
+        # nn.init.constant_ cannot handle.
+        if m.bias is not None:
+            nn.init.constant_(m.bias, 0)
+def compute_codebook_perplexity(indices, codebook_size):
+    """Return the perplexity (exp of the entropy) of codebook usage in ``indices``.
+
+    Args:
+        indices: integer tensor of code indices; it is flattened first.
+        codebook_size: number of codebook entries, used as the minimum
+            histogram length.
+    """
+    flat = indices.flatten()
+    counts = torch.bincount(flat, minlength=codebook_size).float()
+    prob = counts / flat.size(0)
+    # The small constant keeps log() finite for codes that never occur.
+    entropy = -torch.sum(prob * torch.log(prob + 1e-10))
+    return torch.exp(entropy)
+class RepCodec(nn.Module):
+    """Encoder -> residual vector quantizer -> decoder over feature sequences.
+
+    Encoder and decoder are each a ``VocosBackbone`` followed by a linear
+    projection back to ``hidden_size``. When ``downsample_scale`` is greater
+    than 1, the input is first passed through a stride-2 convolution and the
+    decoder output is upsampled by 2 again.
+
+    Args:
+        codebook_size: number of entries per codebook.
+        hidden_size: feature dimension of the input and the reconstruction.
+        codebook_dim: dimension of the codebook entries.
+        vocos_dim: channel width of the Vocos backbones.
+        vocos_intermediate_dim: intermediate width of the Vocos backbones.
+        vocos_num_layers: number of layers of each Vocos backbone.
+        num_quantizers: number of residual quantizers.
+        downsample_scale: values greater than 1 enable the stride-2
+            downsampling and matching x2 upsampling.
+        cfg: optional config object; any attribute on it named like one of
+            the arguments above overrides that argument.
+    """
+    def __init__(
+        self,
+        codebook_size=8192,
+        hidden_size=1024,
+        codebook_dim=8,
+        vocos_dim=384,
+        vocos_intermediate_dim=2048,
+        vocos_num_layers=12,
+        num_quantizers=1,
+        downsample_scale=1,
+        cfg=None,
+    ):
+        super().__init__()
+        # Each setting is looked up on cfg under its OWN name. Before,
+        # vocos_intermediate_dim and vocos_num_layers were gated on
+        # hasattr(cfg, "vocos_dim"), so they were ignored or raised
+        # AttributeError depending on which attributes cfg carried.
+        # getattr(None, name, default) returns the default, covering cfg=None.
+        self.codebook_size = getattr(cfg, "codebook_size", codebook_size)
+        self.codebook_dim = getattr(cfg, "codebook_dim", codebook_dim)
+        self.hidden_size = getattr(cfg, "hidden_size", hidden_size)
+        self.vocos_dim = getattr(cfg, "vocos_dim", vocos_dim)
+        self.vocos_intermediate_dim = getattr(
+            cfg, "vocos_intermediate_dim", vocos_intermediate_dim
+        )
+        self.vocos_num_layers = getattr(cfg, "vocos_num_layers", vocos_num_layers)
+        self.num_quantizers = getattr(cfg, "num_quantizers", num_quantizers)
+        self.downsample_scale = getattr(cfg, "downsample_scale", downsample_scale)
+        if self._uses_resampling():
+            # NOTE(review): stride and upsampling factor are fixed at 2
+            # whatever the value of downsample_scale.
+            self.down = nn.Conv1d(
+                self.hidden_size, self.hidden_size, kernel_size=3, stride=2, padding=1
+            )
+            self.up = nn.Conv1d(
+                self.hidden_size, self.hidden_size, kernel_size=3, stride=1, padding=1
+            )
+        self.encoder = nn.Sequential(
+            VocosBackbone(
+                input_channels=self.hidden_size,
+                dim=self.vocos_dim,
+                intermediate_dim=self.vocos_intermediate_dim,
+                num_layers=self.vocos_num_layers,
+                adanorm_num_embeddings=None,
+            ),
+            nn.Linear(self.vocos_dim, self.hidden_size),
+        )
+        self.decoder = nn.Sequential(
+            VocosBackbone(
+                input_channels=self.hidden_size,
+                dim=self.vocos_dim,
+                intermediate_dim=self.vocos_intermediate_dim,
+                num_layers=self.vocos_num_layers,
+                adanorm_num_embeddings=None,
+            ),
+            nn.Linear(self.vocos_dim, self.hidden_size),
+        )
+        self.quantizer = ResidualVQ(
+            input_dim=self.hidden_size,
+            num_quantizers=self.num_quantizers,
+            codebook_size=self.codebook_size,
+            codebook_dim=self.codebook_dim,
+            quantizer_type="fvq",
+            quantizer_dropout=0.0,
+            commitment=0.15,
+            codebook_loss_weight=1.0,
+            use_l2_normlize=True,
+        )
+        self.reset_parameters()
+    def _uses_resampling(self):
+        """Return True when the down/up-sampling convolutions are in use."""
+        return self.downsample_scale is not None and self.downsample_scale > 1
+    def _encode(self, x):
+        """Optionally downsample ``x``, run the encoder, return a channels-first tensor.
+
+        ``x`` is presumably (batch, time, hidden_size) -- TODO confirm.
+        """
+        if self._uses_resampling():
+            x = x.transpose(1, 2)
+            x = self.down(x)
+            x = F.gelu(x)
+            x = x.transpose(1, 2)
+        return self.encoder(x.transpose(1, 2)).transpose(1, 2)
+    def forward(self, x):
+        """Reconstruct ``x`` through the quantizer.
+
+        Returns:
+            ``(x_rec, codebook_loss, all_indices)``: the reconstruction, the
+            mean of codebook plus commitment losses, and the code indices.
+        """
+        # downsample + encoder
+        x = self._encode(x)
+        # vq
+        (
+            quantized_out,
+            all_indices,
+            all_commit_losses,
+            all_codebook_losses,
+            _,
+        ) = self.quantizer(x)
+        # decoder
+        x = self.decoder(quantized_out)
+        # up
+        if self._uses_resampling():
+            x = x.transpose(1, 2)
+            x = F.interpolate(x, scale_factor=2, mode="nearest")
+            x_rec = self.up(x).transpose(1, 2)
+        else:
+            # Without resampling the decoder output is the reconstruction.
+            # x_rec used to be left unassigned here, so forward() raised
+            # UnboundLocalError with the default downsample_scale=1.
+            x_rec = x
+        codebook_loss = (all_codebook_losses + all_commit_losses).mean()
+        return x_rec, codebook_loss, all_indices
+    def quantize(self, x):
+        """Encode and quantize ``x``.
+
+        Returns:
+            ``(all_indices, quantized)``, where ``quantized`` is the
+            quantizer output with dims 1 and 2 swapped. The leading dim of
+            ``all_indices`` is squeezed away when its size is 1.
+        """
+        x = self._encode(x)
+        (
+            quantized_out,
+            all_indices,
+            all_commit_losses,
+            all_codebook_losses,
+            _,
+        ) = self.quantizer(x)
+        if all_indices.shape[0] == 1:
+            return all_indices.squeeze(0), quantized_out.transpose(1, 2)
+        return all_indices, quantized_out.transpose(1, 2)
+    def reset_parameters(self):
+        """Re-initialize all Conv1d / Linear sub-modules via ``init_weights``."""
+        self.apply(init_weights)
+if __name__ == "__main__":
+    # Smoke test: build a model with 2x downsampling and push a random batch
+    # through forward() and quantize().
+    repcodec = RepCodec(vocos_dim=1024, downsample_scale=2)
+    print(repcodec)
+    # Parameter count in millions.
+    param_count = sum(p.numel() for p in repcodec.parameters())
+    print(param_count / 1e6)
+    x = torch.randn(5, 10, 1024)
+    x_rec, codebook_loss, all_indices = repcodec(x)
+    print(x_rec.shape, codebook_loss, all_indices.shape)
+    vq_id, emb = repcodec.quantize(x)
+    print(vq_id.shape, emb.shape)

xinference 1.10.0__py3-none-any.whl → 1.11.0__py3-none-any.whl

Potentially problematic release.

xinference 1.10.0__py3-none-any.whl → 1.11.0__py3-none-any.whl