xinference 0.14.2__py3-none-any.whl → 0.14.4__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Potentially problematic release: this version of xinference might be problematic.
- xinference/_version.py +3 -3
- xinference/core/chat_interface.py +1 -1
- xinference/core/image_interface.py +9 -0
- xinference/core/model.py +4 -1
- xinference/core/worker.py +60 -44
- xinference/model/audio/chattts.py +25 -9
- xinference/model/audio/core.py +8 -2
- xinference/model/audio/cosyvoice.py +4 -3
- xinference/model/audio/custom.py +4 -5
- xinference/model/audio/fish_speech.py +228 -0
- xinference/model/audio/model_spec.json +8 -0
- xinference/model/embedding/core.py +25 -1
- xinference/model/embedding/custom.py +4 -5
- xinference/model/flexible/core.py +5 -1
- xinference/model/image/custom.py +4 -5
- xinference/model/image/model_spec.json +2 -1
- xinference/model/image/model_spec_modelscope.json +2 -1
- xinference/model/image/stable_diffusion/core.py +66 -3
- xinference/model/llm/__init__.py +6 -0
- xinference/model/llm/llm_family.json +54 -9
- xinference/model/llm/llm_family.py +7 -6
- xinference/model/llm/llm_family_modelscope.json +56 -10
- xinference/model/llm/lmdeploy/__init__.py +0 -0
- xinference/model/llm/lmdeploy/core.py +557 -0
- xinference/model/llm/sglang/core.py +7 -1
- xinference/model/llm/transformers/cogvlm2.py +4 -45
- xinference/model/llm/transformers/cogvlm2_video.py +524 -0
- xinference/model/llm/transformers/core.py +3 -0
- xinference/model/llm/transformers/glm4v.py +2 -23
- xinference/model/llm/transformers/intern_vl.py +94 -11
- xinference/model/llm/transformers/minicpmv25.py +2 -23
- xinference/model/llm/transformers/minicpmv26.py +2 -22
- xinference/model/llm/transformers/yi_vl.py +2 -24
- xinference/model/llm/utils.py +13 -1
- xinference/model/llm/vllm/core.py +1 -34
- xinference/model/rerank/custom.py +4 -5
- xinference/model/utils.py +41 -1
- xinference/model/video/core.py +3 -1
- xinference/model/video/diffusers.py +41 -38
- xinference/model/video/model_spec.json +24 -1
- xinference/model/video/model_spec_modelscope.json +25 -1
- xinference/thirdparty/fish_speech/__init__.py +0 -0
- xinference/thirdparty/fish_speech/fish_speech/__init__.py +0 -0
- xinference/thirdparty/fish_speech/fish_speech/callbacks/__init__.py +3 -0
- xinference/thirdparty/fish_speech/fish_speech/callbacks/grad_norm.py +113 -0
- xinference/thirdparty/fish_speech/fish_speech/configs/__init__.py +0 -0
- xinference/thirdparty/fish_speech/fish_speech/configs/lora/__init__.py +0 -0
- xinference/thirdparty/fish_speech/fish_speech/conversation.py +2 -0
- xinference/thirdparty/fish_speech/fish_speech/datasets/__init__.py +0 -0
- xinference/thirdparty/fish_speech/fish_speech/datasets/concat_repeat.py +53 -0
- xinference/thirdparty/fish_speech/fish_speech/datasets/protos/__init__.py +0 -0
- xinference/thirdparty/fish_speech/fish_speech/datasets/protos/text_data_pb2.py +33 -0
- xinference/thirdparty/fish_speech/fish_speech/datasets/protos/text_data_stream.py +36 -0
- xinference/thirdparty/fish_speech/fish_speech/datasets/semantic.py +496 -0
- xinference/thirdparty/fish_speech/fish_speech/datasets/vqgan.py +147 -0
- xinference/thirdparty/fish_speech/fish_speech/i18n/__init__.py +3 -0
- xinference/thirdparty/fish_speech/fish_speech/i18n/core.py +40 -0
- xinference/thirdparty/fish_speech/fish_speech/i18n/locale/__init__.py +0 -0
- xinference/thirdparty/fish_speech/fish_speech/i18n/locale/en_US.json +122 -0
- xinference/thirdparty/fish_speech/fish_speech/i18n/locale/es_ES.json +122 -0
- xinference/thirdparty/fish_speech/fish_speech/i18n/locale/ja_JP.json +123 -0
- xinference/thirdparty/fish_speech/fish_speech/i18n/locale/pt_BR.json +133 -0
- xinference/thirdparty/fish_speech/fish_speech/i18n/locale/zh_CN.json +122 -0
- xinference/thirdparty/fish_speech/fish_speech/i18n/scan.py +122 -0
- xinference/thirdparty/fish_speech/fish_speech/models/__init__.py +0 -0
- xinference/thirdparty/fish_speech/fish_speech/models/text2semantic/__init__.py +0 -0
- xinference/thirdparty/fish_speech/fish_speech/models/text2semantic/lit_module.py +202 -0
- xinference/thirdparty/fish_speech/fish_speech/models/text2semantic/llama.py +779 -0
- xinference/thirdparty/fish_speech/fish_speech/models/text2semantic/lora.py +92 -0
- xinference/thirdparty/fish_speech/fish_speech/models/vqgan/__init__.py +3 -0
- xinference/thirdparty/fish_speech/fish_speech/models/vqgan/lit_module.py +442 -0
- xinference/thirdparty/fish_speech/fish_speech/models/vqgan/modules/__init__.py +0 -0
- xinference/thirdparty/fish_speech/fish_speech/models/vqgan/modules/discriminator.py +44 -0
- xinference/thirdparty/fish_speech/fish_speech/models/vqgan/modules/firefly.py +625 -0
- xinference/thirdparty/fish_speech/fish_speech/models/vqgan/modules/fsq.py +139 -0
- xinference/thirdparty/fish_speech/fish_speech/models/vqgan/modules/reference.py +115 -0
- xinference/thirdparty/fish_speech/fish_speech/models/vqgan/modules/wavenet.py +225 -0
- xinference/thirdparty/fish_speech/fish_speech/models/vqgan/utils.py +94 -0
- xinference/thirdparty/fish_speech/fish_speech/scheduler.py +40 -0
- xinference/thirdparty/fish_speech/fish_speech/text/__init__.py +4 -0
- xinference/thirdparty/fish_speech/fish_speech/text/chn_text_norm/__init__.py +0 -0
- xinference/thirdparty/fish_speech/fish_speech/text/chn_text_norm/basic_class.py +172 -0
- xinference/thirdparty/fish_speech/fish_speech/text/chn_text_norm/basic_constant.py +30 -0
- xinference/thirdparty/fish_speech/fish_speech/text/chn_text_norm/basic_util.py +342 -0
- xinference/thirdparty/fish_speech/fish_speech/text/chn_text_norm/cardinal.py +32 -0
- xinference/thirdparty/fish_speech/fish_speech/text/chn_text_norm/date.py +75 -0
- xinference/thirdparty/fish_speech/fish_speech/text/chn_text_norm/digit.py +32 -0
- xinference/thirdparty/fish_speech/fish_speech/text/chn_text_norm/fraction.py +35 -0
- xinference/thirdparty/fish_speech/fish_speech/text/chn_text_norm/money.py +43 -0
- xinference/thirdparty/fish_speech/fish_speech/text/chn_text_norm/percentage.py +33 -0
- xinference/thirdparty/fish_speech/fish_speech/text/chn_text_norm/telephone.py +51 -0
- xinference/thirdparty/fish_speech/fish_speech/text/chn_text_norm/text.py +177 -0
- xinference/thirdparty/fish_speech/fish_speech/text/clean.py +69 -0
- xinference/thirdparty/fish_speech/fish_speech/text/spliter.py +130 -0
- xinference/thirdparty/fish_speech/fish_speech/train.py +139 -0
- xinference/thirdparty/fish_speech/fish_speech/utils/__init__.py +23 -0
- xinference/thirdparty/fish_speech/fish_speech/utils/braceexpand.py +217 -0
- xinference/thirdparty/fish_speech/fish_speech/utils/context.py +13 -0
- xinference/thirdparty/fish_speech/fish_speech/utils/file.py +16 -0
- xinference/thirdparty/fish_speech/fish_speech/utils/instantiators.py +50 -0
- xinference/thirdparty/fish_speech/fish_speech/utils/logger.py +55 -0
- xinference/thirdparty/fish_speech/fish_speech/utils/logging_utils.py +48 -0
- xinference/thirdparty/fish_speech/fish_speech/utils/rich_utils.py +100 -0
- xinference/thirdparty/fish_speech/fish_speech/utils/spectrogram.py +122 -0
- xinference/thirdparty/fish_speech/fish_speech/utils/utils.py +114 -0
- xinference/thirdparty/fish_speech/fish_speech/webui/__init__.py +0 -0
- xinference/thirdparty/fish_speech/fish_speech/webui/launch_utils.py +120 -0
- xinference/thirdparty/fish_speech/fish_speech/webui/manage.py +1237 -0
- xinference/thirdparty/fish_speech/tools/__init__.py +0 -0
- xinference/thirdparty/fish_speech/tools/api.py +495 -0
- xinference/thirdparty/fish_speech/tools/auto_rerank.py +159 -0
- xinference/thirdparty/fish_speech/tools/download_models.py +55 -0
- xinference/thirdparty/fish_speech/tools/extract_model.py +21 -0
- xinference/thirdparty/fish_speech/tools/file.py +108 -0
- xinference/thirdparty/fish_speech/tools/gen_ref.py +36 -0
- xinference/thirdparty/fish_speech/tools/llama/__init__.py +0 -0
- xinference/thirdparty/fish_speech/tools/llama/build_dataset.py +169 -0
- xinference/thirdparty/fish_speech/tools/llama/eval_in_context.py +171 -0
- xinference/thirdparty/fish_speech/tools/llama/generate.py +698 -0
- xinference/thirdparty/fish_speech/tools/llama/merge_lora.py +95 -0
- xinference/thirdparty/fish_speech/tools/llama/quantize.py +497 -0
- xinference/thirdparty/fish_speech/tools/llama/rebuild_tokenizer.py +57 -0
- xinference/thirdparty/fish_speech/tools/merge_asr_files.py +55 -0
- xinference/thirdparty/fish_speech/tools/post_api.py +164 -0
- xinference/thirdparty/fish_speech/tools/sensevoice/__init__.py +0 -0
- xinference/thirdparty/fish_speech/tools/sensevoice/auto_model.py +573 -0
- xinference/thirdparty/fish_speech/tools/sensevoice/fun_asr.py +332 -0
- xinference/thirdparty/fish_speech/tools/sensevoice/vad_utils.py +61 -0
- xinference/thirdparty/fish_speech/tools/smart_pad.py +47 -0
- xinference/thirdparty/fish_speech/tools/vqgan/__init__.py +0 -0
- xinference/thirdparty/fish_speech/tools/vqgan/create_train_split.py +83 -0
- xinference/thirdparty/fish_speech/tools/vqgan/extract_vq.py +227 -0
- xinference/thirdparty/fish_speech/tools/vqgan/inference.py +120 -0
- xinference/thirdparty/fish_speech/tools/webui.py +619 -0
- xinference/thirdparty/fish_speech/tools/whisper_asr.py +176 -0
- xinference/thirdparty/matcha/__init__.py +0 -0
- xinference/thirdparty/matcha/app.py +357 -0
- xinference/thirdparty/matcha/cli.py +419 -0
- xinference/thirdparty/matcha/data/__init__.py +0 -0
- xinference/thirdparty/matcha/data/components/__init__.py +0 -0
- xinference/thirdparty/matcha/data/text_mel_datamodule.py +274 -0
- xinference/thirdparty/matcha/hifigan/__init__.py +0 -0
- xinference/thirdparty/matcha/hifigan/config.py +28 -0
- xinference/thirdparty/matcha/hifigan/denoiser.py +64 -0
- xinference/thirdparty/matcha/hifigan/env.py +17 -0
- xinference/thirdparty/matcha/hifigan/meldataset.py +217 -0
- xinference/thirdparty/matcha/hifigan/models.py +368 -0
- xinference/thirdparty/matcha/hifigan/xutils.py +60 -0
- xinference/thirdparty/matcha/models/__init__.py +0 -0
- xinference/thirdparty/matcha/models/baselightningmodule.py +210 -0
- xinference/thirdparty/matcha/models/components/__init__.py +0 -0
- xinference/thirdparty/matcha/models/components/decoder.py +443 -0
- xinference/thirdparty/matcha/models/components/flow_matching.py +132 -0
- xinference/thirdparty/matcha/models/components/text_encoder.py +410 -0
- xinference/thirdparty/matcha/models/components/transformer.py +316 -0
- xinference/thirdparty/matcha/models/matcha_tts.py +244 -0
- xinference/thirdparty/matcha/onnx/__init__.py +0 -0
- xinference/thirdparty/matcha/onnx/export.py +181 -0
- xinference/thirdparty/matcha/onnx/infer.py +168 -0
- xinference/thirdparty/matcha/text/__init__.py +53 -0
- xinference/thirdparty/matcha/text/cleaners.py +121 -0
- xinference/thirdparty/matcha/text/numbers.py +71 -0
- xinference/thirdparty/matcha/text/symbols.py +17 -0
- xinference/thirdparty/matcha/train.py +122 -0
- xinference/thirdparty/matcha/utils/__init__.py +5 -0
- xinference/thirdparty/matcha/utils/audio.py +82 -0
- xinference/thirdparty/matcha/utils/generate_data_statistics.py +112 -0
- xinference/thirdparty/matcha/utils/get_durations_from_trained_model.py +195 -0
- xinference/thirdparty/matcha/utils/instantiators.py +56 -0
- xinference/thirdparty/matcha/utils/logging_utils.py +53 -0
- xinference/thirdparty/matcha/utils/model.py +90 -0
- xinference/thirdparty/matcha/utils/monotonic_align/__init__.py +22 -0
- xinference/thirdparty/matcha/utils/monotonic_align/core.pyx +47 -0
- xinference/thirdparty/matcha/utils/monotonic_align/setup.py +7 -0
- xinference/thirdparty/matcha/utils/pylogger.py +21 -0
- xinference/thirdparty/matcha/utils/rich_utils.py +101 -0
- xinference/thirdparty/matcha/utils/utils.py +259 -0
- xinference/web/ui/build/asset-manifest.json +3 -3
- xinference/web/ui/build/index.html +1 -1
- xinference/web/ui/build/static/js/{main.ffc26121.js → main.661c7b0a.js} +3 -3
- xinference/web/ui/build/static/js/main.661c7b0a.js.map +1 -0
- xinference/web/ui/node_modules/.cache/babel-loader/070d8c6b3b0f3485c6d3885f0b6bbfdf9643e088a468acbd5d596f2396071c16.json +1 -0
- {xinference-0.14.2.dist-info → xinference-0.14.4.dist-info}/METADATA +31 -11
- {xinference-0.14.2.dist-info → xinference-0.14.4.dist-info}/RECORD +189 -49
- xinference/web/ui/build/static/js/main.ffc26121.js.map +0 -1
- xinference/web/ui/node_modules/.cache/babel-loader/2f40209b32e7e46a2eab6b8c8a355eb42c3caa8bc3228dd929f32fd2b3940294.json +0 -1
- /xinference/web/ui/build/static/js/{main.ffc26121.js.LICENSE.txt → main.661c7b0a.js.LICENSE.txt} +0 -0
- {xinference-0.14.2.dist-info → xinference-0.14.4.dist-info}/LICENSE +0 -0
- {xinference-0.14.2.dist-info → xinference-0.14.4.dist-info}/WHEEL +0 -0
- {xinference-0.14.2.dist-info → xinference-0.14.4.dist-info}/entry_points.txt +0 -0
- {xinference-0.14.2.dist-info → xinference-0.14.4.dist-info}/top_level.txt +0 -0
New file xinference/thirdparty/fish_speech/fish_speech/models/vqgan/modules/fsq.py (+139 lines):

```diff
@@ -0,0 +1,139 @@
+from dataclasses import dataclass
+
+import torch
+import torch.nn as nn
+import torch.nn.functional as F
+from einops import rearrange
+from vector_quantize_pytorch import GroupedResidualFSQ
+
+from .firefly import ConvNeXtBlock
+
+
+@dataclass
+class FSQResult:
+    z: torch.Tensor
+    codes: torch.Tensor
+    latents: torch.Tensor
+
+
+class DownsampleFiniteScalarQuantize(nn.Module):
+    def __init__(
+        self,
+        input_dim: int = 512,
+        n_codebooks: int = 1,
+        n_groups: int = 1,
+        levels: tuple[int] = (8, 5, 5, 5),  # Approximate 2**10
+        downsample_factor: tuple[int] = (2, 2),
+        downsample_dims: tuple[int] | None = None,
+    ):
+        super().__init__()
+
+        if downsample_dims is None:
+            downsample_dims = [input_dim for _ in range(len(downsample_factor))]
+
+        all_dims = (input_dim,) + tuple(downsample_dims)
+
+        self.residual_fsq = GroupedResidualFSQ(
+            dim=all_dims[-1],
+            levels=levels,
+            num_quantizers=n_codebooks,
+            groups=n_groups,
+        )
+
+        self.downsample_factor = downsample_factor
+        self.downsample_dims = downsample_dims
+
+        self.downsample = nn.Sequential(
+            *[
+                nn.Sequential(
+                    nn.Conv1d(
+                        all_dims[idx],
+                        all_dims[idx + 1],
+                        kernel_size=factor,
+                        stride=factor,
+                    ),
+                    ConvNeXtBlock(dim=all_dims[idx + 1]),
+                )
+                for idx, factor in enumerate(downsample_factor)
+            ]
+        )
+
+        self.upsample = nn.Sequential(
+            *[
+                nn.Sequential(
+                    nn.ConvTranspose1d(
+                        all_dims[idx + 1],
+                        all_dims[idx],
+                        kernel_size=factor,
+                        stride=factor,
+                    ),
+                    ConvNeXtBlock(dim=all_dims[idx]),
+                )
+                for idx, factor in reversed(list(enumerate(downsample_factor)))
+            ]
+        )
+
+        self.apply(self._init_weights)
+
+    def _init_weights(self, m):
+        if isinstance(m, (nn.Conv1d, nn.Linear)):
+            nn.init.trunc_normal_(m.weight, std=0.02)
+            nn.init.constant_(m.bias, 0)
+
+    def forward(self, z) -> FSQResult:
+        original_shape = z.shape
+        z = self.downsample(z)
+        quantized, indices = self.residual_fsq(z.mT)
+        result = FSQResult(
+            z=quantized.mT,
+            codes=indices.mT,
+            latents=z,
+        )
+        result.z = self.upsample(result.z)
+
+        # Pad or crop z to match original shape
+        diff = original_shape[-1] - result.z.shape[-1]
+        left = diff // 2
+        right = diff - left
+
+        if diff > 0:
+            result.z = F.pad(result.z, (left, right))
+        elif diff < 0:
+            result.z = result.z[..., left:-right]
+
+        return result
+
+    def encode(self, z):
+        z = self.downsample(z)
+        _, indices = self.residual_fsq(z.mT)
+        indices = rearrange(indices, "g b l r -> b (g r) l")
+        return indices
+
+    def decode(self, indices: torch.Tensor):
+        indices = rearrange(indices, "b (g r) l -> g b l r", g=self.residual_fsq.groups)
+        z_q = self.residual_fsq.get_output_from_indices(indices)
+        z_q = self.upsample(z_q.mT)
+        return z_q
+
+    # def from_latents(self, latents: torch.Tensor):
+    #     z_q, z_p, codes = super().from_latents(latents)
+    #     z_q = self.upsample(z_q)
+    #     return z_q, z_p, codes
+
+
+if __name__ == "__main__":
+    rvq = DownsampleFiniteScalarQuantize(
+        n_codebooks=1,
+        downsample_factor=(2, 2),
+    )
+    x = torch.randn(16, 512, 80)
+
+    result = rvq(x)
+    print(rvq)
+    print(result.latents.shape, result.codes.shape, result.z.shape)
+
+    # y = rvq.from_codes(result.codes)
+    # print(y[0].shape)
+
+    # y = rvq.from_latents(result.latents)
+    # print(y[0].shape)
```
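For orientation, here is a minimal usage sketch (not part of the diff) of the encode/decode round trip that `DownsampleFiniteScalarQuantize` exposes. The import path simply mirrors the wheel layout above; it assumes `vector_quantize_pytorch` and the vendored `firefly` module are importable, and all shapes are illustrative.

```python
# Sketch only: round-trip continuous features through discrete FSQ codes.
import torch

from xinference.thirdparty.fish_speech.fish_speech.models.vqgan.modules.fsq import (
    DownsampleFiniteScalarQuantize,
)

quantizer = DownsampleFiniteScalarQuantize(input_dim=512, downsample_factor=(2, 2))
features = torch.randn(2, 512, 80)  # (batch, channels, frames)

codes = quantizer.encode(features)  # discrete indices, (batch, n_groups * n_codebooks, frames / 4)
recon = quantizer.decode(codes)     # continuous features recovered from the codes
print(codes.shape, recon.shape)
```

With `downsample_factor=(2, 2)` the code sequence is 4× shorter than the input, and `forward` pads or crops the reconstruction back to the original length.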
New file xinference/thirdparty/fish_speech/fish_speech/models/vqgan/modules/reference.py (+115 lines):

```diff
@@ -0,0 +1,115 @@
+from typing import Optional
+
+import torch
+import torch.nn.functional as F
+from torch import nn
+
+from fish_speech.utils import autocast_exclude_mps
+
+from .wavenet import WaveNet
+
+
+class ReferenceEncoder(WaveNet):
+    def __init__(
+        self,
+        input_channels: Optional[int] = None,
+        output_channels: Optional[int] = None,
+        residual_channels: int = 512,
+        residual_layers: int = 20,
+        dilation_cycle: Optional[int] = 4,
+        num_heads: int = 8,
+        latent_len: int = 4,
+    ):
+        super().__init__(
+            input_channels=input_channels,
+            residual_channels=residual_channels,
+            residual_layers=residual_layers,
+            dilation_cycle=dilation_cycle,
+        )
+
+        self.head_dim = residual_channels // num_heads
+        self.num_heads = num_heads
+
+        self.latent_len = latent_len
+        self.latent = nn.Parameter(torch.zeros(1, self.latent_len, residual_channels))
+
+        self.q = nn.Linear(residual_channels, residual_channels, bias=True)
+        self.kv = nn.Linear(residual_channels, residual_channels * 2, bias=True)
+        self.q_norm = nn.LayerNorm(self.head_dim)
+        self.k_norm = nn.LayerNorm(self.head_dim)
+        self.proj = nn.Linear(residual_channels, residual_channels)
+        self.proj_drop = nn.Dropout(0.1)
+
+        self.norm = nn.LayerNorm(residual_channels)
+        self.mlp = nn.Sequential(
+            nn.Linear(residual_channels, residual_channels * 4),
+            nn.SiLU(),
+            nn.Linear(residual_channels * 4, residual_channels),
+        )
+        self.output_projection_attn = nn.Linear(residual_channels, output_channels)
+
+        torch.nn.init.trunc_normal_(self.latent, std=0.02)
+        self.apply(self.init_weights)
+
+    def init_weights(self, m):
+        if isinstance(m, nn.Linear):
+            torch.nn.init.trunc_normal_(m.weight, std=0.02)
+            if m.bias is not None:
+                torch.nn.init.constant_(m.bias, 0)
+
+    def forward(self, x, attn_mask=None):
+        x = super().forward(x).mT
+        B, N, C = x.shape
+
+        # Calculate mask
+        if attn_mask is not None:
+            assert attn_mask.shape == (B, N) and attn_mask.dtype == torch.bool
+
+            attn_mask = attn_mask[:, None, None, :].expand(
+                B, self.num_heads, self.latent_len, N
+            )
+
+        q_latent = self.latent.expand(B, -1, -1)
+        q = (
+            self.q(q_latent)
+            .reshape(B, self.latent_len, self.num_heads, self.head_dim)
+            .transpose(1, 2)
+        )
+
+        kv = (
+            self.kv(x)
+            .reshape(B, N, 2, self.num_heads, self.head_dim)
+            .permute(2, 0, 3, 1, 4)
+        )
+        k, v = kv.unbind(0)
+
+        q, k = self.q_norm(q), self.k_norm(k)
+        x = F.scaled_dot_product_attention(q, k, v, attn_mask=attn_mask)
+
+        x = x.transpose(1, 2).reshape(B, self.latent_len, C)
+        x = self.proj(x)
+        x = self.proj_drop(x)
+
+        x = x + self.mlp(self.norm(x))
+        x = self.output_projection_attn(x)
+        x = x.mean(1)
+
+        return x
+
+
+if __name__ == "__main__":
+    with autocast_exclude_mps(device_type="cpu", dtype=torch.bfloat16):
+        model = ReferenceEncoder(
+            input_channels=128,
+            output_channels=64,
+            residual_channels=384,
+            residual_layers=20,
+            dilation_cycle=4,
+            num_heads=8,
+        )
+        x = torch.randn(4, 128, 64)
+        mask = torch.ones(4, 64, dtype=torch.bool)
+        y = model(x, mask)
+        print(y.shape)
+        loss = F.mse_loss(y, torch.randn(4, 64))
+        loss.backward()
```
New file xinference/thirdparty/fish_speech/fish_speech/models/vqgan/modules/wavenet.py (+225 lines):

```diff
@@ -0,0 +1,225 @@
+import math
+from typing import Optional
+
+import torch
+import torch.nn.functional as F
+from torch import nn
+
+
+class Mish(nn.Module):
+    def forward(self, x):
+        return x * torch.tanh(F.softplus(x))
+
+
+class DiffusionEmbedding(nn.Module):
+    """Diffusion Step Embedding"""
+
+    def __init__(self, d_denoiser):
+        super(DiffusionEmbedding, self).__init__()
+        self.dim = d_denoiser
+
+    def forward(self, x):
+        device = x.device
+        half_dim = self.dim // 2
+        emb = math.log(10000) / (half_dim - 1)
+        emb = torch.exp(torch.arange(half_dim, device=device) * -emb)
+        emb = x[:, None] * emb[None, :]
+        emb = torch.cat((emb.sin(), emb.cos()), dim=-1)
+        return emb
+
+
+class LinearNorm(nn.Module):
+    """LinearNorm Projection"""
+
+    def __init__(self, in_features, out_features, bias=False):
+        super(LinearNorm, self).__init__()
+        self.linear = nn.Linear(in_features, out_features, bias)
+
+        nn.init.xavier_uniform_(self.linear.weight)
+        if bias:
+            nn.init.constant_(self.linear.bias, 0.0)
+
+    def forward(self, x):
+        x = self.linear(x)
+        return x
+
+
+class ConvNorm(nn.Module):
+    """1D Convolution"""
+
+    def __init__(
+        self,
+        in_channels,
+        out_channels,
+        kernel_size=1,
+        stride=1,
+        padding=None,
+        dilation=1,
+        bias=True,
+        w_init_gain="linear",
+    ):
+        super(ConvNorm, self).__init__()
+
+        if padding is None:
+            assert kernel_size % 2 == 1
+            padding = int(dilation * (kernel_size - 1) / 2)
+
+        self.conv = nn.Conv1d(
+            in_channels,
+            out_channels,
+            kernel_size=kernel_size,
+            stride=stride,
+            padding=padding,
+            dilation=dilation,
+            bias=bias,
+        )
+        nn.init.kaiming_normal_(self.conv.weight)
+
+    def forward(self, signal):
+        conv_signal = self.conv(signal)
+
+        return conv_signal
+
+
+class ResidualBlock(nn.Module):
+    """Residual Block"""
+
+    def __init__(
+        self,
+        residual_channels,
+        use_linear_bias=False,
+        dilation=1,
+        condition_channels=None,
+    ):
+        super(ResidualBlock, self).__init__()
+        self.conv_layer = ConvNorm(
+            residual_channels,
+            2 * residual_channels,
+            kernel_size=3,
+            stride=1,
+            padding=dilation,
+            dilation=dilation,
+        )
+
+        if condition_channels is not None:
+            self.diffusion_projection = LinearNorm(
+                residual_channels, residual_channels, use_linear_bias
+            )
+            self.condition_projection = ConvNorm(
+                condition_channels, 2 * residual_channels, kernel_size=1
+            )
+
+        self.output_projection = ConvNorm(
+            residual_channels, 2 * residual_channels, kernel_size=1
+        )
+
+    def forward(self, x, condition=None, diffusion_step=None):
+        y = x
+
+        if diffusion_step is not None:
+            diffusion_step = self.diffusion_projection(diffusion_step).unsqueeze(-1)
+            y = y + diffusion_step
+
+        y = self.conv_layer(y)
+
+        if condition is not None:
+            condition = self.condition_projection(condition)
+            y = y + condition
+
+        gate, filter = torch.chunk(y, 2, dim=1)
+        y = torch.sigmoid(gate) * torch.tanh(filter)
+
+        y = self.output_projection(y)
+        residual, skip = torch.chunk(y, 2, dim=1)
+
+        return (x + residual) / math.sqrt(2.0), skip
+
+
+class WaveNet(nn.Module):
+    def __init__(
+        self,
+        input_channels: Optional[int] = None,
+        output_channels: Optional[int] = None,
+        residual_channels: int = 512,
+        residual_layers: int = 20,
+        dilation_cycle: Optional[int] = 4,
+        is_diffusion: bool = False,
+        condition_channels: Optional[int] = None,
+    ):
+        super().__init__()
+
+        # Input projection
+        self.input_projection = None
+        if input_channels is not None and input_channels != residual_channels:
+            self.input_projection = ConvNorm(
+                input_channels, residual_channels, kernel_size=1
+            )
+
+        if input_channels is None:
+            input_channels = residual_channels
+
+        self.input_channels = input_channels
+
+        # Residual layers
+        self.residual_layers = nn.ModuleList(
+            [
+                ResidualBlock(
+                    residual_channels=residual_channels,
+                    use_linear_bias=False,
+                    dilation=2 ** (i % dilation_cycle) if dilation_cycle else 1,
+                    condition_channels=condition_channels,
+                )
+                for i in range(residual_layers)
+            ]
+        )
+
+        # Skip projection
+        self.skip_projection = ConvNorm(
+            residual_channels, residual_channels, kernel_size=1
+        )
+
+        # Output projection
+        self.output_projection = None
+        if output_channels is not None and output_channels != residual_channels:
+            self.output_projection = ConvNorm(
+                residual_channels, output_channels, kernel_size=1
+            )
+
+        if is_diffusion:
+            self.diffusion_embedding = DiffusionEmbedding(residual_channels)
+            self.mlp = nn.Sequential(
+                LinearNorm(residual_channels, residual_channels * 4, False),
+                Mish(),
+                LinearNorm(residual_channels * 4, residual_channels, False),
+            )
+
+        self.apply(self._init_weights)
+
+    def _init_weights(self, m):
+        if isinstance(m, (nn.Conv1d, nn.Linear)):
+            nn.init.trunc_normal_(m.weight, std=0.02)
+            if getattr(m, "bias", None) is not None:
+                nn.init.constant_(m.bias, 0)
+
+    def forward(self, x, t=None, condition=None):
+        if self.input_projection is not None:
+            x = self.input_projection(x)
+            x = F.silu(x)
+
+        if t is not None:
+            t = self.diffusion_embedding(t)
+            t = self.mlp(t)
+
+        skip = []
+        for layer in self.residual_layers:
+            x, skip_connection = layer(x, condition, t)
+            skip.append(skip_connection)
+
+        x = torch.sum(torch.stack(skip), dim=0) / math.sqrt(len(self.residual_layers))
+        x = self.skip_projection(x)
+
+        if self.output_projection is not None:
+            x = F.silu(x)
+            x = self.output_projection(x)
+
+        return x
```
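A minimal sketch (not part of the diff) of driving `WaveNet` in diffusion mode: `is_diffusion=True` builds the `DiffusionEmbedding` and `Mish` MLP for the timestep, and `condition_channels` gives each `ResidualBlock` a conditioning projection. Channel sizes and shapes below are illustrative assumptions.

```python
# Sketch only: WaveNet as a conditional diffusion denoiser.
import torch

from xinference.thirdparty.fish_speech.fish_speech.models.vqgan.modules.wavenet import WaveNet

net = WaveNet(
    input_channels=128,     # projected up to residual_channels internally
    output_channels=128,
    residual_channels=256,
    residual_layers=8,
    dilation_cycle=4,
    is_diffusion=True,      # builds DiffusionEmbedding + Mish MLP for t
    condition_channels=64,  # each ResidualBlock gains a condition projection
)

x = torch.randn(4, 128, 100)       # noisy input (batch, channels, frames)
t = torch.randint(0, 1000, (4,))   # diffusion timesteps
cond = torch.randn(4, 64, 100)     # conditioning features
eps = net(x, t=t, condition=cond)  # prediction, (4, 128, 100)
print(eps.shape)
```

Note that each block's `diffusion_projection` is only created when `condition_channels` is set, so passing a timestep without conditioning would fail; `ReferenceEncoder` above sidesteps this by subclassing `WaveNet` without either.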
New file xinference/thirdparty/fish_speech/fish_speech/models/vqgan/utils.py (+94 lines):

```diff
@@ -0,0 +1,94 @@
+import matplotlib
+import torch
+from matplotlib import pyplot as plt
+
+matplotlib.use("Agg")
+
+
+def convert_pad_shape(pad_shape):
+    l = pad_shape[::-1]
+    pad_shape = [item for sublist in l for item in sublist]
+    return pad_shape
+
+
+def sequence_mask(length, max_length=None):
+    if max_length is None:
+        max_length = length.max()
+    x = torch.arange(max_length, dtype=length.dtype, device=length.device)
+    return x.unsqueeze(0) < length.unsqueeze(1)
+
+
+def init_weights(m, mean=0.0, std=0.01):
+    classname = m.__class__.__name__
+    if classname.find("Conv") != -1:
+        m.weight.data.normal_(mean, std)
+
+
+def get_padding(kernel_size, dilation=1):
+    return int((kernel_size * dilation - dilation) / 2)
+
+
+def plot_mel(data, titles=None):
+    fig, axes = plt.subplots(len(data), 1, squeeze=False)
+
+    if titles is None:
+        titles = [None for i in range(len(data))]
+
+    plt.tight_layout()
+
+    for i in range(len(data)):
+        mel = data[i]
+
+        if isinstance(mel, torch.Tensor):
+            mel = mel.float().detach().cpu().numpy()
+
+        axes[i][0].imshow(mel, origin="lower")
+        axes[i][0].set_aspect(2.5, adjustable="box")
+        axes[i][0].set_ylim(0, mel.shape[0])
+        axes[i][0].set_title(titles[i], fontsize="medium")
+        axes[i][0].tick_params(labelsize="x-small", left=False, labelleft=False)
+        axes[i][0].set_anchor("W")
+
+    return fig
+
+
+def slice_segments(x, ids_str, segment_size=4):
+    ret = torch.zeros_like(x[:, :, :segment_size])
+    for i in range(x.size(0)):
+        idx_str = ids_str[i]
+        idx_end = idx_str + segment_size
+        ret[i] = x[i, :, idx_str:idx_end]
+
+    return ret
+
+
+def rand_slice_segments(x, x_lengths=None, segment_size=4):
+    b, d, t = x.size()
+    if x_lengths is None:
+        x_lengths = t
+    ids_str_max = torch.clamp(x_lengths - segment_size + 1, min=0)
+    ids_str = (torch.rand([b], device=x.device) * ids_str_max).to(dtype=torch.long)
+    ret = slice_segments(x, ids_str, segment_size)
+    return ret, ids_str
+
+
+@torch.jit.script
+def fused_add_tanh_sigmoid_multiply(in_act, n_channels):
+    n_channels_int = n_channels[0]
+    t_act = torch.tanh(in_act[:, :n_channels_int, :])
+    s_act = torch.sigmoid(in_act[:, n_channels_int:, :])
+    acts = t_act * s_act
+
+    return acts
+
+
+def avg_with_mask(x, mask):
+    assert mask.dtype == torch.float, "Mask should be float"
+
+    if mask.ndim == 2:
+        mask = mask.unsqueeze(1)
+
+    if mask.shape[1] == 1:
+        mask = mask.expand_as(x)
+
+    return (x * mask).sum() / mask.sum()
```
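The masking helpers compose naturally; here is a small sketch (not part of the diff, shapes illustrative) of building a boolean mask from sequence lengths and averaging a loss map over only the valid frames:

```python
# Sketch only: length mask -> masked mean over valid frames.
import torch

from xinference.thirdparty.fish_speech.fish_speech.models.vqgan.utils import (
    avg_with_mask,
    sequence_mask,
)

lengths = torch.tensor([100, 60, 80])
mask = sequence_mask(lengths, max_length=100)  # (3, 100) bool, True inside each length
loss_map = torch.randn(3, 128, 100).abs()      # e.g. per-bin reconstruction error

# avg_with_mask expects a float mask and broadcasts (B, T) -> (B, 1, T) -> (B, C, T)
loss = avg_with_mask(loss_map, mask.float())
print(loss)
```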
New file xinference/thirdparty/fish_speech/fish_speech/scheduler.py (+40 lines):

```diff
@@ -0,0 +1,40 @@
+import math
+
+
+def get_cosine_schedule_with_warmup_lr_lambda(
+    current_step: int,
+    *,
+    num_warmup_steps: int | float,
+    num_training_steps: int,
+    num_cycles: float = 0.5,
+    final_lr_ratio: float = 0.0,
+):
+    if 0 < num_warmup_steps < 1:  # float mode
+        num_warmup_steps = int(num_warmup_steps * num_training_steps)
+
+    if current_step < num_warmup_steps:
+        return float(current_step) / float(max(1, num_warmup_steps))
+
+    progress = float(current_step - num_warmup_steps) / float(
+        max(1, num_training_steps - num_warmup_steps)
+    )
+
+    return max(
+        final_lr_ratio,
+        0.5 * (1.0 + math.cos(math.pi * float(num_cycles) * 2.0 * progress)),
+    )
+
+
+def get_constant_schedule_with_warmup_lr_lambda(
+    current_step: int,
+    *,
+    num_warmup_steps: int | float,
+    num_training_steps: int | None = None,
+):
+    if 0 < num_warmup_steps < 1:  # float mode
+        num_warmup_steps = int(num_warmup_steps * num_training_steps)
+
+    if current_step < num_warmup_steps:
+        return float(current_step) / float(max(1, num_warmup_steps))
+
+    return 1.0
```
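Both functions take `current_step` first with everything else keyword-only, so they are shaped to be bound with `functools.partial` and handed to `torch.optim.lr_scheduler.LambdaLR`. A sketch of that wiring follows (not part of the diff; the optimizer and hyperparameters are illustrative):

```python
# Sketch only: bind the keyword-only schedule args, then let LambdaLR
# call the lambda with the current step.
from functools import partial

import torch

from xinference.thirdparty.fish_speech.fish_speech.scheduler import (
    get_cosine_schedule_with_warmup_lr_lambda,
)

model = torch.nn.Linear(8, 8)
optimizer = torch.optim.AdamW(model.parameters(), lr=1e-4)

lr_lambda = partial(
    get_cosine_schedule_with_warmup_lr_lambda,
    num_warmup_steps=0.1,       # float mode: warm up over 10% of training
    num_training_steps=10_000,
    final_lr_ratio=0.1,         # cosine never decays below 10% of the base lr
)
scheduler = torch.optim.lr_scheduler.LambdaLR(optimizer, lr_lambda)

for step in range(3):
    optimizer.step()
    scheduler.step()
    print(step, scheduler.get_last_lr())
```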