PyPI - nexaai - Versions diffs - 1.0.29__cp310-cp310-macosx_14_0_universal2.whl - Mend

nexaai 1.0.29__cp310-cp310-macosx_14_0_universal2.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (580) hide show

nexaai/binds/metal/py-lib/mlx_audio/tts/models/indextts/__init__.py ADDED Viewed

@@ -0,0 +1,3 @@
+# Re-export Model and ModelArgs from the indextts module as this package's public API.
+from mlx_audio.tts.models.indextts.indextts import Model, ModelArgs
+__all__ = ["Model", "ModelArgs"]

nexaai/binds/metal/py-lib/mlx_audio/tts/models/indextts/attention.py ADDED Viewed

@@ -0,0 +1,180 @@
+import math
+from typing import Optional
+import mlx.core as mx
+import mlx.nn as nn
+class MultiHeadAttention(nn.Module):
+    def __init__(
+        self,
+        n_head: int,
+        n_feat: int,
+        bias=True,
+        head_dim: Optional[int] = None,
+    ):
+        super().__init__()
+        self.n_head = n_head
+        self.head_dim = n_feat // n_head if not head_dim else head_dim
+        self.scale = self.head_dim**-0.5
+        self.linear_q = nn.Linear(n_feat, self.head_dim * self.n_head, bias=bias)
+        self.linear_k = nn.Linear(n_feat, self.head_dim * self.n_head, bias=bias)
+        self.linear_v = nn.Linear(n_feat, self.head_dim * self.n_head, bias=bias)
+        self.linear_out = nn.Linear(self.head_dim * self.n_head, n_feat, bias=bias)
+    def __call__(
+        self,
+        q: mx.array,
+        k: mx.array,
+        v: mx.array,
+        pos_emb: mx.array | None = None,
+        mask: mx.array | None = None,
+        cache=None,
+    ) -> mx.array:
+        q, k, v = self.linear_q(q), self.linear_k(k), self.linear_v(v)
+        batch, q_seq, _ = q.shape
+        _, k_seq, _ = k.shape
+        q = q.reshape(batch, q_seq, self.n_head, self.head_dim).transpose(0, 2, 1, 3)
+        k = k.reshape(batch, k_seq, self.n_head, self.head_dim).transpose(0, 2, 1, 3)
+        v = v.reshape(batch, k_seq, self.n_head, self.head_dim).transpose(0, 2, 1, 3)
+        if cache:
+            k, v = cache.update_and_fetch(k, v)
+        o = mx.fast.scaled_dot_product_attention(q, k, v, scale=self.scale, mask=mask)
+        o = o.transpose(0, 2, 1, 3).reshape(batch, q_seq, -1)
+        return self.linear_out(o)
+class RelPositionMultiHeadAttention(MultiHeadAttention):
+    def __init__(
+        self,
+        n_head: int,
+        n_feat: int,
+        bias: bool = True,
+        head_dim: Optional[int] = None,
+        pos_bias_u: mx.array | None = None,
+        pos_bias_v: mx.array | None = None,
+    ):
+        super().__init__(n_head=n_head, n_feat=n_feat, bias=bias, head_dim=head_dim)
+        self.linear_pos = nn.Linear(n_feat, n_feat, bias=False)
+        if pos_bias_u is None:
+            self._pos_bias_u_init = mx.zeros((self.n_head, self.head_dim))
+        else:
+            self._pos_bias_u_init = pos_bias_u
+        if pos_bias_v is None:
+            self._pos_bias_v_init = mx.zeros((self.n_head, self.head_dim))
+        else:
+            self._pos_bias_v_init = pos_bias_v
+        self.pos_bias_u = self._pos_bias_u_init
+        self.pos_bias_v = self._pos_bias_v_init
+    def __call__(
+        self,
+        q: mx.array,
+        k: mx.array,
+        v: mx.array,
+        pos_emb: mx.array | None = None,
+        mask: mx.array | None = None,
+        cache=None,
+    ) -> mx.array:
+        if pos_emb is None:
+            raise ValueError("pos_emb is necessary!")
+        q, k, v = self.linear_q(q), self.linear_k(k), self.linear_v(v)
+        p = self.linear_pos(pos_emb)  # p stands for position
+        batch, q_seq, _ = q.shape
+        _, k_seq, _ = k.shape
+        _, pos_len, _ = p.shape
+        q = q.reshape(batch, q_seq, self.n_head, self.head_dim)
+        q_u = (q + self.pos_bias_u).transpose(0, 2, 1, 3)
+        q_v = (q + self.pos_bias_v).transpose(0, 2, 1, 3)
+        k = k.reshape(batch, k_seq, self.n_head, self.head_dim).transpose(0, 2, 1, 3)
+        v = v.reshape(batch, k_seq, self.n_head, self.head_dim).transpose(0, 2, 1, 3)
+        p = p.reshape(batch, pos_len, self.n_head, self.head_dim).transpose(0, 2, 1, 3)
+        if cache is not None:
+            k, v = cache.update_and_fetch(k, v)
+        matrix_bd = mx.matmul(q_v, p.swapaxes(-2, -1))
+        matrix_bd = matrix_bd * self.scale
+        if mask is not None:
+            mask = mx.expand_dims(mask, 0)
+            matrix_bd[mask] = -mx.inf
+        o = mx.fast.scaled_dot_product_attention(
+            q_u, k, v, scale=self.scale, mask=matrix_bd
+        )
+        o = o.transpose(0, 2, 1, 3).reshape(batch, q_seq, -1)
+        return self.linear_out(o)
+class RelPositionalEncoding(nn.Module):
+    def __init__(
+        self,
+        d_model: int,
+        max_len: int = 5000,
+        scale_input: bool = True,
+    ):
+        assert d_model % 2 == 0 and max_len > 0
+        super().__init__()
+        self.d_model = d_model
+        self.max_len = max_len
+        self.scale = math.sqrt(self.d_model) if scale_input else 1.0
+        self.calculate_pe()
+    def calculate_pe(self):
+        positions = mx.arange(0, self.max_len, 1, dtype=mx.int32)
+        positions = mx.expand_dims(positions, axis=1).astype(mx.float32)
+        div_term = mx.exp(
+            mx.arange(0, self.d_model, 2, dtype=mx.float32)
+            * -(math.log(10000.0) / self.d_model)
+        )
+        pe = mx.zeros((self.max_len, self.d_model), dtype=mx.float32)
+        pe[:, 0::2] = mx.sin(positions * div_term)
+        pe[:, 1::2] = mx.cos(positions * div_term)
+        self._pe = mx.expand_dims(pe, axis=0).astype(mx.float32)
+        mx.eval(self._pe)
+    def __call__(self, x: mx.array, offset: int = 0) -> tuple[mx.array, mx.array]:
+        input_len = x.shape[1] + offset
+        if input_len > self.max_len:
+            self.max_len = input_len + 1
+            self.calculate_pe()
+        x = x * self.scale
+        pos_emb = self._pe[:, offset : offset + x.shape[1]].astype(x.dtype)
+        return x, pos_emb
+class LearnedPositionEncoding(nn.Module):
+    def __init__(self, seq_len: int, model_dim: int):
+        super().__init__()
+        self.emb = nn.Embedding(seq_len, model_dim)
+    def __call__(self, x: mx.array, offset: int = 0):
+        return self.emb(mx.arange(offset, offset + x.shape[1]))

nexaai/binds/metal/py-lib/mlx_audio/tts/models/indextts/bigvgan.py ADDED Viewed

@@ -0,0 +1,124 @@
+from dataclasses import dataclass
+import mlx.core as mx
+import mlx.nn as nn
+from mlx.utils import tree_flatten
+from mlx_audio.codec.models.bigvgan.bigvgan import BigVGAN, BigVGANConfig
+from mlx_audio.codec.models.bigvgan.conv import WNConv1d
+from mlx_audio.tts.models.indextts.ecapa_tdnn.ecapa_tdnn import ECPATDNN, ECPATDNNArgs
+@dataclass
+class BigVGANConditioningConfig(BigVGANConfig):
+    """BigVGANConfig extended with the fields read by BigVGANConditioning."""
+    # Input channel count of BigVGANConditioning.conv_pre.
+    gpt_dim: int = 1
+    # Width of the speaker embedding: passed as lin_neurons to the ECPATDNN
+    # speaker encoder and used as the input channel count of the conditioning convs.
+    speaker_embedding_dim: int = 1
+    # When True, one 1x1 conditioning conv is built per upsampling stage and its
+    # output is added after that stage; when False only cond_layer is used.
+    cond_d_vector_in_each_upsampling_layer: bool = True
+class BigVGANConditioning(BigVGAN):
+    def __init__(self, config: BigVGANConditioningConfig):
+        super().__init__(config)
+        self.conv_pre = WNConv1d(
+            config.gpt_dim, config.upsample_initial_channel, 7, 1, 3
+        )
+        self.cond_in_each_up_layer = config.cond_d_vector_in_each_upsampling_layer
+        self.speaker_encoder = ECPATDNN(
+            ECPATDNNArgs(config.num_mels, lin_neurons=config.speaker_embedding_dim)
+        )
+        self.cond_layer = nn.Conv1d(
+            config.speaker_embedding_dim, config.upsample_initial_channel, 1
+        )
+        if config.cond_d_vector_in_each_upsampling_layer:
+            self.conds = [
+                nn.Conv1d(
+                    config.speaker_embedding_dim,
+                    config.upsample_initial_channel // (2 ** (i + 1)),
+                    1,
+                )
+                for i in range(len(self.ups))
+            ]
+        else:
+            self.conds = []
+    def __call__(
+        self, x: mx.array, mel_refer: mx.array
+    ) -> mx.array:  # (batch, num_mels, seq)
+        x = x.transpose(0, 2, 1)
+        mel_refer = mel_refer.transpose(0, 2, 1)
+        speaker_embedding = self.speaker_encoder(mel_refer)
+        x = self.conv_pre(x)
+        x += self.cond_layer(speaker_embedding)
+        for step in range(self.num_upsamples):
+            for idx in range(len(self.ups[step])):
+                x = self.ups[step][idx](x)
+            if self.cond_in_each_up_layer:
+                x += self.conds[step](speaker_embedding)
+            xs = self.resblocks[step * self.num_kernels](x)
+            for idx in range(1, self.num_kernels):
+                xs += self.resblocks[step * self.num_kernels + idx](x)
+            x = xs / self.num_kernels
+        x = self.activation_post(x)
+        x = self.conv_post(x)
+        if self.use_tanh_at_final:
+            x = mx.tanh(x)
+        else:
+            x = mx.clip(x, -1.0, 1.0)
+        return x.transpose(0, 2, 1)
+    def sanitize(self, weights: dict[str, mx.array]):
+        new_weights = {}
+        curr_weights = dict(tree_flatten(self.parameters()))
+        for key, value in weights.items():
+            if "num_batches_tracked" in key:
+                continue
+            key = (
+                key.replace("norm.norm", "norm")
+                .replace("conv.conv", "conv")
+                .replace("conv1.conv", "conv1")
+                .replace("conv2.conv", "conv2")
+                .replace("fc.conv", "fc")
+                .replace("asp_bn.norm", "asp_bn")
+            )
+            if (
+                "conv" in key
+                or "cond_layer" in key
+                or "lowpass.filter" in key
+                or "upsample.filter" in key
+                or "conds" in key
+                or "fc" in key
+            ):
+                if value.ndim == 3:
+                    if value.shape != curr_weights[key].shape:
+                        value = value.transpose(0, 2, 1)
+                elif value.ndim == 4:
+                    if value.shape != curr_weights[key].shape:
+                        value = value.transpose(0, 2, 3, 1)
+            if "ups." in key:
+                if value.ndim == 3:
+                    if value.shape != curr_weights[key].shape:
+                        value = value.transpose(1, 2, 0)
+            new_weights[key] = value
+        del curr_weights
+        return new_weights

nexaai/binds/metal/py-lib/mlx_audio/tts/models/indextts/conformer.py ADDED Viewed

@@ -0,0 +1,247 @@
+from dataclasses import dataclass
+from typing import Optional
+import mlx.core as mx
+import mlx.nn as nn
+from mlx_audio.tts.models.indextts.attention import (
+    MultiHeadAttention,
+    RelPositionalEncoding,
+    RelPositionMultiHeadAttention,
+)
+@dataclass
+class ConformerArgs:
+    """Hyper-parameters shared by the Conformer encoder and its sub-modules."""
+    # Starting frequency size used by Conv2dSubsampling to size its output projection.
+    input_size: int = 100
+    # Feature/channel width used by every sub-module.
+    output_size: int = 256
+    # Number of ConformerBlock layers in Conformer.
+    num_blocks: int = 6
+    # Hidden width of each FeedForward module.
+    linear_units: int = 2048
+    # Number of attention heads.
+    attention_heads: int = 4
+    # "rel_pos" selects RelPositionMultiHeadAttention plus RelPositionalEncoding;
+    # any other value selects MultiHeadAttention with no positional encoding.
+    pos_enc_layer_type: str = "rel_pos"
+    # Key into Conv2dSubsampling.CONV_LAYERS / CONV_MASKS.
+    # NOTE(review): the default "conv2d" is not one of the keys defined there
+    # ("conv2d2", "conv2d3", "conv2d4", "conv2d6", "conv2d8") -- confirm.
+    input_layer: str = "conv2d"
+    # Kernel size of the depthwise convolution; Convolution asserts it is odd.
+    cnn_module_kernel: int = 15
+    # Initial max_len of RelPositionalEncoding.
+    pos_emb_max_len: int = 2048
+    # Not read by the code visible in this module.
+    causal_downsampling: bool = False
+    # Bias flag for FeedForward, Convolution and RelPositionMultiHeadAttention.
+    use_bias: bool = True
+    # Passed as scale_input to RelPositionalEncoding.
+    xscaling: bool = True
+    # When True, each block adds a leading feed-forward module and both
+    # feed-forward residuals are scaled by 0.5.
+    macaron_style: bool = False
+    # Optional initial biases forwarded to RelPositionMultiHeadAttention.
+    pos_bias_u: mx.array | None = None
+    pos_bias_v: mx.array | None = None
+    # Not read by the code visible in this module.
+    perceiver_mult: int = 2
+class FeedForward(nn.Module):
+    def __init__(self, dim: int, d_ff: int, use_bias: bool = True):
+        super().__init__()
+        self.w_1 = nn.Linear(dim, d_ff, bias=use_bias)
+        self.activation = nn.SiLU()
+        self.w_2 = nn.Linear(d_ff, dim, bias=use_bias)
+    def __call__(self, x: mx.array) -> mx.array:
+        return self.w_2(self.activation(self.w_1(x)))
+class Convolution(nn.Module):
+    def __init__(self, args: ConformerArgs):
+        assert (args.cnn_module_kernel - 1) % 2 == 0
+        super().__init__()
+        self.pointwise_conv1 = nn.Conv1d(
+            args.output_size,
+            args.output_size * 2,
+            kernel_size=1,
+            stride=1,
+            padding=0,
+            bias=args.use_bias,
+        )
+        self.depthwise_conv = nn.Conv1d(
+            args.output_size,
+            args.output_size,
+            kernel_size=args.cnn_module_kernel,
+            stride=1,
+            padding=(args.cnn_module_kernel - 1) // 2,
+            groups=args.output_size,
+            bias=args.use_bias,
+        )
+        self.norm = nn.LayerNorm(args.output_size)
+        self.activation = nn.SiLU()
+        self.pointwise_conv2 = nn.Conv1d(
+            args.output_size,
+            args.output_size,
+            kernel_size=1,
+            stride=1,
+            padding=0,
+            bias=args.use_bias,
+        )
+    def __call__(self, x: mx.array) -> mx.array:
+        x = self.pointwise_conv1(x)
+        x = nn.glu(x, axis=2)
+        x = self.depthwise_conv(x)
+        x = self.norm(x)
+        x = self.activation(x)
+        x = self.pointwise_conv2(x)
+        return x
+class ConformerBlock(nn.Module):
+    def __init__(self, args: ConformerArgs):
+        super().__init__()
+        self.macaron_style = args.macaron_style
+        self.ff_scale = 0.5 if self.macaron_style else 1
+        if args.macaron_style:
+            self.norm_ff_macaron = nn.LayerNorm(args.output_size)
+            self.feed_forward_macaron = FeedForward(
+                args.output_size, args.linear_units, args.use_bias
+            )
+        self.norm_mha = nn.LayerNorm(args.output_size)
+        self.self_attn = (
+            RelPositionMultiHeadAttention(
+                args.attention_heads,
+                args.output_size,
+                bias=args.use_bias,
+                pos_bias_u=args.pos_bias_u,
+                pos_bias_v=args.pos_bias_v,
+            )
+            if args.pos_enc_layer_type == "rel_pos"
+            else MultiHeadAttention(
+                args.attention_heads,
+                args.output_size,
+                bias=True,
+            )
+        )
+        self.norm_conv = nn.LayerNorm(args.output_size)
+        self.conv_module = Convolution(args)
+        self.norm_ff = nn.LayerNorm(args.output_size)
+        self.feed_forward = FeedForward(
+            args.output_size, args.linear_units, args.use_bias
+        )
+        self.norm_final = nn.LayerNorm(args.output_size)
+    def __call__(
+        self,
+        x: mx.array,
+        pos_emb: mx.array | None = None,
+        mask: mx.array | None = None,
+        cache=None,
+    ) -> mx.array:
+        if self.macaron_style:
+            x += self.ff_scale * self.feed_forward_macaron(self.norm_ff_macaron(x))
+        x_norm = self.norm_mha(x)
+        x += self.self_attn(
+            x_norm, x_norm, x_norm, mask=mask, pos_emb=pos_emb, cache=cache
+        )
+        x += self.conv_module(self.norm_conv(x))
+        x += self.ff_scale * self.feed_forward(self.norm_ff(x))
+        return self.norm_final(x)
+class Conv2dSubsampling(nn.Module):
+    CONV_LAYERS = {
+        "conv2d2": [(3, 2)],
+        "conv2d3": [(5, 3)],
+        "conv2d4": [(3, 2), (3, 2)],
+        "conv2d6": [(3, 2), (5, 3)],
+        "conv2d8": [(3, 2), (3, 2), (3, 2)],
+    }
+    CONV_MASKS = {
+        "conv2d2": [slice(2, None, 2)],
+        "conv2d3": [slice(None, -2, 3)],
+        "conv2d4": [slice(2, None, 2), slice(2, None, 2)],
+        "conv2d6": [slice(2, None, 2), slice(4, None, 3)],
+        "conv2d8": [slice(2, None, 2), slice(2, None, 2), slice(2, None, 2)],
+    }
+    def __init__(self, args: ConformerArgs):
+        super().__init__()
+        conv_layers = self.CONV_LAYERS[args.input_layer]
+        self.mask_patterns = self.CONV_MASKS[args.input_layer]
+        self.conv = []
+        self.subsampling_rate = 0
+        in_channels = 1
+        out_freq = args.input_size
+        for kernel_size, stride in conv_layers:
+            self.conv.append(
+                nn.Conv2d(
+                    in_channels,
+                    args.output_size,
+                    kernel_size=kernel_size,
+                    stride=stride,
+                )
+            )
+            self.conv.append(nn.ReLU())
+            in_channels = args.output_size
+            out_freq = (out_freq - kernel_size + stride) // stride
+            self.subsampling_rate *= stride
+        self.out = [nn.Linear(args.output_size * out_freq, args.output_size)]
+    def __call__(self, x: mx.array, mask: Optional[mx.array] = None):
+        x = x[:, :, :, None]
+        for layer in self.conv:
+            x = layer(x)
+        x = x.swapaxes(2, 3).reshape(*x.shape[:2], -1)
+        for layer in self.out:
+            x = layer(x)
+        if mask is not None:
+            for pattern in self.mask_patterns:
+                mask = mask[pattern]
+        return x, mask
+class Conformer(nn.Module):
+    def __init__(self, args: ConformerArgs):
+        super().__init__()
+        if args.pos_enc_layer_type == "rel_pos":
+            self.pos_enc = RelPositionalEncoding(
+                d_model=args.output_size,
+                max_len=args.pos_emb_max_len,
+                scale_input=args.xscaling,
+            )
+        else:
+            self.pos_enc = None
+        self.embed = Conv2dSubsampling(args)
+        self.encoders = [ConformerBlock(args) for _ in range(args.num_blocks)]
+        self.after_norm = nn.LayerNorm(args.output_size, eps=1e-5)
+    def __call__(
+        self, x: mx.array, mask: Optional[mx.array] = None, cache=None
+    ) -> mx.array:
+        x, mask = self.embed(x, mask)
+        if cache is None:
+            cache = [None] * len(self.encoders)
+        pos_emb = None
+        if self.pos_enc is not None:
+            x, pos_emb = self.pos_enc(
+                x,
+                offset=cache[0].offset if cache[0] is not None else 0,  # type: ignore
+            )
+        for layer, c in zip(self.encoders, cache):
+            x = layer(x, pos_emb=pos_emb, cache=c, mask=mask)
+        x = self.after_norm(x)
+        return x

nexaai/binds/metal/py-lib/mlx_audio/tts/models/indextts/ecapa_tdnn/__init__.py ADDED Viewed

File without changes

nexaai/binds/metal/py-lib/mlx_audio/tts/models/indextts/ecapa_tdnn/asp.py ADDED Viewed

@@ -0,0 +1,59 @@
+from typing import Optional
+import mlx.core as mx
+import mlx.nn as nn
+from mlx_audio.tts.models.indextts.ecapa_tdnn.tdnn import TDNN
+class AttentiveStatisticsPooling(nn.Module):
+    def __init__(
+        self, channels: int, attention_channels: int, global_context: bool = True
+    ):
+        super().__init__()
+        self.eps = 1e-12
+        self.global_context = global_context
+        self.tdnn = TDNN(
+            channels * 3 if global_context else channels, attention_channels, 1
+        )
+        self.tanh = nn.Tanh()
+        self.conv = nn.Conv1d(attention_channels, channels, 1)
+    def __call__(self, x: mx.array, mask: Optional[mx.array] = None):  # NLC
+        N, L, C = x.shape
+        if mask is not None:
+            mask = mask[:, :, None]
+        else:
+            mask = mx.ones((N, L, 1))
+        if self.global_context:
+            global_mean = (x * mask).sum(1, keepdims=True) / (
+                mask.sum(1, keepdims=True) + self.eps
+            )
+            global_std = mx.sqrt(
+                ((x - global_mean) ** 2 * mask).sum(1, keepdims=True)
+                / (mask.sum(1, keepdims=True) + self.eps)
+                + self.eps
+            )
+            attn = mx.concat(
+                [
+                    x,
+                    mx.repeat(global_mean, L, axis=1),
+                    mx.repeat(global_std, L, axis=1),
+                ],
+                axis=2,
+            )
+        else:
+            attn = x
+        attn = self.conv(self.tanh(self.tdnn(attn)))
+        attn = mx.softmax(mx.where(mask == 0, -mx.inf, attn), axis=1)
+        mean = (x * attn).sum(1, keepdims=True)
+        std = mx.sqrt(((x - mean) ** 2 * attn).sum(1, keepdims=True) + self.eps)
+        return mx.concat([mean, std], axis=2)

nexaai/binds/metal/py-lib/mlx_audio/tts/models/indextts/ecapa_tdnn/ecapa_tdnn.py ADDED Viewed

@@ -0,0 +1,91 @@
+from dataclasses import dataclass, field
+from typing import Optional
+import mlx.core as mx
+import mlx.nn as nn
+from mlx_audio.tts.models.indextts.ecapa_tdnn.asp import AttentiveStatisticsPooling
+from mlx_audio.tts.models.indextts.ecapa_tdnn.se_res2net import SeRes2Net
+from mlx_audio.tts.models.indextts.ecapa_tdnn.tdnn import TDNN
+@dataclass
+class ECPATDNNArgs:
+    """Configuration of the ECPATDNN speaker encoder."""
+    # Input channel count of the first TDNN block.
+    input_size: int
+    # Output channel count of the final 1x1 convolution (embedding width).
+    lin_neurons: int = 192
+    # Output channels per stage: first TDNN, the SeRes2Net blocks, then the
+    # aggregation TDNN (last entry).
+    channels: list[int] = field(default_factory=lambda: [512, 512, 512, 512, 1536])
+    # Kernel size per stage; must have the same length as channels.
+    kernel_sizes: list[int] = field(default_factory=lambda: [5, 3, 3, 3, 1])
+    # Dilation per stage; must have the same length as channels.
+    dilations: list[int] = field(default_factory=lambda: [1, 2, 3, 4, 1])
+    # Hidden width of the attention network in AttentiveStatisticsPooling.
+    attention_channels: int = 128
+    # Passed as scale to every SeRes2Net block.
+    res2net_scale: int = 8
+    # Passed as attention_channels to every SeRes2Net block.
+    se_channels: int = 128
+    # Passed as global_context to AttentiveStatisticsPooling.
+    global_context: bool = True
+    # Groups per stage, indexed like channels; its length is not validated.
+    groups: list[int] = field(default_factory=lambda: [1, 1, 1, 1, 1])
+class ECPATDNN(nn.Module):
+    def __init__(self, args: ECPATDNNArgs):
+        super().__init__()
+        assert len(args.channels) == len(args.kernel_sizes) and len(
+            args.channels
+        ) == len(args.dilations)
+        self.args = args
+        self.blocks = [
+            TDNN(
+                args.input_size,
+                args.channels[0],
+                args.kernel_sizes[0],
+                dilation=args.dilations[0],
+                groups=args.groups[0],
+            )
+        ] + [
+            SeRes2Net(
+                args.channels[i - 1],
+                args.channels[i],
+                scale=args.res2net_scale,
+                attention_channels=args.se_channels,
+                kernel_size=args.kernel_sizes[i],
+                dilation=args.dilations[i],
+                groups=args.groups[i],
+            )
+            for i in range(1, len(args.channels) - 1)
+        ]
+        self.mfa = TDNN(
+            args.channels[-2] * (len(args.channels) - 2),
+            args.channels[-1],
+            args.kernel_sizes[-1],
+            dilation=args.dilations[-1],
+            groups=args.groups[-1],
+        )
+        self.asp = AttentiveStatisticsPooling(
+            args.channels[-1],
+            attention_channels=args.attention_channels,
+            global_context=args.global_context,
+        )
+        self.asp_bn = nn.BatchNorm(args.channels[-1] * 2)
+        self.fc = nn.Conv1d(
+            in_channels=args.channels[-1] * 2,
+            out_channels=args.lin_neurons,
+            kernel_size=1,
+        )
+    def __call__(self, x: mx.array, mask: Optional[mx.array] = None):  #
+        xl = []
+        for layer in self.blocks:
+            if isinstance(layer, SeRes2Net):
+                x = layer(x, mask=mask)
+                xl.append(mx.array(x))
+            else:
+                x = layer(x)
+        x = mx.concat(xl, axis=2)
+        x = self.mfa(x)
+        x = self.asp(x, mask=mask)
+        x = self.asp_bn(x)
+        x = self.fc(x)
+        return x