PyPI - nexaai - Versions diffs - 1.0.29__cp310-cp310-macosx_14_0_universal2.whl - Mend

nexaai 1.0.29__cp310-cp310-macosx_14_0_universal2.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (580) hide show

nexaai/binds/metal/py-lib/mlx_audio/tts/models/spark/modules/speaker/ecapa_tdnn.py ADDED Viewed

@@ -0,0 +1,283 @@
+# Copyright (c) 2021 Zhengyang Chen (chenzhengyang117@gmail.com)
+#               2022 Hongji Wang (jijijiang77@gmail.com)
+#               2023 Bing Han (hanbing97@sjtu.edu.cn)
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#   http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+""" This implementation is adapted from github repo:
+    https://github.com/lawlict/ECAPA-TDNN.
+"""
+import mlx.core as mx
+import mlx.nn as nn
+from mlx_audio.tts.models.spark.modules.speaker import pooling_layers as pooling_layers
+class Res2Conv1dReluBn(nn.Module):
+    """Res2Net-style stack of Conv1d -> ReLU -> BatchNorm branches.
+    The channel axis (axis 1 of the input) is split into ``scale`` equal
+    groups of ``channels // scale`` channels. Each group except the last is
+    run through its own branch, and from the second branch on the previous
+    branch's output is added to the group first. When ``scale != 1`` the last
+    group is appended unchanged.
+    in_channels == out_channels == channels
+    """
+    def __init__(
+        self,
+        channels,
+        kernel_size=1,
+        stride=1,
+        padding=0,
+        dilation=1,
+        bias=True,
+        scale=4,
+    ):
+        super().__init__()
+        assert channels % scale == 0, "{} % {} != 0".format(channels, scale)
+        self.scale = scale
+        self.width = channels // scale
+        self.channels = channels
+        # One branch per group, minus the pass-through group when scale > 1.
+        self.nums = 1 if scale == 1 else scale - 1
+        self.convs = [
+            nn.Conv1d(
+                self.width,
+                self.width,
+                kernel_size,
+                stride,
+                padding,
+                dilation,
+                bias=bias,
+            )
+            for _ in range(self.nums)
+        ]
+        self.bns = [nn.BatchNorm(self.width) for _ in range(self.nums)]
+    def __call__(self, x):
+        """Apply the branches to ``x`` and concatenate the results on axis 1."""
+        groups = mx.split(x, self.scale, axis=1)
+        pieces = []
+        prev = None
+        # zip stops after the last branch, leaving the final group untouched.
+        for group, conv, bn in zip(groups, self.convs, self.bns):
+            branch_in = group if prev is None else prev + group
+            # Order: conv -> relu -> bn, with axes 1 and 2 swapped around the layers.
+            activated = nn.relu(conv(branch_in.transpose(0, 2, 1)))
+            prev = bn(activated).transpose(0, 2, 1)
+            pieces.append(prev)
+        if self.scale != 1:
+            pieces.append(groups[self.nums])
+        return mx.concatenate(pieces, axis=1)
+""" Conv1d + BatchNorm1d + ReLU
+"""
+class Conv1dReluBn(nn.Module):
+    """Conv1d followed by ReLU and then BatchNorm (in that order).
+    ``__call__`` swaps axes 1 and 2 before the layers and swaps them back
+    afterwards, so the caller's layout is preserved. The constructor
+    arguments are forwarded to ``nn.Conv1d`` unchanged.
+    """
+    def __init__(
+        self,
+        in_channels,
+        out_channels,
+        kernel_size=1,
+        stride=1,
+        padding=0,
+        dilation=1,
+        bias=True,
+    ):
+        super().__init__()
+        self.conv = nn.Conv1d(
+            in_channels, out_channels, kernel_size, stride, padding, dilation, bias=bias
+        )
+        self.bn = nn.BatchNorm(out_channels)
+    def __call__(self, x):
+        # ReLU is element-wise, so a single swap in and a single swap out
+        # gives the same result as swapping around each layer separately.
+        hidden = self.conv(x.swapaxes(1, 2))
+        hidden = self.bn(nn.relu(hidden))
+        return hidden.swapaxes(1, 2)
+""" The SE connection of 1D case.
+"""
+class SE_Connect(nn.Module):
+    """Squeeze-and-excitation gate for 3-D input.
+    The input is averaged over axis 2, passed through a two-layer bottleneck
+    (``channels -> se_bottleneck_dim -> channels``) ending in a sigmoid, and
+    the resulting per-channel weights rescale the input.
+    """
+    def __init__(self, channels, se_bottleneck_dim=128):
+        super().__init__()
+        self.linear1 = nn.Linear(channels, se_bottleneck_dim)
+        self.linear2 = nn.Linear(se_bottleneck_dim, channels)
+    def __call__(self, x):
+        squeezed = mx.mean(x, axis=2)
+        hidden = nn.relu(self.linear1(squeezed))
+        gate = mx.sigmoid(self.linear2(hidden))
+        # Add a trailing axis so the gate broadcasts along axis 2 of x.
+        return x * gate[:, :, None]
+""" SE-Res2Block of the ECAPA-TDNN architecture.
+"""
+class SE_Res2Block(nn.Module):
+    """Residual block: 1x1 conv, Res2 conv stack, 1x1 conv, then SE gating.
+    ``kernel_size``, ``stride``, ``padding``, ``dilation`` and ``scale``
+    configure the inner ``Res2Conv1dReluBn``; the surrounding convolutions
+    always use kernel size 1. The block input is added to the output.
+    """
+    def __init__(self, channels, kernel_size, stride, padding, dilation, scale):
+        super().__init__()
+        pointwise = dict(kernel_size=1, stride=1, padding=0)
+        self.se_res2block = [
+            Conv1dReluBn(channels, channels, **pointwise),
+            Res2Conv1dReluBn(
+                channels, kernel_size, stride, padding, dilation, scale=scale
+            ),
+            Conv1dReluBn(channels, channels, **pointwise),
+            SE_Connect(channels),
+        ]
+    def __call__(self, x):
+        out = x
+        for stage in self.se_res2block:
+            out = stage(out)
+        # Residual connection around the whole stack.
+        return out + x
+class ECAPA_TDNN(nn.Module):
+    """ECAPA-TDNN speaker encoder.
+    Args:
+        channels: Width of the first convolution and of the SE-Res2 blocks.
+        feat_dim: Size of the feature axis of the input.
+        embed_dim: Size of the returned embedding.
+        pooling_func: Name of a class in ``pooling_layers`` used for pooling.
+        global_context_att: Forwarded to the pooling layer.
+        emb_bn: When true, batch-normalize the embedding after the final
+            linear layer.
+    """
+    def __init__(
+        self,
+        channels=512,
+        feat_dim=80,
+        embed_dim=192,
+        pooling_func="ASTP",
+        global_context_att=False,
+        emb_bn=False,
+    ):
+        super().__init__()
+        def res2_block(dilation):
+            # Padding equals dilation in every block.
+            return SE_Res2Block(
+                channels,
+                kernel_size=3,
+                stride=1,
+                padding=dilation,
+                dilation=dilation,
+                scale=8,
+            )
+        self.layer1 = Conv1dReluBn(feat_dim, channels, kernel_size=5, padding=2)
+        self.layer2 = res2_block(2)
+        self.layer3 = res2_block(3)
+        self.layer4 = res2_block(4)
+        # The outputs of layer2..layer4 are concatenated on the channel axis.
+        cat_channels = channels * 3
+        out_channels = 512 * 3
+        self.conv = nn.Conv1d(cat_channels, out_channels, kernel_size=1)
+        pool_cls = getattr(pooling_layers, pooling_func)
+        self.pool = pool_cls(in_dim=out_channels, global_context_att=global_context_att)
+        self.pool_out_dim = self.pool.get_out_dim()
+        self.bn = nn.BatchNorm(self.pool_out_dim)
+        self.linear = nn.Linear(self.pool_out_dim, embed_dim)
+        self.emb_bn = emb_bn
+        # better in SSL for SV
+        self.bn2 = nn.BatchNorm(embed_dim) if emb_bn else nn.Identity()
+    def __call__(self, x, return_latent=False):
+        """Embed ``x``; with ``return_latent`` also return the pre-pooling features.
+        Returns the embedding, or ``(embedding, latent)`` when
+        ``return_latent`` is true, where ``latent`` is the ReLU output that
+        feeds the pooling layer.
+        """
+        feats = self.layer1(x.transpose(0, 2, 1))  # (B,T,F) -> (B,F,T)
+        stage_outputs = []
+        for layer in (self.layer2, self.layer3, self.layer4):
+            feats = layer(feats)
+            stage_outputs.append(feats)
+        merged = mx.concatenate(stage_outputs, axis=1)
+        latent = nn.relu(self.conv(merged.transpose(0, 2, 1)).transpose(0, 2, 1))
+        emb = self.linear(self.bn(self.pool(latent)))
+        if self.emb_bn:
+            emb = self.bn2(emb)
+        return (emb, latent) if return_latent else emb
+def ECAPA_TDNN_c1024(feat_dim, embed_dim, pooling_func="ASTP", emb_bn=False):
+    """Build an ``ECAPA_TDNN`` with 1024 channels; ``global_context_att`` keeps its default."""
+    config = dict(
+        channels=1024,
+        feat_dim=feat_dim,
+        embed_dim=embed_dim,
+        pooling_func=pooling_func,
+        emb_bn=emb_bn,
+    )
+    return ECAPA_TDNN(**config)
+def ECAPA_TDNN_GLOB_c1024(feat_dim, embed_dim, pooling_func="ASTP", emb_bn=False):
+    """Build an ``ECAPA_TDNN`` with 1024 channels and ``global_context_att=True``."""
+    config = dict(
+        channels=1024,
+        feat_dim=feat_dim,
+        embed_dim=embed_dim,
+        pooling_func=pooling_func,
+        global_context_att=True,
+        emb_bn=emb_bn,
+    )
+    return ECAPA_TDNN(**config)
+def ECAPA_TDNN_c512(feat_dim, embed_dim, pooling_func="ASTP", emb_bn=False):
+    """Build an ``ECAPA_TDNN`` with 512 channels; ``global_context_att`` keeps its default."""
+    config = dict(
+        channels=512,
+        feat_dim=feat_dim,
+        embed_dim=embed_dim,
+        pooling_func=pooling_func,
+        emb_bn=emb_bn,
+    )
+    return ECAPA_TDNN(**config)
+def ECAPA_TDNN_GLOB_c512(feat_dim, embed_dim, pooling_func="ASTP", emb_bn=False):
+    """Build an ``ECAPA_TDNN`` with 512 channels and ``global_context_att=True``."""
+    config = dict(
+        channels=512,
+        feat_dim=feat_dim,
+        embed_dim=embed_dim,
+        pooling_func=pooling_func,
+        global_context_att=True,
+        emb_bn=emb_bn,
+    )
+    return ECAPA_TDNN(**config)
+if __name__ == "__main__":
+    # Smoke test: push a zero-filled batch through the 512-channel
+    # global-context model, then print the output shapes and parameter count.
+    from mlx.utils import tree_flatten
+    model = ECAPA_TDNN_GLOB_c512(feat_dim=100, embed_dim=256, pooling_func="ASTP")
+    model.eval()
+    dummy = mx.zeros(shape=(1, 200, 100))
+    embedding, latent = model(dummy, True)
+    print(embedding.shape)
+    print(latent.shape)
+    # Count parameters for MLX model
+    flat_weights = dict(tree_flatten(model.parameters()))
+    num_params = sum(arr.size for arr in flat_weights.values())
+    print("{} M".format(num_params / 1e6))
+    # from thop import profile
+    # x_np = torch.randn(1, 200, 80)
+    # flops, params = profile(model, inputs=(x_np, ))
+    # print("FLOPs: {} G, Params: {} M".format(flops / 1e9, params / 1e6))

nexaai/binds/metal/py-lib/mlx_audio/tts/models/spark/modules/speaker/perceiver_encoder.py ADDED Viewed

@@ -0,0 +1,326 @@
+# Copyright (c) 2025 SparkAudio
+#               2025 Xinsheng Wang (w.xinshawn@gmail.com)
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#   http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+# Adapted from https://github.com/lucidrains/naturalspeech2-pytorch/blob/659bec7f7543e7747e809e950cc2f84242fbeec7/naturalspeech2_pytorch/naturalspeech2_pytorch.py#L532
+from collections import namedtuple
+from functools import wraps
+import mlx.core as mx
+import mlx.nn as nn
+from einops import rearrange, repeat
+def exists(val):
+    """Return True unless ``val`` is ``None`` (falsy values such as 0 or "" still exist)."""
+    is_missing = val is None
+    return not is_missing
+def once(fn):
+    """Wrap ``fn`` so that only the first call is forwarded to it.
+    The first call passes all positional and keyword arguments through and
+    returns ``fn``'s result. Every later call does nothing and returns
+    ``None``.
+    """
+    called = False
+    @wraps(fn)
+    def inner(*args, **kwargs):
+        nonlocal called
+        if called:
+            return None
+        # Flip the flag before calling, so later calls stay suppressed even
+        # if ``fn`` raises.
+        called = True
+        return fn(*args, **kwargs)
+    return inner
+# ``print`` that emits output on its first call only.
+print_once = once(print)
+# main class
+class Attend(nn.Module):
+    """Scaled dot-product attention with optional key-padding and causal masks.
+    Args:
+        dropout: Dropout probability applied to the attention weights while
+            the module is in training mode.
+        causal: When true, scores above the diagonal are masked out.
+    """
+    def __init__(self, dropout=0.0, causal=False):
+        super().__init__()
+        self.dropout = dropout
+        self.attn_dropout = nn.Dropout(dropout)
+        self.causal = causal
+        # Cached causal mask. The name is private on purpose: MLX treats
+        # public mx.array attributes of a Module as parameters, and a cached
+        # boolean mask must not show up in parameters() or saved weights.
+        self._causal_mask = None
+    def get_mask(self, n, device=None):
+        """Return an ``(n, n)`` boolean mask that is True strictly above the diagonal.
+        The largest mask built so far is cached and sliced for smaller ``n``.
+        ``device`` is accepted for call compatibility and is not used.
+        """
+        cached = self._causal_mask
+        if exists(cached) and cached.shape[-1] >= n:
+            return cached[:n, :n]
+        mask = mx.triu(mx.ones((n, n), dtype=mx.bool_), 1)
+        self._causal_mask = mask
+        return mask
+    def __call__(self, q, k, v, mask=None):
+        """
+        einstein notation
+        b - batch
+        h - heads
+        n, i, j - sequence length (base sequence length, source, target)
+        d - feature dimension
+        ``q`` is ``[b h i d]``. ``k`` and ``v`` are ``[b h j d]``, or
+        ``[b j d]`` when shared across heads. ``mask`` is an optional
+        ``[b j]`` array that is truthy where a key may be attended to.
+        Returns ``[b h i d_v]``.
+        """
+        n = q.shape[-2]
+        scale = q.shape[-1] ** -0.5
+        # Keys/values shared across heads get a singleton head axis, and
+        # matmul broadcasts it. Broadcasting to the query shape (as before)
+        # only worked when the key length equalled the query length.
+        if k.ndim == 3:
+            k = mx.expand_dims(k, axis=1)
+        if v.ndim == 3:
+            v = mx.expand_dims(v, axis=1)
+        # similarity: [b h i d] x [b h d j] -> [b h i j]
+        sim = mx.matmul(q, mx.swapaxes(k, -1, -2)) * scale
+        # key padding mask
+        if exists(mask):
+            mask = mx.reshape(mask, (mask.shape[0], 1, 1, mask.shape[1]))
+            sim = mx.where(mask, sim, -1e9)
+        # causal mask
+        if self.causal:
+            causal_mask = self.get_mask(n)
+            sim = mx.where(causal_mask, -1e9, sim)
+        # attention
+        attn = mx.softmax(sim, axis=-1)
+        if self.dropout > 0 and self.training:
+            attn = self.attn_dropout(attn)
+        # aggregate values
+        return mx.matmul(attn, v)
+def Sequential(*mods):
+    """Build an ``nn.Sequential`` from ``mods``, skipping any entry that is ``None``."""
+    return nn.Sequential(*filter(exists, mods))
+def default(val, d):
+    """Return ``val`` unless it is ``None``; otherwise fall back to ``d``.
+    A callable ``d`` is invoked (with no arguments) to produce the fallback.
+    """
+    if not exists(val):
+        return d() if callable(d) else d
+    return val
+class RMSNorm(nn.Module):
+    """L2-normalize the last axis and rescale by ``sqrt(dim)``.
+    Args:
+        dim: Size of the last axis of the input.
+        scale: When true, multiply by a per-feature ``gamma`` initialized to ones.
+        dim_cond: If given, a linear layer maps a conditioning vector of this
+            size to an extra multiplicative and additive term.
+    """
+    def __init__(self, dim, scale=True, dim_cond=None):
+        super().__init__()
+        self.cond = exists(dim_cond)
+        self.to_gamma_beta = nn.Linear(dim_cond, dim * 2) if self.cond else None
+        self.scale = dim**0.5
+        self.gamma = mx.ones((dim,)) if scale else None
+    def __call__(self, x, cond=None):
+        """Normalize ``x``; ``cond`` is required when ``dim_cond`` was given."""
+        # L2 norm along the last axis, floored at 1e-12 to avoid dividing by zero.
+        sum_sq = mx.sum(mx.power(mx.abs(x), 2.0), axis=-1, keepdims=True)
+        l2_norm = mx.power(sum_sq, 1 / 2.0)
+        unit = x / mx.maximum(l2_norm, 1e-12)
+        out = unit * self.scale * default(self.gamma, 1)
+        if not self.cond:
+            return out
+        assert exists(cond)
+        gamma, beta = mx.split(self.to_gamma_beta(cond), 2, axis=-1)
+        # Insert axis 1 so the conditioning terms broadcast over it.
+        return out * mx.expand_dims(gamma, axis=1) + mx.expand_dims(beta, axis=1)
+class CausalConv1d(nn.Module):
+    """1-D convolution that is left-padded so outputs never depend on later steps.
+    ``__call__`` takes and returns arrays with the time axis last; the
+    ``b n d -> b d n`` transpose that precedes this layer in ``FeedForward``
+    suggests a (batch, channels, time) layout. Only ``stride == 1`` is
+    supported.
+    Args:
+        in_channels: Number of input channels.
+        out_channels: Number of output channels.
+        kernel_size: Convolution kernel width.
+        dilation: Convolution dilation.
+        stride: Must be 1.
+    """
+    def __init__(self, in_channels, out_channels, kernel_size, dilation=1, stride=1):
+        super().__init__()
+        self.conv = nn.Conv1d(
+            in_channels, out_channels, kernel_size, stride=stride, dilation=dilation
+        )
+        self.kernel_size = kernel_size
+        self.dilation = dilation
+        self.stride = stride
+        assert stride == 1
+        # Padding that makes the output the same length as the input.
+        self.causal_padding = dilation * (kernel_size - 1)
+    def __call__(self, x):
+        # Pad only on the left of the last axis.
+        causal_padded_x = mx.pad(x, [(0, 0), (0, 0), (self.causal_padding, 0)])
+        # MLX nn.Conv1d is channels-last, so move channels to the last axis
+        # for the convolution and restore the caller's layout afterwards.
+        return self.conv(causal_padded_x.swapaxes(1, 2)).swapaxes(1, 2)
+class GEGLU(nn.Module):
+    """Gated GELU: halve the last axis and gate the first half with GELU of the second."""
+    def __call__(self, x):
+        value, gate = mx.split(x, 2, axis=-1)
+        return nn.gelu(gate) * value
+def FeedForward(dim, mult=4, causal_conv=False):
+    """Return the feed-forward layers as a flat list of callables.
+    The list is meant to be applied in order: a linear projection to
+    ``2 * dim_inner``, a GEGLU gate, optionally a causal convolution wrapped
+    in the transposes it needs, and a linear projection back to ``dim``.
+    Args:
+        dim: Input and output feature size.
+        mult: Expansion factor; the inner size is ``int(dim * mult * 2 / 3)``.
+        causal_conv: When true, insert a kernel-size-3 ``CausalConv1d``
+            between the gate and the output projection.
+    """
+    dim_inner = int(dim * mult * 2 / 3)
+    layers = [nn.Linear(dim, dim_inner * 2), GEGLU()]
+    if causal_conv:
+        # Added element by element: a nested list here would be called like
+        # a layer by code that iterates over the result.
+        layers += [
+            lambda x: mx.transpose(x, (0, 2, 1)),  # b n d -> b d n
+            CausalConv1d(dim_inner, dim_inner, 3),
+            lambda x: mx.transpose(x, (0, 2, 1)),  # b d n -> b n d
+        ]
+    layers.append(nn.Linear(dim_inner, dim))
+    return layers
+class Attention(nn.Module):
+    """Multi-head attention over ``x``, optionally attending to a separate context.
+    Args:
+        dim: Feature size of the queries and of the output.
+        dim_context: Feature size of the context; defaults to ``dim``.
+        causal: Forwarded to ``Attend``.
+        dim_head: Size of each attention head.
+        heads: Number of attention heads.
+        dropout: Forwarded to ``Attend``.
+        cross_attn_include_queries: When a context is given, prepend ``x``
+            to it so the queries can also attend to themselves.
+    """
+    def __init__(
+        self,
+        dim,
+        *,
+        dim_context=None,
+        causal=False,
+        dim_head=64,
+        heads=8,
+        dropout=0.0,
+        cross_attn_include_queries=False,
+    ):
+        super().__init__()
+        self.scale = dim_head**-0.5
+        self.heads = heads
+        self.cross_attn_include_queries = cross_attn_include_queries
+        dim_inner = dim_head * heads
+        dim_context = default(dim_context, dim)
+        self.attend = Attend(causal=causal, dropout=dropout)
+        self.to_q = nn.Linear(dim, dim_inner, bias=False)
+        self.to_kv = nn.Linear(dim_context, dim_inner * 2, bias=False)
+        self.to_out = nn.Linear(dim_inner, dim, bias=False)
+    def __call__(self, x, context=None, mask=None):
+        """Attend from ``x`` to ``context`` (or to ``x`` itself when no context is given).
+        ``mask`` is an optional ``[b j]`` key mask that is truthy where a key
+        may be attended to. With ``cross_attn_include_queries`` it may cover
+        either the context only or the full ``queries + context`` key axis.
+        """
+        h, has_context = self.heads, exists(context)
+        context = default(context, x)
+        if has_context and self.cross_attn_include_queries:
+            context_len = context.shape[-2]
+            context = mx.concatenate([x, context], axis=-2)
+            if exists(mask) and mask.shape[-1] == context_len:
+                # The keys now start with the queries, so extend a
+                # context-only mask with always-visible entries for them.
+                query_mask = mx.ones(x.shape[:2], dtype=mask.dtype)
+                mask = mx.concatenate([query_mask, mask], axis=-1)
+        q = self.to_q(x)
+        k, v = mx.split(self.to_kv(context), 2, axis=-1)
+        def split_heads(t):
+            # b n (h d) -> b h n d
+            t = mx.reshape(t, (t.shape[0], t.shape[1], h, -1))
+            return mx.transpose(t, (0, 2, 1, 3))
+        q, k, v = split_heads(q), split_heads(k), split_heads(v)
+        out = self.attend(q, k, v, mask=mask)
+        out = mx.transpose(out, (0, 2, 1, 3))  # b h n d -> b n h d
+        out = mx.reshape(out, (out.shape[0], out.shape[1], -1))  # b n h d -> b n (h d)
+        return self.to_out(out)
+class PerceiverResampler(nn.Module):
+    """Map an input sequence onto ``num_latents`` learned latent vectors.
+    Each of the ``depth`` layers cross-attends from the latents to the
+    projected input (with the latents included among the keys) and then
+    applies a feed-forward stack, both with residual connections. The result
+    is passed through ``RMSNorm``.
+    Args:
+        dim: Feature size of the latents and of the output.
+        depth: Number of attention + feed-forward layers.
+        dim_context: Feature size of the input; defaults to ``dim``.
+        num_latents: Number of latent vectors.
+        dim_head: Size of each attention head.
+        heads: Number of attention heads.
+        ff_mult: Expansion factor passed to ``FeedForward``.
+    """
+    def __init__(
+        self,
+        *,
+        dim,
+        depth=2,
+        dim_context=None,
+        num_latents=32,
+        dim_head=64,
+        heads=8,
+        ff_mult=4,
+    ):
+        super().__init__()
+        dim_context = default(dim_context, dim)
+        # Project the input only when its feature size differs from dim.
+        if dim_context != dim:
+            self.proj_context = nn.Linear(dim_context, dim)
+        else:
+            self.proj_context = nn.Identity()
+        self.latents = mx.random.normal(shape=(num_latents, dim), scale=0.02)
+        self.layers = [
+            [
+                Attention(
+                    dim=dim,
+                    dim_head=dim_head,
+                    heads=heads,
+                    cross_attn_include_queries=True,
+                ),
+                FeedForward(dim=dim, mult=ff_mult),
+            ]
+            for _ in range(depth)
+        ]
+        self.norm = RMSNorm(dim)
+    def __call__(self, x, mask=None):
+        """Resample ``x`` into a ``(batch, num_latents, dim)`` array."""
+        context = self.proj_context(x)
+        # Share the same learned latents across the batch.
+        latents = mx.broadcast_to(self.latents, (x.shape[0],) + self.latents.shape)
+        for attn, ff in self.layers:
+            latents = attn(latents, context, mask=mask) + latents
+            hidden = latents
+            for layer in ff:
+                hidden = layer(hidden)
+            latents = latents + hidden
+        return self.norm(latents)
+if __name__ == "__main__":
+    # Smoke test: resample a random batch and report the output shape and
+    # parameter count.
+    from mlx.utils import tree_flatten
+    model = PerceiverResampler(dim=256, dim_context=80)
+    x = mx.random.normal(shape=(8, 200, 80))
+    out = model(x)
+    # 32 latents (the default) of size dim=256 per batch item.
+    print("Output shape:", out.shape)  # [8, 32, 256]
+    # Count parameters for MLX model
+    weights = dict(tree_flatten(model.parameters()))
+    num_params = sum(v.size for v in weights.values())
+    print("{} M".format(num_params / 1e6))