PyPI - nexaai - Versions diffs - 1.0.29__cp310-cp310-macosx_14_0_universal2.whl - Mend

nexaai 1.0.29__cp310-cp310-macosx_14_0_universal2.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (580) hide show

nexaai/mlx_backend/mlx_audio/__init__.py ADDED Viewed

File without changes

nexaai/mlx_backend/mlx_audio/codec/__init__.py ADDED Viewed

	@@ -0,0 +1 @@
1	+ from .models import DAC, Encodec, Mimi, Vocos

nexaai/mlx_backend/mlx_audio/codec/models/__init__.py ADDED Viewed

@@ -0,0 +1,5 @@
+from .descript import DAC
+from .encodec import Encodec
+from .mimi import Mimi
+from .snac import SNAC
+from .vocos import Vocos

nexaai/mlx_backend/mlx_audio/codec/models/bigvgan/__init__.py ADDED Viewed

	@@ -0,0 +1 @@
1	+ from .bigvgan import BigVGAN, BigVGANConfig

nexaai/mlx_backend/mlx_audio/codec/models/bigvgan/activation.py ADDED Viewed

@@ -0,0 +1,51 @@
+import mlx.core as mx
+import mlx.nn as nn
+class Snake(nn.Module):
+    def __init__(
+        self, in_features: int, alpha: float = 1.0, alpha_logscale: bool = False
+    ):
+        super().__init__()
+        self.alpha_logscale = alpha_logscale
+        self.alpha = (
+            mx.zeros(in_features) if alpha_logscale else mx.ones(in_features)
+        ) * alpha
+    def __call__(self, x: mx.array):
+        alpha = self.alpha[None, :, None]
+        if self.alpha_logscale:
+            alpha = mx.exp(alpha)
+        x += (1.0 / (alpha + 1e-9)) * mx.power(mx.sin(x * alpha), 2)
+        return x
+class SnakeBeta(nn.Module):
+    def __init__(
+        self, in_features: int, alpha: float = 1.0, alpha_logscale: bool = False
+    ):
+        super().__init__()
+        self.alpha_logscale = alpha_logscale
+        self.alpha = (
+            mx.zeros(in_features) if alpha_logscale else mx.ones(in_features)
+        ) * alpha
+        self.beta = (
+            mx.zeros(in_features) if alpha_logscale else mx.ones(in_features)
+        ) * alpha
+    def __call__(self, x: mx.array):
+        alpha = self.alpha[None, None, :]
+        beta = self.beta[None, None, :]
+        if self.alpha_logscale:
+            alpha = mx.exp(alpha)
+            beta = mx.exp(beta)
+        x += (1.0 / (beta + 1e-9)) * mx.power(mx.sin(x * alpha), 2)
+        return x

nexaai/mlx_backend/mlx_audio/codec/models/bigvgan/amp.py ADDED Viewed

@@ -0,0 +1,96 @@
+import mlx.core as mx
+import mlx.nn as nn
+from typing_extensions import Literal
+from mlx_audio.codec.models.bigvgan.activation import Snake, SnakeBeta
+from mlx_audio.codec.models.bigvgan.conv import WNConv1d
+from mlx_audio.codec.models.bigvgan.resample import Activation1d
+class AMPBlock1(nn.Module):
+    def __init__(
+        self,
+        channels: int,
+        snake_logscale: bool,
+        activation: Literal["snake", "snakebeta"],
+        kernel_size=3,
+        dilation: list[int] = [1, 3, 5],
+    ):
+        super().__init__()
+        self.convs1 = [
+            WNConv1d(
+                channels,
+                channels,
+                kernel_size,
+                stride=1,
+                dilation=d,
+                padding=((kernel_size - 1) * d) // 2,
+            )
+            for d in dilation
+        ]
+        self.convs2 = [
+            WNConv1d(
+                channels,
+                channels,
+                kernel_size,
+                stride=1,
+                dilation=1,
+                padding=(kernel_size - 1) // 2,
+            )
+            for _ in dilation
+        ]
+        self.activations = [
+            Activation1d(
+                Snake(channels, alpha_logscale=snake_logscale)
+                if activation == "snake"
+                else SnakeBeta(channels, alpha_logscale=snake_logscale)
+            )
+            for _ in range(len(dilation) * 2)
+        ]
+    def __call__(self, x: mx.array):
+        for conv1, conv2, activation1, activation2 in zip(
+            self.convs1, self.convs2, self.activations[::2], self.activations[1::2]
+        ):
+            x = x + conv2(activation2(conv1(activation1(x))))
+        return x
+class AMPBlock2(nn.Module):
+    def __init__(
+        self,
+        channels: int,
+        snake_logscale: bool,
+        activation: Literal["snake", "snakebeta"],
+        kernel_size=3,
+        dilation: list[int] = [1, 3, 5],
+    ):
+        super().__init__()
+        self.convs = [
+            WNConv1d(
+                channels,
+                channels,
+                kernel_size,
+                stride=1,
+                dilation=d,
+                padding=((kernel_size - 1) * d) // 2,
+            )
+            for d in dilation
+        ]
+        self.activations = [
+            Activation1d(
+                Snake(channels, alpha_logscale=snake_logscale)
+                if activation == "snake"
+                else SnakeBeta(channels, alpha_logscale=snake_logscale)
+            )
+            for _ in dilation
+        ]
+    def __call__(self, x: mx.array):
+        for conv, activation in zip(self.convs, self.activations):
+            x = x + conv(activation(x))
+        return x

nexaai/mlx_backend/mlx_audio/codec/models/bigvgan/bigvgan.py ADDED Viewed

@@ -0,0 +1,149 @@
+from dataclasses import dataclass
+from typing import Literal
+import mlx.core as mx
+import mlx.nn as nn
+from mlx.utils import tree_flatten
+from mlx_audio.codec.models.bigvgan.activation import Snake, SnakeBeta
+from mlx_audio.codec.models.bigvgan.amp import AMPBlock1, AMPBlock2
+from mlx_audio.codec.models.bigvgan.conv import WNConv1d, WNConvTranspose1d
+from mlx_audio.codec.models.bigvgan.resample import Activation1d
+@dataclass
+class BigVGANConfig:
+    """Hyper-parameters consumed by ``BigVGAN.__init__``."""
+    # Input channel count of the first convolution (conv_pre).
+    num_mels: int
+    # Stride of each transposed-conv upsampling stage; one entry per stage.
+    upsample_rates: list[int]
+    # Kernel size of each upsampling stage; zipped with upsample_rates.
+    upsample_kernel_sizes: list[int]
+    # Channel count after conv_pre; halved (integer division) at every stage.
+    upsample_initial_channel: int
+    # "1" selects AMPBlock1; any other value selects AMPBlock2.
+    resblock: Literal["1", "2"]
+    # Kernel size per residual block within a stage.
+    resblock_kernel_sizes: list[int]
+    # Dilation list per residual block; zipped with resblock_kernel_sizes.
+    resblock_dilation_sizes: list[list[int]]
+    # "snake" selects Snake; any other value selects SnakeBeta.
+    activation: Literal["snakebeta", "snake"]
+    # Forwarded to the activations as alpha_logscale.
+    snake_logscale: bool
+    use_bias_at_final: bool = True  # compatibility
+    use_tanh_at_final: bool = True  # compatibility
+class BigVGAN(nn.Module):
+    def __init__(self, config: BigVGANConfig):
+        super().__init__()
+        self.num_kernels = len(config.resblock_kernel_sizes)
+        self.num_upsamples = len(config.upsample_rates)
+        self.use_tanh_at_final = config.use_tanh_at_final
+        self.conv_pre = WNConv1d(
+            config.num_mels, config.upsample_initial_channel, 7, 1, 3
+        )
+        self.ups = [
+            [
+                WNConvTranspose1d(
+                    config.upsample_initial_channel // (2**i),
+                    config.upsample_initial_channel // (2 ** (i + 1)),
+                    k,
+                    u,
+                    padding=(k - u) // 2,
+                )
+            ]
+            for i, (u, k) in enumerate(
+                zip(config.upsample_rates, config.upsample_kernel_sizes)
+            )
+        ]
+        self.resblocks = [
+            (
+                AMPBlock1(
+                    config.upsample_initial_channel // (2 ** (i + 1)),
+                    config.snake_logscale,
+                    config.activation,
+                    k,
+                    d,
+                )
+                if config.resblock == "1"
+                else AMPBlock2(
+                    config.upsample_initial_channel // (2 ** (i + 1)),
+                    config.snake_logscale,
+                    config.activation,
+                    k,
+                    d,
+                )
+            )
+            for i in range(len(self.ups))
+            for j, (k, d) in enumerate(
+                zip(config.resblock_kernel_sizes, config.resblock_dilation_sizes)
+            )
+        ]
+        self.activation_post = Activation1d(
+            Snake(
+                config.upsample_initial_channel // (2 ** len(self.ups)),
+                alpha_logscale=config.snake_logscale,
+            )
+            if config.activation == "snake"
+            else SnakeBeta(
+                config.upsample_initial_channel // (2 ** len(self.ups)),
+                alpha_logscale=config.snake_logscale,
+            )
+        )
+        self.conv_post = WNConv1d(
+            config.upsample_initial_channel // (2 ** len(self.ups)),
+            1,
+            7,
+            1,
+            padding=3,
+            bias=config.use_bias_at_final,
+        )
+    def __call__(
+        self, x: mx.array, *args, **kwargs
+    ) -> mx.array:  # (batch, num_mels, seq)
+        x = x.transpose(0, 2, 1)
+        x = self.conv_pre(x)
+        for step in range(self.num_upsamples):
+            for idx in range(len(self.ups[step])):
+                x = self.ups[step][idx](x)
+            xs = self.resblocks[step * self.num_kernels](x)
+            for idx in range(1, self.num_kernels):
+                xs += self.resblocks[step * self.num_kernels + idx](x)
+            x = xs / self.num_kernels
+        x = self.activation_post(x)
+        x = self.conv_post(x)
+        if self.use_tanh_at_final:
+            x = mx.tanh(x)
+        else:
+            x = mx.clip(x, -1.0, 1.0)
+        return x.transpose(0, 2, 1)
+    def sanitize(self, weights: dict[str, mx.array]):
+        new_weights = {}
+        curr_weights = dict(tree_flatten(self.parameters()))
+        for key, value in weights.items():
+            if "num_batches_tracked" in key:
+                continue
+            if "conv" in key or "lowpass.filter" in key or "upsample.filter" in key:
+                if value.ndim == 3:
+                    if value.shape != curr_weights[key].shape:
+                        value = value.transpose(0, 2, 1)
+                elif value.ndim == 4:
+                    if value.shape != curr_weights[key].shape:
+                        value = value.transpose(0, 2, 3, 1)
+            if "ups." in key:
+                if value.ndim == 3:
+                    if value.shape != curr_weights[key].shape:
+                        value = value.transpose(1, 2, 0)
+            new_weights[key] = value
+        del curr_weights
+        return new_weights

nexaai/mlx_backend/mlx_audio/codec/models/bigvgan/conv.py ADDED Viewed

@@ -0,0 +1,114 @@
+import math
+import mlx.core as mx
+import mlx.nn as nn
+def normalize_weight(x, except_dim=0):
+    if x.ndim != 3:
+        raise ValueError("Input tensor must have 3 dimensions")
+    axes = tuple(i for i in range(x.ndim) if i != except_dim)
+    return mx.sqrt(mx.sum(mx.power(x, 2), axis=axes, keepdims=True))
+class WNConv1d(nn.Module):
+    def __init__(
+        self,
+        in_channels: int,
+        out_channels: int,
+        kernel_size: int,
+        stride: int = 1,
+        padding: int = 0,
+        dilation: int = 1,
+        groups: int = 1,
+        bias: bool = True,
+    ):
+        super().__init__()
+        if bias:
+            self.bias = mx.zeros((out_channels,))
+        self.kernel_size = kernel_size
+        self.padding = padding
+        self.dilation = dilation
+        self.stride = stride
+        self.groups = groups
+        scale = math.sqrt(1 / (in_channels * kernel_size))
+        weight_init = mx.random.uniform(
+            low=-scale,
+            high=scale,
+            shape=(out_channels, kernel_size, in_channels),
+        )
+        self.weight_g = normalize_weight(weight_init)
+        self.weight_v = weight_init / (self.weight_g + 1e-12)
+    def _extra_repr(self):
+        return (
+            f"in_channels={self.weight_v.shape[2]}, out_channels={self.weight_v.shape[0]}, "
+            f"kernel_size={self.kernel_size}, stride={self.stride}, "
+            f"padding={self.padding}, dilation={self.dilation}, "
+            f"bias={'bias' in self}"
+        )
+    def __call__(self, x):
+        weight = self.weight_g * self.weight_v / normalize_weight(self.weight_v)
+        y = mx.conv1d(x, weight, self.stride, self.padding, self.dilation, self.groups)
+        if "bias" in self:
+            y = y + self.bias
+        return y
+class WNConvTranspose1d(nn.Module):
+    def __init__(
+        self,
+        in_channels: int,
+        out_channels: int,
+        kernel_size: int,
+        stride: int = 1,
+        padding: int = 0,
+        dilation: int = 1,
+        output_padding: int = 0,
+        bias: bool = True,
+    ):
+        super().__init__()
+        self.bias = mx.zeros((out_channels,)) if bias else None
+        self.kernel_size = kernel_size
+        self.padding = padding
+        self.dilation = dilation
+        self.stride = stride
+        self.output_padding = output_padding
+        scale = math.sqrt(1 / (in_channels * kernel_size))
+        weight_init = mx.random.uniform(
+            low=-scale,
+            high=scale,
+            shape=(out_channels, kernel_size, in_channels),
+        )
+        self.weight_g = normalize_weight(weight_init, except_dim=2)
+        self.weight_v = weight_init / (self.weight_g + 1e-12)
+    def _extra_repr(self):
+        return (
+            f"in_channels={self.weight_v.shape[2]}, out_channels={self.weight_v.shape[0]}, "
+            f"kernel_size={self.kernel_size}, stride={self.stride}, "
+            f"padding={self.padding}, dilation={self.dilation}, "
+            f"output_padding={self.output_padding}, bias={'bias' in self}"
+        )
+    def __call__(self, x):
+        weight = (
+            self.weight_g
+            * self.weight_v
+            / normalize_weight(self.weight_v, except_dim=2)
+        )
+        y = mx.conv_transpose1d(
+            x, weight, self.stride, self.padding, self.dilation, self.output_padding
+        )
+        nn.ConvTranspose1d
+        if self.bias is not None:
+            y = y + self.bias
+        return y

nexaai/mlx_backend/mlx_audio/codec/models/bigvgan/resample.py ADDED Viewed

@@ -0,0 +1,177 @@
+import math
+from typing import Optional
+import mlx.core as mx
+import mlx.nn as nn
+import numpy as np
+def sinc(x: mx.array):
+    return mx.where(
+        x == 0,
+        mx.array(1.0, dtype=x.dtype),
+        mx.sin(math.pi * x) / math.pi / x,
+    )
+def kaiser_sinc_filter1d(
+    cutoff: float, half_width: float, kernel_size: int
+) -> mx.array:  # return filter [1,kernel_size,1]
+    even = kernel_size % 2 == 0
+    half_size = kernel_size // 2
+    # For kaiser window
+    delta_f = 4 * half_width
+    A = 2.285 * (half_size - 1) * math.pi * delta_f + 7.95
+    if A > 50.0:
+        beta = 0.1102 * (A - 8.7)
+    elif A >= 21.0:
+        beta = 0.5842 * (A - 21) ** 0.4 + 0.07886 * (A - 21.0)
+    else:
+        beta = 0.0
+    window = mx.array(np.kaiser(kernel_size, beta=beta))
+    # ratio = 0.5/cutoff -> 2 * cutoff = 1 / ratio
+    if even:
+        time = mx.arange(-half_size, half_size) + 0.5
+    else:
+        time = mx.arange(kernel_size) - half_size
+    if cutoff == 0:
+        filter = mx.zeros_like(time).reshape(1, kernel_size, 1)
+    else:
+        filter_ = 2 * cutoff * window * sinc(2 * cutoff * time)
+        filter_ /= filter_.sum()
+        filter = filter_.reshape(1, kernel_size, 1)
+    return filter
+class LowPassFilter1d(nn.Module):
+    def __init__(
+        self,
+        cutoff: float = 0.5,
+        half_width: float = 0.6,
+        stride: int = 1,
+        padding: bool = True,
+        padding_mode: str = "edge",
+        kernel_size: int = 12,
+    ):
+        super().__init__()
+        if cutoff < -0.0:
+            raise ValueError("Minimum cutoff must be larger than zero.")
+        if cutoff > 0.5:
+            raise ValueError("A cutoff above 0.5 does not make sense.")
+        self.even = kernel_size % 2 == 0
+        self.stride = stride
+        self.pad_left = kernel_size // 2 - int(self.even)
+        self.pad_right = kernel_size // 2
+        self.padding = padding
+        self.padding_mode = padding_mode
+        self.filter = kaiser_sinc_filter1d(
+            cutoff, half_width, kernel_size
+        )  # (1, kernel_size, 1)
+        mx.eval(self.filter)
+    def __call__(self, x: mx.array):  # (b, t, c)
+        _, _, C = x.shape
+        if self.padding:
+            x = mx.pad(
+                x,
+                ((0, 0), (self.pad_left, self.pad_right), (0, 0)),
+                mode=self.padding_mode,
+            )
+        expanded_filter = mx.broadcast_to(self.filter, (C, *self.filter.shape[1:]))
+        out = mx.conv1d(
+            x,
+            expanded_filter,
+            stride=self.stride,
+            groups=C,
+        )
+        return out
+class UpSample1d(nn.Module):
+    def __init__(self, ratio: int = 2, kernel_size: Optional[int] = None):
+        super().__init__()
+        self.ratio = ratio
+        self.kernel_size = (
+            int(6 * ratio // 2) * 2 if kernel_size is None else kernel_size
+        )
+        self.stride = ratio
+        self.pad = self.kernel_size // ratio - 1
+        self.pad_left = self.pad * self.stride + (self.kernel_size - self.stride) // 2
+        self.pad_right = (
+            self.pad * self.stride + (self.kernel_size - self.stride + 1) // 2
+        )
+        self.filter = kaiser_sinc_filter1d(
+            cutoff=0.5 / ratio, half_width=0.6 / ratio, kernel_size=self.kernel_size
+        )
+        mx.eval(self.filter)
+    def __call__(self, x: mx.array) -> mx.array:  # (b, t, c)
+        _, _, C = x.shape
+        x = mx.pad(x, ((0, 0), (self.pad, self.pad), (0, 0)), mode="edge")
+        expanded_filter = mx.broadcast_to(self.filter, (C, *self.filter.shape[1:]))
+        x = self.ratio * mx.conv_transpose1d(
+            x,
+            expanded_filter,
+            stride=self.stride,
+            groups=C,
+        )
+        return x[:, self.pad_left : -self.pad_right, :]
+class DownSample1d(nn.Module):
+    def __init__(self, ratio: int = 2, kernel_size: Optional[int] = None):
+        super().__init__()
+        self.ratio = ratio
+        self.kernel_size = (
+            int(6 * ratio // 2) * 2 if kernel_size is None else kernel_size
+        )
+        self.lowpass = LowPassFilter1d(
+            cutoff=0.5 / ratio,
+            half_width=0.6 / ratio,
+            stride=ratio,
+            kernel_size=self.kernel_size,
+        )
+    def __call__(self, x: mx.array) -> mx.array:  # (b, t, c)
+        return self.lowpass(x)
+class Activation1d(nn.Module):
+    def __init__(
+        self,
+        activation: nn.Module,
+        up_ratio: int = 2,
+        down_ratio: int = 2,
+        up_kernel_size: int = 12,
+        down_kernel_size: int = 12,
+    ):
+        super().__init__()
+        self.up_ratio = up_ratio
+        self.down_ratio = down_ratio
+        self.act = activation
+        self.upsample = UpSample1d(up_ratio, up_kernel_size)
+        self.downsample = DownSample1d(down_ratio, down_kernel_size)
+    def __call__(self, x):
+        x = self.upsample(x)
+        x = self.act(x)
+        x = self.downsample(x)
+        return x

nexaai/mlx_backend/mlx_audio/codec/models/descript/__init__.py ADDED Viewed

	@@ -0,0 +1 @@
1	+ from .dac import DAC