PyPI - nexaai - Versions diffs - 1.0.29__cp310-cp310-macosx_14_0_universal2.whl - Mend

nexaai 1.0.29__cp310-cp310-macosx_14_0_universal2.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (580) hide show

nexaai/binds/metal/py-lib/mlx_audio/tts/models/spark/modules/encoder_decoder/feat_decoder.py ADDED Viewed

@@ -0,0 +1,120 @@
+# Copyright (c) 2025 SparkAudio
+#               2025 Xinsheng Wang (w.xinshawn@gmail.com)
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#   http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+from collections import OrderedDict
+from typing import List
+import mlx.core as mx
+import mlx.nn as nn
+from mlx_audio.codec.models.vocos.vocos import VocosBackbone
+from mlx_audio.tts.models.spark.modules.blocks.sampler import SamplingBlock
+class Decoder(nn.Module):
+    """Feature decoder: input projection, sampling stages, conditioned Vocos backbone.
+    Args:
+        input_channels (int): channel count of the incoming features.
+        vocos_dim (int): working width shared by every internal block.
+        vocos_intermediate_dim (int): intermediate width given to each VocosBackbone.
+        vocos_num_layers (int): layer count of the final (conditioned) backbone.
+        out_channels (int): channel count produced by the output projection.
+        condition_dim (int, optional): forwarded to the final backbone as
+            ``adanorm_num_embeddings``.
+        sample_ratios (List[int], optional): one stage is built per entry and the
+            entry is passed to ``SamplingBlock`` as ``upsample_scale``.
+            Defaults to ``[1, 1]``; ``[2, 2]`` builds two stages of scale 2.
+        use_tanh_at_final (bool): apply ``tanh`` to the output when True.
+    """
+    def __init__(
+        self,
+        input_channels: int,
+        vocos_dim: int,
+        vocos_intermediate_dim: int,
+        vocos_num_layers: int,
+        out_channels: int,
+        condition_dim: int = None,
+        sample_ratios: List[int] = None,
+        use_tanh_at_final: bool = False,
+    ):
+        super().__init__()
+        # Build the default per call instead of sharing one mutable list
+        # object across every instantiation.
+        if sample_ratios is None:
+            sample_ratios = [1, 1]
+        self.linear_pre = nn.Linear(input_channels, vocos_dim)
+        modules = []
+        for ratio in sample_ratios:
+            module_list = [
+                SamplingBlock(
+                    dim=vocos_dim,
+                    groups=vocos_dim,
+                    upsample_scale=ratio,
+                ),
+                # Per-stage backbone depth is fixed at 2, independent of
+                # ``vocos_num_layers``.
+                VocosBackbone(
+                    input_channels=vocos_dim,
+                    dim=vocos_dim,
+                    intermediate_dim=vocos_intermediate_dim,
+                    num_layers=2,
+                ),
+            ]
+            modules.append(module_list)
+        # The attribute keeps its historical name even though the stages use
+        # ``upsample_scale``; renaming it would change the parameter keys.
+        self.downsample = modules
+        self.vocos_backbone = VocosBackbone(
+            input_channels=vocos_dim,
+            dim=vocos_dim,
+            intermediate_dim=vocos_intermediate_dim,
+            num_layers=vocos_num_layers,
+            adanorm_num_embeddings=condition_dim,
+        )
+        self.linear = nn.Linear(vocos_dim, out_channels)
+        self.use_tanh_at_final = use_tanh_at_final
+    def __call__(self, x: mx.array, c: mx.array = None):
+        """Decoder forward pass.
+        Args:
+            x (mx.array): (batch_size, input_channels, length)
+            c (mx.array, optional): condition, handed to the final backbone
+                as ``bandwidth_id``.
+        Returns:
+            x (mx.array): channels-first output with ``out_channels`` channels;
+                the output length depends on the sampling stages.
+        """
+        # Channels-last for the linear projection.
+        x = self.linear_pre(x.transpose(0, 2, 1))
+        for modules in self.downsample:
+            for module in modules:
+                x = module(x)
+        x = self.vocos_backbone(x.transpose(0, 2, 1), bandwidth_id=c)
+        x = self.linear(x).transpose(0, 2, 1)
+        if self.use_tanh_at_final:
+            x = mx.tanh(x)
+        return x
+# Smoke test: run a randomly initialised Decoder and compare the output shape.
+if __name__ == "__main__":
+    expected_shape = (8, 256, 200)
+    # Batch size = 8, 1024 channels, length = 50
+    test_input = mx.random.normal((8, 1024, 50), dtype=mx.float32)
+    # NOTE(review): the original comment read "8, 256" while the shape passed
+    # is (256, 8) - confirm which layout the backbone expects.
+    condition = mx.random.randint(0, 100, (256, 8))
+    decoder = Decoder(
+        input_channels=1024,
+        vocos_dim=384,
+        vocos_intermediate_dim=2048,
+        vocos_num_layers=12,
+        out_channels=256,
+        condition_dim=256,
+        sample_ratios=[2, 2],
+    )
+    output = decoder(test_input, condition)
+    print(output.shape)
+    shape_ok = output.shape == expected_shape
+    print("Decoder test passed" if shape_ok else "Decoder test failed")

nexaai/binds/metal/py-lib/mlx_audio/tts/models/spark/modules/encoder_decoder/feat_encoder.py ADDED Viewed

@@ -0,0 +1,136 @@
+# Copyright (c) 2025 SparkAudio
+#               2025 Xinsheng Wang (w.xinshawn@gmail.com)
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#   http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+from typing import List
+import mlx.core as mx
+import mlx.nn as nn
+from mlx_audio.codec.models.vocos.vocos import VocosBackbone
+from mlx_audio.tts.models.spark.modules.blocks.sampler import SamplingBlock
+class Encoder(nn.Module):
+    """Encoder module with a VocosBackbone front end and sampling stages.
+    Args:
+        input_channels (int): channel count handed to the first backbone.
+        vocos_dim (int): working width shared by every internal block.
+        vocos_intermediate_dim (int): intermediate width given to each VocosBackbone.
+        vocos_num_layers (int): layer count of the first backbone.
+        out_channels (int): channel count produced by the final projection.
+        sample_ratios (List[int], optional): one stage is built per entry and the
+            entry is passed to ``SamplingBlock`` as ``downsample_scale``.
+            Defaults to ``[1, 1]``; ``[2, 2]`` builds two stages of scale 2.
+    """
+    def __init__(
+        self,
+        input_channels: int,
+        vocos_dim: int,
+        vocos_intermediate_dim: int,
+        vocos_num_layers: int,
+        out_channels: int,
+        sample_ratios: List[int] = None,
+    ):
+        super().__init__()
+        # Build the default per call instead of sharing one mutable list
+        # object across every instantiation.
+        if sample_ratios is None:
+            sample_ratios = [1, 1]
+        self.encoder = VocosBackbone(
+            input_channels=input_channels,
+            dim=vocos_dim,
+            intermediate_dim=vocos_intermediate_dim,
+            num_layers=vocos_num_layers,
+        )
+        modules = []
+        for ratio in sample_ratios:
+            modules.append(
+                [
+                    SamplingBlock(
+                        dim=vocos_dim,
+                        groups=vocos_dim,
+                        downsample_scale=ratio,
+                    ),
+                    # Per-stage backbone depth is fixed at 2, independent of
+                    # ``vocos_num_layers``.
+                    VocosBackbone(
+                        input_channels=vocos_dim,
+                        dim=vocos_dim,
+                        intermediate_dim=vocos_intermediate_dim,
+                        num_layers=2,
+                        bias=True,
+                    ),
+                ]
+            )
+        self.downsample = modules
+        self.project = nn.Linear(vocos_dim, out_channels)
+    def __call__(self, x: mx.array, *args):
+        """Encoder forward pass; extra positional arguments are ignored.
+        Args:
+            x (mx.array): (batch_size, input_channels, length)
+        Returns:
+            x (mx.array): (batch_size, encode_channels, length)
+        """
+        x = self.encoder(x)
+        for modules in self.downsample:
+            for module in modules:
+                # The layout is swapped before every sub-module, not once per stage.
+                x = x.transpose(0, 2, 1)
+                x = module(x)
+        x = self.project(x)
+        return x.transpose(0, 2, 1)
+    def sanitize(self, weights):
+        """Return a copy of ``weights`` with selected 3-D tensors axis-swapped.
+        ``dwconv.weight`` entries are transposed with ``(0, 2, 1)`` when axis 1
+        is smaller than the last axis; ``embed.weight`` entries when axis 1 is
+        larger than the last axis. Every other entry is passed through.
+        """
+        sanitized_weights = {}
+        for k, v in weights.items():
+            if "dwconv.weight" in k:
+                needs_swap = v.shape[1] < v.shape[-1]
+            elif "embed.weight" in k:
+                needs_swap = v.shape[1] > v.shape[-1]
+            else:
+                needs_swap = False
+            sanitized_weights[k] = v.transpose(0, 2, 1) if needs_swap else v
+        return sanitized_weights
+# Smoke test: run a randomly initialised Encoder and compare the output shape.
+if __name__ == "__main__":
+    expected_shape = (8, 256, 12)
+    # Batch size = 8, 1024 channels, length = 50
+    test_input = mx.random.normal((8, 1024, 50), dtype=mx.float32)
+    encoder = Encoder(
+        input_channels=1024,
+        vocos_dim=384,
+        vocos_intermediate_dim=2048,
+        vocos_num_layers=12,
+        out_channels=256,
+        sample_ratios=[2, 2],
+    )
+    output = encoder(test_input)
+    print(output.shape)
+    shape_ok = output.shape == expected_shape
+    print("test successful" if shape_ok else "test failed")

nexaai/binds/metal/py-lib/mlx_audio/tts/models/spark/modules/encoder_decoder/wave_generator.py ADDED Viewed

@@ -0,0 +1,113 @@
+import mlx.core as mx
+import mlx.nn as nn
+from mlx_audio.codec.models.descript.dac import (
+    ResidualUnit,
+    Snake1d,
+    WNConv1d,
+    WNConvTranspose1d,
+)
+class DecoderBlock(nn.Module):
+    """Snake activation, weight-normed transposed conv, then three residual units.
+    The residual units use dilations 1, 3 and 9 and all operate on ``output_dim``
+    channels.
+    """
+    def __init__(
+        self,
+        input_dim: int = 16,
+        output_dim: int = 8,
+        kernel_size: int = 2,
+        stride: int = 1,
+    ):
+        super().__init__()
+        pad = (kernel_size - stride) // 2
+        # Layers are created in the same order they are applied.
+        stages = [
+            Snake1d(input_dim),
+            WNConvTranspose1d(
+                input_dim,
+                output_dim,
+                kernel_size=kernel_size,
+                stride=stride,
+                padding=pad,
+            ),
+        ]
+        stages += [ResidualUnit(output_dim, dilation=d) for d in (1, 3, 9)]
+        self.block = nn.Sequential(*stages)
+    def __call__(self, x):
+        """Run ``x`` through the sequential block."""
+        out = self.block(x)
+        return out
+class WaveGenerator(nn.Module):
+    """Waveform generator: input conv, a chain of DecoderBlocks, output conv + tanh.
+    Args:
+        input_channel: channel count of the incoming features.
+        channels: width after the first conv; halved at every DecoderBlock.
+        rates: stride of each DecoderBlock.
+        kernel_sizes: kernel size of each DecoderBlock, paired with ``rates``
+            through ``zip`` (extra entries of the longer sequence are ignored).
+        d_out (int): channel count of the generated signal.
+    """
+    def __init__(
+        self,
+        input_channel,
+        channels,
+        rates,
+        kernel_sizes,
+        d_out: int = 1,
+    ):
+        super().__init__()
+        # Add first conv layer
+        layers = [WNConv1d(input_channel, channels, kernel_size=7, padding=3)]
+        # With no upsampling stage the final layers work on the full width;
+        # previously ``output_dim`` was unbound in that case (NameError).
+        output_dim = channels
+        # Add upsampling + MRF blocks
+        for i, (kernel_size, stride) in enumerate(zip(kernel_sizes, rates)):
+            input_dim = channels // 2**i
+            output_dim = channels // 2 ** (i + 1)
+            layers.append(DecoderBlock(input_dim, output_dim, kernel_size, stride))
+        # Add final conv layer
+        layers += [
+            Snake1d(output_dim),
+            WNConv1d(output_dim, d_out, kernel_size=7, padding=3),
+            nn.Tanh(),
+        ]
+        self.model = layers
+    def __call__(self, x):
+        """Apply every layer in order; axes 1 and 2 are swapped on entry and exit."""
+        x = x.transpose(0, 2, 1)
+        for module in self.model:
+            x = module(x)
+        return x.transpose(0, 2, 1)
+    def sanitize(self, weights):
+        """Return ``weights`` with keys renamed and selected tensors transposed.
+        * Keys containing ``decoder.model`` but not ``block.layers`` have every
+          ``block`` replaced by ``block.layers``.
+        * ``.alpha`` tensors are transposed with ``(0, 2, 1)`` when axis 1 is
+          larger than the last axis.
+        * ``weight_v`` / ``weight_g`` of ``block.layers.1`` directly under
+          ``decoder.model`` (exactly one ``block`` in the key) are transposed
+          with ``(1, 2, 0)`` when axis 0 is larger than the last axis.
+        """
+        sanitized_weights = {}
+        for k, v in weights.items():
+            if "decoder.model" in k and "block.layers" not in k:
+                # Only the key is renamed here; the value is stored once by
+                # the branch below.
+                k = k.replace("block", "block.layers")
+            if ".alpha" in k:
+                if v.shape[1] > v.shape[-1]:
+                    sanitized_weights[k] = v.transpose(0, 2, 1)
+                else:
+                    sanitized_weights[k] = v
+            elif (
+                "decoder.model" in k
+                and "block.layers.1" in k
+                and ("weight_v" in k or "weight_g" in k)
+                and k.count("block") == 1
+            ):
+                if v.shape[0] > v.shape[-1]:
+                    sanitized_weights[k] = v.transpose(1, 2, 0)
+                else:
+                    sanitized_weights[k] = v
+            else:
+                sanitized_weights[k] = v
+        return sanitized_weights
+# Smoke test: run a randomly initialised WaveGenerator and compare the output shape.
+if __name__ == "__main__":
+    expected_shape = (8, 1, 203)
+    test_input = mx.random.normal((8, 1024, 50), dtype=mx.float32)
+    wave_generator = WaveGenerator(1024, 16, [2, 2], [7, 7])
+    output = wave_generator(test_input)
+    print(output.shape)
+    shape_ok = output.shape == expected_shape
+    print("WaveGenerator test passed" if shape_ok else "WaveGenerator test failed")

nexaai/binds/metal/py-lib/mlx_audio/tts/models/spark/modules/finite_scalar_quantization.py ADDED Viewed

@@ -0,0 +1,238 @@
+"""
+Finite Scalar Quantization: VQ-VAE Made Simple - https://arxiv.org/abs/2309.15505
+Code adapted from Jax version in Appendix A.1
+"""
+from __future__ import annotations
+from contextlib import nullcontext
+from functools import partial, wraps
+from typing import List, Tuple
+import mlx.core as mx
+import mlx.nn as nn
+# helper functions
+def exists(v):
+    """Return True for any value other than ``None`` (falsy values included)."""
+    is_missing = v is None
+    return not is_missing
+def default(*args):
+    """Return the first argument that is not ``None``, or ``None`` if all are."""
+    return next((candidate for candidate in args if exists(candidate)), None)
+def maybe(fn):
+    """Wrap ``fn`` so that a ``None`` first argument is returned untouched."""
+    @wraps(fn)
+    def inner(x, *args, **kwargs):
+        # Only call through when there is something to operate on.
+        return fn(x, *args, **kwargs) if exists(x) else x
+    return inner
+# tensor helpers
+def round_ste(z: mx.array) -> mx.array:
+    """Round with straight through gradients.
+    The forward value equals ``z.round()``. The rounding residual is wrapped
+    in ``mx.stop_gradient`` so that the gradient with respect to ``z`` is the
+    identity; without it the ``z`` and ``-z`` terms cancel in the backward pass.
+    """
+    zhat = z.round()
+    return z + mx.stop_gradient(zhat - z)
+# main class
+class FSQ(nn.Module):
+    def __init__(
+        self,
+        levels: List[int],
+        dim: int | None = None,
+        num_codebooks=1,
+        keep_num_codebooks_dim: bool | None = None,
+        scale: float | None = None,
+        allowed_dtypes: Tuple[mx.dtype, ...] = (mx.float32, mx.float64),
+        channel_first: bool = False,
+        projection_has_bias: bool = True,
+        return_indices=True,
+        force_quantization_f32=True,
+    ):
+        """Set up the quantizer.
+        Args:
+            levels: number of quantization levels per codebook dimension.
+            dim: feature width of the input; defaults to
+                ``len(levels) * num_codebooks``.
+            num_codebooks: number of codebooks.
+            keep_num_codebooks_dim: defaults to ``num_codebooks > 1``; must not
+                be False when several codebooks are used (asserted).
+            scale: stored on the instance; not otherwise used in this constructor.
+            allowed_dtypes: dtypes that ``__call__`` quantizes without casting.
+            channel_first: stored flag consulted by ``__call__`` and
+                ``indices_to_codes``.
+            projection_has_bias: ``bias`` flag of the in/out linear projections.
+            return_indices: when True, ``codebook_size`` and the implicit
+                codebook are computed here.
+            force_quantization_f32: stored flag consulted by ``__call__``.
+        """
+        super().__init__()
+        _levels = mx.array(list(levels), dtype=mx.int32)
+        self._levels = _levels
+        # Mixed-radix place values: cumulative product of [1, levels[0], levels[1], ...].
+        _basis = mx.cumprod(mx.array([1] + list(levels[:-1])), axis=0)
+        self._basis = _basis
+        self.scale = scale
+        codebook_dim = len(levels)
+        self.codebook_dim = codebook_dim
+        effective_codebook_dim = codebook_dim * num_codebooks
+        self.num_codebooks = num_codebooks
+        self.effective_codebook_dim = effective_codebook_dim
+        keep_num_codebooks_dim = default(keep_num_codebooks_dim, num_codebooks > 1)
+        assert not (num_codebooks > 1 and not keep_num_codebooks_dim)
+        self.keep_num_codebooks_dim = keep_num_codebooks_dim
+        self.dim = default(dim, len(_levels) * num_codebooks)
+        self.channel_first = channel_first
+        # Linear projections are only created when the feature width differs
+        # from the total codebook width; otherwise both are identities.
+        has_projections = self.dim != effective_codebook_dim
+        self.project_in = (
+            nn.Linear(self.dim, effective_codebook_dim, bias=projection_has_bias)
+            if has_projections
+            else nn.Identity()
+        )
+        self.project_out = (
+            nn.Linear(effective_codebook_dim, self.dim, bias=projection_has_bias)
+            if has_projections
+            else nn.Identity()
+        )
+        self.has_projections = has_projections
+        self.return_indices = return_indices
+        # ``codebook_size`` and ``_implicit_codebook`` exist only when indices
+        # are requested. This call needs ``_levels`` and ``_basis`` set above.
+        if return_indices:
+            self.codebook_size = self._levels.prod().item()
+            implicit_codebook = self._indices_to_codes(mx.arange(self.codebook_size))
+            self._implicit_codebook = implicit_codebook
+        self.allowed_dtypes = allowed_dtypes
+        self.force_quantization_f32 = force_quantization_f32
+    def atanh(self, x):
+        """Inverse hyperbolic tangent, computed as ``log((1 + x) / (1 - x)) / 2``."""
+        ratio = (1 + x) / (1 - x)
+        return mx.log(ratio) / 2
+    def bound(self, z, eps: float = 1e-3):
+        """Squash `z`, an array of shape (..., d), with a per-level scaled tanh.
+        Levels with an even count get a 0.5 offset (and a matching input shift).
+        """
+        levels = self._levels
+        half_l = (levels - 1) * (1 + eps) / 2
+        has_even_count = levels % 2 == 0
+        offset = mx.where(has_even_count, 0.5, 0.0)
+        shift = self.atanh(offset / half_l)
+        squashed = mx.tanh(z + shift)
+        return squashed * half_l - offset
+    def quantize(self, z):
+        """Bound and round ``z``, then divide by ``levels // 2``; same shape as ``z``."""
+        rounded = round_ste(self.bound(z))
+        # Renormalize to [-1, 1].
+        return rounded / (self._levels // 2)
+    def _scale_and_shift(self, zhat_normalized):
+        """Multiply by ``levels // 2`` and then add ``levels // 2``."""
+        half = self._levels // 2
+        scaled = zhat_normalized * half
+        return scaled + half
+    def _scale_and_shift_inverse(self, zhat):
+        """Undo ``_scale_and_shift``: subtract ``levels // 2`` and then divide by it."""
+        half = self._levels // 2
+        centered = zhat - half
+        return centered / half
+    def _indices_to_codes(self, indices):
+        """Turn flat indices into codes: per-level indices, then the inverse scale/shift."""
+        return self._scale_and_shift_inverse(self.indices_to_level_indices(indices))
+    def codes_to_indices(self, zhat):
+        """Collapse a code (last axis of size ``codebook_dim``) into one int32 index."""
+        assert zhat.shape[-1] == self.codebook_dim
+        level_positions = self._scale_and_shift(zhat)
+        # Weight each level position by its basis value and sum over the code axis.
+        weighted = level_positions * self._basis
+        return weighted.sum(axis=-1).astype(mx.int32)
+    def indices_to_level_indices(self, indices):
+        """Split flat indices into one index per level (e.g. for factorized embeddings)."""
+        # Trailing singleton axis so the division broadcasts against the basis.
+        target_shape = (indices.shape[0], -1, 1)
+        column = mx.reshape(indices, target_shape)
+        return (column // self._basis) % self._levels
+    def indices_to_codes(self, indices):
+        """Inverse of `codes_to_indices`, followed by the output projection."""
+        assert exists(indices)
+        min_rank = 3 + int(self.keep_num_codebooks_dim)
+        is_img_or_video = indices.ndim >= min_rank
+        codes = self._indices_to_codes(indices)
+        if self.keep_num_codebooks_dim:
+            # Flatten everything after the leading axis.
+            codes = mx.reshape(codes, (codes.shape[0], -1))
+        codes = self.project_out(codes)
+        if is_img_or_video or self.channel_first:
+            codes = mx.reshape(codes, (codes.shape[0], -1, codes.shape[-1]))
+        return codes
+    def __call__(self, z):
+        """Quantize ``z`` and return ``(out, indices)``.
+        einstein notation
+        b - batch
+        n - sequence (or flattened spatial dimensions)
+        d - feature dimension
+        c - number of codebook dim
+        Args:
+            z (mx.array): input whose last axis must equal ``self.dim`` (asserted).
+                At least three axes (b, n, d) are needed because ``z.shape[1]``
+                is read after the input projection.
+        Returns:
+            tuple: ``out`` is the quantized, re-projected tensor cast back to the
+            input dtype; ``indices`` holds the codebook indices, or ``None``
+            when ``return_indices`` is False.
+        """
+        is_img_or_video = z.ndim >= 4
+        need_move_channel_last = is_img_or_video or self.channel_first
+        # standardize image or video into (batch, seq, dimension)
+        if need_move_channel_last:
+            z = mx.reshape(z, (z.shape[0], -1, z.shape[-1]))
+        assert (
+            z.shape[-1] == self.dim
+        ), f"expected dimension of {self.dim} but found dimension of {z.shape[-1]}"
+        z = self.project_in(z)
+        # Split the projected width into (c, d). Using ``codebook_dim`` for the
+        # last axis keeps the element count right when ``num_codebooks > 1``;
+        # for a single codebook it equals the previous ``z.shape[-1]``.
+        z = mx.reshape(
+            z, (z.shape[0], z.shape[1], self.num_codebooks, self.codebook_dim)
+        )
+        # whether to force quantization step to be full precision or not
+        force_f32 = self.force_quantization_f32
+        orig_dtype = z.dtype
+        if force_f32 and orig_dtype not in self.allowed_dtypes:
+            # mx.array has no ``.float()`` method; cast explicitly.
+            z = z.astype(mx.float32)
+        codes = self.quantize(z)
+        # returning indices could be optional
+        indices = None
+        if self.return_indices:
+            indices = self.codes_to_indices(codes)
+        codes = mx.reshape(codes, (codes.shape[0], codes.shape[1], -1))
+        codes = codes.astype(orig_dtype)
+        # project out
+        out = self.project_out(codes)
+        # reconstitute image or video dimensions
+        if need_move_channel_last:
+            # NOTE(review): ``out`` is 3-D here, so the second reshape asks for
+            # a 4-D shape with more elements than ``out`` holds unless the last
+            # axis is 1. The original spatial layout is never recorded, so this
+            # branch looks unfinished - confirm the intended behaviour.
+            out = mx.reshape(out, (out.shape[0], -1, out.shape[-1]))
+            out = mx.reshape(
+                out, (out.shape[0], out.shape[1], out.shape[2], out.shape[-1])
+            )
+            # ``indices`` stays None when indices were not requested.
+            if indices is not None:
+                indices = mx.reshape(
+                    indices, (indices.shape[0], -1, indices.shape[-1])
+                )
+        if not self.keep_num_codebooks_dim and self.return_indices:
+            indices = mx.reshape(indices, (indices.shape[0], -1, indices.shape[-1]))
+        # return quantized output and indices
+        return out, indices