PyPI - nexaai - Versions diffs - 1.0.29__cp310-cp310-macosx_14_0_universal2.whl - Mend

nexaai 1.0.29__cp310-cp310-macosx_14_0_universal2.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (580) hide show

nexaai/binds/metal/py-lib/mlx_audio/codec/models/vocos/vocos.py ADDED Viewed

@@ -0,0 +1,359 @@
+from __future__ import annotations
+from pathlib import Path
+from types import SimpleNamespace
+from typing import Any, List, Optional
+import mlx.core as mx
+import mlx.nn as nn
+import yaml
+from huggingface_hub import snapshot_download
+from mlx_audio.utils import hanning, istft
+from ..encodec import Encodec
+from .mel import log_mel_spectrogram
+class FeatureExtractor(nn.Module):
+    """Base class for feature extractors."""
+    def __call__(self, audio: mx.array, **kwargs) -> mx.array:
+        raise NotImplementedError("Subclasses must implement the forward method.")
+class MelSpectrogramFeatures(FeatureExtractor):
+    def __init__(
+        self,
+        sample_rate=24_000,
+        n_fft=1024,
+        hop_length=256,
+        n_mels=100,
+        padding="center",
+    ):
+        super().__init__()
+        if padding not in ["center", "same"]:
+            raise ValueError("Padding must be 'center' or 'same'.")
+        self.padding = padding
+        self.sample_rate = sample_rate
+        self.n_fft = n_fft
+        self.hop_length = hop_length
+        self.n_mels = n_mels
+    def __call__(self, audio: mx.array, **kwargs):
+        return log_mel_spectrogram(
+            audio,
+            sample_rate=self.sample_rate,
+            n_mels=self.n_mels,
+            n_fft=self.n_fft,
+            hop_length=self.hop_length,
+            padding=0,
+        )
+class EncodecFeatures(FeatureExtractor):
+    def __init__(
+        self,
+        encodec_model: str = "encodec_24khz",
+        bandwidths: List[float] = [1.5, 3.0, 6.0, 12.0],
+        train_codebooks: bool = False,
+    ):
+        super().__init__()
+        if encodec_model == "encodec_24khz":
+            encodec, preprocessor = Encodec.from_pretrained(
+                "mlx-community/encodec-24khz-float32"
+            )
+        elif encodec_model == "encodec_48khz":
+            encodec, preprocessor = Encodec.from_pretrained(
+                "mlx-community/encodec-48khz-float32"
+            )
+        else:
+            raise ValueError(
+                f"Unsupported encodec_model: {encodec_model}. Supported options are 'encodec_24khz' and 'encodec_48khz'."
+            )
+        self.encodec = encodec
+        self.preprocessor = preprocessor
+        self.num_q = self.encodec.quantizer.get_num_quantizers_for_bandwidth(
+            bandwidth=max(bandwidths)
+        )
+        self.codebook_weights = mx.concatenate(
+            [vq.codebook.embed for vq in self.encodec.quantizer.layers[: self.num_q]]
+        )
+        self.bandwidths = bandwidths
+    def get_encodec_codes(self, audio: mx.array, bandwidth_id: int) -> mx.array:
+        features, mask = self.preprocessor(audio)
+        if isinstance(bandwidth_id, mx.array):
+            bandwidth_id = int(bandwidth_id.flatten().tolist()[0])
+        elif isinstance(bandwidth_id, list):
+            bandwidth_id = bandwidth_id[0]
+        codes, _ = self.encodec.encode(
+            features, mask, bandwidth=self.bandwidths[bandwidth_id]
+        )
+        return mx.reshape(codes, (codes.shape[-2], 1, codes.shape[-1]))
+    def get_features_from_codes(self, codes: mx.array) -> mx.array:
+        offsets = mx.arange(
+            0,
+            self.encodec.quantizer.codebook_size * codes.shape[0],
+            self.encodec.quantizer.codebook_size,
+        )
+        embeddings_idxs = codes + mx.reshape(offsets, (offsets.shape[0], 1, 1))
+        embeddings = self.codebook_weights[embeddings_idxs]
+        features = mx.sum(embeddings, axis=0)
+        return features
+    def __call__(self, audio: mx.array, **kwargs) -> mx.array:
+        bandwidth_id = kwargs.get("bandwidth_id")
+        if bandwidth_id is None:
+            raise ValueError("The 'bandwidth_id' argument is required")
+        codes = self.get_encodec_codes(audio, bandwidth_id=bandwidth_id)
+        return self.get_features_from_codes(codes)
+class ISTFTHead(nn.Module):
+    def __init__(self, dim: int, n_fft: int, hop_length: int, padding: str = "center"):
+        super().__init__()
+        self.n_fft = n_fft
+        self.hop_length = hop_length
+        self.out = nn.Linear(dim, n_fft + 2)
+    def __call__(self, x: mx.array) -> mx.array:
+        x = self.out(x).swapaxes(1, 2)
+        mag, p = x.split(2, axis=1)
+        mag = mx.exp(mag)
+        mag = mx.clip(mag, None, 1e2)
+        x = mx.cos(p)
+        y = mx.sin(p)
+        S = mag * (x + 1j * y)
+        audio = istft(
+            S.squeeze(0),
+            window=hanning(self.n_fft),
+            hop_length=self.hop_length,
+            win_length=self.n_fft,
+        )
+        return audio
+class ConvNeXtBlock(nn.Module):
+    def __init__(
+        self,
+        dim: int,
+        intermediate_dim: int,
+        layer_scale_init_value: float,
+        adanorm_num_embeddings: Optional[int] = None,
+    ):
+        super().__init__()
+        # depthwise conv
+        self.dwconv = nn.Conv1d(dim, dim, kernel_size=7, padding=3, groups=dim)
+        self.adanorm = adanorm_num_embeddings is not None
+        if adanorm_num_embeddings:
+            self.norm = AdaLayerNorm(adanorm_num_embeddings, dim, eps=1e-6)
+        else:
+            self.norm = nn.LayerNorm(dim, eps=1e-6)
+        # pointwise/1x1 convs, implemented with linear layers
+        self.pwconv1 = nn.Linear(dim, intermediate_dim)
+        self.act = nn.GELU()
+        self.pwconv2 = nn.Linear(intermediate_dim, dim)
+        self.gamma = (
+            layer_scale_init_value * mx.ones(dim)
+            if layer_scale_init_value > 0
+            else None
+        )
+    def __call__(
+        self, x: mx.array, cond_embedding_id: Optional[mx.array] = None
+    ) -> mx.array:
+        residual = x
+        x = self.dwconv(x)
+        if self.adanorm:
+            assert cond_embedding_id is not None
+            x = self.norm(x, cond_embedding_id)
+        else:
+            x = self.norm(x)
+        x = self.pwconv1(x)
+        x = self.act(x)
+        x = self.pwconv2(x)
+        if self.gamma is not None:
+            x = self.gamma * x
+        x = residual + x
+        return x
+class AdaLayerNorm(nn.Module):
+    def __init__(self, num_embeddings: int, embedding_dim: int, eps: float = 1e-6):
+        super().__init__()
+        self.eps = eps
+        self.dim = embedding_dim
+        self.scale = nn.Linear(num_embeddings, embedding_dim)
+        self.shift = nn.Linear(num_embeddings, embedding_dim)
+        self.scale.weight = mx.ones(self.scale.weight.shape)
+        self.shift.weight = mx.zeros(self.shift.weight.shape)
+    def __call__(self, x: mx.array, cond_embedding: mx.array) -> mx.array:
+        scale = self.scale(cond_embedding)
+        shift = self.shift(cond_embedding)
+        x = mx.fast.layer_norm(x, weight=None, bias=None, eps=self.eps)
+        x = x * scale[:, None, :] + shift[:, None, :]
+        return x
+class VocosBackbone(nn.Module):
+    def __init__(
+        self,
+        input_channels: int,
+        dim: int,
+        intermediate_dim: int,
+        num_layers: int,
+        layer_scale_init_value: Optional[float] = None,
+        adanorm_num_embeddings: Optional[int] = None,
+        bias: bool = True,
+    ):
+        super().__init__()
+        self.input_channels = input_channels
+        self.embed = nn.Conv1d(input_channels, dim, kernel_size=7, padding=3)
+        self.adanorm = adanorm_num_embeddings is not None
+        if adanorm_num_embeddings:
+            self.norm = AdaLayerNorm(adanorm_num_embeddings, dim, eps=1e-6)
+        else:
+            self.norm = nn.LayerNorm(dim, eps=1e-6)
+        layer_scale_init_value = layer_scale_init_value or 1 / num_layers
+        self.convnext = [
+            ConvNeXtBlock(
+                dim=dim,
+                intermediate_dim=intermediate_dim,
+                layer_scale_init_value=layer_scale_init_value,
+                adanorm_num_embeddings=adanorm_num_embeddings,
+            )
+            for _ in range(num_layers)
+        ]
+        self.final_layer_norm = nn.LayerNorm(dim, eps=1e-6, bias=bias)
+    def __call__(self, x: mx.array, **kwargs) -> mx.array:
+        bandwidth_id = kwargs.get("bandwidth_id", None)
+        # Transpose if the input is not in the correct shape
+        if x.shape[-1] != self.input_channels:
+            x = x.transpose(0, 2, 1)
+        x = self.embed(x)
+        if self.adanorm:
+            assert bandwidth_id is not None
+            x = self.norm(x, bandwidth_id)
+        else:
+            x = self.norm(x)
+        for conv_block in self.convnext:
+            x = conv_block(x, cond_embedding_id=bandwidth_id)
+        x = self.final_layer_norm(x)
+        return x
+class Vocos(nn.Module):
+    def __init__(
+        self,
+        feature_extractor: FeatureExtractor,
+        backbone: VocosBackbone,
+        head: ISTFTHead,
+    ):
+        super().__init__()
+        self.feature_extractor = feature_extractor
+        self.backbone = backbone
+        self.head = head
+    @classmethod
+    def from_hparams(cls, config: dict) -> Vocos:
+        """
+        Class method to create a new Vocos model instance from hyperparameters stored in a yaml configuration file.
+        """
+        config = SimpleNamespace(**config)
+        if "MelSpectrogramFeatures" in config.feature_extractor["class_path"]:
+            feature_extractor_init_args = config.feature_extractor["init_args"]
+            feature_extractor = MelSpectrogramFeatures(**feature_extractor_init_args)
+        elif "EncodecFeatures" in config.feature_extractor["class_path"]:
+            feature_extractor = EncodecFeatures(**config.feature_extractor["init_args"])
+        backbone = VocosBackbone(**config.backbone["init_args"])
+        head = ISTFTHead(**config.head["init_args"])
+        model = cls(feature_extractor=feature_extractor, backbone=backbone, head=head)
+        return model
+    @classmethod
+    def from_pretrained(cls, path_or_repo: str) -> Vocos:
+        """
+        Class method to create a new Vocos model instance from a pre-trained model stored in the Hugging Face model hub.
+        """
+        path = Path(path_or_repo)
+        if not path.exists():
+            path = Path(
+                snapshot_download(
+                    repo_id=path_or_repo,
+                    allow_patterns=["*.yaml", "*.safetensors"],
+                )
+            )
+        model_path = path / "model.safetensors"
+        with open(model_path, "rb") as f:
+            weights = mx.load(f)
+        config_path = path / "config.yaml"
+        with open(config_path, "r") as f:
+            config = yaml.safe_load(f)
+        model = cls.from_hparams(config)
+        # remove unused weights
+        try:
+            del weights["feature_extractor.mel_spec.spectrogram.window"]
+            del weights["head.istft.window"]
+        except KeyError:
+            pass
+        # transpose weights as needed
+        new_weights = {}
+        for k, v in weights.items():
+            basename, pname = k.rsplit(".", 1)
+            if "backbone.embed" in basename and pname == "weight":
+                new_weights[k] = v.moveaxis(1, 2)
+            elif "dwconv" in basename and pname == "weight":
+                new_weights[k] = v.moveaxis(1, 2)
+            else:
+                new_weights[k] = v
+        # use strict = False to avoid the encodec weights
+        model.load_weights(list(new_weights.items()), strict=False)
+        model.eval()
+        return model
+    def __call__(self, audio_input: mx.array, **kwargs: Any) -> mx.array:
+        features = self.feature_extractor(audio_input, **kwargs)
+        audio_output = self.decode(features, **kwargs)
+        return audio_output
+    def get_encodec_codes(self, audio_input: mx.array, bandwidth_id: int) -> mx.array:
+        if not isinstance(self.feature_extractor, EncodecFeatures):
+            raise ValueError("This model does not support getting encodec codes.")
+        return self.feature_extractor.get_encodec_codes(audio_input, bandwidth_id)
+    def decode(self, features_input: mx.array, **kwargs: Any) -> mx.array:
+        x = self.backbone(features_input, **kwargs)
+        audio_output = self.head(x)
+        return audio_output
+    def decode_from_codes(self, codes: mx.array, **kwargs: Any) -> mx.array:
+        features = self.feature_extractor.get_features_from_codes(codes)
+        audio_output = self.decode(features, **kwargs)
+        return audio_output

nexaai/binds/metal/py-lib/mlx_audio/codec/tests/__init__.py ADDED Viewed

File without changes

nexaai/binds/metal/py-lib/mlx_audio/codec/tests/test_bigvgan.py ADDED Viewed

@@ -0,0 +1,54 @@
+import math
+import unittest
+import mlx.core as mx
+from mlx_audio.codec.models.bigvgan.bigvgan import BigVGAN, BigVGANConfig
+class TestBigVGAN(unittest.TestCase):
+    def test_bigvgan_22khz_80bands(self):
+        cfg = BigVGANConfig(
+            num_mels=80,
+            upsample_rates=[4, 4, 2, 2, 2, 2],
+            upsample_kernel_sizes=[8, 8, 4, 4, 4, 4],
+            upsample_initial_channel=1536,
+            resblock="1",
+            resblock_kernel_sizes=[3, 7, 11],
+            resblock_dilation_sizes=[[1, 3, 5], [1, 3, 5], [1, 3, 5]],
+            activation="snakebeta",
+            snake_logscale=True,
+            use_bias_at_final=True,
+            use_tanh_at_final=True,
+        )
+        model = BigVGAN(cfg)
+        audio = mx.zeros((1, 80, 800))
+        y = model(audio)
+        self.assertEqual(y.shape, (1, 1, 800 * math.prod(cfg.upsample_rates)))
+    def test_bigvgan_44khz_128bands_512x(self):
+        cfg = BigVGANConfig(
+            num_mels=128,
+            upsample_rates=[8, 4, 2, 2, 2, 2],
+            upsample_kernel_sizes=[16, 8, 4, 4, 4, 4],
+            upsample_initial_channel=1536,
+            resblock="1",
+            resblock_kernel_sizes=[3, 7, 11],
+            resblock_dilation_sizes=[[1, 3, 5], [1, 3, 5], [1, 3, 5]],
+            activation="snakebeta",
+            snake_logscale=True,
+            use_bias_at_final=False,
+            use_tanh_at_final=False,
+        )
+        model = BigVGAN(cfg)
+        audio = mx.zeros((1, 128, 800))
+        y = model(audio)
+        self.assertEqual(y.shape, (1, 1, 800 * math.prod(cfg.upsample_rates)))
+if __name__ == "__main__":
+    unittest.main()

nexaai/binds/metal/py-lib/mlx_audio/codec/tests/test_descript.py ADDED Viewed

@@ -0,0 +1,109 @@
+import unittest
+import mlx.core as mx
+from ..models.descript import DAC
+class TestDescript(unittest.TestCase):
+    """Test Descript model encoding and decoding."""
+    def test_descript_16khz(self):
+        audio = mx.zeros((1, 1, 80_000))
+        encoder_dim = 64
+        encoder_rates = [2, 4, 5, 8]
+        decoder_dim = 1536
+        decoder_rates = [8, 5, 4, 2]
+        n_codebooks = 12
+        codebook_size = 1024
+        codebook_dim = 8
+        sample_rate = 16_000
+        model = DAC(
+            encoder_dim=encoder_dim,
+            encoder_rates=encoder_rates,
+            decoder_dim=decoder_dim,
+            decoder_rates=decoder_rates,
+            n_codebooks=n_codebooks,
+            codebook_size=codebook_size,
+            codebook_dim=codebook_dim,
+            sample_rate=sample_rate,
+        )
+        x = model.preprocess(audio, sample_rate)
+        z, codes, latents, _, _ = model.encode(x)
+        self.assertEqual(z.shape, (1, 1024, 250))
+        self.assertEqual(codes.shape, (1, 12, 250))
+        self.assertEqual(latents.shape, (1, 96, 250))
+        y = model.decode(z).squeeze(-1)
+        self.assertEqual(y.shape, (1, 80_043))
+    def test_descript_24khz(self):
+        audio = mx.zeros((1, 1, 120_000))
+        encoder_dim = 64
+        encoder_rates = [2, 4, 5, 8]
+        decoder_dim = 1536
+        decoder_rates = [8, 5, 4, 2]
+        n_codebooks = 32
+        codebook_size = 1024
+        codebook_dim = 8
+        sample_rate = 24_000
+        model = DAC(
+            encoder_dim=encoder_dim,
+            encoder_rates=encoder_rates,
+            decoder_dim=decoder_dim,
+            decoder_rates=decoder_rates,
+            n_codebooks=n_codebooks,
+            codebook_size=codebook_size,
+            codebook_dim=codebook_dim,
+            sample_rate=sample_rate,
+        )
+        x = model.preprocess(audio, sample_rate)
+        z, codes, latents, _, _ = model.encode(x)
+        self.assertEqual(z.shape, (1, 1024, 375))
+        self.assertEqual(codes.shape, (1, 32, 375))
+        self.assertEqual(latents.shape, (1, 256, 375))
+        y = model.decode(z).squeeze(-1)
+        self.assertEqual(y.shape, (1, 120_043))
+    def test_descript_44khz(self):
+        audio = mx.zeros((1, 1, 220_000))
+        encoder_dim = 64
+        encoder_rates = [2, 4, 8, 8]
+        decoder_dim = 1536
+        decoder_rates = [8, 8, 4, 2]
+        n_codebooks = 9
+        codebook_size = 1024
+        codebook_dim = 8
+        sample_rate = 44_100
+        model = DAC(
+            encoder_dim=encoder_dim,
+            encoder_rates=encoder_rates,
+            decoder_dim=decoder_dim,
+            decoder_rates=decoder_rates,
+            n_codebooks=n_codebooks,
+            codebook_size=codebook_size,
+            codebook_dim=codebook_dim,
+            sample_rate=sample_rate,
+        )
+        x = model.preprocess(audio, sample_rate)
+        z, codes, latents, _, _ = model.encode(x)
+        self.assertEqual(z.shape, (1, 1024, 430))
+        self.assertEqual(codes.shape, (1, 9, 430))
+        self.assertEqual(latents.shape, (1, 72, 430))
+        y = model.decode(z).squeeze(-1)
+        self.assertEqual(y.shape, (1, 220_235))
+if __name__ == "__main__":
+    unittest.main()

nexaai/binds/metal/py-lib/mlx_audio/codec/tests/test_encodec.py ADDED Viewed

@@ -0,0 +1,58 @@
+import unittest
+import mlx.core as mx
+from ..models.encodec import Encodec, EncodecConfig
+config = EncodecConfig(  # module-level configuration; TestEncodec builds its model with Encodec(config)
+    audio_channels=1,  # presumably matches the trailing 1 of the (1, 120_000, 1) test input; confirm
+    chunk_length_s=None,
+    codebook_dim=128,
+    codebook_size=1024,
+    compress=2,
+    dilation_growth_rate=2,
+    hidden_size=128,
+    kernel_size=7,
+    last_kernel_size=7,
+    model_type="encodec",
+    norm_type="weight_norm",
+    normalize=False,
+    num_filters=32,
+    num_lstm_layers=2,
+    num_residual_layers=1,
+    overlap=None,
+    pad_mode="reflect",
+    residual_kernel_size=3,
+    sampling_rate=24000,  # the test method using this config is named test_encodec_24khz
+    target_bandwidths=[1.5, 3.0, 6.0, 12.0, 24.0],  # the test encodes at the default bandwidth and at bandwidth=6
+    trim_right_ratio=1.0,
+    upsampling_ratios=[8, 5, 4, 2],
+    use_causal_conv=True,
+)
+class TestEncodec(unittest.TestCase):
+    """Test EnCodec model encoding and decoding."""
+    def test_encodec_24khz(self):
+        model = Encodec(config)
+        audio = mx.zeros((1, 120_000, 1))
+        # default bandwidth
+        (codes, scales) = model.encode(audio)
+        self.assertEqual(codes.shape, (1, 1, 2, 375))
+        audio_out = model.decode(codes, scales)
+        self.assertEqual(audio_out.shape, (1, 120_000, 1))
+        # 6kbps bandwidth
+        (codes, scales) = model.encode(audio, bandwidth=6)
+        self.assertEqual(codes.shape, (1, 1, 8, 375))
+        audio_out = model.decode(codes, scales)
+        self.assertEqual(audio_out.shape, (1, 120_000, 1))
+if __name__ == "__main__":
+    unittest.main()

nexaai/binds/metal/py-lib/mlx_audio/codec/tests/test_mimi.py ADDED Viewed

@@ -0,0 +1,22 @@
+import unittest
+import mlx.core as mx
+from ..models.mimi.mimi import Mimi, mimi_202407
+class TestMimi(unittest.TestCase):
+    def test_mimi_model(self):
+        """Test Mimi model encoding and decoding."""
+        model = Mimi(mimi_202407(32))
+        audio = mx.zeros((1, 1, 120_000))
+        codes = model.encode(audio)
+        self.assertEqual(codes.shape, (1, 32, 63))
+        audio_out = model.decode(codes)
+        self.assertEqual(audio_out.shape, (1, 1, 120_960))
+if __name__ == "__main__":
+    unittest.main()

nexaai/binds/metal/py-lib/mlx_audio/codec/tests/test_s3.py ADDED Viewed

@@ -0,0 +1,25 @@
+import unittest
+import mlx.core as mx
+from ..models.s3 import S3TokenizerV2
+from ..models.s3.utils import log_mel_spectrogram
+class TestS3TokenizerV2(unittest.TestCase):
+    """Test S3TokenizerV2 model encoding and decoding."""
+    def test_s3_tokenizer_v2(self):
+        audio = mx.zeros((160_000))
+        mel = log_mel_spectrogram(audio)
+        model = S3TokenizerV2("speech_tokenizer_v2_25hz")
+        mel_batch = mel[None, ...]  # (1, n_mels, T)
+        mel_len = mx.array([mel.shape[1]], dtype=mx.int32)
+        codes, code_lens = model(mel_batch, mel_len)
+        self.assertEqual(codes.shape, (1, 251))
+        codes = codes[0, : code_lens[0].item()]
+        self.assertEqual(codes.shape, (251,))

nexaai/binds/metal/py-lib/mlx_audio/codec/tests/test_snac.py ADDED Viewed

@@ -0,0 +1,40 @@
+import unittest
+import mlx.core as mx
+from ..models.snac import SNAC
+config = {  # keyword arguments for the model under test; TestSNAC calls SNAC(**config)
+    "sampling_rate": 24000,
+    "encoder_dim": 48,
+    "encoder_rates": [2, 4, 8, 8],
+    "decoder_dim": 1024,
+    "decoder_rates": [8, 8, 4, 2],
+    "attn_window_size": None,
+    "codebook_size": 4096,
+    "codebook_dim": 8,
+    "vq_strides": [4, 2, 1],  # three entries; the test expects three code sequences (presumably one per stride; confirm)
+    "noise": True,
+    "depthwise": True,
+}
+class TestSNAC(unittest.TestCase):
+    """Test SNAC model encoding and decoding."""
+    def test_snac(self):
+        audio = mx.zeros((1, 1, 120_000))
+        model = SNAC(**config)
+        codes = model.encode(audio)
+        self.assertEqual(len(codes), 3)
+        self.assertEqual(codes[0].shape, (1, 59))
+        self.assertEqual(codes[1].shape, (1, 118))
+        self.assertEqual(codes[2].shape, (1, 236))
+        reconstructed = model.decode(codes).squeeze(-1)
+        self.assertEqual(reconstructed.shape, (1, 120_907))
+if __name__ == "__main__":
+    unittest.main()