PyPI - nexaai - Versions diffs - 1.0.29__cp310-cp310-macosx_14_0_universal2.whl - Mend

nexaai 1.0.29__cp310-cp310-macosx_14_0_universal2.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (580) hide show

nexaai/mlx_backend/mlx_audio/codec/models/descript/base.py ADDED Viewed

@@ -0,0 +1,228 @@
+import math
+from dataclasses import dataclass
+from pathlib import Path
+from typing import Union
+import mlx.core as mx
+import mlx.nn as nn
+import numpy as np
+import soundfile as sf
+from einops.array_api import rearrange
+# File-format versions accepted by DACFile.load; DACFile.save stamps the last entry.
+SUPPORTED_VERSIONS = ["1.0.0"]
+@dataclass
+class DACFile:
+    codes: mx.array
+    # Metadata
+    chunk_length: int
+    original_length: int
+    input_db: float
+    channels: int
+    sample_rate: int
+    padding: bool
+    dac_version: str
+    def save(self, path):
+        artifacts = {
+            "codes": np.array(self.codes).astype(np.uint16),
+            "metadata": {
+                "input_db": self.input_db,
+                "original_length": self.original_length,
+                "sample_rate": self.sample_rate,
+                "chunk_length": self.chunk_length,
+                "channels": self.channels,
+                "padding": self.padding,
+                "dac_version": SUPPORTED_VERSIONS[-1],
+            },
+        }
+        path = Path(path).with_suffix(".dac")
+        with open(path, "wb") as f:
+            np.save(f, artifacts)
+        return path
+    @classmethod
+    def load(cls, path):
+        artifacts = np.load(path, allow_pickle=True)[()]
+        codes = mx.array(artifacts["codes"], dtype=mx.int32)
+        if artifacts["metadata"].get("dac_version", None) not in SUPPORTED_VERSIONS:
+            raise RuntimeError(
+                f"Given file {path} can't be loaded with this version of descript-audio-codec."
+            )
+        return cls(codes=codes, **artifacts["metadata"])
+class CodecMixin:
+    @property
+    def padding(self):
+        if not hasattr(self, "_padding"):
+            self._padding = True
+        return self._padding
+    @padding.setter
+    def padding(self, value):
+        assert isinstance(value, bool)
+        layers = [
+            layer
+            for layer in self.modules()
+            if isinstance(layer, (nn.Conv1d, nn.ConvTranspose1d))
+        ]
+        for layer in layers:
+            if value:
+                if hasattr(layer, "original_padding"):
+                    layer.padding = layer.original_padding
+            else:
+                layer.original_padding = layer.padding
+                layer.padding = tuple(0 for _ in range(len(layer.padding)))
+        self._padding = value
+    def get_delay(self):
+        l_out = self.get_output_length(0)
+        L = l_out
+        layers = []
+        for layer in self.modules():
+            if isinstance(layer, (nn.Conv1d, nn.ConvTranspose1d)):
+                layers.append(layer)
+        for layer in reversed(layers):
+            d = layer.dilation
+            k = layer.weight.shape[1]
+            s = layer.stride
+            if isinstance(layer, nn.ConvTranspose1d):
+                L = ((L - d * (k - 1) - 1) / s) + 1
+            elif isinstance(layer, nn.Conv1d):
+                L = (L - 1) * s + d * (k - 1) + 1
+            L = math.ceil(L)
+        l_in = L
+        return (l_in - l_out) // 2
+    def get_output_length(self, input_length):
+        L = input_length
+        for layer in self.modules():
+            if isinstance(layer, (nn.Conv1d, nn.ConvTranspose1d)):
+                d = layer.dilation
+                k = layer.weight.shape[1]
+                s = layer.stride
+                if isinstance(layer, nn.Conv1d):
+                    L = ((L - d * (k - 1) - 1) / s) + 1
+                elif isinstance(layer, nn.ConvTranspose1d):
+                    L = (L - 1) * s + d * (k - 1) + 1
+                L = math.floor(L)
+        return L
+    def compress(
+        self,
+        audio_path: Union[str, Path],
+        win_duration: float = 1.0,
+        normalize_db: float = -16,
+        n_quantizers: int = None,
+    ) -> DACFile:
+        audio_signal, original_sr = sf.read(audio_path)
+        signal_duration = audio_signal.shape[-1] / original_sr
+        original_padding = self.padding
+        if original_sr != self.sample_rate:
+            raise ValueError(
+                f"Sample rate of the audio signal ({original_sr}) does not match the sample rate of the model ({self.sample_rate})."
+            )
+        audio_data = mx.array(audio_signal)
+        rms = mx.sqrt(mx.mean(mx.power(audio_data, 2), axis=-1) + 1e-12)
+        input_db = 20 * mx.log10(rms / 1.0 + 1e-12)
+        if normalize_db is not None:
+            audio_data = audio_data * mx.power(10, (normalize_db - input_db) / 20)
+        audio_data = rearrange(audio_data, "n -> 1 1 n")
+        nb, nac, nt = audio_data.shape
+        audio_data = rearrange(audio_data, "nb nac nt -> (nb nac) 1 nt")
+        win_duration = signal_duration if win_duration is None else win_duration
+        if signal_duration <= win_duration:
+            self.padding = True
+            n_samples = nt
+            hop = nt
+        else:
+            self.padding = False
+            audio_data = mx.pad(audio_data, [(0, 0), (0, 0), (self.delay, self.delay)])
+            n_samples = int(win_duration * self.sample_rate)
+            n_samples = int(math.ceil(n_samples / self.hop_length) * self.hop_length)
+            hop = self.get_output_length(n_samples)
+        codes = []
+        for i in range(0, nt, hop):
+            x = audio_data[..., i : i + n_samples]
+            x = mx.pad(x, [(0, 0), (0, 0), (0, max(0, n_samples - x.shape[-1]))])
+            x = self.preprocess(x, self.sample_rate)
+            _, c, _, _, _ = self.encode(x, n_quantizers)
+            codes.append(c)
+            chunk_length = c.shape[-1]
+        codes = mx.concatenate(codes, axis=-1)
+        dac_file = DACFile(
+            codes=codes,
+            chunk_length=chunk_length,
+            original_length=signal_duration,
+            input_db=input_db,
+            channels=nac,
+            sample_rate=original_sr,
+            padding=self.padding,
+            dac_version=SUPPORTED_VERSIONS[-1],
+        )
+        if n_quantizers is not None:
+            codes = codes[:, :n_quantizers, :]
+        self.padding = original_padding
+        return dac_file
+    def decompress(self, obj: Union[str, Path, DACFile]) -> mx.array:
+        if isinstance(obj, (str, Path)):
+            obj = DACFile.load(obj)
+        if self.sample_rate != obj.sample_rate:
+            raise ValueError(
+                f"Sample rate of the audio signal ({obj.sample_rate}) does not match the sample rate of the model ({self.sample_rate})."
+            )
+        original_padding = self.padding
+        self.padding = obj.padding
+        codes = obj.codes
+        chunk_length = obj.chunk_length
+        recons = []
+        for i in range(0, codes.shape[-1], chunk_length):
+            c = codes[..., i : i + chunk_length]
+            z = self.quantizer.from_codes(c)[0]
+            r = self.decode(z)
+            recons.append(r)
+        recons = mx.concatenate(recons, axis=1)
+        recons = rearrange(recons, "1 n 1 -> 1 n")
+        target_db = obj.input_db
+        normalize_db = -16
+        if normalize_db is not None:
+            recons = recons * mx.power(10, (target_db - normalize_db) / 20)
+        self.padding = original_padding
+        return recons

nexaai/mlx_backend/mlx_audio/codec/models/descript/dac.py ADDED Viewed

@@ -0,0 +1,285 @@
+import json
+import math
+from pathlib import Path
+from typing import List, Literal, Union
+import mlx.core as mx
+import mlx.nn as nn
+import numpy as np
+from huggingface_hub import snapshot_download
+from .base import CodecMixin
+from .nn.layers import Snake1d, WNConv1d, WNConvTranspose1d
+from .nn.quantize import ResidualVectorQuantize
+class ResidualUnit(nn.Module):
+    def __init__(self, dim: int = 16, dilation: int = 1):
+        super().__init__()
+        pad = ((7 - 1) * dilation) // 2
+        self.block = nn.Sequential(
+            Snake1d(dim),
+            WNConv1d(dim, dim, kernel_size=7, dilation=dilation, padding=pad),
+            Snake1d(dim),
+            WNConv1d(dim, dim, kernel_size=1),
+        )
+    def __call__(self, x):
+        y = self.block(x)
+        pad = (x.shape[-1] - y.shape[-1]) // 2
+        if pad > 0:
+            x = x[..., pad:-pad]
+        return x + y
+class EncoderBlock(nn.Module):
+    def __init__(self, dim: int = 16, stride: int = 1):
+        super().__init__()
+        self.block = nn.Sequential(
+            ResidualUnit(dim // 2, dilation=1),
+            ResidualUnit(dim // 2, dilation=3),
+            ResidualUnit(dim // 2, dilation=9),
+            Snake1d(dim // 2),
+            WNConv1d(
+                dim // 2,
+                dim,
+                kernel_size=2 * stride,
+                stride=stride,
+                padding=math.ceil(stride / 2),
+            ),
+        )
+    def __call__(self, x):
+        return self.block(x)
+class Encoder(nn.Module):
+    def __init__(
+        self,
+        d_model: int = 64,
+        strides: list = [2, 4, 8, 8],
+        d_latent: int = 64,
+    ):
+        super().__init__()
+        self.block = [WNConv1d(1, d_model, kernel_size=7, padding=3)]
+        for stride in strides:
+            d_model *= 2
+            self.block += [EncoderBlock(d_model, stride=stride)]
+        self.block += [
+            Snake1d(d_model),
+            WNConv1d(d_model, d_latent, kernel_size=3, padding=1),
+        ]
+        self.block = nn.Sequential(*self.block)
+        self.enc_dim = d_model
+    def __call__(self, x):
+        return self.block(x).moveaxis(1, 2)
+class DecoderBlock(nn.Module):
+    def __init__(self, input_dim: int = 16, output_dim: int = 8, stride: int = 1):
+        super().__init__()
+        self.block = nn.Sequential(
+            Snake1d(input_dim),
+            WNConvTranspose1d(
+                input_dim,
+                output_dim,
+                kernel_size=2 * stride,
+                stride=stride,
+                padding=math.ceil(stride / 2),
+            ),
+            ResidualUnit(output_dim, dilation=1),
+            ResidualUnit(output_dim, dilation=3),
+            ResidualUnit(output_dim, dilation=9),
+        )
+    def __call__(self, x):
+        return self.block(x)
+class Decoder(nn.Module):
+    def __init__(
+        self,
+        input_channel,
+        channels,
+        rates,
+        d_out: int = 1,
+    ):
+        super().__init__()
+        layers = [WNConv1d(input_channel, channels, kernel_size=7, padding=3)]
+        for i, stride in enumerate(rates):
+            input_dim = channels // 2**i
+            output_dim = channels // 2 ** (i + 1)
+            layers += [DecoderBlock(input_dim, output_dim, stride)]
+        layers += [
+            Snake1d(output_dim),
+            WNConv1d(output_dim, d_out, kernel_size=7, padding=3),
+            nn.Tanh(),
+        ]
+        self.model = nn.Sequential(*layers)
+    def __call__(self, x):
+        return self.model(x)
+class DAC(nn.Module, CodecMixin):
+    def __init__(
+        self,
+        encoder_dim: int = 64,
+        encoder_rates: List[int] = [2, 4, 5, 8],
+        latent_dim: int = None,
+        decoder_dim: int = 1536,
+        decoder_rates: List[int] = [8, 5, 4, 2],
+        n_codebooks: int = 32,
+        codebook_size: int = 1024,
+        codebook_dim: Union[int, list] = 8,
+        sample_rate: int = 44100,
+        **kwargs,
+    ):
+        super().__init__()
+        self.encoder_dim = encoder_dim
+        self.encoder_rates = encoder_rates
+        self.decoder_dim = decoder_dim
+        self.decoder_rates = decoder_rates
+        self.sample_rate = sample_rate
+        if latent_dim is None:
+            latent_dim = encoder_dim * (2 ** len(encoder_rates))
+        self.latent_dim = latent_dim
+        self.hop_length = np.prod(encoder_rates)
+        self.encoder = Encoder(encoder_dim, encoder_rates, latent_dim)
+        self.n_codebooks = n_codebooks
+        self.codebook_size = codebook_size
+        self.codebook_dim = codebook_dim
+        self.quantizer = ResidualVectorQuantize(
+            input_dim=latent_dim,
+            n_codebooks=n_codebooks,
+            codebook_size=codebook_size,
+            codebook_dim=codebook_dim,
+        )
+        self.decoder = Decoder(
+            latent_dim,
+            decoder_dim,
+            decoder_rates,
+        )
+        self.sample_rate = sample_rate
+        self.delay = self.get_delay()
+    def preprocess(self, audio_data, sample_rate):
+        if sample_rate is None:
+            sample_rate = self.sample_rate
+        assert sample_rate == self.sample_rate
+        length = audio_data.shape[-1]
+        right_pad = math.ceil(length / self.hop_length) * self.hop_length - length
+        audio_data = mx.pad(audio_data, [(0, 0), (0, 0), (0, right_pad)])
+        return audio_data
+    def encode(
+        self,
+        audio_data: mx.array,
+        n_quantizers: int = None,
+    ):
+        z = self.encoder(audio_data.moveaxis(1, 2))
+        z, codes, latents, commitment_loss, codebook_loss = self.quantizer(
+            z, n_quantizers
+        )
+        return z, codes, latents, commitment_loss, codebook_loss
+    def decode(self, z: mx.array):
+        return self.decoder(z.moveaxis(1, 2))
+    def _extra_repr(self):
+        return (
+            f"encoder_dim={self.encoder_dim}, "
+            f"encoder_rates={self.encoder_rates}, "
+            f"latent_dim={self.latent_dim}, "
+            f"decoder_dim={self.decoder_dim}, "
+            f"decoder_rates={self.decoder_rates}, "
+            f"n_codebooks={self.n_codebooks}, "
+            f"codebook_size={self.codebook_size}, "
+            f"codebook_dim={self.codebook_dim}"
+        )
+    def __call__(
+        self,
+        audio_data: mx.array,
+        sample_rate: int = None,
+        n_quantizers: int = None,
+        use_rvq: bool = True,
+        return_loss: bool = False,
+    ):
+        length = audio_data.shape[-1]
+        audio_data = self.preprocess(audio_data, sample_rate)
+        if use_rvq:
+            z, codes, latents, commitment_loss, codebook_loss = self.encode(
+                audio_data, n_quantizers
+            )
+        else:
+            z = self.encoder(audio_data.moveaxis(1, 2))
+        x = self.decode(z)
+        if return_loss:
+            return mx.losses.mse(x, audio_data)
+        return {
+            "audio": x[..., :length],
+            "z": z,
+            "codes": codes,
+            "latents": latents,
+            "vq/commitment_loss": commitment_loss,
+            "vq/codebook_loss": codebook_loss,
+        }
+    @classmethod
+    def from_pretrained(
+        cls,
+        repo_id: str,
+    ) -> "DAC":
+        path = fetch_from_hub(repo_id)
+        if path is None:
+            raise ValueError(f"Could not find model {path}")
+        model_path = path / "model.safetensors"
+        config_path = path / "config.json"
+        with open(config_path) as f:
+            config = json.load(f)
+        dac = DAC(**config)
+        weights = mx.load(model_path.as_posix(), format="safetensors")
+        dac.load_weights(list(weights.items()))
+        mx.eval(dac.parameters())
+        return dac
+# fetch model from hub
+def fetch_from_hub(hf_repo: str) -> Path:
+    model_path = Path(
+        snapshot_download(
+            repo_id=hf_repo,
+            allow_patterns=["*.safetensors", "*.json"],
+        )
+    )
+    return model_path

nexaai/mlx_backend/mlx_audio/codec/models/descript/nn/__init__.py ADDED Viewed

	@@ -0,0 +1 @@
1	+ from . import layers, quantize

nexaai/mlx_backend/mlx_audio/codec/models/descript/nn/layers.py ADDED Viewed

@@ -0,0 +1,129 @@
+import math
+import mlx.core as mx
+import mlx.nn as nn
+def normalize_weight(x, except_dim=0):
+    if x.ndim != 3:
+        raise ValueError("Input tensor must have 3 dimensions")
+    axes = tuple(i for i in range(x.ndim) if i != except_dim)
+    return mx.sqrt(mx.sum(mx.power(x, 2), axis=axes, keepdims=True))
+class WNConv1d(nn.Module):
+    def __init__(
+        self,
+        in_channels: int,
+        out_channels: int,
+        kernel_size: int,
+        stride: int = 1,
+        padding: int = 0,
+        dilation: int = 1,
+        bias: bool = True,
+        groups: int = 1,
+    ):
+        super().__init__()
+        if bias:
+            self.bias = mx.zeros((out_channels,))
+        self.kernel_size = kernel_size
+        self.padding = padding
+        self.dilation = dilation
+        self.stride = stride
+        self.groups = groups
+        scale = math.sqrt(1 / (in_channels * kernel_size))
+        weight_init = mx.random.uniform(
+            low=-scale,
+            high=scale,
+            shape=(out_channels, kernel_size, in_channels),
+        )
+        self.weight_g = normalize_weight(weight_init)
+        self.weight_v = weight_init / (self.weight_g + 1e-12)
+    def _extra_repr(self):
+        return (
+            f"in_channels={self.weight_v.shape[2]}, out_channels={self.weight_v.shape[0]}, "
+            f"kernel_size={self.kernel_size}, stride={self.stride}, "
+            f"padding={self.padding}, dilation={self.dilation}, "
+            f"bias={'bias' in self}"
+        )
+    def __call__(self, x):
+        weight = self.weight_g * self.weight_v / normalize_weight(self.weight_v)
+        y = mx.conv1d(x, weight, self.stride, self.padding, self.dilation, self.groups)
+        if "bias" in self:
+            y = y + self.bias
+        return y
+class WNConvTranspose1d(nn.Module):
+    def __init__(
+        self,
+        in_channels: int,
+        out_channels: int,
+        kernel_size: int,
+        stride: int = 1,
+        padding: int = 0,
+        dilation: int = 1,
+        groups: int = 1,
+        bias: bool = True,
+    ):
+        super().__init__()
+        self.bias = mx.zeros((out_channels,)) if bias else None
+        self.kernel_size = kernel_size
+        self.padding = padding
+        self.dilation = dilation
+        self.stride = stride
+        self.groups = groups
+        scale = math.sqrt(1 / (in_channels * kernel_size))
+        weight_init = mx.random.uniform(
+            low=-scale,
+            high=scale,
+            shape=(out_channels, kernel_size, in_channels // groups),
+        )
+        self.weight_g = normalize_weight(weight_init, except_dim=2)
+        self.weight_v = weight_init / (self.weight_g + 1e-12)
+    def _extra_repr(self):
+        return (
+            f"in_channels={self.weight_v.shape[2] * self.groups}, out_channels={self.weight_v.shape[0]}, "
+            f"kernel_size={self.kernel_size}, stride={self.stride}, "
+            f"padding={self.padding}, dilation={self.dilation}, "
+            f"groups={self.groups}, bias={'bias' in self}"
+        )
+    def __call__(self, x):
+        weight = (
+            self.weight_g
+            * self.weight_v
+            / normalize_weight(self.weight_v, except_dim=2)
+        )
+        y = mx.conv_transpose1d(
+            x, weight, self.stride, self.padding, self.dilation, self.groups
+        )
+        if self.bias is not None:
+            y = y + self.bias
+        return y
+def snake(x, alpha):
+    recip = mx.reciprocal(alpha + 1e-9)
+    x = x + recip * mx.power(mx.sin(alpha * x), 2)
+    return x
+class Snake1d(nn.Module):
+    def __init__(self, channels):
+        super().__init__()
+        self.alpha = mx.ones((1, 1, channels))
+    def __call__(self, x):
+        x = snake(x, self.alpha)
+        return x