PyPI - nexaai - Versions diffs - 1.0.29__cp310-cp310-macosx_14_0_universal2.whl - Mend

nexaai 1.0.29__cp310-cp310-macosx_14_0_universal2.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (580) hide show

nexaai/mlx_backend/mlx_audio/codec/models/mimi/__init__.py ADDED Viewed

	@@ -0,0 +1 @@
1	+ from .mimi import Mimi, MimiStreamingDecoder

nexaai/mlx_backend/mlx_audio/codec/models/mimi/mimi.py ADDED Viewed

@@ -0,0 +1,286 @@
+# Copyright (c) Kyutai, all rights reserved.
+# This source code is licensed under the license found in the
+# LICENSE file in the root directory of this source tree.
+import math
+from dataclasses import dataclass
+import mlx.core as mx
+import mlx.nn as nn
+from huggingface_hub import hf_hub_download
+from .modules import (
+    ConvDownsample1d,
+    ConvTrUpsample1d,
+    ProjectedTransformer,
+    SeanetConfig,
+    SeanetDecoder,
+    SeanetEncoder,
+    SplitResidualVectorQuantizer,
+    TransformerConfig,
+)
+@dataclass
+class MimiConfig:
+    """Settings consumed by ``Mimi.__init__`` to assemble the codec."""
+    # Not read by the code visible in this file; presumably the number of
+    # audio channels -- TODO confirm against the callers.
+    channels: int
+    # Divided by the product of ``seanet.ratios`` to obtain the encoder frame rate.
+    sample_rate: float
+    # The encoder frame rate divided by this value gives the stride of the
+    # extra downsample / upsample convolutions.
+    frame_rate: float
+    # Not read by the code visible in this file -- TODO confirm intended use.
+    renormalize: bool
+    # Passed to both ``SeanetEncoder`` and ``SeanetDecoder``; its ``dimension``
+    # is used as the width of the transformers, quantizer I/O and resamplers.
+    seanet: SeanetConfig
+    # Shared by the encoder-side and decoder-side ``ProjectedTransformer``.
+    transformer: TransformerConfig
+    # Forwarded to ``SplitResidualVectorQuantizer`` as ``nq``.
+    quantizer_nq: int
+    # Forwarded to ``SplitResidualVectorQuantizer`` as ``bins``.
+    quantizer_bins: int
+    # Forwarded to ``SplitResidualVectorQuantizer`` as ``dim``.
+    quantizer_dim: int
+def mimi_202407(num_codebooks: int) -> MimiConfig:
+    """Return the fixed ``mimi_202407`` preset with ``num_codebooks`` quantizer levels.
+    Only ``quantizer_nq`` depends on the argument; every other value is hard-coded.
+    """
+    # Causal SEANet; its ``dimension`` is reused below as the transformer width.
+    seanet_cfg = SeanetConfig(
+        dimension=512,
+        channels=1,
+        causal=True,
+        nfilters=64,
+        nresidual_layers=1,
+        ratios=[8, 6, 5, 4],
+        ksize=7,
+        residual_ksize=3,
+        last_ksize=3,
+        dilation_base=2,
+        pad_mode="constant",
+        true_skip=True,
+        compress=2,
+    )
+    transformer_cfg = TransformerConfig(
+        d_model=seanet_cfg.dimension,
+        num_heads=8,
+        num_layers=8,
+        causal=True,
+        norm_first=True,
+        bias_ff=False,
+        bias_attn=False,
+        layer_scale=0.01,
+        positional_embedding="rope",
+        use_conv_bias=True,
+        gating=False,
+        norm="layer_norm",
+        context=250,
+        max_period=10000,
+        max_seq_len=8192,
+        kv_repeat=1,
+        dim_feedforward=2048,
+        conv_layout=True,
+        use_conv_block=False,
+        cross_attention=False,
+        conv_kernel_size=3,
+    )
+    preset = MimiConfig(
+        channels=1,
+        sample_rate=24000,
+        frame_rate=12.5,
+        renormalize=True,
+        seanet=seanet_cfg,
+        transformer=transformer_cfg,
+        quantizer_nq=num_codebooks,
+        quantizer_bins=2048,
+        quantizer_dim=256,
+    )
+    return preset
+class Mimi(nn.Module):
+    """Mimi audio codec assembled from a SEANet encoder/decoder, two transformers,
+    a strided down/upsampling pair and a split residual vector quantizer.
+    ``encode`` / ``decode`` process a whole input in one call and reset the state
+    they use first. ``encode_step`` / ``decode_step`` are the streaming variants:
+    they keep state between calls until ``reset_state`` is invoked.
+    """
+    def __init__(self, cfg: MimiConfig):
+        super().__init__()
+        dim = cfg.seanet.dimension
+        self.cfg = cfg
+        # Sample rate divided by the product of the SEANet ratios, then divided by
+        # the target frame rate, gives the stride of the extra resampling convs.
+        encoder_frame_rate = cfg.sample_rate / math.prod(cfg.seanet.ratios)
+        downsample_stride = int(encoder_frame_rate / cfg.frame_rate)
+        self.encoder = SeanetEncoder(cfg.seanet)
+        self.decoder = SeanetDecoder(cfg.seanet)
+        self.quantizer = SplitResidualVectorQuantizer(
+            dim=cfg.quantizer_dim,
+            input_dim=dim,
+            output_dim=dim,
+            nq=cfg.quantizer_nq,
+            bins=cfg.quantizer_bins,
+        )
+        self.encoder_transformer = ProjectedTransformer(
+            cfg.transformer,
+            input_dim=dim,
+            output_dims=[dim],
+        )
+        self.decoder_transformer = ProjectedTransformer(
+            cfg.transformer,
+            input_dim=dim,
+            output_dims=[dim],
+        )
+        self.downsample = ConvDownsample1d(
+            stride=downsample_stride,
+            dim=dim,
+            causal=True,
+        )
+        self.upsample = ConvTrUpsample1d(
+            stride=downsample_stride,
+            dim=dim,
+            causal=True,
+        )
+        self.encoder_cache = self.encoder_transformer.make_cache()
+        self.decoder_cache = self.decoder_transformer.make_cache()
+    def reset_state(self):
+        """Clear every piece of streaming state on both the encode and decode side."""
+        self.encoder.reset_state()
+        self.decoder.reset_state()
+        # The resampling convs buffer samples between ``*_step`` calls as well;
+        # without these resets a new stream would start from stale leftovers.
+        self.downsample.reset_state()
+        self.upsample.reset_state()
+        for c in self.decoder_cache:
+            c.reset()
+        for c in self.encoder_cache:
+            c.reset()
+    def encode(self, xs: mx.array) -> mx.array:
+        """Encode a whole input in one shot and return the quantizer codes.
+        The encoder state and the encoder transformer cache are reset first.
+        """
+        self.encoder.reset_state()
+        for c in self.encoder_cache:
+            c.reset()
+        xs = self.encoder(xs)
+        xs = self.encoder_transformer(xs, cache=self.encoder_cache)[0]
+        xs = self.downsample(xs)
+        return self.quantizer.encode(xs)
+    def decode(self, xs: mx.array) -> mx.array:
+        """Decode a whole sequence of codes in one shot.
+        The decoder state and the decoder transformer cache are reset first.
+        """
+        self.decoder.reset_state()
+        for c in self.decoder_cache:
+            c.reset()
+        xs = self.quantizer.decode(xs)
+        xs = self.upsample(xs)
+        xs = self.decoder_transformer(xs, cache=self.decoder_cache)[0]
+        return self.decoder(xs)
+    def encode_step(self, xs: mx.array) -> mx.array:
+        """Streaming encode: process one chunk, keeping state for the next call."""
+        xs = self.encoder.step(xs)
+        xs = self.encoder_transformer(xs, cache=self.encoder_cache)[0]
+        xs = self.downsample.step(xs)
+        xs = self.quantizer.encode(xs)
+        return xs
+    def decode_step(self, xs: mx.array) -> mx.array:
+        """Streaming decode: process one chunk of codes, keeping state for the next call."""
+        xs = self.quantizer.decode(xs)
+        xs = self.upsample.step(xs)
+        xs = self.decoder_transformer(xs, cache=self.decoder_cache)[0]
+        xs = self.decoder.step(xs)
+        return xs
+    def warmup(self):
+        """Run one encode/decode round trip on zeros and force its evaluation."""
+        # 1920 equals sample_rate / frame_rate for the mimi_202407 preset
+        # (24000 / 12.5), so this input spans four frames under that preset.
+        pcm = mx.zeros((1, 1, 1920 * 4))
+        codes = self.encode(pcm)
+        pcm_out = self.decode(codes)
+        mx.eval(pcm_out)
+    def load_pytorch_weights(
+        self,
+        file: str,
+        strict: bool = True,
+    ) -> nn.Module:
+        """Load a checkpoint saved with PyTorch parameter names and layouts.
+        Every entry returned by ``mx.load(file)`` is renamed to the attribute path
+        used by this module, conv weights are permuted to the MLX layout, and the
+        result is handed to ``load_weights``.
+        Parameters
+        ----------
+        file:
+            Path of the checkpoint, passed to ``mx.load``.
+        strict:
+            Forwarded to ``load_weights``.
+        Returns
+        -------
+        nn.Module
+            Whatever ``load_weights`` returns.
+        """
+        weights = []
+        for k, v in mx.load(file).items():
+            # Strip a leading underscore from every dotted component of the name.
+            k = ".".join(s.removeprefix("_") for s in k.split("."))
+            if k.startswith("encoder.model."):
+                k = k.replace("encoder.model.", "encoder.")
+            if k.startswith("decoder.model."):
+                k = k.replace("decoder.model.", "decoder.")
+            if k.endswith(".in_proj_weight"):
+                k = k.replace(".in_proj_weight", ".in_proj.weight")
+            if k.endswith(".linear1.weight"):
+                k = k.replace(".linear1.weight", ".gating.linear1.weight")
+            if k.endswith(".linear2.weight"):
+                k = k.replace(".linear2.weight", ".gating.linear2.weight")
+            # Awfully hardcoded matching between the pytorch layers and their mlx equivalent :(
+            # The order of these replacements matters: the numbered names must be
+            # rewritten before the generic "decoder.0." / "encoder.0." rules below.
+            for layerIdx, decoderIdx in enumerate([2, 5, 8, 11]):
+                k = k.replace(
+                    f"decoder.{decoderIdx}.", f"decoder.layers.{layerIdx}.upsample."
+                )
+                k = k.replace(
+                    f"decoder.{decoderIdx + 1}.",
+                    f"decoder.layers.{layerIdx}.residuals.0.",
+                )
+            for layerIdx, encoderIdx in enumerate([1, 4, 7, 10]):
+                k = k.replace(
+                    f"encoder.{encoderIdx}.", f"encoder.layers.{layerIdx}.residuals.0."
+                )
+                k = k.replace(
+                    f"encoder.{encoderIdx + 2}.",
+                    f"encoder.layers.{layerIdx}.downsample.",
+                )
+            k = k.replace("decoder.0.", "decoder.init_conv1d.")
+            k = k.replace("decoder.14.", "decoder.final_conv1d.")
+            k = k.replace("encoder.0.", "encoder.init_conv1d.")
+            k = k.replace("encoder.14.", "encoder.final_conv1d.")
+            k = k.replace(".block.1.", ".block.0.")
+            k = k.replace(".block.3.", ".block.1.")
+            # PyTorch layout for conv weights is outC, inC, kSize, for MLX it's outC, kSize, inC
+            if k.endswith(
+                (".conv.weight", ".output_proj.weight", ".input_proj.weight")
+            ):
+                v = v.swapaxes(-1, -2)
+            # PyTorch layout for conv-transposed weights is inC, outC, kSize, for MLX it's outC, kSize, inC
+            if k.endswith(".convtr.weight"):
+                v = v.transpose(1, 2, 0)
+            weights.append((k, v))
+        return self.load_weights(weights, strict=strict)
+    @classmethod
+    def from_pretrained(
+        cls,
+        repo_id: str,
+        filename: str = "tokenizer-e351c8d8-checkpoint125.safetensors",
+    ) -> nn.Module:
+        """Build the ``mimi_202407`` model with 32 codebooks and load hub weights.
+        ``filename`` is fetched from ``repo_id`` with ``hf_hub_download`` and loaded
+        strictly through ``load_pytorch_weights``.
+        """
+        cfg = mimi_202407(32)
+        model = cls(cfg)
+        model_file = hf_hub_download(repo_id, filename)
+        model.load_pytorch_weights(model_file, strict=True)
+        return model
+class MimiStreamingDecoder:
+    """Frame-by-frame decoding helper wrapped around a ``Mimi`` instance.
+    The wrapped model's decoder-side state is cleared on construction and by
+    ``reset``; ``decode_frames`` then pushes tokens through ``Mimi.decode_step``
+    one frame at a time, so state carries over between calls.
+    """
+    def __init__(self, mimi: "Mimi") -> None:
+        self._mimi = mimi
+        self.reset()
+    def reset(self) -> None:
+        """Clear the decoder, upsampler and decoder-cache state of the wrapped model."""
+        codec = self._mimi
+        codec.decoder.reset_state()
+        codec.upsample.reset_state()
+        for cache in codec.decoder_cache:
+            cache.reset()
+    def decode_frames(self, tokens: mx.array) -> mx.array:
+        """Decode audio tokens one frame at a time and join the results.
+        Parameters
+        ----------
+        tokens:
+            Array of shape ``(B, C, T)`` or ``(C, T)``: batch, codebooks, frames.
+            A 2-D input gets a leading batch axis added.
+        Returns
+        -------
+        mx.array
+            The per-frame outputs of ``decode_step`` concatenated on the last axis.
+        """
+        if tokens.ndim == 2:
+            tokens = mx.expand_dims(tokens, 0)
+        num_frames = tokens.shape[-1]
+        # Frames are decoded strictly in order because decode_step is stateful.
+        chunks = [
+            self._mimi.decode_step(tokens[:, :, idx : idx + 1])
+            for idx in range(num_frames)
+        ]
+        return mx.concat(chunks, axis=-1)

nexaai/mlx_backend/mlx_audio/codec/models/mimi/modules/__init__.py ADDED Viewed

@@ -0,0 +1,20 @@
+# Copyright (c) Kyutai, all rights reserved.
+# This source code is licensed under the license found in the
+# LICENSE file in the root directory of this source tree.
+# flake8: noqa
+"""Modules used for building the models."""
+from .conv import (
+    Conv1d,
+    ConvDownsample1d,
+    ConvTranspose1d,
+    ConvTrUpsample1d,
+    NormConv1d,
+    NormConvTranspose1d,
+    StreamableConv1d,
+    StreamableConvTranspose1d,
+)
+from .kv_cache import KVCache, RotatingKVCache
+from .quantization import SplitResidualVectorQuantizer
+from .seanet import SeanetConfig, SeanetDecoder, SeanetEncoder
+from .transformer import ProjectedTransformer, Transformer, TransformerConfig

nexaai/mlx_backend/mlx_audio/codec/models/mimi/modules/conv.py ADDED Viewed

@@ -0,0 +1,398 @@
+# Copyright (c) Kyutai, all rights reserved.
+# This source code is licensed under the license found in the
+# LICENSE file in the root directory of this source tree.
+import math
+import mlx.core as mx
+import mlx.nn as nn
+class Conv1d(nn.Module):
+    """1-D convolution that takes and returns channels-first ``(..., C, L)`` arrays.
+    The weight is stored as ``(out_channels, ksize, in_channels // groups)`` and
+    initialised uniformly in ``[-1 / (in_channels * ksize), 1 / (in_channels * ksize)]``;
+    the optional bias starts at zero.
+    """
+    def __init__(
+        self,
+        in_channels: int,
+        out_channels: int,
+        ksize: int,
+        stride: int = 1,
+        padding: int = 0,
+        groups: int = 1,
+        dilation: int = 1,
+        bias: bool = True,
+    ):
+        super().__init__()
+        scale = 1 / (in_channels * ksize)
+        self.weight = mx.random.uniform(
+            low=-scale,
+            high=scale,
+            shape=(out_channels, ksize, in_channels // groups),
+        )
+        self.bias = None
+        if bias:
+            self.bias = mx.zeros(out_channels)
+        self._padding = padding
+        self._groups = groups
+        self._stride = stride
+        self._dilation = dilation
+    def __call__(self, xs: mx.array) -> mx.array:
+        """Apply the convolution; the last two axes are swapped on the way in and out."""
+        # MLX uses NLC whereas pytorch/candle use NCL
+        y = mx.conv1d(
+            xs.swapaxes(-1, -2),
+            self.weight,
+            stride=self._stride,
+            padding=self._padding,
+            dilation=self._dilation,
+            groups=self._groups,
+        )
+        if self.bias is not None:
+            y = y + self.bias
+        return y.swapaxes(-1, -2)
+class ConvTranspose1d(nn.Module):
+    """1-D transposed convolution that takes and returns channels-first arrays.
+    Only two grouping modes are accepted: ``groups == 1``, or the depthwise case
+    ``groups == in_channels == out_channels``. The depthwise case is emulated with
+    an ungrouped transposed convolution over a masked dense weight (see
+    ``_refresh_expanded_weight``). Any other ``groups > 1`` raises ``ValueError``.
+    """
+    def __init__(
+        self,
+        in_channels: int,
+        out_channels: int,
+        ksize: int,
+        stride: int = 1,
+        padding: int = 0,
+        groups: int = 1,
+        bias: bool = True,
+    ):
+        super().__init__()
+        scale = 1 / (in_channels * ksize)
+        self.weight = mx.random.uniform(
+            low=-scale,
+            high=scale,
+            shape=(out_channels // groups, ksize, in_channels),
+        )
+        self.bias = None
+        if bias:
+            self.bias = mx.zeros(out_channels)
+        self._padding = padding
+        self._groups = groups
+        self._stride = stride
+        self._ksize = ksize
+        self._in_channels = in_channels
+        self._out_channels = out_channels
+        self._refresh_expanded_weight()
+    def _refresh_expanded_weight(self) -> None:
+        """Recompute ``_expanded_weight`` / ``_expanded_groups`` from ``self.weight``.
+        Raises ``ValueError`` for grouped configurations other than depthwise.
+        """
+        groups = self._groups
+        out_channels = self._out_channels
+        if groups == self._in_channels and groups == out_channels:
+            # Depthwise: tile the (1, ksize, C) weight along axis 0 and keep only
+            # the entries where output channel == input channel, so an ungrouped
+            # transposed conv never mixes channels.
+            eye = (
+                mx.eye(out_channels)
+                .astype(self.weight.dtype)
+                .reshape((out_channels, 1, out_channels))
+            )
+            eye = mx.repeat(eye, repeats=self._ksize, axis=1)
+            self._expanded_weight = mx.repeat(self.weight, repeats=groups, axis=0) * eye
+            self._expanded_groups = 1
+        elif groups > 1:
+            raise ValueError("groups are not supported in ConvTranspose1d")
+        else:
+            self._expanded_weight = self.weight
+            self._expanded_groups = groups
+    def update(self, parameters: dict) -> nn.Module:
+        """Update the parameters, then rebuild the derived expanded weight."""
+        super().update(parameters)
+        # ``_expanded_weight`` is derived from ``weight`` and must follow it.
+        self._refresh_expanded_weight()
+        return self
+    def __call__(self, xs: mx.array) -> mx.array:
+        """Apply the transposed convolution; the last two axes are swapped in and out."""
+        y = mx.conv_transpose1d(
+            xs.swapaxes(-1, -2),
+            self._expanded_weight,
+            stride=self._stride,
+            padding=self._padding,
+            groups=self._expanded_groups,
+        )
+        if self.bias is not None:
+            y = y + self.bias
+        return y.swapaxes(-1, -2)
+class NormConv1d(nn.Module):
+    """Wrapper holding a ``Conv1d`` under the attribute name ``conv``.
+    No normalisation is applied here; calls are forwarded to the inner conv.
+    """
+    def __init__(
+        self,
+        in_channels: int,
+        out_channels: int,
+        ksize: int,
+        stride: int = 1,
+        padding: int = 0,
+        groups: int = 1,
+        dilation: int = 1,
+        bias: bool = True,
+    ):
+        super().__init__()
+        inner = Conv1d(
+            in_channels,
+            out_channels,
+            ksize,
+            stride=stride,
+            padding=padding,
+            groups=groups,
+            dilation=dilation,
+            bias=bias,
+        )
+        self.conv = inner
+    def __call__(self, xs: mx.array) -> mx.array:
+        """Forward ``xs`` through the wrapped convolution."""
+        out = self.conv(xs)
+        return out
+class NormConvTranspose1d(nn.Module):
+    """Wrapper holding a ``ConvTranspose1d`` under the attribute name ``convtr``.
+    No normalisation is applied here; calls are forwarded to the inner layer.
+    """
+    def __init__(
+        self,
+        in_channels: int,
+        out_channels: int,
+        ksize: int,
+        stride: int = 1,
+        padding: int = 0,
+        groups: int = 1,
+        bias: bool = True,
+    ):
+        super().__init__()
+        inner = ConvTranspose1d(
+            in_channels,
+            out_channels,
+            ksize,
+            stride=stride,
+            padding=padding,
+            groups=groups,
+            bias=bias,
+        )
+        self.convtr = inner
+    def __call__(self, xs: mx.array) -> mx.array:
+        """Forward ``xs`` through the wrapped transposed convolution."""
+        out = self.convtr(xs)
+        return out
+def get_extra_padding_for_conv1d(
+    xs: mx.array,
+    ksize: int,
+    stride: int,
+    padding_total: int,
+) -> int:
+    """Return the extra right padding needed so the last conv window is complete.
+    Only the size of the last axis of ``xs`` is used. The result is never negative.
+    """
+    length = xs.shape[-1]
+    # Fractional frame count; rounding it up gives the number of windows that
+    # must fit once the input has been padded.
+    frames = max(length + padding_total - ksize, 0) / stride + 1.0
+    target_len = (math.ceil(frames) - 1) * stride + ksize - padding_total
+    shortfall = target_len - length
+    return shortfall if shortfall > 0 else 0
+def unpad1d(xs: mx.array, unpad_l: int, unpad_r: int) -> mx.array:
+    """Drop ``unpad_l`` leading and ``unpad_r`` trailing entries along the last axis."""
+    stop = xs.shape[-1] - unpad_r
+    return xs[..., unpad_l:stop]
+# TODO(laurent): add a streaming module abstract class?
+class StreamableConv1d(nn.Module):
+    """Padded 1-D convolution with a one-shot path and a chunked streaming path.
+    ``__call__`` pads the whole input and convolves it at once. ``step`` consumes
+    successive chunks, buffering the samples that do not yet fill a window in
+    ``_prev_xs`` and applying the left padding only to the first chunk.
+    ``reset_state`` returns the streaming path to its initial condition.
+    """
+    def __init__(
+        self,
+        in_channels: int,
+        out_channels: int,
+        ksize: int,
+        stride: int,
+        dilation: int,
+        groups: int,
+        bias: bool,
+        causal: bool,
+        pad_mode: str,
+    ):
+        super().__init__()
+        self._causal = causal
+        self._pad_mode = pad_mode
+        self._ksize = ksize
+        self.conv = NormConv1d(
+            in_channels,
+            out_channels,
+            ksize,
+            stride=stride,
+            groups=groups,
+            dilation=dilation,
+            bias=bias,
+        )
+        self._prev_xs = None
+        self._left_pad_applied = False
+        self._out_channels = out_channels
+    def reset_state(self):
+        """Forget buffered samples and re-arm the first-chunk left padding."""
+        self._prev_xs = None
+        self._left_pad_applied = False
+    def __call__(self, xs: mx.array) -> mx.array:
+        """Pad ``xs`` along its last axis and convolve it in one call.
+        All padding goes on the left when causal; otherwise it is split, with the
+        left side receiving the larger share when the total is odd. Extra right
+        padding is added so that the final window is complete.
+        """
+        stride = self.conv.conv._stride
+        # Effective kernel extent once dilation is taken into account.
+        ksize = (self._ksize - 1) * self.conv.conv._dilation + 1
+        padding_total = ksize - stride
+        extra_padding = get_extra_padding_for_conv1d(
+            xs,
+            ksize=ksize,
+            stride=stride,
+            padding_total=padding_total,
+        )
+        if self._causal:
+            padding_left, padding_right = padding_total, 0
+        else:
+            padding_right = padding_total // 2
+            padding_left = padding_total - padding_right
+        no_pad = (0, 0)
+        widths = [no_pad, no_pad, (padding_left, padding_right + extra_padding)]
+        return self.conv(mx.pad(xs, pad_width=widths, mode=self._pad_mode))
+    def step(self, xs: mx.array) -> mx.array:
+        """Consume one chunk and return the output frames that are now complete.
+        Returns an array with a zero-length last axis when the chunk is empty or
+        when not enough samples have been buffered to produce a frame.
+        """
+        b, _, len_ = xs.shape
+        if len_ == 0:
+            return mx.zeros((b, self._out_channels, 0))
+        stride = self.conv.conv._stride
+        dilation = self.conv.conv._dilation
+        ksize = (self._ksize - 1) * dilation + 1
+        if not self._left_pad_applied:
+            # Only the very first chunk of a stream receives the left padding.
+            self._left_pad_applied = True
+            padding_total = ksize - stride
+            xs = mx.pad(
+                xs, pad_width=((0, 0), (0, 0), (padding_total, 0)), mode=self._pad_mode
+            )
+        if self._prev_xs is not None:
+            xs = mx.concat([self._prev_xs, xs], axis=-1)
+        len_ = xs.shape[-1]
+        nframes = max(len_ + stride - ksize, 0) // stride
+        if nframes == 0:
+            # Not enough samples for a single window yet: keep everything.
+            self._prev_xs = xs
+            return mx.zeros((b, self._out_channels, 0))
+        # Keep the samples that later windows still need.
+        self._prev_xs = xs[..., nframes * stride :]
+        # With nframes >= 1 this length is at least ksize, hence always positive.
+        in_l = (nframes - 1) * stride + ksize
+        return self.conv(xs[..., 0:in_l])
+class StreamableConvTranspose1d(nn.Module):
+    """Transposed 1-D convolution with a one-shot path and a streaming path.
+    ``__call__`` runs the layer on the whole input and trims the surplus samples.
+    ``step`` processes successive chunks and overlap-adds the tail of the previous
+    output, held in ``_prev_ys``, onto the head of the next one.
+    """
+    def __init__(
+        self,
+        in_channels: int,
+        out_channels: int,
+        ksize: int,
+        stride: int,
+        groups: int,
+        bias: bool,
+        causal: bool,
+    ):
+        super().__init__()
+        self._causal = causal
+        self._ksize = ksize
+        # ``step`` needs the channel count to build its empty result.
+        self._out_channels = out_channels
+        self.convtr = NormConvTranspose1d(
+            in_channels,
+            out_channels,
+            ksize,
+            stride=stride,
+            groups=groups,
+            bias=bias,
+        )
+        self._prev_ys = None
+    def reset_state(self):
+        """Drop the overlap carried over from the previous ``step`` call."""
+        self._prev_ys = None
+    def __call__(self, xs: mx.array) -> mx.array:
+        """Run the transposed convolution on the whole input and trim the surplus.
+        ``max(ksize - stride, 0)`` samples are removed: all from the right when
+        causal, otherwise split between both ends.
+        """
+        stride = self.convtr.convtr._stride
+        padding_total = max(self._ksize - stride, 0)
+        xs = self.convtr(xs)
+        if self._causal:
+            unpad_l = 0
+            unpad_r = padding_total
+        else:
+            unpad_r = padding_total // 2
+            unpad_l = padding_total - unpad_r
+        return unpad1d(xs, unpad_l=unpad_l, unpad_r=unpad_r)
+    def step(self, xs: mx.array) -> mx.array:
+        """Process one chunk and return the output samples that are final.
+        The last ``ksize - stride`` samples of each result are withheld and added
+        to the start of the next one. An empty chunk yields an empty result.
+        """
+        b, _, len_ = xs.shape
+        if len_ == 0:
+            return mx.zeros((b, self._out_channels, 0))
+        stride = self.convtr.convtr._stride
+        ys = self.convtr(xs)
+        ot = ys.shape[-1]
+        if self._prev_ys is not None:
+            prev_ys = self._prev_ys
+            pt = prev_ys.shape[-1]
+            if self.convtr.convtr.bias is not None:
+                # Both tensors already include the bias; remove it from one of
+                # them so the overlapped region does not count it twice.
+                prev_ys = prev_ys - self.convtr.convtr.bias[None, :, None]
+            ys1, ys2 = ys[..., :pt] + prev_ys, ys[..., pt:]
+            ys = mx.concat([ys1, ys2], axis=-1)
+        invalid_steps = self._ksize - stride
+        ys, self._prev_ys = ys[..., : ot - invalid_steps], ys[..., ot - invalid_steps :]
+        return ys
+class ConvDownsample1d(nn.Module):
+    """Strided, bias-free ``StreamableConv1d`` with kernel size ``2 * stride``.
+    Channel count is preserved (``dim`` in, ``dim`` out) and padding uses "edge" mode.
+    """
+    def __init__(self, stride: int, dim: int, causal: bool):
+        super().__init__()
+        layer = StreamableConv1d(
+            in_channels=dim,
+            out_channels=dim,
+            ksize=2 * stride,
+            stride=stride,
+            dilation=1,
+            groups=1,
+            bias=False,
+            causal=causal,
+            pad_mode="edge",
+        )
+        self.conv = layer
+    def reset_state(self):
+        """Reset the streaming state of the wrapped convolution."""
+        self.conv.reset_state()
+    def __call__(self, xs: mx.array) -> mx.array:
+        """Downsample a whole input in one call."""
+        out = self.conv(xs)
+        return out
+    def step(self, xs: mx.array) -> mx.array:
+        """Downsample one streaming chunk."""
+        out = self.conv.step(xs)
+        return out
+class ConvTrUpsample1d(nn.Module):
+    """Bias-free ``StreamableConvTranspose1d`` with kernel size ``2 * stride``.
+    Channel count is preserved and ``groups`` is set to ``dim``.
+    """
+    def __init__(self, stride: int, dim: int, causal: bool):
+        super().__init__()
+        layer = StreamableConvTranspose1d(
+            in_channels=dim,
+            out_channels=dim,
+            ksize=2 * stride,
+            stride=stride,
+            groups=dim,
+            bias=False,
+            causal=causal,
+        )
+        self.convtr = layer
+    def reset_state(self):
+        """Reset the streaming state of the wrapped transposed convolution."""
+        self.convtr.reset_state()
+    def __call__(self, xs: mx.array) -> mx.array:
+        """Upsample a whole input in one call."""
+        return self.convtr(xs)
+    def step(self, xs: mx.array) -> mx.array:
+        """Upsample one streaming chunk."""
+        return self.convtr.step(xs)