PyPI - nexaai - Versions diffs - 1.0.29__cp310-cp310-macosx_14_0_universal2.whl - Mend

nexaai 1.0.29__cp310-cp310-macosx_14_0_universal2.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (580) hide show

nexaai/binds/metal/py-lib/mlx_audio/codec/models/mimi/modules/transformer.py ADDED Viewed

@@ -0,0 +1,256 @@
+# Copyright (c) Kyutai, all rights reserved.
+# This source code is licensed under the license found in the
+# LICENSE file in the root directory of this source tree.
+from __future__ import annotations
+from dataclasses import dataclass
+import mlx.core as mx
+import mlx.nn as nn
+from .kv_cache import KVCache, RotatingKVCache
+@dataclass
+class TransformerConfig:
+    """Hyper-parameters read by the transformer modules defined in this file."""
+    # Model width used by the norms and by the attention / feed-forward projections.
+    d_model: int
+    # Number of attention heads the fused QKV projection is reshaped into.
+    num_heads: int
+    # Number of TransformerLayer instances stacked by Transformer.
+    num_layers: int
+    # Not read by the code visible in this file.
+    causal: bool
+    # Not read by the code visible in this file.
+    norm_first: bool
+    # Whether the feed-forward Linear layers carry a bias.
+    bias_ff: bool
+    # Whether the attention input/output projections carry a bias.
+    bias_attn: bool
+    # None selects a pass-through (Id); any other value selects LayerScale.
+    # Only the None-ness is checked in this file, the numeric value is not used.
+    layer_scale: float | None
+    # "rope" makes Attention apply nn.RoPE to queries and keys; other values disable it.
+    positional_embedding: str
+    # Must be False: TransformerLayer asserts that conv blocks are not supported.
+    use_conv_block: bool
+    # Must be False: TransformerLayer asserts that cross-attention is not supported.
+    cross_attention: bool
+    # Not read by the code visible in this file.
+    conv_kernel_size: int
+    # Not read by the code visible in this file.
+    use_conv_bias: bool
+    # True selects MlpGating, False selects MlpNoGating as the feed-forward block.
+    gating: bool
+    # "layer_norm" or "rms_norm"; any other value raises ValueError in TransformerLayer.
+    norm: str
+    # Upper bound on how many previously cached positions Attention keeps
+    # in addition to the positions of the current call.
+    context: int
+    # Passed to nn.RoPE as `base`.
+    max_period: int
+    # Passed to RotatingKVCache as `max_size` by make_rot_cache.
+    max_seq_len: int
+    # Divides num_heads to obtain the number of KV heads; Attention asserts it is 1.
+    kv_repeat: int
+    # Feed-forward width (MlpNoGating uses it directly, MlpGating derives its hidden size from it).
+    dim_feedforward: int
+    # When True, ProjectedTransformer swaps axes 1 and 2 on the way in and on the way out.
+    conv_layout: bool
+    @property
+    def head_dim(self) -> int:
+        """Per-head width: d_model floor-divided by num_heads."""
+        return self.d_model // self.num_heads
+class Id(nn.Module):
+    def __init__(self):
+        super().__init__()
+    def __call__(self, xs: mx.array) -> mx.array:
+        return xs
+class LayerScale(nn.Module):
+    def __init__(self, dim: int):
+        super().__init__()
+        self.scale = mx.ones(dim)
+    def __call__(self, xs: mx.array) -> mx.array:
+        return xs * self.scale
+class Attention(nn.Module):
+    def __init__(self, cfg: TransformerConfig):
+        super().__init__()
+        num_kv = cfg.num_heads // cfg.kv_repeat
+        out_dim = cfg.d_model + 2 * num_kv * cfg.d_model // cfg.num_heads
+        self.cfg = cfg
+        self.in_proj = nn.Linear(cfg.d_model, out_dim, bias=cfg.bias_attn)
+        self.out_proj = nn.Linear(cfg.d_model, cfg.d_model, bias=cfg.bias_attn)
+        self.scale = cfg.head_dim ** (-0.5)
+        self.rope = None
+        if cfg.positional_embedding == "rope":
+            self.rope = nn.RoPE(cfg.head_dim, traditional=True, base=cfg.max_period)
+    def __call__(
+        self,
+        xs: mx.array,
+        cache: KVCache | RotatingKVCache,
+        mask: mx.array | None = None,
+    ) -> mx.array:
+        assert self.cfg.kv_repeat == 1, "only kv_repeat==1 is supported"
+        b, t, hd = xs.shape
+        qkv = self.in_proj(xs).reshape(b, t, 3, self.cfg.num_heads, self.cfg.head_dim)
+        q = qkv[:, :, 0].transpose(0, 2, 1, 3)
+        k = qkv[:, :, 1].transpose(0, 2, 1, 3)
+        v = qkv[:, :, 2].transpose(0, 2, 1, 3)
+        if self.rope is not None:
+            q = self.rope(q, offset=cache.offset)
+            k = self.rope(k, offset=cache.offset)
+        k, v = cache.update_and_fetch(k, v)
+        k_len = k.shape[2]
+        k_target_len = t + min(self.cfg.context, k_len - t)
+        if k_target_len < k_len:
+            k = k[:, :, k_len - k_target_len :]
+            v = v[:, :, k_len - k_target_len :]
+        xs = mx.fast.scaled_dot_product_attention(q, k, v, scale=self.scale, mask=mask)
+        xs = xs.transpose(0, 2, 1, 3).reshape(b, t, hd)
+        xs = self.out_proj(xs)
+        return xs
+class MlpGating(nn.Module):
+    def __init__(self, cfg: TransformerConfig):
+        super().__init__()
+        hidden = 2 * cfg.dim_feedforward // 3
+        if cfg.dim_feedforward == 4 * cfg.d_model:
+            hidden = 11 * cfg.d_model // 4
+        self.linear_in = nn.Linear(cfg.d_model, 2 * hidden, bias=cfg.bias_ff)
+        self.linear_out = nn.Linear(hidden, cfg.d_model, bias=cfg.bias_ff)
+    def __call__(self, xs: mx.array) -> mx.array:
+        xs = self.linear_in(xs)
+        b, t, _ = xs.shape
+        xs = xs.reshape(b, t, 2, -1)
+        return self.linear_out(nn.silu(xs[:, :, 0]) * xs[:, :, 1])
+class MlpNoGating(nn.Module):
+    def __init__(self, cfg: TransformerConfig):
+        super().__init__()
+        self.linear1 = nn.Linear(cfg.d_model, cfg.dim_feedforward, bias=cfg.bias_ff)
+        self.linear2 = nn.Linear(cfg.dim_feedforward, cfg.d_model, bias=cfg.bias_ff)
+    def __call__(self, xs: mx.array) -> mx.array:
+        return self.linear2(nn.gelu_approx(self.linear1(xs)))
+class TransformerLayer(nn.Module):
+    def __init__(self, cfg: TransformerConfig):
+        super().__init__()
+        assert not cfg.use_conv_block, "conv-block is not supported"
+        assert not cfg.cross_attention, "cross-attn is not supported"
+        if cfg.gating:
+            self.gating = MlpGating(cfg)
+        else:
+            # TODO: Use a better name?
+            self.gating = MlpNoGating(cfg)
+        if cfg.norm == "layer_norm":
+            self.norm1 = nn.LayerNorm(cfg.d_model, 1e-5)
+            self.norm2 = nn.LayerNorm(cfg.d_model, 1e-5)
+        elif cfg.norm == "rms_norm":
+            self.norm1 = nn.RMSNorm(cfg.d_model, 1e-8)
+            self.norm2 = nn.RMSNorm(cfg.d_model, 1e-8)
+        else:
+            raise ValueError(f"unsupported norm type {cfg.norm}")
+        if cfg.layer_scale is not None:
+            self.layer_scale_1 = LayerScale(cfg.d_model)
+            self.layer_scale_2 = LayerScale(cfg.d_model)
+        else:
+            self.layer_scale_1 = Id()
+            self.layer_scale_2 = Id()
+        self.self_attn = Attention(cfg)
+    def __call__(
+        self,
+        xs: mx.array,
+        cache: KVCache | RotatingKVCache,
+    ) -> mx.array:
+        n1 = self.norm1(xs)
+        n1 = self.self_attn(n1, cache=cache)
+        xs = xs + self.layer_scale_1(n1)
+        xs = xs + self.layer_scale_2(self.gating(self.norm2(xs)))
+        return xs
+class Transformer(nn.Module):
+    def __init__(self, cfg: TransformerConfig):
+        super().__init__()
+        self.cfg = cfg
+        self.layers = [TransformerLayer(cfg=cfg) for _ in range(cfg.num_layers)]
+    def __call__(
+        self,
+        xs: mx.array,
+        cache: list[KVCache] | list[RotatingKVCache],
+    ) -> mx.array:
+        for layer, c in zip(self.layers, cache):
+            xs = layer(xs, cache=c)
+        return xs
+    def make_cache(self) -> list[KVCache]:
+        num_kv_heads = self.cfg.num_heads // self.cfg.kv_repeat
+        return [
+            KVCache(head_dim=self.cfg.head_dim, n_kv_heads=num_kv_heads)
+            for _ in self.layers
+        ]
+    def make_rot_cache(self) -> list[RotatingKVCache]:
+        num_kv_heads = self.cfg.num_heads // self.cfg.kv_repeat
+        return [
+            RotatingKVCache(
+                head_dim=self.cfg.head_dim,
+                n_kv_heads=num_kv_heads,
+                max_size=self.cfg.max_seq_len,
+            )
+            for _ in self.layers
+        ]
+class ProjectedTransformer(nn.Module):
+    def __init__(self, cfg: TransformerConfig, input_dim: int, output_dims: list[int]):
+        super().__init__()
+        self.conv_layout = cfg.conv_layout
+        self.transformer = Transformer(cfg)
+        if input_dim == cfg.d_model:
+            self.input_proj = None
+        else:
+            self.input_proj = nn.Linear(input_dim, cfg.d_model, bias=False)
+        output_projs = []
+        for output_dim in output_dims:
+            if output_dim == cfg.d_model:
+                p = None
+            else:
+                p = nn.Linear(cfg.d_model, output_dim, bias=False)
+            output_projs.append(p)
+        self.output_projs = output_projs
+    def __call__(
+        self,
+        xs: mx.array,
+        cache: list[KVCache] | list[RotatingKVCache],
+    ) -> list[mx.array]:
+        if self.conv_layout:
+            xs = xs.swapaxes(1, 2)
+        if self.input_proj is not None:
+            xs = self.input_proj(xs)
+        xs = self.transformer(xs, cache=cache)
+        outs = []
+        for output_proj in self.output_projs:
+            if output_proj is None:
+                out = xs
+            else:
+                out = output_proj(xs)
+            if self.conv_layout:
+                out = out.swapaxes(1, 2)
+            outs.append(out)
+        return outs
+    def make_cache(self) -> list[KVCache]:
+        return self.transformer.make_cache()
+    def make_rot_cache(self) -> list[RotatingKVCache]:
+        return self.transformer.make_rot_cache()

nexaai/binds/metal/py-lib/mlx_audio/codec/models/s3/__init__.py ADDED Viewed

	@@ -0,0 +1 @@
1	+ from .model_v2 import S3TokenizerV2

nexaai/binds/metal/py-lib/mlx_audio/codec/models/s3/model.py ADDED Viewed

@@ -0,0 +1,260 @@
+from dataclasses import dataclass
+from typing import Iterable, Optional, Tuple
+import mlx.core as mx
+import mlx.nn as nn
+import numpy as np
+from einops.array_api import rearrange
+from .utils import make_non_pad_mask, mask_to_bias
+@dataclass
+class ModelConfig:
+    """Default sizes S3Tokenizer uses to build its encoder and quantizer."""
+    # Input channels of the encoder's first convolution.
+    n_mels: int = 128
+    # Number of rows in the encoder's sinusoidal positional-embedding table.
+    n_audio_ctx: int = 1500
+    # Encoder width; also the dimension of each codebook vector.
+    n_audio_state: int = 1280
+    # Attention heads in every encoder block.
+    n_audio_head: int = 20
+    # Number of residual attention blocks in the encoder.
+    n_audio_layer: int = 6
+    # Number of entries in the quantizer codebook.
+    n_codebook_size: int = 4096
+def sinusoids(length: int, channels: int, max_timescale: float = 10000) -> mx.array:
+    """Returns sinusoids for positional embedding"""
+    assert channels % 2 == 0
+    log_timescale_increment = np.log(max_timescale) / (channels // 2 - 1)
+    inv_timescales = mx.exp(-log_timescale_increment * mx.arange(channels // 2))
+    scaled_time = mx.arange(length)[:, None] * inv_timescales[None, :]
+    return mx.concatenate([mx.sin(scaled_time), mx.cos(scaled_time)], axis=1)
+class MultiHeadAttention(nn.Module):
+    def __init__(self, n_state: int, n_head: int):
+        super().__init__()
+        self.n_head = n_head
+        self.query = nn.Linear(n_state, n_state)
+        self.key = nn.Linear(n_state, n_state, bias=False)
+        self.value = nn.Linear(n_state, n_state)
+        self.out = nn.Linear(n_state, n_state)
+    def __call__(
+        self,
+        x: mx.array,
+        mask: Optional[mx.array] = None,
+    ) -> Tuple[mx.array, mx.array]:
+        q = self.query(x)
+        k = self.key(x)
+        v = self.value(x)
+        wv, qk = self.qkv_attention(q, k, v, mask)
+        return self.out(wv), qk
+    def qkv_attention(
+        self, q: mx.array, k: mx.array, v: mx.array, mask: Optional[mx.array] = None
+    ) -> Tuple[mx.array, mx.array | None]:
+        B, T, D = q.shape
+        scale = (D // self.n_head) ** -0.25
+        q = q.reshape(B, T, self.n_head, -1).transpose(0, 2, 1, 3) * scale
+        k = k.reshape(B, T, self.n_head, -1).transpose(0, 2, 1, 3) * scale
+        v = v.reshape(B, T, self.n_head, -1).transpose(0, 2, 1, 3)
+        output = mx.fast.scaled_dot_product_attention(q, k, v, scale=1, mask=mask)
+        output = output.transpose(0, 2, 1, 3).reshape(B, T, D)
+        return output, None
+class ResidualAttentionBlock(nn.Module):
+    def __init__(self, n_state: int, n_head: int):
+        super().__init__()
+        self.attn = MultiHeadAttention(n_state, n_head)
+        self.attn_ln = nn.LayerNorm(n_state)
+        n_mlp = n_state * 4
+        self.mlp = nn.Sequential(
+            nn.Linear(n_state, n_mlp), nn.GELU(), nn.Linear(n_mlp, n_state)
+        )
+        self.mlp_ln = nn.LayerNorm(n_state)
+    def __call__(
+        self,
+        x: mx.array,
+        mask: Optional[mx.array] = None,
+    ) -> mx.array:
+        x = x + self.attn(self.attn_ln(x), mask=mask)[0]
+        x = x + self.mlp(self.mlp_ln(x))
+        return x
+class AudioEncoder(nn.Module):
+    def __init__(
+        self,
+        n_mels: int,
+        n_ctx: int,
+        n_state: int,
+        n_head: int,
+        n_layer: int,
+        stride: int,
+    ):
+        super().__init__()
+        self.stride = stride
+        self.conv1 = nn.Conv1d(
+            in_channels=n_mels,
+            out_channels=n_state,
+            kernel_size=3,
+            stride=stride,
+            padding=1,
+        )
+        self.conv2 = nn.Conv1d(
+            in_channels=n_state,
+            out_channels=n_state,
+            kernel_size=3,
+            stride=2,
+            padding=1,
+        )
+        self.positional_embedding = sinusoids(n_ctx, n_state)
+        self.blocks = [ResidualAttentionBlock(n_state, n_head) for _ in range(n_layer)]
+    def __call__(self, x: mx.array, x_len: mx.array) -> Tuple[mx.array, mx.array]:
+        """
+        x : mx.array, shape = (batch_size, n_mels, T)
+            the mel spectrogram of the audio
+        x_len: mx.array, shape = (batch_size,)
+            length of each audio in x
+        """
+        mask = make_non_pad_mask(x_len)
+        mask = mx.expand_dims(mask, axis=1)  # (B, 1, T)
+        x = x.transpose(0, 2, 1)  # (B, T, n_mels)
+        mask_transposed = mask.transpose(0, 2, 1)  # (B, T, 1)
+        x = self.conv1(x * mask_transposed)
+        x = nn.gelu(x)
+        x_len = (x_len + 2 - 1 * (3 - 1) - 1) // self.stride + 1
+        mask = make_non_pad_mask(x_len)
+        mask_transposed = mx.expand_dims(mask, axis=-1)  # (B, T, 1)
+        x = self.conv2(x * mask_transposed)
+        x = nn.gelu(x)
+        x_len = (x_len + 2 - 1 * (3 - 1) - 1) // 2 + 1
+        mask = make_non_pad_mask(x_len)
+        mask = mask_to_bias(mask, x.dtype)
+        mask = mx.expand_dims(mask, axis=1)  # (B, 1, T)
+        x = x + self.positional_embedding[: x.shape[1], :]
+        for block in self.blocks:
+            x = block(x, mx.expand_dims(mask, axis=1))
+        return x, x_len
+class EuclideanCodebook(nn.Module):
+    """Codebook with Euclidean distance.
+    Args:
+        dim (int): Dimension.
+        codebook_size (int): Codebook size.
+    """
+    def __init__(self, dim: int, codebook_size: int):
+        super().__init__()
+        self.codebook_size = codebook_size
+        self.embed = mx.zeros((codebook_size, dim))
+    def preprocess(self, x: mx.array) -> mx.array:
+        x = rearrange(x, "... d -> (...) d")
+        return x
+    def quantize(self, x: mx.array) -> mx.array:
+        embed = self.embed.T
+        dist = -(
+            mx.sum(x.astype(mx.float32) ** 2, axis=1, keepdims=True)
+            - 2 * x @ embed
+            + mx.sum(embed.astype(mx.float32) ** 2, axis=0, keepdims=True)
+        )
+        embed_ind = mx.argmax(dist, axis=-1)
+        return embed_ind
+    def postprocess_emb(self, embed_ind: mx.array, shape: tuple) -> mx.array:
+        return embed_ind.reshape(*shape[:-1])
+    def dequantize(self, embed_ind: mx.array) -> mx.array:
+        quantize = self.embed[embed_ind]
+        return quantize
+    def encode(self, x: mx.array) -> mx.array:
+        shape = x.shape
+        # pre-process
+        x = self.preprocess(x)
+        # quantize
+        embed_ind = self.quantize(x)
+        # post-process
+        embed_ind = self.postprocess_emb(embed_ind, shape)
+        return embed_ind
+    def decode(self, embed_ind: mx.array) -> mx.array:
+        quantize = self.dequantize(embed_ind)
+        return quantize
+class VectorQuantization(nn.Module):
+    """Vector quantization implementation
+    Args:
+        dim (int): Dimension
+        codebook_size (int): Codebook size
+    """
+    def __init__(self, dim: int, codebook_size: int):
+        super().__init__()
+        self._codebook = EuclideanCodebook(dim=dim, codebook_size=codebook_size)
+        self.codebook_size = codebook_size
+    @property
+    def codebook(self):
+        return self._codebook.embed
+    def encode(self, x: mx.array) -> mx.array:
+        x = x / mx.sqrt(mx.sum(x**2, axis=-1, keepdims=True) + 1e-8)
+        embed_in = self._codebook.encode(x)
+        return embed_in
+    def decode(self, embed_ind: mx.array) -> mx.array:
+        quantize = self._codebook.decode(embed_ind)
+        quantize = rearrange(quantize, "b n d -> b d n")
+        return quantize
+class S3Tokenizer(nn.Module):
+    """S3 tokenizer implementation
+    Args:
+        config (ModelConfig): Config
+    """
+    def __init__(self, name: str, config: ModelConfig = ModelConfig()):
+        super().__init__()
+        self.config = config
+        self.encoder = AudioEncoder(
+            self.config.n_mels,
+            self.config.n_audio_ctx,
+            self.config.n_audio_state,
+            self.config.n_audio_head,
+            self.config.n_audio_layer,
+            2 if name == "speech_tokenizer_v1_25hz" else 1,
+        )
+        self.quantizer = VectorQuantization(
+            self.config.n_audio_state, self.config.n_codebook_size
+        )
+    def __call__(self, mel: mx.array, mel_len: mx.array) -> Tuple[mx.array, mx.array]:
+        return self.quantize(mel, mel_len)
+    def quantize(self, mel: mx.array, mel_len: mx.array) -> Tuple[mx.array, mx.array]:
+        hidden, code_len = self.encoder(mel, mel_len)
+        code = self.quantizer.encode(hidden)
+        return code, code_len