PyPI - nexaai - Versions diffs - 1.0.29__cp310-cp310-macosx_14_0_universal2.whl - Mend

nexaai 1.0.29__cp310-cp310-macosx_14_0_universal2.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (580) hide show

nexaai/mlx_backend/mlx_audio/tts/models/spark/modules/residual.py ADDED Viewed

@@ -0,0 +1,209 @@
+from typing import Any, Dict, List
+import mlx.core as mx
+import mlx.nn as nn
+from einops.array_api import rearrange
+from mlx_audio.codec.models.descript.nn.layers import WNConv1d
+def exists(val):
+    return val is not None
+def default(val, d):
+    return val if exists(val) else d
+class FactorizedVectorQuantize(nn.Module):
+    def __init__(
+        self,
+        input_dim: int,
+        codebook_size: int,
+        codebook_dim: int,
+        commitment: float,
+        codebook_loss_weight: float = 1.0,
+        decay: float = 0.99,
+        threshold_ema_dead_code: float = 2,
+        momentum: float = 0.99,
+        **kwargs,
+    ):
+        super().__init__()
+        self.input_dim = input_dim
+        self.codebook_size = codebook_size
+        self.commitment = commitment
+        self.codebook_dim = codebook_dim
+        self.codebook_loss_weight = codebook_loss_weight
+        self.decay = decay
+        self.threshold_ema_dead_code = threshold_ema_dead_code
+        self.momentum = momentum
+        requires_projection = input_dim != codebook_dim
+        self.in_project = (
+            WNConv1d(in_channels=input_dim, out_channels=codebook_dim, kernel_size=1)
+            if requires_projection
+            else nn.Identity()
+        )
+        self.out_project = (
+            WNConv1d(in_channels=codebook_dim, out_channels=input_dim, kernel_size=1)
+            if requires_projection
+            else nn.Identity()
+        )
+        self.codebook = nn.Embedding(self.codebook_size, codebook_dim)
+        self.cluster_size = mx.zeros((self.codebook_size,))
+    def __call__(self, z: mx.array) -> Dict[str, Any]:
+        """Quantized the input tensor using a fixed codebook and returns
+        the corresponding codebook vectors
+        Parameters
+        ----------
+        z : Tensor[B x D x T]
+        Returns
+        -------
+        Tensor[B x D x T]
+            Quantized continuous representation of input
+        Tensor[1]
+            Commitment loss to train encoder to predict vectors closer to codebook
+            entries
+        Tensor[1]
+            Codebook loss to update the codebook
+        Tensor[B x T]
+            Codebook indices (quantized discrete representation of input)
+        Tensor[B x D x T]
+            Projected latents (continuous representation of input before quantization)
+        """
+        # transpose since we use linear
+        # Factorized codes project input into low-dimensional space if self.input_dim != self.codebook_dim
+        z_e = self.in_project(z.transpose(0, 2, 1)).transpose(0, 2, 1)
+        z_q, indices, dists = self.decode_latents(z_e)
+        # statistic the usage of codes
+        embed_onehot = mx.zeros(
+            (indices.shape[0], indices.shape[1], self.codebook_size), dtype=z_e.dtype
+        )
+        for i in range(indices.shape[0]):
+            for j in range(indices.shape[1]):
+                embed_onehot[i, j, indices[i, j]] = 1.0
+        avg_probs = mx.mean(embed_onehot.reshape(-1, self.codebook_size), axis=0)
+        perplexity = mx.exp(-mx.sum(avg_probs * mx.log(avg_probs + 1e-10)))
+        active_num = (embed_onehot.sum(0).sum(0) > 0).sum()
+        commit_loss = mx.zeros(0)
+        codebook_loss = mx.zeros(0)
+        z_q = z_e + (
+            z_q - z_e
+        )  # noop in forward pass, straight-through gradient estimator in backward pass
+        z_q = self.out_project(z_q.transpose(0, 2, 1)).transpose(0, 2, 1)
+        vq_loss = (commit_loss + codebook_loss).mean()
+        return {
+            "z_q": z_q,
+            "indices": indices,
+            "dists": dists,
+            "vq_loss": vq_loss,
+            "perplexity": perplexity,
+            "active_num": active_num.astype(mx.float32),
+        }
+    def vq2emb(self, vq, out_proj=True):
+        emb = self.embed_code(vq)
+        if out_proj:
+            emb = self.out_project(emb)
+        return emb
+    def tokenize(self, z: mx.array) -> mx.array:
+        """tokenize the input tensor"""
+        z_e = self.in_project(z.transpose(0, 2, 1)).transpose(0, 2, 1)
+        _, indices, _ = self.decode_latents(z_e)
+        return indices
+    def detokenize(self, indices):
+        """detokenize the input indices"""
+        # Check if indices are empty
+        if indices.shape[0] == 0 or indices.shape[1] == 0:
+            # Return an appropriate empty or placeholder tensor
+            return mx.zeros((1, self.input_dim, 1))
+        z_q = self.decode_code(indices).transpose(0, 2, 1)
+        z_q = self.out_project(z_q)
+        return z_q
+    def get_emb(self):
+        return self.codebook.weight
+    def embed_code(self, embed_id):
+        return mx.take(self.codebook.weight, embed_id, axis=0)
+    def decode_code(self, embed_id):
+        return self.embed_code(embed_id).transpose(0, 2, 1)
+    def normalize(self, x):
+        """Normalize input tensor along dimension 1."""
+        norm = mx.sqrt(mx.sum(mx.power(x, 2), axis=1, keepdims=True))
+        return x / mx.maximum(norm, 1e-12)
+    def decode_latents(self, latents):
+        encodings = rearrange(latents, "b d t -> (b t) d")
+        codebook = self.codebook.weight
+        # L2 normalize encodings and codebook
+        encodings = self.normalize(encodings)
+        codebook = self.normalize(codebook)
+        # Compute euclidean distance between encodings and codebook,
+        # with L2 normalization, the distance is equal to cosine distance
+        dist = (
+            mx.sum(mx.power(encodings, 2), axis=1, keepdims=True)
+            - 2 * encodings @ codebook.T
+            + mx.sum(mx.power(codebook, 2), axis=1, keepdims=True).T
+        )
+        min_encoding_indices = mx.argmax(-dist, axis=1)
+        indices = mx.reshape(min_encoding_indices, (latents.shape[0], latents.shape[2]))
+        z_q = self.decode_code(indices)
+        return z_q, indices, dist
+    def get_codes_from_indices(self, indices):
+        """Get codebook vectors from indices.
+        Args:
+            indices: Tensor of shape [B, T]
+        Returns:
+            Tensor of shape [B, D, T]
+        """
+        return self.decode_code(indices)
+    def get_output_from_indices(self, indices):
+        """Get output from indices.
+        Args:
+            indices: Tensor of shape [B, T]
+        Returns:
+            Tensor of shape [B, D, T]
+        """
+        z_q = self.get_codes_from_indices(indices)
+        return self.out_project(z_q.transpose(0, 2, 1)).transpose(0, 2, 1)
+    def sanitize(self, weights):
+        sanitized_weights = {}
+        for k, v in weights.items():
+            if "weight_v" in k:
+                if v.shape[1] > v.shape[-1]:
+                    sanitized_weights[k] = v.transpose(0, 2, 1)
+                else:
+                    sanitized_weights[k] = v
+            else:
+                sanitized_weights[k] = v
+        return sanitized_weights

nexaai/mlx_backend/mlx_audio/tts/models/spark/modules/residual_fsq.py ADDED Viewed

@@ -0,0 +1,309 @@
+import random
+from typing import List
+import mlx.core as mx
+import mlx.nn as nn
+from mlx_audio.tts.models.spark.modules.finite_scalar_quantization import FSQ
+def exists(val):
+    return val is not None
+def first(l):
+    return l[0]
+def default(val, d):
+    return val if exists(val) else d
+def round_up_multiple(num, mult):
+    return ceil(num / mult) * mult
+class ResidualFSQ(nn.Module):
+    """Follows Algorithm 1. in https://arxiv.org/pdf/2107.03312.pdf"""
+    def __init__(
+        self,
+        *,
+        levels: List[int],
+        num_quantizers,
+        dim=None,
+        is_channel_first=False,
+        quantize_dropout=False,
+        quantize_dropout_cutoff_index=0,
+        quantize_dropout_multiple_of=1,
+        **kwargs,
+    ):
+        super().__init__()
+        codebook_dim = len(levels)
+        dim = default(dim, codebook_dim)
+        requires_projection = codebook_dim != dim
+        self.project_in = (
+            nn.Linear(dim, codebook_dim) if requires_projection else nn.Identity()
+        )
+        self.project_out = (
+            nn.Linear(codebook_dim, dim) if requires_projection else nn.Identity()
+        )
+        self.has_projections = requires_projection
+        self.is_channel_first = is_channel_first
+        self.num_quantizers = num_quantizers
+        self.levels = levels
+        self.layers = []
+        # Convert ListConfig to a regular list before passing to mx.array
+        levels_tensor = mx.array(list(levels))
+        scales = []
+        for ind in range(num_quantizers):
+            scales.append((levels_tensor - 1) ** -ind)
+            fsq = FSQ(levels=levels, dim=codebook_dim, **kwargs)
+            self.layers.append(fsq)
+        assert all([not fsq.has_projections for fsq in self.layers])
+        self.codebook_size = self.layers[0].codebook_size
+        self._scales = mx.array(scales)
+        self.quantize_dropout = quantize_dropout and num_quantizers > 1
+        assert quantize_dropout_cutoff_index >= 0
+        self.quantize_dropout_cutoff_index = quantize_dropout_cutoff_index
+        self.quantize_dropout_multiple_of = quantize_dropout_multiple_of  # encodec paper proposes structured dropout, believe this was set to 4
+    @property
+    def codebooks(self):
+        codebooks = [layer._implicit_codebook for layer in self.layers]
+        codebooks = mx.stack(codebooks, axis=0)
+        return codebooks
+    def get_codes_from_indices(self, indices):
+        batch, quantize_dim = indices.shape[0], indices.shape[-1]
+        # may also receive indices in the shape of 'b h w q' (accept_image_fmap)
+        # MLX doesn't have pack function, so we need to reshape manually
+        original_shape = indices.shape
+        indices = mx.reshape(indices, (indices.shape[0], -1, indices.shape[-1]))
+        # because of quantize dropout, one can pass in indices that are coarse
+        # and the network should be able to reconstruct
+        if quantize_dim < self.num_quantizers:
+            assert (
+                self.quantize_dropout > 0.0
+            ), "quantize dropout must be greater than 0 if you wish to reconstruct from a signal with less fine quantizations"
+            indices = mx.pad(
+                indices,
+                ((0, 0), (0, 0), (0, self.num_quantizers - quantize_dim)),
+                constant_value=-1,
+            )
+        # take care of quantizer dropout
+        mask = indices == -1
+        # MLX doesn't have masked_fill, so we use where
+        indices = mx.where(
+            mask, mx.zeros_like(indices), indices
+        )  # have it fetch a dummy code to be masked out later
+        # MLX doesn't have get_at function, so we need to manually gather codes
+        all_codes = []
+        for q in range(self.codebooks.shape[0]):
+            q_codes = []
+            for b in range(indices.shape[0]):
+                n_codes = []
+                for n in range(indices.shape[1]):
+                    idx = indices[b, n, q]
+                    n_codes.append(self.codebooks[q, idx])
+                q_codes.append(mx.stack(n_codes))
+            all_codes.append(mx.stack(q_codes))
+        all_codes = mx.stack(all_codes)[:, :, :, 0, :]  # Shape: (q, b, n, d)
+        # mask out any codes that were dropout-ed
+        # Reshape mask for broadcasting: q b n 1
+        mask_reshaped = mx.reshape(
+            mask, (mask.shape[2], mask.shape[0], mask.shape[1], 1)
+        )
+        all_codes = mx.where(mask_reshaped, mx.zeros_like(all_codes), all_codes)
+        # scale the codes
+        # Reshape scales for broadcasting: q 1 1 d
+        scales = mx.reshape(
+            self._scales, (self._scales.shape[0], 1, 1, self._scales.shape[1])
+        )
+        all_codes = all_codes * scales
+        # if (accept_image_fmap = True) then return shape (quantize, batch, height, width, dimension)
+        # Reshape all_codes back to original dimensions
+        if len(original_shape) > 3:  # If we had height, width dimensions
+            all_codes = mx.reshape(
+                all_codes,
+                (
+                    all_codes.shape[0],
+                    original_shape[0],
+                    *original_shape[1:-1],
+                    all_codes.shape[-1],
+                ),
+            )
+        return all_codes
+    def get_output_from_indices(self, indices):
+        codes = self.get_codes_from_indices(indices)
+        codes_summed = mx.sum(codes, axis=0)
+        return self.project_out(codes_summed)
+    def __call__(
+        self, x, return_all_codes=False, rand_quantize_dropout_fixed_seed=None
+    ):
+        num_quant, quant_dropout_multiple_of = (
+            self.num_quantizers,
+            self.quantize_dropout_multiple_of,
+        )
+        # handle channel first
+        if self.is_channel_first:
+            # Manually implement rearrange and pack functionality
+            # First, move dimension d from position 1 to the end
+            shape = x.shape
+            # Assuming shape is (b, d, ...)
+            new_shape = (shape[0],) + shape[2:] + (shape[1],)
+            x = mx.transpose(x, (0,) + tuple(range(2, len(shape))) + (1,))
+            # Pack operation: flatten all dimensions between b and d
+            # This is equivalent to pack([x], "b * d")
+            ps = x.shape
+            middle_dims = x.shape[1:-1]
+            flattened_dim = 1
+            for dim in middle_dims:
+                flattened_dim *= dim
+            x = mx.reshape(x, (x.shape[0], flattened_dim, x.shape[-1]))
+        # maybe project in
+        x = self.project_in(x)
+        quantized_out = 0.0
+        residual = x
+        all_indices = []
+        should_quantize_dropout = self.training and self.quantize_dropout
+        # sample a layer index at which to dropout further residual quantization
+        # also prepare null indices
+        if should_quantize_dropout:
+            # check if seed is manually passed in
+            rand = random.Random(rand_quantize_dropout_fixed_seed)
+            rand_quantize_dropout_index = rand.randrange(
+                self.quantize_dropout_cutoff_index, num_quant
+            )
+            if quant_dropout_multiple_of != 1:
+                rand_quantize_dropout_index = (
+                    round_up_multiple(
+                        rand_quantize_dropout_index + 1, quant_dropout_multiple_of
+                    )
+                    - 1
+                )
+            null_indices = mx.full(x.shape[:2], -1, dtype=mx.int32)
+        # go through the layers
+        for quantizer_index, (layer, scale) in enumerate(
+            zip(self.layers, self._scales)
+        ):
+            if (
+                should_quantize_dropout
+                and quantizer_index > rand_quantize_dropout_index
+            ):
+                all_indices.append(null_indices)
+                continue
+            quantized, indices = layer(residual / scale)
+            quantized = quantized * scale
+            residual = residual - quantized
+            quantized_out = quantized_out + quantized
+            all_indices.append(indices)
+        # project out, if needed
+        quantized_out = self.project_out(quantized_out)
+        # stack all indices
+        all_indices = mx.stack(all_indices, axis=-1)
+        # channel first out
+        if self.is_channel_first:
+            # MLX doesn't have unpack, so we need to reshape manually
+            # Assuming ps contains the original batch dimensions
+            # Reshape to combine all dimensions between batch and the last dimension
+            batch_size = ps[0] if isinstance(ps, tuple) else ps
+            quantized_out = mx.reshape(
+                quantized_out, (batch_size, -1, quantized_out.shape[-1])
+            ).swapaxes(
+                2, 1
+            )  # swap to match torch output
+            all_indices = mx.reshape(
+                all_indices, (batch_size, -1, all_indices.shape[-1])
+            ).swapaxes(
+                2, 1
+            )  # swap to match torch output
+        # return
+        ret = (quantized_out, all_indices)
+        if not return_all_codes:
+            return ret
+        # whether to return all codes from all codebooks across layers
+        all_codes = self.get_codes_from_indices(all_indices)
+        # will return all codes in shape (quantizer, batch, sequence length, codebook dimension)
+        return (*ret, all_codes)
+if __name__ == "__main__":
+    model = ResidualFSQ(
+        levels=[4, 4, 4, 4, 4, 4],
+        num_quantizers=1,
+        dim=30,
+        is_channel_first=True,
+        quantize_dropout=False,
+    )
+    x = mx.random.normal((2, 30, 10))
+    quantize, embed_ind = model(x)
+    emb_from_ind = model.get_output_from_indices(embed_ind.transpose(0, 2, 1))
+    print(quantize == emb_from_ind.transpose(0, 2, 1))
+    print("quantize shape", quantize.shape)
+    print("embed_ind", embed_ind)

nexaai/mlx_backend/mlx_audio/tts/models/spark/modules/speaker/__init__.py ADDED Viewed

	@@ -0,0 +1 @@
1	+