PyPI - nexaai - Versions diffs - 1.0.29__cp310-cp310-macosx_14_0_universal2.whl - Mend

nexaai 1.0.29__cp310-cp310-macosx_14_0_universal2.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (580) hide show

nexaai/mlx_backend/mlx_audio/tts/models/dia/audio.py ADDED Viewed

@@ -0,0 +1,287 @@
+import typing as tp
+import mlx.core as mx
+from .config import DataConfig
+def build_delay_indices(
+    B: int, T: int, C: int, delay_pattern: tp.List[int]
+) -> tp.Tuple[mx.array, mx.array]:
+    """
+    Build the lookup tables consumed by apply_audio_delay.
+    For every output position (b, t, c) the source time step is t - delay_pattern[c].
+    Args:
+        B: batch size.
+        T: number of time steps.
+        C: number of channels; delay_pattern is reshaped to (1, 1, C).
+        delay_pattern: per-channel delay, in time steps.
+    Returns:
+        (t_idx_BxTxC, indices_BTCx3) where
+            - t_idx_BxTxC is the [B, T, C] int32 tensor of *unclamped* source time steps
+              (negative => BOS slot; >= T => PAD slot when used by apply_audio_delay).
+            - indices_BTCx3 is the [B*T*C, 3] int32 tensor whose rows are
+              (batch index, time index clamped to [0, T-1], channel index).
+    """
+    full_shape = [B, T, C]
+    delays_1x1xC = mx.reshape(mx.array(delay_pattern, dtype=mx.int32), (1, 1, C))
+    steps_BxTx1 = mx.expand_dims(
+        mx.broadcast_to(mx.arange(T, dtype=mx.int32)[None, :], [B, T]),
+        -1,
+    )
+    t_idx_BxTxC = steps_BxTx1 - delays_1x1xC
+    # Out-of-range steps are clamped so the gather always reads a valid element;
+    # the unclamped tensor is returned alongside so callers can mask those slots.
+    safe_t_BxTxC = mx.clip(t_idx_BxTxC, 0, T - 1)
+    batch_BxTxC = mx.broadcast_to(
+        mx.reshape(mx.arange(B, dtype=mx.int32), (B, 1, 1)),
+        full_shape,
+    )
+    chan_BxTxC = mx.broadcast_to(
+        mx.reshape(mx.arange(C, dtype=mx.int32), (1, 1, C)),
+        full_shape,
+    )
+    columns = [
+        mx.reshape(part, (-1,)) for part in (batch_BxTxC, safe_t_BxTxC, chan_BxTxC)
+    ]
+    indices_BTCx3 = mx.stack(columns, axis=1).astype(mx.int32)
+    return t_idx_BxTxC, indices_BTCx3
+def apply_audio_delay(
+    audio_BxTxC: mx.array,
+    pad_value: int,
+    bos_value: int,
+    precomp: tp.Tuple[mx.array, mx.array],
+) -> mx.array:
+    """
+    Applies the delay pattern to batched audio tokens using precomputed indices,
+    inserting BOS where t_idx < 0 and PAD where t_idx >= T.
+    Args:
+        audio_BxTxC: [B, T, C] audio tokens; the result keeps this dtype and shape.
+        pad_value: the padding token
+        bos_value: the BOS token
+        precomp:  (t_idx_BxTxC, indices_BTCx3) from build_delay_indices
+    Returns:
+        result_BxTxC: [B, T, C] delayed audio tokens
+    """
+    t_idx_BxTxC, indices_BTCx3 = precomp
+    # Gather every (batch, time, channel) triple in one vectorized indexing
+    # operation instead of indexing element by element in a Python loop.
+    gathered_flat = audio_BxTxC[
+        indices_BTCx3[:, 0], indices_BTCx3[:, 1], indices_BTCx3[:, 2]
+    ]
+    gathered_BxTxC = mx.reshape(gathered_flat, audio_BxTxC.shape)
+    # Masks are derived from the unclamped time indices
+    mask_bos = t_idx_BxTxC < 0  # => place bos_value
+    mask_pad = t_idx_BxTxC >= audio_BxTxC.shape[1]  # => place pad_value
+    # Single-element tensors that broadcast against the [B, T, C] masks
+    bos_tensor = mx.full(1, bos_value, dtype=audio_BxTxC.dtype)
+    pad_tensor = mx.full(1, pad_value, dtype=audio_BxTxC.dtype)
+    # Priority: BOS first, then PAD, otherwise the gathered token
+    result_BxTxC = mx.where(
+        mask_bos, bos_tensor, mx.where(mask_pad, pad_tensor, gathered_BxTxC)
+    )
+    return result_BxTxC
+def audio_to_codebook(
+    model,
+    input_values,
+    data_config: DataConfig,
+    padding_mask=None,
+    sample_rate=44100,
+):
+    """
+    Encodes the input audio waveform into delay-shifted discrete codes.
+    Args:
+        model: Codec object providing `preprocess(input_values, sample_rate)` and
+            `encode(audio_data, n_quantizers=None)`.
+        input_values (`mx.array` of shape `(batch_size, channels, sequence_length)`):
+            Float values of the input audio waveform. Only a batch of one is
+            supported, because the delay indices are built with B=1.
+        data_config: Supplies `channels`, `delay_pattern`, `audio_pad_value` and
+            `audio_bos_value` for the delay step.
+        padding_mask: Accepted for interface compatibility; it is not used.
+        sample_rate (`int`, *optional*) :
+            Signal sampling_rate, forwarded to `model.preprocess`.
+    Returns:
+        The encoded codes with the delay pattern applied, laid out as `[1, T, C]`
+        (time-major), with BOS/PAD tokens filled in by apply_audio_delay.
+    """
+    audio_data = model.preprocess(input_values, sample_rate)
+    # encode() returns a 5-tuple; only the second element (the codes) is used here.
+    _, encoded_frame, _, _, _ = model.encode(audio_data, n_quantizers=None)  # 1, C, T
+    seq_length = encoded_frame.shape[2]
+    precomp = build_delay_indices(
+        B=1,
+        T=seq_length,
+        C=data_config.channels,
+        delay_pattern=data_config.delay_pattern,
+    )
+    return apply_audio_delay(
+        audio_BxTxC=mx.transpose(encoded_frame, (0, 2, 1)),  # 1, T, C
+        pad_value=data_config.audio_pad_value,
+        bos_value=data_config.audio_bos_value,
+        precomp=precomp,
+    )
+def build_revert_indices(
+    B: int, T: int, C: int, delay_pattern: tp.List[int]
+) -> tp.Tuple[mx.array, mx.array]:
+    """
+    Build the lookup tables consumed by revert_audio_delay.
+    For every output position (b, t, c) the source time step is
+    min(t + delay_pattern[c], T - 1).
+    Returns:
+        A tuple (t_idx_BxTxC, indices_BTCx3) where:
+            - t_idx_BxTxC is the [B, T, C] tensor of time indices plus the per-channel
+              delay, already clamped to at most T - 1.
+            - indices_BTCx3 is the [B*T*C, 3] int32 tensor whose rows are
+              (batch index, clamped time index, channel index).
+    """
+    out_shape = [B, T, C]
+    delays_1x1xC = mx.reshape(mx.array(delay_pattern, dtype=mx.int32), (1, 1, C))
+    steps_BxT = mx.broadcast_to(mx.expand_dims(mx.arange(T), 0), [B, T])
+    shifted_BxTxC = mx.expand_dims(steps_BxT, -1) + delays_1x1xC
+    # Reads past the end of the sequence are redirected to the last time step.
+    last_step = mx.array(T - 1, dtype=mx.int32)
+    t_idx_BxTxC = mx.minimum(shifted_BxTxC, last_step)
+    batch_BxTxC = mx.broadcast_to(mx.reshape(mx.arange(B), (B, 1, 1)), out_shape)
+    chan_BxTxC = mx.broadcast_to(mx.reshape(mx.arange(C), (1, 1, C)), out_shape)
+    columns = [
+        mx.reshape(part, (-1,)) for part in (batch_BxTxC, t_idx_BxTxC, chan_BxTxC)
+    ]
+    indices_BTCx3 = mx.stack(columns, axis=1).astype(mx.int32)
+    return t_idx_BxTxC, indices_BTCx3
+def revert_audio_delay(
+    audio_BxTxC: mx.array,
+    pad_value: int,
+    precomp: tp.Tuple[mx.array, mx.array],
+    T: int,
+) -> mx.array:
+    """
+    Reverts a delay pattern from batched audio tokens using precomputed indices (MLX version).
+    Args:
+        audio_BxTxC: Input delayed audio tensor
+        pad_value: Value written wherever the precomputed time index is >= T
+        precomp: Precomputed revert indices tuple containing:
+            - t_idx_BxTxC: Time offset indices tensor
+            - indices_BTCx3: Gather indices tensor for original audio
+        T: Sequence length used as the out-of-bounds threshold
+    Returns:
+        Reverted audio tensor with same shape as input
+    """
+    t_idx_BxTxC, indices_BTCx3 = precomp
+    # Gather every (batch, time, channel) triple in one vectorized indexing
+    # operation instead of indexing element by element in a Python loop.
+    gathered_flat = audio_BxTxC[
+        indices_BTCx3[:, 0], indices_BTCx3[:, 1], indices_BTCx3[:, 2]
+    ]
+    gathered_BxTxC = mx.reshape(gathered_flat, audio_BxTxC.shape)
+    pad_tensor = mx.full(1, pad_value, dtype=audio_BxTxC.dtype)
+    T_tensor = mx.array(T)
+    # NOTE(review): build_revert_indices clamps t_idx to T - 1, so with indices
+    # built for the same T this mask never fires -- confirm that is intended.
+    result_BxTxC = mx.where(t_idx_BxTxC >= T_tensor, pad_tensor, gathered_BxTxC)
+    return result_BxTxC
+def decode(
+    model,
+    audio_codes,
+):
+    """
+    Turn a single frame of codes into an output audio waveform.
+    The codes go through `model.quantizer.from_codes`; the first element of that
+    result is then passed to `model.decode`, whose return value is returned.
+    Raises:
+        ValueError: if `audio_codes` does not contain exactly one frame.
+        Any exception raised by the model is printed and then re-raised.
+    """
+    frame_count = len(audio_codes)
+    if frame_count != 1:
+        raise ValueError(f"Expected one frame, got {frame_count}")
+    try:
+        quantized = model.quantizer.from_codes(audio_codes)
+        return model.decode(quantized[0])
+    except Exception as e:
+        # Report the failure, then let the caller see the original exception.
+        print(f"Error in decode method: {str(e)}")
+        raise
+def codebook_to_audio(
+    generated_codes: mx.array, model, delay_pattern, B=1, T=2600, C=9
+):
+    """Convert generated (delayed) codes into audio.
+    Steps: drop the leading BOS column, cap the length at T, undo the delay
+    pattern, trim the last 30 time steps, zero out codes outside [0, 1023],
+    then decode with `model`.
+    """
+    # Skip the BOS column and keep at most T steps
+    codes = generated_codes[:, 1:]
+    if codes.shape[1] > T:
+        codes = codes[:, :T]
+    seq_length = codes.shape[1]
+    # Undo the per-channel delay on a [1, seq_length, channels] view of the codes
+    precomp = build_revert_indices(
+        B=B, T=seq_length, C=C, delay_pattern=delay_pattern
+    )
+    reverted_codebook = revert_audio_delay(
+        audio_BxTxC=mx.expand_dims(mx.transpose(codes, (1, 0)), 0),
+        pad_value=0,
+        precomp=precomp,
+        T=seq_length,
+    )
+    # Drop the trailing 30 steps, then go back to a channel-major layout
+    codebook = mx.transpose(reverted_codebook[:, :-30, :], (0, 2, 1))
+    min_valid_index, max_valid_index = 0, 1023
+    invalid_mask = (codebook < min_valid_index) | (codebook > max_valid_index)
+    num_invalid = mx.sum(invalid_mask).item()
+    if num_invalid > 0:
+        print(
+            f"Warning: Clamping {num_invalid} indices outside range [{min_valid_index}, {max_valid_index}] to 0."
+        )
+    # Out-of-range entries are replaced with 0 before decoding
+    codebook = mx.where(invalid_mask, mx.zeros_like(codebook), codebook)
+    return decode(model, codebook)

nexaai/mlx_backend/mlx_audio/tts/models/dia/config.py ADDED Viewed

@@ -0,0 +1,256 @@
+"""Configuration management module for the Dia model.
+This module provides configuration management for the Dia model, built on
+frozen dataclasses. It defines configurations for data processing,
+model architecture (encoder and decoder), and training settings.
+Key components:
+- DataConfig: Parameters for data loading and preprocessing.
+- EncoderConfig: Architecture details for the encoder module.
+- DecoderConfig: Architecture details for the decoder module.
+- ModelConfig: Combined model architecture settings.
+- TrainingConfig: Training hyperparameters and settings.
+- DiaConfig: Master configuration combining all components.
+"""
+import json
+import os
+from dataclasses import dataclass, field
+from typing import List, Optional, Tuple
+@dataclass(frozen=True)
+class DataConfig:
+    """Settings for data loading and preprocessing.
+    Attributes:
+        text_length: Maximum text sequence length; rounded up to a multiple of 128.
+        audio_length: Maximum audio sequence length; rounded up to a multiple of 128.
+        channels: Number of audio channels.
+        text_pad_value: Padding value for text sequences.
+        audio_eos_value: End-of-audio token value.
+        audio_pad_value: Padding value for audio sequences.
+        audio_bos_value: Beginning-of-audio token value.
+        delay_pattern: Delay value for each audio channel.
+    """
+    text_length: int
+    audio_length: int
+    channels: int = 9
+    text_pad_value: int = 0
+    audio_eos_value: int = 1024
+    audio_pad_value: int = 1025
+    audio_bos_value: int = 1026
+    delay_pattern: List[int] = field(
+        default_factory=lambda: [0, 8, 9, 10, 11, 12, 13, 14, 15]
+    )
+    def __post_init__(self):
+        # The dataclass is frozen, so the rounded lengths have to be written
+        # through object.__setattr__.
+        for name in ("text_length", "audio_length"):
+            rounded = (getattr(self, name) + 127) // 128 * 128
+            object.__setattr__(self, name, rounded)
+    def __hash__(self) -> int:
+        """Hash every field; delay_pattern is converted to a tuple because lists are unhashable."""
+        key = (
+            self.text_length,
+            self.audio_length,
+            self.channels,
+            self.text_pad_value,
+            self.audio_pad_value,
+            self.audio_bos_value,
+            self.audio_eos_value,
+            tuple(self.delay_pattern),
+        )
+        return hash(key)
+@dataclass(frozen=True)
+class EncoderConfig:
+    """Configuration for the encoder component of the Dia model.
+    Frozen dataclass: the five size fields are required, the last two have defaults.
+    Attributes:
+        n_layer: Number of transformer layers.
+        n_embd: Embedding dimension.
+        n_hidden: Hidden dimension size in the MLP layers.
+        n_head: Number of attention heads.
+        head_dim: Dimension per attention head.
+        mlp_activations: List of activation functions for the MLP layers.
+            Defaults to ["silu", "linear"].
+        use_pre_norm: Whether to use pre-normalization (LayerNorm before attention/MLP).
+            Defaults to False.
+    """
+    n_layer: int
+    n_embd: int
+    n_hidden: int
+    n_head: int
+    head_dim: int
+    # default_factory gives every instance its own list
+    mlp_activations: List[str] = field(default_factory=lambda: ["silu", "linear"])
+    use_pre_norm: bool = False
+@dataclass(frozen=True)
+class DecoderConfig:
+    """Configuration for the decoder component of the Dia model.
+    Frozen dataclass: the eight size fields are required, the last two have defaults.
+    Attributes:
+        n_layer: Number of transformer layers.
+        n_embd: Embedding dimension.
+        n_hidden: Hidden dimension size in the MLP layers.
+        gqa_query_heads: Number of query heads for grouped-query self-attention.
+        kv_heads: Number of key/value heads for grouped-query self-attention.
+        gqa_head_dim: Dimension per query head for grouped-query self-attention.
+        cross_query_heads: Number of query heads for cross-attention.
+        cross_head_dim: Dimension per cross-attention head.
+        mlp_activations: List of activation functions for the MLP layers.
+            Defaults to ["silu", "linear"].
+        use_pre_norm: Whether to use pre-normalization. Defaults to False.
+    """
+    n_layer: int
+    n_embd: int
+    n_hidden: int
+    gqa_query_heads: int
+    kv_heads: int
+    gqa_head_dim: int
+    cross_query_heads: int
+    cross_head_dim: int
+    # default_factory gives every instance its own list
+    mlp_activations: List[str] = field(default_factory=lambda: ["silu", "linear"])
+    use_pre_norm: bool = False
+@dataclass(frozen=True)
+class ModelConfig:
+    """Main configuration container for the Dia model architecture.
+    Frozen dataclass: `encoder` and `decoder` are required, everything else has a default.
+    Attributes:
+        encoder: Configuration for the encoder component.
+        decoder: Configuration for the decoder component.
+        src_vocab_size: Size of the source (text) vocabulary.
+        tgt_vocab_size: Size of the target (audio code) vocabulary.
+        dropout: Dropout probability applied within the model.
+        normalization_layer_epsilon: Epsilon value for normalization layers (e.g., LayerNorm).
+        weight_dtype: Data type for model weights (e.g., "float32", "bfloat16").
+        rope_min_timescale: Minimum timescale for Rotary Positional Embeddings (RoPE).
+        rope_max_timescale: Maximum timescale for Rotary Positional Embeddings (RoPE).
+        sample_rate: Audio sample rate, default 44100 (presumably in Hz -- confirm
+            against the audio codec in use).
+    """
+    encoder: EncoderConfig
+    decoder: DecoderConfig
+    src_vocab_size: int = 128
+    tgt_vocab_size: int = 1028
+    dropout: float = 0.0
+    normalization_layer_epsilon: float = 1.0e-5
+    weight_dtype: str = "float32"
+    rope_min_timescale: int = 1
+    rope_max_timescale: int = 10_000
+    sample_rate: int = 44100
+@dataclass(frozen=True)
+class TrainingConfig:
+    """Training process configuration and hyperparameters.
+    Note: This configuration currently only includes precision settings.
+    Other training parameters (like batch size, learning rate, optimizer settings)
+    are assumed to be handled externally.
+    Attributes:
+        dtype: Data type for activations during training (e.g., "bfloat16", "float32").
+            Defaults to "bfloat16".
+        logits_dot_in_fp32: Whether to compute the final logits dot product in fp32 for stability.
+            Defaults to False.
+    """
+    dtype: str = "bfloat16"
+    logits_dot_in_fp32: bool = False
+@dataclass(frozen=True)
+class DiaConfig:
+    """Master configuration for the Dia model.
+    Combines all sub-configurations into a single frozen object.
+    Attributes:
+        model: Model architecture configuration.
+        training: Training process configuration (precision settings).
+        data: Data loading and processing configuration.
+        version: Configuration version string. Defaults to "1.0".
+    """
+    model: ModelConfig
+    training: TrainingConfig
+    data: DataConfig
+    version: str = "1.0"
+    def save(self, path: str) -> None:
+        """Save the current configuration instance to a JSON file.
+        Creates the parent directory when the path has one. The file extension
+        is not checked.
+        Args:
+            path: The target file path to save the configuration.
+        Raises:
+            OSError: If the directory cannot be created or the file cannot be written.
+        """
+        # os.path.dirname() returns "" for a bare filename, and os.makedirs("")
+        # raises FileNotFoundError, so only create a directory when there is one.
+        parent_dir = os.path.dirname(path)
+        if parent_dir:
+            os.makedirs(parent_dir, exist_ok=True)
+        config_dict = {
+            "version": self.version,
+            "model": {
+                "encoder": vars(self.model.encoder),
+                "decoder": vars(self.model.decoder),
+                "src_vocab_size": self.model.src_vocab_size,
+                "tgt_vocab_size": self.model.tgt_vocab_size,
+                "dropout": self.model.dropout,
+                "normalization_layer_epsilon": self.model.normalization_layer_epsilon,
+                "weight_dtype": self.model.weight_dtype,
+                "rope_min_timescale": self.model.rope_min_timescale,
+                "rope_max_timescale": self.model.rope_max_timescale,
+                "sample_rate": self.model.sample_rate,
+            },
+            "training": vars(self.training),
+            "data": vars(self.data),
+        }
+        with open(path, "w", encoding="utf-8") as f:
+            json.dump(config_dict, f, indent=2)
+    @classmethod
+    def load_dict(cls, config: dict) -> Optional["DiaConfig"]:
+        """Build a DiaConfig from a dictionary shaped like the output of save().
+        Args:
+            config: Mapping with "model" (including "encoder" and "decoder"),
+                "training" and "data" entries; "version" is optional.
+        Returns:
+            The DiaConfig instance, or None when a required key is missing or a
+            sub-config constructor rejects its arguments (KeyError / TypeError).
+        """
+        try:
+            model_section = config["model"]
+            # Everything in "model" besides the two nested configs is passed
+            # straight through as ModelConfig keyword arguments.
+            model_config = ModelConfig(
+                encoder=EncoderConfig(**model_section["encoder"]),
+                decoder=DecoderConfig(**model_section["decoder"]),
+                **{
+                    k: v
+                    for k, v in model_section.items()
+                    if k not in ["encoder", "decoder"]
+                },
+            )
+            return cls(
+                version=config.get("version", "1.0"),
+                model=model_config,
+                training=TrainingConfig(**config["training"]),
+                data=DataConfig(**config["data"]),
+            )
+        except (KeyError, TypeError):
+            return None
+    @classmethod
+    def load(cls, path: str) -> Optional["DiaConfig"]:
+        """Load a Dia configuration from a JSON file.
+        Args:
+            path: The path to the configuration file.
+        Returns:
+            A DiaConfig instance, or None when the file is not found or when
+            load_dict() rejects its content.
+        Raises:
+            json.JSONDecodeError: If the file is not valid JSON (a ValueError subclass).
+        """
+        try:
+            with open(path, "r", encoding="utf-8") as f:
+                config = json.load(f)
+        except FileNotFoundError:
+            return None
+        return cls.load_dict(config)