PyPI - nexaai - Versions diffs - 1.0.29__cp310-cp310-macosx_14_0_universal2.whl - Mend

nexaai 1.0.29__cp310-cp310-macosx_14_0_universal2.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (580) hide show

nexaai/mlx_backend/mlx_audio/tts/models/llama/llama.py ADDED Viewed

@@ -0,0 +1,324 @@
+import time
+from dataclasses import dataclass
+from typing import List, Optional
+import mlx.core as mx
+from mlx_lm.generate import stream_generate
+from mlx_lm.models.llama import Model as LlamaModel
+from mlx_lm.models.llama import ModelArgs as LlamaModelConfig
+from mlx_lm.sample_utils import make_logits_processors, make_sampler
+from tqdm import tqdm
+from transformers import AutoTokenizer
+from mlx_audio.codec.models.snac import SNAC
+from ..base import GenerationResult
+@dataclass
+class ModelConfig(LlamaModelConfig):
+    tokenizer_name: str = "mlx-community/orpheus-3b-0.1-ft-bf16"
+    sample_rate: int = 24000
+    def __post_init__(self):
+        if self.num_key_value_heads is None:
+            self.num_key_value_heads = self.num_attention_heads
+# Module-level SNAC codec shared by decode_audio_from_codes and encode_audio_to_codes.
+# NOTE: this runs at import time, so importing the module calls SNAC.from_pretrained.
+snac_model = SNAC.from_pretrained("mlx-community/snac_24khz").eval()
+def decode_audio_from_codes(code_list):
+    layer_1 = []
+    layer_2 = []
+    layer_3 = []
+    for i in range((len(code_list) + 1) // 7):
+        layer_1.append(code_list[7 * i])
+        layer_2.append(code_list[7 * i + 1] - 4096)
+        layer_3.append(code_list[7 * i + 2] - (2 * 4096))
+        layer_3.append(code_list[7 * i + 3] - (3 * 4096))
+        layer_2.append(code_list[7 * i + 4] - (4 * 4096))
+        layer_3.append(code_list[7 * i + 5] - (5 * 4096))
+        layer_3.append(code_list[7 * i + 6] - (6 * 4096))
+    codes = [
+        mx.expand_dims(mx.array(layer_1), 0),
+        mx.expand_dims(mx.array(layer_2), 0),
+        mx.expand_dims(mx.array(layer_3), 0),
+    ]
+    audio_hat = snac_model.decode(codes).squeeze(-1)
+    return audio_hat
+def encode_audio_to_codes(audio):
+    audio = audio[None, None, :]
+    codes = snac_model.encode(audio)
+    layer_1 = codes[0].squeeze(0).tolist()
+    layer_2 = codes[1].squeeze(0).tolist()
+    layer_3 = codes[2].squeeze(0).tolist()
+    code_list = []
+    num_groups = len(layer_1)
+    for i in range(num_groups):
+        code_list.append(layer_1[i])
+        code_list.append(layer_2[2 * i] + 4096)
+        code_list.append(layer_3[4 * i] + 2 * 4096)
+        code_list.append(layer_3[4 * i + 1] + 3 * 4096)
+        code_list.append(layer_2[2 * i + 1] + 4 * 4096)
+        code_list.append(layer_3[4 * i + 2] + 5 * 4096)
+        code_list.append(layer_3[4 * i + 3] + 6 * 4096)
+    return mx.array(code_list)[None, :]
+class Model(LlamaModel):
+    def __init__(self, config: ModelConfig, **kwargs):
+        super().__init__(config)
+        self.config = config
+        self.model_type = config.model_type
+        self.tokenizer = AutoTokenizer.from_pretrained(config.tokenizer_name)
+    @property
+    def layers(self):
+        return self.model.layers
+    @property
+    def sample_rate(self):
+        return self.config.sample_rate
+    def parse_output(self, input_ids):
+        token_to_find = 128257
+        token_to_remove = 128258
+        # MLX doesn't have nonzero, so we need to create indices manually
+        mask = input_ids == token_to_find
+        indices = []
+        for i in range(mask.shape[0]):
+            for j in range(mask.shape[1]):
+                if mask[i, j]:
+                    indices.append((i, j))
+        token_indices = [[], []]
+        for i, j in indices:
+            token_indices[0].append(i)
+            token_indices[1].append(j)
+        token_indices = mx.array(token_indices)
+        if len(token_indices[1]) > 0:
+            last_occurrence_idx = int(token_indices[1][-1])
+            cropped_tensor = input_ids[:, last_occurrence_idx + 1 :]
+        else:
+            cropped_tensor = input_ids
+        mask = cropped_tensor != token_to_remove
+        processed_rows = []
+        for row in cropped_tensor:
+            # Create a mask and filter manually since boolean indexing isn't supported
+            row_list = row.tolist()
+            masked_row = mx.array([val for val in row_list if val != token_to_remove])
+            processed_rows.append(masked_row)
+        code_lists = []
+        for row in processed_rows:
+            row_length = row.shape[0]
+            new_length = (row_length // 7) * 7
+            trimmed_row = row[:new_length]
+            trimmed_row = [t - 128266 for t in trimmed_row]
+            code_lists.append(trimmed_row)
+        return code_lists
+    def prepare_input_ids(
+        self,
+        prompts: List[str],
+        voice: Optional[str] = None,
+        ref_audio: Optional[mx.array] = None,
+        ref_text: Optional[str] = None,
+    ):
+        audio_input_ids = None
+        if ref_audio is not None and ref_text is not None:
+            print(
+                "\033[93mWARNING: Audio cloning doesn't work reliably on Orpheus.\033[0m \nA known issue affecting Torch and MLX versions. \nWill be fixed once the Canopy labs repo update their code or the model."
+            )
+            audio_input_ids = encode_audio_to_codes(ref_audio) + 128266
+            audio_transcript_ids = self.tokenizer(
+                ref_text, return_tensors="mlx"
+            ).input_ids
+        elif voice is not None:
+            prompts = [f"{voice}: " + p for p in prompts]
+        start_token = mx.array([[128259]], dtype=mx.int64)  # Start of human
+        end_tokens = mx.array(
+            [[128009, 128260]], dtype=mx.int64
+        )  # End of text, End of human
+        prompt_input_ids = []
+        for prompt in prompts:
+            prompt_input_ids.append(
+                self.tokenizer(prompt, return_tensors="mlx").input_ids
+            )
+        batch_input_ids = []
+        pad_token = mx.array([128263], dtype=mx.int64)
+        max_len = max([p.shape[1] for p in prompt_input_ids])
+        for input_ids in prompt_input_ids:
+            modified_input_ids = []
+            padding_len = max_len - input_ids.shape[1]
+            if padding_len > 0:
+                modified_input_ids.append(mx.repeat(pad_token, padding_len)[None, :])
+            # reference audio and transcript
+            if audio_input_ids is not None:
+                audio_start_tokens = mx.array([[128261, 128257]], dtype=mx.int64)
+                audio_end_tokens = mx.array([[128258, 128262]], dtype=mx.int64)
+                ref_input_ids = mx.concatenate(
+                    [
+                        start_token,
+                        audio_transcript_ids,
+                        end_tokens,
+                        audio_start_tokens,
+                        audio_input_ids,
+                        audio_end_tokens,
+                    ],
+                    axis=1,
+                )
+                modified_input_ids.append(ref_input_ids)
+            # prompt
+            one_prompt_input_ids = mx.concatenate(
+                [start_token, input_ids, end_tokens], axis=1
+            )  # SOH SOT Text EOT EOH
+            modified_input_ids.append(one_prompt_input_ids)
+            batch_input_ids.append(mx.concatenate(modified_input_ids, axis=1))
+        batch_input_ids = mx.concatenate(batch_input_ids, axis=0)
+        batch_mask = mx.where(batch_input_ids == pad_token, False, True)
+        return batch_input_ids, batch_mask
+    def generate(
+        self,
+        text,
+        voice: str,
+        temperature: float = 0.6,
+        top_p: float = 0.8,
+        split_pattern: str = "\n",
+        max_tokens: int = 1200,
+        verbose: bool = False,
+        ref_audio: mx.array = None,
+        ref_text: Optional[str] = None,
+        **kwargs,
+    ):
+        prompt = text.replace("\\n", "\n").replace("\\t", "\t")
+        prompts = prompt.split(split_pattern)
+        input_ids, _ = self.prepare_input_ids(
+            prompts,
+            voice,
+            ref_audio,
+            ref_text,
+        )
+        sampler = make_sampler(temperature, top_p, top_k=kwargs.get("top_k", -1))
+        logits_processors = make_logits_processors(
+            kwargs.get("logit_bias", None),
+            kwargs.get("repetition_penalty", 1.3),
+            kwargs.get("repetition_context_size", 20),
+        )
+        time_start = time.time()
+        # TODO: Support batch processing as in the Colab: https://github.com/canopyai/Orpheus-TTS
+        for i, response in enumerate(
+            tqdm(
+                stream_generate(
+                    self,
+                    tokenizer=self.tokenizer,
+                    prompt=input_ids.squeeze(0),
+                    max_tokens=max_tokens,
+                    sampler=sampler,
+                    logits_processors=logits_processors,
+                ),
+                total=max_tokens,
+                disable=not verbose,
+            )
+        ):
+            next_token = mx.array([response.token])
+            input_ids = mx.concatenate([input_ids, next_token[None, :]], axis=1)
+            if i % 50 == 0:
+                mx.clear_cache()
+            if next_token == 128258:
+                break
+        code_lists = self.parse_output(input_ids)
+        my_samples = []
+        for code_list in code_lists:
+            samples = decode_audio_from_codes(code_list)
+            my_samples.append(samples)
+        time_end = time.time()
+        if len(prompts) != len(my_samples):
+            raise Exception("Number of prompts and samples do not match")
+        else:
+            for i in range(len(my_samples)):
+                audio = my_samples[i][0]
+                samples = audio.shape[0] if audio is not None else 0
+                assert samples > 0, "No audio generated"
+                # Calculate token count
+                token_count = input_ids.shape[1] if input_ids is not None else 0
+                # Calculate audio duration in seconds
+                sample_rate = self.config.sample_rate
+                audio_duration_seconds = samples / sample_rate
+                # Calculate real-time factor (RTF)
+                rtf = audio_duration_seconds / (time_end - time_start)
+                # Format duration as HH:MM:SS.mmm
+                duration_mins = int(audio_duration_seconds // 60)
+                duration_secs = int(audio_duration_seconds % 60)
+                duration_ms = int((audio_duration_seconds % 1) * 1000)
+                duration_hours = int(audio_duration_seconds // 3600)
+                duration_str = f"{duration_hours:02d}:{duration_mins:02d}:{duration_secs:02d}.{duration_ms:03d}"
+                yield GenerationResult(
+                    audio=audio,
+                    samples=samples,
+                    sample_rate=sample_rate,
+                    segment_idx=i,
+                    token_count=token_count,
+                    audio_duration=duration_str,
+                    real_time_factor=rtf,
+                    prompt={
+                        "tokens": token_count,
+                        "tokens-per-sec": (
+                            round(token_count / audio_duration_seconds, 2)
+                            if audio_duration_seconds > 0
+                            else 0
+                        ),
+                    },
+                    audio_samples={
+                        "samples": samples,
+                        "samples-per-sec": (
+                            round(samples / audio_duration_seconds, 2)
+                            if audio_duration_seconds > 0
+                            else 0
+                        ),
+                    },
+                    processing_time_seconds=time_end - time_start,
+                    peak_memory_usage=mx.get_peak_memory() / 1e9,
+                )
+                # Clear cache after each segment to avoid memory leaks
+                mx.clear_cache()

nexaai/mlx_backend/mlx_audio/tts/models/outetts/__init__.py ADDED Viewed

	@@ -0,0 +1 @@
1	+ from .outetts import Model, ModelConfig

nexaai/mlx_backend/mlx_audio/tts/models/outetts/audio_processor.py ADDED Viewed

@@ -0,0 +1,351 @@
+import io
+import json
+import os
+from dataclasses import asdict
+from typing import Union
+import mlx.core as mx
+import numpy as np
+from mlx_audio.stt.utils import SAMPLE_RATE as WHISPER_SAMPLE_RATE
+from mlx_audio.stt.utils import load_model, resample_audio
+from .dac_interface import DacInterface
+from .prompt_processor import PromptProcessor
+def calculate_pitch(
+    audio_array: mx.array,
+    sr: int,
+    min_freq: float = 75.0,
+    max_freq: float = 600.0,
+    frame_length: int = 400,
+    hop_length: int = 160,
+    threshold: float = 0.3,
+) -> mx.array:
+    """
+    Calculate pitch frequencies for short audio clips using autocorrelation.
+    Args:
+        audio_array: Input audio array (1D or 2D [channels, samples])
+        sr: Sampling rate
+        min_freq: Minimum detectable frequency (Hz)
+        max_freq: Maximum detectable frequency (Hz)
+        frame_length: Analysis frame length in samples
+        hop_length: Hop size in samples
+        threshold: Voicing threshold (0.0-1.0)
+    Returns:
+        Array of pitch values (Hz) per frame
+    """
+    audio_np = np.array(audio_array)
+    # convert to mono and ensure 1D
+    if len(audio_np.shape) > 1:
+        audio_np = np.mean(audio_np, axis=0)
+    audio_np = np.squeeze(audio_np)
+    num_samples = audio_np.shape[-1]
+    pad_len = (frame_length - (num_samples % hop_length)) % hop_length
+    audio_np = np.pad(audio_np, (0, pad_len))
+    num_frames = (len(audio_np) - frame_length) // hop_length + 1
+    frames = np.zeros((num_frames, frame_length))
+    for i in range(num_frames):
+        frames[i] = audio_np[i * hop_length : i * hop_length + frame_length]
+    window = np.hanning(frame_length)
+    frames_windowed = frames * window
+    # compute autocorrelation using FFT
+    fft_frames = np.fft.rfft(frames_windowed, n=2 * frame_length, axis=1)
+    power_spectrum = fft_frames.real**2 + fft_frames.imag**2
+    autocorr = np.fft.irfft(power_spectrum, axis=1)[:, :frame_length]
+    # find valid frequency range indices
+    min_idx = max(1, int(sr / max_freq))
+    max_idx = min(frame_length, int(sr / min_freq))
+    # find peak indices in valid range
+    relevant_autocorr = autocorr[:, min_idx:max_idx]
+    peak_indices = np.argmax(relevant_autocorr, axis=1) + min_idx
+    peak_values = np.array([autocorr[i, peak_indices[i]] for i in range(num_frames)])
+    # parabolic interpolation for sub-sample accuracy
+    indices = np.clip(peak_indices, 1, frame_length - 2)
+    alpha = np.array([autocorr[i, indices[i] - 1] for i in range(num_frames)])
+    beta = np.array([autocorr[i, indices[i]] for i in range(num_frames)])
+    gamma = np.array([autocorr[i, indices[i] + 1] for i in range(num_frames)])
+    delta = 0.5 * (alpha - gamma) / (alpha - 2 * beta + gamma + 1e-8)
+    valid_mask = (peak_indices > 0) & (peak_indices < frame_length - 1)
+    delta = np.where(valid_mask, delta, 0.0)
+    # calculate final periods and pitches
+    best_period = (peak_indices + delta) / sr
+    pitch = np.where(best_period > 0, 1.0 / best_period, 0.0)
+    # apply voicing threshold
+    autocorr_0 = autocorr[:, 0]
+    voiced = (peak_values / (autocorr_0 + 1e-8)) > threshold
+    pitch = np.where(voiced, pitch, 0.0)
+    # clamp valid frequencies
+    pitch = np.clip(pitch, min_freq, max_freq)
+    return mx.array(pitch)
+def extract_single_pitch_value(
+    audio_array: mx.array,
+    sr: int,
+    min_freq: float = 75.0,
+    max_freq: float = 600.0,
+    frame_length: int = 400,
+    hop_length: int = 160,
+    threshold: float = 0.3,
+) -> float:
+    """
+    Calculates the average pitch of an audio array and normalizes it to 0-1 range.
+    Args:
+        audio_array: Input audio array (1D or 2D [channels, samples])
+        sr: Sampling rate
+        min_freq: Minimum detectable frequency (Hz)
+        max_freq: Maximum detectable frequency (Hz)
+        frame_length: Analysis frame length in samples
+        hop_length: Hop size in samples
+        threshold: Voicing threshold (0.0-1.0)
+    Returns:
+        A single float value representing the normalized average pitch (0.0-1.0).
+    """
+    pitch_array = calculate_pitch(
+        audio_array, sr, min_freq, max_freq, frame_length, hop_length, threshold
+    )
+    # calculate the average pitch across frames
+    average_pitch = float(mx.mean(pitch_array))
+    # normalize to 0-1 range
+    normalized_pitch = (average_pitch - min_freq) / (max_freq - min_freq)
+    # clamp to ensure it's strictly within 0-1
+    normalized_pitch = min(max(normalized_pitch, 0.0), 1.0)
+    return normalized_pitch
+class Features:
+    def __init__(self):
+        self.eps = 1e-10
+    def scale_values(self, value: float) -> int:
+        """
+        Scale a value from [0,1] to [0,100] and round to nearest integer
+        """
+        return round(value * 100)
+    def features_to_tokens(self, features: dict) -> list:
+        """
+        Convert features to token strings in format <|feature_value|>
+        """
+        return [f"<|{name}_{value}|>" for name, value in features.items()]
+    def validate_audio(self, audio: mx.array) -> bool:
+        if audio is None or not isinstance(audio, mx.array):
+            return False
+        if audio.size == 0:  # Check if array is empty
+            return False
+        audio_np = np.array(audio)
+        if np.isnan(audio_np).any() or np.isinf(audio_np).any():
+            return False
+        return True
+    def get_default_features(self) -> dict:
+        """
+        Return default feature values when audio is invalid
+        """
+        return {"energy": 0, "spectral_centroid": 0, "pitch": 0}
+    def extract_audio_features(self, audio: mx.array, sr: int) -> dict:
+        """
+        Extract fast-to-compute features from audio segments.
+        Each feature is normalized to [0, 1] range.
+        Args:
+            audio: Audio array of shape [channels, samples]
+            sr: Sample rate
+        Returns:
+            Dictionary of features, each as a single float value
+        """
+        if not self.validate_audio(audio):
+            return self.get_default_features()
+        audio_np = np.array(audio)
+        # convert to mono if stereo
+        if len(audio_np.shape) == 2 and audio_np.shape[0] > 1:
+            audio_np = np.mean(audio_np, axis=0, keepdims=True)
+        audio = mx.array(audio_np)
+        features = {}
+        # rms energy (loudness) - normalized to [0, 1]
+        features["energy"] = float(mx.sqrt(mx.mean(audio**2)))
+        # spectral centroid - normalized to [0, 1]
+        spec_np = np.abs(np.fft.rfft(audio_np))
+        freqs_np = np.linspace(0, sr / 2, spec_np.shape[-1])
+        spec_sum = np.sum(spec_np) + self.eps
+        centroid = np.sum(freqs_np * spec_np.squeeze()) / spec_sum
+        features["spectral_centroid"] = float(centroid / (sr / 2))
+        # pitch - normalized to [0, 1]
+        features["pitch"] = extract_single_pitch_value(audio, sr)
+        # scale values to 0-100 range
+        for name, value in features.items():
+            features[name] = self.scale_values(value)
+        return features
+class AudioProcessor:
+    def __init__(
+        self, audio_codec_path: str = "mlx-community/dac-speech-24khz-1.5kbps"
+    ):
+        self.features = Features()
+        self.audio_codec = DacInterface(audio_codec_path)
+    def create_speaker_from_whisper(
+        self,
+        audio: str,
+        whisper_model: str = "mlx-community/whisper-large-v3-turbo",
+    ):
+        if isinstance(audio, str):
+            audio = self.audio_codec.load_audio(audio)
+        else:
+            # resample audio to 16000 for whisper
+            resampled_audio = resample_audio(
+                audio[..., None], self.audio_codec.sr, WHISPER_SAMPLE_RATE
+            )
+            resampled_audio = mx.array(resampled_audio, dtype=mx.float32).mean(axis=1)
+            # convert to 2d array
+            audio = audio[None, None, ...]
+        seconds = audio.flatten().shape[0] / self.audio_codec.sr
+        if seconds > 20:
+            print(
+                "Speaker audio is longer than 20 seconds. Use a shorter clip for best results."
+            )
+        if seconds > 15:
+            print(
+                "Speaker audio is longer than 15 seconds. For best results, consider using an audio clip up to 15 seconds."
+            )
+        # load whisper model
+        whisper_model = load_model(whisper_model)
+        # transcribe audio
+        data = whisper_model.generate(resampled_audio.flatten(), word_timestamps=True)
+        data = asdict(data)
+        # clear memory
+        del whisper_model
+        mx.clear_cache()
+        text = PromptProcessor.text_normalizations(data["text"])
+        words = []
+        for s in data["segments"]:
+            words.extend(
+                [
+                    {
+                        "word": i["word"].strip(),
+                        "start": float(i["start"]),
+                        "end": float(i["end"]),
+                    }
+                    for i in s["words"]
+                ]
+            )
+        return self.create_speaker_from_dict(
+            {"audio": {"bytes": audio}, "text": text, "words": words}
+        )
+    def create_speaker_from_dict(self, data: dict):
+        audio = data["audio"]["bytes"]
+        if isinstance(audio, str):
+            audio = io.BytesIO(audio)
+            audio = self.audio_codec.load_audio(audio)
+        full_codes = self.audio_codec.encode(audio, verbose=True).tolist()[0]
+        c1 = full_codes[0]
+        c2 = full_codes[1]
+        sr = self.audio_codec.sr
+        text = data["text"]
+        words = data["words"]
+        tps = 75
+        audio = audio.squeeze(0)
+        global_features = self.features.extract_audio_features(audio, sr)
+        start = None
+        word_codes = []
+        max_extension = 20
+        for idx, i in enumerate(words):
+            if start is None:
+                start = max(0, int(i["start"] * tps) - max_extension)
+            word = i["word"].strip()
+            if idx == len(words) - 1:
+                end = min(len(c1), int(i["end"] * tps) + max_extension)
+            else:
+                end = int(i["end"] * tps)
+            word_c1 = c1[start:end]
+            word_c2 = c2[start:end]
+            word_audio = audio[:, int(i["start"] * sr) : int(i["end"] * sr)]
+            features = self.features.extract_audio_features(word_audio, sr)
+            start = end
+            word_codes.append(
+                {
+                    "word": word,
+                    "duration": round(len(word_c1) / tps, 2),
+                    "c1": word_c1,
+                    "c2": word_c2,
+                    "features": features,
+                }
+            )
+        return {"text": text, "words": word_codes, "global_features": global_features}
+    def save_speaker(self, speaker: dict, path: str):
+        # Expand ~ to home directory to save in ~/.cache/mlx_audio/voices
+        path = os.path.expanduser(path)
+        os.makedirs(os.path.dirname(path), exist_ok=True)
+        with open(path, "w") as f:
+            json.dump(speaker, f)
+        print(f"Speaker saved to: {path}")
+    def load_speaker(self, path: str):
+        # Expand ~ to home directory to load from ~/.cache/mlx_audio/voices
+        path = os.path.expanduser(path)
+        if not os.path.exists(path):
+            raise FileNotFoundError(f"Speaker file not found: {path}")
+        with open(path, "r") as f:
+            return json.load(f)
+        print(f"Speaker loaded from: {path}")