nexaai 1.0.29__cp310-cp310-macosx_14_0_universal2.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- nexaai/__init__.py +99 -0
- nexaai/_stub.cpython-310-darwin.so +0 -0
- nexaai/_version.py +4 -0
- nexaai/asr.py +68 -0
- nexaai/asr_impl/__init__.py +0 -0
- nexaai/asr_impl/mlx_asr_impl.py +93 -0
- nexaai/asr_impl/pybind_asr_impl.py +127 -0
- nexaai/base.py +39 -0
- nexaai/binds/__init__.py +7 -0
- nexaai/binds/asr_bind.cpython-310-darwin.so +0 -0
- nexaai/binds/common_bind.cpython-310-darwin.so +0 -0
- nexaai/binds/cpu_gpu/libggml-base.dylib +0 -0
- nexaai/binds/cpu_gpu/libggml-cpu.so +0 -0
- nexaai/binds/cpu_gpu/libggml-metal.so +0 -0
- nexaai/binds/cpu_gpu/libggml.dylib +0 -0
- nexaai/binds/cpu_gpu/libmtmd.dylib +0 -0
- nexaai/binds/cpu_gpu/libnexa_cpu_gpu.dylib +0 -0
- nexaai/binds/cpu_gpu/libnexa_plugin.dylib +0 -0
- nexaai/binds/cv_bind.cpython-310-darwin.so +0 -0
- nexaai/binds/diarize_bind.cpython-310-darwin.so +0 -0
- nexaai/binds/embedder_bind.cpython-310-darwin.so +0 -0
- nexaai/binds/libnexa_bridge.dylib +0 -0
- nexaai/binds/llm_bind.cpython-310-darwin.so +0 -0
- nexaai/binds/metal/libnexa_plugin.dylib +0 -0
- nexaai/binds/metal/py-lib/ml.py +888 -0
- nexaai/binds/metal/py-lib/mlx_audio/__init__.py +0 -0
- nexaai/binds/metal/py-lib/mlx_audio/codec/__init__.py +1 -0
- nexaai/binds/metal/py-lib/mlx_audio/codec/models/__init__.py +5 -0
- nexaai/binds/metal/py-lib/mlx_audio/codec/models/bigvgan/__init__.py +1 -0
- nexaai/binds/metal/py-lib/mlx_audio/codec/models/bigvgan/activation.py +51 -0
- nexaai/binds/metal/py-lib/mlx_audio/codec/models/bigvgan/amp.py +96 -0
- nexaai/binds/metal/py-lib/mlx_audio/codec/models/bigvgan/bigvgan.py +149 -0
- nexaai/binds/metal/py-lib/mlx_audio/codec/models/bigvgan/conv.py +114 -0
- nexaai/binds/metal/py-lib/mlx_audio/codec/models/bigvgan/resample.py +177 -0
- nexaai/binds/metal/py-lib/mlx_audio/codec/models/descript/__init__.py +1 -0
- nexaai/binds/metal/py-lib/mlx_audio/codec/models/descript/base.py +228 -0
- nexaai/binds/metal/py-lib/mlx_audio/codec/models/descript/dac.py +285 -0
- nexaai/binds/metal/py-lib/mlx_audio/codec/models/descript/nn/__init__.py +1 -0
- nexaai/binds/metal/py-lib/mlx_audio/codec/models/descript/nn/layers.py +129 -0
- nexaai/binds/metal/py-lib/mlx_audio/codec/models/descript/nn/quantize.py +149 -0
- nexaai/binds/metal/py-lib/mlx_audio/codec/models/encodec/__init__.py +1 -0
- nexaai/binds/metal/py-lib/mlx_audio/codec/models/encodec/encodec.py +777 -0
- nexaai/binds/metal/py-lib/mlx_audio/codec/models/mimi/__init__.py +1 -0
- nexaai/binds/metal/py-lib/mlx_audio/codec/models/mimi/mimi.py +286 -0
- nexaai/binds/metal/py-lib/mlx_audio/codec/models/mimi/modules/__init__.py +20 -0
- nexaai/binds/metal/py-lib/mlx_audio/codec/models/mimi/modules/conv.py +398 -0
- nexaai/binds/metal/py-lib/mlx_audio/codec/models/mimi/modules/kv_cache.py +199 -0
- nexaai/binds/metal/py-lib/mlx_audio/codec/models/mimi/modules/quantization.py +179 -0
- nexaai/binds/metal/py-lib/mlx_audio/codec/models/mimi/modules/seanet.py +314 -0
- nexaai/binds/metal/py-lib/mlx_audio/codec/models/mimi/modules/transformer.py +256 -0
- nexaai/binds/metal/py-lib/mlx_audio/codec/models/s3/__init__.py +1 -0
- nexaai/binds/metal/py-lib/mlx_audio/codec/models/s3/model.py +260 -0
- nexaai/binds/metal/py-lib/mlx_audio/codec/models/s3/model_v2.py +383 -0
- nexaai/binds/metal/py-lib/mlx_audio/codec/models/s3/utils.py +122 -0
- nexaai/binds/metal/py-lib/mlx_audio/codec/models/snac/__init__.py +1 -0
- nexaai/binds/metal/py-lib/mlx_audio/codec/models/snac/attention.py +97 -0
- nexaai/binds/metal/py-lib/mlx_audio/codec/models/snac/layers.py +306 -0
- nexaai/binds/metal/py-lib/mlx_audio/codec/models/snac/snac.py +154 -0
- nexaai/binds/metal/py-lib/mlx_audio/codec/models/snac/vq.py +135 -0
- nexaai/binds/metal/py-lib/mlx_audio/codec/models/vocos/__init__.py +1 -0
- nexaai/binds/metal/py-lib/mlx_audio/codec/models/vocos/mel.py +33 -0
- nexaai/binds/metal/py-lib/mlx_audio/codec/models/vocos/vocos.py +359 -0
- nexaai/binds/metal/py-lib/mlx_audio/codec/tests/__init__.py +0 -0
- nexaai/binds/metal/py-lib/mlx_audio/codec/tests/test_bigvgan.py +54 -0
- nexaai/binds/metal/py-lib/mlx_audio/codec/tests/test_descript.py +109 -0
- nexaai/binds/metal/py-lib/mlx_audio/codec/tests/test_encodec.py +58 -0
- nexaai/binds/metal/py-lib/mlx_audio/codec/tests/test_mimi.py +22 -0
- nexaai/binds/metal/py-lib/mlx_audio/codec/tests/test_s3.py +25 -0
- nexaai/binds/metal/py-lib/mlx_audio/codec/tests/test_snac.py +40 -0
- nexaai/binds/metal/py-lib/mlx_audio/codec/tests/test_vocos.py +93 -0
- nexaai/binds/metal/py-lib/mlx_audio/server.py +525 -0
- nexaai/binds/metal/py-lib/mlx_audio/sts/__init__.py +0 -0
- nexaai/binds/metal/py-lib/mlx_audio/sts/tests/test_voice_pipeline.py +156 -0
- nexaai/binds/metal/py-lib/mlx_audio/sts/voice_pipeline.py +327 -0
- nexaai/binds/metal/py-lib/mlx_audio/stt/__init__.py +0 -0
- nexaai/binds/metal/py-lib/mlx_audio/stt/generate.py +174 -0
- nexaai/binds/metal/py-lib/mlx_audio/stt/models/__init__.py +0 -0
- nexaai/binds/metal/py-lib/mlx_audio/stt/models/parakeet/__init__.py +1 -0
- nexaai/binds/metal/py-lib/mlx_audio/stt/models/parakeet/alignment.py +248 -0
- nexaai/binds/metal/py-lib/mlx_audio/stt/models/parakeet/attention.py +187 -0
- nexaai/binds/metal/py-lib/mlx_audio/stt/models/parakeet/audio.py +76 -0
- nexaai/binds/metal/py-lib/mlx_audio/stt/models/parakeet/conformer.py +331 -0
- nexaai/binds/metal/py-lib/mlx_audio/stt/models/parakeet/ctc.py +34 -0
- nexaai/binds/metal/py-lib/mlx_audio/stt/models/parakeet/parakeet.py +604 -0
- nexaai/binds/metal/py-lib/mlx_audio/stt/models/parakeet/rnnt.py +157 -0
- nexaai/binds/metal/py-lib/mlx_audio/stt/models/parakeet/tokenizer.py +2 -0
- nexaai/binds/metal/py-lib/mlx_audio/stt/models/wav2vec/feature_extractor.py +757 -0
- nexaai/binds/metal/py-lib/mlx_audio/stt/models/wav2vec/wav2vec.py +738 -0
- nexaai/binds/metal/py-lib/mlx_audio/stt/models/whisper/__init__.py +1 -0
- nexaai/binds/metal/py-lib/mlx_audio/stt/models/whisper/audio.py +82 -0
- nexaai/binds/metal/py-lib/mlx_audio/stt/models/whisper/decoding.py +742 -0
- nexaai/binds/metal/py-lib/mlx_audio/stt/models/whisper/timing.py +329 -0
- nexaai/binds/metal/py-lib/mlx_audio/stt/models/whisper/tokenizer.py +398 -0
- nexaai/binds/metal/py-lib/mlx_audio/stt/models/whisper/whisper.py +862 -0
- nexaai/binds/metal/py-lib/mlx_audio/stt/models/whisper/writers.py +268 -0
- nexaai/binds/metal/py-lib/mlx_audio/stt/tests/test_models.py +381 -0
- nexaai/binds/metal/py-lib/mlx_audio/stt/utils.py +195 -0
- nexaai/binds/metal/py-lib/mlx_audio/tts/__init__.py +1 -0
- nexaai/binds/metal/py-lib/mlx_audio/tts/audio_player.py +120 -0
- nexaai/binds/metal/py-lib/mlx_audio/tts/convert.py +71 -0
- nexaai/binds/metal/py-lib/mlx_audio/tts/generate.py +449 -0
- nexaai/binds/metal/py-lib/mlx_audio/tts/models/__init__.py +0 -0
- nexaai/binds/metal/py-lib/mlx_audio/tts/models/bark/__init__.py +4 -0
- nexaai/binds/metal/py-lib/mlx_audio/tts/models/bark/bark.py +528 -0
- nexaai/binds/metal/py-lib/mlx_audio/tts/models/bark/isftnet.py +12 -0
- nexaai/binds/metal/py-lib/mlx_audio/tts/models/bark/pipeline.py +442 -0
- nexaai/binds/metal/py-lib/mlx_audio/tts/models/base.py +84 -0
- nexaai/binds/metal/py-lib/mlx_audio/tts/models/dia/__init__.py +1 -0
- nexaai/binds/metal/py-lib/mlx_audio/tts/models/dia/audio.py +287 -0
- nexaai/binds/metal/py-lib/mlx_audio/tts/models/dia/config.py +256 -0
- nexaai/binds/metal/py-lib/mlx_audio/tts/models/dia/dia.py +592 -0
- nexaai/binds/metal/py-lib/mlx_audio/tts/models/dia/layers.py +870 -0
- nexaai/binds/metal/py-lib/mlx_audio/tts/models/indextts/__init__.py +3 -0
- nexaai/binds/metal/py-lib/mlx_audio/tts/models/indextts/attention.py +180 -0
- nexaai/binds/metal/py-lib/mlx_audio/tts/models/indextts/bigvgan.py +124 -0
- nexaai/binds/metal/py-lib/mlx_audio/tts/models/indextts/conformer.py +247 -0
- nexaai/binds/metal/py-lib/mlx_audio/tts/models/indextts/ecapa_tdnn/__init__.py +0 -0
- nexaai/binds/metal/py-lib/mlx_audio/tts/models/indextts/ecapa_tdnn/asp.py +59 -0
- nexaai/binds/metal/py-lib/mlx_audio/tts/models/indextts/ecapa_tdnn/ecapa_tdnn.py +91 -0
- nexaai/binds/metal/py-lib/mlx_audio/tts/models/indextts/ecapa_tdnn/se_res2net.py +132 -0
- nexaai/binds/metal/py-lib/mlx_audio/tts/models/indextts/ecapa_tdnn/tdnn.py +42 -0
- nexaai/binds/metal/py-lib/mlx_audio/tts/models/indextts/gpt2.py +38 -0
- nexaai/binds/metal/py-lib/mlx_audio/tts/models/indextts/indextts.py +412 -0
- nexaai/binds/metal/py-lib/mlx_audio/tts/models/indextts/mel.py +37 -0
- nexaai/binds/metal/py-lib/mlx_audio/tts/models/indextts/normalize.py +294 -0
- nexaai/binds/metal/py-lib/mlx_audio/tts/models/indextts/perceiver.py +62 -0
- nexaai/binds/metal/py-lib/mlx_audio/tts/models/interpolate.py +108 -0
- nexaai/binds/metal/py-lib/mlx_audio/tts/models/kokoro/__init__.py +4 -0
- nexaai/binds/metal/py-lib/mlx_audio/tts/models/kokoro/istftnet.py +979 -0
- nexaai/binds/metal/py-lib/mlx_audio/tts/models/kokoro/kokoro.py +331 -0
- nexaai/binds/metal/py-lib/mlx_audio/tts/models/kokoro/modules.py +659 -0
- nexaai/binds/metal/py-lib/mlx_audio/tts/models/kokoro/pipeline.py +453 -0
- nexaai/binds/metal/py-lib/mlx_audio/tts/models/kokoro/voice.py +113 -0
- nexaai/binds/metal/py-lib/mlx_audio/tts/models/llama/__init__.py +3 -0
- nexaai/binds/metal/py-lib/mlx_audio/tts/models/llama/llama.py +324 -0
- nexaai/binds/metal/py-lib/mlx_audio/tts/models/outetts/__init__.py +1 -0
- nexaai/binds/metal/py-lib/mlx_audio/tts/models/outetts/audio_processor.py +351 -0
- nexaai/binds/metal/py-lib/mlx_audio/tts/models/outetts/dac_interface.py +162 -0
- nexaai/binds/metal/py-lib/mlx_audio/tts/models/outetts/outetts.py +255 -0
- nexaai/binds/metal/py-lib/mlx_audio/tts/models/outetts/prompt_processor.py +181 -0
- nexaai/binds/metal/py-lib/mlx_audio/tts/models/outetts/tokens.py +36 -0
- nexaai/binds/metal/py-lib/mlx_audio/tts/models/sesame/__init__.py +3 -0
- nexaai/binds/metal/py-lib/mlx_audio/tts/models/sesame/attention.py +195 -0
- nexaai/binds/metal/py-lib/mlx_audio/tts/models/sesame/sesame.py +633 -0
- nexaai/binds/metal/py-lib/mlx_audio/tts/models/sesame/watermarking.py +105 -0
- nexaai/binds/metal/py-lib/mlx_audio/tts/models/spark/__init__.py +1 -0
- nexaai/binds/metal/py-lib/mlx_audio/tts/models/spark/audio_tokenizer.py +138 -0
- nexaai/binds/metal/py-lib/mlx_audio/tts/models/spark/bicodec.py +269 -0
- nexaai/binds/metal/py-lib/mlx_audio/tts/models/spark/modules/__init__.py +0 -0
- nexaai/binds/metal/py-lib/mlx_audio/tts/models/spark/modules/blocks/__init__.py +0 -0
- nexaai/binds/metal/py-lib/mlx_audio/tts/models/spark/modules/blocks/sampler.py +111 -0
- nexaai/binds/metal/py-lib/mlx_audio/tts/models/spark/modules/encoder_decoder/__init__.py +0 -0
- nexaai/binds/metal/py-lib/mlx_audio/tts/models/spark/modules/encoder_decoder/feat_decoder.py +120 -0
- nexaai/binds/metal/py-lib/mlx_audio/tts/models/spark/modules/encoder_decoder/feat_encoder.py +136 -0
- nexaai/binds/metal/py-lib/mlx_audio/tts/models/spark/modules/encoder_decoder/wave_generator.py +113 -0
- nexaai/binds/metal/py-lib/mlx_audio/tts/models/spark/modules/finite_scalar_quantization.py +238 -0
- nexaai/binds/metal/py-lib/mlx_audio/tts/models/spark/modules/residual.py +209 -0
- nexaai/binds/metal/py-lib/mlx_audio/tts/models/spark/modules/residual_fsq.py +309 -0
- nexaai/binds/metal/py-lib/mlx_audio/tts/models/spark/modules/speaker/__init__.py +1 -0
- nexaai/binds/metal/py-lib/mlx_audio/tts/models/spark/modules/speaker/ecapa_tdnn.py +283 -0
- nexaai/binds/metal/py-lib/mlx_audio/tts/models/spark/modules/speaker/perceiver_encoder.py +326 -0
- nexaai/binds/metal/py-lib/mlx_audio/tts/models/spark/modules/speaker/pooling_layers.py +297 -0
- nexaai/binds/metal/py-lib/mlx_audio/tts/models/spark/modules/speaker/speaker_encoder.py +155 -0
- nexaai/binds/metal/py-lib/mlx_audio/tts/models/spark/spark.py +382 -0
- nexaai/binds/metal/py-lib/mlx_audio/tts/models/spark/utils/audio.py +220 -0
- nexaai/binds/metal/py-lib/mlx_audio/tts/models/spark/utils/file.py +221 -0
- nexaai/binds/metal/py-lib/mlx_audio/tts/models/spark/utils/token_parser.py +181 -0
- nexaai/binds/metal/py-lib/mlx_audio/tts/tests/__init__.py +0 -0
- nexaai/binds/metal/py-lib/mlx_audio/tts/tests/test_base.py +66 -0
- nexaai/binds/metal/py-lib/mlx_audio/tts/tests/test_convert.py +173 -0
- nexaai/binds/metal/py-lib/mlx_audio/tts/tests/test_interpolate.py +88 -0
- nexaai/binds/metal/py-lib/mlx_audio/tts/tests/test_models.py +974 -0
- nexaai/binds/metal/py-lib/mlx_audio/tts/utils.py +337 -0
- nexaai/binds/metal/py-lib/mlx_audio/utils.py +237 -0
- nexaai/binds/metal/py-lib/mlx_audio/version.py +1 -0
- nexaai/binds/metal/py-lib/profiling.py +239 -0
- nexaai/binds/nexaml/libfftw3.3.dylib +0 -0
- nexaai/binds/nexaml/libfftw3f.3.dylib +0 -0
- nexaai/binds/nexaml/libggml-base.dylib +0 -0
- nexaai/binds/nexaml/libggml-cpu.so +0 -0
- nexaai/binds/nexaml/libggml-metal.so +0 -0
- nexaai/binds/nexaml/libggml.dylib +0 -0
- nexaai/binds/nexaml/libmp3lame.0.dylib +0 -0
- nexaai/binds/nexaml/libmpg123.0.dylib +0 -0
- nexaai/binds/nexaml/libnexa-mm-process.dylib +0 -0
- nexaai/binds/nexaml/libnexa-sampling.dylib +0 -0
- nexaai/binds/nexaml/libnexa_plugin.dylib +0 -0
- nexaai/binds/nexaml/libnexaproc.dylib +0 -0
- nexaai/binds/nexaml/libomp.dylib +0 -0
- nexaai/binds/nexaml/libqwen3-vl.dylib +0 -0
- nexaai/binds/nexaml/libqwen3vl-vision.dylib +0 -0
- nexaai/binds/rerank_bind.cpython-310-darwin.so +0 -0
- nexaai/binds/vlm_bind.cpython-310-darwin.so +0 -0
- nexaai/common.py +106 -0
- nexaai/cv.py +95 -0
- nexaai/cv_impl/__init__.py +0 -0
- nexaai/cv_impl/mlx_cv_impl.py +91 -0
- nexaai/cv_impl/pybind_cv_impl.py +124 -0
- nexaai/diarize.py +80 -0
- nexaai/diarize_impl/__init__.py +1 -0
- nexaai/diarize_impl/pybind_diarize_impl.py +125 -0
- nexaai/embedder.py +73 -0
- nexaai/embedder_impl/__init__.py +0 -0
- nexaai/embedder_impl/mlx_embedder_impl.py +118 -0
- nexaai/embedder_impl/pybind_embedder_impl.py +96 -0
- nexaai/image_gen.py +141 -0
- nexaai/image_gen_impl/__init__.py +0 -0
- nexaai/image_gen_impl/mlx_image_gen_impl.py +292 -0
- nexaai/image_gen_impl/pybind_image_gen_impl.py +85 -0
- nexaai/llm.py +98 -0
- nexaai/llm_impl/__init__.py +0 -0
- nexaai/llm_impl/mlx_llm_impl.py +271 -0
- nexaai/llm_impl/pybind_llm_impl.py +238 -0
- nexaai/log.py +92 -0
- nexaai/mlx_backend/asr/__init__.py +12 -0
- nexaai/mlx_backend/asr/interface.py +122 -0
- nexaai/mlx_backend/common/__init__.py +0 -0
- nexaai/mlx_backend/common/utils.py +25 -0
- nexaai/mlx_backend/cv/__init__.py +0 -0
- nexaai/mlx_backend/cv/generate.py +195 -0
- nexaai/mlx_backend/cv/interface.py +162 -0
- nexaai/mlx_backend/cv/main.py +81 -0
- nexaai/mlx_backend/cv/modeling/pp_ocr_v4.py +1736 -0
- nexaai/mlx_backend/embedding/__init__.py +0 -0
- nexaai/mlx_backend/embedding/generate.py +333 -0
- nexaai/mlx_backend/embedding/interface.py +617 -0
- nexaai/mlx_backend/embedding/main.py +173 -0
- nexaai/mlx_backend/embedding/modeling/__init__.py +0 -0
- nexaai/mlx_backend/embedding/modeling/nexa_jina_v2.py +399 -0
- nexaai/mlx_backend/image_gen/__init__.py +1 -0
- nexaai/mlx_backend/image_gen/generate_sd.py +244 -0
- nexaai/mlx_backend/image_gen/interface.py +82 -0
- nexaai/mlx_backend/image_gen/main.py +281 -0
- nexaai/mlx_backend/image_gen/stable_diffusion/__init__.py +306 -0
- nexaai/mlx_backend/image_gen/stable_diffusion/clip.py +116 -0
- nexaai/mlx_backend/image_gen/stable_diffusion/config.py +65 -0
- nexaai/mlx_backend/image_gen/stable_diffusion/model_io.py +386 -0
- nexaai/mlx_backend/image_gen/stable_diffusion/sampler.py +105 -0
- nexaai/mlx_backend/image_gen/stable_diffusion/tokenizer.py +100 -0
- nexaai/mlx_backend/image_gen/stable_diffusion/unet.py +460 -0
- nexaai/mlx_backend/image_gen/stable_diffusion/vae.py +274 -0
- nexaai/mlx_backend/llm/__init__.py +0 -0
- nexaai/mlx_backend/llm/generate.py +149 -0
- nexaai/mlx_backend/llm/interface.py +764 -0
- nexaai/mlx_backend/llm/main.py +68 -0
- nexaai/mlx_backend/ml.py +888 -0
- nexaai/mlx_backend/mlx_audio/__init__.py +0 -0
- nexaai/mlx_backend/mlx_audio/codec/__init__.py +1 -0
- nexaai/mlx_backend/mlx_audio/codec/models/__init__.py +5 -0
- nexaai/mlx_backend/mlx_audio/codec/models/bigvgan/__init__.py +1 -0
- nexaai/mlx_backend/mlx_audio/codec/models/bigvgan/activation.py +51 -0
- nexaai/mlx_backend/mlx_audio/codec/models/bigvgan/amp.py +96 -0
- nexaai/mlx_backend/mlx_audio/codec/models/bigvgan/bigvgan.py +149 -0
- nexaai/mlx_backend/mlx_audio/codec/models/bigvgan/conv.py +114 -0
- nexaai/mlx_backend/mlx_audio/codec/models/bigvgan/resample.py +177 -0
- nexaai/mlx_backend/mlx_audio/codec/models/descript/__init__.py +1 -0
- nexaai/mlx_backend/mlx_audio/codec/models/descript/base.py +228 -0
- nexaai/mlx_backend/mlx_audio/codec/models/descript/dac.py +285 -0
- nexaai/mlx_backend/mlx_audio/codec/models/descript/nn/__init__.py +1 -0
- nexaai/mlx_backend/mlx_audio/codec/models/descript/nn/layers.py +129 -0
- nexaai/mlx_backend/mlx_audio/codec/models/descript/nn/quantize.py +149 -0
- nexaai/mlx_backend/mlx_audio/codec/models/encodec/__init__.py +1 -0
- nexaai/mlx_backend/mlx_audio/codec/models/encodec/encodec.py +777 -0
- nexaai/mlx_backend/mlx_audio/codec/models/mimi/__init__.py +1 -0
- nexaai/mlx_backend/mlx_audio/codec/models/mimi/mimi.py +286 -0
- nexaai/mlx_backend/mlx_audio/codec/models/mimi/modules/__init__.py +20 -0
- nexaai/mlx_backend/mlx_audio/codec/models/mimi/modules/conv.py +398 -0
- nexaai/mlx_backend/mlx_audio/codec/models/mimi/modules/kv_cache.py +199 -0
- nexaai/mlx_backend/mlx_audio/codec/models/mimi/modules/quantization.py +179 -0
- nexaai/mlx_backend/mlx_audio/codec/models/mimi/modules/seanet.py +314 -0
- nexaai/mlx_backend/mlx_audio/codec/models/mimi/modules/transformer.py +256 -0
- nexaai/mlx_backend/mlx_audio/codec/models/s3/__init__.py +1 -0
- nexaai/mlx_backend/mlx_audio/codec/models/s3/model.py +260 -0
- nexaai/mlx_backend/mlx_audio/codec/models/s3/model_v2.py +383 -0
- nexaai/mlx_backend/mlx_audio/codec/models/s3/utils.py +122 -0
- nexaai/mlx_backend/mlx_audio/codec/models/snac/__init__.py +1 -0
- nexaai/mlx_backend/mlx_audio/codec/models/snac/attention.py +97 -0
- nexaai/mlx_backend/mlx_audio/codec/models/snac/layers.py +306 -0
- nexaai/mlx_backend/mlx_audio/codec/models/snac/snac.py +154 -0
- nexaai/mlx_backend/mlx_audio/codec/models/snac/vq.py +135 -0
- nexaai/mlx_backend/mlx_audio/codec/models/vocos/__init__.py +1 -0
- nexaai/mlx_backend/mlx_audio/codec/models/vocos/mel.py +33 -0
- nexaai/mlx_backend/mlx_audio/codec/models/vocos/vocos.py +359 -0
- nexaai/mlx_backend/mlx_audio/codec/tests/__init__.py +0 -0
- nexaai/mlx_backend/mlx_audio/codec/tests/test_bigvgan.py +54 -0
- nexaai/mlx_backend/mlx_audio/codec/tests/test_descript.py +109 -0
- nexaai/mlx_backend/mlx_audio/codec/tests/test_encodec.py +58 -0
- nexaai/mlx_backend/mlx_audio/codec/tests/test_mimi.py +22 -0
- nexaai/mlx_backend/mlx_audio/codec/tests/test_s3.py +25 -0
- nexaai/mlx_backend/mlx_audio/codec/tests/test_snac.py +40 -0
- nexaai/mlx_backend/mlx_audio/codec/tests/test_vocos.py +93 -0
- nexaai/mlx_backend/mlx_audio/server.py +525 -0
- nexaai/mlx_backend/mlx_audio/sts/__init__.py +0 -0
- nexaai/mlx_backend/mlx_audio/sts/tests/test_voice_pipeline.py +156 -0
- nexaai/mlx_backend/mlx_audio/sts/voice_pipeline.py +327 -0
- nexaai/mlx_backend/mlx_audio/stt/__init__.py +0 -0
- nexaai/mlx_backend/mlx_audio/stt/generate.py +174 -0
- nexaai/mlx_backend/mlx_audio/stt/models/__init__.py +0 -0
- nexaai/mlx_backend/mlx_audio/stt/models/parakeet/__init__.py +1 -0
- nexaai/mlx_backend/mlx_audio/stt/models/parakeet/alignment.py +248 -0
- nexaai/mlx_backend/mlx_audio/stt/models/parakeet/attention.py +187 -0
- nexaai/mlx_backend/mlx_audio/stt/models/parakeet/audio.py +76 -0
- nexaai/mlx_backend/mlx_audio/stt/models/parakeet/conformer.py +331 -0
- nexaai/mlx_backend/mlx_audio/stt/models/parakeet/ctc.py +34 -0
- nexaai/mlx_backend/mlx_audio/stt/models/parakeet/parakeet.py +604 -0
- nexaai/mlx_backend/mlx_audio/stt/models/parakeet/rnnt.py +157 -0
- nexaai/mlx_backend/mlx_audio/stt/models/parakeet/tokenizer.py +2 -0
- nexaai/mlx_backend/mlx_audio/stt/models/wav2vec/feature_extractor.py +757 -0
- nexaai/mlx_backend/mlx_audio/stt/models/wav2vec/wav2vec.py +738 -0
- nexaai/mlx_backend/mlx_audio/stt/models/whisper/__init__.py +1 -0
- nexaai/mlx_backend/mlx_audio/stt/models/whisper/audio.py +82 -0
- nexaai/mlx_backend/mlx_audio/stt/models/whisper/decoding.py +742 -0
- nexaai/mlx_backend/mlx_audio/stt/models/whisper/timing.py +329 -0
- nexaai/mlx_backend/mlx_audio/stt/models/whisper/tokenizer.py +398 -0
- nexaai/mlx_backend/mlx_audio/stt/models/whisper/whisper.py +862 -0
- nexaai/mlx_backend/mlx_audio/stt/models/whisper/writers.py +268 -0
- nexaai/mlx_backend/mlx_audio/stt/tests/test_models.py +381 -0
- nexaai/mlx_backend/mlx_audio/stt/utils.py +195 -0
- nexaai/mlx_backend/mlx_audio/tts/__init__.py +1 -0
- nexaai/mlx_backend/mlx_audio/tts/audio_player.py +120 -0
- nexaai/mlx_backend/mlx_audio/tts/convert.py +71 -0
- nexaai/mlx_backend/mlx_audio/tts/generate.py +449 -0
- nexaai/mlx_backend/mlx_audio/tts/models/__init__.py +0 -0
- nexaai/mlx_backend/mlx_audio/tts/models/bark/__init__.py +4 -0
- nexaai/mlx_backend/mlx_audio/tts/models/bark/bark.py +528 -0
- nexaai/mlx_backend/mlx_audio/tts/models/bark/isftnet.py +12 -0
- nexaai/mlx_backend/mlx_audio/tts/models/bark/pipeline.py +442 -0
- nexaai/mlx_backend/mlx_audio/tts/models/base.py +84 -0
- nexaai/mlx_backend/mlx_audio/tts/models/dia/__init__.py +1 -0
- nexaai/mlx_backend/mlx_audio/tts/models/dia/audio.py +287 -0
- nexaai/mlx_backend/mlx_audio/tts/models/dia/config.py +256 -0
- nexaai/mlx_backend/mlx_audio/tts/models/dia/dia.py +592 -0
- nexaai/mlx_backend/mlx_audio/tts/models/dia/layers.py +870 -0
- nexaai/mlx_backend/mlx_audio/tts/models/indextts/__init__.py +3 -0
- nexaai/mlx_backend/mlx_audio/tts/models/indextts/attention.py +180 -0
- nexaai/mlx_backend/mlx_audio/tts/models/indextts/bigvgan.py +124 -0
- nexaai/mlx_backend/mlx_audio/tts/models/indextts/conformer.py +247 -0
- nexaai/mlx_backend/mlx_audio/tts/models/indextts/ecapa_tdnn/__init__.py +0 -0
- nexaai/mlx_backend/mlx_audio/tts/models/indextts/ecapa_tdnn/asp.py +59 -0
- nexaai/mlx_backend/mlx_audio/tts/models/indextts/ecapa_tdnn/ecapa_tdnn.py +91 -0
- nexaai/mlx_backend/mlx_audio/tts/models/indextts/ecapa_tdnn/se_res2net.py +132 -0
- nexaai/mlx_backend/mlx_audio/tts/models/indextts/ecapa_tdnn/tdnn.py +42 -0
- nexaai/mlx_backend/mlx_audio/tts/models/indextts/gpt2.py +38 -0
- nexaai/mlx_backend/mlx_audio/tts/models/indextts/indextts.py +412 -0
- nexaai/mlx_backend/mlx_audio/tts/models/indextts/mel.py +37 -0
- nexaai/mlx_backend/mlx_audio/tts/models/indextts/normalize.py +294 -0
- nexaai/mlx_backend/mlx_audio/tts/models/indextts/perceiver.py +62 -0
- nexaai/mlx_backend/mlx_audio/tts/models/interpolate.py +108 -0
- nexaai/mlx_backend/mlx_audio/tts/models/kokoro/__init__.py +4 -0
- nexaai/mlx_backend/mlx_audio/tts/models/kokoro/istftnet.py +979 -0
- nexaai/mlx_backend/mlx_audio/tts/models/kokoro/kokoro.py +331 -0
- nexaai/mlx_backend/mlx_audio/tts/models/kokoro/modules.py +659 -0
- nexaai/mlx_backend/mlx_audio/tts/models/kokoro/pipeline.py +453 -0
- nexaai/mlx_backend/mlx_audio/tts/models/kokoro/voice.py +113 -0
- nexaai/mlx_backend/mlx_audio/tts/models/llama/__init__.py +3 -0
- nexaai/mlx_backend/mlx_audio/tts/models/llama/llama.py +324 -0
- nexaai/mlx_backend/mlx_audio/tts/models/outetts/__init__.py +1 -0
- nexaai/mlx_backend/mlx_audio/tts/models/outetts/audio_processor.py +351 -0
- nexaai/mlx_backend/mlx_audio/tts/models/outetts/dac_interface.py +162 -0
- nexaai/mlx_backend/mlx_audio/tts/models/outetts/default_speaker.json +461 -0
- nexaai/mlx_backend/mlx_audio/tts/models/outetts/outetts.py +255 -0
- nexaai/mlx_backend/mlx_audio/tts/models/outetts/prompt_processor.py +181 -0
- nexaai/mlx_backend/mlx_audio/tts/models/outetts/tokens.py +36 -0
- nexaai/mlx_backend/mlx_audio/tts/models/sesame/__init__.py +3 -0
- nexaai/mlx_backend/mlx_audio/tts/models/sesame/attention.py +195 -0
- nexaai/mlx_backend/mlx_audio/tts/models/sesame/sesame.py +633 -0
- nexaai/mlx_backend/mlx_audio/tts/models/sesame/watermarking.py +105 -0
- nexaai/mlx_backend/mlx_audio/tts/models/spark/__init__.py +1 -0
- nexaai/mlx_backend/mlx_audio/tts/models/spark/audio_tokenizer.py +138 -0
- nexaai/mlx_backend/mlx_audio/tts/models/spark/bicodec.py +269 -0
- nexaai/mlx_backend/mlx_audio/tts/models/spark/modules/__init__.py +0 -0
- nexaai/mlx_backend/mlx_audio/tts/models/spark/modules/blocks/__init__.py +0 -0
- nexaai/mlx_backend/mlx_audio/tts/models/spark/modules/blocks/sampler.py +111 -0
- nexaai/mlx_backend/mlx_audio/tts/models/spark/modules/encoder_decoder/__init__.py +0 -0
- nexaai/mlx_backend/mlx_audio/tts/models/spark/modules/encoder_decoder/feat_decoder.py +120 -0
- nexaai/mlx_backend/mlx_audio/tts/models/spark/modules/encoder_decoder/feat_encoder.py +136 -0
- nexaai/mlx_backend/mlx_audio/tts/models/spark/modules/encoder_decoder/wave_generator.py +113 -0
- nexaai/mlx_backend/mlx_audio/tts/models/spark/modules/finite_scalar_quantization.py +238 -0
- nexaai/mlx_backend/mlx_audio/tts/models/spark/modules/residual.py +209 -0
- nexaai/mlx_backend/mlx_audio/tts/models/spark/modules/residual_fsq.py +309 -0
- nexaai/mlx_backend/mlx_audio/tts/models/spark/modules/speaker/__init__.py +1 -0
- nexaai/mlx_backend/mlx_audio/tts/models/spark/modules/speaker/ecapa_tdnn.py +283 -0
- nexaai/mlx_backend/mlx_audio/tts/models/spark/modules/speaker/perceiver_encoder.py +326 -0
- nexaai/mlx_backend/mlx_audio/tts/models/spark/modules/speaker/pooling_layers.py +297 -0
- nexaai/mlx_backend/mlx_audio/tts/models/spark/modules/speaker/speaker_encoder.py +155 -0
- nexaai/mlx_backend/mlx_audio/tts/models/spark/spark.py +382 -0
- nexaai/mlx_backend/mlx_audio/tts/models/spark/utils/audio.py +220 -0
- nexaai/mlx_backend/mlx_audio/tts/models/spark/utils/file.py +221 -0
- nexaai/mlx_backend/mlx_audio/tts/models/spark/utils/token_parser.py +181 -0
- nexaai/mlx_backend/mlx_audio/tts/tests/__init__.py +0 -0
- nexaai/mlx_backend/mlx_audio/tts/tests/test_base.py +66 -0
- nexaai/mlx_backend/mlx_audio/tts/tests/test_convert.py +173 -0
- nexaai/mlx_backend/mlx_audio/tts/tests/test_interpolate.py +88 -0
- nexaai/mlx_backend/mlx_audio/tts/tests/test_models.py +974 -0
- nexaai/mlx_backend/mlx_audio/tts/utils.py +337 -0
- nexaai/mlx_backend/mlx_audio/utils.py +237 -0
- nexaai/mlx_backend/mlx_audio/version.py +1 -0
- nexaai/mlx_backend/profiling.py +239 -0
- nexaai/mlx_backend/rerank/__init__.py +0 -0
- nexaai/mlx_backend/rerank/generate.py +174 -0
- nexaai/mlx_backend/rerank/interface.py +287 -0
- nexaai/mlx_backend/rerank/main.py +127 -0
- nexaai/mlx_backend/rerank/modeling/__init__.py +0 -0
- nexaai/mlx_backend/rerank/modeling/nexa_jina_rerank.py +330 -0
- nexaai/mlx_backend/sd/__init__.py +1 -0
- nexaai/mlx_backend/sd/interface.py +362 -0
- nexaai/mlx_backend/sd/main.py +286 -0
- nexaai/mlx_backend/sd/modeling/__init__.py +306 -0
- nexaai/mlx_backend/sd/modeling/clip.py +116 -0
- nexaai/mlx_backend/sd/modeling/config.py +65 -0
- nexaai/mlx_backend/sd/modeling/model_io.py +385 -0
- nexaai/mlx_backend/sd/modeling/sampler.py +105 -0
- nexaai/mlx_backend/sd/modeling/tokenizer.py +100 -0
- nexaai/mlx_backend/sd/modeling/unet.py +460 -0
- nexaai/mlx_backend/sd/modeling/vae.py +274 -0
- nexaai/mlx_backend/tts/__init__.py +12 -0
- nexaai/mlx_backend/tts/interface.py +276 -0
- nexaai/mlx_backend/vlm/__init__.py +3 -0
- nexaai/mlx_backend/vlm/generate.py +572 -0
- nexaai/mlx_backend/vlm/generate_qwen3_vl.py +374 -0
- nexaai/mlx_backend/vlm/generate_qwen3_vl_moe.py +259 -0
- nexaai/mlx_backend/vlm/interface.py +559 -0
- nexaai/mlx_backend/vlm/main.py +365 -0
- nexaai/mlx_backend/vlm/modeling/__init__.py +0 -0
- nexaai/mlx_backend/vlm/modeling/convert.py +68 -0
- nexaai/mlx_backend/vlm/modeling/models/__init__.py +0 -0
- nexaai/mlx_backend/vlm/modeling/models/aya_vision/__init__.py +8 -0
- nexaai/mlx_backend/vlm/modeling/models/aya_vision/aya_vision.py +193 -0
- nexaai/mlx_backend/vlm/modeling/models/aya_vision/interpolate.py +186 -0
- nexaai/mlx_backend/vlm/modeling/models/aya_vision/language.py +233 -0
- nexaai/mlx_backend/vlm/modeling/models/aya_vision/vision.py +503 -0
- nexaai/mlx_backend/vlm/modeling/models/base.py +202 -0
- nexaai/mlx_backend/vlm/modeling/models/cache.py +230 -0
- nexaai/mlx_backend/vlm/modeling/models/deepseek_vl_v2/__init__.py +10 -0
- nexaai/mlx_backend/vlm/modeling/models/deepseek_vl_v2/conversation.py +264 -0
- nexaai/mlx_backend/vlm/modeling/models/deepseek_vl_v2/deepseek_vl_v2.py +472 -0
- nexaai/mlx_backend/vlm/modeling/models/deepseek_vl_v2/language.py +591 -0
- nexaai/mlx_backend/vlm/modeling/models/deepseek_vl_v2/processing_deepsek_vl_v2.py +526 -0
- nexaai/mlx_backend/vlm/modeling/models/deepseek_vl_v2/vision.py +356 -0
- nexaai/mlx_backend/vlm/modeling/models/florence2/__init__.py +8 -0
- nexaai/mlx_backend/vlm/modeling/models/florence2/florence2.py +366 -0
- nexaai/mlx_backend/vlm/modeling/models/florence2/language.py +488 -0
- nexaai/mlx_backend/vlm/modeling/models/florence2/vision.py +591 -0
- nexaai/mlx_backend/vlm/modeling/models/gemma3/__init__.py +8 -0
- nexaai/mlx_backend/vlm/modeling/models/gemma3/gemma3.py +213 -0
- nexaai/mlx_backend/vlm/modeling/models/gemma3/language.py +315 -0
- nexaai/mlx_backend/vlm/modeling/models/gemma3/vision.py +238 -0
- nexaai/mlx_backend/vlm/modeling/models/gemma3n/__init__.py +2 -0
- nexaai/mlx_backend/vlm/modeling/models/gemma3n/audio.py +1038 -0
- nexaai/mlx_backend/vlm/modeling/models/gemma3n/config.py +139 -0
- nexaai/mlx_backend/vlm/modeling/models/gemma3n/gemma3n.py +322 -0
- nexaai/mlx_backend/vlm/modeling/models/gemma3n/language.py +629 -0
- nexaai/mlx_backend/vlm/modeling/models/gemma3n/vision.py +1022 -0
- nexaai/mlx_backend/vlm/modeling/models/idefics2/__init__.py +9 -0
- nexaai/mlx_backend/vlm/modeling/models/idefics2/idefics2.py +294 -0
- nexaai/mlx_backend/vlm/modeling/models/idefics2/language.py +191 -0
- nexaai/mlx_backend/vlm/modeling/models/idefics2/vision.py +267 -0
- nexaai/mlx_backend/vlm/modeling/models/idefics3/__init__.py +8 -0
- nexaai/mlx_backend/vlm/modeling/models/idefics3/idefics3.py +175 -0
- nexaai/mlx_backend/vlm/modeling/models/idefics3/language.py +192 -0
- nexaai/mlx_backend/vlm/modeling/models/idefics3/vision.py +233 -0
- nexaai/mlx_backend/vlm/modeling/models/internvl_chat/__init__.py +9 -0
- nexaai/mlx_backend/vlm/modeling/models/internvl_chat/internvl_chat.py +140 -0
- nexaai/mlx_backend/vlm/modeling/models/internvl_chat/language.py +220 -0
- nexaai/mlx_backend/vlm/modeling/models/internvl_chat/processor.py +393 -0
- nexaai/mlx_backend/vlm/modeling/models/internvl_chat/vision.py +293 -0
- nexaai/mlx_backend/vlm/modeling/models/kernels.py +307 -0
- nexaai/mlx_backend/vlm/modeling/models/kimi_vl/__init__.py +8 -0
- nexaai/mlx_backend/vlm/modeling/models/kimi_vl/kimi_vl.py +143 -0
- nexaai/mlx_backend/vlm/modeling/models/kimi_vl/language.py +509 -0
- nexaai/mlx_backend/vlm/modeling/models/kimi_vl/vision.py +522 -0
- nexaai/mlx_backend/vlm/modeling/models/llama4/__init__.py +8 -0
- nexaai/mlx_backend/vlm/modeling/models/llama4/language.py +386 -0
- nexaai/mlx_backend/vlm/modeling/models/llama4/llama4.py +138 -0
- nexaai/mlx_backend/vlm/modeling/models/llama4/vision.py +560 -0
- nexaai/mlx_backend/vlm/modeling/models/llava/__init__.py +8 -0
- nexaai/mlx_backend/vlm/modeling/models/llava/language.py +240 -0
- nexaai/mlx_backend/vlm/modeling/models/llava/llava.py +153 -0
- nexaai/mlx_backend/vlm/modeling/models/llava/vision.py +259 -0
- nexaai/mlx_backend/vlm/modeling/models/llava_bunny/__init__.py +9 -0
- nexaai/mlx_backend/vlm/modeling/models/llava_bunny/language.py +236 -0
- nexaai/mlx_backend/vlm/modeling/models/llava_bunny/llava_bunny.py +256 -0
- nexaai/mlx_backend/vlm/modeling/models/llava_bunny/vision.py +303 -0
- nexaai/mlx_backend/vlm/modeling/models/llava_next/__init__.py +8 -0
- nexaai/mlx_backend/vlm/modeling/models/llava_next/language.py +230 -0
- nexaai/mlx_backend/vlm/modeling/models/llava_next/llava_next.py +160 -0
- nexaai/mlx_backend/vlm/modeling/models/llava_next/vision.py +243 -0
- nexaai/mlx_backend/vlm/modeling/models/mistral3/__init__.py +8 -0
- nexaai/mlx_backend/vlm/modeling/models/mistral3/mistral3.py +283 -0
- nexaai/mlx_backend/vlm/modeling/models/mllama/__init__.py +8 -0
- nexaai/mlx_backend/vlm/modeling/models/mllama/language.py +416 -0
- nexaai/mlx_backend/vlm/modeling/models/mllama/mllama.py +172 -0
- nexaai/mlx_backend/vlm/modeling/models/mllama/vision.py +499 -0
- nexaai/mlx_backend/vlm/modeling/models/molmo/__init__.py +8 -0
- nexaai/mlx_backend/vlm/modeling/models/molmo/language.py +243 -0
- nexaai/mlx_backend/vlm/modeling/models/molmo/molmo.py +133 -0
- nexaai/mlx_backend/vlm/modeling/models/molmo/vision.py +465 -0
- nexaai/mlx_backend/vlm/modeling/models/multi_modality/__init__.py +10 -0
- nexaai/mlx_backend/vlm/modeling/models/multi_modality/language.py +230 -0
- nexaai/mlx_backend/vlm/modeling/models/multi_modality/multi_modality.py +385 -0
- nexaai/mlx_backend/vlm/modeling/models/multi_modality/sam.py +557 -0
- nexaai/mlx_backend/vlm/modeling/models/multi_modality/vision.py +526 -0
- nexaai/mlx_backend/vlm/modeling/models/paligemma/__init__.py +8 -0
- nexaai/mlx_backend/vlm/modeling/models/paligemma/language.py +282 -0
- nexaai/mlx_backend/vlm/modeling/models/paligemma/paligemma.py +160 -0
- nexaai/mlx_backend/vlm/modeling/models/paligemma/vision.py +242 -0
- nexaai/mlx_backend/vlm/modeling/models/phi3_v/__init__.py +8 -0
- nexaai/mlx_backend/vlm/modeling/models/phi3_v/language.py +21 -0
- nexaai/mlx_backend/vlm/modeling/models/phi3_v/phi3_v.py +243 -0
- nexaai/mlx_backend/vlm/modeling/models/phi3_v/su_rope.py +71 -0
- nexaai/mlx_backend/vlm/modeling/models/phi3_v/vision.py +324 -0
- nexaai/mlx_backend/vlm/modeling/models/pixtral/__init__.py +8 -0
- nexaai/mlx_backend/vlm/modeling/models/pixtral/language.py +229 -0
- nexaai/mlx_backend/vlm/modeling/models/pixtral/pixtral.py +161 -0
- nexaai/mlx_backend/vlm/modeling/models/pixtral/vision.py +320 -0
- nexaai/mlx_backend/vlm/modeling/models/qwen2_5_vl/__init__.py +2 -0
- nexaai/mlx_backend/vlm/modeling/models/qwen2_5_vl/config.py +108 -0
- nexaai/mlx_backend/vlm/modeling/models/qwen2_5_vl/language.py +490 -0
- nexaai/mlx_backend/vlm/modeling/models/qwen2_5_vl/qwen2_5_vl.py +168 -0
- nexaai/mlx_backend/vlm/modeling/models/qwen2_5_vl/vision.py +414 -0
- nexaai/mlx_backend/vlm/modeling/models/qwen2_vl/__init__.py +2 -0
- nexaai/mlx_backend/vlm/modeling/models/qwen2_vl/config.py +104 -0
- nexaai/mlx_backend/vlm/modeling/models/qwen2_vl/language.py +490 -0
- nexaai/mlx_backend/vlm/modeling/models/qwen2_vl/qwen2_vl.py +167 -0
- nexaai/mlx_backend/vlm/modeling/models/qwen2_vl/vision.py +312 -0
- nexaai/mlx_backend/vlm/modeling/models/qwen3_vl/llm_common/__init__.py +0 -0
- nexaai/mlx_backend/vlm/modeling/models/qwen3_vl/llm_common/base.py +117 -0
- nexaai/mlx_backend/vlm/modeling/models/qwen3_vl/llm_common/cache.py +531 -0
- nexaai/mlx_backend/vlm/modeling/models/qwen3_vl/llm_common/generate.py +701 -0
- nexaai/mlx_backend/vlm/modeling/models/qwen3_vl/llm_common/rope_utils.py +255 -0
- nexaai/mlx_backend/vlm/modeling/models/qwen3_vl/llm_common/sample_utils.py +303 -0
- nexaai/mlx_backend/vlm/modeling/models/qwen3_vl/llm_common/tokenizer_utils.py +407 -0
- nexaai/mlx_backend/vlm/modeling/models/qwen3_vl/processor.py +476 -0
- nexaai/mlx_backend/vlm/modeling/models/qwen3_vl/qwen3vl.py +1262 -0
- nexaai/mlx_backend/vlm/modeling/models/qwen3vl_moe/llm_common/__init__.py +0 -0
- nexaai/mlx_backend/vlm/modeling/models/qwen3vl_moe/llm_common/base.py +117 -0
- nexaai/mlx_backend/vlm/modeling/models/qwen3vl_moe/llm_common/cache.py +531 -0
- nexaai/mlx_backend/vlm/modeling/models/qwen3vl_moe/llm_common/generate.py +701 -0
- nexaai/mlx_backend/vlm/modeling/models/qwen3vl_moe/llm_common/rope_utils.py +255 -0
- nexaai/mlx_backend/vlm/modeling/models/qwen3vl_moe/llm_common/sample_utils.py +303 -0
- nexaai/mlx_backend/vlm/modeling/models/qwen3vl_moe/llm_common/tokenizer_utils.py +407 -0
- nexaai/mlx_backend/vlm/modeling/models/qwen3vl_moe/processor.py +476 -0
- nexaai/mlx_backend/vlm/modeling/models/qwen3vl_moe/qwen3vl_moe.py +1308 -0
- nexaai/mlx_backend/vlm/modeling/models/qwen3vl_moe/switch_layers.py +210 -0
- nexaai/mlx_backend/vlm/modeling/models/smolvlm/__init__.py +8 -0
- nexaai/mlx_backend/vlm/modeling/models/smolvlm/smolvlm.py +62 -0
- nexaai/mlx_backend/vlm/modeling/processing_qwen2_5_vl.py +209 -0
- nexaai/mlx_backend/vlm/modeling/processing_qwen2_vl.py +215 -0
- nexaai/mlx_backend/vlm/modeling/prompt_utils.py +474 -0
- nexaai/mlx_backend/vlm/modeling/sample_utils.py +39 -0
- nexaai/mlx_backend/vlm/modeling/tokenizer_utils.py +344 -0
- nexaai/mlx_backend/vlm/modeling/trainer/__init__.py +9 -0
- nexaai/mlx_backend/vlm/modeling/trainer/lora.py +70 -0
- nexaai/mlx_backend/vlm/modeling/trainer/trainer.py +296 -0
- nexaai/mlx_backend/vlm/modeling/trainer/utils.py +160 -0
- nexaai/mlx_backend/vlm/modeling/utils.py +928 -0
- nexaai/rerank.py +57 -0
- nexaai/rerank_impl/__init__.py +0 -0
- nexaai/rerank_impl/mlx_rerank_impl.py +94 -0
- nexaai/rerank_impl/pybind_rerank_impl.py +136 -0
- nexaai/runtime.py +68 -0
- nexaai/runtime_error.py +24 -0
- nexaai/tts.py +75 -0
- nexaai/tts_impl/__init__.py +0 -0
- nexaai/tts_impl/mlx_tts_impl.py +94 -0
- nexaai/tts_impl/pybind_tts_impl.py +43 -0
- nexaai/utils/decode.py +18 -0
- nexaai/utils/manifest_utils.py +531 -0
- nexaai/utils/model_manager.py +1745 -0
- nexaai/utils/model_types.py +49 -0
- nexaai/utils/progress_tracker.py +389 -0
- nexaai/utils/quantization_utils.py +245 -0
- nexaai/vlm.py +130 -0
- nexaai/vlm_impl/__init__.py +0 -0
- nexaai/vlm_impl/mlx_vlm_impl.py +259 -0
- nexaai/vlm_impl/pybind_vlm_impl.py +275 -0
- nexaai-1.0.29.dist-info/METADATA +35 -0
- nexaai-1.0.29.dist-info/RECORD +580 -0
- nexaai-1.0.29.dist-info/WHEEL +5 -0
- nexaai-1.0.29.dist-info/top_level.txt +1 -0

@@ -0,0 +1,162 @@
+import math
+
+import mlx.core as mx
+import numpy as np
+import pyloudnorm as pyln
+import scipy.signal
+import soundfile as sf
+
+from mlx_audio.codec import DAC
+
+
+def process_audio_array(
+    audio: mx.array,
+    sample_rate: int = 24000,
+    target_loudness: float = -18.0,
+    peak_limit: float = -1,
+    block_size: float = 0.400,
+) -> mx.array:
+    audio_np = np.array(audio)
+
+    # handle multi-channel audio
+    if len(audio_np.shape) > 1:
+        if audio_np.shape[1] > 1:
+            audio_np = np.mean(audio_np, axis=1)
+        else:
+            audio_np = np.squeeze(audio_np)
+
+    original_length = len(audio_np)
+    min_samples = int(block_size * sample_rate)
+
+    if original_length < min_samples:
+        pad_length = min_samples - original_length
+        audio_padded = np.pad(audio_np, (0, pad_length), mode="constant")
+    else:
+        audio_padded = audio_np
+
+    # measure and normalize loudness
+    meter = pyln.Meter(sample_rate, block_size=block_size)
+    measured_loudness = meter.integrated_loudness(audio_padded)
+    normalized = pyln.normalize.loudness(
+        audio_padded, measured_loudness, target_loudness
+    )
+
+    # apply peak limiting if necessary
+    peak_value = np.max(np.abs(normalized))
+    threshold_value = 10 ** (peak_limit / 20)
+    if peak_value > threshold_value:
+        normalized = pyln.normalize.peak(normalized, peak_limit)
+
+    if original_length < min_samples:
+        normalized = normalized[:original_length]
+
+    normalized_array = mx.array(normalized).reshape(1, 1, -1)
+    return normalized_array
+
+
+class DacInterface:
+    def __init__(self, repo_id: str = "mlx-community/dac-speech-24khz-1.5kbps"):
+        self.model = DAC.from_pretrained(repo_id)
+        self.sr = 24000
+
+    def convert_audio(
+        self, audio: mx.array, sr: int, target_sr: int, target_channels: int
+    ):
+        audio_np = np.array(audio)
+
+        if len(audio_np.shape) < 2:
+            audio_np = audio_np.reshape(1, -1)
+
+        channels, length = audio_np.shape[-2:]
+
+        if target_channels == 1:
+            if channels > 1:
+                audio_np = np.mean(audio_np, axis=-2, keepdims=True)
+        elif target_channels == 2:
+            if channels == 1:
+                audio_np = np.repeat(audio_np, 2, axis=-2)
+            elif channels > 2:
+                audio_np = audio_np[..., :2, :]
+
+        if sr != target_sr:
+            new_length = int(length * target_sr / sr)
+            resampled = np.zeros((target_channels, new_length))
+
+            for ch in range(target_channels):
+                resampled[ch] = scipy.signal.resample(audio_np[ch], new_length)
+
+            audio_np = resampled
+
+        return mx.array(audio_np)
+
+    def convert_audio_array(self, audio: mx.array, sr):
+        return self.convert_audio(audio, sr, self.sr, 1)
+
+    def load_audio(self, path):
+        audio_np, sr = sf.read(path)
+        audio = mx.array(audio_np)
+        if len(audio.shape) == 1:
+            audio = audio.reshape(1, -1)
+        # if stereo, reshape to channels-first format
+        elif len(audio.shape) > 1 and audio.shape[0] > audio.shape[1]:
+            audio = audio.T
+        return self.convert_audio_array(audio, sr).reshape(1, 1, -1)
+
+    def preprocess(self, audio_data):
+        length = audio_data.shape[-1]
+        hop_length = self.model.hop_length
+        right_pad = math.ceil(length / hop_length) * hop_length - length
+        audio_data = mx.pad(audio_data, [(0, 0), (0, 0), (0, right_pad)])
+        return audio_data
+
+    def encode(self, x: mx.array, win_duration: int = 5.0, verbose: bool = False):
+        x = process_audio_array(x)
+        nb, nac, nt = x.shape
+        x = x.reshape(nb * nac, 1, nt)
+        n_samples = int(win_duration * self.sr)
+        n_samples = int(
+            math.ceil(n_samples / self.model.hop_length) * self.model.hop_length
+        )
+        hop = n_samples
+        codes_list = []
+
+        if verbose:
+            from tqdm import trange
+
+            range_fn = trange
+        else:
+            range_fn = range
+
+        for i in range_fn(0, nt, hop):
+            chunk = x[..., i : i + n_samples]
+            audio_data = self.preprocess(chunk)
+            _, c, _, _, _ = self.model.encode(audio_data, None)
+            codes_list.append(c)
+
+        codes = mx.concatenate(codes_list, axis=-1)
+        return codes
+
+    def decode(self, codes: mx.array, verbose: bool = False) -> mx.array:
+        model = self.model
+        chunk_length = 4096
+        recons = []
+
+        if verbose:
+            from tqdm import trange
+
+            range_fn = trange
+        else:
+            range_fn = range
+
+        @mx.compile
+        def decode_chunk(codes):
+            z = model.quantizer.from_codes(codes)[0]
+            r = model.decode(z)
+            return r
+
+        for i in range_fn(0, codes.shape[-1], chunk_length):
+            c = codes[..., i : i + chunk_length]
+            recons.append(decode_chunk(c))
+
+        recons = mx.concatenate(recons, axis=-1)
+        return process_audio_array(recons.swapaxes(1, 2))
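
For orientation, a minimal round-trip sketch of the DacInterface added above (not part of the package diff). The import path is an assumption, since the file ships under both nexaai/mlx_backend/mlx_audio/... and nexaai/binds/metal/py-lib/mlx_audio/..., and it assumes the mlx-community/dac-speech-24khz-1.5kbps weights can be fetched and a local speech.wav exists.

    # Hypothetical usage sketch; import path and input file are assumptions.
    from dac_interface import DacInterface  # assumes the outetts model directory is on sys.path

    dac = DacInterface()                 # pulls mlx-community/dac-speech-24khz-1.5kbps
    x = dac.load_audio("speech.wav")     # -> mx.array of shape (1, 1, n_samples), mono 24 kHz
    codes = dac.encode(x, verbose=True)  # windowed encode into discrete codebook indices
    recon = dac.decode(codes)            # decode back to a loudness-normalized waveform
    print(codes.shape, recon.shape)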

@@ -0,0 +1,255 @@
+import json
+import re
+import time
+import uuid
+from dataclasses import dataclass
+from pathlib import Path
+from typing import List, Optional
+
+import mlx.core as mx
+import mlx.nn as nn
+from mlx_lm.generate import stream_generate
+from mlx_lm.models.llama import Model as LlamaModel
+from mlx_lm.models.llama import ModelArgs as LlamaModelConfig
+from mlx_lm.models.qwen2 import Model as Qwen2Model
+from mlx_lm.models.qwen2 import ModelArgs as Qwen2ModelConfig
+from mlx_lm.models.qwen3 import Model as Qwen3Model
+from mlx_lm.models.qwen3 import ModelArgs as Qwen3ModelConfig
+from mlx_lm.sample_utils import make_logits_processors, make_sampler
+from tqdm import tqdm
+from transformers import AutoTokenizer
+
+from ..base import GenerationResult
+from .audio_processor import AudioProcessor
+from .dac_interface import DacInterface
+from .prompt_processor import PromptProcessor
+
+
+@dataclass
+class ModelConfig(LlamaModelConfig, Qwen2ModelConfig, Qwen3ModelConfig):
+    tokenizer_name: str = "OuteAI/Llama-OuteTTS-1.0-1B"
+    sample_rate: int = 24000
+
+
+class Model(nn.Module):
+    def __init__(self, config: ModelConfig, **kwargs):
+        super().__init__()
+        self.config = config
+        self.tokenizer = AutoTokenizer.from_pretrained(config.tokenizer_name)
+        self.model = self._initialize_model(config, **kwargs)
+
+    def _initialize_model(self, config: ModelConfig, **kwargs) -> nn.Module:
+
+        model_map = {"llama": LlamaModel, "qwen2": Qwen2Model, "qwen3": Qwen3Model}
+
+        if config.model_type not in model_map:
+            raise ValueError(f"Unsupported model type: {config.model_type}")
+
+        return model_map[config.model_type](config, **kwargs)
+
+    def sanitize(self, weights):
+        weights = self.model.sanitize(weights)
+        return {
+            (
+                f"model.{k}"
+                if not k.startswith("model.model.")
+                and not k.startswith("model.lm_head")
+                else k
+            ): v
+            for k, v in weights.items()
+        }
+
+    @property
+    def layers(self):
+        return self.model.layers
+
+    @property
+    def sample_rate(self):
+        return self.config.sample_rate
+
+    def __call__(self, *args, **kwargs):
+        return self.model(*args, **kwargs)
+
+    def get_speaker(self, voice: Optional[str], ref_audio: Optional[str]) -> dict:
+        if voice is None and ref_audio is None:
+            voice = f"{Path(__file__).parent}/default_speaker.json"
+            return self.audio_processor.load_speaker(voice)
+
+        if voice is not None:
+            return self.audio_processor.load_speaker(voice)
+
+        speaker = self.audio_processor.create_speaker_from_whisper(ref_audio)
+        file_id = str(uuid.uuid4())
+        save_path = f"~/.cache/mlx_audio/voices/outetts_{file_id}.json"
+        self.audio_processor.save_speaker(speaker, save_path)
+        return speaker
+
+    def chunk_text(self, text: str, max_words: int = 30) -> List[str]:
+        sentences = re.split(r"[.!?。！？︕︖]+", text)
+        sentences = [s.strip() for s in sentences if s.strip()]
+        chunks = []
+        current_chunk = []
+        current_length = 0
+
+        for sentence in sentences:
+            words = sentence.split()
+            if current_length + len(words) > max_words:
+                chunks.append(" ".join(current_chunk))
+                current_chunk = []
+                current_length = 0
+            current_chunk.extend(words)
+            current_length += len(words)
+        if current_chunk:
+            chunks.append(" ".join(current_chunk))
+        return chunks
+
+    def generate_result(
+        self, audio, start_time: float, token_count: int, segment_idx: int, **kwargs
+    ) -> GenerationResult:
+        samples = audio.shape[0] if audio is not None else 0
+        assert samples > 0, "No audio generated"
+
+        sample_rate = (
+            self.config.sample_rate
+            if kwargs.get("sample_rate") is None
+            else kwargs.get("sample_rate")
+        )
+        audio_duration_seconds = samples / sample_rate
+
+        elapsed_time = time.perf_counter() - start_time
+        rtf = audio_duration_seconds / elapsed_time
+
+        duration_mins = int(audio_duration_seconds // 60)
+        duration_secs = int(audio_duration_seconds % 60)
+        duration_ms = int((audio_duration_seconds % 1) * 1000)
+        duration_hours = int(audio_duration_seconds // 3600)
+        duration_str = f"{duration_hours:02d}:{duration_mins:02d}:{duration_secs:02d}.{duration_ms:03d}"
+
+        return GenerationResult(
+            audio=audio,
+            samples=samples,
+            sample_rate=sample_rate,
+            segment_idx=segment_idx,
+            token_count=token_count,
+            audio_duration=duration_str,
+            real_time_factor=rtf,
+            prompt={
+                "tokens": token_count,
+                "tokens-per-sec": (
+                    round(token_count / elapsed_time, 2) if elapsed_time > 0 else 0
+                ),
+            },
+            audio_samples={
+                "samples": samples,
+                "samples-per-sec": (
+                    round(samples / elapsed_time, 2) if elapsed_time > 0 else 0
+                ),
+            },
+            processing_time_seconds=elapsed_time,
+            peak_memory_usage=mx.get_peak_memory() / 1e9,
+        )
+
+    def generate(
+        self,
+        text,
+        voice: Optional[str] = None,
+        temperature: float = 0.4,
+        top_p: float = 0.9,
+        split_pattern: str = "\n",
+        max_tokens: int = 1200,
+        verbose: bool = False,
+        ref_audio: Optional[str] = None,
+        stream: bool = False,
+        streaming_interval: float = 2.0,
+        **kwargs,
+    ):
+
+        prompts = self.chunk_text(text)
+
+        self.prompt_processor = PromptProcessor(self.tokenizer)
+        self.audio_processor = AudioProcessor()
+
+        speaker = self.get_speaker(voice, ref_audio)
+
+        sampler = make_sampler(
+            temperature,
+            top_p,
+            min_p=kwargs.get("min_p", 0.05),
+            top_k=kwargs.get("top_k", 40),
+        )
+        logits_processors = make_logits_processors(
+            kwargs.get("logit_bias", None),
+            kwargs.get("repetition_penalty", 1.1),
+            kwargs.get("repetition_context_size", 64),
+        )
+
+        for prompt in prompts:
+            completion_prompt = self.prompt_processor.get_completion_prompt(
+                prompt, speaker
+            )
+            input_ids = self.tokenizer.encode(
+                completion_prompt, add_special_tokens=False, return_tensors="mlx"
+            )
+            input_length = input_ids.shape[1]
+
+            generated_token_count = 0
+            yielded_token_count = 0
+            streaming_token_interval = int(streaming_interval * 137.5)
+            yielded_frame_count = 0
+
+            time_start = time.perf_counter()
+
+            for i, response in enumerate(
+                tqdm(
+                    stream_generate(
+                        self.model,
+                        tokenizer=self.tokenizer,
+                        prompt=input_ids.squeeze(0),
+                        max_tokens=max_tokens,
+                        sampler=sampler,
+                        logits_processors=logits_processors,
+                    ),
+                    total=max_tokens,
+                    disable=not verbose,
+                )
+            ):
+                next_token = mx.array([response.token])
+                input_ids = mx.concatenate([input_ids, next_token[None, :]], axis=1)
+                generated_token_count += 1
+
+                # send a partial result in streaming mode
+                if stream and generated_token_count % streaming_token_interval == 0:
+                    output_ids = input_ids[:, input_length:].tolist()[0]
+                    output = self.prompt_processor.extract_audio_from_tokens(output_ids)
+                    audio = self.audio_processor.audio_codec.decode(mx.array([output]))[
+                        -1, -1, :
+                    ]
+
+                    yield self.generate_result(
+                        audio=audio[yielded_frame_count:],
+                        start_time=time_start,
+                        token_count=len(output_ids) - yielded_token_count,
+                        segment_idx=i,
+                        **kwargs,
+                    )
+                    yielded_token_count = len(output_ids)
+                    yielded_frame_count = audio.shape[0]
+                    time_start = time.perf_counter()
+
+            output_ids = input_ids[:, input_length:].tolist()[0]
+            output = self.prompt_processor.extract_audio_from_tokens(output_ids)
+
+            audio = self.audio_processor.audio_codec.decode(mx.array([output]))[
+                -1, -1, :
+            ]
+            if audio.shape[0] > yielded_frame_count:
+                yield self.generate_result(
+                    audio=audio[yielded_frame_count:],
+                    start_time=time_start,
+                    token_count=len(output_ids) - yielded_token_count,
+                    segment_idx=i,
+                    **kwargs,
+                )
+
+            # Clear cache after each segment to avoid memory leaks
+            mx.clear_cache()
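
A minimal consumption sketch for the generate() method above, again an illustration rather than package code: `model` stands for an already-loaded outetts Model instance (weight loading lives elsewhere in the package), and the GenerationResult fields read here are the ones populated by generate_result() above.

    # Hypothetical usage sketch; assumes `model` is a loaded outetts Model instance.
    import numpy as np
    import soundfile as sf

    chunks = []
    for result in model.generate("Hello from OuteTTS.", stream=True, verbose=True):
        chunks.append(np.array(result.audio))  # decoded audio for this segment
        print(result.segment_idx, result.audio_duration, result.real_time_factor)

    sf.write("out.wav", np.concatenate(chunks), model.sample_rate)  # 24 kHz output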

@@ -0,0 +1,181 @@
+import re
+from typing import Union
+
+from transformers import AutoTokenizer, PreTrainedTokenizer, PreTrainedTokenizerFast
+
+from .tokens import SpecialTokens
+
+
+class PromptProcessor:
+    def __init__(
+        self, tokenizer: Union[str, PreTrainedTokenizer, PreTrainedTokenizerFast]
+    ):
+        self.special_tokens = SpecialTokens()
+
+        if tokenizer:
+            if isinstance(tokenizer, (PreTrainedTokenizer, PreTrainedTokenizerFast)):
+                self.tokenizer = tokenizer
+            elif isinstance(tokenizer, str):
+                self.tokenizer = AutoTokenizer.from_pretrained(tokenizer)
+            else:
+                raise ValueError(f"Invalid tokenizer: {type(tokenizer)}")
+
+        self.c1 = {}
+        self.c2 = {}
+        self.get_audio_token_map()
+
+        self.input_prompt = "{bos}\n{text_start}{text}{text_end}\n{audio_start}\n"
+        self.global_features = "{fs}{codes}{fe}\n"
+
+    def get_audio_token_map(self):
+        self.c1 = {
+            self.tokenizer.encode(
+                self.special_tokens.c1.format(i), add_special_tokens=False
+            )[0]: i
+            for i in range(1025)
+        }
+        self.c2 = {
+            self.tokenizer.encode(
+                self.special_tokens.c2.format(i), add_special_tokens=False
+            )[0]: i
+            for i in range(1025)
+        }
+
+    def get_features(self, f: dict):
+        features = {
+            "energy": f.get("energy", 0),
+            "spectral_centroid": f.get("spectral_centroid", 0),
+            "pitch": f.get("pitch", 0),
+        }
+        return [f"<|{k}_{v}|>" for k, v in features.items()]
+
+    def get_global_features(self, f: dict):
+        return self.global_features.format(
+            fs=self.special_tokens.global_features_start,
+            codes="".join(self.get_features(f)),
+            fe=self.special_tokens.global_features_end,
+        )
+
+    def create_codes(self, words: dict):
+        codes = []
+        for i in words:
+            word = (
+                i["word"]
+                + self.special_tokens.features
+                + self.special_tokens.time.format(i["duration"])
+            )
+            word += "".join(self.get_features(i["features"]))
+            pairs = []
+
+            for idx in range(len(i["c1"])):
+                c1 = self.special_tokens.c1.format(i["c1"][idx])
+                c2 = self.special_tokens.c2.format(i["c2"][idx])
+                pairs.append(f"{c1}{c2}")
+
+            word += self.special_tokens.code + "".join(pairs)
+            codes.append(
+                self.special_tokens.word_start + word + self.special_tokens.word_end
+            )
+
+        return "\n".join(codes)
+
+    def _init_prompt(self, text):
+        return self.input_prompt.format(
+            bos=self.special_tokens.bos,
+            text_start=self.special_tokens.text_start,
+            text=text,
+            text_end=self.special_tokens.text_end,
+            audio_start=self.special_tokens.audio_start,
+        )
+
+    def _get_separator(self, text: str) -> str:
+        has_hiragana = any("\u3040" <= c <= "\u309f" for c in text)
+        has_katakana = any("\u30a0" <= c <= "\u30ff" for c in text)
+        has_han = any("\u4e00" <= c <= "\u9fff" for c in text)
+        has_hangul = any("\uac00" <= c <= "\ud7af" for c in text)
+
+        if has_hiragana or has_katakana or has_han:
+            return "。"
+        elif has_hangul:
+            return ". "
+        else:
+            return ". "
+
+    def merge_speaker_text(self, input_text: str, speaker_text: str) -> str:
+        speaker_text = speaker_text.strip()
+        separator = self._get_separator(speaker_text)
+
+        # Determine allowed endings based on the separator
+        if separator == "。":
+            allowed_ends = ["。", "？", "！", "?", "!"]
+        else:
+            allowed_ends = [".", "?", "!"]
+
+        rs = ""
+        if speaker_text:
+            last_char = speaker_text[-1]
+            if last_char not in allowed_ends:
+                rs = separator
+            else:
+                if separator != "。":
+                    rs = " "
+
+        output = speaker_text.strip() + rs + input_text.strip()
+
+        return output, rs.strip()
+
+    @staticmethod
+    def text_normalizations(text: str) -> str:
+        # Normalize whitespace characters (newlines, tabs, etc.) to single spaces
+        text = re.sub(r"\s+", " ", text)
+        text = text.replace("…", "...")  # Replace ellipsis character with three dots
+
+        # Strip leading/trailing whitespace
+        text = text.strip()
+
+        # Normalize common Unicode characters to ASCII equivalents
+        text = re.sub(r"[“”]", '"', text)  # Curly quotes to straight quotes
+        text = re.sub(r"[‘’]", "'", text)  # Curly single quotes
+        text = re.sub(r"[–—]", "-", text)  # Various dashes to hyphen
+
+        # Remove control characters
+        text = re.sub(r"[\x00-\x1F\x7F-\x9F]", "", text)
+
+        return text
+
+    def get_completion_prompt(self, text: str, speaker: dict = None):
+        text = self.text_normalizations(text)
+
+        if speaker is not None:
+            text, separator = self.merge_speaker_text(text, speaker["text"])
+            speaker["words"][-1]["word"] += separator
+            codes = self.create_codes(speaker["words"])
+
+        prompt = self._init_prompt(text)
+
+        if speaker is not None:
+            prompt += codes + "\n" + self.special_tokens.word_start
+
+        return prompt
+
+    def get_training_prompt(self, speaker: dict) -> str:
+        text = self.text_normalizations(speaker["text"])
+        words = speaker["words"]
+        global_features = speaker["global_features"]
+
+        prompt = self._init_prompt(text)
+        prompt += self.get_global_features(global_features)
+        prompt += self.create_codes(words)
+        prompt += (
+            "\n" + self.special_tokens.audio_end + "\n" + self.special_tokens.eos + "\n"
+        )
+
+        return prompt
+
+    def extract_audio_from_tokens(self, tokens: list[int]):
+        codebook1 = [self.c1[i] for i in tokens if i in self.c1]
+        codebook2 = [self.c2[i] for i in tokens if i in self.c2]
+        t = min(len(codebook1), len(codebook2))
+        codebook1 = codebook1[:t]
+        codebook2 = codebook2[:t]
+        return [codebook1, codebook2]
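
A short sketch of how the PromptProcessor above is driven, assuming the OuteAI/Llama-OuteTTS-1.0-1B tokenizer (the default named in the outetts model config) exposes the <|c1_N|>/<|c2_N|> strings as single special tokens; the module import path is hypothetical.

    # Hypothetical usage sketch; tokenizer behaviour and import path are assumptions.
    from transformers import AutoTokenizer
    from prompt_processor import PromptProcessor  # assumes the outetts model directory is on sys.path

    pp = PromptProcessor(AutoTokenizer.from_pretrained("OuteAI/Llama-OuteTTS-1.0-1B"))

    # With no speaker, get_completion_prompt() just wraps the normalized text:
    # "<|im_start|>\n<|text_start|>Hello there.<|text_end|>\n<|audio_start|>\n"
    prompt = pp.get_completion_prompt("Hello there.")

    # Map a pair of codebook tokens back to [codebook1, codebook2] indices
    ids = pp.tokenizer.encode("<|c1_5|><|c2_9|>", add_special_tokens=False)
    print(pp.extract_audio_from_tokens(ids))  # expected: [[5], [9]]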

@@ -0,0 +1,36 @@
+from dataclasses import asdict, dataclass
+from typing import Dict
+
+
+@dataclass
+class SpecialTokens:
+    """
+    Dataclass containing special tokens used for text and audio processing.
+    """
+
+    bos: str = "<|im_start|>"
+    eos: str = "<|im_end|>"
+    c1: str = "<|c1_{}|>"
+    c2: str = "<|c2_{}|>"
+    text_start: str = "<|text_start|>"
+    text_end: str = "<|text_end|>"
+    voice_characteristic_start: str = "<|voice_characteristic_start|>"
+    voice_characteristic_end: str = "<|voice_characteristic_end|>"
+    emotion_start: str = "<|emotion_start|>"
+    emotion_end: str = "<|emotion_end|>"
+    audio_start: str = "<|audio_start|>"
+    audio_end: str = "<|audio_end|>"
+    time: str = "<|t_{:.2f}|>"
+    code: str = "<|code|>"
+    energy: str = "<|energy_{}|>"
+    spectral_centroid: str = "<|spectral_centroid_{}|>"
+    pitch: str = "<|pitch_{}|>"
+    word_start: str = "<|word_start|>"
+    word_end: str = "<|word_end|>"
+    features: str = "<|features|>"
+    global_features_start: str = "<|global_features_start|>"
+    global_features_end: str = "<|global_features_end|>"
+
+    def to_dict(self) -> Dict[str, str]:
+        """Convert the dataclass instance to a dictionary using asdict."""
+        return asdict(self)
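
Finally, the SpecialTokens dataclass above supplies the format strings that PromptProcessor stitches into prompts; a brief illustrative sketch (import path hypothetical):

    # Illustrative sketch of the SpecialTokens format strings defined above.
    from tokens import SpecialTokens  # assumes the outetts model directory is on sys.path

    st = SpecialTokens()
    print(st.c1.format(17))     # "<|c1_17|>", codebook-1 token for index 17
    print(st.time.format(0.5))  # "<|t_0.50|>", per-word duration marker
    print(len(st.to_dict()))    # 22 entries, produced via dataclasses.asdict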