nexaai 1.0.19rc6__cp310-cp310-macosx_14_0_universal2.whl → 1.0.19rc7__cp310-cp310-macosx_14_0_universal2.whl

This diff shows the content of publicly available package versions that have been released to one of the supported registries. It is provided for informational purposes only and reflects the changes between package versions as they appear in their respective public registries.

Potentially problematic release.


This version of nexaai might be problematic.

Files changed (224)
  1. nexaai/_stub.cpython-310-darwin.so +0 -0
  2. nexaai/_version.py +1 -1
  3. nexaai/binds/libnexa_bridge.dylib +0 -0
  4. nexaai/binds/nexa_llama_cpp/libggml-base.dylib +0 -0
  5. nexaai/binds/nexa_llama_cpp/libggml-cpu.so +0 -0
  6. nexaai/binds/nexa_llama_cpp/libggml-metal.so +0 -0
  7. nexaai/binds/nexa_llama_cpp/libggml.dylib +0 -0
  8. nexaai/binds/nexa_llama_cpp/libllama.dylib +0 -0
  9. nexaai/binds/nexa_llama_cpp/libmtmd.dylib +0 -0
  10. nexaai/binds/nexa_llama_cpp/libnexa_plugin.dylib +0 -0
  11. nexaai/binds/nexa_mlx/libnexa_plugin.dylib +0 -0
  12. nexaai/binds/nexa_mlx/py-lib/asr/__init__.py +12 -0
  13. nexaai/binds/nexa_mlx/py-lib/asr/interface.py +122 -0
  14. nexaai/binds/nexa_mlx/py-lib/common/__init__.py +0 -0
  15. nexaai/binds/nexa_mlx/py-lib/common/utils.py +25 -0
  16. nexaai/binds/nexa_mlx/py-lib/cv/__init__.py +0 -0
  17. nexaai/binds/nexa_mlx/py-lib/cv/generate.py +195 -0
  18. nexaai/binds/nexa_mlx/py-lib/cv/interface.py +151 -0
  19. nexaai/binds/nexa_mlx/py-lib/cv/main.py +81 -0
  20. nexaai/binds/nexa_mlx/py-lib/cv/modeling/pp_ocr_v4.py +1736 -0
  21. nexaai/binds/nexa_mlx/py-lib/embedding/__init__.py +0 -0
  22. nexaai/binds/nexa_mlx/py-lib/embedding/generate.py +333 -0
  23. nexaai/binds/nexa_mlx/py-lib/embedding/interface.py +617 -0
  24. nexaai/binds/nexa_mlx/py-lib/embedding/main.py +173 -0
  25. nexaai/binds/nexa_mlx/py-lib/embedding/modeling/__init__.py +0 -0
  26. nexaai/binds/nexa_mlx/py-lib/embedding/modeling/nexa_jina_v2.py +399 -0
  27. nexaai/binds/nexa_mlx/py-lib/image_gen/__init__.py +1 -0
  28. nexaai/binds/nexa_mlx/py-lib/image_gen/generate_sd.py +244 -0
  29. nexaai/binds/nexa_mlx/py-lib/image_gen/interface.py +82 -0
  30. nexaai/binds/nexa_mlx/py-lib/image_gen/main.py +281 -0
  31. nexaai/binds/nexa_mlx/py-lib/image_gen/stable_diffusion/__init__.py +306 -0
  32. nexaai/binds/nexa_mlx/py-lib/image_gen/stable_diffusion/clip.py +116 -0
  33. nexaai/binds/nexa_mlx/py-lib/image_gen/stable_diffusion/config.py +65 -0
  34. nexaai/binds/nexa_mlx/py-lib/image_gen/stable_diffusion/model_io.py +386 -0
  35. nexaai/binds/nexa_mlx/py-lib/image_gen/stable_diffusion/sampler.py +105 -0
  36. nexaai/binds/nexa_mlx/py-lib/image_gen/stable_diffusion/tokenizer.py +100 -0
  37. nexaai/binds/nexa_mlx/py-lib/image_gen/stable_diffusion/unet.py +460 -0
  38. nexaai/binds/nexa_mlx/py-lib/image_gen/stable_diffusion/vae.py +274 -0
  39. nexaai/binds/nexa_mlx/py-lib/llm/__init__.py +0 -0
  40. nexaai/binds/nexa_mlx/py-lib/llm/generate.py +149 -0
  41. nexaai/binds/nexa_mlx/py-lib/llm/interface.py +764 -0
  42. nexaai/binds/nexa_mlx/py-lib/llm/main.py +68 -0
  43. nexaai/binds/nexa_mlx/py-lib/rerank/__init__.py +0 -0
  44. nexaai/binds/nexa_mlx/py-lib/rerank/generate.py +174 -0
  45. nexaai/binds/nexa_mlx/py-lib/rerank/interface.py +287 -0
  46. nexaai/binds/nexa_mlx/py-lib/rerank/main.py +127 -0
  47. nexaai/binds/nexa_mlx/py-lib/rerank/modeling/__init__.py +0 -0
  48. nexaai/binds/nexa_mlx/py-lib/rerank/modeling/nexa_jina_rerank.py +330 -0
  49. nexaai/binds/nexa_mlx/py-lib/sd/__init__.py +1 -0
  50. nexaai/binds/nexa_mlx/py-lib/sd/interface.py +362 -0
  51. nexaai/binds/nexa_mlx/py-lib/sd/main.py +286 -0
  52. nexaai/binds/nexa_mlx/py-lib/sd/modeling/__init__.py +306 -0
  53. nexaai/binds/nexa_mlx/py-lib/sd/modeling/clip.py +116 -0
  54. nexaai/binds/nexa_mlx/py-lib/sd/modeling/config.py +65 -0
  55. nexaai/binds/nexa_mlx/py-lib/sd/modeling/model_io.py +385 -0
  56. nexaai/binds/nexa_mlx/py-lib/sd/modeling/sampler.py +105 -0
  57. nexaai/binds/nexa_mlx/py-lib/sd/modeling/tokenizer.py +100 -0
  58. nexaai/binds/nexa_mlx/py-lib/sd/modeling/unet.py +460 -0
  59. nexaai/binds/nexa_mlx/py-lib/sd/modeling/vae.py +274 -0
  60. nexaai/binds/nexa_mlx/py-lib/tts/__init__.py +12 -0
  61. nexaai/binds/nexa_mlx/py-lib/tts/interface.py +276 -0
  62. nexaai/binds/nexa_mlx/py-lib/vlm/__init__.py +3 -0
  63. nexaai/binds/nexa_mlx/py-lib/vlm/generate.py +572 -0
  64. nexaai/binds/nexa_mlx/py-lib/vlm/generate_qwen3_vl.py +294 -0
  65. nexaai/binds/nexa_mlx/py-lib/vlm/generate_qwen3_vl_moe.py +276 -0
  66. nexaai/binds/nexa_mlx/py-lib/vlm/interface.py +504 -0
  67. nexaai/binds/nexa_mlx/py-lib/vlm/main.py +320 -0
  68. nexaai/binds/nexa_mlx/py-lib/vlm/modeling/__init__.py +0 -0
  69. nexaai/binds/nexa_mlx/py-lib/vlm/modeling/convert.py +68 -0
  70. nexaai/binds/nexa_mlx/py-lib/vlm/modeling/models/__init__.py +0 -0
  71. nexaai/binds/nexa_mlx/py-lib/vlm/modeling/models/aya_vision/__init__.py +8 -0
  72. nexaai/binds/nexa_mlx/py-lib/vlm/modeling/models/aya_vision/aya_vision.py +193 -0
  73. nexaai/binds/nexa_mlx/py-lib/vlm/modeling/models/aya_vision/interpolate.py +186 -0
  74. nexaai/binds/nexa_mlx/py-lib/vlm/modeling/models/aya_vision/language.py +233 -0
  75. nexaai/binds/nexa_mlx/py-lib/vlm/modeling/models/aya_vision/vision.py +503 -0
  76. nexaai/binds/nexa_mlx/py-lib/vlm/modeling/models/base.py +202 -0
  77. nexaai/binds/nexa_mlx/py-lib/vlm/modeling/models/cache.py +230 -0
  78. nexaai/binds/nexa_mlx/py-lib/vlm/modeling/models/deepseek_vl_v2/__init__.py +10 -0
  79. nexaai/binds/nexa_mlx/py-lib/vlm/modeling/models/deepseek_vl_v2/conversation.py +264 -0
  80. nexaai/binds/nexa_mlx/py-lib/vlm/modeling/models/deepseek_vl_v2/deepseek_vl_v2.py +472 -0
  81. nexaai/binds/nexa_mlx/py-lib/vlm/modeling/models/deepseek_vl_v2/language.py +591 -0
  82. nexaai/binds/nexa_mlx/py-lib/vlm/modeling/models/deepseek_vl_v2/processing_deepsek_vl_v2.py +526 -0
  83. nexaai/binds/nexa_mlx/py-lib/vlm/modeling/models/deepseek_vl_v2/vision.py +356 -0
  84. nexaai/binds/nexa_mlx/py-lib/vlm/modeling/models/florence2/__init__.py +8 -0
  85. nexaai/binds/nexa_mlx/py-lib/vlm/modeling/models/florence2/florence2.py +366 -0
  86. nexaai/binds/nexa_mlx/py-lib/vlm/modeling/models/florence2/language.py +488 -0
  87. nexaai/binds/nexa_mlx/py-lib/vlm/modeling/models/florence2/vision.py +591 -0
  88. nexaai/binds/nexa_mlx/py-lib/vlm/modeling/models/gemma3/__init__.py +8 -0
  89. nexaai/binds/nexa_mlx/py-lib/vlm/modeling/models/gemma3/gemma3.py +213 -0
  90. nexaai/binds/nexa_mlx/py-lib/vlm/modeling/models/gemma3/language.py +315 -0
  91. nexaai/binds/nexa_mlx/py-lib/vlm/modeling/models/gemma3/vision.py +238 -0
  92. nexaai/binds/nexa_mlx/py-lib/vlm/modeling/models/gemma3n/__init__.py +2 -0
  93. nexaai/binds/nexa_mlx/py-lib/vlm/modeling/models/gemma3n/audio.py +1038 -0
  94. nexaai/binds/nexa_mlx/py-lib/vlm/modeling/models/gemma3n/config.py +139 -0
  95. nexaai/binds/nexa_mlx/py-lib/vlm/modeling/models/gemma3n/gemma3n.py +322 -0
  96. nexaai/binds/nexa_mlx/py-lib/vlm/modeling/models/gemma3n/language.py +629 -0
  97. nexaai/binds/nexa_mlx/py-lib/vlm/modeling/models/gemma3n/vision.py +1022 -0
  98. nexaai/binds/nexa_mlx/py-lib/vlm/modeling/models/idefics2/__init__.py +9 -0
  99. nexaai/binds/nexa_mlx/py-lib/vlm/modeling/models/idefics2/idefics2.py +294 -0
  100. nexaai/binds/nexa_mlx/py-lib/vlm/modeling/models/idefics2/language.py +191 -0
  101. nexaai/binds/nexa_mlx/py-lib/vlm/modeling/models/idefics2/vision.py +267 -0
  102. nexaai/binds/nexa_mlx/py-lib/vlm/modeling/models/idefics3/__init__.py +8 -0
  103. nexaai/binds/nexa_mlx/py-lib/vlm/modeling/models/idefics3/idefics3.py +175 -0
  104. nexaai/binds/nexa_mlx/py-lib/vlm/modeling/models/idefics3/language.py +192 -0
  105. nexaai/binds/nexa_mlx/py-lib/vlm/modeling/models/idefics3/vision.py +233 -0
  106. nexaai/binds/nexa_mlx/py-lib/vlm/modeling/models/internvl_chat/__init__.py +9 -0
  107. nexaai/binds/nexa_mlx/py-lib/vlm/modeling/models/internvl_chat/internvl_chat.py +140 -0
  108. nexaai/binds/nexa_mlx/py-lib/vlm/modeling/models/internvl_chat/language.py +220 -0
  109. nexaai/binds/nexa_mlx/py-lib/vlm/modeling/models/internvl_chat/processor.py +393 -0
  110. nexaai/binds/nexa_mlx/py-lib/vlm/modeling/models/internvl_chat/vision.py +293 -0
  111. nexaai/binds/nexa_mlx/py-lib/vlm/modeling/models/kernels.py +307 -0
  112. nexaai/binds/nexa_mlx/py-lib/vlm/modeling/models/kimi_vl/__init__.py +8 -0
  113. nexaai/binds/nexa_mlx/py-lib/vlm/modeling/models/kimi_vl/kimi_vl.py +143 -0
  114. nexaai/binds/nexa_mlx/py-lib/vlm/modeling/models/kimi_vl/language.py +509 -0
  115. nexaai/binds/nexa_mlx/py-lib/vlm/modeling/models/kimi_vl/vision.py +522 -0
  116. nexaai/binds/nexa_mlx/py-lib/vlm/modeling/models/llama4/__init__.py +8 -0
  117. nexaai/binds/nexa_mlx/py-lib/vlm/modeling/models/llama4/language.py +386 -0
  118. nexaai/binds/nexa_mlx/py-lib/vlm/modeling/models/llama4/llama4.py +138 -0
  119. nexaai/binds/nexa_mlx/py-lib/vlm/modeling/models/llama4/vision.py +560 -0
  120. nexaai/binds/nexa_mlx/py-lib/vlm/modeling/models/llava/__init__.py +8 -0
  121. nexaai/binds/nexa_mlx/py-lib/vlm/modeling/models/llava/language.py +240 -0
  122. nexaai/binds/nexa_mlx/py-lib/vlm/modeling/models/llava/llava.py +153 -0
  123. nexaai/binds/nexa_mlx/py-lib/vlm/modeling/models/llava/vision.py +259 -0
  124. nexaai/binds/nexa_mlx/py-lib/vlm/modeling/models/llava_bunny/__init__.py +9 -0
  125. nexaai/binds/nexa_mlx/py-lib/vlm/modeling/models/llava_bunny/language.py +236 -0
  126. nexaai/binds/nexa_mlx/py-lib/vlm/modeling/models/llava_bunny/llava_bunny.py +256 -0
  127. nexaai/binds/nexa_mlx/py-lib/vlm/modeling/models/llava_bunny/vision.py +303 -0
  128. nexaai/binds/nexa_mlx/py-lib/vlm/modeling/models/llava_next/__init__.py +8 -0
  129. nexaai/binds/nexa_mlx/py-lib/vlm/modeling/models/llava_next/language.py +230 -0
  130. nexaai/binds/nexa_mlx/py-lib/vlm/modeling/models/llava_next/llava_next.py +160 -0
  131. nexaai/binds/nexa_mlx/py-lib/vlm/modeling/models/llava_next/vision.py +243 -0
  132. nexaai/binds/nexa_mlx/py-lib/vlm/modeling/models/mistral3/__init__.py +8 -0
  133. nexaai/binds/nexa_mlx/py-lib/vlm/modeling/models/mistral3/mistral3.py +283 -0
  134. nexaai/binds/nexa_mlx/py-lib/vlm/modeling/models/mllama/__init__.py +8 -0
  135. nexaai/binds/nexa_mlx/py-lib/vlm/modeling/models/mllama/language.py +416 -0
  136. nexaai/binds/nexa_mlx/py-lib/vlm/modeling/models/mllama/mllama.py +172 -0
  137. nexaai/binds/nexa_mlx/py-lib/vlm/modeling/models/mllama/vision.py +499 -0
  138. nexaai/binds/nexa_mlx/py-lib/vlm/modeling/models/molmo/__init__.py +8 -0
  139. nexaai/binds/nexa_mlx/py-lib/vlm/modeling/models/molmo/language.py +243 -0
  140. nexaai/binds/nexa_mlx/py-lib/vlm/modeling/models/molmo/molmo.py +133 -0
  141. nexaai/binds/nexa_mlx/py-lib/vlm/modeling/models/molmo/vision.py +465 -0
  142. nexaai/binds/nexa_mlx/py-lib/vlm/modeling/models/multi_modality/__init__.py +10 -0
  143. nexaai/binds/nexa_mlx/py-lib/vlm/modeling/models/multi_modality/language.py +230 -0
  144. nexaai/binds/nexa_mlx/py-lib/vlm/modeling/models/multi_modality/multi_modality.py +385 -0
  145. nexaai/binds/nexa_mlx/py-lib/vlm/modeling/models/multi_modality/sam.py +557 -0
  146. nexaai/binds/nexa_mlx/py-lib/vlm/modeling/models/multi_modality/vision.py +526 -0
  147. nexaai/binds/nexa_mlx/py-lib/vlm/modeling/models/paligemma/__init__.py +8 -0
  148. nexaai/binds/nexa_mlx/py-lib/vlm/modeling/models/paligemma/language.py +282 -0
  149. nexaai/binds/nexa_mlx/py-lib/vlm/modeling/models/paligemma/paligemma.py +160 -0
  150. nexaai/binds/nexa_mlx/py-lib/vlm/modeling/models/paligemma/vision.py +242 -0
  151. nexaai/binds/nexa_mlx/py-lib/vlm/modeling/models/phi3_v/__init__.py +8 -0
  152. nexaai/binds/nexa_mlx/py-lib/vlm/modeling/models/phi3_v/language.py +21 -0
  153. nexaai/binds/nexa_mlx/py-lib/vlm/modeling/models/phi3_v/phi3_v.py +243 -0
  154. nexaai/binds/nexa_mlx/py-lib/vlm/modeling/models/phi3_v/su_rope.py +71 -0
  155. nexaai/binds/nexa_mlx/py-lib/vlm/modeling/models/phi3_v/vision.py +324 -0
  156. nexaai/binds/nexa_mlx/py-lib/vlm/modeling/models/pixtral/__init__.py +8 -0
  157. nexaai/binds/nexa_mlx/py-lib/vlm/modeling/models/pixtral/language.py +229 -0
  158. nexaai/binds/nexa_mlx/py-lib/vlm/modeling/models/pixtral/pixtral.py +161 -0
  159. nexaai/binds/nexa_mlx/py-lib/vlm/modeling/models/pixtral/vision.py +320 -0
  160. nexaai/binds/nexa_mlx/py-lib/vlm/modeling/models/qwen2_5_vl/__init__.py +2 -0
  161. nexaai/binds/nexa_mlx/py-lib/vlm/modeling/models/qwen2_5_vl/config.py +108 -0
  162. nexaai/binds/nexa_mlx/py-lib/vlm/modeling/models/qwen2_5_vl/language.py +490 -0
  163. nexaai/binds/nexa_mlx/py-lib/vlm/modeling/models/qwen2_5_vl/qwen2_5_vl.py +168 -0
  164. nexaai/binds/nexa_mlx/py-lib/vlm/modeling/models/qwen2_5_vl/vision.py +414 -0
  165. nexaai/binds/nexa_mlx/py-lib/vlm/modeling/models/qwen2_vl/__init__.py +2 -0
  166. nexaai/binds/nexa_mlx/py-lib/vlm/modeling/models/qwen2_vl/config.py +104 -0
  167. nexaai/binds/nexa_mlx/py-lib/vlm/modeling/models/qwen2_vl/language.py +490 -0
  168. nexaai/binds/nexa_mlx/py-lib/vlm/modeling/models/qwen2_vl/qwen2_vl.py +167 -0
  169. nexaai/binds/nexa_mlx/py-lib/vlm/modeling/models/qwen2_vl/vision.py +312 -0
  170. nexaai/binds/nexa_mlx/py-lib/vlm/modeling/models/qwen3_vl/llm_common/__init__.py +0 -0
  171. nexaai/binds/nexa_mlx/py-lib/vlm/modeling/models/qwen3_vl/llm_common/base.py +117 -0
  172. nexaai/binds/nexa_mlx/py-lib/vlm/modeling/models/qwen3_vl/llm_common/cache.py +531 -0
  173. nexaai/binds/nexa_mlx/py-lib/vlm/modeling/models/qwen3_vl/llm_common/generate.py +701 -0
  174. nexaai/binds/nexa_mlx/py-lib/vlm/modeling/models/qwen3_vl/llm_common/rope_utils.py +255 -0
  175. nexaai/binds/nexa_mlx/py-lib/vlm/modeling/models/qwen3_vl/llm_common/sample_utils.py +303 -0
  176. nexaai/binds/nexa_mlx/py-lib/vlm/modeling/models/qwen3_vl/llm_common/tokenizer_utils.py +407 -0
  177. nexaai/binds/nexa_mlx/py-lib/vlm/modeling/models/qwen3_vl/processor.py +476 -0
  178. nexaai/binds/nexa_mlx/py-lib/vlm/modeling/models/qwen3_vl/qwen3vl.py +1223 -0
  179. nexaai/binds/nexa_mlx/py-lib/vlm/modeling/models/qwen3vl_moe/llm_common/__init__.py +0 -0
  180. nexaai/binds/nexa_mlx/py-lib/vlm/modeling/models/qwen3vl_moe/llm_common/base.py +117 -0
  181. nexaai/binds/nexa_mlx/py-lib/vlm/modeling/models/qwen3vl_moe/llm_common/cache.py +531 -0
  182. nexaai/binds/nexa_mlx/py-lib/vlm/modeling/models/qwen3vl_moe/llm_common/generate.py +701 -0
  183. nexaai/binds/nexa_mlx/py-lib/vlm/modeling/models/qwen3vl_moe/llm_common/rope_utils.py +255 -0
  184. nexaai/binds/nexa_mlx/py-lib/vlm/modeling/models/qwen3vl_moe/llm_common/sample_utils.py +303 -0
  185. nexaai/binds/nexa_mlx/py-lib/vlm/modeling/models/qwen3vl_moe/llm_common/tokenizer_utils.py +407 -0
  186. nexaai/binds/nexa_mlx/py-lib/vlm/modeling/models/qwen3vl_moe/processor.py +476 -0
  187. nexaai/binds/nexa_mlx/py-lib/vlm/modeling/models/qwen3vl_moe/qwen3vl_moe.py +1309 -0
  188. nexaai/binds/nexa_mlx/py-lib/vlm/modeling/models/qwen3vl_moe/switch_layers.py +210 -0
  189. nexaai/binds/nexa_mlx/py-lib/vlm/modeling/models/smolvlm/__init__.py +8 -0
  190. nexaai/binds/nexa_mlx/py-lib/vlm/modeling/models/smolvlm/smolvlm.py +62 -0
  191. nexaai/binds/nexa_mlx/py-lib/vlm/modeling/processing_qwen2_5_vl.py +209 -0
  192. nexaai/binds/nexa_mlx/py-lib/vlm/modeling/processing_qwen2_vl.py +215 -0
  193. nexaai/binds/nexa_mlx/py-lib/vlm/modeling/prompt_utils.py +474 -0
  194. nexaai/binds/nexa_mlx/py-lib/vlm/modeling/sample_utils.py +39 -0
  195. nexaai/binds/nexa_mlx/py-lib/vlm/modeling/tokenizer_utils.py +344 -0
  196. nexaai/binds/nexa_mlx/py-lib/vlm/modeling/trainer/__init__.py +9 -0
  197. nexaai/binds/nexa_mlx/py-lib/vlm/modeling/trainer/lora.py +70 -0
  198. nexaai/binds/nexa_mlx/py-lib/vlm/modeling/trainer/trainer.py +296 -0
  199. nexaai/binds/nexa_mlx/py-lib/vlm/modeling/trainer/utils.py +160 -0
  200. nexaai/binds/nexa_mlx/py-lib/vlm/modeling/utils.py +928 -0
  201. nexaai/binds/nexa_nexaml/libggml-base.dylib +0 -0
  202. nexaai/binds/nexa_nexaml/libggml-cpu.so +0 -0
  203. nexaai/binds/nexa_nexaml/libggml-metal.so +0 -0
  204. nexaai/binds/nexa_nexaml/libggml.dylib +0 -0
  205. nexaai/mlx_backend/vlm/generate_qwen3_vl_moe.py +276 -0
  206. nexaai/mlx_backend/vlm/interface.py +21 -4
  207. nexaai/mlx_backend/vlm/main.py +6 -2
  208. nexaai/mlx_backend/vlm/modeling/models/qwen3vl_moe/llm_common/__init__.py +0 -0
  209. nexaai/mlx_backend/vlm/modeling/models/qwen3vl_moe/llm_common/base.py +117 -0
  210. nexaai/mlx_backend/vlm/modeling/models/qwen3vl_moe/llm_common/cache.py +531 -0
  211. nexaai/mlx_backend/vlm/modeling/models/qwen3vl_moe/llm_common/generate.py +701 -0
  212. nexaai/mlx_backend/vlm/modeling/models/qwen3vl_moe/llm_common/rope_utils.py +255 -0
  213. nexaai/mlx_backend/vlm/modeling/models/qwen3vl_moe/llm_common/sample_utils.py +303 -0
  214. nexaai/mlx_backend/vlm/modeling/models/qwen3vl_moe/llm_common/tokenizer_utils.py +407 -0
  215. nexaai/mlx_backend/vlm/modeling/models/qwen3vl_moe/processor.py +476 -0
  216. nexaai/mlx_backend/vlm/modeling/models/qwen3vl_moe/qwen3vl_moe.py +1309 -0
  217. nexaai/mlx_backend/vlm/modeling/models/qwen3vl_moe/switch_layers.py +210 -0
  218. nexaai/utils/manifest_utils.py +222 -15
  219. nexaai/utils/model_manager.py +83 -7
  220. nexaai/utils/model_types.py +2 -0
  221. {nexaai-1.0.19rc6.dist-info → nexaai-1.0.19rc7.dist-info}/METADATA +1 -1
  222. {nexaai-1.0.19rc6.dist-info → nexaai-1.0.19rc7.dist-info}/RECORD +224 -24
  223. {nexaai-1.0.19rc6.dist-info → nexaai-1.0.19rc7.dist-info}/WHEEL +0 -0
  224. {nexaai-1.0.19rc6.dist-info → nexaai-1.0.19rc7.dist-info}/top_level.txt +0 -0
@@ -0,0 +1,274 @@
+ # Copyright © 2023 Apple Inc.
+
+ import math
+ from typing import List
+
+ import mlx.core as mx
+ import mlx.nn as nn
+
+ from .config import AutoencoderConfig
+ from .unet import ResnetBlock2D, upsample_nearest
+
+
+ class Attention(nn.Module):
+     """A single head unmasked attention for use with the VAE."""
+
+     def __init__(self, dims: int, norm_groups: int = 32):
+         super().__init__()
+
+         self.group_norm = nn.GroupNorm(norm_groups, dims, pytorch_compatible=True)
+         self.query_proj = nn.Linear(dims, dims)
+         self.key_proj = nn.Linear(dims, dims)
+         self.value_proj = nn.Linear(dims, dims)
+         self.out_proj = nn.Linear(dims, dims)
+
+     def __call__(self, x):
+         B, H, W, C = x.shape
+
+         y = self.group_norm(x)
+
+         queries = self.query_proj(y).reshape(B, H * W, C)
+         keys = self.key_proj(y).reshape(B, H * W, C)
+         values = self.value_proj(y).reshape(B, H * W, C)
+
+         scale = 1 / math.sqrt(queries.shape[-1])
+         scores = (queries * scale) @ keys.transpose(0, 2, 1)
+         attn = mx.softmax(scores, axis=-1)
+         y = (attn @ values).reshape(B, H, W, C)
+
+         y = self.out_proj(y)
+         x = x + y
+
+         return x
+
+
+ class EncoderDecoderBlock2D(nn.Module):
+     def __init__(
+         self,
+         in_channels: int,
+         out_channels: int,
+         num_layers: int = 1,
+         resnet_groups: int = 32,
+         add_downsample=True,
+         add_upsample=True,
+     ):
+         super().__init__()
+
+         # Add the resnet blocks
+         self.resnets = [
+             ResnetBlock2D(
+                 in_channels=in_channels if i == 0 else out_channels,
+                 out_channels=out_channels,
+                 groups=resnet_groups,
+             )
+             for i in range(num_layers)
+         ]
+
+         # Add an optional downsampling layer
+         if add_downsample:
+             self.downsample = nn.Conv2d(
+                 out_channels, out_channels, kernel_size=3, stride=2, padding=0
+             )
+
+         # or upsampling layer
+         if add_upsample:
+             self.upsample = nn.Conv2d(
+                 out_channels, out_channels, kernel_size=3, stride=1, padding=1
+             )
+
+     def __call__(self, x):
+         for resnet in self.resnets:
+             x = resnet(x)
+
+         if "downsample" in self:
+             x = mx.pad(x, [(0, 0), (0, 1), (0, 1), (0, 0)])
+             x = self.downsample(x)
+
+         if "upsample" in self:
+             x = self.upsample(upsample_nearest(x))
+
+         return x
+
+
+ class Encoder(nn.Module):
+     """Implements the encoder side of the Autoencoder."""
+
+     def __init__(
+         self,
+         in_channels: int,
+         out_channels: int,
+         block_out_channels: List[int] = [64],
+         layers_per_block: int = 2,
+         resnet_groups: int = 32,
+     ):
+         super().__init__()
+
+         self.conv_in = nn.Conv2d(
+             in_channels, block_out_channels[0], kernel_size=3, stride=1, padding=1
+         )
+
+         channels = [block_out_channels[0]] + list(block_out_channels)
+         self.down_blocks = [
+             EncoderDecoderBlock2D(
+                 in_channels,
+                 out_channels,
+                 num_layers=layers_per_block,
+                 resnet_groups=resnet_groups,
+                 add_downsample=i < len(block_out_channels) - 1,
+                 add_upsample=False,
+             )
+             for i, (in_channels, out_channels) in enumerate(zip(channels, channels[1:]))
+         ]
+
+         self.mid_blocks = [
+             ResnetBlock2D(
+                 in_channels=block_out_channels[-1],
+                 out_channels=block_out_channels[-1],
+                 groups=resnet_groups,
+             ),
+             Attention(block_out_channels[-1], resnet_groups),
+             ResnetBlock2D(
+                 in_channels=block_out_channels[-1],
+                 out_channels=block_out_channels[-1],
+                 groups=resnet_groups,
+             ),
+         ]
+
+         self.conv_norm_out = nn.GroupNorm(
+             resnet_groups, block_out_channels[-1], pytorch_compatible=True
+         )
+         self.conv_out = nn.Conv2d(block_out_channels[-1], out_channels, 3, padding=1)
+
+     def __call__(self, x):
+         x = self.conv_in(x)
+
+         for l in self.down_blocks:
+             x = l(x)
+
+         x = self.mid_blocks[0](x)
+         x = self.mid_blocks[1](x)
+         x = self.mid_blocks[2](x)
+
+         x = self.conv_norm_out(x)
+         x = nn.silu(x)
+         x = self.conv_out(x)
+
+         return x
+
+
+ class Decoder(nn.Module):
+     """Implements the decoder side of the Autoencoder."""
+
+     def __init__(
+         self,
+         in_channels: int,
+         out_channels: int,
+         block_out_channels: List[int] = [64],
+         layers_per_block: int = 2,
+         resnet_groups: int = 32,
+     ):
+         super().__init__()
+
+         self.conv_in = nn.Conv2d(
+             in_channels, block_out_channels[-1], kernel_size=3, stride=1, padding=1
+         )
+
+         self.mid_blocks = [
+             ResnetBlock2D(
+                 in_channels=block_out_channels[-1],
+                 out_channels=block_out_channels[-1],
+                 groups=resnet_groups,
+             ),
+             Attention(block_out_channels[-1], resnet_groups),
+             ResnetBlock2D(
+                 in_channels=block_out_channels[-1],
+                 out_channels=block_out_channels[-1],
+                 groups=resnet_groups,
+             ),
+         ]
+
+         channels = list(reversed(block_out_channels))
+         channels = [channels[0]] + channels
+         self.up_blocks = [
+             EncoderDecoderBlock2D(
+                 in_channels,
+                 out_channels,
+                 num_layers=layers_per_block,
+                 resnet_groups=resnet_groups,
+                 add_downsample=False,
+                 add_upsample=i < len(block_out_channels) - 1,
+             )
+             for i, (in_channels, out_channels) in enumerate(zip(channels, channels[1:]))
+         ]
+
+         self.conv_norm_out = nn.GroupNorm(
+             resnet_groups, block_out_channels[0], pytorch_compatible=True
+         )
+         self.conv_out = nn.Conv2d(block_out_channels[0], out_channels, 3, padding=1)
+
+     def __call__(self, x):
+         x = self.conv_in(x)
+
+         x = self.mid_blocks[0](x)
+         x = self.mid_blocks[1](x)
+         x = self.mid_blocks[2](x)
+
+         for l in self.up_blocks:
+             x = l(x)
+
+         x = self.conv_norm_out(x)
+         x = nn.silu(x)
+         x = self.conv_out(x)
+
+         return x
+
+
+ class Autoencoder(nn.Module):
+     """The autoencoder that allows us to perform diffusion in the latent space."""
+
+     def __init__(self, config: AutoencoderConfig):
+         super().__init__()
+
+         self.latent_channels = config.latent_channels_in
+         self.scaling_factor = config.scaling_factor
+         self.encoder = Encoder(
+             config.in_channels,
+             config.latent_channels_out,
+             config.block_out_channels,
+             config.layers_per_block,
+             resnet_groups=config.norm_num_groups,
+         )
+         self.decoder = Decoder(
+             config.latent_channels_in,
+             config.out_channels,
+             config.block_out_channels,
+             config.layers_per_block + 1,
+             resnet_groups=config.norm_num_groups,
+         )
+
+         self.quant_proj = nn.Linear(
+             config.latent_channels_out, config.latent_channels_out
+         )
+         self.post_quant_proj = nn.Linear(
+             config.latent_channels_in, config.latent_channels_in
+         )
+
+     def decode(self, z):
+         z = z / self.scaling_factor
+         return self.decoder(self.post_quant_proj(z))
+
+     def encode(self, x):
+         x = self.encoder(x)
+         x = self.quant_proj(x)
+         mean, logvar = x.split(2, axis=-1)
+         mean = mean * self.scaling_factor
+         logvar = logvar + 2 * math.log(self.scaling_factor)
+
+         return mean, logvar
+
+     def __call__(self, x, key=None):
+         mean, logvar = self.encode(x)
+         z = mx.random.normal(mean.shape, key=key) * mx.exp(0.5 * logvar) + mean
+         x_hat = self.decode(z)
+
+         return dict(x_hat=x_hat, z=z, mean=mean, logvar=logvar)
@@ -0,0 +1,12 @@
+ # patching the _resume method in phonemizer because logger.setLevel(logging.ERROR) doesn't work - the logger instance is created and stored in the package.
+ try:
+     from phonemizer.backend.espeak.words_mismatch import BaseWordsMismatch
+
+     def silent_resume(self, nmismatch, nlines):
+         """Silent version of _resume that suppresses warnings"""
+         pass
+
+     BaseWordsMismatch._resume = silent_resume
+
+ except ImportError:
+     pass
@@ -0,0 +1,276 @@
+ from typing import Any, List, Optional, Sequence
+ import argparse
+ import sys
+ import os
+ import glob
+ import tempfile
+ import time
+ import soundfile as sf
+ import mlx.core as mx
+ import numpy as np
+
+ from ml import TTS, TTSConfig, TTSResult, TTSSamplerConfig, Path as MLPath
+ from mlx_audio.tts.utils import load_model
+
+ from profiling import ProfilingMixin, StopReason
+
+ class MlxTts(TTS, ProfilingMixin):
+     """MLX Audio implementation of TTS interface."""
+
+     def __init__(
+         self,
+         model_path: MLPath,
+         vocoder_path: MLPath,
+         device: Optional[str] = None,
+     ) -> None:
+         ProfilingMixin.__init__(self)
+
+         if os.path.isfile(model_path):
+             model_path = os.path.dirname(model_path)
+
+         # vocoder_path is not used in MLX TTS since the vocoder is integrated
+         super().__init__(model_path, vocoder_path, device)
+         self._sampler_config = TTSSamplerConfig()
+         self.model = None
+         self._model_loaded = False
+
+         # Load model during initialization (matching C API behavior)
+         self._load_model()
+
+     def _load_model(self) -> bool:
+         """Load the TTS model."""
+         try:
+             self.model = load_model(self.model_path)
+             self._model_loaded = True
+             return True
+         except Exception as e:
+             print(f"Failed to load TTS model: {e}")
+             return False
+
+     def destroy(self) -> None:
+         """Destroy the model and free resources."""
+         if self.model is not None:
+             del self.model
+             self.model = None
+             mx.clear_cache()
+         self._model_loaded = False
+
+     def synthesize(
+         self,
+         text: str,
+         config: Optional[TTSConfig] = None,
+         output_path: Optional[MLPath] = None,
+         clear_cache: bool = True,
+     ) -> TTSResult:
+         """Synthesize speech from text and save to filesystem."""
+         # Ensure model is loaded
+         if not self._model_loaded or self.model is None:
+             raise RuntimeError("TTS model not loaded")
+
+         # Start profiling
+         self._start_profiling()
+         self._prompt_start()
+
+         try:
+             # Use default config if not provided
+             if config is None:
+                 config = TTSConfig()
+
+             # Generate output path if not provided
+             if output_path is None:
+                 timestamp = int(time.time() * 1000)
+                 output_path = os.path.join(tempfile.gettempdir(), f"tts_output_{timestamp}.wav")
+
+             # Resolve voice path for Kokoro models
+             voice = config.voice
+             if voice and not voice.endswith(".pt") and not os.path.isabs(voice):
+                 # For relative voice names like "af_heart", construct full path
+                 voice_path = os.path.join(self.model_path, "voices", f"{voice}.pt")
+                 if os.path.exists(voice_path):
+                     voice = voice_path
+
+             # End prompt processing, start decode
+             self._prompt_end()
+             self._decode_start()
+
+             results = self.model.generate(
+                 text=text,
+                 voice=voice,
+                 speed=config.speed,
+                 temperature=self._sampler_config.temperature,
+                 seed=config.seed if config.seed != -1 else None,
+                 verbose=False,
+                 stream=False,
+                 join_audio=True,
+             )
+
+             # Get the results (should be a generator)
+             audio_list = []
+             sample_rate = None
+             for result in results:
+                 audio_list.append(result.audio)
+                 sample_rate = result.sample_rate
+
+             if not audio_list:
+                 raise RuntimeError("No audio generated")
+
+             # Concatenate audio if multiple chunks
+             if len(audio_list) > 1:
+                 audio = mx.concatenate(audio_list, axis=0)
+             else:
+                 audio = audio_list[0]
+
+             # Convert MLX array to numpy for saving
+             if isinstance(audio, mx.array):
+                 audio_np = np.array(audio)
+             else:
+                 audio_np = audio
+
+             # Save audio to file
+             sf.write(output_path, audio_np, sample_rate)
+
+             if clear_cache:
+                 mx.clear_cache()
+
+             # Calculate metadata
+             channels = 1 if len(audio_np.shape) == 1 else audio_np.shape[1]
+             num_samples = len(audio_np)
+             duration_seconds = num_samples / sample_rate
+
+             # End decode and profiling
+             self._decode_end()
+             self._set_stop_reason(StopReason.ML_STOP_REASON_COMPLETED)
+             self._end_profiling()
+
+             return TTSResult(
+                 audio_path=output_path,
+                 duration_seconds=duration_seconds,
+                 sample_rate=sample_rate,
+                 channels=channels,
+                 num_samples=num_samples
+             )
+         except Exception as e:
+             # End profiling on error
+             self._end_profiling()
+             raise e
+
+     def list_available_voices(self) -> List[str]:
+         """List available voices."""
+         # Common MLX TTS voice names - this could be enhanced to discover voices dynamically
+         default_voices = [
+             "af_heart", "af_bella", "af_nicole", "af_sarah", "af_sky", "af_sunshine",
+             "am_adam", "am_michael", "am_mead", "an_nova", "an_michael",
+             "bf_emma", "bf_isabella", "bm_george", "bm_lewis"
+         ]
+
+         # Try to discover voices from model directory if available
+         if self.model_path and os.path.exists(self.model_path):
+             discovered_voices = []
+             voice_patterns = [
+                 "*.pt", # Voice files in model root
+                 "voices/*.pt", # Voice files in voices subdirectory
+             ]
+
+             for pattern in voice_patterns:
+                 voice_files = glob.glob(os.path.join(self.model_path, pattern))
+                 for voice_file in voice_files:
+                     voice_name = os.path.splitext(os.path.basename(voice_file))[0]
+                     discovered_voices.append(voice_name)
+
+             if discovered_voices:
+                 return discovered_voices
+
+         return default_voices
+
+
+ def main():
+     """Main function for command line text-to-speech synthesis."""
+     parser = argparse.ArgumentParser(description="Synthesize speech using MLX TTS")
+     parser.add_argument("model_path", help="Path to the TTS model")
+     parser.add_argument("text", help="Text to synthesize")
+     parser.add_argument("--voice", "-v", default="af_heart", help="Voice to use (default: af_heart)")
+     parser.add_argument("--speed", "-s", type=float, default=1.0, help="Speech speed (default: 1.0)")
+     parser.add_argument("--output", "-o", default="output.wav", help="Output audio file (default: output.wav)")
+     parser.add_argument("--sample-rate", "-sr", type=int, default=24000, help="Sample rate (default: 24000)")
+     parser.add_argument("--temperature", "-t", type=float, default=0.7, help="Temperature for sampling (default: 0.7)")
+     parser.add_argument("--seed", type=int, default=-1, help="Random seed (-1 for random)")
+     parser.add_argument("--list-voices", action="store_true", help="List available voices")
+
+     args = parser.parse_args()
+
+     # Check if model path exists
+     if not os.path.exists(args.model_path):
+         print(f"Error: Model path does not exist: {args.model_path}")
+         sys.exit(1)
+
+     # Initialize TTS adapter
+     print(f"Initializing TTS with model: {args.model_path}")
+     try:
+         tts = MlxTts(
+             model_path=args.model_path,
+             vocoder_path="", # Not used in MLX TTS
+             device=None
+         )
+
+         print("TTS model loaded successfully")
+
+         # List voices if requested
+         if args.list_voices:
+             voices = tts.list_available_voices()
+             print(f"Available voices: {', '.join(voices)}")
+             return
+
+     except Exception as e:
+         print(f"Error initializing TTS: {e}")
+         sys.exit(1)
+
+     # Set up synthesis config
+     sampler_config = TTSSamplerConfig(
+         temperature=args.temperature,
+         noise_scale=0.667,
+         length_scale=1.0
+     )
+     tts._sampler_config = sampler_config
+
+     config = TTSConfig(
+         voice=args.voice,
+         speed=args.speed,
+         seed=args.seed,
+         sample_rate=args.sample_rate
+     )
+
+     # Synthesize speech
+     print(f"Synthesizing text: '{args.text}'")
+     print(f"Using voice: {args.voice}")
+     print(f"Speed: {args.speed}x")
+     print("-" * 50)
+
+     try:
+         result = tts.synthesize(args.text, config, args.output)
+
+         # Print results
+         print("Synthesis Results:")
+         print("=" * 50)
+         print(f"Audio generated:")
+         print(f" Duration: {result.duration_seconds:.2f} seconds")
+         print(f" Sample rate: {result.sample_rate} Hz")
+         print(f" Channels: {result.channels}")
+         print(f" Samples: {result.num_samples}")
+         print(f"✅ Audio saved to: {result.audio_path}")
+
+     except Exception as e:
+         print(f"Error during synthesis: {e}")
+         sys.exit(1)
+     finally:
+         # Clean up
+         tts.destroy()
+
+
+ if __name__ == "__main__":
+     main()
@@ -0,0 +1,3 @@
+ import logging
+
+ logging.getLogger("transformers").setLevel(logging.ERROR)