PyPI - nexaai - Versions diffs - 1.0.29__cp310-cp310-macosx_14_0_universal2.whl - Mend

nexaai 1.0.29__cp310-cp310-macosx_14_0_universal2.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (580) hide show

nexaai/mlx_backend/mlx_audio/sts/tests/test_voice_pipeline.py ADDED Viewed

@@ -0,0 +1,156 @@
+from unittest import mock
+import numpy as np
+import pytest
+from mlx_audio.sts.voice_pipeline import VoicePipeline
+class TestVoicePipeline:
+    """Unit tests for VoicePipeline construction, model loading and speech detection."""
+    def test_initialization_default_params(self):
+        """
+        Test that a pipeline built without arguments exposes the default parameters.
+        """
+        pipeline = VoicePipeline()
+        assert pipeline.silence_threshold == 0.03
+        assert pipeline.silence_duration == 1.5
+        assert pipeline.input_sample_rate == 16_000
+        assert pipeline.output_sample_rate == 24_000
+        assert pipeline.streaming_interval == 3
+        assert pipeline.frame_duration_ms == 30
+        assert pipeline.stt_model == "mlx-community/whisper-large-v3-turbo"
+        assert pipeline.llm_model == "Qwen/Qwen2.5-0.5B-Instruct-4bit"
+        assert pipeline.tts_model == "mlx-community/csm-1b-fp16"
+    def test_initialization_custom_params(self):
+        """
+        Test that constructor arguments are stored on the matching attributes.
+        """
+        pipeline = VoicePipeline(
+            silence_threshold=0.05,
+            silence_duration=2.0,
+            input_sample_rate=8_000,
+            output_sample_rate=12_000,
+            streaming_interval=5,
+            frame_duration_ms=20,
+            vad_mode=2,
+            stt_model="custom/stt",
+            llm_model="custom/llm",
+            tts_model="custom/tts",
+        )
+        assert pipeline.silence_threshold == 0.05
+        assert pipeline.silence_duration == 2.0
+        assert pipeline.input_sample_rate == 8_000
+        assert pipeline.output_sample_rate == 12_000
+        assert pipeline.streaming_interval == 5
+        assert pipeline.frame_duration_ms == 20
+        assert pipeline.stt_model == "custom/stt"
+        assert pipeline.llm_model == "custom/llm"
+        assert pipeline.tts_model == "custom/tts"
+    # The asyncio marker lets pytest-asyncio await this coroutine test; without
+    # it the coroutine is never run unless the plugin is configured in auto mode.
+    @pytest.mark.asyncio
+    @mock.patch("mlx_audio.sts.voice_pipeline.load_llm")
+    @mock.patch("mlx_audio.sts.voice_pipeline.load_tts")
+    @mock.patch("mlx_audio.sts.voice_pipeline.Whisper.from_pretrained")
+    async def test_init_models(self, mock_whisper_load, mock_tts_load, mock_llm_load):
+        """
+        Test that init_models calls each loader once and stores what it returns.
+        """
+        pipeline = VoicePipeline()
+        # Mock the return values of the model loaders
+        mock_llm = mock.AsyncMock()
+        mock_tokenizer = mock.AsyncMock()
+        mock_llm_load.return_value = (mock_llm, mock_tokenizer)
+        mock_tts = mock.AsyncMock()
+        mock_tts_load.return_value = mock_tts
+        mock_stt = mock.AsyncMock()
+        mock_whisper_load.return_value = mock_stt
+        await pipeline.init_models()
+        mock_llm_load.assert_called_once_with(pipeline.llm_model)
+        mock_tts_load.assert_called_once_with(pipeline.tts_model)
+        mock_whisper_load.assert_called_once_with(pipeline.stt_model)
+        assert pipeline.llm is mock_llm
+        assert pipeline.tokenizer is mock_tokenizer
+        assert pipeline.tts is mock_tts
+        assert pipeline.stt is mock_stt
+    def test_is_silent_true(self):
+        """
+        Test that the is_silent method returns True for silent audio frames.
+        """
+        pipeline = VoicePipeline(silence_threshold=0.1)
+        # Seeded generator keeps the frame identical between runs.
+        rng = np.random.default_rng(0)
+        # Create a silent audio frame (very low amplitude)
+        silent_audio_data_np = rng.uniform(-0.01, 0.01, size=480).astype(
+            np.float32
+        )  # 30ms at 16kHz
+        silent_audio_data_bytes = (
+            (silent_audio_data_np * 32768.0).astype(np.int16).tobytes()
+        )
+        # Truthiness instead of identity with np.True_: the test should not
+        # depend on which boolean type the method happens to return.
+        assert pipeline._is_silent(silent_audio_data_np)
+        assert pipeline._is_silent(silent_audio_data_bytes)
+    def test_is_silent_false(self):
+        """
+        Test that the is_silent method returns False for non-silent audio frames.
+        """
+        pipeline = VoicePipeline(silence_threshold=0.001)
+        rng = np.random.default_rng(0)
+        # Create a non-silent audio frame; the amplitude stays inside (-1, 1)
+        # so that scaling by 32768 cannot overflow the int16 conversion below.
+        speech_audio_data_np = rng.uniform(-0.9, 0.9, size=480).astype(np.float32)
+        speech_audio_data_bytes = (
+            (speech_audio_data_np * 32768.0).astype(np.int16).tobytes()
+        )
+        assert not pipeline._is_silent(speech_audio_data_np)
+        assert not pipeline._is_silent(speech_audio_data_bytes)
+    @mock.patch("webrtcvad.Vad.is_speech")
+    def test_voice_activity_detection_vad_speech(self, mock_is_speech):
+        """
+        Test that the voice activity detection returns True when the VAD reports speech.
+        """
+        pipeline = VoicePipeline()
+        mock_is_speech.return_value = True
+        frame = b"\x00\x00" * (16000 * 30 // 1000)  # 30ms of silence at 16kHz, 16-bit
+        assert pipeline._voice_activity_detection(frame) is True
+        mock_is_speech.assert_called_once_with(frame, pipeline.input_sample_rate)
+    @mock.patch("webrtcvad.Vad.is_speech")
+    def test_voice_activity_detection_vad_silence(self, mock_is_speech):
+        """
+        Test that the voice activity detection returns False when the VAD reports no speech.
+        """
+        pipeline = VoicePipeline()
+        mock_is_speech.return_value = False
+        frame = b"\x00\x00" * (16000 * 30 // 1000)
+        assert pipeline._voice_activity_detection(frame) is False
+        mock_is_speech.assert_called_once_with(frame, pipeline.input_sample_rate)
+    @mock.patch("webrtcvad.Vad.is_speech")
+    def test_voice_activity_detection_vad_error_fallback_silent(self, mock_is_speech):
+        """
+        Test that a VAD ValueError falls back to the energy check and reports
+        a low-amplitude frame as not speech.
+        """
+        pipeline = VoicePipeline(silence_threshold=0.1)
+        mock_is_speech.side_effect = ValueError("VAD error")
+        frame_np = np.full(480, 0.001, dtype=np.float32)
+        frame_bytes = (frame_np * 32768.0).astype(np.int16).tobytes()
+        assert pipeline._voice_activity_detection(frame_bytes) is False
+        mock_is_speech.assert_called_once_with(frame_bytes, pipeline.input_sample_rate)
+    @mock.patch("webrtcvad.Vad.is_speech")
+    def test_voice_activity_detection_vad_error_fallback_speech(self, mock_is_speech):
+        """
+        Test that a VAD ValueError falls back to the energy check and reports
+        a high-amplitude frame as speech.
+        """
+        pipeline = VoicePipeline(silence_threshold=0.01)
+        mock_is_speech.side_effect = ValueError("VAD error")
+        frame_np = np.full(480, 0.5, dtype=np.float32)
+        frame_bytes = (frame_np * 32768.0).astype(np.int16).tobytes()
+        assert pipeline._voice_activity_detection(frame_bytes) is True
+        mock_is_speech.assert_called_once_with(frame_bytes, pipeline.input_sample_rate)

nexaai/mlx_backend/mlx_audio/sts/voice_pipeline.py ADDED Viewed

@@ -0,0 +1,327 @@
+import argparse
+import asyncio
+import logging
+import mlx.core as mx
+import numpy as np
+import sounddevice as sd
+import webrtcvad
+from mlx_lm.generate import generate as generate_text
+from mlx_lm.utils import load as load_llm
+from mlx_audio.stt.models.whisper import Model as Whisper
+from mlx_audio.tts.audio_player import AudioPlayer
+from mlx_audio.tts.utils import load_model as load_tts
+# Configure the root logger when this module is imported: INFO level, with
+# timestamp, logger name and level in front of every message.
+logging.basicConfig(
+    level=logging.INFO, format="%(asctime)s - %(name)s - %(levelname)s - %(message)s"
+)
+# Module-level logger used by VoicePipeline.
+logger = logging.getLogger(__name__)
+class VoicePipeline:
+    """Speech-to-speech loop: microphone -> STT -> LLM -> TTS -> audio player.
+    `start()` loads the three models and then runs three asyncio tasks that
+    communicate through queues: `_listener` (capture + transcription),
+    `_response_processor` (text generation, then speech synthesis) and
+    `_audio_output_processor` (playback). All model calls run in worker
+    threads and are serialized through `mlx_lock`.
+    """
+    def __init__(
+        self,
+        silence_threshold=0.03,
+        silence_duration=1.5,
+        input_sample_rate=16_000,
+        output_sample_rate=24_000,
+        streaming_interval=3,
+        frame_duration_ms=30,
+        vad_mode=3,
+        stt_model="mlx-community/whisper-large-v3-turbo",
+        llm_model="Qwen/Qwen2.5-0.5B-Instruct-4bit",
+        tts_model="mlx-community/csm-1b-fp16",
+    ):
+        """Store the configuration and create the VAD, queues and lock.
+        Args:
+            silence_threshold: RMS level (samples normalized to [-1, 1]) below
+                which a frame counts as silent in the energy fallback.
+            silence_duration: Seconds of trailing non-speech that end an utterance.
+            input_sample_rate: Microphone sample rate in Hz.
+            output_sample_rate: Sample rate requested from the TTS model and player.
+            streaming_interval: Forwarded to the TTS model's `generate` call.
+            frame_duration_ms: Length of one captured audio frame.
+            vad_mode: Mode passed to `webrtcvad.Vad`.
+            stt_model: Identifier given to `Whisper.from_pretrained`.
+            llm_model: Identifier given to `load_llm`.
+            tts_model: Identifier given to `load_tts`.
+        """
+        # NOTE(review): main() defaults --llm_model to
+        # "mlx-community/Qwen2.5-0.5B-Instruct-4bit"; confirm which id is intended.
+        self.silence_threshold = silence_threshold
+        self.silence_duration = silence_duration
+        self.input_sample_rate = input_sample_rate
+        self.output_sample_rate = output_sample_rate
+        self.streaming_interval = streaming_interval
+        self.frame_duration_ms = frame_duration_ms
+        self.vad_mode = vad_mode
+        self.stt_model = stt_model
+        self.llm_model = llm_model
+        self.tts_model = tts_model
+        self.vad = webrtcvad.Vad(vad_mode)
+        self.input_audio_queue = asyncio.Queue(maxsize=50)
+        self.transcription_queue = asyncio.Queue()
+        self.output_audio_queue = asyncio.Queue(maxsize=50)
+        self.mlx_lock = asyncio.Lock()
+        # Runtime state filled in by start()/init_models()/the worker tasks.
+        # Declared here so the tasks can test for None instead of hasattr().
+        self.loop = None
+        self.player = None
+        self.llm = None
+        self.tokenizer = None
+        self.tts = None
+        self.stt = None
+        self.current_tts_task = None
+        self.current_tts_cancel = None
+    async def init_models(self):
+        """Load the LLM (with tokenizer), TTS and STT models in worker threads."""
+        logger.info("Loading text generation model: %s", self.llm_model)
+        self.llm, self.tokenizer = await asyncio.to_thread(load_llm, self.llm_model)
+        logger.info("Loading text-to-speech model: %s", self.tts_model)
+        self.tts = await asyncio.to_thread(load_tts, self.tts_model)
+        logger.info("Loading speech-to-text model: %s", self.stt_model)
+        # Loaded off the event loop like the other two models.
+        self.stt = await asyncio.to_thread(Whisper.from_pretrained, self.stt_model)
+    async def start(self):
+        """Load the models and run the three pipeline tasks until one of them stops."""
+        self.loop = asyncio.get_running_loop()
+        await self.init_models()
+        tasks = [
+            asyncio.create_task(self._listener()),
+            asyncio.create_task(self._response_processor()),
+            asyncio.create_task(self._audio_output_processor()),
+        ]
+        try:
+            await asyncio.gather(*tasks)
+        finally:
+            # Ask a running synthesis worker thread to stop producing audio.
+            if self.current_tts_cancel is not None:
+                self.current_tts_cancel.set()
+            for t in tasks:
+                t.cancel()
+            await asyncio.gather(*tasks, return_exceptions=True)
+    # speech detection and transcription
+    def _is_silent(self, audio_data):
+        """Return whether the RMS level of `audio_data` is below `silence_threshold`.
+        Bytes-like input is read as int16 PCM and scaled by 1/32768; anything
+        else is converted to a float32 array as-is. An empty frame is silent.
+        """
+        if isinstance(audio_data, (bytes, bytearray, memoryview)):
+            audio_np = np.frombuffer(audio_data, dtype=np.int16)
+            audio_np = audio_np.astype(np.float32) / 32768.0
+        else:
+            audio_np = np.asarray(audio_data, dtype=np.float32)
+        if audio_np.size == 0:
+            # 0/0 would give NaN, and NaN < threshold is False ("speech").
+            return np.True_
+        energy = np.linalg.norm(audio_np) / np.sqrt(audio_np.size)
+        return energy < self.silence_threshold
+    def _voice_activity_detection(self, frame):
+        """Return True if `frame` contains speech, using the VAD or the energy fallback."""
+        try:
+            return self.vad.is_speech(frame, self.input_sample_rate)
+        except ValueError:
+            # fall back to energy-based detection
+            return not self._is_silent(frame)
+    async def _listener(self):
+        """Capture microphone frames, detect utterances and send them to transcription.
+        An utterance starts at the first speech frame and ends once more than
+        `silence_duration` worth of consecutive non-speech frames has followed.
+        Every speech frame also interrupts playback (barge-in).
+        """
+        frame_size = int(self.input_sample_rate * (self.frame_duration_ms / 1000.0))
+        frames_until_silence = int(
+            self.silence_duration * 1000 / self.frame_duration_ms
+        )
+        stream = sd.InputStream(
+            samplerate=self.input_sample_rate,
+            blocksize=frame_size,
+            channels=1,
+            dtype="int16",
+            callback=self._sd_callback,
+        )
+        frames = []
+        silent_frames = 0
+        speaking_detected = False
+        try:
+            stream.start()
+            logger.info("Listening for voice input...")
+            while True:
+                frame = await self.input_audio_queue.get()
+                if self._voice_activity_detection(frame):
+                    speaking_detected = True
+                    silent_frames = 0
+                    frames.append(frame)
+                    # Signal the running TTS generator loop to stop
+                    if self.current_tts_task and self.current_tts_cancel:
+                        self.current_tts_cancel.set()
+                    # Drop audio already handed to the player (it may not exist
+                    # yet if the playback task has not started).
+                    if self.player is not None:
+                        self.loop.call_soon_threadsafe(self.player.flush)
+                elif speaking_detected:
+                    silent_frames += 1
+                    frames.append(frame)
+                    if silent_frames > frames_until_silence:
+                        # Process the voice input
+                        logger.info("Processing voice input...")
+                        await self._process_audio(frames)
+                        frames = []
+                        speaking_detected = False
+                        silent_frames = 0
+        finally:
+            # Single cleanup path for cancellation, Ctrl+C and errors alike.
+            stream.stop()
+            stream.close()
+    def _sd_callback(self, indata, frames, _time, status):
+        """Input-stream callback: hand the captured frame to the event loop as bytes."""
+        data = indata.reshape(-1).tobytes()
+        def _enqueue():
+            # Runs on the event loop; a full queue means the listener is
+            # behind, so the frame is dropped.
+            try:
+                self.input_audio_queue.put_nowait(data)
+            except asyncio.QueueFull:
+                return
+        self.loop.call_soon_threadsafe(_enqueue)
+    async def _process_audio(self, frames):
+        """Transcribe the int16 PCM `frames` and queue any non-empty text."""
+        audio = (
+            np.frombuffer(b"".join(frames), dtype=np.int16).astype(np.float32) / 32768.0
+        )
+        try:
+            async with self.mlx_lock:
+                result = await asyncio.to_thread(self.stt.generate, mx.array(audio))
+        except Exception:
+            # Same policy as generation and synthesis: log and keep listening.
+            logger.exception("Transcription error")
+            return
+        text = result.text.strip()
+        if text:
+            logger.info("Transcribed: %s", text)
+            await self.transcription_queue.put(text)
+    # response generation
+    async def _response_processor(self):
+        """Answer transcriptions one at a time, in arrival order."""
+        while True:
+            text = await self.transcription_queue.get()
+            try:
+                await self._generate_response(text)
+            finally:
+                # Keep the queue's unfinished-task count right even on cancellation.
+                self.transcription_queue.task_done()
+    async def _generate_response(self, text):
+        """Generate a reply to `text` with the LLM and start speaking it.
+        Errors are logged, not raised, so one failed turn does not stop the pipeline.
+        """
+        def _get_llm_response(llm, tokenizer, messages, *, verbose=False):
+            prompt = tokenizer.apply_chat_template(
+                messages, tokenize=False, add_generation_prompt=True
+            )
+            return generate_text(llm, tokenizer, prompt, verbose=verbose).strip()
+        try:
+            logger.info("Generating response...")
+            messages = [
+                {
+                    "role": "system",
+                    "content": "You are a helpful voice assistant. You always respond with short sentences and never use punctuation like parentheses or colons that wouldn't appear in conversational speech.",
+                },
+                {"role": "user", "content": text},
+            ]
+            async with self.mlx_lock:
+                response_text = await asyncio.to_thread(
+                    _get_llm_response, self.llm, self.tokenizer, messages, verbose=False
+                )
+            logger.info("Generated response: %s", response_text)
+            if response_text:
+                # Synthesis runs as its own task so the listener can interrupt
+                # it through the event.
+                self.current_tts_cancel = asyncio.Event()
+                self.current_tts_task = asyncio.create_task(
+                    self._speak_response(response_text, self.current_tts_cancel)
+                )
+        except Exception as e:
+            logger.exception("Generation error: %s", e)
+    # speech generation
+    async def _speak_response(self, text: str, cancel_event: asyncio.Event):
+        """
+        Speak `text`, yielding PCM chunks into `self.output_audio_queue`.
+        Playback can be interrupted at any moment by setting `cancel_event`.
+        """
+        loop = self.loop
+        out_queue = self.output_audio_queue
+        def _enqueue(audio):
+            # Runs on the event loop. Chunks arriving after an interruption
+            # are discarded; a full queue drops the chunk instead of raising
+            # inside the loop callback.
+            if cancel_event.is_set():
+                return
+            try:
+                out_queue.put_nowait(audio)
+            except asyncio.QueueFull:
+                logger.warning("Output audio queue full; dropping TTS chunk")
+        def _tts_stream(tts, txt, rate, cancel_ev: asyncio.Event):
+            # This runs in a worker thread and only reads the flag via is_set().
+            for chunk in tts.generate(
+                txt,
+                sample_rate=rate,
+                stream=True,
+                streaming_interval=self.streaming_interval,
+                verbose=False,
+            ):
+                if cancel_ev.is_set():  # <-- stop immediately
+                    break
+                loop.call_soon_threadsafe(_enqueue, chunk.audio)
+        try:
+            async with self.mlx_lock:
+                await asyncio.to_thread(
+                    _tts_stream,
+                    self.tts,
+                    text,
+                    self.output_sample_rate,
+                    cancel_event,
+                )
+        except asyncio.CancelledError:
+            # Cancelling the coroutine does not stop the worker thread; set
+            # the flag so it stops at its next chunk, then propagate.
+            cancel_event.set()
+            raise
+        except Exception as exc:
+            logger.error("Speech synthesis error: %s", exc)
+    async def _audio_output_processor(self):
+        """Create the audio player and feed it every chunk from the output queue."""
+        self.player = AudioPlayer(sample_rate=self.output_sample_rate)
+        try:
+            while True:
+                audio = await self.output_audio_queue.get()
+                self.player.queue_audio(audio)
+                self.output_audio_queue.task_done()
+        finally:
+            # The loop only ends through an exception; always stop the player.
+            self.player.stop()
+async def main():
+    """Parse the command-line options, build a VoicePipeline and run it."""
+    # One row per option: flag, value type, default, help text.
+    cli_options = (
+        ("--stt_model", str, "mlx-community/whisper-large-v3-turbo", "STT model"),
+        ("--tts_model", str, "mlx-community/csm-1b-fp16", "TTS model"),
+        ("--llm_model", str, "mlx-community/Qwen2.5-0.5B-Instruct-4bit", "LLM model"),
+        ("--vad_mode", int, 3, "VAD mode"),
+        ("--silence_duration", float, 1.5, "Silence duration"),
+        ("--silence_threshold", float, 0.03, "Silence threshold"),
+        ("--streaming_interval", int, 3, "Streaming interval"),
+    )
+    parser = argparse.ArgumentParser(description="Voice Pipeline")
+    for flag, value_type, fallback, description in cli_options:
+        parser.add_argument(flag, type=value_type, default=fallback, help=description)
+    options = parser.parse_args()
+    # Every parsed option name matches a VoicePipeline keyword argument.
+    pipeline = VoicePipeline(**vars(options))
+    await pipeline.start()
+# Script entry point: run the pipeline when the module is executed directly.
+if __name__ == "__main__":
+    try:
+        asyncio.run(main())
+    except KeyboardInterrupt:
+        # Ctrl+C is the expected way to stop; exit without a traceback.
+        pass

nexaai/mlx_backend/mlx_audio/stt/__init__.py ADDED Viewed

File without changes

nexaai/mlx_backend/mlx_audio/stt/generate.py ADDED Viewed

@@ -0,0 +1,174 @@
+import argparse
+import json
+import os
+import time
+from pathlib import Path
+from typing import Optional
+import mlx.core as mx
+from mlx_audio.stt.utils import load_model
+def parse_args():
+    """Parse the transcription command-line options and return the namespace."""
+    cli = argparse.ArgumentParser(
+        description="Generate transcriptions from audio files"
+    )
+    # The three required string options differ only in flag and help text.
+    required_paths = (
+        ("--model", "Path to the model"),
+        ("--audio", "Path to the audio file"),
+        ("--output", "Path to save the output"),
+    )
+    for flag, description in required_paths:
+        cli.add_argument(flag, type=str, required=True, help=description)
+    supported_formats = ["txt", "srt", "vtt", "json"]
+    cli.add_argument(
+        "--format",
+        type=str,
+        default="txt",
+        choices=supported_formats,
+        help="Output format (txt, srt, vtt, or json)",
+    )
+    cli.add_argument("--verbose", action="store_true", help="Verbose output")
+    return cli.parse_args()
+def format_timestamp(seconds: float) -> str:
+    """Convert seconds to an SRT timestamp in HH:MM:SS,mmm format.
+    The value is rounded to whole milliseconds first and then split, so a
+    value such as 59.9996 carries into the minutes ("00:01:00,000") instead
+    of producing a seconds field of 60. Negative values are clamped to zero.
+    """
+    total_ms = max(0, int(round(float(seconds) * 1000)))
+    hours, remainder_ms = divmod(total_ms, 3_600_000)
+    minutes, remainder_ms = divmod(remainder_ms, 60_000)
+    whole_seconds, millis = divmod(remainder_ms, 1000)
+    return f"{hours:02d}:{minutes:02d}:{whole_seconds:02d},{millis:03d}"
+def format_vtt_timestamp(seconds: float) -> str:
+    """Return `seconds` as a WebVTT timestamp in HH:MM:SS.mmm format."""
+    # Same layout as the SRT timestamp, with "." in place of every ",".
+    srt_style = format_timestamp(seconds)
+    return srt_style.replace(",", ".")
+def save_as_txt(segments, output_path: str):
+    """Write `segments.text` to "<output_path>.txt" as UTF-8."""
+    destination = Path(f"{output_path}.txt")
+    destination.write_text(segments.text, encoding="utf-8")
+def save_as_srt(segments, output_path: str):
+    """Write the transcription to "<output_path>.srt" as numbered SRT cues.
+    Uses `segments.sentences` (objects with `start`, `end`, `text`) when the
+    result has it, otherwise `segments.segments` (mappings with the same
+    keys). This is the same fallback `save_as_vtt` and `save_as_json` apply.
+    """
+    # Pick the cue source before opening the file, so an unsupported result
+    # raises without leaving an empty .srt file behind.
+    if hasattr(segments, "sentences"):
+        cues = [(s.start, s.end, s.text) for s in segments.sentences]
+    else:
+        cues = [(s["start"], s["end"], s["text"]) for s in segments.segments]
+    with open(f"{output_path}.srt", "w", encoding="utf-8") as f:
+        for i, (start, end, text) in enumerate(cues, 1):
+            f.write(f"{i}\n")
+            f.write(f"{format_timestamp(start)} --> {format_timestamp(end)}\n")
+            f.write(f"{text}\n\n")
+def save_as_vtt(segments, output_path: str):
+    """Write the transcription to "<output_path>.vtt" as numbered WebVTT cues.
+    Cues come from `segments.sentences` (attribute access) when present,
+    otherwise from `segments.segments` (key access).
+    """
+    with open(f"{output_path}.vtt", "w", encoding="utf-8") as vtt_file:
+        vtt_file.write("WEBVTT\n\n")
+        # Choose the cue list together with the way one field is read from a cue.
+        if hasattr(segments, "sentences"):
+            cues, field = segments.sentences, getattr
+        else:
+            cues, field = segments.segments, (lambda cue, key: cue[key])
+        for number, cue in enumerate(cues, start=1):
+            vtt_file.write(f"{number}\n")
+            begin = format_vtt_timestamp(field(cue, "start"))
+            finish = format_vtt_timestamp(field(cue, "end"))
+            vtt_file.write(f"{begin} --> {finish}\n")
+            caption = field(cue, "text")
+            vtt_file.write(f"{caption}\n\n")
+def save_as_json(segments, output_path: str):
+    """Write the transcription to "<output_path>.json" (UTF-8, 2-space indent).
+    With `segments.sentences` the document holds "text" and "sentences", each
+    sentence carrying its "tokens"; otherwise it holds "text" and "segments",
+    where each duration is computed as end minus start.
+    """
+    def _span(text, start, end, duration):
+        # Common shape of every timed entry; key order matches the output.
+        return {"text": text, "start": start, "end": end, "duration": duration}
+    payload = {"text": segments.text}
+    if hasattr(segments, "sentences"):
+        sentence_entries = []
+        for sentence in segments.sentences:
+            entry = _span(
+                sentence.text, sentence.start, sentence.end, sentence.duration
+            )
+            entry["tokens"] = [
+                _span(tok.text, tok.start, tok.end, tok.duration)
+                for tok in sentence.tokens
+            ]
+            sentence_entries.append(entry)
+        payload["sentences"] = sentence_entries
+    else:
+        payload["segments"] = [
+            _span(seg["text"], seg["start"], seg["end"], seg["end"] - seg["start"])
+            for seg in segments.segments
+        ]
+    with open(f"{output_path}.json", "w", encoding="utf-8") as json_file:
+        json.dump(payload, json_file, ensure_ascii=False, indent=2)
+def generate(
+    model_path: str,
+    audio_path: str,
+    output_path: str,
+    format: str = "txt",
+    verbose: bool = True,
+):
+    """Transcribe `audio_path` with the model at `model_path` and save the result.
+    Prints the run parameters, processing time and peak memory, and with
+    `verbose` also the text and its segments. The output file is
+    "<output_path>.<format>" for txt, srt, vtt and json; any other `format`
+    writes nothing. Returns the object produced by `model.generate`.
+    """
+    model = load_model(model_path)
+    print(f"\n\033[94mModel:\033[0m {model_path}")
+    print(f"\033[94mAudio path:\033[0m {audio_path}")
+    print(f"\033[94mOutput path:\033[0m {output_path}")
+    print(f"\033[94mFormat:\033[0m {format}")
+    mx.reset_peak_memory()
+    started_at = time.time()
+    transcription = model.generate(audio_path)
+    elapsed = time.time() - started_at
+    if verbose:
+        print("\n\033[94mTranscription:\033[0m")
+        print(transcription.text)
+        print("\n\033[94mSegments:\033[0m")
+        # Show the first detail attribute the result offers, else the result itself.
+        for detail in ("segments", "tokens"):
+            if hasattr(transcription, detail):
+                print(getattr(transcription, detail))
+                break
+        else:
+            print(transcription)
+    print(f"\033[94mProcessing time:\033[0m {elapsed:.2f} seconds")
+    print(f"\033[94mPeak memory:\033[0m {mx.get_peak_memory() / 1e9:.2f} GB")
+    # Create output directory if it doesn't exist
+    target_dir = os.path.dirname(os.path.abspath(output_path))
+    os.makedirs(target_dir, exist_ok=True)
+    writers = {
+        "txt": save_as_txt,
+        "srt": save_as_srt,
+        "vtt": save_as_vtt,
+        "json": save_as_json,
+    }
+    writer = writers.get(format)
+    if writer is not None:
+        writer(transcription, output_path)
+    return transcription
+# Script entry point: parse the CLI options and run one transcription.
+if __name__ == "__main__":
+    args = parse_args()
+    generate(args.model, args.audio, args.output, args.format, args.verbose)

nexaai/mlx_backend/mlx_audio/stt/models/__init__.py ADDED Viewed

File without changes

nexaai/mlx_backend/mlx_audio/stt/models/parakeet/__init__.py ADDED Viewed

	@@ -0,0 +1 @@
1	+ from .parakeet import Model