npm - voice-mcp-server - Versions diffs - 0.3.0 → 0.3.1 - Mend

voice-mcp-server 0.3.0 → 0.3.1

This diff compares the contents of two publicly available versions of the package, as released to one of the supported registries. It is provided for informational purposes only and reflects the changes between those versions exactly as they appear in the public registry.

Files changed (4)

package/package.json +1 -1
package/src/adapters_real/elevenlabs_speaker.py +114 -64
package/src/adapters_real/live_mic.py +3 -0
package/src/daemon/audio_server.py +24 -16

package/package.json CHANGED Viewed

@@ -1,6 +1,6 @@
 {
   "name": "voice-mcp-server",
-  "version": "0.3.0",
+  "version": "0.3.1",
   "description": "An MCP server to allow LLMs to speak and listen via bidirectional voice loops",
   "main": "build/index.js",
   "type": "module",

package/src/adapters_real/elevenlabs_speaker.py CHANGED Viewed

@@ -2,9 +2,11 @@ import os
 import time
 import subprocess
 import httpx
+import threading
 from simulation.ports import ISpeaker
 from simulation.models import VirtualAudioFrame
 from dotenv import load_dotenv
+from logger import logger
 load_dotenv()
@@ -19,62 +21,95 @@ class ElevenLabsSpeaker(ISpeaker):
         self.voice_id = os.getenv("ELEVENLABS_VOICE_ID", voice_id)
         self.api_key = os.getenv("ELEVENLABS_API_KEY")
         self.temp_file = "/tmp/elevenlabs_output.mp3"
+        self._lock = threading.RLock()
+        self._is_preparing = False
+        self._stop_event = threading.Event()
+        self._thread = None
+        logger.info(f"ElevenLabs Speaker initialized. Voice ID: {self.voice_id}, API Key Present: {bool(self.api_key)}")
     def speak(self, text: str):
         if not text.strip():
             return
-        self.current_text = text
-        self.words = text.split()
+        # Cancel any current playback or preparation
+        self.flush()
+        with self._lock:
+            self.current_text = text
+            self.words = text.split()
+            self._is_preparing = True
+            self._stop_event.clear()
+            self.start_time = 0 # Won't start until afplay starts
+        self._thread = threading.Thread(target=self._generate_and_play, args=(text,), daemon=True)
+        self._thread.start()
-        if self.api_key:
-            url = f"https://api.elevenlabs.io/v1/text-to-speech/{self.voice_id}"
-            headers = {
-                "Accept": "audio/mpeg",
-                "Content-Type": "application/json",
-                "xi-api-key": self.api_key
-            }
-            data = {
-                "text": text,
-                "model_id": "eleven_multilingual_v2",
-                "voice_settings": {
-                    "stability": 0.5,
-                    "similarity_boost": 0.5
+    def _generate_and_play(self, text: str):
+        try:
+            if self.api_key:
+                url = f"https://api.elevenlabs.io/v1/text-to-speech/{self.voice_id}"
+                headers = {
+                    "Accept": "audio/mpeg",
+                    "Content-Type": "application/json",
+                    "xi-api-key": self.api_key
                 }
-            }
-            with httpx.Client() as client:
-                try:
+                data = {
+                    "text": text,
+                    "model_id": "eleven_multilingual_v2",
+                    "voice_settings": {
+                        "stability": 0.5,
+                        "similarity_boost": 0.5
+                    }
+                }
+                with httpx.Client() as client:
+                    logger.debug(f"Calling ElevenLabs API for Voice ID: {self.voice_id}")
                     response = client.post(url, json=data, headers=headers, timeout=10.0)
                     response.raise_for_status()
+                    if self._stop_event.is_set():
+                        return
                     with open(self.temp_file, "wb") as f:
                         f.write(response.content)
-                    # Play the downloaded audio
-                    self.start_time = time.time()
-                    self.process = subprocess.Popen(
-                        ["afplay", self.temp_file],
-                        stdout=subprocess.DEVNULL,
-                        stderr=subprocess.DEVNULL
-                    )
-                except Exception as e:
-                    print(f"ElevenLabs API Error: {e}")
-                    # Fallback to macOS say
+                    if self._stop_event.is_set():
+                        return
+                    with self._lock:
+                        # Play the downloaded audio
+                        self.start_time = time.time()
+                        self.process = subprocess.Popen(
+                            ["afplay", self.temp_file],
+                            stdout=subprocess.DEVNULL,
+                            stderr=subprocess.DEVNULL
+                        )
+            else:
+                logger.warning("No ELEVENLABS_API_KEY found, falling back to 'say'")
+                if self._stop_event.is_set(): return
+                with self._lock:
                     self.start_time = time.time()
                     self.process = subprocess.Popen(
                         ["say", text],
                         stdout=subprocess.DEVNULL,
                         stderr=subprocess.DEVNULL
                     )
-        else:
-            print("Warning: No ELEVENLABS_API_KEY found, falling back to 'say'")
-            self.start_time = time.time()
-            self.process = subprocess.Popen(
-                ["say", text],
-                stdout=subprocess.DEVNULL,
-                stderr=subprocess.DEVNULL
-            )
+        except Exception as e:
+            logger.error(f"ElevenLabs Error: {e}")
+            if self._stop_event.is_set(): return
+            # Fallback to macOS say
+            with self._lock:
+                self.start_time = time.time()
+                self.process = subprocess.Popen(
+                    ["say", text],
+                    stdout=subprocess.DEVNULL,
+                    stderr=subprocess.DEVNULL
+                )
+        finally:
+            with self._lock:
+                self._is_preparing = False
     def play_frame(self, frame: VirtualAudioFrame):
         pass
@@ -83,35 +118,50 @@ class ElevenLabsSpeaker(ISpeaker):
         pass
     def is_speaking(self) -> bool:
-        if self.process is None:
-            return False
-        is_running = self.process.poll() is None
-        if not is_running:
-            self.current_text = ""
-            self.words = []
-            self.process = None
-        return is_running
+        with self._lock:
+            if self._is_preparing:
+                return True
+            if self.process is None:
+                return False
+            is_running = self.process.poll() is None
+            if not is_running:
+                self.current_text = ""
+                self.words = []
+                self.process = None
+            return is_running
     def has_started_audio(self) -> bool:
-        return self.is_speaking()
+        with self._lock:
+            if self.process is None:
+                return False
+            return self.process.poll() is None
     def flush(self) -> str:
-        if not self.is_speaking():
-            return ""
-        # Immediately kill the playback process
-        self.process.kill()
-        # Explicitly wait for the process to terminate and reap it
-        self.process.wait()
-        elapsed_ms = (time.time() - self.start_time) * 1000
-        words_spoken = int(elapsed_ms * self.words_per_ms)
-        spoken = " ".join(self.words[:words_spoken])
-        self.current_text = ""
-        self.words = []
-        self.process = None
+        # Signal the thread to stop if it's still downloading
+        self._stop_event.set()
-        return spoken
+        with self._lock:
+            if not self.is_speaking():
+                self._is_preparing = False
+                return ""
+            # Immediately kill the playback process if it exists
+            if self.process:
+                self.process.kill()
+                self.process.wait()
+            # If we were preparing but hadn't started afplay yet, we spoken 0 words
+            if self.start_time == 0:
+                words_spoken = 0
+            else:
+                elapsed_ms = (time.time() - self.start_time) * 1000
+                words_spoken = int(elapsed_ms * self.words_per_ms)
+            spoken = " ".join(self.words[:words_spoken])
+            self.current_text = ""
+            self.words = []
+            self.process = None
+            self._is_preparing = False
+            return spoken

package/src/adapters_real/live_mic.py CHANGED Viewed

@@ -64,12 +64,15 @@ class LiveMicrophone(IMicrophone):
             raw_bytes = self.q.get(timeout=0.1) # Block briefly to act as clock
             # If we didn't get 320 bytes, that's weird but we handle it
             if len(raw_bytes) < self.chunk * 2:
+                logger.warning(f"LiveMicrophone read_frame got only {len(raw_bytes)} bytes instead of {self.chunk * 2}")
                 return VirtualAudioFrame(10, False, False, "", b"")
             return VirtualAudioFrame(10, False, False, "", raw_bytes)
         except queue.Empty:
+            logger.error("LiveMicrophone queue is EMPTY on read! (PyAudio might have crashed or stopped feeding data)")
             # If queue is empty, yield silence frame
             return VirtualAudioFrame(10, False, False, "", b"")
     def close(self):
         self.stop_stream()
         self.p.terminate()

package/src/daemon/audio_server.py CHANGED Viewed

@@ -80,7 +80,7 @@ def pre_download_models():
 def run_audio_daemon():
     """Runs the CoreEngine in a persistent background thread."""
-    global engine, mic, speaker, last_active_timestamp, daemon_status, daemon_status_message, daemon_progress
+    global engine, mic, speaker, vad, stt, last_active_timestamp, daemon_status, daemon_status_message, daemon_progress
     # Pre-download models so the daemon status reflects exactly what is happening
     pre_download_models()
@@ -118,8 +118,13 @@ def run_audio_daemon():
     try:
         while True:
+            current_engine = engine
+            if current_engine is None or mic is None:
+                time.sleep(0.1)
+                continue
             # If dormant, check for commands from FastAPI
-            if engine.state == State.EXECUTING:
+            if current_engine.state == State.EXECUTING:
                 try:
                     cmd = mcp_command_queue.get(timeout=0.1) # Blocks briefly
@@ -127,15 +132,15 @@ def run_audio_daemon():
                     mic.start_stream()
                     if hasattr(vad, "set_active"):
                         vad.set_active(True)
-                    engine.start_conversation(cmd.get("text", ""), standby_mode=cmd.get("standby_mode", False))
-                    engine.expect_reply = cmd.get("expect_reply", True)
+                    current_engine.start_conversation(cmd.get("text", ""), standby_mode=cmd.get("standby_mode", False))
+                    current_engine.expect_reply = cmd.get("expect_reply", True)
                 except queue.Empty:
                     pass
             else:
-                engine.tick()
+                current_engine.tick()
                 # Once we drop back to EXECUTING, we finished the conversation loop
-                if engine.state == State.EXECUTING:
+                if current_engine.state == State.EXECUTING:
                     mic.stop_stream()
                     if hasattr(vad, "set_active"):
                         vad.set_active(False)
@@ -231,18 +236,16 @@ async def reload_config():
         # 1. Stop the current engine
         if engine:
             engine.state = State.EXECUTING
-        if mic:
-            mic.close()
+        # Do NOT close the mic during hot-swap to prevent macOS CoreAudio permission drop!
+        # if mic:
+        #     mic.close()
         # 1b. CRITICAL: Explicitly obliterate old models from VRAM to prevent Out-Of-Memory (OOM) crashes on hot-swaps
         import gc
-        try:
-            del speaker
-            del vad
-            del stt
-            del engine
-        except NameError:
-            pass
+        speaker = None
+        vad = None
+        stt = None
+        engine = None
         gc.collect()
@@ -265,7 +268,12 @@ async def reload_config():
                 cfg = compose(config_name="config")
             # 3. Instantiate the new models on the fly
-            mic = instantiate(cfg.microphone)
+            if mic is None:
+                logger.info("Microphone was None during hot-swap, instantiating a new one.")
+                mic = instantiate(cfg.microphone)
+            else:
+                logger.info("Preserving existing Microphone instance during hot-swap.")
             speaker = instantiate(cfg.speaker)
             vad = instantiate(cfg.vad)
             stt = instantiate(cfg.stt)