npm - voicesmith-mcp - Versions diffs - 1.0.16 → 1.0.17 - Mend

voicesmith-mcp 1.0.16 → 1.0.17

This diff shows the changes between two publicly available versions of this package, as released to one of the supported registries. It is provided for informational purposes only and reflects the package contents exactly as they appear in their respective public registries.

Files changed (7) hide show

package/package.json CHANGED Viewed

@@ -1,6 +1,6 @@
 {
   "name": "voicesmith-mcp",
-  "version": "1.0.16",
+  "version": "1.0.17",
   "description": "Local AI voice for coding assistants — TTS & STT via MCP. Kokoro ONNX + faster-whisper, fully offline.",
   "bin": {
     "voicesmith-mcp": "bin/cli.js"

package/server.py CHANGED Viewed

@@ -42,6 +42,19 @@ from shared import (
 )
 from config import load_config, save_config, get_config_path, AppConfig
 from session_registry import register_session, rename_session, unregister_session
+from tts.media_duck import duck, unduck, is_bluetooth_output
+async def _deferred_unduck(paused_apps: list[str], delay: float = 0.3) -> None:
+    """Unduck after a brief delay so the MCP response reaches the client first.
+    On Bluetooth output, extends the delay to 3s to allow for the HFP → A2DP
+    codec switch that macOS performs when the microphone session ends.
+    """
+    if is_bluetooth_output():
+        delay = max(delay, 3.0)
+    await asyncio.sleep(delay)
+    unduck(paused_apps)
 logger = get_logger("server")
@@ -63,6 +76,7 @@ _config: AppConfig = None
 _muted = False
 _listen_cancel_event: asyncio.Event = None
 _listen_active = False
+_suppress_duck = False  # Set by speak_then_listen to prevent inner duck/unduck gaps
 _startup_time = time.time()
 _last_tool_call = time.time()  # Updated on every MCP tool call
 _session_info: dict = None
@@ -82,8 +96,8 @@ def _init_tts(config: AppConfig):
     try:
         _tts_engine = KokoroEngine(config.tts.model_path, config.tts.voices_path)
-        _audio_player = AudioPlayer(config.tts.audio_player, duck_media=config.tts.duck_media)
-        _speech_queue = SpeechQueue(_tts_engine, _audio_player)
+        _audio_player = AudioPlayer(config.tts.audio_player)
+        _speech_queue = SpeechQueue(_tts_engine, _audio_player, duck_media=config.tts.duck_media)
         logger.info("TTS subsystem initialized")
     except TTSEngineError as e:
         logger.error(f"TTS initialization failed: {e}")
@@ -453,25 +467,29 @@ async def listen(timeout: float = 15, prompt: str = "", silence_threshold: float
     if prompt:
         logger.info(f"Listening (prompt: {prompt})")
+    # Duck media while recording so the mic doesn't pick up playback
+    # Skip if speak_then_listen already holds the duck
+    paused_apps = duck() if (_config and _config.tts.duck_media and not _suppress_duck) else []
     try:
         loop = asyncio.get_running_loop()
-        # Play ready sound so the user knows to start speaking
-        # Skip for push-to-talk (HTTP) — it has its own beep
-        if prompt != "push-to-talk":
-            await loop.run_in_executor(None, _play_ready_sound)
         start = time.perf_counter()
         # Reset VAD state from any prior recording (LSTM hidden state + context)
         _vad.reset()
+        # Play the ready sound AFTER the mic is live (via on_ready callback)
+        # so the user doesn't start speaking into a dead mic.
+        ready_cb = _play_ready_sound if prompt != "push-to-talk" else None
         # Record audio with VAD
         audio = await _mic_capture.record(
             vad=_vad,
             timeout=timeout,
             silence_threshold=silence_threshold,
             cancel_event=_listen_cancel_event,
+            on_ready=ready_cb,
         )
         if _listen_cancel_event.is_set():
@@ -500,6 +518,8 @@ async def listen(timeout: float = 15, prompt: str = "", silence_threshold: float
         logger.error(f"listen failed: {e}")
         return {"success": False, "error": "listen_failed", "message": str(e)}
     finally:
+        if paused_apps:
+            asyncio.create_task(_deferred_unduck(paused_apps))
         _listen_active = False
         _listen_cancel_event = None
         # Reclaim mic for wake listener
@@ -524,19 +544,34 @@ async def speak_then_listen(
         timeout: Max seconds to wait for response (default 15).
         silence_threshold: Seconds of silence before stopping (default 1.5).
     """
-    speak_result = await speak(name, text, speed, block=True)
+    global _suppress_duck
+    # Duck once for the entire speak+listen operation to avoid a
+    # brief unduck gap between speak finishing and listen starting.
+    should_duck = _config and _config.tts.duck_media
+    paused_apps = duck() if should_duck else []
-    if not speak_result.get("success"):
-        return {"speak": speak_result, "listen": {"success": False, "error": "skipped"}}
+    # Suppress inner ducking in SpeechQueue and listen()
+    saved_queue_duck = _speech_queue._duck_media if _speech_queue else False
+    if _speech_queue and should_duck:
+        _speech_queue._duck_media = False
+    _suppress_duck = True
-    listen_result = await listen(timeout=timeout, silence_threshold=silence_threshold)
+    try:
+        speak_result = await speak(name, text, speed, block=True)
-    # If listen timed out, speak a nudge and fall back to text
-    if listen_result.get("error") == "timeout":
-        nudge_result = await speak(name, "I didn't catch that. Go ahead and type it.", speed, block=True)
-        listen_result["nudge_spoken"] = nudge_result.get("success", False)
+        if not speak_result.get("success"):
+            return {"speak": speak_result, "listen": {"success": False, "error": "skipped"}}
-    return {"speak": speak_result, "listen": listen_result}
+        listen_result = await listen(timeout=timeout, silence_threshold=silence_threshold)
+        return {"speak": speak_result, "listen": listen_result}
+    finally:
+        _suppress_duck = False
+        if _speech_queue:
+            _speech_queue._duck_media = saved_queue_duck
+        if paused_apps:
+            asyncio.create_task(_deferred_unduck(paused_apps))
 @mcp.tool()

package/stt/mic_capture.py CHANGED Viewed

@@ -8,18 +8,20 @@ import socket
 import subprocess
 import threading
 import time
-from typing import Optional
+from typing import Callable, Optional
 import numpy as np
 from shared import MicCaptureError, STT_SAMPLE_RATE, get_logger
 from stt.vad import VoiceActivityDetector
+from tts.media_duck import is_bluetooth_output
 logger = get_logger("stt.mic")
 _CHUNK_SAMPLES = 512        # Silero VAD requires exactly 512-sample chunks at 16kHz
 _CHUNK_BYTES   = _CHUNK_SAMPLES * 4   # float32 = 4 bytes/sample → 2048 bytes/chunk
-_ZERO_CHECK_CHUNKS = 10    # ~320ms of silence before detecting TCC denial
+_ZERO_CHECK_CHUNKS = 25    # ~800ms — exceeds CoreAudio cold-start latency (~544ms)
+_ZERO_CHECK_CHUNKS_BT = 75 # ~2.4s — Bluetooth A2DP→HFP codec switch can take 1-2s
 _AUDIO_SERVICE_SOCKET  = "/tmp/voicesmith-audio.sock"
 _LAUNCHAGENT_LABEL     = "com.voicesmith-mcp.audio"
@@ -92,6 +94,7 @@ class MicCapture:
         timeout: float = 15,
         silence_threshold: float = 1.5,
         cancel_event: Optional[asyncio.Event] = None,
+        on_ready: Optional[Callable[[], None]] = None,
     ) -> Optional[np.ndarray]:
         """Record audio from the microphone until silence is detected.
@@ -106,6 +109,9 @@ class MicCapture:
             timeout: Maximum seconds to wait for speech (default 15).
             silence_threshold: Seconds of silence before stopping (default 1.5).
             cancel_event: Optional asyncio.Event to cancel recording.
+            on_ready: Optional callback invoked once the mic is live and
+                      ready to capture.  Called after hardware warm-up /
+                      flush but before the VAD loop starts.
         Returns:
             Numpy array of recorded audio, or None if cancelled/timeout.
@@ -122,17 +128,17 @@ class MicCapture:
         if platform.system() == "Darwin":
             if _launchagent_available():
                 return await self._record_via_socket(
-                    vad, timeout, silence_threshold, cancel_event
+                    vad, timeout, silence_threshold, cancel_event, on_ready
                 )
             # Legacy: subprocess fallback for installs without the LaunchAgent.
             audio_capture_bin = _find_app_binary("audio-service") or _find_app_binary("audio-capture")
             if audio_capture_bin:
                 return await self._record_via_subprocess(
-                    audio_capture_bin, vad, timeout, silence_threshold, cancel_event
+                    audio_capture_bin, vad, timeout, silence_threshold, cancel_event, on_ready
                 )
         return await self._record_via_sounddevice(
-            vad, timeout, silence_threshold, cancel_event
+            vad, timeout, silence_threshold, cancel_event, on_ready
         )
     # ── LaunchAgent socket backend (macOS primary) ─────────────────────────────
@@ -143,6 +149,7 @@ class MicCapture:
         timeout: float,
         silence_threshold: float,
         cancel_event: Optional[asyncio.Event],
+        on_ready: Optional[Callable[[], None]] = None,
     ) -> Optional[np.ndarray]:
         """Record via the VoiceSmithMCP audio LaunchAgent (Unix socket).
@@ -190,7 +197,10 @@ class MicCapture:
         logger.info("Microphone recording started (audio-service socket)")
         try:
-            self._flush_queue(int(0.2 * self._sample_rate / _CHUNK_SAMPLES))
+            # Flush 2 chunks (~64ms) for AudioQueue hardware settle.
+            self._flush_queue(2)
+            if on_ready:
+                on_ready()
             return await self._run_vad_loop(vad, timeout, silence_threshold, cancel_event)
         finally:
             sock.close()  # signals service to stop sending for this session
@@ -206,6 +216,7 @@ class MicCapture:
         timeout: float,
         silence_threshold: float,
         cancel_event: Optional[asyncio.Event],
+        on_ready: Optional[Callable[[], None]] = None,
     ) -> Optional[np.ndarray]:
         """Record using a CoreAudio binary inside VoiceSmithMCP.app (legacy)."""
         self._recording = True
@@ -239,7 +250,9 @@ class MicCapture:
         reader_thread.start()
         try:
-            self._flush_queue(int(0.2 * self._sample_rate / _CHUNK_SAMPLES))
+            self._flush_queue(2)
+            if on_ready:
+                on_ready()
             return await self._run_vad_loop(vad, timeout, silence_threshold, cancel_event)
         finally:
             proc.terminate()
@@ -258,6 +271,7 @@ class MicCapture:
         timeout: float,
         silence_threshold: float,
         cancel_event: Optional[asyncio.Event],
+        on_ready: Optional[Callable[[], None]] = None,
     ) -> Optional[np.ndarray]:
         """Record using sounddevice / PortAudio (fallback for non-macOS)."""
         try:
@@ -281,7 +295,9 @@ class MicCapture:
             stream.start()
             logger.info("Microphone recording started (sounddevice)")
-            self._flush_queue(int(0.2 * self._sample_rate / _CHUNK_SAMPLES), chunk_timeout=0.1)
+            self._flush_queue(2, chunk_timeout=0.1)
+            if on_ready:
+                on_ready()
             return await self._run_vad_loop(vad, timeout, silence_threshold, cancel_event)
         except MicCaptureError:
             raise
@@ -330,6 +346,8 @@ class MicCapture:
         speech_detected = False
         silence_duration = 0.0
         zero_check_done = False
+        # Bluetooth A2DP→HFP switch delivers zeros for up to ~2s
+        zero_threshold = _ZERO_CHECK_CHUNKS_BT if is_bluetooth_output() else _ZERO_CHECK_CHUNKS
         start_time = loop.time()
         while not self._stop_flag:
@@ -354,7 +372,7 @@ class MicCapture:
             chunks.append(chunk)
-            if not zero_check_done and len(chunks) >= _ZERO_CHECK_CHUNKS:
+            if not zero_check_done and len(chunks) >= zero_threshold:
                 zero_check_done = True
                 if all(np.max(np.abs(c)) == 0.0 for c in chunks):
                     raise MicCaptureError(self._zero_audio_message())

package/tts/audio_player.py CHANGED Viewed

@@ -10,7 +10,6 @@ import time
 import soundfile as sf
 from shared import PlaybackResult, AudioPlayerError, AUDIO_LOCK_PATH, get_logger
-from tts.media_duck import duck, unduck
 logger = get_logger("tts.audio_player")
@@ -18,9 +17,8 @@ logger = get_logger("tts.audio_player")
 class AudioPlayer:
     """Plays audio samples through an external player process."""
-    def __init__(self, player_command: str = "mpv", duck_media: bool = False) -> None:
+    def __init__(self, player_command: str = "mpv") -> None:
         self._player_command = player_command
-        self._duck_media = duck_media
         self._process: subprocess.Popen | None = None
         # Detect platform fallback if player_command is not available
@@ -84,23 +82,19 @@ class AudioPlayer:
             # Cross-session audio lock: prevents overlapping playback
             # flock is kernel-managed — auto-released on crash, no stale locks
-            paused_apps = duck() if self._duck_media else []
-            try:
-                with open(AUDIO_LOCK_PATH, "w") as lock_file:
-                    fcntl.flock(lock_file, fcntl.LOCK_EX)
-                    start = time.perf_counter()
-                    self._process = subprocess.Popen(
-                        cmd,
-                        stdout=subprocess.DEVNULL,
-                        stderr=subprocess.DEVNULL,
-                    )
-                    self._process.wait()
-                    duration_ms = (time.perf_counter() - start) * 1000
-                # Lock released when lock_file closes
-            finally:
-                unduck(paused_apps)
+            with open(AUDIO_LOCK_PATH, "w") as lock_file:
+                fcntl.flock(lock_file, fcntl.LOCK_EX)
+                start = time.perf_counter()
+                self._process = subprocess.Popen(
+                    cmd,
+                    stdout=subprocess.DEVNULL,
+                    stderr=subprocess.DEVNULL,
+                )
+                self._process.wait()
+                duration_ms = (time.perf_counter() - start) * 1000
+            # Lock released when lock_file closes
             if self._process.returncode != 0:
                 return PlaybackResult(

package/tts/kokoro_engine.py CHANGED Viewed

@@ -2,6 +2,8 @@
 import time
+import numpy as np
 from shared import SynthesisResult, TTSEngineError, ALL_VOICE_IDS, SAMPLE_RATE, get_logger
 logger = get_logger("tts.kokoro")
@@ -47,6 +49,11 @@ class KokoroEngine:
             samples, sample_rate = self._model.create(text, voice=voice_id, speed=speed)
             synthesis_ms = (time.perf_counter() - start) * 1000
+            # Pad 100ms silence — kokoro-onnx trim() snaps to 512-sample hops
+            # (~21ms at 24kHz) which can clip the trailing edge of the last phoneme.
+            pad = int(sample_rate * 0.10)
+            samples = np.concatenate([samples, np.zeros(pad, dtype=samples.dtype)])
             duration_ms = (len(samples) / sample_rate) * 1000
             return SynthesisResult(

package/tts/media_duck.py CHANGED Viewed

@@ -14,6 +14,8 @@ Usage:
     unduck(paused)         # resume only what we paused
 """
+import ctypes
+import ctypes.util
 import platform
 import subprocess
@@ -89,6 +91,66 @@ tell application "{target}"
 end tell"""
+# ── Bluetooth detection (macOS CoreAudio) ────────────────────────────────────
+def is_bluetooth_output() -> bool:
+    """Return True if the default audio output is a Bluetooth device.
+    Uses CoreAudio's AudioObjectGetPropertyData to check the transport type
+    of the default output device.  Returns False on non-macOS or on error.
+    """
+    if platform.system() != "Darwin":
+        return False
+    try:
+        lib_path = ctypes.util.find_library("CoreAudio")
+        if not lib_path:
+            return False
+        ca = ctypes.cdll.LoadLibrary(lib_path)
+        class _AudioObjectPropertyAddress(ctypes.Structure):
+            _fields_ = [
+                ("mSelector", ctypes.c_uint32),
+                ("mScope", ctypes.c_uint32),
+                ("mElement", ctypes.c_uint32),
+            ]
+        # CoreAudio FourCC constants
+        _SYS_OBJ   = 1                                            # kAudioObjectSystemObject
+        _SCOPE_G   = int.from_bytes(b"glob", "big")               # kAudioObjectPropertyScopeGlobal
+        _ELEM_M    = 0                                             # kAudioObjectPropertyElementMain
+        _DEF_OUT   = int.from_bytes(b"dOut", "big")                # kAudioHardwarePropertyDefaultOutputDevice
+        _TRANS     = int.from_bytes(b"tran", "big")                # kAudioDevicePropertyTransportType
+        _BT        = int.from_bytes(b"blue", "big")                # kAudioDeviceTransportTypeBluetooth
+        _BT_LE     = int.from_bytes(b"blea", "big")                # kAudioDeviceTransportTypeBluetoothLE
+        # Get default output device ID
+        addr = _AudioObjectPropertyAddress(_DEF_OUT, _SCOPE_G, _ELEM_M)
+        device_id = ctypes.c_uint32(0)
+        size = ctypes.c_uint32(4)
+        err = ca.AudioObjectGetPropertyData(
+            _SYS_OBJ, ctypes.byref(addr), 0, None,
+            ctypes.byref(size), ctypes.byref(device_id),
+        )
+        if err != 0:
+            return False
+        # Get transport type of that device
+        addr.mSelector = _TRANS
+        transport = ctypes.c_uint32(0)
+        size = ctypes.c_uint32(4)
+        err = ca.AudioObjectGetPropertyData(
+            device_id.value, ctypes.byref(addr), 0, None,
+            ctypes.byref(size), ctypes.byref(transport),
+        )
+        if err != 0:
+            return False
+        return transport.value in (_BT, _BT_LE)
+    except Exception:
+        return False
 # ── Public API ────────────────────────────────────────────────────────────────
 def duck() -> list[str]:

package/tts/speech_queue.py CHANGED Viewed

@@ -6,6 +6,7 @@ import time
 from shared import SpeakResult, MAX_CHUNK_LENGTH, get_logger
 from tts.kokoro_engine import KokoroEngine
 from tts.audio_player import AudioPlayer
+from tts.media_duck import duck, unduck
 logger = get_logger("tts.speech_queue")
@@ -13,9 +14,10 @@ logger = get_logger("tts.speech_queue")
 class SpeechQueue:
     """Manages sequential speech synthesis and playback."""
-    def __init__(self, engine: KokoroEngine, player: AudioPlayer) -> None:
+    def __init__(self, engine: KokoroEngine, player: AudioPlayer, duck_media: bool = False) -> None:
         self._engine = engine
         self._player = player
+        self._duck_media = duck_media
         self._queue: asyncio.Queue = asyncio.Queue()
         self._speaking = False
@@ -60,6 +62,9 @@ class SpeechQueue:
         total_duration_ms = 0.0
         total_synthesis_ms = 0.0
+        # Duck media for the entire utterance, not per-chunk
+        paused_apps = duck() if self._duck_media else []
         try:
             chunks = self.chunk_text(text)
@@ -105,6 +110,7 @@ class SpeechQueue:
                 error=str(e),
             )
         finally:
+            unduck(paused_apps)
             self._speaking = False
     def stop(self) -> bool: