npm - voicesmith-mcp - Versions diffs - 1.0.13 → 1.0.15 - Mend

voicesmith-mcp 1.0.13 → 1.0.15

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (14)

package/config.json +2 -1
package/config.py +4 -0
package/package.json +1 -1
package/server.py +38 -7
package/session_registry.py +46 -0
package/stt/__pycache__/mic_capture.cpython-314.pyc +0 -0
package/stt/mic_capture.py +323 -76
package/tts/__pycache__/audio_player.cpython-314.pyc +0 -0
package/tts/__pycache__/media_duck.cpython-314.pyc +0 -0
package/tts/__pycache__/speech_queue.cpython-314.pyc +0 -0
package/tts/audio_player.py +20 -14
package/tts/media_duck.py +146 -0
package/tts/speech_queue.py +1 -1
package/voice_registry.py +16 -0

package/config.json CHANGED Viewed

@@ -4,7 +4,8 @@
     "voices_path": "~/.local/share/voicesmith-mcp/models/voices-v1.0.bin",
     "default_voice": "am_eric",
     "default_speed": 1.0,
-    "audio_player": "mpv"
+    "audio_player": "mpv",
+    "duck_media": true
   },
   "stt": {
     "model_size": "base",

package/config.py CHANGED Viewed

@@ -26,6 +26,7 @@ class TTSConfig:
     default_voice: str = "am_eric"
     default_speed: float = 1.0
     audio_player: str = "mpv"
+    duck_media: bool = False
 @dataclass
@@ -99,6 +100,8 @@ def load_config(config_path: Optional[Path] = None) -> AppConfig:
                     config.tts.default_speed = float(tts["default_speed"])
                 if "audio_player" in tts:
                     config.tts.audio_player = tts["audio_player"]
+                if "duck_media" in tts:
+                    config.tts.duck_media = bool(tts["duck_media"])
             # STT config
             if "stt" in data:
@@ -179,6 +182,7 @@ def save_config(config: AppConfig, config_path: Optional[Path] = None) -> None:
             "default_voice": config.tts.default_voice,
             "default_speed": config.tts.default_speed,
             "audio_player": config.tts.audio_player,
+            "duck_media": config.tts.duck_media,
         },
         "stt": {
             "model_size": config.stt.model_size,

package/package.json CHANGED Viewed

@@ -1,6 +1,6 @@
 {
   "name": "voicesmith-mcp",
-  "version": "1.0.13",
+  "version": "1.0.15",
   "description": "Local AI voice for coding assistants — TTS & STT via MCP. Kokoro ONNX + faster-whisper, fully offline.",
   "bin": {
     "voicesmith-mcp": "bin/cli.js"

package/server.py CHANGED Viewed

@@ -41,7 +41,7 @@ from shared import (
     get_logger,
 )
 from config import load_config, save_config, get_config_path, AppConfig
-from session_registry import register_session, unregister_session
+from session_registry import register_session, rename_session, unregister_session
 logger = get_logger("server")
@@ -82,7 +82,7 @@ def _init_tts(config: AppConfig):
     try:
         _tts_engine = KokoroEngine(config.tts.model_path, config.tts.voices_path)
-        _audio_player = AudioPlayer(config.tts.audio_player)
+        _audio_player = AudioPlayer(config.tts.audio_player, duck_media=config.tts.duck_media)
         _speech_queue = SpeechQueue(_tts_engine, _audio_player)
         logger.info("TTS subsystem initialized")
     except TTSEngineError as e:
@@ -454,14 +454,18 @@ async def listen(timeout: float = 15, prompt: str = "", silence_threshold: float
         logger.info(f"Listening (prompt: {prompt})")
     try:
+        loop = asyncio.get_running_loop()
         # Play ready sound so the user knows to start speaking
         # Skip for push-to-talk (HTTP) — it has its own beep
         if prompt != "push-to-talk":
-            loop = asyncio.get_event_loop()
             await loop.run_in_executor(None, _play_ready_sound)
         start = time.perf_counter()
+        # Reset VAD state from any prior recording (LSTM hidden state + context)
+        _vad.reset()
         # Record audio with VAD
         audio = await _mic_capture.record(
             vad=_vad,
@@ -479,7 +483,6 @@ async def listen(timeout: float = 15, prompt: str = "", silence_threshold: float
         recording_ms = (time.perf_counter() - start) * 1000
         # Transcribe
-        loop = asyncio.get_event_loop()
         result = await loop.run_in_executor(
             None, _stt_engine.transcribe, audio, STT_SAMPLE_RATE
         )
@@ -565,6 +568,10 @@ async def get_voice_registry() -> dict:
 async def set_voice(name: str, voice: str) -> dict:
     """Assign or reassign a voice to an agent name.
+    Also renames the session so name and voice always match.
+    The name is derived from the voice ID (e.g., "am_fenrir" -> "Fenrir").
+    If the derived name is taken by another session, returns name_occupied error.
     Args:
         name: Agent name to assign.
         voice: Kokoro voice ID (e.g., "am_eric"). Must be valid.
@@ -579,17 +586,41 @@ async def set_voice(name: str, voice: str) -> dict:
             "message": f"Voice '{voice}' not found. Use list_voices to see available options.",
         }
-    _registry.set_voice(name, voice)
+    # Derive canonical name from voice ID (e.g., "am_fenrir" -> "Fenrir")
+    # The voice ID format is {prefix}_{name}, so split on underscore and capitalize
+    parts = voice.split("_", 1)
+    new_name = parts[1].capitalize() if len(parts) == 2 else name
+    old_name = _session_info["name"] if _session_info else name
+    # Update sessions.json with conflict check
+    if _session_info:
+        try:
+            updated = rename_session(os.getpid(), new_name, voice)
+            if updated:
+                _session_info.update(updated)
+        except ValueError:
+            return {
+                "success": False,
+                "error": "name_occupied",
+                "message": f"'{new_name}' is occupied by another session.",
+            }
+    # Update voice registry (remove old entry, add new)
+    _registry.rename_voice(old_name, new_name, voice)
     # Persist last voice name so it survives session restart / resume
     if _config is not None:
-        _config.last_voice_name = name
+        _config.last_voice_name = new_name
         try:
             save_config(_config)
         except Exception as e:
             logger.warning(f"Failed to persist last_voice_name: {e}")
-    return {"success": True, "name": name, "voice": voice}
+    result = {"success": True, "name": new_name, "voice": voice}
+    if old_name != new_name:
+        result["previous_name"] = old_name
+    return result
 @mcp.tool()

package/session_registry.py CHANGED Viewed

@@ -258,6 +258,52 @@ def register_session(
     return session
+def rename_session(pid: int, new_name: str, new_voice: str) -> Optional[dict]:
+    """Rename this server's session in the registry.
+    Updates the name and voice fields for the entry matching pid.
+    Returns the updated session dict, or None if PID not found.
+    Raises ValueError if new_name is taken by another active session.
+    """
+    path = _sessions_path()
+    if not path.exists():
+        return None
+    try:
+        with open(path, "r+") as f:
+            fcntl.flock(f, fcntl.LOCK_EX)
+            sessions = _read_sessions(path)
+            sessions = _clean_stale(sessions)
+            # Find our entry
+            our_entry = None
+            for s in sessions:
+                if s.get("pid") == pid:
+                    our_entry = s
+                    break
+            if our_entry is None:
+                return None
+            # Check if new_name is taken by another session
+            if new_name != our_entry["name"]:
+                for s in sessions:
+                    if s.get("name") == new_name and s.get("pid") != pid:
+                        raise ValueError(
+                            f"'{new_name}' is occupied by another session (pid {s.get('pid')})"
+                        )
+            our_entry["name"] = new_name
+            our_entry["voice"] = new_voice
+            _write_sessions(path, sessions)
+            return dict(our_entry)
+    except ValueError:
+        raise
+    except OSError as e:
+        logger.warning(f"Failed to rename session: {e}")
+        return None
 def unregister_session() -> None:
     """Remove this server's session from the registry."""
     path = _sessions_path()

package/stt/__pycache__/mic_capture.cpython-314.pyc CHANGED Viewed

Binary file

package/stt/mic_capture.py CHANGED Viewed

@@ -1,7 +1,12 @@
 """Microphone capture with VAD-controlled recording."""
 import asyncio
+import os
+import platform
 import queue
+import socket
+import subprocess
+import threading
 import time
 from typing import Optional
@@ -12,6 +17,65 @@ from stt.vad import VoiceActivityDetector
 logger = get_logger("stt.mic")
+_CHUNK_SAMPLES = 512        # Silero VAD requires exactly 512-sample chunks at 16kHz
+_CHUNK_BYTES   = _CHUNK_SAMPLES * 4   # float32 = 4 bytes/sample → 2048 bytes/chunk
+_ZERO_CHECK_CHUNKS = 10    # ~320ms of silence before detecting TCC denial
+_AUDIO_SERVICE_SOCKET  = "/tmp/voicesmith-audio.sock"
+_LAUNCHAGENT_LABEL     = "com.voicesmith-mcp.audio"
+_LAUNCHAGENT_PLIST     = os.path.expanduser(
+    f"~/Library/LaunchAgents/{_LAUNCHAGENT_LABEL}.plist"
+)
+def _find_app_binary(name: str) -> Optional[str]:
+    """Return path to a named binary inside VoiceSmithMCP.app, or None."""
+    install_dir = os.path.dirname(os.path.dirname(os.path.abspath(__file__)))
+    binary = os.path.join(install_dir, "VoiceSmithMCP.app", "Contents", "MacOS", name)
+    return binary if os.path.isfile(binary) and os.access(binary, os.X_OK) else None
+def _launchagent_available() -> bool:
+    """Return True if the VoiceSmithMCP audio LaunchAgent plist is installed."""
+    return os.path.isfile(_LAUNCHAGENT_PLIST)
+def _ensure_audio_service_running() -> None:
+    """Start the audio LaunchAgent if it is not already running.
+    The service is started via launchctl.  We then wait up to 3 seconds for
+    the Unix socket to appear, which signals the service is ready to accept
+    connections.
+    """
+    # If the socket exists and is connectable, service is already running.
+    if _socket_ready():
+        return
+    logger.info("Starting audio service via launchctl")
+    try:
+        subprocess.run(
+            ["launchctl", "start", _LAUNCHAGENT_LABEL],
+            capture_output=True,
+            timeout=5,
+        )
+    except Exception as e:
+        raise MicCaptureError(f"Failed to start audio service: {e}") from e
+    # Wait up to 3 s for the socket to appear.
+    for _ in range(30):
+        if _socket_ready():
+            return
+        time.sleep(0.1)
+    raise MicCaptureError(
+        "VoiceSmith audio service did not start in time.  "
+        f"Check {_LAUNCHAGENT_PLIST} and launchctl output."
+    )
+def _socket_ready() -> bool:
+    """Return True if the audio service socket file exists."""
+    return os.path.exists(_AUDIO_SERVICE_SOCKET)
 class MicCapture:
     """Microphone capture with voice activity detection."""
@@ -31,7 +95,11 @@ class MicCapture:
     ) -> Optional[np.ndarray]:
         """Record audio from the microphone until silence is detected.
-        Uses VAD to detect speech and stop recording after a period of silence.
+        On macOS, prefers the audio-service LaunchAgent backend which runs
+        under launchd (ppid=1), ensuring macOS TCC attributes mic permission
+        to VoiceSmithMCP.app rather than to the user's terminal app.
+        Falls back to the audio-capture subprocess if the LaunchAgent is not
+        installed, and to sounddevice on non-macOS systems.
         Args:
             vad: VoiceActivityDetector instance for speech detection.
@@ -48,6 +116,150 @@ class MicCapture:
         if self._recording:
             raise MicCaptureError("Another recording is already in progress")
+        # Reset VAD state between recordings.
+        vad.reset()
+        if platform.system() == "Darwin":
+            if _launchagent_available():
+                return await self._record_via_socket(
+                    vad, timeout, silence_threshold, cancel_event
+                )
+            # Legacy: subprocess fallback for installs without the LaunchAgent.
+            audio_capture_bin = _find_app_binary("audio-service") or _find_app_binary("audio-capture")
+            if audio_capture_bin:
+                return await self._record_via_subprocess(
+                    audio_capture_bin, vad, timeout, silence_threshold, cancel_event
+                )
+        return await self._record_via_sounddevice(
+            vad, timeout, silence_threshold, cancel_event
+        )
+    # ── LaunchAgent socket backend (macOS primary) ─────────────────────────────
+    async def _record_via_socket(
+        self,
+        vad: VoiceActivityDetector,
+        timeout: float,
+        silence_threshold: float,
+        cancel_event: Optional[asyncio.Event],
+    ) -> Optional[np.ndarray]:
+        """Record via the VoiceSmithMCP audio LaunchAgent (Unix socket).
+        The LaunchAgent runs under launchd so macOS TCC attributes mic access
+        to com.voicesmith-mcp.launcher, not to the parent terminal app.
+        """
+        loop = asyncio.get_running_loop()
+        # Ensure the service is up and the socket is ready.
+        try:
+            await loop.run_in_executor(None, _ensure_audio_service_running)
+        except MicCaptureError:
+            raise
+        except Exception as e:
+            raise MicCaptureError(f"Audio service error: {e}") from e
+        # Open socket connection.
+        sock = socket.socket(socket.AF_UNIX, socket.SOCK_STREAM)
+        try:
+            sock.connect(_AUDIO_SERVICE_SOCKET)
+        except OSError as e:
+            sock.close()
+            raise MicCaptureError(f"Cannot connect to audio service: {e}") from e
+        self._recording = True
+        self._stop_flag = False
+        self._audio_queue = queue.Queue()
+        def _reader() -> None:
+            """Background thread: reads socket chunks → audio_queue."""
+            try:
+                while True:
+                    data = b""
+                    while len(data) < _CHUNK_BYTES:
+                        got = sock.recv(_CHUNK_BYTES - len(data))
+                        if not got:
+                            return  # service closed connection
+                        data += got
+                    self._audio_queue.put(np.frombuffer(data, dtype=np.float32).copy())
+            except Exception as exc:
+                logger.debug(f"socket reader thread exiting: {exc}")
+        reader_thread = threading.Thread(target=_reader, daemon=True)
+        reader_thread.start()
+        logger.info("Microphone recording started (audio-service socket)")
+        try:
+            self._flush_queue(int(0.2 * self._sample_rate / _CHUNK_SAMPLES))
+            return await self._run_vad_loop(vad, timeout, silence_threshold, cancel_event)
+        finally:
+            sock.close()  # signals service to stop sending for this session
+            reader_thread.join(timeout=1)
+            self._recording = False
+    # ── Subprocess backend (macOS legacy fallback) ─────────────────────────────
+    async def _record_via_subprocess(
+        self,
+        binary: str,
+        vad: VoiceActivityDetector,
+        timeout: float,
+        silence_threshold: float,
+        cancel_event: Optional[asyncio.Event],
+    ) -> Optional[np.ndarray]:
+        """Record using a CoreAudio binary inside VoiceSmithMCP.app (legacy)."""
+        self._recording = True
+        self._stop_flag = False
+        self._audio_queue = queue.Queue()
+        try:
+            proc = subprocess.Popen(
+                [binary],
+                stdout=subprocess.PIPE,
+                stderr=subprocess.PIPE,
+                close_fds=True,
+            )
+        except Exception as e:
+            self._recording = False
+            raise MicCaptureError(f"Failed to start audio binary: {e}") from e
+        logger.info("Microphone recording started (subprocess fallback)")
+        def _reader() -> None:
+            try:
+                while True:
+                    data = proc.stdout.read(_CHUNK_BYTES)
+                    if not data or len(data) < _CHUNK_BYTES:
+                        break
+                    self._audio_queue.put(np.frombuffer(data, dtype=np.float32).copy())
+            except Exception as exc:
+                logger.debug(f"subprocess reader thread exiting: {exc}")
+        reader_thread = threading.Thread(target=_reader, daemon=True)
+        reader_thread.start()
+        try:
+            self._flush_queue(int(0.2 * self._sample_rate / _CHUNK_SAMPLES))
+            return await self._run_vad_loop(vad, timeout, silence_threshold, cancel_event)
+        finally:
+            proc.terminate()
+            try:
+                proc.wait(timeout=1)
+            except Exception:
+                proc.kill()
+            reader_thread.join(timeout=1)
+            self._recording = False
+    # ── sounddevice backend (non-macOS fallback) ───────────────────────────────
+    async def _record_via_sounddevice(
+        self,
+        vad: VoiceActivityDetector,
+        timeout: float,
+        silence_threshold: float,
+        cancel_event: Optional[asyncio.Event],
+    ) -> Optional[np.ndarray]:
+        """Record using sounddevice / PortAudio (fallback for non-macOS)."""
         try:
             import sounddevice as sd
         except Exception as e:
@@ -56,15 +268,6 @@ class MicCapture:
         self._recording = True
         self._stop_flag = False
         self._audio_queue = queue.Queue()
-        chunks: list[np.ndarray] = []
-        speech_detected = False
-        silence_duration = 0.0
-        loop = asyncio.get_event_loop()
-        # Reset VAD state — the LSTM hidden state and context window must
-        # be cleared between recordings to avoid stale state from previous
-        # audio affecting speech detection.
-        vad.reset()
         stream = None
         try:
@@ -72,94 +275,138 @@ class MicCapture:
                 samplerate=self._sample_rate,
                 channels=1,
                 dtype="float32",
-                blocksize=512,  # Silero VAD expects 512-sample chunks at 16kHz
+                blocksize=_CHUNK_SAMPLES,
                 callback=self._audio_callback,
             )
             stream.start()
-            logger.info("Microphone recording started")
-            # Discard the first ~200ms of audio to avoid picking up residual
-            # speaker output (Tink sound or TTS playback that just finished).
-            # This prevents VAD from detecting speaker bleed as "speech" and
-            # then cutting off when the bleed stops.
-            flush_chunks = int(0.2 * self._sample_rate / 512)  # ~6 chunks
-            for _ in range(flush_chunks):
-                try:
-                    self._audio_queue.get(timeout=0.1)
-                except queue.Empty:
-                    break
-            start_time = asyncio.get_event_loop().time()
-            while not self._stop_flag:
-                # Check cancellation
-                if cancel_event and cancel_event.is_set():
-                    logger.info("Recording cancelled by event")
-                    break
-                # Check timeout
-                elapsed = asyncio.get_event_loop().time() - start_time
-                if elapsed >= timeout:
-                    if not speech_detected:
-                        logger.info("Recording timed out with no speech detected")
-                    else:
-                        logger.info("Recording timed out")
-                    break
-                # Get audio chunk from queue
-                try:
-                    chunk = await loop.run_in_executor(
-                        None, self._audio_queue.get, True, 0.1
-                    )
-                except queue.Empty:
-                    continue
-                chunks.append(chunk)
-                is_speech = vad.is_speech(chunk)
-                if is_speech:
-                    speech_detected = True
-                    silence_duration = 0.0
-                elif speech_detected:
-                    # Count silence after speech was detected
-                    chunk_duration = len(chunk) / self._sample_rate
-                    silence_duration += chunk_duration
-                    if silence_duration >= silence_threshold:
-                        logger.info(
-                            f"Silence threshold reached ({silence_threshold}s), stopping"
-                        )
-                        break
-            if not chunks or not speech_detected:
-                return None
-            return np.concatenate(chunks).flatten()
+            logger.info("Microphone recording started (sounddevice)")
+            self._flush_queue(int(0.2 * self._sample_rate / _CHUNK_SAMPLES), chunk_timeout=0.1)
+            return await self._run_vad_loop(vad, timeout, silence_threshold, cancel_event)
         except MicCaptureError:
             raise
         except Exception as e:
             raise MicCaptureError(f"Recording failed: {e}") from e
         finally:
-            # Safely tear down the audio stream. The CoreAudio IO thread may
-            # still be executing the callback when we call stop(). Wait briefly
-            # between stop() and close() to let the IO thread finish — this
-            # prevents the segfault in libffi/PortAudio where the callback
-            # dereferences freed memory.
             if stream is not None:
                 try:
                     stream.stop()
-                    time.sleep(0.05)  # Let CoreAudio IO thread finish
+                    time.sleep(0.05)
                     stream.close()
                 except Exception as e:
                     logger.debug(f"Stream teardown: {e}")
             self._recording = False
-    def _audio_callback(self, indata, frames, time, status) -> None:
+    # ── Shared helpers ─────────────────────────────────────────────────────────
+    def _flush_queue(self, n_chunks: int, chunk_timeout: float = 0.15) -> None:
+        """Discard the first n_chunks from the audio queue (drops speaker bleed)."""
+        for _ in range(n_chunks):
+            try:
+                self._audio_queue.get(timeout=chunk_timeout)
+            except queue.Empty:
+                break
+    # ── Shared VAD loop ────────────────────────────────────────────────────────
+    async def _run_vad_loop(
+        self,
+        vad: VoiceActivityDetector,
+        timeout: float,
+        silence_threshold: float,
+        cancel_event: Optional[asyncio.Event],
+    ) -> Optional[np.ndarray]:
+        """VAD recording loop — shared by all capture backends.
+        Reads 512-sample float32 chunks from self._audio_queue, runs Silero VAD
+        on each, and returns when silence_threshold is exceeded after speech,
+        timeout elapses, or cancel_event fires.
+        Raises:
+            MicCaptureError: If audio is all-zeros (TCC denial detected).
+        """
+        loop = asyncio.get_running_loop()
+        chunks: list[np.ndarray] = []
+        speech_detected = False
+        silence_duration = 0.0
+        zero_check_done = False
+        start_time = loop.time()
+        while not self._stop_flag:
+            if cancel_event and cancel_event.is_set():
+                logger.info("Recording cancelled by event")
+                break
+            elapsed = loop.time() - start_time
+            if elapsed >= timeout:
+                if not speech_detected:
+                    logger.info("Recording timed out with no speech detected")
+                else:
+                    logger.info("Recording timed out")
+                break
+            try:
+                chunk = await loop.run_in_executor(
+                    None, self._audio_queue.get, True, 0.1
+                )
+            except queue.Empty:
+                continue
+            chunks.append(chunk)
+            if not zero_check_done and len(chunks) >= _ZERO_CHECK_CHUNKS:
+                zero_check_done = True
+                if all(np.max(np.abs(c)) == 0.0 for c in chunks):
+                    raise MicCaptureError(self._zero_audio_message())
+            is_speech = vad.is_speech(chunk)
+            if is_speech:
+                speech_detected = True
+                silence_duration = 0.0
+            elif speech_detected:
+                silence_duration += len(chunk) / self._sample_rate
+                if silence_duration >= silence_threshold:
+                    logger.info(
+                        f"Silence threshold reached ({silence_threshold}s), stopping"
+                    )
+                    break
+        if not chunks or not speech_detected:
+            return None
+        return np.concatenate(chunks).flatten()
+    # ── sounddevice callback ───────────────────────────────────────────────────
+    def _audio_callback(self, indata, frames, time_info, status) -> None:
         """Sounddevice callback — pushes audio chunks to the queue."""
         if status:
             logger.warning(f"Audio callback status: {status}")
         self._audio_queue.put(indata.copy())
+    # ── Error message ──────────────────────────────────────────────────────────
+    @staticmethod
+    def _zero_audio_message() -> str:
+        """Build an error message for zero-amplitude mic input."""
+        msg = (
+            "Microphone is returning silent audio. "
+            "The audio stream opened successfully but every sample is zero."
+        )
+        if platform.system() == "Darwin":
+            msg += (
+                "\n\nmacOS is blocking mic access.  The VoiceSmithMCP audio service "
+                "may not have been granted Microphone permission yet.  "
+                "Check System Settings > Privacy & Security > Microphone and "
+                "ensure VoiceSmithMCP is enabled.\n\n"
+                "If VoiceSmithMCP is not listed, re-run the installer:\n"
+                "  ./install.sh"
+            )
+        return msg
+    # ── Properties / control ──────────────────────────────────────────────────
     @property
     def is_recording(self) -> bool:
         """Return whether the microphone is currently recording."""

package/tts/__pycache__/audio_player.cpython-314.pyc CHANGED Viewed

Binary file

package/tts/__pycache__/media_duck.cpython-314.pyc ADDED Viewed

Binary file

package/tts/__pycache__/speech_queue.cpython-314.pyc CHANGED Viewed

Binary file

package/tts/audio_player.py CHANGED Viewed

@@ -10,6 +10,7 @@ import time
 import soundfile as sf
 from shared import PlaybackResult, AudioPlayerError, AUDIO_LOCK_PATH, get_logger
+from tts.media_duck import duck, unduck
 logger = get_logger("tts.audio_player")
@@ -17,8 +18,9 @@ logger = get_logger("tts.audio_player")
 class AudioPlayer:
     """Plays audio samples through an external player process."""
-    def __init__(self, player_command: str = "mpv") -> None:
+    def __init__(self, player_command: str = "mpv", duck_media: bool = False) -> None:
         self._player_command = player_command
+        self._duck_media = duck_media
         self._process: subprocess.Popen | None = None
         # Detect platform fallback if player_command is not available
@@ -82,19 +84,23 @@ class AudioPlayer:
             # Cross-session audio lock: prevents overlapping playback
             # flock is kernel-managed — auto-released on crash, no stale locks
-            with open(AUDIO_LOCK_PATH, "w") as lock_file:
-                fcntl.flock(lock_file, fcntl.LOCK_EX)
-                start = time.perf_counter()
-                self._process = subprocess.Popen(
-                    cmd,
-                    stdout=subprocess.DEVNULL,
-                    stderr=subprocess.DEVNULL,
-                )
-                self._process.wait()
-                duration_ms = (time.perf_counter() - start) * 1000
-            # Lock released when lock_file closes
+            paused_apps = duck() if self._duck_media else []
+            try:
+                with open(AUDIO_LOCK_PATH, "w") as lock_file:
+                    fcntl.flock(lock_file, fcntl.LOCK_EX)
+                    start = time.perf_counter()
+                    self._process = subprocess.Popen(
+                        cmd,
+                        stdout=subprocess.DEVNULL,
+                        stderr=subprocess.DEVNULL,
+                    )
+                    self._process.wait()
+                    duration_ms = (time.perf_counter() - start) * 1000
+                # Lock released when lock_file closes
+            finally:
+                unduck(paused_apps)
             if self._process.returncode != 0:
                 return PlaybackResult(

package/tts/media_duck.py ADDED Viewed

@@ -0,0 +1,146 @@
+"""macOS media ducking via osascript.
+Pauses media apps (Apple Music, Spotify) and browser tabs (Chrome, Brave,
+Edge, Safari) before VoiceSmith audio playback and resumes them afterward.
+No-ops on non-macOS systems.
+Browser ducking uses JavaScript injection via AppleScript.  The first time
+each browser is targeted, macOS will prompt for Automation permission — approve
+once and it is remembered.
+Usage:
+    paused = duck()        # pause everything playing; returns opaque token list
+    ...play audio...
+    unduck(paused)         # resume only what we paused
+"""
+import platform
+import subprocess
+from shared import get_logger
+logger = get_logger("tts.media_duck")
+# ── Native media apps ─────────────────────────────────────────────────────────
+# (display name, AppleScript target)
+_APPS = [
+    ("Apple Music", "Music"),
+    ("Spotify",     "Spotify"),
+]
+# ── Browsers ──────────────────────────────────────────────────────────────────
+# (display name, AppleScript target, family: "chrome" | "safari")
+_BROWSERS = [
+    ("Google Chrome",   "Google Chrome",   "chrome"),
+    ("Brave Browser",   "Brave Browser",   "chrome"),
+    ("Microsoft Edge",  "Microsoft Edge",  "chrome"),
+    ("Safari",          "Safari",          "safari"),
+]
+# JS injected into every tab on duck: pause playing media and mark it.
+_JS_PAUSE = (
+    "document.querySelectorAll('video,audio').forEach(function(v){"
+    "if(!v.paused){v.pause();v.dataset.voicesmithPaused='1'}"
+    "})"
+)
+# JS injected on unduck: resume only elements we marked, then clear the mark.
+_JS_RESUME = (
+    "document.querySelectorAll('video,audio').forEach(function(v){"
+    "if(v.dataset.voicesmithPaused){delete v.dataset.voicesmithPaused;v.play()}"
+    "})"
+)
+# ── Helpers ───────────────────────────────────────────────────────────────────
+def _osascript(script: str) -> str:
+    """Run an AppleScript (may be multi-line); return stdout stripped, or '' on error."""
+    try:
+        result = subprocess.run(
+            ["osascript"],
+            input=script,
+            capture_output=True,
+            text=True,
+            timeout=5,
+        )
+        return result.stdout.strip()
+    except Exception:
+        return ""
+def _browser_script(target: str, family: str, js: str) -> str:
+    """Build an AppleScript that runs js in every tab of target browser."""
+    if family == "safari":
+        exec_stmt = f'do JavaScript "{js}" in t'
+    else:  # chrome family
+        exec_stmt = f'execute t javascript "{js}"'
+    return f"""\
+tell application "{target}"
+    repeat with w in windows
+        repeat with t in tabs of w
+            try
+                {exec_stmt}
+            end try
+        end repeat
+    end repeat
+end tell"""
+# ── Public API ────────────────────────────────────────────────────────────────
+def duck() -> list[str]:
+    """Pause any playing media apps and browser tabs.
+    Returns:
+        Opaque list of tokens — pass unchanged to unduck().
+    """
+    if platform.system() != "Darwin":
+        return []
+    paused: list[str] = []
+    # Native apps (Music, Spotify)
+    for display_name, target in _APPS:
+        if _osascript(f'application "{target}" is running') != "true":
+            continue
+        if _osascript(f'tell application "{target}" to get player state') == "playing":
+            _osascript(f'tell application "{target}" to pause')
+            paused.append(target)
+            logger.debug(f"Ducked {display_name}")
+    # Browsers — inject pause JS into every tab
+    for display_name, target, family in _BROWSERS:
+        if _osascript(f'application "{target}" is running') != "true":
+            continue
+        _osascript(_browser_script(target, family, _JS_PAUSE))
+        paused.append(f"browser:{target}")
+        logger.debug(f"Ducked browser tabs in {display_name}")
+    return paused
+def unduck(paused: list[str]) -> None:
+    """Resume apps and browser tabs paused by duck().
+    Args:
+        paused: The list returned by a previous duck() call.
+    """
+    if platform.system() != "Darwin":
+        return
+    for token in paused:
+        if token.startswith("browser:"):
+            target = token[len("browser:"):]
+            # family lookup for resume script
+            family = next(
+                (f for _, t, f in _BROWSERS if t == target),
+                "chrome",
+            )
+            _osascript(_browser_script(target, family, _JS_RESUME))
+            logger.debug(f"Unducked browser tabs in {target}")
+        else:
+            _osascript(f'tell application "{token}" to play')
+            logger.debug(f"Unducked {token}")

package/tts/speech_queue.py CHANGED Viewed

@@ -55,7 +55,7 @@ class SpeechQueue:
         speed: float,
     ) -> SpeakResult:
         """Internal: synthesize and play text, blocking until done."""
-        loop = asyncio.get_event_loop()
+        loop = asyncio.get_running_loop()
         self._speaking = True
         total_duration_ms = 0.0
         total_synthesis_ms = 0.0

package/voice_registry.py CHANGED Viewed

@@ -87,6 +87,22 @@ class VoiceRegistry:
         logger.info(f"Set voice '{voice_id}' for '{name}'")
         return True
+    def rename_voice(self, old_name: str, new_name: str, voice_id: str) -> bool:
+        """Rename an agent's registry entry and set a new voice.
+        Removes the old name entry and creates a new one.
+        If old_name == new_name, just updates the voice in place.
+        Returns True if the voice_id is valid, False otherwise.
+        """
+        if voice_id not in ALL_VOICE_IDS:
+            logger.warning(f"Invalid voice ID '{voice_id}' for rename '{old_name}' -> '{new_name}'")
+            return False
+        if old_name != new_name and old_name in self._registry:
+            del self._registry[old_name]
+        self._registry[new_name] = voice_id
+        logger.info(f"Renamed '{old_name}' -> '{new_name}' with voice '{voice_id}'")
+        return True
     def get_registry(self) -> dict[str, str]:
         """Return a copy of the current registry."""
         return dict(self._registry)