npm - voicesmith-mcp - Versions diffs - 1.0.16 → 1.0.18 - Mend

voicesmith-mcp 1.0.16 → 1.0.18

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (14) hide show

package/README.md +17 -10
package/config.py +4 -0
package/package.json +1 -1
package/server.py +65 -16
package/stt/__pycache__/mic_capture.cpython-314.pyc +0 -0
package/stt/mic_capture.py +27 -9
package/tts/__pycache__/audio_player.cpython-314.pyc +0 -0
package/tts/__pycache__/kokoro_engine.cpython-314.pyc +0 -0
package/tts/__pycache__/media_duck.cpython-314.pyc +0 -0
package/tts/__pycache__/speech_queue.cpython-314.pyc +0 -0
package/tts/audio_player.py +14 -20
package/tts/kokoro_engine.py +7 -0
package/tts/media_duck.py +62 -0
package/tts/speech_queue.py +7 -1

package/README.md CHANGED Viewed

@@ -39,7 +39,7 @@ What the AI does automatically:
 | Moment | What happens |
 |--------|-------------|
-| You give it a task | Speaks a brief acknowledgment |
+| You give it a task | Gets to work (speaks only when clarifying approach) |
 | It finishes work | Speaks a summary of what was done |
 | It has a question | Asks out loud, then listens for your voice response |
 | Voice tools unavailable | Falls back to text silently |
@@ -112,7 +112,8 @@ The MCP server runs as a local process alongside your IDE. It communicates over
 - **TTS**: Kokoro ONNX — fast neural TTS, 54 voices, no GPU needed
 - **STT**: faster-whisper — OpenAI Whisper running locally via CTranslate2
 - **VAD**: Silero VAD — voice activity detection for clean recordings
-- **Audio**: mpv for playback, sounddevice for recording
+- **Audio**: mpv for playback; recording via CoreAudio through a native app bundle on macOS (sounddevice fallback on Linux)
+- **Media ducking**: Auto-pauses Apple Music, Spotify, and browser audio during speech (macOS)
 ## Multi-Session
@@ -131,16 +132,24 @@ Config lives at `~/.local/share/voicesmith-mcp/config.json`. Key settings:
   "main_agent": "Eric",
   "tts": {
     "default_voice": "am_eric",
-    "audio_player": "mpv"
+    "audio_player": "mpv",
+    "duck_media": true
   },
   "stt": {
     "model_size": "base",
     "language": "en",
-    "vad_threshold": 0.3
+    "vad_threshold": 0.3,
+    "nudge_on_timeout": false
   }
 }
 ```
+| Setting | Description | Default |
+|---------|-------------|---------|
+| `tts.duck_media` | Auto-pause music/browser audio during speech (macOS) | `true` |
+| `stt.nudge_on_timeout` | Speak "I didn't catch that" when listen times out | `false` |
+| `stt.vad_threshold` | Voice detection sensitivity (lower = more sensitive) | `0.3` |
 Re-run `npx voicesmith-mcp install` to change your voice or update settings. Existing configuration is preserved — only new defaults are added.
 ## Requirements
@@ -166,16 +175,14 @@ Re-run `npx voicesmith-mcp install` to change your voice or update settings. Exi
 ### The AI can't hear me (listen returns empty or times out)
-**Check microphone permissions.** On macOS, the terminal app that runs your IDE needs microphone access:
+**Check microphone permissions.** On macOS, VoiceSmith uses a native app bundle (`VoiceSmithMCP.app`) for mic access. The first time it records, macOS should show a permission dialog for the app. If it didn't:
 1. Open **System Settings > Privacy & Security > Microphone**
-2. Make sure your terminal app is listed and enabled:
-   - **Warp**, **Terminal.app**, or **iTerm2** — for Claude Code
-   - **Cursor** or **VS Code** — if using those IDEs directly
-3. If the app isn't listed, the first `listen` call should trigger the permission prompt. Approve it and try again.
+2. Look for **VoiceSmithMCP** and make sure it's enabled
+3. If it's not listed, the LaunchAgent may not be running — try reinstalling: `npx voicesmith-mcp install`
 > [!IMPORTANT]
-> The Python process inherits microphone permissions from the app that launched it. If your terminal doesn't have mic access, listen will silently fail.
+> If the server detects silent audio (all zeros for ~800ms, or ~2.4s when the output device is Bluetooth), it returns an error pointing you to the microphone permission settings. This usually means macOS TCC denied mic access.
 **Check your audio input device.** If an external mic is selected but not connected, the server opens it but gets silence:
 - Open **System Settings > Sound > Input** and verify the correct mic is selected

package/config.py CHANGED Viewed

@@ -37,6 +37,7 @@ class STTConfig:
     silence_threshold: float = 1.5
     max_listen_timeout: float = 15
     vad_threshold: float = 0.3
+    nudge_on_timeout: bool = False
 @dataclass
@@ -117,6 +118,8 @@ def load_config(config_path: Optional[Path] = None) -> AppConfig:
                     config.stt.max_listen_timeout = float(stt["max_listen_timeout"])
                 if "vad_threshold" in stt:
                     config.stt.vad_threshold = float(stt["vad_threshold"])
+                if "nudge_on_timeout" in stt:
+                    config.stt.nudge_on_timeout = bool(stt["nudge_on_timeout"])
             # Top-level config
             if "main_agent" in data:
@@ -191,6 +194,7 @@ def save_config(config: AppConfig, config_path: Optional[Path] = None) -> None:
             "silence_threshold": config.stt.silence_threshold,
             "max_listen_timeout": config.stt.max_listen_timeout,
             "vad_threshold": config.stt.vad_threshold,
+            "nudge_on_timeout": config.stt.nudge_on_timeout,
         },
         "main_agent": config.main_agent,
         "last_voice_name": config.last_voice_name,

package/package.json CHANGED Viewed

@@ -1,6 +1,6 @@
 {
   "name": "voicesmith-mcp",
-  "version": "1.0.16",
+  "version": "1.0.18",
   "description": "Local AI voice for coding assistants — TTS & STT via MCP. Kokoro ONNX + faster-whisper, fully offline.",
   "bin": {
     "voicesmith-mcp": "bin/cli.js"

package/server.py CHANGED Viewed

@@ -42,6 +42,19 @@ from shared import (
 )
 from config import load_config, save_config, get_config_path, AppConfig
 from session_registry import register_session, rename_session, unregister_session
+from tts.media_duck import duck, unduck, is_bluetooth_output
+async def _deferred_unduck(paused_apps: list[str], delay: float = 0.3) -> None:
+    """Unduck after a brief delay so the MCP response reaches the client first.
+    On Bluetooth output, extends the delay to 3s to allow for the HFP → A2DP
+    codec switch that macOS performs when the microphone session ends.
+    """
+    if is_bluetooth_output():
+        delay = max(delay, 3.0)
+    await asyncio.sleep(delay)
+    unduck(paused_apps)
 logger = get_logger("server")
@@ -63,6 +76,7 @@ _config: AppConfig = None
 _muted = False
 _listen_cancel_event: asyncio.Event = None
 _listen_active = False
+_suppress_duck = False  # Set by speak_then_listen to prevent inner duck/unduck gaps
 _startup_time = time.time()
 _last_tool_call = time.time()  # Updated on every MCP tool call
 _session_info: dict = None
@@ -82,8 +96,8 @@ def _init_tts(config: AppConfig):
     try:
         _tts_engine = KokoroEngine(config.tts.model_path, config.tts.voices_path)
-        _audio_player = AudioPlayer(config.tts.audio_player, duck_media=config.tts.duck_media)
-        _speech_queue = SpeechQueue(_tts_engine, _audio_player)
+        _audio_player = AudioPlayer(config.tts.audio_player)
+        _speech_queue = SpeechQueue(_tts_engine, _audio_player, duck_media=config.tts.duck_media)
         logger.info("TTS subsystem initialized")
     except TTSEngineError as e:
         logger.error(f"TTS initialization failed: {e}")
@@ -453,25 +467,29 @@ async def listen(timeout: float = 15, prompt: str = "", silence_threshold: float
     if prompt:
         logger.info(f"Listening (prompt: {prompt})")
+    # Duck media while recording so the mic doesn't pick up playback
+    # Skip if speak_then_listen already holds the duck
+    paused_apps = duck() if (_config and _config.tts.duck_media and not _suppress_duck) else []
     try:
         loop = asyncio.get_running_loop()
-        # Play ready sound so the user knows to start speaking
-        # Skip for push-to-talk (HTTP) — it has its own beep
-        if prompt != "push-to-talk":
-            await loop.run_in_executor(None, _play_ready_sound)
         start = time.perf_counter()
         # Reset VAD state from any prior recording (LSTM hidden state + context)
         _vad.reset()
+        # Play the ready sound AFTER the mic is live (via on_ready callback)
+        # so the user doesn't start speaking into a dead mic.
+        ready_cb = _play_ready_sound if prompt != "push-to-talk" else None
         # Record audio with VAD
         audio = await _mic_capture.record(
             vad=_vad,
             timeout=timeout,
             silence_threshold=silence_threshold,
             cancel_event=_listen_cancel_event,
+            on_ready=ready_cb,
         )
         if _listen_cancel_event.is_set():
@@ -500,6 +518,8 @@ async def listen(timeout: float = 15, prompt: str = "", silence_threshold: float
         logger.error(f"listen failed: {e}")
         return {"success": False, "error": "listen_failed", "message": str(e)}
     finally:
+        if paused_apps:
+            asyncio.create_task(_deferred_unduck(paused_apps))
         _listen_active = False
         _listen_cancel_event = None
         # Reclaim mic for wake listener
@@ -524,19 +544,48 @@ async def speak_then_listen(
         timeout: Max seconds to wait for response (default 15).
         silence_threshold: Seconds of silence before stopping (default 1.5).
     """
-    speak_result = await speak(name, text, speed, block=True)
+    global _suppress_duck
-    if not speak_result.get("success"):
-        return {"speak": speak_result, "listen": {"success": False, "error": "skipped"}}
+    # Duck once for the entire speak+listen operation to avoid a
+    # brief unduck gap between speak finishing and listen starting.
+    should_duck = _config and _config.tts.duck_media
+    paused_apps = duck() if should_duck else []
-    listen_result = await listen(timeout=timeout, silence_threshold=silence_threshold)
+    # Suppress inner ducking in SpeechQueue and listen()
+    saved_queue_duck = _speech_queue._duck_media if _speech_queue else False
+    if _speech_queue and should_duck:
+        _speech_queue._duck_media = False
+    _suppress_duck = True
-    # If listen timed out, speak a nudge and fall back to text
-    if listen_result.get("error") == "timeout":
-        nudge_result = await speak(name, "I didn't catch that. Go ahead and type it.", speed, block=True)
-        listen_result["nudge_spoken"] = nudge_result.get("success", False)
+    try:
+        speak_result = await speak(name, text, speed, block=True)
+        if not speak_result.get("success"):
+            return {"speak": speak_result, "listen": {"success": False, "error": "skipped"}}
+        listen_result = await listen(timeout=timeout, silence_threshold=silence_threshold)
-    return {"speak": speak_result, "listen": listen_result}
+        # Optionally speak a nudge on timeout to prompt user to type instead
+        if (listen_result.get("error") == "timeout"
+                and _config and _config.stt.nudge_on_timeout
+                and _speech_queue):
+            nudge_text = "I didn't catch that. Go ahead and type it."
+            voice, _ = _registry.get_voice(name) if _registry else (None, False)
+            if voice and _tts_engine:
+                try:
+                    result = _tts_engine.synthesize(nudge_text, voice, speed)
+                    _audio_player.play(result.samples, result.sample_rate)
+                    listen_result["nudge_spoken"] = True
+                except Exception:
+                    pass
+        return {"speak": speak_result, "listen": listen_result}
+    finally:
+        _suppress_duck = False
+        if _speech_queue:
+            _speech_queue._duck_media = saved_queue_duck
+        if paused_apps:
+            asyncio.create_task(_deferred_unduck(paused_apps))
 @mcp.tool()

package/stt/__pycache__/mic_capture.cpython-314.pyc CHANGED Viewed

Binary file

package/stt/mic_capture.py CHANGED Viewed

@@ -8,18 +8,20 @@ import socket
 import subprocess
 import threading
 import time
-from typing import Optional
+from typing import Callable, Optional
 import numpy as np
 from shared import MicCaptureError, STT_SAMPLE_RATE, get_logger
 from stt.vad import VoiceActivityDetector
+from tts.media_duck import is_bluetooth_output
 logger = get_logger("stt.mic")
 _CHUNK_SAMPLES = 512        # Silero VAD requires exactly 512-sample chunks at 16kHz
 _CHUNK_BYTES   = _CHUNK_SAMPLES * 4   # float32 = 4 bytes/sample → 2048 bytes/chunk
-_ZERO_CHECK_CHUNKS = 10    # ~320ms of silence before detecting TCC denial
+_ZERO_CHECK_CHUNKS = 25    # ~800ms — exceeds CoreAudio cold-start latency (~544ms)
+_ZERO_CHECK_CHUNKS_BT = 75 # ~2.4s — Bluetooth A2DP→HFP codec switch can take 1-2s
 _AUDIO_SERVICE_SOCKET  = "/tmp/voicesmith-audio.sock"
 _LAUNCHAGENT_LABEL     = "com.voicesmith-mcp.audio"
@@ -92,6 +94,7 @@ class MicCapture:
         timeout: float = 15,
         silence_threshold: float = 1.5,
         cancel_event: Optional[asyncio.Event] = None,
+        on_ready: Optional[Callable[[], None]] = None,
     ) -> Optional[np.ndarray]:
         """Record audio from the microphone until silence is detected.
@@ -106,6 +109,9 @@ class MicCapture:
             timeout: Maximum seconds to wait for speech (default 15).
             silence_threshold: Seconds of silence before stopping (default 1.5).
             cancel_event: Optional asyncio.Event to cancel recording.
+            on_ready: Optional callback invoked once the mic is live and
+                      ready to capture.  Called after hardware warm-up /
+                      flush but before the VAD loop starts.
         Returns:
             Numpy array of recorded audio, or None if cancelled/timeout.
@@ -122,17 +128,17 @@ class MicCapture:
         if platform.system() == "Darwin":
             if _launchagent_available():
                 return await self._record_via_socket(
-                    vad, timeout, silence_threshold, cancel_event
+                    vad, timeout, silence_threshold, cancel_event, on_ready
                 )
             # Legacy: subprocess fallback for installs without the LaunchAgent.
             audio_capture_bin = _find_app_binary("audio-service") or _find_app_binary("audio-capture")
             if audio_capture_bin:
                 return await self._record_via_subprocess(
-                    audio_capture_bin, vad, timeout, silence_threshold, cancel_event
+                    audio_capture_bin, vad, timeout, silence_threshold, cancel_event, on_ready
                 )
         return await self._record_via_sounddevice(
-            vad, timeout, silence_threshold, cancel_event
+            vad, timeout, silence_threshold, cancel_event, on_ready
         )
     # ── LaunchAgent socket backend (macOS primary) ─────────────────────────────
@@ -143,6 +149,7 @@ class MicCapture:
         timeout: float,
         silence_threshold: float,
         cancel_event: Optional[asyncio.Event],
+        on_ready: Optional[Callable[[], None]] = None,
     ) -> Optional[np.ndarray]:
         """Record via the VoiceSmithMCP audio LaunchAgent (Unix socket).
@@ -190,7 +197,10 @@ class MicCapture:
         logger.info("Microphone recording started (audio-service socket)")
         try:
-            self._flush_queue(int(0.2 * self._sample_rate / _CHUNK_SAMPLES))
+            # Flush 2 chunks (~64ms) for AudioQueue hardware settle.
+            self._flush_queue(2)
+            if on_ready:
+                on_ready()
             return await self._run_vad_loop(vad, timeout, silence_threshold, cancel_event)
         finally:
             sock.close()  # signals service to stop sending for this session
@@ -206,6 +216,7 @@ class MicCapture:
         timeout: float,
         silence_threshold: float,
         cancel_event: Optional[asyncio.Event],
+        on_ready: Optional[Callable[[], None]] = None,
     ) -> Optional[np.ndarray]:
         """Record using a CoreAudio binary inside VoiceSmithMCP.app (legacy)."""
         self._recording = True
@@ -239,7 +250,9 @@ class MicCapture:
         reader_thread.start()
         try:
-            self._flush_queue(int(0.2 * self._sample_rate / _CHUNK_SAMPLES))
+            self._flush_queue(2)
+            if on_ready:
+                on_ready()
             return await self._run_vad_loop(vad, timeout, silence_threshold, cancel_event)
         finally:
             proc.terminate()
@@ -258,6 +271,7 @@ class MicCapture:
         timeout: float,
         silence_threshold: float,
         cancel_event: Optional[asyncio.Event],
+        on_ready: Optional[Callable[[], None]] = None,
     ) -> Optional[np.ndarray]:
         """Record using sounddevice / PortAudio (fallback for non-macOS)."""
         try:
@@ -281,7 +295,9 @@ class MicCapture:
             stream.start()
             logger.info("Microphone recording started (sounddevice)")
-            self._flush_queue(int(0.2 * self._sample_rate / _CHUNK_SAMPLES), chunk_timeout=0.1)
+            self._flush_queue(2, chunk_timeout=0.1)
+            if on_ready:
+                on_ready()
             return await self._run_vad_loop(vad, timeout, silence_threshold, cancel_event)
         except MicCaptureError:
             raise
@@ -330,6 +346,8 @@ class MicCapture:
         speech_detected = False
         silence_duration = 0.0
         zero_check_done = False
+        # Bluetooth A2DP→HFP switch delivers zeros for up to ~2s
+        zero_threshold = _ZERO_CHECK_CHUNKS_BT if is_bluetooth_output() else _ZERO_CHECK_CHUNKS
         start_time = loop.time()
         while not self._stop_flag:
@@ -354,7 +372,7 @@ class MicCapture:
             chunks.append(chunk)
-            if not zero_check_done and len(chunks) >= _ZERO_CHECK_CHUNKS:
+            if not zero_check_done and len(chunks) >= zero_threshold:
                 zero_check_done = True
                 if all(np.max(np.abs(c)) == 0.0 for c in chunks):
                     raise MicCaptureError(self._zero_audio_message())

package/tts/__pycache__/audio_player.cpython-314.pyc CHANGED Viewed

Binary file

package/tts/__pycache__/kokoro_engine.cpython-314.pyc CHANGED Viewed

Binary file

package/tts/__pycache__/media_duck.cpython-314.pyc CHANGED Viewed

Binary file

package/tts/__pycache__/speech_queue.cpython-314.pyc CHANGED Viewed

Binary file

package/tts/audio_player.py CHANGED Viewed

@@ -10,7 +10,6 @@ import time
 import soundfile as sf
 from shared import PlaybackResult, AudioPlayerError, AUDIO_LOCK_PATH, get_logger
-from tts.media_duck import duck, unduck
 logger = get_logger("tts.audio_player")
@@ -18,9 +17,8 @@ logger = get_logger("tts.audio_player")
 class AudioPlayer:
     """Plays audio samples through an external player process."""
-    def __init__(self, player_command: str = "mpv", duck_media: bool = False) -> None:
+    def __init__(self, player_command: str = "mpv") -> None:
         self._player_command = player_command
-        self._duck_media = duck_media
         self._process: subprocess.Popen | None = None
         # Detect platform fallback if player_command is not available
@@ -84,23 +82,19 @@ class AudioPlayer:
             # Cross-session audio lock: prevents overlapping playback
             # flock is kernel-managed — auto-released on crash, no stale locks
-            paused_apps = duck() if self._duck_media else []
-            try:
-                with open(AUDIO_LOCK_PATH, "w") as lock_file:
-                    fcntl.flock(lock_file, fcntl.LOCK_EX)
-                    start = time.perf_counter()
-                    self._process = subprocess.Popen(
-                        cmd,
-                        stdout=subprocess.DEVNULL,
-                        stderr=subprocess.DEVNULL,
-                    )
-                    self._process.wait()
-                    duration_ms = (time.perf_counter() - start) * 1000
-                # Lock released when lock_file closes
-            finally:
-                unduck(paused_apps)
+            with open(AUDIO_LOCK_PATH, "w") as lock_file:
+                fcntl.flock(lock_file, fcntl.LOCK_EX)
+                start = time.perf_counter()
+                self._process = subprocess.Popen(
+                    cmd,
+                    stdout=subprocess.DEVNULL,
+                    stderr=subprocess.DEVNULL,
+                )
+                self._process.wait()
+                duration_ms = (time.perf_counter() - start) * 1000
+            # Lock released when lock_file closes
             if self._process.returncode != 0:
                 return PlaybackResult(

package/tts/kokoro_engine.py CHANGED Viewed

@@ -2,6 +2,8 @@
 import time
+import numpy as np
 from shared import SynthesisResult, TTSEngineError, ALL_VOICE_IDS, SAMPLE_RATE, get_logger
 logger = get_logger("tts.kokoro")
@@ -47,6 +49,11 @@ class KokoroEngine:
             samples, sample_rate = self._model.create(text, voice=voice_id, speed=speed)
             synthesis_ms = (time.perf_counter() - start) * 1000
+            # Pad 100ms silence — kokoro-onnx trim() snaps to 512-sample hops
+            # (~21ms at 24kHz) which can clip the trailing edge of the last phoneme.
+            pad = int(sample_rate * 0.10)
+            samples = np.concatenate([samples, np.zeros(pad, dtype=samples.dtype)])
             duration_ms = (len(samples) / sample_rate) * 1000
             return SynthesisResult(

package/tts/media_duck.py CHANGED Viewed

@@ -14,6 +14,8 @@ Usage:
     unduck(paused)         # resume only what we paused
 """
+import ctypes
+import ctypes.util
 import platform
 import subprocess
@@ -89,6 +91,66 @@ tell application "{target}"
 end tell"""
+# ── Bluetooth detection (macOS CoreAudio) ────────────────────────────────────
+def is_bluetooth_output() -> bool:
+    """Return True if the default audio output is a Bluetooth device.
+    Uses CoreAudio's AudioObjectGetPropertyData to check the transport type
+    of the default output device.  Returns False on non-macOS or on error.
+    """
+    if platform.system() != "Darwin":
+        return False
+    try:
+        lib_path = ctypes.util.find_library("CoreAudio")
+        if not lib_path:
+            return False
+        ca = ctypes.cdll.LoadLibrary(lib_path)
+        class _AudioObjectPropertyAddress(ctypes.Structure):
+            _fields_ = [
+                ("mSelector", ctypes.c_uint32),
+                ("mScope", ctypes.c_uint32),
+                ("mElement", ctypes.c_uint32),
+            ]
+        # CoreAudio FourCC constants
+        _SYS_OBJ   = 1                                            # kAudioObjectSystemObject
+        _SCOPE_G   = int.from_bytes(b"glob", "big")               # kAudioObjectPropertyScopeGlobal
+        _ELEM_M    = 0                                             # kAudioObjectPropertyElementMain
+        _DEF_OUT   = int.from_bytes(b"dOut", "big")                # kAudioHardwarePropertyDefaultOutputDevice
+        _TRANS     = int.from_bytes(b"tran", "big")                # kAudioDevicePropertyTransportType
+        _BT        = int.from_bytes(b"blue", "big")                # kAudioDeviceTransportTypeBluetooth
+        _BT_LE     = int.from_bytes(b"blea", "big")                # kAudioDeviceTransportTypeBluetoothLE
+        # Get default output device ID
+        addr = _AudioObjectPropertyAddress(_DEF_OUT, _SCOPE_G, _ELEM_M)
+        device_id = ctypes.c_uint32(0)
+        size = ctypes.c_uint32(4)
+        err = ca.AudioObjectGetPropertyData(
+            _SYS_OBJ, ctypes.byref(addr), 0, None,
+            ctypes.byref(size), ctypes.byref(device_id),
+        )
+        if err != 0:
+            return False
+        # Get transport type of that device
+        addr.mSelector = _TRANS
+        transport = ctypes.c_uint32(0)
+        size = ctypes.c_uint32(4)
+        err = ca.AudioObjectGetPropertyData(
+            device_id.value, ctypes.byref(addr), 0, None,
+            ctypes.byref(size), ctypes.byref(transport),
+        )
+        if err != 0:
+            return False
+        return transport.value in (_BT, _BT_LE)
+    except Exception:
+        return False
 # ── Public API ────────────────────────────────────────────────────────────────
 def duck() -> list[str]:

package/tts/speech_queue.py CHANGED Viewed

@@ -6,6 +6,7 @@ import time
 from shared import SpeakResult, MAX_CHUNK_LENGTH, get_logger
 from tts.kokoro_engine import KokoroEngine
 from tts.audio_player import AudioPlayer
+from tts.media_duck import duck, unduck
 logger = get_logger("tts.speech_queue")
@@ -13,9 +14,10 @@ logger = get_logger("tts.speech_queue")
 class SpeechQueue:
     """Manages sequential speech synthesis and playback."""
-    def __init__(self, engine: KokoroEngine, player: AudioPlayer) -> None:
+    def __init__(self, engine: KokoroEngine, player: AudioPlayer, duck_media: bool = False) -> None:
         self._engine = engine
         self._player = player
+        self._duck_media = duck_media
         self._queue: asyncio.Queue = asyncio.Queue()
         self._speaking = False
@@ -60,6 +62,9 @@ class SpeechQueue:
         total_duration_ms = 0.0
         total_synthesis_ms = 0.0
+        # Duck media for the entire utterance, not per-chunk
+        paused_apps = duck() if self._duck_media else []
         try:
             chunks = self.chunk_text(text)
@@ -105,6 +110,7 @@ class SpeechQueue:
                 error=str(e),
             )
         finally:
+            unduck(paused_apps)
             self._speaking = False
     def stop(self) -> bool: