voicesmith-mcp 1.0.16 → 1.0.18

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
package/README.md CHANGED
@@ -39,7 +39,7 @@ What the AI does automatically:
39
39
 
40
40
  | Moment | What happens |
41
41
  |--------|-------------|
42
- | You give it a task | Speaks a brief acknowledgment |
42
+ | You give it a task | Gets to work (speaks only when clarifying approach) |
43
43
  | It finishes work | Speaks a summary of what was done |
44
44
  | It has a question | Asks out loud, then listens for your voice response |
45
45
  | Voice tools unavailable | Falls back to text silently |
@@ -112,7 +112,8 @@ The MCP server runs as a local process alongside your IDE. It communicates over
112
112
  - **TTS**: Kokoro ONNX — fast neural TTS, 54 voices, no GPU needed
113
113
  - **STT**: faster-whisper — OpenAI Whisper running locally via CTranslate2
114
114
  - **VAD**: Silero VAD — voice activity detection for clean recordings
115
- - **Audio**: mpv for playback, sounddevice for recording
115
+ - **Audio**: mpv for playback; CoreAudio via native app bundle on macOS (sounddevice fallback on Linux)
116
+ - **Media ducking**: Auto-pauses Apple Music, Spotify, and browser audio during speech (macOS)
116
117
 
117
118
  ## Multi-Session
118
119
 
@@ -131,16 +132,24 @@ Config lives at `~/.local/share/voicesmith-mcp/config.json`. Key settings:
131
132
  "main_agent": "Eric",
132
133
  "tts": {
133
134
  "default_voice": "am_eric",
134
- "audio_player": "mpv"
135
+ "audio_player": "mpv",
136
+ "duck_media": true
135
137
  },
136
138
  "stt": {
137
139
  "model_size": "base",
138
140
  "language": "en",
139
- "vad_threshold": 0.3
141
+ "vad_threshold": 0.3,
142
+ "nudge_on_timeout": false
140
143
  }
141
144
  }
142
145
  ```
143
146
 
147
+ | Setting | Description | Default |
148
+ |---------|-------------|---------|
149
+ | `tts.duck_media` | Auto-pause music/browser audio during speech (macOS) | `true` |
150
+ | `stt.nudge_on_timeout` | Speak "I didn't catch that" when listen times out | `false` |
151
+ | `stt.vad_threshold` | Voice detection sensitivity (lower = more sensitive) | `0.3` |
152
+
144
153
  Re-run `npx voicesmith-mcp install` to change your voice or update settings. Existing configuration is preserved — only new defaults are added.
145
154
 
146
155
  ## Requirements
@@ -166,16 +175,14 @@ Re-run `npx voicesmith-mcp install` to change your voice or update settings. Exi
166
175
 
167
176
  ### The AI can't hear me (listen returns empty or times out)
168
177
 
169
- **Check microphone permissions.** On macOS, the terminal app that runs your IDE needs microphone access:
178
+ **Check microphone permissions.** On macOS, VoiceSmith uses a native app bundle (`VoiceSmithMCP.app`) for mic access. The first time it records, macOS should show a permission dialog for the app. If it didn't:
170
179
 
171
180
  1. Open **System Settings > Privacy & Security > Microphone**
172
- 2. Make sure your terminal app is listed and enabled:
173
- - **Warp**, **Terminal.app**, or **iTerm2** — for Claude Code
174
- - **Cursor** or **VS Code** — if using those IDEs directly
175
- 3. If the app isn't listed, the first `listen` call should trigger the permission prompt. Approve it and try again.
181
+ 2. Look for **VoiceSmithMCP** and make sure it's enabled
182
+ 3. If it's not listed, the LaunchAgent may not be running — try reinstalling: `npx voicesmith-mcp install`
176
183
 
177
184
  > [!IMPORTANT]
178
- > The Python process inherits microphone permissions from the app that launched it. If your terminal doesn't have mic access, listen will silently fail.
185
+ > If the server detects silent audio (all zeros for ~800ms, or ~2.4s on Bluetooth output), it returns an error pointing you to the microphone permission settings. This usually means macOS TCC denied mic access.
179
186
 
180
187
  **Check your audio input device.** If an external mic is selected but not connected, the server opens it but gets silence:
181
188
  - Open **System Settings > Sound > Input** and verify the correct mic is selected
package/config.py CHANGED
@@ -37,6 +37,7 @@ class STTConfig:
37
37
  silence_threshold: float = 1.5
38
38
  max_listen_timeout: float = 15
39
39
  vad_threshold: float = 0.3
40
+ nudge_on_timeout: bool = False
40
41
 
41
42
 
42
43
  @dataclass
@@ -117,6 +118,8 @@ def load_config(config_path: Optional[Path] = None) -> AppConfig:
117
118
  config.stt.max_listen_timeout = float(stt["max_listen_timeout"])
118
119
  if "vad_threshold" in stt:
119
120
  config.stt.vad_threshold = float(stt["vad_threshold"])
121
+ if "nudge_on_timeout" in stt:
122
+ config.stt.nudge_on_timeout = bool(stt["nudge_on_timeout"])
120
123
 
121
124
  # Top-level config
122
125
  if "main_agent" in data:
@@ -191,6 +194,7 @@ def save_config(config: AppConfig, config_path: Optional[Path] = None) -> None:
191
194
  "silence_threshold": config.stt.silence_threshold,
192
195
  "max_listen_timeout": config.stt.max_listen_timeout,
193
196
  "vad_threshold": config.stt.vad_threshold,
197
+ "nudge_on_timeout": config.stt.nudge_on_timeout,
194
198
  },
195
199
  "main_agent": config.main_agent,
196
200
  "last_voice_name": config.last_voice_name,
package/package.json CHANGED
@@ -1,6 +1,6 @@
1
1
  {
2
2
  "name": "voicesmith-mcp",
3
- "version": "1.0.16",
3
+ "version": "1.0.18",
4
4
  "description": "Local AI voice for coding assistants — TTS & STT via MCP. Kokoro ONNX + faster-whisper, fully offline.",
5
5
  "bin": {
6
6
  "voicesmith-mcp": "bin/cli.js"
package/server.py CHANGED
@@ -42,6 +42,19 @@ from shared import (
42
42
  )
43
43
  from config import load_config, save_config, get_config_path, AppConfig
44
44
  from session_registry import register_session, rename_session, unregister_session
45
+ from tts.media_duck import duck, unduck, is_bluetooth_output
46
+
47
+
48
+ async def _deferred_unduck(paused_apps: list[str], delay: float = 0.3) -> None:
49
+ """Unduck after a brief delay so the MCP response reaches the client first.
50
+
51
+ On Bluetooth output, extends the delay to 3s to allow for the HFP → A2DP
52
+ codec switch that macOS performs when the microphone session ends.
53
+ """
54
+ if is_bluetooth_output():
55
+ delay = max(delay, 3.0)
56
+ await asyncio.sleep(delay)
57
+ unduck(paused_apps)
45
58
 
46
59
  logger = get_logger("server")
47
60
 
@@ -63,6 +76,7 @@ _config: AppConfig = None
63
76
  _muted = False
64
77
  _listen_cancel_event: asyncio.Event = None
65
78
  _listen_active = False
79
+ _suppress_duck = False # Set by speak_then_listen to prevent inner duck/unduck gaps
66
80
  _startup_time = time.time()
67
81
  _last_tool_call = time.time() # Updated on every MCP tool call
68
82
  _session_info: dict = None
@@ -82,8 +96,8 @@ def _init_tts(config: AppConfig):
82
96
 
83
97
  try:
84
98
  _tts_engine = KokoroEngine(config.tts.model_path, config.tts.voices_path)
85
- _audio_player = AudioPlayer(config.tts.audio_player, duck_media=config.tts.duck_media)
86
- _speech_queue = SpeechQueue(_tts_engine, _audio_player)
99
+ _audio_player = AudioPlayer(config.tts.audio_player)
100
+ _speech_queue = SpeechQueue(_tts_engine, _audio_player, duck_media=config.tts.duck_media)
87
101
  logger.info("TTS subsystem initialized")
88
102
  except TTSEngineError as e:
89
103
  logger.error(f"TTS initialization failed: {e}")
@@ -453,25 +467,29 @@ async def listen(timeout: float = 15, prompt: str = "", silence_threshold: float
453
467
  if prompt:
454
468
  logger.info(f"Listening (prompt: {prompt})")
455
469
 
470
+ # Duck media while recording so the mic doesn't pick up playback
471
+ # Skip if speak_then_listen already holds the duck
472
+ paused_apps = duck() if (_config and _config.tts.duck_media and not _suppress_duck) else []
473
+
456
474
  try:
457
475
  loop = asyncio.get_running_loop()
458
476
 
459
- # Play ready sound so the user knows to start speaking
460
- # Skip for push-to-talk (HTTP) — it has its own beep
461
- if prompt != "push-to-talk":
462
- await loop.run_in_executor(None, _play_ready_sound)
463
-
464
477
  start = time.perf_counter()
465
478
 
466
479
  # Reset VAD state from any prior recording (LSTM hidden state + context)
467
480
  _vad.reset()
468
481
 
482
+ # Play the ready sound AFTER the mic is live (via on_ready callback)
483
+ # so the user doesn't start speaking into a dead mic.
484
+ ready_cb = _play_ready_sound if prompt != "push-to-talk" else None
485
+
469
486
  # Record audio with VAD
470
487
  audio = await _mic_capture.record(
471
488
  vad=_vad,
472
489
  timeout=timeout,
473
490
  silence_threshold=silence_threshold,
474
491
  cancel_event=_listen_cancel_event,
492
+ on_ready=ready_cb,
475
493
  )
476
494
 
477
495
  if _listen_cancel_event.is_set():
@@ -500,6 +518,8 @@ async def listen(timeout: float = 15, prompt: str = "", silence_threshold: float
500
518
  logger.error(f"listen failed: {e}")
501
519
  return {"success": False, "error": "listen_failed", "message": str(e)}
502
520
  finally:
521
+ if paused_apps:
522
+ asyncio.create_task(_deferred_unduck(paused_apps))
503
523
  _listen_active = False
504
524
  _listen_cancel_event = None
505
525
  # Reclaim mic for wake listener
@@ -524,19 +544,48 @@ async def speak_then_listen(
524
544
  timeout: Max seconds to wait for response (default 15).
525
545
  silence_threshold: Seconds of silence before stopping (default 1.5).
526
546
  """
527
- speak_result = await speak(name, text, speed, block=True)
547
+ global _suppress_duck
528
548
 
529
- if not speak_result.get("success"):
530
- return {"speak": speak_result, "listen": {"success": False, "error": "skipped"}}
549
+ # Duck once for the entire speak+listen operation to avoid a
550
+ # brief unduck gap between speak finishing and listen starting.
551
+ should_duck = _config and _config.tts.duck_media
552
+ paused_apps = duck() if should_duck else []
531
553
 
532
- listen_result = await listen(timeout=timeout, silence_threshold=silence_threshold)
554
+ # Suppress inner ducking in SpeechQueue and listen()
555
+ saved_queue_duck = _speech_queue._duck_media if _speech_queue else False
556
+ if _speech_queue and should_duck:
557
+ _speech_queue._duck_media = False
558
+ _suppress_duck = True
533
559
 
534
- # If listen timed out, speak a nudge and fall back to text
535
- if listen_result.get("error") == "timeout":
536
- nudge_result = await speak(name, "I didn't catch that. Go ahead and type it.", speed, block=True)
537
- listen_result["nudge_spoken"] = nudge_result.get("success", False)
560
+ try:
561
+ speak_result = await speak(name, text, speed, block=True)
562
+
563
+ if not speak_result.get("success"):
564
+ return {"speak": speak_result, "listen": {"success": False, "error": "skipped"}}
565
+
566
+ listen_result = await listen(timeout=timeout, silence_threshold=silence_threshold)
538
567
 
539
- return {"speak": speak_result, "listen": listen_result}
568
+ # Optionally speak a nudge on timeout to prompt user to type instead
569
+ if (listen_result.get("error") == "timeout"
570
+ and _config and _config.stt.nudge_on_timeout
571
+ and _speech_queue):
572
+ nudge_text = "I didn't catch that. Go ahead and type it."
573
+ voice, _ = _registry.get_voice(name) if _registry else (None, False)
574
+ if voice and _tts_engine:
575
+ try:
576
+ result = _tts_engine.synthesize(nudge_text, voice, speed)
577
+ _audio_player.play(result.samples, result.sample_rate)
578
+ listen_result["nudge_spoken"] = True
579
+ except Exception:
580
+ pass
581
+
582
+ return {"speak": speak_result, "listen": listen_result}
583
+ finally:
584
+ _suppress_duck = False
585
+ if _speech_queue:
586
+ _speech_queue._duck_media = saved_queue_duck
587
+ if paused_apps:
588
+ asyncio.create_task(_deferred_unduck(paused_apps))
540
589
 
541
590
 
542
591
  @mcp.tool()
@@ -8,18 +8,20 @@ import socket
8
8
  import subprocess
9
9
  import threading
10
10
  import time
11
- from typing import Optional
11
+ from typing import Callable, Optional
12
12
 
13
13
  import numpy as np
14
14
 
15
15
  from shared import MicCaptureError, STT_SAMPLE_RATE, get_logger
16
16
  from stt.vad import VoiceActivityDetector
17
+ from tts.media_duck import is_bluetooth_output
17
18
 
18
19
  logger = get_logger("stt.mic")
19
20
 
20
21
  _CHUNK_SAMPLES = 512 # Silero VAD requires exactly 512-sample chunks at 16kHz
21
22
  _CHUNK_BYTES = _CHUNK_SAMPLES * 4 # float32 = 4 bytes/sample → 2048 bytes/chunk
22
- _ZERO_CHECK_CHUNKS = 10 # ~320ms of silence before detecting TCC denial
23
+ _ZERO_CHECK_CHUNKS = 25 # ~800ms — exceeds CoreAudio cold-start latency (~544ms)
24
+ _ZERO_CHECK_CHUNKS_BT = 75 # ~2.4s — Bluetooth A2DP→HFP codec switch can take 1-2s
23
25
 
24
26
  _AUDIO_SERVICE_SOCKET = "/tmp/voicesmith-audio.sock"
25
27
  _LAUNCHAGENT_LABEL = "com.voicesmith-mcp.audio"
@@ -92,6 +94,7 @@ class MicCapture:
92
94
  timeout: float = 15,
93
95
  silence_threshold: float = 1.5,
94
96
  cancel_event: Optional[asyncio.Event] = None,
97
+ on_ready: Optional[Callable[[], None]] = None,
95
98
  ) -> Optional[np.ndarray]:
96
99
  """Record audio from the microphone until silence is detected.
97
100
 
@@ -106,6 +109,9 @@ class MicCapture:
106
109
  timeout: Maximum seconds to wait for speech (default 15).
107
110
  silence_threshold: Seconds of silence before stopping (default 1.5).
108
111
  cancel_event: Optional asyncio.Event to cancel recording.
112
+ on_ready: Optional callback invoked once the mic is live and
113
+ ready to capture. Called after hardware warm-up /
114
+ flush but before the VAD loop starts.
109
115
 
110
116
  Returns:
111
117
  Numpy array of recorded audio, or None if cancelled/timeout.
@@ -122,17 +128,17 @@ class MicCapture:
122
128
  if platform.system() == "Darwin":
123
129
  if _launchagent_available():
124
130
  return await self._record_via_socket(
125
- vad, timeout, silence_threshold, cancel_event
131
+ vad, timeout, silence_threshold, cancel_event, on_ready
126
132
  )
127
133
  # Legacy: subprocess fallback for installs without the LaunchAgent.
128
134
  audio_capture_bin = _find_app_binary("audio-service") or _find_app_binary("audio-capture")
129
135
  if audio_capture_bin:
130
136
  return await self._record_via_subprocess(
131
- audio_capture_bin, vad, timeout, silence_threshold, cancel_event
137
+ audio_capture_bin, vad, timeout, silence_threshold, cancel_event, on_ready
132
138
  )
133
139
 
134
140
  return await self._record_via_sounddevice(
135
- vad, timeout, silence_threshold, cancel_event
141
+ vad, timeout, silence_threshold, cancel_event, on_ready
136
142
  )
137
143
 
138
144
  # ── LaunchAgent socket backend (macOS primary) ─────────────────────────────
@@ -143,6 +149,7 @@ class MicCapture:
143
149
  timeout: float,
144
150
  silence_threshold: float,
145
151
  cancel_event: Optional[asyncio.Event],
152
+ on_ready: Optional[Callable[[], None]] = None,
146
153
  ) -> Optional[np.ndarray]:
147
154
  """Record via the VoiceSmithMCP audio LaunchAgent (Unix socket).
148
155
 
@@ -190,7 +197,10 @@ class MicCapture:
190
197
  logger.info("Microphone recording started (audio-service socket)")
191
198
 
192
199
  try:
193
- self._flush_queue(int(0.2 * self._sample_rate / _CHUNK_SAMPLES))
200
+ # Flush 2 chunks (~64ms) for AudioQueue hardware settle.
201
+ self._flush_queue(2)
202
+ if on_ready:
203
+ on_ready()
194
204
  return await self._run_vad_loop(vad, timeout, silence_threshold, cancel_event)
195
205
  finally:
196
206
  sock.close() # signals service to stop sending for this session
@@ -206,6 +216,7 @@ class MicCapture:
206
216
  timeout: float,
207
217
  silence_threshold: float,
208
218
  cancel_event: Optional[asyncio.Event],
219
+ on_ready: Optional[Callable[[], None]] = None,
209
220
  ) -> Optional[np.ndarray]:
210
221
  """Record using a CoreAudio binary inside VoiceSmithMCP.app (legacy)."""
211
222
  self._recording = True
@@ -239,7 +250,9 @@ class MicCapture:
239
250
  reader_thread.start()
240
251
 
241
252
  try:
242
- self._flush_queue(int(0.2 * self._sample_rate / _CHUNK_SAMPLES))
253
+ self._flush_queue(2)
254
+ if on_ready:
255
+ on_ready()
243
256
  return await self._run_vad_loop(vad, timeout, silence_threshold, cancel_event)
244
257
  finally:
245
258
  proc.terminate()
@@ -258,6 +271,7 @@ class MicCapture:
258
271
  timeout: float,
259
272
  silence_threshold: float,
260
273
  cancel_event: Optional[asyncio.Event],
274
+ on_ready: Optional[Callable[[], None]] = None,
261
275
  ) -> Optional[np.ndarray]:
262
276
  """Record using sounddevice / PortAudio (fallback for non-macOS)."""
263
277
  try:
@@ -281,7 +295,9 @@ class MicCapture:
281
295
  stream.start()
282
296
  logger.info("Microphone recording started (sounddevice)")
283
297
 
284
- self._flush_queue(int(0.2 * self._sample_rate / _CHUNK_SAMPLES), chunk_timeout=0.1)
298
+ self._flush_queue(2, chunk_timeout=0.1)
299
+ if on_ready:
300
+ on_ready()
285
301
  return await self._run_vad_loop(vad, timeout, silence_threshold, cancel_event)
286
302
  except MicCaptureError:
287
303
  raise
@@ -330,6 +346,8 @@ class MicCapture:
330
346
  speech_detected = False
331
347
  silence_duration = 0.0
332
348
  zero_check_done = False
349
+ # Bluetooth A2DP→HFP switch delivers zeros for up to ~2s
350
+ zero_threshold = _ZERO_CHECK_CHUNKS_BT if is_bluetooth_output() else _ZERO_CHECK_CHUNKS
333
351
  start_time = loop.time()
334
352
 
335
353
  while not self._stop_flag:
@@ -354,7 +372,7 @@ class MicCapture:
354
372
 
355
373
  chunks.append(chunk)
356
374
 
357
- if not zero_check_done and len(chunks) >= _ZERO_CHECK_CHUNKS:
375
+ if not zero_check_done and len(chunks) >= zero_threshold:
358
376
  zero_check_done = True
359
377
  if all(np.max(np.abs(c)) == 0.0 for c in chunks):
360
378
  raise MicCaptureError(self._zero_audio_message())
@@ -10,7 +10,6 @@ import time
10
10
  import soundfile as sf
11
11
 
12
12
  from shared import PlaybackResult, AudioPlayerError, AUDIO_LOCK_PATH, get_logger
13
- from tts.media_duck import duck, unduck
14
13
 
15
14
  logger = get_logger("tts.audio_player")
16
15
 
@@ -18,9 +17,8 @@ logger = get_logger("tts.audio_player")
18
17
  class AudioPlayer:
19
18
  """Plays audio samples through an external player process."""
20
19
 
21
- def __init__(self, player_command: str = "mpv", duck_media: bool = False) -> None:
20
+ def __init__(self, player_command: str = "mpv") -> None:
22
21
  self._player_command = player_command
23
- self._duck_media = duck_media
24
22
  self._process: subprocess.Popen | None = None
25
23
 
26
24
  # Detect platform fallback if player_command is not available
@@ -84,23 +82,19 @@ class AudioPlayer:
84
82
 
85
83
  # Cross-session audio lock: prevents overlapping playback
86
84
  # flock is kernel-managed — auto-released on crash, no stale locks
87
- paused_apps = duck() if self._duck_media else []
88
- try:
89
- with open(AUDIO_LOCK_PATH, "w") as lock_file:
90
- fcntl.flock(lock_file, fcntl.LOCK_EX)
91
-
92
- start = time.perf_counter()
93
- self._process = subprocess.Popen(
94
- cmd,
95
- stdout=subprocess.DEVNULL,
96
- stderr=subprocess.DEVNULL,
97
- )
98
- self._process.wait()
99
- duration_ms = (time.perf_counter() - start) * 1000
100
-
101
- # Lock released when lock_file closes
102
- finally:
103
- unduck(paused_apps)
85
+ with open(AUDIO_LOCK_PATH, "w") as lock_file:
86
+ fcntl.flock(lock_file, fcntl.LOCK_EX)
87
+
88
+ start = time.perf_counter()
89
+ self._process = subprocess.Popen(
90
+ cmd,
91
+ stdout=subprocess.DEVNULL,
92
+ stderr=subprocess.DEVNULL,
93
+ )
94
+ self._process.wait()
95
+ duration_ms = (time.perf_counter() - start) * 1000
96
+
97
+ # Lock released when lock_file closes
104
98
 
105
99
  if self._process.returncode != 0:
106
100
  return PlaybackResult(
@@ -2,6 +2,8 @@
2
2
 
3
3
  import time
4
4
 
5
+ import numpy as np
6
+
5
7
  from shared import SynthesisResult, TTSEngineError, ALL_VOICE_IDS, SAMPLE_RATE, get_logger
6
8
 
7
9
  logger = get_logger("tts.kokoro")
@@ -47,6 +49,11 @@ class KokoroEngine:
47
49
  samples, sample_rate = self._model.create(text, voice=voice_id, speed=speed)
48
50
  synthesis_ms = (time.perf_counter() - start) * 1000
49
51
 
52
+ # Pad 100ms silence — kokoro-onnx trim() snaps to 512-sample hops
53
+ # (~21ms at 24kHz) which can clip the trailing edge of the last phoneme.
54
+ pad = int(sample_rate * 0.10)
55
+ samples = np.concatenate([samples, np.zeros(pad, dtype=samples.dtype)])
56
+
50
57
  duration_ms = (len(samples) / sample_rate) * 1000
51
58
 
52
59
  return SynthesisResult(
package/tts/media_duck.py CHANGED
@@ -14,6 +14,8 @@ Usage:
14
14
  unduck(paused) # resume only what we paused
15
15
  """
16
16
 
17
+ import ctypes
18
+ import ctypes.util
17
19
  import platform
18
20
  import subprocess
19
21
 
@@ -89,6 +91,66 @@ tell application "{target}"
89
91
  end tell"""
90
92
 
91
93
 
94
+ # ── Bluetooth detection (macOS CoreAudio) ────────────────────────────────────
95
+
96
+ def is_bluetooth_output() -> bool:
97
+ """Return True if the default audio output is a Bluetooth device.
98
+
99
+ Uses CoreAudio's AudioObjectGetPropertyData to check the transport type
100
+ of the default output device. Returns False on non-macOS or on error.
101
+ """
102
+ if platform.system() != "Darwin":
103
+ return False
104
+
105
+ try:
106
+ lib_path = ctypes.util.find_library("CoreAudio")
107
+ if not lib_path:
108
+ return False
109
+ ca = ctypes.cdll.LoadLibrary(lib_path)
110
+
111
+ class _AudioObjectPropertyAddress(ctypes.Structure):
112
+ _fields_ = [
113
+ ("mSelector", ctypes.c_uint32),
114
+ ("mScope", ctypes.c_uint32),
115
+ ("mElement", ctypes.c_uint32),
116
+ ]
117
+
118
+ # CoreAudio FourCC constants
119
+ _SYS_OBJ = 1 # kAudioObjectSystemObject
120
+ _SCOPE_G = int.from_bytes(b"glob", "big") # kAudioObjectPropertyScopeGlobal
121
+ _ELEM_M = 0 # kAudioObjectPropertyElementMain
122
+ _DEF_OUT = int.from_bytes(b"dOut", "big") # kAudioHardwarePropertyDefaultOutputDevice
123
+ _TRANS = int.from_bytes(b"tran", "big") # kAudioDevicePropertyTransportType
124
+ _BT = int.from_bytes(b"blue", "big") # kAudioDeviceTransportTypeBluetooth
125
+ _BT_LE = int.from_bytes(b"blea", "big") # kAudioDeviceTransportTypeBluetoothLE
126
+
127
+ # Get default output device ID
128
+ addr = _AudioObjectPropertyAddress(_DEF_OUT, _SCOPE_G, _ELEM_M)
129
+ device_id = ctypes.c_uint32(0)
130
+ size = ctypes.c_uint32(4)
131
+ err = ca.AudioObjectGetPropertyData(
132
+ _SYS_OBJ, ctypes.byref(addr), 0, None,
133
+ ctypes.byref(size), ctypes.byref(device_id),
134
+ )
135
+ if err != 0:
136
+ return False
137
+
138
+ # Get transport type of that device
139
+ addr.mSelector = _TRANS
140
+ transport = ctypes.c_uint32(0)
141
+ size = ctypes.c_uint32(4)
142
+ err = ca.AudioObjectGetPropertyData(
143
+ device_id.value, ctypes.byref(addr), 0, None,
144
+ ctypes.byref(size), ctypes.byref(transport),
145
+ )
146
+ if err != 0:
147
+ return False
148
+
149
+ return transport.value in (_BT, _BT_LE)
150
+ except Exception:
151
+ return False
152
+
153
+
92
154
  # ── Public API ────────────────────────────────────────────────────────────────
93
155
 
94
156
  def duck() -> list[str]:
@@ -6,6 +6,7 @@ import time
6
6
  from shared import SpeakResult, MAX_CHUNK_LENGTH, get_logger
7
7
  from tts.kokoro_engine import KokoroEngine
8
8
  from tts.audio_player import AudioPlayer
9
+ from tts.media_duck import duck, unduck
9
10
 
10
11
  logger = get_logger("tts.speech_queue")
11
12
 
@@ -13,9 +14,10 @@ logger = get_logger("tts.speech_queue")
13
14
  class SpeechQueue:
14
15
  """Manages sequential speech synthesis and playback."""
15
16
 
16
- def __init__(self, engine: KokoroEngine, player: AudioPlayer) -> None:
17
+ def __init__(self, engine: KokoroEngine, player: AudioPlayer, duck_media: bool = False) -> None:
17
18
  self._engine = engine
18
19
  self._player = player
20
+ self._duck_media = duck_media
19
21
  self._queue: asyncio.Queue = asyncio.Queue()
20
22
  self._speaking = False
21
23
 
@@ -60,6 +62,9 @@ class SpeechQueue:
60
62
  total_duration_ms = 0.0
61
63
  total_synthesis_ms = 0.0
62
64
 
65
+ # Duck media for the entire utterance, not per-chunk
66
+ paused_apps = duck() if self._duck_media else []
67
+
63
68
  try:
64
69
  chunks = self.chunk_text(text)
65
70
 
@@ -105,6 +110,7 @@ class SpeechQueue:
105
110
  error=str(e),
106
111
  )
107
112
  finally:
113
+ unduck(paused_apps)
108
114
  self._speaking = False
109
115
 
110
116
  def stop(self) -> bool: