voicesmith-mcp 1.0.14 → 1.0.16

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
package/config.json CHANGED
@@ -4,7 +4,8 @@
4
4
  "voices_path": "~/.local/share/voicesmith-mcp/models/voices-v1.0.bin",
5
5
  "default_voice": "am_eric",
6
6
  "default_speed": 1.0,
7
- "audio_player": "mpv"
7
+ "audio_player": "mpv",
8
+ "duck_media": true
8
9
  },
9
10
  "stt": {
10
11
  "model_size": "base",
package/config.py CHANGED
@@ -7,6 +7,7 @@ Environment variables override individual config values.
7
7
 
8
8
  import json
9
9
  import os
10
+ import tempfile
10
11
  from dataclasses import dataclass, field, asdict
11
12
  from pathlib import Path
12
13
  from typing import Optional
@@ -26,6 +27,7 @@ class TTSConfig:
26
27
  default_voice: str = "am_eric"
27
28
  default_speed: float = 1.0
28
29
  audio_player: str = "mpv"
30
+ duck_media: bool = False
29
31
 
30
32
 
31
33
  @dataclass
@@ -99,6 +101,8 @@ def load_config(config_path: Optional[Path] = None) -> AppConfig:
99
101
  config.tts.default_speed = float(tts["default_speed"])
100
102
  if "audio_player" in tts:
101
103
  config.tts.audio_player = tts["audio_player"]
104
+ if "duck_media" in tts:
105
+ config.tts.duck_media = bool(tts["duck_media"])
102
106
 
103
107
  # STT config
104
108
  if "stt" in data:
@@ -179,6 +183,7 @@ def save_config(config: AppConfig, config_path: Optional[Path] = None) -> None:
179
183
  "default_voice": config.tts.default_voice,
180
184
  "default_speed": config.tts.default_speed,
181
185
  "audio_player": config.tts.audio_player,
186
+ "duck_media": config.tts.duck_media,
182
187
  },
183
188
  "stt": {
184
189
  "model_size": config.stt.model_size,
@@ -203,7 +208,22 @@ def save_config(config: AppConfig, config_path: Optional[Path] = None) -> None:
203
208
  },
204
209
  }
205
210
 
206
- with open(path, "w") as f:
207
- json.dump(data, f, indent=2)
211
+ # Atomic write: write to temp file then rename. This prevents
212
+ # readers (like the installer) from seeing a truncated file if
213
+ # they read during a write.
214
+ try:
215
+ fd, tmp_path = tempfile.mkstemp(
216
+ dir=path.parent, suffix=".tmp", prefix=".config-"
217
+ )
218
+ with os.fdopen(fd, "w") as f:
219
+ json.dump(data, f, indent=2)
220
+ os.replace(tmp_path, path)
221
+ except Exception as e:
222
+ # Clean up temp file if rename failed
223
+ try:
224
+ os.unlink(tmp_path)
225
+ except OSError:
226
+ pass
227
+ raise
208
228
 
209
229
  logger.debug(f"Saved config to {path}")
package/package.json CHANGED
@@ -1,6 +1,6 @@
1
1
  {
2
2
  "name": "voicesmith-mcp",
3
- "version": "1.0.14",
3
+ "version": "1.0.16",
4
4
  "description": "Local AI voice for coding assistants — TTS & STT via MCP. Kokoro ONNX + faster-whisper, fully offline.",
5
5
  "bin": {
6
6
  "voicesmith-mcp": "bin/cli.js"
package/server.py CHANGED
@@ -82,7 +82,7 @@ def _init_tts(config: AppConfig):
82
82
 
83
83
  try:
84
84
  _tts_engine = KokoroEngine(config.tts.model_path, config.tts.voices_path)
85
- _audio_player = AudioPlayer(config.tts.audio_player)
85
+ _audio_player = AudioPlayer(config.tts.audio_player, duck_media=config.tts.duck_media)
86
86
  _speech_queue = SpeechQueue(_tts_engine, _audio_player)
87
87
  logger.info("TTS subsystem initialized")
88
88
  except TTSEngineError as e:
@@ -454,14 +454,18 @@ async def listen(timeout: float = 15, prompt: str = "", silence_threshold: float
454
454
  logger.info(f"Listening (prompt: {prompt})")
455
455
 
456
456
  try:
457
+ loop = asyncio.get_running_loop()
458
+
457
459
  # Play ready sound so the user knows to start speaking
458
460
  # Skip for push-to-talk (HTTP) — it has its own beep
459
461
  if prompt != "push-to-talk":
460
- loop = asyncio.get_event_loop()
461
462
  await loop.run_in_executor(None, _play_ready_sound)
462
463
 
463
464
  start = time.perf_counter()
464
465
 
466
+ # Reset VAD state from any prior recording (LSTM hidden state + context)
467
+ _vad.reset()
468
+
465
469
  # Record audio with VAD
466
470
  audio = await _mic_capture.record(
467
471
  vad=_vad,
@@ -479,7 +483,6 @@ async def listen(timeout: float = 15, prompt: str = "", silence_threshold: float
479
483
  recording_ms = (time.perf_counter() - start) * 1000
480
484
 
481
485
  # Transcribe
482
- loop = asyncio.get_event_loop()
483
486
  result = await loop.run_in_executor(
484
487
  None, _stt_engine.transcribe, audio, STT_SAMPLE_RATE
485
488
  )
@@ -1,7 +1,12 @@
1
1
  """Microphone capture with VAD-controlled recording."""
2
2
 
3
3
  import asyncio
4
+ import os
5
+ import platform
4
6
  import queue
7
+ import socket
8
+ import subprocess
9
+ import threading
5
10
  import time
6
11
  from typing import Optional
7
12
 
@@ -12,6 +17,65 @@ from stt.vad import VoiceActivityDetector
12
17
 
13
18
  logger = get_logger("stt.mic")
14
19
 
20
# Audio is moved around in fixed-size chunks sized for the VAD model.
_CHUNK_SAMPLES = 512  # Silero VAD requires exactly 512-sample chunks at 16kHz
_CHUNK_BYTES = _CHUNK_SAMPLES * 4  # float32 = 4 bytes/sample → 2048 bytes/chunk
_ZERO_CHECK_CHUNKS = 10  # ~320ms of silence before detecting TCC denial

# Unix socket path used to reach the audio service.
_AUDIO_SERVICE_SOCKET = "/tmp/voicesmith-audio.sock"
# Label passed to `launchctl start`; it also names the plist file below.
_LAUNCHAGENT_LABEL = "com.voicesmith-mcp.audio"
# Per-user plist path; _launchagent_available() tests for this file.
_LAUNCHAGENT_PLIST = os.path.expanduser(
    f"~/Library/LaunchAgents/{_LAUNCHAGENT_LABEL}.plist"
)
29
+
30
+
31
+ def _find_app_binary(name: str) -> Optional[str]:
32
+ """Return path to a named binary inside VoiceSmithMCP.app, or None."""
33
+ install_dir = os.path.dirname(os.path.dirname(os.path.abspath(__file__)))
34
+ binary = os.path.join(install_dir, "VoiceSmithMCP.app", "Contents", "MacOS", name)
35
+ return binary if os.path.isfile(binary) and os.access(binary, os.X_OK) else None
36
+
37
+
38
def _launchagent_available() -> bool:
    """Report whether the audio LaunchAgent plist file is present on disk.

    Only the existence of the plist file is checked; whether launchd has
    loaded it is not.
    """
    plist_present = os.path.isfile(_LAUNCHAGENT_PLIST)
    return plist_present
41
+
42
+
43
def _ensure_audio_service_running() -> None:
    """Start the audio LaunchAgent unless its socket is already present.

    Asks launchctl to start the agent, then polls for the service's Unix
    socket path for up to about 3 seconds (30 polls, 0.1 s apart).

    Raises:
        MicCaptureError: If launchctl cannot be run, or the socket path has
            not appeared by the end of the polling window.
    """
    # Nothing to do when the service socket is already in place.
    if _socket_ready():
        return

    logger.info("Starting audio service via launchctl")
    launch_cmd = ["launchctl", "start", _LAUNCHAGENT_LABEL]
    try:
        subprocess.run(launch_cmd, capture_output=True, timeout=5)
    except Exception as e:
        raise MicCaptureError(f"Failed to start audio service: {e}") from e

    # Poll for the socket: 30 attempts x 0.1 s.
    polls_left = 30
    while polls_left > 0:
        if _socket_ready():
            return
        time.sleep(0.1)
        polls_left -= 1

    raise MicCaptureError(
        "VoiceSmith audio service did not start in time. "
        f"Check {_LAUNCHAGENT_PLIST} and launchctl output."
    )
73
+
74
+
75
def _socket_ready() -> bool:
    """Report whether the audio service socket path exists.

    This is a filesystem existence check only; no connection is attempted.
    """
    socket_path_exists = os.path.exists(_AUDIO_SERVICE_SOCKET)
    return socket_path_exists
78
+
15
79
 
16
80
  class MicCapture:
17
81
  """Microphone capture with voice activity detection."""
@@ -31,7 +95,11 @@ class MicCapture:
31
95
  ) -> Optional[np.ndarray]:
32
96
  """Record audio from the microphone until silence is detected.
33
97
 
34
- Uses VAD to detect speech and stop recording after a period of silence.
98
+ On macOS, prefers the audio-service LaunchAgent backend which runs
99
+ under launchd (ppid=1), ensuring macOS TCC attributes mic permission
100
+ to VoiceSmithMCP.app rather than to the user's terminal app.
101
+ Falls back to the audio-capture subprocess if the LaunchAgent is not
102
+ installed, and to sounddevice on non-macOS systems.
35
103
 
36
104
  Args:
37
105
  vad: VoiceActivityDetector instance for speech detection.
@@ -48,6 +116,150 @@ class MicCapture:
48
116
  if self._recording:
49
117
  raise MicCaptureError("Another recording is already in progress")
50
118
 
119
+ # Reset VAD state between recordings.
120
+ vad.reset()
121
+
122
+ if platform.system() == "Darwin":
123
+ if _launchagent_available():
124
+ return await self._record_via_socket(
125
+ vad, timeout, silence_threshold, cancel_event
126
+ )
127
+ # Legacy: subprocess fallback for installs without the LaunchAgent.
128
+ audio_capture_bin = _find_app_binary("audio-service") or _find_app_binary("audio-capture")
129
+ if audio_capture_bin:
130
+ return await self._record_via_subprocess(
131
+ audio_capture_bin, vad, timeout, silence_threshold, cancel_event
132
+ )
133
+
134
+ return await self._record_via_sounddevice(
135
+ vad, timeout, silence_threshold, cancel_event
136
+ )
137
+
138
+ # ── LaunchAgent socket backend (macOS primary) ─────────────────────────────
139
+
140
async def _record_via_socket(
    self,
    vad: VoiceActivityDetector,
    timeout: float,
    silence_threshold: float,
    cancel_event: Optional[asyncio.Event],
) -> Optional[np.ndarray]:
    """Record via the VoiceSmithMCP audio LaunchAgent (Unix socket).

    The LaunchAgent runs under launchd so macOS TCC attributes mic access
    to com.voicesmith-mcp.launcher, not to the parent terminal app.

    A background thread reads fixed-size float32 chunks from the socket
    into the audio queue while the shared VAD loop consumes them.

    Args:
        vad: Voice activity detector handed to the VAD loop.
        timeout: Maximum recording time in seconds.
        silence_threshold: Seconds of silence after speech that end the
            recording.
        cancel_event: Optional event that aborts the recording when set.

    Returns:
        Whatever ``_run_vad_loop`` returns: the recorded samples, or ``None``.

    Raises:
        MicCaptureError: If the audio service cannot be started or
            connected to, or if the VAD loop raises it.
    """
    loop = asyncio.get_running_loop()

    # Claim the recorder BEFORE the first await. Otherwise a second record()
    # call scheduled while the service is starting would pass the
    # "already in progress" guard and both recordings would share state.
    self._recording = True
    sock: Optional[socket.socket] = None
    reader_thread: Optional[threading.Thread] = None

    try:
        # Ensure the service is up and the socket is ready.
        try:
            await loop.run_in_executor(None, _ensure_audio_service_running)
        except MicCaptureError:
            raise
        except Exception as e:
            raise MicCaptureError(f"Audio service error: {e}") from e

        # Open socket connection.
        sock = socket.socket(socket.AF_UNIX, socket.SOCK_STREAM)
        try:
            sock.connect(_AUDIO_SERVICE_SOCKET)
        except OSError as e:
            raise MicCaptureError(f"Cannot connect to audio service: {e}") from e

        self._stop_flag = False
        self._audio_queue = queue.Queue()

        # Bind this session's socket and queue so a reader thread that
        # outlives the join() below can never feed a later recording's queue.
        conn = sock
        audio_queue = self._audio_queue

        def _reader() -> None:
            """Background thread: reads socket chunks → audio_queue."""
            try:
                while True:
                    data = b""
                    # recv() may return short reads; accumulate a full chunk.
                    while len(data) < _CHUNK_BYTES:
                        got = conn.recv(_CHUNK_BYTES - len(data))
                        if not got:
                            return  # service closed connection
                        data += got
                    audio_queue.put(np.frombuffer(data, dtype=np.float32).copy())
            except Exception as exc:
                logger.debug(f"socket reader thread exiting: {exc}")

        reader_thread = threading.Thread(target=_reader, daemon=True)
        reader_thread.start()
        logger.info("Microphone recording started (audio-service socket)")

        # Drop roughly the first 200 ms of audio before running VAD.
        self._flush_queue(int(0.2 * self._sample_rate / _CHUNK_SAMPLES))
        return await self._run_vad_loop(vad, timeout, silence_threshold, cancel_event)
    finally:
        if sock is not None:
            # shutdown() before close() ends the connection promptly so the
            # reader's blocking recv() returns; close() by itself only
            # releases the descriptor.
            try:
                sock.shutdown(socket.SHUT_RDWR)
            except OSError:
                pass  # never connected, or the peer is already gone
            sock.close()  # signals service to stop sending for this session
        if reader_thread is not None:
            reader_thread.join(timeout=1)
        self._recording = False
199
+
200
+ # ── Subprocess backend (macOS legacy fallback) ─────────────────────────────
201
+
202
async def _record_via_subprocess(
    self,
    binary: str,
    vad: VoiceActivityDetector,
    timeout: float,
    silence_threshold: float,
    cancel_event: Optional[asyncio.Event],
) -> Optional[np.ndarray]:
    """Record using a CoreAudio binary inside VoiceSmithMCP.app (legacy).

    The binary's stdout is read in fixed-size float32 chunks by a background
    thread that fills the audio queue; the shared VAD loop consumes them.

    Args:
        binary: Path of the capture executable to launch.
        vad: Voice activity detector handed to the VAD loop.
        timeout: Maximum recording time in seconds.
        silence_threshold: Seconds of silence after speech that end the
            recording.
        cancel_event: Optional event that aborts the recording when set.

    Returns:
        Whatever ``_run_vad_loop`` returns: the recorded samples, or ``None``.

    Raises:
        MicCaptureError: If the binary cannot be started, or if the VAD loop
            raises it.
    """
    self._recording = True
    self._stop_flag = False
    self._audio_queue = queue.Queue()

    try:
        proc = subprocess.Popen(
            [binary],
            stdout=subprocess.PIPE,
            # stderr is never read here. A pipe that nobody drains would
            # stall the child once the OS pipe buffer fills, so discard it.
            stderr=subprocess.DEVNULL,
            close_fds=True,
        )
    except Exception as e:
        self._recording = False
        raise MicCaptureError(f"Failed to start audio binary: {e}") from e

    logger.info("Microphone recording started (subprocess fallback)")

    audio_queue = self._audio_queue

    def _reader() -> None:
        """Background thread: reads stdout chunks → audio_queue."""
        try:
            while True:
                data = proc.stdout.read(_CHUNK_BYTES)
                # A short or empty read means the child's output has ended.
                if not data or len(data) < _CHUNK_BYTES:
                    break
                audio_queue.put(np.frombuffer(data, dtype=np.float32).copy())
        except Exception as exc:
            logger.debug(f"subprocess reader thread exiting: {exc}")

    reader_thread = threading.Thread(target=_reader, daemon=True)
    reader_thread.start()

    try:
        # Drop roughly the first 200 ms of audio before running VAD.
        self._flush_queue(int(0.2 * self._sample_rate / _CHUNK_SAMPLES))
        return await self._run_vad_loop(vad, timeout, silence_threshold, cancel_event)
    finally:
        try:
            proc.terminate()
            try:
                proc.wait(timeout=1)
            except subprocess.TimeoutExpired:
                proc.kill()
                proc.wait()  # reap the killed child so no zombie is left
            reader_thread.join(timeout=1)
            if proc.stdout is not None:
                proc.stdout.close()  # release the pipe's file descriptor
        finally:
            # Always release the recorder, even if teardown itself failed.
            self._recording = False
252
+
253
+ # ── sounddevice backend (non-macOS fallback) ───────────────────────────────
254
+
255
+ async def _record_via_sounddevice(
256
+ self,
257
+ vad: VoiceActivityDetector,
258
+ timeout: float,
259
+ silence_threshold: float,
260
+ cancel_event: Optional[asyncio.Event],
261
+ ) -> Optional[np.ndarray]:
262
+ """Record using sounddevice / PortAudio (fallback for non-macOS)."""
51
263
  try:
52
264
  import sounddevice as sd
53
265
  except Exception as e:
@@ -56,15 +268,6 @@ class MicCapture:
56
268
  self._recording = True
57
269
  self._stop_flag = False
58
270
  self._audio_queue = queue.Queue()
59
- chunks: list[np.ndarray] = []
60
- speech_detected = False
61
- silence_duration = 0.0
62
- loop = asyncio.get_event_loop()
63
-
64
- # Reset VAD state — the LSTM hidden state and context window must
65
- # be cleared between recordings to avoid stale state from previous
66
- # audio affecting speech detection.
67
- vad.reset()
68
271
 
69
272
  stream = None
70
273
  try:
@@ -72,94 +275,138 @@ class MicCapture:
72
275
  samplerate=self._sample_rate,
73
276
  channels=1,
74
277
  dtype="float32",
75
- blocksize=512, # Silero VAD expects 512-sample chunks at 16kHz
278
+ blocksize=_CHUNK_SAMPLES,
76
279
  callback=self._audio_callback,
77
280
  )
78
281
  stream.start()
79
- logger.info("Microphone recording started")
80
-
81
- # Discard the first ~200ms of audio to avoid picking up residual
82
- # speaker output (Tink sound or TTS playback that just finished).
83
- # This prevents VAD from detecting speaker bleed as "speech" and
84
- # then cutting off when the bleed stops.
85
- flush_chunks = int(0.2 * self._sample_rate / 512) # ~6 chunks
86
- for _ in range(flush_chunks):
87
- try:
88
- self._audio_queue.get(timeout=0.1)
89
- except queue.Empty:
90
- break
91
-
92
- start_time = asyncio.get_event_loop().time()
93
-
94
- while not self._stop_flag:
95
- # Check cancellation
96
- if cancel_event and cancel_event.is_set():
97
- logger.info("Recording cancelled by event")
98
- break
99
-
100
- # Check timeout
101
- elapsed = asyncio.get_event_loop().time() - start_time
102
- if elapsed >= timeout:
103
- if not speech_detected:
104
- logger.info("Recording timed out with no speech detected")
105
- else:
106
- logger.info("Recording timed out")
107
- break
108
-
109
- # Get audio chunk from queue
110
- try:
111
- chunk = await loop.run_in_executor(
112
- None, self._audio_queue.get, True, 0.1
113
- )
114
- except queue.Empty:
115
- continue
116
-
117
- chunks.append(chunk)
118
- is_speech = vad.is_speech(chunk)
119
-
120
- if is_speech:
121
- speech_detected = True
122
- silence_duration = 0.0
123
- elif speech_detected:
124
- # Count silence after speech was detected
125
- chunk_duration = len(chunk) / self._sample_rate
126
- silence_duration += chunk_duration
127
- if silence_duration >= silence_threshold:
128
- logger.info(
129
- f"Silence threshold reached ({silence_threshold}s), stopping"
130
- )
131
- break
132
-
133
- if not chunks or not speech_detected:
134
- return None
135
-
136
- return np.concatenate(chunks).flatten()
282
+ logger.info("Microphone recording started (sounddevice)")
137
283
 
284
+ self._flush_queue(int(0.2 * self._sample_rate / _CHUNK_SAMPLES), chunk_timeout=0.1)
285
+ return await self._run_vad_loop(vad, timeout, silence_threshold, cancel_event)
138
286
  except MicCaptureError:
139
287
  raise
140
288
  except Exception as e:
141
289
  raise MicCaptureError(f"Recording failed: {e}") from e
142
290
  finally:
143
- # Safely tear down the audio stream. The CoreAudio IO thread may
144
- # still be executing the callback when we call stop(). Wait briefly
145
- # between stop() and close() to let the IO thread finish — this
146
- # prevents the segfault in libffi/PortAudio where the callback
147
- # dereferences freed memory.
148
291
  if stream is not None:
149
292
  try:
150
293
  stream.stop()
151
- time.sleep(0.05) # Let CoreAudio IO thread finish
294
+ time.sleep(0.05)
152
295
  stream.close()
153
296
  except Exception as e:
154
297
  logger.debug(f"Stream teardown: {e}")
155
298
  self._recording = False
156
299
 
157
- def _audio_callback(self, indata, frames, time, status) -> None:
300
+ # ── Shared helpers ─────────────────────────────────────────────────────────
301
+
302
+ def _flush_queue(self, n_chunks: int, chunk_timeout: float = 0.15) -> None:
303
+ """Discard the first n_chunks from the audio queue (drops speaker bleed)."""
304
+ for _ in range(n_chunks):
305
+ try:
306
+ self._audio_queue.get(timeout=chunk_timeout)
307
+ except queue.Empty:
308
+ break
309
+
310
+ # ── Shared VAD loop ────────────────────────────────────────────────────────
311
+
312
async def _run_vad_loop(
    self,
    vad: VoiceActivityDetector,
    timeout: float,
    silence_threshold: float,
    cancel_event: Optional[asyncio.Event],
) -> Optional[np.ndarray]:
    """Consume queued audio chunks until the recording should end.

    Shared by every capture backend. Each chunk taken from
    ``self._audio_queue`` is kept and passed to ``vad.is_speech``. The loop
    ends when the stop flag is set, ``cancel_event`` fires, ``timeout``
    seconds have elapsed, or ``silence_threshold`` seconds of non-speech
    have accumulated after speech was first detected.

    Args:
        vad: Detector whose ``is_speech(chunk)`` classifies each chunk.
        timeout: Maximum recording time in seconds.
        silence_threshold: Seconds of post-speech silence that end the
            recording.
        cancel_event: Optional event that aborts the recording when set.

    Returns:
        All collected chunks concatenated and flattened, or ``None`` when no
        speech was detected.

    Raises:
        MicCaptureError: If the first ``_ZERO_CHECK_CHUNKS`` chunks are all
            exactly zero (TCC denial detected).
    """
    event_loop = asyncio.get_running_loop()
    recorded: list[np.ndarray] = []
    heard_speech = False
    trailing_silence = 0.0
    zero_probe_pending = True
    began_at = event_loop.time()

    while not self._stop_flag:
        if cancel_event and cancel_event.is_set():
            logger.info("Recording cancelled by event")
            break

        if event_loop.time() - began_at >= timeout:
            logger.info(
                "Recording timed out"
                if heard_speech
                else "Recording timed out with no speech detected"
            )
            break

        # Wait for the next chunk off the event loop thread; the short
        # timeout lets the stop/cancel/timeout checks above run regularly.
        try:
            block = await event_loop.run_in_executor(
                None, self._audio_queue.get, True, 0.1
            )
        except queue.Empty:
            continue

        recorded.append(block)

        # One-time probe: all-zero input at the start means the mic is muted
        # at the OS level rather than the room being quiet.
        if zero_probe_pending and len(recorded) >= _ZERO_CHECK_CHUNKS:
            zero_probe_pending = False
            peaks = (np.max(np.abs(piece)) for piece in recorded)
            if all(peak == 0.0 for peak in peaks):
                raise MicCaptureError(self._zero_audio_message())

        if vad.is_speech(block):
            heard_speech = True
            trailing_silence = 0.0
            continue

        if not heard_speech:
            # Silence before any speech does not count toward the threshold.
            continue

        trailing_silence += len(block) / self._sample_rate
        if trailing_silence >= silence_threshold:
            logger.info(
                f"Silence threshold reached ({silence_threshold}s), stopping"
            )
            break

    if recorded and heard_speech:
        return np.concatenate(recorded).flatten()
    return None
379
+
380
+ # ── sounddevice callback ───────────────────────────────────────────────────
381
+
382
+ def _audio_callback(self, indata, frames, time_info, status) -> None:
158
383
  """Sounddevice callback — pushes audio chunks to the queue."""
159
384
  if status:
160
385
  logger.warning(f"Audio callback status: {status}")
161
386
  self._audio_queue.put(indata.copy())
162
387
 
388
+ # ── Error message ──────────────────────────────────────────────────────────
389
+
390
+ @staticmethod
391
+ def _zero_audio_message() -> str:
392
+ """Build an error message for zero-amplitude mic input."""
393
+ msg = (
394
+ "Microphone is returning silent audio. "
395
+ "The audio stream opened successfully but every sample is zero."
396
+ )
397
+ if platform.system() == "Darwin":
398
+ msg += (
399
+ "\n\nmacOS is blocking mic access. The VoiceSmithMCP audio service "
400
+ "may not have been granted Microphone permission yet. "
401
+ "Check System Settings > Privacy & Security > Microphone and "
402
+ "ensure VoiceSmithMCP is enabled.\n\n"
403
+ "If VoiceSmithMCP is not listed, re-run the installer:\n"
404
+ " ./install.sh"
405
+ )
406
+ return msg
407
+
408
+ # ── Properties / control ──────────────────────────────────────────────────
409
+
163
410
  @property
164
411
  def is_recording(self) -> bool:
165
412
  """Return whether the microphone is currently recording."""
@@ -10,6 +10,7 @@ import time
10
10
  import soundfile as sf
11
11
 
12
12
  from shared import PlaybackResult, AudioPlayerError, AUDIO_LOCK_PATH, get_logger
13
+ from tts.media_duck import duck, unduck
13
14
 
14
15
  logger = get_logger("tts.audio_player")
15
16
 
@@ -17,8 +18,9 @@ logger = get_logger("tts.audio_player")
17
18
  class AudioPlayer:
18
19
  """Plays audio samples through an external player process."""
19
20
 
20
- def __init__(self, player_command: str = "mpv") -> None:
21
+ def __init__(self, player_command: str = "mpv", duck_media: bool = False) -> None:
21
22
  self._player_command = player_command
23
+ self._duck_media = duck_media
22
24
  self._process: subprocess.Popen | None = None
23
25
 
24
26
  # Detect platform fallback if player_command is not available
@@ -82,19 +84,23 @@ class AudioPlayer:
82
84
 
83
85
  # Cross-session audio lock: prevents overlapping playback
84
86
  # flock is kernel-managed — auto-released on crash, no stale locks
85
- with open(AUDIO_LOCK_PATH, "w") as lock_file:
86
- fcntl.flock(lock_file, fcntl.LOCK_EX)
87
-
88
- start = time.perf_counter()
89
- self._process = subprocess.Popen(
90
- cmd,
91
- stdout=subprocess.DEVNULL,
92
- stderr=subprocess.DEVNULL,
93
- )
94
- self._process.wait()
95
- duration_ms = (time.perf_counter() - start) * 1000
96
-
97
- # Lock released when lock_file closes
87
+ paused_apps = duck() if self._duck_media else []
88
+ try:
89
+ with open(AUDIO_LOCK_PATH, "w") as lock_file:
90
+ fcntl.flock(lock_file, fcntl.LOCK_EX)
91
+
92
+ start = time.perf_counter()
93
+ self._process = subprocess.Popen(
94
+ cmd,
95
+ stdout=subprocess.DEVNULL,
96
+ stderr=subprocess.DEVNULL,
97
+ )
98
+ self._process.wait()
99
+ duration_ms = (time.perf_counter() - start) * 1000
100
+
101
+ # Lock released when lock_file closes
102
+ finally:
103
+ unduck(paused_apps)
98
104
 
99
105
  if self._process.returncode != 0:
100
106
  return PlaybackResult(
@@ -0,0 +1,146 @@
1
+ """macOS media ducking via osascript.
2
+
3
+ Pauses media apps (Apple Music, Spotify) and browser tabs (Chrome, Brave,
4
+ Edge, Safari) before VoiceSmith audio playback and resumes them afterward.
5
+ No-ops on non-macOS systems.
6
+
7
+ Browser ducking uses JavaScript injection via AppleScript. The first time
8
+ each browser is targeted, macOS will prompt for Automation permission — approve
9
+ once and it is remembered.
10
+
11
+ Usage:
12
+ paused = duck() # pause everything playing; returns opaque token list
13
+ ...play audio...
14
+ unduck(paused) # resume only what we paused
15
+ """
16
+
17
+ import platform
18
+ import subprocess
19
+
20
+ from shared import get_logger
21
+
22
+ logger = get_logger("tts.media_duck")
23
+
24
+ # ── Native media apps ─────────────────────────────────────────────────────────
25
+
26
+ # (display name, AppleScript target)
27
# duck() asks each of these for its player state and pauses the ones playing.
_APPS = [
    ("Apple Music", "Music"),
    ("Spotify", "Spotify"),
]

# ── Browsers ──────────────────────────────────────────────────────────────────

# (display name, AppleScript target, family: "chrome" | "safari")
# The family selects which AppleScript statement _browser_script() emits.
_BROWSERS = [
    ("Google Chrome", "Google Chrome", "chrome"),
    ("Brave Browser", "Brave Browser", "chrome"),
    ("Microsoft Edge", "Microsoft Edge", "chrome"),
    ("Safari", "Safari", "safari"),
]

# JS injected into every tab on duck: pause playing media and mark it.
# Only single quotes are used inside the JS because _browser_script() places
# it inside a double-quoted AppleScript string.
_JS_PAUSE = (
    "document.querySelectorAll('video,audio').forEach(function(v){"
    "if(!v.paused){v.pause();v.dataset.voicesmithPaused='1'}"
    "})"
)

# JS injected on unduck: resume only elements we marked, then clear the mark.
_JS_RESUME = (
    "document.querySelectorAll('video,audio').forEach(function(v){"
    "if(v.dataset.voicesmithPaused){delete v.dataset.voicesmithPaused;v.play()}"
    "})"
)
55
+
56
+ # ── Helpers ───────────────────────────────────────────────────────────────────
57
+
58
def _osascript(script: str) -> str:
    """Run an AppleScript (may be multi-line) and return its stdout, stripped.

    Best-effort: this never raises. Failures to run osascript and non-zero
    exits are logged at debug level so problems can be diagnosed instead of
    vanishing silently.

    Args:
        script: AppleScript source, fed to ``osascript`` on stdin.

    Returns:
        The script's stripped stdout, or ``''`` when osascript could not be
        run or timed out.
    """
    try:
        result = subprocess.run(
            ["osascript"],
            input=script,
            capture_output=True,
            text=True,
            timeout=5,
        )
    except Exception as e:
        # Deliberately broad: ducking must never break audio playback.
        logger.debug(f"osascript could not be run: {e}")
        return ""

    if result.returncode != 0:
        logger.debug(
            f"osascript exited with status {result.returncode}: "
            f"{result.stderr.strip()}"
        )
    return result.stdout.strip()
71
+
72
+
73
+ def _browser_script(target: str, family: str, js: str) -> str:
74
+ """Build an AppleScript that runs js in every tab of target browser."""
75
+ if family == "safari":
76
+ exec_stmt = f'do JavaScript "{js}" in t'
77
+ else: # chrome family
78
+ exec_stmt = f'execute t javascript "{js}"'
79
+
80
+ return f"""\
81
+ tell application "{target}"
82
+ repeat with w in windows
83
+ repeat with t in tabs of w
84
+ try
85
+ {exec_stmt}
86
+ end try
87
+ end repeat
88
+ end repeat
89
+ end tell"""
90
+
91
+
92
+ # ── Public API ────────────────────────────────────────────────────────────────
93
+
94
def duck() -> list[str]:
    """Pause playing media apps and inject the pause script into browsers.

    Does nothing and returns an empty list on non-macOS systems.

    Returns:
        Opaque list of tokens — pass unchanged to unduck(). Native apps are
        recorded by AppleScript target; browsers as ``"browser:<target>"``.
    """
    if platform.system() != "Darwin":
        return []

    def _is_running(app: str) -> bool:
        return _osascript(f'application "{app}" is running') == "true"

    tokens: list[str] = []

    # Native apps (Music, Spotify): pause only the ones actually playing.
    for label, app in _APPS:
        if _is_running(app):
            state = _osascript(f'tell application "{app}" to get player state')
            if state == "playing":
                _osascript(f'tell application "{app}" to pause')
                tokens.append(app)
                logger.debug(f"Ducked {label}")

    # Browsers: every running browser gets the pause JS in all of its tabs.
    for label, app, kind in _BROWSERS:
        if _is_running(app):
            _osascript(_browser_script(app, kind, _JS_PAUSE))
            tokens.append(f"browser:{app}")
            logger.debug(f"Ducked browser tabs in {label}")

    return tokens
123
+
124
+
125
def unduck(paused: list[str]) -> None:
    """Resume the apps and browser tabs recorded by a previous duck() call.

    Does nothing on non-macOS systems.

    Args:
        paused: The list returned by a previous duck() call.
    """
    if platform.system() != "Darwin":
        return

    browser_prefix = "browser:"
    for token in paused:
        if not token.startswith(browser_prefix):
            # Native media app: the token is its AppleScript target.
            _osascript(f'tell application "{token}" to play')
            logger.debug(f"Unducked {token}")
            continue

        target = token[len(browser_prefix):]
        # Find the browser's family; unknown targets are treated as chrome.
        family = "chrome"
        for _, known_target, known_family in _BROWSERS:
            if known_target == target:
                family = known_family
                break
        _osascript(_browser_script(target, family, _JS_RESUME))
        logger.debug(f"Unducked browser tabs in {target}")
@@ -55,7 +55,7 @@ class SpeechQueue:
55
55
  speed: float,
56
56
  ) -> SpeakResult:
57
57
  """Internal: synthesize and play text, blocking until done."""
58
- loop = asyncio.get_event_loop()
58
+ loop = asyncio.get_running_loop()
59
59
  self._speaking = True
60
60
  total_duration_ms = 0.0
61
61
  total_synthesis_ms = 0.0