voicesmith-mcp 1.0.15 → 1.0.17

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
package/config.py CHANGED
@@ -7,6 +7,7 @@ Environment variables override individual config values.
7
7
 
8
8
  import json
9
9
  import os
10
+ import tempfile
10
11
  from dataclasses import dataclass, field, asdict
11
12
  from pathlib import Path
12
13
  from typing import Optional
@@ -207,7 +208,22 @@ def save_config(config: AppConfig, config_path: Optional[Path] = None) -> None:
207
208
  },
208
209
  }
209
210
 
210
- with open(path, "w") as f:
211
- json.dump(data, f, indent=2)
211
+ # Atomic write: write to temp file then rename. This prevents
212
+ # readers (like the installer) from seeing a truncated file if
213
+ # they read during a write.
214
+ try:
215
+ fd, tmp_path = tempfile.mkstemp(
216
+ dir=path.parent, suffix=".tmp", prefix=".config-"
217
+ )
218
+ with os.fdopen(fd, "w") as f:
219
+ json.dump(data, f, indent=2)
220
+ os.replace(tmp_path, path)
221
+ except Exception as e:
222
+ # Clean up the temp file if the write or the rename failed
223
+ try:
224
+ os.unlink(tmp_path)
225
+ except OSError:
226
+ pass
227
+ raise
212
228
 
213
229
  logger.debug(f"Saved config to {path}")
package/package.json CHANGED
@@ -1,6 +1,6 @@
1
1
  {
2
2
  "name": "voicesmith-mcp",
3
- "version": "1.0.15",
3
+ "version": "1.0.17",
4
4
  "description": "Local AI voice for coding assistants — TTS & STT via MCP. Kokoro ONNX + faster-whisper, fully offline.",
5
5
  "bin": {
6
6
  "voicesmith-mcp": "bin/cli.js"
package/server.py CHANGED
@@ -42,6 +42,19 @@ from shared import (
42
42
  )
43
43
  from config import load_config, save_config, get_config_path, AppConfig
44
44
  from session_registry import register_session, rename_session, unregister_session
45
+ from tts.media_duck import duck, unduck, is_bluetooth_output
46
+
47
+
48
async def _deferred_unduck(paused_apps: list[str], delay: float = 0.3) -> None:
    """Resume previously ducked media apps after a short delay.

    The delay lets the MCP response reach the client before media resumes.
    On Bluetooth output the delay is raised to at least 3s to allow for the
    HFP → A2DP codec switch that macOS performs when the microphone session
    ends.

    This coroutine is scheduled fire-and-forget with ``asyncio.create_task``,
    so nobody awaits its result; failures are therefore logged here instead
    of being left as an unretrieved task exception.

    Args:
        paused_apps: The list returned by ``duck()``. Nothing is done (and no
            delay is incurred) when it is empty.
        delay: Minimum number of seconds to wait before resuming playback.
    """
    # Nothing was paused, so there is nothing to resume and no reason to wait.
    if not paused_apps:
        return

    if is_bluetooth_output():
        delay = max(delay, 3.0)
    await asyncio.sleep(delay)

    try:
        # unduck() is a synchronous call; run it on the default executor so a
        # slow resume cannot stall other tool calls sharing this event loop.
        loop = asyncio.get_running_loop()
        await loop.run_in_executor(None, unduck, paused_apps)
    except Exception as e:
        # Best effort: a failed resume must not surface as an unhandled
        # exception in a background task.
        logger.warning(f"Deferred unduck failed: {e}")
45
58
 
46
59
  logger = get_logger("server")
47
60
 
@@ -63,6 +76,7 @@ _config: AppConfig = None
63
76
  _muted = False
64
77
  _listen_cancel_event: asyncio.Event = None
65
78
  _listen_active = False
79
+ _suppress_duck = False # Set by speak_then_listen to prevent inner duck/unduck gaps
66
80
  _startup_time = time.time()
67
81
  _last_tool_call = time.time() # Updated on every MCP tool call
68
82
  _session_info: dict = None
@@ -82,8 +96,8 @@ def _init_tts(config: AppConfig):
82
96
 
83
97
  try:
84
98
  _tts_engine = KokoroEngine(config.tts.model_path, config.tts.voices_path)
85
- _audio_player = AudioPlayer(config.tts.audio_player, duck_media=config.tts.duck_media)
86
- _speech_queue = SpeechQueue(_tts_engine, _audio_player)
99
+ _audio_player = AudioPlayer(config.tts.audio_player)
100
+ _speech_queue = SpeechQueue(_tts_engine, _audio_player, duck_media=config.tts.duck_media)
87
101
  logger.info("TTS subsystem initialized")
88
102
  except TTSEngineError as e:
89
103
  logger.error(f"TTS initialization failed: {e}")
@@ -453,25 +467,29 @@ async def listen(timeout: float = 15, prompt: str = "", silence_threshold: float
453
467
  if prompt:
454
468
  logger.info(f"Listening (prompt: {prompt})")
455
469
 
470
+ # Duck media while recording so the mic doesn't pick up playback
471
+ # Skip if speak_then_listen already holds the duck
472
+ paused_apps = duck() if (_config and _config.tts.duck_media and not _suppress_duck) else []
473
+
456
474
  try:
457
475
  loop = asyncio.get_running_loop()
458
476
 
459
- # Play ready sound so the user knows to start speaking
460
- # Skip for push-to-talk (HTTP) — it has its own beep
461
- if prompt != "push-to-talk":
462
- await loop.run_in_executor(None, _play_ready_sound)
463
-
464
477
  start = time.perf_counter()
465
478
 
466
479
  # Reset VAD state from any prior recording (LSTM hidden state + context)
467
480
  _vad.reset()
468
481
 
482
+ # Play the ready sound AFTER the mic is live (via on_ready callback)
483
+ # so the user doesn't start speaking into a dead mic.
484
+ ready_cb = _play_ready_sound if prompt != "push-to-talk" else None
485
+
469
486
  # Record audio with VAD
470
487
  audio = await _mic_capture.record(
471
488
  vad=_vad,
472
489
  timeout=timeout,
473
490
  silence_threshold=silence_threshold,
474
491
  cancel_event=_listen_cancel_event,
492
+ on_ready=ready_cb,
475
493
  )
476
494
 
477
495
  if _listen_cancel_event.is_set():
@@ -500,6 +518,8 @@ async def listen(timeout: float = 15, prompt: str = "", silence_threshold: float
500
518
  logger.error(f"listen failed: {e}")
501
519
  return {"success": False, "error": "listen_failed", "message": str(e)}
502
520
  finally:
521
+ if paused_apps:
522
+ asyncio.create_task(_deferred_unduck(paused_apps))
503
523
  _listen_active = False
504
524
  _listen_cancel_event = None
505
525
  # Reclaim mic for wake listener
@@ -524,19 +544,34 @@ async def speak_then_listen(
524
544
  timeout: Max seconds to wait for response (default 15).
525
545
  silence_threshold: Seconds of silence before stopping (default 1.5).
526
546
  """
527
- speak_result = await speak(name, text, speed, block=True)
547
+ global _suppress_duck
548
+
549
+ # Duck once for the entire speak+listen operation to avoid a
550
+ # brief unduck gap between speak finishing and listen starting.
551
+ should_duck = _config and _config.tts.duck_media
552
+ paused_apps = duck() if should_duck else []
528
553
 
529
- if not speak_result.get("success"):
530
- return {"speak": speak_result, "listen": {"success": False, "error": "skipped"}}
554
+ # Suppress inner ducking in SpeechQueue and listen()
555
+ saved_queue_duck = _speech_queue._duck_media if _speech_queue else False
556
+ if _speech_queue and should_duck:
557
+ _speech_queue._duck_media = False
558
+ _suppress_duck = True
531
559
 
532
- listen_result = await listen(timeout=timeout, silence_threshold=silence_threshold)
560
+ try:
561
+ speak_result = await speak(name, text, speed, block=True)
533
562
 
534
- # If listen timed out, speak a nudge and fall back to text
535
- if listen_result.get("error") == "timeout":
536
- nudge_result = await speak(name, "I didn't catch that. Go ahead and type it.", speed, block=True)
537
- listen_result["nudge_spoken"] = nudge_result.get("success", False)
563
+ if not speak_result.get("success"):
564
+ return {"speak": speak_result, "listen": {"success": False, "error": "skipped"}}
538
565
 
539
- return {"speak": speak_result, "listen": listen_result}
566
+ listen_result = await listen(timeout=timeout, silence_threshold=silence_threshold)
567
+
568
+ return {"speak": speak_result, "listen": listen_result}
569
+ finally:
570
+ _suppress_duck = False
571
+ if _speech_queue:
572
+ _speech_queue._duck_media = saved_queue_duck
573
+ if paused_apps:
574
+ asyncio.create_task(_deferred_unduck(paused_apps))
540
575
 
541
576
 
542
577
  @mcp.tool()
@@ -8,18 +8,20 @@ import socket
8
8
  import subprocess
9
9
  import threading
10
10
  import time
11
- from typing import Optional
11
+ from typing import Callable, Optional
12
12
 
13
13
  import numpy as np
14
14
 
15
15
  from shared import MicCaptureError, STT_SAMPLE_RATE, get_logger
16
16
  from stt.vad import VoiceActivityDetector
17
+ from tts.media_duck import is_bluetooth_output
17
18
 
18
19
  logger = get_logger("stt.mic")
19
20
 
20
21
  _CHUNK_SAMPLES = 512 # Silero VAD requires exactly 512-sample chunks at 16kHz
21
22
  _CHUNK_BYTES = _CHUNK_SAMPLES * 4 # float32 = 4 bytes/sample → 2048 bytes/chunk
22
- _ZERO_CHECK_CHUNKS = 10 # ~320ms of silence before detecting TCC denial
23
+ _ZERO_CHECK_CHUNKS = 25 # ~800ms exceeds CoreAudio cold-start latency (~544ms)
24
+ _ZERO_CHECK_CHUNKS_BT = 75 # ~2.4s — Bluetooth A2DP→HFP codec switch can take 1-2s
23
25
 
24
26
  _AUDIO_SERVICE_SOCKET = "/tmp/voicesmith-audio.sock"
25
27
  _LAUNCHAGENT_LABEL = "com.voicesmith-mcp.audio"
@@ -92,6 +94,7 @@ class MicCapture:
92
94
  timeout: float = 15,
93
95
  silence_threshold: float = 1.5,
94
96
  cancel_event: Optional[asyncio.Event] = None,
97
+ on_ready: Optional[Callable[[], None]] = None,
95
98
  ) -> Optional[np.ndarray]:
96
99
  """Record audio from the microphone until silence is detected.
97
100
 
@@ -106,6 +109,9 @@ class MicCapture:
106
109
  timeout: Maximum seconds to wait for speech (default 15).
107
110
  silence_threshold: Seconds of silence before stopping (default 1.5).
108
111
  cancel_event: Optional asyncio.Event to cancel recording.
112
+ on_ready: Optional callback invoked once the mic is live and
113
+ ready to capture. Called after hardware warm-up /
114
+ flush but before the VAD loop starts.
109
115
 
110
116
  Returns:
111
117
  Numpy array of recorded audio, or None if cancelled/timeout.
@@ -122,17 +128,17 @@ class MicCapture:
122
128
  if platform.system() == "Darwin":
123
129
  if _launchagent_available():
124
130
  return await self._record_via_socket(
125
- vad, timeout, silence_threshold, cancel_event
131
+ vad, timeout, silence_threshold, cancel_event, on_ready
126
132
  )
127
133
  # Legacy: subprocess fallback for installs without the LaunchAgent.
128
134
  audio_capture_bin = _find_app_binary("audio-service") or _find_app_binary("audio-capture")
129
135
  if audio_capture_bin:
130
136
  return await self._record_via_subprocess(
131
- audio_capture_bin, vad, timeout, silence_threshold, cancel_event
137
+ audio_capture_bin, vad, timeout, silence_threshold, cancel_event, on_ready
132
138
  )
133
139
 
134
140
  return await self._record_via_sounddevice(
135
- vad, timeout, silence_threshold, cancel_event
141
+ vad, timeout, silence_threshold, cancel_event, on_ready
136
142
  )
137
143
 
138
144
  # ── LaunchAgent socket backend (macOS primary) ─────────────────────────────
@@ -143,6 +149,7 @@ class MicCapture:
143
149
  timeout: float,
144
150
  silence_threshold: float,
145
151
  cancel_event: Optional[asyncio.Event],
152
+ on_ready: Optional[Callable[[], None]] = None,
146
153
  ) -> Optional[np.ndarray]:
147
154
  """Record via the VoiceSmithMCP audio LaunchAgent (Unix socket).
148
155
 
@@ -190,7 +197,10 @@ class MicCapture:
190
197
  logger.info("Microphone recording started (audio-service socket)")
191
198
 
192
199
  try:
193
- self._flush_queue(int(0.2 * self._sample_rate / _CHUNK_SAMPLES))
200
+ # Flush 2 chunks (~64ms) for AudioQueue hardware settle.
201
+ self._flush_queue(2)
202
+ if on_ready:
203
+ on_ready()
194
204
  return await self._run_vad_loop(vad, timeout, silence_threshold, cancel_event)
195
205
  finally:
196
206
  sock.close() # signals service to stop sending for this session
@@ -206,6 +216,7 @@ class MicCapture:
206
216
  timeout: float,
207
217
  silence_threshold: float,
208
218
  cancel_event: Optional[asyncio.Event],
219
+ on_ready: Optional[Callable[[], None]] = None,
209
220
  ) -> Optional[np.ndarray]:
210
221
  """Record using a CoreAudio binary inside VoiceSmithMCP.app (legacy)."""
211
222
  self._recording = True
@@ -239,7 +250,9 @@ class MicCapture:
239
250
  reader_thread.start()
240
251
 
241
252
  try:
242
- self._flush_queue(int(0.2 * self._sample_rate / _CHUNK_SAMPLES))
253
+ self._flush_queue(2)
254
+ if on_ready:
255
+ on_ready()
243
256
  return await self._run_vad_loop(vad, timeout, silence_threshold, cancel_event)
244
257
  finally:
245
258
  proc.terminate()
@@ -258,6 +271,7 @@ class MicCapture:
258
271
  timeout: float,
259
272
  silence_threshold: float,
260
273
  cancel_event: Optional[asyncio.Event],
274
+ on_ready: Optional[Callable[[], None]] = None,
261
275
  ) -> Optional[np.ndarray]:
262
276
  """Record using sounddevice / PortAudio (fallback for non-macOS)."""
263
277
  try:
@@ -281,7 +295,9 @@ class MicCapture:
281
295
  stream.start()
282
296
  logger.info("Microphone recording started (sounddevice)")
283
297
 
284
- self._flush_queue(int(0.2 * self._sample_rate / _CHUNK_SAMPLES), chunk_timeout=0.1)
298
+ self._flush_queue(2, chunk_timeout=0.1)
299
+ if on_ready:
300
+ on_ready()
285
301
  return await self._run_vad_loop(vad, timeout, silence_threshold, cancel_event)
286
302
  except MicCaptureError:
287
303
  raise
@@ -330,6 +346,8 @@ class MicCapture:
330
346
  speech_detected = False
331
347
  silence_duration = 0.0
332
348
  zero_check_done = False
349
+ # Bluetooth A2DP→HFP switch delivers zeros for up to ~2s
350
+ zero_threshold = _ZERO_CHECK_CHUNKS_BT if is_bluetooth_output() else _ZERO_CHECK_CHUNKS
333
351
  start_time = loop.time()
334
352
 
335
353
  while not self._stop_flag:
@@ -354,7 +372,7 @@ class MicCapture:
354
372
 
355
373
  chunks.append(chunk)
356
374
 
357
- if not zero_check_done and len(chunks) >= _ZERO_CHECK_CHUNKS:
375
+ if not zero_check_done and len(chunks) >= zero_threshold:
358
376
  zero_check_done = True
359
377
  if all(np.max(np.abs(c)) == 0.0 for c in chunks):
360
378
  raise MicCaptureError(self._zero_audio_message())
@@ -10,7 +10,6 @@ import time
10
10
  import soundfile as sf
11
11
 
12
12
  from shared import PlaybackResult, AudioPlayerError, AUDIO_LOCK_PATH, get_logger
13
- from tts.media_duck import duck, unduck
14
13
 
15
14
  logger = get_logger("tts.audio_player")
16
15
 
@@ -18,9 +17,8 @@ logger = get_logger("tts.audio_player")
18
17
  class AudioPlayer:
19
18
  """Plays audio samples through an external player process."""
20
19
 
21
- def __init__(self, player_command: str = "mpv", duck_media: bool = False) -> None:
20
+ def __init__(self, player_command: str = "mpv") -> None:
22
21
  self._player_command = player_command
23
- self._duck_media = duck_media
24
22
  self._process: subprocess.Popen | None = None
25
23
 
26
24
  # Detect platform fallback if player_command is not available
@@ -84,23 +82,19 @@ class AudioPlayer:
84
82
 
85
83
  # Cross-session audio lock: prevents overlapping playback
86
84
  # flock is kernel-managed — auto-released on crash, no stale locks
87
- paused_apps = duck() if self._duck_media else []
88
- try:
89
- with open(AUDIO_LOCK_PATH, "w") as lock_file:
90
- fcntl.flock(lock_file, fcntl.LOCK_EX)
91
-
92
- start = time.perf_counter()
93
- self._process = subprocess.Popen(
94
- cmd,
95
- stdout=subprocess.DEVNULL,
96
- stderr=subprocess.DEVNULL,
97
- )
98
- self._process.wait()
99
- duration_ms = (time.perf_counter() - start) * 1000
100
-
101
- # Lock released when lock_file closes
102
- finally:
103
- unduck(paused_apps)
85
+ with open(AUDIO_LOCK_PATH, "w") as lock_file:
86
+ fcntl.flock(lock_file, fcntl.LOCK_EX)
87
+
88
+ start = time.perf_counter()
89
+ self._process = subprocess.Popen(
90
+ cmd,
91
+ stdout=subprocess.DEVNULL,
92
+ stderr=subprocess.DEVNULL,
93
+ )
94
+ self._process.wait()
95
+ duration_ms = (time.perf_counter() - start) * 1000
96
+
97
+ # Lock released when lock_file closes
104
98
 
105
99
  if self._process.returncode != 0:
106
100
  return PlaybackResult(
@@ -2,6 +2,8 @@
2
2
 
3
3
  import time
4
4
 
5
+ import numpy as np
6
+
5
7
  from shared import SynthesisResult, TTSEngineError, ALL_VOICE_IDS, SAMPLE_RATE, get_logger
6
8
 
7
9
  logger = get_logger("tts.kokoro")
@@ -47,6 +49,11 @@ class KokoroEngine:
47
49
  samples, sample_rate = self._model.create(text, voice=voice_id, speed=speed)
48
50
  synthesis_ms = (time.perf_counter() - start) * 1000
49
51
 
52
+ # Pad 100ms silence — kokoro-onnx trim() snaps to 512-sample hops
53
+ # (~21ms at 24kHz) which can clip the trailing edge of the last phoneme.
54
+ pad = int(sample_rate * 0.10)
55
+ samples = np.concatenate([samples, np.zeros(pad, dtype=samples.dtype)])
56
+
50
57
  duration_ms = (len(samples) / sample_rate) * 1000
51
58
 
52
59
  return SynthesisResult(
package/tts/media_duck.py CHANGED
@@ -14,6 +14,8 @@ Usage:
14
14
  unduck(paused) # resume only what we paused
15
15
  """
16
16
 
17
+ import ctypes
18
+ import ctypes.util
17
19
  import platform
18
20
  import subprocess
19
21
 
@@ -89,6 +91,66 @@ tell application "{target}"
89
91
  end tell"""
90
92
 
91
93
 
94
+ # ── Bluetooth detection (macOS CoreAudio) ────────────────────────────────────
95
+
96
class _AudioObjectPropertyAddress(ctypes.Structure):
    """ctypes mirror of CoreAudio's ``AudioObjectPropertyAddress`` struct."""

    _fields_ = [
        ("mSelector", ctypes.c_uint32),
        ("mScope", ctypes.c_uint32),
        ("mElement", ctypes.c_uint32),
    ]


def _fourcc(code: bytes) -> int:
    """Return the big-endian 32-bit integer for a four-character code."""
    return int.from_bytes(code, "big")


def _coreaudio_get_u32(ca, object_id: int, selector: int) -> int:
    """Read one 32-bit global-scope property from a CoreAudio object.

    Args:
        ca: The loaded CoreAudio library handle.
        object_id: AudioObjectID to query.
        selector: FourCC property selector, as an integer.

    Returns:
        The property value as an unsigned 32-bit integer.

    Raises:
        OSError: If CoreAudio reports a non-zero status or returns a value
            that is not exactly four bytes wide.
    """
    get_property = ca.AudioObjectGetPropertyData
    # Declare the C prototype explicitly rather than relying on ctypes'
    # default int conversions for the arguments and the OSStatus result.
    get_property.argtypes = [
        ctypes.c_uint32,                                # inObjectID
        ctypes.POINTER(_AudioObjectPropertyAddress),    # inAddress
        ctypes.c_uint32,                                # inQualifierDataSize
        ctypes.c_void_p,                                # inQualifierData
        ctypes.POINTER(ctypes.c_uint32),                # ioDataSize
        ctypes.c_void_p,                                # outData
    ]
    get_property.restype = ctypes.c_int32

    address = _AudioObjectPropertyAddress(
        selector,
        _fourcc(b"glob"),  # kAudioObjectPropertyScopeGlobal
        0,                 # kAudioObjectPropertyElementMain
    )
    value = ctypes.c_uint32(0)
    size = ctypes.c_uint32(ctypes.sizeof(value))
    status = get_property(
        object_id, ctypes.byref(address), 0, None,
        ctypes.byref(size), ctypes.byref(value),
    )
    if status != 0:
        raise OSError(f"AudioObjectGetPropertyData failed with status {status}")
    if size.value != ctypes.sizeof(value):
        raise OSError(f"Unexpected CoreAudio property size: {size.value}")
    return value.value


def is_bluetooth_output() -> bool:
    """Return True if the default audio output is a Bluetooth device.

    Uses CoreAudio's AudioObjectGetPropertyData to check the transport type
    of the default output device. Returns False on non-macOS or on error.
    The result is not cached, so every call queries CoreAudio again.
    """
    if platform.system() != "Darwin":
        return False

    try:
        lib_path = ctypes.util.find_library("CoreAudio")
        if not lib_path:
            return False
        ca = ctypes.cdll.LoadLibrary(lib_path)

        # kAudioObjectSystemObject / kAudioHardwarePropertyDefaultOutputDevice
        device_id = _coreaudio_get_u32(ca, 1, _fourcc(b"dOut"))
        if device_id == 0:
            # 0 is kAudioObjectUnknown: there is no default output device.
            return False

        # kAudioDevicePropertyTransportType
        transport = _coreaudio_get_u32(ca, device_id, _fourcc(b"tran"))

        return transport in (
            _fourcc(b"blue"),  # kAudioDeviceTransportTypeBluetooth
            _fourcc(b"blea"),  # kAudioDeviceTransportTypeBluetoothLE
        )
    except Exception:
        # Deliberate best effort: detection only tunes delays and thresholds,
        # so any failure is reported as "not Bluetooth".
        return False
152
+
153
+
92
154
  # ── Public API ────────────────────────────────────────────────────────────────
93
155
 
94
156
  def duck() -> list[str]:
@@ -6,6 +6,7 @@ import time
6
6
  from shared import SpeakResult, MAX_CHUNK_LENGTH, get_logger
7
7
  from tts.kokoro_engine import KokoroEngine
8
8
  from tts.audio_player import AudioPlayer
9
+ from tts.media_duck import duck, unduck
9
10
 
10
11
  logger = get_logger("tts.speech_queue")
11
12
 
@@ -13,9 +14,10 @@ logger = get_logger("tts.speech_queue")
13
14
  class SpeechQueue:
14
15
  """Manages sequential speech synthesis and playback."""
15
16
 
16
- def __init__(self, engine: KokoroEngine, player: AudioPlayer) -> None:
17
+ def __init__(self, engine: KokoroEngine, player: AudioPlayer, duck_media: bool = False) -> None:
17
18
  self._engine = engine
18
19
  self._player = player
20
+ self._duck_media = duck_media
19
21
  self._queue: asyncio.Queue = asyncio.Queue()
20
22
  self._speaking = False
21
23
 
@@ -60,6 +62,9 @@ class SpeechQueue:
60
62
  total_duration_ms = 0.0
61
63
  total_synthesis_ms = 0.0
62
64
 
65
+ # Duck media for the entire utterance, not per-chunk
66
+ paused_apps = duck() if self._duck_media else []
67
+
63
68
  try:
64
69
  chunks = self.chunk_text(text)
65
70
 
@@ -105,6 +110,7 @@ class SpeechQueue:
105
110
  error=str(e),
106
111
  )
107
112
  finally:
113
+ unduck(paused_apps)
108
114
  self._speaking = False
109
115
 
110
116
  def stop(self) -> bool: