voicesmith-mcp 1.0.16 → 1.0.18
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/README.md +17 -10
- package/config.py +4 -0
- package/package.json +1 -1
- package/server.py +65 -16
- package/stt/__pycache__/mic_capture.cpython-314.pyc +0 -0
- package/stt/mic_capture.py +27 -9
- package/tts/__pycache__/audio_player.cpython-314.pyc +0 -0
- package/tts/__pycache__/kokoro_engine.cpython-314.pyc +0 -0
- package/tts/__pycache__/media_duck.cpython-314.pyc +0 -0
- package/tts/__pycache__/speech_queue.cpython-314.pyc +0 -0
- package/tts/audio_player.py +14 -20
- package/tts/kokoro_engine.py +7 -0
- package/tts/media_duck.py +62 -0
- package/tts/speech_queue.py +7 -1
package/README.md
CHANGED
|
@@ -39,7 +39,7 @@ What the AI does automatically:
|
|
|
39
39
|
|
|
40
40
|
| Moment | What happens |
|
|
41
41
|
|--------|-------------|
|
|
42
|
-
| You give it a task |
|
|
42
|
+
| You give it a task | Gets to work (speaks only when clarifying approach) |
|
|
43
43
|
| It finishes work | Speaks a summary of what was done |
|
|
44
44
|
| It has a question | Asks out loud, then listens for your voice response |
|
|
45
45
|
| Voice tools unavailable | Falls back to text silently |
|
|
@@ -112,7 +112,8 @@ The MCP server runs as a local process alongside your IDE. It communicates over
|
|
|
112
112
|
- **TTS**: Kokoro ONNX — fast neural TTS, 54 voices, no GPU needed
|
|
113
113
|
- **STT**: faster-whisper — OpenAI Whisper running locally via CTranslate2
|
|
114
114
|
- **VAD**: Silero VAD — voice activity detection for clean recordings
|
|
115
|
-
- **Audio**: mpv for playback
|
|
115
|
+
- **Audio**: mpv for playback; microphone capture through CoreAudio via a native app bundle on macOS (sounddevice fallback on Linux)
|
|
116
|
+
- **Media ducking**: Auto-pauses Apple Music, Spotify, and browser audio during speech (macOS)
|
|
116
117
|
|
|
117
118
|
## Multi-Session
|
|
118
119
|
|
|
@@ -131,16 +132,24 @@ Config lives at `~/.local/share/voicesmith-mcp/config.json`. Key settings:
|
|
|
131
132
|
"main_agent": "Eric",
|
|
132
133
|
"tts": {
|
|
133
134
|
"default_voice": "am_eric",
|
|
134
|
-
"audio_player": "mpv"
|
|
135
|
+
"audio_player": "mpv",
|
|
136
|
+
"duck_media": true
|
|
135
137
|
},
|
|
136
138
|
"stt": {
|
|
137
139
|
"model_size": "base",
|
|
138
140
|
"language": "en",
|
|
139
|
-
"vad_threshold": 0.3
|
|
141
|
+
"vad_threshold": 0.3,
|
|
142
|
+
"nudge_on_timeout": false
|
|
140
143
|
}
|
|
141
144
|
}
|
|
142
145
|
```
|
|
143
146
|
|
|
147
|
+
| Setting | Description | Default |
|
|
148
|
+
|---------|-------------|---------|
|
|
149
|
+
| `tts.duck_media` | Auto-pause music/browser audio during speech (macOS) | `true` |
|
|
150
|
+
| `stt.nudge_on_timeout` | Speak "I didn't catch that" when listen times out | `false` |
|
|
151
|
+
| `stt.vad_threshold` | Voice detection sensitivity (lower = more sensitive) | `0.3` |
|
|
152
|
+
|
|
144
153
|
Re-run `npx voicesmith-mcp install` to change your voice or update settings. Existing configuration is preserved — only new defaults are added.
|
|
145
154
|
|
|
146
155
|
## Requirements
|
|
@@ -166,16 +175,14 @@ Re-run `npx voicesmith-mcp install` to change your voice or update settings. Exi
|
|
|
166
175
|
|
|
167
176
|
### The AI can't hear me (listen returns empty or times out)
|
|
168
177
|
|
|
169
|
-
**Check microphone permissions.** On macOS,
|
|
178
|
+
**Check microphone permissions.** On macOS, VoiceSmith uses a native app bundle (`VoiceSmithMCP.app`) for mic access. The first time it records, macOS should show a permission dialog for the app. If it didn't:
|
|
170
179
|
|
|
171
180
|
1. Open **System Settings > Privacy & Security > Microphone**
|
|
172
|
-
2.
|
|
173
|
-
|
|
174
|
-
- **Cursor** or **VS Code** — if using those IDEs directly
|
|
175
|
-
3. If the app isn't listed, the first `listen` call should trigger the permission prompt. Approve it and try again.
|
|
181
|
+
2. Look for **VoiceSmithMCP** and make sure it's enabled
|
|
182
|
+
3. If it's not listed, the LaunchAgent may not be running — try reinstalling: `npx voicesmith-mcp install`
|
|
176
183
|
|
|
177
184
|
> [!IMPORTANT]
|
|
178
|
-
>
|
|
185
|
+
> If the server detects silent audio (all zeros for roughly the first 800ms of a recording, or ~2.4s when the output device is Bluetooth), it returns an error pointing you to the microphone permission settings. This usually means macOS TCC denied mic access.
|
|
179
186
|
|
|
180
187
|
**Check your audio input device.** If an external mic is selected but not connected, the server opens it but gets silence:
|
|
181
188
|
- Open **System Settings > Sound > Input** and verify the correct mic is selected
|
package/config.py
CHANGED
|
@@ -37,6 +37,7 @@ class STTConfig:
|
|
|
37
37
|
silence_threshold: float = 1.5
|
|
38
38
|
max_listen_timeout: float = 15
|
|
39
39
|
vad_threshold: float = 0.3
|
|
40
|
+
nudge_on_timeout: bool = False
|
|
40
41
|
|
|
41
42
|
|
|
42
43
|
@dataclass
|
|
@@ -117,6 +118,8 @@ def load_config(config_path: Optional[Path] = None) -> AppConfig:
|
|
|
117
118
|
config.stt.max_listen_timeout = float(stt["max_listen_timeout"])
|
|
118
119
|
if "vad_threshold" in stt:
|
|
119
120
|
config.stt.vad_threshold = float(stt["vad_threshold"])
|
|
121
|
+
if "nudge_on_timeout" in stt:
|
|
122
|
+
config.stt.nudge_on_timeout = bool(stt["nudge_on_timeout"])
|
|
120
123
|
|
|
121
124
|
# Top-level config
|
|
122
125
|
if "main_agent" in data:
|
|
@@ -191,6 +194,7 @@ def save_config(config: AppConfig, config_path: Optional[Path] = None) -> None:
|
|
|
191
194
|
"silence_threshold": config.stt.silence_threshold,
|
|
192
195
|
"max_listen_timeout": config.stt.max_listen_timeout,
|
|
193
196
|
"vad_threshold": config.stt.vad_threshold,
|
|
197
|
+
"nudge_on_timeout": config.stt.nudge_on_timeout,
|
|
194
198
|
},
|
|
195
199
|
"main_agent": config.main_agent,
|
|
196
200
|
"last_voice_name": config.last_voice_name,
|
package/package.json
CHANGED
package/server.py
CHANGED
|
@@ -42,6 +42,19 @@ from shared import (
|
|
|
42
42
|
)
|
|
43
43
|
from config import load_config, save_config, get_config_path, AppConfig
|
|
44
44
|
from session_registry import register_session, rename_session, unregister_session
|
|
45
|
+
from tts.media_duck import duck, unduck, is_bluetooth_output
|
|
46
|
+
|
|
47
|
+
|
|
48
|
+
async def _deferred_unduck(paused_apps: list[str], delay: float = 0.3) -> None:
    """Unduck after a brief delay so the MCP response reaches the client first.

    On Bluetooth output, extends the delay to 3s to allow for the HFP → A2DP
    codec switch that macOS performs when the microphone session ends.

    Args:
        paused_apps: The apps previously returned by ``duck()``; passed
            unchanged to ``unduck()``.
        delay: Minimum number of seconds to wait before resuming media
            (default 0.3). Raised to at least 3.0 on Bluetooth output.
    """
    if is_bluetooth_output():
        delay = max(delay, 3.0)
    await asyncio.sleep(delay)
    # unduck() is a plain synchronous call (presumably shelling out to the
    # media apps — confirm in tts.media_duck). Run it on the default executor
    # so the event loop keeps serving MCP traffic while media is resumed.
    await asyncio.get_running_loop().run_in_executor(None, unduck, paused_apps)
|
|
45
58
|
|
|
46
59
|
logger = get_logger("server")
|
|
47
60
|
|
|
@@ -63,6 +76,7 @@ _config: AppConfig = None
|
|
|
63
76
|
_muted = False
|
|
64
77
|
_listen_cancel_event: asyncio.Event = None
|
|
65
78
|
_listen_active = False
|
|
79
|
+
_suppress_duck = False # Set by speak_then_listen to prevent inner duck/unduck gaps
|
|
66
80
|
_startup_time = time.time()
|
|
67
81
|
_last_tool_call = time.time() # Updated on every MCP tool call
|
|
68
82
|
_session_info: dict = None
|
|
@@ -82,8 +96,8 @@ def _init_tts(config: AppConfig):
|
|
|
82
96
|
|
|
83
97
|
try:
|
|
84
98
|
_tts_engine = KokoroEngine(config.tts.model_path, config.tts.voices_path)
|
|
85
|
-
_audio_player = AudioPlayer(config.tts.audio_player
|
|
86
|
-
_speech_queue = SpeechQueue(_tts_engine, _audio_player)
|
|
99
|
+
_audio_player = AudioPlayer(config.tts.audio_player)
|
|
100
|
+
_speech_queue = SpeechQueue(_tts_engine, _audio_player, duck_media=config.tts.duck_media)
|
|
87
101
|
logger.info("TTS subsystem initialized")
|
|
88
102
|
except TTSEngineError as e:
|
|
89
103
|
logger.error(f"TTS initialization failed: {e}")
|
|
@@ -453,25 +467,29 @@ async def listen(timeout: float = 15, prompt: str = "", silence_threshold: float
|
|
|
453
467
|
if prompt:
|
|
454
468
|
logger.info(f"Listening (prompt: {prompt})")
|
|
455
469
|
|
|
470
|
+
# Duck media while recording so the mic doesn't pick up playback
|
|
471
|
+
# Skip if speak_then_listen already holds the duck
|
|
472
|
+
paused_apps = duck() if (_config and _config.tts.duck_media and not _suppress_duck) else []
|
|
473
|
+
|
|
456
474
|
try:
|
|
457
475
|
loop = asyncio.get_running_loop()
|
|
458
476
|
|
|
459
|
-
# Play ready sound so the user knows to start speaking
|
|
460
|
-
# Skip for push-to-talk (HTTP) — it has its own beep
|
|
461
|
-
if prompt != "push-to-talk":
|
|
462
|
-
await loop.run_in_executor(None, _play_ready_sound)
|
|
463
|
-
|
|
464
477
|
start = time.perf_counter()
|
|
465
478
|
|
|
466
479
|
# Reset VAD state from any prior recording (LSTM hidden state + context)
|
|
467
480
|
_vad.reset()
|
|
468
481
|
|
|
482
|
+
# Play the ready sound AFTER the mic is live (via on_ready callback)
|
|
483
|
+
# so the user doesn't start speaking into a dead mic.
|
|
484
|
+
ready_cb = _play_ready_sound if prompt != "push-to-talk" else None
|
|
485
|
+
|
|
469
486
|
# Record audio with VAD
|
|
470
487
|
audio = await _mic_capture.record(
|
|
471
488
|
vad=_vad,
|
|
472
489
|
timeout=timeout,
|
|
473
490
|
silence_threshold=silence_threshold,
|
|
474
491
|
cancel_event=_listen_cancel_event,
|
|
492
|
+
on_ready=ready_cb,
|
|
475
493
|
)
|
|
476
494
|
|
|
477
495
|
if _listen_cancel_event.is_set():
|
|
@@ -500,6 +518,8 @@ async def listen(timeout: float = 15, prompt: str = "", silence_threshold: float
|
|
|
500
518
|
logger.error(f"listen failed: {e}")
|
|
501
519
|
return {"success": False, "error": "listen_failed", "message": str(e)}
|
|
502
520
|
finally:
|
|
521
|
+
if paused_apps:
|
|
522
|
+
asyncio.create_task(_deferred_unduck(paused_apps))
|
|
503
523
|
_listen_active = False
|
|
504
524
|
_listen_cancel_event = None
|
|
505
525
|
# Reclaim mic for wake listener
|
|
@@ -524,19 +544,48 @@ async def speak_then_listen(
|
|
|
524
544
|
timeout: Max seconds to wait for response (default 15).
|
|
525
545
|
silence_threshold: Seconds of silence before stopping (default 1.5).
|
|
526
546
|
"""
|
|
527
|
-
|
|
547
|
+
global _suppress_duck
|
|
528
548
|
|
|
529
|
-
|
|
530
|
-
|
|
549
|
+
# Duck once for the entire speak+listen operation to avoid a
|
|
550
|
+
# brief unduck gap between speak finishing and listen starting.
|
|
551
|
+
should_duck = _config and _config.tts.duck_media
|
|
552
|
+
paused_apps = duck() if should_duck else []
|
|
531
553
|
|
|
532
|
-
|
|
554
|
+
# Suppress inner ducking in SpeechQueue and listen()
|
|
555
|
+
saved_queue_duck = _speech_queue._duck_media if _speech_queue else False
|
|
556
|
+
if _speech_queue and should_duck:
|
|
557
|
+
_speech_queue._duck_media = False
|
|
558
|
+
_suppress_duck = True
|
|
533
559
|
|
|
534
|
-
|
|
535
|
-
|
|
536
|
-
|
|
537
|
-
|
|
560
|
+
try:
|
|
561
|
+
speak_result = await speak(name, text, speed, block=True)
|
|
562
|
+
|
|
563
|
+
if not speak_result.get("success"):
|
|
564
|
+
return {"speak": speak_result, "listen": {"success": False, "error": "skipped"}}
|
|
565
|
+
|
|
566
|
+
listen_result = await listen(timeout=timeout, silence_threshold=silence_threshold)
|
|
538
567
|
|
|
539
|
-
|
|
568
|
+
# Optionally speak a nudge on timeout to prompt user to type instead
|
|
569
|
+
if (listen_result.get("error") == "timeout"
|
|
570
|
+
and _config and _config.stt.nudge_on_timeout
|
|
571
|
+
and _speech_queue):
|
|
572
|
+
nudge_text = "I didn't catch that. Go ahead and type it."
|
|
573
|
+
voice, _ = _registry.get_voice(name) if _registry else (None, False)
|
|
574
|
+
if voice and _tts_engine:
|
|
575
|
+
try:
|
|
576
|
+
result = _tts_engine.synthesize(nudge_text, voice, speed)
|
|
577
|
+
_audio_player.play(result.samples, result.sample_rate)
|
|
578
|
+
listen_result["nudge_spoken"] = True
|
|
579
|
+
except Exception:
|
|
580
|
+
pass
|
|
581
|
+
|
|
582
|
+
return {"speak": speak_result, "listen": listen_result}
|
|
583
|
+
finally:
|
|
584
|
+
_suppress_duck = False
|
|
585
|
+
if _speech_queue:
|
|
586
|
+
_speech_queue._duck_media = saved_queue_duck
|
|
587
|
+
if paused_apps:
|
|
588
|
+
asyncio.create_task(_deferred_unduck(paused_apps))
|
|
540
589
|
|
|
541
590
|
|
|
542
591
|
@mcp.tool()
|
|
Binary file
|
package/stt/mic_capture.py
CHANGED
|
@@ -8,18 +8,20 @@ import socket
|
|
|
8
8
|
import subprocess
|
|
9
9
|
import threading
|
|
10
10
|
import time
|
|
11
|
-
from typing import Optional
|
|
11
|
+
from typing import Callable, Optional
|
|
12
12
|
|
|
13
13
|
import numpy as np
|
|
14
14
|
|
|
15
15
|
from shared import MicCaptureError, STT_SAMPLE_RATE, get_logger
|
|
16
16
|
from stt.vad import VoiceActivityDetector
|
|
17
|
+
from tts.media_duck import is_bluetooth_output
|
|
17
18
|
|
|
18
19
|
logger = get_logger("stt.mic")
|
|
19
20
|
|
|
20
21
|
_CHUNK_SAMPLES = 512 # Silero VAD requires exactly 512-sample chunks at 16kHz
|
|
21
22
|
_CHUNK_BYTES = _CHUNK_SAMPLES * 4 # float32 = 4 bytes/sample → 2048 bytes/chunk
|
|
22
|
-
_ZERO_CHECK_CHUNKS =
|
|
23
|
+
_ZERO_CHECK_CHUNKS = 25 # ~800ms — exceeds CoreAudio cold-start latency (~544ms)
|
|
24
|
+
_ZERO_CHECK_CHUNKS_BT = 75 # ~2.4s — Bluetooth A2DP→HFP codec switch can take 1-2s
|
|
23
25
|
|
|
24
26
|
_AUDIO_SERVICE_SOCKET = "/tmp/voicesmith-audio.sock"
|
|
25
27
|
_LAUNCHAGENT_LABEL = "com.voicesmith-mcp.audio"
|
|
@@ -92,6 +94,7 @@ class MicCapture:
|
|
|
92
94
|
timeout: float = 15,
|
|
93
95
|
silence_threshold: float = 1.5,
|
|
94
96
|
cancel_event: Optional[asyncio.Event] = None,
|
|
97
|
+
on_ready: Optional[Callable[[], None]] = None,
|
|
95
98
|
) -> Optional[np.ndarray]:
|
|
96
99
|
"""Record audio from the microphone until silence is detected.
|
|
97
100
|
|
|
@@ -106,6 +109,9 @@ class MicCapture:
|
|
|
106
109
|
timeout: Maximum seconds to wait for speech (default 15).
|
|
107
110
|
silence_threshold: Seconds of silence before stopping (default 1.5).
|
|
108
111
|
cancel_event: Optional asyncio.Event to cancel recording.
|
|
112
|
+
on_ready: Optional callback invoked once the mic is live and
|
|
113
|
+
ready to capture. Called after hardware warm-up /
|
|
114
|
+
flush but before the VAD loop starts.
|
|
109
115
|
|
|
110
116
|
Returns:
|
|
111
117
|
Numpy array of recorded audio, or None if cancelled/timeout.
|
|
@@ -122,17 +128,17 @@ class MicCapture:
|
|
|
122
128
|
if platform.system() == "Darwin":
|
|
123
129
|
if _launchagent_available():
|
|
124
130
|
return await self._record_via_socket(
|
|
125
|
-
vad, timeout, silence_threshold, cancel_event
|
|
131
|
+
vad, timeout, silence_threshold, cancel_event, on_ready
|
|
126
132
|
)
|
|
127
133
|
# Legacy: subprocess fallback for installs without the LaunchAgent.
|
|
128
134
|
audio_capture_bin = _find_app_binary("audio-service") or _find_app_binary("audio-capture")
|
|
129
135
|
if audio_capture_bin:
|
|
130
136
|
return await self._record_via_subprocess(
|
|
131
|
-
audio_capture_bin, vad, timeout, silence_threshold, cancel_event
|
|
137
|
+
audio_capture_bin, vad, timeout, silence_threshold, cancel_event, on_ready
|
|
132
138
|
)
|
|
133
139
|
|
|
134
140
|
return await self._record_via_sounddevice(
|
|
135
|
-
vad, timeout, silence_threshold, cancel_event
|
|
141
|
+
vad, timeout, silence_threshold, cancel_event, on_ready
|
|
136
142
|
)
|
|
137
143
|
|
|
138
144
|
# ── LaunchAgent socket backend (macOS primary) ─────────────────────────────
|
|
@@ -143,6 +149,7 @@ class MicCapture:
|
|
|
143
149
|
timeout: float,
|
|
144
150
|
silence_threshold: float,
|
|
145
151
|
cancel_event: Optional[asyncio.Event],
|
|
152
|
+
on_ready: Optional[Callable[[], None]] = None,
|
|
146
153
|
) -> Optional[np.ndarray]:
|
|
147
154
|
"""Record via the VoiceSmithMCP audio LaunchAgent (Unix socket).
|
|
148
155
|
|
|
@@ -190,7 +197,10 @@ class MicCapture:
|
|
|
190
197
|
logger.info("Microphone recording started (audio-service socket)")
|
|
191
198
|
|
|
192
199
|
try:
|
|
193
|
-
|
|
200
|
+
# Flush 2 chunks (~64ms) for AudioQueue hardware settle.
|
|
201
|
+
self._flush_queue(2)
|
|
202
|
+
if on_ready:
|
|
203
|
+
on_ready()
|
|
194
204
|
return await self._run_vad_loop(vad, timeout, silence_threshold, cancel_event)
|
|
195
205
|
finally:
|
|
196
206
|
sock.close() # signals service to stop sending for this session
|
|
@@ -206,6 +216,7 @@ class MicCapture:
|
|
|
206
216
|
timeout: float,
|
|
207
217
|
silence_threshold: float,
|
|
208
218
|
cancel_event: Optional[asyncio.Event],
|
|
219
|
+
on_ready: Optional[Callable[[], None]] = None,
|
|
209
220
|
) -> Optional[np.ndarray]:
|
|
210
221
|
"""Record using a CoreAudio binary inside VoiceSmithMCP.app (legacy)."""
|
|
211
222
|
self._recording = True
|
|
@@ -239,7 +250,9 @@ class MicCapture:
|
|
|
239
250
|
reader_thread.start()
|
|
240
251
|
|
|
241
252
|
try:
|
|
242
|
-
self._flush_queue(
|
|
253
|
+
self._flush_queue(2)
|
|
254
|
+
if on_ready:
|
|
255
|
+
on_ready()
|
|
243
256
|
return await self._run_vad_loop(vad, timeout, silence_threshold, cancel_event)
|
|
244
257
|
finally:
|
|
245
258
|
proc.terminate()
|
|
@@ -258,6 +271,7 @@ class MicCapture:
|
|
|
258
271
|
timeout: float,
|
|
259
272
|
silence_threshold: float,
|
|
260
273
|
cancel_event: Optional[asyncio.Event],
|
|
274
|
+
on_ready: Optional[Callable[[], None]] = None,
|
|
261
275
|
) -> Optional[np.ndarray]:
|
|
262
276
|
"""Record using sounddevice / PortAudio (fallback for non-macOS)."""
|
|
263
277
|
try:
|
|
@@ -281,7 +295,9 @@ class MicCapture:
|
|
|
281
295
|
stream.start()
|
|
282
296
|
logger.info("Microphone recording started (sounddevice)")
|
|
283
297
|
|
|
284
|
-
self._flush_queue(
|
|
298
|
+
self._flush_queue(2, chunk_timeout=0.1)
|
|
299
|
+
if on_ready:
|
|
300
|
+
on_ready()
|
|
285
301
|
return await self._run_vad_loop(vad, timeout, silence_threshold, cancel_event)
|
|
286
302
|
except MicCaptureError:
|
|
287
303
|
raise
|
|
@@ -330,6 +346,8 @@ class MicCapture:
|
|
|
330
346
|
speech_detected = False
|
|
331
347
|
silence_duration = 0.0
|
|
332
348
|
zero_check_done = False
|
|
349
|
+
# Bluetooth A2DP→HFP switch delivers zeros for up to ~2s
|
|
350
|
+
zero_threshold = _ZERO_CHECK_CHUNKS_BT if is_bluetooth_output() else _ZERO_CHECK_CHUNKS
|
|
333
351
|
start_time = loop.time()
|
|
334
352
|
|
|
335
353
|
while not self._stop_flag:
|
|
@@ -354,7 +372,7 @@ class MicCapture:
|
|
|
354
372
|
|
|
355
373
|
chunks.append(chunk)
|
|
356
374
|
|
|
357
|
-
if not zero_check_done and len(chunks) >=
|
|
375
|
+
if not zero_check_done and len(chunks) >= zero_threshold:
|
|
358
376
|
zero_check_done = True
|
|
359
377
|
if all(np.max(np.abs(c)) == 0.0 for c in chunks):
|
|
360
378
|
raise MicCaptureError(self._zero_audio_message())
|
|
Binary file
|
|
Binary file
|
|
Binary file
|
|
Binary file
|
package/tts/audio_player.py
CHANGED
|
@@ -10,7 +10,6 @@ import time
|
|
|
10
10
|
import soundfile as sf
|
|
11
11
|
|
|
12
12
|
from shared import PlaybackResult, AudioPlayerError, AUDIO_LOCK_PATH, get_logger
|
|
13
|
-
from tts.media_duck import duck, unduck
|
|
14
13
|
|
|
15
14
|
logger = get_logger("tts.audio_player")
|
|
16
15
|
|
|
@@ -18,9 +17,8 @@ logger = get_logger("tts.audio_player")
|
|
|
18
17
|
class AudioPlayer:
|
|
19
18
|
"""Plays audio samples through an external player process."""
|
|
20
19
|
|
|
21
|
-
def __init__(self, player_command: str = "mpv"
|
|
20
|
+
def __init__(self, player_command: str = "mpv") -> None:
|
|
22
21
|
self._player_command = player_command
|
|
23
|
-
self._duck_media = duck_media
|
|
24
22
|
self._process: subprocess.Popen | None = None
|
|
25
23
|
|
|
26
24
|
# Detect platform fallback if player_command is not available
|
|
@@ -84,23 +82,19 @@ class AudioPlayer:
|
|
|
84
82
|
|
|
85
83
|
# Cross-session audio lock: prevents overlapping playback
|
|
86
84
|
# flock is kernel-managed — auto-released on crash, no stale locks
|
|
87
|
-
|
|
88
|
-
|
|
89
|
-
|
|
90
|
-
|
|
91
|
-
|
|
92
|
-
|
|
93
|
-
|
|
94
|
-
|
|
95
|
-
|
|
96
|
-
|
|
97
|
-
|
|
98
|
-
|
|
99
|
-
|
|
100
|
-
|
|
101
|
-
# Lock released when lock_file closes
|
|
102
|
-
finally:
|
|
103
|
-
unduck(paused_apps)
|
|
85
|
+
with open(AUDIO_LOCK_PATH, "w") as lock_file:
|
|
86
|
+
fcntl.flock(lock_file, fcntl.LOCK_EX)
|
|
87
|
+
|
|
88
|
+
start = time.perf_counter()
|
|
89
|
+
self._process = subprocess.Popen(
|
|
90
|
+
cmd,
|
|
91
|
+
stdout=subprocess.DEVNULL,
|
|
92
|
+
stderr=subprocess.DEVNULL,
|
|
93
|
+
)
|
|
94
|
+
self._process.wait()
|
|
95
|
+
duration_ms = (time.perf_counter() - start) * 1000
|
|
96
|
+
|
|
97
|
+
# Lock released when lock_file closes
|
|
104
98
|
|
|
105
99
|
if self._process.returncode != 0:
|
|
106
100
|
return PlaybackResult(
|
package/tts/kokoro_engine.py
CHANGED
|
@@ -2,6 +2,8 @@
|
|
|
2
2
|
|
|
3
3
|
import time
|
|
4
4
|
|
|
5
|
+
import numpy as np
|
|
6
|
+
|
|
5
7
|
from shared import SynthesisResult, TTSEngineError, ALL_VOICE_IDS, SAMPLE_RATE, get_logger
|
|
6
8
|
|
|
7
9
|
logger = get_logger("tts.kokoro")
|
|
@@ -47,6 +49,11 @@ class KokoroEngine:
|
|
|
47
49
|
samples, sample_rate = self._model.create(text, voice=voice_id, speed=speed)
|
|
48
50
|
synthesis_ms = (time.perf_counter() - start) * 1000
|
|
49
51
|
|
|
52
|
+
# Pad 100ms silence — kokoro-onnx trim() snaps to 512-sample hops
|
|
53
|
+
# (~21ms at 24kHz) which can clip the trailing edge of the last phoneme.
|
|
54
|
+
pad = int(sample_rate * 0.10)
|
|
55
|
+
samples = np.concatenate([samples, np.zeros(pad, dtype=samples.dtype)])
|
|
56
|
+
|
|
50
57
|
duration_ms = (len(samples) / sample_rate) * 1000
|
|
51
58
|
|
|
52
59
|
return SynthesisResult(
|
package/tts/media_duck.py
CHANGED
|
@@ -14,6 +14,8 @@ Usage:
|
|
|
14
14
|
unduck(paused) # resume only what we paused
|
|
15
15
|
"""
|
|
16
16
|
|
|
17
|
+
import ctypes
|
|
18
|
+
import ctypes.util
|
|
17
19
|
import platform
|
|
18
20
|
import subprocess
|
|
19
21
|
|
|
@@ -89,6 +91,66 @@ tell application "{target}"
|
|
|
89
91
|
end tell"""
|
|
90
92
|
|
|
91
93
|
|
|
94
|
+
# ── Bluetooth detection (macOS CoreAudio) ────────────────────────────────────
|
|
95
|
+
|
|
96
|
+
def is_bluetooth_output() -> bool:
    """Report whether the default audio output device is a Bluetooth device.

    Queries CoreAudio's ``AudioObjectGetPropertyData`` twice: first for the
    default output device of the system object, then for that device's
    transport type. The answer is ``False`` on any non-macOS platform, when
    the CoreAudio library cannot be located, when either query returns a
    non-zero status, or when any exception is raised along the way.

    Returns:
        True only if the transport type is Bluetooth or Bluetooth LE.
    """
    if platform.system() != "Darwin":
        return False

    def fourcc(tag: bytes) -> int:
        # CoreAudio selectors are four-character codes packed big-endian.
        return int.from_bytes(tag, "big")

    try:
        framework = ctypes.util.find_library("CoreAudio")
        if not framework:
            return False
        core_audio = ctypes.cdll.LoadLibrary(framework)

        class PropertyAddress(ctypes.Structure):
            # Mirrors CoreAudio's AudioObjectPropertyAddress layout.
            _fields_ = [
                ("mSelector", ctypes.c_uint32),
                ("mScope", ctypes.c_uint32),
                ("mElement", ctypes.c_uint32),
            ]

        system_object = 1                    # kAudioObjectSystemObject
        global_scope = fourcc(b"glob")       # kAudioObjectPropertyScopeGlobal
        main_element = 0                     # kAudioObjectPropertyElementMain
        default_output = fourcc(b"dOut")     # kAudioHardwarePropertyDefaultOutputDevice
        transport_type = fourcc(b"tran")     # kAudioDevicePropertyTransportType
        bluetooth_kinds = (
            fourcc(b"blue"),                 # kAudioDeviceTransportTypeBluetooth
            fourcc(b"blea"),                 # kAudioDeviceTransportTypeBluetoothLE
        )

        address = PropertyAddress(default_output, global_scope, main_element)

        def read_uint32(object_id: int):
            # Fetch one 4-byte property for the selector currently held in
            # `address`; None signals a non-zero CoreAudio status.
            value = ctypes.c_uint32(0)
            byte_count = ctypes.c_uint32(4)
            status = core_audio.AudioObjectGetPropertyData(
                object_id, ctypes.byref(address), 0, None,
                ctypes.byref(byte_count), ctypes.byref(value),
            )
            if status != 0:
                return None
            return value.value

        # Step 1: which device is the default output?
        output_device = read_uint32(system_object)
        if output_device is None:
            return False

        # Step 2: how is that device connected?
        address.mSelector = transport_type
        transport = read_uint32(output_device)
        if transport is None:
            return False

        return transport in bluetooth_kinds
    except Exception:
        return False
|
|
152
|
+
|
|
153
|
+
|
|
92
154
|
# ── Public API ────────────────────────────────────────────────────────────────
|
|
93
155
|
|
|
94
156
|
def duck() -> list[str]:
|
package/tts/speech_queue.py
CHANGED
|
@@ -6,6 +6,7 @@ import time
|
|
|
6
6
|
from shared import SpeakResult, MAX_CHUNK_LENGTH, get_logger
|
|
7
7
|
from tts.kokoro_engine import KokoroEngine
|
|
8
8
|
from tts.audio_player import AudioPlayer
|
|
9
|
+
from tts.media_duck import duck, unduck
|
|
9
10
|
|
|
10
11
|
logger = get_logger("tts.speech_queue")
|
|
11
12
|
|
|
@@ -13,9 +14,10 @@ logger = get_logger("tts.speech_queue")
|
|
|
13
14
|
class SpeechQueue:
|
|
14
15
|
"""Manages sequential speech synthesis and playback."""
|
|
15
16
|
|
|
16
|
-
def __init__(self, engine: KokoroEngine, player: AudioPlayer) -> None:
|
|
17
|
+
def __init__(self, engine: KokoroEngine, player: AudioPlayer, duck_media: bool = False) -> None:
|
|
17
18
|
self._engine = engine
|
|
18
19
|
self._player = player
|
|
20
|
+
self._duck_media = duck_media
|
|
19
21
|
self._queue: asyncio.Queue = asyncio.Queue()
|
|
20
22
|
self._speaking = False
|
|
21
23
|
|
|
@@ -60,6 +62,9 @@ class SpeechQueue:
|
|
|
60
62
|
total_duration_ms = 0.0
|
|
61
63
|
total_synthesis_ms = 0.0
|
|
62
64
|
|
|
65
|
+
# Duck media for the entire utterance, not per-chunk
|
|
66
|
+
paused_apps = duck() if self._duck_media else []
|
|
67
|
+
|
|
63
68
|
try:
|
|
64
69
|
chunks = self.chunk_text(text)
|
|
65
70
|
|
|
@@ -105,6 +110,7 @@ class SpeechQueue:
|
|
|
105
110
|
error=str(e),
|
|
106
111
|
)
|
|
107
112
|
finally:
|
|
113
|
+
unduck(paused_apps)
|
|
108
114
|
self._speaking = False
|
|
109
115
|
|
|
110
116
|
def stop(self) -> bool:
|