voicesmith-mcp 1.0.13 → 1.0.15
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/config.json +2 -1
- package/config.py +4 -0
- package/package.json +1 -1
- package/server.py +38 -7
- package/session_registry.py +46 -0
- package/stt/__pycache__/mic_capture.cpython-314.pyc +0 -0
- package/stt/mic_capture.py +323 -76
- package/tts/__pycache__/audio_player.cpython-314.pyc +0 -0
- package/tts/__pycache__/media_duck.cpython-314.pyc +0 -0
- package/tts/__pycache__/speech_queue.cpython-314.pyc +0 -0
- package/tts/audio_player.py +20 -14
- package/tts/media_duck.py +146 -0
- package/tts/speech_queue.py +1 -1
- package/voice_registry.py +16 -0
package/config.json
CHANGED
package/config.py
CHANGED
|
@@ -26,6 +26,7 @@ class TTSConfig:
|
|
|
26
26
|
default_voice: str = "am_eric"
|
|
27
27
|
default_speed: float = 1.0
|
|
28
28
|
audio_player: str = "mpv"
|
|
29
|
+
duck_media: bool = False
|
|
29
30
|
|
|
30
31
|
|
|
31
32
|
@dataclass
|
|
@@ -99,6 +100,8 @@ def load_config(config_path: Optional[Path] = None) -> AppConfig:
|
|
|
99
100
|
config.tts.default_speed = float(tts["default_speed"])
|
|
100
101
|
if "audio_player" in tts:
|
|
101
102
|
config.tts.audio_player = tts["audio_player"]
|
|
103
|
+
if "duck_media" in tts:
|
|
104
|
+
config.tts.duck_media = bool(tts["duck_media"])
|
|
102
105
|
|
|
103
106
|
# STT config
|
|
104
107
|
if "stt" in data:
|
|
@@ -179,6 +182,7 @@ def save_config(config: AppConfig, config_path: Optional[Path] = None) -> None:
|
|
|
179
182
|
"default_voice": config.tts.default_voice,
|
|
180
183
|
"default_speed": config.tts.default_speed,
|
|
181
184
|
"audio_player": config.tts.audio_player,
|
|
185
|
+
"duck_media": config.tts.duck_media,
|
|
182
186
|
},
|
|
183
187
|
"stt": {
|
|
184
188
|
"model_size": config.stt.model_size,
|
package/package.json
CHANGED
package/server.py
CHANGED
|
@@ -41,7 +41,7 @@ from shared import (
|
|
|
41
41
|
get_logger,
|
|
42
42
|
)
|
|
43
43
|
from config import load_config, save_config, get_config_path, AppConfig
|
|
44
|
-
from session_registry import register_session, unregister_session
|
|
44
|
+
from session_registry import register_session, rename_session, unregister_session
|
|
45
45
|
|
|
46
46
|
logger = get_logger("server")
|
|
47
47
|
|
|
@@ -82,7 +82,7 @@ def _init_tts(config: AppConfig):
|
|
|
82
82
|
|
|
83
83
|
try:
|
|
84
84
|
_tts_engine = KokoroEngine(config.tts.model_path, config.tts.voices_path)
|
|
85
|
-
_audio_player = AudioPlayer(config.tts.audio_player)
|
|
85
|
+
_audio_player = AudioPlayer(config.tts.audio_player, duck_media=config.tts.duck_media)
|
|
86
86
|
_speech_queue = SpeechQueue(_tts_engine, _audio_player)
|
|
87
87
|
logger.info("TTS subsystem initialized")
|
|
88
88
|
except TTSEngineError as e:
|
|
@@ -454,14 +454,18 @@ async def listen(timeout: float = 15, prompt: str = "", silence_threshold: float
|
|
|
454
454
|
logger.info(f"Listening (prompt: {prompt})")
|
|
455
455
|
|
|
456
456
|
try:
|
|
457
|
+
loop = asyncio.get_running_loop()
|
|
458
|
+
|
|
457
459
|
# Play ready sound so the user knows to start speaking
|
|
458
460
|
# Skip for push-to-talk (HTTP) — it has its own beep
|
|
459
461
|
if prompt != "push-to-talk":
|
|
460
|
-
loop = asyncio.get_event_loop()
|
|
461
462
|
await loop.run_in_executor(None, _play_ready_sound)
|
|
462
463
|
|
|
463
464
|
start = time.perf_counter()
|
|
464
465
|
|
|
466
|
+
# Reset VAD state from any prior recording (LSTM hidden state + context)
|
|
467
|
+
_vad.reset()
|
|
468
|
+
|
|
465
469
|
# Record audio with VAD
|
|
466
470
|
audio = await _mic_capture.record(
|
|
467
471
|
vad=_vad,
|
|
@@ -479,7 +483,6 @@ async def listen(timeout: float = 15, prompt: str = "", silence_threshold: float
|
|
|
479
483
|
recording_ms = (time.perf_counter() - start) * 1000
|
|
480
484
|
|
|
481
485
|
# Transcribe
|
|
482
|
-
loop = asyncio.get_event_loop()
|
|
483
486
|
result = await loop.run_in_executor(
|
|
484
487
|
None, _stt_engine.transcribe, audio, STT_SAMPLE_RATE
|
|
485
488
|
)
|
|
@@ -565,6 +568,10 @@ async def get_voice_registry() -> dict:
|
|
|
565
568
|
async def set_voice(name: str, voice: str) -> dict:
|
|
566
569
|
"""Assign or reassign a voice to an agent name.
|
|
567
570
|
|
|
571
|
+
Also renames the session so name and voice always match.
|
|
572
|
+
The name is derived from the voice ID (e.g., "am_fenrir" -> "Fenrir").
|
|
573
|
+
If the derived name is taken by another session, returns name_occupied error.
|
|
574
|
+
|
|
568
575
|
Args:
|
|
569
576
|
name: Agent name to assign.
|
|
570
577
|
voice: Kokoro voice ID (e.g., "am_eric"). Must be valid.
|
|
@@ -579,17 +586,41 @@ async def set_voice(name: str, voice: str) -> dict:
|
|
|
579
586
|
"message": f"Voice '{voice}' not found. Use list_voices to see available options.",
|
|
580
587
|
}
|
|
581
588
|
|
|
582
|
-
|
|
589
|
+
# Derive canonical name from voice ID (e.g., "am_fenrir" -> "Fenrir")
|
|
590
|
+
# The voice ID format is {prefix}_{name}, so split on underscore and capitalize
|
|
591
|
+
parts = voice.split("_", 1)
|
|
592
|
+
new_name = parts[1].capitalize() if len(parts) == 2 else name
|
|
593
|
+
|
|
594
|
+
old_name = _session_info["name"] if _session_info else name
|
|
595
|
+
|
|
596
|
+
# Update sessions.json with conflict check
|
|
597
|
+
if _session_info:
|
|
598
|
+
try:
|
|
599
|
+
updated = rename_session(os.getpid(), new_name, voice)
|
|
600
|
+
if updated:
|
|
601
|
+
_session_info.update(updated)
|
|
602
|
+
except ValueError:
|
|
603
|
+
return {
|
|
604
|
+
"success": False,
|
|
605
|
+
"error": "name_occupied",
|
|
606
|
+
"message": f"'{new_name}' is occupied by another session.",
|
|
607
|
+
}
|
|
608
|
+
|
|
609
|
+
# Update voice registry (remove old entry, add new)
|
|
610
|
+
_registry.rename_voice(old_name, new_name, voice)
|
|
583
611
|
|
|
584
612
|
# Persist last voice name so it survives session restart / resume
|
|
585
613
|
if _config is not None:
|
|
586
|
-
_config.last_voice_name =
|
|
614
|
+
_config.last_voice_name = new_name
|
|
587
615
|
try:
|
|
588
616
|
save_config(_config)
|
|
589
617
|
except Exception as e:
|
|
590
618
|
logger.warning(f"Failed to persist last_voice_name: {e}")
|
|
591
619
|
|
|
592
|
-
|
|
620
|
+
result = {"success": True, "name": new_name, "voice": voice}
|
|
621
|
+
if old_name != new_name:
|
|
622
|
+
result["previous_name"] = old_name
|
|
623
|
+
return result
|
|
593
624
|
|
|
594
625
|
|
|
595
626
|
@mcp.tool()
|
package/session_registry.py
CHANGED
|
@@ -258,6 +258,52 @@ def register_session(
|
|
|
258
258
|
return session
|
|
259
259
|
|
|
260
260
|
|
|
261
|
+
def rename_session(pid: int, new_name: str, new_voice: str) -> Optional[dict]:
|
|
262
|
+
"""Rename this server's session in the registry.
|
|
263
|
+
|
|
264
|
+
Updates the name and voice fields for the entry matching pid.
|
|
265
|
+
Returns the updated session dict, or None if PID not found.
|
|
266
|
+
Raises ValueError if new_name is taken by another active session.
|
|
267
|
+
"""
|
|
268
|
+
path = _sessions_path()
|
|
269
|
+
if not path.exists():
|
|
270
|
+
return None
|
|
271
|
+
|
|
272
|
+
try:
|
|
273
|
+
with open(path, "r+") as f:
|
|
274
|
+
fcntl.flock(f, fcntl.LOCK_EX)
|
|
275
|
+
sessions = _read_sessions(path)
|
|
276
|
+
sessions = _clean_stale(sessions)
|
|
277
|
+
|
|
278
|
+
# Find our entry
|
|
279
|
+
our_entry = None
|
|
280
|
+
for s in sessions:
|
|
281
|
+
if s.get("pid") == pid:
|
|
282
|
+
our_entry = s
|
|
283
|
+
break
|
|
284
|
+
|
|
285
|
+
if our_entry is None:
|
|
286
|
+
return None
|
|
287
|
+
|
|
288
|
+
# Check if new_name is taken by another session
|
|
289
|
+
if new_name != our_entry["name"]:
|
|
290
|
+
for s in sessions:
|
|
291
|
+
if s.get("name") == new_name and s.get("pid") != pid:
|
|
292
|
+
raise ValueError(
|
|
293
|
+
f"'{new_name}' is occupied by another session (pid {s.get('pid')})"
|
|
294
|
+
)
|
|
295
|
+
|
|
296
|
+
our_entry["name"] = new_name
|
|
297
|
+
our_entry["voice"] = new_voice
|
|
298
|
+
_write_sessions(path, sessions)
|
|
299
|
+
return dict(our_entry)
|
|
300
|
+
except ValueError:
|
|
301
|
+
raise
|
|
302
|
+
except OSError as e:
|
|
303
|
+
logger.warning(f"Failed to rename session: {e}")
|
|
304
|
+
return None
|
|
305
|
+
|
|
306
|
+
|
|
261
307
|
def unregister_session() -> None:
|
|
262
308
|
"""Remove this server's session from the registry."""
|
|
263
309
|
path = _sessions_path()
|
|
Binary file
|
package/stt/mic_capture.py
CHANGED
|
@@ -1,7 +1,12 @@
|
|
|
1
1
|
"""Microphone capture with VAD-controlled recording."""
|
|
2
2
|
|
|
3
3
|
import asyncio
|
|
4
|
+
import os
|
|
5
|
+
import platform
|
|
4
6
|
import queue
|
|
7
|
+
import socket
|
|
8
|
+
import subprocess
|
|
9
|
+
import threading
|
|
5
10
|
import time
|
|
6
11
|
from typing import Optional
|
|
7
12
|
|
|
@@ -12,6 +17,65 @@ from stt.vad import VoiceActivityDetector
|
|
|
12
17
|
|
|
13
18
|
logger = get_logger("stt.mic")
|
|
14
19
|
|
|
20
|
+
_CHUNK_SAMPLES = 512 # Silero VAD requires exactly 512-sample chunks at 16kHz
|
|
21
|
+
_CHUNK_BYTES = _CHUNK_SAMPLES * 4 # float32 = 4 bytes/sample → 2048 bytes/chunk
|
|
22
|
+
_ZERO_CHECK_CHUNKS = 10 # ~320ms of silence before detecting TCC denial
|
|
23
|
+
|
|
24
|
+
_AUDIO_SERVICE_SOCKET = "/tmp/voicesmith-audio.sock"
|
|
25
|
+
_LAUNCHAGENT_LABEL = "com.voicesmith-mcp.audio"
|
|
26
|
+
_LAUNCHAGENT_PLIST = os.path.expanduser(
|
|
27
|
+
f"~/Library/LaunchAgents/{_LAUNCHAGENT_LABEL}.plist"
|
|
28
|
+
)
|
|
29
|
+
|
|
30
|
+
|
|
31
|
+
def _find_app_binary(name: str) -> Optional[str]:
|
|
32
|
+
"""Return path to a named binary inside VoiceSmithMCP.app, or None."""
|
|
33
|
+
install_dir = os.path.dirname(os.path.dirname(os.path.abspath(__file__)))
|
|
34
|
+
binary = os.path.join(install_dir, "VoiceSmithMCP.app", "Contents", "MacOS", name)
|
|
35
|
+
return binary if os.path.isfile(binary) and os.access(binary, os.X_OK) else None
|
|
36
|
+
|
|
37
|
+
|
|
38
|
+
def _launchagent_available() -> bool:
|
|
39
|
+
"""Return True if the VoiceSmithMCP audio LaunchAgent plist is installed."""
|
|
40
|
+
return os.path.isfile(_LAUNCHAGENT_PLIST)
|
|
41
|
+
|
|
42
|
+
|
|
43
|
+
def _ensure_audio_service_running() -> None:
|
|
44
|
+
"""Start the audio LaunchAgent if it is not already running.
|
|
45
|
+
|
|
46
|
+
The service is started via launchctl. We then wait up to 3 seconds for
|
|
47
|
+
the Unix socket to appear, which signals the service is ready to accept
|
|
48
|
+
connections.
|
|
49
|
+
"""
|
|
50
|
+
# If the socket exists and is connectable, service is already running.
|
|
51
|
+
if _socket_ready():
|
|
52
|
+
return
|
|
53
|
+
|
|
54
|
+
logger.info("Starting audio service via launchctl")
|
|
55
|
+
try:
|
|
56
|
+
subprocess.run(
|
|
57
|
+
["launchctl", "start", _LAUNCHAGENT_LABEL],
|
|
58
|
+
capture_output=True,
|
|
59
|
+
timeout=5,
|
|
60
|
+
)
|
|
61
|
+
except Exception as e:
|
|
62
|
+
raise MicCaptureError(f"Failed to start audio service: {e}") from e
|
|
63
|
+
|
|
64
|
+
# Wait up to 3 s for the socket to appear.
|
|
65
|
+
for _ in range(30):
|
|
66
|
+
if _socket_ready():
|
|
67
|
+
return
|
|
68
|
+
time.sleep(0.1)
|
|
69
|
+
raise MicCaptureError(
|
|
70
|
+
"VoiceSmith audio service did not start in time. "
|
|
71
|
+
f"Check {_LAUNCHAGENT_PLIST} and launchctl output."
|
|
72
|
+
)
|
|
73
|
+
|
|
74
|
+
|
|
75
|
+
def _socket_ready() -> bool:
|
|
76
|
+
"""Return True if the audio service socket file exists."""
|
|
77
|
+
return os.path.exists(_AUDIO_SERVICE_SOCKET)
|
|
78
|
+
|
|
15
79
|
|
|
16
80
|
class MicCapture:
|
|
17
81
|
"""Microphone capture with voice activity detection."""
|
|
@@ -31,7 +95,11 @@ class MicCapture:
|
|
|
31
95
|
) -> Optional[np.ndarray]:
|
|
32
96
|
"""Record audio from the microphone until silence is detected.
|
|
33
97
|
|
|
34
|
-
|
|
98
|
+
On macOS, prefers the audio-service LaunchAgent backend which runs
|
|
99
|
+
under launchd (ppid=1), ensuring macOS TCC attributes mic permission
|
|
100
|
+
to VoiceSmithMCP.app rather than to the user's terminal app.
|
|
101
|
+
Falls back to the audio-capture subprocess if the LaunchAgent is not
|
|
102
|
+
installed, and to sounddevice on non-macOS systems.
|
|
35
103
|
|
|
36
104
|
Args:
|
|
37
105
|
vad: VoiceActivityDetector instance for speech detection.
|
|
@@ -48,6 +116,150 @@ class MicCapture:
|
|
|
48
116
|
if self._recording:
|
|
49
117
|
raise MicCaptureError("Another recording is already in progress")
|
|
50
118
|
|
|
119
|
+
# Reset VAD state between recordings.
|
|
120
|
+
vad.reset()
|
|
121
|
+
|
|
122
|
+
if platform.system() == "Darwin":
|
|
123
|
+
if _launchagent_available():
|
|
124
|
+
return await self._record_via_socket(
|
|
125
|
+
vad, timeout, silence_threshold, cancel_event
|
|
126
|
+
)
|
|
127
|
+
# Legacy: subprocess fallback for installs without the LaunchAgent.
|
|
128
|
+
audio_capture_bin = _find_app_binary("audio-service") or _find_app_binary("audio-capture")
|
|
129
|
+
if audio_capture_bin:
|
|
130
|
+
return await self._record_via_subprocess(
|
|
131
|
+
audio_capture_bin, vad, timeout, silence_threshold, cancel_event
|
|
132
|
+
)
|
|
133
|
+
|
|
134
|
+
return await self._record_via_sounddevice(
|
|
135
|
+
vad, timeout, silence_threshold, cancel_event
|
|
136
|
+
)
|
|
137
|
+
|
|
138
|
+
# ── LaunchAgent socket backend (macOS primary) ─────────────────────────────
|
|
139
|
+
|
|
140
|
+
async def _record_via_socket(
|
|
141
|
+
self,
|
|
142
|
+
vad: VoiceActivityDetector,
|
|
143
|
+
timeout: float,
|
|
144
|
+
silence_threshold: float,
|
|
145
|
+
cancel_event: Optional[asyncio.Event],
|
|
146
|
+
) -> Optional[np.ndarray]:
|
|
147
|
+
"""Record via the VoiceSmithMCP audio LaunchAgent (Unix socket).
|
|
148
|
+
|
|
149
|
+
The LaunchAgent runs under launchd so macOS TCC attributes mic access
|
|
150
|
+
to com.voicesmith-mcp.launcher, not to the parent terminal app.
|
|
151
|
+
"""
|
|
152
|
+
loop = asyncio.get_running_loop()
|
|
153
|
+
|
|
154
|
+
# Ensure the service is up and the socket is ready.
|
|
155
|
+
try:
|
|
156
|
+
await loop.run_in_executor(None, _ensure_audio_service_running)
|
|
157
|
+
except MicCaptureError:
|
|
158
|
+
raise
|
|
159
|
+
except Exception as e:
|
|
160
|
+
raise MicCaptureError(f"Audio service error: {e}") from e
|
|
161
|
+
|
|
162
|
+
# Open socket connection.
|
|
163
|
+
sock = socket.socket(socket.AF_UNIX, socket.SOCK_STREAM)
|
|
164
|
+
try:
|
|
165
|
+
sock.connect(_AUDIO_SERVICE_SOCKET)
|
|
166
|
+
except OSError as e:
|
|
167
|
+
sock.close()
|
|
168
|
+
raise MicCaptureError(f"Cannot connect to audio service: {e}") from e
|
|
169
|
+
|
|
170
|
+
self._recording = True
|
|
171
|
+
self._stop_flag = False
|
|
172
|
+
self._audio_queue = queue.Queue()
|
|
173
|
+
|
|
174
|
+
def _reader() -> None:
|
|
175
|
+
"""Background thread: reads socket chunks → audio_queue."""
|
|
176
|
+
try:
|
|
177
|
+
while True:
|
|
178
|
+
data = b""
|
|
179
|
+
while len(data) < _CHUNK_BYTES:
|
|
180
|
+
got = sock.recv(_CHUNK_BYTES - len(data))
|
|
181
|
+
if not got:
|
|
182
|
+
return # service closed connection
|
|
183
|
+
data += got
|
|
184
|
+
self._audio_queue.put(np.frombuffer(data, dtype=np.float32).copy())
|
|
185
|
+
except Exception as exc:
|
|
186
|
+
logger.debug(f"socket reader thread exiting: {exc}")
|
|
187
|
+
|
|
188
|
+
reader_thread = threading.Thread(target=_reader, daemon=True)
|
|
189
|
+
reader_thread.start()
|
|
190
|
+
logger.info("Microphone recording started (audio-service socket)")
|
|
191
|
+
|
|
192
|
+
try:
|
|
193
|
+
self._flush_queue(int(0.2 * self._sample_rate / _CHUNK_SAMPLES))
|
|
194
|
+
return await self._run_vad_loop(vad, timeout, silence_threshold, cancel_event)
|
|
195
|
+
finally:
|
|
196
|
+
sock.close() # signals service to stop sending for this session
|
|
197
|
+
reader_thread.join(timeout=1)
|
|
198
|
+
self._recording = False
|
|
199
|
+
|
|
200
|
+
# ── Subprocess backend (macOS legacy fallback) ─────────────────────────────
|
|
201
|
+
|
|
202
|
+
async def _record_via_subprocess(
|
|
203
|
+
self,
|
|
204
|
+
binary: str,
|
|
205
|
+
vad: VoiceActivityDetector,
|
|
206
|
+
timeout: float,
|
|
207
|
+
silence_threshold: float,
|
|
208
|
+
cancel_event: Optional[asyncio.Event],
|
|
209
|
+
) -> Optional[np.ndarray]:
|
|
210
|
+
"""Record using a CoreAudio binary inside VoiceSmithMCP.app (legacy)."""
|
|
211
|
+
self._recording = True
|
|
212
|
+
self._stop_flag = False
|
|
213
|
+
self._audio_queue = queue.Queue()
|
|
214
|
+
|
|
215
|
+
try:
|
|
216
|
+
proc = subprocess.Popen(
|
|
217
|
+
[binary],
|
|
218
|
+
stdout=subprocess.PIPE,
|
|
219
|
+
stderr=subprocess.PIPE,
|
|
220
|
+
close_fds=True,
|
|
221
|
+
)
|
|
222
|
+
except Exception as e:
|
|
223
|
+
self._recording = False
|
|
224
|
+
raise MicCaptureError(f"Failed to start audio binary: {e}") from e
|
|
225
|
+
|
|
226
|
+
logger.info("Microphone recording started (subprocess fallback)")
|
|
227
|
+
|
|
228
|
+
def _reader() -> None:
|
|
229
|
+
try:
|
|
230
|
+
while True:
|
|
231
|
+
data = proc.stdout.read(_CHUNK_BYTES)
|
|
232
|
+
if not data or len(data) < _CHUNK_BYTES:
|
|
233
|
+
break
|
|
234
|
+
self._audio_queue.put(np.frombuffer(data, dtype=np.float32).copy())
|
|
235
|
+
except Exception as exc:
|
|
236
|
+
logger.debug(f"subprocess reader thread exiting: {exc}")
|
|
237
|
+
|
|
238
|
+
reader_thread = threading.Thread(target=_reader, daemon=True)
|
|
239
|
+
reader_thread.start()
|
|
240
|
+
|
|
241
|
+
try:
|
|
242
|
+
self._flush_queue(int(0.2 * self._sample_rate / _CHUNK_SAMPLES))
|
|
243
|
+
return await self._run_vad_loop(vad, timeout, silence_threshold, cancel_event)
|
|
244
|
+
finally:
|
|
245
|
+
proc.terminate()
|
|
246
|
+
try:
|
|
247
|
+
proc.wait(timeout=1)
|
|
248
|
+
except Exception:
|
|
249
|
+
proc.kill()
|
|
250
|
+
reader_thread.join(timeout=1)
|
|
251
|
+
self._recording = False
|
|
252
|
+
|
|
253
|
+
# ── sounddevice backend (non-macOS fallback) ───────────────────────────────
|
|
254
|
+
|
|
255
|
+
async def _record_via_sounddevice(
|
|
256
|
+
self,
|
|
257
|
+
vad: VoiceActivityDetector,
|
|
258
|
+
timeout: float,
|
|
259
|
+
silence_threshold: float,
|
|
260
|
+
cancel_event: Optional[asyncio.Event],
|
|
261
|
+
) -> Optional[np.ndarray]:
|
|
262
|
+
"""Record using sounddevice / PortAudio (fallback for non-macOS)."""
|
|
51
263
|
try:
|
|
52
264
|
import sounddevice as sd
|
|
53
265
|
except Exception as e:
|
|
@@ -56,15 +268,6 @@ class MicCapture:
|
|
|
56
268
|
self._recording = True
|
|
57
269
|
self._stop_flag = False
|
|
58
270
|
self._audio_queue = queue.Queue()
|
|
59
|
-
chunks: list[np.ndarray] = []
|
|
60
|
-
speech_detected = False
|
|
61
|
-
silence_duration = 0.0
|
|
62
|
-
loop = asyncio.get_event_loop()
|
|
63
|
-
|
|
64
|
-
# Reset VAD state — the LSTM hidden state and context window must
|
|
65
|
-
# be cleared between recordings to avoid stale state from previous
|
|
66
|
-
# audio affecting speech detection.
|
|
67
|
-
vad.reset()
|
|
68
271
|
|
|
69
272
|
stream = None
|
|
70
273
|
try:
|
|
@@ -72,94 +275,138 @@ class MicCapture:
|
|
|
72
275
|
samplerate=self._sample_rate,
|
|
73
276
|
channels=1,
|
|
74
277
|
dtype="float32",
|
|
75
|
-
blocksize=
|
|
278
|
+
blocksize=_CHUNK_SAMPLES,
|
|
76
279
|
callback=self._audio_callback,
|
|
77
280
|
)
|
|
78
281
|
stream.start()
|
|
79
|
-
logger.info("Microphone recording started")
|
|
80
|
-
|
|
81
|
-
# Discard the first ~200ms of audio to avoid picking up residual
|
|
82
|
-
# speaker output (Tink sound or TTS playback that just finished).
|
|
83
|
-
# This prevents VAD from detecting speaker bleed as "speech" and
|
|
84
|
-
# then cutting off when the bleed stops.
|
|
85
|
-
flush_chunks = int(0.2 * self._sample_rate / 512) # ~6 chunks
|
|
86
|
-
for _ in range(flush_chunks):
|
|
87
|
-
try:
|
|
88
|
-
self._audio_queue.get(timeout=0.1)
|
|
89
|
-
except queue.Empty:
|
|
90
|
-
break
|
|
91
|
-
|
|
92
|
-
start_time = asyncio.get_event_loop().time()
|
|
93
|
-
|
|
94
|
-
while not self._stop_flag:
|
|
95
|
-
# Check cancellation
|
|
96
|
-
if cancel_event and cancel_event.is_set():
|
|
97
|
-
logger.info("Recording cancelled by event")
|
|
98
|
-
break
|
|
99
|
-
|
|
100
|
-
# Check timeout
|
|
101
|
-
elapsed = asyncio.get_event_loop().time() - start_time
|
|
102
|
-
if elapsed >= timeout:
|
|
103
|
-
if not speech_detected:
|
|
104
|
-
logger.info("Recording timed out with no speech detected")
|
|
105
|
-
else:
|
|
106
|
-
logger.info("Recording timed out")
|
|
107
|
-
break
|
|
108
|
-
|
|
109
|
-
# Get audio chunk from queue
|
|
110
|
-
try:
|
|
111
|
-
chunk = await loop.run_in_executor(
|
|
112
|
-
None, self._audio_queue.get, True, 0.1
|
|
113
|
-
)
|
|
114
|
-
except queue.Empty:
|
|
115
|
-
continue
|
|
116
|
-
|
|
117
|
-
chunks.append(chunk)
|
|
118
|
-
is_speech = vad.is_speech(chunk)
|
|
119
|
-
|
|
120
|
-
if is_speech:
|
|
121
|
-
speech_detected = True
|
|
122
|
-
silence_duration = 0.0
|
|
123
|
-
elif speech_detected:
|
|
124
|
-
# Count silence after speech was detected
|
|
125
|
-
chunk_duration = len(chunk) / self._sample_rate
|
|
126
|
-
silence_duration += chunk_duration
|
|
127
|
-
if silence_duration >= silence_threshold:
|
|
128
|
-
logger.info(
|
|
129
|
-
f"Silence threshold reached ({silence_threshold}s), stopping"
|
|
130
|
-
)
|
|
131
|
-
break
|
|
132
|
-
|
|
133
|
-
if not chunks or not speech_detected:
|
|
134
|
-
return None
|
|
135
|
-
|
|
136
|
-
return np.concatenate(chunks).flatten()
|
|
282
|
+
logger.info("Microphone recording started (sounddevice)")
|
|
137
283
|
|
|
284
|
+
self._flush_queue(int(0.2 * self._sample_rate / _CHUNK_SAMPLES), chunk_timeout=0.1)
|
|
285
|
+
return await self._run_vad_loop(vad, timeout, silence_threshold, cancel_event)
|
|
138
286
|
except MicCaptureError:
|
|
139
287
|
raise
|
|
140
288
|
except Exception as e:
|
|
141
289
|
raise MicCaptureError(f"Recording failed: {e}") from e
|
|
142
290
|
finally:
|
|
143
|
-
# Safely tear down the audio stream. The CoreAudio IO thread may
|
|
144
|
-
# still be executing the callback when we call stop(). Wait briefly
|
|
145
|
-
# between stop() and close() to let the IO thread finish — this
|
|
146
|
-
# prevents the segfault in libffi/PortAudio where the callback
|
|
147
|
-
# dereferences freed memory.
|
|
148
291
|
if stream is not None:
|
|
149
292
|
try:
|
|
150
293
|
stream.stop()
|
|
151
|
-
time.sleep(0.05)
|
|
294
|
+
time.sleep(0.05)
|
|
152
295
|
stream.close()
|
|
153
296
|
except Exception as e:
|
|
154
297
|
logger.debug(f"Stream teardown: {e}")
|
|
155
298
|
self._recording = False
|
|
156
299
|
|
|
157
|
-
|
|
300
|
+
# ── Shared helpers ─────────────────────────────────────────────────────────
|
|
301
|
+
|
|
302
|
+
def _flush_queue(self, n_chunks: int, chunk_timeout: float = 0.15) -> None:
|
|
303
|
+
"""Discard the first n_chunks from the audio queue (drops speaker bleed)."""
|
|
304
|
+
for _ in range(n_chunks):
|
|
305
|
+
try:
|
|
306
|
+
self._audio_queue.get(timeout=chunk_timeout)
|
|
307
|
+
except queue.Empty:
|
|
308
|
+
break
|
|
309
|
+
|
|
310
|
+
# ── Shared VAD loop ────────────────────────────────────────────────────────
|
|
311
|
+
|
|
312
|
+
async def _run_vad_loop(
|
|
313
|
+
self,
|
|
314
|
+
vad: VoiceActivityDetector,
|
|
315
|
+
timeout: float,
|
|
316
|
+
silence_threshold: float,
|
|
317
|
+
cancel_event: Optional[asyncio.Event],
|
|
318
|
+
) -> Optional[np.ndarray]:
|
|
319
|
+
"""VAD recording loop — shared by all capture backends.
|
|
320
|
+
|
|
321
|
+
Reads 512-sample float32 chunks from self._audio_queue, runs Silero VAD
|
|
322
|
+
on each, and returns when silence_threshold is exceeded after speech,
|
|
323
|
+
timeout elapses, or cancel_event fires.
|
|
324
|
+
|
|
325
|
+
Raises:
|
|
326
|
+
MicCaptureError: If audio is all-zeros (TCC denial detected).
|
|
327
|
+
"""
|
|
328
|
+
loop = asyncio.get_running_loop()
|
|
329
|
+
chunks: list[np.ndarray] = []
|
|
330
|
+
speech_detected = False
|
|
331
|
+
silence_duration = 0.0
|
|
332
|
+
zero_check_done = False
|
|
333
|
+
start_time = loop.time()
|
|
334
|
+
|
|
335
|
+
while not self._stop_flag:
|
|
336
|
+
if cancel_event and cancel_event.is_set():
|
|
337
|
+
logger.info("Recording cancelled by event")
|
|
338
|
+
break
|
|
339
|
+
|
|
340
|
+
elapsed = loop.time() - start_time
|
|
341
|
+
if elapsed >= timeout:
|
|
342
|
+
if not speech_detected:
|
|
343
|
+
logger.info("Recording timed out with no speech detected")
|
|
344
|
+
else:
|
|
345
|
+
logger.info("Recording timed out")
|
|
346
|
+
break
|
|
347
|
+
|
|
348
|
+
try:
|
|
349
|
+
chunk = await loop.run_in_executor(
|
|
350
|
+
None, self._audio_queue.get, True, 0.1
|
|
351
|
+
)
|
|
352
|
+
except queue.Empty:
|
|
353
|
+
continue
|
|
354
|
+
|
|
355
|
+
chunks.append(chunk)
|
|
356
|
+
|
|
357
|
+
if not zero_check_done and len(chunks) >= _ZERO_CHECK_CHUNKS:
|
|
358
|
+
zero_check_done = True
|
|
359
|
+
if all(np.max(np.abs(c)) == 0.0 for c in chunks):
|
|
360
|
+
raise MicCaptureError(self._zero_audio_message())
|
|
361
|
+
|
|
362
|
+
is_speech = vad.is_speech(chunk)
|
|
363
|
+
|
|
364
|
+
if is_speech:
|
|
365
|
+
speech_detected = True
|
|
366
|
+
silence_duration = 0.0
|
|
367
|
+
elif speech_detected:
|
|
368
|
+
silence_duration += len(chunk) / self._sample_rate
|
|
369
|
+
if silence_duration >= silence_threshold:
|
|
370
|
+
logger.info(
|
|
371
|
+
f"Silence threshold reached ({silence_threshold}s), stopping"
|
|
372
|
+
)
|
|
373
|
+
break
|
|
374
|
+
|
|
375
|
+
if not chunks or not speech_detected:
|
|
376
|
+
return None
|
|
377
|
+
|
|
378
|
+
return np.concatenate(chunks).flatten()
|
|
379
|
+
|
|
380
|
+
# ── sounddevice callback ───────────────────────────────────────────────────
|
|
381
|
+
|
|
382
|
+
def _audio_callback(self, indata, frames, time_info, status) -> None:
|
|
158
383
|
"""Sounddevice callback — pushes audio chunks to the queue."""
|
|
159
384
|
if status:
|
|
160
385
|
logger.warning(f"Audio callback status: {status}")
|
|
161
386
|
self._audio_queue.put(indata.copy())
|
|
162
387
|
|
|
388
|
+
# ── Error message ──────────────────────────────────────────────────────────
|
|
389
|
+
|
|
390
|
+
@staticmethod
|
|
391
|
+
def _zero_audio_message() -> str:
|
|
392
|
+
"""Build an error message for zero-amplitude mic input."""
|
|
393
|
+
msg = (
|
|
394
|
+
"Microphone is returning silent audio. "
|
|
395
|
+
"The audio stream opened successfully but every sample is zero."
|
|
396
|
+
)
|
|
397
|
+
if platform.system() == "Darwin":
|
|
398
|
+
msg += (
|
|
399
|
+
"\n\nmacOS is blocking mic access. The VoiceSmithMCP audio service "
|
|
400
|
+
"may not have been granted Microphone permission yet. "
|
|
401
|
+
"Check System Settings > Privacy & Security > Microphone and "
|
|
402
|
+
"ensure VoiceSmithMCP is enabled.\n\n"
|
|
403
|
+
"If VoiceSmithMCP is not listed, re-run the installer:\n"
|
|
404
|
+
" ./install.sh"
|
|
405
|
+
)
|
|
406
|
+
return msg
|
|
407
|
+
|
|
408
|
+
# ── Properties / control ──────────────────────────────────────────────────
|
|
409
|
+
|
|
163
410
|
@property
|
|
164
411
|
def is_recording(self) -> bool:
|
|
165
412
|
"""Return whether the microphone is currently recording."""
|
|
Binary file
|
|
Binary file
|
|
Binary file
|
package/tts/audio_player.py
CHANGED
|
@@ -10,6 +10,7 @@ import time
|
|
|
10
10
|
import soundfile as sf
|
|
11
11
|
|
|
12
12
|
from shared import PlaybackResult, AudioPlayerError, AUDIO_LOCK_PATH, get_logger
|
|
13
|
+
from tts.media_duck import duck, unduck
|
|
13
14
|
|
|
14
15
|
logger = get_logger("tts.audio_player")
|
|
15
16
|
|
|
@@ -17,8 +18,9 @@ logger = get_logger("tts.audio_player")
|
|
|
17
18
|
class AudioPlayer:
|
|
18
19
|
"""Plays audio samples through an external player process."""
|
|
19
20
|
|
|
20
|
-
def __init__(self, player_command: str = "mpv") -> None:
|
|
21
|
+
def __init__(self, player_command: str = "mpv", duck_media: bool = False) -> None:
|
|
21
22
|
self._player_command = player_command
|
|
23
|
+
self._duck_media = duck_media
|
|
22
24
|
self._process: subprocess.Popen | None = None
|
|
23
25
|
|
|
24
26
|
# Detect platform fallback if player_command is not available
|
|
@@ -82,19 +84,23 @@ class AudioPlayer:
|
|
|
82
84
|
|
|
83
85
|
# Cross-session audio lock: prevents overlapping playback
|
|
84
86
|
# flock is kernel-managed — auto-released on crash, no stale locks
|
|
85
|
-
|
|
86
|
-
|
|
87
|
-
|
|
88
|
-
|
|
89
|
-
|
|
90
|
-
|
|
91
|
-
|
|
92
|
-
|
|
93
|
-
|
|
94
|
-
|
|
95
|
-
|
|
96
|
-
|
|
97
|
-
|
|
87
|
+
paused_apps = duck() if self._duck_media else []
|
|
88
|
+
try:
|
|
89
|
+
with open(AUDIO_LOCK_PATH, "w") as lock_file:
|
|
90
|
+
fcntl.flock(lock_file, fcntl.LOCK_EX)
|
|
91
|
+
|
|
92
|
+
start = time.perf_counter()
|
|
93
|
+
self._process = subprocess.Popen(
|
|
94
|
+
cmd,
|
|
95
|
+
stdout=subprocess.DEVNULL,
|
|
96
|
+
stderr=subprocess.DEVNULL,
|
|
97
|
+
)
|
|
98
|
+
self._process.wait()
|
|
99
|
+
duration_ms = (time.perf_counter() - start) * 1000
|
|
100
|
+
|
|
101
|
+
# Lock released when lock_file closes
|
|
102
|
+
finally:
|
|
103
|
+
unduck(paused_apps)
|
|
98
104
|
|
|
99
105
|
if self._process.returncode != 0:
|
|
100
106
|
return PlaybackResult(
|
|
@@ -0,0 +1,146 @@
|
|
|
1
|
+
"""macOS media ducking via osascript.
|
|
2
|
+
|
|
3
|
+
Pauses media apps (Apple Music, Spotify) and browser tabs (Chrome, Brave,
|
|
4
|
+
Edge, Safari) before VoiceSmith audio playback and resumes them afterward.
|
|
5
|
+
No-ops on non-macOS systems.
|
|
6
|
+
|
|
7
|
+
Browser ducking uses JavaScript injection via AppleScript. The first time
|
|
8
|
+
each browser is targeted, macOS will prompt for Automation permission — approve
|
|
9
|
+
once and it is remembered.
|
|
10
|
+
|
|
11
|
+
Usage:
|
|
12
|
+
paused = duck() # pause everything playing; returns opaque token list
|
|
13
|
+
...play audio...
|
|
14
|
+
unduck(paused) # resume only what we paused
|
|
15
|
+
"""
|
|
16
|
+
|
|
17
|
+
import platform
|
|
18
|
+
import subprocess
|
|
19
|
+
|
|
20
|
+
from shared import get_logger
|
|
21
|
+
|
|
22
|
+
logger = get_logger("tts.media_duck")
|
|
23
|
+
|
|
24
|
+
# ── Native media apps ─────────────────────────────────────────────────────────
|
|
25
|
+
|
|
26
|
+
# (display name, AppleScript target)
|
|
27
|
+
_APPS = [
|
|
28
|
+
("Apple Music", "Music"),
|
|
29
|
+
("Spotify", "Spotify"),
|
|
30
|
+
]
|
|
31
|
+
|
|
32
|
+
# ── Browsers ──────────────────────────────────────────────────────────────────
|
|
33
|
+
|
|
34
|
+
# (display name, AppleScript target, family: "chrome" | "safari")
|
|
35
|
+
_BROWSERS = [
|
|
36
|
+
("Google Chrome", "Google Chrome", "chrome"),
|
|
37
|
+
("Brave Browser", "Brave Browser", "chrome"),
|
|
38
|
+
("Microsoft Edge", "Microsoft Edge", "chrome"),
|
|
39
|
+
("Safari", "Safari", "safari"),
|
|
40
|
+
]
|
|
41
|
+
|
|
42
|
+
# JS injected into every tab on duck: pause playing media and mark it.
|
|
43
|
+
_JS_PAUSE = (
|
|
44
|
+
"document.querySelectorAll('video,audio').forEach(function(v){"
|
|
45
|
+
"if(!v.paused){v.pause();v.dataset.voicesmithPaused='1'}"
|
|
46
|
+
"})"
|
|
47
|
+
)
|
|
48
|
+
|
|
49
|
+
# JS injected on unduck: resume only elements we marked, then clear the mark.
|
|
50
|
+
_JS_RESUME = (
|
|
51
|
+
"document.querySelectorAll('video,audio').forEach(function(v){"
|
|
52
|
+
"if(v.dataset.voicesmithPaused){delete v.dataset.voicesmithPaused;v.play()}"
|
|
53
|
+
"})"
|
|
54
|
+
)
|
|
55
|
+
|
|
56
|
+
# ── Helpers ───────────────────────────────────────────────────────────────────
|
|
57
|
+
|
|
58
|
+
def _osascript(script: str) -> str:
|
|
59
|
+
"""Run an AppleScript (may be multi-line); return stdout stripped, or '' on error."""
|
|
60
|
+
try:
|
|
61
|
+
result = subprocess.run(
|
|
62
|
+
["osascript"],
|
|
63
|
+
input=script,
|
|
64
|
+
capture_output=True,
|
|
65
|
+
text=True,
|
|
66
|
+
timeout=5,
|
|
67
|
+
)
|
|
68
|
+
return result.stdout.strip()
|
|
69
|
+
except Exception:
|
|
70
|
+
return ""
|
|
71
|
+
|
|
72
|
+
|
|
73
|
+
def _browser_script(target: str, family: str, js: str) -> str:
|
|
74
|
+
"""Build an AppleScript that runs js in every tab of target browser."""
|
|
75
|
+
if family == "safari":
|
|
76
|
+
exec_stmt = f'do JavaScript "{js}" in t'
|
|
77
|
+
else: # chrome family
|
|
78
|
+
exec_stmt = f'execute t javascript "{js}"'
|
|
79
|
+
|
|
80
|
+
return f"""\
|
|
81
|
+
tell application "{target}"
|
|
82
|
+
repeat with w in windows
|
|
83
|
+
repeat with t in tabs of w
|
|
84
|
+
try
|
|
85
|
+
{exec_stmt}
|
|
86
|
+
end try
|
|
87
|
+
end repeat
|
|
88
|
+
end repeat
|
|
89
|
+
end tell"""
|
|
90
|
+
|
|
91
|
+
|
|
92
|
+
# ── Public API ────────────────────────────────────────────────────────────────
|
|
93
|
+
|
|
94
|
+
def duck() -> list[str]:
    """Pause media that is currently playing so speech is not talked over.

    Native apps listed in ``_APPS`` are paused only when they are running and
    report the ``playing`` state. Every running browser listed in
    ``_BROWSERS`` gets the pause snippet injected into all of its tabs.
    Does nothing on platforms other than macOS.

    Returns:
        Opaque list of tokens; pass it unchanged to unduck(). Empty when
        nothing was touched.
    """
    tokens: list[str] = []
    if platform.system() != "Darwin":
        return tokens

    def is_running(app: str) -> bool:
        # Asking for the running state does not start the application.
        return _osascript(f'application "{app}" is running') == "true"

    # Native apps (Music, Spotify): record only the ones we actually paused.
    for label, app in _APPS:
        if not is_running(app):
            continue
        state = _osascript(f'tell application "{app}" to get player state')
        if state != "playing":
            continue
        _osascript(f'tell application "{app}" to pause')
        tokens.append(app)
        logger.debug(f"Ducked {label}")

    # Browsers: the injected JS marks what it paused, so every running
    # browser is recorded regardless of whether any tab was playing.
    for label, app, kind in _BROWSERS:
        if is_running(app):
            _osascript(_browser_script(app, kind, _JS_PAUSE))
            tokens.append(f"browser:{app}")
            logger.debug(f"Ducked browser tabs in {label}")

    return tokens
|
|
123
|
+
|
|
124
|
+
|
|
125
|
+
def unduck(paused: list[str]) -> None:
    """Resume apps and browser tabs paused by duck().

    Tokens prefixed with ``browser:`` get the resume snippet injected into
    every tab of that browser; any other token is treated as the AppleScript
    name of a native app and told to ``play``. Does nothing on platforms
    other than macOS.

    Args:
        paused: The list returned by a previous duck() call.
    """
    if platform.system() != "Darwin" or not paused:
        return

    # target -> family, for choosing the right JavaScript statement on resume.
    families = {target: family for _, target, family in _BROWSERS}

    for token in paused:
        is_browser = token.startswith("browser:")
        target = token.removeprefix("browser:")

        # A `tell application` block launches the application when it is not
        # running. If the user quit it while speech was playing, resuming
        # would reopen it, so apply the same guard duck() uses.
        if _osascript(f'application "{target}" is running') != "true":
            logger.debug(f"Skipped unduck of {target}: no longer running")
            continue

        if is_browser:
            # Unknown targets fall back to the Chrome-style statement.
            family = families.get(target, "chrome")
            _osascript(_browser_script(target, family, _JS_RESUME))
            logger.debug(f"Unducked browser tabs in {target}")
        else:
            _osascript(f'tell application "{target}" to play')
            logger.debug(f"Unducked {target}")
|
package/tts/speech_queue.py
CHANGED
|
@@ -55,7 +55,7 @@ class SpeechQueue:
|
|
|
55
55
|
speed: float,
|
|
56
56
|
) -> SpeakResult:
|
|
57
57
|
"""Internal: synthesize and play text, blocking until done."""
|
|
58
|
-
loop = asyncio.
|
|
58
|
+
loop = asyncio.get_running_loop()
|
|
59
59
|
self._speaking = True
|
|
60
60
|
total_duration_ms = 0.0
|
|
61
61
|
total_synthesis_ms = 0.0
|
package/voice_registry.py
CHANGED
|
@@ -87,6 +87,22 @@ class VoiceRegistry:
|
|
|
87
87
|
logger.info(f"Set voice '{voice_id}' for '{name}'")
|
|
88
88
|
return True
|
|
89
89
|
|
|
90
|
+
def rename_voice(self, old_name: str, new_name: str, voice_id: str) -> bool:
    """Move an agent's registry entry to a new name and assign it a voice.

    The entry stored under ``old_name`` (if any) is dropped and ``new_name``
    is mapped to ``voice_id``. When both names are equal the existing entry
    is simply updated in place. Nothing is modified when ``voice_id`` is
    not a known voice.

    Returns:
        True if the voice_id is valid, False otherwise.
    """
    if voice_id in ALL_VOICE_IDS:
        if old_name != new_name:
            # Missing old entries are fine: there is simply nothing to drop.
            self._registry.pop(old_name, None)
        self._registry[new_name] = voice_id
        logger.info(f"Renamed '{old_name}' -> '{new_name}' with voice '{voice_id}'")
        return True

    logger.warning(f"Invalid voice ID '{voice_id}' for rename '{old_name}' -> '{new_name}'")
    return False
|
|
105
|
+
|
|
90
106
|
def get_registry(self) -> dict[str, str]:
|
|
91
107
|
"""Return a copy of the current registry."""
|
|
92
108
|
return dict(self._registry)
|