voicesmith-mcp 1.0.16 → 1.0.17
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/package.json +1 -1
- package/server.py +51 -16
- package/stt/mic_capture.py +27 -9
- package/tts/audio_player.py +14 -20
- package/tts/kokoro_engine.py +7 -0
- package/tts/media_duck.py +62 -0
- package/tts/speech_queue.py +7 -1
package/package.json
CHANGED
package/server.py
CHANGED
|
@@ -42,6 +42,19 @@ from shared import (
|
|
|
42
42
|
)
|
|
43
43
|
from config import load_config, save_config, get_config_path, AppConfig
|
|
44
44
|
from session_registry import register_session, rename_session, unregister_session
|
|
45
|
+
from tts.media_duck import duck, unduck, is_bluetooth_output
|
|
46
|
+
|
|
47
|
+
|
|
48
|
+
async def _deferred_unduck(paused_apps: list[str], delay: float = 0.3) -> None:
    """Unduck after a brief delay so the MCP response reaches the client first.

    On Bluetooth output, extends the delay to at least 3s to allow for the
    HFP → A2DP codec switch that macOS performs when the microphone session ends.

    Args:
        paused_apps: The value previously returned by ``duck()``; it is handed
            unchanged to ``unduck()``.
        delay: Minimum number of seconds to wait before unducking
            (default 0.3). Raised to 3.0 when the output is Bluetooth.
    """
    if is_bluetooth_output():
        delay = max(delay, 3.0)
    try:
        await asyncio.sleep(delay)
    finally:
        # This coroutine is scheduled as a fire-and-forget task. If the task
        # is cancelled while sleeping (for example when the event loop shuts
        # down), still resume whatever was paused instead of leaving it
        # paused; the CancelledError keeps propagating after the cleanup.
        unduck(paused_apps)
|
|
45
58
|
|
|
46
59
|
logger = get_logger("server")
|
|
47
60
|
|
|
@@ -63,6 +76,7 @@ _config: AppConfig = None
|
|
|
63
76
|
_muted = False
|
|
64
77
|
_listen_cancel_event: asyncio.Event = None
|
|
65
78
|
_listen_active = False
|
|
79
|
+
_suppress_duck = False # Set by speak_then_listen to prevent inner duck/unduck gaps
|
|
66
80
|
_startup_time = time.time()
|
|
67
81
|
_last_tool_call = time.time() # Updated on every MCP tool call
|
|
68
82
|
_session_info: dict = None
|
|
@@ -82,8 +96,8 @@ def _init_tts(config: AppConfig):
|
|
|
82
96
|
|
|
83
97
|
try:
|
|
84
98
|
_tts_engine = KokoroEngine(config.tts.model_path, config.tts.voices_path)
|
|
85
|
-
_audio_player = AudioPlayer(config.tts.audio_player
|
|
86
|
-
_speech_queue = SpeechQueue(_tts_engine, _audio_player)
|
|
99
|
+
_audio_player = AudioPlayer(config.tts.audio_player)
|
|
100
|
+
_speech_queue = SpeechQueue(_tts_engine, _audio_player, duck_media=config.tts.duck_media)
|
|
87
101
|
logger.info("TTS subsystem initialized")
|
|
88
102
|
except TTSEngineError as e:
|
|
89
103
|
logger.error(f"TTS initialization failed: {e}")
|
|
@@ -453,25 +467,29 @@ async def listen(timeout: float = 15, prompt: str = "", silence_threshold: float
|
|
|
453
467
|
if prompt:
|
|
454
468
|
logger.info(f"Listening (prompt: {prompt})")
|
|
455
469
|
|
|
470
|
+
# Duck media while recording so the mic doesn't pick up playback
|
|
471
|
+
# Skip if speak_then_listen already holds the duck
|
|
472
|
+
paused_apps = duck() if (_config and _config.tts.duck_media and not _suppress_duck) else []
|
|
473
|
+
|
|
456
474
|
try:
|
|
457
475
|
loop = asyncio.get_running_loop()
|
|
458
476
|
|
|
459
|
-
# Play ready sound so the user knows to start speaking
|
|
460
|
-
# Skip for push-to-talk (HTTP) — it has its own beep
|
|
461
|
-
if prompt != "push-to-talk":
|
|
462
|
-
await loop.run_in_executor(None, _play_ready_sound)
|
|
463
|
-
|
|
464
477
|
start = time.perf_counter()
|
|
465
478
|
|
|
466
479
|
# Reset VAD state from any prior recording (LSTM hidden state + context)
|
|
467
480
|
_vad.reset()
|
|
468
481
|
|
|
482
|
+
# Play the ready sound AFTER the mic is live (via on_ready callback)
|
|
483
|
+
# so the user doesn't start speaking into a dead mic.
|
|
484
|
+
ready_cb = _play_ready_sound if prompt != "push-to-talk" else None
|
|
485
|
+
|
|
469
486
|
# Record audio with VAD
|
|
470
487
|
audio = await _mic_capture.record(
|
|
471
488
|
vad=_vad,
|
|
472
489
|
timeout=timeout,
|
|
473
490
|
silence_threshold=silence_threshold,
|
|
474
491
|
cancel_event=_listen_cancel_event,
|
|
492
|
+
on_ready=ready_cb,
|
|
475
493
|
)
|
|
476
494
|
|
|
477
495
|
if _listen_cancel_event.is_set():
|
|
@@ -500,6 +518,8 @@ async def listen(timeout: float = 15, prompt: str = "", silence_threshold: float
|
|
|
500
518
|
logger.error(f"listen failed: {e}")
|
|
501
519
|
return {"success": False, "error": "listen_failed", "message": str(e)}
|
|
502
520
|
finally:
|
|
521
|
+
if paused_apps:
|
|
522
|
+
asyncio.create_task(_deferred_unduck(paused_apps))
|
|
503
523
|
_listen_active = False
|
|
504
524
|
_listen_cancel_event = None
|
|
505
525
|
# Reclaim mic for wake listener
|
|
@@ -524,19 +544,34 @@ async def speak_then_listen(
|
|
|
524
544
|
timeout: Max seconds to wait for response (default 15).
|
|
525
545
|
silence_threshold: Seconds of silence before stopping (default 1.5).
|
|
526
546
|
"""
|
|
527
|
-
|
|
547
|
+
global _suppress_duck
|
|
548
|
+
|
|
549
|
+
# Duck once for the entire speak+listen operation to avoid a
|
|
550
|
+
# brief unduck gap between speak finishing and listen starting.
|
|
551
|
+
should_duck = _config and _config.tts.duck_media
|
|
552
|
+
paused_apps = duck() if should_duck else []
|
|
528
553
|
|
|
529
|
-
|
|
530
|
-
|
|
554
|
+
# Suppress inner ducking in SpeechQueue and listen()
|
|
555
|
+
saved_queue_duck = _speech_queue._duck_media if _speech_queue else False
|
|
556
|
+
if _speech_queue and should_duck:
|
|
557
|
+
_speech_queue._duck_media = False
|
|
558
|
+
_suppress_duck = True
|
|
531
559
|
|
|
532
|
-
|
|
560
|
+
try:
|
|
561
|
+
speak_result = await speak(name, text, speed, block=True)
|
|
533
562
|
|
|
534
|
-
|
|
535
|
-
|
|
536
|
-
nudge_result = await speak(name, "I didn't catch that. Go ahead and type it.", speed, block=True)
|
|
537
|
-
listen_result["nudge_spoken"] = nudge_result.get("success", False)
|
|
563
|
+
if not speak_result.get("success"):
|
|
564
|
+
return {"speak": speak_result, "listen": {"success": False, "error": "skipped"}}
|
|
538
565
|
|
|
539
|
-
|
|
566
|
+
listen_result = await listen(timeout=timeout, silence_threshold=silence_threshold)
|
|
567
|
+
|
|
568
|
+
return {"speak": speak_result, "listen": listen_result}
|
|
569
|
+
finally:
|
|
570
|
+
_suppress_duck = False
|
|
571
|
+
if _speech_queue:
|
|
572
|
+
_speech_queue._duck_media = saved_queue_duck
|
|
573
|
+
if paused_apps:
|
|
574
|
+
asyncio.create_task(_deferred_unduck(paused_apps))
|
|
540
575
|
|
|
541
576
|
|
|
542
577
|
@mcp.tool()
|
package/stt/mic_capture.py
CHANGED
|
@@ -8,18 +8,20 @@ import socket
|
|
|
8
8
|
import subprocess
|
|
9
9
|
import threading
|
|
10
10
|
import time
|
|
11
|
-
from typing import Optional
|
|
11
|
+
from typing import Callable, Optional
|
|
12
12
|
|
|
13
13
|
import numpy as np
|
|
14
14
|
|
|
15
15
|
from shared import MicCaptureError, STT_SAMPLE_RATE, get_logger
|
|
16
16
|
from stt.vad import VoiceActivityDetector
|
|
17
|
+
from tts.media_duck import is_bluetooth_output
|
|
17
18
|
|
|
18
19
|
logger = get_logger("stt.mic")
|
|
19
20
|
|
|
20
21
|
_CHUNK_SAMPLES = 512 # Silero VAD requires exactly 512-sample chunks at 16kHz
|
|
21
22
|
_CHUNK_BYTES = _CHUNK_SAMPLES * 4 # float32 = 4 bytes/sample → 2048 bytes/chunk
|
|
22
|
-
_ZERO_CHECK_CHUNKS =
|
|
23
|
+
_ZERO_CHECK_CHUNKS = 25 # ~800ms — exceeds CoreAudio cold-start latency (~544ms)
|
|
24
|
+
_ZERO_CHECK_CHUNKS_BT = 75 # ~2.4s — Bluetooth A2DP→HFP codec switch can take 1-2s
|
|
23
25
|
|
|
24
26
|
_AUDIO_SERVICE_SOCKET = "/tmp/voicesmith-audio.sock"
|
|
25
27
|
_LAUNCHAGENT_LABEL = "com.voicesmith-mcp.audio"
|
|
@@ -92,6 +94,7 @@ class MicCapture:
|
|
|
92
94
|
timeout: float = 15,
|
|
93
95
|
silence_threshold: float = 1.5,
|
|
94
96
|
cancel_event: Optional[asyncio.Event] = None,
|
|
97
|
+
on_ready: Optional[Callable[[], None]] = None,
|
|
95
98
|
) -> Optional[np.ndarray]:
|
|
96
99
|
"""Record audio from the microphone until silence is detected.
|
|
97
100
|
|
|
@@ -106,6 +109,9 @@ class MicCapture:
|
|
|
106
109
|
timeout: Maximum seconds to wait for speech (default 15).
|
|
107
110
|
silence_threshold: Seconds of silence before stopping (default 1.5).
|
|
108
111
|
cancel_event: Optional asyncio.Event to cancel recording.
|
|
112
|
+
on_ready: Optional callback invoked once the mic is live and
|
|
113
|
+
ready to capture. Called after hardware warm-up /
|
|
114
|
+
flush but before the VAD loop starts.
|
|
109
115
|
|
|
110
116
|
Returns:
|
|
111
117
|
Numpy array of recorded audio, or None if cancelled/timeout.
|
|
@@ -122,17 +128,17 @@ class MicCapture:
|
|
|
122
128
|
if platform.system() == "Darwin":
|
|
123
129
|
if _launchagent_available():
|
|
124
130
|
return await self._record_via_socket(
|
|
125
|
-
vad, timeout, silence_threshold, cancel_event
|
|
131
|
+
vad, timeout, silence_threshold, cancel_event, on_ready
|
|
126
132
|
)
|
|
127
133
|
# Legacy: subprocess fallback for installs without the LaunchAgent.
|
|
128
134
|
audio_capture_bin = _find_app_binary("audio-service") or _find_app_binary("audio-capture")
|
|
129
135
|
if audio_capture_bin:
|
|
130
136
|
return await self._record_via_subprocess(
|
|
131
|
-
audio_capture_bin, vad, timeout, silence_threshold, cancel_event
|
|
137
|
+
audio_capture_bin, vad, timeout, silence_threshold, cancel_event, on_ready
|
|
132
138
|
)
|
|
133
139
|
|
|
134
140
|
return await self._record_via_sounddevice(
|
|
135
|
-
vad, timeout, silence_threshold, cancel_event
|
|
141
|
+
vad, timeout, silence_threshold, cancel_event, on_ready
|
|
136
142
|
)
|
|
137
143
|
|
|
138
144
|
# ── LaunchAgent socket backend (macOS primary) ─────────────────────────────
|
|
@@ -143,6 +149,7 @@ class MicCapture:
|
|
|
143
149
|
timeout: float,
|
|
144
150
|
silence_threshold: float,
|
|
145
151
|
cancel_event: Optional[asyncio.Event],
|
|
152
|
+
on_ready: Optional[Callable[[], None]] = None,
|
|
146
153
|
) -> Optional[np.ndarray]:
|
|
147
154
|
"""Record via the VoiceSmithMCP audio LaunchAgent (Unix socket).
|
|
148
155
|
|
|
@@ -190,7 +197,10 @@ class MicCapture:
|
|
|
190
197
|
logger.info("Microphone recording started (audio-service socket)")
|
|
191
198
|
|
|
192
199
|
try:
|
|
193
|
-
|
|
200
|
+
# Flush 2 chunks (~64ms) for AudioQueue hardware settle.
|
|
201
|
+
self._flush_queue(2)
|
|
202
|
+
if on_ready:
|
|
203
|
+
on_ready()
|
|
194
204
|
return await self._run_vad_loop(vad, timeout, silence_threshold, cancel_event)
|
|
195
205
|
finally:
|
|
196
206
|
sock.close() # signals service to stop sending for this session
|
|
@@ -206,6 +216,7 @@ class MicCapture:
|
|
|
206
216
|
timeout: float,
|
|
207
217
|
silence_threshold: float,
|
|
208
218
|
cancel_event: Optional[asyncio.Event],
|
|
219
|
+
on_ready: Optional[Callable[[], None]] = None,
|
|
209
220
|
) -> Optional[np.ndarray]:
|
|
210
221
|
"""Record using a CoreAudio binary inside VoiceSmithMCP.app (legacy)."""
|
|
211
222
|
self._recording = True
|
|
@@ -239,7 +250,9 @@ class MicCapture:
|
|
|
239
250
|
reader_thread.start()
|
|
240
251
|
|
|
241
252
|
try:
|
|
242
|
-
self._flush_queue(
|
|
253
|
+
self._flush_queue(2)
|
|
254
|
+
if on_ready:
|
|
255
|
+
on_ready()
|
|
243
256
|
return await self._run_vad_loop(vad, timeout, silence_threshold, cancel_event)
|
|
244
257
|
finally:
|
|
245
258
|
proc.terminate()
|
|
@@ -258,6 +271,7 @@ class MicCapture:
|
|
|
258
271
|
timeout: float,
|
|
259
272
|
silence_threshold: float,
|
|
260
273
|
cancel_event: Optional[asyncio.Event],
|
|
274
|
+
on_ready: Optional[Callable[[], None]] = None,
|
|
261
275
|
) -> Optional[np.ndarray]:
|
|
262
276
|
"""Record using sounddevice / PortAudio (fallback for non-macOS)."""
|
|
263
277
|
try:
|
|
@@ -281,7 +295,9 @@ class MicCapture:
|
|
|
281
295
|
stream.start()
|
|
282
296
|
logger.info("Microphone recording started (sounddevice)")
|
|
283
297
|
|
|
284
|
-
self._flush_queue(
|
|
298
|
+
self._flush_queue(2, chunk_timeout=0.1)
|
|
299
|
+
if on_ready:
|
|
300
|
+
on_ready()
|
|
285
301
|
return await self._run_vad_loop(vad, timeout, silence_threshold, cancel_event)
|
|
286
302
|
except MicCaptureError:
|
|
287
303
|
raise
|
|
@@ -330,6 +346,8 @@ class MicCapture:
|
|
|
330
346
|
speech_detected = False
|
|
331
347
|
silence_duration = 0.0
|
|
332
348
|
zero_check_done = False
|
|
349
|
+
# Bluetooth A2DP→HFP switch delivers zeros for up to ~2s
|
|
350
|
+
zero_threshold = _ZERO_CHECK_CHUNKS_BT if is_bluetooth_output() else _ZERO_CHECK_CHUNKS
|
|
333
351
|
start_time = loop.time()
|
|
334
352
|
|
|
335
353
|
while not self._stop_flag:
|
|
@@ -354,7 +372,7 @@ class MicCapture:
|
|
|
354
372
|
|
|
355
373
|
chunks.append(chunk)
|
|
356
374
|
|
|
357
|
-
if not zero_check_done and len(chunks) >=
|
|
375
|
+
if not zero_check_done and len(chunks) >= zero_threshold:
|
|
358
376
|
zero_check_done = True
|
|
359
377
|
if all(np.max(np.abs(c)) == 0.0 for c in chunks):
|
|
360
378
|
raise MicCaptureError(self._zero_audio_message())
|
package/tts/audio_player.py
CHANGED
|
@@ -10,7 +10,6 @@ import time
|
|
|
10
10
|
import soundfile as sf
|
|
11
11
|
|
|
12
12
|
from shared import PlaybackResult, AudioPlayerError, AUDIO_LOCK_PATH, get_logger
|
|
13
|
-
from tts.media_duck import duck, unduck
|
|
14
13
|
|
|
15
14
|
logger = get_logger("tts.audio_player")
|
|
16
15
|
|
|
@@ -18,9 +17,8 @@ logger = get_logger("tts.audio_player")
|
|
|
18
17
|
class AudioPlayer:
|
|
19
18
|
"""Plays audio samples through an external player process."""
|
|
20
19
|
|
|
21
|
-
def __init__(self, player_command: str = "mpv"
|
|
20
|
+
def __init__(self, player_command: str = "mpv") -> None:
|
|
22
21
|
self._player_command = player_command
|
|
23
|
-
self._duck_media = duck_media
|
|
24
22
|
self._process: subprocess.Popen | None = None
|
|
25
23
|
|
|
26
24
|
# Detect platform fallback if player_command is not available
|
|
@@ -84,23 +82,19 @@ class AudioPlayer:
|
|
|
84
82
|
|
|
85
83
|
# Cross-session audio lock: prevents overlapping playback
|
|
86
84
|
# flock is kernel-managed — auto-released on crash, no stale locks
|
|
87
|
-
|
|
88
|
-
|
|
89
|
-
|
|
90
|
-
|
|
91
|
-
|
|
92
|
-
|
|
93
|
-
|
|
94
|
-
|
|
95
|
-
|
|
96
|
-
|
|
97
|
-
|
|
98
|
-
|
|
99
|
-
|
|
100
|
-
|
|
101
|
-
# Lock released when lock_file closes
|
|
102
|
-
finally:
|
|
103
|
-
unduck(paused_apps)
|
|
85
|
+
with open(AUDIO_LOCK_PATH, "w") as lock_file:
|
|
86
|
+
fcntl.flock(lock_file, fcntl.LOCK_EX)
|
|
87
|
+
|
|
88
|
+
start = time.perf_counter()
|
|
89
|
+
self._process = subprocess.Popen(
|
|
90
|
+
cmd,
|
|
91
|
+
stdout=subprocess.DEVNULL,
|
|
92
|
+
stderr=subprocess.DEVNULL,
|
|
93
|
+
)
|
|
94
|
+
self._process.wait()
|
|
95
|
+
duration_ms = (time.perf_counter() - start) * 1000
|
|
96
|
+
|
|
97
|
+
# Lock released when lock_file closes
|
|
104
98
|
|
|
105
99
|
if self._process.returncode != 0:
|
|
106
100
|
return PlaybackResult(
|
package/tts/kokoro_engine.py
CHANGED
|
@@ -2,6 +2,8 @@
|
|
|
2
2
|
|
|
3
3
|
import time
|
|
4
4
|
|
|
5
|
+
import numpy as np
|
|
6
|
+
|
|
5
7
|
from shared import SynthesisResult, TTSEngineError, ALL_VOICE_IDS, SAMPLE_RATE, get_logger
|
|
6
8
|
|
|
7
9
|
logger = get_logger("tts.kokoro")
|
|
@@ -47,6 +49,11 @@ class KokoroEngine:
|
|
|
47
49
|
samples, sample_rate = self._model.create(text, voice=voice_id, speed=speed)
|
|
48
50
|
synthesis_ms = (time.perf_counter() - start) * 1000
|
|
49
51
|
|
|
52
|
+
# Pad 100ms silence — kokoro-onnx trim() snaps to 512-sample hops
|
|
53
|
+
# (~21ms at 24kHz) which can clip the trailing edge of the last phoneme.
|
|
54
|
+
pad = int(sample_rate * 0.10)
|
|
55
|
+
samples = np.concatenate([samples, np.zeros(pad, dtype=samples.dtype)])
|
|
56
|
+
|
|
50
57
|
duration_ms = (len(samples) / sample_rate) * 1000
|
|
51
58
|
|
|
52
59
|
return SynthesisResult(
|
package/tts/media_duck.py
CHANGED
|
@@ -14,6 +14,8 @@ Usage:
|
|
|
14
14
|
unduck(paused) # resume only what we paused
|
|
15
15
|
"""
|
|
16
16
|
|
|
17
|
+
import ctypes
|
|
18
|
+
import ctypes.util
|
|
17
19
|
import platform
|
|
18
20
|
import subprocess
|
|
19
21
|
|
|
@@ -89,6 +91,66 @@ tell application "{target}"
|
|
|
89
91
|
end tell"""
|
|
90
92
|
|
|
91
93
|
|
|
94
|
+
# ── Bluetooth detection (macOS CoreAudio) ────────────────────────────────────
|
|
95
|
+
|
|
96
|
+
def is_bluetooth_output() -> bool:
    """Return True if the default audio output is a Bluetooth device.

    Uses CoreAudio's AudioObjectGetPropertyData twice: first to read the
    default output device ID from the system object, then to read that
    device's transport type, which is compared against the Bluetooth and
    Bluetooth LE transport FourCCs.

    Returns:
        True only when both CoreAudio queries succeed and the transport type
        is Bluetooth or Bluetooth LE. False on non-macOS platforms, when the
        CoreAudio library cannot be located, or on any error.
    """
    if platform.system() != "Darwin":
        return False

    try:
        lib_path = ctypes.util.find_library("CoreAudio")
        if not lib_path:
            return False
        ca = ctypes.cdll.LoadLibrary(lib_path)

        class _AudioObjectPropertyAddress(ctypes.Structure):
            _fields_ = [
                ("mSelector", ctypes.c_uint32),
                ("mScope", ctypes.c_uint32),
                ("mElement", ctypes.c_uint32),
            ]

        # CoreAudio FourCC constants
        _SYS_OBJ = 1                                  # kAudioObjectSystemObject
        _SCOPE_G = int.from_bytes(b"glob", "big")     # kAudioObjectPropertyScopeGlobal
        _ELEM_M = 0                                   # kAudioObjectPropertyElementMain
        _DEF_OUT = int.from_bytes(b"dOut", "big")     # kAudioHardwarePropertyDefaultOutputDevice
        _TRANS = int.from_bytes(b"tran", "big")       # kAudioDevicePropertyTransportType
        _BT = int.from_bytes(b"blue", "big")          # kAudioDeviceTransportTypeBluetooth
        _BT_LE = int.from_bytes(b"blea", "big")       # kAudioDeviceTransportTypeBluetoothLE

        # Declare the prototype explicitly so ctypes marshals the object ID
        # and the pointer arguments by declared type rather than falling back
        # to its default "everything is a C int" conversion.
        get_property = ca.AudioObjectGetPropertyData
        get_property.restype = ctypes.c_int32  # OSStatus
        get_property.argtypes = [
            ctypes.c_uint32,                              # inObjectID
            ctypes.POINTER(_AudioObjectPropertyAddress),  # inAddress
            ctypes.c_uint32,                              # inQualifierDataSize
            ctypes.c_void_p,                              # inQualifierData
            ctypes.POINTER(ctypes.c_uint32),              # ioDataSize
            ctypes.POINTER(ctypes.c_uint32),              # outData (a UInt32 for both queries)
        ]

        def _read_u32(object_id: int, selector: int):
            """Read one UInt32 global-scope property; None if the call fails."""
            addr = _AudioObjectPropertyAddress(selector, _SCOPE_G, _ELEM_M)
            value = ctypes.c_uint32(0)
            size = ctypes.c_uint32(ctypes.sizeof(value))
            status = get_property(
                object_id, ctypes.byref(addr), 0, None,
                ctypes.byref(size), ctypes.byref(value),
            )
            return value.value if status == 0 else None

        # Get default output device ID
        device_id = _read_u32(_SYS_OBJ, _DEF_OUT)
        if device_id is None:
            return False

        # Get transport type of that device
        transport = _read_u32(device_id, _TRANS)
        if transport is None:
            return False

        return transport in (_BT, _BT_LE)
    except Exception:
        # Deliberately best-effort: detection only tunes delays, so any
        # failure is reported as "not Bluetooth" instead of propagating.
        return False
|
|
152
|
+
|
|
153
|
+
|
|
92
154
|
# ── Public API ────────────────────────────────────────────────────────────────
|
|
93
155
|
|
|
94
156
|
def duck() -> list[str]:
|
package/tts/speech_queue.py
CHANGED
|
@@ -6,6 +6,7 @@ import time
|
|
|
6
6
|
from shared import SpeakResult, MAX_CHUNK_LENGTH, get_logger
|
|
7
7
|
from tts.kokoro_engine import KokoroEngine
|
|
8
8
|
from tts.audio_player import AudioPlayer
|
|
9
|
+
from tts.media_duck import duck, unduck
|
|
9
10
|
|
|
10
11
|
logger = get_logger("tts.speech_queue")
|
|
11
12
|
|
|
@@ -13,9 +14,10 @@ logger = get_logger("tts.speech_queue")
|
|
|
13
14
|
class SpeechQueue:
|
|
14
15
|
"""Manages sequential speech synthesis and playback."""
|
|
15
16
|
|
|
16
|
-
def __init__(self, engine: KokoroEngine, player: AudioPlayer) -> None:
|
|
17
|
+
def __init__(self, engine: KokoroEngine, player: AudioPlayer, duck_media: bool = False) -> None:
|
|
17
18
|
self._engine = engine
|
|
18
19
|
self._player = player
|
|
20
|
+
self._duck_media = duck_media
|
|
19
21
|
self._queue: asyncio.Queue = asyncio.Queue()
|
|
20
22
|
self._speaking = False
|
|
21
23
|
|
|
@@ -60,6 +62,9 @@ class SpeechQueue:
|
|
|
60
62
|
total_duration_ms = 0.0
|
|
61
63
|
total_synthesis_ms = 0.0
|
|
62
64
|
|
|
65
|
+
# Duck media for the entire utterance, not per-chunk
|
|
66
|
+
paused_apps = duck() if self._duck_media else []
|
|
67
|
+
|
|
63
68
|
try:
|
|
64
69
|
chunks = self.chunk_text(text)
|
|
65
70
|
|
|
@@ -105,6 +110,7 @@ class SpeechQueue:
|
|
|
105
110
|
error=str(e),
|
|
106
111
|
)
|
|
107
112
|
finally:
|
|
113
|
+
unduck(paused_apps)
|
|
108
114
|
self._speaking = False
|
|
109
115
|
|
|
110
116
|
def stop(self) -> bool:
|