voice-mcp-server 0.3.0 → 0.3.1
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
package/package.json
CHANGED
|
@@ -2,9 +2,11 @@ import os
|
|
|
2
2
|
import time
|
|
3
3
|
import subprocess
|
|
4
4
|
import httpx
|
|
5
|
+
import threading
|
|
5
6
|
from simulation.ports import ISpeaker
|
|
6
7
|
from simulation.models import VirtualAudioFrame
|
|
7
8
|
from dotenv import load_dotenv
|
|
9
|
+
from logger import logger
|
|
8
10
|
|
|
9
11
|
load_dotenv()
|
|
10
12
|
|
|
@@ -19,62 +21,95 @@ class ElevenLabsSpeaker(ISpeaker):
|
|
|
19
21
|
self.voice_id = os.getenv("ELEVENLABS_VOICE_ID", voice_id)
|
|
20
22
|
self.api_key = os.getenv("ELEVENLABS_API_KEY")
|
|
21
23
|
self.temp_file = "/tmp/elevenlabs_output.mp3"
|
|
24
|
+
|
|
25
|
+
self._lock = threading.RLock()
|
|
26
|
+
self._is_preparing = False
|
|
27
|
+
self._stop_event = threading.Event()
|
|
28
|
+
self._thread = None
|
|
29
|
+
|
|
30
|
+
logger.info(f"ElevenLabs Speaker initialized. Voice ID: {self.voice_id}, API Key Present: {bool(self.api_key)}")
|
|
22
31
|
|
|
23
32
|
def speak(self, text: str):
|
|
24
33
|
if not text.strip():
|
|
25
34
|
return
|
|
26
35
|
|
|
27
|
-
|
|
28
|
-
self.
|
|
36
|
+
# Cancel any current playback or preparation
|
|
37
|
+
self.flush()
|
|
38
|
+
|
|
39
|
+
with self._lock:
|
|
40
|
+
self.current_text = text
|
|
41
|
+
self.words = text.split()
|
|
42
|
+
self._is_preparing = True
|
|
43
|
+
self._stop_event.clear()
|
|
44
|
+
self.start_time = 0 # Won't start until afplay starts
|
|
45
|
+
|
|
46
|
+
self._thread = threading.Thread(target=self._generate_and_play, args=(text,), daemon=True)
|
|
47
|
+
self._thread.start()
|
|
29
48
|
|
|
30
|
-
|
|
31
|
-
|
|
32
|
-
|
|
33
|
-
|
|
34
|
-
|
|
35
|
-
|
|
36
|
-
|
|
37
|
-
|
|
38
|
-
"text": text,
|
|
39
|
-
"model_id": "eleven_multilingual_v2",
|
|
40
|
-
"voice_settings": {
|
|
41
|
-
"stability": 0.5,
|
|
42
|
-
"similarity_boost": 0.5
|
|
49
|
+
def _generate_and_play(self, text: str):
|
|
50
|
+
try:
|
|
51
|
+
if self.api_key:
|
|
52
|
+
url = f"https://api.elevenlabs.io/v1/text-to-speech/{self.voice_id}"
|
|
53
|
+
headers = {
|
|
54
|
+
"Accept": "audio/mpeg",
|
|
55
|
+
"Content-Type": "application/json",
|
|
56
|
+
"xi-api-key": self.api_key
|
|
43
57
|
}
|
|
44
|
-
|
|
45
|
-
|
|
46
|
-
|
|
47
|
-
|
|
58
|
+
data = {
|
|
59
|
+
"text": text,
|
|
60
|
+
"model_id": "eleven_multilingual_v2",
|
|
61
|
+
"voice_settings": {
|
|
62
|
+
"stability": 0.5,
|
|
63
|
+
"similarity_boost": 0.5
|
|
64
|
+
}
|
|
65
|
+
}
|
|
66
|
+
|
|
67
|
+
with httpx.Client() as client:
|
|
68
|
+
logger.debug(f"Calling ElevenLabs API for Voice ID: {self.voice_id}")
|
|
48
69
|
response = client.post(url, json=data, headers=headers, timeout=10.0)
|
|
49
70
|
response.raise_for_status()
|
|
50
71
|
|
|
72
|
+
if self._stop_event.is_set():
|
|
73
|
+
return
|
|
74
|
+
|
|
51
75
|
with open(self.temp_file, "wb") as f:
|
|
52
76
|
f.write(response.content)
|
|
53
77
|
|
|
54
|
-
|
|
55
|
-
|
|
56
|
-
|
|
57
|
-
|
|
58
|
-
|
|
59
|
-
|
|
60
|
-
|
|
61
|
-
|
|
62
|
-
|
|
63
|
-
|
|
78
|
+
if self._stop_event.is_set():
|
|
79
|
+
return
|
|
80
|
+
|
|
81
|
+
with self._lock:
|
|
82
|
+
# Play the downloaded audio
|
|
83
|
+
self.start_time = time.time()
|
|
84
|
+
self.process = subprocess.Popen(
|
|
85
|
+
["afplay", self.temp_file],
|
|
86
|
+
stdout=subprocess.DEVNULL,
|
|
87
|
+
stderr=subprocess.DEVNULL
|
|
88
|
+
)
|
|
89
|
+
else:
|
|
90
|
+
logger.warning("No ELEVENLABS_API_KEY found, falling back to 'say'")
|
|
91
|
+
if self._stop_event.is_set(): return
|
|
92
|
+
with self._lock:
|
|
64
93
|
self.start_time = time.time()
|
|
65
94
|
self.process = subprocess.Popen(
|
|
66
95
|
["say", text],
|
|
67
96
|
stdout=subprocess.DEVNULL,
|
|
68
97
|
stderr=subprocess.DEVNULL
|
|
69
98
|
)
|
|
70
|
-
|
|
71
|
-
|
|
72
|
-
self.
|
|
73
|
-
|
|
74
|
-
|
|
75
|
-
|
|
76
|
-
|
|
77
|
-
|
|
99
|
+
except Exception as e:
|
|
100
|
+
logger.error(f"ElevenLabs Error: {e}")
|
|
101
|
+
if self._stop_event.is_set(): return
|
|
102
|
+
# Fallback to macOS say
|
|
103
|
+
with self._lock:
|
|
104
|
+
self.start_time = time.time()
|
|
105
|
+
self.process = subprocess.Popen(
|
|
106
|
+
["say", text],
|
|
107
|
+
stdout=subprocess.DEVNULL,
|
|
108
|
+
stderr=subprocess.DEVNULL
|
|
109
|
+
)
|
|
110
|
+
finally:
|
|
111
|
+
with self._lock:
|
|
112
|
+
self._is_preparing = False
|
|
78
113
|
|
|
79
114
|
def play_frame(self, frame: VirtualAudioFrame):
|
|
80
115
|
pass
|
|
@@ -83,35 +118,50 @@ class ElevenLabsSpeaker(ISpeaker):
|
|
|
83
118
|
pass
|
|
84
119
|
|
|
85
120
|
def is_speaking(self) -> bool:
|
|
86
|
-
|
|
87
|
-
|
|
88
|
-
|
|
89
|
-
|
|
90
|
-
|
|
91
|
-
self.
|
|
92
|
-
|
|
93
|
-
|
|
121
|
+
with self._lock:
|
|
122
|
+
if self._is_preparing:
|
|
123
|
+
return True
|
|
124
|
+
if self.process is None:
|
|
125
|
+
return False
|
|
126
|
+
is_running = self.process.poll() is None
|
|
127
|
+
if not is_running:
|
|
128
|
+
self.current_text = ""
|
|
129
|
+
self.words = []
|
|
130
|
+
self.process = None
|
|
131
|
+
return is_running
|
|
94
132
|
|
|
95
133
|
def has_started_audio(self) -> bool:
|
|
96
|
-
|
|
134
|
+
with self._lock:
|
|
135
|
+
if self.process is None:
|
|
136
|
+
return False
|
|
137
|
+
return self.process.poll() is None
|
|
97
138
|
|
|
98
139
|
def flush(self) -> str:
|
|
99
|
-
if
|
|
100
|
-
|
|
101
|
-
|
|
102
|
-
# Immediately kill the playback process
|
|
103
|
-
self.process.kill()
|
|
104
|
-
|
|
105
|
-
# Explicitly wait for the process to terminate and reap it
|
|
106
|
-
self.process.wait()
|
|
107
|
-
|
|
108
|
-
elapsed_ms = (time.time() - self.start_time) * 1000
|
|
109
|
-
words_spoken = int(elapsed_ms * self.words_per_ms)
|
|
110
|
-
|
|
111
|
-
spoken = " ".join(self.words[:words_spoken])
|
|
112
|
-
|
|
113
|
-
self.current_text = ""
|
|
114
|
-
self.words = []
|
|
115
|
-
self.process = None
|
|
140
|
+
# Signal the thread to stop if it's still downloading
|
|
141
|
+
self._stop_event.set()
|
|
116
142
|
|
|
117
|
-
|
|
143
|
+
with self._lock:
|
|
144
|
+
if not self.is_speaking():
|
|
145
|
+
self._is_preparing = False
|
|
146
|
+
return ""
|
|
147
|
+
|
|
148
|
+
# Immediately kill the playback process if it exists
|
|
149
|
+
if self.process:
|
|
150
|
+
self.process.kill()
|
|
151
|
+
self.process.wait()
|
|
152
|
+
|
|
153
|
+
# If we were preparing but hadn't started afplay yet, we have spoken 0 words
|
|
154
|
+
if self.start_time == 0:
|
|
155
|
+
words_spoken = 0
|
|
156
|
+
else:
|
|
157
|
+
elapsed_ms = (time.time() - self.start_time) * 1000
|
|
158
|
+
words_spoken = int(elapsed_ms * self.words_per_ms)
|
|
159
|
+
|
|
160
|
+
spoken = " ".join(self.words[:words_spoken])
|
|
161
|
+
|
|
162
|
+
self.current_text = ""
|
|
163
|
+
self.words = []
|
|
164
|
+
self.process = None
|
|
165
|
+
self._is_preparing = False
|
|
166
|
+
|
|
167
|
+
return spoken
|
|
@@ -64,12 +64,15 @@ class LiveMicrophone(IMicrophone):
|
|
|
64
64
|
raw_bytes = self.q.get(timeout=0.1) # Block briefly to act as clock
|
|
65
65
|
# If we didn't get 320 bytes, that's weird but we handle it
|
|
66
66
|
if len(raw_bytes) < self.chunk * 2:
|
|
67
|
+
logger.warning(f"LiveMicrophone read_frame got only {len(raw_bytes)} bytes instead of {self.chunk * 2}")
|
|
67
68
|
return VirtualAudioFrame(10, False, False, "", b"")
|
|
68
69
|
return VirtualAudioFrame(10, False, False, "", raw_bytes)
|
|
69
70
|
except queue.Empty:
|
|
71
|
+
logger.error("LiveMicrophone queue is EMPTY on read! (PyAudio might have crashed or stopped feeding data)")
|
|
70
72
|
# If queue is empty, yield silence frame
|
|
71
73
|
return VirtualAudioFrame(10, False, False, "", b"")
|
|
72
74
|
|
|
73
75
|
def close(self):
|
|
74
76
|
self.stop_stream()
|
|
75
77
|
self.p.terminate()
|
|
78
|
+
|
|
@@ -80,7 +80,7 @@ def pre_download_models():
|
|
|
80
80
|
|
|
81
81
|
def run_audio_daemon():
|
|
82
82
|
"""Runs the CoreEngine in a persistent background thread."""
|
|
83
|
-
global engine, mic, speaker, last_active_timestamp, daemon_status, daemon_status_message, daemon_progress
|
|
83
|
+
global engine, mic, speaker, vad, stt, last_active_timestamp, daemon_status, daemon_status_message, daemon_progress
|
|
84
84
|
|
|
85
85
|
# Pre-download models so the daemon status reflects exactly what is happening
|
|
86
86
|
pre_download_models()
|
|
@@ -118,8 +118,13 @@ def run_audio_daemon():
|
|
|
118
118
|
|
|
119
119
|
try:
|
|
120
120
|
while True:
|
|
121
|
+
current_engine = engine
|
|
122
|
+
if current_engine is None or mic is None:
|
|
123
|
+
time.sleep(0.1)
|
|
124
|
+
continue
|
|
125
|
+
|
|
121
126
|
# If dormant, check for commands from FastAPI
|
|
122
|
-
if
|
|
127
|
+
if current_engine.state == State.EXECUTING:
|
|
123
128
|
try:
|
|
124
129
|
cmd = mcp_command_queue.get(timeout=0.1) # Blocks briefly
|
|
125
130
|
|
|
@@ -127,15 +132,15 @@ def run_audio_daemon():
|
|
|
127
132
|
mic.start_stream()
|
|
128
133
|
if hasattr(vad, "set_active"):
|
|
129
134
|
vad.set_active(True)
|
|
130
|
-
|
|
131
|
-
|
|
135
|
+
current_engine.start_conversation(cmd.get("text", ""), standby_mode=cmd.get("standby_mode", False))
|
|
136
|
+
current_engine.expect_reply = cmd.get("expect_reply", True)
|
|
132
137
|
|
|
133
138
|
except queue.Empty:
|
|
134
139
|
pass
|
|
135
140
|
else:
|
|
136
|
-
|
|
141
|
+
current_engine.tick()
|
|
137
142
|
# Once we drop back to EXECUTING, we finished the conversation loop
|
|
138
|
-
if
|
|
143
|
+
if current_engine.state == State.EXECUTING:
|
|
139
144
|
mic.stop_stream()
|
|
140
145
|
if hasattr(vad, "set_active"):
|
|
141
146
|
vad.set_active(False)
|
|
@@ -231,18 +236,16 @@ async def reload_config():
|
|
|
231
236
|
# 1. Stop the current engine
|
|
232
237
|
if engine:
|
|
233
238
|
engine.state = State.EXECUTING
|
|
234
|
-
|
|
235
|
-
|
|
239
|
+
# Do NOT close the mic during hot-swap to prevent macOS CoreAudio permission drop!
|
|
240
|
+
# if mic:
|
|
241
|
+
# mic.close()
|
|
236
242
|
|
|
237
243
|
# 1b. CRITICAL: Explicitly obliterate old models from VRAM to prevent Out-Of-Memory (OOM) crashes on hot-swaps
|
|
238
244
|
import gc
|
|
239
|
-
|
|
240
|
-
|
|
241
|
-
|
|
242
|
-
|
|
243
|
-
del engine
|
|
244
|
-
except NameError:
|
|
245
|
-
pass
|
|
245
|
+
speaker = None
|
|
246
|
+
vad = None
|
|
247
|
+
stt = None
|
|
248
|
+
engine = None
|
|
246
249
|
|
|
247
250
|
gc.collect()
|
|
248
251
|
|
|
@@ -265,7 +268,12 @@ async def reload_config():
|
|
|
265
268
|
cfg = compose(config_name="config")
|
|
266
269
|
|
|
267
270
|
# 3. Instantiate the new models on the fly
|
|
268
|
-
mic
|
|
271
|
+
if mic is None:
|
|
272
|
+
logger.info("Microphone was None during hot-swap, instantiating a new one.")
|
|
273
|
+
mic = instantiate(cfg.microphone)
|
|
274
|
+
else:
|
|
275
|
+
logger.info("Preserving existing Microphone instance during hot-swap.")
|
|
276
|
+
|
|
269
277
|
speaker = instantiate(cfg.speaker)
|
|
270
278
|
vad = instantiate(cfg.vad)
|
|
271
279
|
stt = instantiate(cfg.stt)
|