voice-mcp-server 0.1.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/LICENSE +21 -0
- package/README.md +193 -0
- package/build/index.js +51 -0
- package/config/config.yaml +25 -0
- package/config/microphone/live_mic.yaml +1 -0
- package/config/speaker/elevenlabs_speaker.yaml +3 -0
- package/config/speaker/kokoro_speaker.yaml +3 -0
- package/config/stt/mlx_whisper_large_v3.yaml +2 -0
- package/config/vad/ptt_vad.yaml +8 -0
- package/config/vad/silero_vad.yaml +7 -0
- package/package.json +40 -0
- package/requirements.txt +126 -0
- package/src/adapters_real/__init__.py +0 -0
- package/src/adapters_real/__pycache__/__init__.cpython-312.pyc +0 -0
- package/src/adapters_real/__pycache__/kokoro_speaker.cpython-312.pyc +0 -0
- package/src/adapters_real/__pycache__/live_mic.cpython-312.pyc +0 -0
- package/src/adapters_real/__pycache__/ptt_vad.cpython-312.pyc +0 -0
- package/src/adapters_real/__pycache__/queue_llm.cpython-312.pyc +0 -0
- package/src/adapters_real/__pycache__/whisper_stt.cpython-312.pyc +0 -0
- package/src/adapters_real/echo_llm.py +28 -0
- package/src/adapters_real/elevenlabs_speaker.py +117 -0
- package/src/adapters_real/kokoro_speaker.py +122 -0
- package/src/adapters_real/live_mic.py +64 -0
- package/src/adapters_real/live_speaker.py +66 -0
- package/src/adapters_real/ptt_vad.py +36 -0
- package/src/adapters_real/queue_llm.py +36 -0
- package/src/adapters_real/silero_vad.py +43 -0
- package/src/adapters_real/wav_mic.py +17 -0
- package/src/adapters_real/whisper_stt.py +32 -0
- package/src/daemon/__init__.py +0 -0
- package/src/daemon/audio_server.py +363 -0
- package/src/index.ts +63 -0
- package/src/mcp_server.py +254 -0
- package/src/simulation/__init__.py +0 -0
- package/src/simulation/__pycache__/__init__.cpython-312.pyc +0 -0
- package/src/simulation/__pycache__/engine.cpython-312.pyc +0 -0
- package/src/simulation/__pycache__/models.cpython-312.pyc +0 -0
- package/src/simulation/__pycache__/ports.cpython-312.pyc +0 -0
- package/src/simulation/adapters.py +131 -0
- package/src/simulation/engine.py +242 -0
- package/src/simulation/models.py +25 -0
- package/src/simulation/ports.py +57 -0
- package/src/simulation/tests/__init__.py +0 -0
- package/src/simulation/tests/test_scenarios.py +510 -0
- package/tsconfig.json +15 -0
|
@@ -0,0 +1,363 @@
|
|
|
1
|
+
import asyncio
|
|
2
|
+
import sys
|
|
3
|
+
import os
|
|
4
|
+
import time
|
|
5
|
+
import threading
|
|
6
|
+
import queue
|
|
7
|
+
import logging
|
|
8
|
+
from contextlib import asynccontextmanager
|
|
9
|
+
from fastapi import FastAPI, Request, HTTPException
|
|
10
|
+
from fastapi.responses import StreamingResponse
|
|
11
|
+
from hydra import compose, initialize
|
|
12
|
+
from hydra.utils import instantiate
|
|
13
|
+
|
|
14
|
+
# Pin the Hugging Face and Torch cache locations under the app's own support
# directory. This runs at import time, ahead of the ML library imports below.
app_support_dir = os.path.expanduser("~/Library/Application Support/VoiceMCP/models")
os.makedirs(app_support_dir, exist_ok=True)
os.environ.update({
    "HF_HOME": os.path.join(app_support_dir, "huggingface"),
    "TORCH_HOME": os.path.join(app_support_dir, "torch"),
})

# Make the parent of this file's directory importable so that the
# ``simulation`` and ``adapters_real`` packages resolve.
sys.path.append(os.path.abspath(os.path.join(os.path.dirname(__file__), os.pardir)))
|
|
22
|
+
|
|
23
|
+
from simulation.models import Config
|
|
24
|
+
from simulation.engine import CoreEngine, State
|
|
25
|
+
from adapters_real.queue_llm import QueueLLMBridge
|
|
26
|
+
|
|
27
|
+
# --- Global State ---
# Queues shared between the HTTP handlers and the audio daemon thread.
mcp_command_queue = queue.Queue()
mcp_result_queue = queue.Queue()

# Logical session lock: the session currently using /converse (None = free),
# guarded by ``mutex_lock``.
active_session_id = None
mutex_lock = threading.Lock()

# Idle tracking used by the watchdog.
last_active_timestamp = time.time()
IDLE_TIMEOUT_SECONDS = 15 * 60  # 15 minutes

# Daemon Lifecycle State
# Starts in downloading state to prevent Claude timeouts
daemon_status = "DOWNLOADING"
daemon_status_message = "Initializing models..."
daemon_progress = 0

# Engine reference (populated by the daemon thread once components are built)
engine = mic = speaker = None
|
|
44
|
+
|
|
45
|
+
def pre_download_models():
    """Fetch (or verify) the Kokoro and Whisper model snapshots before instantiation.

    For each model the local cache is tried first (``local_files_only=True``);
    on any failure the snapshot is downloaded instead. Status text and a
    coarse progress value are published through the module-level
    ``daemon_status_message`` / ``daemon_progress``. Errors are reported on
    stderr and in the status message; nothing is raised to the caller.
    """
    global daemon_status_message, daemon_progress
    try:
        from huggingface_hub import snapshot_download, try_to_load_from_cache
        from huggingface_hub.utils import LocalEntryNotFoundError

        # (label, repo id, probe file, extra snapshot kwargs,
        #  progress when cached, progress when downloading)
        model_plan = (
            (
                "Kokoro TTS (82M)",
                "hexgrad/Kokoro-82M",
                "kokoro-v1_0.pth",
                {"allow_patterns": ["*.pth", "*.json", "voices/*"]},
                10,
                5,
            ),
            (
                "MLX Whisper Large v3 (3GB)",
                "mlx-community/whisper-large-v3-mlx",
                "weights.npz",
                {},
                50,
                30,
            ),
        )

        for label, repo, probe_file, fetch_kwargs, cached_pct, download_pct in model_plan:
            try:
                try_to_load_from_cache(repo_id=repo, filename=probe_file)
                daemon_status_message = f"Loading {label}..."
                daemon_progress = cached_pct
                # Ensure everything is correct (raises if the cache is incomplete)
                snapshot_download(repo_id=repo, local_files_only=True, **fetch_kwargs)
            except (LocalEntryNotFoundError, Exception):
                daemon_status_message = f"Downloading {label}..."
                daemon_progress = download_pct
                snapshot_download(repo_id=repo, **fetch_kwargs)

        daemon_status_message = "Finalizing AI setup..."
        daemon_progress = 90
    except Exception as e:
        print(f"Model download error: {e}", file=sys.stderr)
        daemon_status_message = f"Error downloading models: {e}"
|
|
80
|
+
|
|
81
|
+
def run_audio_daemon():
    """Run the CoreEngine loop in a persistent background thread.

    Pre-downloads the models, builds the audio components from the Hydra
    config, then loops forever: while the engine is dormant
    (``State.EXECUTING``) it waits briefly for a command on
    ``mcp_command_queue``; otherwise it ticks the engine until it drops back
    to the dormant state.

    Progress is published through the module-level ``daemon_status``,
    ``daemon_status_message`` and ``daemon_progress``. A failure during setup
    or inside the loop sets ``daemon_status`` to ``"ERROR"`` so that clients
    polling ``/health`` see the failure instead of waiting on a stale
    ``"DOWNLOADING"`` state.
    """
    global engine, mic, speaker, last_active_timestamp, daemon_status, daemon_status_message, daemon_progress

    # Pre-download models so the daemon status reflects exactly what is happening
    pre_download_models()
    daemon_status_message = "Instantiating hardware..."
    daemon_progress = 95

    try:
        # Load configuration using Hydra (config_path is relative to this file)
        with initialize(version_base=None, config_path="../../config"):
            cfg = compose(config_name="config")
            print("Loaded Hydra configuration successfully.")

        mic = instantiate(cfg.microphone)
        speaker = instantiate(cfg.speaker)
        vad = instantiate(cfg.vad)
        stt = instantiate(cfg.stt)
        llm = QueueLLMBridge(mcp_command_queue, mcp_result_queue)

        config = Config(
            vad_probability_threshold=cfg.vad.get("vad_probability_threshold", 0.80),
            vad_bargein_threshold_ms=cfg.vad.get("vad_bargein_threshold_ms", 500),
            endpointing_patience_normal_ms=cfg.vad.get("endpointing_patience_normal_ms", 1500),
            endpointing_patience_interrupted_ms=cfg.vad.get("endpointing_patience_interrupted_ms", 700),
            # NOTE(review): read from cfg.config while the others come from cfg.vad — confirm intended.
            vad_silence_grace_ms=cfg.config.get("vad_silence_grace_ms", 100)
        )

        engine = CoreEngine(config, mic, speaker, vad, stt, llm)
        engine.state = State.EXECUTING  # Start dormant
    except Exception as e:
        # Without this the thread would die silently and the status would
        # stay "DOWNLOADING" forever.
        daemon_status = "ERROR"
        daemon_status_message = f"Failed to start audio engine: {e}"
        print(f"Daemon startup exception: {e}", file=sys.stderr)
        if mic:
            mic.close()
        return

    daemon_status = "READY"
    daemon_status_message = "Audio Engine is online."
    daemon_progress = 100
    print("Audio Daemon Started. Waiting for commands.", file=sys.stderr)

    try:
        while True:
            # If dormant, check for commands from FastAPI
            if engine.state == State.EXECUTING:
                try:
                    cmd = mcp_command_queue.get(timeout=0.1)  # Blocks briefly
                except queue.Empty:
                    continue

                # We got a command, wake up the hardware!
                mic.start_stream()
                engine.start_conversation(cmd.get("text", ""))
                engine.expect_reply = cmd.get("expect_reply", True)
            else:
                engine.tick()
                # Once we drop back to EXECUTING, we finished the conversation loop
                if engine.state == State.EXECUTING:
                    mic.stop_stream()
                    last_active_timestamp = time.time()
    except Exception as e:
        # The loop cannot recover on its own; surface the failure to /health.
        daemon_status = "ERROR"
        daemon_status_message = f"Audio daemon crashed: {e}"
        print(f"Daemon exception: {e}", file=sys.stderr)
    finally:
        if mic:
            mic.close()
|
|
145
|
+
|
|
146
|
+
async def watchdog():
    """Check once a minute how long the daemon has been idle; exit the process past the limit.

    Idle time is measured from ``last_active_timestamp``. Once it exceeds
    ``IDLE_TIMEOUT_SECONDS`` the microphone (if any) is closed and the process
    is terminated with ``os._exit(0)``.
    """
    global last_active_timestamp
    while True:
        await asyncio.sleep(60)
        idle_time = time.time() - last_active_timestamp
        if idle_time <= IDLE_TIMEOUT_SECONDS:
            continue

        print(f"Idle timeout reached ({idle_time:.0f}s). Self-destructing to free RAM.", file=sys.stderr)
        if mic:
            mic.close()
        os._exit(0)
|
|
157
|
+
|
|
158
|
+
def parent_pid_polling():
    """Check the parent PID every 3 seconds and exit the process once it becomes 1.

    A parent PID of 1 is treated as "the original parent died"; the daemon
    then terminates immediately via ``os._exit(0)``.
    """
    time.sleep(3.0)
    while os.getppid() != 1:
        time.sleep(3.0)

    print("Parent process died. Stopping daemon to prevent Zombie microphone lock.", file=sys.stderr)
    os._exit(0)
|
|
165
|
+
|
|
166
|
+
@asynccontextmanager
async def lifespan(app: FastAPI):
    # Application lifespan: starts the audio daemon thread, the idle watchdog
    # task and the parent-PID poller on startup; on shutdown cancels the
    # watchdog and closes the microphone.

    # Boot the daemon thread on startup
    daemon_thread = threading.Thread(target=run_audio_daemon, daemon=True)
    daemon_thread.start()

    # Start the watchdog. Keep a reference: the event loop only holds weak
    # references to tasks, so an unreferenced task may be garbage-collected
    # before it finishes.
    watchdog_task = asyncio.create_task(watchdog())

    # Start the Parent PID Poller
    polling_thread = threading.Thread(target=parent_pid_polling, daemon=True)
    polling_thread.start()

    try:
        yield
    finally:
        # Shutdown logic (runs even if an exception is thrown into the generator)
        watchdog_task.cancel()
        try:
            await watchdog_task
        except asyncio.CancelledError:
            pass
        if mic:
            mic.close()
|
|
183
|
+
|
|
184
|
+
app = FastAPI(lifespan=lifespan)
|
|
185
|
+
|
|
186
|
+
@app.get("/health")
async def health_check():
    # Always answers "ok" once the HTTP app is up — even while models are
    # still downloading — so the MCP client can connect; the actual
    # gating on readiness happens in /converse. The daemon lifecycle state is
    # reported alongside.
    return dict(
        status="ok",
        daemon_status=daemon_status,
        message=daemon_status_message,
        progress=daemon_progress,
    )
|
|
196
|
+
|
|
197
|
+
@app.get("/status")
async def status_sse(request: Request):
    """Server-Sent Events endpoint to broadcast download/status progress to the UI."""
    import json
    from sse_starlette.sse import EventSourceResponse

    async def event_generator():
        last_msg = ""
        while True:
            if await request.is_disconnected():
                break

            # Only yield if the message changed to save bandwidth, unless we just connected
            current_msg = daemon_status_message
            if current_msg != last_msg:
                last_msg = current_msg
                yield {
                    "event": "status_update",
                    # json.dumps escapes quotes, backslashes and newlines, so
                    # messages containing exception text stay valid JSON.
                    "data": json.dumps(
                        {"status": daemon_status, "message": current_msg},
                        ensure_ascii=False,
                    ),
                }
            await asyncio.sleep(0.5)

    return EventSourceResponse(event_generator())
|
|
217
|
+
|
|
218
|
+
@app.post("/reload")
async def reload_config():
    # Hot-swap the audio components: re-read the Hydra config, build a fresh
    # mic/speaker/VAD/STT/engine set and publish it through the module globals.
    #
    # The globals are rebound rather than deleted. The daemon thread reads
    # ``engine`` and ``mic`` on every loop iteration and /converse reads
    # ``speaker`` and ``engine``; ``del`` on any of them makes those readers
    # raise NameError (which ends the daemon thread). The new components are
    # therefore built first and swapped in, and only then are the old ones
    # released.
    # NOTE(review): this means old and new models are briefly resident at the
    # same time — confirm the peak memory is acceptable on target machines.
    global engine, mic, speaker, vad, stt, daemon_status, daemon_status_message

    if daemon_status == "DOWNLOADING":
        return {"status": "error", "message": "Cannot reload while downloading models."}

    daemon_status = "RELOADING"
    daemon_status_message = "Hot-swapping audio models..."

    with mutex_lock:
        try:
            # 1. Park the current engine and release the audio device
            if engine:
                engine.state = State.EXECUTING
            if mic:
                mic.close()

            # 2. Re-read the YAML file using Hydra
            with initialize(version_base=None, config_path="../../config"):
                cfg = compose(config_name="config")

            # 3. Instantiate the new models into locals first, so a failure
            #    part-way through leaves the published globals untouched
            new_mic = instantiate(cfg.microphone)
            new_speaker = instantiate(cfg.speaker)
            new_vad = instantiate(cfg.vad)
            new_stt = instantiate(cfg.stt)
            llm = QueueLLMBridge(mcp_command_queue, mcp_result_queue)

            config = Config(
                vad_probability_threshold=cfg.vad.get("vad_probability_threshold", 0.80),
                vad_bargein_threshold_ms=cfg.vad.get("vad_bargein_threshold_ms", 500),
                endpointing_patience_normal_ms=cfg.vad.get("endpointing_patience_normal_ms", 1500),
                endpointing_patience_interrupted_ms=cfg.vad.get("endpointing_patience_interrupted_ms", 700),
                vad_silence_grace_ms=cfg.config.get("vad_silence_grace_ms", 100)
            )

            new_engine = CoreEngine(config, new_mic, new_speaker, new_vad, new_stt, llm)
            new_engine.state = State.EXECUTING

            # 4. Publish the new set; rebinding drops this module's references
            #    to the previous components
            mic, speaker, vad, stt = new_mic, new_speaker, new_vad, new_stt
            engine = new_engine

        except Exception as e:
            daemon_status = "ERROR"
            daemon_status_message = f"Failed to reload: {str(e)}"
            return {"status": "error", "message": daemon_status_message}

        # 5. Release memory held by the old models. This is best-effort: a
        #    failure here must not turn a successful swap into an error.
        import gc
        gc.collect()

        try:
            import mlx.core as mx
            mx.metal.clear_cache()
        except ImportError:
            pass
        except Exception as cache_err:
            print(f"MLX cache clear skipped: {cache_err}", file=sys.stderr)

        try:
            import torch
            if torch.backends.mps.is_available():
                torch.mps.empty_cache()
        except ImportError:
            pass
        except Exception as cache_err:
            print(f"Torch MPS cache clear skipped: {cache_err}", file=sys.stderr)

        daemon_status = "READY"
        daemon_status_message = "Audio Engine reloaded successfully."
        return {"status": "ok", "message": "Audio engine hot-swapped successfully."}
|
|
291
|
+
|
|
292
|
+
@app.post("/converse")
async def converse(request: Request):
    # Run one speak/listen exchange for the calling session.
    #
    # Expects a JSON body with "session_id", "text_to_speak" and optional
    # "expect_reply". Only one session may hold the microphone at a time; other
    # sessions get a "system_busy" reply. The command is handed to the daemon
    # thread through mcp_command_queue and the first item that appears on
    # mcp_result_queue is returned.
    global active_session_id, last_active_timestamp

    # Fast-Fail Graceful State to prevent Claude Timeout during the 3GB initial download
    if daemon_status == "DOWNLOADING":
        return {
            "status": "system_busy",
            "message": f"SYSTEM NOTIFICATION: Speak MCP is currently initializing. {daemon_status_message} Please instruct the user to wait a moment and try again."
        }

    # Any other non-READY state (failed startup/reload, reload in progress)
    # has no working engine to answer, so queuing a command would leave this
    # request waiting for a result that never arrives.
    if daemon_status != "READY":
        return {
            "status": "error",
            "message": f"Voice Audio Daemon is not ready ({daemon_status}): {daemon_status_message}"
        }

    body = await request.json()
    session_id = body.get("session_id")
    text_to_speak = body.get("text_to_speak", "")
    expect_reply = body.get("expect_reply", True)

    with mutex_lock:
        if active_session_id is not None and active_session_id != session_id:
            return {
                "status": "system_busy",
                "message": "Microphone is in use by another session. Fallback to text."
            }
        # Lock the logical session
        active_session_id = session_id
        last_active_timestamp = time.time()

    try:
        # Feed command to daemon
        mcp_command_queue.put({"text": text_to_speak, "expect_reply": expect_reply})

        # Wait for human to interact or natural termination, checking for client disconnects
        while True:
            if await request.is_disconnected():
                print(f"[{session_id}] Client disconnected! Aborting audio loop.", file=sys.stderr)
                # Client hung up (e.g. reload or ctrl+c). We must reset the engine immediately.
                if speaker:
                    speaker.flush()
                if engine:
                    engine.state = State.EXECUTING  # This will trigger mic.stop_stream() in the loop
                raise HTTPException(status_code=499, detail="Client Disconnected")

            try:
                # Use a short timeout so we can loop and check for is_disconnected()
                result = await asyncio.to_thread(mcp_result_queue.get, timeout=0.1)
            except queue.Empty:
                await asyncio.sleep(0.01)
                continue

            last_active_timestamp = time.time()
            return result

    finally:
        # Always release the logical lock when the request ends
        with mutex_lock:
            active_session_id = None
|
|
345
|
+
|
|
346
|
+
if __name__ == "__main__":
    import uvicorn
    import os

    # Keep the socket inside the user's own directory rather than /tmp, to
    # avoid hijacking and permission issues.
    app_support_dir = os.path.expanduser("~/Library/Application Support/SpeakMCP")
    os.makedirs(app_support_dir, exist_ok=True)
    socket_path = os.path.join(app_support_dir, "daemon.sock")

    # A socket file left behind by a previous run would cause
    # "Address already in use"; remove it (best-effort) before binding.
    stale_socket_present = os.path.exists(socket_path)
    if stale_socket_present:
        try:
            os.unlink(socket_path)
        except OSError:
            pass

    # Important: run with workers=1 to ensure singleton
    uvicorn.run(app, uds=socket_path, workers=1)
|
package/src/index.ts
ADDED
|
@@ -0,0 +1,63 @@
|
|
|
1
|
+
#!/usr/bin/env node
|
|
2
|
+
|
|
3
|
+
import { spawn } from "node:child_process";
|
|
4
|
+
import { join, dirname } from "node:path";
|
|
5
|
+
import { fileURLToPath } from "node:url";
|
|
6
|
+
import { existsSync } from "node:fs";
|
|
7
|
+
|
|
8
|
+
// Get the directory of the current module
|
|
9
|
+
const __filename = fileURLToPath(import.meta.url);
|
|
10
|
+
const __dirname = dirname(__filename);
|
|
11
|
+
|
|
12
|
+
// Root of the project
|
|
13
|
+
const projectRoot = join(__dirname, "..");
|
|
14
|
+
|
|
15
|
+
// Path to the Python script
|
|
16
|
+
const pythonScriptPath = join(projectRoot, "src", "mcp_server.py");
|
|
17
|
+
|
|
18
|
+
/**
 * Choose the Python interpreter used to launch the MCP server.
 *
 * Prefers the project-local virtualenv interpreter at
 * `<projectRoot>/venv/bin/python3`; when that file does not exist,
 * falls back to `python3` as resolved through PATH.
 */
function getPythonExecutable(): string {
  const venvPython = join(projectRoot, "venv", "bin", "python3");
  return existsSync(venvPython) ? venvPython : "python3";
}
|
|
31
|
+
|
|
32
|
+
const pythonExecutable = getPythonExecutable();
|
|
33
|
+
|
|
34
|
+
/**
 * Start the Python MCP Server and bridge standard I/O.
 *
 * stdin/stdout are piped through to the child (stderr is inherited), the
 * wrapper exits when the child exits, and SIGINT/SIGTERM are forwarded.
 * A spawn failure (e.g. interpreter not found) is reported on stderr and
 * exits with code 1 instead of crashing on an unhandled `error` event.
 */
function startBridge() {
  const pythonProcess = spawn(pythonExecutable, [pythonScriptPath], {
    stdio: ["pipe", "pipe", "inherit"],
    env: {
      ...process.env,
      // Ensure Python output isn't buffered
      PYTHONUNBUFFERED: "1",
    },
  });

  // Emitted when the process could not be spawned; stdout is reserved for
  // the protocol stream, so report on stderr only.
  pythonProcess.on("error", (err) => {
    console.error(
      `Failed to start Python MCP server with "${pythonExecutable}": ${err.message}`
    );
    process.exit(1);
  });

  // Pipe our stdin into Python's stdin
  process.stdin.pipe(pythonProcess.stdin!);

  // Pipe Python's stdout back to our stdout
  pythonProcess.stdout!.pipe(process.stdout);

  // Handle process termination. `code` is null when the child was killed by
  // a signal; report that as a failure rather than a clean exit.
  pythonProcess.on("exit", (code) => {
    process.exit(code ?? 1);
  });

  // Forward signals
  process.on("SIGINT", () => pythonProcess.kill("SIGINT"));
  process.on("SIGTERM", () => pythonProcess.kill("SIGTERM"));
}
|
|
62
|
+
|
|
63
|
+
startBridge();
|
|
@@ -0,0 +1,254 @@
|
|
|
1
|
+
import os
|
|
2
|
+
import sys
|
|
3
|
+
|
|
4
|
+
# stdout is temporarily pointed at stderr while the rest of this module is
# imported; the saved descriptor is restored in the ``__main__`` block just
# before the MCP server starts.

# 1. Save the actual OS-level stdout (FD 1) to a cloned file descriptor
original_stdout_fd = os.dup(1)

# 2. Force OS-level stdout (FD 1) to point to stderr (FD 2).
# Now, ALL C-level and Python-level logs safely pipe to stderr.
os.dup2(2, 1)

# 3. Redirect Python's sys.stdout just to be thorough
sys.stdout = sys.stderr
|
|
13
|
+
|
|
14
|
+
import uuid
|
|
15
|
+
import subprocess
|
|
16
|
+
import json
|
|
17
|
+
import socket
|
|
18
|
+
import http.client
|
|
19
|
+
import time
|
|
20
|
+
import logging
|
|
21
|
+
import asyncio
|
|
22
|
+
import random
|
|
23
|
+
|
|
24
|
+
from mcp.server.fastmcp import FastMCP, Context
|
|
25
|
+
|
|
26
|
+
# Send all log records to stderr.
logging.basicConfig(level=logging.INFO, stream=sys.stderr)

# Inject the busy-signal instructions into the server instructions
# (passed verbatim to FastMCP below).
instructions = """
<voice_loop_instructions>
If the voice_converse tool returns "status": "system_busy", it means the physical microphone is currently locked by another AI agent in a different window.
DO NOT retry the tool. Output a standard text message explaining the audio channel is busy, and continue the conversation via text.
CRITICAL RULE: If you use the voice_converse tool and it returns "status": "silence_timeout", you MUST NOT abandon the voice loop by simply typing a text response. You MUST formally close the hardware loop by calling voice_converse ONE LAST TIME with "expect_reply": false and "text_to_speak": "I didn't hear anything, so I am turning off the microphone now."
</voice_loop_instructions>
"""

# Initialize FastMCP Server
mcp = FastMCP("voice-mcp-server-client", instructions=instructions)

# Random identifier generated once per process; sent as "session_id" with
# every /converse request.
SESSION_ID = str(uuid.uuid4())

# We use Unix Domain Sockets to bypass macOS firewall popups
# Isolate socket to user directory to prevent /tmp hijacking
app_support_dir = os.path.expanduser("~/Library/Application Support/SpeakMCP")
os.makedirs(app_support_dir, exist_ok=True)
# Path of the daemon's Unix domain socket used by every request below.
SOCKET_PATH = os.path.join(app_support_dir, "daemon.sock")
|
|
47
|
+
|
|
48
|
+
class UDSHTTPConnection(http.client.HTTPConnection):
    """``http.client`` connection whose transport is a Unix domain socket.

    The host is fixed to ``"localhost"``; the real endpoint is the filesystem
    path given as ``socket_path``.
    """

    def __init__(self, socket_path, timeout=300.0):
        http.client.HTTPConnection.__init__(self, "localhost", timeout=timeout)
        self.socket_path = socket_path

    def connect(self):
        """Open an ``AF_UNIX`` stream socket to ``socket_path`` with the configured timeout."""
        uds = socket.socket(socket.AF_UNIX, socket.SOCK_STREAM)
        # Publish the socket on the connection before connecting so that
        # close() can release it even if connect() fails.
        self.sock = uds
        uds.settimeout(self.timeout)
        uds.connect(self.socket_path)
|
|
58
|
+
|
|
59
|
+
def make_uds_request(method: str, path: str, payload: dict = None, timeout: float = 1.0) -> tuple[int, dict]:
    """Send one HTTP request to the daemon over its Unix socket and parse the JSON reply.

    Args:
        method: HTTP method, e.g. ``"GET"`` or ``"POST"``.
        path: Request path, e.g. ``"/health"``.
        payload: Optional JSON-serialisable body. ``None`` sends no body; any
            other value (including an empty dict) is sent as JSON.
        timeout: Socket timeout in seconds for connect and read.

    Returns:
        ``(status_code, parsed_body)``; ``parsed_body`` is ``{}`` when the
        response body is empty.

    Raises:
        OSError: connection or timeout failures on the socket.
        ValueError: the response body is not valid UTF-8 JSON.
    """
    # Test against None rather than truthiness so an empty dict is still sent.
    has_body = payload is not None
    body = json.dumps(payload).encode('utf-8') if has_body else None
    headers = {'Content-Type': 'application/json'} if has_body else {}

    conn = UDSHTTPConnection(SOCKET_PATH, timeout=timeout)
    try:
        conn.request(method, path, body=body, headers=headers)
        response = conn.getresponse()
        data = response.read().decode('utf-8')
        return response.status, json.loads(data) if data else {}
    finally:
        conn.close()
|
|
71
|
+
|
|
72
|
+
def check_daemon_health():
    """Return True when the daemon answers ``GET /health`` with HTTP 200.

    Any transport-level or parse failure counts as "not healthy" and yields
    False instead of raising, so callers can poll this while the daemon is
    still starting up.
    """
    try:
        status, _ = make_uds_request("GET", "/health", timeout=1.0)
    except (OSError, http.client.HTTPException, ValueError):
        # OSError covers socket errors, refused/missing socket and timeouts;
        # HTTPException covers malformed or truncated HTTP replies;
        # ValueError covers undecodable / non-JSON bodies.
        return False
    return status == 200
|
|
78
|
+
|
|
79
|
+
def ensure_daemon_running():
    """Checks if daemon is up, auto-boots it if not, and polls until ready.

    Raises:
        RuntimeError: the daemon process exited during startup, or the health
            check did not pass within the polling window.
    """
    if check_daemon_health():
        return

    logging.info("Daemon is down, attempting to boot detached process...")
    # Boot the daemon detached
    project_root = os.path.abspath(os.path.join(os.path.dirname(__file__), ".."))
    python_exec = os.path.join(project_root, "venv", "bin", "python3")
    if not os.path.exists(python_exec):
        # No project venv: fall back to the interpreter running this server
        # instead of failing with FileNotFoundError.
        python_exec = sys.executable or "python3"
    daemon_script = os.path.join(project_root, "src", "daemon", "audio_server.py")

    daemon_proc = subprocess.Popen(
        [python_exec, daemon_script],
        # Do not let the daemon inherit this process's stdin pipe.
        stdin=subprocess.DEVNULL,
        stdout=subprocess.DEVNULL,
        stderr=subprocess.DEVNULL,
        start_new_session=True  # Detach entirely so it survives CLI restarts
    )

    # Poll until health check passes (give it time to load ML models)
    max_retries = 120  # 60 seconds
    for _ in range(max_retries):
        if check_daemon_health():
            return
        exit_code = daemon_proc.poll()
        if exit_code is not None:
            # The process is already gone; waiting out the full window is pointless.
            raise RuntimeError(
                f"Failed to auto-boot Voice Audio Daemon. Process exited with code {exit_code}."
            )
        time.sleep(0.5)

    raise RuntimeError("Failed to auto-boot Voice Audio Daemon. Health check timed out.")
|
|
105
|
+
|
|
106
|
+
@mcp.tool()
def configure_audio_engine(speaker_adapter: str = None, vad_adapter: str = None, stt_adapter: str = None) -> dict:
    """
    Dynamically hot-swap the Voice Audio Daemon's AI models and hardware without restarting.
    Args:
    speaker_adapter: Valid options: 'kokoro_speaker', 'elevenlabs_speaker', 'live_speaker'.
    vad_adapter: Valid options: 'silero_vad' (Conversational), 'ptt_vad' (Walkie-Talkie).
    stt_adapter: Valid options: 'mlx_whisper_large_v3', 'whisper_stt'.
    """
    # Rewrites the "- speaker:", "- vad:" and "- stt:" entries of
    # config/config.yaml for every adapter that was supplied, then asks the
    # daemon to reload via POST /reload. Always returns a dict; failures are
    # reported as {"status": "error", ...} rather than raised.
    try:
        import re

        requested = {"speaker": speaker_adapter, "vad": vad_adapter, "stt": stt_adapter}

        # The names are written straight into the YAML config, so reject
        # anything that is not a plain identifier (no newlines, colons,
        # backslashes, path separators) before touching the file.
        for group, adapter in requested.items():
            if adapter and not re.fullmatch(r"[A-Za-z0-9_\-]+", adapter):
                return {
                    "status": "error",
                    "message": f"Invalid {group}_adapter name: {adapter!r}. Only letters, digits, '_' and '-' are allowed."
                }

        ensure_daemon_running()

        project_root = os.path.abspath(os.path.join(os.path.dirname(__file__), ".."))
        config_path = os.path.join(project_root, "config", "config.yaml")

        with open(config_path, "r") as f:
            content = f.read()

        for group, adapter in requested.items():
            if adapter:
                content = re.sub(rf"- {group}: .*", f"- {group}: {adapter}", content)

        with open(config_path, "w") as f:
            f.write(content)

        # Trigger Daemon hot-reload
        status, response_data = make_uds_request("POST", "/reload", timeout=15.0)
        return response_data

    except (socket.error, ConnectionError, FileNotFoundError, ConnectionRefusedError):
        return {
            "status": "error",
            "message": "CRITICAL: The Voice Audio Daemon failed to respond to the reload request."
        }
    except Exception as e:
        return {
            "status": "error",
            "message": f"CRITICAL Error dynamically reloading audio daemon: {str(e)}"
        }
|
|
149
|
+
|
|
150
|
+
async def render_visualizer(ctx: Context):
    """Emit a random 12-bar "spectrum" as an MCP progress message every 0.1 s.

    The bars are purely decorative (randomly chosen, not derived from audio).
    Returns immediately when ``ctx`` is falsy; otherwise runs until the task
    is cancelled, and swallows the cancellation.
    """
    if not ctx:
        return

    bars = [" ", "▂", "▃", "▄", "▅", "▆", "▇", "█"]
    bar_count = 12
    try:
        while True:
            spectrum = "".join([random.choice(bars) for _ in range(bar_count)])
            await ctx.report_progress(100, 100, message=f"🎙️ {spectrum} 🎙️")
            await asyncio.sleep(0.1)
    except asyncio.CancelledError:
        pass
|
|
161
|
+
|
|
162
|
+
@mcp.tool()
async def voice_converse(text_to_speak: str, expect_reply: bool = True, ctx: Context = None) -> dict:
    """
    Speak a prompt to the user and listen for a response.
    If expect_reply is False, the tool returns immediately after queuing the speech.
    """
    # Flow: make sure the daemon is up, POST /converse (showing the visualizer
    # while waiting). If the daemon answers that it is still initializing,
    # poll /health, relaying progress, until it is READY and then issue the
    # real /converse call. Always returns a dict; failures are reported as
    # {"status": "error", ...} rather than raised.

    async def _converse_with_visualizer():
        # One /converse round-trip in a worker thread, with the visualizer
        # task running for exactly the duration of the call.
        vis_task = asyncio.create_task(render_visualizer(ctx)) if ctx else None
        try:
            return await asyncio.to_thread(
                make_uds_request,
                "POST",
                "/converse",
                {"session_id": SESSION_ID, "text_to_speak": text_to_speak, "expect_reply": expect_reply},
                300.0
            )
        finally:
            if vis_task:
                vis_task.cancel()

    try:
        ensure_daemon_running()

        status, response_data = await _converse_with_visualizer()

        is_initializing = bool(
            response_data
            and response_data.get("status") == "system_busy"
            and "initializing" in response_data.get("message", "").lower()
        )
        if not is_initializing:
            return response_data

        # Handle the initialization (download) state automatically with native progress
        if ctx:
            await ctx.info("Speak MCP: Initializing Local AI Models. This may take a few minutes...")

        # Give up if the daemon stops answering /health for this many polls
        # in a row, instead of polling a dead daemon forever.
        max_health_failures = 30
        consecutive_failures = 0

        while True:
            # Only the health probe itself is tolerated to fail; errors from
            # the real conversation call below must reach the outer handlers
            # rather than being swallowed and silently retried.
            try:
                # Async request for health to not block the event loop
                h_status, h_data = await asyncio.to_thread(make_uds_request, "GET", "/health", None, 5.0)
            except Exception as poll_err:
                logging.warning("Health poll failed: %s", poll_err)
                h_status, h_data = None, {}

            if h_status == 200:
                consecutive_failures = 0
                d_status = h_data.get("daemon_status")
                d_msg = h_data.get("message", "")
                d_progress = h_data.get("progress", 0)

                # Report progress back to Gemini CLI for native rendering
                if ctx:
                    await ctx.report_progress(d_progress, 100, message=d_msg)

                if d_status == "READY":
                    if ctx:
                        await ctx.info("Speak MCP: Setup Complete!")

                    # After setup, the models are ready! Now perform the ACTUAL converse call with visualizer.
                    status, final_response = await _converse_with_visualizer()
                    return final_response

                if d_status == "ERROR":
                    return {"status": "error", "message": d_msg}
            else:
                consecutive_failures += 1
                if consecutive_failures >= max_health_failures:
                    return {
                        "status": "error",
                        "user_transcript": "",
                        "message": "CRITICAL: The Voice Audio Daemon stopped responding during initialization."
                    }

            await asyncio.sleep(1.0)

    # Timeouts are OSError subclasses, so they must be handled before the
    # generic socket-error clause or this branch can never run.
    except (TimeoutError, socket.timeout):
        return {
            "status": "error",
            "user_transcript": "",
            "message": "CRITICAL: The Voice Audio Daemon timed out waiting for speech."
        }
    except (socket.error, ConnectionError, FileNotFoundError, ConnectionRefusedError):
        return {
            "status": "error",
            "user_transcript": "",
            "message": "CRITICAL: The Voice Audio Daemon failed to respond."
        }
    except Exception as e:
        return {
            "status": "error",
            "user_transcript": "",
            "message": f"CRITICAL Error starting audio daemon: {str(e)}"
        }
|
|
246
|
+
|
|
247
|
+
if __name__ == "__main__":
    # Undo the import-time redirection of stdout to stderr: FD 1 gets the
    # saved descriptor back, the now-unneeded duplicate is closed, and
    # Python's sys.stdout is reset to its original object.

    # 4. Restore the OS-level stdout just before handing control to the MCP SDK
    os.dup2(original_stdout_fd, 1)
    os.close(original_stdout_fd)
    sys.stdout = sys.__stdout__

    # 5. Now the JSON-RPC protocol has an absolutely pristine stdout pipe
    mcp.run()
|
|
File without changes
|
|
Binary file
|
|
Binary file
|
|
Binary file
|
|
Binary file
|