voice-mcp-server 0.1.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (45) hide show
  1. package/LICENSE +21 -0
  2. package/README.md +193 -0
  3. package/build/index.js +51 -0
  4. package/config/config.yaml +25 -0
  5. package/config/microphone/live_mic.yaml +1 -0
  6. package/config/speaker/elevenlabs_speaker.yaml +3 -0
  7. package/config/speaker/kokoro_speaker.yaml +3 -0
  8. package/config/stt/mlx_whisper_large_v3.yaml +2 -0
  9. package/config/vad/ptt_vad.yaml +8 -0
  10. package/config/vad/silero_vad.yaml +7 -0
  11. package/package.json +40 -0
  12. package/requirements.txt +126 -0
  13. package/src/adapters_real/__init__.py +0 -0
  14. package/src/adapters_real/__pycache__/__init__.cpython-312.pyc +0 -0
  15. package/src/adapters_real/__pycache__/kokoro_speaker.cpython-312.pyc +0 -0
  16. package/src/adapters_real/__pycache__/live_mic.cpython-312.pyc +0 -0
  17. package/src/adapters_real/__pycache__/ptt_vad.cpython-312.pyc +0 -0
  18. package/src/adapters_real/__pycache__/queue_llm.cpython-312.pyc +0 -0
  19. package/src/adapters_real/__pycache__/whisper_stt.cpython-312.pyc +0 -0
  20. package/src/adapters_real/echo_llm.py +28 -0
  21. package/src/adapters_real/elevenlabs_speaker.py +117 -0
  22. package/src/adapters_real/kokoro_speaker.py +122 -0
  23. package/src/adapters_real/live_mic.py +64 -0
  24. package/src/adapters_real/live_speaker.py +66 -0
  25. package/src/adapters_real/ptt_vad.py +36 -0
  26. package/src/adapters_real/queue_llm.py +36 -0
  27. package/src/adapters_real/silero_vad.py +43 -0
  28. package/src/adapters_real/wav_mic.py +17 -0
  29. package/src/adapters_real/whisper_stt.py +32 -0
  30. package/src/daemon/__init__.py +0 -0
  31. package/src/daemon/audio_server.py +363 -0
  32. package/src/index.ts +63 -0
  33. package/src/mcp_server.py +254 -0
  34. package/src/simulation/__init__.py +0 -0
  35. package/src/simulation/__pycache__/__init__.cpython-312.pyc +0 -0
  36. package/src/simulation/__pycache__/engine.cpython-312.pyc +0 -0
  37. package/src/simulation/__pycache__/models.cpython-312.pyc +0 -0
  38. package/src/simulation/__pycache__/ports.cpython-312.pyc +0 -0
  39. package/src/simulation/adapters.py +131 -0
  40. package/src/simulation/engine.py +242 -0
  41. package/src/simulation/models.py +25 -0
  42. package/src/simulation/ports.py +57 -0
  43. package/src/simulation/tests/__init__.py +0 -0
  44. package/src/simulation/tests/test_scenarios.py +510 -0
  45. package/tsconfig.json +15 -0
@@ -0,0 +1,363 @@
1
import asyncio
import sys
import os
import time
import threading
import queue
import logging
from contextlib import asynccontextmanager
from fastapi import FastAPI, Request, HTTPException
from fastapi.responses import StreamingResponse  # NOTE(review): appears unused in this file — confirm before removing
from hydra import compose, initialize
from hydra.utils import instantiate

# Enforce strict model download locations BEFORE loading any ML libraries —
# HF_HOME / TORCH_HOME must be set before the libraries are imported.
app_support_dir = os.path.expanduser("~/Library/Application Support/VoiceMCP/models")
os.makedirs(app_support_dir, exist_ok=True)
os.environ["HF_HOME"] = os.path.join(app_support_dir, "huggingface")
os.environ["TORCH_HOME"] = os.path.join(app_support_dir, "torch")

# Add src to python path so sibling packages (simulation, adapters_real) resolve.
sys.path.append(os.path.abspath(os.path.join(os.path.dirname(__file__), '..')))

from simulation.models import Config
from simulation.engine import CoreEngine, State
from adapters_real.queue_llm import QueueLLMBridge

# --- Global State ---
# These two queues bridge the FastAPI handlers and the background daemon
# thread: commands flow in, conversation results flow back out.
mcp_command_queue = queue.Queue()
mcp_result_queue = queue.Queue()
active_session_id = None        # logical microphone lock: one MCP session at a time
mutex_lock = threading.Lock()   # guards active_session_id and hot-reloads
last_active_timestamp = time.time()
IDLE_TIMEOUT_SECONDS = 900  # 15 minutes

# Daemon Lifecycle State
daemon_status = "DOWNLOADING"  # Starts in downloading state to prevent Claude timeouts
daemon_status_message = "Initializing models..."
daemon_progress = 0            # 0-100, surfaced through /health and /status

# Engine reference (populated by run_audio_daemon once models are loaded)
engine = None
mic = None
speaker = None
45
def pre_download_models():
    """Fetch the large TTS/STT models into the explicit HF_HOME cache
    (configured at module import) before any adapter instantiates them.

    Updates the module-level ``daemon_status_message`` / ``daemon_progress``
    so /health and /status can report download progress while this runs.
    """
    global daemon_status_message, daemon_progress
    try:
        from huggingface_hub import snapshot_download

        # 1. Kokoro TTS (82M). First attempt a cache-only resolution; any
        # failure (missing entry, incomplete snapshot, ...) falls through to
        # a real network download. NOTE: the original called
        # try_to_load_from_cache() and discarded its return value — that
        # helper returns None on a miss rather than raising, so only the
        # snapshot_download(local_files_only=True) call below actually
        # detects a cold cache. The original `except (LocalEntryNotFoundError,
        # Exception)` was also equivalent to a bare `except Exception`.
        try:
            daemon_status_message = "Loading Kokoro TTS (82M)..."
            daemon_progress = 10
            snapshot_download(repo_id="hexgrad/Kokoro-82M",
                              allow_patterns=["*.pth", "*.json", "voices/*"],
                              local_files_only=True)
        except Exception:
            daemon_status_message = "Downloading Kokoro TTS (82M)..."
            daemon_progress = 5
            snapshot_download(repo_id="hexgrad/Kokoro-82M",
                              allow_patterns=["*.pth", "*.json", "voices/*"])

        # 2. MLX Whisper Large v3 (3GB), same cache-first strategy.
        try:
            daemon_status_message = "Loading MLX Whisper Large v3 (3GB)..."
            daemon_progress = 50
            snapshot_download(repo_id="mlx-community/whisper-large-v3-mlx",
                              local_files_only=True)
        except Exception:
            daemon_status_message = "Downloading MLX Whisper Large v3 (3GB)..."
            daemon_progress = 30
            snapshot_download(repo_id="mlx-community/whisper-large-v3-mlx")

        daemon_status_message = "Finalizing AI setup..."
        daemon_progress = 90
    except Exception as e:
        # Best-effort: surface the failure via status but let the daemon
        # continue; instantiation will fail loudly later if models are absent.
        print(f"Model download error: {e}", file=sys.stderr)
        daemon_status_message = f"Error downloading models: {e}"
80
+
81
def run_audio_daemon():
    """Run the CoreEngine in a persistent background thread.

    Lifecycle: pre-download models, build all adapters from the Hydra config,
    then loop forever — dormant (State.EXECUTING) until a command arrives on
    mcp_command_queue, ticking the engine while a conversation is active.
    """
    global engine, mic, speaker, last_active_timestamp, daemon_status, daemon_status_message, daemon_progress

    # Pre-download models so the daemon status reflects exactly what is happening
    pre_download_models()
    daemon_status_message = "Instantiating hardware..."
    daemon_progress = 95

    # Load configuration using Hydra (config_path is relative to this file).
    # The original also computed an absolute config_dir that was never used.
    with initialize(version_base=None, config_path="../../config"):
        cfg = compose(config_name="config")
    # Route to stderr like every other diagnostic in this daemon.
    print("Loaded Hydra configuration successfully.", file=sys.stderr)

    mic = instantiate(cfg.microphone)
    speaker = instantiate(cfg.speaker)
    vad = instantiate(cfg.vad)
    stt = instantiate(cfg.stt)
    llm = QueueLLMBridge(mcp_command_queue, mcp_result_queue)

    config = Config(
        vad_probability_threshold=cfg.vad.get("vad_probability_threshold", 0.80),
        vad_bargein_threshold_ms=cfg.vad.get("vad_bargein_threshold_ms", 500),
        endpointing_patience_normal_ms=cfg.vad.get("endpointing_patience_normal_ms", 1500),
        endpointing_patience_interrupted_ms=cfg.vad.get("endpointing_patience_interrupted_ms", 700),
        vad_silence_grace_ms=cfg.config.get("vad_silence_grace_ms", 100)
    )

    engine = CoreEngine(config, mic, speaker, vad, stt, llm)
    engine.state = State.EXECUTING  # Start dormant

    daemon_status = "READY"
    daemon_status_message = "Audio Engine is online."
    daemon_progress = 100
    print("Audio Daemon Started. Waiting for commands.", file=sys.stderr)

    try:
        while True:
            if engine.state == State.EXECUTING:
                # Dormant: wait briefly for a command from FastAPI.
                try:
                    cmd = mcp_command_queue.get(timeout=0.1)  # blocks briefly

                    # We got a command — wake up the hardware.
                    mic.start_stream()
                    engine.start_conversation(cmd.get("text", ""))
                    engine.expect_reply = cmd.get("expect_reply", True)
                except queue.Empty:
                    pass
            else:
                engine.tick()
                # Dropping back to EXECUTING means the conversation finished.
                if engine.state == State.EXECUTING:
                    mic.stop_stream()
                    last_active_timestamp = time.time()

    except Exception as e:
        print(f"Daemon exception: {e}", file=sys.stderr)
    finally:
        # Always release the microphone device, even on crash.
        if mic:
            mic.close()
145
+
146
async def watchdog():
    """Terminate the daemon after IDLE_TIMEOUT_SECONDS of inactivity to
    free the RAM held by the loaded models."""
    global last_active_timestamp
    while True:
        await asyncio.sleep(60)
        idle_for = time.time() - last_active_timestamp
        if idle_for <= IDLE_TIMEOUT_SECONDS:
            continue
        print(f"Idle timeout reached ({idle_for:.0f}s). Self-destructing to free RAM.", file=sys.stderr)
        if mic:
            mic.close()
        os._exit(0)
157
+
158
def parent_pid_polling():
    """Exit as soon as the parent process dies (we get re-parented to PID 1),
    so a crashed CLI cannot leave a zombie holding the microphone."""
    while True:
        time.sleep(3.0)
        if os.getppid() != 1:
            continue
        print("Parent process died. Stopping daemon to prevent Zombie microphone lock.", file=sys.stderr)
        os._exit(0)
165
+
166
@asynccontextmanager
async def lifespan(app: FastAPI):
    """FastAPI lifespan: boot the audio daemon, idle watchdog and parent-PID
    poller on startup; release the microphone on shutdown."""
    # Boot the daemon thread on startup (daemon=True so it never blocks exit)
    daemon_thread = threading.Thread(target=run_audio_daemon, daemon=True)
    daemon_thread.start()

    # Start the idle-timeout watchdog on the event loop
    asyncio.create_task(watchdog())

    # Start the Parent PID Poller
    polling_thread = threading.Thread(target=parent_pid_polling, daemon=True)
    polling_thread.start()

    yield
    # Shutdown logic: release the microphone device
    if mic:
        mic.close()
183
+
184
app = FastAPI(lifespan=lifespan)

@app.get("/health")
async def health_check():
    """Liveness + progress endpoint.

    If the app is up, we are technically "healthy" enough for the MCP client
    to connect, even if we are downloading. The actual block happens in
    /converse.
    """
    return {
        "status": "ok",
        "daemon_status": daemon_status,
        "message": daemon_status_message,
        "progress": daemon_progress
    }
196
+
197
@app.get("/status")
async def status_sse(request: Request):
    """Server-Sent Events endpoint to broadcast download/status progress to the UI."""
    import json  # local import; only this endpoint serializes JSON by hand

    async def event_generator():
        last_msg = ""
        while True:
            if await request.is_disconnected():
                break

            # Only yield when the message changes, to save bandwidth.
            if daemon_status_message != last_msg:
                last_msg = daemon_status_message
                yield {
                    "event": "status_update",
                    # json.dumps escapes quotes/newlines: the original
                    # f-string-built payload became invalid JSON whenever the
                    # status message contained a double quote (e.g. an error
                    # message embedding an exception repr).
                    "data": json.dumps({
                        "status": daemon_status,
                        "message": daemon_status_message,
                    }),
                }
            await asyncio.sleep(0.5)

    from sse_starlette.sse import EventSourceResponse
    return EventSourceResponse(event_generator())
217
+
218
@app.post("/reload")
async def reload_config():
    """Hot-swap the audio stack: park the engine, free model memory,
    re-read the Hydra config and rebuild every adapter."""
    global engine, mic, speaker, vad, stt, daemon_status, daemon_status_message

    if daemon_status == "DOWNLOADING":
        return {"status": "error", "message": "Cannot reload while downloading models."}

    daemon_status = "RELOADING"
    daemon_status_message = "Hot-swapping audio models..."

    with mutex_lock:
        # 1. Park the engine (the daemon loop treats EXECUTING as dormant)
        # and release the microphone device.
        if engine:
            engine.state = State.EXECUTING
        if mic:
            mic.close()

        # 1b. CRITICAL: drop strong references to the old models so the GC can
        # reclaim (V)RAM before instantiating replacements, preventing OOM on
        # hot-swaps. Each name is removed independently: the original
        # `del speaker; del vad; del stt; del engine` chain aborted at the
        # first NameError (vad/stt only become module globals after a prior
        # reload), which left the later names — including engine — alive.
        import gc
        for _name in ("speaker", "vad", "stt", "engine"):
            globals().pop(_name, None)
        gc.collect()

        # Flush accelerator caches when the backends are installed.
        try:
            import mlx.core as mx
            mx.metal.clear_cache()
        except ImportError:
            pass

        try:
            import torch
            if torch.backends.mps.is_available():
                torch.mps.empty_cache()
        except ImportError:
            pass

        try:
            # 2. Re-read the YAML file using Hydra
            with initialize(version_base=None, config_path="../../config"):
                cfg = compose(config_name="config")

            # 3. Instantiate the new models on the fly
            mic = instantiate(cfg.microphone)
            speaker = instantiate(cfg.speaker)
            vad = instantiate(cfg.vad)
            stt = instantiate(cfg.stt)
            llm = QueueLLMBridge(mcp_command_queue, mcp_result_queue)

            config = Config(
                vad_probability_threshold=cfg.vad.get("vad_probability_threshold", 0.80),
                vad_bargein_threshold_ms=cfg.vad.get("vad_bargein_threshold_ms", 500),
                endpointing_patience_normal_ms=cfg.vad.get("endpointing_patience_normal_ms", 1500),
                endpointing_patience_interrupted_ms=cfg.vad.get("endpointing_patience_interrupted_ms", 700),
                vad_silence_grace_ms=cfg.config.get("vad_silence_grace_ms", 100)
            )

            engine = CoreEngine(config, mic, speaker, vad, stt, llm)
            engine.state = State.EXECUTING

            daemon_status = "READY"
            daemon_status_message = "Audio Engine reloaded successfully."
            return {"status": "ok", "message": "Audio engine hot-swapped successfully."}

        except Exception as e:
            daemon_status = "ERROR"
            daemon_status_message = f"Failed to reload: {str(e)}"
            return {"status": "error", "message": daemon_status_message}
291
+
292
@app.post("/converse")
async def converse(request: Request):
    """Speak the supplied text to the user and wait for the daemon's result.

    Holds a logical session lock so only one MCP session can own the mic;
    returns a "system_busy" payload instead of blocking when contended.
    """
    global active_session_id, last_active_timestamp

    # Fast-Fail Graceful State to prevent Claude Timeout during the 3GB initial download
    if daemon_status == "DOWNLOADING":
        return {
            "status": "system_busy",
            "message": f"SYSTEM NOTIFICATION: Speak MCP is currently initializing. {daemon_status_message} Please instruct the user to wait a moment and try again."
        }

    body = await request.json()
    session_id = body.get("session_id")
    text_to_speak = body.get("text_to_speak", "")
    expect_reply = body.get("expect_reply", True)

    with mutex_lock:
        # Another session already owns the microphone — refuse, don't queue.
        if active_session_id is not None and active_session_id != session_id:
            return {
                "status": "system_busy",
                "message": "Microphone is in use by another session. Fallback to text."
            }
        # Lock the logical session
        active_session_id = session_id
        last_active_timestamp = time.time()

    try:
        # Feed command to daemon (picked up by run_audio_daemon's loop)
        mcp_command_queue.put({"text": text_to_speak, "expect_reply": expect_reply})

        # Wait for human to interact or natural termination, checking for client disconnects
        while True:
            if await request.is_disconnected():
                print(f"[{session_id}] Client disconnected! Aborting audio loop.", file=sys.stderr)
                # Client hung up (e.g. reload or ctrl+c). We must reset the engine immediately.
                if speaker:
                    speaker.flush()
                if engine:
                    engine.state = State.EXECUTING  # This will trigger mic.stop_stream() in the loop
                raise HTTPException(status_code=499, detail="Client Disconnected")

            try:
                # Use a short timeout so we can loop and check for is_disconnected().
                # The blocking queue.get runs in a worker thread so the event
                # loop stays responsive.
                result = await asyncio.to_thread(mcp_result_queue.get, timeout=0.1)
                last_active_timestamp = time.time()
                return result
            except queue.Empty:
                await asyncio.sleep(0.01)

    finally:
        # Always release the logical lock when the request ends
        with mutex_lock:
            active_session_id = None
345
+
346
if __name__ == "__main__":
    import uvicorn
    # NOTE: the original re-imported os here; it is already imported at the
    # top of the module.

    # Isolate socket to user directory to prevent /tmp hijacking and
    # permission issues.
    # NOTE(review): this uses "SpeakMCP" while the model cache above uses
    # "VoiceMCP" — confirm the divergent directory names are intentional.
    app_support_dir = os.path.expanduser("~/Library/Application Support/SpeakMCP")
    os.makedirs(app_support_dir, exist_ok=True)
    socket_path = os.path.join(app_support_dir, "daemon.sock")

    # Cleanup orphaned socket to prevent "Address already in use" deadlock
    # after an unclean shutdown.
    if os.path.exists(socket_path):
        try:
            os.unlink(socket_path)
        except OSError:
            pass

    # Important: run with workers=1 to ensure singleton (one mic owner)
    uvicorn.run(app, uds=socket_path, workers=1)
package/src/index.ts ADDED
@@ -0,0 +1,63 @@
1
#!/usr/bin/env node

import { spawn } from "node:child_process";
import { join, dirname } from "node:path";
import { fileURLToPath } from "node:url";
import { existsSync } from "node:fs";

// Get the directory of the current module (ESM has no __dirname builtin)
const __filename = fileURLToPath(import.meta.url);
const __dirname = dirname(__filename);

// Root of the project (this file is one level below it)
const projectRoot = join(__dirname, "..");

// Path to the Python MCP server this process wraps
const pythonScriptPath = join(projectRoot, "src", "mcp_server.py");

/**
 * Locate the best Python executable to use.
 * Priority:
 * 1. Local venv inside the project
 * 2. System python3
 */
function getPythonExecutable(): string {
  const venvPath = join(projectRoot, "venv", "bin", "python3");
  if (existsSync(venvPath)) {
    return venvPath;
  }
  return "python3";
}

const pythonExecutable = getPythonExecutable();

/**
 * Start the Python MCP Server and bridge standard I/O.
 * stdin/stdout carry the JSON-RPC stream; stderr is inherited for logs.
 */
function startBridge() {
  const pythonProcess = spawn(pythonExecutable, [pythonScriptPath], {
    stdio: ["pipe", "pipe", "inherit"],
    env: {
      ...process.env,
      // Ensure Python output isn't buffered (JSON-RPC must flush per message)
      PYTHONUNBUFFERED: "1",
    },
  });

  // Pipe our stdin into Python's stdin
  process.stdin.pipe(pythonProcess.stdin!);

  // Pipe Python's stdout back to our stdout
  pythonProcess.stdout!.pipe(process.stdout);

  // Mirror the child's exit code so callers observe failures
  pythonProcess.on("exit", (code) => {
    process.exit(code ?? 0);
  });

  // Forward termination signals to the child
  process.on("SIGINT", () => pythonProcess.kill("SIGINT"));
  process.on("SIGTERM", () => pythonProcess.kill("SIGTERM"));
}

startBridge();
@@ -0,0 +1,254 @@
1
import os
import sys

# 1. Save the actual OS-level stdout (FD 1) to a cloned file descriptor,
# so it can be restored just before the MCP JSON-RPC loop starts (__main__).
original_stdout_fd = os.dup(1)

# 2. Force OS-level stdout (FD 1) to point to stderr (FD 2).
# Now, ALL C-level and Python-level logs safely pipe to stderr instead of
# corrupting the JSON-RPC protocol stream.
os.dup2(2, 1)

# 3. Redirect Python's sys.stdout just to be thorough
sys.stdout = sys.stderr

import uuid
import subprocess
import json
import socket
import http.client
import time
import logging
import asyncio
import random

from mcp.server.fastmcp import FastMCP, Context

logging.basicConfig(level=logging.INFO, stream=sys.stderr)

# Inject the busy-signal instructions into the server instructions
instructions = """
<voice_loop_instructions>
If the voice_converse tool returns "status": "system_busy", it means the physical microphone is currently locked by another AI agent in a different window.
DO NOT retry the tool. Output a standard text message explaining the audio channel is busy, and continue the conversation via text.
CRITICAL RULE: If you use the voice_converse tool and it returns "status": "silence_timeout", you MUST NOT abandon the voice loop by simply typing a text response. You MUST formally close the hardware loop by calling voice_converse ONE LAST TIME with "expect_reply": false and "text_to_speak": "I didn't hear anything, so I am turning off the microphone now."
</voice_loop_instructions>
"""

# Initialize FastMCP Server
mcp = FastMCP("voice-mcp-server-client", instructions=instructions)

# One UUID per MCP server process; the daemon uses it as the mic-lock key.
SESSION_ID = str(uuid.uuid4())

# We use Unix Domain Sockets to bypass macOS firewall popups
# Isolate socket to user directory to prevent /tmp hijacking
app_support_dir = os.path.expanduser("~/Library/Application Support/SpeakMCP")
os.makedirs(app_support_dir, exist_ok=True)
SOCKET_PATH = os.path.join(app_support_dir, "daemon.sock")
47
+
48
class UDSHTTPConnection(http.client.HTTPConnection):
    """HTTPConnection variant that transports over a Unix Domain Socket."""

    def __init__(self, socket_path, timeout=300.0):
        # "localhost" is a placeholder host; routing is purely by socket path.
        super().__init__("localhost", timeout=timeout)
        self.socket_path = socket_path

    def connect(self):
        sock = socket.socket(socket.AF_UNIX, socket.SOCK_STREAM)
        sock.settimeout(self.timeout)
        sock.connect(self.socket_path)
        self.sock = sock
58
+
59
def make_uds_request(method: str, path: str, payload: dict = None, timeout: float = 1.0) -> tuple[int, dict]:
    """Issue one HTTP request over the daemon's Unix socket.

    Returns (status_code, parsed_json_body) — an empty body parses to {}.
    """
    conn = UDSHTTPConnection(SOCKET_PATH, timeout=timeout)
    try:
        if payload:
            conn.request(method, path,
                         body=json.dumps(payload).encode('utf-8'),
                         headers={'Content-Type': 'application/json'})
        else:
            conn.request(method, path)
        resp = conn.getresponse()
        raw = resp.read().decode('utf-8')
        return resp.status, (json.loads(raw) if raw else {})
    finally:
        conn.close()
71
+
72
def check_daemon_health():
    """Return True when the daemon answers /health with HTTP 200."""
    try:
        status_code, _ = make_uds_request("GET", "/health", timeout=1.0)
    except (socket.error, ConnectionError, FileNotFoundError, ConnectionRefusedError):
        return False
    return status_code == 200
78
+
79
def ensure_daemon_running():
    """Checks if daemon is up, auto-boots it if not, and polls until ready."""
    if check_daemon_health():
        return

    logging.info("Daemon is down, attempting to boot detached process...")
    # Boot the daemon fully detached so it survives CLI restarts.
    root = os.path.abspath(os.path.join(os.path.dirname(__file__), ".."))
    interpreter = os.path.join(root, "venv", "bin", "python3")
    daemon_script = os.path.join(root, "src", "daemon", "audio_server.py")

    subprocess.Popen(
        [interpreter, daemon_script],
        stdout=subprocess.DEVNULL,
        stderr=subprocess.DEVNULL,
        start_new_session=True,  # detach entirely
    )

    # Poll until the health check passes — ML model loading takes a while.
    # 120 attempts * 0.5 s = 60 seconds total.
    for _ in range(120):
        if check_daemon_health():
            return
        time.sleep(0.5)

    raise RuntimeError("Failed to auto-boot Voice Audio Daemon. Health check timed out.")
105
+
106
@mcp.tool()
def configure_audio_engine(speaker_adapter: str = None, vad_adapter: str = None, stt_adapter: str = None) -> dict:
    """
    Dynamically hot-swap the Voice Audio Daemon's AI models and hardware without restarting.
    Args:
        speaker_adapter: Valid options: 'kokoro_speaker', 'elevenlabs_speaker', 'live_speaker'.
        vad_adapter: Valid options: 'silero_vad' (Conversational), 'ptt_vad' (Walkie-Talkie).
        stt_adapter: Valid options: 'mlx_whisper_large_v3', 'whisper_stt'.
    Returns:
        The daemon's /reload response dict, or a {"status": "error", ...} dict.
    """
    # Whitelist the adapter names before splicing them into the YAML config:
    # these values come from the tool caller and are written to disk verbatim,
    # so reject anything outside the documented options instead of corrupting
    # config.yaml with arbitrary text.
    allowed = {
        "speaker": {"kokoro_speaker", "elevenlabs_speaker", "live_speaker"},
        "vad": {"silero_vad", "ptt_vad"},
        "stt": {"mlx_whisper_large_v3", "whisper_stt"},
    }
    requested = {"speaker": speaker_adapter, "vad": vad_adapter, "stt": stt_adapter}
    for group, value in requested.items():
        if value and value not in allowed[group]:
            return {
                "status": "error",
                "message": f"Invalid {group} adapter '{value}'. Valid options: {sorted(allowed[group])}"
            }

    try:
        ensure_daemon_running()
        import re

        project_root = os.path.abspath(os.path.join(os.path.dirname(__file__), ".."))
        config_path = os.path.join(project_root, "config", "config.yaml")

        with open(config_path, "r") as f:
            content = f.read()

        # Rewrite the Hydra `defaults` entries in place, one group at a time.
        for group, value in requested.items():
            if value:
                content = re.sub(rf"- {group}: .*", f"- {group}: {value}", content)

        with open(config_path, "w") as f:
            f.write(content)

        # Trigger Daemon hot-reload
        status, response_data = make_uds_request("POST", "/reload", timeout=15.0)
        return response_data

    except (socket.error, ConnectionError, FileNotFoundError, ConnectionRefusedError):
        return {
            "status": "error",
            "message": "CRITICAL: The Voice Audio Daemon failed to respond to the reload request."
        }
    except Exception as e:
        return {
            "status": "error",
            "message": f"CRITICAL Error dynamically reloading audio daemon: {str(e)}"
        }
149
+
150
async def render_visualizer(ctx: Context):
    """Render a fake audio-level meter via MCP progress notifications
    until cancelled."""
    if not ctx:
        return
    levels = [" ", "▂", "▃", "▄", "▅", "▆", "▇", "█"]
    try:
        while True:
            wave = "".join(random.choices(levels, k=12))
            await ctx.report_progress(100, 100, message=f"🎙️ {wave} 🎙️")
            await asyncio.sleep(0.1)
    except asyncio.CancelledError:
        # Normal teardown path: the caller cancels us when converse returns.
        pass
161
+
162
@mcp.tool()
async def voice_converse(text_to_speak: str, expect_reply: bool = True, ctx: Context = None) -> dict:
    """
    Speak a prompt to the user and listen for a response.
    If expect_reply is False, the tool returns immediately after queuing the speech.
    """
    try:
        ensure_daemon_running()

        async def _do_converse():
            # Blocking UDS HTTP call moved to a worker thread; the 300 s
            # budget covers long listening sessions.
            return await asyncio.to_thread(
                make_uds_request,
                "POST",
                "/converse",
                {"session_id": SESSION_ID, "text_to_speak": text_to_speak, "expect_reply": expect_reply},
                300.0
            )

        # Start the visualizer!
        vis_task = asyncio.create_task(render_visualizer(ctx)) if ctx else None
        try:
            status, response_data = await _do_converse()
        finally:
            if vis_task:
                vis_task.cancel()

        # Handle the initialization (download) state automatically with native progress
        if response_data and response_data.get("status") == "system_busy" and "initializing" in response_data.get("message", "").lower():
            if ctx:
                await ctx.info("Speak MCP: Initializing Local AI Models. This may take a few minutes...")

            # Poll /health until the daemon reports READY (or ERROR),
            # relaying its progress to the client UI once per second.
            while True:
                try:
                    # Async request for health to not block the event loop
                    h_status, h_data = await asyncio.to_thread(make_uds_request, "GET", "/health", None, 5.0)
                    if h_status == 200:
                        d_status = h_data.get("daemon_status")
                        d_msg = h_data.get("message", "")
                        d_progress = h_data.get("progress", 0)

                        # Report progress back to the MCP client for native rendering
                        if ctx:
                            await ctx.report_progress(d_progress, 100, message=d_msg)

                        if d_status == "READY":
                            if ctx:
                                await ctx.info("Speak MCP: Setup Complete!")

                            # After setup, the models are ready! Now perform the ACTUAL converse call with visualizer.
                            vis_task2 = asyncio.create_task(render_visualizer(ctx)) if ctx else None
                            try:
                                status, final_response = await _do_converse()
                                return final_response
                            finally:
                                if vis_task2:
                                    vis_task2.cancel()

                        elif d_status == "ERROR":
                            return {"status": "error", "message": d_msg}
                except Exception:
                    # Daemon may be momentarily unreachable mid-boot; keep polling.
                    pass
                await asyncio.sleep(1.0)

        return response_data

    except (socket.error, ConnectionError, FileNotFoundError, ConnectionRefusedError):

        return {
            "status": "error",
            "user_transcript": "",
            "message": "CRITICAL: The Voice Audio Daemon failed to respond."
        }
    except TimeoutError:
        return {
            "status": "error",
            "user_transcript": "",
            "message": "CRITICAL: The Voice Audio Daemon timed out waiting for speech."
        }
    except Exception as e:
        return {
            "status": "error",
            "user_transcript": "",
            "message": f"CRITICAL Error starting audio daemon: {str(e)}"
        }
246
+
247
if __name__ == "__main__":
    # 4. Restore the OS-level stdout just before handing control to the MCP SDK
    # (FD 1 was pointed at stderr at import time — see the top of this file).
    os.dup2(original_stdout_fd, 1)
    os.close(original_stdout_fd)
    sys.stdout = sys.__stdout__

    # 5. Now the JSON-RPC protocol has an absolutely pristine stdout pipe
    mcp.run()
File without changes