voice-mcp-server 0.1.24 → 0.2.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/README.md +2 -2
- package/config/config.yaml +1 -1
- package/config/vad/ptt_vad.yaml +1 -1
- package/package.json +1 -1
- package/requirements.txt +1 -0
- package/src/__pycache__/logger.cpython-312.pyc +0 -0
- package/src/__pycache__/mcp_server.cpython-312.pyc +0 -0
- package/src/adapters_real/__pycache__/kokoro_speaker.cpython-312.pyc +0 -0
- package/src/adapters_real/__pycache__/live_mic.cpython-312.pyc +0 -0
- package/src/adapters_real/__pycache__/ptt_vad.cpython-312.pyc +0 -0
- package/src/adapters_real/__pycache__/whisper_stt.cpython-312.pyc +0 -0
- package/src/adapters_real/kokoro_speaker.py +7 -6
- package/src/adapters_real/live_mic.py +15 -4
- package/src/adapters_real/ptt_sidecar +0 -0
- package/src/adapters_real/ptt_sidecar.swift +156 -0
- package/src/adapters_real/ptt_vad.py +143 -25
- package/src/adapters_real/whisper_stt.py +5 -4
- package/src/daemon/__pycache__/audio_server.cpython-312.pyc +0 -0
- package/src/daemon/audio_server.py +52 -15
- package/src/logger.py +29 -0
- package/src/mcp_server.py +143 -15
- package/src/simulation/__pycache__/adapters.cpython-312.pyc +0 -0
- package/src/simulation/__pycache__/engine.cpython-312.pyc +0 -0
- package/src/simulation/engine.py +67 -19
- package/src/simulation/tests/__pycache__/__init__.cpython-312.pyc +0 -0
- package/src/simulation/tests/__pycache__/test_ptt_vad.cpython-312-pytest-7.4.2.pyc +0 -0
- package/src/simulation/tests/__pycache__/test_scenarios.cpython-312-pytest-7.4.2.pyc +0 -0
- package/src/simulation/tests/test_abort_daemon.py +109 -0
- package/src/simulation/tests/test_mcp_cancellation.py +83 -0
- package/src/simulation/tests/test_ptt_vad.py +81 -0
|
@@ -4,7 +4,6 @@ import os
|
|
|
4
4
|
import time
|
|
5
5
|
import threading
|
|
6
6
|
import queue
|
|
7
|
-
import logging
|
|
8
7
|
from contextlib import asynccontextmanager
|
|
9
8
|
from fastapi import FastAPI, Request, HTTPException
|
|
10
9
|
from fastapi.responses import StreamingResponse
|
|
@@ -20,6 +19,7 @@ os.environ["TORCH_HOME"] = os.path.join(app_support_dir, "torch")
|
|
|
20
19
|
# Add src to python path for imports
|
|
21
20
|
sys.path.append(os.path.abspath(os.path.join(os.path.dirname(__file__), '..')))
|
|
22
21
|
|
|
22
|
+
from logger import logger
|
|
23
23
|
from simulation.models import Config
|
|
24
24
|
from simulation.engine import CoreEngine, State
|
|
25
25
|
from adapters_real.queue_llm import QueueLLMBridge
|
|
@@ -75,7 +75,7 @@ def pre_download_models():
|
|
|
75
75
|
daemon_status_message = "Finalizing AI setup..."
|
|
76
76
|
daemon_progress = 90
|
|
77
77
|
except Exception as e:
|
|
78
|
-
|
|
78
|
+
logger.error(f"Model download error: {e}")
|
|
79
79
|
daemon_status_message = f"Error downloading models: {e}"
|
|
80
80
|
|
|
81
81
|
def run_audio_daemon():
|
|
@@ -92,7 +92,7 @@ def run_audio_daemon():
|
|
|
92
92
|
|
|
93
93
|
with initialize(version_base=None, config_path="../../config"):
|
|
94
94
|
cfg = compose(config_name="config")
|
|
95
|
-
|
|
95
|
+
logger.info("Loaded Hydra configuration successfully.")
|
|
96
96
|
|
|
97
97
|
mic = instantiate(cfg.microphone)
|
|
98
98
|
speaker = instantiate(cfg.speaker)
|
|
@@ -114,7 +114,7 @@ def run_audio_daemon():
|
|
|
114
114
|
daemon_status = "READY"
|
|
115
115
|
daemon_status_message = "Audio Engine is online."
|
|
116
116
|
daemon_progress = 100
|
|
117
|
-
|
|
117
|
+
logger.info("Audio Daemon Started. Waiting for commands.")
|
|
118
118
|
|
|
119
119
|
try:
|
|
120
120
|
while True:
|
|
@@ -125,7 +125,9 @@ def run_audio_daemon():
|
|
|
125
125
|
|
|
126
126
|
# We got a command, wake up the hardware!
|
|
127
127
|
mic.start_stream()
|
|
128
|
-
|
|
128
|
+
if hasattr(vad, "set_active"):
|
|
129
|
+
vad.set_active(True)
|
|
130
|
+
engine.start_conversation(cmd.get("text", ""), standby_mode=cmd.get("standby_mode", False))
|
|
129
131
|
engine.expect_reply = cmd.get("expect_reply", True)
|
|
130
132
|
|
|
131
133
|
except queue.Empty:
|
|
@@ -135,10 +137,12 @@ def run_audio_daemon():
|
|
|
135
137
|
# Once we drop back to EXECUTING, we finished the conversation loop
|
|
136
138
|
if engine.state == State.EXECUTING:
|
|
137
139
|
mic.stop_stream()
|
|
140
|
+
if hasattr(vad, "set_active"):
|
|
141
|
+
vad.set_active(False)
|
|
138
142
|
last_active_timestamp = time.time()
|
|
139
143
|
|
|
140
144
|
except Exception as e:
|
|
141
|
-
|
|
145
|
+
logger.error(f"Daemon exception: {e}")
|
|
142
146
|
finally:
|
|
143
147
|
if mic:
|
|
144
148
|
mic.close()
|
|
@@ -150,17 +154,19 @@ async def watchdog():
|
|
|
150
154
|
await asyncio.sleep(60)
|
|
151
155
|
idle_time = time.time() - last_active_timestamp
|
|
152
156
|
if idle_time > IDLE_TIMEOUT_SECONDS:
|
|
153
|
-
|
|
157
|
+
logger.info(f"Idle timeout reached ({idle_time:.0f}s). Self-destructing to free RAM.")
|
|
154
158
|
if mic:
|
|
155
159
|
mic.close()
|
|
156
160
|
os._exit(0)
|
|
157
161
|
|
|
158
162
|
def parent_pid_polling():
    """Watch the parent PID and hard-exit the daemon once the parent is gone.

    The parent PID is sampled once on entry and then re-read every three
    seconds. The process is terminated with ``os._exit(0)`` as soon as the
    current parent PID is 1 or differs from the PID sampled on entry.
    This function never returns normally.
    """
    launch_ppid = os.getppid()
    while True:
        time.sleep(3.0)
        ppid_now = os.getppid()
        # Parent counts as alive only if we still have the same, non-init parent.
        parent_alive = ppid_now != 1 and ppid_now == launch_ppid
        if parent_alive:
            continue
        logger.warning("Parent process died. Stopping daemon to prevent Zombie microphone lock.")
        os._exit(0)
|
|
165
171
|
|
|
166
172
|
@asynccontextmanager
|
|
@@ -172,10 +178,6 @@ async def lifespan(app: FastAPI):
|
|
|
172
178
|
# Start the watchdog
|
|
173
179
|
asyncio.create_task(watchdog())
|
|
174
180
|
|
|
175
|
-
# Start the Parent PID Poller
|
|
176
|
-
polling_thread = threading.Thread(target=parent_pid_polling, daemon=True)
|
|
177
|
-
polling_thread.start()
|
|
178
|
-
|
|
179
181
|
yield
|
|
180
182
|
# Shutdown logic
|
|
181
183
|
if mic:
|
|
@@ -289,6 +291,36 @@ async def reload_config():
|
|
|
289
291
|
daemon_status_message = f"Failed to reload: {str(e)}"
|
|
290
292
|
return {"status": "error", "message": daemon_status_message}
|
|
291
293
|
|
|
294
|
+
@app.post("/abort")
async def abort_conversation():
    """Handle POST /abort: stop audio, reset the engine and release the session.

    Flushes the speaker, forces the engine into ``State.EXECUTING`` with an
    empty buffer, deactivates the VAD when it exposes ``set_active``, stops the
    microphone stream, discards every pending command, publishes an
    "interrupted" result on the result queue and clears the active session id.

    Returns:
        ``{"status": "ok"}``.
    """
    global engine, mic, speaker, vad, active_session_id
    logger.info("Received /abort command from client. Stopping audio.")
    # NOTE(review): block nesting was reconstructed from a flattened diff;
    # confirm that queue draining and the session reset belong inside the lock.
    with mutex_lock:
        if speaker:
            speaker.flush()
        if engine:
            engine.state = State.EXECUTING
            engine.buffer = []
            if hasattr(engine.vad, "set_active"):
                engine.vad.set_active(False)
        if mic:
            mic.stop_stream()

        # Throw away any commands that were queued but not yet picked up.
        while True:
            try:
                mcp_command_queue.get_nowait()
            except queue.Empty:
                break

        aborted_result = {
            "status": "ok",
            "user_transcript": "",
            "was_interrupted": True,
            "message": "User manually aborted the voice loop using the panic button. You MUST NOT try to speak to the user right now. Wait for them to initiate the next interaction."
        }
        mcp_result_queue.put(aborted_result)

        active_session_id = None

    return {"status": "ok"}
|
|
323
|
+
|
|
292
324
|
@app.post("/converse")
|
|
293
325
|
async def converse(request: Request):
|
|
294
326
|
global active_session_id, last_active_timestamp
|
|
@@ -304,6 +336,7 @@ async def converse(request: Request):
|
|
|
304
336
|
session_id = body.get("session_id")
|
|
305
337
|
text_to_speak = body.get("text_to_speak", "")
|
|
306
338
|
expect_reply = body.get("expect_reply", True)
|
|
339
|
+
standby_mode = body.get("standby_mode", False)
|
|
307
340
|
|
|
308
341
|
with mutex_lock:
|
|
309
342
|
if active_session_id is not None and active_session_id != session_id:
|
|
@@ -317,17 +350,19 @@ async def converse(request: Request):
|
|
|
317
350
|
|
|
318
351
|
try:
|
|
319
352
|
# Feed command to daemon
|
|
320
|
-
mcp_command_queue.put({"text": text_to_speak, "expect_reply": expect_reply})
|
|
353
|
+
mcp_command_queue.put({"text": text_to_speak, "expect_reply": expect_reply, "standby_mode": standby_mode})
|
|
321
354
|
|
|
322
355
|
# Wait for human to interact or natural termination, checking for client disconnects
|
|
323
356
|
while True:
|
|
324
357
|
if await request.is_disconnected():
|
|
325
|
-
|
|
358
|
+
logger.warning(f"[{session_id}] Client disconnected! Aborting audio loop.")
|
|
326
359
|
# Client hung up (e.g. reload or ctrl+c). We must reset the engine immediately.
|
|
327
360
|
if speaker:
|
|
328
361
|
speaker.flush()
|
|
329
362
|
if engine:
|
|
330
363
|
engine.state = State.EXECUTING # This will trigger mic.stop_stream() in the loop
|
|
364
|
+
if hasattr(vad, "set_active"):
|
|
365
|
+
vad.set_active(False)
|
|
331
366
|
raise HTTPException(status_code=499, detail="Client Disconnected")
|
|
332
367
|
|
|
333
368
|
try:
|
|
@@ -336,6 +371,8 @@ async def converse(request: Request):
|
|
|
336
371
|
last_active_timestamp = time.time()
|
|
337
372
|
return result
|
|
338
373
|
except queue.Empty:
|
|
374
|
+
if standby_mode:
|
|
375
|
+
last_active_timestamp = time.time()
|
|
339
376
|
await asyncio.sleep(0.01)
|
|
340
377
|
|
|
341
378
|
finally:
|
package/src/logger.py
ADDED
|
@@ -0,0 +1,29 @@
|
|
|
1
|
+
import logging
|
|
2
|
+
import sys
|
|
3
|
+
import os
|
|
4
|
+
|
|
5
|
+
def setup_logger(name="VoiceMCP", level=logging.INFO):
    """Return the logger called *name*, configuring it on first use.

    On the first call for a given name the logger gets *level*, a stderr
    handler, and a file handler writing to
    ``~/Library/Application Support/VoiceMCP/logs/telemetry.log``.
    A logger that already has handlers is returned unchanged (its level is
    not reapplied), so repeated calls never duplicate handlers.

    File logging is best-effort: if the log directory or file cannot be
    created, a warning is emitted through the stderr handler and the logger
    is returned with stderr output only, instead of raising at import time.

    Args:
        name: Name passed to ``logging.getLogger``.
        level: Level applied when the logger is first configured.

    Returns:
        The configured ``logging.Logger``.
    """
    logger = logging.getLogger(name)
    if logger.handlers:
        return logger

    logger.setLevel(level)
    formatter = logging.Formatter(
        fmt='%(asctime)s.%(msecs)03d | %(levelname)-7s | %(module)-15s | %(message)s',
        datefmt='%Y-%m-%d %H:%M:%S'
    )

    # Output to stderr to avoid breaking stdio (MCP communication).
    handler = logging.StreamHandler(sys.stderr)
    handler.setFormatter(formatter)
    logger.addHandler(handler)

    # Persistent telemetry file; failure here must not take the process down.
    log_dir = os.path.expanduser("~/Library/Application Support/VoiceMCP/logs")
    try:
        os.makedirs(log_dir, exist_ok=True)
        file_handler = logging.FileHandler(
            os.path.join(log_dir, "telemetry.log"), encoding="utf-8"
        )
    except OSError as exc:
        logger.warning("File logging disabled, cannot write to %s: %s", log_dir, exc)
    else:
        file_handler.setFormatter(formatter)
        logger.addHandler(file_handler)

    return logger
|
|
28
|
+
|
|
29
|
+
logger = setup_logger()
|
package/src/mcp_server.py
CHANGED
|
@@ -17,20 +17,64 @@ import json
|
|
|
17
17
|
import socket
|
|
18
18
|
import http.client
|
|
19
19
|
import time
|
|
20
|
-
import logging
|
|
21
20
|
import asyncio
|
|
22
21
|
import random
|
|
23
22
|
|
|
24
23
|
from mcp.server.fastmcp import FastMCP, Context
|
|
24
|
+
from logger import logger
|
|
25
25
|
|
|
26
|
-
|
|
27
|
-
|
|
28
|
-
# Inject the busy-signal instructions into the server instructions
|
|
26
|
+
# Inject the advanced conversational instructions into the server
|
|
29
27
|
instructions = """
|
|
30
28
|
<voice_loop_instructions>
|
|
31
|
-
|
|
32
|
-
|
|
33
|
-
|
|
29
|
+
# VOICE-NATIVE PAIR PROGRAMMING PROTOCOL
|
|
30
|
+
You are a senior pair-programming partner collaborating with the user via a bidirectional, real-time voice interface. You are NOT a traditional text-based chatbot; you are an autonomous peer sitting next to the user.
|
|
31
|
+
|
|
32
|
+
## Core Hardware Constraints & Your Senses
|
|
33
|
+
1. **Push-To-Talk (PTT):** The user communicates with you by pressing and holding the `Right Option (⌥)` key.
|
|
34
|
+
2. **Deaf by Default:** You execute tools strictly sequentially. When you run non-voice tools (reading files, searching, editing), your microphone is physically OFF. The user cannot interrupt you during these times.
|
|
35
|
+
3. **Hardware Watchdog:** To save the user's Unified Memory, your backend audio daemon will self-destruct and sleep if you are completely silent for 15 minutes.
|
|
36
|
+
4. **The Panic Button (Double-Tap):** Due to a known bug in the Gemini CLI, clicking "Stop" in the UI will NOT tell the audio daemon to stop talking or listening. To forcefully stop your voice or close the microphone, the user must DOUBLE-TAP the `Right Option` key.
|
|
37
|
+
|
|
38
|
+
To prevent the user from feeling abandoned, confused, or locked out, you must orchestrate the conversation using the following rules:
|
|
39
|
+
|
|
40
|
+
## 1. First Contact (Onboarding)
|
|
41
|
+
Since voice interfaces lack visual menus, the user might not know the physical controls. On your VERY FIRST conversational turn in a new session, you MUST seamlessly weave a brief explanation of the controls into your greeting.
|
|
42
|
+
*Example:* "Hey, I'm ready to dive in. Just a quick heads up—whenever you want to talk, just press and hold the Right Option key. To force me to stop talking or listening, just double-tap it quickly. If you ever need time to think, just ask me to pause. What are we working on today?"
|
|
43
|
+
CRITICAL: Do not repeat this instruction after the first interaction.
|
|
44
|
+
|
|
45
|
+
## 2. Floor Management (`expect_reply` Heuristics)
|
|
46
|
+
Think of the microphone as a shared conversational token.
|
|
47
|
+
|
|
48
|
+
**Keep the Token (`expect_reply: false`):**
|
|
49
|
+
Use this for micro-updates, acknowledgments, and transitions. You speak, the mic stays OFF, and you immediately execute your next tool.
|
|
50
|
+
- *Acknowledgment:* "Got it, looking into the routing file."
|
|
51
|
+
- *The "Head Down" Warning (CRITICAL):* If you are about to do a heavy search or multi-file edit, warn the user they cannot interrupt you. "I'm going to run a deep codebase search. I'll be deaf for a minute, so the Right Option key won't work until I'm done."
|
|
52
|
+
|
|
53
|
+
**Yield the Token (`expect_reply: true`):**
|
|
54
|
+
Use this ONLY when you genuinely need the user to speak. This MUST be the final tool call in your current execution sequence.
|
|
55
|
+
- *Clarification:* "I hit a compilation error on the auth module. Do you want me to rewrite the types or mock it out?"
|
|
56
|
+
|
|
57
|
+
## 3. Handling Hardware Interruptions (`was_interrupted: true`)
|
|
58
|
+
If `voice_converse` returns `was_interrupted: true`, it means the user held the Right Option key and cut you off mid-sentence. Instantly drop your previous train of thought. Do not try to finish your sentence. Acknowledge the interruption naturally and pivot immediately to their new input. (e.g., "Ah, good catch, switching to the backend folder now.")
|
|
59
|
+
|
|
60
|
+
## 4. Handling User Think Time & The 15-Minute Watchdog
|
|
61
|
+
If the user says "give me a minute", "let me think", or similar:
|
|
62
|
+
1. Acknowledge them quickly using `voice_converse(..., expect_reply=False)`.
|
|
63
|
+
2. Gently warn them about the 15-minute hardware watchdog.
|
|
64
|
+
3. Remind them to hold the `Right Option` key when they are ready to return.
|
|
65
|
+
4. IMMEDIATELY call the `wait_for_user()` tool.
|
|
66
|
+
*Example:* "Take your time. Just hold the Right Option key to wake me up when you're ready. As a heads up, my audio engine spins down after 15 minutes to save your Mac's memory, but I'll be right here."
|
|
67
|
+
|
|
68
|
+
## 5. Handling Silences / Timeouts
|
|
69
|
+
If you ask a question (`expect_reply: true`) but the user doesn't press the Right Option key, the tool will return `{"status": "silence_timeout"}`.
|
|
70
|
+
CRITICAL: Do not treat this as an error. Act like a human colleague voluntarily giving them space. Gracefully close the microphone by calling `voice_converse` one last time with `expect_reply: false`.
|
|
71
|
+
- *Example:* "Looks like you're focused. I'll pause my mic and stand by. Just hold the Right Option key when you want to pick it up."
|
|
72
|
+
|
|
73
|
+
## 6. General Rules of Engagement
|
|
74
|
+
- **Be Conversational & Terse:** Never use AI-isms ("As an AI..."). Speak like a human engineer.
|
|
75
|
+
- **Never Dump Code:** Never read raw code blocks out loud. Summarize conceptually.
|
|
76
|
+
- **Interleave Work:** Do not chain multiple silent tools together without muttering an update (`expect_reply: false`).
|
|
77
|
+
- **Handling System Busy:** If you get `"status": "system_busy"`, output a standard text message explaining the audio channel is locked, and continue via text.
|
|
34
78
|
</voice_loop_instructions>
|
|
35
79
|
"""
|
|
36
80
|
|
|
@@ -81,10 +125,10 @@ def ensure_daemon_running():
|
|
|
81
125
|
if check_daemon_health():
|
|
82
126
|
return
|
|
83
127
|
|
|
84
|
-
|
|
128
|
+
logger.info("Daemon is down, attempting to boot detached process...")
|
|
85
129
|
# Boot the daemon detached
|
|
86
130
|
project_root = os.path.abspath(os.path.join(os.path.dirname(__file__), ".."))
|
|
87
|
-
python_exec =
|
|
131
|
+
python_exec = sys.executable
|
|
88
132
|
daemon_script = os.path.join(project_root, "src", "daemon", "audio_server.py")
|
|
89
133
|
|
|
90
134
|
subprocess.Popen(
|
|
@@ -159,20 +203,35 @@ async def render_visualizer(ctx: Context):
|
|
|
159
203
|
except asyncio.CancelledError:
|
|
160
204
|
pass
|
|
161
205
|
|
|
206
|
+
import threading
|
|
207
|
+
|
|
208
|
+
def fire_abort():
    """Send a blocking ``POST /abort`` to the daemon and log the outcome.

    Uses a 5-second timeout. Any exception raised while sending is logged
    and suppressed, so this is safe to use as a thread target.
    """
    logger.info("Firing synchronous abort request to daemon...")
    abort_timeout = 5.0
    try:
        make_uds_request("POST", "/abort", None, abort_timeout)
        logger.info("Abort request sent successfully.")
    except Exception as exc:
        logger.error(f"Failed to send abort request: {exc}")
|
|
215
|
+
|
|
216
|
+
async def make_cancellable_converse_request(payload: dict, timeout: float) -> tuple[int, dict]:
    """POST *payload* to ``/converse`` off the event loop, aborting on cancel.

    The blocking ``make_uds_request`` call runs via ``asyncio.to_thread``.
    If this coroutine is cancelled while waiting, a daemon thread is started
    to run ``fire_abort`` and the ``CancelledError`` is re-raised.

    Args:
        payload: Body forwarded to ``make_uds_request``.
        timeout: Timeout forwarded to ``make_uds_request``.

    Returns:
        Whatever ``make_uds_request`` returns (annotated as a
        ``(status, data)`` pair).
    """
    pending = asyncio.to_thread(make_uds_request, "POST", "/converse", payload, timeout)
    try:
        return await pending
    except asyncio.CancelledError:
        # The MCP client dropped this tool call: tell the daemon to stop audio
        # without blocking the cancellation path.
        logger.warning("Tool call was cancelled by MCP client! Triggering abort.")
        abort_thread = threading.Thread(target=fire_abort, daemon=True)
        abort_thread.start()
        raise
|
|
224
|
+
|
|
162
225
|
@mcp.tool()
|
|
163
226
|
async def voice_converse(text_to_speak: str, expect_reply: bool = True, ctx: Context = None) -> dict:
|
|
164
227
|
"""
|
|
165
|
-
Speak a prompt to the user and listen for a response.
|
|
166
|
-
If expect_reply is False, the tool returns immediately after queuing the speech.
|
|
228
|
+
Speak a prompt to the user and listen for a response. If expect_reply is False, the tool queues the speech and returns immediately. If expect_reply is True, it yields the floor to the user. If the returned JSON contains `was_interrupted: true`, the user used the Right Option key to cut you off mid-speech; you MUST completely abandon your previous thought and address their new input.
|
|
167
229
|
"""
|
|
168
230
|
try:
|
|
169
231
|
ensure_daemon_running()
|
|
170
232
|
|
|
171
233
|
async def _do_converse():
|
|
172
|
-
return await
|
|
173
|
-
make_uds_request,
|
|
174
|
-
"POST",
|
|
175
|
-
"/converse",
|
|
234
|
+
return await make_cancellable_converse_request(
|
|
176
235
|
{"session_id": SESSION_ID, "text_to_speak": text_to_speak, "expect_reply": expect_reply},
|
|
177
236
|
300.0
|
|
178
237
|
)
|
|
@@ -204,6 +263,7 @@ async def voice_converse(text_to_speak: str, expect_reply: bool = True, ctx: Con
|
|
|
204
263
|
await ctx.report_progress(d_progress, 100, message=d_msg)
|
|
205
264
|
|
|
206
265
|
if d_status == "READY":
|
|
266
|
+
logger.info("Model initialized to RAM")
|
|
207
267
|
if ctx:
|
|
208
268
|
await ctx.info("Voice MCP: Setup Complete!")
|
|
209
269
|
|
|
@@ -244,6 +304,74 @@ async def voice_converse(text_to_speak: str, expect_reply: bool = True, ctx: Con
|
|
|
244
304
|
"message": f"CRITICAL Error starting audio daemon: {str(e)}"
|
|
245
305
|
}
|
|
246
306
|
|
|
307
|
+
@mcp.tool()
async def wait_for_user(ctx: Context = None) -> dict:
    """
    Call this tool IMMEDIATELY after using voice_converse(expect_reply=False) to acknowledge a user's explicit request for time to think. It suspends the AI indefinitely until the user presses the Right Option key to wake you back up. Note: The underlying audio daemon will self-destruct after 15 minutes of idle time to free Unified Memory, so you must warn the user of this limit before calling.
    """
    # The docstring above is left untouched in case it is surfaced to the
    # model as the tool description; implementation notes live in comments.
    standby_payload = {"session_id": SESSION_ID, "text_to_speak": "", "expect_reply": True, "standby_mode": True}

    try:
        ensure_daemon_running()
        if ctx:
            await ctx.info("🎙️ Waiting for user to speak... 🎙️")

        _status, response_data = await make_cancellable_converse_request(
            standby_payload,
            3600.0
        )
        return response_data

    except Exception as e:
        # The daemon likely died from the 15-minute watchdog to save RAM.
        # Record the real cause instead of discarding it silently.
        logger.warning(f"Standby request failed ({e!r}). Falling back to key-press wake-up.")

        # Implement the "Ghost Wake-Up": silently listen for Right Option, then boot the daemon.
        if ctx:
            await ctx.info("💤 Audio Engine sleeping to save RAM. Press Right Option to wake... 💤")

        import pynput
        loop = asyncio.get_running_loop()
        wake_event = asyncio.Event()

        def on_press(key):
            # Runs on the listener's thread, so hop back onto the event loop.
            if key in (pynput.keyboard.Key.alt_r, pynput.keyboard.Key.ctrl_r):
                loop.call_soon_threadsafe(wake_event.set)

        listener = pynput.keyboard.Listener(on_press=on_press)
        listener.start()
        try:
            await wake_event.wait()
        finally:
            # Also runs when the tool call is cancelled mid-wait, so the
            # global keyboard hook is never left behind.
            listener.stop()

        if ctx:
            await ctx.info("🚀 Waking up Audio Engine... This might take a few seconds... 🚀")

        try:
            ensure_daemon_running()
            _status, response_data = await make_cancellable_converse_request(
                standby_payload,
                3600.0
            )
            return response_data
        except Exception as retry_e:
            return {
                "status": "error",
                "user_transcript": "",
                "message": f"CRITICAL Error waking up audio daemon: {str(retry_e)}"
            }
|
|
359
|
+
|
|
360
|
+
import signal
|
|
361
|
+
|
|
362
|
+
def cleanup_on_exit(signum, frame):
    """Signal handler: ask the daemon to abort audio, then exit with status 0.

    Args:
        signum: Number of the received signal (included in the log message).
        frame: Current stack frame supplied by ``signal``; unused.

    A failure to reach the daemon is logged and does not prevent the exit.
    """
    logger.warning(f"Received termination signal {signum}. Firing abort request to daemon...")
    shutdown_timeout = 1.0  # short, so a dead daemon cannot hang the shutdown
    try:
        make_uds_request("POST", "/abort", None, shutdown_timeout)
        logger.info("Abort request sent successfully during shutdown.")
    except Exception as exc:
        logger.error(f"Failed to send abort request during shutdown: {exc}")
    sys.exit(0)
|
|
371
|
+
|
|
372
|
+
signal.signal(signal.SIGINT, cleanup_on_exit)
|
|
373
|
+
signal.signal(signal.SIGTERM, cleanup_on_exit)
|
|
374
|
+
|
|
247
375
|
if __name__ == "__main__":
|
|
248
376
|
# 4. Restore the OS-level stdout just before handing control to the MCP SDK
|
|
249
377
|
os.dup2(original_stdout_fd, 1)
|
|
Binary file
|
|
Binary file
|
package/src/simulation/engine.py
CHANGED
|
@@ -2,6 +2,7 @@ from enum import Enum
|
|
|
2
2
|
from typing import List
|
|
3
3
|
from .models import Config, VirtualAudioFrame
|
|
4
4
|
from .ports import IMicrophone, ISpeaker, IVAD, ISTT, ILLMBridge
|
|
5
|
+
from logger import logger
|
|
5
6
|
|
|
6
7
|
class State(Enum):
|
|
7
8
|
IDLE = 1
|
|
@@ -9,6 +10,7 @@ class State(Enum):
|
|
|
9
10
|
LISTENING = 3
|
|
10
11
|
PROCESSING = 4
|
|
11
12
|
EXECUTING = 5
|
|
13
|
+
STANDBY = 6
|
|
12
14
|
|
|
13
15
|
class CoreEngine:
|
|
14
16
|
def __init__(self, config: Config, mic: IMicrophone, speaker: ISpeaker, vad: IVAD, stt: ISTT, llm: ILLMBridge):
|
|
@@ -30,17 +32,28 @@ class CoreEngine:
|
|
|
30
32
|
self.latest_transcription = ""
|
|
31
33
|
self.last_tool_call_result = None
|
|
32
34
|
self.expect_reply = True
|
|
35
|
+
self.standby_mode = False
|
|
33
36
|
|
|
34
37
|
self.total_recording_ms = 0
|
|
35
38
|
self.total_listening_ms = 0
|
|
36
39
|
self.has_started_speaking = False
|
|
37
40
|
self.processing_wait_ms = 0
|
|
38
41
|
|
|
39
|
-
def start_conversation(self, initial_text: str):
|
|
42
|
+
def start_conversation(self, initial_text: str, standby_mode: bool = False):
|
|
40
43
|
self.expect_reply = True
|
|
44
|
+
self.standby_mode = standby_mode
|
|
41
45
|
if initial_text:
|
|
42
46
|
self.state = State.AI_SPEAKING
|
|
43
47
|
self.speaker.speak(initial_text)
|
|
48
|
+
elif self.standby_mode:
|
|
49
|
+
# We are entering standby mode to wait for the user indefinitely.
|
|
50
|
+
# If the VAD is PTT, we can safely close the mic stream to turn off the orange dot.
|
|
51
|
+
if hasattr(self.vad, "is_pressed"):
|
|
52
|
+
if hasattr(self.mic, "stop_stream"):
|
|
53
|
+
logger.debug("Microphone stream stopped")
|
|
54
|
+
self.mic.stop_stream()
|
|
55
|
+
self.state = State.STANDBY
|
|
56
|
+
self._reset_listening_state()
|
|
44
57
|
else:
|
|
45
58
|
self.state = State.LISTENING
|
|
46
59
|
self._reset_listening_state()
|
|
@@ -111,23 +124,32 @@ class CoreEngine:
|
|
|
111
124
|
else:
|
|
112
125
|
spoken_text = self.speaker.flush()
|
|
113
126
|
self.was_interrupted = True
|
|
127
|
+
logger.info("Barge-in detected! User interrupted the AI.")
|
|
114
128
|
self.state = State.LISTENING
|
|
115
129
|
self.current_silence_duration_ms = 0
|
|
116
130
|
self.total_recording_ms = self.current_speech_duration_ms
|
|
117
131
|
self.has_started_speaking = True
|
|
118
132
|
self.total_listening_ms = 0
|
|
119
133
|
elif not self.speaker.is_speaking():
|
|
120
|
-
|
|
121
|
-
|
|
122
|
-
self.
|
|
123
|
-
|
|
124
|
-
self.total_recording_ms = self.current_speech_duration_ms
|
|
125
|
-
self.has_started_speaking = True
|
|
126
|
-
self.total_listening_ms = 0
|
|
127
|
-
elif self.state == State.EXECUTING:
|
|
128
|
-
if hasattr(self.mic, 'stop_stream'):
|
|
134
|
+
if self.standby_mode:
|
|
135
|
+
self.state = State.STANDBY
|
|
136
|
+
if hasattr(self.vad, "is_pressed") and hasattr(self.mic, "stop_stream"):
|
|
137
|
+
logger.debug("Microphone stream stopped")
|
|
129
138
|
self.mic.stop_stream()
|
|
130
|
-
self.
|
|
139
|
+
self._reset_listening_state()
|
|
140
|
+
else:
|
|
141
|
+
self.state = State.LISTENING if self.expect_reply else State.EXECUTING
|
|
142
|
+
if self.state == State.LISTENING:
|
|
143
|
+
self.was_interrupted = False
|
|
144
|
+
self.current_silence_duration_ms = 0
|
|
145
|
+
self.total_recording_ms = self.current_speech_duration_ms
|
|
146
|
+
self.has_started_speaking = True
|
|
147
|
+
self.total_listening_ms = 0
|
|
148
|
+
elif self.state == State.EXECUTING:
|
|
149
|
+
if hasattr(self.mic, 'stop_stream'):
|
|
150
|
+
logger.debug("Microphone stream stopped")
|
|
151
|
+
self.mic.stop_stream()
|
|
152
|
+
self.llm.start_request({"status": "notification_delivered"})
|
|
131
153
|
else:
|
|
132
154
|
self.current_grace_ms += self.tick_ms
|
|
133
155
|
if self.current_grace_ms > self.config.vad_silence_grace_ms:
|
|
@@ -138,14 +160,22 @@ class CoreEngine:
|
|
|
138
160
|
self.current_grace_ms = 0
|
|
139
161
|
|
|
140
162
|
if not self.speaker.is_speaking():
|
|
141
|
-
|
|
142
|
-
|
|
143
|
-
self.
|
|
144
|
-
|
|
145
|
-
elif self.state == State.EXECUTING:
|
|
146
|
-
if hasattr(self.mic, 'stop_stream'):
|
|
163
|
+
if self.standby_mode:
|
|
164
|
+
self.state = State.STANDBY
|
|
165
|
+
if hasattr(self.vad, "is_pressed") and hasattr(self.mic, "stop_stream"):
|
|
166
|
+
logger.debug("Microphone stream stopped")
|
|
147
167
|
self.mic.stop_stream()
|
|
148
|
-
self.
|
|
168
|
+
self._reset_listening_state()
|
|
169
|
+
else:
|
|
170
|
+
self.state = State.LISTENING if self.expect_reply else State.EXECUTING
|
|
171
|
+
if self.state == State.LISTENING:
|
|
172
|
+
self._reset_listening_state()
|
|
173
|
+
self.was_interrupted = False
|
|
174
|
+
elif self.state == State.EXECUTING:
|
|
175
|
+
if hasattr(self.mic, 'stop_stream'):
|
|
176
|
+
logger.debug("Microphone stream stopped")
|
|
177
|
+
self.mic.stop_stream()
|
|
178
|
+
self.llm.start_request({"status": "notification_delivered"})
|
|
149
179
|
|
|
150
180
|
elif self.state == State.LISTENING:
|
|
151
181
|
self.buffer.append(frame)
|
|
@@ -178,6 +208,7 @@ class CoreEngine:
|
|
|
178
208
|
return
|
|
179
209
|
|
|
180
210
|
if not self.has_started_speaking and self.total_listening_ms >= self.config.listening_timeout_ms:
|
|
211
|
+
logger.info("Silence timeout reached. Prompting LLM.")
|
|
181
212
|
self.llm.start_request({"status": "silence_timeout", "user_transcript": ""})
|
|
182
213
|
self.state = State.PROCESSING
|
|
183
214
|
self.processing_wait_ms = 0
|
|
@@ -189,15 +220,31 @@ class CoreEngine:
|
|
|
189
220
|
else:
|
|
190
221
|
self._reset_listening_state()
|
|
191
222
|
|
|
223
|
+
elif self.state == State.STANDBY:
|
|
224
|
+
if is_speech:
|
|
225
|
+
self.standby_mode = False
|
|
226
|
+
self.state = State.LISTENING
|
|
227
|
+
if hasattr(self.vad, "is_pressed") and hasattr(self.mic, "start_stream"):
|
|
228
|
+
# We closed it earlier for PTT, so we need to reopen it.
|
|
229
|
+
logger.debug("Microphone stream started")
|
|
230
|
+
self.mic.start_stream()
|
|
231
|
+
self._reset_listening_state()
|
|
232
|
+
self.buffer.append(frame)
|
|
233
|
+
self.total_listening_ms += self.tick_ms
|
|
234
|
+
self.current_speech_duration_ms += self.tick_ms
|
|
235
|
+
self.has_started_speaking = True
|
|
236
|
+
self.total_recording_ms += self.tick_ms
|
|
237
|
+
|
|
192
238
|
elif self.state == State.PROCESSING:
|
|
193
239
|
self.buffer.append(frame)
|
|
194
240
|
self.processing_wait_ms += self.tick_ms
|
|
195
241
|
|
|
196
242
|
if self.processing_wait_ms >= self.config.llm_timeout_ms:
|
|
197
243
|
import sys
|
|
198
|
-
|
|
244
|
+
logger.error("LLM Timeout reached. Assuming agent abandoned the voice loop. Tearing down hardware.")
|
|
199
245
|
self.state = State.EXECUTING
|
|
200
246
|
if hasattr(self.mic, 'stop_stream'):
|
|
247
|
+
logger.debug("Microphone stream stopped")
|
|
201
248
|
self.mic.stop_stream()
|
|
202
249
|
self.processing_wait_ms = 0
|
|
203
250
|
self.buffer = []
|
|
@@ -209,6 +256,7 @@ class CoreEngine:
|
|
|
209
256
|
|
|
210
257
|
orphan_speech = any(f.has_speech for f in self.buffer)
|
|
211
258
|
if orphan_speech:
|
|
259
|
+
logger.warning("Orphan speech detected. Interrupted previous context.")
|
|
212
260
|
self.was_interrupted = True
|
|
213
261
|
self.state = State.LISTENING
|
|
214
262
|
self.has_started_speaking = True
|
|
Binary file
|
|
Binary file
|