voice-mcp-server 0.1.24 → 0.1.25
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
package/package.json
CHANGED
|
@@ -125,7 +125,7 @@ def run_audio_daemon():
|
|
|
125
125
|
|
|
126
126
|
# We got a command, wake up the hardware!
|
|
127
127
|
mic.start_stream()
|
|
128
|
-
engine.start_conversation(cmd.get("text", ""))
|
|
128
|
+
engine.start_conversation(cmd.get("text", ""), standby_mode=cmd.get("standby_mode", False))
|
|
129
129
|
engine.expect_reply = cmd.get("expect_reply", True)
|
|
130
130
|
|
|
131
131
|
except queue.Empty:
|
|
@@ -304,6 +304,7 @@ async def converse(request: Request):
|
|
|
304
304
|
session_id = body.get("session_id")
|
|
305
305
|
text_to_speak = body.get("text_to_speak", "")
|
|
306
306
|
expect_reply = body.get("expect_reply", True)
|
|
307
|
+
standby_mode = body.get("standby_mode", False)
|
|
307
308
|
|
|
308
309
|
with mutex_lock:
|
|
309
310
|
if active_session_id is not None and active_session_id != session_id:
|
|
@@ -317,7 +318,7 @@ async def converse(request: Request):
|
|
|
317
318
|
|
|
318
319
|
try:
|
|
319
320
|
# Feed command to daemon
|
|
320
|
-
mcp_command_queue.put({"text": text_to_speak, "expect_reply": expect_reply})
|
|
321
|
+
mcp_command_queue.put({"text": text_to_speak, "expect_reply": expect_reply, "standby_mode": standby_mode})
|
|
321
322
|
|
|
322
323
|
# Wait for human to interact or natural termination, checking for client disconnects
|
|
323
324
|
while True:
|
|
@@ -336,6 +337,8 @@ async def converse(request: Request):
|
|
|
336
337
|
last_active_timestamp = time.time()
|
|
337
338
|
return result
|
|
338
339
|
except queue.Empty:
|
|
340
|
+
if standby_mode:
|
|
341
|
+
last_active_timestamp = time.time()
|
|
339
342
|
await asyncio.sleep(0.01)
|
|
340
343
|
|
|
341
344
|
finally:
|
package/src/mcp_server.py
CHANGED
|
@@ -25,12 +25,53 @@ from mcp.server.fastmcp import FastMCP, Context
|
|
|
25
25
|
|
|
26
26
|
logging.basicConfig(level=logging.INFO, stream=sys.stderr)
|
|
27
27
|
|
|
28
|
-
# Inject the
|
|
28
|
+
# Inject the advanced conversational instructions into the server
|
|
29
29
|
instructions = """
|
|
30
30
|
<voice_loop_instructions>
|
|
31
|
+
# VOICE-NATIVE PAIR PROGRAMMING PROTOCOL
|
|
32
|
+
You are a senior pair-programming partner collaborating with the user via a bidirectional, real-time voice interface. You are NOT a traditional text-based chatbot; you are an autonomous peer sitting next to the user.
|
|
33
|
+
|
|
34
|
+
## Core Constraint: Sequential Execution
|
|
35
|
+
You execute tools strictly sequentially. Your primary communication tool is `voice_converse(text_to_speak, expect_reply)`.
|
|
36
|
+
When you run non-voice tools (reading files, searching, editing), you are "deaf" to the user and your microphone is OFF. To prevent the user from feeling abandoned or locked out, you must proactively orchestrate the conversation using the following rules:
|
|
37
|
+
|
|
38
|
+
## 1. Floor Management (`expect_reply` Heuristics)
|
|
39
|
+
Think of the microphone as a shared conversational token.
|
|
40
|
+
|
|
41
|
+
**Keep the Token (`expect_reply: false`):**
|
|
42
|
+
Use this for micro-updates, acknowledgments, and transitions. You speak, the mic stays OFF, and you immediately execute your next tool.
|
|
43
|
+
- *Acknowledgment:* "Got it, looking into the routing file."
|
|
44
|
+
- *Transitions:* "Auth tests passed, moving on to the user models."
|
|
45
|
+
- *The "Head Down" Warning:* "I'm going to run a deep codebase search. I'll be deaf for a minute while it runs."
|
|
46
|
+
|
|
47
|
+
**Yield the Token (`expect_reply: true`):**
|
|
48
|
+
Use this ONLY when you genuinely need the user to speak. Crucially, when you set this to True, it MUST be the final tool call in your current execution sequence, as you are pausing your logic to wait for human input.
|
|
49
|
+
- *Clarification:* "I hit a compilation error on the auth module. Do you want me to rewrite the types or mock it out?"
|
|
50
|
+
- *Consent Gates:* "I've drafted the refactor for the database schema. Should I go ahead and apply it?"
|
|
51
|
+
- *Task Completion:* "All done with the UI updates. What should we tackle next?"
|
|
52
|
+
|
|
53
|
+
## 2. Rules of Engagement
|
|
54
|
+
- **Be Conversational & Terse:** Never use AI-isms ("As an AI...", "I will now execute the tool..."). Speak like a human engineer ("Let's check...", "Ah, I see the bug...", "On it.").
|
|
55
|
+
- **Never Dump Code:** Never read raw code blocks, markdown, or complex lists out loud. Summarize conceptually.
|
|
56
|
+
- **Interleave Work:** Do not chain multiple silent tools together for long periods without "muttering" an update to the user (`expect_reply: false`).
|
|
57
|
+
|
|
58
|
+
## 3. Handling Silences / Timeouts
|
|
59
|
+
If you ask a question (`expect_reply: true`) but the user is deep in thought, reviewing code, or steps away, the `voice_converse` tool will return `{"status": "silence_timeout"}`.
|
|
60
|
+
|
|
61
|
+
CRITICAL: Do not treat this as an error, and do not mention microphones, timeouts, or technical constraints. Act like a human colleague voluntarily giving them space.
|
|
62
|
+
|
|
63
|
+
You MUST gracefully close the microphone by calling `voice_converse` one last time with `expect_reply: false`. Use brief, casual, supportive phrases such as:
|
|
64
|
+
- "Take your time. Just say my name when you're ready to continue."
|
|
65
|
+
- "Looks like you're focused. I'll pause my mic and stand by."
|
|
66
|
+
- "I'll let you look that over. Ping me when you want to pick it up."
|
|
67
|
+
- "No rush, I'll be right here when you need me."
|
|
68
|
+
|
|
69
|
+
## 4. Handling User Think Time
|
|
70
|
+
If the user says "give me a minute", "let me think", or similar, you MUST acknowledge them quickly using `voice_converse(..., expect_reply=False)`, and then immediately call the `wait_for_user()` tool. This will suspend your execution indefinitely until they are ready to speak again.
|
|
71
|
+
|
|
72
|
+
## 5. Handling System Busy
|
|
31
73
|
If the voice_converse tool returns "status": "system_busy", it means the physical microphone is currently locked by another AI agent in a different window.
|
|
32
74
|
DO NOT retry the tool. Output a standard text message explaining the audio channel is busy, and continue the conversation via text.
|
|
33
|
-
CRITICAL RULE: If you use the voice_converse tool and it returns "status": "silence_timeout", you MUST NOT abandon the voice loop by simply typing a text response. You MUST formally close the hardware loop by calling voice_converse ONE LAST TIME with "expect_reply": false and "text_to_speak": "I didn't hear anything, so I am turning off the microphone now."
|
|
34
75
|
</voice_loop_instructions>
|
|
35
76
|
"""
|
|
36
77
|
|
|
@@ -244,6 +285,45 @@ async def voice_converse(text_to_speak: str, expect_reply: bool = True, ctx: Con
|
|
|
244
285
|
"message": f"CRITICAL Error starting audio daemon: {str(e)}"
|
|
245
286
|
}
|
|
246
287
|
|
|
288
|
+
@mcp.tool()
async def wait_for_user(ctx: Context = None) -> dict:
    """
    Call this tool when the user explicitly asks for time to think.
    It suspends the AI indefinitely until the user speaks.
    """
    # NOTE(review): the docstring above is left untouched on purpose; it is
    # presumably published to the client as the tool description by the
    # @mcp.tool() decorator, so rewording it would change runtime behaviour.
    #
    # Returns the daemon's response dict on success, or a dict with
    # "status": "error", an empty "user_transcript" and a "message" on failure.
    try:
        ensure_daemon_running()
        if ctx:
            await ctx.info("🎙️ Waiting for user to speak... 🎙️")

        # Empty text_to_speak plus standby_mode=True asks the daemon to say
        # nothing and simply wait for speech. The blocking request runs in a
        # worker thread so the event loop stays responsive.
        # NOTE(review): 3600.0 is presumably the request timeout in seconds —
        # confirm against make_uds_request's signature.
        _status, response_data = await asyncio.to_thread(
            make_uds_request,
            "POST",
            "/converse",
            {"session_id": SESSION_ID, "text_to_speak": "", "expect_reply": True, "standby_mode": True},
            3600.0
        )
        return response_data

    # Timeouts must be handled BEFORE the generic socket errors: socket.error
    # is an alias of OSError, and both TimeoutError and socket.timeout are
    # OSError subclasses, so the broader handler below would otherwise swallow
    # every timeout and report the wrong message.
    except (TimeoutError, socket.timeout):
        return {
            "status": "error",
            "user_transcript": "",
            "message": "CRITICAL: The Voice Audio Daemon timed out waiting for speech."
        }
    except (socket.error, ConnectionError, FileNotFoundError, ConnectionRefusedError):
        return {
            "status": "error",
            "user_transcript": "",
            "message": "CRITICAL: The Voice Audio Daemon failed to respond."
        }
    except Exception as e:
        # Last-resort boundary: a tool call should return a structured error
        # rather than raise.
        return {
            "status": "error",
            "user_transcript": "",
            "message": f"CRITICAL Error during standby: {str(e)}"
        }
|
|
326
|
+
|
|
247
327
|
if __name__ == "__main__":
|
|
248
328
|
# 4. Restore the OS-level stdout just before handing control to the MCP SDK
|
|
249
329
|
os.dup2(original_stdout_fd, 1)
|
|
Binary file
|
package/src/simulation/engine.py
CHANGED
|
@@ -9,6 +9,7 @@ class State(Enum):
|
|
|
9
9
|
LISTENING = 3
|
|
10
10
|
PROCESSING = 4
|
|
11
11
|
EXECUTING = 5
|
|
12
|
+
STANDBY = 6
|
|
12
13
|
|
|
13
14
|
class CoreEngine:
|
|
14
15
|
def __init__(self, config: Config, mic: IMicrophone, speaker: ISpeaker, vad: IVAD, stt: ISTT, llm: ILLMBridge):
|
|
@@ -30,17 +31,27 @@ class CoreEngine:
|
|
|
30
31
|
self.latest_transcription = ""
|
|
31
32
|
self.last_tool_call_result = None
|
|
32
33
|
self.expect_reply = True
|
|
34
|
+
self.standby_mode = False
|
|
33
35
|
|
|
34
36
|
self.total_recording_ms = 0
|
|
35
37
|
self.total_listening_ms = 0
|
|
36
38
|
self.has_started_speaking = False
|
|
37
39
|
self.processing_wait_ms = 0
|
|
38
40
|
|
|
39
|
-
def start_conversation(self, initial_text: str):
|
|
41
|
+
def start_conversation(self, initial_text: str, standby_mode: bool = False):
|
|
40
42
|
self.expect_reply = True
|
|
43
|
+
self.standby_mode = standby_mode
|
|
41
44
|
if initial_text:
|
|
42
45
|
self.state = State.AI_SPEAKING
|
|
43
46
|
self.speaker.speak(initial_text)
|
|
47
|
+
elif self.standby_mode:
|
|
48
|
+
# We are entering standby mode to wait for the user indefinitely.
|
|
49
|
+
# If the VAD is PTT, we can safely close the mic stream to turn off the orange dot.
|
|
50
|
+
if hasattr(self.vad, "is_pressed"):
|
|
51
|
+
if hasattr(self.mic, "stop_stream"):
|
|
52
|
+
self.mic.stop_stream()
|
|
53
|
+
self.state = State.STANDBY
|
|
54
|
+
self._reset_listening_state()
|
|
44
55
|
else:
|
|
45
56
|
self.state = State.LISTENING
|
|
46
57
|
self._reset_listening_state()
|
|
@@ -117,17 +128,23 @@ class CoreEngine:
|
|
|
117
128
|
self.has_started_speaking = True
|
|
118
129
|
self.total_listening_ms = 0
|
|
119
130
|
elif not self.speaker.is_speaking():
|
|
120
|
-
|
|
121
|
-
|
|
122
|
-
self.
|
|
123
|
-
self.current_silence_duration_ms = 0
|
|
124
|
-
self.total_recording_ms = self.current_speech_duration_ms
|
|
125
|
-
self.has_started_speaking = True
|
|
126
|
-
self.total_listening_ms = 0
|
|
127
|
-
elif self.state == State.EXECUTING:
|
|
128
|
-
if hasattr(self.mic, 'stop_stream'):
|
|
131
|
+
if self.standby_mode:
|
|
132
|
+
self.state = State.STANDBY
|
|
133
|
+
if hasattr(self.vad, "is_pressed") and hasattr(self.mic, "stop_stream"):
|
|
129
134
|
self.mic.stop_stream()
|
|
130
|
-
self.
|
|
135
|
+
self._reset_listening_state()
|
|
136
|
+
else:
|
|
137
|
+
self.state = State.LISTENING if self.expect_reply else State.EXECUTING
|
|
138
|
+
if self.state == State.LISTENING:
|
|
139
|
+
self.was_interrupted = False
|
|
140
|
+
self.current_silence_duration_ms = 0
|
|
141
|
+
self.total_recording_ms = self.current_speech_duration_ms
|
|
142
|
+
self.has_started_speaking = True
|
|
143
|
+
self.total_listening_ms = 0
|
|
144
|
+
elif self.state == State.EXECUTING:
|
|
145
|
+
if hasattr(self.mic, 'stop_stream'):
|
|
146
|
+
self.mic.stop_stream()
|
|
147
|
+
self.llm.start_request({"status": "notification_delivered"})
|
|
131
148
|
else:
|
|
132
149
|
self.current_grace_ms += self.tick_ms
|
|
133
150
|
if self.current_grace_ms > self.config.vad_silence_grace_ms:
|
|
@@ -138,14 +155,20 @@ class CoreEngine:
|
|
|
138
155
|
self.current_grace_ms = 0
|
|
139
156
|
|
|
140
157
|
if not self.speaker.is_speaking():
|
|
141
|
-
|
|
142
|
-
|
|
143
|
-
self.
|
|
144
|
-
self.was_interrupted = False
|
|
145
|
-
elif self.state == State.EXECUTING:
|
|
146
|
-
if hasattr(self.mic, 'stop_stream'):
|
|
158
|
+
if self.standby_mode:
|
|
159
|
+
self.state = State.STANDBY
|
|
160
|
+
if hasattr(self.vad, "is_pressed") and hasattr(self.mic, "stop_stream"):
|
|
147
161
|
self.mic.stop_stream()
|
|
148
|
-
self.
|
|
162
|
+
self._reset_listening_state()
|
|
163
|
+
else:
|
|
164
|
+
self.state = State.LISTENING if self.expect_reply else State.EXECUTING
|
|
165
|
+
if self.state == State.LISTENING:
|
|
166
|
+
self._reset_listening_state()
|
|
167
|
+
self.was_interrupted = False
|
|
168
|
+
elif self.state == State.EXECUTING:
|
|
169
|
+
if hasattr(self.mic, 'stop_stream'):
|
|
170
|
+
self.mic.stop_stream()
|
|
171
|
+
self.llm.start_request({"status": "notification_delivered"})
|
|
149
172
|
|
|
150
173
|
elif self.state == State.LISTENING:
|
|
151
174
|
self.buffer.append(frame)
|
|
@@ -189,6 +212,20 @@ class CoreEngine:
|
|
|
189
212
|
else:
|
|
190
213
|
self._reset_listening_state()
|
|
191
214
|
|
|
215
|
+
elif self.state == State.STANDBY:
|
|
216
|
+
if is_speech:
|
|
217
|
+
self.standby_mode = False
|
|
218
|
+
self.state = State.LISTENING
|
|
219
|
+
if hasattr(self.vad, "is_pressed") and hasattr(self.mic, "start_stream"):
|
|
220
|
+
# We closed it earlier for PTT, so we need to reopen it.
|
|
221
|
+
self.mic.start_stream()
|
|
222
|
+
self._reset_listening_state()
|
|
223
|
+
self.buffer.append(frame)
|
|
224
|
+
self.total_listening_ms += self.tick_ms
|
|
225
|
+
self.current_speech_duration_ms += self.tick_ms
|
|
226
|
+
self.has_started_speaking = True
|
|
227
|
+
self.total_recording_ms += self.tick_ms
|
|
228
|
+
|
|
192
229
|
elif self.state == State.PROCESSING:
|
|
193
230
|
self.buffer.append(frame)
|
|
194
231
|
self.processing_wait_ms += self.tick_ms
|