voice-mcp-server 0.1.24 → 0.1.25

This diff shows the changes between publicly available package versions that have been released to one of the supported registries. It is provided for informational purposes only and reflects the contents of those package versions as they appear in their respective public registries.
package/package.json CHANGED
@@ -1,6 +1,6 @@
1
1
  {
2
2
  "name": "voice-mcp-server",
3
- "version": "0.1.24",
3
+ "version": "0.1.25",
4
4
  "description": "An MCP server to allow LLMs to speak and listen via bidirectional voice loops",
5
5
  "main": "build/index.js",
6
6
  "type": "module",
@@ -125,7 +125,7 @@ def run_audio_daemon():
125
125
 
126
126
  # We got a command, wake up the hardware!
127
127
  mic.start_stream()
128
- engine.start_conversation(cmd.get("text", ""))
128
+ engine.start_conversation(cmd.get("text", ""), standby_mode=cmd.get("standby_mode", False))
129
129
  engine.expect_reply = cmd.get("expect_reply", True)
130
130
 
131
131
  except queue.Empty:
@@ -304,6 +304,7 @@ async def converse(request: Request):
304
304
  session_id = body.get("session_id")
305
305
  text_to_speak = body.get("text_to_speak", "")
306
306
  expect_reply = body.get("expect_reply", True)
307
+ standby_mode = body.get("standby_mode", False)
307
308
 
308
309
  with mutex_lock:
309
310
  if active_session_id is not None and active_session_id != session_id:
@@ -317,7 +318,7 @@ async def converse(request: Request):
317
318
 
318
319
  try:
319
320
  # Feed command to daemon
320
- mcp_command_queue.put({"text": text_to_speak, "expect_reply": expect_reply})
321
+ mcp_command_queue.put({"text": text_to_speak, "expect_reply": expect_reply, "standby_mode": standby_mode})
321
322
 
322
323
  # Wait for human to interact or natural termination, checking for client disconnects
323
324
  while True:
@@ -336,6 +337,8 @@ async def converse(request: Request):
336
337
  last_active_timestamp = time.time()
337
338
  return result
338
339
  except queue.Empty:
340
+ if standby_mode:
341
+ last_active_timestamp = time.time()
339
342
  await asyncio.sleep(0.01)
340
343
 
341
344
  finally:
package/src/mcp_server.py CHANGED
@@ -25,12 +25,53 @@ from mcp.server.fastmcp import FastMCP, Context
25
25
 
26
26
  logging.basicConfig(level=logging.INFO, stream=sys.stderr)
27
27
 
28
- # Inject the busy-signal instructions into the server instructions
28
+ # Inject the advanced conversational instructions into the server
29
29
  instructions = """
30
30
  <voice_loop_instructions>
31
+ # VOICE-NATIVE PAIR PROGRAMMING PROTOCOL
32
+ You are a senior pair-programming partner collaborating with the user via a bidirectional, real-time voice interface. You are NOT a traditional text-based chatbot; you are an autonomous peer sitting next to the user.
33
+
34
+ ## Core Constraint: Sequential Execution
35
+ You execute tools strictly sequentially. Your primary communication tool is `voice_converse(text_to_speak, expect_reply)`.
36
+ When you run non-voice tools (reading files, searching, editing), you are "deaf" to the user and your microphone is OFF. To prevent the user from feeling abandoned or locked out, you must proactively orchestrate the conversation using the following rules:
37
+
38
+ ## 1. Floor Management (`expect_reply` Heuristics)
39
+ Think of the microphone as a shared conversational token.
40
+
41
+ **Keep the Token (`expect_reply: false`):**
42
+ Use this for micro-updates, acknowledgments, and transitions. You speak, the mic stays OFF, and you immediately execute your next tool.
43
+ - *Acknowledgment:* "Got it, looking into the routing file."
44
+ - *Transitions:* "Auth tests passed, moving on to the user models."
45
+ - *The "Head Down" Warning:* "I'm going to run a deep codebase search. I'll be deaf for a minute while it runs."
46
+
47
+ **Yield the Token (`expect_reply: true`):**
48
+ Use this ONLY when you genuinely need the user to speak. Crucially, when you set this to True, it MUST be the final tool call in your current execution sequence, as you are pausing your logic to wait for human input.
49
+ - *Clarification:* "I hit a compilation error on the auth module. Do you want me to rewrite the types or mock it out?"
50
+ - *Consent Gates:* "I've drafted the refactor for the database schema. Should I go ahead and apply it?"
51
+ - *Task Completion:* "All done with the UI updates. What should we tackle next?"
52
+
53
+ ## 2. Rules of Engagement
54
+ - **Be Conversational & Terse:** Never use AI-isms ("As an AI...", "I will now execute the tool..."). Speak like a human engineer ("Let's check...", "Ah, I see the bug...", "On it.").
55
+ - **Never Dump Code:** Never read raw code blocks, markdown, or complex lists out loud. Summarize conceptually.
56
+ - **Interleave Work:** Do not chain multiple silent tools together for long periods without "muttering" an update to the user (`expect_reply: false`).
57
+
58
+ ## 3. Handling Silences / Timeouts
59
+ If you ask a question (`expect_reply: true`) but the user is deep in thought, reviewing code, or steps away, the `voice_converse` tool will return `{"status": "silence_timeout"}`.
60
+
61
+ CRITICAL: Do not treat this as an error, and do not mention microphones, timeouts, or technical constraints. Act like a human colleague voluntarily giving them space.
62
+
63
+ You MUST gracefully close the microphone by calling `voice_converse` one last time with `expect_reply: false`. Use brief, casual, supportive phrases such as:
64
+ - "Take your time. Just say my name when you're ready to continue."
65
+ - "Looks like you're focused. I'll pause my mic and stand by."
66
+ - "I'll let you look that over. Ping me when you want to pick it up."
67
+ - "No rush, I'll be right here when you need me."
68
+
69
+ ## 4. Handling User Think Time
70
+ If the user says "give me a minute", "let me think", or similar, you MUST acknowledge them quickly using `voice_converse(..., expect_reply=False)`, and then immediately call the `wait_for_user()` tool. This will suspend your execution indefinitely until they are ready to speak again.
71
+
72
+ ## 5. Handling System Busy
31
73
  If the voice_converse tool returns "status": "system_busy", it means the physical microphone is currently locked by another AI agent in a different window.
32
74
  DO NOT retry the tool. Output a standard text message explaining the audio channel is busy, and continue the conversation via text.
33
- CRITICAL RULE: If you use the voice_converse tool and it returns "status": "silence_timeout", you MUST NOT abandon the voice loop by simply typing a text response. You MUST formally close the hardware loop by calling voice_converse ONE LAST TIME with "expect_reply": false and "text_to_speak": "I didn't hear anything, so I am turning off the microphone now."
34
75
  </voice_loop_instructions>
35
76
  """
36
77
 
@@ -244,6 +285,45 @@ async def voice_converse(text_to_speak: str, expect_reply: bool = True, ctx: Con
244
285
  "message": f"CRITICAL Error starting audio daemon: {str(e)}"
245
286
  }
246
287
 
288
+ @mcp.tool()
289
+ async def wait_for_user(ctx: Context = None) -> dict:
290
+ """
291
+ Call this tool when the user explicitly asks for time to think.
292
+ It suspends the AI indefinitely until the user speaks.
293
+ """
294
+ try:
295
+ ensure_daemon_running()
296
+ if ctx:
297
+ await ctx.info("🎙️ Waiting for user to speak... 🎙️")
298
+
299
+ status, response_data = await asyncio.to_thread(
300
+ make_uds_request,
301
+ "POST",
302
+ "/converse",
303
+ {"session_id": SESSION_ID, "text_to_speak": "", "expect_reply": True, "standby_mode": True},
304
+ 3600.0
305
+ )
306
+ return response_data
307
+
308
+ except (socket.error, ConnectionError, FileNotFoundError, ConnectionRefusedError):
309
+ return {
310
+ "status": "error",
311
+ "user_transcript": "",
312
+ "message": "CRITICAL: The Voice Audio Daemon failed to respond."
313
+ }
314
+ except TimeoutError:
315
+ return {
316
+ "status": "error",
317
+ "user_transcript": "",
318
+ "message": "CRITICAL: The Voice Audio Daemon timed out waiting for speech."
319
+ }
320
+ except Exception as e:
321
+ return {
322
+ "status": "error",
323
+ "user_transcript": "",
324
+ "message": f"CRITICAL Error during standby: {str(e)}"
325
+ }
326
+
247
327
  if __name__ == "__main__":
248
328
  # 4. Restore the OS-level stdout just before handing control to the MCP SDK
249
329
  os.dup2(original_stdout_fd, 1)
@@ -9,6 +9,7 @@ class State(Enum):
9
9
  LISTENING = 3
10
10
  PROCESSING = 4
11
11
  EXECUTING = 5
12
+ STANDBY = 6
12
13
 
13
14
  class CoreEngine:
14
15
  def __init__(self, config: Config, mic: IMicrophone, speaker: ISpeaker, vad: IVAD, stt: ISTT, llm: ILLMBridge):
@@ -30,17 +31,27 @@ class CoreEngine:
30
31
  self.latest_transcription = ""
31
32
  self.last_tool_call_result = None
32
33
  self.expect_reply = True
34
+ self.standby_mode = False
33
35
 
34
36
  self.total_recording_ms = 0
35
37
  self.total_listening_ms = 0
36
38
  self.has_started_speaking = False
37
39
  self.processing_wait_ms = 0
38
40
 
39
- def start_conversation(self, initial_text: str):
41
+ def start_conversation(self, initial_text: str, standby_mode: bool = False):
40
42
  self.expect_reply = True
43
+ self.standby_mode = standby_mode
41
44
  if initial_text:
42
45
  self.state = State.AI_SPEAKING
43
46
  self.speaker.speak(initial_text)
47
+ elif self.standby_mode:
48
+ # We are entering standby mode to wait for the user indefinitely.
49
+ # If the VAD is PTT, we can safely close the mic stream to turn off the orange dot.
50
+ if hasattr(self.vad, "is_pressed"):
51
+ if hasattr(self.mic, "stop_stream"):
52
+ self.mic.stop_stream()
53
+ self.state = State.STANDBY
54
+ self._reset_listening_state()
44
55
  else:
45
56
  self.state = State.LISTENING
46
57
  self._reset_listening_state()
@@ -117,17 +128,23 @@ class CoreEngine:
117
128
  self.has_started_speaking = True
118
129
  self.total_listening_ms = 0
119
130
  elif not self.speaker.is_speaking():
120
- self.state = State.LISTENING if self.expect_reply else State.EXECUTING
121
- if self.state == State.LISTENING:
122
- self.was_interrupted = False
123
- self.current_silence_duration_ms = 0
124
- self.total_recording_ms = self.current_speech_duration_ms
125
- self.has_started_speaking = True
126
- self.total_listening_ms = 0
127
- elif self.state == State.EXECUTING:
128
- if hasattr(self.mic, 'stop_stream'):
131
+ if self.standby_mode:
132
+ self.state = State.STANDBY
133
+ if hasattr(self.vad, "is_pressed") and hasattr(self.mic, "stop_stream"):
129
134
  self.mic.stop_stream()
130
- self.llm.start_request({"status": "notification_delivered"})
135
+ self._reset_listening_state()
136
+ else:
137
+ self.state = State.LISTENING if self.expect_reply else State.EXECUTING
138
+ if self.state == State.LISTENING:
139
+ self.was_interrupted = False
140
+ self.current_silence_duration_ms = 0
141
+ self.total_recording_ms = self.current_speech_duration_ms
142
+ self.has_started_speaking = True
143
+ self.total_listening_ms = 0
144
+ elif self.state == State.EXECUTING:
145
+ if hasattr(self.mic, 'stop_stream'):
146
+ self.mic.stop_stream()
147
+ self.llm.start_request({"status": "notification_delivered"})
131
148
  else:
132
149
  self.current_grace_ms += self.tick_ms
133
150
  if self.current_grace_ms > self.config.vad_silence_grace_ms:
@@ -138,14 +155,20 @@ class CoreEngine:
138
155
  self.current_grace_ms = 0
139
156
 
140
157
  if not self.speaker.is_speaking():
141
- self.state = State.LISTENING if self.expect_reply else State.EXECUTING
142
- if self.state == State.LISTENING:
143
- self._reset_listening_state()
144
- self.was_interrupted = False
145
- elif self.state == State.EXECUTING:
146
- if hasattr(self.mic, 'stop_stream'):
158
+ if self.standby_mode:
159
+ self.state = State.STANDBY
160
+ if hasattr(self.vad, "is_pressed") and hasattr(self.mic, "stop_stream"):
147
161
  self.mic.stop_stream()
148
- self.llm.start_request({"status": "notification_delivered"})
162
+ self._reset_listening_state()
163
+ else:
164
+ self.state = State.LISTENING if self.expect_reply else State.EXECUTING
165
+ if self.state == State.LISTENING:
166
+ self._reset_listening_state()
167
+ self.was_interrupted = False
168
+ elif self.state == State.EXECUTING:
169
+ if hasattr(self.mic, 'stop_stream'):
170
+ self.mic.stop_stream()
171
+ self.llm.start_request({"status": "notification_delivered"})
149
172
 
150
173
  elif self.state == State.LISTENING:
151
174
  self.buffer.append(frame)
@@ -189,6 +212,20 @@ class CoreEngine:
189
212
  else:
190
213
  self._reset_listening_state()
191
214
 
215
+ elif self.state == State.STANDBY:
216
+ if is_speech:
217
+ self.standby_mode = False
218
+ self.state = State.LISTENING
219
+ if hasattr(self.vad, "is_pressed") and hasattr(self.mic, "start_stream"):
220
+ # We closed it earlier for PTT, so we need to reopen it.
221
+ self.mic.start_stream()
222
+ self._reset_listening_state()
223
+ self.buffer.append(frame)
224
+ self.total_listening_ms += self.tick_ms
225
+ self.current_speech_duration_ms += self.tick_ms
226
+ self.has_started_speaking = True
227
+ self.total_recording_ms += self.tick_ms
228
+
192
229
  elif self.state == State.PROCESSING:
193
230
  self.buffer.append(frame)
194
231
  self.processing_wait_ms += self.tick_ms