npm - voice-mcp-server - Versions diffs - 0.1.24 → 0.1.25 - Mend

voice-mcp-server 0.1.24 → 0.1.25

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (5)

package/package.json +1 -1
package/src/daemon/audio_server.py +5 -2
package/src/mcp_server.py +82 -2
package/src/simulation/__pycache__/engine.cpython-312.pyc +0 -0
package/src/simulation/engine.py +55 -18

package/package.json CHANGED Viewed

@@ -1,6 +1,6 @@
 {
   "name": "voice-mcp-server",
-  "version": "0.1.24",
+  "version": "0.1.25",
   "description": "An MCP server to allow LLMs to speak and listen via bidirectional voice loops",
   "main": "build/index.js",
   "type": "module",

package/src/daemon/audio_server.py CHANGED Viewed

@@ -125,7 +125,7 @@ def run_audio_daemon():
                     # We got a command, wake up the hardware!
                     mic.start_stream()
-                    engine.start_conversation(cmd.get("text", ""))
+                    engine.start_conversation(cmd.get("text", ""), standby_mode=cmd.get("standby_mode", False))
                     engine.expect_reply = cmd.get("expect_reply", True)
                 except queue.Empty:
@@ -304,6 +304,7 @@ async def converse(request: Request):
     session_id = body.get("session_id")
     text_to_speak = body.get("text_to_speak", "")
     expect_reply = body.get("expect_reply", True)
+    standby_mode = body.get("standby_mode", False)
     with mutex_lock:
         if active_session_id is not None and active_session_id != session_id:
@@ -317,7 +318,7 @@ async def converse(request: Request):
     try:
         # Feed command to daemon
-        mcp_command_queue.put({"text": text_to_speak, "expect_reply": expect_reply})
+        mcp_command_queue.put({"text": text_to_speak, "expect_reply": expect_reply, "standby_mode": standby_mode})
         # Wait for human to interact or natural termination, checking for client disconnects
         while True:
@@ -336,6 +337,8 @@ async def converse(request: Request):
                 last_active_timestamp = time.time()
                 return result
             except queue.Empty:
+                if standby_mode:
+                    last_active_timestamp = time.time()
                 await asyncio.sleep(0.01)
     finally:

package/src/mcp_server.py CHANGED Viewed

@@ -25,12 +25,53 @@ from mcp.server.fastmcp import FastMCP, Context
 logging.basicConfig(level=logging.INFO, stream=sys.stderr)
-# Inject the busy-signal instructions into the server instructions
+# Inject the advanced conversational instructions into the server
 instructions = """
 <voice_loop_instructions>
+# VOICE-NATIVE PAIR PROGRAMMING PROTOCOL
+You are a senior pair-programming partner collaborating with the user via a bidirectional, real-time voice interface. You are NOT a traditional text-based chatbot; you are an autonomous peer sitting next to the user.
+## Core Constraint: Sequential Execution
+You execute tools strictly sequentially. Your primary communication tool is `voice_converse(text_to_speak, expect_reply)`.
+When you run non-voice tools (reading files, searching, editing), you are "deaf" to the user and your microphone is OFF. To prevent the user from feeling abandoned or locked out, you must proactively orchestrate the conversation using the following rules:
+## 1. Floor Management (`expect_reply` Heuristics)
+Think of the microphone as a shared conversational token.
+**Keep the Token (`expect_reply: false`):**
+Use this for micro-updates, acknowledgments, and transitions. You speak, the mic stays OFF, and you immediately execute your next tool.
+- *Acknowledgment:* "Got it, looking into the routing file."
+- *Transitions:* "Auth tests passed, moving on to the user models."
+- *The "Head Down" Warning:* "I'm going to run a deep codebase search. I'll be deaf for a minute while it runs."
+**Yield the Token (`expect_reply: true`):**
+Use this ONLY when you genuinely need the user to speak. Crucially, when you set this to True, it MUST be the final tool call in your current execution sequence, as you are pausing your logic to wait for human input.
+- *Clarification:* "I hit a compilation error on the auth module. Do you want me to rewrite the types or mock it out?"
+- *Consent Gates:* "I've drafted the refactor for the database schema. Should I go ahead and apply it?"
+- *Task Completion:* "All done with the UI updates. What should we tackle next?"
+## 2. Rules of Engagement
+- **Be Conversational & Terse:** Never use AI-isms ("As an AI...", "I will now execute the tool..."). Speak like a human engineer ("Let's check...", "Ah, I see the bug...", "On it.").
+- **Never Dump Code:** Never read raw code blocks, markdown, or complex lists out loud. Summarize conceptually.
+- **Interleave Work:** Do not chain multiple silent tools together for long periods without "muttering" an update to the user (`expect_reply: false`).
+## 3. Handling Silences / Timeouts
+If you ask a question (`expect_reply: true`) but the user is deep in thought, reviewing code, or steps away, the `voice_converse` tool will return `{"status": "silence_timeout"}`.
+CRITICAL: Do not treat this as an error, and do not mention microphones, timeouts, or technical constraints. Act like a human colleague voluntarily giving them space.
+You MUST gracefully close the microphone by calling `voice_converse` one last time with `expect_reply: false`. Use brief, casual, supportive phrases such as:
+- "Take your time. Just say my name when you're ready to continue."
+- "Looks like you're focused. I'll pause my mic and stand by."
+- "I'll let you look that over. Ping me when you want to pick it up."
+- "No rush, I'll be right here when you need me."
+## 4. Handling User Think Time
+If the user says "give me a minute", "let me think", or similar, you MUST acknowledge them quickly using `voice_converse(..., expect_reply=False)`, and then immediately call the `wait_for_user()` tool. This will suspend your execution indefinitely until they are ready to speak again.
+## 5. Handling System Busy
 If the voice_converse tool returns "status": "system_busy", it means the physical microphone is currently locked by another AI agent in a different window.
 DO NOT retry the tool. Output a standard text message explaining the audio channel is busy, and continue the conversation via text.
-CRITICAL RULE: If you use the voice_converse tool and it returns "status": "silence_timeout", you MUST NOT abandon the voice loop by simply typing a text response. You MUST formally close the hardware loop by calling voice_converse ONE LAST TIME with "expect_reply": false and "text_to_speak": "I didn't hear anything, so I am turning off the microphone now."
 </voice_loop_instructions>
 """
@@ -244,6 +285,45 @@ async def voice_converse(text_to_speak: str, expect_reply: bool = True, ctx: Con
             "message": f"CRITICAL Error starting audio daemon: {str(e)}"
         }
+@mcp.tool()
+async def wait_for_user(ctx: Context = None) -> dict:
+    """
+    Call this tool when the user explicitly asks for time to think.
+    It suspends the AI indefinitely until the user speaks.
+    """
+    try:
+        ensure_daemon_running()
+        if ctx:
+            await ctx.info("🎙️ Waiting for user to speak... 🎙️")
+        status, response_data = await asyncio.to_thread(
+            make_uds_request,
+            "POST",
+            "/converse",
+            {"session_id": SESSION_ID, "text_to_speak": "", "expect_reply": True, "standby_mode": True},
+            3600.0
+        )
+        return response_data
+    except (socket.error, ConnectionError, FileNotFoundError, ConnectionRefusedError):
+        return {
+            "status": "error",
+            "user_transcript": "",
+            "message": "CRITICAL: The Voice Audio Daemon failed to respond."
+        }
+    except TimeoutError:
+         return {
+            "status": "error",
+            "user_transcript": "",
+            "message": "CRITICAL: The Voice Audio Daemon timed out waiting for speech."
+        }
+    except Exception as e:
+         return {
+            "status": "error",
+            "user_transcript": "",
+            "message": f"CRITICAL Error during standby: {str(e)}"
+        }
 if __name__ == "__main__":
     # 4. Restore the OS-level stdout just before handing control to the MCP SDK
     os.dup2(original_stdout_fd, 1)

package/src/simulation/__pycache__/engine.cpython-312.pyc CHANGED Viewed

Binary file

package/src/simulation/engine.py CHANGED Viewed

@@ -9,6 +9,7 @@ class State(Enum):
     LISTENING = 3
     PROCESSING = 4
     EXECUTING = 5
+    STANDBY = 6
 class CoreEngine:
     def __init__(self, config: Config, mic: IMicrophone, speaker: ISpeaker, vad: IVAD, stt: ISTT, llm: ILLMBridge):
@@ -30,17 +31,27 @@ class CoreEngine:
         self.latest_transcription = ""
         self.last_tool_call_result = None
         self.expect_reply = True
+        self.standby_mode = False
         self.total_recording_ms = 0
         self.total_listening_ms = 0
         self.has_started_speaking = False
         self.processing_wait_ms = 0
-    def start_conversation(self, initial_text: str):
+    def start_conversation(self, initial_text: str, standby_mode: bool = False):
         self.expect_reply = True
+        self.standby_mode = standby_mode
         if initial_text:
             self.state = State.AI_SPEAKING
             self.speaker.speak(initial_text)
+        elif self.standby_mode:
+            # We are entering standby mode to wait for the user indefinitely.
+            # If the VAD is PTT, we can safely close the mic stream to turn off the orange dot.
+            if hasattr(self.vad, "is_pressed"):
+                if hasattr(self.mic, "stop_stream"):
+                    self.mic.stop_stream()
+            self.state = State.STANDBY
+            self._reset_listening_state()
         else:
             self.state = State.LISTENING
             self._reset_listening_state()
@@ -117,17 +128,23 @@ class CoreEngine:
                         self.has_started_speaking = True
                         self.total_listening_ms = 0
                 elif not self.speaker.is_speaking():
-                    self.state = State.LISTENING if self.expect_reply else State.EXECUTING
-                    if self.state == State.LISTENING:
-                        self.was_interrupted = False
-                        self.current_silence_duration_ms = 0
-                        self.total_recording_ms = self.current_speech_duration_ms
-                        self.has_started_speaking = True
-                        self.total_listening_ms = 0
-                    elif self.state == State.EXECUTING:
-                        if hasattr(self.mic, 'stop_stream'):
+                    if self.standby_mode:
+                        self.state = State.STANDBY
+                        if hasattr(self.vad, "is_pressed") and hasattr(self.mic, "stop_stream"):
                             self.mic.stop_stream()
-                        self.llm.start_request({"status": "notification_delivered"})
+                        self._reset_listening_state()
+                    else:
+                        self.state = State.LISTENING if self.expect_reply else State.EXECUTING
+                        if self.state == State.LISTENING:
+                            self.was_interrupted = False
+                            self.current_silence_duration_ms = 0
+                            self.total_recording_ms = self.current_speech_duration_ms
+                            self.has_started_speaking = True
+                            self.total_listening_ms = 0
+                        elif self.state == State.EXECUTING:
+                            if hasattr(self.mic, 'stop_stream'):
+                                self.mic.stop_stream()
+                            self.llm.start_request({"status": "notification_delivered"})
             else:
                 self.current_grace_ms += self.tick_ms
                 if self.current_grace_ms > self.config.vad_silence_grace_ms:
@@ -138,14 +155,20 @@ class CoreEngine:
                     self.current_grace_ms = 0
                 if not self.speaker.is_speaking():
-                    self.state = State.LISTENING if self.expect_reply else State.EXECUTING
-                    if self.state == State.LISTENING:
-                        self._reset_listening_state()
-                        self.was_interrupted = False
-                    elif self.state == State.EXECUTING:
-                        if hasattr(self.mic, 'stop_stream'):
+                    if self.standby_mode:
+                        self.state = State.STANDBY
+                        if hasattr(self.vad, "is_pressed") and hasattr(self.mic, "stop_stream"):
                             self.mic.stop_stream()
-                        self.llm.start_request({"status": "notification_delivered"})
+                        self._reset_listening_state()
+                    else:
+                        self.state = State.LISTENING if self.expect_reply else State.EXECUTING
+                        if self.state == State.LISTENING:
+                            self._reset_listening_state()
+                            self.was_interrupted = False
+                        elif self.state == State.EXECUTING:
+                            if hasattr(self.mic, 'stop_stream'):
+                                self.mic.stop_stream()
+                            self.llm.start_request({"status": "notification_delivered"})
         elif self.state == State.LISTENING:
             self.buffer.append(frame)
@@ -189,6 +212,20 @@ class CoreEngine:
                 else:
                     self._reset_listening_state()
+        elif self.state == State.STANDBY:
+            if is_speech:
+                self.standby_mode = False
+                self.state = State.LISTENING
+                if hasattr(self.vad, "is_pressed") and hasattr(self.mic, "start_stream"):
+                    # We closed it earlier for PTT, so we need to reopen it.
+                    self.mic.start_stream()
+                self._reset_listening_state()
+                self.buffer.append(frame)
+                self.total_listening_ms += self.tick_ms
+                self.current_speech_duration_ms += self.tick_ms
+                self.has_started_speaking = True
+                self.total_recording_ms += self.tick_ms
         elif self.state == State.PROCESSING:
             self.buffer.append(frame)
             self.processing_wait_ms += self.tick_ms