npm - voice-mcp-server - Versions diffs - 0.1.25 → 0.2.0 - Mend

voice-mcp-server 0.1.25 → 0.2.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (30)

package/README.md +2 -2
package/config/config.yaml +1 -1
package/config/vad/ptt_vad.yaml +1 -1
package/package.json +1 -1
package/requirements.txt +1 -0
package/src/__pycache__/logger.cpython-312.pyc +0 -0
package/src/__pycache__/mcp_server.cpython-312.pyc +0 -0
package/src/adapters_real/__pycache__/kokoro_speaker.cpython-312.pyc +0 -0
package/src/adapters_real/__pycache__/live_mic.cpython-312.pyc +0 -0
package/src/adapters_real/__pycache__/ptt_vad.cpython-312.pyc +0 -0
package/src/adapters_real/__pycache__/whisper_stt.cpython-312.pyc +0 -0
package/src/adapters_real/kokoro_speaker.py +7 -6
package/src/adapters_real/live_mic.py +15 -4
package/src/adapters_real/ptt_sidecar +0 -0
package/src/adapters_real/ptt_sidecar.swift +156 -0
package/src/adapters_real/ptt_vad.py +143 -25
package/src/adapters_real/whisper_stt.py +5 -4
package/src/daemon/__pycache__/audio_server.cpython-312.pyc +0 -0
package/src/daemon/audio_server.py +47 -13
package/src/logger.py +29 -0
package/src/mcp_server.py +113 -65
package/src/simulation/__pycache__/adapters.cpython-312.pyc +0 -0
package/src/simulation/__pycache__/engine.cpython-312.pyc +0 -0
package/src/simulation/engine.py +12 -1
package/src/simulation/tests/__pycache__/__init__.cpython-312.pyc +0 -0
package/src/simulation/tests/__pycache__/test_ptt_vad.cpython-312-pytest-7.4.2.pyc +0 -0
package/src/simulation/tests/__pycache__/test_scenarios.cpython-312-pytest-7.4.2.pyc +0 -0
package/src/simulation/tests/test_abort_daemon.py +109 -0
package/src/simulation/tests/test_mcp_cancellation.py +83 -0
package/src/simulation/tests/test_ptt_vad.py +81 -0

package/src/daemon/audio_server.py CHANGED Viewed

@@ -4,7 +4,6 @@ import os
 import time
 import threading
 import queue
-import logging
 from contextlib import asynccontextmanager
 from fastapi import FastAPI, Request, HTTPException
 from fastapi.responses import StreamingResponse
@@ -20,6 +19,7 @@ os.environ["TORCH_HOME"] = os.path.join(app_support_dir, "torch")
 # Add src to python path for imports
 sys.path.append(os.path.abspath(os.path.join(os.path.dirname(__file__), '..')))
+from logger import logger
 from simulation.models import Config
 from simulation.engine import CoreEngine, State
 from adapters_real.queue_llm import QueueLLMBridge
@@ -75,7 +75,7 @@ def pre_download_models():
         daemon_status_message = "Finalizing AI setup..."
         daemon_progress = 90
     except Exception as e:
-        print(f"Model download error: {e}", file=sys.stderr)
+        logger.error(f"Model download error: {e}")
         daemon_status_message = f"Error downloading models: {e}"
 def run_audio_daemon():
@@ -92,7 +92,7 @@ def run_audio_daemon():
     with initialize(version_base=None, config_path="../../config"):
         cfg = compose(config_name="config")
-        print("Loaded Hydra configuration successfully.")
+        logger.info("Loaded Hydra configuration successfully.")
     mic = instantiate(cfg.microphone)
     speaker = instantiate(cfg.speaker)
@@ -114,7 +114,7 @@ def run_audio_daemon():
     daemon_status = "READY"
     daemon_status_message = "Audio Engine is online."
     daemon_progress = 100
-    print("Audio Daemon Started. Waiting for commands.", file=sys.stderr)
+    logger.info("Audio Daemon Started. Waiting for commands.")
     try:
         while True:
@@ -125,6 +125,8 @@ def run_audio_daemon():
                     # We got a command, wake up the hardware!
                     mic.start_stream()
+                    if hasattr(vad, "set_active"):
+                        vad.set_active(True)
                     engine.start_conversation(cmd.get("text", ""), standby_mode=cmd.get("standby_mode", False))
                     engine.expect_reply = cmd.get("expect_reply", True)
@@ -135,10 +137,12 @@ def run_audio_daemon():
                 # Once we drop back to EXECUTING, we finished the conversation loop
                 if engine.state == State.EXECUTING:
                     mic.stop_stream()
+                    if hasattr(vad, "set_active"):
+                        vad.set_active(False)
                     last_active_timestamp = time.time()
     except Exception as e:
-        print(f"Daemon exception: {e}", file=sys.stderr)
+        logger.error(f"Daemon exception: {e}")
     finally:
         if mic:
             mic.close()
@@ -150,17 +154,19 @@ async def watchdog():
         await asyncio.sleep(60)
         idle_time = time.time() - last_active_timestamp
         if idle_time > IDLE_TIMEOUT_SECONDS:
-            print(f"Idle timeout reached ({idle_time:.0f}s). Self-destructing to free RAM.", file=sys.stderr)
+            logger.info(f"Idle timeout reached ({idle_time:.0f}s). Self-destructing to free RAM.")
             if mic:
                 mic.close()
             os._exit(0)
 def parent_pid_polling():
     """Polls the parent PID. If the parent dies, the daemon instantly self-destructs."""
+    original_ppid = os.getppid()
     while True:
         time.sleep(3.0)
-        if os.getppid() == 1:
-            print("Parent process died. Stopping daemon to prevent Zombie microphone lock.", file=sys.stderr)
+        current_ppid = os.getppid()
+        if current_ppid == 1 or current_ppid != original_ppid:
+            logger.warning("Parent process died. Stopping daemon to prevent Zombie microphone lock.")
             os._exit(0)
 @asynccontextmanager
@@ -172,10 +178,6 @@ async def lifespan(app: FastAPI):
     # Start the watchdog
     asyncio.create_task(watchdog())
-    # Start the Parent PID Poller
-    polling_thread = threading.Thread(target=parent_pid_polling, daemon=True)
-    polling_thread.start()
     yield
     # Shutdown logic
     if mic:
@@ -289,6 +291,36 @@ async def reload_config():
             daemon_status_message = f"Failed to reload: {str(e)}"
             return {"status": "error", "message": daemon_status_message}
+@app.post("/abort")
+async def abort_conversation():
+    global engine, mic, speaker, vad, active_session_id
+    logger.info("Received /abort command from client. Stopping audio.")
+    with mutex_lock:
+        if speaker:
+            speaker.flush()
+        if engine:
+            engine.state = State.EXECUTING
+            engine.buffer = []
+            if hasattr(engine.vad, "set_active"):
+                engine.vad.set_active(False)
+        if mic:
+            mic.stop_stream()
+        while not mcp_command_queue.empty():
+            try: mcp_command_queue.get_nowait()
+            except queue.Empty: break
+        mcp_result_queue.put({
+            "status": "ok",
+            "user_transcript": "",
+            "was_interrupted": True,
+            "message": "User manually aborted the voice loop using the panic button. You MUST NOT try to speak to the user right now. Wait for them to initiate the next interaction."
+        })
+        active_session_id = None
+    return {"status": "ok"}
 @app.post("/converse")
 async def converse(request: Request):
     global active_session_id, last_active_timestamp
@@ -323,12 +355,14 @@ async def converse(request: Request):
         # Wait for human to interact or natural termination, checking for client disconnects
         while True:
             if await request.is_disconnected():
-                print(f"[{session_id}] Client disconnected! Aborting audio loop.", file=sys.stderr)
+                logger.warning(f"[{session_id}] Client disconnected! Aborting audio loop.")
                 # Client hung up (e.g. reload or ctrl+c). We must reset the engine immediately.
                 if speaker:
                     speaker.flush()
                 if engine:
                     engine.state = State.EXECUTING # This will trigger mic.stop_stream() in the loop
+                    if hasattr(vad, "set_active"):
+                        vad.set_active(False)
                 raise HTTPException(status_code=499, detail="Client Disconnected")
             try:

package/src/logger.py ADDED Viewed

@@ -0,0 +1,29 @@
+import logging
+import sys
+import os
+def setup_logger(name="VoiceMCP", level=logging.INFO):
+    logger = logging.getLogger(name)
+    if not logger.handlers:
+        logger.setLevel(level)
+        # Use a professional telemetry format
+        formatter = logging.Formatter(
+            fmt='%(asctime)s.%(msecs)03d | %(levelname)-7s | %(module)-15s | %(message)s',
+            datefmt='%Y-%m-%d %H:%M:%S'
+        )
+        # Output to stderr to avoid breaking stdio (MCP communication)
+        handler = logging.StreamHandler(sys.stderr)
+        handler.setFormatter(formatter)
+        logger.addHandler(handler)
+        # File logger for persistent telemetry
+        log_dir = os.path.expanduser("~/Library/Application Support/VoiceMCP/logs")
+        os.makedirs(log_dir, exist_ok=True)
+        file_handler = logging.FileHandler(os.path.join(log_dir, "telemetry.log"))
+        file_handler.setFormatter(formatter)
+        logger.addHandler(file_handler)
+    return logger
+logger = setup_logger()

package/src/mcp_server.py CHANGED Viewed

@@ -17,13 +17,11 @@ import json
 import socket
 import http.client
 import time
-import logging
 import asyncio
 import random
 from mcp.server.fastmcp import FastMCP, Context
-logging.basicConfig(level=logging.INFO, stream=sys.stderr)
+from logger import logger
 # Inject the advanced conversational instructions into the server
 instructions = """
@@ -31,47 +29,52 @@ instructions = """
 # VOICE-NATIVE PAIR PROGRAMMING PROTOCOL
 You are a senior pair-programming partner collaborating with the user via a bidirectional, real-time voice interface. You are NOT a traditional text-based chatbot; you are an autonomous peer sitting next to the user.
-## Core Constraint: Sequential Execution
-You execute tools strictly sequentially. Your primary communication tool is `voice_converse(text_to_speak, expect_reply)`.
-When you run non-voice tools (reading files, searching, editing), you are "deaf" to the user and your microphone is OFF. To prevent the user from feeling abandoned or locked out, you must proactively orchestrate the conversation using the following rules:
+## Core Hardware Constraints & Your Senses
+1. **Push-To-Talk (PTT):** The user communicates with you by pressing and holding the `Right Option (⌥)` key.
+2. **Deaf by Default:** You execute tools strictly sequentially. When you run non-voice tools (reading files, searching, editing), your microphone is physically OFF. The user cannot interrupt you during these times.
+3. **Hardware Watchdog:** To save the user's Unified Memory, your backend audio daemon will self-destruct and sleep if you are completely silent for 15 minutes.
+4. **The Panic Button (Double-Tap):** Due to a known bug in the Gemini CLI, clicking "Stop" in the UI will NOT tell the audio daemon to stop talking or listening. To forcefully stop your voice or close the microphone, the user must DOUBLE-TAP the `Right Option` key.
+To prevent the user from feeling abandoned, confused, or locked out, you must orchestrate the conversation using the following rules:
+## 1. First Contact (Onboarding)
+Since voice interfaces lack visual menus, the user might not know the physical controls. On your VERY FIRST conversational turn in a new session, you MUST seamlessly weave a brief explanation of the controls into your greeting.
+*Example:* "Hey, I'm ready to dive in. Just a quick heads up—whenever you want to talk, just press and hold the Right Option key. To force me to stop talking or listening, just double-tap it quickly. If you ever need time to think, just ask me to pause. What are we working on today?"
+CRITICAL: Do not repeat this instruction after the first interaction.
-## 1. Floor Management (`expect_reply` Heuristics)
+## 2. Floor Management (`expect_reply` Heuristics)
 Think of the microphone as a shared conversational token.
 **Keep the Token (`expect_reply: false`):**
 Use this for micro-updates, acknowledgments, and transitions. You speak, the mic stays OFF, and you immediately execute your next tool.
 - *Acknowledgment:* "Got it, looking into the routing file."
-- *Transitions:* "Auth tests passed, moving on to the user models."
-- *The "Head Down" Warning:* "I'm going to run a deep codebase search. I'll be deaf for a minute while it runs."
+- *The "Head Down" Warning (CRITICAL):* If you are about to do a heavy search or multi-file edit, warn the user they cannot interrupt you. "I'm going to run a deep codebase search. I'll be deaf for a minute, so the Right Option key won't work until I'm done."
 **Yield the Token (`expect_reply: true`):**
-Use this ONLY when you genuinely need the user to speak. Crucially, when you set this to True, it MUST be the final tool call in your current execution sequence, as you are pausing your logic to wait for human input.
+Use this ONLY when you genuinely need the user to speak. This MUST be the final tool call in your current execution sequence.
 - *Clarification:* "I hit a compilation error on the auth module. Do you want me to rewrite the types or mock it out?"
-- *Consent Gates:* "I've drafted the refactor for the database schema. Should I go ahead and apply it?"
-- *Task Completion:* "All done with the UI updates. What should we tackle next?"
-## 2. Rules of Engagement
-- **Be Conversational & Terse:** Never use AI-isms ("As an AI...", "I will now execute the tool..."). Speak like a human engineer ("Let's check...", "Ah, I see the bug...", "On it.").
-- **Never Dump Code:** Never read raw code blocks, markdown, or complex lists out loud. Summarize conceptually.
-- **Interleave Work:** Do not chain multiple silent tools together for long periods without "muttering" an update to the user (`expect_reply: false`).
-## 3. Handling Silences / Timeouts
-If you ask a question (`expect_reply: true`) but the user is deep in thought, reviewing code, or steps away, the `voice_converse` tool will return `{"status": "silence_timeout"}`.
-CRITICAL: Do not treat this as an error, and do not mention microphones, timeouts, or technical constraints. Act like a human colleague voluntarily giving them space.
-You MUST gracefully close the microphone by calling `voice_converse` one last time with `expect_reply: false`. Use brief, casual, supportive phrases such as:
-- "Take your time. Just say my name when you're ready to continue."
-- "Looks like you're focused. I'll pause my mic and stand by."
-- "I'll let you look that over. Ping me when you want to pick it up."
-- "No rush, I'll be right here when you need me."
-## 4. Handling User Think Time
-If the user says "give me a minute", "let me think", or similar, you MUST acknowledge them quickly using `voice_converse(..., expect_reply=False)`, and then immediately call the `wait_for_user()` tool. This will suspend your execution indefinitely until they are ready to speak again.
-## 5. Handling System Busy
-If the voice_converse tool returns "status": "system_busy", it means the physical microphone is currently locked by another AI agent in a different window.
-DO NOT retry the tool. Output a standard text message explaining the audio channel is busy, and continue the conversation via text.
+## 3. Handling Hardware Interruptions (`was_interrupted: true`)
+If `voice_converse` returns `was_interrupted: true`, it means the user held the Right Option key and cut you off mid-sentence. Instantly drop your previous train of thought. Do not try to finish your sentence. Acknowledge the interruption naturally and pivot immediately to their new input. (e.g., "Ah, good catch, switching to the backend folder now.")
+## 4. Handling User Think Time & The 15-Minute Watchdog
+If the user says "give me a minute", "let me think", or similar:
+1. Acknowledge them quickly using `voice_converse(..., expect_reply=False)`.
+2. Gently warn them about the 15-minute hardware watchdog.
+3. Remind them to hold the `Right Option` key when they are ready to return.
+4. IMMEDIATELY call the `wait_for_user()` tool.
+*Example:* "Take your time. Just hold the Right Option key to wake me up when you're ready. As a heads up, my audio engine spins down after 15 minutes to save your Mac's memory, but I'll be right here."
+## 5. Handling Silences / Timeouts
+If you ask a question (`expect_reply: true`) but the user doesn't press the Right Option key, the tool will return `{"status": "silence_timeout"}`.
+CRITICAL: Do not treat this as an error. Act like a human colleague voluntarily giving them space. Gracefully close the microphone by calling `voice_converse` one last time with `expect_reply: false`.
+- *Example:* "Looks like you're focused. I'll pause my mic and stand by. Just hold the Right Option key when you want to pick it up."
+## 6. General Rules of Engagement
+- **Be Conversational & Terse:** Never use AI-isms ("As an AI..."). Speak like a human engineer.
+- **Never Dump Code:** Never read raw code blocks out loud. Summarize conceptually.
+- **Interleave Work:** Do not chain multiple silent tools together without muttering an update (`expect_reply: false`).
+- **Handling System Busy:** If you get `"status": "system_busy"`, output a standard text message explaining the audio channel is locked, and continue via text.
 </voice_loop_instructions>
 """
@@ -122,10 +125,10 @@ def ensure_daemon_running():
     if check_daemon_health():
         return
-    logging.info("Daemon is down, attempting to boot detached process...")
+    logger.info("Daemon is down, attempting to boot detached process...")
     # Boot the daemon detached
     project_root = os.path.abspath(os.path.join(os.path.dirname(__file__), ".."))
-    python_exec = os.path.join(app_support_dir, "venv", "bin", "python3")
+    python_exec = sys.executable
     daemon_script = os.path.join(project_root, "src", "daemon", "audio_server.py")
     subprocess.Popen(
@@ -200,20 +203,35 @@ async def render_visualizer(ctx: Context):
     except asyncio.CancelledError:
         pass
+import threading
+def fire_abort():
+    logger.info("Firing synchronous abort request to daemon...")
+    try:
+        make_uds_request("POST", "/abort", None, 5.0)
+        logger.info("Abort request sent successfully.")
+    except Exception as e:
+        logger.error(f"Failed to send abort request: {e}")
+async def make_cancellable_converse_request(payload: dict, timeout: float) -> tuple[int, dict]:
+    try:
+        return await asyncio.to_thread(make_uds_request, "POST", "/converse", payload, timeout)
+    except asyncio.CancelledError:
+        # If the MCP client cancels this tool call, immediately tell the daemon to abort audio
+        logger.warning("Tool call was cancelled by MCP client! Triggering abort.")
+        threading.Thread(target=fire_abort, daemon=True).start()
+        raise
 @mcp.tool()
 async def voice_converse(text_to_speak: str, expect_reply: bool = True, ctx: Context = None) -> dict:
     """
-    Speak a prompt to the user and listen for a response.
-    If expect_reply is False, the tool returns immediately after queuing the speech.
+    Speak a prompt to the user and listen for a response. If expect_reply is False, the tool queues the speech and returns immediately. If expect_reply is True, it yields the floor to the user. If the returned JSON contains `was_interrupted: true`, the user used the Right Option key to cut you off mid-speech; you MUST completely abandon your previous thought and address their new input.
     """
     try:
         ensure_daemon_running()
         async def _do_converse():
-            return await asyncio.to_thread(
-                make_uds_request,
-                "POST",
-                "/converse",
+            return await make_cancellable_converse_request(
                 {"session_id": SESSION_ID, "text_to_speak": text_to_speak, "expect_reply": expect_reply},
                 300.0
             )
@@ -245,6 +263,7 @@ async def voice_converse(text_to_speak: str, expect_reply: bool = True, ctx: Con
                             await ctx.report_progress(d_progress, 100, message=d_msg)
                         if d_status == "READY":
+                            logger.info("Model initialized to RAM")
                             if ctx:
                                 await ctx.info("Voice MCP: Setup Complete!")
@@ -288,41 +307,70 @@ async def voice_converse(text_to_speak: str, expect_reply: bool = True, ctx: Con
 @mcp.tool()
 async def wait_for_user(ctx: Context = None) -> dict:
     """
-    Call this tool when the user explicitly asks for time to think.
-    It suspends the AI indefinitely until the user speaks.
+    Call this tool IMMEDIATELY after using voice_converse(expect_reply=False) to acknowledge a user's explicit request for time to think. It suspends the AI indefinitely until the user presses the Right Option key to wake you back up. Note: The underlying audio daemon will self-destruct after 15 minutes of idle time to free Unified Memory, so you must warn the user of this limit before calling.
     """
     try:
         ensure_daemon_running()
         if ctx:
             await ctx.info("🎙️ Waiting for user to speak... 🎙️")
-        status, response_data = await asyncio.to_thread(
-            make_uds_request,
-            "POST",
-            "/converse",
+        status, response_data = await make_cancellable_converse_request(
             {"session_id": SESSION_ID, "text_to_speak": "", "expect_reply": True, "standby_mode": True},
             3600.0
         )
         return response_data
-    except (socket.error, ConnectionError, FileNotFoundError, ConnectionRefusedError):
-        return {
-            "status": "error",
-            "user_transcript": "",
-            "message": "CRITICAL: The Voice Audio Daemon failed to respond."
-        }
-    except TimeoutError:
-         return {
-            "status": "error",
-            "user_transcript": "",
-            "message": "CRITICAL: The Voice Audio Daemon timed out waiting for speech."
-        }
     except Exception as e:
-         return {
-            "status": "error",
-            "user_transcript": "",
-            "message": f"CRITICAL Error during standby: {str(e)}"
-        }
+        # The daemon likely died from the 15-minute watchdog to save RAM.
+        # Implement the "Ghost Wake-Up": silently listen for Right Option, then boot the daemon.
+        if ctx:
+            await ctx.info("💤 Audio Engine sleeping to save RAM. Press Right Option to wake... 💤")
+        import pynput
+        loop = asyncio.get_running_loop()
+        wake_event = asyncio.Event()
+        def on_press(key):
+            if key in (pynput.keyboard.Key.alt_r, pynput.keyboard.Key.ctrl_r):
+                loop.call_soon_threadsafe(wake_event.set)
+        listener = pynput.keyboard.Listener(on_press=on_press)
+        listener.start()
+        await wake_event.wait()
+        listener.stop()
+        if ctx:
+            await ctx.info("🚀 Waking up Audio Engine... This might take a few seconds... 🚀")
+        try:
+            ensure_daemon_running()
+            status, response_data = await make_cancellable_converse_request(
+                {"session_id": SESSION_ID, "text_to_speak": "", "expect_reply": True, "standby_mode": True},
+                3600.0
+            )
+            return response_data
+        except Exception as retry_e:
+            return {
+                "status": "error",
+                "user_transcript": "",
+                "message": f"CRITICAL Error waking up audio daemon: {str(retry_e)}"
+            }
+import signal
+def cleanup_on_exit(signum, frame):
+    logger.warning(f"Received termination signal {signum}. Firing abort request to daemon...")
+    try:
+        # Use a short timeout to prevent hanging the shutdown process
+        make_uds_request("POST", "/abort", None, 1.0)
+        logger.info("Abort request sent successfully during shutdown.")
+    except Exception as e:
+        logger.error(f"Failed to send abort request during shutdown: {e}")
+    sys.exit(0)
+signal.signal(signal.SIGINT, cleanup_on_exit)
+signal.signal(signal.SIGTERM, cleanup_on_exit)
 if __name__ == "__main__":
     # 4. Restore the OS-level stdout just before handing control to the MCP SDK

package/src/simulation/__pycache__/adapters.cpython-312.pyc ADDED Viewed

Binary file

package/src/simulation/__pycache__/engine.cpython-312.pyc CHANGED Viewed

Binary file

package/src/simulation/engine.py CHANGED Viewed

@@ -2,6 +2,7 @@ from enum import Enum
 from typing import List
 from .models import Config, VirtualAudioFrame
 from .ports import IMicrophone, ISpeaker, IVAD, ISTT, ILLMBridge
+from logger import logger
 class State(Enum):
     IDLE = 1
@@ -49,6 +50,7 @@ class CoreEngine:
             # If the VAD is PTT, we can safely close the mic stream to turn off the orange dot.
             if hasattr(self.vad, "is_pressed"):
                 if hasattr(self.mic, "stop_stream"):
+                    logger.debug("Microphone stream stopped")
                     self.mic.stop_stream()
             self.state = State.STANDBY
             self._reset_listening_state()
@@ -122,6 +124,7 @@ class CoreEngine:
                     else:
                         spoken_text = self.speaker.flush()
                         self.was_interrupted = True
+                        logger.info("Barge-in detected! User interrupted the AI.")
                         self.state = State.LISTENING
                         self.current_silence_duration_ms = 0
                         self.total_recording_ms = self.current_speech_duration_ms
@@ -131,6 +134,7 @@ class CoreEngine:
                     if self.standby_mode:
                         self.state = State.STANDBY
                         if hasattr(self.vad, "is_pressed") and hasattr(self.mic, "stop_stream"):
+                            logger.debug("Microphone stream stopped")
                             self.mic.stop_stream()
                         self._reset_listening_state()
                     else:
@@ -143,6 +147,7 @@ class CoreEngine:
                             self.total_listening_ms = 0
                         elif self.state == State.EXECUTING:
                             if hasattr(self.mic, 'stop_stream'):
+                                logger.debug("Microphone stream stopped")
                                 self.mic.stop_stream()
                             self.llm.start_request({"status": "notification_delivered"})
             else:
@@ -158,6 +163,7 @@ class CoreEngine:
                     if self.standby_mode:
                         self.state = State.STANDBY
                         if hasattr(self.vad, "is_pressed") and hasattr(self.mic, "stop_stream"):
+                            logger.debug("Microphone stream stopped")
                             self.mic.stop_stream()
                         self._reset_listening_state()
                     else:
@@ -167,6 +173,7 @@ class CoreEngine:
                             self.was_interrupted = False
                         elif self.state == State.EXECUTING:
                             if hasattr(self.mic, 'stop_stream'):
+                                logger.debug("Microphone stream stopped")
                                 self.mic.stop_stream()
                             self.llm.start_request({"status": "notification_delivered"})
@@ -201,6 +208,7 @@ class CoreEngine:
                 return
             if not self.has_started_speaking and self.total_listening_ms >= self.config.listening_timeout_ms:
+                logger.info("Silence timeout reached. Prompting LLM.")
                 self.llm.start_request({"status": "silence_timeout", "user_transcript": ""})
                 self.state = State.PROCESSING
                 self.processing_wait_ms = 0
@@ -218,6 +226,7 @@ class CoreEngine:
                 self.state = State.LISTENING
                 if hasattr(self.vad, "is_pressed") and hasattr(self.mic, "start_stream"):
                     # We closed it earlier for PTT, so we need to reopen it.
+                    logger.debug("Microphone stream started")
                     self.mic.start_stream()
                 self._reset_listening_state()
                 self.buffer.append(frame)
@@ -232,9 +241,10 @@ class CoreEngine:
             if self.processing_wait_ms >= self.config.llm_timeout_ms:
                 import sys
-                print("LLM Timeout reached. Assuming agent abandoned the voice loop. Tearing down hardware.", file=sys.stderr)
+                logger.error("LLM Timeout reached. Assuming agent abandoned the voice loop. Tearing down hardware.")
                 self.state = State.EXECUTING
                 if hasattr(self.mic, 'stop_stream'):
+                    logger.debug("Microphone stream stopped")
                     self.mic.stop_stream()
                 self.processing_wait_ms = 0
                 self.buffer = []
@@ -246,6 +256,7 @@ class CoreEngine:
                 orphan_speech = any(f.has_speech for f in self.buffer)
                 if orphan_speech:
+                    logger.warning("Orphan speech detected. Interrupted previous context.")
                     self.was_interrupted = True
                     self.state = State.LISTENING
                     self.has_started_speaking = True

package/src/simulation/tests/__pycache__/__init__.cpython-312.pyc ADDED Viewed

Binary file

package/src/simulation/tests/__pycache__/test_ptt_vad.cpython-312-pytest-7.4.2.pyc ADDED Viewed

Binary file

package/src/simulation/tests/__pycache__/test_scenarios.cpython-312-pytest-7.4.2.pyc ADDED Viewed

Binary file

package/src/simulation/tests/test_abort_daemon.py ADDED Viewed

@@ -0,0 +1,109 @@
+import asyncio
+import socket
+import http.client
+import json
+import os
+import sys
+SOCKET_PATH = os.path.expanduser("~/Library/Application Support/VoiceMCP/daemon.sock")
+class UDSHTTPConnection(http.client.HTTPConnection):
+    def __init__(self, socket_path, timeout=300.0):
+        super().__init__("localhost", timeout=timeout)
+        self.socket_path = socket_path
+    def connect(self):
+        self.sock = socket.socket(socket.AF_UNIX, socket.SOCK_STREAM)
+        self.sock.settimeout(self.timeout)
+        self.sock.connect(self.socket_path)
+def make_uds_request(method: str, path: str, payload: dict = None, timeout: float = 1.0) -> tuple[int, dict]:
+    conn = UDSHTTPConnection(SOCKET_PATH, timeout=timeout)
+    try:
+        body = json.dumps(payload).encode('utf-8') if payload else None
+        headers = {'Content-Type': 'application/json'} if payload else {}
+        conn.request(method, path, body=body, headers=headers)
+        response = conn.getresponse()
+        data = response.read().decode('utf-8')
+        return response.status, json.loads(data) if data else {}
+    finally:
+        conn.close()
+async def test_abort_during_synthesis():
+    print("\n--- Test 1: Abort during TTS Synthesis ---")
+    # Task 1: Start a long conversation
+    async def run_converse():
+        print("[Converse Task] Sending /converse request (Expect a 5-second TTS delay)...")
+        payload = {"session_id": "test_1", "text_to_speak": "This is a very long sentence that will take a moment to synthesize.", "expect_reply": True}
+        try:
+            status, response = await asyncio.to_thread(make_uds_request, "POST", "/converse", payload, 30.0)
+            print(f"[Converse Task] Finished with status: {status}, response: {response}")
+            return response
+        except Exception as e:
+            print(f"[Converse Task] Failed: {e}")
+            return None
+    # Task 2: Fire the abort after 1 second
+    async def run_abort():
+        await asyncio.sleep(1.0)
+        print("[Abort Task] Firing /abort request NOW!")
+        status, response = await asyncio.to_thread(make_uds_request, "POST", "/abort", None, 5.0)
+        print(f"[Abort Task] /abort returned status: {status}, response: {response}")
+    converse_task = asyncio.create_task(run_converse())
+    abort_task = asyncio.create_task(run_abort())
+    response = await converse_task
+    await abort_task
+    if response and "User manually aborted" in response.get("message", ""):
+        print("✅ TEST 1 PASSED: Converse loop was successfully interrupted by /abort!")
+    else:
+        print("❌ TEST 1 FAILED: Converse loop did not return the expected cancellation message.")
+async def test_abort_during_standby():
+    print("\n--- Test 2: Abort during Standby Mode ---")
+    async def run_standby():
+        print("[Standby Task] Entering infinite standby mode...")
+        payload = {"session_id": "test_2", "text_to_speak": "", "expect_reply": True, "standby_mode": True}
+        try:
+            status, response = await asyncio.to_thread(make_uds_request, "POST", "/converse", payload, 30.0)
+            print(f"[Standby Task] Finished with status: {status}, response: {response}")
+            return response
+        except Exception as e:
+            print(f"[Standby Task] Failed: {e}")
+            return None
+    async def run_abort():
+        await asyncio.sleep(1.5)
+        print("[Abort Task] Firing /abort request NOW!")
+        status, response = await asyncio.to_thread(make_uds_request, "POST", "/abort", None, 5.0)
+        print(f"[Abort Task] /abort returned status: {status}, response: {response}")
+    standby_task = asyncio.create_task(run_standby())
+    abort_task = asyncio.create_task(run_abort())
+    response = await standby_task
+    await abort_task
+    if response and "User manually aborted" in response.get("message", ""):
+        print("✅ TEST 2 PASSED: Standby loop was successfully interrupted by /abort!")
+    else:
+        print("❌ TEST 2 FAILED: Standby loop did not return the expected cancellation message.")
+async def main():
+    # Ensure daemon is up before testing
+    try:
+        make_uds_request("GET", "/health")
+    except Exception:
+        print("CRITICAL: Audio Daemon is not running or socket is missing.")
+        sys.exit(1)
+    await test_abort_during_synthesis()
+    await test_abort_during_standby()
+    print("\nAll tests completed.")
+if __name__ == "__main__":
+    asyncio.run(main())