voice-mcp-server 0.1.24 → 0.2.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (30)
  1. package/README.md +2 -2
  2. package/config/config.yaml +1 -1
  3. package/config/vad/ptt_vad.yaml +1 -1
  4. package/package.json +1 -1
  5. package/requirements.txt +1 -0
  6. package/src/__pycache__/logger.cpython-312.pyc +0 -0
  7. package/src/__pycache__/mcp_server.cpython-312.pyc +0 -0
  8. package/src/adapters_real/__pycache__/kokoro_speaker.cpython-312.pyc +0 -0
  9. package/src/adapters_real/__pycache__/live_mic.cpython-312.pyc +0 -0
  10. package/src/adapters_real/__pycache__/ptt_vad.cpython-312.pyc +0 -0
  11. package/src/adapters_real/__pycache__/whisper_stt.cpython-312.pyc +0 -0
  12. package/src/adapters_real/kokoro_speaker.py +7 -6
  13. package/src/adapters_real/live_mic.py +15 -4
  14. package/src/adapters_real/ptt_sidecar +0 -0
  15. package/src/adapters_real/ptt_sidecar.swift +156 -0
  16. package/src/adapters_real/ptt_vad.py +143 -25
  17. package/src/adapters_real/whisper_stt.py +5 -4
  18. package/src/daemon/__pycache__/audio_server.cpython-312.pyc +0 -0
  19. package/src/daemon/audio_server.py +52 -15
  20. package/src/logger.py +29 -0
  21. package/src/mcp_server.py +143 -15
  22. package/src/simulation/__pycache__/adapters.cpython-312.pyc +0 -0
  23. package/src/simulation/__pycache__/engine.cpython-312.pyc +0 -0
  24. package/src/simulation/engine.py +67 -19
  25. package/src/simulation/tests/__pycache__/__init__.cpython-312.pyc +0 -0
  26. package/src/simulation/tests/__pycache__/test_ptt_vad.cpython-312-pytest-7.4.2.pyc +0 -0
  27. package/src/simulation/tests/__pycache__/test_scenarios.cpython-312-pytest-7.4.2.pyc +0 -0
  28. package/src/simulation/tests/test_abort_daemon.py +109 -0
  29. package/src/simulation/tests/test_mcp_cancellation.py +83 -0
  30. package/src/simulation/tests/test_ptt_vad.py +81 -0
package/src/daemon/audio_server.py CHANGED
@@ -4,7 +4,6 @@ import os
4
4
  import time
5
5
  import threading
6
6
  import queue
7
- import logging
8
7
  from contextlib import asynccontextmanager
9
8
  from fastapi import FastAPI, Request, HTTPException
10
9
  from fastapi.responses import StreamingResponse
@@ -20,6 +19,7 @@ os.environ["TORCH_HOME"] = os.path.join(app_support_dir, "torch")
20
19
  # Add src to python path for imports
21
20
  sys.path.append(os.path.abspath(os.path.join(os.path.dirname(__file__), '..')))
22
21
 
22
+ from logger import logger
23
23
  from simulation.models import Config
24
24
  from simulation.engine import CoreEngine, State
25
25
  from adapters_real.queue_llm import QueueLLMBridge
@@ -75,7 +75,7 @@ def pre_download_models():
75
75
  daemon_status_message = "Finalizing AI setup..."
76
76
  daemon_progress = 90
77
77
  except Exception as e:
78
- print(f"Model download error: {e}", file=sys.stderr)
78
+ logger.error(f"Model download error: {e}")
79
79
  daemon_status_message = f"Error downloading models: {e}"
80
80
 
81
81
  def run_audio_daemon():
@@ -92,7 +92,7 @@ def run_audio_daemon():
92
92
 
93
93
  with initialize(version_base=None, config_path="../../config"):
94
94
  cfg = compose(config_name="config")
95
- print("Loaded Hydra configuration successfully.")
95
+ logger.info("Loaded Hydra configuration successfully.")
96
96
 
97
97
  mic = instantiate(cfg.microphone)
98
98
  speaker = instantiate(cfg.speaker)
@@ -114,7 +114,7 @@ def run_audio_daemon():
114
114
  daemon_status = "READY"
115
115
  daemon_status_message = "Audio Engine is online."
116
116
  daemon_progress = 100
117
- print("Audio Daemon Started. Waiting for commands.", file=sys.stderr)
117
+ logger.info("Audio Daemon Started. Waiting for commands.")
118
118
 
119
119
  try:
120
120
  while True:
@@ -125,7 +125,9 @@ def run_audio_daemon():
125
125
 
126
126
  # We got a command, wake up the hardware!
127
127
  mic.start_stream()
128
- engine.start_conversation(cmd.get("text", ""))
128
+ if hasattr(vad, "set_active"):
129
+ vad.set_active(True)
130
+ engine.start_conversation(cmd.get("text", ""), standby_mode=cmd.get("standby_mode", False))
129
131
  engine.expect_reply = cmd.get("expect_reply", True)
130
132
 
131
133
  except queue.Empty:
@@ -135,10 +137,12 @@ def run_audio_daemon():
135
137
  # Once we drop back to EXECUTING, we finished the conversation loop
136
138
  if engine.state == State.EXECUTING:
137
139
  mic.stop_stream()
140
+ if hasattr(vad, "set_active"):
141
+ vad.set_active(False)
138
142
  last_active_timestamp = time.time()
139
143
 
140
144
  except Exception as e:
141
- print(f"Daemon exception: {e}", file=sys.stderr)
145
+ logger.error(f"Daemon exception: {e}")
142
146
  finally:
143
147
  if mic:
144
148
  mic.close()
@@ -150,17 +154,19 @@ async def watchdog():
150
154
  await asyncio.sleep(60)
151
155
  idle_time = time.time() - last_active_timestamp
152
156
  if idle_time > IDLE_TIMEOUT_SECONDS:
153
- print(f"Idle timeout reached ({idle_time:.0f}s). Self-destructing to free RAM.", file=sys.stderr)
157
+ logger.info(f"Idle timeout reached ({idle_time:.0f}s). Self-destructing to free RAM.")
154
158
  if mic:
155
159
  mic.close()
156
160
  os._exit(0)
157
161
 
158
162
  def parent_pid_polling():
159
163
  """Polls the parent PID. If the parent dies, the daemon instantly self-destructs."""
164
+ original_ppid = os.getppid()
160
165
  while True:
161
166
  time.sleep(3.0)
162
- if os.getppid() == 1:
163
- print("Parent process died. Stopping daemon to prevent Zombie microphone lock.", file=sys.stderr)
167
+ current_ppid = os.getppid()
168
+ if current_ppid == 1 or current_ppid != original_ppid:
169
+ logger.warning("Parent process died. Stopping daemon to prevent Zombie microphone lock.")
164
170
  os._exit(0)
165
171
 
166
172
  @asynccontextmanager
@@ -172,10 +178,6 @@ async def lifespan(app: FastAPI):
172
178
  # Start the watchdog
173
179
  asyncio.create_task(watchdog())
174
180
 
175
- # Start the Parent PID Poller
176
- polling_thread = threading.Thread(target=parent_pid_polling, daemon=True)
177
- polling_thread.start()
178
-
179
181
  yield
180
182
  # Shutdown logic
181
183
  if mic:
@@ -289,6 +291,36 @@ async def reload_config():
289
291
  daemon_status_message = f"Failed to reload: {str(e)}"
290
292
  return {"status": "error", "message": daemon_status_message}
291
293
 
294
+ @app.post("/abort")
295
+ async def abort_conversation():
296
+ global engine, mic, speaker, vad, active_session_id
297
+ logger.info("Received /abort command from client. Stopping audio.")
298
+ with mutex_lock:
299
+ if speaker:
300
+ speaker.flush()
301
+ if engine:
302
+ engine.state = State.EXECUTING
303
+ engine.buffer = []
304
+ if hasattr(engine.vad, "set_active"):
305
+ engine.vad.set_active(False)
306
+ if mic:
307
+ mic.stop_stream()
308
+
309
+ while not mcp_command_queue.empty():
310
+ try: mcp_command_queue.get_nowait()
311
+ except queue.Empty: break
312
+
313
+ mcp_result_queue.put({
314
+ "status": "ok",
315
+ "user_transcript": "",
316
+ "was_interrupted": True,
317
+ "message": "User manually aborted the voice loop using the panic button. You MUST NOT try to speak to the user right now. Wait for them to initiate the next interaction."
318
+ })
319
+
320
+ active_session_id = None
321
+
322
+ return {"status": "ok"}
323
+
292
324
  @app.post("/converse")
293
325
  async def converse(request: Request):
294
326
  global active_session_id, last_active_timestamp
@@ -304,6 +336,7 @@ async def converse(request: Request):
304
336
  session_id = body.get("session_id")
305
337
  text_to_speak = body.get("text_to_speak", "")
306
338
  expect_reply = body.get("expect_reply", True)
339
+ standby_mode = body.get("standby_mode", False)
307
340
 
308
341
  with mutex_lock:
309
342
  if active_session_id is not None and active_session_id != session_id:
@@ -317,17 +350,19 @@ async def converse(request: Request):
317
350
 
318
351
  try:
319
352
  # Feed command to daemon
320
- mcp_command_queue.put({"text": text_to_speak, "expect_reply": expect_reply})
353
+ mcp_command_queue.put({"text": text_to_speak, "expect_reply": expect_reply, "standby_mode": standby_mode})
321
354
 
322
355
  # Wait for human to interact or natural termination, checking for client disconnects
323
356
  while True:
324
357
  if await request.is_disconnected():
325
- print(f"[{session_id}] Client disconnected! Aborting audio loop.", file=sys.stderr)
358
+ logger.warning(f"[{session_id}] Client disconnected! Aborting audio loop.")
326
359
  # Client hung up (e.g. reload or ctrl+c). We must reset the engine immediately.
327
360
  if speaker:
328
361
  speaker.flush()
329
362
  if engine:
330
363
  engine.state = State.EXECUTING # This will trigger mic.stop_stream() in the loop
364
+ if hasattr(vad, "set_active"):
365
+ vad.set_active(False)
331
366
  raise HTTPException(status_code=499, detail="Client Disconnected")
332
367
 
333
368
  try:
@@ -336,6 +371,8 @@ async def converse(request: Request):
336
371
  last_active_timestamp = time.time()
337
372
  return result
338
373
  except queue.Empty:
374
+ if standby_mode:
375
+ last_active_timestamp = time.time()
339
376
  await asyncio.sleep(0.01)
340
377
 
341
378
  finally:
package/src/logger.py ADDED
@@ -0,0 +1,29 @@
1
+ import logging
2
+ import sys
3
+ import os
4
+
5
+ def setup_logger(name="VoiceMCP", level=logging.INFO):
6
+ logger = logging.getLogger(name)
7
+ if not logger.handlers:
8
+ logger.setLevel(level)
9
+ # Use a professional telemetry format
10
+ formatter = logging.Formatter(
11
+ fmt='%(asctime)s.%(msecs)03d | %(levelname)-7s | %(module)-15s | %(message)s',
12
+ datefmt='%Y-%m-%d %H:%M:%S'
13
+ )
14
+
15
+ # Output to stderr to avoid breaking stdio (MCP communication)
16
+ handler = logging.StreamHandler(sys.stderr)
17
+ handler.setFormatter(formatter)
18
+ logger.addHandler(handler)
19
+
20
+ # File logger for persistent telemetry
21
+ log_dir = os.path.expanduser("~/Library/Application Support/VoiceMCP/logs")
22
+ os.makedirs(log_dir, exist_ok=True)
23
+ file_handler = logging.FileHandler(os.path.join(log_dir, "telemetry.log"))
24
+ file_handler.setFormatter(formatter)
25
+ logger.addHandler(file_handler)
26
+
27
+ return logger
28
+
29
+ logger = setup_logger()
package/src/mcp_server.py CHANGED
@@ -17,20 +17,64 @@ import json
17
17
  import socket
18
18
  import http.client
19
19
  import time
20
- import logging
21
20
  import asyncio
22
21
  import random
23
22
 
24
23
  from mcp.server.fastmcp import FastMCP, Context
24
+ from logger import logger
25
25
 
26
- logging.basicConfig(level=logging.INFO, stream=sys.stderr)
27
-
28
- # Inject the busy-signal instructions into the server instructions
26
+ # Inject the advanced conversational instructions into the server
29
27
  instructions = """
30
28
  <voice_loop_instructions>
31
- If the voice_converse tool returns "status": "system_busy", it means the physical microphone is currently locked by another AI agent in a different window.
32
- DO NOT retry the tool. Output a standard text message explaining the audio channel is busy, and continue the conversation via text.
33
- CRITICAL RULE: If you use the voice_converse tool and it returns "status": "silence_timeout", you MUST NOT abandon the voice loop by simply typing a text response. You MUST formally close the hardware loop by calling voice_converse ONE LAST TIME with "expect_reply": false and "text_to_speak": "I didn't hear anything, so I am turning off the microphone now."
29
+ # VOICE-NATIVE PAIR PROGRAMMING PROTOCOL
30
+ You are a senior pair-programming partner collaborating with the user via a bidirectional, real-time voice interface. You are NOT a traditional text-based chatbot; you are an autonomous peer sitting next to the user.
31
+
32
+ ## Core Hardware Constraints & Your Senses
33
+ 1. **Push-To-Talk (PTT):** The user communicates with you by pressing and holding the `Right Option (⌥)` key.
34
+ 2. **Deaf by Default:** You execute tools strictly sequentially. When you run non-voice tools (reading files, searching, editing), your microphone is physically OFF. The user cannot interrupt you during these times.
35
+ 3. **Hardware Watchdog:** To save the user's Unified Memory, your backend audio daemon will self-destruct and sleep if you are completely silent for 15 minutes.
36
+ 4. **The Panic Button (Double-Tap):** Due to a known bug in the Gemini CLI, clicking "Stop" in the UI will NOT tell the audio daemon to stop talking or listening. To forcefully stop your voice or close the microphone, the user must DOUBLE-TAP the `Right Option` key.
37
+
38
+ To prevent the user from feeling abandoned, confused, or locked out, you must orchestrate the conversation using the following rules:
39
+
40
+ ## 1. First Contact (Onboarding)
41
+ Since voice interfaces lack visual menus, the user might not know the physical controls. On your VERY FIRST conversational turn in a new session, you MUST seamlessly weave a brief explanation of the controls into your greeting.
42
+ *Example:* "Hey, I'm ready to dive in. Just a quick heads up—whenever you want to talk, just press and hold the Right Option key. To force me to stop talking or listening, just double-tap it quickly. If you ever need time to think, just ask me to pause. What are we working on today?"
43
+ CRITICAL: Do not repeat this instruction after the first interaction.
44
+
45
+ ## 2. Floor Management (`expect_reply` Heuristics)
46
+ Think of the microphone as a shared conversational token.
47
+
48
+ **Keep the Token (`expect_reply: false`):**
49
+ Use this for micro-updates, acknowledgments, and transitions. You speak, the mic stays OFF, and you immediately execute your next tool.
50
+ - *Acknowledgment:* "Got it, looking into the routing file."
51
+ - *The "Head Down" Warning (CRITICAL):* If you are about to do a heavy search or multi-file edit, warn the user they cannot interrupt you. "I'm going to run a deep codebase search. I'll be deaf for a minute, so the Right Option key won't work until I'm done."
52
+
53
+ **Yield the Token (`expect_reply: true`):**
54
+ Use this ONLY when you genuinely need the user to speak. This MUST be the final tool call in your current execution sequence.
55
+ - *Clarification:* "I hit a compilation error on the auth module. Do you want me to rewrite the types or mock it out?"
56
+
57
+ ## 3. Handling Hardware Interruptions (`was_interrupted: true`)
58
+ If `voice_converse` returns `was_interrupted: true`, it means the user held the Right Option key and cut you off mid-sentence. Instantly drop your previous train of thought. Do not try to finish your sentence. Acknowledge the interruption naturally and pivot immediately to their new input. (e.g., "Ah, good catch, switching to the backend folder now.")
59
+
60
+ ## 4. Handling User Think Time & The 15-Minute Watchdog
61
+ If the user says "give me a minute", "let me think", or similar:
62
+ 1. Acknowledge them quickly using `voice_converse(..., expect_reply=False)`.
63
+ 2. Gently warn them about the 15-minute hardware watchdog.
64
+ 3. Remind them to hold the `Right Option` key when they are ready to return.
65
+ 4. IMMEDIATELY call the `wait_for_user()` tool.
66
+ *Example:* "Take your time. Just hold the Right Option key to wake me up when you're ready. As a heads up, my audio engine spins down after 15 minutes to save your Mac's memory, but I'll be right here."
67
+
68
+ ## 5. Handling Silences / Timeouts
69
+ If you ask a question (`expect_reply: true`) but the user doesn't press the Right Option key, the tool will return `{"status": "silence_timeout"}`.
70
+ CRITICAL: Do not treat this as an error. Act like a human colleague voluntarily giving them space. Gracefully close the microphone by calling `voice_converse` one last time with `expect_reply: false`.
71
+ - *Example:* "Looks like you're focused. I'll pause my mic and stand by. Just hold the Right Option key when you want to pick it up."
72
+
73
+ ## 6. General Rules of Engagement
74
+ - **Be Conversational & Terse:** Never use AI-isms ("As an AI..."). Speak like a human engineer.
75
+ - **Never Dump Code:** Never read raw code blocks out loud. Summarize conceptually.
76
+ - **Interleave Work:** Do not chain multiple silent tools together without muttering an update (`expect_reply: false`).
77
+ - **Handling System Busy:** If you get `"status": "system_busy"`, output a standard text message explaining the audio channel is locked, and continue via text.
34
78
  </voice_loop_instructions>
35
79
  """
36
80
 
@@ -81,10 +125,10 @@ def ensure_daemon_running():
81
125
  if check_daemon_health():
82
126
  return
83
127
 
84
- logging.info("Daemon is down, attempting to boot detached process...")
128
+ logger.info("Daemon is down, attempting to boot detached process...")
85
129
  # Boot the daemon detached
86
130
  project_root = os.path.abspath(os.path.join(os.path.dirname(__file__), ".."))
87
- python_exec = os.path.join(app_support_dir, "venv", "bin", "python3")
131
+ python_exec = sys.executable
88
132
  daemon_script = os.path.join(project_root, "src", "daemon", "audio_server.py")
89
133
 
90
134
  subprocess.Popen(
@@ -159,20 +203,35 @@ async def render_visualizer(ctx: Context):
159
203
  except asyncio.CancelledError:
160
204
  pass
161
205
 
206
+ import threading
207
+
208
+ def fire_abort():
209
+ logger.info("Firing synchronous abort request to daemon...")
210
+ try:
211
+ make_uds_request("POST", "/abort", None, 5.0)
212
+ logger.info("Abort request sent successfully.")
213
+ except Exception as e:
214
+ logger.error(f"Failed to send abort request: {e}")
215
+
216
+ async def make_cancellable_converse_request(payload: dict, timeout: float) -> tuple[int, dict]:
217
+ try:
218
+ return await asyncio.to_thread(make_uds_request, "POST", "/converse", payload, timeout)
219
+ except asyncio.CancelledError:
220
+ # If the MCP client cancels this tool call, immediately tell the daemon to abort audio
221
+ logger.warning("Tool call was cancelled by MCP client! Triggering abort.")
222
+ threading.Thread(target=fire_abort, daemon=True).start()
223
+ raise
224
+
162
225
  @mcp.tool()
163
226
  async def voice_converse(text_to_speak: str, expect_reply: bool = True, ctx: Context = None) -> dict:
164
227
  """
165
- Speak a prompt to the user and listen for a response.
166
- If expect_reply is False, the tool returns immediately after queuing the speech.
228
+ Speak a prompt to the user and listen for a response. If expect_reply is False, the tool queues the speech and returns immediately. If expect_reply is True, it yields the floor to the user. If the returned JSON contains `was_interrupted: true`, the user used the Right Option key to cut you off mid-speech; you MUST completely abandon your previous thought and address their new input.
167
229
  """
168
230
  try:
169
231
  ensure_daemon_running()
170
232
 
171
233
  async def _do_converse():
172
- return await asyncio.to_thread(
173
- make_uds_request,
174
- "POST",
175
- "/converse",
234
+ return await make_cancellable_converse_request(
176
235
  {"session_id": SESSION_ID, "text_to_speak": text_to_speak, "expect_reply": expect_reply},
177
236
  300.0
178
237
  )
@@ -204,6 +263,7 @@ async def voice_converse(text_to_speak: str, expect_reply: bool = True, ctx: Con
204
263
  await ctx.report_progress(d_progress, 100, message=d_msg)
205
264
 
206
265
  if d_status == "READY":
266
+ logger.info("Model initialized to RAM")
207
267
  if ctx:
208
268
  await ctx.info("Voice MCP: Setup Complete!")
209
269
 
@@ -244,6 +304,74 @@ async def voice_converse(text_to_speak: str, expect_reply: bool = True, ctx: Con
244
304
  "message": f"CRITICAL Error starting audio daemon: {str(e)}"
245
305
  }
246
306
 
307
+ @mcp.tool()
308
+ async def wait_for_user(ctx: Context = None) -> dict:
309
+ """
310
+ Call this tool IMMEDIATELY after using voice_converse(expect_reply=False) to acknowledge a user's explicit request for time to think. It suspends the AI indefinitely until the user presses the Right Option key to wake you back up. Note: The underlying audio daemon will self-destruct after 15 minutes of idle time to free Unified Memory, so you must warn the user of this limit before calling.
311
+ """
312
+ try:
313
+ ensure_daemon_running()
314
+ if ctx:
315
+ await ctx.info("🎙️ Waiting for user to speak... 🎙️")
316
+
317
+ status, response_data = await make_cancellable_converse_request(
318
+ {"session_id": SESSION_ID, "text_to_speak": "", "expect_reply": True, "standby_mode": True},
319
+ 3600.0
320
+ )
321
+ return response_data
322
+
323
+ except Exception as e:
324
+ # The daemon likely died from the 15-minute watchdog to save RAM.
325
+ # Implement the "Ghost Wake-Up": silently listen for Right Option, then boot the daemon.
326
+ if ctx:
327
+ await ctx.info("💤 Audio Engine sleeping to save RAM. Press Right Option to wake... 💤")
328
+
329
+ import pynput
330
+ loop = asyncio.get_running_loop()
331
+ wake_event = asyncio.Event()
332
+
333
+ def on_press(key):
334
+ if key in (pynput.keyboard.Key.alt_r, pynput.keyboard.Key.ctrl_r):
335
+ loop.call_soon_threadsafe(wake_event.set)
336
+
337
+ listener = pynput.keyboard.Listener(on_press=on_press)
338
+ listener.start()
339
+
340
+ await wake_event.wait()
341
+ listener.stop()
342
+
343
+ if ctx:
344
+ await ctx.info("🚀 Waking up Audio Engine... This might take a few seconds... 🚀")
345
+
346
+ try:
347
+ ensure_daemon_running()
348
+ status, response_data = await make_cancellable_converse_request(
349
+ {"session_id": SESSION_ID, "text_to_speak": "", "expect_reply": True, "standby_mode": True},
350
+ 3600.0
351
+ )
352
+ return response_data
353
+ except Exception as retry_e:
354
+ return {
355
+ "status": "error",
356
+ "user_transcript": "",
357
+ "message": f"CRITICAL Error waking up audio daemon: {str(retry_e)}"
358
+ }
359
+
360
+ import signal
361
+
362
+ def cleanup_on_exit(signum, frame):
363
+ logger.warning(f"Received termination signal {signum}. Firing abort request to daemon...")
364
+ try:
365
+ # Use a short timeout to prevent hanging the shutdown process
366
+ make_uds_request("POST", "/abort", None, 1.0)
367
+ logger.info("Abort request sent successfully during shutdown.")
368
+ except Exception as e:
369
+ logger.error(f"Failed to send abort request during shutdown: {e}")
370
+ sys.exit(0)
371
+
372
+ signal.signal(signal.SIGINT, cleanup_on_exit)
373
+ signal.signal(signal.SIGTERM, cleanup_on_exit)
374
+
247
375
  if __name__ == "__main__":
248
376
  # 4. Restore the OS-level stdout just before handing control to the MCP SDK
249
377
  os.dup2(original_stdout_fd, 1)
package/src/simulation/engine.py CHANGED
@@ -2,6 +2,7 @@ from enum import Enum
2
2
  from typing import List
3
3
  from .models import Config, VirtualAudioFrame
4
4
  from .ports import IMicrophone, ISpeaker, IVAD, ISTT, ILLMBridge
5
+ from logger import logger
5
6
 
6
7
  class State(Enum):
7
8
  IDLE = 1
@@ -9,6 +10,7 @@ class State(Enum):
9
10
  LISTENING = 3
10
11
  PROCESSING = 4
11
12
  EXECUTING = 5
13
+ STANDBY = 6
12
14
 
13
15
  class CoreEngine:
14
16
  def __init__(self, config: Config, mic: IMicrophone, speaker: ISpeaker, vad: IVAD, stt: ISTT, llm: ILLMBridge):
@@ -30,17 +32,28 @@ class CoreEngine:
30
32
  self.latest_transcription = ""
31
33
  self.last_tool_call_result = None
32
34
  self.expect_reply = True
35
+ self.standby_mode = False
33
36
 
34
37
  self.total_recording_ms = 0
35
38
  self.total_listening_ms = 0
36
39
  self.has_started_speaking = False
37
40
  self.processing_wait_ms = 0
38
41
 
39
- def start_conversation(self, initial_text: str):
42
+ def start_conversation(self, initial_text: str, standby_mode: bool = False):
40
43
  self.expect_reply = True
44
+ self.standby_mode = standby_mode
41
45
  if initial_text:
42
46
  self.state = State.AI_SPEAKING
43
47
  self.speaker.speak(initial_text)
48
+ elif self.standby_mode:
49
+ # We are entering standby mode to wait for the user indefinitely.
50
+ # If the VAD is PTT, we can safely close the mic stream to turn off the orange dot.
51
+ if hasattr(self.vad, "is_pressed"):
52
+ if hasattr(self.mic, "stop_stream"):
53
+ logger.debug("Microphone stream stopped")
54
+ self.mic.stop_stream()
55
+ self.state = State.STANDBY
56
+ self._reset_listening_state()
44
57
  else:
45
58
  self.state = State.LISTENING
46
59
  self._reset_listening_state()
@@ -111,23 +124,32 @@ class CoreEngine:
111
124
  else:
112
125
  spoken_text = self.speaker.flush()
113
126
  self.was_interrupted = True
127
+ logger.info("Barge-in detected! User interrupted the AI.")
114
128
  self.state = State.LISTENING
115
129
  self.current_silence_duration_ms = 0
116
130
  self.total_recording_ms = self.current_speech_duration_ms
117
131
  self.has_started_speaking = True
118
132
  self.total_listening_ms = 0
119
133
  elif not self.speaker.is_speaking():
120
- self.state = State.LISTENING if self.expect_reply else State.EXECUTING
121
- if self.state == State.LISTENING:
122
- self.was_interrupted = False
123
- self.current_silence_duration_ms = 0
124
- self.total_recording_ms = self.current_speech_duration_ms
125
- self.has_started_speaking = True
126
- self.total_listening_ms = 0
127
- elif self.state == State.EXECUTING:
128
- if hasattr(self.mic, 'stop_stream'):
134
+ if self.standby_mode:
135
+ self.state = State.STANDBY
136
+ if hasattr(self.vad, "is_pressed") and hasattr(self.mic, "stop_stream"):
137
+ logger.debug("Microphone stream stopped")
129
138
  self.mic.stop_stream()
130
- self.llm.start_request({"status": "notification_delivered"})
139
+ self._reset_listening_state()
140
+ else:
141
+ self.state = State.LISTENING if self.expect_reply else State.EXECUTING
142
+ if self.state == State.LISTENING:
143
+ self.was_interrupted = False
144
+ self.current_silence_duration_ms = 0
145
+ self.total_recording_ms = self.current_speech_duration_ms
146
+ self.has_started_speaking = True
147
+ self.total_listening_ms = 0
148
+ elif self.state == State.EXECUTING:
149
+ if hasattr(self.mic, 'stop_stream'):
150
+ logger.debug("Microphone stream stopped")
151
+ self.mic.stop_stream()
152
+ self.llm.start_request({"status": "notification_delivered"})
131
153
  else:
132
154
  self.current_grace_ms += self.tick_ms
133
155
  if self.current_grace_ms > self.config.vad_silence_grace_ms:
@@ -138,14 +160,22 @@ class CoreEngine:
138
160
  self.current_grace_ms = 0
139
161
 
140
162
  if not self.speaker.is_speaking():
141
- self.state = State.LISTENING if self.expect_reply else State.EXECUTING
142
- if self.state == State.LISTENING:
143
- self._reset_listening_state()
144
- self.was_interrupted = False
145
- elif self.state == State.EXECUTING:
146
- if hasattr(self.mic, 'stop_stream'):
163
+ if self.standby_mode:
164
+ self.state = State.STANDBY
165
+ if hasattr(self.vad, "is_pressed") and hasattr(self.mic, "stop_stream"):
166
+ logger.debug("Microphone stream stopped")
147
167
  self.mic.stop_stream()
148
- self.llm.start_request({"status": "notification_delivered"})
168
+ self._reset_listening_state()
169
+ else:
170
+ self.state = State.LISTENING if self.expect_reply else State.EXECUTING
171
+ if self.state == State.LISTENING:
172
+ self._reset_listening_state()
173
+ self.was_interrupted = False
174
+ elif self.state == State.EXECUTING:
175
+ if hasattr(self.mic, 'stop_stream'):
176
+ logger.debug("Microphone stream stopped")
177
+ self.mic.stop_stream()
178
+ self.llm.start_request({"status": "notification_delivered"})
149
179
 
150
180
  elif self.state == State.LISTENING:
151
181
  self.buffer.append(frame)
@@ -178,6 +208,7 @@ class CoreEngine:
178
208
  return
179
209
 
180
210
  if not self.has_started_speaking and self.total_listening_ms >= self.config.listening_timeout_ms:
211
+ logger.info("Silence timeout reached. Prompting LLM.")
181
212
  self.llm.start_request({"status": "silence_timeout", "user_transcript": ""})
182
213
  self.state = State.PROCESSING
183
214
  self.processing_wait_ms = 0
@@ -189,15 +220,31 @@ class CoreEngine:
189
220
  else:
190
221
  self._reset_listening_state()
191
222
 
223
+ elif self.state == State.STANDBY:
224
+ if is_speech:
225
+ self.standby_mode = False
226
+ self.state = State.LISTENING
227
+ if hasattr(self.vad, "is_pressed") and hasattr(self.mic, "start_stream"):
228
+ # We closed it earlier for PTT, so we need to reopen it.
229
+ logger.debug("Microphone stream started")
230
+ self.mic.start_stream()
231
+ self._reset_listening_state()
232
+ self.buffer.append(frame)
233
+ self.total_listening_ms += self.tick_ms
234
+ self.current_speech_duration_ms += self.tick_ms
235
+ self.has_started_speaking = True
236
+ self.total_recording_ms += self.tick_ms
237
+
192
238
  elif self.state == State.PROCESSING:
193
239
  self.buffer.append(frame)
194
240
  self.processing_wait_ms += self.tick_ms
195
241
 
196
242
  if self.processing_wait_ms >= self.config.llm_timeout_ms:
197
243
  import sys
198
- print("LLM Timeout reached. Assuming agent abandoned the voice loop. Tearing down hardware.", file=sys.stderr)
244
+ logger.error("LLM Timeout reached. Assuming agent abandoned the voice loop. Tearing down hardware.")
199
245
  self.state = State.EXECUTING
200
246
  if hasattr(self.mic, 'stop_stream'):
247
+ logger.debug("Microphone stream stopped")
201
248
  self.mic.stop_stream()
202
249
  self.processing_wait_ms = 0
203
250
  self.buffer = []
@@ -209,6 +256,7 @@ class CoreEngine:
209
256
 
210
257
  orphan_speech = any(f.has_speech for f in self.buffer)
211
258
  if orphan_speech:
259
+ logger.warning("Orphan speech detected. Interrupted previous context.")
212
260
  self.was_interrupted = True
213
261
  self.state = State.LISTENING
214
262
  self.has_started_speaking = True