stackchan-mcp 0.9.1__py3-none-win_amd64.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (43)
  1. stackchan_mcp/__init__.py +81 -0
  2. stackchan_mcp/__main__.py +12 -0
  3. stackchan_mcp/_libs/SOURCES.md +130 -0
  4. stackchan_mcp/_libs/opus.dll +0 -0
  5. stackchan_mcp/audio_input_hook.py +432 -0
  6. stackchan_mcp/audio_stream.py +162 -0
  7. stackchan_mcp/capture_server.py +469 -0
  8. stackchan_mcp/cli.py +958 -0
  9. stackchan_mcp/esp32_client.py +983 -0
  10. stackchan_mcp/event_log.py +189 -0
  11. stackchan_mcp/gateway.py +274 -0
  12. stackchan_mcp/handlers/__init__.py +7 -0
  13. stackchan_mcp/handlers/audio.py +21 -0
  14. stackchan_mcp/handlers/camera.py +25 -0
  15. stackchan_mcp/handlers/robot.py +52 -0
  16. stackchan_mcp/http_server.py +398 -0
  17. stackchan_mcp/mcp_router.py +126 -0
  18. stackchan_mcp/mdns_advertiser.py +347 -0
  19. stackchan_mcp/notify.example.yml +21 -0
  20. stackchan_mcp/notify_config.py +235 -0
  21. stackchan_mcp/ownership.py +270 -0
  22. stackchan_mcp/protocol.py +95 -0
  23. stackchan_mcp/queue.py +191 -0
  24. stackchan_mcp/server.py +28 -0
  25. stackchan_mcp/stdio_server.py +1365 -0
  26. stackchan_mcp/stt/__init__.py +62 -0
  27. stackchan_mcp/stt/audio_utils.py +102 -0
  28. stackchan_mcp/stt/base.py +94 -0
  29. stackchan_mcp/stt/faster_whisper.py +217 -0
  30. stackchan_mcp/stt/openai_whisper.py +177 -0
  31. stackchan_mcp/stt/orchestrator.py +568 -0
  32. stackchan_mcp/tools.py +82 -0
  33. stackchan_mcp/tts/__init__.py +62 -0
  34. stackchan_mcp/tts/audio_utils.py +177 -0
  35. stackchan_mcp/tts/base.py +86 -0
  36. stackchan_mcp/tts/orchestrator.py +688 -0
  37. stackchan_mcp/tts/voicevox.py +184 -0
  38. stackchan_mcp-0.9.1.dist-info/METADATA +324 -0
  39. stackchan_mcp-0.9.1.dist-info/RECORD +43 -0
  40. stackchan_mcp-0.9.1.dist-info/WHEEL +5 -0
  41. stackchan_mcp-0.9.1.dist-info/entry_points.txt +2 -0
  42. stackchan_mcp-0.9.1.dist-info/licenses/LICENSE +39 -0
  43. stackchan_mcp-0.9.1.dist-info/licenses/LICENSE-THIRD-PARTY +65 -0
@@ -0,0 +1,983 @@
1
+ """ESP32 connection manager.
2
+
3
+ Acts as a WebSocket server that ESP32 connects TO,
4
+ and as an MCP client that sends commands TO the ESP32.
5
+ """
6
+
7
+ from __future__ import annotations
8
+
9
+ import asyncio
10
+ from collections.abc import Sequence
11
+ import json
12
+ import logging
13
+ import os
14
+ import time
15
+ import uuid
16
+ from typing import Any
17
+
18
+ import websockets
19
+ import websockets.exceptions
20
+ from websockets.asyncio.server import ServerConnection
21
+
22
+ from .audio_input_hook import push_audio_capture
23
+ from .audio_stream import (
24
+ handle_audio_frame,
25
+ is_recording,
26
+ is_recording_session,
27
+ start_recording,
28
+ stop_recording,
29
+ )
30
+ from .notify_config import (
31
+ DEFAULT_MESSAGE_TEMPLATES,
32
+ NotifyConfig,
33
+ load_notify_config,
34
+ render_template,
35
+ )
36
+ from .protocol import HelloResponse, make_mcp_message, parse_jsonrpc_response
37
+
38
+ logger = logging.getLogger(__name__)
39
+
40
+ # Timeout for waiting for ESP32 responses
41
+ RESPONSE_TIMEOUT = 10.0
42
+
43
+ ToolCall = tuple[str, dict[str, Any]]
44
+ ToolCallResult = tuple[Any, dict[str, Any] | None]
45
+
46
# Tool-name prefix -> hardware lane. Lookup is first-match in insertion
# order, so keep more specific prefixes ahead of broader ones if any
# ever overlap.
_TOOL_LANES = {
    "self.robot.": "servo",
    "self.led.": "led",
    "self.display.": "avatar",
    "self.screen.": "display",
    "self.audio_speaker.": "audio",
    "self.camera.": "camera",
    "self.touch.": "touch",
    "self.get_device_status": "status",
}


def _hardware_lane(tool_name: str) -> str:
    """Map a tool name to the hardware lane used for dispatch ordering.

    Returns the lane of the first ``_TOOL_LANES`` prefix that
    ``tool_name`` starts with, or ``"default"`` when none matches.
    """
    matching_lanes = (
        lane
        for prefix, lane in _TOOL_LANES.items()
        if tool_name.startswith(prefix)
    )
    return next(matching_lanes, "default")
64
+
65
+
66
def _retrieve_future_exception(future: asyncio.Future[Any]) -> None:
    """Read a finished Future's exception so it counts as observed.

    Pending and cancelled futures are left untouched; for any other
    future ``exception()`` is called and its value discarded.
    """
    if not future.done() or future.cancelled():
        return
    future.exception()
70
+
71
+
72
+ class ESP32Connection:
73
+ """Manages a single ESP32 device connection."""
74
+
75
def __init__(self, ws: ServerConnection, session_id: str):
    """Bind the connection to its WebSocket and set up per-session state."""
    # Transport and identity.
    self._ws = ws
    self.session_id = session_id
    self.device_id: str = "unknown"

    # Lifecycle flags: connected as soon as the object exists,
    # initialized only after the MCP ``initialize`` exchange succeeds.
    self._connected = True
    self._initialized = False

    # MCP request bookkeeping: monotonically increasing request id and
    # the futures awaiting each in-flight request's response.
    self._request_id = 0
    self._pending: dict[int, asyncio.Future[dict[str, Any]]] = {}
    self.tools: list[dict[str, Any]] = []

    # Phase 4.5 avatar: load_avatar_set calls waiting for the device's
    # `avatar_set_loaded` reply, keyed by expected checksum so that
    # overlapping fetches of different sets can be told apart.
    self._avatar_set_waiters: dict[str, asyncio.Future[dict[str, Any]]] = {}

    # WebSocket protocol version declared in the device's hello message.
    # 1 matches the firmware default
    # (firmware/main/protocols/websocket_protocol.h: ``version_ = 1``)
    # and the raw-Opus audio framing this gateway emits today. v2/v3 add
    # a BinaryProtocol header that this gateway does not yet wrap — see
    # Issue follow-up to #70.
    self.protocol_version: int = 1
95
+
96
@property
def connected(self) -> bool:
    """Current value of the ``_connected`` flag (cleared by ``disconnect()``)."""
    return self._connected
99
+
100
@property
def initialized(self) -> bool:
    """Current value of the ``_initialized`` flag.

    Set by a successful ``initialize()`` and cleared by ``disconnect()``.
    """
    return self._initialized
103
+
104
def _next_id(self) -> int:
    """Allocate the next request id (previous id plus one) and return it."""
    allocated = self._request_id + 1
    self._request_id = allocated
    return allocated
107
+
108
async def send_mcp_request(
    self, method: str, params: dict[str, Any]
) -> tuple[Any, dict[str, Any] | None]:
    """Send an MCP request to the ESP32 and wait for its response.

    Args:
        method: MCP method name (e.g. ``"tools/call"``).
        params: Parameters object for the request.

    Returns:
        ``(result, error)``. Failures that happen on the gateway side
        (not connected, timeout, transport error) are reported as a
        synthesized error dict with code ``-32000`` rather than raised.

    Raises:
        asyncio.CancelledError: if the calling task is cancelled while
            waiting; the pending entry is removed before re-raising.
    """
    if not self._connected:
        return None, {"code": -32000, "message": "ESP32 not connected"}

    req_id = self._next_id()
    message = make_mcp_message(self.session_id, method, params, req_id)

    # We are inside a coroutine, so a loop is guaranteed to be running;
    # get_running_loop() is the documented way to reach it from here.
    future: asyncio.Future[dict[str, Any]] = asyncio.get_running_loop().create_future()
    self._pending[req_id] = future

    try:
        await self._ws_send(json.dumps(message))
        response = await asyncio.wait_for(future, timeout=RESPONSE_TIMEOUT)
        return parse_jsonrpc_response(response)
    except asyncio.TimeoutError:
        return None, {"code": -32000, "message": f"Timeout waiting for ESP32 response (method={method})"}
    except Exception as exc:
        # disconnect() may have failed this future with ConnectionError;
        # read it so asyncio does not report it as never retrieved.
        _retrieve_future_exception(future)
        return None, {"code": -32000, "message": f"ESP32 communication error: {exc}"}
    finally:
        # Runs on every exit path, including cancellation. On success
        # handle_response() already removed the entry, so this is a no-op.
        self._pending.pop(req_id, None)
138
+
139
async def initialize(self, vision_url: str = "", vision_token: str = "") -> bool:
    """Send MCP ``initialize`` to the ESP32.

    Args:
        vision_url: When non-empty, advertised to the device under
            ``capabilities.vision.url``.
        vision_token: When non-empty (and ``vision_url`` is set), sent
            as ``capabilities.vision.token``.

    Returns:
        ``True`` when the device answered without an error (and the
        connection is now marked initialized), ``False`` otherwise.
    """
    capabilities: dict[str, Any] = {}
    if vision_url:
        vision: dict[str, Any] = {"url": vision_url}
        if vision_token:
            vision["token"] = vision_token
        capabilities["vision"] = vision
    result, error = await self.send_mcp_request("initialize", {"capabilities": capabilities})
    if error:
        logger.error("ESP32 initialize failed: %s", error)
        return False

    # A reply without an error may still carry a null / non-object
    # result; only read the informational fields from an actual object
    # so logging cannot raise AttributeError.
    info: dict[str, Any] = result if isinstance(result, dict) else {}
    logger.info(
        "ESP32 initialized: protocol=%s server=%s",
        info.get("protocolVersion", "?"),
        info.get("serverInfo", {}),
    )
    self._initialized = True
    return True
159
+
160
async def discover_tools(self) -> list[dict[str, Any]]:
    """Discover the tools available on the ESP32 via paginated ``tools/list``.

    Pages are requested until the device stops returning ``nextCursor``.
    Pagination stops early (keeping whatever was collected so far) when
    a page fails, when a page's result is not a JSON object, or when the
    device hands back a cursor it already returned — the last case would
    otherwise loop forever.

    Returns:
        The collected tool descriptors, which are also stored on
        ``self.tools``.
    """
    all_tools: list[dict[str, Any]] = []
    cursor = ""
    seen_cursors: list[Any] = []

    while True:
        params: dict[str, Any] = {"cursor": cursor}
        result, error = await self.send_mcp_request("tools/list", params)

        if error:
            logger.error("tools/list failed: %s", error)
            break
        if not isinstance(result, dict):
            logger.error("tools/list failed: unexpected result %r", result)
            break

        # ``or []`` also covers an explicit ``"tools": null``.
        all_tools.extend(result.get("tools") or [])

        next_cursor = result.get("nextCursor", "")
        if not next_cursor:
            break
        if next_cursor in seen_cursors:
            logger.error("tools/list failed: repeated cursor %r", next_cursor)
            break
        seen_cursors.append(next_cursor)
        cursor = next_cursor

    self.tools = all_tools
    logger.info("Discovered %d tools on ESP32", len(all_tools))
    return all_tools
184
+
185
async def call_tool(
    self, name: str, arguments: dict[str, Any]
) -> tuple[Any, dict[str, Any] | None]:
    """Invoke a device tool through an MCP ``tools/call`` request.

    Returns the ``(result, error)`` pair produced by
    ``send_mcp_request``.
    """
    request_params = {"name": name, "arguments": arguments}
    outcome = await self.send_mcp_request("tools/call", request_params)
    return outcome
192
+
193
async def send_avatar_set_fetch(
    self,
    url: str,
    token: str,
    mode: str,
    checksum: str,
    expected_size: int,
    timeout: float = 60.0,
) -> dict[str, Any]:
    """Send ``avatar_set_fetch`` and wait for the device's ``avatar_set_loaded``.

    Args:
        url: Location the device should fetch the avatar set from.
        token: Token forwarded to the device in the message.
        mode: Mode string forwarded to the device in the message.
        checksum: Expected checksum; also the key used to match the reply.
        expected_size: Expected size forwarded to the device.
        timeout: Seconds to wait for the device's reply.

    Returns:
        The device's reply dict ({ok, checksum, error}), or a synthesized
        ``{"ok": False, "checksum": ..., "error": ...}`` dict where error
        is ``not_connected``, ``device_timeout``, ``superseded`` (a newer
        call for the same checksum replaced this one) or
        ``send_failed: ...``.

    Raises:
        asyncio.CancelledError: if the calling task itself is cancelled.
            Cancellation is propagated instead of being reported as
            ``superseded``; the waiter is still cleaned up.
    """
    if not self._connected:
        return {"ok": False, "checksum": checksum, "error": "not_connected"}

    future: asyncio.Future[dict[str, Any]] = asyncio.get_running_loop().create_future()
    # Last-writer-wins on duplicate checksum. The previous waiter is
    # completed with an explicit "superseded" result rather than being
    # cancelled, so that a CancelledError seen below can only mean the
    # calling task was really cancelled.
    previous = self._avatar_set_waiters.pop(checksum, None)
    if previous is not None and not previous.done():
        previous.set_result(
            {"ok": False, "checksum": checksum, "error": "superseded"}
        )
    self._avatar_set_waiters[checksum] = future

    msg = {
        "type": "avatar_set_fetch",
        "url": url,
        "token": token,
        "mode": mode,
        "checksum": checksum,
        "expected_size": expected_size,
    }
    try:
        await self._ws.send(json.dumps(msg))
        return await asyncio.wait_for(future, timeout=timeout)
    except asyncio.TimeoutError:
        return {"ok": False, "checksum": checksum, "error": "device_timeout"}
    except Exception as exc:
        return {"ok": False, "checksum": checksum, "error": f"send_failed: {exc}"}
    finally:
        # Remove only our own registration: a newer call for the same
        # checksum may already have replaced it and must stay registered.
        if self._avatar_set_waiters.get(checksum) is future:
            del self._avatar_set_waiters[checksum]
238
+
239
def handle_avatar_set_loaded(self, payload: dict[str, Any]) -> None:
    """Complete the waiter registered for the reply's checksum.

    The waiter is removed from ``_avatar_set_waiters`` and resolved with
    ``payload``. A reply with no matching (or already finished) waiter
    is logged and dropped.
    """
    checksum = payload.get("checksum", "")
    waiter = self._avatar_set_waiters.pop(checksum, None)
    if waiter is None or waiter.done():
        logger.warning(
            "avatar_set_loaded for unknown checksum=%s (no pending waiter)",
            checksum,
        )
        return
    waiter.set_result(payload)
250
+
251
def handle_response(self, payload: dict[str, Any]) -> None:
    """Route an incoming MCP message from the ESP32.

    A payload whose ``id`` matches an in-flight request resolves that
    request's future. Anything else is only logged: messages without an
    id (or carrying a ``method``) as notifications, and responses whose
    id is unknown — typically a reply that arrived after its request
    timed out — as discarded responses.
    """
    req_id = payload.get("id")
    future = None
    if req_id is not None:
        try:
            future = self._pending.pop(req_id, None)
        except TypeError:
            # Unhashable id (JSON array/object) from a misbehaving
            # device: cannot match any request, treat as unknown.
            future = None

    if future is not None:
        if not future.done():
            future.set_result(payload)
        return

    if req_id is None or "method" in payload:
        # Notification (no id) — log and discard for now
        logger.info("ESP32 notification: %s", payload.get("method", ""))
    else:
        logger.warning(
            "ESP32 response for unknown or expired request id=%r (discarded)",
            req_id,
        )
262
+
263
async def _ws_send(self, payload: bytes | str) -> None:
    """Send ``payload`` on the socket, re-raising failures as ConnectionError.

    ``websockets`` has its own exception hierarchy (``ConnectionClosed``
    and friends) that does *not* derive from the built-in
    :class:`ConnectionError`. Callers filter on ``except
    ConnectionError`` (the orchestrator) and ``except RuntimeError``
    (the MCP handler); without this translation a mid-stream disconnect
    would surface as a raw traceback in the MCP transport and break the
    say() tool's clean error JSON contract.

    Raises:
        ConnectionError: when the send fails with ``ConnectionClosed``
            or ``OSError``; the original exception is chained.
    """
    socket = self._ws
    try:
        await socket.send(payload)
    except (websockets.exceptions.ConnectionClosed, OSError) as send_error:
        # Flag the connection as dead first, so later calls fail fast
        # instead of each rediscovering the broken socket.
        self.disconnect()
        raise ConnectionError(f"WebSocket send failed: {send_error}") from send_error
285
+
286
async def send_audio_frame(self, opus_frame: bytes) -> None:
    """Push one Opus frame to the ESP32 as a WebSocket binary frame.

    This is the TTS pipeline's egress point: the device's ``OnData``
    handler (firmware/main/protocols/websocket_protocol.cc) feeds every
    binary frame it receives into its Opus decoder.

    Raises:
        ConnectionError: if the connection is marked disconnected, or
            if the underlying send fails.
    """
    if self._connected:
        await self._ws_send(opus_frame)
        return
    raise ConnectionError("ESP32 not connected")
297
+
298
async def send_tts_state(self, state: str) -> None:
    """Notify the device of a TTS state change (``start`` / ``stop`` / ...).

    Audio frames must be bracketed by start/stop: the firmware's
    :func:`Application::OnIncomingJson` maps
    ``{"type":"tts","state":"start"}`` to :data:`kDeviceStateSpeaking`,
    and :func:`OnIncomingAudio` only queues packets for decoding in that
    state (see ``firmware/main/application.cc``). Frames sent outside
    the bracket are dropped and the speaker stays silent even though the
    TTS tool reports success.

    Raises:
        ConnectionError: if the connection is marked disconnected, or
            if the underlying send fails.
    """
    if not self._connected:
        raise ConnectionError("ESP32 not connected")
    encoded = json.dumps(
        {"session_id": self.session_id, "type": "tts", "state": state}
    )
    await self._ws_send(encoded)
318
+
319
async def send_listen_state(self, state: str, mode: str = "manual") -> None:
    """Notify the device of a listen state change (``start`` / ``stop``).

    This is the server-driven counterpart to the device's own
    :func:`Protocol::SendStartListening` (Issue #91). On the firmware
    side :func:`Application::OnIncomingJson` routes ``state: "start"``
    to :func:`Application::StartListening` and ``state: "stop"`` to
    :func:`Application::StopListening`.

    ``mode`` is only put on the wire for ``state="start"``, and only for
    forward-compatibility: in Phase 1 the firmware accepts but ignores
    it, because :func:`HandleStartListeningEvent` always enters
    ``kListeningModeManualStop`` (the gateway controls the stop boundary
    explicitly).

    Raises:
        ConnectionError: if the connection is marked disconnected, or
            if the underlying send fails.
    """
    if not self._connected:
        raise ConnectionError("ESP32 not connected")
    fields: list[tuple[str, Any]] = [
        ("session_id", self.session_id),
        ("type", "listen"),
        ("state", state),
    ]
    if state == "start":
        fields.append(("mode", mode))
    await self._ws_send(json.dumps(dict(fields)))
345
+
346
def disconnect(self) -> None:
    """Mark the connection as disconnected and release everyone waiting on it.

    In-flight MCP requests are failed with ``ConnectionError("ESP32
    disconnected")``. Pending avatar-set waiters are completed with a
    synthesized ``not_connected`` failure so that
    ``send_avatar_set_fetch`` callers return promptly instead of waiting
    out their full timeout for a device that is gone. Calling this more
    than once is harmless.
    """
    self._connected = False
    self._initialized = False

    # Fail all in-flight MCP requests.
    for future in self._pending.values():
        if not future.done():
            future.set_exception(ConnectionError("ESP32 disconnected"))
    self._pending.clear()

    # Release avatar-set waiters with the same result shape
    # send_avatar_set_fetch returns for an unconnected device.
    for checksum, waiter in self._avatar_set_waiters.items():
        if not waiter.done():
            waiter.set_result(
                {"ok": False, "checksum": checksum, "error": "not_connected"}
            )
    self._avatar_set_waiters.clear()
355
+
356
+
357
+ class ESP32Manager:
358
+ """Manages ESP32 device connections.
359
+
360
+ Runs a WebSocket server that ESP32 devices connect to.
361
+ Currently supports a single device connection.
362
+ """
363
+
364
def __init__(self, notify_config: NotifyConfig | None = None):
    """Set up manager state; the WebSocket server is started by ``start()``.

    Args:
        notify_config: Notification settings for stackchan events. When
            omitted (or falsy) the config is loaded via
            ``load_notify_config()``.
    """
    if notify_config:
        self._notify_config = notify_config
    else:
        self._notify_config = load_notify_config()

    # Connection / server bookkeeping.
    self._connection: ESP32Connection | None = None
    self._server: Any = None
    self._lock = asyncio.Lock()
    self._init_tasks: list[asyncio.Task] = []

    # Vision endpoint advertised to the device during MCP initialize.
    self._vision_url: str = ""
    self._vision_token: str = ""

    # TTS serialisation. The orchestrator holds this lock around the
    # whole start → frames → stop sequence so that concurrent ``say()``
    # calls can neither interleave Opus frames on the one WebSocket nor
    # overlap their ``tts.start``/``tts.stop`` notifications (which
    # would pull the firmware out of ``kDeviceStateSpeaking``
    # mid-utterance and silently drop the rest of the audio). It lives
    # on the manager because the manager owns the single device today;
    # with multi-device support it belongs on :class:`ESP32Connection`.
    self._tts_lock = asyncio.Lock()

    # STT capture (Issue #91) deliberately reuses the TTS lock. The
    # firmware's ``HandleStartListeningEvent`` aborts in-flight TTS when
    # listen.start arrives while speaking (``kDeviceStateSpeaking`` →
    # ``AbortSpeaking`` → ``SetListeningMode(kListeningModeManualStop)``),
    # so with separate locks a ``listen()`` could cut a ``say()`` short,
    # or a ``say()`` could stream TTS frames into a buffer that a
    # concurrent ``listen()`` is capturing. Treating the audio path as
    # one resource keeps the device's state machine observable from the
    # gateway; the lock can be split again if a full-duplex contract
    # ever lands.
    self._listen_lock = self._tts_lock

    # Device-driven listen capture: wake word / button / LCD touch paths
    # on the firmware that call ToggleChatState / WakeWordInvoke /
    # StartListening without an MCP-driven ``listen()`` call. With
    # ``_audio_hook_url`` set, an inbound
    # ``{"type":"listen","state":"start"}`` opens the shared
    # audio_stream recording slot and the matching ``"stop"`` forwards
    # the buffered Opus frames to the hook. See
    # :mod:`stackchan_mcp.audio_input_hook` for rationale and protocol.
    self._audio_hook_url: str = ""
    self._audio_hook_token: str = ""

    # The session_id that currently holds the recording slot through a
    # device-driven listen, or None. A session_id (rather than a bool)
    # lets disconnect cleanup verify it still owns the recording before
    # tearing it down; otherwise a stale disconnect could clobber the
    # buffer of an unrelated session (a fresh reconnection, or an
    # MCP-driven listen() that already took the slot).
    self._device_driven_session_id: str | None = None

    # One lock per hardware lane.
    lane_names = (
        "servo",
        "led",
        "avatar",
        "display",
        "audio",
        "camera",
        "touch",
        "status",
        "default",
    )
    self._tool_lane_locks = {lane: asyncio.Lock() for lane in lane_names}
428
+
429
def set_notify_config(self, notify_config: NotifyConfig) -> None:
    """Replace the startup notification config used for future events.

    The new config is stored on ``_notify_config``, which
    ``_emit_stackchan_event`` reads each time it handles an event.
    """
    self._notify_config = notify_config
432
+
433
@property
def device_connected(self) -> bool:
    """True when a connection is registered and it reports itself connected."""
    current = self._connection
    return current is not None and current.connected
436
+
437
@property
def connection(self) -> ESP32Connection | None:
    """The currently registered device connection, or ``None`` if there is none."""
    return self._connection
440
+
441
@property
def tts_lock(self) -> asyncio.Lock:
    """Per-device lock guarding the TTS send sequence.

    See :attr:`_tts_lock` for the rationale; the orchestrator wraps
    the start → frames → stop block in ``async with`` on this lock.
    This is the same lock object that :attr:`listen_lock` returns.
    """
    return self._tts_lock
449
+
450
@property
def listen_lock(self) -> asyncio.Lock:
    """Per-device lock guarding the STT capture sequence.

    See :attr:`_listen_lock` for the rationale; the orchestrator
    wraps the entire ``listen.start`` → wait → ``listen.stop``
    block in ``async with`` on this lock so two concurrent
    ``listen()`` calls cannot share the inbound recording slot.

    ``_listen_lock`` is assigned from ``_tts_lock`` in ``__init__``, so
    this returns the very same lock object as :attr:`tts_lock`: a
    holder of either lock also excludes users of the other.
    """
    return self._listen_lock
460
+
461
async def start(
    self,
    host: str = "0.0.0.0",
    port: int = 8765,
    vision_url: str = "",
    vision_token: str = "",
    audio_hook_url: str = "",
    audio_hook_token: str = "",
) -> None:
    """Record the endpoint settings and start the ESP32 WebSocket server.

    Args:
        host: Interface to bind the server to.
        port: TCP port to listen on.
        vision_url: Vision endpoint URL stored for device initialization.
        vision_token: Token stored alongside ``vision_url``.
        audio_hook_url: When non-empty, enables device-driven listen
            capture with this URL as the forwarding target.
        audio_hook_token: Token stored alongside ``audio_hook_url``.
    """
    self._vision_url, self._vision_token = vision_url, vision_token
    self._audio_hook_url, self._audio_hook_token = audio_hook_url, audio_hook_token

    if audio_hook_url:
        logger.info(
            "Device-driven listen capture enabled (audio hook %s)",
            audio_hook_url,
        )
    logger.info("ESP32 WebSocket server starting on ws://%s:%d", host, port)

    # _check_auth runs during the opening handshake of every connection;
    # _handler then serves the accepted connection.
    server = await websockets.serve(
        self._handler,
        host,
        port,
        process_request=self._check_auth,
    )
    self._server = server
487
+
488
async def stop(self) -> None:
    """Cancel outstanding device-initialization tasks and shut the server down."""
    # Request cancellation of every tracked init task, then empty the
    # tracking list in place.
    tracked = self._init_tasks
    for init_task in tracked:
        init_task.cancel()
    tracked.clear()

    server = self._server
    if not server:
        return
    server.close()
    await server.wait_closed()
    self._server = None
499
+
500
def _check_auth(
    self, connection: ServerConnection, request: websockets.http11.Request
) -> None | websockets.http11.Response:
    """Validate the Bearer token presented in the opening handshake.

    websockets 16+ passes (connection, request) to process_request.

    The expected token comes from ``STACKCHAN_TOKEN`` (falling back to
    ``BEARER_TOKEN``). When neither is set, every connection is accepted
    and a warning is logged.

    Returns:
        ``None`` to let the handshake continue, or a 401 response to
        reject it.
    """
    # Function-scope import keeps this block self-contained.
    import hmac

    expected = os.getenv("STACKCHAN_TOKEN") or os.getenv("BEARER_TOKEN")
    if not expected:
        logger.warning("STACKCHAN_TOKEN not set — accepting all connections")
        return None

    auth = request.headers.get("Authorization", "")
    # Constant-time comparison: ``==`` returns at the first differing
    # character, which leaks how much of a guessed token was correct.
    # Comparing bytes (with surrogateescape) accepts any str content, so
    # a malformed header is simply rejected instead of raising.
    presented = auth.encode("utf-8", "surrogateescape")
    wanted = f"Bearer {expected}".encode("utf-8", "surrogateescape")
    if hmac.compare_digest(presented, wanted):
        return None

    logger.warning("ESP32 auth rejected")
    return websockets.http11.Response(
        401, "Unauthorized", websockets.datastructures.Headers()
    )
520
+
521
async def _handler(self, ws: ServerConnection) -> None:
    """Serve one ESP32 WebSocket connection until it closes.

    Architecture: the message read loop runs continuously, dispatching
    MCP responses to pending futures. Initialization (initialize +
    tools/list) runs as a separate task so it doesn't block the read
    loop.

    Binary frames are handed to ``handle_audio_frame``. Text frames are
    JSON objects dispatched on their ``type``: ``hello``, ``mcp``,
    ``avatar_set_loaded``, ``stackchan-event`` and ``listen``; other
    types are ignored. Frames that are not valid JSON objects are logged
    and skipped rather than allowed to crash the loop.

    On exit (normal close, rejection, or error) any device-driven
    recording owned by this session is discarded, the connection is
    marked disconnected, and it is unregistered if still current.
    """
    session_id = str(uuid.uuid4())
    device_id = (
        ws.request.headers.get("Device-Id", "unknown") if ws.request else "unknown"
    )
    logger.info("ESP32 connecting: device=%s", device_id)

    connection = ESP32Connection(ws, session_id)
    connection.device_id = device_id

    try:
        async for message in ws:
            if isinstance(message, bytes):
                # Binary = audio frame. Forward to the audio_stream
                # module which buffers it for STT capture (Issue #91)
                # when a recording slot is open, or discards it
                # otherwise. Only protocol v1 is supported on the
                # inbound side today; the orchestrator gates listen()
                # on protocol_version=1 so v2/v3 frames cannot reach
                # this point with recording active.
                await handle_audio_frame(message, session_id)
                continue

            try:
                data = json.loads(message)
            except json.JSONDecodeError:
                logger.warning("Invalid JSON from ESP32: %s", str(message)[:100])
                continue
            if not isinstance(data, dict):
                # Valid JSON but not an object (array, number, ...):
                # there is no "type" to dispatch on.
                logger.warning("Invalid JSON from ESP32: %s", str(message)[:100])
                continue

            msg_type = data.get("type", "")

            if msg_type == "hello":
                # ESP32 hello handshake
                features = data.get("features", {})
                if not isinstance(features, dict) or not features.get("mcp"):
                    logger.warning("ESP32 does not support MCP, rejecting")
                    await ws.close()
                    return

                # Capture the device's WebSocket protocol version so
                # callers (e.g. the TTS pipeline) can decide whether
                # their wire format is compatible. The firmware accepts
                # raw Opus only on v1; v2/v3 wrap the payload in a
                # BinaryProtocol header.
                raw_version = data.get("version", 1)
                try:
                    connection.protocol_version = int(raw_version)
                except (TypeError, ValueError, OverflowError):
                    # OverflowError: json.loads accepts Infinity.
                    connection.protocol_version = 1
                if connection.protocol_version != 1:
                    logger.warning(
                        "ESP32 negotiated WebSocket protocol "
                        "version=%s; the gateway emits raw Opus "
                        "binary frames matching v1 only. TTS "
                        "calls (say) will be blocked at the "
                        "orchestrator until v2/v3 BinaryProtocol "
                        "header wrapping is implemented",
                        connection.protocol_version,
                    )

                # Send hello response
                resp = HelloResponse(session_id=session_id)
                await ws.send(resp.model_dump_json())

                # Register connection
                async with self._lock:
                    if self._connection and self._connection.connected:
                        logger.warning("Replacing existing ESP32 connection")
                        self._connection.disconnect()
                    self._connection = connection

                # Start initialization as a separate task so the read
                # loop continues to pump messages (responses to
                # initialize/tools_list)
                task = asyncio.create_task(self._init_device(connection, device_id))
                self._init_tasks.append(task)
                task.add_done_callback(
                    lambda t: self._init_tasks.remove(t) if t in self._init_tasks else None
                )

            elif msg_type == "mcp":
                # MCP response from ESP32
                payload = data.get("payload", {})
                if isinstance(payload, dict):
                    connection.handle_response(payload)
                else:
                    logger.warning(
                        "Invalid MCP payload from ESP32: %s", str(payload)[:100]
                    )

            elif msg_type == "avatar_set_loaded":
                # Phase 4.5 avatar (saiverse-stackchan-addon): device
                # reports the result of a load_avatar_set fetch (see
                # docs/intent/stackchan_avatar_pipeline.md §C-3 in the
                # SAIVerse repository).
                connection.handle_avatar_set_loaded(data)

            elif msg_type == "stackchan-event":
                await self._emit_stackchan_event(data)

            elif msg_type == "listen":
                # Device-driven listening start/stop notification (wake
                # word, button press, LCD touch — anything that calls
                # Application::ToggleChatState / WakeWordInvoke /
                # StartListening on the firmware side). The MCP-driven
                # listen() tool sends the same wire format in the
                # reverse direction and already opens its own recording
                # slot via the STT orchestrator, so we only act when the
                # device initiated the capture AND an audio hook URL is
                # configured to receive the result. See
                # :mod:`stackchan_mcp.audio_input_hook` for the
                # forwarding pipeline.
                state = data.get("state", "")
                if state == "start":
                    if not self._audio_hook_url:
                        logger.debug(
                            "device-driven listen.start session=%s "
                            "ignored (STACKCHAN_AUDIO_HOOK_URL not "
                            "configured)",
                            session_id,
                        )
                    elif is_recording():
                        # An MCP-driven listen() already owns the
                        # recording slot; let it complete rather than
                        # corrupting its buffer.
                        logger.debug(
                            "device-driven listen.start session=%s "
                            "ignored (MCP-driven recording active)",
                            session_id,
                        )
                    else:
                        start_recording(session_id)
                        self._device_driven_session_id = session_id
                        logger.info(
                            "device-driven listen started: "
                            "session=%s mode=%s",
                            session_id, data.get("mode", ""),
                        )
                elif state == "stop":
                    if self._device_driven_session_id == session_id:
                        self._device_driven_session_id = None
                        frames = stop_recording()
                        logger.info(
                            "device-driven listen stopped: "
                            "session=%s frames=%d",
                            session_id, len(frames),
                        )
                        # Push asynchronously so the WebSocket read loop
                        # is not blocked by the HTTP POST round-trip.
                        # Failures are logged inside push_audio_capture
                        # and do not propagate.
                        push_task = asyncio.create_task(
                            push_audio_capture(
                                self._audio_hook_url,
                                self._audio_hook_token,
                                frames,
                                session_id=session_id,
                            )
                        )
                        # The event loop keeps only weak references to
                        # tasks, so an unreferenced task can be
                        # garbage-collected before it finishes. Hold a
                        # strong reference until the push completes.
                        # The set is created on first use so this method
                        # does not rely on __init__ declaring it.
                        push_tasks = getattr(self, "_audio_push_tasks", None)
                        if push_tasks is None:
                            push_tasks = self._audio_push_tasks = set()
                        push_tasks.add(push_task)
                        push_task.add_done_callback(push_tasks.discard)
                else:
                    logger.debug(
                        "listen message with unknown state=%r "
                        "session=%s",
                        state, session_id,
                    )

            else:
                logger.debug("ESP32 message type=%s (ignored)", msg_type)

    except websockets.exceptions.ConnectionClosed:
        logger.info("ESP32 disconnected: device=%s", device_id)
    finally:
        # If the device disconnected mid-capture, drop any partial
        # buffer rather than letting it leak into the next connection's
        # recording slot (mirrors the discard logic in
        # audio_stream.handle_audio_frame for session-mismatched
        # frames).
        #
        # Guard the cleanup by session_id: a stale disconnect must not
        # tear down the active buffer of an unrelated session that may
        # have grabbed the recording slot since (a fresh reconnection or
        # an MCP-driven listen() that took over). The audio_stream layer
        # also tracks the recording session, so we double-check via
        # is_recording_session().
        if self._device_driven_session_id == session_id and (
            is_recording_session(session_id)
        ):
            self._device_driven_session_id = None
            discarded = stop_recording()
            if discarded:
                logger.warning(
                    "device-driven listen aborted mid-capture: "
                    "session=%s discarded %d frames",
                    session_id, len(discarded),
                )
        elif self._device_driven_session_id == session_id:
            # Our handler thought it owned the slot, but audio_stream
            # disagrees — clear our local flag without tearing down the
            # slot, then keep going.
            self._device_driven_session_id = None
        connection.disconnect()
        async with self._lock:
            if self._connection is connection:
                self._connection = None
725
+
726
async def _init_device(self, connection: ESP32Connection, device_id: str) -> None:
    """Run the MCP handshake for a newly connected device.

    Sends ``initialize`` with the stored vision settings and, when that
    succeeds, discovers the device's tools. A failed initialize is
    logged and tool discovery is skipped.
    """
    initialized = await connection.initialize(
        vision_url=self._vision_url,
        vision_token=self._vision_token,
    )
    if not initialized:
        logger.error("ESP32 MCP initialization failed")
        return

    await connection.discover_tools()
    tool_count = len(connection.tools)
    logger.info("ESP32 ready: device=%s tools=%d", device_id, tool_count)
740
+
741
async def _emit_stackchan_event(self, payload: dict[str, Any]) -> None:
    """Validate a firmware-originated stackchan event and fan it out.

    Only ``touch`` events with subtype ``tap`` or ``stroke``,
    non-negative integer ``duration_ms`` / ``ts`` and a non-empty string
    ``session_id`` are accepted; anything else is logged as malformed
    and dropped. Accepted events go to each enabled notification path in
    turn: the legacy ``stackchan/event`` notification, the
    ``notifications/claude/channel`` notification, and the JSONL event
    log.
    """

    def _invalid_count(value: Any) -> bool:
        # bool is an int subclass, so it has to be excluded explicitly.
        return isinstance(value, bool) or not isinstance(value, int) or value < 0

    event_type = payload.get("event_type")
    subtype = payload.get("subtype")
    duration_ms = payload.get("duration_ms")
    ts = payload.get("ts")
    session_id = payload.get("session_id")

    # --- validation: first failing field wins --------------------------
    if event_type != "touch":
        logger.warning("Malformed stackchan-event frame: event_type=%r", event_type)
        return
    if subtype not in {"tap", "stroke"}:
        logger.warning("Malformed stackchan-event frame: subtype=%r", subtype)
        return
    if _invalid_count(duration_ms):
        logger.warning(
            "Malformed stackchan-event frame: duration_ms=%r",
            duration_ms,
        )
        return
    if _invalid_count(ts):
        logger.warning("Malformed stackchan-event frame: ts=%r", ts)
        return
    if not (isinstance(session_id, str) and session_id):
        logger.warning("Malformed stackchan-event frame: session_id=%r", session_id)
        return

    # --- build the payloads ---------------------------------------------
    config = self._notify_config
    template_key = (event_type, subtype)
    message = config.messages.get(
        template_key,
        DEFAULT_MESSAGE_TEMPLATES[template_key],
    )
    ts_unix = time.time()
    event_payload = {
        "event_type": event_type,
        "subtype": subtype,
        "duration_ms": duration_ms,
        "action": message.action,
        "ts": ts,
        "ts_unix": ts_unix,
        "session_id": session_id,
    }
    # The legacy notification carries the same fields minus ts_unix.
    legacy_params = {
        field: value
        for field, value in event_payload.items()
        if field != "ts_unix"
    }
    logger.info(
        "stackchan-event: %s/%s action=%s duration=%sms ts=%s session=%s",
        event_type,
        subtype,
        message.action,
        duration_ms,
        ts,
        session_id,
    )

    any_path_enabled = (
        config.legacy_event_enabled
        or config.channels_enabled
        or config.jsonl_enabled
    )
    if not any_path_enabled:
        logger.info(
            "stackchan-event received and dropped: notification paths disabled"
        )
        return

    from .stdio_server import notify_stackchan_event

    # --- path 1: legacy notification ------------------------------------
    if config.legacy_event_enabled:
        await notify_stackchan_event("stackchan/event", legacy_params)

    # --- path 2: channel notification -----------------------------------
    if config.channels_enabled:
        content = render_template(message.template, event_payload)
        # Channel notification meta must be all-string per CC binary's
        # Zod schema (matches public plugins: telegram/discord/imessage
        # all use string fields like chat_id, message_id, ts in ISO), so
        # the numeric fields are stringified here.
        channel_meta = {
            **event_payload,
            "duration_ms": str(duration_ms),
            "ts": str(ts),
            "ts_unix": str(ts_unix),
        }
        await notify_stackchan_event(
            "notifications/claude/channel",
            {"content": content, "meta": channel_meta},
        )

    # --- path 3: JSONL event log ----------------------------------------
    if config.jsonl_enabled:
        # ``log_event`` swallows OS / permission errors internally; the
        # broad except below is a second-tier guard so any unforeseen
        # helper bug cannot break the in-band notification paths above.
        from .event_log import log_event

        try:
            log_event(
                event_type=event_type,
                subtype=subtype,
                duration_ms=duration_ms,
                ts=ts,
                session_id=session_id,
                action=message.action,
                path=config.jsonl_path,
                ts_unix=ts_unix,
            )
        except Exception as exc:  # pragma: no cover - defensive guard
            logger.warning(
                "stackchan-event log persistence raised unexpectedly: %s", exc
            )
860
+
861
async def call_tool(
    self, name: str, arguments: dict[str, Any]
) -> ToolCallResult:
    """Invoke a single tool on the connected ESP32 device.

    Thin wrapper that submits a one-element batch to ``call_tools`` and
    returns the first (only) entry of its result list.
    """
    single_call = (name, arguments)
    outcomes = await self.call_tools([single_call])
    return outcomes[0]
867
+
868
async def call_tools(self, calls: Sequence[ToolCall]) -> list[ToolCallResult]:
    """Dispatch a batch of ESP32 tool calls and collect their results.

    Intended for compound gateway flows that can overlap
    hardware-independent peripherals (for example servo + LEDs + avatar);
    single-tool callers should keep using ``call_tool``. Every call is
    handed to ``_call_tool_on_connection``, which serializes calls that
    share a hardware lane, and all of them are awaited together.

    Returns one result per input call, in input order. An empty batch
    yields an empty list. When no device is connected, or the device is
    not initialized yet, every call gets the same ``-32000`` error result
    instead of being dispatched.
    """
    if not calls:
        return []

    def _reject_all(reason: str):
        # One fresh error tuple per requested call, mirroring the batch size.
        return [(None, {"code": -32000, "message": reason}) for _ in calls]

    connection = self._connection
    if not connection or not connection.connected:
        return _reject_all("No ESP32 device connected")
    if not connection.initialized:
        return _reject_all("ESP32 not initialized")

    # Pin the connection captured above so every call in the batch targets
    # the same device object.
    pending = [
        self._call_tool_on_connection(connection, name, arguments)
        for name, arguments in calls
    ]
    return list(await asyncio.gather(*pending))
899
+
900
async def _call_tool_on_connection(
    self,
    connection: ESP32Connection,
    name: str,
    arguments: dict[str, Any],
) -> ToolCallResult:
    """Run one tool call while holding the lock of its hardware lane.

    The lane is derived from the tool name via ``_hardware_lane`` and its
    lock is looked up in ``self._tool_lane_locks``. Once the lock is held
    the connection is re-checked: if it is no longer the manager's current
    connection, or it has disconnected, a ``-32000`` error result is
    returned instead of calling the device.
    """
    lane_lock = self._tool_lane_locks[_hardware_lane(name)]
    async with lane_lock:
        # Re-validate after acquiring the lock; the connection may have
        # been replaced or dropped while this call was waiting.
        still_current = connection is self._connection and connection.connected
        if still_current:
            return await connection.call_tool(name, arguments)
        return None, {"code": -32000, "message": "ESP32 not connected"}
912
+
913
async def send_avatar_set_fetch(
    self,
    url: str,
    token: str,
    mode: str,
    checksum: str,
    expected_size: int,
    timeout: float = 60.0,
) -> dict[str, Any]:
    """Relay an avatar_set_fetch request to the device and await its reply.

    Phase 4.5 avatar (saiverse-stackchan-addon). The result is a dict with
    the keys ``ok``, ``checksum`` and ``error``. With no connected device
    nothing is raised; a synthetic ``ok=False`` / ``error="no_device"``
    reply echoing ``checksum`` is returned so the MCP tool can surface a
    clean error JSON to the caller.
    """
    device = self._connection
    if device and device.connected:
        return await device.send_avatar_set_fetch(
            url, token, mode, checksum, expected_size, timeout
        )
    return {"ok": False, "checksum": checksum, "error": "no_device"}
934
+
935
async def send_audio_frame(self, opus_frame: bytes) -> None:
    """Deliver one Opus frame to the connected device.

    The TTS pipeline uses this to stream synthesised audio.

    Raises:
        ConnectionError: If no device is currently attached, so the
            orchestrator can report a clean error to the MCP client
            rather than silently dropping audio.
    """
    device = self._connection
    if device and device.connected:
        await device.send_audio_frame(opus_frame)
        return
    raise ConnectionError("No ESP32 device connected")
946
+
947
async def send_tts_state(self, state: str) -> None:
    """Notify the device of a TTS state change (``start`` / ``stop`` / ...).

    Needed around audio frame egress so the device moves into
    ``kDeviceStateSpeaking`` and back; the full rationale lives on
    :meth:`ESP32Connection.send_tts_state`.

    Raises:
        ConnectionError: If no device is currently connected.
    """
    device = self._connection
    if device and device.connected:
        await device.send_tts_state(state)
        return
    raise ConnectionError("No ESP32 device connected")
957
+
958
async def send_listen_state(self, state: str, mode: str = "manual") -> None:
    """Switch the device into or out of listening mode (Issue #91).

    The wire format and the firmware-side dispatch are described on
    :meth:`ESP32Connection.send_listen_state`; ``mode`` is forwarded to it
    as a keyword argument.

    Raises:
        ConnectionError: If no device is currently connected.
    """
    device = self._connection
    if device and device.connected:
        await device.send_listen_state(state, mode=mode)
        return
    raise ConnectionError("No ESP32 device connected")
968
+
969
def get_status(self) -> dict[str, Any]:
    """Summarise the current device connection.

    With no connected device the result contains only ``connected=False``,
    ``device_id=None`` and ``tools_count=0``. Otherwise it reports the
    device id, the ``initialized`` flag, the number of tools, and the
    ``name`` of each tool entry (empty string when an entry has none).
    """
    device = self._connection
    if device and device.connected:
        tools = device.tools
        return {
            "connected": True,
            "device_id": device.device_id,
            "initialized": device.initialized,
            "tools_count": len(tools),
            "tools": [tool.get("name", "") for tool in tools],
        }
    return {"connected": False, "device_id": None, "tools_count": 0}