stackchan-mcp 0.4.0__tar.gz → 0.6.0__tar.gz

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (53) hide show
  1. {stackchan_mcp-0.4.0 → stackchan_mcp-0.6.0}/PKG-INFO +25 -1
  2. {stackchan_mcp-0.4.0 → stackchan_mcp-0.6.0}/README.md +10 -0
  3. {stackchan_mcp-0.4.0 → stackchan_mcp-0.6.0}/pyproject.toml +35 -1
  4. stackchan_mcp-0.6.0/stackchan_mcp/audio_stream.py +151 -0
  5. {stackchan_mcp-0.4.0 → stackchan_mcp-0.6.0}/stackchan_mcp/cli.py +53 -0
  6. {stackchan_mcp-0.4.0 → stackchan_mcp-0.6.0}/stackchan_mcp/esp32_client.py +201 -1
  7. {stackchan_mcp-0.4.0 → stackchan_mcp-0.6.0}/stackchan_mcp/stdio_server.py +234 -0
  8. stackchan_mcp-0.6.0/stackchan_mcp/stt/__init__.py +62 -0
  9. stackchan_mcp-0.6.0/stackchan_mcp/stt/audio_utils.py +102 -0
  10. stackchan_mcp-0.6.0/stackchan_mcp/stt/base.py +94 -0
  11. stackchan_mcp-0.6.0/stackchan_mcp/stt/faster_whisper.py +217 -0
  12. stackchan_mcp-0.6.0/stackchan_mcp/stt/openai_whisper.py +177 -0
  13. stackchan_mcp-0.6.0/stackchan_mcp/stt/orchestrator.py +306 -0
  14. stackchan_mcp-0.6.0/stackchan_mcp/tts/__init__.py +55 -0
  15. stackchan_mcp-0.6.0/stackchan_mcp/tts/audio_utils.py +177 -0
  16. stackchan_mcp-0.6.0/stackchan_mcp/tts/base.py +86 -0
  17. stackchan_mcp-0.6.0/stackchan_mcp/tts/orchestrator.py +282 -0
  18. stackchan_mcp-0.6.0/stackchan_mcp/tts/voicevox.py +184 -0
  19. stackchan_mcp-0.6.0/tests/_audio_fixtures.py +46 -0
  20. stackchan_mcp-0.6.0/tests/test_audio_stream.py +145 -0
  21. stackchan_mcp-0.6.0/tests/test_audio_utils.py +222 -0
  22. {stackchan_mcp-0.4.0 → stackchan_mcp-0.6.0}/tests/test_cli.py +101 -0
  23. {stackchan_mcp-0.4.0 → stackchan_mcp-0.6.0}/tests/test_esp32_client.py +236 -1
  24. stackchan_mcp-0.6.0/tests/test_orchestrator.py +541 -0
  25. stackchan_mcp-0.6.0/tests/test_stdio_server.py +377 -0
  26. stackchan_mcp-0.6.0/tests/test_stt_audio_utils.py +100 -0
  27. stackchan_mcp-0.6.0/tests/test_stt_framework.py +195 -0
  28. stackchan_mcp-0.6.0/tests/test_stt_orchestrator.py +441 -0
  29. stackchan_mcp-0.6.0/tests/test_tts_framework.py +173 -0
  30. stackchan_mcp-0.6.0/tests/test_voicevox.py +193 -0
  31. {stackchan_mcp-0.4.0 → stackchan_mcp-0.6.0}/uv.lock +794 -2
  32. stackchan_mcp-0.4.0/stackchan_mcp/audio_stream.py +0 -34
  33. stackchan_mcp-0.4.0/tests/test_stdio_server.py +0 -148
  34. {stackchan_mcp-0.4.0 → stackchan_mcp-0.6.0}/.env.example +0 -0
  35. {stackchan_mcp-0.4.0 → stackchan_mcp-0.6.0}/.gitignore +0 -0
  36. {stackchan_mcp-0.4.0 → stackchan_mcp-0.6.0}/LICENSE +0 -0
  37. {stackchan_mcp-0.4.0 → stackchan_mcp-0.6.0}/stackchan_mcp/__init__.py +0 -0
  38. {stackchan_mcp-0.4.0 → stackchan_mcp-0.6.0}/stackchan_mcp/__main__.py +0 -0
  39. {stackchan_mcp-0.4.0 → stackchan_mcp-0.6.0}/stackchan_mcp/capture_server.py +0 -0
  40. {stackchan_mcp-0.4.0 → stackchan_mcp-0.6.0}/stackchan_mcp/gateway.py +0 -0
  41. {stackchan_mcp-0.4.0 → stackchan_mcp-0.6.0}/stackchan_mcp/handlers/__init__.py +0 -0
  42. {stackchan_mcp-0.4.0 → stackchan_mcp-0.6.0}/stackchan_mcp/handlers/audio.py +0 -0
  43. {stackchan_mcp-0.4.0 → stackchan_mcp-0.6.0}/stackchan_mcp/handlers/camera.py +0 -0
  44. {stackchan_mcp-0.4.0 → stackchan_mcp-0.6.0}/stackchan_mcp/handlers/robot.py +0 -0
  45. {stackchan_mcp-0.4.0 → stackchan_mcp-0.6.0}/stackchan_mcp/mcp_router.py +0 -0
  46. {stackchan_mcp-0.4.0 → stackchan_mcp-0.6.0}/stackchan_mcp/protocol.py +0 -0
  47. {stackchan_mcp-0.4.0 → stackchan_mcp-0.6.0}/stackchan_mcp/server.py +0 -0
  48. {stackchan_mcp-0.4.0 → stackchan_mcp-0.6.0}/stackchan_mcp/tools.py +0 -0
  49. {stackchan_mcp-0.4.0 → stackchan_mcp-0.6.0}/tests/conftest.py +0 -0
  50. {stackchan_mcp-0.4.0 → stackchan_mcp-0.6.0}/tests/test_capture_server.py +0 -0
  51. {stackchan_mcp-0.4.0 → stackchan_mcp-0.6.0}/tests/test_gateway.py +0 -0
  52. {stackchan_mcp-0.4.0 → stackchan_mcp-0.6.0}/tests/test_mcp_router.py +0 -0
  53. {stackchan_mcp-0.4.0 → stackchan_mcp-0.6.0}/tests/test_protocol.py +0 -0
@@ -1,6 +1,6 @@
1
1
  Metadata-Version: 2.4
2
2
  Name: stackchan-mcp
3
- Version: 0.4.0
3
+ Version: 0.6.0
4
4
  Summary: Two-faced MCP gateway for StackChan (xiaozhi-esp32): bridges stdio MCP clients to the ESP32 over WebSocket + HTTP.
5
5
  Project-URL: Homepage, https://github.com/kisaragi-mochi/stackchan-mcp
6
6
  Project-URL: Repository, https://github.com/kisaragi-mochi/stackchan-mcp
@@ -27,6 +27,20 @@ Requires-Dist: mcp>=1.0
27
27
  Requires-Dist: pydantic>=2
28
28
  Requires-Dist: python-dotenv
29
29
  Requires-Dist: websockets>=12
30
+ Provides-Extra: stt
31
+ Requires-Dist: opuslib>=3; extra == 'stt'
32
+ Provides-Extra: stt-faster-whisper
33
+ Requires-Dist: faster-whisper>=1.0; extra == 'stt-faster-whisper'
34
+ Requires-Dist: opuslib>=3; extra == 'stt-faster-whisper'
35
+ Provides-Extra: stt-openai
36
+ Requires-Dist: openai>=1.0; extra == 'stt-openai'
37
+ Requires-Dist: opuslib>=3; extra == 'stt-openai'
38
+ Provides-Extra: tts
39
+ Requires-Dist: httpx>=0.27; extra == 'tts'
40
+ Requires-Dist: opuslib>=3; extra == 'tts'
41
+ Provides-Extra: tts-voicevox
42
+ Requires-Dist: httpx>=0.27; extra == 'tts-voicevox'
43
+ Requires-Dist: opuslib>=3; extra == 'tts-voicevox'
30
44
  Description-Content-Type: text/markdown
31
45
 
32
46
  # gateway
@@ -191,6 +205,16 @@ Same shape, under `mcpServers`.
191
205
  | `set_mouth(state)` | Mouth shape (`closed` / `half` / `open` / `e` / `u`), one-shot, held until next call |
192
206
  | `set_mouth_sequence(steps)` | Queue and play a list of `{shape, duration_ms}` steps locally for TTS lip-sync. The firmware walks the queue without per-step network RTT. Calling `set_mouth`, `set_avatar`, or this tool again interrupts the in-flight sequence; autonomous blink is paused while a sequence is playing. |
193
207
  | `check_vm_en` | Read PY32 VM EN GPIO state (servo power supply diagnostic) |
208
+ | `set_led(index, r, g, b)` | Set one of the 12 base RGB LEDs by index (`0..11`); channels `0..255`. Updates immediately. |
209
+ | `set_all_leds(r, g, b)` | Set all 12 base RGB LEDs to the same color. Updates immediately. |
210
+ | `set_leds(colors)` | Batch-set the first N LEDs from a `[[r,g,b], ...]` array (1..12 entries). Single I2C burst + one latch — use this for animations / multi-color patterns instead of N individual `set_led` calls. Trailing LEDs (beyond `len(colors)`) keep their previous color. Validation is atomic: a malformed entry rejects the whole call without mutating any LED. |
211
+ | `clear_leds` | Turn all 12 base RGB LEDs off. |
212
+
213
+ The 12 base LEDs are 12× WS2812C wired to the PY32L020 IO expander
214
+ (expander pin 13, not an ESP32 GPIO), so all four LED tools share the
215
+ PY32 I2C bus with the servo-power and Si12T touch paths. If the PY32
216
+ init fails at boot, the LED tools degrade with `available=false`
217
+ instead of cascading errors.
194
218
 
195
219
  The mapping from these names to ESP32-side `self.*` MCP tools is in
196
220
  `stackchan_mcp/stdio_server.py`.
@@ -160,6 +160,16 @@ Same shape, under `mcpServers`.
160
160
  | `set_mouth(state)` | Mouth shape (`closed` / `half` / `open` / `e` / `u`), one-shot, held until next call |
161
161
  | `set_mouth_sequence(steps)` | Queue and play a list of `{shape, duration_ms}` steps locally for TTS lip-sync. The firmware walks the queue without per-step network RTT. Calling `set_mouth`, `set_avatar`, or this tool again interrupts the in-flight sequence; autonomous blink is paused while a sequence is playing. |
162
162
  | `check_vm_en` | Read PY32 VM EN GPIO state (servo power supply diagnostic) |
163
+ | `set_led(index, r, g, b)` | Set one of the 12 base RGB LEDs by index (`0..11`); channels `0..255`. Updates immediately. |
164
+ | `set_all_leds(r, g, b)` | Set all 12 base RGB LEDs to the same color. Updates immediately. |
165
+ | `set_leds(colors)` | Batch-set the first N LEDs from a `[[r,g,b], ...]` array (1..12 entries). Single I2C burst + one latch — use this for animations / multi-color patterns instead of N individual `set_led` calls. Trailing LEDs (beyond `len(colors)`) keep their previous color. Validation is atomic: a malformed entry rejects the whole call without mutating any LED. |
166
+ | `clear_leds` | Turn all 12 base RGB LEDs off. |
167
+
168
+ The 12 base LEDs are 12× WS2812C wired to the PY32L020 IO expander
169
+ (expander pin 13, not an ESP32 GPIO), so all four LED tools share the
170
+ PY32 I2C bus with the servo-power and Si12T touch paths. If the PY32
171
+ init fails at boot, the LED tools degrade with `available=false`
172
+ instead of cascading errors.
163
173
 
164
174
  The mapping from these names to ESP32-side `self.*` MCP tools is in
165
175
  `stackchan_mcp/stdio_server.py`.
@@ -1,6 +1,6 @@
1
1
  [project]
2
2
  name = "stackchan-mcp"
3
- version = "0.4.0"
3
+ version = "0.6.0"
4
4
  description = "Two-faced MCP gateway for StackChan (xiaozhi-esp32): bridges stdio MCP clients to the ESP32 over WebSocket + HTTP."
5
5
  readme = "README.md"
6
6
  requires-python = ">=3.10"
@@ -32,6 +32,40 @@ dependencies = [
32
32
  "aiohttp>=3",
33
33
  ]
34
34
 
35
+ [project.optional-dependencies]
36
+ # Phase 4 TTS — see Issue #70.
37
+ # Concrete engines (VOICEVOX, Irodori) consume these libraries:
38
+ # * httpx — VOICEVOX HTTP engine client
39
+ # * opuslib — Opus encoding for the device's audio decoder
40
+ # `tts-voicevox` is a no-op alias provided so users can declare intent
41
+ # explicitly; the VOICEVOX engine itself is an external HTTP process and
42
+ # adds no Python dependencies of its own.
43
+ tts = [
44
+ "httpx>=0.27",
45
+ "opuslib>=3",
46
+ ]
47
+ tts-voicevox = [
48
+ "stackchan-mcp[tts]",
49
+ ]
50
+
51
+ # Phase 4 STT — see Issue #91.
52
+ # The base `stt` extra carries `opuslib` for decoding the device's
53
+ # inbound Opus frames. Concrete engines live behind their own extras
54
+ # so users only pull in the heavy ML dependencies they actually need.
55
+ # * faster-whisper — local Whisper via CTranslate2 (default, MIT)
56
+ # * openai — OpenAI Whisper API client (cloud)
57
+ stt = [
58
+ "opuslib>=3",
59
+ ]
60
+ stt-faster-whisper = [
61
+ "stackchan-mcp[stt]",
62
+ "faster-whisper>=1.0",
63
+ ]
64
+ stt-openai = [
65
+ "stackchan-mcp[stt]",
66
+ "openai>=1.0",
67
+ ]
68
+
35
69
  [project.urls]
36
70
  Homepage = "https://github.com/kisaragi-mochi/stackchan-mcp"
37
71
  Repository = "https://github.com/kisaragi-mochi/stackchan-mcp"
@@ -0,0 +1,151 @@
1
+ """Opus audio frame handling for the gateway <-> device link.
2
+
3
+ Outbound (TTS) frames are produced by
4
+ :mod:`stackchan_mcp.tts.audio_utils` and pushed here to the connected
5
+ ESP32 via :meth:`stackchan_mcp.esp32_client.ESP32Manager.send_audio_frame`.
6
+
7
+ The inbound side (STT pipeline, Phase 4 / Issue #91) is now wired:
8
+ binary frames coming up from the device land in
9
+ :func:`handle_audio_frame`, which buffers them into a module-level
10
+ recording slot when one is active. The
11
+ :mod:`stackchan_mcp.stt.orchestrator` opens the slot via
12
+ :func:`start_recording` before sending ``listen.start`` to the device
13
+ and closes it via :func:`stop_recording` after the capture window;
14
+ outside an active recording, inbound frames are still discarded.
15
+
16
+ The recording slot is intentionally a module-level singleton: the
17
+ device's :class:`stackchan_mcp.esp32_client.ESP32Manager` only manages
18
+ one connection, and the STT orchestrator serialises ``listen()`` calls
19
+ through :attr:`ESP32Manager.listen_lock`, so concurrent captures
20
+ cannot race the buffer. If multi-device support lands later, this
21
+ should move onto the connection object.
22
+ """
23
+
24
+ from __future__ import annotations
25
+
26
+ import logging
27
+ from typing import TYPE_CHECKING, Iterable
28
+
29
+ if TYPE_CHECKING:
30
+ from .esp32_client import ESP32Manager
31
+
32
+ logger = logging.getLogger(__name__)
33
+
34
+
35
+ # --- Recording slot (inbound STT capture) ---------------------------------
36
+ #
37
+ # A single capture at a time is enforced by the orchestrator's
38
+ # ``listen_lock``; this module only owns the buffer itself.
39
+
40
+ _recording_session_id: str | None = None
41
+ _recording_frames: list[bytes] = []
42
+
43
+
44
+ def start_recording(session_id: str) -> None:
45
+ """Open a fresh recording slot for ``session_id``.
46
+
47
+ Any frames already buffered are discarded so a previous call that
48
+ crashed before ``stop_recording`` cannot leak into the next
49
+ capture. The orchestrator wraps start/stop in a try/finally to
50
+ guarantee the slot is closed even on error.
51
+ """
52
+ global _recording_session_id, _recording_frames
53
+ if _recording_session_id is not None:
54
+ # Defensive: the lock should prevent this, but if it ever
55
+ # happens, the stale frames are dropped (not carried into the new
56
+ # capture) and a warning is logged so the regression is visible.
57
+ logger.warning(
58
+ "start_recording called while session=%s was still active; "
59
+ "dropping %d buffered frames",
60
+ _recording_session_id,
61
+ len(_recording_frames),
62
+ )
63
+ _recording_session_id = session_id
64
+ _recording_frames = []
65
+
66
+
67
+ def stop_recording() -> list[bytes]:
68
+ """Close the recording slot and return the buffered Opus frames.
69
+
70
+ Returns an empty list if no recording was active. The slot is
71
+ cleared whether or not frames were captured so the next call to
72
+ :func:`start_recording` starts clean.
73
+ """
74
+ global _recording_session_id, _recording_frames
75
+ frames = _recording_frames
76
+ _recording_session_id = None
77
+ _recording_frames = []
78
+ return frames
79
+
80
+
81
+ def is_recording() -> bool:
82
+ """Return ``True`` when a recording slot is currently open."""
83
+ return _recording_session_id is not None
84
+
85
+
86
+ async def handle_audio_frame(data: bytes, session_id: str) -> None:
87
+ """Process an incoming binary Opus frame from the device.
88
+
89
+ When a recording slot is active (see :func:`start_recording`) AND
90
+ the frame belongs to the recording's session, appends the frame
91
+ to the in-memory buffer for later decoding by the STT
92
+ orchestrator. Frames from a different session — typical during
93
+ a connection swap, where the old WebSocket handler is still
94
+ draining incoming bytes after :meth:`ESP32Connection.disconnect`
95
+ has been called on the main task — are dropped so they cannot
96
+ bleed into the new connection's capture buffer.
97
+
98
+ Outside of an active recording the frame is logged at debug
99
+ level and discarded; the device may emit audio on its own (e.g.
100
+ after an autonomous wake-word detection) and the gateway has no
101
+ STT pipeline running for those frames yet.
102
+ """
103
+ if _recording_session_id is None:
104
+ logger.debug(
105
+ "audio_frame session=%s bytes=%d (discarded — no active recording)",
106
+ session_id,
107
+ len(data),
108
+ )
109
+ return
110
+ if _recording_session_id != session_id:
111
+ # A different connection is sending audio while a recording
112
+ # for another session is in flight. This happens when the ESP32
113
+ # reconnects: ``ESP32Manager._handler`` swaps in a new
114
+ # ``ESP32Connection`` and marks the old one disconnected,
115
+ # but the old socket's ``async for message in ws`` loop can
116
+ # still drain a frame or two before the close lands. Letting
117
+ # those into the buffer would corrupt the new session's
118
+ # transcription, so drop them here.
119
+ logger.debug(
120
+ "audio_frame session=%s bytes=%d (discarded — does not match "
121
+ "recording session=%s)",
122
+ session_id,
123
+ len(data),
124
+ _recording_session_id,
125
+ )
126
+ return
127
+ _recording_frames.append(data)
128
+ logger.debug(
129
+ "audio_frame session=%s bytes=%d buffered (recording active)",
130
+ session_id,
131
+ len(data),
132
+ )
133
+
134
+
135
+ async def push_opus_frames(
136
+ esp32: ESP32Manager,
137
+ frames: Iterable[bytes],
138
+ ) -> int:
139
+ """Push Opus frames to the connected ESP32.
140
+
141
+ Returns the number of frames sent so the caller can report this to
142
+ the MCP client. Raises :class:`ConnectionError` (via
143
+ :meth:`ESP32Manager.send_audio_frame`) if the device disconnects
144
+ mid-stream — the orchestrator turns that into a clean MCP error
145
+ rather than letting it bubble up as a stack trace.
146
+ """
147
+ sent = 0
148
+ for frame in frames:
149
+ await esp32.send_audio_frame(frame)
150
+ sent += 1
151
+ return sent
@@ -16,6 +16,7 @@ import asyncio
16
16
  import errno
17
17
  import logging
18
18
  import os
19
+ import platform
19
20
  import shutil
20
21
  import socket
21
22
  import subprocess
@@ -377,6 +378,56 @@ def _load_dotenv() -> None:
377
378
  load_dotenv()
378
379
 
379
380
 
381
+ # Default Homebrew prefixes that ship libopus.dylib on macOS. Apple
382
+ # Silicon installs default to ``/opt/homebrew``; Intel Macs use
383
+ # ``/usr/local``. Keeping both keeps the helper portable across
384
+ # contributor machines.
385
+ _HOMEBREW_LIB_DIRS = ("/opt/homebrew/lib", "/usr/local/lib")
386
+
387
+
388
+ def _ensure_libopus_findable() -> None:
389
+ """Make libopus reachable to opuslib's ``ctypes.find_library`` on macOS.
390
+
391
+ ``opuslib.api`` calls ``ctypes.util.find_library("opus")`` at
392
+ import time. On macOS that walks ``DYLD_LIBRARY_PATH`` plus a
393
+ couple of system-default directories — but not Homebrew's
394
+ ``/opt/homebrew/lib`` (Apple Silicon) or ``/usr/local/lib`` (Intel),
395
+ so a vanilla ``brew install opus`` lands a working libopus that
396
+ opuslib still cannot find. Users then see ``Could not find Opus
397
+ library`` even though the dylib is on disk.
398
+
399
+ Prepend any Homebrew-style lib directories that exist so the next
400
+ ``find_library`` call (triggered by the lazy ``import opuslib``
401
+ inside :func:`audio_utils.encode_opus_frames`) succeeds. Directories
402
+ already listed in ``DYLD_LIBRARY_PATH`` are skipped rather than moved.
403
+ NOTE(review): new entries are placed *ahead of* the operator's existing
404
+ ones, so a custom libopus build may be shadowed by Homebrew's. No-op on non-macOS hosts.
405
+ """
406
+ if platform.system() != "Darwin":
407
+ return
408
+
409
+ existing = os.environ.get("DYLD_LIBRARY_PATH", "")
410
+ paths: list[str] = [p for p in existing.split(":") if p]
411
+
412
+ prepended: list[str] = []
413
+ for candidate in _HOMEBREW_LIB_DIRS:
414
+ if candidate in paths:
415
+ continue
416
+ if not os.path.isdir(candidate):
417
+ continue
418
+ prepended.append(candidate)
419
+
420
+ if not prepended:
421
+ return
422
+
423
+ os.environ["DYLD_LIBRARY_PATH"] = ":".join(prepended + paths)
424
+ logger.debug(
425
+ "Prepended Homebrew lib dirs to DYLD_LIBRARY_PATH so opuslib "
426
+ "can find libopus: %s",
427
+ prepended,
428
+ )
429
+
430
+
380
431
  def _run_preflight() -> int:
381
432
  """Run preflight diagnostics. Returns the desired process exit code.
382
433
 
@@ -387,6 +438,7 @@ def _run_preflight() -> int:
387
438
  warns about a missing ``STACKCHAN_TOKEN``.
388
439
  """
389
440
  _load_dotenv()
441
+ _ensure_libopus_findable()
390
442
 
391
443
  issues = 0
392
444
  print(f"stackchan-mcp {__version__} preflight")
@@ -527,6 +579,7 @@ def main(argv: list[str] | None = None) -> None:
527
579
  sys.exit(_run_preflight())
528
580
 
529
581
  _load_dotenv()
582
+ _ensure_libopus_findable()
530
583
 
531
584
  logging.basicConfig(
532
585
  level=logging.INFO,
@@ -14,8 +14,10 @@ import uuid
14
14
  from typing import Any
15
15
 
16
16
  import websockets
17
+ import websockets.exceptions
17
18
  from websockets.asyncio.server import ServerConnection
18
19
 
20
+ from .audio_stream import handle_audio_frame
19
21
  from .protocol import HelloResponse, make_mcp_message, parse_jsonrpc_response
20
22
 
21
23
  logger = logging.getLogger(__name__)
@@ -36,6 +38,13 @@ class ESP32Connection:
36
38
  self._pending: dict[int, asyncio.Future[dict[str, Any]]] = {}
37
39
  self._connected = True
38
40
  self._initialized = False
41
+ # Device-declared WebSocket protocol version (from the hello
42
+ # message). Defaults to 1, which matches the firmware's default
43
+ # (firmware/main/protocols/websocket_protocol.h: ``version_ = 1``)
44
+ # and the audio framing this gateway emits today (raw Opus
45
+ # payload). v2/v3 add a BinaryProtocol header that this gateway
46
+ # does not yet wrap — see the follow-up issue to #70.
47
+ self.protocol_version: int = 1
39
48
 
40
49
  @property
41
50
  def connected(self) -> bool:
@@ -142,6 +151,89 @@ class ESP32Connection:
142
151
  method = payload.get("method", "")
143
152
  logger.info("ESP32 notification: %s", method)
144
153
 
154
+ async def _ws_send(self, payload: bytes | str) -> None:
155
+ """Send a payload, translating websockets errors to ConnectionError.
156
+
157
+ The ``websockets`` library raises its own exception hierarchy
158
+ (``ConnectionClosed`` and friends), which is *not* a subclass
159
+ of the built-in :class:`ConnectionError`. Without translation
160
+ the orchestrator's ``except ConnectionError`` filter — and the
161
+ MCP handler's ``except RuntimeError`` filter — would let those
162
+ errors leak as raw tracebacks into the MCP transport, breaking
163
+ the say() tool's clean error JSON contract on mid-stream
164
+ disconnect.
165
+ """
166
+ try:
167
+ await self._ws.send(payload)
168
+ except (
169
+ websockets.exceptions.ConnectionClosed,
170
+ OSError,
171
+ ) as exc:
172
+ # Mark the connection dead so subsequent calls fail fast
173
+ # rather than each one re-discovering the broken socket.
174
+ self.disconnect()
175
+ raise ConnectionError(f"WebSocket send failed: {exc}") from exc
176
+
177
+ async def send_audio_frame(self, opus_frame: bytes) -> None:
178
+ """Send a single Opus frame to the ESP32 as a WebSocket binary frame.
179
+
180
+ The device's ``OnData`` handler (firmware/main/protocols/
181
+ websocket_protocol.cc) treats every binary frame as an Opus
182
+ audio payload to feed into its decoder, so this method is the
183
+ TTS pipeline's egress point.
184
+ """
185
+ if not self._connected:
186
+ raise ConnectionError("ESP32 not connected")
187
+ await self._ws_send(opus_frame)
188
+
189
+ async def send_tts_state(self, state: str) -> None:
190
+ """Send a TTS state notification (``start`` / ``stop`` / ...).
191
+
192
+ The device's :func:`Application::OnIncomingJson` translates
193
+ ``{"type":"tts","state":"start"}`` into
194
+ :data:`kDeviceStateSpeaking`, which is the gate for
195
+ :func:`OnIncomingAudio` pushing packets into the decode queue
196
+ (see ``firmware/main/application.cc``). Without bracketing the
197
+ audio frames in start/stop, the device drops them on the floor
198
+ and the speaker stays silent — the TTS tool returns success
199
+ without anything actually playing.
200
+ """
201
+ if not self._connected:
202
+ raise ConnectionError("ESP32 not connected")
203
+ message = {
204
+ "session_id": self.session_id,
205
+ "type": "tts",
206
+ "state": state,
207
+ }
208
+ await self._ws_send(json.dumps(message))
209
+
210
+ async def send_listen_state(self, state: str, mode: str = "manual") -> None:
211
+ """Send a listen state notification (``start`` / ``stop``).
212
+
213
+ Server-driven counterpart to the device's existing
214
+ :func:`Protocol::SendStartListening` (Issue #91). The
215
+ firmware's :func:`Application::OnIncomingJson` dispatches
216
+ ``state: "start"`` to :func:`Application::StartListening` and
217
+ ``state: "stop"`` to :func:`Application::StopListening`.
218
+
219
+ ``mode`` is only included in the message for ``state="start"`` and is
220
+ carried on the wire for forward-compatibility — the firmware
221
+ accepts but ignores it in Phase 1 because
222
+ :func:`HandleStartListeningEvent` unconditionally enters
223
+ ``kListeningModeManualStop`` (the gateway controls the stop
224
+ boundary explicitly).
225
+ """
226
+ if not self._connected:
227
+ raise ConnectionError("ESP32 not connected")
228
+ message: dict[str, Any] = {
229
+ "session_id": self.session_id,
230
+ "type": "listen",
231
+ "state": state,
232
+ }
233
+ if state == "start":
234
+ message["mode"] = mode
235
+ await self._ws_send(json.dumps(message))
236
+
145
237
  def disconnect(self) -> None:
146
238
  """Mark connection as disconnected."""
147
239
  self._connected = False
@@ -167,6 +259,32 @@ class ESP32Manager:
167
259
  self._init_tasks: list[asyncio.Task] = []
168
260
  self._vision_url: str = ""
169
261
  self._vision_token: str = ""
262
+ # Per-device serialisation for TTS send sequences. Acquired by
263
+ # the orchestrator around the entire start → frames → stop
264
+ # block so concurrent ``say()`` invocations cannot interleave
265
+ # their Opus frames on the same WebSocket or overlap their
266
+ # ``tts.start``/``tts.stop`` notifications (which would yank
267
+ # the firmware out of ``kDeviceStateSpeaking`` mid-utterance
268
+ # and silently drop the remaining audio). The lock is scoped
269
+ # to the manager because the manager owns the device today —
270
+ # if multi-device support lands later, the lock should move
271
+ # onto :class:`ESP32Connection` instead.
272
+ self._tts_lock = asyncio.Lock()
273
+ # Inbound STT capture (Issue #91) shares the TTS lock rather
274
+ # than running on a separate one. The firmware's
275
+ # ``HandleStartListeningEvent`` aborts any in-flight TTS when
276
+ # a listen.start arrives mid-speaking (state ==
277
+ # ``kDeviceStateSpeaking`` → ``AbortSpeaking`` →
278
+ # ``SetListeningMode(kListeningModeManualStop)``), so two
279
+ # operations on the same device's audio path would
280
+ # otherwise step on each other: a ``listen()`` could yank a
281
+ # ``say()`` out of speaking mid-utterance, or a ``say()``
282
+ # could start streaming TTS frames into the buffer a
283
+ # concurrent ``listen()`` is capturing. Treating the audio
284
+ # path as a single resource makes the device's state machine
285
+ # observable from gateway code; if a full-duplex contract
286
+ # ever lands later the lock can split again.
287
+ self._listen_lock = self._tts_lock
170
288
 
171
289
  @property
172
290
  def device_connected(self) -> bool:
@@ -176,6 +294,26 @@ class ESP32Manager:
176
294
  def connection(self) -> ESP32Connection | None:
177
295
  return self._connection
178
296
 
297
+ @property
298
+ def tts_lock(self) -> asyncio.Lock:
299
+ """Per-device lock guarding the TTS send sequence.
300
+
301
+ See :attr:`_tts_lock` for the rationale; the orchestrator wraps
302
+ the start → frames → stop block in ``async with`` on this lock.
303
+ """
304
+ return self._tts_lock
305
+
306
+ @property
307
+ def listen_lock(self) -> asyncio.Lock:
308
+ """Per-device lock guarding the STT capture sequence.
309
+
310
+ See :attr:`_listen_lock` for the rationale; the orchestrator
311
+ wraps the entire ``listen.start`` → wait → ``listen.stop``
312
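+ block in ``async with`` on this lock so two concurrent
313
+ ``listen()`` calls cannot share the inbound recording slot. This is the same lock object as :attr:`tts_lock`, so ``say()`` and ``listen()`` also serialise against each other.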
314
+ """
315
+ return self._listen_lock
316
+
179
317
  async def start(
180
318
  self,
181
319
  host: str = "0.0.0.0",
@@ -246,7 +384,14 @@ class ESP32Manager:
246
384
  try:
247
385
  async for message in ws:
248
386
  if isinstance(message, bytes):
249
- # Binary = audio frame, ignore for now
387
+ # Binary = audio frame. Forward to the audio_stream
388
+ # module which buffers it for STT capture (Issue
389
+ # #91) when a recording slot is open, or discards
390
+ # it otherwise. Only protocol v1 is supported on
391
+ # the inbound side today; the orchestrator gates
392
+ # listen() on protocol_version=1 so v2/v3 frames
393
+ # cannot reach this point with recording active.
394
+ await handle_audio_frame(message, session_id)
250
395
  continue
251
396
 
252
397
  try:
@@ -265,6 +410,27 @@ class ESP32Manager:
265
410
  await ws.close()
266
411
  return
267
412
 
413
+ # Capture the device's WebSocket protocol version
414
+ # so callers (e.g. the TTS pipeline) can decide
415
+ # whether their wire format is compatible. The
416
+ # firmware accepts raw Opus only on v1; v2/v3 wrap
417
+ # the payload in a BinaryProtocol header.
418
+ raw_version = data.get("version", 1)
419
+ try:
420
+ connection.protocol_version = int(raw_version)
421
+ except (TypeError, ValueError):
422
+ connection.protocol_version = 1
423
+ if connection.protocol_version != 1:
424
+ logger.warning(
425
+ "ESP32 negotiated WebSocket protocol "
426
+ "version=%s; the gateway emits raw Opus "
427
+ "binary frames matching v1 only. TTS "
428
+ "calls (say) will be blocked at the "
429
+ "orchestrator until v2/v3 BinaryProtocol "
430
+ "header wrapping is implemented",
431
+ connection.protocol_version,
432
+ )
433
+
268
434
  # Send hello response
269
435
  resp = HelloResponse(session_id=session_id)
270
436
  await ws.send(resp.model_dump_json())
@@ -323,6 +489,40 @@ class ESP32Manager:
323
489
  return None, {"code": -32000, "message": "ESP32 not initialized"}
324
490
  return await self._connection.call_tool(name, arguments)
325
491
 
492
+ async def send_audio_frame(self, opus_frame: bytes) -> None:
493
+ """Push a single Opus frame to the connected device.
494
+
495
+ Used by the TTS pipeline to deliver synthesised audio. Raises
496
+ :class:`ConnectionError` if no device is currently attached so
497
+ the orchestrator can surface a clean error to the MCP client
498
+ instead of silently dropping audio.
499
+ """
500
+ if not self._connection or not self._connection.connected:
501
+ raise ConnectionError("No ESP32 device connected")
502
+ await self._connection.send_audio_frame(opus_frame)
503
+
504
+ async def send_tts_state(self, state: str) -> None:
505
+ """Send a TTS state notification (``start`` / ``stop`` / ...).
506
+
507
+ Required around audio frame egress so the device transitions
508
+ into ``kDeviceStateSpeaking`` and back; see
509
+ :meth:`ESP32Connection.send_tts_state` for the full rationale.
510
+ """
511
+ if not self._connection or not self._connection.connected:
512
+ raise ConnectionError("No ESP32 device connected")
513
+ await self._connection.send_tts_state(state)
514
+
515
+ async def send_listen_state(self, state: str, mode: str = "manual") -> None:
516
+ """Send a listen state notification to put the device into /
517
+ out of listening mode (Issue #91).
518
+
519
+ See :meth:`ESP32Connection.send_listen_state` for the wire
520
+ format and the firmware-side dispatch.
521
+ """
522
+ if not self._connection or not self._connection.connected:
523
+ raise ConnectionError("No ESP32 device connected")
524
+ await self._connection.send_listen_state(state, mode=mode)
525
+
326
526
  def get_status(self) -> dict[str, Any]:
327
527
  """Get current connection status."""
328
528
  if not self._connection or not self._connection.connected: