stackchan-mcp 0.3.0__tar.gz → 0.5.0__tar.gz

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (44) hide show
  1. {stackchan_mcp-0.3.0 → stackchan_mcp-0.5.0}/PKG-INFO +42 -2
  2. {stackchan_mcp-0.3.0 → stackchan_mcp-0.5.0}/README.md +35 -1
  3. {stackchan_mcp-0.3.0 → stackchan_mcp-0.5.0}/pyproject.toml +17 -1
  4. stackchan_mcp-0.5.0/stackchan_mcp/audio_stream.py +52 -0
  5. {stackchan_mcp-0.3.0 → stackchan_mcp-0.5.0}/stackchan_mcp/cli.py +53 -0
  6. {stackchan_mcp-0.3.0 → stackchan_mcp-0.5.0}/stackchan_mcp/esp32_client.py +128 -0
  7. {stackchan_mcp-0.3.0 → stackchan_mcp-0.5.0}/stackchan_mcp/stdio_server.py +227 -1
  8. stackchan_mcp-0.5.0/stackchan_mcp/tts/__init__.py +55 -0
  9. stackchan_mcp-0.5.0/stackchan_mcp/tts/audio_utils.py +177 -0
  10. stackchan_mcp-0.5.0/stackchan_mcp/tts/base.py +86 -0
  11. stackchan_mcp-0.5.0/stackchan_mcp/tts/orchestrator.py +282 -0
  12. stackchan_mcp-0.5.0/stackchan_mcp/tts/voicevox.py +184 -0
  13. stackchan_mcp-0.5.0/tests/_audio_fixtures.py +46 -0
  14. stackchan_mcp-0.5.0/tests/test_audio_stream.py +60 -0
  15. stackchan_mcp-0.5.0/tests/test_audio_utils.py +222 -0
  16. {stackchan_mcp-0.3.0 → stackchan_mcp-0.5.0}/tests/test_cli.py +101 -0
  17. {stackchan_mcp-0.3.0 → stackchan_mcp-0.5.0}/tests/test_esp32_client.py +155 -1
  18. stackchan_mcp-0.5.0/tests/test_orchestrator.py +541 -0
  19. stackchan_mcp-0.5.0/tests/test_stdio_server.py +377 -0
  20. stackchan_mcp-0.5.0/tests/test_tts_framework.py +173 -0
  21. stackchan_mcp-0.5.0/tests/test_voicevox.py +193 -0
  22. {stackchan_mcp-0.3.0 → stackchan_mcp-0.5.0}/uv.lock +21 -1
  23. stackchan_mcp-0.3.0/stackchan_mcp/audio_stream.py +0 -34
  24. stackchan_mcp-0.3.0/tests/test_stdio_server.py +0 -66
  25. {stackchan_mcp-0.3.0 → stackchan_mcp-0.5.0}/.env.example +0 -0
  26. {stackchan_mcp-0.3.0 → stackchan_mcp-0.5.0}/.gitignore +0 -0
  27. {stackchan_mcp-0.3.0 → stackchan_mcp-0.5.0}/LICENSE +0 -0
  28. {stackchan_mcp-0.3.0 → stackchan_mcp-0.5.0}/stackchan_mcp/__init__.py +0 -0
  29. {stackchan_mcp-0.3.0 → stackchan_mcp-0.5.0}/stackchan_mcp/__main__.py +0 -0
  30. {stackchan_mcp-0.3.0 → stackchan_mcp-0.5.0}/stackchan_mcp/capture_server.py +0 -0
  31. {stackchan_mcp-0.3.0 → stackchan_mcp-0.5.0}/stackchan_mcp/gateway.py +0 -0
  32. {stackchan_mcp-0.3.0 → stackchan_mcp-0.5.0}/stackchan_mcp/handlers/__init__.py +0 -0
  33. {stackchan_mcp-0.3.0 → stackchan_mcp-0.5.0}/stackchan_mcp/handlers/audio.py +0 -0
  34. {stackchan_mcp-0.3.0 → stackchan_mcp-0.5.0}/stackchan_mcp/handlers/camera.py +0 -0
  35. {stackchan_mcp-0.3.0 → stackchan_mcp-0.5.0}/stackchan_mcp/handlers/robot.py +0 -0
  36. {stackchan_mcp-0.3.0 → stackchan_mcp-0.5.0}/stackchan_mcp/mcp_router.py +0 -0
  37. {stackchan_mcp-0.3.0 → stackchan_mcp-0.5.0}/stackchan_mcp/protocol.py +0 -0
  38. {stackchan_mcp-0.3.0 → stackchan_mcp-0.5.0}/stackchan_mcp/server.py +0 -0
  39. {stackchan_mcp-0.3.0 → stackchan_mcp-0.5.0}/stackchan_mcp/tools.py +0 -0
  40. {stackchan_mcp-0.3.0 → stackchan_mcp-0.5.0}/tests/conftest.py +0 -0
  41. {stackchan_mcp-0.3.0 → stackchan_mcp-0.5.0}/tests/test_capture_server.py +0 -0
  42. {stackchan_mcp-0.3.0 → stackchan_mcp-0.5.0}/tests/test_gateway.py +0 -0
  43. {stackchan_mcp-0.3.0 → stackchan_mcp-0.5.0}/tests/test_mcp_router.py +0 -0
  44. {stackchan_mcp-0.3.0 → stackchan_mcp-0.5.0}/tests/test_protocol.py +0 -0
@@ -1,6 +1,6 @@
1
1
  Metadata-Version: 2.4
2
2
  Name: stackchan-mcp
3
- Version: 0.3.0
3
+ Version: 0.5.0
4
4
  Summary: Two-faced MCP gateway for StackChan (xiaozhi-esp32): bridges stdio MCP clients to the ESP32 over WebSocket + HTTP.
5
5
  Project-URL: Homepage, https://github.com/kisaragi-mochi/stackchan-mcp
6
6
  Project-URL: Repository, https://github.com/kisaragi-mochi/stackchan-mcp
@@ -27,6 +27,12 @@ Requires-Dist: mcp>=1.0
27
27
  Requires-Dist: pydantic>=2
28
28
  Requires-Dist: python-dotenv
29
29
  Requires-Dist: websockets>=12
30
+ Provides-Extra: tts
31
+ Requires-Dist: httpx>=0.27; extra == 'tts'
32
+ Requires-Dist: opuslib>=3; extra == 'tts'
33
+ Provides-Extra: tts-voicevox
34
+ Requires-Dist: httpx>=0.27; extra == 'tts-voicevox'
35
+ Requires-Dist: opuslib>=3; extra == 'tts-voicevox'
30
36
  Description-Content-Type: text/markdown
31
37
 
32
38
  # gateway
@@ -114,6 +120,29 @@ will notice the dropped WebSocket and retry while idle. The retry delay starts
114
120
  at 5 seconds and backs off up to 60 seconds. After the gateway is listening
115
121
  again, check `get_status` from the stdio MCP side to confirm the device is back.
116
122
 
123
+ ## Configuration changes
124
+
125
+ The gateway reads `.env` once at process start. Because the gateway runs as a
126
+ **stdio MCP server** (it has no standalone CLI mode beyond `--help` /
127
+ `--version` / `--check`), editing `.env` while it is connected to an MCP
128
+ client does not take effect on the running process — and killing the gateway
129
+ process directly will not auto-restart it; the MCP client owns the lifecycle.
130
+
131
+ After editing `.env` (for example to update `STACKCHAN_TOKEN`, `VISION_URL`,
132
+ or `VISION_TOKEN`):
133
+
134
+ 1. Reconnect the MCP client. In Claude Code this is `/mcp` to reconnect, or a
135
+ full Claude Code restart.
136
+ 2. Confirm `mcp__stackchan-mcp__get_status` returns `connected: true` with the
137
+ expected `tools_count`.
138
+ 3. If the ESP32 was already connected with a stale auth credential, hard-reset
139
+ the device (`esptool.py --before default_reset --after hard_reset chip_id`,
140
+ or DTR/RTS toggle via pyserial) so it reconnects with the fresh
141
+ configuration.
142
+
143
+ `STACKCHAN_TOKEN` takes precedence over the legacy `BEARER_TOKEN`; setting
144
+ either is enough, but if you have both, keep them aligned.
145
+
117
146
  ## Tests
118
147
 
119
148
  ```bash
@@ -165,8 +194,19 @@ Same shape, under `mcpServers`.
165
194
  | `get_touch_state` | Touch sensor state (press/release/stroke) |
166
195
  | `set_avatar(face)` | Switch avatar expression (`idle` / `happy` / `thinking` / `sad` / `surprised` / `embarrassed`), or `off` to hide the avatar and disable blink so the underlying xiaozhi-esp32 screens (WiFi config UI, OTA, settings) are visible. A subsequent `set_avatar(<other face>)` brings it back and restores blink. |
167
196
  | `set_blink(state)` | Blink animation on/off |
168
- | `set_mouth(state)` | Mouth shape (`closed` / `half` / `open` / `e` / `u`) |
197
+ | `set_mouth(state)` | Mouth shape (`closed` / `half` / `open` / `e` / `u`), one-shot, held until next call |
198
+ | `set_mouth_sequence(steps)` | Queue and play a list of `{shape, duration_ms}` steps locally for TTS lip-sync. The firmware walks the queue without per-step network RTT. Calling `set_mouth`, `set_avatar`, or this tool again interrupts the in-flight sequence; autonomous blink is paused while a sequence is playing. |
169
199
  | `check_vm_en` | Read PY32 VM EN GPIO state (servo power supply diagnostic) |
200
+ | `set_led(index, r, g, b)` | Set one of the 12 base RGB LEDs by index (`0..11`); channels `0..255`. Updates immediately. |
201
+ | `set_all_leds(r, g, b)` | Set all 12 base RGB LEDs to the same color. Updates immediately. |
202
+ | `set_leds(colors)` | Batch-set the first N LEDs from a `[[r,g,b], ...]` array (1..12 entries). Single I2C burst + one latch — use this for animations / multi-color patterns instead of N individual `set_led` calls. Trailing LEDs (beyond `len(colors)`) keep their previous color. Validation is atomic: a malformed entry rejects the whole call without mutating any LED. |
203
+ | `clear_leds` | Turn all 12 base RGB LEDs off. |
204
+
205
+ The 12 base LEDs are WS2812C pixels wired to the PY32L020 IO expander
206
+ (expander pin 13, not an ESP32 GPIO), so all four LED tools share the
207
+ PY32 I2C bus with the servo-power and Si12T touch paths. If the PY32
208
+ init fails at boot, the LED tools degrade with `available=false`
209
+ instead of cascading errors.
170
210
 
171
211
  The mapping from these names to ESP32-side `self.*` MCP tools is in
172
212
  `stackchan_mcp/stdio_server.py`.
@@ -83,6 +83,29 @@ will notice the dropped WebSocket and retry while idle. The retry delay starts
83
83
  at 5 seconds and backs off up to 60 seconds. After the gateway is listening
84
84
  again, check `get_status` from the stdio MCP side to confirm the device is back.
85
85
 
86
+ ## Configuration changes
87
+
88
+ The gateway reads `.env` once at process start. Because the gateway runs as a
89
+ **stdio MCP server** (it has no standalone CLI mode beyond `--help` /
90
+ `--version` / `--check`), editing `.env` while it is connected to an MCP
91
+ client does not take effect on the running process — and killing the gateway
92
+ process directly will not auto-restart it; the MCP client owns the lifecycle.
93
+
94
+ After editing `.env` (for example to update `STACKCHAN_TOKEN`, `VISION_URL`,
95
+ or `VISION_TOKEN`):
96
+
97
+ 1. Reconnect the MCP client. In Claude Code this is `/mcp` to reconnect, or a
98
+ full Claude Code restart.
99
+ 2. Confirm `mcp__stackchan-mcp__get_status` returns `connected: true` with the
100
+ expected `tools_count`.
101
+ 3. If the ESP32 was already connected with a stale auth credential, hard-reset
102
+ the device (`esptool.py --before default_reset --after hard_reset chip_id`,
103
+ or DTR/RTS toggle via pyserial) so it reconnects with the fresh
104
+ configuration.
105
+
106
+ `STACKCHAN_TOKEN` takes precedence over the legacy `BEARER_TOKEN`; setting
107
+ either is enough, but if you have both, keep them aligned.
108
+
86
109
  ## Tests
87
110
 
88
111
  ```bash
@@ -134,8 +157,19 @@ Same shape, under `mcpServers`.
134
157
  | `get_touch_state` | Touch sensor state (press/release/stroke) |
135
158
  | `set_avatar(face)` | Switch avatar expression (`idle` / `happy` / `thinking` / `sad` / `surprised` / `embarrassed`), or `off` to hide the avatar and disable blink so the underlying xiaozhi-esp32 screens (WiFi config UI, OTA, settings) are visible. A subsequent `set_avatar(<other face>)` brings it back and restores blink. |
136
159
  | `set_blink(state)` | Blink animation on/off |
137
- | `set_mouth(state)` | Mouth shape (`closed` / `half` / `open` / `e` / `u`) |
160
+ | `set_mouth(state)` | Mouth shape (`closed` / `half` / `open` / `e` / `u`), one-shot, held until next call |
161
+ | `set_mouth_sequence(steps)` | Queue and play a list of `{shape, duration_ms}` steps locally for TTS lip-sync. The firmware walks the queue without per-step network RTT. Calling `set_mouth`, `set_avatar`, or this tool again interrupts the in-flight sequence; autonomous blink is paused while a sequence is playing. |
138
162
  | `check_vm_en` | Read PY32 VM EN GPIO state (servo power supply diagnostic) |
163
+ | `set_led(index, r, g, b)` | Set one of the 12 base RGB LEDs by index (`0..11`); channels `0..255`. Updates immediately. |
164
+ | `set_all_leds(r, g, b)` | Set all 12 base RGB LEDs to the same color. Updates immediately. |
165
+ | `set_leds(colors)` | Batch-set the first N LEDs from a `[[r,g,b], ...]` array (1..12 entries). Single I2C burst + one latch — use this for animations / multi-color patterns instead of N individual `set_led` calls. Trailing LEDs (beyond `len(colors)`) keep their previous color. Validation is atomic: a malformed entry rejects the whole call without mutating any LED. |
166
+ | `clear_leds` | Turn all 12 base RGB LEDs off. |
167
+
168
+ The 12 base LEDs are WS2812C pixels wired to the PY32L020 IO expander
169
+ (expander pin 13, not an ESP32 GPIO), so all four LED tools share the
170
+ PY32 I2C bus with the servo-power and Si12T touch paths. If the PY32
171
+ init fails at boot, the LED tools degrade with `available=false`
172
+ instead of cascading errors.
139
173
 
140
174
  The mapping from these names to ESP32-side `self.*` MCP tools is in
141
175
  `stackchan_mcp/stdio_server.py`.
@@ -1,6 +1,6 @@
1
1
  [project]
2
2
  name = "stackchan-mcp"
3
- version = "0.3.0"
3
+ version = "0.5.0"
4
4
  description = "Two-faced MCP gateway for StackChan (xiaozhi-esp32): bridges stdio MCP clients to the ESP32 over WebSocket + HTTP."
5
5
  readme = "README.md"
6
6
  requires-python = ">=3.10"
@@ -32,6 +32,22 @@ dependencies = [
32
32
  "aiohttp>=3",
33
33
  ]
34
34
 
35
+ [project.optional-dependencies]
36
+ # Phase 4 TTS — see Issue #70.
37
+ # Concrete engines (VOICEVOX, Irodori) consume these libraries:
38
+ # * httpx — VOICEVOX HTTP engine client
39
+ # * opuslib — Opus encoding for the device's audio decoder
40
+ # `tts-voicevox` is a no-op alias provided so users can declare intent
41
+ # explicitly; the VOICEVOX engine itself is an external HTTP process and
42
+ # adds no Python dependencies of its own.
43
+ tts = [
44
+ "httpx>=0.27",
45
+ "opuslib>=3",
46
+ ]
47
+ tts-voicevox = [
48
+ "stackchan-mcp[tts]",
49
+ ]
50
+
35
51
  [project.urls]
36
52
  Homepage = "https://github.com/kisaragi-mochi/stackchan-mcp"
37
53
  Repository = "https://github.com/kisaragi-mochi/stackchan-mcp"
@@ -0,0 +1,52 @@
1
+ """Opus audio frame handling for the gateway <-> device link.
2
+
3
+ Outbound (TTS) frames are produced by
4
+ :mod:`stackchan_mcp.tts.audio_utils` and pushed here to the connected
5
+ ESP32 via :meth:`stackchan_mcp.esp32_client.ESP32Manager.send_audio_frame`.
6
+
7
+ The inbound side (STT pipeline, Phase 4 / Issue #8) is still a stub —
8
+ binary frames coming up from the device are logged and discarded for
9
+ now. Wiring that up belongs to the STT half of Phase 4.
10
+ """
11
+
12
+ from __future__ import annotations
13
+
14
+ import logging
15
+ from typing import TYPE_CHECKING, Iterable
16
+
17
+ if TYPE_CHECKING:
18
+ from .esp32_client import ESP32Manager
19
+
20
+ logger = logging.getLogger(__name__)
21
+
22
+
23
+ async def handle_audio_frame(data: bytes, session_id: str) -> None:
24
+ """Process an incoming binary Opus frame from the device (stub).
25
+
26
+ The STT half of Phase 4 will pipe this into a recogniser; until
27
+ then we just log the size at debug level.
28
+ """
29
+ logger.debug(
30
+ "audio_frame session=%s bytes=%d (discarded — STT not wired up)",
31
+ session_id,
32
+ len(data),
33
+ )
34
+
35
+
36
+ async def push_opus_frames(
37
+ esp32: ESP32Manager,
38
+ frames: Iterable[bytes],
39
+ ) -> int:
40
+ """Push Opus frames to the connected ESP32.
41
+
42
+ Returns the number of frames sent so the caller can report this to
43
+ the MCP client. Raises :class:`ConnectionError` (via
44
+ :meth:`ESP32Manager.send_audio_frame`) if the device disconnects
45
+ mid-stream — the orchestrator turns that into a clean MCP error
46
+ rather than letting it bubble up as a stack trace.
47
+ """
48
+ sent = 0
49
+ for frame in frames:
50
+ await esp32.send_audio_frame(frame)
51
+ sent += 1
52
+ return sent
@@ -16,6 +16,7 @@ import asyncio
16
16
  import errno
17
17
  import logging
18
18
  import os
19
+ import platform
19
20
  import shutil
20
21
  import socket
21
22
  import subprocess
@@ -377,6 +378,56 @@ def _load_dotenv() -> None:
377
378
  load_dotenv()
378
379
 
379
380
 
381
+ # Default Homebrew prefixes that ship libopus.dylib on macOS. Apple
382
+ # Silicon installs default to ``/opt/homebrew``; Intel Macs use
383
+ # ``/usr/local``. Keeping both keeps the helper portable across
384
+ # contributor machines.
385
+ _HOMEBREW_LIB_DIRS = ("/opt/homebrew/lib", "/usr/local/lib")
386
+
387
+
388
+ def _ensure_libopus_findable() -> None:
389
+ """Make libopus reachable to opuslib's ``ctypes.find_library`` on macOS.
390
+
391
+ ``opuslib.api`` calls ``ctypes.util.find_library("opus")`` at
392
+ import time. On macOS that walks ``DYLD_LIBRARY_PATH`` plus a
393
+ couple of system-default directories — but not Homebrew's
394
+ ``/opt/homebrew/lib`` (Apple Silicon) or ``/usr/local/lib`` (Intel),
395
+ so a vanilla ``brew install opus`` lands a working libopus that
396
+ opuslib still cannot find. Users then see ``Could not find Opus
397
+ library`` even though the dylib is on disk.
398
+
399
+ Prepend any Homebrew-style lib directories that exist so the next
400
+ ``find_library`` call (triggered by the lazy ``import opuslib``
401
+ inside :func:`audio_utils.encode_opus_frames`) succeeds. Directories
402
+ already listed in ``DYLD_LIBRARY_PATH`` are skipped and keep their
403
+ position; the remaining Homebrew directories are *prepended*, so they
404
+ are searched before the operator's other entries. No-op on non-macOS hosts.
405
+ """
406
+ if platform.system() != "Darwin":
407
+ return
408
+
409
+ existing = os.environ.get("DYLD_LIBRARY_PATH", "")
410
+ paths: list[str] = [p for p in existing.split(":") if p]
411
+
412
+ prepended: list[str] = []
413
+ for candidate in _HOMEBREW_LIB_DIRS:
414
+ if candidate in paths:
415
+ continue
416
+ if not os.path.isdir(candidate):
417
+ continue
418
+ prepended.append(candidate)
419
+
420
+ if not prepended:
421
+ return
422
+
423
+ os.environ["DYLD_LIBRARY_PATH"] = ":".join(prepended + paths)
424
+ logger.debug(
425
+ "Prepended Homebrew lib dirs to DYLD_LIBRARY_PATH so opuslib "
426
+ "can find libopus: %s",
427
+ prepended,
428
+ )
429
+
430
+
380
431
  def _run_preflight() -> int:
381
432
  """Run preflight diagnostics. Returns the desired process exit code.
382
433
 
@@ -387,6 +438,7 @@ def _run_preflight() -> int:
387
438
  warns about a missing ``STACKCHAN_TOKEN``.
388
439
  """
389
440
  _load_dotenv()
441
+ _ensure_libopus_findable()
390
442
 
391
443
  issues = 0
392
444
  print(f"stackchan-mcp {__version__} preflight")
@@ -527,6 +579,7 @@ def main(argv: list[str] | None = None) -> None:
527
579
  sys.exit(_run_preflight())
528
580
 
529
581
  _load_dotenv()
582
+ _ensure_libopus_findable()
530
583
 
531
584
  logging.basicConfig(
532
585
  level=logging.INFO,
@@ -14,6 +14,7 @@ import uuid
14
14
  from typing import Any
15
15
 
16
16
  import websockets
17
+ import websockets.exceptions
17
18
  from websockets.asyncio.server import ServerConnection
18
19
 
19
20
  from .protocol import HelloResponse, make_mcp_message, parse_jsonrpc_response
@@ -36,6 +37,13 @@ class ESP32Connection:
36
37
  self._pending: dict[int, asyncio.Future[dict[str, Any]]] = {}
37
38
  self._connected = True
38
39
  self._initialized = False
40
+ # Device-declared WebSocket protocol version (from the hello
41
+ # message). Defaults to 1, which matches the firmware's default
42
+ # (firmware/main/protocols/websocket_protocol.h: ``version_ = 1``)
43
+ # and the audio framing this gateway emits today (raw Opus
44
+ # payload). v2/v3 add a BinaryProtocol header that this gateway
45
+ # does not yet wrap — see the follow-up to Issue #70.
46
+ self.protocol_version: int = 1
39
47
 
40
48
  @property
41
49
  def connected(self) -> bool:
@@ -142,6 +150,62 @@ class ESP32Connection:
142
150
  method = payload.get("method", "")
143
151
  logger.info("ESP32 notification: %s", method)
144
152
 
153
+ async def _ws_send(self, payload: bytes | str) -> None:
154
+ """Send a payload, translating websockets errors to ConnectionError.
155
+
156
+ The ``websockets`` library raises its own exception hierarchy
157
+ (``ConnectionClosed`` and friends), which is *not* a subclass
158
+ of the built-in :class:`ConnectionError`. Without translation
159
+ the orchestrator's ``except ConnectionError`` filter — and the
160
+ MCP handler's ``except RuntimeError`` filter — would let those
161
+ errors leak as raw tracebacks into the MCP transport, breaking
162
+ the say() tool's clean error JSON contract on mid-stream
163
+ disconnect.
164
+ """
165
+ try:
166
+ await self._ws.send(payload)
167
+ except (
168
+ websockets.exceptions.ConnectionClosed,
169
+ OSError,
170
+ ) as exc:
171
+ # Mark the connection dead so subsequent calls fail fast
172
+ # rather than each one re-discovering the broken socket.
173
+ self.disconnect()
174
+ raise ConnectionError(f"WebSocket send failed: {exc}") from exc
175
+
176
+ async def send_audio_frame(self, opus_frame: bytes) -> None:
177
+ """Send a single Opus frame to the ESP32 as a WebSocket binary frame.
178
+
179
+ The device's ``OnData`` handler (firmware/main/protocols/
180
+ websocket_protocol.cc) treats every binary frame as an Opus
181
+ audio payload to feed into its decoder, so this method is the
182
+ TTS pipeline's egress point.
183
+ """
184
+ if not self._connected:
185
+ raise ConnectionError("ESP32 not connected")
186
+ await self._ws_send(opus_frame)
187
+
188
+ async def send_tts_state(self, state: str) -> None:
189
+ """Send a TTS state notification (``start`` / ``stop`` / ...).
190
+
191
+ The device's :func:`Application::OnIncomingJson` translates
192
+ ``{"type":"tts","state":"start"}`` into
193
+ :data:`kDeviceStateSpeaking`, which is the gate for
194
+ :func:`OnIncomingAudio` pushing packets into the decode queue
195
+ (see ``firmware/main/application.cc``). Without bracketing the
196
+ audio frames in start/stop, the device drops them on the floor
197
+ and the speaker stays silent — the TTS tool returns success
198
+ without anything actually playing.
199
+ """
200
+ if not self._connected:
201
+ raise ConnectionError("ESP32 not connected")
202
+ message = {
203
+ "session_id": self.session_id,
204
+ "type": "tts",
205
+ "state": state,
206
+ }
207
+ await self._ws_send(json.dumps(message))
208
+
145
209
  def disconnect(self) -> None:
146
210
  """Mark connection as disconnected."""
147
211
  self._connected = False
@@ -167,6 +231,17 @@ class ESP32Manager:
167
231
  self._init_tasks: list[asyncio.Task] = []
168
232
  self._vision_url: str = ""
169
233
  self._vision_token: str = ""
234
+ # Per-device serialisation for TTS send sequences. Acquired by
235
+ # the orchestrator around the entire start → frames → stop
236
+ # block so concurrent ``say()`` invocations cannot interleave
237
+ # their Opus frames on the same WebSocket or overlap their
238
+ # ``tts.start``/``tts.stop`` notifications (which would yank
239
+ # the firmware out of ``kDeviceStateSpeaking`` mid-utterance
240
+ # and silently drop the remaining audio). The lock is scoped
241
+ # to the manager because the manager owns the device today —
242
+ # if multi-device support lands later, the lock should move
243
+ # onto :class:`ESP32Connection` instead.
244
+ self._tts_lock = asyncio.Lock()
170
245
 
171
246
  @property
172
247
  def device_connected(self) -> bool:
@@ -176,6 +251,15 @@ class ESP32Manager:
176
251
  def connection(self) -> ESP32Connection | None:
177
252
  return self._connection
178
253
 
254
+ @property
255
+ def tts_lock(self) -> asyncio.Lock:
256
+ """Per-device lock guarding the TTS send sequence.
257
+
258
+ See :attr:`_tts_lock` for the rationale; the orchestrator wraps
259
+ the start → frames → stop block in ``async with`` on this lock.
260
+ """
261
+ return self._tts_lock
262
+
179
263
  async def start(
180
264
  self,
181
265
  host: str = "0.0.0.0",
@@ -265,6 +349,27 @@ class ESP32Manager:
265
349
  await ws.close()
266
350
  return
267
351
 
352
+ # Capture the device's WebSocket protocol version
353
+ # so callers (e.g. the TTS pipeline) can decide
354
+ # whether their wire format is compatible. The
355
+ # firmware accepts raw Opus only on v1; v2/v3 wrap
356
+ # the payload in a BinaryProtocol header.
357
+ raw_version = data.get("version", 1)
358
+ try:
359
+ connection.protocol_version = int(raw_version)
360
+ except (TypeError, ValueError):
361
+ connection.protocol_version = 1
362
+ if connection.protocol_version != 1:
363
+ logger.warning(
364
+ "ESP32 negotiated WebSocket protocol "
365
+ "version=%s; the gateway emits raw Opus "
366
+ "binary frames matching v1 only. TTS "
367
+ "calls (say) will be blocked at the "
368
+ "orchestrator until v2/v3 BinaryProtocol "
369
+ "header wrapping is implemented",
370
+ connection.protocol_version,
371
+ )
372
+
268
373
  # Send hello response
269
374
  resp = HelloResponse(session_id=session_id)
270
375
  await ws.send(resp.model_dump_json())
@@ -323,6 +428,29 @@ class ESP32Manager:
323
428
  return None, {"code": -32000, "message": "ESP32 not initialized"}
324
429
  return await self._connection.call_tool(name, arguments)
325
430
 
431
+ async def send_audio_frame(self, opus_frame: bytes) -> None:
432
+ """Push a single Opus frame to the connected device.
433
+
434
+ Used by the TTS pipeline to deliver synthesised audio. Raises
435
+ :class:`ConnectionError` if no device is currently attached so
436
+ the orchestrator can surface a clean error to the MCP client
437
+ instead of silently dropping audio.
438
+ """
439
+ if not self._connection or not self._connection.connected:
440
+ raise ConnectionError("No ESP32 device connected")
441
+ await self._connection.send_audio_frame(opus_frame)
442
+
443
+ async def send_tts_state(self, state: str) -> None:
444
+ """Send a TTS state notification (``start`` / ``stop`` / ...).
445
+
446
+ Required around audio frame egress so the device transitions
447
+ into ``kDeviceStateSpeaking`` and back; see
448
+ :meth:`ESP32Connection.send_tts_state` for the full rationale.
449
+ """
450
+ if not self._connection or not self._connection.connected:
451
+ raise ConnectionError("No ESP32 device connected")
452
+ await self._connection.send_tts_state(state)
453
+
326
454
  def get_status(self) -> dict[str, Any]:
327
455
  """Get current connection status."""
328
456
  if not self._connection or not self._connection.connected: