stackchan-mcp 0.4.0__tar.gz → 0.6.0__tar.gz
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- {stackchan_mcp-0.4.0 → stackchan_mcp-0.6.0}/PKG-INFO +25 -1
- {stackchan_mcp-0.4.0 → stackchan_mcp-0.6.0}/README.md +10 -0
- {stackchan_mcp-0.4.0 → stackchan_mcp-0.6.0}/pyproject.toml +35 -1
- stackchan_mcp-0.6.0/stackchan_mcp/audio_stream.py +151 -0
- {stackchan_mcp-0.4.0 → stackchan_mcp-0.6.0}/stackchan_mcp/cli.py +53 -0
- {stackchan_mcp-0.4.0 → stackchan_mcp-0.6.0}/stackchan_mcp/esp32_client.py +201 -1
- {stackchan_mcp-0.4.0 → stackchan_mcp-0.6.0}/stackchan_mcp/stdio_server.py +234 -0
- stackchan_mcp-0.6.0/stackchan_mcp/stt/__init__.py +62 -0
- stackchan_mcp-0.6.0/stackchan_mcp/stt/audio_utils.py +102 -0
- stackchan_mcp-0.6.0/stackchan_mcp/stt/base.py +94 -0
- stackchan_mcp-0.6.0/stackchan_mcp/stt/faster_whisper.py +217 -0
- stackchan_mcp-0.6.0/stackchan_mcp/stt/openai_whisper.py +177 -0
- stackchan_mcp-0.6.0/stackchan_mcp/stt/orchestrator.py +306 -0
- stackchan_mcp-0.6.0/stackchan_mcp/tts/__init__.py +55 -0
- stackchan_mcp-0.6.0/stackchan_mcp/tts/audio_utils.py +177 -0
- stackchan_mcp-0.6.0/stackchan_mcp/tts/base.py +86 -0
- stackchan_mcp-0.6.0/stackchan_mcp/tts/orchestrator.py +282 -0
- stackchan_mcp-0.6.0/stackchan_mcp/tts/voicevox.py +184 -0
- stackchan_mcp-0.6.0/tests/_audio_fixtures.py +46 -0
- stackchan_mcp-0.6.0/tests/test_audio_stream.py +145 -0
- stackchan_mcp-0.6.0/tests/test_audio_utils.py +222 -0
- {stackchan_mcp-0.4.0 → stackchan_mcp-0.6.0}/tests/test_cli.py +101 -0
- {stackchan_mcp-0.4.0 → stackchan_mcp-0.6.0}/tests/test_esp32_client.py +236 -1
- stackchan_mcp-0.6.0/tests/test_orchestrator.py +541 -0
- stackchan_mcp-0.6.0/tests/test_stdio_server.py +377 -0
- stackchan_mcp-0.6.0/tests/test_stt_audio_utils.py +100 -0
- stackchan_mcp-0.6.0/tests/test_stt_framework.py +195 -0
- stackchan_mcp-0.6.0/tests/test_stt_orchestrator.py +441 -0
- stackchan_mcp-0.6.0/tests/test_tts_framework.py +173 -0
- stackchan_mcp-0.6.0/tests/test_voicevox.py +193 -0
- {stackchan_mcp-0.4.0 → stackchan_mcp-0.6.0}/uv.lock +794 -2
- stackchan_mcp-0.4.0/stackchan_mcp/audio_stream.py +0 -34
- stackchan_mcp-0.4.0/tests/test_stdio_server.py +0 -148
- {stackchan_mcp-0.4.0 → stackchan_mcp-0.6.0}/.env.example +0 -0
- {stackchan_mcp-0.4.0 → stackchan_mcp-0.6.0}/.gitignore +0 -0
- {stackchan_mcp-0.4.0 → stackchan_mcp-0.6.0}/LICENSE +0 -0
- {stackchan_mcp-0.4.0 → stackchan_mcp-0.6.0}/stackchan_mcp/__init__.py +0 -0
- {stackchan_mcp-0.4.0 → stackchan_mcp-0.6.0}/stackchan_mcp/__main__.py +0 -0
- {stackchan_mcp-0.4.0 → stackchan_mcp-0.6.0}/stackchan_mcp/capture_server.py +0 -0
- {stackchan_mcp-0.4.0 → stackchan_mcp-0.6.0}/stackchan_mcp/gateway.py +0 -0
- {stackchan_mcp-0.4.0 → stackchan_mcp-0.6.0}/stackchan_mcp/handlers/__init__.py +0 -0
- {stackchan_mcp-0.4.0 → stackchan_mcp-0.6.0}/stackchan_mcp/handlers/audio.py +0 -0
- {stackchan_mcp-0.4.0 → stackchan_mcp-0.6.0}/stackchan_mcp/handlers/camera.py +0 -0
- {stackchan_mcp-0.4.0 → stackchan_mcp-0.6.0}/stackchan_mcp/handlers/robot.py +0 -0
- {stackchan_mcp-0.4.0 → stackchan_mcp-0.6.0}/stackchan_mcp/mcp_router.py +0 -0
- {stackchan_mcp-0.4.0 → stackchan_mcp-0.6.0}/stackchan_mcp/protocol.py +0 -0
- {stackchan_mcp-0.4.0 → stackchan_mcp-0.6.0}/stackchan_mcp/server.py +0 -0
- {stackchan_mcp-0.4.0 → stackchan_mcp-0.6.0}/stackchan_mcp/tools.py +0 -0
- {stackchan_mcp-0.4.0 → stackchan_mcp-0.6.0}/tests/conftest.py +0 -0
- {stackchan_mcp-0.4.0 → stackchan_mcp-0.6.0}/tests/test_capture_server.py +0 -0
- {stackchan_mcp-0.4.0 → stackchan_mcp-0.6.0}/tests/test_gateway.py +0 -0
- {stackchan_mcp-0.4.0 → stackchan_mcp-0.6.0}/tests/test_mcp_router.py +0 -0
- {stackchan_mcp-0.4.0 → stackchan_mcp-0.6.0}/tests/test_protocol.py +0 -0
|
@@ -1,6 +1,6 @@
|
|
|
1
1
|
Metadata-Version: 2.4
|
|
2
2
|
Name: stackchan-mcp
|
|
3
|
-
Version: 0.4.0
|
|
3
|
+
Version: 0.6.0
|
|
4
4
|
Summary: Two-faced MCP gateway for StackChan (xiaozhi-esp32): bridges stdio MCP clients to the ESP32 over WebSocket + HTTP.
|
|
5
5
|
Project-URL: Homepage, https://github.com/kisaragi-mochi/stackchan-mcp
|
|
6
6
|
Project-URL: Repository, https://github.com/kisaragi-mochi/stackchan-mcp
|
|
@@ -27,6 +27,20 @@ Requires-Dist: mcp>=1.0
|
|
|
27
27
|
Requires-Dist: pydantic>=2
|
|
28
28
|
Requires-Dist: python-dotenv
|
|
29
29
|
Requires-Dist: websockets>=12
|
|
30
|
+
Provides-Extra: stt
|
|
31
|
+
Requires-Dist: opuslib>=3; extra == 'stt'
|
|
32
|
+
Provides-Extra: stt-faster-whisper
|
|
33
|
+
Requires-Dist: faster-whisper>=1.0; extra == 'stt-faster-whisper'
|
|
34
|
+
Requires-Dist: opuslib>=3; extra == 'stt-faster-whisper'
|
|
35
|
+
Provides-Extra: stt-openai
|
|
36
|
+
Requires-Dist: openai>=1.0; extra == 'stt-openai'
|
|
37
|
+
Requires-Dist: opuslib>=3; extra == 'stt-openai'
|
|
38
|
+
Provides-Extra: tts
|
|
39
|
+
Requires-Dist: httpx>=0.27; extra == 'tts'
|
|
40
|
+
Requires-Dist: opuslib>=3; extra == 'tts'
|
|
41
|
+
Provides-Extra: tts-voicevox
|
|
42
|
+
Requires-Dist: httpx>=0.27; extra == 'tts-voicevox'
|
|
43
|
+
Requires-Dist: opuslib>=3; extra == 'tts-voicevox'
|
|
30
44
|
Description-Content-Type: text/markdown
|
|
31
45
|
|
|
32
46
|
# gateway
|
|
@@ -191,6 +205,16 @@ Same shape, under `mcpServers`.
|
|
|
191
205
|
| `set_mouth(state)` | Mouth shape (`closed` / `half` / `open` / `e` / `u`), one-shot, held until next call |
|
|
192
206
|
| `set_mouth_sequence(steps)` | Queue and play a list of `{shape, duration_ms}` steps locally for TTS lip-sync. The firmware walks the queue without per-step network RTT. Calling `set_mouth`, `set_avatar`, or this tool again interrupts the in-flight sequence; autonomous blink is paused while a sequence is playing. |
|
|
193
207
|
| `check_vm_en` | Read PY32 VM EN GPIO state (servo power supply diagnostic) |
|
|
208
|
+
| `set_led(index, r, g, b)` | Set one of the 12 base RGB LEDs by index (`0..11`); channels `0..255`. Updates immediately. |
|
|
209
|
+
| `set_all_leds(r, g, b)` | Set all 12 base RGB LEDs to the same color. Updates immediately. |
|
|
210
|
+
| `set_leds(colors)` | Batch-set the first N LEDs from a `[[r,g,b], ...]` array (1..12 entries). Single I2C burst + one latch — use this for animations / multi-color patterns instead of N individual `set_led` calls. Trailing LEDs (beyond `len(colors)`) keep their previous color. Validation is atomic: a malformed entry rejects the whole call without mutating any LED. |
|
|
211
|
+
| `clear_leds` | Turn all 12 base RGB LEDs off. |
|
|
212
|
+
|
|
213
|
+
The 12 base LEDs are 12× WS2812C wired to the PY32L020 IO expander
|
|
214
|
+
(expander pin 13, not an ESP32 GPIO), so all four LED tools share the
|
|
215
|
+
PY32 I2C bus with the servo-power and Si12T touch paths. If the PY32
|
|
216
|
+
init fails at boot, the LED tools degrade with `available=false`
|
|
217
|
+
instead of cascading errors.
|
|
194
218
|
|
|
195
219
|
The mapping from these names to ESP32-side `self.*` MCP tools is in
|
|
196
220
|
`stackchan_mcp/stdio_server.py`.
|
|
@@ -160,6 +160,16 @@ Same shape, under `mcpServers`.
|
|
|
160
160
|
| `set_mouth(state)` | Mouth shape (`closed` / `half` / `open` / `e` / `u`), one-shot, held until next call |
|
|
161
161
|
| `set_mouth_sequence(steps)` | Queue and play a list of `{shape, duration_ms}` steps locally for TTS lip-sync. The firmware walks the queue without per-step network RTT. Calling `set_mouth`, `set_avatar`, or this tool again interrupts the in-flight sequence; autonomous blink is paused while a sequence is playing. |
|
|
162
162
|
| `check_vm_en` | Read PY32 VM EN GPIO state (servo power supply diagnostic) |
|
|
163
|
+
| `set_led(index, r, g, b)` | Set one of the 12 base RGB LEDs by index (`0..11`); channels `0..255`. Updates immediately. |
|
|
164
|
+
| `set_all_leds(r, g, b)` | Set all 12 base RGB LEDs to the same color. Updates immediately. |
|
|
165
|
+
| `set_leds(colors)` | Batch-set the first N LEDs from a `[[r,g,b], ...]` array (1..12 entries). Single I2C burst + one latch — use this for animations / multi-color patterns instead of N individual `set_led` calls. Trailing LEDs (beyond `len(colors)`) keep their previous color. Validation is atomic: a malformed entry rejects the whole call without mutating any LED. |
|
|
166
|
+
| `clear_leds` | Turn all 12 base RGB LEDs off. |
|
|
167
|
+
|
|
168
|
+
The 12 base LEDs are 12× WS2812C wired to the PY32L020 IO expander
|
|
169
|
+
(expander pin 13, not an ESP32 GPIO), so all four LED tools share the
|
|
170
|
+
PY32 I2C bus with the servo-power and Si12T touch paths. If the PY32
|
|
171
|
+
init fails at boot, the LED tools degrade with `available=false`
|
|
172
|
+
instead of cascading errors.
|
|
163
173
|
|
|
164
174
|
The mapping from these names to ESP32-side `self.*` MCP tools is in
|
|
165
175
|
`stackchan_mcp/stdio_server.py`.
|
|
@@ -1,6 +1,6 @@
|
|
|
1
1
|
[project]
|
|
2
2
|
name = "stackchan-mcp"
|
|
3
|
-
version = "0.4.0"
|
|
3
|
+
version = "0.6.0"
|
|
4
4
|
description = "Two-faced MCP gateway for StackChan (xiaozhi-esp32): bridges stdio MCP clients to the ESP32 over WebSocket + HTTP."
|
|
5
5
|
readme = "README.md"
|
|
6
6
|
requires-python = ">=3.10"
|
|
@@ -32,6 +32,40 @@ dependencies = [
|
|
|
32
32
|
"aiohttp>=3",
|
|
33
33
|
]
|
|
34
34
|
|
|
35
|
+
[project.optional-dependencies]
|
|
36
|
+
# Phase 4 TTS — see Issue #70.
|
|
37
|
+
# Concrete engines (VOICEVOX, Irodori) consume these libraries:
|
|
38
|
+
# * httpx — VOICEVOX HTTP engine client
|
|
39
|
+
# * opuslib — Opus encoding for the device's audio decoder
|
|
40
|
+
# `tts-voicevox` is a no-op alias provided so users can declare intent
|
|
41
|
+
# explicitly; the VOICEVOX engine itself is an external HTTP process and
|
|
42
|
+
# adds no Python dependencies of its own.
|
|
43
|
+
tts = [
|
|
44
|
+
"httpx>=0.27",
|
|
45
|
+
"opuslib>=3",
|
|
46
|
+
]
|
|
47
|
+
tts-voicevox = [
|
|
48
|
+
"stackchan-mcp[tts]",
|
|
49
|
+
]
|
|
50
|
+
|
|
51
|
+
# Phase 4 STT — see Issue #91.
|
|
52
|
+
# The base `stt` extra carries `opuslib` for decoding the device's
|
|
53
|
+
# inbound Opus frames. Concrete engines live behind their own extras
|
|
54
|
+
# so users only pull in the heavy ML dependencies they actually need.
|
|
55
|
+
# * faster-whisper — local Whisper via CTranslate2 (default, MIT)
|
|
56
|
+
# * openai — OpenAI Whisper API client (cloud)
|
|
57
|
+
stt = [
|
|
58
|
+
"opuslib>=3",
|
|
59
|
+
]
|
|
60
|
+
stt-faster-whisper = [
|
|
61
|
+
"stackchan-mcp[stt]",
|
|
62
|
+
"faster-whisper>=1.0",
|
|
63
|
+
]
|
|
64
|
+
stt-openai = [
|
|
65
|
+
"stackchan-mcp[stt]",
|
|
66
|
+
"openai>=1.0",
|
|
67
|
+
]
|
|
68
|
+
|
|
35
69
|
[project.urls]
|
|
36
70
|
Homepage = "https://github.com/kisaragi-mochi/stackchan-mcp"
|
|
37
71
|
Repository = "https://github.com/kisaragi-mochi/stackchan-mcp"
|
|
@@ -0,0 +1,151 @@
|
|
|
1
|
+
"""Opus audio frame handling for the gateway <-> device link.
|
|
2
|
+
|
|
3
|
+
Outbound (TTS) frames are produced by
|
|
4
|
+
:mod:`stackchan_mcp.tts.audio_utils` and pushed here to the connected
|
|
5
|
+
ESP32 via :meth:`stackchan_mcp.esp32_client.ESP32Manager.send_audio_frame`.
|
|
6
|
+
|
|
7
|
+
The inbound side (STT pipeline, Phase 4 / Issue #91) is now wired:
|
|
8
|
+
binary frames coming up from the device land in
|
|
9
|
+
:func:`handle_audio_frame`, which buffers them into a module-level
|
|
10
|
+
recording slot when one is active. The
|
|
11
|
+
:mod:`stackchan_mcp.stt.orchestrator` opens the slot via
|
|
12
|
+
:func:`start_recording` before sending ``listen.start`` to the device
|
|
13
|
+
and closes it via :func:`stop_recording` after the capture window;
|
|
14
|
+
outside an active recording, inbound frames are still discarded.
|
|
15
|
+
|
|
16
|
+
The recording slot is intentionally a module-level singleton: the
|
|
17
|
+
device's :class:`stackchan_mcp.esp32_client.ESP32Manager` only manages
|
|
18
|
+
one connection, and the STT orchestrator serialises ``listen()`` calls
|
|
19
|
+
through :attr:`ESP32Manager.listen_lock`, so concurrent captures
|
|
20
|
+
cannot race the buffer. If multi-device support lands later, this
|
|
21
|
+
should move onto the connection object.
|
|
22
|
+
"""
|
|
23
|
+
|
|
24
|
+
from __future__ import annotations
|
|
25
|
+
|
|
26
|
+
import logging
|
|
27
|
+
from typing import TYPE_CHECKING, Iterable
|
|
28
|
+
|
|
29
|
+
if TYPE_CHECKING:
|
|
30
|
+
from .esp32_client import ESP32Manager
|
|
31
|
+
|
|
32
|
+
logger = logging.getLogger(__name__)
|
|
33
|
+
|
|
34
|
+
|
|
35
|
+
# --- Recording slot (inbound STT capture) ---------------------------------
|
|
36
|
+
#
|
|
37
|
+
# A single capture at a time is enforced by the orchestrator's
|
|
38
|
+
# ``listen_lock``; this module only owns the buffer itself.
|
|
39
|
+
|
|
40
|
+
_recording_session_id: str | None = None
|
|
41
|
+
_recording_frames: list[bytes] = []
|
|
42
|
+
|
|
43
|
+
|
|
44
|
+
def start_recording(session_id: str) -> None:
    """Begin buffering inbound Opus frames for ``session_id``.

    The module-level slot is reset unconditionally: any frames left over
    from an earlier capture are thrown away, so a capture that never
    reached :func:`stop_recording` cannot contaminate this one.

    Args:
        session_id: Identifier of the session whose frames should be kept.
    """
    global _recording_session_id, _recording_frames
    stale_session = _recording_session_id
    if stale_session is not None:
        # Captures are expected to be serialised by the caller, so an
        # overlap indicates a regression; surface it before the stale
        # buffer is dropped.
        logger.warning(
            "start_recording called while session=%s was still active; "
            "dropping %d buffered frames",
            stale_session,
            len(_recording_frames),
        )
    _recording_frames = []
    _recording_session_id = session_id
|
|
65
|
+
|
|
66
|
+
|
|
67
|
+
def stop_recording() -> list[bytes]:
    """End the current capture and hand back everything it buffered.

    The slot is reset on every call, whether or not a recording was
    active, so the following :func:`start_recording` begins from a clean
    state.

    Returns:
        The buffered Opus frames in arrival order; an empty list when
        nothing was buffered.
    """
    global _recording_session_id, _recording_frames
    # Swap in a new list so the returned buffer is never appended to again.
    captured, _recording_frames = _recording_frames, []
    _recording_session_id = None
    return captured
|
|
79
|
+
|
|
80
|
+
|
|
81
|
+
def is_recording() -> bool:
    """Report whether a capture is in progress (a session id is set)."""
    active_session = _recording_session_id
    return active_session is not None
|
|
84
|
+
|
|
85
|
+
|
|
86
|
+
async def handle_audio_frame(data: bytes, session_id: str) -> None:
    """Route one inbound binary Opus frame from the device.

    A frame is appended to the module-level capture buffer only when a
    recording is open *and* the frame's ``session_id`` matches the
    recording's session. Every other frame is logged at debug level and
    dropped:

    * no recording open — there is nothing to collect the audio into;
    * session mismatch — the frame comes from a different connection
      (per the design notes, typically an old socket still draining after
      a reconnect) and must not leak into the active capture.

    Args:
        data: Raw frame payload as received from the WebSocket.
        session_id: Session of the connection that delivered the frame.
    """
    active_session = _recording_session_id
    frame_size = len(data)

    if active_session is None:
        logger.debug(
            "audio_frame session=%s bytes=%d (discarded — no active recording)",
            session_id,
            frame_size,
        )
        return

    if active_session == session_id:
        _recording_frames.append(data)
        logger.debug(
            "audio_frame session=%s bytes=%d buffered (recording active)",
            session_id,
            frame_size,
        )
        return

    # Frame belongs to some other session: keep it out of the buffer so it
    # cannot corrupt the active capture.
    logger.debug(
        "audio_frame session=%s bytes=%d (discarded — does not match "
        "recording session=%s)",
        session_id,
        frame_size,
        active_session,
    )
|
|
133
|
+
|
|
134
|
+
|
|
135
|
+
async def push_opus_frames(
    esp32: ESP32Manager,
    frames: Iterable[bytes],
) -> int:
    """Send each Opus frame to the device, one awaited call per frame.

    Frames are forwarded in iteration order through
    ``esp32.send_audio_frame``. Any exception raised by that call (for
    example a :class:`ConnectionError` when the device has gone away)
    propagates to the caller and stops the stream.

    Args:
        esp32: Manager object exposing an awaitable ``send_audio_frame``.
        frames: Opus frames to deliver.

    Returns:
        The number of frames sent; ``0`` for an empty iterable.
    """
    delivered = 0
    for delivered, frame in enumerate(frames, start=1):
        await esp32.send_audio_frame(frame)
    return delivered
|
|
@@ -16,6 +16,7 @@ import asyncio
|
|
|
16
16
|
import errno
|
|
17
17
|
import logging
|
|
18
18
|
import os
|
|
19
|
+
import platform
|
|
19
20
|
import shutil
|
|
20
21
|
import socket
|
|
21
22
|
import subprocess
|
|
@@ -377,6 +378,56 @@ def _load_dotenv() -> None:
|
|
|
377
378
|
load_dotenv()
|
|
378
379
|
|
|
379
380
|
|
|
381
|
+
# Default Homebrew prefixes that ship libopus.dylib on macOS. Apple
|
|
382
|
+
# Silicon installs default to ``/opt/homebrew``; Intel Macs use
|
|
383
|
+
# ``/usr/local``. Keeping both keeps the helper portable across
|
|
384
|
+
# contributor machines.
|
|
385
|
+
_HOMEBREW_LIB_DIRS = ("/opt/homebrew/lib", "/usr/local/lib")
|
|
386
|
+
|
|
387
|
+
|
|
388
|
+
def _ensure_libopus_findable() -> None:
|
|
389
|
+
"""Make libopus reachable to opuslib's ``ctypes.find_library`` on macOS.
|
|
390
|
+
|
|
391
|
+
``opuslib.api`` calls ``ctypes.util.find_library("opus")`` at
|
|
392
|
+
import time. On macOS that walks ``DYLD_LIBRARY_PATH`` plus a
|
|
393
|
+
couple of system-default directories — but not Homebrew's
|
|
394
|
+
``/opt/homebrew/lib`` (Apple Silicon) or ``/usr/local/lib`` (Intel),
|
|
395
|
+
so a vanilla ``brew install opus`` lands a working libopus that
|
|
396
|
+
opuslib still cannot find. Users then see ``Could not find Opus
|
|
397
|
+
library`` even though the dylib is on disk.
|
|
398
|
+
|
|
399
|
+
Prepend any Homebrew-style lib directories that exist so the next
|
|
400
|
+
``find_library`` call (triggered by the lazy ``import opuslib``
|
|
401
|
+
inside :func:`audio_utils.encode_opus_frames`) succeeds. We
|
|
402
|
+
deliberately *prepend* and skip duplicates so an explicit
|
|
403
|
+
``DYLD_LIBRARY_PATH`` set by the operator (e.g. for a custom build
|
|
404
|
+
of libopus) keeps priority. No-op on non-macOS hosts.
|
|
405
|
+
"""
|
|
406
|
+
if platform.system() != "Darwin":
|
|
407
|
+
return
|
|
408
|
+
|
|
409
|
+
existing = os.environ.get("DYLD_LIBRARY_PATH", "")
|
|
410
|
+
paths: list[str] = [p for p in existing.split(":") if p]
|
|
411
|
+
|
|
412
|
+
prepended: list[str] = []
|
|
413
|
+
for candidate in _HOMEBREW_LIB_DIRS:
|
|
414
|
+
if candidate in paths:
|
|
415
|
+
continue
|
|
416
|
+
if not os.path.isdir(candidate):
|
|
417
|
+
continue
|
|
418
|
+
prepended.append(candidate)
|
|
419
|
+
|
|
420
|
+
if not prepended:
|
|
421
|
+
return
|
|
422
|
+
|
|
423
|
+
os.environ["DYLD_LIBRARY_PATH"] = ":".join(prepended + paths)
|
|
424
|
+
logger.debug(
|
|
425
|
+
"Prepended Homebrew lib dirs to DYLD_LIBRARY_PATH so opuslib "
|
|
426
|
+
"can find libopus: %s",
|
|
427
|
+
prepended,
|
|
428
|
+
)
|
|
429
|
+
|
|
430
|
+
|
|
380
431
|
def _run_preflight() -> int:
|
|
381
432
|
"""Run preflight diagnostics. Returns the desired process exit code.
|
|
382
433
|
|
|
@@ -387,6 +438,7 @@ def _run_preflight() -> int:
|
|
|
387
438
|
warns about a missing ``STACKCHAN_TOKEN``.
|
|
388
439
|
"""
|
|
389
440
|
_load_dotenv()
|
|
441
|
+
_ensure_libopus_findable()
|
|
390
442
|
|
|
391
443
|
issues = 0
|
|
392
444
|
print(f"stackchan-mcp {__version__} preflight")
|
|
@@ -527,6 +579,7 @@ def main(argv: list[str] | None = None) -> None:
|
|
|
527
579
|
sys.exit(_run_preflight())
|
|
528
580
|
|
|
529
581
|
_load_dotenv()
|
|
582
|
+
_ensure_libopus_findable()
|
|
530
583
|
|
|
531
584
|
logging.basicConfig(
|
|
532
585
|
level=logging.INFO,
|
|
@@ -14,8 +14,10 @@ import uuid
|
|
|
14
14
|
from typing import Any
|
|
15
15
|
|
|
16
16
|
import websockets
|
|
17
|
+
import websockets.exceptions
|
|
17
18
|
from websockets.asyncio.server import ServerConnection
|
|
18
19
|
|
|
20
|
+
from .audio_stream import handle_audio_frame
|
|
19
21
|
from .protocol import HelloResponse, make_mcp_message, parse_jsonrpc_response
|
|
20
22
|
|
|
21
23
|
logger = logging.getLogger(__name__)
|
|
@@ -36,6 +38,13 @@ class ESP32Connection:
|
|
|
36
38
|
self._pending: dict[int, asyncio.Future[dict[str, Any]]] = {}
|
|
37
39
|
self._connected = True
|
|
38
40
|
self._initialized = False
|
|
41
|
+
# Device-declared WebSocket protocol version (from the hello
|
|
42
|
+
# message). Defaults to 1, which matches the firmware's default
|
|
43
|
+
# (firmware/main/protocols/websocket_protocol.h: ``version_ = 1``)
|
|
44
|
+
# and the audio framing this gateway emits today (raw Opus
|
|
45
|
+
# payload). v2/v3 add a BinaryProtocol header that this gateway
|
|
46
|
+
# does not yet wrap — see Issue follow-up to #70.
|
|
47
|
+
self.protocol_version: int = 1
|
|
39
48
|
|
|
40
49
|
@property
|
|
41
50
|
def connected(self) -> bool:
|
|
@@ -142,6 +151,89 @@ class ESP32Connection:
|
|
|
142
151
|
method = payload.get("method", "")
|
|
143
152
|
logger.info("ESP32 notification: %s", method)
|
|
144
153
|
|
|
154
|
+
async def _ws_send(self, payload: bytes | str) -> None:
|
|
155
|
+
"""Send a payload, translating websockets errors to ConnectionError.
|
|
156
|
+
|
|
157
|
+
The ``websockets`` library raises its own exception hierarchy
|
|
158
|
+
(``ConnectionClosed`` and friends), which is *not* a subclass
|
|
159
|
+
of the built-in :class:`ConnectionError`. Without translation
|
|
160
|
+
the orchestrator's ``except ConnectionError`` filter — and the
|
|
161
|
+
MCP handler's ``except RuntimeError`` filter — would let those
|
|
162
|
+
errors leak as raw tracebacks into the MCP transport, breaking
|
|
163
|
+
the say() tool's clean error JSON contract on mid-stream
|
|
164
|
+
disconnect.
|
|
165
|
+
"""
|
|
166
|
+
try:
|
|
167
|
+
await self._ws.send(payload)
|
|
168
|
+
except (
|
|
169
|
+
websockets.exceptions.ConnectionClosed,
|
|
170
|
+
OSError,
|
|
171
|
+
) as exc:
|
|
172
|
+
# Mark the connection dead so subsequent calls fail fast
|
|
173
|
+
# rather than each one re-discovering the broken socket.
|
|
174
|
+
self.disconnect()
|
|
175
|
+
raise ConnectionError(f"WebSocket send failed: {exc}") from exc
|
|
176
|
+
|
|
177
|
+
async def send_audio_frame(self, opus_frame: bytes) -> None:
    """Deliver one Opus frame to the device over the WebSocket.

    The bytes are passed unchanged to :meth:`_ws_send`; this is the
    outbound path used for synthesised (TTS) audio.

    Raises:
        ConnectionError: The connection is already marked disconnected,
            or the send itself fails.
    """
    if self._connected:
        await self._ws_send(opus_frame)
        return
    raise ConnectionError("ESP32 not connected")
|
|
188
|
+
|
|
189
|
+
async def send_tts_state(self, state: str) -> None:
    """Notify the device of a TTS state change (``start`` / ``stop`` / ...).

    Emits a JSON text message of the form
    ``{"session_id": ..., "type": "tts", "state": state}``. Per the
    project notes, audio frames must be bracketed by ``start`` and
    ``stop`` notifications for the device to play them; that firmware
    behaviour is not visible from this code.

    Raises:
        ConnectionError: The connection is already marked disconnected,
            or the send itself fails.
    """
    if not self._connected:
        raise ConnectionError("ESP32 not connected")
    encoded = json.dumps(
        {"session_id": self.session_id, "type": "tts", "state": state}
    )
    await self._ws_send(encoded)
|
|
209
|
+
|
|
210
|
+
async def send_listen_state(self, state: str, mode: str = "manual") -> None:
    """Tell the device to enter or leave listening mode.

    Emits a JSON text message
    ``{"session_id": ..., "type": "listen", "state": state}``. A
    ``"mode"`` key carrying ``mode`` is added only when ``state`` is
    ``"start"``; for any other state ``mode`` is not sent.

    Args:
        state: Listen state, e.g. ``"start"`` or ``"stop"``.
        mode: Listening mode attached to ``start`` messages.

    Raises:
        ConnectionError: The connection is already marked disconnected,
            or the send itself fails.
    """
    if not self._connected:
        raise ConnectionError("ESP32 not connected")
    envelope = {
        "session_id": self.session_id,
        "type": "listen",
        "state": state,
    }
    # ``mode`` rides along only on the start notification.
    extras = {"mode": mode} if state == "start" else {}
    await self._ws_send(json.dumps({**envelope, **extras}))
|
|
236
|
+
|
|
145
237
|
def disconnect(self) -> None:
|
|
146
238
|
"""Mark connection as disconnected."""
|
|
147
239
|
self._connected = False
|
|
@@ -167,6 +259,32 @@ class ESP32Manager:
|
|
|
167
259
|
self._init_tasks: list[asyncio.Task] = []
|
|
168
260
|
self._vision_url: str = ""
|
|
169
261
|
self._vision_token: str = ""
|
|
262
|
+
# Per-device serialisation for TTS send sequences. Acquired by
|
|
263
|
+
# the orchestrator around the entire start → frames → stop
|
|
264
|
+
# block so concurrent ``say()`` invocations cannot interleave
|
|
265
|
+
# their Opus frames on the same WebSocket or overlap their
|
|
266
|
+
# ``tts.start``/``tts.stop`` notifications (which would yank
|
|
267
|
+
# the firmware out of ``kDeviceStateSpeaking`` mid-utterance
|
|
268
|
+
# and silently drop the remaining audio). The lock is scoped
|
|
269
|
+
# to the manager because the manager owns the device today —
|
|
270
|
+
# if multi-device support lands later, the lock should move
|
|
271
|
+
# onto :class:`ESP32Connection` instead.
|
|
272
|
+
self._tts_lock = asyncio.Lock()
|
|
273
|
+
# Inbound STT capture (Issue #91) shares the TTS lock rather
|
|
274
|
+
# than running on a separate one. The firmware's
|
|
275
|
+
# ``HandleStartListeningEvent`` aborts any in-flight TTS when
|
|
276
|
+
# a listen.start arrives mid-speaking (state ==
|
|
277
|
+
# ``kDeviceStateSpeaking`` → ``AbortSpeaking`` →
|
|
278
|
+
# ``SetListeningMode(kListeningModeManualStop)``), so two
|
|
279
|
+
# operations on the same device's audio path would
|
|
280
|
+
# otherwise step on each other: a ``listen()`` could yank a
|
|
281
|
+
# ``say()`` out of speaking mid-utterance, or a ``say()``
|
|
282
|
+
# could start streaming TTS frames into the buffer a
|
|
283
|
+
# concurrent ``listen()`` is capturing. Treating the audio
|
|
284
|
+
# path as a single resource makes the device's state machine
|
|
285
|
+
# observable from gateway code; if a full-duplex contract
|
|
286
|
+
# ever lands later the lock can split again.
|
|
287
|
+
self._listen_lock = self._tts_lock
|
|
170
288
|
|
|
171
289
|
@property
|
|
172
290
|
def device_connected(self) -> bool:
|
|
@@ -176,6 +294,26 @@ class ESP32Manager:
|
|
|
176
294
|
def connection(self) -> ESP32Connection | None:
|
|
177
295
|
return self._connection
|
|
178
296
|
|
|
297
|
+
@property
def tts_lock(self) -> asyncio.Lock:
    """Lock that serialises use of the device's audio path for TTS.

    Intended to be held for a whole start → frames → stop sequence so
    that concurrent senders cannot interleave. The manager assigns this
    same lock object to ``_listen_lock``, so holding it also excludes STT
    captures.
    """
    audio_lock = self._tts_lock
    return audio_lock
|
|
305
|
+
|
|
306
|
+
@property
def listen_lock(self) -> asyncio.Lock:
    """Lock that serialises STT capture sequences on the device.

    Intended to be held for a whole ``listen.start`` → wait →
    ``listen.stop`` sequence so two captures cannot share the inbound
    recording slot. The manager initialises ``_listen_lock`` to the same
    object as ``_tts_lock``, so holding it also excludes TTS playback.
    """
    audio_lock = self._listen_lock
    return audio_lock
|
|
316
|
+
|
|
179
317
|
async def start(
|
|
180
318
|
self,
|
|
181
319
|
host: str = "0.0.0.0",
|
|
@@ -246,7 +384,14 @@ class ESP32Manager:
|
|
|
246
384
|
try:
|
|
247
385
|
async for message in ws:
|
|
248
386
|
if isinstance(message, bytes):
|
|
249
|
-
# Binary = audio frame
|
|
387
|
+
# Binary = audio frame. Forward to the audio_stream
|
|
388
|
+
# module which buffers it for STT capture (Issue
|
|
389
|
+
# #91) when a recording slot is open, or discards
|
|
390
|
+
# it otherwise. Only protocol v1 is supported on
|
|
391
|
+
# the inbound side today; the orchestrator gates
|
|
392
|
+
# listen() on protocol_version=1 so v2/v3 frames
|
|
393
|
+
# cannot reach this point with recording active.
|
|
394
|
+
await handle_audio_frame(message, session_id)
|
|
250
395
|
continue
|
|
251
396
|
|
|
252
397
|
try:
|
|
@@ -265,6 +410,27 @@ class ESP32Manager:
|
|
|
265
410
|
await ws.close()
|
|
266
411
|
return
|
|
267
412
|
|
|
413
|
+
# Capture the device's WebSocket protocol version
|
|
414
|
+
# so callers (e.g. the TTS pipeline) can decide
|
|
415
|
+
# whether their wire format is compatible. The
|
|
416
|
+
# firmware accepts raw Opus only on v1; v2/v3 wrap
|
|
417
|
+
# the payload in a BinaryProtocol header.
|
|
418
|
+
raw_version = data.get("version", 1)
|
|
419
|
+
try:
|
|
420
|
+
connection.protocol_version = int(raw_version)
|
|
421
|
+
except (TypeError, ValueError):
|
|
422
|
+
connection.protocol_version = 1
|
|
423
|
+
if connection.protocol_version != 1:
|
|
424
|
+
logger.warning(
|
|
425
|
+
"ESP32 negotiated WebSocket protocol "
|
|
426
|
+
"version=%s; the gateway emits raw Opus "
|
|
427
|
+
"binary frames matching v1 only. TTS "
|
|
428
|
+
"calls (say) will be blocked at the "
|
|
429
|
+
"orchestrator until v2/v3 BinaryProtocol "
|
|
430
|
+
"header wrapping is implemented",
|
|
431
|
+
connection.protocol_version,
|
|
432
|
+
)
|
|
433
|
+
|
|
268
434
|
# Send hello response
|
|
269
435
|
resp = HelloResponse(session_id=session_id)
|
|
270
436
|
await ws.send(resp.model_dump_json())
|
|
@@ -323,6 +489,40 @@ class ESP32Manager:
|
|
|
323
489
|
return None, {"code": -32000, "message": "ESP32 not initialized"}
|
|
324
490
|
return await self._connection.call_tool(name, arguments)
|
|
325
491
|
|
|
492
|
+
async def send_audio_frame(self, opus_frame: bytes) -> None:
    """Forward one Opus frame to the currently attached device.

    Delegates to the active connection's ``send_audio_frame``.

    Raises:
        ConnectionError: No connection is attached, or the attached
            connection reports itself as not connected.
    """
    link = self._connection
    if not (link and link.connected):
        raise ConnectionError("No ESP32 device connected")
    await link.send_audio_frame(opus_frame)
|
|
503
|
+
|
|
504
|
+
async def send_tts_state(self, state: str) -> None:
    """Relay a TTS state notification (``start`` / ``stop`` / ...).

    Delegates to the active connection's ``send_tts_state``.

    Raises:
        ConnectionError: No connection is attached, or the attached
            connection reports itself as not connected.
    """
    link = self._connection
    if not (link and link.connected):
        raise ConnectionError("No ESP32 device connected")
    await link.send_tts_state(state)
|
|
514
|
+
|
|
515
|
+
async def send_listen_state(self, state: str, mode: str = "manual") -> None:
    """Relay a listen state notification to the attached device.

    Delegates to the active connection's ``send_listen_state``, passing
    ``mode`` through as a keyword argument.

    Args:
        state: Listen state, e.g. ``"start"`` or ``"stop"``.
        mode: Listening mode forwarded to the connection.

    Raises:
        ConnectionError: No connection is attached, or the attached
            connection reports itself as not connected.
    """
    link = self._connection
    if not (link and link.connected):
        raise ConnectionError("No ESP32 device connected")
    await link.send_listen_state(state, mode=mode)
|
|
525
|
+
|
|
326
526
|
def get_status(self) -> dict[str, Any]:
|
|
327
527
|
"""Get current connection status."""
|
|
328
528
|
if not self._connection or not self._connection.connected:
|