stackchan-mcp 0.5.0__tar.gz → 0.7.0__tar.gz
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- {stackchan_mcp-0.5.0 → stackchan_mcp-0.7.0}/PKG-INFO +9 -1
- {stackchan_mcp-0.5.0 → stackchan_mcp-0.7.0}/pyproject.toml +19 -1
- stackchan_mcp-0.7.0/stackchan_mcp/audio_stream.py +151 -0
- {stackchan_mcp-0.5.0 → stackchan_mcp-0.7.0}/stackchan_mcp/esp32_client.py +73 -1
- {stackchan_mcp-0.5.0 → stackchan_mcp-0.7.0}/stackchan_mcp/stdio_server.py +176 -3
- stackchan_mcp-0.7.0/stackchan_mcp/stt/__init__.py +62 -0
- stackchan_mcp-0.7.0/stackchan_mcp/stt/audio_utils.py +102 -0
- stackchan_mcp-0.7.0/stackchan_mcp/stt/base.py +94 -0
- stackchan_mcp-0.7.0/stackchan_mcp/stt/faster_whisper.py +217 -0
- stackchan_mcp-0.7.0/stackchan_mcp/stt/openai_whisper.py +177 -0
- stackchan_mcp-0.7.0/stackchan_mcp/stt/orchestrator.py +552 -0
- stackchan_mcp-0.7.0/tests/test_audio_stream.py +145 -0
- {stackchan_mcp-0.5.0 → stackchan_mcp-0.7.0}/tests/test_esp32_client.py +81 -0
- {stackchan_mcp-0.5.0 → stackchan_mcp-0.7.0}/tests/test_stdio_server.py +194 -0
- stackchan_mcp-0.7.0/tests/test_stt_audio_utils.py +100 -0
- stackchan_mcp-0.7.0/tests/test_stt_framework.py +195 -0
- stackchan_mcp-0.7.0/tests/test_stt_orchestrator.py +1150 -0
- {stackchan_mcp-0.5.0 → stackchan_mcp-0.7.0}/uv.lock +775 -3
- stackchan_mcp-0.5.0/stackchan_mcp/audio_stream.py +0 -52
- stackchan_mcp-0.5.0/tests/test_audio_stream.py +0 -60
- {stackchan_mcp-0.5.0 → stackchan_mcp-0.7.0}/.env.example +0 -0
- {stackchan_mcp-0.5.0 → stackchan_mcp-0.7.0}/.gitignore +0 -0
- {stackchan_mcp-0.5.0 → stackchan_mcp-0.7.0}/LICENSE +0 -0
- {stackchan_mcp-0.5.0 → stackchan_mcp-0.7.0}/README.md +0 -0
- {stackchan_mcp-0.5.0 → stackchan_mcp-0.7.0}/stackchan_mcp/__init__.py +0 -0
- {stackchan_mcp-0.5.0 → stackchan_mcp-0.7.0}/stackchan_mcp/__main__.py +0 -0
- {stackchan_mcp-0.5.0 → stackchan_mcp-0.7.0}/stackchan_mcp/capture_server.py +0 -0
- {stackchan_mcp-0.5.0 → stackchan_mcp-0.7.0}/stackchan_mcp/cli.py +0 -0
- {stackchan_mcp-0.5.0 → stackchan_mcp-0.7.0}/stackchan_mcp/gateway.py +0 -0
- {stackchan_mcp-0.5.0 → stackchan_mcp-0.7.0}/stackchan_mcp/handlers/__init__.py +0 -0
- {stackchan_mcp-0.5.0 → stackchan_mcp-0.7.0}/stackchan_mcp/handlers/audio.py +0 -0
- {stackchan_mcp-0.5.0 → stackchan_mcp-0.7.0}/stackchan_mcp/handlers/camera.py +0 -0
- {stackchan_mcp-0.5.0 → stackchan_mcp-0.7.0}/stackchan_mcp/handlers/robot.py +0 -0
- {stackchan_mcp-0.5.0 → stackchan_mcp-0.7.0}/stackchan_mcp/mcp_router.py +0 -0
- {stackchan_mcp-0.5.0 → stackchan_mcp-0.7.0}/stackchan_mcp/protocol.py +0 -0
- {stackchan_mcp-0.5.0 → stackchan_mcp-0.7.0}/stackchan_mcp/server.py +0 -0
- {stackchan_mcp-0.5.0 → stackchan_mcp-0.7.0}/stackchan_mcp/tools.py +0 -0
- {stackchan_mcp-0.5.0 → stackchan_mcp-0.7.0}/stackchan_mcp/tts/__init__.py +0 -0
- {stackchan_mcp-0.5.0 → stackchan_mcp-0.7.0}/stackchan_mcp/tts/audio_utils.py +0 -0
- {stackchan_mcp-0.5.0 → stackchan_mcp-0.7.0}/stackchan_mcp/tts/base.py +0 -0
- {stackchan_mcp-0.5.0 → stackchan_mcp-0.7.0}/stackchan_mcp/tts/orchestrator.py +0 -0
- {stackchan_mcp-0.5.0 → stackchan_mcp-0.7.0}/stackchan_mcp/tts/voicevox.py +0 -0
- {stackchan_mcp-0.5.0 → stackchan_mcp-0.7.0}/tests/_audio_fixtures.py +0 -0
- {stackchan_mcp-0.5.0 → stackchan_mcp-0.7.0}/tests/conftest.py +0 -0
- {stackchan_mcp-0.5.0 → stackchan_mcp-0.7.0}/tests/test_audio_utils.py +0 -0
- {stackchan_mcp-0.5.0 → stackchan_mcp-0.7.0}/tests/test_capture_server.py +0 -0
- {stackchan_mcp-0.5.0 → stackchan_mcp-0.7.0}/tests/test_cli.py +0 -0
- {stackchan_mcp-0.5.0 → stackchan_mcp-0.7.0}/tests/test_gateway.py +0 -0
- {stackchan_mcp-0.5.0 → stackchan_mcp-0.7.0}/tests/test_mcp_router.py +0 -0
- {stackchan_mcp-0.5.0 → stackchan_mcp-0.7.0}/tests/test_orchestrator.py +0 -0
- {stackchan_mcp-0.5.0 → stackchan_mcp-0.7.0}/tests/test_protocol.py +0 -0
- {stackchan_mcp-0.5.0 → stackchan_mcp-0.7.0}/tests/test_tts_framework.py +0 -0
- {stackchan_mcp-0.5.0 → stackchan_mcp-0.7.0}/tests/test_voicevox.py +0 -0
|
@@ -1,6 +1,6 @@
|
|
|
1
1
|
Metadata-Version: 2.4
|
|
2
2
|
Name: stackchan-mcp
|
|
3
|
-
Version: 0.5.0
|
|
3
|
+
Version: 0.7.0
|
|
4
4
|
Summary: Two-faced MCP gateway for StackChan (xiaozhi-esp32): bridges stdio MCP clients to the ESP32 over WebSocket + HTTP.
|
|
5
5
|
Project-URL: Homepage, https://github.com/kisaragi-mochi/stackchan-mcp
|
|
6
6
|
Project-URL: Repository, https://github.com/kisaragi-mochi/stackchan-mcp
|
|
@@ -27,6 +27,14 @@ Requires-Dist: mcp>=1.0
|
|
|
27
27
|
Requires-Dist: pydantic>=2
|
|
28
28
|
Requires-Dist: python-dotenv
|
|
29
29
|
Requires-Dist: websockets>=12
|
|
30
|
+
Provides-Extra: stt
|
|
31
|
+
Requires-Dist: opuslib>=3; extra == 'stt'
|
|
32
|
+
Provides-Extra: stt-faster-whisper
|
|
33
|
+
Requires-Dist: faster-whisper>=1.0; extra == 'stt-faster-whisper'
|
|
34
|
+
Requires-Dist: opuslib>=3; extra == 'stt-faster-whisper'
|
|
35
|
+
Provides-Extra: stt-openai
|
|
36
|
+
Requires-Dist: openai>=1.0; extra == 'stt-openai'
|
|
37
|
+
Requires-Dist: opuslib>=3; extra == 'stt-openai'
|
|
30
38
|
Provides-Extra: tts
|
|
31
39
|
Requires-Dist: httpx>=0.27; extra == 'tts'
|
|
32
40
|
Requires-Dist: opuslib>=3; extra == 'tts'
|
|
@@ -1,6 +1,6 @@
|
|
|
1
1
|
[project]
|
|
2
2
|
name = "stackchan-mcp"
|
|
3
|
-
version = "0.5.0"
|
|
3
|
+
version = "0.7.0"
|
|
4
4
|
description = "Two-faced MCP gateway for StackChan (xiaozhi-esp32): bridges stdio MCP clients to the ESP32 over WebSocket + HTTP."
|
|
5
5
|
readme = "README.md"
|
|
6
6
|
requires-python = ">=3.10"
|
|
@@ -48,6 +48,24 @@ tts-voicevox = [
|
|
|
48
48
|
"stackchan-mcp[tts]",
|
|
49
49
|
]
|
|
50
50
|
|
|
51
|
+
# Phase 4 STT — see Issue #91.
|
|
52
|
+
# The base `stt` extra carries `opuslib` for decoding the device's
|
|
53
|
+
# inbound Opus frames. Concrete engines live behind their own extras
|
|
54
|
+
# so users only pull in the heavy ML dependencies they actually need.
|
|
55
|
+
# * faster-whisper — local Whisper via CTranslate2 (default, MIT)
|
|
56
|
+
# * openai — OpenAI Whisper API client (cloud)
|
|
57
|
+
stt = [
|
|
58
|
+
"opuslib>=3",
|
|
59
|
+
]
|
|
60
|
+
stt-faster-whisper = [
|
|
61
|
+
"stackchan-mcp[stt]",
|
|
62
|
+
"faster-whisper>=1.0",
|
|
63
|
+
]
|
|
64
|
+
stt-openai = [
|
|
65
|
+
"stackchan-mcp[stt]",
|
|
66
|
+
"openai>=1.0",
|
|
67
|
+
]
|
|
68
|
+
|
|
51
69
|
[project.urls]
|
|
52
70
|
Homepage = "https://github.com/kisaragi-mochi/stackchan-mcp"
|
|
53
71
|
Repository = "https://github.com/kisaragi-mochi/stackchan-mcp"
|
|
@@ -0,0 +1,151 @@
|
|
|
1
|
+
"""Opus audio frame handling for the gateway <-> device link.
|
|
2
|
+
|
|
3
|
+
Outbound (TTS) frames are produced by
|
|
4
|
+
:mod:`stackchan_mcp.tts.audio_utils` and pushed here to the connected
|
|
5
|
+
ESP32 via :meth:`stackchan_mcp.esp32_client.ESP32Manager.send_audio_frame`.
|
|
6
|
+
|
|
7
|
+
The inbound side (STT pipeline, Phase 4 / Issue #91) is now wired:
|
|
8
|
+
binary frames coming up from the device land in
|
|
9
|
+
:func:`handle_audio_frame`, which buffers them into a module-level
|
|
10
|
+
recording slot when one is active. The
|
|
11
|
+
:mod:`stackchan_mcp.stt.orchestrator` opens the slot via
|
|
12
|
+
:func:`start_recording` before sending ``listen.start`` to the device
|
|
13
|
+
and closes it via :func:`stop_recording` after the capture window;
|
|
14
|
+
outside an active recording, inbound frames are still discarded.
|
|
15
|
+
|
|
16
|
+
The recording slot is intentionally a module-level singleton: the
|
|
17
|
+
device's :class:`stackchan_mcp.esp32_client.ESP32Manager` only manages
|
|
18
|
+
one connection, and the STT orchestrator serialises ``listen()`` calls
|
|
19
|
+
through :attr:`ESP32Manager.listen_lock`, so concurrent captures
|
|
20
|
+
cannot race the buffer. If multi-device support lands later, this
|
|
21
|
+
should move onto the connection object.
|
|
22
|
+
"""
|
|
23
|
+
|
|
24
|
+
from __future__ import annotations
|
|
25
|
+
|
|
26
|
+
import logging
|
|
27
|
+
from typing import TYPE_CHECKING, Iterable
|
|
28
|
+
|
|
29
|
+
if TYPE_CHECKING:
|
|
30
|
+
from .esp32_client import ESP32Manager
|
|
31
|
+
|
|
32
|
+
logger = logging.getLogger(__name__)
|
|
33
|
+
|
|
34
|
+
|
|
35
|
+
# --- Recording slot (inbound STT capture) ---------------------------------
|
|
36
|
+
#
|
|
37
|
+
# A single capture at a time is enforced by the orchestrator's
|
|
38
|
+
# ``listen_lock``; this module only owns the buffer itself.
|
|
39
|
+
|
|
40
|
+
_recording_session_id: str | None = None
|
|
41
|
+
_recording_frames: list[bytes] = []
|
|
42
|
+
|
|
43
|
+
|
|
44
|
+
def start_recording(session_id: str) -> None:
|
|
45
|
+
"""Open a fresh recording slot for ``session_id``.
|
|
46
|
+
|
|
47
|
+
Any frames already buffered are discarded so a previous call that
|
|
48
|
+
crashed before ``stop_recording`` cannot leak into the next
|
|
49
|
+
capture. The orchestrator wraps start/stop in a try/finally to
|
|
50
|
+
guarantee the slot is closed even on error.
|
|
51
|
+
"""
|
|
52
|
+
global _recording_session_id, _recording_frames
|
|
53
|
+
if _recording_session_id is not None:
|
|
54
|
+
# Defensive: the lock should prevent this, but if it ever
|
|
55
|
+
# fires we leak no audio — just log loudly so the regression
|
|
56
|
+
# is visible.
|
|
57
|
+
logger.warning(
|
|
58
|
+
"start_recording called while session=%s was still active; "
|
|
59
|
+
"dropping %d buffered frames",
|
|
60
|
+
_recording_session_id,
|
|
61
|
+
len(_recording_frames),
|
|
62
|
+
)
|
|
63
|
+
_recording_session_id = session_id
|
|
64
|
+
_recording_frames = []
|
|
65
|
+
|
|
66
|
+
|
|
67
|
+
def stop_recording() -> list[bytes]:
|
|
68
|
+
"""Close the recording slot and return the buffered Opus frames.
|
|
69
|
+
|
|
70
|
+
Returns an empty list if no recording was active. The slot is
|
|
71
|
+
cleared whether or not frames were captured so the next call to
|
|
72
|
+
:func:`start_recording` starts clean.
|
|
73
|
+
"""
|
|
74
|
+
global _recording_session_id, _recording_frames
|
|
75
|
+
frames = _recording_frames
|
|
76
|
+
_recording_session_id = None
|
|
77
|
+
_recording_frames = []
|
|
78
|
+
return frames
|
|
79
|
+
|
|
80
|
+
|
|
81
|
+
def is_recording() -> bool:
|
|
82
|
+
"""Return ``True`` when a recording slot is currently open."""
|
|
83
|
+
return _recording_session_id is not None
|
|
84
|
+
|
|
85
|
+
|
|
86
|
+
async def handle_audio_frame(data: bytes, session_id: str) -> None:
|
|
87
|
+
"""Process an incoming binary Opus frame from the device.
|
|
88
|
+
|
|
89
|
+
When a recording slot is active (see :func:`start_recording`) AND
|
|
90
|
+
the frame belongs to the recording's session, appends the frame
|
|
91
|
+
to the in-memory buffer for later decoding by the STT
|
|
92
|
+
orchestrator. Frames from a different session — typical during
|
|
93
|
+
a connection swap, where the old WebSocket handler is still
|
|
94
|
+
draining incoming bytes after :meth:`ESP32Connection.disconnect`
|
|
95
|
+
has been called on the main task — are dropped so they cannot
|
|
96
|
+
bleed into the new connection's capture buffer.
|
|
97
|
+
|
|
98
|
+
Outside of an active recording the frame is logged at debug
|
|
99
|
+
level and discarded; the device may emit audio on its own (e.g.
|
|
100
|
+
after an autonomous wake-word detection) and the gateway has no
|
|
101
|
+
STT pipeline running for those frames yet.
|
|
102
|
+
"""
|
|
103
|
+
if _recording_session_id is None:
|
|
104
|
+
logger.debug(
|
|
105
|
+
"audio_frame session=%s bytes=%d (discarded — no active recording)",
|
|
106
|
+
session_id,
|
|
107
|
+
len(data),
|
|
108
|
+
)
|
|
109
|
+
return
|
|
110
|
+
if _recording_session_id != session_id:
|
|
111
|
+
# A different connection is sending audio while a recording
|
|
112
|
+
# for this session is in flight. This happens when ESP32
|
|
113
|
+
# reconnects: ``ESP32Manager._handler`` swaps in a new
|
|
114
|
+
# ``ESP32Connection`` and marks the old one disconnected,
|
|
115
|
+
# but the old socket's ``async for message in ws`` loop can
|
|
116
|
+
# still drain a frame or two before the close lands. Letting
|
|
117
|
+
# those into the buffer would corrupt the new session's
|
|
118
|
+
# transcription, so drop them here.
|
|
119
|
+
logger.debug(
|
|
120
|
+
"audio_frame session=%s bytes=%d (discarded — does not match "
|
|
121
|
+
"recording session=%s)",
|
|
122
|
+
session_id,
|
|
123
|
+
len(data),
|
|
124
|
+
_recording_session_id,
|
|
125
|
+
)
|
|
126
|
+
return
|
|
127
|
+
_recording_frames.append(data)
|
|
128
|
+
logger.debug(
|
|
129
|
+
"audio_frame session=%s bytes=%d buffered (recording active)",
|
|
130
|
+
session_id,
|
|
131
|
+
len(data),
|
|
132
|
+
)
|
|
133
|
+
|
|
134
|
+
|
|
135
|
+
async def push_opus_frames(
|
|
136
|
+
esp32: ESP32Manager,
|
|
137
|
+
frames: Iterable[bytes],
|
|
138
|
+
) -> int:
|
|
139
|
+
"""Push Opus frames to the connected ESP32.
|
|
140
|
+
|
|
141
|
+
Returns the number of frames sent so the caller can report this to
|
|
142
|
+
the MCP client. Raises :class:`ConnectionError` (via
|
|
143
|
+
:meth:`ESP32Manager.send_audio_frame`) if the device disconnects
|
|
144
|
+
mid-stream — the orchestrator turns that into a clean MCP error
|
|
145
|
+
rather than letting it bubble up as a stack trace.
|
|
146
|
+
"""
|
|
147
|
+
sent = 0
|
|
148
|
+
for frame in frames:
|
|
149
|
+
await esp32.send_audio_frame(frame)
|
|
150
|
+
sent += 1
|
|
151
|
+
return sent
|
|
@@ -17,6 +17,7 @@ import websockets
|
|
|
17
17
|
import websockets.exceptions
|
|
18
18
|
from websockets.asyncio.server import ServerConnection
|
|
19
19
|
|
|
20
|
+
from .audio_stream import handle_audio_frame
|
|
20
21
|
from .protocol import HelloResponse, make_mcp_message, parse_jsonrpc_response
|
|
21
22
|
|
|
22
23
|
logger = logging.getLogger(__name__)
|
|
@@ -206,6 +207,33 @@ class ESP32Connection:
|
|
|
206
207
|
}
|
|
207
208
|
await self._ws_send(json.dumps(message))
|
|
208
209
|
|
|
210
|
+
async def send_listen_state(self, state: str, mode: str = "manual") -> None:
|
|
211
|
+
"""Send a listen state notification (``start`` / ``stop``).
|
|
212
|
+
|
|
213
|
+
Server-driven counterpart to the device's existing
|
|
214
|
+
:func:`Protocol::SendStartListening` (Issue #91). The
|
|
215
|
+
firmware's :func:`Application::OnIncomingJson` dispatches
|
|
216
|
+
``state: "start"`` to :func:`Application::StartListening` and
|
|
217
|
+
``state: "stop"`` to :func:`Application::StopListening`.
|
|
218
|
+
|
|
219
|
+
``mode`` is currently accepted only for ``state="start"`` and is
|
|
220
|
+
carried on the wire for forward-compatibility — the firmware
|
|
221
|
+
accepts but ignores it in Phase 1 because
|
|
222
|
+
:func:`HandleStartListeningEvent` unconditionally enters
|
|
223
|
+
``kListeningModeManualStop`` (the gateway controls the stop
|
|
224
|
+
boundary explicitly).
|
|
225
|
+
"""
|
|
226
|
+
if not self._connected:
|
|
227
|
+
raise ConnectionError("ESP32 not connected")
|
|
228
|
+
message: dict[str, Any] = {
|
|
229
|
+
"session_id": self.session_id,
|
|
230
|
+
"type": "listen",
|
|
231
|
+
"state": state,
|
|
232
|
+
}
|
|
233
|
+
if state == "start":
|
|
234
|
+
message["mode"] = mode
|
|
235
|
+
await self._ws_send(json.dumps(message))
|
|
236
|
+
|
|
209
237
|
def disconnect(self) -> None:
|
|
210
238
|
"""Mark connection as disconnected."""
|
|
211
239
|
self._connected = False
|
|
@@ -242,6 +270,21 @@ class ESP32Manager:
|
|
|
242
270
|
# if multi-device support lands later, the lock should move
|
|
243
271
|
# onto :class:`ESP32Connection` instead.
|
|
244
272
|
self._tts_lock = asyncio.Lock()
|
|
273
|
+
# Inbound STT capture (Issue #91) shares the TTS lock rather
|
|
274
|
+
# than running on a separate one. The firmware's
|
|
275
|
+
# ``HandleStartListeningEvent`` aborts any in-flight TTS when
|
|
276
|
+
# a listen.start arrives mid-speaking (state ==
|
|
277
|
+
# ``kDeviceStateSpeaking`` → ``AbortSpeaking`` →
|
|
278
|
+
# ``SetListeningMode(kListeningModeManualStop)``), so two
|
|
279
|
+
# operations on the same device's audio path would
|
|
280
|
+
# otherwise step on each other: a ``listen()`` could yank a
|
|
281
|
+
# ``say()`` out of speaking mid-utterance, or a ``say()``
|
|
282
|
+
# could start streaming TTS frames into the buffer a
|
|
283
|
+
# concurrent ``listen()`` is capturing. Treating the audio
|
|
284
|
+
# path as a single resource makes the device's state machine
|
|
285
|
+
# observable from gateway code; if a full-duplex contract
|
|
286
|
+
# ever lands later the lock can split again.
|
|
287
|
+
self._listen_lock = self._tts_lock
|
|
245
288
|
|
|
246
289
|
@property
|
|
247
290
|
def device_connected(self) -> bool:
|
|
@@ -260,6 +303,17 @@ class ESP32Manager:
|
|
|
260
303
|
"""
|
|
261
304
|
return self._tts_lock
|
|
262
305
|
|
|
306
|
+
@property
|
|
307
|
+
def listen_lock(self) -> asyncio.Lock:
|
|
308
|
+
"""Per-device lock guarding the STT capture sequence.
|
|
309
|
+
|
|
310
|
+
See :attr:`_listen_lock` for the rationale; the orchestrator
|
|
311
|
+
wraps the entire ``listen.start`` → wait → ``listen.stop``
|
|
312
|
+
block in ``async with`` on this lock so two concurrent
|
|
313
|
+
``listen()`` calls cannot share the inbound recording slot.
|
|
314
|
+
"""
|
|
315
|
+
return self._listen_lock
|
|
316
|
+
|
|
263
317
|
async def start(
|
|
264
318
|
self,
|
|
265
319
|
host: str = "0.0.0.0",
|
|
@@ -330,7 +384,14 @@ class ESP32Manager:
|
|
|
330
384
|
try:
|
|
331
385
|
async for message in ws:
|
|
332
386
|
if isinstance(message, bytes):
|
|
333
|
-
# Binary = audio frame
|
|
387
|
+
# Binary = audio frame. Forward to the audio_stream
|
|
388
|
+
# module which buffers it for STT capture (Issue
|
|
389
|
+
# #91) when a recording slot is open, or discards
|
|
390
|
+
# it otherwise. Only protocol v1 is supported on
|
|
391
|
+
# the inbound side today; the orchestrator gates
|
|
392
|
+
# listen() on protocol_version=1 so v2/v3 frames
|
|
393
|
+
# cannot reach this point with recording active.
|
|
394
|
+
await handle_audio_frame(message, session_id)
|
|
334
395
|
continue
|
|
335
396
|
|
|
336
397
|
try:
|
|
@@ -451,6 +512,17 @@ class ESP32Manager:
|
|
|
451
512
|
raise ConnectionError("No ESP32 device connected")
|
|
452
513
|
await self._connection.send_tts_state(state)
|
|
453
514
|
|
|
515
|
+
async def send_listen_state(self, state: str, mode: str = "manual") -> None:
|
|
516
|
+
"""Send a listen state notification to put the device into /
|
|
517
|
+
out of listening mode (Issue #91).
|
|
518
|
+
|
|
519
|
+
See :meth:`ESP32Connection.send_listen_state` for the wire
|
|
520
|
+
format and the firmware-side dispatch.
|
|
521
|
+
"""
|
|
522
|
+
if not self._connection or not self._connection.connected:
|
|
523
|
+
raise ConnectionError("No ESP32 device connected")
|
|
524
|
+
await self._connection.send_listen_state(state, mode=mode)
|
|
525
|
+
|
|
454
526
|
def get_status(self) -> dict[str, Any]:
|
|
455
527
|
"""Get current connection status."""
|
|
456
528
|
if not self._connection or not self._connection.connected:
|
|
@@ -15,6 +15,7 @@ from mcp.server.stdio import stdio_server
|
|
|
15
15
|
from mcp.types import TextContent, Tool
|
|
16
16
|
|
|
17
17
|
from .gateway import get_gateway
|
|
18
|
+
from .stt import listen_and_transcribe
|
|
18
19
|
from .tts import synthesize_and_send
|
|
19
20
|
|
|
20
21
|
logger = logging.getLogger(__name__)
|
|
@@ -102,8 +103,14 @@ def create_server() -> Server:
|
|
|
102
103
|
Tool(
|
|
103
104
|
name="move_head",
|
|
104
105
|
description=(
|
|
105
|
-
"Move the robot's head to
|
|
106
|
-
"yaw: horizontal (-90 to 90), pitch: vertical (
|
|
106
|
+
"Move the robot's head to safe, recommended angles. "
|
|
107
|
+
"yaw: horizontal (-90 to 90), pitch: vertical (5 to 85, "
|
|
108
|
+
"the M5Stack-recommended operating range). Out-of-range "
|
|
109
|
+
"requests are rejected at this MCP layer; for advanced "
|
|
110
|
+
"callers that need the firmware hard clamp (pitch 0..88), "
|
|
111
|
+
"use the firmware-side `set_head_angles` device tool, "
|
|
112
|
+
"which exposes a permissive schema and the authoritative "
|
|
113
|
+
"two-tier guard described in the README."
|
|
107
114
|
),
|
|
108
115
|
inputSchema={
|
|
109
116
|
"type": "object",
|
|
@@ -111,10 +118,19 @@ def create_server() -> Server:
|
|
|
111
118
|
"yaw": {
|
|
112
119
|
"type": "integer",
|
|
113
120
|
"description": "Horizontal angle in degrees (-90 to 90)",
|
|
121
|
+
"minimum": -90,
|
|
122
|
+
"maximum": 90,
|
|
114
123
|
},
|
|
115
124
|
"pitch": {
|
|
116
125
|
"type": "integer",
|
|
117
|
-
"description":
|
|
126
|
+
"description": (
|
|
127
|
+
"Vertical angle in degrees (5 to 85, "
|
|
128
|
+
"M5Stack-recommended operating range). For the "
|
|
129
|
+
"wider firmware hard clamp (0..88), use the "
|
|
130
|
+
"`set_head_angles` device tool instead."
|
|
131
|
+
),
|
|
132
|
+
"minimum": 5,
|
|
133
|
+
"maximum": 85,
|
|
118
134
|
},
|
|
119
135
|
},
|
|
120
136
|
"required": ["yaw", "pitch"],
|
|
@@ -408,6 +424,91 @@ def create_server() -> Server:
|
|
|
408
424
|
"required": ["text"],
|
|
409
425
|
},
|
|
410
426
|
),
|
|
427
|
+
Tool(
|
|
428
|
+
name="listen",
|
|
429
|
+
description=(
|
|
430
|
+
"Capture a short utterance from the device microphone and "
|
|
431
|
+
"transcribe it via a gateway-side STT engine (Phase 4, "
|
|
432
|
+
"Issue #91). The gateway sends a 'listen' notification "
|
|
433
|
+
"over the existing WebSocket to put the device firmware "
|
|
434
|
+
"into listening mode, buffers the Opus frames the device "
|
|
435
|
+
"streams up during the capture window, then decodes and "
|
|
436
|
+
"transcribes them once the window closes. Requires a "
|
|
437
|
+
"minimal firmware change to handle the inbound 'listen' "
|
|
438
|
+
"wire type (paired with this gateway release). Engine is "
|
|
439
|
+
"selectable via 'engine' (default 'faster-whisper', local). "
|
|
440
|
+
"Optional 'motion' feedback can switch the avatar to "
|
|
441
|
+
"'thinking' during capture ('face-only') or tilt the head "
|
|
442
|
+
"up while preserving yaw ('look-up'). "
|
|
443
|
+
"Install the relevant extra "
|
|
444
|
+
"('pip install stackchan-mcp[stt-faster-whisper]' or "
|
|
445
|
+
"'stt-openai'); calling this tool before an engine is "
|
|
446
|
+
"registered returns a clear error."
|
|
447
|
+
),
|
|
448
|
+
inputSchema={
|
|
449
|
+
"type": "object",
|
|
450
|
+
"properties": {
|
|
451
|
+
"duration_ms": {
|
|
452
|
+
"type": "integer",
|
|
453
|
+
"description": (
|
|
454
|
+
"Capture window in milliseconds. Clamped to "
|
|
455
|
+
"[100, 30000]."
|
|
456
|
+
),
|
|
457
|
+
"default": 5000,
|
|
458
|
+
"minimum": 100,
|
|
459
|
+
"maximum": 30000,
|
|
460
|
+
},
|
|
461
|
+
"engine": {
|
|
462
|
+
"type": "string",
|
|
463
|
+
"description": (
|
|
464
|
+
"Engine identifier (e.g. 'faster-whisper', "
|
|
465
|
+
"'openai-whisper'). Default 'faster-whisper'."
|
|
466
|
+
),
|
|
467
|
+
"default": "faster-whisper",
|
|
468
|
+
},
|
|
469
|
+
"language": {
|
|
470
|
+
"type": "string",
|
|
471
|
+
"description": (
|
|
472
|
+
"ISO 639-1 language code (e.g. 'ja'). Pass "
|
|
473
|
+
"an empty string or omit for autodetect."
|
|
474
|
+
),
|
|
475
|
+
"default": "ja",
|
|
476
|
+
},
|
|
477
|
+
"model": {
|
|
478
|
+
"type": "string",
|
|
479
|
+
"description": (
|
|
480
|
+
"Engine-specific model identifier (e.g. "
|
|
481
|
+
"'base' / 'small' / 'medium' for faster-"
|
|
482
|
+
"whisper, 'whisper-1' for OpenAI). Engines "
|
|
483
|
+
"fall back to their default when omitted."
|
|
484
|
+
),
|
|
485
|
+
},
|
|
486
|
+
"motion": {
|
|
487
|
+
"type": "string",
|
|
488
|
+
"enum": ["none", "face-only", "look-up"],
|
|
489
|
+
"description": (
|
|
490
|
+
"Optional visible feedback during capture. "
|
|
491
|
+
"'none' preserves the previous behaviour. "
|
|
492
|
+
"'face-only' shows the thinking avatar during "
|
|
493
|
+
"capture and restores idle at the end. "
|
|
494
|
+
"'look-up' preserves yaw, tilts pitch to "
|
|
495
|
+
"look_up_pitch, and holds the pose on success."
|
|
496
|
+
),
|
|
497
|
+
"default": "none",
|
|
498
|
+
},
|
|
499
|
+
"look_up_pitch": {
|
|
500
|
+
"type": "number",
|
|
501
|
+
"description": (
|
|
502
|
+
"Pitch angle for motion='look-up'. Must be "
|
|
503
|
+
"between 5 and 85 degrees."
|
|
504
|
+
),
|
|
505
|
+
"default": 50.0,
|
|
506
|
+
"minimum": 5,
|
|
507
|
+
"maximum": 85,
|
|
508
|
+
},
|
|
509
|
+
},
|
|
510
|
+
},
|
|
511
|
+
),
|
|
411
512
|
]
|
|
412
513
|
|
|
413
514
|
@server.call_tool()
|
|
@@ -439,6 +540,25 @@ def create_server() -> Server:
|
|
|
439
540
|
]
|
|
440
541
|
return [TextContent(type="text", text=json.dumps(result))]
|
|
441
542
|
|
|
543
|
+
if name == "listen":
|
|
544
|
+
# STT runs on the gateway side. The orchestrator drives the
|
|
545
|
+
# device's listening state via ``listen.start``/``stop``
|
|
546
|
+
# notifications, buffers the inbound Opus frames, decodes
|
|
547
|
+
# them, and hands the PCM blob to the registered engine.
|
|
548
|
+
# Same error-class discipline as say(): ValueError /
|
|
549
|
+
# NotImplementedError / RuntimeError all turn into clean
|
|
550
|
+
# MCP error JSON.
|
|
551
|
+
try:
|
|
552
|
+
result = await listen_and_transcribe(arguments, gateway=gw)
|
|
553
|
+
except (ValueError, NotImplementedError, RuntimeError) as exc:
|
|
554
|
+
return [
|
|
555
|
+
TextContent(
|
|
556
|
+
type="text",
|
|
557
|
+
text=json.dumps({"error": str(exc)}),
|
|
558
|
+
)
|
|
559
|
+
]
|
|
560
|
+
return [TextContent(type="text", text=json.dumps(result))]
|
|
561
|
+
|
|
442
562
|
if not gw.esp32.device_connected:
|
|
443
563
|
return [
|
|
444
564
|
TextContent(
|
|
@@ -447,6 +567,59 @@ def create_server() -> Server:
|
|
|
447
567
|
)
|
|
448
568
|
]
|
|
449
569
|
|
|
570
|
+
if name == "move_head":
|
|
571
|
+
# Belt-and-suspenders validation for the recommended pitch range.
|
|
572
|
+
# The Tool inputSchema already declares minimum/maximum for both
|
|
573
|
+
# yaw and pitch, but mcp Python SDK server-side enforcement of
|
|
574
|
+
# JSON Schema bounds is not guaranteed across versions and
|
|
575
|
+
# clients. Reject out-of-recommended values here as a clean
|
|
576
|
+
# MCP error JSON before any motion command reaches the device.
|
|
577
|
+
# Callers that genuinely need the firmware hard clamp 0..88
|
|
578
|
+
# should use the firmware-side `set_head_angles` device tool,
|
|
579
|
+
# which exposes the authoritative two-tier guard described in
|
|
580
|
+
# the README "Y-axis (pitch) safe range" section.
|
|
581
|
+
yaw_val = arguments.get("yaw")
|
|
582
|
+
pitch_val = arguments.get("pitch")
|
|
583
|
+
if (
|
|
584
|
+
not isinstance(yaw_val, int)
|
|
585
|
+
or isinstance(yaw_val, bool)
|
|
586
|
+
or not (-90 <= yaw_val <= 90)
|
|
587
|
+
):
|
|
588
|
+
return [
|
|
589
|
+
TextContent(
|
|
590
|
+
type="text",
|
|
591
|
+
text=json.dumps(
|
|
592
|
+
{
|
|
593
|
+
"error": (
|
|
594
|
+
"yaw must be an integer in -90..90 "
|
|
595
|
+
f"(got {yaw_val!r})"
|
|
596
|
+
)
|
|
597
|
+
}
|
|
598
|
+
),
|
|
599
|
+
)
|
|
600
|
+
]
|
|
601
|
+
if (
|
|
602
|
+
not isinstance(pitch_val, int)
|
|
603
|
+
or isinstance(pitch_val, bool)
|
|
604
|
+
or not (5 <= pitch_val <= 85)
|
|
605
|
+
):
|
|
606
|
+
return [
|
|
607
|
+
TextContent(
|
|
608
|
+
type="text",
|
|
609
|
+
text=json.dumps(
|
|
610
|
+
{
|
|
611
|
+
"error": (
|
|
612
|
+
"pitch must be an integer in 5..85 "
|
|
613
|
+
"(M5Stack-recommended operating range; "
|
|
614
|
+
"for the wider firmware hard clamp "
|
|
615
|
+
"0..88 use `set_head_angles`). got "
|
|
616
|
+
f"{pitch_val!r}"
|
|
617
|
+
)
|
|
618
|
+
}
|
|
619
|
+
),
|
|
620
|
+
)
|
|
621
|
+
]
|
|
622
|
+
|
|
450
623
|
# Map MCP client tool names to ESP32 MCP tool names (self.* prefix)
|
|
451
624
|
tool_map: dict[str, tuple[str, dict[str, Any]]] = {
|
|
452
625
|
"get_device_info": (
|
|
@@ -0,0 +1,62 @@
|
|
|
1
|
+
"""STT framework for Phase 4 (Issue #91).
|
|
2
|
+
|
|
3
|
+
Companion to :mod:`stackchan_mcp.tts`: this package provides the
|
|
4
|
+
engine-agnostic skeleton for the gateway-side ``listen(duration_ms)``
|
|
5
|
+
MCP tool plus the concrete faster-whisper (default, local) and
|
|
6
|
+
OpenAI Whisper API engines.
|
|
7
|
+
|
|
8
|
+
Engines whose modules require optional extras to import are registered
|
|
9
|
+
behind ``try / except ImportError`` so the framework still works when
|
|
10
|
+
the corresponding extra is missing.
|
|
11
|
+
"""
|
|
12
|
+
|
|
13
|
+
from __future__ import annotations
|
|
14
|
+
|
|
15
|
+
import logging
|
|
16
|
+
from typing import Callable
|
|
17
|
+
|
|
18
|
+
from .base import EngineRegistry, STTEngine, get_registry
|
|
19
|
+
from .orchestrator import DEFAULT_ENGINE, listen_and_transcribe
|
|
20
|
+
|
|
21
|
+
_logger = logging.getLogger(__name__)
|
|
22
|
+
|
|
23
|
+
|
|
24
|
+
def _try_register(register_fn: Callable[[], None], engine_label: str) -> None:
|
|
25
|
+
"""Run ``register_fn`` and swallow ImportErrors.
|
|
26
|
+
|
|
27
|
+
Used so an engine whose top-level module needs an optional extra
|
|
28
|
+
(e.g. faster-whisper / openai) can fail to register cleanly without
|
|
29
|
+
breaking the rest of the framework. Engine modules themselves
|
|
30
|
+
import cleanly; their heavy dependencies are imported lazily inside
|
|
31
|
+
:meth:`STTEngine.transcribe` so this layer just lights up the
|
|
32
|
+
registry slot.
|
|
33
|
+
"""
|
|
34
|
+
try:
|
|
35
|
+
register_fn()
|
|
36
|
+
except ImportError as exc:
|
|
37
|
+
_logger.debug("Skipping %s engine registration: %s", engine_label, exc)
|
|
38
|
+
|
|
39
|
+
|
|
40
|
+
def _register_faster_whisper() -> None:
|
|
41
|
+
from .faster_whisper import FasterWhisperEngine
|
|
42
|
+
|
|
43
|
+
get_registry().register(FasterWhisperEngine())
|
|
44
|
+
|
|
45
|
+
|
|
46
|
+
def _register_openai_whisper() -> None:
|
|
47
|
+
from .openai_whisper import OpenAIWhisperEngine
|
|
48
|
+
|
|
49
|
+
get_registry().register(OpenAIWhisperEngine())
|
|
50
|
+
|
|
51
|
+
|
|
52
|
+
_try_register(_register_faster_whisper, "faster-whisper")
|
|
53
|
+
_try_register(_register_openai_whisper, "openai-whisper")
|
|
54
|
+
|
|
55
|
+
|
|
56
|
+
__all__ = [
|
|
57
|
+
"DEFAULT_ENGINE",
|
|
58
|
+
"EngineRegistry",
|
|
59
|
+
"STTEngine",
|
|
60
|
+
"get_registry",
|
|
61
|
+
"listen_and_transcribe",
|
|
62
|
+
]
|