stackchan-mcp 0.5.0__tar.gz → 0.6.0__tar.gz
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- {stackchan_mcp-0.5.0 → stackchan_mcp-0.6.0}/PKG-INFO +9 -1
- {stackchan_mcp-0.5.0 → stackchan_mcp-0.6.0}/pyproject.toml +19 -1
- stackchan_mcp-0.6.0/stackchan_mcp/audio_stream.py +151 -0
- {stackchan_mcp-0.5.0 → stackchan_mcp-0.6.0}/stackchan_mcp/esp32_client.py +73 -1
- {stackchan_mcp-0.5.0 → stackchan_mcp-0.6.0}/stackchan_mcp/stdio_server.py +79 -0
- stackchan_mcp-0.6.0/stackchan_mcp/stt/__init__.py +62 -0
- stackchan_mcp-0.6.0/stackchan_mcp/stt/audio_utils.py +102 -0
- stackchan_mcp-0.6.0/stackchan_mcp/stt/base.py +94 -0
- stackchan_mcp-0.6.0/stackchan_mcp/stt/faster_whisper.py +217 -0
- stackchan_mcp-0.6.0/stackchan_mcp/stt/openai_whisper.py +177 -0
- stackchan_mcp-0.6.0/stackchan_mcp/stt/orchestrator.py +306 -0
- stackchan_mcp-0.6.0/tests/test_audio_stream.py +145 -0
- {stackchan_mcp-0.5.0 → stackchan_mcp-0.6.0}/tests/test_esp32_client.py +81 -0
- stackchan_mcp-0.6.0/tests/test_stt_audio_utils.py +100 -0
- stackchan_mcp-0.6.0/tests/test_stt_framework.py +195 -0
- stackchan_mcp-0.6.0/tests/test_stt_orchestrator.py +441 -0
- {stackchan_mcp-0.5.0 → stackchan_mcp-0.6.0}/uv.lock +775 -3
- stackchan_mcp-0.5.0/stackchan_mcp/audio_stream.py +0 -52
- stackchan_mcp-0.5.0/tests/test_audio_stream.py +0 -60
- {stackchan_mcp-0.5.0 → stackchan_mcp-0.6.0}/.env.example +0 -0
- {stackchan_mcp-0.5.0 → stackchan_mcp-0.6.0}/.gitignore +0 -0
- {stackchan_mcp-0.5.0 → stackchan_mcp-0.6.0}/LICENSE +0 -0
- {stackchan_mcp-0.5.0 → stackchan_mcp-0.6.0}/README.md +0 -0
- {stackchan_mcp-0.5.0 → stackchan_mcp-0.6.0}/stackchan_mcp/__init__.py +0 -0
- {stackchan_mcp-0.5.0 → stackchan_mcp-0.6.0}/stackchan_mcp/__main__.py +0 -0
- {stackchan_mcp-0.5.0 → stackchan_mcp-0.6.0}/stackchan_mcp/capture_server.py +0 -0
- {stackchan_mcp-0.5.0 → stackchan_mcp-0.6.0}/stackchan_mcp/cli.py +0 -0
- {stackchan_mcp-0.5.0 → stackchan_mcp-0.6.0}/stackchan_mcp/gateway.py +0 -0
- {stackchan_mcp-0.5.0 → stackchan_mcp-0.6.0}/stackchan_mcp/handlers/__init__.py +0 -0
- {stackchan_mcp-0.5.0 → stackchan_mcp-0.6.0}/stackchan_mcp/handlers/audio.py +0 -0
- {stackchan_mcp-0.5.0 → stackchan_mcp-0.6.0}/stackchan_mcp/handlers/camera.py +0 -0
- {stackchan_mcp-0.5.0 → stackchan_mcp-0.6.0}/stackchan_mcp/handlers/robot.py +0 -0
- {stackchan_mcp-0.5.0 → stackchan_mcp-0.6.0}/stackchan_mcp/mcp_router.py +0 -0
- {stackchan_mcp-0.5.0 → stackchan_mcp-0.6.0}/stackchan_mcp/protocol.py +0 -0
- {stackchan_mcp-0.5.0 → stackchan_mcp-0.6.0}/stackchan_mcp/server.py +0 -0
- {stackchan_mcp-0.5.0 → stackchan_mcp-0.6.0}/stackchan_mcp/tools.py +0 -0
- {stackchan_mcp-0.5.0 → stackchan_mcp-0.6.0}/stackchan_mcp/tts/__init__.py +0 -0
- {stackchan_mcp-0.5.0 → stackchan_mcp-0.6.0}/stackchan_mcp/tts/audio_utils.py +0 -0
- {stackchan_mcp-0.5.0 → stackchan_mcp-0.6.0}/stackchan_mcp/tts/base.py +0 -0
- {stackchan_mcp-0.5.0 → stackchan_mcp-0.6.0}/stackchan_mcp/tts/orchestrator.py +0 -0
- {stackchan_mcp-0.5.0 → stackchan_mcp-0.6.0}/stackchan_mcp/tts/voicevox.py +0 -0
- {stackchan_mcp-0.5.0 → stackchan_mcp-0.6.0}/tests/_audio_fixtures.py +0 -0
- {stackchan_mcp-0.5.0 → stackchan_mcp-0.6.0}/tests/conftest.py +0 -0
- {stackchan_mcp-0.5.0 → stackchan_mcp-0.6.0}/tests/test_audio_utils.py +0 -0
- {stackchan_mcp-0.5.0 → stackchan_mcp-0.6.0}/tests/test_capture_server.py +0 -0
- {stackchan_mcp-0.5.0 → stackchan_mcp-0.6.0}/tests/test_cli.py +0 -0
- {stackchan_mcp-0.5.0 → stackchan_mcp-0.6.0}/tests/test_gateway.py +0 -0
- {stackchan_mcp-0.5.0 → stackchan_mcp-0.6.0}/tests/test_mcp_router.py +0 -0
- {stackchan_mcp-0.5.0 → stackchan_mcp-0.6.0}/tests/test_orchestrator.py +0 -0
- {stackchan_mcp-0.5.0 → stackchan_mcp-0.6.0}/tests/test_protocol.py +0 -0
- {stackchan_mcp-0.5.0 → stackchan_mcp-0.6.0}/tests/test_stdio_server.py +0 -0
- {stackchan_mcp-0.5.0 → stackchan_mcp-0.6.0}/tests/test_tts_framework.py +0 -0
- {stackchan_mcp-0.5.0 → stackchan_mcp-0.6.0}/tests/test_voicevox.py +0 -0
|
@@ -1,6 +1,6 @@
|
|
|
1
1
|
Metadata-Version: 2.4
|
|
2
2
|
Name: stackchan-mcp
|
|
3
|
-
Version: 0.5.0
|
|
3
|
+
Version: 0.6.0
|
|
4
4
|
Summary: Two-faced MCP gateway for StackChan (xiaozhi-esp32): bridges stdio MCP clients to the ESP32 over WebSocket + HTTP.
|
|
5
5
|
Project-URL: Homepage, https://github.com/kisaragi-mochi/stackchan-mcp
|
|
6
6
|
Project-URL: Repository, https://github.com/kisaragi-mochi/stackchan-mcp
|
|
@@ -27,6 +27,14 @@ Requires-Dist: mcp>=1.0
|
|
|
27
27
|
Requires-Dist: pydantic>=2
|
|
28
28
|
Requires-Dist: python-dotenv
|
|
29
29
|
Requires-Dist: websockets>=12
|
|
30
|
+
Provides-Extra: stt
|
|
31
|
+
Requires-Dist: opuslib>=3; extra == 'stt'
|
|
32
|
+
Provides-Extra: stt-faster-whisper
|
|
33
|
+
Requires-Dist: faster-whisper>=1.0; extra == 'stt-faster-whisper'
|
|
34
|
+
Requires-Dist: opuslib>=3; extra == 'stt-faster-whisper'
|
|
35
|
+
Provides-Extra: stt-openai
|
|
36
|
+
Requires-Dist: openai>=1.0; extra == 'stt-openai'
|
|
37
|
+
Requires-Dist: opuslib>=3; extra == 'stt-openai'
|
|
30
38
|
Provides-Extra: tts
|
|
31
39
|
Requires-Dist: httpx>=0.27; extra == 'tts'
|
|
32
40
|
Requires-Dist: opuslib>=3; extra == 'tts'
|
|
@@ -1,6 +1,6 @@
|
|
|
1
1
|
[project]
|
|
2
2
|
name = "stackchan-mcp"
|
|
3
|
-
version = "0.5.0"
|
|
3
|
+
version = "0.6.0"
|
|
4
4
|
description = "Two-faced MCP gateway for StackChan (xiaozhi-esp32): bridges stdio MCP clients to the ESP32 over WebSocket + HTTP."
|
|
5
5
|
readme = "README.md"
|
|
6
6
|
requires-python = ">=3.10"
|
|
@@ -48,6 +48,24 @@ tts-voicevox = [
|
|
|
48
48
|
"stackchan-mcp[tts]",
|
|
49
49
|
]
|
|
50
50
|
|
|
51
|
+
# Phase 4 STT — see Issue #91.
|
|
52
|
+
# The base `stt` extra carries `opuslib` for decoding the device's
|
|
53
|
+
# inbound Opus frames. Concrete engines live behind their own extras
|
|
54
|
+
# so users only pull in the heavy ML dependencies they actually need.
|
|
55
|
+
# * faster-whisper — local Whisper via CTranslate2 (default, MIT)
|
|
56
|
+
# * openai — OpenAI Whisper API client (cloud)
|
|
57
|
+
stt = [
|
|
58
|
+
"opuslib>=3",
|
|
59
|
+
]
|
|
60
|
+
stt-faster-whisper = [
|
|
61
|
+
"stackchan-mcp[stt]",
|
|
62
|
+
"faster-whisper>=1.0",
|
|
63
|
+
]
|
|
64
|
+
stt-openai = [
|
|
65
|
+
"stackchan-mcp[stt]",
|
|
66
|
+
"openai>=1.0",
|
|
67
|
+
]
|
|
68
|
+
|
|
51
69
|
[project.urls]
|
|
52
70
|
Homepage = "https://github.com/kisaragi-mochi/stackchan-mcp"
|
|
53
71
|
Repository = "https://github.com/kisaragi-mochi/stackchan-mcp"
|
|
@@ -0,0 +1,151 @@
|
|
|
1
|
+
"""Opus audio frame handling for the gateway <-> device link.
|
|
2
|
+
|
|
3
|
+
Outbound (TTS) frames are produced by
|
|
4
|
+
:mod:`stackchan_mcp.tts.audio_utils` and pushed here to the connected
|
|
5
|
+
ESP32 via :meth:`stackchan_mcp.esp32_client.ESP32Manager.send_audio_frame`.
|
|
6
|
+
|
|
7
|
+
The inbound side (STT pipeline, Phase 4 / Issue #91) is now wired:
|
|
8
|
+
binary frames coming up from the device land in
|
|
9
|
+
:func:`handle_audio_frame`, which buffers them into a module-level
|
|
10
|
+
recording slot when one is active. The
|
|
11
|
+
:mod:`stackchan_mcp.stt.orchestrator` opens the slot via
|
|
12
|
+
:func:`start_recording` before sending ``listen.start`` to the device
|
|
13
|
+
and closes it via :func:`stop_recording` after the capture window;
|
|
14
|
+
outside an active recording, inbound frames are still discarded.
|
|
15
|
+
|
|
16
|
+
The recording slot is intentionally a module-level singleton: the
|
|
17
|
+
device's :class:`stackchan_mcp.esp32_client.ESP32Manager` only manages
|
|
18
|
+
one connection, and the STT orchestrator serialises ``listen()`` calls
|
|
19
|
+
through :attr:`ESP32Manager.listen_lock`, so concurrent captures
|
|
20
|
+
cannot race the buffer. If multi-device support lands later, this
|
|
21
|
+
should move onto the connection object.
|
|
22
|
+
"""
|
|
23
|
+
|
|
24
|
+
from __future__ import annotations
|
|
25
|
+
|
|
26
|
+
import logging
|
|
27
|
+
from typing import TYPE_CHECKING, Iterable
|
|
28
|
+
|
|
29
|
+
if TYPE_CHECKING:
|
|
30
|
+
from .esp32_client import ESP32Manager
|
|
31
|
+
|
|
32
|
+
logger = logging.getLogger(__name__)
|
|
33
|
+
|
|
34
|
+
|
|
35
|
+
# --- Recording slot (inbound STT capture) ---------------------------------
|
|
36
|
+
#
|
|
37
|
+
# A single capture at a time is enforced by the orchestrator's
|
|
38
|
+
# ``listen_lock``; this module only owns the buffer itself.
|
|
39
|
+
|
|
40
|
+
_recording_session_id: str | None = None
|
|
41
|
+
_recording_frames: list[bytes] = []
|
|
42
|
+
|
|
43
|
+
|
|
44
|
+
def start_recording(session_id: str) -> None:
    """Open a fresh recording slot for ``session_id``.

    Any frames already buffered are discarded so a previous call that
    crashed before ``stop_recording`` cannot leak into the next
    capture. The orchestrator wraps start/stop in a try/finally to
    guarantee the slot is closed even on error.

    Args:
        session_id: Identifier of the device session whose inbound
            frames should be buffered from now on.
    """
    global _recording_session_id, _recording_frames
    if _recording_session_id is not None:
        # Defensive: the lock should prevent this, but if it ever
        # fires we leak no audio — just log loudly so the regression
        # is visible.
        logger.warning(
            "start_recording called while session=%s was still active; "
            "dropping %d buffered frames",
            _recording_session_id,
            len(_recording_frames),
        )
    _recording_session_id = session_id
    # Rebind to a new list rather than clearing in place, so a list
    # handed out earlier is never mutated.
    _recording_frames = []
|
|
65
|
+
|
|
66
|
+
|
|
67
|
+
def stop_recording() -> list[bytes]:
    """Close the recording slot and return the buffered Opus frames.

    Returns an empty list if no recording was active. The slot is
    cleared whether or not frames were captured so the next call to
    :func:`start_recording` starts clean.

    Returns:
        The frames buffered since the slot was opened, in arrival order.
    """
    global _recording_session_id, _recording_frames
    frames = _recording_frames
    _recording_session_id = None
    # Rebind instead of clearing so the list being returned stays intact.
    _recording_frames = []
    return frames
|
|
79
|
+
|
|
80
|
+
|
|
81
|
+
def is_recording() -> bool:
    """Return ``True`` when a recording slot is currently open."""
    return _recording_session_id is not None
|
|
84
|
+
|
|
85
|
+
|
|
86
|
+
async def handle_audio_frame(data: bytes, session_id: str) -> None:
    """Process an incoming binary Opus frame from the device.

    When a recording slot is active (see :func:`start_recording`) AND
    the frame belongs to the recording's session, appends the frame
    to the in-memory buffer for later decoding by the STT
    orchestrator. Frames from a different session — typical during
    a connection swap, where the old WebSocket handler is still
    draining incoming bytes after :meth:`ESP32Connection.disconnect`
    has been called on the main task — are dropped so they cannot
    bleed into the new connection's capture buffer.

    Outside of an active recording the frame is logged at debug
    level and discarded; the device may emit audio on its own (e.g.
    after an autonomous wake-word detection) and the gateway has no
    STT pipeline running for those frames yet.

    Args:
        data: Raw Opus payload received as one binary WebSocket message.
        session_id: Session identifier of the connection that delivered
            the frame.
    """
    if _recording_session_id is None:
        logger.debug(
            "audio_frame session=%s bytes=%d (discarded — no active recording)",
            session_id,
            len(data),
        )
        return
    if _recording_session_id != session_id:
        # A different connection is sending audio while a recording
        # for this session is in flight. This happens when ESP32
        # reconnects: ``ESP32Manager._handler`` swaps in a new
        # ``ESP32Connection`` and marks the old one disconnected,
        # but the old socket's ``async for message in ws`` loop can
        # still drain a frame or two before the close lands. Letting
        # those into the buffer would corrupt the new session's
        # transcription, so drop them here.
        logger.debug(
            "audio_frame session=%s bytes=%d (discarded — does not match "
            "recording session=%s)",
            session_id,
            len(data),
            _recording_session_id,
        )
        return
    _recording_frames.append(data)
    logger.debug(
        "audio_frame session=%s bytes=%d buffered (recording active)",
        session_id,
        len(data),
    )
|
|
133
|
+
|
|
134
|
+
|
|
135
|
+
async def push_opus_frames(
    esp32: ESP32Manager,
    frames: Iterable[bytes],
) -> int:
    """Push Opus frames to the connected ESP32.

    Returns the number of frames sent so the caller can report this to
    the MCP client. Raises :class:`ConnectionError` (via
    :meth:`ESP32Manager.send_audio_frame`) if the device disconnects
    mid-stream — the orchestrator turns that into a clean MCP error
    rather than letting it bubble up as a stack trace.

    Args:
        esp32: Manager whose ``send_audio_frame`` coroutine delivers a
            single frame to the device.
        frames: Opus payloads to send, in playback order. Frames are
            awaited one at a time, so ordering on the wire is preserved.

    Returns:
        Count of frames that were sent successfully.
    """
    sent = 0
    for frame in frames:
        await esp32.send_audio_frame(frame)
        sent += 1
    return sent
|
|
@@ -17,6 +17,7 @@ import websockets
|
|
|
17
17
|
import websockets.exceptions
|
|
18
18
|
from websockets.asyncio.server import ServerConnection
|
|
19
19
|
|
|
20
|
+
from .audio_stream import handle_audio_frame
|
|
20
21
|
from .protocol import HelloResponse, make_mcp_message, parse_jsonrpc_response
|
|
21
22
|
|
|
22
23
|
logger = logging.getLogger(__name__)
|
|
@@ -206,6 +207,33 @@ class ESP32Connection:
|
|
|
206
207
|
}
|
|
207
208
|
await self._ws_send(json.dumps(message))
|
|
208
209
|
|
|
210
|
+
async def send_listen_state(self, state: str, mode: str = "manual") -> None:
    """Send a listen state notification (``start`` / ``stop``).

    Server-driven counterpart to the device's existing
    :func:`Protocol::SendStartListening` (Issue #91). The
    firmware's :func:`Application::OnIncomingJson` dispatches
    ``state: "start"`` to :func:`Application::StartListening` and
    ``state: "stop"`` to :func:`Application::StopListening`.

    ``mode`` is currently accepted only for ``state="start"`` and is
    carried on the wire for forward-compatibility — the firmware
    accepts but ignores it in Phase 1 because
    :func:`HandleStartListeningEvent` unconditionally enters
    ``kListeningModeManualStop`` (the gateway controls the stop
    boundary explicitly).

    Args:
        state: Listen state to announce, e.g. ``"start"`` or ``"stop"``.
        mode: Listening mode sent alongside ``state="start"``; omitted
            from the message for any other state.

    Raises:
        ConnectionError: If this connection is marked as disconnected.
    """
    if not self._connected:
        raise ConnectionError("ESP32 not connected")
    message: dict[str, Any] = {
        "session_id": self.session_id,
        "type": "listen",
        "state": state,
    }
    if state == "start":
        # Only the start notification carries a mode.
        message["mode"] = mode
    await self._ws_send(json.dumps(message))
|
|
236
|
+
|
|
209
237
|
def disconnect(self) -> None:
|
|
210
238
|
"""Mark connection as disconnected."""
|
|
211
239
|
self._connected = False
|
|
@@ -242,6 +270,21 @@ class ESP32Manager:
|
|
|
242
270
|
# if multi-device support lands later, the lock should move
|
|
243
271
|
# onto :class:`ESP32Connection` instead.
|
|
244
272
|
self._tts_lock = asyncio.Lock()
|
|
273
|
+
# Inbound STT capture (Issue #91) shares the TTS lock rather
|
|
274
|
+
# than running on a separate one. The firmware's
|
|
275
|
+
# ``HandleStartListeningEvent`` aborts any in-flight TTS when
|
|
276
|
+
# a listen.start arrives mid-speaking (state ==
|
|
277
|
+
# ``kDeviceStateSpeaking`` → ``AbortSpeaking`` →
|
|
278
|
+
# ``SetListeningMode(kListeningModeManualStop)``), so two
|
|
279
|
+
# operations on the same device's audio path would
|
|
280
|
+
# otherwise step on each other: a ``listen()`` could yank a
|
|
281
|
+
# ``say()`` out of speaking mid-utterance, or a ``say()``
|
|
282
|
+
# could start streaming TTS frames into the buffer a
|
|
283
|
+
# concurrent ``listen()`` is capturing. Treating the audio
|
|
284
|
+
# path as a single resource makes the device's state machine
|
|
285
|
+
# observable from gateway code; if a full-duplex contract
|
|
286
|
+
# ever lands later the lock can split again.
|
|
287
|
+
self._listen_lock = self._tts_lock
|
|
245
288
|
|
|
246
289
|
@property
|
|
247
290
|
def device_connected(self) -> bool:
|
|
@@ -260,6 +303,17 @@ class ESP32Manager:
|
|
|
260
303
|
"""
|
|
261
304
|
return self._tts_lock
|
|
262
305
|
|
|
306
|
+
@property
def listen_lock(self) -> asyncio.Lock:
    """Per-device lock guarding the STT capture sequence.

    See :attr:`_listen_lock` for the rationale; the orchestrator
    wraps the entire ``listen.start`` → wait → ``listen.stop``
    block in ``async with`` on this lock so two concurrent
    ``listen()`` calls cannot share the inbound recording slot.

    Returns:
        The lock object stored in ``self._listen_lock``.
    """
    return self._listen_lock
|
|
316
|
+
|
|
263
317
|
async def start(
|
|
264
318
|
self,
|
|
265
319
|
host: str = "0.0.0.0",
|
|
@@ -330,7 +384,14 @@ class ESP32Manager:
|
|
|
330
384
|
try:
|
|
331
385
|
async for message in ws:
|
|
332
386
|
if isinstance(message, bytes):
|
|
333
|
-
# Binary = audio frame
|
|
387
|
+
# Binary = audio frame. Forward to the audio_stream
|
|
388
|
+
# module which buffers it for STT capture (Issue
|
|
389
|
+
# #91) when a recording slot is open, or discards
|
|
390
|
+
# it otherwise. Only protocol v1 is supported on
|
|
391
|
+
# the inbound side today; the orchestrator gates
|
|
392
|
+
# listen() on protocol_version=1 so v2/v3 frames
|
|
393
|
+
# cannot reach this point with recording active.
|
|
394
|
+
await handle_audio_frame(message, session_id)
|
|
334
395
|
continue
|
|
335
396
|
|
|
336
397
|
try:
|
|
@@ -451,6 +512,17 @@ class ESP32Manager:
|
|
|
451
512
|
raise ConnectionError("No ESP32 device connected")
|
|
452
513
|
await self._connection.send_tts_state(state)
|
|
453
514
|
|
|
515
|
+
async def send_listen_state(self, state: str, mode: str = "manual") -> None:
    """Send a listen state notification to put the device into /
    out of listening mode (Issue #91).

    See :meth:`ESP32Connection.send_listen_state` for the wire
    format and the firmware-side dispatch.

    Args:
        state: Listen state to announce (``"start"`` / ``"stop"``).
        mode: Listening mode forwarded to the connection unchanged.

    Raises:
        ConnectionError: If there is no connection object or it
            reports itself as not connected.
    """
    if not self._connection or not self._connection.connected:
        raise ConnectionError("No ESP32 device connected")
    await self._connection.send_listen_state(state, mode=mode)
|
|
525
|
+
|
|
454
526
|
def get_status(self) -> dict[str, Any]:
|
|
455
527
|
"""Get current connection status."""
|
|
456
528
|
if not self._connection or not self._connection.connected:
|
|
@@ -15,6 +15,7 @@ from mcp.server.stdio import stdio_server
|
|
|
15
15
|
from mcp.types import TextContent, Tool
|
|
16
16
|
|
|
17
17
|
from .gateway import get_gateway
|
|
18
|
+
from .stt import listen_and_transcribe
|
|
18
19
|
from .tts import synthesize_and_send
|
|
19
20
|
|
|
20
21
|
logger = logging.getLogger(__name__)
|
|
@@ -408,6 +409,65 @@ def create_server() -> Server:
|
|
|
408
409
|
"required": ["text"],
|
|
409
410
|
},
|
|
410
411
|
),
|
|
412
|
+
Tool(
|
|
413
|
+
name="listen",
|
|
414
|
+
description=(
|
|
415
|
+
"Capture a short utterance from the device microphone and "
|
|
416
|
+
"transcribe it via a gateway-side STT engine (Phase 4, "
|
|
417
|
+
"Issue #91). The gateway sends a 'listen' notification "
|
|
418
|
+
"over the existing WebSocket to put the device firmware "
|
|
419
|
+
"into listening mode, buffers the Opus frames the device "
|
|
420
|
+
"streams up during the capture window, then decodes and "
|
|
421
|
+
"transcribes them once the window closes. Requires a "
|
|
422
|
+
"minimal firmware change to handle the inbound 'listen' "
|
|
423
|
+
"wire type (paired with this gateway release). Engine is "
|
|
424
|
+
"selectable via 'engine' (default 'faster-whisper', local). "
|
|
425
|
+
"Install the relevant extra "
|
|
426
|
+
"('pip install stackchan-mcp[stt-faster-whisper]' or "
|
|
427
|
+
"'stt-openai'); calling this tool before an engine is "
|
|
428
|
+
"registered returns a clear error."
|
|
429
|
+
),
|
|
430
|
+
inputSchema={
|
|
431
|
+
"type": "object",
|
|
432
|
+
"properties": {
|
|
433
|
+
"duration_ms": {
|
|
434
|
+
"type": "integer",
|
|
435
|
+
"description": (
|
|
436
|
+
"Capture window in milliseconds. Clamped to "
|
|
437
|
+
"[100, 30000]."
|
|
438
|
+
),
|
|
439
|
+
"default": 5000,
|
|
440
|
+
"minimum": 100,
|
|
441
|
+
"maximum": 30000,
|
|
442
|
+
},
|
|
443
|
+
"engine": {
|
|
444
|
+
"type": "string",
|
|
445
|
+
"description": (
|
|
446
|
+
"Engine identifier (e.g. 'faster-whisper', "
|
|
447
|
+
"'openai-whisper'). Default 'faster-whisper'."
|
|
448
|
+
),
|
|
449
|
+
"default": "faster-whisper",
|
|
450
|
+
},
|
|
451
|
+
"language": {
|
|
452
|
+
"type": "string",
|
|
453
|
+
"description": (
|
|
454
|
+
"ISO 639-1 language code (e.g. 'ja'). Pass "
|
|
455
|
+
"an empty string or omit for autodetect."
|
|
456
|
+
),
|
|
457
|
+
"default": "ja",
|
|
458
|
+
},
|
|
459
|
+
"model": {
|
|
460
|
+
"type": "string",
|
|
461
|
+
"description": (
|
|
462
|
+
"Engine-specific model identifier (e.g. "
|
|
463
|
+
"'base' / 'small' / 'medium' for faster-"
|
|
464
|
+
"whisper, 'whisper-1' for OpenAI). Engines "
|
|
465
|
+
"fall back to their default when omitted."
|
|
466
|
+
),
|
|
467
|
+
},
|
|
468
|
+
},
|
|
469
|
+
},
|
|
470
|
+
),
|
|
411
471
|
]
|
|
412
472
|
|
|
413
473
|
@server.call_tool()
|
|
@@ -439,6 +499,25 @@ def create_server() -> Server:
|
|
|
439
499
|
]
|
|
440
500
|
return [TextContent(type="text", text=json.dumps(result))]
|
|
441
501
|
|
|
502
|
+
if name == "listen":
|
|
503
|
+
# STT runs on the gateway side. The orchestrator drives the
|
|
504
|
+
# device's listening state via ``listen.start``/``stop``
|
|
505
|
+
# notifications, buffers the inbound Opus frames, decodes
|
|
506
|
+
# them, and hands the PCM blob to the registered engine.
|
|
507
|
+
# Same error-class discipline as say(): ValueError /
|
|
508
|
+
# NotImplementedError / RuntimeError all turn into clean
|
|
509
|
+
# MCP error JSON.
|
|
510
|
+
try:
|
|
511
|
+
result = await listen_and_transcribe(arguments, gateway=gw)
|
|
512
|
+
except (ValueError, NotImplementedError, RuntimeError) as exc:
|
|
513
|
+
return [
|
|
514
|
+
TextContent(
|
|
515
|
+
type="text",
|
|
516
|
+
text=json.dumps({"error": str(exc)}),
|
|
517
|
+
)
|
|
518
|
+
]
|
|
519
|
+
return [TextContent(type="text", text=json.dumps(result))]
|
|
520
|
+
|
|
442
521
|
if not gw.esp32.device_connected:
|
|
443
522
|
return [
|
|
444
523
|
TextContent(
|
|
@@ -0,0 +1,62 @@
|
|
|
1
|
+
"""STT framework for Phase 4 (Issue #91).
|
|
2
|
+
|
|
3
|
+
Companion to :mod:`stackchan_mcp.tts`: this package provides the
|
|
4
|
+
engine-agnostic skeleton for the gateway-side ``listen(duration_ms)``
|
|
5
|
+
MCP tool plus the concrete faster-whisper (default, local) and
|
|
6
|
+
OpenAI Whisper API engines.
|
|
7
|
+
|
|
8
|
+
Engines whose modules require optional extras to import are registered
|
|
9
|
+
behind ``try / except ImportError`` so the framework still works when
|
|
10
|
+
the corresponding extra is missing.
|
|
11
|
+
"""
|
|
12
|
+
|
|
13
|
+
from __future__ import annotations
|
|
14
|
+
|
|
15
|
+
import logging
|
|
16
|
+
from typing import Callable
|
|
17
|
+
|
|
18
|
+
from .base import EngineRegistry, STTEngine, get_registry
|
|
19
|
+
from .orchestrator import DEFAULT_ENGINE, listen_and_transcribe
|
|
20
|
+
|
|
21
|
+
_logger = logging.getLogger(__name__)
|
|
22
|
+
|
|
23
|
+
|
|
24
|
+
def _try_register(register_fn: Callable[[], None], engine_label: str) -> None:
    """Run ``register_fn`` and swallow ImportErrors.

    Used so an engine whose top-level module needs an optional extra
    (e.g. faster-whisper / openai) can fail to register cleanly without
    breaking the rest of the framework. Engine modules themselves
    import cleanly; their heavy dependencies are imported lazily inside
    :meth:`STTEngine.transcribe` so this layer just lights up the
    registry slot.

    Args:
        register_fn: Zero-argument callable that performs the registration.
        engine_label: Human-readable engine name used in the debug log
            line when registration is skipped.
    """
    try:
        register_fn()
    except ImportError as exc:
        # Only ImportError is tolerated; any other failure propagates.
        _logger.debug("Skipping %s engine registration: %s", engine_label, exc)
|
|
38
|
+
|
|
39
|
+
|
|
40
|
+
def _register_faster_whisper() -> None:
    """Register a :class:`FasterWhisperEngine` instance in the default registry."""
    # Imported inside the function so an ImportError surfaces at call
    # time, where the caller can catch it, rather than at package import.
    from .faster_whisper import FasterWhisperEngine

    get_registry().register(FasterWhisperEngine())
|
|
44
|
+
|
|
45
|
+
|
|
46
|
+
def _register_openai_whisper() -> None:
    """Register an :class:`OpenAIWhisperEngine` instance in the default registry."""
    # Imported inside the function so an ImportError surfaces at call
    # time, where the caller can catch it, rather than at package import.
    from .openai_whisper import OpenAIWhisperEngine

    get_registry().register(OpenAIWhisperEngine())
|
|
50
|
+
|
|
51
|
+
|
|
52
|
+
_try_register(_register_faster_whisper, "faster-whisper")
|
|
53
|
+
_try_register(_register_openai_whisper, "openai-whisper")
|
|
54
|
+
|
|
55
|
+
|
|
56
|
+
__all__ = [
|
|
57
|
+
"DEFAULT_ENGINE",
|
|
58
|
+
"EngineRegistry",
|
|
59
|
+
"STTEngine",
|
|
60
|
+
"get_registry",
|
|
61
|
+
"listen_and_transcribe",
|
|
62
|
+
]
|
|
@@ -0,0 +1,102 @@
|
|
|
1
|
+
"""Audio utilities for the STT pipeline.
|
|
2
|
+
|
|
3
|
+
Mirror of :mod:`stackchan_mcp.tts.audio_utils` for the inbound direction:
|
|
4
|
+
the helpers here decode Opus frames coming up from the device and
|
|
5
|
+
concatenate them into a single PCM blob that a recogniser can consume.
|
|
6
|
+
|
|
7
|
+
``opuslib`` is imported lazily inside :func:`decode_opus_frames` so the
|
|
8
|
+
rest of the module stays usable in environments where the ``[stt]``
|
|
9
|
+
extra is not installed.
|
|
10
|
+
|
|
11
|
+
Device-side Opus parameters come from the firmware's hello handshake
|
|
12
|
+
(``firmware/main/protocols/websocket_protocol.cc::GetHelloMessage``)::
|
|
13
|
+
|
|
14
|
+
sample_rate = 16000 Hz
|
|
15
|
+
channels = 1
|
|
16
|
+
frame_duration_ms = OPUS_FRAME_DURATION_MS (60 ms)
|
|
17
|
+
samples_per_frame = sample_rate * frame_duration_ms / 1000 = 960
|
|
18
|
+
"""
|
|
19
|
+
|
|
20
|
+
from __future__ import annotations
|
|
21
|
+
|
|
22
|
+
import logging
|
|
23
|
+
from typing import Iterable
|
|
24
|
+
|
|
25
|
+
logger = logging.getLogger(__name__)
|
|
26
|
+
|
|
27
|
+
|
|
28
|
+
#: Opus sample rate the device encoder is configured for.
|
|
29
|
+
DEVICE_SAMPLE_RATE = 16000
|
|
30
|
+
|
|
31
|
+
#: Opus channel count (mono).
|
|
32
|
+
DEVICE_CHANNELS = 1
|
|
33
|
+
|
|
34
|
+
#: Opus frame duration in milliseconds (matches the firmware's
|
|
35
|
+
#: ``OPUS_FRAME_DURATION_MS``). Kept symmetric with
|
|
36
|
+
#: :data:`stackchan_mcp.tts.audio_utils.DEVICE_FRAME_DURATION_MS`.
|
|
37
|
+
DEVICE_FRAME_DURATION_MS = 60
|
|
38
|
+
|
|
39
|
+
#: PCM samples per Opus frame at the device's settings (= 960).
|
|
40
|
+
SAMPLES_PER_FRAME = DEVICE_SAMPLE_RATE * DEVICE_FRAME_DURATION_MS // 1000
|
|
41
|
+
|
|
42
|
+
|
|
43
|
+
def decode_opus_frames(
    frames: Iterable[bytes],
    *,
    sample_rate: int = DEVICE_SAMPLE_RATE,
    channels: int = DEVICE_CHANNELS,
    frame_duration_ms: int = DEVICE_FRAME_DURATION_MS,
) -> bytes:
    """Decode an iterable of Opus frames into a single PCM blob.

    Args:
        frames: Iterable of raw Opus payloads (i.e. the protocol v1
            wire format the firmware emits when ``protocol_version=1``;
            see :class:`stackchan_mcp.esp32_client.ESP32Connection`).
            Each frame must contain exactly ``frame_duration_ms`` of
            audio at ``sample_rate`` mono.
        sample_rate: Decoder sample rate (Hz). Defaults to the device's
            16 kHz.
        channels: Channel count. Defaults to mono.
        frame_duration_ms: Per-frame duration in ms. Defaults to the
            device's 60 ms cadence.

    Returns:
        Signed 16-bit little-endian PCM bytes concatenated across all
        frames. Frames that fail to decode are logged at warning level
        and skipped — partial transcription is better than failing the
        whole listen() call because one frame got mangled on the wire.
        Empty frames are skipped without being passed to the decoder.

    Raises:
        RuntimeError: if ``opuslib`` is not installed. The error
            message points at the right install command so the caller
            can surface a clean MCP error.
    """
    try:
        import opuslib  # type: ignore[import-not-found]
    except ImportError as exc:  # pragma: no cover - exercised via integration
        raise RuntimeError(
            "opuslib is not installed. Install with "
            "'pip install stackchan-mcp[stt]' to enable Opus decoding."
        ) from exc

    samples_per_frame = sample_rate * frame_duration_ms // 1000
    # One decoder instance is reused for the whole sequence of frames.
    decoder = opuslib.Decoder(sample_rate, channels)

    pcm_chunks: list[bytes] = []
    for index, frame in enumerate(frames):
        if not frame:
            continue
        try:
            pcm = decoder.decode(frame, samples_per_frame)
        except Exception as exc:  # pragma: no cover - decode errors are rare
            # Deliberately broad: any decode failure drops just this frame.
            logger.warning(
                "Opus decode failed for frame %d (size=%d): %s; skipping",
                index,
                len(frame),
                exc,
            )
            continue
        pcm_chunks.append(pcm)

    return b"".join(pcm_chunks)
|
|
@@ -0,0 +1,94 @@
|
|
|
1
|
+
"""STT engine abstraction.
|
|
2
|
+
|
|
3
|
+
Each concrete engine takes 16 kHz mono PCM (signed 16-bit LE) and
|
|
4
|
+
returns a transcription. Opus decoding from the device wire format and
|
|
5
|
+
PCM buffering are handled by :mod:`stackchan_mcp.stt.orchestrator` so
|
|
6
|
+
engines stay focused on recognition.
|
|
7
|
+
|
|
8
|
+
This module is intentionally dependency-free: it must import cleanly
|
|
9
|
+
without ``faster-whisper`` / ``openai`` / ``opuslib`` so that callers
|
|
10
|
+
can introspect the registered engines even when the optional ``[stt]``
|
|
11
|
+
extras are not installed. Mirrors :mod:`stackchan_mcp.tts.base`.
|
|
12
|
+
"""
|
|
13
|
+
|
|
14
|
+
from __future__ import annotations
|
|
15
|
+
|
|
16
|
+
from abc import ABC, abstractmethod
|
|
17
|
+
from typing import Any
|
|
18
|
+
|
|
19
|
+
|
|
20
|
+
class STTEngine(ABC):
    """Abstract base for STT engines.

    Subclasses must set :attr:`name` to a stable identifier (matched
    against the ``engine`` argument of the ``listen`` MCP tool) and
    implement :meth:`transcribe`.
    """

    #: Stable identifier used to look this engine up in the registry.
    #: Concrete subclasses must override with a non-empty string.
    name: str = ""

    @abstractmethod
    async def transcribe(self, pcm: bytes, **opts: Any) -> dict[str, Any]:
        """Transcribe 16 kHz mono PCM (signed 16-bit LE) into text.

        Args:
            pcm: Raw PCM bytes at 16 kHz, mono, signed 16-bit
                little-endian. The orchestrator handles Opus decoding
                and frame concatenation before calling this method.
            **opts: Engine-specific options. Recognised keys include
                ``language`` (ISO 639-1 code, e.g. ``"ja"``, or
                ``None`` for autodetect) and ``model`` (engine-specific
                model name, e.g. ``"base"`` / ``"small"`` for
                faster-whisper). Engines should ignore unknown options
                rather than raise, so the ``listen`` tool can pass a
                uniform argument set.

        Returns:
            Dict with at least ``text`` (transcribed string) and
            ``language`` (ISO 639-1 code that the engine used or
            detected). Engines may add extra keys (e.g. ``segments``,
            ``confidence``) for diagnostics — the orchestrator surfaces
            ``text`` and ``language`` to the caller and leaves the rest
            available for future extensions.
        """
|
|
56
|
+
|
|
57
|
+
|
|
58
|
+
class EngineRegistry:
    """Tracks available STT engines by name.

    Concrete engines register themselves at import time when their
    optional dependencies are satisfied (see
    :mod:`stackchan_mcp.stt.faster_whisper` and
    :mod:`stackchan_mcp.stt.openai_whisper`).
    """

    def __init__(self) -> None:
        # Maps engine name -> engine instance.
        self._engines: dict[str, STTEngine] = {}

    def register(self, engine: STTEngine) -> None:
        """Register ``engine`` under ``engine.name``.

        Replaces any previously registered engine with the same name —
        this is intentional so tests can inject fakes.

        Raises:
            ValueError: If ``engine.name`` is empty (or otherwise falsy).
        """
        if not engine.name:
            raise ValueError("STTEngine.name must be a non-empty string")
        self._engines[engine.name] = engine

    def get(self, name: str) -> STTEngine | None:
        """Return the engine registered under ``name``, or ``None``."""
        return self._engines.get(name)

    def names(self) -> list[str]:
        """Return all registered engine names, sorted alphabetically."""
        return sorted(self._engines)
|
|
87
|
+
|
|
88
|
+
|
|
89
|
+
_default_registry = EngineRegistry()
|
|
90
|
+
|
|
91
|
+
|
|
92
|
+
def get_registry() -> EngineRegistry:
    """Return the process-wide default :class:`EngineRegistry`."""
    return _default_registry
|