stackchan-mcp 0.5.0__tar.gz → 0.7.0__tar.gz

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (53)
  1. {stackchan_mcp-0.5.0 → stackchan_mcp-0.7.0}/PKG-INFO +9 -1
  2. {stackchan_mcp-0.5.0 → stackchan_mcp-0.7.0}/pyproject.toml +19 -1
  3. stackchan_mcp-0.7.0/stackchan_mcp/audio_stream.py +151 -0
  4. {stackchan_mcp-0.5.0 → stackchan_mcp-0.7.0}/stackchan_mcp/esp32_client.py +73 -1
  5. {stackchan_mcp-0.5.0 → stackchan_mcp-0.7.0}/stackchan_mcp/stdio_server.py +176 -3
  6. stackchan_mcp-0.7.0/stackchan_mcp/stt/__init__.py +62 -0
  7. stackchan_mcp-0.7.0/stackchan_mcp/stt/audio_utils.py +102 -0
  8. stackchan_mcp-0.7.0/stackchan_mcp/stt/base.py +94 -0
  9. stackchan_mcp-0.7.0/stackchan_mcp/stt/faster_whisper.py +217 -0
  10. stackchan_mcp-0.7.0/stackchan_mcp/stt/openai_whisper.py +177 -0
  11. stackchan_mcp-0.7.0/stackchan_mcp/stt/orchestrator.py +552 -0
  12. stackchan_mcp-0.7.0/tests/test_audio_stream.py +145 -0
  13. {stackchan_mcp-0.5.0 → stackchan_mcp-0.7.0}/tests/test_esp32_client.py +81 -0
  14. {stackchan_mcp-0.5.0 → stackchan_mcp-0.7.0}/tests/test_stdio_server.py +194 -0
  15. stackchan_mcp-0.7.0/tests/test_stt_audio_utils.py +100 -0
  16. stackchan_mcp-0.7.0/tests/test_stt_framework.py +195 -0
  17. stackchan_mcp-0.7.0/tests/test_stt_orchestrator.py +1150 -0
  18. {stackchan_mcp-0.5.0 → stackchan_mcp-0.7.0}/uv.lock +775 -3
  19. stackchan_mcp-0.5.0/stackchan_mcp/audio_stream.py +0 -52
  20. stackchan_mcp-0.5.0/tests/test_audio_stream.py +0 -60
  21. {stackchan_mcp-0.5.0 → stackchan_mcp-0.7.0}/.env.example +0 -0
  22. {stackchan_mcp-0.5.0 → stackchan_mcp-0.7.0}/.gitignore +0 -0
  23. {stackchan_mcp-0.5.0 → stackchan_mcp-0.7.0}/LICENSE +0 -0
  24. {stackchan_mcp-0.5.0 → stackchan_mcp-0.7.0}/README.md +0 -0
  25. {stackchan_mcp-0.5.0 → stackchan_mcp-0.7.0}/stackchan_mcp/__init__.py +0 -0
  26. {stackchan_mcp-0.5.0 → stackchan_mcp-0.7.0}/stackchan_mcp/__main__.py +0 -0
  27. {stackchan_mcp-0.5.0 → stackchan_mcp-0.7.0}/stackchan_mcp/capture_server.py +0 -0
  28. {stackchan_mcp-0.5.0 → stackchan_mcp-0.7.0}/stackchan_mcp/cli.py +0 -0
  29. {stackchan_mcp-0.5.0 → stackchan_mcp-0.7.0}/stackchan_mcp/gateway.py +0 -0
  30. {stackchan_mcp-0.5.0 → stackchan_mcp-0.7.0}/stackchan_mcp/handlers/__init__.py +0 -0
  31. {stackchan_mcp-0.5.0 → stackchan_mcp-0.7.0}/stackchan_mcp/handlers/audio.py +0 -0
  32. {stackchan_mcp-0.5.0 → stackchan_mcp-0.7.0}/stackchan_mcp/handlers/camera.py +0 -0
  33. {stackchan_mcp-0.5.0 → stackchan_mcp-0.7.0}/stackchan_mcp/handlers/robot.py +0 -0
  34. {stackchan_mcp-0.5.0 → stackchan_mcp-0.7.0}/stackchan_mcp/mcp_router.py +0 -0
  35. {stackchan_mcp-0.5.0 → stackchan_mcp-0.7.0}/stackchan_mcp/protocol.py +0 -0
  36. {stackchan_mcp-0.5.0 → stackchan_mcp-0.7.0}/stackchan_mcp/server.py +0 -0
  37. {stackchan_mcp-0.5.0 → stackchan_mcp-0.7.0}/stackchan_mcp/tools.py +0 -0
  38. {stackchan_mcp-0.5.0 → stackchan_mcp-0.7.0}/stackchan_mcp/tts/__init__.py +0 -0
  39. {stackchan_mcp-0.5.0 → stackchan_mcp-0.7.0}/stackchan_mcp/tts/audio_utils.py +0 -0
  40. {stackchan_mcp-0.5.0 → stackchan_mcp-0.7.0}/stackchan_mcp/tts/base.py +0 -0
  41. {stackchan_mcp-0.5.0 → stackchan_mcp-0.7.0}/stackchan_mcp/tts/orchestrator.py +0 -0
  42. {stackchan_mcp-0.5.0 → stackchan_mcp-0.7.0}/stackchan_mcp/tts/voicevox.py +0 -0
  43. {stackchan_mcp-0.5.0 → stackchan_mcp-0.7.0}/tests/_audio_fixtures.py +0 -0
  44. {stackchan_mcp-0.5.0 → stackchan_mcp-0.7.0}/tests/conftest.py +0 -0
  45. {stackchan_mcp-0.5.0 → stackchan_mcp-0.7.0}/tests/test_audio_utils.py +0 -0
  46. {stackchan_mcp-0.5.0 → stackchan_mcp-0.7.0}/tests/test_capture_server.py +0 -0
  47. {stackchan_mcp-0.5.0 → stackchan_mcp-0.7.0}/tests/test_cli.py +0 -0
  48. {stackchan_mcp-0.5.0 → stackchan_mcp-0.7.0}/tests/test_gateway.py +0 -0
  49. {stackchan_mcp-0.5.0 → stackchan_mcp-0.7.0}/tests/test_mcp_router.py +0 -0
  50. {stackchan_mcp-0.5.0 → stackchan_mcp-0.7.0}/tests/test_orchestrator.py +0 -0
  51. {stackchan_mcp-0.5.0 → stackchan_mcp-0.7.0}/tests/test_protocol.py +0 -0
  52. {stackchan_mcp-0.5.0 → stackchan_mcp-0.7.0}/tests/test_tts_framework.py +0 -0
  53. {stackchan_mcp-0.5.0 → stackchan_mcp-0.7.0}/tests/test_voicevox.py +0 -0
@@ -1,6 +1,6 @@
1
1
  Metadata-Version: 2.4
2
2
  Name: stackchan-mcp
3
- Version: 0.5.0
3
+ Version: 0.7.0
4
4
  Summary: Two-faced MCP gateway for StackChan (xiaozhi-esp32): bridges stdio MCP clients to the ESP32 over WebSocket + HTTP.
5
5
  Project-URL: Homepage, https://github.com/kisaragi-mochi/stackchan-mcp
6
6
  Project-URL: Repository, https://github.com/kisaragi-mochi/stackchan-mcp
@@ -27,6 +27,14 @@ Requires-Dist: mcp>=1.0
27
27
  Requires-Dist: pydantic>=2
28
28
  Requires-Dist: python-dotenv
29
29
  Requires-Dist: websockets>=12
30
+ Provides-Extra: stt
31
+ Requires-Dist: opuslib>=3; extra == 'stt'
32
+ Provides-Extra: stt-faster-whisper
33
+ Requires-Dist: faster-whisper>=1.0; extra == 'stt-faster-whisper'
34
+ Requires-Dist: opuslib>=3; extra == 'stt-faster-whisper'
35
+ Provides-Extra: stt-openai
36
+ Requires-Dist: openai>=1.0; extra == 'stt-openai'
37
+ Requires-Dist: opuslib>=3; extra == 'stt-openai'
30
38
  Provides-Extra: tts
31
39
  Requires-Dist: httpx>=0.27; extra == 'tts'
32
40
  Requires-Dist: opuslib>=3; extra == 'tts'
@@ -1,6 +1,6 @@
1
1
  [project]
2
2
  name = "stackchan-mcp"
3
- version = "0.5.0"
3
+ version = "0.7.0"
4
4
  description = "Two-faced MCP gateway for StackChan (xiaozhi-esp32): bridges stdio MCP clients to the ESP32 over WebSocket + HTTP."
5
5
  readme = "README.md"
6
6
  requires-python = ">=3.10"
@@ -48,6 +48,24 @@ tts-voicevox = [
48
48
  "stackchan-mcp[tts]",
49
49
  ]
50
50
 
51
+ # Phase 4 STT — see Issue #91.
52
+ # The base `stt` extra carries `opuslib` for decoding the device's
53
+ # inbound Opus frames. Concrete engines live behind their own extras
54
+ # so users only pull in the heavy ML dependencies they actually need.
55
+ # * faster-whisper — local Whisper via CTranslate2 (default, MIT)
56
+ # * openai — OpenAI Whisper API client (cloud)
57
+ stt = [
58
+ "opuslib>=3",
59
+ ]
60
+ stt-faster-whisper = [
61
+ "stackchan-mcp[stt]",
62
+ "faster-whisper>=1.0",
63
+ ]
64
+ stt-openai = [
65
+ "stackchan-mcp[stt]",
66
+ "openai>=1.0",
67
+ ]
68
+
51
69
  [project.urls]
52
70
  Homepage = "https://github.com/kisaragi-mochi/stackchan-mcp"
53
71
  Repository = "https://github.com/kisaragi-mochi/stackchan-mcp"
@@ -0,0 +1,151 @@
1
+ """Opus audio frame handling for the gateway <-> device link.
2
+
3
+ Outbound (TTS) frames are produced by
4
+ :mod:`stackchan_mcp.tts.audio_utils` and pushed here to the connected
5
+ ESP32 via :meth:`stackchan_mcp.esp32_client.ESP32Manager.send_audio_frame`.
6
+
7
+ The inbound side (STT pipeline, Phase 4 / Issue #91) is now wired:
8
+ binary frames coming up from the device land in
9
+ :func:`handle_audio_frame`, which buffers them into a module-level
10
+ recording slot when one is active. The
11
+ :mod:`stackchan_mcp.stt.orchestrator` opens the slot via
12
+ :func:`start_recording` before sending ``listen.start`` to the device
13
+ and closes it via :func:`stop_recording` after the capture window;
14
+ outside an active recording, inbound frames are still discarded.
15
+
16
+ The recording slot is intentionally a module-level singleton: the
17
+ device's :class:`stackchan_mcp.esp32_client.ESP32Manager` only manages
18
+ one connection, and the STT orchestrator serialises ``listen()`` calls
19
+ through :attr:`ESP32Manager.listen_lock`, so concurrent captures
20
+ cannot race the buffer. If multi-device support lands later, this
21
+ should move onto the connection object.
22
+ """
23
+
24
+ from __future__ import annotations
25
+
26
+ import logging
27
+ from typing import TYPE_CHECKING, Iterable
28
+
29
+ if TYPE_CHECKING:
30
+ from .esp32_client import ESP32Manager
31
+
32
+ logger = logging.getLogger(__name__)
33
+
34
+
35
+ # --- Recording slot (inbound STT capture) ---------------------------------
36
+ #
37
+ # A single capture at a time is enforced by the orchestrator's
38
+ # ``listen_lock``; this module only owns the buffer itself.
39
+
40
+ _recording_session_id: str | None = None
41
+ _recording_frames: list[bytes] = []
42
+
43
+
44
def start_recording(session_id: str) -> None:
    """Open a fresh recording slot for ``session_id``.

    Any frames already buffered are discarded, so a previous capture that
    never reached :func:`stop_recording` cannot leak audio into the next
    one. Callers are expected to pair this with :func:`stop_recording`
    (ideally in a ``try``/``finally``) so the slot is always closed.

    Args:
        session_id: Identifier of the device session whose inbound frames
            should be buffered.
    """
    global _recording_session_id, _recording_frames
    if _recording_session_id is not None:
        # Defensive: the caller's lock should prevent overlapping
        # captures. If this ever fires no audio is mixed between
        # sessions -- the stale frames are dropped -- but log loudly so
        # the regression is visible.
        logger.warning(
            "start_recording called while session=%s was still active; "
            "dropping %d buffered frames",
            _recording_session_id,
            len(_recording_frames),
        )
    _recording_session_id = session_id
    # Rebind rather than ``.clear()``: a list previously handed out by
    # ``stop_recording`` must never be mutated behind the caller's back.
    _recording_frames = []
65
+
66
+
67
def stop_recording() -> list[bytes]:
    """Close the recording slot and return the buffered Opus frames.

    The slot is cleared whether or not any frames were captured, so the
    next :func:`start_recording` call starts clean.

    Returns:
        The frames buffered since the slot was opened, in arrival order.
        An empty list when no recording was active.
    """
    global _recording_session_id, _recording_frames
    frames = _recording_frames
    _recording_session_id = None
    # Hand the old list to the caller and install a new one, so later
    # buffering can never alias the returned frames.
    _recording_frames = []
    return frames
79
+
80
+
81
def is_recording() -> bool:
    """Return ``True`` when a recording slot is currently open."""
    return _recording_session_id is not None
84
+
85
+
86
async def handle_audio_frame(data: bytes, session_id: str) -> None:
    """Process an incoming binary Opus frame from the device.

    The frame is appended to the in-memory recording buffer only when a
    recording slot is open (see :func:`start_recording`) AND the frame's
    ``session_id`` matches the session that opened the slot. In every
    other case the frame is logged at debug level and discarded.

    Args:
        data: Raw frame payload as received from the device socket.
        session_id: Session identifier of the connection that produced
            the frame.
    """
    if _recording_session_id is None:
        # No capture in flight: the device may emit audio on its own, but
        # nothing is waiting to consume it.
        logger.debug(
            "audio_frame session=%s bytes=%d (discarded — no active recording)",
            session_id,
            len(data),
        )
        return
    if _recording_session_id != session_id:
        # A different connection is sending audio while a recording for
        # another session is in flight (e.g. an old socket still draining
        # after a reconnect). Letting those frames into the buffer would
        # corrupt the active session's transcription, so drop them.
        logger.debug(
            "audio_frame session=%s bytes=%d (discarded — does not match "
            "recording session=%s)",
            session_id,
            len(data),
            _recording_session_id,
        )
        return
    _recording_frames.append(data)
    logger.debug(
        "audio_frame session=%s bytes=%d buffered (recording active)",
        session_id,
        len(data),
    )
133
+
134
+
135
async def push_opus_frames(
    esp32: ESP32Manager,
    frames: Iterable[bytes],
) -> int:
    """Push Opus frames to the connected ESP32, one at a time and in order.

    Args:
        esp32: Manager whose ``send_audio_frame`` coroutine delivers a
            single frame to the device.
        frames: Encoded frames to send; consumed lazily, so a generator
            is fine.

    Returns:
        The number of frames sent, so the caller can report it.

    Raises:
        Whatever ``esp32.send_audio_frame`` raises (per the original
        documentation, :class:`ConnectionError` when the device
        disconnects mid-stream). Frames after the failing one are not
        sent.
    """
    sent = 0
    for frame in frames:
        await esp32.send_audio_frame(frame)
        # Count only after the send completed, so a failure never
        # over-reports.
        sent += 1
    return sent
@@ -17,6 +17,7 @@ import websockets
17
17
  import websockets.exceptions
18
18
  from websockets.asyncio.server import ServerConnection
19
19
 
20
+ from .audio_stream import handle_audio_frame
20
21
  from .protocol import HelloResponse, make_mcp_message, parse_jsonrpc_response
21
22
 
22
23
  logger = logging.getLogger(__name__)
@@ -206,6 +207,33 @@ class ESP32Connection:
206
207
  }
207
208
  await self._ws_send(json.dumps(message))
208
209
 
210
async def send_listen_state(self, state: str, mode: str = "manual") -> None:
    """Send a ``listen`` state notification (``start`` / ``stop``).

    The message is a JSON object with ``session_id``, ``type: "listen"``
    and ``state``. ``mode`` is attached only when ``state == "start"``;
    for any other state it is accepted but not put on the wire.

    Args:
        state: Listening state to request from the device.
        mode: Listening mode carried alongside a ``start`` request.

    Raises:
        ConnectionError: If this connection is marked as disconnected.
    """
    if not self._connected:
        raise ConnectionError("ESP32 not connected")
    message: dict[str, Any] = {
        "session_id": self.session_id,
        "type": "listen",
        "state": state,
    }
    if state == "start":
        # Only a start request carries a mode.
        message["mode"] = mode
    await self._ws_send(json.dumps(message))
236
+
209
237
  def disconnect(self) -> None:
210
238
  """Mark connection as disconnected."""
211
239
  self._connected = False
@@ -242,6 +270,21 @@ class ESP32Manager:
242
270
  # if multi-device support lands later, the lock should move
243
271
  # onto :class:`ESP32Connection` instead.
244
272
  self._tts_lock = asyncio.Lock()
273
+ # Inbound STT capture (Issue #91) shares the TTS lock rather
274
+ # than running on a separate one. The firmware's
275
+ # ``HandleStartListeningEvent`` aborts any in-flight TTS when
276
+ # a listen.start arrives mid-speaking (state ==
277
+ # ``kDeviceStateSpeaking`` → ``AbortSpeaking`` →
278
+ # ``SetListeningMode(kListeningModeManualStop)``), so two
279
+ # operations on the same device's audio path would
280
+ # otherwise step on each other: a ``listen()`` could yank a
281
+ # ``say()`` out of speaking mid-utterance, or a ``say()``
282
+ # could start streaming TTS frames into the buffer a
283
+ # concurrent ``listen()`` is capturing. Treating the audio
284
+ # path as a single resource makes the device's state machine
285
+ # observable from gateway code; if a full-duplex contract
286
+ # ever lands later the lock can split again.
287
+ self._listen_lock = self._tts_lock
245
288
 
246
289
  @property
247
290
  def device_connected(self) -> bool:
@@ -260,6 +303,17 @@ class ESP32Manager:
260
303
  """
261
304
  return self._tts_lock
262
305
 
306
@property
def listen_lock(self) -> asyncio.Lock:
    """Per-device lock guarding the STT capture sequence.

    Returns the object stored in ``_listen_lock``, which the constructor
    aliases to the TTS lock so that speaking and listening are
    serialised against each other on the same device. Callers hold it
    (``async with``) around the whole ``listen.start`` -> wait ->
    ``listen.stop`` sequence so two captures cannot share the inbound
    recording slot.
    """
    return self._listen_lock
316
+
263
317
  async def start(
264
318
  self,
265
319
  host: str = "0.0.0.0",
@@ -330,7 +384,14 @@ class ESP32Manager:
330
384
  try:
331
385
  async for message in ws:
332
386
  if isinstance(message, bytes):
333
- # Binary = audio frame, ignore for now
387
+ # Binary = audio frame. Forward to the audio_stream
388
+ # module which buffers it for STT capture (Issue
389
+ # #91) when a recording slot is open, or discards
390
+ # it otherwise. Only protocol v1 is supported on
391
+ # the inbound side today; the orchestrator gates
392
+ # listen() on protocol_version=1 so v2/v3 frames
393
+ # cannot reach this point with recording active.
394
+ await handle_audio_frame(message, session_id)
334
395
  continue
335
396
 
336
397
  try:
@@ -451,6 +512,17 @@ class ESP32Manager:
451
512
  raise ConnectionError("No ESP32 device connected")
452
513
  await self._connection.send_tts_state(state)
453
514
 
515
async def send_listen_state(self, state: str, mode: str = "manual") -> None:
    """Forward a listen state notification to the connected device.

    Thin guard around the active connection's ``send_listen_state``:
    it checks that a live connection exists and delegates with the same
    ``state`` and ``mode``.

    Args:
        state: Listening state to request (``start`` / ``stop``).
        mode: Listening mode, passed through by keyword.

    Raises:
        ConnectionError: If there is no connection object or it reports
            itself as not connected.
    """
    if not self._connection or not self._connection.connected:
        raise ConnectionError("No ESP32 device connected")
    await self._connection.send_listen_state(state, mode=mode)
525
+
454
526
  def get_status(self) -> dict[str, Any]:
455
527
  """Get current connection status."""
456
528
  if not self._connection or not self._connection.connected:
@@ -15,6 +15,7 @@ from mcp.server.stdio import stdio_server
15
15
  from mcp.types import TextContent, Tool
16
16
 
17
17
  from .gateway import get_gateway
18
+ from .stt import listen_and_transcribe
18
19
  from .tts import synthesize_and_send
19
20
 
20
21
  logger = logging.getLogger(__name__)
@@ -102,8 +103,14 @@ def create_server() -> Server:
102
103
  Tool(
103
104
  name="move_head",
104
105
  description=(
105
- "Move the robot's head to the specified angles. "
106
- "yaw: horizontal (-90 to 90), pitch: vertical (-30 to 30)."
106
+ "Move the robot's head to safe, recommended angles. "
107
+ "yaw: horizontal (-90 to 90), pitch: vertical (5 to 85, "
108
+ "the M5Stack-recommended operating range). Out-of-range "
109
+ "requests are rejected at this MCP layer; for advanced "
110
+ "callers that need the firmware hard clamp (pitch 0..88), "
111
+ "use the firmware-side `set_head_angles` device tool, "
112
+ "which exposes a permissive schema and the authoritative "
113
+ "two-tier guard described in the README."
107
114
  ),
108
115
  inputSchema={
109
116
  "type": "object",
@@ -111,10 +118,19 @@ def create_server() -> Server:
111
118
  "yaw": {
112
119
  "type": "integer",
113
120
  "description": "Horizontal angle in degrees (-90 to 90)",
121
+ "minimum": -90,
122
+ "maximum": 90,
114
123
  },
115
124
  "pitch": {
116
125
  "type": "integer",
117
- "description": "Vertical angle in degrees (-30 to 30)",
126
+ "description": (
127
+ "Vertical angle in degrees (5 to 85, "
128
+ "M5Stack-recommended operating range). For the "
129
+ "wider firmware hard clamp (0..88), use the "
130
+ "`set_head_angles` device tool instead."
131
+ ),
132
+ "minimum": 5,
133
+ "maximum": 85,
118
134
  },
119
135
  },
120
136
  "required": ["yaw", "pitch"],
@@ -408,6 +424,91 @@ def create_server() -> Server:
408
424
  "required": ["text"],
409
425
  },
410
426
  ),
427
+ Tool(
428
+ name="listen",
429
+ description=(
430
+ "Capture a short utterance from the device microphone and "
431
+ "transcribe it via a gateway-side STT engine (Phase 4, "
432
+ "Issue #91). The gateway sends a 'listen' notification "
433
+ "over the existing WebSocket to put the device firmware "
434
+ "into listening mode, buffers the Opus frames the device "
435
+ "streams up during the capture window, then decodes and "
436
+ "transcribes them once the window closes. Requires a "
437
+ "minimal firmware change to handle the inbound 'listen' "
438
+ "wire type (paired with this gateway release). Engine is "
439
+ "selectable via 'engine' (default 'faster-whisper', local). "
440
+ "Optional 'motion' feedback can switch the avatar to "
441
+ "'thinking' during capture ('face-only') or tilt the head "
442
+ "up while preserving yaw ('look-up'). "
443
+ "Install the relevant extra "
444
+ "('pip install stackchan-mcp[stt-faster-whisper]' or "
445
+ "'stt-openai'); calling this tool before an engine is "
446
+ "registered returns a clear error."
447
+ ),
448
+ inputSchema={
449
+ "type": "object",
450
+ "properties": {
451
+ "duration_ms": {
452
+ "type": "integer",
453
+ "description": (
454
+ "Capture window in milliseconds. Clamped to "
455
+ "[100, 30000]."
456
+ ),
457
+ "default": 5000,
458
+ "minimum": 100,
459
+ "maximum": 30000,
460
+ },
461
+ "engine": {
462
+ "type": "string",
463
+ "description": (
464
+ "Engine identifier (e.g. 'faster-whisper', "
465
+ "'openai-whisper'). Default 'faster-whisper'."
466
+ ),
467
+ "default": "faster-whisper",
468
+ },
469
+ "language": {
470
+ "type": "string",
471
+ "description": (
472
+ "ISO 639-1 language code (e.g. 'ja'). Pass "
473
+ "an empty string or omit for autodetect."
474
+ ),
475
+ "default": "ja",
476
+ },
477
+ "model": {
478
+ "type": "string",
479
+ "description": (
480
+ "Engine-specific model identifier (e.g. "
481
+ "'base' / 'small' / 'medium' for faster-"
482
+ "whisper, 'whisper-1' for OpenAI). Engines "
483
+ "fall back to their default when omitted."
484
+ ),
485
+ },
486
+ "motion": {
487
+ "type": "string",
488
+ "enum": ["none", "face-only", "look-up"],
489
+ "description": (
490
+ "Optional visible feedback during capture. "
491
+ "'none' preserves the previous behaviour. "
492
+ "'face-only' shows the thinking avatar during "
493
+ "capture and restores idle at the end. "
494
+ "'look-up' preserves yaw, tilts pitch to "
495
+ "look_up_pitch, and holds the pose on success."
496
+ ),
497
+ "default": "none",
498
+ },
499
+ "look_up_pitch": {
500
+ "type": "number",
501
+ "description": (
502
+ "Pitch angle for motion='look-up'. Must be "
503
+ "between 5 and 85 degrees."
504
+ ),
505
+ "default": 50.0,
506
+ "minimum": 5,
507
+ "maximum": 85,
508
+ },
509
+ },
510
+ },
511
+ ),
411
512
  ]
412
513
 
413
514
  @server.call_tool()
@@ -439,6 +540,25 @@ def create_server() -> Server:
439
540
  ]
440
541
  return [TextContent(type="text", text=json.dumps(result))]
441
542
 
543
+ if name == "listen":
544
+ # STT runs on the gateway side. The orchestrator drives the
545
+ # device's listening state via ``listen.start``/``stop``
546
+ # notifications, buffers the inbound Opus frames, decodes
547
+ # them, and hands the PCM blob to the registered engine.
548
+ # Same error-class discipline as say(): ValueError /
549
+ # NotImplementedError / RuntimeError all turn into clean
550
+ # MCP error JSON.
551
+ try:
552
+ result = await listen_and_transcribe(arguments, gateway=gw)
553
+ except (ValueError, NotImplementedError, RuntimeError) as exc:
554
+ return [
555
+ TextContent(
556
+ type="text",
557
+ text=json.dumps({"error": str(exc)}),
558
+ )
559
+ ]
560
+ return [TextContent(type="text", text=json.dumps(result))]
561
+
442
562
  if not gw.esp32.device_connected:
443
563
  return [
444
564
  TextContent(
@@ -447,6 +567,59 @@ def create_server() -> Server:
447
567
  )
448
568
  ]
449
569
 
570
+ if name == "move_head":
571
+ # Belt-and-suspenders validation for the recommended pitch range.
572
+ # The Tool inputSchema already declares minimum/maximum for both
573
+ # yaw and pitch, but mcp Python SDK server-side enforcement of
574
+ # JSON Schema bounds is not guaranteed across versions and
575
+ # clients. Reject out-of-recommended values here as a clean
576
+ # MCP error JSON before any motion command reaches the device.
577
+ # Callers that genuinely need the firmware hard clamp 0..88
578
+ # should use the firmware-side `set_head_angles` device tool,
579
+ # which exposes the authoritative two-tier guard described in
580
+ # the README "Y-axis (pitch) safe range" section.
581
+ yaw_val = arguments.get("yaw")
582
+ pitch_val = arguments.get("pitch")
583
+ if (
584
+ not isinstance(yaw_val, int)
585
+ or isinstance(yaw_val, bool)
586
+ or not (-90 <= yaw_val <= 90)
587
+ ):
588
+ return [
589
+ TextContent(
590
+ type="text",
591
+ text=json.dumps(
592
+ {
593
+ "error": (
594
+ "yaw must be an integer in -90..90 "
595
+ f"(got {yaw_val!r})"
596
+ )
597
+ }
598
+ ),
599
+ )
600
+ ]
601
+ if (
602
+ not isinstance(pitch_val, int)
603
+ or isinstance(pitch_val, bool)
604
+ or not (5 <= pitch_val <= 85)
605
+ ):
606
+ return [
607
+ TextContent(
608
+ type="text",
609
+ text=json.dumps(
610
+ {
611
+ "error": (
612
+ "pitch must be an integer in 5..85 "
613
+ "(M5Stack-recommended operating range; "
614
+ "for the wider firmware hard clamp "
615
+ "0..88 use `set_head_angles`). got "
616
+ f"{pitch_val!r}"
617
+ )
618
+ }
619
+ ),
620
+ )
621
+ ]
622
+
450
623
  # Map MCP client tool names to ESP32 MCP tool names (self.* prefix)
451
624
  tool_map: dict[str, tuple[str, dict[str, Any]]] = {
452
625
  "get_device_info": (
@@ -0,0 +1,62 @@
1
+ """STT framework for Phase 4 (Issue #91).
2
+
3
+ Companion to :mod:`stackchan_mcp.tts`: this package provides the
4
+ engine-agnostic skeleton for the gateway-side ``listen(duration_ms)``
5
+ MCP tool plus the concrete faster-whisper (default, local) and
6
+ OpenAI Whisper API engines.
7
+
8
+ Engines whose modules require optional extras to import are registered
9
+ behind ``try / except ImportError`` so the framework still works when
10
+ the corresponding extra is missing.
11
+ """
12
+
13
+ from __future__ import annotations
14
+
15
+ import logging
16
+ from typing import Callable
17
+
18
+ from .base import EngineRegistry, STTEngine, get_registry
19
+ from .orchestrator import DEFAULT_ENGINE, listen_and_transcribe
20
+
21
+ _logger = logging.getLogger(__name__)
22
+
23
+
24
def _try_register(register_fn: Callable[[], None], engine_label: str) -> None:
    """Run ``register_fn`` and swallow :class:`ImportError`.

    Lets an engine whose module needs an optional extra fail to register
    without breaking the rest of the framework. Only ``ImportError`` is
    suppressed (and logged at debug level); any other exception raised
    by ``register_fn`` propagates to the caller.

    Args:
        register_fn: Zero-argument callable that performs the
            registration.
        engine_label: Human-readable engine name used in the log message.
    """
    try:
        register_fn()
    except ImportError as exc:
        _logger.debug("Skipping %s engine registration: %s", engine_label, exc)
38
+
39
+
40
def _register_faster_whisper() -> None:
    """Instantiate the faster-whisper engine and add it to the registry."""
    # Imported here, not at module top, so an ImportError surfaces
    # inside the registration call where the caller can handle it.
    from .faster_whisper import FasterWhisperEngine

    get_registry().register(FasterWhisperEngine())
44
+
45
+
46
def _register_openai_whisper() -> None:
    """Instantiate the OpenAI Whisper engine and add it to the registry."""
    # Imported here, not at module top, so an ImportError surfaces
    # inside the registration call where the caller can handle it.
    from .openai_whisper import OpenAIWhisperEngine

    get_registry().register(OpenAIWhisperEngine())
50
+
51
+
52
+ _try_register(_register_faster_whisper, "faster-whisper")
53
+ _try_register(_register_openai_whisper, "openai-whisper")
54
+
55
+
56
+ __all__ = [
57
+ "DEFAULT_ENGINE",
58
+ "EngineRegistry",
59
+ "STTEngine",
60
+ "get_registry",
61
+ "listen_and_transcribe",
62
+ ]