stackchan-mcp 0.5.0__tar.gz → 0.6.0__tar.gz

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (53)
  1. {stackchan_mcp-0.5.0 → stackchan_mcp-0.6.0}/PKG-INFO +9 -1
  2. {stackchan_mcp-0.5.0 → stackchan_mcp-0.6.0}/pyproject.toml +19 -1
  3. stackchan_mcp-0.6.0/stackchan_mcp/audio_stream.py +151 -0
  4. {stackchan_mcp-0.5.0 → stackchan_mcp-0.6.0}/stackchan_mcp/esp32_client.py +73 -1
  5. {stackchan_mcp-0.5.0 → stackchan_mcp-0.6.0}/stackchan_mcp/stdio_server.py +79 -0
  6. stackchan_mcp-0.6.0/stackchan_mcp/stt/__init__.py +62 -0
  7. stackchan_mcp-0.6.0/stackchan_mcp/stt/audio_utils.py +102 -0
  8. stackchan_mcp-0.6.0/stackchan_mcp/stt/base.py +94 -0
  9. stackchan_mcp-0.6.0/stackchan_mcp/stt/faster_whisper.py +217 -0
  10. stackchan_mcp-0.6.0/stackchan_mcp/stt/openai_whisper.py +177 -0
  11. stackchan_mcp-0.6.0/stackchan_mcp/stt/orchestrator.py +306 -0
  12. stackchan_mcp-0.6.0/tests/test_audio_stream.py +145 -0
  13. {stackchan_mcp-0.5.0 → stackchan_mcp-0.6.0}/tests/test_esp32_client.py +81 -0
  14. stackchan_mcp-0.6.0/tests/test_stt_audio_utils.py +100 -0
  15. stackchan_mcp-0.6.0/tests/test_stt_framework.py +195 -0
  16. stackchan_mcp-0.6.0/tests/test_stt_orchestrator.py +441 -0
  17. {stackchan_mcp-0.5.0 → stackchan_mcp-0.6.0}/uv.lock +775 -3
  18. stackchan_mcp-0.5.0/stackchan_mcp/audio_stream.py +0 -52
  19. stackchan_mcp-0.5.0/tests/test_audio_stream.py +0 -60
  20. {stackchan_mcp-0.5.0 → stackchan_mcp-0.6.0}/.env.example +0 -0
  21. {stackchan_mcp-0.5.0 → stackchan_mcp-0.6.0}/.gitignore +0 -0
  22. {stackchan_mcp-0.5.0 → stackchan_mcp-0.6.0}/LICENSE +0 -0
  23. {stackchan_mcp-0.5.0 → stackchan_mcp-0.6.0}/README.md +0 -0
  24. {stackchan_mcp-0.5.0 → stackchan_mcp-0.6.0}/stackchan_mcp/__init__.py +0 -0
  25. {stackchan_mcp-0.5.0 → stackchan_mcp-0.6.0}/stackchan_mcp/__main__.py +0 -0
  26. {stackchan_mcp-0.5.0 → stackchan_mcp-0.6.0}/stackchan_mcp/capture_server.py +0 -0
  27. {stackchan_mcp-0.5.0 → stackchan_mcp-0.6.0}/stackchan_mcp/cli.py +0 -0
  28. {stackchan_mcp-0.5.0 → stackchan_mcp-0.6.0}/stackchan_mcp/gateway.py +0 -0
  29. {stackchan_mcp-0.5.0 → stackchan_mcp-0.6.0}/stackchan_mcp/handlers/__init__.py +0 -0
  30. {stackchan_mcp-0.5.0 → stackchan_mcp-0.6.0}/stackchan_mcp/handlers/audio.py +0 -0
  31. {stackchan_mcp-0.5.0 → stackchan_mcp-0.6.0}/stackchan_mcp/handlers/camera.py +0 -0
  32. {stackchan_mcp-0.5.0 → stackchan_mcp-0.6.0}/stackchan_mcp/handlers/robot.py +0 -0
  33. {stackchan_mcp-0.5.0 → stackchan_mcp-0.6.0}/stackchan_mcp/mcp_router.py +0 -0
  34. {stackchan_mcp-0.5.0 → stackchan_mcp-0.6.0}/stackchan_mcp/protocol.py +0 -0
  35. {stackchan_mcp-0.5.0 → stackchan_mcp-0.6.0}/stackchan_mcp/server.py +0 -0
  36. {stackchan_mcp-0.5.0 → stackchan_mcp-0.6.0}/stackchan_mcp/tools.py +0 -0
  37. {stackchan_mcp-0.5.0 → stackchan_mcp-0.6.0}/stackchan_mcp/tts/__init__.py +0 -0
  38. {stackchan_mcp-0.5.0 → stackchan_mcp-0.6.0}/stackchan_mcp/tts/audio_utils.py +0 -0
  39. {stackchan_mcp-0.5.0 → stackchan_mcp-0.6.0}/stackchan_mcp/tts/base.py +0 -0
  40. {stackchan_mcp-0.5.0 → stackchan_mcp-0.6.0}/stackchan_mcp/tts/orchestrator.py +0 -0
  41. {stackchan_mcp-0.5.0 → stackchan_mcp-0.6.0}/stackchan_mcp/tts/voicevox.py +0 -0
  42. {stackchan_mcp-0.5.0 → stackchan_mcp-0.6.0}/tests/_audio_fixtures.py +0 -0
  43. {stackchan_mcp-0.5.0 → stackchan_mcp-0.6.0}/tests/conftest.py +0 -0
  44. {stackchan_mcp-0.5.0 → stackchan_mcp-0.6.0}/tests/test_audio_utils.py +0 -0
  45. {stackchan_mcp-0.5.0 → stackchan_mcp-0.6.0}/tests/test_capture_server.py +0 -0
  46. {stackchan_mcp-0.5.0 → stackchan_mcp-0.6.0}/tests/test_cli.py +0 -0
  47. {stackchan_mcp-0.5.0 → stackchan_mcp-0.6.0}/tests/test_gateway.py +0 -0
  48. {stackchan_mcp-0.5.0 → stackchan_mcp-0.6.0}/tests/test_mcp_router.py +0 -0
  49. {stackchan_mcp-0.5.0 → stackchan_mcp-0.6.0}/tests/test_orchestrator.py +0 -0
  50. {stackchan_mcp-0.5.0 → stackchan_mcp-0.6.0}/tests/test_protocol.py +0 -0
  51. {stackchan_mcp-0.5.0 → stackchan_mcp-0.6.0}/tests/test_stdio_server.py +0 -0
  52. {stackchan_mcp-0.5.0 → stackchan_mcp-0.6.0}/tests/test_tts_framework.py +0 -0
  53. {stackchan_mcp-0.5.0 → stackchan_mcp-0.6.0}/tests/test_voicevox.py +0 -0
@@ -1,6 +1,6 @@
1
1
  Metadata-Version: 2.4
2
2
  Name: stackchan-mcp
3
- Version: 0.5.0
3
+ Version: 0.6.0
4
4
  Summary: Two-faced MCP gateway for StackChan (xiaozhi-esp32): bridges stdio MCP clients to the ESP32 over WebSocket + HTTP.
5
5
  Project-URL: Homepage, https://github.com/kisaragi-mochi/stackchan-mcp
6
6
  Project-URL: Repository, https://github.com/kisaragi-mochi/stackchan-mcp
@@ -27,6 +27,14 @@ Requires-Dist: mcp>=1.0
27
27
  Requires-Dist: pydantic>=2
28
28
  Requires-Dist: python-dotenv
29
29
  Requires-Dist: websockets>=12
30
+ Provides-Extra: stt
31
+ Requires-Dist: opuslib>=3; extra == 'stt'
32
+ Provides-Extra: stt-faster-whisper
33
+ Requires-Dist: faster-whisper>=1.0; extra == 'stt-faster-whisper'
34
+ Requires-Dist: opuslib>=3; extra == 'stt-faster-whisper'
35
+ Provides-Extra: stt-openai
36
+ Requires-Dist: openai>=1.0; extra == 'stt-openai'
37
+ Requires-Dist: opuslib>=3; extra == 'stt-openai'
30
38
  Provides-Extra: tts
31
39
  Requires-Dist: httpx>=0.27; extra == 'tts'
32
40
  Requires-Dist: opuslib>=3; extra == 'tts'
@@ -1,6 +1,6 @@
1
1
  [project]
2
2
  name = "stackchan-mcp"
3
- version = "0.5.0"
3
+ version = "0.6.0"
4
4
  description = "Two-faced MCP gateway for StackChan (xiaozhi-esp32): bridges stdio MCP clients to the ESP32 over WebSocket + HTTP."
5
5
  readme = "README.md"
6
6
  requires-python = ">=3.10"
@@ -48,6 +48,24 @@ tts-voicevox = [
48
48
  "stackchan-mcp[tts]",
49
49
  ]
50
50
 
51
+ # Phase 4 STT — see Issue #91.
52
+ # The base `stt` extra carries `opuslib` for decoding the device's
53
+ # inbound Opus frames. Concrete engines live behind their own extras
54
+ # so users only pull in the heavy ML dependencies they actually need.
55
+ # * faster-whisper — local Whisper via CTranslate2 (default, MIT)
56
+ # * openai — OpenAI Whisper API client (cloud)
57
+ stt = [
58
+ "opuslib>=3",
59
+ ]
60
+ stt-faster-whisper = [
61
+ "stackchan-mcp[stt]",
62
+ "faster-whisper>=1.0",
63
+ ]
64
+ stt-openai = [
65
+ "stackchan-mcp[stt]",
66
+ "openai>=1.0",
67
+ ]
68
+
51
69
  [project.urls]
52
70
  Homepage = "https://github.com/kisaragi-mochi/stackchan-mcp"
53
71
  Repository = "https://github.com/kisaragi-mochi/stackchan-mcp"
@@ -0,0 +1,151 @@
1
+ """Opus audio frame handling for the gateway <-> device link.
2
+
3
+ Outbound (TTS) frames are produced by
4
+ :mod:`stackchan_mcp.tts.audio_utils` and pushed here to the connected
5
+ ESP32 via :meth:`stackchan_mcp.esp32_client.ESP32Manager.send_audio_frame`.
6
+
7
+ The inbound side (STT pipeline, Phase 4 / Issue #91) is now wired:
8
+ binary frames coming up from the device land in
9
+ :func:`handle_audio_frame`, which buffers them into a module-level
10
+ recording slot when one is active. The
11
+ :mod:`stackchan_mcp.stt.orchestrator` opens the slot via
12
+ :func:`start_recording` before sending ``listen.start`` to the device
13
+ and closes it via :func:`stop_recording` after the capture window;
14
+ outside an active recording, inbound frames are still discarded.
15
+
16
+ The recording slot is intentionally a module-level singleton: the
17
+ device's :class:`stackchan_mcp.esp32_client.ESP32Manager` only manages
18
+ one connection, and the STT orchestrator serialises ``listen()`` calls
19
+ through :attr:`ESP32Manager.listen_lock`, so concurrent captures
20
+ cannot race the buffer. If multi-device support lands later, this
21
+ should move onto the connection object.
22
+ """
23
+
24
+ from __future__ import annotations
25
+
26
+ import logging
27
+ from typing import TYPE_CHECKING, Iterable
28
+
29
+ if TYPE_CHECKING:
30
+ from .esp32_client import ESP32Manager
31
+
32
+ logger = logging.getLogger(__name__)
33
+
34
+
35
+ # --- Recording slot (inbound STT capture) ---------------------------------
36
+ #
37
+ # A single capture at a time is enforced by the orchestrator's
38
+ # ``listen_lock``; this module only owns the buffer itself.
39
+
40
+ _recording_session_id: str | None = None
41
+ _recording_frames: list[bytes] = []
42
+
43
+
44
+ def start_recording(session_id: str) -> None:
45
+ """Open a fresh recording slot for ``session_id``.
46
+
47
+ Any frames already buffered are discarded so a previous call that
48
+ crashed before ``stop_recording`` cannot leak into the next
49
+ capture. The orchestrator wraps start/stop in a try/finally to
50
+ guarantee the slot is closed even on error.
51
+ """
52
+ global _recording_session_id, _recording_frames
53
+ if _recording_session_id is not None:
54
+ # Defensive: the lock should prevent this, but if it ever
55
+ # fires we leak no audio — just log loudly so the regression
56
+ # is visible.
57
+ logger.warning(
58
+ "start_recording called while session=%s was still active; "
59
+ "dropping %d buffered frames",
60
+ _recording_session_id,
61
+ len(_recording_frames),
62
+ )
63
+ _recording_session_id = session_id
64
+ _recording_frames = []
65
+
66
+
67
+ def stop_recording() -> list[bytes]:
68
+ """Close the recording slot and return the buffered Opus frames.
69
+
70
+ Returns an empty list if no recording was active. The slot is
71
+ cleared whether or not frames were captured so the next call to
72
+ :func:`start_recording` starts clean.
73
+ """
74
+ global _recording_session_id, _recording_frames
75
+ frames = _recording_frames
76
+ _recording_session_id = None
77
+ _recording_frames = []
78
+ return frames
79
+
80
+
81
+ def is_recording() -> bool:
82
+ """Return ``True`` when a recording slot is currently open."""
83
+ return _recording_session_id is not None
84
+
85
+
86
+ async def handle_audio_frame(data: bytes, session_id: str) -> None:
87
+ """Process an incoming binary Opus frame from the device.
88
+
89
+ When a recording slot is active (see :func:`start_recording`) AND
90
+ the frame belongs to the recording's session, appends the frame
91
+ to the in-memory buffer for later decoding by the STT
92
+ orchestrator. Frames from a different session — typical during
93
+ a connection swap, where the old WebSocket handler is still
94
+ draining incoming bytes after :meth:`ESP32Connection.disconnect`
95
+ has been called on the main task — are dropped so they cannot
96
+ bleed into the new connection's capture buffer.
97
+
98
+ Outside of an active recording the frame is logged at debug
99
+ level and discarded; the device may emit audio on its own (e.g.
100
+ after an autonomous wake-word detection) and the gateway has no
101
+ STT pipeline running for those frames yet.
102
+ """
103
+ if _recording_session_id is None:
104
+ logger.debug(
105
+ "audio_frame session=%s bytes=%d (discarded — no active recording)",
106
+ session_id,
107
+ len(data),
108
+ )
109
+ return
110
+ if _recording_session_id != session_id:
111
+ # A different connection is sending audio while a recording
112
+ # for another session is in flight. This happens when the ESP32
113
+ # reconnects: ``ESP32Manager._handler`` swaps in a new
114
+ # ``ESP32Connection`` and marks the old one disconnected,
115
+ # but the old socket's ``async for message in ws`` loop can
116
+ # still drain a frame or two before the close lands. Letting
117
+ # those into the buffer would corrupt the new session's
118
+ # transcription, so drop them here.
119
+ logger.debug(
120
+ "audio_frame session=%s bytes=%d (discarded — does not match "
121
+ "recording session=%s)",
122
+ session_id,
123
+ len(data),
124
+ _recording_session_id,
125
+ )
126
+ return
127
+ _recording_frames.append(data)
128
+ logger.debug(
129
+ "audio_frame session=%s bytes=%d buffered (recording active)",
130
+ session_id,
131
+ len(data),
132
+ )
133
+
134
+
135
+ async def push_opus_frames(
136
+ esp32: ESP32Manager,
137
+ frames: Iterable[bytes],
138
+ ) -> int:
139
+ """Push Opus frames to the connected ESP32.
140
+
141
+ Returns the number of frames sent so the caller can report this to
142
+ the MCP client. Raises :class:`ConnectionError` (via
143
+ :meth:`ESP32Manager.send_audio_frame`) if the device disconnects
144
+ mid-stream — the orchestrator turns that into a clean MCP error
145
+ rather than letting it bubble up as a stack trace.
146
+ """
147
+ sent = 0
148
+ for frame in frames:
149
+ await esp32.send_audio_frame(frame)
150
+ sent += 1
151
+ return sent
@@ -17,6 +17,7 @@ import websockets
17
17
  import websockets.exceptions
18
18
  from websockets.asyncio.server import ServerConnection
19
19
 
20
+ from .audio_stream import handle_audio_frame
20
21
  from .protocol import HelloResponse, make_mcp_message, parse_jsonrpc_response
21
22
 
22
23
  logger = logging.getLogger(__name__)
@@ -206,6 +207,33 @@ class ESP32Connection:
206
207
  }
207
208
  await self._ws_send(json.dumps(message))
208
209
 
210
+ async def send_listen_state(self, state: str, mode: str = "manual") -> None:
211
+ """Send a listen state notification (``start`` / ``stop``).
212
+
213
+ Server-driven counterpart to the device's existing
214
+ :func:`Protocol::SendStartListening` (Issue #91). The
215
+ firmware's :func:`Application::OnIncomingJson` dispatches
216
+ ``state: "start"`` to :func:`Application::StartListening` and
217
+ ``state: "stop"`` to :func:`Application::StopListening`.
218
+
219
+ ``mode`` is currently sent only for ``state="start"`` and is
220
+ carried on the wire for forward-compatibility — the firmware
221
+ accepts but ignores it in Phase 1 because
222
+ :func:`HandleStartListeningEvent` unconditionally enters
223
+ ``kListeningModeManualStop`` (the gateway controls the stop
224
+ boundary explicitly).
225
+ """
226
+ if not self._connected:
227
+ raise ConnectionError("ESP32 not connected")
228
+ message: dict[str, Any] = {
229
+ "session_id": self.session_id,
230
+ "type": "listen",
231
+ "state": state,
232
+ }
233
+ if state == "start":
234
+ message["mode"] = mode
235
+ await self._ws_send(json.dumps(message))
236
+
209
237
  def disconnect(self) -> None:
210
238
  """Mark connection as disconnected."""
211
239
  self._connected = False
@@ -242,6 +270,21 @@ class ESP32Manager:
242
270
  # if multi-device support lands later, the lock should move
243
271
  # onto :class:`ESP32Connection` instead.
244
272
  self._tts_lock = asyncio.Lock()
273
+ # Inbound STT capture (Issue #91) shares the TTS lock rather
274
+ # than running on a separate one. The firmware's
275
+ # ``HandleStartListeningEvent`` aborts any in-flight TTS when
276
+ # a listen.start arrives mid-speaking (state ==
277
+ # ``kDeviceStateSpeaking`` → ``AbortSpeaking`` →
278
+ # ``SetListeningMode(kListeningModeManualStop)``), so two
279
+ # operations on the same device's audio path would
280
+ # otherwise step on each other: a ``listen()`` could yank a
281
+ # ``say()`` out of speaking mid-utterance, or a ``say()``
282
+ # could start streaming TTS frames into the buffer a
283
+ # concurrent ``listen()`` is capturing. Treating the audio
284
+ # path as a single resource makes the device's state machine
285
+ # observable from gateway code; if a full-duplex contract
286
+ # ever lands later the lock can split again.
287
+ self._listen_lock = self._tts_lock
245
288
 
246
289
  @property
247
290
  def device_connected(self) -> bool:
@@ -260,6 +303,17 @@ class ESP32Manager:
260
303
  """
261
304
  return self._tts_lock
262
305
 
306
+ @property
307
+ def listen_lock(self) -> asyncio.Lock:
308
+ """Per-device lock guarding the STT capture sequence.
309
+
310
+ See :attr:`_listen_lock` for the rationale; the orchestrator
311
+ wraps the entire ``listen.start`` → wait → ``listen.stop``
312
+ block in ``async with`` on this lock so two concurrent
313
+ ``listen()`` calls cannot share the inbound recording slot.
314
+ """
315
+ return self._listen_lock
316
+
263
317
  async def start(
264
318
  self,
265
319
  host: str = "0.0.0.0",
@@ -330,7 +384,14 @@ class ESP32Manager:
330
384
  try:
331
385
  async for message in ws:
332
386
  if isinstance(message, bytes):
333
- # Binary = audio frame, ignore for now
387
+ # Binary = audio frame. Forward to the audio_stream
388
+ # module which buffers it for STT capture (Issue
389
+ # #91) when a recording slot is open, or discards
390
+ # it otherwise. Only protocol v1 is supported on
391
+ # the inbound side today; the orchestrator gates
392
+ # listen() on protocol_version=1 so v2/v3 frames
393
+ # cannot reach this point with recording active.
394
+ await handle_audio_frame(message, session_id)
334
395
  continue
335
396
 
336
397
  try:
@@ -451,6 +512,17 @@ class ESP32Manager:
451
512
  raise ConnectionError("No ESP32 device connected")
452
513
  await self._connection.send_tts_state(state)
453
514
 
515
+ async def send_listen_state(self, state: str, mode: str = "manual") -> None:
516
+ """Send a listen state notification to put the device into /
517
+ out of listening mode (Issue #91).
518
+
519
+ See :meth:`ESP32Connection.send_listen_state` for the wire
520
+ format and the firmware-side dispatch.
521
+ """
522
+ if not self._connection or not self._connection.connected:
523
+ raise ConnectionError("No ESP32 device connected")
524
+ await self._connection.send_listen_state(state, mode=mode)
525
+
454
526
  def get_status(self) -> dict[str, Any]:
455
527
  """Get current connection status."""
456
528
  if not self._connection or not self._connection.connected:
@@ -15,6 +15,7 @@ from mcp.server.stdio import stdio_server
15
15
  from mcp.types import TextContent, Tool
16
16
 
17
17
  from .gateway import get_gateway
18
+ from .stt import listen_and_transcribe
18
19
  from .tts import synthesize_and_send
19
20
 
20
21
  logger = logging.getLogger(__name__)
@@ -408,6 +409,65 @@ def create_server() -> Server:
408
409
  "required": ["text"],
409
410
  },
410
411
  ),
412
+ Tool(
413
+ name="listen",
414
+ description=(
415
+ "Capture a short utterance from the device microphone and "
416
+ "transcribe it via a gateway-side STT engine (Phase 4, "
417
+ "Issue #91). The gateway sends a 'listen' notification "
418
+ "over the existing WebSocket to put the device firmware "
419
+ "into listening mode, buffers the Opus frames the device "
420
+ "streams up during the capture window, then decodes and "
421
+ "transcribes them once the window closes. Requires a "
422
+ "minimal firmware change to handle the inbound 'listen' "
423
+ "wire type (paired with this gateway release). Engine is "
424
+ "selectable via 'engine' (default 'faster-whisper', local). "
425
+ "Install the relevant extra "
426
+ "('pip install stackchan-mcp[stt-faster-whisper]' or "
427
+ "'stt-openai'); calling this tool before an engine is "
428
+ "registered returns a clear error."
429
+ ),
430
+ inputSchema={
431
+ "type": "object",
432
+ "properties": {
433
+ "duration_ms": {
434
+ "type": "integer",
435
+ "description": (
436
+ "Capture window in milliseconds. Clamped to "
437
+ "[100, 30000]."
438
+ ),
439
+ "default": 5000,
440
+ "minimum": 100,
441
+ "maximum": 30000,
442
+ },
443
+ "engine": {
444
+ "type": "string",
445
+ "description": (
446
+ "Engine identifier (e.g. 'faster-whisper', "
447
+ "'openai-whisper'). Default 'faster-whisper'."
448
+ ),
449
+ "default": "faster-whisper",
450
+ },
451
+ "language": {
452
+ "type": "string",
453
+ "description": (
454
+ "ISO 639-1 language code (e.g. 'ja'). Pass "
455
+ "an empty string or omit for autodetect."
456
+ ),
457
+ "default": "ja",
458
+ },
459
+ "model": {
460
+ "type": "string",
461
+ "description": (
462
+ "Engine-specific model identifier (e.g. "
463
+ "'base' / 'small' / 'medium' for faster-"
464
+ "whisper, 'whisper-1' for OpenAI). Engines "
465
+ "fall back to their default when omitted."
466
+ ),
467
+ },
468
+ },
469
+ },
470
+ ),
411
471
  ]
412
472
 
413
473
  @server.call_tool()
@@ -439,6 +499,25 @@ def create_server() -> Server:
439
499
  ]
440
500
  return [TextContent(type="text", text=json.dumps(result))]
441
501
 
502
+ if name == "listen":
503
+ # STT runs on the gateway side. The orchestrator drives the
504
+ # device's listening state via ``listen.start``/``stop``
505
+ # notifications, buffers the inbound Opus frames, decodes
506
+ # them, and hands the PCM blob to the registered engine.
507
+ # Same error-class discipline as say(): ValueError /
508
+ # NotImplementedError / RuntimeError all turn into clean
509
+ # MCP error JSON.
510
+ try:
511
+ result = await listen_and_transcribe(arguments, gateway=gw)
512
+ except (ValueError, NotImplementedError, RuntimeError) as exc:
513
+ return [
514
+ TextContent(
515
+ type="text",
516
+ text=json.dumps({"error": str(exc)}),
517
+ )
518
+ ]
519
+ return [TextContent(type="text", text=json.dumps(result))]
520
+
442
521
  if not gw.esp32.device_connected:
443
522
  return [
444
523
  TextContent(
@@ -0,0 +1,62 @@
1
+ """STT framework for Phase 4 (Issue #91).
2
+
3
+ Companion to :mod:`stackchan_mcp.tts`: this package provides the
4
+ engine-agnostic skeleton for the gateway-side ``listen(duration_ms)``
5
+ MCP tool plus the concrete faster-whisper (default, local) and
6
+ OpenAI Whisper API engines.
7
+
8
+ Engines whose modules require optional extras to import are registered
9
+ behind ``try / except ImportError`` so the framework still works when
10
+ the corresponding extra is missing.
11
+ """
12
+
13
+ from __future__ import annotations
14
+
15
+ import logging
16
+ from typing import Callable
17
+
18
+ from .base import EngineRegistry, STTEngine, get_registry
19
+ from .orchestrator import DEFAULT_ENGINE, listen_and_transcribe
20
+
21
+ _logger = logging.getLogger(__name__)
22
+
23
+
24
+ def _try_register(register_fn: Callable[[], None], engine_label: str) -> None:
25
+ """Run ``register_fn`` and swallow ImportErrors.
26
+
27
+ Used so an engine whose top-level module needs an optional extra
28
+ (e.g. faster-whisper / openai) can fail to register cleanly without
29
+ breaking the rest of the framework. Engine modules themselves
30
+ import cleanly; their heavy dependencies are imported lazily inside
31
+ :meth:`STTEngine.transcribe` so this layer just lights up the
32
+ registry slot.
33
+ """
34
+ try:
35
+ register_fn()
36
+ except ImportError as exc:
37
+ _logger.debug("Skipping %s engine registration: %s", engine_label, exc)
38
+
39
+
40
+ def _register_faster_whisper() -> None:
41
+ from .faster_whisper import FasterWhisperEngine
42
+
43
+ get_registry().register(FasterWhisperEngine())
44
+
45
+
46
+ def _register_openai_whisper() -> None:
47
+ from .openai_whisper import OpenAIWhisperEngine
48
+
49
+ get_registry().register(OpenAIWhisperEngine())
50
+
51
+
52
+ _try_register(_register_faster_whisper, "faster-whisper")
53
+ _try_register(_register_openai_whisper, "openai-whisper")
54
+
55
+
56
+ __all__ = [
57
+ "DEFAULT_ENGINE",
58
+ "EngineRegistry",
59
+ "STTEngine",
60
+ "get_registry",
61
+ "listen_and_transcribe",
62
+ ]
@@ -0,0 +1,102 @@
1
+ """Audio utilities for the STT pipeline.
2
+
3
+ Mirror of :mod:`stackchan_mcp.tts.audio_utils` for the inbound direction:
4
+ the helpers here decode Opus frames coming up from the device and
5
+ concatenate them into a single PCM blob that a recogniser can consume.
6
+
7
+ ``opuslib`` is imported lazily inside :func:`decode_opus_frames` so the
8
+ rest of the module stays usable in environments where the ``[stt]``
9
+ extra is not installed.
10
+
11
+ Device-side Opus parameters come from the firmware's hello handshake
12
+ (``firmware/main/protocols/websocket_protocol.cc::GetHelloMessage``)::
13
+
14
+ sample_rate = 16000 Hz
15
+ channels = 1
16
+ frame_duration_ms = OPUS_FRAME_DURATION_MS (60 ms)
17
+ samples_per_frame = sample_rate * frame_duration_ms / 1000 = 960
18
+ """
19
+
20
+ from __future__ import annotations
21
+
22
+ import logging
23
+ from typing import Iterable
24
+
25
+ logger = logging.getLogger(__name__)
26
+
27
+
28
+ #: Opus sample rate the device encoder is configured for.
29
+ DEVICE_SAMPLE_RATE = 16000
30
+
31
+ #: Opus channel count (mono).
32
+ DEVICE_CHANNELS = 1
33
+
34
+ #: Opus frame duration in milliseconds (matches the firmware's
35
+ #: ``OPUS_FRAME_DURATION_MS``). Kept symmetric with
36
+ #: :data:`stackchan_mcp.tts.audio_utils.DEVICE_FRAME_DURATION_MS`.
37
+ DEVICE_FRAME_DURATION_MS = 60
38
+
39
+ #: PCM samples per Opus frame at the device's settings (= 960).
40
+ SAMPLES_PER_FRAME = DEVICE_SAMPLE_RATE * DEVICE_FRAME_DURATION_MS // 1000
41
+
42
+
43
+ def decode_opus_frames(
44
+ frames: Iterable[bytes],
45
+ *,
46
+ sample_rate: int = DEVICE_SAMPLE_RATE,
47
+ channels: int = DEVICE_CHANNELS,
48
+ frame_duration_ms: int = DEVICE_FRAME_DURATION_MS,
49
+ ) -> bytes:
50
+ """Decode an iterable of Opus frames into a single PCM blob.
51
+
52
+ Args:
53
+ frames: Iterable of raw Opus payloads (i.e. the protocol v1
54
+ wire format the firmware emits when ``protocol_version=1``;
55
+ see :class:`stackchan_mcp.esp32_client.ESP32Connection`).
56
+ Each frame must contain exactly ``frame_duration_ms`` of
57
+ audio at ``sample_rate`` mono.
58
+ sample_rate: Decoder sample rate (Hz). Defaults to the device's
59
+ 16 kHz.
60
+ channels: Channel count. Defaults to mono.
61
+ frame_duration_ms: Per-frame duration in ms. Defaults to the
62
+ device's 60 ms cadence.
63
+
64
+ Returns:
65
+ Signed 16-bit little-endian PCM bytes concatenated across all
66
+ frames. Frames that fail to decode are logged at warning level
67
+ and skipped — partial transcription is better than failing the
68
+ whole listen() call because one frame got mangled on the wire.
69
+
70
+ Raises:
71
+ RuntimeError: if ``opuslib`` is not installed. The error
72
+ message points at the right install command so the caller
73
+ can surface a clean MCP error.
74
+ """
75
+ try:
76
+ import opuslib # type: ignore[import-not-found]
77
+ except ImportError as exc: # pragma: no cover - exercised via integration
78
+ raise RuntimeError(
79
+ "opuslib is not installed. Install with "
80
+ "'pip install stackchan-mcp[stt]' to enable Opus decoding."
81
+ ) from exc
82
+
83
+ samples_per_frame = sample_rate * frame_duration_ms // 1000
84
+ decoder = opuslib.Decoder(sample_rate, channels)
85
+
86
+ pcm_chunks: list[bytes] = []
87
+ for index, frame in enumerate(frames):
88
+ if not frame:
89
+ continue
90
+ try:
91
+ pcm = decoder.decode(frame, samples_per_frame)
92
+ except Exception as exc: # pragma: no cover - decode errors are rare
93
+ logger.warning(
94
+ "Opus decode failed for frame %d (size=%d): %s; skipping",
95
+ index,
96
+ len(frame),
97
+ exc,
98
+ )
99
+ continue
100
+ pcm_chunks.append(pcm)
101
+
102
+ return b"".join(pcm_chunks)
@@ -0,0 +1,94 @@
1
+ """STT engine abstraction.
2
+
3
+ Each concrete engine takes 16 kHz mono PCM (signed 16-bit LE) and
4
+ returns a transcription. Opus decoding from the device wire format and
5
+ PCM buffering are handled by :mod:`stackchan_mcp.stt.orchestrator` so
6
+ engines stay focused on recognition.
7
+
8
+ This module is intentionally dependency-free: it must import cleanly
9
+ without ``faster-whisper`` / ``openai`` / ``opuslib`` so that callers
10
+ can introspect the registered engines even when the optional ``[stt]``
11
+ extras are not installed. Mirrors :mod:`stackchan_mcp.tts.base`.
12
+ """
13
+
14
+ from __future__ import annotations
15
+
16
+ from abc import ABC, abstractmethod
17
+ from typing import Any
18
+
19
+
20
+ class STTEngine(ABC):
21
+ """Abstract base for STT engines.
22
+
23
+ Subclasses must set :attr:`name` to a stable identifier (matched
24
+ against the ``engine`` argument of the ``listen`` MCP tool) and
25
+ implement :meth:`transcribe`.
26
+ """
27
+
28
+ #: Stable identifier used to look this engine up in the registry.
29
+ #: Concrete subclasses must override with a non-empty string.
30
+ name: str = ""
31
+
32
+ @abstractmethod
33
+ async def transcribe(self, pcm: bytes, **opts: Any) -> dict[str, Any]:
34
+ """Transcribe 16 kHz mono PCM (signed 16-bit LE) into text.
35
+
36
+ Args:
37
+ pcm: Raw PCM bytes at 16 kHz, mono, signed 16-bit
38
+ little-endian. The orchestrator handles Opus decoding
39
+ and frame concatenation before calling this method.
40
+ **opts: Engine-specific options. Recognised keys include
41
+ ``language`` (ISO 639-1 code, e.g. ``"ja"``, or
42
+ ``None`` for autodetect) and ``model`` (engine-specific
43
+ model name, e.g. ``"base"`` / ``"small"`` for
44
+ faster-whisper). Engines should ignore unknown options
45
+ rather than raise, so the ``listen`` tool can pass a
46
+ uniform argument set.
47
+
48
+ Returns:
49
+ Dict with at least ``text`` (transcribed string) and
50
+ ``language`` (ISO 639-1 code that the engine used or
51
+ detected). Engines may add extra keys (e.g. ``segments``,
52
+ ``confidence``) for diagnostics — the orchestrator surfaces
53
+ ``text`` and ``language`` to the caller and leaves the rest
54
+ available for future extensions.
55
+ """
56
+
57
+
58
+ class EngineRegistry:
59
+ """Tracks available STT engines by name.
60
+
61
+ Concrete engines are registered by :mod:`stackchan_mcp.stt` when the
62
+ package is imported; an engine whose import raises ``ImportError``
63
+ is skipped (see :mod:`stackchan_mcp.stt.faster_whisper` and
64
+ :mod:`stackchan_mcp.stt.openai_whisper`).
65
+ """
66
+
67
+ def __init__(self) -> None:
68
+ self._engines: dict[str, STTEngine] = {}
69
+
70
+ def register(self, engine: STTEngine) -> None:
71
+ """Register ``engine`` under ``engine.name``.
72
+
73
+ Replaces any previously registered engine with the same name —
74
+ this is intentional so tests can inject fakes.
75
+ """
76
+ if not engine.name:
77
+ raise ValueError("STTEngine.name must be a non-empty string")
78
+ self._engines[engine.name] = engine
79
+
80
+ def get(self, name: str) -> STTEngine | None:
81
+ """Return the engine registered under ``name``, or ``None``."""
82
+ return self._engines.get(name)
83
+
84
+ def names(self) -> list[str]:
85
+ """Return all registered engine names, sorted alphabetically."""
86
+ return sorted(self._engines.keys())
87
+
88
+
89
+ _default_registry = EngineRegistry()
90
+
91
+
92
+ def get_registry() -> EngineRegistry:
93
+ """Return the process-wide default :class:`EngineRegistry`."""
94
+ return _default_registry