PyPI - stackchan-mcp - Versions diffs - 0.5.0__tar.gz → 0.7.0__tar.gz - Mend

stackchan-mcp 0.5.0__tar.gz → 0.7.0__tar.gz

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (53) hide show

{stackchan_mcp-0.5.0 → stackchan_mcp-0.7.0}/PKG-INFO RENAMED Viewed

@@ -1,6 +1,6 @@
 Metadata-Version: 2.4
 Name: stackchan-mcp
-Version: 0.5.0
+Version: 0.7.0
 Summary: Two-faced MCP gateway for StackChan (xiaozhi-esp32): bridges stdio MCP clients to the ESP32 over WebSocket + HTTP.
 Project-URL: Homepage, https://github.com/kisaragi-mochi/stackchan-mcp
 Project-URL: Repository, https://github.com/kisaragi-mochi/stackchan-mcp
@@ -27,6 +27,14 @@ Requires-Dist: mcp>=1.0
 Requires-Dist: pydantic>=2
 Requires-Dist: python-dotenv
 Requires-Dist: websockets>=12
+Provides-Extra: stt
+Requires-Dist: opuslib>=3; extra == 'stt'
+Provides-Extra: stt-faster-whisper
+Requires-Dist: faster-whisper>=1.0; extra == 'stt-faster-whisper'
+Requires-Dist: opuslib>=3; extra == 'stt-faster-whisper'
+Provides-Extra: stt-openai
+Requires-Dist: openai>=1.0; extra == 'stt-openai'
+Requires-Dist: opuslib>=3; extra == 'stt-openai'
 Provides-Extra: tts
 Requires-Dist: httpx>=0.27; extra == 'tts'
 Requires-Dist: opuslib>=3; extra == 'tts'

{stackchan_mcp-0.5.0 → stackchan_mcp-0.7.0}/pyproject.toml RENAMED Viewed

@@ -1,6 +1,6 @@
 [project]
 name = "stackchan-mcp"
-version = "0.5.0"
+version = "0.7.0"
 description = "Two-faced MCP gateway for StackChan (xiaozhi-esp32): bridges stdio MCP clients to the ESP32 over WebSocket + HTTP."
 readme = "README.md"
 requires-python = ">=3.10"
@@ -48,6 +48,24 @@ tts-voicevox = [
     "stackchan-mcp[tts]",
 ]
+# Phase 4 STT — see Issue #91.
+# The base `stt` extra carries `opuslib` for decoding the device's
+# inbound Opus frames. Concrete engines live behind their own extras
+# so users only pull in the heavy ML dependencies they actually need.
+#   * faster-whisper — local Whisper via CTranslate2 (default, MIT)
+#   * openai         — OpenAI Whisper API client (cloud)
+stt = [
+    "opuslib>=3",
+]
+stt-faster-whisper = [
+    "stackchan-mcp[stt]",
+    "faster-whisper>=1.0",
+]
+stt-openai = [
+    "stackchan-mcp[stt]",
+    "openai>=1.0",
+]
 [project.urls]
 Homepage = "https://github.com/kisaragi-mochi/stackchan-mcp"
 Repository = "https://github.com/kisaragi-mochi/stackchan-mcp"

stackchan_mcp-0.7.0/stackchan_mcp/audio_stream.py ADDED Viewed

@@ -0,0 +1,151 @@
+"""Opus audio frame handling for the gateway <-> device link.
+Outbound (TTS) frames are produced by
+:mod:`stackchan_mcp.tts.audio_utils` and pushed here to the connected
+ESP32 via :meth:`stackchan_mcp.esp32_client.ESP32Manager.send_audio_frame`.
+The inbound side (STT pipeline, Phase 4 / Issue #91) is now wired:
+binary frames coming up from the device land in
+:func:`handle_audio_frame`, which buffers them into a module-level
+recording slot when one is active. The
+:mod:`stackchan_mcp.stt.orchestrator` opens the slot via
+:func:`start_recording` before sending ``listen.start`` to the device
+and closes it via :func:`stop_recording` after the capture window;
+outside an active recording, inbound frames are still discarded.
+The recording slot is intentionally a module-level singleton: the
+device's :class:`stackchan_mcp.esp32_client.ESP32Manager` only manages
+one connection, and the STT orchestrator serialises ``listen()`` calls
+through :attr:`ESP32Manager.listen_lock`, so concurrent captures
+cannot race the buffer. If multi-device support lands later, this
+should move onto the connection object.
+"""
+from __future__ import annotations
+import logging
+from typing import TYPE_CHECKING, Iterable
+if TYPE_CHECKING:
+    from .esp32_client import ESP32Manager
+logger = logging.getLogger(__name__)
+# --- Recording slot (inbound STT capture) ---------------------------------
+#
+# A single capture at a time is enforced by the orchestrator's
+# ``listen_lock``; this module only owns the buffer itself.
+_recording_session_id: str | None = None
+_recording_frames: list[bytes] = []
+def start_recording(session_id: str) -> None:
+    """Open a fresh recording slot for ``session_id``.
+    Any frames already buffered are discarded so a previous call that
+    crashed before ``stop_recording`` cannot leak into the next
+    capture. The orchestrator wraps start/stop in a try/finally to
+    guarantee the slot is closed even on error.
+    """
+    global _recording_session_id, _recording_frames
+    if _recording_session_id is not None:
+        # Defensive: the lock should prevent this, but if it ever
+        # fires we leak no audio — just log loudly so the regression
+        # is visible.
+        logger.warning(
+            "start_recording called while session=%s was still active; "
+            "dropping %d buffered frames",
+            _recording_session_id,
+            len(_recording_frames),
+        )
+    _recording_session_id = session_id
+    _recording_frames = []
+def stop_recording() -> list[bytes]:
+    """Close the recording slot and return the buffered Opus frames.
+    Returns an empty list if no recording was active. The slot is
+    cleared whether or not frames were captured so the next call to
+    :func:`start_recording` starts clean.
+    """
+    global _recording_session_id, _recording_frames
+    frames = _recording_frames
+    _recording_session_id = None
+    _recording_frames = []
+    return frames
+def is_recording() -> bool:
+    """Return ``True`` when a recording slot is currently open."""
+    return _recording_session_id is not None
+async def handle_audio_frame(data: bytes, session_id: str) -> None:
+    """Process an incoming binary Opus frame from the device.
+    When a recording slot is active (see :func:`start_recording`) AND
+    the frame belongs to the recording's session, appends the frame
+    to the in-memory buffer for later decoding by the STT
+    orchestrator. Frames from a different session — typical during
+    a connection swap, where the old WebSocket handler is still
+    draining incoming bytes after :meth:`ESP32Connection.disconnect`
+    has been called on the main task — are dropped so they cannot
+    bleed into the new connection's capture buffer.
+    Outside of an active recording the frame is logged at debug
+    level and discarded; the device may emit audio on its own (e.g.
+    after an autonomous wake-word detection) and the gateway has no
+    STT pipeline running for those frames yet.
+    """
+    if _recording_session_id is None:
+        logger.debug(
+            "audio_frame session=%s bytes=%d (discarded — no active recording)",
+            session_id,
+            len(data),
+        )
+        return
+    if _recording_session_id != session_id:
+        # A different connection is sending audio while a recording
+        # for this session is in flight. This happens when ESP32
+        # reconnects: ``ESP32Manager._handler`` swaps in a new
+        # ``ESP32Connection`` and marks the old one disconnected,
+        # but the old socket's ``async for message in ws`` loop can
+        # still drain a frame or two before the close lands. Letting
+        # those into the buffer would corrupt the new session's
+        # transcription, so drop them here.
+        logger.debug(
+            "audio_frame session=%s bytes=%d (discarded — does not match "
+            "recording session=%s)",
+            session_id,
+            len(data),
+            _recording_session_id,
+        )
+        return
+    _recording_frames.append(data)
+    logger.debug(
+        "audio_frame session=%s bytes=%d buffered (recording active)",
+        session_id,
+        len(data),
+    )
+async def push_opus_frames(
+    esp32: ESP32Manager,
+    frames: Iterable[bytes],
+) -> int:
+    """Push Opus frames to the connected ESP32.
+    Returns the number of frames sent so the caller can report this to
+    the MCP client. Raises :class:`ConnectionError` (via
+    :meth:`ESP32Manager.send_audio_frame`) if the device disconnects
+    mid-stream — the orchestrator turns that into a clean MCP error
+    rather than letting it bubble up as a stack trace.
+    """
+    sent = 0
+    for frame in frames:
+        await esp32.send_audio_frame(frame)
+        sent += 1
+    return sent

{stackchan_mcp-0.5.0 → stackchan_mcp-0.7.0}/stackchan_mcp/esp32_client.py RENAMED Viewed

@@ -17,6 +17,7 @@ import websockets
 import websockets.exceptions
 from websockets.asyncio.server import ServerConnection
+from .audio_stream import handle_audio_frame
 from .protocol import HelloResponse, make_mcp_message, parse_jsonrpc_response
 logger = logging.getLogger(__name__)
@@ -206,6 +207,33 @@ class ESP32Connection:
         }
         await self._ws_send(json.dumps(message))
+    async def send_listen_state(self, state: str, mode: str = "manual") -> None:
+        """Send a listen state notification (``start`` / ``stop``).
+        Server-driven counterpart to the device's existing
+        :func:`Protocol::SendStartListening` (Issue #91). The
+        firmware's :func:`Application::OnIncomingJson` dispatches
+        ``state: "start"`` to :func:`Application::StartListening` and
+        ``state: "stop"`` to :func:`Application::StopListening`.
+        ``mode`` is currently accepted only for ``state="start"`` and is
+        carried on the wire for forward-compatibility — the firmware
+        accepts but ignores it in Phase 1 because
+        :func:`HandleStartListeningEvent` unconditionally enters
+        ``kListeningModeManualStop`` (the gateway controls the stop
+        boundary explicitly).
+        """
+        if not self._connected:
+            raise ConnectionError("ESP32 not connected")
+        message: dict[str, Any] = {
+            "session_id": self.session_id,
+            "type": "listen",
+            "state": state,
+        }
+        if state == "start":
+            message["mode"] = mode
+        await self._ws_send(json.dumps(message))
     def disconnect(self) -> None:
         """Mark connection as disconnected."""
         self._connected = False
@@ -242,6 +270,21 @@ class ESP32Manager:
         # if multi-device support lands later, the lock should move
         # onto :class:`ESP32Connection` instead.
         self._tts_lock = asyncio.Lock()
+        # Inbound STT capture (Issue #91) shares the TTS lock rather
+        # than running on a separate one. The firmware's
+        # ``HandleStartListeningEvent`` aborts any in-flight TTS when
+        # a listen.start arrives mid-speaking (state ==
+        # ``kDeviceStateSpeaking`` → ``AbortSpeaking`` →
+        # ``SetListeningMode(kListeningModeManualStop)``), so two
+        # operations on the same device's audio path would
+        # otherwise step on each other: a ``listen()`` could yank a
+        # ``say()`` out of speaking mid-utterance, or a ``say()``
+        # could start streaming TTS frames into the buffer a
+        # concurrent ``listen()`` is capturing. Treating the audio
+        # path as a single resource makes the device's state machine
+        # observable from gateway code; if a full-duplex contract
+        # ever lands later the lock can split again.
+        self._listen_lock = self._tts_lock
     @property
     def device_connected(self) -> bool:
@@ -260,6 +303,17 @@ class ESP32Manager:
         """
         return self._tts_lock
+    @property
+    def listen_lock(self) -> asyncio.Lock:
+        """Per-device lock guarding the STT capture sequence.
+        See :attr:`_listen_lock` for the rationale; the orchestrator
+        wraps the entire ``listen.start`` → wait → ``listen.stop``
+        block in ``async with`` on this lock so two concurrent
+        ``listen()`` calls cannot share the inbound recording slot.
+        """
+        return self._listen_lock
     async def start(
         self,
         host: str = "0.0.0.0",
@@ -330,7 +384,14 @@ class ESP32Manager:
         try:
             async for message in ws:
                 if isinstance(message, bytes):
-                    # Binary = audio frame, ignore for now
+                    # Binary = audio frame. Forward to the audio_stream
+                    # module which buffers it for STT capture (Issue
+                    # #91) when a recording slot is open, or discards
+                    # it otherwise. Only protocol v1 is supported on
+                    # the inbound side today; the orchestrator gates
+                    # listen() on protocol_version=1 so v2/v3 frames
+                    # cannot reach this point with recording active.
+                    await handle_audio_frame(message, session_id)
                     continue
                 try:
@@ -451,6 +512,17 @@ class ESP32Manager:
             raise ConnectionError("No ESP32 device connected")
         await self._connection.send_tts_state(state)
+    async def send_listen_state(self, state: str, mode: str = "manual") -> None:
+        """Send a listen state notification to put the device into /
+        out of listening mode (Issue #91).
+        See :meth:`ESP32Connection.send_listen_state` for the wire
+        format and the firmware-side dispatch.
+        """
+        if not self._connection or not self._connection.connected:
+            raise ConnectionError("No ESP32 device connected")
+        await self._connection.send_listen_state(state, mode=mode)
     def get_status(self) -> dict[str, Any]:
         """Get current connection status."""
         if not self._connection or not self._connection.connected:

{stackchan_mcp-0.5.0 → stackchan_mcp-0.7.0}/stackchan_mcp/stdio_server.py RENAMED Viewed

@@ -15,6 +15,7 @@ from mcp.server.stdio import stdio_server
 from mcp.types import TextContent, Tool
 from .gateway import get_gateway
+from .stt import listen_and_transcribe
 from .tts import synthesize_and_send
 logger = logging.getLogger(__name__)
@@ -102,8 +103,14 @@ def create_server() -> Server:
             Tool(
                 name="move_head",
                 description=(
-                    "Move the robot's head to the specified angles. "
-                    "yaw: horizontal (-90 to 90), pitch: vertical (-30 to 30)."
+                    "Move the robot's head to safe, recommended angles. "
+                    "yaw: horizontal (-90 to 90), pitch: vertical (5 to 85, "
+                    "the M5Stack-recommended operating range). Out-of-range "
+                    "requests are rejected at this MCP layer; for advanced "
+                    "callers that need the firmware hard clamp (pitch 0..88), "
+                    "use the firmware-side `set_head_angles` device tool, "
+                    "which exposes a permissive schema and the authoritative "
+                    "two-tier guard described in the README."
                 ),
                 inputSchema={
                     "type": "object",
@@ -111,10 +118,19 @@ def create_server() -> Server:
                         "yaw": {
                             "type": "integer",
                             "description": "Horizontal angle in degrees (-90 to 90)",
+                            "minimum": -90,
+                            "maximum": 90,
                         },
                         "pitch": {
                             "type": "integer",
-                            "description": "Vertical angle in degrees (-30 to 30)",
+                            "description": (
+                                "Vertical angle in degrees (5 to 85, "
+                                "M5Stack-recommended operating range). For the "
+                                "wider firmware hard clamp (0..88), use the "
+                                "`set_head_angles` device tool instead."
+                            ),
+                            "minimum": 5,
+                            "maximum": 85,
                         },
                     },
                     "required": ["yaw", "pitch"],
@@ -408,6 +424,91 @@ def create_server() -> Server:
                     "required": ["text"],
                 },
             ),
+            Tool(
+                name="listen",
+                description=(
+                    "Capture a short utterance from the device microphone and "
+                    "transcribe it via a gateway-side STT engine (Phase 4, "
+                    "Issue #91). The gateway sends a 'listen' notification "
+                    "over the existing WebSocket to put the device firmware "
+                    "into listening mode, buffers the Opus frames the device "
+                    "streams up during the capture window, then decodes and "
+                    "transcribes them once the window closes. Requires a "
+                    "minimal firmware change to handle the inbound 'listen' "
+                    "wire type (paired with this gateway release). Engine is "
+                    "selectable via 'engine' (default 'faster-whisper', local). "
+                    "Optional 'motion' feedback can switch the avatar to "
+                    "'thinking' during capture ('face-only') or tilt the head "
+                    "up while preserving yaw ('look-up'). "
+                    "Install the relevant extra "
+                    "('pip install stackchan-mcp[stt-faster-whisper]' or "
+                    "'stt-openai'); calling this tool before an engine is "
+                    "registered returns a clear error."
+                ),
+                inputSchema={
+                    "type": "object",
+                    "properties": {
+                        "duration_ms": {
+                            "type": "integer",
+                            "description": (
+                                "Capture window in milliseconds. Clamped to "
+                                "[100, 30000]."
+                            ),
+                            "default": 5000,
+                            "minimum": 100,
+                            "maximum": 30000,
+                        },
+                        "engine": {
+                            "type": "string",
+                            "description": (
+                                "Engine identifier (e.g. 'faster-whisper', "
+                                "'openai-whisper'). Default 'faster-whisper'."
+                            ),
+                            "default": "faster-whisper",
+                        },
+                        "language": {
+                            "type": "string",
+                            "description": (
+                                "ISO 639-1 language code (e.g. 'ja'). Pass "
+                                "an empty string or omit for autodetect."
+                            ),
+                            "default": "ja",
+                        },
+                        "model": {
+                            "type": "string",
+                            "description": (
+                                "Engine-specific model identifier (e.g. "
+                                "'base' / 'small' / 'medium' for faster-"
+                                "whisper, 'whisper-1' for OpenAI). Engines "
+                                "fall back to their default when omitted."
+                            ),
+                        },
+                        "motion": {
+                            "type": "string",
+                            "enum": ["none", "face-only", "look-up"],
+                            "description": (
+                                "Optional visible feedback during capture. "
+                                "'none' preserves the previous behaviour. "
+                                "'face-only' shows the thinking avatar during "
+                                "capture and restores idle at the end. "
+                                "'look-up' preserves yaw, tilts pitch to "
+                                "look_up_pitch, and holds the pose on success."
+                            ),
+                            "default": "none",
+                        },
+                        "look_up_pitch": {
+                            "type": "number",
+                            "description": (
+                                "Pitch angle for motion='look-up'. Must be "
+                                "between 5 and 85 degrees."
+                            ),
+                            "default": 50.0,
+                            "minimum": 5,
+                            "maximum": 85,
+                        },
+                    },
+                },
+            ),
         ]
     @server.call_tool()
@@ -439,6 +540,25 @@ def create_server() -> Server:
                 ]
             return [TextContent(type="text", text=json.dumps(result))]
+        if name == "listen":
+            # STT runs on the gateway side. The orchestrator drives the
+            # device's listening state via ``listen.start``/``stop``
+            # notifications, buffers the inbound Opus frames, decodes
+            # them, and hands the PCM blob to the registered engine.
+            # Same error-class discipline as say(): ValueError /
+            # NotImplementedError / RuntimeError all turn into clean
+            # MCP error JSON.
+            try:
+                result = await listen_and_transcribe(arguments, gateway=gw)
+            except (ValueError, NotImplementedError, RuntimeError) as exc:
+                return [
+                    TextContent(
+                        type="text",
+                        text=json.dumps({"error": str(exc)}),
+                    )
+                ]
+            return [TextContent(type="text", text=json.dumps(result))]
         if not gw.esp32.device_connected:
             return [
                 TextContent(
@@ -447,6 +567,59 @@ def create_server() -> Server:
                 )
             ]
+        if name == "move_head":
+            # Belt-and-suspenders validation for the recommended pitch range.
+            # The Tool inputSchema already declares minimum/maximum for both
+            # yaw and pitch, but mcp Python SDK server-side enforcement of
+            # JSON Schema bounds is not guaranteed across versions and
+            # clients. Reject out-of-recommended values here as a clean
+            # MCP error JSON before any motion command reaches the device.
+            # Callers that genuinely need the firmware hard clamp 0..88
+            # should use the firmware-side `set_head_angles` device tool,
+            # which exposes the authoritative two-tier guard described in
+            # the README "Y-axis (pitch) safe range" section.
+            yaw_val = arguments.get("yaw")
+            pitch_val = arguments.get("pitch")
+            if (
+                not isinstance(yaw_val, int)
+                or isinstance(yaw_val, bool)
+                or not (-90 <= yaw_val <= 90)
+            ):
+                return [
+                    TextContent(
+                        type="text",
+                        text=json.dumps(
+                            {
+                                "error": (
+                                    "yaw must be an integer in -90..90 "
+                                    f"(got {yaw_val!r})"
+                                )
+                            }
+                        ),
+                    )
+                ]
+            if (
+                not isinstance(pitch_val, int)
+                or isinstance(pitch_val, bool)
+                or not (5 <= pitch_val <= 85)
+            ):
+                return [
+                    TextContent(
+                        type="text",
+                        text=json.dumps(
+                            {
+                                "error": (
+                                    "pitch must be an integer in 5..85 "
+                                    "(M5Stack-recommended operating range; "
+                                    "for the wider firmware hard clamp "
+                                    "0..88 use `set_head_angles`). got "
+                                    f"{pitch_val!r}"
+                                )
+                            }
+                        ),
+                    )
+                ]
         # Map MCP client tool names to ESP32 MCP tool names (self.* prefix)
         tool_map: dict[str, tuple[str, dict[str, Any]]] = {
             "get_device_info": (

stackchan_mcp-0.7.0/stackchan_mcp/stt/__init__.py ADDED Viewed

@@ -0,0 +1,62 @@
+"""STT framework for Phase 4 (Issue #91).
+Companion to :mod:`stackchan_mcp.tts`: this package provides the
+engine-agnostic skeleton for the gateway-side ``listen(duration_ms)``
+MCP tool plus the concrete faster-whisper (default, local) and
+OpenAI Whisper API engines.
+Engines whose modules require optional extras to import are registered
+behind ``try / except ImportError`` so the framework still works when
+the corresponding extra is missing.
+"""
+from __future__ import annotations
+import logging
+from typing import Callable
+from .base import EngineRegistry, STTEngine, get_registry
+from .orchestrator import DEFAULT_ENGINE, listen_and_transcribe
+_logger = logging.getLogger(__name__)
+def _try_register(register_fn: Callable[[], None], engine_label: str) -> None:
+    """Run ``register_fn`` and swallow ImportErrors.
+    Used so an engine whose top-level module needs an optional extra
+    (e.g. faster-whisper / openai) can fail to register cleanly without
+    breaking the rest of the framework. Engine modules themselves
+    import cleanly; their heavy dependencies are imported lazily inside
+    :meth:`STTEngine.transcribe` so this layer just lights up the
+    registry slot.
+    """
+    try:
+        register_fn()
+    except ImportError as exc:
+        _logger.debug("Skipping %s engine registration: %s", engine_label, exc)
+def _register_faster_whisper() -> None:
+    from .faster_whisper import FasterWhisperEngine
+    get_registry().register(FasterWhisperEngine())
+def _register_openai_whisper() -> None:
+    from .openai_whisper import OpenAIWhisperEngine
+    get_registry().register(OpenAIWhisperEngine())
+_try_register(_register_faster_whisper, "faster-whisper")
+_try_register(_register_openai_whisper, "openai-whisper")
+__all__ = [
+    "DEFAULT_ENGINE",
+    "EngineRegistry",
+    "STTEngine",
+    "get_registry",
+    "listen_and_transcribe",
+]

stackchan-mcp 0.5.0__tar.gz → 0.7.0__tar.gz

stackchan-mcp 0.5.0__tar.gz → 0.7.0__tar.gz