stackchan-mcp 0.9.1__py3-none-win_amd64.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- stackchan_mcp/__init__.py +81 -0
- stackchan_mcp/__main__.py +12 -0
- stackchan_mcp/_libs/SOURCES.md +130 -0
- stackchan_mcp/_libs/opus.dll +0 -0
- stackchan_mcp/audio_input_hook.py +432 -0
- stackchan_mcp/audio_stream.py +162 -0
- stackchan_mcp/capture_server.py +469 -0
- stackchan_mcp/cli.py +958 -0
- stackchan_mcp/esp32_client.py +983 -0
- stackchan_mcp/event_log.py +189 -0
- stackchan_mcp/gateway.py +274 -0
- stackchan_mcp/handlers/__init__.py +7 -0
- stackchan_mcp/handlers/audio.py +21 -0
- stackchan_mcp/handlers/camera.py +25 -0
- stackchan_mcp/handlers/robot.py +52 -0
- stackchan_mcp/http_server.py +398 -0
- stackchan_mcp/mcp_router.py +126 -0
- stackchan_mcp/mdns_advertiser.py +347 -0
- stackchan_mcp/notify.example.yml +21 -0
- stackchan_mcp/notify_config.py +235 -0
- stackchan_mcp/ownership.py +270 -0
- stackchan_mcp/protocol.py +95 -0
- stackchan_mcp/queue.py +191 -0
- stackchan_mcp/server.py +28 -0
- stackchan_mcp/stdio_server.py +1365 -0
- stackchan_mcp/stt/__init__.py +62 -0
- stackchan_mcp/stt/audio_utils.py +102 -0
- stackchan_mcp/stt/base.py +94 -0
- stackchan_mcp/stt/faster_whisper.py +217 -0
- stackchan_mcp/stt/openai_whisper.py +177 -0
- stackchan_mcp/stt/orchestrator.py +568 -0
- stackchan_mcp/tools.py +82 -0
- stackchan_mcp/tts/__init__.py +62 -0
- stackchan_mcp/tts/audio_utils.py +177 -0
- stackchan_mcp/tts/base.py +86 -0
- stackchan_mcp/tts/orchestrator.py +688 -0
- stackchan_mcp/tts/voicevox.py +184 -0
- stackchan_mcp-0.9.1.dist-info/METADATA +324 -0
- stackchan_mcp-0.9.1.dist-info/RECORD +43 -0
- stackchan_mcp-0.9.1.dist-info/WHEEL +5 -0
- stackchan_mcp-0.9.1.dist-info/entry_points.txt +2 -0
- stackchan_mcp-0.9.1.dist-info/licenses/LICENSE +39 -0
- stackchan_mcp-0.9.1.dist-info/licenses/LICENSE-THIRD-PARTY +65 -0
|
@@ -0,0 +1,162 @@
|
|
|
1
|
+
"""Opus audio frame handling for the gateway <-> device link.
|
|
2
|
+
|
|
3
|
+
Outbound (TTS) frames are produced by
|
|
4
|
+
:mod:`stackchan_mcp.tts.audio_utils` and pushed here to the connected
|
|
5
|
+
ESP32 via :meth:`stackchan_mcp.esp32_client.ESP32Manager.send_audio_frame`.
|
|
6
|
+
|
|
7
|
+
The inbound side (STT pipeline, Phase 4 / Issue #91) is now wired:
|
|
8
|
+
binary frames coming up from the device land in
|
|
9
|
+
:func:`handle_audio_frame`, which buffers them into a module-level
|
|
10
|
+
recording slot when one is active. The
|
|
11
|
+
:mod:`stackchan_mcp.stt.orchestrator` opens the slot via
|
|
12
|
+
:func:`start_recording` before sending ``listen.start`` to the device
|
|
13
|
+
and closes it via :func:`stop_recording` after the capture window;
|
|
14
|
+
outside an active recording, inbound frames are still discarded.
|
|
15
|
+
|
|
16
|
+
The recording slot is intentionally a module-level singleton: the
|
|
17
|
+
device's :class:`stackchan_mcp.esp32_client.ESP32Manager` only manages
|
|
18
|
+
one connection, and the STT orchestrator serialises ``listen()`` calls
|
|
19
|
+
through :attr:`ESP32Manager.listen_lock`, so concurrent captures
|
|
20
|
+
cannot race the buffer. If multi-device support lands later, this
|
|
21
|
+
should move onto the connection object.
|
|
22
|
+
"""
|
|
23
|
+
|
|
24
|
+
from __future__ import annotations
|
|
25
|
+
|
|
26
|
+
import logging
|
|
27
|
+
from typing import TYPE_CHECKING, Iterable
|
|
28
|
+
|
|
29
|
+
if TYPE_CHECKING:
|
|
30
|
+
from .esp32_client import ESP32Manager
|
|
31
|
+
|
|
32
|
+
logger = logging.getLogger(__name__)
|
|
33
|
+
|
|
34
|
+
|
|
35
|
+
# --- Recording slot (inbound STT capture) ---------------------------------
#
# A single capture at a time is enforced by the orchestrator's
# ``listen_lock``; this module only owns the buffer itself.

# Session id that owns the open slot. ``None`` means no recording is
# active; ``is_recording`` tests exactly this.
_recording_session_id: str | None = None
# Opus frames appended by ``handle_audio_frame`` while the slot is open.
# ``start_recording`` and ``stop_recording`` rebind this to a new list.
_recording_frames: list[bytes] = []
|
|
42
|
+
|
|
43
|
+
|
|
44
|
+
def start_recording(session_id: str) -> None:
    """Claim the module-level recording slot for ``session_id``.

    The frame buffer is always replaced with a new empty list, so frames
    left behind by a capture that never reached ``stop_recording`` are
    not carried into this one. If a slot was still open, a warning that
    names the previous session and the number of dropped frames is logged
    before the slot is taken over.
    """
    global _recording_session_id, _recording_frames
    previous_session = _recording_session_id
    if previous_session is not None:
        # The orchestrator's lock is meant to make this unreachable.
        # Nothing from the old capture is kept; the warning exists only
        # so that a regression in that locking is noticed.
        logger.warning(
            "start_recording called while session=%s was still active; "
            "dropping %d buffered frames",
            previous_session,
            len(_recording_frames),
        )
    _recording_session_id, _recording_frames = session_id, []
|
|
65
|
+
|
|
66
|
+
|
|
67
|
+
def stop_recording() -> list[bytes]:
    """Release the recording slot and hand back whatever was buffered.

    The returned list is the buffer object itself (empty when nothing was
    recorded or no slot was open). The module state is reset
    unconditionally, so a following :func:`start_recording` begins from a
    clean slot.
    """
    global _recording_session_id, _recording_frames
    # Swap the buffer out and a fresh list in with a single rebinding.
    captured, _recording_frames = _recording_frames, []
    _recording_session_id = None
    return captured
|
|
79
|
+
|
|
80
|
+
|
|
81
|
+
def is_recording() -> bool:
    """Report whether some session currently holds the recording slot."""
    slot_owner = _recording_session_id
    return slot_owner is not None
|
|
84
|
+
|
|
85
|
+
|
|
86
|
+
def is_recording_session(session_id: str) -> bool:
    """Report whether ``session_id`` is the owner of the recording slot.

    Disconnect-cleanup code for a given session calls this to check that
    the slot is still its own before closing it. A handler left over from
    a replaced connection (or superseded by an MCP-driven ``listen()``)
    gets ``False`` and must leave the active buffer alone.
    """
    slot_owner = _recording_session_id
    return slot_owner == session_id
|
|
95
|
+
|
|
96
|
+
|
|
97
|
+
async def handle_audio_frame(data: bytes, session_id: str) -> None:
    """Route one inbound binary Opus frame from the device.

    Exactly one of three things happens, each logged at debug level:

    * No recording slot is open: the frame is discarded. The device may
      send audio on its own (for example after an autonomous wake-word
      detection) while no capture is running for it.
    * A slot is open but belongs to another session: the frame is
      discarded. This is the connection-swap case, where the previous
      WebSocket handler is still draining bytes after the new connection
      took over; keeping such frames would contaminate the new capture.
    * The slot belongs to ``session_id``: the frame is appended to the
      in-memory buffer for the STT orchestrator to decode later.
    """
    slot_owner = _recording_session_id
    if slot_owner is None:
        logger.debug(
            "audio_frame session=%s bytes=%d (discarded — no active recording)",
            session_id,
            len(data),
        )
    elif slot_owner != session_id:
        # Late frame from a connection that has already been replaced.
        logger.debug(
            "audio_frame session=%s bytes=%d (discarded — does not match "
            "recording session=%s)",
            session_id,
            len(data),
            slot_owner,
        )
    else:
        _recording_frames.append(data)
        logger.debug(
            "audio_frame session=%s bytes=%d buffered (recording active)",
            session_id,
            len(data),
        )
|
|
144
|
+
|
|
145
|
+
|
|
146
|
+
async def push_opus_frames(
    esp32: ESP32Manager,
    frames: Iterable[bytes],
) -> int:
    """Send each Opus frame to the connected ESP32, in order.

    Frames are awaited one at a time through
    :meth:`ESP32Manager.send_audio_frame`. The return value is how many
    frames were sent, for the caller to report to the MCP client. Any
    exception raised by ``send_audio_frame`` (documented upstream as
    :class:`ConnectionError` when the device drops mid-stream) propagates
    unchanged; the orchestrator is responsible for turning it into a
    clean MCP error.
    """
    delivered = 0
    for delivered, opus_frame in enumerate(frames, start=1):
        await esp32.send_audio_frame(opus_frame)
    return delivered
|
|
@@ -0,0 +1,469 @@
|
|
|
1
|
+
"""HTTP capture server for receiving photos from ESP32 and PCM from external producers.
|
|
2
|
+
|
|
3
|
+
Two POST endpoints share this server:
|
|
4
|
+
|
|
5
|
+
- ``POST /capture``: ESP32's camera.Explain() uploads JPEG photos as
|
|
6
|
+
multipart/form-data (fields: ``question`` text + ``file`` JPEG).
|
|
7
|
+
Authenticated via ``CAPTURE_TOKEN_KEY`` (the gateway's ``vision_token``).
|
|
8
|
+
The server saves the JPEG to ``~/.stackchan/captures/`` and returns the
|
|
9
|
+
file path so the MCP client can view the image via the Read tool.
|
|
10
|
+
|
|
11
|
+
- ``POST /pcm``: External producers (the SAIVerse voice-tts addon, etc.)
|
|
12
|
+
upload PCM audio for the device's speaker as a streaming body
|
|
13
|
+
(Content-Type: application/octet-stream, Transfer-Encoding: chunked).
|
|
14
|
+
Authenticated separately via ``PCM_TOKEN_KEY``. The request body is
|
|
15
|
+
fed directly into :func:`stackchan_mcp.tts.send_pcm_stream` so the
|
|
16
|
+
audio reaches the device with low latency, without buffering the
|
|
17
|
+
whole utterance.
|
|
18
|
+
|
|
19
|
+
The PCM endpoint is the entry point of the gateway's "external PCM
|
|
20
|
+
input" path — the receiving counterpart of the stdio ``say()`` MCP tool.
|
|
21
|
+
``say()`` synthesises audio with a registered TTS engine inside the
|
|
22
|
+
gateway; ``POST /pcm`` lets external producers (which already did the
|
|
23
|
+
synthesis themselves, e.g. with a voice-cloning model the gateway does
|
|
24
|
+
not host) push the finished PCM through the same back-half pipeline
|
|
25
|
+
(:func:`send_pcm_stream`).
|
|
26
|
+
|
|
27
|
+
Required PCM request headers:
|
|
28
|
+
|
|
29
|
+
- ``Authorization: Bearer <PCM_TOKEN>`` — token comparison against
|
|
30
|
+
``PCM_TOKEN_KEY`` (gateway's ``pcm_token`` property)
|
|
31
|
+
- ``X-Sample-Rate: <int>`` — sample rate of the source PCM (e.g. 32000).
|
|
32
|
+
The gateway resamples to the device's 16 kHz before Opus encoding.
|
|
33
|
+
- ``X-Channels: 1`` (optional, defaults to 1) — only mono is supported
|
|
34
|
+
for now (the device decoder is configured for mono).
|
|
35
|
+
- ``X-Message-Id: <str>`` (optional) — opaque identifier echoed back in
|
|
36
|
+
the log line so the producer can correlate uploads with downstream
|
|
37
|
+
device state.
|
|
38
|
+
|
|
39
|
+
The handler stores the active :class:`Gateway` instance in the
|
|
40
|
+
application's ``GATEWAY_KEY`` so it can dispatch to ``send_pcm_stream``
|
|
41
|
+
without coupling :mod:`capture_server` to the gateway module at import
|
|
42
|
+
time (lazy import inside the handler keeps the optional ``[tts]``
|
|
43
|
+
extra unnecessary for capture-only deployments).
|
|
44
|
+
"""
|
|
45
|
+
|
|
46
|
+
from __future__ import annotations
|
|
47
|
+
|
|
48
|
+
import asyncio
|
|
49
|
+
import hashlib
|
|
50
|
+
import json
|
|
51
|
+
import logging
|
|
52
|
+
import os
|
|
53
|
+
import secrets
|
|
54
|
+
import time
|
|
55
|
+
from dataclasses import dataclass
|
|
56
|
+
from typing import TYPE_CHECKING, AsyncIterator
|
|
57
|
+
|
|
58
|
+
from aiohttp import web
|
|
59
|
+
|
|
60
|
+
if TYPE_CHECKING:
|
|
61
|
+
from .gateway import Gateway
|
|
62
|
+
|
|
63
|
+
logger = logging.getLogger(__name__)
|
|
64
|
+
|
|
65
|
+
# Directory where uploaded JPEG captures are written.
CAPTURE_DIR = os.path.expanduser("~/.stackchan/captures")
# Application keys under which the two bearer tokens and the gateway
# instance are stored by ``create_capture_app``.
CAPTURE_TOKEN_KEY = web.AppKey("capture_token", str)
PCM_TOKEN_KEY = web.AppKey("pcm_token", str)
GATEWAY_KEY: web.AppKey = web.AppKey("gateway", object)

# Phase 4.5 avatar (saiverse-stackchan-addon): in-memory staging for
# one-time avatar set downloads. See docs/intent/stackchan_avatar_pipeline.md
# §C-2 in the SAIVerse repository.
# The dict maps a staging short_id to its entry; the lock guards that dict.
AVATAR_SETS_KEY = web.AppKey("avatar_sets", dict)
AVATAR_SETS_LOCK_KEY = web.AppKey("avatar_sets_lock", asyncio.Lock)

# A staging entry is GC'd if it hasn't been fetched within this window
# (seconds, compared against ``time.time()`` differences).
AVATAR_SET_STAGING_TTL_SEC = 120.0
|
|
78
|
+
|
|
79
|
+
|
|
80
|
+
@dataclass(frozen=True)
class _AvatarStaging:
    """Immutable record for one staged avatar set awaiting its one-time fetch."""

    # Bearer token the fetch request must present for this entry.
    token: str
    # Avatar mode; ``stage_avatar_set`` accepts "layered" or "matrix".
    mode: str
    # Raw bytes served as the response body of the fetch.
    payload: bytes
    # "sha256:" followed by the hex SHA-256 digest of ``payload``.
    sha256: str
    # ``time.time()`` value taken when the entry was staged (TTL reference).
    created_at: float
|
|
87
|
+
|
|
88
|
+
|
|
89
|
+
# Per-route upload cap for the JPEG capture endpoint. The PCM endpoint
# intentionally streams arbitrarily long payloads (multi-minute TTS),
# so the application-wide ``client_max_size`` is disabled and each
# route enforces its own limit. JPEG captures from the ESP32 camera
# top out around 200 KB at full resolution; 8 MiB is generous headroom
# against a misbehaving / malicious uploader without inviting unbounded
# disk consumption on the gateway host.
# ``handle_capture`` checks this against both the advertised
# Content-Length and the number of bytes actually streamed.
CAPTURE_MAX_BYTES = 8 * 1024 * 1024
|
|
97
|
+
|
|
98
|
+
|
|
99
|
+
def _is_authorized(auth_header: str, expected_token: str) -> bool:
    """Return whether ``auth_header`` is exactly ``Bearer <expected_token>``.

    The comparison uses :func:`secrets.compare_digest` so that the time
    taken does not reveal how many leading characters of a guessed token
    were correct, which a plain ``==`` on strings can do.

    Args:
        auth_header: Raw value of the request's ``Authorization`` header
            (empty string when the header is absent).
        expected_token: Token the server was configured with.

    Returns:
        ``True`` only on an exact, case-sensitive match.
    """
    expected_header = f"Bearer {expected_token}"
    # compare_digest rejects non-ASCII ``str`` arguments, so compare the
    # encoded bytes instead. ``surrogatepass`` makes every str encodable,
    # including header values that contain lone surrogates.
    return secrets.compare_digest(
        auth_header.encode("utf-8", "surrogatepass"),
        expected_header.encode("utf-8", "surrogatepass"),
    )
|
|
102
|
+
|
|
103
|
+
|
|
104
|
+
async def _read_part_capped(part, limit: int) -> bytes | None:
    """Read one multipart body part, giving up once it exceeds ``limit`` bytes.

    Returns the part's bytes, or ``None`` as soon as more than ``limit``
    bytes have been received (the rest of the part is left unread).
    """
    pieces: list[bytes] = []
    received = 0
    while True:
        chunk = await part.read_chunk(8192)
        if not chunk:
            return b"".join(pieces)
        received += len(chunk)
        if received > limit:
            return None
        pieces.append(chunk)


def _discard_capture_file(path: str) -> None:
    """Remove a capture file that must not be kept (best effort)."""
    try:
        os.remove(path)
    except OSError:
        # Deliberately ignored: the file may never have been created or
        # may already be gone, and cleanup must not mask the real outcome.
        pass


async def handle_capture(request: web.Request) -> web.Response:
    """Handle a multipart photo upload from the ESP32.

    Expects ``multipart/form-data`` with a ``question`` text field and a
    ``file`` field holding the JPEG. The image is streamed to
    ``CAPTURE_DIR`` as ``capture_<epoch-ms>.jpg``.

    Responses (all JSON):
        200: ``image_path``, ``size_bytes`` and ``question``.
        400: no ``file`` part produced an image.
        401: a capture token is configured and the bearer header does not
            match it.
        413: the advertised Content-Length, the streamed file bytes, or
            the ``question`` field exceed ``CAPTURE_MAX_BYTES``.

    No partial image is left on disk when the upload is rejected or when
    reading the body fails part-way (for example on a client disconnect).
    """
    expected_token = request.app[CAPTURE_TOKEN_KEY]
    if expected_token and not _is_authorized(
        request.headers.get("Authorization", ""), expected_token
    ):
        logger.warning("Capture upload auth rejected")
        return web.Response(
            text='{"error": "Unauthorized"}',
            status=401,
            content_type="application/json",
        )

    too_large_text = json.dumps(
        {"error": f"Upload exceeds {CAPTURE_MAX_BYTES} bytes"}
    )

    # Per-route body cap. The application-wide client_max_size is
    # disabled because /pcm streams arbitrary-length audio, so
    # /capture's defense lives here. Reject up front based on the
    # advertised Content-Length when available, and enforce again
    # while streaming so a misadvertised header cannot bypass the cap.
    content_length = request.content_length
    if content_length is not None and content_length > CAPTURE_MAX_BYTES:
        logger.warning(
            "Capture upload rejected: Content-Length %d exceeds %d",
            content_length, CAPTURE_MAX_BYTES,
        )
        return web.Response(
            text=too_large_text,
            status=413,
            content_type="application/json",
        )

    os.makedirs(CAPTURE_DIR, exist_ok=True)

    reader = await request.multipart()
    question = ""
    image_path = ""
    bytes_written = 0

    async for part in reader:
        if part.name == "question":
            # Bounded read: without a Content-Length (chunked upload) an
            # unbounded read of this field would sidestep the cap above.
            raw_question = await _read_part_capped(part, CAPTURE_MAX_BYTES)
            if raw_question is None:
                if image_path:
                    _discard_capture_file(image_path)
                logger.warning(
                    "Capture upload rejected: question field exceeds %d bytes",
                    CAPTURE_MAX_BYTES,
                )
                return web.Response(
                    text=too_large_text,
                    status=413,
                    content_type="application/json",
                )
            # Undecodable bytes become U+FFFD instead of failing the
            # whole upload with an unhandled UnicodeDecodeError.
            question = raw_question.decode("utf-8", errors="replace")
        elif part.name == "file":
            timestamp = int(time.time() * 1000)
            filename = f"capture_{timestamp}.jpg"
            image_path = os.path.join(CAPTURE_DIR, filename)
            over_cap = False
            try:
                with open(image_path, "wb") as f:
                    while True:
                        chunk = await part.read_chunk(8192)
                        if not chunk:
                            break
                        bytes_written += len(chunk)
                        if bytes_written > CAPTURE_MAX_BYTES:
                            over_cap = True
                            break
                        f.write(chunk)
            except BaseException:
                # Reading failed or the handler was cancelled mid-stream.
                # The ``with`` block has already closed the file, so the
                # truncated image can be removed before re-raising.
                _discard_capture_file(image_path)
                raise
            if over_cap:
                # Overran the cap mid-stream — delete the partial file
                # and bail out with 413 so the gateway host disk does
                # not fill up.
                _discard_capture_file(image_path)
                logger.warning(
                    "Capture upload truncated at %d bytes (cap %d)",
                    bytes_written, CAPTURE_MAX_BYTES,
                )
                return web.Response(
                    text=too_large_text,
                    status=413,
                    content_type="application/json",
                )

    if image_path and os.path.exists(image_path):
        file_size = os.path.getsize(image_path)
        logger.info(
            "Captured photo: %s (%d bytes), question: %s",
            image_path,
            file_size,
            question,
        )
        result = json.dumps({
            "image_path": image_path,
            "size_bytes": file_size,
            "question": question,
        })
        return web.Response(text=result, content_type="application/json")

    return web.Response(
        text='{"error": "No image received"}',
        status=400,
        content_type="application/json",
    )
|
|
198
|
+
|
|
199
|
+
|
|
200
|
+
async def stage_avatar_set(
    app: web.Application,
    mode: str,
    payload: bytes,
) -> tuple[str, str, str]:
    """Register ``payload`` as an avatar set available for a single download.

    A random ``short_id`` and bearer ``token`` are generated and the
    payload's SHA-256 is computed; the three are returned as
    ``(short_id, token, sha256)``. The caller passes them to the device
    over WS ``avatar_set_fetch``, and the device then issues
    ``GET /avatar_set/{short_id}`` with ``Authorization: Bearer <token>``.

    Entries older than ``AVATAR_SET_STAGING_TTL_SEC`` are purged from the
    staging dict each time a new one is inserted; an entry is otherwise
    removed by its first successful fetch.

    Raises:
        ValueError: ``mode`` is neither ``"layered"`` nor ``"matrix"``.
    """
    if mode != "layered" and mode != "matrix":
        raise ValueError(f"unknown avatar mode: {mode}")

    short_id = secrets.token_hex(8)
    token = secrets.token_urlsafe(32)
    digest = hashlib.sha256(payload).hexdigest()
    sha256 = f"sha256:{digest}"

    entry = _AvatarStaging(
        token=token,
        mode=mode,
        payload=payload,
        sha256=sha256,
        created_at=time.time(),
    )

    staged = app[AVATAR_SETS_KEY]
    async with app[AVATAR_SETS_LOCK_KEY]:
        # Opportunistic sweep: drop anything past its TTL, then insert.
        now = time.time()
        for staged_id, old_entry in list(staged.items()):
            if now - old_entry.created_at > AVATAR_SET_STAGING_TTL_SEC:
                del staged[staged_id]
        staged[short_id] = entry

    logger.info(
        "Staged avatar set: short_id=%s mode=%s bytes=%d sha256=%s",
        short_id, mode, len(payload), sha256,
    )
    return short_id, token, sha256
|
|
246
|
+
|
|
247
|
+
|
|
248
|
+
async def handle_avatar_set_fetch(request: web.Request) -> web.Response:
    """Serve a staged avatar set exactly once.

    The entry named by the ``short_id`` path segment is looked up, checked
    for expiry and for a matching ``Authorization: Bearer <token>`` header,
    and only then removed from staging and returned.

    Responses:
        200: payload as ``application/octet-stream`` with
            ``X-Avatar-Mode`` and ``X-Avatar-Sha256`` headers.
        400: empty ``short_id``.
        401: bearer token does not match (the entry stays staged).
        404: unknown or already consumed ``short_id``.
        410: entry older than ``AVATAR_SET_STAGING_TTL_SEC`` (it is dropped).
    """
    short_id = request.match_info.get("short_id", "")
    if not short_id:
        return web.Response(status=400, text="missing short_id")

    sets = request.app[AVATAR_SETS_KEY]
    # Validate the request fully (existence, TTL, auth) before consuming
    # the staged entry. An unauthenticated probe must not be able to
    # invalidate a legitimate transfer just by guessing the short_id,
    # and a real fetch that fails auth due to a transient header issue
    # must still find the entry on retry.
    async with request.app[AVATAR_SETS_LOCK_KEY]:
        staging = sets.get(short_id)
        if staging is None:
            return web.Response(status=404, text="not_found_or_consumed")

        if time.time() - staging.created_at > AVATAR_SET_STAGING_TTL_SEC:
            # Expired — drop the slot so it doesn't linger.
            sets.pop(short_id, None)
            return web.Response(status=410, text="staging_expired")

        auth = request.headers.get("Authorization", "")
        expected_header = f"Bearer {staging.token}"
        # Constant-time comparison so response timing does not reveal how
        # much of a guessed token was correct. Bytes are compared because
        # compare_digest rejects non-ASCII str; ``surrogatepass`` keeps
        # any header value encodable.
        if not secrets.compare_digest(
            auth.encode("utf-8", "surrogatepass"),
            expected_header.encode("utf-8", "surrogatepass"),
        ):
            logger.warning(
                "Avatar set fetch auth rejected for short_id=%s", short_id
            )
            return web.Response(status=401, text="unauthorized")

        # Auth confirmed: consume the one-time entry now.
        sets.pop(short_id, None)

    logger.info(
        "Serving avatar set: short_id=%s mode=%s bytes=%d",
        short_id, staging.mode, len(staging.payload),
    )
    return web.Response(
        body=staging.payload,
        content_type="application/octet-stream",
        headers={
            "X-Avatar-Mode": staging.mode,
            "X-Avatar-Sha256": staging.sha256,
            "Content-Length": str(len(staging.payload)),
        },
    )
|
|
293
|
+
|
|
294
|
+
|
|
295
|
+
async def _pcm_chunks_from_request(
    request: web.Request,
) -> AsyncIterator[bytes]:
    """Expose the request body as an async stream of PCM byte chunks.

    The body is read through ``request.content.iter_chunked(8192)``, so
    the pieces arrive as the upload does and are requested with a size of
    8192. Every piece is yielded as received, with no filtering or
    re-framing here; aligning the data to Opus frame boundaries and
    treating empty chunks as no-ops is left to ``send_pcm_stream``.
    """
    body_stream = request.content
    async for piece in body_stream.iter_chunked(8192):
        yield piece
|
|
311
|
+
|
|
312
|
+
|
|
313
|
+
def _pcm_error(message: str, status: int) -> web.Response:
    """Build the JSON ``{"error": ...}`` response used by /pcm failure paths."""
    return web.Response(
        text=json.dumps({"error": message}),
        status=status,
        content_type="application/json",
    )


async def handle_pcm(request: web.Request) -> web.Response:
    """Stream PCM bytes from an external producer to the connected device.

    The handler authenticates the bearer token, validates the
    ``X-Sample-Rate`` and ``X-Channels`` headers, then hands the request
    body to :func:`stackchan_mcp.tts.send_pcm_stream`. An optional
    ``X-Message-Id`` header is folded into the source label passed along.

    Responses (all JSON):
        200: the summary returned by ``send_pcm_stream``.
        400: missing, malformed or non-positive ``X-Sample-Rate``;
            malformed ``X-Channels``; or a channel count other than 1.
        401: a PCM token is configured and the bearer header does not
            match it.
        503: no gateway instance is attached to the application, or
            ``send_pcm_stream`` reported that no ESP32 is connected.
        500: the ``[tts]`` extra is not importable, or ``send_pcm_stream``
            raised any other ``RuntimeError``.
    """
    expected_token = request.app[PCM_TOKEN_KEY]
    if expected_token and not _is_authorized(
        request.headers.get("Authorization", ""), expected_token
    ):
        logger.warning("PCM upload auth rejected")
        return _pcm_error("Unauthorized", 401)

    rate_header = request.headers.get("X-Sample-Rate", "")
    try:
        source_rate = int(rate_header)
    except (TypeError, ValueError):
        return _pcm_error(
            f"Missing or invalid X-Sample-Rate header: {rate_header!r}", 400
        )
    if source_rate <= 0:
        # Non-positive rates would crash resample_pcm16_linear with a
        # ZeroDivisionError (which the RuntimeError handler below does
        # not translate) and never produce a valid frame anyway. Reject
        # at the boundary so the caller gets a clean 400 instead of
        # an internal server error trail.
        return _pcm_error(
            f"X-Sample-Rate must be a positive integer: {rate_header!r}", 400
        )

    channels_header = request.headers.get("X-Channels", "1")
    if channels_header.strip():
        try:
            channels = int(channels_header)
        except (TypeError, ValueError):
            # A value such as "2.0" or "stereo" must not be silently
            # taken as mono: the body would then be multi-channel PCM
            # pushed through a mono pipeline.
            return _pcm_error(
                f"Invalid X-Channels header: {channels_header!r}", 400
            )
    else:
        # A blank header is treated like an absent one (default mono).
        channels = 1
    if channels != 1:
        # send_pcm_stream is configured for mono via DEVICE_CHANNELS. Multi-
        # channel sources would need downmix before they get here; rejecting
        # them up front is clearer than silently mixing.
        return _pcm_error(
            f"Only mono PCM is supported, got channels={channels}", 400
        )

    message_id = request.headers.get("X-Message-Id", "")
    source_label = f"http_pcm:{message_id}" if message_id else "http_pcm"

    gateway = request.app[GATEWAY_KEY]
    if gateway is None:
        return _pcm_error("Gateway not available", 503)

    # Lazy import: tts.send_pcm_stream pulls in opuslib, which is in the
    # ``[tts]`` extra. Capture-only deployments must keep working
    # without the extra, so we only require it when /pcm is actually
    # used.
    try:
        from .tts import send_pcm_stream
    except ImportError as exc:
        return _pcm_error(f"PCM endpoint requires the [tts] extra: {exc}", 500)

    try:
        result = await send_pcm_stream(
            gateway,
            _pcm_chunks_from_request(request),
            source_rate=source_rate,
            source_label=source_label,
        )
    except RuntimeError as exc:
        # send_pcm_stream raises RuntimeError on no-device / protocol
        # mismatch / opuslib missing / disconnect mid-stream. Translate
        # to a clean HTTP error rather than letting the traceback leak.
        message = str(exc)
        status = 503 if "no esp32" in message.lower() else 500
        return _pcm_error(message, status)

    return web.Response(text=json.dumps(result), content_type="application/json")
|
|
428
|
+
|
|
429
|
+
|
|
430
|
+
def create_capture_app(
    capture_token: str = "",
    pcm_token: str = "",
    gateway: "Gateway | None" = None,
) -> web.Application:
    """Build the aiohttp application serving /capture, /avatar_set and /pcm.

    ``capture_token`` is the bearer token for ESP32 photo uploads; it
    stays the first positional argument so the older single-argument call
    form keeps working. ``pcm_token`` is the bearer token for external PCM
    producers. When it is empty, /pcm accepts any request, in line with
    the gateway's "no STACKCHAN_TOKEN set" behaviour for local
    development.

    ``gateway`` is the :class:`Gateway` instance the /pcm handler
    dispatches to. It may be ``None`` (for example when only /capture is
    under test), in which case /pcm answers 503.
    """
    # aiohttp's default per-request body limit is 1 MiB;
    # ``client_max_size=0`` switches it off. /pcm has to accept PCM
    # streams of arbitrary length (multi-minute TTS, live audio mixes),
    # and with the default limit a chunked producer was cut off once its
    # cumulative body passed the cap: a 200-second TTS push was seen to
    # abort about 36 s in, at roughly 2 MiB of source-rate PCM. The /pcm
    # handler adds no size limit of its own; pacing comes from the
    # device-side Opus push rate inside ``send_pcm_stream``. /capture is
    # not left unbounded by this: its handler applies its own
    # ``CAPTURE_MAX_BYTES`` check.
    app = web.Application(client_max_size=0)

    # Credentials and the gateway handle read by the request handlers.
    app[CAPTURE_TOKEN_KEY] = capture_token
    app[PCM_TOKEN_KEY] = pcm_token
    app[GATEWAY_KEY] = gateway

    # Staging area for one-time avatar set downloads, plus its lock.
    app[AVATAR_SETS_KEY] = {}
    app[AVATAR_SETS_LOCK_KEY] = asyncio.Lock()

    app.add_routes(
        [
            web.post("/capture", handle_capture),
            web.get("/avatar_set/{short_id}", handle_avatar_set_fetch),
            web.post("/pcm", handle_pcm),
        ]
    )
    return app
|