stackchan-mcp 0.9.1__py3-none-win_amd64.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (43) hide show
  1. stackchan_mcp/__init__.py +81 -0
  2. stackchan_mcp/__main__.py +12 -0
  3. stackchan_mcp/_libs/SOURCES.md +130 -0
  4. stackchan_mcp/_libs/opus.dll +0 -0
  5. stackchan_mcp/audio_input_hook.py +432 -0
  6. stackchan_mcp/audio_stream.py +162 -0
  7. stackchan_mcp/capture_server.py +469 -0
  8. stackchan_mcp/cli.py +958 -0
  9. stackchan_mcp/esp32_client.py +983 -0
  10. stackchan_mcp/event_log.py +189 -0
  11. stackchan_mcp/gateway.py +274 -0
  12. stackchan_mcp/handlers/__init__.py +7 -0
  13. stackchan_mcp/handlers/audio.py +21 -0
  14. stackchan_mcp/handlers/camera.py +25 -0
  15. stackchan_mcp/handlers/robot.py +52 -0
  16. stackchan_mcp/http_server.py +398 -0
  17. stackchan_mcp/mcp_router.py +126 -0
  18. stackchan_mcp/mdns_advertiser.py +347 -0
  19. stackchan_mcp/notify.example.yml +21 -0
  20. stackchan_mcp/notify_config.py +235 -0
  21. stackchan_mcp/ownership.py +270 -0
  22. stackchan_mcp/protocol.py +95 -0
  23. stackchan_mcp/queue.py +191 -0
  24. stackchan_mcp/server.py +28 -0
  25. stackchan_mcp/stdio_server.py +1365 -0
  26. stackchan_mcp/stt/__init__.py +62 -0
  27. stackchan_mcp/stt/audio_utils.py +102 -0
  28. stackchan_mcp/stt/base.py +94 -0
  29. stackchan_mcp/stt/faster_whisper.py +217 -0
  30. stackchan_mcp/stt/openai_whisper.py +177 -0
  31. stackchan_mcp/stt/orchestrator.py +568 -0
  32. stackchan_mcp/tools.py +82 -0
  33. stackchan_mcp/tts/__init__.py +62 -0
  34. stackchan_mcp/tts/audio_utils.py +177 -0
  35. stackchan_mcp/tts/base.py +86 -0
  36. stackchan_mcp/tts/orchestrator.py +688 -0
  37. stackchan_mcp/tts/voicevox.py +184 -0
  38. stackchan_mcp-0.9.1.dist-info/METADATA +324 -0
  39. stackchan_mcp-0.9.1.dist-info/RECORD +43 -0
  40. stackchan_mcp-0.9.1.dist-info/WHEEL +5 -0
  41. stackchan_mcp-0.9.1.dist-info/entry_points.txt +2 -0
  42. stackchan_mcp-0.9.1.dist-info/licenses/LICENSE +39 -0
  43. stackchan_mcp-0.9.1.dist-info/licenses/LICENSE-THIRD-PARTY +65 -0
@@ -0,0 +1,162 @@
1
+ """Opus audio frame handling for the gateway <-> device link.
2
+
3
+ Outbound (TTS) frames are produced by
4
+ :mod:`stackchan_mcp.tts.audio_utils` and pushed here to the connected
5
+ ESP32 via :meth:`stackchan_mcp.esp32_client.ESP32Manager.send_audio_frame`.
6
+
7
+ The inbound side (STT pipeline, Phase 4 / Issue #91) is now wired:
8
+ binary frames coming up from the device land in
9
+ :func:`handle_audio_frame`, which buffers them into a module-level
10
+ recording slot when one is active. The
11
+ :mod:`stackchan_mcp.stt.orchestrator` opens the slot via
12
+ :func:`start_recording` before sending ``listen.start`` to the device
13
+ and closes it via :func:`stop_recording` after the capture window;
14
+ outside an active recording, inbound frames are still discarded.
15
+
16
+ The recording slot is intentionally a module-level singleton: the
17
+ device's :class:`stackchan_mcp.esp32_client.ESP32Manager` only manages
18
+ one connection, and the STT orchestrator serialises ``listen()`` calls
19
+ through :attr:`ESP32Manager.listen_lock`, so concurrent captures
20
+ cannot race the buffer. If multi-device support lands later, this
21
+ should move onto the connection object.
22
+ """
23
+
24
+ from __future__ import annotations
25
+
26
+ import logging
27
+ from typing import TYPE_CHECKING, Iterable
28
+
29
+ if TYPE_CHECKING:
30
+ from .esp32_client import ESP32Manager
31
+
32
+ logger = logging.getLogger(__name__)
33
+
34
+
35
# --- Recording slot (inbound STT capture) ---------------------------------
#
# Module-level capture state. ``_recording_session_id`` is ``None`` while no
# capture is open; otherwise it names the session whose frames are being
# collected in ``_recording_frames``. This module only owns the buffer; the
# comments in this file state that the orchestrator's ``listen_lock`` is what
# keeps captures from overlapping.

_recording_session_id: str | None = None
_recording_frames: list[bytes] = []
42
+
43
+
44
def start_recording(session_id: str) -> None:
    """Begin a new capture for ``session_id`` with an empty frame buffer.

    Anything still sitting in the buffer from an earlier capture is thrown
    away, so frames from a capture that never reached
    :func:`stop_recording` cannot leak into this one. If a slot was still
    open when this is called, a warning is logged with the stale session
    id and the number of frames being dropped.
    """
    global _recording_session_id, _recording_frames
    previous_session = _recording_session_id
    if previous_session is not None:
        # Not expected in normal operation; make the overlap visible in
        # the logs instead of silently replacing the slot.
        logger.warning(
            "start_recording called while session=%s was still active; "
            "dropping %d buffered frames",
            previous_session,
            len(_recording_frames),
        )
    _recording_frames = []
    _recording_session_id = session_id
65
+
66
+
67
def stop_recording() -> list[bytes]:
    """End the current capture and hand back the Opus frames it collected.

    The slot is always reset (session cleared, buffer replaced by a new
    empty list), so the result is an empty list when nothing was being
    recorded or no frames arrived.
    """
    global _recording_session_id, _recording_frames
    # Swap the buffer out in one step: the caller gets the filled list and
    # the module keeps a fresh, unrelated one.
    captured, _recording_frames = _recording_frames, []
    _recording_session_id = None
    return captured
79
+
80
+
81
def is_recording() -> bool:
    """Tell whether a capture slot is open right now."""
    slot_open = _recording_session_id is not None
    return slot_open
84
+
85
+
86
def is_recording_session(session_id: str) -> bool:
    """Tell whether the open capture slot is owned by ``session_id``.

    Intended for per-session cleanup code that must confirm it still owns
    the recording before tearing it down: a handler whose session has been
    superseded gets ``False`` and should leave the active buffer alone.
    Also ``False`` when no slot is open.
    """
    owner = _recording_session_id
    return owner == session_id
95
+
96
+
97
async def handle_audio_frame(data: bytes, session_id: str) -> None:
    """Route one inbound binary Opus frame from the device.

    The frame is appended to the capture buffer only when a recording slot
    is open *and* it was opened for ``session_id``. In every other case
    the frame is dropped and a debug line records why:

    * no slot is open, or
    * the slot belongs to a different session, so a frame from another
      connection cannot end up in this capture's buffer.
    """
    active_session = _recording_session_id
    frame_size = len(data)

    if active_session is None:
        logger.debug(
            "audio_frame session=%s bytes=%d (discarded — no active recording)",
            session_id,
            frame_size,
        )
        return

    if active_session == session_id:
        _recording_frames.append(data)
        logger.debug(
            "audio_frame session=%s bytes=%d buffered (recording active)",
            session_id,
            frame_size,
        )
        return

    # The sender is not the session that owns the recording. Mixing its
    # audio into the buffer would corrupt the owner's transcription, so
    # the frame is dropped.
    logger.debug(
        "audio_frame session=%s bytes=%d (discarded — does not match "
        "recording session=%s)",
        session_id,
        frame_size,
        active_session,
    )
144
+
145
+
146
async def push_opus_frames(
    esp32: ESP32Manager,
    frames: Iterable[bytes],
) -> int:
    """Send each Opus frame to the device, in order, and count them.

    Every frame is awaited through ``esp32.send_audio_frame`` before the
    next one is taken from ``frames``. The return value is the number of
    frames that were sent (``0`` for an empty iterable). Any exception
    raised by ``send_audio_frame`` propagates to the caller unchanged and
    stops the push at that frame.
    """
    delivered = 0
    for delivered, payload in enumerate(frames, start=1):
        await esp32.send_audio_frame(payload)
    return delivered
@@ -0,0 +1,469 @@
1
+ """HTTP capture server for receiving photos from ESP32 and PCM from external producers.
2
+
3
+ Two POST endpoints share this server:
4
+
5
+ - ``POST /capture``: ESP32's camera.Explain() uploads JPEG photos as
6
+ multipart/form-data (fields: ``question`` text + ``file`` JPEG).
7
+ Authenticated via ``CAPTURE_TOKEN_KEY`` (the gateway's ``vision_token``).
8
+ The server saves the JPEG to ``~/.stackchan/captures/`` and returns the
9
+ file path so the MCP client can view the image via the Read tool.
10
+
11
+ - ``POST /pcm``: External producers (the SAIVerse voice-tts addon, etc.)
12
+ upload PCM audio for the device's speaker as a streaming body
13
+ (Content-Type: application/octet-stream, Transfer-Encoding: chunked).
14
+ Authenticated separately via ``PCM_TOKEN_KEY``. The request body is
15
+ fed directly into :func:`stackchan_mcp.tts.send_pcm_stream` so the
16
+ audio reaches the device with low latency, without buffering the
17
+ whole utterance.
18
+
19
+ The PCM endpoint is the entry point of the gateway's "external PCM
20
+ input" path — the receiving counterpart of the stdio ``say()`` MCP tool.
21
+ ``say()`` synthesises audio with a registered TTS engine inside the
22
+ gateway; ``POST /pcm`` lets external producers (which already did the
23
+ synthesis themselves, e.g. with a voice-cloning model the gateway does
24
+ not host) push the finished PCM through the same back-half pipeline
25
+ (:func:`send_pcm_stream`).
26
+
27
+ Required PCM request headers:
28
+
29
+ - ``Authorization: Bearer <PCM_TOKEN>`` — token comparison against
30
+ ``PCM_TOKEN_KEY`` (gateway's ``pcm_token`` property)
31
+ - ``X-Sample-Rate: <int>`` — sample rate of the source PCM (e.g. 32000).
32
+ The gateway resamples to the device's 16 kHz before Opus encoding.
33
+ - ``X-Channels: 1`` (optional, defaults to 1) — only mono is supported
34
+ for now (the device decoder is configured for mono).
35
+ - ``X-Message-Id: <str>`` (optional) — opaque identifier echoed back in
36
+ the log line so the producer can correlate uploads with downstream
37
+ device state.
38
+
39
+ The handler stores the active :class:`Gateway` instance in the
40
+ application's ``GATEWAY_KEY`` so it can dispatch to ``send_pcm_stream``
41
+ without coupling :mod:`capture_server` to the gateway module at import
42
+ time (lazy import inside the handler keeps the optional ``[tts]``
43
+ extra unnecessary for capture-only deployments).
44
+ """
45
+
46
+ from __future__ import annotations
47
+
48
+ import asyncio
49
+ import hashlib
50
+ import json
51
+ import logging
52
+ import os
53
+ import secrets
54
+ import time
55
+ from dataclasses import dataclass
56
+ from typing import TYPE_CHECKING, AsyncIterator
57
+
58
+ from aiohttp import web
59
+
60
+ if TYPE_CHECKING:
61
+ from .gateway import Gateway
62
+
63
+ logger = logging.getLogger(__name__)
64
+
65
# Directory (under the current user's home) where uploaded JPEGs are saved.
CAPTURE_DIR = os.path.expanduser("~/.stackchan/captures")

# Typed application keys for the per-app state the handlers read:
# the bearer tokens for /capture and /pcm, and the gateway object that
# /pcm dispatches to.
CAPTURE_TOKEN_KEY = web.AppKey("capture_token", str)
PCM_TOKEN_KEY = web.AppKey("pcm_token", str)
GATEWAY_KEY: web.AppKey = web.AppKey("gateway", object)

# Phase 4.5 avatar (saiverse-stackchan-addon): in-memory staging for
# one-time avatar set downloads, plus the lock that guards the staging
# dict. See docs/intent/stackchan_avatar_pipeline.md §C-2 in the SAIVerse
# repository.
AVATAR_SETS_KEY = web.AppKey("avatar_sets", dict)
AVATAR_SETS_LOCK_KEY = web.AppKey("avatar_sets_lock", asyncio.Lock)

# Seconds a staged avatar set may wait for its download before it is
# treated as expired and garbage-collected.
AVATAR_SET_STAGING_TTL_SEC = 120.0
78
+
79
+
80
@dataclass(frozen=True)
class _AvatarStaging:
    """Immutable record describing one staged avatar set awaiting download."""

    # Bearer secret the downloader must present to fetch this entry.
    token: str
    # Avatar mode label; staging accepts "layered" or "matrix".
    mode: str
    # Raw bytes served as the download body.
    payload: bytes
    # Digest of ``payload`` in the form "sha256:<hex>".
    sha256: str
    # ``time.time()`` at staging; compared against the staging TTL.
    created_at: float
87
+
88
+
89
# Upload ceiling for the JPEG capture route (8 MiB). The application is
# created without a global body-size limit because the PCM route streams
# payloads of arbitrary length, so the capture route has to police its own
# size. Per the original authors, camera JPEGs top out around 200 KB, which
# leaves generous headroom while still bounding how much a misbehaving or
# malicious uploader can write to the gateway host's disk.
CAPTURE_MAX_BYTES = 8 * 2**20
97
+
98
+
99
def _is_authorized(auth_header: str, expected_token: str) -> bool:
    """Return whether ``auth_header`` is exactly ``Bearer <expected_token>``.

    The comparison is done with :func:`secrets.compare_digest` so the time
    it takes does not reveal how many leading characters of a guessed
    token were correct (a plain ``==`` stops at the first mismatch).

    Args:
        auth_header: Raw value of the request's ``Authorization`` header
            (``""`` when the header is absent).
        expected_token: The secret the caller must present.

    Returns:
        ``True`` only on an exact match; ``False`` otherwise, including
        when ``auth_header`` is not a string.
    """
    if not isinstance(auth_header, str):
        return False
    expected_header = f"Bearer {expected_token}"
    # compare_digest only accepts ASCII-only str, so compare bytes.
    # "surrogatepass" keeps the encoding from raising on header values
    # that carry lone surrogates (undecodable bytes from the wire).
    return secrets.compare_digest(
        auth_header.encode("utf-8", "surrogatepass"),
        expected_header.encode("utf-8", "surrogatepass"),
    )
102
+
103
+
104
def _discard_capture_files(paths: list[str]) -> None:
    """Best-effort removal of capture files written by a failed upload."""
    for path in paths:
        try:
            os.remove(path)
        except OSError:
            # Already gone or not removable; nothing more to do here.
            pass


async def handle_capture(request: web.Request) -> web.Response:
    """Handle a multipart photo upload from the ESP32.

    Expects ``multipart/form-data`` with an optional ``question`` text
    part and a ``file`` part holding the JPEG. The image is written to
    ``CAPTURE_DIR`` as ``capture_<epoch-ms>.jpg``.

    Returns:
        * 200 with JSON ``{"image_path", "size_bytes", "question"}`` when
          an image was stored.
        * 401 when a capture token is configured and the ``Authorization``
          header does not match it.
        * 413 when the advertised ``Content-Length``, or the bytes
          actually received for the ``question`` and ``file`` parts,
          exceed ``CAPTURE_MAX_BYTES``.
        * 400 when the request carried no ``file`` part.

    A request that does not complete (size overrun, malformed body, client
    disconnect) leaves no capture file behind.
    """
    expected_token = request.app[CAPTURE_TOKEN_KEY]
    if expected_token and not _is_authorized(
        request.headers.get("Authorization", ""), expected_token
    ):
        logger.warning("Capture upload auth rejected")
        return web.Response(
            text='{"error": "Unauthorized"}',
            status=401,
            content_type="application/json",
        )

    too_large_body = json.dumps(
        {"error": f"Upload exceeds {CAPTURE_MAX_BYTES} bytes"}
    )

    # Per-route body cap. The application-wide client_max_size is
    # disabled because /pcm streams arbitrary-length audio, so
    # /capture's defense lives here. Reject up front based on the
    # advertised Content-Length when available, and enforce again
    # while streaming so a missing or misadvertised header cannot
    # bypass the cap.
    content_length = request.content_length
    if content_length is not None and content_length > CAPTURE_MAX_BYTES:
        logger.warning(
            "Capture upload rejected: Content-Length %d exceeds %d",
            content_length, CAPTURE_MAX_BYTES,
        )
        return web.Response(
            text=too_large_body,
            status=413,
            content_type="application/json",
        )

    os.makedirs(CAPTURE_DIR, exist_ok=True)

    reader = await request.multipart()
    question = ""
    image_path = ""
    bytes_received = 0
    saved_paths: list[str] = []
    over_cap = False
    completed = False

    try:
        async for part in reader:
            if part.name == "question":
                # Read in bounded chunks and count them against the cap;
                # reading the part in one call would buffer an
                # arbitrarily large body in memory.
                question_raw = bytearray()
                while not over_cap:
                    chunk = await part.read_chunk(8192)
                    if not chunk:
                        break
                    bytes_received += len(chunk)
                    if bytes_received > CAPTURE_MAX_BYTES:
                        over_cap = True
                    else:
                        question_raw += chunk
                if not over_cap:
                    question = question_raw.decode("utf-8")
            elif part.name == "file":
                timestamp = int(time.time() * 1000)
                filename = f"capture_{timestamp}.jpg"
                image_path = os.path.join(CAPTURE_DIR, filename)
                saved_paths.append(image_path)
                with open(image_path, "wb") as f:
                    while not over_cap:
                        chunk = await part.read_chunk(8192)
                        if not chunk:
                            break
                        bytes_received += len(chunk)
                        if bytes_received > CAPTURE_MAX_BYTES:
                            over_cap = True
                        else:
                            f.write(chunk)
            if over_cap:
                break
        completed = not over_cap
    finally:
        # Runs after the ``with`` block has closed the file, so removal
        # also works on platforms that refuse to delete open files.
        if not completed:
            _discard_capture_files(saved_paths)

    if over_cap:
        logger.warning(
            "Capture upload truncated at %d bytes (cap %d)",
            bytes_received, CAPTURE_MAX_BYTES,
        )
        return web.Response(
            text=too_large_body,
            status=413,
            content_type="application/json",
        )

    if image_path and os.path.exists(image_path):
        file_size = os.path.getsize(image_path)
        logger.info(
            "Captured photo: %s (%d bytes), question: %s",
            image_path,
            file_size,
            question,
        )
        result = json.dumps({
            "image_path": image_path,
            "size_bytes": file_size,
            "question": question,
        })
        return web.Response(text=result, content_type="application/json")

    return web.Response(
        text='{"error": "No image received"}',
        status=400,
        content_type="application/json",
    )
198
+
199
+
200
async def stage_avatar_set(
    app: web.Application,
    mode: str,
    payload: bytes,
) -> tuple[str, str, str]:
    """Register an avatar set for a single authenticated HTTP download.

    A random ``short_id`` and bearer ``token`` are generated and the
    payload is stored in the application's in-memory staging dict under
    ``short_id``. The download itself is served at
    ``GET /avatar_set/{short_id}`` with ``Authorization: Bearer <token>``.

    While holding the staging lock, entries older than
    ``AVATAR_SET_STAGING_TTL_SEC`` are dropped before the new one is added.

    Returns:
        ``(short_id, token, sha256)`` where ``sha256`` is
        ``"sha256:" + <hex digest of payload>``.

    Raises:
        ValueError: ``mode`` is neither ``"layered"`` nor ``"matrix"``.
    """
    accepted_modes = ("layered", "matrix")
    if mode not in accepted_modes:
        raise ValueError(f"unknown avatar mode: {mode}")

    short_id = secrets.token_hex(8)
    token = secrets.token_urlsafe(32)
    payload_digest = hashlib.sha256(payload).hexdigest()
    sha256 = "sha256:" + payload_digest

    entry = _AvatarStaging(
        token=token,
        mode=mode,
        payload=payload,
        sha256=sha256,
        created_at=time.time(),
    )

    registry = app[AVATAR_SETS_KEY]
    async with app[AVATAR_SETS_LOCK_KEY]:
        # Opportunistic sweep: collect the stale keys first so the dict
        # is not mutated while it is being iterated.
        now = time.time()
        stale_ids = []
        for staged_id, staged in registry.items():
            if now - staged.created_at > AVATAR_SET_STAGING_TTL_SEC:
                stale_ids.append(staged_id)
        for staged_id in stale_ids:
            registry.pop(staged_id, None)
        registry[short_id] = entry

    logger.info(
        "Staged avatar set: short_id=%s mode=%s bytes=%d sha256=%s",
        short_id, mode, len(payload), sha256,
    )
    return short_id, token, sha256
246
+
247
+
248
async def handle_avatar_set_fetch(request: web.Request) -> web.Response:
    """Serve a staged avatar set exactly once.

    The request is validated in full (entry exists, not expired, bearer
    token matches) before the staged entry is consumed, all under the
    staging lock. An unauthenticated request therefore cannot invalidate
    a pending transfer, and a fetch that fails auth can be retried.

    Returns:
        * 200 with the payload as ``application/octet-stream`` plus
          ``X-Avatar-Mode`` / ``X-Avatar-Sha256`` headers.
        * 400 when the route carries no ``short_id``.
        * 404 when no entry is staged under ``short_id``.
        * 410 when the entry outlived ``AVATAR_SET_STAGING_TTL_SEC``
          (the entry is dropped).
        * 401 when the ``Authorization`` header does not match the
          entry's token (the entry is kept).
    """
    short_id = request.match_info.get("short_id", "")
    if not short_id:
        return web.Response(status=400, text="missing short_id")

    sets = request.app[AVATAR_SETS_KEY]
    async with request.app[AVATAR_SETS_LOCK_KEY]:
        staging = sets.get(short_id)
        if staging is None:
            return web.Response(status=404, text="not_found_or_consumed")

        if time.time() - staging.created_at > AVATAR_SET_STAGING_TTL_SEC:
            # Expired — drop the slot so it doesn't linger.
            sets.pop(short_id, None)
            return web.Response(status=410, text="staging_expired")

        auth = request.headers.get("Authorization", "")
        expected_auth = f"Bearer {staging.token}"
        # Constant-time comparison: a plain ``!=`` stops at the first
        # differing character and so leaks, through timing, how much of
        # a guessed token was right. Compare bytes because
        # compare_digest rejects non-ASCII str; "surrogatepass" keeps
        # odd header bytes from raising during encoding.
        token_matches = secrets.compare_digest(
            auth.encode("utf-8", "surrogatepass"),
            expected_auth.encode("utf-8", "surrogatepass"),
        )
        if not token_matches:
            logger.warning(
                "Avatar set fetch auth rejected for short_id=%s", short_id
            )
            return web.Response(status=401, text="unauthorized")

        # Auth confirmed: consume the one-time entry now.
        sets.pop(short_id, None)

    logger.info(
        "Serving avatar set: short_id=%s mode=%s bytes=%d",
        short_id, staging.mode, len(staging.payload),
    )
    return web.Response(
        body=staging.payload,
        content_type="application/octet-stream",
        headers={
            "X-Avatar-Mode": staging.mode,
            "X-Avatar-Sha256": staging.sha256,
            "Content-Length": str(len(staging.payload)),
        },
    )
293
+
294
+
295
async def _pcm_chunks_from_request(
    request: web.Request,
) -> AsyncIterator[bytes]:
    """Stream the request body as a sequence of PCM byte chunks.

    The body is read through ``request.content.iter_chunked(8192)`` and
    every piece it produces is passed on untouched, in arrival order.
    Pieces are not re-aligned here; the original notes say the consumer
    (``send_pcm_stream``) accepts any chunk size and does its own Opus
    frame alignment.
    """
    body_stream = request.content
    async for piece in body_stream.iter_chunked(8192):
        yield piece
311
+
312
+
313
async def handle_pcm(request: web.Request) -> web.Response:
    """Forward a streamed PCM upload to the connected device.

    Steps, in order: check the bearer token (only when a PCM token is
    configured), parse ``X-Sample-Rate``, check ``X-Channels``, look up
    the gateway, import ``send_pcm_stream`` lazily, then hand it the
    request body as a chunk stream.

    Returns:
        * 200 with the JSON-encoded result of ``send_pcm_stream``.
        * 401 on token mismatch.
        * 400 when ``X-Sample-Rate`` is missing, not an integer, or not
          positive, or when ``X-Channels`` parses to something other
          than 1 (an unparseable value is treated as 1).
        * 503 when no gateway is attached to the app, or when
          ``send_pcm_stream`` raises a ``RuntimeError`` whose message
          contains "no esp32" (case-insensitive).
        * 500 when the ``tts`` package cannot be imported, or for any
          other ``RuntimeError`` from ``send_pcm_stream``.
    """

    def error_reply(status: int, message: str) -> web.Response:
        # Every failure path answers with the same {"error": ...} shape.
        return web.Response(
            text=json.dumps({"error": message}),
            status=status,
            content_type="application/json",
        )

    headers = request.headers

    expected_token = request.app[PCM_TOKEN_KEY]
    if expected_token:
        presented = headers.get("Authorization", "")
        if not _is_authorized(presented, expected_token):
            logger.warning("PCM upload auth rejected")
            return error_reply(401, "Unauthorized")

    rate_header = headers.get("X-Sample-Rate", "")
    try:
        source_rate = int(rate_header)
    except (TypeError, ValueError):
        return error_reply(
            400, f"Missing or invalid X-Sample-Rate header: {rate_header!r}"
        )
    if source_rate <= 0:
        # A non-positive rate can never yield a valid frame; refuse it
        # at the boundary so the caller gets a clean 400 rather than a
        # failure from deeper in the audio pipeline.
        return error_reply(
            400, f"X-Sample-Rate must be a positive integer: {rate_header!r}"
        )

    try:
        channels = int(headers.get("X-Channels", "1"))
    except (TypeError, ValueError):
        channels = 1
    if channels != 1:
        # Only mono is accepted; multi-channel input is refused up front
        # instead of being mixed down silently.
        return error_reply(
            400, f"Only mono PCM is supported, got channels={channels}"
        )

    message_id = headers.get("X-Message-Id", "")
    if message_id:
        source_label = f"http_pcm:{message_id}"
    else:
        source_label = "http_pcm"

    gateway = request.app[GATEWAY_KEY]
    if gateway is None:
        return error_reply(503, "Gateway not available")

    # Imported here, not at module import, so deployments that only use
    # /capture keep working when the ``[tts]`` extra is not installed.
    try:
        from .tts import send_pcm_stream
    except ImportError as exc:
        return error_reply(
            500, f"PCM endpoint requires the [tts] extra: {exc}"
        )

    try:
        result = await send_pcm_stream(
            gateway,
            _pcm_chunks_from_request(request),
            source_rate=source_rate,
            source_label=source_label,
        )
    except RuntimeError as exc:
        # Report the failure as a clean HTTP error instead of letting
        # the traceback escape the handler.
        message = str(exc)
        if "no esp32" in message.lower():
            return error_reply(503, message)
        return error_reply(500, message)

    return web.Response(text=json.dumps(result), content_type="application/json")
428
+
429
+
430
def create_capture_app(
    capture_token: str = "",
    pcm_token: str = "",
    gateway: "Gateway | None" = None,
) -> web.Application:
    """Build the aiohttp application serving /capture, /avatar_set and /pcm.

    Args:
        capture_token: Bearer token required for ``POST /capture``. An
            empty string turns the check off.
        pcm_token: Bearer token required for ``POST /pcm``. An empty
            string turns the check off, so any /pcm request is accepted.
        gateway: Object the /pcm handler dispatches to. With ``None``
            (e.g. when only /capture is under test) /pcm answers 503.

    Returns:
        The configured application, with an empty avatar staging dict
        and a fresh lock guarding it.
    """
    # client_max_size=0 switches off aiohttp's per-request body limit
    # (1 MiB by default). /pcm legitimately streams very long uploads,
    # and per the original authors the default limit cut a 200-second
    # TTS push off mid-stream. /capture is not left unprotected by this:
    # its handler enforces CAPTURE_MAX_BYTES itself.
    app = web.Application(client_max_size=0)

    # Shared state read by the handlers.
    app[CAPTURE_TOKEN_KEY] = capture_token
    app[AVATAR_SETS_KEY] = {}
    app[AVATAR_SETS_LOCK_KEY] = asyncio.Lock()
    app[PCM_TOKEN_KEY] = pcm_token
    app[GATEWAY_KEY] = gateway

    router = app.router
    route_table = (
        (router.add_post, "/capture", handle_capture),
        (router.add_get, "/avatar_set/{short_id}", handle_avatar_set_fetch),
        (router.add_post, "/pcm", handle_pcm),
    )
    for register, path, handler in route_table:
        register(path, handler)
    return app