stackchan-mcp 0.9.1__py3-none-win_amd64.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (43) hide show
  1. stackchan_mcp/__init__.py +81 -0
  2. stackchan_mcp/__main__.py +12 -0
  3. stackchan_mcp/_libs/SOURCES.md +130 -0
  4. stackchan_mcp/_libs/opus.dll +0 -0
  5. stackchan_mcp/audio_input_hook.py +432 -0
  6. stackchan_mcp/audio_stream.py +162 -0
  7. stackchan_mcp/capture_server.py +469 -0
  8. stackchan_mcp/cli.py +958 -0
  9. stackchan_mcp/esp32_client.py +983 -0
  10. stackchan_mcp/event_log.py +189 -0
  11. stackchan_mcp/gateway.py +274 -0
  12. stackchan_mcp/handlers/__init__.py +7 -0
  13. stackchan_mcp/handlers/audio.py +21 -0
  14. stackchan_mcp/handlers/camera.py +25 -0
  15. stackchan_mcp/handlers/robot.py +52 -0
  16. stackchan_mcp/http_server.py +398 -0
  17. stackchan_mcp/mcp_router.py +126 -0
  18. stackchan_mcp/mdns_advertiser.py +347 -0
  19. stackchan_mcp/notify.example.yml +21 -0
  20. stackchan_mcp/notify_config.py +235 -0
  21. stackchan_mcp/ownership.py +270 -0
  22. stackchan_mcp/protocol.py +95 -0
  23. stackchan_mcp/queue.py +191 -0
  24. stackchan_mcp/server.py +28 -0
  25. stackchan_mcp/stdio_server.py +1365 -0
  26. stackchan_mcp/stt/__init__.py +62 -0
  27. stackchan_mcp/stt/audio_utils.py +102 -0
  28. stackchan_mcp/stt/base.py +94 -0
  29. stackchan_mcp/stt/faster_whisper.py +217 -0
  30. stackchan_mcp/stt/openai_whisper.py +177 -0
  31. stackchan_mcp/stt/orchestrator.py +568 -0
  32. stackchan_mcp/tools.py +82 -0
  33. stackchan_mcp/tts/__init__.py +62 -0
  34. stackchan_mcp/tts/audio_utils.py +177 -0
  35. stackchan_mcp/tts/base.py +86 -0
  36. stackchan_mcp/tts/orchestrator.py +688 -0
  37. stackchan_mcp/tts/voicevox.py +184 -0
  38. stackchan_mcp-0.9.1.dist-info/METADATA +324 -0
  39. stackchan_mcp-0.9.1.dist-info/RECORD +43 -0
  40. stackchan_mcp-0.9.1.dist-info/WHEEL +5 -0
  41. stackchan_mcp-0.9.1.dist-info/entry_points.txt +2 -0
  42. stackchan_mcp-0.9.1.dist-info/licenses/LICENSE +39 -0
  43. stackchan_mcp-0.9.1.dist-info/licenses/LICENSE-THIRD-PARTY +65 -0
@@ -0,0 +1,81 @@
1
+ """stackchan-mcp: Two-faced gateway for StackChan (xiaozhi-esp32).
2
+
3
+ MCP client side: stdio MCP server (mcp Python SDK)
4
+ ESP32 side: WebSocket server (MCP client over JSON-RPC 2.0)
5
+ """
6
+
7
+ import os as _os
8
+ import platform as _platform
9
+ import sys as _sys
10
+ from importlib.metadata import PackageNotFoundError, version
11
+ from pathlib import Path as _Path
12
+
13
+ try:
14
+ __version__ = version("stackchan-mcp")
15
+ except PackageNotFoundError: # pragma: no cover - source checkout without install
16
+ __version__ = "0.0.0+unknown"
17
+
18
+ # Windows: register the bundled native libs directory with the DLL
19
+ # search path before any submodule pulls in `opuslib` (or any other
20
+ # wrapper that calls `ctypes.util.find_library`). On Linux/macOS the
21
+ # system package manager typically already provides libopus, so we do
22
+ # nothing on those platforms.
23
+ #
24
+ # Why this is here and not in tts/__init__.py or stt/__init__.py:
25
+ # opuslib's libopus lookup happens at import time (the wrapper's
26
+ # top-level module unconditionally calls `find_library('opus')` and
27
+ # raises if it returns None). That means we need the DLL search path
28
+ # update to have run before *any* code imports opuslib, no matter
29
+ # which subpackage of stackchan_mcp loads first. The package
30
+ # `__init__.py` is the only place guaranteed to run before all
31
+ # sibling submodules.
32
+ #
33
+ # Why we update BOTH `os.add_dll_directory()` AND `os.environ["PATH"]`:
34
+ # - `os.add_dll_directory()` is the modern, isolated mechanism used by
35
+ # `LoadLibraryEx(..., LOAD_LIBRARY_SEARCH_USER_DIRS)`. Importantly,
36
+ # `ctypes.util.find_library()` on Windows uses the legacy
37
+ # `LoadLibraryW()` path which does **not** consult the
38
+ # `add_dll_directory()` list (see CPython issue #43603). Since
39
+ # `opuslib/api/__init__.py` calls exactly that — `find_library('opus')`
40
+ # — we also have to prepend the directory to PATH so the legacy
41
+ # resolver picks it up.
42
+ # - We add to `add_dll_directory()` too because direct `ctypes.CDLL(...)`
43
+ # / extension-module imports use the modern resolver, and we want
44
+ # bundle discovery to keep working for both API styles going forward.
45
+ #
46
+ # See `stackchan_mcp/_libs/SOURCES.md` for the bundled DLL provenance.
47
+ # Architecture gate: the bundled `opus.dll` is built for `win_amd64`
48
+ # (x86_64). On Windows ARM64 / Windows x86 (32-bit), loading the x64
49
+ # DLL would fail with a native-image mismatch — *exactly* the
50
+ # "looks installed but fails at runtime" footgun this bundling is
51
+ # meant to remove. Skip the DLL search-path setup on those
52
+ # architectures so the user falls back to the same
53
+ # "find_library returns None" failure mode they had before this
54
+ # fix, which at least produces a clean ImportError on
55
+ # `import opuslib` rather than a confusing crash inside the DLL
56
+ # loader. A platform-specific wheel build would have rejected those
57
+ # architectures at install time (no compatible wheel), so this
58
+ # guard mostly matters for users who bypass wheel selection (e.g.
59
+ # by installing from sdist on a non-x64 Windows host).
60
# CPU architecture string, upper-cased; only queried on Windows. The empty
# string on every other platform makes the architecture gate below fail.
_machine = _platform.machine().upper() if _sys.platform == "win32" else ""
# Handle returned by `os.add_dll_directory()`; stays None when nothing was
# registered. It is stored at module scope so the registration is never
# closed for the lifetime of the process.
# NOTE(review): CPython documents that the registration stays valid until
# close() is called on the handle — confirm whether dropping the reference
# alone would ever de-register it; holding it is harmless either way.
_dll_dir_handle = None

# Only Windows x64 can load the bundled DLL; other platforms/architectures
# skip the whole setup.
if _sys.platform == "win32" and _machine in ("AMD64", "X86_64"):
    _libs_dir = _Path(__file__).resolve().parent / "_libs"
    if _libs_dir.is_dir():
        _libs_str = str(_libs_dir)
        # Modern resolver: register the bundle directory explicitly.
        _dll_dir_handle = _os.add_dll_directory(_libs_str)
        # Legacy resolver: make the directory the first PATH entry unless
        # it is already listed.
        _existing_path = _os.environ.get("PATH", "")
        if _libs_str not in _existing_path.split(_os.pathsep):
            _os.environ["PATH"] = _os.pathsep.join((_libs_str, _existing_path))
@@ -0,0 +1,12 @@
1
+ """Entry point: ``python -m stackchan_mcp``.
2
+
3
+ The actual implementation lives in :mod:`stackchan_mcp.cli` so that the
4
+ console script and ``python -m`` paths share a single side-effect-free
5
+ import surface.
6
+ """
7
+
8
+ from .cli import main
9
+
10
+
11
+ if __name__ == "__main__":
12
+ main()
@@ -0,0 +1,130 @@
1
+ # Bundled Native Libraries
2
+
3
+ This directory contains pre-built native shared libraries that the
4
+ gateway needs on platforms where the system package manager does not
5
+ typically ship them. At import time on Windows x64,
6
+ `stackchan_mcp/__init__.py` registers this directory via `os.add_dll_directory()` and prepends it to `PATH` so
7
+ that `ctypes.util.find_library()` calls inside Python wrapper packages
8
+ (e.g. `opuslib`) resolve to the bundled copy without any user setup.
9
+
10
+ ## Why bundle?
11
+
12
+ The Python wrapper packages that depend on these libraries (currently
13
+ `opuslib`, pulled in via the `[tts]` and `[stt]` extras) only ship
14
+ Python bindings — they do **not** ship the underlying native library.
15
+ On Linux and macOS most users already have `libopus` available through
16
+ their distro's package manager (`apt install libopus0`,
17
+ `brew install opus`, etc.), but on Windows there is no equivalent
18
+ default install path, which means a plain `pip install stackchan-mcp[tts]`
19
+ fails at runtime with `Could not find Opus library. Make sure it is
20
+ installed.` even though the Python wrappers installed cleanly.
21
+
22
+ Bundling the Windows binary in the wheel removes that footgun: every
23
+ Windows user who installs `stackchan-mcp[tts]` (or `[stt]`) gets a
24
+ working installation on the first try, with no extra `vcpkg` /
25
+ `conda install -c conda-forge libopus` / manual DLL placement step.
26
+
27
+ The decision to bundle (vs. download at install time vs. require source
28
+ build) was made on these criteria:
29
+
30
+ | Criterion | Verdict for libopus |
31
+ |---|---|
32
+ | Maturity of the dependency | Mature (Opus is a frozen IETF codec, RFC 6716, 2012) |
33
+ | Frequency of security advisories | Very low (the codec parser is small and well-audited) |
34
+ | File size | ~480 KB — fits comfortably in the wheel |
35
+ | Re-distribution license | BSD 3-clause (Xiph) — redistribution allowed with attribution |
36
+ | Long-term availability of upstream | Excellent (Xiph.Org maintains the source indefinitely) |
37
+
38
+ If any of those change (e.g. a future ML-based bundle that ships
39
+ hundreds of MB), revisit and consider the "CI downloads a pinned
40
+ version at build time" approach instead.
41
+
42
+ ## opus.dll
43
+
44
+ | Field | Value |
45
+ |---|---|
46
+ | Architecture | x86_64 (`win_amd64`) |
47
+ | License | BSD 3-clause + Xiph extension — see <https://opus-codec.org/license/> |
48
+ | Provenance | Built from upstream Opus source by the publish workflow via vcpkg |
49
+ | Build command | `vcpkg install opus:x64-windows` (CI runner: `windows-latest`) |
50
+
51
+ ### Provenance note
52
+
53
+ `opus.dll` is **not** tracked in git. The publish workflow
54
+ (`.github/workflows/publish.yml`, job `build-windows-wheel`)
55
+ bootstraps a fresh vcpkg checkout on a `windows-latest` runner,
56
+ runs `vcpkg install opus:x64-windows`, copies the produced
57
+ `opus.dll` into `stackchan_mcp/_libs/`, and logs its SHA256 to the
58
+ job log so reviewers can spot vcpkg-side binary drift before a tag
59
+ publishes. The wheel build that follows picks the DLL up via
60
+ `tool.hatch.build.targets.wheel.artifacts` in
61
+ `gateway/pyproject.toml`, and the resulting wheel is renamed from
62
+ `*-py3-none-any.whl` to `*-py3-none-win_amd64.whl` so pip resolves
63
+ it only on Windows x64 installs.
64
+
65
+ Builds on the Ubuntu runner (sdist + the `py3-none-any` wheel they
66
+ produce) do not place a DLL under `_libs/`, so those distributions
67
+ ship clean — non-Windows installs and non-x64 Windows installs
68
+ either fall back to a system `libopus` (Linux/macOS) or get a
69
+ clean "no compatible wheel" install-time message (Windows ARM64 /
70
+ x86 32-bit).
71
+
72
+ ### Local development
73
+
74
+ If you need a local Windows checkout to test the bundling path
75
+ (running `uv build` outside CI), mirror the CI step by:
76
+
77
+ 1. Installing libopus via vcpkg (`vcpkg install opus:x64-windows`)
78
+ and copying the produced DLL into `stackchan_mcp/_libs/opus.dll`.
79
+ 2. Or downloading the same `opus.dll` from a release artifact
80
+ uploaded by the publish workflow.
81
+ 3. Or installing system libopus and copying it into the directory.
82
+
83
+ The DLL is gitignored (see `gateway/.gitignore`) so a local copy
84
+ never sneaks into a commit.
85
+
86
+ ## License compliance
87
+
88
+ The Opus codec is distributed under the 3-clause BSD license (with the
89
+ optional Xiph patent grant), which permits redistribution in source or
90
+ binary form provided the copyright notice and license text are
91
+ preserved. The canonical notice ships at the top of every gateway
92
+ distribution as `LICENSE-THIRD-PARTY` (declared in
93
+ `gateway/pyproject.toml`'s `license-files`); the same text is
94
+ reproduced below as the bundling-rationale narrative for readers of
95
+ this document.
96
+
97
+ ```
98
+ Copyright 2001-2023 Xiph.Org, Skype Limited, Octasic,
99
+ Jean-Marc Valin, Timothy B. Terriberry,
100
+ CSIRO, Gregory Maxwell, Mark Borgerding,
101
+ Erik de Castro Lopo
102
+
103
+ Redistribution and use in source and binary forms, with or without
104
+ modification, are permitted provided that the following conditions
105
+ are met:
106
+
107
+ - Redistributions of source code must retain the above copyright
108
+ notice, this list of conditions and the following disclaimer.
109
+
110
+ - Redistributions in binary form must reproduce the above copyright
111
+ notice, this list of conditions and the following disclaimer in the
112
+ documentation and/or other materials provided with the distribution.
113
+
114
+ - Neither the name of Internet Society, IETF or IETF Trust, nor the
115
+ names of specific contributors, may be used to endorse or promote
116
+ products derived from this software without specific prior written
117
+ permission.
118
+
119
+ THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
120
+ "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
121
+ LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
122
+ A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
123
+ OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
124
+ SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
125
+ LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
126
+ DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
127
+ THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
128
+ (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
129
+ OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
130
+ ```
Binary file
@@ -0,0 +1,432 @@
1
+ """Device-driven listen audio forwarding to an external HTTP hook.
2
+
3
+ When the ESP32 device autonomously enters listening mode — wake word
4
+ detection (``WakeWordInvoke``), button press, or LCD touch
5
+ (``ToggleChatState``) — the gateway's MCP-driven STT pipeline is not
6
+ running because there is no concurrent ``listen()`` tool call to open a
7
+ recording slot. The inbound Opus frames are therefore discarded today
8
+ (see :mod:`stackchan_mcp.audio_stream` module docstring).
9
+
10
+ This module fills that gap. When configured with a hook URL, the
11
+ gateway opens a recording slot on inbound ``{"type":"listen",
12
+ "state":"start"}`` messages from the device, buffers the Opus frames
13
+ into the same module-level slot used by the MCP-driven path, packs them
14
+ into an Ogg/Opus container on ``{"state":"stop"}``, and POSTs the
15
+ payload to the hook.
16
+
17
+ Configuration:
18
+
19
+ - ``STACKCHAN_AUDIO_HOOK_URL`` — HTTP(S) URL of the receiver. The
20
+ device-driven capture path is silently disabled when unset.
21
+ - ``STACKCHAN_AUDIO_HOOK_TOKEN`` — Bearer token; falls back to
22
+ ``STACKCHAN_TOKEN`` so a single-token setup works without extra
23
+ configuration.
24
+
25
+ The capture path is opt-in by design: stackchan-mcp's primary listen
26
+ model is MCP-client-driven (the ``listen()`` tool), and device-driven
27
+ capture only makes sense when an external service is set up to receive
28
+ the audio. Leaving the hook URL unset keeps the gateway's behaviour
29
+ unchanged from server-driven listen.
30
+
31
+ Ogg container implementation note: we assemble the container directly
32
+ in pure Python (RFC 3533 + RFC 7845) rather than pulling in pyogg or
33
+ similar. The format is well-specified and our inputs are fixed (single
34
+ stream, mono, 16 kHz, 60 ms Opus frames), so a 200-line implementation
35
+ keeps the dependency surface unchanged from the existing ``[stt]`` /
36
+ ``[tts]`` extras (just ``opuslib`` for codec, no container library).
37
+ """
38
+
39
+ from __future__ import annotations
40
+
41
+ import logging
42
+ import struct
43
+ from typing import Sequence
44
+
45
+ import aiohttp
46
+
47
+ logger = logging.getLogger(__name__)
48
+
49
+
50
+ # --- Device audio parameters -------------------------------------------------
51
+
52
+ #: Sample rate the firmware emits (matches the STT pipeline's expectation;
53
+ #: see :data:`stackchan_mcp.stt.audio_utils.DEVICE_SAMPLE_RATE`).
54
+ DEVICE_SAMPLE_RATE = 16000
55
+
56
+ #: Frame duration the firmware emits. 60 ms is the xiaozhi-esp32 default;
57
+ #: each WebSocket binary message carries exactly one Opus frame.
58
+ DEVICE_FRAME_DURATION_MS = 60
59
+
60
+ #: Number of audio samples per Opus frame, at the device sample rate.
61
+ SAMPLES_PER_FRAME = DEVICE_SAMPLE_RATE * DEVICE_FRAME_DURATION_MS // 1000 # 960
62
+
63
+ #: Opus granule positions are always expressed in 48 kHz samples, even when
64
+ #: the underlying stream is 16 kHz mono (RFC 7845 §4). So one 60 ms frame
65
+ #: advances the granule by 48000 * 60/1000 = 2880.
66
+ GRANULE_PER_FRAME = 48000 * DEVICE_FRAME_DURATION_MS // 1000 # 2880
67
+
68
+
69
+ # --- Ogg/Opus container ------------------------------------------------------
70
+ #
71
+ # Ogg page layout (RFC 3533 §6):
72
+ # 0..3 "OggS"
73
+ # 4 stream_structure_version (0)
74
+ # 5 header_type_flag (0x02 BOS, 0x04 EOS, 0x01 continued; can OR)
75
+ # 6..13 granule_position (int64 LE)
76
+ # 14..17 bitstream_serial_number (uint32 LE)
77
+ # 18..21 page_sequence_number (uint32 LE)
78
+ # 22..25 CRC32 (zeroed during calculation, then patched)
79
+ # 26 number_of_page_segments (1..255)
80
+ # 27.. segment_table (one byte per segment, 0..255 each)
81
+ # .. segment data (concatenated)
82
+ #
83
+ # CRC32 polynomial: 0x04C11DB7, MSB-first, no initial value, no final XOR.
84
+ # This differs from zlib.crc32; we precompute a table.
85
+
86
+ _OGG_MAGIC = b"OggS"
87
+ _OPUS_HEAD_MAGIC = b"OpusHead"
88
+ _OPUS_TAGS_MAGIC = b"OpusTags"
89
+
90
+ _HEADER_BOS = 0x02
91
+ _HEADER_EOS = 0x04
92
+ _HEADER_CONTINUED = 0x01
93
+
94
+
95
def _build_ogg_crc_table() -> list[int]:
    """Return the 256-entry lookup table for Ogg's page checksum.

    The table is built for polynomial 0x04C11DB7 processed MSB-first,
    with no bit reflection and no final XOR. ``zlib.crc32`` shares the
    polynomial but reflects its input/output and XORs the result with
    0xFFFFFFFF, which is why it cannot stand in for this table.
    """
    polynomial = 0x04C11DB7
    mask = 0xFFFFFFFF

    def _entry(index: int) -> int:
        # Place the byte in the top 8 bits, then run eight shift/XOR steps.
        value = index << 24
        for _ in range(8):
            shifted = (value << 1) & mask
            value = shifted ^ polynomial if value & 0x80000000 else shifted
        return value

    return [_entry(index) for index in range(256)]
114
+
115
+
116
+ _OGG_CRC_TABLE = _build_ogg_crc_table()
117
+
118
+
119
def _ogg_crc32(data: bytes) -> int:
    """Return Ogg's 32-bit checksum of ``data``.

    Table-driven and MSB-first: the running value starts at 0 and is
    returned as-is (no final XOR), always within 32 bits.
    """
    table = _OGG_CRC_TABLE
    checksum = 0
    for octet in data:
        # The top byte of the running value selects the table entry.
        index = ((checksum >> 24) ^ octet) & 0xFF
        checksum = ((checksum << 8) & 0xFFFFFFFF) ^ table[index]
    return checksum
125
+
126
+
127
def _packet_to_segments(packet: bytes) -> list[bytes]:
    """Cut one Ogg packet into lacing segments of at most 255 bytes.

    The packet is sliced into consecutive 255-byte runs with the
    remainder last. When the packet length is an exact multiple of 255
    a trailing zero-length segment is added so a reader can tell the
    packet ended rather than continuing (RFC 3533 §6). An empty packet
    is returned as a single zero-length segment.
    """
    if not packet:
        return [b""]

    pieces = [
        packet[offset:offset + 255]
        for offset in range(0, len(packet), 255)
    ]
    # A full final piece means the packet ended exactly on a boundary,
    # which needs the explicit zero-length terminator.
    if len(pieces[-1]) == 255:
        pieces.append(b"")
    return pieces
152
+
153
+
154
def _build_ogg_page(
    *,
    header_type: int,
    granule_position: int,
    serial: int,
    page_sequence: int,
    segments: Sequence[bytes],
) -> bytes:
    """Assemble one Ogg page from a list of lacing segments.

    The caller is responsible for splitting packets into segments of at
    most 255 bytes and for adding the zero-length terminating segment
    when a packet ends exactly on a 255-byte boundary (RFC 3533 §6);
    this function only validates and serialises what it is given.

    Args:
        header_type: Ogg header_type_flag byte (BOS / EOS / continued
            bits, OR-ed together as needed).
        granule_position: Signed 64-bit granule position for the page.
        serial: Bitstream serial number (unsigned 32-bit).
        page_sequence: Page sequence number (unsigned 32-bit).
        segments: Between 1 and 255 segments, each at most 255 bytes.

    Returns:
        The serialised page: 27-byte fixed header with the checksum
        patched in at offset 22, the segment table, then the segment
        data.

    Raises:
        ValueError: If the segment count is outside 1..255 or any
            segment is longer than 255 bytes.
    """
    if not 1 <= len(segments) <= 255:
        raise ValueError(
            f"Ogg page must have 1..255 segments, got {len(segments)}"
        )
    # Validate lengths BEFORE building the lacing table: bytes() rejects
    # values above 255 with a generic message, which would otherwise
    # pre-empt the actionable error below.
    for seg in segments:
        if len(seg) > 255:
            raise ValueError(
                f"Ogg segment exceeds 255 bytes ({len(seg)}); "
                "split into multiple segments before calling _build_ogg_page"
            )
    segment_table = bytes(len(seg) for seg in segments)

    body = b"".join(segments)
    header = struct.pack(
        "<4sBBqII",
        _OGG_MAGIC,
        0,  # stream_structure_version
        header_type,
        granule_position,
        serial,
        page_sequence,
    )
    # The checksum is computed over the whole page with its own 4-byte
    # field zeroed, so start from a placeholder and patch it afterwards.
    header_with_crc_placeholder = (
        header + b"\x00\x00\x00\x00" + bytes([len(segments)]) + segment_table
    )
    full_page = header_with_crc_placeholder + body
    crc = _ogg_crc32(full_page)
    # The checksum field occupies bytes 22..25 of the page.
    return full_page[:22] + struct.pack("<I", crc) + full_page[26:]
201
+
202
+
203
def _build_opus_head_packet(
    *,
    channels: int = 1,
    pre_skip: int = 0,
    input_sample_rate: int = DEVICE_SAMPLE_RATE,
) -> bytes:
    """Build the OpusHead identification header packet (RFC 7845 §5.1).

    The fields are packed little-endian in header order; the output gain
    and channel mapping family are fixed at 0.
    """
    fields = (
        _OPUS_HEAD_MAGIC,   # 8-byte magic signature
        1,                  # version
        channels,           # channel count
        pre_skip,           # samples to discard at start of decode
        input_sample_rate,  # informational only; decoder always runs at 48 kHz
        0,                  # output_gain (Q7.8 fixed-point dB), 0 = unchanged
        0,                  # channel_mapping_family: 0 = mono/stereo
    )
    return struct.pack("<8sBBHIhB", *fields)
220
+
221
+
222
def _build_opus_tags_packet(vendor: bytes = b"stackchan-mcp") -> bytes:
    """Build the OpusTags comment header packet (RFC 7845 §5.2).

    Layout: magic signature, little-endian 32-bit vendor length, the
    vendor string, then a 32-bit user comment count of 0 (no comments).
    """
    vendor_length = struct.pack("<I", len(vendor))
    comment_count = struct.pack("<I", 0)  # empty comment list
    return b"".join((_OPUS_TAGS_MAGIC, vendor_length, vendor, comment_count))
230
+
231
+
232
+ # How many Opus frames to pack into a single audio page. The Ogg spec
233
+ # allows up to 255 segments per page (and our frames fit in one segment
234
+ # each at this bitrate). Smaller pages give finer-grained recovery on
235
+ # corruption but waste header bytes; 50 is a comfortable middle
236
+ # (~3 seconds of audio per page).
237
+ _FRAMES_PER_PAGE = 50
238
+
239
+
240
def pack_opus_frames_to_ogg(
    frames: Sequence[bytes],
    *,
    serial: int = 1,
    channels: int = 1,
    pre_skip: int = 0,
) -> bytes:
    """Pack raw Opus frames into a complete Ogg/Opus stream.

    Args:
        frames: One raw Opus packet per element. Empty input yields an
            empty bytes object so the caller can short-circuit
            "no audio" without raising.
        serial: Ogg bitstream serial number written into every page.
        channels: Channel count written into the OpusHead header.
        pre_skip: Samples to drop at the start of decoded output, in
            48 kHz units (RFC 7845 §5.1), written into OpusHead.

    Returns:
        The full Ogg/Opus stream: a BOS page with OpusHead, a page with
        OpusTags, then one or more audio pages, the last marked EOS.

    Raises:
        ValueError: If a single packet needs more than 255 lacing
            segments (it could not fit in one page), or if page
            assembly rejects its input.
    """
    if not frames:
        return b""

    out = bytearray()
    page_seq = 0

    # Page 0: BOS with OpusHead.
    out += _build_ogg_page(
        header_type=_HEADER_BOS,
        granule_position=0,
        serial=serial,
        page_sequence=page_seq,
        segments=[_build_opus_head_packet(
            channels=channels,
            pre_skip=pre_skip,
            input_sample_rate=DEVICE_SAMPLE_RATE,
        )],
    )
    page_seq += 1

    # Page 1: OpusTags, emitted before any audio data.
    out += _build_ogg_page(
        header_type=0,
        granule_position=0,
        serial=serial,
        page_sequence=page_seq,
        segments=[_build_opus_tags_packet()],
    )
    page_seq += 1

    # Audio pages: up to ``_FRAMES_PER_PAGE`` frames per page; the page
    # that finishes the final batch is marked EOS regardless of fill.
    total_frames = len(frames)
    # Number of packets fully written so far. A page's granule position
    # must describe the last packet completed ON THAT PAGE, so it is
    # derived from this running count at the moment each page is emitted
    # rather than from the size of the whole batch.
    frames_completed = 0
    for start in range(0, total_frames, _FRAMES_PER_PAGE):
        end = min(start + _FRAMES_PER_PAGE, total_frames)
        is_last_page = end == total_frames
        segments: list[bytes] = []
        for frame in frames[start:end]:
            # Packets over 255 bytes occupy several lacing segments, so a
            # batch can need more than the 255 segments one page allows.
            frame_segs = _packet_to_segments(frame)
            if len(frame_segs) > 255:
                # Would have to span pages via the "continued" flag,
                # which this packer does not implement.
                raise ValueError(
                    f"Opus packet of {len(frame)} bytes needs "
                    f"{len(frame_segs)} lacing segments; packets spanning "
                    "multiple Ogg pages are not supported"
                )
            if len(segments) + len(frame_segs) > 255:
                # Flush early so the segment table stays within limits.
                # header_type 0: the next page starts with a fresh
                # packet, so the "continued" flag must NOT be set.
                out += _build_ogg_page(
                    header_type=0,
                    granule_position=frames_completed * GRANULE_PER_FRAME,
                    serial=serial,
                    page_sequence=page_seq,
                    segments=segments,
                )
                page_seq += 1
                segments = []
            segments.extend(frame_segs)
            frames_completed += 1
        # Every batch holds at least one frame and every frame yields at
        # least one segment, so there is always something left to emit.
        out += _build_ogg_page(
            header_type=_HEADER_EOS if is_last_page else 0,
            granule_position=frames_completed * GRANULE_PER_FRAME,
            serial=serial,
            page_sequence=page_seq,
            segments=segments,
        )
        page_seq += 1

    return bytes(out)
342
+
343
+
344
+ # --- HTTP push --------------------------------------------------------------
345
+
346
+
347
async def push_audio_capture(
    hook_url: str,
    token: str,
    frames: Sequence[bytes],
    *,
    session_id: str = "",
    timeout_s: float = 10.0,
) -> bool:
    """POST a device-driven listen capture to the configured hook URL.

    The frames are packed into an Ogg/Opus payload and sent as
    ``audio/ogg`` in a single request.

    Args:
        hook_url: Receiver URL (typically the SAIVerse-side
            ``audio_input_relay`` endpoint). Must be set.
        token: Bearer token for ``Authorization: Bearer <token>``.
            An empty string omits the header entirely.
        frames: Raw Opus packets from the device for this listen window.
        session_id: ESP32 connection session ID, forwarded to the
            receiver via the ``X-StackChan-Session`` header.
        timeout_s: Total HTTP timeout in seconds for the request.

    Returns:
        ``True`` if the POST returned 2xx, ``False`` otherwise
        (including no frames, Ogg pack failure, timeout, or network
        error). Failures are logged at WARNING (the no-frames case at
        DEBUG); callers do not need to log again.
    """
    # Local import: only needed to recognise the timeout exception below.
    import asyncio

    if not frames:
        logger.debug(
            "audio_input_hook: skipping push, no frames (session=%s)", session_id
        )
        return False

    try:
        ogg_payload = pack_opus_frames_to_ogg(frames)
    except Exception as exc:
        # Best-effort path: a malformed capture must not take down the
        # caller, so any packing error is reported and swallowed.
        logger.warning(
            "audio_input_hook: Ogg pack failed for %d frames (session=%s): %s",
            len(frames), session_id, exc,
        )
        return False

    headers = {
        "Content-Type": "audio/ogg",
        "X-StackChan-Session": session_id,
    }
    if token:
        headers["Authorization"] = f"Bearer {token}"

    try:
        async with aiohttp.ClientSession() as session:
            async with session.post(
                hook_url,
                data=ogg_payload,
                headers=headers,
                timeout=aiohttp.ClientTimeout(total=timeout_s),
            ) as response:
                if 200 <= response.status < 300:
                    logger.info(
                        "audio_input_hook: pushed %d frames (%d bytes) "
                        "session=%s status=%d",
                        len(frames), len(ogg_payload), session_id,
                        response.status,
                    )
                    return True
                # Keep only a short prefix of the body for diagnostics.
                body_snippet = (await response.text())[:200]
                logger.warning(
                    "audio_input_hook: POST returned status=%d session=%s "
                    "body=%r",
                    response.status, session_id, body_snippet,
                )
                return False
    except aiohttp.ClientError as exc:
        logger.warning(
            "audio_input_hook: POST failed (network error) session=%s: %s",
            session_id, exc,
        )
        return False
    except asyncio.TimeoutError:
        # A total-timeout expiry is not an aiohttp.ClientError and its
        # str() is empty, so it gets a dedicated, self-explanatory message
        # instead of falling through to the "unexpected" branch.
        logger.warning(
            "audio_input_hook: POST timed out after %.1fs session=%s",
            timeout_s, session_id,
        )
        return False
    except Exception as exc:
        logger.warning(
            "audio_input_hook: POST failed (unexpected) session=%s: %s",
            session_id, exc,
        )
        return False