stackchan-mcp 0.9.1__py3-none-win_amd64.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- stackchan_mcp/__init__.py +81 -0
- stackchan_mcp/__main__.py +12 -0
- stackchan_mcp/_libs/SOURCES.md +130 -0
- stackchan_mcp/_libs/opus.dll +0 -0
- stackchan_mcp/audio_input_hook.py +432 -0
- stackchan_mcp/audio_stream.py +162 -0
- stackchan_mcp/capture_server.py +469 -0
- stackchan_mcp/cli.py +958 -0
- stackchan_mcp/esp32_client.py +983 -0
- stackchan_mcp/event_log.py +189 -0
- stackchan_mcp/gateway.py +274 -0
- stackchan_mcp/handlers/__init__.py +7 -0
- stackchan_mcp/handlers/audio.py +21 -0
- stackchan_mcp/handlers/camera.py +25 -0
- stackchan_mcp/handlers/robot.py +52 -0
- stackchan_mcp/http_server.py +398 -0
- stackchan_mcp/mcp_router.py +126 -0
- stackchan_mcp/mdns_advertiser.py +347 -0
- stackchan_mcp/notify.example.yml +21 -0
- stackchan_mcp/notify_config.py +235 -0
- stackchan_mcp/ownership.py +270 -0
- stackchan_mcp/protocol.py +95 -0
- stackchan_mcp/queue.py +191 -0
- stackchan_mcp/server.py +28 -0
- stackchan_mcp/stdio_server.py +1365 -0
- stackchan_mcp/stt/__init__.py +62 -0
- stackchan_mcp/stt/audio_utils.py +102 -0
- stackchan_mcp/stt/base.py +94 -0
- stackchan_mcp/stt/faster_whisper.py +217 -0
- stackchan_mcp/stt/openai_whisper.py +177 -0
- stackchan_mcp/stt/orchestrator.py +568 -0
- stackchan_mcp/tools.py +82 -0
- stackchan_mcp/tts/__init__.py +62 -0
- stackchan_mcp/tts/audio_utils.py +177 -0
- stackchan_mcp/tts/base.py +86 -0
- stackchan_mcp/tts/orchestrator.py +688 -0
- stackchan_mcp/tts/voicevox.py +184 -0
- stackchan_mcp-0.9.1.dist-info/METADATA +324 -0
- stackchan_mcp-0.9.1.dist-info/RECORD +43 -0
- stackchan_mcp-0.9.1.dist-info/WHEEL +5 -0
- stackchan_mcp-0.9.1.dist-info/entry_points.txt +2 -0
- stackchan_mcp-0.9.1.dist-info/licenses/LICENSE +39 -0
- stackchan_mcp-0.9.1.dist-info/licenses/LICENSE-THIRD-PARTY +65 -0
|
@@ -0,0 +1,81 @@
|
|
|
1
|
+
"""stackchan-mcp: Two-faced gateway for StackChan (xiaozhi-esp32).
|
|
2
|
+
|
|
3
|
+
MCP client side: stdio MCP server (mcp Python SDK)
|
|
4
|
+
ESP32 side: WebSocket server (MCP client over JSON-RPC 2.0)
|
|
5
|
+
"""
|
|
6
|
+
|
|
7
|
+
import os as _os
|
|
8
|
+
import platform as _platform
|
|
9
|
+
import sys as _sys
|
|
10
|
+
from importlib.metadata import PackageNotFoundError, version
|
|
11
|
+
from pathlib import Path as _Path
|
|
12
|
+
|
|
13
|
+
# Resolve the installed distribution's version. Start from the placeholder
# used for a source checkout that was never installed, then overwrite it
# when distribution metadata is available.
__version__ = "0.0.0+unknown"
try:
    __version__ = version("stackchan-mcp")
except PackageNotFoundError:  # pragma: no cover - source checkout without install
    pass
|
|
17
|
+
|
|
18
|
+
# Windows: register the bundled native libs directory with the DLL
|
|
19
|
+
# search path before any submodule pulls in `opuslib` (or any other
|
|
20
|
+
# wrapper that calls `ctypes.util.find_library`). On Linux/macOS the
|
|
21
|
+
# system package manager typically already provides libopus, so we do
|
|
22
|
+
# nothing on those platforms.
|
|
23
|
+
#
|
|
24
|
+
# Why this is here and not in tts/__init__.py or stt/__init__.py:
|
|
25
|
+
# opuslib's libopus lookup happens at import time (the wrapper's
|
|
26
|
+
# top-level module unconditionally calls `find_library('opus')` and
|
|
27
|
+
# raises if it returns None). That means we need the DLL search path
|
|
28
|
+
# update to have run before *any* code imports opuslib, no matter
|
|
29
|
+
# which subpackage of stackchan_mcp loads first. The package
|
|
30
|
+
# `__init__.py` is the only place guaranteed to run before all
|
|
31
|
+
# sibling submodules.
|
|
32
|
+
#
|
|
33
|
+
# Why we update BOTH `os.add_dll_directory()` AND `os.environ["PATH"]`:
|
|
34
|
+
# - `os.add_dll_directory()` is the modern, isolated mechanism used by
|
|
35
|
+
# `LoadLibraryEx(..., LOAD_LIBRARY_SEARCH_USER_DIRS)`. Importantly,
|
|
36
|
+
# `ctypes.util.find_library()` on Windows uses the legacy
|
|
37
|
+
# `LoadLibraryW()` path which does **not** consult the
|
|
38
|
+
# `add_dll_directory()` list (see CPython issue #43603). Since
|
|
39
|
+
# `opuslib/api/__init__.py` calls exactly that — `find_library('opus')`
|
|
40
|
+
# — we also have to prepend the directory to PATH so the legacy
|
|
41
|
+
# resolver picks it up.
|
|
42
|
+
# - We add to `add_dll_directory()` too because direct `ctypes.CDLL(...)`
|
|
43
|
+
# / extension-module imports use the modern resolver, and we want
|
|
44
|
+
# bundle discovery to work for both API styles future-proof.
|
|
45
|
+
#
|
|
46
|
+
# See `stackchan_mcp/_libs/SOURCES.md` for the bundled DLL provenance.
|
|
47
|
+
# Architecture gate: the bundled `opus.dll` is built for `win_amd64`
|
|
48
|
+
# (x86_64). On Windows ARM64 / Windows x86 (32-bit), loading the x64
|
|
49
|
+
# DLL would fail with a native-image mismatch — *exactly* the
|
|
50
|
+
# "looks installed but fails at runtime" footgun this bundling is
|
|
51
|
+
# meant to remove. Skip the DLL search-path setup on those
|
|
52
|
+
# architectures so the user falls back to the same
|
|
53
|
+
# "find_library returns None" failure mode they had before this
|
|
54
|
+
# fix, which at least produces a clean ImportError on
|
|
55
|
+
# `import opuslib` rather than a confusing crash inside the DLL
|
|
56
|
+
# loader. A platform-specific wheel build would have rejected those
|
|
57
|
+
# architectures at install time (no compatible wheel), so this
|
|
58
|
+
# guard mostly matters for users who bypass wheel selection (e.g.
|
|
59
|
+
# by installing from sdist on a non-x64 Windows host).
|
|
60
|
+
# Handle returned by ``os.add_dll_directory``; stays ``None`` when no
# bundled directory was registered (non-Windows, non-x64, or no ``_libs``).
_dll_dir_handle = None

# Upper-cased machine name, only computed on Windows. It is left empty
# everywhere else, which also makes the architecture test below fail there.
_machine = ""
if _sys.platform == "win32":
    _machine = _platform.machine().upper()

if _machine in ("AMD64", "X86_64"):
    _libs_dir = _Path(__file__).resolve().parent / "_libs"
    if _libs_dir.is_dir():
        _libs_str = str(_libs_dir)
        # Modern resolver: register the bundle directory and keep the
        # returned handle referenced from the module so the registration
        # is intended to last for the lifetime of the process.
        _dll_dir_handle = _os.add_dll_directory(_libs_str)
        # Legacy resolver: prepend the same directory to PATH, unless an
        # identical entry is already present.
        _existing_path = _os.environ.get("PATH", "")
        if _libs_str not in _existing_path.split(_os.pathsep):
            _os.environ["PATH"] = _os.pathsep.join((_libs_str, _existing_path))
|
|
@@ -0,0 +1,12 @@
|
|
|
1
|
+
"""Entry point: ``python -m stackchan_mcp``.
|
|
2
|
+
|
|
3
|
+
The actual implementation lives in :mod:`stackchan_mcp.cli` so that the
|
|
4
|
+
console script and ``python -m`` paths share a single side-effect-free
|
|
5
|
+
import surface.
|
|
6
|
+
"""
|
|
7
|
+
|
|
8
|
+
from .cli import main
|
|
9
|
+
|
|
10
|
+
|
|
11
|
+
# Invoke the shared CLI entry point only when this module is executed as a
# script (``python -m stackchan_mcp``); a plain import does not call it.
if __name__ == "__main__":
    main()
|
|
@@ -0,0 +1,130 @@
|
|
|
1
|
+
# Bundled Native Libraries
|
|
2
|
+
|
|
3
|
+
This directory contains pre-built native shared libraries that the
|
|
4
|
+
gateway needs on platforms where the system package manager does not
|
|
5
|
+
typically ship them. They are loaded at import time by
|
|
6
|
+
`stackchan_mcp/__init__.py` via `os.add_dll_directory()` plus a `PATH` prepend (Windows) so
|
|
7
|
+
that `ctypes.util.find_library()` calls inside Python wrapper packages
|
|
8
|
+
(e.g. `opuslib`) resolve to the bundled copy without any user setup.
|
|
9
|
+
|
|
10
|
+
## Why bundle?
|
|
11
|
+
|
|
12
|
+
The Python wrapper packages that depend on these libraries (currently
|
|
13
|
+
`opuslib`, pulled in via the `[tts]` and `[stt]` extras) only ship
|
|
14
|
+
Python bindings — they do **not** ship the underlying native library.
|
|
15
|
+
On Linux and macOS most users already have `libopus` available through
|
|
16
|
+
their distro's package manager (`apt install libopus0`,
|
|
17
|
+
`brew install opus`, etc.), but on Windows there is no equivalent
|
|
18
|
+
default install path, which means a plain `pip install stackchan-mcp[tts]`
|
|
19
|
+
fails at runtime with `Could not find Opus library. Make sure it is
|
|
20
|
+
installed.` even though the Python wrappers installed cleanly.
|
|
21
|
+
|
|
22
|
+
Bundling the Windows binary in the wheel removes that footgun: every
|
|
23
|
+
Windows user who installs `stackchan-mcp[tts]` (or `[stt]`) gets a
|
|
24
|
+
working installation on the first try, with no extra `vcpkg` /
|
|
25
|
+
`conda install -c conda-forge libopus` / manual DLL placement step.
|
|
26
|
+
|
|
27
|
+
The decision to bundle (vs. download at install time vs. require source
|
|
28
|
+
build) was made on these criteria:
|
|
29
|
+
|
|
30
|
+
| Criterion | Verdict for libopus |
|
|
31
|
+
|---|---|
|
|
32
|
+
| Maturity of the dependency | Mature (Opus is a frozen IETF codec, RFC 6716, 2012) |
|
|
33
|
+
| Frequency of security advisories | Very low (the codec parser is small and well-audited) |
|
|
34
|
+
| File size | ~480 KB — fits comfortably in the wheel |
|
|
35
|
+
| Re-distribution license | BSD 3-clause (Xiph) — redistribution allowed with attribution |
|
|
36
|
+
| Long-term availability of upstream | Excellent (Xiph.Org maintains the source indefinitely) |
|
|
37
|
+
|
|
38
|
+
If any of those change (e.g. a future ML-based bundle that ships
|
|
39
|
+
hundreds of MB), revisit and consider the "CI downloads a pinned
|
|
40
|
+
version at build time" approach instead.
|
|
41
|
+
|
|
42
|
+
## opus.dll
|
|
43
|
+
|
|
44
|
+
| Field | Value |
|
|
45
|
+
|---|---|
|
|
46
|
+
| Architecture | x86_64 (`win_amd64`) |
|
|
47
|
+
| License | BSD 3-clause + Xiph extension — see <https://opus-codec.org/license/> |
|
|
48
|
+
| Provenance | Built from upstream Opus source by the publish workflow via vcpkg |
|
|
49
|
+
| Build command | `vcpkg install opus:x64-windows` (CI runner: `windows-latest`) |
|
|
50
|
+
|
|
51
|
+
### Provenance note
|
|
52
|
+
|
|
53
|
+
`opus.dll` is **not** tracked in git. The publish workflow
|
|
54
|
+
(`.github/workflows/publish.yml`, job `build-windows-wheel`)
|
|
55
|
+
bootstraps a fresh vcpkg checkout on a `windows-latest` runner,
|
|
56
|
+
runs `vcpkg install opus:x64-windows`, copies the produced
|
|
57
|
+
`opus.dll` into `stackchan_mcp/_libs/`, and logs its SHA256 to the
|
|
58
|
+
job log so reviewers can spot vcpkg-side binary drift before a tag
|
|
59
|
+
publishes. The wheel build that follows picks the DLL up via
|
|
60
|
+
`tool.hatch.build.targets.wheel.artifacts` in
|
|
61
|
+
`gateway/pyproject.toml`, and the resulting wheel is renamed from
|
|
62
|
+
`*-py3-none-any.whl` to `*-py3-none-win_amd64.whl` so pip resolves
|
|
63
|
+
it only on Windows x64 installs.
|
|
64
|
+
|
|
65
|
+
Builds on the Ubuntu runner (sdist + the `py3-none-any` wheel they
|
|
66
|
+
produce) do not place a DLL under `_libs/`, so those distributions
|
|
67
|
+
ship clean — non-Windows installs and non-x64 Windows installs
|
|
68
|
+
either fall back to a system `libopus` (Linux/macOS) or get a
|
|
69
|
+
clean "no compatible wheel" install-time message (Windows ARM64 /
|
|
70
|
+
x86 32-bit).
|
|
71
|
+
|
|
72
|
+
### Local development
|
|
73
|
+
|
|
74
|
+
If you need a local Windows checkout to test the bundling path
|
|
75
|
+
(running `uv build` outside CI), mirror the CI step by:
|
|
76
|
+
|
|
77
|
+
1. Installing libopus via vcpkg (`vcpkg install opus:x64-windows`)
|
|
78
|
+
and copying the produced DLL into `stackchan_mcp/_libs/opus.dll`.
|
|
79
|
+
2. Or downloading the same `opus.dll` from a release artifact
|
|
80
|
+
uploaded by the publish workflow.
|
|
81
|
+
3. Or installing system libopus and copying it into the directory.
|
|
82
|
+
|
|
83
|
+
The DLL is gitignored (see `gateway/.gitignore`) so a local copy
|
|
84
|
+
never sneaks into a commit.
|
|
85
|
+
|
|
86
|
+
## License compliance
|
|
87
|
+
|
|
88
|
+
The Opus codec is distributed under the 3-clause BSD license (with the
|
|
89
|
+
optional Xiph patent grant), which permits redistribution in source or
|
|
90
|
+
binary form provided the copyright notice and license text are
|
|
91
|
+
preserved. The canonical notice ships at the top of every gateway
|
|
92
|
+
distribution as `LICENSE-THIRD-PARTY` (declared in
|
|
93
|
+
`gateway/pyproject.toml`'s `license-files`); the same text is
|
|
94
|
+
reproduced below for the convenience of readers of
|
|
95
|
+
this document.
|
|
96
|
+
|
|
97
|
+
```
|
|
98
|
+
Copyright 2001-2023 Xiph.Org, Skype Limited, Octasic,
|
|
99
|
+
Jean-Marc Valin, Timothy B. Terriberry,
|
|
100
|
+
CSIRO, Gregory Maxwell, Mark Borgerding,
|
|
101
|
+
Erik de Castro Lopo
|
|
102
|
+
|
|
103
|
+
Redistribution and use in source and binary forms, with or without
|
|
104
|
+
modification, are permitted provided that the following conditions
|
|
105
|
+
are met:
|
|
106
|
+
|
|
107
|
+
- Redistributions of source code must retain the above copyright
|
|
108
|
+
notice, this list of conditions and the following disclaimer.
|
|
109
|
+
|
|
110
|
+
- Redistributions in binary form must reproduce the above copyright
|
|
111
|
+
notice, this list of conditions and the following disclaimer in the
|
|
112
|
+
documentation and/or other materials provided with the distribution.
|
|
113
|
+
|
|
114
|
+
- Neither the name of Internet Society, IETF or IETF Trust, nor the
|
|
115
|
+
names of specific contributors, may be used to endorse or promote
|
|
116
|
+
products derived from this software without specific prior written
|
|
117
|
+
permission.
|
|
118
|
+
|
|
119
|
+
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
|
|
120
|
+
"AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
|
|
121
|
+
LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
|
|
122
|
+
A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
|
|
123
|
+
OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
|
|
124
|
+
SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
|
|
125
|
+
LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
|
|
126
|
+
DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
|
|
127
|
+
THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
|
|
128
|
+
(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
|
|
129
|
+
OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
|
130
|
+
```
|
|
Binary file
|
|
@@ -0,0 +1,432 @@
|
|
|
1
|
+
"""Device-driven listen audio forwarding to an external HTTP hook.
|
|
2
|
+
|
|
3
|
+
When the ESP32 device autonomously enters listening mode — wake word
|
|
4
|
+
detection (``WakeWordInvoke``), button press, or LCD touch
|
|
5
|
+
(``ToggleChatState``) — the gateway's MCP-driven STT pipeline is not
|
|
6
|
+
running because there is no concurrent ``listen()`` tool call to open a
|
|
7
|
+
recording slot. The inbound Opus frames are therefore discarded today
|
|
8
|
+
(see :mod:`stackchan_mcp.audio_stream` module docstring).
|
|
9
|
+
|
|
10
|
+
This module fills that gap. When configured with a hook URL, the
|
|
11
|
+
gateway opens a recording slot on inbound ``{"type":"listen",
|
|
12
|
+
"state":"start"}`` messages from the device, buffers the Opus frames
|
|
13
|
+
into the same module-level slot used by the MCP-driven path, packs them
|
|
14
|
+
into an Ogg/Opus container on ``{"state":"stop"}``, and POSTs the
|
|
15
|
+
payload to the hook.
|
|
16
|
+
|
|
17
|
+
Configuration:
|
|
18
|
+
|
|
19
|
+
- ``STACKCHAN_AUDIO_HOOK_URL`` — HTTP(S) URL of the receiver. The
|
|
20
|
+
device-driven capture path is silently disabled when unset.
|
|
21
|
+
- ``STACKCHAN_AUDIO_HOOK_TOKEN`` — Bearer token; falls back to
|
|
22
|
+
``STACKCHAN_TOKEN`` so a single-token setup works without extra
|
|
23
|
+
configuration.
|
|
24
|
+
|
|
25
|
+
The capture path is opt-in by design: stackchan-mcp's primary listen
|
|
26
|
+
model is MCP-client-driven (the ``listen()`` tool), and device-driven
|
|
27
|
+
capture only makes sense when an external service is set up to receive
|
|
28
|
+
the audio. Leaving the hook URL unset keeps the gateway's behaviour
|
|
29
|
+
unchanged from server-driven listen.
|
|
30
|
+
|
|
31
|
+
Ogg container implementation note: we assemble the container directly
|
|
32
|
+
in pure Python (RFC 3533 + RFC 7845) rather than pulling in pyogg or
|
|
33
|
+
similar. The format is well-specified and our inputs are fixed (single
|
|
34
|
+
stream, mono, 16 kHz, 60 ms Opus frames), so a 200-line implementation
|
|
35
|
+
keeps the dependency surface unchanged from the existing ``[stt]`` /
|
|
36
|
+
``[tts]`` extras (just ``opuslib`` for codec, no container library).
|
|
37
|
+
"""
|
|
38
|
+
|
|
39
|
+
from __future__ import annotations
|
|
40
|
+
|
|
41
|
+
import logging
|
|
42
|
+
import struct
|
|
43
|
+
from typing import Sequence
|
|
44
|
+
|
|
45
|
+
import aiohttp
|
|
46
|
+
|
|
47
|
+
logger = logging.getLogger(__name__)
|
|
48
|
+
|
|
49
|
+
|
|
50
|
+
# --- Device audio parameters -------------------------------------------------
|
|
51
|
+
|
|
52
|
+
#: Sample rate the firmware emits, in Hz (matches the STT pipeline's
#: expectation; see :data:`stackchan_mcp.stt.audio_utils.DEVICE_SAMPLE_RATE`).
DEVICE_SAMPLE_RATE = 16000

#: Frame duration the firmware emits, in milliseconds. 60 ms is the
#: xiaozhi-esp32 default; each WebSocket binary message carries exactly one
#: Opus frame.
DEVICE_FRAME_DURATION_MS = 60

#: Number of audio samples per Opus frame, at the device sample rate.
SAMPLES_PER_FRAME = DEVICE_SAMPLE_RATE * DEVICE_FRAME_DURATION_MS // 1000  # 960

#: Opus granule positions are always expressed in 48 kHz samples, even when
#: the underlying stream is 16 kHz mono (RFC 7845 §4). So one 60 ms frame
#: advances the granule by 48000 * 60/1000 = 2880.
GRANULE_PER_FRAME = 48000 * DEVICE_FRAME_DURATION_MS // 1000  # 2880
|
|
67
|
+
|
|
68
|
+
|
|
69
|
+
# --- Ogg/Opus container ------------------------------------------------------
|
|
70
|
+
#
|
|
71
|
+
# Ogg page layout (RFC 3533 §6):
|
|
72
|
+
# 0..3 "OggS"
|
|
73
|
+
# 4 stream_structure_version (0)
|
|
74
|
+
# 5 header_type_flag (0x02 BOS, 0x04 EOS, 0x01 continued; can OR)
|
|
75
|
+
# 6..13 granule_position (int64 LE)
|
|
76
|
+
# 14..17 bitstream_serial_number (uint32 LE)
|
|
77
|
+
# 18..21 page_sequence_number (uint32 LE)
|
|
78
|
+
# 22..25 CRC32 (zeroed during calculation, then patched)
|
|
79
|
+
# 26 number_of_page_segments (1..255)
|
|
80
|
+
# 27.. segment_table (one byte per segment, 0..255 each)
|
|
81
|
+
# .. segment data (concatenated)
|
|
82
|
+
#
|
|
83
|
+
# CRC32 polynomial: 0x04C11DB7, MSB-first, no initial value, no final XOR.
|
|
84
|
+
# This differs from zlib.crc32; we precompute a table.
|
|
85
|
+
|
|
86
|
+
# Capture pattern that opens every Ogg page (bytes 0..3 of the page header).
_OGG_MAGIC = b"OggS"
# Magic signatures that open the two Opus header packets
# (identification header and comment header respectively).
_OPUS_HEAD_MAGIC = b"OpusHead"
_OPUS_TAGS_MAGIC = b"OpusTags"

# header_type_flag bits (byte 5 of the page header); they may be OR-ed.
_HEADER_BOS = 0x02  # beginning of stream
_HEADER_EOS = 0x04  # end of stream
_HEADER_CONTINUED = 0x01  # page starts with a continued packet
|
|
93
|
+
|
|
94
|
+
|
|
95
|
+
def _build_ogg_crc_table() -> list[int]:
    """Return the 256-entry lookup table for Ogg's page checksum.

    The checksum is a plain MSB-first 32-bit CRC with generator polynomial
    0x04C11DB7: no bit reflection and no final XOR. ``zlib.crc32`` shares
    the polynomial but reflects bits and XORs the output with 0xFFFFFFFF,
    so it cannot stand in for this table.
    """
    generator = 0x04C11DB7
    top_bit = 0x80000000
    mask = 0xFFFFFFFF
    entries: list[int] = []
    for index in range(256):
        # Place the byte in the most significant position, then run the
        # eight shift/conditional-XOR steps of the bitwise CRC.
        remainder = index << 24
        for _ in range(8):
            shifted = (remainder << 1) & mask
            remainder = shifted ^ generator if remainder & top_bit else shifted
        entries.append(remainder)
    return entries
|
|
114
|
+
|
|
115
|
+
|
|
116
|
+
# Built once at import time so that the per-byte checksum loop is a table lookup.
_OGG_CRC_TABLE = _build_ogg_crc_table()
|
|
117
|
+
|
|
118
|
+
|
|
119
|
+
def _ogg_crc32(data: bytes) -> int:
    """Return the Ogg page checksum of ``data``.

    Table-driven, MSB-first CRC with a zero initial value; the result is
    an unsigned 32-bit integer.
    """
    table = _OGG_CRC_TABLE
    acc = 0
    for octet in data:
        # The top byte of the running value, mixed with the input byte,
        # selects the table entry; the remaining 24 bits shift up.
        index = ((acc >> 24) ^ octet) & 0xFF
        acc = ((acc << 8) & 0xFFFFFFFF) ^ table[index]
    return acc
|
|
125
|
+
|
|
126
|
+
|
|
127
|
+
def _packet_to_segments(packet: bytes) -> list[bytes]:
    """Cut one Ogg packet into lacing segments of at most 255 bytes.

    The packet is sliced into consecutive 255-byte runs followed by the
    shorter remainder. When the length is an exact multiple of 255
    (including zero), a trailing empty segment is added: a final segment
    shorter than 255 bytes is what tells a parser that the packet ends
    here rather than continuing (RFC 3533 §6). Variable-bitrate Opus
    frames can exceed 255 bytes, so the split has to happen before pages
    are assembled.
    """
    size = len(packet)
    pieces = [packet[offset:offset + 255] for offset in range(0, size, 255)]
    if size % 255 == 0:
        # Covers the empty packet too: it becomes a single empty segment.
        pieces.append(b"")
    return pieces
|
|
152
|
+
|
|
153
|
+
|
|
154
|
+
def _build_ogg_page(
    *,
    header_type: int,
    granule_position: int,
    serial: int,
    page_sequence: int,
    segments: Sequence[bytes],
) -> bytes:
    """Assemble one Ogg page from a list of segments (each ≤ 255 bytes).

    The caller is responsible for splitting larger packets into ≤ 255-byte
    segments and for emitting a 0-byte terminating segment when a packet
    ends exactly on a 255-byte boundary (Ogg packetisation rule,
    RFC 3533 §6).

    Args:
        header_type: header_type_flag byte (BOS / EOS / continued bits).
        granule_position: Signed 64-bit granule position for the page.
        serial: Bitstream serial number (unsigned 32-bit).
        page_sequence: Page sequence number (unsigned 32-bit).
        segments: Between 1 and 255 segments of at most 255 bytes each.

    Returns:
        The complete page: 27-byte header, segment table and body, with
        the checksum field filled in.

    Raises:
        ValueError: If the number of segments is outside 1..255 or any
            single segment is longer than 255 bytes.
    """
    if not 1 <= len(segments) <= 255:
        raise ValueError(
            f"Ogg page must have 1..255 segments, got {len(segments)}"
        )
    # Check segment sizes BEFORE building the lacing table: bytes(...)
    # rejects values above 255 with a generic "bytes must be in
    # range(0, 256)" error, which would hide the actionable message below.
    for seg in segments:
        if len(seg) > 255:
            raise ValueError(
                f"Ogg segment exceeds 255 bytes ({len(seg)}); "
                "split into multiple segments before calling _build_ogg_page"
            )
    segment_table = bytes(len(s) for s in segments)

    body = b"".join(segments)
    header = struct.pack(
        "<4sBBqII",
        _OGG_MAGIC,
        0,  # stream_structure_version
        header_type,
        granule_position,
        serial,
        page_sequence,
    )
    # CRC field placeholder (4 bytes of 0x00), then segment count and table.
    # The checksum is computed over the whole page with this field zeroed.
    header_with_crc_placeholder = (
        header + b"\x00\x00\x00\x00" + bytes([len(segments)]) + segment_table
    )
    full_page = header_with_crc_placeholder + body
    crc = _ogg_crc32(full_page)
    # Patch the CRC into bytes 22..25 of the header.
    return full_page[:22] + struct.pack("<I", crc) + full_page[26:]
|
|
201
|
+
|
|
202
|
+
|
|
203
|
+
def _build_opus_head_packet(
    *,
    channels: int = 1,
    pre_skip: int = 0,
    input_sample_rate: int = DEVICE_SAMPLE_RATE,
) -> bytes:
    """Build the OpusHead identification header packet (RFC 7845 §5.1).

    Args:
        channels: Output channel count, stored as one unsigned byte.
        pre_skip: Pre-skip value, stored as an unsigned 16-bit integer.
        input_sample_rate: Original input sample rate in Hz, stored as an
            unsigned 32-bit integer. Informational only; the decoder
            always runs at 48 kHz.

    Returns:
        The little-endian packed 19-byte header packet.
    """
    head_version = 1
    output_gain = 0  # Q7.8 fixed-point dB; 0 leaves the level unchanged
    mapping_family = 0  # family 0 = mono/stereo
    fields = (
        _OPUS_HEAD_MAGIC,
        head_version,
        channels,
        pre_skip,
        input_sample_rate,
        output_gain,
        mapping_family,
    )
    return struct.pack("<8sBBHIhB", *fields)
|
|
220
|
+
|
|
221
|
+
|
|
222
|
+
def _build_opus_tags_packet(vendor: bytes = b"stackchan-mcp") -> bytes:
    """Build the OpusTags comment header packet (RFC 7845 §5.2).

    The packet carries the magic signature, the length-prefixed ``vendor``
    string and a user-comment count of zero (no comments follow).
    """
    vendor_length = struct.pack("<I", len(vendor))
    comment_count = struct.pack("<I", 0)
    return b"".join((_OPUS_TAGS_MAGIC, vendor_length, vendor, comment_count))
|
|
230
|
+
|
|
231
|
+
|
|
232
|
+
# How many Opus frames to pack into a single audio page. The Ogg spec
# allows up to 255 segments per page (and our frames fit in one segment
# each at this bitrate). Smaller pages give finer-grained recovery on
# corruption but waste header bytes; 50 is a comfortable middle
# (50 frames x 60 ms = 3 seconds of audio per page).
_FRAMES_PER_PAGE = 50
|
|
238
|
+
|
|
239
|
+
|
|
240
|
+
def pack_opus_frames_to_ogg(
    frames: Sequence[bytes],
    *,
    serial: int = 1,
    channels: int = 1,
    pre_skip: int = 0,
) -> bytes:
    """Pack raw Opus frames into a complete Ogg/Opus stream.

    Args:
        frames: One raw Opus packet per element, as emitted by the
            xiaozhi-esp32 firmware (one packet per WebSocket binary
            message). Empty input yields an empty bytes object so the
            caller can short-circuit "no audio" without raising.
        serial: Ogg bitstream serial number. Required to be present in
            every page; the value itself is opaque to the decoder,
            but uniqueness matters when multiplexing — we are not, so
            any non-zero value works.
        channels: 1 (mono) or 2 (stereo). The firmware sends mono.
        pre_skip: Samples to drop at the start of decoded output, in
            48 kHz units (RFC 7845 §5.1). 0 is the conservative default;
            a real encoder reports its actual look-ahead here.

    Returns:
        A bytes object containing the full Ogg/Opus stream (BOS page
        with OpusHead, page with OpusTags, one or more audio pages, the
        last marked EOS). Ready to be POSTed as ``audio/ogg``.

    Raises:
        ValueError: If a single packet needs more than 255 lacing values
            and therefore cannot fit on one page (packets spanning pages
            are not supported here).
    """
    if not frames:
        return b""

    out = bytearray()
    page_seq = 0

    # Page 0: BOS page carrying the OpusHead identification header.
    out += _build_ogg_page(
        header_type=_HEADER_BOS,
        granule_position=0,
        serial=serial,
        page_sequence=page_seq,
        segments=[_build_opus_head_packet(
            channels=channels,
            pre_skip=pre_skip,
            input_sample_rate=DEVICE_SAMPLE_RATE,
        )],
    )
    page_seq += 1

    # Page 1: OpusTags. RFC 7845 requires this as the second page,
    # before any audio data.
    out += _build_ogg_page(
        header_type=0,
        granule_position=0,
        serial=serial,
        page_sequence=page_seq,
        segments=[_build_opus_tags_packet()],
    )
    page_seq += 1

    # Audio pages: ``_FRAMES_PER_PAGE`` frames per page until the last
    # page, which is marked EOS regardless of fill.
    total_frames = len(frames)
    # Running granule position in 48 kHz samples. It is advanced one
    # packet at a time so that every emitted page reports the position of
    # the last packet that actually completes on that page (RFC 7845 §4).
    granule = 0
    for start in range(0, total_frames, _FRAMES_PER_PAGE):
        end = min(start + _FRAMES_PER_PAGE, total_frames)
        is_last_page = end == total_frames
        # VBR Opus can produce packets > 255 bytes, which Ogg encodes as
        # several lacing segments, so a batch of frames may need more than
        # the 255 segments one page can hold. Flush mid-batch whenever the
        # segment table is about to overflow.
        segments: list[bytes] = []
        for frame in frames[start:end]:
            frame_segs = _packet_to_segments(frame)
            if len(frame_segs) > 255:
                raise ValueError(
                    f"Opus packet of {len(frame)} bytes needs "
                    f"{len(frame_segs)} Ogg segments; at most 255 fit on a page"
                )
            if segments and len(segments) + len(frame_segs) > 255:
                # Overflow flush: an ordinary audio page holding only whole
                # packets. ``granule`` has not yet been advanced for
                # ``frame``, so it describes exactly the packets flushed.
                out += _build_ogg_page(
                    header_type=0,
                    granule_position=granule,
                    serial=serial,
                    page_sequence=page_seq,
                    segments=segments,
                )
                page_seq += 1
                segments = []
            segments.extend(frame_segs)
            granule += GRANULE_PER_FRAME
        if segments:
            out += _build_ogg_page(
                header_type=_HEADER_EOS if is_last_page else 0,
                granule_position=granule,
                serial=serial,
                page_sequence=page_seq,
                segments=segments,
            )
            page_seq += 1

    return bytes(out)
|
|
342
|
+
|
|
343
|
+
|
|
344
|
+
# --- HTTP push --------------------------------------------------------------
|
|
345
|
+
|
|
346
|
+
|
|
347
|
+
async def push_audio_capture(
    hook_url: str,
    token: str,
    frames: Sequence[bytes],
    *,
    session_id: str = "",
    timeout_s: float = 10.0,
) -> bool:
    """Send one device-driven listen capture to the hook as ``audio/ogg``.

    The frames are packed into an Ogg/Opus stream and POSTed in a single
    request. Every failure path is logged here, so callers only need to
    look at the boolean result.

    Args:
        hook_url: Receiver URL (typically the SAIVerse-side
            ``audio_input_relay`` endpoint). Must be set.
        token: Bearer token sent as ``Authorization: Bearer <token>``.
            An empty string omits the header entirely.
        frames: Raw Opus packets from the device for this listen window.
        session_id: ESP32 connection session ID, sent in the
            ``X-StackChan-Session`` header so the receiver can correlate
            captures.
        timeout_s: Total HTTP timeout in seconds.

    Returns:
        ``True`` when the POST answered with a 2xx status. ``False`` when
        there were no frames, packing failed, the request raised, or the
        status was not 2xx.
    """
    if not frames:
        logger.debug(
            "audio_input_hook: skipping push, no frames (session=%s)", session_id
        )
        return False

    try:
        ogg_payload = pack_opus_frames_to_ogg(frames)
    except Exception as exc:
        logger.warning(
            "audio_input_hook: Ogg pack failed for %d frames (session=%s): %s",
            len(frames), session_id, exc,
        )
        return False

    # The Authorization header is only present when a token was supplied.
    auth_header = {"Authorization": f"Bearer {token}"} if token else {}
    headers = {
        "Content-Type": "audio/ogg",
        "X-StackChan-Session": session_id,
        **auth_header,
    }

    try:
        async with aiohttp.ClientSession() as session, session.post(
            hook_url,
            data=ogg_payload,
            headers=headers,
            timeout=aiohttp.ClientTimeout(total=timeout_s),
        ) as response:
            status = response.status
            if not 200 <= status < 300:
                # Include a short prefix of the response body to help
                # diagnose receiver-side rejections.
                body_snippet = (await response.text())[:200]
                logger.warning(
                    "audio_input_hook: POST returned status=%d session=%s "
                    "body=%r",
                    status, session_id, body_snippet,
                )
                return False
            logger.info(
                "audio_input_hook: pushed %d frames (%d bytes) "
                "session=%s status=%d",
                len(frames), len(ogg_payload), session_id,
                status,
            )
            return True
    except aiohttp.ClientError as exc:
        logger.warning(
            "audio_input_hook: POST failed (network error) session=%s: %s",
            session_id, exc,
        )
        return False
    except Exception as exc:
        logger.warning(
            "audio_input_hook: POST failed (unexpected) session=%s: %s",
            session_id, exc,
        )
        return False
|