alter-runtime 0.3.0__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- alter_runtime/__init__.py +11 -0
- alter_runtime/adapters/__init__.py +19 -0
- alter_runtime/adapters/claude_jsonl_watcher.py +545 -0
- alter_runtime/adapters/git_watcher.py +457 -0
- alter_runtime/adapters/household/__init__.py +29 -0
- alter_runtime/adapters/household/_base.py +138 -0
- alter_runtime/adapters/household/compost/__init__.py +17 -0
- alter_runtime/adapters/household/compost/adapter.py +81 -0
- alter_runtime/adapters/household/compost/storage.py +75 -0
- alter_runtime/adapters/household/compost/tests/__init__.py +0 -0
- alter_runtime/adapters/household/compost/tests/test_adapter.py +62 -0
- alter_runtime/adapters/household/compost/tests/test_storage.py +23 -0
- alter_runtime/adapters/household/compost/tests/test_traits.py +38 -0
- alter_runtime/adapters/household/compost/traits.py +79 -0
- alter_runtime/adapters/household/self_hoster/__init__.py +30 -0
- alter_runtime/adapters/household/self_hoster/adapter.py +248 -0
- alter_runtime/adapters/household/self_hoster/storage.py +83 -0
- alter_runtime/adapters/household/self_hoster/tests/__init__.py +0 -0
- alter_runtime/adapters/household/self_hoster/tests/test_adapter.py +216 -0
- alter_runtime/adapters/household/self_hoster/tests/test_storage.py +25 -0
- alter_runtime/adapters/household/self_hoster/tests/test_traits.py +55 -0
- alter_runtime/adapters/household/self_hoster/traits.py +105 -0
- alter_runtime/adapters/household/tapo_ecosystem/__init__.py +22 -0
- alter_runtime/adapters/household/tapo_ecosystem/adapter.py +98 -0
- alter_runtime/adapters/household/tapo_ecosystem/storage.py +95 -0
- alter_runtime/adapters/household/tapo_ecosystem/tests/__init__.py +0 -0
- alter_runtime/adapters/household/tapo_ecosystem/tests/test_adapter.py +55 -0
- alter_runtime/adapters/household/tapo_ecosystem/tests/test_storage.py +28 -0
- alter_runtime/adapters/household/tapo_ecosystem/tests/test_traits.py +45 -0
- alter_runtime/adapters/household/tapo_ecosystem/traits.py +97 -0
- alter_runtime/adapters/household/workshop_tools/__init__.py +25 -0
- alter_runtime/adapters/household/workshop_tools/adapter.py +77 -0
- alter_runtime/adapters/household/workshop_tools/storage.py +92 -0
- alter_runtime/adapters/household/workshop_tools/tests/__init__.py +0 -0
- alter_runtime/adapters/household/workshop_tools/tests/test_adapter.py +48 -0
- alter_runtime/adapters/household/workshop_tools/tests/test_storage.py +26 -0
- alter_runtime/adapters/household/workshop_tools/tests/test_traits.py +45 -0
- alter_runtime/adapters/household/workshop_tools/traits.py +95 -0
- alter_runtime/adapters/worktree_watcher.py +378 -0
- alter_runtime/atlas/__init__.py +48 -0
- alter_runtime/atlas/base.py +102 -0
- alter_runtime/atlas/ledger.py +196 -0
- alter_runtime/atlas/observations.py +136 -0
- alter_runtime/atlas/schema.py +106 -0
- alter_runtime/cap_cache.py +392 -0
- alter_runtime/cli.py +517 -0
- alter_runtime/clients/__init__.py +0 -0
- alter_runtime/clients/token_usage_client.py +273 -0
- alter_runtime/config.py +648 -0
- alter_runtime/consent.py +425 -0
- alter_runtime/daemon.py +518 -0
- alter_runtime/floor_loop.py +335 -0
- alter_runtime/floor_preflight.py +734 -0
- alter_runtime/http_auth.py +173 -0
- alter_runtime/notifiers/__init__.py +18 -0
- alter_runtime/notifiers/desktop.py +321 -0
- alter_runtime/sdk/__init__.py +12 -0
- alter_runtime/sdk/client.py +399 -0
- alter_runtime/service_install.py +616 -0
- alter_runtime/services/__init__.py +59 -0
- alter_runtime/services/launchd/com.alter.runtime.plist.in +90 -0
- alter_runtime/services/systemd/alter-runtime.service.in +74 -0
- alter_runtime/services/systemd/cf-access-env.conf.in +29 -0
- alter_runtime/sockets/__init__.py +20 -0
- alter_runtime/sockets/dbus.py +272 -0
- alter_runtime/sockets/unix.py +702 -0
- alter_runtime/subscribers/__init__.py +58 -0
- alter_runtime/subscribers/active_sessions_cron_emitter.py +313 -0
- alter_runtime/subscribers/active_sessions_do_publisher.py +1159 -0
- alter_runtime/subscribers/active_sessions_gc.py +432 -0
- alter_runtime/subscribers/active_sessions_writer.py +446 -0
- alter_runtime/subscribers/adapters_writer.py +415 -0
- alter_runtime/subscribers/agent_frames.py +461 -0
- alter_runtime/subscribers/bus.py +188 -0
- alter_runtime/subscribers/cache_writer.py +347 -0
- alter_runtime/subscribers/ceremony_echo.py +290 -0
- alter_runtime/subscribers/do_sse.py +864 -0
- alter_runtime/subscribers/ebpf.py +506 -0
- alter_runtime/subscribers/inbox_writer.py +469 -0
- alter_runtime/subscribers/mcp_fallback.py +391 -0
- alter_runtime/subscribers/presence_writer.py +426 -0
- alter_runtime/subscribers/session_presence.py +467 -0
- alter_runtime/subscribers/sse.py +125 -0
- alter_runtime/subscribers/weave_intent_writer.py +608 -0
- alter_runtime/update_loop.py +519 -0
- alter_runtime/weave/__init__.py +21 -0
- alter_runtime/weave/resolver.py +544 -0
- alter_runtime-0.3.0.dist-info/METADATA +289 -0
- alter_runtime-0.3.0.dist-info/RECORD +92 -0
- alter_runtime-0.3.0.dist-info/WHEEL +4 -0
- alter_runtime-0.3.0.dist-info/entry_points.txt +2 -0
- alter_runtime-0.3.0.dist-info/licenses/LICENSE +190 -0
|
@@ -0,0 +1,864 @@
|
|
|
1
|
+
"""DoSseSubscriber - primary L1 ingress via Cloudflare Durable Object SSE.
|
|
2
|
+
|
|
3
|
+
Opens a long-lived Server-Sent Events connection against
|
|
4
|
+
``https://mcp.truealter.com/events/{handle}/stream`` (templated from
|
|
5
|
+
:class:`DaemonConfig.do_sse_endpoint`) authenticated with the alter-cli session
|
|
6
|
+
JWT, parses frames via :func:`parse_sse_frames`, and publishes:
|
|
7
|
+
|
|
8
|
+
* ``identity.frame`` - the raw :class:`SSEFrame` (for subscribers that need the
|
|
9
|
+
event name or the raw payload string).
|
|
10
|
+
* ``identity.event`` - the parsed JSON body as a ``dict`` (convenience for
|
|
11
|
+
99% of consumers; skipped when the payload isn't valid JSON).
|
|
12
|
+
* ``identity.connected`` - sentinel published once the first byte of a new
|
|
13
|
+
connection lands. Payload: ``{"handle": str, "reconnect_count": int}``.
|
|
14
|
+
* ``identity.disconnected`` - sentinel published when the connection drops or
|
|
15
|
+
stalls past :attr:`DaemonConfig.fallback_trigger_after_seconds`. Payload:
|
|
16
|
+
``{"handle": str, "reason": str, "reconnect_count": int}``.
|
|
17
|
+
|
|
18
|
+
The subscriber is the only component that talks to the network on the hot
|
|
19
|
+
path; every other component reads from the bus. That separation is deliberate
|
|
20
|
+
so the fallback path (:class:`McpFallbackSubscriber`) can take over without
|
|
21
|
+
the fan-out layer noticing.
|
|
22
|
+
|
|
23
|
+
Reconnection semantics (D-RT9, §6.2 of *Alter-to-Alter Messaging*):
|
|
24
|
+
|
|
25
|
+
* The SSE ``id:`` field from the most recently parsed frame is remembered
|
|
26
|
+
across reconnects and sent back as the ``Last-Event-ID`` HTTP header so the
|
|
27
|
+
DO can replay anything we missed.
|
|
28
|
+
* Transient errors (``httpx.TransportError``, 502/503/504, socket reset) are
|
|
29
|
+
logged at warning level and retried with exponential backoff capped at
|
|
30
|
+
``MAX_BACKOFF_SECONDS``.
|
|
31
|
+
* Authentication errors (401/403) are logged at error level and retried more
|
|
32
|
+
slowly - the JWT may have been rotated and the daemon does not own
|
|
33
|
+
re-authentication; the operator is expected to run ``alter login``.
|
|
34
|
+
* Cancellation (``asyncio.CancelledError``) propagates so the supervisor can
|
|
35
|
+
shut the component down cleanly.
|
|
36
|
+
|
|
37
|
+
The subscriber never raises out of :meth:`run` - the supervisor's
|
|
38
|
+
exponential-backoff restart machinery is kept as a last-resort safety net for
|
|
39
|
+
truly unexpected exceptions.
|
|
40
|
+
"""
|
|
41
|
+
|
|
42
|
+
from __future__ import annotations
|
|
43
|
+
|
|
44
|
+
import asyncio
|
|
45
|
+
import base64
|
|
46
|
+
import json
|
|
47
|
+
import logging
|
|
48
|
+
import random
|
|
49
|
+
import ssl
|
|
50
|
+
import time
|
|
51
|
+
from collections import OrderedDict
|
|
52
|
+
from dataclasses import dataclass, field
|
|
53
|
+
from typing import TYPE_CHECKING
|
|
54
|
+
|
|
55
|
+
import httpx
|
|
56
|
+
|
|
57
|
+
from alter_runtime.config import ConfigError, DaemonConfig
|
|
58
|
+
from alter_runtime.daemon import Component
|
|
59
|
+
from alter_runtime.http_auth import backend_default_headers
|
|
60
|
+
from alter_runtime.subscribers.sse import SSEFrame, parse_sse_frames
|
|
61
|
+
|
|
62
|
+
if TYPE_CHECKING:
|
|
63
|
+
from alter_runtime.config import Session
|
|
64
|
+
from alter_runtime.subscribers.bus import EventBus
|
|
65
|
+
|
|
66
|
+
__all__ = ["DoSseSubscriber"]
|
|
67
|
+
|
|
68
|
+
logger = logging.getLogger("alter_runtime.subscribers.do_sse")
|
|
69
|
+
|
|
70
|
+
# --- Bus topics ---------------------------------------------------------
#: Published once each time a connection starts successfully.
TOPIC_CONNECTED = "identity.connected"
#: Published whenever the connection drops or the stream stalls.
TOPIC_DISCONNECTED = "identity.disconnected"
#: Published for every raw SSE frame.
TOPIC_FRAME = "identity.frame"
#: Published with the parsed JSON event body; skipped when parsing fails.
TOPIC_EVENT = "identity.event"

# --- Reconnect pacing ---------------------------------------------------
#: First back-off delay, in seconds, after a transient network error.
BASE_BACKOFF_SECONDS: float = 1.0
#: Ceiling, in seconds, for the exponential back-off.
MAX_BACKOFF_SECONDS: float = 60.0
#: Deliberately slow back-off for authentication failures: the user most
#: likely has to run ``alter login`` to repair the session, so there is no
#: point hammering the edge in the meantime.
AUTH_ERROR_BACKOFF_SECONDS: float = 300.0
#: Minimum lifetime, in seconds, for a connection to count as "healthy".
#: After a healthy connection the next reconnect starts the back-off from
#: scratch; a connection that dies sooner is a flap and the back-off keeps
#: climbing.
STABLE_CONNECTION_SECONDS: float = 30.0

# --- Replay de-duplication ----------------------------------------------
#: How many recently-seen SSE frame IDs are kept for de-duplication. 256 is
#: big enough to absorb a replay burst (a Last-Event-ID resume after a
#: transient drop usually replays the tail of the backlog) yet cheap to
#: hold in memory. Closes runtime/M-3 from pentest-findings-2026-04-15.md.
FRAME_DEDUP_WINDOW: int = 256

# --- Bearer freshness ---------------------------------------------------
#: Oldest acceptable age, in seconds, of a JWT ``iat`` claim before the
#: bearer is refused. Pre-T-0 kill-switch: with an ancient session.json it
#: is better to fail loudly than to let a leaked long-lived bearer ferry
#: events into every CC session. The ~30s target from the task description
#: is too tight for alter-cli's current session lifecycle (JWTs live for
#: hours); 24h is the practical middle ground and still catches the
#: stolen-laptop case where a dormant session.json is replayed weeks
#: later. The tighter 30s check applies at _frame_ freshness once the DO
#: signer lands.
MAX_BEARER_AGE_SECONDS: float = 60 * 60 * 24
|
|
107
|
+
|
|
108
|
+
|
|
109
|
+
def _build_tls_context() -> ssl.SSLContext:
    """Return the hardened TLS context used by the DO SSE client.

    Certificate verification and hostname checking are switched on
    explicitly (never ``CERT_NONE``), and protocol versions below TLS 1.2
    are refused. The context is assembled here, rather than through kwargs
    on the ``httpx.AsyncClient`` constructor, so that any future loosening
    of the client's TLS posture has to edit this function and go through
    review.
    """
    tls = ssl.create_default_context()
    # TLS 1.2 floor (noted upstream as matching Cloudflare's edge minimum).
    # ``minimum_version`` is used instead of the deprecated ``options``
    # flags to stay on the modern OpenSSL path.
    tls.minimum_version = ssl.TLSVersion.TLSv1_2
    tls.verify_mode = ssl.CERT_REQUIRED
    tls.check_hostname = True
    return tls
|
|
125
|
+
|
|
126
|
+
|
|
127
|
+
def _peek_jwt_iat(jwt: str) -> float | None:
    """Return the ``iat`` (issued-at) claim of ``jwt`` as a UNIX timestamp.

    Best-effort parse with no signature verification (the DO does that).
    Only ``iat`` is read, so dormant session bearers can be rejected at
    start-up.

    Parameters
    ----------
    jwt:
        The compact-serialised token (``header.payload.signature``).

    Returns
    -------
    float | None
        The ``iat`` claim as a finite float, or ``None`` when the token
        has no second segment, the segment is not base64url-encoded UTF-8
        JSON, the JSON is not an object, or ``iat`` is missing,
        non-numeric, non-finite or too large to represent as a float.
    """
    import math  # local import: keeps this block self-contained

    segments = jwt.split(".")
    if len(segments) < 2:
        return None
    payload_b64 = segments[1]
    try:
        # base64url requires padding in fours; pad up before decoding.
        padding = "=" * (-len(payload_b64) % 4)
        payload_bytes = base64.urlsafe_b64decode(payload_b64 + padding)
        payload = json.loads(payload_bytes.decode("utf-8"))
    except (ValueError, RecursionError):
        # binascii.Error, UnicodeDecodeError and json.JSONDecodeError are
        # all ValueError subclasses; RecursionError covers absurdly deep
        # nesting in a hostile payload.
        return None

    # A payload that decodes to a list/str/number has no claims at all;
    # calling ``.get`` on it would raise instead of returning None.
    if not isinstance(payload, dict):
        return None

    iat = payload.get("iat")
    if not isinstance(iat, (int, float)):
        return None
    try:
        issued_at = float(iat)
    except OverflowError:
        # JSON integers are unbounded; one too large for a float is not a
        # usable timestamp.
        return None
    # NaN/Infinity would make the caller's ``age > limit`` comparison
    # meaningless, so treat them as "no readable iat".
    return issued_at if math.isfinite(issued_at) else None
|
|
149
|
+
|
|
150
|
+
|
|
151
|
+
#: Byte length of a valid Ed25519 signature (RFC 8032).
ED25519_SIGNATURE_BYTES: int = 64
#: Byte length of a valid Ed25519 public key (RFC 8032).
ED25519_PUBKEY_BYTES: int = 32
#: Scheme prefix carried by :attr:`DaemonConfig.frame_signature_pubkey` and
#: by the ``pubkey`` field the DO emits. Follows the canonical envelope
#: convention in ``/tmp/alter-unification-canonical-fragment.md``.
ED25519_PUBKEY_PREFIX: str = "ed25519:"
|
|
159
|
+
|
|
160
|
+
|
|
161
|
+
def _b64url_decode(value: str) -> bytes:
    """Decode base64url text whose trailing ``=`` padding may be absent.

    The upstream note ties this to the encoding used in
    ``cloudflare/workers/handle-alter/src/ed25519.ts``, where
    ``bufToBase64Url`` strips trailing ``=``. The input is padded back up
    to a multiple of four characters before decoding.

    Any error from :func:`base64.urlsafe_b64decode` (``binascii.Error``,
    a :class:`ValueError` subclass) propagates, so the caller can classify
    the frame as malformed-signature. The stdlib decoder runs in its
    default lenient mode, so characters outside the alphabet are discarded
    rather than rejected.
    """
    missing = -len(value) % 4
    return base64.urlsafe_b64decode(value + "=" * missing)
|
|
171
|
+
|
|
172
|
+
|
|
173
|
+
def _canonical_bytes_without_signature(payload: dict) -> bytes:
    """Serialise ``payload`` minus its ``signature`` key to canonical bytes.

    Keys are emitted in sorted order with the most compact separators,
    non-ASCII text is kept verbatim (UTF-8 encoded), and NaN/Infinity are
    refused. The upstream note says this mirrors the CF Worker's
    ``canonicalise`` (``cloudflare/workers/handle-alter/src/ed25519.ts:137-151``).
    Dropping ``signature`` first means signer and verifier hash the same
    logical object. ``payload`` itself is not modified.

    Raises
    ------
    ValueError
        If the payload contains a non-finite float (``allow_nan=False``).
    """
    unsigned = dict(payload)
    unsigned.pop("signature", None)
    text = json.dumps(
        unsigned,
        separators=(",", ":"),
        sort_keys=True,
        allow_nan=False,
        ensure_ascii=False,
    )
    return text.encode("utf-8")
|
|
190
|
+
|
|
191
|
+
|
|
192
|
+
class _FrameSignatureOutcome(str):  # noqa: N801 - sentinel-style enum
    """Namespace of result labels returned by :func:`_verify_frame_signature`.

    The members are plain string constants, not :class:`enum.Enum`
    members: they can be dropped straight into log fields with no extra
    serialisation, and the ``_ConnectionState`` counter attribute names
    are derived from these values.
    """

    # Signature verified against the pinned key.
    OK = "ok"
    # ``signature`` and/or ``pubkey`` field missing from the frame.
    UNSIGNED = "unsigned"
    # Signature (or pinned key) could not be decoded / had the wrong length.
    MALFORMED_SIG = "malformed_sig"
    # Frame declared a public key other than the pinned one.
    WRONG_KEY = "wrong_key"
    # Well-formed signature that failed the cryptographic check.
    BAD_SIG = "bad_sig"
|
|
205
|
+
|
|
206
|
+
|
|
207
|
+
def _verify_frame_signature(payload: dict, pinned_pubkey_b64: str) -> str:
    """Verify an Ed25519 signature on a parsed SSE frame payload.

    Expected wire contract (the DO emitter MUST adopt this - see follow-up
    note in :attr:`DaemonConfig.require_frame_signature`)::

        {
          "signature": "<base64url>",        # 64-byte Ed25519 signature
          "pubkey": "ed25519:<base64url>",   # 32-byte public key
          "kind": "...",                     # event kind
          "payload": { ... },                # opaque event body
          ...                                # any other top-level fields
        }

    The signature is computed over the canonical (``sort_keys=True``,
    compact-separator) JSON bytes of the object with the ``signature``
    field removed. This matches ``canonicalise()`` in
    ``cloudflare/workers/handle-alter/src/ed25519.ts`` so the DO can
    re-use the existing Web Crypto signer.

    Parameters
    ----------
    payload:
        The parsed JSON object from ``frame.data``. MUST be a ``dict``;
        non-dict payloads should be treated as unsigned by the caller.
    pinned_pubkey_b64:
        The base64url-encoded 32-byte Ed25519 public key the daemon
        trusts. An ``ed25519:`` prefix and trailing ``=`` padding are
        tolerated on both the pinned and the declared key, so a pin
        copied verbatim from configuration still matches the unpadded
        form the DO emits. Frames declaring any other ``pubkey`` are
        rejected as ``wrong_key`` - this is the self-certifying-frame
        defence.

    Returns
    -------
    str
        One of the :class:`_FrameSignatureOutcome` constants:
        ``"ok"``, ``"unsigned"``, ``"malformed_sig"``, ``"wrong_key"``, or
        ``"bad_sig"``. Never raises for a ``dict`` payload.
    """
    # Lazy import so the cryptography dependency is only loaded when
    # enforcement is actually invoked. Keeps start-up cold cost minimal
    # on systems where require_frame_signature stays False.
    try:
        from cryptography.exceptions import InvalidSignature
        from cryptography.hazmat.primitives.asymmetric.ed25519 import Ed25519PublicKey
    except ImportError:  # pragma: no cover - dependency declared in pyproject.toml
        logger.error(
            "do_sse: `cryptography` package is missing but require_frame_signature=True; "
            "treating all frames as malformed_sig until the dependency is installed."
        )
        return _FrameSignatureOutcome.MALFORMED_SIG

    def _bare_key(value: str) -> str:
        """Strip the ``ed25519:`` prefix and any trailing ``=`` padding."""
        if value.startswith(ED25519_PUBKEY_PREFIX):
            value = value[len(ED25519_PUBKEY_PREFIX) :]
        return value.rstrip("=")

    signature_b64 = payload.get("signature")
    declared_pubkey = payload.get("pubkey")
    if not isinstance(signature_b64, str) or not isinstance(declared_pubkey, str):
        return _FrameSignatureOutcome.UNSIGNED

    # A missing / non-string pin can never match a declared key.
    if not isinstance(pinned_pubkey_b64, str):
        return _FrameSignatureOutcome.WRONG_KEY

    # Compare normalised forms: without this, a pin written with ``=``
    # padding or the scheme prefix would reject every genuine frame.
    pinned_key_b64 = _bare_key(pinned_pubkey_b64)
    if _bare_key(declared_pubkey) != pinned_key_b64:
        return _FrameSignatureOutcome.WRONG_KEY

    # Decode the signature; malformed base64 or wrong length -> drop.
    try:
        signature_bytes = _b64url_decode(signature_b64)
    except Exception:  # noqa: BLE001 - base64 can raise binascii.Error
        return _FrameSignatureOutcome.MALFORMED_SIG
    if len(signature_bytes) != ED25519_SIGNATURE_BYTES:
        return _FrameSignatureOutcome.MALFORMED_SIG

    # Decode + load the pinned pubkey. A malformed pinned key is a
    # configuration bug; surface it as malformed_sig so the structured
    # log shows the failure class without taking down the subscriber.
    try:
        pubkey_bytes = _b64url_decode(pinned_key_b64)
        if len(pubkey_bytes) != ED25519_PUBKEY_BYTES:
            return _FrameSignatureOutcome.MALFORMED_SIG
        pubkey = Ed25519PublicKey.from_public_bytes(pubkey_bytes)
    except Exception:  # noqa: BLE001
        return _FrameSignatureOutcome.MALFORMED_SIG

    # Canonicalisation can fail on frames json.loads accepted: NaN /
    # Infinity (``allow_nan=False``), lone surrogates (UTF-8 encode), or
    # pathological nesting. Such a frame cannot be verified, so classify
    # it as bad_sig instead of letting the exception escape.
    try:
        canonical = _canonical_bytes_without_signature(payload)
    except (ValueError, TypeError, RecursionError):
        return _FrameSignatureOutcome.BAD_SIG

    try:
        pubkey.verify(signature_bytes, canonical)
    except InvalidSignature:
        return _FrameSignatureOutcome.BAD_SIG
    except Exception:  # noqa: BLE001 - defensive; crypto lib shouldn't raise here
        return _FrameSignatureOutcome.BAD_SIG
    return _FrameSignatureOutcome.OK
|
|
301
|
+
|
|
302
|
+
|
|
303
|
+
@dataclass
class _ConnectionState:
    """Mutable bookkeeping for the DO SSE connection.

    Lives on the subscriber instance so tests can inspect the reconnect
    count and the current ``Last-Event-ID`` checkpoint.
    """

    # --- Connection lifecycle -------------------------------------------
    reconnect_count: int = 0
    # ``id:`` of the most recently parsed frame; replayed to the server as
    # the ``Last-Event-ID`` header on the next connect.
    last_event_id: str | None = None
    was_connected: bool = False
    # Delay to apply before the next reconnect attempt.
    backoff: float = BASE_BACKOFF_SECONDS
    # Monotonic timestamp of the latest successful connect; ``None`` while
    # disconnected.
    connected_at: float | None = None
    # Free-form trail kept for debugging and test assertions.
    history: list[str] = field(default_factory=list)

    # --- Frame-signature enforcement counters (C-1 closure) -------------
    #: Frames whose Ed25519 signature verified against the pinned pubkey
    #: while ``require_frame_signature`` is ``True``.
    frames_verified_ok: int = 0
    #: Frames dropped under enforcement because ``signature`` (or
    #: ``pubkey``) was missing.
    frames_dropped_unsigned: int = 0
    #: Frames dropped because the declared signature would not decode or
    #: was not 64 bytes long (the Ed25519 signature length).
    frames_dropped_malformed_sig: int = 0
    #: Frames dropped because their declared ``pubkey`` is not the
    #: daemon's pinned :attr:`DaemonConfig.frame_signature_pubkey`.
    frames_dropped_wrong_key: int = 0
    #: Frames dropped because the signature had the right shape but the
    #: cryptographic check against the pinned pubkey failed.
    frames_dropped_bad_sig: int = 0
    #: Unsigned frames seen while enforcement is still ``False``. These
    #: are not dropped - the count shows operators the baseline
    #: signature-coverage deficit before they switch enforcement on.
    frames_warned_unsigned: int = 0
|
|
342
|
+
|
|
343
|
+
|
|
344
|
+
class DoSseSubscriber(Component):
|
|
345
|
+
"""Tails the per-handle DO SSE stream and publishes frames onto the bus.
|
|
346
|
+
|
|
347
|
+
Parameters
|
|
348
|
+
----------
|
|
349
|
+
config:
|
|
350
|
+
Loaded :class:`DaemonConfig`. Used for ``do_sse_endpoint`` and the
|
|
351
|
+
fallback trigger threshold.
|
|
352
|
+
session:
|
|
353
|
+
Authenticated alter-cli :class:`Session`. Supplies ``handle`` and
|
|
354
|
+
``jwt``.
|
|
355
|
+
bus:
|
|
356
|
+
The shared :class:`EventBus` instance.
|
|
357
|
+
http_client:
|
|
358
|
+
Optional override for the HTTP client. Tests pass an
|
|
359
|
+
``httpx.AsyncClient(transport=httpx.MockTransport(...))`` here so they
|
|
360
|
+
don't need the network.
|
|
361
|
+
"""
|
|
362
|
+
|
|
363
|
+
name = "do_sse"
|
|
364
|
+
|
|
365
|
+
def __init__(
    self,
    config: DaemonConfig,
    session: Session,
    bus: EventBus,
    *,
    http_client: httpx.AsyncClient | None = None,
) -> None:
    """Validate start-up invariants, then initialise subscriber state.

    Raises
    ------
    ConfigError
        If the session has no bearer JWT, ``do_sse_endpoint`` is not
        ``https://``, the bearer's ``iat`` is older than
        ``MAX_BEARER_AGE_SECONDS``, or frame-signature enforcement is on
        without a well-formed pinned Ed25519 public key.
    """
    # --- C-7 kill-switch: bearer + scheme invariants ---------------
    # Without a session bearer the daemon would happily open an
    # unauthenticated SSE stream - the DO would (correctly) bounce it,
    # but we'd still log + reconnect forever. Refuse at construct time
    # so the operator sees a clear error and runs ``alter login``.
    if not getattr(session, "jwt", None):
        raise ConfigError(
            "do_sse: session has no bearer JWT - run `alter login` before starting the runtime."
        )
    # Defence-in-depth: reject non-https even though config.load()
    # already checks. The subscriber is the hot-path component; a
    # weakened config that bypasses load() should still not be able
    # to drive an http:// stream.
    endpoint = getattr(config, "do_sse_endpoint", "") or ""
    if not endpoint.lower().startswith("https://"):
        raise ConfigError(
            f"do_sse: do_sse_endpoint={endpoint!r} is not https:// - "
            "refusing to start (pre-T-0 kill-switch)."
        )
    # Freshness sanity check - if the bearer's ``iat`` is older than
    # MAX_BEARER_AGE_SECONDS, refuse to use it. Logged-not-raised when
    # ``iat`` is absent because some deployments mint opaque tokens.
    iat = _peek_jwt_iat(session.jwt)
    if iat is not None:
        age = time.time() - iat
        if age > MAX_BEARER_AGE_SECONDS:
            raise ConfigError(
                f"do_sse: bearer JWT iat is {age:.0f}s old (>"
                f"{MAX_BEARER_AGE_SECONDS:.0f}s) - refusing to use a "
                "dormant session. Run `alter login` to refresh."
            )
    elif not getattr(session, "jwt_expires_at", None):
        # No iat *and* no expiry - opaque or stripped token. Log, don't
        # crash; follow-up PR adds mandatory expiry metadata end-to-end.
        logger.warning(
            "do_sse: bearer has no parseable iat and no jwt_expires_at; "
            "proceeding but freshness cannot be verified."
        )

    # --- C-1 kill-switch: enforcement requires a pinned trust root ----
    # When frame-signature enforcement is on, we MUST have a pinned
    # Ed25519 public key to verify against - otherwise the verifier
    # would trust whatever ``pubkey`` the frame itself declared, which
    # is a self-certifying (i.e. useless) check. Refuse to construct so
    # the operator sees the misconfiguration before the daemon fans out
    # unverified events. Interim pin is the env var
    # ``ALTER_RUNTIME_FRAME_SIG_PUBKEY``; follow-up PR replaces this
    # with a DO ``/state`` fetch on start-up.
    if getattr(config, "require_frame_signature", False):
        pinned = getattr(config, "frame_signature_pubkey", None)
        if not pinned:
            raise ConfigError(
                "do_sse: require_frame_signature=True but "
                "frame_signature_pubkey is unset - refusing to start. "
                "Set ALTER_RUNTIME_FRAME_SIG_PUBKEY to the DO's pinned "
                "Ed25519 public key (`ed25519:<base64url>`)."
            )
        # M-E-2 (pentest-pass2 §6.5): validate pubkey format at construct
        # time so operators see a clear ConfigError instead of silent
        # frame-drops caused by malformed_sig later.
        import re as _re

        # A 32-byte Ed25519 public key encodes to 43 base64url chars
        # without padding, or 44 with one ``=`` pad. The pre-pentest
        # regex (`+=*`) accepted arbitrarily-short keys that decoded to
        # <32 bytes; those only surfaced downstream as a silent
        # ``frames_dropped_malformed_sig`` deficit.
        #
        # ``fullmatch`` is used rather than ``match`` + ``$``: ``$`` also
        # matches just before a trailing newline, and the lenient base64
        # decoder below discards that newline, so a value such as
        # ``ed25519:<key>=\n`` (typical of a pin read from a file) would
        # be accepted here yet never equal any frame's declared key.
        if not _re.fullmatch(r"ed25519:[A-Za-z0-9_-]{43,}=*", pinned):
            raise ConfigError(
                f"do_sse: frame_signature_pubkey={pinned!r} is not in the "
                "expected `ed25519:<base64url>` format - refusing to start."
            )
        _raw_pubkey = pinned[len(ED25519_PUBKEY_PREFIX) :]
        try:
            # Shared helper pads to a multiple of four before decoding.
            _decoded = _b64url_decode(_raw_pubkey)
        except Exception as _exc:
            raise ConfigError(
                f"do_sse: frame_signature_pubkey base64url decode failed: {_exc}"
            ) from _exc
        if len(_decoded) != ED25519_PUBKEY_BYTES:
            raise ConfigError(
                f"do_sse: frame_signature_pubkey decoded to {len(_decoded)} bytes; "
                f"Ed25519 public keys must be exactly {ED25519_PUBKEY_BYTES} bytes."
            )

    self._config = config
    self._session = session
    self._bus = bus
    self._http_client = http_client
    # Only a client we create ourselves gets closed when ``run`` exits.
    self._owns_client = http_client is None
    self._stop_event = asyncio.Event()
    # Read the module-level constant at __init__ time (not at class-def
    # time) so tests that monkeypatch the constant see the patched value.
    self._state = _ConnectionState(backoff=BASE_BACKOFF_SECONDS)
    # Pass-3 B1 (2026-04-14): the DO SSE gate requires a
    # scope=alter_events.subscribe capability JWT, not session.jwt.
    # Cache (token, expires_epoch); re-mint within 30s of expiry.
    self._subscribe_cap: tuple[str, float] | None = None
    # Bounded LRU of recently-seen frame IDs - prevents a DO replay from
    # being fanned out to every local surface twice. Closes runtime/M-3.
    self._seen_frame_ids: OrderedDict[str, None] = OrderedDict()
|
|
478
|
+
|
|
479
|
+
# ------------------------------------------------------------------
|
|
480
|
+
# Component lifecycle
|
|
481
|
+
# ------------------------------------------------------------------
|
|
482
|
+
|
|
483
|
+
async def run(self) -> None:
    """Drive the reconnect loop until :meth:`stop` is called.

    Each pass opens one SSE connection and, when it ends, publishes a
    disconnect and (on error paths) backs off before retrying.
    Cancellation is re-raised untouched; every other exception raised
    while a connection is running is logged and handled inside the loop.
    """
    handle = self._session.handle
    url = self._config.do_sse_endpoint.format(handle=handle)
    logger.info("do_sse starting handle=%s url=%s", handle, url)

    client = self._http_client
    if not client:
        # Owned client: strict TLS context (no CERT_NONE, hostname
        # verification on, TLS 1.2+). Tests can inject a MockTransport
        # through ``http_client=`` to bypass the network entirely.
        #
        # Default headers are the CF Access service-token bundle
        # (D-SUBSTRATE-UNIFIED-1 §2.3 Option A) merged with the canonical
        # ``X-Alter-Client-*`` identity headers (D-MIN-VERSION-FLOOR-1
        # §3). The X-Alter-* headers are needed on every authenticated
        # backend call so the server-side floor middleware can identify
        # the daemon; without them it answers HTTP 426
        # ``client_identification_required``.
        sse_timeout = httpx.Timeout(
            connect=10.0,
            read=None,  # SSE holds the connection open indefinitely
            write=10.0,
            pool=10.0,
        )
        client = httpx.AsyncClient(
            timeout=sse_timeout,
            verify=_build_tls_context(),
            headers=backend_default_headers(),
        )

    try:
        while not self._stop_event.is_set():
            try:
                await self._run_one_connection(client, url)
                # Clean EOF is reported as a disconnect.
                # NOTE(review): the upstream comment says "disconnect +
                # backoff", but no backoff call is made on this path -
                # confirm whether an immediate reconnect is intended.
                await self._on_disconnect("stream_closed")
            except asyncio.CancelledError:
                raise
            except httpx.HTTPStatusError as exc:
                await self._handle_http_status_error(exc)
            except (httpx.TransportError, httpx.RequestError) as exc:
                await self._on_disconnect(f"transport_error: {type(exc).__name__}")
                await self._backoff_then_retry()
            except Exception as exc:
                logger.exception("do_sse unexpected error: %s", exc)
                await self._on_disconnect(f"unexpected: {type(exc).__name__}")
                await self._backoff_then_retry()
    finally:
        if self._owns_client:
            try:
                await client.aclose()
            except Exception:  # pragma: no cover - defensive
                pass
        # NOTE(review): the flattened source does not show whether this
        # log line sits inside ``finally``; it is placed here so it also
        # fires on cancellation - confirm against the upstream file.
        logger.info("do_sse stopped handle=%s", self._session.handle)
|
|
537
|
+
|
|
538
|
+
async def stop(self) -> None:
    """Request a cooperative shutdown.

    Sets the internal stop event. The reconnect loop, the stream reader and
    ``_sleep_interruptible`` all watch that event and wind down once it is set.
    """
    stop_signal = self._stop_event
    stop_signal.set()
|
|
541
|
+
|
|
542
|
+
# ------------------------------------------------------------------
|
|
543
|
+
# Connection
|
|
544
|
+
# ------------------------------------------------------------------
|
|
545
|
+
|
|
546
|
+
async def _run_one_connection(self, client: httpx.AsyncClient, url: str) -> None:
    """Open a single SSE stream and pump its frames until it ends or stalls.

    A subscribe capability is obtained through ``_ensure_subscribe_cap`` and
    sent as the bearer token. When a previous event id is recorded in the
    connection state it is forwarded as ``Last-Event-ID``.

    Raises:
        httpx.HTTPStatusError: the server answered with anything other than
            HTTP 200. Up to 256 bytes of the body are logged first.
    """
    subscribe_token = await self._ensure_subscribe_cap(client)
    resume_from = self._state.last_event_id

    request_headers = {
        "Accept": "text/event-stream",
        "Authorization": f"Bearer {subscribe_token}",
        "Cache-Control": "no-cache",
        "User-Agent": f"alter-runtime/{_package_version()}",
    }
    if resume_from is not None:
        request_headers["Last-Event-ID"] = resume_from

    logger.debug("do_sse connecting url=%s last_event_id=%s", url, resume_from)

    async with client.stream("GET", url, headers=request_headers) as response:
        if response.status_code == 200:
            # Happy path: announce the connection, then read until the
            # stream closes, stalls, or a stop is requested.
            await self._on_connect()
            await self._consume_stream(response)
            return

        # Anything else: capture a short body preview for the log and hand
        # the status to the caller, which chooses the backoff policy.
        try:
            raw_body = await response.aread()
            body_preview = raw_body[:256].decode("utf-8", errors="replace")
        except Exception:
            body_preview = "<body unreadable>"
        logger.warning(
            "do_sse non-200 status=%d url=%s body=%r",
            response.status_code,
            url,
            body_preview,
        )
        raise httpx.HTTPStatusError(
            f"do_sse got HTTP {response.status_code}",
            request=response.request,
            response=response,
        )
|
|
586
|
+
|
|
587
|
+
async def _ensure_subscribe_cap(self, client: httpx.AsyncClient) -> str:
    """Return a cached or freshly-minted subscribe-scope capability JWT.

    Pentest Pass-3 B1 (2026-04-14): the DO (``HandleAlterDO.handleStream``)
    requires a capability with scope ``alter_events.subscribe`` and
    ``aud_recipient == sub == <handle>``. We acquire it from the alter-api
    backend using ``session.jwt`` as bearer, cache until 30s before
    expiry, and re-mint on each reconnect afterwards.

    Args:
        client: HTTP client used for the mint request.

    Returns:
        The capability token string.

    Raises:
        httpx.HTTPStatusError: the mint endpoint answered with an error
            status (surfaced by ``raise_for_status``).
        KeyError: the response JSON has no ``capability`` field.
    """
    now = time.time()
    if self._subscribe_cap is not None:
        token, exp = self._subscribe_cap
        if exp - now > 30.0:
            return token

    mint_url = f"{self._session.api.rstrip('/')}/api/v1/messaging/subscribe-capability"
    headers = {
        "Authorization": f"Bearer {self._session.jwt}",
        "Accept": "application/json",
        "User-Agent": f"alter-runtime/{_package_version()}",
    }
    # Short timeout - a 5xx here drops into the outer reconnect/backoff
    # path in run() via httpx.HTTPStatusError, which is the same
    # degradation channel as any other auth failure.
    resp = await client.post(mint_url, headers=headers, timeout=10.0)
    resp.raise_for_status()
    body = resp.json()
    token = body["capability"]
    exp_iso = body.get("expires_at") or ""
    try:
        # datetime.fromisoformat accepts +HH:MM offsets; the backend
        # returns UTC with 'Z' in FastAPI's default JSON encoder.
        from datetime import datetime as _dt
        from datetime import timezone as _tz

        exp_dt = _dt.fromisoformat(exp_iso.replace("Z", "+00:00"))
        if exp_dt.tzinfo is None:
            # A timestamp without any offset would be read by
            # ``timestamp()`` as *local* time, skewing the expiry by the
            # host's UTC offset. The backend speaks UTC, so pin it.
            exp_dt = exp_dt.replace(tzinfo=_tz.utc)
        exp_epoch = exp_dt.timestamp()
    except Exception:
        # Conservative fallback if the timestamp can't be parsed.
        # Pass-4 H-1 coordination (2026-04-14): worker ceiling is 60s
        # (HandleAlterDO SUBSCRIBE_CAP_MAX_LIFETIME_SECONDS); backend
        # mints with SUBSCRIBE_CAPABILITY_TTL_SECONDS clamped to 60s.
        # Assuming 300s here caused the client to hold an already-
        # rejected cap for 235s before reconnect - set to 60s to
        # match the shipped ceiling.
        exp_epoch = now + 60.0
    self._subscribe_cap = (token, exp_epoch)
    logger.debug(
        "do_sse minted subscribe cap handle=%s exp=%.0f",
        self._session.handle,
        exp_epoch,
    )
    return token
|
|
639
|
+
|
|
640
|
+
async def _consume_stream(self, response: httpx.Response) -> None:
    """Pull text chunks off ``response``, split them into SSE frames, dispatch.

    Each chunk fetch is guarded by a stall watchdog: when nothing arrives
    within ``fallback_trigger_after_seconds`` (floored at one second) the
    method returns, and the caller treats that as a disconnect. It also
    returns when the stream ends or a stop has been requested.
    """
    stall_limit = max(self._config.fallback_trigger_after_seconds, 1.0)
    pending = ""

    # ``aiter_text`` hands back decoded str chunks as they arrive; the
    # explicit ``__anext__`` call lets every single fetch carry a timeout.
    chunks = response.aiter_text()
    while True:
        if self._stop_event.is_set():
            return
        try:
            piece = await asyncio.wait_for(chunks.__anext__(), timeout=stall_limit)
        except (TimeoutError, asyncio.TimeoutError):
            logger.warning(
                "do_sse stream stalled > %.1fs - treating as disconnect", stall_limit
            )
            return
        except StopAsyncIteration:
            return

        if piece:
            pending += piece
            # The parser returns the completed frames plus whatever partial
            # tail must wait for the next chunk.
            ready, pending = parse_sse_frames(pending)
            for frame in ready:
                await self._dispatch_frame(frame)
|
|
670
|
+
|
|
671
|
+
# ------------------------------------------------------------------
|
|
672
|
+
# Frame dispatch + connection state publishing
|
|
673
|
+
# ------------------------------------------------------------------
|
|
674
|
+
|
|
675
|
+
async def _dispatch_frame(self, frame: SSEFrame) -> None:
    """Publish one parsed frame on the bus and record its id for resume.

    Order of operations:

    1. Frames carrying an id are de-duplicated against a bounded window of
       recently seen ids; the id then becomes the new ``last_event_id``.
    2. The payload is parsed once; only a JSON object counts as a payload.
    3. Frame-signature enforcement (C-1) runs BEFORE any publish. With
       ``require_frame_signature`` on, a frame that is not an object or that
       fails verification against the pinned pubkey is dropped with a
       warning and the matching ``_ConnectionState`` counter is bumped.
       With enforcement off, unsigned frames are only counted.
    4. The raw frame goes to ``TOPIC_FRAME``; an object payload additionally
       goes to ``TOPIC_EVENT``.
    """
    state = self._state
    frame_id = frame.id

    if frame_id is not None:
        # Dedup by frame id so a DO replay after a transient drop does not
        # fan out twice. Frames without an id skip this step entirely - the
        # DO does not emit untagged payloads on the hot path; this is a
        # test-fixture escape hatch.
        seen = self._seen_frame_ids
        if frame_id in seen:
            logger.debug("do_sse duplicate frame id=%s - dropping", frame_id)
            return
        seen[frame_id] = None
        if len(seen) > FRAME_DEDUP_WINDOW:
            seen.popitem(last=False)  # evict the oldest remembered id
        state.last_event_id = frame_id

    # Parse once; the result feeds both the signature check and the
    # TOPIC_EVENT publish. Anything that is not a top-level JSON object is
    # treated as opaque.
    try:
        candidate = frame.json
    except (ValueError, json.JSONDecodeError):
        candidate = None
    payload: dict | None = candidate if isinstance(candidate, dict) else None

    # --- C-1 frame-signature enforcement ---------------------------
    if not self._config.require_frame_signature:
        # Enforcement off: just count unsigned frames so operators can see
        # the coverage gap before flipping the switch. Dispatch is unchanged.
        if payload is None or "signature" not in payload:
            state.frames_warned_unsigned += 1
    elif payload is None:
        # Enforcement on, but nothing signable: drop as unsigned instead of
        # letting an opaque payload slip past the check.
        state.frames_dropped_unsigned += 1
        logger.warning(
            "do_sse dropping non-dict frame under signature enforcement id=%s event=%s",
            frame_id,
            frame.event,
        )
        return
    else:
        pinned_key = self._config.frame_signature_pubkey or ""
        if pinned_key.startswith(ED25519_PUBKEY_PREFIX):
            pinned_key = pinned_key[len(ED25519_PUBKEY_PREFIX) :]
        outcome = _verify_frame_signature(payload, pinned_key)
        if outcome == _FrameSignatureOutcome.OK:
            state.frames_verified_ok += 1
        else:
            # Counter + structured log + drop. Logged at WARNING rather than
            # ERROR: such drops are expected during the DO-signer rollout
            # and an ERROR flood would desensitise operators.
            counter_attr = f"frames_dropped_{outcome}"
            setattr(state, counter_attr, getattr(state, counter_attr, 0) + 1)
            logger.warning(
                "do_sse dropping frame under signature enforcement id=%s event=%s outcome=%s",
                frame_id,
                frame.event,
                outcome,
            )
            return

    await self._bus.publish(TOPIC_FRAME, frame)

    if payload is None:
        logger.debug("do_sse frame not JSON event=%s", frame.event)
        return
    await self._bus.publish(TOPIC_EVENT, payload)
|
|
756
|
+
|
|
757
|
+
async def _on_connect(self) -> None:
    """Mark the stream as connected and publish the ``identity.connected`` sentinel.

    Records the monotonic connect time (used later to measure how long the
    connection lasted) and appends ``"connect"`` to the state history.
    """
    state = self._state
    state.was_connected = True
    state.connected_at = time.monotonic()
    state.history.append("connect")

    handle = self._session.handle
    attempts = state.reconnect_count
    logger.info(
        "do_sse connected handle=%s reconnect_count=%d",
        handle,
        attempts,
    )
    announcement = {
        "handle": handle,
        "reconnect_count": attempts,
    }
    await self._bus.publish(TOPIC_CONNECTED, announcement)
|
|
774
|
+
|
|
775
|
+
async def _on_disconnect(self, reason: str) -> None:
    """Record a disconnect and, if we were connected, publish ``identity.disconnected``.

    The reason is always appended to the state history. Everything else -
    backoff reset, reconnect counter, log line, bus event - only happens on
    a genuine connected-to-disconnected transition.

    Args:
        reason: Short tag describing why the stream went away.
    """
    state = self._state
    state.history.append(f"disconnect:{reason}")

    # No spurious disconnect events when the connection never came up: the
    # fallback listens for connected→disconnected transitions, and a cold
    # start is covered by the config-driven start-up delay in
    # McpFallbackSubscriber.
    if not state.was_connected:
        return

    state.was_connected = False
    connected_since = state.connected_at or 0.0
    dwell = time.monotonic() - connected_since
    state.connected_at = None

    if dwell >= STABLE_CONNECTION_SECONDS:
        # The connection lived long enough to count as healthy, so the next
        # reconnect starts again from the base backoff.
        state.backoff = BASE_BACKOFF_SECONDS
    # Otherwise this was a flap: the backoff is left as-is so that
    # _backoff_then_retry keeps escalating toward MAX_BACKOFF.

    state.reconnect_count += 1
    handle = self._session.handle
    logger.warning(
        "do_sse disconnected handle=%s reason=%s reconnect_count=%d",
        handle,
        reason,
        state.reconnect_count,
    )
    notice = {
        "handle": handle,
        "reason": reason,
        "reconnect_count": state.reconnect_count,
    }
    await self._bus.publish(TOPIC_DISCONNECTED, notice)
|
|
807
|
+
|
|
808
|
+
# ------------------------------------------------------------------
|
|
809
|
+
# Error handling + backoff
|
|
810
|
+
# ------------------------------------------------------------------
|
|
811
|
+
|
|
812
|
+
async def _handle_http_status_error(self, exc: httpx.HTTPStatusError) -> None:
    """Apply the retry policy that matches an HTTP error status.

    The disconnect is recorded first. A 401/403 is treated as an
    authentication failure: the cached subscribe capability is discarded and
    the method sleeps for ``AUTH_ERROR_BACKOFF_SECONDS`` (interruptible by
    ``stop()``). Every other status goes through the regular jittered
    exponential backoff.

    Args:
        exc: The status error raised while minting the capability or
            opening the stream.
    """
    status = exc.response.status_code if exc.response is not None else 0
    await self._on_disconnect(f"http_status:{status}")
    if status in (401, 403):
        # The server has just refused our credentials. Forget the cached
        # capability so the next attempt mints a fresh one instead of
        # replaying a token that may be the very thing that was rejected.
        self._subscribe_cap = None
        logger.error(
            "do_sse auth failure status=%d - sleeping %.0fs; run `alter login` to refresh",
            status,
            AUTH_ERROR_BACKOFF_SECONDS,
        )
        await self._sleep_interruptible(AUTH_ERROR_BACKOFF_SECONDS)
        return
    await self._backoff_then_retry()
|
|
824
|
+
|
|
825
|
+
async def _backoff_then_retry(self) -> None:
    """Pause for a random slice of the current backoff ceiling, then raise it.

    The pause is drawn uniformly from ``[0, ceiling]`` (Full Jitter, as in
    AWS "Exponential Backoff and Jitter"), which spreads reconnects from
    many daemons apart after a shared outage and caps the wait at the
    current ceiling. The stored ceiling is doubled for the next call, never
    exceeding ``MAX_BACKOFF_SECONDS``. The pause ends early if ``stop()`` is
    called.
    """
    current_ceiling = self._state.backoff
    pause = random.uniform(0.0, current_ceiling)
    logger.info("do_sse reconnecting in %.1fs (ceiling %.1fs)", pause, current_ceiling)
    # Escalate before sleeping so the new ceiling is already in place when
    # the wait finishes.
    self._state.backoff = min(current_ceiling * 2, MAX_BACKOFF_SECONDS)
    await self._sleep_interruptible(pause)
|
|
839
|
+
|
|
840
|
+
async def _sleep_interruptible(self, seconds: float) -> None:
|
|
841
|
+
"""Wait ``seconds`` or until ``stop()`` is called, whichever is first."""
|
|
842
|
+
try:
|
|
843
|
+
await asyncio.wait_for(self._stop_event.wait(), timeout=seconds)
|
|
844
|
+
except (TimeoutError, asyncio.TimeoutError):
|
|
845
|
+
return
|
|
846
|
+
|
|
847
|
+
# ------------------------------------------------------------------
|
|
848
|
+
# Test introspection
|
|
849
|
+
# ------------------------------------------------------------------
|
|
850
|
+
|
|
851
|
+
@property
def state(self) -> _ConnectionState:
    """Expose the live connection-state object (read by tests)."""
    current = self._state
    return current
|
|
855
|
+
|
|
856
|
+
|
|
857
|
+
def _package_version() -> str:
|
|
858
|
+
"""Best-effort version string for the User-Agent header."""
|
|
859
|
+
try:
|
|
860
|
+
from alter_runtime import __version__
|
|
861
|
+
|
|
862
|
+
return __version__
|
|
863
|
+
except Exception: # pragma: no cover
|
|
864
|
+
return "unknown"
|