alter-runtime 0.3.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (92) hide show
  1. alter_runtime/__init__.py +11 -0
  2. alter_runtime/adapters/__init__.py +19 -0
  3. alter_runtime/adapters/claude_jsonl_watcher.py +545 -0
  4. alter_runtime/adapters/git_watcher.py +457 -0
  5. alter_runtime/adapters/household/__init__.py +29 -0
  6. alter_runtime/adapters/household/_base.py +138 -0
  7. alter_runtime/adapters/household/compost/__init__.py +17 -0
  8. alter_runtime/adapters/household/compost/adapter.py +81 -0
  9. alter_runtime/adapters/household/compost/storage.py +75 -0
  10. alter_runtime/adapters/household/compost/tests/__init__.py +0 -0
  11. alter_runtime/adapters/household/compost/tests/test_adapter.py +62 -0
  12. alter_runtime/adapters/household/compost/tests/test_storage.py +23 -0
  13. alter_runtime/adapters/household/compost/tests/test_traits.py +38 -0
  14. alter_runtime/adapters/household/compost/traits.py +79 -0
  15. alter_runtime/adapters/household/self_hoster/__init__.py +30 -0
  16. alter_runtime/adapters/household/self_hoster/adapter.py +248 -0
  17. alter_runtime/adapters/household/self_hoster/storage.py +83 -0
  18. alter_runtime/adapters/household/self_hoster/tests/__init__.py +0 -0
  19. alter_runtime/adapters/household/self_hoster/tests/test_adapter.py +216 -0
  20. alter_runtime/adapters/household/self_hoster/tests/test_storage.py +25 -0
  21. alter_runtime/adapters/household/self_hoster/tests/test_traits.py +55 -0
  22. alter_runtime/adapters/household/self_hoster/traits.py +105 -0
  23. alter_runtime/adapters/household/tapo_ecosystem/__init__.py +22 -0
  24. alter_runtime/adapters/household/tapo_ecosystem/adapter.py +98 -0
  25. alter_runtime/adapters/household/tapo_ecosystem/storage.py +95 -0
  26. alter_runtime/adapters/household/tapo_ecosystem/tests/__init__.py +0 -0
  27. alter_runtime/adapters/household/tapo_ecosystem/tests/test_adapter.py +55 -0
  28. alter_runtime/adapters/household/tapo_ecosystem/tests/test_storage.py +28 -0
  29. alter_runtime/adapters/household/tapo_ecosystem/tests/test_traits.py +45 -0
  30. alter_runtime/adapters/household/tapo_ecosystem/traits.py +97 -0
  31. alter_runtime/adapters/household/workshop_tools/__init__.py +25 -0
  32. alter_runtime/adapters/household/workshop_tools/adapter.py +77 -0
  33. alter_runtime/adapters/household/workshop_tools/storage.py +92 -0
  34. alter_runtime/adapters/household/workshop_tools/tests/__init__.py +0 -0
  35. alter_runtime/adapters/household/workshop_tools/tests/test_adapter.py +48 -0
  36. alter_runtime/adapters/household/workshop_tools/tests/test_storage.py +26 -0
  37. alter_runtime/adapters/household/workshop_tools/tests/test_traits.py +45 -0
  38. alter_runtime/adapters/household/workshop_tools/traits.py +95 -0
  39. alter_runtime/adapters/worktree_watcher.py +378 -0
  40. alter_runtime/atlas/__init__.py +48 -0
  41. alter_runtime/atlas/base.py +102 -0
  42. alter_runtime/atlas/ledger.py +196 -0
  43. alter_runtime/atlas/observations.py +136 -0
  44. alter_runtime/atlas/schema.py +106 -0
  45. alter_runtime/cap_cache.py +392 -0
  46. alter_runtime/cli.py +517 -0
  47. alter_runtime/clients/__init__.py +0 -0
  48. alter_runtime/clients/token_usage_client.py +273 -0
  49. alter_runtime/config.py +648 -0
  50. alter_runtime/consent.py +425 -0
  51. alter_runtime/daemon.py +518 -0
  52. alter_runtime/floor_loop.py +335 -0
  53. alter_runtime/floor_preflight.py +734 -0
  54. alter_runtime/http_auth.py +173 -0
  55. alter_runtime/notifiers/__init__.py +18 -0
  56. alter_runtime/notifiers/desktop.py +321 -0
  57. alter_runtime/sdk/__init__.py +12 -0
  58. alter_runtime/sdk/client.py +399 -0
  59. alter_runtime/service_install.py +616 -0
  60. alter_runtime/services/__init__.py +59 -0
  61. alter_runtime/services/launchd/com.alter.runtime.plist.in +90 -0
  62. alter_runtime/services/systemd/alter-runtime.service.in +74 -0
  63. alter_runtime/services/systemd/cf-access-env.conf.in +29 -0
  64. alter_runtime/sockets/__init__.py +20 -0
  65. alter_runtime/sockets/dbus.py +272 -0
  66. alter_runtime/sockets/unix.py +702 -0
  67. alter_runtime/subscribers/__init__.py +58 -0
  68. alter_runtime/subscribers/active_sessions_cron_emitter.py +313 -0
  69. alter_runtime/subscribers/active_sessions_do_publisher.py +1159 -0
  70. alter_runtime/subscribers/active_sessions_gc.py +432 -0
  71. alter_runtime/subscribers/active_sessions_writer.py +446 -0
  72. alter_runtime/subscribers/adapters_writer.py +415 -0
  73. alter_runtime/subscribers/agent_frames.py +461 -0
  74. alter_runtime/subscribers/bus.py +188 -0
  75. alter_runtime/subscribers/cache_writer.py +347 -0
  76. alter_runtime/subscribers/ceremony_echo.py +290 -0
  77. alter_runtime/subscribers/do_sse.py +864 -0
  78. alter_runtime/subscribers/ebpf.py +506 -0
  79. alter_runtime/subscribers/inbox_writer.py +469 -0
  80. alter_runtime/subscribers/mcp_fallback.py +391 -0
  81. alter_runtime/subscribers/presence_writer.py +426 -0
  82. alter_runtime/subscribers/session_presence.py +467 -0
  83. alter_runtime/subscribers/sse.py +125 -0
  84. alter_runtime/subscribers/weave_intent_writer.py +608 -0
  85. alter_runtime/update_loop.py +519 -0
  86. alter_runtime/weave/__init__.py +21 -0
  87. alter_runtime/weave/resolver.py +544 -0
  88. alter_runtime-0.3.0.dist-info/METADATA +289 -0
  89. alter_runtime-0.3.0.dist-info/RECORD +92 -0
  90. alter_runtime-0.3.0.dist-info/WHEEL +4 -0
  91. alter_runtime-0.3.0.dist-info/entry_points.txt +2 -0
  92. alter_runtime-0.3.0.dist-info/licenses/LICENSE +190 -0
@@ -0,0 +1,864 @@
1
+ """DoSseSubscriber - primary L1 ingress via Cloudflare Durable Object SSE.
2
+
3
+ Opens a long-lived Server-Sent Events connection against
4
+ ``https://mcp.truealter.com/events/{handle}/stream`` (templated from
5
+ :class:`DaemonConfig.do_sse_endpoint`) authenticated with the alter-cli session
6
+ JWT, parses frames via :func:`parse_sse_frames`, and publishes:
7
+
8
+ * ``identity.frame`` - the raw :class:`SSEFrame` (for subscribers that need the
9
+ event name or the raw payload string).
10
+ * ``identity.event`` - the parsed JSON body as a ``dict`` (convenience for
11
+ 99% of consumers; skipped when the payload isn't valid JSON).
12
+ * ``identity.connected`` - sentinel published once the first byte of a new
13
+ connection lands. Payload: ``{"handle": str, "reconnect_count": int}``.
14
+ * ``identity.disconnected`` - sentinel published when the connection drops or
15
+ stalls past :attr:`DaemonConfig.fallback_trigger_after_seconds`. Payload:
16
+ ``{"handle": str, "reason": str, "reconnect_count": int}``.
17
+
18
+ The subscriber is the only component that talks to the network on the hot
19
+ path; every other component reads from the bus. That separation is deliberate
20
+ so the fallback path (:class:`McpFallbackSubscriber`) can take over without
21
+ the fan-out layer noticing.
22
+
23
+ Reconnection semantics (D-RT9, §6.2 of *Alter-to-Alter Messaging*):
24
+
25
+ * The SSE ``id:`` field from the most recently parsed frame is remembered
26
+ across reconnects and sent back as the ``Last-Event-ID`` HTTP header so the
27
+ DO can replay anything we missed.
28
+ * Transient errors (``httpx.TransportError``, 502/503/504, socket reset) are
29
+ logged at warning level and retried with exponential backoff capped at
30
+ ``MAX_BACKOFF_SECONDS``.
31
+ * Authentication errors (401/403) are logged at error level and retried more
32
+ slowly - the JWT may have been rotated and the daemon does not own
33
+ re-authentication; the operator is expected to run ``alter login``.
34
+ * Cancellation (``asyncio.CancelledError``) propagates so the supervisor can
35
+ shut the component down cleanly.
36
+
37
+ The subscriber never raises out of :meth:`run` - the supervisor's
38
+ exponential-backoff restart machinery is kept as a last-resort safety net for
39
+ truly unexpected exceptions.
40
+ """
41
+
42
+ from __future__ import annotations
43
+
44
+ import asyncio
45
+ import base64
46
+ import json
47
+ import logging
48
+ import random
49
+ import ssl
50
+ import time
51
+ from collections import OrderedDict
52
+ from dataclasses import dataclass, field
53
+ from typing import TYPE_CHECKING
54
+
55
+ import httpx
56
+
57
+ from alter_runtime.config import ConfigError, DaemonConfig
58
+ from alter_runtime.daemon import Component
59
+ from alter_runtime.http_auth import backend_default_headers
60
+ from alter_runtime.subscribers.sse import SSEFrame, parse_sse_frames
61
+
62
+ if TYPE_CHECKING:
63
+ from alter_runtime.config import Session
64
+ from alter_runtime.subscribers.bus import EventBus
65
+
66
+ __all__ = ["DoSseSubscriber"]
67
+
68
+ logger = logging.getLogger("alter_runtime.subscribers.do_sse")
69
+
70
#: Bus topic emitted once each time a connection starts successfully.
TOPIC_CONNECTED = "identity.connected"
#: Bus topic emitted when the connection is lost or the stream stalls.
TOPIC_DISCONNECTED = "identity.disconnected"
#: Bus topic carrying every raw SSE frame.
TOPIC_FRAME = "identity.frame"
#: Bus topic carrying the parsed JSON body of a frame (not published when
#: the payload fails to parse).
TOPIC_EVENT = "identity.event"

#: First retry delay, in seconds, after a transient network error.
BASE_BACKOFF_SECONDS: float = 1.0
#: Ceiling, in seconds, for the exponential back-off.
MAX_BACKOFF_SECONDS: float = 60.0
#: Deliberately slow retry delay for authentication failures - the operator
#: most likely has to run ``alter login`` to repair the session, so there is
#: no point hammering the edge in the meantime.
AUTH_ERROR_BACKOFF_SECONDS: float = 300.0
#: Minimum lifetime, in seconds, for a connection to count as "healthy".
#: After a healthy connection the next reconnect restarts the back-off from
#: scratch; a connection that dies sooner is a flap and the back-off keeps
#: climbing.
STABLE_CONNECTION_SECONDS: float = 30.0

#: How many recently-seen SSE frame IDs are kept for de-duplication. 256 is
#: big enough to soak up an SSE replay burst (a Last-Event-ID resume after a
#: transient drop usually replays the tail of the backlog) and still cheap
#: to hold in memory. Closes runtime/M-3 from
#: pentest-findings-2026-04-15.md.
FRAME_DEDUP_WINDOW: int = 256

#: Maximum age, in seconds, of a JWT ``iat`` claim before the bearer is
#: refused. Pre-T-0 kill-switch: when session.json is ancient it is better
#: to fail loudly than to let a leaked long-lived bearer ferry events into
#: every CC session. The ~30s target referenced in the task description is
#: too tight for alter-cli's current session lifecycle (JWTs live for
#: hours); 24h is the practical middle ground and still catches the
#: stolen-laptop scenario where a dormant session.json is replayed weeks
#: later. The tighter 30s check applies at _frame_ freshness once the DO
#: signer lands.
MAX_BEARER_AGE_SECONDS: float = 24 * 60 * 60
107
+
108
+
109
+ def _build_tls_context() -> ssl.SSLContext:
110
+ """Construct a strict TLS context for the DO SSE client.
111
+
112
+ No ``CERT_NONE``, no hostname-check disable. Explicit construction so
113
+ that any future attempt to weaken the client's TLS posture has to edit
114
+ this function and trigger review, rather than silently flipping a
115
+ kwarg on the httpx.AsyncClient constructor.
116
+ """
117
+ context = ssl.create_default_context()
118
+ context.check_hostname = True
119
+ context.verify_mode = ssl.CERT_REQUIRED
120
+ # Minimum TLS 1.2 - matches Cloudflare's edge minimum. Setting
121
+ # ``minimum_version`` instead of deprecated options= flags keeps us on
122
+ # the modern OpenSSL path.
123
+ context.minimum_version = ssl.TLSVersion.TLSv1_2
124
+ return context
125
+
126
+
127
+ def _peek_jwt_iat(jwt: str) -> float | None:
128
+ """Return the ``iat`` (issued-at) claim of ``jwt`` as a UNIX timestamp.
129
+
130
+ Best-effort parse - no signature verification (the DO does that). We
131
+ only read ``iat`` so we can reject dormant session bearers at start-up.
132
+ Returns ``None`` if the token doesn't carry a readable payload.
133
+ """
134
+ try:
135
+ parts = jwt.split(".")
136
+ if len(parts) < 2:
137
+ return None
138
+ payload_b64 = parts[1]
139
+ # base64url requires padding in fours; pad up before decoding.
140
+ padding = "=" * (-len(payload_b64) % 4)
141
+ payload_bytes = base64.urlsafe_b64decode(payload_b64 + padding)
142
+ payload = json.loads(payload_bytes.decode("utf-8"))
143
+ except (ValueError, json.JSONDecodeError, UnicodeDecodeError):
144
+ return None
145
+ iat = payload.get("iat")
146
+ if isinstance(iat, (int, float)):
147
+ return float(iat)
148
+ return None
149
+
150
+
151
#: Size, in bytes, of a well-formed Ed25519 signature (RFC 8032).
ED25519_SIGNATURE_BYTES: int = 64
#: Size, in bytes, of a well-formed Ed25519 public key (RFC 8032).
ED25519_PUBKEY_BYTES: int = 32
#: Scheme prefix carried by :attr:`DaemonConfig.frame_signature_pubkey` and
#: by the ``pubkey`` field the DO emits. Follows the canonical envelope
#: convention in ``/tmp/alter-unification-canonical-fragment.md``.
ED25519_PUBKEY_PREFIX: str = "ed25519:"
159
+
160
+
161
+ def _b64url_decode(value: str) -> bytes:
162
+ """Decode a base64url-encoded value, tolerating missing padding.
163
+
164
+ Matches the encoding convention used in
165
+ ``cloudflare/workers/handle-alter/src/ed25519.ts`` (``bufToBase64Url``
166
+ strips trailing ``=``). Raises :class:`ValueError` on malformed input
167
+ so the caller can treat the frame as malformed-signature.
168
+ """
169
+ padding = "=" * (-len(value) % 4)
170
+ return base64.urlsafe_b64decode(value + padding)
171
+
172
+
173
+ def _canonical_bytes_without_signature(payload: dict) -> bytes:
174
+ """Return the canonical JSON bytes of ``payload`` with ``signature`` removed.
175
+
176
+ Uses stable key ordering (``sort_keys=True``) and the most compact
177
+ separator set so the output matches the CF Worker's ``canonicalise``
178
+ (``cloudflare/workers/handle-alter/src/ed25519.ts:137-151``). The
179
+ ``signature`` field is excluded before serialisation so both signer
180
+ and verifier compute bytes over the same logical object.
181
+ """
182
+ stripped = {k: v for k, v in payload.items() if k != "signature"}
183
+ return json.dumps(
184
+ stripped,
185
+ sort_keys=True,
186
+ ensure_ascii=False,
187
+ allow_nan=False,
188
+ separators=(",", ":"),
189
+ ).encode("utf-8")
190
+
191
+
192
+ class _FrameSignatureOutcome(str): # noqa: N801 - sentinel-style enum
193
+ """Structured outcome classes emitted by :func:`_verify_frame_signature`.
194
+
195
+ Kept as string constants (not :class:`enum.Enum`) so they can double as
196
+ log fields without extra serialisation and so ``_ConnectionState``
197
+ counter attribute names derive directly from the value.
198
+ """
199
+
200
+ OK = "ok"
201
+ UNSIGNED = "unsigned"
202
+ MALFORMED_SIG = "malformed_sig"
203
+ WRONG_KEY = "wrong_key"
204
+ BAD_SIG = "bad_sig"
205
+
206
+
207
def _normalise_pubkey_b64(value: str) -> str:
    """Return ``value`` without the ``ed25519:`` prefix or trailing ``=`` padding."""
    if value.startswith(ED25519_PUBKEY_PREFIX):
        value = value[len(ED25519_PUBKEY_PREFIX) :]
    return value.rstrip("=")


def _verify_frame_signature(payload: dict, pinned_pubkey_b64: str) -> str:
    """Verify an Ed25519 signature on a parsed SSE frame payload.

    Expected wire contract (the DO emitter MUST adopt this - see follow-up
    note in :attr:`DaemonConfig.require_frame_signature`)::

        {
          "signature": "<base64url>",       # 64-byte Ed25519 signature
          "pubkey": "ed25519:<base64url>",  # 32-byte public key
          "kind": "...",                    # event kind
          "payload": { ... },               # opaque event body
          ...                               # any other top-level fields
        }

    The signature is computed over the canonical (``sort_keys=True``,
    compact-separator) JSON bytes of the object with the ``signature``
    field removed. This matches ``canonicalise()`` in
    ``cloudflare/workers/handle-alter/src/ed25519.ts`` so the DO can
    re-use the existing Web Crypto signer.

    Parameters
    ----------
    payload:
        The parsed JSON object from ``frame.data``. Expected to be a
        ``dict``; anything else is reported as ``unsigned``.
    pinned_pubkey_b64:
        The base64url-encoded 32-byte Ed25519 public key that the daemon
        trusts. An ``ed25519:`` prefix and trailing ``=`` padding are
        tolerated and ignored, so the configured form and the DO's
        unpadded form compare equal. Frames declaring any other
        ``pubkey`` are rejected as ``wrong_key`` - this is the
        self-certifying-frame defence.

    Returns
    -------
    str
        One of the :class:`_FrameSignatureOutcome` constants:
        ``"ok"``, ``"unsigned"``, ``"malformed_sig"``, ``"wrong_key"``, or
        ``"bad_sig"``. Never raises.
    """
    # Lazy import so the cryptography dependency is only loaded when
    # enforcement is actually invoked. Keeps start-up cold cost minimal
    # on systems where require_frame_signature stays False.
    try:
        from cryptography.exceptions import InvalidSignature
        from cryptography.hazmat.primitives.asymmetric.ed25519 import Ed25519PublicKey
    except ImportError:  # pragma: no cover - dependency declared in pyproject.toml
        logger.error(
            "do_sse: `cryptography` package is missing but require_frame_signature=True; "
            "treating all frames as malformed_sig until the dependency is installed."
        )
        return _FrameSignatureOutcome.MALFORMED_SIG

    # Valid JSON that is not an object cannot carry a signature envelope.
    if not isinstance(payload, dict):
        return _FrameSignatureOutcome.UNSIGNED

    signature_b64 = payload.get("signature")
    declared_pubkey = payload.get("pubkey")
    if not isinstance(signature_b64, str) or not isinstance(declared_pubkey, str):
        return _FrameSignatureOutcome.UNSIGNED

    # A non-string pinned key is a configuration bug; report it through
    # the same class used for an undecodable pinned key below.
    if not isinstance(pinned_pubkey_b64, str):
        return _FrameSignatureOutcome.MALFORMED_SIG

    # Normalise both sides (prefix and padding) before comparing so that
    # the same key written in two equivalent spellings is not rejected.
    pinned_b64 = _normalise_pubkey_b64(pinned_pubkey_b64)
    if _normalise_pubkey_b64(declared_pubkey) != pinned_b64:
        return _FrameSignatureOutcome.WRONG_KEY

    # Decode the signature; malformed base64 or wrong length -> drop.
    try:
        signature_bytes = _b64url_decode(signature_b64)
    except Exception:  # noqa: BLE001 - base64 can raise binascii.Error
        return _FrameSignatureOutcome.MALFORMED_SIG
    if len(signature_bytes) != ED25519_SIGNATURE_BYTES:
        return _FrameSignatureOutcome.MALFORMED_SIG

    # Decode + load the pinned pubkey. Malformed pinned key is a
    # configuration bug; surface it as malformed_sig so the daemon's
    # structured log shows the failure class clearly without taking down
    # the subscriber.
    try:
        pubkey_bytes = _b64url_decode(pinned_b64)
        if len(pubkey_bytes) != ED25519_PUBKEY_BYTES:
            return _FrameSignatureOutcome.MALFORMED_SIG
        pubkey = Ed25519PublicKey.from_public_bytes(pubkey_bytes)
    except Exception:  # noqa: BLE001
        return _FrameSignatureOutcome.MALFORMED_SIG

    # Canonicalisation can fail on attacker-controlled content: NaN or
    # Infinity (``allow_nan=False`` raises ValueError), lone surrogates
    # (UnicodeEncodeError on encode) or non-serialisable values. Such a
    # frame cannot carry a valid signature, so classify it as bad_sig
    # instead of letting the exception escape.
    try:
        canonical = _canonical_bytes_without_signature(payload)
    except (ValueError, TypeError, RecursionError):
        return _FrameSignatureOutcome.BAD_SIG

    try:
        pubkey.verify(signature_bytes, canonical)
    except InvalidSignature:
        return _FrameSignatureOutcome.BAD_SIG
    except Exception:  # noqa: BLE001 - defensive; crypto lib shouldn't raise here
        return _FrameSignatureOutcome.BAD_SIG
    return _FrameSignatureOutcome.OK
301
+
302
+
303
@dataclass
class _ConnectionState:
    """Mutable bookkeeping record for the DO SSE connection.

    It lives on the subscriber instance, which lets tests assert on the
    reconnect count and on the current ``Last-Event-ID`` checkpoint.
    """

    # Number of reconnects recorded so far.
    reconnect_count: int = 0
    # Most recent SSE ``id:`` value, or None when none has been recorded.
    last_event_id: str | None = None
    # Connection flag; starts out False.
    was_connected: bool = False
    # Current retry delay in seconds.
    backoff: float = BASE_BACKOFF_SECONDS
    # Monotonic timestamp of the latest successful connect; None while not
    # connected.
    connected_at: float | None = None
    # Free-form record kept for debugging and tests.
    history: list[str] = field(default_factory=list)

    # --- Frame-signature enforcement counters (C-1 closure) -------------
    # Frames accepted by Ed25519 verification (``require_frame_signature``
    # is ``True`` and the signature validated against the pinned pubkey).
    frames_verified_ok: int = 0
    # Frames discarded because ``signature`` (or ``pubkey``) was missing
    # while enforcement is on.
    frames_dropped_unsigned: int = 0
    # Frames discarded because the declared signature would not decode or
    # was not 64 bytes long (the Ed25519 signature length).
    frames_dropped_malformed_sig: int = 0
    # Frames discarded because their declared ``pubkey`` is not the
    # daemon's pinned :attr:`DaemonConfig.frame_signature_pubkey`.
    frames_dropped_wrong_key: int = 0
    # Frames discarded because Ed25519 verification against the pinned
    # pubkey failed (the signature had the right shape but the
    # cryptographic check did not pass).
    frames_dropped_bad_sig: int = 0
    # Frames that arrived unsigned while enforcement is still ``False``.
    # They are not dropped - only counted, so operators can see the
    # baseline signature-coverage deficit before turning enforcement on.
    frames_warned_unsigned: int = 0
342
+
343
+
344
+ class DoSseSubscriber(Component):
345
+ """Tails the per-handle DO SSE stream and publishes frames onto the bus.
346
+
347
+ Parameters
348
+ ----------
349
+ config:
350
+ Loaded :class:`DaemonConfig`. Used for ``do_sse_endpoint`` and the
351
+ fallback trigger threshold.
352
+ session:
353
+ Authenticated alter-cli :class:`Session`. Supplies ``handle`` and
354
+ ``jwt``.
355
+ bus:
356
+ The shared :class:`EventBus` instance.
357
+ http_client:
358
+ Optional override for the HTTP client. Tests pass an
359
+ ``httpx.AsyncClient(transport=httpx.MockTransport(...))`` here so they
360
+ don't need the network.
361
+ """
362
+
363
+ name = "do_sse"
364
+
365
def __init__(
    self,
    config: DaemonConfig,
    session: Session,
    bus: EventBus,
    *,
    http_client: httpx.AsyncClient | None = None,
) -> None:
    """Check start-up invariants, then initialise subscriber state.

    Parameters
    ----------
    config:
        Daemon configuration; ``do_sse_endpoint``,
        ``require_frame_signature`` and ``frame_signature_pubkey`` are
        read here.
    session:
        Session object; ``jwt`` and ``jwt_expires_at`` are read here.
    bus:
        Event bus stored for later publishing.
    http_client:
        Optional pre-built client. When omitted the subscriber records
        that it owns (and must later close) the client it creates.

    Raises
    ------
    ConfigError
        When the session has no bearer JWT, the endpoint is not
        ``https://``, the bearer's ``iat`` is older than
        ``MAX_BEARER_AGE_SECONDS``, or frame-signature enforcement is on
        without a well-formed pinned Ed25519 public key.
    """
    # --- C-7 kill-switch: bearer + scheme invariants ---------------
    # If there's no session bearer, the daemon would happily open an
    # unauthenticated SSE stream - and the DO would (correctly) bounce
    # it, but we'd still log + reconnect forever. Refuse at construct
    # time instead so operator sees a clear error and runs ``alter login``.
    if not getattr(session, "jwt", None):
        raise ConfigError(
            "do_sse: session has no bearer JWT - run `alter login` before starting the runtime."
        )
    # Defence-in-depth: reject non-https even though config.load()
    # already checks. The subscriber is the hot-path component; a
    # weakened config that bypasses load() should still not be able
    # to drive an http:// stream.
    endpoint = getattr(config, "do_sse_endpoint", "") or ""
    if not endpoint.lower().startswith("https://"):
        raise ConfigError(
            f"do_sse: do_sse_endpoint={endpoint!r} is not https:// - "
            "refusing to start (pre-T-0 kill-switch)."
        )
    # Freshness sanity check - if the bearer's ``iat`` is older than
    # MAX_BEARER_AGE_SECONDS, refuse to use it. Logged-not-raised when
    # ``iat`` is absent because some deployments mint opaque tokens.
    iat = _peek_jwt_iat(session.jwt)
    if iat is not None:
        age = time.time() - iat
        if age > MAX_BEARER_AGE_SECONDS:
            raise ConfigError(
                f"do_sse: bearer JWT iat is {age:.0f}s old (>"
                f"{MAX_BEARER_AGE_SECONDS:.0f}s) - refusing to use a "
                "dormant session. Run `alter login` to refresh."
            )
    elif not getattr(session, "jwt_expires_at", None):
        # No iat *and* no expiry - opaque or stripped token. Log, don't
        # crash; follow-up PR adds mandatory expiry metadata end-to-end.
        logger.warning(
            "do_sse: bearer has no parseable iat and no jwt_expires_at; "
            "proceeding but freshness cannot be verified."
        )

    # --- C-1 kill-switch: enforcement requires a pinned trust root ----
    # When frame-signature enforcement is on, we MUST have a pinned
    # Ed25519 public key to verify against - otherwise the verifier
    # would trust whatever ``pubkey`` the frame itself declared, which
    # is a self-certifying (i.e. useless) check. Refuse to construct so
    # the operator sees the misconfiguration before the daemon fans out
    # unverified events. Interim pin is the env var
    # ``ALTER_RUNTIME_FRAME_SIG_PUBKEY``; follow-up PR replaces this
    # with a DO ``/state`` fetch on start-up.
    if getattr(config, "require_frame_signature", False):
        pinned = getattr(config, "frame_signature_pubkey", None)
        if not pinned:
            raise ConfigError(
                "do_sse: require_frame_signature=True but "
                "frame_signature_pubkey is unset - refusing to start. "
                "Set ALTER_RUNTIME_FRAME_SIG_PUBKEY to the DO's pinned "
                "Ed25519 public key (`ed25519:<base64url>`)."
            )
        # M-E-2 (pentest-pass2 §6.5): validate pubkey format at construct
        # time so operators see a clear ConfigError instead of silent
        # frame-drops caused by malformed_sig later.
        import re as _re

        # 32-byte Ed25519 public key encodes to 43 base64url chars
        # without padding (ceil(32 * 4 / 3) = 43) or 44 with an
        # ``=`` pad byte. The pre-pentest regex (`+=*`) accepted
        # arbitrarily-short keys that decoded to <32 bytes - those
        # were caught further downstream but the failure was a
        # silent ``frames_dropped_malformed_sig`` deficit instead
        # of a clear refusal-to-start. Tighten the shape gate so
        # the operator sees the misconfiguration at construct time.
        #
        # ``fullmatch`` is used instead of ``match`` + ``$`` because
        # ``$`` also matches just before a trailing newline: a key read
        # from a file or env var with a stray ``\n`` would pass the gate
        # and then never compare equal to the key declared by frames.
        if not _re.fullmatch(r"ed25519:[A-Za-z0-9_-]{43,}=*", pinned):
            raise ConfigError(
                f"do_sse: frame_signature_pubkey={pinned!r} is not in the "
                "expected `ed25519:<base64url>` format - refusing to start."
            )
        _raw_pubkey = pinned[len(ED25519_PUBKEY_PREFIX) :]
        _padding = "=" * (-len(_raw_pubkey) % 4)
        try:
            _decoded = base64.urlsafe_b64decode(_raw_pubkey + _padding)
        except Exception as _exc:
            raise ConfigError(
                f"do_sse: frame_signature_pubkey base64url decode failed: {_exc}"
            ) from _exc
        if len(_decoded) != ED25519_PUBKEY_BYTES:
            raise ConfigError(
                f"do_sse: frame_signature_pubkey decoded to {len(_decoded)} bytes; "
                f"Ed25519 public keys must be exactly {ED25519_PUBKEY_BYTES} bytes."
            )

    self._config = config
    self._session = session
    self._bus = bus
    self._http_client = http_client
    self._owns_client = http_client is None
    self._stop_event = asyncio.Event()
    # Read the module-level constant at __init__ time (not at class-def
    # time) so tests that monkeypatch the constant see the patched value.
    self._state = _ConnectionState(backoff=BASE_BACKOFF_SECONDS)
    # Pass-3 B1 (2026-04-14): the DO SSE gate requires a
    # scope=alter_events.subscribe capability JWT, not session.jwt.
    # Cache (token, expires_epoch); re-mint within 30s of expiry.
    self._subscribe_cap: tuple[str, float] | None = None
    # Bounded LRU of recently-seen frame IDs - prevents a DO replay from
    # being fanned out to every local surface twice. Closes runtime/M-3.
    self._seen_frame_ids: OrderedDict[str, None] = OrderedDict()
478
+
479
+ # ------------------------------------------------------------------
480
+ # Component lifecycle
481
+ # ------------------------------------------------------------------
482
+
483
async def run(self) -> None:
    """Long-lived reconnect loop. Never raises except on cancellation.

    Each iteration opens one SSE connection. Every way a connection can
    end - clean EOF, HTTP status error, transport error, unexpected
    exception - is followed by a disconnect notification and a retry
    delay before the next attempt. The loop exits once the stop event
    is set; a client created here is closed on the way out.
    """
    url = self._config.do_sse_endpoint.format(handle=self._session.handle)
    logger.info(
        "do_sse starting handle=%s url=%s",
        self._session.handle,
        url,
    )

    # Owned client uses a strict TLS context - no CERT_NONE, hostname
    # verification on, TLS 1.2+. Tests can still inject a MockTransport
    # via ``http_client=`` to bypass the network entirely.
    # Backend default headers — CF Access service-token bundle
    # (D-SUBSTRATE-UNIFIED-1 §2.3 Option A) merged with the canonical
    # ``X-Alter-Client-*`` identity headers (D-MIN-VERSION-FLOOR-1 §3).
    # The X-Alter-* headers are required on every authenticated
    # backend call so the server-side floor middleware can identify
    # the daemon; without them the middleware returns HTTP 426
    # ``client_identification_required``.
    client = self._http_client or httpx.AsyncClient(
        timeout=httpx.Timeout(
            connect=10.0,
            read=None,  # SSE holds the connection open indefinitely
            write=10.0,
            pool=10.0,
        ),
        verify=_build_tls_context(),
        headers=backend_default_headers(),
    )

    try:
        while not self._stop_event.is_set():
            try:
                await self._run_one_connection(client, url)
                # Clean EOF: treat as disconnect + backoff. Without the
                # delay, a server that closes the stream immediately
                # after a 200 would be re-dialled in a tight loop.
                await self._on_disconnect("stream_closed")
                if not self._stop_event.is_set():
                    await self._backoff_then_retry()
            except asyncio.CancelledError:
                raise
            except httpx.HTTPStatusError as exc:
                await self._handle_http_status_error(exc)
            except (httpx.TransportError, httpx.RequestError) as exc:
                await self._on_disconnect(f"transport_error: {type(exc).__name__}")
                await self._backoff_then_retry()
            except Exception as exc:
                logger.exception("do_sse unexpected error: %s", exc)
                await self._on_disconnect(f"unexpected: {type(exc).__name__}")
                await self._backoff_then_retry()
    finally:
        if self._owns_client:
            try:
                await client.aclose()
            except Exception:  # pragma: no cover - defensive
                # Best-effort close; keep shutting down but leave a trace.
                logger.debug("do_sse client close failed", exc_info=True)
        logger.info("do_sse stopped handle=%s", self._session.handle)
537
+
538
async def stop(self) -> None:
    """Request a cooperative shutdown by setting the stop event.

    The reconnect loop checks this event before each new connection
    attempt and exits once it is set.
    """
    stop_signal = self._stop_event
    stop_signal.set()
541
+
542
+ # ------------------------------------------------------------------
543
+ # Connection
544
+ # ------------------------------------------------------------------
545
+
546
async def _run_one_connection(self, client: httpx.AsyncClient, url: str) -> None:
    """Open one SSE stream and dispatch frames until it closes or stalls.

    Parameters
    ----------
    client:
        The HTTP client used for both the capability mint and the stream.
    url:
        Fully-formatted stream URL for this handle.

    Raises
    ------
    httpx.HTTPStatusError
        When the stream responds with anything other than HTTP 200, so
        the caller can choose a back-off policy from the status code.
    """
    cap = await self._ensure_subscribe_cap(client)
    headers = {
        "Accept": "text/event-stream",
        "Authorization": f"Bearer {cap}",
        "Cache-Control": "no-cache",
        "User-Agent": f"alter-runtime/{_package_version()}",
    }
    # Resume from the last recorded event id, when there is one.
    if self._state.last_event_id is not None:
        headers["Last-Event-ID"] = self._state.last_event_id

    logger.debug(
        "do_sse connecting url=%s last_event_id=%s",
        url,
        self._state.last_event_id,
    )

    async with client.stream("GET", url, headers=headers) as response:
        if response.status_code != 200:
            if response.status_code in (401, 403):
                # The capability we just presented was rejected; drop it
                # from the cache so the next attempt mints a fresh one
                # instead of replaying a token known to be refused.
                self._subscribe_cap = None
            # Read a small body preview for error logs, then surface the
            # status so the outer loop can decide on a backoff policy.
            try:
                body_preview = (await response.aread())[:256].decode("utf-8", errors="replace")
            except Exception:
                body_preview = "<body unreadable>"
            logger.warning(
                "do_sse non-200 status=%d url=%s body=%r",
                response.status_code,
                url,
                body_preview,
            )
            raise httpx.HTTPStatusError(
                f"do_sse got HTTP {response.status_code}",
                request=response.request,
                response=response,
            )

        await self._on_connect()
        await self._consume_stream(response)
586
+
587
+ async def _ensure_subscribe_cap(self, client: httpx.AsyncClient) -> str:
588
+ """Return a cached or freshly-minted subscribe-scope capability JWT.
589
+
590
+ Pentest Pass-3 B1 (2026-04-14): the DO (``HandleAlterDO.handleStream``)
591
+ requires a capability with scope ``alter_events.subscribe`` and
592
+ ``aud_recipient == sub == <handle>``. We acquire it from the alter-api
593
+ backend using ``session.jwt`` as bearer, cache until 30s before
594
+ expiry, and re-mint on each reconnect afterwards.
595
+ """
596
+ now = time.time()
597
+ if self._subscribe_cap is not None:
598
+ token, exp = self._subscribe_cap
599
+ if exp - now > 30.0:
600
+ return token
601
+
602
+ mint_url = f"{self._session.api.rstrip('/')}/api/v1/messaging/subscribe-capability"
603
+ headers = {
604
+ "Authorization": f"Bearer {self._session.jwt}",
605
+ "Accept": "application/json",
606
+ "User-Agent": f"alter-runtime/{_package_version()}",
607
+ }
608
+ # Short timeout - a 5xx here drops into the outer reconnect/backoff
609
+ # path in run() via httpx.HTTPStatusError, which is the same
610
+ # degradation channel as any other auth failure.
611
+ resp = await client.post(mint_url, headers=headers, timeout=10.0)
612
+ resp.raise_for_status()
613
+ body = resp.json()
614
+ token = body["capability"]
615
+ exp_iso = body.get("expires_at") or ""
616
+ try:
617
+ # datetime.fromisoformat accepts +HH:MM offsets; the backend
618
+ # returns UTC with 'Z' in FastAPI's default JSON encoder.
619
+ from datetime import datetime as _dt
620
+
621
+ exp_dt = _dt.fromisoformat(exp_iso.replace("Z", "+00:00"))
622
+ exp_epoch = exp_dt.timestamp()
623
+ except Exception:
624
+ # Conservative fallback if the timestamp can't be parsed.
625
+ # Pass-4 H-1 coordination (2026-04-14): worker ceiling is 60s
626
+ # (HandleAlterDO SUBSCRIBE_CAP_MAX_LIFETIME_SECONDS); backend
627
+ # mints with SUBSCRIBE_CAPABILITY_TTL_SECONDS clamped to 60s.
628
+ # Assuming 300s here caused the client to hold an already-
629
+ # rejected cap for 235s before reconnect - set to 60s to
630
+ # match the shipped ceiling.
631
+ exp_epoch = now + 60.0
632
+ self._subscribe_cap = (token, exp_epoch)
633
+ logger.debug(
634
+ "do_sse minted subscribe cap handle=%s exp=%.0f",
635
+ self._session.handle,
636
+ exp_epoch,
637
+ )
638
+ return token
639
+
640
+ async def _consume_stream(self, response: httpx.Response) -> None:
641
+ """Read chunks from ``response``, parse SSE, dispatch each frame.
642
+
643
+ Uses a stall watchdog: if no bytes arrive for
644
+ ``fallback_trigger_after_seconds``, the iteration unblocks and the
645
+ caller treats this as a disconnect (fallback will pick up).
646
+ """
647
+ buffer = ""
648
+ stall_seconds = max(self._config.fallback_trigger_after_seconds, 1.0)
649
+
650
+ # ``aiter_text`` yields decoded str chunks as they arrive. We wrap each
651
+ # chunk fetch in a timeout so a stalled keepalive doesn't wedge us.
652
+ iterator = response.aiter_text()
653
+ while not self._stop_event.is_set():
654
+ try:
655
+ chunk = await asyncio.wait_for(iterator.__anext__(), timeout=stall_seconds)
656
+ except (TimeoutError, asyncio.TimeoutError):
657
+ logger.warning(
658
+ "do_sse stream stalled > %.1fs - treating as disconnect", stall_seconds
659
+ )
660
+ return
661
+ except StopAsyncIteration:
662
+ return
663
+
664
+ if not chunk:
665
+ continue
666
+ buffer += chunk
667
+ frames, buffer = parse_sse_frames(buffer)
668
+ for frame in frames:
669
+ await self._dispatch_frame(frame)
670
+
671
+ # ------------------------------------------------------------------
672
+ # Frame dispatch + connection state publishing
673
+ # ------------------------------------------------------------------
674
+
675
async def _dispatch_frame(self, frame: SSEFrame) -> None:
    """Deduplicate, optionally verify, then publish one parsed SSE frame.

    Order of operations:

    1. Frames carrying an id are checked against the recent-id window;
       a repeat is dropped, otherwise the id is recorded and becomes the
       new ``last_event_id``.
    2. The payload is decoded once; only a JSON object counts as parsed.
    3. With ``require_frame_signature`` on, a frame that is not a JSON
       object or that fails verification against the pinned key is
       dropped, logged, and counted. With it off, frames lacking a
       ``"signature"`` key are only counted.
    4. Surviving frames go to ``TOPIC_FRAME``; parsed payloads are also
       published to ``TOPIC_EVENT``.
    """
    state = self._state
    frame_id = frame.id

    if frame_id is not None:
        # Id-less frames bypass the dedup window entirely.
        if frame_id in self._seen_frame_ids:
            logger.debug("do_sse duplicate frame id=%s - dropping", frame_id)
            return
        self._seen_frame_ids[frame_id] = None
        if len(self._seen_frame_ids) > FRAME_DEDUP_WINDOW:
            # Evict the oldest id to keep the window bounded.
            self._seen_frame_ids.popitem(last=False)
        state.last_event_id = frame_id

    # Decode once; the result feeds both the signature check and the
    # TOPIC_EVENT publish at the end.
    try:
        decoded = frame.json
    except (ValueError, json.JSONDecodeError):
        decoded = None
    payload: dict | None = decoded if isinstance(decoded, dict) else None

    if not self._config.require_frame_signature:
        # Enforcement off: only tally the coverage gap, never drop.
        if payload is None or "signature" not in payload:
            state.frames_warned_unsigned += 1
    elif payload is None:
        # Enforcement on and nothing signable: drop rather than let an
        # opaque payload slip past the check.
        state.frames_dropped_unsigned += 1
        logger.warning(
            "do_sse dropping non-dict frame under signature enforcement id=%s event=%s",
            frame_id,
            frame.event,
        )
        return
    else:
        pinned_key = self._config.frame_signature_pubkey or ""
        if pinned_key.startswith(ED25519_PUBKEY_PREFIX):
            pinned_key = pinned_key[len(ED25519_PUBKEY_PREFIX) :]
        outcome = _verify_frame_signature(payload, pinned_key)
        if outcome == _FrameSignatureOutcome.OK:
            state.frames_verified_ok += 1
        else:
            # Bump the per-outcome drop counter, log at WARNING, drop.
            counter_name = f"frames_dropped_{outcome}"
            setattr(state, counter_name, getattr(state, counter_name, 0) + 1)
            logger.warning(
                "do_sse dropping frame under signature enforcement id=%s event=%s outcome=%s",
                frame_id,
                frame.event,
                outcome,
            )
            return

    await self._bus.publish(TOPIC_FRAME, frame)

    if payload is None:
        logger.debug("do_sse frame not JSON event=%s", frame.event)
        return
    await self._bus.publish(TOPIC_EVENT, payload)
756
+
757
async def _on_connect(self) -> None:
    """Record a successful connect and announce it on ``TOPIC_CONNECTED``.

    Marks the state as connected, stamps the monotonic connect time,
    appends ``"connect"`` to the history, then publishes the handle and
    the current reconnect count.
    """
    state = self._state
    state.was_connected = True
    state.connected_at = time.monotonic()
    state.history.append("connect")

    handle = self._session.handle
    logger.info(
        "do_sse connected handle=%s reconnect_count=%d",
        handle,
        state.reconnect_count,
    )

    announcement = {
        "handle": handle,
        "reconnect_count": state.reconnect_count,
    }
    await self._bus.publish(TOPIC_CONNECTED, announcement)
774
+
775
+ async def _on_disconnect(self, reason: str) -> None:
776
+ """Publish an ``identity.disconnected`` sentinel (only if we were connected)."""
777
+ self._state.history.append(f"disconnect:{reason}")
778
+ # Don't emit spurious disconnect events if we never managed to connect
779
+ # in the first place - the fallback listens for transitions from
780
+ # connected→disconnected; a cold start is handled separately by the
781
+ # config-driven start-up delay in McpFallbackSubscriber.
782
+ if not self._state.was_connected:
783
+ return
784
+ self._state.was_connected = False
785
+ dwell = time.monotonic() - (self._state.connected_at or 0.0)
786
+ self._state.connected_at = None
787
+ if dwell >= STABLE_CONNECTION_SECONDS:
788
+ # Connection was healthy; treat the next reconnect as a fresh start.
789
+ self._state.backoff = BASE_BACKOFF_SECONDS
790
+ # else: short-lived connection - a flap. Leave self._state.backoff
791
+ # climbing so _backoff_then_retry keeps escalating toward MAX_BACKOFF.
792
+ self._state.reconnect_count += 1
793
+ logger.warning(
794
+ "do_sse disconnected handle=%s reason=%s reconnect_count=%d",
795
+ self._session.handle,
796
+ reason,
797
+ self._state.reconnect_count,
798
+ )
799
+ await self._bus.publish(
800
+ TOPIC_DISCONNECTED,
801
+ {
802
+ "handle": self._session.handle,
803
+ "reason": reason,
804
+ "reconnect_count": self._state.reconnect_count,
805
+ },
806
+ )
807
+
808
+ # ------------------------------------------------------------------
809
+ # Error handling + backoff
810
+ # ------------------------------------------------------------------
811
+
812
+ async def _handle_http_status_error(self, exc: httpx.HTTPStatusError) -> None:
813
+ status = exc.response.status_code if exc.response is not None else 0
814
+ await self._on_disconnect(f"http_status:{status}")
815
+ if status in (401, 403):
816
+ logger.error(
817
+ "do_sse auth failure status=%d - sleeping %.0fs; run `alter login` to refresh",
818
+ status,
819
+ AUTH_ERROR_BACKOFF_SECONDS,
820
+ )
821
+ await self._sleep_interruptible(AUTH_ERROR_BACKOFF_SECONDS)
822
+ return
823
+ await self._backoff_then_retry()
824
+
825
async def _backoff_then_retry(self) -> None:
    """Wait a random slice of the current backoff ceiling, then raise it.

    The sleep is drawn uniformly from ``[0, ceiling]`` ("Full Jitter").
    The stored ceiling is doubled for the next call, capped at
    ``MAX_BACKOFF_SECONDS``. The wait itself is interruptible by the
    stop event.
    """
    state = self._state
    current_ceiling = state.backoff
    sleep_for = random.uniform(0.0, current_ceiling)

    # Raise the ceiling before sleeping so the next failure sees it.
    state.backoff = min(current_ceiling * 2, MAX_BACKOFF_SECONDS)

    logger.info("do_sse reconnecting in %.1fs (ceiling %.1fs)", sleep_for, current_ceiling)
    await self._sleep_interruptible(sleep_for)
839
+
840
+ async def _sleep_interruptible(self, seconds: float) -> None:
841
+ """Wait ``seconds`` or until ``stop()`` is called, whichever is first."""
842
+ try:
843
+ await asyncio.wait_for(self._stop_event.wait(), timeout=seconds)
844
+ except (TimeoutError, asyncio.TimeoutError):
845
+ return
846
+
847
+ # ------------------------------------------------------------------
848
+ # Test introspection
849
+ # ------------------------------------------------------------------
850
+
851
@property
def state(self) -> _ConnectionState:
    """Expose the live connection-state object (read by tests).

    The same instance the subscriber mutates is returned, not a copy.
    """
    return self._state
855
+
856
+
857
+ def _package_version() -> str:
858
+ """Best-effort version string for the User-Agent header."""
859
+ try:
860
+ from alter_runtime import __version__
861
+
862
+ return __version__
863
+ except Exception: # pragma: no cover
864
+ return "unknown"