alter-runtime 0.3.0__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- alter_runtime/__init__.py +11 -0
- alter_runtime/adapters/__init__.py +19 -0
- alter_runtime/adapters/claude_jsonl_watcher.py +545 -0
- alter_runtime/adapters/git_watcher.py +457 -0
- alter_runtime/adapters/household/__init__.py +29 -0
- alter_runtime/adapters/household/_base.py +138 -0
- alter_runtime/adapters/household/compost/__init__.py +17 -0
- alter_runtime/adapters/household/compost/adapter.py +81 -0
- alter_runtime/adapters/household/compost/storage.py +75 -0
- alter_runtime/adapters/household/compost/tests/__init__.py +0 -0
- alter_runtime/adapters/household/compost/tests/test_adapter.py +62 -0
- alter_runtime/adapters/household/compost/tests/test_storage.py +23 -0
- alter_runtime/adapters/household/compost/tests/test_traits.py +38 -0
- alter_runtime/adapters/household/compost/traits.py +79 -0
- alter_runtime/adapters/household/self_hoster/__init__.py +30 -0
- alter_runtime/adapters/household/self_hoster/adapter.py +248 -0
- alter_runtime/adapters/household/self_hoster/storage.py +83 -0
- alter_runtime/adapters/household/self_hoster/tests/__init__.py +0 -0
- alter_runtime/adapters/household/self_hoster/tests/test_adapter.py +216 -0
- alter_runtime/adapters/household/self_hoster/tests/test_storage.py +25 -0
- alter_runtime/adapters/household/self_hoster/tests/test_traits.py +55 -0
- alter_runtime/adapters/household/self_hoster/traits.py +105 -0
- alter_runtime/adapters/household/tapo_ecosystem/__init__.py +22 -0
- alter_runtime/adapters/household/tapo_ecosystem/adapter.py +98 -0
- alter_runtime/adapters/household/tapo_ecosystem/storage.py +95 -0
- alter_runtime/adapters/household/tapo_ecosystem/tests/__init__.py +0 -0
- alter_runtime/adapters/household/tapo_ecosystem/tests/test_adapter.py +55 -0
- alter_runtime/adapters/household/tapo_ecosystem/tests/test_storage.py +28 -0
- alter_runtime/adapters/household/tapo_ecosystem/tests/test_traits.py +45 -0
- alter_runtime/adapters/household/tapo_ecosystem/traits.py +97 -0
- alter_runtime/adapters/household/workshop_tools/__init__.py +25 -0
- alter_runtime/adapters/household/workshop_tools/adapter.py +77 -0
- alter_runtime/adapters/household/workshop_tools/storage.py +92 -0
- alter_runtime/adapters/household/workshop_tools/tests/__init__.py +0 -0
- alter_runtime/adapters/household/workshop_tools/tests/test_adapter.py +48 -0
- alter_runtime/adapters/household/workshop_tools/tests/test_storage.py +26 -0
- alter_runtime/adapters/household/workshop_tools/tests/test_traits.py +45 -0
- alter_runtime/adapters/household/workshop_tools/traits.py +95 -0
- alter_runtime/adapters/worktree_watcher.py +378 -0
- alter_runtime/atlas/__init__.py +48 -0
- alter_runtime/atlas/base.py +102 -0
- alter_runtime/atlas/ledger.py +196 -0
- alter_runtime/atlas/observations.py +136 -0
- alter_runtime/atlas/schema.py +106 -0
- alter_runtime/cap_cache.py +392 -0
- alter_runtime/cli.py +517 -0
- alter_runtime/clients/__init__.py +0 -0
- alter_runtime/clients/token_usage_client.py +273 -0
- alter_runtime/config.py +648 -0
- alter_runtime/consent.py +425 -0
- alter_runtime/daemon.py +518 -0
- alter_runtime/floor_loop.py +335 -0
- alter_runtime/floor_preflight.py +734 -0
- alter_runtime/http_auth.py +173 -0
- alter_runtime/notifiers/__init__.py +18 -0
- alter_runtime/notifiers/desktop.py +321 -0
- alter_runtime/sdk/__init__.py +12 -0
- alter_runtime/sdk/client.py +399 -0
- alter_runtime/service_install.py +616 -0
- alter_runtime/services/__init__.py +59 -0
- alter_runtime/services/launchd/com.alter.runtime.plist.in +90 -0
- alter_runtime/services/systemd/alter-runtime.service.in +74 -0
- alter_runtime/services/systemd/cf-access-env.conf.in +29 -0
- alter_runtime/sockets/__init__.py +20 -0
- alter_runtime/sockets/dbus.py +272 -0
- alter_runtime/sockets/unix.py +702 -0
- alter_runtime/subscribers/__init__.py +58 -0
- alter_runtime/subscribers/active_sessions_cron_emitter.py +313 -0
- alter_runtime/subscribers/active_sessions_do_publisher.py +1159 -0
- alter_runtime/subscribers/active_sessions_gc.py +432 -0
- alter_runtime/subscribers/active_sessions_writer.py +446 -0
- alter_runtime/subscribers/adapters_writer.py +415 -0
- alter_runtime/subscribers/agent_frames.py +461 -0
- alter_runtime/subscribers/bus.py +188 -0
- alter_runtime/subscribers/cache_writer.py +347 -0
- alter_runtime/subscribers/ceremony_echo.py +290 -0
- alter_runtime/subscribers/do_sse.py +864 -0
- alter_runtime/subscribers/ebpf.py +506 -0
- alter_runtime/subscribers/inbox_writer.py +469 -0
- alter_runtime/subscribers/mcp_fallback.py +391 -0
- alter_runtime/subscribers/presence_writer.py +426 -0
- alter_runtime/subscribers/session_presence.py +467 -0
- alter_runtime/subscribers/sse.py +125 -0
- alter_runtime/subscribers/weave_intent_writer.py +608 -0
- alter_runtime/update_loop.py +519 -0
- alter_runtime/weave/__init__.py +21 -0
- alter_runtime/weave/resolver.py +544 -0
- alter_runtime-0.3.0.dist-info/METADATA +289 -0
- alter_runtime-0.3.0.dist-info/RECORD +92 -0
- alter_runtime-0.3.0.dist-info/WHEEL +4 -0
- alter_runtime-0.3.0.dist-info/entry_points.txt +2 -0
- alter_runtime-0.3.0.dist-info/licenses/LICENSE +190 -0
|
@@ -0,0 +1,1159 @@
|
|
|
1
|
+
"""ActiveSessionsDoPublisher - tails the JSONL and POSTs to the per-handle DO.
|
|
2
|
+
|
|
3
|
+
Wave C of D-COORD-D2. Companion publisher to
|
|
4
|
+
:class:`ActiveSessionsWriter` (the local-disk writer) and
|
|
5
|
+
:class:`ActiveSessionsGc` (the idle/terminated sweeper).
|
|
6
|
+
|
|
7
|
+
The writer + GC pass already produce ``session_started`` /
|
|
8
|
+
``session_heartbeat`` / ``session_ended`` envelopes into
|
|
9
|
+
``~/.local/share/alter-runtime/active-sessions.jsonl``. This component
|
|
10
|
+
tails that file and republishes each new envelope to the per-``~handle``
|
|
11
|
+
Cloudflare Durable Object at
|
|
12
|
+
``${do_publish_url}/events/{handle}/sessions/ingest``, so that
|
|
13
|
+
cross-host visibility (D-COORD-D1 Wave B) and cross-tool fan-out (Codex,
|
|
14
|
+
Cursor, alter-cli, android, widget) get a single canonical SSE stream
|
|
15
|
+
at ``/events/{handle}/sessions``.
|
|
16
|
+
|
|
17
|
+
Design contract per D-COORD-D2 §7 (amendment) + the Wave C brief:
|
|
18
|
+
|
|
19
|
+
* **Tail, don't re-read.** The publisher maintains a byte-offset
|
|
20
|
+
checkpoint at ``${XDG_STATE_HOME}/alter-runtime/active-sessions-publisher.pos``
|
|
21
|
+
so each tick only reads the bytes appended since the last successful
|
|
22
|
+
POST. This matches the GC pass's file-position pattern at
|
|
23
|
+
:class:`ActiveSessionsGc` (file-mediated, no bus coupling).
|
|
24
|
+
* **Filter contract: publish ALL envelopes.** ``session_started`` /
|
|
25
|
+
``session_heartbeat`` / ``session_ended`` ALL get POSTed. The DO is
|
|
26
|
+
responsible for filtering ``session_ended`` out of live SSE - that's
|
|
27
|
+
the Worker agent's contract. Do not filter on the publisher side.
|
|
28
|
+
* **Idempotent on failure.** A failed POST does NOT advance the
|
|
29
|
+
offset; the next tick re-reads from the same start. Max 3 attempts
|
|
30
|
+
per record (with exponential backoff between ticks); after 3
|
|
31
|
+
failures the record is skipped with a structured log line and the
|
|
32
|
+
offset advances past it, so a single poison-pill record cannot stall
|
|
33
|
+
the entire tail.
|
|
34
|
+
* **Local-first.** The writer's append-only-on-disk path is the source
|
|
35
|
+
of truth; the DO is downstream eventual consistency. The publisher
|
|
36
|
+
never short-circuits the disk write - even when the DO is
|
|
37
|
+
unreachable, envelopes still land on disk for the next tail attempt.
|
|
38
|
+
|
|
39
|
+
Auth flow per cycle mirrors :class:`SessionPresenceWriter` in shape
|
|
40
|
+
but uses the **handle-alter realm** mint endpoint - the publisher
|
|
41
|
+
mints a cap-JWT scoped ``alter_events.sessions.ingest`` via
|
|
42
|
+
``POST {api}/api/v1/messaging/sessions-ingest-capability``
|
|
43
|
+
(Authorization: Bearer session.jwt; parameterless body), caches it
|
|
44
|
+
in-memory until 30s before its declared ``expires_at``, and attaches
|
|
45
|
+
it as ``Authorization: Bearer <cap>`` on the per-record ingest POST.
|
|
46
|
+
|
|
47
|
+
The endpoint is the parameterless self-directed mint added by alter#1138
|
|
48
|
+
under D-COORD-D2 Wave C wire fix - it signs caps with
|
|
49
|
+
``MESSAGING_CAP_SIGNING_KEY`` matching the Worker's ``ALTER_API_PUBKEY``
|
|
50
|
+
(both handle-alter realm). An earlier #46 wiring routed via the
|
|
51
|
+
org-alter realm cap-mint surface, which signs with
|
|
52
|
+
``ALTER_COLLECTIVE_CAP_SIGNING_KEY`` (Worker verification would fail),
|
|
53
|
+
allowlists ``alter_org.*`` scopes only (rejects
|
|
54
|
+
``alter_events.sessions.ingest`` with 422), and is principal-tier-only
|
|
55
|
+
(blocks all non-``~blake``/``~drew`` handles). All three failures are
|
|
56
|
+
eliminated by minting in-realm.
|
|
57
|
+
|
|
58
|
+
A 401 from the Worker drops the cached cap and triggers a single
|
|
59
|
+
re-mint+retry guard; a second 401 surfaces the error to the tick loop
|
|
60
|
+
without an infinite re-mint storm. When the alter-cli session is absent
|
|
61
|
+
the component idles silently - the writer + GC loops still produce a
|
|
62
|
+
fully usable local-disk record. The Worker-side scope match lives at
|
|
63
|
+
``cloudflare/workers/handle-alter/src/sessions.ts`` (constant
|
|
64
|
+
``SESSIONS_INGEST_SCOPE``).
|
|
65
|
+
"""
|
|
66
|
+
|
|
67
|
+
from __future__ import annotations
|
|
68
|
+
|
|
69
|
+
import asyncio
|
|
70
|
+
import contextlib
|
|
71
|
+
import errno
|
|
72
|
+
import fcntl
|
|
73
|
+
import json
|
|
74
|
+
import logging
|
|
75
|
+
import os
|
|
76
|
+
import sys
|
|
77
|
+
import time
|
|
78
|
+
from dataclasses import dataclass, field
|
|
79
|
+
from datetime import datetime
|
|
80
|
+
from pathlib import Path
|
|
81
|
+
from typing import TYPE_CHECKING, Any
|
|
82
|
+
|
|
83
|
+
import httpx
|
|
84
|
+
|
|
85
|
+
from alter_runtime.config import DaemonConfig, data_dir, runtime_state_dir
|
|
86
|
+
from alter_runtime.daemon import Component
|
|
87
|
+
from alter_runtime.subscribers.active_sessions_writer import ACTIVE_SESSIONS_FILENAME
|
|
88
|
+
from alter_runtime.subscribers.do_sse import _build_tls_context
|
|
89
|
+
|
|
90
|
+
if TYPE_CHECKING:
|
|
91
|
+
from alter_runtime.config import Session
|
|
92
|
+
|
|
93
|
+
# Public surface of this module for ``from ... import *``: the component
# class plus a subset of its tuning constants. MAX_POLL_BACKOFF_SECONDS,
# MAX_LINE_BYTES and CAP_REFRESH_LEAD_SECONDS are defined below but are
# not listed here.
__all__ = [
    "ACTIVE_SESSIONS_PUBLISHER_POS_FILENAME",
    "BATCH_MAX_RECORDS",
    "BATCH_MAX_BYTES",
    "INGEST_SCOPE",
    "MAX_POST_ATTEMPTS",
    "ActiveSessionsDoPublisher",
]

# Module-level logger; the name is spelled out as the dotted module path.
logger = logging.getLogger("alter_runtime.subscribers.active_sessions_do_publisher")
|
|
103
|
+
|
|
104
|
+
|
|
105
|
+
#: Size unit shared by the byte limits below.
_KIB: int = 1024

#: File name of the byte-offset checkpoint; resolved under
#: ``runtime_state_dir()`` unless a ``pos_path`` override is supplied.
ACTIVE_SESSIONS_PUBLISHER_POS_FILENAME: str = "active-sessions-publisher.pos"

#: How many failed POSTs a single record may accumulate before the
#: publisher logs its ``id`` / ``version`` (so an operator can replay it
#: by hand) and moves the offset past it.
MAX_POST_ATTEMPTS: int = 3

#: Ceiling for the exponential backoff applied between ticks while the
#: DO keeps returning errors. Kept equal to the value used by
#: :class:`SessionPresenceWriter`.
MAX_POLL_BACKOFF_SECONDS: float = 60.0

#: Longest line accepted from the JSONL before it is treated as
#: malformed and skipped. Real records are a few hundred bytes, so
#: 64 KiB is generous while still bounding memory under a runaway writer.
MAX_LINE_BYTES: int = 64 * _KIB

#: Record-count limit for one ingest-batch POST. Mirrors the Worker-side
#: limit on ``POST /events/{handle}/sessions/ingest-batch`` (413 over 100).
BATCH_MAX_RECORDS: int = 100

#: Body-size limit, in bytes, for one ingest-batch POST. Mirrors the
#: Worker-side limit (413 over 256 KiB).
BATCH_MAX_BYTES: int = 256 * _KIB

#: Capability scope demanded by ``/events/{handle}/sessions/ingest``.
#: Must equal the Worker constant ``SESSIONS_INGEST_SCOPE`` in
#: ``cloudflare/workers/handle-alter/src/sessions.ts`` and the
#: handle-alter realm's server-side ``SESSIONS_INGEST_CAPABILITY_SCOPE``.
#: Exposed so tests can pin against the claim the mint endpoint returns.
INGEST_SCOPE: str = "alter_events.sessions.ingest"

#: A cached cap is re-minted once its expiry is closer than this many
#: seconds. Same value as
#: :data:`alter_runtime.subscribers.session_presence.CAP_REFRESH_LEAD_SECONDS`.
CAP_REFRESH_LEAD_SECONDS: float = 30.0
|
|
142
|
+
|
|
143
|
+
|
|
144
|
+
# ---------------------------------------------------------------------------
|
|
145
|
+
# State (exposed for tests)
|
|
146
|
+
# ---------------------------------------------------------------------------
|
|
147
|
+
|
|
148
|
+
|
|
149
|
+
@dataclass
|
|
150
|
+
class _PublisherState:
|
|
151
|
+
"""Internal state - surfaced via ``state`` property for tests."""
|
|
152
|
+
|
|
153
|
+
posted_count: int = 0
|
|
154
|
+
skipped_count: int = 0
|
|
155
|
+
failed_attempts: int = 0
|
|
156
|
+
backoff: float = 0.0
|
|
157
|
+
last_post_at: float = 0.0
|
|
158
|
+
history: list[str] = field(default_factory=list)
|
|
159
|
+
|
|
160
|
+
|
|
161
|
+
@dataclass
|
|
162
|
+
class _CachedCap:
|
|
163
|
+
"""In-memory cap-JWT cache.
|
|
164
|
+
|
|
165
|
+
Bounded multi-use caps (proposed-D-CAP-1) are honoured the same way
|
|
166
|
+
:class:`SessionPresenceWriter._CachedCap` honours them - but the
|
|
167
|
+
publisher does not stripe ``X-Cap-Use-Index`` because the ingest
|
|
168
|
+
Worker route does not require it. Each successful POST consumes one
|
|
169
|
+
use; once exhausted (or stale) the next call re-mints.
|
|
170
|
+
"""
|
|
171
|
+
|
|
172
|
+
capability: str
|
|
173
|
+
expires_at_unix: float
|
|
174
|
+
uses_available: int
|
|
175
|
+
use_counter: int
|
|
176
|
+
|
|
177
|
+
def is_fresh(self, now: float) -> bool:
|
|
178
|
+
return self.expires_at_unix - now > CAP_REFRESH_LEAD_SECONDS
|
|
179
|
+
|
|
180
|
+
def has_uses(self) -> bool:
|
|
181
|
+
return self.use_counter < self.uses_available
|
|
182
|
+
|
|
183
|
+
def take_use(self) -> None:
|
|
184
|
+
self.use_counter += 1
|
|
185
|
+
|
|
186
|
+
|
|
187
|
+
# ---------------------------------------------------------------------------
|
|
188
|
+
# Internal exception types - kept private so callers cannot grep for
|
|
189
|
+
# them outside this module.
|
|
190
|
+
# ---------------------------------------------------------------------------
|
|
191
|
+
|
|
192
|
+
|
|
193
|
+
class _SessionMissing(Exception):
|
|
194
|
+
"""Raised when the alter-cli session is absent."""
|
|
195
|
+
|
|
196
|
+
|
|
197
|
+
class _CapMintError(Exception):
|
|
198
|
+
"""Raised when /api/v1/messaging/sessions-ingest-capability refuses
|
|
199
|
+
or returns a malformed body."""
|
|
200
|
+
|
|
201
|
+
|
|
202
|
+
# ---------------------------------------------------------------------------
|
|
203
|
+
# Component
|
|
204
|
+
# ---------------------------------------------------------------------------
|
|
205
|
+
|
|
206
|
+
|
|
207
|
+
class ActiveSessionsDoPublisher(Component):
|
|
208
|
+
"""Tail ``active-sessions.jsonl`` and POST each envelope to the DO.
|
|
209
|
+
|
|
210
|
+
Parameters
|
|
211
|
+
----------
|
|
212
|
+
config:
|
|
213
|
+
Loaded :class:`DaemonConfig`. Reads ``do_publish_url``,
|
|
214
|
+
``do_publish_enabled``, ``do_publish_poll_interval_seconds``.
|
|
215
|
+
session:
|
|
216
|
+
Authenticated alter-cli :class:`Session`. Used for the bearer
|
|
217
|
+
JWT when minting ``alter_events.sessions.ingest`` caps.
|
|
218
|
+
Without a session the component idles silently and re-checks on
|
|
219
|
+
every tick - the writer + GC continue producing local-disk
|
|
220
|
+
envelopes regardless.
|
|
221
|
+
sessions_path:
|
|
222
|
+
Override the JSONL path. Tests redirect to ``tmp_path``.
|
|
223
|
+
pos_path:
|
|
224
|
+
Override the offset-checkpoint path. Tests redirect to ``tmp_path``.
|
|
225
|
+
http_client:
|
|
226
|
+
Optional ``httpx.AsyncClient`` override for tests.
|
|
227
|
+
"""
|
|
228
|
+
|
|
229
|
+
name = "active_sessions_do_publisher"
|
|
230
|
+
|
|
231
|
+
def __init__(
    self,
    config: DaemonConfig,
    session: Session | None,
    *,
    sessions_path: Path | None = None,
    pos_path: Path | None = None,
    http_client: httpx.AsyncClient | None = None,
) -> None:
    """Store collaborators and resolve the two file paths.

    ``sessions_path`` falls back to ``data_dir() / ACTIVE_SESSIONS_FILENAME``
    and ``pos_path`` to
    ``runtime_state_dir() / ACTIVE_SESSIONS_PUBLISHER_POS_FILENAME``.
    A caller-supplied ``http_client`` is used as-is and is not closed by
    this component.
    """
    self._config = config
    self._session = session

    # Fall back to the default on-disk locations when no override is given.
    if sessions_path is None:
        sessions_path = data_dir() / ACTIVE_SESSIONS_FILENAME
    self._sessions_path: Path = sessions_path

    if pos_path is None:
        pos_path = runtime_state_dir() / ACTIVE_SESSIONS_PUBLISHER_POS_FILENAME
    self._pos_path: Path = pos_path

    # Only a client this component creates itself is closed on shutdown.
    self._owns_client = http_client is None
    self._http_client = http_client

    self._stop_event = asyncio.Event()
    self._state = _PublisherState()
    self._cap: _CachedCap | None = None

    # Failed-attempt counts keyed by (id, version). They live in memory
    # only: they survive across ticks but a daemon restart clears them,
    # which grants each pending record a fresh MAX_POST_ATTEMPTS budget.
    # That is intended, because a restart is an explicit operator
    # intervention.
    self._attempts: dict[tuple[str, int], int] = {}
|
|
261
|
+
|
|
262
|
+
# ------------------------------------------------------------------
|
|
263
|
+
# Component lifecycle
|
|
264
|
+
# ------------------------------------------------------------------
|
|
265
|
+
|
|
266
|
+
async def run(self) -> None:
    """Run the publish loop until :meth:`stop` sets the stop event.

    The component idles (waits on the stop event and then returns) in
    two situations: when ``do_publish_enabled`` is false, and when no
    alter-cli session was supplied. Otherwise it repeatedly calls
    ``_tick_safe`` and sleeps between ticks.

    The sleep between ticks is the larger of the configured poll
    interval and the backoff that ``_tick_safe`` recorded in
    ``self._state.backoff``, so repeated failures slow the loop down
    and a clean tick (backoff reset to 0.0) restores the normal cadence.

    An HTTP client created here is closed on exit; a client injected
    through the constructor is left open for its owner.
    """
    if not self._config.do_publish_enabled:
        logger.info("active_sessions_do_publisher disabled by config - idle")
        await self._stop_event.wait()
        return

    if self._session is None:
        # Fail-loud-once: log a single warning and idle silently.
        # The writer + GC continue producing local-disk envelopes
        # regardless; the DO mirror just stays empty until the
        # daemon next reloads with a populated session.json.
        logger.warning(
            "active_sessions_do_publisher: no alter-cli session - "
            "DO publish disabled. Run `alter login` to mint cap "
            "credentials, or set ALTER_RUNTIME_DO_PUBLISH_ENABLED=0 "
            "to silence this warning."
        )
        await self._stop_event.wait()
        return

    logger.info(
        "active_sessions_do_publisher starting sessions=%s pos=%s "
        "interval=%.1fs publish_url=%s",
        self._sessions_path,
        self._pos_path,
        self._config.do_publish_poll_interval_seconds,
        self._config.do_publish_url,
    )

    # Backend default headers - X-Alter-Client-* identity bundle
    # (D-MIN-VERSION-FLOOR-1 §3) + CF Access service-token bundle. The
    # active-sessions DO publisher POSTs to ``mcp.truealter.com``
    # which sits behind the same edge as the rest of the backend; the
    # X-Alter-* headers are required for the server-side floor gate.
    from alter_runtime.http_auth import backend_default_headers

    client = self._http_client or httpx.AsyncClient(
        timeout=httpx.Timeout(connect=5.0, read=10.0, write=5.0, pool=5.0),
        verify=_build_tls_context(),
        headers=backend_default_headers(),
    )

    try:
        while not self._stop_event.is_set():
            await self._tick_safe(client)
            # Honour the backoff computed by _tick_safe; previously the
            # loop always slept the plain poll interval, so the backoff
            # value was recorded but never slowed the loop down.
            # NOTE(review): _sleep_interruptible is not visible here; if
            # it already folds in self._state.backoff, confirm the two
            # do not compound.
            delay = max(
                self._config.do_publish_poll_interval_seconds,
                self._state.backoff,
            )
            await self._sleep_interruptible(delay)
    finally:
        if self._owns_client:
            # Best-effort close: a failure here must not mask the
            # reason the loop is exiting.
            with contextlib.suppress(Exception):
                await client.aclose()
        logger.info("active_sessions_do_publisher stopped")
|
|
317
|
+
|
|
318
|
+
async def stop(self) -> None:
    """Ask :meth:`run` to finish by setting the stop event it waits on."""
    stop_signal = self._stop_event
    stop_signal.set()
|
|
320
|
+
|
|
321
|
+
# ------------------------------------------------------------------
|
|
322
|
+
# Tick
|
|
323
|
+
# ------------------------------------------------------------------
|
|
324
|
+
|
|
325
|
+
async def _tick_safe(self, client: httpx.AsyncClient) -> None:
|
|
326
|
+
"""Wrap ``_tick`` with last-resort exception swallowing.
|
|
327
|
+
|
|
328
|
+
Mirrors :class:`SessionPresenceWriter._poll_once_safe`. The
|
|
329
|
+
supervisor restarts on bare exceptions, but we'd rather log and
|
|
330
|
+
continue than tear down the component for a transient blip.
|
|
331
|
+
"""
|
|
332
|
+
try:
|
|
333
|
+
await self._tick(client)
|
|
334
|
+
self._state.backoff = 0.0
|
|
335
|
+
except asyncio.CancelledError:
|
|
336
|
+
raise
|
|
337
|
+
except _SessionMissing:
|
|
338
|
+
# Session disappeared between run() and tick - idle until
|
|
339
|
+
# the next cycle. No re-log: the run() entry already warned.
|
|
340
|
+
self._state.backoff = max(self._state.backoff, 5.0)
|
|
341
|
+
except (httpx.HTTPError, _CapMintError) as exc:
|
|
342
|
+
self._state.backoff = min(
|
|
343
|
+
max(self._state.backoff * 2 if self._state.backoff else 2.0, 2.0),
|
|
344
|
+
MAX_POLL_BACKOFF_SECONDS,
|
|
345
|
+
)
|
|
346
|
+
logger.warning(
|
|
347
|
+
"active_sessions_do_publisher tick failed: %s - backoff %.1fs",
|
|
348
|
+
exc,
|
|
349
|
+
self._state.backoff,
|
|
350
|
+
)
|
|
351
|
+
except Exception as exc: # noqa: BLE001 - last-resort safety net
|
|
352
|
+
logger.exception("active_sessions_do_publisher unexpected: %s", exc)
|
|
353
|
+
self._state.backoff = MAX_POLL_BACKOFF_SECONDS
|
|
354
|
+
|
|
355
|
+
async def _tick(self, client: httpx.AsyncClient) -> None:
    """One sweep: read the bytes appended since the checkpoint and publish them.

    The new bytes are split into lines and each line is classified as
    either a publishable record (a JSON object with a non-empty string
    ``handle``) or a line to step over (blank, oversize, malformed JSON,
    non-object, missing handle). Lines stepped over always advance the
    checkpoint.

    Publishable records are drained in sub-batches of at most
    ``BATCH_MAX_RECORDS`` records and ``BATCH_MAX_BYTES`` of compact JSON
    (a single record larger than the byte cap still gets a sub-batch of
    its own). Each sub-batch goes to ``_post_batch`` first:

    * ``"transient"`` - nothing is advanced; the tick ends and the next
      tick retries the same sub-batch.
    * ``"fallback"`` - the batch route is unavailable (per the original
      design notes: a 404/405 from an older Worker); this sub-batch and
      the rest of the tick use the per-record ``_post_record`` path.
    * a list - per-record results; entries with a truthy ``ok`` count as
      posted, every other entry is a terminal rejection that is logged
      and skipped. The whole sub-batch is then consumed.
    * anything else - treated as success for every record.

    On the per-record path a failed POST increments the record's attempt
    counter. Below ``MAX_POST_ATTEMPTS`` the tick ends without advancing
    past the record; at the limit the record is logged, skipped and the
    checkpoint moves on, so one poison record cannot stall the tail.

    The checkpoint is persisted after every committed step, so it always
    reflects the last fully handled position.
    """
    if not self._sessions_path.exists():
        return

    offset = self._load_offset()
    new_bytes, new_offset = self._read_since(offset)
    if not new_bytes:
        # Defensive persist-on-shrink (B-A hardening). When _read_since
        # detects a shrink (rotation / truncation) it resets its read
        # base to 0 and returns a new_offset reflecting the new file
        # size. If the new file is empty or unchanged we land here with
        # nothing to read; without persisting the reset, the stale large
        # offset would stay on disk and every quiet tick would re-detect
        # the shrink. Only a downward move is written - the steady-state
        # caught-up tick (new_offset == offset) writes nothing.
        if new_offset < offset:
            self._save_offset(new_offset)
        return

    # Classify every line as (raw_bytes, record) or (raw_bytes, None).
    # The raw bytes are kept so offsets advance by exact on-disk lengths.
    parsed_lines: list[tuple[bytes, dict[str, Any] | None]] = []
    for raw_line in new_bytes.splitlines(keepends=True):
        # Measure the on-disk size in BYTES, before decoding. The limit
        # is a byte limit and the log line reports bytes; counting
        # decoded characters under-measures multi-byte UTF-8 content.
        line_size = len(raw_line.rstrip(b"\r\n"))
        if line_size > MAX_LINE_BYTES:
            logger.warning(
                "active_sessions_do_publisher: oversize line (%d bytes) - skipping",
                line_size,
            )
            parsed_lines.append((raw_line, None))
            continue

        stripped = raw_line.decode("utf-8", errors="replace").strip()
        if not stripped:
            parsed_lines.append((raw_line, None))
            continue

        try:
            record = json.loads(stripped)
        except (ValueError, RecursionError):
            # ValueError covers json.JSONDecodeError. RecursionError is
            # raised for pathologically nested input; treating it as
            # malformed keeps such a line from wedging the tail, since
            # an escaping exception would leave the offset un-advanced.
            logger.warning("active_sessions_do_publisher: malformed JSON line - skipping")
            parsed_lines.append((raw_line, None))
            continue

        if not isinstance(record, dict):
            parsed_lines.append((raw_line, None))
            continue

        handle = record.get("handle")
        if not isinstance(handle, str) or not handle:
            logger.warning("active_sessions_do_publisher: missing handle - skipping")
            parsed_lines.append((raw_line, None))
            continue

        parsed_lines.append((raw_line, record))

    # Seed consumed_offset from the read BASE, not the (possibly stale)
    # pre-rotation offset. _read_since returns
    # new_offset == read_base + len(new_bytes), where read_base is 0
    # right after a rotation reset and == offset in steady state, so the
    # subtraction recovers the base exactly. Seeding from the stale
    # offset is the replay-loop bug (B-A): the persisted value would stay
    # beyond the file size and every tick would re-POST the whole file.
    consumed_offset = new_offset - len(new_bytes)
    use_batch = True  # Flipped to False on the first "fallback" result.
    total = len(parsed_lines)
    i = 0
    while i < total:
        # Step over a run of unpublishable lines, persisting once at the
        # end of the run rather than once per line.
        run_start = i
        while i < total and parsed_lines[i][1] is None:
            consumed_offset += len(parsed_lines[i][0])
            i += 1
        if i > run_start:
            self._save_offset(consumed_offset)
        if i >= total:
            break

        # Gather the next sub-batch. parsed_lines[i] is a record here, so
        # the sub-batch always holds at least one entry; a later None
        # line ends the sub-batch and is handled on the next iteration.
        sub_batch_lines: list[bytes] = []
        sub_batch_records: list[dict[str, Any]] = []
        sub_batch_byte_size = 0
        k = i
        while k < total:
            raw_line, record = parsed_lines[k]
            if record is None:
                break
            encoded_size = len(json.dumps(record, separators=(",", ":")).encode("utf-8"))
            # Close the sub-batch when adding this record would break a
            # cap - but never before it holds one record, so a single
            # oversized record still gets its own attempt.
            if sub_batch_records and (
                len(sub_batch_records) >= BATCH_MAX_RECORDS
                or sub_batch_byte_size + encoded_size > BATCH_MAX_BYTES
            ):
                break
            sub_batch_lines.append(raw_line)
            sub_batch_records.append(record)
            sub_batch_byte_size += encoded_size
            k += 1

        if use_batch:
            batch_result = await self._post_batch(client, sub_batch_records)

            if batch_result == "transient":
                # Leave the offset un-advanced; the next tick retries the
                # whole sub-batch.
                self._state.failed_attempts += 1
                logger.info(
                    "active_sessions_do_publisher: batch POST transient failure "
                    "(%d records) - will retry next tick",
                    len(sub_batch_records),
                )
                return

            if batch_result == "fallback":
                # Batch route unavailable: use the per-record path for
                # this sub-batch and for the remainder of the tick.
                use_batch = False
            else:
                accepted = len(sub_batch_records)
                skipped = 0
                if isinstance(batch_result, list):
                    # Per-record results: dicts shaped like
                    # {index, ok, error?, status?}. ok entries are
                    # accepted; everything else is a terminal rejection
                    # and is skipped, mirroring the MAX_POST_ATTEMPTS
                    # skip on the per-record path.
                    accepted = 0
                    for res in batch_result:
                        # A non-dict entry cannot be interpreted; count
                        # it as a terminal failure instead of letting an
                        # AttributeError abort the tick with the offset
                        # un-advanced.
                        details = res if isinstance(res, dict) else {}
                        if details.get("ok"):
                            accepted += 1
                            continue
                        skipped += 1
                        logger.warning(
                            "active_sessions_do_publisher: batch record index=%d "
                            "terminal failure status=%s error=%r - skipping",
                            details.get("index", -1),
                            details.get("status"),
                            details.get("error"),
                        )

                # Every record is accounted for: commit the sub-batch.
                consumed_offset += sum(len(raw) for raw in sub_batch_lines)
                self._save_offset(consumed_offset)
                self._state.posted_count += accepted
                self._state.skipped_count += skipped
                self._state.last_post_at = time.time()
                i = k
                continue

        # Per-record path (use_batch is False): one POST per record, the
        # offset advancing only past records that were published or
        # given up on.
        for raw_line, record in zip(sub_batch_lines, sub_batch_records):
            attempt_key = self._attempt_key(record)
            handle = record.get("handle", "")
            if await self._post_record(client, handle, record):
                self._attempts.pop(attempt_key, None)
                consumed_offset += len(raw_line)
                self._save_offset(consumed_offset)
                self._state.posted_count += 1
                self._state.last_post_at = time.time()
                continue

            attempts = self._attempts.get(attempt_key, 0) + 1
            self._state.failed_attempts += 1
            if attempts >= MAX_POST_ATTEMPTS:
                logger.warning(
                    "active_sessions_do_publisher: giving up on record "
                    "id=%s version=%s after %d attempts - advancing offset",
                    record.get("id"),
                    record.get("version"),
                    attempts,
                )
                self._attempts.pop(attempt_key, None)
                consumed_offset += len(raw_line)
                self._save_offset(consumed_offset)
                self._state.skipped_count += 1
                continue

            self._attempts[attempt_key] = attempts
            logger.info(
                "active_sessions_do_publisher: POST failed for id=%s "
                "version=%s attempt=%d/%d - will retry next tick",
                record.get("id"),
                record.get("version"),
                attempts,
                MAX_POST_ATTEMPTS,
            )
            return  # Stop the tick; the next tick resumes at this record.

        i = k
|
|
601
|
+
|
|
602
|
+
# ------------------------------------------------------------------
|
|
603
|
+
# Cap minting
|
|
604
|
+
# ------------------------------------------------------------------
|
|
605
|
+
|
|
606
|
+
async def _get_cap(self, client: httpx.AsyncClient) -> str:
    """Return a usable cap-JWT, minting one if the cache is stale or used up.

    Mirrors :meth:`SessionPresenceWriter._get_cap` - single mint per
    cap window, in-memory cache, refresh on leeway, bounded
    multi-use caps honoured.

    Args:
        client: HTTP client used for the mint request.

    Returns:
        The capability string to present as a bearer credential.

    Raises:
        _SessionMissing: ``self._session`` is ``None``.
        _CapMintError: the mint endpoint answered 401/403, or its body is
            not a JSON object carrying a non-empty string ``capability``
            and a parseable ISO-8601 string ``expires_at``.
        httpx.HTTPError: transport failure, or any other error status
            surfaced by ``response.raise_for_status()``.
    """
    # Function-scope import: only the naive-timestamp fallback needs it.
    from datetime import timezone

    session = self._session
    if session is None:
        raise _SessionMissing()

    now = time.time()
    cap = self._cap
    if cap is not None and cap.is_fresh(now) and cap.has_uses():
        # Cache hit: account for this use and hand back the cached token.
        cap.take_use()
        return cap.capability

    # D-COORD-D2 Wave C wire fix: mint via the parameterless
    # handle-alter-realm endpoint (alter#1138). The org-alter
    # realm route is wrong realm here - see module docstring for
    # the three failure modes it triggers for
    # ``alter_events.sessions.ingest`` caps. TTL is server-configured
    # (``SESSIONS_INGEST_CAPABILITY_TTL_SECONDS``, clamped [30, 300],
    # default 60s); the per-handle 6/min rate limit means the cache +
    # leeway-refresh path is the steady-state pattern.
    url = f"{session.api.rstrip('/')}/api/v1/messaging/sessions-ingest-capability"
    headers = {
        "Authorization": f"Bearer {session.jwt}",
        "Accept": "application/json",
    }
    response = await client.post(url, headers=headers)
    if response.status_code in (401, 403):
        raise _CapMintError(
            f"cap-mint rejected (HTTP {response.status_code}): {response.text[:200]}"
        )
    response.raise_for_status()

    try:
        data = response.json()
    except ValueError as exc:
        raise _CapMintError("cap-mint returned non-JSON body") from exc

    if not isinstance(data, dict):
        raise _CapMintError("cap-mint returned non-object body")

    capability = data.get("capability")
    expires_at = data.get("expires_at")
    if not isinstance(capability, str) or not capability:
        raise _CapMintError("cap-mint response missing capability")
    if not isinstance(expires_at, str) or not expires_at:
        raise _CapMintError("cap-mint response missing expires_at")

    try:
        expires_dt = datetime.fromisoformat(expires_at.replace("Z", "+00:00"))
    except ValueError as exc:
        raise _CapMintError(f"cap-mint returned non-ISO expires_at: {expires_at}") from exc
    if expires_dt.tzinfo is None:
        # ``timestamp()`` on a naive datetime assumes the host's *local*
        # zone, which would shift the expiry by the local UTC offset.
        # NOTE(review): a timezone-less value is assumed to be UTC here -
        # confirm against the endpoint's contract.
        expires_dt = expires_dt.replace(tzinfo=timezone.utc)
    expires_at_unix = expires_dt.timestamp()

    # The handle-alter realm endpoint mints time-bounded JWTs -
    # the Worker's verifier checks scope + exp only and does no
    # per-use accounting, so the cap may be reused for any number
    # of POSTs within its TTL window. Cache refresh is governed by
    # the leeway gate (``CAP_REFRESH_LEAD_SECONDS``); the
    # server-side per-handle 6/min rate limit on the mint endpoint
    # bounds re-mint storms even under burst load.
    cap = _CachedCap(
        capability=capability,
        expires_at_unix=expires_at_unix,
        uses_available=sys.maxsize,
        use_counter=1,
    )
    self._cap = cap
    return capability
|
|
678
|
+
|
|
679
|
+
# ------------------------------------------------------------------
|
|
680
|
+
# Batch POST
|
|
681
|
+
# ------------------------------------------------------------------
|
|
682
|
+
|
|
683
|
+
async def _post_batch(
|
|
684
|
+
self,
|
|
685
|
+
client: httpx.AsyncClient,
|
|
686
|
+
records: list[dict[str, Any]],
|
|
687
|
+
) -> str | list[dict[str, Any]]:
|
|
688
|
+
"""POST ``records`` as a JSON array to ``/events/{handle}/sessions/ingest-batch``.
|
|
689
|
+
|
|
690
|
+
All records in a sub-batch share the same ``handle`` (the caller reads
|
|
691
|
+
the first record's handle - mixed-handle batches are not supported by
|
|
692
|
+
the Worker, but in practice all records in a single session log share
|
|
693
|
+
one handle). Returns one of:
|
|
694
|
+
|
|
695
|
+
``"success"``
|
|
696
|
+
HTTP 200 with ``accepted >= 1`` and no per-record failures; all
|
|
697
|
+
records accepted.
|
|
698
|
+
``list[dict]``
|
|
699
|
+
HTTP 200 with a ``results`` array containing at least one
|
|
700
|
+
``ok: false`` entry. The list contains the per-record result dicts
|
|
701
|
+
so ``_tick`` can advance past terminal failures individually.
|
|
702
|
+
``"transient"``
|
|
703
|
+
Network error, 5xx, or 429 - caller should leave offset
|
|
704
|
+
un-advanced and retry next tick.
|
|
705
|
+
``"fallback"``
|
|
706
|
+
HTTP 404 or 405 - Worker does not have the batch route. Caller
|
|
707
|
+
should switch to the per-record path for this tick.
|
|
708
|
+
"""
|
|
709
|
+
from urllib.parse import quote
|
|
710
|
+
|
|
711
|
+
handle = records[0].get("handle", "")
|
|
712
|
+
encoded_handle = quote(handle, safe="~")
|
|
713
|
+
url = (
|
|
714
|
+
f"{self._config.do_publish_url.rstrip('/')}"
|
|
715
|
+
f"/events/{encoded_handle}/sessions/ingest-batch"
|
|
716
|
+
)
|
|
717
|
+
|
|
718
|
+
try:
|
|
719
|
+
cap = await self._get_cap(client)
|
|
720
|
+
except _SessionMissing:
|
|
721
|
+
raise
|
|
722
|
+
except (httpx.HTTPError, _CapMintError) as exc:
|
|
723
|
+
logger.warning(
|
|
724
|
+
"active_sessions_do_publisher: cap-mint failed for batch: %s",
|
|
725
|
+
exc,
|
|
726
|
+
)
|
|
727
|
+
return "transient"
|
|
728
|
+
|
|
729
|
+
response = await self._do_batch_post(client, url, cap, records)
|
|
730
|
+
if response is None:
|
|
731
|
+
return "transient"
|
|
732
|
+
|
|
733
|
+
if response.status_code in (404, 405):
|
|
734
|
+
# Worker does not have the batch route yet.
|
|
735
|
+
logger.info(
|
|
736
|
+
"active_sessions_do_publisher: ingest-batch returned HTTP %d "
|
|
737
|
+
"- falling back to per-record path for this tick",
|
|
738
|
+
response.status_code,
|
|
739
|
+
)
|
|
740
|
+
return "fallback"
|
|
741
|
+
|
|
742
|
+
if response.status_code in (401, 403):
|
|
743
|
+
# Cap rejected - drop and re-mint once, then retry.
|
|
744
|
+
logger.info(
|
|
745
|
+
"active_sessions_do_publisher: ingest-batch auth rejected (HTTP %d) "
|
|
746
|
+
"- re-minting cap and retrying once",
|
|
747
|
+
response.status_code,
|
|
748
|
+
)
|
|
749
|
+
self._cap = None
|
|
750
|
+
try:
|
|
751
|
+
cap = await self._get_cap(client)
|
|
752
|
+
except _SessionMissing:
|
|
753
|
+
raise
|
|
754
|
+
except (httpx.HTTPError, _CapMintError) as exc:
|
|
755
|
+
logger.warning(
|
|
756
|
+
"active_sessions_do_publisher: cap re-mint failed for batch: %s",
|
|
757
|
+
exc,
|
|
758
|
+
)
|
|
759
|
+
return "transient"
|
|
760
|
+
response = await self._do_batch_post(client, url, cap, records)
|
|
761
|
+
if response is None:
|
|
762
|
+
return "transient"
|
|
763
|
+
if response.status_code in (401, 403):
|
|
764
|
+
# Second auth failure - surface as transient so the next tick
|
|
765
|
+
# retries (no infinite re-mint loop).
|
|
766
|
+
logger.warning(
|
|
767
|
+
"active_sessions_do_publisher: ingest-batch auth failed twice "
|
|
768
|
+
"(HTTP %d) - treating as transient",
|
|
769
|
+
response.status_code,
|
|
770
|
+
)
|
|
771
|
+
return "transient"
|
|
772
|
+
|
|
773
|
+
if response.status_code in (429,) or response.status_code >= 500:
|
|
774
|
+
logger.warning(
|
|
775
|
+
"active_sessions_do_publisher: ingest-batch transient HTTP %d - "
|
|
776
|
+
"will retry next tick",
|
|
777
|
+
response.status_code,
|
|
778
|
+
)
|
|
779
|
+
return "transient"
|
|
780
|
+
|
|
781
|
+
if response.status_code == 413:
|
|
782
|
+
# Over-limit. Should not happen given the BATCH_MAX_RECORDS /
|
|
783
|
+
# BATCH_MAX_BYTES pre-send guards, but a single record that alone
|
|
784
|
+
# exceeds the Worker's 256 KiB cap would 413 a sub-batch-of-one and,
|
|
785
|
+
# if treated as transient, retry the identical body forever - a
|
|
786
|
+
# poison-pill wedge that stalls all session egress for the handle.
|
|
787
|
+
# Fall back to the per-record single-ingest path for this tick: an
|
|
788
|
+
# oversized record gets the existing MAX_POST_ATTEMPTS terminal skip
|
|
789
|
+
# and any well-sized records still post. Never infinite-retry a 413.
|
|
790
|
+
logger.warning(
|
|
791
|
+
"active_sessions_do_publisher: ingest-batch 413 (over Worker limit) "
|
|
792
|
+
"- falling back to per-record path for this tick"
|
|
793
|
+
)
|
|
794
|
+
return "fallback"
|
|
795
|
+
|
|
796
|
+
if response.status_code == 400:
|
|
797
|
+
# Whole-body malformed (not a JSON array / unparseable). This is a
|
|
798
|
+
# framing fault, not a transient condition - retrying the identical
|
|
799
|
+
# body loops forever. Fall back to the per-record single-ingest path
|
|
800
|
+
# for this tick, which serialises one record at a time and sidesteps
|
|
801
|
+
# any array-framing fault. Should be rare - the publisher produces
|
|
802
|
+
# well-formed JSON.
|
|
803
|
+
try:
|
|
804
|
+
body_preview = response.text[:200]
|
|
805
|
+
except Exception: # pragma: no cover
|
|
806
|
+
body_preview = "<unreadable>"
|
|
807
|
+
logger.warning(
|
|
808
|
+
"active_sessions_do_publisher: ingest-batch 400 (malformed body) "
|
|
809
|
+
"body=%r - falling back to per-record path for this tick",
|
|
810
|
+
body_preview,
|
|
811
|
+
)
|
|
812
|
+
return "fallback"
|
|
813
|
+
|
|
814
|
+
if 200 <= response.status_code < 300:
|
|
815
|
+
try:
|
|
816
|
+
data = response.json()
|
|
817
|
+
except (ValueError, json.JSONDecodeError):
|
|
818
|
+
# Non-JSON 2xx: accept all records (old Worker variant).
|
|
819
|
+
return "success"
|
|
820
|
+
|
|
821
|
+
if not isinstance(data, dict):
|
|
822
|
+
return "success"
|
|
823
|
+
|
|
824
|
+
results = data.get("results")
|
|
825
|
+
if isinstance(results, list) and results:
|
|
826
|
+
# Check whether any per-record result is ok:false.
|
|
827
|
+
has_failures = any(isinstance(r, dict) and not r.get("ok", True) for r in results)
|
|
828
|
+
if has_failures:
|
|
829
|
+
# Return the results list so _tick can skip individual
|
|
830
|
+
# terminal failures.
|
|
831
|
+
return [r for r in results if isinstance(r, dict)]
|
|
832
|
+
return "success"
|
|
833
|
+
|
|
834
|
+
# Unexpected status - treat as transient.
|
|
835
|
+
try:
|
|
836
|
+
body_preview = response.text[:200]
|
|
837
|
+
except Exception: # pragma: no cover
|
|
838
|
+
body_preview = "<unreadable>"
|
|
839
|
+
logger.warning(
|
|
840
|
+
"active_sessions_do_publisher: ingest-batch unexpected HTTP %d "
|
|
841
|
+
"body=%r - treating as transient",
|
|
842
|
+
response.status_code,
|
|
843
|
+
body_preview,
|
|
844
|
+
)
|
|
845
|
+
return "transient"
|
|
846
|
+
|
|
847
|
+
async def _do_batch_post(
|
|
848
|
+
self,
|
|
849
|
+
client: httpx.AsyncClient,
|
|
850
|
+
url: str,
|
|
851
|
+
cap: str,
|
|
852
|
+
records: list[dict[str, Any]],
|
|
853
|
+
) -> httpx.Response | None:
|
|
854
|
+
"""Single batch POST. Returns the Response or None on transport error."""
|
|
855
|
+
headers = {
|
|
856
|
+
"Authorization": f"Bearer {cap}",
|
|
857
|
+
"Content-Type": "application/json",
|
|
858
|
+
"Accept": "application/json",
|
|
859
|
+
}
|
|
860
|
+
try:
|
|
861
|
+
return await client.post(url, json=records, headers=headers)
|
|
862
|
+
except httpx.HTTPError as exc:
|
|
863
|
+
logger.warning(
|
|
864
|
+
"active_sessions_do_publisher: batch POST raised %s",
|
|
865
|
+
exc,
|
|
866
|
+
)
|
|
867
|
+
return None
|
|
868
|
+
|
|
869
|
+
# ------------------------------------------------------------------
|
|
870
|
+
# POST
|
|
871
|
+
# ------------------------------------------------------------------
|
|
872
|
+
|
|
873
|
+
async def _post_record(
|
|
874
|
+
self,
|
|
875
|
+
client: httpx.AsyncClient,
|
|
876
|
+
handle: str,
|
|
877
|
+
record: dict[str, Any],
|
|
878
|
+
) -> bool:
|
|
879
|
+
"""POST one envelope. Returns True on 2xx, False otherwise.
|
|
880
|
+
|
|
881
|
+
On 401/403 from the Worker the cached cap is dropped and a
|
|
882
|
+
single re-mint+retry is attempted. A second 401 surfaces as a
|
|
883
|
+
terminal failure for this record (counts against
|
|
884
|
+
MAX_POST_ATTEMPTS) - the publisher never loops on auth.
|
|
885
|
+
"""
|
|
886
|
+
# The DO route uses the URL-encoded handle, mirroring the Worker's
|
|
887
|
+
# router which normalises + decodes again on receive.
|
|
888
|
+
from urllib.parse import quote
|
|
889
|
+
|
|
890
|
+
encoded_handle = quote(handle, safe="~")
|
|
891
|
+
url = f"{self._config.do_publish_url.rstrip('/')}/events/{encoded_handle}/sessions/ingest"
|
|
892
|
+
|
|
893
|
+
try:
|
|
894
|
+
cap = await self._get_cap(client)
|
|
895
|
+
except _SessionMissing:
|
|
896
|
+
# Surface to _tick_safe so the standard idle path is taken
|
|
897
|
+
# without per-record noise.
|
|
898
|
+
raise
|
|
899
|
+
except (httpx.HTTPError, _CapMintError) as exc:
|
|
900
|
+
logger.warning(
|
|
901
|
+
"active_sessions_do_publisher: cap-mint failed: %s - id=%s",
|
|
902
|
+
exc,
|
|
903
|
+
record.get("id"),
|
|
904
|
+
)
|
|
905
|
+
return False
|
|
906
|
+
|
|
907
|
+
response = await self._do_post(client, url, cap, record)
|
|
908
|
+
if response is None:
|
|
909
|
+
return False
|
|
910
|
+
|
|
911
|
+
if 200 <= response.status_code < 300:
|
|
912
|
+
return True
|
|
913
|
+
|
|
914
|
+
if response.status_code in (401, 403):
|
|
915
|
+
# Cap was rejected: drop it, re-mint, single retry. Guards
|
|
916
|
+
# against an infinite re-mint loop by only retrying once
|
|
917
|
+
# per record per tick.
|
|
918
|
+
logger.info(
|
|
919
|
+
"active_sessions_do_publisher: ingest auth rejected (HTTP %d) - "
|
|
920
|
+
"re-minting cap and retrying once for id=%s",
|
|
921
|
+
response.status_code,
|
|
922
|
+
record.get("id"),
|
|
923
|
+
)
|
|
924
|
+
self._cap = None
|
|
925
|
+
try:
|
|
926
|
+
cap = await self._get_cap(client)
|
|
927
|
+
except _SessionMissing:
|
|
928
|
+
raise
|
|
929
|
+
except (httpx.HTTPError, _CapMintError) as exc:
|
|
930
|
+
logger.warning(
|
|
931
|
+
"active_sessions_do_publisher: cap re-mint failed: %s - id=%s",
|
|
932
|
+
exc,
|
|
933
|
+
record.get("id"),
|
|
934
|
+
)
|
|
935
|
+
return False
|
|
936
|
+
|
|
937
|
+
retry = await self._do_post(client, url, cap, record)
|
|
938
|
+
if retry is None:
|
|
939
|
+
return False
|
|
940
|
+
if 200 <= retry.status_code < 300:
|
|
941
|
+
return True
|
|
942
|
+
response = retry
|
|
943
|
+
|
|
944
|
+
# 4xx/5xx (other than auth handled above): log + treat as
|
|
945
|
+
# terminal failure for this record (counts against
|
|
946
|
+
# MAX_POST_ATTEMPTS). The Worker is the authority and we honour
|
|
947
|
+
# its rejection rather than retrying forever.
|
|
948
|
+
try:
|
|
949
|
+
body_preview = response.text[:200]
|
|
950
|
+
except Exception: # pragma: no cover - defensive
|
|
951
|
+
body_preview = "<unreadable>"
|
|
952
|
+
logger.warning(
|
|
953
|
+
"active_sessions_do_publisher: POST rejected status=%d body=%r id=%s",
|
|
954
|
+
response.status_code,
|
|
955
|
+
body_preview,
|
|
956
|
+
record.get("id"),
|
|
957
|
+
)
|
|
958
|
+
return False
|
|
959
|
+
|
|
960
|
+
async def _do_post(
|
|
961
|
+
self,
|
|
962
|
+
client: httpx.AsyncClient,
|
|
963
|
+
url: str,
|
|
964
|
+
cap: str,
|
|
965
|
+
record: dict[str, Any],
|
|
966
|
+
) -> httpx.Response | None:
|
|
967
|
+
"""Single POST. Returns the Response or None on transport error."""
|
|
968
|
+
headers = {
|
|
969
|
+
"Authorization": f"Bearer {cap}",
|
|
970
|
+
"Content-Type": "application/json",
|
|
971
|
+
"Accept": "application/json",
|
|
972
|
+
}
|
|
973
|
+
try:
|
|
974
|
+
return await client.post(url, json=record, headers=headers)
|
|
975
|
+
except httpx.HTTPError as exc:
|
|
976
|
+
logger.warning(
|
|
977
|
+
"active_sessions_do_publisher: POST raised %s - id=%s",
|
|
978
|
+
exc,
|
|
979
|
+
record.get("id"),
|
|
980
|
+
)
|
|
981
|
+
return None
|
|
982
|
+
|
|
983
|
+
# ------------------------------------------------------------------
|
|
984
|
+
# File reading / offset checkpoint
|
|
985
|
+
# ------------------------------------------------------------------
|
|
986
|
+
|
|
987
|
+
def _read_since(self, offset: int) -> tuple[bytes, int]:
|
|
988
|
+
"""Read all bytes from ``offset`` to EOF under a shared lock.
|
|
989
|
+
|
|
990
|
+
Returns ``(bytes_read, new_offset)``. ``new_offset`` is the
|
|
991
|
+
absolute file size after reading - i.e., the value we should
|
|
992
|
+
persist once every line has been POSTed.
|
|
993
|
+
|
|
994
|
+
Empty return on missing/unreadable file.
|
|
995
|
+
"""
|
|
996
|
+
try:
|
|
997
|
+
stat = self._sessions_path.stat()
|
|
998
|
+
except FileNotFoundError:
|
|
999
|
+
return b"", offset
|
|
1000
|
+
|
|
1001
|
+
size = stat.st_size
|
|
1002
|
+
if offset > size:
|
|
1003
|
+
# File shrank (likely a rotation: writer moved jsonl ->
|
|
1004
|
+
# jsonl.1 and started fresh). Reset to 0 and re-read from
|
|
1005
|
+
# the top of the new file.
|
|
1006
|
+
logger.info(
|
|
1007
|
+
"active_sessions_do_publisher: jsonl shrank "
|
|
1008
|
+
"(offset=%d > size=%d) - resetting to 0 (rotation)",
|
|
1009
|
+
offset,
|
|
1010
|
+
size,
|
|
1011
|
+
)
|
|
1012
|
+
offset = 0
|
|
1013
|
+
|
|
1014
|
+
if offset == size:
|
|
1015
|
+
return b"", offset
|
|
1016
|
+
|
|
1017
|
+
flags = os.O_RDONLY
|
|
1018
|
+
try:
|
|
1019
|
+
fd = os.open(self._sessions_path, flags)
|
|
1020
|
+
except OSError as exc:
|
|
1021
|
+
logger.warning("active_sessions_do_publisher: open failed: %s", exc)
|
|
1022
|
+
return b"", offset
|
|
1023
|
+
|
|
1024
|
+
try:
|
|
1025
|
+
try:
|
|
1026
|
+
fcntl.flock(fd, fcntl.LOCK_SH)
|
|
1027
|
+
except OSError as exc: # pragma: no cover - exotic FS
|
|
1028
|
+
if exc.errno not in (errno.ENOTSUP, errno.EINVAL):
|
|
1029
|
+
logger.warning("active_sessions_do_publisher: LOCK_SH failed: %s", exc)
|
|
1030
|
+
return b"", offset
|
|
1031
|
+
try:
|
|
1032
|
+
os.lseek(fd, offset, os.SEEK_SET)
|
|
1033
|
+
buf = b""
|
|
1034
|
+
while True:
|
|
1035
|
+
chunk = os.read(fd, 65536)
|
|
1036
|
+
if not chunk:
|
|
1037
|
+
break
|
|
1038
|
+
buf += chunk
|
|
1039
|
+
finally:
|
|
1040
|
+
with contextlib.suppress(OSError):
|
|
1041
|
+
fcntl.flock(fd, fcntl.LOCK_UN)
|
|
1042
|
+
finally:
|
|
1043
|
+
os.close(fd)
|
|
1044
|
+
|
|
1045
|
+
return buf, offset + len(buf)
|
|
1046
|
+
|
|
1047
|
+
def _load_offset(self) -> int:
|
|
1048
|
+
"""Read the last persisted offset. Returns 0 when absent/invalid."""
|
|
1049
|
+
if not self._pos_path.exists():
|
|
1050
|
+
return 0
|
|
1051
|
+
try:
|
|
1052
|
+
raw = self._pos_path.read_text(encoding="utf-8").strip()
|
|
1053
|
+
except OSError as exc:
|
|
1054
|
+
logger.warning(
|
|
1055
|
+
"active_sessions_do_publisher: cannot read pos file %s: %s",
|
|
1056
|
+
self._pos_path,
|
|
1057
|
+
exc,
|
|
1058
|
+
)
|
|
1059
|
+
return 0
|
|
1060
|
+
try:
|
|
1061
|
+
value = int(raw)
|
|
1062
|
+
except ValueError:
|
|
1063
|
+
logger.warning(
|
|
1064
|
+
"active_sessions_do_publisher: malformed pos file %s: %r - resetting to 0",
|
|
1065
|
+
self._pos_path,
|
|
1066
|
+
raw,
|
|
1067
|
+
)
|
|
1068
|
+
return 0
|
|
1069
|
+
return max(value, 0)
|
|
1070
|
+
|
|
1071
|
+
def _save_offset(self, offset: int) -> None:
|
|
1072
|
+
"""Atomically persist the current offset checkpoint."""
|
|
1073
|
+
self._ensure_parent(self._pos_path)
|
|
1074
|
+
tmp_path = self._pos_path.with_suffix(self._pos_path.suffix + ".tmp")
|
|
1075
|
+
flags = os.O_WRONLY | os.O_CREAT | os.O_TRUNC
|
|
1076
|
+
try:
|
|
1077
|
+
fd = os.open(tmp_path, flags, 0o600)
|
|
1078
|
+
except OSError as exc:
|
|
1079
|
+
logger.warning(
|
|
1080
|
+
"active_sessions_do_publisher: cannot open pos tmp %s: %s",
|
|
1081
|
+
tmp_path,
|
|
1082
|
+
exc,
|
|
1083
|
+
)
|
|
1084
|
+
return
|
|
1085
|
+
try:
|
|
1086
|
+
with contextlib.suppress(OSError):
|
|
1087
|
+
os.fchmod(fd, 0o600)
|
|
1088
|
+
os.write(fd, str(offset).encode("utf-8"))
|
|
1089
|
+
os.fsync(fd)
|
|
1090
|
+
finally:
|
|
1091
|
+
os.close(fd)
|
|
1092
|
+
try:
|
|
1093
|
+
os.replace(tmp_path, self._pos_path)
|
|
1094
|
+
except OSError as exc:
|
|
1095
|
+
logger.warning(
|
|
1096
|
+
"active_sessions_do_publisher: cannot replace pos %s: %s",
|
|
1097
|
+
self._pos_path,
|
|
1098
|
+
exc,
|
|
1099
|
+
)
|
|
1100
|
+
return
|
|
1101
|
+
with contextlib.suppress(OSError):
|
|
1102
|
+
os.chmod(self._pos_path, 0o600)
|
|
1103
|
+
|
|
1104
|
+
def _ensure_parent(self, path: Path) -> None:
|
|
1105
|
+
parent = path.parent
|
|
1106
|
+
if not parent.exists():
|
|
1107
|
+
parent.mkdir(parents=True, exist_ok=True, mode=0o700)
|
|
1108
|
+
with contextlib.suppress(OSError):
|
|
1109
|
+
os.chmod(parent, 0o700)
|
|
1110
|
+
|
|
1111
|
+
# ------------------------------------------------------------------
|
|
1112
|
+
# Helpers
|
|
1113
|
+
# ------------------------------------------------------------------
|
|
1114
|
+
|
|
1115
|
+
@staticmethod
|
|
1116
|
+
def _attempt_key(record: dict[str, Any]) -> tuple[str, int]:
|
|
1117
|
+
"""Stable in-memory attempt-counter key per record.
|
|
1118
|
+
|
|
1119
|
+
Falls back to ``("", -1)`` when id/version are missing - those
|
|
1120
|
+
records get their own bucket so a single malformed line cannot
|
|
1121
|
+
evict a real one.
|
|
1122
|
+
"""
|
|
1123
|
+
record_id = record.get("id")
|
|
1124
|
+
version = record.get("version")
|
|
1125
|
+
if not isinstance(record_id, str):
|
|
1126
|
+
record_id = ""
|
|
1127
|
+
if not isinstance(version, int):
|
|
1128
|
+
version = -1
|
|
1129
|
+
return record_id, version
|
|
1130
|
+
|
|
1131
|
+
async def _sleep_interruptible(self, seconds: float) -> None:
|
|
1132
|
+
"""Sleep ``seconds`` or until stop is set, whichever comes first."""
|
|
1133
|
+
effective = max(seconds, self._state.backoff)
|
|
1134
|
+
if effective <= 0:
|
|
1135
|
+
return
|
|
1136
|
+
try:
|
|
1137
|
+
await asyncio.wait_for(self._stop_event.wait(), timeout=effective)
|
|
1138
|
+
except (TimeoutError, asyncio.TimeoutError):
|
|
1139
|
+
return
|
|
1140
|
+
|
|
1141
|
+
# ------------------------------------------------------------------
|
|
1142
|
+
# Test introspection
|
|
1143
|
+
# ------------------------------------------------------------------
|
|
1144
|
+
|
|
1145
|
+
@property
def state(self) -> _PublisherState:
    """Test-introspection accessor: the live ``_state`` object, not a copy."""
    current = self._state
    return current
|
|
1148
|
+
|
|
1149
|
+
@property
def sessions_path(self) -> Path:
    """Test-introspection accessor for the sessions log path (``_sessions_path``)."""
    location = self._sessions_path
    return location
|
|
1152
|
+
|
|
1153
|
+
@property
def pos_path(self) -> Path:
    """Test-introspection accessor for the offset checkpoint path (``_pos_path``)."""
    location = self._pos_path
    return location
|
|
1156
|
+
|
|
1157
|
+
@property
def attempts(self) -> dict[tuple[str, int], int]:
    """Test-introspection accessor: a shallow ``dict`` copy of ``_attempts``.

    Changing the returned dict does not touch the publisher's own counters.
    """
    snapshot = dict(self._attempts)
    return snapshot
|