alter-runtime 0.3.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (92) hide show
  1. alter_runtime/__init__.py +11 -0
  2. alter_runtime/adapters/__init__.py +19 -0
  3. alter_runtime/adapters/claude_jsonl_watcher.py +545 -0
  4. alter_runtime/adapters/git_watcher.py +457 -0
  5. alter_runtime/adapters/household/__init__.py +29 -0
  6. alter_runtime/adapters/household/_base.py +138 -0
  7. alter_runtime/adapters/household/compost/__init__.py +17 -0
  8. alter_runtime/adapters/household/compost/adapter.py +81 -0
  9. alter_runtime/adapters/household/compost/storage.py +75 -0
  10. alter_runtime/adapters/household/compost/tests/__init__.py +0 -0
  11. alter_runtime/adapters/household/compost/tests/test_adapter.py +62 -0
  12. alter_runtime/adapters/household/compost/tests/test_storage.py +23 -0
  13. alter_runtime/adapters/household/compost/tests/test_traits.py +38 -0
  14. alter_runtime/adapters/household/compost/traits.py +79 -0
  15. alter_runtime/adapters/household/self_hoster/__init__.py +30 -0
  16. alter_runtime/adapters/household/self_hoster/adapter.py +248 -0
  17. alter_runtime/adapters/household/self_hoster/storage.py +83 -0
  18. alter_runtime/adapters/household/self_hoster/tests/__init__.py +0 -0
  19. alter_runtime/adapters/household/self_hoster/tests/test_adapter.py +216 -0
  20. alter_runtime/adapters/household/self_hoster/tests/test_storage.py +25 -0
  21. alter_runtime/adapters/household/self_hoster/tests/test_traits.py +55 -0
  22. alter_runtime/adapters/household/self_hoster/traits.py +105 -0
  23. alter_runtime/adapters/household/tapo_ecosystem/__init__.py +22 -0
  24. alter_runtime/adapters/household/tapo_ecosystem/adapter.py +98 -0
  25. alter_runtime/adapters/household/tapo_ecosystem/storage.py +95 -0
  26. alter_runtime/adapters/household/tapo_ecosystem/tests/__init__.py +0 -0
  27. alter_runtime/adapters/household/tapo_ecosystem/tests/test_adapter.py +55 -0
  28. alter_runtime/adapters/household/tapo_ecosystem/tests/test_storage.py +28 -0
  29. alter_runtime/adapters/household/tapo_ecosystem/tests/test_traits.py +45 -0
  30. alter_runtime/adapters/household/tapo_ecosystem/traits.py +97 -0
  31. alter_runtime/adapters/household/workshop_tools/__init__.py +25 -0
  32. alter_runtime/adapters/household/workshop_tools/adapter.py +77 -0
  33. alter_runtime/adapters/household/workshop_tools/storage.py +92 -0
  34. alter_runtime/adapters/household/workshop_tools/tests/__init__.py +0 -0
  35. alter_runtime/adapters/household/workshop_tools/tests/test_adapter.py +48 -0
  36. alter_runtime/adapters/household/workshop_tools/tests/test_storage.py +26 -0
  37. alter_runtime/adapters/household/workshop_tools/tests/test_traits.py +45 -0
  38. alter_runtime/adapters/household/workshop_tools/traits.py +95 -0
  39. alter_runtime/adapters/worktree_watcher.py +378 -0
  40. alter_runtime/atlas/__init__.py +48 -0
  41. alter_runtime/atlas/base.py +102 -0
  42. alter_runtime/atlas/ledger.py +196 -0
  43. alter_runtime/atlas/observations.py +136 -0
  44. alter_runtime/atlas/schema.py +106 -0
  45. alter_runtime/cap_cache.py +392 -0
  46. alter_runtime/cli.py +517 -0
  47. alter_runtime/clients/__init__.py +0 -0
  48. alter_runtime/clients/token_usage_client.py +273 -0
  49. alter_runtime/config.py +648 -0
  50. alter_runtime/consent.py +425 -0
  51. alter_runtime/daemon.py +518 -0
  52. alter_runtime/floor_loop.py +335 -0
  53. alter_runtime/floor_preflight.py +734 -0
  54. alter_runtime/http_auth.py +173 -0
  55. alter_runtime/notifiers/__init__.py +18 -0
  56. alter_runtime/notifiers/desktop.py +321 -0
  57. alter_runtime/sdk/__init__.py +12 -0
  58. alter_runtime/sdk/client.py +399 -0
  59. alter_runtime/service_install.py +616 -0
  60. alter_runtime/services/__init__.py +59 -0
  61. alter_runtime/services/launchd/com.alter.runtime.plist.in +90 -0
  62. alter_runtime/services/systemd/alter-runtime.service.in +74 -0
  63. alter_runtime/services/systemd/cf-access-env.conf.in +29 -0
  64. alter_runtime/sockets/__init__.py +20 -0
  65. alter_runtime/sockets/dbus.py +272 -0
  66. alter_runtime/sockets/unix.py +702 -0
  67. alter_runtime/subscribers/__init__.py +58 -0
  68. alter_runtime/subscribers/active_sessions_cron_emitter.py +313 -0
  69. alter_runtime/subscribers/active_sessions_do_publisher.py +1159 -0
  70. alter_runtime/subscribers/active_sessions_gc.py +432 -0
  71. alter_runtime/subscribers/active_sessions_writer.py +446 -0
  72. alter_runtime/subscribers/adapters_writer.py +415 -0
  73. alter_runtime/subscribers/agent_frames.py +461 -0
  74. alter_runtime/subscribers/bus.py +188 -0
  75. alter_runtime/subscribers/cache_writer.py +347 -0
  76. alter_runtime/subscribers/ceremony_echo.py +290 -0
  77. alter_runtime/subscribers/do_sse.py +864 -0
  78. alter_runtime/subscribers/ebpf.py +506 -0
  79. alter_runtime/subscribers/inbox_writer.py +469 -0
  80. alter_runtime/subscribers/mcp_fallback.py +391 -0
  81. alter_runtime/subscribers/presence_writer.py +426 -0
  82. alter_runtime/subscribers/session_presence.py +467 -0
  83. alter_runtime/subscribers/sse.py +125 -0
  84. alter_runtime/subscribers/weave_intent_writer.py +608 -0
  85. alter_runtime/update_loop.py +519 -0
  86. alter_runtime/weave/__init__.py +21 -0
  87. alter_runtime/weave/resolver.py +544 -0
  88. alter_runtime-0.3.0.dist-info/METADATA +289 -0
  89. alter_runtime-0.3.0.dist-info/RECORD +92 -0
  90. alter_runtime-0.3.0.dist-info/WHEEL +4 -0
  91. alter_runtime-0.3.0.dist-info/entry_points.txt +2 -0
  92. alter_runtime-0.3.0.dist-info/licenses/LICENSE +190 -0
@@ -0,0 +1,1159 @@
1
+ """ActiveSessionsDoPublisher - tails the JSONL and POSTs to the per-handle DO.
2
+
3
+ Wave C of D-COORD-D2. Companion publisher to
4
+ :class:`ActiveSessionsWriter` (the local-disk writer) and
5
+ :class:`ActiveSessionsGc` (the idle/terminated sweeper).
6
+
7
+ The writer + GC pass already produce ``session_started`` /
8
+ ``session_heartbeat`` / ``session_ended`` envelopes into
9
+ ``~/.local/share/alter-runtime/active-sessions.jsonl``. This component
10
+ tails that file and republishes each new envelope to the per-``~handle``
11
+ Cloudflare Durable Object at
12
+ ``${do_publish_url}/events/{handle}/sessions/ingest``, so that
13
+ cross-host visibility (D-COORD-D1 Wave B) and cross-tool fan-out (Codex,
14
+ Cursor, alter-cli, android, widget) get a single canonical SSE stream
15
+ at ``/events/{handle}/sessions``.
16
+
17
+ Design contract per D-COORD-D2 §7 (amendment) + the Wave C brief:
18
+
19
+ * **Tail, don't re-read.** The publisher maintains a byte-offset
20
+ checkpoint at ``${XDG_STATE_HOME}/alter-runtime/active-sessions-publisher.pos``
21
+ so each tick only reads the bytes appended since the last successful
22
+ POST. This matches the GC pass's file-position pattern at
23
+ :class:`ActiveSessionsGc` (file-mediated, no bus coupling).
24
+ * **Filter contract: publish ALL envelopes.** ``session_started`` /
25
+ ``session_heartbeat`` / ``session_ended`` ALL get POSTed. The DO is
26
+ responsible for filtering ``session_ended`` out of live SSE - that's
27
+ the Worker agent's contract. Do not filter on the publisher side.
28
+ * **Idempotent on failure.** A failed POST does NOT advance the
29
+ offset; the next tick re-reads from the same start. Max 3 attempts
30
+ per record (with exponential backoff between ticks); after 3
31
+ failures the record is skipped with a structured log line and the
32
+ offset advances past it, so a single poison-pill record cannot stall
33
+ the entire tail.
34
+ * **Local-first.** The writer's append-only-on-disk path is the source
35
+ of truth; the DO is downstream eventual consistency. The publisher
36
+ never short-circuits the disk write - even when the DO is
37
+ unreachable, envelopes still land on disk for the next tail attempt.
38
+
39
+ Auth flow per cycle mirrors :class:`SessionPresenceWriter` in shape
40
+ but uses the **handle-alter realm** mint endpoint - the publisher
41
+ mints a cap-JWT scoped ``alter_events.sessions.ingest`` via
42
+ ``POST {api}/api/v1/messaging/sessions-ingest-capability``
43
+ (Authorization: Bearer session.jwt; parameterless body), caches it
44
+ in-memory until 30s before its declared ``expires_at``, and attaches
45
+ it as ``Authorization: Bearer <cap>`` on the per-record ingest POST.
46
+
47
+ The endpoint is the parameterless self-directed mint added by alter#1138
48
+ under D-COORD-D2 Wave C wire fix - it signs caps with
49
+ ``MESSAGING_CAP_SIGNING_KEY`` matching the Worker's ``ALTER_API_PUBKEY``
50
+ (both handle-alter realm). An earlier #46 wiring routed via the
51
+ org-alter realm cap-mint surface, which signs with
52
+ ``ALTER_COLLECTIVE_CAP_SIGNING_KEY`` (Worker verification would fail),
53
+ allowlists ``alter_org.*`` scopes only (rejects
54
+ ``alter_events.sessions.ingest`` with 422), and is principal-tier-only
55
+ (blocks all non-``~blake``/``~drew`` handles). All three failures are
56
+ eliminated by minting in-realm.
57
+
58
+ A 401 from the Worker drops the cached cap and triggers a single
59
+ re-mint+retry guard; a second 401 surfaces the error to the tick loop
60
+ without an infinite re-mint storm. When the alter-cli session is absent
61
+ the component idles silently - the writer + GC loops still produce a
62
+ fully usable local-disk record. The Worker-side scope match lives at
63
+ ``cloudflare/workers/handle-alter/src/sessions.ts`` (constant
64
+ ``SESSIONS_INGEST_SCOPE``).
65
+ """
66
+
67
+ from __future__ import annotations
68
+
69
+ import asyncio
70
+ import contextlib
71
+ import errno
72
+ import fcntl
73
+ import json
74
+ import logging
75
+ import os
76
+ import sys
77
+ import time
78
+ from dataclasses import dataclass, field
79
+ from datetime import datetime
80
+ from pathlib import Path
81
+ from typing import TYPE_CHECKING, Any
82
+
83
+ import httpx
84
+
85
+ from alter_runtime.config import DaemonConfig, data_dir, runtime_state_dir
86
+ from alter_runtime.daemon import Component
87
+ from alter_runtime.subscribers.active_sessions_writer import ACTIVE_SESSIONS_FILENAME
88
+ from alter_runtime.subscribers.do_sse import _build_tls_context
89
+
90
+ if TYPE_CHECKING:
91
+ from alter_runtime.config import Session
92
+
93
# Public surface of this module. The remaining module-level constants
# (MAX_POLL_BACKOFF_SECONDS, MAX_LINE_BYTES, CAP_REFRESH_LEAD_SECONDS) are
# deliberately left out of the export list here.
__all__ = [
    "ACTIVE_SESSIONS_PUBLISHER_POS_FILENAME",
    "BATCH_MAX_RECORDS",
    "BATCH_MAX_BYTES",
    "INGEST_SCOPE",
    "MAX_POST_ATTEMPTS",
    "ActiveSessionsDoPublisher",
]

logger = logging.getLogger("alter_runtime.subscribers.active_sessions_do_publisher")


#: Offset checkpoint filename (within ``runtime_state_dir()``).
ACTIVE_SESSIONS_PUBLISHER_POS_FILENAME: str = "active-sessions-publisher.pos"

#: Maximum POST attempts before the publisher gives up on a single record
#: and advances the offset past it. The skip is logged with the record's
#: ``id`` / ``version`` so an operator can manually replay if required.
#: Applied on the per-record ``ingest`` path in ``_tick``.
MAX_POST_ATTEMPTS: int = 3

#: Upper bound on the inter-tick exponential backoff when the DO is
#: returning errors. Matches :class:`SessionPresenceWriter` for parity.
#: ``_tick_safe`` clamps ``_PublisherState.backoff`` to this value.
MAX_POLL_BACKOFF_SECONDS: float = 60.0

#: Maximum line length we accept before treating the record as malformed
#: and skipping. Schema records are O(few hundred bytes); 64 KiB is a
#: generous upper bound that still bounds memory use under a runaway
#: writer. Enforced per line in ``_tick``.
MAX_LINE_BYTES: int = 64 * 1024

#: Maximum number of records per ingest-batch POST. Matches the Worker-side
#: limit for ``POST /events/{handle}/sessions/ingest-batch`` (413 over 100).
BATCH_MAX_RECORDS: int = 100

#: Maximum body size per ingest-batch POST in bytes. Matches the Worker-side
#: limit (413 over 256 KiB).
BATCH_MAX_BYTES: int = 256 * 1024

#: Cap scope required for ``/events/{handle}/sessions/ingest`` POSTs.
#: Matches the Worker-side constant at
#: ``cloudflare/workers/handle-alter/src/sessions.ts``
#: (``SESSIONS_INGEST_SCOPE``) and the handle-alter realm
#: ``SESSIONS_INGEST_CAPABILITY_SCOPE`` server-side. Re-export so
#: tests can pin against the cap claim returned by the mint endpoint.
INGEST_SCOPE: str = "alter_events.sessions.ingest"

#: Refresh leeway - re-mint when expiry is closer than this. Mirrors
#: :data:`alter_runtime.subscribers.session_presence.CAP_REFRESH_LEAD_SECONDS`.
#: Consumed by ``_CachedCap.is_fresh``.
CAP_REFRESH_LEAD_SECONDS: float = 30.0
142
+
143
+
144
+ # ---------------------------------------------------------------------------
145
+ # State (exposed for tests)
146
+ # ---------------------------------------------------------------------------
147
+
148
+
149
@dataclass
class _PublisherState:
    """Internal state - surfaced via ``state`` property for tests.

    Plain mutable counters updated by the publisher's tick loop; every
    field has a zero/empty default so a fresh instance needs no arguments.
    """

    # Records the DO accepted (batch or per-record path).
    posted_count: int = 0
    # Records advanced past without being accepted (terminal batch
    # rejection, or MAX_POST_ATTEMPTS exhausted on the per-record path).
    skipped_count: int = 0
    # Failed POST attempts (one per transient batch failure or failed
    # per-record POST).
    failed_attempts: int = 0
    # Current backoff in seconds as computed by ``_tick_safe``; 0.0 after
    # a clean tick.
    backoff: float = 0.0
    # ``time.time()`` of the most recent accepted POST; 0.0 until then.
    last_post_at: float = 0.0
    # Per-instance list (default_factory avoids a shared mutable default).
    # NOTE(review): no writer is visible in this part of the file - confirm
    # what is appended here.
    history: list[str] = field(default_factory=list)
159
+
160
+
161
+ @dataclass
162
+ class _CachedCap:
163
+ """In-memory cap-JWT cache.
164
+
165
+ Bounded multi-use caps (proposed-D-CAP-1) are honoured the same way
166
+ :class:`SessionPresenceWriter._CachedCap` honours them - but the
167
+ publisher does not stripe ``X-Cap-Use-Index`` because the ingest
168
+ Worker route does not require it. Each successful POST consumes one
169
+ use; once exhausted (or stale) the next call re-mints.
170
+ """
171
+
172
+ capability: str
173
+ expires_at_unix: float
174
+ uses_available: int
175
+ use_counter: int
176
+
177
+ def is_fresh(self, now: float) -> bool:
178
+ return self.expires_at_unix - now > CAP_REFRESH_LEAD_SECONDS
179
+
180
+ def has_uses(self) -> bool:
181
+ return self.use_counter < self.uses_available
182
+
183
+ def take_use(self) -> None:
184
+ self.use_counter += 1
185
+
186
+
187
+ # ---------------------------------------------------------------------------
188
+ # Internal exception types - kept private so callers cannot grep for
189
+ # them outside this module.
190
+ # ---------------------------------------------------------------------------
191
+
192
+
193
+ class _SessionMissing(Exception):
194
+ """Raised when the alter-cli session is absent."""
195
+
196
+
197
+ class _CapMintError(Exception):
198
+ """Raised when /api/v1/messaging/sessions-ingest-capability refuses
199
+ or returns a malformed body."""
200
+
201
+
202
+ # ---------------------------------------------------------------------------
203
+ # Component
204
+ # ---------------------------------------------------------------------------
205
+
206
+
207
+ class ActiveSessionsDoPublisher(Component):
208
+ """Tail ``active-sessions.jsonl`` and POST each envelope to the DO.
209
+
210
+ Parameters
211
+ ----------
212
+ config:
213
+ Loaded :class:`DaemonConfig`. Reads ``do_publish_url``,
214
+ ``do_publish_enabled``, ``do_publish_poll_interval_seconds``.
215
+ session:
216
+ Authenticated alter-cli :class:`Session`. Used for the bearer
217
+ JWT when minting ``alter_events.sessions.ingest`` caps.
218
+ Without a session the component idles silently and re-checks on
219
+ every tick - the writer + GC continue producing local-disk
220
+ envelopes regardless.
221
+ sessions_path:
222
+ Override the JSONL path. Tests redirect to ``tmp_path``.
223
+ pos_path:
224
+ Override the offset-checkpoint path. Tests redirect to ``tmp_path``.
225
+ http_client:
226
+ Optional ``httpx.AsyncClient`` override for tests.
227
+ """
228
+
229
+ name = "active_sessions_do_publisher"
230
+
231
def __init__(
    self,
    config: DaemonConfig,
    session: Session | None,
    *,
    sessions_path: Path | None = None,
    pos_path: Path | None = None,
    http_client: httpx.AsyncClient | None = None,
) -> None:
    """Store collaborators and initialise per-instance publisher state.

    ``sessions_path`` and ``pos_path`` fall back to the default JSONL
    location under ``data_dir()`` and the default checkpoint location
    under ``runtime_state_dir()`` when not supplied. An injected
    ``http_client`` is used as-is and is not closed by this component.
    """
    self._config = config
    self._session = session

    # Resolve the tailed JSONL file, defaulting to the data directory.
    if sessions_path is None:
        sessions_path = data_dir() / ACTIVE_SESSIONS_FILENAME
    self._sessions_path: Path = sessions_path

    # Resolve the byte-offset checkpoint, defaulting to the state directory.
    if pos_path is None:
        pos_path = runtime_state_dir() / ACTIVE_SESSIONS_PUBLISHER_POS_FILENAME
    self._pos_path: Path = pos_path

    # Only a client this component creates itself is closed in run().
    self._http_client = http_client
    self._owns_client = http_client is None

    self._stop_event = asyncio.Event()
    self._state = _PublisherState()
    self._cap: _CachedCap | None = None

    # Attempt counts keyed by (id, version). Held in memory only, so the
    # counts survive from tick to tick but are reset by a daemon restart;
    # a restart is an explicit operator intervention, so granting the
    # record a fresh MAX_POST_ATTEMPTS budget is the intended outcome.
    self._attempts: dict[tuple[str, int], int] = {}
261
+
262
+ # ------------------------------------------------------------------
263
+ # Component lifecycle
264
+ # ------------------------------------------------------------------
265
+
266
async def run(self) -> None:
    """Run the publish loop until :meth:`stop` is called.

    The component idles on the stop event (doing no network work) when
    ``do_publish_enabled`` is false or when no alter-cli session was
    supplied. Otherwise it alternates ``_tick_safe`` with an
    interruptible sleep, using the injected HTTP client if one was given
    or building (and later closing) its own.

    The sleep between ticks is the configured poll interval, stretched
    to ``_state.backoff`` whenever ``_tick_safe`` recorded a failure, so
    the backoff it computes actually slows the loop down. A clean tick
    resets the backoff to 0.0, which restores the plain poll interval.
    """
    if not self._config.do_publish_enabled:
        logger.info("active_sessions_do_publisher disabled by config - idle")
        await self._stop_event.wait()
        return

    if self._session is None:
        # Fail-loud-once: log a single warning and idle silently.
        # The writer + GC continue producing local-disk envelopes
        # regardless; the DO mirror just stays empty until the
        # daemon next reloads with a populated session.json.
        logger.warning(
            "active_sessions_do_publisher: no alter-cli session - "
            "DO publish disabled. Run `alter login` to mint cap "
            "credentials, or set ALTER_RUNTIME_DO_PUBLISH_ENABLED=0 "
            "to silence this warning."
        )
        await self._stop_event.wait()
        return

    logger.info(
        "active_sessions_do_publisher starting sessions=%s pos=%s "
        "interval=%.1fs publish_url=%s",
        self._sessions_path,
        self._pos_path,
        self._config.do_publish_poll_interval_seconds,
        self._config.do_publish_url,
    )

    # Backend default headers — X-Alter-Client-* identity bundle
    # (D-MIN-VERSION-FLOOR-1 §3) + CF Access service-token bundle. The
    # active-sessions DO publisher POSTs to ``mcp.truealter.com``
    # which sits behind the same edge as the rest of the backend; the
    # X-Alter-* headers are required for the server-side floor gate.
    from alter_runtime.http_auth import backend_default_headers

    client = self._http_client or httpx.AsyncClient(
        timeout=httpx.Timeout(connect=5.0, read=10.0, write=5.0, pool=5.0),
        verify=_build_tls_context(),
        headers=backend_default_headers(),
    )

    try:
        while not self._stop_event.is_set():
            await self._tick_safe(client)
            # Honour the backoff computed by _tick_safe: never sleep less
            # than the poll interval, and sleep longer after a failure.
            # NOTE(review): assumes _sleep_interruptible does not itself
            # add _state.backoff to the delay - confirm against its body.
            delay = max(
                self._config.do_publish_poll_interval_seconds,
                self._state.backoff,
            )
            await self._sleep_interruptible(delay)
    finally:
        # Close only a client created here; an injected one belongs to
        # the caller. Shutdown errors are deliberately suppressed.
        if self._owns_client:
            with contextlib.suppress(Exception):
                await client.aclose()
        logger.info("active_sessions_do_publisher stopped")
317
+
318
async def stop(self) -> None:
    """Ask :meth:`run` to exit by setting the shared stop event."""
    self._stop_event.set()
320
+
321
+ # ------------------------------------------------------------------
322
+ # Tick
323
+ # ------------------------------------------------------------------
324
+
325
async def _tick_safe(self, client: httpx.AsyncClient) -> None:
    """Run one ``_tick`` and turn any failure into a backoff value.

    Same shape as :class:`SessionPresenceWriter._poll_once_safe`: the
    supervisor would restart the component on an escaped exception, but
    logging and carrying on is preferable for a transient blip. Only
    cancellation is allowed to propagate. The outcome is recorded in
    ``self._state.backoff``:

    * clean tick - reset to 0.0;
    * session missing - at least 5.0 seconds;
    * HTTP or cap-mint error - doubled (starting from 2.0), capped at
      ``MAX_POLL_BACKOFF_SECONDS``;
    * anything else - ``MAX_POLL_BACKOFF_SECONDS``.
    """
    try:
        await self._tick(client)
    except asyncio.CancelledError:
        raise
    except _SessionMissing:
        # The session vanished between run() and this tick. Stay quiet:
        # run() already emitted the warning on entry.
        self._state.backoff = max(self._state.backoff, 5.0)
    except (httpx.HTTPError, _CapMintError) as exc:
        previous = self._state.backoff
        doubled = previous * 2 if previous else 2.0
        self._state.backoff = min(max(doubled, 2.0), MAX_POLL_BACKOFF_SECONDS)
        logger.warning(
            "active_sessions_do_publisher tick failed: %s - backoff %.1fs",
            exc,
            self._state.backoff,
        )
    except Exception as exc:  # noqa: BLE001 - last-resort safety net
        logger.exception("active_sessions_do_publisher unexpected: %s", exc)
        self._state.backoff = MAX_POLL_BACKOFF_SECONDS
    else:
        self._state.backoff = 0.0
354
+
355
async def _tick(self, client: httpx.AsyncClient) -> None:
    """One sweep: read new bytes since the last offset, POST as batches.

    Attempts the ``ingest-batch`` endpoint first (one POST per sub-batch
    of at most ``BATCH_MAX_RECORDS`` records / ``BATCH_MAX_BYTES`` of
    encoded JSON). When ``_post_batch`` reports ``"fallback"`` (old
    Worker without the batch route) the rest of the tick uses the legacy
    per-record ``_post_record`` path.

    Backlog drain: when a single read holds more than one sub-batch
    (e.g. after a long sleep or log rotation) the sub-batches are posted
    sequentially and the offset checkpoint is persisted after each one,
    so the checkpoint always reflects the last fully committed position.

    Offset rules:

    * blank, oversized, malformed, non-object and handle-less lines are
      advanced past unconditionally;
    * a ``"transient"`` batch result leaves the offset untouched and
      ends the tick, so the next tick retries the same sub-batch;
    * on the per-record path a failed POST ends the tick without
      advancing, until ``MAX_POST_ATTEMPTS`` failures have accumulated
      for that record, at which point it is logged and skipped.

    Returns nothing; progress is recorded in ``self._state`` and in the
    persisted offset. Exceptions raised by the HTTP helpers propagate to
    the caller.
    """
    if not self._sessions_path.exists():
        return

    offset = self._load_offset()
    new_bytes, new_offset = self._read_since(offset)
    if not new_bytes:
        # Defensive persist-on-shrink (B-A hardening). When
        # ``_read_since`` detects a shrink (rotation / truncation) it
        # resets its read base to 0 and returns ``new_offset`` reflecting
        # the *new* file size. If the new file is empty or unchanged we
        # get here with empty ``new_bytes`` - without persisting the
        # reset, the stale large offset stays on disk and EVERY
        # subsequent empty tick re-detects the shrink. Only write when
        # the offset actually moved DOWN; the steady-state caught-up
        # tick (new_offset == offset) writes nothing.
        if new_offset < offset:
            self._save_offset(new_offset)
        return

    # Classify every line as (raw_bytes, record-or-None). ``None`` marks
    # a line that is never POSTed and is simply advanced past.
    # NOTE(review): a final line without a trailing newline may be a
    # partially written record; it is treated like any other line here.
    # Confirm whether ``_read_since`` already trims to the last newline.
    parsed_lines: list[tuple[bytes, dict[str, Any] | None]] = []
    for raw_line in new_bytes.splitlines(keepends=True):
        stripped = raw_line.decode("utf-8", errors="replace").strip()
        if not stripped:
            parsed_lines.append((raw_line, None))
            continue
        # Measure the on-disk bytes: MAX_LINE_BYTES is a byte budget and
        # the decoded character count under-reports multi-byte content.
        line_size = len(raw_line.strip())
        if line_size > MAX_LINE_BYTES:
            logger.warning(
                "active_sessions_do_publisher: oversize line (%d bytes) - skipping",
                line_size,
            )
            parsed_lines.append((raw_line, None))
            continue
        try:
            record = json.loads(stripped)
        except ValueError:  # json.JSONDecodeError is a ValueError subclass
            logger.warning("active_sessions_do_publisher: malformed JSON line - skipping")
            parsed_lines.append((raw_line, None))
            continue
        if not isinstance(record, dict):
            parsed_lines.append((raw_line, None))
            continue
        handle = record.get("handle")
        if not isinstance(handle, str) or not handle:
            logger.warning("active_sessions_do_publisher: missing handle - skipping")
            parsed_lines.append((raw_line, None))
            continue
        parsed_lines.append((raw_line, record))

    # Seed ``consumed_offset`` from the read BASE, not the (possibly
    # stale) pre-rotation ``offset``. ``_read_since`` returns
    # ``new_offset == read_base + len(new_bytes)``, so subtracting the
    # length recovers the base exactly (0 right after a rotation reset,
    # ``offset`` in steady state). Using the stale ``offset`` is the
    # replay-loop bug (B-A): it would persist ``stale_large +
    # line_lengths`` and re-POST the whole file on every later tick.
    consumed_offset = new_offset - len(new_bytes)
    use_batch = True  # Flipped to False on the first "fallback" result.
    total = len(parsed_lines)
    i = 0
    while i < total:
        # Advance past a leading run of skip-only lines, persisting the
        # checkpoint once for the whole run.
        run_start = i
        while i < total and parsed_lines[i][1] is None:
            consumed_offset += len(parsed_lines[i][0])
            i += 1
        if i > run_start:
            self._save_offset(consumed_offset)
        if i >= total:
            break

        # Fill the sub-batch up to BATCH_MAX_RECORDS / BATCH_MAX_BYTES.
        # ``parsed_lines[i]`` is a valid record here, so the sub-batch
        # always ends up with at least one entry.
        sub_batch_lines: list[bytes] = []
        sub_batch_records: list[dict[str, Any]] = []
        sub_batch_byte_size = 0
        k = i
        while k < total:
            raw_line, record = parsed_lines[k]
            if record is None:
                # A skip-only line after valid records closes the
                # sub-batch; it is handled at the top of the next
                # iteration.
                break
            encoded_size = len(json.dumps(record, separators=(",", ":")).encode("utf-8"))
            # Close the sub-batch if this record would exceed either cap,
            # but only when it already holds something - a single
            # over-budget record still gets its own attempt.
            if sub_batch_records and (
                len(sub_batch_records) >= BATCH_MAX_RECORDS
                or sub_batch_byte_size + encoded_size > BATCH_MAX_BYTES
            ):
                break
            sub_batch_lines.append(raw_line)
            sub_batch_records.append(record)
            sub_batch_byte_size += encoded_size
            k += 1

        if use_batch:
            batch_result = await self._post_batch(client, sub_batch_records)

            if batch_result == "transient":
                # Network / 5xx / 429 - leave the offset un-advanced and
                # let the next tick retry the whole sub-batch.
                self._state.failed_attempts += 1
                logger.info(
                    "active_sessions_do_publisher: batch POST transient failure "
                    "(%d records) - will retry next tick",
                    len(sub_batch_records),
                )
                return

            if batch_result == "fallback":
                # 404/405 - old Worker. Use the per-record path below for
                # this sub-batch and for the remainder of the tick.
                use_batch = False
            else:
                if isinstance(batch_result, list):
                    # Per-record results from the Worker: a list of dicts
                    # {index, ok, error?, status?}. ``ok`` entries are
                    # accepted; the rest are terminal rejections and are
                    # skipped, mirroring the MAX_POST_ATTEMPTS skip.
                    accepted = 0
                    skipped = 0
                    for res in batch_result:
                        if res.get("ok"):
                            accepted += 1
                            continue
                        skipped += 1
                        logger.warning(
                            "active_sessions_do_publisher: batch record index=%d "
                            "terminal failure status=%s error=%r - skipping",
                            res.get("index", -1),
                            res.get("status"),
                            res.get("error"),
                        )
                else:
                    # Plain success: every record accepted, no per-record
                    # results in the response.
                    accepted = len(sub_batch_records)
                    skipped = 0

                # Every record is accounted for - commit the whole
                # sub-batch with a single checkpoint write.
                consumed_offset += sum(len(line) for line in sub_batch_lines)
                self._save_offset(consumed_offset)
                self._state.posted_count += accepted
                self._state.skipped_count += skipped
                self._state.last_post_at = time.time()
                i = k
                continue

        # Per-record path (use_batch is False). The checkpoint is written
        # after each record because every POST is an await point and a
        # failure must leave the offset exactly at the failed record.
        for raw_line, record in zip(sub_batch_lines, sub_batch_records):
            attempt_key = self._attempt_key(record)
            handle = record.get("handle", "")
            published = await self._post_record(client, handle, record)
            if published:
                self._attempts.pop(attempt_key, None)
                consumed_offset += len(raw_line)
                self._save_offset(consumed_offset)
                self._state.posted_count += 1
                self._state.last_post_at = time.time()
                continue

            attempts = self._attempts.get(attempt_key, 0) + 1
            self._state.failed_attempts += 1
            if attempts >= MAX_POST_ATTEMPTS:
                # Poison-pill guard: skip the record so it cannot stall
                # the tail, and forget its attempt count.
                logger.warning(
                    "active_sessions_do_publisher: giving up on record "
                    "id=%s version=%s after %d attempts - advancing offset",
                    record.get("id"),
                    record.get("version"),
                    attempts,
                )
                self._attempts.pop(attempt_key, None)
                consumed_offset += len(raw_line)
                self._save_offset(consumed_offset)
                self._state.skipped_count += 1
                continue

            self._attempts[attempt_key] = attempts
            logger.info(
                "active_sessions_do_publisher: POST failed for id=%s "
                "version=%s attempt=%d/%d - will retry next tick",
                record.get("id"),
                record.get("version"),
                attempts,
                MAX_POST_ATTEMPTS,
            )
            return  # Stop the tick; the next tick resumes from here.

        i = k
601
+
602
+ # ------------------------------------------------------------------
603
+ # Cap minting
604
+ # ------------------------------------------------------------------
605
+
606
async def _get_cap(self, client: httpx.AsyncClient) -> str:
    """Return a usable cap-JWT, minting a new one when the cache cannot serve.

    While the cached cap is fresh and still has uses left, one use is
    recorded against it and its token is returned. Otherwise a new cap is
    minted with a parameterless POST to
    ``/api/v1/messaging/sessions-ingest-capability`` (authenticated with
    the session JWT), stored on ``self._cap`` and returned.

    Raises:
        _SessionMissing: no session is loaded on this publisher.
        _CapMintError: the mint endpoint answered 401/403, or its body is
            not a JSON object carrying a non-empty ``capability`` string
            and an ISO-8601 ``expires_at`` string.
        httpx.HTTPError: transport failure, or another error status
            surfaced by ``raise_for_status``.
    """
    from datetime import timezone

    session = self._session
    if session is None:
        raise _SessionMissing()

    now = time.time()
    cap = self._cap
    if cap is not None and cap.is_fresh(now) and cap.has_uses():
        cap.take_use()
        return cap.capability

    # D-COORD-D2 Wave C wire fix: mint via the parameterless
    # handle-alter-realm endpoint (alter#1138). The org-alter realm route
    # is the wrong realm for ``alter_events.sessions.ingest`` caps - see
    # the module docstring for the failure modes it triggers. TTL is
    # server-configured (``SESSIONS_INGEST_CAPABILITY_TTL_SECONDS``,
    # clamped [30, 300], default 60s); the per-handle 6/min rate limit
    # means the cache + leeway-refresh path is the steady-state pattern.
    url = f"{session.api.rstrip('/')}/api/v1/messaging/sessions-ingest-capability"
    headers = {
        "Authorization": f"Bearer {session.jwt}",
        "Accept": "application/json",
    }
    response = await client.post(url, headers=headers)
    if response.status_code in (401, 403):
        raise _CapMintError(
            f"cap-mint rejected (HTTP {response.status_code}): {response.text[:200]}"
        )
    response.raise_for_status()

    try:
        data = response.json()
    except ValueError as exc:
        raise _CapMintError("cap-mint returned non-JSON body") from exc

    if not isinstance(data, dict):
        raise _CapMintError("cap-mint returned non-object body")

    capability = data.get("capability")
    expires_at = data.get("expires_at")
    if not isinstance(capability, str) or not capability:
        raise _CapMintError("cap-mint response missing capability")
    if not isinstance(expires_at, str) or not expires_at:
        raise _CapMintError("cap-mint response missing expires_at")

    try:
        # ``Z`` is rewritten because ``fromisoformat`` only accepts it on
        # newer Python versions.
        expires_dt = datetime.fromisoformat(expires_at.replace("Z", "+00:00"))
    except ValueError as exc:
        raise _CapMintError(f"cap-mint returned non-ISO expires_at: {expires_at}") from exc
    if expires_dt.tzinfo is None:
        # A naive datetime would be read as *local* time by ``timestamp()``
        # and skew the expiry by the host's UTC offset; a server timestamp
        # without an offset is taken to be UTC instead.
        expires_dt = expires_dt.replace(tzinfo=timezone.utc)
    expires_at_unix = expires_dt.timestamp()

    # The handle-alter realm endpoint mints time-bounded JWTs - the
    # Worker's verifier checks scope + exp only and does no per-use
    # accounting, so the cap may be reused for any number of POSTs within
    # its TTL window. Cache refresh is governed by the leeway gate
    # (``CAP_REFRESH_LEAD_SECONDS``); the server-side per-handle 6/min
    # rate limit on the mint endpoint bounds re-mint storms under burst
    # load. ``use_counter=1`` accounts for the use handed out right now.
    cap = _CachedCap(
        capability=capability,
        expires_at_unix=expires_at_unix,
        uses_available=sys.maxsize,
        use_counter=1,
    )
    self._cap = cap
    return capability
678
+
679
+ # ------------------------------------------------------------------
680
+ # Batch POST
681
+ # ------------------------------------------------------------------
682
+
683
async def _post_batch(
    self,
    client: httpx.AsyncClient,
    records: list[dict[str, Any]],
) -> str | list[dict[str, Any]]:
    """Send ``records`` as one JSON array to the batch-ingest route.

    The target is ``/events/{handle}/sessions/ingest-batch`` under the
    configured ``do_publish_url``. The handle comes from the first record
    only, so every record in ``records`` is expected to share it.

    Returns one of:

    ``"success"``
        A 2xx response whose body is not JSON, is not a JSON object, or
        carries no ``ok: false`` entry in ``results``.
    ``list[dict]``
        A 2xx response whose ``results`` array holds at least one
        ``ok: false`` entry; the dict entries of ``results`` are returned
        so the caller can deal with failures record by record.
    ``"transient"``
        Cap-mint failure, transport error, 429, 5xx, a second auth
        rejection after the single re-mint, or any status not listed
        here - meant to be retried on a later tick.
    ``"fallback"``
        404/405 (no batch route), 413 (over the Worker limit) or 400
        (malformed body) - the caller should use the per-record path.

    Raises:
        _SessionMissing: propagated unchanged from ``_get_cap``.
    """
    from urllib.parse import quote

    def _body_preview(resp):
        # First 200 characters of the response body, for log lines only.
        try:
            return resp.text[:200]
        except Exception:  # pragma: no cover
            return "<unreadable>"

    batch_handle = quote(records[0].get("handle", ""), safe="~")
    base_url = self._config.do_publish_url.rstrip("/")
    url = f"{base_url}/events/{batch_handle}/sessions/ingest-batch"

    try:
        token = await self._get_cap(client)
    except _SessionMissing:
        raise
    except (httpx.HTTPError, _CapMintError) as exc:
        logger.warning(
            "active_sessions_do_publisher: cap-mint failed for batch: %s",
            exc,
        )
        return "transient"

    response = await self._do_batch_post(client, url, token, records)
    if response is None:
        return "transient"
    status = response.status_code

    if status in (404, 405):
        # The Worker does not expose the batch route (yet).
        logger.info(
            "active_sessions_do_publisher: ingest-batch returned HTTP %d "
            "- falling back to per-record path for this tick",
            status,
        )
        return "fallback"

    if status in (401, 403):
        # Cap rejected: discard it, mint a new one and repeat the POST
        # exactly once.
        logger.info(
            "active_sessions_do_publisher: ingest-batch auth rejected (HTTP %d) "
            "- re-minting cap and retrying once",
            status,
        )
        self._cap = None
        try:
            token = await self._get_cap(client)
        except _SessionMissing:
            raise
        except (httpx.HTTPError, _CapMintError) as exc:
            logger.warning(
                "active_sessions_do_publisher: cap re-mint failed for batch: %s",
                exc,
            )
            return "transient"
        response = await self._do_batch_post(client, url, token, records)
        if response is None:
            return "transient"
        status = response.status_code
        if status in (401, 403):
            # Rejected again: report transient so a later tick retries,
            # rather than looping on re-mints here.
            logger.warning(
                "active_sessions_do_publisher: ingest-batch auth failed twice "
                "(HTTP %d) - treating as transient",
                status,
            )
            return "transient"

    if status == 429 or status >= 500:
        logger.warning(
            "active_sessions_do_publisher: ingest-batch transient HTTP %d - "
            "will retry next tick",
            status,
        )
        return "transient"

    if status == 413:
        # Over the Worker's limit. The pre-send BATCH_MAX_RECORDS /
        # BATCH_MAX_BYTES guards should prevent this, but one record that
        # alone exceeds the Worker's 256 KiB cap would 413 a batch of one;
        # treating that as transient would resend the identical body
        # forever and wedge all session egress for the handle. The
        # per-record path lets the oversized record hit the
        # MAX_POST_ATTEMPTS terminal skip while well-sized records still
        # post. A 413 is never retried as-is.
        logger.warning(
            "active_sessions_do_publisher: ingest-batch 413 (over Worker limit) "
            "- falling back to per-record path for this tick"
        )
        return "fallback"

    if status == 400:
        # The whole body was rejected as malformed - a framing fault, not
        # a transient one, so resending the same array cannot succeed.
        # The per-record path serialises one record at a time and avoids
        # any array-framing problem.
        preview = _body_preview(response)
        logger.warning(
            "active_sessions_do_publisher: ingest-batch 400 (malformed body) "
            "body=%r - falling back to per-record path for this tick",
            preview,
        )
        return "fallback"

    if 200 <= status < 300:
        try:
            payload = response.json()
        except (ValueError, json.JSONDecodeError):
            # Non-JSON 2xx: accept all records (old Worker variant).
            return "success"

        if not isinstance(payload, dict):
            return "success"

        results = payload.get("results")
        if isinstance(results, list) and results:
            # Only dict entries can report ``ok: false``; those same dict
            # entries are what the caller receives on failure.
            per_record = [entry for entry in results if isinstance(entry, dict)]
            if any(not entry.get("ok", True) for entry in per_record):
                return per_record
        return "success"

    # Any other status is unexpected; report it as transient.
    preview = _body_preview(response)
    logger.warning(
        "active_sessions_do_publisher: ingest-batch unexpected HTTP %d "
        "body=%r - treating as transient",
        status,
        preview,
    )
    return "transient"
846
+
847
async def _do_batch_post(
    self,
    client: httpx.AsyncClient,
    url: str,
    cap: str,
    records: list[dict[str, Any]],
) -> httpx.Response | None:
    """Issue one batch POST with ``cap`` as the bearer token.

    ``records`` is sent as the JSON body. Returns the response whatever
    its status code, or ``None`` when the request raised an
    ``httpx.HTTPError`` (which is logged as a warning).
    """
    try:
        response = await client.post(
            url,
            json=records,
            headers={
                "Authorization": f"Bearer {cap}",
                "Content-Type": "application/json",
                "Accept": "application/json",
            },
        )
    except httpx.HTTPError as exc:
        logger.warning("active_sessions_do_publisher: batch POST raised %s", exc)
        return None
    return response
868
+
869
+ # ------------------------------------------------------------------
870
+ # POST
871
+ # ------------------------------------------------------------------
872
+
873
async def _post_record(
    self,
    client: httpx.AsyncClient,
    handle: str,
    record: dict[str, Any],
) -> bool:
    """POST a single envelope to ``/events/{handle}/sessions/ingest``.

    Returns ``True`` for a 2xx answer and ``False`` for everything else:
    cap-mint failure, transport error, or a non-2xx status. A 401/403
    drops the cached cap and triggers exactly one re-mint + retry; if the
    retry is rejected too, that rejection is logged and ``False`` is
    returned, so this method never loops on auth.

    Raises:
        _SessionMissing: propagated unchanged from ``_get_cap``.
    """
    from urllib.parse import quote

    # The handle is URL-encoded for the DO route; the Worker's router
    # normalises and decodes it again on receive.
    encoded = quote(handle, safe="~")
    base_url = self._config.do_publish_url.rstrip("/")
    url = f"{base_url}/events/{encoded}/sessions/ingest"

    try:
        token = await self._get_cap(client)
    except _SessionMissing:
        # Re-raised untouched so the caller's idle path handles it without
        # one log line per record.
        raise
    except (httpx.HTTPError, _CapMintError) as exc:
        logger.warning(
            "active_sessions_do_publisher: cap-mint failed: %s - id=%s",
            exc,
            record.get("id"),
        )
        return False

    first = await self._do_post(client, url, token, record)
    if first is None:
        return False
    if 200 <= first.status_code < 300:
        return True

    rejected = first
    if first.status_code in (401, 403):
        # The cap was refused: forget it, mint a fresh one and retry once.
        # Retrying only once per call rules out an endless re-mint loop.
        logger.info(
            "active_sessions_do_publisher: ingest auth rejected (HTTP %d) - "
            "re-minting cap and retrying once for id=%s",
            first.status_code,
            record.get("id"),
        )
        self._cap = None
        try:
            token = await self._get_cap(client)
        except _SessionMissing:
            raise
        except (httpx.HTTPError, _CapMintError) as exc:
            logger.warning(
                "active_sessions_do_publisher: cap re-mint failed: %s - id=%s",
                exc,
                record.get("id"),
            )
            return False

        second = await self._do_post(client, url, token, record)
        if second is None:
            return False
        if 200 <= second.status_code < 300:
            return True
        rejected = second

    # Any remaining 4xx/5xx: log it and report failure for this record.
    # The Worker is the authority, so its rejection is honoured instead of
    # being retried here.
    try:
        preview = rejected.text[:200]
    except Exception:  # pragma: no cover - defensive
        preview = "<unreadable>"
    logger.warning(
        "active_sessions_do_publisher: POST rejected status=%d body=%r id=%s",
        rejected.status_code,
        preview,
        record.get("id"),
    )
    return False
959
+
960
async def _do_post(
    self,
    client: httpx.AsyncClient,
    url: str,
    cap: str,
    record: dict[str, Any],
) -> httpx.Response | None:
    """Issue one single-record POST with ``cap`` as the bearer token.

    ``record`` is sent as the JSON body. Returns the response whatever
    its status code, or ``None`` when the request raised an
    ``httpx.HTTPError`` (which is logged together with the record id).
    """
    try:
        response = await client.post(
            url,
            json=record,
            headers={
                "Authorization": f"Bearer {cap}",
                "Content-Type": "application/json",
                "Accept": "application/json",
            },
        )
    except httpx.HTTPError as exc:
        logger.warning(
            "active_sessions_do_publisher: POST raised %s - id=%s",
            exc,
            record.get("id"),
        )
        return None
    return response
982
+
983
+ # ------------------------------------------------------------------
984
+ # File reading / offset checkpoint
985
+ # ------------------------------------------------------------------
986
+
987
def _read_since(self, offset: int) -> tuple[bytes, int]:
    """Read every byte from ``offset`` to EOF of the sessions file.

    The read happens under a shared ``flock``; when the filesystem does
    not support locking (``ENOTSUP`` / ``EINVAL``) the read proceeds
    unlocked.

    Args:
        offset: Absolute byte position to resume from. If it lies past
            the current end of file the file is assumed to have been
            rotated and reading restarts at 0.

    Returns:
        ``(data, new_offset)`` where ``new_offset`` is the effective start
        offset plus ``len(data)`` - the value to persist once all of
        ``data`` has been handled. ``data`` is empty when the file is
        missing, cannot be opened or locked, or has nothing new.
    """
    try:
        size = self._sessions_path.stat().st_size
    except FileNotFoundError:
        return b"", offset

    if offset > size:
        # File shrank (likely a rotation: writer moved jsonl -> jsonl.1
        # and started fresh). Restart from the top of the new file.
        logger.info(
            "active_sessions_do_publisher: jsonl shrank "
            "(offset=%d > size=%d) - resetting to 0 (rotation)",
            offset,
            size,
        )
        offset = 0

    if offset == size:
        return b"", offset

    try:
        fd = os.open(self._sessions_path, os.O_RDONLY)
    except OSError as exc:
        logger.warning("active_sessions_do_publisher: open failed: %s", exc)
        return b"", offset

    # Chunks are collected and joined once: repeated ``bytes +=`` copies
    # the whole buffer on every iteration, which is quadratic for a large
    # backlog.
    chunks: list[bytes] = []
    try:
        try:
            fcntl.flock(fd, fcntl.LOCK_SH)
        except OSError as exc:  # pragma: no cover - exotic FS
            if exc.errno not in (errno.ENOTSUP, errno.EINVAL):
                logger.warning("active_sessions_do_publisher: LOCK_SH failed: %s", exc)
                return b"", offset
        try:
            os.lseek(fd, offset, os.SEEK_SET)
            while chunk := os.read(fd, 65536):
                chunks.append(chunk)
        finally:
            # Unlock is best effort; closing the descriptor below ends
            # this open file's involvement either way.
            with contextlib.suppress(OSError):
                fcntl.flock(fd, fcntl.LOCK_UN)
    finally:
        os.close(fd)

    buf = b"".join(chunks)
    return buf, offset + len(buf)
1046
+
1047
def _load_offset(self) -> int:
    """Return the last persisted byte offset, or 0 when none is usable.

    0 is returned silently when the pos file does not exist, and with a
    warning when it cannot be read or does not contain an integer
    (including content that is not valid UTF-8). Negative values are
    clamped to 0.
    """
    # Read first and handle the failure, instead of checking ``exists()``
    # beforehand: that check can race with a concurrent delete and can
    # itself raise ``OSError`` (e.g. permission denied on the parent).
    try:
        data = self._pos_path.read_bytes()
    except FileNotFoundError:
        return 0
    except OSError as exc:
        logger.warning(
            "active_sessions_do_publisher: cannot read pos file %s: %s",
            self._pos_path,
            exc,
        )
        return 0

    # Undecodable bytes become U+FFFD, so a corrupted file lands in the
    # "malformed" branch below rather than raising UnicodeDecodeError.
    raw = data.decode("utf-8", errors="replace").strip()
    try:
        value = int(raw)
    except ValueError:
        logger.warning(
            "active_sessions_do_publisher: malformed pos file %s: %r - resetting to 0",
            self._pos_path,
            raw,
        )
        return 0
    return max(value, 0)
1070
+
1071
def _save_offset(self, offset: int) -> None:
    """Persist ``offset`` to the pos file via write-to-temp + rename.

    The value is written to a sibling ``<pos>.tmp`` file, fsynced, and
    moved over the pos file with ``os.replace``. Both files are created
    with / forced to mode ``0o600``. Persisting is best effort: a failure
    to open, write or replace is logged as a warning and the method
    returns without raising.
    """
    self._ensure_parent(self._pos_path)
    tmp_path = self._pos_path.with_suffix(self._pos_path.suffix + ".tmp")
    try:
        fd = os.open(tmp_path, os.O_WRONLY | os.O_CREAT | os.O_TRUNC, 0o600)
    except OSError as exc:
        logger.warning(
            "active_sessions_do_publisher: cannot open pos tmp %s: %s",
            tmp_path,
            exc,
        )
        return

    write_error: OSError | None = None
    try:
        # The mode passed to os.open is filtered by the umask and ignored
        # for a pre-existing temp file, so it is enforced again here.
        with contextlib.suppress(OSError):
            os.fchmod(fd, 0o600)
        os.write(fd, str(offset).encode("utf-8"))
        os.fsync(fd)
    except OSError as exc:
        # Handled like the open/replace failures (e.g. disk full) rather
        # than escaping to the caller.
        write_error = exc
    finally:
        os.close(fd)

    if write_error is not None:
        logger.warning(
            "active_sessions_do_publisher: cannot write pos tmp %s: %s",
            tmp_path,
            write_error,
        )
        # Do not leave a half-written temp file behind.
        with contextlib.suppress(OSError):
            os.unlink(tmp_path)
        return

    try:
        os.replace(tmp_path, self._pos_path)
    except OSError as exc:
        logger.warning(
            "active_sessions_do_publisher: cannot replace pos %s: %s",
            self._pos_path,
            exc,
        )
        return
    with contextlib.suppress(OSError):
        os.chmod(self._pos_path, 0o600)
1103
+
1104
def _ensure_parent(self, path: Path) -> None:
    """Make sure the directory that will hold ``path`` exists with mode 0o700.

    A missing directory is created together with any missing ancestors.
    The final ``chmod`` runs whether or not the directory was just
    created, and a failure of that ``chmod`` is ignored.
    """
    directory = path.parent
    if not directory.exists():
        directory.mkdir(mode=0o700, parents=True, exist_ok=True)
    with contextlib.suppress(OSError):
        os.chmod(directory, 0o700)
1110
+
1111
+ # ------------------------------------------------------------------
1112
+ # Helpers
1113
+ # ------------------------------------------------------------------
1114
+
1115
@staticmethod
def _attempt_key(record: dict[str, Any]) -> tuple[str, int]:
    """Build the in-memory attempt-counter key ``(id, version)`` for ``record``.

    An ``id`` that is not a string is replaced by ``""`` and a ``version``
    that is not an int by ``-1``, so records lacking either field share
    the ``("", -1)`` bucket instead of colliding with a well-formed
    record's key.
    """
    raw_id = record.get("id")
    raw_version = record.get("version")
    key_id = raw_id if isinstance(raw_id, str) else ""
    key_version = raw_version if isinstance(raw_version, int) else -1
    return key_id, key_version
1130
+
1131
async def _sleep_interruptible(self, seconds: float) -> None:
    """Wait until the stop event is set or the delay runs out.

    The delay is the larger of ``seconds`` and the current
    ``self._state.backoff``; a delay of zero or less returns at once.
    Running into the timeout is the normal outcome and is not an error.
    """
    delay = max(seconds, self._state.backoff)
    if delay <= 0:
        return
    # Both names are listed because they are distinct classes on older
    # Python versions.
    with contextlib.suppress(TimeoutError, asyncio.TimeoutError):
        await asyncio.wait_for(self._stop_event.wait(), timeout=delay)
1140
+
1141
+ # ------------------------------------------------------------------
1142
+ # Test introspection
1143
+ # ------------------------------------------------------------------
1144
+
1145
@property
def state(self) -> _PublisherState:
    """The publisher's state object (``self._state``), returned as-is, not copied."""
    return self._state
1148
+
1149
@property
def sessions_path(self) -> Path:
    """Path of the sessions JSONL file this publisher reads from."""
    return self._sessions_path
1152
+
1153
@property
def pos_path(self) -> Path:
    """Path of the file holding the persisted read-offset checkpoint."""
    return self._pos_path
1156
+
1157
@property
def attempts(self) -> dict[tuple[str, int], int]:
    """Shallow copy of the per-record attempt counters.

    Keys are the ``(id, version)`` tuples produced by ``_attempt_key``;
    changing the returned dict does not affect the publisher.
    """
    snapshot = dict(self._attempts)
    return snapshot