@bookedsolid/rea 0.7.0 → 0.9.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -0,0 +1,504 @@
1
+ /**
2
+ * Live `serve.state.json` publisher (BUG-005, 0.9.0).
3
+ *
4
+ * Before 0.9.0 `.rea/serve.state.json` was written once at `rea serve` boot
5
+ * and never touched again. `rea status` therefore only surfaced
6
+ * `session_id`, `started_at`, and `metrics_port` — agents planning a
7
+ * multi-downstream workflow had no way to see "is helixir's circuit open
8
+ * right now?" without calling `__rea__health` through the MCP transport
9
+ * (which, ironically, wouldn't work if the gateway was the thing that had
10
+ * wedged).
11
+ *
12
+ * The publisher subscribes to two signals:
13
+ *
14
+ * 1. Circuit-breaker `onStateChange` — transitions to/from open/half-open
15
+ * update the per-downstream block.
16
+ * 2. Supervisor events from the pool — `child_died_unexpectedly` and
17
+ * `respawned` update per-downstream liveness.
18
+ *
19
+ * Each update debounces to at most one write per ~250 ms via a trailing
20
+ * timer so a storm of transitions (e.g. open → half-open → open → half-open
21
+ * during a flap) doesn't spam the filesystem.
22
+ *
23
+ * Writes reuse the atomic temp+rename pattern from `serve.ts`. The write
24
+ * carries the same ownership key (`session_id`) as the boot write so a
25
+ * racing second `rea serve` instance is still correctly distinguished at
26
+ * shutdown.
27
+ *
28
+ * ## Why not an IPC endpoint?
29
+ *
30
+ * We briefly considered piggy-backing a `/downstreams.json` route on the
31
+ * metrics HTTP server. Rejected on the grounds of:
32
+ *
33
+ * - `rea status` works when `REA_METRICS_PORT` is unset (common in local
34
+ * dev); a disk snapshot keeps it authoritative.
35
+ * - The write rate is bounded (debounced) and the snapshot is tiny (few
36
+ * hundred bytes).
37
+ * - The on-disk file is the one surface a CRASHED gateway leaves behind
38
+ * — IPC evaporates the moment the process dies, whereas a file survives
39
+ * for post-mortem inspection.
40
+ */
41
+ import crypto from 'node:crypto';
42
+ import fs from 'node:fs';
43
+ import path from 'node:path';
44
/**
 * Atomically replace `filePath` with `data`.
 *
 * The payload is first written to a uniquely named hidden sibling
 * (`.<basename>.<uuid>.tmp`, mode 0o600) in the same directory and then
 * renamed over the target, so readers never observe a half-written file.
 *
 * Duplicated from serve.ts intentionally to keep this module standalone.
 *
 * @param {string} filePath Destination path; its directory must already exist.
 * @param {string} data UTF-8 text to write.
 * @throws Re-throws whatever `writeFileSync` / `renameSync` raised, after a
 *   best-effort removal of the temp file.
 */
function writeFileAtomic(filePath, data) {
    const tmp = path.join(path.dirname(filePath), `.${path.basename(filePath)}.${crypto.randomUUID()}.tmp`);
    try {
        // The write lives inside the same try as the rename: a write that
        // fails AFTER the temp file was created (disk full, I/O error) must
        // not leave an orphaned `.tmp` file behind either.
        fs.writeFileSync(tmp, data, { encoding: 'utf8', mode: 0o600 });
        fs.renameSync(tmp, filePath);
    }
    catch (e) {
        try {
            fs.unlinkSync(tmp);
        }
        catch {
            /* ignored — the temp file may never have been created */
        }
        throw e;
    }
}
63
+ /**
64
+ * Backoff for the yield-retry path. When a newly-started publisher finds the
65
+ * state file owned by a live peer, we log once and then poll at this interval
66
+ * so that the moment the peer exits (ESRCH on the next `isStaleLock` check
67
+ * or `ownsStateFile` reclaim path), we actually publish our snapshot.
68
+ *
69
+ * Intentionally far longer than the normal debounce — this is the worst-case
70
+ * "two `rea serve` processes are up at once" path, not the hot path. Kept in
71
+ * seconds-scale so `rea status` eventually reflects the new session without
72
+ * hammering the sidecar lock.
73
+ *
74
+ * Codex 0.9.0 pass-5 P2a: before this retry existed, `flushNow()` yielded
75
+ * silently and never re-tried. The new gateway therefore never published its
76
+ * own snapshot while the old one was still alive; once the old one exited,
77
+ * nothing triggered a fresh write unless an unrelated supervisor event
78
+ * happened to land, leaving `rea status` stuck on a stale view.
79
+ */
80
// Slow polling cadence (milliseconds) for re-attempting a flush that had to yield.
const YIELD_RETRY_MS = 2 * 1000;
81
+ export class LiveStatePublisher {
82
+ opts;
83
+ timer = null;
84
+ /**
85
+ * Separate timer for the yield-retry path. Kept distinct from `timer` so a
86
+ * scheduled debounce doesn't cancel the retry and vice-versa — they serve
87
+ * different purposes (coalesce vs. poll). Cleared by `stop()`.
88
+ */
89
+ yieldRetryTimer = null;
90
+ stopped = false;
91
+ constructor(opts) {
92
+ // Build the resolved options defensively. `exactOptionalPropertyTypes`
93
+ // refuses to accept `logger: undefined` against `logger?: Logger`, so
94
+ // we branch on presence instead of assigning `undefined`. Same treatment
95
+ // for `lastErrorRedactor`.
96
+ const base = {
97
+ baseDir: opts.baseDir,
98
+ stateFilePath: opts.stateFilePath,
99
+ sessionId: opts.sessionId,
100
+ startedAt: opts.startedAt,
101
+ metricsPort: opts.metricsPort,
102
+ pool: opts.pool,
103
+ breaker: opts.breaker,
104
+ sessionBlocker: opts.sessionBlocker,
105
+ debounceMs: opts.debounceMs ?? 250,
106
+ };
107
+ const withLogger = opts.logger !== undefined ? { ...base, logger: opts.logger } : base;
108
+ this.opts =
109
+ opts.lastErrorRedactor !== undefined
110
+ ? { ...withLogger, lastErrorRedactor: opts.lastErrorRedactor }
111
+ : withLogger;
112
+ }
113
+ /**
114
+ * Schedule a write. Coalesces multiple calls within the debounce window
115
+ * into a single flush. Safe to call from circuit-breaker and supervisor
116
+ * event paths without worrying about write rate.
117
+ */
118
+ scheduleUpdate() {
119
+ if (this.stopped)
120
+ return;
121
+ if (this.timer !== null)
122
+ return;
123
+ this.timer = setTimeout(() => {
124
+ this.timer = null;
125
+ this.flushNow();
126
+ }, this.opts.debounceMs);
127
+ // Allow the Node process to exit even if a pending debounce timer is
128
+ // scheduled — cleanup on shutdown will flush explicitly.
129
+ if (typeof this.timer.unref === 'function')
130
+ this.timer.unref();
131
+ }
132
+ /**
133
+ * Write the current snapshot synchronously, bypassing the debounce.
134
+ * Called on boot (to publish the initial downstream block) and on
135
+ * shutdown (to flush any pending updates before the state file is
136
+ * ownership-cleaned).
137
+ *
138
+ * ## Ownership handoff (Codex P1 + P2b)
139
+ *
140
+ * The ownership check + rename is performed under a sidecar lockfile
141
+ * (`serve.state.json.lock`) created with `O_EXCL` (`wx`). This converts
142
+ * what was two non-atomic steps into a serialized critical section.
143
+ *
144
+ * Flow:
145
+ *
146
+ * 1. Acquire the lock (`open(path, 'wx')`). If EEXIST, a concurrent
147
+ * writer — either another publisher in THIS process (not possible
148
+ * given the debounce, but cheap to defend against) or another
149
+ * `rea serve` instance with overlapping lifetime — holds it. Skip
150
+ * this flush silently; the debounce timer will try again, and on
151
+ * shutdown the concurrent writer's own state will be authoritative.
152
+ * 2. Under the lock: re-read the on-disk `session_id`. If it belongs
153
+ * to a DIFFERENT session, another instance has already claimed the
154
+ * breadcrumb. Release the lock and yield (log-only).
155
+ * 3. Under the lock: atomically rename our temp file over the target.
156
+ * Because the concurrent writer cannot execute step 3 until we
157
+ * release the lock, and we only reach step 3 after confirming the
158
+ * on-disk session matches ours, the "older clobbers newer"
159
+ * race Codex flagged is closed.
160
+ * 4. Release the lock (unlink the sidecar) in a finally block.
161
+ *
162
+ * Stale locks from a crashed process with the same PID would deadlock
163
+ * the critical section forever — so the acquire step checks the lock
164
+ * file's contents (written as our PID + random nonce) and, if the
165
+ * owning PID is no longer running, steals it. The steal path is
166
+ * intentionally narrow (PID-check only, no timestamp TTL) because
167
+ * holding the lock longer than a single flushNow invocation is a bug.
168
+ */
169
+ flushNow() {
170
+ if (this.stopped)
171
+ return;
172
+ let lockFd = null;
173
+ try {
174
+ lockFd = this.acquireLock();
175
+ if (lockFd === null) {
176
+ // A concurrent writer holds the lock. Skip this flush; a later
177
+ // debounced scheduleUpdate or the shutdown flushNow will retry.
178
+ return;
179
+ }
180
+ if (!this.ownsStateFile()) {
181
+ // A different session has stamped the file. Yield ownership
182
+ // silently; the newer instance is the authoritative writer.
183
+ this.opts.logger?.info({
184
+ event: 'live_state.yielded',
185
+ message: 'another rea serve session owns serve.state.json — yielding live-state writes for this process',
186
+ });
187
+ // Codex 0.9.0 pass-5 P2a: schedule a longer-interval retry so that
188
+ // when the live peer exits, this process DOES eventually publish
189
+ // its own snapshot instead of leaving `rea status` stuck on the
190
+ // previous owner's session. Without this, the only way a yielding
191
+ // gateway ever reclaims is if some unrelated event happens to land
192
+ // a `scheduleUpdate()` — which may be never on an idle gateway.
193
+ this.scheduleYieldRetry();
194
+ return;
195
+ }
196
+ const snapshot = this.buildSnapshot();
197
+ writeFileAtomic(this.opts.stateFilePath, JSON.stringify(snapshot, null, 2) + '\n');
198
+ }
199
+ catch (err) {
200
+ // Publishing the live state is best-effort — a write failure (disk
201
+ // full, permission changed under us) must never break the gateway's
202
+ // tool-routing path. Log and continue.
203
+ this.opts.logger?.warn({
204
+ event: 'live_state.write_failed',
205
+ message: 'failed to update serve.state.json — rea status may show stale downstream data',
206
+ error: err instanceof Error ? err.message : String(err),
207
+ });
208
+ }
209
+ finally {
210
+ if (lockFd !== null)
211
+ this.releaseLock(lockFd);
212
+ }
213
+ }
214
+ /** Path to the sidecar lockfile. Resolved once per call; trivial cost. */
215
+ lockFilePath() {
216
+ return `${this.opts.stateFilePath}.lock`;
217
+ }
218
+ /**
219
+ * Try to acquire the sidecar lock. Returns the lock file descriptor on
220
+ * success, or `null` on contention. Throws only on unexpected I/O errors
221
+ * (permissions, disk full) — those propagate out of `flushNow`'s try
222
+ * block and land in the `write_failed` log path.
223
+ *
224
+ * Stale-lock recovery: if a lockfile exists but its recorded PID is not
225
+ * currently running, the file is unlinked and one retry is issued. This
226
+ * covers the case where a previous `rea serve` SIGKILL'd mid-flush and
227
+ * left a dangling lockfile.
228
+ */
229
+ acquireLock() {
230
+ const lockPath = this.lockFilePath();
231
+ const payload = `${process.pid} ${crypto.randomUUID()}\n`;
232
+ for (let attempt = 0; attempt < 2; attempt++) {
233
+ try {
234
+ const fd = fs.openSync(lockPath, 'wx', 0o600);
235
+ fs.writeSync(fd, payload);
236
+ return fd;
237
+ }
238
+ catch (err) {
239
+ const code = err.code;
240
+ if (code !== 'EEXIST')
241
+ throw err;
242
+ // Someone holds the lock. Check if the holder is a live process;
243
+ // if not, steal it exactly once.
244
+ if (attempt === 0 && this.isStaleLock(lockPath)) {
245
+ try {
246
+ fs.unlinkSync(lockPath);
247
+ }
248
+ catch {
249
+ // Best-effort; another racer may have already unlinked. Loop
250
+ // around and attempt the open again regardless.
251
+ }
252
+ continue;
253
+ }
254
+ return null;
255
+ }
256
+ }
257
+ return null;
258
+ }
259
+ /**
260
+ * Release the sidecar lock. Best-effort — if the unlink fails, the next
261
+ * flushNow will see a dangling lock and the stale-lock recovery path
262
+ * will clean it up. We MUST still close the fd so we don't leak it.
263
+ */
264
+ releaseLock(fd) {
265
+ try {
266
+ fs.closeSync(fd);
267
+ }
268
+ catch {
269
+ /* ignored */
270
+ }
271
+ try {
272
+ fs.unlinkSync(this.lockFilePath());
273
+ }
274
+ catch {
275
+ /* ignored — stale-lock recovery on next flush will handle it */
276
+ }
277
+ }
278
+ /**
279
+ * Returns true iff the lock file's recorded PID is not currently alive.
280
+ * Uses `process.kill(pid, 0)` which sends no signal but errors with
281
+ * ESRCH when the PID is gone. Any parse error or unexpected kill error
282
+ * is treated as "not stale" to err on the side of NOT stealing a live
283
+ * peer's lock.
284
+ */
285
+ isStaleLock(lockPath) {
286
+ let raw;
287
+ try {
288
+ raw = fs.readFileSync(lockPath, 'utf8');
289
+ }
290
+ catch {
291
+ return false;
292
+ }
293
+ const match = /^(\d+)\s/.exec(raw);
294
+ if (match === null)
295
+ return false;
296
+ const pid = Number(match[1]);
297
+ if (!Number.isFinite(pid) || pid <= 0)
298
+ return false;
299
+ if (pid === process.pid) {
300
+ // Our own process already holds the lock — this should be impossible
301
+ // given `flushNow` runs single-threaded on the event loop, but don't
302
+ // steal from ourselves.
303
+ return false;
304
+ }
305
+ try {
306
+ process.kill(pid, 0);
307
+ return false; // Process is alive — lock is not stale.
308
+ }
309
+ catch (err) {
310
+ const code = err.code;
311
+ // ESRCH: no such process. EPERM: process exists but we can't signal
312
+ // it (different uid) — treat as NOT stale because the holder is
313
+ // alive from someone else's perspective.
314
+ return code === 'ESRCH';
315
+ }
316
+ }
317
+ /**
318
+ * Returns true iff this publisher is allowed to write the on-disk state
319
+ * file on behalf of its session. The check runs under the sidecar lock
320
+ * (see `flushNow`) so the read + subsequent rename form one serialized
321
+ * critical section.
322
+ *
323
+ * Ownership resolves against three buckets:
324
+ *
325
+ * 1. **Safe-to-write**: the file is absent, corrupt, or has a missing/
326
+ * malformed `session_id`. No competing session is on disk, so we
327
+ * write without hesitation.
328
+ * 2. **We own it**: the stored `session_id` matches ours. Normal
329
+ * steady-state — every flush lands here.
330
+ * 3. **Another session owns it**: the stored `session_id` differs
331
+ * from ours. Before 0.9.0 pass-4 this was an unconditional yield,
332
+ * which was strictly safer but broke the crash-recovery case —
333
+ * a NEW `rea serve` launched after an unclean shutdown would
334
+ * observe the crashed session's id and yield forever, leaving
335
+ * `rea status` permanently stuck. Codex pass-4 P1 flagged this.
336
+ *
337
+ * The 0.9.0 `owner_pid` field exists exactly to disambiguate this
338
+ * bucket. If `owner_pid` is alive, an overlapping writer is still
339
+ * running and we yield (silent). If `owner_pid` is gone (ESRCH)
340
+ * or missing from the payload (pre-0.9.0 file or same-process
341
+ * write), we treat the file as abandoned and take over.
342
+ *
343
+ * `process.kill(pid, 0)` returns ESRCH for a missing PID, EPERM for a
344
+ * live PID we cannot signal. We treat EPERM as "alive from someone's
345
+ * perspective" and yield — never steal a file the kernel is uncertain
346
+ * about.
347
+ */
348
+ ownsStateFile() {
349
+ let raw;
350
+ try {
351
+ raw = fs.readFileSync(this.opts.stateFilePath, 'utf8');
352
+ }
353
+ catch (err) {
354
+ const code = err.code;
355
+ // ENOENT is expected on the very first write.
356
+ if (code === 'ENOENT')
357
+ return true;
358
+ // Any other read error (permissions, EIO) — fall through and try to
359
+ // write; the write will surface the real error via the outer
360
+ // try/catch. We do NOT want to silently suppress writes on a
361
+ // transient read hiccup.
362
+ return true;
363
+ }
364
+ let parsed;
365
+ try {
366
+ parsed = JSON.parse(raw);
367
+ }
368
+ catch {
369
+ // Unparseable file — treat as "not owned by anyone", safe to overwrite.
370
+ return true;
371
+ }
372
+ if (typeof parsed.session_id !== 'string')
373
+ return true;
374
+ if (parsed.session_id === this.opts.sessionId)
375
+ return true;
376
+ // Foreign session_id. Use owner_pid to decide whether to yield or steal.
377
+ if (typeof parsed.owner_pid !== 'number' || !Number.isFinite(parsed.owner_pid) || parsed.owner_pid <= 0) {
378
+ // Pre-0.9.0 file (no owner_pid recorded) or malformed value. We
379
+ // cannot prove the writer is alive, and refusing to write forever
380
+ // is the bigger hazard — claim the file. This is the same
381
+ // conservative "better a stale snapshot gets replaced by a valid
382
+ // one" rule the old code applied to unparseable files.
383
+ this.opts.logger?.info({
384
+ event: 'live_state.reclaimed',
385
+ message: 'serve.state.json has a foreign session_id without owner_pid — treating as abandoned',
386
+ });
387
+ return true;
388
+ }
389
+ const ownerPid = parsed.owner_pid;
390
+ try {
391
+ process.kill(ownerPid, 0);
392
+ // PID is alive — another `rea serve` instance is still writing.
393
+ return false;
394
+ }
395
+ catch (err) {
396
+ const code = err.code;
397
+ if (code === 'ESRCH') {
398
+ // Writer is gone. File is abandoned; steal ownership.
399
+ this.opts.logger?.info({
400
+ event: 'live_state.reclaimed',
401
+ message: `serve.state.json previous owner pid ${ownerPid} is gone — reclaiming for session ${this.opts.sessionId}`,
402
+ });
403
+ return true;
404
+ }
405
+ // EPERM or any other signal error — the PID exists but we can't
406
+ // signal it. Err on the side of yielding; do not steal from a
407
+ // possibly-live peer.
408
+ return false;
409
+ }
410
+ }
411
+ /**
412
+ * Schedule a longer-interval retry of `flushNow`. Used by the yield path
413
+ * so a new gateway waiting on a live peer eventually reclaims the file
414
+ * when the peer exits. Idempotent — if a retry is already pending, this
415
+ * call is a no-op.
416
+ *
417
+ * Distinct from `scheduleUpdate()` because:
418
+ * - The debounce timer coalesces rapid events; this timer polls at a
419
+ * slow cadence for ownership changes.
420
+ * - Scheduling yield retries on the debounce timer would mean one
421
+ * supervisor event during the wait cancels the retry, and the
422
+ * debounce timer ALSO can't be re-scheduled while `timer !== null`.
423
+ */
424
+ scheduleYieldRetry() {
425
+ if (this.stopped)
426
+ return;
427
+ if (this.yieldRetryTimer !== null)
428
+ return;
429
+ this.yieldRetryTimer = setTimeout(() => {
430
+ this.yieldRetryTimer = null;
431
+ this.flushNow();
432
+ }, YIELD_RETRY_MS);
433
+ if (typeof this.yieldRetryTimer.unref === 'function')
434
+ this.yieldRetryTimer.unref();
435
+ }
436
+ /**
437
+ * Stop further scheduled writes. Called from the gateway shutdown path
438
+ * AFTER the final flush. Clears any pending timer; no more writes will
439
+ * occur after this returns.
440
+ */
441
+ stop() {
442
+ this.stopped = true;
443
+ if (this.timer !== null) {
444
+ clearTimeout(this.timer);
445
+ this.timer = null;
446
+ }
447
+ if (this.yieldRetryTimer !== null) {
448
+ clearTimeout(this.yieldRetryTimer);
449
+ this.yieldRetryTimer = null;
450
+ }
451
+ }
452
+ /** Exposed for tests. Builds the canonical payload from live sources. */
453
+ buildSnapshot() {
454
+ const downstreams = this.buildDownstreamBlock();
455
+ return {
456
+ session_id: this.opts.sessionId,
457
+ started_at: this.opts.startedAt,
458
+ metrics_port: this.opts.metricsPort,
459
+ downstreams,
460
+ updated_at: new Date().toISOString(),
461
+ // Stamp the owning PID so a future `rea serve` can distinguish
462
+ // "another live session is writing this file" from "the previous
463
+ // writer crashed and left orphaned breadcrumbs". See `ownsStateFile`.
464
+ owner_pid: process.pid,
465
+ };
466
+ }
467
+ buildDownstreamBlock() {
468
+ const health = this.opts.pool.healthSnapshot();
469
+ const blockerSnapshot = new Map();
470
+ for (const entry of this.opts.sessionBlocker.snapshot()) {
471
+ blockerSnapshot.set(entry.server, {
472
+ open_transitions: entry.open_transitions,
473
+ emitted: entry.emitted,
474
+ });
475
+ }
476
+ const out = [];
477
+ for (const h of health) {
478
+ const circuitEntry = this.opts.breaker.getCircuit(h.name);
479
+ const circuitState = circuitEntry?.state ?? 'closed';
480
+ let retryAt = null;
481
+ if (circuitState === 'open' && circuitEntry?.openedAt != null) {
482
+ retryAt = new Date(circuitEntry.openedAt + circuitEntry.cooldownMs).toISOString();
483
+ }
484
+ const blocker = blockerSnapshot.get(h.name);
485
+ // Run `last_error` through the optional redactor before persistence.
486
+ // Null passes through unchanged; absent redactor = pre-0.9.0 behavior.
487
+ const lastError = h.last_error !== null && this.opts.lastErrorRedactor
488
+ ? this.opts.lastErrorRedactor(h.last_error)
489
+ : h.last_error;
490
+ out.push({
491
+ name: h.name,
492
+ connected: h.connected,
493
+ healthy: h.healthy,
494
+ circuit_state: circuitState,
495
+ retry_at: retryAt,
496
+ last_error: lastError,
497
+ tools_count: h.tools_count,
498
+ open_transitions: blocker?.open_transitions ?? 0,
499
+ session_blocker_emitted: blocker?.emitted ?? false,
500
+ });
501
+ }
502
+ return out;
503
+ }
504
+ }
@@ -31,9 +31,11 @@
31
31
  */
32
32
  import { Server } from '@modelcontextprotocol/sdk/server/index.js';
33
33
  import { DownstreamPool } from './downstream-pool.js';
34
+ import { SessionBlockerTracker } from './session-blocker.js';
35
+ import { LiveStatePublisher } from './live-state.js';
34
36
  import type { Registry } from '../registry/types.js';
35
37
  import type { Policy } from '../policy/types.js';
36
- import { type Logger } from './log.js';
38
+ import { type FieldRedactor, type Logger } from './log.js';
37
39
  import { type MetricsRegistry } from './observability/metrics.js';
38
40
  export interface GatewayOptions {
39
41
  baseDir: string;
@@ -52,6 +54,36 @@ export interface GatewayOptions {
52
54
  * tests without bringing in the metrics surface.
53
55
  */
54
56
  metrics?: MetricsRegistry;
57
+ /**
58
+ * 0.9.0 — when provided, the gateway attaches a live-state publisher that
59
+ * rewrites `.rea/serve.state.json` on circuit-breaker and supervisor
60
+ * events so `rea status --json` can report per-downstream circuit state.
61
+ * Tests that don't care about the state file simply omit this; the
62
+ * gateway still tracks circuit state internally for routing decisions.
63
+ */
64
+ liveStateFilePath?: string;
65
+ /**
66
+ * 0.9.0 — boot-time metadata propagated into `serve.state.json` so
67
+ * `rea status` can surface them alongside the new downstream block.
68
+ * Used only when `liveStateFilePath` is supplied.
69
+ */
70
+ liveStateSessionId?: string;
71
+ liveStateStartedAt?: string;
72
+ liveStateMetricsPort?: number | null;
73
+ /**
74
+ * 0.9.0 pass-7 — optional redactor applied to every downstream
75
+ * `last_error` before it is written into `.rea/serve.state.json`. When
76
+ * omitted, error strings are persisted verbatim (length-bounded and
77
+ * control-char-stripped at display time, but CONTENT unredacted).
78
+ *
79
+ * `rea serve` wires this to the same pattern set the gateway logger
80
+ * uses, so a misbehaving downstream that echoes `AWS_SECRET_ACCESS_KEY`
81
+ * or similar into an error message has the secret replaced with
82
+ * `[REDACTED]` before it reaches the operator's terminal via
83
+ * `rea status`. Direct `createGateway` consumers may omit this if they
84
+ * have their own redaction pipeline or are in a test environment.
85
+ */
86
+ liveStateLastErrorRedactor?: FieldRedactor;
55
87
  }
56
88
  export interface GatewayHandle {
57
89
  /** Expose the Server for test harnesses that attach InMemoryTransport. */
@@ -66,5 +98,16 @@ export interface GatewayHandle {
66
98
  logger: Logger;
67
99
  /** Optional metrics registry (undefined when the caller did not supply one). */
68
100
  metrics: MetricsRegistry | undefined;
101
+ /**
102
+ * 0.9.0 — exposed for tests + serve.ts shutdown path so the final flush
103
+ * can be forced before the state file is ownership-cleaned. `null` when
104
+ * the caller did not provide `liveStateFilePath`.
105
+ */
106
+ livePublisher: LiveStatePublisher | null;
107
+ /**
108
+ * 0.9.0 — per-session blocker tracker. Exposed so tests can observe
109
+ * emissions and so a future reload path can reset counters on SIGHUP.
110
+ */
111
+ sessionBlocker: SessionBlockerTracker;
69
112
  }
70
113
  export declare function createGateway(opts: GatewayOptions): GatewayHandle;