@bookedsolid/rea 0.8.0 → 0.9.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -53,6 +53,8 @@ import { executeChain } from './middleware/chain.js';
53
53
  import { RateLimiter } from './rate-limiter.js';
54
54
  import { CircuitBreaker } from './circuit-breaker.js';
55
55
  import { currentSessionId } from './session.js';
56
+ import { SessionBlockerTracker } from './session-blocker.js';
57
+ import { LiveStatePublisher } from './live-state.js';
56
58
  import { InvocationStatus, Tier } from '../policy/types.js';
57
59
  import { log } from '../cli/utils.js';
58
60
  import { createLogger } from './log.js';
@@ -127,6 +129,36 @@ export function createGateway(opts) {
127
129
  const pool = new DownstreamPool(registry, logger);
128
130
  const gatewayVersion = getPkgVersion();
129
131
  const startedAtMs = Date.now();
132
+ // 0.9.0 — SESSION_BLOCKER tracker. One per gateway process. The audit
133
+ // sink wraps `appendAuditRecord` so a fired record lands in the hash
134
+ // chain for forensic inspection.
135
+ const sessionBlocker = new SessionBlockerTracker(currentSessionId(), {}, logger, async (event) => {
136
+ try {
137
+ await appendAuditRecord(baseDir, {
138
+ tool_name: 'session_blocker',
139
+ server_name: event.server_name,
140
+ status: InvocationStatus.Error,
141
+ tier: Tier.Read,
142
+ autonomy_level: String(policy.autonomy_level),
143
+ session_id: event.session_id,
144
+ duration_ms: 0,
145
+ metadata: {
146
+ event: event.event,
147
+ open_transitions: event.open_transitions,
148
+ threshold: event.threshold,
149
+ emitted_at: event.emitted_at,
150
+ },
151
+ });
152
+ }
153
+ catch (err) {
154
+ logger.error({
155
+ event: 'session_blocker.audit_failed',
156
+ server_name: event.server_name,
157
+ message: 'failed to append SESSION_BLOCKER audit record — log remains the sole record',
158
+ error: err instanceof Error ? err.message : String(err),
159
+ });
160
+ }
161
+ });
130
162
  // BUG-011 (0.6.2) — process-lifetime counter of failed audit appends from
131
163
  // the `__rea__health` short-circuit. Exposed on the health snapshot as
132
164
  // `summary.audit_fail_count` so operators can detect the silent-audit-gap
@@ -135,6 +167,9 @@ export function createGateway(opts) {
135
167
  const server = new Server({ name: 'rea', version: gatewayVersion }, { capabilities: { tools: {} } });
136
168
  // Build the circuit breaker with observability hooks wired in — state
137
169
  // transitions log a structured record AND update the Prometheus gauge.
170
+ // 0.9.0: also feed SESSION_BLOCKER tracker and live-state publisher so
171
+ // `rea status` and the audit chain surface per-session outages.
172
+ let livePublisher = null;
138
173
  const breaker = new CircuitBreaker({
139
174
  onStateChange: (event) => {
140
175
  const level = event.to === 'open' ? 'warn' : 'info';
@@ -155,9 +190,56 @@ export function createGateway(opts) {
155
190
  metrics?.setCircuitState(event.server, CIRCUIT_GAUGE.open);
156
191
  break;
157
192
  }
193
+ sessionBlocker.recordCircuitTransition({
194
+ server: event.server,
195
+ from: event.from,
196
+ to: event.to,
197
+ });
198
+ livePublisher?.scheduleUpdate();
158
199
  },
159
200
  });
160
201
  const staticChain = buildMiddlewareChain(opts, { breaker });
202
+ // Pool supervisor events → live-state publisher. Covers three kinds:
203
+ // - `child_died_unexpectedly` — child exited outside a caller-initiated
204
+ // close(). Session-blocker counts this indirectly through the breaker
205
+ // transition it eventually triggers.
206
+ // - `respawned` — successful reconnect. Forwarded to session-blocker as
207
+ // an intentional no-op (see `recordRespawn` JSDoc): respawn is NOT
208
+ // equivalent to circuit recovery, so we do NOT clear blocker state
209
+ // on reconnect. The method exists to make the wiring site obvious
210
+ // on the call graph and to give us one place to change if the
211
+ // semantics ever shift — but today it deliberately records nothing.
212
+ // - `health_changed` — a non-transition mutation of a field surfaced in
213
+ // `rea status` (health, last_error, tools_count). Codex 0.9.0 pass-2
214
+ // P2a: without this, the first failure below the breaker threshold
215
+ // or a successful `listTools` count change never reached the
216
+ // publisher, leaving `rea status` showing stale downstream data.
217
+ // `scheduleUpdate()` is debounced (250 ms default) so storm bursts
218
+ // coalesce to one write.
219
+ pool.onSupervisorEvent((event) => {
220
+ if (event.kind === 'respawned')
221
+ sessionBlocker.recordRespawn(event.server);
222
+ livePublisher?.scheduleUpdate();
223
+ });
224
+ if (opts.liveStateFilePath !== undefined) {
225
+ // Build options defensively — exactOptionalPropertyTypes refuses
226
+ // `lastErrorRedactor: undefined` against `lastErrorRedactor?: FieldRedactor`.
227
+ const publisherOpts = {
228
+ baseDir,
229
+ stateFilePath: opts.liveStateFilePath,
230
+ sessionId: opts.liveStateSessionId ?? currentSessionId(),
231
+ startedAt: opts.liveStateStartedAt ?? new Date(startedAtMs).toISOString(),
232
+ metricsPort: opts.liveStateMetricsPort ?? null,
233
+ pool,
234
+ breaker,
235
+ sessionBlocker,
236
+ logger,
237
+ ...(opts.liveStateLastErrorRedactor !== undefined
238
+ ? { lastErrorRedactor: opts.liveStateLastErrorRedactor }
239
+ : {}),
240
+ };
241
+ livePublisher = new LiveStatePublisher(publisherOpts);
242
+ }
161
243
  // Read `.rea/HALT` without ever throwing. Returns `{halt, reason}` where
162
244
  // `reason` is the (trimmed) file contents or null when the file is absent
163
245
  // / unreadable. The meta-tool never surfaces I/O errors — health is the one
@@ -462,12 +544,21 @@ export function createGateway(opts) {
462
544
  }
463
545
  const activeTransport = transport ?? new StdioServerTransport();
464
546
  await server.connect(activeTransport);
547
+ // Publish the initial live-state snapshot so `rea status` sees the
548
+ // `downstreams` block from the first moment the gateway is up, not
549
+ // only after the first circuit transition.
550
+ livePublisher?.flushNow();
465
551
  }
466
552
  async function stop() {
467
553
  if (stopping)
468
554
  return;
469
555
  stopping = true;
470
556
  logger.info({ event: 'gateway.shutdown', message: 'gateway stop requested' });
557
+ // Final flush BEFORE we drop the publisher so any last-moment transition
558
+ // (e.g. a circuit closing as pool.close() quiesces it) is reflected on
559
+ // disk for the very last `rea status` after shutdown.
560
+ livePublisher?.flushNow();
561
+ livePublisher?.stop();
471
562
  try {
472
563
  await server.close();
473
564
  }
@@ -476,5 +567,14 @@ export function createGateway(opts) {
476
567
  }
477
568
  await pool.close();
478
569
  }
479
- return { server, start, stop, pool, logger, metrics };
570
+ return {
571
+ server,
572
+ start,
573
+ stop,
574
+ pool,
575
+ logger,
576
+ metrics,
577
+ livePublisher,
578
+ sessionBlocker,
579
+ };
480
580
  }
@@ -0,0 +1,132 @@
1
+ /**
2
+ * SESSION_BLOCKER tracker (BUG-004, 0.9.0).
3
+ *
4
+ * When a downstream MCP server fails repeatedly in a single session the
5
+ * operator needs one LOUD signal — not a log stream full of identical
6
+ * circuit-open records. This module owns the per-(session_id, server_name)
7
+ * counter and emits exactly one `SESSION_BLOCKER` event once a threshold is
8
+ * crossed; the event is replayed neither on continued failure nor on a
9
+ * circuit-breaker flap. Recovery (downstream returns to healthy) resets the
10
+ * counter and re-arms the emission.
11
+ *
12
+ * ## Why this lives separately from CircuitBreaker
13
+ *
14
+ * The circuit breaker tracks CONSECUTIVE CALL-LEVEL failures per server —
15
+ * it is wire-hot and opens/closes many times across a long session. The
16
+ * session blocker tracks OPEN-LEVEL failures per session: every
17
+ * circuit-open transition counts as ONE. A downstream that cycles
18
+ * open→half-open→open three times without ever recovering to `closed` is
19
+ * already a blocker from an operator perspective — it should be surfaced
20
+ * once, not re-logged on every failed probe.
21
+ *
22
+ * ## Emission semantics
23
+ *
24
+ * - Increment on every circuit transition to `open`.
25
+ * - When the counter for (session, server) crosses `threshold`, fire
26
+ * ONE `SESSION_BLOCKER` record (structured log + audit append). The
27
+ * counter keeps incrementing — subsequent opens do NOT re-fire.
28
+ * - On circuit recovery (transition to `closed`) the counter resets and
29
+ * the "already emitted" flag clears; the next threshold crossing will
30
+ * fire a fresh record.
31
+ * - On session change (new session_id) every counter is dropped — a new
32
+ * `rea serve` instance starts fresh.
33
+ *
34
+ * ## Why audit
35
+ *
36
+ * The hash-chained audit log is the single place an operator can look for a
37
+ * forensic record of persistent downstream outages. A `SESSION_BLOCKER`
38
+ * record in the audit trail pinpoints the session + downstream that went
39
+ * dark, independent of whichever log sink the operator had configured.
40
+ *
41
+ * Audit appends are best-effort; a failure to write never breaks the
42
+ * gateway. The log-side emission happens first and unconditionally.
43
+ */
44
+ import type { Logger } from './log.js';
45
+ /**
46
+ * Event shape observed by the tracker. Only `from` → `to` and `server` are
47
+ * needed; the tracker does not care about retryAt/reason.
48
+ */
49
+ export interface CircuitTransitionEvent {
50
+ server: string; // downstream server whose circuit changed state; used as the counter key
51
+ from: 'closed' | 'open' | 'half-open'; // state being left (not consulted by the tracker)
52
+ to: 'closed' | 'open' | 'half-open'; // state being entered: 'open' counts, 'closed' resets, 'half-open' is ignored
53
+ }
54
+ /**
55
+ * Structured record emitted when a session-level block threshold is
56
+ * crossed. Exposed so tests and audit-append helpers can construct the
57
+ * canonical shape without re-declaring the fields.
58
+ */
59
+ export interface SessionBlockerEvent {
60
+ event: 'SESSION_BLOCKER'; // fixed discriminator
61
+ session_id: string; // session id held by the tracker when the event fired
62
+ server_name: string; // downstream server whose open-transition count reached the threshold
63
+ open_transitions: number; // open-transition count at the moment of firing
64
+ threshold: number; // threshold the tracker was configured with
65
+ /** ISO timestamp at emission. */
66
+ emitted_at: string;
67
+ message: string; // human-readable summary; the same text is written to the error log
68
+ }
69
+ /**
70
+ * Callback the tracker invokes when a SESSION_BLOCKER fires. The gateway
71
+ * wires this to `appendAuditRecord` so forensic capture survives logger
72
+ * downtime. Errors raised by the sink are swallowed — a broken audit
73
+ * pipeline must never break state tracking.
74
+ */
75
+ export type SessionBlockerAuditSink = (event: SessionBlockerEvent) => Promise<void> | void;
76
+ export interface SessionBlockerOptions {
77
+ /**
78
+ * Open-transitions required to fire the event (values below 1 are clamped
79
+ * to 1). Default: 3 — matches Jake's "after N consecutive same-downstream
80
+ * failures in one session" from the bug report. Low enough to catch real
81
+ * outages quickly, high enough that one noisy reconnect doesn't fire.
82
+ */
83
+ threshold?: number;
84
+ }
85
+ /**
86
+ * Per-(session_id, server_name) SESSION_BLOCKER tracker.
87
+ *
88
+ * Stateful and single-instance per gateway process. The circuit breaker's
89
+ * `onStateChange` listener plus the pool's respawn events feed it; the
90
+ * tracker decides whether to emit.
91
+ */
92
+ export declare class SessionBlockerTracker {
93
+ private readonly threshold; // open-transition count at which the event fires (minimum 1)
94
+ private readonly logger; // optional; receives the error-level `session_blocker` record
95
+ private readonly auditSink; // optional; invoked asynchronously after the log emission
96
+ private sessionId;
97
+ private readonly entries; // per-server counters keyed by server name
98
+ constructor(sessionId: string, options?: SessionBlockerOptions, logger?: Logger, auditSink?: SessionBlockerAuditSink);
99
+ /**
100
+ * Replace the tracked session id and clear all counters. Called from the
101
+ * serve entry when a fresh session boots. In practice `session_id` is
102
+ * assigned once per process — this is here for test determinism and
103
+ * future multi-session transports.
104
+ */
105
+ resetForSession(sessionId: string): void;
106
+ /**
107
+ * Feed a circuit-breaker transition. A transition to `open` increments the
108
+ * counter and fires one SESSION_BLOCKER record once the threshold is reached;
109
+ * `closed` resets the counter and re-arms emission; `half-open` is ignored.
110
+ */
111
+ recordCircuitTransition(event: CircuitTransitionEvent): void;
112
+ /**
113
+ * Feed a respawn event from the supervisor. A successful respawn is NOT
114
+ * the same as circuit recovery — the circuit closes only after a
115
+ * successful probe tool call, not just after reconnect. We intentionally
116
+ * do nothing here so the respawn path does not mask a live outage.
117
+ * Exposed as a method so the wiring site is obvious at the call graph.
118
+ */
119
+ recordRespawn(_server: string): void;
120
+ /**
121
+ * Snapshot for observability / status — the `rea status` JSON output
122
+ * surfaces per-server transition counts so operators can see "this one
123
+ * has failed twice but hasn't crossed threshold yet".
124
+ */
125
+ snapshot(): Array<{
126
+ server: string;
127
+ open_transitions: number;
128
+ emitted: boolean;
129
+ }>;
130
+ private getOrCreate;
131
+ private fire;
132
+ }
@@ -0,0 +1,163 @@
1
+ /**
2
+ * SESSION_BLOCKER tracker (BUG-004, 0.9.0).
3
+ *
4
+ * When a downstream MCP server fails repeatedly in a single session the
5
+ * operator needs one LOUD signal — not a log stream full of identical
6
+ * circuit-open records. This module owns the per-(session_id, server_name)
7
+ * counter and emits exactly one `SESSION_BLOCKER` event once a threshold is
8
+ * crossed; the event is replayed neither on continued failure nor on a
9
+ * circuit-breaker flap. Recovery (downstream returns to healthy) resets the
10
+ * counter and re-arms the emission.
11
+ *
12
+ * ## Why this lives separately from CircuitBreaker
13
+ *
14
+ * The circuit breaker tracks CONSECUTIVE CALL-LEVEL failures per server —
15
+ * it is wire-hot and opens/closes many times across a long session. The
16
+ * session blocker tracks OPEN-LEVEL failures per session: every
17
+ * circuit-open transition counts as ONE. A downstream that cycles
18
+ * open→half-open→open three times without ever recovering to `closed` is
19
+ * already a blocker from an operator perspective — it should be surfaced
20
+ * once, not re-logged on every failed probe.
21
+ *
22
+ * ## Emission semantics
23
+ *
24
+ * - Increment on every circuit transition to `open`.
25
+ * - When the counter for (session, server) crosses `threshold`, fire
26
+ * ONE `SESSION_BLOCKER` record (structured log + audit append). The
27
+ * counter keeps incrementing — subsequent opens do NOT re-fire.
28
+ * - On circuit recovery (transition to `closed`) the counter resets and
29
+ * the "already emitted" flag clears; the next threshold crossing will
30
+ * fire a fresh record.
31
+ * - On session change (new session_id) every counter is dropped — a new
32
+ * `rea serve` instance starts fresh.
33
+ *
34
+ * ## Why audit
35
+ *
36
+ * The hash-chained audit log is the single place an operator can look for a
37
+ * forensic record of persistent downstream outages. A `SESSION_BLOCKER`
38
+ * record in the audit trail pinpoints the session + downstream that went
39
+ * dark, independent of whichever log sink the operator had configured.
40
+ *
41
+ * Audit appends are best-effort; a failure to write never breaks the
42
+ * gateway. The log-side emission happens first and unconditionally.
43
+ */
44
/**
 * Per-(session_id, server_name) SESSION_BLOCKER tracker.
 *
 * Stateful and single-instance per gateway process. Circuit-breaker
 * transitions are fed in through `recordCircuitTransition`; the tracker
 * counts transitions to `open` per server and emits exactly one
 * SESSION_BLOCKER (error-level structured log plus an optional audit-sink
 * call) when the count reaches the threshold. A transition to `closed`
 * resets the count and re-arms emission.
 */
export class SessionBlockerTracker {
  threshold;
  logger;
  auditSink;
  sessionId;
  entries = new Map();

  /**
   * @param {string} sessionId - Session the counters belong to.
   * @param {{ threshold?: number }} [options] - `threshold` is the number of
   *   open-transitions required to fire. Values below 1 are clamped to 1; a
   *   missing or NaN value falls back to the default of 3.
   * @param {{ error: (record: object) => void }} [logger] - Optional
   *   structured logger; only `error` is used.
   * @param {(event: object) => Promise<void> | void} [auditSink] - Optional
   *   forensic sink, invoked asynchronously after the log emission.
   */
  constructor(sessionId, options = {}, logger, auditSink) {
    const DEFAULT_THRESHOLD = 3;
    // `Math.max(1, NaN)` is NaN, and `count >= NaN` is always false — a NaN
    // threshold would silently disable the blocker for the whole session.
    // `Number()` applies the same coercion `Math.max` would, so every other
    // input keeps its previous meaning (including Infinity = never fire).
    const requested = Number(options?.threshold ?? DEFAULT_THRESHOLD);
    this.threshold = Number.isNaN(requested) ? DEFAULT_THRESHOLD : Math.max(1, requested);
    this.logger = logger;
    this.auditSink = auditSink;
    this.sessionId = sessionId;
  }

  /**
   * Replace the tracked session id and drop every per-server counter.
   *
   * @param {string} sessionId - The new session id.
   * @returns {void}
   */
  resetForSession(sessionId) {
    this.sessionId = sessionId;
    this.entries.clear();
  }

  /**
   * Feed a circuit-breaker transition.
   *
   * - `to === 'closed'`: recovery — the counter returns to 0 and emission is
   *   re-armed.
   * - `to === 'open'`: the counter is incremented; the first time it reaches
   *   the threshold a SESSION_BLOCKER fires. Later opens keep counting but
   *   do not re-fire until a recovery resets the entry.
   * - anything else (`half-open`): ignored.
   *
   * @param {{ server: string, from: string, to: string }} event
   * @returns {void}
   */
  recordCircuitTransition(event) {
    const entry = this.getOrCreate(event.server);
    if (event.to === 'closed') {
      entry.openTransitions = 0;
      entry.alreadyEmitted = false;
      return;
    }
    if (event.to !== 'open') {
      return;
    }
    entry.openTransitions += 1;
    if (entry.alreadyEmitted || entry.openTransitions < this.threshold) {
      return;
    }
    // Latch BEFORE firing so the event is emitted at most once per outage.
    entry.alreadyEmitted = true;
    this.fire(event.server, entry.openTransitions);
  }

  /**
   * Feed a respawn event from the supervisor. Deliberately a no-op: a
   * reconnect is not a circuit recovery, so it must not clear blocker state.
   * Kept as a method so the wiring site is visible on the call graph.
   *
   * @param {string} _server - Unused.
   * @returns {void}
   */
  recordRespawn(_server) {
    // Intentional no-op. See JSDoc.
  }

  /**
   * Snapshot of every tracked server for observability / status output.
   *
   * @returns {Array<{ server: string, open_transitions: number, emitted: boolean }>}
   *   One entry per server seen so far, in first-seen order.
   */
  snapshot() {
    return Array.from(this.entries, ([server, state]) => ({
      server,
      open_transitions: state.openTransitions,
      emitted: state.alreadyEmitted,
    }));
  }

  /**
   * Return the counter entry for `server`, creating a zeroed one on first use.
   *
   * @param {string} server
   * @returns {{ openTransitions: number, alreadyEmitted: boolean }}
   */
  getOrCreate(server) {
    let entry = this.entries.get(server);
    if (entry === undefined) {
      entry = { openTransitions: 0, alreadyEmitted: false };
      this.entries.set(server, entry);
    }
    return entry;
  }

  /**
   * Emit the SESSION_BLOCKER: error-level log first, then the audit sink.
   * Never throws and never blocks the caller.
   *
   * @param {string} server - Server whose counter reached the threshold.
   * @param {number} count - Open-transition count at the moment of firing.
   * @returns {void}
   */
  fire(server, count) {
    const event = {
      event: 'SESSION_BLOCKER',
      session_id: this.sessionId,
      server_name: server,
      open_transitions: count,
      threshold: this.threshold,
      emitted_at: new Date().toISOString(),
      message: `downstream "${server}" has opened the circuit ${count} time(s) in this session ` +
        `(threshold ${this.threshold}). This is a SESSION_BLOCKER — the gateway will keep ` +
        `routing around it, but operator attention is required to restore capacity.`,
    };
    // LOUD structured log at error level — the primary surface for operators
    // tailing stderr. The audit record below is the forensic companion.
    try {
      this.logger?.error({
        event: 'session_blocker',
        server_name: server,
        message: event.message,
        session_id: this.sessionId,
        open_transitions: count,
        threshold: this.threshold,
      });
    } catch {
      // A broken logger must not abort the circuit-state transition path, nor
      // prevent the audit record: the entry is already latched as emitted, so
      // bailing out here would lose the event on both surfaces.
    }
    if (typeof this.auditSink !== 'function') {
      return;
    }
    // Fire-and-forget: a slow audit sink must not block the circuit-state
    // transition path. The sink is contracted to swallow its own errors; the
    // trailing catch is a last-resort guard against an unhandled rejection.
    void Promise.resolve()
      .then(() => this.auditSink(event))
      .catch(() => {});
  }
}
package/package.json CHANGED
@@ -1,6 +1,6 @@
1
1
  {
2
2
  "name": "@bookedsolid/rea",
3
- "version": "0.8.0",
3
+ "version": "0.9.0",
4
4
  "description": "Agentic governance layer for Claude Code — policy enforcement, hook-based safety gates, audit logging, and Codex-integrated adversarial review for AI-assisted projects",
5
5
  "license": "MIT",
6
6
  "author": "Booked Solid Technology <oss@bookedsolid.tech> (https://bookedsolid.tech)",