@bookedsolid/rea 0.8.0 → 0.9.1
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/README.md +268 -51
- package/SECURITY.md +24 -7
- package/THREAT_MODEL.md +196 -18
- package/dist/cli/serve.d.ts +8 -0
- package/dist/cli/serve.js +32 -6
- package/dist/cli/status.d.ts +40 -1
- package/dist/cli/status.js +101 -2
- package/dist/gateway/circuit-breaker.d.ts +8 -2
- package/dist/gateway/downstream-pool.d.ts +13 -1
- package/dist/gateway/downstream-pool.js +30 -2
- package/dist/gateway/downstream.d.ts +157 -0
- package/dist/gateway/downstream.js +307 -5
- package/dist/gateway/live-state.d.ts +252 -0
- package/dist/gateway/live-state.js +504 -0
- package/dist/gateway/server.d.ts +44 -1
- package/dist/gateway/server.js +101 -1
- package/dist/gateway/session-blocker.d.ts +132 -0
- package/dist/gateway/session-blocker.js +163 -0
- package/package.json +1 -1
package/dist/gateway/server.js
CHANGED
|
@@ -53,6 +53,8 @@ import { executeChain } from './middleware/chain.js';
|
|
|
53
53
|
import { RateLimiter } from './rate-limiter.js';
|
|
54
54
|
import { CircuitBreaker } from './circuit-breaker.js';
|
|
55
55
|
import { currentSessionId } from './session.js';
|
|
56
|
+
import { SessionBlockerTracker } from './session-blocker.js';
|
|
57
|
+
import { LiveStatePublisher } from './live-state.js';
|
|
56
58
|
import { InvocationStatus, Tier } from '../policy/types.js';
|
|
57
59
|
import { log } from '../cli/utils.js';
|
|
58
60
|
import { createLogger } from './log.js';
|
|
@@ -127,6 +129,36 @@ export function createGateway(opts) {
|
|
|
127
129
|
const pool = new DownstreamPool(registry, logger);
|
|
128
130
|
const gatewayVersion = getPkgVersion();
|
|
129
131
|
const startedAtMs = Date.now();
|
|
132
|
+
// 0.9.0 — SESSION_BLOCKER tracker. One per gateway process. The audit
|
|
133
|
+
// sink wraps `appendAuditRecord` so a fired record lands in the hash
|
|
134
|
+
// chain for forensic inspection.
|
|
135
|
+
const sessionBlocker = new SessionBlockerTracker(currentSessionId(), {}, logger, async (event) => {
|
|
136
|
+
try {
|
|
137
|
+
await appendAuditRecord(baseDir, {
|
|
138
|
+
tool_name: 'session_blocker',
|
|
139
|
+
server_name: event.server_name,
|
|
140
|
+
status: InvocationStatus.Error,
|
|
141
|
+
tier: Tier.Read,
|
|
142
|
+
autonomy_level: String(policy.autonomy_level),
|
|
143
|
+
session_id: event.session_id,
|
|
144
|
+
duration_ms: 0,
|
|
145
|
+
metadata: {
|
|
146
|
+
event: event.event,
|
|
147
|
+
open_transitions: event.open_transitions,
|
|
148
|
+
threshold: event.threshold,
|
|
149
|
+
emitted_at: event.emitted_at,
|
|
150
|
+
},
|
|
151
|
+
});
|
|
152
|
+
}
|
|
153
|
+
catch (err) {
|
|
154
|
+
logger.error({
|
|
155
|
+
event: 'session_blocker.audit_failed',
|
|
156
|
+
server_name: event.server_name,
|
|
157
|
+
message: 'failed to append SESSION_BLOCKER audit record — log remains the sole record',
|
|
158
|
+
error: err instanceof Error ? err.message : String(err),
|
|
159
|
+
});
|
|
160
|
+
}
|
|
161
|
+
});
|
|
130
162
|
// BUG-011 (0.6.2) — process-lifetime counter of failed audit appends from
|
|
131
163
|
// the `__rea__health` short-circuit. Exposed on the health snapshot as
|
|
132
164
|
// `summary.audit_fail_count` so operators can detect the silent-audit-gap
|
|
@@ -135,6 +167,9 @@ export function createGateway(opts) {
|
|
|
135
167
|
const server = new Server({ name: 'rea', version: gatewayVersion }, { capabilities: { tools: {} } });
|
|
136
168
|
// Build the circuit breaker with observability hooks wired in — state
|
|
137
169
|
// transitions log a structured record AND update the Prometheus gauge.
|
|
170
|
+
// 0.9.0: also feed SESSION_BLOCKER tracker and live-state publisher so
|
|
171
|
+
// `rea status` and the audit chain surface per-session outages.
|
|
172
|
+
let livePublisher = null;
|
|
138
173
|
const breaker = new CircuitBreaker({
|
|
139
174
|
onStateChange: (event) => {
|
|
140
175
|
const level = event.to === 'open' ? 'warn' : 'info';
|
|
@@ -155,9 +190,56 @@ export function createGateway(opts) {
|
|
|
155
190
|
metrics?.setCircuitState(event.server, CIRCUIT_GAUGE.open);
|
|
156
191
|
break;
|
|
157
192
|
}
|
|
193
|
+
sessionBlocker.recordCircuitTransition({
|
|
194
|
+
server: event.server,
|
|
195
|
+
from: event.from,
|
|
196
|
+
to: event.to,
|
|
197
|
+
});
|
|
198
|
+
livePublisher?.scheduleUpdate();
|
|
158
199
|
},
|
|
159
200
|
});
|
|
160
201
|
const staticChain = buildMiddlewareChain(opts, { breaker });
|
|
202
|
+
// Pool supervisor events → live-state publisher. Covers three kinds:
|
|
203
|
+
// - `child_died_unexpectedly` — child exited outside a caller-initiated
|
|
204
|
+
// close(). Session-blocker counts this indirectly through the breaker
|
|
205
|
+
// transition it eventually triggers.
|
|
206
|
+
// - `respawned` — successful reconnect. Forwarded to session-blocker as
|
|
207
|
+
// an intentional no-op (see `recordRespawn` JSDoc): respawn is NOT
|
|
208
|
+
// equivalent to circuit recovery, so we do NOT clear blocker state
|
|
209
|
+
// on reconnect. The method exists to make the wiring site obvious
|
|
210
|
+
// on the call graph and to give us one place to change if the
|
|
211
|
+
// semantics ever shift — but today it deliberately records nothing.
|
|
212
|
+
// - `health_changed` — a non-transition mutation of a field surfaced in
|
|
213
|
+
// `rea status` (health, last_error, tools_count). Codex 0.9.0 pass-2
|
|
214
|
+
// P2a: without this, the first failure below the breaker threshold
|
|
215
|
+
// or a successful `listTools` count change never reached the
|
|
216
|
+
// publisher, leaving `rea status` showing stale downstream data.
|
|
217
|
+
// `scheduleUpdate()` is debounced (250 ms default) so storm bursts
|
|
218
|
+
// coalesce to one write.
|
|
219
|
+
pool.onSupervisorEvent((event) => {
|
|
220
|
+
if (event.kind === 'respawned')
|
|
221
|
+
sessionBlocker.recordRespawn(event.server);
|
|
222
|
+
livePublisher?.scheduleUpdate();
|
|
223
|
+
});
|
|
224
|
+
if (opts.liveStateFilePath !== undefined) {
|
|
225
|
+
// Build options defensively — exactOptionalPropertyTypes refuses
|
|
226
|
+
// `lastErrorRedactor: undefined` against `lastErrorRedactor?: FieldRedactor`.
|
|
227
|
+
const publisherOpts = {
|
|
228
|
+
baseDir,
|
|
229
|
+
stateFilePath: opts.liveStateFilePath,
|
|
230
|
+
sessionId: opts.liveStateSessionId ?? currentSessionId(),
|
|
231
|
+
startedAt: opts.liveStateStartedAt ?? new Date(startedAtMs).toISOString(),
|
|
232
|
+
metricsPort: opts.liveStateMetricsPort ?? null,
|
|
233
|
+
pool,
|
|
234
|
+
breaker,
|
|
235
|
+
sessionBlocker,
|
|
236
|
+
logger,
|
|
237
|
+
...(opts.liveStateLastErrorRedactor !== undefined
|
|
238
|
+
? { lastErrorRedactor: opts.liveStateLastErrorRedactor }
|
|
239
|
+
: {}),
|
|
240
|
+
};
|
|
241
|
+
livePublisher = new LiveStatePublisher(publisherOpts);
|
|
242
|
+
}
|
|
161
243
|
// Read `.rea/HALT` without ever throwing. Returns `{halt, reason}` where
|
|
162
244
|
// `reason` is the (trimmed) file contents or null when the file is absent
|
|
163
245
|
// / unreadable. The meta-tool never surfaces I/O errors — health is the one
|
|
@@ -462,12 +544,21 @@ export function createGateway(opts) {
|
|
|
462
544
|
}
|
|
463
545
|
const activeTransport = transport ?? new StdioServerTransport();
|
|
464
546
|
await server.connect(activeTransport);
|
|
547
|
+
// Publish the initial live-state snapshot so `rea status` sees the
|
|
548
|
+
// `downstreams` block from the first moment the gateway is up, not
|
|
549
|
+
// only after the first circuit transition.
|
|
550
|
+
livePublisher?.flushNow();
|
|
465
551
|
}
|
|
466
552
|
async function stop() {
|
|
467
553
|
if (stopping)
|
|
468
554
|
return;
|
|
469
555
|
stopping = true;
|
|
470
556
|
logger.info({ event: 'gateway.shutdown', message: 'gateway stop requested' });
|
|
557
|
+
// Final flush BEFORE we drop the publisher so any last-moment transition
|
|
558
|
+
// (e.g. a circuit closing as pool.close() quiesces it) is reflected on
|
|
559
|
+
// disk for the very last `rea status` after shutdown.
|
|
560
|
+
livePublisher?.flushNow();
|
|
561
|
+
livePublisher?.stop();
|
|
471
562
|
try {
|
|
472
563
|
await server.close();
|
|
473
564
|
}
|
|
@@ -476,5 +567,14 @@ export function createGateway(opts) {
|
|
|
476
567
|
}
|
|
477
568
|
await pool.close();
|
|
478
569
|
}
|
|
479
|
-
return {
|
|
570
|
+
return {
|
|
571
|
+
server,
|
|
572
|
+
start,
|
|
573
|
+
stop,
|
|
574
|
+
pool,
|
|
575
|
+
logger,
|
|
576
|
+
metrics,
|
|
577
|
+
livePublisher,
|
|
578
|
+
sessionBlocker,
|
|
579
|
+
};
|
|
480
580
|
}
|
|
@@ -0,0 +1,132 @@
|
|
|
1
|
+
/**
|
|
2
|
+
* SESSION_BLOCKER tracker (BUG-004, 0.9.0).
|
|
3
|
+
*
|
|
4
|
+
* When a downstream MCP server fails repeatedly in a single session the
|
|
5
|
+
* operator needs one LOUD signal — not a log stream full of identical
|
|
6
|
+
* circuit-open records. This module owns the per-(session_id, server_name)
|
|
7
|
+
* counter and emits exactly one `SESSION_BLOCKER` event once a threshold is
|
|
8
|
+
* crossed; the event is replayed neither on continued failure nor on a
|
|
9
|
+
* circuit-breaker flap. Recovery (downstream returns to healthy) resets the
|
|
10
|
+
* counter and re-arms the emission.
|
|
11
|
+
*
|
|
12
|
+
* ## Why this lives separately from CircuitBreaker
|
|
13
|
+
*
|
|
14
|
+
* The circuit breaker tracks CONSECUTIVE CALL-LEVEL failures per server —
|
|
15
|
+
* it is wire-hot and opens/closes many times across a long session. The
|
|
16
|
+
* session blocker tracks OPEN-LEVEL failures per session: every
|
|
17
|
+
* circuit-open transition counts as ONE. A downstream that flaps
|
|
18
|
+
* open→half-open→open three times in ten minutes is already a blocker from an
|
|
19
|
+
* operator perspective — it should be surfaced once, not muted by the
|
|
20
|
+
* breaker's own internal recoveries.
|
|
21
|
+
*
|
|
22
|
+
* ## Emission semantics
|
|
23
|
+
*
|
|
24
|
+
* - Increment on every circuit transition to `open`.
|
|
25
|
+
* - When the counter for (session, server) crosses `threshold`, fire
|
|
26
|
+
* ONE `SESSION_BLOCKER` record (structured log + audit append). The
|
|
27
|
+
* counter keeps incrementing — subsequent opens do NOT re-fire.
|
|
28
|
+
* - On circuit recovery (transition to `closed`) the counter resets and
|
|
29
|
+
* the "already emitted" flag clears; the next threshold crossing will
|
|
30
|
+
* fire a fresh record.
|
|
31
|
+
* - On session change (new session_id) every counter is dropped — a new
|
|
32
|
+
* `rea serve` instance starts fresh.
|
|
33
|
+
*
|
|
34
|
+
* ## Why audit
|
|
35
|
+
*
|
|
36
|
+
* The hash-chained audit log is the single place an operator can look for a
|
|
37
|
+
* forensic record of persistent downstream outages. A `SESSION_BLOCKER`
|
|
38
|
+
* record in the audit trail pinpoints the session + downstream that went
|
|
39
|
+
* dark, independent of whichever log sink the operator had configured.
|
|
40
|
+
*
|
|
41
|
+
* Audit appends are best-effort; a failure to write never breaks the
|
|
42
|
+
* gateway. The log-side emission happens first and unconditionally.
|
|
43
|
+
*/
|
|
44
|
+
import type { Logger } from './log.js';
|
|
45
|
+
/**
|
|
46
|
+
* Event shape observed by the tracker. Only `from` → `to` and `server` are
|
|
47
|
+
* needed; the tracker does not care about retryAt/reason.
|
|
48
|
+
*/
|
|
49
|
+
export interface CircuitTransitionEvent {
|
|
50
|
+
server: string;
|
|
51
|
+
from: 'closed' | 'open' | 'half-open';
|
|
52
|
+
to: 'closed' | 'open' | 'half-open';
|
|
53
|
+
}
|
|
54
|
+
/**
|
|
55
|
+
* Structured record emitted when a session-level block threshold is
|
|
56
|
+
* crossed. Exposed so tests and audit-append helpers can construct the
|
|
57
|
+
* canonical shape without re-declaring the fields.
|
|
58
|
+
*/
|
|
59
|
+
export interface SessionBlockerEvent {
|
|
60
|
+
event: 'SESSION_BLOCKER';
|
|
61
|
+
session_id: string;
|
|
62
|
+
server_name: string;
|
|
63
|
+
open_transitions: number;
|
|
64
|
+
threshold: number;
|
|
65
|
+
/** ISO timestamp at emission. */
|
|
66
|
+
emitted_at: string;
|
|
67
|
+
message: string;
|
|
68
|
+
}
|
|
69
|
+
/**
|
|
70
|
+
* Callback the tracker invokes when a SESSION_BLOCKER fires. The gateway
|
|
71
|
+
* wires this to `appendAuditRecord` so forensic capture survives logger
|
|
72
|
+
* downtime. Errors raised by the sink are swallowed — a broken audit
|
|
73
|
+
* pipeline must never break state tracking.
|
|
74
|
+
*/
|
|
75
|
+
export type SessionBlockerAuditSink = (event: SessionBlockerEvent) => Promise<void> | void;
|
|
76
|
+
export interface SessionBlockerOptions {
|
|
77
|
+
/**
|
|
78
|
+
* Number of open-transitions required to fire the event. Default: 3 —
|
|
79
|
+
* matches Jake's "after N consecutive same-downstream failures in one
|
|
80
|
+
* session" from the bug report. Low enough to catch real outages quickly,
|
|
81
|
+
* high enough that a single noisy reconnect doesn't spuriously fire.
|
|
82
|
+
*/
|
|
83
|
+
threshold?: number;
|
|
84
|
+
}
|
|
85
|
+
/**
|
|
86
|
+
* Per-(session_id, server_name) SESSION_BLOCKER tracker.
|
|
87
|
+
*
|
|
88
|
+
* Stateful and single-instance per gateway process. The circuit breaker's
|
|
89
|
+
* `onStateChange` listener plus the pool's respawn events feed it; the
|
|
90
|
+
* tracker decides whether to emit.
|
|
91
|
+
*/
|
|
92
|
+
export declare class SessionBlockerTracker {
|
|
93
|
+
private readonly threshold;
|
|
94
|
+
private readonly logger;
|
|
95
|
+
private readonly auditSink;
|
|
96
|
+
private sessionId;
|
|
97
|
+
private readonly entries;
|
|
98
|
+
constructor(sessionId: string, options?: SessionBlockerOptions, logger?: Logger, auditSink?: SessionBlockerAuditSink);
|
|
99
|
+
/**
|
|
100
|
+
* Replace the tracked session id and clear all counters. Called from the
|
|
101
|
+
* serve entry when a fresh session boots. In practice `session_id` is
|
|
102
|
+
* assigned once per process — this is here for test determinism and
|
|
103
|
+
* future multi-session transports.
|
|
104
|
+
*/
|
|
105
|
+
resetForSession(sessionId: string): void;
|
|
106
|
+
/**
|
|
107
|
+
* Feed a circuit-breaker transition. Fires a SESSION_BLOCKER record when
|
|
108
|
+
* the threshold is crossed for the first time. Subsequent opens increment
|
|
109
|
+
* the counter but do NOT re-fire until recovery resets.
|
|
110
|
+
*/
|
|
111
|
+
recordCircuitTransition(event: CircuitTransitionEvent): void;
|
|
112
|
+
/**
|
|
113
|
+
* Feed a respawn event from the supervisor. A successful respawn is NOT
|
|
114
|
+
* the same as circuit recovery — the circuit closes only after a
|
|
115
|
+
* successful probe tool call, not just after reconnect. We intentionally
|
|
116
|
+
* do nothing here so the respawn path does not mask a live outage.
|
|
117
|
+
* Exposed as a method so the wiring site is obvious at the call graph.
|
|
118
|
+
*/
|
|
119
|
+
recordRespawn(_server: string): void;
|
|
120
|
+
/**
|
|
121
|
+
* Snapshot for observability / status — the `rea status` JSON output
|
|
122
|
+
* surfaces per-server transition counts so operators can see "this one
|
|
123
|
+
* has failed twice but hasn't crossed threshold yet".
|
|
124
|
+
*/
|
|
125
|
+
snapshot(): Array<{
|
|
126
|
+
server: string;
|
|
127
|
+
open_transitions: number;
|
|
128
|
+
emitted: boolean;
|
|
129
|
+
}>;
|
|
130
|
+
private getOrCreate;
|
|
131
|
+
private fire;
|
|
132
|
+
}
|
|
@@ -0,0 +1,163 @@
|
|
|
1
|
+
/**
|
|
2
|
+
* SESSION_BLOCKER tracker (BUG-004, 0.9.0).
|
|
3
|
+
*
|
|
4
|
+
* When a downstream MCP server fails repeatedly in a single session the
|
|
5
|
+
* operator needs one LOUD signal — not a log stream full of identical
|
|
6
|
+
* circuit-open records. This module owns the per-(session_id, server_name)
|
|
7
|
+
* counter and emits exactly one `SESSION_BLOCKER` event once a threshold is
|
|
8
|
+
* crossed; the event is replayed neither on continued failure nor on a
|
|
9
|
+
* circuit-breaker flap. Recovery (downstream returns to healthy) resets the
|
|
10
|
+
* counter and re-arms the emission.
|
|
11
|
+
*
|
|
12
|
+
* ## Why this lives separately from CircuitBreaker
|
|
13
|
+
*
|
|
14
|
+
* The circuit breaker tracks CONSECUTIVE CALL-LEVEL failures per server —
|
|
15
|
+
* it is wire-hot and opens/closes many times across a long session. The
|
|
16
|
+
* session blocker tracks OPEN-LEVEL failures per session: every
|
|
17
|
+
* circuit-open transition counts as ONE. A downstream that flaps
|
|
18
|
+
* open→half-open→open three times in ten minutes is already a blocker from an
|
|
19
|
+
* operator perspective — it should be surfaced once, not muted by the
|
|
20
|
+
* breaker's own internal recoveries.
|
|
21
|
+
*
|
|
22
|
+
* ## Emission semantics
|
|
23
|
+
*
|
|
24
|
+
* - Increment on every circuit transition to `open`.
|
|
25
|
+
* - When the counter for (session, server) crosses `threshold`, fire
|
|
26
|
+
* ONE `SESSION_BLOCKER` record (structured log + audit append). The
|
|
27
|
+
* counter keeps incrementing — subsequent opens do NOT re-fire.
|
|
28
|
+
* - On circuit recovery (transition to `closed`) the counter resets and
|
|
29
|
+
* the "already emitted" flag clears; the next threshold crossing will
|
|
30
|
+
* fire a fresh record.
|
|
31
|
+
* - On session change (new session_id) every counter is dropped — a new
|
|
32
|
+
* `rea serve` instance starts fresh.
|
|
33
|
+
*
|
|
34
|
+
* ## Why audit
|
|
35
|
+
*
|
|
36
|
+
* The hash-chained audit log is the single place an operator can look for a
|
|
37
|
+
* forensic record of persistent downstream outages. A `SESSION_BLOCKER`
|
|
38
|
+
* record in the audit trail pinpoints the session + downstream that went
|
|
39
|
+
* dark, independent of whichever log sink the operator had configured.
|
|
40
|
+
*
|
|
41
|
+
* Audit appends are best-effort; a failure to write never breaks the
|
|
42
|
+
* gateway. The log-side emission happens first and unconditionally.
|
|
43
|
+
*/
|
|
44
|
+
/**
|
|
45
|
+
* Per-(session_id, server_name) SESSION_BLOCKER tracker.
|
|
46
|
+
*
|
|
47
|
+
* Stateful and single-instance per gateway process. The circuit breaker's
|
|
48
|
+
* `onStateChange` listener plus the pool's respawn events feed it; the
|
|
49
|
+
* tracker decides whether to emit.
|
|
50
|
+
*/
|
|
51
|
+
export class SessionBlockerTracker {
    /** Number of open-transitions required to fire; always a number >= 1, never NaN. */
    threshold;
    /** Optional structured logger; only its `error(record)` method is used here. */
    logger;
    /** Optional callback invoked (asynchronously) with each fired SESSION_BLOCKER event. */
    auditSink;
    /** Session id stamped on every emitted event. */
    sessionId;
    /** server name -> { openTransitions, alreadyEmitted } for the current session. */
    entries = new Map();
    /**
     * @param sessionId  Session id stamped on emitted events.
     * @param options    Optional `{ threshold }`. The threshold is clamped to a
     *                   minimum of 1 and defaults to 3 when absent or when the
     *                   supplied value resolves to NaN.
     * @param logger     Optional logger; `logger.error(record)` receives the LOUD record.
     * @param auditSink  Optional callback receiving the fired event (may return a Promise).
     */
    constructor(sessionId, options = {}, logger, auditSink) {
        const defaultThreshold = 3;
        // `options?.` tolerates an explicit `null` (the default parameter only
        // covers `undefined`).
        const resolved = Math.max(1, options?.threshold ?? defaultThreshold);
        // Math.max(1, NaN) is NaN, and `count >= NaN` is never true — without
        // this guard a NaN threshold would silently disable the blocker forever.
        this.threshold = Number.isNaN(resolved) ? defaultThreshold : resolved;
        this.logger = logger;
        this.auditSink = auditSink;
        this.sessionId = sessionId;
    }
    /**
     * Replace the tracked session id and drop every per-server counter.
     *
     * @param sessionId  The new session id to stamp on subsequent events.
     */
    resetForSession(sessionId) {
        this.sessionId = sessionId;
        this.entries.clear();
    }
    /**
     * Feed a circuit-breaker transition.
     *
     * - `to === 'closed'`: recovery — the server's counter and "already
     *   emitted" flag are reset so a later threshold crossing fires again.
     * - `to === 'open'`: the counter is incremented; the first time it reaches
     *   the threshold a single SESSION_BLOCKER is fired. Later opens keep
     *   counting but do not re-fire until a recovery resets the entry.
     * - any other target state (e.g. `half-open`) is ignored, although the
     *   server still gets a zeroed entry that shows up in `snapshot()`.
     *
     * @param event  `{ server, from, to }`; only `server` and `to` are read.
     */
    recordCircuitTransition(event) {
        const entry = this.getOrCreate(event.server);
        if (event.to === 'closed') {
            entry.openTransitions = 0;
            entry.alreadyEmitted = false;
            return;
        }
        if (event.to !== 'open') {
            return;
        }
        entry.openTransitions += 1;
        if (!entry.alreadyEmitted && entry.openTransitions >= this.threshold) {
            // Set the flag before firing so the emission stays one-shot even
            // if the logger throws out of `fire`.
            entry.alreadyEmitted = true;
            this.fire(event.server, entry.openTransitions);
        }
    }
    /**
     * Feed a respawn event from the supervisor. Deliberately records nothing:
     * a respawn is not treated as circuit recovery, so blocker state is left
     * untouched. The method exists so the wiring site is visible in the call
     * graph.
     *
     * @param _server  Name of the respawned downstream (unused).
     */
    recordRespawn(_server) {
        // Intentional no-op. See JSDoc.
    }
    /**
     * Point-in-time copy of the tracked state for observability.
     *
     * @returns An array of `{ server, open_transitions, emitted }`, one element
     *          per tracked server, in first-seen order.
     */
    snapshot() {
        const out = [];
        for (const [server, state] of this.entries) {
            out.push({
                server,
                open_transitions: state.openTransitions,
                emitted: state.alreadyEmitted,
            });
        }
        return out;
    }
    /**
     * Return the mutable state entry for `server`, creating a zeroed one on
     * first sight.
     */
    getOrCreate(server) {
        let entry = this.entries.get(server);
        if (entry === undefined) {
            entry = { openTransitions: 0, alreadyEmitted: false };
            this.entries.set(server, entry);
        }
        return entry;
    }
    /**
     * Emit one SESSION_BLOCKER: first a structured error-level log record,
     * then (if a sink is configured) a fire-and-forget call to the audit sink.
     *
     * @param server  Downstream server name.
     * @param count   Open-transition count at the moment of emission.
     */
    fire(server, count) {
        const event = {
            event: 'SESSION_BLOCKER',
            session_id: this.sessionId,
            server_name: server,
            open_transitions: count,
            threshold: this.threshold,
            emitted_at: new Date().toISOString(),
            message: `downstream "${server}" has opened the circuit ${count} time(s) in this session ` +
                `(threshold ${this.threshold}). This is a SESSION_BLOCKER — the gateway will keep ` +
                `routing around it, but operator attention is required to restore capacity.`,
        };
        // Log first and unconditionally; the audit record below is the
        // forensic companion.
        this.logger?.error({
            event: 'session_blocker',
            server_name: server,
            message: event.message,
            session_id: this.sessionId,
            open_transitions: count,
            threshold: this.threshold,
        });
        // Treat anything that is not callable (undefined, null, ...) as "no sink".
        if (typeof this.auditSink !== 'function') {
            return;
        }
        // Fire-and-forget: the sink runs on a later microtask so a slow or
        // throwing sink cannot block or break the circuit-transition path.
        void Promise.resolve()
            .then(() => this.auditSink(event))
            .catch((err) => {
                this.reportSinkFailure(server, err);
            });
    }
    /**
     * Surface an audit-sink failure on the logger instead of discarding it.
     * Never throws: a throw here would become an unhandled promise rejection.
     *
     * @param server  Downstream server name the failed record was about.
     * @param err     Whatever the sink threw or rejected with.
     */
    reportSinkFailure(server, err) {
        try {
            this.logger?.error({
                event: 'session_blocker.audit_sink_failed',
                server_name: server,
                session_id: this.sessionId,
                message: 'SESSION_BLOCKER audit sink failed — log remains the sole record',
                error: err instanceof Error ? err.message : String(err),
            });
        }
        catch {
            // The logger itself failed; there is no further channel to report
            // on, and state tracking must not be affected.
        }
    }
}
|
package/package.json
CHANGED
|
@@ -1,6 +1,6 @@
|
|
|
1
1
|
{
|
|
2
2
|
"name": "@bookedsolid/rea",
|
|
3
|
-
"version": "0.8.0",
|
|
3
|
+
"version": "0.9.1",
|
|
4
4
|
"description": "Agentic governance layer for Claude Code — policy enforcement, hook-based safety gates, audit logging, and Codex-integrated adversarial review for AI-assisted projects",
|
|
5
5
|
"license": "MIT",
|
|
6
6
|
"author": "Booked Solid Technology <oss@bookedsolid.tech> (https://bookedsolid.tech)",
|