@bookedsolid/rea 0.8.0 → 0.9.1
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/README.md +268 -51
- package/SECURITY.md +24 -7
- package/THREAT_MODEL.md +196 -18
- package/dist/cli/serve.d.ts +8 -0
- package/dist/cli/serve.js +32 -6
- package/dist/cli/status.d.ts +40 -1
- package/dist/cli/status.js +101 -2
- package/dist/gateway/circuit-breaker.d.ts +8 -2
- package/dist/gateway/downstream-pool.d.ts +13 -1
- package/dist/gateway/downstream-pool.js +30 -2
- package/dist/gateway/downstream.d.ts +157 -0
- package/dist/gateway/downstream.js +307 -5
- package/dist/gateway/live-state.d.ts +252 -0
- package/dist/gateway/live-state.js +504 -0
- package/dist/gateway/server.d.ts +44 -1
- package/dist/gateway/server.js +101 -1
- package/dist/gateway/session-blocker.d.ts +132 -0
- package/dist/gateway/session-blocker.js +163 -0
- package/package.json +1 -1
|
@@ -25,7 +25,14 @@ export interface CircuitStatus {
|
|
|
25
25
|
serverName: string;
|
|
26
26
|
retryAt?: string;
|
|
27
27
|
}
|
|
28
|
-
|
|
28
|
+
/**
|
|
29
|
+
* Internal per-server circuit state. Exported so observability consumers
|
|
30
|
+
* (live-state publisher, tests) can read `openedAt` and `cooldownMs` to
|
|
31
|
+
* compute a `retry_at` timestamp without duplicating the arithmetic.
|
|
32
|
+
* Treat fields as read-only from outside the breaker — mutating them
|
|
33
|
+
* breaks the invariants `recordSuccess` / `recordFailure` enforce.
|
|
34
|
+
*/
|
|
35
|
+
export interface CircuitEntry {
|
|
29
36
|
state: CircuitState;
|
|
30
37
|
consecutiveFailures: number;
|
|
31
38
|
openedAt: number | null;
|
|
@@ -57,4 +64,3 @@ export declare class CircuitBreaker {
|
|
|
57
64
|
recordFailure(serverName: string): void;
|
|
58
65
|
getCircuit(serverName: string): CircuitEntry | undefined;
|
|
59
66
|
}
|
|
60
|
-
export {};
|
|
@@ -5,7 +5,7 @@
|
|
|
5
5
|
* The gateway splits on the FIRST `__` — downstream tools that themselves
|
|
6
6
|
* contain `__` in their name continue to work because the split is one-shot.
|
|
7
7
|
*/
|
|
8
|
-
import { DownstreamConnection, type DownstreamToolInfo } from './downstream.js';
|
|
8
|
+
import { DownstreamConnection, type DownstreamSupervisorEvent, type DownstreamToolInfo } from './downstream.js';
|
|
9
9
|
import type { Registry } from '../registry/types.js';
|
|
10
10
|
import type { Logger } from './log.js';
|
|
11
11
|
export interface PrefixedTool extends DownstreamToolInfo {
|
|
@@ -44,7 +44,19 @@ export declare class DownstreamPool {
|
|
|
44
44
|
* or is skipped. Stale but truthful > absent.
|
|
45
45
|
*/
|
|
46
46
|
private readonly lastToolsCount;
|
|
47
|
+
/**
|
|
48
|
+
* Optional supervisor event listener wired by the gateway. The pool
|
|
49
|
+
* re-emits per-connection events through this single sink so the
|
|
50
|
+
* SESSION_BLOCKER tracker + state publisher only need to subscribe once.
|
|
51
|
+
*/
|
|
52
|
+
private supervisorListener;
|
|
47
53
|
constructor(registry: Registry, logger?: Logger);
|
|
54
|
+
/**
|
|
55
|
+
* Register a supervisor-event sink. Replaces any previously registered
|
|
56
|
+
* listener. Intended for the gateway to wire the SESSION_BLOCKER tracker
|
|
57
|
+
* and live state publisher.
|
|
58
|
+
*/
|
|
59
|
+
onSupervisorEvent(listener: ((event: DownstreamSupervisorEvent) => void) | null): void;
|
|
48
60
|
get size(): number;
|
|
49
61
|
connectAll(): Promise<void>;
|
|
50
62
|
/**
|
|
@@ -5,7 +5,7 @@
|
|
|
5
5
|
* The gateway splits on the FIRST `__` — downstream tools that themselves
|
|
6
6
|
* contain `__` in their name continue to work because the split is one-shot.
|
|
7
7
|
*/
|
|
8
|
-
import { DownstreamConnection } from './downstream.js';
|
|
8
|
+
import { DownstreamConnection, } from './downstream.js';
|
|
9
9
|
export class DownstreamPool {
|
|
10
10
|
connections = new Map();
|
|
11
11
|
/**
|
|
@@ -15,13 +15,31 @@ export class DownstreamPool {
|
|
|
15
15
|
* or is skipped. Stale but truthful > absent.
|
|
16
16
|
*/
|
|
17
17
|
lastToolsCount = new Map();
|
|
18
|
+
/**
|
|
19
|
+
* Optional supervisor event listener wired by the gateway. The pool
|
|
20
|
+
* re-emits per-connection events through this single sink so the
|
|
21
|
+
* SESSION_BLOCKER tracker + state publisher only need to subscribe once.
|
|
22
|
+
*/
|
|
23
|
+
supervisorListener = null;
|
|
18
24
|
constructor(registry, logger) {
|
|
19
25
|
for (const server of registry.servers) {
|
|
20
26
|
if (!server.enabled)
|
|
21
27
|
continue;
|
|
22
|
-
|
|
28
|
+
const conn = new DownstreamConnection(server, logger);
|
|
29
|
+
conn.onSupervisorEvent((event) => {
|
|
30
|
+
this.supervisorListener?.(event);
|
|
31
|
+
});
|
|
32
|
+
this.connections.set(server.name, conn);
|
|
23
33
|
}
|
|
24
34
|
}
|
|
35
|
+
/**
|
|
36
|
+
* Register a supervisor-event sink. Replaces any previously registered
|
|
37
|
+
* listener. Intended for the gateway to wire the SESSION_BLOCKER tracker
|
|
38
|
+
* and live state publisher.
|
|
39
|
+
*/
|
|
40
|
+
onSupervisorEvent(listener) {
|
|
41
|
+
this.supervisorListener = listener;
|
|
42
|
+
}
|
|
25
43
|
get size() {
|
|
26
44
|
return this.connections.size;
|
|
27
45
|
}
|
|
@@ -52,7 +70,17 @@ export class DownstreamPool {
|
|
|
52
70
|
continue;
|
|
53
71
|
try {
|
|
54
72
|
const tools = await conn.listTools();
|
|
73
|
+
// Codex 0.9.0 pass-2 P2a: emit a `health_changed` supervisor event
|
|
74
|
+
// whenever the cached tools count actually changes. Without this,
|
|
75
|
+
// a successful listTools would update the value in memory but the
|
|
76
|
+
// live-state publisher would never flush the change — `rea status`
|
|
77
|
+
// would keep reporting a stale `tools_count` until some unrelated
|
|
78
|
+
// circuit/respawn event flushed a snapshot.
|
|
79
|
+
const prev = this.lastToolsCount.get(server);
|
|
55
80
|
this.lastToolsCount.set(server, tools.length);
|
|
81
|
+
if (prev !== tools.length) {
|
|
82
|
+
this.supervisorListener?.({ kind: 'health_changed', server });
|
|
83
|
+
}
|
|
56
84
|
for (const t of tools) {
|
|
57
85
|
const prefixed = {
|
|
58
86
|
...t,
|
|
@@ -30,6 +30,26 @@
|
|
|
30
30
|
* successful reconnect — in that case we mark the connection unhealthy and
|
|
31
31
|
* let the circuit breaker take over.
|
|
32
32
|
*
|
|
33
|
+
* ## Supervisor / child-death detection (0.9.0, BUG-002..003)
|
|
34
|
+
*
|
|
35
|
+
* The SDK `StdioClientTransport` exposes `onclose` + `onerror` callbacks that
|
|
36
|
+
* fire when the child process exits or the stdio pipe errors outside a
|
|
37
|
+
* caller-initiated `close()`. We wire both and treat an unexpected close as
|
|
38
|
+
* "child is dead" — the next `callTool` must force a fresh connect rather
|
|
39
|
+
* than calling into a stale `Client` that will reply `Not connected`.
|
|
40
|
+
*
|
|
41
|
+
* Before 0.9.0 the supervisor was reactive only: a dead child was not noticed
|
|
42
|
+
* until the NEXT tool call tried to use it, at which point the circuit could
|
|
43
|
+
* flap open → half-open → open with the child still dead because the
|
|
44
|
+
* half-open probe re-used the zombie client. 0.9.0 makes death detection
|
|
45
|
+
* eager: `onclose` nulls `this.client` so the very next call takes the
|
|
46
|
+
* `connect()` branch and actually respawns the child.
|
|
47
|
+
*
|
|
48
|
+
* "Not connected" error messages from the SDK (our in-flight fallback) are
|
|
49
|
+
* now also treated as fatal for the current client — we null it before the
|
|
50
|
+
* one-shot reconnect path so we spawn fresh rather than retrying with the
|
|
51
|
+
* same dead handle.
|
|
52
|
+
*
|
|
33
53
|
* ## Why not request-level retries
|
|
34
54
|
*
|
|
35
55
|
* MCP tool calls are not idempotent by default. Retrying `send_message` after
|
|
@@ -74,6 +94,42 @@ export interface BuiltChildEnv {
|
|
|
74
94
|
secretKeys: string[];
|
|
75
95
|
}
|
|
76
96
|
export declare function buildChildEnv(config: RegistryServer, hostEnv?: NodeJS.ProcessEnv): BuiltChildEnv;
|
|
97
|
+
/**
|
|
98
|
+
* Event emitted by {@link DownstreamConnection} when the supervisor observes
|
|
99
|
+
* a lifecycle transition worth surfacing. Consumers (the pool, the
|
|
100
|
+
* SESSION_BLOCKER tracker, observability sinks) subscribe via
|
|
101
|
+
* {@link DownstreamConnection.onSupervisorEvent}.
|
|
102
|
+
*
|
|
103
|
+
* The `kind` is a narrow closed set so sinks can switch exhaustively. `reason`
|
|
104
|
+
* carries the operator-readable detail; it is already bounded by
|
|
105
|
+
* `boundedDiagnosticString` at the call site.
|
|
106
|
+
*/
|
|
107
|
+
export type DownstreamSupervisorEvent = {
|
|
108
|
+
kind: 'child_died_unexpectedly';
|
|
109
|
+
server: string;
|
|
110
|
+
reason: string;
|
|
111
|
+
} | {
|
|
112
|
+
kind: 'respawned';
|
|
113
|
+
server: string;
|
|
114
|
+
} | {
|
|
115
|
+
/**
|
|
116
|
+
* A non-transition health change. Fires whenever a visible field in
|
|
117
|
+
* {@link DownstreamHealth} (health, last_error, tools_count) mutates
|
|
118
|
+
* WITHOUT being accompanied by a breaker transition or respawn event.
|
|
119
|
+
*
|
|
120
|
+
* Codex 0.9.0 pass-2 P2a: without this event, the first failed call/
|
|
121
|
+
* reconnect below the breaker threshold (or a successful `listTools`
|
|
122
|
+
* that updates the cached tool count) never reaches the live state
|
|
123
|
+
* publisher, so `rea status` would show stale data until some later,
|
|
124
|
+
* unrelated circuit or respawn event finally flushed a snapshot.
|
|
125
|
+
*
|
|
126
|
+
* Firing is best-effort from the connection class; the pool additionally
|
|
127
|
+
* emits this kind after `listAllTools` updates `lastToolsCount` so a
|
|
128
|
+
* tool-catalog change is always visible in the next debounced snapshot.
|
|
129
|
+
*/
|
|
130
|
+
kind: 'health_changed';
|
|
131
|
+
server: string;
|
|
132
|
+
};
|
|
77
133
|
export declare class DownstreamConnection {
|
|
78
134
|
#private;
|
|
79
135
|
private readonly config;
|
|
@@ -84,6 +140,29 @@ export declare class DownstreamConnection {
|
|
|
84
140
|
*/
|
|
85
141
|
private readonly logger?;
|
|
86
142
|
private client;
|
|
143
|
+
/**
|
|
144
|
+
* Handle to the currently active transport, so our `onclose`/`onerror`
|
|
145
|
+
* hooks can tell "this is the transport we care about" vs "a stale callback
|
|
146
|
+
* firing after we already swapped to a new transport". Cleared in `close()`
|
|
147
|
+
* BEFORE we invoke `client.close()` so our own tear-down does not race the
|
|
148
|
+
* supervisor path.
|
|
149
|
+
*/
|
|
150
|
+
private activeTransport;
|
|
151
|
+
/**
|
|
152
|
+
* Set of transports currently being torn down by an in-flight `close()`.
|
|
153
|
+
* `onclose` / `onerror` callbacks that fire for a transport in this set
|
|
154
|
+
* must NOT be promoted to an "unexpected child death" — they are our own
|
|
155
|
+
* tear-down signal.
|
|
156
|
+
*
|
|
157
|
+
* Codex P2 (0.9.0 review): the earlier `closingIntentionally` boolean was
|
|
158
|
+
* connection-wide. Under concurrent calls, one call's `await this.close()`
|
|
159
|
+
* could overlap with another call's reconnect that had already installed
|
|
160
|
+
* a NEW transport. A genuine `onclose` from the new transport would hit
|
|
161
|
+
* the boolean guard and be silently ignored, reintroducing the stale-
|
|
162
|
+
* handle bug the patch targeted. Per-transport scoping eliminates the
|
|
163
|
+
* race: only the exact transport we asked to close is silenced.
|
|
164
|
+
*/
|
|
165
|
+
private readonly closingTransports;
|
|
87
166
|
/**
|
|
88
167
|
* Whether a reconnect has already been attempted in the CURRENT failure
|
|
89
168
|
* episode. Resets to `false` after a reconnect succeeds (so a later,
|
|
@@ -93,7 +172,30 @@ export declare class DownstreamConnection {
|
|
|
93
172
|
private reconnectAttempted;
|
|
94
173
|
/** Epoch ms of the last successful reconnect. Used by the flapping guard. */
|
|
95
174
|
private lastReconnectAt;
|
|
175
|
+
/**
|
|
176
|
+
* Epoch ms of the most recent unexpected child-death event. Stamped by
|
|
177
|
+
* `handleUnexpectedClose()`. 0 means "never died unexpectedly".
|
|
178
|
+
*
|
|
179
|
+
* Codex 0.9.0 pass-5 P2b: when `handleUnexpectedClose` nulls `this.client`,
|
|
180
|
+
* the very next `callTool` takes the top-level `client === null` branch,
|
|
181
|
+
* which normally bypasses the flap-window check entirely (that check lives
|
|
182
|
+
* in the catch branch below, conditioned on `lastReconnectAt`). A downstream
|
|
183
|
+
* that crashes immediately after every spawn would therefore be respawned
|
|
184
|
+
* unconditionally on every incoming call — exactly the loop the flap
|
|
185
|
+
* window is supposed to suppress. Consulting this timestamp in the
|
|
186
|
+
* `client === null` branch lets us refuse the respawn when the previous
|
|
187
|
+
* death is within the flap window, and the caller gets a clear error
|
|
188
|
+
* instead of watching the child die again.
|
|
189
|
+
*/
|
|
190
|
+
private unexpectedDeathAt;
|
|
96
191
|
private health;
|
|
192
|
+
/**
|
|
193
|
+
* Optional supervisor-event listener. Set via
|
|
194
|
+
* {@link onSupervisorEvent}. A single subscriber is sufficient — the pool
|
|
195
|
+
* is the one consumer. Listener failures are swallowed; a broken consumer
|
|
196
|
+
* must never break the connection lifecycle.
|
|
197
|
+
*/
|
|
198
|
+
private supervisorListener;
|
|
97
199
|
constructor(config: RegistryServer,
|
|
98
200
|
/**
|
|
99
201
|
* Optional structured logger (G5). When omitted, connection lifecycle
|
|
@@ -105,6 +207,61 @@ export declare class DownstreamConnection {
|
|
|
105
207
|
get isHealthy(): boolean;
|
|
106
208
|
/** True iff the underlying MCP client is currently connected. */
|
|
107
209
|
get isConnected(): boolean;
|
|
210
|
+
/**
|
|
211
|
+
* Register a supervisor-event listener. Intended for the pool to wire up
|
|
212
|
+
* SESSION_BLOCKER tracking + observability hooks without the connection
|
|
213
|
+
* class having to know about either. Only one listener is supported — a
|
|
214
|
+
* second call replaces the first. Pass `null` to detach.
|
|
215
|
+
*/
|
|
216
|
+
onSupervisorEvent(listener: ((event: DownstreamSupervisorEvent) => void) | null): void;
|
|
217
|
+
/**
|
|
218
|
+
* Invoke the supervisor listener if registered. Swallows listener errors —
|
|
219
|
+
* a broken observer must never break the connection state machine.
|
|
220
|
+
*/
|
|
221
|
+
private emitSupervisorEvent;
|
|
222
|
+
/**
|
|
223
|
+
* Emit a `health_changed` event. Called from every site that mutates a
|
|
224
|
+
* health/last_error/tools_count-visible field WITHOUT firing one of the
|
|
225
|
+
* louder supervisor events (`child_died_unexpectedly` / `respawned`).
|
|
226
|
+
* Addresses Codex 0.9.0 pass-2 P2a — live-state was only scheduled from
|
|
227
|
+
* breaker transitions and respawns, so transient errors below the breaker
|
|
228
|
+
* threshold would leave `rea status` showing stale data.
|
|
229
|
+
*/
|
|
230
|
+
private emitHealthChanged;
|
|
231
|
+
/**
|
|
232
|
+
* Handle an unexpected transport close. Fires when the child process exits
|
|
233
|
+
* outside a caller-initiated `close()`, or when the stdio pipe errors in a
|
|
234
|
+
* way the SDK surfaces as a close event.
|
|
235
|
+
*
|
|
236
|
+
* Contract:
|
|
237
|
+
* - Only runs for the currently-active transport (stale callbacks from
|
|
238
|
+
* an already-swapped transport are ignored).
|
|
239
|
+
* - Does NOT run when WE initiated the close (the transport is a member
|
|
240
|
+
* of `closingTransports` for the duration of our own `close()` call).
|
|
241
|
+
* - Nulls `this.client` so the next `callTool` takes the `connect()`
|
|
242
|
+
* branch and actually respawns the child.
|
|
243
|
+
* - Marks the connection unhealthy so the pool knows not to route
|
|
244
|
+
* traffic to it while we wait for the next call.
|
|
245
|
+
* - Emits a `child_died_unexpectedly` supervisor event so the pool's
|
|
246
|
+
* SESSION_BLOCKER tracker can count this even though no callTool has
|
|
247
|
+
* failed yet (the child may die mid-idle).
|
|
248
|
+
*/
|
|
249
|
+
private handleUnexpectedClose;
|
|
250
|
+
/**
|
|
251
|
+
* Handle a transport-layer protocol error. onerror does NOT always imply
|
|
252
|
+
* close — the SDK emits it for protocol errors too. We record the error
|
|
253
|
+
* text but leave connection invalidation to the eventual onclose callback,
|
|
254
|
+
* which is guaranteed to follow a fatal transport error on stdio.
|
|
255
|
+
*
|
|
256
|
+
* Codex 0.9.0 pass-6 P2: filter stale/intentional-close callbacks the
|
|
257
|
+
* same way `handleUnexpectedClose` does. Without this, a delayed
|
|
258
|
+
* onerror from a PREVIOUSLY-ACTIVE transport (one we've already torn
|
|
259
|
+
* down or replaced) can clobber the HEALTHY replacement connection's
|
|
260
|
+
* last_error and emit a spurious health_changed, leaving `rea status`
|
|
261
|
+
* showing a stale error on a perfectly live child. The `onclose`
|
|
262
|
+
* hook already enforced this filter; the `onerror` hook did not.
|
|
263
|
+
*/
|
|
264
|
+
private handleTransportError;
|
|
108
265
|
/**
|
|
109
266
|
* Last error observed, or null if the connection has never failed (or fully
|
|
110
267
|
* recovered).
|