@bookedsolid/rea 0.8.0 → 0.9.1

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -25,7 +25,14 @@ export interface CircuitStatus {
25
25
  serverName: string;
26
26
  retryAt?: string;
27
27
  }
28
- interface CircuitEntry {
28
+ /**
29
+ * Internal per-server circuit state. Exported so observability consumers
30
+ * (live-state publisher, tests) can read `openedAt` and `cooldownMs` to
31
+ * compute a `retry_at` timestamp without duplicating the arithmetic.
32
+ * Treat fields as read-only from outside the breaker — mutating them
33
+ * breaks the invariants `recordSuccess` / `recordFailure` enforce.
34
+ */
35
+ export interface CircuitEntry {
29
36
  state: CircuitState;
30
37
  consecutiveFailures: number;
31
38
  openedAt: number | null;
@@ -57,4 +64,3 @@ export declare class CircuitBreaker {
57
64
  recordFailure(serverName: string): void;
58
65
  getCircuit(serverName: string): CircuitEntry | undefined;
59
66
  }
60
- export {};
@@ -5,7 +5,7 @@
5
5
  * The gateway splits on the FIRST `__` — downstream tools that themselves
6
6
  * contain `__` in their name continue to work because the split is one-shot.
7
7
  */
8
- import { DownstreamConnection, type DownstreamToolInfo } from './downstream.js';
8
+ import { DownstreamConnection, type DownstreamSupervisorEvent, type DownstreamToolInfo } from './downstream.js';
9
9
  import type { Registry } from '../registry/types.js';
10
10
  import type { Logger } from './log.js';
11
11
  export interface PrefixedTool extends DownstreamToolInfo {
@@ -44,7 +44,19 @@ export declare class DownstreamPool {
44
44
  * or is skipped. Stale but truthful > absent.
45
45
  */
46
46
  private readonly lastToolsCount;
47
+ /**
48
+ * Optional supervisor event listener wired by the gateway. The pool
49
+ * re-emits per-connection events through this single sink so the
50
+ * SESSION_BLOCKER tracker + state publisher only need to subscribe once.
51
+ */
52
+ private supervisorListener;
47
53
  constructor(registry: Registry, logger?: Logger);
54
+ /**
55
+ * Register a supervisor-event sink. Replaces any previously registered
56
+ * listener. Intended for the gateway to wire the SESSION_BLOCKER tracker
57
+ * and live state publisher. Pass `null` to detach the current listener.
58
+ */
59
+ onSupervisorEvent(listener: ((event: DownstreamSupervisorEvent) => void) | null): void;
48
60
  get size(): number;
49
61
  connectAll(): Promise<void>;
50
62
  /**
@@ -5,7 +5,7 @@
5
5
  * The gateway splits on the FIRST `__` — downstream tools that themselves
6
6
  * contain `__` in their name continue to work because the split is one-shot.
7
7
  */
8
- import { DownstreamConnection } from './downstream.js';
8
+ import { DownstreamConnection, } from './downstream.js';
9
9
  export class DownstreamPool {
10
10
  connections = new Map();
11
11
  /**
@@ -15,13 +15,31 @@ export class DownstreamPool {
15
15
  * or is skipped. Stale but truthful > absent.
16
16
  */
17
17
  lastToolsCount = new Map();
18
+ /**
19
+ * Optional supervisor event listener wired by the gateway. The pool
20
+ * re-emits per-connection events through this single sink so the
21
+ * SESSION_BLOCKER tracker + state publisher only need to subscribe once.
22
+ */
23
+ supervisorListener = null;
18
24
  constructor(registry, logger) {
19
25
  for (const server of registry.servers) {
20
26
  if (!server.enabled)
21
27
  continue;
22
- this.connections.set(server.name, new DownstreamConnection(server, logger));
28
+ const conn = new DownstreamConnection(server, logger);
29
+ conn.onSupervisorEvent((event) => {
30
+ this.supervisorListener?.(event);
31
+ });
32
+ this.connections.set(server.name, conn);
23
33
  }
24
34
  }
35
+ /**
36
+ * Register a supervisor-event sink. Replaces any previously registered
37
+ * listener. Intended for the gateway to wire the SESSION_BLOCKER tracker
38
+ * and live state publisher. Pass `null` to detach the current listener.
39
+ */
40
+ onSupervisorEvent(listener) {
41
+ this.supervisorListener = listener;
42
+ }
25
43
  get size() {
26
44
  return this.connections.size;
27
45
  }
@@ -52,7 +70,17 @@ export class DownstreamPool {
52
70
  continue;
53
71
  try {
54
72
  const tools = await conn.listTools();
73
+ // Codex 0.9.0 pass-2 P2a: emit a `health_changed` supervisor event
74
+ // whenever the cached tools count actually changes. Without this,
75
+ // a successful listTools would update the value in memory but the
76
+ // live-state publisher would never flush the change — `rea status`
77
+ // would keep reporting a stale `tools_count` until some unrelated
78
+ // circuit/respawn event flushed a snapshot.
79
+ const prev = this.lastToolsCount.get(server);
55
80
  this.lastToolsCount.set(server, tools.length);
81
+ if (prev !== tools.length) {
82
+ this.supervisorListener?.({ kind: 'health_changed', server });
83
+ }
56
84
  for (const t of tools) {
57
85
  const prefixed = {
58
86
  ...t,
@@ -30,6 +30,26 @@
30
30
  * successful reconnect — in that case we mark the connection unhealthy and
31
31
  * let the circuit breaker take over.
32
32
  *
33
+ * ## Supervisor / child-death detection (0.9.0, BUG-002..003)
34
+ *
35
+ * The SDK `StdioClientTransport` exposes `onclose` + `onerror` callbacks that
36
+ * fire when the child process exits or the stdio pipe errors outside a
37
+ * caller-initiated `close()`. We wire both and treat an unexpected close as
38
+ * "child is dead" — the next `callTool` must force a fresh connect rather
39
+ * than calling into a stale `Client` that will reply `Not connected`.
40
+ *
41
+ * Before 0.9.0 the supervisor was reactive only: a dead child was not noticed
42
+ * until the NEXT tool call tried to use it, at which point the circuit could
43
+ * flap open → half-open → open with the child still dead because the
44
+ * half-open probe re-used the zombie client. 0.9.0 makes death detection
45
+ * eager: `onclose` nulls `this.client` so the very next call takes the
46
+ * `connect()` branch and actually respawns the child.
47
+ *
48
+ * "Not connected" error messages from the SDK (our in-flight fallback) are
49
+ * now also treated as fatal for the current client — we null it before the
50
+ * one-shot reconnect path so we spawn fresh rather than retrying with the
51
+ * same dead handle.
52
+ *
33
53
  * ## Why not request-level retries
34
54
  *
35
55
  * MCP tool calls are not idempotent by default. Retrying `send_message` after
@@ -74,6 +94,42 @@ export interface BuiltChildEnv {
74
94
  secretKeys: string[];
75
95
  }
76
96
  export declare function buildChildEnv(config: RegistryServer, hostEnv?: NodeJS.ProcessEnv): BuiltChildEnv;
97
+ /**
98
+ * Event emitted by {@link DownstreamConnection} when the supervisor observes
99
+ * a lifecycle transition worth surfacing. Consumers (the pool, the
100
+ * SESSION_BLOCKER tracker, observability sinks) subscribe via
101
+ * {@link DownstreamConnection.onSupervisorEvent}.
102
+ *
103
+ * The `kind` is a narrow closed set so sinks can switch exhaustively. `reason`
104
+ * carries the operator-readable detail; it is already bounded by
105
+ * `boundedDiagnosticString` at the call site.
106
+ */
107
+ export type DownstreamSupervisorEvent = {
108
+ kind: 'child_died_unexpectedly';
109
+ server: string;
110
+ reason: string;
111
+ } | {
112
+ kind: 'respawned';
113
+ server: string;
114
+ } | {
115
+ /**
116
+ * A non-transition health change. Fires whenever a visible field in
117
+ * {@link DownstreamHealth} (health, last_error, tools_count) mutates
118
+ * WITHOUT being accompanied by a breaker transition or respawn event.
119
+ *
120
+ * Codex 0.9.0 pass-2 P2a: without this event, the first failed call/
121
+ * reconnect below the breaker threshold (or a successful `listTools`
122
+ * that updates the cached tool count) never reaches the live state
123
+ * publisher, so `rea status` would show stale data until some later,
124
+ * unrelated circuit or respawn event finally flushed a snapshot.
125
+ *
126
+ * Firing is best-effort from the connection class; the pool additionally
127
+ * emits this kind after `listAllTools` updates `lastToolsCount` so a
128
+ * tool-catalog change is always visible in the next debounced snapshot.
129
+ */
130
+ kind: 'health_changed';
131
+ server: string;
132
+ };
77
133
  export declare class DownstreamConnection {
78
134
  #private;
79
135
  private readonly config;
@@ -84,6 +140,29 @@ export declare class DownstreamConnection {
84
140
  */
85
141
  private readonly logger?;
86
142
  private client;
143
+ /**
144
+ * Handle to the currently active transport, so our `onclose`/`onerror`
145
+ * hooks can tell "this is the transport we care about" vs "a stale callback
146
+ * firing after we already swapped to a new transport". Cleared in `close()`
147
+ * BEFORE we invoke `client.close()` so our own tear-down does not race the
148
+ * supervisor path.
149
+ */
150
+ private activeTransport;
151
+ /**
152
+ * Set of transports currently being torn down by an in-flight `close()`.
153
+ * `onclose` / `onerror` callbacks that fire for a transport in this set
154
+ * must NOT be promoted to an "unexpected child death" — they are our own
155
+ * tear-down signal.
156
+ *
157
+ * Codex P2 (0.9.0 review): the earlier `closingIntentionally` boolean was
158
+ * connection-wide. Under concurrent calls, one call's `await this.close()`
159
+ * could overlap with another call's reconnect that had already installed
160
+ * a NEW transport. A genuine `onclose` from the new transport would hit
161
+ * the boolean guard and be silently ignored, reintroducing the stale-
162
+ * handle bug the patch targeted. Per-transport scoping eliminates the
163
+ * race: only the exact transport we asked to close is silenced.
164
+ */
165
+ private readonly closingTransports;
87
166
  /**
88
167
  * Whether a reconnect has already been attempted in the CURRENT failure
89
168
  * episode. Resets to `false` after a reconnect succeeds (so a later,
@@ -93,7 +172,30 @@ export declare class DownstreamConnection {
93
172
  private reconnectAttempted;
94
173
  /** Epoch ms of the last successful reconnect. Used by the flapping guard. */
95
174
  private lastReconnectAt;
175
+ /**
176
+ * Epoch ms of the most recent unexpected child-death event. Stamped by
177
+ * `handleUnexpectedClose()`. 0 means "never died unexpectedly".
178
+ *
179
+ * Codex 0.9.0 pass-5 P2b: when `handleUnexpectedClose` nulls `this.client`,
180
+ * the very next `callTool` takes the top-level `client === null` branch,
181
+ * which normally bypasses the flap-window check entirely (that check lives
182
+ * in the catch branch below, conditioned on `lastReconnectAt`). A downstream
183
+ * that crashes immediately after every spawn would therefore be respawned
184
+ * unconditionally on every incoming call — exactly the loop the flap
185
+ * window is supposed to suppress. Consulting this timestamp in the
186
+ * `client === null` branch lets us refuse the respawn when the previous
187
+ * death is within the flap window, and the caller gets a clear error
188
+ * instead of watching the child die again.
189
+ */
190
+ private unexpectedDeathAt;
96
191
  private health;
192
+ /**
193
+ * Optional supervisor-event listener. Set via
194
+ * {@link onSupervisorEvent}. A single subscriber is sufficient — the pool
195
+ * is the one consumer. Listener failures are swallowed; a broken consumer
196
+ * must never break the connection lifecycle.
197
+ */
198
+ private supervisorListener;
97
199
  constructor(config: RegistryServer,
98
200
  /**
99
201
  * Optional structured logger (G5). When omitted, connection lifecycle
@@ -105,6 +207,61 @@ export declare class DownstreamConnection {
105
207
  get isHealthy(): boolean;
106
208
  /** True iff the underlying MCP client is currently connected. */
107
209
  get isConnected(): boolean;
210
+ /**
211
+ * Register a supervisor-event listener. Intended for the pool to wire up
212
+ * SESSION_BLOCKER tracking + observability hooks without the connection
213
+ * class having to know about either. Only one listener is supported — a
214
+ * second call replaces the first. Pass `null` to detach.
215
+ */
216
+ onSupervisorEvent(listener: ((event: DownstreamSupervisorEvent) => void) | null): void;
217
+ /**
218
+ * Invoke the supervisor listener if registered. Swallows listener errors —
219
+ * a broken observer must never break the connection state machine.
220
+ */
221
+ private emitSupervisorEvent;
222
+ /**
223
+ * Emit a `health_changed` event. Called from every site that mutates a
224
+ * health/last_error/tools_count-visible field WITHOUT firing one of the
225
+ * louder supervisor events (`child_died_unexpectedly` / `respawned`).
226
+ * Addresses Codex 0.9.0 pass-2 P2a — live-state was only scheduled from
227
+ * breaker transitions and respawns, so transient errors below the breaker
228
+ * threshold would leave `rea status` showing stale data.
229
+ */
230
+ private emitHealthChanged;
231
+ /**
232
+ * Handle an unexpected transport close. Fires when the child process exits
233
+ * outside a caller-initiated `close()`, or when the stdio pipe errors in a
234
+ * way the SDK surfaces as a close event.
235
+ *
236
+ * Contract:
237
+ * - Only runs for the currently-active transport (stale callbacks from
238
+ * an already-swapped transport are ignored).
239
+ * - Does NOT run when WE initiated the close (the transport is a member
240
+ * of `closingTransports` for the duration of our own `close()` call).
241
+ * - Nulls `this.client` so the next `callTool` takes the `connect()`
242
+ * branch and actually respawns the child.
243
+ * - Marks the connection unhealthy so the pool knows not to route
244
+ * traffic to it while we wait for the next call.
245
+ * - Emits a `child_died_unexpectedly` supervisor event so the pool's
246
+ * SESSION_BLOCKER tracker can count this even though no callTool has
247
+ * failed yet (the child may die mid-idle).
248
+ */
249
+ private handleUnexpectedClose;
250
+ /**
251
+ * Handle a transport-layer protocol error. onerror does NOT always imply
252
+ * close — the SDK emits it for protocol errors too. We record the error
253
+ * text but leave connection invalidation to the eventual onclose callback,
254
+ * which is guaranteed to follow a fatal transport error on stdio.
255
+ *
256
+ * Codex 0.9.0 pass-6 P2: filter stale/intentional-close callbacks the
257
+ * same way `handleUnexpectedClose` does. Without this, a delayed
258
+ * onerror from a PREVIOUSLY-ACTIVE transport (one we've already torn
259
+ * down or replaced) can clobber the HEALTHY replacement connection's
260
+ * last_error and emit a spurious health_changed, leaving `rea status`
261
+ * showing a stale error on a perfectly live child. The `onclose`
262
+ * hook already enforced this filter; the `onerror` hook did not.
263
+ */
264
+ private handleTransportError;
108
265
  /**
109
266
  * Last error observed, or null if the connection has never failed (or fully
110
267
  * recovered).