@bookedsolid/rea 0.8.0 → 0.9.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -5,10 +5,18 @@
5
5
  * later `rea serve` that has raced in and rewritten the breadcrumbs
6
6
  * is never unexpectedly unlinked.
7
7
  */
8
+ /**
9
+ * Serve-state file shape. 0.9.0 added the `downstreams` block; older code
10
+ * that reads the state file treats a missing `downstreams` as "no live
11
+ * view available" and falls back to the pre-0.9 fields. `session_id` is
12
+ * the ownership key used by `cleanupStateIfOwned` during shutdown.
13
+ */
8
14
  interface ServeState {
9
15
  session_id: string;
10
16
  started_at: string;
11
17
  metrics_port: number | null;
18
+ /** 0.9.0 — populated after the gateway starts; absent on this initial write. */
19
+ downstreams?: unknown[];
12
20
  }
13
21
  /**
14
22
  * Atomic file write: stage to a per-pid temp name, then rename(2). The
package/dist/cli/serve.js CHANGED
@@ -249,12 +249,30 @@ export async function runServe() {
249
249
  console.error('');
250
250
  process.exit(1);
251
251
  }
252
+ // Metadata we'll also stamp into the state file below so `rea status`
253
+ // sees the session-id and start time alongside the new downstream block.
254
+ const startedAt = new Date().toISOString();
255
+ const statePath = reaPath(baseDir, SERVE_STATE_FILE);
252
256
  const handle = createGateway({
253
257
  baseDir,
254
258
  policy,
255
259
  registry: gatedRegistry,
256
260
  logger,
257
261
  metrics: metricsRegistry,
262
+ // 0.9.0 — let the gateway own live writes to serve.state.json so
263
+ // circuit-breaker transitions and supervisor events are reflected on
264
+ // disk for `rea status --json`. Legacy shape (session_id, started_at,
265
+ // metrics_port) is preserved for backward compatibility.
266
+ liveStateFilePath: statePath,
267
+ liveStateSessionId: sessionId,
268
+ liveStateStartedAt: startedAt,
269
+ liveStateMetricsPort: metricsServer?.port() ?? null,
270
+ // 0.9.0 pass-7 — reuse the gateway log redactor so downstream error
271
+ // strings are scrubbed for secret-shaped content BEFORE hitting
272
+ // serve.state.json or the operator's terminal via `rea status`.
273
+ // The redactor already incorporates SECRET_PATTERNS plus any
274
+ // operator-defined policy.redact.patterns loaded above.
275
+ liveStateLastErrorRedactor: logRedactor,
258
276
  });
259
277
  // ── HALT acknowledgement at startup (G5) ─────────────────────────────────
260
278
  const haltPath = reaPath(baseDir, HALT_FILE);
@@ -280,13 +298,21 @@ export async function runServe() {
280
298
  codexProbe.start();
281
299
  }
282
300
  // ── Pidfile + state (AFTER metrics boot so we persist the real port) ─────
283
- const startedAt = new Date().toISOString();
301
+ //
302
+ // 0.9.0: the gateway's LiveStatePublisher owns all writes to
303
+ // serve.state.json, including the boot-time snapshot. Earlier drafts
304
+ // used the legacy `writeStateFile()` here to cover the bootstrap window
305
+ // between now and `handle.start()`'s first flush, but that write
306
+ // bypassed the sidecar-lock protocol and reintroduced the TOCTOU race
307
+ // P2b was designed to close (Codex 0.9.0 pass-3 P1: an overlapping
308
+ // older `rea serve` could clobber this unprotected write and the
309
+ // newer instance would later clean up its own file during shutdown).
310
+ //
311
+ // Routing the boot write through `handle.livePublisher.flushNow()`
312
+ // means the boot snapshot is guarded by the same lock as every
313
+ // subsequent flush; overlapping gateways serialize cleanly.
284
314
  const pidPath = writePidfile(baseDir);
285
- const statePath = writeStateFile(baseDir, {
286
- session_id: sessionId,
287
- started_at: startedAt,
288
- metrics_port: metricsServer?.port() ?? null,
289
- });
315
+ handle.livePublisher?.flushNow();
290
316
  let shuttingDown = false;
291
317
  const shutdown = async (signal) => {
292
318
  // A second signal (e.g. SIGTERM then SIGINT) must NOT re-enter cleanup —
@@ -47,6 +47,24 @@ export declare function sanitizeForTerminal(value: string): string;
47
47
  export interface StatusOptions {
48
48
  json?: boolean | undefined;
49
49
  }
50
+ /**
51
+ * Per-downstream live state surfaced in both JSON and pretty outputs
52
+ * (0.9.0, BUG-005). Mirrors `LiveDownstreamState` in
53
+ * `src/gateway/live-state.ts`; duplicated here to keep the CLI surface
54
+ * independent of gateway internals (the CLI can be built without the
55
+ * gateway module in a trimmed install).
56
+ */
57
+ export interface LiveDownstreamSnapshot {
58
+ name: string;
59
+ connected: boolean;
60
+ healthy: boolean;
61
+ circuit_state: 'closed' | 'open' | 'half-open';
62
+ retry_at: string | null;
63
+ last_error: string | null;
64
+ tools_count: number | null;
65
+ open_transitions: number;
66
+ session_blocker_emitted: boolean;
67
+ }
50
68
  interface ServeLiveness {
51
69
  running: boolean;
52
70
  pid: number | null;
@@ -56,6 +74,13 @@ interface ServeLiveness {
56
74
  session_id: string | null;
57
75
  started_at: string | null;
58
76
  metrics_port: number | null;
77
+ /**
78
+ * 0.9.0 — per-downstream live block, or `null` when the state file was
79
+ * written by an older gateway version that did not include it. A
80
+ * zero-length array means "gateway is running with no downstreams
81
+ * configured", which is a distinct signal from "unknown".
82
+ */
83
+ downstreams: LiveDownstreamSnapshot[] | null;
59
84
  }
60
85
  interface AuditStats {
61
86
  present: boolean;
@@ -96,21 +96,64 @@ function readPidfile(baseDir) {
96
96
  return null;
97
97
  }
98
98
  }
99
+ /**
100
+ * Parse a single downstream entry from `serve.state.json`. Every field is
101
+ * validated — an unexpected type yields that field's fallback (null, false, 0, or 'closed') rather than
102
+ * poisoning the whole entry, because the state file is touched on a hot
103
+ * path and we would rather surface a half-useful snapshot than a
104
+ * "corrupt, try again" error to the operator.
105
+ *
106
+ * Returns `null` when the entry's `name` is missing or not a string, since
107
+ * a downstream with no name is unusable for display.
108
+ */
109
+ function parseDownstreamEntry(raw) {
110
+ if (typeof raw !== 'object' || raw === null)
111
+ return null;
112
+ const r = raw;
113
+ if (typeof r.name !== 'string' || r.name.length === 0)
114
+ return null;
115
+ const circuit = r.circuit_state === 'open' || r.circuit_state === 'half-open' || r.circuit_state === 'closed'
116
+ ? r.circuit_state
117
+ : 'closed';
118
+ return {
119
+ name: r.name,
120
+ connected: typeof r.connected === 'boolean' ? r.connected : false,
121
+ healthy: typeof r.healthy === 'boolean' ? r.healthy : false,
122
+ circuit_state: circuit,
123
+ retry_at: typeof r.retry_at === 'string' ? r.retry_at : null,
124
+ last_error: typeof r.last_error === 'string' ? r.last_error : null,
125
+ tools_count: typeof r.tools_count === 'number' && Number.isInteger(r.tools_count) ? r.tools_count : null,
126
+ open_transitions: typeof r.open_transitions === 'number' && Number.isInteger(r.open_transitions)
127
+ ? r.open_transitions
128
+ : 0,
129
+ session_blocker_emitted: typeof r.session_blocker_emitted === 'boolean' ? r.session_blocker_emitted : false,
130
+ };
131
+ }
99
132
  function readServeState(baseDir) {
100
133
  const p = reaPath(baseDir, SERVE_STATE_FILE);
101
134
  try {
102
135
  const raw = fs.readFileSync(p, 'utf8');
103
136
  const parsed = JSON.parse(raw);
137
+ let downstreams = null;
138
+ if (Array.isArray(parsed.downstreams)) {
139
+ downstreams = [];
140
+ for (const entry of parsed.downstreams) {
141
+ const ds = parseDownstreamEntry(entry);
142
+ if (ds !== null)
143
+ downstreams.push(ds);
144
+ }
145
+ }
104
146
  return {
105
147
  session_id: typeof parsed.session_id === 'string' ? parsed.session_id : null,
106
148
  started_at: typeof parsed.started_at === 'string' ? parsed.started_at : null,
107
149
  metrics_port: typeof parsed.metrics_port === 'number' && Number.isInteger(parsed.metrics_port)
108
150
  ? parsed.metrics_port
109
151
  : null,
152
+ downstreams,
110
153
  };
111
154
  }
112
155
  catch {
113
- return { session_id: null, started_at: null, metrics_port: null };
156
+ return { session_id: null, started_at: null, metrics_port: null, downstreams: null };
114
157
  }
115
158
  }
116
159
  function probeServe(baseDir) {
@@ -124,6 +167,7 @@ function probeServe(baseDir) {
124
167
  session_id: null,
125
168
  started_at: null,
126
169
  metrics_port: null,
170
+ downstreams: null,
127
171
  };
128
172
  }
129
173
  const alive = isProcessAlive(pid);
@@ -135,6 +179,7 @@ function probeServe(baseDir) {
135
179
  session_id: state.session_id,
136
180
  started_at: state.started_at,
137
181
  metrics_port: state.metrics_port,
182
+ downstreams: state.downstreams,
138
183
  };
139
184
  }
140
185
  /**
@@ -356,6 +401,46 @@ function printPretty(payload) {
356
401
  }
357
402
  }
358
403
  console.log('');
404
+ // 0.9.0 — per-downstream block. Only shown when the serve process is
405
+ // believed to be running. The table needs the new state-file array; an
406
+ // older gateway version that predates the publisher leaves `downstreams`
407
+ // null; we print an explanatory hint instead of rendering an empty
408
+ // table that looks like "zero downstreams".
409
+ if (s.running) {
410
+ console.log(' Downstreams');
411
+ if (s.downstreams === null) {
412
+ console.log(` (state file has no downstream block — upgrade gateway to ≥0.9.0)`);
413
+ }
414
+ else if (s.downstreams.length === 0) {
415
+ console.log(` (no downstream servers declared in .rea/registry.yaml)`);
416
+ }
417
+ else {
418
+ for (const d of s.downstreams) {
419
+ const name = sanitizeForTerminal(d.name);
420
+ const lastErr = safePretty(d.last_error);
421
+ const retryAt = safePretty(d.retry_at);
422
+ const healthToken = d.healthy ? (d.connected ? 'healthy' : 'connecting') : 'UNHEALTHY';
423
+ const circuit = d.circuit_state.toUpperCase();
424
+ console.log(` ${name}`);
425
+ console.log(` Health: ${healthToken}`);
426
+ console.log(` Circuit: ${circuit}`);
427
+ if (retryAt !== null && d.circuit_state === 'open') {
428
+ console.log(` Retry at: ${retryAt}`);
429
+ }
430
+ if (d.tools_count !== null) {
431
+ console.log(` Tools advertised: ${d.tools_count}`);
432
+ }
433
+ if (d.open_transitions > 0) {
434
+ const blockerSuffix = d.session_blocker_emitted ? ' (SESSION_BLOCKER fired)' : '';
435
+ console.log(` Open transitions: ${d.open_transitions}${blockerSuffix}`);
436
+ }
437
+ if (lastErr !== null) {
438
+ console.log(` Last error: ${lastErr}`);
439
+ }
440
+ }
441
+ }
442
+ console.log('');
443
+ }
359
444
  console.log(' Audit log');
360
445
  if (!a.present) {
361
446
  console.log(` State: not yet written`);
@@ -25,7 +25,14 @@ export interface CircuitStatus {
25
25
  serverName: string;
26
26
  retryAt?: string;
27
27
  }
28
- interface CircuitEntry {
28
+ /**
29
+ * Internal per-server circuit state. Exported so observability consumers
30
+ * (live-state publisher, tests) can read `openedAt` and `cooldownMs` to
31
+ * compute a `retry_at` timestamp without duplicating the arithmetic.
32
+ * Treat fields as read-only from outside the breaker — mutating them
33
+ * breaks the invariants `recordSuccess` / `recordFailure` enforce.
34
+ */
35
+ export interface CircuitEntry {
29
36
  state: CircuitState;
30
37
  consecutiveFailures: number;
31
38
  openedAt: number | null;
@@ -57,4 +64,3 @@ export declare class CircuitBreaker {
57
64
  recordFailure(serverName: string): void;
58
65
  getCircuit(serverName: string): CircuitEntry | undefined;
59
66
  }
60
- export {};
@@ -5,7 +5,7 @@
5
5
  * The gateway splits on the FIRST `__` — downstream tools that themselves
6
6
  * contain `__` in their name continue to work because the split is one-shot.
7
7
  */
8
- import { DownstreamConnection, type DownstreamToolInfo } from './downstream.js';
8
+ import { DownstreamConnection, type DownstreamSupervisorEvent, type DownstreamToolInfo } from './downstream.js';
9
9
  import type { Registry } from '../registry/types.js';
10
10
  import type { Logger } from './log.js';
11
11
  export interface PrefixedTool extends DownstreamToolInfo {
@@ -44,7 +44,19 @@ export declare class DownstreamPool {
44
44
  * or is skipped. Stale but truthful > absent.
45
45
  */
46
46
  private readonly lastToolsCount;
47
+ /**
48
+ * Optional supervisor event listener wired by the gateway. The pool
49
+ * re-emits per-connection events through this single sink so the
50
+ * SESSION_BLOCKER tracker + state publisher only need to subscribe once.
51
+ */
52
+ private supervisorListener;
47
53
  constructor(registry: Registry, logger?: Logger);
54
+ /**
55
+ * Register a supervisor-event sink. Replaces any previously registered
56
+ * listener. Intended for the gateway to wire the SESSION_BLOCKER tracker
57
+ * and live state publisher.
58
+ */
59
+ onSupervisorEvent(listener: ((event: DownstreamSupervisorEvent) => void) | null): void;
48
60
  get size(): number;
49
61
  connectAll(): Promise<void>;
50
62
  /**
@@ -5,7 +5,7 @@
5
5
  * The gateway splits on the FIRST `__` — downstream tools that themselves
6
6
  * contain `__` in their name continue to work because the split is one-shot.
7
7
  */
8
- import { DownstreamConnection } from './downstream.js';
8
+ import { DownstreamConnection, } from './downstream.js';
9
9
  export class DownstreamPool {
10
10
  connections = new Map();
11
11
  /**
@@ -15,13 +15,31 @@ export class DownstreamPool {
15
15
  * or is skipped. Stale but truthful > absent.
16
16
  */
17
17
  lastToolsCount = new Map();
18
+ /**
19
+ * Optional supervisor event listener wired by the gateway. The pool
20
+ * re-emits per-connection events through this single sink so the
21
+ * SESSION_BLOCKER tracker + state publisher only need to subscribe once.
22
+ */
23
+ supervisorListener = null;
18
24
  constructor(registry, logger) {
19
25
  for (const server of registry.servers) {
20
26
  if (!server.enabled)
21
27
  continue;
22
- this.connections.set(server.name, new DownstreamConnection(server, logger));
28
+ const conn = new DownstreamConnection(server, logger);
29
+ conn.onSupervisorEvent((event) => {
30
+ this.supervisorListener?.(event);
31
+ });
32
+ this.connections.set(server.name, conn);
23
33
  }
24
34
  }
35
+ /**
36
+ * Register a supervisor-event sink. Replaces any previously registered
37
+ * listener. Intended for the gateway to wire the SESSION_BLOCKER tracker
38
+ * and live state publisher.
39
+ */
40
+ onSupervisorEvent(listener) {
41
+ this.supervisorListener = listener;
42
+ }
25
43
  get size() {
26
44
  return this.connections.size;
27
45
  }
@@ -52,7 +70,17 @@ export class DownstreamPool {
52
70
  continue;
53
71
  try {
54
72
  const tools = await conn.listTools();
73
+ // Codex 0.9.0 pass-2 P2a: emit a `health_changed` supervisor event
74
+ // whenever the cached tools count actually changes. Without this,
75
+ // a successful listTools would update the value in memory but the
76
+ // live-state publisher would never flush the change — `rea status`
77
+ // would keep reporting a stale `tools_count` until some unrelated
78
+ // circuit/respawn event flushed a snapshot.
79
+ const prev = this.lastToolsCount.get(server);
55
80
  this.lastToolsCount.set(server, tools.length);
81
+ if (prev !== tools.length) {
82
+ this.supervisorListener?.({ kind: 'health_changed', server });
83
+ }
56
84
  for (const t of tools) {
57
85
  const prefixed = {
58
86
  ...t,
@@ -30,6 +30,26 @@
30
30
  * successful reconnect — in that case we mark the connection unhealthy and
31
31
  * let the circuit breaker take over.
32
32
  *
33
+ * ## Supervisor / child-death detection (0.9.0, BUG-002..003)
34
+ *
35
+ * The SDK `StdioClientTransport` exposes `onclose` + `onerror` callbacks that
36
+ * fire when the child process exits or the stdio pipe errors outside a
37
+ * caller-initiated `close()`. We wire both and treat an unexpected close as
38
+ * "child is dead" — the next `callTool` must force a fresh connect rather
39
+ * than calling into a stale `Client` that will reply `Not connected`.
40
+ *
41
+ * Before 0.9.0 the supervisor was reactive only: a dead child was not noticed
42
+ * until the NEXT tool call tried to use it, at which point the circuit could
43
+ * flap open → half-open → open with the child still dead because the
44
+ * half-open probe re-used the zombie client. 0.9.0 makes death detection
45
+ * eager: `onclose` nulls `this.client` so the very next call takes the
46
+ * `connect()` branch and actually respawns the child.
47
+ *
48
+ * "Not connected" error messages from the SDK (our in-flight fallback) are
49
+ * now also treated as fatal for the current client — we null it before the
50
+ * one-shot reconnect path so we spawn fresh rather than retrying with the
51
+ * same dead handle.
52
+ *
33
53
  * ## Why not request-level retries
34
54
  *
35
55
  * MCP tool calls are not idempotent by default. Retrying `send_message` after
@@ -74,6 +94,42 @@ export interface BuiltChildEnv {
74
94
  secretKeys: string[];
75
95
  }
76
96
  export declare function buildChildEnv(config: RegistryServer, hostEnv?: NodeJS.ProcessEnv): BuiltChildEnv;
97
+ /**
98
+ * Event emitted by {@link DownstreamConnection} when the supervisor observes
99
+ * a lifecycle transition worth surfacing. Consumers (the pool, the
100
+ * SESSION_BLOCKER tracker, observability sinks) subscribe via
101
+ * {@link DownstreamConnection.onSupervisorEvent}.
102
+ *
103
+ * The `kind` is a narrow closed set so sinks can switch exhaustively. `reason`
104
+ * carries the operator-readable detail; it is already bounded by
105
+ * `boundedDiagnosticString` at the call site.
106
+ */
107
+ export type DownstreamSupervisorEvent = {
108
+ kind: 'child_died_unexpectedly';
109
+ server: string;
110
+ reason: string;
111
+ } | {
112
+ kind: 'respawned';
113
+ server: string;
114
+ } | {
115
+ /**
116
+ * A non-transition health change. Fires whenever a visible field in
117
+ * {@link DownstreamHealth} (health, last_error, tools_count) mutates
118
+ * WITHOUT being accompanied by a breaker transition or respawn event.
119
+ *
120
+ * Codex 0.9.0 pass-2 P2a: without this event, the first failed call/
121
+ * reconnect below the breaker threshold (or a successful `listTools`
122
+ * that updates the cached tool count) never reaches the live state
123
+ * publisher, so `rea status` would show stale data until some later,
124
+ * unrelated circuit or respawn event finally flushed a snapshot.
125
+ *
126
+ * Firing is best-effort from the connection class; the pool additionally
127
+ * emits this kind after `listAllTools` updates `lastToolsCount` so a
128
+ * tool-catalog change is always visible in the next debounced snapshot.
129
+ */
130
+ kind: 'health_changed';
131
+ server: string;
132
+ };
77
133
  export declare class DownstreamConnection {
78
134
  #private;
79
135
  private readonly config;
@@ -84,6 +140,29 @@ export declare class DownstreamConnection {
84
140
  */
85
141
  private readonly logger?;
86
142
  private client;
143
+ /**
144
+ * Handle to the currently active transport, so our `onclose`/`onerror`
145
+ * hooks can tell "this is the transport we care about" vs "a stale callback
146
+ * firing after we already swapped to a new transport". Cleared in `close()`
147
+ * BEFORE we invoke `client.close()` so our own tear-down does not race the
148
+ * supervisor path.
149
+ */
150
+ private activeTransport;
151
+ /**
152
+ * Set of transports currently being torn down by an in-flight `close()`.
153
+ * `onclose` / `onerror` callbacks that fire for a transport in this set
154
+ * must NOT be promoted to an "unexpected child death" — they are our own
155
+ * tear-down signal.
156
+ *
157
+ * Codex P2 (0.9.0 review): the earlier `closingIntentionally` boolean was
158
+ * connection-wide. Under concurrent calls, one call's `await this.close()`
159
+ * could overlap with another call's reconnect that had already installed
160
+ * a NEW transport. A genuine `onclose` from the new transport would hit
161
+ * the boolean guard and be silently ignored, reintroducing the stale-
162
+ * handle bug the patch targeted. Per-transport scoping eliminates the
163
+ * race: only the exact transport we asked to close is silenced.
164
+ */
165
+ private readonly closingTransports;
87
166
  /**
88
167
  * Whether a reconnect has already been attempted in the CURRENT failure
89
168
  * episode. Resets to `false` after a reconnect succeeds (so a later,
@@ -93,7 +172,30 @@ export declare class DownstreamConnection {
93
172
  private reconnectAttempted;
94
173
  /** Epoch ms of the last successful reconnect. Used by the flapping guard. */
95
174
  private lastReconnectAt;
175
+ /**
176
+ * Epoch ms of the most recent unexpected child-death event. Stamped by
177
+ * `handleUnexpectedClose()`. 0 means "never died unexpectedly".
178
+ *
179
+ * Codex 0.9.0 pass-5 P2b: when `handleUnexpectedClose` nulls `this.client`,
180
+ * the very next `callTool` takes the top-level `client === null` branch,
181
+ * which normally bypasses the flap-window check entirely (that check lives
182
+ * in the catch branch below, conditioned on `lastReconnectAt`). A downstream
183
+ * that crashes immediately after every spawn would therefore be respawned
184
+ * unconditionally on every incoming call — exactly the loop the flap
185
+ * window is supposed to suppress. Consulting this timestamp in the
186
+ * `client === null` branch lets us refuse the respawn when the previous
187
+ * death is within the flap window, and the caller gets a clear error
188
+ * instead of watching the child die again.
189
+ */
190
+ private unexpectedDeathAt;
96
191
  private health;
192
+ /**
193
+ * Optional supervisor-event listener. Set via
194
+ * {@link onSupervisorEvent}. A single subscriber is sufficient — the pool
195
+ * is the one consumer. Listener failures are swallowed; a broken consumer
196
+ * must never break the connection lifecycle.
197
+ */
198
+ private supervisorListener;
97
199
  constructor(config: RegistryServer,
98
200
  /**
99
201
  * Optional structured logger (G5). When omitted, connection lifecycle
@@ -105,6 +207,61 @@ export declare class DownstreamConnection {
105
207
  get isHealthy(): boolean;
106
208
  /** True iff the underlying MCP client is currently connected. */
107
209
  get isConnected(): boolean;
210
+ /**
211
+ * Register a supervisor-event listener. Intended for the pool to wire up
212
+ * SESSION_BLOCKER tracking + observability hooks without the connection
213
+ * class having to know about either. Only one listener is supported — a
214
+ * second call replaces the first. Pass `null` to detach.
215
+ */
216
+ onSupervisorEvent(listener: ((event: DownstreamSupervisorEvent) => void) | null): void;
217
+ /**
218
+ * Invoke the supervisor listener if registered. Swallows listener errors —
219
+ * a broken observer must never break the connection state machine.
220
+ */
221
+ private emitSupervisorEvent;
222
+ /**
223
+ * Emit a `health_changed` event. Called from every site that mutates a
224
+ * health/last_error/tools_count-visible field WITHOUT firing one of the
225
+ * louder supervisor events (`child_died_unexpectedly` / `respawned`).
226
+ * Addresses Codex 0.9.0 pass-2 P2a — live-state was only scheduled from
227
+ * breaker transitions and respawns, so transient errors below the breaker
228
+ * threshold would leave `rea status` showing stale data.
229
+ */
230
+ private emitHealthChanged;
231
+ /**
232
+ * Handle an unexpected transport close. Fires when the child process exits
233
+ * outside a caller-initiated `close()`, or when the stdio pipe errors in a
234
+ * way the SDK surfaces as a close event.
235
+ *
236
+ * Contract:
237
+ * - Only runs for the currently-active transport (stale callbacks from
238
+ * an already-swapped transport are ignored).
239
+ * - Does NOT run when WE initiated the close (the transport is a member
240
+ * of `closingTransports` for the duration of our own `close()` call).
241
+ * - Nulls `this.client` so the next `callTool` takes the `connect()`
242
+ * branch and actually respawns the child.
243
+ * - Marks the connection unhealthy so the pool knows not to route
244
+ * traffic to it while we wait for the next call.
245
+ * - Emits a `child_died_unexpectedly` supervisor event so the pool's
246
+ * SESSION_BLOCKER tracker can count this even though no callTool has
247
+ * failed yet (the child may die mid-idle).
248
+ */
249
+ private handleUnexpectedClose;
250
+ /**
251
+ * Handle a transport-layer protocol error. onerror does NOT always imply
252
+ * close — the SDK emits it for protocol errors too. We record the error
253
+ * text but leave connection invalidation to the eventual onclose callback,
254
+ * which is guaranteed to follow a fatal transport error on stdio.
255
+ *
256
+ * Codex 0.9.0 pass-6 P2: filter stale/intentional-close callbacks the
257
+ * same way `handleUnexpectedClose` does. Without this, a delayed
258
+ * onerror from a PREVIOUSLY-ACTIVE transport (one we've already torn
259
+ * down or replaced) can clobber the HEALTHY replacement connection's
260
+ * last_error and emit a spurious health_changed, leaving `rea status`
261
+ * showing a stale error on a perfectly live child. The `onclose`
262
+ * hook already enforced this filter; the `onerror` hook did not.
263
+ */
264
+ private handleTransportError;
108
265
  /**
109
266
  * Last error observed, or null if the connection has never failed (or fully
110
267
  * recovered).