@bookedsolid/rea 0.8.0 → 0.9.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/dist/cli/serve.d.ts +8 -0
- package/dist/cli/serve.js +32 -6
- package/dist/cli/status.d.ts +25 -0
- package/dist/cli/status.js +86 -1
- package/dist/gateway/circuit-breaker.d.ts +8 -2
- package/dist/gateway/downstream-pool.d.ts +13 -1
- package/dist/gateway/downstream-pool.js +30 -2
- package/dist/gateway/downstream.d.ts +157 -0
- package/dist/gateway/downstream.js +307 -5
- package/dist/gateway/live-state.d.ts +252 -0
- package/dist/gateway/live-state.js +504 -0
- package/dist/gateway/server.d.ts +44 -1
- package/dist/gateway/server.js +101 -1
- package/dist/gateway/session-blocker.d.ts +132 -0
- package/dist/gateway/session-blocker.js +163 -0
- package/package.json +1 -1
package/dist/cli/serve.d.ts
CHANGED
|
@@ -5,10 +5,18 @@
|
|
|
5
5
|
* later `rea serve` that has raced in and rewritten the breadcrumbs
|
|
6
6
|
* is never unexpectedly unlinked.
|
|
7
7
|
*/
|
|
8
|
+
/**
 * On-disk shape of the serve-state file.
 *
 * The `downstreams` block was introduced in 0.9.0. Readers that find it
 * missing treat that as "no live view available" and rely on the three
 * pre-0.9 fields only.
 */
interface ServeState {
    /** Ownership key consulted by `cleanupStateIfOwned` during shutdown. */
    session_id: string;
    started_at: string;
    metrics_port: number | null;
    /** 0.9.0 — populated after the gateway starts; absent on this initial write. */
    downstreams?: Array<unknown>;
}
|
|
13
21
|
/**
|
|
14
22
|
* Atomic file write: stage to a per-pid temp name, then rename(2). The
|
package/dist/cli/serve.js
CHANGED
|
@@ -249,12 +249,30 @@ export async function runServe() {
|
|
|
249
249
|
console.error('');
|
|
250
250
|
process.exit(1);
|
|
251
251
|
}
|
|
252
|
+
// Metadata we'll also stamp into the state file below so `rea status`
|
|
253
|
+
// sees the session-id and start time alongside the new downstream block.
|
|
254
|
+
const startedAt = new Date().toISOString();
|
|
255
|
+
const statePath = reaPath(baseDir, SERVE_STATE_FILE);
|
|
252
256
|
const handle = createGateway({
|
|
253
257
|
baseDir,
|
|
254
258
|
policy,
|
|
255
259
|
registry: gatedRegistry,
|
|
256
260
|
logger,
|
|
257
261
|
metrics: metricsRegistry,
|
|
262
|
+
// 0.9.0 — let the gateway own live writes to serve.state.json so
|
|
263
|
+
// circuit-breaker transitions and supervisor events are reflected on
|
|
264
|
+
// disk for `rea status --json`. Legacy shape (session_id, started_at,
|
|
265
|
+
// metrics_port) is preserved for backward compatibility.
|
|
266
|
+
liveStateFilePath: statePath,
|
|
267
|
+
liveStateSessionId: sessionId,
|
|
268
|
+
liveStateStartedAt: startedAt,
|
|
269
|
+
liveStateMetricsPort: metricsServer?.port() ?? null,
|
|
270
|
+
// 0.9.0 pass-7 — reuse the gateway log redactor so downstream error
|
|
271
|
+
// strings are scrubbed for secret-shaped content BEFORE hitting
|
|
272
|
+
// serve.state.json or the operator's terminal via `rea status`.
|
|
273
|
+
// The redactor already incorporates SECRET_PATTERNS plus any
|
|
274
|
+
// operator-defined policy.redact.patterns loaded above.
|
|
275
|
+
liveStateLastErrorRedactor: logRedactor,
|
|
258
276
|
});
|
|
259
277
|
// ── HALT acknowledgement at startup (G5) ─────────────────────────────────
|
|
260
278
|
const haltPath = reaPath(baseDir, HALT_FILE);
|
|
@@ -280,13 +298,21 @@ export async function runServe() {
|
|
|
280
298
|
codexProbe.start();
|
|
281
299
|
}
|
|
282
300
|
// ── Pidfile + state (AFTER metrics boot so we persist the real port) ─────
|
|
283
|
-
|
|
301
|
+
//
|
|
302
|
+
// 0.9.0: the gateway's LiveStatePublisher owns all writes to
|
|
303
|
+
// serve.state.json, including the boot-time snapshot. Earlier drafts
|
|
304
|
+
// used the legacy `writeStateFile()` here to cover the bootstrap window
|
|
305
|
+
// between now and `handle.start()`'s first flush, but that write
|
|
306
|
+
// bypassed the sidecar-lock protocol and reintroduced the TOCTOU race
|
|
307
|
+
// P2b was designed to close (Codex 0.9.0 pass-3 P1: an overlapping
|
|
308
|
+
// older `rea serve` could clobber this unprotected write and the
|
|
309
|
+
// newer instance would later clean up its own file during shutdown).
|
|
310
|
+
//
|
|
311
|
+
// Routing the boot write through `handle.livePublisher.flushNow()`
|
|
312
|
+
// means the boot snapshot is guarded by the same lock as every
|
|
313
|
+
// subsequent flush; overlapping gateways serialize cleanly.
|
|
284
314
|
const pidPath = writePidfile(baseDir);
|
|
285
|
-
|
|
286
|
-
session_id: sessionId,
|
|
287
|
-
started_at: startedAt,
|
|
288
|
-
metrics_port: metricsServer?.port() ?? null,
|
|
289
|
-
});
|
|
315
|
+
handle.livePublisher?.flushNow();
|
|
290
316
|
let shuttingDown = false;
|
|
291
317
|
const shutdown = async (signal) => {
|
|
292
318
|
// A second signal (e.g. SIGTERM then SIGINT) must NOT re-enter cleanup —
|
package/dist/cli/status.d.ts
CHANGED
|
@@ -47,6 +47,24 @@ export declare function sanitizeForTerminal(value: string): string;
|
|
|
47
47
|
export interface StatusOptions {
|
|
48
48
|
json?: boolean | undefined;
|
|
49
49
|
}
|
|
50
|
+
/**
 * Live state of one downstream server, as surfaced by `rea status` in both
 * its JSON and pretty outputs (0.9.0, BUG-005).
 *
 * This mirrors `LiveDownstreamState` in `src/gateway/live-state.ts`. The
 * shape is intentionally duplicated so the CLI surface stays independent of
 * gateway internals (the CLI can be built without the gateway module in a
 * trimmed install).
 */
export interface LiveDownstreamSnapshot {
    /** Downstream server name; entries without a non-empty name are dropped by the reader. */
    name: string;
    /** Combined with `healthy` to pick the pretty-output health token. */
    connected: boolean;
    /** `false` renders as `UNHEALTHY` in the pretty output regardless of `connected`. */
    healthy: boolean;
    /** Circuit-breaker state; the reader falls back to `'closed'` for unrecognised values. */
    circuit_state: 'closed' | 'open' | 'half-open';
    /** Retry timestamp string, or `null`; printed only while the circuit is `'open'`. */
    retry_at: string | null;
    /** Most recent error text, or `null` when none was recorded. */
    last_error: string | null;
    /** Number of tools advertised, or `null` when the state file carried no usable value. */
    tools_count: number | null;
    /** Count of transitions into the open state; printed only when greater than zero. */
    open_transitions: number;
    /** When `true`, the pretty output appends "(SESSION_BLOCKER fired)" to the transitions line. */
    session_blocker_emitted: boolean;
}
|
|
50
68
|
interface ServeLiveness {
|
|
51
69
|
running: boolean;
|
|
52
70
|
pid: number | null;
|
|
@@ -56,6 +74,13 @@ interface ServeLiveness {
|
|
|
56
74
|
session_id: string | null;
|
|
57
75
|
started_at: string | null;
|
|
58
76
|
metrics_port: number | null;
|
|
77
|
+
/**
|
|
78
|
+
* 0.9.0 — per-downstream live block, or `null` when the state file was
|
|
79
|
+
* written by an older gateway version that did not include it. A
|
|
80
|
+
* zero-length array means "gateway is running with no downstreams
|
|
81
|
+
* configured", which is a distinct signal from "unknown".
|
|
82
|
+
*/
|
|
83
|
+
downstreams: LiveDownstreamSnapshot[] | null;
|
|
59
84
|
}
|
|
60
85
|
interface AuditStats {
|
|
61
86
|
present: boolean;
|
package/dist/cli/status.js
CHANGED
|
@@ -96,21 +96,64 @@ function readPidfile(baseDir) {
|
|
|
96
96
|
return null;
|
|
97
97
|
}
|
|
98
98
|
}
|
|
99
|
+
/**
 * Parse a single downstream entry from `serve.state.json`.
 *
 * Validation is per-field: a field with an unexpected type is replaced by a
 * safe fallback rather than poisoning the whole entry, because the state
 * file is touched on a hot path and a half-useful snapshot is preferable to
 * a "corrupt, try again" error for the operator. Fallbacks:
 *
 *   - `connected`, `healthy`, `session_blocker_emitted` → `false`
 *   - `circuit_state` → `'closed'` unless it is exactly `'closed'`,
 *     `'open'` or `'half-open'`
 *   - `retry_at`, `last_error`, `tools_count` → `null`
 *   - `open_transitions` → `0`
 *
 * `tools_count` and `open_transitions` are counts, so they must be
 * non-negative integers; negative or fractional numbers are treated the
 * same as a wrong type.
 *
 * @param raw One element of the state file's `downstreams` array, exactly
 *   as produced by `JSON.parse`.
 * @returns The normalised snapshot, or `null` when `raw` is not an object or
 *   its `name` is missing, not a string, or empty — a downstream with no
 *   name is unusable for display.
 */
function parseDownstreamEntry(raw) {
    if (typeof raw !== 'object' || raw === null)
        return null;
    const r = raw;
    if (typeof r.name !== 'string' || r.name.length === 0)
        return null;
    // A count is only meaningful as a non-negative integer; anything else
    // (negative, fractional, NaN, non-number) falls back to the default.
    const isCount = (value) => typeof value === 'number' && Number.isInteger(value) && value >= 0;
    const circuit = r.circuit_state === 'open' || r.circuit_state === 'half-open' || r.circuit_state === 'closed'
        ? r.circuit_state
        : 'closed';
    return {
        name: r.name,
        connected: typeof r.connected === 'boolean' ? r.connected : false,
        healthy: typeof r.healthy === 'boolean' ? r.healthy : false,
        circuit_state: circuit,
        retry_at: typeof r.retry_at === 'string' ? r.retry_at : null,
        last_error: typeof r.last_error === 'string' ? r.last_error : null,
        tools_count: isCount(r.tools_count) ? r.tools_count : null,
        open_transitions: isCount(r.open_transitions) ? r.open_transitions : 0,
        session_blocker_emitted: typeof r.session_blocker_emitted === 'boolean' ? r.session_blocker_emitted : false,
    };
}
|
|
99
132
|
/**
 * Load the serve-state file under `baseDir` and normalise it for
 * `rea status`.
 *
 * Each legacy field (`session_id`, `started_at`, `metrics_port`) is kept
 * only when it has the expected type and is `null` otherwise. `downstreams`
 * is `null` when the file has no array under that key; otherwise it is the
 * list of entries that `parseDownstreamEntry` accepted (possibly empty).
 *
 * Any exception raised while reading, parsing or normalising the file is
 * swallowed and reported as an all-`null` result.
 */
function readServeState(baseDir) {
    const statePath = reaPath(baseDir, SERVE_STATE_FILE);
    const stringOrNull = (value) => (typeof value === 'string' ? value : null);
    try {
        const state = JSON.parse(fs.readFileSync(statePath, 'utf8'));
        // Keep "no block at all" (null) distinct from "block present but empty" ([]).
        const downstreams = Array.isArray(state.downstreams)
            ? state.downstreams
                .map((entry) => parseDownstreamEntry(entry))
                .filter((snapshot) => snapshot !== null)
            : null;
        const port = state.metrics_port;
        return {
            session_id: stringOrNull(state.session_id),
            started_at: stringOrNull(state.started_at),
            metrics_port: typeof port === 'number' && Number.isInteger(port) ? port : null,
            downstreams,
        };
    }
    catch {
        return { session_id: null, started_at: null, metrics_port: null, downstreams: null };
    }
}
|
|
116
159
|
function probeServe(baseDir) {
|
|
@@ -124,6 +167,7 @@ function probeServe(baseDir) {
|
|
|
124
167
|
session_id: null,
|
|
125
168
|
started_at: null,
|
|
126
169
|
metrics_port: null,
|
|
170
|
+
downstreams: null,
|
|
127
171
|
};
|
|
128
172
|
}
|
|
129
173
|
const alive = isProcessAlive(pid);
|
|
@@ -135,6 +179,7 @@ function probeServe(baseDir) {
|
|
|
135
179
|
session_id: state.session_id,
|
|
136
180
|
started_at: state.started_at,
|
|
137
181
|
metrics_port: state.metrics_port,
|
|
182
|
+
downstreams: state.downstreams,
|
|
138
183
|
};
|
|
139
184
|
}
|
|
140
185
|
/**
|
|
@@ -356,6 +401,46 @@ function printPretty(payload) {
|
|
|
356
401
|
}
|
|
357
402
|
}
|
|
358
403
|
console.log('');
|
|
404
|
+
// 0.9.0 — per-downstream block. Only shown when the serve process is
|
|
405
|
+
// believed to be running AND the state file carried the new array. An
|
|
406
|
+
// older gateway version that predates the publisher leaves `downstreams`
|
|
407
|
+
// null; we print an explanatory hint instead of rendering an empty
|
|
408
|
+
// table that looks like "zero downstreams".
|
|
409
|
+
if (s.running) {
|
|
410
|
+
console.log(' Downstreams');
|
|
411
|
+
if (s.downstreams === null) {
|
|
412
|
+
console.log(` (state file has no downstream block — upgrade gateway to ≥0.9.0)`);
|
|
413
|
+
}
|
|
414
|
+
else if (s.downstreams.length === 0) {
|
|
415
|
+
console.log(` (no downstream servers declared in .rea/registry.yaml)`);
|
|
416
|
+
}
|
|
417
|
+
else {
|
|
418
|
+
for (const d of s.downstreams) {
|
|
419
|
+
const name = sanitizeForTerminal(d.name);
|
|
420
|
+
const lastErr = safePretty(d.last_error);
|
|
421
|
+
const retryAt = safePretty(d.retry_at);
|
|
422
|
+
const healthToken = d.healthy ? (d.connected ? 'healthy' : 'connecting') : 'UNHEALTHY';
|
|
423
|
+
const circuit = d.circuit_state.toUpperCase();
|
|
424
|
+
console.log(` ${name}`);
|
|
425
|
+
console.log(` Health: ${healthToken}`);
|
|
426
|
+
console.log(` Circuit: ${circuit}`);
|
|
427
|
+
if (retryAt !== null && d.circuit_state === 'open') {
|
|
428
|
+
console.log(` Retry at: ${retryAt}`);
|
|
429
|
+
}
|
|
430
|
+
if (d.tools_count !== null) {
|
|
431
|
+
console.log(` Tools advertised: ${d.tools_count}`);
|
|
432
|
+
}
|
|
433
|
+
if (d.open_transitions > 0) {
|
|
434
|
+
const blockerSuffix = d.session_blocker_emitted ? ' (SESSION_BLOCKER fired)' : '';
|
|
435
|
+
console.log(` Open transitions: ${d.open_transitions}${blockerSuffix}`);
|
|
436
|
+
}
|
|
437
|
+
if (lastErr !== null) {
|
|
438
|
+
console.log(` Last error: ${lastErr}`);
|
|
439
|
+
}
|
|
440
|
+
}
|
|
441
|
+
}
|
|
442
|
+
console.log('');
|
|
443
|
+
}
|
|
359
444
|
console.log(' Audit log');
|
|
360
445
|
if (!a.present) {
|
|
361
446
|
console.log(` State: not yet written`);
|
|
@@ -25,7 +25,14 @@ export interface CircuitStatus {
|
|
|
25
25
|
serverName: string;
|
|
26
26
|
retryAt?: string;
|
|
27
27
|
}
|
|
28
|
-
|
|
28
|
+
/**
|
|
29
|
+
* Internal per-server circuit state. Exported so observability consumers
|
|
30
|
+
* (live-state publisher, tests) can read `openedAt` and `cooldownMs` to
|
|
31
|
+
* compute a `retry_at` timestamp without duplicating the arithmetic.
|
|
32
|
+
* Treat fields as read-only from outside the breaker — mutating them
|
|
33
|
+
* breaks the invariants `recordSuccess` / `recordFailure` enforce.
|
|
34
|
+
*/
|
|
35
|
+
export interface CircuitEntry {
|
|
29
36
|
state: CircuitState;
|
|
30
37
|
consecutiveFailures: number;
|
|
31
38
|
openedAt: number | null;
|
|
@@ -57,4 +64,3 @@ export declare class CircuitBreaker {
|
|
|
57
64
|
recordFailure(serverName: string): void;
|
|
58
65
|
getCircuit(serverName: string): CircuitEntry | undefined;
|
|
59
66
|
}
|
|
60
|
-
export {};
|
|
@@ -5,7 +5,7 @@
|
|
|
5
5
|
* The gateway splits on the FIRST `__` — downstream tools that themselves
|
|
6
6
|
* contain `__` in their name continue to work because the split is one-shot.
|
|
7
7
|
*/
|
|
8
|
-
import { DownstreamConnection, type DownstreamToolInfo } from './downstream.js';
|
|
8
|
+
import { DownstreamConnection, type DownstreamSupervisorEvent, type DownstreamToolInfo } from './downstream.js';
|
|
9
9
|
import type { Registry } from '../registry/types.js';
|
|
10
10
|
import type { Logger } from './log.js';
|
|
11
11
|
export interface PrefixedTool extends DownstreamToolInfo {
|
|
@@ -44,7 +44,19 @@ export declare class DownstreamPool {
|
|
|
44
44
|
* or is skipped. Stale but truthful > absent.
|
|
45
45
|
*/
|
|
46
46
|
private readonly lastToolsCount;
|
|
47
|
+
/**
|
|
48
|
+
* Optional supervisor event listener wired by the gateway. The pool
|
|
49
|
+
* re-emits per-connection events through this single sink so the
|
|
50
|
+
* SESSION_BLOCKER tracker + state publisher only need to subscribe once.
|
|
51
|
+
*/
|
|
52
|
+
private supervisorListener;
|
|
47
53
|
constructor(registry: Registry, logger?: Logger);
|
|
54
|
+
/**
|
|
55
|
+
* Register a supervisor-event sink. Replaces any previously registered
|
|
56
|
+
* listener. Intended for the gateway to wire the SESSION_BLOCKER tracker
|
|
57
|
+
* and live state publisher.
|
|
58
|
+
*/
|
|
59
|
+
onSupervisorEvent(listener: ((event: DownstreamSupervisorEvent) => void) | null): void;
|
|
48
60
|
get size(): number;
|
|
49
61
|
connectAll(): Promise<void>;
|
|
50
62
|
/**
|
|
@@ -5,7 +5,7 @@
|
|
|
5
5
|
* The gateway splits on the FIRST `__` — downstream tools that themselves
|
|
6
6
|
* contain `__` in their name continue to work because the split is one-shot.
|
|
7
7
|
*/
|
|
8
|
-
import { DownstreamConnection } from './downstream.js';
|
|
8
|
+
import { DownstreamConnection, } from './downstream.js';
|
|
9
9
|
export class DownstreamPool {
|
|
10
10
|
connections = new Map();
|
|
11
11
|
/**
|
|
@@ -15,13 +15,31 @@ export class DownstreamPool {
|
|
|
15
15
|
* or is skipped. Stale but truthful > absent.
|
|
16
16
|
*/
|
|
17
17
|
lastToolsCount = new Map();
|
|
18
|
+
/**
|
|
19
|
+
* Optional supervisor event listener wired by the gateway. The pool
|
|
20
|
+
* re-emits per-connection events through this single sink so the
|
|
21
|
+
* SESSION_BLOCKER tracker + state publisher only need to subscribe once.
|
|
22
|
+
*/
|
|
23
|
+
supervisorListener = null;
|
|
18
24
|
constructor(registry, logger) {
|
|
19
25
|
for (const server of registry.servers) {
|
|
20
26
|
if (!server.enabled)
|
|
21
27
|
continue;
|
|
22
|
-
|
|
28
|
+
const conn = new DownstreamConnection(server, logger);
|
|
29
|
+
conn.onSupervisorEvent((event) => {
|
|
30
|
+
this.supervisorListener?.(event);
|
|
31
|
+
});
|
|
32
|
+
this.connections.set(server.name, conn);
|
|
23
33
|
}
|
|
24
34
|
}
|
|
35
|
+
/**
|
|
36
|
+
* Register a supervisor-event sink. Replaces any previously registered
|
|
37
|
+
* listener. Intended for the gateway to wire the SESSION_BLOCKER tracker
|
|
38
|
+
* and live state publisher.
|
|
39
|
+
*/
|
|
40
|
+
onSupervisorEvent(listener) {
|
|
41
|
+
this.supervisorListener = listener;
|
|
42
|
+
}
|
|
25
43
|
get size() {
|
|
26
44
|
return this.connections.size;
|
|
27
45
|
}
|
|
@@ -52,7 +70,17 @@ export class DownstreamPool {
|
|
|
52
70
|
continue;
|
|
53
71
|
try {
|
|
54
72
|
const tools = await conn.listTools();
|
|
73
|
+
// Codex 0.9.0 pass-2 P2a: emit a `health_changed` supervisor event
|
|
74
|
+
// whenever the cached tools count actually changes. Without this,
|
|
75
|
+
// a successful listTools would update the value in memory but the
|
|
76
|
+
// live-state publisher would never flush the change — `rea status`
|
|
77
|
+
// would keep reporting a stale `tools_count` until some unrelated
|
|
78
|
+
// circuit/respawn event flushed a snapshot.
|
|
79
|
+
const prev = this.lastToolsCount.get(server);
|
|
55
80
|
this.lastToolsCount.set(server, tools.length);
|
|
81
|
+
if (prev !== tools.length) {
|
|
82
|
+
this.supervisorListener?.({ kind: 'health_changed', server });
|
|
83
|
+
}
|
|
56
84
|
for (const t of tools) {
|
|
57
85
|
const prefixed = {
|
|
58
86
|
...t,
|
|
@@ -30,6 +30,26 @@
|
|
|
30
30
|
* successful reconnect — in that case we mark the connection unhealthy and
|
|
31
31
|
* let the circuit breaker take over.
|
|
32
32
|
*
|
|
33
|
+
* ## Supervisor / child-death detection (0.9.0, BUG-002..003)
|
|
34
|
+
*
|
|
35
|
+
* The SDK `StdioClientTransport` exposes `onclose` + `onerror` callbacks that
|
|
36
|
+
* fire when the child process exits or the stdio pipe errors outside a
|
|
37
|
+
* caller-initiated `close()`. We wire both and treat an unexpected close as
|
|
38
|
+
* "child is dead" — the next `callTool` must force a fresh connect rather
|
|
39
|
+
* than calling into a stale `Client` that will reply `Not connected`.
|
|
40
|
+
*
|
|
41
|
+
* Before 0.9.0 the supervisor was reactive only: a dead child was not noticed
|
|
42
|
+
* until the NEXT tool call tried to use it, at which point the circuit could
|
|
43
|
+
* flap open → half-open → open with the child still dead because the
|
|
44
|
+
* half-open probe re-used the zombie client. 0.9.0 makes death detection
|
|
45
|
+
* eager: `onclose` nulls `this.client` so the very next call takes the
|
|
46
|
+
* `connect()` branch and actually respawns the child.
|
|
47
|
+
*
|
|
48
|
+
* "Not connected" error messages from the SDK (our in-flight fallback) are
|
|
49
|
+
* now also treated as fatal for the current client — we null it before the
|
|
50
|
+
* one-shot reconnect path so we spawn fresh rather than retrying with the
|
|
51
|
+
* same dead handle.
|
|
52
|
+
*
|
|
33
53
|
* ## Why not request-level retries
|
|
34
54
|
*
|
|
35
55
|
* MCP tool calls are not idempotent by default. Retrying `send_message` after
|
|
@@ -74,6 +94,42 @@ export interface BuiltChildEnv {
|
|
|
74
94
|
secretKeys: string[];
|
|
75
95
|
}
|
|
76
96
|
export declare function buildChildEnv(config: RegistryServer, hostEnv?: NodeJS.ProcessEnv): BuiltChildEnv;
|
|
97
|
+
/**
 * Lifecycle event emitted by {@link DownstreamConnection} when its
 * supervisor observes a transition worth surfacing. Consumers (the pool,
 * the SESSION_BLOCKER tracker, observability sinks) subscribe through
 * {@link DownstreamConnection.onSupervisorEvent}.
 *
 * `kind` is a narrow closed set so sinks can switch on it exhaustively.
 * `reason` carries the operator-readable detail and is already bounded by
 * `boundedDiagnosticString` at the call site.
 */
export type DownstreamSupervisorEvent =
    | {
        kind: 'child_died_unexpectedly';
        server: string;
        reason: string;
    }
    | {
        kind: 'respawned';
        server: string;
    }
    | {
        /**
         * A health change that is not a transition: fired whenever a visible
         * field of {@link DownstreamHealth} (health, last_error, tools_count)
         * mutates WITHOUT an accompanying breaker transition or respawn.
         *
         * Codex 0.9.0 pass-2 P2a: without this event, the first failed
         * call/reconnect below the breaker threshold (or a successful
         * `listTools` that updates the cached tool count) never reaches the
         * live state publisher, so `rea status` would show stale data until
         * some later, unrelated circuit or respawn event flushed a snapshot.
         *
         * Firing is best-effort from the connection class; the pool
         * additionally emits this kind after `listAllTools` updates
         * `lastToolsCount`, so a tool-catalog change is always visible in the
         * next debounced snapshot.
         */
        kind: 'health_changed';
        server: string;
    };
|
|
77
133
|
export declare class DownstreamConnection {
|
|
78
134
|
#private;
|
|
79
135
|
private readonly config;
|
|
@@ -84,6 +140,29 @@ export declare class DownstreamConnection {
|
|
|
84
140
|
*/
|
|
85
141
|
private readonly logger?;
|
|
86
142
|
private client;
|
|
143
|
+
/**
|
|
144
|
+
* Handle to the currently active transport, so our `onclose`/`onerror`
|
|
145
|
+
* hooks can tell "this is the transport we care about" vs "a stale callback
|
|
146
|
+
* firing after we already swapped to a new transport". Cleared in `close()`
|
|
147
|
+
* BEFORE we invoke `client.close()` so our own tear-down does not race the
|
|
148
|
+
* supervisor path.
|
|
149
|
+
*/
|
|
150
|
+
private activeTransport;
|
|
151
|
+
/**
|
|
152
|
+
* Set of transports currently being torn down by an in-flight `close()`.
|
|
153
|
+
* `onclose` / `onerror` callbacks that fire for a transport in this set
|
|
154
|
+
* must NOT be promoted to an "unexpected child death" — they are our own
|
|
155
|
+
* tear-down signal.
|
|
156
|
+
*
|
|
157
|
+
* Codex P2 (0.9.0 review): the earlier `closingIntentionally` boolean was
|
|
158
|
+
* connection-wide. Under concurrent calls, one call's `await this.close()`
|
|
159
|
+
* could overlap with another call's reconnect that had already installed
|
|
160
|
+
* a NEW transport. A genuine `onclose` from the new transport would hit
|
|
161
|
+
* the boolean guard and be silently ignored, reintroducing the stale-
|
|
162
|
+
* handle bug the patch targeted. Per-transport scoping eliminates the
|
|
163
|
+
* race: only the exact transport we asked to close is silenced.
|
|
164
|
+
*/
|
|
165
|
+
private readonly closingTransports;
|
|
87
166
|
/**
|
|
88
167
|
* Whether a reconnect has already been attempted in the CURRENT failure
|
|
89
168
|
* episode. Resets to `false` after a reconnect succeeds (so a later,
|
|
@@ -93,7 +172,30 @@ export declare class DownstreamConnection {
|
|
|
93
172
|
private reconnectAttempted;
|
|
94
173
|
/** Epoch ms of the last successful reconnect. Used by the flapping guard. */
|
|
95
174
|
private lastReconnectAt;
|
|
175
|
+
/**
|
|
176
|
+
* Epoch ms of the most recent unexpected child-death event. Stamped by
|
|
177
|
+
* `handleUnexpectedClose()`. 0 means "never died unexpectedly".
|
|
178
|
+
*
|
|
179
|
+
* Codex 0.9.0 pass-5 P2b: when `handleUnexpectedClose` nulls `this.client`,
|
|
180
|
+
* the very next `callTool` takes the top-level `client === null` branch,
|
|
181
|
+
* which normally bypasses the flap-window check entirely (that check lives
|
|
182
|
+
* in the catch branch below, conditioned on `lastReconnectAt`). A downstream
|
|
183
|
+
* that crashes immediately after every spawn would therefore be respawned
|
|
184
|
+
* unconditionally on every incoming call — exactly the loop the flap
|
|
185
|
+
* window is supposed to suppress. Consulting this timestamp in the
|
|
186
|
+
* `client === null` branch lets us refuse the respawn when the previous
|
|
187
|
+
* death is within the flap window, and the caller gets a clear error
|
|
188
|
+
* instead of watching the child die again.
|
|
189
|
+
*/
|
|
190
|
+
private unexpectedDeathAt;
|
|
96
191
|
private health;
|
|
192
|
+
/**
|
|
193
|
+
* Optional supervisor-event listener. Set via
|
|
194
|
+
* {@link onSupervisorEvent}. A single subscriber is sufficient — the pool
|
|
195
|
+
* is the one consumer. Listener failures are swallowed; a broken consumer
|
|
196
|
+
* must never break the connection lifecycle.
|
|
197
|
+
*/
|
|
198
|
+
private supervisorListener;
|
|
97
199
|
constructor(config: RegistryServer,
|
|
98
200
|
/**
|
|
99
201
|
* Optional structured logger (G5). When omitted, connection lifecycle
|
|
@@ -105,6 +207,61 @@ export declare class DownstreamConnection {
|
|
|
105
207
|
get isHealthy(): boolean;
|
|
106
208
|
/** True iff the underlying MCP client is currently connected. */
|
|
107
209
|
get isConnected(): boolean;
|
|
210
|
+
/**
|
|
211
|
+
* Register a supervisor-event listener. Intended for the pool to wire up
|
|
212
|
+
* SESSION_BLOCKER tracking + observability hooks without the connection
|
|
213
|
+
* class having to know about either. Only one listener is supported — a
|
|
214
|
+
* second call replaces the first. Pass `null` to detach.
|
|
215
|
+
*/
|
|
216
|
+
onSupervisorEvent(listener: ((event: DownstreamSupervisorEvent) => void) | null): void;
|
|
217
|
+
/**
|
|
218
|
+
* Invoke the supervisor listener if registered. Swallows listener errors —
|
|
219
|
+
* a broken observer must never break the connection state machine.
|
|
220
|
+
*/
|
|
221
|
+
private emitSupervisorEvent;
|
|
222
|
+
/**
|
|
223
|
+
* Emit a `health_changed` event. Called from every site that mutates a
|
|
224
|
+
* health/last_error/tools_count-visible field WITHOUT firing one of the
|
|
225
|
+
* louder supervisor events (`child_died_unexpectedly` / `respawned`).
|
|
226
|
+
* Addresses Codex 0.9.0 pass-2 P2a — live-state was only scheduled from
|
|
227
|
+
* breaker transitions and respawns, so transient errors below the breaker
|
|
228
|
+
* threshold would leave `rea status` showing stale data.
|
|
229
|
+
*/
|
|
230
|
+
private emitHealthChanged;
|
|
231
|
+
/**
|
|
232
|
+
* Handle an unexpected transport close. Fires when the child process exits
|
|
233
|
+
* outside a caller-initiated `close()`, or when the stdio pipe errors in a
|
|
234
|
+
* way the SDK surfaces as a close event.
|
|
235
|
+
*
|
|
236
|
+
* Contract:
|
|
237
|
+
* - Only runs for the currently-active transport (stale callbacks from
|
|
238
|
+
* an already-swapped transport are ignored).
|
|
239
|
+
* - Does NOT run when WE initiated the close (the transport is a member
|
|
240
|
+
* of `closingTransports` for the duration of our own `close()` call).
|
|
241
|
+
* - Nulls `this.client` so the next `callTool` takes the `connect()`
|
|
242
|
+
* branch and actually respawns the child.
|
|
243
|
+
* - Marks the connection unhealthy so the pool knows not to route
|
|
244
|
+
* traffic to it while we wait for the next call.
|
|
245
|
+
* - Emits a `child_died_unexpectedly` supervisor event so the pool's
|
|
246
|
+
* SESSION_BLOCKER tracker can count this even though no callTool has
|
|
247
|
+
* failed yet (the child may die mid-idle).
|
|
248
|
+
*/
|
|
249
|
+
private handleUnexpectedClose;
|
|
250
|
+
/**
|
|
251
|
+
* Handle a transport-layer protocol error. onerror does NOT always imply
|
|
252
|
+
* close — the SDK emits it for protocol errors too. We record the error
|
|
253
|
+
* text but leave connection invalidation to the eventual onclose callback,
|
|
254
|
+
* which is guaranteed to follow a fatal transport error on stdio.
|
|
255
|
+
*
|
|
256
|
+
* Codex 0.9.0 pass-6 P2: filter stale/intentional-close callbacks the
|
|
257
|
+
* same way `handleUnexpectedClose` does. Without this, a delayed
|
|
258
|
+
* onerror from a PREVIOUSLY-ACTIVE transport (one we've already torn
|
|
259
|
+
* down or replaced) can clobber the HEALTHY replacement connection's
|
|
260
|
+
* last_error and emit a spurious health_changed, leaving `rea status`
|
|
261
|
+
* showing a stale error on a perfectly live child. The `onclose`
|
|
262
|
+
* hook already enforced this filter; the `onerror` hook did not.
|
|
263
|
+
*/
|
|
264
|
+
private handleTransportError;
|
|
108
265
|
/**
|
|
109
266
|
* Last error observed, or null if the connection has never failed (or fully
|
|
110
267
|
* recovered).
|