@bookedsolid/rea 0.2.1 → 0.4.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/.husky/pre-push +15 -18
- package/README.md +41 -1
- package/THREAT_MODEL.md +100 -29
- package/dist/audit/append.d.ts +21 -8
- package/dist/audit/append.js +48 -83
- package/dist/audit/fs.d.ts +68 -0
- package/dist/audit/fs.js +171 -0
- package/dist/cli/audit.d.ts +40 -0
- package/dist/cli/audit.js +205 -0
- package/dist/cli/doctor.d.ts +19 -4
- package/dist/cli/doctor.js +172 -5
- package/dist/cli/index.js +26 -1
- package/dist/cli/init.js +93 -7
- package/dist/cli/install/pre-push.d.ts +335 -0
- package/dist/cli/install/pre-push.js +2818 -0
- package/dist/cli/serve.d.ts +64 -0
- package/dist/cli/serve.js +270 -2
- package/dist/cli/status.d.ts +90 -0
- package/dist/cli/status.js +399 -0
- package/dist/cli/utils.d.ts +4 -0
- package/dist/cli/utils.js +4 -0
- package/dist/gateway/audit/rotator.d.ts +116 -0
- package/dist/gateway/audit/rotator.js +289 -0
- package/dist/gateway/circuit-breaker.d.ts +17 -0
- package/dist/gateway/circuit-breaker.js +32 -3
- package/dist/gateway/downstream-pool.d.ts +2 -1
- package/dist/gateway/downstream-pool.js +2 -2
- package/dist/gateway/downstream.d.ts +39 -3
- package/dist/gateway/downstream.js +73 -14
- package/dist/gateway/log.d.ts +122 -0
- package/dist/gateway/log.js +334 -0
- package/dist/gateway/middleware/audit.d.ts +24 -1
- package/dist/gateway/middleware/audit.js +103 -58
- package/dist/gateway/middleware/blocked-paths.d.ts +0 -9
- package/dist/gateway/middleware/blocked-paths.js +439 -67
- package/dist/gateway/middleware/injection.d.ts +218 -13
- package/dist/gateway/middleware/injection.js +433 -51
- package/dist/gateway/middleware/kill-switch.d.ts +10 -1
- package/dist/gateway/middleware/kill-switch.js +20 -1
- package/dist/gateway/observability/metrics.d.ts +125 -0
- package/dist/gateway/observability/metrics.js +321 -0
- package/dist/gateway/server.d.ts +19 -0
- package/dist/gateway/server.js +99 -15
- package/dist/policy/loader.d.ts +47 -0
- package/dist/policy/loader.js +47 -0
- package/dist/policy/profiles.d.ts +13 -0
- package/dist/policy/profiles.js +12 -0
- package/dist/policy/types.d.ts +52 -0
- package/dist/registry/fingerprint.d.ts +73 -0
- package/dist/registry/fingerprint.js +81 -0
- package/dist/registry/fingerprints-store.d.ts +62 -0
- package/dist/registry/fingerprints-store.js +111 -0
- package/dist/registry/interpolate.d.ts +58 -0
- package/dist/registry/interpolate.js +121 -0
- package/dist/registry/loader.d.ts +2 -2
- package/dist/registry/loader.js +22 -1
- package/dist/registry/tofu-gate.d.ts +41 -0
- package/dist/registry/tofu-gate.js +189 -0
- package/dist/registry/tofu.d.ts +111 -0
- package/dist/registry/tofu.js +173 -0
- package/dist/registry/types.d.ts +9 -1
- package/package.json +3 -1
- package/profiles/bst-internal-no-codex.yaml +5 -0
- package/profiles/bst-internal.yaml +7 -0
- package/scripts/tarball-smoke.sh +197 -0
|
@@ -0,0 +1,289 @@
|
|
|
1
|
+
/**
|
|
2
|
+
* Audit rotation (G1). Size- and age-based rotation for `.rea/audit.jsonl`
|
|
3
|
+
* that preserves hash-chain continuity across the rotation boundary.
|
|
4
|
+
*
|
|
5
|
+
* ## Triggers
|
|
6
|
+
*
|
|
7
|
+
* Rotation fires when EITHER threshold is crossed:
|
|
8
|
+
*
|
|
9
|
+
* - `max_bytes` — the current `audit.jsonl` is at or above this many bytes.
|
|
10
|
+
* Default when the policy block is present but `max_bytes` is unset:
|
|
11
|
+
* `DEFAULT_MAX_BYTES` (50 MiB).
|
|
12
|
+
* - `max_age_days` — the first record's `timestamp` is older than this many
|
|
13
|
+
* days. Default when unset: `DEFAULT_MAX_AGE_DAYS` (30).
|
|
14
|
+
*
|
|
15
|
+
* Back-compat: if the `audit.rotation` policy block is ABSENT entirely,
|
|
16
|
+
* rotation is DISABLED. Defaults only apply when the operator has opted in
|
|
17
|
+
* by declaring the block (even empty). This is deliberate — we do not want
|
|
18
|
+
* a 0.2.x install to observe new file-movement behavior on 0.3.0 upgrade
|
|
19
|
+
* without being asked.
|
|
20
|
+
*
|
|
21
|
+
* ## Rotation marker
|
|
22
|
+
*
|
|
23
|
+
* On rotation, the current file is renamed to `audit-YYYYMMDD-HHMMSS.jsonl`
|
|
24
|
+
* in the same directory. A fresh `audit.jsonl` is created containing EXACTLY
|
|
25
|
+
* one record: a rotation marker.
|
|
26
|
+
*
|
|
27
|
+
* tool_name: 'audit.rotation'
|
|
28
|
+
* server_name: 'rea'
|
|
29
|
+
* status: 'allowed'
|
|
30
|
+
* tier: 'read'
|
|
31
|
+
* autonomy_level: 'system'
|
|
32
|
+
* prev_hash: hash of the LAST record in the rotated file
|
|
33
|
+
* metadata.rotated_from: the rotated filename (basename)
|
|
34
|
+
* metadata.rotated_at: ISO-8601 instant of rotation
|
|
35
|
+
*
|
|
36
|
+
* The marker's `prev_hash` is the chain bridge — an operator verifying the
|
|
37
|
+
* chain with `rea audit verify --since <rotated-file>` walks rotated →
|
|
38
|
+
* marker → current and every transition must line up.
|
|
39
|
+
*
|
|
40
|
+
* ## Concurrency
|
|
41
|
+
*
|
|
42
|
+
* `maybeRotate` is called BEFORE the per-append lock is acquired. It takes
|
|
43
|
+
* its own short-lived lock on `.rea/` to perform the rename + marker write
|
|
44
|
+
* atomically. Callers that beat the rotator to the lock simply append to
|
|
45
|
+
* the (now fresh) file — correctness is preserved because the rotation
|
|
46
|
+
* marker is a legitimate chain anchor.
|
|
47
|
+
*/
|
|
48
|
+
import fs from 'node:fs/promises';
|
|
49
|
+
import path from 'node:path';
|
|
50
|
+
import { Tier, InvocationStatus } from '../../policy/types.js';
|
|
51
|
+
import { computeHash, readLastRecord, withAuditLock } from '../../audit/fs.js';
|
|
52
|
+
/** 50 MiB. Only applied when the operator has declared `audit.rotation`. */
|
|
53
|
+
export const DEFAULT_MAX_BYTES = 50 * 1024 * 1024;
|
|
54
|
+
/** 30 days. Only applied when the operator has declared `audit.rotation`. */
|
|
55
|
+
export const DEFAULT_MAX_AGE_DAYS = 30;
|
|
56
|
+
export const ROTATION_TOOL_NAME = 'audit.rotation';
|
|
57
|
+
export const ROTATION_SERVER_NAME = 'rea';
|
|
58
|
+
/**
 * Compute the effective rotation thresholds from policy.
 *
 * - If `policy.audit.rotation` is absent (`undefined`) or `null`, BOTH
 *   thresholds are `undefined` and rotation is disabled (back-compat with
 *   0.2.x — no file movement unless the operator asked for it).
 * - If the block is declared as an object (even `{}`), any knob that is
 *   missing falls back to `DEFAULT_MAX_BYTES` / `DEFAULT_MAX_AGE_DAYS`.
 *
 * @param {object | undefined} policy Loaded policy; may be undefined.
 * @returns {{ maxBytes: number | undefined, maxAgeMs: number | undefined }}
 *   `maxAgeMs` is `max_age_days` converted to milliseconds.
 */
function effectiveThresholds(policy) {
    const rot = policy?.audit?.rotation;
    // `== null` matches both undefined and null. A null block (e.g. a bare
    // `rotation:` key with no value in a policy file) previously crashed on
    // `rot.max_bytes`; treat it like an absent block instead.
    if (rot == null) {
        return { maxBytes: undefined, maxAgeMs: undefined };
    }
    // An explicit `audit.rotation: {}` block opts in to both defaults.
    const maxBytes = rot.max_bytes ?? DEFAULT_MAX_BYTES;
    const maxAgeDays = rot.max_age_days ?? DEFAULT_MAX_AGE_DAYS;
    return { maxBytes, maxAgeMs: maxAgeDays * 24 * 60 * 60 * 1000 };
}
|
|
76
|
+
/**
 * Build the rotated-file basename for a rotation instant.
 *
 * Format: `audit-YYYYMMDD-HHMMSS.jsonl`, rendered in UTC so names sort
 * chronologically regardless of host timezone. This function does not
 * de-duplicate names; it always returns the un-suffixed basename.
 *
 * @param {Date} at Rotation instant.
 * @returns {string} Basename (no directory component).
 * @throws {RangeError} If `at` is an Invalid Date — otherwise the name would
 *   silently become `audit-NaNNaNNaN-NaNNaNNaN.jsonl`.
 */
export function rotationFilename(at) {
    if (Number.isNaN(at.getTime())) {
        throw new RangeError('rotationFilename: invalid Date');
    }
    const pad = (value, width) => String(value).padStart(width, '0');
    const datePart = [
        pad(at.getUTCFullYear(), 4),
        pad(at.getUTCMonth() + 1, 2), // getUTCMonth() is 0-based
        pad(at.getUTCDate(), 2),
    ].join('');
    const timePart = [
        pad(at.getUTCHours(), 2),
        pad(at.getUTCMinutes(), 2),
        pad(at.getUTCSeconds(), 2),
    ].join('');
    return `audit-${datePart}-${timePart}.jsonl`;
}
|
|
90
|
+
/**
 * Probe the first record's timestamp WITHOUT reading the whole file.
 *
 * Reads at most the first 64 KiB, takes everything up to the first newline
 * (or the whole chunk if there is none), parses it as JSON and returns its
 * `timestamp` field as a `Date`.
 *
 * @param {string} auditFile Path of the audit log to probe.
 * @returns {Promise<Date | undefined>} The first record's timestamp, or
 *   `undefined` when the file cannot be opened/read, is empty, starts with an
 *   empty line, or its first line is not a JSON object with a string
 *   `timestamp` that `Date.parse` accepts.
 */
async function readFirstTimestamp(auditFile) {
    let fh;
    try {
        fh = await fs.open(auditFile, 'r');
        // 64 KiB is enough for the first record under any realistic schema.
        const buf = Buffer.alloc(64 * 1024);
        const { bytesRead } = await fh.read(buf, 0, buf.length, 0);
        if (bytesRead === 0) {
            return undefined;
        }
        // `subarray` replaces the deprecated `Buffer#slice`; both return a
        // view over the same memory.
        const chunk = buf.subarray(0, bytesRead).toString('utf8');
        const newline = chunk.indexOf('\n');
        const firstLine = newline === -1 ? chunk : chunk.slice(0, newline);
        if (firstLine.length === 0) {
            return undefined;
        }
        const parsed = JSON.parse(firstLine);
        // Valid JSON is not necessarily an object (`null`, numbers, ...).
        if (parsed === null || typeof parsed !== 'object') {
            return undefined;
        }
        if (typeof parsed.timestamp !== 'string') {
            return undefined;
        }
        const ts = Date.parse(parsed.timestamp);
        return Number.isNaN(ts) ? undefined : new Date(ts);
    }
    catch {
        // Unreadable file or malformed first line: report "unknown age".
        return undefined;
    }
    finally {
        if (fh) {
            await fh.close();
        }
    }
}
|
|
126
|
+
/**
 * Report whether the audit file has crossed a rotation threshold.
 * Exported for testing.
 *
 * Answers `false` when both thresholds are undefined, when the path is
 * missing or not a regular file, or when the file is empty. Otherwise answers
 * `true` if the size is at or above `maxBytes`, or if the first record's
 * timestamp is at least `maxAgeMs` older than `now`. `stat` errors other than
 * ENOENT are rethrown.
 *
 * @param {string} auditFile Path of the audit log.
 * @param {{ maxBytes?: number, maxAgeMs?: number }} thresholds
 * @param {Date} [now] Reference instant for the age check.
 * @returns {Promise<boolean>}
 */
export async function shouldRotate(auditFile, thresholds, now = new Date()) {
    const { maxBytes, maxAgeMs } = thresholds;
    if (maxBytes === undefined && maxAgeMs === undefined) {
        return false;
    }
    let info;
    try {
        info = await fs.stat(auditFile);
    }
    catch (err) {
        if (err.code === 'ENOENT') {
            return false;
        }
        throw err;
    }
    // Empty files never rotate — rotating an empty file would create a chain
    // anchored on genesis with a dangling predecessor.
    if (!info.isFile() || info.size === 0) {
        return false;
    }
    if (maxBytes !== undefined && info.size >= maxBytes) {
        return true;
    }
    if (maxAgeMs === undefined) {
        return false;
    }
    const oldest = await readFirstTimestamp(auditFile);
    if (oldest === undefined) {
        return false;
    }
    return now.getTime() - oldest.getTime() >= maxAgeMs;
}
|
|
163
|
+
/**
 * Choose a rotated-file path inside `reaDir` that does not exist yet.
 *
 * Tries the plain `rotationFilename(at)` name first, then `-1`, `-2`, ...
 * suffixes before the `.jsonl` extension. Gives up with an Error once the
 * plain name and suffixes 1 through 999 are all taken. `fs.access` errors
 * other than ENOENT are rethrown.
 *
 * @param {string} reaDir Directory that holds the audit files.
 * @param {Date} at Rotation instant used to derive the name.
 * @returns {Promise<string>} Path of the first free candidate.
 */
async function pickRotationPath(reaDir, at) {
    const filename = rotationFilename(at);
    const stem = filename.replace(/\.jsonl$/, '');
    let attempt = path.join(reaDir, filename);
    for (let n = 1;; n += 1) {
        const taken = await fs.access(attempt).then(() => true, (err) => {
            if (err.code === 'ENOENT') {
                return false;
            }
            throw err;
        });
        if (!taken) {
            return attempt;
        }
        attempt = path.join(reaDir, `${stem}-${n}.jsonl`);
        if (n >= 1000) {
            throw new Error(`Unable to pick rotation filename in ${reaDir} — 1000 collisions`);
        }
    }
}
|
|
189
|
+
/**
 * Perform the rotation unconditionally (thresholds are NOT consulted).
 *
 * Runs inside `withAuditLock(auditFile, ...)`:
 *   1. re-stat the file (another writer may have rotated already);
 *   2. read the last record's hash via `readLastRecord`;
 *   3. build the rotation marker whose `prev_hash` is that tail hash;
 *   4. rename `auditFile` to a free `audit-YYYYMMDD-HHMMSS[-N].jsonl` name;
 *   5. write the marker as the only line of a fresh `auditFile`.
 *
 * The marker is built BEFORE the rename so that anything that can throw while
 * building it (e.g. `toISOString()` on an Invalid Date, hashing) fails while
 * the original file is still in place. If writing the marker fails after the
 * rename, the rename is rolled back so the audit log is not left missing.
 *
 * @param {string} auditFile Path of the live audit log.
 * @param {Date} [now] Rotation instant; used for the rotated filename and the
 *   marker's `timestamp` / `metadata.rotated_at`.
 * @returns {Promise<{ rotated: false } | { rotated: true, rotatedTo: string }>}
 *   `{ rotated: false }` when the path is missing, not a regular file, or
 *   empty.
 * @throws Propagates stat errors other than ENOENT and any error from the
 *   lock, tail read, rename or marker write.
 */
export async function performRotation(auditFile, now = new Date()) {
    const reaDir = path.dirname(auditFile);
    // Ensure the parent exists so withAuditLock can place a lock file. The
    // caller normally creates this; we mkdir defensively for the force-rotate
    // path (`rea audit rotate` on a green-field install).
    await fs.mkdir(reaDir, { recursive: true });
    return withAuditLock(auditFile, async () => {
        // Re-check the file under the lock. Another writer may have rotated
        // between the caller's decision and our lock acquisition.
        let size;
        try {
            const stat = await fs.stat(auditFile);
            if (!stat.isFile()) {
                return { rotated: false };
            }
            size = stat.size;
        }
        catch (err) {
            if (err.code === 'ENOENT') {
                return { rotated: false };
            }
            throw err;
        }
        if (size === 0) {
            return { rotated: false };
        }
        // Pull the last record's hash BEFORE renaming so the marker's
        // prev_hash can be anchored on the old chain's tail.
        const { hash: tailHash } = await readLastRecord(auditFile);
        const rotatedPath = await pickRotationPath(reaDir, now);
        // Build the complete marker line up front; nothing below this point
        // except filesystem calls can fail once the rename has happened.
        const rotatedAt = now.toISOString();
        const markerBase = {
            timestamp: rotatedAt,
            session_id: 'system',
            tool_name: ROTATION_TOOL_NAME,
            server_name: ROTATION_SERVER_NAME,
            tier: Tier.Read,
            status: InvocationStatus.Allowed,
            autonomy_level: 'system',
            duration_ms: 0,
            prev_hash: tailHash,
            metadata: {
                rotated_from: path.basename(rotatedPath),
                rotated_at: rotatedAt,
            },
        };
        const marker = { ...markerBase, hash: computeHash(markerBase) };
        const line = JSON.stringify(marker) + '\n';
        await fs.rename(auditFile, rotatedPath);
        try {
            await fs.writeFile(auditFile, line, { flag: 'w' });
        }
        catch (err) {
            // Roll back: put the original log back under its live name so the
            // next append still sees the old chain tail. This also replaces
            // any partially written marker file.
            try {
                await fs.rename(rotatedPath, auditFile);
            }
            catch (rollbackErr) {
                console.error('[rea] AUDIT ROTATION ROLLBACK FAILED:', rollbackErr instanceof Error ? rollbackErr.message : String(rollbackErr));
            }
            throw err;
        }
        return { rotated: true, rotatedTo: rotatedPath };
    });
}
|
|
252
|
+
/**
 * Threshold-driven rotation entry point.
 *
 * Derives thresholds from `policy`, asks `shouldRotate` whether one has been
 * crossed, and only then delegates to `performRotation`.
 *
 * Never throws: any error is written to stderr and reported as
 * `{ rotated: false }`, so a failing rotator cannot fail the caller.
 *
 * @param {string} auditFile Path of the live audit log.
 * @param {object | undefined} policy Loaded policy.
 * @param {Date} [now] Reference instant for the age check and the rotation.
 * @returns {Promise<{ rotated: boolean, rotatedTo?: string }>}
 */
export async function maybeRotate(auditFile, policy, now = new Date()) {
    try {
        const limits = effectiveThresholds(policy);
        const rotationDisabled = limits.maxBytes === undefined && limits.maxAgeMs === undefined;
        if (rotationDisabled) {
            return { rotated: false };
        }
        const isDue = await shouldRotate(auditFile, limits, now);
        return isDue ? await performRotation(auditFile, now) : { rotated: false };
    }
    catch (err) {
        const detail = err instanceof Error ? err.message : String(err);
        console.error('[rea] AUDIT ROTATION FAILED:', detail);
        return { rotated: false };
    }
}
|
|
276
|
+
/**
 * Force a rotation regardless of thresholds (used by `rea audit rotate`).
 *
 * Delegates straight to `performRotation`, so a missing or empty audit file
 * is still a no-op (`{ rotated: false }`) and errors propagate to the caller.
 *
 * @param {string} auditFile Path of the live audit log.
 * @param {Date} [now] Rotation instant.
 * @returns {Promise<{ rotated: boolean, rotatedTo?: string }>}
 */
export async function forceRotate(auditFile, now = new Date()) {
    const outcome = await performRotation(auditFile, now);
    return outcome;
}
|
|
285
|
+
/**
|
|
286
|
+
* Exposed for tests/callers that already know the policy shape. Tests that
|
|
287
|
+
* want to stub thresholds can call `performRotation` directly.
|
|
288
|
+
*/
|
|
289
|
+
export { effectiveThresholds as _effectiveThresholds };
|
|
@@ -1,9 +1,24 @@
|
|
|
1
1
|
export type CircuitState = 'closed' | 'open' | 'half-open';
|
|
2
|
+
/**
|
|
3
|
+
* Callback invoked on every circuit state transition (G5). The constructor
|
|
4
|
+
* can wire this to a structured logger and/or a metrics gauge so state
|
|
5
|
+
* changes are observable without requiring the breaker itself to depend on
|
|
6
|
+
* those modules.
|
|
7
|
+
*/
|
|
8
|
+
export type CircuitStateChangeListener = (event: {
|
|
9
|
+
server: string;
|
|
10
|
+
from: CircuitState;
|
|
11
|
+
to: CircuitState;
|
|
12
|
+
reason: 'failure_threshold' | 'cooldown_elapsed' | 'recovered' | 'half_open_failed';
|
|
13
|
+
retryAt?: string;
|
|
14
|
+
}) => void;
|
|
2
15
|
export interface CircuitBreakerOptions {
|
|
3
16
|
/** Consecutive failures before opening the circuit. Default: 5 */
|
|
4
17
|
failureThreshold?: number;
|
|
5
18
|
/** Milliseconds to wait in open state before moving to half-open. Default: 30_000 */
|
|
6
19
|
cooldownMs?: number;
|
|
20
|
+
/** Optional listener for state transitions. See {@link CircuitStateChangeListener}. */
|
|
21
|
+
onStateChange?: CircuitStateChangeListener;
|
|
7
22
|
}
|
|
8
23
|
export interface CircuitStatus {
|
|
9
24
|
state: CircuitState;
|
|
@@ -29,7 +44,9 @@ interface CircuitEntry {
|
|
|
29
44
|
export declare class CircuitBreaker {
|
|
30
45
|
private circuits;
|
|
31
46
|
private defaultOptions;
|
|
47
|
+
private readonly onStateChange;
|
|
32
48
|
constructor(defaults?: CircuitBreakerOptions);
|
|
49
|
+
private notify;
|
|
33
50
|
private getOrCreate;
|
|
34
51
|
/**
|
|
35
52
|
* Returns null if the call may proceed, or a CircuitStatus if the circuit is open.
|
|
@@ -10,11 +10,23 @@
|
|
|
10
10
|
export class CircuitBreaker {
|
|
11
11
|
circuits = new Map();
|
|
12
12
|
defaultOptions;
|
|
13
|
+
onStateChange;
|
|
13
14
|
constructor(defaults = {}) {
|
|
14
15
|
this.defaultOptions = {
|
|
15
16
|
failureThreshold: defaults.failureThreshold ?? 5,
|
|
16
17
|
cooldownMs: defaults.cooldownMs ?? 30_000,
|
|
17
18
|
};
|
|
19
|
+
this.onStateChange = defaults.onStateChange;
|
|
20
|
+
}
|
|
21
|
+
notify(event) {
|
|
22
|
+
if (this.onStateChange === undefined)
|
|
23
|
+
return;
|
|
24
|
+
try {
|
|
25
|
+
this.onStateChange(event);
|
|
26
|
+
}
|
|
27
|
+
catch {
|
|
28
|
+
// Listeners must never break the breaker. Swallow.
|
|
29
|
+
}
|
|
18
30
|
}
|
|
19
31
|
getOrCreate(serverName) {
|
|
20
32
|
let entry = this.circuits.get(serverName);
|
|
@@ -43,7 +55,12 @@ export class CircuitBreaker {
|
|
|
43
55
|
if (elapsed >= entry.cooldownMs) {
|
|
44
56
|
entry.state = 'half-open';
|
|
45
57
|
entry.consecutiveFailures = 0;
|
|
46
|
-
|
|
58
|
+
this.notify({
|
|
59
|
+
server: serverName,
|
|
60
|
+
from: 'open',
|
|
61
|
+
to: 'half-open',
|
|
62
|
+
reason: 'cooldown_elapsed',
|
|
63
|
+
});
|
|
47
64
|
return null;
|
|
48
65
|
}
|
|
49
66
|
const retryAt = new Date((entry.openedAt ?? 0) + entry.cooldownMs).toISOString();
|
|
@@ -61,7 +78,12 @@ export class CircuitBreaker {
|
|
|
61
78
|
entry.state = 'closed';
|
|
62
79
|
entry.consecutiveFailures = 0;
|
|
63
80
|
entry.openedAt = null;
|
|
64
|
-
|
|
81
|
+
this.notify({
|
|
82
|
+
server: serverName,
|
|
83
|
+
from: 'half-open',
|
|
84
|
+
to: 'closed',
|
|
85
|
+
reason: 'recovered',
|
|
86
|
+
});
|
|
65
87
|
}
|
|
66
88
|
else if (entry.state === 'closed') {
|
|
67
89
|
entry.consecutiveFailures = 0;
|
|
@@ -71,13 +93,20 @@ export class CircuitBreaker {
|
|
|
71
93
|
const entry = this.getOrCreate(serverName);
|
|
72
94
|
if (entry.state === 'open')
|
|
73
95
|
return;
|
|
96
|
+
const previous = entry.state;
|
|
74
97
|
entry.consecutiveFailures++;
|
|
75
98
|
const shouldOpen = entry.state === 'half-open' || entry.consecutiveFailures >= entry.failureThreshold;
|
|
76
99
|
if (shouldOpen) {
|
|
77
100
|
entry.state = 'open';
|
|
78
101
|
entry.openedAt = Date.now();
|
|
79
102
|
const retryAt = new Date(entry.openedAt + entry.cooldownMs).toISOString();
|
|
80
|
-
|
|
103
|
+
this.notify({
|
|
104
|
+
server: serverName,
|
|
105
|
+
from: previous,
|
|
106
|
+
to: 'open',
|
|
107
|
+
reason: previous === 'half-open' ? 'half_open_failed' : 'failure_threshold',
|
|
108
|
+
retryAt,
|
|
109
|
+
});
|
|
81
110
|
}
|
|
82
111
|
}
|
|
83
112
|
getCircuit(serverName) {
|
|
@@ -7,6 +7,7 @@
|
|
|
7
7
|
*/
|
|
8
8
|
import { DownstreamConnection, type DownstreamToolInfo } from './downstream.js';
|
|
9
9
|
import type { Registry } from '../registry/types.js';
|
|
10
|
+
import type { Logger } from './log.js';
|
|
10
11
|
export interface PrefixedTool extends DownstreamToolInfo {
|
|
11
12
|
/** Server name, not prefixed. */
|
|
12
13
|
server: string;
|
|
@@ -15,7 +16,7 @@ export interface PrefixedTool extends DownstreamToolInfo {
|
|
|
15
16
|
}
|
|
16
17
|
export declare class DownstreamPool {
|
|
17
18
|
private readonly connections;
|
|
18
|
-
constructor(registry: Registry);
|
|
19
|
+
constructor(registry: Registry, logger?: Logger);
|
|
19
20
|
get size(): number;
|
|
20
21
|
connectAll(): Promise<void>;
|
|
21
22
|
/**
|
|
@@ -8,11 +8,11 @@
|
|
|
8
8
|
import { DownstreamConnection } from './downstream.js';
|
|
9
9
|
export class DownstreamPool {
|
|
10
10
|
connections = new Map();
|
|
11
|
-
constructor(registry) {
|
|
11
|
+
constructor(registry, logger) {
|
|
12
12
|
for (const server of registry.servers) {
|
|
13
13
|
if (!server.enabled)
|
|
14
14
|
continue;
|
|
15
|
-
this.connections.set(server.name, new DownstreamConnection(server));
|
|
15
|
+
this.connections.set(server.name, new DownstreamConnection(server, logger));
|
|
16
16
|
}
|
|
17
17
|
}
|
|
18
18
|
get size() {
|
|
@@ -36,6 +36,7 @@
|
|
|
36
36
|
* a transport error could double-post. We leave the decision to the caller.
|
|
37
37
|
*/
|
|
38
38
|
import type { RegistryServer } from '../registry/types.js';
|
|
39
|
+
import type { Logger } from './log.js';
|
|
39
40
|
export interface DownstreamToolInfo {
|
|
40
41
|
name: string;
|
|
41
42
|
description?: string;
|
|
@@ -43,15 +44,44 @@ export interface DownstreamToolInfo {
|
|
|
43
44
|
}
|
|
44
45
|
/**
|
|
45
46
|
* Build the child env by layering:
|
|
46
|
-
* allowlist → registry env_passthrough → registry env.
|
|
47
|
+
* allowlist → registry env_passthrough → interpolated registry env.
|
|
47
48
|
* Later entries win. Missing host values are skipped so `process.env[name]`
|
|
48
49
|
* being undefined does not serialize as the literal string "undefined".
|
|
49
50
|
*
|
|
51
|
+
* The explicit `env:` map may contain `${VAR}` placeholders (see
|
|
52
|
+
* `registry/interpolate.ts` for the exact grammar). Placeholders referencing
|
|
53
|
+
* unset host vars are returned via the `missing` array — the caller MUST
|
|
54
|
+
* refuse to spawn the server if `missing.length > 0`, otherwise the child
|
|
55
|
+
* receives unresolved `${...}` strings which are nearly always wrong.
|
|
56
|
+
*
|
|
50
57
|
* Exported for testing.
|
|
51
58
|
*/
|
|
52
|
-
export
|
|
59
|
+
export interface BuiltChildEnv {
|
|
60
|
+
/** Fully resolved env to pass to the child transport. */
|
|
61
|
+
env: Record<string, string>;
|
|
62
|
+
/**
|
|
63
|
+
* Names of `${VAR}` references that were not set in `hostEnv`. When
|
|
64
|
+
* non-empty, the caller MUST NOT spawn the child — mark the connection
|
|
65
|
+
* unhealthy and log each entry.
|
|
66
|
+
*/
|
|
67
|
+
missing: string[];
|
|
68
|
+
/**
|
|
69
|
+
* Keys in `env` whose value is secret-bearing (either because the key
|
|
70
|
+
* name matches the secret-name heuristic, or because one of its
|
|
71
|
+
* interpolated `${VAR}` references did). Callers MUST NOT log the
|
|
72
|
+
* corresponding values.
|
|
73
|
+
*/
|
|
74
|
+
secretKeys: string[];
|
|
75
|
+
}
|
|
76
|
+
export declare function buildChildEnv(config: RegistryServer, hostEnv?: NodeJS.ProcessEnv): BuiltChildEnv;
|
|
53
77
|
export declare class DownstreamConnection {
|
|
54
78
|
private readonly config;
|
|
79
|
+
/**
|
|
80
|
+
* Optional structured logger (G5). When omitted, connection lifecycle
|
|
81
|
+
* events are simply not logged — keeping the class usable in unit tests
|
|
82
|
+
* that don't care about observability.
|
|
83
|
+
*/
|
|
84
|
+
private readonly logger?;
|
|
55
85
|
private client;
|
|
56
86
|
/**
|
|
57
87
|
* Whether a reconnect has already been attempted in the CURRENT failure
|
|
@@ -63,7 +93,13 @@ export declare class DownstreamConnection {
|
|
|
63
93
|
/** Epoch ms of the last successful reconnect. Used by the flapping guard. */
|
|
64
94
|
private lastReconnectAt;
|
|
65
95
|
private health;
|
|
66
|
-
constructor(config: RegistryServer
|
|
96
|
+
constructor(config: RegistryServer,
|
|
97
|
+
/**
|
|
98
|
+
* Optional structured logger (G5). When omitted, connection lifecycle
|
|
99
|
+
* events are simply not logged — keeping the class usable in unit tests
|
|
100
|
+
* that don't care about observability.
|
|
101
|
+
*/
|
|
102
|
+
logger?: Logger | undefined);
|
|
67
103
|
get name(): string;
|
|
68
104
|
get isHealthy(): boolean;
|
|
69
105
|
connect(): Promise<void>;
|
|
@@ -37,6 +37,7 @@
|
|
|
37
37
|
*/
|
|
38
38
|
import { Client } from '@modelcontextprotocol/sdk/client/index.js';
|
|
39
39
|
import { StdioClientTransport } from '@modelcontextprotocol/sdk/client/stdio.js';
|
|
40
|
+
import { interpolateEnv } from '../registry/interpolate.js';
|
|
40
41
|
/**
|
|
41
42
|
* Neutral env vars every child inherits. These are the ones shells/toolchains
|
|
42
43
|
* need to function but carry no secrets in a well-configured environment.
|
|
@@ -66,14 +67,6 @@ const DEFAULT_ENV_ALLOWLIST = [
|
|
|
66
67
|
* handle it.
|
|
67
68
|
*/
|
|
68
69
|
const RECONNECT_FLAP_WINDOW_MS = 30_000;
|
|
69
|
-
/**
|
|
70
|
-
* Build the child env by layering:
|
|
71
|
-
* allowlist → registry env_passthrough → registry env.
|
|
72
|
-
* Later entries win. Missing host values are skipped so `process.env[name]`
|
|
73
|
-
* being undefined does not serialize as the literal string "undefined".
|
|
74
|
-
*
|
|
75
|
-
* Exported for testing.
|
|
76
|
-
*/
|
|
77
70
|
export function buildChildEnv(config, hostEnv = process.env) {
|
|
78
71
|
const out = {};
|
|
79
72
|
for (const name of DEFAULT_ENV_ALLOWLIST) {
|
|
@@ -88,14 +81,21 @@ export function buildChildEnv(config, hostEnv = process.env) {
|
|
|
88
81
|
out[name] = v;
|
|
89
82
|
}
|
|
90
83
|
}
|
|
84
|
+
// Interpolate placeholders in config.env BEFORE layering it on top.
|
|
85
|
+
// `interpolateEnv` is pure — no I/O, throws only on malformed syntax
|
|
86
|
+
// (unterminated brace, empty `${}`, illegal var name). Missing host
|
|
87
|
+
// vars are reported via `result.missing`; the caller decides whether
|
|
88
|
+
// to refuse the spawn.
|
|
89
|
+
const interp = interpolateEnv(config.env, hostEnv);
|
|
91
90
|
// Explicit config.env wins — operator typed these values deliberately.
|
|
92
|
-
for (const [k, v] of Object.entries(
|
|
91
|
+
for (const [k, v] of Object.entries(interp.resolved)) {
|
|
93
92
|
out[k] = v;
|
|
94
93
|
}
|
|
95
|
-
return out;
|
|
94
|
+
return { env: out, missing: interp.missing, secretKeys: interp.secretKeys };
|
|
96
95
|
}
|
|
97
96
|
export class DownstreamConnection {
|
|
98
97
|
config;
|
|
98
|
+
logger;
|
|
99
99
|
client = null;
|
|
100
100
|
/**
|
|
101
101
|
* Whether a reconnect has already been attempted in the CURRENT failure
|
|
@@ -107,8 +107,15 @@ export class DownstreamConnection {
|
|
|
107
107
|
/** Epoch ms of the last successful reconnect. Used by the flapping guard. */
|
|
108
108
|
lastReconnectAt = 0;
|
|
109
109
|
health = 'healthy';
|
|
110
|
-
constructor(config
|
|
110
|
+
constructor(config,
|
|
111
|
+
/**
|
|
112
|
+
* Optional structured logger (G5). When omitted, connection lifecycle
|
|
113
|
+
* events are simply not logged — keeping the class usable in unit tests
|
|
114
|
+
* that don't care about observability.
|
|
115
|
+
*/
|
|
116
|
+
logger) {
|
|
111
117
|
this.config = config;
|
|
118
|
+
this.logger = logger;
|
|
112
119
|
}
|
|
113
120
|
get name() {
|
|
114
121
|
return this.config.name;
|
|
@@ -119,10 +126,40 @@ export class DownstreamConnection {
|
|
|
119
126
|
async connect() {
|
|
120
127
|
if (this.client !== null)
|
|
121
128
|
return;
|
|
129
|
+
// Resolve env BEFORE spawning. If any `${VAR}` reference in the registry's
|
|
130
|
+
// explicit env: map is unset at startup, refuse to spawn this server:
|
|
131
|
+
// - log a clear, secret-safe error (only the var name appears; the
|
|
132
|
+
// resolved value would not exist anyway since it's missing)
|
|
133
|
+
// - mark this connection unhealthy so the pool skips it
|
|
134
|
+
// - leave every other server's spawn path untouched (the gateway as a
|
|
135
|
+
// whole keeps coming up)
|
|
136
|
+
//
|
|
137
|
+
// Malformed syntax (unterminated brace, `${}`, illegal identifier) throws
|
|
138
|
+
// from interpolateEnv — that's a load-time error and we propagate it so
|
|
139
|
+
// the operator sees it at startup with server context attached.
|
|
140
|
+
let built;
|
|
141
|
+
try {
|
|
142
|
+
built = buildChildEnv(this.config);
|
|
143
|
+
}
|
|
144
|
+
catch (err) {
|
|
145
|
+
this.health = 'unhealthy';
|
|
146
|
+
throw new Error(`failed to resolve env for downstream "${this.config.name}": ${err instanceof Error ? err.message : err}`);
|
|
147
|
+
}
|
|
148
|
+
if (built.missing.length > 0) {
|
|
149
|
+
this.health = 'unhealthy';
|
|
150
|
+
// One line per missing var so grep/jq users can find the exact gap.
|
|
151
|
+
// We intentionally do NOT log the env key name's VALUE (there is none —
|
|
152
|
+
// it's unresolved) nor any other env values.
|
|
153
|
+
for (const missingVar of built.missing) {
|
|
154
|
+
console.error(`[rea-gateway] refusing to start downstream "${this.config.name}": ` +
|
|
155
|
+
`env references ${'${'}${missingVar}${'}'} but process.env.${missingVar} is not set`);
|
|
156
|
+
}
|
|
157
|
+
throw new Error(`downstream "${this.config.name}" refused to start — missing env: ${built.missing.join(', ')}`);
|
|
158
|
+
}
|
|
122
159
|
const transport = new StdioClientTransport({
|
|
123
160
|
command: this.config.command,
|
|
124
161
|
args: this.config.args,
|
|
125
|
-
env:
|
|
162
|
+
env: built.env,
|
|
126
163
|
});
|
|
127
164
|
const client = new Client({ name: `rea-gateway-client:${this.config.name}`, version: '0.2.0' }, { capabilities: {} });
|
|
128
165
|
try {
|
|
@@ -157,11 +194,16 @@ export class DownstreamConnection {
|
|
|
157
194
|
}
|
|
158
195
|
catch (err) {
|
|
159
196
|
const message = err instanceof Error ? err.message : String(err);
|
|
160
|
-
const withinFlapWindow = this.lastReconnectAt !== 0 &&
|
|
161
|
-
Date.now() - this.lastReconnectAt < RECONNECT_FLAP_WINDOW_MS;
|
|
197
|
+
const withinFlapWindow = this.lastReconnectAt !== 0 && Date.now() - this.lastReconnectAt < RECONNECT_FLAP_WINDOW_MS;
|
|
162
198
|
if (!this.reconnectAttempted && !withinFlapWindow) {
|
|
163
199
|
this.reconnectAttempted = true;
|
|
164
200
|
this.health = 'degraded';
|
|
201
|
+
this.logger?.warn({
|
|
202
|
+
event: 'downstream.reconnect_attempt',
|
|
203
|
+
server_name: this.config.name,
|
|
204
|
+
message: `downstream "${this.config.name}" will reconnect once after error`,
|
|
205
|
+
reason: message,
|
|
206
|
+
});
|
|
165
207
|
try {
|
|
166
208
|
await this.close();
|
|
167
209
|
await this.connect();
|
|
@@ -170,14 +212,31 @@ export class DownstreamConnection {
|
|
|
170
212
|
// stamp the reconnect time so flap-guard can refuse rapid repeats.
|
|
171
213
|
this.reconnectAttempted = false;
|
|
172
214
|
this.lastReconnectAt = Date.now();
|
|
215
|
+
this.logger?.info({
|
|
216
|
+
event: 'downstream.reconnected',
|
|
217
|
+
server_name: this.config.name,
|
|
218
|
+
message: `downstream "${this.config.name}" reconnected successfully`,
|
|
219
|
+
});
|
|
173
220
|
return result;
|
|
174
221
|
}
|
|
175
222
|
catch (reconnectErr) {
|
|
176
223
|
this.health = 'unhealthy';
|
|
224
|
+
this.logger?.error({
|
|
225
|
+
event: 'downstream.reconnect_failed',
|
|
226
|
+
server_name: this.config.name,
|
|
227
|
+
message: `downstream "${this.config.name}" unhealthy after one reconnect`,
|
|
228
|
+
error: reconnectErr instanceof Error ? reconnectErr.message : String(reconnectErr),
|
|
229
|
+
});
|
|
177
230
|
throw new Error(`downstream "${this.config.name}" unhealthy after one reconnect: ${reconnectErr instanceof Error ? reconnectErr.message : reconnectErr}`);
|
|
178
231
|
}
|
|
179
232
|
}
|
|
180
233
|
this.health = 'unhealthy';
|
|
234
|
+
this.logger?.error({
|
|
235
|
+
event: 'downstream.call_failed',
|
|
236
|
+
server_name: this.config.name,
|
|
237
|
+
message: `downstream "${this.config.name}" call failed`,
|
|
238
|
+
error: message,
|
|
239
|
+
});
|
|
181
240
|
throw new Error(`downstream "${this.config.name}" call failed: ${message}`);
|
|
182
241
|
}
|
|
183
242
|
}
|