@bookedsolid/rea 0.2.1 → 0.4.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (65) hide show
  1. package/.husky/pre-push +15 -18
  2. package/README.md +41 -1
  3. package/THREAT_MODEL.md +100 -29
  4. package/dist/audit/append.d.ts +21 -8
  5. package/dist/audit/append.js +48 -83
  6. package/dist/audit/fs.d.ts +68 -0
  7. package/dist/audit/fs.js +171 -0
  8. package/dist/cli/audit.d.ts +40 -0
  9. package/dist/cli/audit.js +205 -0
  10. package/dist/cli/doctor.d.ts +19 -4
  11. package/dist/cli/doctor.js +172 -5
  12. package/dist/cli/index.js +26 -1
  13. package/dist/cli/init.js +93 -7
  14. package/dist/cli/install/pre-push.d.ts +335 -0
  15. package/dist/cli/install/pre-push.js +2818 -0
  16. package/dist/cli/serve.d.ts +64 -0
  17. package/dist/cli/serve.js +270 -2
  18. package/dist/cli/status.d.ts +90 -0
  19. package/dist/cli/status.js +399 -0
  20. package/dist/cli/utils.d.ts +4 -0
  21. package/dist/cli/utils.js +4 -0
  22. package/dist/gateway/audit/rotator.d.ts +116 -0
  23. package/dist/gateway/audit/rotator.js +289 -0
  24. package/dist/gateway/circuit-breaker.d.ts +17 -0
  25. package/dist/gateway/circuit-breaker.js +32 -3
  26. package/dist/gateway/downstream-pool.d.ts +2 -1
  27. package/dist/gateway/downstream-pool.js +2 -2
  28. package/dist/gateway/downstream.d.ts +39 -3
  29. package/dist/gateway/downstream.js +73 -14
  30. package/dist/gateway/log.d.ts +122 -0
  31. package/dist/gateway/log.js +334 -0
  32. package/dist/gateway/middleware/audit.d.ts +24 -1
  33. package/dist/gateway/middleware/audit.js +103 -58
  34. package/dist/gateway/middleware/blocked-paths.d.ts +0 -9
  35. package/dist/gateway/middleware/blocked-paths.js +439 -67
  36. package/dist/gateway/middleware/injection.d.ts +218 -13
  37. package/dist/gateway/middleware/injection.js +433 -51
  38. package/dist/gateway/middleware/kill-switch.d.ts +10 -1
  39. package/dist/gateway/middleware/kill-switch.js +20 -1
  40. package/dist/gateway/observability/metrics.d.ts +125 -0
  41. package/dist/gateway/observability/metrics.js +321 -0
  42. package/dist/gateway/server.d.ts +19 -0
  43. package/dist/gateway/server.js +99 -15
  44. package/dist/policy/loader.d.ts +47 -0
  45. package/dist/policy/loader.js +47 -0
  46. package/dist/policy/profiles.d.ts +13 -0
  47. package/dist/policy/profiles.js +12 -0
  48. package/dist/policy/types.d.ts +52 -0
  49. package/dist/registry/fingerprint.d.ts +73 -0
  50. package/dist/registry/fingerprint.js +81 -0
  51. package/dist/registry/fingerprints-store.d.ts +62 -0
  52. package/dist/registry/fingerprints-store.js +111 -0
  53. package/dist/registry/interpolate.d.ts +58 -0
  54. package/dist/registry/interpolate.js +121 -0
  55. package/dist/registry/loader.d.ts +2 -2
  56. package/dist/registry/loader.js +22 -1
  57. package/dist/registry/tofu-gate.d.ts +41 -0
  58. package/dist/registry/tofu-gate.js +189 -0
  59. package/dist/registry/tofu.d.ts +111 -0
  60. package/dist/registry/tofu.js +173 -0
  61. package/dist/registry/types.d.ts +9 -1
  62. package/package.json +3 -1
  63. package/profiles/bst-internal-no-codex.yaml +5 -0
  64. package/profiles/bst-internal.yaml +7 -0
  65. package/scripts/tarball-smoke.sh +197 -0
@@ -0,0 +1,289 @@
1
+ /**
2
+ * Audit rotation (G1). Size- and age-based rotation for `.rea/audit.jsonl`
3
+ * that preserves hash-chain continuity across the rotation boundary.
4
+ *
5
+ * ## Triggers
6
+ *
7
+ * Rotation fires when EITHER threshold is crossed:
8
+ *
9
+ * - `max_bytes` — the current `audit.jsonl` is at or above this many bytes.
10
+ * Default when the policy block is present but `max_bytes` is unset:
11
+ * `DEFAULT_MAX_BYTES` (50 MiB).
12
+ * - `max_age_days` — the first record's `timestamp` is older than this many
13
+ * days. Default when unset: `DEFAULT_MAX_AGE_DAYS` (30).
14
+ *
15
+ * Back-compat: if the `audit.rotation` policy block is ABSENT entirely,
16
+ * rotation is DISABLED. Defaults only apply when the operator has opted in
17
+ * by declaring the block (even empty). This is deliberate — we do not want
18
+ * a 0.2.x install to observe new file-movement behavior on 0.3.0 upgrade
19
+ * without being asked.
20
+ *
21
+ * ## Rotation marker
22
+ *
23
+ * On rotation, the current file is renamed to `audit-YYYYMMDD-HHMMSS.jsonl`
24
+ * in the same directory. A fresh `audit.jsonl` is created containing EXACTLY
25
+ * one record: a rotation marker.
26
+ *
27
+ * tool_name: 'audit.rotation'
28
+ * server_name: 'rea'
29
+ * status: 'allowed'
30
+ * tier: 'read'
31
+ * autonomy_level: 'system'
32
+ * prev_hash: hash of the LAST record in the rotated file
33
+ * metadata.rotated_from: the rotated filename (basename)
34
+ * metadata.rotated_at: ISO-8601 instant of rotation
35
+ *
36
+ * The marker's `prev_hash` is the chain bridge — an operator verifying the
37
+ * chain with `rea audit verify --since <rotated-file>` walks rotated →
38
+ * marker → current and every transition must line up.
39
+ *
40
+ * ## Concurrency
41
+ *
42
+ * `maybeRotate` is called BEFORE the per-append lock is acquired. It takes
43
+ * its own short-lived lock on `.rea/` to perform the rename + marker write
44
+ * atomically. An appender that takes the lock before the rotator writes to
44
+ * the not-yet-rotated file; one that takes it afterwards writes to the fresh file. Either way correctness is preserved because the rotation
46
+ * marker is a legitimate chain anchor.
47
+ */
48
+ import fs from 'node:fs/promises';
49
+ import path from 'node:path';
50
+ import { Tier, InvocationStatus } from '../../policy/types.js';
51
+ import { computeHash, readLastRecord, withAuditLock } from '../../audit/fs.js';
52
+ /** 50 MiB. Only applied when the operator has declared `audit.rotation`. */
53
+ export const DEFAULT_MAX_BYTES = 50 * 1024 * 1024;
54
+ /** 30 days. Only applied when the operator has declared `audit.rotation`. */
55
+ export const DEFAULT_MAX_AGE_DAYS = 30;
56
+ export const ROTATION_TOOL_NAME = 'audit.rotation';
57
+ export const ROTATION_SERVER_NAME = 'rea';
/**
 * Translate the `audit.rotation` policy block into concrete thresholds.
 *
 * When the block is absent (`policy?.audit?.rotation` is `undefined`) both
 * thresholds come back `undefined`, which callers treat as "rotation
 * disabled" — this keeps 0.2.x installs from rotating without opting in.
 * When the block is present, any knob left unset falls back to
 * `DEFAULT_MAX_BYTES` / `DEFAULT_MAX_AGE_DAYS`.
 *
 * @param {object | undefined} policy Loaded policy object (may be undefined).
 * @returns {{ maxBytes: number | undefined, maxAgeMs: number | undefined }}
 *   Size threshold in bytes and age threshold in milliseconds.
 */
function effectiveThresholds(policy) {
    const rotation = policy?.audit?.rotation;
    if (rotation === undefined) {
        // Block not declared: nothing is enabled, not even the defaults.
        return { maxBytes: undefined, maxAgeMs: undefined };
    }
    // Declared (possibly empty) block: fill the gaps with the defaults.
    const days = rotation.max_age_days ?? DEFAULT_MAX_AGE_DAYS;
    return {
        maxBytes: rotation.max_bytes ?? DEFAULT_MAX_BYTES,
        maxAgeMs: days * 24 * 60 * 60 * 1000,
    };
}
/**
 * Derive the archive filename for a rotation that happened at `at`.
 *
 * Every field is read in UTC and zero-padded so the names sort
 * chronologically. The result has the shape `audit-YYYYMMDD-HHMMSS.jsonl`.
 * This function never touches the filesystem; same-second collisions are
 * resolved by `pickRotationPath`, which appends `-1`, `-2`, etc.
 *
 * @param {Date} at Instant of rotation.
 * @returns {string} Basename of the rotated file.
 */
export function rotationFilename(at) {
    const pad = (value, width) => value.toString().padStart(width, '0');
    const datePart = [
        pad(at.getUTCFullYear(), 4),
        pad(at.getUTCMonth() + 1, 2), // getUTCMonth() is zero-based
        pad(at.getUTCDate(), 2),
    ].join('');
    const timePart = [
        pad(at.getUTCHours(), 2),
        pad(at.getUTCMinutes(), 2),
        pad(at.getUTCSeconds(), 2),
    ].join('');
    return `audit-${datePart}-${timePart}.jsonl`;
}
/**
 * Probe the `timestamp` of the first record in `auditFile` without reading
 * the whole file. Only the first 64 KiB are read; the text up to the first
 * newline is parsed as JSON.
 *
 * This is a best-effort probe and never rejects. It resolves to `undefined`
 * when the file cannot be opened or read, is empty, starts with an empty
 * line, the first line is not valid JSON (including a record longer than
 * the 64 KiB window), the parsed value has no string `timestamp`, or that
 * string is not parseable by `Date.parse`.
 *
 * @param {string} auditFile Path to the audit JSONL file.
 * @returns {Promise<Date | undefined>} Timestamp of the first record, if any.
 */
async function readFirstTimestamp(auditFile) {
    let fh;
    try {
        fh = await fs.open(auditFile, 'r');
        // 64 KiB is enough for the first record under any realistic schema.
        const buf = Buffer.alloc(64 * 1024);
        const { bytesRead } = await fh.read(buf, 0, buf.length, 0);
        if (bytesRead === 0)
            return undefined;
        // `subarray` is the non-deprecated equivalent of `Buffer#slice`.
        const chunk = buf.subarray(0, bytesRead).toString('utf8');
        const newline = chunk.indexOf('\n');
        const firstLine = newline === -1 ? chunk : chunk.slice(0, newline);
        if (firstLine.length === 0)
            return undefined;
        const parsed = JSON.parse(firstLine);
        // `?.` keeps a literal `null` first line on the normal return path.
        if (typeof parsed?.timestamp !== 'string')
            return undefined;
        const ts = Date.parse(parsed.timestamp);
        return Number.isNaN(ts) ? undefined : new Date(ts);
    }
    catch {
        return undefined;
    }
    finally {
        // A failing close must not turn this best-effort probe into a
        // rejection; the descriptor was only used for reading.
        await fh?.close().catch(() => { });
    }
}
/**
 * Report whether `auditFile` has crossed a rotation threshold. Exported for
 * testing.
 *
 * Resolves to `false` when both thresholds are `undefined`, when the path
 * does not exist or is not a regular file, and when the file is empty.
 * Otherwise the size threshold is checked first (`size >= maxBytes`); the
 * age threshold compares `now` against the first record's timestamp and is
 * skipped when that timestamp cannot be read.
 *
 * @param {string} auditFile Path to the live audit file.
 * @param {{ maxBytes?: number, maxAgeMs?: number }} thresholds Effective limits.
 * @param {Date} [now] Reference instant for the age check.
 * @returns {Promise<boolean>} `true` when rotation is due.
 * @throws Any `fs.stat` error other than `ENOENT`.
 */
export async function shouldRotate(auditFile, thresholds, now = new Date()) {
    const { maxBytes, maxAgeMs } = thresholds;
    if (maxBytes === undefined && maxAgeMs === undefined) {
        return false;
    }
    // A missing file is an ordinary "nothing to rotate" outcome.
    const info = await fs.stat(auditFile).catch((err) => {
        if (err.code === 'ENOENT')
            return null;
        throw err;
    });
    if (info === null || !info.isFile()) {
        return false;
    }
    // Empty files never rotate — the marker would have no predecessor record.
    if (info.size === 0) {
        return false;
    }
    if (maxBytes !== undefined && info.size >= maxBytes) {
        return true;
    }
    if (maxAgeMs === undefined) {
        return false;
    }
    const firstTs = await readFirstTimestamp(auditFile);
    return firstTs !== undefined && now.getTime() - firstTs.getTime() >= maxAgeMs;
}
/**
 * Choose an unused archive path inside `reaDir` for a rotation at `at`.
 *
 * The plain `rotationFilename(at)` name is tried first, then the same name
 * with `-1`, `-2`, … `-999` inserted before the `.jsonl` extension. The
 * first candidate for which `fs.access` reports `ENOENT` wins.
 *
 * @param {string} reaDir Directory that holds the audit files.
 * @param {Date} at Instant of rotation.
 * @returns {Promise<string>} Path of the first free candidate, joined onto `reaDir`.
 * @throws If all 1000 candidates exist, or `fs.access` fails with anything
 *   other than `ENOENT`.
 */
async function pickRotationPath(reaDir, at) {
    const base = rotationFilename(at);
    const stem = base.replace(/\.jsonl$/, '');
    for (let attempt = 0; attempt < 1000; attempt += 1) {
        // Attempt 0 is the unsuffixed name; later attempts carry the counter.
        const name = attempt === 0 ? base : `${stem}-${attempt}.jsonl`;
        const candidate = path.join(reaDir, name);
        try {
            await fs.access(candidate);
        }
        catch (err) {
            if (err.code === 'ENOENT') {
                return candidate;
            }
            throw err;
        }
    }
    throw new Error(`Unable to pick rotation filename in ${reaDir} — 1000 collisions`);
}
/**
 * Rotate `auditFile` unconditionally (thresholds are the caller's concern).
 *
 * Runs under `withAuditLock(auditFile, …)` so the rename and the marker
 * write are not interleaved with other holders of that lock. Inside the
 * lock the file is re-checked, because another writer may have rotated
 * between the caller's decision and lock acquisition.
 *
 * Steps: read the tail hash of the current file, pick a free archive name,
 * build the rotation marker (its `prev_hash` is the old tail's hash, which
 * bridges the chain across the rotation), rename the current file to the
 * archive name, then write the marker as the only line of a fresh
 * `auditFile`.
 *
 * Failure handling: the marker is fully built BEFORE the rename so that a
 * serialization or hashing error cannot leave the live file renamed away.
 * If writing the marker fails after the rename, the rename is rolled back
 * so the live file (and its chain) is exactly as it was before the attempt.
 *
 * @param {string} auditFile Path to the live audit file.
 * @param {Date} [now] Instant recorded in the marker and used for the archive name.
 * @returns {Promise<{ rotated: false } | { rotated: true, rotatedTo: string }>}
 *   `{ rotated: false }` when the file is missing, not a regular file, or empty.
 * @throws Filesystem errors other than `ENOENT` from `stat`, and any error
 *   from the lock, tail read, rename, or marker write.
 */
export async function performRotation(auditFile, now = new Date()) {
    const reaDir = path.dirname(auditFile);
    // Ensure the parent exists so withAuditLock can place a lock file. The
    // caller normally creates this; we mkdir defensively for the force-rotate
    // path (`rea audit rotate` on a green-field install).
    await fs.mkdir(reaDir, { recursive: true });
    return withAuditLock(auditFile, async () => {
        // Re-check the file under the lock.
        let size;
        try {
            const stat = await fs.stat(auditFile);
            if (!stat.isFile())
                return { rotated: false };
            size = stat.size;
        }
        catch (err) {
            if (err.code === 'ENOENT')
                return { rotated: false };
            throw err;
        }
        if (size === 0)
            return { rotated: false };
        // Pull the last record's hash BEFORE renaming so the marker can be
        // anchored on the old chain's tail.
        const { hash: tailHash } = await readLastRecord(auditFile);
        const rotatedPath = await pickRotationPath(reaDir, now);
        // Build and serialize the marker before touching the filesystem: an
        // invalid `now` or a hashing failure must not strand the live file.
        const rotatedAt = now.toISOString();
        const markerBase = {
            timestamp: rotatedAt,
            session_id: 'system',
            tool_name: ROTATION_TOOL_NAME,
            server_name: ROTATION_SERVER_NAME,
            tier: Tier.Read,
            status: InvocationStatus.Allowed,
            autonomy_level: 'system',
            duration_ms: 0,
            prev_hash: tailHash,
            metadata: {
                rotated_from: path.basename(rotatedPath),
                rotated_at: rotatedAt,
            },
        };
        const marker = { ...markerBase, hash: computeHash(markerBase) };
        const line = JSON.stringify(marker) + '\n';
        await fs.rename(auditFile, rotatedPath);
        try {
            await fs.writeFile(auditFile, line, { flag: 'w' });
        }
        catch (writeErr) {
            // Undo the rename so the next append continues the existing chain
            // instead of starting a new one with no bridge to the archive.
            try {
                await fs.rename(rotatedPath, auditFile);
            }
            catch (rollbackErr) {
                const writeMsg = writeErr instanceof Error ? writeErr.message : String(writeErr);
                const rollbackMsg = rollbackErr instanceof Error ? rollbackErr.message : String(rollbackErr);
                throw new Error(`audit rotation marker write failed (${writeMsg}) and restoring ${path.basename(rotatedPath)} failed (${rollbackMsg})`, { cause: writeErr });
            }
            throw writeErr;
        }
        return { rotated: true, rotatedTo: rotatedPath };
    });
}
/**
 * Threshold-driven rotation entry point, intended to be called by the
 * append path before it takes its own lock.
 *
 * Resolves thresholds from `policy`, returns early when rotation is
 * disabled or not due, and otherwise delegates to `performRotation` (which
 * re-checks the file under its lock).
 *
 * Never rejects once invoked: any error from the steps above is reported on
 * stderr and converted to `{ rotated: false }`, so a broken rotator cannot
 * break the audit append.
 *
 * @param {string} auditFile Path to the live audit file.
 * @param {object | undefined} policy Loaded policy object.
 * @param {Date} [now] Reference instant for age checks and the marker.
 * @returns {Promise<{ rotated: boolean, rotatedTo?: string }>}
 */
export async function maybeRotate(auditFile, policy, now = new Date()) {
    try {
        const limits = effectiveThresholds(policy);
        const rotationEnabled = limits.maxBytes !== undefined || limits.maxAgeMs !== undefined;
        if (rotationEnabled && (await shouldRotate(auditFile, limits, now))) {
            return await performRotation(auditFile, now);
        }
    }
    catch (err) {
        const detail = err instanceof Error ? err.message : String(err);
        console.error('[rea] AUDIT ROTATION FAILED:', detail);
    }
    return { rotated: false };
}
/**
 * Operator-requested rotation (`rea audit rotate`).
 *
 * Skips the threshold evaluation that `maybeRotate` performs and goes
 * straight to `performRotation`. A missing or empty audit file is still a
 * no-op (`{ rotated: false }`), since a marker needs a predecessor record.
 * Unlike `maybeRotate`, errors are NOT swallowed here.
 *
 * @param {string} auditFile Path to the live audit file.
 * @param {Date} [now] Instant recorded in the marker and archive name.
 * @returns {Promise<{ rotated: boolean, rotatedTo?: string }>}
 */
export async function forceRotate(auditFile, now = new Date()) {
    const outcome = await performRotation(auditFile, now);
    return outcome;
}
285
+ /**
286
+ * Exposed for tests/callers that already know the policy shape. Tests that
287
+ * want to stub thresholds can call `performRotation` directly.
288
+ */
289
+ export { effectiveThresholds as _effectiveThresholds };
@@ -1,9 +1,24 @@
1
1
  export type CircuitState = 'closed' | 'open' | 'half-open';
2
+ /**
3
+ * Callback invoked on every circuit state transition (G5). The constructor
4
+ * can wire this to a structured logger and/or a metrics gauge so state
5
+ * changes are observable without requiring the breaker itself to depend on
6
+ * those modules.
7
+ */
8
+ export type CircuitStateChangeListener = (event: {
9
+ server: string;
10
+ from: CircuitState;
11
+ to: CircuitState;
12
+ reason: 'failure_threshold' | 'cooldown_elapsed' | 'recovered' | 'half_open_failed';
13
+ retryAt?: string;
14
+ }) => void;
2
15
  export interface CircuitBreakerOptions {
3
16
  /** Consecutive failures before opening the circuit. Default: 5 */
4
17
  failureThreshold?: number;
5
18
  /** Milliseconds to wait in open state before moving to half-open. Default: 30_000 */
6
19
  cooldownMs?: number;
20
+ /** Optional listener for state transitions. See {@link CircuitStateChangeListener}. */
21
+ onStateChange?: CircuitStateChangeListener;
7
22
  }
8
23
  export interface CircuitStatus {
9
24
  state: CircuitState;
@@ -29,7 +44,9 @@ interface CircuitEntry {
29
44
  export declare class CircuitBreaker {
30
45
  private circuits;
31
46
  private defaultOptions;
47
+ private readonly onStateChange;
32
48
  constructor(defaults?: CircuitBreakerOptions);
49
+ private notify;
33
50
  private getOrCreate;
34
51
  /**
35
52
  * Returns null if the call may proceed, or a CircuitStatus if the circuit is open.
@@ -10,11 +10,23 @@
10
10
  export class CircuitBreaker {
11
11
  circuits = new Map();
12
12
  defaultOptions;
13
+ onStateChange;
13
14
  constructor(defaults = {}) {
14
15
  this.defaultOptions = {
15
16
  failureThreshold: defaults.failureThreshold ?? 5,
16
17
  cooldownMs: defaults.cooldownMs ?? 30_000,
17
18
  };
19
+ this.onStateChange = defaults.onStateChange;
20
+ }
21
+ notify(event) {
22
+ if (this.onStateChange === undefined)
23
+ return;
24
+ try {
25
+ this.onStateChange(event);
26
+ }
27
+ catch {
28
+ // Listeners must never break the breaker. Swallow.
29
+ }
18
30
  }
19
31
  getOrCreate(serverName) {
20
32
  let entry = this.circuits.get(serverName);
@@ -43,7 +55,12 @@ export class CircuitBreaker {
43
55
  if (elapsed >= entry.cooldownMs) {
44
56
  entry.state = 'half-open';
45
57
  entry.consecutiveFailures = 0;
46
- console.error(`[rea] circuit-breaker: "${serverName}" transitioned open → half-open (probing recovery)`);
58
+ this.notify({
59
+ server: serverName,
60
+ from: 'open',
61
+ to: 'half-open',
62
+ reason: 'cooldown_elapsed',
63
+ });
47
64
  return null;
48
65
  }
49
66
  const retryAt = new Date((entry.openedAt ?? 0) + entry.cooldownMs).toISOString();
@@ -61,7 +78,12 @@ export class CircuitBreaker {
61
78
  entry.state = 'closed';
62
79
  entry.consecutiveFailures = 0;
63
80
  entry.openedAt = null;
64
- console.error(`[rea] circuit-breaker: "${serverName}" recovered — circuit closed`);
81
+ this.notify({
82
+ server: serverName,
83
+ from: 'half-open',
84
+ to: 'closed',
85
+ reason: 'recovered',
86
+ });
65
87
  }
66
88
  else if (entry.state === 'closed') {
67
89
  entry.consecutiveFailures = 0;
@@ -71,13 +93,20 @@ export class CircuitBreaker {
71
93
  const entry = this.getOrCreate(serverName);
72
94
  if (entry.state === 'open')
73
95
  return;
96
+ const previous = entry.state;
74
97
  entry.consecutiveFailures++;
75
98
  const shouldOpen = entry.state === 'half-open' || entry.consecutiveFailures >= entry.failureThreshold;
76
99
  if (shouldOpen) {
77
100
  entry.state = 'open';
78
101
  entry.openedAt = Date.now();
79
102
  const retryAt = new Date(entry.openedAt + entry.cooldownMs).toISOString();
80
- console.error(`[rea] circuit-breaker: "${serverName}" OPENED after ${entry.consecutiveFailures} failure(s) — will retry at ${retryAt}`);
103
+ this.notify({
104
+ server: serverName,
105
+ from: previous,
106
+ to: 'open',
107
+ reason: previous === 'half-open' ? 'half_open_failed' : 'failure_threshold',
108
+ retryAt,
109
+ });
81
110
  }
82
111
  }
83
112
  getCircuit(serverName) {
@@ -7,6 +7,7 @@
7
7
  */
8
8
  import { DownstreamConnection, type DownstreamToolInfo } from './downstream.js';
9
9
  import type { Registry } from '../registry/types.js';
10
+ import type { Logger } from './log.js';
10
11
  export interface PrefixedTool extends DownstreamToolInfo {
11
12
  /** Server name, not prefixed. */
12
13
  server: string;
@@ -15,7 +16,7 @@ export interface PrefixedTool extends DownstreamToolInfo {
15
16
  }
16
17
  export declare class DownstreamPool {
17
18
  private readonly connections;
18
- constructor(registry: Registry);
19
+ constructor(registry: Registry, logger?: Logger);
19
20
  get size(): number;
20
21
  connectAll(): Promise<void>;
21
22
  /**
@@ -8,11 +8,11 @@
8
8
  import { DownstreamConnection } from './downstream.js';
9
9
  export class DownstreamPool {
10
10
  connections = new Map();
11
- constructor(registry) {
11
+ constructor(registry, logger) {
12
12
  for (const server of registry.servers) {
13
13
  if (!server.enabled)
14
14
  continue;
15
- this.connections.set(server.name, new DownstreamConnection(server));
15
+ this.connections.set(server.name, new DownstreamConnection(server, logger));
16
16
  }
17
17
  }
18
18
  get size() {
@@ -36,6 +36,7 @@
36
36
  * a transport error could double-post. We leave the decision to the caller.
37
37
  */
38
38
  import type { RegistryServer } from '../registry/types.js';
39
+ import type { Logger } from './log.js';
39
40
  export interface DownstreamToolInfo {
40
41
  name: string;
41
42
  description?: string;
@@ -43,15 +44,44 @@ export interface DownstreamToolInfo {
43
44
  }
44
45
  /**
45
46
  * Build the child env by layering:
46
- * allowlist → registry env_passthrough → registry env.
47
+ * allowlist → registry env_passthrough → interpolated registry env.
47
48
  * Later entries win. Missing host values are skipped so `process.env[name]`
48
49
  * being undefined does not serialize as the literal string "undefined".
49
50
  *
51
+ * The explicit `env:` map may contain `${VAR}` placeholders (see
52
+ * `registry/interpolate.ts` for the exact grammar). Placeholders referencing
53
+ * unset host vars are returned via the `missing` array — the caller MUST
54
+ * refuse to spawn the server if `missing.length > 0`, otherwise the child
55
+ * receives unresolved `${...}` strings which are nearly always wrong.
56
+ *
50
57
  * Exported for testing.
51
58
  */
52
- export declare function buildChildEnv(config: RegistryServer, hostEnv?: NodeJS.ProcessEnv): Record<string, string>;
59
+ export interface BuiltChildEnv {
60
+ /** Fully resolved env to pass to the child transport. */
61
+ env: Record<string, string>;
62
+ /**
63
+ * Names of `${VAR}` references that were not set in `hostEnv`. When
64
+ * non-empty, the caller MUST NOT spawn the child — mark the connection
65
+ * unhealthy and log each entry.
66
+ */
67
+ missing: string[];
68
+ /**
69
+ * Keys in `env` whose value is secret-bearing (either because the key
70
+ * name matches the secret-name heuristic, or because one of its
71
+ * interpolated `${VAR}` references did). Callers MUST NOT log the
72
+ * corresponding values.
73
+ */
74
+ secretKeys: string[];
75
+ }
76
+ export declare function buildChildEnv(config: RegistryServer, hostEnv?: NodeJS.ProcessEnv): BuiltChildEnv;
53
77
  export declare class DownstreamConnection {
54
78
  private readonly config;
79
+ /**
80
+ * Optional structured logger (G5). When omitted, connection lifecycle
81
+ * events are simply not logged — keeping the class usable in unit tests
82
+ * that don't care about observability.
83
+ */
84
+ private readonly logger?;
55
85
  private client;
56
86
  /**
57
87
  * Whether a reconnect has already been attempted in the CURRENT failure
@@ -63,7 +93,13 @@ export declare class DownstreamConnection {
63
93
  /** Epoch ms of the last successful reconnect. Used by the flapping guard. */
64
94
  private lastReconnectAt;
65
95
  private health;
66
- constructor(config: RegistryServer);
96
+ constructor(config: RegistryServer,
97
+ /**
98
+ * Optional structured logger (G5). When omitted, connection lifecycle
99
+ * events are simply not logged — keeping the class usable in unit tests
100
+ * that don't care about observability.
101
+ */
102
+ logger?: Logger | undefined);
67
103
  get name(): string;
68
104
  get isHealthy(): boolean;
69
105
  connect(): Promise<void>;
@@ -37,6 +37,7 @@
37
37
  */
38
38
  import { Client } from '@modelcontextprotocol/sdk/client/index.js';
39
39
  import { StdioClientTransport } from '@modelcontextprotocol/sdk/client/stdio.js';
40
+ import { interpolateEnv } from '../registry/interpolate.js';
40
41
  /**
41
42
  * Neutral env vars every child inherits. These are the ones shells/toolchains
42
43
  * need to function but carry no secrets in a well-configured environment.
@@ -66,14 +67,6 @@ const DEFAULT_ENV_ALLOWLIST = [
66
67
  * handle it.
67
68
  */
68
69
  const RECONNECT_FLAP_WINDOW_MS = 30_000;
69
- /**
70
- * Build the child env by layering:
71
- * allowlist → registry env_passthrough → registry env.
72
- * Later entries win. Missing host values are skipped so `process.env[name]`
73
- * being undefined does not serialize as the literal string "undefined".
74
- *
75
- * Exported for testing.
76
- */
77
70
  export function buildChildEnv(config, hostEnv = process.env) {
78
71
  const out = {};
79
72
  for (const name of DEFAULT_ENV_ALLOWLIST) {
@@ -88,14 +81,21 @@ export function buildChildEnv(config, hostEnv = process.env) {
88
81
  out[name] = v;
89
82
  }
90
83
  }
84
+ // Interpolate placeholders in config.env BEFORE layering it on top.
85
+ // `interpolateEnv` is pure — no I/O, throws only on malformed syntax
86
+ // (unterminated brace, empty `${}`, illegal var name). Missing host
87
+ // vars are reported via `result.missing`; the caller decides whether
88
+ // to refuse the spawn.
89
+ const interp = interpolateEnv(config.env, hostEnv);
91
90
  // Explicit config.env wins — operator typed these values deliberately.
92
- for (const [k, v] of Object.entries(config.env)) {
91
+ for (const [k, v] of Object.entries(interp.resolved)) {
93
92
  out[k] = v;
94
93
  }
95
- return out;
94
+ return { env: out, missing: interp.missing, secretKeys: interp.secretKeys };
96
95
  }
97
96
  export class DownstreamConnection {
98
97
  config;
98
+ logger;
99
99
  client = null;
100
100
  /**
101
101
  * Whether a reconnect has already been attempted in the CURRENT failure
@@ -107,8 +107,15 @@ export class DownstreamConnection {
107
107
  /** Epoch ms of the last successful reconnect. Used by the flapping guard. */
108
108
  lastReconnectAt = 0;
109
109
  health = 'healthy';
110
- constructor(config) {
110
+ constructor(config,
111
+ /**
112
+ * Optional structured logger (G5). When omitted, connection lifecycle
113
+ * events are simply not logged — keeping the class usable in unit tests
114
+ * that don't care about observability.
115
+ */
116
+ logger) {
111
117
  this.config = config;
118
+ this.logger = logger;
112
119
  }
113
120
  get name() {
114
121
  return this.config.name;
@@ -119,10 +126,40 @@ export class DownstreamConnection {
119
126
  async connect() {
120
127
  if (this.client !== null)
121
128
  return;
129
+ // Resolve env BEFORE spawning. If any `${VAR}` reference in the registry's
130
+ // explicit env: map is unset at startup, refuse to spawn this server:
131
+ // - log a clear, secret-safe error (only the var name appears; the
132
+ // resolved value would not exist anyway since it's missing)
133
+ // - mark this connection unhealthy so the pool skips it
134
+ // - leave every other server's spawn path untouched (the gateway as a
135
+ // whole keeps coming up)
136
+ //
137
+ // Malformed syntax (unterminated brace, `${}`, illegal identifier) throws
138
+ // from interpolateEnv — that's a load-time error and we propagate it so
139
+ // the operator sees it at startup with server context attached.
140
+ let built;
141
+ try {
142
+ built = buildChildEnv(this.config);
143
+ }
144
+ catch (err) {
145
+ this.health = 'unhealthy';
146
+ throw new Error(`failed to resolve env for downstream "${this.config.name}": ${err instanceof Error ? err.message : err}`);
147
+ }
148
+ if (built.missing.length > 0) {
149
+ this.health = 'unhealthy';
150
+ // One line per missing var so grep/jq users can find the exact gap.
151
+ // We intentionally do NOT log the env key name's VALUE (there is none —
152
+ // it's unresolved) nor any other env values.
153
+ for (const missingVar of built.missing) {
154
+ console.error(`[rea-gateway] refusing to start downstream "${this.config.name}": ` +
155
+ `env references ${'${'}${missingVar}${'}'} but process.env.${missingVar} is not set`);
156
+ }
157
+ throw new Error(`downstream "${this.config.name}" refused to start — missing env: ${built.missing.join(', ')}`);
158
+ }
122
159
  const transport = new StdioClientTransport({
123
160
  command: this.config.command,
124
161
  args: this.config.args,
125
- env: buildChildEnv(this.config),
162
+ env: built.env,
126
163
  });
127
164
  const client = new Client({ name: `rea-gateway-client:${this.config.name}`, version: '0.2.0' }, { capabilities: {} });
128
165
  try {
@@ -157,11 +194,16 @@ export class DownstreamConnection {
157
194
  }
158
195
  catch (err) {
159
196
  const message = err instanceof Error ? err.message : String(err);
160
- const withinFlapWindow = this.lastReconnectAt !== 0 &&
161
- Date.now() - this.lastReconnectAt < RECONNECT_FLAP_WINDOW_MS;
197
+ const withinFlapWindow = this.lastReconnectAt !== 0 && Date.now() - this.lastReconnectAt < RECONNECT_FLAP_WINDOW_MS;
162
198
  if (!this.reconnectAttempted && !withinFlapWindow) {
163
199
  this.reconnectAttempted = true;
164
200
  this.health = 'degraded';
201
+ this.logger?.warn({
202
+ event: 'downstream.reconnect_attempt',
203
+ server_name: this.config.name,
204
+ message: `downstream "${this.config.name}" will reconnect once after error`,
205
+ reason: message,
206
+ });
165
207
  try {
166
208
  await this.close();
167
209
  await this.connect();
@@ -170,14 +212,31 @@ export class DownstreamConnection {
170
212
  // stamp the reconnect time so flap-guard can refuse rapid repeats.
171
213
  this.reconnectAttempted = false;
172
214
  this.lastReconnectAt = Date.now();
215
+ this.logger?.info({
216
+ event: 'downstream.reconnected',
217
+ server_name: this.config.name,
218
+ message: `downstream "${this.config.name}" reconnected successfully`,
219
+ });
173
220
  return result;
174
221
  }
175
222
  catch (reconnectErr) {
176
223
  this.health = 'unhealthy';
224
+ this.logger?.error({
225
+ event: 'downstream.reconnect_failed',
226
+ server_name: this.config.name,
227
+ message: `downstream "${this.config.name}" unhealthy after one reconnect`,
228
+ error: reconnectErr instanceof Error ? reconnectErr.message : String(reconnectErr),
229
+ });
177
230
  throw new Error(`downstream "${this.config.name}" unhealthy after one reconnect: ${reconnectErr instanceof Error ? reconnectErr.message : reconnectErr}`);
178
231
  }
179
232
  }
180
233
  this.health = 'unhealthy';
234
+ this.logger?.error({
235
+ event: 'downstream.call_failed',
236
+ server_name: this.config.name,
237
+ message: `downstream "${this.config.name}" call failed`,
238
+ error: message,
239
+ });
181
240
  throw new Error(`downstream "${this.config.name}" call failed: ${message}`);
182
241
  }
183
242
  }