@bookedsolid/rea 0.3.0 → 0.4.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (56) hide show
  1. package/.husky/pre-push +15 -18
  2. package/README.md +41 -1
  3. package/dist/cli/doctor.d.ts +19 -4
  4. package/dist/cli/doctor.js +172 -5
  5. package/dist/cli/index.js +9 -1
  6. package/dist/cli/init.js +93 -7
  7. package/dist/cli/install/pre-push.d.ts +335 -0
  8. package/dist/cli/install/pre-push.js +2818 -0
  9. package/dist/cli/serve.d.ts +64 -0
  10. package/dist/cli/serve.js +270 -2
  11. package/dist/cli/status.d.ts +90 -0
  12. package/dist/cli/status.js +399 -0
  13. package/dist/cli/utils.d.ts +4 -0
  14. package/dist/cli/utils.js +4 -0
  15. package/dist/gateway/circuit-breaker.d.ts +17 -0
  16. package/dist/gateway/circuit-breaker.js +32 -3
  17. package/dist/gateway/downstream-pool.d.ts +2 -1
  18. package/dist/gateway/downstream-pool.js +2 -2
  19. package/dist/gateway/downstream.d.ts +39 -3
  20. package/dist/gateway/downstream.js +73 -14
  21. package/dist/gateway/log.d.ts +122 -0
  22. package/dist/gateway/log.js +334 -0
  23. package/dist/gateway/middleware/audit.d.ts +10 -1
  24. package/dist/gateway/middleware/audit.js +26 -1
  25. package/dist/gateway/middleware/blocked-paths.d.ts +0 -9
  26. package/dist/gateway/middleware/blocked-paths.js +439 -67
  27. package/dist/gateway/middleware/injection.d.ts +218 -13
  28. package/dist/gateway/middleware/injection.js +433 -51
  29. package/dist/gateway/middleware/kill-switch.d.ts +10 -1
  30. package/dist/gateway/middleware/kill-switch.js +20 -1
  31. package/dist/gateway/observability/metrics.d.ts +125 -0
  32. package/dist/gateway/observability/metrics.js +321 -0
  33. package/dist/gateway/server.d.ts +19 -0
  34. package/dist/gateway/server.js +99 -15
  35. package/dist/policy/loader.d.ts +13 -0
  36. package/dist/policy/loader.js +28 -0
  37. package/dist/policy/profiles.d.ts +13 -0
  38. package/dist/policy/profiles.js +12 -0
  39. package/dist/policy/types.d.ts +28 -0
  40. package/dist/registry/fingerprint.d.ts +73 -0
  41. package/dist/registry/fingerprint.js +81 -0
  42. package/dist/registry/fingerprints-store.d.ts +62 -0
  43. package/dist/registry/fingerprints-store.js +111 -0
  44. package/dist/registry/interpolate.d.ts +58 -0
  45. package/dist/registry/interpolate.js +121 -0
  46. package/dist/registry/loader.d.ts +2 -2
  47. package/dist/registry/loader.js +22 -1
  48. package/dist/registry/tofu-gate.d.ts +41 -0
  49. package/dist/registry/tofu-gate.js +189 -0
  50. package/dist/registry/tofu.d.ts +111 -0
  51. package/dist/registry/tofu.js +173 -0
  52. package/dist/registry/types.d.ts +9 -1
  53. package/package.json +1 -1
  54. package/profiles/bst-internal-no-codex.yaml +5 -0
  55. package/profiles/bst-internal.yaml +7 -0
  56. package/scripts/tarball-smoke.sh +197 -0
@@ -0,0 +1,125 @@
1
+ /**
2
+ * Minimal Prometheus-style metrics for `rea serve` (G5).
3
+ *
4
+ * The gateway exposes an OPT-IN `/metrics` endpoint when `REA_METRICS_PORT`
5
+ * is set. The project rule is "no silent listeners" — without that env var
6
+ * nothing binds to a port, ever. When set, we bind to 127.0.0.1 ONLY and
7
+ * respond in the standard Prometheus text-exposition format.
8
+ *
9
+ * ## What we expose
10
+ *
11
+ * rea_downstream_calls_total{server="<n>"} counter
12
+ * rea_downstream_errors_total{server="<n>"} counter
13
+ * rea_downstream_in_flight{server="<n>"} gauge
14
+ * rea_audit_lines_appended_total counter
15
+ * rea_circuit_breaker_state{server="<n>"} gauge (0=closed, 1=half-open, 2=open)
16
+ * rea_seconds_since_last_halt_check gauge
17
+ *
18
+ * Conventions match https://prometheus.io/docs/instrumenting/exposition_formats/
19
+ * — Unix-epoch timestamps omitted, `# HELP` / `# TYPE` lines included.
20
+ *
21
+ * ## What this is NOT
22
+ *
23
+ * - Not full OpenTelemetry. No traces, no histograms, no exemplars. If a user
24
+ * needs those, they can scrape these metrics and forward, or switch to an
25
+ * OTel pipeline later — the primitives are isolated in this file.
26
+ * - Not served over TLS. This is loopback-only tooling. Any cross-host scrape
27
+ * should tunnel through SSH or a reverse proxy.
28
+ * - Not a labelled cardinality bomb. Labels are limited to `server` (the set
29
+ * of downstreams is fixed by the registry) — we do NOT label by `tool_name`
30
+ * or anything user-controlled, which would let a downstream blow up the
31
+ * metrics store.
32
+ *
33
+ * ## Why handcrafted?
34
+ *
35
+ * prom-client is small but pulls its own tree of transitive deps we don't
36
+ * otherwise need. The exposition format is ~30 lines; we keep dep count
37
+ * low and avoid the supply-chain surface.
38
+ */
39
+ import type { Logger } from '../log.js';
40
+ /**
41
+ * Encoded values for the circuit-breaker gauge. Keep numerically ordered by
42
+ * severity so a `max()` query surfaces the worst state.
+ *
+ * These are the sample values reported for `rea_circuit_breaker_state`:
+ * `closed` = 0, `halfOpen` = 1, `open` = 2. `CircuitGaugeValue`, declared
+ * right after this constant, is the union of exactly these three literals
+ * and is the value type accepted by `MetricsRegistry.setCircuitState`.
43
+ */
44
+ export declare const CIRCUIT_GAUGE: {
45
+ readonly closed: 0;
46
+ readonly halfOpen: 1;
47
+ readonly open: 2;
48
+ };
49
+ export type CircuitGaugeValue = (typeof CIRCUIT_GAUGE)[keyof typeof CIRCUIT_GAUGE];
50
+ /**
51
+ * In-process state for the counters and gauges. A single instance is owned
52
+ * by the gateway and passed to any collaborator that needs to record.
53
+ *
54
+ * Methods mutate synchronously and never throw — metrics failures must not
55
+ * interrupt a tool call.
+ *
+ * Member overview:
+ * - `incDownstreamCall` / `incDownstreamError` add one to the per-server
+ * call / error counter, creating the entry on first use.
+ * - `incDownstreamInFlight` / `decDownstreamInFlight` move the per-server
+ * in-flight gauge up / down by one; the gauge never goes below zero.
+ * - `incAuditLines` adds `n` (default 1) to the audit-line counter; negative
+ * values add nothing.
+ * - `setCircuitState` stores the latest circuit-breaker gauge value for a
+ * server.
+ * - `markHaltCheck` records when the HALT file was last consulted; `nowMs`
+ * defaults to the current time.
56
+ */
57
+ export declare class MetricsRegistry {
58
+ private readonly downstreamCalls;
59
+ private readonly downstreamErrors;
60
+ private readonly downstreamInFlight;
61
+ private readonly circuitState;
62
+ private auditLinesAppended;
63
+ private lastHaltCheckMs;
64
+ incDownstreamCall(server: string): void;
65
+ incDownstreamError(server: string): void;
66
+ incDownstreamInFlight(server: string): void;
67
+ decDownstreamInFlight(server: string): void;
68
+ incAuditLines(n?: number): void;
69
+ setCircuitState(server: string, value: CircuitGaugeValue): void;
70
+ markHaltCheck(nowMs?: number): void;
71
+ /** Snapshot for tests / diagnostics: plain-object copies of the per-server tables plus the two scalars. `lastHaltCheckMs` is `null` until `markHaltCheck` has been called. */
72
+ snapshot(): {
73
+ downstreamCalls: Record<string, number>;
74
+ downstreamErrors: Record<string, number>;
75
+ downstreamInFlight: Record<string, number>;
76
+ circuitState: Record<string, CircuitGaugeValue>;
77
+ auditLinesAppended: number;
78
+ lastHaltCheckMs: number | null;
79
+ };
80
+ /**
81
+ * Render the Prometheus text exposition. Every metric gets HELP + TYPE
82
+ * headers even when its table is empty — that makes the output stable
83
+ * across scrapes and easier to diff.
+ *
+ * @param nowMs Reference time in epoch milliseconds used to compute
+ * `rea_seconds_since_last_halt_check`; defaults to the current time.
+ * @returns The exposition text, terminated by a newline. The halt-check
+ * gauge reads `-1` when no halt check has been recorded yet.
84
+ */
85
+ render(nowMs?: number): string;
86
+ }
87
+ export interface MetricsServer { // Handle that `startMetricsServer` resolves with once the listener is bound.
88
+ /** Returns the port actually bound (useful for tests that pass port 0). */
89
+ port(): number;
90
+ close(): Promise<void>; // Stops the listener; the promise resolves once it has closed or the forced-close deadline has passed.
91
+ }
92
+ export interface StartMetricsServerOptions {
93
+ port: number; // TCP port handed to the listener; with 0, read the bound port back via `MetricsServer.port()`.
94
+ registry: MetricsRegistry; // Its `render()` output is the response body of every `/metrics` scrape.
95
+ logger?: Logger; // Optional sink for records such as `metrics.listening` and `metrics.render_failed`.
96
+ /**
97
+ * Override the bind host. Only loopback values (`127.0.0.1`, `::1`) are
98
+ * accepted; any other value — including `localhost`, `0.0.0.0`, `::`, or
99
+ * any LAN IP — makes `startMetricsServer` reject its promise with a
+ * TypeError before a socket is opened (nothing is thrown synchronously). The
100
+ * /metrics endpoint has no auth, so binding a non-loopback interface
101
+ * would expose gateway internals to the network.
102
+ *
103
+ * Default: `127.0.0.1`.
104
+ */
105
+ host?: string;
106
+ }
107
+ /**
108
+ * Start a loopback-only HTTP server that serves `/metrics`.
109
+ *
110
+ * Security posture:
111
+ * - Binds to 127.0.0.1 by default. Callers cannot override to a public
112
+ * interface from the CLI; the `host` option exists for test injection.
113
+ * - Rejects every non-GET request with 405 (Prometheus scrapers only GET).
114
+ * - Rejects every path ≠ `/metrics` with 404. The body is a fixed string —
115
+ * we do NOT echo the request path, which would allow response splitting
116
+ * or reflected content.
117
+ * - No query-string parsing, no request body read, no cookies. A query
+ * string, if present, is discarded before the path comparison.
+ *
+ * @param opts Port, registry and optional logger / host override; see
+ * `StartMetricsServerOptions`.
+ * @returns A promise that resolves with the `MetricsServer` handle once the
+ * listener is bound. It rejects with a `TypeError` when `opts.host` is not
+ * a permitted loopback address, or with the listener's error when binding
+ * fails.
118
+ */
119
+ export declare function startMetricsServer(opts: StartMetricsServerOptions): Promise<MetricsServer>;
120
+ /**
121
+ * Parse and validate `REA_METRICS_PORT`. Returns the numeric port, or `null`
122
+ * if the env var is unset / malformed. An out-of-range or non-numeric value
123
+ * logs a warning and also returns null — we never silently bind on a default.
+ *
+ * @param raw Raw value of the environment variable. `undefined`, empty and
+ * whitespace-only values mean "not configured" and return `null` without a
+ * warning.
+ * @param logger Optional logger that receives the `metrics.port_invalid`
+ * warning when `raw` is present but is not an integer from 1 to 65535.
+ * @returns The port number, or `null` when the endpoint must not start.
124
+ */
125
+ export declare function resolveMetricsPort(raw: string | undefined, logger?: Logger): number | null;
@@ -0,0 +1,321 @@
1
+ /**
2
+ * Minimal Prometheus-style metrics for `rea serve` (G5).
3
+ *
4
+ * The gateway exposes an OPT-IN `/metrics` endpoint when `REA_METRICS_PORT`
5
+ * is set. The project rule is "no silent listeners" — without that env var
6
+ * nothing binds to a port, ever. When set, we bind to 127.0.0.1 ONLY and
7
+ * respond in the standard Prometheus text-exposition format.
8
+ *
9
+ * ## What we expose
10
+ *
11
+ * rea_downstream_calls_total{server="<n>"} counter
12
+ * rea_downstream_errors_total{server="<n>"} counter
13
+ * rea_downstream_in_flight{server="<n>"} gauge
14
+ * rea_audit_lines_appended_total counter
15
+ * rea_circuit_breaker_state{server="<n>"} gauge (0=closed, 1=half-open, 2=open)
16
+ * rea_seconds_since_last_halt_check gauge
17
+ *
18
+ * Conventions match https://prometheus.io/docs/instrumenting/exposition_formats/
19
+ * — Unix-epoch timestamps omitted, `# HELP` / `# TYPE` lines included.
20
+ *
21
+ * ## What this is NOT
22
+ *
23
+ * - Not full OpenTelemetry. No traces, no histograms, no exemplars. If a user
24
+ * needs those, they can scrape these metrics and forward, or switch to an
25
+ * OTel pipeline later — the primitives are isolated in this file.
26
+ * - Not served over TLS. This is loopback-only tooling. Any cross-host scrape
27
+ * should tunnel through SSH or a reverse proxy.
28
+ * - Not a labelled cardinality bomb. Labels are limited to `server` (the set
29
+ * of downstreams is fixed by the registry) — we do NOT label by `tool_name`
30
+ * or anything user-controlled, which would let a downstream blow up the
31
+ * metrics store.
32
+ *
33
+ * ## Why handcrafted?
34
+ *
35
+ * prom-client is small but pulls its own tree of transitive deps we don't
36
+ * otherwise need. The exposition format is ~30 lines; we keep dep count
37
+ * low and avoid the supply-chain surface.
38
+ */
39
+ import http from 'node:http';
/**
 * Default bind address. The IPv4 loopback is preferred so that
 * `curl http://127.0.0.1:<port>/metrics` works without dual-stack surprises.
 */
const LOOPBACK = '127.0.0.1';
/**
 * The complete set of `host` values `startMetricsServer` will bind to.
 * Wildcards (0.0.0.0, ::), LAN addresses and hostnames are all refused at
 * the API boundary, so no in-process caller can expose the unauthenticated
 * /metrics surface to the network by accident.
 *
 * SECURITY: Do NOT add non-loopback entries. For off-host scraping, tunnel
 * over SSH or put a TLS-terminating reverse proxy in front of 127.0.0.1.
 */
const ALLOWED_HOSTS = new Set([LOOPBACK, '::1']);
/** The single path that is served; every other path answers 404. */
const METRICS_PATH = '/metrics';
/**
 * Upper bound, in milliseconds, on how long `close()` waits for a graceful
 * drain. Once it elapses, remaining keep-alive sockets are destroyed so that
 * shutdown never hangs on a scraper holding its connection open.
 */
const CLOSE_DEADLINE_MS = 2 * 1000;
/**
 * Encoded values for the circuit-breaker gauge. Keep numerically ordered by
 * severity so a `max()` query surfaces the worst state.
 *
 * The object is frozen: the published typings already declare every member
 * `readonly`, and freezing enforces that at runtime so no collaborator can
 * silently change what a gauge value means for every other reader.
 */
export const CIRCUIT_GAUGE = Object.freeze({
    closed: 0,
    halfOpen: 1,
    open: 2,
});
/**
 * In-process state for the counters and gauges. A single instance is owned
 * by the gateway and passed to any collaborator that needs to record.
 *
 * Methods mutate synchronously and never throw — metrics failures must not
 * interrupt a tool call.
 */
export class MetricsRegistry {
    /** server name → tool calls dispatched (`rea_downstream_calls_total`). */
    downstreamCalls = new Map();
    /** server name → failed tool calls (`rea_downstream_errors_total`). */
    downstreamErrors = new Map();
    /** server name → calls currently executing (`rea_downstream_in_flight`). */
    downstreamInFlight = new Map();
    /** server name → latest circuit gauge value (`rea_circuit_breaker_state`). */
    circuitState = new Map();
    /** Audit lines appended so far (`rea_audit_lines_appended_total`). */
    auditLinesAppended = 0;
    /** Epoch ms of the last halt check, or `null` when none was recorded. */
    lastHaltCheckMs = null;
    /** Count one dispatched tool call for `server`. */
    incDownstreamCall(server) {
        this.downstreamCalls.set(server, (this.downstreamCalls.get(server) ?? 0) + 1);
    }
    /** Count one failed tool call for `server`. */
    incDownstreamError(server) {
        this.downstreamErrors.set(server, (this.downstreamErrors.get(server) ?? 0) + 1);
    }
    /** Mark one more call as executing against `server`. */
    incDownstreamInFlight(server) {
        this.downstreamInFlight.set(server, (this.downstreamInFlight.get(server) ?? 0) + 1);
    }
    /**
     * Mark one call against `server` as finished. The gauge is clamped at
     * zero so an unmatched decrement cannot drive it negative.
     */
    decDownstreamInFlight(server) {
        const next = Math.max(0, (this.downstreamInFlight.get(server) ?? 0) - 1);
        this.downstreamInFlight.set(server, next);
    }
    /**
     * Add `n` appended audit lines to the counter.
     *
     * @param n Number of lines; defaults to 1. Fractions are truncated toward
     *   zero. Negative, NaN and infinite values add nothing, so the counter
     *   stays monotonic.
     */
    incAuditLines(n = 1) {
        // Math.trunc is used instead of `n | 0`: the bitwise form converts to a
        // signed 32-bit integer, so any increment of 2^31 or more wrapped around
        // (typically to a negative number) and was then clamped to zero.
        const whole = Math.trunc(Number(n));
        if (Number.isFinite(whole) && whole > 0) {
            this.auditLinesAppended += whole;
        }
    }
    /** Record the current circuit-breaker gauge value for `server`. */
    setCircuitState(server, value) {
        this.circuitState.set(server, value);
    }
    /**
     * Record that the halt file was just consulted.
     *
     * @param nowMs Timestamp in epoch milliseconds; defaults to `Date.now()`.
     */
    markHaltCheck(nowMs = Date.now()) {
        this.lastHaltCheckMs = nowMs;
    }
    /** Snapshot for tests / diagnostics. */
    snapshot() {
        return {
            downstreamCalls: Object.fromEntries(this.downstreamCalls),
            downstreamErrors: Object.fromEntries(this.downstreamErrors),
            downstreamInFlight: Object.fromEntries(this.downstreamInFlight),
            circuitState: Object.fromEntries(this.circuitState),
            auditLinesAppended: this.auditLinesAppended,
            lastHaltCheckMs: this.lastHaltCheckMs,
        };
    }
    /**
     * Render the Prometheus text exposition. Every metric gets HELP + TYPE
     * headers even when its table is empty — that makes the output stable
     * across scrapes and easier to diff.
     *
     * @param nowMs Reference time in epoch milliseconds for the halt-check
     *   gauge; defaults to `Date.now()`.
     * @returns The exposition text, always ending in a newline.
     */
    render(nowMs = Date.now()) {
        const lines = [];
        // One header writer and two sample writers replace the former
        // copy-pasted counter/gauge emitters; the emitted text is unchanged.
        const emitHeader = (name, type, help) => {
            lines.push(`# HELP ${name} ${help}`, `# TYPE ${name} ${type}`);
        };
        const emitPerServer = (name, type, help, rows) => {
            emitHeader(name, type, help);
            for (const [server, v] of rows) {
                lines.push(`${name}{server="${escapeLabel(server)}"} ${v}`);
            }
        };
        const emitScalar = (name, type, help, value) => {
            emitHeader(name, type, help);
            lines.push(`${name} ${value}`);
        };
        emitPerServer('rea_downstream_calls_total', 'counter', 'Total tool calls dispatched to each downstream server.', this.downstreamCalls);
        emitPerServer('rea_downstream_errors_total', 'counter', 'Total failed tool calls per downstream server.', this.downstreamErrors);
        emitPerServer('rea_downstream_in_flight', 'gauge', 'Tool calls currently executing against each downstream server.', this.downstreamInFlight);
        emitScalar('rea_audit_lines_appended_total', 'counter', 'Audit lines appended since gateway start.', this.auditLinesAppended);
        emitPerServer('rea_circuit_breaker_state', 'gauge', 'Circuit breaker state per server (0=closed, 1=half-open, 2=open).', this.circuitState);
        // -1 is the "never checked" sentinel; otherwise the age is clamped at 0
        // so a `nowMs` earlier than the recorded check cannot go negative.
        const secondsSince = this.lastHaltCheckMs === null ? -1 : Math.max(0, (nowMs - this.lastHaltCheckMs) / 1000);
        emitScalar('rea_seconds_since_last_halt_check', 'gauge', 'Seconds since the middleware last consulted .rea/HALT.', secondsSince);
        // Prometheus requires a trailing newline.
        return lines.join('\n') + '\n';
    }
}
/**
 * Escape a label value for the Prometheus text format: a backslash becomes
 * `\\`, a double quote becomes `\"` and a line feed becomes `\n`.
 * Server names come from the registry, which already restricts the allowed
 * charset; escaping here is defense-in-depth.
 */
function escapeLabel(value) {
    // A single pass maps each special character independently, which yields
    // the same text as escaping backslashes, then quotes, then newlines.
    return value.replace(/[\\"\n]/g, (ch) => (ch === '\n' ? '\\n' : `\\${ch}`));
}
/**
 * Answer one request on the metrics listener: 405 for anything but GET, 404
 * for any path other than the metrics path, otherwise the rendered registry.
 * Response bodies for error statuses are fixed strings — the request URL is
 * never echoed back.
 */
function respondToMetricsRequest(req, res, opts) {
    // Defensive: if the url is missing or non-string we treat it as 404.
    const url = typeof req.url === 'string' ? req.url : '';
    // Strip any query string; exposition format endpoints ignore it.
    const pathOnly = url.split('?')[0] ?? '';
    if (req.method !== 'GET') {
        res.statusCode = 405;
        res.setHeader('Content-Type', 'text/plain; charset=utf-8');
        res.setHeader('Allow', 'GET');
        res.end('method not allowed\n');
        return;
    }
    if (pathOnly !== METRICS_PATH) {
        res.statusCode = 404;
        res.setHeader('Content-Type', 'text/plain; charset=utf-8');
        res.end('not found\n');
        return;
    }
    try {
        const body = opts.registry.render();
        res.statusCode = 200;
        // Prometheus convention: the version=0.0.4 exposition format is served
        // as text/plain.
        res.setHeader('Content-Type', 'text/plain; version=0.0.4; charset=utf-8');
        res.end(body);
    }
    catch (err) {
        // Don't leak stack or internals to the client. Log for the operator.
        opts.logger?.error({
            event: 'metrics.render_failed',
            message: 'failed to render metrics',
            error: err instanceof Error ? err.message : String(err),
        });
        res.statusCode = 500;
        res.setHeader('Content-Type', 'text/plain; charset=utf-8');
        res.end('internal error\n');
    }
}
/**
 * Close `server` with a bounded wall-clock. A graceful `server.close()` is
 * attempted first; if it has not completed within CLOSE_DEADLINE_MS, every
 * tracked socket is destroyed and the returned promise resolves regardless.
 * The promise never rejects.
 */
function closeMetricsServer(server, sockets) {
    return new Promise((closeResolve) => {
        let settled = false;
        // Declared before `finish` so the callback never depends on the timer
        // having been created first.
        let deadline;
        const finish = () => {
            if (settled)
                return;
            settled = true;
            clearTimeout(deadline);
            closeResolve();
        };
        // Happy path: the callback fires once in-flight requests and their
        // sockets have drained naturally.
        server.close(() => finish());
        // Fallback path: idle keep-alive sessions are closed first (the method
        // is optional on older Node), then whatever is left is destroyed —
        // including in-flight requests a stalled scraper could otherwise pin.
        deadline = setTimeout(() => {
            try {
                server.closeIdleConnections?.();
            }
            catch {
                // Best-effort.
            }
            for (const sock of sockets) {
                try {
                    sock.destroy();
                }
                catch {
                    // Sockets may already be closing.
                }
            }
            sockets.clear();
            // Settle directly rather than waiting for the close() callback.
            finish();
        }, CLOSE_DEADLINE_MS);
        // Don't let the timer hold the process open if shutdown beats it.
        deadline.unref();
    });
}
/**
 * Start a loopback-only HTTP server that serves `/metrics`.
 *
 * Security posture:
 *  - Binds to 127.0.0.1 by default. The `host` option is checked against a
 *    strict loopback allowlist BEFORE any server object is created; any other
 *    value rejects the returned promise with a TypeError.
 *  - Rejects every non-GET request with 405 (Prometheus scrapers only GET).
 *  - Rejects every path ≠ `/metrics` with 404, with a fixed body.
 *  - No query-string parsing, no request body read, no cookies.
 *
 * @param opts `port`, `registry`, and optional `logger` / `host`.
 * @returns A promise resolving to a handle with `port()` (the port actually
 *   bound) and `close()`. It rejects with a TypeError for a non-loopback
 *   host, or with the listener's error if binding fails.
 */
export function startMetricsServer(opts) {
    return new Promise((resolve, reject) => {
        // Validate the bind host first so a caller bug cannot get as far as
        // constructing a server for a non-loopback interface.
        let host;
        if (opts.host === undefined) {
            host = LOOPBACK;
        }
        else if (ALLOWED_HOSTS.has(opts.host)) {
            host = opts.host;
        }
        else {
            reject(new TypeError(`rea metrics: refusing to bind host "${opts.host}" — only loopback (127.0.0.1, ::1) is permitted; the endpoint has no auth`));
            return;
        }
        // Track every live socket so shutdown can guarantee a bounded
        // wall-clock: `server.close()` alone only stops NEW connections.
        const sockets = new Set();
        const server = http.createServer((req, res) => respondToMetricsRequest(req, res, opts));
        server.on('connection', (socket) => {
            sockets.add(socket);
            socket.once('close', () => {
                sockets.delete(socket);
            });
        });
        let listening = false;
        server.on('error', (err) => {
            if (!listening) {
                // Startup failure (e.g. port already in use) — surface to the caller.
                reject(err);
                return;
            }
            // Once bound, the promise is already settled and `reject` would be a
            // silent no-op; log so the operator still sees the failure.
            opts.logger?.error({
                event: 'metrics.server_error',
                message: 'metrics server error after startup',
                error: err instanceof Error ? err.message : String(err),
            });
        });
        server.listen(opts.port, host, () => {
            listening = true;
            const addr = server.address();
            const actualPort = addr !== null && typeof addr === 'object' ? addr.port : opts.port;
            opts.logger?.info({
                event: 'metrics.listening',
                message: `metrics endpoint bound on ${host}:${actualPort}${METRICS_PATH}`,
                port: actualPort,
                host,
            });
            resolve({
                port: () => actualPort,
                close: () => closeMetricsServer(server, sockets),
            });
        });
    });
}
/**
 * Parse and validate `REA_METRICS_PORT`.
 *
 * Only a plain run of decimal digits (surrounding whitespace allowed) is
 * accepted. Bare `Number()` coercion would also accept hexadecimal
 * (`0x2490`), exponent (`9e3`), signed (`+9464`) and fractional (`9464.0`)
 * spellings, which are far more likely to be typos than intended ports.
 *
 * @param raw Raw environment value. `undefined`, empty and whitespace-only
 *   values mean "not configured": `null` is returned without a warning.
 * @param logger Optional logger; receives a `metrics.port_invalid` warning
 *   when `raw` is present but is not a valid port.
 * @returns The port (1–65535), or `null` when the metrics endpoint must not
 *   start — we never silently bind on a default.
 */
export function resolveMetricsPort(raw, logger) {
    if (raw === undefined)
        return null;
    const text = raw.trim();
    if (text === '')
        return null;
    const port = /^\d+$/.test(text) ? Number(text) : Number.NaN;
    if (!Number.isInteger(port) || port < 1 || port > 65535) {
        logger?.warn({
            event: 'metrics.port_invalid',
            message: `REA_METRICS_PORT="${raw}" is not a valid TCP port; metrics endpoint will NOT start`,
        });
        return null;
    }
    return port;
}
@@ -33,10 +33,25 @@ import { Server } from '@modelcontextprotocol/sdk/server/index.js';
33
33
  import { DownstreamPool } from './downstream-pool.js';
34
34
  import type { Registry } from '../registry/types.js';
35
35
  import type { Policy } from '../policy/types.js';
36
+ import { type Logger } from './log.js';
37
+ import { type MetricsRegistry } from './observability/metrics.js';
36
38
  export interface GatewayOptions {
37
39
  baseDir: string;
38
40
  policy: Policy;
39
41
  registry: Registry;
42
+ /**
43
+ * Optional structured logger. If omitted, a default logger is created that
44
+ * writes to `process.stderr` honoring `REA_LOG_LEVEL`. Tests inject their
45
+ * own logger to capture records.
46
+ */
47
+ logger?: Logger;
48
+ /**
49
+ * Optional metrics registry. When supplied, the terminal middleware and
50
+ * connection lifecycle events increment counters/gauges on it. When
51
+ * omitted, no metrics are recorded — this keeps the gateway usable in
52
+ * tests without bringing in the metrics surface.
53
+ */
54
+ metrics?: MetricsRegistry;
40
55
  }
41
56
  export interface GatewayHandle {
42
57
  /** Expose the Server for test harnesses that attach InMemoryTransport. */
@@ -47,5 +62,9 @@ export interface GatewayHandle {
47
62
  stop(): Promise<void>;
48
63
  /** Exposed for tests. */
49
64
  pool: DownstreamPool;
65
+ /** The active logger — shared with serve.ts so startup messages stay in one sink. */
66
+ logger: Logger;
67
+ /** Optional metrics registry (undefined when the caller did not supply one). */
68
+ metrics: MetricsRegistry | undefined;
50
69
  }
51
70
  export declare function createGateway(opts: GatewayOptions): GatewayHandle;
@@ -31,7 +31,7 @@
31
31
  */
32
32
  import { Server } from '@modelcontextprotocol/sdk/server/index.js';
33
33
  import { StdioServerTransport } from '@modelcontextprotocol/sdk/server/stdio.js';
34
- import { CallToolRequestSchema, ListToolsRequestSchema, } from '@modelcontextprotocol/sdk/types.js';
34
+ import { CallToolRequestSchema, ListToolsRequestSchema } from '@modelcontextprotocol/sdk/types.js';
35
35
  import { DownstreamPool, splitPrefixed } from './downstream-pool.js';
36
36
  import { createAuditMiddleware } from './middleware/audit.js';
37
37
  import { createKillSwitchMiddleware } from './middleware/kill-switch.js';
@@ -41,7 +41,7 @@ import { createBlockedPathsMiddleware } from './middleware/blocked-paths.js';
41
41
  import { createRateLimitMiddleware } from './middleware/rate-limit.js';
42
42
  import { createCircuitBreakerMiddleware } from './middleware/circuit-breaker.js';
43
43
  import { createInjectionMiddleware } from './middleware/injection.js';
44
- import { createRedactMiddleware, } from './middleware/redact.js';
44
+ import { createRedactMiddleware } from './middleware/redact.js';
45
45
  import { wrapRegex } from './redact-safe/match-timeout.js';
46
46
  import { createResultSizeCapMiddleware } from './middleware/result-size-cap.js';
47
47
  import { executeChain } from './middleware/chain.js';
@@ -50,6 +50,8 @@ import { CircuitBreaker } from './circuit-breaker.js';
50
50
  import { currentSessionId } from './session.js';
51
51
  import { InvocationStatus, Tier } from '../policy/types.js';
52
52
  import { log } from '../cli/utils.js';
53
+ import { createLogger } from './log.js';
54
+ import { CIRCUIT_GAUGE } from './observability/metrics.js';
53
55
  /**
54
56
  * Build the ordered middleware chain used on every CallToolRequest.
55
57
  * Order is prescriptive — DO NOT reorder without reading THREAT_MODEL.md §
@@ -82,30 +84,68 @@ function compileUserRedactPatterns(policy, matchTimeoutMs) {
82
84
  }
83
85
  return out;
84
86
  }
85
- function buildMiddlewareChain(opts) {
86
- const { baseDir, policy } = opts;
87
+ function buildMiddlewareChain(opts, deps) {
88
+ const { baseDir, policy, metrics } = opts;
87
89
  const matchTimeoutMs = policy.redact?.match_timeout_ms ?? 100;
88
90
  const userPatterns = compileUserRedactPatterns(policy, matchTimeoutMs);
89
91
  return [
90
- createAuditMiddleware(baseDir, policy),
91
- createKillSwitchMiddleware(baseDir),
92
+ // Metrics threaded through so `rea_audit_lines_appended_total` advances
93
+ // on every durable audit append and `rea_seconds_since_last_halt_check`
94
+ // reflects per-invocation cadence, not gateway uptime.
95
+ createAuditMiddleware(baseDir, policy, metrics),
96
+ createKillSwitchMiddleware(baseDir, metrics),
92
97
  createTierMiddleware(),
93
98
  createPolicyMiddleware(policy, undefined, baseDir),
94
99
  createBlockedPathsMiddleware(policy, baseDir),
95
100
  createRateLimitMiddleware(new RateLimiter()),
96
- createCircuitBreakerMiddleware(new CircuitBreaker()),
97
- createInjectionMiddleware(policy.injection_detection === 'warn' ? 'warn' : 'block', {
98
- matchTimeoutMs,
99
- }),
101
+ createCircuitBreakerMiddleware(deps.breaker),
102
+ createInjectionMiddleware(policy.injection_detection === 'warn' ? 'warn' : 'block', (() => {
103
+ // G9 follow-up: preserve the tri-state for `suspiciousBlocksWrites`
104
+ // (true / false / undefined-omitted). With `exactOptionalPropertyTypes`
105
+ // we must omit the key entirely rather than passing `undefined` so
106
+ // the middleware's `?? true` / `?? false` default logic runs for
107
+ // consumers who did not configure the flag. `bst-internal*` profiles
108
+ // pin the flag explicitly.
109
+ const pinned = policy.injection?.suspicious_blocks_writes;
110
+ return pinned === undefined
111
+ ? { matchTimeoutMs }
112
+ : { matchTimeoutMs, suspiciousBlocksWrites: pinned };
113
+ })()),
100
114
  createRedactMiddleware({ matchTimeoutMs, userPatterns }),
101
115
  createResultSizeCapMiddleware(),
102
116
  ];
103
117
  }
104
118
  export function createGateway(opts) {
105
119
  const { registry } = opts;
106
- const pool = new DownstreamPool(registry);
120
+ const logger = opts.logger ?? createLogger({ base: { session_id: currentSessionId() } });
121
+ const metrics = opts.metrics;
122
+ const pool = new DownstreamPool(registry, logger);
107
123
  const server = new Server({ name: 'rea', version: '0.2.0' }, { capabilities: { tools: {} } });
108
- const staticChain = buildMiddlewareChain(opts);
124
+ // Build the circuit breaker with observability hooks wired in — state
125
+ // transitions log a structured record AND update the Prometheus gauge.
126
+ const breaker = new CircuitBreaker({
127
+ onStateChange: (event) => {
128
+ const level = event.to === 'open' ? 'warn' : 'info';
129
+ logger[level]({
130
+ event: `circuit.${event.to.replace('-', '_')}`,
131
+ server_name: event.server,
132
+ message: `circuit-breaker: "${event.server}" ${event.from} → ${event.to} (${event.reason})`,
133
+ ...(event.retryAt !== undefined ? { retry_at: event.retryAt } : {}),
134
+ });
135
+ switch (event.to) {
136
+ case 'closed':
137
+ metrics?.setCircuitState(event.server, CIRCUIT_GAUGE.closed);
138
+ break;
139
+ case 'half-open':
140
+ metrics?.setCircuitState(event.server, CIRCUIT_GAUGE.halfOpen);
141
+ break;
142
+ case 'open':
143
+ metrics?.setCircuitState(event.server, CIRCUIT_GAUGE.open);
144
+ break;
145
+ }
146
+ },
147
+ });
148
+ const staticChain = buildMiddlewareChain(opts, { breaker });
109
149
  // ── Handlers ─────────────────────────────────────────────────────────────
110
150
  server.setRequestHandler(ListToolsRequestSchema, async () => {
111
151
  if (pool.size === 0)
@@ -159,13 +199,19 @@ export function createGateway(opts) {
159
199
  context.error = 'No downstream servers in .rea/registry.yaml — add one to enable proxying';
160
200
  return;
161
201
  }
202
+ metrics?.incDownstreamCall(serverName);
203
+ metrics?.incDownstreamInFlight(serverName);
162
204
  try {
163
205
  context.result = await pool.callTool(prefixed, context.arguments);
164
206
  }
165
207
  catch (err) {
208
+ metrics?.incDownstreamError(serverName);
166
209
  context.status = InvocationStatus.Error;
167
210
  context.error = err instanceof Error ? err.message : String(err);
168
211
  }
212
+ finally {
213
+ metrics?.decDownstreamInFlight(serverName);
214
+ }
169
215
  };
170
216
  try {
171
217
  await executeChain([...staticChain, terminal], ctx);
@@ -224,14 +270,51 @@ export function createGateway(opts) {
224
270
  // Connect to downstream children first so the `listTools` catalog is ready
225
271
  // by the time the upstream client connects.
226
272
  if (pool.size === 0) {
227
- log('rea serve: no downstream servers in .rea/registry.yaml — running in no-op mode. Add servers to enable proxying.');
273
+ logger.info({
274
+ event: 'gateway.no_downstreams',
275
+ message: 'no downstream servers in .rea/registry.yaml — running in no-op mode. Add servers to enable proxying.',
276
+ });
228
277
  }
229
278
  else {
279
+ for (const s of registry.servers) {
280
+ if (!s.enabled)
281
+ continue;
282
+ logger.info({
283
+ event: 'downstream.connect_attempt',
284
+ server_name: s.name,
285
+ message: `connecting downstream "${s.name}"`,
286
+ });
287
+ }
230
288
  try {
231
289
  await pool.connectAll();
290
+ for (const s of registry.servers) {
291
+ if (!s.enabled)
292
+ continue;
293
+ const conn = pool.getConnection(s.name);
294
+ if (conn !== undefined && conn.isHealthy) {
295
+ logger.info({
296
+ event: 'downstream.connected',
297
+ server_name: s.name,
298
+ message: `downstream "${s.name}" connected`,
299
+ });
300
+ // Every healthy downstream starts in the closed state — record
301
+ // the initial circuit-breaker gauge so scrapers see a baseline.
302
+ metrics?.setCircuitState(s.name, CIRCUIT_GAUGE.closed);
303
+ }
304
+ else {
305
+ logger.warn({
306
+ event: 'downstream.unhealthy_on_start',
307
+ server_name: s.name,
308
+ message: `downstream "${s.name}" did not come up healthy`,
309
+ });
310
+ }
311
+ }
232
312
  }
233
313
  catch (err) {
234
- log(`rea serve: downstream connect error: ${err instanceof Error ? err.message : err}`);
314
+ logger.error({
315
+ event: 'downstream.connect_failed',
316
+ message: `downstream connect error: ${err instanceof Error ? err.message : err}`,
317
+ });
235
318
  // Continue — individual connections may still be healthy.
236
319
  }
237
320
  }
@@ -242,6 +325,7 @@ export function createGateway(opts) {
242
325
  if (stopping)
243
326
  return;
244
327
  stopping = true;
328
+ logger.info({ event: 'gateway.shutdown', message: 'gateway stop requested' });
245
329
  try {
246
330
  await server.close();
247
331
  }
@@ -250,7 +334,7 @@ export function createGateway(opts) {
250
334
  }
251
335
  await pool.close();
252
336
  }
253
- return { server, start, stop, pool };
337
+ return { server, start, stop, pool, logger, metrics };
254
338
  }
255
339
  // Prevent TS from complaining about the unused `Tier` import when the file is
256
340
  // compiled in isolation; keeping the import pins the semantic dependency edge