@bookedsolid/rea 0.3.0 → 0.4.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/.husky/pre-push +15 -18
- package/README.md +41 -1
- package/dist/cli/doctor.d.ts +19 -4
- package/dist/cli/doctor.js +172 -5
- package/dist/cli/index.js +9 -1
- package/dist/cli/init.js +93 -7
- package/dist/cli/install/pre-push.d.ts +335 -0
- package/dist/cli/install/pre-push.js +2818 -0
- package/dist/cli/serve.d.ts +64 -0
- package/dist/cli/serve.js +270 -2
- package/dist/cli/status.d.ts +90 -0
- package/dist/cli/status.js +399 -0
- package/dist/cli/utils.d.ts +4 -0
- package/dist/cli/utils.js +4 -0
- package/dist/gateway/circuit-breaker.d.ts +17 -0
- package/dist/gateway/circuit-breaker.js +32 -3
- package/dist/gateway/downstream-pool.d.ts +2 -1
- package/dist/gateway/downstream-pool.js +2 -2
- package/dist/gateway/downstream.d.ts +39 -3
- package/dist/gateway/downstream.js +73 -14
- package/dist/gateway/log.d.ts +122 -0
- package/dist/gateway/log.js +334 -0
- package/dist/gateway/middleware/audit.d.ts +10 -1
- package/dist/gateway/middleware/audit.js +26 -1
- package/dist/gateway/middleware/blocked-paths.d.ts +0 -9
- package/dist/gateway/middleware/blocked-paths.js +439 -67
- package/dist/gateway/middleware/injection.d.ts +218 -13
- package/dist/gateway/middleware/injection.js +433 -51
- package/dist/gateway/middleware/kill-switch.d.ts +10 -1
- package/dist/gateway/middleware/kill-switch.js +20 -1
- package/dist/gateway/observability/metrics.d.ts +125 -0
- package/dist/gateway/observability/metrics.js +321 -0
- package/dist/gateway/server.d.ts +19 -0
- package/dist/gateway/server.js +99 -15
- package/dist/policy/loader.d.ts +13 -0
- package/dist/policy/loader.js +28 -0
- package/dist/policy/profiles.d.ts +13 -0
- package/dist/policy/profiles.js +12 -0
- package/dist/policy/types.d.ts +28 -0
- package/dist/registry/fingerprint.d.ts +73 -0
- package/dist/registry/fingerprint.js +81 -0
- package/dist/registry/fingerprints-store.d.ts +62 -0
- package/dist/registry/fingerprints-store.js +111 -0
- package/dist/registry/interpolate.d.ts +58 -0
- package/dist/registry/interpolate.js +121 -0
- package/dist/registry/loader.d.ts +2 -2
- package/dist/registry/loader.js +22 -1
- package/dist/registry/tofu-gate.d.ts +41 -0
- package/dist/registry/tofu-gate.js +189 -0
- package/dist/registry/tofu.d.ts +111 -0
- package/dist/registry/tofu.js +173 -0
- package/dist/registry/types.d.ts +9 -1
- package/package.json +1 -1
- package/profiles/bst-internal-no-codex.yaml +5 -0
- package/profiles/bst-internal.yaml +7 -0
- package/scripts/tarball-smoke.sh +197 -0
|
@@ -0,0 +1,125 @@
|
|
|
1
|
+
/**
|
|
2
|
+
* Minimal Prometheus-style metrics for `rea serve` (G5).
|
|
3
|
+
*
|
|
4
|
+
* The gateway exposes an OPT-IN `/metrics` endpoint when `REA_METRICS_PORT`
|
|
5
|
+
* is set. The project rule is "no silent listeners" — without that env var
|
|
6
|
+
* nothing binds to a port, ever. When set, we bind to 127.0.0.1 ONLY and
|
|
7
|
+
* respond in the standard Prometheus text-exposition format.
|
|
8
|
+
*
|
|
9
|
+
* ## What we expose
|
|
10
|
+
*
|
|
11
|
+
* rea_downstream_calls_total{server="<n>"} counter
|
|
12
|
+
* rea_downstream_errors_total{server="<n>"} counter
|
|
13
|
+
* rea_downstream_in_flight{server="<n>"} gauge
|
|
14
|
+
* rea_audit_lines_appended_total counter
|
|
15
|
+
* rea_circuit_breaker_state{server="<n>"} gauge (0=closed, 1=half-open, 2=open)
|
|
16
|
+
* rea_seconds_since_last_halt_check gauge
|
|
17
|
+
*
|
|
18
|
+
* Conventions match https://prometheus.io/docs/instrumenting/exposition_formats/
|
|
19
|
+
* — Unix-epoch timestamps omitted, `# HELP` / `# TYPE` lines included.
|
|
20
|
+
*
|
|
21
|
+
* ## What this is NOT
|
|
22
|
+
*
|
|
23
|
+
* - Not full OpenTelemetry. No traces, no histograms, no exemplars. If a user
|
|
24
|
+
* needs those, they can scrape these metrics and forward, or switch to an
|
|
25
|
+
* OTel pipeline later — the primitives are isolated in this file.
|
|
26
|
+
* - Not served over TLS. This is loopback-only tooling. Any cross-host scrape
|
|
27
|
+
* should tunnel through SSH or a reverse proxy.
|
|
28
|
+
* - Not a labelled cardinality bomb. Labels are limited to `server` (the set
|
|
29
|
+
* of downstreams is fixed by the registry) — we do NOT label by `tool_name`
|
|
30
|
+
* or anything user-controlled, which would let a downstream blow up the
|
|
31
|
+
* metrics store.
|
|
32
|
+
*
|
|
33
|
+
* ## Why handcrafted?
|
|
34
|
+
*
|
|
35
|
+
* prom-client is small but pulls its own tree of transitive deps we don't
|
|
36
|
+
* otherwise need. The exposition format is ~30 lines; we keep dep count
|
|
37
|
+
* low and avoid the supply-chain surface.
|
|
38
|
+
*/
|
|
39
|
+
import type { Logger } from '../log.js';
|
|
40
|
+
/**
|
|
41
|
+
* Encoded values for the circuit-breaker gauge. Keep numerically ordered by
|
|
42
|
+
* severity so a `max()` query surfaces the worst state.
|
|
43
|
+
*/
|
|
44
|
+
export declare const CIRCUIT_GAUGE: {
|
|
45
|
+
readonly closed: 0;
|
|
46
|
+
readonly halfOpen: 1;
|
|
47
|
+
readonly open: 2;
|
|
48
|
+
};
|
|
49
|
+
export type CircuitGaugeValue = (typeof CIRCUIT_GAUGE)[keyof typeof CIRCUIT_GAUGE];
|
|
50
|
+
/**
|
|
51
|
+
* In-process state for the counters and gauges. A single instance is owned
|
|
52
|
+
* by the gateway and passed to any collaborator that needs to record.
|
|
53
|
+
*
|
|
54
|
+
* Methods mutate synchronously and never throw — metrics failures must not
|
|
55
|
+
* interrupt a tool call.
|
|
56
|
+
*/
|
|
57
|
+
export declare class MetricsRegistry {
|
|
58
|
+
private readonly downstreamCalls;
|
|
59
|
+
private readonly downstreamErrors;
|
|
60
|
+
private readonly downstreamInFlight;
|
|
61
|
+
private readonly circuitState;
|
|
62
|
+
private auditLinesAppended;
|
|
63
|
+
private lastHaltCheckMs;
|
|
64
|
+
incDownstreamCall(server: string): void;
|
|
65
|
+
incDownstreamError(server: string): void;
|
|
66
|
+
incDownstreamInFlight(server: string): void;
|
|
67
|
+
decDownstreamInFlight(server: string): void;
|
|
68
|
+
incAuditLines(n?: number): void;
|
|
69
|
+
setCircuitState(server: string, value: CircuitGaugeValue): void;
|
|
70
|
+
markHaltCheck(nowMs?: number): void;
|
|
71
|
+
/** Snapshot for tests / diagnostics. */
|
|
72
|
+
snapshot(): {
|
|
73
|
+
downstreamCalls: Record<string, number>;
|
|
74
|
+
downstreamErrors: Record<string, number>;
|
|
75
|
+
downstreamInFlight: Record<string, number>;
|
|
76
|
+
circuitState: Record<string, CircuitGaugeValue>;
|
|
77
|
+
auditLinesAppended: number;
|
|
78
|
+
lastHaltCheckMs: number | null;
|
|
79
|
+
};
|
|
80
|
+
/**
|
|
81
|
+
* Render the Prometheus text exposition. Every metric gets HELP + TYPE
|
|
82
|
+
* headers even when its table is empty — that makes the output stable
|
|
83
|
+
* across scrapes and easier to diff.
|
|
84
|
+
*/
|
|
85
|
+
render(nowMs?: number): string;
|
|
86
|
+
}
|
|
87
|
+
export interface MetricsServer {
|
|
88
|
+
/** Returns the port actually bound (useful for tests that pass port 0). */
|
|
89
|
+
port(): number;
|
|
90
|
+
close(): Promise<void>;
|
|
91
|
+
}
|
|
92
|
+
export interface StartMetricsServerOptions {
|
|
93
|
+
port: number;
|
|
94
|
+
registry: MetricsRegistry;
|
|
95
|
+
logger?: Logger;
|
|
96
|
+
/**
|
|
97
|
+
* Override the bind host. Only loopback values (`127.0.0.1`, `::1`) are
|
|
98
|
+
* accepted; any other value — including `localhost`, `0.0.0.0`, `::`, or
|
|
99
|
+
* any LAN IP — rejects the returned promise with a TypeError before a socket is opened. The
|
|
100
|
+
* /metrics endpoint has no auth, so binding a non-loopback interface
|
|
101
|
+
* would expose gateway internals to the network.
|
|
102
|
+
*
|
|
103
|
+
* Default: `127.0.0.1`.
|
|
104
|
+
*/
|
|
105
|
+
host?: string;
|
|
106
|
+
}
|
|
107
|
+
/**
|
|
108
|
+
* Start a loopback-only HTTP server that serves `/metrics`.
|
|
109
|
+
*
|
|
110
|
+
* Security posture:
|
|
111
|
+
* - Binds to 127.0.0.1 by default. Callers cannot override to a public
|
|
112
|
+
* interface from the CLI; the `host` option exists for test injection.
|
|
113
|
+
* - Rejects every non-GET request with 405 (Prometheus scrapers only GET).
|
|
114
|
+
* - Rejects every path ≠ `/metrics` with 404. The body is a fixed string —
|
|
115
|
+
* we do NOT echo the request path, which would allow response splitting
|
|
116
|
+
* or reflected content.
|
|
117
|
+
* - No query-string parsing, no request body read, no cookies.
|
|
118
|
+
*/
|
|
119
|
+
export declare function startMetricsServer(opts: StartMetricsServerOptions): Promise<MetricsServer>;
|
|
120
|
+
/**
|
|
121
|
+
* Parse and validate `REA_METRICS_PORT`. Returns the numeric port, or `null`
|
|
122
|
+
* if the env var is unset / malformed. An out-of-range or non-numeric value
|
|
123
|
+
* logs a warning and also returns null — we never silently bind on a default.
|
|
124
|
+
*/
|
|
125
|
+
export declare function resolveMetricsPort(raw: string | undefined, logger?: Logger): number | null;
|
|
@@ -0,0 +1,321 @@
|
|
|
1
|
+
/**
|
|
2
|
+
* Minimal Prometheus-style metrics for `rea serve` (G5).
|
|
3
|
+
*
|
|
4
|
+
* The gateway exposes an OPT-IN `/metrics` endpoint when `REA_METRICS_PORT`
|
|
5
|
+
* is set. The project rule is "no silent listeners" — without that env var
|
|
6
|
+
* nothing binds to a port, ever. When set, we bind to 127.0.0.1 ONLY and
|
|
7
|
+
* respond in the standard Prometheus text-exposition format.
|
|
8
|
+
*
|
|
9
|
+
* ## What we expose
|
|
10
|
+
*
|
|
11
|
+
* rea_downstream_calls_total{server="<n>"} counter
|
|
12
|
+
* rea_downstream_errors_total{server="<n>"} counter
|
|
13
|
+
* rea_downstream_in_flight{server="<n>"} gauge
|
|
14
|
+
* rea_audit_lines_appended_total counter
|
|
15
|
+
* rea_circuit_breaker_state{server="<n>"} gauge (0=closed, 1=half-open, 2=open)
|
|
16
|
+
* rea_seconds_since_last_halt_check gauge
|
|
17
|
+
*
|
|
18
|
+
* Conventions match https://prometheus.io/docs/instrumenting/exposition_formats/
|
|
19
|
+
* — Unix-epoch timestamps omitted, `# HELP` / `# TYPE` lines included.
|
|
20
|
+
*
|
|
21
|
+
* ## What this is NOT
|
|
22
|
+
*
|
|
23
|
+
* - Not full OpenTelemetry. No traces, no histograms, no exemplars. If a user
|
|
24
|
+
* needs those, they can scrape these metrics and forward, or switch to an
|
|
25
|
+
* OTel pipeline later — the primitives are isolated in this file.
|
|
26
|
+
* - Not served over TLS. This is loopback-only tooling. Any cross-host scrape
|
|
27
|
+
* should tunnel through SSH or a reverse proxy.
|
|
28
|
+
* - Not a labelled cardinality bomb. Labels are limited to `server` (the set
|
|
29
|
+
* of downstreams is fixed by the registry) — we do NOT label by `tool_name`
|
|
30
|
+
* or anything user-controlled, which would let a downstream blow up the
|
|
31
|
+
* metrics store.
|
|
32
|
+
*
|
|
33
|
+
* ## Why handcrafted?
|
|
34
|
+
*
|
|
35
|
+
* prom-client is small but pulls its own tree of transitive deps we don't
|
|
36
|
+
* otherwise need. The exposition format is ~30 lines; we keep dep count
|
|
37
|
+
* low and avoid the supply-chain surface.
|
|
38
|
+
*/
|
|
39
|
+
import http from 'node:http';
|
|
40
|
+
/**
|
|
41
|
+
* Loopback address we bind to. IPv4 first by convention — operators expect
|
|
42
|
+
* `curl http://127.0.0.1:<port>/metrics` to work without dual-stack surprise.
|
|
43
|
+
*/
|
|
44
|
+
const LOOPBACK = '127.0.0.1';
|
|
45
|
+
/**
|
|
46
|
+
* Strict allowlist of host values that `startMetricsServer` will accept.
|
|
47
|
+
* Anything else (0.0.0.0, ::, LAN IPs, hostnames) is rejected at the API
|
|
48
|
+
* boundary so no in-process caller can accidentally expose the
|
|
49
|
+
* unauthenticated /metrics surface to the network.
|
|
50
|
+
*
|
|
51
|
+
* SECURITY: Do NOT add non-loopback entries. If you need off-host scraping,
|
|
52
|
+
* tunnel via SSH or front 127.0.0.1 with a TLS-terminating reverse proxy.
|
|
53
|
+
*/
|
|
54
|
+
const ALLOWED_HOSTS = new Set(['127.0.0.1', '::1']);
|
|
55
|
+
/** Path we serve. All other paths get 404. */
|
|
56
|
+
const METRICS_PATH = '/metrics';
|
|
57
|
+
/**
|
|
58
|
+
* Wall-clock budget for `server.close()`. Past this point any surviving
|
|
59
|
+
* keep-alive sockets are destroyed outright so shutdown never waits on a
|
|
60
|
+
* Prometheus scraper that is holding the connection open.
|
|
61
|
+
*/
|
|
62
|
+
const CLOSE_DEADLINE_MS = 2_000;
|
|
63
|
+
/**
 * Numeric encoding of circuit-breaker states for the
 * `rea_circuit_breaker_state` gauge. Values grow with severity
 * (closed < half-open < open) so a `max()` query surfaces the worst state.
 *
 * The object is frozen so the runtime value honours the `readonly` members
 * declared in the typings: a stray assignment throws in strict/ESM code
 * instead of silently re-mapping a state for every consumer.
 */
export const CIRCUIT_GAUGE = Object.freeze({
  closed: 0,
  halfOpen: 1,
  open: 2,
});
|
|
72
|
+
/**
 * In-process state for the gateway's counters and gauges.
 *
 * One instance is shared by every collaborator that records a metric.
 * Mutators run synchronously and are written to avoid throwing on odd
 * numeric input, because a metrics failure must not interrupt a tool call.
 *
 * Per-server tables are `Map`s keyed by downstream server name; `render()`
 * emits rows in insertion order.
 */
export class MetricsRegistry {
  /** server name → total calls dispatched. */
  downstreamCalls = new Map();
  /** server name → total failed calls. */
  downstreamErrors = new Map();
  /** server name → calls currently executing (never below 0). */
  downstreamInFlight = new Map();
  /** server name → last recorded circuit-breaker gauge value. */
  circuitState = new Map();
  /** Audit lines appended since this registry was created. */
  auditLinesAppended = 0;
  /** Epoch milliseconds of the last recorded HALT check, or `null` if none yet. */
  lastHaltCheckMs = null;

  /** Count one call dispatched to `server`. */
  incDownstreamCall(server) {
    this.downstreamCalls.set(server, (this.downstreamCalls.get(server) ?? 0) + 1);
  }

  /** Count one failed call against `server`. */
  incDownstreamError(server) {
    this.downstreamErrors.set(server, (this.downstreamErrors.get(server) ?? 0) + 1);
  }

  /** Mark one more call as executing against `server`. */
  incDownstreamInFlight(server) {
    this.downstreamInFlight.set(server, (this.downstreamInFlight.get(server) ?? 0) + 1);
  }

  /** Mark one call against `server` as finished; the gauge is clamped at 0. */
  decDownstreamInFlight(server) {
    const current = this.downstreamInFlight.get(server) ?? 0;
    this.downstreamInFlight.set(server, Math.max(0, current - 1));
  }

  /**
   * Add `n` (default 1) to the audit-line counter. Fractions are truncated
   * toward zero; negative, NaN and infinite values are ignored so the
   * counter stays monotonic.
   */
  incAuditLines(n = 1) {
    // `n | 0` coerces to a signed 32-bit integer, so 2**31 and above wrap
    // negative and the increment is silently dropped. Math.trunc keeps the
    // full safe-integer range.
    const whole = Math.trunc(Number(n));
    if (Number.isFinite(whole) && whole > 0) {
      this.auditLinesAppended += whole;
    }
  }

  /** Record the circuit-breaker gauge `value` for `server`. */
  setCircuitState(server, value) {
    this.circuitState.set(server, value);
  }

  /** Record the time (epoch ms, default now) of the latest HALT check. */
  markHaltCheck(nowMs = Date.now()) {
    this.lastHaltCheckMs = nowMs;
  }

  /** Plain-object copy of the current state, for tests / diagnostics. */
  snapshot() {
    return {
      downstreamCalls: Object.fromEntries(this.downstreamCalls),
      downstreamErrors: Object.fromEntries(this.downstreamErrors),
      downstreamInFlight: Object.fromEntries(this.downstreamInFlight),
      circuitState: Object.fromEntries(this.circuitState),
      auditLinesAppended: this.auditLinesAppended,
      lastHaltCheckMs: this.lastHaltCheckMs,
    };
  }

  /**
   * Render the Prometheus text exposition.
   *
   * Every metric gets its `# HELP` / `# TYPE` header even when its table is
   * empty, which keeps the output stable across scrapes and easy to diff.
   * `rea_seconds_since_last_halt_check` reports `-1` until the first
   * `markHaltCheck()` call, and is otherwise clamped at 0.
   *
   * @param {number} [nowMs] - reference time in epoch ms (default `Date.now()`).
   * @returns {string} exposition text terminated by a newline.
   */
  render(nowMs = Date.now()) {
    const out = [];
    const emitHeader = (name, help, type) => {
      out.push(`# HELP ${name} ${help}`, `# TYPE ${name} ${type}`);
    };
    // Counters and gauges share one row format; only the TYPE line differs.
    const emitPerServer = (name, help, type, rows) => {
      emitHeader(name, help, type);
      for (const [server, value] of rows) {
        out.push(`${name}{server="${escapeLabel(server)}"} ${value}`);
      }
    };

    emitPerServer('rea_downstream_calls_total', 'Total tool calls dispatched to each downstream server.', 'counter', this.downstreamCalls);
    emitPerServer('rea_downstream_errors_total', 'Total failed tool calls per downstream server.', 'counter', this.downstreamErrors);
    emitPerServer('rea_downstream_in_flight', 'Tool calls currently executing against each downstream server.', 'gauge', this.downstreamInFlight);

    emitHeader('rea_audit_lines_appended_total', 'Audit lines appended since gateway start.', 'counter');
    out.push(`rea_audit_lines_appended_total ${this.auditLinesAppended}`);

    emitPerServer('rea_circuit_breaker_state', 'Circuit breaker state per server (0=closed, 1=half-open, 2=open).', 'gauge', this.circuitState);

    emitHeader('rea_seconds_since_last_halt_check', 'Seconds since the middleware last consulted .rea/HALT.', 'gauge');
    const secondsSince = this.lastHaltCheckMs === null
      ? -1
      : Math.max(0, (nowMs - this.lastHaltCheckMs) / 1000);
    out.push(`rea_seconds_since_last_halt_check ${secondsSince}`);

    // Prometheus requires a trailing newline.
    return `${out.join('\n')}\n`;
  }
}
|
|
155
|
+
/**
 * Escape a label value for the Prometheus text exposition format:
 * backslash becomes `\\`, double quote becomes `\"`, and line feed becomes
 * the two characters `\n`.
 *
 * The value is coerced with `String()` first. Label keys are whatever a
 * caller handed to the registry, and calling `.replace` on a non-string
 * would throw out of `render()` and fail the whole scrape.
 *
 * @param {unknown} value - raw label value.
 * @returns {string} the escaped label value.
 */
function escapeLabel(value) {
  // Single pass: each special character is rewritten exactly once, so the
  // backslashes introduced for `"` and LF are never themselves re-escaped.
  return String(value).replace(/[\\"\n]/g, (ch) => (ch === '\n' ? '\\n' : `\\${ch}`));
}
|
|
163
|
+
/**
 * Start a loopback-only HTTP server that serves `/metrics`.
 *
 * Security posture:
 *  - Binds to 127.0.0.1 unless `opts.host` names another entry of
 *    `ALLOWED_HOSTS`. Any other value rejects the returned promise with a
 *    `TypeError` before a server object is even created.
 *  - Every non-GET request is answered 405 with `Allow: GET`.
 *  - Every path other than `/metrics` is answered 404 with a fixed body;
 *    the request path is never echoed back.
 *  - The query string is discarded unparsed; the request body is never read.
 *
 * @param {object} opts
 * @param {number} opts.port - TCP port to bind (tests may pass 0).
 * @param {{ render(): string }} opts.registry - source of the exposition text.
 * @param {object} [opts.logger] - optional structured logger; `info` and
 *   `error` are called with a single record object.
 * @param {string} [opts.host] - loopback host override (default 127.0.0.1).
 * @returns {Promise<{ port(): number, close(): Promise<void> }>} resolves
 *   once the server is listening; rejects on a disallowed host or a bind error.
 */
export function startMetricsServer(opts) {
  return new Promise((resolve, reject) => {
    // Validate the bind host FIRST so a caller bug can never get as far as
    // constructing a server for a non-loopback interface. The endpoint has
    // no auth, so the allowlist is the only gate.
    const host = opts.host === undefined ? LOOPBACK : opts.host;
    if (!ALLOWED_HOSTS.has(host)) {
      reject(new TypeError(`rea metrics: refusing to bind host "${opts.host}" — only loopback (127.0.0.1, ::1) is permitted; the endpoint has no auth`));
      return;
    }

    // Fixed-body plain-text responder shared by the 4xx/5xx paths.
    const sendText = (res, status, body, extraHeaders = {}) => {
      res.statusCode = status;
      res.setHeader('Content-Type', 'text/plain; charset=utf-8');
      for (const [name, value] of Object.entries(extraHeaders)) {
        res.setHeader(name, value);
      }
      res.end(body);
    };

    // Track every live socket so shutdown has a bounded wall-clock:
    // `server.close()` alone only stops accepting NEW connections.
    const sockets = new Set();

    const server = http.createServer((req, res) => {
      // Defensive: a missing or non-string url falls through to 404.
      const url = typeof req.url === 'string' ? req.url : '';
      // Drop any query string; the endpoint ignores it.
      const pathOnly = url.split('?')[0] ?? '';

      if (req.method !== 'GET') {
        sendText(res, 405, 'method not allowed\n', { Allow: 'GET' });
        return;
      }
      if (pathOnly !== METRICS_PATH) {
        // Fixed body — never reflect `url` back to the client.
        sendText(res, 404, 'not found\n');
        return;
      }
      try {
        const body = opts.registry.render();
        res.statusCode = 200;
        // Prometheus text exposition format, version 0.0.4.
        res.setHeader('Content-Type', 'text/plain; version=0.0.4; charset=utf-8');
        res.end(body);
      } catch (err) {
        // Don't leak stack or internals to the client; log for the operator.
        opts.logger?.error({
          event: 'metrics.render_failed',
          message: 'failed to render metrics',
          error: err instanceof Error ? err.message : String(err),
        });
        sendText(res, 500, 'internal error\n');
      }
    });

    server.on('connection', (socket) => {
      sockets.add(socket);
      socket.once('close', () => {
        sockets.delete(socket);
      });
    });

    let listening = false;
    server.on('error', (err) => {
      if (!listening) {
        // Bind failure (EADDRINUSE, EACCES, ...) — surface to the caller.
        reject(err);
        return;
      }
      // The start promise has already resolved, so reject() would be a
      // silent no-op. Log instead of swallowing the error.
      opts.logger?.error({
        event: 'metrics.server_error',
        message: 'metrics server error',
        error: err instanceof Error ? err.message : String(err),
      });
    });

    server.listen(opts.port, host, () => {
      listening = true;
      const addr = server.address();
      const actualPort = addr !== null && typeof addr === 'object' ? addr.port : opts.port;
      opts.logger?.info({
        event: 'metrics.listening',
        message: `metrics endpoint bound on ${host}:${actualPort}${METRICS_PATH}`,
        port: actualPort,
        host,
      });

      const close = () => new Promise((closeResolve) => {
        let settled = false;
        let deadline;
        const finish = () => {
          if (settled) {
            return;
          }
          settled = true;
          clearTimeout(deadline);
          closeResolve();
        };

        // Fallback path: after CLOSE_DEADLINE_MS destroy whatever is still
        // connected — including in-flight requests, which a stalled scraper
        // could otherwise pin indefinitely. Created before `server.close()`
        // so `finish` never observes an unset timer.
        deadline = setTimeout(() => {
          try {
            server.closeIdleConnections?.();
          } catch {
            // Best-effort — method is optional on older Node.
          }
          for (const sock of sockets) {
            try {
              sock.destroy();
            } catch {
              // Socket may already be closing.
            }
          }
          sockets.clear();
          // Settle directly rather than depending on the close() callback
          // arriving after a forced socket shutdown.
          finish();
        }, CLOSE_DEADLINE_MS);
        // Don't let the timer hold the process open if shutdown beats it.
        deadline.unref();

        // Happy path: the callback fires once every connection has ended.
        server.close(() => finish());

        // Idle keep-alive sessions carry no in-flight work, so drop them
        // now instead of waiting out the deadline. On runtimes where
        // close() already does this the call is a harmless no-op.
        try {
          server.closeIdleConnections?.();
        } catch {
          // Best-effort — method is optional on older Node.
        }
      });

      resolve({
        port: () => actualPort,
        close,
      });
    });
  });
}
|
|
304
|
+
/**
 * Parse and validate `REA_METRICS_PORT`.
 *
 * Returns the numeric port, or `null` when the variable is unset, blank, or
 * invalid — there is never a fallback default port. Surrounding whitespace
 * is ignored. Only plain decimal digits are accepted: `Number()` would also
 * take `0x1F90`, `0b101` or `1e3`, and binding to a port that does not
 * textually match what the operator typed is too surprising for an opt-in
 * listener.
 *
 * An unset or blank value returns `null` silently; any other rejected value
 * also emits a `metrics.port_invalid` warning on `logger`.
 *
 * @param {string | undefined} raw - raw environment value.
 * @param {object} [logger] - optional structured logger (`warn` is used).
 * @returns {number | null} a port in 1..65535, or `null`.
 */
export function resolveMetricsPort(raw, logger) {
  if (raw === undefined) {
    return null;
  }
  const text = raw.trim();
  if (text === '') {
    return null;
  }
  const port = /^\d+$/.test(text) ? Number(text) : Number.NaN;
  const valid = Number.isInteger(port) && port >= 1 && port <= 65535;
  if (!valid) {
    logger?.warn({
      event: 'metrics.port_invalid',
      message: `REA_METRICS_PORT="${raw}" is not a valid TCP port; metrics endpoint will NOT start`,
    });
    return null;
  }
  return port;
}
|
package/dist/gateway/server.d.ts
CHANGED
|
@@ -33,10 +33,25 @@ import { Server } from '@modelcontextprotocol/sdk/server/index.js';
|
|
|
33
33
|
import { DownstreamPool } from './downstream-pool.js';
|
|
34
34
|
import type { Registry } from '../registry/types.js';
|
|
35
35
|
import type { Policy } from '../policy/types.js';
|
|
36
|
+
import { type Logger } from './log.js';
|
|
37
|
+
import { type MetricsRegistry } from './observability/metrics.js';
|
|
36
38
|
export interface GatewayOptions {
|
|
37
39
|
baseDir: string;
|
|
38
40
|
policy: Policy;
|
|
39
41
|
registry: Registry;
|
|
42
|
+
/**
|
|
43
|
+
* Optional structured logger. If omitted, a default logger is created that
|
|
44
|
+
* writes to `process.stderr` honoring `REA_LOG_LEVEL`. Tests inject their
|
|
45
|
+
* own logger to capture records.
|
|
46
|
+
*/
|
|
47
|
+
logger?: Logger;
|
|
48
|
+
/**
|
|
49
|
+
* Optional metrics registry. When supplied, the terminal middleware and
|
|
50
|
+
* connection lifecycle events increment counters/gauges on it. When
|
|
51
|
+
* omitted, no metrics are recorded — this keeps the gateway usable in
|
|
52
|
+
* tests without bringing in the metrics surface.
|
|
53
|
+
*/
|
|
54
|
+
metrics?: MetricsRegistry;
|
|
40
55
|
}
|
|
41
56
|
export interface GatewayHandle {
|
|
42
57
|
/** Expose the Server for test harnesses that attach InMemoryTransport. */
|
|
@@ -47,5 +62,9 @@ export interface GatewayHandle {
|
|
|
47
62
|
stop(): Promise<void>;
|
|
48
63
|
/** Exposed for tests. */
|
|
49
64
|
pool: DownstreamPool;
|
|
65
|
+
/** The active logger — shared with serve.ts so startup messages stay in one sink. */
|
|
66
|
+
logger: Logger;
|
|
67
|
+
/** Optional metrics registry (undefined when the caller did not supply one). */
|
|
68
|
+
metrics: MetricsRegistry | undefined;
|
|
50
69
|
}
|
|
51
70
|
export declare function createGateway(opts: GatewayOptions): GatewayHandle;
|
package/dist/gateway/server.js
CHANGED
|
@@ -31,7 +31,7 @@
|
|
|
31
31
|
*/
|
|
32
32
|
import { Server } from '@modelcontextprotocol/sdk/server/index.js';
|
|
33
33
|
import { StdioServerTransport } from '@modelcontextprotocol/sdk/server/stdio.js';
|
|
34
|
-
import { CallToolRequestSchema, ListToolsRequestSchema
|
|
34
|
+
import { CallToolRequestSchema, ListToolsRequestSchema } from '@modelcontextprotocol/sdk/types.js';
|
|
35
35
|
import { DownstreamPool, splitPrefixed } from './downstream-pool.js';
|
|
36
36
|
import { createAuditMiddleware } from './middleware/audit.js';
|
|
37
37
|
import { createKillSwitchMiddleware } from './middleware/kill-switch.js';
|
|
@@ -41,7 +41,7 @@ import { createBlockedPathsMiddleware } from './middleware/blocked-paths.js';
|
|
|
41
41
|
import { createRateLimitMiddleware } from './middleware/rate-limit.js';
|
|
42
42
|
import { createCircuitBreakerMiddleware } from './middleware/circuit-breaker.js';
|
|
43
43
|
import { createInjectionMiddleware } from './middleware/injection.js';
|
|
44
|
-
import { createRedactMiddleware
|
|
44
|
+
import { createRedactMiddleware } from './middleware/redact.js';
|
|
45
45
|
import { wrapRegex } from './redact-safe/match-timeout.js';
|
|
46
46
|
import { createResultSizeCapMiddleware } from './middleware/result-size-cap.js';
|
|
47
47
|
import { executeChain } from './middleware/chain.js';
|
|
@@ -50,6 +50,8 @@ import { CircuitBreaker } from './circuit-breaker.js';
|
|
|
50
50
|
import { currentSessionId } from './session.js';
|
|
51
51
|
import { InvocationStatus, Tier } from '../policy/types.js';
|
|
52
52
|
import { log } from '../cli/utils.js';
|
|
53
|
+
import { createLogger } from './log.js';
|
|
54
|
+
import { CIRCUIT_GAUGE } from './observability/metrics.js';
|
|
53
55
|
/**
|
|
54
56
|
* Build the ordered middleware chain used on every CallToolRequest.
|
|
55
57
|
* Order is prescriptive — DO NOT reorder without reading THREAT_MODEL.md §
|
|
@@ -82,30 +84,68 @@ function compileUserRedactPatterns(policy, matchTimeoutMs) {
|
|
|
82
84
|
}
|
|
83
85
|
return out;
|
|
84
86
|
}
|
|
85
|
-
function buildMiddlewareChain(opts) {
|
|
86
|
-
const { baseDir, policy } = opts;
|
|
87
|
+
function buildMiddlewareChain(opts, deps) {
|
|
88
|
+
const { baseDir, policy, metrics } = opts;
|
|
87
89
|
const matchTimeoutMs = policy.redact?.match_timeout_ms ?? 100;
|
|
88
90
|
const userPatterns = compileUserRedactPatterns(policy, matchTimeoutMs);
|
|
89
91
|
return [
|
|
90
|
-
|
|
91
|
-
|
|
92
|
+
// Metrics threaded through so `rea_audit_lines_appended_total` advances
|
|
93
|
+
// on every durable audit append and `rea_seconds_since_last_halt_check`
|
|
94
|
+
// reflects per-invocation cadence, not gateway uptime.
|
|
95
|
+
createAuditMiddleware(baseDir, policy, metrics),
|
|
96
|
+
createKillSwitchMiddleware(baseDir, metrics),
|
|
92
97
|
createTierMiddleware(),
|
|
93
98
|
createPolicyMiddleware(policy, undefined, baseDir),
|
|
94
99
|
createBlockedPathsMiddleware(policy, baseDir),
|
|
95
100
|
createRateLimitMiddleware(new RateLimiter()),
|
|
96
|
-
createCircuitBreakerMiddleware(
|
|
97
|
-
createInjectionMiddleware(policy.injection_detection === 'warn' ? 'warn' : 'block', {
|
|
98
|
-
|
|
99
|
-
|
|
101
|
+
createCircuitBreakerMiddleware(deps.breaker),
|
|
102
|
+
createInjectionMiddleware(policy.injection_detection === 'warn' ? 'warn' : 'block', (() => {
|
|
103
|
+
// G9 follow-up: preserve the tri-state for `suspiciousBlocksWrites`
|
|
104
|
+
// (true / false / undefined-omitted). With `exactOptionalPropertyTypes`
|
|
105
|
+
// we must omit the key entirely rather than passing `undefined` so
|
|
106
|
+
// the middleware's `?? true` / `?? false` default logic runs for
|
|
107
|
+
// consumers who did not configure the flag. `bst-internal*` profiles
|
|
108
|
+
// pin the flag explicitly.
|
|
109
|
+
const pinned = policy.injection?.suspicious_blocks_writes;
|
|
110
|
+
return pinned === undefined
|
|
111
|
+
? { matchTimeoutMs }
|
|
112
|
+
: { matchTimeoutMs, suspiciousBlocksWrites: pinned };
|
|
113
|
+
})()),
|
|
100
114
|
createRedactMiddleware({ matchTimeoutMs, userPatterns }),
|
|
101
115
|
createResultSizeCapMiddleware(),
|
|
102
116
|
];
|
|
103
117
|
}
|
|
104
118
|
export function createGateway(opts) {
|
|
105
119
|
const { registry } = opts;
|
|
106
|
-
const
|
|
120
|
+
const logger = opts.logger ?? createLogger({ base: { session_id: currentSessionId() } });
|
|
121
|
+
const metrics = opts.metrics;
|
|
122
|
+
const pool = new DownstreamPool(registry, logger);
|
|
107
123
|
const server = new Server({ name: 'rea', version: '0.2.0' }, { capabilities: { tools: {} } });
|
|
108
|
-
|
|
124
|
+
// Construct the circuit breaker with observability wired in: every state
// transition emits one structured log record AND updates the Prometheus
// circuit-state gauge for the affected server.
const breaker = new CircuitBreaker({
    onStateChange: ({ server, from, to, reason, retryAt }) => {
        // Assemble the log record first; `retry_at` is attached only when
        // the event actually carries a retry time.
        const record = {
            event: `circuit.${to.replace('-', '_')}`,
            server_name: server,
            message: `circuit-breaker: "${server}" ${from} → ${to} (${reason})`,
        };
        if (retryAt !== undefined) {
            record.retry_at = retryAt;
        }
        // A transition to `open` is logged at warn level; every other
        // transition is logged at info level.
        if (to === 'open') {
            logger.warn(record);
        }
        else {
            logger.info(record);
        }
        // Mirror the new state into the metrics gauge (skipped when no
        // metrics sink was supplied). Any state other than the three
        // handled here leaves the gauge untouched.
        if (to === 'closed') {
            metrics?.setCircuitState(server, CIRCUIT_GAUGE.closed);
        }
        else if (to === 'half-open') {
            metrics?.setCircuitState(server, CIRCUIT_GAUGE.halfOpen);
        }
        else if (to === 'open') {
            metrics?.setCircuitState(server, CIRCUIT_GAUGE.open);
        }
    },
});
|
|
148
|
+
const staticChain = buildMiddlewareChain(opts, { breaker });
|
|
109
149
|
// ── Handlers ─────────────────────────────────────────────────────────────
|
|
110
150
|
server.setRequestHandler(ListToolsRequestSchema, async () => {
|
|
111
151
|
if (pool.size === 0)
|
|
@@ -159,13 +199,19 @@ export function createGateway(opts) {
|
|
|
159
199
|
context.error = 'No downstream servers in .rea/registry.yaml — add one to enable proxying';
|
|
160
200
|
return;
|
|
161
201
|
}
|
|
202
|
+
metrics?.incDownstreamCall(serverName);
|
|
203
|
+
metrics?.incDownstreamInFlight(serverName);
|
|
162
204
|
try {
|
|
163
205
|
context.result = await pool.callTool(prefixed, context.arguments);
|
|
164
206
|
}
|
|
165
207
|
catch (err) {
|
|
208
|
+
metrics?.incDownstreamError(serverName);
|
|
166
209
|
context.status = InvocationStatus.Error;
|
|
167
210
|
context.error = err instanceof Error ? err.message : String(err);
|
|
168
211
|
}
|
|
212
|
+
finally {
|
|
213
|
+
metrics?.decDownstreamInFlight(serverName);
|
|
214
|
+
}
|
|
169
215
|
};
|
|
170
216
|
try {
|
|
171
217
|
await executeChain([...staticChain, terminal], ctx);
|
|
@@ -224,14 +270,51 @@ export function createGateway(opts) {
|
|
|
224
270
|
// Bring the downstream children up before anything else so the `listTools`
// catalog is ready by the time the upstream client connects.
if (pool.size !== 0) {
    // Announce each enabled downstream before the connection attempt.
    for (const { enabled, name } of registry.servers) {
        if (enabled) {
            logger.info({
                event: 'downstream.connect_attempt',
                server_name: name,
                message: `connecting downstream "${name}"`,
            });
        }
    }
    try {
        await pool.connectAll();
        // Report the outcome for each enabled downstream.
        for (const { enabled, name } of registry.servers) {
            if (!enabled) {
                continue;
            }
            const conn = pool.getConnection(name);
            const cameUp = conn !== undefined && conn.isHealthy;
            if (!cameUp) {
                logger.warn({
                    event: 'downstream.unhealthy_on_start',
                    server_name: name,
                    message: `downstream "${name}" did not come up healthy`,
                });
                continue;
            }
            logger.info({
                event: 'downstream.connected',
                server_name: name,
                message: `downstream "${name}" connected`,
            });
            // Every healthy downstream starts in the closed state — record
            // the initial circuit-breaker gauge so scrapers see a baseline.
            metrics?.setCircuitState(name, CIRCUIT_GAUGE.closed);
        }
    }
    catch (err) {
        const detail = err instanceof Error ? err.message : err;
        logger.error({
            event: 'downstream.connect_failed',
            message: `downstream connect error: ${detail}`,
        });
        // Continue — individual connections may still be healthy.
    }
}
else {
    // Nothing is registered: log it and carry on without connecting.
    logger.info({
        event: 'gateway.no_downstreams',
        message: 'no downstream servers in .rea/registry.yaml — running in no-op mode. Add servers to enable proxying.',
    });
}
|
|
@@ -242,6 +325,7 @@ export function createGateway(opts) {
|
|
|
242
325
|
if (stopping)
|
|
243
326
|
return;
|
|
244
327
|
stopping = true;
|
|
328
|
+
logger.info({ event: 'gateway.shutdown', message: 'gateway stop requested' });
|
|
245
329
|
try {
|
|
246
330
|
await server.close();
|
|
247
331
|
}
|
|
@@ -250,7 +334,7 @@ export function createGateway(opts) {
|
|
|
250
334
|
}
|
|
251
335
|
await pool.close();
|
|
252
336
|
}
|
|
253
|
-
return { server, start, stop, pool };
|
|
337
|
+
return { server, start, stop, pool, logger, metrics };
|
|
254
338
|
}
|
|
255
339
|
// Prevent TS from complaining about the unused `Tier` import when the file is
|
|
256
340
|
// compiled in isolation; keeping the import pins the semantic dependency edge
|