@bookedsolid/rea 0.5.0 → 0.6.1

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -141,11 +141,22 @@ export async function appendAuditRecord(baseDir, input) {
141
141
  .then(async () => {
142
142
  record = await doAppend(resolvedBase, input);
143
143
  });
144
- writeQueues.set(key, next.finally(() => {
144
+ writeQueues.set(key, next
145
+ .finally(() => {
145
146
  // Keep the queue lean — once this write resolves, drop the reference
146
147
  // if nothing newer is chained behind it.
147
148
  if (writeQueues.get(key) === next)
148
149
  writeQueues.delete(key);
150
+ })
151
+ // Swallow rejections on the stored promise so Node doesn't flag it as
152
+ // an unhandled rejection. The current caller already owns this error
153
+ // via the `await next` below; the NEXT caller that chains off this
154
+ // entry also .catch()-es it at the top of its chain. Without this
155
+ // terminal .catch(), a failed audit append surfaces as a spurious
156
+ // `unhandledRejection` event — which matters in tests that run the
157
+ // whole process and in long-lived servers that would log it.
158
+ .catch(() => {
159
+ /* handled by caller */
149
160
  }));
150
161
  await next;
151
162
  return record;
@@ -19,6 +19,33 @@ export interface CheckResult {
19
19
  * Exported so tests can drive this without spinning up the full `runDoctor`.
20
20
  */
21
21
  export declare function checkFingerprintStore(baseDir: string): Promise<CheckResult>;
22
+ /**
23
+ * Detect whether `baseDir` is a git repository. The answer depends on which
24
+ * of three shapes `baseDir/.git` takes:
25
+ *
26
+ * 1. `.git/` is a directory (vanilla repo).
27
+ * 2. `.git` is a file with `gitdir: <path>` (linked worktree, submodule).
28
+ * The target gitdir is resolved and must exist on disk — a stale or
29
+ * orphaned gitlink (submodule whose parent moved, worktree whose main
30
+ * repo was deleted) is NOT a git repo and must return false, otherwise
31
+ * doctor short-circuits the non-git escape hatch and hard-fails on the
32
+ * pre-push check against a `.git/hooks/` that doesn't exist (F1 from
33
+ * Codex review of 0.5.1).
34
+ * 3. Anything else (including a plain file a user accidentally named
35
+ * `.git`, or a symlink to nowhere) → false.
36
+ *
37
+ * Filesystem-shape predicate only. Deliberately does not consult `GIT_DIR`
38
+ * or shell out to `git rev-parse` — `rea doctor` already checks things
39
+ * inside `baseDir/.git/hooks/`, so the shape-on-disk is the right question
40
+ * for the escape hatch. A GIT_DIR-aware secondary signal is a follow-up.
41
+ *
42
+ * Security note (F3): removing `.git/` does NOT bypass governance. The
43
+ * governance artifact is the pre-push hook; a directory with no `.git/`
44
+ * has no commits to push and no pre-push event to bypass. The escape
45
+ * hatch is a UX predicate for knowledge repos and non-source directories,
46
+ * NOT a trust boundary. Do not key security decisions on the return value.
47
+ */
48
+ export declare function isGitRepo(baseDir: string): boolean;
22
49
  /**
23
50
  * Translate a `CodexProbeState` into two doctor CheckResults: one for
24
51
  * responsiveness (pass/warn) and one informational line about the last
@@ -240,6 +240,75 @@ function checkSettingsJson(baseDir) {
240
240
  };
241
241
  }
242
242
  }
243
+ /**
244
+ * Detect whether `baseDir` is a git repository. The answer depends on which
245
+ * of three shapes `baseDir/.git` takes:
246
+ *
247
+ * 1. `.git/` is a directory (vanilla repo).
248
+ * 2. `.git` is a file with `gitdir: <path>` (linked worktree, submodule).
249
+ * The target gitdir is resolved and must exist on disk — a stale or
250
+ * orphaned gitlink (submodule whose parent moved, worktree whose main
251
+ * repo was deleted) is NOT a git repo and must return false, otherwise
252
+ * doctor short-circuits the non-git escape hatch and hard-fails on the
253
+ * pre-push check against a `.git/hooks/` that doesn't exist (F1 from
254
+ * Codex review of 0.5.1).
255
+ * 3. Anything else (including a plain file a user accidentally named
256
+ * `.git`, or a symlink to nowhere) → false.
257
+ *
258
+ * Filesystem-shape predicate only. Deliberately does not consult `GIT_DIR`
259
+ * or shell out to `git rev-parse` — `rea doctor` already checks things
260
+ * inside `baseDir/.git/hooks/`, so the shape-on-disk is the right question
261
+ * for the escape hatch. A GIT_DIR-aware secondary signal is a follow-up.
262
+ *
263
+ * Security note (F3): removing `.git/` does NOT bypass governance. The
264
+ * governance artifact is the pre-push hook; a directory with no `.git/`
265
+ * has no commits to push and no pre-push event to bypass. The escape
266
+ * hatch is a UX predicate for knowledge repos and non-source directories,
267
+ * NOT a trust boundary. Do not key security decisions on the return value.
268
+ */
269
+ export function isGitRepo(baseDir) {
270
+ const dotGit = path.join(baseDir, '.git');
271
+ let stat;
272
+ try {
273
+ // statSync follows symlinks, so a `.git` symlink to a real gitdir is
274
+ // treated like the real thing; a dangling symlink throws ENOENT and
275
+ // falls into the catch → false.
276
+ stat = fs.statSync(dotGit);
277
+ }
278
+ catch {
279
+ return false;
280
+ }
281
+ if (stat.isDirectory())
282
+ return true;
283
+ if (!stat.isFile())
284
+ return false;
285
+ // Gitlink file: `gitdir: <absolute-or-relative-path>`. Read and verify
286
+ // the target resolves. If the target is missing, git itself would fail
287
+ // in this directory, so we treat it as non-git.
288
+ let content;
289
+ try {
290
+ content = fs.readFileSync(dotGit, 'utf8');
291
+ }
292
+ catch {
293
+ return false;
294
+ }
295
+ // `\s*$` on the old shape was inert (greedy `.+` consumed trailing spaces
296
+ // and `\s ⊂ .`) — the `.trim()` below did all the work. Tighten to
297
+ // `(\S.*?)` with an explicit trailing-space class so the captured group
298
+ // starts at the first non-whitespace char and stops before trailing
299
+ // whitespace. Still handles CRLF, leading tabs, and path-internal spaces.
300
+ const match = /^gitdir:\s*(\S.*?)[ \t]*\r?$/m.exec(content);
301
+ const rawTarget = match?.[1];
302
+ if (rawTarget === undefined)
303
+ return false;
304
+ const targetPath = rawTarget;
305
+ if (targetPath.length === 0)
306
+ return false;
307
+ const resolved = path.isAbsolute(targetPath)
308
+ ? targetPath
309
+ : path.join(baseDir, targetPath);
310
+ return fs.existsSync(resolved);
311
+ }
243
312
  function checkCommitMsgHook(baseDir) {
244
313
  const hookPath = path.join(baseDir, '.git', 'hooks', 'commit-msg');
245
314
  if (!fs.existsSync(hookPath)) {
@@ -443,10 +512,23 @@ export function collectChecks(baseDir, codexProbeState, prePushState) {
443
512
  checkAgentsPresent(baseDir),
444
513
  checkHooksInstalled(baseDir),
445
514
  checkSettingsJson(baseDir),
446
- checkCommitMsgHook(baseDir),
447
515
  ];
448
- if (prePushState !== undefined) {
449
- checks.push(checkPrePushHook(prePushState));
516
+ // Non-git escape hatch: when `.git/` is absent, both git-hook checks are
517
+ // meaningless (commit-msg + pre-push can't be invoked without git). Emit
518
+ // one informational line so `rea doctor` exits 0 in knowledge repos and
519
+ // other non-source-code directories that consume rea governance.
520
+ if (isGitRepo(baseDir)) {
521
+ checks.push(checkCommitMsgHook(baseDir));
522
+ if (prePushState !== undefined) {
523
+ checks.push(checkPrePushHook(prePushState));
524
+ }
525
+ }
526
+ else {
527
+ checks.push({
528
+ label: 'git hooks',
529
+ status: 'info',
530
+ detail: 'no `.git/` at baseDir — commit-msg / pre-push checks skipped (not a git repo)',
531
+ });
450
532
  }
451
533
  if (codexRequiredFromPolicy(baseDir)) {
452
534
  checks.push(checkCodexAgent(baseDir), checkCodexCommand(baseDir));
@@ -14,8 +14,36 @@ export interface PrefixedTool extends DownstreamToolInfo {
14
14
  /** Full prefixed name, as exposed to the upstream client. */
15
15
  name: string;
16
16
  }
17
+ /**
18
+ * Per-downstream state surfaced by the `__rea__health` meta-tool. Kept
19
+ * separate from the richer internal state so we only expose what a caller
20
+ * can actually reason about.
21
+ */
22
+ export interface DownstreamHealth {
23
+ name: string;
24
+ /** Registered in the registry (always true for entries present in the pool). */
25
+ enabled: boolean;
26
+ /** Underlying MCP client currently connected. */
27
+ connected: boolean;
28
+ /** Gateway considers this downstream healthy enough to route calls to. */
29
+ healthy: boolean;
30
+ /** Last error observed, or null if the connection is clean or never errored. */
31
+ last_error: string | null;
32
+ /**
33
+ * Number of tools advertised by the downstream on the most recent
34
+ * successful `tools/list`, or null when never listed / listing failed.
35
+ */
36
+ tools_count: number | null;
37
+ }
17
38
  export declare class DownstreamPool {
18
39
  private readonly connections;
40
+ /**
41
+ * Cached tool counts from the most recent successful `listAllTools` cycle,
42
+ * keyed by server name. Surfaced via `healthSnapshot()` so the meta-tool
43
+ * can report per-server counts even when the current listing pass fails
44
+ * or is skipped. Stale but truthful > absent.
45
+ */
46
+ private readonly lastToolsCount;
19
47
  constructor(registry: Registry, logger?: Logger);
20
48
  get size(): number;
21
49
  connectAll(): Promise<void>;
@@ -25,6 +53,12 @@ export declare class DownstreamPool {
25
53
  * will see a smaller catalog rather than a crash.
26
54
  */
27
55
  listAllTools(): Promise<PrefixedTool[]>;
56
+ /**
57
+ * Snapshot per-server connection state for the `__rea__health` meta-tool.
58
+ * Pure / non-blocking — no MCP I/O — so it can be called while HALT is
59
+ * active or while other tool calls are in-flight.
60
+ */
61
+ healthSnapshot(): DownstreamHealth[];
28
62
  /**
29
63
  * Split a prefixed tool name and dispatch. Returns the raw result from the
30
64
  * downstream (the gateway response handler shapes it for the upstream reply).
@@ -8,6 +8,13 @@
8
8
  import { DownstreamConnection } from './downstream.js';
9
9
  export class DownstreamPool {
10
10
  connections = new Map();
11
+ /**
12
+ * Cached tool counts from the most recent successful `listAllTools` cycle,
13
+ * keyed by server name. Surfaced via `healthSnapshot()` so the meta-tool
14
+ * can report per-server counts even when the current listing pass fails
15
+ * or is skipped. Stale but truthful > absent.
16
+ */
17
+ lastToolsCount = new Map();
11
18
  constructor(registry, logger) {
12
19
  for (const server of registry.servers) {
13
20
  if (!server.enabled)
@@ -45,6 +52,7 @@ export class DownstreamPool {
45
52
  continue;
46
53
  try {
47
54
  const tools = await conn.listTools();
55
+ this.lastToolsCount.set(server, tools.length);
48
56
  for (const t of tools) {
49
57
  const prefixed = {
50
58
  ...t,
@@ -60,6 +68,35 @@ export class DownstreamPool {
60
68
  }
61
69
  return out;
62
70
  }
71
+ /**
72
+ * Snapshot per-server connection state for the `__rea__health` meta-tool.
73
+ * Pure / non-blocking — no MCP I/O — so it can be called while HALT is
74
+ * active or while other tool calls are in-flight.
75
+ */
76
+ healthSnapshot() {
77
+ const out = [];
78
+ for (const [name, conn] of this.connections) {
79
+ const cached = this.lastToolsCount.get(name);
80
+ const connected = conn.isConnected;
81
+ const healthy = conn.isHealthy;
82
+ // Only surface the cached tool count when the connection is BOTH
83
+ // connected AND healthy right now. Codex F1 caught that a dead
84
+ // downstream was showing its last-successful count alongside
85
+ // `healthy: false`, which is a worse-than-null diagnostic — operators
86
+ // would read "5 tools reachable" from a server that is reachable
87
+ // through exactly zero tools.
88
+ const tools_count = connected && healthy && typeof cached === 'number' ? cached : null;
89
+ out.push({
90
+ name,
91
+ enabled: true,
92
+ connected,
93
+ healthy,
94
+ last_error: conn.lastError,
95
+ tools_count,
96
+ });
97
+ }
98
+ return out;
99
+ }
63
100
  /**
64
101
  * Split a prefixed tool name and dispatch. Returns the raw result from the
65
102
  * downstream (the gateway response handler shapes it for the upstream reply).
@@ -93,6 +93,13 @@ export declare class DownstreamConnection {
93
93
  /** Epoch ms of the last successful reconnect. Used by the flapping guard. */
94
94
  private lastReconnectAt;
95
95
  private health;
96
+ /**
97
+ * The most recent error observed on this connection (connect or call
98
+ * failure). Surfaced via `__rea__health` so callers can diagnose an empty
99
+ * tool catalog without digging through stderr logs. Set to `null` after a
100
+ * successful connect/reconnect or tool call.
101
+ */
102
+ private lastErrorMessage;
96
103
  constructor(config: RegistryServer,
97
104
  /**
98
105
  * Optional structured logger (G5). When omitted, connection lifecycle
@@ -102,6 +109,10 @@ export declare class DownstreamConnection {
102
109
  logger?: Logger | undefined);
103
110
  get name(): string;
104
111
  get isHealthy(): boolean;
112
+ /** True iff the underlying MCP client is currently connected. */
113
+ get isConnected(): boolean;
114
+ /** Last error observed, or null if the connection has never failed (or fully recovered). */
115
+ get lastError(): string | null;
105
116
  connect(): Promise<void>;
106
117
  listTools(): Promise<DownstreamToolInfo[]>;
107
118
  /**
@@ -107,6 +107,13 @@ export class DownstreamConnection {
107
107
  /** Epoch ms of the last successful reconnect. Used by the flapping guard. */
108
108
  lastReconnectAt = 0;
109
109
  health = 'healthy';
110
+ /**
111
+ * The most recent error observed on this connection (connect or call
112
+ * failure). Surfaced via `__rea__health` so callers can diagnose an empty
113
+ * tool catalog without digging through stderr logs. Set to `null` after a
114
+ * successful connect/reconnect or tool call.
115
+ */
116
+ lastErrorMessage = null;
110
117
  constructor(config,
111
118
  /**
112
119
  * Optional structured logger (G5). When omitted, connection lifecycle
@@ -123,6 +130,14 @@ export class DownstreamConnection {
123
130
  get isHealthy() {
124
131
  return this.health !== 'unhealthy';
125
132
  }
133
+ /** True iff the underlying MCP client is currently connected. */
134
+ get isConnected() {
135
+ return this.client !== null;
136
+ }
137
+ /** Last error observed, or null if the connection has never failed (or fully recovered). */
138
+ get lastError() {
139
+ return this.lastErrorMessage;
140
+ }
126
141
  async connect() {
127
142
  if (this.client !== null)
128
143
  return;
@@ -143,10 +158,13 @@ export class DownstreamConnection {
143
158
  }
144
159
  catch (err) {
145
160
  this.health = 'unhealthy';
146
- throw new Error(`failed to resolve env for downstream "${this.config.name}": ${err instanceof Error ? err.message : err}`);
161
+ const msg = `failed to resolve env for downstream "${this.config.name}": ${err instanceof Error ? err.message : err}`;
162
+ this.lastErrorMessage = msg;
163
+ throw new Error(msg);
147
164
  }
148
165
  if (built.missing.length > 0) {
149
166
  this.health = 'unhealthy';
167
+ this.lastErrorMessage = `missing env: ${built.missing.join(', ')}`;
150
168
  // One line per missing var so grep/jq users can find the exact gap.
151
169
  // We intentionally do NOT log the env key name's VALUE (there is none —
152
170
  // it's unresolved) nor any other env values.
@@ -166,10 +184,13 @@ export class DownstreamConnection {
166
184
  await client.connect(transport);
167
185
  this.client = client;
168
186
  this.health = 'healthy';
187
+ this.lastErrorMessage = null;
169
188
  }
170
189
  catch (err) {
171
190
  this.health = 'unhealthy';
172
- throw new Error(`failed to connect to downstream "${this.config.name}" (${this.config.command}): ${err instanceof Error ? err.message : err}`);
191
+ const msg = `failed to connect to downstream "${this.config.name}" (${this.config.command}): ${err instanceof Error ? err.message : err}`;
192
+ this.lastErrorMessage = msg;
193
+ throw new Error(msg);
173
194
  }
174
195
  }
175
196
  async listTools() {
@@ -190,7 +211,13 @@ export class DownstreamConnection {
190
211
  await this.connect();
191
212
  }
192
213
  try {
193
- return await this.client.callTool({ name: toolName, arguments: args });
214
+ const result = await this.client.callTool({ name: toolName, arguments: args });
215
+ // Clear any lingering error from a previous transient failure. Without
216
+ // this, a connection that failed once and then recovered on the very
217
+ // next call (same client, no reconnect) would forever report the old
218
+ // error via `__rea__health`, misleading operators about live state.
219
+ this.lastErrorMessage = null;
220
+ return result;
194
221
  }
195
222
  catch (err) {
196
223
  const message = err instanceof Error ? err.message : String(err);
@@ -212,6 +239,7 @@ export class DownstreamConnection {
212
239
  // stamp the reconnect time so flap-guard can refuse rapid repeats.
213
240
  this.reconnectAttempted = false;
214
241
  this.lastReconnectAt = Date.now();
242
+ this.lastErrorMessage = null;
215
243
  this.logger?.info({
216
244
  event: 'downstream.reconnected',
217
245
  server_name: this.config.name,
@@ -221,16 +249,19 @@ export class DownstreamConnection {
221
249
  }
222
250
  catch (reconnectErr) {
223
251
  this.health = 'unhealthy';
252
+ const errMsg = reconnectErr instanceof Error ? reconnectErr.message : String(reconnectErr);
253
+ this.lastErrorMessage = errMsg;
224
254
  this.logger?.error({
225
255
  event: 'downstream.reconnect_failed',
226
256
  server_name: this.config.name,
227
257
  message: `downstream "${this.config.name}" unhealthy after one reconnect`,
228
- error: reconnectErr instanceof Error ? reconnectErr.message : String(reconnectErr),
258
+ error: errMsg,
229
259
  });
230
- throw new Error(`downstream "${this.config.name}" unhealthy after one reconnect: ${reconnectErr instanceof Error ? reconnectErr.message : reconnectErr}`);
260
+ throw new Error(`downstream "${this.config.name}" unhealthy after one reconnect: ${errMsg}`);
231
261
  }
232
262
  }
233
263
  this.health = 'unhealthy';
264
+ this.lastErrorMessage = message;
234
265
  this.logger?.error({
235
266
  event: 'downstream.call_failed',
236
267
  server_name: this.config.name,
@@ -0,0 +1,117 @@
1
+ /**
2
+ * Gateway-internal `__rea__health` meta-tool.
3
+ *
4
+ * WHY THIS EXISTS
5
+ * ===============
6
+ *
7
+ * The MCP `listTools` catalog the gateway advertises is the UNION of every
8
+ * healthy downstream's own catalog. When all downstreams are unhealthy — or
9
+ * the registry is empty, or fingerprints fail, or an env var is missing — the
10
+ * catalog is empty. From the LLM's perspective this is indistinguishable from
11
+ * a gateway that came up fine but happens to have nothing to proxy, and there
12
+ * is no tool it can call to ask "why is this empty?" because, well, the
13
+ * catalog is empty.
14
+ *
15
+ * This meta-tool closes that diagnostic gap: the gateway ALWAYS exposes
16
+ * `__rea__health` regardless of downstream state, the kill-switch, or the
17
+ * middleware chain. A caller can invoke it to get a snapshot of every
18
+ * registered server's connection state, last error, and tool count.
19
+ *
20
+ * DESIGN CHOICES
21
+ * --------------
22
+ *
23
+ * 1. Name shape: `__rea__health`. The leading `__` (instead of a normal
24
+ * `<server>__<tool>` prefix) reserves the namespace for gateway-internal
25
+ * tools. It never collides with a registered server because
26
+ * `src/registry/loader.ts` restricts `name` to `^[a-z0-9][a-z0-9-]*$` —
27
+ * no underscores allowed.
28
+ *
29
+ * 2. Short-circuit in `server.ts`: the CallTool handler matches on the
30
+ * constant below BEFORE calling `splitPrefixed`, and responds directly
31
+ * without running the middleware chain. Reasons, ordered:
32
+ * (a) This tool must be callable while HALT is present — otherwise the
33
+ * operator can't introspect a frozen gateway.
34
+ * (b) Tier middleware would classify `health` as Write (default for
35
+ * unlisted names) and deny L0 callers — wrong for read-only
36
+ * introspection.
37
+ * (c) There is no downstream to dispatch to — the entire middleware
38
+ * chain is about getting to one safely.
39
+ * The short-circuit still writes an audit record via `appendAuditRecord`
40
+ * so invocations remain accountable.
41
+ *
42
+ * 3. Never throws. Health is the one tool the caller uses when things are
43
+ * broken. Every field is best-effort; a missing value is surfaced as
44
+ * `null`, not as an exception.
45
+ */
46
+ import type { Policy } from '../../policy/types.js';
47
+ import type { DownstreamHealth } from '../downstream-pool.js';
48
+ /** Canonical MCP tool name exposed by the gateway. */
49
+ export declare const META_HEALTH_TOOL_NAME = "__rea__health";
50
+ /** `server_name` recorded in audit entries for this meta-tool. */
51
+ export declare const META_SERVER_NAME = "__rea__";
52
+ /** `tool_name` recorded in audit entries for this meta-tool. */
53
+ export declare const META_TOOL_NAME = "health";
54
+ export interface MetaHealthSnapshot {
55
+ /** Gateway process facts. `version` is the rea gateway version (from package.json, pinned to the shipped version). */
56
+ gateway: {
57
+ version: string;
58
+ /** Seconds since gateway process started. */
59
+ uptime_s: number;
60
+ /** Whether `.rea/HALT` is present. When true, the health tool is the only callable tool right now. */
61
+ halt: boolean;
62
+ /** HALT reason, or null when HALT is absent or its contents could not be read. */
63
+ halt_reason: string | null;
64
+ };
65
+ policy: {
66
+ profile: string;
67
+ autonomy_level: string;
68
+ max_autonomy_level: string;
69
+ block_ai_attribution: boolean;
70
+ blocked_paths_count: number;
71
+ };
72
+ /** Per-downstream state. Empty array iff the registry is empty. */
73
+ downstreams: DownstreamHealth[];
74
+ /** Rolled-up counts the LLM can act on without walking the array. */
75
+ summary: {
76
+ registered: number;
77
+ connected: number;
78
+ healthy: number;
79
+ total_tools: number;
80
+ };
81
+ }
82
+ export interface BuildHealthSnapshotDeps {
83
+ /** Gateway version (so we can test deterministically without reading package.json). */
84
+ gatewayVersion: string;
85
+ /** Gateway boot time in epoch ms. `uptime_s` is computed from this. */
86
+ startedAtMs: number;
87
+ /** Frozen policy snapshot — we do not re-read `.rea/policy.yaml` here. */
88
+ policy: Policy;
89
+ /** Per-downstream state from the pool. */
90
+ downstreams: DownstreamHealth[];
91
+ /** Whether `.rea/HALT` is present at snapshot time. */
92
+ halt: boolean;
93
+ /**
94
+ * HALT reason, if any. `null` when HALT is absent OR when the file exists
95
+ * but the caller couldn't read its contents — we never surface an I/O
96
+ * exception through this tool.
97
+ */
98
+ haltReason: string | null;
99
+ /** Current epoch ms. Injected for determinism in tests. */
100
+ nowMs?: number;
101
+ }
102
+ /**
103
+ * Pure function that builds the snapshot from injected state. All I/O happens
104
+ * in the caller (`server.ts`) — keeps this testable and keeps "health never
105
+ * throws" a local invariant rather than a chain-wide claim.
106
+ */
107
+ export declare function buildHealthSnapshot(deps: BuildHealthSnapshotDeps): MetaHealthSnapshot;
108
+ /**
109
+ * The descriptor the gateway advertises via `tools/list`. No arguments —
110
+ * callers request a snapshot by calling with `{}`. Keeping the surface
111
+ * argument-free makes the tool trivially safe for any autonomy level.
112
+ */
113
+ export declare function metaHealthToolDescriptor(): {
114
+ name: string;
115
+ description: string;
116
+ inputSchema: Record<string, unknown>;
117
+ };
@@ -0,0 +1,108 @@
1
+ /**
2
+ * Gateway-internal `__rea__health` meta-tool.
3
+ *
4
+ * WHY THIS EXISTS
5
+ * ===============
6
+ *
7
+ * The MCP `listTools` catalog the gateway advertises is the UNION of every
8
+ * healthy downstream's own catalog. When all downstreams are unhealthy — or
9
+ * the registry is empty, or fingerprints fail, or an env var is missing — the
10
+ * catalog is empty. From the LLM's perspective this is indistinguishable from
11
+ * a gateway that came up fine but happens to have nothing to proxy, and there
12
+ * is no tool it can call to ask "why is this empty?" because, well, the
13
+ * catalog is empty.
14
+ *
15
+ * This meta-tool closes that diagnostic gap: the gateway ALWAYS exposes
16
+ * `__rea__health` regardless of downstream state, the kill-switch, or the
17
+ * middleware chain. A caller can invoke it to get a snapshot of every
18
+ * registered server's connection state, last error, and tool count.
19
+ *
20
+ * DESIGN CHOICES
21
+ * --------------
22
+ *
23
+ * 1. Name shape: `__rea__health`. The leading `__` (instead of a normal
24
+ * `<server>__<tool>` prefix) reserves the namespace for gateway-internal
25
+ * tools. It never collides with a registered server because
26
+ * `src/registry/loader.ts` restricts `name` to `^[a-z0-9][a-z0-9-]*$` —
27
+ * no underscores allowed.
28
+ *
29
+ * 2. Short-circuit in `server.ts`: the CallTool handler matches on the
30
+ * constant below BEFORE calling `splitPrefixed`, and responds directly
31
+ * without running the middleware chain. Reasons, ordered:
32
+ * (a) This tool must be callable while HALT is present — otherwise the
33
+ * operator can't introspect a frozen gateway.
34
+ * (b) Tier middleware would classify `health` as Write (default for
35
+ * unlisted names) and deny L0 callers — wrong for read-only
36
+ * introspection.
37
+ * (c) There is no downstream to dispatch to — the entire middleware
38
+ * chain is about getting to one safely.
39
+ * The short-circuit still writes an audit record via `appendAuditRecord`
40
+ * so invocations remain accountable.
41
+ *
42
+ * 3. Never throws. Health is the one tool the caller uses when things are
43
+ * broken. Every field is best-effort; a missing value is surfaced as
44
+ * `null`, not as an exception.
45
+ */
46
+ /** Canonical MCP tool name exposed by the gateway. */
47
+ export const META_HEALTH_TOOL_NAME = '__rea__health';
48
+ /** `server_name` recorded in audit entries for this meta-tool. */
49
+ export const META_SERVER_NAME = '__rea__';
50
+ /** `tool_name` recorded in audit entries for this meta-tool. */
51
+ export const META_TOOL_NAME = 'health';
52
+ /**
53
+ * Pure function that builds the snapshot from injected state. All I/O happens
54
+ * in the caller (`server.ts`) — keeps this testable and keeps "health never
55
+ * throws" a local invariant rather than a chain-wide claim.
56
+ */
57
+ export function buildHealthSnapshot(deps) {
58
+ const now = deps.nowMs ?? Date.now();
59
+ const uptime_s = Math.max(0, Math.floor((now - deps.startedAtMs) / 1000));
60
+ let connected = 0;
61
+ let healthy = 0;
62
+ let total_tools = 0;
63
+ for (const d of deps.downstreams) {
64
+ if (d.connected)
65
+ connected += 1;
66
+ if (d.healthy)
67
+ healthy += 1;
68
+ if (typeof d.tools_count === 'number')
69
+ total_tools += d.tools_count;
70
+ }
71
+ return {
72
+ gateway: {
73
+ version: deps.gatewayVersion,
74
+ uptime_s,
75
+ halt: deps.halt,
76
+ halt_reason: deps.haltReason,
77
+ },
78
+ policy: {
79
+ profile: deps.policy.profile,
80
+ autonomy_level: String(deps.policy.autonomy_level),
81
+ max_autonomy_level: String(deps.policy.max_autonomy_level),
82
+ block_ai_attribution: deps.policy.block_ai_attribution,
83
+ blocked_paths_count: deps.policy.blocked_paths.length,
84
+ },
85
+ downstreams: deps.downstreams,
86
+ summary: {
87
+ registered: deps.downstreams.length,
88
+ connected,
89
+ healthy,
90
+ total_tools,
91
+ },
92
+ };
93
+ }
94
+ /**
95
+ * The descriptor the gateway advertises via `tools/list`. No arguments —
96
+ * callers request a snapshot by calling with `{}`. Keeping the surface
97
+ * argument-free makes the tool trivially safe for any autonomy level.
98
+ */
99
+ export function metaHealthToolDescriptor() {
100
+ return {
101
+ name: META_HEALTH_TOOL_NAME,
102
+ description: 'rea gateway self-diagnostic. Returns the gateway version, HALT state, policy summary, ' +
103
+ 'and per-downstream connection/health/tool-count. Always available, even when every ' +
104
+ 'downstream is unhealthy or HALT is active — this is the tool you call when listTools ' +
105
+ 'comes back empty or suspicious.',
106
+ inputSchema: { type: 'object', properties: {}, additionalProperties: false },
107
+ };
108
+ }
@@ -32,7 +32,12 @@
32
32
  import { Server } from '@modelcontextprotocol/sdk/server/index.js';
33
33
  import { StdioServerTransport } from '@modelcontextprotocol/sdk/server/stdio.js';
34
34
  import { CallToolRequestSchema, ListToolsRequestSchema } from '@modelcontextprotocol/sdk/types.js';
35
+ import fs from 'node:fs/promises';
36
+ import path from 'node:path';
35
37
  import { DownstreamPool, splitPrefixed } from './downstream-pool.js';
38
+ import { META_HEALTH_TOOL_NAME, META_SERVER_NAME, META_TOOL_NAME, buildHealthSnapshot, metaHealthToolDescriptor, } from './meta/health.js';
39
+ import { appendAuditRecord } from '../audit/append.js';
40
+ import { getPkgVersion } from '../cli/utils.js';
36
41
  import { createAuditMiddleware } from './middleware/audit.js';
37
42
  import { createKillSwitchMiddleware } from './middleware/kill-switch.js';
38
43
  import { createTierMiddleware } from './middleware/tier.js';
@@ -116,11 +121,13 @@ function buildMiddlewareChain(opts, deps) {
116
121
  ];
117
122
  }
118
123
  export function createGateway(opts) {
119
- const { registry } = opts;
124
+ const { registry, policy, baseDir } = opts;
120
125
  const logger = opts.logger ?? createLogger({ base: { session_id: currentSessionId() } });
121
126
  const metrics = opts.metrics;
122
127
  const pool = new DownstreamPool(registry, logger);
123
- const server = new Server({ name: 'rea', version: '0.2.0' }, { capabilities: { tools: {} } });
128
+ const gatewayVersion = getPkgVersion();
129
+ const startedAtMs = Date.now();
130
+ const server = new Server({ name: 'rea', version: gatewayVersion }, { capabilities: { tools: {} } });
124
131
  // Build the circuit breaker with observability hooks wired in — state
125
132
  // transitions log a structured record AND update the Prometheus gauge.
126
133
  const breaker = new CircuitBreaker({
@@ -146,22 +153,116 @@ export function createGateway(opts) {
146
153
  },
147
154
  });
148
155
  const staticChain = buildMiddlewareChain(opts, { breaker });
156
+ // Read `.rea/HALT` without ever throwing. Returns `{halt, reason}`: `halt` is
157
+ // true only if the file could be read, and `reason` is its trimmed contents
158
+ // (null when empty, absent, or unreadable). I/O errors are never surfaced —
159
+ // health is the one thing that has to keep working when everything else is broken.
160
+ async function readHalt() {
161
+ try {
162
+ const contents = await fs.readFile(path.join(baseDir, '.rea', 'HALT'), 'utf8');
163
+ const trimmed = contents.trim();
164
+ return { halt: true, reason: trimmed.length > 0 ? trimmed : null };
165
+ }
166
+ catch {
167
+ return { halt: false, reason: null };
168
+ }
169
+ }
149
170
  // ── Handlers ─────────────────────────────────────────────────────────────
150
171
  server.setRequestHandler(ListToolsRequestSchema, async () => {
172
+ // The `__rea__health` meta-tool is ALWAYS advertised, regardless of
173
+ // downstream state. This is the systemic answer to the "listTools came
174
+ // back empty, now what?" diagnostic gap — the LLM can always call
175
+ // health to find out why.
176
+ const metaTool = metaHealthToolDescriptor();
151
177
  if (pool.size === 0)
152
- return { tools: [] };
178
+ return { tools: [metaTool] };
153
179
  const prefixed = await pool.listAllTools();
154
180
  return {
155
- tools: prefixed.map((t) => ({
156
- name: t.name,
157
- description: t.description ?? `${t.server} ${t.name.slice(t.server.length + 2)}`,
158
- inputSchema: t.inputSchema ?? { type: 'object' },
159
- })),
181
+ tools: [
182
+ metaTool,
183
+ ...prefixed.map((t) => ({
184
+ name: t.name,
185
+ description: t.description ?? `${t.server} → ${t.name.slice(t.server.length + 2)}`,
186
+ inputSchema: t.inputSchema ?? { type: 'object' },
187
+ })),
188
+ ],
160
189
  };
161
190
  });
162
191
  server.setRequestHandler(CallToolRequestSchema, async (req) => {
163
192
  const prefixed = req.params.name;
164
193
  const args = (req.params.arguments ?? {});
194
+ // Short-circuit the `__rea__health` meta-tool BEFORE the middleware chain
195
+ // and BEFORE splitPrefixed. Reasons:
196
+ // - Must be callable while HALT is active (so the operator can
197
+ // introspect a frozen gateway). The kill-switch middleware would
198
+ // otherwise deny.
199
+ // - `deriveBaseTier('health')` defaults to Write, which would deny L0
200
+ // callers. Health is pure introspection — tier doesn't apply.
201
+ // - There's no downstream to dispatch to. The middleware chain exists
202
+ // to reach one safely.
203
+ // Health calls still get an audit record so invocations remain accountable; the reserved-namespace rejection just below does not write one.
204
+ // The `__rea__` prefix is reserved for gateway-internal meta-tools.
205
+ // Reject any unknown name in that namespace with a clear error rather
206
+ // than letting `splitPrefixed` produce the confusing `unknown downstream
207
+ // server ""` message for e.g. `__rea__health ` (trailing space) or a
208
+ // future meta-tool name the client was guessing at.
209
+ if (prefixed.startsWith('__rea__') && prefixed !== META_HEALTH_TOOL_NAME) {
210
+ return {
211
+ isError: true,
212
+ content: [
213
+ {
214
+ type: 'text',
215
+ text: `reserved meta-namespace: only "${META_HEALTH_TOOL_NAME}" is defined under __rea__`,
216
+ },
217
+ ],
218
+ };
219
+ }
220
+ if (prefixed === META_HEALTH_TOOL_NAME) {
221
+ const startMs = Date.now();
222
+ const haltState = await readHalt();
223
+ const snapshot = buildHealthSnapshot({
224
+ gatewayVersion,
225
+ startedAtMs,
226
+ policy,
227
+ downstreams: pool.healthSnapshot(),
228
+ halt: haltState.halt,
229
+ haltReason: haltState.reason,
230
+ });
231
+ // Best-effort audit append. Failures here must never prevent the
232
+ // caller from getting the health response — that would defeat the
233
+ // whole point of a "works when everything else is broken" tool.
234
+ try {
235
+ await appendAuditRecord(baseDir, {
236
+ tool_name: META_TOOL_NAME,
237
+ server_name: META_SERVER_NAME,
238
+ status: InvocationStatus.Allowed,
239
+ tier: Tier.Read,
240
+ autonomy_level: String(policy.autonomy_level),
241
+ session_id: currentSessionId(),
242
+ duration_ms: Date.now() - startMs,
243
+ metadata: {
244
+ halt: snapshot.gateway.halt,
245
+ downstreams_registered: snapshot.summary.registered,
246
+ downstreams_healthy: snapshot.summary.healthy,
247
+ },
248
+ });
249
+ }
250
+ catch (err) {
251
+ logger.warn({
252
+ event: 'meta.health.audit_failed',
253
+ message: 'failed to append audit record for __rea__health; serving response anyway',
254
+ error: err instanceof Error ? err.message : String(err),
255
+ });
256
+ }
257
+ return {
258
+ content: [
259
+ {
260
+ type: 'text',
261
+ text: JSON.stringify(snapshot, null, 2),
262
+ },
263
+ ],
264
+ };
265
+ }
165
266
  // Split prefix for downstream dispatch; the terminal middleware uses the
166
267
  // full prefixed name to call the pool (which re-splits internally).
167
268
  let serverName;
@@ -336,7 +437,3 @@ export function createGateway(opts) {
336
437
  }
337
438
  return { server, start, stop, pool, logger, metrics };
338
439
  }
339
- // Prevent TS from complaining about the unused `Tier` import when the file is
340
- // compiled in isolation; keeping the import pins the semantic dependency edge
341
- // for future middleware that may want to inspect the tier in terminal.
342
- void Tier;
@@ -15,6 +15,39 @@ set -uo pipefail
15
15
  # ── 1. Read ALL stdin immediately ─────────────────────────────────────────────
16
16
  INPUT=$(cat)
17
17
 
18
+ # ── 1a. Cross-repo guard (must come FIRST — before any rea-scoped check) ──────
19
+ # Mirror of push-review-gate.sh. When CLAUDE_PROJECT_DIR points to rea but
20
+ # the current git checkout is a DIFFERENT repository (distinct object DB),
21
+ # exit 0 — rea's gate does not own that commit.
22
+ #
23
+ # Identity via `--git-common-dir` so linked worktrees of rea
24
+ # (`git worktree add`, `.claude/worktrees/*`) are correctly recognized as
25
+ # the SAME repo and kept under the gate — they share object DB, refs, and
26
+ # HEAD history with rea's main checkout. Path-prefix fallback fires
27
+ # when either side is not a git checkout. Must run BEFORE the jq and HALT
28
+ # checks: a missing-jq or HALT-frozen rea must not block commits in other
29
+ # repos that merely share a Claude Code session with rea. Fixed in 0.6.1.
30
+ REA_ROOT="${CLAUDE_PROJECT_DIR:-$(pwd)}"
31
+ if [[ -n "${CLAUDE_PROJECT_DIR:-}" ]]; then
32
+ CWD_REAL=$(pwd -P 2>/dev/null || pwd)
33
+ if REA_REAL=$(cd "$REA_ROOT" 2>/dev/null && pwd -P 2>/dev/null); then
34
+ CWD_COMMON=$(git -C "$CWD_REAL" rev-parse --path-format=absolute --git-common-dir 2>/dev/null || true)
35
+ REA_COMMON=$(git -C "$REA_REAL" rev-parse --path-format=absolute --git-common-dir 2>/dev/null || true)
36
+ if [[ -n "$CWD_COMMON" && -n "$REA_COMMON" ]]; then
37
+ CWD_COMMON_REAL=$(cd "$CWD_COMMON" 2>/dev/null && pwd -P 2>/dev/null || echo "$CWD_COMMON")
38
+ REA_COMMON_REAL=$(cd "$REA_COMMON" 2>/dev/null && pwd -P 2>/dev/null || echo "$REA_COMMON")
39
+ if [[ "$CWD_COMMON_REAL" != "$REA_COMMON_REAL" ]]; then
40
+ exit 0
41
+ fi
42
+ else
43
+ case "$CWD_REAL/" in
44
+ "$REA_REAL"/*|"$REA_REAL"/) : ;; # inside rea — run the gate
45
+ *) exit 0 ;; # outside rea — not our gate
46
+ esac
47
+ fi
48
+ fi
49
+ fi
50
+
18
51
  # ── 2. Dependency check ──────────────────────────────────────────────────────
19
52
  if ! command -v jq >/dev/null 2>&1; then
20
53
  printf 'REA ERROR: jq is required but not installed.\n' >&2
@@ -23,7 +56,6 @@ if ! command -v jq >/dev/null 2>&1; then
23
56
  fi
24
57
 
25
58
  # ── 3. HALT check ────────────────────────────────────────────────────────────
26
- REA_ROOT="${CLAUDE_PROJECT_DIR:-$(pwd)}"
27
59
  HALT_FILE="${REA_ROOT}/.rea/HALT"
28
60
  if [ -f "$HALT_FILE" ]; then
29
61
  printf 'REA HALT: %s\nAll agent operations suspended. Run: rea unfreeze\n' \
@@ -37,6 +37,63 @@ set -uo pipefail
37
37
  # ── 1. Read ALL stdin immediately ─────────────────────────────────────────────
38
38
  INPUT=$(cat)
39
39
 
40
+ # ── 1a. Cross-repo guard (must come FIRST — before any rea-scoped check) ──────
41
+ # When CLAUDE_PROJECT_DIR points to the rea repo (the Claude Code session's
42
+ # project directory) but the current working directory is a DIFFERENT
43
+ # repository, this hook is firing for someone else's push. rea's gate only
44
+ # owns pushes from within rea itself — exit 0 so the foreign repo's
45
+ # `git push` proceeds unblocked.
46
+ #
47
+ # MUST run before the jq check and HALT check. Those are rea-scoped concerns:
48
+ # a missing-jq or HALT-frozen state in rea must not block pushes in OTHER
49
+ # repos that merely share a Claude Code session with rea. Fixing that
50
+ # governance-scope leak is half the point of this guard.
51
+ #
52
+ # Also: without this guard, ref-resolution inside `resolve_argv_refspecs`
53
+ # runs `git rev-parse` inside REA_ROOT for refs that only exist in the
54
+ # foreign repo, which hard-fails with "could not resolve source ref". That
55
+ # failure lands BEFORE REA_SKIP_PUSH_REVIEW / REA_SKIP_CODEX_REVIEW can be
56
+ # checked, so consumers are left with no documented way out. Discovered
57
+ # during the 0.6.0 cross-repo consumer upgrade; fixed in 0.6.1.
58
+ #
59
+ # Repo-identity comparison via shared `--git-common-dir`, NOT path-prefix or
60
+ # `--show-toplevel`. Why common-dir: a linked worktree created by
61
+ # `git worktree add` has a different toplevel (different checkout path) but
62
+ # the SAME repository — shared object DB, shared refs, shared HEAD history.
63
+ # Any `.claude/worktrees/*` checkout of rea IS rea and must run the gate.
64
+ # `--show-toplevel` would falsely flag those worktrees as "foreign" and
65
+ # bypass HALT plus every other gate (Codex R3 finding, 0.6.1).
66
+ #
67
+ # `--path-format=absolute` (Git ≥ 2.31, March 2021) normalizes the common
68
+ # dir so the same repo's common-dir is equal regardless of which worktree
69
+ # asked. Engines pin Node ≥20; a dev environment that recent is assumed to also have Git ≥ 2.31 (Node itself does not bundle Git).
70
+ #
71
+ # Falls back to path-prefix when either cwd or REA_ROOT is not a git
72
+ # checkout (the rea 0.5.1 non-git escape-hatch scenario).
73
+ REA_ROOT="${CLAUDE_PROJECT_DIR:-$(pwd)}"
74
+ if [[ -n "${CLAUDE_PROJECT_DIR:-}" ]]; then
75
+ CWD_REAL=$(pwd -P 2>/dev/null || pwd)
76
+ if REA_REAL=$(cd "$REA_ROOT" 2>/dev/null && pwd -P 2>/dev/null); then
77
+ CWD_COMMON=$(git -C "$CWD_REAL" rev-parse --path-format=absolute --git-common-dir 2>/dev/null || true)
78
+ REA_COMMON=$(git -C "$REA_REAL" rev-parse --path-format=absolute --git-common-dir 2>/dev/null || true)
79
+ if [[ -n "$CWD_COMMON" && -n "$REA_COMMON" ]]; then
80
+ # Both sides are git checkouts. Realpath'd common-dirs match IFF they
81
+ # point at the same underlying repository (main or linked worktree).
82
+ CWD_COMMON_REAL=$(cd "$CWD_COMMON" 2>/dev/null && pwd -P 2>/dev/null || echo "$CWD_COMMON")
83
+ REA_COMMON_REAL=$(cd "$REA_COMMON" 2>/dev/null && pwd -P 2>/dev/null || echo "$REA_COMMON")
84
+ if [[ "$CWD_COMMON_REAL" != "$REA_COMMON_REAL" ]]; then
85
+ exit 0
86
+ fi
87
+ else
88
+ # Non-git-repo path: literal quoted expansions — no glob expansion.
89
+ case "$CWD_REAL/" in
90
+ "$REA_REAL"/*|"$REA_REAL"/) : ;; # inside rea — run the gate
91
+ *) exit 0 ;; # outside rea — not our gate
92
+ esac
93
+ fi
94
+ fi
95
+ fi
96
+
40
97
  # ── 2. Dependency check ──────────────────────────────────────────────────────
41
98
  if ! command -v jq >/dev/null 2>&1; then
42
99
  printf 'REA ERROR: jq is required but not installed.\n' >&2
@@ -45,7 +102,6 @@ if ! command -v jq >/dev/null 2>&1; then
45
102
  fi
46
103
 
47
104
  # ── 3. HALT check ────────────────────────────────────────────────────────────
48
- REA_ROOT="${CLAUDE_PROJECT_DIR:-$(pwd)}"
49
105
  HALT_FILE="${REA_ROOT}/.rea/HALT"
50
106
  if [ -f "$HALT_FILE" ]; then
51
107
  printf 'REA HALT: %s\nAll agent operations suspended. Run: rea unfreeze\n' \
package/package.json CHANGED
@@ -1,6 +1,6 @@
1
1
  {
2
2
  "name": "@bookedsolid/rea",
3
- "version": "0.5.0",
3
+ "version": "0.6.1",
4
4
  "description": "Agentic governance layer for Claude Code — policy enforcement, hook-based safety gates, audit logging, and Codex-integrated adversarial review for AI-assisted projects",
5
5
  "license": "MIT",
6
6
  "author": "Booked Solid Technology <oss@bookedsolid.tech> (https://bookedsolid.tech)",