@roadmapperai/mcp 0.1.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (4) hide show
  1. package/AGENTS.md +885 -0
  2. package/README.md +111 -0
  3. package/package.json +35 -0
  4. package/server.mjs +4019 -0
package/server.mjs ADDED
@@ -0,0 +1,4019 @@
1
+ #!/usr/bin/env node
2
+ /**
3
+ * Roadmapper MCP server — zero-dependency stdio JSON-RPC.
4
+ *
5
+ * Exposes a planning surface so an agent can read the roadmap and
6
+ * (when authorized) propose tasks or stamp acceptance grades:
7
+ *
8
+ * list_themes read
9
+ * list_capabilities read (optionally filtered by themeId)
10
+ * list_tasks read (optionally filtered by capabilityId / status)
11
+ * get_task read (full task detail, including acceptance + deps)
12
+ * get_agents_md read (the planning contract)
13
+ * propose_task write (requires SUPABASE_SERVICE_ROLE_KEY)
14
+ * submit_acceptance_grades write (requires SUPABASE_SERVICE_ROLE_KEY)
15
+ *
16
+ * Data sources, in order:
17
+ * 1. Local seed at src/data/roadmap.json (always read).
18
+ * 2. Workspace edits via Supabase REST, when SUPABASE_URL,
19
+ * SUPABASE_WORKSPACE_ID, and either SUPABASE_PUBLISHABLE_KEY or
20
+ * the legacy SUPABASE_ANON_KEY are set. Edits override / extend
21
+ * the seed exactly like the app does.
22
+ * 3. Writes require SUPABASE_SERVICE_ROLE_KEY (bypasses RLS). Without
23
+ * it the write tools return an error result and the read tools
24
+ * still work.
25
+ *
26
+ * Wire-up (Claude Code / Claude Desktop / any MCP client):
27
+ * {
28
+ * "mcpServers": {
29
+ * "roadmapper": {
30
+ * "command": "node",
31
+ * "args": ["/absolute/path/to/roadmap/mcp/server.mjs"],
32
+ * "env": {
33
+ * "SUPABASE_URL": "...",
34
+ * "SUPABASE_PUBLISHABLE_KEY": "sb_publishable_...",
35
+ * "SUPABASE_WORKSPACE_ID": "...",
36
+ * "SUPABASE_SERVICE_ROLE_KEY": "..."
37
+ * }
38
+ * }
39
+ * }
40
+ * }
41
+ *
42
+ * Self-test: `node mcp/server.mjs --selftest` exercises every tool
43
+ * against the local seed and exits 0 on success, 1 on failure. Useful
44
+ * for verifying the install without an MCP client.
45
+ *
46
+ * Speaks the MCP stdio protocol: newline-delimited JSON-RPC 2.0 on
47
+ * stdin/stdout. Logs go to stderr only.
48
+ */
49
+
50
+ import { readFileSync, existsSync } from "node:fs";
51
+ import { dirname, join, resolve } from "node:path";
52
+ import { fileURLToPath } from "node:url";
53
+
54
// Directory holding this module, and its parent directory. Every
// other path below is derived from these two so lookups do not
// depend on the caller's working directory.
const HERE = dirname(fileURLToPath(import.meta.url));
const REPO = resolve(HERE, "..");

// Seed JSON: the dev environment ships the file at
// src/data/roadmap.json (used by the SPA). The npm-packaged build has
// no SPA source tree, so readSeed() falls back to an empty roadmap
// and the real data loads from Supabase on top.
const SEED_PATH = join(REPO, "src", "data", "roadmap.json");

// AGENTS.md (planning rubric): the copy bundled next to this module
// is preferred so npm installs get the rubric without the repo; the
// repo-root copy is the fallback for local dev.
const BUNDLED_AGENTS_PATH = join(HERE, "AGENTS.md");
const REPO_AGENTS_PATH = join(REPO, "AGENTS.md");

// Identity reported by this server.
const PROTOCOL_VERSION = "2024-11-05";
const SERVER_NAME = "roadmapper";
const SERVER_VERSION = "0.6.0";

// Must match src/types.ts EFFORT_DAYS — AI-era calibration.
// Fractional values (XS=0.25, S=0.5) get rounded up when used to
// project a target date, since date strings are day-resolution.
const EFFORT_DAYS = { XS: 0.25, S: 0.5, M: 1, L: 3, XL: 8 };

// Allowed enum values for validated fields. The effort set is derived
// from EFFORT_DAYS so the two cannot drift apart.
const VALID_PRIORITIES = new Set(["P0", "P1", "P2", "P3"]);
const VALID_EFFORTS = new Set(Object.keys(EFFORT_DAYS));
const VALID_KINDS = new Set(["feature", "bug", "chore", "spike"]);
const VALID_STATUSES = new Set(["delivered", "in_progress", "planned", "exploring"]);
80
+
81
/**
 * Emit a diagnostic line on stderr, prefixed with the server tag.
 * stdout is never used here — it carries the JSON-RPC stream.
 */
function log(...args) {
  const parts = ["[roadmapper-mcp]", ...args];
  console.error(...parts);
}
84
+
85
/**
 * Write one JSON-RPC message to stdout as a single newline-terminated
 * line.
 */
function send(message) {
  const frame = `${JSON.stringify(message)}\n`;
  process.stdout.write(frame);
}
88
+
89
/**
 * Load and parse the seed roadmap at SEED_PATH.
 *
 * Any failure (missing file, unreadable file, invalid JSON) is logged
 * to stderr and answered with an empty roadmap skeleton, so callers
 * always receive an object with the expected top-level keys.
 */
function readSeed() {
  let parsed;
  try {
    const raw = readFileSync(SEED_PATH, "utf-8");
    parsed = JSON.parse(raw);
  } catch (e) {
    log("failed to read seed", e.message);
    return { product: { themes: [] }, capabilities: [], tasks: [], sprints: [] };
  }
  return parsed;
}
97
+
98
/**
 * Return the AGENTS.md planning rubric as text.
 *
 * Candidates are tried in order: the copy bundled beside this module
 * (npm install case), then the repo-root copy (local dev). If neither
 * can be read, a fixed placeholder string is returned instead of
 * throwing.
 */
function readAgentsMd() {
  const candidates = [BUNDLED_AGENTS_PATH, REPO_AGENTS_PATH];
  for (const candidate of candidates) {
    try {
      return readFileSync(candidate, "utf-8");
    } catch {
      // Unreadable — fall through to the next candidate.
    }
  }
  return "AGENTS.md not found — rubric unavailable.";
}
113
+
114
/**
 * Pick the Supabase read key from the environment.
 *
 * SUPABASE_PUBLISHABLE_KEY (the `sb_publishable_…` form) wins when it
 * is set to a non-empty value; otherwise the legacy SUPABASE_ANON_KEY
 * is used. Returns null when neither is set.
 */
function readKey() {
  const { SUPABASE_PUBLISHABLE_KEY: publishable, SUPABASE_ANON_KEY: legacyAnon } = process.env;
  if (publishable) return publishable;
  if (legacyAnon) return legacyAnon;
  return null;
}
125
+
126
/**
 * Snapshot the Supabase settings from the environment.
 *
 * @returns {{url: string|null, readKey: string|null, writeKey: string|null, workspaceId: string|null}}
 *   Each field is null when its variable is unset or empty. The
 *   environment is re-read on every call.
 */
function supabaseConfig() {
  const env = process.env;
  const orNull = (value) => value || null;
  return {
    url: orNull(env.SUPABASE_URL),
    readKey: readKey(),
    writeKey: orNull(env.SUPABASE_SERVICE_ROLE_KEY),
    workspaceId: orNull(env.SUPABASE_WORKSPACE_ID),
  };
}
134
+
135
/**
 * Workspace id named by `.roadmapper/snapshot.json` in the current
 * working directory, or null when the file is absent, unreadable,
 * malformed, or carries no non-empty string `workspaceId`.
 *
 * The file is consulted at most once: the outcome (including null) is
 * cached in `_snapshotWorkspace` and reused until that variable is
 * reset to `undefined`.
 *
 * Rationale carried over from the original note: the snapshot is
 * committed into connected repos, so an agent running from such a
 * checkout almost certainly means that repo's workspace — this guards
 * against writing to the wrong workspace.
 */
const SNAPSHOT_FILE = join(".roadmapper", "snapshot.json");
let _snapshotWorkspace = undefined; // undefined = unread; null = read & absent/bad
function snapshotWorkspaceId() {
  if (_snapshotWorkspace !== undefined) return _snapshotWorkspace;

  let resolved = null;
  try {
    const snapshotPath = join(process.cwd(), SNAPSHOT_FILE);
    if (existsSync(snapshotPath)) {
      const parsed = JSON.parse(readFileSync(snapshotPath, "utf8"));
      const candidate = parsed?.workspaceId;
      if (typeof candidate === "string" && candidate.length > 0) {
        resolved = candidate;
      }
    }
  } catch {
    // Unreadable / malformed snapshot is non-fatal: treat it as absent
    // and let the other workspace sources apply.
    resolved = null;
  }

  _snapshotWorkspace = resolved;
  return _snapshotWorkspace;
}
171
+
172
/**
 * Selftest-only hook that overwrites the snapshot cache directly.
 * Not surfaced via MCP.
 *
 * @param {string|null|undefined} value - `undefined` makes the next
 *   snapshotWorkspaceId() call re-read from disk; a string or null is
 *   served as the cached answer without touching the filesystem.
 */
function __setSnapshotWorkspaceForTest(value) {
  _snapshotWorkspace = value;
}
178
+
179
/**
 * Decide which workspace a tool call targets. First truthy source
 * wins:
 *   1. the explicit `workspaceId` argument of the call,
 *   2. the id from `.roadmapper/snapshot.json` in the cwd,
 *   3. the SUPABASE_WORKSPACE_ID environment value,
 *   4. otherwise null.
 *
 * The snapshot outranks the environment because it describes where
 * the agent is running right now, whereas the environment describes
 * how the install was configured.
 *
 * @param {string} [argWorkspaceId] - per-call override, if any.
 * @returns {string|null}
 */
function resolveWorkspaceId(argWorkspaceId) {
  return (
    argWorkspaceId ||
    snapshotWorkspaceId() ||
    (supabaseConfig().workspaceId ?? null)
  );
}
202
+
203
/**
 * Fetch the workspace's pillars, capabilities and tasks from the
 * PostgREST tables and project them to the camelCase shape.
 *
 * The service-role key is used when present, otherwise the read key.
 * (Per the original note: this keeps RLS from narrowing agent reads.)
 *
 * @param {string} [wsIdOverride] - per-call workspace id; resolved
 *   through resolveWorkspaceId().
 * @returns {Promise<{themes: object[], capabilities: object[], tasks: object[]}|null>}
 *   null when the URL, a key or a workspace id is missing, or when any
 *   of the three reads fails (the failure is logged to stderr).
 */
async function readWorkspaceProjected(wsIdOverride) {
  const { url, readKey: anonKey, writeKey } = supabaseConfig();
  const workspaceId = resolveWorkspaceId(wsIdOverride);
  const key = writeKey || anonKey;
  if (!(url && key && workspaceId)) return null;

  const workspaceFilter = `workspace_id=eq.${encodeURIComponent(workspaceId)}`;
  const headers = {
    apikey: key,
    Authorization: `Bearer ${key}`,
    Accept: "application/json",
  };

  // `path` already carries a query string ("table?select=*"), so the
  // workspace filter is appended with "&".
  async function fetchTable(path) {
    const res = await fetch(`${url}/rest/v1/${path}&${workspaceFilter}`, { headers });
    if (res.ok) return res.json();
    const status = res.status;
    const detail = await res.text().catch(() => "");
    throw new Error(`${path}: ${status} ${detail}`);
  }

  try {
    const paths = ["pillars?select=*", "capabilities?select=*", "tasks?select=*"];
    // All three reads run concurrently; the first rejection fails the lot.
    const [pillarRows, capabilityRows, taskRows] = await Promise.all(
      paths.map((path) => fetchTable(path))
    );
    return {
      themes: pillarRows.map(rowToThemeProjected),
      capabilities: capabilityRows.map(rowToCapabilityProjected),
      tasks: taskRows.map(rowToTaskProjected),
    };
  } catch (e) {
    log("supabase entity read failed:", e.message);
    return null;
  }
}
247
+
248
/**
 * Project a `pillars` table row (snake_case columns) to the camelCase
 * theme object. Fields whose column is null or undefined are dropped
 * by stripUndefined().
 */
function rowToThemeProjected(r) {
  // [camelCase key, source column], in output order.
  const fields = [
    ["id", "id"],
    ["name", "name"],
    ["description", "description"],
    ["color", "color"],
    ["targetRoi", "target_roi"],
    ["ownerUserId", "owner_user_id"],
    ["idempotencyKey", "idempotency_key"],
    ["archived", "archived"],
    ["archivedAt", "archived_at"],
    ["createdAt", "created_at"],
    ["updatedAt", "updated_at"],
  ];
  const projected = Object.fromEntries(fields.map(([key, column]) => [key, r[column]]));
  return stripUndefined(projected);
}
267
/**
 * Project a `capabilities` table row (snake_case columns) to the
 * camelCase capability object. Fields whose column is null or
 * undefined are dropped by stripUndefined().
 */
function rowToCapabilityProjected(r) {
  // [camelCase key, source column], in output order.
  const fields = [
    ["id", "id"],
    ["pillarId", "pillar_id"],
    ["name", "name"],
    ["description", "description"],
    ["outcome", "outcome"],
    ["reach", "reach"],
    ["impact", "impact"],
    ["confidence", "confidence"],
    ["roi", "roi"],
    ["color", "color"],
    ["status", "status"],
    ["start", "start_date"],
    ["target", "target_date"],
    ["delivered", "delivered_date"],
    ["originalTarget", "original_target"],
    ["laneRow", "lane_row"],
    ["ownerUserId", "owner_user_id"],
    ["specRef", "spec_ref"],
    ["outcomeStatus", "outcome_status"],
    ["outcomeCheckedAt", "outcome_checked_at"],
    ["outcomeReadings", "outcome_readings"],
    ["dependsOn", "depends_on"],
    ["idempotencyKey", "idempotency_key"],
    ["archived", "archived"],
    ["archivedAt", "archived_at"],
    ["createdAt", "created_at"],
    ["updatedAt", "updated_at"],
  ];
  const projected = Object.fromEntries(fields.map(([key, column]) => [key, r[column]]));
  return stripUndefined(projected);
}
298
/**
 * Project a `tasks` table row (snake_case columns) to the camelCase
 * task object. Fields whose column is null or undefined are dropped
 * by stripUndefined().
 */
function rowToTaskProjected(r) {
  // [camelCase key, source column], in output order.
  const fields = [
    ["id", "id"],
    ["capabilityId", "capability_id"],
    ["pillarId", "pillar_id"],
    ["title", "title"],
    ["summary", "summary"],
    ["status", "status"],
    ["priority", "priority"],
    ["effort", "effort"],
    ["start", "start_date"],
    ["target", "target_date"],
    ["originalTarget", "original_target"],
    ["delivered", "delivered_date"],
    ["deliveredAt", "delivered_at"],
    ["progress", "progress"],
    ["owner", "owner"],
    ["ownerGithub", "owner_github"],
    ["ownerAvatarUrl", "owner_avatar_url"],
    ["ownerUserId", "owner_user_id"],
    ["laneRow", "lane_row"],
    ["matrixDx", "matrix_dx"],
    ["matrixDy", "matrix_dy"],
    ["team", "team"],
    ["kind", "kind"],
    ["authorKind", "author_kind"],
    ["expectedPRs", "expected_prs"],
    ["expectedScope", "expected_scope"],
    ["tags", "tags"],
    ["prs", "prs"],
    ["links", "links"],
    ["acceptance", "acceptance"],
    ["acceptanceGrades", "acceptance_grades"],
    ["dependsOn", "depends_on"],
    ["idempotencyKey", "idempotency_key"],
    ["archived", "archived"],
    ["archivedAt", "archived_at"],
    ["createdAt", "created_at"],
    ["updatedAt", "updated_at"],
  ];
  const projected = Object.fromEntries(fields.map(([key, column]) => [key, r[column]]));
  return stripUndefined(projected);
}
339
/**
 * Remove every own enumerable key whose value is undefined OR null.
 * Mutates `o` in place and returns that same object.
 */
function stripUndefined(o) {
  Object.keys(o).forEach((key) => {
    // `== null` is true for exactly null and undefined.
    if (o[key] == null) delete o[key];
  });
  return o;
}
343
+
344
/**
 * POST to a Postgres function exposed at `/rest/v1/rpc/<fn>` using
 * the service-role key, and return the parsed JSON response.
 *
 * Per the original note, the write tools go through RPC so the
 * read-modify-write happens inside one database transaction rather
 * than across two round-trips from this process.
 *
 * @param {string} fn - function name appended to the rpc path.
 * @param {object} body - JSON arguments; must already include
 *   `p_workspace_id` (callers resolve and inject it beforehand).
 * @returns {Promise<any>} parsed JSON body of a successful response.
 * @throws {Error} when the URL, service-role key or `p_workspace_id`
 *   is missing, or when the response status is not OK (message carries
 *   the status and the first 300 characters of the body).
 */
async function rpcCall(fn, body) {
  const { url, writeKey } = supabaseConfig();
  const hasWorkspace = Boolean(body?.p_workspace_id);
  if (!(url && writeKey && hasWorkspace)) {
    throw new Error(
      "Write tools require SUPABASE_URL + SUPABASE_SERVICE_ROLE_KEY in env and a resolvable workspaceId (either SUPABASE_WORKSPACE_ID env or workspaceId arg)."
    );
  }

  const endpoint = `${url}/rest/v1/rpc/${fn}`;
  const headers = {
    apikey: writeKey,
    Authorization: `Bearer ${writeKey}`,
    "content-type": "application/json",
    Accept: "application/json",
  };
  const res = await fetch(endpoint, {
    method: "POST",
    headers,
    body: JSON.stringify(body),
  });

  if (res.ok) return res.json();

  const txt = await res.text();
  throw new Error(`rpc ${fn} failed: ${res.status} ${txt.slice(0, 300)}`);
}
379
+
380
/**
 * Overlay an edits record on the seed and return the flat
 * `{ themes, capabilities, tasks }` view the tools serve.
 *
 * For each entity kind the seed list is patched by id, new records
 * are appended, and deleted ids are dropped (see mergeList). Missing
 * seed sections or edit keys are treated as empty.
 *
 * Themes read their additions/deletions from `newPillars` and
 * `deletedPillarIds`: per the original note, the app still stores
 * themes under the legacy "pillar" keys, so this must read the same
 * keys for UI-created themes to appear.
 */
function project(seed, edits) {
  const e = edits ?? {};
  return {
    themes: mergeList(
      seed?.product?.themes ?? [],
      e.themes ?? {},
      e.newPillars ?? [],
      e.deletedPillarIds ?? []
    ),
    capabilities: mergeList(
      seed?.capabilities ?? [],
      e.capabilities ?? {},
      e.newCapabilities ?? [],
      e.deletedCapabilityIds ?? []
    ),
    tasks: mergeList(
      seed?.tasks ?? [],
      e.tasks ?? {},
      e.newTasks ?? [],
      e.deletedTaskIds ?? []
    ),
  };
}
414
+
415
/**
 * Status to report for a capability.
 *
 * An explicit (truthy) `cap.status` always wins. Otherwise the status
 * is derived from the tasks whose `capabilityId` equals `cap.id`:
 *   - no such tasks           → "exploring"
 *   - every task delivered    → "delivered"
 *   - any task in_progress    → "in_progress"
 *   - every task exploring    → "exploring"
 *   - anything else           → "planned"
 */
function effectiveCapabilityStatus(cap, tasks) {
  if (cap.status) return cap.status;

  const statuses = tasks
    .filter((task) => task.capabilityId === cap.id)
    .map((task) => task.status);
  if (statuses.length === 0) return "exploring";

  const allAre = (wanted) => statuses.every((status) => status === wanted);
  if (allAre("delivered")) return "delivered";
  if (statuses.includes("in_progress")) return "in_progress";
  return allAre("exploring") ? "exploring" : "planned";
}
431
+
432
/**
 * Merge one entity list: seed rows first, then additions, skipping
 * any row whose id is in `deletedIds`, and shallow-overlaying
 * `patches[row.id]` (when present) on every surviving row.
 *
 * Patches apply to additions as well as seed rows — per the original
 * note this matches the app's store merge and makes later patches
 * visible on records created earlier in the same workspace.
 *
 * @param {object[]} seedList
 * @param {Object<string, object>} patches - partial rows keyed by id.
 * @param {object[]} additions
 * @param {Iterable<string>} deletedIds
 * @returns {object[]} new array of new row objects; inputs are not mutated.
 */
function mergeList(seedList, patches, additions, deletedIds) {
  const removed = new Set(deletedIds);
  const withPatch = (row) => ({ ...row, ...(patches[row.id] ?? {}) });
  return [...seedList, ...additions]
    .filter((row) => !removed.has(row.id))
    .map(withPatch);
}
447
+
448
/** Current date in UTC as a `YYYY-MM-DD` string. */
function todayISO() {
  const isoTimestamp = new Date().toISOString();
  return isoTimestamp.slice(0, 10);
}
451
/**
 * Shift a `YYYY-MM-DD` date by `days` calendar days, computed in UTC,
 * and return the result in the same format. Negative values move
 * backwards.
 */
function addDays(iso, days) {
  const date = new Date(`${iso}T00:00:00Z`);
  const shiftedDayOfMonth = date.getUTCDate() + days;
  // setUTCDate rolls over month/year boundaries for out-of-range values.
  date.setUTCDate(shiftedDayOfMonth);
  return date.toISOString().slice(0, 10);
}
456
+
457
/**
 * Decode the HTML entities agents most often emit when they think
 * they're rendering into markup: the named entities amp, lt, gt and
 * quot, plus the numeric apostrophe forms #39 and #x27 (hex marker in
 * either case). Mirrors src/lib/text.ts.
 *
 * Agents sometimes propose names in HTML-escaped form (for example
 * "Sandbox & Test Mode" with the ampersand written as its amp
 * entity). Stored verbatim, the UI would show the escape sequence
 * literally, so propose / update paths decode before writing.
 *
 * Decoding is a SINGLE pass: each entity in the input is replaced
 * exactly once and the output is never re-scanned. A doubly-escaped
 * sequence therefore loses only one level of escaping instead of
 * collapsing all the way down.
 *
 * @param {*} input - value to decode.
 * @returns {*} the decoded string; falsy or non-string input is
 *   returned unchanged.
 */
function decodeHtmlEntities(input) {
  if (!input || typeof input !== "string") return input;
  // Keyed by entity name WITHOUT the leading "&" / trailing ";" so
  // this table survives being passed through HTML-decoding tooling.
  const replacements = {
    amp: "&",
    lt: "<",
    gt: ">",
    quot: '"',
    "#39": "'",
    "#x27": "'",
  };
  return input.replace(
    /&(amp|lt|gt|quot|#39|#[xX]27);/g,
    (_entity, name) => replacements[name.toLowerCase()]
  );
}
476
/**
 * Trim surrounding whitespace, then decode HTML entities in the
 * result. null / undefined are treated as the empty string.
 */
function cleanText(s) {
  const trimmed = (s ?? "").trim();
  return decodeHtmlEntities(trimmed);
}
480
/**
 * Generate a task id of the form TK-NNNNNN: a random integer in
 * [0, 999999], zero-padded to six digits. Per the original note this
 * matches the app's format.
 */
function randomTaskId() {
  const n = Math.floor(Math.random() * 1_000_000);
  const digits = n.toString().padStart(6, "0");
  return `TK-${digits}`;
}
485
/**
 * Generate a theme id of the form TH-NNNNNN, where the numeric part
 * is a random integer in [100000, 999999] (so it never has a leading
 * zero).
 */
function randomThemeId() {
  const offset = Math.floor(Math.random() * 900000);
  const sixDigits = 100000 + offset;
  return `TH-${String(sixDigits)}`;
}
489
/**
 * Generate a capability id of the form CAP-XXXXXX, where the suffix
 * is exactly six uppercase base-36 characters.
 *
 * The base-36 rendering of Math.random() can have fewer than six
 * fractional digits (0.5 renders as "0.i", 0 as "0"), which would
 * otherwise produce a short or even empty suffix. The suffix is
 * therefore right-padded with "0" to a fixed width.
 */
function randomCapabilityId() {
  const suffix = Math.random()
    .toString(36)
    .slice(2, 8)
    .padEnd(6, "0")
    .toUpperCase();
  return `CAP-${suffix}`;
}
493
+
494
// The only impact scores accepted, highest first.
const VALID_IMPACTS = new Set([3, 2, 1, 1 / 2, 1 / 4]);
495
+
496
+ // ── Validators ────────────────────────────────────────────────────
497
+ // Server-side guardrails for the planning rubric in AGENTS.md.
498
+ // Every propose_* tool runs these before touching Supabase. With
499
+ // dryRun=true the caller sees the validation result without writing.
500
+
501
// Temporal anchor: a 20xx year, a quarter (optionally with a year),
// "by <number>", or a month name followed by whitespace + a number.
// Month names must be followed by a number so we don't false-positive
// on phrases like "may have moved" or "September lifts" where the
// month token has nothing to do with a date.
//
// The day number is matched as `\d+` with an optional ordinal suffix.
// Matching a single `\d` before the closing `\b` would reject every
// multi-digit day ("Sept 15", "by 15 March"), because there is no
// word boundary between the first and second digit.
const TEMPORAL_RE =
  /\b(20\d\d|q[1-4](\s*20\d\d)?|by\s+\d+(st|nd|rd|th)?|(jan|feb|mar|apr|may|jun|jul|aug|sep|oct|nov|dec)\w*\s+\d+(st|nd|rd|th)?)\b/i;
const NUMBER_RE = /\d/;

/**
 * Check that a capability outcome is "falsifiable": it must carry
 * both a number (the thing being moved) and a temporal anchor (when
 * it will be checked). Empty outcomes are rejected too. The patterns
 * are intentionally loose so reasonable phrasings pass: "32% to 55%
 * by Q3 2026", "median 3/quarter by 2026-09", "5x by 2026",
 * "40% by Sept 15".
 *
 * @param {string|null|undefined} outcome
 * @returns {string|null} an error message naming what is missing, or
 *   null when the outcome is acceptable.
 */
function validateOutcome(outcome) {
  const o = (outcome ?? "").trim();
  if (!o) {
    return "outcome is required. Use the template: <metric> moves from <baseline> to <target> by <date>, measured by <source>.";
  }
  const hasNumber = NUMBER_RE.test(o);
  const hasTemporal = TEMPORAL_RE.test(o);
  if (hasNumber && hasTemporal) return null;

  const missing = [];
  if (!hasNumber) missing.push("number");
  if (!hasTemporal) missing.push("date/quarter");
  return `outcome must include both a number (the metric) and a date or quarter (when you'll check). Missing: ${missing.join(
    " + "
  )}. See get_agents_md for examples.`;
}
533
+
534
/**
 * Validate a proposed name.
 *
 * @param {string|null|undefined} name - trimmed before checking.
 * @param {number} [minLen=8] - minimum accepted length after trimming.
 * @returns {string|null} an error message when the name is empty or
 *   shorter than `minLen`, otherwise null.
 */
function validateName(name, minLen = 8) {
  const trimmed = (name ?? "").trim();
  const length = trimmed.length;
  if (length === 0) return "name is required.";
  return length < minLen
    ? `name is too short (${length} chars) — aim for ${minLen}+ that describe the bet, not the verb.`
    : null;
}
541
+
542
/**
 * Validate a confidence percentage.
 *
 * null / undefined mean "not provided" and are accepted. Anything
 * else must be a finite number in [0, 100] and no higher than 95.
 *
 * The value is coerced with Number() and checked for finiteness
 * first. Without that, NaN and non-numeric input slip through every
 * range comparison (all comparisons with NaN are false) and would be
 * reported as valid.
 *
 * @param {*} confidence
 * @returns {string|null} an error message, or null when acceptable.
 */
function validateConfidence(confidence) {
  if (confidence == null) return null;
  const value = Number(confidence);
  if (!Number.isFinite(value) || value < 0 || value > 100)
    return `confidence must be 0–100, got ${confidence}.`;
  if (value > 95)
    return `confidence ${confidence} is too high. >95 is reserved for work that's already shipped or behind a flag. Cap at 95 unless you can point to the deployed flag.`;
  return null;
}
550
+
551
/**
 * Produce a warning (not an error) when a capability's ROI is below
 * 70% of its parent theme's `targetRoi`.
 *
 * @param {number|null|undefined} roi
 * @param {{name: string, targetRoi?: number}|null|undefined} theme
 * @returns {string|null} the warning text, or null when either value
 *   is missing or the ROI is at/above the 70% floor.
 */
function warnRoiVsTheme(roi, theme) {
  const target = theme?.targetRoi;
  if (roi == null || target == null) return null;

  const floor = theme.targetRoi * 0.7;
  if (!(roi < floor)) return null;

  return `roi $${roi}M is well below 70% of theme "${theme.name}" target ($${theme.targetRoi}M). Justify the gap in your outcome, or rethink the parent theme.`;
}
564
+
565
+ // ── Lightweight Jaccard for suggest_capability_for ────────────────
566
+ //
567
+ // Kept in sync with src/lib/textMatch.ts and api/github-webhook.ts
568
+ // so the three places that score PR-to-capability overlap rank the
569
+ // same matches identically. The "Jaccard" name is a slight misnomer:
570
+ // we normalise by max(|a|,|b|) instead of |a ∪ b| (textbook Jaccard)
571
+ // so a tiny PR title doesn't trivially match a long capability
572
+ // description via two common tokens.
573
+ const STOPWORDS = new Set([
574
+ "the", "and", "for", "with", "this", "that", "from", "into", "onto", "upon",
575
+ "fix", "fixes", "fixing", "add", "adds", "adding", "update", "updates", "updating",
576
+ "remove", "removes", "removing", "refactor", "refactors", "refactoring",
577
+ "feat", "feature", "bug", "bugfix", "chore", "wip", "draft",
578
+ "use", "uses", "using", "make", "makes", "making", "support", "supports",
579
+ "via", "out", "off", "now", "but", "any", "all", "new", "old",
580
+ "pull", "request", "merge", "branch", "commit", "test", "tests", "testing",
581
+ ]);
582
+ function tokenize(s) {
583
+ const out = new Set();
584
+ const lower = (s ?? "").toLowerCase().replace(/[^a-z0-9 ]+/g, " ");
585
+ for (const w of lower.split(/\s+/)) {
586
+ if (w.length < 3) continue;
587
+ if (STOPWORDS.has(w)) continue;
588
+ out.add(w);
589
+ }
590
+ return out;
591
+ }
592
/**
 * Overlap score between two token Sets: the number of shared tokens
 * divided by the size of the LARGER set. Returns 0 when either set
 * is empty or nothing is shared.
 */
function jaccardScore(a, b) {
  if (a.size === 0 || b.size === 0) return 0;
  const shared = [...a].filter((token) => b.has(token)).length;
  return shared === 0 ? 0 : shared / Math.max(a.size, b.size);
}
599
+
600
+ // ── Session state + enforcement gates ─────────────────────────────
601
+ //
602
+ // One process serves one MCP client (stdio). State below is the
603
+ // client's session-scoped memory: when get_agents_md was last
604
+ // fetched, how many mutator calls have been attempted without the
605
+ // rubric, etc.
606
+ //
607
+ // The point of this is to stop relying on the agent's discretion to
608
+ // follow the rubric. Tool descriptions catch most cases; the gate
609
+ // here catches the rest with a structured error whose `fix` field
610
+ // names the exact next call.
611
// Mutable per-process session record.
const session = {
  // Creation time of this record (ms since epoch).
  startedAt: Date.now(),
  // Rubric-fetch marker; null until set elsewhere in the file.
  rubricFetchedAt: null,
  // "Discovery" gates — the agent must have looked at the current
  // catalogue before proposing new theme/cap records, so it does not
  // invent a near-duplicate of something that already exists.
  themesListedAt: null,
  capsDiscoveredAt: null,
  // Mutator call counters.
  mutatorAttempts: 0,
  mutatorBlocks: 0,
};

/**
 * Return `session` to its initial state in place: a fresh start
 * time, every marker null, and both counters zero.
 */
function resetSession() {
  Object.assign(session, {
    startedAt: Date.now(),
    rubricFetchedAt: null,
    themesListedAt: null,
    capsDiscoveredAt: null,
    mutatorAttempts: 0,
    mutatorBlocks: 0,
  });
}
632
+
633
/**
 * Build the error result for a mutator called before the rubric was
 * fetched this session.
 *
 * The text content is pretty-printed JSON with `error`, `message` and
 * `fix` keys. Per the original note, the structured `fix` field is
 * deliberate: LLMs recover better from an error that names the exact
 * next call than from prose.
 *
 * @param {string} toolName - the tool to retry, echoed in the message.
 * @returns {{content: {type: string, text: string}[], isError: boolean}}
 */
function rubricMissingResult(toolName) {
  const message = [
    `Call get_agents_md first this session, then retry ${toolName}. `,
    "The rubric defines acceptance criteria shape and grading dimensions — ",
    "proposals filed without it will not round-trip.",
  ].join("");
  const payload = {
    error: "prerequisite_missing",
    message,
    fix: "get_agents_md()",
  };
  const text = JSON.stringify(payload, null, 2);
  return { content: [{ type: "text", text }], isError: true };
}
662
+
663
/**
 * Build the error result for a mutator blocked by a discovery gate.
 * Same shape as the rubric-missing result: pretty-printed JSON with
 * `error`, `message` and a `fix` field naming the exact next call.
 *
 * @param {string} toolName - the tool to retry afterwards.
 * @param {string} fixCall - the call to make first; also the `fix` value.
 * @param {string} rationale - explanation appended to the message.
 * @returns {{content: {type: string, text: string}[], isError: boolean}}
 */
function discoveryMissingResult(toolName, fixCall, rationale) {
  const payload = {
    error: "discovery_missing",
    message: `Call ${fixCall} first this session, then retry ${toolName}. ${rationale}`,
    fix: fixCall,
  };
  const text = JSON.stringify(payload, null, 2);
  return { content: [{ type: "text", text }], isError: true };
}
688
+
689
/**
 * Telemetry write — fire-and-forget POST to `mcp_telemetry` via
 * PostgREST when SUPABASE_URL and a service-role key are set. Never
 * blocks or throws into the caller; every failure is logged to
 * stderr and swallowed.
 *
 * Three failure paths are covered:
 *   - the request body cannot be serialized (e.g. a circular
 *     payload), which would otherwise throw synchronously;
 *   - the request itself fails (network error → rejected promise);
 *   - the server answers with a non-2xx status. fetch() resolves
 *     normally in that case, so it has to be checked explicitly or
 *     rejected writes would go unnoticed.
 *
 * Per the original note, the goal isn't real-time observability but
 * accumulating signal about where the rubric flow breaks.
 *
 * @param {string} event - event name stored on the row.
 * @param {*} [payload] - arbitrary JSON-serializable detail; stored
 *   as null when omitted.
 * @param {string} [wsIdOverride] - workspace the call acted on; falls
 *   back to the env workspace id, then null.
 * @returns {void}
 */
function recordTelemetry(event, payload, wsIdOverride) {
  const { url, writeKey, workspaceId: envWsId } = supabaseConfig();
  if (!url || !writeKey) return; // no service key → telemetry is a no-op

  const reportFailure = (detail) => {
    log("telemetry write failed (non-fatal)", detail);
  };

  try {
    const body = {
      event,
      // Prefer the per-call workspace when one was passed so the row
      // captures the workspace actually being acted on, not just the
      // env default.
      workspace_id: wsIdOverride ?? envWsId ?? null,
      server_version: SERVER_VERSION,
      session_started_at: new Date(session.startedAt).toISOString(),
      rubric_fetched_at: session.rubricFetchedAt
        ? new Date(session.rubricFetchedAt).toISOString()
        : null,
      payload: payload ?? null,
    };
    // Deliberately not awaited: the caller must not wait on telemetry.
    fetch(`${url}/rest/v1/mcp_telemetry`, {
      method: "POST",
      headers: {
        apikey: writeKey,
        Authorization: `Bearer ${writeKey}`,
        "content-type": "application/json",
        Prefer: "return=minimal",
      },
      body: JSON.stringify(body),
    })
      .then((res) => {
        if (!res.ok) reportFailure(`HTTP ${res.status}`);
      })
      .catch((e) => {
        reportFailure(e?.message ?? String(e));
      });
  } catch (e) {
    // Synchronous failure while building or serializing the row.
    reportFailure(e?.message ?? String(e));
  }
}
728
+
729
+ const TOOLS = [
730
+ {
731
+ name: "list_themes",
732
+ description:
733
+ "List active themes (strategic pillars). Excludes archived themes by default.\n\n" +
734
+ "USE WHEN: orienting to the roadmap at session start, scoping which theme a capability belongs under, or answering 'what strategic bets are we tracking'.\n" +
735
+ "PREREQUISITE: none — read-only.\n" +
736
+ "ANTI-PATTERN: do not call to look up a single theme by id (use get_roadmap_snapshot or filter the response yourself). Do not call repeatedly in one session — theme catalogue is years-stable. Pass includeArchived=true only when reviewing closed bets — almost never in a planning session.\n" +
737
+ "EXAMPLE: list_themes()",
738
+ inputSchema: {
739
+ type: "object",
740
+ properties: {
741
+ workspaceId: { type: "string" },
742
+ includeArchived: { type: "boolean" },
743
+ },
744
+ additionalProperties: false,
745
+ },
746
+ },
747
+ {
748
+ name: "list_capabilities",
749
+ description:
750
+ "List active capabilities (quarterly bets). Excludes delivered and archived capabilities by default — agents should target work that's still in flight.\n\n" +
751
+ "USE WHEN: planning a feature and need to find the right parent capability, reviewing in-flight bets, or scoping what's still on the table this quarter.\n" +
752
+ "PREREQUISITE: none — read-only. For routing a specific work description, prefer suggest_capability_for which ranks by token overlap.\n" +
753
+ "ANTI-PATTERN: do not call to find a capability when you already know its id (use get_roadmap_snapshot for richer context). Pass includeDelivered=true or includeArchived=true only when reviewing historical bets — almost never in a planning session.\n" +
754
+ "EXAMPLE: list_capabilities({ themeId: 'TH-XXX' })",
755
+ inputSchema: {
756
+ type: "object",
757
+ properties: {
758
+ themeId: { type: "string" },
759
+ includeDelivered: { type: "boolean" },
760
+ includeArchived: { type: "boolean" },
761
+ workspaceId: { type: "string" },
762
+ },
763
+ additionalProperties: false,
764
+ },
765
+ },
766
+ {
767
+ name: "list_tasks",
768
+ description:
769
+ "List tasks. Filter by capabilityId or status. Excludes archived tasks by default.\n\n" +
770
+ "USE WHEN: surveying what already exists under a capability before proposing a new task (avoid duplicates), reviewing a status bucket (e.g. all in_progress), or answering 'what's open right now'.\n" +
771
+ "PREREQUISITE: none — read-only.\n" +
772
+ "ANTI-PATTERN: do not call to track in-progress work within a single conversation — use the harness TodoWrite tool. Do not call without a filter when the workspace has many tasks; scope by capabilityId or status. Pass includeArchived=true only when reviewing closed history.\n" +
773
+ "EXAMPLE: list_tasks({ capabilityId: 'CAP-XXX', status: 'in_progress' })",
774
+ inputSchema: {
775
+ type: "object",
776
+ properties: {
777
+ capabilityId: { type: "string" },
778
+ status: {
779
+ type: "string",
780
+ enum: ["delivered", "in_progress", "planned", "exploring"],
781
+ },
782
+ includeArchived: { type: "boolean" },
783
+ workspaceId: { type: "string" },
784
+ },
785
+ additionalProperties: false,
786
+ },
787
+ },
788
+ {
789
+ name: "get_task",
790
+ description:
791
+ "Return one task by id with full detail: title, summary, status, owner, acceptance criteria, dependsOn, attached PRs, and acceptance grades.\n\n" +
792
+ "USE WHEN: about to submit acceptance grades (need the criteria indexes), reviewing a specific task before linking a PR, or answering questions about a particular TK-XXXXXX.\n" +
793
+ "PREREQUISITE: none — read-only.\n" +
794
+ "ANTI-PATTERN: do not call to discover that a task exists — use list_tasks for discovery first. Don't loop over many ids; list_tasks returns the same shape in one round trip.\n" +
795
+ "EXAMPLE: get_task({ id: 'TK-100201' })",
796
+ inputSchema: {
797
+ type: "object",
798
+ properties: {
799
+ id: { type: "string" },
800
+ workspaceId: { type: "string" },
801
+ },
802
+ required: ["id"],
803
+ additionalProperties: false,
804
+ },
805
+ },
806
+ {
807
+ name: "get_agents_md",
808
+ description:
809
+ "Return the AGENTS.md planning contract — task shape, acceptance criteria format, capability outcome rubric, grading dimensions.\n\n" +
810
+ "USE WHEN: starting ANY planning session before calling propose_task / propose_capability / propose_theme / submit_acceptance_grades. Call this ONCE per session — the mutator tools refuse without it. Cache the result on your side.\n" +
811
+ "PREREQUISITE: none.\n" +
812
+ "ANTI-PATTERN: do not skip when the user says 'just plan some features' — the rubric IS the planning interface; proposals filed without it won't round-trip into the product.\n" +
813
+ "EXAMPLE: get_agents_md()",
814
+ inputSchema: { type: "object", properties: {}, additionalProperties: false },
815
+ },
816
+ {
817
+ name: "get_roadmap_snapshot",
818
+ description:
819
+ "Single-call orient: themes + active capabilities + in-flight tasks for the workspace, plus the resolved workspaceId. Always live. Excludes archived entities by default.\n\n" +
820
+ "USE WHEN: starting fresh in a workspace and need the whole canonical model in one read, or before opening a PR to confirm which workspace + capability to attach to.\n" +
821
+ "PREREQUISITE: none — read-only. Often the very first call after get_agents_md.\n" +
822
+ "ANTI-PATTERN: do not call repeatedly within one planning pass; the data doesn't change inside a single session. Use list_tasks / list_capabilities if you need just one slice. Pass includeArchived=true only when surveying historical state.\n" +
823
+ "EXAMPLE: get_roadmap_snapshot()",
824
+ inputSchema: {
825
+ type: "object",
826
+ properties: {
827
+ workspaceId: {
828
+ type: "string",
829
+ description:
830
+ "Optional. Override the env-default workspace. Useful when the agent is operating against a .roadmapper/snapshot.json that names its own workspace.",
831
+ },
832
+ includeArchived: { type: "boolean" },
833
+ },
834
+ additionalProperties: false,
835
+ },
836
+ },
837
+ {
838
+ name: "propose_task",
839
+ description:
840
+ "Propose a new task under an existing roadmapper capability. Server stamps authorKind='agent' + status='planned' + a TK-NNNNNN id.\n\n" +
841
+ "USE WHEN: the user asks to plan features, design new work, sketch a roadmap, file a TODO that should persist beyond this conversation, or break a capability into deliverables.\n" +
842
+ "PREREQUISITE: get_agents_md once this session (the server enforces this and returns an error with a `fix` field if missing). Call suggest_capability_for first to find the right parent capability — do not invent a new one.\n" +
843
+ "ANTI-PATTERN: do not call to track in-progress work within a single conversation — use the harness TodoWrite tool. Do not call to log a bug discovered during implementation — file in the issue tracker, not roadmapper. Do not call when you don't know which capability the task belongs under; resolve that first.\n" +
844
+ "EXAMPLE: propose_task({ capabilityId: 'CAP-XXX', title: 'Drag-and-drop block reorder', acceptance: ['Block can be dragged with mouse + keyboard', 'Order persists across reloads'], idempotencyKey: 'session-1-task-3' })\n\n" +
845
+ "Requires SUPABASE_SERVICE_ROLE_KEY. Pass idempotencyKey so retries don't duplicate. Pass dryRun: true to validate without writing. Pass workspaceId to target a workspace other than the env default.",
846
+ inputSchema: {
847
+ type: "object",
848
+ properties: {
849
+ capabilityId: { type: "string" },
850
+ title: { type: "string" },
851
+ summary: { type: "string" },
852
+ effort: { type: "string", enum: ["XS", "S", "M", "L", "XL"] },
853
+ priority: { type: "string", enum: ["P0", "P1", "P2", "P3"] },
854
+ kind: { type: "string", enum: ["feature", "bug", "chore", "spike"] },
855
+ owner: { type: "string" },
856
+ acceptance: { type: "array", items: { type: "string" } },
857
+ dependsOn: { type: "array", items: { type: "string" } },
858
+ expectedPRs: {
859
+ type: "number",
860
+ description:
861
+ "Advisory cap on merged PRs for this task. Unset by default (no cap). Webhook records a scope_overrun audit row when the actual count exceeds this. Not enforced; this is a hint to track how often tasks blow their envelope.",
862
+ },
863
+ expectedScope: {
864
+ type: "number",
865
+ description:
866
+ "Advisory cap on cumulative LoC (additions+deletions) across all PRs linked to this task. Webhook records a scope_overrun audit row when exceeded. Not enforced.",
867
+ },
868
+ idempotencyKey: { type: "string" },
869
+ dryRun: { type: "boolean" },
870
+ workspaceId: { type: "string" },
871
+ },
872
+ required: ["capabilityId", "title"],
873
+ additionalProperties: false,
874
+ },
875
+ },
876
+ {
877
+ name: "propose_theme",
878
+ description:
879
+ "Propose a new strategic theme (pillar). Themes are years-stable — only propose one when nothing existing fits.\n\n" +
880
+ "USE WHEN: the work the user is describing genuinely doesn't fit ANY existing theme, AND the user explicitly says they want a new strategic direction. Almost never the right answer in a planning session.\n" +
881
+ "PREREQUISITE: get_agents_md once this session (enforced). Theme discovery once this session, satisfied by suggest_theme_for (preferred — returns ranked matches with a fit signal), list_themes, or get_roadmap_snapshot. Enforced — the server returns discovery_missing with a fix field if you skip it. Duplicating a theme is the most common failure mode; the gate stops it.\n" +
882
+ "ANTI-PATTERN: do not call to organize a quarter of work — that's a capability, not a theme. Do not call because the existing themes feel too coarse — they're SUPPOSED to be coarse. Use propose_capability under an existing theme instead.\n" +
883
+ "EXAMPLE: propose_theme({ name: 'AI Agent Reliability', description: 'Multi-year bet on making agent workflows reproducible.', targetRoi: 20, idempotencyKey: 'session-1-theme-1' })\n\n" +
884
+ "Requires SUPABASE_SERVICE_ROLE_KEY. Pass idempotencyKey so retries don't duplicate. Pass dryRun: true to validate without writing. Pass workspaceId to target a workspace other than the env default.",
885
+ inputSchema: {
886
+ type: "object",
887
+ properties: {
888
+ name: { type: "string" },
889
+ description: { type: "string" },
890
+ color: { type: "string" },
891
+ targetRoi: { type: "number" },
892
+ idempotencyKey: { type: "string" },
893
+ dryRun: { type: "boolean" },
894
+ workspaceId: { type: "string" },
895
+ },
896
+ required: ["name"],
897
+ additionalProperties: false,
898
+ },
899
+ },
900
+ {
901
+ name: "propose_capability",
902
+ description:
903
+ "Propose a new capability under an existing theme — a quarterly bet with a falsifiable outcome.\n\n" +
904
+ "USE WHEN: planning a multi-task workstream that needs a shared outcome statement; the work is a coherent bet, not a single task; AND suggest_capability_for returned no strong match.\n" +
905
+ "PREREQUISITE: get_agents_md once this session (enforced). suggest_capability_for (or list_capabilities / get_roadmap_snapshot / the roadmapper://capabilities/active resource) once this session (enforced — server returns discovery_missing with a fix field if you skip it). The server WILL also reject if token overlap with an existing capability is too high; the gate is upstream of that.\n" +
906
+ "ANTI-PATTERN: do not call for a single deliverable — that's a task. Do not call when the outcome is fuzzy ('improve X') — the server rejects non-falsifiable outcomes. Do not call when an existing capability is close-enough; capabilities cost human attention to maintain.\n" +
907
+ "EXAMPLE: propose_capability({ pillarId: 'TH-XXX', name: 'Self-serve landing page builder', outcome: 'Customers publish a landing page in under 5 minutes without engineering involvement.', reach: 200, impact: 1, confidence: 70, idempotencyKey: 'session-1-cap-1' })\n\n" +
908
+ "Server rejects empty / non-falsifiable outcomes, confidence >95, and names <8 chars. Requires SUPABASE_SERVICE_ROLE_KEY. Pass idempotencyKey, dryRun, workspaceId as for propose_task.",
909
+ inputSchema: {
910
+ type: "object",
911
+ properties: {
912
+ name: { type: "string" },
913
+ pillarId: { type: "string" },
914
+ description: { type: "string" },
915
+ outcome: { type: "string" },
916
+ reach: { type: "number" },
917
+ impact: { type: "number", enum: [3, 2, 1, 0.5, 0.25] },
918
+ confidence: { type: "number", minimum: 0, maximum: 100 },
919
+ roi: { type: "number" },
920
+ specRef: { type: "string" },
921
+ idempotencyKey: { type: "string" },
922
+ dryRun: { type: "boolean" },
923
+ workspaceId: { type: "string" },
924
+ },
925
+ required: ["name", "pillarId", "outcome"],
926
+ additionalProperties: false,
927
+ },
928
+ },
929
+ {
930
+ name: "submit_acceptance_grades",
931
+ description:
932
+ "Stamp self-grade results onto a task's acceptanceGrades array. Each entry sets pass/fail on the criterion at the given index.\n\n" +
933
+ "USE WHEN: you've finished implementing a task and verified its acceptance criteria. Always call before opening a PR — the rubric requires self-grading prior to human review.\n" +
934
+ "PREREQUISITE: get_agents_md once this session (enforced — defines grading dimensions). Call get_task first to read the acceptance criteria in order — indexes are positional.\n" +
935
+ "ANTI-PATTERN: do not call before the implementation actually works — fabricated passes destroy the trust this signal carries. Do not call without a note when status='fail' — the reviewer needs the failure mode.\n" +
936
+ "EXAMPLE: submit_acceptance_grades({ taskId: 'TK-100201', grades: [{ index: 0, status: 'pass' }, { index: 1, status: 'fail', note: 'Reload-persistence is flaky on Firefox; tracked in TK-100202' }] })\n\n" +
937
+ "Requires SUPABASE_SERVICE_ROLE_KEY. Pass workspaceId to target a workspace other than the env default.",
938
+ inputSchema: {
939
+ type: "object",
940
+ properties: {
941
+ taskId: { type: "string" },
942
+ grades: {
943
+ type: "array",
944
+ items: {
945
+ type: "object",
946
+ properties: {
947
+ index: { type: "integer", minimum: 0 },
948
+ status: { type: "string", enum: ["pass", "fail"] },
949
+ note: { type: "string" },
950
+ },
951
+ required: ["index", "status"],
952
+ additionalProperties: false,
953
+ },
954
+ },
955
+ workspaceId: { type: "string" },
956
+ },
957
+ required: ["taskId", "grades"],
958
+ additionalProperties: false,
959
+ },
960
+ },
961
+ {
962
+ name: "suggest_capability_for",
963
+ description:
964
+ "Given a free-text description of work, return the top existing capabilities ranked by token overlap.\n\n" +
965
+ "USE WHEN: about to propose tasks or a capability — call this FIRST to find an existing parent. If any returned score > 0.4, strongly prefer attaching tasks there over creating a new capability.\n" +
966
+ "PREREQUISITE: none — read-only.\n" +
967
+ "ANTI-PATTERN: do not call after you've already decided to create a new capability — that's the case this tool is meant to prevent. Do not interpret weak matches (<0.2) as fits; if nothing's close, propose_capability is the right next call (after confirming with the user).\n" +
968
+ "EXAMPLE: suggest_capability_for({ description: 'multi-tenant landing page builder with drag-and-drop blocks' })",
969
+ inputSchema: {
970
+ type: "object",
971
+ properties: {
972
+ description: { type: "string" },
973
+ limit: { type: "integer", minimum: 1, maximum: 25 },
974
+ workspaceId: { type: "string" },
975
+ },
976
+ required: ["description"],
977
+ additionalProperties: false,
978
+ },
979
+ },
980
+ {
981
+ name: "suggest_theme_for",
982
+ description:
983
+ "Given a free-text description of work, return the top existing themes ranked by token overlap. Mirror of suggest_capability_for but at the theme level — themes are years-stable, so the bar to create a new one is higher.\n\n" +
984
+ "USE WHEN: about to plan a feature and you've decided you need a Theme/Capability/Task tree. Call this FIRST so you can decide whether to attach a new capability under an existing theme (the usual answer) or whether the work represents a genuinely new strategic direction worth a new theme.\n" +
985
+ "INTERPRETATION: a top score above ~0.4 = existing theme fits, do NOT create a new one. 0.2-0.4 = weak overlap, almost always still better to use the existing theme. Below 0.2 OR empty matches = ask the user before calling propose_theme — themes are years-stable, not per-feature, and duplicates are the most common failure mode.\n" +
986
+ "PREREQUISITE: none — read-only. Also satisfies the discovery gate for propose_theme.\n" +
987
+ "ANTI-PATTERN: do not call after deciding to create a new theme — that's the case this tool is meant to prevent. Do not interpret weak matches as 'must create new' without explicit user confirmation that a new strategic direction is intended.\n" +
988
+ "EXAMPLE: suggest_theme_for({ description: 'multi-channel marketing analytics dashboard with attribution modeling' })",
989
+ inputSchema: {
990
+ type: "object",
991
+ properties: {
992
+ description: { type: "string" },
993
+ limit: { type: "integer", minimum: 1, maximum: 25 },
994
+ workspaceId: { type: "string" },
995
+ },
996
+ required: ["description"],
997
+ additionalProperties: false,
998
+ },
999
+ },
1000
+ {
1001
+ name: "link_pr",
1002
+ description:
1003
+ "Attach a PR to a task. Closes the deliver-loop gap so an agent that just opened a PR can stamp it onto the parent task without waiting for the next GitHub webhook.\n\n" +
1004
+ "USE WHEN: you just opened a PR for a task and want it visible in roadmapper immediately. Always call alongside submit_acceptance_grades when closing out a task.\n" +
1005
+ "PREREQUISITE: get_agents_md once this session (enforced). The task id must exist (get_task / list_tasks to confirm).\n" +
1006
+ "ANTI-PATTERN: do not call as a substitute for the Roadmapper-Task: PR-body trailer convention — the trailer is the durable contract; link_pr is the instant-feedback shortcut. Do not call for PRs that don't have a parent task in roadmapper.\n" +
1007
+ "EXAMPLE: link_pr({ taskId: 'TK-100201', repo: 'acme/frontend', number: 1234, title: 'Drag block reorder', authorGithub: 'octocat' })\n\n" +
1008
+ "Idempotent by (repo, number) — re-calling with an already-linked PR returns idempotent:true. Requires SUPABASE_SERVICE_ROLE_KEY. Pass workspaceId to target a workspace other than the env default.",
1009
+ inputSchema: {
1010
+ type: "object",
1011
+ properties: {
1012
+ taskId: { type: "string" },
1013
+ repo: { type: "string" },
1014
+ number: { type: "integer", minimum: 1 },
1015
+ title: { type: "string" },
1016
+ merged: { type: "boolean" },
1017
+ mergedAt: { type: "string" },
1018
+ authorGithub: { type: "string" },
1019
+ authorKind: { type: "string", enum: ["human", "agent"] },
1020
+ workspaceId: { type: "string" },
1021
+ },
1022
+ required: ["taskId", "repo", "number"],
1023
+ additionalProperties: false,
1024
+ },
1025
+ },
1026
+ // ── Archive lifecycle (Phase 2 of the upgrade) ──────────────
1027
+ // Soft delete: archived rows stay in the workspace's edits
1028
+ // blob; list views filter them out, by-id lookups still
1029
+ // resolve. Refuses with active children (forces bottom-up
1030
+ // archive). Reason required on every call (audit trail).
1031
+ ...archiveLifecycleTools(),
1032
+ // ── Move lifecycle (Phase 3 of the upgrade) ─────────────────
1033
+ // Re-parent tasks under capabilities and capabilities under
1034
+ // themes. IDs are stable across moves. Target parent must be
1035
+ // active (refuses move into archived parent). Moving an
1036
+ // archived entity into an active parent unarchives in one step.
1037
+ ...moveLifecycleTools(),
1038
+ // ── Update lifecycle (Phase 4 of the upgrade) ───────────────
1039
+ // Patch fields on an entity. Parent fields (capabilityId,
1040
+ // pillarId) and lifecycle flags (archived, archivedAt) are
1041
+ // forbidden — those go through move_/archive_/unarchive_ for
1042
+ // audit clarity. UP5 idempotent on identical input.
1043
+ ...updateLifecycleTools(),
1044
+ // ── Outcome readings ────────────────────────────────────────
1045
+ // Track empirical metric readings against capability outcomes.
1046
+ // Append-only; the RPC takes a row lock so concurrent writes
1047
+ // never clobber. list_stale_outcomes flags capabilities whose
1048
+ // most recent reading is older than N days.
1049
+ {
1050
+ name: "record_outcome_reading",
1051
+ description:
1052
+ "Record a metric reading against a capability's stated outcome. Captures the empirical signal between 'outcome declared' and 'outcome decided.'\n\n" +
1053
+ "USE WHEN: you have a fresh measurement of the metric the capability is moving. A weekly Mixpanel paste, a warehouse extract, a Datadog reading — any source. The reading append-only-ly augments the capability's history; it doesn't replace prior readings.\n" +
1054
+ "PREREQUISITE: get_agents_md once this session (enforced). The capability must exist.\n" +
1055
+ "ANTI-PATTERN: do not use to declare the FINAL outcome (use outcomeStatus via update_capability for that). Readings are observations along the way, not the verdict.\n" +
1056
+ "EXAMPLE: record_outcome_reading({ capabilityId: 'CAP-9F2C7E', value: 0.41, asOf: '2026-05-12', source: 'mixpanel: activated_within_7d weekly', note: 'sample size 4218' })\n\n" +
1057
+ "Requires SUPABASE_SERVICE_ROLE_KEY. Audit log records each reading as 'outcome_reading_recorded'.",
1058
+ inputSchema: {
1059
+ type: "object",
1060
+ properties: {
1061
+ capabilityId: { type: "string", description: "CAP-XXXXXX" },
1062
+ value: { type: "number", description: "The metric reading (cardinality matches the outcome statement)." },
1063
+ asOf: { type: "string", description: "ISO date or timestamp the reading was sampled (not recorded)." },
1064
+ source: { type: "string", description: "Where the reading came from (e.g. 'mixpanel weekly', 'warehouse:fact_orders')." },
1065
+ note: { type: "string" },
1066
+ workspaceId: { type: "string" },
1067
+ },
1068
+ required: ["capabilityId", "value", "asOf", "source"],
1069
+ additionalProperties: false,
1070
+ },
1071
+ },
1072
+ {
1073
+ name: "list_stale_outcomes",
1074
+ description:
1075
+ "List capabilities whose outcome metric hasn't been measured recently. Default threshold: 14 days. Surfaces bets that have lost the empirical loop — outcome was declared but nobody's checking.\n\n" +
1076
+ "USE WHEN: at quarterly review, weekly outcome check, or any time you want to spot capabilities that are running without a reading.\n" +
1077
+ "Returns each stale capability with its id, name, outcome, days since last reading (or null if never), and most recent reading if present.",
1078
+ inputSchema: {
1079
+ type: "object",
1080
+ properties: {
1081
+ thresholdDays: {
1082
+ type: "number",
1083
+ description: "Days since last reading to count as stale. Default 14.",
1084
+ },
1085
+ includeArchived: { type: "boolean" },
1086
+ workspaceId: { type: "string" },
1087
+ },
1088
+ additionalProperties: false,
1089
+ },
1090
+ },
1091
+ ];
1092
+
1093
/**
 * Build the six archive/unarchive tool definitions: archive_<kind> and
 * unarchive_<kind> for each of task, capability, and theme.
 *
 * The tools share most of their schema — same shape per entity kind,
 * same required inputs (entity id + reason), same optional knobs
 * (idempotencyKey, dryRun, workspaceId) — so they are generated from a
 * single per-kind table to keep the surface in sync if the contract
 * changes.
 *
 * @returns {Array<{name: string, description: string, inputSchema: object}>}
 *   Tool definitions in the order archive_<kind>, unarchive_<kind> for
 *   each kind in table order (task, capability, theme).
 */
function archiveLifecycleTools() {
  // Per-kind table. Each kind carries its own archive AND unarchive
  // example so the unarchive text never reuses an archive rationale.
  // Example reasons avoid apostrophes: they are shown inside
  // single-quoted literals and must stay copy-pasteable.
  const kinds = [
    {
      kind: "task",
      idDoc: "TK-NNNNNN",
      idKey: "taskId",
      archiveExample:
        "archive_task({ taskId: 'TK-100201', reason: 'cut from this quarter; superseded by TK-100299' })",
      unarchiveExample:
        "unarchive_task({ taskId: 'TK-100201', reason: 'back in scope after replanning this quarter' })",
    },
    {
      kind: "capability",
      idDoc: "CAP-XXXXXX",
      idKey: "capabilityId",
      archiveExample:
        "archive_capability({ capabilityId: 'CAP-9F2C7E', reason: 'bet was wrong; we are going a different direction' })",
      unarchiveExample:
        "unarchive_capability({ capabilityId: 'CAP-9F2C7E', reason: 'bet is back on after new customer evidence' })",
    },
    {
      kind: "theme",
      idDoc: "TH-XXXXXX",
      idKey: "themeId",
      archiveExample:
        "archive_theme({ themeId: 'TH-OLD-AREA', reason: 'theme retired; remaining bets re-parented' })",
      unarchiveExample:
        "unarchive_theme({ themeId: 'TH-OLD-AREA', reason: 'theme reinstated; this direction is active again' })",
    },
  ];

  const out = [];
  for (const { kind, idDoc, idKey, archiveExample, unarchiveExample } of kinds) {
    const idSchema = { [idKey]: { type: "string", description: idDoc } };

    // Themes are roots (no parent), so there is no move tool for them;
    // pointing a theme at move_capability would be misleading guidance.
    const isTheme = kind === "theme";
    const archiveAlternative = isTheme
      ? "themes have no parent, so there is no move_theme; prefer updating it (update_theme)"
      : `prefer moving it (move_${kind}) or updating its status`;
    const unarchiveMoveHint = isTheme
      ? "Themes are top-level (no parent), so there is no move path — unarchive_theme is how a theme is reactivated."
      : `To move an archived entity to a different parent, call move_${kind} instead — that path unarchives in one step.`;

    out.push({
      name: `archive_${kind}`,
      description:
        `Archive a ${kind} (soft delete). The row stays in the workspace; list views filter it out, by-id lookups still resolve.\n\n` +
        `USE WHEN: a ${kind} is no longer relevant — cut from scope, superseded, or retired. Soft delete preserves history without cluttering the active roadmap.\n` +
        "PREREQUISITE: get_agents_md once this session (enforced). For capabilities/themes, every active child must already be archived — the server refuses with a count of blocking children. For tasks, no child check.\n" +
        `ANTI-PATTERN: do not archive a ${kind} you might come back to within the same session — ${archiveAlternative}. Archive is the right tool for "this is closed out, get it out of the picker."\n` +
        `EXAMPLE: ${archiveExample}\n\n` +
        "Idempotent: re-archiving an already-archived entity returns { idempotent: true } and emits no audit row. Requires SUPABASE_SERVICE_ROLE_KEY. Pass workspaceId to target a workspace other than the env default.",
      inputSchema: {
        type: "object",
        properties: {
          ...idSchema,
          reason: {
            type: "string",
            description:
              "Why this is being archived. Required — landed in the audit log so future readers know the rationale.",
          },
          idempotencyKey: { type: "string" },
          dryRun: { type: "boolean" },
          workspaceId: { type: "string" },
        },
        required: [idKey, "reason"],
        additionalProperties: false,
      },
    });

    out.push({
      name: `unarchive_${kind}`,
      description:
        `Unarchive a ${kind}. Reverses archive_${kind}.\n\n` +
        `USE WHEN: an archived ${kind} is being pulled back into scope. ${unarchiveMoveHint}\n` +
        "PREREQUISITE: get_agents_md once this session (enforced). The parent (if any) must be active — cannot unarchive a task whose capability is archived, or a capability whose theme is archived. Unarchive the parent first.\n" +
        "ANTI-PATTERN: do not unarchive en masse without thinking — every unarchive re-floats noise into list views. If you're recovering from an over-aggressive archive sweep, work top-down.\n" +
        `EXAMPLE: ${unarchiveExample}\n\n` +
        "Idempotent: unarchiving an already-active entity returns { idempotent: true }. Requires SUPABASE_SERVICE_ROLE_KEY. Pass workspaceId to target a workspace other than the env default.",
      inputSchema: {
        type: "object",
        properties: {
          ...idSchema,
          reason: {
            type: "string",
            description:
              "Why this is being unarchived. Required — landed in the audit log so future readers know the rationale.",
          },
          idempotencyKey: { type: "string" },
          dryRun: { type: "boolean" },
          workspaceId: { type: "string" },
        },
        required: [idKey, "reason"],
        additionalProperties: false,
      },
    });
  }
  return out;
}
1178
+
1179
/**
 * Build the four move tool definitions: two single (move_task,
 * move_capability) and two bulk (move_tasks, move_capabilities).
 *
 * Single tools re-parent one entity. Bulk tools accept 1–100 moves
 * (enforced by the schema's minItems/maxItems); per the tool contract
 * the server stamps a shared batchId into each audit row so a reorg
 * shows up in history as one logical operation. Themes have no
 * parent, so there is no move_theme.
 *
 * @returns {Array<{name: string, description: string, inputSchema: object}>}
 *   Tool definitions in the order move_<kind>, move_<kindPlural> for
 *   each kind in table order (task, capability).
 */
function moveLifecycleTools() {
  // Per-kind table. Both the single and the bulk example live here so
  // every per-kind string is defined in one place.
  const kinds = [
    {
      kind: "task",
      kindPlural: "tasks",
      idKey: "taskId",
      idDoc: "TK-NNNNNN",
      parentKey: "newCapabilityId",
      parentDoc: "CAP-XXXXXX",
      parentKind: "capability",
      example:
        "move_task({ taskId: 'TK-100201', newCapabilityId: 'CAP-7A2D9F', reason: 'belongs under the auth capability, not onboarding' })",
      bulkExample:
        "move_tasks({ moves: [{ taskId: 'TK-100201', newCapabilityId: 'CAP-7A2D9F' }, { taskId: 'TK-100202', newCapabilityId: 'CAP-7A2D9F' }], reason: 'auth reorg — pulling these under the new auth capability' })",
    },
    {
      kind: "capability",
      kindPlural: "capabilities",
      idKey: "capabilityId",
      idDoc: "CAP-XXXXXX",
      parentKey: "newThemeId",
      // Same theme-id placeholder the archive/update tool schemas use.
      parentDoc: "TH-XXXXXX",
      parentKind: "theme",
      example:
        "move_capability({ capabilityId: 'CAP-9F2C7E', newThemeId: 'TH-100042', reason: 'theme reorg — moving under Platform' })",
      bulkExample:
        "move_capabilities({ moves: [{ capabilityId: 'CAP-9F2C7E', newThemeId: 'TH-100042' }, { capabilityId: 'CAP-9F2C7F', newThemeId: 'TH-100042' }], reason: 'theme reorg — moving under Platform' })",
    },
  ];

  const out = [];
  for (const spec of kinds) {
    const { kind, kindPlural, idKey, idDoc, parentKey, parentDoc, parentKind, example, bulkExample } = spec;

    // Single-entity move.
    out.push({
      name: `move_${kind}`,
      description:
        `Re-parent a ${kind} under a different ${parentKind}. The ${kind}'s id stays the same (${idDoc} never changes).\n\n` +
        `USE WHEN: a ${kind} is in the wrong place — wrong ${parentKind} after a reorg, or initially proposed under the wrong parent. To unarchive while moving, pass the archived entity's id with a DIFFERENT active target parent; if the new parent is active the entity unarchives in one step. To unarchive in place (without moving), call unarchive_${kind} directly — move to the SAME parent short-circuits as idempotent and won't unarchive.\n` +
        "PREREQUISITE: get_agents_md once this session (enforced). Target parent must exist AND be active — refuses move into an archived parent.\n" +
        // update_<kind> is a shipped tool (see updateLifecycleTools), so
        // it is referenced directly rather than as a future feature.
        `ANTI-PATTERN: do not use move to change anything other than the parent. To rename or rescope, use update_${kind}. To delete, use archive_${kind}.\n` +
        `EXAMPLE: ${example}\n\n` +
        "Idempotent: moving to the current parent returns { idempotent: true } and emits no audit row. Requires SUPABASE_SERVICE_ROLE_KEY. Pass workspaceId to target a workspace other than the env default.",
      inputSchema: {
        type: "object",
        properties: {
          [idKey]: { type: "string", description: idDoc },
          [parentKey]: { type: "string", description: `Target ${parentKind} id (${parentDoc}).` },
          reason: {
            type: "string",
            description:
              "Why this is being moved. Required — landed in the audit log so future readers know the rationale.",
          },
          dryRun: { type: "boolean" },
          workspaceId: { type: "string" },
        },
        required: [idKey, parentKey, "reason"],
        additionalProperties: false,
      },
    });

    // Bulk move: one shared reason, per-item id + new parent.
    out.push({
      name: `move_${kindPlural}`,
      description:
        `Bulk re-parent up to 100 ${kindPlural} in one call. Each item lists the entity id and its new ${parentKind}; the server stamps a shared batchId so the audit log groups the reorg as one logical operation.\n\n` +
        `USE WHEN: you have a planned reorg touching many ${kindPlural} at once — splitting a ${parentKind}, merging two, or rebalancing scope. The shared batchId is what makes a 50-row reorg show up as one event in history rather than 50 disconnected moves.\n` +
        "PREREQUISITE: get_agents_md once this session (enforced). Each move follows the same rules as the single tool — target parent active, reason required. Partial failures: the server processes moves one at a time and returns per-item ok/error; later failures do not roll back earlier successes.\n" +
        "ANTI-PATTERN: do not loop the single tool when you have a batch — you lose the batchId grouping in audit history. Conversely, do not use bulk for a single move; the single tool has a cleaner response shape.\n" +
        `EXAMPLE: ${bulkExample}\n\n` +
        "Requires SUPABASE_SERVICE_ROLE_KEY. Pass workspaceId to target a workspace other than the env default.",
      inputSchema: {
        type: "object",
        properties: {
          moves: {
            type: "array",
            minItems: 1,
            maxItems: 100,
            items: {
              type: "object",
              properties: {
                [idKey]: { type: "string", description: idDoc },
                [parentKey]: { type: "string", description: parentDoc },
              },
              required: [idKey, parentKey],
              additionalProperties: false,
            },
            description: "Up to 100 moves to apply with a shared batchId.",
          },
          reason: {
            type: "string",
            description:
              "Shared rationale for the reorg — written to every audit row in the batch.",
          },
          dryRun: { type: "boolean" },
          workspaceId: { type: "string" },
        },
        required: ["moves", "reason"],
        additionalProperties: false,
      },
    });
  }
  return out;
}
1284
+
1285
+ /**
1286
+ * Three update tools: one per kind. Each accepts a `patch` object
1287
+ * whose keys are the fields to mutate. Schemas enumerate the
1288
+ * mutable fields per kind so an agent can introspect what's
1289
+ * settable. Parent fields and lifecycle flags are NOT listed — the
1290
+ * server also rejects them, but advertising them up-front prevents
1291
+ * the round-trip.
1292
+ */
1293
+ function updateLifecycleTools() {
1294
+ const kinds = [
1295
+ {
1296
+ kind: "task",
1297
+ idKey: "taskId",
1298
+ idDoc: "TK-NNNNNN",
1299
+ patchProps: {
1300
+ title: { type: "string", description: "Task title. Minimum 5 chars." },
1301
+ summary: { type: "string", description: "Free-form description." },
1302
+ status: {
1303
+ type: "string",
1304
+ enum: ["delivered", "in_progress", "planned", "exploring"],
1305
+ },
1306
+ priority: { type: "string", enum: ["P0", "P1", "P2", "P3"] },
1307
+ effort: { type: "string", enum: ["XS", "S", "M", "L", "XL"] },
1308
+ kind: { type: "string", enum: ["feature", "bug", "chore", "spike"] },
1309
+ start: { type: "string", description: "ISO date YYYY-MM-DD." },
1310
+ target: { type: "string", description: "ISO date YYYY-MM-DD." },
1311
+ progress: { type: "number", description: "0–100." },
1312
+ owner: { type: "string" },
1313
+ team: { type: "string" },
1314
+ tags: { type: "array", items: { type: "string" } },
1315
+ acceptance: { type: "array" },
1316
+ dependsOn: { type: "array", items: { type: "string" } },
1317
+ links: { type: "object", additionalProperties: { type: "string" } },
1318
+ expectedPRs: {
1319
+ type: "number",
1320
+ description: "Advisory: max merged PRs for this task (overrun → audit warning).",
1321
+ },
1322
+ expectedScope: {
1323
+ type: "number",
1324
+ description: "Advisory: cumulative LoC ceiling across linked PRs.",
1325
+ },
1326
+ },
1327
+ example:
1328
+ "update_task({ taskId: 'TK-100201', patch: { status: 'in_progress', progress: 25 }, reason: 'work started — kicking off this week' })",
1329
+ },
1330
+ {
1331
+ kind: "capability",
1332
+ idKey: "capabilityId",
1333
+ idDoc: "CAP-XXXXXX",
1334
+ patchProps: {
1335
+ name: { type: "string", description: "Capability name. Minimum 8 chars." },
1336
+ outcome: { type: "string", description: "Falsifiable outcome statement." },
1337
+ hypothesis: { type: "string" },
1338
+ owner: { type: "string" },
1339
+ team: { type: "string" },
1340
+ confidence: { type: "number", description: "0–95." },
1341
+ impact: { type: "number", description: "One of 0.25, 0.5, 1, 2, 3." },
1342
+ roi: { type: "number" },
1343
+ tags: { type: "array", items: { type: "string" } },
1344
+ links: { type: "object", additionalProperties: { type: "string" } },
1345
+ },
1346
+ example:
1347
+ "update_capability({ capabilityId: 'CAP-9F2C7E', patch: { confidence: 80, outcome: 'Activation moves from 32% to 55% by 2026-09-30, measured by mixpanel_activated_v2.' }, reason: 'sharper outcome after the leadership review' })",
1348
+ },
1349
+ {
1350
+ kind: "theme",
1351
+ idKey: "themeId",
1352
+ idDoc: "TH-XXXXXX",
1353
+ patchProps: {
1354
+ name: { type: "string", description: "Theme name. Minimum 5 chars." },
1355
+ description: { type: "string" },
1356
+ owner: { type: "string" },
1357
+ targetRoi: { type: "number" },
1358
+ },
1359
+ example:
1360
+ "update_theme({ themeId: 'TH-100042', patch: { name: 'Platform Reliability' }, reason: 'sharper name; same scope' })",
1361
+ },
1362
+ ];
1363
+ return kinds.map(({ kind, idKey, idDoc, patchProps, example }) => {
1364
+ // Themes are roots (no parent), so the "use move_*" guidance
1365
+ // doesn't apply to them. Per-kind reparenting hint:
1366
+ const reparentHint =
1367
+ kind === "task"
1368
+ ? "Reparenting must use move_task — passing capabilityId/id/archived in the patch is rejected server-side."
1369
+ : kind === "capability"
1370
+ ? "Reparenting must use move_capability — passing pillarId/id/archived in the patch is rejected server-side."
1371
+ : "Themes are top-level (no parent). Passing id/archived in the patch is rejected server-side; use archive_theme to retire.";
1372
+ return {
1373
+ name: `update_${kind}`,
1374
+ description:
1375
+ `Patch fields on a ${kind}. The patch is a partial object — only the keys you include are touched.\n\n` +
1376
+ `USE WHEN: a ${kind}'s details need to change. Renaming, sharpening outcomes, bumping confidence, fixing typos, advancing status, adding tags — all here.\n` +
1377
+ `PREREQUISITE: get_agents_md once this session (enforced). Reason required (audit trail). ${reparentHint}\n` +
1378
+ `ANTI-PATTERN: do not echo the entity back to the server — pass only the keys that changed. The server diffs against current state and a patch that matches everything returns { idempotent: true }.\n` +
1379
+ `EXAMPLE: ${example}\n\n` +
1380
+ "Idempotent: a patch where every key already matches current state returns { idempotent: true } and emits no audit row. Requires SUPABASE_SERVICE_ROLE_KEY.",
1381
+ inputSchema: {
1382
+ type: "object",
1383
+ properties: {
1384
+ [idKey]: { type: "string", description: idDoc },
1385
+ patch: {
1386
+ type: "object",
1387
+ description: `Partial ${kind} — keys to update. Parent fields and lifecycle flags are rejected.`,
1388
+ properties: patchProps,
1389
+ additionalProperties: false,
1390
+ minProperties: 1,
1391
+ },
1392
+ reason: {
1393
+ type: "string",
1394
+ description: "Why this is being updated. Required — landed in the audit log.",
1395
+ },
1396
+ dryRun: { type: "boolean" },
1397
+ workspaceId: { type: "string" },
1398
+ },
1399
+ required: [idKey, "patch", "reason"],
1400
+ additionalProperties: false,
1401
+ },
1402
+ };
1403
+ });
1404
+ }
1405
+
1406
/**
 * Names of every tool that writes to the workspace. callTool checks
 * membership in this set to decide whether a call has to clear the
 * rubric gate before it is dispatched.
 */
const MUTATOR_TOOLS = new Set(
  [
    // Creation, grading and PR linkage.
    ["propose_task", "propose_theme", "propose_capability"],
    ["submit_acceptance_grades", "link_pr"],
    // Lifecycle flags.
    ["archive_task", "archive_capability", "archive_theme"],
    ["unarchive_task", "unarchive_capability", "unarchive_theme"],
    // Reparenting, single and bulk.
    ["move_task", "move_capability", "move_tasks", "move_capabilities"],
    // Field patches.
    ["update_task", "update_capability", "update_theme"],
    // Outcome measurements.
    ["record_outcome_reading"],
  ].flat()
);
1428
+
1429
/**
 * Dispatch one MCP tool call by name.
 *
 * Steps, in order:
 *   1. Resolve the workspace id (args.workspaceId may override it).
 *   2. Load the projected roadmap for that workspace; when the read
 *      yields null/undefined, project the bundled seed instead.
 *   3. For tools listed in MUTATOR_TOOLS, run the gates: rubric
 *      fetched → per-tool discovery (propose_theme /
 *      propose_capability) → cross-workspace guard. A failed gate bumps
 *      session.mutatorBlocks, records telemetry and returns early.
 *   4. Route to the per-tool handler via the switch below.
 *
 * Several read tools also stamp session timestamps (themesListedAt,
 * capsDiscoveredAt, rubricFetchedAt) that later satisfy the gates.
 *
 * @param {string} name Tool name as sent by the client.
 * @param {object} [args] Tool arguments. Read with optional chaining
 *   throughout this function, so an absent payload is tolerated here.
 * @returns {Promise<object>} The handler's result, or an errorResult
 *   for an unknown tool name / a blocked mutator.
 */
async function callTool(name, args) {
  // Each tool may override the workspace via args.workspaceId. The
  // projection is workspace-scoped, so we pass that through to the
  // read. Tools that need to know the resolved id later (write paths,
  // snapshot) read it back via resolveWorkspaceId(args?.workspaceId).
  const wsId = resolveWorkspaceId(args?.workspaceId);
  // Post-Piece-6c, the entity tables ARE the canonical projection
  // — no edits blob, no seed-overlay merge. Fall back to the
  // bundled seed only when the DB is unreachable (offline / dev).
  // Note: this read runs before the gates, so blocked mutator calls
  // still pay for it.
  const projected =
    (await readWorkspaceProjected(wsId)) ?? project(readSeed(), {});

  // Rubric gate. The agent must have called get_agents_md this
  // session before any mutator runs — the rubric defines validation
  // shapes the mutator enforces, and we'd rather block the call
  // than let a malformed proposal land in the roadmap. Read-only
  // tools and get_agents_md itself are always available.
  if (MUTATOR_TOOLS.has(name)) {
    // Counted for every mutator call, including ones blocked below.
    session.mutatorAttempts += 1;
    // Best-effort target attribution for telemetry. Pulled here so
    // every mutator branch below shares it. The arg names are
    // inconsistent across tools (id / taskId / capabilityId /
    // themeId / pillarId), and bulk reorgs (move_tasks /
    // move_capabilities) nest ids inside args.moves[]. We try the
    // obvious candidates in a sensible priority order. The first
    // hit wins. Null when the tool genuinely doesn't carry a
    // target (e.g. propose_* calls that create a new record where
    // the id only exists after the server stamps it).
    const firstMove = Array.isArray(args?.moves) ? args.moves[0] : null;
    const targetId =
      (typeof args?.id === "string" && args.id) ||
      (typeof args?.taskId === "string" && args.taskId) ||
      (typeof args?.capabilityId === "string" && args.capabilityId) ||
      (typeof args?.themeId === "string" && args.themeId) ||
      (typeof args?.pillarId === "string" && args.pillarId) ||
      (firstMove &&
        typeof firstMove.taskId === "string" &&
        firstMove.taskId) ||
      (firstMove &&
        typeof firstMove.capabilityId === "string" &&
        firstMove.capabilityId) ||
      null;
    if (session.rubricFetchedAt === null) {
      session.mutatorBlocks += 1;
      recordTelemetry(
        "mutator_blocked_no_rubric",
        { tool: name, targetId },
        wsId
      );
      return rubricMissingResult(name);
    }
    // Per-tool discovery gates. Block propose_theme until the agent
    // has actually inspected the existing theme catalogue, and
    // propose_capability until they've ranked existing caps for fit.
    // Tool descriptions already steer agents this way; this turns
    // the recommendation into enforcement so the most common
    // failure mode (creating a duplicate of an existing record) can't
    // slip through when the LLM skipped the discovery step.
    if (name === "propose_theme" && session.themesListedAt === null) {
      session.mutatorBlocks += 1;
      recordTelemetry(
        "mutator_blocked_no_discovery",
        { tool: name, missing: "suggest_theme_for", targetId },
        wsId
      );
      return discoveryMissingResult(
        name,
        'suggest_theme_for({ description: "<the work you are about to propose>" })',
        "Rank existing themes by relevance before proposing a new one — themes are years-stable, duplicates are the most common failure mode. Any returned top score >0.4 means an existing theme is a sensible home; re-use it. list_themes() or get_roadmap_snapshot() also satisfy this gate if you want the full catalogue."
      );
    }
    if (
      name === "propose_capability" &&
      session.capsDiscoveredAt === null
    ) {
      session.mutatorBlocks += 1;
      recordTelemetry(
        "mutator_blocked_no_discovery",
        {
          tool: name,
          missing: "suggest_capability_for",
          targetId,
        },
        wsId
      );
      return discoveryMissingResult(
        name,
        'suggest_capability_for({ description: "<the work you are about to propose>" })',
        "Rank existing capabilities by relevance before proposing a new one. If any score is >0.4, attach tasks there instead."
      );
    }
    // Cross-workspace guard. If the cwd has a .roadmapper/snapshot.json
    // naming a workspace, and the call carries an explicit workspaceId
    // pointing somewhere else, refuse — almost always a mistake. An
    // operator who really needs to write across workspaces can set
    // ROADMAPPER_ALLOW_CROSS_WORKSPACE=1 in env to bypass.
    const snap = snapshotWorkspaceId();
    const argWs = args?.workspaceId;
    if (
      snap &&
      typeof argWs === "string" &&
      argWs.length > 0 &&
      argWs !== snap &&
      process.env.ROADMAPPER_ALLOW_CROSS_WORKSPACE !== "1"
    ) {
      session.mutatorBlocks += 1;
      recordTelemetry(
        "mutator_blocked_cross_workspace",
        { tool: name, targetId, cwdWorkspace: snap, argWorkspace: argWs },
        wsId
      );
      return errorResult(
        `Refusing cross-workspace write: cwd's .roadmapper/snapshot.json names workspace "${snap}" but ${name} call targets "${argWs}". Almost always a mistake — drop the workspaceId arg to use the cwd default, or set ROADMAPPER_ALLOW_CROSS_WORKSPACE=1 to override.`
      );
    }
    // All gates passed — record the attempt before dispatching.
    recordTelemetry("mutator_attempted", { tool: name, targetId }, wsId);
  }

  switch (name) {
    case "list_themes": {
      // Satisfies the propose_theme discovery gate. The agent has
      // explicitly enumerated the existing theme catalogue.
      session.themesListedAt = Date.now();
      let filtered = projected.themes;
      if (!args?.includeArchived) {
        filtered = filtered.filter((t) => !t.archived);
      }
      return withReminder(
        "list_themes",
        projected,
        textResult(JSON.stringify(filtered, null, 2))
      );
    }
    case "list_capabilities": {
      // Counts as cap discovery for the propose_capability gate.
      session.capsDiscoveredAt = Date.now();
      // Capabilities reference their parent theme through `pillarId`,
      // so the themeId filter compares against that field.
      let filtered = args?.themeId
        ? projected.capabilities.filter((c) => c.pillarId === args.themeId)
        : projected.capabilities;
      if (!args?.includeDelivered) {
        filtered = filtered.filter(
          (c) => effectiveCapabilityStatus(c, projected.tasks) !== "delivered"
        );
      }
      if (!args?.includeArchived) {
        filtered = filtered.filter((c) => !c.archived);
      }
      return withReminder(
        "list_capabilities",
        projected,
        textResult(JSON.stringify(filtered, null, 2))
      );
    }
    case "list_tasks": {
      // Optional capabilityId / status filters; archived tasks are
      // hidden unless includeArchived is truthy.
      let filtered = projected.tasks;
      if (args?.capabilityId)
        filtered = filtered.filter((t) => t.capabilityId === args.capabilityId);
      if (args?.status)
        filtered = filtered.filter((t) => t.status === args.status);
      if (!args?.includeArchived) {
        filtered = filtered.filter((t) => !t.archived);
      }
      return withReminder(
        "list_tasks",
        projected,
        textResult(JSON.stringify(filtered, null, 2))
      );
    }
    case "get_task": {
      // S5: direct-by-id lookups always resolve, even for archived
      // entities. Cross-references (PR links, dependsOn) need this.
      const t = projected.tasks.find((x) => x.id === args?.id);
      if (!t) return errorResult(`Task ${args?.id} not found.`);
      return textResult(JSON.stringify(t, null, 2));
    }
    case "get_agents_md": {
      // Opens the rubric gate. Telemetry is only emitted on the first
      // fetch of the session; the timestamp is refreshed every time.
      const fresh = session.rubricFetchedAt === null;
      session.rubricFetchedAt = Date.now();
      if (fresh) recordTelemetry("rubric_fetched", { via: "tool" }, wsId);
      return textResult(readAgentsMd(), {
        _meta: {
          roadmapper: {
            reminder:
              "Rubric loaded. You can now safely call propose_task, propose_capability, propose_theme, submit_acceptance_grades, link_pr.",
          },
        },
      });
    }
    case "get_roadmap_snapshot": {
      // The snapshot returns themes + active caps + open tasks in a
      // single response, so the agent has effectively enumerated both
      // catalogues. Satisfies BOTH discovery gates.
      const ts = Date.now();
      session.themesListedAt = ts;
      session.capsDiscoveredAt = ts;
      return withReminder(
        "get_roadmap_snapshot",
        projected,
        getRoadmapSnapshot(projected, wsId, args?.includeArchived === true)
      );
    }
    case "propose_task":
      return proposeTask(args, projected, wsId);
    case "propose_theme":
      return proposeTheme(args, projected, wsId);
    case "propose_capability":
      return proposeCapability(args, projected, wsId);
    case "submit_acceptance_grades":
      return submitAcceptanceGrades(args, projected, wsId);
    case "suggest_capability_for":
      // Counts as cap discovery — the agent has explicitly asked
      // the server to rank existing caps for fit against the work
      // they're about to propose.
      // NOTE(review): the timestamp is stamped before the handler
      // runs, so a call the handler rejects (e.g. empty description)
      // still satisfies the gate — confirm that is intended.
      session.capsDiscoveredAt = Date.now();
      return suggestCapabilityFor(args, projected);
    case "suggest_theme_for":
      // Mirror of the cap case: satisfies the propose_theme gate,
      // because the agent has explicitly asked the server to rank
      // existing themes for fit.
      // NOTE(review): same ordering caveat as suggest_capability_for.
      session.themesListedAt = Date.now();
      return suggestThemeFor(args, projected);
    case "link_pr":
      // NOTE(review): `seed` is not declared anywhere in this
      // function; it has to resolve to a module-level binding.
      // Confirm one exists, otherwise this case throws a
      // ReferenceError.
      return linkPR(args, projected, seed, wsId);
    case "archive_task":
      return archiveLifecycle("task", "archive", args, wsId);
    case "archive_capability":
      return archiveLifecycle("capability", "archive", args, wsId);
    case "archive_theme":
      return archiveLifecycle("theme", "archive", args, wsId);
    case "unarchive_task":
      return archiveLifecycle("task", "unarchive", args, wsId);
    case "unarchive_capability":
      return archiveLifecycle("capability", "unarchive", args, wsId);
    case "unarchive_theme":
      return archiveLifecycle("theme", "unarchive", args, wsId);
    case "move_task":
      return moveEntity("task", args, wsId);
    case "move_capability":
      return moveEntity("capability", args, wsId);
    case "move_tasks":
      return moveBulk("task", args, wsId);
    case "move_capabilities":
      return moveBulk("capability", args, wsId);
    case "update_task":
      return updateEntity("task", args, wsId, projected);
    case "update_capability":
      return updateEntity("capability", args, wsId, projected);
    case "update_theme":
      return updateEntity("theme", args, wsId, projected);
    case "record_outcome_reading":
      return recordOutcomeReading(args, wsId, projected);
    case "list_stale_outcomes":
      return listStaleOutcomes(args, projected);
    default:
      return errorResult(`Unknown tool: ${name}`);
  }
}
1686
+
1687
/**
 * Create a new agent-authored task under an existing capability.
 *
 * Validates the payload, builds the task record (status "planned",
 * progress 0, start = today, target derived from the effort estimate),
 * then either echoes it back (dryRun) or persists it through the
 * `propose_task` RPC.
 *
 * @param {object} [args] Tool arguments: capabilityId, title, summary,
 *   effort, priority, kind, owner, acceptance, dependsOn, expectedPRs,
 *   expectedScope, dryRun, idempotencyKey. A missing payload is treated
 *   as `{}` and therefore fails validation instead of throwing.
 * @param {{capabilities: Array<object>}} projected Projected roadmap;
 *   only `capabilities` is read here.
 * @param {string} wsId Workspace id forwarded to the RPC.
 * @returns {Promise<object>} A textResult carrying a JSON summary, or
 *   an errorResult describing the first validation / RPC failure.
 */
async function proposeTask(args, projected, wsId) {
  // MCP clients may omit `arguments` entirely; normalise so the caller
  // gets a validation error rather than a TypeError.
  const input = args ?? {};

  const cap = projected.capabilities.find((c) => c.id === input.capabilityId);
  if (!cap) return errorResult(`Capability ${input.capabilityId} not found.`);
  const titleErr = validateName(input.title, 5);
  if (titleErr) return errorResult(titleErr);
  if (input.effort && !VALID_EFFORTS.has(input.effort))
    return errorResult(`Invalid effort ${input.effort}.`);
  if (input.priority && !VALID_PRIORITIES.has(input.priority))
    return errorResult(`Invalid priority ${input.priority}.`);
  if (input.kind && !VALID_KINDS.has(input.kind))
    return errorResult(`Invalid kind ${input.kind}.`);
  // Number.isFinite rejects non-numbers as well as NaN / ±Infinity,
  // none of which make sense as an advisory ceiling.
  if (
    input.expectedPRs !== undefined &&
    (!Number.isFinite(input.expectedPRs) || input.expectedPRs <= 0)
  )
    return errorResult(`expectedPRs must be a positive number, got ${input.expectedPRs}.`);
  if (
    input.expectedScope !== undefined &&
    (!Number.isFinite(input.expectedScope) || input.expectedScope <= 0)
  )
    return errorResult(`expectedScope must be a positive number, got ${input.expectedScope}.`);

  const effort = input.effort ?? "M";
  const start = todayISO();
  // Target dates are day-resolution; round up so sub-day estimates
  // (XS=0.25, S=0.5) still nudge the target at least one day out.
  const target = addDays(start, Math.max(1, Math.ceil(EFFORT_DAYS[effort])));
  const id = randomTaskId();
  const task = {
    id,
    capabilityId: cap.id,
    title: cleanText(input.title),
    summary: cleanText(input.summary),
    status: "planned",
    priority: input.priority ?? "P2",
    effort,
    kind: input.kind ?? "feature",
    start,
    target,
    originalTarget: target,
    progress: 0,
    // Anything that is not a string is treated as "no owner".
    owner: typeof input.owner === "string" ? input.owner.trim() : "",
    team: cap.team ?? "",
    tags: [],
    prs: [],
    links: {},
    acceptance: input.acceptance ?? [],
    dependsOn: input.dependsOn ?? [],
    authorKind: "agent",
    // Advisory scope ceiling — left unset by default ("default-then-
    // observe" pattern). Authors who want to declare an envelope can
    // pass expectedPRs / expectedScope at propose-time, or set them
    // later via update_task.
    ...(input.expectedPRs !== undefined ? { expectedPRs: input.expectedPRs } : {}),
    ...(input.expectedScope !== undefined ? { expectedScope: input.expectedScope } : {}),
  };

  if (input.dryRun) {
    return textResult(
      JSON.stringify(
        {
          ok: true,
          dryRun: true,
          wouldCreate: task,
          warnings: [],
          message: `Would create task ${id} under ${cap.id} (${cap.name}). No record written.`,
        },
        null,
        2
      )
    );
  }

  let rpcResult;
  try {
    // RPC does an idempotency check + append inside a row lock.
    // Concurrent retries with the same key collapse to one task;
    // concurrent calls without a key both insert distinct tasks.
    rpcResult = await rpcCall("propose_task", {
      p_workspace_id: wsId,
      p_task: task,
      p_idempotency_key: input.idempotencyKey ?? null,
    });
  } catch (e) {
    return errorResult(e.message);
  }

  // RPC returns { task, idempotent }. When idempotent=true, an
  // earlier call with the same idempotencyKey already created the
  // task — surface that instead of pretending we just made a new one.
  const stored = rpcResult?.task ?? task;
  const idempotent = rpcResult?.idempotent === true;

  return textResult(
    JSON.stringify(
      {
        ok: true,
        id: stored.id,
        capabilityId: stored.capabilityId,
        idempotent,
        message: idempotent
          ? `Task ${stored.id} already exists with idempotencyKey ${input.idempotencyKey}; returning existing task instead of creating a duplicate.`
          : `Created ${stored.id} under ${cap.id} (${cap.name}). status=planned, authorKind=agent.`,
      },
      null,
      2
    )
  );
}
1797
+
1798
/**
 * Create a new theme (a root entity — it has no parent).
 *
 * Validates the name, builds the theme record, then either echoes it
 * back (dryRun) or persists it through the `propose_theme` RPC.
 *
 * @param {object} [args] Tool arguments: name, description, color,
 *   targetRoi, dryRun, idempotencyKey. A missing payload is treated as
 *   `{}` and therefore fails name validation instead of throwing.
 * @param {object} _projected Unused — themes carry no parent to look up.
 * @param {string} wsId Workspace id forwarded to the RPC.
 * @returns {Promise<object>} A textResult carrying a JSON summary, or
 *   an errorResult describing the validation / RPC failure.
 */
async function proposeTheme(args, _projected /* unused — themes carry no parent */, wsId) {
  // MCP clients may omit `arguments` entirely; normalise so the caller
  // gets a validation error rather than a TypeError.
  const input = args ?? {};

  const nameErr = validateName(input.name, 6);
  if (nameErr) return errorResult(nameErr);

  const name = cleanText(input.name);
  const id = randomThemeId();
  const theme = {
    id,
    name,
    description: cleanText(input.description),
    color: input.color || "#6366f1", // brand-indigo default; user can change
    ...(typeof input.targetRoi === "number" ? { targetRoi: input.targetRoi } : {}),
  };

  if (input.dryRun) {
    return textResult(
      JSON.stringify(
        {
          ok: true,
          dryRun: true,
          wouldCreate: theme,
          warnings: [],
          message: `Would create theme ${id} (${name}). No record written.`,
        },
        null,
        2
      )
    );
  }

  let rpcResult;
  try {
    rpcResult = await rpcCall("propose_theme", {
      p_workspace_id: wsId,
      p_theme: theme,
      p_idempotency_key: input.idempotencyKey ?? null,
    });
  } catch (e) {
    return errorResult(e.message);
  }
  // Prefer the record the RPC returns; fall back to the local draft.
  const stored = rpcResult?.theme ?? theme;
  const idempotent = rpcResult?.idempotent === true;
  return textResult(
    JSON.stringify(
      {
        ok: true,
        id: stored.id,
        idempotent,
        message: idempotent
          ? `Theme ${stored.id} already exists with idempotencyKey ${input.idempotencyKey}; returning existing instead of duplicating.`
          : `Created theme ${stored.id} (${stored.name}).`,
      },
      null,
      2
    )
  );
}
1855
+
1856
/**
 * Create a new capability under an existing theme.
 *
 * Validates name, parent theme, impact, confidence, reach and outcome
 * (in that order), builds the capability record with defaults
 * (reach 100, impact 1, confidence 70), then either echoes it back
 * (dryRun) or persists it through the `propose_capability` RPC.
 *
 * @param {object} [args] Tool arguments: name, pillarId, description,
 *   outcome, reach, impact, confidence, roi, specRef, dryRun,
 *   idempotencyKey. A missing payload is treated as `{}` and therefore
 *   fails validation instead of throwing.
 * @param {{themes: Array<object>}} projected Projected roadmap; only
 *   `themes` is read here.
 * @param {string} wsId Workspace id forwarded to the RPC.
 * @returns {Promise<object>} A textResult carrying a JSON summary
 *   (including soft warnings), or an errorResult.
 */
async function proposeCapability(args, projected, wsId) {
  // MCP clients may omit `arguments` entirely; normalise so the caller
  // gets a validation error rather than a TypeError.
  const input = args ?? {};
  // A non-string pillarId is treated the same as a missing one.
  const pillarId = typeof input.pillarId === "string" ? input.pillarId.trim() : "";

  // Validation order: cheap structural checks first, then rubric.
  const nameErr = validateName(input.name, 8);
  if (nameErr) return errorResult(nameErr);
  if (!pillarId) return errorResult("pillarId is required.");

  // pillarId must name a theme in the projected view handed to us.
  // The RPC catches the "deleted in this workspace mid-session" case too.
  const theme = projected.themes.find((t) => t.id === pillarId);
  if (!theme) {
    return errorResult(
      `pillarId ${pillarId} doesn't match any known theme. Call list_themes first.`
    );
  }
  if (typeof input.impact === "number" && !VALID_IMPACTS.has(input.impact)) {
    return errorResult(
      `Invalid impact ${input.impact} — must be one of 3, 2, 1, 0.5, 0.25.`
    );
  }
  const confidenceErr = validateConfidence(input.confidence);
  if (confidenceErr) return errorResult(confidenceErr);
  if (typeof input.reach === "number" && input.reach < 0) {
    return errorResult(`Invalid reach ${input.reach} — must be >= 0.`);
  }
  // Outcome is required + must be falsifiable per the AGENTS.md rubric.
  const outcomeErr = validateOutcome(input.outcome);
  if (outcomeErr) return errorResult(outcomeErr);

  const id = randomCapabilityId();
  const capability = {
    id,
    name: cleanText(input.name),
    pillarId,
    description: cleanText(input.description),
    outcome: cleanText(input.outcome),
    reach: typeof input.reach === "number" ? input.reach : 100,
    impact: typeof input.impact === "number" ? input.impact : 1,
    confidence: typeof input.confidence === "number" ? input.confidence : 70,
    ...(typeof input.roi === "number" ? { roi: input.roi } : {}),
    ...(input.specRef ? { specRef: input.specRef } : {}),
  };

  // Soft warnings — surface, don't reject.
  const warnings = [];
  const roiWarn = warnRoiVsTheme(capability.roi, theme);
  if (roiWarn) warnings.push(roiWarn);

  if (input.dryRun) {
    return textResult(
      JSON.stringify(
        {
          ok: true,
          dryRun: true,
          wouldCreate: capability,
          warnings,
          message: `Would create capability ${id} (${capability.name}) under ${theme.id} (${theme.name}). No record written.`,
        },
        null,
        2
      )
    );
  }

  let rpcResult;
  try {
    rpcResult = await rpcCall("propose_capability", {
      p_workspace_id: wsId,
      p_capability: capability,
      p_idempotency_key: input.idempotencyKey ?? null,
    });
  } catch (e) {
    return errorResult(e.message);
  }
  // Prefer the record the RPC returns; fall back to the local draft.
  const stored = rpcResult?.capability ?? capability;
  const idempotent = rpcResult?.idempotent === true;
  return textResult(
    JSON.stringify(
      {
        ok: true,
        id: stored.id,
        pillarId: stored.pillarId,
        idempotent,
        warnings,
        message: idempotent
          ? `Capability ${stored.id} already exists with idempotencyKey ${input.idempotencyKey}; returning existing instead of duplicating.`
          : `Created capability ${stored.id} (${stored.name}) under ${stored.pillarId}.`,
      },
      null,
      2
    )
  );
}
1951
+
1952
/**
 * Build the one-shot orientation payload for an agent starting cold:
 * themes, capabilities that are not yet delivered, and tasks whose
 * status is in_progress or planned, all in a single response. The
 * payload is computed from `projected` on every call.
 *
 * The resolved workspaceId is echoed in the payload so the agent can
 * thread it back through later write tools (`propose_task` /
 * `propose_capability` / `propose_theme`) when one MCP install serves
 * several workspaces.
 *
 * @param {{themes: Array<object>, capabilities: Array<object>, tasks: Array<object>}} projected
 *   Projected roadmap.
 * @param {string} wsId Resolved workspace id, echoed back verbatim.
 * @param {boolean} [includeArchived=false] When true, archived rows
 *   are kept (useful for reviewing historical state); otherwise they
 *   are dropped from all three lists.
 * @returns {object} textResult wrapping the pretty-printed JSON payload.
 */
function getRoadmapSnapshot(projected, wsId, includeArchived = false) {
  // Archived rows are out of planning scope unless explicitly requested.
  const isVisible = (entity) => includeArchived || !entity.archived;
  const isUndelivered = (cap) =>
    effectiveCapabilityStatus(cap, projected.tasks) !== "delivered";
  const isInFlight = ({ status }) =>
    status === "in_progress" || status === "planned";

  const themes = projected.themes.filter(isVisible);
  const activeCapabilities = projected.capabilities.filter(
    (cap) => isVisible(cap) && isUndelivered(cap)
  );
  const inFlightTasks = projected.tasks.filter(
    (task) => isVisible(task) && isInFlight(task)
  );

  const payload = {
    workspaceId: wsId,
    generatedAt: new Date().toISOString(),
    themes,
    capabilities: activeCapabilities,
    tasks: inFlightTasks,
    counts: {
      themes: themes.length,
      activeCapabilities: activeCapabilities.length,
      inFlightTasks: inFlightTasks.length,
      // Totals are taken over the unfiltered projection.
      totalCapabilities: projected.capabilities.length,
      totalTasks: projected.tasks.length,
    },
  };
  return textResult(JSON.stringify(payload, null, 2));
}
2004
+
2005
/**
 * Rank existing capabilities by token overlap with a free-text
 * description of the work an agent is about to propose.
 *
 * Candidates are capabilities that are neither archived nor delivered;
 * each is scored with jaccardScore over tokenized
 * name + description + outcome. Zero-score candidates are dropped, the
 * rest are sorted best-first and truncated to `limit`.
 *
 * @param {object} [args] Tool arguments: `description` (required,
 *   non-empty string) and `limit` (clamped to 1..25, default 5). A
 *   missing payload or non-string description yields an errorResult.
 * @param {{capabilities: Array<object>, tasks: Array<object>}} projected
 *   Projected roadmap.
 * @returns {object} textResult with { ok, query, matches, hint } and,
 *   when the top score is below 0.4, a `_meta.roadmapper.reminder`;
 *   errorResult when the description is missing.
 */
function suggestCapabilityFor(args, projected) {
  // MCP clients may omit `arguments`; a non-string description is
  // treated the same as a missing one rather than throwing on .trim().
  const input = args ?? {};
  const desc = typeof input.description === "string" ? input.description.trim() : "";
  if (!desc) return errorResult("description is required.");
  const limit = Math.min(25, Math.max(1, input.limit ?? 5));

  // Skip delivered capabilities — they're closed bets. A new PR
  // mapping to a delivered cap would either be wrong (work for a
  // different bet) or reopen-the-bet (which the user should do
  // explicitly, not as a side effect of agent triage).
  // Archived capabilities are skipped for the same reason archived
  // themes are skipped in suggestThemeFor: retired rows should not be
  // recommended as a home for new work.
  const activeCaps = projected.capabilities.filter(
    (c) =>
      !c.archived &&
      effectiveCapabilityStatus(c, projected.tasks) !== "delivered"
  );
  const query = tokenize(desc);
  const ranked = activeCaps
    .map((c) => {
      const hay = tokenize(
        `${c.name} ${c.description ?? ""} ${c.outcome ?? ""}`
      );
      return { capability: c, score: jaccardScore(query, hay) };
    })
    .filter((r) => r.score > 0)
    .sort((a, b) => b.score - a.score)
    .slice(0, limit)
    .map(({ capability, score }) => ({
      id: capability.id,
      name: capability.name,
      pillarId: capability.pillarId,
      outcome: capability.outcome,
      // Rounded for display; thresholds below use the rounded value.
      score: Number(score.toFixed(3)),
    }));

  // Reminder via _meta when nothing strong came back — the model
  // should pause and ask the user before inventing a new capability.
  const topScore = ranked[0]?.score ?? 0;
  const meta =
    topScore < 0.4
      ? {
          _meta: {
            roadmapper: {
              reminder:
                ranked.length === 0
                  ? "No existing capability is a sensible parent. Before calling propose_capability, verify with the user that a brand-new capability is warranted — capabilities are quarterly bets, not single tasks."
                  : "No strong match (top score < 0.4). If none of the listed capabilities fit, ask the user before calling propose_capability — the top match is often closer than it scores.",
            },
          },
        }
      : undefined;

  return textResult(
    JSON.stringify(
      {
        ok: true,
        query: desc,
        matches: ranked,
        hint:
          ranked.length === 0
            ? "No existing capabilities overlap your description. propose_capability is likely the right next step."
            : ranked[0].score > 0.4
              ? `Strong match: ${ranked[0].id} (${ranked[0].name}). Strongly consider attaching tasks here instead of creating a duplicate capability.`
              : `Weak overlap. If none of these fit, propose_capability is reasonable — but read the top match first.`,
      },
      null,
      2
    ),
    meta
  );
}
2072
+
2073
/**
 * Theme-level mirror of suggestCapabilityFor — ranks existing,
 * non-archived themes by token overlap between the description and
 * each theme's name + description.
 *
 * Themes are years-stable so the messaging is more conservative:
 * even a weak match should usually win over creating a new theme.
 * Only an empty or very-low-overlap result + explicit user intent
 * should lead to propose_theme.
 *
 * @param {object} [args] Tool arguments: `description` (required,
 *   non-empty string) and `limit` (clamped to 1..25, default 5). A
 *   missing payload or non-string description yields an errorResult.
 * @param {{themes: Array<object>}} projected Projected roadmap; only
 *   `themes` is read here.
 * @returns {object} textResult with { ok, query, matches, hint } and,
 *   when the top score is below 0.4, a `_meta.roadmapper.reminder`;
 *   errorResult when the description is missing.
 */
function suggestThemeFor(args, projected) {
  // MCP clients may omit `arguments`; a non-string description is
  // treated the same as a missing one rather than throwing on .trim().
  const input = args ?? {};
  const desc = typeof input.description === "string" ? input.description.trim() : "";
  if (!desc) return errorResult("description is required.");
  const limit = Math.min(25, Math.max(1, input.limit ?? 5));

  // Skip archived themes — retired strategic bets shouldn't pull
  // new work into a closed mandate.
  const activeThemes = projected.themes.filter((t) => !t.archived);
  const query = tokenize(desc);
  const ranked = activeThemes
    .map((t) => {
      const hay = tokenize(`${t.name} ${t.description ?? ""}`);
      return { theme: t, score: jaccardScore(query, hay) };
    })
    .filter((r) => r.score > 0)
    .sort((a, b) => b.score - a.score)
    .slice(0, limit)
    .map(({ theme, score }) => ({
      id: theme.id,
      name: theme.name,
      description: theme.description ?? "",
      // Rounded for display; thresholds below use the rounded value.
      score: Number(score.toFixed(3)),
    }));

  // Reminder when nothing matches strongly — theme creation is the
  // years-stable decision, so even a weak match deserves a pause.
  const topScore = ranked[0]?.score ?? 0;
  const meta =
    topScore < 0.4
      ? {
          _meta: {
            roadmapper: {
              reminder:
                ranked.length === 0
                  ? "No existing theme overlaps your description. Themes are years-stable, so creating a new one is a big decision — verify with the user that this represents a genuinely new strategic direction, not a reframing of an existing bet, before calling propose_theme."
                  : "No strong match (top score < 0.4). Re-using a 'close-enough' theme is almost always the right move; ask the user before calling propose_theme.",
            },
          },
        }
      : undefined;

  return textResult(
    JSON.stringify(
      {
        ok: true,
        query: desc,
        matches: ranked,
        hint:
          ranked.length === 0
            ? "No existing theme overlaps. propose_theme MAY be appropriate, but only with explicit user confirmation that a new strategic direction is intended — themes are years-stable, not per-feature."
            : ranked[0].score > 0.4
              ? `Strong match: ${ranked[0].id} (${ranked[0].name}). Attach capabilities under this theme instead of creating a new one.`
              : `Weak overlap. The top match is often closer than it scores; prefer that over creating a new theme unless the user explicitly asks for a new strategic direction.`,
      },
      null,
      2
    ),
    meta
  );
}
2143
+
2144
/**
 * Attach a pull request to a task through the `link_pr` RPC.
 *
 * @param {object} args - Tool arguments: `taskId`, `repo`, `number`, and the
 *   optional `title`, `merged`, `mergedAt`, `authorGithub`, `authorKind`.
 * @param {object} projected - Projected roadmap; `projected.tasks` is searched
 *   for `args.taskId`.
 * @param {object} [seed] - Seed roadmap; the matching seed task's `prs` are
 *   forwarded to the RPC.
 * @param {string} wsId - Workspace id sent as `p_workspace_id`.
 * @returns {Promise<object>} MCP tool result; an error result when validation
 *   fails or the RPC throws.
 */
async function linkPR(args, projected, seed, wsId) {
  const taskId = args?.taskId;
  const task = projected.tasks.find((t) => t.id === taskId);
  if (!task) return errorResult(`Task ${taskId} not found.`);
  if (!args?.repo || !args?.number) {
    return errorResult("repo and number are required.");
  }
  // Same guard the other write handlers use: fail with an actionable message
  // rather than sending an undefined workspace id to the RPC.
  if (!wsId) {
    return errorResult(
      "workspaceId could not be resolved (pass workspaceId arg or set SUPABASE_WORKSPACE_ID)."
    );
  }

  // Build the PR object the way the app expects; optional fields are only
  // included when supplied.
  const pr = {
    repo: args.repo,
    number: args.number,
    ...(args.title ? { title: args.title } : {}),
    ...(typeof args.merged === "boolean" ? { merged: args.merged } : {}),
    ...(args.mergedAt ? { mergedAt: args.mergedAt } : {}),
    ...(args.authorGithub ? { authorGithub: args.authorGithub } : {}),
    ...(args.authorKind ? { authorKind: args.authorKind } : {}),
  };

  // The RPC can't see the seed JSON; if the task is a seed task with no prior
  // PR patches, pass its seed prs so the RPC can union the new PR on top
  // rather than clobber.
  const seedTask = (seed?.tasks ?? []).find((t) => t.id === taskId);
  const seedPrs = seedTask?.prs ?? [];

  let rpcResult;
  try {
    rpcResult = await rpcCall("link_pr", {
      p_workspace_id: wsId,
      p_task_id: taskId,
      p_pr: pr,
      p_seed_prs: seedPrs,
    });
  } catch (e) {
    return errorResult(e.message);
  }

  const idempotent = rpcResult?.idempotent === true;
  return textResult(
    JSON.stringify(
      {
        ok: true,
        taskId,
        pr: `${pr.repo}#${pr.number}`,
        idempotent,
        message: idempotent
          ? `${pr.repo}#${pr.number} was already linked to ${taskId}; no change.`
          : `Attached ${pr.repo}#${pr.number} to ${taskId}.`,
      },
      null,
      2
    )
  );
}
2195
+
2196
/**
 * Shared handler for the archive/unarchive tools. Validates inputs, calls the
 * SQL RPC (`archive_entity` or `unarchive_entity`) and renders its response.
 * Per the original design note, the SQL side owns refuse-with-children,
 * idempotency, parent-active checks and audit attribution; this layer only
 * validates and routes.
 *
 * @param {string} kind - "task" or "capability"; any other value is treated as
 *   a theme (id read from `themeId`).
 * @param {string} action - "archive" selects `archive_entity`; anything else
 *   selects `unarchive_entity`.
 * @param {object} args - Tool arguments: the kind-specific id, `reason`, and
 *   optional `idempotencyKey` / `dryRun`.
 * @param {string} wsId - Workspace id sent as `p_workspace_id`.
 * @returns {Promise<object>} MCP tool result; an error result on validation
 *   failure or when the RPC throws.
 */
async function archiveLifecycle(kind, action, args, wsId) {
  const idArg =
    kind === "task" ? "taskId" : kind === "capability" ? "capabilityId" : "themeId";

  // Non-string values (numbers, objects) count as missing so the caller gets a
  // validation error instead of a "trim is not a function" TypeError.
  const rawId = args?.[idArg];
  const entityId = typeof rawId === "string" ? rawId.trim() : "";
  if (!entityId) return errorResult(`${idArg} is required.`);

  const rawReason = args?.reason;
  const reason = typeof rawReason === "string" ? rawReason.trim() : "";
  if (!reason) return errorResult("reason is required.");

  if (!wsId) {
    return errorResult(
      "workspaceId could not be resolved (pass workspaceId arg or set SUPABASE_WORKSPACE_ID)."
    );
  }

  try {
    const result = await rpcCall(
      action === "archive" ? "archive_entity" : "unarchive_entity",
      {
        p_workspace_id: wsId,
        p_kind: kind,
        p_entity_id: entityId,
        p_reason: reason,
        p_actor_label: "mcp:agent",
        p_idempotency_key: args?.idempotencyKey ?? null,
        p_dry_run: args?.dryRun === true,
      }
    );
    return textResult(JSON.stringify(result, null, 2));
  } catch (e) {
    return errorResult(e.message);
  }
}
2233
+
2234
/**
 * Single-entity move. Validates inputs, calls the `move_entity` RPC and
 * renders its response. Per the original design note, SQL handles the
 * target-active check, idempotency, U5 unarchive-on-move and audit
 * attribution.
 *
 * @param {string} kind - "task" (moved to `newCapabilityId`); any other value
 *   is treated as a capability (moved to `newThemeId`).
 * @param {object} args - Tool arguments: the entity id, the new parent id,
 *   `reason`, and optional `dryRun`.
 * @param {string} wsId - Workspace id sent as `p_workspace_id`.
 * @returns {Promise<object>} MCP tool result; an error result on validation
 *   failure or when the RPC throws.
 */
async function moveEntity(kind, args, wsId) {
  const idArg = kind === "task" ? "taskId" : "capabilityId";
  const parentArg = kind === "task" ? "newCapabilityId" : "newThemeId";

  // Treat non-string values as missing so bad input yields a validation error
  // rather than a TypeError from calling .trim() on a non-string.
  const trimmed = (value) => (typeof value === "string" ? value.trim() : "");

  const entityId = trimmed(args?.[idArg]);
  if (!entityId) return errorResult(`${idArg} is required.`);
  const newParentId = trimmed(args?.[parentArg]);
  if (!newParentId) return errorResult(`${parentArg} is required.`);
  const reason = trimmed(args?.reason);
  if (!reason) return errorResult("reason is required.");

  if (!wsId) {
    return errorResult(
      "workspaceId could not be resolved (pass workspaceId arg or set SUPABASE_WORKSPACE_ID)."
    );
  }

  try {
    const result = await rpcCall("move_entity", {
      p_workspace_id: wsId,
      p_kind: kind,
      p_entity_id: entityId,
      p_new_parent_id: newParentId,
      p_reason: reason,
      p_actor_label: "mcp:agent",
      p_batch_id: null, // single moves are not part of a batch
      p_dry_run: args?.dryRun === true,
    });
    return textResult(JSON.stringify(result, null, 2));
  } catch (e) {
    return errorResult(e.message);
  }
}
2269
+
2270
/**
 * Bulk move. Validates the batch (size, shape), generates one batchId, then
 * issues `move_entity` calls in sequence, stamping each with the shared id.
 * Per-item failures don't roll back earlier successes — the response lists
 * each move's outcome so the caller can retry or surface the failures.
 *
 * @param {string} kind - "task" (items use `taskId` / `newCapabilityId`); any
 *   other value is treated as a capability (`capabilityId` / `newThemeId`).
 * @param {object} args - Tool arguments: `moves` (1–100 items), `reason`, and
 *   optional `dryRun`.
 * @param {string} wsId - Workspace id sent as `p_workspace_id`.
 * @returns {Promise<object>} MCP tool result whose JSON body carries `ok`,
 *   `batchId`, `total`, `okCount`, `failCount`, `dryRun` and `results`.
 */
async function moveBulk(kind, args, wsId) {
  const idArg = kind === "task" ? "taskId" : "capabilityId";
  const parentArg = kind === "task" ? "newCapabilityId" : "newThemeId";
  const moves = args?.moves;
  if (!Array.isArray(moves) || moves.length === 0) {
    return errorResult("moves must be a non-empty array.");
  }
  if (moves.length > 100) {
    return errorResult(`bulk move cap is 100 (got ${moves.length}).`);
  }
  // A non-string reason counts as missing (no TypeError from .trim()).
  const reason = typeof args?.reason === "string" ? args.reason.trim() : "";
  if (!reason) return errorResult("reason is required.");
  if (!wsId) {
    return errorResult(
      "workspaceId could not be resolved (pass workspaceId arg or set SUPABASE_WORKSPACE_ID)."
    );
  }

  // Validate every item up-front so we don't half-apply a batch that's
  // structurally broken — the caller probably wants to fix the payload, not
  // see 7 successes and 3 "missing field" errors.
  for (let i = 0; i < moves.length; i++) {
    const m = moves[i];
    if (!m || typeof m !== "object") {
      return errorResult(`moves[${i}] must be an object.`);
    }
    if (typeof m[idArg] !== "string" || !m[idArg].trim()) {
      return errorResult(`moves[${i}].${idArg} is required.`);
    }
    if (typeof m[parentArg] !== "string" || !m[parentArg].trim()) {
      return errorResult(`moves[${i}].${parentArg} is required.`);
    }
  }

  // Batch id only groups the resulting rows; it is not a security token.
  const batchId = `batch-${Date.now().toString(36)}-${Math.random().toString(36).slice(2, 8)}`;
  const dryRun = args?.dryRun === true;
  const results = [];
  // Sequential on purpose: moves are issued one at a time, in input order.
  for (let i = 0; i < moves.length; i++) {
    const entityId = moves[i][idArg].trim();
    const newParentId = moves[i][parentArg].trim();
    try {
      const out = await rpcCall("move_entity", {
        p_workspace_id: wsId,
        p_kind: kind,
        p_entity_id: entityId,
        p_new_parent_id: newParentId,
        p_reason: reason,
        p_actor_label: "mcp:agent",
        p_batch_id: batchId,
        p_dry_run: dryRun,
      });
      results.push({ index: i, ...out });
    } catch (e) {
      // Report the same (trimmed) id that was sent to the RPC.
      results.push({ index: i, ok: false, error: e.message, entityId });
    }
  }

  const okCount = results.filter((r) => r.ok).length;
  const failCount = results.length - okCount;
  return textResult(
    JSON.stringify(
      { ok: failCount === 0, batchId, total: results.length, okCount, failCount, dryRun, results },
      null,
      2
    )
  );
}
2340
+
2341
/**
 * Per-kind field validators for update_*. Enums and ranges live here (UP3 in
 * the spec) so the SQL function can stay structural.
 *
 * Numeric fields must be finite: `NaN` and `±Infinity` satisfy plain `<` / `>`
 * range comparisons in surprising ways (`NaN < 0` and `NaN > 100` are both
 * false) and `JSON.stringify` would serialize them as `null`, so they are
 * rejected explicitly.
 *
 * @param {string} kind - "task", "capability" or "theme"; other kinds yield no
 *   errors.
 * @param {object} patch - Fields to validate; only keys that are present
 *   (not `undefined`) are checked.
 * @returns {string[]} Error messages; an empty array means the patch is valid.
 */
function validateUpdatePatch(kind, patch) {
  const errors = [];
  const isFiniteNumber = (value) => typeof value === "number" && Number.isFinite(value);

  if (kind === "task") {
    if (patch.title !== undefined) {
      const e = validateName(patch.title, 5);
      if (e) errors.push(e);
    }
    if (patch.status !== undefined && !VALID_STATUSES.has(patch.status)) {
      errors.push(`invalid status: ${patch.status}`);
    }
    if (patch.priority !== undefined && !VALID_PRIORITIES.has(patch.priority)) {
      errors.push(`invalid priority: ${patch.priority}`);
    }
    if (patch.effort !== undefined && !VALID_EFFORTS.has(patch.effort)) {
      errors.push(`invalid effort: ${patch.effort}`);
    }
    if (patch.kind !== undefined && !VALID_KINDS.has(patch.kind)) {
      errors.push(`invalid kind: ${patch.kind}`);
    }
    if (
      patch.progress !== undefined &&
      (!isFiniteNumber(patch.progress) || patch.progress < 0 || patch.progress > 100)
    ) {
      errors.push(`progress must be 0–100, got ${patch.progress}.`);
    }
    if (
      patch.expectedPRs !== undefined &&
      (!isFiniteNumber(patch.expectedPRs) || patch.expectedPRs <= 0)
    ) {
      errors.push(`expectedPRs must be a positive number, got ${patch.expectedPRs}.`);
    }
    if (
      patch.expectedScope !== undefined &&
      (!isFiniteNumber(patch.expectedScope) || patch.expectedScope <= 0)
    ) {
      errors.push(`expectedScope must be a positive number, got ${patch.expectedScope}.`);
    }
  } else if (kind === "capability") {
    if (patch.name !== undefined) {
      const e = validateName(patch.name, 8);
      if (e) errors.push(e);
    }
    if (patch.outcome !== undefined) {
      const e = validateOutcome(patch.outcome);
      if (e) errors.push(e);
    }
    if (patch.confidence !== undefined) {
      const e = validateConfidence(patch.confidence);
      if (e) errors.push(e);
    }
    if (patch.impact !== undefined) {
      if (typeof patch.impact !== "number" || !VALID_IMPACTS.has(patch.impact)) {
        errors.push(`invalid impact: ${patch.impact} (must be 0.25, 0.5, 1, 2, or 3).`);
      }
    }
  } else if (kind === "theme") {
    if (patch.name !== undefined) {
      const e = validateName(patch.name, 5);
      if (e) errors.push(e);
    }
  }
  return errors;
}
2409
+
2410
/**
 * Structural equality for JSON-shaped values, used to diff a patch against
 * the current entity.
 *
 * - `null` and `undefined` are treated as equal to each other.
 * - Primitives compare with `===`.
 * - Arrays compare element-by-element, in order (tags/dependsOn order is
 *   meaningful).
 * - Plain objects compare by own enumerable keys, regardless of key order.
 *
 * @param {*} a - First value.
 * @param {*} b - Second value.
 * @returns {boolean} True when the two values are structurally equal.
 */
function jsonEqual(a, b) {
  if (a === b) return true;

  // Either side nullish: equal only when both are.
  if (a == null || b == null) return a == null && b == null;

  // Non-identical primitives, or a primitive against an object, never match.
  if (typeof a !== "object" || typeof b !== "object") return false;

  const leftIsList = Array.isArray(a);
  if (leftIsList !== Array.isArray(b)) return false;

  if (leftIsList) {
    if (a.length !== b.length) return false;
    const mismatch = a.some((item, idx) => !jsonEqual(item, b[idx]));
    return !mismatch;
  }

  const leftKeys = Object.keys(a);
  if (leftKeys.length !== Object.keys(b).length) return false;
  for (const key of leftKeys) {
    if (!Object.prototype.hasOwnProperty.call(b, key)) return false;
    if (!jsonEqual(a[key], b[key])) return false;
  }
  return true;
}
2432
+
2433
/**
 * Single-entity update. Validates the patch against per-kind rules (UP3),
 * then diffs against the *projected* entity (seed + edits merged) so
 * seed-resident values participate in idempotency and audit
 * before-snapshots. Only the keys that actually differ are sent to SQL — per
 * the original design note, the MCP layer is the diff authority because SQL
 * only sees the sparse patches row, not the seed overlay.
 *
 * @param {string} kind - "task" or "capability"; any other value is treated as
 *   a theme.
 * @param {object} args - Tool arguments: the kind-specific id, `patch`,
 *   `reason`, and optional `dryRun`.
 * @param {string} wsId - Workspace id sent as `p_workspace_id`.
 * @param {object} projected - Projected roadmap with `tasks`, `capabilities`
 *   and `themes` arrays.
 * @returns {Promise<object>} MCP tool result; `{ idempotent: true }` when the
 *   patch changes nothing, an error result on validation/RPC failure.
 */
async function updateEntity(kind, args, wsId, projected) {
  const idArg = kind === "task" ? "taskId" : kind === "capability" ? "capabilityId" : "themeId";

  // Non-string ids/reasons count as missing so the caller gets a validation
  // error rather than a TypeError from .trim().
  const rawId = args?.[idArg];
  const entityId = typeof rawId === "string" ? rawId.trim() : "";
  if (!entityId) return errorResult(`${idArg} is required.`);

  const patch = args?.patch;
  if (!patch || typeof patch !== "object" || Array.isArray(patch)) {
    return errorResult("patch must be a non-empty object.");
  }
  if (Object.keys(patch).length === 0) {
    return errorResult("patch must include at least one field.");
  }

  const rawReason = args?.reason;
  const reason = typeof rawReason === "string" ? rawReason.trim() : "";
  if (!reason) return errorResult("reason is required.");

  const validationErrors = validateUpdatePatch(kind, patch);
  if (validationErrors.length) {
    return errorResult(`Invalid patch: ${validationErrors.join("; ")}`);
  }
  if (!wsId) {
    return errorResult(
      "workspaceId could not be resolved (pass workspaceId arg or set SUPABASE_WORKSPACE_ID)."
    );
  }
  if (!projected) {
    return errorResult("internal: projected view not available.");
  }

  // Look up the entity in the projected view (seed + edits merged).
  const collection =
    kind === "task" ? projected.tasks
      : kind === "capability" ? projected.capabilities
        : projected.themes;
  const current = collection.find((e) => e.id === entityId);
  if (!current) {
    return errorResult(`${kind} ${entityId} not found in workspace.`);
  }

  // Decode HTML entities on user-facing text fields before diffing. Without
  // this, an agent that re-sends `Sandbox &amp; Test Mode` to "fix" a
  // previously-encoded value would land another &amp; and look idempotent
  // against the encoded current value. Decode first so the comparison and the
  // persisted value are both clean.
  const TEXT_FIELDS = new Set([
    "title", "summary", "name", "description", "outcome", "hypothesis",
    "owner", "team", "note",
  ]);
  const cleanedPatch = {};
  for (const [k, v] of Object.entries(patch)) {
    cleanedPatch[k] = TEXT_FIELDS.has(k) && typeof v === "string" ? cleanText(v) : v;
  }

  // Compute the effective patch and its before-snapshot. Per the original
  // note, SQL writes the before-snapshot verbatim into the audit row — that's
  // the reason the diff happens here, where the seed is overlaid.
  const effectivePatch = {};
  const beforeSnapshot = {};
  for (const [k, v] of Object.entries(cleanedPatch)) {
    if (!jsonEqual(current[k], v)) {
      effectivePatch[k] = v;
      beforeSnapshot[k] = current[k] ?? null;
    }
  }

  // Nothing differs: report an idempotent no-op without touching the RPC.
  if (Object.keys(effectivePatch).length === 0) {
    return textResult(
      JSON.stringify(
        {
          ok: true,
          entityId,
          kind,
          idempotent: true,
          dryRun: args?.dryRun === true,
        },
        null,
        2
      )
    );
  }

  try {
    const result = await rpcCall("update_entity", {
      p_workspace_id: wsId,
      p_kind: kind,
      p_entity_id: entityId,
      p_patch: effectivePatch,
      p_before: beforeSnapshot,
      p_reason: reason,
      p_actor_label: "mcp:agent",
      p_dry_run: args?.dryRun === true,
    });
    return textResult(JSON.stringify(result, null, 2));
  } catch (e) {
    return errorResult(e.message);
  }
}
2535
+
2536
/**
 * Append a metric reading to a capability's outcomeReadings array via the
 * `record_outcome_reading` RPC. Per the original design note, the server
 * takes a row lock so concurrent writers (script + human) union safely
 * instead of clobbering.
 *
 * @param {object} args - Tool arguments: `capabilityId`, numeric `value`,
 *   `asOf`, `source`, and optional string `note`.
 * @param {string} wsId - Workspace id sent as `p_workspace_id`.
 * @param {object} [projected] - Projected roadmap; when supplied, the
 *   capability must exist in `projected.capabilities`.
 * @returns {Promise<object>} MCP tool result; an error result on validation
 *   failure or when the RPC throws.
 */
async function recordOutcomeReading(args, wsId, projected) {
  // Non-string inputs count as missing rather than crashing on .trim().
  const trimmed = (value) => (typeof value === "string" ? value.trim() : "");

  const capabilityId = trimmed(args?.capabilityId);
  if (!capabilityId) return errorResult("capabilityId is required.");
  if (typeof args?.value !== "number" || !Number.isFinite(args.value)) {
    return errorResult("value is required and must be a finite number.");
  }
  const asOf = trimmed(args?.asOf);
  if (!asOf) return errorResult("asOf is required (ISO date or timestamp).");
  // The stale-outcome reader drops readings whose asOf Date.parse cannot
  // handle, so a reading stored with such a value could never count as a
  // measurement. Reject it at write time instead of storing dead data.
  if (!Number.isFinite(Date.parse(asOf))) {
    return errorResult(`asOf must be a parseable ISO date or timestamp, got "${asOf}".`);
  }
  const source = trimmed(args?.source);
  if (!source) return errorResult("source is required.");
  if (args?.note !== undefined && args?.note !== null && typeof args.note !== "string") {
    return errorResult("note must be a string when supplied.");
  }
  if (!wsId) {
    return errorResult(
      "workspaceId could not be resolved (pass workspaceId arg or set SUPABASE_WORKSPACE_ID)."
    );
  }
  // Existence check — per the original note, the RPC will happily write a
  // patch entry against any capabilityId, which would orphan the reading
  // invisibly. Refuse here so phantom readings never land.
  if (projected) {
    const exists = projected.capabilities.some((c) => c.id === capabilityId);
    if (!exists) return errorResult(`capability ${capabilityId} not found in workspace.`);
  }
  try {
    const result = await rpcCall("record_outcome_reading", {
      p_workspace_id: wsId,
      p_capability_id: capabilityId,
      p_value: args.value,
      p_as_of: asOf,
      p_source: source,
      p_note: args.note ?? null,
      p_actor_label: "mcp:agent",
    });
    return textResult(JSON.stringify(result, null, 2));
  } catch (e) {
    return errorResult(e.message);
  }
}
2582
+
2583
/**
 * Read tool — report capabilities whose declared outcome has no recent
 * reading. Works purely on the projected view; no RPC is issued because the
 * readings are already attached to each capability.
 *
 * @param {object} [args] - Optional `thresholdDays` (number, default 14) and
 *   `includeArchived` (only `true` includes archived capabilities).
 * @param {object} projected - Projected roadmap; `projected.capabilities` is
 *   scanned.
 * @returns {object} MCP tool result whose JSON body is
 *   `{ thresholdDays, count, stale }`, never-measured capabilities first,
 *   then stalest first.
 */
function listStaleOutcomes(args, projected) {
  const DAY_MS = 24 * 60 * 60 * 1000;
  const thresholdDays = typeof args?.thresholdDays === "number" ? args.thresholdDays : 14;
  const includeArchived = args?.includeArchived === true;
  const nowMs = Date.now();
  const thresholdMs = thresholdDays * 24 * 60 * 60 * 1000;

  // A reading is usable only when its asOf is a string Date.parse understands.
  // Unparseable ones can't anchor a staleness calculation, and if they took
  // part in the "latest" selection they could mask a genuinely stale
  // capability.
  const isUsable = (reading) =>
    reading && typeof reading.asOf === "string" && Number.isFinite(Date.parse(reading.asOf));

  const stale = [];
  for (const cap of projected.capabilities) {
    const skippedAsArchived = cap.archived && !includeArchived;
    if (skippedAsArchived) continue;
    // Only capabilities that declared an outcome can be stale-checked.
    if (!cap.outcome || cap.outcome.trim().length === 0) continue;

    const rawReadings = Array.isArray(cap.outcomeReadings) ? cap.outcomeReadings : [];
    const readings = rawReadings.filter(isUsable);

    // Most recent usable reading; on a tie the earlier array entry wins.
    let latest = null;
    for (const reading of readings) {
      if (latest === null || Date.parse(latest.asOf) < Date.parse(reading.asOf)) {
        latest = reading;
      }
    }

    let daysSince = null;
    if (latest) {
      daysSince = Math.floor((nowMs - Date.parse(latest.asOf)) / DAY_MS);
    }

    const fresh =
      latest !== null &&
      !(typeof daysSince === "number" && daysSince * 24 * 60 * 60 * 1000 > thresholdMs);
    if (fresh) continue;

    stale.push({
      id: cap.id,
      name: cap.name,
      outcome: cap.outcome,
      daysSinceLastReading: daysSince,
      latestReading: latest,
      readingCount: rawReadings.length,
      // Omitted from the JSON (undefined) when every reading was usable.
      malformedReadingCount: rawReadings.length - readings.length || undefined,
    });
  }

  // Never-measured entries lead; the rest are ordered stalest first.
  stale.sort((left, right) => {
    const leftNever = left.daysSinceLastReading == null;
    const rightNever = right.daysSinceLastReading == null;
    if (leftNever !== rightNever) return leftNever ? -1 : 1;
    return (right.daysSinceLastReading ?? 0) - (left.daysSinceLastReading ?? 0);
  });

  const body = { thresholdDays, count: stale.length, stale };
  return textResult(JSON.stringify(body, null, 2));
}
2641
+
2642
/**
 * Record acceptance grades for a task through the `grade_acceptance` RPC.
 *
 * Every grade's `index` must be an integer in `[0, criteria count)`. The
 * previous check only enforced the upper bound, so negative, fractional or
 * non-numeric indices slipped through to the RPC.
 *
 * @param {object} args - Tool arguments: `taskId` and `grades`, an array of
 *   `{ index, status, note? }`.
 * @param {object} projected - Projected roadmap; `projected.tasks` is searched
 *   for the task and its `acceptance` list.
 * @param {string} wsId - Workspace id sent as `p_workspace_id`.
 * @returns {Promise<object>} MCP tool result; an error result on validation
 *   failure or when the RPC throws.
 */
async function submitAcceptanceGrades(args, projected, wsId) {
  const task = projected.tasks.find((t) => t.id === args?.taskId);
  if (!task) return errorResult(`Task ${args?.taskId} not found.`);
  const max = (task.acceptance ?? []).length;
  if (max === 0) {
    return errorResult(
      `Task ${task.id} has no acceptance criteria to grade. Add some first.`
    );
  }

  const grades = args.grades;
  if (!Array.isArray(grades)) {
    return errorResult("grades must be an array.");
  }
  for (const g of grades) {
    const index = g?.index;
    if (!Number.isInteger(index) || index < 0 || index >= max) {
      return errorResult(
        `Grade index ${index} is out of range (task has ${max} criteria).`
      );
    }
  }

  const today = todayISO();
  const payload = grades.map((g) => ({
    index: g.index,
    status: g.status,
    gradedAt: today,
    gradedBy: "mcp:agent",
    ...(g.note ? { note: g.note } : {}),
  }));

  try {
    // Per the original note: the RPC takes a row lock, re-reads existing
    // grades under the lock, merges these indices on top and writes back, so
    // concurrent graders for the same task queue cleanly — no clobber.
    await rpcCall("grade_acceptance", {
      p_workspace_id: wsId,
      p_task_id: task.id,
      p_grades: payload,
    });
  } catch (e) {
    return errorResult(e.message);
  }

  return textResult(
    JSON.stringify(
      {
        ok: true,
        taskId: task.id,
        graded: grades.length,
        of: max,
      },
      null,
      2
    )
  );
}
2692
+
2693
/**
 * Wrap a string in the standard MCP tool-result envelope.
 *
 * @param {string} text - Text placed in the single `content` entry.
 * @param {object} [extra] - Optional fields spread on top of the envelope,
 *   e.g. `_meta` (used here for system-reminder nudges) or
 *   `structuredContent`.
 * @returns {object} `{ content: [{ type: "text", text }], ...extra }`.
 */
function textResult(text, extra) {
  const envelope = { content: [{ type: "text", text }] };
  if (extra == null) return envelope;
  return { ...envelope, ...extra };
}
2702
/**
 * Build an MCP tool result flagged as an error.
 *
 * @param {string} message - Text shown to the caller.
 * @returns {object} `{ content: [{ type: "text", text: message }], isError: true }`.
 */
function errorResult(message) {
  const content = [{ type: "text", text: message }];
  return { content, isError: true };
}
2705
+
2706
/**
 * Build the nudge text attached via _meta on certain read results.
 *
 * Two nudges exist:
 *  - fetch the rubric first, when `session.rubricFetchedAt` is still null and
 *    the tool is one of the orienting reads;
 *  - delivered tasks that have a merged PR but no acceptance grades.
 *
 * @param {string} toolName - Name of the tool whose result is being built.
 * @param {object} projected - Projected roadmap; `projected.tasks` is scanned
 *   for ungraded deliveries.
 * @returns {string|null} The reminders joined by a space, or null when no
 *   nudge applies (keeps the common-case response clean).
 */
function buildReminder(toolName, projected) {
  const reminders = [];

  // If the rubric hasn't been fetched and the agent is reading list_* /
  // get_*, they're orienting and about to plan — get the rubric in.
  const isOrientingRead =
    toolName === "list_capabilities" ||
    toolName === "list_tasks" ||
    toolName === "get_roadmap_snapshot" ||
    toolName === "list_themes";
  if (session.rubricFetchedAt === null && isOrientingRead) {
    reminders.push(
      "Call get_agents_md before any propose_* / submit_acceptance_grades / link_pr call — those tools refuse without it."
    );
  }

  // Tasks with merged PRs but no acceptance grades = ungraded deliveries.
  if (toolName === "list_tasks" || toolName === "get_roadmap_snapshot") {
    const ungraded = projected.tasks.filter((t) => {
      if (t.status !== "delivered") return false;
      const merged = (t.prs ?? []).some((p) => p.merged);
      if (!merged) return false;
      return !t.acceptanceGrades || t.acceptanceGrades.length === 0;
    });
    if (ungraded.length > 0) {
      const single = ungraded.length === 1;
      // Only the first five ids are listed; the remainder is summarized.
      const ids = ungraded.slice(0, 5).map((t) => t.id).join(", ");
      const more = ungraded.length > 5 ? `, +${ungraded.length - 5} more` : "";
      // Verb agrees with the count ("1 … task has", "2 … tasks have").
      reminders.push(
        `${ungraded.length} delivered task${single ? "" : "s"} ` +
          `${single ? "has" : "have"} merged PRs without submitted acceptance grades. ` +
          `Call submit_acceptance_grades for: ${ids}${more}.`
      );
    }
  }

  if (reminders.length === 0) return null;
  return reminders.join(" ");
}
2749
+
2750
/**
 * Attach the reminder produced by buildReminder (if any) to a tool result
 * under `_meta.roadmapper.reminder`, preserving any `_meta` and
 * `_meta.roadmapper` fields the payload already carries.
 *
 * @param {string} toolName - Tool name forwarded to buildReminder.
 * @param {object} projected - Projected roadmap forwarded to buildReminder.
 * @param {object} payload - Tool result to decorate; never mutated.
 * @returns {object} The same payload when there is no reminder, otherwise a
 *   shallow copy with the reminder merged in.
 */
function withReminder(toolName, projected, payload) {
  const reminder = buildReminder(toolName, projected);
  if (!reminder) return payload;

  const existingMeta = payload._meta ?? {};
  const roadmapper = { ...(existingMeta.roadmapper ?? {}), reminder };
  return { ...payload, _meta: { ...existingMeta, roadmapper } };
}
2764
+
2765
+ // ── MCP resources + prompts ───────────────────────────────────────
2766
+ //
2767
+ // resources/* — content the client can pull without the model
2768
+ // deciding to call a tool. Some clients auto-subscribe on connect,
2769
+ // which sidesteps the "agent forgot to fetch the rubric" failure.
2770
+ // prompts/* — parameterized templates the user invokes directly
2771
+ // (e.g. "/roadmapper:plan-feature lp v2"). They orchestrate a flow
2772
+ // without depending on the model's judgment.
2773
+ //
2774
+ // Both arrays are intentionally small. Keeping the surface tight
2775
+ // is part of the contract — agents read all of this on connect.
2776
+
2777
/**
 * Static catalogue of MCP resources this server advertises. Each row is
 * `[uri, name, mimeType, description]` and is expanded into the
 * `{ uri, name, description, mimeType }` descriptor shape.
 */
const RESOURCES = [
  [
    "roadmapper://rubric",
    "Planning rubric (AGENTS.md)",
    "text/markdown",
    "The contract every planner must satisfy: task shape, acceptance criteria format, capability outcome rubric, grading dimensions. Same content as get_agents_md, exposed as a resource so MCP clients that auto-subscribe pull it at connect.",
  ],
  [
    "roadmapper://capabilities/active",
    "Active capabilities (snapshot)",
    "application/json",
    "Live list of non-delivered capabilities for the env-default workspace. Read this before propose_task or propose_capability to find the right parent. Note: MCP resources don't accept arguments, so this always reads SUPABASE_WORKSPACE_ID's workspace — use list_capabilities({ workspaceId }) for cross-workspace reads.",
  ],
  [
    "roadmapper://tasks/open",
    "Open tasks (snapshot)",
    "application/json",
    "Live list of in_progress + planned tasks for the env-default workspace. Same workspaceId caveat as roadmapper://capabilities/active — use list_tasks({ workspaceId }) for cross-workspace reads.",
  ],
].map(([uri, name, mimeType, description]) => ({ uri, name, description, mimeType }));
2800
+
2801
/**
 * Serve an MCP `resources/read` request for one of the advertised URIs.
 *
 * Unknown URIs are rejected before any workspace state is loaded, so a typo
 * neither costs a workspace fetch nor lets a fetch failure mask the real
 * "Unknown resource" error.
 *
 * @param {string} uri - Resource URI requested by the client.
 * @returns {Promise<object>} `{ contents: [{ uri, mimeType, text }] }`.
 * @throws {Error} `Unknown resource: <uri>` for unrecognized URIs.
 */
async function readResource(uri) {
  if (uri === "roadmapper://rubric") {
    // Side-effect: reading the rubric resource counts as fetching it, so
    // mutators aren't blocked even if the agent never called the tool — the
    // contract is "the rubric reached the model", not "this specific call
    // shape ran".
    if (session.rubricFetchedAt === null) {
      session.rubricFetchedAt = Date.now();
      // Pass the snapshot workspace id so the telemetry row is attributed to
      // a workspace; the original note says rows with a NULL workspace are
      // hidden from non-operator viewers (migration 0038).
      recordTelemetry(
        "rubric_fetched",
        { via: "resource" },
        snapshotWorkspaceId() ?? undefined
      );
    }
    return {
      contents: [
        { uri, mimeType: "text/markdown", text: readAgentsMd() },
      ],
    };
  }

  const wantsCapabilities = uri === "roadmapper://capabilities/active";
  const wantsTasks = uri === "roadmapper://tasks/open";
  if (!wantsCapabilities && !wantsTasks) {
    throw new Error(`Unknown resource: ${uri}`);
  }

  // Both snapshot resources project workspace state on every read so the
  // response is always live; falls back to the bare seed projection when no
  // workspace projection is available.
  const projected =
    (await readWorkspaceProjected()) ?? project(readSeed(), {});

  if (wantsCapabilities) {
    // Counts as capability discovery for the propose_capability gate —
    // same intent as suggest_capability_for / list_capabilities, just
    // delivered as an MCP resource.
    session.capsDiscoveredAt = Date.now();
    const active = projected.capabilities.filter(
      (c) => effectiveCapabilityStatus(c, projected.tasks) !== "delivered"
    );
    return {
      contents: [
        {
          uri,
          mimeType: "application/json",
          text: JSON.stringify(active, null, 2),
        },
      ],
    };
  }

  const open = projected.tasks.filter(
    (t) => t.status === "in_progress" || t.status === "planned"
  );
  return {
    contents: [
      {
        uri,
        mimeType: "application/json",
        text: JSON.stringify(open, null, 2),
      },
    ],
  };
}
2865
+
2866
/**
 * Static catalogue of MCP prompts this server advertises. Each entry has a
 * `name`, a `description`, and the list of `arguments` the prompt accepts.
 */
const PROMPTS = (() => {
  // Argument descriptor in the { name, description, required } shape.
  const argument = (name, description, required) => ({ name, description, required });

  const planFeature = {
    name: "plan-feature",
    description:
      "Force the full planning flow: rubric → capability resolution → tasks under it. Use when the user says 'design features for X' / 'plan Y' — bypasses model judgment.",
    arguments: [
      argument("description", "One-line description of the feature or workstream.", true),
    ],
  };

  const closeTask = {
    name: "close-task",
    description:
      "Force the deliver-flow: load the task → self-grade against its acceptance criteria → link the PR. Use after implementing a TK-NNNNNN.",
    arguments: [
      argument("task_id", "TK-NNNNNN", true),
      argument("pr_url", "https://github.com/...", false),
    ],
  };

  const weeklyReview = {
    name: "weekly-review",
    description:
      "Walk through open tasks, stale capabilities, and ungraded deliveries. Use for a structured roadmap review pass.",
    arguments: [],
  };

  return [planFeature, closeTask, weeklyReview];
})();
2895
+
2896
/**
 * Render one of the advertised prompts into an MCP `prompts/get` result.
 *
 * A missing `args` object is treated as empty so the per-field fallbacks
 * below apply instead of a TypeError being thrown.
 *
 * @param {string} name - Prompt name: "plan-feature", "close-task" or
 *   "weekly-review".
 * @param {object} [args] - Prompt arguments (`description`, `task_id`,
 *   `pr_url`); may be omitted.
 * @returns {object} `{ description, messages: [{ role: "user", content }] }`.
 * @throws {Error} `Unknown prompt: <name>` for unrecognized names.
 */
function renderPrompt(name, args) {
  const values = args ?? {};
  const text = (() => {
    switch (name) {
      case "plan-feature":
        return (
          `Plan a feature: "${values.description ?? "(no description provided)"}"\n\n` +
          "Follow this flow exactly:\n" +
          "1. Call get_agents_md (or read roadmapper://rubric) to load the rubric for this session.\n" +
          "2. Call suggest_capability_for with the description above. Read every returned candidate's outcome before deciding.\n" +
          "3. If a returned candidate scores > 0.4 OR its outcome maps to what we're building, propose tasks under it via propose_task. Each task MUST include acceptance criteria per the rubric.\n" +
          "4. If nothing fits, STOP and ask the user before calling propose_capability — capabilities are quarterly bets, not single tasks.\n" +
          "5. After tasks are proposed, summarize: capabilityId chosen, task ids created, anything skipped and why."
        );
      case "close-task":
        return (
          `Close task ${values.task_id ?? "(missing task_id)"}.\n\n` +
          "Follow this flow exactly:\n" +
          "1. Call get_agents_md (or read roadmapper://rubric) to load grading dimensions.\n" +
          `2. Call get_task({ id: "${values.task_id ?? ""}" }) and read every acceptance criterion.\n` +
          "3. For each criterion, decide pass/fail. Fabricated passes destroy this signal — only mark pass if you verified.\n" +
          "4. Call submit_acceptance_grades with the per-index results. Include a note on any fail.\n" +
          // Step 5 names the PR when the caller supplied one.
          (values.pr_url
            ? `5. Call link_pr to attach ${values.pr_url} to the task.\n`
            : "5. If you opened a PR, call link_pr to attach it.\n") +
          "6. Stamp Roadmapper-Task: " +
          (values.task_id ?? "TK-NNNNNN") +
          " in the PR body so the webhook routes future events back here."
        );
      case "weekly-review":
        return (
          "Run a structured roadmap review.\n\n" +
          "1. Call get_agents_md to load the rubric (or confirm rubric is current).\n" +
          "2. Call get_roadmap_snapshot for the canonical model. Note any _meta reminders in the response.\n" +
          "3. For each active capability, scan: are open tasks aging? Are any without acceptance criteria? Are there delivered tasks without acceptance grades?\n" +
          "4. List capabilities whose outcomes are no longer falsifiable or whose tasks all delivered (close them or pivot).\n" +
          "5. Report: ungraded deliveries, stale capabilities, capabilities ready to close, suggested next bets."
        );
      default:
        throw new Error(`Unknown prompt: ${name}`);
    }
  })();
  return {
    description: PROMPTS.find((p) => p.name === name)?.description ?? "",
    messages: [
      { role: "user", content: { type: "text", text } },
    ],
  };
}
2944
+
2945
/**
 * Dispatch one decoded JSON-RPC 2.0 message and build its reply.
 *
 * Supported methods: initialize, tools/list, tools/call, resources/list,
 * resources/read, prompts/list, prompts/get.
 *
 * @param {unknown} request - The parsed message read from the transport.
 * @returns {Promise<object|null>} The JSON-RPC response to send, or `null`
 *   when nothing must be sent (the message carried no `id`, i.e. it is a
 *   notification). Failures are reported as JSON-RPC error objects; this
 *   function does not reject for malformed input.
 */
async function handle(request) {
  // Anything that is not a single request object is rejected up front.
  // Destructuring `null` would otherwise throw before the try/catch below
  // and surface as an unhandled rejection in the caller.
  if (
    request === null ||
    typeof request !== "object" ||
    Array.isArray(request)
  ) {
    return {
      jsonrpc: "2.0",
      id: null,
      error: {
        code: -32600,
        message: "Invalid Request: expected a JSON-RPC request object",
      },
    };
  }

  const { id, method, params } = request;
  // JSON-RPC 2.0: a message without an id is a notification and must never
  // be answered, whether it succeeds, fails, or names an unknown method.
  const isNotification = id === undefined;
  const reply = (result) =>
    isNotification ? null : { jsonrpc: "2.0", id, result };

  try {
    if (method === "initialize") {
      // Snapshot counts so an MCP client showing server info
      // surfaces actual roadmap shape, not just "connected".
      const projected =
        (await readWorkspaceProjected()) ?? project(readSeed(), {});
      const openTasks = projected.tasks.filter(
        (t) => t.status !== "delivered"
      ).length;
      const stats = {
        themes: projected.themes.length,
        capabilities: projected.capabilities.length,
        openTasks,
      };
      // Fresh session — reset rubric-fetched state. The client
      // re-initializes when it reconnects, which is the right
      // boundary for "you need to fetch the rubric again."
      resetSession();
      recordTelemetry("session_initialized", { stats });
      const instructions =
        "Roadmapper online — " +
        `${stats.themes} theme${stats.themes === 1 ? "" : "s"}, ` +
        `${stats.capabilities} capabilit${stats.capabilities === 1 ? "y" : "ies"}, ` +
        `${stats.openTasks} open task${stats.openTasks === 1 ? "" : "s"}. ` +
        "Call get_agents_md before planning — the propose_* and submit_acceptance_grades tools refuse without it. " +
        "Use suggest_capability_for before propose_capability. " +
        "Slash-prompts available: roadmapper:plan-feature, roadmapper:close-task, roadmapper:weekly-review.";
      return reply({
        protocolVersion: PROTOCOL_VERSION,
        // Declare every capability we support. resources +
        // prompts unlock the auto-pull / slash-command surfaces
        // some MCP clients expose; tools work for everyone.
        capabilities: {
          tools: {},
          resources: { listChanged: false },
          prompts: { listChanged: false },
        },
        serverInfo: {
          name: SERVER_NAME,
          version: SERVER_VERSION,
          stats,
          // Kept here for backward compatibility with anything that
          // already reads serverInfo.instructions.
          instructions,
        },
        // MCP defines `instructions` as a top-level field of the
        // initialize result; clients do not look inside serverInfo.
        instructions,
      });
    }
    if (method === "tools/list") {
      return reply({ tools: TOOLS });
    }
    if (method === "tools/call") {
      return reply(await callTool(params?.name, params?.arguments ?? {}));
    }
    if (method === "resources/list") {
      return reply({ resources: RESOURCES });
    }
    if (method === "resources/read") {
      return reply(await readResource(params?.uri));
    }
    if (method === "prompts/list") {
      return reply({ prompts: PROMPTS });
    }
    if (method === "prompts/get") {
      return reply(renderPrompt(params?.name, params?.arguments ?? {}));
    }
    // Notifications (no id) and unknown methods: ignore.
    if (isNotification) return null;
    return {
      jsonrpc: "2.0",
      id,
      error: { code: -32601, message: `Method not found: ${method}` },
    };
  } catch (e) {
    // `e` may be any thrown value, including null/undefined.
    const message = e?.message || String(e);
    if (isNotification) {
      // No reply is allowed, so record the failure instead of dropping it.
      log("notification failed:", String(method), message);
      return null;
    }
    return {
      jsonrpc: "2.0",
      id,
      error: { code: -32603, message },
    };
  }
}
3031
+
3032
/**
 * `node mcp/server.mjs --selftest` — drives handle() through a fixed,
 * ordered list of checks and logs one PASS/FAIL line per check, then a
 * summary, then exits the process (code 0 when every check passed,
 * 1 otherwise). Useful for sanity-checking the install without wiring
 * it into an MCP client.
 *
 * Sample ids (first theme / capability / task) come from readSeed().
 * Most write-tool checks only assert that invalid arguments produce an
 * error result. The three "valid args" checks (propose_task,
 * propose_capability, link_pr) expect an error result when
 * SUPABASE_SERVICE_ROLE_KEY is unset and a successful result when it is
 * set. NOTE(review): per the original inline note, with the key set
 * those calls actually write — confirm before running against a live
 * workspace.
 *
 * Checks run strictly in order and share the module-level `session`
 * object; checks that depend on gate state reset it explicitly.
 */
async function runSelftest() {
  const seed = readSeed();
  const aTheme = seed?.product?.themes?.[0]?.id;
  const aCap = seed?.capabilities?.[0]?.id;
  const aTask = seed?.tasks?.[0]?.id;

  // ---- request builders -------------------------------------------
  const rpc = (id, method, params) => handle({ id, method, params });
  const tool = (id, name, args) =>
    rpc(id, "tools/call", { name, arguments: args });

  // Start from a clean session with only the rubric prerequisite met.
  const primeRubric = () => {
    resetSession();
    session.rubricFetchedAt = Date.now();
  };

  // Pin the snapshot workspace for one call; always unpin afterwards so
  // a thrown handle() doesn't leave the cache pinned.
  const withSnapshotWorkspace = async (workspaceId, run) => {
    try {
      __setSnapshotWorkspaceForTest(workspaceId);
      return await run();
    } finally {
      __setSnapshotWorkspaceForTest(undefined);
    }
  };

  // ---- result predicates ------------------------------------------
  const firstText = (r) => r?.result?.content?.[0]?.text;
  const isErrorResult = (r) => r?.result?.isError === true;
  const isCleanResult = (r) => r?.result && !r.result.isError;
  // Write tools: error result without the service key, success with it.
  const errorUnlessServiceKey = (r) =>
    process.env.SUPABASE_SERVICE_ROLE_KEY
      ? isCleanResult(r)
      : isErrorResult(r);
  // Error result whose text contains every given fragment.
  const erroredMentioning =
    (...fragments) =>
    (r) => {
      if (!r?.result?.isError) return false;
      const text = firstText(r) ?? "";
      return fragments.every((fragment) => text.includes(fragment));
    };
  // Non-error result whose text contains the fragment.
  const cleanTextIncludes = (fragment) => (r) =>
    !r?.result?.isError && firstText(r)?.includes(fragment);
  // tools/list result that names every given tool.
  const advertisesTools =
    (...wanted) =>
    (r) => {
      const names = (r?.result?.tools ?? []).map((t) => t.name);
      return wanted.every((n) => names.includes(n));
    };

  // ---- check registry (order matters) -----------------------------
  const checks = [];
  const check = (name, fn, pass) => {
    checks.push({ name, fn, pass });
  };

  check(
    "initialize",
    () => rpc(1, "initialize", {}),
    (r) =>
      r?.result?.serverInfo?.name === SERVER_NAME &&
      // Capabilities advertise resources + prompts too.
      r?.result?.capabilities?.resources &&
      r?.result?.capabilities?.prompts
  );

  // A mutator called with no rubric fetched must come back as the
  // structured prerequisite_missing error, not a successful write.
  // The explicit resetSession() keeps this check order-independent.
  check(
    "rubric gate blocks mutator before get_agents_md",
    () => {
      resetSession();
      return tool(11, "propose_task", {
        capabilityId: aCap,
        title: "Should be blocked",
      });
    },
    erroredMentioning("prerequisite_missing", "get_agents_md")
  );

  // With the rubric satisfied, propose_theme is still expected to be
  // blocked until themes have been listed.
  check(
    "discovery gate blocks propose_theme before list_themes",
    () => {
      primeRubric();
      return tool(16, "propose_theme", { name: "Some New Theme Idea" });
    },
    erroredMentioning("discovery_missing", "list_themes")
  );

  // Same gate for propose_capability.
  check(
    "discovery gate blocks propose_capability before suggest_capability_for",
    () => {
      primeRubric();
      return tool(17, "propose_capability", {
        name: "Brand new capability",
        pillarId: aTheme,
        outcome: "x",
      });
    },
    erroredMentioning("discovery_missing", "suggest_capability_for")
  );

  // One get_roadmap_snapshot call should stamp both discovery markers.
  check(
    "get_roadmap_snapshot satisfies both discovery gates",
    async () => {
      primeRubric();
      await tool(18, "get_roadmap_snapshot", {});
      return {
        themesListedAt: session.themesListedAt,
        capsDiscoveredAt: session.capsDiscoveredAt,
      };
    },
    (r) => r?.themesListedAt !== null && r?.capsDiscoveredAt !== null
  );

  check(
    "resources/list returns the three resources",
    () => rpc(12, "resources/list", {}),
    (r) =>
      Array.isArray(r?.result?.resources) &&
      r.result.resources.length === RESOURCES.length &&
      r.result.resources.some((x) => x.uri === "roadmapper://rubric")
  );

  check(
    "resources/read rubric counts as a fetched rubric",
    () => rpc(13, "resources/read", { uri: "roadmapper://rubric" }),
    (r) =>
      r?.result?.contents?.[0]?.text?.includes("# AGENTS.md") &&
      session.rubricFetchedAt !== null
  );

  check(
    "prompts/list returns the three prompts",
    () => rpc(14, "prompts/list", {}),
    (r) =>
      Array.isArray(r?.result?.prompts) &&
      r.result.prompts.length === PROMPTS.length &&
      r.result.prompts.some((p) => p.name === "plan-feature")
  );

  check(
    "prompts/get plan-feature expands the template",
    () =>
      rpc(15, "prompts/get", {
        name: "plan-feature",
        arguments: { description: "demo description" },
      }),
    (r) =>
      r?.result?.messages?.[0]?.content?.text?.includes(
        "suggest_capability_for"
      ) && r.result.messages[0].content.text.includes("demo description")
  );

  check(
    "tools/list",
    () => rpc(2, "tools/list", {}),
    (r) =>
      Array.isArray(r?.result?.tools) && r.result.tools.length === TOOLS.length
  );

  check(
    "list_themes",
    () => tool(3, "list_themes", {}),
    (r) => !!firstText(r)?.includes(aTheme)
  );

  check(
    `list_capabilities themeId=${aTheme}`,
    () => tool(4, "list_capabilities", { themeId: aTheme }),
    isCleanResult
  );

  check(
    "list_tasks status=delivered",
    () => tool(5, "list_tasks", { status: "delivered" }),
    isCleanResult
  );

  // Archive smoke: only asserts the includeArchived path returns a
  // non-error result.
  check(
    "list_tasks includeArchived=true (smoke)",
    () => tool(51, "list_tasks", { includeArchived: true }),
    isCleanResult
  );

  check(
    `get_task id=${aTask}`,
    () => tool(6, "get_task", { id: aTask }),
    cleanTextIncludes(`"id": "${aTask}"`)
  );

  check(
    "get_task (bogus id returns error result)",
    () => tool(7, "get_task", { id: "TK-NOPE" }),
    isErrorResult
  );

  check(
    "get_agents_md",
    () => tool(8, "get_agents_md", {}),
    (r) => firstText(r)?.includes("# AGENTS.md")
  );

  check(
    "propose_task (bad capabilityId returns error result)",
    () =>
      tool(9, "propose_task", {
        capabilityId: "CAP-NOPE",
        title: "Should fail",
      }),
    isErrorResult
  );

  // Without SUPABASE_SERVICE_ROLE_KEY this must return an error result
  // (not throw). With the key set, this would actually write.
  check(
    "propose_task (valid args, no service key) errors cleanly",
    () =>
      tool(10, "propose_task", { capabilityId: aCap, title: "Selftest task" }),
    errorUnlessServiceKey
  );

  check(
    "propose_theme (missing name returns error result)",
    () => tool(11, "propose_theme", { name: "" }),
    isErrorResult
  );

  check(
    "propose_capability (unknown pillarId returns error result)",
    () =>
      tool(12, "propose_capability", {
        name: "bogus",
        pillarId: "TH-DOES-NOT-EXIST",
      }),
    isErrorResult
  );

  check(
    "propose_capability (invalid impact returns error result)",
    () =>
      tool(13, "propose_capability", {
        name: "bad-impact",
        pillarId: aTheme,
        impact: 7,
      }),
    isErrorResult
  );

  check(
    "propose_capability (valid args, no service key) errors cleanly",
    () =>
      tool(14, "propose_capability", {
        name: "Selftest capability example",
        pillarId: aTheme,
        outcome:
          "Selftest metric moves from 0 to 10 by 2026-12-31, measured by selftest_event.",
      }),
    errorUnlessServiceKey
  );

  check(
    "propose_capability (empty outcome rejected by validator)",
    () =>
      tool(15, "propose_capability", {
        name: "Selftest capability example",
        pillarId: aTheme,
        outcome: "",
      }),
    isErrorResult
  );

  check(
    "propose_capability (non-falsifiable outcome rejected)",
    () =>
      tool(16, "propose_capability", {
        name: "Selftest capability example",
        pillarId: aTheme,
        outcome: "Make the thing better.",
      }),
    isErrorResult
  );

  // Regression guard for the over-lax month-name match: the outcome
  // has digits ("50%") but no real date — only the verb "may".
  check(
    "propose_capability (bare month name without a digit-after rejected)",
    () =>
      tool(161, "propose_capability", {
        name: "Selftest capability example",
        pillarId: aTheme,
        outcome: "We may improve activation from 30% to 50% if all goes well.",
      }),
    isErrorResult
  );

  check(
    "propose_capability (confidence 100 rejected by validator)",
    () =>
      tool(17, "propose_capability", {
        name: "Selftest capability example",
        pillarId: aTheme,
        outcome: "Metric moves from 0 to 5 by 2026-09-30, measured by event.",
        confidence: 100,
      }),
    isErrorResult
  );

  // dryRun is expected to succeed whether or not the service key is set.
  check(
    "propose_capability dryRun returns wouldCreate without writing",
    () =>
      tool(18, "propose_capability", {
        name: "Selftest dry run capability",
        pillarId: aTheme,
        outcome: "Metric moves from 0 to 5 by 2026-09-30, measured by event.",
        dryRun: true,
      }),
    cleanTextIncludes('"dryRun": true')
  );

  check(
    "suggest_capability_for (returns matches sorted by score)",
    () =>
      tool(19, "suggest_capability_for", { description: "example capability" }),
    cleanTextIncludes('"matches"')
  );

  check(
    "suggest_capability_for (empty description rejected)",
    () => tool(20, "suggest_capability_for", { description: "" }),
    isErrorResult
  );

  // Theme-level mirror of suggest_capability_for.
  check(
    "suggest_theme_for (returns matches sorted by score)",
    () => tool(30, "suggest_theme_for", { description: "example theme" }),
    cleanTextIncludes('"matches"')
  );

  check(
    "suggest_theme_for (empty description rejected)",
    () => tool(31, "suggest_theme_for", { description: "" }),
    isErrorResult
  );

  // After a suggest_theme_for call, themesListedAt should be populated.
  check(
    "suggest_theme_for satisfies propose_theme discovery gate",
    async () => {
      primeRubric();
      await tool(32, "suggest_theme_for", { description: "any" });
      return { themesListedAt: session.themesListedAt };
    },
    (r) => r?.themesListedAt !== null
  );

  check(
    "link_pr (unknown task rejected)",
    () => tool(21, "link_pr", { taskId: "TK-NOPE", repo: "x/y", number: 1 }),
    isErrorResult
  );

  check(
    "link_pr (valid args, no service key errors cleanly)",
    () => tool(22, "link_pr", { taskId: aTask, repo: "x/y", number: 1 }),
    errorUnlessServiceKey
  );

  // An empty reason on archive/unarchive must return an error result
  // regardless of service-key availability.
  check(
    "archive_task (missing reason returns error result)",
    () => tool(23, "archive_task", { taskId: aTask, reason: "" }),
    isErrorResult
  );

  check(
    "archive_capability (missing reason returns error result)",
    () => tool(24, "archive_capability", { capabilityId: aCap, reason: " " }),
    isErrorResult
  );

  check(
    "unarchive_theme (missing themeId returns error result)",
    () => tool(25, "unarchive_theme", { reason: "x" }),
    isErrorResult
  );

  // Move validation: missing newCapabilityId must error out.
  check(
    "move_task (missing newCapabilityId returns error result)",
    () => tool(26, "move_task", { taskId: aTask, reason: "reorg" }),
    isErrorResult
  );

  check(
    "move_capability (missing reason returns error result)",
    () =>
      tool(27, "move_capability", {
        capabilityId: aCap,
        newThemeId: aTheme,
        reason: "",
      }),
    isErrorResult
  );

  // Bulk shape: a 101-item batch must be rejected.
  check(
    "move_tasks (over 100-item cap returns error result)",
    () => {
      const moves = Array.from({ length: 101 }, (_, i) => ({
        taskId: `TK-${String(i).padStart(6, "0")}`,
        newCapabilityId: aCap,
      }));
      return tool(28, "move_tasks", { moves, reason: "reorg" });
    },
    isErrorResult
  );

  check(
    "move_capabilities (empty moves returns error result)",
    () => tool(29, "move_capabilities", { moves: [], reason: "reorg" }),
    isErrorResult
  );

  // Schema-level: tools/list must advertise the four move tools.
  check(
    "tools/list advertises four move tools",
    () => rpc(30, "tools/list", {}),
    advertisesTools(
      "move_task",
      "move_capability",
      "move_tasks",
      "move_capabilities"
    )
  );

  // Update validation: missing patch.
  check(
    "update_task (missing patch returns error result)",
    () => tool(31, "update_task", { taskId: aTask, reason: "r" }),
    isErrorResult
  );

  // Update validation: empty patch.
  check(
    "update_capability (empty patch returns error result)",
    () =>
      tool(32, "update_capability", {
        capabilityId: aCap,
        patch: {},
        reason: "r",
      }),
    isErrorResult
  );

  // Invalid status must be rejected with an "invalid status" message.
  check(
    "update_task (invalid status rejected by validator)",
    () =>
      tool(33, "update_task", {
        taskId: aTask,
        patch: { status: "shipped" },
        reason: "advance",
      }),
    (r) => {
      const txt = firstText(r) ?? "";
      return r?.result?.isError === true && txt.includes("invalid status");
    }
  );

  // Out-of-range confidence must be rejected.
  check(
    "update_capability (confidence 99 rejected by validator)",
    () =>
      tool(34, "update_capability", {
        capabilityId: aCap,
        patch: { confidence: 99 },
        reason: "bump",
      }),
    isErrorResult
  );

  // Schema-level: tools/list must advertise the three update tools.
  check(
    "tools/list advertises three update tools",
    () => rpc(35, "tools/list", {}),
    advertisesTools("update_task", "update_capability", "update_theme")
  );

  // Snapshot names workspace "ws-cwd" while the call carries
  // workspaceId "ws-other": the refusal must name both.
  check(
    "cross-workspace write refused when snapshot conflicts with arg",
    () =>
      withSnapshotWorkspace("ws-cwd", () =>
        tool(36, "archive_task", {
          taskId: aTask,
          reason: "cross-workspace probe",
          workspaceId: "ws-other",
        })
      ),
    erroredMentioning("Refusing cross-workspace", "ws-cwd", "ws-other")
  );

  // Matching workspaceId: the call must still fail, but with one of
  // the downstream errors rather than the cross-workspace refusal.
  check(
    "matching workspaceId arg passes the cross-workspace guard",
    () =>
      withSnapshotWorkspace("ws-cwd", () =>
        tool(37, "archive_task", {
          taskId: aTask,
          reason: "same-workspace probe",
          workspaceId: "ws-cwd",
        })
      ),
    (r) => {
      if (!r?.result?.isError) return false;
      const txt = firstText(r) ?? "";
      if (txt.includes("Refusing cross-workspace")) return false;
      return (
        txt.includes("SUPABASE_SERVICE_ROLE_KEY") ||
        txt.includes("workspaceId could not be resolved") ||
        txt.includes("Write tools require")
      );
    }
  );

  // record_outcome_reading rejects missing value.
  check(
    "record_outcome_reading (missing value returns error result)",
    () =>
      tool(39, "record_outcome_reading", {
        capabilityId: aCap,
        asOf: "2026-05-12",
        source: "test",
      }),
    isErrorResult
  );

  // record_outcome_reading rejects missing source.
  check(
    "record_outcome_reading (missing source returns error result)",
    () =>
      tool(40, "record_outcome_reading", {
        capabilityId: aCap,
        value: 0.5,
        asOf: "2026-05-12",
      }),
    isErrorResult
  );

  // list_stale_outcomes must return parseable JSON with the expected
  // thresholdDays / count / stale fields.
  check(
    "list_stale_outcomes returns a structured stale list",
    () => tool(41, "list_stale_outcomes", {}),
    (r) => {
      if (r?.result?.isError) return false;
      const txt = firstText(r) ?? "";
      try {
        const parsed = JSON.parse(txt);
        return (
          typeof parsed.thresholdDays === "number" &&
          typeof parsed.count === "number" &&
          Array.isArray(parsed.stale)
        );
      } catch {
        return false;
      }
    }
  );

  // ROADMAPPER_ALLOW_CROSS_WORKSPACE=1 disables the guard. Cleanup is
  // in finally so a thrown handle() doesn't leak the permissive flag.
  check(
    "ROADMAPPER_ALLOW_CROSS_WORKSPACE=1 disables the cross-workspace guard",
    async () => {
      try {
        __setSnapshotWorkspaceForTest("ws-cwd");
        process.env.ROADMAPPER_ALLOW_CROSS_WORKSPACE = "1";
        return await tool(38, "archive_task", {
          taskId: aTask,
          reason: "override probe",
          workspaceId: "ws-other",
        });
      } finally {
        __setSnapshotWorkspaceForTest(undefined);
        delete process.env.ROADMAPPER_ALLOW_CROSS_WORKSPACE;
      }
    },
    (r) => {
      const txt = firstText(r) ?? "";
      return !txt.includes("Refusing cross-workspace");
    }
  );

  // ---- runner ------------------------------------------------------
  let passed = 0;
  for (const { name, fn, pass } of checks) {
    let ok = false;
    let err = null;
    try {
      ok = Boolean(pass(await fn()));
    } catch (e) {
      // A throwing check counts as a failure; its message is appended.
      err = e.message;
    }
    const suffix = err ? " — " + err : "";
    log(`${ok ? "PASS" : "FAIL"} ${name}${suffix}`);
    if (ok) passed++;
  }

  const allPassed = passed === checks.length;
  log(`---`);
  log(`${passed}/${checks.length} checks passed.`);
  if (allPassed) {
    const hint = [
      "",
      "Server is healthy. If your MCP client doesn't see the",
      "roadmapper tools after this passes, the client almost",
      "certainly needs a full process restart — MCP servers are",
      "connected at client startup. For Claude Code:",
      " /exit → relaunch `claude` → /mcp to verify",
    ];
    for (const line of hint) log(line);
  }
  process.exit(allPassed ? 0 : 1);
}
3962
+
3963
// Entry point: either run the self-test, or serve newline-delimited
// JSON-RPC messages from stdin until it closes.
if (process.argv.includes("--selftest")) {
  // runSelftest() exits the process itself when it completes; this
  // handler only covers a failure before that point, so the promise is
  // never left floating.
  runSelftest().catch((e) => {
    log("selftest crashed:", e?.message || String(e));
    process.exit(1);
  });
} else {
  let buf = "";
  // Tail of the processing chain. Messages are handled strictly in
  // arrival order, one at a time, so a request that arrives while
  // `initialize` is still awaiting cannot run before its session reset.
  let pending = Promise.resolve();

  // Parse and answer one line. Never rejects: a failure here must not
  // become an unhandled rejection that takes the server down.
  const processLine = async (line) => {
    let msg;
    try {
      msg = JSON.parse(line);
    } catch {
      log("bad json", line.slice(0, 200));
      return;
    }
    try {
      const response = await handle(msg);
      if (response) send(response);
    } catch (e) {
      log("request failed:", e?.message || String(e));
    }
  };

  const enqueue = (line) => {
    pending = pending.then(() => processLine(line));
  };

  process.stdin.setEncoding("utf-8");
  process.stdin.on("data", (chunk) => {
    buf += chunk;
    let nl;
    while ((nl = buf.indexOf("\n")) >= 0) {
      const line = buf.slice(0, nl).trim();
      buf = buf.slice(nl + 1);
      if (!line) continue;
      enqueue(line);
    }
  });

  process.stdin.on("end", () => {
    // A final message without a trailing newline is still a message.
    const rest = buf.trim();
    buf = "";
    if (rest) enqueue(rest);
    // Let in-flight requests finish and reply before exiting; exiting
    // immediately would drop their responses (e.g. piped one-shot use).
    pending.then(() => process.exit(0));
  });

  const { url, readKey: rk, writeKey } = supabaseConfig();
  const mode = url && rk
    ? writeKey
      ? "supabase (rw)"
      : "supabase (ro)"
    : "seed-only";

  // Boot-time snapshot of the roadmap shape — gives the operator a
  // fast sanity check that the MCP can read the right workspace.
  // Errors here are logged and otherwise ignored so a flaky network
  // doesn't keep the server from booting.
  (async () => {
    let stats = null;
    try {
      const projected =
        (await readWorkspaceProjected()) ?? project(readSeed(), {});
      stats = {
        themes: projected.themes.length,
        capabilities: projected.capabilities.length,
        openTasks: projected.tasks.filter((t) => t.status !== "delivered").length,
      };
    } catch (e) {
      log("ready-snapshot errored:", e?.message || String(e));
    }
    const tail = stats
      ? `, ${stats.themes} themes, ${stats.capabilities} capabilities, ${stats.openTasks} open tasks`
      : "";
    const snap = snapshotWorkspaceId();
    const snapTail = snap ? `, snapshot-workspace=${snap}` : "";
    log(`ready (mode=${mode}${tail}${snapTail})`);
  })();
}