@roadmapperai/mcp 0.9.3 → 0.9.5

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (4) hide show
  1. package/AGENTS.md +112 -16
  2. package/README.md +42 -16
  3. package/package.json +1 -1
  4. package/server.mjs +1647 -125
package/server.mjs CHANGED
@@ -326,6 +326,13 @@ async function fetchWorkspaceEntitiesViaBroker() {
326
326
  pillars: Array.isArray(parsed.pillars) ? parsed.pillars : [],
327
327
  capabilities: Array.isArray(parsed.capabilities) ? parsed.capabilities : [],
328
328
  tasks: Array.isArray(parsed.tasks) ? parsed.tasks : [],
329
+ // Additive (migration 0108): the workspace_settings row, used to
330
+ // resolve agent_theme_autonomy. Absent on older backends → null,
331
+ // which the projection treats as "all defaults" (autonomy on).
332
+ settings:
333
+ parsed.settings && typeof parsed.settings === "object"
334
+ ? parsed.settings
335
+ : null,
329
336
  };
330
337
  } catch {
331
338
  return null;
@@ -462,6 +469,75 @@ function tplDescription(text, labels) {
462
469
  return out;
463
470
  }
464
471
 
472
/**
 * Reduce a full tool description to its one-line wire summary: the segment
 * before the first blank line.
 *
 * The full tool descriptions carry the planning methodology
 * (USE WHEN / PREREQUISITE / ANTI-PATTERN / EXAMPLE) inline. That prose
 * is relocated to the server `instructions` field — sent once at connect —
 * plus the roadmapper://rubric resource, so the per-tool wire payload is
 * just the summary.
 *
 * Why: the 34 full descriptions cost ~15k tokens in every tools/list, in
 * every session, used or not. The summaries cost ~3-4k. The methodology
 * isn't lost — it moves to `instructions` (always sent, deduped) and the
 * rubric resource (on demand), and the contract is still enforced server
 * side (rubric/discovery gates + validateOutcome/validateName/etc. return
 * structured `fix` errors). inputSchema, including every per-field
 * description, is untouched — callers keep full argument-level guidance.
 *
 * @param {string} description Full multi-paragraph tool description.
 * @returns {string} Text before the first blank line; the whole input when
 *   it has no blank line; "" when the input is not a string.
 */
function summaryOf(description) {
  // A tool entry with a missing/non-string description must not crash
  // tools/list — an empty summary is the safe degenerate answer.
  if (typeof description !== "string") return "";
  // Accept LF and CRLF blank lines so a description authored with Windows
  // line endings is still cut at its first paragraph break.
  const blankLine = /\r?\n\r?\n/.exec(description);
  return blankLine === null
    ? description
    : description.slice(0, blankLine.index);
}
491
+
492
+ /**
493
+ * The minimal planning contract an agent needs to file a VALID proposal,
494
+ * sent once in the initialize `instructions` field. This is the CORE
495
+ * extract of AGENTS.md (~600 tokens vs the full ~12.5k doc): the gate
496
+ * sequence, the task/capability shapes, the server-enforced falsifiable
497
+ * outcome + confidence rules, the enums, IDs, and don'ts. The full doc
498
+ * (tool catalogue, PR/branch conventions, RICE narrative, GitHub wiring)
499
+ * stays on demand via get_agents_md / roadmapper://rubric — reading
500
+ * either also satisfies the rubric gate. Keep this in sync with AGENTS.md
501
+ * sections: TL;DR, The mental model, Required agent task, Required
502
+ * capability fields, Outcome statements, Impact/Confidence, ID
503
+ * conventions, Don'ts.
504
+ */
505
+ const CORE_CONTRACT = `ROADMAPPER PLANNING CONTRACT (essentials — full version: the get_agents_md op, or read the roadmapper://rubric resource)
506
+
507
+ ACCESS — every operation runs through ONE tool
508
+ roadmap({ op, args }) executes an operation. roadmap_search(intent) lists/ranks the operations; roadmap_describe(op) returns an op's exact arguments. The op names used below (get_roadmap_snapshot, suggest_capability_for, propose_task, get_agents_md, ...) are values for op — e.g. roadmap({ op: "get_agents_md" }) or roadmap({ op: "propose_task", args: { capabilityId, title, effort } }).
509
+
510
+ PER-SESSION WORKFLOW
511
+ 1. Orient first: get_roadmap_snapshot (or list_themes / list_capabilities). This also satisfies the discovery gates below.
512
+ 2. Writing requires the rubric: every workspace-mutating tool (propose_*/update_*/archive_*/unarchive_*/move_*/record_outcome_reading/link_pr/submit_acceptance_grades) refuses until you call get_agents_md once this session (reading roadmapper://rubric also counts).
513
+ 3. Reuse before creating: suggest_capability_for({description}) to find an existing home; only propose a new capability if nothing fits. suggest_theme_for / list_themes before proposing a theme.
514
+ 4. Before your first write, call get_active_workspace and proceed only if its status is "resolved"; for any other status follow the \`next\` action it returns (e.g. link_repo) so writes don't land in the wrong workspace.
515
+ 5. dryRun:true validates any write without committing. Reference everything by stable ID, never by name.
516
+
517
+ MODEL (don't conflate the layers)
518
+ Theme (TH-NNNNNN · leadership · years) > Capability (CAP-XXXXXX · PM · quarters · a falsifiable bet) > Task (TK-NNNNNN · IC/agent · days) > PR (closes tasks). Sprints (SP-NNN) are 1-2 week buckets.
519
+
520
+ TASK fields
521
+ Required: capabilityId, title (>=5 chars), effort: XS|S|M|L|XL (XS=2h S=4h M=1d L=3d XL=8d).
522
+ Recommended (not enforced): kind: feature|bug|chore|spike, priority: P0|P1|P2|P3, acceptance: [checkable assertions], dependsOn: [TK-...].
523
+ Give any task an agent will pick up a non-empty acceptance list — an empty list is a stop signal (spike it or ask). Stamp authorKind:agent. Only set dependsOn when one task truly blocks another.
524
+
525
+ CAPABILITY fields
526
+ Required: name (>=8 chars), pillarId: TH-..., outcome (falsifiable — see below).
527
+ Optional (defaulted): reach: number >=0 (default 100), impact: 3|2|1|0.5|0.25 (default 1), confidence: 0-95 (default 70), specRef (spec link; supply before decomposing a capability into tasks so scope is pinned — convention, not enforced).
528
+
529
+ FALSIFIABLE OUTCOME (server-enforced — propose_capability rejects otherwise)
530
+ Template: <metric> moves from <baseline> to <target> by <date>, measured by <source>.
531
+ The outcome MUST contain both a number AND a temporal anchor. Use a 20XX year (e.g. 2026-09-30 or "Sep 2026") or a quarter (Q3, q1 2026) — a bare month name or "by <month>" is NOT accepted, so always include the year. Confidence: 100 is never accepted (server caps at 95); reserve 91-95 for work already shipped or behind a flag.
532
+ Good: "Activation rate moves from 32% to 55% by 2026-09-30, measured by the activated_user event."
533
+ Weak: "Improve builder UX" (no metric/baseline/date — rewrite, or file it as a task).
534
+
535
+ WHEN NOT TO CREATE A CAPABILITY
536
+ A one-off fix, infra under an existing bet, a refactor/rename, or anything that fits in one PR is a TASK under the existing capability — not a new bet. If you can't write a falsifiable outcome, it isn't a capability yet.
537
+
538
+ DON'TS
539
+ No capability-per-PR. No blank outcomes. Don't game RICE inputs. Don't edit theme IDs after creation. Don't self-promote a task to delivered — wait for the merged PR.`;
540
+
465
541
  /**
466
542
  * Resolve a config value from a primary `ROADMAPPER_*` env var,
467
543
  * falling back to a legacy `SUPABASE_*` alias when the primary
@@ -853,6 +929,7 @@ async function readWorkspaceProjected(wsIdOverride) {
853
929
  themes: ent.pillars.map(rowToThemeProjected),
854
930
  capabilities: ent.capabilities.map(rowToCapabilityProjected),
855
931
  tasks: ent.tasks.map(rowToTaskProjected),
932
+ settings: rowToSettingsProjected(ent.settings),
856
933
  };
857
934
  }
858
935
  // Broker failed — fall through to the direct read below. On a pure
@@ -876,15 +953,22 @@ async function readWorkspaceProjected(wsIdOverride) {
876
953
  return res.json();
877
954
  };
878
955
  try {
879
- const [pillars, caps, tasks] = await Promise.all([
956
+ const [pillars, caps, tasks, settingsRows] = await Promise.all([
880
957
  fetchTable("pillars?select=*"),
881
958
  fetchTable("capabilities?select=*"),
882
959
  fetchTable("tasks?select=*"),
960
+ // Operator path: workspace_settings is one row per workspace.
961
+ // Tolerate the table not existing on an older DB (404) — fall
962
+ // back to defaults rather than failing the whole read.
963
+ fetchTable("workspace_settings?select=*").catch(() => []),
883
964
  ]);
884
965
  return {
885
966
  themes: pillars.map(rowToThemeProjected),
886
967
  capabilities: caps.map(rowToCapabilityProjected),
887
968
  tasks: tasks.map(rowToTaskProjected),
969
+ settings: rowToSettingsProjected(
970
+ Array.isArray(settingsRows) ? settingsRows[0] : settingsRows
971
+ ),
888
972
  };
889
973
  } catch (e) {
890
974
  log("supabase entity read failed:", e.message);
@@ -896,6 +980,20 @@ async function readWorkspaceProjected(wsIdOverride) {
896
980
  * the same camelCase keys the SPA + agent surfaces have always
897
981
  * used; the legacy JSONB shape and these table rows agree on
898
982
  * every field. */
983
/**
 * Project a workspace_settings row to the camelCase shape the server
 * reads. Tolerant of null / {} (no row yet, or an older backend that
 * doesn't return settings): every flag falls back to its product
 * default. agent_theme_autonomy defaults TRUE — agents create themes
 * autonomously unless a workspace explicitly turns it off.
 *
 * Also accepts a row LIST (e.g. a `select=*` result) and projects its first
 * element. Without this, an array passes the `typeof === "object"` check,
 * every column reads as undefined, and an explicit "autonomy off" would be
 * silently projected as on.
 *
 * @param {object|object[]|null|undefined} r Settings row, row list, or nothing.
 * @returns {{ agentThemeAutonomy: boolean }}
 */
function rowToSettingsProjected(r) {
  const candidate = Array.isArray(r) ? r[0] : r;
  const row =
    candidate !== null && typeof candidate === "object" ? candidate : {};
  return {
    // Default true: missing column / row / backend all mean "on". Only an
    // explicit boolean false turns autonomy off.
    agentThemeAutonomy: row.agent_theme_autonomy !== false,
  };
}
899
997
  function rowToThemeProjected(r) {
900
998
  return stripUndefined({
901
999
  id: r.id,
@@ -1311,7 +1409,7 @@ function validateOutcome(outcome) {
1311
1409
  !hasTemporal ? "date/quarter" : null,
1312
1410
  ]
1313
1411
  .filter(Boolean)
1314
- .join(" + ")}. See get_agents_md for examples.`;
1412
+ .join(" + ")}. See ${opCall("get_agents_md")} for examples.`;
1315
1413
  }
1316
1414
  return null;
1317
1415
  }
@@ -1396,6 +1494,27 @@ function jaccardScore(a, b) {
1396
1494
  return overlap / Math.max(a.size, b.size);
1397
1495
  }
1398
1496
 
1497
+ // ── Theme sprawl control ──────────────────────────────────────────
1498
+ //
1499
+ // With agent_theme_autonomy ON (the default), the old "stop and ask a
1500
+ // human before any new theme" guard is gone — so the sprawl guard has
1501
+ // to live server-side instead. A proposed theme whose name+description
1502
+ // overlaps an existing active theme at or above this bar is almost
1503
+ // certainly a near-duplicate ("Data Intelligence" vs "Data &
1504
+ // Intelligence"); propose_theme refuses it and points at the match so
1505
+ // the agent reuses/updates that theme instead of minting a sibling.
1506
+ // Set deliberately high: themes are coarse, so only a strong overlap
1507
+ // is a real duplicate. 0.6 blocks name-containment dups ("Data
1508
+ // Intelligence" ⊂ "Data Intelligence Platform" = 0.67) without
1509
+ // false-positiving on two distinct short themes that happen to share
1510
+ // ONE word ("Customer Loyalty" vs "Customer Retention" = 0.5 < 0.6).
1511
+ // force:true overrides for the rare legitimate case.
1512
+ const THEME_SPRAWL_BLOCK = 0.6;
1513
+ // Two existing themes overlapping at/above this are flagged as a
1514
+ // consolidation candidate by detect_theme_sprawl (lower than the block
1515
+ // bar — we want to surface drift before it's an exact dup).
1516
+ const THEME_SPRAWL_WARN = 0.34;
1517
+
1399
1518
  // ── Session state + enforcement gates ─────────────────────────────
1400
1519
  //
1401
1520
  // One process serves one MCP client (stdio). State below is the
@@ -1429,6 +1548,20 @@ function resetSession() {
1429
1548
  session.mutatorBlocks = 0;
1430
1549
  }
1431
1550
 
1551
/**
 * Format an agent-facing "next call" in the dispatch shape. After the
 * tool-surface collapse the ONLY callable tool is `roadmap`; the 34 ops are
 * `op` values, so a fix field of `get_agents_md()` names something a real MCP
 * client can't invoke (it isn't in tools/list). opCall renders the reachable
 * form `roadmap({ op: "<op>"[, args: {...}] })`.
 *
 * @param {string} op Operation name. Rendered as a JSON string literal so a
 *   name containing quotes or control characters still yields a well-formed
 *   call; ordinary names render exactly as before.
 * @param {string|object} [argsHint] Preformatted args literal (string), used
 *   verbatim. A non-string value is serialised with JSON.stringify instead
 *   of rendering as "[object Object]". Any falsy value means "no args".
 * @returns {string} The call text to show the agent.
 */
function opCall(op, argsHint) {
  const opLiteral = JSON.stringify(String(op));
  if (!argsHint) {
    return `roadmap({ op: ${opLiteral} })`;
  }
  const argsLiteral =
    typeof argsHint === "string" ? argsHint : JSON.stringify(argsHint);
  return `roadmap({ op: ${opLiteral}, args: ${argsLiteral} })`;
}
1564
+
1432
1565
  /**
1433
1566
  * Build the structured "prereq missing" result the mutators return
1434
1567
  * when the agent hasn't fetched the rubric this session. The shape
@@ -1445,10 +1578,10 @@ function rubricMissingResult(toolName) {
1445
1578
  {
1446
1579
  error: "prerequisite_missing",
1447
1580
  message:
1448
- `Call get_agents_md first this session, then retry ${toolName}. ` +
1581
+ `Call ${opCall("get_agents_md")} first this session, then retry your ${toolName} call. ` +
1449
1582
  "The rubric defines acceptance criteria shape and grading dimensions — " +
1450
1583
  "proposals filed without it will not round-trip.",
1451
- fix: "get_agents_md()",
1584
+ fix: opCall("get_agents_md"),
1452
1585
  },
1453
1586
  null,
1454
1587
  2
@@ -1473,7 +1606,7 @@ function discoveryMissingResult(toolName, fixCall, rationale) {
1473
1606
  {
1474
1607
  error: "discovery_missing",
1475
1608
  message:
1476
- `Call ${fixCall} first this session, then retry ${toolName}. ${rationale}`,
1609
+ `Call ${fixCall} first this session, then retry your ${toolName} call. ${rationale}`,
1477
1610
  fix: fixCall,
1478
1611
  },
1479
1612
  null,
@@ -1485,6 +1618,88 @@ function discoveryMissingResult(toolName, fixCall, rationale) {
1485
1618
  };
1486
1619
  }
1487
1620
 
1621
/**
 * Build the error result returned when a mutator's target workspace fell
 * through to the install's env default WHILE the agent is sitting in a git
 * repo that isn't mapped to any workspace. Same shape + self-heal rationale
 * as the rubric gate: name the exact fix so the LLM links the repo, then
 * retries.
 *
 * Why this is repo-aware, not session-aware — a developer routinely has
 * SEVERAL repos open in one chat. The gate must only fire for the specific
 * unmapped repo, and must never brick a legitimate cross-repo write:
 *   • An explicit `workspaceId` arg → caller is intentionally targeting a
 *     workspace; never blocked (checked before this is reached).
 *   • source === "repo"/"snapshot"/"arg" → already resolved to a real
 *     mapping; this only fires on "env" (the silent install-default
 *     fall-through).
 *   • No git slug (not in a repo) → nothing to link; fall through to the
 *     env default rather than deadlock.
 * The message offers BOTH escape hatches so a multi-repo chat is never
 * stuck: link_repo (map this repo) OR pass workspaceId (target an existing
 * workspace without mapping the repo at all).
 *
 * @param {string} toolName Operation that was blocked.
 * @param {string} slug Repo slug reported as unmapped.
 * @param {string} envWsId Install-default workspace the write would have hit.
 * @returns {{ content: Array<{ type: string, text: string }>, isError: boolean }}
 */
function repoUnmappedResult(toolName, slug, envWsId) {
  const linkCall = opCall("link_repo");

  const message = [
    `"${slug}" isn't mapped to a workspace, so ${toolName} would land on the install-default workspace "${envWsId}" — probably not what you want. `,
    `Map it once with ${linkCall} (this repo → your key's workspace, resolves silently forever after), then retry your ${toolName} call. `,
    `Or, if you meant a specific existing workspace, pass workspaceId in the op's args and it proceeds without mapping the repo.`,
  ].join("");

  // Key order is part of the serialized text, so it is kept as-is.
  const payload = {
    error: "repo_unmapped",
    message,
    repo: slug,
    envDefaultWorkspace: envWsId,
    fix: linkCall,
    alt: opCall(toolName, '{ workspaceId: "<target>", ... }'),
  };

  return {
    content: [{ type: "text", text: JSON.stringify(payload, null, 2) }],
    isError: true,
  };
}
1667
+
1668
/**
 * Decide whether a mutator should be blocked because the agent is in an
 * unmapped repo and the write would silently hit the env default. Resolves
 * to a block result (see repoUnmappedResult), or null to proceed.
 *
 * This function is async: it awaits repoSlugForDir for each client root.
 * The per-repo "is it mapped" question was already answered by
 * resolveWorkspaceWithSource (a mapped repo resolves to source "repo",
 * never "env"), so only the cwd's git slug is needed here.
 * NOTE(review): the earlier comment described this as network-free; that
 * depends on repoSlugForDir, which is not visible here — confirm.
 *
 * Escape hatches, in the order the code evaluates them:
 *   1. Explicit workspaceId arg → intentional target, allow.
 *   2. Writes disabled (writeMode() === "read-only") → not our concern.
 *   3. Source isn't "env" → already resolved to a real mapping, allow.
 *   4. ROADMAPPER_ALLOW_UNMAPPED_REPO === "1" → allow (operator opt-out).
 *   5. No client roots → not in a repo at all, allow.
 *   6. No root yields a git slug → not in a linkable repo, allow
 *      (fall through to env default; blocking would deadlock).
 *
 * @param {string} name Operation being gated.
 * @param {object} [args] Operation arguments; only `workspaceId` is read.
 * @param {string} source How the workspace was resolved (e.g. "env").
 * @param {string} envWsId Install-default workspace id, echoed in the block.
 * @returns {Promise<object|null>} Block result, or null to proceed.
 */
async function repoLinkGate(name, args, source, envWsId) {
  if (args?.workspaceId) return null; // explicit target — never block
  if (writeMode() === "read-only") return null; // different problem
  if (source !== "env") return null; // resolved via repo/snapshot/arg
  if (process.env.ROADMAPPER_ALLOW_UNMAPPED_REPO === "1") return null;
  if (_clientRoots.length === 0) return null; // not in a repo at all

  // Find the first open root with a resolvable origin slug. If none, the
  // agent isn't in a linkable git repo — don't block (let env default win).
  let slug = null;
  for (const dir of _clientRoots) {
    slug = await repoSlugForDir(dir);
    if (slug) break;
  }
  if (!slug) return null;

  return repoUnmappedResult(name, slug, envWsId);
}
1702
+
1488
1703
  /**
1489
1704
  * Telemetry write — fire-and-forget POST to public.mcp_telemetry
1490
1705
  * via PostgREST when a service-role key is set. Never blocks the
@@ -1768,13 +1983,70 @@ const TOOLS = [
1768
1983
  additionalProperties: false,
1769
1984
  },
1770
1985
  },
1986
+ {
1987
+ name: "propose_tasks",
1988
+ description:
1989
+ "Bulk-create MANY tasks under ONE capability in a single call. Token-efficient: prefer this over N separate propose_task calls when filing a plan — one request, one compact {id,title} array back instead of N round trips. When write tools are live, file directly via this tool; do NOT also paste the full JSON plan into chat (that pays for the plan twice).\n\n" +
1990
+ "USE WHEN: decomposing a capability into its 3-8 tasks, or importing a planned backlog. All tasks share the one capabilityId.\n" +
1991
+ "PREREQUISITE: get_agents_md once this session (enforced). The capability must already exist — propose_capability first if needed.\n" +
1992
+ "INTRA-BATCH DEPENDENCIES: give a task a `ref` (any alias string) and reference it in another task's `dependsOn` — refs are rewritten to the real TK ids after minting. dependsOn entries that aren't a sibling ref pass through as literal existing TK ids.\n" +
1993
+ "PARTIAL SUCCESS: a structural/validation error in any row fails the whole call before writing (fix the batch). Once validated, per-row RPC failures are reported in tasks[].error without sinking the rest.\n" +
1994
+ "ANTI-PATTERN: don't use for a single task (use propose_task); don't spread one capability's tasks across multiple capabilities (call once per capability).\n" +
1995
+ "EXAMPLE: propose_tasks({ capabilityId: 'CAP-018', tasks: [{ ref: 'a', title: 'Schema + migration', effort: 'M' }, { title: 'API endpoint', effort: 'M', dependsOn: ['a'] }] })\n\n" +
1996
+ "Requires write auth (set ROADMAPPER_API_KEY). Pass dryRun:true to validate + preview ids without writing. Pass workspaceId to target a workspace other than the env default.",
1997
+ inputSchema: {
1998
+ type: "object",
1999
+ properties: {
2000
+ capabilityId: { type: "string" },
2001
+ tasks: {
2002
+ type: "array",
2003
+ minItems: 1,
2004
+ maxItems: 100,
2005
+ description: "Task specs. Each needs title + effort; everything else is optional.",
2006
+ items: {
2007
+ type: "object",
2008
+ properties: {
2009
+ ref: {
2010
+ type: "string",
2011
+ description:
2012
+ "Optional caller alias for intra-batch dependsOn references. Not stored.",
2013
+ },
2014
+ title: { type: "string" },
2015
+ summary: { type: "string" },
2016
+ effort: { type: "string", enum: ["XS", "S", "M", "L", "XL"] },
2017
+ kind: { type: "string", enum: ["feature", "bug", "chore", "spike"] },
2018
+ priority: { type: "string", enum: ["P0", "P1", "P2", "P3"] },
2019
+ acceptance: { type: "array", items: { type: "string" } },
2020
+ dependsOn: {
2021
+ type: "array",
2022
+ items: { type: "string" },
2023
+ description:
2024
+ "Sibling refs (rewritten to real ids) and/or existing TK-NNNNNN ids.",
2025
+ },
2026
+ owner: { type: "string" },
2027
+ expectedPRs: { type: "number" },
2028
+ expectedScope: { type: "number" },
2029
+ idempotencyKey: { type: "string" },
2030
+ },
2031
+ required: ["title", "effort"],
2032
+ additionalProperties: false,
2033
+ },
2034
+ },
2035
+ dryRun: { type: "boolean" },
2036
+ workspaceId: { type: "string" },
2037
+ },
2038
+ required: ["capabilityId", "tasks"],
2039
+ additionalProperties: false,
2040
+ },
2041
+ },
1771
2042
  {
1772
2043
  name: "propose_theme",
1773
2044
  description:
1774
- "Propose a new strategic theme (pillar). Themes are years-stable — only propose one when nothing existing fits.\n\n" +
1775
- "USE WHEN: the work the user is describing genuinely doesn't fit ANY existing theme, AND the user explicitly says they want a new strategic direction. Almost never the right answer in a planning session.\n" +
1776
- "PREREQUISITE: get_agents_md once this session (enforced). Theme discovery once this session, satisfied by suggest_theme_for (preferred — returns ranked matches with a fit signal), list_themes, or get_roadmap_snapshot. Enforced the server returns discovery_missing with a fix field if you skip it. Duplicating a theme is the most common failure mode; the gate stops it.\n" +
1777
- "ANTI-PATTERN: do not call to organize a quarter of work — that's a capability, not a theme. Do not call because the existing themes feel too coarse they're SUPPOSED to be coarse. Use propose_capability under an existing theme instead.\n" +
2045
+ "Propose a new strategic theme (pillar). Themes are years-stable, coarse pillars — the small top tier of the tree.\n\n" +
2046
+ "AUTONOMY: by default (agent_theme_autonomy ON) you may create a theme without human confirmation when the work genuinely needs a new pillar. The server controls sprawl for you — it REFUSES a near-duplicate of an existing theme (returns error:\"too_similar\" naming the match) so you reuse/update that one instead. If a workspace turned autonomy OFF, propose_theme returns error:\"confirmation_required\" until you surface the theme to the user and retry with confirm:true.\n" +
2047
+ "USE WHEN: the work doesn't fit any existing theme AND represents a distinct multi-year strategic direction. Most planning needs a capability under an existing theme, not a new theme.\n" +
2048
+ "PREREQUISITE: get_agents_md once this session (enforced). Theme discovery once this session, satisfied by suggest_theme_for (preferred — returns ranked matches), list_themes, or get_roadmap_snapshot (enforced — discovery_missing with a fix field otherwise).\n" +
2049
+ "ANTI-PATTERN: do not call to organize a quarter of work — that's a capability. Do not retry with force:true to bypass a too_similar block unless the overlap is a genuine false positive — that's the sprawl guard working.\n" +
1778
2050
  "EXAMPLE: propose_theme({ name: 'AI Agent Reliability', description: 'Multi-year bet on making agent workflows reproducible.', targetRoi: 20000000, idempotencyKey: 'session-1-theme-1' })\n\n" +
1779
2051
  "Requires write auth (set ROADMAPPER_API_KEY). targetRoi is RAW ANNUAL DOLLARS (e.g. 20000000 = $20M), not millions. Pass idempotencyKey so retries don't duplicate. Pass dryRun: true to validate without writing. Pass workspaceId to target a workspace other than the env default.",
1780
2052
  inputSchema: {
@@ -1784,6 +2056,16 @@ const TOOLS = [
1784
2056
  description: { type: "string" },
1785
2057
  color: { type: "string" },
1786
2058
  targetRoi: { type: "number", description: "Annual ROI target in raw dollars (e.g. 20000000 = $20M)." },
2059
+ force: {
2060
+ type: "boolean",
2061
+ description:
2062
+ "Override the too_similar sprawl block. Use ONLY when a flagged overlap with an existing theme is a genuine false positive and this is truly a distinct strategic pillar.",
2063
+ },
2064
+ confirm: {
2065
+ type: "boolean",
2066
+ description:
2067
+ "Set true to proceed when the workspace has agent theme-autonomy turned OFF — your attestation that the user explicitly approved this new theme. Ignored when autonomy is on (the default).",
2068
+ },
1787
2069
  idempotencyKey: { type: "string" },
1788
2070
  dryRun: { type: "boolean" },
1789
2071
  workspaceId: { type: "string" },
@@ -1810,7 +2092,7 @@ const TOOLS = [
1810
2092
  outcome: { type: "string" },
1811
2093
  reach: { type: "number" },
1812
2094
  impact: { type: "number", enum: [3, 2, 1, 0.5, 0.25] },
1813
- confidence: { type: "number", minimum: 0, maximum: 100 },
2095
+ confidence: { type: "number", minimum: 0, maximum: 95 },
1814
2096
  roi: { type: "number", description: "Estimated annual ROI in raw dollars (e.g. 2500000 = $2.5M)." },
1815
2097
  specRef: { type: "string" },
1816
2098
  idempotencyKey: { type: "string" },
@@ -1841,7 +2123,11 @@ const TOOLS = [
1841
2123
  properties: {
1842
2124
  index: { type: "integer", minimum: 0 },
1843
2125
  status: { type: "string", enum: ["pass", "fail"] },
1844
- note: { type: "string" },
2126
+ note: {
2127
+ type: "string",
2128
+ description:
2129
+ "Required when status=fail — the failure mode the reviewer needs. Call this before opening the PR.",
2130
+ },
1845
2131
  },
1846
2132
  required: ["index", "status"],
1847
2133
  additionalProperties: false,
@@ -2019,6 +2305,31 @@ const TOOLS = [
2019
2305
  additionalProperties: false,
2020
2306
  },
2021
2307
  },
2308
+ {
2309
+ name: "detect_theme_sprawl",
2310
+ description:
2311
+ "Find pairs/clusters of EXISTING themes that overlap enough to be candidates for consolidation — the 'we have too many near-duplicate pillars' signal. The companion to agent_theme_autonomy: autonomy lets agents create themes freely, this is how you periodically detect and clean up the drift.\n\n" +
2312
+ "How it works: scores every active theme against every other by name+description token overlap, and reports pairs at or above the warn threshold (default 0.34). Each pair comes with the overlap score and a suggested action (merge via move_capabilities + archive_theme).\n" +
2313
+ "USE WHEN: quarterly review, or any time the theme list feels bloated. With autonomy on, run this occasionally to catch sibling themes that should be one.\n" +
2314
+ "PREREQUISITE: none — read-only. Enumerates every theme, so it satisfies the propose_theme discovery gate.\n" +
2315
+ "ANTI-PATTERN: don't auto-merge on a single weak overlap — a human owns theme structure. Tune threshold rather than acting on noise. Two themes CAN legitimately share vocabulary (e.g. 'Data Ingestion' vs 'Data Governance').\n" +
2316
+ "EXAMPLE: detect_theme_sprawl({ threshold: 0.34 })",
2317
+ inputSchema: {
2318
+ type: "object",
2319
+ properties: {
2320
+ threshold: {
2321
+ type: "number",
2322
+ minimum: 0,
2323
+ maximum: 1,
2324
+ description:
2325
+ "Min name+description Jaccard overlap between two themes to flag as a consolidation candidate. Default 0.34. Raise to surface only the most blatant duplicates.",
2326
+ },
2327
+ includeArchived: { type: "boolean" },
2328
+ workspaceId: { type: "string" },
2329
+ },
2330
+ additionalProperties: false,
2331
+ },
2332
+ },
2022
2333
  ];
2023
2334
 
2024
2335
  /**
@@ -2337,6 +2648,7 @@ function updateLifecycleTools() {
2337
2648
  /** Tools that mutate the workspace — all gated on rubric fetch. */
2338
2649
  const MUTATOR_TOOLS = new Set([
2339
2650
  "propose_task",
2651
+ "propose_tasks",
2340
2652
  "propose_theme",
2341
2653
  "propose_capability",
2342
2654
  "submit_acceptance_grades",
@@ -2357,7 +2669,206 @@ const MUTATOR_TOOLS = new Set([
2357
2669
  "record_outcome_reading",
2358
2670
  ]);
2359
2671
 
2672
+ // --- Dispatch surface -------------------------------------------------
2673
+ // Token-efficiency collapse: instead of advertising all 34 tools (their
2674
+ // inputSchemas alone are ~5k tokens in every tools/list), the wire surface
2675
+ // is three dispatch tools. The 34 operations are routed by name through
2676
+ // callTool (see the roadmap/roadmap_search/roadmap_describe early returns),
2677
+ // and their schemas are served on demand via roadmap_describe. This keeps
2678
+ // tools/list tiny while every operation, gate, and validator is unchanged.
2679
+ const OP_NAMES = new Set(TOOLS.map((t) => t.name));
2680
+ const DISPATCH_TOOLS = new Set(["roadmap", "roadmap_search", "roadmap_describe"]);
2681
+
2682
+ const META_TOOLS = [
2683
+ {
2684
+ name: "roadmap_search",
2685
+ description:
2686
+ "Find the right roadmap operation for what you want to do. Returns operation names with one-line summaries, ranked by your intent (or all of them if you omit intent). Then call roadmap_describe(op) for an op's arguments and roadmap({op, args}) to run it.",
2687
+ inputSchema: {
2688
+ type: "object",
2689
+ properties: {
2690
+ intent: {
2691
+ type: "string",
2692
+ description:
2693
+ "Free-text description of the task, e.g. 'file a new bet' or 'mark a task done'. Omit to list every operation.",
2694
+ },
2695
+ },
2696
+ additionalProperties: false,
2697
+ },
2698
+ },
2699
+ {
2700
+ name: "roadmap_describe",
2701
+ description:
2702
+ "Return the input schema and summary for one roadmap operation (op, e.g. 'propose_task'). Call before roadmap({op, args}) when you need the exact argument shape; this is the same schema the operation validates against.",
2703
+ inputSchema: {
2704
+ type: "object",
2705
+ properties: {
2706
+ op: { type: "string", description: "Operation name to describe, e.g. propose_task." },
2707
+ },
2708
+ required: ["op"],
2709
+ additionalProperties: false,
2710
+ },
2711
+ },
2712
+ {
2713
+ name: "roadmap",
2714
+ description:
2715
+ "Execute any roadmap operation: roadmap({ op, args }). op is an operation name such as get_roadmap_snapshot, list_capabilities, suggest_capability_for, propose_task, or update_capability — discover them with roadmap_search, get their arguments with roadmap_describe. All reads, planning, and writes go through here; the server enforces the rubric/discovery gates and per-op validation. See the server instructions for the planning contract.",
2716
+ inputSchema: {
2717
+ type: "object",
2718
+ properties: {
2719
+ op: {
2720
+ type: "string",
2721
+ description:
2722
+ "Operation name, e.g. get_roadmap_snapshot, suggest_capability_for, propose_task. Call roadmap_search to discover ops.",
2723
+ },
2724
+ args: {
2725
+ type: "object",
2726
+ description:
2727
+ "Arguments for the op (see roadmap_describe(op)). Omit for ops that take none.",
2728
+ additionalProperties: true,
2729
+ },
2730
+ },
2731
+ required: ["op"],
2732
+ additionalProperties: false,
2733
+ },
2734
+ },
2735
+ ];
2736
+
/**
 * roadmap_search handler: list every operation in TOOLS as an
 * { op, summary } row, most relevant to `intent` first.
 *
 * The summary is summaryOf(description) passed through tplDescription with
 * the current workspace labels, i.e. the same label substitution applied to
 * tool descriptions. Relevance is the number of intent keywords (lower-cased
 * alphanumeric runs longer than two characters) that occur as substrings of
 * "<op> <summary>". Nothing is filtered out: rows that match no keyword stay
 * in the list, after the ones that do, in their original TOOLS order.
 *
 * @param {string} intent Free-text description of the task; "" lists all ops unranked.
 * @returns The textResult wrapping the pretty-printed JSON payload.
 */
function roadmapSearchResult(intent) {
  const labels = currentLabels();
  const catalogue = TOOLS.map((tool) => ({
    op: tool.name,
    summary: tplDescription(summaryOf(tool.description), labels),
  }));

  const query = (intent || "").toLowerCase().trim();
  const keywords = query
    ? query.split(/[^a-z0-9]+/).filter((word) => word.length > 2)
    : [];

  let operations = catalogue;
  if (keywords.length > 0) {
    // Count how many distinct keywords appear anywhere in the row's text.
    const hitCount = (entry) => {
      const haystack = (entry.op + " " + entry.summary).toLowerCase();
      let hits = 0;
      for (const word of keywords) {
        if (haystack.includes(word)) hits += 1;
      }
      return hits;
    };
    // Array.prototype.sort is stable, so equal scores keep catalogue order.
    operations = catalogue
      .map((entry) => [hitCount(entry), entry])
      .sort(([left], [right]) => right - left)
      .map(([, entry]) => entry);
  }

  const payload = {
    intent: intent || null,
    note: "Call roadmap_describe({ op }) for an op's arguments, then roadmap({ op, args }) to run it.",
    total: operations.length,
    operations,
  };
  return textResult(JSON.stringify(payload, null, 2));
}
2775
+
/**
 * roadmap_describe handler: return one operation's inputSchema together with
 * its label-substituted summary. The schemas are no longer advertised in
 * tools/list, so this is how a client fetches them on demand.
 *
 * @param {unknown} op Operation name, e.g. "propose_task".
 * @returns An errorResult when `op` is not a non-empty string or is not the
 *   name of an entry in TOOLS; otherwise a textResult with { op, summary,
 *   inputSchema } as pretty-printed JSON.
 */
function roadmapDescribeResult(op) {
  const hasName = typeof op === "string" && op !== "";
  if (!hasName) {
    return errorResult(
      "roadmap_describe requires an 'op' string, e.g. roadmap_describe({ op: 'propose_task' })."
    );
  }

  const tool = TOOLS.find((candidate) => candidate.name === op);
  if (!tool) {
    return errorResult(
      `Unknown op '${op}'. Call roadmap_search to list operations.`
    );
  }

  const summary = tplDescription(summaryOf(tool.description), currentLabels());
  const payload = {
    op: tool.name,
    summary,
    inputSchema: tool.inputSchema,
  };
  return textResult(JSON.stringify(payload, null, 2));
}
2802
+
2360
2803
  async function callTool(name, args) {
2804
+ // Dispatch surface. roadmap_search / roadmap_describe answer here without
2805
+ // touching the workspace. roadmap({op, args}) re-enters callTool(op, args)
2806
+ // so the operation runs through the IDENTICAL pipeline below — workspace
2807
+ // resolution, the MUTATOR_TOOLS gates, validators, session-flag side
2808
+ // effects — keyed off the real op name, with nothing duplicated. The ops
2809
+ // also stay directly callable (back-compat + what the selftest drives).
2810
+ if (name === "roadmap_search") {
2811
+ return roadmapSearchResult(typeof args?.intent === "string" ? args.intent : "");
2812
+ }
2813
+ if (name === "roadmap_describe") {
2814
+ return roadmapDescribeResult(args?.op);
2815
+ }
2816
+ if (name === "roadmap") {
2817
+ const op = args?.op;
2818
+ if (typeof op !== "string" || !op) {
2819
+ return errorResult(
2820
+ "roadmap requires an 'op', e.g. roadmap({ op: 'get_roadmap_snapshot' }). Call roadmap_search to discover operations."
2821
+ );
2822
+ }
2823
+ if (DISPATCH_TOOLS.has(op)) {
2824
+ return errorResult(
2825
+ `'${op}' is a dispatch tool, not an operation. Pass a real op such as get_roadmap_snapshot — call roadmap_search to list them.`
2826
+ );
2827
+ }
2828
+ if (!OP_NAMES.has(op)) {
2829
+ return errorResult(
2830
+ `Unknown op '${op}'. Call roadmap_search to list operations, or roadmap_describe({ op }) for one.`
2831
+ );
2832
+ }
2833
+ // Accept BOTH the documented nested shape { op, args: {...} } AND the flat
2834
+ // shape { op, ...fields } that LLM clients routinely emit when they hoist
2835
+ // scalar arguments to the top level. Flat siblings fill in keys the nested
2836
+ // object omits; on conflict the nested (documented) shape wins. This means
2837
+ // a top-level workspaceId/dryRun is never silently dropped — which would
2838
+ // mis-target a write or turn a validate-only call into a real one. No op
2839
+ // uses 'op'/'args' as an argument name, so stripping them here is safe.
2840
+ const { op: _op, args: nested, ...flat } = args ?? {};
2841
+ let inner;
2842
+ if (nested == null) {
2843
+ inner = flat; // flat shape (or no args at all)
2844
+ } else if (typeof nested === "object" && !Array.isArray(nested)) {
2845
+ inner = { ...flat, ...nested };
2846
+ } else if (typeof nested === "string") {
2847
+ // Some clients JSON-encode the args object into a string. Parse and
2848
+ // merge when it's an object; otherwise surface the real cause rather
2849
+ // than silently dropping it (which produced a misleading downstream
2850
+ // "X is required" from the inner op).
2851
+ let parsed;
2852
+ try {
2853
+ parsed = JSON.parse(nested);
2854
+ } catch {
2855
+ parsed = undefined;
2856
+ }
2857
+ if (parsed && typeof parsed === "object" && !Array.isArray(parsed)) {
2858
+ inner = { ...flat, ...parsed };
2859
+ } else {
2860
+ return errorResult(
2861
+ `roadmap 'args' must be an object — got a string that isn't a JSON object. Call as roadmap({ op: "${op}", args: { ... } }) (or hoist the fields to the top level).`
2862
+ );
2863
+ }
2864
+ } else {
2865
+ return errorResult(
2866
+ `roadmap 'args' must be an object — got ${Array.isArray(nested) ? "an array" : typeof nested}. Call as roadmap({ op: "${op}", args: { ... } }).`
2867
+ );
2868
+ }
2869
+ return callTool(op, inner);
2870
+ }
2871
+
2361
2872
  // Each tool may override the workspace via args.workspaceId. The
2362
2873
  // projection is workspace-scoped, so we pass that through to the
2363
2874
  // read. Tools that need to know the resolved id later (write paths,
@@ -2619,6 +3130,27 @@ async function callTool(name, args) {
2619
3130
  );
2620
3131
  return rubricMissingResult(name);
2621
3132
  }
3133
+ // Repo-link gate. If the agent is in a git repo that isn't mapped to a
3134
+ // workspace, this write would silently land on the install's env
3135
+ // default. Block once with the link_repo fix (or the workspaceId escape
3136
+ // hatch) so the mapping gets done instead of writes scattering onto the
3137
+ // wrong workspace. Repo-aware so a multi-repo chat is never bricked —
3138
+ // see repoLinkGate / repoUnmappedResult for the full escape-hatch list.
3139
+ {
3140
+ const { source: wsSource } = resolveWorkspaceWithSource(
3141
+ args?.workspaceId
3142
+ );
3143
+ const linkBlock = await repoLinkGate(name, args, wsSource, wsId);
3144
+ if (linkBlock) {
3145
+ session.mutatorBlocks += 1;
3146
+ recordTelemetry(
3147
+ "mutator_blocked_repo_unmapped",
3148
+ { tool: name, targetId },
3149
+ wsId
3150
+ );
3151
+ return linkBlock;
3152
+ }
3153
+ }
2622
3154
  // Per-tool discovery gates. Block propose_theme until the agent
2623
3155
  // has actually inspected the existing theme catalogue, and
2624
3156
  // propose_capability until they've ranked existing caps for fit.
@@ -2635,8 +3167,8 @@ async function callTool(name, args) {
2635
3167
  );
2636
3168
  return discoveryMissingResult(
2637
3169
  name,
2638
- 'suggest_theme_for({ description: "<the work you are about to propose>" })',
2639
- "Rank existing themes by relevance before proposing a new one — themes are years-stable, duplicates are the most common failure mode. Any returned top score >0.4 means an existing theme is a sensible home; re-use it. list_themes() or get_roadmap_snapshot() also satisfy this gate if you want the full catalogue."
3170
+ opCall("suggest_theme_for", '{ description: "<the work you are about to propose>" }'),
3171
+ "Rank existing themes by relevance before proposing a new one — themes are years-stable, duplicates are the most common failure mode. Any returned top score >0.4 means an existing theme is a sensible home; re-use it. The list_themes or get_roadmap_snapshot ops also satisfy this gate if you want the full catalogue."
2640
3172
  );
2641
3173
  }
2642
3174
  if (
@@ -2655,7 +3187,7 @@ async function callTool(name, args) {
2655
3187
  );
2656
3188
  return discoveryMissingResult(
2657
3189
  name,
2658
- 'suggest_capability_for({ description: "<the work you are about to propose>" })',
3190
+ opCall("suggest_capability_for", '{ description: "<the work you are about to propose>" }'),
2659
3191
  "Rank existing capabilities by relevance before proposing a new one. If any score is >0.4, attach tasks there instead."
2660
3192
  );
2661
3193
  }
@@ -2809,7 +3341,7 @@ async function callTool(name, args) {
2809
3341
  _meta: {
2810
3342
  roadmapper: {
2811
3343
  reminder:
2812
- "Rubric loaded. You can now safely call propose_task, propose_capability, propose_theme, submit_acceptance_grades, link_pr.",
3344
+ 'Rubric loaded. You can now safely run the write ops via roadmap({ op, args }) — e.g. roadmap({ op: "propose_task", args: {...} }), propose_capability, propose_theme, submit_acceptance_grades, link_pr.',
2813
3345
  },
2814
3346
  },
2815
3347
  });
@@ -2836,6 +3368,8 @@ async function callTool(name, args) {
2836
3368
  }
2837
3369
  case "propose_task":
2838
3370
  return proposeTask(args, projected, wsId);
3371
+ case "propose_tasks":
3372
+ return proposeTasks(args, projected, wsId);
2839
3373
  case "propose_theme":
2840
3374
  return proposeTheme(args, projected, wsId);
2841
3375
  case "propose_capability":
@@ -2899,6 +3433,11 @@ async function callTool(name, args) {
2899
3433
  // propose_capability gate (the natural next step on a gap).
2900
3434
  session.capsDiscoveredAt = Date.now();
2901
3435
  return detectCapabilityGaps(args, projected);
3436
+ case "detect_theme_sprawl":
3437
+ // Enumerates every active theme, so it satisfies the propose_theme
3438
+ // discovery gate (consolidating or proposing is the natural next step).
3439
+ session.themesListedAt = Date.now();
3440
+ return detectThemeSprawl(args, projected);
2902
3441
  default:
2903
3442
  return errorResult(`Unknown tool: ${name}`);
2904
3443
  }
@@ -2981,12 +3520,12 @@ async function proposeTask(args, projected, wsId) {
2981
3520
  if (best && best.score > 0.2 && best.score > chosenScore + 0.1) {
2982
3521
  return (
2983
3522
  base +
2984
- `The task text fits ${best.id} (${best.name}) noticeably better (score ${best.score.toFixed(2)}) than the chosen ${cap.id} (${chosenScore.toFixed(2)}). If that's the right home, move_task it there.`
3523
+ `The task text fits ${best.id} (${best.name}) noticeably better (score ${best.score.toFixed(2)}) than the chosen ${cap.id} (${chosenScore.toFixed(2)}). If that's the right home, move it there with ${opCall("move_task")}.`
2985
3524
  );
2986
3525
  }
2987
3526
  return (
2988
3527
  base +
2989
- "If you're confident in the parent, ignore this; otherwise call suggest_capability_for({ taskId }) to confirm."
3528
+ `If you're confident in the parent, ignore this; otherwise call ${opCall("suggest_capability_for", "{ taskId }")} to confirm.`
2990
3529
  );
2991
3530
  }
2992
3531
 
@@ -3087,44 +3626,263 @@ async function proposeTask(args, projected, wsId) {
3087
3626
  );
3088
3627
  }
3089
3628
 
3090
- async function proposeTheme(args, _projected /* unused — themes carry no parent */, wsId) {
3091
- const nameErr = validateName(args.name, 6);
3092
- if (nameErr) return errorResult(nameErr);
/**
 * Shared field validation for a single task spec (used by the bulk
 * propose_tasks path). Mirrors the inline checks in proposeTask so both
 * paths reject identically.
 *
 * Checks, in order: the spec is a plain object, title (validateName with a
 * minimum of 5), effort present and in VALID_EFFORTS, optional priority and
 * kind in their respective sets, and optional expectedPRs / expectedScope
 * being numbers greater than zero. The first failing check wins.
 *
 * @param {unknown} t One element of the caller-supplied `tasks` array.
 * @returns {string|null} The rejection reason, or null when the spec passes.
 */
function taskSpecError(t) {
  // A JSON array may legally contain null, scalars or nested arrays. Report
  // those as a validation error instead of throwing a TypeError on t.title.
  if (t === null || typeof t !== "object" || Array.isArray(t))
    return "each task must be an object with at least a title and an effort.";
  const titleErr = validateName(t.title, 5);
  if (titleErr) return titleErr;
  if (!t.effort)
    return "effort is required (one of XS, S, M, L, XL) on every task in the batch.";
  if (!VALID_EFFORTS.has(t.effort)) return `Invalid effort ${t.effort}.`;
  if (t.priority && !VALID_PRIORITIES.has(t.priority))
    return `Invalid priority ${t.priority}.`;
  if (t.kind && !VALID_KINDS.has(t.kind)) return `Invalid kind ${t.kind}.`;
  if (t.expectedPRs !== undefined && (typeof t.expectedPRs !== "number" || t.expectedPRs <= 0))
    return `expectedPRs must be a positive number, got ${t.expectedPRs}.`;
  if (t.expectedScope !== undefined && (typeof t.expectedScope !== "number" || t.expectedScope <= 0))
    return `expectedScope must be a positive number, got ${t.expectedScope}.`;
  return null;
}
3093
3649
 
3094
- const name = cleanText(args.name);
3095
- const id = randomThemeId();
3096
- const theme = {
/**
 * Build a task record from a spec + its pre-minted id. Mirrors the object
 * proposeTask constructs (minus the per-call skip warning).
 *
 * The record starts today (todayISO) and targets
 * max(1, ceil(EFFORT_DAYS[effort])) days later; originalTarget is set to the
 * same date. status is always "planned", progress 0, authorKind "agent".
 * priority defaults to "P2", kind to "feature", team to the capability's
 * team (or ""). expectedPRs / expectedScope are only present on the record
 * when the spec supplied them.
 *
 * @param {object} t   Task spec that has already passed validation.
 * @param {object} cap Parent capability; `id` and `team` are read.
 * @param {string} id  Pre-minted task id.
 * @returns {object} The record passed to the propose_task RPC as p_task.
 */
function buildTaskRecord(t, cap, id) {
  const start = todayISO();
  const target = addDays(start, Math.max(1, Math.ceil(EFFORT_DAYS[t.effort])));
  return {
    id,
    capabilityId: cap.id,
    title: cleanText(t.title),
    summary: cleanText(t.summary),
    status: "planned",
    priority: t.priority ?? "P2",
    effort: t.effort,
    kind: t.kind ?? "feature",
    start,
    target,
    originalTarget: target,
    progress: 0,
    // Only strings can be trimmed. A non-string owner (e.g. a number) used to
    // throw "trim is not a function" here; treat it as "no owner" instead.
    owner: typeof t.owner === "string" ? t.owner.trim() : "",
    team: cap.team ?? "",
    tags: [],
    prs: [],
    links: {},
    acceptance: t.acceptance ?? [],
    dependsOn: t.dependsOn ?? [],
    authorKind: "agent",
    ...(t.expectedPRs !== undefined ? { expectedPRs: t.expectedPRs } : {}),
    ...(t.expectedScope !== undefined ? { expectedScope: t.expectedScope } : {}),
  };
}
3680
+
/**
 * propose_tasks — file MANY tasks under one capability in a single
 * call. This is the token-efficient path: instead of N round trips
 * (each with its own tool-call framing + result), the agent sends the
 * whole batch once and gets back one compact array of {id, title}.
 *
 * Intra-batch dependencies: a task may carry a `ref` (a caller-chosen
 * alias) and other tasks may list that ref in `dependsOn`. We mint all
 * ids first, then rewrite any dependsOn entry that matches a sibling's
 * ref to the real TK id. Refs are compared trimmed on both sides.
 * dependsOn entries that aren't a known ref pass through unchanged
 * (assumed to be existing TK ids).
 *
 * Failure model (two tiers):
 *  - Validation is all-or-nothing. If any spec is not an object or fails
 *    taskSpecError, the call returns `tasks[i]: <reason>` and nothing is
 *    written.
 *  - Writes are per-row. Once validation passes, each row is built and
 *    sent on its own; a row whose build or RPC throws is reported as
 *    { ok: false, error } and the remaining rows are still attempted.
 *
 * @param {object} args      { capabilityId, tasks, dryRun? }; each task may
 *                           carry ref, dependsOn and idempotencyKey.
 * @param {object} projected Workspace projection; `capabilities` is read.
 * @param {string} wsId      Workspace id forwarded to the RPC.
 * @returns A textResult with the batch summary, or an errorResult when the
 *          capability is unknown or the batch fails validation.
 */
async function proposeTasks(args, projected, wsId) {
  const cap = projected.capabilities.find((c) => c.id === args.capabilityId);
  if (!cap) return errorResult(`Capability ${args.capabilityId} not found.`);
  const specs = Array.isArray(args.tasks) ? args.tasks : null;
  if (!specs || specs.length === 0)
    return errorResult("tasks must be a non-empty array of task specs.");
  if (specs.length > 100)
    return errorResult(`Too many tasks (${specs.length}); cap is 100 per call.`);

  // Validate everything first; a structural error in any row fails the
  // whole call (cheaper to fix the batch than to half-apply it). This runs
  // before any spec property is read, so a null / scalar element produces
  // a tasks[i] message rather than a TypeError. RPC errors below are the
  // per-row, partial-success case.
  for (let i = 0; i < specs.length; i++) {
    const spec = specs[i];
    if (spec === null || typeof spec !== "object" || Array.isArray(spec))
      return errorResult(`tasks[${i}]: each task must be an object.`);
    const err = taskSpecError(spec);
    if (err) return errorResult(`tasks[${i}]: ${err}`);
  }

  // Mint ids up front so intra-batch dependsOn refs can resolve.
  const minted = specs.map((spec) => ({ spec, id: randomTaskId() }));
  const refToId = new Map();
  for (const { spec, id } of minted) {
    if (typeof spec.ref === "string" && spec.ref.trim())
      refToId.set(spec.ref.trim(), id);
  }
  // Refs are stored trimmed, so trim the lookup key too; anything that is
  // not a known ref is passed through untouched.
  const resolveDeps = (deps) =>
    Array.isArray(deps)
      ? deps.map((d) => (typeof d === "string" ? (refToId.get(d.trim()) ?? d) : d))
      : [];

  if (args.dryRun) {
    return textResult(
      JSON.stringify({
        ok: true,
        dryRun: true,
        capabilityId: cap.id,
        wouldCreate: minted.map(({ spec, id }) => ({
          id,
          title: cleanText(spec.title),
          effort: spec.effort,
        })),
        message: `Would create ${minted.length} task(s) under ${cap.id} (${cap.name}). No records written.`,
      })
    );
  }

  const results = [];
  let created = 0;
  // Rows are written one at a time, in input order.
  for (const { spec, id } of minted) {
    const title = cleanText(spec.title);
    try {
      // Built inside the try so a throw while assembling one record is a
      // per-row failure instead of aborting the call after earlier rows
      // have already been written.
      const record = buildTaskRecord(
        { ...spec, dependsOn: resolveDeps(spec.dependsOn) },
        cap,
        id
      );
      const rpcResult = await rpcCall("propose_task", {
        p_workspace_id: wsId,
        p_task: record,
        p_idempotency_key: spec.idempotencyKey ?? null,
      });
      const stored = rpcResult?.task ?? record;
      const idempotent = rpcResult?.idempotent === true;
      if (!idempotent) created += 1;
      results.push({ ok: true, id: stored.id, title: record.title, idempotent });
    } catch (e) {
      results.push({ ok: false, title, error: e?.message ?? String(e) });
    }
  }

  const failed = results.filter((r) => !r.ok).length;
  return textResult(
    JSON.stringify({
      ok: failed === 0,
      capabilityId: cap.id,
      created,
      idempotent: results.filter((r) => r.ok && r.idempotent).length,
      failed,
      tasks: results,
      message:
        `Filed ${created} new task(s) under ${cap.id} (${cap.name})` +
        (failed ? `; ${failed} failed (see tasks[].error).` : "."),
    })
  );
}
3778
+
3779
+ async function proposeTheme(args, projected, wsId) {
3780
+ const nameErr = validateName(args.name, 6);
3781
+ if (nameErr) return errorResult(nameErr);
3782
+
3783
+ const name = cleanText(args.name);
3784
+ const description = cleanText(args.description);
3785
+
3786
+ // ── Sprawl control (always on, independent of autonomy) ──────────
3787
+ // Refuse a near-duplicate of an existing active theme. This is the
3788
+ // server-side replacement for the human gate: instead of asking a
3789
+ // person every time, we only stop the agent when it's about to mint
3790
+ // a theme that overlaps one that already exists. Reuse/update beats
3791
+ // a sibling. force:true is the deliberate override.
3792
+ const activeThemes = (projected?.themes ?? []).filter((t) => !t.archived);
3793
+ const proposedTokens = tokenize(`${name} ${description ?? ""}`);
3794
+ let nearest = null;
3795
+ let nearestScore = 0;
3796
+ for (const t of activeThemes) {
3797
+ const s = jaccardScore(proposedTokens, tokenize(`${t.name} ${t.description ?? ""}`));
3798
+ if (s > nearestScore) {
3799
+ nearestScore = s;
3800
+ nearest = t;
3801
+ }
3802
+ }
3803
+ if (nearest && nearestScore >= THEME_SPRAWL_BLOCK && args.force !== true) {
3804
+ return textResult(
3805
+ JSON.stringify(
3806
+ {
3807
+ error: "too_similar",
3808
+ message:
3809
+ `"${name}" overlaps the existing theme ${nearest.id} (${nearest.name}) ` +
3810
+ `at ${nearestScore.toFixed(2)} (block bar ${THEME_SPRAWL_BLOCK}). Themes are the ` +
3811
+ "small, years-stable top tier — a near-duplicate fragments the strategic view. " +
3812
+ "Reuse it: file your work as a capability under it (the propose_capability op with " +
3813
+ `pillarId: "${nearest.id}"), or broaden its scope with the update_theme op. If this is ` +
3814
+ "genuinely a distinct strategic pillar, retry with force:true.",
3815
+ nearestTheme: { id: nearest.id, name: nearest.name, score: Number(nearestScore.toFixed(3)) },
3816
+ fix: opCall("propose_capability", `{ pillarId: "${nearest.id}", ... }`),
3817
+ },
3818
+ null,
3819
+ 2
3820
+ ),
3821
+ { isError: true }
3822
+ );
3823
+ }
3824
+
3825
+ // ── Autonomy gate ────────────────────────────────────────────────
3826
+ // Default ON: agents create themes without confirmation. A workspace
3827
+ // that flips agent_theme_autonomy OFF re-imposes a human checkpoint —
3828
+ // propose_theme then refuses until the caller passes confirm:true
3829
+ // (the agent's signal that it surfaced the new theme to the user and
3830
+ // got an explicit yes). The sprawl block above still applies either way.
3831
+ const autonomy = projected?.settings?.agentThemeAutonomy !== false;
3832
+ if (!autonomy && args.confirm !== true && !args.dryRun) {
3833
+ return textResult(
3834
+ JSON.stringify(
3835
+ {
3836
+ error: "confirmation_required",
3837
+ message:
3838
+ `This workspace has agent theme-autonomy turned OFF, so a new theme ("${name}") ` +
3839
+ "needs explicit human sign-off. Surface the proposed theme to the user; if they " +
3840
+ "approve, retry with confirm:true. Otherwise file the work under an existing theme.",
3841
+ ...(nearest
3842
+ ? { closestExisting: { id: nearest.id, name: nearest.name, score: Number(nearestScore.toFixed(3)) } }
3843
+ : {}),
3844
+ fix: opCall("propose_theme", "{ ...same args, confirm: true }"),
3845
+ },
3846
+ null,
3847
+ 2
3848
+ ),
3849
+ { isError: true }
3850
+ );
3851
+ }
3852
+
3853
+ const id = randomThemeId();
3854
+ const theme = {
3855
+ id,
3856
+ name,
3857
+ description,
3858
+ color: args.color || "#6366f1", // brand-indigo default; user can change
3859
+ ...(typeof args.targetRoi === "number" ? { targetRoi: args.targetRoi } : {}),
3860
+ };
3861
+
3862
+ if (args.dryRun) {
3863
+ return textResult(
3864
+ JSON.stringify(
3865
+ {
3866
+ ok: true,
3867
+ dryRun: true,
3868
+ wouldCreate: theme,
3869
+ warnings: [],
3870
+ message: `Would create theme ${id} (${name}). No record written.`,
3871
+ },
3872
+ null,
3873
+ 2
3874
+ )
3875
+ );
3876
+ }
3877
+
3878
+ let rpcResult;
3879
+ try {
3880
+ rpcResult = await rpcCall("propose_theme", {
3881
+ p_workspace_id: wsId,
3882
+ p_theme: theme,
3883
+ p_idempotency_key: args.idempotencyKey ?? null,
3884
+ });
3885
+ } catch (e) {
3128
3886
  return errorResult(e.message);
3129
3887
  }
3130
3888
  const stored = rpcResult?.theme ?? theme;
@@ -3159,7 +3917,7 @@ async function proposeCapability(args, projected, wsId) {
3159
3917
  const theme = projected.themes.find((t) => t.id === pillarId);
3160
3918
  if (!theme) {
3161
3919
  return errorResult(
3162
- `pillarId ${pillarId} doesn't match any known theme. Call list_themes first.`
3920
+ `pillarId ${pillarId} doesn't match any known theme. Run ${opCall("list_themes")} first.`
3163
3921
  );
3164
3922
  }
3165
3923
  if (typeof args.impact === "number" && !VALID_IMPACTS.has(args.impact)) {
@@ -3399,7 +4157,7 @@ function suggestCapabilityFor(args, projected) {
3399
4157
  roadmapper: {
3400
4158
  reminder:
3401
4159
  ranked.length === 0
3402
- ? "No existing capability is a sensible parent. Before calling propose_capability, verify with the user that a brand-new capability is warranted — capabilities are quarterly bets, not single tasks."
4160
+ ? `No existing capability is a sensible parent. Before ${opCall("propose_capability")}, verify with the user that a brand-new capability is warranted — capabilities are quarterly bets, not single tasks.`
3403
4161
  : "No strong match (top score < 0.4). If none of the listed capabilities fit, ask the user before calling propose_capability — the top match is often closer than it scores.",
3404
4162
  },
3405
4163
  },
@@ -3461,18 +4219,25 @@ function suggestThemeFor(args, projected) {
3461
4219
  score: Number(score.toFixed(3)),
3462
4220
  }));
3463
4221
 
3464
- // Reminder when nothing matches strongly — theme creation is the
3465
- // years-stable decision, so even a weak match deserves a pause.
4222
+ // Autonomy-aware guidance. With agent_theme_autonomy ON (default),
4223
+ // the agent may create a theme on a weak/no match WITHOUT asking —
4224
+ // the server's too_similar block in propose_theme is the sprawl
4225
+ // guard, not a human checkpoint. With it OFF, fall back to the old
4226
+ // "confirm with the user first" framing.
4227
+ const autonomy = projected?.settings?.agentThemeAutonomy !== false;
3466
4228
  const topScore = ranked[0]?.score ?? 0;
3467
4229
  const meta =
3468
4230
  topScore < 0.4
3469
4231
  ? {
3470
4232
  _meta: {
3471
4233
  roadmapper: {
3472
- reminder:
3473
- ranked.length === 0
3474
- ? "No existing theme overlaps your description. Themes are years-stable, so creating a new one is a big decision verify with the user that this represents a genuinely new strategic direction, not a reframing of an existing bet, before calling propose_theme."
3475
- : "No strong match (top score < 0.4). Re-using a 'close-enough' theme is almost always the right move; ask the user before calling propose_theme.",
4234
+ reminder: autonomy
4235
+ ? ranked.length === 0
4236
+ ? `No existing theme overlaps. Theme-autonomy is ON, so you may run ${opCall("propose_theme")} directly if this is a genuinely new strategic pillar the server will refuse it only if it's a near-duplicate of an existing theme.`
4237
+ : "No strong match (top score < 0.4). Prefer the closest existing theme if it fits; otherwise propose_theme is fine (autonomy is ON, sprawl is guarded server-side)."
4238
+ : ranked.length === 0
4239
+ ? "No existing theme overlaps. Theme-autonomy is OFF for this workspace — verify with the user that this is a genuinely new strategic direction before propose_theme, and pass confirm:true."
4240
+ : "No strong match (top score < 0.4). Re-using a 'close-enough' theme is almost always right; theme-autonomy is OFF, so confirm with the user before propose_theme.",
3476
4241
  },
3477
4242
  },
3478
4243
  }
@@ -3483,13 +4248,18 @@ function suggestThemeFor(args, projected) {
3483
4248
  {
3484
4249
  ok: true,
3485
4250
  query: desc,
4251
+ themeAutonomy: autonomy,
3486
4252
  matches: ranked,
3487
4253
  hint:
3488
4254
  ranked.length === 0
3489
- ? "No existing theme overlaps. propose_theme MAY be appropriate, but only with explicit user confirmation that a new strategic direction is intended — themes are years-stable, not per-feature."
4255
+ ? autonomy
4256
+ ? "No existing theme overlaps. propose_theme is appropriate if this is a distinct strategic pillar — autonomy is on; the server blocks only near-duplicates."
4257
+ : "No existing theme overlaps. propose_theme needs explicit user confirmation (autonomy off): pass confirm:true once the user approves."
3490
4258
  : ranked[0].score > 0.4
3491
4259
  ? `Strong match: ${ranked[0].id} (${ranked[0].name}). Attach capabilities under this theme instead of creating a new one.`
3492
- : `Weak overlap. The top match is often closer than it scores; prefer that over creating a new theme unless the user explicitly asks for a new strategic direction.`,
4260
+ : autonomy
4261
+ ? `Weak overlap. The top match is often closer than it scores — prefer it if it fits; otherwise propose_theme is fine (sprawl guarded server-side).`
4262
+ : `Weak overlap. Prefer the top match over a new theme unless the user explicitly asks for a new strategic direction (autonomy off).`,
3493
4263
  },
3494
4264
  null,
3495
4265
  2
@@ -4132,7 +4902,7 @@ function detectCapabilityGaps(args, projected) {
4132
4902
  roadmapper: {
4133
4903
  reminder:
4134
4904
  `${shaped.length} capability gap(s) detected — clusters of uncategorized work no existing bet covers. ` +
4135
- "Each is a CANDIDATE for propose_capability (confirm with the user — capabilities are quarterly bets, not auto-created), then move_tasks the members under it.",
4905
+ `Each is a CANDIDATE for ${opCall("propose_capability")} (confirm with the user — capabilities are quarterly bets, not auto-created), then ${opCall("move_tasks")} the members under it.`,
4136
4906
  },
4137
4907
  },
4138
4908
  }
@@ -4152,6 +4922,86 @@ function detectCapabilityGaps(args, projected) {
4152
4922
  );
4153
4923
  }
4154
4924
 
4925
/**
 * detect_theme_sprawl — the consolidation companion to
 * agent_theme_autonomy. Autonomy lets agents mint themes freely (with
 * the per-create too_similar block as a guard); over time, two themes
 * created from different sessions can still drift toward overlap. This
 * surfaces those pairs so a human can merge them.
 *
 * O(n^2) over the scanned themes — fine; themes are the small top tier
 * (tens, not thousands). Deterministic: themes are sorted by id before
 * pairing and nothing here is randomized.
 *
 * @param {object} [args]
 * @param {number} [args.threshold] Minimum similarity score for a pair to
 *   be reported. Finite numbers are clamped to [0, 1]; anything else falls
 *   back to THEME_SPRAWL_WARN.
 * @param {boolean} [args.includeArchived] Only a literal `true` includes
 *   archived themes in the scan.
 * @param {object} projected Projected workspace model; reads
 *   `projected.themes` and `projected.capabilities` (both optional).
 * @returns {*} Whatever `textResult` returns for the JSON report
 *   (`themesScanned`, `threshold`, `sprawlPairCount`, `pairs`), with a
 *   `_meta.roadmapper.reminder` attached only when at least one pair is found.
 */
function detectThemeSprawl(args, projected) {
  const threshold =
    typeof args?.threshold === "number" && Number.isFinite(args.threshold)
      ? Math.min(1, Math.max(0, args.threshold))
      : THEME_SPRAWL_WARN;
  const includeArchived = args?.includeArchived === true;

  // Copy before sorting so the caller's projected.themes order is untouched.
  const themes = (projected.themes ?? [])
    .filter((t) => includeArchived || !t.archived)
    .slice()
    .sort((a, b) => String(a.id).localeCompare(String(b.id)));

  // Count non-archived capabilities per parent theme (keyed by pillarId).
  const capCountByTheme = new Map();
  for (const c of projected.capabilities ?? []) {
    if (c.archived) continue;
    capCountByTheme.set(c.pillarId, (capCountByTheme.get(c.pillarId) ?? 0) + 1);
  }

  // Tokenize each theme once, up front, so the pair loop only compares.
  const toks = themes.map((t) => tokenize(`${t.name} ${t.description ?? ""}`));
  const pairs = [];
  for (let i = 0; i < themes.length; i++) {
    for (let j = i + 1; j < themes.length; j++) {
      const score = jaccardScore(toks[i], toks[j]);
      if (score < threshold) continue;
      // Suggest merging the lighter theme (fewer capabilities) INTO the
      // heavier one — the smaller bet is the cheaper thing to re-parent.
      // On a tie the lower-id theme (a) is kept.
      const a = themes[i];
      const b = themes[j];
      const aCaps = capCountByTheme.get(a.id) ?? 0;
      const bCaps = capCountByTheme.get(b.id) ?? 0;
      const [keep, fold] = aCaps >= bCaps ? [a, b] : [b, a];
      const foldCaps = keep === a ? bCaps : aCaps;
      pairs.push({
        score: Number(score.toFixed(3)),
        themes: [
          { id: a.id, name: a.name, capabilities: aCaps },
          { id: b.id, name: b.name, capabilities: bCaps },
        ],
        suggestion:
          foldCaps > 0
            ? `Likely duplicate. To consolidate: move_capabilities the ${foldCaps} capabilit${foldCaps === 1 ? "y" : "ies"} under ${fold.id} (${fold.name}) to ${keep.id} (${keep.name}), then archive_theme ${fold.id}.`
            : `Likely duplicate. ${fold.id} (${fold.name}) has no capabilities — archive_theme it and keep ${keep.id} (${keep.name}).`,
      });
    }
  }
  // Strongest overlap first; Array.prototype.sort is stable, so equal
  // scores keep the id-ordered pairing sequence from the loops above.
  pairs.sort((x, y) => y.score - x.score);

  const meta =
    pairs.length > 0
      ? {
          _meta: {
            roadmapper: {
              reminder:
                `${pairs.length} theme pair(s) overlap at/above ${threshold} — candidate duplicates. ` +
                "Themes are the years-stable top tier; consolidating keeps the strategic view legible. A human should confirm each merge.",
            },
          },
        }
      : undefined;

  // Pretty-print with 2-space indent, matching the sibling read ops and the
  // selftest, which looks for `"sprawlPairCount": 1` (space after the colon).
  return textResult(
    JSON.stringify(
      {
        themesScanned: themes.length,
        threshold,
        sprawlPairCount: pairs.length,
        pairs,
      },
      null,
      2
    ),
    meta
  );
}
5004
+
4155
5005
  async function submitAcceptanceGrades(args, projected, wsId) {
4156
5006
  const task = projected.tasks.find((t) => t.id === args.taskId);
4157
5007
  if (!task) return errorResult(`Task ${args.taskId} not found.`);
@@ -4233,7 +5083,7 @@ function buildReminder(toolName, projected) {
4233
5083
  toolName === "list_themes")
4234
5084
  ) {
4235
5085
  reminders.push(
4236
- "Call get_agents_md before any propose_* / submit_acceptance_grades / link_pr call those tools refuse without it."
5086
+ `Call ${opCall("get_agents_md")} before any write op (propose_* / submit_acceptance_grades / link_pr)they refuse without it.`
4237
5087
  );
4238
5088
  }
4239
5089
  // Tasks with merged PRs but no acceptance grades = ungraded
@@ -4252,7 +5102,7 @@ function buildReminder(toolName, projected) {
4252
5102
  reminders.push(
4253
5103
  `${ungraded.length} delivered task${ungraded.length === 1 ? "" : "s"} ` +
4254
5104
  `have merged PRs without submitted acceptance grades. ` +
4255
- `Call submit_acceptance_grades for: ${ids}${more}.`
5105
+ `Call ${opCall("submit_acceptance_grades")} for: ${ids}${more}.`
4256
5106
  );
4257
5107
  }
4258
5108
  }
@@ -4299,14 +5149,14 @@ const RESOURCES = [
4299
5149
  uri: "roadmapper://capabilities/active",
4300
5150
  name: "Active capabilities (snapshot)",
4301
5151
  description:
4302
- "Live list of non-delivered capabilities for the env-default workspace. Read this before propose_task or propose_capability to find the right parent. Note: MCP resources don't accept arguments, so this always reads SUPABASE_WORKSPACE_ID's workspace — use list_capabilities({ workspaceId }) for cross-workspace reads.",
5152
+ `Live list of non-delivered capabilities for the env-default workspace. Read this before proposing tasks or capabilities to find the right parent. Note: MCP resources don't accept arguments, so this always reads SUPABASE_WORKSPACE_ID's workspace — use roadmap({ op: "list_capabilities", args: { workspaceId } }) for cross-workspace reads.`,
4303
5153
  mimeType: "application/json",
4304
5154
  },
4305
5155
  {
4306
5156
  uri: "roadmapper://tasks/open",
4307
5157
  name: "Open tasks (snapshot)",
4308
5158
  description:
4309
- "Live list of in_progress + planned tasks for the env-default workspace. Same workspaceId caveat as roadmapper://capabilities/active — use list_tasks({ workspaceId }) for cross-workspace reads.",
5159
+ `Live list of in_progress + planned tasks for the env-default workspace. Same workspaceId caveat as roadmapper://capabilities/active — use roadmap({ op: "list_tasks", args: { workspaceId } }) for cross-workspace reads.`,
4310
5160
  mimeType: "application/json",
4311
5161
  },
4312
5162
  ];
@@ -4437,33 +5287,33 @@ function renderPrompt(name, args) {
4437
5287
  case "plan-feature":
4438
5288
  return (
4439
5289
  `Plan a feature: "${args.description ?? "(no description provided)"}"\n\n` +
4440
- "Follow this flow exactly:\n" +
4441
- "1. Call get_agents_md (or read roadmapper://rubric) to load the rubric for this session.\n" +
4442
- "2. Call suggest_capability_for with the description above. Read every returned candidate's outcome before deciding.\n" +
4443
- "3. If a returned candidate scores > 0.4 OR its outcome maps to what we're building, propose tasks under it via propose_task. Each task MUST include acceptance criteria per the rubric.\n" +
4444
- "4. If nothing fits, STOP and ask the user before calling propose_capability — capabilities are quarterly bets, not single tasks.\n" +
5290
+ "Every operation runs through one tool: roadmap({ op, args }). Follow this flow exactly:\n" +
5291
+ '1. roadmap({ op: "get_agents_md" }) (or read the roadmapper://rubric resource) to load the rubric for this session.\n' +
5292
+ '2. roadmap({ op: "suggest_capability_for", args: { description } }) with the description above. Read every returned candidate\'s outcome before deciding.\n' +
5293
+ '3. If a returned candidate scores > 0.4 OR its outcome maps to what we\'re building, propose tasks under it via roadmap({ op: "propose_tasks", args: { capabilityId, tasks: [...] } }). Each task MUST include acceptance criteria per the rubric.\n' +
5294
+ '4. If nothing fits, STOP and ask the user before roadmap({ op: "propose_capability", args }) — capabilities are quarterly bets, not single tasks.\n' +
4445
5295
  "5. After tasks are proposed, summarize: capabilityId chosen, task ids created, anything skipped and why."
4446
5296
  );
4447
5297
  case "close-task":
4448
5298
  return (
4449
5299
  `Close task ${args.task_id ?? "(missing task_id)"}.\n\n` +
4450
- "Follow this flow exactly:\n" +
4451
- "1. Call get_agents_md (or read roadmapper://rubric) to load grading dimensions.\n" +
4452
- `2. Call get_task({ id: "${args.task_id ?? ""}" }) and read every acceptance criterion.\n` +
5300
+ "Every operation runs through one tool: roadmap({ op, args }). Follow this flow exactly:\n" +
5301
+ '1. roadmap({ op: "get_agents_md" }) (or read the roadmapper://rubric resource) to load grading dimensions.\n' +
5302
+ `2. roadmap({ op: "get_task", args: { id: "${args.task_id ?? ""}" } }) and read every acceptance criterion.\n` +
4453
5303
  "3. For each criterion, decide pass/fail. Fabricated passes destroy this signal — only mark pass if you verified.\n" +
4454
- "4. Call submit_acceptance_grades with the per-index results. Include a note on any fail.\n" +
5304
+ '4. roadmap({ op: "submit_acceptance_grades", args: { taskId, grades } }) with the per-index results. Include a note on any fail.\n' +
4455
5305
  (args.pr_url
4456
- ? `5. Call link_pr to attach ${args.pr_url} to the task.\n`
4457
- : "5. If you opened a PR, call link_pr to attach it.\n") +
5306
+ ? `5. roadmap({ op: "link_pr", args: {...} }) to attach ${args.pr_url} to the task.\n`
5307
+ : '5. If you opened a PR, roadmap({ op: "link_pr", args: {...} }) to attach it.\n') +
4458
5308
  "6. Stamp Roadmapper-Task: " +
4459
5309
  (args.task_id ?? "TK-NNNNNN") +
4460
5310
  " in the PR body so the webhook routes future events back here."
4461
5311
  );
4462
5312
  case "weekly-review":
4463
5313
  return (
4464
- "Run a structured roadmap review.\n\n" +
4465
- "1. Call get_agents_md to load the rubric (or confirm rubric is current).\n" +
4466
- "2. Call get_roadmap_snapshot for the canonical model. Note any _meta reminders in the response.\n" +
5314
+ "Run a structured roadmap review. Every operation runs through one tool: roadmap({ op, args }).\n\n" +
5315
+ '1. roadmap({ op: "get_agents_md" }) to load the rubric (or confirm rubric is current).\n' +
5316
+ '2. roadmap({ op: "get_roadmap_snapshot" }) for the canonical model. Note any _meta reminders in the response.\n' +
4467
5317
  "3. For each active capability, scan: are open tasks aging? Are any without acceptance criteria? Are there delivered tasks without acceptance grades?\n" +
4468
5318
  "4. List capabilities whose outcomes are no longer falsifiable or whose tasks all delivered (close them or pivot).\n" +
4469
5319
  "5. Report: ungraded deliveries, stale capabilities, capabilities ready to close, suggested next bets."
@@ -4506,6 +5356,36 @@ async function handle(request) {
4506
5356
  // boundary for "you need to fetch the rubric again."
4507
5357
  resetSession();
4508
5358
  recordTelemetry("session_initialized", { stats });
5359
+ // Build the server instructions once. A dynamic preamble (resolved
5360
+ // workspace + where it came from + live counts, so the agent can
5361
+ // trust where its writes land instead of discovering an empty/wrong
5362
+ // workspace later) followed by the static CORE planning contract.
5363
+ // Surfaced at the TOP LEVEL of the result — the MCP-spec
5364
+ // `instructions` channel that compliant clients (Claude Code,
5365
+ // Cursor) inject into context. It previously lived only inside
5366
+ // serverInfo, where the spec doesn't define it, so spec-reading
5367
+ // clients silently dropped it. The gate/suggest reminders that used
5368
+ // to sit here are now folded into CORE_CONTRACT's workflow section.
5369
+ const instructions = (() => {
5370
+ const { id: ws, source } = resolveWorkspaceWithSource();
5371
+ const wsLine = ws
5372
+ ? `Workspace: ${ws} (resolved from ${source}). `
5373
+ : "No workspace resolved yet. ";
5374
+ const rootsLine = _clientSupportsRoots
5375
+ ? "Detecting the repo you're in to pick its workspace; call get_active_workspace before your first write to confirm. "
5376
+ : ws
5377
+ ? ""
5378
+ : "Set ROADMAPPER_WORKSPACE_ID or open a connected repo. ";
5379
+ const preamble =
5380
+ "Roadmapper online — " +
5381
+ wsLine +
5382
+ `${stats.themes} theme${stats.themes === 1 ? "" : "s"}, ` +
5383
+ `${stats.capabilities} capabilit${stats.capabilities === 1 ? "y" : "ies"}, ` +
5384
+ `${stats.openTasks} open task${stats.openTasks === 1 ? "" : "s"}. ` +
5385
+ rootsLine +
5386
+ "Slash-prompts available: roadmapper:plan-feature, roadmapper:close-task, roadmapper:weekly-review.";
5387
+ return preamble + "\n\n" + CORE_CONTRACT;
5388
+ })();
4509
5389
  return {
4510
5390
  jsonrpc: "2.0",
4511
5391
  id,
@@ -4519,38 +5399,14 @@ async function handle(request) {
4519
5399
  resources: { listChanged: false },
4520
5400
  prompts: { listChanged: false },
4521
5401
  },
5402
+ // Top-level instructions: the spec-defined channel. serverInfo
5403
+ // keeps only name/version/stats (stats is a non-standard extra
5404
+ // some clients surface as "server info").
5405
+ instructions,
4522
5406
  serverInfo: {
4523
5407
  name: SERVER_NAME,
4524
5408
  version: SERVER_VERSION,
4525
5409
  stats,
4526
- instructions: (() => {
4527
- // Name the workspace we resolve to RIGHT NOW + where it came
4528
- // from, so the agent can trust where its writes land instead
4529
- // of discovering an empty/wrong workspace later. Repo-based
4530
- // resolution (roots → repo_workspace_map) finishes just after
4531
- // this handshake, so if the client supports roots we say the
4532
- // target may refine and to confirm via get_active_workspace.
4533
- const { id: ws, source } = resolveWorkspaceWithSource();
4534
- const wsLine = ws
4535
- ? `Workspace: ${ws} (resolved from ${source}). `
4536
- : "No workspace resolved yet. ";
4537
- const rootsLine = _clientSupportsRoots
4538
- ? "Detecting the repo you're in to pick its workspace; call get_active_workspace before your first write to confirm. "
4539
- : ws
4540
- ? ""
4541
- : "Set ROADMAPPER_WORKSPACE_ID or open a connected repo. ";
4542
- return (
4543
- "Roadmapper online — " +
4544
- wsLine +
4545
- `${stats.themes} theme${stats.themes === 1 ? "" : "s"}, ` +
4546
- `${stats.capabilities} capabilit${stats.capabilities === 1 ? "y" : "ies"}, ` +
4547
- `${stats.openTasks} open task${stats.openTasks === 1 ? "" : "s"}. ` +
4548
- rootsLine +
4549
- "Call get_agents_md before planning — the propose_* and submit_acceptance_grades tools refuse without it. " +
4550
- "Use suggest_capability_for before propose_capability. " +
4551
- "Slash-prompts available: roadmapper:plan-feature, roadmapper:close-task, roadmapper:weekly-review."
4552
- );
4553
- })(),
4554
5410
  },
4555
5411
  },
4556
5412
  };
@@ -4564,7 +5420,11 @@ async function handle(request) {
4564
5420
  // so the timing usually works out.
4565
5421
  startLabelLoad();
4566
5422
  const labels = currentLabels();
4567
- const tools = TOOLS.map((t) => ({
5423
+ // Advertise the three dispatch tools, not the 34 operations. The ops
5424
+ // (and their schemas) are reachable via roadmap_search / roadmap_describe
5425
+ // / roadmap — see META_TOOLS and the callTool dispatch. tplDescription
5426
+ // still runs so custom workspace labels apply.
5427
+ const tools = META_TOOLS.map((t) => ({
4568
5428
  ...t,
4569
5429
  description: tplDescription(t.description, labels),
4570
5430
  }));
@@ -4633,6 +5493,23 @@ async function runSelftest() {
4633
5493
  r?.result?.capabilities?.resources &&
4634
5494
  r?.result?.capabilities?.prompts,
4635
5495
  },
5496
+ {
5497
+ // The CORE planning contract must ride on the TOP-LEVEL `instructions`
5498
+ // field (the spec channel clients read), not buried in serverInfo, and
5499
+ // must carry the server-enforced falsifiable-outcome rule so an agent
5500
+ // can file a valid proposal without first fetching the full AGENTS.md.
5501
+ name: "initialize returns top-level instructions with the core contract",
5502
+ fn: () => handle({ id: 2, method: "initialize", params: {} }),
5503
+ pass: (r) => {
5504
+ const instr = r?.result?.instructions;
5505
+ return (
5506
+ typeof instr === "string" &&
5507
+ instr.length > 0 &&
5508
+ instr.includes("FALSIFIABLE OUTCOME") &&
5509
+ instr.includes("get_agents_md")
5510
+ );
5511
+ },
5512
+ },
4636
5513
  {
4637
5514
  // Hitting a mutator with no rubric fetched must return the
4638
5515
  // structured prerequisite_missing error with a `fix` field,
@@ -4734,6 +5611,147 @@ async function runSelftest() {
4734
5611
  },
4735
5612
  pass: (r) => r?.themesListedAt !== null && r?.capsDiscoveredAt !== null,
4736
5613
  },
5614
+ {
5615
+ // Sprawl control: a theme that overlaps an existing one above the
5616
+ // block bar is refused with too_similar, naming the match — even
5617
+ // on dryRun (the guard runs before the write/preview).
5618
+ name: "propose_theme blocks a near-duplicate theme (too_similar)",
5619
+ fn: () =>
5620
+ proposeTheme(
5621
+ { name: "Data Intelligence Platform Core", dryRun: true },
5622
+ {
5623
+ themes: [
5624
+ { id: "TH-DUP", name: "Data Intelligence Platform", description: "" },
5625
+ ],
5626
+ settings: { agentThemeAutonomy: true },
5627
+ },
5628
+ "ws-test"
5629
+ ),
5630
+ pass: (r) => {
5631
+ const t = r?.content?.[0]?.text ?? "";
5632
+ return t.includes("too_similar") && t.includes("TH-DUP");
5633
+ },
5634
+ },
5635
+ {
5636
+ // A distinct theme passes the sprawl guard, and with autonomy ON
5637
+ // (default) sails through to the (dryRun) create — no confirmation.
5638
+ name: "propose_theme allows a distinct theme when autonomy is on",
5639
+ fn: () =>
5640
+ proposeTheme(
5641
+ { name: "Customer Onboarding Automation", dryRun: true },
5642
+ {
5643
+ themes: [
5644
+ { id: "TH-DUP", name: "Data Intelligence Platform", description: "" },
5645
+ ],
5646
+ settings: { agentThemeAutonomy: true },
5647
+ },
5648
+ "ws-test"
5649
+ ),
5650
+ pass: (r) => {
5651
+ const t = r?.content?.[0]?.text ?? "";
5652
+ return t.includes("\"ok\": true") && t.includes("wouldCreate") && !t.includes("too_similar");
5653
+ },
5654
+ },
5655
+ {
5656
+ // force:true overrides a too_similar block for the rare genuine
5657
+ // false positive.
5658
+ name: "propose_theme force:true overrides the sprawl block",
5659
+ fn: () =>
5660
+ proposeTheme(
5661
+ { name: "Data Intelligence Platform Core", force: true, dryRun: true },
5662
+ {
5663
+ themes: [
5664
+ { id: "TH-DUP", name: "Data Intelligence Platform", description: "" },
5665
+ ],
5666
+ settings: { agentThemeAutonomy: true },
5667
+ },
5668
+ "ws-test"
5669
+ ),
5670
+ pass: (r) => {
5671
+ const t = r?.content?.[0]?.text ?? "";
5672
+ return t.includes("wouldCreate") && !t.includes("too_similar");
5673
+ },
5674
+ },
5675
+ {
5676
+ // With autonomy OFF, a brand-new theme needs confirm:true — the
5677
+ // server returns confirmation_required until the human signs off.
5678
+ name: "propose_theme requires confirm when autonomy is off",
5679
+ fn: () =>
5680
+ proposeTheme(
5681
+ { name: "Brand New Distinct Strategic Pillar" },
5682
+ { themes: [], settings: { agentThemeAutonomy: false } },
5683
+ "ws-test"
5684
+ ),
5685
+ pass: (r) => {
5686
+ const t = r?.content?.[0]?.text ?? "";
5687
+ return t.includes("confirmation_required") && t.includes("confirm");
5688
+ },
5689
+ },
5690
+ {
5691
+ // detect_theme_sprawl surfaces overlapping existing themes as
5692
+ // consolidation candidates.
5693
+ name: "detect_theme_sprawl flags overlapping themes",
5694
+ fn: () =>
5695
+ detectThemeSprawl(
5696
+ {},
5697
+ {
5698
+ themes: [
5699
+ { id: "TH-A", name: "Data Intelligence", description: "" },
5700
+ { id: "TH-B", name: "Data Intelligence Platform", description: "" },
5701
+ ],
5702
+ capabilities: [],
5703
+ }
5704
+ ),
5705
+ pass: (r) => {
5706
+ const t = r?.content?.[0]?.text ?? "";
5707
+ return t.includes("\"sprawlPairCount\": 1") || (t.includes("TH-A") && t.includes("TH-B"));
5708
+ },
5709
+ },
5710
+ {
5711
+ // propose_tasks bulk: dryRun previews all rows + mints an id each.
5712
+ name: "propose_tasks bulk previews the whole batch (dryRun)",
5713
+ fn: () =>
5714
+ proposeTasks(
5715
+ {
5716
+ capabilityId: "CAP-1",
5717
+ dryRun: true,
5718
+ tasks: [
5719
+ { ref: "a", title: "First bulk task here", effort: "M" },
5720
+ { title: "Second bulk task here", effort: "S", dependsOn: ["a"] },
5721
+ ],
5722
+ },
5723
+ { capabilities: [{ id: "CAP-1", name: "Test Cap" }], themes: [], tasks: [] },
5724
+ "ws-test"
5725
+ ),
5726
+ pass: (r) => {
5727
+ const t = r?.content?.[0]?.text ?? "";
5728
+ try {
5729
+ const j = JSON.parse(t);
5730
+ return j.dryRun === true && Array.isArray(j.wouldCreate) && j.wouldCreate.length === 2;
5731
+ } catch {
5732
+ return false;
5733
+ }
5734
+ },
5735
+ },
5736
+ {
5737
+ // propose_tasks rejects the whole batch on a per-row validation
5738
+ // error (missing effort), naming the offending index.
5739
+ name: "propose_tasks rejects a batch with a missing-effort row",
5740
+ fn: () =>
5741
+ proposeTasks(
5742
+ {
5743
+ capabilityId: "CAP-1",
5744
+ tasks: [{ title: "No effort on this task" }],
5745
+ },
5746
+ { capabilities: [{ id: "CAP-1", name: "Test Cap" }], themes: [], tasks: [] },
5747
+ "ws-test"
5748
+ ),
5749
+ pass: (r) => {
5750
+ if (!r?.isError) return false;
5751
+ const t = r?.content?.[0]?.text ?? "";
5752
+ return t.includes("tasks[0]") && t.includes("effort");
5753
+ },
5754
+ },
4737
5755
  {
4738
5756
  name: "resources/list returns the three resources",
4739
5757
  fn: () => handle({ id: 12, method: "resources/list", params: {} }),
@@ -4780,10 +5798,317 @@ async function runSelftest() {
4780
5798
  r.result.messages[0].content.text.includes("demo description"),
4781
5799
  },
4782
5800
  {
4783
- name: "tools/list",
5801
+ // The wire surface is the three dispatch tools, NOT the 34 ops.
5802
+ name: "tools/list advertises exactly the three dispatch tools",
4784
5803
  fn: () => handle({ id: 2, method: "tools/list", params: {} }),
4785
- pass: (r) =>
4786
- Array.isArray(r?.result?.tools) && r.result.tools.length === TOOLS.length,
5804
+ pass: (r) => {
5805
+ const names = (r?.result?.tools ?? []).map((t) => t.name).sort();
5806
+ return (
5807
+ names.length === META_TOOLS.length &&
5808
+ ["roadmap", "roadmap_describe", "roadmap_search"].every((n) =>
5809
+ names.includes(n)
5810
+ )
5811
+ );
5812
+ },
5813
+ },
5814
+ {
5815
+ // tools/list must serve TRIMMED descriptions (summary only): every
5816
+ // tool keeps a non-empty one-line summary, and the methodology blocks
5817
+ // (USE WHEN / PREREQUISITE / ANTI-PATTERN / EXAMPLE) must be gone from
5818
+ // the wire payload — they now live in `instructions` + the rubric.
5819
+ // Guards against a regression that re-serves the full descriptions.
5820
+ name: "tools/list serves trimmed one-line descriptions",
5821
+ fn: () => handle({ id: 23, method: "tools/list", params: {} }),
5822
+ pass: (r) => {
5823
+ const tools = r?.result?.tools;
5824
+ if (!Array.isArray(tools) || tools.length === 0) return false;
5825
+ return tools.every(
5826
+ (t) =>
5827
+ typeof t.description === "string" &&
5828
+ t.description.length > 0 &&
5829
+ !t.description.includes("\n\n") &&
5830
+ !t.description.includes("ANTI-PATTERN:") &&
5831
+ !t.description.includes("PREREQUISITE:")
5832
+ );
5833
+ },
5834
+ },
5835
+ {
5836
+ // roadmap_search returns the op catalogue (all 34 when no intent),
5837
+ // each row carrying a trimmed summary (no methodology blocks).
5838
+ name: "roadmap_search lists operations with trimmed summaries",
5839
+ fn: () =>
5840
+ handle({
5841
+ id: 24,
5842
+ method: "tools/call",
5843
+ params: { name: "roadmap_search", arguments: {} },
5844
+ }),
5845
+ pass: (r) => {
5846
+ if (r?.result?.isError) return false;
5847
+ const text = r?.result?.content?.[0]?.text ?? "";
5848
+ let body;
5849
+ try {
5850
+ body = JSON.parse(text);
5851
+ } catch {
5852
+ return false;
5853
+ }
5854
+ const ops = body?.operations ?? [];
5855
+ return (
5856
+ ops.length === TOOLS.length &&
5857
+ ops.some((o) => o.op === "propose_task") &&
5858
+ ops.every(
5859
+ (o) =>
5860
+ typeof o.summary === "string" &&
5861
+ o.summary.length > 0 &&
5862
+ !o.summary.includes("ANTI-PATTERN:")
5863
+ )
5864
+ );
5865
+ },
5866
+ },
5867
+ {
5868
+ // roadmap_describe serves an op's inputSchema on demand (the bulk
5869
+ // evicted from tools/list). move_* / update_* live here now.
5870
+ name: "roadmap_describe returns inputSchema for move/update ops",
5871
+ fn: () =>
5872
+ handle({
5873
+ id: 25,
5874
+ method: "tools/call",
5875
+ params: { name: "roadmap_describe", arguments: { op: "move_task" } },
5876
+ }),
5877
+ pass: (r) => {
5878
+ if (r?.result?.isError) return false;
5879
+ let body;
5880
+ try {
5881
+ body = JSON.parse(r?.result?.content?.[0]?.text ?? "");
5882
+ } catch {
5883
+ return false;
5884
+ }
5885
+ return (
5886
+ body?.op === "move_task" &&
5887
+ body?.inputSchema?.type === "object" &&
5888
+ !!body?.inputSchema?.properties?.newCapabilityId
5889
+ );
5890
+ },
5891
+ },
5892
+ {
5893
+ name: "roadmap_describe rejects an unknown op",
5894
+ fn: () =>
5895
+ handle({
5896
+ id: 26,
5897
+ method: "tools/call",
5898
+ params: { name: "roadmap_describe", arguments: { op: "no_such_op" } },
5899
+ }),
5900
+ pass: (r) => r?.result?.isError === true,
5901
+ },
5902
+ {
5903
+ name: "roadmap rejects a missing op",
5904
+ fn: () =>
5905
+ handle({
5906
+ id: 27,
5907
+ method: "tools/call",
5908
+ params: { name: "roadmap", arguments: {} },
5909
+ }),
5910
+ pass: (r) => r?.result?.isError === true,
5911
+ },
5912
+ {
5913
+ name: "roadmap rejects an unknown op",
5914
+ fn: () =>
5915
+ handle({
5916
+ id: 28,
5917
+ method: "tools/call",
5918
+ params: { name: "roadmap", arguments: { op: "no_such_op" } },
5919
+ }),
5920
+ pass: (r) => r?.result?.isError === true,
5921
+ },
5922
+ {
5923
+ // Dispatch must run the SAME gates as a direct call: hitting a mutator
5924
+ // op through roadmap() before the rubric is fetched returns the
5925
+ // structured prerequisite_missing error, proving gates key off the op.
5926
+ name: "roadmap dispatch enforces the rubric gate on the inner op",
5927
+ fn: () => {
5928
+ resetSession();
5929
+ return handle({
5930
+ id: 29,
5931
+ method: "tools/call",
5932
+ params: {
5933
+ name: "roadmap",
5934
+ arguments: {
5935
+ op: "propose_task",
5936
+ args: { capabilityId: aCap, title: "Should be blocked via dispatch" },
5937
+ },
5938
+ },
5939
+ });
5940
+ },
5941
+ pass: (r) => {
5942
+ if (!r?.result?.isError) return false;
5943
+ try {
5944
+ // Parse and assert the fix FIELD directly (not the whole blob) so a
5945
+ // regression reverting fix to a bare uncallable get_agents_md() —
5946
+ // which still appears in the message — can't pass on substring luck.
5947
+ const out = JSON.parse(r.result.content?.[0]?.text ?? "");
5948
+ return (
5949
+ out.error === "prerequisite_missing" &&
5950
+ out.fix === 'roadmap({ op: "get_agents_md" })'
5951
+ );
5952
+ } catch {
5953
+ return false;
5954
+ }
5955
+ },
5956
+ },
5957
+ {
5958
+ // Dispatch reaches the inner op's argument validation identically to a
5959
+ // direct call. Tightened: assert the message is move_task's OWN
5960
+ // validator (newCapabilityId), so a regression that swallowed args.args
5961
+ // (yielding a generic 'taskId is required') can't pass this.
5962
+ name: "roadmap dispatch reaches inner-op validation",
5963
+ fn: () => {
5964
+ resetSession();
5965
+ return handle({
5966
+ id: 30,
5967
+ method: "tools/call",
5968
+ params: { name: "get_agents_md", arguments: {} },
5969
+ }).then(() =>
5970
+ handle({
5971
+ id: 31,
5972
+ method: "tools/call",
5973
+ params: {
5974
+ name: "roadmap",
5975
+ arguments: { op: "move_task", args: { taskId: "TK-1" } },
5976
+ },
5977
+ })
5978
+ );
5979
+ },
5980
+ pass: (r) => {
5981
+ if (!r?.result?.isError) return false;
5982
+ const text = r.result.content?.[0]?.text ?? "";
5983
+ return text.includes("newCapabilityId") && text.includes("required");
5984
+ },
5985
+ },
5986
+ {
5987
+ // POSITIVE path: a successful read THROUGH roadmap returns the inner
5988
+ // op's real data verbatim (proves the unwrap + passthrough, which the
5989
+ // error-path checks above never exercise).
5990
+ name: "roadmap dispatch returns real data on the happy path",
5991
+ fn: () =>
5992
+ handle({
5993
+ id: 32,
5994
+ method: "tools/call",
5995
+ params: { name: "roadmap", arguments: { op: "get_roadmap_snapshot" } },
5996
+ }),
5997
+ pass: (r) => {
5998
+ if (r?.result?.isError) return false;
5999
+ try {
6000
+ const body = JSON.parse(r.result.content?.[0]?.text ?? "");
6001
+ // Real snapshot shape (workspaceId may be null in seed-only mode).
6002
+ return (
6003
+ Array.isArray(body?.themes) &&
6004
+ Array.isArray(body?.capabilities) &&
6005
+ typeof body?.counts === "object"
6006
+ );
6007
+ } catch {
6008
+ return false;
6009
+ }
6010
+ },
6011
+ },
6012
+ {
6013
+ // Discovery/rubric session flags must be SET through dispatch, not just
6014
+ // blocked when unset — satisfy both gates purely via roadmap({op}) and
6015
+ // confirm the flags flipped (guards a regression where dispatch stopped
6016
+ // re-entering the switch that writes them).
6017
+ name: "roadmap dispatch sets the rubric + discovery session flags",
6018
+ fn: async () => {
6019
+ resetSession();
6020
+ await handle({
6021
+ id: 33,
6022
+ method: "tools/call",
6023
+ params: { name: "roadmap", arguments: { op: "get_agents_md" } },
6024
+ });
6025
+ await handle({
6026
+ id: 34,
6027
+ method: "tools/call",
6028
+ params: { name: "roadmap", arguments: { op: "get_roadmap_snapshot" } },
6029
+ });
6030
+ return {
6031
+ rubric: session.rubricFetchedAt,
6032
+ themes: session.themesListedAt,
6033
+ caps: session.capsDiscoveredAt,
6034
+ };
6035
+ },
6036
+ pass: (r) => r?.rubric !== null && r?.themes !== null && r?.caps !== null,
6037
+ },
6038
+ {
6039
+ // Flat (un-nested) args must NOT be silently dropped — the dangerous
6040
+ // case being a dropped workspaceId/dryRun. Use a gate-free read: a flat
6041
+ // get_task carrying the id at the TOP level (not under args) must reach
6042
+ // the op and resolve the real task. If the flat id were dropped, it
6043
+ // would 404 instead. Proves the flat-merge in the roadmap dispatch.
6044
+ name: "roadmap tolerates flat (un-nested) args",
6045
+ fn: () =>
6046
+ handle({
6047
+ id: 36,
6048
+ method: "tools/call",
6049
+ params: { name: "roadmap", arguments: { op: "get_task", id: aTask } },
6050
+ }),
6051
+ pass: (r) => {
6052
+ if (r?.result?.isError) return false;
6053
+ const text = r.result.content?.[0]?.text ?? "";
6054
+ return typeof aTask === "string" && text.includes(aTask);
6055
+ },
6056
+ },
6057
+ {
6058
+ // When both a flat sibling and a nested args carry the same key, the
6059
+ // nested (documented) value must win. Flat id is a bogus task; nested
6060
+ // id is the real one — resolving the real task proves nested precedence.
6061
+ name: "roadmap merge: nested args win over flat siblings on conflict",
6062
+ fn: () =>
6063
+ handle({
6064
+ id: 37,
6065
+ method: "tools/call",
6066
+ params: {
6067
+ name: "roadmap",
6068
+ arguments: { op: "get_task", id: "TK-DOES-NOT-EXIST", args: { id: aTask } },
6069
+ },
6070
+ }),
6071
+ pass: (r) => {
6072
+ if (r?.result?.isError) return false;
6073
+ const text = r.result.content?.[0]?.text ?? "";
6074
+ return text.includes(aTask) && !text.includes("TK-DOES-NOT-EXIST");
6075
+ },
6076
+ },
6077
+ {
6078
+ // args passed as a JSON STRING (a real LLM failure mode) must be parsed
6079
+ // and honored, not silently dropped — a flat get_task with stringified
6080
+ // args resolves the real task.
6081
+ name: "roadmap parses JSON-string args",
6082
+ fn: () =>
6083
+ handle({
6084
+ id: 38,
6085
+ method: "tools/call",
6086
+ params: {
6087
+ name: "roadmap",
6088
+ arguments: { op: "get_task", args: JSON.stringify({ id: aTask }) },
6089
+ },
6090
+ }),
6091
+ pass: (r) => {
6092
+ if (r?.result?.isError) return false;
6093
+ const text = r.result.content?.[0]?.text ?? "";
6094
+ return typeof aTask === "string" && text.includes(aTask);
6095
+ },
6096
+ },
6097
+ {
6098
+ // A non-object, non-parseable args must produce a clear boundary error
6099
+ // naming the cause — not a misleading downstream 'X is required'.
6100
+ name: "roadmap rejects non-object args with a clear error",
6101
+ fn: () =>
6102
+ handle({
6103
+ id: 39,
6104
+ method: "tools/call",
6105
+ params: { name: "roadmap", arguments: { op: "get_task", args: "not json at all" } },
6106
+ }),
6107
+ pass: (r) => {
6108
+ if (!r?.result?.isError) return false;
6109
+ const text = r.result.content?.[0]?.text ?? "";
6110
+ return text.includes("args") && text.includes("must be an object");
6111
+ },
4787
6112
  },
4788
6113
  {
4789
6114
  name: "get_active_workspace reports a resolution source",
@@ -5772,15 +7097,37 @@ async function runSelftest() {
5772
7097
  pass: (r) => r?.result?.isError === true,
5773
7098
  },
5774
7099
  {
5775
- // Schema-level: tools/list must advertise the four move tools.
5776
- name: "tools/list advertises four move tools",
5777
- fn: () => handle({ id: 30, method: "tools/list", params: {} }),
5778
- pass: (r) => {
5779
- const names = (r?.result?.tools ?? []).map((t) => t.name);
5780
- return ["move_task", "move_capability", "move_tasks", "move_capabilities"].every((n) =>
5781
- names.includes(n)
5782
- );
7100
+ // The four move ops are no longer advertised by name (the surface is
7101
+ // the three dispatch tools) but must remain reachable + describable.
7102
+ name: "roadmap_describe resolves all four move ops",
7103
+ fn: async () => {
7104
+ const ops = ["move_task", "move_capability", "move_tasks", "move_capabilities"];
7105
+ const out = [];
7106
+ for (const op of ops) {
7107
+ out.push(
7108
+ await handle({
7109
+ id: 30,
7110
+ method: "tools/call",
7111
+ params: { name: "roadmap_describe", arguments: { op } },
7112
+ })
7113
+ );
7114
+ }
7115
+ return out;
5783
7116
  },
7117
+ pass: (results) =>
7118
+ Array.isArray(results) &&
7119
+ results.length === 4 &&
7120
+ results.every((r) => {
7121
+ if (r?.result?.isError) return false;
7122
+ try {
7123
+ return (
7124
+ JSON.parse(r.result.content?.[0]?.text ?? "")?.inputSchema?.type ===
7125
+ "object"
7126
+ );
7127
+ } catch {
7128
+ return false;
7129
+ }
7130
+ }),
5784
7131
  },
5785
7132
  {
5786
7133
  // Update validation: missing patch.
@@ -5847,17 +7194,37 @@ async function runSelftest() {
5847
7194
  pass: (r) => r?.result?.isError === true,
5848
7195
  },
5849
7196
  {
5850
- // Schema-level: parent fields are blocked at JSON-schema layer
5851
- // (additionalProperties:false on patch). Without service key
5852
- // we won't reach SQL, but the schema rejects it pre-call.
5853
- name: "tools/list advertises three update tools",
5854
- fn: () => handle({ id: 35, method: "tools/list", params: {} }),
5855
- pass: (r) => {
5856
- const names = (r?.result?.tools ?? []).map((t) => t.name);
5857
- return ["update_task", "update_capability", "update_theme"].every((n) =>
5858
- names.includes(n)
5859
- );
7197
+ // The three update ops are reachable + describable via the dispatch
7198
+ // surface (not advertised by name in tools/list anymore).
7199
+ name: "roadmap_describe resolves all three update ops",
7200
+ fn: async () => {
7201
+ const ops = ["update_task", "update_capability", "update_theme"];
7202
+ const out = [];
7203
+ for (const op of ops) {
7204
+ out.push(
7205
+ await handle({
7206
+ id: 35,
7207
+ method: "tools/call",
7208
+ params: { name: "roadmap_describe", arguments: { op } },
7209
+ })
7210
+ );
7211
+ }
7212
+ return out;
5860
7213
  },
7214
+ pass: (results) =>
7215
+ Array.isArray(results) &&
7216
+ results.length === 3 &&
7217
+ results.every((r) => {
7218
+ if (r?.result?.isError) return false;
7219
+ try {
7220
+ return (
7221
+ JSON.parse(r.result.content?.[0]?.text ?? "")?.inputSchema?.type ===
7222
+ "object"
7223
+ );
7224
+ } catch {
7225
+ return false;
7226
+ }
7227
+ }),
5861
7228
  },
5862
7229
  {
5863
7230
  // Cross-workspace guard fires when snapshot.json names workspace
@@ -6400,6 +7767,161 @@ async function runSelftest() {
6400
7767
  }
6401
7768
  },
6402
7769
  },
7770
+ {
7771
+ // Repo-link gate: a mutator in an UNMAPPED git repo (slug resolves
7772
+ // but no repo_workspace_map row, so resolution falls to env source)
7773
+ // is blocked with repo_unmapped naming the slug + the link_repo fix.
7774
+ name: "mutator blocked when in an unmapped repo (would hit env default)",
7775
+ fn: async () => {
7776
+ const savedWs = process.env.ROADMAPPER_WORKSPACE_ID;
7777
+ const savedKey = process.env.ROADMAPPER_API_KEY;
7778
+ const savedUrl = process.env.ROADMAPPER_BACKEND_URL;
7779
+ try {
7780
+ // Writes must be enabled or the gate defers to set_credentials.
7781
+ process.env.ROADMAPPER_API_KEY = "rmpr_selftest";
7782
+ process.env.ROADMAPPER_BACKEND_URL = "https://selftest.local";
7783
+ process.env.ROADMAPPER_WORKSPACE_ID = "ws-envdefault";
7784
+ session.rubricFetchedAt = Date.now(); // past the rubric gate
7785
+ _clientRoots = ["/tmp/unmapped"];
7786
+ __setRepoSlugForTest("acme/unmapped");
7787
+ __setRootWorkspaceForTest(null); // no repo_workspace_map hit → env source
7788
+ return await handle({
7789
+ id: 94,
7790
+ method: "tools/call",
7791
+ params: {
7792
+ name: "archive_task",
7793
+ arguments: { taskId: aTask, reason: "unmapped-repo probe" },
7794
+ },
7795
+ });
7796
+ } finally {
7797
+ __setRepoSlugForTest(undefined);
7798
+ __setRootWorkspaceForTest(undefined);
7799
+ _clientRoots = [];
7800
+ if (savedWs === undefined) delete process.env.ROADMAPPER_WORKSPACE_ID;
7801
+ else process.env.ROADMAPPER_WORKSPACE_ID = savedWs;
7802
+ if (savedKey === undefined) delete process.env.ROADMAPPER_API_KEY;
7803
+ else process.env.ROADMAPPER_API_KEY = savedKey;
7804
+ if (savedUrl === undefined) delete process.env.ROADMAPPER_BACKEND_URL;
7805
+ else process.env.ROADMAPPER_BACKEND_URL = savedUrl;
7806
+ }
7807
+ },
7808
+ pass: (r) => {
7809
+ try {
7810
+ const out = JSON.parse(r?.result?.content?.[0]?.text ?? "{}");
7811
+ return (
7812
+ out.error === "repo_unmapped" &&
7813
+ out.repo === "acme/unmapped" &&
7814
+ out.fix === 'roadmap({ op: "link_repo" })' &&
7815
+ out.envDefaultWorkspace === "ws-envdefault"
7816
+ );
7817
+ } catch {
7818
+ return false;
7819
+ }
7820
+ },
7821
+ },
7822
+ {
7823
+ // ESCAPE HATCH 1 (the multi-repo case): an explicit workspaceId arg
7824
+ // means the caller is intentionally targeting a workspace — the gate
7825
+ // must NOT fire even in an unmapped repo. Proves a developer juggling
7826
+ // several repos in one chat is never bricked: pass workspaceId and the
7827
+ // write proceeds (lands downstream on the missing-service-key error in
7828
+ // selftest, NOT the repo_unmapped block — that's the assertion).
7829
+ name: "repo-link gate skipped when workspaceId passed explicitly",
7830
+ fn: async () => {
7831
+ try {
7832
+ session.rubricFetchedAt = Date.now();
7833
+ _clientRoots = ["/tmp/unmapped"];
7834
+ __setRepoSlugForTest("acme/unmapped");
7835
+ __setRootWorkspaceForTest(null);
7836
+ return await handle({
7837
+ id: 95,
7838
+ method: "tools/call",
7839
+ params: {
7840
+ name: "archive_task",
7841
+ arguments: {
7842
+ taskId: aTask,
7843
+ reason: "explicit-ws probe",
7844
+ workspaceId: "ws-explicit",
7845
+ },
7846
+ },
7847
+ });
7848
+ } finally {
7849
+ __setRepoSlugForTest(undefined);
7850
+ __setRootWorkspaceForTest(undefined);
7851
+ _clientRoots = [];
7852
+ }
7853
+ },
7854
+ pass: (r) => {
7855
+ // Must be an error result (no service key downstream) but NOT the
7856
+ // repo_unmapped block — proves the gate let the explicit target through.
7857
+ if (!r?.result?.isError) return false;
7858
+ const txt = r.result.content?.[0]?.text ?? "";
7859
+ return !txt.includes("repo_unmapped");
7860
+ },
7861
+ },
7862
+ {
7863
+ // ESCAPE HATCH 2: a MAPPED repo (resolution returns source "repo")
7864
+ // never trips the gate — the whole point. Seeding a root workspace
7865
+ // makes resolveWorkspaceWithSource return source:"repo", not "env".
7866
+ name: "repo-link gate skipped when repo IS mapped (source=repo)",
7867
+ fn: async () => {
7868
+ try {
7869
+ session.rubricFetchedAt = Date.now();
7870
+ _clientRoots = ["/tmp/mapped"];
7871
+ __setRepoSlugForTest("acme/mapped");
7872
+ __setRootWorkspaceForTest("ws-mapped", "acme/mapped"); // mapped → source "repo"
7873
+ return await handle({
7874
+ id: 96,
7875
+ method: "tools/call",
7876
+ params: {
7877
+ name: "archive_task",
7878
+ arguments: { taskId: aTask, reason: "mapped-repo probe" },
7879
+ },
7880
+ });
7881
+ } finally {
7882
+ __setRepoSlugForTest(undefined);
7883
+ __setRootWorkspaceForTest(undefined);
7884
+ _clientRoots = [];
7885
+ }
7886
+ },
7887
+ pass: (r) => {
7888
+ if (!r?.result?.isError) return false;
7889
+ const txt = r.result.content?.[0]?.text ?? "";
7890
+ return !txt.includes("repo_unmapped");
7891
+ },
7892
+ },
7893
+ {
7894
+ // ESCAPE HATCH 3: not in a git repo at all (no client roots) — nothing
7895
+ // to link, so the gate must fall through to the env default rather than
7896
+ // deadlock. Asserts NOT repo_unmapped.
7897
+ name: "repo-link gate skipped when not in a git repo (no deadlock)",
7898
+ fn: async () => {
7899
+ const savedWs = process.env.ROADMAPPER_WORKSPACE_ID;
7900
+ try {
7901
+ process.env.ROADMAPPER_WORKSPACE_ID = "ws-envdefault";
7902
+ session.rubricFetchedAt = Date.now();
7903
+ _clientRoots = []; // not in a repo
7904
+ __setRootWorkspaceForTest(null);
7905
+ return await handle({
7906
+ id: 97,
7907
+ method: "tools/call",
7908
+ params: {
7909
+ name: "archive_task",
7910
+ arguments: { taskId: aTask, reason: "no-repo probe" },
7911
+ },
7912
+ });
7913
+ } finally {
7914
+ __setRootWorkspaceForTest(undefined);
7915
+ if (savedWs === undefined) delete process.env.ROADMAPPER_WORKSPACE_ID;
7916
+ else process.env.ROADMAPPER_WORKSPACE_ID = savedWs;
7917
+ }
7918
+ },
7919
+ pass: (r) => {
7920
+ if (!r?.result?.isError) return false;
7921
+ const txt = r.result.content?.[0]?.text ?? "";
7922
+ return !txt.includes("repo_unmapped");
7923
+ },
7924
+ },
6403
7925
  ];
6404
7926
 
6405
7927
  let passed = 0;