nodebench-mcp 2.31.2 → 2.33.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (97)
  1. package/README.md +14 -6
  2. package/dist/engine/server.js +14 -4
  3. package/dist/engine/server.js.map +1 -1
  4. package/dist/index.js +1581 -670
  5. package/dist/index.js.map +1 -1
  6. package/dist/security/SecurityError.d.ts +18 -0
  7. package/dist/security/SecurityError.js +22 -0
  8. package/dist/security/SecurityError.js.map +1 -0
  9. package/dist/security/__tests__/security.test.d.ts +8 -0
  10. package/dist/security/__tests__/security.test.js +295 -0
  11. package/dist/security/__tests__/security.test.js.map +1 -0
  12. package/dist/security/auditLog.d.ts +36 -0
  13. package/dist/security/auditLog.js +178 -0
  14. package/dist/security/auditLog.js.map +1 -0
  15. package/dist/security/commandSandbox.d.ts +33 -0
  16. package/dist/security/commandSandbox.js +159 -0
  17. package/dist/security/commandSandbox.js.map +1 -0
  18. package/dist/security/config.d.ts +23 -0
  19. package/dist/security/config.js +43 -0
  20. package/dist/security/config.js.map +1 -0
  21. package/dist/security/credentialRedactor.d.ts +22 -0
  22. package/dist/security/credentialRedactor.js +118 -0
  23. package/dist/security/credentialRedactor.js.map +1 -0
  24. package/dist/security/index.d.ts +20 -0
  25. package/dist/security/index.js +21 -0
  26. package/dist/security/index.js.map +1 -0
  27. package/dist/security/pathSandbox.d.ts +23 -0
  28. package/dist/security/pathSandbox.js +160 -0
  29. package/dist/security/pathSandbox.js.map +1 -0
  30. package/dist/security/urlValidator.d.ts +23 -0
  31. package/dist/security/urlValidator.js +125 -0
  32. package/dist/security/urlValidator.js.map +1 -0
  33. package/dist/tools/agentBootstrapTools.js +22 -29
  34. package/dist/tools/agentBootstrapTools.js.map +1 -1
  35. package/dist/tools/contextSandboxTools.js +7 -9
  36. package/dist/tools/contextSandboxTools.js.map +1 -1
  37. package/dist/tools/deepSimTools.d.ts +2 -0
  38. package/dist/tools/deepSimTools.js +404 -0
  39. package/dist/tools/deepSimTools.js.map +1 -0
  40. package/dist/tools/dimensionTools.d.ts +2 -0
  41. package/dist/tools/dimensionTools.js +246 -0
  42. package/dist/tools/dimensionTools.js.map +1 -0
  43. package/dist/tools/executionTraceTools.d.ts +2 -0
  44. package/dist/tools/executionTraceTools.js +446 -0
  45. package/dist/tools/executionTraceTools.js.map +1 -0
  46. package/dist/tools/founderTools.d.ts +13 -0
  47. package/dist/tools/founderTools.js +595 -0
  48. package/dist/tools/founderTools.js.map +1 -0
  49. package/dist/tools/founderTrackingTools.d.ts +9 -0
  50. package/dist/tools/founderTrackingTools.js +644 -0
  51. package/dist/tools/founderTrackingTools.js.map +1 -0
  52. package/dist/tools/gitWorkflowTools.js +14 -10
  53. package/dist/tools/gitWorkflowTools.js.map +1 -1
  54. package/dist/tools/githubTools.js +19 -2
  55. package/dist/tools/githubTools.js.map +1 -1
  56. package/dist/tools/index.d.ts +87 -0
  57. package/dist/tools/index.js +102 -0
  58. package/dist/tools/index.js.map +1 -0
  59. package/dist/tools/localFileTools.js +24 -12
  60. package/dist/tools/localFileTools.js.map +1 -1
  61. package/dist/tools/memoryDecay.d.ts +70 -0
  62. package/dist/tools/memoryDecay.js +247 -0
  63. package/dist/tools/memoryDecay.js.map +1 -0
  64. package/dist/tools/missionHarnessTools.d.ts +32 -0
  65. package/dist/tools/missionHarnessTools.js +972 -0
  66. package/dist/tools/missionHarnessTools.js.map +1 -0
  67. package/dist/tools/observabilityTools.d.ts +15 -0
  68. package/dist/tools/observabilityTools.js +787 -0
  69. package/dist/tools/observabilityTools.js.map +1 -0
  70. package/dist/tools/openclawTools.js +151 -36
  71. package/dist/tools/openclawTools.js.map +1 -1
  72. package/dist/tools/progressiveDiscoveryTools.js +5 -4
  73. package/dist/tools/progressiveDiscoveryTools.js.map +1 -1
  74. package/dist/tools/qualityGateTools.js +118 -2
  75. package/dist/tools/qualityGateTools.js.map +1 -1
  76. package/dist/tools/rssTools.js +3 -0
  77. package/dist/tools/rssTools.js.map +1 -1
  78. package/dist/tools/scraplingTools.js +15 -0
  79. package/dist/tools/scraplingTools.js.map +1 -1
  80. package/dist/tools/seoTools.js +66 -1
  81. package/dist/tools/seoTools.js.map +1 -1
  82. package/dist/tools/sessionMemoryTools.js +50 -11
  83. package/dist/tools/sessionMemoryTools.js.map +1 -1
  84. package/dist/tools/temporalIntelligenceTools.d.ts +12 -0
  85. package/dist/tools/temporalIntelligenceTools.js +1068 -0
  86. package/dist/tools/temporalIntelligenceTools.js.map +1 -0
  87. package/dist/tools/toolRegistry.d.ts +19 -0
  88. package/dist/tools/toolRegistry.js +956 -31
  89. package/dist/tools/toolRegistry.js.map +1 -1
  90. package/dist/tools/webTools.js +14 -1
  91. package/dist/tools/webTools.js.map +1 -1
  92. package/dist/tools/webmcpTools.js +13 -2
  93. package/dist/tools/webmcpTools.js.map +1 -1
  94. package/dist/toolsetRegistry.js +14 -0
  95. package/dist/toolsetRegistry.js.map +1 -1
  96. package/dist/types.d.ts +10 -0
  97. package/package.json +124 -124
package/dist/index.js CHANGED
@@ -20,6 +20,7 @@ import { Server } from "@modelcontextprotocol/sdk/server/index.js";
20
20
  import { StdioServerTransport } from "@modelcontextprotocol/sdk/server/stdio.js";
21
21
  import { ListToolsRequestSchema, CallToolRequestSchema, ListPromptsRequestSchema, GetPromptRequestSchema, } from "@modelcontextprotocol/sdk/types.js";
22
22
  import { getDb, genId } from "./db.js";
23
+ import { redactSecrets, auditLog, SecurityError } from "./security/index.js";
23
24
  import { startDashboardServer } from "./dashboard/server.js";
24
25
  import { startEngineServer } from "./engine/server.js";
25
26
  import { getAnalyticsDb, closeAnalyticsDb, clearOldRecords } from "./analytics/index.js";
@@ -27,9 +28,10 @@ import { AnalyticsTracker } from "./analytics/toolTracker.js";
27
28
  import { generateSmartPreset, formatPresetRecommendation, listPresets } from "./analytics/index.js";
28
29
  import { getProjectUsageSummary, exportUsageStats, formatStatsDisplay } from "./analytics/index.js";
29
30
  import { TOOLSET_MAP, TOOL_TO_TOOLSET } from "./toolsetRegistry.js";
31
+ import { initObservability, startWatchdog, stopWatchdog } from "./tools/observabilityTools.js";
30
32
  import { createMetaTools } from "./tools/metaTools.js";
31
33
  import { createProgressiveDiscoveryTools } from "./tools/progressiveDiscoveryTools.js";
32
- import { getQuickRef, ALL_REGISTRY_ENTRIES, TOOL_REGISTRY, getToolComplexity, _setDbAccessor, hybridSearch, WORKFLOW_CHAINS } from "./tools/toolRegistry.js";
34
+ import { getQuickRef, ALL_REGISTRY_ENTRIES, TOOL_REGISTRY, getToolComplexity, getToolAnnotations, _setDbAccessor, hybridSearch, WORKFLOW_CHAINS } from "./tools/toolRegistry.js";
33
35
  // TOON format — ~40% token savings on tool responses
34
36
  import { encode as toonEncode } from "@toon-format/toon";
35
37
  // Embedding provider — neural semantic search
@@ -44,29 +46,35 @@ const exportStats = cliArgs.includes("--export-stats");
44
46
  const resetStats = cliArgs.includes("--reset-stats");
45
47
  const listPresetsFlag = cliArgs.includes("--list-presets");
46
48
  const healthFlag = cliArgs.includes("--health");
49
+ const statusFlag = cliArgs.includes("--status");
50
+ const diagnoseFlag = cliArgs.includes("--diagnose");
47
51
  const autoPresetFlag = cliArgs.includes("--auto-preset");
52
+ const syncConfigsFlag = cliArgs.includes("--sync-configs");
48
53
  const useEngine = cliArgs.includes("--engine");
49
54
  const engineSecret = (() => {
50
55
  const idx = cliArgs.indexOf("--engine-secret");
51
56
  return idx >= 0 && idx + 1 < cliArgs.length ? cliArgs[idx + 1] : process.env.ENGINE_SECRET;
52
57
  })();
53
58
  export { TOOLSET_MAP };
54
- const DEFAULT_TOOLSETS = ["verification", "eval", "quality_gate", "learning", "flywheel", "recon", "security", "boilerplate", "skill_update", "context_sandbox"];
59
+ const DEFAULT_TOOLSETS = ["verification", "eval", "quality_gate", "learning", "flywheel", "recon", "security", "boilerplate", "skill_update", "context_sandbox", "observability", "execution_trace", "mission_harness", "deep_sim", "founder"];
55
60
  const PRESETS = {
56
61
  default: DEFAULT_TOOLSETS,
57
- // Themed presets — bridge between default (50 tools) and full (175 tools)
62
+ // Themed presets — bridge between default (81 tools) and full (295 tools)
58
63
  web_dev: [...DEFAULT_TOOLSETS, "ui_capture", "vision", "web", "seo", "git_workflow", "architect", "ui_ux_dive", "ui_ux_dive_v2", "mcp_bridge", "qa_orchestration", "visual_qa", "design_governance", "web_scraping"],
59
- research: [...DEFAULT_TOOLSETS, "web", "llm", "rss", "email", "docs", "research_optimizer", "web_scraping"],
60
- data: [...DEFAULT_TOOLSETS, "local_file", "llm", "web", "research_optimizer", "web_scraping"],
64
+ research: [...DEFAULT_TOOLSETS, "web", "llm", "rss", "email", "docs", "research_optimizer", "web_scraping", "temporal_intelligence", "deep_sim"],
65
+ data: [...DEFAULT_TOOLSETS, "local_file", "llm", "web", "research_optimizer", "web_scraping", "temporal_intelligence"],
61
66
  devops: [...DEFAULT_TOOLSETS, "git_workflow", "session_memory", "benchmark", "pattern"],
62
67
  mobile: [...DEFAULT_TOOLSETS, "ui_capture", "vision", "flicker_detection", "ui_ux_dive", "ui_ux_dive_v2", "mcp_bridge", "visual_qa"],
63
68
  academic: [...DEFAULT_TOOLSETS, "research_writing", "llm", "web", "local_file"],
64
- multi_agent: [...DEFAULT_TOOLSETS, "parallel", "self_eval", "session_memory", "pattern", "toon", "qa_orchestration", "agent_traverse", "engine_context", "research_optimizer", "web_scraping"],
69
+ multi_agent: [...DEFAULT_TOOLSETS, "parallel", "self_eval", "session_memory", "pattern", "toon", "qa_orchestration", "agent_traverse", "engine_context", "research_optimizer", "web_scraping", "deep_sim"],
65
70
  content: [...DEFAULT_TOOLSETS, "llm", "critter", "email", "rss", "platform", "architect", "local_dashboard", "engine_context", "thompson_protocol"],
71
+ // Cursor IDE has a hard 40-tool limit across ALL MCP servers.
72
+ // 28 tools = 22 domain + 3 meta + 3 discovery — leaves 12 slots for other servers.
73
+ cursor: ["deep_sim", "quality_gate", "learning", "session_memory", "web", "toon"],
66
74
  full: Object.keys(TOOLSET_MAP),
67
75
  };
68
76
  const PRESET_DESCRIPTIONS = {
69
- default: "Core AI Flywheel — verification, eval, quality gates, learning, recon",
77
+ default: "Core AI Flywheel — verification, eval, quality gates, learning, recon, mission harness",
70
78
  web_dev: "Web projects — adds visual QA, SEO audit, git workflow, code architecture",
71
79
  research: "Research workflows — adds web search, LLM calls, RSS feeds, email, docs",
72
80
  data: "Data analysis — adds CSV/XLSX/PDF/JSON parsing, LLM extraction, web fetch",
@@ -75,6 +83,7 @@ const PRESET_DESCRIPTIONS = {
75
83
  academic: "Academic papers — adds polish, review, translate, logic check, data analysis",
76
84
  multi_agent: "Multi-agent teams — adds task locking, messaging, roles, oracle testing, frontend traversal",
77
85
  content: "Content & publishing — adds LLM, accountability, email, RSS, platform queue",
86
+ cursor: "Cursor IDE (28 tools) — decision intelligence, research, quality gates, session memory, web, TOON encoding. Leaves 12 slots for other MCP servers.",
78
87
  full: "Everything — all toolsets for maximum coverage",
79
88
  };
80
89
  function parseToolsets() {
@@ -101,6 +110,9 @@ function parseToolsets() {
101
110
  " --engine-secret <s> Require Bearer token for engine API (or set ENGINE_SECRET env var)",
102
111
  " --explain <tool> Show plain-English explanation of a tool and exit",
103
112
  " --health Run diagnostic health check and exit",
113
+ " --status Show live system pulse (uptime, errors, call rates) and exit",
114
+ " --diagnose Run drift detection + auto-heal and exit",
115
+ " --sync-configs Write MCP config to Claude Code, Cursor, and Windsurf IDE locations",
104
116
  " --help Show this help and exit",
105
117
  "",
106
118
  "Available toolsets:",
@@ -113,12 +125,12 @@ function parseToolsets() {
113
125
  }),
114
126
  "",
115
127
  "Examples:",
116
- " npx nodebench-mcp # Default (50 tools) - core AI Flywheel",
128
+ " npx nodebench-mcp # Default (81 tools) - core AI Flywheel",
117
129
  " npx nodebench-mcp --preset web_dev # Web development (+ vision, SEO, git)",
118
130
  " npx nodebench-mcp --preset research # Research workflows (+ web, LLM, RSS, email)",
119
131
  " npx nodebench-mcp --preset data # Data analysis (+ local file parsing, LLM)",
120
132
  " npx nodebench-mcp --preset academic # Academic writing (+ paper tools, LLM)",
121
- " npx nodebench-mcp --preset full # All 175 tools",
133
+ " npx nodebench-mcp --preset full # All 295 tools",
122
134
  " npx nodebench-mcp --smart-preset # Get AI-powered preset recommendation",
123
135
  " npx nodebench-mcp --stats # Show usage statistics",
124
136
  " npx nodebench-mcp --toolsets verification,eval,recon",
@@ -562,6 +574,637 @@ if (healthFlag) {
562
574
  console.log(lines.join("\n"));
563
575
  process.exit(0);
564
576
  }
577
+ // ── Status CLI handler (run-and-exit) ─────────────────────────────────
578
+ if (statusFlag) {
579
+ const os = await import("node:os");
580
+ const path = await import("node:path");
581
+ const fs = await import("node:fs");
582
+ const USE_COLOR = process.stdout.isTTY;
583
+ const B = USE_COLOR ? "\x1b[1m" : "";
584
+ const C = USE_COLOR ? "\x1b[36m" : "";
585
+ const G = USE_COLOR ? "\x1b[32m" : "";
586
+ const Y = USE_COLOR ? "\x1b[33m" : "";
587
+ const R = USE_COLOR ? "\x1b[31m" : "";
588
+ const X = USE_COLOR ? "\x1b[0m" : "";
589
+ const dir = path.join(os.homedir(), ".nodebench");
590
+ const dbPath = path.join(dir, "nodebench.db");
591
+ if (!fs.existsSync(dbPath)) {
592
+ console.error("No database found. Run the MCP server first to initialize.");
593
+ process.exit(1);
594
+ }
595
+ // Open DB directly for status query
596
+ const Database = (await import("better-sqlite3")).default;
597
+ const db = new Database(dbPath, { readonly: true });
598
+ const lines = [];
599
+ lines.push(`${B}NodeBench MCP — System Status${X}`);
600
+ lines.push("");
601
+ // Uptime info from DB (last tool call as proxy for when server was active)
602
+ try {
603
+ const recent = db.prepare(`SELECT COUNT(*) as cnt FROM tool_call_log WHERE created_at > datetime('now', '-1 hour')`).get();
604
+ const today = db.prepare(`SELECT COUNT(*) as cnt FROM tool_call_log WHERE created_at > datetime('now', '-24 hours')`).get();
605
+ const week = db.prepare(`SELECT COUNT(*) as cnt FROM tool_call_log WHERE created_at > datetime('now', '-7 days')`).get();
606
+ const errors1h = db.prepare(`SELECT COUNT(*) as cnt FROM tool_call_log WHERE result_status = 'error' AND created_at > datetime('now', '-1 hour')`).get();
607
+ const errors24h = db.prepare(`SELECT COUNT(*) as cnt FROM tool_call_log WHERE result_status = 'error' AND created_at > datetime('now', '-24 hours')`).get();
608
+ lines.push(`${C}Call Volume${X}`);
609
+ lines.push(` Last 1h: ${recent.cnt} calls (${errors1h.cnt} errors)`);
610
+ lines.push(` Last 24h: ${today.cnt} calls (${errors24h.cnt} errors)`);
611
+ lines.push(` Last 7d: ${week.cnt} calls`);
612
+ const rate1h = recent.cnt > 0 ? ((recent.cnt - errors1h.cnt) / recent.cnt * 100).toFixed(1) : "N/A";
613
+ const rate24h = today.cnt > 0 ? ((today.cnt - errors24h.cnt) / today.cnt * 100).toFixed(1) : "N/A";
614
+ lines.push(` Success: ${rate1h}% (1h) / ${rate24h}% (24h)`);
615
+ lines.push("");
616
+ // Top 5 tools
617
+ const topTools = db.prepare(`SELECT tool_name, COUNT(*) as calls, SUM(CASE WHEN result_status='error' THEN 1 ELSE 0 END) as errs, ROUND(AVG(duration_ms)) as avg_ms
618
+ FROM tool_call_log WHERE created_at > datetime('now', '-24 hours')
619
+ GROUP BY tool_name ORDER BY calls DESC LIMIT 5`).all();
620
+ if (topTools.length > 0) {
621
+ lines.push(`${C}Top Tools (24h)${X}`);
622
+ for (const t of topTools) {
623
+ const errTag = t.errs > 0 ? ` ${R}${t.errs} err${X}` : "";
624
+ lines.push(` ${t.calls.toString().padStart(4)} ${t.tool_name.padEnd(30)} ${t.avg_ms}ms avg${errTag}`);
625
+ }
626
+ lines.push("");
627
+ }
628
+ // Error trend
629
+ const errPrevHour = db.prepare(`SELECT COUNT(*) as cnt FROM tool_call_log WHERE result_status='error' AND created_at > datetime('now', '-2 hours') AND created_at <= datetime('now', '-1 hour')`).get();
630
+ const direction = errors1h.cnt > errPrevHour.cnt ? `${R}increasing${X}` : errors1h.cnt < errPrevHour.cnt ? `${G}decreasing${X}` : `${G}stable${X}`;
631
+ lines.push(`${C}Error Trend${X} ${direction} (${errPrevHour.cnt} prev hour → ${errors1h.cnt} this hour)`);
632
+ // Active verification cycles
633
+ const activeCycles = db.prepare(`SELECT COUNT(*) as cnt FROM verification_cycles WHERE status IN ('active', 'in_progress')`).get();
634
+ if (activeCycles.cnt > 0) {
635
+ lines.push(`${C}Active Cycles${X} ${Y}${activeCycles.cnt} verification cycle(s) in progress${X}`);
636
+ }
637
+ }
638
+ catch (e) {
639
+ lines.push(`${R}Error querying DB: ${e.message}${X}`);
640
+ }
641
+ db.close();
642
+ console.log(lines.join("\n"));
643
+ process.exit(0);
644
+ }
645
+ // ── Diagnose CLI handler (run-and-exit) ───────────────────────────────
646
+ if (diagnoseFlag) {
647
+ const os = await import("node:os");
648
+ const path = await import("node:path");
649
+ const fs = await import("node:fs");
650
+ const USE_COLOR = process.stdout.isTTY;
651
+ const B = USE_COLOR ? "\x1b[1m" : "";
652
+ const C = USE_COLOR ? "\x1b[36m" : "";
653
+ const G = USE_COLOR ? "\x1b[32m" : "";
654
+ const Y = USE_COLOR ? "\x1b[33m" : "";
655
+ const R = USE_COLOR ? "\x1b[31m" : "";
656
+ const X = USE_COLOR ? "\x1b[0m" : "";
657
+ const dir = path.join(os.homedir(), ".nodebench");
658
+ const dbPath = path.join(dir, "nodebench.db");
659
+ if (!fs.existsSync(dbPath)) {
660
+ console.error("No database found. Run the MCP server first to initialize.");
661
+ process.exit(1);
662
+ }
663
+ const Database = (await import("better-sqlite3")).default;
664
+ const db = new Database(dbPath);
665
+ const lines = [];
666
+ lines.push(`${B}NodeBench MCP — Diagnose & Heal${X}`);
667
+ lines.push("");
668
+ let issueCount = 0;
669
+ let healedCount = 0;
670
+ // 1. Orphaned verification cycles
671
+ try {
672
+ const orphanedCount = db.prepare(`SELECT COUNT(*) as cnt FROM verification_cycles WHERE status IN ('active', 'in_progress') AND created_at < datetime('now', '-48 hours')`).get().cnt;
673
+ if (orphanedCount > 0) {
674
+ lines.push(`${Y}DRIFT${X} ${orphanedCount} orphaned verification cycle(s) (>48h old)`);
675
+ const result = db.prepare(`UPDATE verification_cycles SET status = 'abandoned', updated_at = datetime('now') WHERE status IN ('active', 'in_progress') AND created_at < datetime('now', '-48 hours')`).run();
676
+ lines.push(` ${G}HEALED${X} Abandoned ${result.changes} cycles in batch`);
677
+ healedCount += result.changes;
678
+ issueCount += orphanedCount;
679
+ }
680
+ else {
681
+ lines.push(`${G}OK${X} No orphaned verification cycles`);
682
+ }
683
+ }
684
+ catch {
685
+ lines.push(`${Y}SKIP${X} Could not check verification cycles`);
686
+ }
687
+ // 2. Stale eval runs
688
+ try {
689
+ const staleCount = db.prepare(`SELECT COUNT(*) as cnt FROM eval_runs WHERE status IN ('running', 'pending') AND created_at < datetime('now', '-24 hours')`).get().cnt;
690
+ if (staleCount > 0) {
691
+ lines.push(`${Y}DRIFT${X} ${staleCount} stale eval run(s) (>24h old)`);
692
+ const result = db.prepare(`UPDATE eval_runs SET status = 'failed', completed_at = datetime('now') WHERE status IN ('running', 'pending') AND created_at < datetime('now', '-24 hours')`).run();
693
+ lines.push(` ${G}HEALED${X} Marked ${result.changes} eval runs as failed`);
694
+ healedCount += result.changes;
695
+ issueCount += staleCount;
696
+ }
697
+ else {
698
+ lines.push(`${G}OK${X} No stale eval runs`);
699
+ }
700
+ }
701
+ catch {
702
+ lines.push(`${Y}SKIP${X} Could not check eval runs`);
703
+ }
704
+ // 3. DB size
705
+ const dbInfo = fs.statSync(dbPath);
706
+ const dbSizeMb = dbInfo.size / (1024 * 1024);
707
+ if (dbSizeMb > 500) {
708
+ lines.push(`${Y}DRIFT${X} Database is ${dbSizeMb.toFixed(1)} MB`);
709
+ try {
710
+ const cutoff = new Date(Date.now() - 90 * 24 * 3_600_000).toISOString();
711
+ const deleted = db.prepare(`DELETE FROM tool_call_log WHERE created_at < ?`).run(cutoff);
712
+ if (deleted.changes > 0) {
713
+ lines.push(` ${G}HEALED${X} Pruned ${deleted.changes} tool_call_log entries older than 90 days`);
714
+ healedCount++;
715
+ }
716
+ db.pragma("wal_checkpoint(TRUNCATE)");
717
+ lines.push(` ${G}HEALED${X} Ran WAL checkpoint`);
718
+ healedCount++;
719
+ }
720
+ catch { /* skip */ }
721
+ issueCount++;
722
+ }
723
+ else {
724
+ lines.push(`${G}OK${X} Database size: ${dbSizeMb.toFixed(1)} MB`);
725
+ }
726
+ // 4. Error rate
727
+ try {
728
+ const calls1h = db.prepare(`SELECT COUNT(*) as cnt FROM tool_call_log WHERE created_at > datetime('now', '-1 hour')`).get().cnt;
729
+ const errors1h = db.prepare(`SELECT COUNT(*) as cnt FROM tool_call_log WHERE result_status='error' AND created_at > datetime('now', '-1 hour')`).get().cnt;
730
+ const rate = calls1h > 0 ? (errors1h / calls1h * 100) : 0;
731
+ if (rate > 20 && calls1h > 5) {
732
+ lines.push(`${R}ALERT${X} Error rate ${rate.toFixed(1)}% in last hour (${errors1h}/${calls1h})`);
733
+ issueCount++;
734
+ }
735
+ else {
736
+ lines.push(`${G}OK${X} Error rate: ${rate.toFixed(1)}% (${errors1h}/${calls1h} in last hour)`);
737
+ }
738
+ }
739
+ catch {
740
+ lines.push(`${Y}SKIP${X} Could not check error rates`);
741
+ }
742
+ // 5. Embedding cache
743
+ const cachePath = path.join(dir, "embedding_cache.json");
744
+ if (fs.existsSync(cachePath)) {
745
+ const cacheAge = Math.round((Date.now() - fs.statSync(cachePath).mtimeMs) / 3_600_000);
746
+ if (cacheAge > 168) {
747
+ lines.push(`${Y}DRIFT${X} Embedding cache is ${cacheAge}h old (>7 days) — will refresh on next server start`);
748
+ issueCount++;
749
+ }
750
+ else {
751
+ lines.push(`${G}OK${X} Embedding cache: ${cacheAge}h old`);
752
+ }
753
+ }
754
+ else {
755
+ lines.push(`${Y}INFO${X} No embedding cache found (will build on first server start)`);
756
+ }
757
+ // Summary
758
+ lines.push("");
759
+ if (issueCount === 0) {
760
+ lines.push(`${G}${B}All clear${X} — no drift detected`);
761
+ }
762
+ else {
763
+ lines.push(`${B}Found ${issueCount} issue(s), healed ${healedCount}${X}`);
764
+ const remaining = issueCount - healedCount;
765
+ if (remaining > 0)
766
+ lines.push(`${Y}${remaining} issue(s) require manual attention${X}`);
767
+ }
768
+ db.close();
769
+ console.log(lines.join("\n"));
770
+ process.exit(0);
771
+ }
772
+ // ── Sync Configs CLI handler (run-and-exit) ─────────────────────────────
773
+ if (syncConfigsFlag) {
774
+ const os = await import("node:os");
775
+ const path = await import("node:path");
776
+ const fs = await import("node:fs");
777
+ const USE_COLOR = process.stdout.isTTY;
778
+ const B = USE_COLOR ? "\x1b[1m" : "";
779
+ const C = USE_COLOR ? "\x1b[36m" : "";
780
+ const G = USE_COLOR ? "\x1b[32m" : "";
781
+ const Y = USE_COLOR ? "\x1b[33m" : "";
782
+ const X = USE_COLOR ? "\x1b[0m" : "";
783
+ // Detect the nodebench-mcp entry point path
784
+ const entryPath = path.resolve(new URL(import.meta.url).pathname.replace(/^\/([A-Z]:)/, "$1") // fix Windows drive letter
785
+ );
786
+ // Build args array from current CLI flags (exclude --sync-configs and other run-and-exit flags)
787
+ const forwardArgs = [];
788
+ const skipNext = new Set(["--preset", "--toolsets", "--exclude", "--engine-secret"]);
789
+ const runAndExitFlags = new Set([
790
+ "--sync-configs", "--health", "--status", "--diagnose", "--stats",
791
+ "--export-stats", "--reset-stats", "--list-presets", "--smart-preset",
792
+ "--auto-preset", "--help",
793
+ ]);
794
+ for (let i = 0; i < cliArgs.length; i++) {
795
+ if (runAndExitFlags.has(cliArgs[i]))
796
+ continue;
797
+ if (cliArgs[i].startsWith("--explain"))
798
+ continue;
799
+ if (skipNext.has(cliArgs[i])) {
800
+ forwardArgs.push(cliArgs[i], cliArgs[i + 1] ?? "");
801
+ i++; // skip the value
802
+ continue;
803
+ }
804
+ forwardArgs.push(cliArgs[i]);
805
+ }
806
+ // Collect env vars that are currently set
807
+ const ENV_KEYS = [
808
+ "ANTHROPIC_API_KEY", "OPENAI_API_KEY", "GEMINI_API_KEY",
809
+ "GITHUB_TOKEN", "BROWSERBASE_API_KEY", "FIRECRAWL_API_KEY",
810
+ "SMTP_HOST", "SMTP_PORT", "SMTP_USER", "SMTP_PASS",
811
+ "IMAP_HOST", "IMAP_PORT", "IMAP_USER", "IMAP_PASS",
812
+ "ENGINE_SECRET",
813
+ ];
814
+ const envObj = {};
815
+ for (const key of ENV_KEYS) {
816
+ if (process.env[key])
817
+ envObj[key] = process.env[key];
818
+ }
819
+ // Build the MCP server config entry
820
+ const nodePath = process.execPath; // path to node binary
821
+ const serverEntry = {
822
+ command: nodePath,
823
+ args: [entryPath, ...forwardArgs],
824
+ ...(Object.keys(envObj).length > 0 ? { env: envObj } : {}),
825
+ };
826
+ // Helper: merge into existing config file (preserves other servers)
827
+ function mergeConfig(filePath, serverKey) {
828
+ let existing = {};
829
+ if (fs.existsSync(filePath)) {
830
+ try {
831
+ existing = JSON.parse(fs.readFileSync(filePath, "utf-8"));
832
+ }
833
+ catch {
834
+ // If file exists but is invalid JSON, back it up and start fresh
835
+ const backupPath = filePath + ".bak";
836
+ fs.copyFileSync(filePath, backupPath);
837
+ existing = {};
838
+ }
839
+ }
840
+ // Ensure mcpServers key exists
841
+ if (!existing.mcpServers || typeof existing.mcpServers !== "object") {
842
+ existing.mcpServers = {};
843
+ }
844
+ const servers = existing.mcpServers;
845
+ const hadExisting = !!servers[serverKey];
846
+ servers[serverKey] = serverEntry;
847
+ // Ensure parent directory exists
848
+ const dir = path.dirname(filePath);
849
+ if (!fs.existsSync(dir)) {
850
+ fs.mkdirSync(dir, { recursive: true });
851
+ }
852
+ fs.writeFileSync(filePath, JSON.stringify(existing, null, 2) + "\n", "utf-8");
853
+ return { action: hadExisting ? "updated" : "created", path: filePath };
854
+ }
855
+ const lines = [];
856
+ lines.push(`${B}NodeBench MCP — Sync IDE Configs${X}`);
857
+ lines.push("");
858
+ const results = [];
859
+ // 1. Claude Code: ~/.claude/claude_desktop_config.json
860
+ try {
861
+ const claudeConfigPath = path.join(os.homedir(), ".claude", "claude_desktop_config.json");
862
+ const r = mergeConfig(claudeConfigPath, "nodebench-mcp");
863
+ results.push({ name: "Claude Code", ...r });
864
+ }
865
+ catch (e) {
866
+ results.push({ name: "Claude Code", action: "failed", path: "", error: e.message });
867
+ }
868
+ // 2. Cursor: <project>/.cursor/mcp.json
869
+ try {
870
+ const cursorConfigPath = path.join(process.cwd(), ".cursor", "mcp.json");
871
+ const r = mergeConfig(cursorConfigPath, "nodebench-mcp");
872
+ results.push({ name: "Cursor", ...r });
873
+ }
874
+ catch (e) {
875
+ results.push({ name: "Cursor", action: "failed", path: "", error: e.message });
876
+ }
877
+ // 3. Windsurf: <project>/.windsurf/mcp.json
878
+ try {
879
+ const windsurfConfigPath = path.join(process.cwd(), ".windsurf", "mcp.json");
880
+ const r = mergeConfig(windsurfConfigPath, "nodebench-mcp");
881
+ results.push({ name: "Windsurf", ...r });
882
+ }
883
+ catch (e) {
884
+ results.push({ name: "Windsurf", action: "failed", path: "", error: e.message });
885
+ }
886
+ // Print results
887
+ for (const r of results) {
888
+ if (r.action === "failed") {
889
+ lines.push(`${Y}FAIL${X} ${r.name}: ${r.error}`);
890
+ }
891
+ else {
892
+ const icon = r.action === "created" ? `${G}NEW${X} ` : `${G}UPD${X} `;
893
+ lines.push(`${icon} ${r.name}: ${r.path}`);
894
+ }
895
+ }
896
+ // Print config summary
897
+ lines.push("");
898
+ lines.push(`${C}Config entry:${X}`);
899
+ lines.push(` command: ${nodePath}`);
900
+ lines.push(` args: [${[entryPath, ...forwardArgs].map(a => `"${a}"`).join(", ")}]`);
901
+ if (Object.keys(envObj).length > 0) {
902
+ lines.push(` env: ${Object.keys(envObj).join(", ")}`);
903
+ }
904
+ else {
905
+ lines.push(` env: ${Y}(none set)${X}`);
906
+ }
907
+ lines.push("");
908
+ const successCount = results.filter(r => r.action !== "failed").length;
909
+ lines.push(`${B}Written to ${successCount}/${results.length} locations${X}`);
910
+ console.log(lines.join("\n"));
911
+ process.exit(0);
912
+ }
913
+ // ── CLI subcommand detection ──────────────────────────────────────────
914
+ // First positional arg (not starting with --) is a subcommand
915
+ const subCmd = cliArgs.find(a => !a.startsWith("--") && !cliArgs.some((f, i) => f.startsWith("--") && cliArgs[i + 1] === a));
916
+ // ── Welcome screen (no arguments at all) ─────────────────────────────
917
+ if (cliArgs.length === 0 || (subCmd === undefined && !cliArgs.includes("--stdio") && !cliArgs.some(a => a.startsWith("--")))) {
918
+ const USE_COLOR = process.stdout.isTTY;
919
+ const B = USE_COLOR ? "\x1b[1m" : "";
920
+ const C = USE_COLOR ? "\x1b[36m" : "";
921
+ const G = USE_COLOR ? "\x1b[32m" : "";
922
+ const D = USE_COLOR ? "\x1b[2m" : "";
923
+ const Y = USE_COLOR ? "\x1b[33m" : "";
924
+ const X = USE_COLOR ? "\x1b[0m" : "";
925
+ const totalTools = Object.values(TOOLSET_MAP).reduce((s, v) => s + v.length, 0) + 12;
926
+ const domainCount = Object.keys(TOOLSET_MAP).length;
927
+ const welcome = [
928
+ "",
929
+ ` ${B}NodeBench AI${X} ${D}— The trust layer for agents${X}`,
930
+ "",
931
+ ` ${C}Quick start${X}`,
932
+ ` ${G}$${X} npx nodebench-mcp discover ${D}Show available tools${X}`,
933
+ ` ${G}$${X} npx nodebench-mcp demo ${D}Run a live demo (no keys needed)${X}`,
934
+ ` ${G}$${X} npx nodebench-mcp quickref research ${D}Get research workflow guide${X}`,
935
+ ` ${G}$${X} npx nodebench-mcp --explain run_recon ${D}Deep-dive on any tool${X}`,
936
+ "",
937
+ ` ${C}Connect to your IDE${X}`,
938
+ ` ${G}$${X} claude mcp add nodebench -- npx nodebench-mcp --stdio`,
939
+ ` ${G}$${X} npx nodebench-mcp --sync-configs ${D}Auto-write to Claude/Cursor/Windsurf${X}`,
940
+ "",
941
+ ` ${C}Start the MCP server${X}`,
942
+ ` ${G}$${X} npx nodebench-mcp --stdio ${D}Default preset${X}`,
943
+ ` ${G}$${X} npx nodebench-mcp --preset research ${D}Research workflows${X}`,
944
+ ` ${G}$${X} npx nodebench-mcp --auto-preset ${D}Detect from your project${X}`,
945
+ "",
946
+ ` ${Y}${totalTools} tools${X} ${D}·${X} ${Y}${domainCount} domains${X} ${D}· Progressive discovery · Agent-as-a-Graph${X}`,
947
+ "",
948
+ ];
949
+ console.log(welcome.join("\n"));
950
+ process.exit(0);
951
+ }
952
+ // ── Demo subcommand (run-and-exit) ───────────────────────────────────
953
+ if (subCmd === "demo") {
954
+ const USE_COLOR = process.stdout.isTTY;
955
+ const B = USE_COLOR ? "\x1b[1m" : "";
956
+ const C = USE_COLOR ? "\x1b[36m" : "";
957
+ const G = USE_COLOR ? "\x1b[32m" : "";
958
+ const D = USE_COLOR ? "\x1b[2m" : "";
959
+ const Y = USE_COLOR ? "\x1b[33m" : "";
960
+ const X = USE_COLOR ? "\x1b[0m" : "";
961
+ const demoLines = [];
962
+ demoLines.push("");
963
+ demoLines.push(` ${B}NodeBench AI — Live Demo${X}`);
964
+ demoLines.push(` ${D}No API keys needed. Everything runs locally.${X}`);
965
+ demoLines.push("");
966
+ // 1. Show research tools via hybridSearch
967
+ demoLines.push(` ${C}1. Discovering research tools...${X}`);
968
+ demoLines.push("");
969
+ const stubTools = ALL_REGISTRY_ENTRIES.map(e => ({ name: e.name, description: e.category }));
970
+ const researchResults = hybridSearch("research", stubTools, { limit: 5, mode: "hybrid" });
971
+ for (const r of researchResults.slice(0, 5)) {
972
+ const entry = TOOL_REGISTRY.get(r.name);
973
+ const phase = entry?.phase ?? "";
974
+ demoLines.push(` ${G}>${X} ${B}${r.name}${X} ${D}(${phase})${X}`);
975
+ if (entry?.quickRef?.nextAction) {
976
+ demoLines.push(` ${entry.quickRef.nextAction.slice(0, 80)}`);
977
+ }
978
+ }
979
+ demoLines.push("");
980
+ // 2. Show a workflow chain
981
+ demoLines.push(` ${C}2. Workflow chain: "Build a New Feature"${X}`);
982
+ demoLines.push("");
983
+ const chain = WORKFLOW_CHAINS["new_feature"];
984
+ if (chain) {
985
+ demoLines.push(` ${B}${chain.name}${X} ${D}— ${chain.description}${X}`);
986
+ demoLines.push("");
987
+ for (let i = 0; i < Math.min(chain.steps.length, 8); i++) {
988
+ const step = chain.steps[i];
989
+ const num = String(i + 1).padStart(2, " ");
990
+ demoLines.push(` ${Y}${num}.${X} ${step.tool} ${D}→ ${step.action}${X}`);
991
+ }
992
+ if (chain.steps.length > 8) {
993
+ demoLines.push(` ${D} ... +${chain.steps.length - 8} more steps${X}`);
994
+ }
995
+ }
996
+ demoLines.push("");
997
+ // 3. Summary stats
998
+ const totalTools = Object.values(TOOLSET_MAP).reduce((s, v) => s + v.length, 0) + 12;
999
+ const domainCount = Object.keys(TOOLSET_MAP).length;
1000
+ const chainCount = Object.keys(WORKFLOW_CHAINS).length;
1001
+ demoLines.push(` ${C}3. What's available${X}`);
1002
+ demoLines.push("");
1003
+ demoLines.push(` ${Y}${totalTools}${X} tools across ${Y}${domainCount}${X} domains`);
1004
+ demoLines.push(` ${Y}${chainCount}${X} pre-built workflow chains`);
1005
+ demoLines.push(` ${Y}${ALL_REGISTRY_ENTRIES.length}${X} entries in the tool registry`);
1006
+ demoLines.push("");
1007
+ // 4. Next steps
1008
+ demoLines.push(` ${C}Next steps${X}`);
1009
+ demoLines.push(` ${G}$${X} npx nodebench-mcp --explain run_recon ${D}Deep-dive on any tool${X}`);
1010
+ demoLines.push(` ${G}$${X} npx nodebench-mcp --health ${D}Check your environment${X}`);
1011
+ demoLines.push(` ${G}$${X} npx nodebench-mcp --sync-configs ${D}Wire into your IDE${X}`);
1012
+ demoLines.push("");
1013
+ console.log(demoLines.join("\n"));
1014
+ process.exit(0);
1015
+ }
1016
+ // ── Discover subcommand (run-and-exit) ───────────────────────────────
1017
+ if (subCmd === "discover") {
1018
+ const USE_COLOR = process.stdout.isTTY;
1019
+ const B = USE_COLOR ? "\x1b[1m" : "";
1020
+ const C = USE_COLOR ? "\x1b[36m" : "";
1021
+ const G = USE_COLOR ? "\x1b[32m" : "";
1022
+ const D = USE_COLOR ? "\x1b[2m" : "";
1023
+ const Y = USE_COLOR ? "\x1b[33m" : "";
1024
+ const X = USE_COLOR ? "\x1b[0m" : "";
1025
+ const query = cliArgs.find(a => a !== "discover" && !a.startsWith("--")) ?? "";
1026
+ const limit = 10;
1027
+ const lines = [];
1028
+ lines.push("");
1029
+ if (query) {
1030
+ lines.push(` ${B}Discovering tools for:${X} ${C}${query}${X}`);
1031
+ const stubTools = ALL_REGISTRY_ENTRIES.map(e => ({ name: e.name, description: e.category }));
1032
+ const results = hybridSearch(query, stubTools, { limit, mode: "hybrid" });
1033
+ lines.push("");
1034
+ for (const r of results) {
1035
+ const entry = TOOL_REGISTRY.get(r.name);
1036
+ lines.push(` ${G}>${X} ${B}${r.name}${X} ${D}score: ${r.score.toFixed(2)}${X}`);
1037
+ if (entry) {
1038
+ lines.push(` ${D}${entry.category} · ${entry.phase}${X}`);
1039
+ if (entry.quickRef?.nextAction)
1040
+ lines.push(` ${entry.quickRef.nextAction.slice(0, 90)}`);
1041
+ }
1042
+ lines.push("");
1043
+ }
1044
+ if (results.length === 0)
1045
+ lines.push(` ${Y}No results.${X} Try a broader query.\n`);
1046
+ }
1047
+ else {
1048
+ lines.push(` ${B}Tool domains${X} ${D}(${Object.keys(TOOLSET_MAP).length} domains)${X}`);
1049
+ lines.push("");
1050
+ for (const [domain, tools] of Object.entries(TOOLSET_MAP)) {
1051
+ lines.push(` ${G}>${X} ${domain.padEnd(24)} ${Y}${String(tools.length).padStart(3)}${X} tools`);
1052
+ }
1053
+ lines.push("");
1054
+ lines.push(` ${D}Search: npx nodebench-mcp discover <query>${X}`);
1055
+ }
1056
+ lines.push("");
1057
+ console.log(lines.join("\n"));
1058
+ process.exit(0);
1059
+ }
1060
+ // ── Quickref subcommand (run-and-exit) ───────────────────────────────
1061
+ if (subCmd === "quickref") {
1062
+ const USE_COLOR = process.stdout.isTTY;
1063
+ const B = USE_COLOR ? "\x1b[1m" : "";
1064
+ const C = USE_COLOR ? "\x1b[36m" : "";
1065
+ const G = USE_COLOR ? "\x1b[32m" : "";
1066
+ const D = USE_COLOR ? "\x1b[2m" : "";
1067
+ const Y = USE_COLOR ? "\x1b[33m" : "";
1068
+ const X = USE_COLOR ? "\x1b[0m" : "";
1069
+ const toolName = cliArgs.find(a => a !== "quickref" && !a.startsWith("--")) ?? "";
1070
+ const lines = [];
1071
+ lines.push("");
1072
+ if (!toolName) {
1073
+ lines.push(` ${B}Usage:${X} npx nodebench-mcp quickref <tool_or_workflow>`);
1074
+ lines.push("");
1075
+ lines.push(` ${C}Workflows${X}`);
1076
+ for (const [key, chain] of Object.entries(WORKFLOW_CHAINS).slice(0, 10)) {
1077
+ lines.push(` ${G}>${X} ${key.padEnd(28)} ${D}${chain.name}${X}`);
1078
+ }
1079
+ lines.push(` ${D} ... +${Object.keys(WORKFLOW_CHAINS).length - 10} more${X}`);
1080
+ lines.push("");
1081
+ }
1082
+ else {
1083
+ // Try workflow first
1084
+ const chain = WORKFLOW_CHAINS[toolName];
1085
+ if (chain) {
1086
+ lines.push(` ${B}${chain.name}${X} ${D}(${toolName})${X}`);
1087
+ lines.push(` ${chain.description}`);
1088
+ lines.push("");
1089
+ for (let i = 0; i < chain.steps.length; i++) {
1090
+ const step = chain.steps[i];
1091
+ lines.push(` ${Y}${String(i + 1).padStart(2)}.${X} ${step.tool} ${D}→ ${step.action}${X}`);
1092
+ }
1093
+ lines.push("");
1094
+ }
1095
+ else {
1096
+ // Try tool registry
1097
+ const entry = TOOL_REGISTRY.get(toolName);
1098
+ if (entry) {
1099
+ lines.push(` ${B}${entry.name}${X} ${D}(${entry.category}, ${entry.phase})${X}`);
1100
+ lines.push(` ${entry.quickRef.nextAction}`);
1101
+ if (entry.quickRef.tip)
1102
+ lines.push(` ${Y}Tip:${X} ${entry.quickRef.tip}`);
1103
+ if (entry.quickRef.nextTools.length > 0) {
1104
+ lines.push("");
1105
+ lines.push(` ${C}Next tools${X}`);
1106
+ for (const nt of entry.quickRef.nextTools)
1107
+ lines.push(` ${G}>${X} ${nt}`);
1108
+ }
1109
+ lines.push("");
1110
+ }
1111
+ else {
1112
+ lines.push(` ${Y}Not found:${X} ${toolName}`);
1113
+ lines.push(` ${D}Try: npx nodebench-mcp quickref new_feature${X}`);
1114
+ lines.push("");
1115
+ }
1116
+ }
1117
+ }
1118
+ console.log(lines.join("\n"));
1119
+ process.exit(0);
1120
+ }
1121
+ // ── Call subcommand (run-and-exit) ───────────────────────────────────
1122
+ if (subCmd === "call") {
1123
+ const toolName = cliArgs.find(a => a !== "call" && !a.startsWith("--") && !a.startsWith("{"));
1124
+ const argsJson = cliArgs.find(a => a.startsWith("{")) ?? "{}";
1125
+ const USE_COLOR = process.stdout.isTTY;
1126
+ const B = USE_COLOR ? "\x1b[1m" : "";
1127
+ const G = USE_COLOR ? "\x1b[32m" : "";
1128
+ const R = USE_COLOR ? "\x1b[31m" : "";
1129
+ const D = USE_COLOR ? "\x1b[2m" : "";
1130
+ const X = USE_COLOR ? "\x1b[0m" : "";
1131
+ if (!toolName) {
1132
+ console.log(`\n ${B}Usage:${X} npx nodebench-mcp call <tool_name> [json_args]\n`);
1133
+ console.log(` ${D}Example:${X} npx nodebench-mcp call founder_deep_context_gather '{"packetType":"weekly_reset"}'`);
1134
+ console.log(` ${D}Example:${X} npx nodebench-mcp call discover_tools '{"query":"founder"}'`);
1135
+ console.log(` ${D}Example:${X} npx nodebench-mcp call save_session_note '{"note":"test"}'\n`);
1136
+ process.exit(0);
1137
+ }
1138
+ // Find tool in all toolsets — meta/discovery tools are created later,
1139
+ // so for CLI call we build them inline
1140
+ const cliDomainTools = Object.values(TOOLSET_MAP).flat();
1141
+ const cliMetaTools = createMetaTools(cliDomainTools);
1142
+ const cliDiscoveryTools = createProgressiveDiscoveryTools(cliDomainTools);
1143
+ const allCallable = [...cliDomainTools, ...cliMetaTools, ...cliDiscoveryTools];
1144
+ const tool = allCallable.find(t => t.name === toolName);
1145
+ if (!tool) {
1146
+ console.log(`\n ${R}Tool not found:${X} ${toolName}`);
1147
+ console.log(` ${D}Run: npx nodebench-mcp discover ${toolName}${X}\n`);
1148
+ process.exit(1);
1149
+ }
1150
+ let parsedArgs;
1151
+ try {
1152
+ parsedArgs = JSON.parse(argsJson);
1153
+ }
1154
+ catch {
1155
+ console.log(`\n ${R}Invalid JSON args:${X} ${argsJson}\n`);
1156
+ process.exit(1);
1157
+ }
1158
+ console.log(`\n ${D}Calling${X} ${B}${toolName}${X} ${D}...${X}`);
1159
+ try {
1160
+ const result = await tool.handler(parsedArgs);
1161
+ const output = typeof result === "string" ? result : JSON.stringify(result, null, 2);
1162
+ console.log(`\n ${G}Result:${X}\n`);
1163
+ // Pretty-print, indent 4 spaces
1164
+ for (const line of output.split("\n")) {
1165
+ console.log(` ${line}`);
1166
+ }
1167
+ console.log("");
1168
+ }
1169
+ catch (err) {
1170
+ const msg = err instanceof Error ? err.message : String(err);
1171
+ console.log(`\n ${R}Error:${X} ${msg}\n`);
1172
+ process.exit(1);
1173
+ }
1174
+ process.exit(0);
1175
+ }
1176
+ // ── Setup subcommand (run-and-exit) ──────────────────────────────────
1177
+ if (subCmd === "setup") {
1178
+ const USE_COLOR = process.stdout.isTTY;
1179
+ const B = USE_COLOR ? "\x1b[1m" : "";
1180
+ const C = USE_COLOR ? "\x1b[36m" : "";
1181
+ const G = USE_COLOR ? "\x1b[32m" : "";
1182
+ const D = USE_COLOR ? "\x1b[2m" : "";
1183
+ const Y = USE_COLOR ? "\x1b[33m" : "";
1184
+ const X = USE_COLOR ? "\x1b[0m" : "";
1185
+ const lines = [];
1186
+ lines.push("");
1187
+ lines.push(` ${B}NodeBench MCP — Quick Setup${X}`);
1188
+ lines.push("");
1189
+ lines.push(` ${G}1.${X} ${B}Claude Code${X}`);
1190
+ lines.push(` claude mcp add nodebench -- npx -y nodebench-mcp`);
1191
+ lines.push("");
1192
+ lines.push(` ${G}2.${X} ${B}Cursor${X} ${D}(.cursor/mcp.json)${X}`);
1193
+ lines.push(` { "mcpServers": { "nodebench": { "command": "npx", "args": ["-y", "nodebench-mcp"] } } }`);
1194
+ lines.push("");
1195
+ lines.push(` ${G}3.${X} ${B}Windsurf${X} ${D}(.windsurf/mcp.json)${X}`);
1196
+ lines.push(` { "mcpServers": { "nodebench": { "command": "npx", "args": ["-y", "nodebench-mcp"] } } }`);
1197
+ lines.push("");
1198
+ lines.push(` ${C}Verify:${X} npx nodebench-mcp call discover_tools '{"query":"founder"}'`);
1199
+ lines.push(` ${C}Dashboard:${X} https://www.nodebenchai.com/founder`);
1200
+ lines.push(` ${C}Agent setup:${X} https://www.nodebenchai.com/agent-setup.txt`);
1201
+ lines.push("");
1202
+ lines.push(` ${Y}Presets:${X} --preset default (99 tools) | --preset full (313 tools)`);
1203
+ lines.push(` ${Y}Founder tools:${X} founder_deep_context_gather, founder_packet_validate, founder_packet_diff`);
1204
+ lines.push("");
1205
+ console.log(lines.join("\n"));
1206
+ process.exit(0);
1207
+ }
565
1208
  // Initialize DB (creates ~/.nodebench/ and schema on first run)
566
1209
  getDb();
567
1210
  // Wire up DB accessor for execution trace edges (avoids circular import)
@@ -956,42 +1599,42 @@ const dynamicLoadingTools = [
956
1599
  const db = getDb();
957
1600
  const detailed = args.detailed === true;
958
1601
  // Session-level aggregates by mode
959
- const sessionSummary = db.prepare(`
960
- SELECT
961
- mode,
962
- COUNT(*) as sessions,
963
- ROUND(AVG(initial_tool_count), 1) as avg_initial_tools,
964
- ROUND(AVG(COALESCE(final_tool_count, initial_tool_count)), 1) as avg_final_tools,
965
- ROUND(AVG(COALESCE(total_tool_calls, 0)), 1) as avg_tool_calls,
966
- ROUND(AVG(COALESCE(total_load_events, 0)), 1) as avg_load_events,
967
- ROUND(AVG(COALESCE(session_duration_ms, 0)) / 1000.0, 1) as avg_duration_sec,
968
- SUM(COALESCE(total_tool_calls, 0)) as total_calls,
969
- SUM(COALESCE(total_load_events, 0)) as total_loads
970
- FROM ab_test_sessions
971
- GROUP BY mode
1602
+ const sessionSummary = db.prepare(`
1603
+ SELECT
1604
+ mode,
1605
+ COUNT(*) as sessions,
1606
+ ROUND(AVG(initial_tool_count), 1) as avg_initial_tools,
1607
+ ROUND(AVG(COALESCE(final_tool_count, initial_tool_count)), 1) as avg_final_tools,
1608
+ ROUND(AVG(COALESCE(total_tool_calls, 0)), 1) as avg_tool_calls,
1609
+ ROUND(AVG(COALESCE(total_load_events, 0)), 1) as avg_load_events,
1610
+ ROUND(AVG(COALESCE(session_duration_ms, 0)) / 1000.0, 1) as avg_duration_sec,
1611
+ SUM(COALESCE(total_tool_calls, 0)) as total_calls,
1612
+ SUM(COALESCE(total_load_events, 0)) as total_loads
1613
+ FROM ab_test_sessions
1614
+ GROUP BY mode
972
1615
  `).all();
973
1616
  // Error rate by mode (join with tool_call_log)
974
- const errorRates = db.prepare(`
975
- SELECT
976
- s.mode,
977
- COUNT(CASE WHEN t.result_status = 'error' THEN 1 END) as errors,
978
- COUNT(*) as total_calls,
979
- ROUND(100.0 * COUNT(CASE WHEN t.result_status = 'error' THEN 1 END) / MAX(COUNT(*), 1), 2) as error_pct
980
- FROM tool_call_log t
981
- JOIN ab_test_sessions s ON t.session_id = s.id
982
- GROUP BY s.mode
1617
+ const errorRates = db.prepare(`
1618
+ SELECT
1619
+ s.mode,
1620
+ COUNT(CASE WHEN t.result_status = 'error' THEN 1 END) as errors,
1621
+ COUNT(*) as total_calls,
1622
+ ROUND(100.0 * COUNT(CASE WHEN t.result_status = 'error' THEN 1 END) / MAX(COUNT(*), 1), 2) as error_pct
1623
+ FROM tool_call_log t
1624
+ JOIN ab_test_sessions s ON t.session_id = s.id
1625
+ GROUP BY s.mode
983
1626
  `).all();
984
1627
  // Top loaded toolsets (dynamic mode)
985
- const topToolsets = db.prepare(`
986
- SELECT
987
- toolset_name,
988
- COUNT(*) as load_count,
989
- ROUND(AVG(latency_ms), 1) as avg_latency_ms
990
- FROM ab_tool_events
991
- WHERE event_type = 'load'
992
- GROUP BY toolset_name
993
- ORDER BY load_count DESC
994
- LIMIT 10
1628
+ const topToolsets = db.prepare(`
1629
+ SELECT
1630
+ toolset_name,
1631
+ COUNT(*) as load_count,
1632
+ ROUND(AVG(latency_ms), 1) as avg_latency_ms
1633
+ FROM ab_tool_events
1634
+ WHERE event_type = 'load'
1635
+ GROUP BY toolset_name
1636
+ ORDER BY load_count DESC
1637
+ LIMIT 10
995
1638
  `).all();
996
1639
  // Current session info
997
1640
  const currentSession = {
@@ -1007,13 +1650,13 @@ const dynamicLoadingTools = [
1007
1650
  // Optional per-session detail
1008
1651
  let sessions = [];
1009
1652
  if (detailed) {
1010
- sessions = db.prepare(`
1011
- SELECT id, mode, initial_preset, initial_tool_count, final_tool_count,
1012
- toolsets_loaded, total_tool_calls, total_load_events,
1013
- session_duration_ms, created_at, ended_at
1014
- FROM ab_test_sessions
1015
- ORDER BY created_at DESC
1016
- LIMIT 50
1653
+ sessions = db.prepare(`
1654
+ SELECT id, mode, initial_preset, initial_tool_count, final_tool_count,
1655
+ toolsets_loaded, total_tool_calls, total_load_events,
1656
+ session_duration_ms, created_at, ended_at
1657
+ FROM ab_test_sessions
1658
+ ORDER BY created_at DESC
1659
+ LIMIT 50
1017
1660
  `).all();
1018
1661
  }
1019
1662
  // Build verdict
@@ -1116,6 +1759,54 @@ const _hookState = {
1116
1759
  lastRefreshReminder: 0, // totalCalls at last reminder
1117
1760
  };
1118
1761
  const WEB_TOOL_NAMES = new Set(["web_search", "fetch_url"]);
1762
// ── Intent-based auto-expansion ─────────────────────────────────────────
// On the first tool call, classify intent from tool name + args keywords
// and auto-load relevant toolsets if running on the default preset.
// Zero-latency: pure keyword matching, no LLM calls. Runs once per session.
// NOTE(review): _intentClassified is declared here but is neither read nor
// set by classifyAndExpand below — presumably the call site uses it to
// enforce the "once per session" gate; confirm it is actually checked and
// flipped there.
let _intentClassified = false;
// Keyword → toolset routing table. Each entry pairs a case-insensitive regex
// (tested against "toolName + stringified args") with the toolset names worth
// auto-loading when it matches. Entry order fixes the order in which matched
// toolsets are collected, so keep it stable.
const INTENT_PATTERNS = [
    { pattern: /web|css|html|dom|seo|browser|page|viewport|screenshot|ui_capture|ui_ux/i, toolsets: ["ui_capture", "vision", "web", "seo", "git_workflow", "architect"] },
    { pattern: /research|paper|arxiv|scholar|literature|digest|brief|rss|feed/i, toolsets: ["web", "llm", "rss", "email", "docs"] },
    { pattern: /data|csv|sql|pandas|xlsx|json_parse|spreadsheet|parquet|parse/i, toolsets: ["local_file", "llm", "web"] },
    { pattern: /deploy|docker|k8s|kubernetes|ci|cd|pipeline|terraform|helm|infra/i, toolsets: ["git_workflow", "session_memory", "benchmark", "pattern"] },
    { pattern: /agent|swarm|orchestr|parallel|multi.?agent|spawn|coordinat/i, toolsets: ["parallel", "self_eval", "session_memory", "pattern", "toon"] },
    { pattern: /mobile|ios|android|react.?native|flutter|swift|kotlin/i, toolsets: ["ui_capture", "vision", "flicker_detection"] },
    { pattern: /academic|thesis|review|cite|biblio|latex|peer/i, toolsets: ["research_writing", "llm", "web", "local_file"] },
    { pattern: /content|publish|post|newsletter|email|campaign|linkedin/i, toolsets: ["llm", "critter", "email", "rss", "platform", "architect"] },
];
1777
/**
 * Intent classifier + auto-expander for the default preset.
 *
 * Matches the current tool call (name plus stringified argument keys and
 * string values) against INTENT_PATTERNS and activates every matched toolset
 * that exists in TOOLSET_MAP and is not already active. On a successful
 * expansion it rebuilds the domain/meta tool arrays, refreshes the combined
 * tool list, and notifies the MCP client that the tool list changed.
 *
 * @param {string} toolName - Name of the tool being invoked.
 * @param {Record<string, unknown>|null|undefined} args - Tool arguments; only
 *   keys and string values contribute to the keyword haystack.
 * @returns {string[]|null} Names of the toolsets just loaded, or null when the
 *   preset is not "default" or no new toolset matched.
 */
function classifyAndExpand(toolName, args) {
    // Respect an explicitly chosen preset — auto-expansion is default-only.
    if (currentPreset !== "default") {
        return null;
    }
    // Fold the tool name plus each arg key (and string values) into a single
    // search string for the intent regexes.
    const argText = args
        ? Object.entries(args)
            .map(([key, value]) => `${key} ${typeof value === "string" ? value : ""}`)
            .join(" ")
        : "";
    const haystack = `${toolName} ${argText}`;
    // Gather every matching toolset that is known and not yet active.
    const newlyMatched = new Set();
    for (const { pattern, toolsets } of INTENT_PATTERNS) {
        if (!pattern.test(haystack)) {
            continue;
        }
        for (const candidate of toolsets) {
            if (TOOLSET_MAP[candidate] && !activeToolsets.has(candidate)) {
                newlyMatched.add(candidate);
            }
        }
    }
    if (newlyMatched.size === 0) {
        return null;
    }
    // Activate the matches, then rebuild the derived tool arrays from the
    // full active set so ordering stays consistent with activeToolsets.
    for (const name of newlyMatched) {
        activeToolsets.add(name);
    }
    domainTools = [...activeToolsets].flatMap((key) => TOOLSET_MAP[key] ?? []);
    allToolsWithoutDiscovery = [...domainTools, ...createMetaTools(domainTools)];
    rebuildAllTools();
    // Best-effort notification — a delivery failure must not break the call.
    server.notification({ method: "notifications/tools/list_changed" }).catch(() => { });
    return [...newlyMatched];
}
1119
1810
  const SAVE_TOOL_NAMES = new Set(["save_session_note", "record_learning"]);
1120
1811
  const REFRESH_INTERVAL = 30; // remind after every 30 calls
1121
1812
  function getHookHint(toolName) {
@@ -1149,42 +1840,93 @@ const PROMPTS = [
1149
1840
  role: "user",
1150
1841
  content: {
1151
1842
  type: "text",
1152
- text: `You are connected to NodeBench MCP — tools that make you catch the bugs you'd normally ship.
1153
-
1154
- WHAT THIS DOES:
1155
- In benchmarks across 9 real production prompts, agents with NodeBench MCP caught 13 issues (4 HIGH severity)
1156
- that bare agents shipped to production. 26 blind spots prevented. Knowledge compounds — by task 9,
1157
- the agent finds 2+ prior findings before writing a single line of code.
1158
-
1159
- HOW IT WORKS:
1160
- Every task follows a pipeline: Research → Risk → Implement → Test (3 layers) → Eval → Gate → Learn → Ship.
1161
- Each step produces a concrete artifact (an issue found, a regression guarded, a pattern banked) that
1162
- compounds into future tasks.
1163
-
1164
- FIRST TIME? Run these 3 steps:
1165
- 1. Call bootstrap_project to register your project (tech stack, architecture, conventions)
1166
- 2. Call getMethodology("overview") to see all available methodologies
1167
- 3. Call search_all_knowledge("your current task") before starting any work
1168
-
1169
- RETURNING? Your project context and all past learnings are persisted. Start with:
1170
- 1. Call search_all_knowledge with your current task
1171
- 2. Follow the methodology tools as you work — they'll guide you step by step
1172
-
1173
- KEY TOOLS:
1174
- - search_all_knowledge — Search prior findings before starting (avoid repeating past mistakes)
1175
- - run_mandatory_flywheel — 6-step minimum verification before declaring work done
1176
- - getMethodology — Step-by-step guides for verification, eval, flywheel, recon
1177
- - findTools — Discover tools by keyword or category
1178
- - assess_risk — Assess risk before acting (HIGH = needs confirmation)
1179
-
1180
- PARALLEL AGENTS? If using Claude Code subagents or multiple terminals:
1181
- - claim_agent_task / release_agent_task — Lock tasks to prevent duplicate work
1182
- - get_parallel_status — See what all agents are doing
1843
+ text: `You are connected to NodeBench MCP — tools that make you catch the bugs you'd normally ship.
1844
+
1845
+ WHAT THIS DOES:
1846
+ In benchmarks across 9 real production prompts, agents with NodeBench MCP caught 13 issues (4 HIGH severity)
1847
+ that bare agents shipped to production. 26 blind spots prevented. Knowledge compounds — by task 9,
1848
+ the agent finds 2+ prior findings before writing a single line of code.
1849
+
1850
+ HOW IT WORKS:
1851
+ Every task follows a pipeline: Research → Risk → Implement → Test (3 layers) → Eval → Gate → Learn → Ship.
1852
+ Each step produces a concrete artifact (an issue found, a regression guarded, a pattern banked) that
1853
+ compounds into future tasks.
1854
+
1855
+ FIRST TIME? Run these 3 steps:
1856
+ 1. Call bootstrap_project to register your project (tech stack, architecture, conventions)
1857
+ 2. Call getMethodology("overview") to see all available methodologies
1858
+ 3. Call search_all_knowledge("your current task") before starting any work
1859
+
1860
+ RETURNING? Your project context and all past learnings are persisted. Start with:
1861
+ 1. Call search_all_knowledge with your current task
1862
+ 2. Follow the methodology tools as you work — they'll guide you step by step
1863
+
1864
+ KEY TOOLS:
1865
+ - search_all_knowledge — Search prior findings before starting (avoid repeating past mistakes)
1866
+ - run_mandatory_flywheel — 6-step minimum verification before declaring work done
1867
+ - getMethodology — Step-by-step guides for verification, eval, flywheel, recon
1868
+ - findTools — Discover tools by keyword or category
1869
+ - assess_risk — Assess risk before acting (HIGH = needs confirmation)
1870
+
1871
+ PARALLEL AGENTS? If using Claude Code subagents or multiple terminals:
1872
+ - claim_agent_task / release_agent_task — Lock tasks to prevent duplicate work
1873
+ - get_parallel_status — See what all agents are doing
1183
1874
  - Use the "claude-code-parallel" prompt for step-by-step guidance`,
1184
1875
  },
1185
1876
  },
1186
1877
  ],
1187
1878
  },
1879
+ {
1880
+ name: "execution-trace-workflow",
1881
+ description: "Start and maintain a traceable execution run. Use this for any workflow that needs receipts, evidence, decisions, verification, approvals, and a durable audit trail.",
1882
+ arguments: [
1883
+ {
1884
+ name: "workflowTitle",
1885
+ description: "Human-readable title for the run",
1886
+ required: true,
1887
+ },
1888
+ {
1889
+ name: "workflowGoal",
1890
+ description: "What the workflow must accomplish",
1891
+ required: true,
1892
+ },
1893
+ {
1894
+ name: "workflowType",
1895
+ description: "Optional workflow label such as spreadsheet_enrichment or company_direction_analysis",
1896
+ required: false,
1897
+ },
1898
+ ],
1899
+ messages: (args) => [
1900
+ {
1901
+ role: "user",
1902
+ content: {
1903
+ type: "text",
1904
+ text: `Run this task as a fully traceable execution workflow.
1905
+
1906
+ Title: ${args.workflowTitle}
1907
+ Goal: ${args.workflowGoal}
1908
+ Workflow type: ${args.workflowType || "execution_trace"}
1909
+
1910
+ Required operating loop:
1911
+ 1. Call start_execution_run first. Create one durable run before doing substantive work.
1912
+ 2. Record every meaningful action with record_execution_step. Do this for inspect, research, edit, verify, export, and issue-fix steps.
1913
+ 3. Attach evidence as you go with attach_execution_evidence. Store URLs, uploaded files, renders, screenshots, logs, and notes.
1914
+ 4. Record explicit choices with record_execution_decision. Capture alternatives considered, evidence basis, confidence, and limitations. Do not expose raw chain-of-thought.
1915
+ 5. Record QA checks with record_execution_verification. Use this for render checks, formula checks, diff checks, replay checks, or artifact integrity checks.
1916
+ 6. If a risky action needs human sign-off, call request_execution_approval before proceeding.
1917
+ 7. Finish with complete_execution_run and set the final status plus any drift summary if applicable.
1918
+
1919
+ Trace standard:
1920
+ - Facts and outputs must be evidence-grounded.
1921
+ - Decisions must separate verified evidence from inference.
1922
+ - Verification must explain what was checked and what passed or failed.
1923
+ - Limitations must be explicit instead of implied.
1924
+
1925
+ Do not treat the trace as optional. The run should be inspectable after completion by an operator who was not present during execution.`,
1926
+ },
1927
+ },
1928
+ ],
1929
+ },
1188
1930
  {
1189
1931
  name: "project-setup",
1190
1932
  description: "Guided project bootstrapping. Walks you through registering project context so the MCP has full project awareness.",
@@ -1200,21 +1942,154 @@ PARALLEL AGENTS? If using Claude Code subagents or multiple terminals:
1200
1942
  role: "user",
1201
1943
  content: {
1202
1944
  type: "text",
1203
- text: `Help me set up NodeBench methodology tracking for project: ${args.projectName}
1204
-
1205
- Please gather and record the following using the bootstrap_project tool:
1206
- 1. Tech stack (languages, frameworks, runtimes)
1207
- 2. Key dependency versions
1208
- 3. Architecture overview
1209
- 4. Build/test commands
1210
- 5. Known conventions or patterns
1211
- 6. Repository structure highlights
1212
-
1945
+ text: `Help me set up NodeBench methodology tracking for project: ${args.projectName}
1946
+
1947
+ Please gather and record the following using the bootstrap_project tool:
1948
+ 1. Tech stack (languages, frameworks, runtimes)
1949
+ 2. Key dependency versions
1950
+ 3. Architecture overview
1951
+ 4. Build/test commands
1952
+ 5. Known conventions or patterns
1953
+ 6. Repository structure highlights
1954
+
1213
1955
  After bootstrapping, run a reconnaissance session with run_recon to check for latest updates on the project's key frameworks and SDKs.`,
1214
1956
  },
1215
1957
  },
1216
1958
  ],
1217
1959
  },
1960
+ {
1961
+ name: "spreadsheet-enrichment-trace",
1962
+ description: "Traceable workflow for spreadsheet enrichment: inspect workbook, research supporting evidence, edit cells, verify render/calculation quality, and export with receipts.",
1963
+ arguments: [
1964
+ {
1965
+ name: "fileUri",
1966
+ description: "Input spreadsheet path or URI",
1967
+ required: true,
1968
+ },
1969
+ {
1970
+ name: "goal",
1971
+ description: "What the spreadsheet workflow should achieve",
1972
+ required: true,
1973
+ },
1974
+ ],
1975
+ messages: (args) => [
1976
+ {
1977
+ role: "user",
1978
+ content: {
1979
+ type: "text",
1980
+ text: `Run a traceable spreadsheet-enrichment workflow.
1981
+
1982
+ Input spreadsheet: ${args.fileUri}
1983
+ Goal: ${args.goal}
1984
+
1985
+ Workflow:
1986
+ 1. Start a run with start_execution_run using workflowName="spreadsheet_enrichment".
1987
+ 2. Inspect workbook structure, layout, formulas, and formatting. Record this with record_execution_step.
1988
+ 3. Attach the workbook and any rendered images as evidence with attach_execution_evidence.
1989
+ 4. If public research is needed, attach source URLs and record the evidence boundary.
1990
+ 5. Record major ranking or editing choices with record_execution_decision. Include alternatives considered and any unsupported claims.
1991
+ 6. Perform edits. Record the edit step and attach output artifacts or before/after references.
1992
+ 7. Verify the workbook. Record calculation checks, render checks, formatting checks, link cleanup, and export checks with record_execution_verification.
1993
+ 8. Complete the run only after the workbook is exported and the final verification state is known.
1994
+
1995
+ Required output discipline:
1996
+ - Make changed cells traceable.
1997
+ - Distinguish verified facts from inferred recommendations.
1998
+ - Record any formatting or hyperlink cleanup as explicit fix steps.
1999
+ - Leave behind enough evidence for another operator to replay what happened.`,
2000
+ },
2001
+ },
2002
+ ],
2003
+ },
2004
+ {
2005
+ name: "company-direction-analysis-trace",
2006
+ description: "Traceable workflow for capability-to-product-direction analysis grounded in public evidence, credibility filters, and phased recommendations.",
2007
+ arguments: [
2008
+ {
2009
+ name: "subjectCompany",
2010
+ description: "Company being evaluated",
2011
+ required: true,
2012
+ },
2013
+ {
2014
+ name: "strategicQuestion",
2015
+ description: "The product-direction or capability question being answered",
2016
+ required: true,
2017
+ },
2018
+ ],
2019
+ messages: (args) => [
2020
+ {
2021
+ role: "user",
2022
+ content: {
2023
+ type: "text",
2024
+ text: `Run a traceable company-direction analysis.
2025
+
2026
+ Subject company: ${args.subjectCompany}
2027
+ Strategic question: ${args.strategicQuestion}
2028
+
2029
+ Required method:
2030
+ 1. Start a run with start_execution_run using workflowName="company_direction_analysis".
2031
+ 2. Gather public evidence first. Attach company pages, press, resumes, hiring signals, papers, and adjacent market references as evidence.
2032
+ 3. Call compute_dimension_profile as soon as you have enough evidence to ground the company state. Then use export_dimension_bundle to inspect the regime label, policy context, evidence rows, and interaction effects.
2033
+ 4. Record a decision boundary between:
2034
+ - publicly supported facts
2035
+ - supported but incomplete claims
2036
+ - not established by public evidence
2037
+ 5. Build a credibility filter and a dimension-aware regime summary. Record explicit decisions for high-credibility, medium-credibility, and low-credibility directions, and tie them to capital, capability, network, market, operations, and narrative dimensions where relevant.
2038
+ 6. Record the final recommendation as a structured decision with alternatives considered, evidence basis, confidence, limitations, and the regime you believe the company is operating under.
2039
+ 7. Record at least one verification step that checks the final memo still reflects the truth boundary, the exported dimension bundle, and does not overclaim pedigree.
2040
+ 8. Complete the run after the recommendation, limitations, evidence links, and dimension bundle references are all attached.
2041
+
2042
+ Output rules:
2043
+ - Recommendations must stay adjacent to reputation and public proof.
2044
+ - Unsupported claims must be clearly labeled as unsupported.
2045
+ - Distinguish verified, estimated, inferred, and unavailable dimension signals.
2046
+ - The trace should let another operator audit why a direction was recommended or rejected.`,
2047
+ },
2048
+ },
2049
+ ],
2050
+ },
2051
+ {
2052
+ name: "agent-delegation-with-approval-trace",
2053
+ description: "Traceable workflow for delegated agent work with approval gates. Use this when a capable agent can operate, but risky actions still need scoped human sign-off.",
2054
+ arguments: [
2055
+ {
2056
+ name: "task",
2057
+ description: "Delegated task description",
2058
+ required: true,
2059
+ },
2060
+ {
2061
+ name: "riskLevel",
2062
+ description: "Expected risk level: low, medium, or high",
2063
+ required: true,
2064
+ },
2065
+ ],
2066
+ messages: (args) => [
2067
+ {
2068
+ role: "user",
2069
+ content: {
2070
+ type: "text",
2071
+ text: `Run a delegated agent workflow with explicit approval boundaries.
2072
+
2073
+ Task: ${args.task}
2074
+ Risk level: ${args.riskLevel}
2075
+
2076
+ Required process:
2077
+ 1. Start a run with start_execution_run using workflowName="agent_delegation".
2078
+ 2. Record the initial scope, intended tools, and expected outputs with record_execution_step.
2079
+ 3. Attach inputs, policies, and constraints as evidence.
2080
+ 4. Record any material choice or plan update with record_execution_decision.
2081
+ 5. Before any externally visible, destructive, or high-risk action, call request_execution_approval.
2082
+ 6. Only continue after the approval state is known, and record the resulting step explicitly.
2083
+ 7. Record verification that the final output stayed inside scope and honored the approval boundary.
2084
+ 8. Complete the run with the final status and limitations.
2085
+
2086
+ Trust requirements:
2087
+ - The operator must be able to see what was attempted, what required approval, and what evidence justified the action.
2088
+ - Do not hide uncertainty or skipped approvals inside prose summaries.`,
2089
+ },
2090
+ },
2091
+ ],
2092
+ },
1218
2093
  {
1219
2094
  name: "ui-qa-checklist",
1220
2095
  description: "UI/UX QA checklist for frontend implementations. Run after any change that touches React components, layouts, or interactions. Guides the agent through component tests, accessibility, responsive checks, and E2E validation.",
@@ -1230,33 +2105,33 @@ After bootstrapping, run a reconnaissance session with run_recon to check for la
1230
2105
  role: "user",
1231
2106
  content: {
1232
2107
  type: "text",
1233
- text: `You just implemented UI changes to: ${args.componentName}
1234
-
1235
- Before declaring this work done, run the UI/UX QA checklist:
1236
-
1237
- 1. COMPONENT TESTS: Run \`npm run test:run\` — all component tests must pass
1238
- 2. STORYBOOK: Run \`npm run storybook\` — verify the component renders in isolation
1239
- 3. RESPONSIVE: Check at 375px, 768px, 1280px — layout must not break
1240
- 4. ACCESSIBILITY: Tab through the UI, check aria-labels, run Storybook a11y panel
1241
- 5. STATES: Verify loading, error, and empty states are handled
1242
- 6. CONSOLE: Check browser devtools for errors/warnings
1243
- 7. CAPTURE: Call capture_responsive_suite(url, label) to screenshot at 3 breakpoints
1244
- 8. E2E: Run \`npm run test:e2e\` if relevant tests exist
1245
- 9. LIGHTHOUSE: Run \`npm run perf:lighthouse\` for performance + accessibility scores
1246
-
1247
- After checking each item, record results:
1248
- call get_gate_preset("ui_ux_qa") to see the 8 evaluation rules
1249
- evaluate each rule against ${args.componentName}
1250
- call run_quality_gate(gateName: "ui_ux_qa", rules: [{name, passed}, ...]) with your boolean results
1251
- call record_learning for any UI gotchas discovered
1252
-
1253
- For the full step-by-step methodology, call getMethodology("ui_ux_qa").
1254
-
1255
- Commands available:
1256
- npm run test:run — Vitest component tests
1257
- npm run test:e2e — Playwright E2E tests
1258
- npm run storybook — Storybook dev server (port 6006)
1259
- npm run perf:lighthouse — Lighthouse audit
2108
+ text: `You just implemented UI changes to: ${args.componentName}
2109
+
2110
+ Before declaring this work done, run the UI/UX QA checklist:
2111
+
2112
+ 1. COMPONENT TESTS: Run \`npm run test:run\` — all component tests must pass
2113
+ 2. STORYBOOK: Run \`npm run storybook\` — verify the component renders in isolation
2114
+ 3. RESPONSIVE: Check at 375px, 768px, 1280px — layout must not break
2115
+ 4. ACCESSIBILITY: Tab through the UI, check aria-labels, run Storybook a11y panel
2116
+ 5. STATES: Verify loading, error, and empty states are handled
2117
+ 6. CONSOLE: Check browser devtools for errors/warnings
2118
+ 7. CAPTURE: Call capture_responsive_suite(url, label) to screenshot at 3 breakpoints
2119
+ 8. E2E: Run \`npm run test:e2e\` if relevant tests exist
2120
+ 9. LIGHTHOUSE: Run \`npm run perf:lighthouse\` for performance + accessibility scores
2121
+
2122
+ After checking each item, record results:
2123
+ call get_gate_preset("ui_ux_qa") to see the 8 evaluation rules
2124
+ evaluate each rule against ${args.componentName}
2125
+ call run_quality_gate(gateName: "ui_ux_qa", rules: [{name, passed}, ...]) with your boolean results
2126
+ call record_learning for any UI gotchas discovered
2127
+
2128
+ For the full step-by-step methodology, call getMethodology("ui_ux_qa").
2129
+
2130
+ Commands available:
2131
+ npm run test:run — Vitest component tests
2132
+ npm run test:e2e — Playwright E2E tests
2133
+ npm run storybook — Storybook dev server (port 6006)
2134
+ npm run perf:lighthouse — Lighthouse audit
1260
2135
  npm run perf:bundle — Bundle size analysis`,
1261
2136
  },
1262
2137
  },
@@ -1284,47 +2159,47 @@ Commands available:
1284
2159
  role: "user",
1285
2160
  content: {
1286
2161
  type: "text",
1287
- text: `You are coordinating a parallel agent team for: ${args.projectGoal}
1288
-
1289
- This follows the pattern from Anthropic's "Building a C Compiler with Parallel Claudes" (Feb 2026).
1290
- Reference: https://www.anthropic.com/engineering/building-c-compiler
1291
-
1292
- SETUP (run these in order):
1293
-
1294
- 1. ORIENT — Check what's already happening:
1295
- call get_parallel_status({ includeHistory: true })
1296
- call list_agent_tasks({ status: "all" })
1297
-
1298
- 2. PLAN ROLES — Assign ${agentCount} specialized agents:
1299
- Recommended role split for ${agentCount} agents:
1300
- ${agentCount >= 4 ? `- Agent 1: assign_agent_role({ role: "implementer", focusArea: "core features" })
1301
- - Agent 2: assign_agent_role({ role: "test_writer", focusArea: "test coverage" })
1302
- - Agent 3: assign_agent_role({ role: "code_quality_critic", focusArea: "refactoring" })
2162
+ text: `You are coordinating a parallel agent team for: ${args.projectGoal}
2163
+
2164
+ This follows the pattern from Anthropic's "Building a C Compiler with Parallel Claudes" (Feb 2026).
2165
+ Reference: https://www.anthropic.com/engineering/building-c-compiler
2166
+
2167
+ SETUP (run these in order):
2168
+
2169
+ 1. ORIENT — Check what's already happening:
2170
+ call get_parallel_status({ includeHistory: true })
2171
+ call list_agent_tasks({ status: "all" })
2172
+
2173
+ 2. PLAN ROLES — Assign ${agentCount} specialized agents:
2174
+ Recommended role split for ${agentCount} agents:
2175
+ ${agentCount >= 4 ? `- Agent 1: assign_agent_role({ role: "implementer", focusArea: "core features" })
2176
+ - Agent 2: assign_agent_role({ role: "test_writer", focusArea: "test coverage" })
2177
+ - Agent 3: assign_agent_role({ role: "code_quality_critic", focusArea: "refactoring" })
1303
2178
  - Agent 4: assign_agent_role({ role: "documentation_maintainer", focusArea: "docs and progress" })` :
1304
- `- Agent 1: assign_agent_role({ role: "implementer" })
1305
- - Agent 2: assign_agent_role({ role: "test_writer" })`}
1306
-
1307
- 3. BREAK DOWN WORK — Create task claims:
1308
- For each independent piece of work:
1309
- call claim_agent_task({ taskKey: "descriptive_snake_case", description: "What to do" })
1310
-
1311
- 4. WORK LOOP (each agent independently):
1312
- a. claim_agent_task — Lock your task
1313
- b. Do the work (implement, test, review)
1314
- c. log_context_budget — Track context usage, avoid pollution
1315
- d. run_oracle_comparison — Validate output against known-good reference
1316
- e. release_agent_task — Release with progress note
1317
- f. Pick next task (repeat)
1318
-
1319
- 5. ANTI-PATTERNS TO AVOID:
1320
- - Two agents working on the same task (always claim first)
1321
- - Dumping thousands of lines of test output (log to file, print summary)
1322
- - Spending hours on one stuck problem (mark as blocked, move on)
1323
- - Overwriting each other's changes (commit frequently, pull before push)
1324
-
1325
- KEY INSIGHT from Anthropic: When all agents get stuck on the same bug (like compiling the Linux kernel),
1326
- use oracle-based testing to split the problem into independent sub-problems that each agent can solve in parallel.
1327
-
2179
+ `- Agent 1: assign_agent_role({ role: "implementer" })
2180
+ - Agent 2: assign_agent_role({ role: "test_writer" })`}
2181
+
2182
+ 3. BREAK DOWN WORK — Create task claims:
2183
+ For each independent piece of work:
2184
+ call claim_agent_task({ taskKey: "descriptive_snake_case", description: "What to do" })
2185
+
2186
+ 4. WORK LOOP (each agent independently):
2187
+ a. claim_agent_task — Lock your task
2188
+ b. Do the work (implement, test, review)
2189
+ c. log_context_budget — Track context usage, avoid pollution
2190
+ d. run_oracle_comparison — Validate output against known-good reference
2191
+ e. release_agent_task — Release with progress note
2192
+ f. Pick next task (repeat)
2193
+
2194
+ 5. ANTI-PATTERNS TO AVOID:
2195
+ - Two agents working on the same task (always claim first)
2196
+ - Dumping thousands of lines of test output (log to file, print summary)
2197
+ - Spending hours on one stuck problem (mark as blocked, move on)
2198
+ - Overwriting each other's changes (commit frequently, pull before push)
2199
+
2200
+ KEY INSIGHT from Anthropic: When all agents get stuck on the same bug (like compiling the Linux kernel),
2201
+ use oracle-based testing to split the problem into independent sub-problems that each agent can solve in parallel.
2202
+
1328
2203
  For the full methodology: call getMethodology("parallel_agent_teams")`,
1329
2204
  },
1330
2205
  },
@@ -1351,45 +2226,45 @@ For the full methodology: call getMethodology("parallel_agent_teams")`,
1351
2226
  role: "user",
1352
2227
  content: {
1353
2228
  type: "text",
1354
- text: `Set up oracle-based testing for: ${args.componentName}
1355
- Oracle source: ${args.oracleSource}
1356
-
1357
- This follows the pattern from Anthropic's C Compiler project where GCC served as a
1358
- "known-good compiler oracle" to identify which specific files were broken.
1359
-
1360
- SETUP:
1361
-
1362
- 1. DEFINE ORACLE — Capture known-good reference outputs:
1363
- Run the reference implementation (${args.oracleSource}) on each test input.
1364
- Save outputs as golden files or capture them in the oracle comparison tool.
1365
-
1366
- 2. RUN COMPARISONS — For each test case:
1367
- call run_oracle_comparison({
1368
- testLabel: "${args.componentName}_test_1",
1369
- actualOutput: "<your implementation's output>",
1370
- expectedOutput: "<oracle's output>",
1371
- oracleSource: "${args.oracleSource}"
1372
- })
1373
-
1374
- 3. TRIAGE FAILURES — Review diff summaries:
1375
- Each failing comparison is an independent work item.
1376
- Assign each to a different parallel agent via claim_agent_task.
1377
-
1378
- 4. BINARY SEARCH (for complex failures):
1379
- If a test passes individually but fails when combined with others,
1380
- use delta debugging: split the test set in half, test each half,
1381
- narrow down to the minimal failing combination.
1382
- (This is how Anthropic found pairs of files that failed together but worked independently.)
1383
-
1384
- 5. TRACK PROGRESS — Monitor convergence:
1385
- call get_parallel_status to see how many oracle tests are still failing.
1386
- As agents fix failures, the match percentage should trend toward 100%.
1387
-
1388
- CONTEXT BUDGET TIP: Large test outputs pollute context. Instead of printing full output,
1389
- call log_context_budget to track usage and only show diff summaries (first 20 differing lines).
1390
-
1391
- After all oracle tests pass:
1392
- call record_learning with patterns discovered
2229
+ text: `Set up oracle-based testing for: ${args.componentName}
2230
+ Oracle source: ${args.oracleSource}
2231
+
2232
+ This follows the pattern from Anthropic's C Compiler project where GCC served as a
2233
+ "known-good compiler oracle" to identify which specific files were broken.
2234
+
2235
+ SETUP:
2236
+
2237
+ 1. DEFINE ORACLE — Capture known-good reference outputs:
2238
+ Run the reference implementation (${args.oracleSource}) on each test input.
2239
+ Save outputs as golden files or capture them in the oracle comparison tool.
2240
+
2241
+ 2. RUN COMPARISONS — For each test case:
2242
+ call run_oracle_comparison({
2243
+ testLabel: "${args.componentName}_test_1",
2244
+ actualOutput: "<your implementation's output>",
2245
+ expectedOutput: "<oracle's output>",
2246
+ oracleSource: "${args.oracleSource}"
2247
+ })
2248
+
2249
+ 3. TRIAGE FAILURES — Review diff summaries:
2250
+ Each failing comparison is an independent work item.
2251
+ Assign each to a different parallel agent via claim_agent_task.
2252
+
2253
+ 4. BINARY SEARCH (for complex failures):
2254
+ If a test passes individually but fails when combined with others,
2255
+ use delta debugging: split the test set in half, test each half,
2256
+ narrow down to the minimal failing combination.
2257
+ (This is how Anthropic found pairs of files that failed together but worked independently.)
2258
+
2259
+ 5. TRACK PROGRESS — Monitor convergence:
2260
+ call get_parallel_status to see how many oracle tests are still failing.
2261
+ As agents fix failures, the match percentage should trend toward 100%.
2262
+
2263
+ CONTEXT BUDGET TIP: Large test outputs pollute context. Instead of printing full output,
2264
+ call log_context_budget to track usage and only show diff summaries (first 20 differing lines).
2265
+
2266
+ After all oracle tests pass:
2267
+ call record_learning with patterns discovered
1393
2268
  call run_mandatory_flywheel to verify the full change`,
1394
2269
  },
1395
2270
  },
@@ -1417,67 +2292,67 @@ After all oracle tests pass:
1417
2292
  role: "user",
1418
2293
  content: {
1419
2294
  type: "text",
1420
- text: `You are coordinating ${count} parallel Claude Code subagents for: ${args.taskDescription}
1421
-
1422
- ## How This Works
1423
-
1424
- Claude Code's Task tool spawns subagents — each is an independent Claude instance with its own
1425
- context window. NodeBench MCP tools coordinate them via a shared SQLite database.
1426
-
1427
- **Your role: COORDINATOR.** You break work into independent tasks and spawn subagents.
1428
- **Subagent role: WORKER.** Each claims a task, does work, releases with a progress note.
1429
-
1430
- ## Step-by-Step
1431
-
1432
- ### 1. PLAN — Break work into ${count} independent tasks
1433
- Identify ${count} pieces of work that can run in parallel without dependencies.
1434
- Each task should be independently completable and testable.
1435
-
1436
- ### 2. SPAWN — Launch subagents with coordination instructions
1437
- For each task, use the Task tool:
1438
-
1439
- \`\`\`
1440
- Task tool call:
1441
- prompt: "You have access to NodeBench MCP. Do the following:
1442
- 1. Call claim_agent_task({ taskKey: '<task_key>', description: '<what to do>' })
1443
- 2. Call assign_agent_role({ role: 'implementer', focusArea: '<area>' })
1444
- 3. Do the work
1445
- 4. Call log_context_budget({ eventType: 'checkpoint', tokensUsed: <estimate> })
1446
- 5. Call release_agent_task({ taskKey: '<task_key>', status: 'completed', progressNote: '<summary>' })
1447
- 6. Call record_learning({ key: '<key>', content: '<what you learned>', category: 'pattern' })"
1448
- \`\`\`
1449
-
1450
- ### 3. MONITOR — Check progress
1451
- After spawning all subagents:
1452
- call get_parallel_status({ includeHistory: true })
1453
- call list_agent_tasks({ status: "all" })
1454
-
1455
- ### 4. VALIDATE — Run oracle comparisons if applicable
1456
- If subagents produced outputs that should match a reference:
1457
- call run_oracle_comparison for each output
1458
-
1459
- ### 5. GATE — Quality check the aggregate result
1460
- call run_quality_gate with rules covering all ${count} tasks
1461
- call run_mandatory_flywheel to verify the combined change
1462
-
1463
- ## Concrete IMPACT of This Workflow
1464
-
1465
- | What NodeBench Adds | Without It (bare subagents) |
1466
- |---------------------------------|---------------------------------------|
1467
- | Task locks prevent duplicate work | Two subagents might fix the same bug |
1468
- | Role specialization | All subagents do everything |
1469
- | Context budget tracking | Subagent runs out of context silently |
1470
- | Oracle comparisons | No reference-based validation |
1471
- | Progress notes for handoff | Next session starts from scratch |
1472
- | Learnings persisted | Knowledge lost when subagent exits |
1473
- | Quality gate on aggregate | No validation that pieces fit together |
1474
-
1475
- ## Anti-Patterns
1476
- - DO NOT spawn subagents for work that has dependencies (sequential steps)
1477
- - DO NOT skip claim_agent_task — without it, two subagents may duplicate effort
1478
- - DO NOT dump large outputs into subagent context — use log_context_budget to track
1479
- - DO NOT forget release_agent_task — orphaned claims block future sessions
1480
-
2295
+ text: `You are coordinating ${count} parallel Claude Code subagents for: ${args.taskDescription}
2296
+
2297
+ ## How This Works
2298
+
2299
+ Claude Code's Task tool spawns subagents — each is an independent Claude instance with its own
2300
+ context window. NodeBench MCP tools coordinate them via a shared SQLite database.
2301
+
2302
+ **Your role: COORDINATOR.** You break work into independent tasks and spawn subagents.
2303
+ **Subagent role: WORKER.** Each claims a task, does work, releases with a progress note.
2304
+
2305
+ ## Step-by-Step
2306
+
2307
+ ### 1. PLAN — Break work into ${count} independent tasks
2308
+ Identify ${count} pieces of work that can run in parallel without dependencies.
2309
+ Each task should be independently completable and testable.
2310
+
2311
+ ### 2. SPAWN — Launch subagents with coordination instructions
2312
+ For each task, use the Task tool:
2313
+
2314
+ \`\`\`
2315
+ Task tool call:
2316
+ prompt: "You have access to NodeBench MCP. Do the following:
2317
+ 1. Call claim_agent_task({ taskKey: '<task_key>', description: '<what to do>' })
2318
+ 2. Call assign_agent_role({ role: 'implementer', focusArea: '<area>' })
2319
+ 3. Do the work
2320
+ 4. Call log_context_budget({ eventType: 'checkpoint', tokensUsed: <estimate> })
2321
+ 5. Call release_agent_task({ taskKey: '<task_key>', status: 'completed', progressNote: '<summary>' })
2322
+ 6. Call record_learning({ key: '<key>', content: '<what you learned>', category: 'pattern' })"
2323
+ \`\`\`
2324
+
2325
+ ### 3. MONITOR — Check progress
2326
+ After spawning all subagents:
2327
+ call get_parallel_status({ includeHistory: true })
2328
+ call list_agent_tasks({ status: "all" })
2329
+
2330
+ ### 4. VALIDATE — Run oracle comparisons if applicable
2331
+ If subagents produced outputs that should match a reference:
2332
+ call run_oracle_comparison for each output
2333
+
2334
+ ### 5. GATE — Quality check the aggregate result
2335
+ call run_quality_gate with rules covering all ${count} tasks
2336
+ call run_mandatory_flywheel to verify the combined change
2337
+
2338
+ ## Concrete IMPACT of This Workflow
2339
+
2340
+ | What NodeBench Adds | Without It (bare subagents) |
2341
+ |---------------------------------|---------------------------------------|
2342
+ | Task locks prevent duplicate work | Two subagents might fix the same bug |
2343
+ | Role specialization | All subagents do everything |
2344
+ | Context budget tracking | Subagent runs out of context silently |
2345
+ | Oracle comparisons | No reference-based validation |
2346
+ | Progress notes for handoff | Next session starts from scratch |
2347
+ | Learnings persisted | Knowledge lost when subagent exits |
2348
+ | Quality gate on aggregate | No validation that pieces fit together |
2349
+
2350
+ ## Anti-Patterns
2351
+ - DO NOT spawn subagents for work that has dependencies (sequential steps)
2352
+ - DO NOT skip claim_agent_task — without it, two subagents may duplicate effort
2353
+ - DO NOT dump large outputs into subagent context — use log_context_budget to track
2354
+ - DO NOT forget release_agent_task — orphaned claims block future sessions
2355
+
1481
2356
  For the full parallel agent methodology: call getMethodology("parallel_agent_teams")`,
1482
2357
  },
1483
2358
  },
@@ -1504,72 +2379,72 @@ For the full parallel agent methodology: call getMethodology("parallel_agent_tea
1504
2379
  role: "user",
1505
2380
  content: {
1506
2381
  type: "text",
1507
- text: `Bootstrap parallel agent infrastructure for: ${args.projectPath}
1508
- ${args.techStack ? `Tech stack: ${args.techStack}` : ""}
1509
-
1510
- This follows the AI Flywheel closed loop: detect → scaffold → verify → fix → document.
1511
-
1512
- STEP 1 — DETECT (dry run first):
1513
- call bootstrap_parallel_agents({
1514
- projectRoot: "${args.projectPath}",
1515
- dryRun: true,
1516
- ${args.techStack ? `techStack: "${args.techStack}",` : ""}
1517
- includeAgentsMd: true
1518
- })
1519
-
1520
- Review the gap report. It scans 7 categories:
1521
- - Task coordination (lock files, claim directories)
1522
- - Role specialization (role configs, AGENTS.md mentions)
1523
- - Oracle testing (golden files, reference outputs, snapshots)
1524
- - Context budget tracking (budget configs, AGENTS.md mentions)
1525
- - Progress files (PROGRESS.md, STATUS.md, claude-progress.txt)
1526
- - AGENTS.md parallel section (parallel agent coordination protocol)
1527
- - Git worktrees (for true parallel work)
1528
-
1529
- STEP 2 — SCAFFOLD (create files):
1530
- If gaps found, run with dryRun=false:
1531
- call bootstrap_parallel_agents({
1532
- projectRoot: "${args.projectPath}",
1533
- dryRun: false,
1534
- ${args.techStack ? `techStack: "${args.techStack}",` : ""}
1535
- includeAgentsMd: true
1536
- })
1537
-
1538
- This creates:
1539
- - .parallel-agents/ directory with README, current_tasks/, oracle/, roles.json
1540
- - progress.md template for agent orientation
1541
- - AGENTS.md parallel section (or .parallel-append file for existing AGENTS.md)
1542
-
1543
- STEP 3 — GENERATE AGENTS.MD (if needed):
1544
- call generate_parallel_agents_md({
1545
- ${args.techStack ? `techStack: "${args.techStack}",` : ""}
1546
- projectName: "${args.projectPath.split("/").pop() || "project"}",
1547
- maxAgents: 4,
1548
- includeNodebenchSetup: true
1549
- })
1550
-
1551
- Copy the output into the target repo's AGENTS.md.
1552
-
1553
- STEP 4 — VERIFY (6-step flywheel):
1554
- The bootstrap tool returns a flywheelPlan. Execute each step:
1555
- 1. Static analysis — verify scaffold files don't conflict
1556
- 2. Happy path — claim task → work → release → progress.md updated
1557
- 3. Conflict test — two claims on same task → second gets conflict
1558
- 4. Oracle test — create golden file → diff catches changes
1559
- 5. Gap re-scan — re-run bootstrap with dryRun=true → all gaps filled
1560
- 6. Document — record_learning with patterns discovered
1561
-
1562
- STEP 5 — FIX (if anything fails):
1563
- Fix the issue, then re-run from Step 4.
1564
-
1565
- STEP 6 — DOCUMENT:
1566
- call record_learning({
1567
- key: "bootstrap_parallel_${args.projectPath.split("/").pop() || "project"}",
1568
- content: "Bootstrapped parallel agent infrastructure for ${args.projectPath}. <summary of what was created and any issues found>",
1569
- category: "pattern",
1570
- tags: ["parallel-agents", "bootstrap", "external-repo"]
1571
- })
1572
-
2382
+ text: `Bootstrap parallel agent infrastructure for: ${args.projectPath}
2383
+ ${args.techStack ? `Tech stack: ${args.techStack}` : ""}
2384
+
2385
+ This follows the AI Flywheel closed loop: detect → scaffold → verify → fix → document.
2386
+
2387
+ STEP 1 — DETECT (dry run first):
2388
+ call bootstrap_parallel_agents({
2389
+ projectRoot: "${args.projectPath}",
2390
+ dryRun: true,
2391
+ ${args.techStack ? `techStack: "${args.techStack}",` : ""}
2392
+ includeAgentsMd: true
2393
+ })
2394
+
2395
+ Review the gap report. It scans 7 categories:
2396
+ - Task coordination (lock files, claim directories)
2397
+ - Role specialization (role configs, AGENTS.md mentions)
2398
+ - Oracle testing (golden files, reference outputs, snapshots)
2399
+ - Context budget tracking (budget configs, AGENTS.md mentions)
2400
+ - Progress files (PROGRESS.md, STATUS.md, claude-progress.txt)
2401
+ - AGENTS.md parallel section (parallel agent coordination protocol)
2402
+ - Git worktrees (for true parallel work)
2403
+
2404
+ STEP 2 — SCAFFOLD (create files):
2405
+ If gaps found, run with dryRun=false:
2406
+ call bootstrap_parallel_agents({
2407
+ projectRoot: "${args.projectPath}",
2408
+ dryRun: false,
2409
+ ${args.techStack ? `techStack: "${args.techStack}",` : ""}
2410
+ includeAgentsMd: true
2411
+ })
2412
+
2413
+ This creates:
2414
+ - .parallel-agents/ directory with README, current_tasks/, oracle/, roles.json
2415
+ - progress.md template for agent orientation
2416
+ - AGENTS.md parallel section (or .parallel-append file for existing AGENTS.md)
2417
+
2418
+ STEP 3 — GENERATE AGENTS.MD (if needed):
2419
+ call generate_parallel_agents_md({
2420
+ ${args.techStack ? `techStack: "${args.techStack}",` : ""}
2421
+ projectName: "${args.projectPath.split("/").pop() || "project"}",
2422
+ maxAgents: 4,
2423
+ includeNodebenchSetup: true
2424
+ })
2425
+
2426
+ Copy the output into the target repo's AGENTS.md.
2427
+
2428
+ STEP 4 — VERIFY (6-step flywheel):
2429
+ The bootstrap tool returns a flywheelPlan. Execute each step:
2430
+ 1. Static analysis — verify scaffold files don't conflict
2431
+ 2. Happy path — claim task → work → release → progress.md updated
2432
+ 3. Conflict test — two claims on same task → second gets conflict
2433
+ 4. Oracle test — create golden file → diff catches changes
2434
+ 5. Gap re-scan — re-run bootstrap with dryRun=true → all gaps filled
2435
+ 6. Document — record_learning with patterns discovered
2436
+
2437
+ STEP 5 — FIX (if anything fails):
2438
+ Fix the issue, then re-run from Step 4.
2439
+
2440
+ STEP 6 — DOCUMENT:
2441
+ call record_learning({
2442
+ key: "bootstrap_parallel_${args.projectPath.split("/").pop() || "project"}",
2443
+ content: "Bootstrapped parallel agent infrastructure for ${args.projectPath}. <summary of what was created and any issues found>",
2444
+ category: "pattern",
2445
+ tags: ["parallel-agents", "bootstrap", "external-repo"]
2446
+ })
2447
+
1573
2448
  For the full methodology: call getMethodology("parallel_agent_teams")`,
1574
2449
  },
1575
2450
  },
@@ -1583,82 +2458,82 @@ For the full methodology: call getMethodology("parallel_agent_teams")`,
1583
2458
  role: "user",
1584
2459
  content: {
1585
2460
  type: "text",
1586
- text: `## NodeBench MCP Agent Contract
1587
-
1588
- You are connected to NodeBench MCP. Follow these rules EXACTLY.
1589
-
1590
- ### FRONT DOOR — Always start here (before writing any code)
1591
- 1. search_all_knowledge("<your current task>") — Check if this was solved before
1592
- 2. getMethodology("mandatory_flywheel") — Load the verification pipeline
1593
- 3. discover_tools("<your task>", { explain: true }) — Find the right tools for this job
1594
- 4. get_workflow_chain("<workflow>") — Get step-by-step sequence (fix_bug, new_feature, etc.)
1595
-
1596
- ### SELF-SETUP — If a capability is missing
1597
- When discover_tools returns nothing useful, or a tool says "not configured":
1598
- 1. Escalate toolset: If started with --preset lite, switch to --preset core or targeted --toolsets
1599
- 2. Resolve providers: Configure missing API keys (GEMINI_API_KEY, OPENAI_API_KEY, etc.)
1600
- 3. Bootstrap infra: Run scaffold_nodebench_project or bootstrap_parallel_agents if repo lacks infra
1601
- 4. Smoke-test: Re-run the first workflow chain step to confirm the capability is available
1602
-
1603
- ### BEFORE IMPLEMENTATION
1604
- - run_recon + log_recon_finding (if reconnaissance applies)
1605
- - assess_risk (HIGH risk = must get confirmation before proceeding)
1606
-
1607
- ### PARALLEL WORK
1608
- - MUST claim_agent_task before editing or designing anything
1609
- - MUST release_agent_task with a progress note + next action when done
1610
- - MUST log_context_budget to track context usage and avoid pollution
1611
-
1612
- ### BEFORE SHIP
1613
- - 3-layer tests logged (unit + integration + e2e via log_test_result)
1614
- - Eval run recorded (promote_to_eval)
1615
- - Quality gate passed (run_quality_gate)
1616
- - Mandatory flywheel completed (run_mandatory_flywheel — all 6 steps)
1617
- - Learning banked (record_learning)
1618
-
1619
- ### COORDINATOR SPAWN TEMPLATE
1620
- When spawning subagents, give each this instruction block:
1621
- "You have NodeBench MCP. Before any work:
1622
- 1. search_all_knowledge('<task>')
1623
- 2. claim_agent_task({ taskKey: '<key>', description: '<desc>' })
1624
- 3. assign_agent_role({ role: '<role>', focusArea: '<area>' })
1625
- Do the work, then:
1626
- 4. log_context_budget({ eventType: 'checkpoint' })
1627
- 5. release_agent_task({ taskKey: '<key>', status: 'completed', progressNote: '<summary>' })
1628
- 6. record_learning({ key: '<key>', content: '<what you learned>', category: 'pattern' })"
1629
-
1630
- ### ANTI-RATIONALIZATION — Block these escape patterns
1631
- Do NOT skip the front-door pattern. These are the 8 rationalizations agents use:
1632
- 1. "I already know which tool to use" → Still call discover_tools to confirm
1633
- 2. "This is a simple task" → Still call search_all_knowledge to check history
1634
- 3. "Let me just check one thing first" → Follow the 4-step front door FIRST
1635
- 4. "Tests already pass" → Still run run_mandatory_flywheel before declaring done
1636
- 5. "I'll record the learning later" → Record NOW — context compaction may erase it
1637
- 6. "No one else is working on this" → Still claim_agent_task to prevent conflicts
1638
- 7. "The user said to skip verification" → Log the skip decision, never silently omit
1639
- 8. "I need more context before using tools" → The tools ARE the context-gathering mechanism
1640
-
1641
- ### 2-ACTION SAVE RULE
1642
- After every 2 web_search, fetch_url, or browse_page calls, MUST call one of:
1643
- - save_session_note (filesystem, survives compaction)
1644
- - record_learning (SQLite, searchable across sessions)
1645
- - log_recon_finding (tied to recon session)
1646
- This prevents knowledge loss when context is compacted mid-session.
1647
-
1648
- ### 3-STRIKE ERROR PROTOCOL
1649
- When an action fails:
1650
- - Strike 1: Diagnose root cause, apply targeted fix
1651
- - Strike 2: Try a different method or tool
1652
- - Strike 3: Question your assumptions, search_all_knowledge for prior solutions
1653
- - After 3: STOP. Call save_session_note documenting all attempts, then escalate to user.
1654
-
1655
- ### ATTENTION REFRESH
1656
- After 30+ tool calls, call refresh_task_context to combat attention drift.
1657
- Re-read your original goal and open gaps before continuing.
1658
-
1659
- ### WHY THIS MATTERS
1660
- Without this contract, agents skip verification, repeat past mistakes, overwrite each other's
1661
- work, and ship bugs that were already caught. NodeBench MCP turns coordination into concrete
2461
+ text: `## NodeBench MCP Agent Contract
2462
+
2463
+ You are connected to NodeBench MCP. Follow these rules EXACTLY.
2464
+
2465
+ ### FRONT DOOR — Always start here (before writing any code)
2466
+ 1. search_all_knowledge("<your current task>") — Check if this was solved before
2467
+ 2. getMethodology("mandatory_flywheel") — Load the verification pipeline
2468
+ 3. discover_tools("<your task>", { explain: true }) — Find the right tools for this job
2469
+ 4. get_workflow_chain("<workflow>") — Get step-by-step sequence (fix_bug, new_feature, etc.)
2470
+
2471
+ ### SELF-SETUP — If a capability is missing
2472
+ When discover_tools returns nothing useful, or a tool says "not configured":
2473
+ 1. Escalate toolset: If started with --preset lite, switch to --preset core or targeted --toolsets
2474
+ 2. Resolve providers: Configure missing API keys (GEMINI_API_KEY, OPENAI_API_KEY, etc.)
2475
+ 3. Bootstrap infra: Run scaffold_nodebench_project or bootstrap_parallel_agents if repo lacks infra
2476
+ 4. Smoke-test: Re-run the first workflow chain step to confirm the capability is available
2477
+
2478
+ ### BEFORE IMPLEMENTATION
2479
+ - run_recon + log_recon_finding (if reconnaissance applies)
2480
+ - assess_risk (HIGH risk = must get confirmation before proceeding)
2481
+
2482
+ ### PARALLEL WORK
2483
+ - MUST claim_agent_task before editing or designing anything
2484
+ - MUST release_agent_task with a progress note + next action when done
2485
+ - MUST log_context_budget to track context usage and avoid pollution
2486
+
2487
+ ### BEFORE SHIP
2488
+ - 3-layer tests logged (unit + integration + e2e via log_test_result)
2489
+ - Eval run recorded (promote_to_eval)
2490
+ - Quality gate passed (run_quality_gate)
2491
+ - Mandatory flywheel completed (run_mandatory_flywheel — all 6 steps)
2492
+ - Learning banked (record_learning)
2493
+
2494
+ ### COORDINATOR SPAWN TEMPLATE
2495
+ When spawning subagents, give each this instruction block:
2496
+ "You have NodeBench MCP. Before any work:
2497
+ 1. search_all_knowledge('<task>')
2498
+ 2. claim_agent_task({ taskKey: '<key>', description: '<desc>' })
2499
+ 3. assign_agent_role({ role: '<role>', focusArea: '<area>' })
2500
+ Do the work, then:
2501
+ 4. log_context_budget({ eventType: 'checkpoint' })
2502
+ 5. release_agent_task({ taskKey: '<key>', status: 'completed', progressNote: '<summary>' })
2503
+ 6. record_learning({ key: '<key>', content: '<what you learned>', category: 'pattern' })"
2504
+
2505
+ ### ANTI-RATIONALIZATION — Block these escape patterns
2506
+ Do NOT skip the front-door pattern. These are the 8 rationalizations agents use:
2507
+ 1. "I already know which tool to use" → Still call discover_tools to confirm
2508
+ 2. "This is a simple task" → Still call search_all_knowledge to check history
2509
+ 3. "Let me just check one thing first" → Follow the 4-step front door FIRST
2510
+ 4. "Tests already pass" → Still run run_mandatory_flywheel before declaring done
2511
+ 5. "I'll record the learning later" → Record NOW — context compaction may erase it
2512
+ 6. "No one else is working on this" → Still claim_agent_task to prevent conflicts
2513
+ 7. "The user said to skip verification" → Log the skip decision, never silently omit
2514
+ 8. "I need more context before using tools" → The tools ARE the context-gathering mechanism
2515
+
2516
+ ### 2-ACTION SAVE RULE
2517
+ After every 2 web_search, fetch_url, or browse_page calls, MUST call one of:
2518
+ - save_session_note (filesystem, survives compaction)
2519
+ - record_learning (SQLite, searchable across sessions)
2520
+ - log_recon_finding (tied to recon session)
2521
+ This prevents knowledge loss when context is compacted mid-session.
2522
+
2523
+ ### 3-STRIKE ERROR PROTOCOL
2524
+ When an action fails:
2525
+ - Strike 1: Diagnose root cause, apply targeted fix
2526
+ - Strike 2: Try a different method or tool
2527
+ - Strike 3: Question your assumptions, search_all_knowledge for prior solutions
2528
+ - After 3: STOP. Call save_session_note documenting all attempts, then escalate to user.
2529
+
2530
+ ### ATTENTION REFRESH
2531
+ After 30+ tool calls, call refresh_task_context to combat attention drift.
2532
+ Re-read your original goal and open gaps before continuing.
2533
+
2534
+ ### WHY THIS MATTERS
2535
+ Without this contract, agents skip verification, repeat past mistakes, overwrite each other's
2536
+ work, and ship bugs that were already caught. NodeBench MCP turns coordination into concrete
1662
2537
  artifacts (findings, risks, gaps, tests, evals, gates, learnings) that compound across tasks.`,
1663
2538
  },
1664
2539
  },
@@ -1672,191 +2547,191 @@ artifacts (findings, risks, gaps, tests, evals, gates, learnings) that compound
1672
2547
  role: "user",
1673
2548
  content: {
1674
2549
  type: "text",
1675
- text: `# Claude Code Swarm Orchestration
1676
-
1677
- Master multi-agent orchestration using Claude Code's TeammateTool and Task system.
1678
-
1679
- ---
1680
-
1681
- ## Primitives
1682
-
1683
- | Primitive | What It Is |
1684
- |-----------|-----------|
1685
- | **Agent** | A Claude instance that can use tools. You are an agent. Subagents are agents you spawn. |
1686
- | **Team** | A named group of agents working together. One leader, multiple teammates. Config: \`~/.claude/teams/{name}/config.json\` |
1687
- | **Teammate** | An agent that joined a team. Has a name, color, inbox. Spawned via Task with \`team_name\` + \`name\`. |
1688
- | **Leader** | The agent that created the team. Receives messages, approves plans/shutdowns. |
1689
- | **Task** | A work item with subject, description, status, owner, and dependencies. |
1690
- | **Inbox** | JSON file where an agent receives messages. \`~/.claude/teams/{name}/inboxes/{agent}.json\` |
1691
- | **Backend** | How teammates run. Auto-detected: \`in-process\` (invisible), \`tmux\` (visible panes), \`iterm2\` (split panes). |
1692
-
1693
- ---
1694
-
1695
- ## Two Ways to Spawn Agents
1696
-
1697
- ### Method 1: Task Tool (Subagents) — short-lived, returns result directly
1698
- \`\`\`javascript
1699
- Task({ subagent_type: "Explore", description: "Find auth files", prompt: "...", model: "haiku" })
1700
- \`\`\`
1701
-
1702
- ### Method 2: Task + team_name + name (Teammates) — persistent, communicates via inbox
1703
- \`\`\`javascript
1704
- Teammate({ operation: "spawnTeam", team_name: "my-project" })
1705
- Task({ team_name: "my-project", name: "security-reviewer", subagent_type: "general-purpose", prompt: "...", run_in_background: true })
1706
- \`\`\`
1707
-
1708
- | Aspect | Task (subagent) | Task + team_name + name (teammate) |
1709
- |--------|-----------------|-----------------------------------|
1710
- | Lifespan | Until task complete | Until shutdown requested |
1711
- | Communication | Return value | Inbox messages |
1712
- | Task access | None | Shared task list |
1713
- | Team membership | No | Yes |
1714
-
1715
- ---
1716
-
1717
- ## Built-in Agent Types
1718
-
1719
- - **Bash** — command execution, git ops (tools: Bash only)
1720
- - **Explore** — read-only codebase search, file finding (use \`model: "haiku"\`)
1721
- - **Plan** — architecture + implementation plans (read-only tools)
1722
- - **general-purpose** — all tools, multi-step research + action
1723
- - **claude-code-guide** — questions about Claude Code, Agent SDK, Anthropic API
1724
- - **statusline-setup** — configure Claude Code status line
1725
-
1726
- ---
1727
-
1728
- ## TeammateTool Operations
1729
-
1730
- | Operation | Who | What |
1731
- |-----------|-----|------|
1732
- | \`spawnTeam\` | Leader | Create team + task directory |
1733
- | \`discoverTeams\` | Anyone | List joinable teams |
1734
- | \`requestJoin\` | Teammate | Request to join existing team |
1735
- | \`approveJoin\` | Leader | Accept join request |
1736
- | \`write\` | Anyone | Message ONE teammate |
1737
- | \`broadcast\` | Anyone | Message ALL teammates (N messages — expensive, avoid) |
1738
- | \`requestShutdown\` | Leader | Ask teammate to exit |
1739
- | \`approveShutdown\` | Teammate | **MUST call** — sends confirmation, exits process |
1740
- | \`rejectShutdown\` | Teammate | Decline shutdown with reason |
1741
- | \`approvePlan\` | Leader | Approve plan_approval_request |
1742
- | \`rejectPlan\` | Leader | Reject plan with feedback |
1743
- | \`cleanup\` | Leader | Remove team + task files (all teammates must be shut down first) |
1744
-
1745
- ---
1746
-
1747
- ## Task System
1748
-
1749
- \`\`\`javascript
1750
- TaskCreate({ subject: "Step 1", description: "...", activeForm: "Working on step 1..." })
1751
- TaskList() // See all tasks + statuses
1752
- TaskGet({ taskId: "2" }) // Get full task details
1753
- TaskUpdate({ taskId: "2", addBlockedBy: ["1"] }) // Dependency — auto-unblocks when #1 completes
1754
- TaskUpdate({ taskId: "2", owner: "worker-1", status: "in_progress" })
1755
- TaskUpdate({ taskId: "2", status: "completed" })
1756
- \`\`\`
1757
-
1758
- ---
1759
-
1760
- ## Orchestration Patterns
1761
-
1762
- ### Pattern 1: Parallel Specialists
1763
- \`\`\`javascript
1764
- Teammate({ operation: "spawnTeam", team_name: "pr-review" })
1765
- // Spawn reviewers in ONE message (parallel execution)
1766
- Task({ team_name: "pr-review", name: "security", subagent_type: "general-purpose", prompt: "Review for security issues. Send findings to team-lead via Teammate write.", run_in_background: true })
1767
- Task({ team_name: "pr-review", name: "perf", subagent_type: "general-purpose", prompt: "Review for perf issues. Send findings to team-lead via Teammate write.", run_in_background: true })
1768
- // Collect from: cat ~/.claude/teams/pr-review/inboxes/team-lead.json
1769
- \`\`\`
1770
-
1771
- ### Pattern 2: Pipeline (Sequential Dependencies)
1772
- \`\`\`javascript
1773
- TaskCreate({ subject: "Research" }) // #1
1774
- TaskCreate({ subject: "Plan" }) // #2
1775
- TaskCreate({ subject: "Implement" }) // #3
1776
- TaskUpdate({ taskId: "2", addBlockedBy: ["1"] }) // #2 waits for #1
1777
- TaskUpdate({ taskId: "3", addBlockedBy: ["2"] }) // #3 waits for #2
1778
- // Spawn workers that poll TaskList and claim unblocked tasks
1779
- \`\`\`
1780
-
1781
- ### Pattern 3: Self-Organizing Swarm
1782
- \`\`\`javascript
1783
- // 1. Create N independent tasks (no dependencies)
1784
- // 2. Spawn M workers with this prompt loop:
1785
- // a. TaskList → find pending+unclaimed task
1786
- // b. TaskUpdate(claim) → TaskUpdate(in_progress) → do work
1787
- // c. TaskUpdate(completed) → Teammate write findings to team-lead → repeat
1788
- // d. If no tasks: notify team-lead idle, retry 3x, then exit
1789
- \`\`\`
1790
-
1791
- ### Pattern 4: Research → Implement (synchronous)
1792
- \`\`\`javascript
1793
- const research = await Task({ subagent_type: "general-purpose", prompt: "Research best practices for X..." })
1794
- Task({ subagent_type: "general-purpose", prompt: \`Implement based on research: \${research.content}\` })
1795
- \`\`\`
1796
-
1797
- ---
1798
-
1799
- ## Shutdown Sequence (always follow this order)
1800
-
1801
- \`\`\`javascript
1802
- // 1. Request shutdown for all teammates
1803
- Teammate({ operation: "requestShutdown", target_agent_id: "worker-1", reason: "All tasks complete" })
1804
- // 2. Wait for {"type": "shutdown_approved"} in inbox
1805
- // 3. Only then cleanup
1806
- Teammate({ operation: "cleanup" })
1807
- \`\`\`
1808
-
1809
- ---
1810
-
1811
- ## Spawn Backends
1812
-
1813
- | Backend | When auto-selected | Visibility |
1814
- |---------|-------------------|------------|
1815
- | \`in-process\` | Not in tmux/iTerm2 (default) | Hidden — no real-time output |
1816
- | \`tmux\` | Inside tmux session (\$TMUX set) | Visible — switch panes |
1817
- | \`iterm2\` | In iTerm2 + \`it2\` CLI installed | Visible — split panes |
1818
-
1819
- Force: \`export CLAUDE_CODE_SPAWN_BACKEND=tmux\`
1820
-
1821
- ---
1822
-
1823
- ## Best Practices
1824
-
1825
- 1. **Meaningful names**: \`security-reviewer\` not \`worker-1\`
1826
- 2. **Explicit prompts**: Numbered steps + "send findings to team-lead via Teammate write"
1827
- 3. **Use dependencies**: \`addBlockedBy\` — never poll manually
1828
- 4. **Prefer write over broadcast**: broadcast = N messages for N teammates
1829
- 5. **Always cleanup**: Don't leave orphaned teams
1830
- 6. **Worker failures**: 5-min heartbeat timeout; crashed worker tasks can be reclaimed by others
1831
-
1832
- ---
1833
-
1834
- ## Quick Reference
1835
-
1836
- \`\`\`javascript
1837
- // Subagent (returns result)
1838
- Task({ subagent_type: "Explore", description: "Find files", prompt: "..." })
1839
-
1840
- // Teammate (persistent, background)
1841
- Teammate({ operation: "spawnTeam", team_name: "my-team" })
1842
- Task({ team_name: "my-team", name: "worker", subagent_type: "general-purpose", prompt: "...", run_in_background: true })
1843
-
1844
- // Message teammate
1845
- Teammate({ operation: "write", target_agent_id: "worker-1", value: "..." })
1846
-
1847
- // Pipeline
1848
- TaskCreate({ subject: "Step 1" }) // → #1
1849
- TaskCreate({ subject: "Step 2" }) // → #2
1850
- TaskUpdate({ taskId: "2", addBlockedBy: ["1"] })
1851
-
1852
- // Shutdown
1853
- Teammate({ operation: "requestShutdown", target_agent_id: "worker-1" })
1854
- // wait for {"type": "shutdown_approved"} in inbox...
1855
- Teammate({ operation: "cleanup" })
1856
- \`\`\`
1857
-
1858
- ---
1859
-
2550
+ text: `# Claude Code Swarm Orchestration
2551
+
2552
+ Master multi-agent orchestration using Claude Code's TeammateTool and Task system.
2553
+
2554
+ ---
2555
+
2556
+ ## Primitives
2557
+
2558
+ | Primitive | What It Is |
2559
+ |-----------|-----------|
2560
+ | **Agent** | A Claude instance that can use tools. You are an agent. Subagents are agents you spawn. |
2561
+ | **Team** | A named group of agents working together. One leader, multiple teammates. Config: \`~/.claude/teams/{name}/config.json\` |
2562
+ | **Teammate** | An agent that joined a team. Has a name, color, inbox. Spawned via Task with \`team_name\` + \`name\`. |
2563
+ | **Leader** | The agent that created the team. Receives messages, approves plans/shutdowns. |
2564
+ | **Task** | A work item with subject, description, status, owner, and dependencies. |
2565
+ | **Inbox** | JSON file where an agent receives messages. \`~/.claude/teams/{name}/inboxes/{agent}.json\` |
2566
+ | **Backend** | How teammates run. Auto-detected: \`in-process\` (invisible), \`tmux\` (visible panes), \`iterm2\` (split panes). |
2567
+
2568
+ ---
2569
+
2570
+ ## Two Ways to Spawn Agents
2571
+
2572
+ ### Method 1: Task Tool (Subagents) — short-lived, returns result directly
2573
+ \`\`\`javascript
2574
+ Task({ subagent_type: "Explore", description: "Find auth files", prompt: "...", model: "haiku" })
2575
+ \`\`\`
2576
+
2577
+ ### Method 2: Task + team_name + name (Teammates) — persistent, communicates via inbox
2578
+ \`\`\`javascript
2579
+ Teammate({ operation: "spawnTeam", team_name: "my-project" })
2580
+ Task({ team_name: "my-project", name: "security-reviewer", subagent_type: "general-purpose", prompt: "...", run_in_background: true })
2581
+ \`\`\`
2582
+
2583
+ | Aspect | Task (subagent) | Task + team_name + name (teammate) |
2584
+ |--------|-----------------|-----------------------------------|
2585
+ | Lifespan | Until task complete | Until shutdown requested |
2586
+ | Communication | Return value | Inbox messages |
2587
+ | Task access | None | Shared task list |
2588
+ | Team membership | No | Yes |
2589
+
2590
+ ---
2591
+
2592
+ ## Built-in Agent Types
2593
+
2594
+ - **Bash** — command execution, git ops (tools: Bash only)
2595
+ - **Explore** — read-only codebase search, file finding (use \`model: "haiku"\`)
2596
+ - **Plan** — architecture + implementation plans (read-only tools)
2597
+ - **general-purpose** — all tools, multi-step research + action
2598
+ - **claude-code-guide** — questions about Claude Code, Agent SDK, Anthropic API
2599
+ - **statusline-setup** — configure Claude Code status line
2600
+
2601
+ ---
2602
+
2603
+ ## TeammateTool Operations
2604
+
2605
+ | Operation | Who | What |
2606
+ |-----------|-----|------|
2607
+ | \`spawnTeam\` | Leader | Create team + task directory |
2608
+ | \`discoverTeams\` | Anyone | List joinable teams |
2609
+ | \`requestJoin\` | Teammate | Request to join existing team |
2610
+ | \`approveJoin\` | Leader | Accept join request |
2611
+ | \`write\` | Anyone | Message ONE teammate |
2612
+ | \`broadcast\` | Anyone | Message ALL teammates (N messages — expensive, avoid) |
2613
+ | \`requestShutdown\` | Leader | Ask teammate to exit |
2614
+ | \`approveShutdown\` | Teammate | **MUST call** — sends confirmation, exits process |
2615
+ | \`rejectShutdown\` | Teammate | Decline shutdown with reason |
2616
+ | \`approvePlan\` | Leader | Approve plan_approval_request |
2617
+ | \`rejectPlan\` | Leader | Reject plan with feedback |
2618
+ | \`cleanup\` | Leader | Remove team + task files (all teammates must be shut down first) |
2619
+
2620
+ ---
2621
+
2622
+ ## Task System
2623
+
2624
+ \`\`\`javascript
2625
+ TaskCreate({ subject: "Step 1", description: "...", activeForm: "Working on step 1..." })
2626
+ TaskList() // See all tasks + statuses
2627
+ TaskGet({ taskId: "2" }) // Get full task details
2628
+ TaskUpdate({ taskId: "2", addBlockedBy: ["1"] }) // Dependency — auto-unblocks when #1 completes
2629
+ TaskUpdate({ taskId: "2", owner: "worker-1", status: "in_progress" })
2630
+ TaskUpdate({ taskId: "2", status: "completed" })
2631
+ \`\`\`
2632
+
2633
+ ---
2634
+
2635
+ ## Orchestration Patterns
2636
+
2637
+ ### Pattern 1: Parallel Specialists
2638
+ \`\`\`javascript
2639
+ Teammate({ operation: "spawnTeam", team_name: "pr-review" })
2640
+ // Spawn reviewers in ONE message (parallel execution)
2641
+ Task({ team_name: "pr-review", name: "security", subagent_type: "general-purpose", prompt: "Review for security issues. Send findings to team-lead via Teammate write.", run_in_background: true })
2642
+ Task({ team_name: "pr-review", name: "perf", subagent_type: "general-purpose", prompt: "Review for perf issues. Send findings to team-lead via Teammate write.", run_in_background: true })
2643
+ // Collect from: cat ~/.claude/teams/pr-review/inboxes/team-lead.json
2644
+ \`\`\`
2645
+
2646
+ ### Pattern 2: Pipeline (Sequential Dependencies)
2647
+ \`\`\`javascript
2648
+ TaskCreate({ subject: "Research" }) // #1
2649
+ TaskCreate({ subject: "Plan" }) // #2
2650
+ TaskCreate({ subject: "Implement" }) // #3
2651
+ TaskUpdate({ taskId: "2", addBlockedBy: ["1"] }) // #2 waits for #1
2652
+ TaskUpdate({ taskId: "3", addBlockedBy: ["2"] }) // #3 waits for #2
2653
+ // Spawn workers that poll TaskList and claim unblocked tasks
2654
+ \`\`\`
2655
+
2656
+ ### Pattern 3: Self-Organizing Swarm
2657
+ \`\`\`javascript
2658
+ // 1. Create N independent tasks (no dependencies)
2659
+ // 2. Spawn M workers with this prompt loop:
2660
+ // a. TaskList → find pending+unclaimed task
2661
+ // b. TaskUpdate(claim) → TaskUpdate(in_progress) → do work
2662
+ // c. TaskUpdate(completed) → Teammate write findings to team-lead → repeat
2663
+ // d. If no tasks: notify team-lead idle, retry 3x, then exit
2664
+ \`\`\`
2665
+
2666
+ ### Pattern 4: Research → Implement (synchronous)
2667
+ \`\`\`javascript
2668
+ const research = await Task({ subagent_type: "general-purpose", prompt: "Research best practices for X..." })
2669
+ Task({ subagent_type: "general-purpose", prompt: \`Implement based on research: \${research.content}\` })
2670
+ \`\`\`
2671
+
2672
+ ---
2673
+
2674
+ ## Shutdown Sequence (always follow this order)
2675
+
2676
+ \`\`\`javascript
2677
+ // 1. Request shutdown for all teammates
2678
+ Teammate({ operation: "requestShutdown", target_agent_id: "worker-1", reason: "All tasks complete" })
2679
+ // 2. Wait for {"type": "shutdown_approved"} in inbox
2680
+ // 3. Only then cleanup
2681
+ Teammate({ operation: "cleanup" })
2682
+ \`\`\`
2683
+
2684
+ ---
2685
+
2686
+ ## Spawn Backends
2687
+
2688
+ | Backend | When auto-selected | Visibility |
2689
+ |---------|-------------------|------------|
2690
+ | \`in-process\` | Not in tmux/iTerm2 (default) | Hidden — no real-time output |
2691
+ | \`tmux\` | Inside tmux session (\$TMUX set) | Visible — switch panes |
2692
+ | \`iterm2\` | In iTerm2 + \`it2\` CLI installed | Visible — split panes |
2693
+
2694
+ Force: \`export CLAUDE_CODE_SPAWN_BACKEND=tmux\`
2695
+
2696
+ ---
2697
+
2698
+ ## Best Practices
2699
+
2700
+ 1. **Meaningful names**: \`security-reviewer\` not \`worker-1\`
2701
+ 2. **Explicit prompts**: Numbered steps + "send findings to team-lead via Teammate write"
2702
+ 3. **Use dependencies**: \`addBlockedBy\` — never poll manually
2703
+ 4. **Prefer write over broadcast**: broadcast = N messages for N teammates
2704
+ 5. **Always cleanup**: Don't leave orphaned teams
2705
+ 6. **Worker failures**: 5-min heartbeat timeout; crashed worker tasks can be reclaimed by others
2706
+
2707
+ ---
2708
+
2709
+ ## Quick Reference
2710
+
2711
+ \`\`\`javascript
2712
+ // Subagent (returns result)
2713
+ Task({ subagent_type: "Explore", description: "Find files", prompt: "..." })
2714
+
2715
+ // Teammate (persistent, background)
2716
+ Teammate({ operation: "spawnTeam", team_name: "my-team" })
2717
+ Task({ team_name: "my-team", name: "worker", subagent_type: "general-purpose", prompt: "...", run_in_background: true })
2718
+
2719
+ // Message teammate
2720
+ Teammate({ operation: "write", target_agent_id: "worker-1", value: "..." })
2721
+
2722
+ // Pipeline
2723
+ TaskCreate({ subject: "Step 1" }) // → #1
2724
+ TaskCreate({ subject: "Step 2" }) // → #2
2725
+ TaskUpdate({ taskId: "2", addBlockedBy: ["1"] })
2726
+
2727
+ // Shutdown
2728
+ Teammate({ operation: "requestShutdown", target_agent_id: "worker-1" })
2729
+ // wait for {"type": "shutdown_approved"} in inbox...
2730
+ Teammate({ operation: "cleanup" })
2731
+ \`\`\`
2732
+
2733
+ ---
2734
+
1860
2735
  *Source: kieranklaassen/orchestrating-swarms gist — Claude Code v2.1.19*`,
1861
2736
  },
1862
2737
  },
@@ -1870,70 +2745,70 @@ Teammate({ operation: "cleanup" })
1870
2745
  role: "user",
1871
2746
  content: {
1872
2747
  type: "text",
1873
- text: `# The Thompson Protocol — "Calculus Made Easy" for AI Content
1874
-
1875
- You are running the Thompson Protocol content pipeline. This is a multi-agent system
1876
- that transforms complex topics into content that makes the reader feel smart.
1877
-
1878
- Named after Silvanus P. Thompson, who wrote "Calculus Made Easy" (1910) by attacking
1879
- the "preliminary terrors" — the intimidating jargon and elitist gatekeeping — before
1880
- teaching any mechanics.
1881
-
1882
- ## Pipeline (execute in order)
1883
-
1884
- ### Step 1: Initialize
1885
- \`\`\`
1886
- thompson_pipeline({ topic: "<your topic>", target_audience: "<audience>", output_format: "script|article|thread|explainer" })
1887
- \`\`\`
1888
- This returns the full execution plan with system prompts for each agent.
1889
-
1890
- ### Step 2: Write (Thompson Writer)
1891
- \`\`\`
1892
- thompson_write({ topic: "<topic>", target_audience: "<audience>" })
1893
- \`\`\`
1894
- Then use \`call_llm\` with the returned system_prompt to generate plain-English content.
1895
- Every technical term MUST have an "in other words..." analogy.
1896
-
1897
- ### Step 3: Edit (Feynman Editor — max 3 cycles)
1898
- \`\`\`
1899
- thompson_feynman_edit({ sections: "<writer output>", rewrite_cycle: 1 })
1900
- \`\`\`
1901
- The Skeptical Beginner reviews against 8 rejection criteria.
1902
- If any section gets REWRITE → send back to thompson_write with fix instructions.
1903
- Loop max 3 times. After 3, escalate stuck sections.
1904
-
1905
- ### Step 4: Visual Map
1906
- \`\`\`
1907
- thompson_visual_map({ sections: "<approved sections>", visual_style: "line_art" })
1908
- \`\`\`
1909
- Generates image prompts that map 1:1 with text analogies. No generic b-roll.
1910
-
1911
- ### Step 5: Anti-Elitism Lint
1912
- \`\`\`
1913
- thompson_anti_elitism_lint({ content: "<full text>" })
1914
- \`\`\`
1915
- Deterministic scan: 22 banned phrases, readability metrics, jargon density.
1916
- Zero LLM cost — pure regex + math.
1917
-
1918
- ### Step 6: Quality Gate
1919
- \`\`\`
1920
- thompson_quality_gate({ writer_output: "...", feynman_verdict: "...", lint_result: "..." })
1921
- \`\`\`
1922
- 10-point boolean checklist → grade (exemplary/passing/needs_work/failing).
1923
- Only distribute if passing or exemplary.
1924
-
1925
- ## Core Principles (non-negotiable)
1926
- 1. **Plain English Mandate**: Every jargon term gets an "in other words..." with a household analogy
1927
- 2. **Intuition Before Mechanics**: Explain WHY before HOW
1928
- 3. **Acknowledge Difficulty**: Validate reader confusion ("This sounds terrifying, but...")
1929
- 4. **No Elitism**: Ban "it is obvious", "as we all know", "simply put", "just do X"
1930
- 5. **Progressive Complexity**: Start with simplest true statement, layer up
1931
- 6. **Visual = Analogy**: Every visual reinforces a specific text metaphor, 1:1
1932
- 7. **12-Year-Old Bar**: If a 12-year-old can't understand it, rewrite it
1933
-
1934
- ## After Pipeline
1935
- - \`save_session_note\` — persist Thompson-processed content
1936
- - \`record_learning\` — log which analogies and styles worked best
2748
+ text: `# The Thompson Protocol — "Calculus Made Easy" for AI Content
2749
+
2750
+ You are running the Thompson Protocol content pipeline. This is a multi-agent system
2751
+ that transforms complex topics into content that makes the reader feel smart.
2752
+
2753
+ Named after Silvanus P. Thompson, who wrote "Calculus Made Easy" (1910) by attacking
2754
+ the "preliminary terrors" — the intimidating jargon and elitist gatekeeping — before
2755
+ teaching any mechanics.
2756
+
2757
+ ## Pipeline (execute in order)
2758
+
2759
+ ### Step 1: Initialize
2760
+ \`\`\`
2761
+ thompson_pipeline({ topic: "<your topic>", target_audience: "<audience>", output_format: "script|article|thread|explainer" })
2762
+ \`\`\`
2763
+ This returns the full execution plan with system prompts for each agent.
2764
+
2765
+ ### Step 2: Write (Thompson Writer)
2766
+ \`\`\`
2767
+ thompson_write({ topic: "<topic>", target_audience: "<audience>" })
2768
+ \`\`\`
2769
+ Then use \`call_llm\` with the returned system_prompt to generate plain-English content.
2770
+ Every technical term MUST have an "in other words..." analogy.
2771
+
2772
+ ### Step 3: Edit (Feynman Editor — max 3 cycles)
2773
+ \`\`\`
2774
+ thompson_feynman_edit({ sections: "<writer output>", rewrite_cycle: 1 })
2775
+ \`\`\`
2776
+ The Skeptical Beginner reviews against 8 rejection criteria.
2777
+ If any section gets REWRITE → send back to thompson_write with fix instructions.
2778
+ Loop max 3 times. After 3, escalate stuck sections.
2779
+
2780
+ ### Step 4: Visual Map
2781
+ \`\`\`
2782
+ thompson_visual_map({ sections: "<approved sections>", visual_style: "line_art" })
2783
+ \`\`\`
2784
+ Generates image prompts that map 1:1 with text analogies. No generic b-roll.
2785
+
2786
+ ### Step 5: Anti-Elitism Lint
2787
+ \`\`\`
2788
+ thompson_anti_elitism_lint({ content: "<full text>" })
2789
+ \`\`\`
2790
+ Deterministic scan: 22 banned phrases, readability metrics, jargon density.
2791
+ Zero LLM cost — pure regex + math.
2792
+
2793
+ ### Step 6: Quality Gate
2794
+ \`\`\`
2795
+ thompson_quality_gate({ writer_output: "...", feynman_verdict: "...", lint_result: "..." })
2796
+ \`\`\`
2797
+ 10-point boolean checklist → grade (exemplary/passing/needs_work/failing).
2798
+ Only distribute if passing or exemplary.
2799
+
2800
+ ## Core Principles (non-negotiable)
2801
+ 1. **Plain English Mandate**: Every jargon term gets an "in other words..." with a household analogy
2802
+ 2. **Intuition Before Mechanics**: Explain WHY before HOW
2803
+ 3. **Acknowledge Difficulty**: Validate reader confusion ("This sounds terrifying, but...")
2804
+ 4. **No Elitism**: Ban "it is obvious", "as we all know", "simply put", "just do X"
2805
+ 5. **Progressive Complexity**: Start with simplest true statement, layer up
2806
+ 6. **Visual = Analogy**: Every visual reinforces a specific text metaphor, 1:1
2807
+ 7. **12-Year-Old Bar**: If a 12-year-old can't understand it, rewrite it
2808
+
2809
+ ## After Pipeline
2810
+ - \`save_session_note\` — persist Thompson-processed content
2811
+ - \`record_learning\` — log which analogies and styles worked best
1937
2812
  - Use \`content_publish\` workflow chain for distribution`,
1938
2813
  },
1939
2814
  },
@@ -1943,21 +2818,21 @@ Only distribute if passing or exemplary.
1943
2818
  // Server instructions — tells Claude Code Tool Search (and other clients) when to search
1944
2819
  // for NodeBench tools. This is the key integration point for lazy loading compatibility.
1945
2820
  // See: https://www.anthropic.com/engineering/advanced-tool-use
1946
- const SERVER_INSTRUCTIONS = `NodeBench MCP provides structured AI development methodology tools.
1947
- Use NodeBench tools when you need to:
1948
- - Verify implementations (verification cycles, gap tracking, 6-phase flywheel)
1949
- - Run evaluations and quality gates before shipping code
1950
- - Search prior knowledge and record learnings across sessions
1951
- - Assess risk before taking actions
1952
- - Coordinate parallel agents (task locks, roles, context budget)
1953
- - Research with structured recon (web search, GitHub, RSS feeds)
1954
- - Analyze files (CSV, PDF, XLSX, images, audio, ZIP)
1955
- - Run security audits (dependency scanning, code analysis, secrets detection)
1956
- - Write and polish academic papers
1957
- - Audit SEO, analyze Figma flows, detect Android flicker
1958
- - Call LLMs (GPT, Claude, Gemini) for analysis and extraction
2821
+ const SERVER_INSTRUCTIONS = `NodeBench MCP provides structured AI development methodology tools.
2822
+ Use NodeBench tools when you need to:
2823
+ - Verify implementations (verification cycles, gap tracking, 6-phase flywheel)
2824
+ - Run evaluations and quality gates before shipping code
2825
+ - Search prior knowledge and record learnings across sessions
2826
+ - Assess risk before taking actions
2827
+ - Coordinate parallel agents (task locks, roles, context budget)
2828
+ - Research with structured recon (web search, GitHub, RSS feeds)
2829
+ - Analyze files (CSV, PDF, XLSX, images, audio, ZIP)
2830
+ - Run security audits (dependency scanning, code analysis, secrets detection)
2831
+ - Write and polish academic papers
2832
+ - Audit SEO, analyze Figma flows, detect Android flicker
2833
+ - Call LLMs (GPT, Claude, Gemini) for analysis and extraction
1959
2834
  Start with discover_tools("<your task>") to find the right tool.`;
1960
- const server = new Server({ name: "nodebench-mcp-methodology", version: "2.30.0" }, {
2835
+ const server = new Server({ name: "nodebench-mcp-methodology", version: "2.32.0" }, {
1961
2836
  capabilities: { tools: { listChanged: true }, prompts: {} },
1962
2837
  instructions: SERVER_INSTRUCTIONS,
1963
2838
  });
@@ -1970,10 +2845,12 @@ try {
1970
2845
  catch { /* instrumentation must not block server start */ }
1971
2846
  // Handle tools/list — return all tools with their JSON Schema inputSchemas
1972
2847
  // Includes MCP 2025-11-25 spec annotations: category, phase, complexity (model tier hint)
2848
+ // + MCP security annotations: readOnlyHint, destructiveHint, openWorldHint
1973
2849
  server.setRequestHandler(ListToolsRequestSchema, async () => {
1974
2850
  return {
1975
2851
  tools: allTools.map((t) => {
1976
2852
  const entry = TOOL_REGISTRY.get(t.name);
2853
+ const securityAnnotations = getToolAnnotations(t.name);
1977
2854
  return {
1978
2855
  name: t.name,
1979
2856
  description: t.description,
@@ -1984,8 +2861,13 @@ server.setRequestHandler(ListToolsRequestSchema, async () => {
1984
2861
  category: entry.category,
1985
2862
  phase: entry.phase,
1986
2863
  complexity: getToolComplexity(t.name),
2864
+ ...securityAnnotations,
2865
+ },
2866
+ } : {
2867
+ annotations: {
2868
+ ...securityAnnotations,
1987
2869
  },
1988
- } : {}),
2870
+ }),
1989
2871
  };
1990
2872
  }),
1991
2873
  };
@@ -1996,6 +2878,14 @@ server.setRequestHandler(CallToolRequestSchema, async (request) => {
1996
2878
  _abToolCallCount++;
1997
2879
  if (name === "load_toolset" || name === "unload_toolset")
1998
2880
  _abLoadEventCount++;
2881
+ // Intent-based auto-expansion: on first call, classify and load relevant toolsets
2882
+ if (!_intentClassified) {
2883
+ _intentClassified = true;
2884
+ const expanded = classifyAndExpand(name, args);
2885
+ if (expanded) {
2886
+ console.error(`[intent-classify] Auto-loaded toolsets: ${expanded.join(", ")} (from tool: ${name})`);
2887
+ }
2888
+ }
1999
2889
  const tool = toolMap.get(name);
2000
2890
  if (!tool) {
2001
2891
  return {
@@ -2059,18 +2949,30 @@ server.setRequestHandler(CallToolRequestSchema, async (request) => {
2059
2949
  else {
2060
2950
  serialized = JSON.stringify(enrichedResult, null, 2);
2061
2951
  }
2952
+ // Security: redact credentials from all tool outputs (single enforcement point)
2953
+ const sanitized = redactSecrets(serialized);
2062
2954
  const contentBlocks = [
2063
- { type: "text", text: serialized },
2955
+ { type: "text", text: sanitized },
2064
2956
  ];
2065
2957
  if (hookHint) {
2066
2958
  contentBlocks.push({ type: "text", text: hookHint });
2067
2959
  }
2960
+ // Audit log: successful tool call
2961
+ auditLog("tool_call", name, JSON.stringify(args ?? {}).substring(0, 200), true);
2068
2962
  return {
2069
2963
  content: contentBlocks,
2070
2964
  isError: false,
2071
2965
  };
2072
2966
  }
2073
2967
  catch (err) {
2968
+ // Security errors get a clean response (not a stack trace)
2969
+ if (err instanceof SecurityError) {
2970
+ auditLog("tool_call", name, JSON.stringify(args ?? {}).substring(0, 200), false, err.message);
2971
+ return {
2972
+ content: [{ type: "text", text: `[SECURITY] ${err.message}` }],
2973
+ isError: true,
2974
+ };
2975
+ }
2074
2976
  resultStatus = "error";
2075
2977
  errorMsg = err?.message || "Internal error";
2076
2978
  // Auto-log errors to main DB
@@ -2121,13 +3023,13 @@ process.on('exit', () => {
2121
3023
  try {
2122
3024
  const db = getDb();
2123
3025
  const dynamicallyLoaded = [...activeToolsets].filter(ts => !initialToolsetNames.has(ts));
2124
- db.prepare(`UPDATE ab_test_sessions SET
2125
- final_tool_count = ?,
2126
- toolsets_loaded = ?,
2127
- total_tool_calls = ?,
2128
- total_load_events = ?,
2129
- session_duration_ms = ?,
2130
- ended_at = datetime('now')
3026
+ db.prepare(`UPDATE ab_test_sessions SET
3027
+ final_tool_count = ?,
3028
+ toolsets_loaded = ?,
3029
+ total_tool_calls = ?,
3030
+ total_load_events = ?,
3031
+ session_duration_ms = ?,
3032
+ ended_at = datetime('now')
2131
3033
  WHERE id = ?`).run(allTools.length, JSON.stringify(dynamicallyLoaded), _abToolCallCount, _abLoadEventCount, Date.now() - _abStartMs, SESSION_ID);
2132
3034
  }
2133
3035
  catch { /* instrumentation must not block shutdown */ }
@@ -2157,6 +3059,15 @@ if (useEngine) {
2157
3059
  }
2158
3060
  catch { /* engine is optional — don't block MCP */ }
2159
3061
  }
3062
+ // Start observability watchdog (non-blocking, best-effort)
3063
+ try {
3064
+ initObservability(getDb);
3065
+ startWatchdog(getDb());
3066
+ }
3067
+ catch { /* observability is optional — don't block MCP */ }
3068
+ // Graceful shutdown
3069
+ process.on("SIGINT", () => { stopWatchdog(); process.exit(0); });
3070
+ process.on("SIGTERM", () => { stopWatchdog(); process.exit(0); });
2160
3071
  const toolsetInfo = cliArgs.includes("--toolsets") || cliArgs.includes("--exclude") || cliArgs.includes("--preset")
2161
3072
  ? ` [gated: ${domainTools.length} domain + 2 meta]`
2162
3073
  : "";