nodebench-mcp 2.31.1 → 2.32.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (94) hide show
  1. package/README.md +14 -6
  2. package/dist/engine/server.js +14 -4
  3. package/dist/engine/server.js.map +1 -1
  4. package/dist/index.js +1946 -670
  5. package/dist/index.js.map +1 -1
  6. package/dist/security/SecurityError.d.ts +18 -0
  7. package/dist/security/SecurityError.js +22 -0
  8. package/dist/security/SecurityError.js.map +1 -0
  9. package/dist/security/__tests__/security.test.d.ts +8 -0
  10. package/dist/security/__tests__/security.test.js +295 -0
  11. package/dist/security/__tests__/security.test.js.map +1 -0
  12. package/dist/security/auditLog.d.ts +36 -0
  13. package/dist/security/auditLog.js +178 -0
  14. package/dist/security/auditLog.js.map +1 -0
  15. package/dist/security/commandSandbox.d.ts +33 -0
  16. package/dist/security/commandSandbox.js +159 -0
  17. package/dist/security/commandSandbox.js.map +1 -0
  18. package/dist/security/config.d.ts +23 -0
  19. package/dist/security/config.js +43 -0
  20. package/dist/security/config.js.map +1 -0
  21. package/dist/security/credentialRedactor.d.ts +22 -0
  22. package/dist/security/credentialRedactor.js +118 -0
  23. package/dist/security/credentialRedactor.js.map +1 -0
  24. package/dist/security/index.d.ts +20 -0
  25. package/dist/security/index.js +21 -0
  26. package/dist/security/index.js.map +1 -0
  27. package/dist/security/pathSandbox.d.ts +23 -0
  28. package/dist/security/pathSandbox.js +160 -0
  29. package/dist/security/pathSandbox.js.map +1 -0
  30. package/dist/security/urlValidator.d.ts +23 -0
  31. package/dist/security/urlValidator.js +125 -0
  32. package/dist/security/urlValidator.js.map +1 -0
  33. package/dist/tools/agentBootstrapTools.js +22 -29
  34. package/dist/tools/agentBootstrapTools.js.map +1 -1
  35. package/dist/tools/contextSandboxTools.js +7 -9
  36. package/dist/tools/contextSandboxTools.js.map +1 -1
  37. package/dist/tools/deepSimTools.d.ts +2 -0
  38. package/dist/tools/deepSimTools.js +404 -0
  39. package/dist/tools/deepSimTools.js.map +1 -0
  40. package/dist/tools/dimensionTools.d.ts +2 -0
  41. package/dist/tools/dimensionTools.js +246 -0
  42. package/dist/tools/dimensionTools.js.map +1 -0
  43. package/dist/tools/executionTraceTools.d.ts +2 -0
  44. package/dist/tools/executionTraceTools.js +446 -0
  45. package/dist/tools/executionTraceTools.js.map +1 -0
  46. package/dist/tools/founderTools.d.ts +13 -0
  47. package/dist/tools/founderTools.js +595 -0
  48. package/dist/tools/founderTools.js.map +1 -0
  49. package/dist/tools/gitWorkflowTools.js +14 -10
  50. package/dist/tools/gitWorkflowTools.js.map +1 -1
  51. package/dist/tools/githubTools.js +19 -2
  52. package/dist/tools/githubTools.js.map +1 -1
  53. package/dist/tools/index.d.ts +87 -0
  54. package/dist/tools/index.js +102 -0
  55. package/dist/tools/index.js.map +1 -0
  56. package/dist/tools/localFileTools.js +24 -12
  57. package/dist/tools/localFileTools.js.map +1 -1
  58. package/dist/tools/memoryDecay.d.ts +70 -0
  59. package/dist/tools/memoryDecay.js +247 -0
  60. package/dist/tools/memoryDecay.js.map +1 -0
  61. package/dist/tools/missionHarnessTools.d.ts +32 -0
  62. package/dist/tools/missionHarnessTools.js +972 -0
  63. package/dist/tools/missionHarnessTools.js.map +1 -0
  64. package/dist/tools/observabilityTools.d.ts +15 -0
  65. package/dist/tools/observabilityTools.js +787 -0
  66. package/dist/tools/observabilityTools.js.map +1 -0
  67. package/dist/tools/openclawTools.js +151 -36
  68. package/dist/tools/openclawTools.js.map +1 -1
  69. package/dist/tools/progressiveDiscoveryTools.js +5 -4
  70. package/dist/tools/progressiveDiscoveryTools.js.map +1 -1
  71. package/dist/tools/qualityGateTools.js +118 -2
  72. package/dist/tools/qualityGateTools.js.map +1 -1
  73. package/dist/tools/rssTools.js +3 -0
  74. package/dist/tools/rssTools.js.map +1 -1
  75. package/dist/tools/scraplingTools.js +15 -0
  76. package/dist/tools/scraplingTools.js.map +1 -1
  77. package/dist/tools/seoTools.js +66 -1
  78. package/dist/tools/seoTools.js.map +1 -1
  79. package/dist/tools/sessionMemoryTools.js +50 -11
  80. package/dist/tools/sessionMemoryTools.js.map +1 -1
  81. package/dist/tools/temporalIntelligenceTools.d.ts +12 -0
  82. package/dist/tools/temporalIntelligenceTools.js +1068 -0
  83. package/dist/tools/temporalIntelligenceTools.js.map +1 -0
  84. package/dist/tools/toolRegistry.d.ts +19 -0
  85. package/dist/tools/toolRegistry.js +857 -31
  86. package/dist/tools/toolRegistry.js.map +1 -1
  87. package/dist/tools/webTools.js +14 -1
  88. package/dist/tools/webTools.js.map +1 -1
  89. package/dist/tools/webmcpTools.js +13 -2
  90. package/dist/tools/webmcpTools.js.map +1 -1
  91. package/dist/toolsetRegistry.js +13 -0
  92. package/dist/toolsetRegistry.js.map +1 -1
  93. package/dist/types.d.ts +10 -0
  94. package/package.json +124 -124
package/dist/index.js CHANGED
@@ -20,6 +20,7 @@ import { Server } from "@modelcontextprotocol/sdk/server/index.js";
20
20
  import { StdioServerTransport } from "@modelcontextprotocol/sdk/server/stdio.js";
21
21
  import { ListToolsRequestSchema, CallToolRequestSchema, ListPromptsRequestSchema, GetPromptRequestSchema, } from "@modelcontextprotocol/sdk/types.js";
22
22
  import { getDb, genId } from "./db.js";
23
+ import { redactSecrets, auditLog, SecurityError } from "./security/index.js";
23
24
  import { startDashboardServer } from "./dashboard/server.js";
24
25
  import { startEngineServer } from "./engine/server.js";
25
26
  import { getAnalyticsDb, closeAnalyticsDb, clearOldRecords } from "./analytics/index.js";
@@ -27,9 +28,10 @@ import { AnalyticsTracker } from "./analytics/toolTracker.js";
27
28
  import { generateSmartPreset, formatPresetRecommendation, listPresets } from "./analytics/index.js";
28
29
  import { getProjectUsageSummary, exportUsageStats, formatStatsDisplay } from "./analytics/index.js";
29
30
  import { TOOLSET_MAP, TOOL_TO_TOOLSET } from "./toolsetRegistry.js";
31
+ import { initObservability, startWatchdog, stopWatchdog } from "./tools/observabilityTools.js";
30
32
  import { createMetaTools } from "./tools/metaTools.js";
31
33
  import { createProgressiveDiscoveryTools } from "./tools/progressiveDiscoveryTools.js";
32
- import { getQuickRef, ALL_REGISTRY_ENTRIES, TOOL_REGISTRY, getToolComplexity, _setDbAccessor, hybridSearch, WORKFLOW_CHAINS } from "./tools/toolRegistry.js";
34
+ import { getQuickRef, ALL_REGISTRY_ENTRIES, TOOL_REGISTRY, getToolComplexity, getToolAnnotations, _setDbAccessor, hybridSearch, WORKFLOW_CHAINS } from "./tools/toolRegistry.js";
33
35
  // TOON format — ~40% token savings on tool responses
34
36
  import { encode as toonEncode } from "@toon-format/toon";
35
37
  // Embedding provider — neural semantic search
@@ -43,28 +45,36 @@ const showStats = cliArgs.includes("--stats");
43
45
  const exportStats = cliArgs.includes("--export-stats");
44
46
  const resetStats = cliArgs.includes("--reset-stats");
45
47
  const listPresetsFlag = cliArgs.includes("--list-presets");
48
+ const healthFlag = cliArgs.includes("--health");
49
+ const statusFlag = cliArgs.includes("--status");
50
+ const diagnoseFlag = cliArgs.includes("--diagnose");
51
+ const autoPresetFlag = cliArgs.includes("--auto-preset");
52
+ const syncConfigsFlag = cliArgs.includes("--sync-configs");
46
53
  const useEngine = cliArgs.includes("--engine");
47
54
  const engineSecret = (() => {
48
55
  const idx = cliArgs.indexOf("--engine-secret");
49
56
  return idx >= 0 && idx + 1 < cliArgs.length ? cliArgs[idx + 1] : process.env.ENGINE_SECRET;
50
57
  })();
51
58
  export { TOOLSET_MAP };
52
- const DEFAULT_TOOLSETS = ["verification", "eval", "quality_gate", "learning", "flywheel", "recon", "security", "boilerplate", "skill_update", "context_sandbox"];
59
+ const DEFAULT_TOOLSETS = ["verification", "eval", "quality_gate", "learning", "flywheel", "recon", "security", "boilerplate", "skill_update", "context_sandbox", "observability", "execution_trace", "mission_harness", "deep_sim", "founder"];
53
60
  const PRESETS = {
54
61
  default: DEFAULT_TOOLSETS,
55
- // Themed presets — bridge between default (50 tools) and full (175 tools)
62
+ // Themed presets — bridge between default (81 tools) and full (295 tools)
56
63
  web_dev: [...DEFAULT_TOOLSETS, "ui_capture", "vision", "web", "seo", "git_workflow", "architect", "ui_ux_dive", "ui_ux_dive_v2", "mcp_bridge", "qa_orchestration", "visual_qa", "design_governance", "web_scraping"],
57
- research: [...DEFAULT_TOOLSETS, "web", "llm", "rss", "email", "docs", "research_optimizer", "web_scraping"],
58
- data: [...DEFAULT_TOOLSETS, "local_file", "llm", "web", "research_optimizer", "web_scraping"],
64
+ research: [...DEFAULT_TOOLSETS, "web", "llm", "rss", "email", "docs", "research_optimizer", "web_scraping", "temporal_intelligence", "deep_sim"],
65
+ data: [...DEFAULT_TOOLSETS, "local_file", "llm", "web", "research_optimizer", "web_scraping", "temporal_intelligence"],
59
66
  devops: [...DEFAULT_TOOLSETS, "git_workflow", "session_memory", "benchmark", "pattern"],
60
67
  mobile: [...DEFAULT_TOOLSETS, "ui_capture", "vision", "flicker_detection", "ui_ux_dive", "ui_ux_dive_v2", "mcp_bridge", "visual_qa"],
61
68
  academic: [...DEFAULT_TOOLSETS, "research_writing", "llm", "web", "local_file"],
62
- multi_agent: [...DEFAULT_TOOLSETS, "parallel", "self_eval", "session_memory", "pattern", "toon", "qa_orchestration", "agent_traverse", "engine_context", "research_optimizer", "web_scraping"],
69
+ multi_agent: [...DEFAULT_TOOLSETS, "parallel", "self_eval", "session_memory", "pattern", "toon", "qa_orchestration", "agent_traverse", "engine_context", "research_optimizer", "web_scraping", "deep_sim"],
63
70
  content: [...DEFAULT_TOOLSETS, "llm", "critter", "email", "rss", "platform", "architect", "local_dashboard", "engine_context", "thompson_protocol"],
71
+ // Cursor IDE has a hard 40-tool limit across ALL MCP servers.
72
+ // 28 tools = 22 domain + 3 meta + 3 discovery — leaves 12 slots for other servers.
73
+ cursor: ["deep_sim", "quality_gate", "learning", "session_memory", "web", "toon"],
64
74
  full: Object.keys(TOOLSET_MAP),
65
75
  };
66
76
  const PRESET_DESCRIPTIONS = {
67
- default: "Core AI Flywheel — verification, eval, quality gates, learning, recon",
77
+ default: "Core AI Flywheel — verification, eval, quality gates, learning, recon, mission harness",
68
78
  web_dev: "Web projects — adds visual QA, SEO audit, git workflow, code architecture",
69
79
  research: "Research workflows — adds web search, LLM calls, RSS feeds, email, docs",
70
80
  data: "Data analysis — adds CSV/XLSX/PDF/JSON parsing, LLM extraction, web fetch",
@@ -73,6 +83,7 @@ const PRESET_DESCRIPTIONS = {
73
83
  academic: "Academic papers — adds polish, review, translate, logic check, data analysis",
74
84
  multi_agent: "Multi-agent teams — adds task locking, messaging, roles, oracle testing, frontend traversal",
75
85
  content: "Content & publishing — adds LLM, accountability, email, RSS, platform queue",
86
+ cursor: "Cursor IDE (28 tools) — decision intelligence, research, quality gates, session memory, web, TOON encoding. Leaves 12 slots for other MCP servers.",
76
87
  full: "Everything — all toolsets for maximum coverage",
77
88
  };
78
89
  function parseToolsets() {
@@ -87,6 +98,7 @@ function parseToolsets() {
87
98
  " --exclude <list> Comma-separated toolsets to exclude",
88
99
  " --preset <name> Use a preset: default or full",
89
100
  " --smart-preset Generate smart preset recommendation based on project type and usage history",
101
+ " --auto-preset Detect project type from package.json/pyproject.toml and recommend a preset",
90
102
  " --stats Show usage statistics for current project",
91
103
  " --export-stats Export usage statistics to JSON",
92
104
  " --reset-stats Clear all usage analytics data",
@@ -96,6 +108,11 @@ function parseToolsets() {
96
108
  " --no-embedding Disable neural embedding search (uses local HuggingFace model or API keys)",
97
109
  " --engine Start headless API engine server on port 6276",
98
110
  " --engine-secret <s> Require Bearer token for engine API (or set ENGINE_SECRET env var)",
111
+ " --explain <tool> Show plain-English explanation of a tool and exit",
112
+ " --health Run diagnostic health check and exit",
113
+ " --status Show live system pulse (uptime, errors, call rates) and exit",
114
+ " --diagnose Run drift detection + auto-heal and exit",
115
+ " --sync-configs Write MCP config to Claude Code, Cursor, and Windsurf IDE locations",
99
116
  " --help Show this help and exit",
100
117
  "",
101
118
  "Available toolsets:",
@@ -108,12 +125,12 @@ function parseToolsets() {
108
125
  }),
109
126
  "",
110
127
  "Examples:",
111
- " npx nodebench-mcp # Default (50 tools) - core AI Flywheel",
128
+ " npx nodebench-mcp # Default (81 tools) - core AI Flywheel",
112
129
  " npx nodebench-mcp --preset web_dev # Web development (+ vision, SEO, git)",
113
130
  " npx nodebench-mcp --preset research # Research workflows (+ web, LLM, RSS, email)",
114
131
  " npx nodebench-mcp --preset data # Data analysis (+ local file parsing, LLM)",
115
132
  " npx nodebench-mcp --preset academic # Academic writing (+ paper tools, LLM)",
116
- " npx nodebench-mcp --preset full # All 175 tools",
133
+ " npx nodebench-mcp --preset full # All 295 tools",
117
134
  " npx nodebench-mcp --smart-preset # Get AI-powered preset recommendation",
118
135
  " npx nodebench-mcp --stats # Show usage statistics",
119
136
  " npx nodebench-mcp --toolsets verification,eval,recon",
@@ -197,6 +214,997 @@ if (resetStats || useSmartPreset || showStats || exportStats) {
197
214
  }
198
215
  process.exit(0);
199
216
  }
// ── Explain CLI handler (run-and-exit) ────────────────────────────────
// `--explain <tool>` prints a plain-English summary of one registered tool
// (description, category/phase/complexity, follow-up tools, parameters and an
// analogy) to stdout and exits with code 0. A missing argument or an unknown
// tool name prints usage / suggestions to stderr and exits with code 1.
const explainIdx = cliArgs.indexOf("--explain");
if (explainIdx !== -1) {
    const toolName = cliArgs[explainIdx + 1];
    // ANSI styling is only emitted when stdout is an interactive terminal.
    const USE_COLOR = process.stdout.isTTY;
    const B = USE_COLOR ? "\x1b[1m" : "";
    const C = USE_COLOR ? "\x1b[36m" : "";
    const G = USE_COLOR ? "\x1b[32m" : "";
    const Y = USE_COLOR ? "\x1b[33m" : "";
    const D = USE_COLOR ? "\x1b[2m" : "";
    const X = USE_COLOR ? "\x1b[0m" : "";
    // The value after --explain must be a tool name, not another flag.
    if (!toolName || toolName.startsWith("--")) {
        console.error("Usage: nodebench-mcp --explain <tool_name>");
        console.error("Example: nodebench-mcp --explain start_verification_cycle");
        process.exit(1);
    }
    const entry = TOOL_REGISTRY.get(toolName);
    if (!entry) {
        // Fuzzy match: suggest up to five tools that contain the full query or
        // any of its underscore-separated words. Empty words (from a leading,
        // trailing or doubled "_") are dropped, because "".includes-style
        // matching would otherwise select every tool in the registry.
        const words = toolName.split("_").filter(w => w.length > 0);
        const candidates = ALL_REGISTRY_ENTRIES
            .filter(e => e.name.includes(toolName) || words.some(w => e.name.includes(w)))
            .slice(0, 5);
        console.error(`Tool "${toolName}" not found in registry.`);
        if (candidates.length > 0) {
            console.error(`\nDid you mean:`);
            for (const c of candidates) {
                console.error(` --explain ${c.name}`);
            }
        }
        process.exit(1);
    }
    // Find the actual McpTool for description + inputSchema
    const allDomainTools = Object.values(TOOLSET_MAP).flat();
    const mcpTool = allDomainTools.find(t => t.name === toolName);
    const complexity = getToolComplexity(toolName);
    const complexityLabel = { low: "Haiku (fast, cheap)", medium: "Sonnet (balanced)", high: "Opus (deep reasoning)" };
    const toolset = TOOL_TO_TOOLSET.get(toolName) ?? "unknown";
    const lines = [];
    lines.push(`${B}${entry.name}${X}`);
    lines.push("");
    // Thompson-style "what problem does this solve" section
    if (mcpTool?.description) {
        lines.push(`${C}What it does${X}`);
        lines.push(` ${mcpTool.description}`);
        lines.push("");
    }
    // Category + phase + complexity
    lines.push(`${C}At a glance${X}`);
    lines.push(` Category: ${entry.category}`);
    lines.push(` Phase: ${entry.phase}`);
    lines.push(` Toolset: ${toolset}`);
    lines.push(` Complexity: ${complexity} — ${complexityLabel[complexity] ?? complexity}`);
    lines.push(` Tags: ${entry.tags.join(", ")}`);
    lines.push("");
    // QuickRef — what to do next (the actionable guidance)
    lines.push(`${C}What to do next${X} ${D}(Thompson: intuition before mechanics)${X}`);
    lines.push(` ${entry.quickRef.nextAction}`);
    if (entry.quickRef.tip) {
        lines.push(` ${Y}Tip:${X} ${entry.quickRef.tip}`);
    }
    if (entry.quickRef.methodology) {
        lines.push(` ${D}Methodology: ${entry.quickRef.methodology}${X}`);
    }
    lines.push("");
    // Next tools — the chain
    if (entry.quickRef.nextTools.length > 0) {
        lines.push(`${C}Commonly used after this${X}`);
        for (const nt of entry.quickRef.nextTools) {
            const ntEntry = TOOL_REGISTRY.get(nt);
            lines.push(` ${G}→${X} ${nt}${ntEntry ? ` ${D}(${ntEntry.category}, ${ntEntry.phase})${X}` : ""}`);
        }
        lines.push("");
    }
    // Input schema (if available)
    if (mcpTool?.inputSchema?.properties) {
        lines.push(`${C}Parameters${X}`);
        const props = mcpTool.inputSchema.properties;
        const required = new Set(mcpTool.inputSchema.required ?? []);
        for (const [key, schema] of Object.entries(props)) {
            const req = required.has(key) ? `${Y}*${X}` : " ";
            // JSON Schema allows `type` to be an array (e.g. ["string", "null"]);
            // normalise to a string so padEnd() cannot throw.
            const rawType = schema?.type;
            const type = Array.isArray(rawType) ? rawType.join("|") : String(rawType ?? "any");
            // Descriptions are truncated to 80 characters to keep one row per parameter.
            const desc = typeof schema?.description === "string" && schema.description
                ? ` ${D}${schema.description.slice(0, 80)}${X}`
                : "";
            lines.push(` ${req} ${key.padEnd(24)} ${type.padEnd(10)}${desc}`);
        }
        lines.push(` ${D}(* = required)${X}`);
        lines.push("");
    }
    // Analogy — Thompson protocol. Keyed by registry category; categories
    // without an entry fall back to a generic sentence below.
    lines.push(`${C}Think of it like...${X}`);
    const analogies = {
        verification: "A pre-flight checklist — you wouldn't fly without checking every system first.",
        eval: "A lab experiment — you set up controlled conditions, run the test, and measure what actually happened.",
        quality_gate: "A bouncer at a club — it checks if your code meets the standards before letting it through.",
        learning: "A journal — you write down what worked and what didn't so you don't repeat mistakes.",
        flywheel: "A spinning wheel that gains momentum — each iteration makes the next one faster and better.",
        recon: "A detective gathering clues — you survey the scene before making any moves.",
        security: "A locksmith checking every door and window — systematic, thorough, nothing left unlocked.",
        boilerplate: "A cookie cutter — it stamps out a proven shape so you can focus on the filling.",
        research_writing: "A research assistant — it helps you find, cite, and structure knowledge.",
        web: "A web browser for your AI — it can fetch pages, search the internet, and extract information.",
        github: "Your Git assistant — it handles PRs, issues, and repo operations without you leaving the terminal.",
        email: "A mailroom worker — it can send, receive, and organize emails programmatically.",
        llm: "A phone that can call other AI models — sometimes you need a specialist for a specific question.",
        vision: "Eyes for your AI — it can look at screenshots, images, and visual content.",
        ui_capture: "A camera pointed at your app — it takes screenshots so you can see what users see.",
        parallel: "A team of workers — instead of one person doing everything, you split the work and do it simultaneously.",
        documentation: "A technical writer — it reads code and produces human-friendly explanations.",
        agent_bootstrap: "A setup wizard — it configures a new agent with everything it needs to start working.",
        self_eval: "A mirror — the agent looks at its own work and grades it honestly.",
        platform: "A Swiss Army knife for your OS — file operations, system info, environment checks.",
        skill_update: "A teacher's gradebook — it tracks which skills are fresh and which need a refresher.",
        local_file: "A file parser — it can read PDFs, spreadsheets, images, and documents without external services.",
        seo: "A search engine consultant — it checks how visible and crawlable your site is.",
        rss: "A news aggregator — it monitors feeds and brings you the latest updates.",
        thompson_protocol: "A writing coach — it makes sure your content is clear, uses analogies, and never talks down to the reader.",
    };
    const analogy = analogies[entry.category] ?? `A specialized tool in the ${entry.category} category — it does one thing well so you can focus on the bigger picture.`;
    lines.push(` ${analogy}`);
    console.log(lines.join("\n"));
    process.exit(0);
}
// ── Auto-preset detection (run-and-exit) ──────────────────────────────
// `--auto-preset` inspects the current working directory for project markers
// (package.json dependencies, pyproject.toml contents, LaTeX entry files) and
// recommends a preset. The human-readable report goes to stderr; the bare
// preset name goes to stdout so the command can be composed in scripts.
// When several markers match, the last one evaluated wins.
if (autoPresetFlag) {
    const fs = await import("node:fs");
    const path = await import("node:path");
    const cwd = process.cwd();
    // The report is written to stderr, so colour support is probed there.
    const colorize = process.stderr.isTTY;
    const bold = colorize ? "\x1b[1m" : "";
    const cyan = colorize ? "\x1b[36m" : "";
    const reset = colorize ? "\x1b[0m" : "";
    const signals = [];
    let recommended = "default";

    // Check package.json: each rule maps a set of dependency names to a preset.
    // Rules are evaluated top to bottom and a later match overrides an earlier one.
    const manifestPath = path.join(cwd, "package.json");
    if (fs.existsSync(manifestPath)) {
        try {
            const manifest = JSON.parse(fs.readFileSync(manifestPath, "utf-8"));
            const depNames = Object.keys({ ...manifest.dependencies, ...manifest.devDependencies });
            const dependencyRules = [
                // Web frameworks
                { label: "web", preset: "web_dev", markers: ["react", "vue", "svelte", "next", "nuxt", "angular", "@angular/core", "vite", "webpack", "gatsby", "remix", "astro"] },
                // Mobile
                { label: "mobile", preset: "mobile", markers: ["react-native", "expo", "@capacitor/core", "ionic", "@ionic/react", "@ionic/vue", "nativescript"] },
                // Data/ML
                { label: "data", preset: "data", markers: ["@tensorflow/tfjs", "onnxruntime-node", "ml5", "brain.js", "d3", "chart.js", "recharts", "plotly.js"] },
                // DevOps
                { label: "devops", preset: "devops", markers: ["aws-sdk", "@aws-sdk/client-s3", "docker-compose", "pulumi", "@pulumi/aws", "serverless"] },
                // Research / content
                { label: "research/AI", preset: "research", markers: ["@anthropic-ai/sdk", "openai", "langchain", "@langchain/core", "llamaindex"] },
                // Multi-agent
                { label: "multi-agent", preset: "multi_agent", markers: ["@modelcontextprotocol/sdk", "autogen", "crewai"] },
                // Content — only counts when no web framework was detected
                { label: "content", preset: "content", markers: ["marked", "remark", "rehype", "contentful", "sanity", "@sanity/client", "strapi"], skipWhenWeb: true },
            ];
            let webDetected = false;
            for (const rule of dependencyRules) {
                const hits = depNames.filter(name => rule.markers.includes(name));
                if (rule.label === "web") {
                    webDetected = hits.length > 0;
                }
                if (hits.length === 0) {
                    continue;
                }
                if (rule.skipWhenWeb && webDetected) {
                    continue;
                }
                signals.push(`${rule.label}: ${hits.join(", ")}`);
                recommended = rule.preset;
            }
        }
        catch { /* malformed package.json */ }
    }

    // Check pyproject.toml: only the first matching pattern contributes.
    const pyprojectPath = path.join(cwd, "pyproject.toml");
    if (fs.existsSync(pyprojectPath)) {
        try {
            const content = fs.readFileSync(pyprojectPath, "utf-8");
            const pythonRules = [
                [/torch|tensorflow|scikit|pandas|numpy|scipy/i, "python: ML/data libraries detected", "data"],
                [/fastapi|flask|django/i, "python: web framework detected", "web_dev"],
                [/langchain|openai|anthropic/i, "python: AI/research libraries detected", "research"],
            ];
            const firstMatch = pythonRules.find(([pattern]) => pattern.test(content));
            if (firstMatch) {
                const [, message, preset] = firstMatch;
                signals.push(message);
                recommended = preset;
            }
        }
        catch { /* malformed */ }
    }

    // Check for academic markers
    const hasLatex = ["main.tex", "paper.tex"].some(file => fs.existsSync(path.join(cwd, file)));
    if (hasLatex) {
        signals.push("academic: LaTeX files found");
        recommended = "academic";
    }

    // Output: sum the tools of every toolset in the preset, plus a fixed 12.
    // NOTE(review): the 12 presumably accounts for always-loaded meta/discovery
    // tools — confirm against the server's tool registration.
    const presetToolsets = PRESETS[recommended];
    let toolCount = 0;
    if (presetToolsets) {
        toolCount = presetToolsets.reduce((sum, key) => sum + (TOOLSET_MAP[key]?.length ?? 0), 0) + 12;
    }
    console.error(`${bold}Auto-Preset Detection${reset} (${cwd})`);
    console.error("");
    if (signals.length > 0) {
        console.error(`${cyan}Signals${reset}`);
        for (const signal of signals) {
            console.error(` - ${signal}`);
        }
        console.error("");
    }
    console.error(`${bold}Recommended:${reset} --preset ${recommended} (${toolCount} tools)`);
    if (signals.length === 0) {
        console.error(" No project markers found — using default preset.");
    }
    console.error("");
    console.error(`Run: npx nodebench-mcp --preset ${recommended}`);
    // Also output just the preset name to stdout (composable)
    console.log(recommended);
    process.exit(0);
}
// ── Health CLI handler (run-and-exit) ─────────────────────────────────
// `--health` prints a diagnostic report to stdout and exits with code 0:
// tool counts for the active preset, TOON/embedding switches, local database
// and cache files under ~/.nodebench, relevant environment variables, optional
// npm packages, and reachability of the local Python helper servers.
if (healthFlag) {
    const os = await import("node:os");
    const path = await import("node:path");
    const fs = await import("node:fs");
    const { createRequire } = await import("node:module");
    const _require = createRequire(import.meta.url);
    // ANSI styling is only emitted when stdout is an interactive terminal.
    const USE_COLOR = process.stdout.isTTY;
    const G = USE_COLOR ? "\x1b[32m" : "";
    const R = USE_COLOR ? "\x1b[31m" : "";
    const Y = USE_COLOR ? "\x1b[33m" : "";
    const C = USE_COLOR ? "\x1b[36m" : "";
    const B = USE_COLOR ? "\x1b[1m" : "";
    const X = USE_COLOR ? "\x1b[0m" : "";
    const ok = `${G}OK${X}`;
    const warn = `${Y}WARN${X}`;
    const fail = `${R}FAIL${X}`;
    const lines = [];
    // Read the version from the package manifest instead of hard-coding it, so
    // the banner cannot drift from the published release.
    // Assumes this file lives one directory below package.json (dist/index.js).
    let pkgVersion = "";
    try {
        pkgVersion = String(_require("../package.json").version ?? "");
    }
    catch { /* manifest not resolvable — banner omits the version */ }
    lines.push(`${B}NodeBench MCP${pkgVersion ? ` v${pkgVersion}` : ""} — Health Check${X}`);
    lines.push("");
    // 1. Tool count + preset
    const presetIdx = cliArgs.indexOf("--preset");
    const activePreset = presetIdx !== -1 && cliArgs[presetIdx + 1] ? cliArgs[presetIdx + 1] : "default";
    const domainCount = Object.keys(TOOLSET_MAP).length;
    const totalTools = Object.values(TOOLSET_MAP).reduce((s, v) => s + v.length, 0);
    // Own-property lookup only: a CLI value such as "constructor" must not
    // resolve to an inherited Object.prototype member.
    const presetToolsets = Object.hasOwn(PRESETS, activePreset) ? PRESETS[activePreset] : undefined;
    // NOTE(review): the fixed 12 presumably accounts for always-loaded
    // meta/discovery tools — confirm against the server's tool registration.
    const presetToolCount = presetToolsets
        ? presetToolsets.reduce((s, k) => s + (TOOLSET_MAP[k]?.length ?? 0), 0) + 12
        : totalTools;
    lines.push(`${C}Tools${X} ${presetToolCount} loaded (preset: ${activePreset}) | ${totalTools} total across ${domainCount} domains`);
    // 2. TOON + Embedding
    lines.push(`${C}TOON${X} ${useToon ? ok : `${warn} disabled (--no-toon)`}`);
    lines.push(`${C}Embedding${X} ${useEmbedding ? ok : `${warn} disabled (--no-embedding)`}`);
    // Reports whether a file exists and, when it can be stat'ed, its size in KB.
    // A file that disappears between the two calls is reported without a size
    // instead of aborting the whole health check.
    const describeFile = (filePath) => {
        if (!fs.existsSync(filePath)) {
            return { exists: false, size: "" };
        }
        try {
            return { exists: true, size: ` (${(fs.statSync(filePath).size / 1024).toFixed(0)} KB)` };
        }
        catch {
            return { exists: true, size: "" };
        }
    };
    // 3. Database
    const dbDir = path.join(os.homedir(), ".nodebench");
    const dbFile = describeFile(path.join(dbDir, "nodebench.db"));
    const dbExists = dbFile.exists;
    lines.push(`${C}Database${X} ${dbExists ? `${ok}${dbFile.size}` : `${warn} not initialized (will create on first run)`}`);
    // 4. Analytics DB
    const analyticsExists = fs.existsSync(path.join(dbDir, "analytics.db"));
    lines.push(`${C}Analytics${X} ${analyticsExists ? ok : `${warn} no usage data yet`}`);
    // 5. Embedding cache
    const cacheFile = describeFile(path.join(dbDir, "embedding_cache.json"));
    lines.push(`${C}Emb Cache${X} ${cacheFile.exists ? `${ok}${cacheFile.size}` : `${warn} not built yet`}`);
    // 6. Environment variables
    lines.push("");
    lines.push(`${B}Environment${X}`);
    const envChecks = [
        ["ANTHROPIC_API_KEY", "Claude LLM tools", "llm"],
        ["OPENAI_API_KEY", "OpenAI + embeddings", "llm"],
        ["GEMINI_API_KEY", "Gemini + embeddings", "llm"],
        ["GITHUB_TOKEN", "GitHub tools", "github"],
        ["BROWSERBASE_API_KEY", "Web scraping", "web"],
        ["FIRECRAWL_API_KEY", "Web crawling", "web"],
        ["SMTP_HOST", "Email sending", "email"],
        ["IMAP_HOST", "Email reading", "email"],
    ];
    // Health output is routinely pasted into bug reports, so credential-like
    // variables are reported as "(set)" rather than echoing part of the secret.
    // Non-secret values (host names) keep the short preview.
    const SECRET_NAME = /(KEY|TOKEN|SECRET|PASSWORD)$/;
    for (const [key, desc] of envChecks) {
        const raw = process.env[key];
        const set = !!raw;
        let val = "";
        if (set) {
            val = SECRET_NAME.test(key) ? "(set)" : raw.slice(0, 4) + "...";
        }
        lines.push(` ${set ? ok : `${Y}--${X}`} ${key.padEnd(22)} ${desc}${set ? ` ${C}${val}${X}` : ""}`);
    }
    // 7. Optional npm packages
    lines.push("");
    lines.push(`${B}Optional Packages${X}`);
    const _isInstalled = (pkg) => {
        try {
            _require.resolve(pkg);
            return true;
        }
        catch {
            return false;
        }
    };
    const pkgChecks = [
        ["playwright", "UI capture + screenshots"],
        ["sharp", "Image processing"],
        ["@huggingface/transformers", "Local embeddings (384-dim)"],
        ["tesseract.js", "OCR text extraction"],
        ["pdf-parse", "PDF parsing"],
        ["mammoth", "DOCX parsing"],
        ["xlsx", "Spreadsheet parsing"],
    ];
    // Resolve each package once; the result feeds both the table and the summary.
    const pkgStatus = pkgChecks.map(([pkg, desc]) => ({ pkg, desc, installed: _isInstalled(pkg) }));
    for (const { pkg, desc, installed } of pkgStatus) {
        lines.push(` ${installed ? ok : `${Y}--${X}`} ${pkg.padEnd(30)} ${desc}`);
    }
    // 8. Python servers
    lines.push("");
    lines.push(`${B}Python Servers${X}`);
    const serverChecks = [
        ["Flicker Detection", 8006],
        ["Figma Flow", 8007],
    ];
    // Probe all servers concurrently so the worst case is one 2 s timeout, not
    // one per server. Promise.all preserves input order for the report.
    const serverStatus = await Promise.all(serverChecks.map(async ([name, port]) => {
        try {
            const resp = await fetch(`http://127.0.0.1:${port}/health`, { signal: AbortSignal.timeout(2000) });
            return { name, port, reachable: resp.ok };
        }
        catch {
            // Connection refused or timed out: the server is not running.
            return { name, port, reachable: false };
        }
    }));
    for (const { name, port, reachable } of serverStatus) {
        lines.push(` ${reachable ? ok : `${Y}--${X}`} ${name.padEnd(22)} :${port}${reachable ? "" : " (not running)"}`);
    }
    // Summary
    lines.push("");
    const allEnvSet = envChecks.filter(([k]) => !!process.env[k]).length;
    const allPkgSet = pkgStatus.filter(p => p.installed).length;
    lines.push(`${B}Summary${X} ${allEnvSet}/${envChecks.length} env vars | ${allPkgSet}/${pkgChecks.length} packages | ${dbExists ? "DB ready" : "DB pending"}`);
    console.log(lines.join("\n"));
    process.exit(0);
}
577
+ // ── Status CLI handler (run-and-exit) ─────────────────────────────────
578
if (statusFlag) {
    // Status CLI handler (run-and-exit): prints a summary of recent tool-call
    // activity read from the local SQLite database, then exits the process.
    const os = await import("node:os");
    const path = await import("node:path");
    const fs = await import("node:fs");
    // ANSI escape codes are emitted only when stdout is a terminal.
    const USE_COLOR = process.stdout.isTTY;
    const B = USE_COLOR ? "\x1b[1m" : "";
    const C = USE_COLOR ? "\x1b[36m" : "";
    const G = USE_COLOR ? "\x1b[32m" : "";
    const Y = USE_COLOR ? "\x1b[33m" : "";
    const R = USE_COLOR ? "\x1b[31m" : "";
    const X = USE_COLOR ? "\x1b[0m" : "";
    const dir = path.join(os.homedir(), ".nodebench");
    const dbPath = path.join(dir, "nodebench.db");
    if (!fs.existsSync(dbPath)) {
        console.error("No database found. Run the MCP server first to initialize.");
        process.exit(1);
    }
    // Open DB directly (read-only) for status queries
    const Database = (await import("better-sqlite3")).default;
    const db = new Database(dbPath, { readonly: true });
    const lines = [];
    lines.push(`${B}NodeBench MCP — System Status${X}`);
    lines.push("");
    // Formats a success percentage. The "%" sign is part of the returned value
    // so that the no-data case renders as "N/A" instead of "N/A%".
    const successRate = (total, errors) => (total > 0 ? `${((total - errors) / total * 100).toFixed(1)}%` : "N/A");
    // Call volume from tool_call_log (used as a proxy for when the server was active)
    try {
        const recent = db.prepare(`SELECT COUNT(*) as cnt FROM tool_call_log WHERE created_at > datetime('now', '-1 hour')`).get();
        const today = db.prepare(`SELECT COUNT(*) as cnt FROM tool_call_log WHERE created_at > datetime('now', '-24 hours')`).get();
        const week = db.prepare(`SELECT COUNT(*) as cnt FROM tool_call_log WHERE created_at > datetime('now', '-7 days')`).get();
        const errors1h = db.prepare(`SELECT COUNT(*) as cnt FROM tool_call_log WHERE result_status = 'error' AND created_at > datetime('now', '-1 hour')`).get();
        const errors24h = db.prepare(`SELECT COUNT(*) as cnt FROM tool_call_log WHERE result_status = 'error' AND created_at > datetime('now', '-24 hours')`).get();
        lines.push(`${C}Call Volume${X}`);
        lines.push(` Last 1h: ${recent.cnt} calls (${errors1h.cnt} errors)`);
        lines.push(` Last 24h: ${today.cnt} calls (${errors24h.cnt} errors)`);
        lines.push(` Last 7d: ${week.cnt} calls`);
        const rate1h = successRate(recent.cnt, errors1h.cnt);
        const rate24h = successRate(today.cnt, errors24h.cnt);
        lines.push(` Success: ${rate1h} (1h) / ${rate24h} (24h)`);
        lines.push("");
        // Top 5 tools by call count over the last 24 hours
        const topTools = db.prepare(`SELECT tool_name, COUNT(*) as calls, SUM(CASE WHEN result_status='error' THEN 1 ELSE 0 END) as errs, ROUND(AVG(duration_ms)) as avg_ms
 FROM tool_call_log WHERE created_at > datetime('now', '-24 hours')
 GROUP BY tool_name ORDER BY calls DESC LIMIT 5`).all();
        if (topTools.length > 0) {
            lines.push(`${C}Top Tools (24h)${X}`);
            for (const t of topTools) {
                const errTag = t.errs > 0 ? ` ${R}${t.errs} err${X}` : "";
                // AVG() yields NULL when no row in the group has a duration.
                const avgLabel = t.avg_ms == null ? "n/a" : `${t.avg_ms}ms avg`;
                lines.push(` ${t.calls.toString().padStart(4)} ${t.tool_name.padEnd(30)} ${avgLabel}${errTag}`);
            }
            lines.push("");
        }
        // Error trend: errors in the previous hour vs. the most recent hour
        const errPrevHour = db.prepare(`SELECT COUNT(*) as cnt FROM tool_call_log WHERE result_status='error' AND created_at > datetime('now', '-2 hours') AND created_at <= datetime('now', '-1 hour')`).get();
        const direction = errors1h.cnt > errPrevHour.cnt ? `${R}increasing${X}` : errors1h.cnt < errPrevHour.cnt ? `${G}decreasing${X}` : `${G}stable${X}`;
        lines.push(`${C}Error Trend${X} ${direction} (${errPrevHour.cnt} prev hour → ${errors1h.cnt} this hour)`);
        // Active verification cycles
        const activeCycles = db.prepare(`SELECT COUNT(*) as cnt FROM verification_cycles WHERE status IN ('active', 'in_progress')`).get();
        if (activeCycles.cnt > 0) {
            lines.push(`${C}Active Cycles${X} ${Y}${activeCycles.cnt} verification cycle(s) in progress${X}`);
        }
    }
    catch (e) {
        // Any query failure (e.g. a missing table) is reported in the output
        // rather than aborting the command.
        lines.push(`${R}Error querying DB: ${e instanceof Error ? e.message : String(e)}${X}`);
    }
    finally {
        db.close();
    }
    console.log(lines.join("\n"));
    process.exit(0);
}
645
+ // ── Diagnose CLI handler (run-and-exit) ───────────────────────────────
646
if (diagnoseFlag) {
    // Diagnose CLI handler (run-and-exit): runs a fixed list of health checks
    // against the local SQLite database, applies automatic repairs where one
    // exists, prints a report, then exits the process.
    const os = await import("node:os");
    const path = await import("node:path");
    const fs = await import("node:fs");
    // ANSI escape codes are emitted only when stdout is a terminal.
    const USE_COLOR = process.stdout.isTTY;
    const B = USE_COLOR ? "\x1b[1m" : "";
    const G = USE_COLOR ? "\x1b[32m" : "";
    const Y = USE_COLOR ? "\x1b[33m" : "";
    const R = USE_COLOR ? "\x1b[31m" : "";
    const X = USE_COLOR ? "\x1b[0m" : "";
    const dir = path.join(os.homedir(), ".nodebench");
    const dbPath = path.join(dir, "nodebench.db");
    if (!fs.existsSync(dbPath)) {
        console.error("No database found. Run the MCP server first to initialize.");
        process.exit(1);
    }
    const Database = (await import("better-sqlite3")).default;
    // Opened read-write: the checks below UPDATE and DELETE rows.
    const db = new Database(dbPath);
    const lines = [];
    lines.push(`${B}NodeBench MCP — Diagnose & Heal${X}`);
    lines.push("");
    // issueCount and healedCount are compared in the summary, so every check
    // must add to both in the same unit (one per affected row, or one per issue).
    let issueCount = 0;
    let healedCount = 0;
    // 1. Orphaned verification cycles
    try {
        const orphanedCount = db.prepare(`SELECT COUNT(*) as cnt FROM verification_cycles WHERE status IN ('active', 'in_progress') AND created_at < datetime('now', '-48 hours')`).get().cnt;
        if (orphanedCount > 0) {
            lines.push(`${Y}DRIFT${X} ${orphanedCount} orphaned verification cycle(s) (>48h old)`);
            const result = db.prepare(`UPDATE verification_cycles SET status = 'abandoned', updated_at = datetime('now') WHERE status IN ('active', 'in_progress') AND created_at < datetime('now', '-48 hours')`).run();
            lines.push(` ${G}HEALED${X} Abandoned ${result.changes} cycles in batch`);
            healedCount += result.changes;
            issueCount += orphanedCount;
        }
        else {
            lines.push(`${G}OK${X} No orphaned verification cycles`);
        }
    }
    catch {
        lines.push(`${Y}SKIP${X} Could not check verification cycles`);
    }
    // 2. Stale eval runs
    try {
        const staleCount = db.prepare(`SELECT COUNT(*) as cnt FROM eval_runs WHERE status IN ('running', 'pending') AND created_at < datetime('now', '-24 hours')`).get().cnt;
        if (staleCount > 0) {
            lines.push(`${Y}DRIFT${X} ${staleCount} stale eval run(s) (>24h old)`);
            const result = db.prepare(`UPDATE eval_runs SET status = 'failed', completed_at = datetime('now') WHERE status IN ('running', 'pending') AND created_at < datetime('now', '-24 hours')`).run();
            lines.push(` ${G}HEALED${X} Marked ${result.changes} eval runs as failed`);
            healedCount += result.changes;
            issueCount += staleCount;
        }
        else {
            lines.push(`${G}OK${X} No stale eval runs`);
        }
    }
    catch {
        lines.push(`${Y}SKIP${X} Could not check eval runs`);
    }
    // 3. DB size
    const dbInfo = fs.statSync(dbPath);
    const dbSizeMb = dbInfo.size / (1024 * 1024);
    if (dbSizeMb > 500) {
        lines.push(`${Y}DRIFT${X} Database is ${dbSizeMb.toFixed(1)} MB`);
        issueCount++;
        // An oversized DB is ONE issue. It counts as healed at most once, even
        // when both the prune and the checkpoint succeed; otherwise the surplus
        // would hide unhealed issues from the "manual attention" total.
        let sizeHealed = false;
        try {
            // NOTE(review): this cutoff is an ISO-8601 string, while the other
            // queries on created_at in this handler compare against SQLite
            // datetime('now', ...) values — confirm the stored timestamp format.
            const cutoff = new Date(Date.now() - 90 * 24 * 3_600_000).toISOString();
            const deleted = db.prepare(`DELETE FROM tool_call_log WHERE created_at < ?`).run(cutoff);
            if (deleted.changes > 0) {
                lines.push(` ${G}HEALED${X} Pruned ${deleted.changes} tool_call_log entries older than 90 days`);
                sizeHealed = true;
            }
            db.pragma("wal_checkpoint(TRUNCATE)");
            lines.push(` ${G}HEALED${X} Ran WAL checkpoint`);
            sizeHealed = true;
        }
        catch (e) {
            // Best-effort repair: report the failure instead of hiding it.
            lines.push(` ${Y}SKIP${X} Could not prune database: ${e instanceof Error ? e.message : String(e)}`);
        }
        if (sizeHealed) {
            healedCount++;
        }
    }
    else {
        lines.push(`${G}OK${X} Database size: ${dbSizeMb.toFixed(1)} MB`);
    }
    // 4. Error rate (report-only; no automatic repair)
    try {
        const calls1h = db.prepare(`SELECT COUNT(*) as cnt FROM tool_call_log WHERE created_at > datetime('now', '-1 hour')`).get().cnt;
        const errors1h = db.prepare(`SELECT COUNT(*) as cnt FROM tool_call_log WHERE result_status='error' AND created_at > datetime('now', '-1 hour')`).get().cnt;
        const rate = calls1h > 0 ? (errors1h / calls1h * 100) : 0;
        // Alert only above 20% errors and with more than 5 calls in the window.
        if (rate > 20 && calls1h > 5) {
            lines.push(`${R}ALERT${X} Error rate ${rate.toFixed(1)}% in last hour (${errors1h}/${calls1h})`);
            issueCount++;
        }
        else {
            lines.push(`${G}OK${X} Error rate: ${rate.toFixed(1)}% (${errors1h}/${calls1h} in last hour)`);
        }
    }
    catch {
        lines.push(`${Y}SKIP${X} Could not check error rates`);
    }
    // 5. Embedding cache (report-only)
    const cachePath = path.join(dir, "embedding_cache.json");
    if (fs.existsSync(cachePath)) {
        // Age in whole hours since the cache file was last modified; 168h = 7 days.
        const cacheAge = Math.round((Date.now() - fs.statSync(cachePath).mtimeMs) / 3_600_000);
        if (cacheAge > 168) {
            lines.push(`${Y}DRIFT${X} Embedding cache is ${cacheAge}h old (>7 days) — will refresh on next server start`);
            issueCount++;
        }
        else {
            lines.push(`${G}OK${X} Embedding cache: ${cacheAge}h old`);
        }
    }
    else {
        lines.push(`${Y}INFO${X} No embedding cache found (will build on first server start)`);
    }
    // Summary
    lines.push("");
    if (issueCount === 0) {
        lines.push(`${G}${B}All clear${X} — no drift detected`);
    }
    else {
        lines.push(`${B}Found ${issueCount} issue(s), healed ${healedCount}${X}`);
        const remaining = issueCount - healedCount;
        if (remaining > 0) {
            lines.push(`${Y}${remaining} issue(s) require manual attention${X}`);
        }
    }
    db.close();
    console.log(lines.join("\n"));
    process.exit(0);
}
772
+ // ── Sync Configs CLI handler (run-and-exit) ─────────────────────────────
773
if (syncConfigsFlag) {
    // Sync Configs CLI handler (run-and-exit): writes a "nodebench-mcp" server
    // entry into the MCP config file of each supported IDE, prints what was
    // written, then exits the process.
    const os = await import("node:os");
    const path = await import("node:path");
    const fs = await import("node:fs");
    const { fileURLToPath } = await import("node:url");
    // ANSI escape codes are emitted only when stdout is a terminal.
    const USE_COLOR = process.stdout.isTTY;
    const B = USE_COLOR ? "\x1b[1m" : "";
    const C = USE_COLOR ? "\x1b[36m" : "";
    const G = USE_COLOR ? "\x1b[32m" : "";
    const Y = USE_COLOR ? "\x1b[33m" : "";
    const X = USE_COLOR ? "\x1b[0m" : "";
    // Detect the nodebench-mcp entry point path. fileURLToPath decodes
    // percent-escapes (e.g. "%20" for spaces) and handles Windows drive
    // letters, neither of which a raw URL pathname does.
    const entryPath = path.resolve(fileURLToPath(import.meta.url));
    // Build args array from current CLI flags (exclude --sync-configs and other run-and-exit flags)
    const forwardArgs = [];
    // Flags whose following token is their value and must be forwarded with them.
    const skipNext = new Set(["--preset", "--toolsets", "--exclude", "--engine-secret"]);
    const runAndExitFlags = new Set([
        "--sync-configs", "--health", "--status", "--diagnose", "--stats",
        "--export-stats", "--reset-stats", "--list-presets", "--smart-preset",
        "--auto-preset", "--help",
    ]);
    for (let i = 0; i < cliArgs.length; i++) {
        if (runAndExitFlags.has(cliArgs[i])) {
            continue;
        }
        if (cliArgs[i].startsWith("--explain")) {
            continue;
        }
        if (skipNext.has(cliArgs[i])) {
            forwardArgs.push(cliArgs[i], cliArgs[i + 1] ?? "");
            i++; // skip the value
            continue;
        }
        forwardArgs.push(cliArgs[i]);
    }
    // Collect env vars that are currently set
    const ENV_KEYS = [
        "ANTHROPIC_API_KEY", "OPENAI_API_KEY", "GEMINI_API_KEY",
        "GITHUB_TOKEN", "BROWSERBASE_API_KEY", "FIRECRAWL_API_KEY",
        "SMTP_HOST", "SMTP_PORT", "SMTP_USER", "SMTP_PASS",
        "IMAP_HOST", "IMAP_PORT", "IMAP_USER", "IMAP_PASS",
        "ENGINE_SECRET",
    ];
    const envObj = {};
    for (const key of ENV_KEYS) {
        if (process.env[key]) {
            envObj[key] = process.env[key];
        }
    }
    // Build the MCP server config entry
    const nodePath = process.execPath; // path to node binary
    const serverEntry = {
        command: nodePath,
        args: [entryPath, ...forwardArgs],
        ...(Object.keys(envObj).length > 0 ? { env: envObj } : {}),
    };
    const isPlainObject = (value) => value !== null && typeof value === "object" && !Array.isArray(value);
    // Helper: merge serverEntry into an existing config file, preserving other
    // servers. Returns { action: "created" | "updated", path }. Throws on
    // filesystem errors; callers record those as failures.
    const mergeConfig = (filePath, serverKey) => {
        let existing = {};
        if (fs.existsSync(filePath)) {
            let parsed;
            try {
                parsed = JSON.parse(fs.readFileSync(filePath, "utf-8"));
            }
            catch {
                parsed = undefined;
            }
            if (isPlainObject(parsed)) {
                existing = parsed;
            }
            else {
                // Unreadable, invalid JSON, or not a JSON object (null, array,
                // primitive): back the file up and start fresh.
                fs.copyFileSync(filePath, filePath + ".bak");
            }
        }
        // Ensure mcpServers is an object map (an array would drop the new key on stringify)
        if (!isPlainObject(existing.mcpServers)) {
            existing.mcpServers = {};
        }
        const servers = existing.mcpServers;
        const hadExisting = !!servers[serverKey];
        servers[serverKey] = serverEntry;
        // Ensure parent directory exists
        const parentDir = path.dirname(filePath);
        if (!fs.existsSync(parentDir)) {
            fs.mkdirSync(parentDir, { recursive: true });
        }
        fs.writeFileSync(filePath, JSON.stringify(existing, null, 2) + "\n", "utf-8");
        return { action: hadExisting ? "updated" : "created", path: filePath };
    };
    const lines = [];
    lines.push(`${B}NodeBench MCP — Sync IDE Configs${X}`);
    lines.push("");
    const results = [];
    // 1. Claude Code: ~/.claude/claude_desktop_config.json
    try {
        const claudeConfigPath = path.join(os.homedir(), ".claude", "claude_desktop_config.json");
        const r = mergeConfig(claudeConfigPath, "nodebench-mcp");
        results.push({ name: "Claude Code", ...r });
    }
    catch (e) {
        results.push({ name: "Claude Code", action: "failed", path: "", error: e.message });
    }
    // 2. Cursor: <project>/.cursor/mcp.json
    try {
        const cursorConfigPath = path.join(process.cwd(), ".cursor", "mcp.json");
        const r = mergeConfig(cursorConfigPath, "nodebench-mcp");
        results.push({ name: "Cursor", ...r });
    }
    catch (e) {
        results.push({ name: "Cursor", action: "failed", path: "", error: e.message });
    }
    // 3. Windsurf: <project>/.windsurf/mcp.json
    try {
        const windsurfConfigPath = path.join(process.cwd(), ".windsurf", "mcp.json");
        const r = mergeConfig(windsurfConfigPath, "nodebench-mcp");
        results.push({ name: "Windsurf", ...r });
    }
    catch (e) {
        results.push({ name: "Windsurf", action: "failed", path: "", error: e.message });
    }
    // Print results
    for (const r of results) {
        if (r.action === "failed") {
            lines.push(`${Y}FAIL${X} ${r.name}: ${r.error}`);
        }
        else {
            const icon = r.action === "created" ? `${G}NEW${X} ` : `${G}UPD${X} `;
            lines.push(`${icon} ${r.name}: ${r.path}`);
        }
    }
    // Print config summary. The value following --engine-secret is masked so the
    // secret is not echoed to the terminal; env values are likewise never
    // printed, only their key names.
    const printableArgs = [entryPath, ...forwardArgs].map((arg, i, all) => (all[i - 1] === "--engine-secret" ? "***" : arg));
    lines.push("");
    lines.push(`${C}Config entry:${X}`);
    lines.push(` command: ${nodePath}`);
    lines.push(` args: [${printableArgs.map(a => `"${a}"`).join(", ")}]`);
    if (Object.keys(envObj).length > 0) {
        lines.push(` env: ${Object.keys(envObj).join(", ")}`);
    }
    else {
        lines.push(` env: ${Y}(none set)${X}`);
    }
    lines.push("");
    const successCount = results.filter(r => r.action !== "failed").length;
    lines.push(`${B}Written to ${successCount}/${results.length} locations${X}`);
    console.log(lines.join("\n"));
    process.exit(0);
}
913
+ // ── CLI subcommand detection ──────────────────────────────────────────
914
+ // First positional arg (not starting with --) is a subcommand
915
// The subcommand is the first token that is not itself a `--flag` and does not
// equal any token that directly follows a `--flag` (such tokens are treated as
// that flag's value). It is undefined when no such token exists.
const subCmd = cliArgs.find((candidate, _index, allArgs) => {
    if (candidate.startsWith("--")) {
        return false;
    }
    const followsSomeFlag = allArgs.some((token, i) => token.startsWith("--") && allArgs[i + 1] === candidate);
    return !followsSomeFlag;
});
916
+ // ── Welcome screen (no arguments at all) ─────────────────────────────
917
if (cliArgs.length === 0 || (subCmd === undefined && !cliArgs.includes("--stdio") && cliArgs.every(a => !a.startsWith("--")))) {
    // Welcome screen (run-and-exit): shown when the CLI is invoked without a
    // subcommand or any `--` flag. Prints a quick-start cheat sheet and exits.
    const paint = (code) => (process.stdout.isTTY ? code : "");
    const bold = paint("\x1b[1m");
    const cyan = paint("\x1b[36m");
    const green = paint("\x1b[32m");
    const dim = paint("\x1b[2m");
    const yellow = paint("\x1b[33m");
    const reset = paint("\x1b[0m");
    // Tool total = sum of every toolset's length plus a fixed 12.
    let totalTools = 12;
    for (const toolset of Object.values(TOOLSET_MAP)) {
        totalTools += toolset.length;
    }
    const domainCount = Object.keys(TOOLSET_MAP).length;
    const prompt = `${green}$${reset}`;
    console.log([
        "",
        ` ${bold}NodeBench AI${reset} ${dim}— The trust layer for agents${reset}`,
        "",
        ` ${cyan}Quick start${reset}`,
        ` ${prompt} npx nodebench-mcp discover ${dim}Show available tools${reset}`,
        ` ${prompt} npx nodebench-mcp demo ${dim}Run a live demo (no keys needed)${reset}`,
        ` ${prompt} npx nodebench-mcp quickref research ${dim}Get research workflow guide${reset}`,
        ` ${prompt} npx nodebench-mcp --explain run_recon ${dim}Deep-dive on any tool${reset}`,
        "",
        ` ${cyan}Connect to your IDE${reset}`,
        ` ${prompt} claude mcp add nodebench -- npx nodebench-mcp --stdio`,
        ` ${prompt} npx nodebench-mcp --sync-configs ${dim}Auto-write to Claude/Cursor/Windsurf${reset}`,
        "",
        ` ${cyan}Start the MCP server${reset}`,
        ` ${prompt} npx nodebench-mcp --stdio ${dim}Default preset${reset}`,
        ` ${prompt} npx nodebench-mcp --preset research ${dim}Research workflows${reset}`,
        ` ${prompt} npx nodebench-mcp --auto-preset ${dim}Detect from your project${reset}`,
        "",
        ` ${yellow}${totalTools} tools${reset} ${dim}·${reset} ${yellow}${domainCount} domains${reset} ${dim}· Progressive discovery · Agent-as-a-Graph${reset}`,
        "",
    ].join("\n"));
    process.exit(0);
}
952
+ // ── Demo subcommand (run-and-exit) ───────────────────────────────────
953
if (subCmd === "demo") {
    // Demo subcommand (run-and-exit): prints a tool search for "research", the
    // "new_feature" workflow chain, and registry totals, then exits.
    const paint = (code) => (process.stdout.isTTY ? code : "");
    const bold = paint("\x1b[1m");
    const cyan = paint("\x1b[36m");
    const green = paint("\x1b[32m");
    const dim = paint("\x1b[2m");
    const yellow = paint("\x1b[33m");
    const reset = paint("\x1b[0m");
    const out = [
        "",
        ` ${bold}NodeBench AI — Live Demo${reset}`,
        ` ${dim}No API keys needed. Everything runs locally.${reset}`,
        "",
        // 1. Show research tools via hybridSearch
        ` ${cyan}1. Discovering research tools...${reset}`,
        "",
    ];
    // The search runs over name/category stubs built from the registry.
    const searchable = ALL_REGISTRY_ENTRIES.map(e => ({ name: e.name, description: e.category }));
    const hits = hybridSearch("research", searchable, { limit: 5, mode: "hybrid" });
    for (const hit of hits.slice(0, 5)) {
        const registryEntry = TOOL_REGISTRY.get(hit.name);
        const phase = registryEntry?.phase ?? "";
        out.push(` ${green}>${reset} ${bold}${hit.name}${reset} ${dim}(${phase})${reset}`);
        if (registryEntry?.quickRef?.nextAction) {
            out.push(` ${registryEntry.quickRef.nextAction.slice(0, 80)}`);
        }
    }
    out.push("");
    // 2. Show a workflow chain
    out.push(` ${cyan}2. Workflow chain: "Build a New Feature"${reset}`);
    out.push("");
    const featureChain = WORKFLOW_CHAINS["new_feature"];
    if (featureChain) {
        out.push(` ${bold}${featureChain.name}${reset} ${dim}— ${featureChain.description}${reset}`);
        out.push("");
        // At most the first 8 steps are listed; the remainder is summarized.
        const shownSteps = Math.min(featureChain.steps.length, 8);
        for (let idx = 0; idx < shownSteps; idx += 1) {
            const { tool, action } = featureChain.steps[idx];
            const label = String(idx + 1).padStart(2, " ");
            out.push(` ${yellow}${label}.${reset} ${tool} ${dim}→ ${action}${reset}`);
        }
        if (featureChain.steps.length > 8) {
            out.push(` ${dim} ... +${featureChain.steps.length - 8} more steps${reset}`);
        }
    }
    out.push("");
    // 3. Summary stats (tool total = sum of toolset lengths plus a fixed 12)
    let totalTools = 12;
    for (const toolset of Object.values(TOOLSET_MAP)) {
        totalTools += toolset.length;
    }
    const domainCount = Object.keys(TOOLSET_MAP).length;
    const chainCount = Object.keys(WORKFLOW_CHAINS).length;
    const prompt = `${green}$${reset}`;
    out.push(
        ` ${cyan}3. What's available${reset}`,
        "",
        ` ${yellow}${totalTools}${reset} tools across ${yellow}${domainCount}${reset} domains`,
        ` ${yellow}${chainCount}${reset} pre-built workflow chains`,
        ` ${yellow}${ALL_REGISTRY_ENTRIES.length}${reset} entries in the tool registry`,
        "",
        // 4. Next steps
        ` ${cyan}Next steps${reset}`,
        ` ${prompt} npx nodebench-mcp --explain run_recon ${dim}Deep-dive on any tool${reset}`,
        ` ${prompt} npx nodebench-mcp --health ${dim}Check your environment${reset}`,
        ` ${prompt} npx nodebench-mcp --sync-configs ${dim}Wire into your IDE${reset}`,
        "",
    );
    console.log(out.join("\n"));
    process.exit(0);
}
1016
+ // ── Discover subcommand (run-and-exit) ───────────────────────────────
1017
if (subCmd === "discover") {
    // Discover subcommand (run-and-exit): with a query, prints up to 10 hybrid
    // search hits; without one, lists every toolset domain and its tool count.
    const paint = (code) => (process.stdout.isTTY ? code : "");
    const bold = paint("\x1b[1m");
    const cyan = paint("\x1b[36m");
    const green = paint("\x1b[32m");
    const dim = paint("\x1b[2m");
    const yellow = paint("\x1b[33m");
    const reset = paint("\x1b[0m");
    // The query is the first token that is neither "discover" nor a `--` flag.
    const query = cliArgs.find(a => a !== "discover" && !a.startsWith("--")) ?? "";
    const limit = 10;
    const out = [""];
    if (!query) {
        out.push(` ${bold}Tool domains${reset} ${dim}(${Object.keys(TOOLSET_MAP).length} domains)${reset}`);
        out.push("");
        Object.entries(TOOLSET_MAP).forEach(([domain, tools]) => {
            const count = String(tools.length).padStart(3);
            out.push(` ${green}>${reset} ${domain.padEnd(24)} ${yellow}${count}${reset} tools`);
        });
        out.push("");
        out.push(` ${dim}Search: npx nodebench-mcp discover <query>${reset}`);
    }
    else {
        out.push(` ${bold}Discovering tools for:${reset} ${cyan}${query}${reset}`);
        // The search runs over name/category stubs built from the registry.
        const searchable = ALL_REGISTRY_ENTRIES.map(e => ({ name: e.name, description: e.category }));
        const hits = hybridSearch(query, searchable, { limit, mode: "hybrid" });
        out.push("");
        for (const hit of hits) {
            const registryEntry = TOOL_REGISTRY.get(hit.name);
            out.push(` ${green}>${reset} ${bold}${hit.name}${reset} ${dim}score: ${hit.score.toFixed(2)}${reset}`);
            if (registryEntry) {
                out.push(` ${dim}${registryEntry.category} · ${registryEntry.phase}${reset}`);
                if (registryEntry.quickRef?.nextAction) {
                    out.push(` ${registryEntry.quickRef.nextAction.slice(0, 90)}`);
                }
            }
            out.push("");
        }
        if (hits.length === 0) {
            out.push(` ${yellow}No results.${reset} Try a broader query.\n`);
        }
    }
    out.push("");
    console.log(out.join("\n"));
    process.exit(0);
}
1060
+ // ── Quickref subcommand (run-and-exit) ───────────────────────────────
1061
if (subCmd === "quickref") {
    // Quickref subcommand (run-and-exit): prints the steps of a named workflow
    // chain or the quick reference of a registered tool. Without a name it
    // prints usage plus the first 10 workflow keys.
    const USE_COLOR = process.stdout.isTTY;
    const B = USE_COLOR ? "\x1b[1m" : "";
    const C = USE_COLOR ? "\x1b[36m" : "";
    const G = USE_COLOR ? "\x1b[32m" : "";
    const D = USE_COLOR ? "\x1b[2m" : "";
    const Y = USE_COLOR ? "\x1b[33m" : "";
    const X = USE_COLOR ? "\x1b[0m" : "";
    // The lookup name is the first token that is neither "quickref" nor a `--` flag.
    const toolName = cliArgs.find(a => a !== "quickref" && !a.startsWith("--")) ?? "";
    const lines = [];
    lines.push("");
    if (!toolName) {
        const MAX_LISTED = 10;
        const workflowEntries = Object.entries(WORKFLOW_CHAINS);
        lines.push(` ${B}Usage:${X} npx nodebench-mcp quickref <tool_or_workflow>`);
        lines.push("");
        lines.push(` ${C}Workflows${X}`);
        for (const [key, chain] of workflowEntries.slice(0, MAX_LISTED)) {
            lines.push(` ${G}>${X} ${key.padEnd(28)} ${D}${chain.name}${X}`);
        }
        // Only mention a remainder when some workflows were actually left out;
        // otherwise the count would be zero or negative.
        if (workflowEntries.length > MAX_LISTED) {
            lines.push(` ${D} ... +${workflowEntries.length - MAX_LISTED} more${X}`);
        }
        lines.push("");
    }
    else {
        // Try workflow first. Own-property check so that names such as
        // "constructor" or "toString" do not resolve to inherited members.
        const chain = Object.hasOwn(WORKFLOW_CHAINS, toolName) ? WORKFLOW_CHAINS[toolName] : undefined;
        if (chain) {
            lines.push(` ${B}${chain.name}${X} ${D}(${toolName})${X}`);
            lines.push(` ${chain.description}`);
            lines.push("");
            for (let i = 0; i < chain.steps.length; i++) {
                const step = chain.steps[i];
                lines.push(` ${Y}${String(i + 1).padStart(2)}.${X} ${step.tool} ${D}→ ${step.action}${X}`);
            }
            lines.push("");
        }
        else {
            // Try tool registry
            const entry = TOOL_REGISTRY.get(toolName);
            if (entry) {
                // quickRef is read defensively, matching the optional access
                // used by the demo and discover handlers.
                const quickRef = entry.quickRef;
                const nextTools = quickRef?.nextTools ?? [];
                lines.push(` ${B}${entry.name}${X} ${D}(${entry.category}, ${entry.phase})${X}`);
                if (quickRef?.nextAction) {
                    lines.push(` ${quickRef.nextAction}`);
                }
                if (quickRef?.tip) {
                    lines.push(` ${Y}Tip:${X} ${quickRef.tip}`);
                }
                if (nextTools.length > 0) {
                    lines.push("");
                    lines.push(` ${C}Next tools${X}`);
                    for (const nt of nextTools) {
                        lines.push(` ${G}>${X} ${nt}`);
                    }
                }
                lines.push("");
            }
            else {
                lines.push(` ${Y}Not found:${X} ${toolName}`);
                lines.push(` ${D}Try: npx nodebench-mcp quickref new_feature${X}`);
                lines.push("");
            }
        }
    }
    console.log(lines.join("\n"));
    process.exit(0);
}
1121
+ // ── Call subcommand (run-and-exit) ───────────────────────────────────
1122
if (subCmd === "call") {
    // Call subcommand (run-and-exit): invokes one tool handler by name with
    // optional JSON arguments and prints its result. Exits 0 on success or
    // when usage is shown; exits 1 for an unknown tool, invalid JSON, or a
    // handler error.
    // The tool name is the first token that is not "call", a `--` flag, or a
    // JSON object; the arguments are the first token starting with "{".
    const toolName = cliArgs.find(a => a !== "call" && !a.startsWith("--") && !a.startsWith("{"));
    const argsJson = cliArgs.find(a => a.startsWith("{")) ?? "{}";
    const USE_COLOR = process.stdout.isTTY;
    const B = USE_COLOR ? "\x1b[1m" : "";
    const G = USE_COLOR ? "\x1b[32m" : "";
    const R = USE_COLOR ? "\x1b[31m" : "";
    const D = USE_COLOR ? "\x1b[2m" : "";
    const X = USE_COLOR ? "\x1b[0m" : "";
    if (!toolName) {
        console.log(`\n ${B}Usage:${X} npx nodebench-mcp call <tool_name> [json_args]\n`);
        console.log(` ${D}Example:${X} npx nodebench-mcp call founder_deep_context_gather '{"packetType":"weekly_reset"}'`);
        console.log(` ${D}Example:${X} npx nodebench-mcp call discover_tools '{"query":"founder"}'`);
        console.log(` ${D}Example:${X} npx nodebench-mcp call save_session_note '{"note":"test"}'\n`);
        process.exit(0);
    }
    // Find tool in all toolsets — meta/discovery tools are created later,
    // so for CLI call we build them inline
    const cliDomainTools = Object.values(TOOLSET_MAP).flat();
    const cliMetaTools = createMetaTools(cliDomainTools);
    const cliDiscoveryTools = createProgressiveDiscoveryTools(cliDomainTools);
    const allCallable = [...cliDomainTools, ...cliMetaTools, ...cliDiscoveryTools];
    const tool = allCallable.find(t => t.name === toolName);
    if (!tool) {
        console.log(`\n ${R}Tool not found:${X} ${toolName}`);
        console.log(` ${D}Run: npx nodebench-mcp discover ${toolName}${X}\n`);
        process.exit(1);
    }
    let parsedArgs;
    try {
        parsedArgs = JSON.parse(argsJson);
    }
    catch {
        console.log(`\n ${R}Invalid JSON args:${X} ${argsJson}\n`);
        process.exit(1);
    }
    console.log(`\n ${D}Calling${X} ${B}${toolName}${X} ${D}...${X}`);
    try {
        const result = await tool.handler(parsedArgs);
        // JSON.stringify returns undefined (not a string) for values such as
        // undefined or a function. Fall back to String() so that a handler
        // with no return value is not reported as a failed call.
        const output = typeof result === "string" ? result : (JSON.stringify(result, null, 2) ?? String(result));
        console.log(`\n ${G}Result:${X}\n`);
        // Pretty-print, indent 4 spaces
        for (const line of output.split("\n")) {
            console.log(` ${line}`);
        }
        console.log("");
    }
    catch (err) {
        const msg = err instanceof Error ? err.message : String(err);
        console.log(`\n ${R}Error:${X} ${msg}\n`);
        process.exit(1);
    }
    process.exit(0);
}
1176
+ // ── Setup subcommand (run-and-exit) ──────────────────────────────────
1177
if (subCmd === "setup") {
    // Setup subcommand (run-and-exit): prints static copy-paste setup
    // instructions for Claude Code, Cursor and Windsurf, then exits.
    const paint = (code) => (process.stdout.isTTY ? code : "");
    const bold = paint("\x1b[1m");
    const cyan = paint("\x1b[36m");
    const green = paint("\x1b[32m");
    const dim = paint("\x1b[2m");
    const yellow = paint("\x1b[33m");
    const reset = paint("\x1b[0m");
    const guide = [
        "",
        ` ${bold}NodeBench MCP — Quick Setup${reset}`,
        "",
        ` ${green}1.${reset} ${bold}Claude Code${reset}`,
        ` claude mcp add nodebench -- npx -y nodebench-mcp`,
        "",
        ` ${green}2.${reset} ${bold}Cursor${reset} ${dim}(.cursor/mcp.json)${reset}`,
        ` { "mcpServers": { "nodebench": { "command": "npx", "args": ["-y", "nodebench-mcp"] } } }`,
        "",
        ` ${green}3.${reset} ${bold}Windsurf${reset} ${dim}(.windsurf/mcp.json)${reset}`,
        ` { "mcpServers": { "nodebench": { "command": "npx", "args": ["-y", "nodebench-mcp"] } } }`,
        "",
        ` ${cyan}Verify:${reset} npx nodebench-mcp call discover_tools '{"query":"founder"}'`,
        ` ${cyan}Dashboard:${reset} https://www.nodebenchai.com/founder`,
        ` ${cyan}Agent setup:${reset} https://www.nodebenchai.com/agent-setup.txt`,
        "",
        ` ${yellow}Presets:${reset} --preset default (99 tools) | --preset full (313 tools)`,
        ` ${yellow}Founder tools:${reset} founder_deep_context_gather, founder_packet_validate, founder_packet_diff`,
        "",
    ];
    console.log(guide.join("\n"));
    process.exit(0);
}
200
1208
  // Initialize DB (creates ~/.nodebench/ and schema on first run)
201
1209
  getDb();
202
1210
  // Wire up DB accessor for execution trace edges (avoids circular import)
@@ -591,42 +1599,42 @@ const dynamicLoadingTools = [
591
1599
  const db = getDb();
592
1600
  const detailed = args.detailed === true;
593
1601
  // Session-level aggregates by mode
594
- const sessionSummary = db.prepare(`
595
- SELECT
596
- mode,
597
- COUNT(*) as sessions,
598
- ROUND(AVG(initial_tool_count), 1) as avg_initial_tools,
599
- ROUND(AVG(COALESCE(final_tool_count, initial_tool_count)), 1) as avg_final_tools,
600
- ROUND(AVG(COALESCE(total_tool_calls, 0)), 1) as avg_tool_calls,
601
- ROUND(AVG(COALESCE(total_load_events, 0)), 1) as avg_load_events,
602
- ROUND(AVG(COALESCE(session_duration_ms, 0)) / 1000.0, 1) as avg_duration_sec,
603
- SUM(COALESCE(total_tool_calls, 0)) as total_calls,
604
- SUM(COALESCE(total_load_events, 0)) as total_loads
605
- FROM ab_test_sessions
606
- GROUP BY mode
1602
+ const sessionSummary = db.prepare(`
1603
+ SELECT
1604
+ mode,
1605
+ COUNT(*) as sessions,
1606
+ ROUND(AVG(initial_tool_count), 1) as avg_initial_tools,
1607
+ ROUND(AVG(COALESCE(final_tool_count, initial_tool_count)), 1) as avg_final_tools,
1608
+ ROUND(AVG(COALESCE(total_tool_calls, 0)), 1) as avg_tool_calls,
1609
+ ROUND(AVG(COALESCE(total_load_events, 0)), 1) as avg_load_events,
1610
+ ROUND(AVG(COALESCE(session_duration_ms, 0)) / 1000.0, 1) as avg_duration_sec,
1611
+ SUM(COALESCE(total_tool_calls, 0)) as total_calls,
1612
+ SUM(COALESCE(total_load_events, 0)) as total_loads
1613
+ FROM ab_test_sessions
1614
+ GROUP BY mode
607
1615
  `).all();
608
1616
  // Error rate by mode (join with tool_call_log)
609
- const errorRates = db.prepare(`
610
- SELECT
611
- s.mode,
612
- COUNT(CASE WHEN t.result_status = 'error' THEN 1 END) as errors,
613
- COUNT(*) as total_calls,
614
- ROUND(100.0 * COUNT(CASE WHEN t.result_status = 'error' THEN 1 END) / MAX(COUNT(*), 1), 2) as error_pct
615
- FROM tool_call_log t
616
- JOIN ab_test_sessions s ON t.session_id = s.id
617
- GROUP BY s.mode
1617
+ const errorRates = db.prepare(`
1618
+ SELECT
1619
+ s.mode,
1620
+ COUNT(CASE WHEN t.result_status = 'error' THEN 1 END) as errors,
1621
+ COUNT(*) as total_calls,
1622
+ ROUND(100.0 * COUNT(CASE WHEN t.result_status = 'error' THEN 1 END) / MAX(COUNT(*), 1), 2) as error_pct
1623
+ FROM tool_call_log t
1624
+ JOIN ab_test_sessions s ON t.session_id = s.id
1625
+ GROUP BY s.mode
618
1626
  `).all();
619
1627
  // Top loaded toolsets (dynamic mode)
620
- const topToolsets = db.prepare(`
621
- SELECT
622
- toolset_name,
623
- COUNT(*) as load_count,
624
- ROUND(AVG(latency_ms), 1) as avg_latency_ms
625
- FROM ab_tool_events
626
- WHERE event_type = 'load'
627
- GROUP BY toolset_name
628
- ORDER BY load_count DESC
629
- LIMIT 10
1628
+ const topToolsets = db.prepare(`
1629
+ SELECT
1630
+ toolset_name,
1631
+ COUNT(*) as load_count,
1632
+ ROUND(AVG(latency_ms), 1) as avg_latency_ms
1633
+ FROM ab_tool_events
1634
+ WHERE event_type = 'load'
1635
+ GROUP BY toolset_name
1636
+ ORDER BY load_count DESC
1637
+ LIMIT 10
630
1638
  `).all();
631
1639
  // Current session info
632
1640
  const currentSession = {
@@ -642,13 +1650,13 @@ const dynamicLoadingTools = [
642
1650
  // Optional per-session detail
643
1651
  let sessions = [];
644
1652
  if (detailed) {
645
- sessions = db.prepare(`
646
- SELECT id, mode, initial_preset, initial_tool_count, final_tool_count,
647
- toolsets_loaded, total_tool_calls, total_load_events,
648
- session_duration_ms, created_at, ended_at
649
- FROM ab_test_sessions
650
- ORDER BY created_at DESC
651
- LIMIT 50
1653
+ sessions = db.prepare(`
1654
+ SELECT id, mode, initial_preset, initial_tool_count, final_tool_count,
1655
+ toolsets_loaded, total_tool_calls, total_load_events,
1656
+ session_duration_ms, created_at, ended_at
1657
+ FROM ab_test_sessions
1658
+ ORDER BY created_at DESC
1659
+ LIMIT 50
652
1660
  `).all();
653
1661
  }
654
1662
  // Build verdict
@@ -751,6 +1759,54 @@ const _hookState = {
751
1759
  lastRefreshReminder: 0, // totalCalls at last reminder
752
1760
  };
753
1761
  const WEB_TOOL_NAMES = new Set(["web_search", "fetch_url"]);
1762
+ // ── Intent-based auto-expansion ─────────────────────────────────────────
1763
+ // On the first tool call, classify intent from tool name + args keywords
1764
+ // and auto-load relevant toolsets if running on the default preset.
1765
+ // Zero-latency: pure keyword matching, no LLM calls. Runs once per session.
1766
+ let _intentClassified = false;
1767
/**
 * Keyword → toolset routing table used for intent-based auto-expansion.
 *
 * Each entry pairs a case-insensitive regex with the toolsets to load when the
 * regex matches the haystack built from the tool name and its arguments.
 * Entries are independent: several may match the same haystack, and the
 * consumer is expected to de-duplicate the resulting toolset names.
 *
 * The two-letter tokens `ci` and `cd` are guarded by alphanumeric lookarounds.
 * As bare substrings they match inside ordinary words ("specific", "decision",
 * "abcd"), which would load the deploy toolsets for unrelated calls.
 * Underscores, slashes, spaces and string edges still count as boundaries, so
 * `run_ci`, `CI/CD` and `cd` on its own keep matching.
 */
const INTENT_PATTERNS = [
    { pattern: /web|css|html|dom|seo|browser|page|viewport|screenshot|ui_capture|ui_ux/i, toolsets: ["ui_capture", "vision", "web", "seo", "git_workflow", "architect"] },
    { pattern: /research|paper|arxiv|scholar|literature|digest|brief|rss|feed/i, toolsets: ["web", "llm", "rss", "email", "docs"] },
    { pattern: /data|csv|sql|pandas|xlsx|json_parse|spreadsheet|parquet|parse/i, toolsets: ["local_file", "llm", "web"] },
    { pattern: /deploy|docker|k8s|kubernetes|(?<![a-z0-9])(?:ci|cd)(?![a-z0-9])|pipeline|terraform|helm|infra/i, toolsets: ["git_workflow", "session_memory", "benchmark", "pattern"] },
    { pattern: /agent|swarm|orchestr|parallel|multi.?agent|spawn|coordinat/i, toolsets: ["parallel", "self_eval", "session_memory", "pattern", "toon"] },
    { pattern: /mobile|ios|android|react.?native|flutter|swift|kotlin/i, toolsets: ["ui_capture", "vision", "flicker_detection"] },
    { pattern: /academic|thesis|review|cite|biblio|latex|peer/i, toolsets: ["research_writing", "llm", "web", "local_file"] },
    { pattern: /content|publish|post|newsletter|email|campaign|linkedin/i, toolsets: ["llm", "critter", "email", "rss", "platform", "architect"] },
];
1777
/**
 * Matches a tool call against INTENT_PATTERNS and activates every matching
 * toolset that is known to TOOLSET_MAP and not already active.
 *
 * @param {string} toolName - Name of the tool being invoked.
 * @param {object} [args] - Tool arguments; keys and top-level string values
 *   contribute to the text that is matched. Non-string values add only their key.
 * @returns {string[] | null} Names of the toolsets that were newly activated,
 *   or null when the current preset is not "default" or nothing new matched.
 */
function classifyAndExpand(toolName, args) {
    // An explicitly chosen preset is respected; only "default" auto-expands.
    if (currentPreset !== "default") {
        return null;
    }
    // Searchable text: tool name, then "key value" for each argument entry.
    let argStr = "";
    if (args) {
        argStr = Object.entries(args)
            .map(([key, value]) => `${key} ${typeof value === "string" ? value : ""}`)
            .join(" ");
    }
    const haystack = `${toolName} ${argStr}`;
    // The Set keeps first-seen order and drops toolsets named by several patterns.
    const toLoad = new Set(
        INTENT_PATTERNS
            .filter(({ pattern }) => pattern.test(haystack))
            .flatMap(({ toolsets }) => toolsets)
            .filter((name) => TOOLSET_MAP[name] && !activeToolsets.has(name))
    );
    if (toLoad.size === 0) {
        return null;
    }
    toLoad.forEach((name) => {
        activeToolsets.add(name);
    });
    // Recompute the derived tool arrays from the enlarged active set.
    domainTools = [...activeToolsets].flatMap((key) => TOOLSET_MAP[key] ?? []);
    const metaTools = createMetaTools(domainTools);
    allToolsWithoutDiscovery = [...domainTools, ...metaTools];
    rebuildAllTools();
    // Best-effort notification: a rejected send is deliberately ignored.
    server.notification({ method: "notifications/tools/list_changed" }).catch(() => { });
    return [...toLoad];
}
754
1810
  const SAVE_TOOL_NAMES = new Set(["save_session_note", "record_learning"]);
755
1811
  const REFRESH_INTERVAL = 30; // remind after every 30 calls
756
1812
  function getHookHint(toolName) {
@@ -784,42 +1840,93 @@ const PROMPTS = [
784
1840
  role: "user",
785
1841
  content: {
786
1842
  type: "text",
787
- text: `You are connected to NodeBench MCP — tools that make you catch the bugs you'd normally ship.
788
-
789
- WHAT THIS DOES:
790
- In benchmarks across 9 real production prompts, agents with NodeBench MCP caught 13 issues (4 HIGH severity)
791
- that bare agents shipped to production. 26 blind spots prevented. Knowledge compounds — by task 9,
792
- the agent finds 2+ prior findings before writing a single line of code.
793
-
794
- HOW IT WORKS:
795
- Every task follows a pipeline: Research → Risk → Implement → Test (3 layers) → Eval → Gate → Learn → Ship.
796
- Each step produces a concrete artifact (an issue found, a regression guarded, a pattern banked) that
797
- compounds into future tasks.
798
-
799
- FIRST TIME? Run these 3 steps:
800
- 1. Call bootstrap_project to register your project (tech stack, architecture, conventions)
801
- 2. Call getMethodology("overview") to see all available methodologies
802
- 3. Call search_all_knowledge("your current task") before starting any work
803
-
804
- RETURNING? Your project context and all past learnings are persisted. Start with:
805
- 1. Call search_all_knowledge with your current task
806
- 2. Follow the methodology tools as you work — they'll guide you step by step
807
-
808
- KEY TOOLS:
809
- - search_all_knowledge — Search prior findings before starting (avoid repeating past mistakes)
810
- - run_mandatory_flywheel — 6-step minimum verification before declaring work done
811
- - getMethodology — Step-by-step guides for verification, eval, flywheel, recon
812
- - findTools — Discover tools by keyword or category
813
- - assess_risk — Assess risk before acting (HIGH = needs confirmation)
814
-
815
- PARALLEL AGENTS? If using Claude Code subagents or multiple terminals:
816
- - claim_agent_task / release_agent_task — Lock tasks to prevent duplicate work
817
- - get_parallel_status — See what all agents are doing
1843
+ text: `You are connected to NodeBench MCP — tools that make you catch the bugs you'd normally ship.
1844
+
1845
+ WHAT THIS DOES:
1846
+ In benchmarks across 9 real production prompts, agents with NodeBench MCP caught 13 issues (4 HIGH severity)
1847
+ that bare agents shipped to production. 26 blind spots prevented. Knowledge compounds — by task 9,
1848
+ the agent finds 2+ prior findings before writing a single line of code.
1849
+
1850
+ HOW IT WORKS:
1851
+ Every task follows a pipeline: Research → Risk → Implement → Test (3 layers) → Eval → Gate → Learn → Ship.
1852
+ Each step produces a concrete artifact (an issue found, a regression guarded, a pattern banked) that
1853
+ compounds into future tasks.
1854
+
1855
+ FIRST TIME? Run these 3 steps:
1856
+ 1. Call bootstrap_project to register your project (tech stack, architecture, conventions)
1857
+ 2. Call getMethodology("overview") to see all available methodologies
1858
+ 3. Call search_all_knowledge("your current task") before starting any work
1859
+
1860
+ RETURNING? Your project context and all past learnings are persisted. Start with:
1861
+ 1. Call search_all_knowledge with your current task
1862
+ 2. Follow the methodology tools as you work — they'll guide you step by step
1863
+
1864
+ KEY TOOLS:
1865
+ - search_all_knowledge — Search prior findings before starting (avoid repeating past mistakes)
1866
+ - run_mandatory_flywheel — 6-step minimum verification before declaring work done
1867
+ - getMethodology — Step-by-step guides for verification, eval, flywheel, recon
1868
+ - findTools — Discover tools by keyword or category
1869
+ - assess_risk — Assess risk before acting (HIGH = needs confirmation)
1870
+
1871
+ PARALLEL AGENTS? If using Claude Code subagents or multiple terminals:
1872
+ - claim_agent_task / release_agent_task — Lock tasks to prevent duplicate work
1873
+ - get_parallel_status — See what all agents are doing
818
1874
  - Use the "claude-code-parallel" prompt for step-by-step guidance`,
819
1875
  },
820
1876
  },
821
1877
  ],
822
1878
  },
1879
+ {
1880
+ name: "execution-trace-workflow",
1881
+ description: "Start and maintain a traceable execution run. Use this for any workflow that needs receipts, evidence, decisions, verification, approvals, and a durable audit trail.",
1882
+ arguments: [
1883
+ {
1884
+ name: "workflowTitle",
1885
+ description: "Human-readable title for the run",
1886
+ required: true,
1887
+ },
1888
+ {
1889
+ name: "workflowGoal",
1890
+ description: "What the workflow must accomplish",
1891
+ required: true,
1892
+ },
1893
+ {
1894
+ name: "workflowType",
1895
+ description: "Optional workflow label such as spreadsheet_enrichment or company_direction_analysis",
1896
+ required: false,
1897
+ },
1898
+ ],
1899
+ messages: (args) => [
1900
+ {
1901
+ role: "user",
1902
+ content: {
1903
+ type: "text",
1904
+ text: `Run this task as a fully traceable execution workflow.
1905
+
1906
+ Title: ${args.workflowTitle}
1907
+ Goal: ${args.workflowGoal}
1908
+ Workflow type: ${args.workflowType || "execution_trace"}
1909
+
1910
+ Required operating loop:
1911
+ 1. Call start_execution_run first. Create one durable run before doing substantive work.
1912
+ 2. Record every meaningful action with record_execution_step. Do this for inspect, research, edit, verify, export, and issue-fix steps.
1913
+ 3. Attach evidence as you go with attach_execution_evidence. Store URLs, uploaded files, renders, screenshots, logs, and notes.
1914
+ 4. Record explicit choices with record_execution_decision. Capture alternatives considered, evidence basis, confidence, and limitations. Do not expose raw chain-of-thought.
1915
+ 5. Record QA checks with record_execution_verification. Use this for render checks, formula checks, diff checks, replay checks, or artifact integrity checks.
1916
+ 6. If a risky action needs human sign-off, call request_execution_approval before proceeding.
1917
+ 7. Finish with complete_execution_run and set the final status plus any drift summary if applicable.
1918
+
1919
+ Trace standard:
1920
+ - Facts and outputs must be evidence-grounded.
1921
+ - Decisions must separate verified evidence from inference.
1922
+ - Verification must explain what was checked and what passed or failed.
1923
+ - Limitations must be explicit instead of implied.
1924
+
1925
+ Do not treat the trace as optional. The run should be inspectable after completion by an operator who was not present during execution.`,
1926
+ },
1927
+ },
1928
+ ],
1929
+ },
823
1930
  {
824
1931
  name: "project-setup",
825
1932
  description: "Guided project bootstrapping. Walks you through registering project context so the MCP has full project awareness.",
@@ -835,21 +1942,154 @@ PARALLEL AGENTS? If using Claude Code subagents or multiple terminals:
835
1942
  role: "user",
836
1943
  content: {
837
1944
  type: "text",
838
- text: `Help me set up NodeBench methodology tracking for project: ${args.projectName}
839
-
840
- Please gather and record the following using the bootstrap_project tool:
841
- 1. Tech stack (languages, frameworks, runtimes)
842
- 2. Key dependency versions
843
- 3. Architecture overview
844
- 4. Build/test commands
845
- 5. Known conventions or patterns
846
- 6. Repository structure highlights
847
-
1945
+ text: `Help me set up NodeBench methodology tracking for project: ${args.projectName}
1946
+
1947
+ Please gather and record the following using the bootstrap_project tool:
1948
+ 1. Tech stack (languages, frameworks, runtimes)
1949
+ 2. Key dependency versions
1950
+ 3. Architecture overview
1951
+ 4. Build/test commands
1952
+ 5. Known conventions or patterns
1953
+ 6. Repository structure highlights
1954
+
848
1955
  After bootstrapping, run a reconnaissance session with run_recon to check for latest updates on the project's key frameworks and SDKs.`,
849
1956
  },
850
1957
  },
851
1958
  ],
852
1959
  },
1960
+ {
1961
+ name: "spreadsheet-enrichment-trace",
1962
+ description: "Traceable workflow for spreadsheet enrichment: inspect workbook, research supporting evidence, edit cells, verify render/calculation quality, and export with receipts.",
1963
+ arguments: [
1964
+ {
1965
+ name: "fileUri",
1966
+ description: "Input spreadsheet path or URI",
1967
+ required: true,
1968
+ },
1969
+ {
1970
+ name: "goal",
1971
+ description: "What the spreadsheet workflow should achieve",
1972
+ required: true,
1973
+ },
1974
+ ],
1975
+ messages: (args) => [
1976
+ {
1977
+ role: "user",
1978
+ content: {
1979
+ type: "text",
1980
+ text: `Run a traceable spreadsheet-enrichment workflow.
1981
+
1982
+ Input spreadsheet: ${args.fileUri}
1983
+ Goal: ${args.goal}
1984
+
1985
+ Workflow:
1986
+ 1. Start a run with start_execution_run using workflowName="spreadsheet_enrichment".
1987
+ 2. Inspect workbook structure, layout, formulas, and formatting. Record this with record_execution_step.
1988
+ 3. Attach the workbook and any rendered images as evidence with attach_execution_evidence.
1989
+ 4. If public research is needed, attach source URLs and record the evidence boundary.
1990
+ 5. Record major ranking or editing choices with record_execution_decision. Include alternatives considered and any unsupported claims.
1991
+ 6. Perform edits. Record the edit step and attach output artifacts or before/after references.
1992
+ 7. Verify the workbook. Record calculation checks, render checks, formatting checks, link cleanup, and export checks with record_execution_verification.
1993
+ 8. Complete the run only after the workbook is exported and the final verification state is known.
1994
+
1995
+ Required output discipline:
1996
+ - Make changed cells traceable.
1997
+ - Distinguish verified facts from inferred recommendations.
1998
+ - Record any formatting or hyperlink cleanup as explicit fix steps.
1999
+ - Leave behind enough evidence for another operator to replay what happened.`,
2000
+ },
2001
+ },
2002
+ ],
2003
+ },
2004
+ {
2005
+ name: "company-direction-analysis-trace",
2006
+ description: "Traceable workflow for capability-to-product-direction analysis grounded in public evidence, credibility filters, and phased recommendations.",
2007
+ arguments: [
2008
+ {
2009
+ name: "subjectCompany",
2010
+ description: "Company being evaluated",
2011
+ required: true,
2012
+ },
2013
+ {
2014
+ name: "strategicQuestion",
2015
+ description: "The product-direction or capability question being answered",
2016
+ required: true,
2017
+ },
2018
+ ],
2019
+ messages: (args) => [
2020
+ {
2021
+ role: "user",
2022
+ content: {
2023
+ type: "text",
2024
+ text: `Run a traceable company-direction analysis.
2025
+
2026
+ Subject company: ${args.subjectCompany}
2027
+ Strategic question: ${args.strategicQuestion}
2028
+
2029
+ Required method:
2030
+ 1. Start a run with start_execution_run using workflowName="company_direction_analysis".
2031
+ 2. Gather public evidence first. Attach company pages, press, resumes, hiring signals, papers, and adjacent market references as evidence.
2032
+ 3. Call compute_dimension_profile as soon as you have enough evidence to ground the company state. Then use export_dimension_bundle to inspect the regime label, policy context, evidence rows, and interaction effects.
2033
+ 4. Record a decision boundary between:
2034
+ - publicly supported facts
2035
+ - supported but incomplete claims
2036
+ - not established by public evidence
2037
+ 5. Build a credibility filter and a dimension-aware regime summary. Record explicit decisions for high-credibility, medium-credibility, and low-credibility directions, and tie them to capital, capability, network, market, operations, and narrative dimensions where relevant.
2038
+ 6. Record the final recommendation as a structured decision with alternatives considered, evidence basis, confidence, limitations, and the regime you believe the company is operating under.
2039
+ 7. Record at least one verification step that checks the final memo still reflects the truth boundary, the exported dimension bundle, and does not overclaim pedigree.
2040
+ 8. Complete the run after the recommendation, limitations, evidence links, and dimension bundle references are all attached.
2041
+
2042
+ Output rules:
2043
+ - Recommendations must stay adjacent to reputation and public proof.
2044
+ - Unsupported claims must be clearly labeled as unsupported.
2045
+ - Distinguish verified, estimated, inferred, and unavailable dimension signals.
2046
+ - The trace should let another operator audit why a direction was recommended or rejected.`,
2047
+ },
2048
+ },
2049
+ ],
2050
+ },
2051
+ {
2052
+ name: "agent-delegation-with-approval-trace",
2053
+ description: "Traceable workflow for delegated agent work with approval gates. Use this when a capable agent can operate, but risky actions still need scoped human sign-off.",
2054
+ arguments: [
2055
+ {
2056
+ name: "task",
2057
+ description: "Delegated task description",
2058
+ required: true,
2059
+ },
2060
+ {
2061
+ name: "riskLevel",
2062
+ description: "Expected risk level: low, medium, or high",
2063
+ required: true,
2064
+ },
2065
+ ],
2066
+ messages: (args) => [
2067
+ {
2068
+ role: "user",
2069
+ content: {
2070
+ type: "text",
2071
+ text: `Run a delegated agent workflow with explicit approval boundaries.
2072
+
2073
+ Task: ${args.task}
2074
+ Risk level: ${args.riskLevel}
2075
+
2076
+ Required process:
2077
+ 1. Start a run with start_execution_run using workflowName="agent_delegation".
2078
+ 2. Record the initial scope, intended tools, and expected outputs with record_execution_step.
2079
+ 3. Attach inputs, policies, and constraints as evidence.
2080
+ 4. Record any material choice or plan update with record_execution_decision.
2081
+ 5. Before any externally visible, destructive, or high-risk action, call request_execution_approval.
2082
+ 6. Only continue after the approval state is known, and record the resulting step explicitly.
2083
+ 7. Record verification that the final output stayed inside scope and honored the approval boundary.
2084
+ 8. Complete the run with the final status and limitations.
2085
+
2086
+ Trust requirements:
2087
+ - The operator must be able to see what was attempted, what required approval, and what evidence justified the action.
2088
+ - Do not hide uncertainty or skipped approvals inside prose summaries.`,
2089
+ },
2090
+ },
2091
+ ],
2092
+ },
853
2093
  {
854
2094
  name: "ui-qa-checklist",
855
2095
  description: "UI/UX QA checklist for frontend implementations. Run after any change that touches React components, layouts, or interactions. Guides the agent through component tests, accessibility, responsive checks, and E2E validation.",
@@ -865,33 +2105,33 @@ After bootstrapping, run a reconnaissance session with run_recon to check for la
865
2105
  role: "user",
866
2106
  content: {
867
2107
  type: "text",
868
- text: `You just implemented UI changes to: ${args.componentName}
869
-
870
- Before declaring this work done, run the UI/UX QA checklist:
871
-
872
- 1. COMPONENT TESTS: Run \`npm run test:run\` — all component tests must pass
873
- 2. STORYBOOK: Run \`npm run storybook\` — verify the component renders in isolation
874
- 3. RESPONSIVE: Check at 375px, 768px, 1280px — layout must not break
875
- 4. ACCESSIBILITY: Tab through the UI, check aria-labels, run Storybook a11y panel
876
- 5. STATES: Verify loading, error, and empty states are handled
877
- 6. CONSOLE: Check browser devtools for errors/warnings
878
- 7. CAPTURE: Call capture_responsive_suite(url, label) to screenshot at 3 breakpoints
879
- 8. E2E: Run \`npm run test:e2e\` if relevant tests exist
880
- 9. LIGHTHOUSE: Run \`npm run perf:lighthouse\` for performance + accessibility scores
881
-
882
- After checking each item, record results:
883
- call get_gate_preset("ui_ux_qa") to see the 8 evaluation rules
884
- evaluate each rule against ${args.componentName}
885
- call run_quality_gate(gateName: "ui_ux_qa", rules: [{name, passed}, ...]) with your boolean results
886
- call record_learning for any UI gotchas discovered
887
-
888
- For the full step-by-step methodology, call getMethodology("ui_ux_qa").
889
-
890
- Commands available:
891
- npm run test:run — Vitest component tests
892
- npm run test:e2e — Playwright E2E tests
893
- npm run storybook — Storybook dev server (port 6006)
894
- npm run perf:lighthouse — Lighthouse audit
2108
+ text: `You just implemented UI changes to: ${args.componentName}
2109
+
2110
+ Before declaring this work done, run the UI/UX QA checklist:
2111
+
2112
+ 1. COMPONENT TESTS: Run \`npm run test:run\` — all component tests must pass
2113
+ 2. STORYBOOK: Run \`npm run storybook\` — verify the component renders in isolation
2114
+ 3. RESPONSIVE: Check at 375px, 768px, 1280px — layout must not break
2115
+ 4. ACCESSIBILITY: Tab through the UI, check aria-labels, run Storybook a11y panel
2116
+ 5. STATES: Verify loading, error, and empty states are handled
2117
+ 6. CONSOLE: Check browser devtools for errors/warnings
2118
+ 7. CAPTURE: Call capture_responsive_suite(url, label) to screenshot at 3 breakpoints
2119
+ 8. E2E: Run \`npm run test:e2e\` if relevant tests exist
2120
+ 9. LIGHTHOUSE: Run \`npm run perf:lighthouse\` for performance + accessibility scores
2121
+
2122
+ After checking each item, record results:
2123
+ call get_gate_preset("ui_ux_qa") to see the 8 evaluation rules
2124
+ evaluate each rule against ${args.componentName}
2125
+ call run_quality_gate(gateName: "ui_ux_qa", rules: [{name, passed}, ...]) with your boolean results
2126
+ call record_learning for any UI gotchas discovered
2127
+
2128
+ For the full step-by-step methodology, call getMethodology("ui_ux_qa").
2129
+
2130
+ Commands available:
2131
+ npm run test:run — Vitest component tests
2132
+ npm run test:e2e — Playwright E2E tests
2133
+ npm run storybook — Storybook dev server (port 6006)
2134
+ npm run perf:lighthouse — Lighthouse audit
895
2135
  npm run perf:bundle — Bundle size analysis`,
896
2136
  },
897
2137
  },
@@ -919,47 +2159,47 @@ Commands available:
919
2159
  role: "user",
920
2160
  content: {
921
2161
  type: "text",
922
- text: `You are coordinating a parallel agent team for: ${args.projectGoal}
923
-
924
- This follows the pattern from Anthropic's "Building a C Compiler with Parallel Claudes" (Feb 2026).
925
- Reference: https://www.anthropic.com/engineering/building-c-compiler
926
-
927
- SETUP (run these in order):
928
-
929
- 1. ORIENT — Check what's already happening:
930
- call get_parallel_status({ includeHistory: true })
931
- call list_agent_tasks({ status: "all" })
932
-
933
- 2. PLAN ROLES — Assign ${agentCount} specialized agents:
934
- Recommended role split for ${agentCount} agents:
935
- ${agentCount >= 4 ? `- Agent 1: assign_agent_role({ role: "implementer", focusArea: "core features" })
936
- - Agent 2: assign_agent_role({ role: "test_writer", focusArea: "test coverage" })
937
- - Agent 3: assign_agent_role({ role: "code_quality_critic", focusArea: "refactoring" })
2162
+ text: `You are coordinating a parallel agent team for: ${args.projectGoal}
2163
+
2164
+ This follows the pattern from Anthropic's "Building a C Compiler with Parallel Claudes" (Feb 2026).
2165
+ Reference: https://www.anthropic.com/engineering/building-c-compiler
2166
+
2167
+ SETUP (run these in order):
2168
+
2169
+ 1. ORIENT — Check what's already happening:
2170
+ call get_parallel_status({ includeHistory: true })
2171
+ call list_agent_tasks({ status: "all" })
2172
+
2173
+ 2. PLAN ROLES — Assign ${agentCount} specialized agents:
2174
+ Recommended role split for ${agentCount} agents:
2175
+ ${agentCount >= 4 ? `- Agent 1: assign_agent_role({ role: "implementer", focusArea: "core features" })
2176
+ - Agent 2: assign_agent_role({ role: "test_writer", focusArea: "test coverage" })
2177
+ - Agent 3: assign_agent_role({ role: "code_quality_critic", focusArea: "refactoring" })
938
2178
  - Agent 4: assign_agent_role({ role: "documentation_maintainer", focusArea: "docs and progress" })` :
939
- `- Agent 1: assign_agent_role({ role: "implementer" })
940
- - Agent 2: assign_agent_role({ role: "test_writer" })`}
941
-
942
- 3. BREAK DOWN WORK — Create task claims:
943
- For each independent piece of work:
944
- call claim_agent_task({ taskKey: "descriptive_snake_case", description: "What to do" })
945
-
946
- 4. WORK LOOP (each agent independently):
947
- a. claim_agent_task — Lock your task
948
- b. Do the work (implement, test, review)
949
- c. log_context_budget — Track context usage, avoid pollution
950
- d. run_oracle_comparison — Validate output against known-good reference
951
- e. release_agent_task — Release with progress note
952
- f. Pick next task (repeat)
953
-
954
- 5. ANTI-PATTERNS TO AVOID:
955
- - Two agents working on the same task (always claim first)
956
- - Dumping thousands of lines of test output (log to file, print summary)
957
- - Spending hours on one stuck problem (mark as blocked, move on)
958
- - Overwriting each other's changes (commit frequently, pull before push)
959
-
960
- KEY INSIGHT from Anthropic: When all agents get stuck on the same bug (like compiling the Linux kernel),
961
- use oracle-based testing to split the problem into independent sub-problems that each agent can solve in parallel.
962
-
2179
+ `- Agent 1: assign_agent_role({ role: "implementer" })
2180
+ - Agent 2: assign_agent_role({ role: "test_writer" })`}
2181
+
2182
+ 3. BREAK DOWN WORK — Create task claims:
2183
+ For each independent piece of work:
2184
+ call claim_agent_task({ taskKey: "descriptive_snake_case", description: "What to do" })
2185
+
2186
+ 4. WORK LOOP (each agent independently):
2187
+ a. claim_agent_task — Lock your task
2188
+ b. Do the work (implement, test, review)
2189
+ c. log_context_budget — Track context usage, avoid pollution
2190
+ d. run_oracle_comparison — Validate output against known-good reference
2191
+ e. release_agent_task — Release with progress note
2192
+ f. Pick next task (repeat)
2193
+
2194
+ 5. ANTI-PATTERNS TO AVOID:
2195
+ - Two agents working on the same task (always claim first)
2196
+ - Dumping thousands of lines of test output (log to file, print summary)
2197
+ - Spending hours on one stuck problem (mark as blocked, move on)
2198
+ - Overwriting each other's changes (commit frequently, pull before push)
2199
+
2200
+ KEY INSIGHT from Anthropic: When all agents get stuck on the same bug (like compiling the Linux kernel),
2201
+ use oracle-based testing to split the problem into independent sub-problems that each agent can solve in parallel.
2202
+
963
2203
  For the full methodology: call getMethodology("parallel_agent_teams")`,
964
2204
  },
965
2205
  },
@@ -986,45 +2226,45 @@ For the full methodology: call getMethodology("parallel_agent_teams")`,
986
2226
  role: "user",
987
2227
  content: {
988
2228
  type: "text",
989
- text: `Set up oracle-based testing for: ${args.componentName}
990
- Oracle source: ${args.oracleSource}
991
-
992
- This follows the pattern from Anthropic's C Compiler project where GCC served as a
993
- "known-good compiler oracle" to identify which specific files were broken.
994
-
995
- SETUP:
996
-
997
- 1. DEFINE ORACLE — Capture known-good reference outputs:
998
- Run the reference implementation (${args.oracleSource}) on each test input.
999
- Save outputs as golden files or capture them in the oracle comparison tool.
1000
-
1001
- 2. RUN COMPARISONS — For each test case:
1002
- call run_oracle_comparison({
1003
- testLabel: "${args.componentName}_test_1",
1004
- actualOutput: "<your implementation's output>",
1005
- expectedOutput: "<oracle's output>",
1006
- oracleSource: "${args.oracleSource}"
1007
- })
1008
-
1009
- 3. TRIAGE FAILURES — Review diff summaries:
1010
- Each failing comparison is an independent work item.
1011
- Assign each to a different parallel agent via claim_agent_task.
1012
-
1013
- 4. BINARY SEARCH (for complex failures):
1014
- If a test passes individually but fails when combined with others,
1015
- use delta debugging: split the test set in half, test each half,
1016
- narrow down to the minimal failing combination.
1017
- (This is how Anthropic found pairs of files that failed together but worked independently.)
1018
-
1019
- 5. TRACK PROGRESS — Monitor convergence:
1020
- call get_parallel_status to see how many oracle tests are still failing.
1021
- As agents fix failures, the match percentage should trend toward 100%.
1022
-
1023
- CONTEXT BUDGET TIP: Large test outputs pollute context. Instead of printing full output,
1024
- call log_context_budget to track usage and only show diff summaries (first 20 differing lines).
1025
-
1026
- After all oracle tests pass:
1027
- call record_learning with patterns discovered
2229
+ text: `Set up oracle-based testing for: ${args.componentName}
2230
+ Oracle source: ${args.oracleSource}
2231
+
2232
+ This follows the pattern from Anthropic's C Compiler project where GCC served as a
2233
+ "known-good compiler oracle" to identify which specific files were broken.
2234
+
2235
+ SETUP:
2236
+
2237
+ 1. DEFINE ORACLE — Capture known-good reference outputs:
2238
+ Run the reference implementation (${args.oracleSource}) on each test input.
2239
+ Save outputs as golden files or capture them in the oracle comparison tool.
2240
+
2241
+ 2. RUN COMPARISONS — For each test case:
2242
+ call run_oracle_comparison({
2243
+ testLabel: "${args.componentName}_test_1",
2244
+ actualOutput: "<your implementation's output>",
2245
+ expectedOutput: "<oracle's output>",
2246
+ oracleSource: "${args.oracleSource}"
2247
+ })
2248
+
2249
+ 3. TRIAGE FAILURES — Review diff summaries:
2250
+ Each failing comparison is an independent work item.
2251
+ Assign each to a different parallel agent via claim_agent_task.
2252
+
2253
+ 4. BINARY SEARCH (for complex failures):
2254
+ If a test passes individually but fails when combined with others,
2255
+ use delta debugging: split the test set in half, test each half,
2256
+ narrow down to the minimal failing combination.
2257
+ (This is how Anthropic found pairs of files that failed together but worked independently.)
2258
+
2259
+ 5. TRACK PROGRESS — Monitor convergence:
2260
+ call get_parallel_status to see how many oracle tests are still failing.
2261
+ As agents fix failures, the match percentage should trend toward 100%.
2262
+
2263
+ CONTEXT BUDGET TIP: Large test outputs pollute context. Instead of printing full output,
2264
+ call log_context_budget to track usage and only show diff summaries (first 20 differing lines).
2265
+
2266
+ After all oracle tests pass:
2267
+ call record_learning with patterns discovered
1028
2268
  call run_mandatory_flywheel to verify the full change`,
1029
2269
  },
1030
2270
  },
@@ -1052,67 +2292,67 @@ After all oracle tests pass:
1052
2292
  role: "user",
1053
2293
  content: {
1054
2294
  type: "text",
1055
- text: `You are coordinating ${count} parallel Claude Code subagents for: ${args.taskDescription}
1056
-
1057
- ## How This Works
1058
-
1059
- Claude Code's Task tool spawns subagents — each is an independent Claude instance with its own
1060
- context window. NodeBench MCP tools coordinate them via a shared SQLite database.
1061
-
1062
- **Your role: COORDINATOR.** You break work into independent tasks and spawn subagents.
1063
- **Subagent role: WORKER.** Each claims a task, does work, releases with a progress note.
1064
-
1065
- ## Step-by-Step
1066
-
1067
- ### 1. PLAN — Break work into ${count} independent tasks
1068
- Identify ${count} pieces of work that can run in parallel without dependencies.
1069
- Each task should be independently completable and testable.
1070
-
1071
- ### 2. SPAWN — Launch subagents with coordination instructions
1072
- For each task, use the Task tool:
1073
-
1074
- \`\`\`
1075
- Task tool call:
1076
- prompt: "You have access to NodeBench MCP. Do the following:
1077
- 1. Call claim_agent_task({ taskKey: '<task_key>', description: '<what to do>' })
1078
- 2. Call assign_agent_role({ role: 'implementer', focusArea: '<area>' })
1079
- 3. Do the work
1080
- 4. Call log_context_budget({ eventType: 'checkpoint', tokensUsed: <estimate> })
1081
- 5. Call release_agent_task({ taskKey: '<task_key>', status: 'completed', progressNote: '<summary>' })
1082
- 6. Call record_learning({ key: '<key>', content: '<what you learned>', category: 'pattern' })"
1083
- \`\`\`
1084
-
1085
- ### 3. MONITOR — Check progress
1086
- After spawning all subagents:
1087
- call get_parallel_status({ includeHistory: true })
1088
- call list_agent_tasks({ status: "all" })
1089
-
1090
- ### 4. VALIDATE — Run oracle comparisons if applicable
1091
- If subagents produced outputs that should match a reference:
1092
- call run_oracle_comparison for each output
1093
-
1094
- ### 5. GATE — Quality check the aggregate result
1095
- call run_quality_gate with rules covering all ${count} tasks
1096
- call run_mandatory_flywheel to verify the combined change
1097
-
1098
- ## Concrete IMPACT of This Workflow
1099
-
1100
- | What NodeBench Adds | Without It (bare subagents) |
1101
- |---------------------------------|---------------------------------------|
1102
- | Task locks prevent duplicate work | Two subagents might fix the same bug |
1103
- | Role specialization | All subagents do everything |
1104
- | Context budget tracking | Subagent runs out of context silently |
1105
- | Oracle comparisons | No reference-based validation |
1106
- | Progress notes for handoff | Next session starts from scratch |
1107
- | Learnings persisted | Knowledge lost when subagent exits |
1108
- | Quality gate on aggregate | No validation that pieces fit together |
1109
-
1110
- ## Anti-Patterns
1111
- - DO NOT spawn subagents for work that has dependencies (sequential steps)
1112
- - DO NOT skip claim_agent_task — without it, two subagents may duplicate effort
1113
- - DO NOT dump large outputs into subagent context — use log_context_budget to track
1114
- - DO NOT forget release_agent_task — orphaned claims block future sessions
1115
-
2295
+ text: `You are coordinating ${count} parallel Claude Code subagents for: ${args.taskDescription}
2296
+
2297
+ ## How This Works
2298
+
2299
+ Claude Code's Task tool spawns subagents — each is an independent Claude instance with its own
2300
+ context window. NodeBench MCP tools coordinate them via a shared SQLite database.
2301
+
2302
+ **Your role: COORDINATOR.** You break work into independent tasks and spawn subagents.
2303
+ **Subagent role: WORKER.** Each claims a task, does work, releases with a progress note.
2304
+
2305
+ ## Step-by-Step
2306
+
2307
+ ### 1. PLAN — Break work into ${count} independent tasks
2308
+ Identify ${count} pieces of work that can run in parallel without dependencies.
2309
+ Each task should be independently completable and testable.
2310
+
2311
+ ### 2. SPAWN — Launch subagents with coordination instructions
2312
+ For each task, use the Task tool:
2313
+
2314
+ \`\`\`
2315
+ Task tool call:
2316
+ prompt: "You have access to NodeBench MCP. Do the following:
2317
+ 1. Call claim_agent_task({ taskKey: '<task_key>', description: '<what to do>' })
2318
+ 2. Call assign_agent_role({ role: 'implementer', focusArea: '<area>' })
2319
+ 3. Do the work
2320
+ 4. Call log_context_budget({ eventType: 'checkpoint', tokensUsed: <estimate> })
2321
+ 5. Call release_agent_task({ taskKey: '<task_key>', status: 'completed', progressNote: '<summary>' })
2322
+ 6. Call record_learning({ key: '<key>', content: '<what you learned>', category: 'pattern' })"
2323
+ \`\`\`
2324
+
2325
+ ### 3. MONITOR — Check progress
2326
+ After spawning all subagents:
2327
+ call get_parallel_status({ includeHistory: true })
2328
+ call list_agent_tasks({ status: "all" })
2329
+
2330
+ ### 4. VALIDATE — Run oracle comparisons if applicable
2331
+ If subagents produced outputs that should match a reference:
2332
+ call run_oracle_comparison for each output
2333
+
2334
+ ### 5. GATE — Quality check the aggregate result
2335
+ call run_quality_gate with rules covering all ${count} tasks
2336
+ call run_mandatory_flywheel to verify the combined change
2337
+
2338
+ ## Concrete IMPACT of This Workflow
2339
+
2340
+ | What NodeBench Adds | Without It (bare subagents) |
2341
+ |---------------------------------|---------------------------------------|
2342
+ | Task locks prevent duplicate work | Two subagents might fix the same bug |
2343
+ | Role specialization | All subagents do everything |
2344
+ | Context budget tracking | Subagent runs out of context silently |
2345
+ | Oracle comparisons | No reference-based validation |
2346
+ | Progress notes for handoff | Next session starts from scratch |
2347
+ | Learnings persisted | Knowledge lost when subagent exits |
2348
+ | Quality gate on aggregate | No validation that pieces fit together |
2349
+
2350
+ ## Anti-Patterns
2351
+ - DO NOT spawn subagents for work that has dependencies (sequential steps)
2352
+ - DO NOT skip claim_agent_task — without it, two subagents may duplicate effort
2353
+ - DO NOT dump large outputs into subagent context — use log_context_budget to track
2354
+ - DO NOT forget release_agent_task — orphaned claims block future sessions
2355
+
1116
2356
  For the full parallel agent methodology: call getMethodology("parallel_agent_teams")`,
1117
2357
  },
1118
2358
  },
@@ -1139,72 +2379,72 @@ For the full parallel agent methodology: call getMethodology("parallel_agent_tea
1139
2379
  role: "user",
1140
2380
  content: {
1141
2381
  type: "text",
1142
- text: `Bootstrap parallel agent infrastructure for: ${args.projectPath}
1143
- ${args.techStack ? `Tech stack: ${args.techStack}` : ""}
1144
-
1145
- This follows the AI Flywheel closed loop: detect → scaffold → verify → fix → document.
1146
-
1147
- STEP 1 — DETECT (dry run first):
1148
- call bootstrap_parallel_agents({
1149
- projectRoot: "${args.projectPath}",
1150
- dryRun: true,
1151
- ${args.techStack ? `techStack: "${args.techStack}",` : ""}
1152
- includeAgentsMd: true
1153
- })
1154
-
1155
- Review the gap report. It scans 7 categories:
1156
- - Task coordination (lock files, claim directories)
1157
- - Role specialization (role configs, AGENTS.md mentions)
1158
- - Oracle testing (golden files, reference outputs, snapshots)
1159
- - Context budget tracking (budget configs, AGENTS.md mentions)
1160
- - Progress files (PROGRESS.md, STATUS.md, claude-progress.txt)
1161
- - AGENTS.md parallel section (parallel agent coordination protocol)
1162
- - Git worktrees (for true parallel work)
1163
-
1164
- STEP 2 — SCAFFOLD (create files):
1165
- If gaps found, run with dryRun=false:
1166
- call bootstrap_parallel_agents({
1167
- projectRoot: "${args.projectPath}",
1168
- dryRun: false,
1169
- ${args.techStack ? `techStack: "${args.techStack}",` : ""}
1170
- includeAgentsMd: true
1171
- })
1172
-
1173
- This creates:
1174
- - .parallel-agents/ directory with README, current_tasks/, oracle/, roles.json
1175
- - progress.md template for agent orientation
1176
- - AGENTS.md parallel section (or .parallel-append file for existing AGENTS.md)
1177
-
1178
- STEP 3 — GENERATE AGENTS.MD (if needed):
1179
- call generate_parallel_agents_md({
1180
- ${args.techStack ? `techStack: "${args.techStack}",` : ""}
1181
- projectName: "${args.projectPath.split("/").pop() || "project"}",
1182
- maxAgents: 4,
1183
- includeNodebenchSetup: true
1184
- })
1185
-
1186
- Copy the output into the target repo's AGENTS.md.
1187
-
1188
- STEP 4 — VERIFY (6-step flywheel):
1189
- The bootstrap tool returns a flywheelPlan. Execute each step:
1190
- 1. Static analysis — verify scaffold files don't conflict
1191
- 2. Happy path — claim task → work → release → progress.md updated
1192
- 3. Conflict test — two claims on same task → second gets conflict
1193
- 4. Oracle test — create golden file → diff catches changes
1194
- 5. Gap re-scan — re-run bootstrap with dryRun=true → all gaps filled
1195
- 6. Document — record_learning with patterns discovered
1196
-
1197
- STEP 5 — FIX (if anything fails):
1198
- Fix the issue, then re-run from Step 4.
1199
-
1200
- STEP 6 — DOCUMENT:
1201
- call record_learning({
1202
- key: "bootstrap_parallel_${args.projectPath.split("/").pop() || "project"}",
1203
- content: "Bootstrapped parallel agent infrastructure for ${args.projectPath}. <summary of what was created and any issues found>",
1204
- category: "pattern",
1205
- tags: ["parallel-agents", "bootstrap", "external-repo"]
1206
- })
1207
-
2382
+ text: `Bootstrap parallel agent infrastructure for: ${args.projectPath}
2383
+ ${args.techStack ? `Tech stack: ${args.techStack}` : ""}
2384
+
2385
+ This follows the AI Flywheel closed loop: detect → scaffold → verify → fix → document.
2386
+
2387
+ STEP 1 — DETECT (dry run first):
2388
+ call bootstrap_parallel_agents({
2389
+ projectRoot: "${args.projectPath}",
2390
+ dryRun: true,
2391
+ ${args.techStack ? `techStack: "${args.techStack}",` : ""}
2392
+ includeAgentsMd: true
2393
+ })
2394
+
2395
+ Review the gap report. It scans 7 categories:
2396
+ - Task coordination (lock files, claim directories)
2397
+ - Role specialization (role configs, AGENTS.md mentions)
2398
+ - Oracle testing (golden files, reference outputs, snapshots)
2399
+ - Context budget tracking (budget configs, AGENTS.md mentions)
2400
+ - Progress files (PROGRESS.md, STATUS.md, claude-progress.txt)
2401
+ - AGENTS.md parallel section (parallel agent coordination protocol)
2402
+ - Git worktrees (for true parallel work)
2403
+
2404
+ STEP 2 — SCAFFOLD (create files):
2405
+ If gaps found, run with dryRun=false:
2406
+ call bootstrap_parallel_agents({
2407
+ projectRoot: "${args.projectPath}",
2408
+ dryRun: false,
2409
+ ${args.techStack ? `techStack: "${args.techStack}",` : ""}
2410
+ includeAgentsMd: true
2411
+ })
2412
+
2413
+ This creates:
2414
+ - .parallel-agents/ directory with README, current_tasks/, oracle/, roles.json
2415
+ - progress.md template for agent orientation
2416
+ - AGENTS.md parallel section (or .parallel-append file for existing AGENTS.md)
2417
+
2418
+ STEP 3 — GENERATE AGENTS.MD (if needed):
2419
+ call generate_parallel_agents_md({
2420
+ ${args.techStack ? `techStack: "${args.techStack}",` : ""}
2421
+ projectName: "${args.projectPath.split("/").pop() || "project"}",
2422
+ maxAgents: 4,
2423
+ includeNodebenchSetup: true
2424
+ })
2425
+
2426
+ Copy the output into the target repo's AGENTS.md.
2427
+
2428
+ STEP 4 — VERIFY (6-step flywheel):
2429
+ The bootstrap tool returns a flywheelPlan. Execute each step:
2430
+ 1. Static analysis — verify scaffold files don't conflict
2431
+ 2. Happy path — claim task → work → release → progress.md updated
2432
+ 3. Conflict test — two claims on same task → second gets conflict
2433
+ 4. Oracle test — create golden file → diff catches changes
2434
+ 5. Gap re-scan — re-run bootstrap with dryRun=true → all gaps filled
2435
+ 6. Document — record_learning with patterns discovered
2436
+
2437
+ STEP 5 — FIX (if anything fails):
2438
+ Fix the issue, then re-run from Step 4.
2439
+
2440
+ STEP 6 — DOCUMENT:
2441
+ call record_learning({
2442
+ key: "bootstrap_parallel_${args.projectPath.split("/").pop() || "project"}",
2443
+ content: "Bootstrapped parallel agent infrastructure for ${args.projectPath}. <summary of what was created and any issues found>",
2444
+ category: "pattern",
2445
+ tags: ["parallel-agents", "bootstrap", "external-repo"]
2446
+ })
2447
+
1208
2448
  For the full methodology: call getMethodology("parallel_agent_teams")`,
1209
2449
  },
1210
2450
  },
@@ -1218,82 +2458,82 @@ For the full methodology: call getMethodology("parallel_agent_teams")`,
1218
2458
  role: "user",
1219
2459
  content: {
1220
2460
  type: "text",
1221
- text: `## NodeBench MCP Agent Contract
1222
-
1223
- You are connected to NodeBench MCP. Follow these rules EXACTLY.
1224
-
1225
- ### FRONT DOOR — Always start here (before writing any code)
1226
- 1. search_all_knowledge("<your current task>") — Check if this was solved before
1227
- 2. getMethodology("mandatory_flywheel") — Load the verification pipeline
1228
- 3. discover_tools("<your task>", { explain: true }) — Find the right tools for this job
1229
- 4. get_workflow_chain("<workflow>") — Get step-by-step sequence (fix_bug, new_feature, etc.)
1230
-
1231
- ### SELF-SETUP — If a capability is missing
1232
- When discover_tools returns nothing useful, or a tool says "not configured":
1233
- 1. Escalate toolset: If started with --preset lite, switch to --preset core or targeted --toolsets
1234
- 2. Resolve providers: Configure missing API keys (GEMINI_API_KEY, OPENAI_API_KEY, etc.)
1235
- 3. Bootstrap infra: Run scaffold_nodebench_project or bootstrap_parallel_agents if repo lacks infra
1236
- 4. Smoke-test: Re-run the first workflow chain step to confirm the capability is available
1237
-
1238
- ### BEFORE IMPLEMENTATION
1239
- - run_recon + log_recon_finding (if reconnaissance applies)
1240
- - assess_risk (HIGH risk = must get confirmation before proceeding)
1241
-
1242
- ### PARALLEL WORK
1243
- - MUST claim_agent_task before editing or designing anything
1244
- - MUST release_agent_task with a progress note + next action when done
1245
- - MUST log_context_budget to track context usage and avoid pollution
1246
-
1247
- ### BEFORE SHIP
1248
- - 3-layer tests logged (unit + integration + e2e via log_test_result)
1249
- - Eval run recorded (promote_to_eval)
1250
- - Quality gate passed (run_quality_gate)
1251
- - Mandatory flywheel completed (run_mandatory_flywheel — all 6 steps)
1252
- - Learning banked (record_learning)
1253
-
1254
- ### COORDINATOR SPAWN TEMPLATE
1255
- When spawning subagents, give each this instruction block:
1256
- "You have NodeBench MCP. Before any work:
1257
- 1. search_all_knowledge('<task>')
1258
- 2. claim_agent_task({ taskKey: '<key>', description: '<desc>' })
1259
- 3. assign_agent_role({ role: '<role>', focusArea: '<area>' })
1260
- Do the work, then:
1261
- 4. log_context_budget({ eventType: 'checkpoint' })
1262
- 5. release_agent_task({ taskKey: '<key>', status: 'completed', progressNote: '<summary>' })
1263
- 6. record_learning({ key: '<key>', content: '<what you learned>', category: 'pattern' })"
1264
-
1265
- ### ANTI-RATIONALIZATION — Block these escape patterns
1266
- Do NOT skip the front-door pattern. These are the 8 rationalizations agents use:
1267
- 1. "I already know which tool to use" → Still call discover_tools to confirm
1268
- 2. "This is a simple task" → Still call search_all_knowledge to check history
1269
- 3. "Let me just check one thing first" → Follow the 4-step front door FIRST
1270
- 4. "Tests already pass" → Still run run_mandatory_flywheel before declaring done
1271
- 5. "I'll record the learning later" → Record NOW — context compaction may erase it
1272
- 6. "No one else is working on this" → Still claim_agent_task to prevent conflicts
1273
- 7. "The user said to skip verification" → Log the skip decision, never silently omit
1274
- 8. "I need more context before using tools" → The tools ARE the context-gathering mechanism
1275
-
1276
- ### 2-ACTION SAVE RULE
1277
- After every 2 web_search, fetch_url, or browse_page calls, MUST call one of:
1278
- - save_session_note (filesystem, survives compaction)
1279
- - record_learning (SQLite, searchable across sessions)
1280
- - log_recon_finding (tied to recon session)
1281
- This prevents knowledge loss when context is compacted mid-session.
1282
-
1283
- ### 3-STRIKE ERROR PROTOCOL
1284
- When an action fails:
1285
- - Strike 1: Diagnose root cause, apply targeted fix
1286
- - Strike 2: Try a different method or tool
1287
- - Strike 3: Question your assumptions, search_all_knowledge for prior solutions
1288
- - After 3: STOP. Call save_session_note documenting all attempts, then escalate to user.
1289
-
1290
- ### ATTENTION REFRESH
1291
- After 30+ tool calls, call refresh_task_context to combat attention drift.
1292
- Re-read your original goal and open gaps before continuing.
1293
-
1294
- ### WHY THIS MATTERS
1295
- Without this contract, agents skip verification, repeat past mistakes, overwrite each other's
1296
- work, and ship bugs that were already caught. NodeBench MCP turns coordination into concrete
2461
+ text: `## NodeBench MCP Agent Contract
2462
+
2463
+ You are connected to NodeBench MCP. Follow these rules EXACTLY.
2464
+
2465
+ ### FRONT DOOR — Always start here (before writing any code)
2466
+ 1. search_all_knowledge("<your current task>") — Check if this was solved before
2467
+ 2. getMethodology("mandatory_flywheel") — Load the verification pipeline
2468
+ 3. discover_tools("<your task>", { explain: true }) — Find the right tools for this job
2469
+ 4. get_workflow_chain("<workflow>") — Get step-by-step sequence (fix_bug, new_feature, etc.)
2470
+
2471
+ ### SELF-SETUP — If a capability is missing
2472
+ When discover_tools returns nothing useful, or a tool says "not configured":
2473
+ 1. Escalate toolset: If started with --preset lite, switch to --preset core or targeted --toolsets
2474
+ 2. Resolve providers: Configure missing API keys (GEMINI_API_KEY, OPENAI_API_KEY, etc.)
2475
+ 3. Bootstrap infra: Run scaffold_nodebench_project or bootstrap_parallel_agents if repo lacks infra
2476
+ 4. Smoke-test: Re-run the first workflow chain step to confirm the capability is available
2477
+
2478
+ ### BEFORE IMPLEMENTATION
2479
+ - run_recon + log_recon_finding (if reconnaissance applies)
2480
+ - assess_risk (HIGH risk = must get confirmation before proceeding)
2481
+
2482
+ ### PARALLEL WORK
2483
+ - MUST claim_agent_task before editing or designing anything
2484
+ - MUST release_agent_task with a progress note + next action when done
2485
+ - MUST log_context_budget to track context usage and avoid pollution
2486
+
2487
+ ### BEFORE SHIP
2488
+ - 3-layer tests logged (unit + integration + e2e via log_test_result)
2489
+ - Eval run recorded (promote_to_eval)
2490
+ - Quality gate passed (run_quality_gate)
2491
+ - Mandatory flywheel completed (run_mandatory_flywheel — all 6 steps)
2492
+ - Learning banked (record_learning)
2493
+
2494
+ ### COORDINATOR SPAWN TEMPLATE
2495
+ When spawning subagents, give each this instruction block:
2496
+ "You have NodeBench MCP. Before any work:
2497
+ 1. search_all_knowledge('<task>')
2498
+ 2. claim_agent_task({ taskKey: '<key>', description: '<desc>' })
2499
+ 3. assign_agent_role({ role: '<role>', focusArea: '<area>' })
2500
+ Do the work, then:
2501
+ 4. log_context_budget({ eventType: 'checkpoint' })
2502
+ 5. release_agent_task({ taskKey: '<key>', status: 'completed', progressNote: '<summary>' })
2503
+ 6. record_learning({ key: '<key>', content: '<what you learned>', category: 'pattern' })"
2504
+
2505
+ ### ANTI-RATIONALIZATION — Block these escape patterns
2506
+ Do NOT skip the front-door pattern. These are the 8 rationalizations agents use:
2507
+ 1. "I already know which tool to use" → Still call discover_tools to confirm
2508
+ 2. "This is a simple task" → Still call search_all_knowledge to check history
2509
+ 3. "Let me just check one thing first" → Follow the 4-step front door FIRST
2510
+ 4. "Tests already pass" → Still run run_mandatory_flywheel before declaring done
2511
+ 5. "I'll record the learning later" → Record NOW — context compaction may erase it
2512
+ 6. "No one else is working on this" → Still claim_agent_task to prevent conflicts
2513
+ 7. "The user said to skip verification" → Log the skip decision, never silently omit
2514
+ 8. "I need more context before using tools" → The tools ARE the context-gathering mechanism
2515
+
2516
+ ### 2-ACTION SAVE RULE
2517
+ After every 2 web_search, fetch_url, or browse_page calls, MUST call one of:
2518
+ - save_session_note (filesystem, survives compaction)
2519
+ - record_learning (SQLite, searchable across sessions)
2520
+ - log_recon_finding (tied to recon session)
2521
+ This prevents knowledge loss when context is compacted mid-session.
2522
+
2523
+ ### 3-STRIKE ERROR PROTOCOL
2524
+ When an action fails:
2525
+ - Strike 1: Diagnose root cause, apply targeted fix
2526
+ - Strike 2: Try a different method or tool
2527
+ - Strike 3: Question your assumptions, search_all_knowledge for prior solutions
2528
+ - After 3: STOP. Call save_session_note documenting all attempts, then escalate to user.
2529
+
2530
+ ### ATTENTION REFRESH
2531
+ After 30+ tool calls, call refresh_task_context to combat attention drift.
2532
+ Re-read your original goal and open gaps before continuing.
2533
+
2534
+ ### WHY THIS MATTERS
2535
+ Without this contract, agents skip verification, repeat past mistakes, overwrite each other's
2536
+ work, and ship bugs that were already caught. NodeBench MCP turns coordination into concrete
1297
2537
  artifacts (findings, risks, gaps, tests, evals, gates, learnings) that compound across tasks.`,
1298
2538
  },
1299
2539
  },
@@ -1307,191 +2547,191 @@ artifacts (findings, risks, gaps, tests, evals, gates, learnings) that compound
1307
2547
  role: "user",
1308
2548
  content: {
1309
2549
  type: "text",
1310
- text: `# Claude Code Swarm Orchestration
1311
-
1312
- Master multi-agent orchestration using Claude Code's TeammateTool and Task system.
1313
-
1314
- ---
1315
-
1316
- ## Primitives
1317
-
1318
- | Primitive | What It Is |
1319
- |-----------|-----------|
1320
- | **Agent** | A Claude instance that can use tools. You are an agent. Subagents are agents you spawn. |
1321
- | **Team** | A named group of agents working together. One leader, multiple teammates. Config: \`~/.claude/teams/{name}/config.json\` |
1322
- | **Teammate** | An agent that joined a team. Has a name, color, inbox. Spawned via Task with \`team_name\` + \`name\`. |
1323
- | **Leader** | The agent that created the team. Receives messages, approves plans/shutdowns. |
1324
- | **Task** | A work item with subject, description, status, owner, and dependencies. |
1325
- | **Inbox** | JSON file where an agent receives messages. \`~/.claude/teams/{name}/inboxes/{agent}.json\` |
1326
- | **Backend** | How teammates run. Auto-detected: \`in-process\` (invisible), \`tmux\` (visible panes), \`iterm2\` (split panes). |
1327
-
1328
- ---
1329
-
1330
- ## Two Ways to Spawn Agents
1331
-
1332
- ### Method 1: Task Tool (Subagents) — short-lived, returns result directly
1333
- \`\`\`javascript
1334
- Task({ subagent_type: "Explore", description: "Find auth files", prompt: "...", model: "haiku" })
1335
- \`\`\`
1336
-
1337
- ### Method 2: Task + team_name + name (Teammates) — persistent, communicates via inbox
1338
- \`\`\`javascript
1339
- Teammate({ operation: "spawnTeam", team_name: "my-project" })
1340
- Task({ team_name: "my-project", name: "security-reviewer", subagent_type: "general-purpose", prompt: "...", run_in_background: true })
1341
- \`\`\`
1342
-
1343
- | Aspect | Task (subagent) | Task + team_name + name (teammate) |
1344
- |--------|-----------------|-----------------------------------|
1345
- | Lifespan | Until task complete | Until shutdown requested |
1346
- | Communication | Return value | Inbox messages |
1347
- | Task access | None | Shared task list |
1348
- | Team membership | No | Yes |
1349
-
1350
- ---
1351
-
1352
- ## Built-in Agent Types
1353
-
1354
- - **Bash** — command execution, git ops (tools: Bash only)
1355
- - **Explore** — read-only codebase search, file finding (use \`model: "haiku"\`)
1356
- - **Plan** — architecture + implementation plans (read-only tools)
1357
- - **general-purpose** — all tools, multi-step research + action
1358
- - **claude-code-guide** — questions about Claude Code, Agent SDK, Anthropic API
1359
- - **statusline-setup** — configure Claude Code status line
1360
-
1361
- ---
1362
-
1363
- ## TeammateTool Operations
1364
-
1365
- | Operation | Who | What |
1366
- |-----------|-----|------|
1367
- | \`spawnTeam\` | Leader | Create team + task directory |
1368
- | \`discoverTeams\` | Anyone | List joinable teams |
1369
- | \`requestJoin\` | Teammate | Request to join existing team |
1370
- | \`approveJoin\` | Leader | Accept join request |
1371
- | \`write\` | Anyone | Message ONE teammate |
1372
- | \`broadcast\` | Anyone | Message ALL teammates (N messages — expensive, avoid) |
1373
- | \`requestShutdown\` | Leader | Ask teammate to exit |
1374
- | \`approveShutdown\` | Teammate | **MUST call** — sends confirmation, exits process |
1375
- | \`rejectShutdown\` | Teammate | Decline shutdown with reason |
1376
- | \`approvePlan\` | Leader | Approve plan_approval_request |
1377
- | \`rejectPlan\` | Leader | Reject plan with feedback |
1378
- | \`cleanup\` | Leader | Remove team + task files (all teammates must be shut down first) |
1379
-
1380
- ---
1381
-
1382
- ## Task System
1383
-
1384
- \`\`\`javascript
1385
- TaskCreate({ subject: "Step 1", description: "...", activeForm: "Working on step 1..." })
1386
- TaskList() // See all tasks + statuses
1387
- TaskGet({ taskId: "2" }) // Get full task details
1388
- TaskUpdate({ taskId: "2", addBlockedBy: ["1"] }) // Dependency — auto-unblocks when #1 completes
1389
- TaskUpdate({ taskId: "2", owner: "worker-1", status: "in_progress" })
1390
- TaskUpdate({ taskId: "2", status: "completed" })
1391
- \`\`\`
1392
-
1393
- ---
1394
-
1395
- ## Orchestration Patterns
1396
-
1397
- ### Pattern 1: Parallel Specialists
1398
- \`\`\`javascript
1399
- Teammate({ operation: "spawnTeam", team_name: "pr-review" })
1400
- // Spawn reviewers in ONE message (parallel execution)
1401
- Task({ team_name: "pr-review", name: "security", subagent_type: "general-purpose", prompt: "Review for security issues. Send findings to team-lead via Teammate write.", run_in_background: true })
1402
- Task({ team_name: "pr-review", name: "perf", subagent_type: "general-purpose", prompt: "Review for perf issues. Send findings to team-lead via Teammate write.", run_in_background: true })
1403
- // Collect from: cat ~/.claude/teams/pr-review/inboxes/team-lead.json
1404
- \`\`\`
1405
-
1406
- ### Pattern 2: Pipeline (Sequential Dependencies)
1407
- \`\`\`javascript
1408
- TaskCreate({ subject: "Research" }) // #1
1409
- TaskCreate({ subject: "Plan" }) // #2
1410
- TaskCreate({ subject: "Implement" }) // #3
1411
- TaskUpdate({ taskId: "2", addBlockedBy: ["1"] }) // #2 waits for #1
1412
- TaskUpdate({ taskId: "3", addBlockedBy: ["2"] }) // #3 waits for #2
1413
- // Spawn workers that poll TaskList and claim unblocked tasks
1414
- \`\`\`
1415
-
1416
- ### Pattern 3: Self-Organizing Swarm
1417
- \`\`\`javascript
1418
- // 1. Create N independent tasks (no dependencies)
1419
- // 2. Spawn M workers with this prompt loop:
1420
- // a. TaskList → find pending+unclaimed task
1421
- // b. TaskUpdate(claim) → TaskUpdate(in_progress) → do work
1422
- // c. TaskUpdate(completed) → Teammate write findings to team-lead → repeat
1423
- // d. If no tasks: notify team-lead idle, retry 3x, then exit
1424
- \`\`\`
1425
-
1426
- ### Pattern 4: Research → Implement (synchronous)
1427
- \`\`\`javascript
1428
- const research = await Task({ subagent_type: "general-purpose", prompt: "Research best practices for X..." })
1429
- Task({ subagent_type: "general-purpose", prompt: \`Implement based on research: \${research.content}\` })
1430
- \`\`\`
1431
-
1432
- ---
1433
-
1434
- ## Shutdown Sequence (always follow this order)
1435
-
1436
- \`\`\`javascript
1437
- // 1. Request shutdown for all teammates
1438
- Teammate({ operation: "requestShutdown", target_agent_id: "worker-1", reason: "All tasks complete" })
1439
- // 2. Wait for {"type": "shutdown_approved"} in inbox
1440
- // 3. Only then cleanup
1441
- Teammate({ operation: "cleanup" })
1442
- \`\`\`
1443
-
1444
- ---
1445
-
1446
- ## Spawn Backends
1447
-
1448
- | Backend | When auto-selected | Visibility |
1449
- |---------|-------------------|------------|
1450
- | \`in-process\` | Not in tmux/iTerm2 (default) | Hidden — no real-time output |
1451
- | \`tmux\` | Inside tmux session (\$TMUX set) | Visible — switch panes |
1452
- | \`iterm2\` | In iTerm2 + \`it2\` CLI installed | Visible — split panes |
1453
-
1454
- Force: \`export CLAUDE_CODE_SPAWN_BACKEND=tmux\`
1455
-
1456
- ---
1457
-
1458
- ## Best Practices
1459
-
1460
- 1. **Meaningful names**: \`security-reviewer\` not \`worker-1\`
1461
- 2. **Explicit prompts**: Numbered steps + "send findings to team-lead via Teammate write"
1462
- 3. **Use dependencies**: \`addBlockedBy\` — never poll manually
1463
- 4. **Prefer write over broadcast**: broadcast = N messages for N teammates
1464
- 5. **Always cleanup**: Don't leave orphaned teams
1465
- 6. **Worker failures**: 5-min heartbeat timeout; crashed worker tasks can be reclaimed by others
1466
-
1467
- ---
1468
-
1469
- ## Quick Reference
1470
-
1471
- \`\`\`javascript
1472
- // Subagent (returns result)
1473
- Task({ subagent_type: "Explore", description: "Find files", prompt: "..." })
1474
-
1475
- // Teammate (persistent, background)
1476
- Teammate({ operation: "spawnTeam", team_name: "my-team" })
1477
- Task({ team_name: "my-team", name: "worker", subagent_type: "general-purpose", prompt: "...", run_in_background: true })
1478
-
1479
- // Message teammate
1480
- Teammate({ operation: "write", target_agent_id: "worker-1", value: "..." })
1481
-
1482
- // Pipeline
1483
- TaskCreate({ subject: "Step 1" }) // → #1
1484
- TaskCreate({ subject: "Step 2" }) // → #2
1485
- TaskUpdate({ taskId: "2", addBlockedBy: ["1"] })
1486
-
1487
- // Shutdown
1488
- Teammate({ operation: "requestShutdown", target_agent_id: "worker-1" })
1489
- // wait for {"type": "shutdown_approved"} in inbox...
1490
- Teammate({ operation: "cleanup" })
1491
- \`\`\`
1492
-
1493
- ---
1494
-
2550
+ text: `# Claude Code Swarm Orchestration
2551
+
2552
+ Master multi-agent orchestration using Claude Code's TeammateTool and Task system.
2553
+
2554
+ ---
2555
+
2556
+ ## Primitives
2557
+
2558
+ | Primitive | What It Is |
2559
+ |-----------|-----------|
2560
+ | **Agent** | A Claude instance that can use tools. You are an agent. Subagents are agents you spawn. |
2561
+ | **Team** | A named group of agents working together. One leader, multiple teammates. Config: \`~/.claude/teams/{name}/config.json\` |
2562
+ | **Teammate** | An agent that joined a team. Has a name, color, inbox. Spawned via Task with \`team_name\` + \`name\`. |
2563
+ | **Leader** | The agent that created the team. Receives messages, approves plans/shutdowns. |
2564
+ | **Task** | A work item with subject, description, status, owner, and dependencies. |
2565
+ | **Inbox** | JSON file where an agent receives messages. \`~/.claude/teams/{name}/inboxes/{agent}.json\` |
2566
+ | **Backend** | How teammates run. Auto-detected: \`in-process\` (invisible), \`tmux\` (visible panes), \`iterm2\` (split panes). |
2567
+
2568
+ ---
2569
+
2570
+ ## Two Ways to Spawn Agents
2571
+
2572
+ ### Method 1: Task Tool (Subagents) — short-lived, returns result directly
2573
+ \`\`\`javascript
2574
+ Task({ subagent_type: "Explore", description: "Find auth files", prompt: "...", model: "haiku" })
2575
+ \`\`\`
2576
+
2577
+ ### Method 2: Task + team_name + name (Teammates) — persistent, communicates via inbox
2578
+ \`\`\`javascript
2579
+ Teammate({ operation: "spawnTeam", team_name: "my-project" })
2580
+ Task({ team_name: "my-project", name: "security-reviewer", subagent_type: "general-purpose", prompt: "...", run_in_background: true })
2581
+ \`\`\`
2582
+
2583
+ | Aspect | Task (subagent) | Task + team_name + name (teammate) |
2584
+ |--------|-----------------|-----------------------------------|
2585
+ | Lifespan | Until task complete | Until shutdown requested |
2586
+ | Communication | Return value | Inbox messages |
2587
+ | Task access | None | Shared task list |
2588
+ | Team membership | No | Yes |
2589
+
2590
+ ---
2591
+
2592
+ ## Built-in Agent Types
2593
+
2594
+ - **Bash** — command execution, git ops (tools: Bash only)
2595
+ - **Explore** — read-only codebase search, file finding (use \`model: "haiku"\`)
2596
+ - **Plan** — architecture + implementation plans (read-only tools)
2597
+ - **general-purpose** — all tools, multi-step research + action
2598
+ - **claude-code-guide** — questions about Claude Code, Agent SDK, Anthropic API
2599
+ - **statusline-setup** — configure Claude Code status line
2600
+
2601
+ ---
2602
+
2603
+ ## TeammateTool Operations
2604
+
2605
+ | Operation | Who | What |
2606
+ |-----------|-----|------|
2607
+ | \`spawnTeam\` | Leader | Create team + task directory |
2608
+ | \`discoverTeams\` | Anyone | List joinable teams |
2609
+ | \`requestJoin\` | Teammate | Request to join existing team |
2610
+ | \`approveJoin\` | Leader | Accept join request |
2611
+ | \`write\` | Anyone | Message ONE teammate |
2612
+ | \`broadcast\` | Anyone | Message ALL teammates (N messages — expensive, avoid) |
2613
+ | \`requestShutdown\` | Leader | Ask teammate to exit |
2614
+ | \`approveShutdown\` | Teammate | **MUST call** — sends confirmation, exits process |
2615
+ | \`rejectShutdown\` | Teammate | Decline shutdown with reason |
2616
+ | \`approvePlan\` | Leader | Approve plan_approval_request |
2617
+ | \`rejectPlan\` | Leader | Reject plan with feedback |
2618
+ | \`cleanup\` | Leader | Remove team + task files (all teammates must be shut down first) |
2619
+
2620
+ ---
2621
+
2622
+ ## Task System
2623
+
2624
+ \`\`\`javascript
2625
+ TaskCreate({ subject: "Step 1", description: "...", activeForm: "Working on step 1..." })
2626
+ TaskList() // See all tasks + statuses
2627
+ TaskGet({ taskId: "2" }) // Get full task details
2628
+ TaskUpdate({ taskId: "2", addBlockedBy: ["1"] }) // Dependency — auto-unblocks when #1 completes
2629
+ TaskUpdate({ taskId: "2", owner: "worker-1", status: "in_progress" })
2630
+ TaskUpdate({ taskId: "2", status: "completed" })
2631
+ \`\`\`
2632
+
2633
+ ---
2634
+
2635
+ ## Orchestration Patterns
2636
+
2637
+ ### Pattern 1: Parallel Specialists
2638
+ \`\`\`javascript
2639
+ Teammate({ operation: "spawnTeam", team_name: "pr-review" })
2640
+ // Spawn reviewers in ONE message (parallel execution)
2641
+ Task({ team_name: "pr-review", name: "security", subagent_type: "general-purpose", prompt: "Review for security issues. Send findings to team-lead via Teammate write.", run_in_background: true })
2642
+ Task({ team_name: "pr-review", name: "perf", subagent_type: "general-purpose", prompt: "Review for perf issues. Send findings to team-lead via Teammate write.", run_in_background: true })
2643
+ // Collect from: cat ~/.claude/teams/pr-review/inboxes/team-lead.json
2644
+ \`\`\`
2645
+
2646
+ ### Pattern 2: Pipeline (Sequential Dependencies)
2647
+ \`\`\`javascript
2648
+ TaskCreate({ subject: "Research" }) // #1
2649
+ TaskCreate({ subject: "Plan" }) // #2
2650
+ TaskCreate({ subject: "Implement" }) // #3
2651
+ TaskUpdate({ taskId: "2", addBlockedBy: ["1"] }) // #2 waits for #1
2652
+ TaskUpdate({ taskId: "3", addBlockedBy: ["2"] }) // #3 waits for #2
2653
+ // Spawn workers that poll TaskList and claim unblocked tasks
2654
+ \`\`\`
2655
+
2656
+ ### Pattern 3: Self-Organizing Swarm
2657
+ \`\`\`javascript
2658
+ // 1. Create N independent tasks (no dependencies)
2659
+ // 2. Spawn M workers with this prompt loop:
2660
+ // a. TaskList → find pending+unclaimed task
2661
+ // b. TaskUpdate(claim) → TaskUpdate(in_progress) → do work
2662
+ // c. TaskUpdate(completed) → Teammate write findings to team-lead → repeat
2663
+ // d. If no tasks: notify team-lead idle, retry 3x, then exit
2664
+ \`\`\`
2665
+
2666
+ ### Pattern 4: Research → Implement (synchronous)
2667
+ \`\`\`javascript
2668
+ const research = await Task({ subagent_type: "general-purpose", prompt: "Research best practices for X..." })
2669
+ Task({ subagent_type: "general-purpose", prompt: \`Implement based on research: \${research.content}\` })
2670
+ \`\`\`
2671
+
2672
+ ---
2673
+
2674
+ ## Shutdown Sequence (always follow this order)
2675
+
2676
+ \`\`\`javascript
2677
+ // 1. Request shutdown for all teammates
2678
+ Teammate({ operation: "requestShutdown", target_agent_id: "worker-1", reason: "All tasks complete" })
2679
+ // 2. Wait for {"type": "shutdown_approved"} in inbox
2680
+ // 3. Only then cleanup
2681
+ Teammate({ operation: "cleanup" })
2682
+ \`\`\`
2683
+
2684
+ ---
2685
+
2686
+ ## Spawn Backends
2687
+
2688
+ | Backend | When auto-selected | Visibility |
2689
+ |---------|-------------------|------------|
2690
+ | \`in-process\` | Not in tmux/iTerm2 (default) | Hidden — no real-time output |
2691
+ | \`tmux\` | Inside tmux session (\$TMUX set) | Visible — switch panes |
2692
+ | \`iterm2\` | In iTerm2 + \`it2\` CLI installed | Visible — split panes |
2693
+
2694
+ Force: \`export CLAUDE_CODE_SPAWN_BACKEND=tmux\`
2695
+
2696
+ ---
2697
+
2698
+ ## Best Practices
2699
+
2700
+ 1. **Meaningful names**: \`security-reviewer\` not \`worker-1\`
2701
+ 2. **Explicit prompts**: Numbered steps + "send findings to team-lead via Teammate write"
2702
+ 3. **Use dependencies**: \`addBlockedBy\` — never poll manually
2703
+ 4. **Prefer write over broadcast**: broadcast = N messages for N teammates
2704
+ 5. **Always cleanup**: Don't leave orphaned teams
2705
+ 6. **Worker failures**: 5-min heartbeat timeout; crashed worker tasks can be reclaimed by others
2706
+
2707
+ ---
2708
+
2709
+ ## Quick Reference
2710
+
2711
+ \`\`\`javascript
2712
+ // Subagent (returns result)
2713
+ Task({ subagent_type: "Explore", description: "Find files", prompt: "..." })
2714
+
2715
+ // Teammate (persistent, background)
2716
+ Teammate({ operation: "spawnTeam", team_name: "my-team" })
2717
+ Task({ team_name: "my-team", name: "worker", subagent_type: "general-purpose", prompt: "...", run_in_background: true })
2718
+
2719
+ // Message teammate
2720
+ Teammate({ operation: "write", target_agent_id: "worker-1", value: "..." })
2721
+
2722
+ // Pipeline
2723
+ TaskCreate({ subject: "Step 1" }) // → #1
2724
+ TaskCreate({ subject: "Step 2" }) // → #2
2725
+ TaskUpdate({ taskId: "2", addBlockedBy: ["1"] })
2726
+
2727
+ // Shutdown
2728
+ Teammate({ operation: "requestShutdown", target_agent_id: "worker-1" })
2729
+ // wait for {"type": "shutdown_approved"} in inbox...
2730
+ Teammate({ operation: "cleanup" })
2731
+ \`\`\`
2732
+
2733
+ ---
2734
+
1495
2735
  *Source: kieranklaassen/orchestrating-swarms gist — Claude Code v2.1.19*`,
1496
2736
  },
1497
2737
  },
@@ -1505,70 +2745,70 @@ Teammate({ operation: "cleanup" })
1505
2745
  role: "user",
1506
2746
  content: {
1507
2747
  type: "text",
1508
- text: `# The Thompson Protocol — "Calculus Made Easy" for AI Content
1509
-
1510
- You are running the Thompson Protocol content pipeline. This is a multi-agent system
1511
- that transforms complex topics into content that makes the reader feel smart.
1512
-
1513
- Named after Silvanus P. Thompson, who wrote "Calculus Made Easy" (1910) by attacking
1514
- the "preliminary terrors" — the intimidating jargon and elitist gatekeeping — before
1515
- teaching any mechanics.
1516
-
1517
- ## Pipeline (execute in order)
1518
-
1519
- ### Step 1: Initialize
1520
- \`\`\`
1521
- thompson_pipeline({ topic: "<your topic>", target_audience: "<audience>", output_format: "script|article|thread|explainer" })
1522
- \`\`\`
1523
- This returns the full execution plan with system prompts for each agent.
1524
-
1525
- ### Step 2: Write (Thompson Writer)
1526
- \`\`\`
1527
- thompson_write({ topic: "<topic>", target_audience: "<audience>" })
1528
- \`\`\`
1529
- Then use \`call_llm\` with the returned system_prompt to generate plain-English content.
1530
- Every technical term MUST have an "in other words..." analogy.
1531
-
1532
- ### Step 3: Edit (Feynman Editor — max 3 cycles)
1533
- \`\`\`
1534
- thompson_feynman_edit({ sections: "<writer output>", rewrite_cycle: 1 })
1535
- \`\`\`
1536
- The Skeptical Beginner reviews against 8 rejection criteria.
1537
- If any section gets REWRITE → send back to thompson_write with fix instructions.
1538
- Loop max 3 times. After 3, escalate stuck sections.
1539
-
1540
- ### Step 4: Visual Map
1541
- \`\`\`
1542
- thompson_visual_map({ sections: "<approved sections>", visual_style: "line_art" })
1543
- \`\`\`
1544
- Generates image prompts that map 1:1 with text analogies. No generic b-roll.
1545
-
1546
- ### Step 5: Anti-Elitism Lint
1547
- \`\`\`
1548
- thompson_anti_elitism_lint({ content: "<full text>" })
1549
- \`\`\`
1550
- Deterministic scan: 22 banned phrases, readability metrics, jargon density.
1551
- Zero LLM cost — pure regex + math.
1552
-
1553
- ### Step 6: Quality Gate
1554
- \`\`\`
1555
- thompson_quality_gate({ writer_output: "...", feynman_verdict: "...", lint_result: "..." })
1556
- \`\`\`
1557
- 10-point boolean checklist → grade (exemplary/passing/needs_work/failing).
1558
- Only distribute if passing or exemplary.
1559
-
1560
- ## Core Principles (non-negotiable)
1561
- 1. **Plain English Mandate**: Every jargon term gets an "in other words..." with a household analogy
1562
- 2. **Intuition Before Mechanics**: Explain WHY before HOW
1563
- 3. **Acknowledge Difficulty**: Validate reader confusion ("This sounds terrifying, but...")
1564
- 4. **No Elitism**: Ban "it is obvious", "as we all know", "simply put", "just do X"
1565
- 5. **Progressive Complexity**: Start with simplest true statement, layer up
1566
- 6. **Visual = Analogy**: Every visual reinforces a specific text metaphor, 1:1
1567
- 7. **12-Year-Old Bar**: If a 12-year-old can't understand it, rewrite it
1568
-
1569
- ## After Pipeline
1570
- - \`save_session_note\` — persist Thompson-processed content
1571
- - \`record_learning\` — log which analogies and styles worked best
2748
+ text: `# The Thompson Protocol — "Calculus Made Easy" for AI Content
2749
+
2750
+ You are running the Thompson Protocol content pipeline. This is a multi-agent system
2751
+ that transforms complex topics into content that makes the reader feel smart.
2752
+
2753
+ Named after Silvanus P. Thompson, who wrote "Calculus Made Easy" (1910) by attacking
2754
+ the "preliminary terrors" — the intimidating jargon and elitist gatekeeping — before
2755
+ teaching any mechanics.
2756
+
2757
+ ## Pipeline (execute in order)
2758
+
2759
+ ### Step 1: Initialize
2760
+ \`\`\`
2761
+ thompson_pipeline({ topic: "<your topic>", target_audience: "<audience>", output_format: "script|article|thread|explainer" })
2762
+ \`\`\`
2763
+ This returns the full execution plan with system prompts for each agent.
2764
+
2765
+ ### Step 2: Write (Thompson Writer)
2766
+ \`\`\`
2767
+ thompson_write({ topic: "<topic>", target_audience: "<audience>" })
2768
+ \`\`\`
2769
+ Then use \`call_llm\` with the returned system_prompt to generate plain-English content.
2770
+ Every technical term MUST have an "in other words..." analogy.
2771
+
2772
+ ### Step 3: Edit (Feynman Editor — max 3 cycles)
2773
+ \`\`\`
2774
+ thompson_feynman_edit({ sections: "<writer output>", rewrite_cycle: 1 })
2775
+ \`\`\`
2776
+ The Skeptical Beginner reviews against 8 rejection criteria.
2777
+ If any section gets REWRITE → send back to thompson_write with fix instructions.
2778
+ Loop max 3 times. After 3, escalate stuck sections.
2779
+
2780
+ ### Step 4: Visual Map
2781
+ \`\`\`
2782
+ thompson_visual_map({ sections: "<approved sections>", visual_style: "line_art" })
2783
+ \`\`\`
2784
+ Generates image prompts that map 1:1 with text analogies. No generic b-roll.
2785
+
2786
+ ### Step 5: Anti-Elitism Lint
2787
+ \`\`\`
2788
+ thompson_anti_elitism_lint({ content: "<full text>" })
2789
+ \`\`\`
2790
+ Deterministic scan: 22 banned phrases, readability metrics, jargon density.
2791
+ Zero LLM cost — pure regex + math.
2792
+
2793
+ ### Step 6: Quality Gate
2794
+ \`\`\`
2795
+ thompson_quality_gate({ writer_output: "...", feynman_verdict: "...", lint_result: "..." })
2796
+ \`\`\`
2797
+ 10-point boolean checklist → grade (exemplary/passing/needs_work/failing).
2798
+ Only distribute if passing or exemplary.
2799
+
2800
+ ## Core Principles (non-negotiable)
2801
+ 1. **Plain English Mandate**: Every jargon term gets an "in other words..." with a household analogy
2802
+ 2. **Intuition Before Mechanics**: Explain WHY before HOW
2803
+ 3. **Acknowledge Difficulty**: Validate reader confusion ("This sounds terrifying, but...")
2804
+ 4. **No Elitism**: Ban "it is obvious", "as we all know", "simply put", "just do X"
2805
+ 5. **Progressive Complexity**: Start with simplest true statement, layer up
2806
+ 6. **Visual = Analogy**: Every visual reinforces a specific text metaphor, 1:1
2807
+ 7. **12-Year-Old Bar**: If a 12-year-old can't understand it, rewrite it
2808
+
2809
+ ## After Pipeline
2810
+ - \`save_session_note\` — persist Thompson-processed content
2811
+ - \`record_learning\` — log which analogies and styles worked best
1572
2812
  - Use \`content_publish\` workflow chain for distribution`,
1573
2813
  },
1574
2814
  },
@@ -1578,21 +2818,21 @@ Only distribute if passing or exemplary.
1578
2818
  // Server instructions — tells Claude Code Tool Search (and other clients) when to search
1579
2819
  // for NodeBench tools. This is the key integration point for lazy loading compatibility.
1580
2820
  // See: https://www.anthropic.com/engineering/advanced-tool-use
1581
- const SERVER_INSTRUCTIONS = `NodeBench MCP provides structured AI development methodology tools.
1582
- Use NodeBench tools when you need to:
1583
- - Verify implementations (verification cycles, gap tracking, 6-phase flywheel)
1584
- - Run evaluations and quality gates before shipping code
1585
- - Search prior knowledge and record learnings across sessions
1586
- - Assess risk before taking actions
1587
- - Coordinate parallel agents (task locks, roles, context budget)
1588
- - Research with structured recon (web search, GitHub, RSS feeds)
1589
- - Analyze files (CSV, PDF, XLSX, images, audio, ZIP)
1590
- - Run security audits (dependency scanning, code analysis, secrets detection)
1591
- - Write and polish academic papers
1592
- - Audit SEO, analyze Figma flows, detect Android flicker
1593
- - Call LLMs (GPT, Claude, Gemini) for analysis and extraction
2821
+ const SERVER_INSTRUCTIONS = `NodeBench MCP provides structured AI development methodology tools.
2822
+ Use NodeBench tools when you need to:
2823
+ - Verify implementations (verification cycles, gap tracking, 6-phase flywheel)
2824
+ - Run evaluations and quality gates before shipping code
2825
+ - Search prior knowledge and record learnings across sessions
2826
+ - Assess risk before taking actions
2827
+ - Coordinate parallel agents (task locks, roles, context budget)
2828
+ - Research with structured recon (web search, GitHub, RSS feeds)
2829
+ - Analyze files (CSV, PDF, XLSX, images, audio, ZIP)
2830
+ - Run security audits (dependency scanning, code analysis, secrets detection)
2831
+ - Write and polish academic papers
2832
+ - Audit SEO, analyze Figma flows, detect Android flicker
2833
+ - Call LLMs (GPT, Claude, Gemini) for analysis and extraction
1594
2834
  Start with discover_tools("<your task>") to find the right tool.`;
1595
- const server = new Server({ name: "nodebench-mcp-methodology", version: "2.30.0" }, {
2835
+ const server = new Server({ name: "nodebench-mcp-methodology", version: "2.32.0" }, {
1596
2836
  capabilities: { tools: { listChanged: true }, prompts: {} },
1597
2837
  instructions: SERVER_INSTRUCTIONS,
1598
2838
  });
@@ -1605,10 +2845,12 @@ try {
1605
2845
  catch { /* instrumentation must not block server start */ }
1606
2846
  // Handle tools/list — return all tools with their JSON Schema inputSchemas
1607
2847
  // Includes MCP 2025-11-25 spec annotations: category, phase, complexity (model tier hint)
2848
+ // + MCP security annotations: readOnlyHint, destructiveHint, openWorldHint
1608
2849
  server.setRequestHandler(ListToolsRequestSchema, async () => {
1609
2850
  return {
1610
2851
  tools: allTools.map((t) => {
1611
2852
  const entry = TOOL_REGISTRY.get(t.name);
2853
+ const securityAnnotations = getToolAnnotations(t.name);
1612
2854
  return {
1613
2855
  name: t.name,
1614
2856
  description: t.description,
@@ -1619,8 +2861,13 @@ server.setRequestHandler(ListToolsRequestSchema, async () => {
1619
2861
  category: entry.category,
1620
2862
  phase: entry.phase,
1621
2863
  complexity: getToolComplexity(t.name),
2864
+ ...securityAnnotations,
2865
+ },
2866
+ } : {
2867
+ annotations: {
2868
+ ...securityAnnotations,
1622
2869
  },
1623
- } : {}),
2870
+ }),
1624
2871
  };
1625
2872
  }),
1626
2873
  };
@@ -1631,6 +2878,14 @@ server.setRequestHandler(CallToolRequestSchema, async (request) => {
1631
2878
  _abToolCallCount++;
1632
2879
  if (name === "load_toolset" || name === "unload_toolset")
1633
2880
  _abLoadEventCount++;
2881
+ // Intent-based auto-expansion: on first call, classify and load relevant toolsets
2882
+ if (!_intentClassified) {
2883
+ _intentClassified = true;
2884
+ const expanded = classifyAndExpand(name, args);
2885
+ if (expanded) {
2886
+ console.error(`[intent-classify] Auto-loaded toolsets: ${expanded.join(", ")} (from tool: ${name})`);
2887
+ }
2888
+ }
1634
2889
  const tool = toolMap.get(name);
1635
2890
  if (!tool) {
1636
2891
  return {
@@ -1694,18 +2949,30 @@ server.setRequestHandler(CallToolRequestSchema, async (request) => {
1694
2949
  else {
1695
2950
  serialized = JSON.stringify(enrichedResult, null, 2);
1696
2951
  }
2952
+ // Security: redact credentials from all tool outputs (single enforcement point)
2953
+ const sanitized = redactSecrets(serialized);
1697
2954
  const contentBlocks = [
1698
- { type: "text", text: serialized },
2955
+ { type: "text", text: sanitized },
1699
2956
  ];
1700
2957
  if (hookHint) {
1701
2958
  contentBlocks.push({ type: "text", text: hookHint });
1702
2959
  }
2960
+ // Audit log: successful tool call
2961
+ auditLog("tool_call", name, JSON.stringify(args ?? {}).substring(0, 200), true);
1703
2962
  return {
1704
2963
  content: contentBlocks,
1705
2964
  isError: false,
1706
2965
  };
1707
2966
  }
1708
2967
  catch (err) {
2968
+ // Security errors get a clean response (not a stack trace)
2969
+ if (err instanceof SecurityError) {
2970
+ auditLog("tool_call", name, JSON.stringify(args ?? {}).substring(0, 200), false, err.message);
2971
+ return {
2972
+ content: [{ type: "text", text: `[SECURITY] ${err.message}` }],
2973
+ isError: true,
2974
+ };
2975
+ }
1709
2976
  resultStatus = "error";
1710
2977
  errorMsg = err?.message || "Internal error";
1711
2978
  // Auto-log errors to main DB
@@ -1756,13 +3023,13 @@ process.on('exit', () => {
1756
3023
  try {
1757
3024
  const db = getDb();
1758
3025
  const dynamicallyLoaded = [...activeToolsets].filter(ts => !initialToolsetNames.has(ts));
1759
- db.prepare(`UPDATE ab_test_sessions SET
1760
- final_tool_count = ?,
1761
- toolsets_loaded = ?,
1762
- total_tool_calls = ?,
1763
- total_load_events = ?,
1764
- session_duration_ms = ?,
1765
- ended_at = datetime('now')
3026
+ db.prepare(`UPDATE ab_test_sessions SET
3027
+ final_tool_count = ?,
3028
+ toolsets_loaded = ?,
3029
+ total_tool_calls = ?,
3030
+ total_load_events = ?,
3031
+ session_duration_ms = ?,
3032
+ ended_at = datetime('now')
1766
3033
  WHERE id = ?`).run(allTools.length, JSON.stringify(dynamicallyLoaded), _abToolCallCount, _abLoadEventCount, Date.now() - _abStartMs, SESSION_ID);
1767
3034
  }
1768
3035
  catch { /* instrumentation must not block shutdown */ }
@@ -1792,6 +3059,15 @@ if (useEngine) {
1792
3059
  }
1793
3060
  catch { /* engine is optional — don't block MCP */ }
1794
3061
  }
3062
+ // Start observability watchdog (non-blocking, best-effort)
3063
+ try {
3064
+ initObservability(getDb);
3065
+ startWatchdog(getDb());
3066
+ }
3067
+ catch { /* observability is optional — don't block MCP */ }
3068
+ // Graceful shutdown
3069
+ process.on("SIGINT", () => { stopWatchdog(); process.exit(0); });
3070
+ process.on("SIGTERM", () => { stopWatchdog(); process.exit(0); });
1795
3071
  const toolsetInfo = cliArgs.includes("--toolsets") || cliArgs.includes("--exclude") || cliArgs.includes("--preset")
1796
3072
  ? ` [gated: ${domainTools.length} domain + 2 meta]`
1797
3073
  : "";