nodebench-mcp 2.31.2 → 2.33.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (97)
  1. package/README.md +14 -6
  2. package/dist/engine/server.js +14 -4
  3. package/dist/engine/server.js.map +1 -1
  4. package/dist/index.js +1581 -670
  5. package/dist/index.js.map +1 -1
  6. package/dist/security/SecurityError.d.ts +18 -0
  7. package/dist/security/SecurityError.js +22 -0
  8. package/dist/security/SecurityError.js.map +1 -0
  9. package/dist/security/__tests__/security.test.d.ts +8 -0
  10. package/dist/security/__tests__/security.test.js +295 -0
  11. package/dist/security/__tests__/security.test.js.map +1 -0
  12. package/dist/security/auditLog.d.ts +36 -0
  13. package/dist/security/auditLog.js +178 -0
  14. package/dist/security/auditLog.js.map +1 -0
  15. package/dist/security/commandSandbox.d.ts +33 -0
  16. package/dist/security/commandSandbox.js +159 -0
  17. package/dist/security/commandSandbox.js.map +1 -0
  18. package/dist/security/config.d.ts +23 -0
  19. package/dist/security/config.js +43 -0
  20. package/dist/security/config.js.map +1 -0
  21. package/dist/security/credentialRedactor.d.ts +22 -0
  22. package/dist/security/credentialRedactor.js +118 -0
  23. package/dist/security/credentialRedactor.js.map +1 -0
  24. package/dist/security/index.d.ts +20 -0
  25. package/dist/security/index.js +21 -0
  26. package/dist/security/index.js.map +1 -0
  27. package/dist/security/pathSandbox.d.ts +23 -0
  28. package/dist/security/pathSandbox.js +160 -0
  29. package/dist/security/pathSandbox.js.map +1 -0
  30. package/dist/security/urlValidator.d.ts +23 -0
  31. package/dist/security/urlValidator.js +125 -0
  32. package/dist/security/urlValidator.js.map +1 -0
  33. package/dist/tools/agentBootstrapTools.js +22 -29
  34. package/dist/tools/agentBootstrapTools.js.map +1 -1
  35. package/dist/tools/contextSandboxTools.js +7 -9
  36. package/dist/tools/contextSandboxTools.js.map +1 -1
  37. package/dist/tools/deepSimTools.d.ts +2 -0
  38. package/dist/tools/deepSimTools.js +404 -0
  39. package/dist/tools/deepSimTools.js.map +1 -0
  40. package/dist/tools/dimensionTools.d.ts +2 -0
  41. package/dist/tools/dimensionTools.js +246 -0
  42. package/dist/tools/dimensionTools.js.map +1 -0
  43. package/dist/tools/executionTraceTools.d.ts +2 -0
  44. package/dist/tools/executionTraceTools.js +446 -0
  45. package/dist/tools/executionTraceTools.js.map +1 -0
  46. package/dist/tools/founderTools.d.ts +13 -0
  47. package/dist/tools/founderTools.js +595 -0
  48. package/dist/tools/founderTools.js.map +1 -0
  49. package/dist/tools/founderTrackingTools.d.ts +9 -0
  50. package/dist/tools/founderTrackingTools.js +644 -0
  51. package/dist/tools/founderTrackingTools.js.map +1 -0
  52. package/dist/tools/gitWorkflowTools.js +14 -10
  53. package/dist/tools/gitWorkflowTools.js.map +1 -1
  54. package/dist/tools/githubTools.js +19 -2
  55. package/dist/tools/githubTools.js.map +1 -1
  56. package/dist/tools/index.d.ts +87 -0
  57. package/dist/tools/index.js +102 -0
  58. package/dist/tools/index.js.map +1 -0
  59. package/dist/tools/localFileTools.js +24 -12
  60. package/dist/tools/localFileTools.js.map +1 -1
  61. package/dist/tools/memoryDecay.d.ts +70 -0
  62. package/dist/tools/memoryDecay.js +247 -0
  63. package/dist/tools/memoryDecay.js.map +1 -0
  64. package/dist/tools/missionHarnessTools.d.ts +32 -0
  65. package/dist/tools/missionHarnessTools.js +972 -0
  66. package/dist/tools/missionHarnessTools.js.map +1 -0
  67. package/dist/tools/observabilityTools.d.ts +15 -0
  68. package/dist/tools/observabilityTools.js +787 -0
  69. package/dist/tools/observabilityTools.js.map +1 -0
  70. package/dist/tools/openclawTools.js +151 -36
  71. package/dist/tools/openclawTools.js.map +1 -1
  72. package/dist/tools/progressiveDiscoveryTools.js +5 -4
  73. package/dist/tools/progressiveDiscoveryTools.js.map +1 -1
  74. package/dist/tools/qualityGateTools.js +118 -2
  75. package/dist/tools/qualityGateTools.js.map +1 -1
  76. package/dist/tools/rssTools.js +3 -0
  77. package/dist/tools/rssTools.js.map +1 -1
  78. package/dist/tools/scraplingTools.js +15 -0
  79. package/dist/tools/scraplingTools.js.map +1 -1
  80. package/dist/tools/seoTools.js +66 -1
  81. package/dist/tools/seoTools.js.map +1 -1
  82. package/dist/tools/sessionMemoryTools.js +50 -11
  83. package/dist/tools/sessionMemoryTools.js.map +1 -1
  84. package/dist/tools/temporalIntelligenceTools.d.ts +12 -0
  85. package/dist/tools/temporalIntelligenceTools.js +1068 -0
  86. package/dist/tools/temporalIntelligenceTools.js.map +1 -0
  87. package/dist/tools/toolRegistry.d.ts +19 -0
  88. package/dist/tools/toolRegistry.js +956 -31
  89. package/dist/tools/toolRegistry.js.map +1 -1
  90. package/dist/tools/webTools.js +14 -1
  91. package/dist/tools/webTools.js.map +1 -1
  92. package/dist/tools/webmcpTools.js +13 -2
  93. package/dist/tools/webmcpTools.js.map +1 -1
  94. package/dist/toolsetRegistry.js +14 -0
  95. package/dist/toolsetRegistry.js.map +1 -1
  96. package/dist/types.d.ts +10 -0
  97. package/package.json +124 -124
package/dist/index.js CHANGED
@@ -20,6 +20,7 @@ import { Server } from "@modelcontextprotocol/sdk/server/index.js";
20
20
  import { StdioServerTransport } from "@modelcontextprotocol/sdk/server/stdio.js";
21
21
  import { ListToolsRequestSchema, CallToolRequestSchema, ListPromptsRequestSchema, GetPromptRequestSchema, } from "@modelcontextprotocol/sdk/types.js";
22
22
  import { getDb, genId } from "./db.js";
23
+ import { redactSecrets, auditLog, SecurityError } from "./security/index.js";
23
24
  import { startDashboardServer } from "./dashboard/server.js";
24
25
  import { startEngineServer } from "./engine/server.js";
25
26
  import { getAnalyticsDb, closeAnalyticsDb, clearOldRecords } from "./analytics/index.js";
@@ -27,9 +28,10 @@ import { AnalyticsTracker } from "./analytics/toolTracker.js";
27
28
  import { generateSmartPreset, formatPresetRecommendation, listPresets } from "./analytics/index.js";
28
29
  import { getProjectUsageSummary, exportUsageStats, formatStatsDisplay } from "./analytics/index.js";
29
30
  import { TOOLSET_MAP, TOOL_TO_TOOLSET } from "./toolsetRegistry.js";
31
+ import { initObservability, startWatchdog, stopWatchdog } from "./tools/observabilityTools.js";
30
32
  import { createMetaTools } from "./tools/metaTools.js";
31
33
  import { createProgressiveDiscoveryTools } from "./tools/progressiveDiscoveryTools.js";
32
- import { getQuickRef, ALL_REGISTRY_ENTRIES, TOOL_REGISTRY, getToolComplexity, _setDbAccessor, hybridSearch, WORKFLOW_CHAINS } from "./tools/toolRegistry.js";
34
+ import { getQuickRef, ALL_REGISTRY_ENTRIES, TOOL_REGISTRY, getToolComplexity, getToolAnnotations, _setDbAccessor, hybridSearch, WORKFLOW_CHAINS } from "./tools/toolRegistry.js";
33
35
  // TOON format — ~40% token savings on tool responses
34
36
  import { encode as toonEncode } from "@toon-format/toon";
35
37
  // Embedding provider — neural semantic search
@@ -44,29 +46,35 @@ const exportStats = cliArgs.includes("--export-stats");
44
46
  const resetStats = cliArgs.includes("--reset-stats");
45
47
  const listPresetsFlag = cliArgs.includes("--list-presets");
46
48
  const healthFlag = cliArgs.includes("--health");
49
+ const statusFlag = cliArgs.includes("--status");
50
+ const diagnoseFlag = cliArgs.includes("--diagnose");
47
51
  const autoPresetFlag = cliArgs.includes("--auto-preset");
52
+ const syncConfigsFlag = cliArgs.includes("--sync-configs");
48
53
  const useEngine = cliArgs.includes("--engine");
49
54
  const engineSecret = (() => {
50
55
  const idx = cliArgs.indexOf("--engine-secret");
51
56
  return idx >= 0 && idx + 1 < cliArgs.length ? cliArgs[idx + 1] : process.env.ENGINE_SECRET;
52
57
  })();
53
58
  export { TOOLSET_MAP };
54
- const DEFAULT_TOOLSETS = ["verification", "eval", "quality_gate", "learning", "flywheel", "recon", "security", "boilerplate", "skill_update", "context_sandbox"];
59
+ const DEFAULT_TOOLSETS = ["verification", "eval", "quality_gate", "learning", "flywheel", "recon", "security", "boilerplate", "skill_update", "context_sandbox", "observability", "execution_trace", "mission_harness", "deep_sim", "founder"];
55
60
  const PRESETS = {
56
61
  default: DEFAULT_TOOLSETS,
57
- // Themed presets — bridge between default (50 tools) and full (175 tools)
62
+ // Themed presets — bridge between default (81 tools) and full (295 tools)
58
63
  web_dev: [...DEFAULT_TOOLSETS, "ui_capture", "vision", "web", "seo", "git_workflow", "architect", "ui_ux_dive", "ui_ux_dive_v2", "mcp_bridge", "qa_orchestration", "visual_qa", "design_governance", "web_scraping"],
59
- research: [...DEFAULT_TOOLSETS, "web", "llm", "rss", "email", "docs", "research_optimizer", "web_scraping"],
60
- data: [...DEFAULT_TOOLSETS, "local_file", "llm", "web", "research_optimizer", "web_scraping"],
64
+ research: [...DEFAULT_TOOLSETS, "web", "llm", "rss", "email", "docs", "research_optimizer", "web_scraping", "temporal_intelligence", "deep_sim"],
65
+ data: [...DEFAULT_TOOLSETS, "local_file", "llm", "web", "research_optimizer", "web_scraping", "temporal_intelligence"],
61
66
  devops: [...DEFAULT_TOOLSETS, "git_workflow", "session_memory", "benchmark", "pattern"],
62
67
  mobile: [...DEFAULT_TOOLSETS, "ui_capture", "vision", "flicker_detection", "ui_ux_dive", "ui_ux_dive_v2", "mcp_bridge", "visual_qa"],
63
68
  academic: [...DEFAULT_TOOLSETS, "research_writing", "llm", "web", "local_file"],
64
- multi_agent: [...DEFAULT_TOOLSETS, "parallel", "self_eval", "session_memory", "pattern", "toon", "qa_orchestration", "agent_traverse", "engine_context", "research_optimizer", "web_scraping"],
69
+ multi_agent: [...DEFAULT_TOOLSETS, "parallel", "self_eval", "session_memory", "pattern", "toon", "qa_orchestration", "agent_traverse", "engine_context", "research_optimizer", "web_scraping", "deep_sim"],
65
70
  content: [...DEFAULT_TOOLSETS, "llm", "critter", "email", "rss", "platform", "architect", "local_dashboard", "engine_context", "thompson_protocol"],
71
+ // Cursor IDE has a hard 40-tool limit across ALL MCP servers.
72
+ // 28 tools = 22 domain + 3 meta + 3 discovery — leaves 12 slots for other servers.
73
+ cursor: ["deep_sim", "quality_gate", "learning", "session_memory", "web", "toon"],
66
74
  full: Object.keys(TOOLSET_MAP),
67
75
  };
68
76
  const PRESET_DESCRIPTIONS = {
69
- default: "Core AI Flywheel — verification, eval, quality gates, learning, recon",
77
+ default: "Core AI Flywheel — verification, eval, quality gates, learning, recon, mission harness",
70
78
  web_dev: "Web projects — adds visual QA, SEO audit, git workflow, code architecture",
71
79
  research: "Research workflows — adds web search, LLM calls, RSS feeds, email, docs",
72
80
  data: "Data analysis — adds CSV/XLSX/PDF/JSON parsing, LLM extraction, web fetch",
@@ -75,6 +83,7 @@ const PRESET_DESCRIPTIONS = {
75
83
  academic: "Academic papers — adds polish, review, translate, logic check, data analysis",
76
84
  multi_agent: "Multi-agent teams — adds task locking, messaging, roles, oracle testing, frontend traversal",
77
85
  content: "Content & publishing — adds LLM, accountability, email, RSS, platform queue",
86
+ cursor: "Cursor IDE (28 tools) — decision intelligence, research, quality gates, session memory, web, TOON encoding. Leaves 12 slots for other MCP servers.",
78
87
  full: "Everything — all toolsets for maximum coverage",
79
88
  };
80
89
  function parseToolsets() {
@@ -101,6 +110,9 @@ function parseToolsets() {
101
110
  " --engine-secret <s> Require Bearer token for engine API (or set ENGINE_SECRET env var)",
102
111
  " --explain <tool> Show plain-English explanation of a tool and exit",
103
112
  " --health Run diagnostic health check and exit",
113
+ " --status Show live system pulse (uptime, errors, call rates) and exit",
114
+ " --diagnose Run drift detection + auto-heal and exit",
115
+ " --sync-configs Write MCP config to Claude Code, Cursor, and Windsurf IDE locations",
104
116
  " --help Show this help and exit",
105
117
  "",
106
118
  "Available toolsets:",
@@ -113,12 +125,12 @@ function parseToolsets() {
113
125
  }),
114
126
  "",
115
127
  "Examples:",
116
- " npx nodebench-mcp # Default (50 tools) - core AI Flywheel",
128
+ " npx nodebench-mcp # Default (81 tools) - core AI Flywheel",
117
129
  " npx nodebench-mcp --preset web_dev # Web development (+ vision, SEO, git)",
118
130
  " npx nodebench-mcp --preset research # Research workflows (+ web, LLM, RSS, email)",
119
131
  " npx nodebench-mcp --preset data # Data analysis (+ local file parsing, LLM)",
120
132
  " npx nodebench-mcp --preset academic # Academic writing (+ paper tools, LLM)",
121
- " npx nodebench-mcp --preset full # All 175 tools",
133
+ " npx nodebench-mcp --preset full # All 295 tools",
122
134
  " npx nodebench-mcp --smart-preset # Get AI-powered preset recommendation",
123
135
  " npx nodebench-mcp --stats # Show usage statistics",
124
136
  " npx nodebench-mcp --toolsets verification,eval,recon",
@@ -562,6 +574,637 @@ if (healthFlag) {
562
574
  console.log(lines.join("\n"));
563
575
  process.exit(0);
564
576
  }
577
+ // ── Status CLI handler (run-and-exit) ─────────────────────────────────
578
+ if (statusFlag) {
579
+ const os = await import("node:os");
580
+ const path = await import("node:path");
581
+ const fs = await import("node:fs");
582
+ const USE_COLOR = process.stdout.isTTY;
583
+ const B = USE_COLOR ? "\x1b[1m" : "";
584
+ const C = USE_COLOR ? "\x1b[36m" : "";
585
+ const G = USE_COLOR ? "\x1b[32m" : "";
586
+ const Y = USE_COLOR ? "\x1b[33m" : "";
587
+ const R = USE_COLOR ? "\x1b[31m" : "";
588
+ const X = USE_COLOR ? "\x1b[0m" : "";
589
+ const dir = path.join(os.homedir(), ".nodebench");
590
+ const dbPath = path.join(dir, "nodebench.db");
591
+ if (!fs.existsSync(dbPath)) {
592
+ console.error("No database found. Run the MCP server first to initialize.");
593
+ process.exit(1);
594
+ }
595
+ // Open DB directly for status query
596
+ const Database = (await import("better-sqlite3")).default;
597
+ const db = new Database(dbPath, { readonly: true });
598
+ const lines = [];
599
+ lines.push(`${B}NodeBench MCP — System Status${X}`);
600
+ lines.push("");
601
+ // Uptime info from DB (last tool call as proxy for when server was active)
602
+ try {
603
+ const recent = db.prepare(`SELECT COUNT(*) as cnt FROM tool_call_log WHERE created_at > datetime('now', '-1 hour')`).get();
604
+ const today = db.prepare(`SELECT COUNT(*) as cnt FROM tool_call_log WHERE created_at > datetime('now', '-24 hours')`).get();
605
+ const week = db.prepare(`SELECT COUNT(*) as cnt FROM tool_call_log WHERE created_at > datetime('now', '-7 days')`).get();
606
+ const errors1h = db.prepare(`SELECT COUNT(*) as cnt FROM tool_call_log WHERE result_status = 'error' AND created_at > datetime('now', '-1 hour')`).get();
607
+ const errors24h = db.prepare(`SELECT COUNT(*) as cnt FROM tool_call_log WHERE result_status = 'error' AND created_at > datetime('now', '-24 hours')`).get();
608
+ lines.push(`${C}Call Volume${X}`);
609
+ lines.push(` Last 1h: ${recent.cnt} calls (${errors1h.cnt} errors)`);
610
+ lines.push(` Last 24h: ${today.cnt} calls (${errors24h.cnt} errors)`);
611
+ lines.push(` Last 7d: ${week.cnt} calls`);
612
+ const rate1h = recent.cnt > 0 ? ((recent.cnt - errors1h.cnt) / recent.cnt * 100).toFixed(1) : "N/A";
613
+ const rate24h = today.cnt > 0 ? ((today.cnt - errors24h.cnt) / today.cnt * 100).toFixed(1) : "N/A";
614
+ lines.push(` Success: ${rate1h}% (1h) / ${rate24h}% (24h)`);
615
+ lines.push("");
616
+ // Top 5 tools
617
+ const topTools = db.prepare(`SELECT tool_name, COUNT(*) as calls, SUM(CASE WHEN result_status='error' THEN 1 ELSE 0 END) as errs, ROUND(AVG(duration_ms)) as avg_ms
618
+ FROM tool_call_log WHERE created_at > datetime('now', '-24 hours')
619
+ GROUP BY tool_name ORDER BY calls DESC LIMIT 5`).all();
620
+ if (topTools.length > 0) {
621
+ lines.push(`${C}Top Tools (24h)${X}`);
622
+ for (const t of topTools) {
623
+ const errTag = t.errs > 0 ? ` ${R}${t.errs} err${X}` : "";
624
+ lines.push(` ${t.calls.toString().padStart(4)} ${t.tool_name.padEnd(30)} ${t.avg_ms}ms avg${errTag}`);
625
+ }
626
+ lines.push("");
627
+ }
628
+ // Error trend
629
+ const errPrevHour = db.prepare(`SELECT COUNT(*) as cnt FROM tool_call_log WHERE result_status='error' AND created_at > datetime('now', '-2 hours') AND created_at <= datetime('now', '-1 hour')`).get();
630
+ const direction = errors1h.cnt > errPrevHour.cnt ? `${R}increasing${X}` : errors1h.cnt < errPrevHour.cnt ? `${G}decreasing${X}` : `${G}stable${X}`;
631
+ lines.push(`${C}Error Trend${X} ${direction} (${errPrevHour.cnt} prev hour → ${errors1h.cnt} this hour)`);
632
+ // Active verification cycles
633
+ const activeCycles = db.prepare(`SELECT COUNT(*) as cnt FROM verification_cycles WHERE status IN ('active', 'in_progress')`).get();
634
+ if (activeCycles.cnt > 0) {
635
+ lines.push(`${C}Active Cycles${X} ${Y}${activeCycles.cnt} verification cycle(s) in progress${X}`);
636
+ }
637
+ }
638
+ catch (e) {
639
+ lines.push(`${R}Error querying DB: ${e.message}${X}`);
640
+ }
641
+ db.close();
642
+ console.log(lines.join("\n"));
643
+ process.exit(0);
644
+ }
645
+ // ── Diagnose CLI handler (run-and-exit) ───────────────────────────────
646
+ if (diagnoseFlag) {
647
+ const os = await import("node:os");
648
+ const path = await import("node:path");
649
+ const fs = await import("node:fs");
650
+ const USE_COLOR = process.stdout.isTTY;
651
+ const B = USE_COLOR ? "\x1b[1m" : "";
652
+ const C = USE_COLOR ? "\x1b[36m" : "";
653
+ const G = USE_COLOR ? "\x1b[32m" : "";
654
+ const Y = USE_COLOR ? "\x1b[33m" : "";
655
+ const R = USE_COLOR ? "\x1b[31m" : "";
656
+ const X = USE_COLOR ? "\x1b[0m" : "";
657
+ const dir = path.join(os.homedir(), ".nodebench");
658
+ const dbPath = path.join(dir, "nodebench.db");
659
+ if (!fs.existsSync(dbPath)) {
660
+ console.error("No database found. Run the MCP server first to initialize.");
661
+ process.exit(1);
662
+ }
663
+ const Database = (await import("better-sqlite3")).default;
664
+ const db = new Database(dbPath);
665
+ const lines = [];
666
+ lines.push(`${B}NodeBench MCP — Diagnose & Heal${X}`);
667
+ lines.push("");
668
+ let issueCount = 0;
669
+ let healedCount = 0;
670
+ // 1. Orphaned verification cycles
671
+ try {
672
+ const orphanedCount = db.prepare(`SELECT COUNT(*) as cnt FROM verification_cycles WHERE status IN ('active', 'in_progress') AND created_at < datetime('now', '-48 hours')`).get().cnt;
673
+ if (orphanedCount > 0) {
674
+ lines.push(`${Y}DRIFT${X} ${orphanedCount} orphaned verification cycle(s) (>48h old)`);
675
+ const result = db.prepare(`UPDATE verification_cycles SET status = 'abandoned', updated_at = datetime('now') WHERE status IN ('active', 'in_progress') AND created_at < datetime('now', '-48 hours')`).run();
676
+ lines.push(` ${G}HEALED${X} Abandoned ${result.changes} cycles in batch`);
677
+ healedCount += result.changes;
678
+ issueCount += orphanedCount;
679
+ }
680
+ else {
681
+ lines.push(`${G}OK${X} No orphaned verification cycles`);
682
+ }
683
+ }
684
+ catch {
685
+ lines.push(`${Y}SKIP${X} Could not check verification cycles`);
686
+ }
687
+ // 2. Stale eval runs
688
+ try {
689
+ const staleCount = db.prepare(`SELECT COUNT(*) as cnt FROM eval_runs WHERE status IN ('running', 'pending') AND created_at < datetime('now', '-24 hours')`).get().cnt;
690
+ if (staleCount > 0) {
691
+ lines.push(`${Y}DRIFT${X} ${staleCount} stale eval run(s) (>24h old)`);
692
+ const result = db.prepare(`UPDATE eval_runs SET status = 'failed', completed_at = datetime('now') WHERE status IN ('running', 'pending') AND created_at < datetime('now', '-24 hours')`).run();
693
+ lines.push(` ${G}HEALED${X} Marked ${result.changes} eval runs as failed`);
694
+ healedCount += result.changes;
695
+ issueCount += staleCount;
696
+ }
697
+ else {
698
+ lines.push(`${G}OK${X} No stale eval runs`);
699
+ }
700
+ }
701
+ catch {
702
+ lines.push(`${Y}SKIP${X} Could not check eval runs`);
703
+ }
704
+ // 3. DB size
705
+ const dbInfo = fs.statSync(dbPath);
706
+ const dbSizeMb = dbInfo.size / (1024 * 1024);
707
+ if (dbSizeMb > 500) {
708
+ lines.push(`${Y}DRIFT${X} Database is ${dbSizeMb.toFixed(1)} MB`);
709
+ try {
710
+ const cutoff = new Date(Date.now() - 90 * 24 * 3_600_000).toISOString();
711
+ const deleted = db.prepare(`DELETE FROM tool_call_log WHERE created_at < ?`).run(cutoff);
712
+ if (deleted.changes > 0) {
713
+ lines.push(` ${G}HEALED${X} Pruned ${deleted.changes} tool_call_log entries older than 90 days`);
714
+ healedCount++;
715
+ }
716
+ db.pragma("wal_checkpoint(TRUNCATE)");
717
+ lines.push(` ${G}HEALED${X} Ran WAL checkpoint`);
718
+ healedCount++;
719
+ }
720
+ catch { /* skip */ }
721
+ issueCount++;
722
+ }
723
+ else {
724
+ lines.push(`${G}OK${X} Database size: ${dbSizeMb.toFixed(1)} MB`);
725
+ }
726
+ // 4. Error rate
727
+ try {
728
+ const calls1h = db.prepare(`SELECT COUNT(*) as cnt FROM tool_call_log WHERE created_at > datetime('now', '-1 hour')`).get().cnt;
729
+ const errors1h = db.prepare(`SELECT COUNT(*) as cnt FROM tool_call_log WHERE result_status='error' AND created_at > datetime('now', '-1 hour')`).get().cnt;
730
+ const rate = calls1h > 0 ? (errors1h / calls1h * 100) : 0;
731
+ if (rate > 20 && calls1h > 5) {
732
+ lines.push(`${R}ALERT${X} Error rate ${rate.toFixed(1)}% in last hour (${errors1h}/${calls1h})`);
733
+ issueCount++;
734
+ }
735
+ else {
736
+ lines.push(`${G}OK${X} Error rate: ${rate.toFixed(1)}% (${errors1h}/${calls1h} in last hour)`);
737
+ }
738
+ }
739
+ catch {
740
+ lines.push(`${Y}SKIP${X} Could not check error rates`);
741
+ }
742
+ // 5. Embedding cache
743
+ const cachePath = path.join(dir, "embedding_cache.json");
744
+ if (fs.existsSync(cachePath)) {
745
+ const cacheAge = Math.round((Date.now() - fs.statSync(cachePath).mtimeMs) / 3_600_000);
746
+ if (cacheAge > 168) {
747
+ lines.push(`${Y}DRIFT${X} Embedding cache is ${cacheAge}h old (>7 days) — will refresh on next server start`);
748
+ issueCount++;
749
+ }
750
+ else {
751
+ lines.push(`${G}OK${X} Embedding cache: ${cacheAge}h old`);
752
+ }
753
+ }
754
+ else {
755
+ lines.push(`${Y}INFO${X} No embedding cache found (will build on first server start)`);
756
+ }
757
+ // Summary
758
+ lines.push("");
759
+ if (issueCount === 0) {
760
+ lines.push(`${G}${B}All clear${X} — no drift detected`);
761
+ }
762
+ else {
763
+ lines.push(`${B}Found ${issueCount} issue(s), healed ${healedCount}${X}`);
764
+ const remaining = issueCount - healedCount;
765
+ if (remaining > 0)
766
+ lines.push(`${Y}${remaining} issue(s) require manual attention${X}`);
767
+ }
768
+ db.close();
769
+ console.log(lines.join("\n"));
770
+ process.exit(0);
771
+ }
772
+ // ── Sync Configs CLI handler (run-and-exit) ─────────────────────────────
773
+ if (syncConfigsFlag) {
774
+ const os = await import("node:os");
775
+ const path = await import("node:path");
776
+ const fs = await import("node:fs");
777
+ const USE_COLOR = process.stdout.isTTY;
778
+ const B = USE_COLOR ? "\x1b[1m" : "";
779
+ const C = USE_COLOR ? "\x1b[36m" : "";
780
+ const G = USE_COLOR ? "\x1b[32m" : "";
781
+ const Y = USE_COLOR ? "\x1b[33m" : "";
782
+ const X = USE_COLOR ? "\x1b[0m" : "";
783
+ // Detect the nodebench-mcp entry point path
784
+ const entryPath = path.resolve(new URL(import.meta.url).pathname.replace(/^\/([A-Z]:)/, "$1") // fix Windows drive letter
785
+ );
786
+ // Build args array from current CLI flags (exclude --sync-configs and other run-and-exit flags)
787
+ const forwardArgs = [];
788
+ const skipNext = new Set(["--preset", "--toolsets", "--exclude", "--engine-secret"]);
789
+ const runAndExitFlags = new Set([
790
+ "--sync-configs", "--health", "--status", "--diagnose", "--stats",
791
+ "--export-stats", "--reset-stats", "--list-presets", "--smart-preset",
792
+ "--auto-preset", "--help",
793
+ ]);
794
+ for (let i = 0; i < cliArgs.length; i++) {
795
+ if (runAndExitFlags.has(cliArgs[i]))
796
+ continue;
797
+ if (cliArgs[i].startsWith("--explain"))
798
+ continue;
799
+ if (skipNext.has(cliArgs[i])) {
800
+ forwardArgs.push(cliArgs[i], cliArgs[i + 1] ?? "");
801
+ i++; // skip the value
802
+ continue;
803
+ }
804
+ forwardArgs.push(cliArgs[i]);
805
+ }
806
+ // Collect env vars that are currently set
807
+ const ENV_KEYS = [
808
+ "ANTHROPIC_API_KEY", "OPENAI_API_KEY", "GEMINI_API_KEY",
809
+ "GITHUB_TOKEN", "BROWSERBASE_API_KEY", "FIRECRAWL_API_KEY",
810
+ "SMTP_HOST", "SMTP_PORT", "SMTP_USER", "SMTP_PASS",
811
+ "IMAP_HOST", "IMAP_PORT", "IMAP_USER", "IMAP_PASS",
812
+ "ENGINE_SECRET",
813
+ ];
814
+ const envObj = {};
815
+ for (const key of ENV_KEYS) {
816
+ if (process.env[key])
817
+ envObj[key] = process.env[key];
818
+ }
819
+ // Build the MCP server config entry
820
+ const nodePath = process.execPath; // path to node binary
821
+ const serverEntry = {
822
+ command: nodePath,
823
+ args: [entryPath, ...forwardArgs],
824
+ ...(Object.keys(envObj).length > 0 ? { env: envObj } : {}),
825
+ };
826
+ // Helper: merge into existing config file (preserves other servers)
827
+ function mergeConfig(filePath, serverKey) {
828
+ let existing = {};
829
+ if (fs.existsSync(filePath)) {
830
+ try {
831
+ existing = JSON.parse(fs.readFileSync(filePath, "utf-8"));
832
+ }
833
+ catch {
834
+ // If file exists but is invalid JSON, back it up and start fresh
835
+ const backupPath = filePath + ".bak";
836
+ fs.copyFileSync(filePath, backupPath);
837
+ existing = {};
838
+ }
839
+ }
840
+ // Ensure mcpServers key exists
841
+ if (!existing.mcpServers || typeof existing.mcpServers !== "object") {
842
+ existing.mcpServers = {};
843
+ }
844
+ const servers = existing.mcpServers;
845
+ const hadExisting = !!servers[serverKey];
846
+ servers[serverKey] = serverEntry;
847
+ // Ensure parent directory exists
848
+ const dir = path.dirname(filePath);
849
+ if (!fs.existsSync(dir)) {
850
+ fs.mkdirSync(dir, { recursive: true });
851
+ }
852
+ fs.writeFileSync(filePath, JSON.stringify(existing, null, 2) + "\n", "utf-8");
853
+ return { action: hadExisting ? "updated" : "created", path: filePath };
854
+ }
855
+ const lines = [];
856
+ lines.push(`${B}NodeBench MCP — Sync IDE Configs${X}`);
857
+ lines.push("");
858
+ const results = [];
859
+ // 1. Claude Code: ~/.claude/claude_desktop_config.json
860
+ try {
861
+ const claudeConfigPath = path.join(os.homedir(), ".claude", "claude_desktop_config.json");
862
+ const r = mergeConfig(claudeConfigPath, "nodebench-mcp");
863
+ results.push({ name: "Claude Code", ...r });
864
+ }
865
+ catch (e) {
866
+ results.push({ name: "Claude Code", action: "failed", path: "", error: e.message });
867
+ }
868
+ // 2. Cursor: <project>/.cursor/mcp.json
869
+ try {
870
+ const cursorConfigPath = path.join(process.cwd(), ".cursor", "mcp.json");
871
+ const r = mergeConfig(cursorConfigPath, "nodebench-mcp");
872
+ results.push({ name: "Cursor", ...r });
873
+ }
874
+ catch (e) {
875
+ results.push({ name: "Cursor", action: "failed", path: "", error: e.message });
876
+ }
877
+ // 3. Windsurf: <project>/.windsurf/mcp.json
878
+ try {
879
+ const windsurfConfigPath = path.join(process.cwd(), ".windsurf", "mcp.json");
880
+ const r = mergeConfig(windsurfConfigPath, "nodebench-mcp");
881
+ results.push({ name: "Windsurf", ...r });
882
+ }
883
+ catch (e) {
884
+ results.push({ name: "Windsurf", action: "failed", path: "", error: e.message });
885
+ }
886
+ // Print results
887
+ for (const r of results) {
888
+ if (r.action === "failed") {
889
+ lines.push(`${Y}FAIL${X} ${r.name}: ${r.error}`);
890
+ }
891
+ else {
892
+ const icon = r.action === "created" ? `${G}NEW${X} ` : `${G}UPD${X} `;
893
+ lines.push(`${icon} ${r.name}: ${r.path}`);
894
+ }
895
+ }
896
+ // Print config summary
897
+ lines.push("");
898
+ lines.push(`${C}Config entry:${X}`);
899
+ lines.push(` command: ${nodePath}`);
900
+ lines.push(` args: [${[entryPath, ...forwardArgs].map(a => `"${a}"`).join(", ")}]`);
901
+ if (Object.keys(envObj).length > 0) {
902
+ lines.push(` env: ${Object.keys(envObj).join(", ")}`);
903
+ }
904
+ else {
905
+ lines.push(` env: ${Y}(none set)${X}`);
906
+ }
907
+ lines.push("");
908
+ const successCount = results.filter(r => r.action !== "failed").length;
909
+ lines.push(`${B}Written to ${successCount}/${results.length} locations${X}`);
910
+ console.log(lines.join("\n"));
911
+ process.exit(0);
912
+ }
913
+ // ── CLI subcommand detection ──────────────────────────────────────────
914
+ // First positional arg (not starting with --) is a subcommand
915
+ const subCmd = cliArgs.find(a => !a.startsWith("--") && !cliArgs.some((f, i) => f.startsWith("--") && cliArgs[i + 1] === a));
916
+ // ── Welcome screen (no arguments at all) ─────────────────────────────
917
+ if (cliArgs.length === 0 || (subCmd === undefined && !cliArgs.includes("--stdio") && !cliArgs.some(a => a.startsWith("--")))) {
918
+ const USE_COLOR = process.stdout.isTTY;
919
+ const B = USE_COLOR ? "\x1b[1m" : "";
920
+ const C = USE_COLOR ? "\x1b[36m" : "";
921
+ const G = USE_COLOR ? "\x1b[32m" : "";
922
+ const D = USE_COLOR ? "\x1b[2m" : "";
923
+ const Y = USE_COLOR ? "\x1b[33m" : "";
924
+ const X = USE_COLOR ? "\x1b[0m" : "";
925
+ const totalTools = Object.values(TOOLSET_MAP).reduce((s, v) => s + v.length, 0) + 12;
926
+ const domainCount = Object.keys(TOOLSET_MAP).length;
927
+ const welcome = [
928
+ "",
929
+ ` ${B}NodeBench AI${X} ${D}— The trust layer for agents${X}`,
930
+ "",
931
+ ` ${C}Quick start${X}`,
932
+ ` ${G}$${X} npx nodebench-mcp discover ${D}Show available tools${X}`,
933
+ ` ${G}$${X} npx nodebench-mcp demo ${D}Run a live demo (no keys needed)${X}`,
934
+ ` ${G}$${X} npx nodebench-mcp quickref research ${D}Get research workflow guide${X}`,
935
+ ` ${G}$${X} npx nodebench-mcp --explain run_recon ${D}Deep-dive on any tool${X}`,
936
+ "",
937
+ ` ${C}Connect to your IDE${X}`,
938
+ ` ${G}$${X} claude mcp add nodebench -- npx nodebench-mcp --stdio`,
939
+ ` ${G}$${X} npx nodebench-mcp --sync-configs ${D}Auto-write to Claude/Cursor/Windsurf${X}`,
940
+ "",
941
+ ` ${C}Start the MCP server${X}`,
942
+ ` ${G}$${X} npx nodebench-mcp --stdio ${D}Default preset${X}`,
943
+ ` ${G}$${X} npx nodebench-mcp --preset research ${D}Research workflows${X}`,
944
+ ` ${G}$${X} npx nodebench-mcp --auto-preset ${D}Detect from your project${X}`,
945
+ "",
946
+ ` ${Y}${totalTools} tools${X} ${D}·${X} ${Y}${domainCount} domains${X} ${D}· Progressive discovery · Agent-as-a-Graph${X}`,
947
+ "",
948
+ ];
949
+ console.log(welcome.join("\n"));
950
+ process.exit(0);
951
+ }
952
+ // ── Demo subcommand (run-and-exit) ───────────────────────────────────
953
+ if (subCmd === "demo") {
954
+ const USE_COLOR = process.stdout.isTTY;
955
+ const B = USE_COLOR ? "\x1b[1m" : "";
956
+ const C = USE_COLOR ? "\x1b[36m" : "";
957
+ const G = USE_COLOR ? "\x1b[32m" : "";
958
+ const D = USE_COLOR ? "\x1b[2m" : "";
959
+ const Y = USE_COLOR ? "\x1b[33m" : "";
960
+ const X = USE_COLOR ? "\x1b[0m" : "";
961
+ const demoLines = [];
962
+ demoLines.push("");
963
+ demoLines.push(` ${B}NodeBench AI — Live Demo${X}`);
964
+ demoLines.push(` ${D}No API keys needed. Everything runs locally.${X}`);
965
+ demoLines.push("");
966
+ // 1. Show research tools via hybridSearch
967
+ demoLines.push(` ${C}1. Discovering research tools...${X}`);
968
+ demoLines.push("");
969
+ const stubTools = ALL_REGISTRY_ENTRIES.map(e => ({ name: e.name, description: e.category }));
970
+ const researchResults = hybridSearch("research", stubTools, { limit: 5, mode: "hybrid" });
971
+ for (const r of researchResults.slice(0, 5)) {
972
+ const entry = TOOL_REGISTRY.get(r.name);
973
+ const phase = entry?.phase ?? "";
974
+ demoLines.push(` ${G}>${X} ${B}${r.name}${X} ${D}(${phase})${X}`);
975
+ if (entry?.quickRef?.nextAction) {
976
+ demoLines.push(` ${entry.quickRef.nextAction.slice(0, 80)}`);
977
+ }
978
+ }
979
+ demoLines.push("");
980
+ // 2. Show a workflow chain
981
+ demoLines.push(` ${C}2. Workflow chain: "Build a New Feature"${X}`);
982
+ demoLines.push("");
983
+ const chain = WORKFLOW_CHAINS["new_feature"];
984
+ if (chain) {
985
+ demoLines.push(` ${B}${chain.name}${X} ${D}— ${chain.description}${X}`);
986
+ demoLines.push("");
987
+ for (let i = 0; i < Math.min(chain.steps.length, 8); i++) {
988
+ const step = chain.steps[i];
989
+ const num = String(i + 1).padStart(2, " ");
990
+ demoLines.push(` ${Y}${num}.${X} ${step.tool} ${D}→ ${step.action}${X}`);
991
+ }
992
+ if (chain.steps.length > 8) {
993
+ demoLines.push(` ${D} ... +${chain.steps.length - 8} more steps${X}`);
994
+ }
995
+ }
996
+ demoLines.push("");
997
+ // 3. Summary stats
998
+ const totalTools = Object.values(TOOLSET_MAP).reduce((s, v) => s + v.length, 0) + 12;
999
+ const domainCount = Object.keys(TOOLSET_MAP).length;
1000
+ const chainCount = Object.keys(WORKFLOW_CHAINS).length;
1001
+ demoLines.push(` ${C}3. What's available${X}`);
1002
+ demoLines.push("");
1003
+ demoLines.push(` ${Y}${totalTools}${X} tools across ${Y}${domainCount}${X} domains`);
1004
+ demoLines.push(` ${Y}${chainCount}${X} pre-built workflow chains`);
1005
+ demoLines.push(` ${Y}${ALL_REGISTRY_ENTRIES.length}${X} entries in the tool registry`);
1006
+ demoLines.push("");
1007
+ // 4. Next steps
1008
+ demoLines.push(` ${C}Next steps${X}`);
1009
+ demoLines.push(` ${G}$${X} npx nodebench-mcp --explain run_recon ${D}Deep-dive on any tool${X}`);
1010
+ demoLines.push(` ${G}$${X} npx nodebench-mcp --health ${D}Check your environment${X}`);
1011
+ demoLines.push(` ${G}$${X} npx nodebench-mcp --sync-configs ${D}Wire into your IDE${X}`);
1012
+ demoLines.push("");
1013
+ console.log(demoLines.join("\n"));
1014
+ process.exit(0);
1015
+ }
1016
+ // ── Discover subcommand (run-and-exit) ───────────────────────────────
1017
+ if (subCmd === "discover") {
1018
+ const USE_COLOR = process.stdout.isTTY;
1019
+ const B = USE_COLOR ? "\x1b[1m" : "";
1020
+ const C = USE_COLOR ? "\x1b[36m" : "";
1021
+ const G = USE_COLOR ? "\x1b[32m" : "";
1022
+ const D = USE_COLOR ? "\x1b[2m" : "";
1023
+ const Y = USE_COLOR ? "\x1b[33m" : "";
1024
+ const X = USE_COLOR ? "\x1b[0m" : "";
1025
+ const query = cliArgs.find(a => a !== "discover" && !a.startsWith("--")) ?? "";
1026
+ const limit = 10;
1027
+ const lines = [];
1028
+ lines.push("");
1029
+ if (query) {
1030
+ lines.push(` ${B}Discovering tools for:${X} ${C}${query}${X}`);
1031
+ const stubTools = ALL_REGISTRY_ENTRIES.map(e => ({ name: e.name, description: e.category }));
1032
+ const results = hybridSearch(query, stubTools, { limit, mode: "hybrid" });
1033
+ lines.push("");
1034
+ for (const r of results) {
1035
+ const entry = TOOL_REGISTRY.get(r.name);
1036
+ lines.push(` ${G}>${X} ${B}${r.name}${X} ${D}score: ${r.score.toFixed(2)}${X}`);
1037
+ if (entry) {
1038
+ lines.push(` ${D}${entry.category} · ${entry.phase}${X}`);
1039
+ if (entry.quickRef?.nextAction)
1040
+ lines.push(` ${entry.quickRef.nextAction.slice(0, 90)}`);
1041
+ }
1042
+ lines.push("");
1043
+ }
1044
+ if (results.length === 0)
1045
+ lines.push(` ${Y}No results.${X} Try a broader query.\n`);
1046
+ }
1047
+ else {
1048
+ lines.push(` ${B}Tool domains${X} ${D}(${Object.keys(TOOLSET_MAP).length} domains)${X}`);
1049
+ lines.push("");
1050
+ for (const [domain, tools] of Object.entries(TOOLSET_MAP)) {
1051
+ lines.push(` ${G}>${X} ${domain.padEnd(24)} ${Y}${String(tools.length).padStart(3)}${X} tools`);
1052
+ }
1053
+ lines.push("");
1054
+ lines.push(` ${D}Search: npx nodebench-mcp discover <query>${X}`);
1055
+ }
1056
+ lines.push("");
1057
+ console.log(lines.join("\n"));
1058
+ process.exit(0);
1059
+ }
1060
+ // ── Quickref subcommand (run-and-exit) ───────────────────────────────
1061
+ if (subCmd === "quickref") {
1062
+ const USE_COLOR = process.stdout.isTTY;
1063
+ const B = USE_COLOR ? "\x1b[1m" : "";
1064
+ const C = USE_COLOR ? "\x1b[36m" : "";
1065
+ const G = USE_COLOR ? "\x1b[32m" : "";
1066
+ const D = USE_COLOR ? "\x1b[2m" : "";
1067
+ const Y = USE_COLOR ? "\x1b[33m" : "";
1068
+ const X = USE_COLOR ? "\x1b[0m" : "";
1069
+ const toolName = cliArgs.find(a => a !== "quickref" && !a.startsWith("--")) ?? "";
1070
+ const lines = [];
1071
+ lines.push("");
1072
+ if (!toolName) {
1073
+ lines.push(` ${B}Usage:${X} npx nodebench-mcp quickref <tool_or_workflow>`);
1074
+ lines.push("");
1075
+ lines.push(` ${C}Workflows${X}`);
1076
+ for (const [key, chain] of Object.entries(WORKFLOW_CHAINS).slice(0, 10)) {
1077
+ lines.push(` ${G}>${X} ${key.padEnd(28)} ${D}${chain.name}${X}`);
1078
+ }
1079
+ lines.push(` ${D} ... +${Object.keys(WORKFLOW_CHAINS).length - 10} more${X}`);
1080
+ lines.push("");
1081
+ }
1082
+ else {
1083
+ // Try workflow first
1084
+ const chain = WORKFLOW_CHAINS[toolName];
1085
+ if (chain) {
1086
+ lines.push(` ${B}${chain.name}${X} ${D}(${toolName})${X}`);
1087
+ lines.push(` ${chain.description}`);
1088
+ lines.push("");
1089
+ for (let i = 0; i < chain.steps.length; i++) {
1090
+ const step = chain.steps[i];
1091
+ lines.push(` ${Y}${String(i + 1).padStart(2)}.${X} ${step.tool} ${D}→ ${step.action}${X}`);
1092
+ }
1093
+ lines.push("");
1094
+ }
1095
+ else {
1096
+ // Try tool registry
1097
+ const entry = TOOL_REGISTRY.get(toolName);
1098
+ if (entry) {
1099
+ lines.push(` ${B}${entry.name}${X} ${D}(${entry.category}, ${entry.phase})${X}`);
1100
+ lines.push(` ${entry.quickRef.nextAction}`);
1101
+ if (entry.quickRef.tip)
1102
+ lines.push(` ${Y}Tip:${X} ${entry.quickRef.tip}`);
1103
+ if (entry.quickRef.nextTools.length > 0) {
1104
+ lines.push("");
1105
+ lines.push(` ${C}Next tools${X}`);
1106
+ for (const nt of entry.quickRef.nextTools)
1107
+ lines.push(` ${G}>${X} ${nt}`);
1108
+ }
1109
+ lines.push("");
1110
+ }
1111
+ else {
1112
+ lines.push(` ${Y}Not found:${X} ${toolName}`);
1113
+ lines.push(` ${D}Try: npx nodebench-mcp quickref new_feature${X}`);
1114
+ lines.push("");
1115
+ }
1116
+ }
1117
+ }
1118
+ console.log(lines.join("\n"));
1119
+ process.exit(0);
1120
+ }
1121
+ // ── Call subcommand (run-and-exit) ───────────────────────────────────
1122
+ if (subCmd === "call") {
1123
+ const toolName = cliArgs.find(a => a !== "call" && !a.startsWith("--") && !a.startsWith("{"));
1124
+ const argsJson = cliArgs.find(a => a.startsWith("{")) ?? "{}";
1125
+ const USE_COLOR = process.stdout.isTTY;
1126
+ const B = USE_COLOR ? "\x1b[1m" : "";
1127
+ const G = USE_COLOR ? "\x1b[32m" : "";
1128
+ const R = USE_COLOR ? "\x1b[31m" : "";
1129
+ const D = USE_COLOR ? "\x1b[2m" : "";
1130
+ const X = USE_COLOR ? "\x1b[0m" : "";
1131
+ if (!toolName) {
1132
+ console.log(`\n ${B}Usage:${X} npx nodebench-mcp call <tool_name> [json_args]\n`);
1133
+ console.log(` ${D}Example:${X} npx nodebench-mcp call founder_deep_context_gather '{"packetType":"weekly_reset"}'`);
1134
+ console.log(` ${D}Example:${X} npx nodebench-mcp call discover_tools '{"query":"founder"}'`);
1135
+ console.log(` ${D}Example:${X} npx nodebench-mcp call save_session_note '{"note":"test"}'\n`);
1136
+ process.exit(0);
1137
+ }
1138
+ // Find tool in all toolsets — meta/discovery tools are created later,
1139
+ // so for CLI call we build them inline
1140
+ const cliDomainTools = Object.values(TOOLSET_MAP).flat();
1141
+ const cliMetaTools = createMetaTools(cliDomainTools);
1142
+ const cliDiscoveryTools = createProgressiveDiscoveryTools(cliDomainTools);
1143
+ const allCallable = [...cliDomainTools, ...cliMetaTools, ...cliDiscoveryTools];
1144
+ const tool = allCallable.find(t => t.name === toolName);
1145
+ if (!tool) {
1146
+ console.log(`\n ${R}Tool not found:${X} ${toolName}`);
1147
+ console.log(` ${D}Run: npx nodebench-mcp discover ${toolName}${X}\n`);
1148
+ process.exit(1);
1149
+ }
1150
+ let parsedArgs;
1151
+ try {
1152
+ parsedArgs = JSON.parse(argsJson);
1153
+ }
1154
+ catch {
1155
+ console.log(`\n ${R}Invalid JSON args:${X} ${argsJson}\n`);
1156
+ process.exit(1);
1157
+ }
1158
+ console.log(`\n ${D}Calling${X} ${B}${toolName}${X} ${D}...${X}`);
1159
+ try {
1160
+ const result = await tool.handler(parsedArgs);
1161
+ const output = typeof result === "string" ? result : JSON.stringify(result, null, 2);
1162
+ console.log(`\n ${G}Result:${X}\n`);
1163
+ // Pretty-print, indent 4 spaces
1164
+ for (const line of output.split("\n")) {
1165
+ console.log(` ${line}`);
1166
+ }
1167
+ console.log("");
1168
+ }
1169
+ catch (err) {
1170
+ const msg = err instanceof Error ? err.message : String(err);
1171
+ console.log(`\n ${R}Error:${X} ${msg}\n`);
1172
+ process.exit(1);
1173
+ }
1174
+ process.exit(0);
1175
+ }
1176
+ // ── Setup subcommand (run-and-exit) ──────────────────────────────────
1177
+ if (subCmd === "setup") {
1178
+ const USE_COLOR = process.stdout.isTTY;
1179
+ const B = USE_COLOR ? "\x1b[1m" : "";
1180
+ const C = USE_COLOR ? "\x1b[36m" : "";
1181
+ const G = USE_COLOR ? "\x1b[32m" : "";
1182
+ const D = USE_COLOR ? "\x1b[2m" : "";
1183
+ const Y = USE_COLOR ? "\x1b[33m" : "";
1184
+ const X = USE_COLOR ? "\x1b[0m" : "";
1185
+ const lines = [];
1186
+ lines.push("");
1187
+ lines.push(` ${B}NodeBench MCP — Quick Setup${X}`);
1188
+ lines.push("");
1189
+ lines.push(` ${G}1.${X} ${B}Claude Code${X}`);
1190
+ lines.push(` claude mcp add nodebench -- npx -y nodebench-mcp`);
1191
+ lines.push("");
1192
+ lines.push(` ${G}2.${X} ${B}Cursor${X} ${D}(.cursor/mcp.json)${X}`);
1193
+ lines.push(` { "mcpServers": { "nodebench": { "command": "npx", "args": ["-y", "nodebench-mcp"] } } }`);
1194
+ lines.push("");
1195
+ lines.push(` ${G}3.${X} ${B}Windsurf${X} ${D}(.windsurf/mcp.json)${X}`);
1196
+ lines.push(` { "mcpServers": { "nodebench": { "command": "npx", "args": ["-y", "nodebench-mcp"] } } }`);
1197
+ lines.push("");
1198
+ lines.push(` ${C}Verify:${X} npx nodebench-mcp call discover_tools '{"query":"founder"}'`);
1199
+ lines.push(` ${C}Dashboard:${X} https://www.nodebenchai.com/founder`);
1200
+ lines.push(` ${C}Agent setup:${X} https://www.nodebenchai.com/agent-setup.txt`);
1201
+ lines.push("");
1202
+ lines.push(` ${Y}Presets:${X} --preset default (99 tools) | --preset full (313 tools)`);
1203
+ lines.push(` ${Y}Founder tools:${X} founder_deep_context_gather, founder_packet_validate, founder_packet_diff`);
1204
+ lines.push("");
1205
+ console.log(lines.join("\n"));
1206
+ process.exit(0);
1207
+ }
565
1208
  // Initialize DB (creates ~/.nodebench/ and schema on first run)
566
1209
  getDb();
567
1210
  // Wire up DB accessor for execution trace edges (avoids circular import)
@@ -956,42 +1599,42 @@ const dynamicLoadingTools = [
956
1599
  const db = getDb();
957
1600
  const detailed = args.detailed === true;
958
1601
  // Session-level aggregates by mode
959
- const sessionSummary = db.prepare(`
960
- SELECT
961
- mode,
962
- COUNT(*) as sessions,
963
- ROUND(AVG(initial_tool_count), 1) as avg_initial_tools,
964
- ROUND(AVG(COALESCE(final_tool_count, initial_tool_count)), 1) as avg_final_tools,
965
- ROUND(AVG(COALESCE(total_tool_calls, 0)), 1) as avg_tool_calls,
966
- ROUND(AVG(COALESCE(total_load_events, 0)), 1) as avg_load_events,
967
- ROUND(AVG(COALESCE(session_duration_ms, 0)) / 1000.0, 1) as avg_duration_sec,
968
- SUM(COALESCE(total_tool_calls, 0)) as total_calls,
969
- SUM(COALESCE(total_load_events, 0)) as total_loads
970
- FROM ab_test_sessions
971
- GROUP BY mode
1602
+ const sessionSummary = db.prepare(`
1603
+ SELECT
1604
+ mode,
1605
+ COUNT(*) as sessions,
1606
+ ROUND(AVG(initial_tool_count), 1) as avg_initial_tools,
1607
+ ROUND(AVG(COALESCE(final_tool_count, initial_tool_count)), 1) as avg_final_tools,
1608
+ ROUND(AVG(COALESCE(total_tool_calls, 0)), 1) as avg_tool_calls,
1609
+ ROUND(AVG(COALESCE(total_load_events, 0)), 1) as avg_load_events,
1610
+ ROUND(AVG(COALESCE(session_duration_ms, 0)) / 1000.0, 1) as avg_duration_sec,
1611
+ SUM(COALESCE(total_tool_calls, 0)) as total_calls,
1612
+ SUM(COALESCE(total_load_events, 0)) as total_loads
1613
+ FROM ab_test_sessions
1614
+ GROUP BY mode
972
1615
  `).all();
973
1616
  // Error rate by mode (join with tool_call_log)
974
- const errorRates = db.prepare(`
975
- SELECT
976
- s.mode,
977
- COUNT(CASE WHEN t.result_status = 'error' THEN 1 END) as errors,
978
- COUNT(*) as total_calls,
979
- ROUND(100.0 * COUNT(CASE WHEN t.result_status = 'error' THEN 1 END) / MAX(COUNT(*), 1), 2) as error_pct
980
- FROM tool_call_log t
981
- JOIN ab_test_sessions s ON t.session_id = s.id
982
- GROUP BY s.mode
1617
+ const errorRates = db.prepare(`
1618
+ SELECT
1619
+ s.mode,
1620
+ COUNT(CASE WHEN t.result_status = 'error' THEN 1 END) as errors,
1621
+ COUNT(*) as total_calls,
1622
+ ROUND(100.0 * COUNT(CASE WHEN t.result_status = 'error' THEN 1 END) / MAX(COUNT(*), 1), 2) as error_pct
1623
+ FROM tool_call_log t
1624
+ JOIN ab_test_sessions s ON t.session_id = s.id
1625
+ GROUP BY s.mode
983
1626
  `).all();
984
1627
  // Top loaded toolsets (dynamic mode)
985
- const topToolsets = db.prepare(`
986
- SELECT
987
- toolset_name,
988
- COUNT(*) as load_count,
989
- ROUND(AVG(latency_ms), 1) as avg_latency_ms
990
- FROM ab_tool_events
991
- WHERE event_type = 'load'
992
- GROUP BY toolset_name
993
- ORDER BY load_count DESC
994
- LIMIT 10
1628
+ const topToolsets = db.prepare(`
1629
+ SELECT
1630
+ toolset_name,
1631
+ COUNT(*) as load_count,
1632
+ ROUND(AVG(latency_ms), 1) as avg_latency_ms
1633
+ FROM ab_tool_events
1634
+ WHERE event_type = 'load'
1635
+ GROUP BY toolset_name
1636
+ ORDER BY load_count DESC
1637
+ LIMIT 10
995
1638
  `).all();
996
1639
  // Current session info
997
1640
  const currentSession = {
@@ -1007,13 +1650,13 @@ const dynamicLoadingTools = [
1007
1650
  // Optional per-session detail
1008
1651
  let sessions = [];
1009
1652
  if (detailed) {
1010
- sessions = db.prepare(`
1011
- SELECT id, mode, initial_preset, initial_tool_count, final_tool_count,
1012
- toolsets_loaded, total_tool_calls, total_load_events,
1013
- session_duration_ms, created_at, ended_at
1014
- FROM ab_test_sessions
1015
- ORDER BY created_at DESC
1016
- LIMIT 50
1653
+ sessions = db.prepare(`
1654
+ SELECT id, mode, initial_preset, initial_tool_count, final_tool_count,
1655
+ toolsets_loaded, total_tool_calls, total_load_events,
1656
+ session_duration_ms, created_at, ended_at
1657
+ FROM ab_test_sessions
1658
+ ORDER BY created_at DESC
1659
+ LIMIT 50
1017
1660
  `).all();
1018
1661
  }
1019
1662
  // Build verdict
@@ -1116,6 +1759,54 @@ const _hookState = {
1116
1759
  lastRefreshReminder: 0, // totalCalls at last reminder
1117
1760
  };
1118
1761
  const WEB_TOOL_NAMES = new Set(["web_search", "fetch_url"]);
1762
// ── Intent-based auto-expansion ─────────────────────────────────────────
// On the first tool call, classify intent from tool name + args keywords
// and auto-load relevant toolsets if running on the default preset.
// Zero-latency: pure keyword matching, no LLM calls. Runs once per session.
// NOTE(review): _intentClassified is declared here but is neither read nor
// set by classifyAndExpand below — presumably the call site uses it to
// enforce the "once per session" gate; confirm it is actually checked and
// flipped there.
let _intentClassified = false;
// Keyword → toolset routing table. Each entry pairs a case-insensitive regex
// (tested against "toolName + stringified args") with the toolset names worth
// auto-loading when it matches. Entry order fixes the order in which matched
// toolsets are collected, so keep it stable.
const INTENT_PATTERNS = [
    { pattern: /web|css|html|dom|seo|browser|page|viewport|screenshot|ui_capture|ui_ux/i, toolsets: ["ui_capture", "vision", "web", "seo", "git_workflow", "architect"] },
    { pattern: /research|paper|arxiv|scholar|literature|digest|brief|rss|feed/i, toolsets: ["web", "llm", "rss", "email", "docs"] },
    { pattern: /data|csv|sql|pandas|xlsx|json_parse|spreadsheet|parquet|parse/i, toolsets: ["local_file", "llm", "web"] },
    { pattern: /deploy|docker|k8s|kubernetes|ci|cd|pipeline|terraform|helm|infra/i, toolsets: ["git_workflow", "session_memory", "benchmark", "pattern"] },
    { pattern: /agent|swarm|orchestr|parallel|multi.?agent|spawn|coordinat/i, toolsets: ["parallel", "self_eval", "session_memory", "pattern", "toon"] },
    { pattern: /mobile|ios|android|react.?native|flutter|swift|kotlin/i, toolsets: ["ui_capture", "vision", "flicker_detection"] },
    { pattern: /academic|thesis|review|cite|biblio|latex|peer/i, toolsets: ["research_writing", "llm", "web", "local_file"] },
    { pattern: /content|publish|post|newsletter|email|campaign|linkedin/i, toolsets: ["llm", "critter", "email", "rss", "platform", "architect"] },
];
1777
/**
 * Intent classifier + auto-expander for the default preset.
 *
 * Matches the current tool call (name plus stringified argument keys and
 * string values) against INTENT_PATTERNS and activates every matched toolset
 * that exists in TOOLSET_MAP and is not already active. On a successful
 * expansion it rebuilds the domain/meta tool arrays, refreshes the combined
 * tool list, and notifies the MCP client that the tool list changed.
 *
 * @param {string} toolName - Name of the tool being invoked.
 * @param {Record<string, unknown>|null|undefined} args - Tool arguments; only
 *   keys and string values contribute to the keyword haystack.
 * @returns {string[]|null} Names of the toolsets just loaded, or null when the
 *   preset is not "default" or no new toolset matched.
 */
function classifyAndExpand(toolName, args) {
    // Respect an explicitly chosen preset — auto-expansion is default-only.
    if (currentPreset !== "default") {
        return null;
    }
    // Fold the tool name plus each arg key (and string values) into a single
    // search string for the intent regexes.
    const argText = args
        ? Object.entries(args)
            .map(([key, value]) => `${key} ${typeof value === "string" ? value : ""}`)
            .join(" ")
        : "";
    const haystack = `${toolName} ${argText}`;
    // Gather every matching toolset that is known and not yet active.
    const newlyMatched = new Set();
    for (const { pattern, toolsets } of INTENT_PATTERNS) {
        if (!pattern.test(haystack)) {
            continue;
        }
        for (const candidate of toolsets) {
            if (TOOLSET_MAP[candidate] && !activeToolsets.has(candidate)) {
                newlyMatched.add(candidate);
            }
        }
    }
    if (newlyMatched.size === 0) {
        return null;
    }
    // Activate the matches, then rebuild the derived tool arrays from the
    // full active set so ordering stays consistent with activeToolsets.
    for (const name of newlyMatched) {
        activeToolsets.add(name);
    }
    domainTools = [...activeToolsets].flatMap((key) => TOOLSET_MAP[key] ?? []);
    allToolsWithoutDiscovery = [...domainTools, ...createMetaTools(domainTools)];
    rebuildAllTools();
    // Best-effort notification — a delivery failure must not break the call.
    server.notification({ method: "notifications/tools/list_changed" }).catch(() => { });
    return [...newlyMatched];
}
1119
1810
  const SAVE_TOOL_NAMES = new Set(["save_session_note", "record_learning"]);
1120
1811
  const REFRESH_INTERVAL = 30; // remind after every 30 calls
1121
1812
  function getHookHint(toolName) {
@@ -1149,42 +1840,93 @@ const PROMPTS = [
1149
1840
  role: "user",
1150
1841
  content: {
1151
1842
  type: "text",
1152
- text: `You are connected to NodeBench MCP — tools that make you catch the bugs you'd normally ship.
1153
-
1154
- WHAT THIS DOES:
1155
- In benchmarks across 9 real production prompts, agents with NodeBench MCP caught 13 issues (4 HIGH severity)
1156
- that bare agents shipped to production. 26 blind spots prevented. Knowledge compounds — by task 9,
1157
- the agent finds 2+ prior findings before writing a single line of code.
1158
-
1159
- HOW IT WORKS:
1160
- Every task follows a pipeline: Research → Risk → Implement → Test (3 layers) → Eval → Gate → Learn → Ship.
1161
- Each step produces a concrete artifact (an issue found, a regression guarded, a pattern banked) that
1162
- compounds into future tasks.
1163
-
1164
- FIRST TIME? Run these 3 steps:
1165
- 1. Call bootstrap_project to register your project (tech stack, architecture, conventions)
1166
- 2. Call getMethodology("overview") to see all available methodologies
1167
- 3. Call search_all_knowledge("your current task") before starting any work
1168
-
1169
- RETURNING? Your project context and all past learnings are persisted. Start with:
1170
- 1. Call search_all_knowledge with your current task
1171
- 2. Follow the methodology tools as you work — they'll guide you step by step
1172
-
1173
- KEY TOOLS:
1174
- - search_all_knowledge — Search prior findings before starting (avoid repeating past mistakes)
1175
- - run_mandatory_flywheel — 6-step minimum verification before declaring work done
1176
- - getMethodology — Step-by-step guides for verification, eval, flywheel, recon
1177
- - findTools — Discover tools by keyword or category
1178
- - assess_risk — Assess risk before acting (HIGH = needs confirmation)
1179
-
1180
- PARALLEL AGENTS? If using Claude Code subagents or multiple terminals:
1181
- - claim_agent_task / release_agent_task — Lock tasks to prevent duplicate work
1182
- - get_parallel_status — See what all agents are doing
1843
+ text: `You are connected to NodeBench MCP — tools that make you catch the bugs you'd normally ship.
1844
+
1845
+ WHAT THIS DOES:
1846
+ In benchmarks across 9 real production prompts, agents with NodeBench MCP caught 13 issues (4 HIGH severity)
1847
+ that bare agents shipped to production. 26 blind spots prevented. Knowledge compounds — by task 9,
1848
+ the agent finds 2+ prior findings before writing a single line of code.
1849
+
1850
+ HOW IT WORKS:
1851
+ Every task follows a pipeline: Research → Risk → Implement → Test (3 layers) → Eval → Gate → Learn → Ship.
1852
+ Each step produces a concrete artifact (an issue found, a regression guarded, a pattern banked) that
1853
+ compounds into future tasks.
1854
+
1855
+ FIRST TIME? Run these 3 steps:
1856
+ 1. Call bootstrap_project to register your project (tech stack, architecture, conventions)
1857
+ 2. Call getMethodology("overview") to see all available methodologies
1858
+ 3. Call search_all_knowledge("your current task") before starting any work
1859
+
1860
+ RETURNING? Your project context and all past learnings are persisted. Start with:
1861
+ 1. Call search_all_knowledge with your current task
1862
+ 2. Follow the methodology tools as you work — they'll guide you step by step
1863
+
1864
+ KEY TOOLS:
1865
+ - search_all_knowledge — Search prior findings before starting (avoid repeating past mistakes)
1866
+ - run_mandatory_flywheel — 6-step minimum verification before declaring work done
1867
+ - getMethodology — Step-by-step guides for verification, eval, flywheel, recon
1868
+ - findTools — Discover tools by keyword or category
1869
+ - assess_risk — Assess risk before acting (HIGH = needs confirmation)
1870
+
1871
+ PARALLEL AGENTS? If using Claude Code subagents or multiple terminals:
1872
+ - claim_agent_task / release_agent_task — Lock tasks to prevent duplicate work
1873
+ - get_parallel_status — See what all agents are doing
1183
1874
  - Use the "claude-code-parallel" prompt for step-by-step guidance`,
1184
1875
  },
1185
1876
  },
1186
1877
  ],
1187
1878
  },
1879
+ {
1880
+ name: "execution-trace-workflow",
1881
+ description: "Start and maintain a traceable execution run. Use this for any workflow that needs receipts, evidence, decisions, verification, approvals, and a durable audit trail.",
1882
+ arguments: [
1883
+ {
1884
+ name: "workflowTitle",
1885
+ description: "Human-readable title for the run",
1886
+ required: true,
1887
+ },
1888
+ {
1889
+ name: "workflowGoal",
1890
+ description: "What the workflow must accomplish",
1891
+ required: true,
1892
+ },
1893
+ {
1894
+ name: "workflowType",
1895
+ description: "Optional workflow label such as spreadsheet_enrichment or company_direction_analysis",
1896
+ required: false,
1897
+ },
1898
+ ],
1899
+ messages: (args) => [
1900
+ {
1901
+ role: "user",
1902
+ content: {
1903
+ type: "text",
1904
+ text: `Run this task as a fully traceable execution workflow.
1905
+
1906
+ Title: ${args.workflowTitle}
1907
+ Goal: ${args.workflowGoal}
1908
+ Workflow type: ${args.workflowType || "execution_trace"}
1909
+
1910
+ Required operating loop:
1911
+ 1. Call start_execution_run first. Create one durable run before doing substantive work.
1912
+ 2. Record every meaningful action with record_execution_step. Do this for inspect, research, edit, verify, export, and issue-fix steps.
1913
+ 3. Attach evidence as you go with attach_execution_evidence. Store URLs, uploaded files, renders, screenshots, logs, and notes.
1914
+ 4. Record explicit choices with record_execution_decision. Capture alternatives considered, evidence basis, confidence, and limitations. Do not expose raw chain-of-thought.
1915
+ 5. Record QA checks with record_execution_verification. Use this for render checks, formula checks, diff checks, replay checks, or artifact integrity checks.
1916
+ 6. If a risky action needs human sign-off, call request_execution_approval before proceeding.
1917
+ 7. Finish with complete_execution_run and set the final status plus any drift summary if applicable.
1918
+
1919
+ Trace standard:
1920
+ - Facts and outputs must be evidence-grounded.
1921
+ - Decisions must separate verified evidence from inference.
1922
+ - Verification must explain what was checked and what passed or failed.
1923
+ - Limitations must be explicit instead of implied.
1924
+
1925
+ Do not treat the trace as optional. The run should be inspectable after completion by an operator who was not present during execution.`,
1926
+ },
1927
+ },
1928
+ ],
1929
+ },
1188
1930
  {
1189
1931
  name: "project-setup",
1190
1932
  description: "Guided project bootstrapping. Walks you through registering project context so the MCP has full project awareness.",
@@ -1200,21 +1942,154 @@ PARALLEL AGENTS? If using Claude Code subagents or multiple terminals:
1200
1942
  role: "user",
1201
1943
  content: {
1202
1944
  type: "text",
1203
- text: `Help me set up NodeBench methodology tracking for project: ${args.projectName}
1204
-
1205
- Please gather and record the following using the bootstrap_project tool:
1206
- 1. Tech stack (languages, frameworks, runtimes)
1207
- 2. Key dependency versions
1208
- 3. Architecture overview
1209
- 4. Build/test commands
1210
- 5. Known conventions or patterns
1211
- 6. Repository structure highlights
1212
-
1945
+ text: `Help me set up NodeBench methodology tracking for project: ${args.projectName}
1946
+
1947
+ Please gather and record the following using the bootstrap_project tool:
1948
+ 1. Tech stack (languages, frameworks, runtimes)
1949
+ 2. Key dependency versions
1950
+ 3. Architecture overview
1951
+ 4. Build/test commands
1952
+ 5. Known conventions or patterns
1953
+ 6. Repository structure highlights
1954
+
1213
1955
  After bootstrapping, run a reconnaissance session with run_recon to check for latest updates on the project's key frameworks and SDKs.`,
1214
1956
  },
1215
1957
  },
1216
1958
  ],
1217
1959
  },
1960
+ {
1961
+ name: "spreadsheet-enrichment-trace",
1962
+ description: "Traceable workflow for spreadsheet enrichment: inspect workbook, research supporting evidence, edit cells, verify render/calculation quality, and export with receipts.",
1963
+ arguments: [
1964
+ {
1965
+ name: "fileUri",
1966
+ description: "Input spreadsheet path or URI",
1967
+ required: true,
1968
+ },
1969
+ {
1970
+ name: "goal",
1971
+ description: "What the spreadsheet workflow should achieve",
1972
+ required: true,
1973
+ },
1974
+ ],
1975
+ messages: (args) => [
1976
+ {
1977
+ role: "user",
1978
+ content: {
1979
+ type: "text",
1980
+ text: `Run a traceable spreadsheet-enrichment workflow.
1981
+
1982
+ Input spreadsheet: ${args.fileUri}
1983
+ Goal: ${args.goal}
1984
+
1985
+ Workflow:
1986
+ 1. Start a run with start_execution_run using workflowName="spreadsheet_enrichment".
1987
+ 2. Inspect workbook structure, layout, formulas, and formatting. Record this with record_execution_step.
1988
+ 3. Attach the workbook and any rendered images as evidence with attach_execution_evidence.
1989
+ 4. If public research is needed, attach source URLs and record the evidence boundary.
1990
+ 5. Record major ranking or editing choices with record_execution_decision. Include alternatives considered and any unsupported claims.
1991
+ 6. Perform edits. Record the edit step and attach output artifacts or before/after references.
1992
+ 7. Verify the workbook. Record calculation checks, render checks, formatting checks, link cleanup, and export checks with record_execution_verification.
1993
+ 8. Complete the run only after the workbook is exported and the final verification state is known.
1994
+
1995
+ Required output discipline:
1996
+ - Make changed cells traceable.
1997
+ - Distinguish verified facts from inferred recommendations.
1998
+ - Record any formatting or hyperlink cleanup as explicit fix steps.
1999
+ - Leave behind enough evidence for another operator to replay what happened.`,
2000
+ },
2001
+ },
2002
+ ],
2003
+ },
2004
+ {
2005
+ name: "company-direction-analysis-trace",
2006
+ description: "Traceable workflow for capability-to-product-direction analysis grounded in public evidence, credibility filters, and phased recommendations.",
2007
+ arguments: [
2008
+ {
2009
+ name: "subjectCompany",
2010
+ description: "Company being evaluated",
2011
+ required: true,
2012
+ },
2013
+ {
2014
+ name: "strategicQuestion",
2015
+ description: "The product-direction or capability question being answered",
2016
+ required: true,
2017
+ },
2018
+ ],
2019
+ messages: (args) => [
2020
+ {
2021
+ role: "user",
2022
+ content: {
2023
+ type: "text",
2024
+ text: `Run a traceable company-direction analysis.
2025
+
2026
+ Subject company: ${args.subjectCompany}
2027
+ Strategic question: ${args.strategicQuestion}
2028
+
2029
+ Required method:
2030
+ 1. Start a run with start_execution_run using workflowName="company_direction_analysis".
2031
+ 2. Gather public evidence first. Attach company pages, press, resumes, hiring signals, papers, and adjacent market references as evidence.
2032
+ 3. Call compute_dimension_profile as soon as you have enough evidence to ground the company state. Then use export_dimension_bundle to inspect the regime label, policy context, evidence rows, and interaction effects.
2033
+ 4. Record a decision boundary between:
2034
+ - publicly supported facts
2035
+ - supported but incomplete claims
2036
+ - not established by public evidence
2037
+ 5. Build a credibility filter and a dimension-aware regime summary. Record explicit decisions for high-credibility, medium-credibility, and low-credibility directions, and tie them to capital, capability, network, market, operations, and narrative dimensions where relevant.
2038
+ 6. Record the final recommendation as a structured decision with alternatives considered, evidence basis, confidence, limitations, and the regime you believe the company is operating under.
2039
+ 7. Record at least one verification step that checks the final memo still reflects the truth boundary, the exported dimension bundle, and does not overclaim pedigree.
2040
+ 8. Complete the run after the recommendation, limitations, evidence links, and dimension bundle references are all attached.
2041
+
2042
+ Output rules:
2043
+ - Recommendations must stay adjacent to reputation and public proof.
2044
+ - Unsupported claims must be clearly labeled as unsupported.
2045
+ - Distinguish verified, estimated, inferred, and unavailable dimension signals.
2046
+ - The trace should let another operator audit why a direction was recommended or rejected.`,
2047
+ },
2048
+ },
2049
+ ],
2050
+ },
2051
+ {
2052
+ name: "agent-delegation-with-approval-trace",
2053
+ description: "Traceable workflow for delegated agent work with approval gates. Use this when a capable agent can operate, but risky actions still need scoped human sign-off.",
2054
+ arguments: [
2055
+ {
2056
+ name: "task",
2057
+ description: "Delegated task description",
2058
+ required: true,
2059
+ },
2060
+ {
2061
+ name: "riskLevel",
2062
+ description: "Expected risk level: low, medium, or high",
2063
+ required: true,
2064
+ },
2065
+ ],
2066
+ messages: (args) => [
2067
+ {
2068
+ role: "user",
2069
+ content: {
2070
+ type: "text",
2071
+ text: `Run a delegated agent workflow with explicit approval boundaries.
2072
+
2073
+ Task: ${args.task}
2074
+ Risk level: ${args.riskLevel}
2075
+
2076
+ Required process:
2077
+ 1. Start a run with start_execution_run using workflowName="agent_delegation".
2078
+ 2. Record the initial scope, intended tools, and expected outputs with record_execution_step.
2079
+ 3. Attach inputs, policies, and constraints as evidence.
2080
+ 4. Record any material choice or plan update with record_execution_decision.
2081
+ 5. Before any externally visible, destructive, or high-risk action, call request_execution_approval.
2082
+ 6. Only continue after the approval state is known, and record the resulting step explicitly.
2083
+ 7. Record verification that the final output stayed inside scope and honored the approval boundary.
2084
+ 8. Complete the run with the final status and limitations.
2085
+
2086
+ Trust requirements:
2087
+ - The operator must be able to see what was attempted, what required approval, and what evidence justified the action.
2088
+ - Do not hide uncertainty or skipped approvals inside prose summaries.`,
2089
+ },
2090
+ },
2091
+ ],
2092
+ },
1218
2093
  {
1219
2094
  name: "ui-qa-checklist",
1220
2095
  description: "UI/UX QA checklist for frontend implementations. Run after any change that touches React components, layouts, or interactions. Guides the agent through component tests, accessibility, responsive checks, and E2E validation.",
@@ -1230,33 +2105,33 @@ After bootstrapping, run a reconnaissance session with run_recon to check for la
1230
2105
  role: "user",
1231
2106
  content: {
1232
2107
  type: "text",
1233
- text: `You just implemented UI changes to: ${args.componentName}
1234
-
1235
- Before declaring this work done, run the UI/UX QA checklist:
1236
-
1237
- 1. COMPONENT TESTS: Run \`npm run test:run\` — all component tests must pass
1238
- 2. STORYBOOK: Run \`npm run storybook\` — verify the component renders in isolation
1239
- 3. RESPONSIVE: Check at 375px, 768px, 1280px — layout must not break
1240
- 4. ACCESSIBILITY: Tab through the UI, check aria-labels, run Storybook a11y panel
1241
- 5. STATES: Verify loading, error, and empty states are handled
1242
- 6. CONSOLE: Check browser devtools for errors/warnings
1243
- 7. CAPTURE: Call capture_responsive_suite(url, label) to screenshot at 3 breakpoints
1244
- 8. E2E: Run \`npm run test:e2e\` if relevant tests exist
1245
- 9. LIGHTHOUSE: Run \`npm run perf:lighthouse\` for performance + accessibility scores
1246
-
1247
- After checking each item, record results:
1248
- call get_gate_preset("ui_ux_qa") to see the 8 evaluation rules
1249
- evaluate each rule against ${args.componentName}
1250
- call run_quality_gate(gateName: "ui_ux_qa", rules: [{name, passed}, ...]) with your boolean results
1251
- call record_learning for any UI gotchas discovered
1252
-
1253
- For the full step-by-step methodology, call getMethodology("ui_ux_qa").
1254
-
1255
- Commands available:
1256
- npm run test:run — Vitest component tests
1257
- npm run test:e2e — Playwright E2E tests
1258
- npm run storybook — Storybook dev server (port 6006)
1259
- npm run perf:lighthouse — Lighthouse audit
2108
+ text: `You just implemented UI changes to: ${args.componentName}
2109
+
2110
+ Before declaring this work done, run the UI/UX QA checklist:
2111
+
2112
+ 1. COMPONENT TESTS: Run \`npm run test:run\` — all component tests must pass
2113
+ 2. STORYBOOK: Run \`npm run storybook\` — verify the component renders in isolation
2114
+ 3. RESPONSIVE: Check at 375px, 768px, 1280px — layout must not break
2115
+ 4. ACCESSIBILITY: Tab through the UI, check aria-labels, run Storybook a11y panel
2116
+ 5. STATES: Verify loading, error, and empty states are handled
2117
+ 6. CONSOLE: Check browser devtools for errors/warnings
2118
+ 7. CAPTURE: Call capture_responsive_suite(url, label) to screenshot at 3 breakpoints
2119
+ 8. E2E: Run \`npm run test:e2e\` if relevant tests exist
2120
+ 9. LIGHTHOUSE: Run \`npm run perf:lighthouse\` for performance + accessibility scores
2121
+
2122
+ After checking each item, record results:
2123
+ call get_gate_preset("ui_ux_qa") to see the 8 evaluation rules
2124
+ evaluate each rule against ${args.componentName}
2125
+ call run_quality_gate(gateName: "ui_ux_qa", rules: [{name, passed}, ...]) with your boolean results
2126
+ call record_learning for any UI gotchas discovered
2127
+
2128
+ For the full step-by-step methodology, call getMethodology("ui_ux_qa").
2129
+
2130
+ Commands available:
2131
+ npm run test:run — Vitest component tests
2132
+ npm run test:e2e — Playwright E2E tests
2133
+ npm run storybook — Storybook dev server (port 6006)
2134
+ npm run perf:lighthouse — Lighthouse audit
1260
2135
  npm run perf:bundle — Bundle size analysis`,
1261
2136
  },
1262
2137
  },
@@ -1284,47 +2159,47 @@ Commands available:
1284
2159
  role: "user",
1285
2160
  content: {
1286
2161
  type: "text",
1287
- text: `You are coordinating a parallel agent team for: ${args.projectGoal}
1288
-
1289
- This follows the pattern from Anthropic's "Building a C Compiler with Parallel Claudes" (Feb 2026).
1290
- Reference: https://www.anthropic.com/engineering/building-c-compiler
1291
-
1292
- SETUP (run these in order):
1293
-
1294
- 1. ORIENT — Check what's already happening:
1295
- call get_parallel_status({ includeHistory: true })
1296
- call list_agent_tasks({ status: "all" })
1297
-
1298
- 2. PLAN ROLES — Assign ${agentCount} specialized agents:
1299
- Recommended role split for ${agentCount} agents:
1300
- ${agentCount >= 4 ? `- Agent 1: assign_agent_role({ role: "implementer", focusArea: "core features" })
1301
- - Agent 2: assign_agent_role({ role: "test_writer", focusArea: "test coverage" })
1302
- - Agent 3: assign_agent_role({ role: "code_quality_critic", focusArea: "refactoring" })
2162
+ text: `You are coordinating a parallel agent team for: ${args.projectGoal}
2163
+
2164
+ This follows the pattern from Anthropic's "Building a C Compiler with Parallel Claudes" (Feb 2026).
2165
+ Reference: https://www.anthropic.com/engineering/building-c-compiler
2166
+
2167
+ SETUP (run these in order):
2168
+
2169
+ 1. ORIENT — Check what's already happening:
2170
+ call get_parallel_status({ includeHistory: true })
2171
+ call list_agent_tasks({ status: "all" })
2172
+
2173
+ 2. PLAN ROLES — Assign ${agentCount} specialized agents:
2174
+ Recommended role split for ${agentCount} agents:
2175
+ ${agentCount >= 4 ? `- Agent 1: assign_agent_role({ role: "implementer", focusArea: "core features" })
2176
+ - Agent 2: assign_agent_role({ role: "test_writer", focusArea: "test coverage" })
2177
+ - Agent 3: assign_agent_role({ role: "code_quality_critic", focusArea: "refactoring" })
1303
2178
  - Agent 4: assign_agent_role({ role: "documentation_maintainer", focusArea: "docs and progress" })` :
1304
- `- Agent 1: assign_agent_role({ role: "implementer" })
1305
- - Agent 2: assign_agent_role({ role: "test_writer" })`}
1306
-
1307
- 3. BREAK DOWN WORK — Create task claims:
1308
- For each independent piece of work:
1309
- call claim_agent_task({ taskKey: "descriptive_snake_case", description: "What to do" })
1310
-
1311
- 4. WORK LOOP (each agent independently):
1312
- a. claim_agent_task — Lock your task
1313
- b. Do the work (implement, test, review)
1314
- c. log_context_budget — Track context usage, avoid pollution
1315
- d. run_oracle_comparison — Validate output against known-good reference
1316
- e. release_agent_task — Release with progress note
1317
- f. Pick next task (repeat)
1318
-
1319
- 5. ANTI-PATTERNS TO AVOID:
1320
- - Two agents working on the same task (always claim first)
1321
- - Dumping thousands of lines of test output (log to file, print summary)
1322
- - Spending hours on one stuck problem (mark as blocked, move on)
1323
- - Overwriting each other's changes (commit frequently, pull before push)
1324
-
1325
- KEY INSIGHT from Anthropic: When all agents get stuck on the same bug (like compiling the Linux kernel),
1326
- use oracle-based testing to split the problem into independent sub-problems that each agent can solve in parallel.
1327
-
2179
+ `- Agent 1: assign_agent_role({ role: "implementer" })
2180
+ - Agent 2: assign_agent_role({ role: "test_writer" })`}
2181
+
2182
+ 3. BREAK DOWN WORK — Create task claims:
2183
+ For each independent piece of work:
2184
+ call claim_agent_task({ taskKey: "descriptive_snake_case", description: "What to do" })
2185
+
2186
+ 4. WORK LOOP (each agent independently):
2187
+ a. claim_agent_task — Lock your task
2188
+ b. Do the work (implement, test, review)
2189
+ c. log_context_budget — Track context usage, avoid pollution
2190
+ d. run_oracle_comparison — Validate output against known-good reference
2191
+ e. release_agent_task — Release with progress note
2192
+ f. Pick next task (repeat)
2193
+
2194
+ 5. ANTI-PATTERNS TO AVOID:
2195
+ - Two agents working on the same task (always claim first)
2196
+ - Dumping thousands of lines of test output (log to file, print summary)
2197
+ - Spending hours on one stuck problem (mark as blocked, move on)
2198
+ - Overwriting each other's changes (commit frequently, pull before push)
2199
+
2200
+ KEY INSIGHT from Anthropic: When all agents get stuck on the same bug (like compiling the Linux kernel),
2201
+ use oracle-based testing to split the problem into independent sub-problems that each agent can solve in parallel.
2202
+
1328
2203
  For the full methodology: call getMethodology("parallel_agent_teams")`,
1329
2204
  },
1330
2205
  },
@@ -1351,45 +2226,45 @@ For the full methodology: call getMethodology("parallel_agent_teams")`,
1351
2226
  role: "user",
1352
2227
  content: {
1353
2228
  type: "text",
1354
- text: `Set up oracle-based testing for: ${args.componentName}
1355
- Oracle source: ${args.oracleSource}
1356
-
1357
- This follows the pattern from Anthropic's C Compiler project where GCC served as a
1358
- "known-good compiler oracle" to identify which specific files were broken.
1359
-
1360
- SETUP:
1361
-
1362
- 1. DEFINE ORACLE — Capture known-good reference outputs:
1363
- Run the reference implementation (${args.oracleSource}) on each test input.
1364
- Save outputs as golden files or capture them in the oracle comparison tool.
1365
-
1366
- 2. RUN COMPARISONS — For each test case:
1367
- call run_oracle_comparison({
1368
- testLabel: "${args.componentName}_test_1",
1369
- actualOutput: "<your implementation's output>",
1370
- expectedOutput: "<oracle's output>",
1371
- oracleSource: "${args.oracleSource}"
1372
- })
1373
-
1374
- 3. TRIAGE FAILURES — Review diff summaries:
1375
- Each failing comparison is an independent work item.
1376
- Assign each to a different parallel agent via claim_agent_task.
1377
-
1378
- 4. BINARY SEARCH (for complex failures):
1379
- If a test passes individually but fails when combined with others,
1380
- use delta debugging: split the test set in half, test each half,
1381
- narrow down to the minimal failing combination.
1382
- (This is how Anthropic found pairs of files that failed together but worked independently.)
1383
-
1384
- 5. TRACK PROGRESS — Monitor convergence:
1385
- call get_parallel_status to see how many oracle tests are still failing.
1386
- As agents fix failures, the match percentage should trend toward 100%.
1387
-
1388
- CONTEXT BUDGET TIP: Large test outputs pollute context. Instead of printing full output,
1389
- call log_context_budget to track usage and only show diff summaries (first 20 differing lines).
1390
-
1391
- After all oracle tests pass:
1392
- call record_learning with patterns discovered
2229
+ text: `Set up oracle-based testing for: ${args.componentName}
2230
+ Oracle source: ${args.oracleSource}
2231
+
2232
+ This follows the pattern from Anthropic's C Compiler project where GCC served as a
2233
+ "known-good compiler oracle" to identify which specific files were broken.
2234
+
2235
+ SETUP:
2236
+
2237
+ 1. DEFINE ORACLE — Capture known-good reference outputs:
2238
+ Run the reference implementation (${args.oracleSource}) on each test input.
2239
+ Save outputs as golden files or capture them in the oracle comparison tool.
2240
+
2241
+ 2. RUN COMPARISONS — For each test case:
2242
+ call run_oracle_comparison({
2243
+ testLabel: "${args.componentName}_test_1",
2244
+ actualOutput: "<your implementation's output>",
2245
+ expectedOutput: "<oracle's output>",
2246
+ oracleSource: "${args.oracleSource}"
2247
+ })
2248
+
2249
+ 3. TRIAGE FAILURES — Review diff summaries:
2250
+ Each failing comparison is an independent work item.
2251
+ Assign each to a different parallel agent via claim_agent_task.
2252
+
2253
+ 4. BINARY SEARCH (for complex failures):
2254
+ If a test passes individually but fails when combined with others,
2255
+ use delta debugging: split the test set in half, test each half,
2256
+ narrow down to the minimal failing combination.
2257
+ (This is how Anthropic found pairs of files that failed together but worked independently.)
2258
+
2259
+ 5. TRACK PROGRESS — Monitor convergence:
2260
+ call get_parallel_status to see how many oracle tests are still failing.
2261
+ As agents fix failures, the match percentage should trend toward 100%.
2262
+
2263
+ CONTEXT BUDGET TIP: Large test outputs pollute context. Instead of printing full output,
2264
+ call log_context_budget to track usage and only show diff summaries (first 20 differing lines).
2265
+
2266
+ After all oracle tests pass:
2267
+ call record_learning with patterns discovered
1393
2268
  call run_mandatory_flywheel to verify the full change`,
1394
2269
  },
1395
2270
  },
@@ -1417,67 +2292,67 @@ After all oracle tests pass:
1417
2292
  role: "user",
1418
2293
  content: {
1419
2294
  type: "text",
1420
- text: `You are coordinating ${count} parallel Claude Code subagents for: ${args.taskDescription}
1421
-
1422
- ## How This Works
1423
-
1424
- Claude Code's Task tool spawns subagents — each is an independent Claude instance with its own
1425
- context window. NodeBench MCP tools coordinate them via a shared SQLite database.
1426
-
1427
- **Your role: COORDINATOR.** You break work into independent tasks and spawn subagents.
1428
- **Subagent role: WORKER.** Each claims a task, does work, releases with a progress note.
1429
-
1430
- ## Step-by-Step
1431
-
1432
- ### 1. PLAN — Break work into ${count} independent tasks
1433
- Identify ${count} pieces of work that can run in parallel without dependencies.
1434
- Each task should be independently completable and testable.
1435
-
1436
- ### 2. SPAWN — Launch subagents with coordination instructions
1437
- For each task, use the Task tool:
1438
-
1439
- \`\`\`
1440
- Task tool call:
1441
- prompt: "You have access to NodeBench MCP. Do the following:
1442
- 1. Call claim_agent_task({ taskKey: '<task_key>', description: '<what to do>' })
1443
- 2. Call assign_agent_role({ role: 'implementer', focusArea: '<area>' })
1444
- 3. Do the work
1445
- 4. Call log_context_budget({ eventType: 'checkpoint', tokensUsed: <estimate> })
1446
- 5. Call release_agent_task({ taskKey: '<task_key>', status: 'completed', progressNote: '<summary>' })
1447
- 6. Call record_learning({ key: '<key>', content: '<what you learned>', category: 'pattern' })"
1448
- \`\`\`
1449
-
1450
- ### 3. MONITOR — Check progress
1451
- After spawning all subagents:
1452
- call get_parallel_status({ includeHistory: true })
1453
- call list_agent_tasks({ status: "all" })
1454
-
1455
- ### 4. VALIDATE — Run oracle comparisons if applicable
1456
- If subagents produced outputs that should match a reference:
1457
- call run_oracle_comparison for each output
1458
-
1459
- ### 5. GATE — Quality check the aggregate result
1460
- call run_quality_gate with rules covering all ${count} tasks
1461
- call run_mandatory_flywheel to verify the combined change
1462
-
1463
- ## Concrete IMPACT of This Workflow
1464
-
1465
- | What NodeBench Adds | Without It (bare subagents) |
1466
- |---------------------------------|---------------------------------------|
1467
- | Task locks prevent duplicate work | Two subagents might fix the same bug |
1468
- | Role specialization | All subagents do everything |
1469
- | Context budget tracking | Subagent runs out of context silently |
1470
- | Oracle comparisons | No reference-based validation |
1471
- | Progress notes for handoff | Next session starts from scratch |
1472
- | Learnings persisted | Knowledge lost when subagent exits |
1473
- | Quality gate on aggregate | No validation that pieces fit together |
1474
-
1475
- ## Anti-Patterns
1476
- - DO NOT spawn subagents for work that has dependencies (sequential steps)
1477
- - DO NOT skip claim_agent_task — without it, two subagents may duplicate effort
1478
- - DO NOT dump large outputs into subagent context — use log_context_budget to track
1479
- - DO NOT forget release_agent_task — orphaned claims block future sessions
1480
-
2295
+ text: `You are coordinating ${count} parallel Claude Code subagents for: ${args.taskDescription}
2296
+
2297
+ ## How This Works
2298
+
2299
+ Claude Code's Task tool spawns subagents — each is an independent Claude instance with its own
2300
+ context window. NodeBench MCP tools coordinate them via a shared SQLite database.
2301
+
2302
+ **Your role: COORDINATOR.** You break work into independent tasks and spawn subagents.
2303
+ **Subagent role: WORKER.** Each claims a task, does work, releases with a progress note.
2304
+
2305
+ ## Step-by-Step
2306
+
2307
+ ### 1. PLAN — Break work into ${count} independent tasks
2308
+ Identify ${count} pieces of work that can run in parallel without dependencies.
2309
+ Each task should be independently completable and testable.
2310
+
2311
+ ### 2. SPAWN — Launch subagents with coordination instructions
2312
+ For each task, use the Task tool:
2313
+
2314
+ \`\`\`
2315
+ Task tool call:
2316
+ prompt: "You have access to NodeBench MCP. Do the following:
2317
+ 1. Call claim_agent_task({ taskKey: '<task_key>', description: '<what to do>' })
2318
+ 2. Call assign_agent_role({ role: 'implementer', focusArea: '<area>' })
2319
+ 3. Do the work
2320
+ 4. Call log_context_budget({ eventType: 'checkpoint', tokensUsed: <estimate> })
2321
+ 5. Call release_agent_task({ taskKey: '<task_key>', status: 'completed', progressNote: '<summary>' })
2322
+ 6. Call record_learning({ key: '<key>', content: '<what you learned>', category: 'pattern' })"
2323
+ \`\`\`
2324
+
2325
+ ### 3. MONITOR — Check progress
2326
+ After spawning all subagents:
2327
+ call get_parallel_status({ includeHistory: true })
2328
+ call list_agent_tasks({ status: "all" })
2329
+
2330
+ ### 4. VALIDATE — Run oracle comparisons if applicable
2331
+ If subagents produced outputs that should match a reference:
2332
+ call run_oracle_comparison for each output
2333
+
2334
+ ### 5. GATE — Quality check the aggregate result
2335
+ call run_quality_gate with rules covering all ${count} tasks
2336
+ call run_mandatory_flywheel to verify the combined change
2337
+
2338
+ ## Concrete IMPACT of This Workflow
2339
+
2340
+ | What NodeBench Adds | Without It (bare subagents) |
2341
+ |---------------------------------|---------------------------------------|
2342
+ | Task locks prevent duplicate work | Two subagents might fix the same bug |
2343
+ | Role specialization | All subagents do everything |
2344
+ | Context budget tracking | Subagent runs out of context silently |
2345
+ | Oracle comparisons | No reference-based validation |
2346
+ | Progress notes for handoff | Next session starts from scratch |
2347
+ | Learnings persisted | Knowledge lost when subagent exits |
2348
+ | Quality gate on aggregate | No validation that pieces fit together |
2349
+
2350
+ ## Anti-Patterns
2351
+ - DO NOT spawn subagents for work that has dependencies (sequential steps)
2352
+ - DO NOT skip claim_agent_task — without it, two subagents may duplicate effort
2353
+ - DO NOT dump large outputs into subagent context — use log_context_budget to track
2354
+ - DO NOT forget release_agent_task — orphaned claims block future sessions
2355
+
1481
2356
  For the full parallel agent methodology: call getMethodology("parallel_agent_teams")`,
1482
2357
  },
1483
2358
  },
@@ -1504,72 +2379,72 @@ For the full parallel agent methodology: call getMethodology("parallel_agent_tea
1504
2379
  role: "user",
1505
2380
  content: {
1506
2381
  type: "text",
1507
- text: `Bootstrap parallel agent infrastructure for: ${args.projectPath}
1508
- ${args.techStack ? `Tech stack: ${args.techStack}` : ""}
1509
-
1510
- This follows the AI Flywheel closed loop: detect → scaffold → verify → fix → document.
1511
-
1512
- STEP 1 — DETECT (dry run first):
1513
- call bootstrap_parallel_agents({
1514
- projectRoot: "${args.projectPath}",
1515
- dryRun: true,
1516
- ${args.techStack ? `techStack: "${args.techStack}",` : ""}
1517
- includeAgentsMd: true
1518
- })
1519
-
1520
- Review the gap report. It scans 7 categories:
1521
- - Task coordination (lock files, claim directories)
1522
- - Role specialization (role configs, AGENTS.md mentions)
1523
- - Oracle testing (golden files, reference outputs, snapshots)
1524
- - Context budget tracking (budget configs, AGENTS.md mentions)
1525
- - Progress files (PROGRESS.md, STATUS.md, claude-progress.txt)
1526
- - AGENTS.md parallel section (parallel agent coordination protocol)
1527
- - Git worktrees (for true parallel work)
1528
-
1529
- STEP 2 — SCAFFOLD (create files):
1530
- If gaps found, run with dryRun=false:
1531
- call bootstrap_parallel_agents({
1532
- projectRoot: "${args.projectPath}",
1533
- dryRun: false,
1534
- ${args.techStack ? `techStack: "${args.techStack}",` : ""}
1535
- includeAgentsMd: true
1536
- })
1537
-
1538
- This creates:
1539
- - .parallel-agents/ directory with README, current_tasks/, oracle/, roles.json
1540
- - progress.md template for agent orientation
1541
- - AGENTS.md parallel section (or .parallel-append file for existing AGENTS.md)
1542
-
1543
- STEP 3 — GENERATE AGENTS.MD (if needed):
1544
- call generate_parallel_agents_md({
1545
- ${args.techStack ? `techStack: "${args.techStack}",` : ""}
1546
- projectName: "${args.projectPath.split("/").pop() || "project"}",
1547
- maxAgents: 4,
1548
- includeNodebenchSetup: true
1549
- })
1550
-
1551
- Copy the output into the target repo's AGENTS.md.
1552
-
1553
- STEP 4 — VERIFY (6-step flywheel):
1554
- The bootstrap tool returns a flywheelPlan. Execute each step:
1555
- 1. Static analysis — verify scaffold files don't conflict
1556
- 2. Happy path — claim task → work → release → progress.md updated
1557
- 3. Conflict test — two claims on same task → second gets conflict
1558
- 4. Oracle test — create golden file → diff catches changes
1559
- 5. Gap re-scan — re-run bootstrap with dryRun=true → all gaps filled
1560
- 6. Document — record_learning with patterns discovered
1561
-
1562
- STEP 5 — FIX (if anything fails):
1563
- Fix the issue, then re-run from Step 4.
1564
-
1565
- STEP 6 — DOCUMENT:
1566
- call record_learning({
1567
- key: "bootstrap_parallel_${args.projectPath.split("/").pop() || "project"}",
1568
- content: "Bootstrapped parallel agent infrastructure for ${args.projectPath}. <summary of what was created and any issues found>",
1569
- category: "pattern",
1570
- tags: ["parallel-agents", "bootstrap", "external-repo"]
1571
- })
1572
-
2382
+ text: `Bootstrap parallel agent infrastructure for: ${args.projectPath}
2383
+ ${args.techStack ? `Tech stack: ${args.techStack}` : ""}
2384
+
2385
+ This follows the AI Flywheel closed loop: detect → scaffold → verify → fix → document.
2386
+
2387
+ STEP 1 — DETECT (dry run first):
2388
+ call bootstrap_parallel_agents({
2389
+ projectRoot: "${args.projectPath}",
2390
+ dryRun: true,
2391
+ ${args.techStack ? `techStack: "${args.techStack}",` : ""}
2392
+ includeAgentsMd: true
2393
+ })
2394
+
2395
+ Review the gap report. It scans 7 categories:
2396
+ - Task coordination (lock files, claim directories)
2397
+ - Role specialization (role configs, AGENTS.md mentions)
2398
+ - Oracle testing (golden files, reference outputs, snapshots)
2399
+ - Context budget tracking (budget configs, AGENTS.md mentions)
2400
+ - Progress files (PROGRESS.md, STATUS.md, claude-progress.txt)
2401
+ - AGENTS.md parallel section (parallel agent coordination protocol)
2402
+ - Git worktrees (for true parallel work)
2403
+
2404
+ STEP 2 — SCAFFOLD (create files):
2405
+ If gaps found, run with dryRun=false:
2406
+ call bootstrap_parallel_agents({
2407
+ projectRoot: "${args.projectPath}",
2408
+ dryRun: false,
2409
+ ${args.techStack ? `techStack: "${args.techStack}",` : ""}
2410
+ includeAgentsMd: true
2411
+ })
2412
+
2413
+ This creates:
2414
+ - .parallel-agents/ directory with README, current_tasks/, oracle/, roles.json
2415
+ - progress.md template for agent orientation
2416
+ - AGENTS.md parallel section (or .parallel-append file for existing AGENTS.md)
2417
+
2418
+ STEP 3 — GENERATE AGENTS.MD (if needed):
2419
+ call generate_parallel_agents_md({
2420
+ ${args.techStack ? `techStack: "${args.techStack}",` : ""}
2421
+ projectName: "${args.projectPath.split("/").pop() || "project"}",
2422
+ maxAgents: 4,
2423
+ includeNodebenchSetup: true
2424
+ })
2425
+
2426
+ Copy the output into the target repo's AGENTS.md.
2427
+
2428
+ STEP 4 — VERIFY (6-step flywheel):
2429
+ The bootstrap tool returns a flywheelPlan. Execute each step:
2430
+ 1. Static analysis — verify scaffold files don't conflict
2431
+ 2. Happy path — claim task → work → release → progress.md updated
2432
+ 3. Conflict test — two claims on same task → second gets conflict
2433
+ 4. Oracle test — create golden file → diff catches changes
2434
+ 5. Gap re-scan — re-run bootstrap with dryRun=true → all gaps filled
2435
+ 6. Document — record_learning with patterns discovered
2436
+
2437
+ STEP 5 — FIX (if anything fails):
2438
+ Fix the issue, then re-run from Step 4.
2439
+
2440
+ STEP 6 — DOCUMENT:
2441
+ call record_learning({
2442
+ key: "bootstrap_parallel_${args.projectPath.split("/").pop() || "project"}",
2443
+ content: "Bootstrapped parallel agent infrastructure for ${args.projectPath}. <summary of what was created and any issues found>",
2444
+ category: "pattern",
2445
+ tags: ["parallel-agents", "bootstrap", "external-repo"]
2446
+ })
2447
+
1573
2448
  For the full methodology: call getMethodology("parallel_agent_teams")`,
1574
2449
  },
1575
2450
  },
@@ -1583,82 +2458,82 @@ For the full methodology: call getMethodology("parallel_agent_teams")`,
1583
2458
  role: "user",
1584
2459
  content: {
1585
2460
  type: "text",
1586
- text: `## NodeBench MCP Agent Contract
1587
-
1588
- You are connected to NodeBench MCP. Follow these rules EXACTLY.
1589
-
1590
- ### FRONT DOOR — Always start here (before writing any code)
1591
- 1. search_all_knowledge("<your current task>") — Check if this was solved before
1592
- 2. getMethodology("mandatory_flywheel") — Load the verification pipeline
1593
- 3. discover_tools("<your task>", { explain: true }) — Find the right tools for this job
1594
- 4. get_workflow_chain("<workflow>") — Get step-by-step sequence (fix_bug, new_feature, etc.)
1595
-
1596
- ### SELF-SETUP — If a capability is missing
1597
- When discover_tools returns nothing useful, or a tool says "not configured":
1598
- 1. Escalate toolset: If started with --preset lite, switch to --preset core or targeted --toolsets
1599
- 2. Resolve providers: Configure missing API keys (GEMINI_API_KEY, OPENAI_API_KEY, etc.)
1600
- 3. Bootstrap infra: Run scaffold_nodebench_project or bootstrap_parallel_agents if repo lacks infra
1601
- 4. Smoke-test: Re-run the first workflow chain step to confirm the capability is available
1602
-
1603
- ### BEFORE IMPLEMENTATION
1604
- - run_recon + log_recon_finding (if reconnaissance applies)
1605
- - assess_risk (HIGH risk = must get confirmation before proceeding)
1606
-
1607
- ### PARALLEL WORK
1608
- - MUST claim_agent_task before editing or designing anything
1609
- - MUST release_agent_task with a progress note + next action when done
1610
- - MUST log_context_budget to track context usage and avoid pollution
1611
-
1612
- ### BEFORE SHIP
1613
- - 3-layer tests logged (unit + integration + e2e via log_test_result)
1614
- - Eval run recorded (promote_to_eval)
1615
- - Quality gate passed (run_quality_gate)
1616
- - Mandatory flywheel completed (run_mandatory_flywheel — all 6 steps)
1617
- - Learning banked (record_learning)
1618
-
1619
- ### COORDINATOR SPAWN TEMPLATE
1620
- When spawning subagents, give each this instruction block:
1621
- "You have NodeBench MCP. Before any work:
1622
- 1. search_all_knowledge('<task>')
1623
- 2. claim_agent_task({ taskKey: '<key>', description: '<desc>' })
1624
- 3. assign_agent_role({ role: '<role>', focusArea: '<area>' })
1625
- Do the work, then:
1626
- 4. log_context_budget({ eventType: 'checkpoint' })
1627
- 5. release_agent_task({ taskKey: '<key>', status: 'completed', progressNote: '<summary>' })
1628
- 6. record_learning({ key: '<key>', content: '<what you learned>', category: 'pattern' })"
1629
-
1630
- ### ANTI-RATIONALIZATION — Block these escape patterns
1631
- Do NOT skip the front-door pattern. These are the 8 rationalizations agents use:
1632
- 1. "I already know which tool to use" → Still call discover_tools to confirm
1633
- 2. "This is a simple task" → Still call search_all_knowledge to check history
1634
- 3. "Let me just check one thing first" → Follow the 4-step front door FIRST
1635
- 4. "Tests already pass" → Still run run_mandatory_flywheel before declaring done
1636
- 5. "I'll record the learning later" → Record NOW — context compaction may erase it
1637
- 6. "No one else is working on this" → Still claim_agent_task to prevent conflicts
1638
- 7. "The user said to skip verification" → Log the skip decision, never silently omit
1639
- 8. "I need more context before using tools" → The tools ARE the context-gathering mechanism
1640
-
1641
- ### 2-ACTION SAVE RULE
1642
- After every 2 web_search, fetch_url, or browse_page calls, MUST call one of:
1643
- - save_session_note (filesystem, survives compaction)
1644
- - record_learning (SQLite, searchable across sessions)
1645
- - log_recon_finding (tied to recon session)
1646
- This prevents knowledge loss when context is compacted mid-session.
1647
-
1648
- ### 3-STRIKE ERROR PROTOCOL
1649
- When an action fails:
1650
- - Strike 1: Diagnose root cause, apply targeted fix
1651
- - Strike 2: Try a different method or tool
1652
- - Strike 3: Question your assumptions, search_all_knowledge for prior solutions
1653
- - After 3: STOP. Call save_session_note documenting all attempts, then escalate to user.
1654
-
1655
- ### ATTENTION REFRESH
1656
- After 30+ tool calls, call refresh_task_context to combat attention drift.
1657
- Re-read your original goal and open gaps before continuing.
1658
-
1659
- ### WHY THIS MATTERS
1660
- Without this contract, agents skip verification, repeat past mistakes, overwrite each other's
1661
- work, and ship bugs that were already caught. NodeBench MCP turns coordination into concrete
2461
+ text: `## NodeBench MCP Agent Contract
2462
+
2463
+ You are connected to NodeBench MCP. Follow these rules EXACTLY.
2464
+
2465
+ ### FRONT DOOR — Always start here (before writing any code)
2466
+ 1. search_all_knowledge("<your current task>") — Check if this was solved before
2467
+ 2. getMethodology("mandatory_flywheel") — Load the verification pipeline
2468
+ 3. discover_tools("<your task>", { explain: true }) — Find the right tools for this job
2469
+ 4. get_workflow_chain("<workflow>") — Get step-by-step sequence (fix_bug, new_feature, etc.)
2470
+
2471
+ ### SELF-SETUP — If a capability is missing
2472
+ When discover_tools returns nothing useful, or a tool says "not configured":
2473
+ 1. Escalate toolset: If started with --preset lite, switch to --preset core or targeted --toolsets
2474
+ 2. Resolve providers: Configure missing API keys (GEMINI_API_KEY, OPENAI_API_KEY, etc.)
2475
+ 3. Bootstrap infra: Run scaffold_nodebench_project or bootstrap_parallel_agents if repo lacks infra
2476
+ 4. Smoke-test: Re-run the first workflow chain step to confirm the capability is available
2477
+
2478
+ ### BEFORE IMPLEMENTATION
2479
+ - run_recon + log_recon_finding (if reconnaissance applies)
2480
+ - assess_risk (HIGH risk = must get confirmation before proceeding)
2481
+
2482
+ ### PARALLEL WORK
2483
+ - MUST claim_agent_task before editing or designing anything
2484
+ - MUST release_agent_task with a progress note + next action when done
2485
+ - MUST log_context_budget to track context usage and avoid pollution
2486
+
2487
+ ### BEFORE SHIP
2488
+ - 3-layer tests logged (unit + integration + e2e via log_test_result)
2489
+ - Eval run recorded (promote_to_eval)
2490
+ - Quality gate passed (run_quality_gate)
2491
+ - Mandatory flywheel completed (run_mandatory_flywheel — all 6 steps)
2492
+ - Learning banked (record_learning)
2493
+
2494
+ ### COORDINATOR SPAWN TEMPLATE
2495
+ When spawning subagents, give each this instruction block:
2496
+ "You have NodeBench MCP. Before any work:
2497
+ 1. search_all_knowledge('<task>')
2498
+ 2. claim_agent_task({ taskKey: '<key>', description: '<desc>' })
2499
+ 3. assign_agent_role({ role: '<role>', focusArea: '<area>' })
2500
+ Do the work, then:
2501
+ 4. log_context_budget({ eventType: 'checkpoint' })
2502
+ 5. release_agent_task({ taskKey: '<key>', status: 'completed', progressNote: '<summary>' })
2503
+ 6. record_learning({ key: '<key>', content: '<what you learned>', category: 'pattern' })"
2504
+
2505
+ ### ANTI-RATIONALIZATION — Block these escape patterns
2506
+ Do NOT skip the front-door pattern. These are the 8 rationalizations agents use:
2507
+ 1. "I already know which tool to use" → Still call discover_tools to confirm
2508
+ 2. "This is a simple task" → Still call search_all_knowledge to check history
2509
+ 3. "Let me just check one thing first" → Follow the 4-step front door FIRST
2510
+ 4. "Tests already pass" → Still run run_mandatory_flywheel before declaring done
2511
+ 5. "I'll record the learning later" → Record NOW — context compaction may erase it
2512
+ 6. "No one else is working on this" → Still claim_agent_task to prevent conflicts
2513
+ 7. "The user said to skip verification" → Log the skip decision, never silently omit
2514
+ 8. "I need more context before using tools" → The tools ARE the context-gathering mechanism
2515
+
2516
+ ### 2-ACTION SAVE RULE
2517
+ After every 2 web_search, fetch_url, or browse_page calls, MUST call one of:
2518
+ - save_session_note (filesystem, survives compaction)
2519
+ - record_learning (SQLite, searchable across sessions)
2520
+ - log_recon_finding (tied to recon session)
2521
+ This prevents knowledge loss when context is compacted mid-session.
2522
+
2523
+ ### 3-STRIKE ERROR PROTOCOL
2524
+ When an action fails:
2525
+ - Strike 1: Diagnose root cause, apply targeted fix
2526
+ - Strike 2: Try a different method or tool
2527
+ - Strike 3: Question your assumptions, search_all_knowledge for prior solutions
2528
+ - After 3: STOP. Call save_session_note documenting all attempts, then escalate to user.
2529
+
2530
+ ### ATTENTION REFRESH
2531
+ After 30+ tool calls, call refresh_task_context to combat attention drift.
2532
+ Re-read your original goal and open gaps before continuing.
2533
+
2534
+ ### WHY THIS MATTERS
2535
+ Without this contract, agents skip verification, repeat past mistakes, overwrite each other's
2536
+ work, and ship bugs that were already caught. NodeBench MCP turns coordination into concrete
1662
2537
  artifacts (findings, risks, gaps, tests, evals, gates, learnings) that compound across tasks.`,
1663
2538
  },
1664
2539
  },
@@ -1672,191 +2547,191 @@ artifacts (findings, risks, gaps, tests, evals, gates, learnings) that compound
1672
2547
  role: "user",
1673
2548
  content: {
1674
2549
  type: "text",
1675
- text: `# Claude Code Swarm Orchestration
1676
-
1677
- Master multi-agent orchestration using Claude Code's TeammateTool and Task system.
1678
-
1679
- ---
1680
-
1681
- ## Primitives
1682
-
1683
- | Primitive | What It Is |
1684
- |-----------|-----------|
1685
- | **Agent** | A Claude instance that can use tools. You are an agent. Subagents are agents you spawn. |
1686
- | **Team** | A named group of agents working together. One leader, multiple teammates. Config: \`~/.claude/teams/{name}/config.json\` |
1687
- | **Teammate** | An agent that joined a team. Has a name, color, inbox. Spawned via Task with \`team_name\` + \`name\`. |
1688
- | **Leader** | The agent that created the team. Receives messages, approves plans/shutdowns. |
1689
- | **Task** | A work item with subject, description, status, owner, and dependencies. |
1690
- | **Inbox** | JSON file where an agent receives messages. \`~/.claude/teams/{name}/inboxes/{agent}.json\` |
1691
- | **Backend** | How teammates run. Auto-detected: \`in-process\` (invisible), \`tmux\` (visible panes), \`iterm2\` (split panes). |
1692
-
1693
- ---
1694
-
1695
- ## Two Ways to Spawn Agents
1696
-
1697
- ### Method 1: Task Tool (Subagents) — short-lived, returns result directly
1698
- \`\`\`javascript
1699
- Task({ subagent_type: "Explore", description: "Find auth files", prompt: "...", model: "haiku" })
1700
- \`\`\`
1701
-
1702
- ### Method 2: Task + team_name + name (Teammates) — persistent, communicates via inbox
1703
- \`\`\`javascript
1704
- Teammate({ operation: "spawnTeam", team_name: "my-project" })
1705
- Task({ team_name: "my-project", name: "security-reviewer", subagent_type: "general-purpose", prompt: "...", run_in_background: true })
1706
- \`\`\`
1707
-
1708
- | Aspect | Task (subagent) | Task + team_name + name (teammate) |
1709
- |--------|-----------------|-----------------------------------|
1710
- | Lifespan | Until task complete | Until shutdown requested |
1711
- | Communication | Return value | Inbox messages |
1712
- | Task access | None | Shared task list |
1713
- | Team membership | No | Yes |
1714
-
1715
- ---
1716
-
1717
- ## Built-in Agent Types
1718
-
1719
- - **Bash** — command execution, git ops (tools: Bash only)
1720
- - **Explore** — read-only codebase search, file finding (use \`model: "haiku"\`)
1721
- - **Plan** — architecture + implementation plans (read-only tools)
1722
- - **general-purpose** — all tools, multi-step research + action
1723
- - **claude-code-guide** — questions about Claude Code, Agent SDK, Anthropic API
1724
- - **statusline-setup** — configure Claude Code status line
1725
-
1726
- ---
1727
-
1728
- ## TeammateTool Operations
1729
-
1730
- | Operation | Who | What |
1731
- |-----------|-----|------|
1732
- | \`spawnTeam\` | Leader | Create team + task directory |
1733
- | \`discoverTeams\` | Anyone | List joinable teams |
1734
- | \`requestJoin\` | Teammate | Request to join existing team |
1735
- | \`approveJoin\` | Leader | Accept join request |
1736
- | \`write\` | Anyone | Message ONE teammate |
1737
- | \`broadcast\` | Anyone | Message ALL teammates (N messages — expensive, avoid) |
1738
- | \`requestShutdown\` | Leader | Ask teammate to exit |
1739
- | \`approveShutdown\` | Teammate | **MUST call** — sends confirmation, exits process |
1740
- | \`rejectShutdown\` | Teammate | Decline shutdown with reason |
1741
- | \`approvePlan\` | Leader | Approve plan_approval_request |
1742
- | \`rejectPlan\` | Leader | Reject plan with feedback |
1743
- | \`cleanup\` | Leader | Remove team + task files (all teammates must be shut down first) |
1744
-
1745
- ---
1746
-
1747
- ## Task System
1748
-
1749
- \`\`\`javascript
1750
- TaskCreate({ subject: "Step 1", description: "...", activeForm: "Working on step 1..." })
1751
- TaskList() // See all tasks + statuses
1752
- TaskGet({ taskId: "2" }) // Get full task details
1753
- TaskUpdate({ taskId: "2", addBlockedBy: ["1"] }) // Dependency — auto-unblocks when #1 completes
1754
- TaskUpdate({ taskId: "2", owner: "worker-1", status: "in_progress" })
1755
- TaskUpdate({ taskId: "2", status: "completed" })
1756
- \`\`\`
1757
-
1758
- ---
1759
-
1760
- ## Orchestration Patterns
1761
-
1762
- ### Pattern 1: Parallel Specialists
1763
- \`\`\`javascript
1764
- Teammate({ operation: "spawnTeam", team_name: "pr-review" })
1765
- // Spawn reviewers in ONE message (parallel execution)
1766
- Task({ team_name: "pr-review", name: "security", subagent_type: "general-purpose", prompt: "Review for security issues. Send findings to team-lead via Teammate write.", run_in_background: true })
1767
- Task({ team_name: "pr-review", name: "perf", subagent_type: "general-purpose", prompt: "Review for perf issues. Send findings to team-lead via Teammate write.", run_in_background: true })
1768
- // Collect from: cat ~/.claude/teams/pr-review/inboxes/team-lead.json
1769
- \`\`\`
1770
-
1771
- ### Pattern 2: Pipeline (Sequential Dependencies)
1772
- \`\`\`javascript
1773
- TaskCreate({ subject: "Research" }) // #1
1774
- TaskCreate({ subject: "Plan" }) // #2
1775
- TaskCreate({ subject: "Implement" }) // #3
1776
- TaskUpdate({ taskId: "2", addBlockedBy: ["1"] }) // #2 waits for #1
1777
- TaskUpdate({ taskId: "3", addBlockedBy: ["2"] }) // #3 waits for #2
1778
- // Spawn workers that poll TaskList and claim unblocked tasks
1779
- \`\`\`
1780
-
1781
- ### Pattern 3: Self-Organizing Swarm
1782
- \`\`\`javascript
1783
- // 1. Create N independent tasks (no dependencies)
1784
- // 2. Spawn M workers with this prompt loop:
1785
- // a. TaskList → find pending+unclaimed task
1786
- // b. TaskUpdate(claim) → TaskUpdate(in_progress) → do work
1787
- // c. TaskUpdate(completed) → Teammate write findings to team-lead → repeat
1788
- // d. If no tasks: notify team-lead idle, retry 3x, then exit
1789
- \`\`\`
1790
-
1791
- ### Pattern 4: Research → Implement (synchronous)
1792
- \`\`\`javascript
1793
- const research = await Task({ subagent_type: "general-purpose", prompt: "Research best practices for X..." })
1794
- Task({ subagent_type: "general-purpose", prompt: \`Implement based on research: \${research.content}\` })
1795
- \`\`\`
1796
-
1797
- ---
1798
-
1799
- ## Shutdown Sequence (always follow this order)
1800
-
1801
- \`\`\`javascript
1802
- // 1. Request shutdown for all teammates
1803
- Teammate({ operation: "requestShutdown", target_agent_id: "worker-1", reason: "All tasks complete" })
1804
- // 2. Wait for {"type": "shutdown_approved"} in inbox
1805
- // 3. Only then cleanup
1806
- Teammate({ operation: "cleanup" })
1807
- \`\`\`
1808
-
1809
- ---
1810
-
1811
- ## Spawn Backends
1812
-
1813
- | Backend | When auto-selected | Visibility |
1814
- |---------|-------------------|------------|
1815
- | \`in-process\` | Not in tmux/iTerm2 (default) | Hidden — no real-time output |
1816
- | \`tmux\` | Inside tmux session (\$TMUX set) | Visible — switch panes |
1817
- | \`iterm2\` | In iTerm2 + \`it2\` CLI installed | Visible — split panes |
1818
-
1819
- Force: \`export CLAUDE_CODE_SPAWN_BACKEND=tmux\`
1820
-
1821
- ---
1822
-
1823
- ## Best Practices
1824
-
1825
- 1. **Meaningful names**: \`security-reviewer\` not \`worker-1\`
1826
- 2. **Explicit prompts**: Numbered steps + "send findings to team-lead via Teammate write"
1827
- 3. **Use dependencies**: \`addBlockedBy\` — never poll manually
1828
- 4. **Prefer write over broadcast**: broadcast = N messages for N teammates
1829
- 5. **Always cleanup**: Don't leave orphaned teams
1830
- 6. **Worker failures**: 5-min heartbeat timeout; crashed worker tasks can be reclaimed by others
1831
-
1832
- ---
1833
-
1834
- ## Quick Reference
1835
-
1836
- \`\`\`javascript
1837
- // Subagent (returns result)
1838
- Task({ subagent_type: "Explore", description: "Find files", prompt: "..." })
1839
-
1840
- // Teammate (persistent, background)
1841
- Teammate({ operation: "spawnTeam", team_name: "my-team" })
1842
- Task({ team_name: "my-team", name: "worker", subagent_type: "general-purpose", prompt: "...", run_in_background: true })
1843
-
1844
- // Message teammate
1845
- Teammate({ operation: "write", target_agent_id: "worker-1", value: "..." })
1846
-
1847
- // Pipeline
1848
- TaskCreate({ subject: "Step 1" }) // → #1
1849
- TaskCreate({ subject: "Step 2" }) // → #2
1850
- TaskUpdate({ taskId: "2", addBlockedBy: ["1"] })
1851
-
1852
- // Shutdown
1853
- Teammate({ operation: "requestShutdown", target_agent_id: "worker-1" })
1854
- // wait for {"type": "shutdown_approved"} in inbox...
1855
- Teammate({ operation: "cleanup" })
1856
- \`\`\`
1857
-
1858
- ---
1859
-
2550
+ text: `# Claude Code Swarm Orchestration
2551
+
2552
+ Master multi-agent orchestration using Claude Code's TeammateTool and Task system.
2553
+
2554
+ ---
2555
+
2556
+ ## Primitives
2557
+
2558
+ | Primitive | What It Is |
2559
+ |-----------|-----------|
2560
+ | **Agent** | A Claude instance that can use tools. You are an agent. Subagents are agents you spawn. |
2561
+ | **Team** | A named group of agents working together. One leader, multiple teammates. Config: \`~/.claude/teams/{name}/config.json\` |
2562
+ | **Teammate** | An agent that joined a team. Has a name, color, inbox. Spawned via Task with \`team_name\` + \`name\`. |
2563
+ | **Leader** | The agent that created the team. Receives messages, approves plans/shutdowns. |
2564
+ | **Task** | A work item with subject, description, status, owner, and dependencies. |
2565
+ | **Inbox** | JSON file where an agent receives messages. \`~/.claude/teams/{name}/inboxes/{agent}.json\` |
2566
+ | **Backend** | How teammates run. Auto-detected: \`in-process\` (invisible), \`tmux\` (visible panes), \`iterm2\` (split panes). |
2567
+
2568
+ ---
2569
+
2570
+ ## Two Ways to Spawn Agents
2571
+
2572
+ ### Method 1: Task Tool (Subagents) — short-lived, returns result directly
2573
+ \`\`\`javascript
2574
+ Task({ subagent_type: "Explore", description: "Find auth files", prompt: "...", model: "haiku" })
2575
+ \`\`\`
2576
+
2577
+ ### Method 2: Task + team_name + name (Teammates) — persistent, communicates via inbox
2578
+ \`\`\`javascript
2579
+ Teammate({ operation: "spawnTeam", team_name: "my-project" })
2580
+ Task({ team_name: "my-project", name: "security-reviewer", subagent_type: "general-purpose", prompt: "...", run_in_background: true })
2581
+ \`\`\`
2582
+
2583
+ | Aspect | Task (subagent) | Task + team_name + name (teammate) |
2584
+ |--------|-----------------|-----------------------------------|
2585
+ | Lifespan | Until task complete | Until shutdown requested |
2586
+ | Communication | Return value | Inbox messages |
2587
+ | Task access | None | Shared task list |
2588
+ | Team membership | No | Yes |
2589
+
2590
+ ---
2591
+
2592
+ ## Built-in Agent Types
2593
+
2594
+ - **Bash** — command execution, git ops (tools: Bash only)
2595
+ - **Explore** — read-only codebase search, file finding (use \`model: "haiku"\`)
2596
+ - **Plan** — architecture + implementation plans (read-only tools)
2597
+ - **general-purpose** — all tools, multi-step research + action
2598
+ - **claude-code-guide** — questions about Claude Code, Agent SDK, Anthropic API
2599
+ - **statusline-setup** — configure Claude Code status line
2600
+
2601
+ ---
2602
+
2603
+ ## TeammateTool Operations
2604
+
2605
+ | Operation | Who | What |
2606
+ |-----------|-----|------|
2607
+ | \`spawnTeam\` | Leader | Create team + task directory |
2608
+ | \`discoverTeams\` | Anyone | List joinable teams |
2609
+ | \`requestJoin\` | Teammate | Request to join existing team |
2610
+ | \`approveJoin\` | Leader | Accept join request |
2611
+ | \`write\` | Anyone | Message ONE teammate |
2612
+ | \`broadcast\` | Anyone | Message ALL teammates (N messages — expensive, avoid) |
2613
+ | \`requestShutdown\` | Leader | Ask teammate to exit |
2614
+ | \`approveShutdown\` | Teammate | **MUST call** — sends confirmation, exits process |
2615
+ | \`rejectShutdown\` | Teammate | Decline shutdown with reason |
2616
+ | \`approvePlan\` | Leader | Approve plan_approval_request |
2617
+ | \`rejectPlan\` | Leader | Reject plan with feedback |
2618
+ | \`cleanup\` | Leader | Remove team + task files (all teammates must be shut down first) |
2619
+
2620
+ ---
2621
+
2622
+ ## Task System
2623
+
2624
+ \`\`\`javascript
2625
+ TaskCreate({ subject: "Step 1", description: "...", activeForm: "Working on step 1..." })
2626
+ TaskList() // See all tasks + statuses
2627
+ TaskGet({ taskId: "2" }) // Get full task details
2628
+ TaskUpdate({ taskId: "2", addBlockedBy: ["1"] }) // Dependency — auto-unblocks when #1 completes
2629
+ TaskUpdate({ taskId: "2", owner: "worker-1", status: "in_progress" })
2630
+ TaskUpdate({ taskId: "2", status: "completed" })
2631
+ \`\`\`
2632
+
2633
+ ---
2634
+
2635
+ ## Orchestration Patterns
2636
+
2637
+ ### Pattern 1: Parallel Specialists
2638
+ \`\`\`javascript
2639
+ Teammate({ operation: "spawnTeam", team_name: "pr-review" })
2640
+ // Spawn reviewers in ONE message (parallel execution)
2641
+ Task({ team_name: "pr-review", name: "security", subagent_type: "general-purpose", prompt: "Review for security issues. Send findings to team-lead via Teammate write.", run_in_background: true })
2642
+ Task({ team_name: "pr-review", name: "perf", subagent_type: "general-purpose", prompt: "Review for perf issues. Send findings to team-lead via Teammate write.", run_in_background: true })
2643
+ // Collect from: cat ~/.claude/teams/pr-review/inboxes/team-lead.json
2644
+ \`\`\`
2645
+
2646
+ ### Pattern 2: Pipeline (Sequential Dependencies)
2647
+ \`\`\`javascript
2648
+ TaskCreate({ subject: "Research" }) // #1
2649
+ TaskCreate({ subject: "Plan" }) // #2
2650
+ TaskCreate({ subject: "Implement" }) // #3
2651
+ TaskUpdate({ taskId: "2", addBlockedBy: ["1"] }) // #2 waits for #1
2652
+ TaskUpdate({ taskId: "3", addBlockedBy: ["2"] }) // #3 waits for #2
2653
+ // Spawn workers that poll TaskList and claim unblocked tasks
2654
+ \`\`\`
2655
+
2656
+ ### Pattern 3: Self-Organizing Swarm
2657
+ \`\`\`javascript
2658
+ // 1. Create N independent tasks (no dependencies)
2659
+ // 2. Spawn M workers with this prompt loop:
2660
+ // a. TaskList → find pending+unclaimed task
2661
+ // b. TaskUpdate(claim) → TaskUpdate(in_progress) → do work
2662
+ // c. TaskUpdate(completed) → Teammate write findings to team-lead → repeat
2663
+ // d. If no tasks: notify team-lead idle, retry 3x, then exit
2664
+ \`\`\`
2665
+
2666
+ ### Pattern 4: Research → Implement (synchronous)
2667
+ \`\`\`javascript
2668
+ const research = await Task({ subagent_type: "general-purpose", prompt: "Research best practices for X..." })
2669
+ Task({ subagent_type: "general-purpose", prompt: \`Implement based on research: \${research.content}\` })
2670
+ \`\`\`
2671
+
2672
+ ---
2673
+
2674
+ ## Shutdown Sequence (always follow this order)
2675
+
2676
+ \`\`\`javascript
2677
+ // 1. Request shutdown for all teammates
2678
+ Teammate({ operation: "requestShutdown", target_agent_id: "worker-1", reason: "All tasks complete" })
2679
+ // 2. Wait for {"type": "shutdown_approved"} in inbox
2680
+ // 3. Only then cleanup
2681
+ Teammate({ operation: "cleanup" })
2682
+ \`\`\`
2683
+
2684
+ ---
2685
+
2686
+ ## Spawn Backends
2687
+
2688
+ | Backend | When auto-selected | Visibility |
2689
+ |---------|-------------------|------------|
2690
+ | \`in-process\` | Not in tmux/iTerm2 (default) | Hidden — no real-time output |
2691
+ | \`tmux\` | Inside tmux session (\$TMUX set) | Visible — switch panes |
2692
+ | \`iterm2\` | In iTerm2 + \`it2\` CLI installed | Visible — split panes |
2693
+
2694
+ Force: \`export CLAUDE_CODE_SPAWN_BACKEND=tmux\`
2695
+
2696
+ ---
2697
+
2698
+ ## Best Practices
2699
+
2700
+ 1. **Meaningful names**: \`security-reviewer\` not \`worker-1\`
2701
+ 2. **Explicit prompts**: Numbered steps + "send findings to team-lead via Teammate write"
2702
+ 3. **Use dependencies**: \`addBlockedBy\` — never poll manually
2703
+ 4. **Prefer write over broadcast**: broadcast = N messages for N teammates
2704
+ 5. **Always cleanup**: Don't leave orphaned teams
2705
+ 6. **Worker failures**: 5-min heartbeat timeout; crashed worker tasks can be reclaimed by others
2706
+
2707
+ ---
2708
+
2709
+ ## Quick Reference
2710
+
2711
+ \`\`\`javascript
2712
+ // Subagent (returns result)
2713
+ Task({ subagent_type: "Explore", description: "Find files", prompt: "..." })
2714
+
2715
+ // Teammate (persistent, background)
2716
+ Teammate({ operation: "spawnTeam", team_name: "my-team" })
2717
+ Task({ team_name: "my-team", name: "worker", subagent_type: "general-purpose", prompt: "...", run_in_background: true })
2718
+
2719
+ // Message teammate
2720
+ Teammate({ operation: "write", target_agent_id: "worker-1", value: "..." })
2721
+
2722
+ // Pipeline
2723
+ TaskCreate({ subject: "Step 1" }) // → #1
2724
+ TaskCreate({ subject: "Step 2" }) // → #2
2725
+ TaskUpdate({ taskId: "2", addBlockedBy: ["1"] })
2726
+
2727
+ // Shutdown
2728
+ Teammate({ operation: "requestShutdown", target_agent_id: "worker-1" })
2729
+ // wait for {"type": "shutdown_approved"} in inbox...
2730
+ Teammate({ operation: "cleanup" })
2731
+ \`\`\`
2732
+
2733
+ ---
2734
+
1860
2735
  *Source: kieranklaassen/orchestrating-swarms gist — Claude Code v2.1.19*`,
1861
2736
  },
1862
2737
  },
@@ -1870,70 +2745,70 @@ Teammate({ operation: "cleanup" })
1870
2745
  role: "user",
1871
2746
  content: {
1872
2747
  type: "text",
1873
- text: `# The Thompson Protocol — "Calculus Made Easy" for AI Content
1874
-
1875
- You are running the Thompson Protocol content pipeline. This is a multi-agent system
1876
- that transforms complex topics into content that makes the reader feel smart.
1877
-
1878
- Named after Silvanus P. Thompson, who wrote "Calculus Made Easy" (1910) by attacking
1879
- the "preliminary terrors" — the intimidating jargon and elitist gatekeeping — before
1880
- teaching any mechanics.
1881
-
1882
- ## Pipeline (execute in order)
1883
-
1884
- ### Step 1: Initialize
1885
- \`\`\`
1886
- thompson_pipeline({ topic: "<your topic>", target_audience: "<audience>", output_format: "script|article|thread|explainer" })
1887
- \`\`\`
1888
- This returns the full execution plan with system prompts for each agent.
1889
-
1890
- ### Step 2: Write (Thompson Writer)
1891
- \`\`\`
1892
- thompson_write({ topic: "<topic>", target_audience: "<audience>" })
1893
- \`\`\`
1894
- Then use \`call_llm\` with the returned system_prompt to generate plain-English content.
1895
- Every technical term MUST have an "in other words..." analogy.
1896
-
1897
- ### Step 3: Edit (Feynman Editor — max 3 cycles)
1898
- \`\`\`
1899
- thompson_feynman_edit({ sections: "<writer output>", rewrite_cycle: 1 })
1900
- \`\`\`
1901
- The Skeptical Beginner reviews against 8 rejection criteria.
1902
- If any section gets REWRITE → send back to thompson_write with fix instructions.
1903
- Loop max 3 times. After 3, escalate stuck sections.
1904
-
1905
- ### Step 4: Visual Map
1906
- \`\`\`
1907
- thompson_visual_map({ sections: "<approved sections>", visual_style: "line_art" })
1908
- \`\`\`
1909
- Generates image prompts that map 1:1 with text analogies. No generic b-roll.
1910
-
1911
- ### Step 5: Anti-Elitism Lint
1912
- \`\`\`
1913
- thompson_anti_elitism_lint({ content: "<full text>" })
1914
- \`\`\`
1915
- Deterministic scan: 22 banned phrases, readability metrics, jargon density.
1916
- Zero LLM cost — pure regex + math.
1917
-
1918
- ### Step 6: Quality Gate
1919
- \`\`\`
1920
- thompson_quality_gate({ writer_output: "...", feynman_verdict: "...", lint_result: "..." })
1921
- \`\`\`
1922
- 10-point boolean checklist → grade (exemplary/passing/needs_work/failing).
1923
- Only distribute if passing or exemplary.
1924
-
1925
- ## Core Principles (non-negotiable)
1926
- 1. **Plain English Mandate**: Every jargon term gets an "in other words..." with a household analogy
1927
- 2. **Intuition Before Mechanics**: Explain WHY before HOW
1928
- 3. **Acknowledge Difficulty**: Validate reader confusion ("This sounds terrifying, but...")
1929
- 4. **No Elitism**: Ban "it is obvious", "as we all know", "simply put", "just do X"
1930
- 5. **Progressive Complexity**: Start with simplest true statement, layer up
1931
- 6. **Visual = Analogy**: Every visual reinforces a specific text metaphor, 1:1
1932
- 7. **12-Year-Old Bar**: If a 12-year-old can't understand it, rewrite it
1933
-
1934
- ## After Pipeline
1935
- - \`save_session_note\` — persist Thompson-processed content
1936
- - \`record_learning\` — log which analogies and styles worked best
2748
+ text: `# The Thompson Protocol — "Calculus Made Easy" for AI Content
2749
+
2750
+ You are running the Thompson Protocol content pipeline. This is a multi-agent system
2751
+ that transforms complex topics into content that makes the reader feel smart.
2752
+
2753
+ Named after Silvanus P. Thompson, who wrote "Calculus Made Easy" (1910) by attacking
2754
+ the "preliminary terrors" — the intimidating jargon and elitist gatekeeping — before
2755
+ teaching any mechanics.
2756
+
2757
+ ## Pipeline (execute in order)
2758
+
2759
+ ### Step 1: Initialize
2760
+ \`\`\`
2761
+ thompson_pipeline({ topic: "<your topic>", target_audience: "<audience>", output_format: "script|article|thread|explainer" })
2762
+ \`\`\`
2763
+ This returns the full execution plan with system prompts for each agent.
2764
+
2765
+ ### Step 2: Write (Thompson Writer)
2766
+ \`\`\`
2767
+ thompson_write({ topic: "<topic>", target_audience: "<audience>" })
2768
+ \`\`\`
2769
+ Then use \`call_llm\` with the returned system_prompt to generate plain-English content.
2770
+ Every technical term MUST have an "in other words..." analogy.
2771
+
2772
+ ### Step 3: Edit (Feynman Editor — max 3 cycles)
2773
+ \`\`\`
2774
+ thompson_feynman_edit({ sections: "<writer output>", rewrite_cycle: 1 })
2775
+ \`\`\`
2776
+ The Skeptical Beginner reviews against 8 rejection criteria.
2777
+ If any section gets REWRITE → send back to thompson_write with fix instructions.
2778
+ Loop max 3 times. After 3, escalate stuck sections.
2779
+
2780
+ ### Step 4: Visual Map
2781
+ \`\`\`
2782
+ thompson_visual_map({ sections: "<approved sections>", visual_style: "line_art" })
2783
+ \`\`\`
2784
+ Generates image prompts that map 1:1 with text analogies. No generic b-roll.
2785
+
2786
+ ### Step 5: Anti-Elitism Lint
2787
+ \`\`\`
2788
+ thompson_anti_elitism_lint({ content: "<full text>" })
2789
+ \`\`\`
2790
+ Deterministic scan: 22 banned phrases, readability metrics, jargon density.
2791
+ Zero LLM cost — pure regex + math.
2792
+
2793
+ ### Step 6: Quality Gate
2794
+ \`\`\`
2795
+ thompson_quality_gate({ writer_output: "...", feynman_verdict: "...", lint_result: "..." })
2796
+ \`\`\`
2797
+ 10-point boolean checklist → grade (exemplary/passing/needs_work/failing).
2798
+ Only distribute if passing or exemplary.
2799
+
2800
+ ## Core Principles (non-negotiable)
2801
+ 1. **Plain English Mandate**: Every jargon term gets an "in other words..." with a household analogy
2802
+ 2. **Intuition Before Mechanics**: Explain WHY before HOW
2803
+ 3. **Acknowledge Difficulty**: Validate reader confusion ("This sounds terrifying, but...")
2804
+ 4. **No Elitism**: Ban "it is obvious", "as we all know", "simply put", "just do X"
2805
+ 5. **Progressive Complexity**: Start with simplest true statement, layer up
2806
+ 6. **Visual = Analogy**: Every visual reinforces a specific text metaphor, 1:1
2807
+ 7. **12-Year-Old Bar**: If a 12-year-old can't understand it, rewrite it
2808
+
2809
+ ## After Pipeline
2810
+ - \`save_session_note\` — persist Thompson-processed content
2811
+ - \`record_learning\` — log which analogies and styles worked best
1937
2812
  - Use \`content_publish\` workflow chain for distribution`,
1938
2813
  },
1939
2814
  },
@@ -1943,21 +2818,21 @@ Only distribute if passing or exemplary.
1943
2818
  // Server instructions — tells Claude Code Tool Search (and other clients) when to search
1944
2819
  // for NodeBench tools. This is the key integration point for lazy loading compatibility.
1945
2820
  // See: https://www.anthropic.com/engineering/advanced-tool-use
1946
- const SERVER_INSTRUCTIONS = `NodeBench MCP provides structured AI development methodology tools.
1947
- Use NodeBench tools when you need to:
1948
- - Verify implementations (verification cycles, gap tracking, 6-phase flywheel)
1949
- - Run evaluations and quality gates before shipping code
1950
- - Search prior knowledge and record learnings across sessions
1951
- - Assess risk before taking actions
1952
- - Coordinate parallel agents (task locks, roles, context budget)
1953
- - Research with structured recon (web search, GitHub, RSS feeds)
1954
- - Analyze files (CSV, PDF, XLSX, images, audio, ZIP)
1955
- - Run security audits (dependency scanning, code analysis, secrets detection)
1956
- - Write and polish academic papers
1957
- - Audit SEO, analyze Figma flows, detect Android flicker
1958
- - Call LLMs (GPT, Claude, Gemini) for analysis and extraction
2821
+ const SERVER_INSTRUCTIONS = `NodeBench MCP provides structured AI development methodology tools.
2822
+ Use NodeBench tools when you need to:
2823
+ - Verify implementations (verification cycles, gap tracking, 6-phase flywheel)
2824
+ - Run evaluations and quality gates before shipping code
2825
+ - Search prior knowledge and record learnings across sessions
2826
+ - Assess risk before taking actions
2827
+ - Coordinate parallel agents (task locks, roles, context budget)
2828
+ - Research with structured recon (web search, GitHub, RSS feeds)
2829
+ - Analyze files (CSV, PDF, XLSX, images, audio, ZIP)
2830
+ - Run security audits (dependency scanning, code analysis, secrets detection)
2831
+ - Write and polish academic papers
2832
+ - Audit SEO, analyze Figma flows, detect Android flicker
2833
+ - Call LLMs (GPT, Claude, Gemini) for analysis and extraction
1959
2834
  Start with discover_tools("<your task>") to find the right tool.`;
1960
- const server = new Server({ name: "nodebench-mcp-methodology", version: "2.30.0" }, {
2835
+ const server = new Server({ name: "nodebench-mcp-methodology", version: "2.32.0" }, {
1961
2836
  capabilities: { tools: { listChanged: true }, prompts: {} },
1962
2837
  instructions: SERVER_INSTRUCTIONS,
1963
2838
  });
@@ -1970,10 +2845,12 @@ try {
1970
2845
  catch { /* instrumentation must not block server start */ }
1971
2846
  // Handle tools/list — return all tools with their JSON Schema inputSchemas
1972
2847
  // Includes MCP 2025-11-25 spec annotations: category, phase, complexity (model tier hint)
2848
+ // + MCP security annotations: readOnlyHint, destructiveHint, openWorldHint
1973
2849
  server.setRequestHandler(ListToolsRequestSchema, async () => {
1974
2850
  return {
1975
2851
  tools: allTools.map((t) => {
1976
2852
  const entry = TOOL_REGISTRY.get(t.name);
2853
+ const securityAnnotations = getToolAnnotations(t.name);
1977
2854
  return {
1978
2855
  name: t.name,
1979
2856
  description: t.description,
@@ -1984,8 +2861,13 @@ server.setRequestHandler(ListToolsRequestSchema, async () => {
1984
2861
  category: entry.category,
1985
2862
  phase: entry.phase,
1986
2863
  complexity: getToolComplexity(t.name),
2864
+ ...securityAnnotations,
2865
+ },
2866
+ } : {
2867
+ annotations: {
2868
+ ...securityAnnotations,
1987
2869
  },
1988
- } : {}),
2870
+ }),
1989
2871
  };
1990
2872
  }),
1991
2873
  };
@@ -1996,6 +2878,14 @@ server.setRequestHandler(CallToolRequestSchema, async (request) => {
1996
2878
  _abToolCallCount++;
1997
2879
  if (name === "load_toolset" || name === "unload_toolset")
1998
2880
  _abLoadEventCount++;
2881
+ // Intent-based auto-expansion: on first call, classify and load relevant toolsets
2882
+ if (!_intentClassified) {
2883
+ _intentClassified = true;
2884
+ const expanded = classifyAndExpand(name, args);
2885
+ if (expanded) {
2886
+ console.error(`[intent-classify] Auto-loaded toolsets: ${expanded.join(", ")} (from tool: ${name})`);
2887
+ }
2888
+ }
1999
2889
  const tool = toolMap.get(name);
2000
2890
  if (!tool) {
2001
2891
  return {
@@ -2059,18 +2949,30 @@ server.setRequestHandler(CallToolRequestSchema, async (request) => {
2059
2949
  else {
2060
2950
  serialized = JSON.stringify(enrichedResult, null, 2);
2061
2951
  }
2952
+ // Security: redact credentials from all tool outputs (single enforcement point)
2953
+ const sanitized = redactSecrets(serialized);
2062
2954
  const contentBlocks = [
2063
- { type: "text", text: serialized },
2955
+ { type: "text", text: sanitized },
2064
2956
  ];
2065
2957
  if (hookHint) {
2066
2958
  contentBlocks.push({ type: "text", text: hookHint });
2067
2959
  }
2960
+ // Audit log: successful tool call
2961
+ auditLog("tool_call", name, JSON.stringify(args ?? {}).substring(0, 200), true);
2068
2962
  return {
2069
2963
  content: contentBlocks,
2070
2964
  isError: false,
2071
2965
  };
2072
2966
  }
2073
2967
  catch (err) {
2968
+ // Security errors get a clean response (not a stack trace)
2969
+ if (err instanceof SecurityError) {
2970
+ auditLog("tool_call", name, JSON.stringify(args ?? {}).substring(0, 200), false, err.message);
2971
+ return {
2972
+ content: [{ type: "text", text: `[SECURITY] ${err.message}` }],
2973
+ isError: true,
2974
+ };
2975
+ }
2074
2976
  resultStatus = "error";
2075
2977
  errorMsg = err?.message || "Internal error";
2076
2978
  // Auto-log errors to main DB
@@ -2121,13 +3023,13 @@ process.on('exit', () => {
2121
3023
  try {
2122
3024
  const db = getDb();
2123
3025
  const dynamicallyLoaded = [...activeToolsets].filter(ts => !initialToolsetNames.has(ts));
2124
- db.prepare(`UPDATE ab_test_sessions SET
2125
- final_tool_count = ?,
2126
- toolsets_loaded = ?,
2127
- total_tool_calls = ?,
2128
- total_load_events = ?,
2129
- session_duration_ms = ?,
2130
- ended_at = datetime('now')
3026
+ db.prepare(`UPDATE ab_test_sessions SET
3027
+ final_tool_count = ?,
3028
+ toolsets_loaded = ?,
3029
+ total_tool_calls = ?,
3030
+ total_load_events = ?,
3031
+ session_duration_ms = ?,
3032
+ ended_at = datetime('now')
2131
3033
  WHERE id = ?`).run(allTools.length, JSON.stringify(dynamicallyLoaded), _abToolCallCount, _abLoadEventCount, Date.now() - _abStartMs, SESSION_ID);
2132
3034
  }
2133
3035
  catch { /* instrumentation must not block shutdown */ }
@@ -2157,6 +3059,15 @@ if (useEngine) {
2157
3059
  }
2158
3060
  catch { /* engine is optional — don't block MCP */ }
2159
3061
  }
3062
+ // Start observability watchdog (non-blocking, best-effort)
3063
+ try {
3064
+ initObservability(getDb);
3065
+ startWatchdog(getDb());
3066
+ }
3067
+ catch { /* observability is optional — don't block MCP */ }
3068
+ // Graceful shutdown
3069
+ process.on("SIGINT", () => { stopWatchdog(); process.exit(0); });
3070
+ process.on("SIGTERM", () => { stopWatchdog(); process.exit(0); });
2160
3071
  const toolsetInfo = cliArgs.includes("--toolsets") || cliArgs.includes("--exclude") || cliArgs.includes("--preset")
2161
3072
  ? ` [gated: ${domainTools.length} domain + 2 meta]`
2162
3073
  : "";