nodebench-mcp 2.25.0 → 2.27.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (86) hide show
  1. package/NODEBENCH_AGENTS.md +5 -4
  2. package/README.md +145 -16
  3. package/dist/__tests__/architectComplex.test.js +3 -5
  4. package/dist/__tests__/architectComplex.test.js.map +1 -1
  5. package/dist/__tests__/batchAutopilot.test.d.ts +8 -0
  6. package/dist/__tests__/batchAutopilot.test.js +218 -0
  7. package/dist/__tests__/batchAutopilot.test.js.map +1 -0
  8. package/dist/__tests__/cliSubcommands.test.d.ts +1 -0
  9. package/dist/__tests__/cliSubcommands.test.js +138 -0
  10. package/dist/__tests__/cliSubcommands.test.js.map +1 -0
  11. package/dist/__tests__/evalHarness.test.js +1 -1
  12. package/dist/__tests__/forecastingDogfood.test.d.ts +9 -0
  13. package/dist/__tests__/forecastingDogfood.test.js +284 -0
  14. package/dist/__tests__/forecastingDogfood.test.js.map +1 -0
  15. package/dist/__tests__/forecastingScoring.test.d.ts +9 -0
  16. package/dist/__tests__/forecastingScoring.test.js +202 -0
  17. package/dist/__tests__/forecastingScoring.test.js.map +1 -0
  18. package/dist/__tests__/localDashboard.test.d.ts +1 -0
  19. package/dist/__tests__/localDashboard.test.js +226 -0
  20. package/dist/__tests__/localDashboard.test.js.map +1 -0
  21. package/dist/__tests__/multiHopDogfood.test.js +11 -11
  22. package/dist/__tests__/multiHopDogfood.test.js.map +1 -1
  23. package/dist/__tests__/openclawDogfood.test.d.ts +23 -0
  24. package/dist/__tests__/openclawDogfood.test.js +535 -0
  25. package/dist/__tests__/openclawDogfood.test.js.map +1 -0
  26. package/dist/__tests__/openclawMessaging.test.d.ts +14 -0
  27. package/dist/__tests__/openclawMessaging.test.js +232 -0
  28. package/dist/__tests__/openclawMessaging.test.js.map +1 -0
  29. package/dist/__tests__/presetRealWorldBench.test.js +0 -2
  30. package/dist/__tests__/presetRealWorldBench.test.js.map +1 -1
  31. package/dist/__tests__/tools.test.js +9 -157
  32. package/dist/__tests__/tools.test.js.map +1 -1
  33. package/dist/__tests__/toolsetGatingEval.test.js +0 -2
  34. package/dist/__tests__/toolsetGatingEval.test.js.map +1 -1
  35. package/dist/__tests__/traceabilityDogfood.test.d.ts +12 -0
  36. package/dist/__tests__/traceabilityDogfood.test.js +241 -0
  37. package/dist/__tests__/traceabilityDogfood.test.js.map +1 -0
  38. package/dist/__tests__/webmcpTools.test.d.ts +7 -0
  39. package/dist/__tests__/webmcpTools.test.js +195 -0
  40. package/dist/__tests__/webmcpTools.test.js.map +1 -0
  41. package/dist/dashboard/briefHtml.d.ts +20 -0
  42. package/dist/dashboard/briefHtml.js +1000 -0
  43. package/dist/dashboard/briefHtml.js.map +1 -0
  44. package/dist/dashboard/briefServer.d.ts +18 -0
  45. package/dist/dashboard/briefServer.js +320 -0
  46. package/dist/dashboard/briefServer.js.map +1 -0
  47. package/dist/dashboard/html.js +1470 -1230
  48. package/dist/dashboard/html.js.map +1 -1
  49. package/dist/dashboard/server.js +166 -41
  50. package/dist/dashboard/server.js.map +1 -1
  51. package/dist/index.js +210 -14
  52. package/dist/index.js.map +1 -1
  53. package/dist/tools/critterTools.js +4 -0
  54. package/dist/tools/critterTools.js.map +1 -1
  55. package/dist/tools/forecastingTools.d.ts +11 -0
  56. package/dist/tools/forecastingTools.js +616 -0
  57. package/dist/tools/forecastingTools.js.map +1 -0
  58. package/dist/tools/localDashboardTools.d.ts +8 -0
  59. package/dist/tools/localDashboardTools.js +332 -0
  60. package/dist/tools/localDashboardTools.js.map +1 -0
  61. package/dist/tools/metaTools.js +170 -1
  62. package/dist/tools/metaTools.js.map +1 -1
  63. package/dist/tools/openclawTools.d.ts +11 -0
  64. package/dist/tools/openclawTools.js +1017 -0
  65. package/dist/tools/openclawTools.js.map +1 -0
  66. package/dist/tools/overstoryTools.d.ts +14 -0
  67. package/dist/tools/overstoryTools.js +426 -0
  68. package/dist/tools/overstoryTools.js.map +1 -0
  69. package/dist/tools/progressiveDiscoveryTools.js +50 -115
  70. package/dist/tools/progressiveDiscoveryTools.js.map +1 -1
  71. package/dist/tools/selfEvalTools.js +8 -1
  72. package/dist/tools/selfEvalTools.js.map +1 -1
  73. package/dist/tools/sessionMemoryTools.js +14 -2
  74. package/dist/tools/sessionMemoryTools.js.map +1 -1
  75. package/dist/tools/toolRegistry.d.ts +1 -15
  76. package/dist/tools/toolRegistry.js +243 -228
  77. package/dist/tools/toolRegistry.js.map +1 -1
  78. package/dist/tools/visualQaTools.d.ts +2 -0
  79. package/dist/tools/visualQaTools.js +1088 -0
  80. package/dist/tools/visualQaTools.js.map +1 -0
  81. package/dist/tools/webmcpTools.d.ts +16 -0
  82. package/dist/tools/webmcpTools.js +703 -0
  83. package/dist/tools/webmcpTools.js.map +1 -0
  84. package/dist/toolsetRegistry.js +6 -2
  85. package/dist/toolsetRegistry.js.map +1 -1
  86. package/package.json +2 -2
@@ -2073,14 +2073,14 @@ const REGISTRY_ENTRIES = [
2073
2073
  {
2074
2074
  name: "ingest_dive_screenshots",
2075
2075
  category: "ui_ux_dive",
2076
- tags: ["ui", "screenshot", "ingest", "import", "bulk", "png", "jpg", "gallery", "dive", "disk", "file"],
2076
+ tags: ["ui", "screenshot", "ingest", "import", "bulk", "disk", "gallery", "dive", "png", "jpg"],
2077
2077
  quickRef: {
2078
- nextAction: "Screenshots ingested into session DB. View them in the dashboard gallery or reference in dive_changelog entries.",
2079
- nextTools: ["dive_changelog", "get_dive_report", "open_dive_dashboard"],
2078
+ nextAction: "Screenshots ingested into dive session. View them in the dashboard or use dive_screenshot to capture new ones.",
2079
+ nextTools: ["dive_screenshot", "tag_ui_bug", "end_component_flow", "get_dive_tree"],
2080
2080
  methodology: "agentic_vision",
2081
- tip: "Use after external Playwright MCP captures screenshots to disk. Scans directory recursively, base64-encodes, and inserts into the session DB for dashboard display.",
2081
+ tip: "Scans a directory for PNG/JPG files and bulk-imports them into a dive session's screenshot gallery. Use after external Playwright captures.",
2082
2082
  },
2083
- phase: "utility",
2083
+ phase: "test",
2084
2084
  },
2085
2085
  // ═══════════════════════════════════════════
2086
2086
  // UI/UX DIVE V2 — Deep interaction testing,
@@ -2266,7 +2266,7 @@ const REGISTRY_ENTRIES = [
2266
2266
  {
2267
2267
  name: "register_skill",
2268
2268
  category: "skill_update",
2269
- tags: ["skill", "rule", "register", "source", "hash", "frontmatter", "provenance", "memory", "agents-md", "cursor", "windsurf", "update", "reexamine", "related_"],
2269
+ tags: ["skill", "rule", "register", "source", "hash", "frontmatter", "provenance", "memory", "agents-md", "cursor", "windsurf", "update"],
2270
2270
  quickRef: {
2271
2271
  nextAction: "Skill registered. Use check_skill_freshness periodically to detect when source files change.",
2272
2272
  nextTools: ["check_skill_freshness", "list_skills"],
@@ -2312,28 +2312,6 @@ const REGISTRY_ENTRIES = [
2312
2312
  phase: "utility",
2313
2313
  },
2314
2314
  // ═══════════════════════════════════════════
2315
- // RE-EXAMINE 11/10 — Fresh-eyes quality pass
2316
- // Modular rules: reexamine_process → a11y,
2317
- // resilience, polish, keyboard, performance
2318
- // Cross-ref via related_ frontmatter hops
2319
- // ═══════════════════════════════════════════
2320
- // NOTE: These are not MCP tools — they are rule
2321
- // files in .cursor/rules/ and .windsurf/rules/.
2322
- // The skill_update tools above (register_skill,
2323
- // check_skill_freshness) track their freshness.
2324
- // The related_ field in each rule's frontmatter
2325
- // enables one-hop and two-hop cross-referencing:
2326
- //
2327
- // reexamine_process
2328
- // └─ related_: [a11y, resilience, polish, keyboard, performance]
2329
- // └─ reexamine_a11y.related_: [keyboard, polish, process]
2330
- // └─ reexamine_resilience.related_: [performance, process, polish]
2331
- // └─ reexamine_polish.related_: [a11y, performance, process]
2332
- // └─ reexamine_keyboard.related_: [a11y, process]
2333
- // └─ reexamine_performance.related_: [resilience, polish, process]
2334
- //
2335
- // Two-hop example: process → a11y → keyboard (discovers keyboard via a11y)
2336
- // ═══════════════════════════════════════════
2337
2315
  // MCP BRIDGE — Connect external MCP servers
2338
2316
  // ═══════════════════════════════════════════
2339
2317
  {
@@ -2432,43 +2410,178 @@ const REGISTRY_ENTRIES = [
2432
2410
  phase: "implement",
2433
2411
  },
2434
2412
  // ═══════════════════════════════════════════
2435
- // PR REPORT — Visual PR creation from dives
2413
+ // QA ORCHESTRATION — Overstory multi-agent QA
2436
2414
  // ═══════════════════════════════════════════
2437
2415
  {
2438
- name: "generate_pr_report",
2439
- category: "pr_report",
2440
- tags: ["pr", "pull-request", "report", "markdown", "visual", "screenshot", "before-after", "timeline", "changelog", "dive", "github", "review", "evidence"],
2416
+ name: "overstory_fleet_status",
2417
+ category: "qa_orchestration",
2418
+ tags: ["overstory", "agent", "fleet", "status", "health", "multi-agent", "orchestration", "qa", "dogfood", "worktree"],
2441
2419
  quickRef: {
2442
- nextAction: "PR report generated. Use the markdown with `gh pr create --body-file` or call create_visual_pr for end-to-end PR creation.",
2443
- nextTools: ["create_visual_pr", "export_pr_screenshots", "review_pr_checklist"],
2444
- methodology: "agentic_vision",
2445
- tip: "Pass asset_dir to export screenshots as PNGs that can be committed alongside the PR.",
2420
+ nextAction: "Review agent states. If agents are idle, run dogfood:overstory to start a QA session.",
2421
+ nextTools: ["overstory_qa_summary", "overstory_mail_log", "run_visual_qa_suite"],
2422
+ methodology: "ai_flywheel",
2423
+ tip: "Reads .overstory/agent-manifest.json and overstory.db. Shows configured agents, capabilities, gate policy, and live agent health.",
2446
2424
  },
2447
- phase: "ship",
2425
+ phase: "utility",
2448
2426
  },
2449
2427
  {
2450
- name: "export_pr_screenshots",
2451
- category: "pr_report",
2452
- tags: ["pr", "screenshot", "export", "png", "before-after", "visual", "evidence", "assets", "commit", "dive", "changelog", "fix"],
2428
+ name: "overstory_qa_summary",
2429
+ category: "qa_orchestration",
2430
+ tags: ["overstory", "qa", "gate", "summary", "stability", "grade", "ssim", "triage", "p0", "p1", "dogfood"],
2453
2431
  quickRef: {
2454
- nextAction: "Screenshots exported. Stage and commit them, then use generate_pr_report or create_visual_pr to reference them in the PR body.",
2455
- nextTools: ["generate_pr_report", "create_visual_pr"],
2456
- methodology: "agentic_vision",
2457
- tip: "Naming convention: {index}-{type}-before.png / after.png. Commit these with your branch.",
2432
+ nextAction: "If gate fails, check failing routes and fix p0/p1 issues. If gate passes, proceed to merge.",
2433
+ nextTools: ["overstory_mail_log", "overstory_fleet_status", "run_visual_qa_suite", "burst_capture"],
2434
+ methodology: "ai_flywheel",
2435
+ tip: "Aggregates SSIM stability grades from visual_qa_runs and Gemini QA triage from Overstory mail. Returns gate pass/fail verdict.",
2458
2436
  },
2459
- phase: "ship",
2437
+ phase: "verify",
2460
2438
  },
2461
2439
  {
2462
- name: "create_visual_pr",
2463
- category: "pr_report",
2464
- tags: ["pr", "pull-request", "create", "github", "gh", "visual", "screenshot", "end-to-end", "push", "merge", "review", "dive", "timeline", "evidence"],
2440
+ name: "overstory_mail_log",
2441
+ category: "qa_orchestration",
2442
+ tags: ["overstory", "mail", "log", "message", "route", "triage", "dispatch", "agent", "coordination"],
2465
2443
  quickRef: {
2466
- nextAction: "PR created! Share the URL with reviewers. The PR body contains visual evidence and dashboard links for interactive browsing.",
2467
- nextTools: ["review_pr_checklist", "enforce_merge_gate"],
2468
- methodology: "agentic_vision",
2469
- tip: "Set draft:true for WIP PRs. Combines export_pr_screenshots + generate_pr_report + gh pr create in one call.",
2444
+ nextAction: "Review messages to understand QA session state. Filter by type or agent for focused view.",
2445
+ nextTools: ["overstory_qa_summary", "overstory_fleet_status", "overstory_merge_queue"],
2446
+ methodology: "ai_flywheel",
2447
+ tip: "Supports type_filter (result/dispatch/worker_done/escalation) and agent_filter. Shows structured mail payloads from the QA agent fleet.",
2470
2448
  },
2471
- phase: "ship",
2449
+ phase: "utility",
2450
+ },
2451
+ {
2452
+ name: "overstory_merge_queue",
2453
+ category: "qa_orchestration",
2454
+ tags: ["overstory", "merge", "queue", "branch", "conflict", "gate", "builder", "qa", "resolution"],
2455
+ quickRef: {
2456
+ nextAction: "If branches are blocked, check QA gate failures. If pending, trigger merge with overstory merge --all.",
2457
+ nextTools: ["overstory_qa_summary", "overstory_mail_log", "overstory_fleet_status"],
2458
+ methodology: "ai_flywheel",
2459
+ tip: "Shows FIFO merge queue with conflict resolution tiers. Use include_completed:true to see merge history.",
2460
+ },
2461
+ phase: "utility",
2462
+ },
2463
+ // ═══════════════════════════════════════════
2464
+ // VISUAL QA — Deep interaction captures & stability
2465
+ // ═══════════════════════════════════════════
2466
+ {
2467
+ name: "burst_capture",
2468
+ category: "visual_qa",
2469
+ tags: ["burst", "capture", "screenshot", "rapid", "interaction", "deep", "animation", "transition", "hover", "click", "popup", "drawer", "modal", "streaming", "agent", "component"],
2470
+ quickRef: {
2471
+ nextAction: "Burst captured. Run compute_web_stability to measure SSIM across frames, or generate_grid_collage for visual comparison.",
2472
+ nextTools: ["compute_web_stability", "generate_grid_collage", "run_visual_qa_suite"],
2473
+ methodology: "ai_flywheel",
2474
+ tip: "Use burst capture for deep interaction testing — popups, hover states, streaming responses, drawer opens, thread switches. Captures rapid frame sequences during UI transitions.",
2475
+ },
2476
+ phase: "test",
2477
+ complexity: "medium",
2478
+ },
2479
+ {
2480
+ name: "generate_grid_collage",
2481
+ category: "visual_qa",
2482
+ tags: ["grid", "collage", "visual", "comparison", "before-after", "screenshot", "composite", "overview", "review"],
2483
+ quickRef: {
2484
+ nextAction: "Collage generated. Review visually for inconsistencies. Use run_visual_qa_suite for automated scoring.",
2485
+ nextTools: ["run_visual_qa_suite", "compute_web_stability", "analyze_screenshot"],
2486
+ methodology: "ai_flywheel",
2487
+ tip: "Generates a composite grid image from multiple screenshots — useful for comparing dark/light, desktop/mobile, or before/after states side-by-side.",
2488
+ },
2489
+ phase: "verify",
2490
+ complexity: "low",
2491
+ },
2492
+ {
2493
+ name: "compute_web_stability",
2494
+ category: "visual_qa",
2495
+ tags: ["stability", "ssim", "structural", "similarity", "flicker", "jank", "layout-shift", "regression", "diff", "frame", "comparison"],
2496
+ quickRef: {
2497
+ nextAction: "Stability computed. If SSIM < 0.95, investigate layout shifts or animation jank. Log issues with tag_ui_bug.",
2498
+ nextTools: ["burst_capture", "tag_ui_bug", "log_gap", "run_visual_qa_suite"],
2499
+ methodology: "ai_flywheel",
2500
+ tip: "Computes block-based SSIM between frame pairs to detect visual instability — layout shifts, flicker, and rendering regressions.",
2501
+ },
2502
+ phase: "test",
2503
+ complexity: "medium",
2504
+ },
2505
+ {
2506
+ name: "run_visual_qa_suite",
2507
+ category: "visual_qa",
2508
+ tags: ["visual", "qa", "suite", "end-to-end", "automated", "gemini", "scoring", "jony-ive", "design", "review", "deep-interaction", "scenario", "agent", "streaming", "popup", "drawer"],
2509
+ quickRef: {
2510
+ nextAction: "QA suite complete. Fix P0/P1 issues first (highest score impact), then P2/P3. Re-run to verify improvements.",
2511
+ nextTools: ["burst_capture", "log_gap", "record_learning", "save_session_note"],
2512
+ methodology: "ai_flywheel",
2513
+ tip: "End-to-end visual QA: captures all routes + deep interactions (agent queries, streaming, popups, drawers) → Gemini scores against Jony Ive design principles → auto-triages by P-level. Formula: 100 - P1×6 - P2×2 - P3×1.",
2514
+ },
2515
+ phase: "verify",
2516
+ complexity: "high",
2517
+ },
2518
+ // ═══════════════════════════════════════════
2519
+ // LOCAL DASHBOARD — Daily brief + narrative + ops
2520
+ // ═══════════════════════════════════════════
2521
+ {
2522
+ name: "sync_daily_brief",
2523
+ category: "local_dashboard",
2524
+ tags: ["sync", "daily", "brief", "convex", "sqlite", "pull", "refresh", "narrative", "dashboard", "data"],
2525
+ quickRef: {
2526
+ nextAction: "Data synced. Call get_daily_brief_summary to read the brief, or open_local_dashboard for visual review.",
2527
+ nextTools: ["get_daily_brief_summary", "get_narrative_status", "open_local_dashboard"],
2528
+ methodology: "ai_flywheel",
2529
+ tip: "Pulls latest dashboard snapshot + narrative threads from Convex into local SQLite. Requires CONVEX_SITE_URL and MCP_SECRET env vars.",
2530
+ },
2531
+ phase: "research",
2532
+ complexity: "medium",
2533
+ },
2534
+ {
2535
+ name: "get_daily_brief_summary",
2536
+ category: "local_dashboard",
2537
+ tags: ["daily", "brief", "summary", "metrics", "features", "sources", "dashboard", "offline", "local"],
2538
+ quickRef: {
2539
+ nextAction: "Review the brief. Check key signals and source quality. Use get_narrative_status for thread analysis.",
2540
+ nextTools: ["get_narrative_status", "get_ops_dashboard", "open_local_dashboard"],
2541
+ methodology: "ai_flywheel",
2542
+ tip: "Reads from local SQLite — zero network needed. Returns dashboard metrics, features, and source summary from the last sync.",
2543
+ },
2544
+ phase: "research",
2545
+ complexity: "low",
2546
+ },
2547
+ {
2548
+ name: "get_narrative_status",
2549
+ category: "local_dashboard",
2550
+ tags: ["narrative", "thread", "status", "phase", "emerging", "escalating", "climax", "resolution", "dormant", "story"],
2551
+ quickRef: {
2552
+ nextAction: "Review thread distribution. Focus on escalating/climax threads for timely action. Use get_ops_dashboard for pipeline health.",
2553
+ nextTools: ["get_daily_brief_summary", "get_ops_dashboard", "open_local_dashboard"],
2554
+ methodology: "ai_flywheel",
2555
+ tip: "Returns narrative threads grouped by phase with event counts. Filter by phase to focus on specific lifecycle stages.",
2556
+ },
2557
+ phase: "research",
2558
+ complexity: "low",
2559
+ },
2560
+ {
2561
+ name: "get_ops_dashboard",
2562
+ category: "local_dashboard",
2563
+ tags: ["ops", "operational", "dashboard", "sync", "tool-call", "frequency", "verification", "health", "monitoring"],
2564
+ quickRef: {
2565
+ nextAction: "Review ops health. If tool error rates are high, investigate root causes. If sync is stale, run sync_daily_brief.",
2566
+ nextTools: ["sync_daily_brief", "get_daily_brief_summary", "open_local_dashboard"],
2567
+ methodology: "ai_flywheel",
2568
+ tip: "Returns last sync info, tool call frequency (24h), active verification cycles, data counts, and privacy mode status.",
2569
+ },
2570
+ phase: "utility",
2571
+ complexity: "low",
2572
+ },
2573
+ {
2574
+ name: "open_local_dashboard",
2575
+ category: "local_dashboard",
2576
+ tags: ["open", "dashboard", "browser", "server", "html", "visual", "brief", "narrative", "ops", "local", "ui"],
2577
+ quickRef: {
2578
+ nextAction: "Dashboard is running. Open the URL in a browser to see Brief metrics, Narrative thread lanes, and Ops status.",
2579
+ nextTools: ["sync_daily_brief", "get_daily_brief_summary", "get_narrative_status"],
2580
+ methodology: "ai_flywheel",
2581
+ tip: "Starts the local dashboard server on port 6275 if not already running. Auto-refreshes every 30s from local SQLite.",
2582
+ },
2583
+ phase: "utility",
2584
+ complexity: "low",
2472
2585
  },
2473
2586
  ];
2474
2587
  // ── Exported lookup structures ───────────────────────────────────────────
@@ -2476,110 +2589,6 @@ const REGISTRY_ENTRIES = [
2476
2589
  export const TOOL_REGISTRY = new Map(REGISTRY_ENTRIES.map((e) => [e.name, e]));
2477
2590
  /** All registry entries as array */
2478
2591
  export const ALL_REGISTRY_ENTRIES = REGISTRY_ENTRIES;
2479
- // ── Auto-derive relatedTools for entries that don't have manual overrides ──
2480
- // Uses 3 signals: same-category siblings, DOMAIN_CLUSTERS neighbors, tag overlap.
2481
- // Must run after REGISTRY_ENTRIES is fully built. Forward-reference to DOMAIN_CLUSTERS
2482
- // is fine because this runs at module load time (DOMAIN_CLUSTERS is defined below).
2483
- /** Late-init: populated by _populateRelatedTools() at bottom of file */
2484
- let _domainClusters = null;
2485
- export function _setDomainClustersRef(clusters) {
2486
- _domainClusters = clusters;
2487
- }
2488
- function computeRelatedTools(entry) {
2489
- // If manually specified, use that
2490
- if (entry.quickRef.relatedTools && entry.quickRef.relatedTools.length > 0) {
2491
- return entry.quickRef.relatedTools;
2492
- }
2493
- const related = new Set();
2494
- const nextToolsSet = new Set(entry.quickRef.nextTools);
2495
- // 1. Same-category siblings (excluding self and nextTools), up to 3
2496
- let sibCount = 0;
2497
- for (const e of REGISTRY_ENTRIES) {
2498
- if (sibCount >= 3)
2499
- break;
2500
- if (e.category === entry.category && e.name !== entry.name && !nextToolsSet.has(e.name)) {
2501
- related.add(e.name);
2502
- sibCount++;
2503
- }
2504
- }
2505
- // 2. DOMAIN_CLUSTERS neighbors: tools from related categories, up to 2
2506
- if (_domainClusters) {
2507
- let clusterCount = 0;
2508
- for (const cluster of Object.values(_domainClusters)) {
2509
- if (clusterCount >= 2)
2510
- break;
2511
- if (cluster.includes(entry.category)) {
2512
- for (const neighborCat of cluster) {
2513
- if (clusterCount >= 2)
2514
- break;
2515
- if (neighborCat === entry.category)
2516
- continue;
2517
- for (const e of REGISTRY_ENTRIES) {
2518
- if (e.category === neighborCat && !nextToolsSet.has(e.name) && !related.has(e.name)) {
2519
- related.add(e.name);
2520
- clusterCount++;
2521
- break; // one tool per neighbor category
2522
- }
2523
- }
2524
- }
2525
- }
2526
- }
2527
- }
2528
- // 3. Tag overlap: tools sharing 2+ tags (not in nextTools or already related), up to 2
2529
- const myTags = new Set(entry.tags);
2530
- let tagCount = 0;
2531
- for (const other of REGISTRY_ENTRIES) {
2532
- if (tagCount >= 2)
2533
- break;
2534
- if (other.name === entry.name || nextToolsSet.has(other.name) || related.has(other.name))
2535
- continue;
2536
- let overlap = 0;
2537
- for (const t of other.tags) {
2538
- if (myTags.has(t))
2539
- overlap++;
2540
- if (overlap >= 2)
2541
- break;
2542
- }
2543
- if (overlap >= 2) {
2544
- related.add(other.name);
2545
- tagCount++;
2546
- }
2547
- }
2548
- // 4. Fallback: if still empty (small category, all siblings in nextTools), accept 1-tag overlap
2549
- if (related.size === 0) {
2550
- for (const other of REGISTRY_ENTRIES) {
2551
- if (related.size >= 3)
2552
- break;
2553
- if (other.name === entry.name || nextToolsSet.has(other.name))
2554
- continue;
2555
- const hasTagOverlap = other.tags.some((t) => myTags.has(t));
2556
- if (hasTagOverlap) {
2557
- related.add(other.name);
2558
- }
2559
- }
2560
- }
2561
- // 5. Last resort: if STILL empty, pick tools from the same phase (workflow adjacency)
2562
- if (related.size === 0) {
2563
- for (const other of REGISTRY_ENTRIES) {
2564
- if (related.size >= 3)
2565
- break;
2566
- if (other.name === entry.name || nextToolsSet.has(other.name))
2567
- continue;
2568
- if (other.phase === entry.phase) {
2569
- related.add(other.name);
2570
- }
2571
- }
2572
- }
2573
- return [...related].slice(0, 7); // hard cap at 7
2574
- }
2575
- /** Populate relatedTools for all registry entries. Called once at module load after DOMAIN_CLUSTERS exists. */
2576
- export function _populateRelatedTools() {
2577
- for (const entry of REGISTRY_ENTRIES) {
2578
- if (!entry.quickRef.relatedTools || entry.quickRef.relatedTools.length === 0) {
2579
- entry.quickRef.relatedTools = computeRelatedTools(entry);
2580
- }
2581
- }
2582
- }
2583
2592
  /** Get quick ref for a tool, with fallback for unregistered tools */
2584
2593
  export function getQuickRef(toolName) {
2585
2594
  return TOOL_REGISTRY.get(toolName)?.quickRef ?? null;
@@ -2631,7 +2640,9 @@ const CATEGORY_COMPLEXITY = {
2631
2640
  email: "medium",
2632
2641
  rss: "low",
2633
2642
  architect: "low",
2634
- pr_report: "medium",
2643
+ qa_orchestration: "low",
2644
+ visual_qa: "medium",
2645
+ local_dashboard: "low",
2635
2646
  };
2636
2647
  /** Per-tool complexity overrides (when category default is wrong) */
2637
2648
  const TOOL_COMPLEXITY_OVERRIDES = {
@@ -2910,9 +2921,6 @@ const DOMAIN_CLUSTERS = {
2910
2921
  writing: ["research_writing", "documentation"],
2911
2922
  measurement: ["eval", "benchmark", "self_eval"],
2912
2923
  };
2913
- // Wire up domain clusters and auto-populate relatedTools for all registry entries
2914
- _setDomainClustersRef(DOMAIN_CLUSTERS);
2915
- _populateRelatedTools();
2916
2924
  // ── Execution trace edges — co-occurrence mining from tool_call_log ────────
2917
2925
  // Based on Agent-as-a-Graph (arxiv:2511.18194): execution trace edges
2918
2926
  // mine sequential co-occurrence patterns to discover implicit tool relationships.
@@ -2953,36 +2961,17 @@ export function _setDbAccessor(accessor) {
2953
2961
  *
2954
2962
  * Approach: for each session, pull the ordered tool sequence, then count
2955
2963
  * pairs within a sliding window of 5 calls. O(n) per session, no self-join.
2956
- *
2957
- * When transitive=true, infer A→C via A→B + B→C (two-hop co-occurrence).
2958
- * Extended cap of 15 edges/tool (vs 10 for direct-only).
2959
2964
  */
2960
- let _transitiveCooccurrenceCache = null;
2961
- let _transitiveCooccurrenceCacheTime = 0;
2962
- function getCooccurrenceEdges(options) {
2963
- const transitive = options?.transitive ?? false;
2965
+ function getCooccurrenceEdges() {
2964
2966
  const now = Date.now();
2965
- // Check appropriate cache
2966
- if (transitive) {
2967
- if (_transitiveCooccurrenceCache && now - _transitiveCooccurrenceCacheTime < COOCCURRENCE_TTL_MS) {
2968
- return _transitiveCooccurrenceCache;
2969
- }
2970
- }
2971
- else {
2972
- if (_cooccurrenceCache && now - _cooccurrenceCacheTime < COOCCURRENCE_TTL_MS) {
2973
- return _cooccurrenceCache;
2974
- }
2967
+ if (_cooccurrenceCache && now - _cooccurrenceCacheTime < COOCCURRENCE_TTL_MS) {
2968
+ return _cooccurrenceCache;
2975
2969
  }
2976
- // Build direct edges first (always needed)
2977
- const directEdges = new Map();
2970
+ const edges = new Map();
2978
2971
  if (!_dbAccessor) {
2979
- _cooccurrenceCache = directEdges;
2972
+ _cooccurrenceCache = edges;
2980
2973
  _cooccurrenceCacheTime = now;
2981
- if (transitive) {
2982
- _transitiveCooccurrenceCache = directEdges;
2983
- _transitiveCooccurrenceCacheTime = now;
2984
- }
2985
- return directEdges;
2974
+ return edges;
2986
2975
  }
2987
2976
  try {
2988
2977
  const db = _dbAccessor();
@@ -3023,51 +3012,24 @@ function getCooccurrenceEdges(options) {
3023
3012
  .sort((a, b) => b[1] - a[1]);
3024
3013
  for (const [key] of sorted) {
3025
3014
  const [toolA, toolB] = key.split("\0");
3026
- const list = directEdges.get(toolA) ?? [];
3015
+ const list = edges.get(toolA) ?? [];
3027
3016
  if (list.length < 10) {
3028
3017
  list.push(toolB);
3029
- directEdges.set(toolA, list);
3018
+ edges.set(toolA, list);
3030
3019
  }
3031
3020
  }
3032
3021
  }
3033
3022
  catch {
3034
3023
  // No DB or table not yet created — return empty (graceful degradation)
3035
3024
  }
3036
- // Cache direct edges
3037
- _cooccurrenceCache = directEdges;
3025
+ _cooccurrenceCache = edges;
3038
3026
  _cooccurrenceCacheTime = now;
3039
- if (!transitive)
3040
- return directEdges;
3041
- // Transitive inference: A→B and B→C ⟹ A→C (two-hop)
3042
- const transitiveEdges = new Map([...directEdges.entries()].map(([k, v]) => [k, [...v]]));
3043
- for (const [toolA, directNeighbors] of directEdges) {
3044
- const existingSet = new Set(directNeighbors);
3045
- existingSet.add(toolA); // avoid self-loops
3046
- for (const toolB of directNeighbors) {
3047
- const bNeighbors = directEdges.get(toolB);
3048
- if (!bNeighbors)
3049
- continue;
3050
- const list = transitiveEdges.get(toolA);
3051
- for (const toolC of bNeighbors) {
3052
- if (existingSet.has(toolC))
3053
- continue;
3054
- if (list.length >= 15)
3055
- break; // extended cap for transitive
3056
- list.push(toolC);
3057
- existingSet.add(toolC);
3058
- }
3059
- }
3060
- }
3061
- _transitiveCooccurrenceCache = transitiveEdges;
3062
- _transitiveCooccurrenceCacheTime = now;
3063
- return transitiveEdges;
3027
+ return edges;
3064
3028
  }
3065
3029
  /** Reset co-occurrence cache — for testing only. */
3066
3030
  export function _resetCooccurrenceCache() {
3067
3031
  _cooccurrenceCache = null;
3068
3032
  _cooccurrenceCacheTime = 0;
3069
- _transitiveCooccurrenceCache = null;
3070
- _transitiveCooccurrenceCacheTime = 0;
3071
3033
  }
3072
3034
  /** Inject co-occurrence edges directly — for testing only. */
3073
3035
  export function _setCooccurrenceForTesting(edges) {
@@ -3459,8 +3421,7 @@ export function hybridSearch(query, tools, options) {
3459
3421
  });
3460
3422
  }
3461
3423
  results.sort((a, b) => b.score - a.score);
3462
- const offset = options?.offset ?? 0;
3463
- return results.slice(offset, offset + limit);
3424
+ return results.slice(0, limit);
3464
3425
  }
3465
3426
  /** Available search modes for discover_tools */
3466
3427
  export const SEARCH_MODES = ["hybrid", "fuzzy", "regex", "prefix", "semantic", "exact", "dense", "embedding"];
@@ -3482,6 +3443,7 @@ export const WORKFLOW_CHAINS = {
3482
3443
  { tool: "run_mandatory_flywheel", action: "6-step final verification" },
3483
3444
  { tool: "record_learning", action: "Capture what you learned" },
3484
3445
  { tool: "promote_to_eval", action: "Feed into eval batch" },
3446
+ { tool: "save_session_note", action: "Save traceability note — cite original request, summarize what was delivered" },
3485
3447
  ],
3486
3448
  },
3487
3449
  fix_bug: {
@@ -3494,6 +3456,7 @@ export const WORKFLOW_CHAINS = {
3494
3456
  { tool: "log_test_result", action: "Record regression test" },
3495
3457
  { tool: "run_mandatory_flywheel", action: "6-step verification" },
3496
3458
  { tool: "record_learning", action: "Record the gotcha/pattern" },
3459
+ { tool: "save_session_note", action: "Save traceability note — cite original request, record root cause and fix" },
3497
3460
  ],
3498
3461
  },
3499
3462
  ui_change: {
@@ -3507,6 +3470,7 @@ export const WORKFLOW_CHAINS = {
3507
3470
  { tool: "run_quality_gate", action: "Run ui_ux_qa gate" },
3508
3471
  { tool: "run_mandatory_flywheel", action: "Final verification" },
3509
3472
  { tool: "record_learning", action: "Record UI patterns" },
3473
+ { tool: "save_session_note", action: "Save traceability note — cite original request, record visual evidence path" },
3510
3474
  ],
3511
3475
  },
3512
3476
  parallel_project: {
@@ -3824,16 +3788,67 @@ export const WORKFLOW_CHAINS = {
3824
3788
  { tool: "save_session_note", action: "Log sent emails so you have an audit trail that survives compaction" },
3825
3789
  ],
3826
3790
  },
3827
- pr_creation: {
3828
- name: "Visual PR Creation",
3829
- description: "Create a PR with visual evidence from a UI Dive session — screenshots, timeline, bug fixes, past session links",
3791
+ webmcp_discovery: {
3792
+ name: "WebMCP Origin Discovery",
3793
+ description: "Connect to a WebMCP-enabled origin, discover its tools, and invoke them from the agent",
3794
+ steps: [
3795
+ { tool: "connect_webmcp_origin", action: "Connect to the target origin URL and establish a WebMCP session" },
3796
+ { tool: "list_webmcp_tools", action: "List all tools exposed by the origin with schemas and annotations" },
3797
+ { tool: "call_webmcp_tool", action: "Invoke a specific tool on the remote origin with arguments" },
3798
+ { tool: "disconnect_webmcp_origin", action: "Clean up the WebMCP session when done" },
3799
+ ],
3800
+ },
3801
+ batch_autopilot: {
3802
+ name: "Batch Autopilot Run",
3803
+ description: "Set up an operator profile and run a batch autopilot session for autonomous agent tasks",
3804
+ steps: [
3805
+ { tool: "setup_operator_profile", action: "Create or update USER.md and operator profile for autopilot context" },
3806
+ { tool: "get_autopilot_status", action: "Check current autopilot readiness, profile completeness, and last run status" },
3807
+ { tool: "trigger_batch_run", action: "Start a batch autopilot run using the operator profile as context" },
3808
+ { tool: "get_batch_run_history", action: "Review history of past batch runs, outcomes, and timing" },
3809
+ { tool: "sync_operator_profile", action: "Sync operator profile state from disk after manual edits" },
3810
+ ],
3811
+ },
3812
+ daily_review: {
3813
+ name: "Daily Brief Review",
3814
+ description: "Pull the latest daily brief, review narrative threads, check ops dashboard, and sync to local storage",
3815
+ steps: [
3816
+ { tool: "sync_daily_brief", action: "Pull today's brief and narrative from Convex into local SQLite" },
3817
+ { tool: "get_daily_brief_summary", action: "Get the full brief summary with key signals and insights" },
3818
+ { tool: "get_narrative_status", action: "Check narrative thread status — dominant story, under-reported angle, evidence scores" },
3819
+ { tool: "get_ops_dashboard", action: "Review pipeline health: posting status, tool usage, active workflows" },
3820
+ { tool: "open_local_dashboard", action: "Open the local HTML dashboard in the browser for visual review" },
3821
+ ],
3822
+ },
3823
+ deep_interaction: {
3824
+ name: "Deep Interaction Discovery & Capture",
3825
+ description: "Systematically discover, capture, and verify interactive UI behaviors — popups, drawers, streaming responses, hover states, agent conversations, thread management, keyboard shortcuts. Goes beyond static screenshot routes to test real user behavior flows.",
3826
+ steps: [
3827
+ { tool: "dive_auto_discover", action: "Auto-discover interactive components (buttons, drawers, modals, expandable rows) across all routes" },
3828
+ { tool: "start_ui_dive", action: "Start a structured UI dive session to track interaction coverage" },
3829
+ { tool: "burst_capture", action: "Rapid-fire capture during interaction transitions (open drawer, hover tooltip, type in agent panel)" },
3830
+ { tool: "dive_interaction_test", action: "Test specific interaction patterns: click→open→verify, type→submit→stream, hover→preview→dismiss" },
3831
+ { tool: "compute_web_stability", action: "Measure SSIM stability across interaction frames — detect layout shifts, flicker, animation jank" },
3832
+ { tool: "dive_record_test_step", action: "Record each interaction test step with expected vs actual behavior" },
3833
+ { tool: "run_visual_qa_suite", action: "Run full visual QA suite including deep interaction captures" },
3834
+ { tool: "tag_ui_bug", action: "Tag issues found during interaction testing (broken hover, drawer z-index, missing focus trap)" },
3835
+ { tool: "get_dive_report", action: "Generate interaction coverage report — which components were tested, which remain" },
3836
+ { tool: "record_learning", action: "Record interaction patterns, common failure modes, and selector strategies" },
3837
+ ],
3838
+ },
3839
+ gemini_qa: {
3840
+ name: "Gemini Vision QA Loop",
3841
+ description: "Automated UI/UX quality gate — capture screenshots (dark/light × desktop/mobile), send to Gemini Flash for Jony Ive product design review, fix issues, loop until 100/100",
3830
3842
  steps: [
3831
- { tool: "get_dive_report", action: "Review the dive findings and health score before creating PR" },
3832
- { tool: "export_pr_screenshots", action: "Export before/after screenshot pairs to a directory for committing" },
3833
- { tool: "generate_pr_report", action: "Generate rich markdown PR body with visual evidence, timeline, and past session links" },
3834
- { tool: "create_visual_pr", action: "End-to-end PR creation: exports assets, generates markdown, pushes branch, creates GitHub PR" },
3835
- { tool: "review_pr_checklist", action: "Validate the PR against the checklist (title, description, tests, verification)" },
3836
- { tool: "enforce_merge_gate", action: "Pre-merge validation — git state, quality gates, verification cycles" },
3843
+ { tool: "check_mcp_setup", action: "Verify Gemini API key (GOOGLE_AI_KEY) and vision domain are ready" },
3844
+ { tool: "start_verification_cycle", action: "Open a verification cycle titled 'Gemini QA Loop' to track progress" },
3845
+ { tool: "save_session_note", action: "Shell: `npx vite build` then `npx playwright test tests/e2e/full-ui-dogfood.spec.ts --project=chromium --workers=1` — capture 4-variant screenshots" },
3846
+ { tool: "save_session_note", action: "Shell: `npm run dogfood:publish` — copy screenshots to public/dogfood/ with variant metadata manifest" },
3847
+ { tool: "save_session_note", action: "Shell: `npx vite build && node scripts/ui/runDogfoodGeminiQa.mjs` — rebuild, launch preview, trigger Gemini QA" },
3848
+ { tool: "log_test_result", action: "Log QA score from public/dogfood/qa-results.json — formula: 100 - P1×6 - P2×2 - P3×1" },
3849
+ { tool: "save_session_note", action: "Fix P1 issues (6pts each) then P2 (2pts) then P3 (1pt) — root-cause each before fixing" },
3850
+ { tool: "get_overstory_qa_gate", action: "Check QA gate for per-route stability grades and issue counts" },
3851
+ { tool: "record_learning", action: "Record QA trajectory and Gemini finding patterns for regression tracking" },
3837
3852
  ],
3838
3853
  },
3839
3854
  };