nodebench-mcp 2.10.1 → 2.13.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (36) hide show
  1. package/NODEBENCH_AGENTS.md +809 -726
  2. package/README.md +443 -415
  3. package/STYLE_GUIDE.md +477 -477
  4. package/dist/__tests__/gaiaCapabilityMediaEval.test.js +153 -5
  5. package/dist/__tests__/gaiaCapabilityMediaEval.test.js.map +1 -1
  6. package/dist/__tests__/helpers/textLlm.d.ts +1 -1
  7. package/dist/__tests__/presetRealWorldBench.test.d.ts +1 -0
  8. package/dist/__tests__/presetRealWorldBench.test.js +839 -0
  9. package/dist/__tests__/presetRealWorldBench.test.js.map +1 -0
  10. package/dist/__tests__/tools.test.js +8 -5
  11. package/dist/__tests__/tools.test.js.map +1 -1
  12. package/dist/__tests__/toolsetGatingEval.test.js +72 -38
  13. package/dist/__tests__/toolsetGatingEval.test.js.map +1 -1
  14. package/dist/index.js +399 -328
  15. package/dist/index.js.map +1 -1
  16. package/dist/tools/agentBootstrapTools.js +258 -258
  17. package/dist/tools/boilerplateTools.js +144 -144
  18. package/dist/tools/cCompilerBenchmarkTools.js +33 -33
  19. package/dist/tools/documentationTools.js +59 -59
  20. package/dist/tools/flywheelTools.js +6 -6
  21. package/dist/tools/learningTools.js +26 -26
  22. package/dist/tools/localFileTools.d.ts +3 -0
  23. package/dist/tools/localFileTools.js +3335 -89
  24. package/dist/tools/localFileTools.js.map +1 -1
  25. package/dist/tools/reconTools.js +31 -31
  26. package/dist/tools/selfEvalTools.js +44 -44
  27. package/dist/tools/sessionMemoryTools.d.ts +15 -0
  28. package/dist/tools/sessionMemoryTools.js +348 -0
  29. package/dist/tools/sessionMemoryTools.js.map +1 -0
  30. package/dist/tools/toolRegistry.d.ts +4 -0
  31. package/dist/tools/toolRegistry.js +229 -0
  32. package/dist/tools/toolRegistry.js.map +1 -1
  33. package/dist/tools/verificationTools.js +41 -41
  34. package/dist/tools/visionTools.js +17 -17
  35. package/dist/tools/webTools.js +18 -18
  36. package/package.json +101 -101
package/dist/index.js CHANGED
@@ -41,10 +41,11 @@ import { researchWritingTools } from "./tools/researchWritingTools.js";
41
41
  import { flickerDetectionTools } from "./tools/flickerDetectionTools.js";
42
42
  import { figmaFlowTools } from "./tools/figmaFlowTools.js";
43
43
  import { createMetaTools } from "./tools/metaTools.js";
44
- import { localFileTools } from "./tools/localFileTools.js";
44
+ import { localFileTools, gaiaMediaSolvers } from "./tools/localFileTools.js";
45
45
  import { createProgressiveDiscoveryTools } from "./tools/progressiveDiscoveryTools.js";
46
46
  import { boilerplateTools } from "./tools/boilerplateTools.js";
47
47
  import { cCompilerBenchmarkTools } from "./tools/cCompilerBenchmarkTools.js";
48
+ import { sessionMemoryTools } from "./tools/sessionMemoryTools.js";
48
49
  import { getQuickRef } from "./tools/toolRegistry.js";
49
50
  // ── CLI argument parsing ──────────────────────────────────────────────
50
51
  const cliArgs = process.argv.slice(2);
@@ -72,23 +73,26 @@ const TOOLSET_MAP = {
72
73
  figma_flow: figmaFlowTools,
73
74
  boilerplate: boilerplateTools,
74
75
  benchmark: cCompilerBenchmarkTools,
76
+ session_memory: sessionMemoryTools,
77
+ gaia_solvers: gaiaMediaSolvers,
75
78
  };
76
79
  const PRESETS = {
77
- core: ["verification", "eval", "quality_gate", "learning", "flywheel", "recon", "bootstrap", "self_eval", "llm", "security", "platform", "research_writing", "flicker_detection", "figma_flow", "boilerplate", "benchmark"],
78
- lite: ["verification", "eval", "quality_gate", "learning", "recon", "security", "boilerplate"],
80
+ meta: [],
81
+ lite: ["verification", "eval", "quality_gate", "learning", "flywheel", "recon", "security", "boilerplate"],
82
+ core: ["verification", "eval", "quality_gate", "learning", "flywheel", "recon", "bootstrap", "self_eval", "llm", "security", "platform", "research_writing", "flicker_detection", "figma_flow", "boilerplate", "benchmark", "session_memory"],
79
83
  full: Object.keys(TOOLSET_MAP),
80
84
  };
81
85
  function parseToolsets() {
82
86
  if (cliArgs.includes("--help")) {
83
87
  const lines = [
84
- "nodebench-mcp v2.10.1 — Development Methodology MCP Server",
88
+ "nodebench-mcp v2.13.0 — Development Methodology MCP Server",
85
89
  "",
86
90
  "Usage: nodebench-mcp [options]",
87
91
  "",
88
92
  "Options:",
89
93
  " --toolsets <list> Comma-separated toolsets to enable (default: all)",
90
94
  " --exclude <list> Comma-separated toolsets to exclude",
91
- " --preset <name> Use a preset: core, lite, or full",
95
+ " --preset <name> Use a preset: meta, lite, core, or full",
92
96
  " --help Show this help and exit",
93
97
  "",
94
98
  "Available toolsets:",
@@ -160,6 +164,36 @@ for (const tool of allTools) {
160
164
  const SESSION_ID = genId("mcp");
161
165
  // Tools to skip auto-logging (avoid infinite recursion and noise)
162
166
  const SKIP_AUTO_LOG = new Set(["log_tool_call", "get_trajectory_analysis", "get_self_eval_report", "get_improvement_recommendations", "cleanup_stale_runs", "synthesize_recon_to_learnings"]);
167
+ // ── Lightweight hooks: auto-save + attention refresh reminders ─────────
168
+ const _hookState = {
169
+ totalCalls: 0,
170
+ consecutiveWebCalls: 0, // web_search, fetch_url without save_session_note
171
+ lastRefreshReminder: 0, // totalCalls at last reminder
172
+ };
173
+ const WEB_TOOL_NAMES = new Set(["web_search", "fetch_url"]);
174
+ const SAVE_TOOL_NAMES = new Set(["save_session_note", "record_learning"]);
175
+ const REFRESH_INTERVAL = 30; // remind after every 30 calls
176
+ function getHookHint(toolName) {
177
+ _hookState.totalCalls++;
178
+ // Track consecutive web calls
179
+ if (WEB_TOOL_NAMES.has(toolName)) {
180
+ _hookState.consecutiveWebCalls++;
181
+ }
182
+ else if (SAVE_TOOL_NAMES.has(toolName)) {
183
+ _hookState.consecutiveWebCalls = 0;
184
+ }
185
+ const hints = [];
186
+ // Auto-save reminder after 2+ consecutive web calls
187
+ if (_hookState.consecutiveWebCalls >= 2) {
188
+ hints.push("_hint: You've made " + _hookState.consecutiveWebCalls + " web calls without saving. Consider calling save_session_note to persist findings before context compaction.");
189
+ }
190
+ // Attention refresh reminder every 30 calls
191
+ if (_hookState.totalCalls - _hookState.lastRefreshReminder >= REFRESH_INTERVAL) {
192
+ hints.push("_hint: " + _hookState.totalCalls + " tool calls this session. Consider calling refresh_task_context to reload your bearings and prevent attention drift.");
193
+ _hookState.lastRefreshReminder = _hookState.totalCalls;
194
+ }
195
+ return hints.length > 0 ? hints.join(" | ") : null;
196
+ }
163
197
  // MCP Prompts — protocol-native agent instructions for onboarding
164
198
  const PROMPTS = [
165
199
  {
@@ -170,37 +204,37 @@ const PROMPTS = [
170
204
  role: "user",
171
205
  content: {
172
206
  type: "text",
173
- text: `You are connected to NodeBench MCP — tools that make you catch the bugs you'd normally ship.
174
-
175
- WHAT THIS DOES:
176
- In benchmarks across 9 real production prompts, agents with NodeBench MCP caught 13 issues (4 HIGH severity)
177
- that bare agents shipped to production. 26 blind spots prevented. Knowledge compounds — by task 9,
178
- the agent finds 2+ prior findings before writing a single line of code.
179
-
180
- HOW IT WORKS:
181
- Every task follows a pipeline: Research → Risk → Implement → Test (3 layers) → Eval → Gate → Learn → Ship.
182
- Each step produces a concrete artifact (an issue found, a regression guarded, a pattern banked) that
183
- compounds into future tasks.
184
-
185
- FIRST TIME? Run these 3 steps:
186
- 1. Call bootstrap_project to register your project (tech stack, architecture, conventions)
187
- 2. Call getMethodology("overview") to see all available methodologies
188
- 3. Call search_all_knowledge("your current task") before starting any work
189
-
190
- RETURNING? Your project context and all past learnings are persisted. Start with:
191
- 1. Call search_all_knowledge with your current task
192
- 2. Follow the methodology tools as you work — they'll guide you step by step
193
-
194
- KEY TOOLS:
195
- - search_all_knowledge — Search prior findings before starting (avoid repeating past mistakes)
196
- - run_mandatory_flywheel — 6-step minimum verification before declaring work done
197
- - getMethodology — Step-by-step guides for verification, eval, flywheel, recon
198
- - findTools — Discover tools by keyword or category
199
- - assess_risk — Assess risk before acting (HIGH = needs confirmation)
200
-
201
- PARALLEL AGENTS? If using Claude Code subagents or multiple terminals:
202
- - claim_agent_task / release_agent_task — Lock tasks to prevent duplicate work
203
- - get_parallel_status — See what all agents are doing
207
+ text: `You are connected to NodeBench MCP — tools that make you catch the bugs you'd normally ship.
208
+
209
+ WHAT THIS DOES:
210
+ In benchmarks across 9 real production prompts, agents with NodeBench MCP caught 13 issues (4 HIGH severity)
211
+ that bare agents shipped to production. 26 blind spots prevented. Knowledge compounds — by task 9,
212
+ the agent finds 2+ prior findings before writing a single line of code.
213
+
214
+ HOW IT WORKS:
215
+ Every task follows a pipeline: Research → Risk → Implement → Test (3 layers) → Eval → Gate → Learn → Ship.
216
+ Each step produces a concrete artifact (an issue found, a regression guarded, a pattern banked) that
217
+ compounds into future tasks.
218
+
219
+ FIRST TIME? Run these 3 steps:
220
+ 1. Call bootstrap_project to register your project (tech stack, architecture, conventions)
221
+ 2. Call getMethodology("overview") to see all available methodologies
222
+ 3. Call search_all_knowledge("your current task") before starting any work
223
+
224
+ RETURNING? Your project context and all past learnings are persisted. Start with:
225
+ 1. Call search_all_knowledge with your current task
226
+ 2. Follow the methodology tools as you work — they'll guide you step by step
227
+
228
+ KEY TOOLS:
229
+ - search_all_knowledge — Search prior findings before starting (avoid repeating past mistakes)
230
+ - run_mandatory_flywheel — 6-step minimum verification before declaring work done
231
+ - getMethodology — Step-by-step guides for verification, eval, flywheel, recon
232
+ - findTools — Discover tools by keyword or category
233
+ - assess_risk — Assess risk before acting (HIGH = needs confirmation)
234
+
235
+ PARALLEL AGENTS? If using Claude Code subagents or multiple terminals:
236
+ - claim_agent_task / release_agent_task — Lock tasks to prevent duplicate work
237
+ - get_parallel_status — See what all agents are doing
204
238
  - Use the "claude-code-parallel" prompt for step-by-step guidance`,
205
239
  },
206
240
  },
@@ -221,16 +255,16 @@ PARALLEL AGENTS? If using Claude Code subagents or multiple terminals:
221
255
  role: "user",
222
256
  content: {
223
257
  type: "text",
224
- text: `Help me set up NodeBench methodology tracking for project: ${args.projectName}
225
-
226
- Please gather and record the following using the bootstrap_project tool:
227
- 1. Tech stack (languages, frameworks, runtimes)
228
- 2. Key dependency versions
229
- 3. Architecture overview
230
- 4. Build/test commands
231
- 5. Known conventions or patterns
232
- 6. Repository structure highlights
233
-
258
+ text: `Help me set up NodeBench methodology tracking for project: ${args.projectName}
259
+
260
+ Please gather and record the following using the bootstrap_project tool:
261
+ 1. Tech stack (languages, frameworks, runtimes)
262
+ 2. Key dependency versions
263
+ 3. Architecture overview
264
+ 4. Build/test commands
265
+ 5. Known conventions or patterns
266
+ 6. Repository structure highlights
267
+
234
268
  After bootstrapping, run a reconnaissance session with run_recon to check for latest updates on the project's key frameworks and SDKs.`,
235
269
  },
236
270
  },
@@ -251,33 +285,33 @@ After bootstrapping, run a reconnaissance session with run_recon to check for la
251
285
  role: "user",
252
286
  content: {
253
287
  type: "text",
254
- text: `You just implemented UI changes to: ${args.componentName}
255
-
256
- Before declaring this work done, run the UI/UX QA checklist:
257
-
258
- 1. COMPONENT TESTS: Run \`npm run test:run\` — all component tests must pass
259
- 2. STORYBOOK: Run \`npm run storybook\` — verify the component renders in isolation
260
- 3. RESPONSIVE: Check at 375px, 768px, 1280px — layout must not break
261
- 4. ACCESSIBILITY: Tab through the UI, check aria-labels, run Storybook a11y panel
262
- 5. STATES: Verify loading, error, and empty states are handled
263
- 6. CONSOLE: Check browser devtools for errors/warnings
264
- 7. CAPTURE: Call capture_responsive_suite(url, label) to screenshot at 3 breakpoints
265
- 8. E2E: Run \`npm run test:e2e\` if relevant tests exist
266
- 9. LIGHTHOUSE: Run \`npm run perf:lighthouse\` for performance + accessibility scores
267
-
268
- After checking each item, record results:
269
- call get_gate_preset("ui_ux_qa") to see the 8 evaluation rules
270
- evaluate each rule against ${args.componentName}
271
- call run_quality_gate(gateName: "ui_ux_qa", rules: [{name, passed}, ...]) with your boolean results
272
- call record_learning for any UI gotchas discovered
273
-
274
- For the full step-by-step methodology, call getMethodology("ui_ux_qa").
275
-
276
- Commands available:
277
- npm run test:run — Vitest component tests
278
- npm run test:e2e — Playwright E2E tests
279
- npm run storybook — Storybook dev server (port 6006)
280
- npm run perf:lighthouse — Lighthouse audit
288
+ text: `You just implemented UI changes to: ${args.componentName}
289
+
290
+ Before declaring this work done, run the UI/UX QA checklist:
291
+
292
+ 1. COMPONENT TESTS: Run \`npm run test:run\` — all component tests must pass
293
+ 2. STORYBOOK: Run \`npm run storybook\` — verify the component renders in isolation
294
+ 3. RESPONSIVE: Check at 375px, 768px, 1280px — layout must not break
295
+ 4. ACCESSIBILITY: Tab through the UI, check aria-labels, run Storybook a11y panel
296
+ 5. STATES: Verify loading, error, and empty states are handled
297
+ 6. CONSOLE: Check browser devtools for errors/warnings
298
+ 7. CAPTURE: Call capture_responsive_suite(url, label) to screenshot at 3 breakpoints
299
+ 8. E2E: Run \`npm run test:e2e\` if relevant tests exist
300
+ 9. LIGHTHOUSE: Run \`npm run perf:lighthouse\` for performance + accessibility scores
301
+
302
+ After checking each item, record results:
303
+ call get_gate_preset("ui_ux_qa") to see the 8 evaluation rules
304
+ evaluate each rule against ${args.componentName}
305
+ call run_quality_gate(gateName: "ui_ux_qa", rules: [{name, passed}, ...]) with your boolean results
306
+ call record_learning for any UI gotchas discovered
307
+
308
+ For the full step-by-step methodology, call getMethodology("ui_ux_qa").
309
+
310
+ Commands available:
311
+ npm run test:run — Vitest component tests
312
+ npm run test:e2e — Playwright E2E tests
313
+ npm run storybook — Storybook dev server (port 6006)
314
+ npm run perf:lighthouse — Lighthouse audit
281
315
  npm run perf:bundle — Bundle size analysis`,
282
316
  },
283
317
  },
@@ -305,47 +339,47 @@ Commands available:
305
339
  role: "user",
306
340
  content: {
307
341
  type: "text",
308
- text: `You are coordinating a parallel agent team for: ${args.projectGoal}
309
-
310
- This follows the pattern from Anthropic's "Building a C Compiler with Parallel Claudes" (Feb 2026).
311
- Reference: https://www.anthropic.com/engineering/building-c-compiler
312
-
313
- SETUP (run these in order):
314
-
315
- 1. ORIENT — Check what's already happening:
316
- call get_parallel_status({ includeHistory: true })
317
- call list_agent_tasks({ status: "all" })
318
-
319
- 2. PLAN ROLES — Assign ${agentCount} specialized agents:
320
- Recommended role split for ${agentCount} agents:
321
- ${agentCount >= 4 ? `- Agent 1: assign_agent_role({ role: "implementer", focusArea: "core features" })
322
- - Agent 2: assign_agent_role({ role: "test_writer", focusArea: "test coverage" })
323
- - Agent 3: assign_agent_role({ role: "code_quality_critic", focusArea: "refactoring" })
342
+ text: `You are coordinating a parallel agent team for: ${args.projectGoal}
343
+
344
+ This follows the pattern from Anthropic's "Building a C Compiler with Parallel Claudes" (Feb 2026).
345
+ Reference: https://www.anthropic.com/engineering/building-c-compiler
346
+
347
+ SETUP (run these in order):
348
+
349
+ 1. ORIENT — Check what's already happening:
350
+ call get_parallel_status({ includeHistory: true })
351
+ call list_agent_tasks({ status: "all" })
352
+
353
+ 2. PLAN ROLES — Assign ${agentCount} specialized agents:
354
+ Recommended role split for ${agentCount} agents:
355
+ ${agentCount >= 4 ? `- Agent 1: assign_agent_role({ role: "implementer", focusArea: "core features" })
356
+ - Agent 2: assign_agent_role({ role: "test_writer", focusArea: "test coverage" })
357
+ - Agent 3: assign_agent_role({ role: "code_quality_critic", focusArea: "refactoring" })
324
358
  - Agent 4: assign_agent_role({ role: "documentation_maintainer", focusArea: "docs and progress" })` :
325
- `- Agent 1: assign_agent_role({ role: "implementer" })
326
- - Agent 2: assign_agent_role({ role: "test_writer" })`}
327
-
328
- 3. BREAK DOWN WORK — Create task claims:
329
- For each independent piece of work:
330
- call claim_agent_task({ taskKey: "descriptive_snake_case", description: "What to do" })
331
-
332
- 4. WORK LOOP (each agent independently):
333
- a. claim_agent_task — Lock your task
334
- b. Do the work (implement, test, review)
335
- c. log_context_budget — Track context usage, avoid pollution
336
- d. run_oracle_comparison — Validate output against known-good reference
337
- e. release_agent_task — Release with progress note
338
- f. Pick next task (repeat)
339
-
340
- 5. ANTI-PATTERNS TO AVOID:
341
- - Two agents working on the same task (always claim first)
342
- - Dumping thousands of lines of test output (log to file, print summary)
343
- - Spending hours on one stuck problem (mark as blocked, move on)
344
- - Overwriting each other's changes (commit frequently, pull before push)
345
-
346
- KEY INSIGHT from Anthropic: When all agents get stuck on the same bug (like compiling the Linux kernel),
347
- use oracle-based testing to split the problem into independent sub-problems that each agent can solve in parallel.
348
-
359
+ `- Agent 1: assign_agent_role({ role: "implementer" })
360
+ - Agent 2: assign_agent_role({ role: "test_writer" })`}
361
+
362
+ 3. BREAK DOWN WORK — Create task claims:
363
+ For each independent piece of work:
364
+ call claim_agent_task({ taskKey: "descriptive_snake_case", description: "What to do" })
365
+
366
+ 4. WORK LOOP (each agent independently):
367
+ a. claim_agent_task — Lock your task
368
+ b. Do the work (implement, test, review)
369
+ c. log_context_budget — Track context usage, avoid pollution
370
+ d. run_oracle_comparison — Validate output against known-good reference
371
+ e. release_agent_task — Release with progress note
372
+ f. Pick next task (repeat)
373
+
374
+ 5. ANTI-PATTERNS TO AVOID:
375
+ - Two agents working on the same task (always claim first)
376
+ - Dumping thousands of lines of test output (log to file, print summary)
377
+ - Spending hours on one stuck problem (mark as blocked, move on)
378
+ - Overwriting each other's changes (commit frequently, pull before push)
379
+
380
+ KEY INSIGHT from Anthropic: When all agents get stuck on the same bug (like compiling the Linux kernel),
381
+ use oracle-based testing to split the problem into independent sub-problems that each agent can solve in parallel.
382
+
349
383
  For the full methodology: call getMethodology("parallel_agent_teams")`,
350
384
  },
351
385
  },
@@ -372,45 +406,45 @@ For the full methodology: call getMethodology("parallel_agent_teams")`,
372
406
  role: "user",
373
407
  content: {
374
408
  type: "text",
375
- text: `Set up oracle-based testing for: ${args.componentName}
376
- Oracle source: ${args.oracleSource}
377
-
378
- This follows the pattern from Anthropic's C Compiler project where GCC served as a
379
- "known-good compiler oracle" to identify which specific files were broken.
380
-
381
- SETUP:
382
-
383
- 1. DEFINE ORACLE — Capture known-good reference outputs:
384
- Run the reference implementation (${args.oracleSource}) on each test input.
385
- Save outputs as golden files or capture them in the oracle comparison tool.
386
-
387
- 2. RUN COMPARISONS — For each test case:
388
- call run_oracle_comparison({
389
- testLabel: "${args.componentName}_test_1",
390
- actualOutput: "<your implementation's output>",
391
- expectedOutput: "<oracle's output>",
392
- oracleSource: "${args.oracleSource}"
393
- })
394
-
395
- 3. TRIAGE FAILURES — Review diff summaries:
396
- Each failing comparison is an independent work item.
397
- Assign each to a different parallel agent via claim_agent_task.
398
-
399
- 4. BINARY SEARCH (for complex failures):
400
- If a test passes individually but fails when combined with others,
401
- use delta debugging: split the test set in half, test each half,
402
- narrow down to the minimal failing combination.
403
- (This is how Anthropic found pairs of files that failed together but worked independently.)
404
-
405
- 5. TRACK PROGRESS — Monitor convergence:
406
- call get_parallel_status to see how many oracle tests are still failing.
407
- As agents fix failures, the match percentage should trend toward 100%.
408
-
409
- CONTEXT BUDGET TIP: Large test outputs pollute context. Instead of printing full output,
410
- call log_context_budget to track usage and only show diff summaries (first 20 differing lines).
411
-
412
- After all oracle tests pass:
413
- call record_learning with patterns discovered
409
+ text: `Set up oracle-based testing for: ${args.componentName}
410
+ Oracle source: ${args.oracleSource}
411
+
412
+ This follows the pattern from Anthropic's C Compiler project where GCC served as a
413
+ "known-good compiler oracle" to identify which specific files were broken.
414
+
415
+ SETUP:
416
+
417
+ 1. DEFINE ORACLE — Capture known-good reference outputs:
418
+ Run the reference implementation (${args.oracleSource}) on each test input.
419
+ Save outputs as golden files or capture them in the oracle comparison tool.
420
+
421
+ 2. RUN COMPARISONS — For each test case:
422
+ call run_oracle_comparison({
423
+ testLabel: "${args.componentName}_test_1",
424
+ actualOutput: "<your implementation's output>",
425
+ expectedOutput: "<oracle's output>",
426
+ oracleSource: "${args.oracleSource}"
427
+ })
428
+
429
+ 3. TRIAGE FAILURES — Review diff summaries:
430
+ Each failing comparison is an independent work item.
431
+ Assign each to a different parallel agent via claim_agent_task.
432
+
433
+ 4. BINARY SEARCH (for complex failures):
434
+ If a test passes individually but fails when combined with others,
435
+ use delta debugging: split the test set in half, test each half,
436
+ narrow down to the minimal failing combination.
437
+ (This is how Anthropic found pairs of files that failed together but worked independently.)
438
+
439
+ 5. TRACK PROGRESS — Monitor convergence:
440
+ call get_parallel_status to see how many oracle tests are still failing.
441
+ As agents fix failures, the match percentage should trend toward 100%.
442
+
443
+ CONTEXT BUDGET TIP: Large test outputs pollute context. Instead of printing full output,
444
+ call log_context_budget to track usage and only show diff summaries (first 20 differing lines).
445
+
446
+ After all oracle tests pass:
447
+ call record_learning with patterns discovered
414
448
  call run_mandatory_flywheel to verify the full change`,
415
449
  },
416
450
  },
@@ -438,67 +472,67 @@ After all oracle tests pass:
438
472
  role: "user",
439
473
  content: {
440
474
  type: "text",
441
- text: `You are coordinating ${count} parallel Claude Code subagents for: ${args.taskDescription}
442
-
443
- ## How This Works
444
-
445
- Claude Code's Task tool spawns subagents — each is an independent Claude instance with its own
446
- context window. NodeBench MCP tools coordinate them via a shared SQLite database.
447
-
448
- **Your role: COORDINATOR.** You break work into independent tasks and spawn subagents.
449
- **Subagent role: WORKER.** Each claims a task, does work, releases with a progress note.
450
-
451
- ## Step-by-Step
452
-
453
- ### 1. PLAN — Break work into ${count} independent tasks
454
- Identify ${count} pieces of work that can run in parallel without dependencies.
455
- Each task should be independently completable and testable.
456
-
457
- ### 2. SPAWN — Launch subagents with coordination instructions
458
- For each task, use the Task tool:
459
-
460
- \`\`\`
461
- Task tool call:
462
- prompt: "You have access to NodeBench MCP. Do the following:
463
- 1. Call claim_agent_task({ taskKey: '<task_key>', description: '<what to do>' })
464
- 2. Call assign_agent_role({ role: 'implementer', focusArea: '<area>' })
465
- 3. Do the work
466
- 4. Call log_context_budget({ eventType: 'checkpoint', tokensUsed: <estimate> })
467
- 5. Call release_agent_task({ taskKey: '<task_key>', status: 'completed', progressNote: '<summary>' })
468
- 6. Call record_learning({ key: '<key>', content: '<what you learned>', category: 'pattern' })"
469
- \`\`\`
470
-
471
- ### 3. MONITOR — Check progress
472
- After spawning all subagents:
473
- call get_parallel_status({ includeHistory: true })
474
- call list_agent_tasks({ status: "all" })
475
-
476
- ### 4. VALIDATE — Run oracle comparisons if applicable
477
- If subagents produced outputs that should match a reference:
478
- call run_oracle_comparison for each output
479
-
480
- ### 5. GATE — Quality check the aggregate result
481
- call run_quality_gate with rules covering all ${count} tasks
482
- call run_mandatory_flywheel to verify the combined change
483
-
484
- ## Concrete IMPACT of This Workflow
485
-
486
- | What NodeBench Adds | Without It (bare subagents) |
487
- |---------------------------------|---------------------------------------|
488
- | Task locks prevent duplicate work | Two subagents might fix the same bug |
489
- | Role specialization | All subagents do everything |
490
- | Context budget tracking | Subagent runs out of context silently |
491
- | Oracle comparisons | No reference-based validation |
492
- | Progress notes for handoff | Next session starts from scratch |
493
- | Learnings persisted | Knowledge lost when subagent exits |
494
- | Quality gate on aggregate | No validation that pieces fit together |
495
-
496
- ## Anti-Patterns
497
- - DO NOT spawn subagents for work that has dependencies (sequential steps)
498
- - DO NOT skip claim_agent_task — without it, two subagents may duplicate effort
499
- - DO NOT dump large outputs into subagent context — use log_context_budget to track
500
- - DO NOT forget release_agent_task — orphaned claims block future sessions
501
-
475
+ text: `You are coordinating ${count} parallel Claude Code subagents for: ${args.taskDescription}
476
+
477
+ ## How This Works
478
+
479
+ Claude Code's Task tool spawns subagents — each is an independent Claude instance with its own
480
+ context window. NodeBench MCP tools coordinate them via a shared SQLite database.
481
+
482
+ **Your role: COORDINATOR.** You break work into independent tasks and spawn subagents.
483
+ **Subagent role: WORKER.** Each claims a task, does work, releases with a progress note.
484
+
485
+ ## Step-by-Step
486
+
487
+ ### 1. PLAN — Break work into ${count} independent tasks
488
+ Identify ${count} pieces of work that can run in parallel without dependencies.
489
+ Each task should be independently completable and testable.
490
+
491
+ ### 2. SPAWN — Launch subagents with coordination instructions
492
+ For each task, use the Task tool:
493
+
494
+ \`\`\`
495
+ Task tool call:
496
+ prompt: "You have access to NodeBench MCP. Do the following:
497
+ 1. Call claim_agent_task({ taskKey: '<task_key>', description: '<what to do>' })
498
+ 2. Call assign_agent_role({ role: 'implementer', focusArea: '<area>' })
499
+ 3. Do the work
500
+ 4. Call log_context_budget({ eventType: 'checkpoint', tokensUsed: <estimate> })
501
+ 5. Call release_agent_task({ taskKey: '<task_key>', status: 'completed', progressNote: '<summary>' })
502
+ 6. Call record_learning({ key: '<key>', content: '<what you learned>', category: 'pattern' })"
503
+ \`\`\`
504
+
505
+ ### 3. MONITOR — Check progress
506
+ After spawning all subagents:
507
+ call get_parallel_status({ includeHistory: true })
508
+ call list_agent_tasks({ status: "all" })
509
+
510
+ ### 4. VALIDATE — Run oracle comparisons if applicable
511
+ If subagents produced outputs that should match a reference:
512
+ call run_oracle_comparison for each output
513
+
514
+ ### 5. GATE — Quality check the aggregate result
515
+ call run_quality_gate with rules covering all ${count} tasks
516
+ call run_mandatory_flywheel to verify the combined change
517
+
518
+ ## Concrete IMPACT of This Workflow
519
+
520
+ | What NodeBench Adds | Without It (bare subagents) |
521
+ |---------------------------------|---------------------------------------|
522
+ | Task locks prevent duplicate work | Two subagents might fix the same bug |
523
+ | Role specialization | All subagents do everything |
524
+ | Context budget tracking | Subagent runs out of context silently |
525
+ | Oracle comparisons | No reference-based validation |
526
+ | Progress notes for handoff | Next session starts from scratch |
527
+ | Learnings persisted | Knowledge lost when subagent exits |
528
+ | Quality gate on aggregate | No validation that pieces fit together |
529
+
530
+ ## Anti-Patterns
531
+ - DO NOT spawn subagents for work that has dependencies (sequential steps)
532
+ - DO NOT skip claim_agent_task — without it, two subagents may duplicate effort
533
+ - DO NOT dump large outputs into subagent context — use log_context_budget to track
534
+ - DO NOT forget release_agent_task — orphaned claims block future sessions
535
+
502
536
  For the full parallel agent methodology: call getMethodology("parallel_agent_teams")`,
503
537
  },
504
538
  },
@@ -525,72 +559,72 @@ For the full parallel agent methodology: call getMethodology("parallel_agent_tea
525
559
  role: "user",
526
560
  content: {
527
561
  type: "text",
528
- text: `Bootstrap parallel agent infrastructure for: ${args.projectPath}
529
- ${args.techStack ? `Tech stack: ${args.techStack}` : ""}
530
-
531
- This follows the AI Flywheel closed loop: detect → scaffold → verify → fix → document.
532
-
533
- STEP 1 — DETECT (dry run first):
534
- call bootstrap_parallel_agents({
535
- projectRoot: "${args.projectPath}",
536
- dryRun: true,
537
- ${args.techStack ? `techStack: "${args.techStack}",` : ""}
538
- includeAgentsMd: true
539
- })
540
-
541
- Review the gap report. It scans 7 categories:
542
- - Task coordination (lock files, claim directories)
543
- - Role specialization (role configs, AGENTS.md mentions)
544
- - Oracle testing (golden files, reference outputs, snapshots)
545
- - Context budget tracking (budget configs, AGENTS.md mentions)
546
- - Progress files (PROGRESS.md, STATUS.md, claude-progress.txt)
547
- - AGENTS.md parallel section (parallel agent coordination protocol)
548
- - Git worktrees (for true parallel work)
549
-
550
- STEP 2 — SCAFFOLD (create files):
551
- If gaps found, run with dryRun=false:
552
- call bootstrap_parallel_agents({
553
- projectRoot: "${args.projectPath}",
554
- dryRun: false,
555
- ${args.techStack ? `techStack: "${args.techStack}",` : ""}
556
- includeAgentsMd: true
557
- })
558
-
559
- This creates:
560
- - .parallel-agents/ directory with README, current_tasks/, oracle/, roles.json
561
- - progress.md template for agent orientation
562
- - AGENTS.md parallel section (or .parallel-append file for existing AGENTS.md)
563
-
564
- STEP 3 — GENERATE AGENTS.MD (if needed):
565
- call generate_parallel_agents_md({
566
- ${args.techStack ? `techStack: "${args.techStack}",` : ""}
567
- projectName: "${args.projectPath.split("/").pop() || "project"}",
568
- maxAgents: 4,
569
- includeNodebenchSetup: true
570
- })
571
-
572
- Copy the output into the target repo's AGENTS.md.
573
-
574
- STEP 4 — VERIFY (6-step flywheel):
575
- The bootstrap tool returns a flywheelPlan. Execute each step:
576
- 1. Static analysis — verify scaffold files don't conflict
577
- 2. Happy path — claim task → work → release → progress.md updated
578
- 3. Conflict test — two claims on same task → second gets conflict
579
- 4. Oracle test — create golden file → diff catches changes
580
- 5. Gap re-scan — re-run bootstrap with dryRun=true → all gaps filled
581
- 6. Document — record_learning with patterns discovered
582
-
583
- STEP 5 — FIX (if anything fails):
584
- Fix the issue, then re-run from Step 4.
585
-
586
- STEP 6 — DOCUMENT:
587
- call record_learning({
588
- key: "bootstrap_parallel_${args.projectPath.split("/").pop() || "project"}",
589
- content: "Bootstrapped parallel agent infrastructure for ${args.projectPath}. <summary of what was created and any issues found>",
590
- category: "pattern",
591
- tags: ["parallel-agents", "bootstrap", "external-repo"]
592
- })
593
-
562
+ text: `Bootstrap parallel agent infrastructure for: ${args.projectPath}
563
+ ${args.techStack ? `Tech stack: ${args.techStack}` : ""}
564
+
565
+ This follows the AI Flywheel closed loop: detect → scaffold → verify → fix → document.
566
+
567
+ STEP 1 — DETECT (dry run first):
568
+ call bootstrap_parallel_agents({
569
+ projectRoot: "${args.projectPath}",
570
+ dryRun: true,
571
+ ${args.techStack ? `techStack: "${args.techStack}",` : ""}
572
+ includeAgentsMd: true
573
+ })
574
+
575
+ Review the gap report. It scans 7 categories:
576
+ - Task coordination (lock files, claim directories)
577
+ - Role specialization (role configs, AGENTS.md mentions)
578
+ - Oracle testing (golden files, reference outputs, snapshots)
579
+ - Context budget tracking (budget configs, AGENTS.md mentions)
580
+ - Progress files (PROGRESS.md, STATUS.md, claude-progress.txt)
581
+ - AGENTS.md parallel section (parallel agent coordination protocol)
582
+ - Git worktrees (for true parallel work)
583
+
584
+ STEP 2 — SCAFFOLD (create files):
585
+ If gaps found, run with dryRun=false:
586
+ call bootstrap_parallel_agents({
587
+ projectRoot: "${args.projectPath}",
588
+ dryRun: false,
589
+ ${args.techStack ? `techStack: "${args.techStack}",` : ""}
590
+ includeAgentsMd: true
591
+ })
592
+
593
+ This creates:
594
+ - .parallel-agents/ directory with README, current_tasks/, oracle/, roles.json
595
+ - progress.md template for agent orientation
596
+ - AGENTS.md parallel section (or .parallel-append file for existing AGENTS.md)
597
+
598
+ STEP 3 — GENERATE AGENTS.MD (if needed):
599
+ call generate_parallel_agents_md({
600
+ ${args.techStack ? `techStack: "${args.techStack}",` : ""}
601
+ projectName: "${args.projectPath.split("/").pop() || "project"}",
602
+ maxAgents: 4,
603
+ includeNodebenchSetup: true
604
+ })
605
+
606
+ Copy the output into the target repo's AGENTS.md.
607
+
608
+ STEP 4 — VERIFY (6-step flywheel):
609
+ The bootstrap tool returns a flywheelPlan. Execute each step:
610
+ 1. Static analysis — verify scaffold files don't conflict
611
+ 2. Happy path — claim task → work → release → progress.md updated
612
+ 3. Conflict test — two claims on same task → second gets conflict
613
+ 4. Oracle test — create golden file → diff catches changes
614
+ 5. Gap re-scan — re-run bootstrap with dryRun=true → all gaps filled
615
+ 6. Document — record_learning with patterns discovered
616
+
617
+ STEP 5 — FIX (if anything fails):
618
+ Fix the issue, then re-run from Step 4.
619
+
620
+ STEP 6 — DOCUMENT:
621
+ call record_learning({
622
+ key: "bootstrap_parallel_${args.projectPath.split("/").pop() || "project"}",
623
+ content: "Bootstrapped parallel agent infrastructure for ${args.projectPath}. <summary of what was created and any issues found>",
624
+ category: "pattern",
625
+ tags: ["parallel-agents", "bootstrap", "external-repo"]
626
+ })
627
+
594
628
  For the full methodology: call getMethodology("parallel_agent_teams")`,
595
629
  },
596
630
  },
@@ -604,60 +638,89 @@ For the full methodology: call getMethodology("parallel_agent_teams")`,
604
638
  role: "user",
605
639
  content: {
606
640
  type: "text",
607
- text: `## NodeBench MCP Agent Contract
608
-
609
- You are connected to NodeBench MCP. Follow these rules EXACTLY.
610
-
611
- ### FRONT DOOR — Always start here (before writing any code)
612
- 1. search_all_knowledge("<your current task>") — Check if this was solved before
613
- 2. getMethodology("mandatory_flywheel") — Load the verification pipeline
614
- 3. discover_tools("<your task>", { explain: true }) — Find the right tools for this job
615
- 4. get_workflow_chain("<workflow>") — Get step-by-step sequence (fix_bug, new_feature, etc.)
616
-
617
- ### SELF-SETUP — If a capability is missing
618
- When discover_tools returns nothing useful, or a tool says "not configured":
619
- 1. Escalate toolset: If started with --preset lite, switch to --preset core or targeted --toolsets
620
- 2. Resolve providers: Configure missing API keys (GEMINI_API_KEY, OPENAI_API_KEY, etc.)
621
- 3. Bootstrap infra: Run scaffold_nodebench_project or bootstrap_parallel_agents if repo lacks infra
622
- 4. Smoke-test: Re-run the first workflow chain step to confirm the capability is available
623
-
624
- ### BEFORE IMPLEMENTATION
625
- - run_recon + log_recon_finding (if reconnaissance applies)
626
- - assess_risk (HIGH risk = must get confirmation before proceeding)
627
-
628
- ### PARALLEL WORK
629
- - MUST claim_agent_task before editing or designing anything
630
- - MUST release_agent_task with a progress note + next action when done
631
- - MUST log_context_budget to track context usage and avoid pollution
632
-
633
- ### BEFORE SHIP
634
- - 3-layer tests logged (unit + integration + e2e via log_test_result)
635
- - Eval run recorded (promote_to_eval)
636
- - Quality gate passed (run_quality_gate)
637
- - Mandatory flywheel completed (run_mandatory_flywheel — all 6 steps)
638
- - Learning banked (record_learning)
639
-
640
- ### COORDINATOR SPAWN TEMPLATE
641
- When spawning subagents, give each this instruction block:
642
- "You have NodeBench MCP. Before any work:
643
- 1. search_all_knowledge('<task>')
644
- 2. claim_agent_task({ taskKey: '<key>', description: '<desc>' })
645
- 3. assign_agent_role({ role: '<role>', focusArea: '<area>' })
646
- Do the work, then:
647
- 4. log_context_budget({ eventType: 'checkpoint' })
648
- 5. release_agent_task({ taskKey: '<key>', status: 'completed', progressNote: '<summary>' })
649
- 6. record_learning({ key: '<key>', content: '<what you learned>', category: 'pattern' })"
650
-
651
- ### WHY THIS MATTERS
652
- Without this contract, agents skip verification, repeat past mistakes, overwrite each other's
653
- work, and ship bugs that were already caught. NodeBench MCP turns coordination into concrete
641
+ text: `## NodeBench MCP Agent Contract
642
+
643
+ You are connected to NodeBench MCP. Follow these rules EXACTLY.
644
+
645
+ ### FRONT DOOR — Always start here (before writing any code)
646
+ 1. search_all_knowledge("<your current task>") — Check if this was solved before
647
+ 2. getMethodology("mandatory_flywheel") — Load the verification pipeline
648
+ 3. discover_tools("<your task>", { explain: true }) — Find the right tools for this job
649
+ 4. get_workflow_chain("<workflow>") — Get step-by-step sequence (fix_bug, new_feature, etc.)
650
+
651
+ ### SELF-SETUP — If a capability is missing
652
+ When discover_tools returns nothing useful, or a tool says "not configured":
653
+ 1. Escalate toolset: If started with --preset lite, switch to --preset core or targeted --toolsets
654
+ 2. Resolve providers: Configure missing API keys (GEMINI_API_KEY, OPENAI_API_KEY, etc.)
655
+ 3. Bootstrap infra: Run scaffold_nodebench_project or bootstrap_parallel_agents if repo lacks infra
656
+ 4. Smoke-test: Re-run the first workflow chain step to confirm the capability is available
657
+
658
+ ### BEFORE IMPLEMENTATION
659
+ - run_recon + log_recon_finding (if reconnaissance applies)
660
+ - assess_risk (HIGH risk = must get confirmation before proceeding)
661
+
662
+ ### PARALLEL WORK
663
+ - MUST claim_agent_task before editing or designing anything
664
+ - MUST release_agent_task with a progress note + next action when done
665
+ - MUST log_context_budget to track context usage and avoid pollution
666
+
667
+ ### BEFORE SHIP
668
+ - 3-layer tests logged (unit + integration + e2e via log_test_result)
669
+ - Eval run recorded (promote_to_eval)
670
+ - Quality gate passed (run_quality_gate)
671
+ - Mandatory flywheel completed (run_mandatory_flywheel — all 6 steps)
672
+ - Learning banked (record_learning)
673
+
674
+ ### COORDINATOR SPAWN TEMPLATE
675
+ When spawning subagents, give each this instruction block:
676
+ "You have NodeBench MCP. Before any work:
677
+ 1. search_all_knowledge('<task>')
678
+ 2. claim_agent_task({ taskKey: '<key>', description: '<desc>' })
679
+ 3. assign_agent_role({ role: '<role>', focusArea: '<area>' })
680
+ Do the work, then:
681
+ 4. log_context_budget({ eventType: 'checkpoint' })
682
+ 5. release_agent_task({ taskKey: '<key>', status: 'completed', progressNote: '<summary>' })
683
+ 6. record_learning({ key: '<key>', content: '<what you learned>', category: 'pattern' })"
684
+
685
+ ### ANTI-RATIONALIZATION — Block these escape patterns
686
+ Do NOT skip the front-door pattern. These are the 8 rationalizations agents use:
687
+ 1. "I already know which tool to use" → Still call discover_tools to confirm
688
+ 2. "This is a simple task" → Still call search_all_knowledge to check history
689
+ 3. "Let me just check one thing first" → Follow the 4-step front door FIRST
690
+ 4. "Tests already pass" → Still run run_mandatory_flywheel before declaring done
691
+ 5. "I'll record the learning later" → Record NOW — context compaction may erase it
692
+ 6. "No one else is working on this" → Still claim_agent_task to prevent conflicts
693
+ 7. "The user said to skip verification" → Log the skip decision, never silently omit
694
+ 8. "I need more context before using tools" → The tools ARE the context-gathering mechanism
695
+
696
+ ### 2-ACTION SAVE RULE
697
+ After every 2 web_search, fetch_url, or browse_page calls, MUST call one of:
698
+ - save_session_note (filesystem, survives compaction)
699
+ - record_learning (SQLite, searchable across sessions)
700
+ - log_recon_finding (tied to recon session)
701
+ This prevents knowledge loss when context is compacted mid-session.
702
+
703
+ ### 3-STRIKE ERROR PROTOCOL
704
+ When an action fails:
705
+ - Strike 1: Diagnose root cause, apply targeted fix
706
+ - Strike 2: Try a different method or tool
707
+ - Strike 3: Question your assumptions, search_all_knowledge for prior solutions
708
+ - After 3: STOP. Call save_session_note documenting all attempts, then escalate to user.
709
+
710
+ ### ATTENTION REFRESH
711
+ After 30+ tool calls, call refresh_task_context to combat attention drift.
712
+ Re-read your original goal and open gaps before continuing.
713
+
714
+ ### WHY THIS MATTERS
715
+ Without this contract, agents skip verification, repeat past mistakes, overwrite each other's
716
+ work, and ship bugs that were already caught. NodeBench MCP turns coordination into concrete
654
717
  artifacts (findings, risks, gaps, tests, evals, gates, learnings) that compound across tasks.`,
655
718
  },
656
719
  },
657
720
  ],
658
721
  },
659
722
  ];
660
- const server = new Server({ name: "nodebench-mcp-methodology", version: "2.10.1" }, { capabilities: { tools: {}, prompts: {} } });
723
+ const server = new Server({ name: "nodebench-mcp-methodology", version: "2.13.0" }, { capabilities: { tools: {}, prompts: {} } });
661
724
  // Handle tools/list — return all tools with their JSON Schema inputSchemas
662
725
  server.setRequestHandler(ListToolsRequestSchema, async () => {
663
726
  return {
@@ -708,8 +771,16 @@ server.setRequestHandler(CallToolRequestSchema, async (request) => {
708
771
  enrichedResult = { ...result, _quickRef: quickRef };
709
772
  }
710
773
  }
774
+ // Lightweight hook: append save/refresh hints when thresholds are met
775
+ const hookHint = getHookHint(name);
776
+ const contentBlocks = [
777
+ { type: "text", text: JSON.stringify(enrichedResult, null, 2) },
778
+ ];
779
+ if (hookHint) {
780
+ contentBlocks.push({ type: "text", text: hookHint });
781
+ }
711
782
  return {
712
- content: [{ type: "text", text: JSON.stringify(enrichedResult, null, 2) }],
783
+ content: contentBlocks,
713
784
  isError: false,
714
785
  };
715
786
  }