nodebench-mcp 2.11.0 → 2.14.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (67)
  1. package/NODEBENCH_AGENTS.md +809 -809
  2. package/README.md +443 -431
  3. package/STYLE_GUIDE.md +477 -477
  4. package/dist/__tests__/evalHarness.test.js +1 -1
  5. package/dist/__tests__/gaiaCapabilityAudioEval.test.js +9 -14
  6. package/dist/__tests__/gaiaCapabilityAudioEval.test.js.map +1 -1
  7. package/dist/__tests__/gaiaCapabilityEval.test.js +88 -14
  8. package/dist/__tests__/gaiaCapabilityEval.test.js.map +1 -1
  9. package/dist/__tests__/gaiaCapabilityFilesEval.test.js +9 -5
  10. package/dist/__tests__/gaiaCapabilityFilesEval.test.js.map +1 -1
  11. package/dist/__tests__/gaiaCapabilityMediaEval.test.js +165 -17
  12. package/dist/__tests__/gaiaCapabilityMediaEval.test.js.map +1 -1
  13. package/dist/__tests__/helpers/answerMatch.d.ts +36 -7
  14. package/dist/__tests__/helpers/answerMatch.js +224 -35
  15. package/dist/__tests__/helpers/answerMatch.js.map +1 -1
  16. package/dist/__tests__/helpers/textLlm.d.ts +1 -1
  17. package/dist/__tests__/presetRealWorldBench.test.d.ts +1 -0
  18. package/dist/__tests__/presetRealWorldBench.test.js +850 -0
  19. package/dist/__tests__/presetRealWorldBench.test.js.map +1 -0
  20. package/dist/__tests__/tools.test.js +20 -7
  21. package/dist/__tests__/tools.test.js.map +1 -1
  22. package/dist/__tests__/toolsetGatingEval.test.js +21 -11
  23. package/dist/__tests__/toolsetGatingEval.test.js.map +1 -1
  24. package/dist/db.js +21 -0
  25. package/dist/db.js.map +1 -1
  26. package/dist/index.js +424 -327
  27. package/dist/index.js.map +1 -1
  28. package/dist/tools/agentBootstrapTools.js +258 -258
  29. package/dist/tools/boilerplateTools.js +144 -144
  30. package/dist/tools/cCompilerBenchmarkTools.js +33 -33
  31. package/dist/tools/documentationTools.js +59 -59
  32. package/dist/tools/flywheelTools.js +6 -6
  33. package/dist/tools/gitWorkflowTools.d.ts +11 -0
  34. package/dist/tools/gitWorkflowTools.js +580 -0
  35. package/dist/tools/gitWorkflowTools.js.map +1 -0
  36. package/dist/tools/learningTools.js +26 -26
  37. package/dist/tools/localFileTools.d.ts +3 -0
  38. package/dist/tools/localFileTools.js +3164 -125
  39. package/dist/tools/localFileTools.js.map +1 -1
  40. package/dist/tools/metaTools.js +82 -0
  41. package/dist/tools/metaTools.js.map +1 -1
  42. package/dist/tools/parallelAgentTools.js +228 -0
  43. package/dist/tools/parallelAgentTools.js.map +1 -1
  44. package/dist/tools/patternTools.d.ts +13 -0
  45. package/dist/tools/patternTools.js +456 -0
  46. package/dist/tools/patternTools.js.map +1 -0
  47. package/dist/tools/reconTools.js +31 -31
  48. package/dist/tools/selfEvalTools.js +44 -44
  49. package/dist/tools/seoTools.d.ts +16 -0
  50. package/dist/tools/seoTools.js +866 -0
  51. package/dist/tools/seoTools.js.map +1 -0
  52. package/dist/tools/sessionMemoryTools.d.ts +15 -0
  53. package/dist/tools/sessionMemoryTools.js +348 -0
  54. package/dist/tools/sessionMemoryTools.js.map +1 -0
  55. package/dist/tools/toolRegistry.d.ts +4 -0
  56. package/dist/tools/toolRegistry.js +489 -0
  57. package/dist/tools/toolRegistry.js.map +1 -1
  58. package/dist/tools/toonTools.d.ts +15 -0
  59. package/dist/tools/toonTools.js +94 -0
  60. package/dist/tools/toonTools.js.map +1 -0
  61. package/dist/tools/verificationTools.js +41 -41
  62. package/dist/tools/visionTools.js +17 -17
  63. package/dist/tools/voiceBridgeTools.d.ts +15 -0
  64. package/dist/tools/voiceBridgeTools.js +1427 -0
  65. package/dist/tools/voiceBridgeTools.js.map +1 -0
  66. package/dist/tools/webTools.js +18 -18
  67. package/package.json +102 -101
package/dist/index.js CHANGED
@@ -41,13 +41,22 @@ import { researchWritingTools } from "./tools/researchWritingTools.js";
41
41
  import { flickerDetectionTools } from "./tools/flickerDetectionTools.js";
42
42
  import { figmaFlowTools } from "./tools/figmaFlowTools.js";
43
43
  import { createMetaTools } from "./tools/metaTools.js";
44
- import { localFileTools } from "./tools/localFileTools.js";
44
+ import { localFileTools, gaiaMediaSolvers } from "./tools/localFileTools.js";
45
45
  import { createProgressiveDiscoveryTools } from "./tools/progressiveDiscoveryTools.js";
46
46
  import { boilerplateTools } from "./tools/boilerplateTools.js";
47
47
  import { cCompilerBenchmarkTools } from "./tools/cCompilerBenchmarkTools.js";
48
+ import { sessionMemoryTools } from "./tools/sessionMemoryTools.js";
49
+ import { patternTools } from "./tools/patternTools.js";
50
+ import { gitWorkflowTools } from "./tools/gitWorkflowTools.js";
51
+ import { seoTools } from "./tools/seoTools.js";
52
+ import { voiceBridgeTools } from "./tools/voiceBridgeTools.js";
48
53
  import { getQuickRef } from "./tools/toolRegistry.js";
54
+ import { toonTools } from "./tools/toonTools.js";
55
+ // TOON format — ~40% token savings on tool responses
56
+ import { encode as toonEncode } from "@toon-format/toon";
49
57
  // ── CLI argument parsing ──────────────────────────────────────────────
50
58
  const cliArgs = process.argv.slice(2);
59
+ const useToon = cliArgs.includes("--toon");
51
60
  const TOOLSET_MAP = {
52
61
  verification: verificationTools,
53
62
  eval: evalTools,
@@ -72,17 +81,24 @@ const TOOLSET_MAP = {
72
81
  figma_flow: figmaFlowTools,
73
82
  boilerplate: boilerplateTools,
74
83
  benchmark: cCompilerBenchmarkTools,
84
+ session_memory: sessionMemoryTools,
85
+ gaia_solvers: gaiaMediaSolvers,
86
+ toon: toonTools,
87
+ pattern: patternTools,
88
+ git_workflow: gitWorkflowTools,
89
+ seo: seoTools,
90
+ voice_bridge: voiceBridgeTools,
75
91
  };
76
92
  const PRESETS = {
77
93
  meta: [],
78
- lite: ["verification", "eval", "quality_gate", "learning", "recon", "security", "boilerplate"],
79
- core: ["verification", "eval", "quality_gate", "learning", "flywheel", "recon", "bootstrap", "self_eval", "llm", "security", "platform", "research_writing", "flicker_detection", "figma_flow", "boilerplate", "benchmark"],
94
+ lite: ["verification", "eval", "quality_gate", "learning", "flywheel", "recon", "security", "boilerplate"],
95
+ core: ["verification", "eval", "quality_gate", "learning", "flywheel", "recon", "bootstrap", "self_eval", "llm", "security", "platform", "research_writing", "flicker_detection", "figma_flow", "boilerplate", "benchmark", "session_memory", "toon", "pattern", "git_workflow", "seo", "voice_bridge"],
80
96
  full: Object.keys(TOOLSET_MAP),
81
97
  };
82
98
  function parseToolsets() {
83
99
  if (cliArgs.includes("--help")) {
84
100
  const lines = [
85
- "nodebench-mcp v2.11.0 — Development Methodology MCP Server",
101
+ "nodebench-mcp v2.14.0 — Development Methodology MCP Server",
86
102
  "",
87
103
  "Usage: nodebench-mcp [options]",
88
104
  "",
@@ -90,6 +106,7 @@ function parseToolsets() {
90
106
  " --toolsets <list> Comma-separated toolsets to enable (default: all)",
91
107
  " --exclude <list> Comma-separated toolsets to exclude",
92
108
  " --preset <name> Use a preset: meta, lite, core, or full",
109
+ " --toon Encode all tool responses in TOON format (~40% token savings)",
93
110
  " --help Show this help and exit",
94
111
  "",
95
112
  "Available toolsets:",
@@ -161,6 +178,36 @@ for (const tool of allTools) {
161
178
  const SESSION_ID = genId("mcp");
162
179
  // Tools to skip auto-logging (avoid infinite recursion and noise)
163
180
  const SKIP_AUTO_LOG = new Set(["log_tool_call", "get_trajectory_analysis", "get_self_eval_report", "get_improvement_recommendations", "cleanup_stale_runs", "synthesize_recon_to_learnings"]);
181
+ // ── Lightweight hooks: auto-save + attention refresh reminders ─────────
182
+ const _hookState = { // mutable module-level counters read and updated by getHookHint
183
+ totalCalls: 0, // incremented on every getHookHint call, whatever the tool
184
+ consecutiveWebCalls: 0, // web-tool calls (web_search, fetch_url) since the last save-tool call (save_session_note or record_learning)
185
+ lastRefreshReminder: 0, // value of totalCalls when the refresh reminder last fired
186
+ };
187
+ const WEB_TOOL_NAMES = new Set(["web_search", "fetch_url"]); // tools that increment consecutiveWebCalls
188
+ const SAVE_TOOL_NAMES = new Set(["save_session_note", "record_learning"]); // tools that reset consecutiveWebCalls to 0
189
+ const REFRESH_INTERVAL = 30; // number of calls between refresh_task_context reminders
190
+ function getHookHint(toolName) { // Updates _hookState for one call to `toolName`; returns reminder text (string) or null when no reminder applies
191
+ _hookState.totalCalls++; // every call counts toward the refresh interval
192
+ // Count web calls since the last save: web tools increment, save tools reset, any other tool leaves the counter unchanged
193
+ if (WEB_TOOL_NAMES.has(toolName)) {
194
+ _hookState.consecutiveWebCalls++;
195
+ }
196
+ else if (SAVE_TOOL_NAMES.has(toolName)) {
197
+ _hookState.consecutiveWebCalls = 0;
198
+ }
199
+ const hints = [];
200
+ // Auto-save reminder once 2+ web calls have accumulated; it repeats on every call (any tool) until a save tool resets the counter
201
+ if (_hookState.consecutiveWebCalls >= 2) {
202
+ hints.push("_hint: You've made " + _hookState.consecutiveWebCalls + " web calls without saving. Consider calling save_session_note to persist findings before context compaction.");
203
+ }
204
+ // Attention refresh reminder once REFRESH_INTERVAL calls have passed since the previous reminder
205
+ if (_hookState.totalCalls - _hookState.lastRefreshReminder >= REFRESH_INTERVAL) {
206
+ hints.push("_hint: " + _hookState.totalCalls + " tool calls this session. Consider calling refresh_task_context to reload your bearings and prevent attention drift.");
207
+ _hookState.lastRefreshReminder = _hookState.totalCalls; // restart the interval from this call
208
+ }
209
+ return hints.length > 0 ? hints.join(" | ") : null; // both hints may fire on the same call; they are joined with " | "
210
+ }
164
211
  // MCP Prompts — protocol-native agent instructions for onboarding
165
212
  const PROMPTS = [
166
213
  {
@@ -171,37 +218,37 @@ const PROMPTS = [
171
218
  role: "user",
172
219
  content: {
173
220
  type: "text",
174
- text: `You are connected to NodeBench MCP — tools that make you catch the bugs you'd normally ship.
175
-
176
- WHAT THIS DOES:
177
- In benchmarks across 9 real production prompts, agents with NodeBench MCP caught 13 issues (4 HIGH severity)
178
- that bare agents shipped to production. 26 blind spots prevented. Knowledge compounds — by task 9,
179
- the agent finds 2+ prior findings before writing a single line of code.
180
-
181
- HOW IT WORKS:
182
- Every task follows a pipeline: Research → Risk → Implement → Test (3 layers) → Eval → Gate → Learn → Ship.
183
- Each step produces a concrete artifact (an issue found, a regression guarded, a pattern banked) that
184
- compounds into future tasks.
185
-
186
- FIRST TIME? Run these 3 steps:
187
- 1. Call bootstrap_project to register your project (tech stack, architecture, conventions)
188
- 2. Call getMethodology("overview") to see all available methodologies
189
- 3. Call search_all_knowledge("your current task") before starting any work
190
-
191
- RETURNING? Your project context and all past learnings are persisted. Start with:
192
- 1. Call search_all_knowledge with your current task
193
- 2. Follow the methodology tools as you work — they'll guide you step by step
194
-
195
- KEY TOOLS:
196
- - search_all_knowledge — Search prior findings before starting (avoid repeating past mistakes)
197
- - run_mandatory_flywheel — 6-step minimum verification before declaring work done
198
- - getMethodology — Step-by-step guides for verification, eval, flywheel, recon
199
- - findTools — Discover tools by keyword or category
200
- - assess_risk — Assess risk before acting (HIGH = needs confirmation)
201
-
202
- PARALLEL AGENTS? If using Claude Code subagents or multiple terminals:
203
- - claim_agent_task / release_agent_task — Lock tasks to prevent duplicate work
204
- - get_parallel_status — See what all agents are doing
221
+ text: `You are connected to NodeBench MCP — tools that make you catch the bugs you'd normally ship.
222
+
223
+ WHAT THIS DOES:
224
+ In benchmarks across 9 real production prompts, agents with NodeBench MCP caught 13 issues (4 HIGH severity)
225
+ that bare agents shipped to production. 26 blind spots prevented. Knowledge compounds — by task 9,
226
+ the agent finds 2+ prior findings before writing a single line of code.
227
+
228
+ HOW IT WORKS:
229
+ Every task follows a pipeline: Research → Risk → Implement → Test (3 layers) → Eval → Gate → Learn → Ship.
230
+ Each step produces a concrete artifact (an issue found, a regression guarded, a pattern banked) that
231
+ compounds into future tasks.
232
+
233
+ FIRST TIME? Run these 3 steps:
234
+ 1. Call bootstrap_project to register your project (tech stack, architecture, conventions)
235
+ 2. Call getMethodology("overview") to see all available methodologies
236
+ 3. Call search_all_knowledge("your current task") before starting any work
237
+
238
+ RETURNING? Your project context and all past learnings are persisted. Start with:
239
+ 1. Call search_all_knowledge with your current task
240
+ 2. Follow the methodology tools as you work — they'll guide you step by step
241
+
242
+ KEY TOOLS:
243
+ - search_all_knowledge — Search prior findings before starting (avoid repeating past mistakes)
244
+ - run_mandatory_flywheel — 6-step minimum verification before declaring work done
245
+ - getMethodology — Step-by-step guides for verification, eval, flywheel, recon
246
+ - findTools — Discover tools by keyword or category
247
+ - assess_risk — Assess risk before acting (HIGH = needs confirmation)
248
+
249
+ PARALLEL AGENTS? If using Claude Code subagents or multiple terminals:
250
+ - claim_agent_task / release_agent_task — Lock tasks to prevent duplicate work
251
+ - get_parallel_status — See what all agents are doing
205
252
  - Use the "claude-code-parallel" prompt for step-by-step guidance`,
206
253
  },
207
254
  },
@@ -222,16 +269,16 @@ PARALLEL AGENTS? If using Claude Code subagents or multiple terminals:
222
269
  role: "user",
223
270
  content: {
224
271
  type: "text",
225
- text: `Help me set up NodeBench methodology tracking for project: ${args.projectName}
226
-
227
- Please gather and record the following using the bootstrap_project tool:
228
- 1. Tech stack (languages, frameworks, runtimes)
229
- 2. Key dependency versions
230
- 3. Architecture overview
231
- 4. Build/test commands
232
- 5. Known conventions or patterns
233
- 6. Repository structure highlights
234
-
272
+ text: `Help me set up NodeBench methodology tracking for project: ${args.projectName}
273
+
274
+ Please gather and record the following using the bootstrap_project tool:
275
+ 1. Tech stack (languages, frameworks, runtimes)
276
+ 2. Key dependency versions
277
+ 3. Architecture overview
278
+ 4. Build/test commands
279
+ 5. Known conventions or patterns
280
+ 6. Repository structure highlights
281
+
235
282
  After bootstrapping, run a reconnaissance session with run_recon to check for latest updates on the project's key frameworks and SDKs.`,
236
283
  },
237
284
  },
@@ -252,33 +299,33 @@ After bootstrapping, run a reconnaissance session with run_recon to check for la
252
299
  role: "user",
253
300
  content: {
254
301
  type: "text",
255
- text: `You just implemented UI changes to: ${args.componentName}
256
-
257
- Before declaring this work done, run the UI/UX QA checklist:
258
-
259
- 1. COMPONENT TESTS: Run \`npm run test:run\` — all component tests must pass
260
- 2. STORYBOOK: Run \`npm run storybook\` — verify the component renders in isolation
261
- 3. RESPONSIVE: Check at 375px, 768px, 1280px — layout must not break
262
- 4. ACCESSIBILITY: Tab through the UI, check aria-labels, run Storybook a11y panel
263
- 5. STATES: Verify loading, error, and empty states are handled
264
- 6. CONSOLE: Check browser devtools for errors/warnings
265
- 7. CAPTURE: Call capture_responsive_suite(url, label) to screenshot at 3 breakpoints
266
- 8. E2E: Run \`npm run test:e2e\` if relevant tests exist
267
- 9. LIGHTHOUSE: Run \`npm run perf:lighthouse\` for performance + accessibility scores
268
-
269
- After checking each item, record results:
270
- call get_gate_preset("ui_ux_qa") to see the 8 evaluation rules
271
- evaluate each rule against ${args.componentName}
272
- call run_quality_gate(gateName: "ui_ux_qa", rules: [{name, passed}, ...]) with your boolean results
273
- call record_learning for any UI gotchas discovered
274
-
275
- For the full step-by-step methodology, call getMethodology("ui_ux_qa").
276
-
277
- Commands available:
278
- npm run test:run — Vitest component tests
279
- npm run test:e2e — Playwright E2E tests
280
- npm run storybook — Storybook dev server (port 6006)
281
- npm run perf:lighthouse — Lighthouse audit
302
+ text: `You just implemented UI changes to: ${args.componentName}
303
+
304
+ Before declaring this work done, run the UI/UX QA checklist:
305
+
306
+ 1. COMPONENT TESTS: Run \`npm run test:run\` — all component tests must pass
307
+ 2. STORYBOOK: Run \`npm run storybook\` — verify the component renders in isolation
308
+ 3. RESPONSIVE: Check at 375px, 768px, 1280px — layout must not break
309
+ 4. ACCESSIBILITY: Tab through the UI, check aria-labels, run Storybook a11y panel
310
+ 5. STATES: Verify loading, error, and empty states are handled
311
+ 6. CONSOLE: Check browser devtools for errors/warnings
312
+ 7. CAPTURE: Call capture_responsive_suite(url, label) to screenshot at 3 breakpoints
313
+ 8. E2E: Run \`npm run test:e2e\` if relevant tests exist
314
+ 9. LIGHTHOUSE: Run \`npm run perf:lighthouse\` for performance + accessibility scores
315
+
316
+ After checking each item, record results:
317
+ call get_gate_preset("ui_ux_qa") to see the 8 evaluation rules
318
+ evaluate each rule against ${args.componentName}
319
+ call run_quality_gate(gateName: "ui_ux_qa", rules: [{name, passed}, ...]) with your boolean results
320
+ call record_learning for any UI gotchas discovered
321
+
322
+ For the full step-by-step methodology, call getMethodology("ui_ux_qa").
323
+
324
+ Commands available:
325
+ npm run test:run — Vitest component tests
326
+ npm run test:e2e — Playwright E2E tests
327
+ npm run storybook — Storybook dev server (port 6006)
328
+ npm run perf:lighthouse — Lighthouse audit
282
329
  npm run perf:bundle — Bundle size analysis`,
283
330
  },
284
331
  },
@@ -306,47 +353,47 @@ Commands available:
306
353
  role: "user",
307
354
  content: {
308
355
  type: "text",
309
- text: `You are coordinating a parallel agent team for: ${args.projectGoal}
310
-
311
- This follows the pattern from Anthropic's "Building a C Compiler with Parallel Claudes" (Feb 2026).
312
- Reference: https://www.anthropic.com/engineering/building-c-compiler
313
-
314
- SETUP (run these in order):
315
-
316
- 1. ORIENT — Check what's already happening:
317
- call get_parallel_status({ includeHistory: true })
318
- call list_agent_tasks({ status: "all" })
319
-
320
- 2. PLAN ROLES — Assign ${agentCount} specialized agents:
321
- Recommended role split for ${agentCount} agents:
322
- ${agentCount >= 4 ? `- Agent 1: assign_agent_role({ role: "implementer", focusArea: "core features" })
323
- - Agent 2: assign_agent_role({ role: "test_writer", focusArea: "test coverage" })
324
- - Agent 3: assign_agent_role({ role: "code_quality_critic", focusArea: "refactoring" })
356
+ text: `You are coordinating a parallel agent team for: ${args.projectGoal}
357
+
358
+ This follows the pattern from Anthropic's "Building a C Compiler with Parallel Claudes" (Feb 2026).
359
+ Reference: https://www.anthropic.com/engineering/building-c-compiler
360
+
361
+ SETUP (run these in order):
362
+
363
+ 1. ORIENT — Check what's already happening:
364
+ call get_parallel_status({ includeHistory: true })
365
+ call list_agent_tasks({ status: "all" })
366
+
367
+ 2. PLAN ROLES — Assign ${agentCount} specialized agents:
368
+ Recommended role split for ${agentCount} agents:
369
+ ${agentCount >= 4 ? `- Agent 1: assign_agent_role({ role: "implementer", focusArea: "core features" })
370
+ - Agent 2: assign_agent_role({ role: "test_writer", focusArea: "test coverage" })
371
+ - Agent 3: assign_agent_role({ role: "code_quality_critic", focusArea: "refactoring" })
325
372
  - Agent 4: assign_agent_role({ role: "documentation_maintainer", focusArea: "docs and progress" })` :
326
- `- Agent 1: assign_agent_role({ role: "implementer" })
327
- - Agent 2: assign_agent_role({ role: "test_writer" })`}
328
-
329
- 3. BREAK DOWN WORK — Create task claims:
330
- For each independent piece of work:
331
- call claim_agent_task({ taskKey: "descriptive_snake_case", description: "What to do" })
332
-
333
- 4. WORK LOOP (each agent independently):
334
- a. claim_agent_task — Lock your task
335
- b. Do the work (implement, test, review)
336
- c. log_context_budget — Track context usage, avoid pollution
337
- d. run_oracle_comparison — Validate output against known-good reference
338
- e. release_agent_task — Release with progress note
339
- f. Pick next task (repeat)
340
-
341
- 5. ANTI-PATTERNS TO AVOID:
342
- - Two agents working on the same task (always claim first)
343
- - Dumping thousands of lines of test output (log to file, print summary)
344
- - Spending hours on one stuck problem (mark as blocked, move on)
345
- - Overwriting each other's changes (commit frequently, pull before push)
346
-
347
- KEY INSIGHT from Anthropic: When all agents get stuck on the same bug (like compiling the Linux kernel),
348
- use oracle-based testing to split the problem into independent sub-problems that each agent can solve in parallel.
349
-
373
+ `- Agent 1: assign_agent_role({ role: "implementer" })
374
+ - Agent 2: assign_agent_role({ role: "test_writer" })`}
375
+
376
+ 3. BREAK DOWN WORK — Create task claims:
377
+ For each independent piece of work:
378
+ call claim_agent_task({ taskKey: "descriptive_snake_case", description: "What to do" })
379
+
380
+ 4. WORK LOOP (each agent independently):
381
+ a. claim_agent_task — Lock your task
382
+ b. Do the work (implement, test, review)
383
+ c. log_context_budget — Track context usage, avoid pollution
384
+ d. run_oracle_comparison — Validate output against known-good reference
385
+ e. release_agent_task — Release with progress note
386
+ f. Pick next task (repeat)
387
+
388
+ 5. ANTI-PATTERNS TO AVOID:
389
+ - Two agents working on the same task (always claim first)
390
+ - Dumping thousands of lines of test output (log to file, print summary)
391
+ - Spending hours on one stuck problem (mark as blocked, move on)
392
+ - Overwriting each other's changes (commit frequently, pull before push)
393
+
394
+ KEY INSIGHT from Anthropic: When all agents get stuck on the same bug (like compiling the Linux kernel),
395
+ use oracle-based testing to split the problem into independent sub-problems that each agent can solve in parallel.
396
+
350
397
  For the full methodology: call getMethodology("parallel_agent_teams")`,
351
398
  },
352
399
  },
@@ -373,45 +420,45 @@ For the full methodology: call getMethodology("parallel_agent_teams")`,
373
420
  role: "user",
374
421
  content: {
375
422
  type: "text",
376
- text: `Set up oracle-based testing for: ${args.componentName}
377
- Oracle source: ${args.oracleSource}
378
-
379
- This follows the pattern from Anthropic's C Compiler project where GCC served as a
380
- "known-good compiler oracle" to identify which specific files were broken.
381
-
382
- SETUP:
383
-
384
- 1. DEFINE ORACLE — Capture known-good reference outputs:
385
- Run the reference implementation (${args.oracleSource}) on each test input.
386
- Save outputs as golden files or capture them in the oracle comparison tool.
387
-
388
- 2. RUN COMPARISONS — For each test case:
389
- call run_oracle_comparison({
390
- testLabel: "${args.componentName}_test_1",
391
- actualOutput: "<your implementation's output>",
392
- expectedOutput: "<oracle's output>",
393
- oracleSource: "${args.oracleSource}"
394
- })
395
-
396
- 3. TRIAGE FAILURES — Review diff summaries:
397
- Each failing comparison is an independent work item.
398
- Assign each to a different parallel agent via claim_agent_task.
399
-
400
- 4. BINARY SEARCH (for complex failures):
401
- If a test passes individually but fails when combined with others,
402
- use delta debugging: split the test set in half, test each half,
403
- narrow down to the minimal failing combination.
404
- (This is how Anthropic found pairs of files that failed together but worked independently.)
405
-
406
- 5. TRACK PROGRESS — Monitor convergence:
407
- call get_parallel_status to see how many oracle tests are still failing.
408
- As agents fix failures, the match percentage should trend toward 100%.
409
-
410
- CONTEXT BUDGET TIP: Large test outputs pollute context. Instead of printing full output,
411
- call log_context_budget to track usage and only show diff summaries (first 20 differing lines).
412
-
413
- After all oracle tests pass:
414
- call record_learning with patterns discovered
423
+ text: `Set up oracle-based testing for: ${args.componentName}
424
+ Oracle source: ${args.oracleSource}
425
+
426
+ This follows the pattern from Anthropic's C Compiler project where GCC served as a
427
+ "known-good compiler oracle" to identify which specific files were broken.
428
+
429
+ SETUP:
430
+
431
+ 1. DEFINE ORACLE — Capture known-good reference outputs:
432
+ Run the reference implementation (${args.oracleSource}) on each test input.
433
+ Save outputs as golden files or capture them in the oracle comparison tool.
434
+
435
+ 2. RUN COMPARISONS — For each test case:
436
+ call run_oracle_comparison({
437
+ testLabel: "${args.componentName}_test_1",
438
+ actualOutput: "<your implementation's output>",
439
+ expectedOutput: "<oracle's output>",
440
+ oracleSource: "${args.oracleSource}"
441
+ })
442
+
443
+ 3. TRIAGE FAILURES — Review diff summaries:
444
+ Each failing comparison is an independent work item.
445
+ Assign each to a different parallel agent via claim_agent_task.
446
+
447
+ 4. BINARY SEARCH (for complex failures):
448
+ If a test passes individually but fails when combined with others,
449
+ use delta debugging: split the test set in half, test each half,
450
+ narrow down to the minimal failing combination.
451
+ (This is how Anthropic found pairs of files that failed together but worked independently.)
452
+
453
+ 5. TRACK PROGRESS — Monitor convergence:
454
+ call get_parallel_status to see how many oracle tests are still failing.
455
+ As agents fix failures, the match percentage should trend toward 100%.
456
+
457
+ CONTEXT BUDGET TIP: Large test outputs pollute context. Instead of printing full output,
458
+ call log_context_budget to track usage and only show diff summaries (first 20 differing lines).
459
+
460
+ After all oracle tests pass:
461
+ call record_learning with patterns discovered
415
462
  call run_mandatory_flywheel to verify the full change`,
416
463
  },
417
464
  },
@@ -439,67 +486,67 @@ After all oracle tests pass:
439
486
  role: "user",
440
487
  content: {
441
488
  type: "text",
442
- text: `You are coordinating ${count} parallel Claude Code subagents for: ${args.taskDescription}
443
-
444
- ## How This Works
445
-
446
- Claude Code's Task tool spawns subagents — each is an independent Claude instance with its own
447
- context window. NodeBench MCP tools coordinate them via a shared SQLite database.
448
-
449
- **Your role: COORDINATOR.** You break work into independent tasks and spawn subagents.
450
- **Subagent role: WORKER.** Each claims a task, does work, releases with a progress note.
451
-
452
- ## Step-by-Step
453
-
454
- ### 1. PLAN — Break work into ${count} independent tasks
455
- Identify ${count} pieces of work that can run in parallel without dependencies.
456
- Each task should be independently completable and testable.
457
-
458
- ### 2. SPAWN — Launch subagents with coordination instructions
459
- For each task, use the Task tool:
460
-
461
- \`\`\`
462
- Task tool call:
463
- prompt: "You have access to NodeBench MCP. Do the following:
464
- 1. Call claim_agent_task({ taskKey: '<task_key>', description: '<what to do>' })
465
- 2. Call assign_agent_role({ role: 'implementer', focusArea: '<area>' })
466
- 3. Do the work
467
- 4. Call log_context_budget({ eventType: 'checkpoint', tokensUsed: <estimate> })
468
- 5. Call release_agent_task({ taskKey: '<task_key>', status: 'completed', progressNote: '<summary>' })
469
- 6. Call record_learning({ key: '<key>', content: '<what you learned>', category: 'pattern' })"
470
- \`\`\`
471
-
472
- ### 3. MONITOR — Check progress
473
- After spawning all subagents:
474
- call get_parallel_status({ includeHistory: true })
475
- call list_agent_tasks({ status: "all" })
476
-
477
- ### 4. VALIDATE — Run oracle comparisons if applicable
478
- If subagents produced outputs that should match a reference:
479
- call run_oracle_comparison for each output
480
-
481
- ### 5. GATE — Quality check the aggregate result
482
- call run_quality_gate with rules covering all ${count} tasks
483
- call run_mandatory_flywheel to verify the combined change
484
-
485
- ## Concrete IMPACT of This Workflow
486
-
487
- | What NodeBench Adds | Without It (bare subagents) |
488
- |---------------------------------|---------------------------------------|
489
- | Task locks prevent duplicate work | Two subagents might fix the same bug |
490
- | Role specialization | All subagents do everything |
491
- | Context budget tracking | Subagent runs out of context silently |
492
- | Oracle comparisons | No reference-based validation |
493
- | Progress notes for handoff | Next session starts from scratch |
494
- | Learnings persisted | Knowledge lost when subagent exits |
495
- | Quality gate on aggregate | No validation that pieces fit together |
496
-
497
- ## Anti-Patterns
498
- - DO NOT spawn subagents for work that has dependencies (sequential steps)
499
- - DO NOT skip claim_agent_task — without it, two subagents may duplicate effort
500
- - DO NOT dump large outputs into subagent context — use log_context_budget to track
501
- - DO NOT forget release_agent_task — orphaned claims block future sessions
502
-
489
+ text: `You are coordinating ${count} parallel Claude Code subagents for: ${args.taskDescription}
490
+
491
+ ## How This Works
492
+
493
+ Claude Code's Task tool spawns subagents — each is an independent Claude instance with its own
494
+ context window. NodeBench MCP tools coordinate them via a shared SQLite database.
495
+
496
+ **Your role: COORDINATOR.** You break work into independent tasks and spawn subagents.
497
+ **Subagent role: WORKER.** Each claims a task, does work, releases with a progress note.
498
+
499
+ ## Step-by-Step
500
+
501
+ ### 1. PLAN — Break work into ${count} independent tasks
502
+ Identify ${count} pieces of work that can run in parallel without dependencies.
503
+ Each task should be independently completable and testable.
504
+
505
+ ### 2. SPAWN — Launch subagents with coordination instructions
506
+ For each task, use the Task tool:
507
+
508
+ \`\`\`
509
+ Task tool call:
510
+ prompt: "You have access to NodeBench MCP. Do the following:
511
+ 1. Call claim_agent_task({ taskKey: '<task_key>', description: '<what to do>' })
512
+ 2. Call assign_agent_role({ role: 'implementer', focusArea: '<area>' })
513
+ 3. Do the work
514
+ 4. Call log_context_budget({ eventType: 'checkpoint', tokensUsed: <estimate> })
515
+ 5. Call release_agent_task({ taskKey: '<task_key>', status: 'completed', progressNote: '<summary>' })
516
+ 6. Call record_learning({ key: '<key>', content: '<what you learned>', category: 'pattern' })"
517
+ \`\`\`
518
+
519
+ ### 3. MONITOR — Check progress
520
+ After spawning all subagents:
521
+ call get_parallel_status({ includeHistory: true })
522
+ call list_agent_tasks({ status: "all" })
523
+
524
+ ### 4. VALIDATE — Run oracle comparisons if applicable
525
+ If subagents produced outputs that should match a reference:
526
+ call run_oracle_comparison for each output
527
+
528
+ ### 5. GATE — Quality check the aggregate result
529
+ call run_quality_gate with rules covering all ${count} tasks
530
+ call run_mandatory_flywheel to verify the combined change
531
+
532
+ ## Concrete IMPACT of This Workflow
533
+
534
+ | What NodeBench Adds | Without It (bare subagents) |
535
+ |---------------------------------|---------------------------------------|
536
+ | Task locks prevent duplicate work | Two subagents might fix the same bug |
537
+ | Role specialization | All subagents do everything |
538
+ | Context budget tracking | Subagent runs out of context silently |
539
+ | Oracle comparisons | No reference-based validation |
540
+ | Progress notes for handoff | Next session starts from scratch |
541
+ | Learnings persisted | Knowledge lost when subagent exits |
542
+ | Quality gate on aggregate | No validation that pieces fit together |
543
+
544
+ ## Anti-Patterns
545
+ - DO NOT spawn subagents for work that has dependencies (sequential steps)
546
+ - DO NOT skip claim_agent_task — without it, two subagents may duplicate effort
547
+ - DO NOT dump large outputs into subagent context — use log_context_budget to track
548
+ - DO NOT forget release_agent_task — orphaned claims block future sessions
549
+
503
550
  For the full parallel agent methodology: call getMethodology("parallel_agent_teams")`,
504
551
  },
505
552
  },
@@ -526,72 +573,72 @@ For the full parallel agent methodology: call getMethodology("parallel_agent_tea
526
573
  role: "user",
527
574
  content: {
528
575
  type: "text",
529
- text: `Bootstrap parallel agent infrastructure for: ${args.projectPath}
530
- ${args.techStack ? `Tech stack: ${args.techStack}` : ""}
531
-
532
- This follows the AI Flywheel closed loop: detect → scaffold → verify → fix → document.
533
-
534
- STEP 1 — DETECT (dry run first):
535
- call bootstrap_parallel_agents({
536
- projectRoot: "${args.projectPath}",
537
- dryRun: true,
538
- ${args.techStack ? `techStack: "${args.techStack}",` : ""}
539
- includeAgentsMd: true
540
- })
541
-
542
- Review the gap report. It scans 7 categories:
543
- - Task coordination (lock files, claim directories)
544
- - Role specialization (role configs, AGENTS.md mentions)
545
- - Oracle testing (golden files, reference outputs, snapshots)
546
- - Context budget tracking (budget configs, AGENTS.md mentions)
547
- - Progress files (PROGRESS.md, STATUS.md, claude-progress.txt)
548
- - AGENTS.md parallel section (parallel agent coordination protocol)
549
- - Git worktrees (for true parallel work)
550
-
551
- STEP 2 — SCAFFOLD (create files):
552
- If gaps found, run with dryRun=false:
553
- call bootstrap_parallel_agents({
554
- projectRoot: "${args.projectPath}",
555
- dryRun: false,
556
- ${args.techStack ? `techStack: "${args.techStack}",` : ""}
557
- includeAgentsMd: true
558
- })
559
-
560
- This creates:
561
- - .parallel-agents/ directory with README, current_tasks/, oracle/, roles.json
562
- - progress.md template for agent orientation
563
- - AGENTS.md parallel section (or .parallel-append file for existing AGENTS.md)
564
-
565
- STEP 3 — GENERATE AGENTS.MD (if needed):
566
- call generate_parallel_agents_md({
567
- ${args.techStack ? `techStack: "${args.techStack}",` : ""}
568
- projectName: "${args.projectPath.split("/").pop() || "project"}",
569
- maxAgents: 4,
570
- includeNodebenchSetup: true
571
- })
572
-
573
- Copy the output into the target repo's AGENTS.md.
574
-
575
- STEP 4 — VERIFY (6-step flywheel):
576
- The bootstrap tool returns a flywheelPlan. Execute each step:
577
- 1. Static analysis — verify scaffold files don't conflict
578
- 2. Happy path — claim task → work → release → progress.md updated
579
- 3. Conflict test — two claims on same task → second gets conflict
580
- 4. Oracle test — create golden file → diff catches changes
581
- 5. Gap re-scan — re-run bootstrap with dryRun=true → all gaps filled
582
- 6. Document — record_learning with patterns discovered
583
-
584
- STEP 5 — FIX (if anything fails):
585
- Fix the issue, then re-run from Step 4.
586
-
587
- STEP 6 — DOCUMENT:
588
- call record_learning({
589
- key: "bootstrap_parallel_${args.projectPath.split("/").pop() || "project"}",
590
- content: "Bootstrapped parallel agent infrastructure for ${args.projectPath}. <summary of what was created and any issues found>",
591
- category: "pattern",
592
- tags: ["parallel-agents", "bootstrap", "external-repo"]
593
- })
594
-
576
+ text: `Bootstrap parallel agent infrastructure for: ${args.projectPath}
577
+ ${args.techStack ? `Tech stack: ${args.techStack}` : ""}
578
+
579
+ This follows the AI Flywheel closed loop: detect → scaffold → verify → fix → document.
580
+
581
+ STEP 1 — DETECT (dry run first):
582
+ call bootstrap_parallel_agents({
583
+ projectRoot: "${args.projectPath}",
584
+ dryRun: true,
585
+ ${args.techStack ? `techStack: "${args.techStack}",` : ""}
586
+ includeAgentsMd: true
587
+ })
588
+
589
+ Review the gap report. It scans 7 categories:
590
+ - Task coordination (lock files, claim directories)
591
+ - Role specialization (role configs, AGENTS.md mentions)
592
+ - Oracle testing (golden files, reference outputs, snapshots)
593
+ - Context budget tracking (budget configs, AGENTS.md mentions)
594
+ - Progress files (PROGRESS.md, STATUS.md, claude-progress.txt)
595
+ - AGENTS.md parallel section (parallel agent coordination protocol)
596
+ - Git worktrees (for true parallel work)
597
+
598
+ STEP 2 — SCAFFOLD (create files):
599
+ If gaps found, run with dryRun=false:
600
+ call bootstrap_parallel_agents({
601
+ projectRoot: "${args.projectPath}",
602
+ dryRun: false,
603
+ ${args.techStack ? `techStack: "${args.techStack}",` : ""}
604
+ includeAgentsMd: true
605
+ })
606
+
607
+ This creates:
608
+ - .parallel-agents/ directory with README, current_tasks/, oracle/, roles.json
609
+ - progress.md template for agent orientation
610
+ - AGENTS.md parallel section (or .parallel-append file for existing AGENTS.md)
611
+
612
+ STEP 3 — GENERATE AGENTS.MD (if needed):
613
+ call generate_parallel_agents_md({
614
+ ${args.techStack ? `techStack: "${args.techStack}",` : ""}
615
+ projectName: "${args.projectPath.split("/").pop() || "project"}",
616
+ maxAgents: 4,
617
+ includeNodebenchSetup: true
618
+ })
619
+
620
+ Copy the output into the target repo's AGENTS.md.
621
+
622
+ STEP 4 — VERIFY (6-step flywheel):
623
+ The bootstrap tool returns a flywheelPlan. Execute each step:
624
+ 1. Static analysis — verify scaffold files don't conflict
625
+ 2. Happy path — claim task → work → release → progress.md updated
626
+ 3. Conflict test — two claims on same task → second gets conflict
627
+ 4. Oracle test — create golden file → diff catches changes
628
+ 5. Gap re-scan — re-run bootstrap with dryRun=true → all gaps filled
629
+ 6. Document — record_learning with patterns discovered
630
+
631
+ STEP 5 — FIX (if anything fails):
632
+ Fix the issue, then re-run from Step 4.
633
+
634
+ STEP 6 — DOCUMENT:
635
+ call record_learning({
636
+ key: "bootstrap_parallel_${args.projectPath.split("/").pop() || "project"}",
637
+ content: "Bootstrapped parallel agent infrastructure for ${args.projectPath}. <summary of what was created and any issues found>",
638
+ category: "pattern",
639
+ tags: ["parallel-agents", "bootstrap", "external-repo"]
640
+ })
641
+
595
642
  For the full methodology: call getMethodology("parallel_agent_teams")`,
596
643
  },
597
644
  },
@@ -605,60 +652,89 @@ For the full methodology: call getMethodology("parallel_agent_teams")`,
605
652
  role: "user",
606
653
  content: {
607
654
  type: "text",
608
- text: `## NodeBench MCP Agent Contract
609
-
610
- You are connected to NodeBench MCP. Follow these rules EXACTLY.
611
-
612
- ### FRONT DOOR — Always start here (before writing any code)
613
- 1. search_all_knowledge("<your current task>") — Check if this was solved before
614
- 2. getMethodology("mandatory_flywheel") — Load the verification pipeline
615
- 3. discover_tools("<your task>", { explain: true }) — Find the right tools for this job
616
- 4. get_workflow_chain("<workflow>") — Get step-by-step sequence (fix_bug, new_feature, etc.)
617
-
618
- ### SELF-SETUP — If a capability is missing
619
- When discover_tools returns nothing useful, or a tool says "not configured":
620
- 1. Escalate toolset: If started with --preset lite, switch to --preset core or targeted --toolsets
621
- 2. Resolve providers: Configure missing API keys (GEMINI_API_KEY, OPENAI_API_KEY, etc.)
622
- 3. Bootstrap infra: Run scaffold_nodebench_project or bootstrap_parallel_agents if repo lacks infra
623
- 4. Smoke-test: Re-run the first workflow chain step to confirm the capability is available
624
-
625
- ### BEFORE IMPLEMENTATION
626
- - run_recon + log_recon_finding (if reconnaissance applies)
627
- - assess_risk (HIGH risk = must get confirmation before proceeding)
628
-
629
- ### PARALLEL WORK
630
- - MUST claim_agent_task before editing or designing anything
631
- - MUST release_agent_task with a progress note + next action when done
632
- - MUST log_context_budget to track context usage and avoid pollution
633
-
634
- ### BEFORE SHIP
635
- - 3-layer tests logged (unit + integration + e2e via log_test_result)
636
- - Eval run recorded (promote_to_eval)
637
- - Quality gate passed (run_quality_gate)
638
- - Mandatory flywheel completed (run_mandatory_flywheel — all 6 steps)
639
- - Learning banked (record_learning)
640
-
641
- ### COORDINATOR SPAWN TEMPLATE
642
- When spawning subagents, give each this instruction block:
643
- "You have NodeBench MCP. Before any work:
644
- 1. search_all_knowledge('<task>')
645
- 2. claim_agent_task({ taskKey: '<key>', description: '<desc>' })
646
- 3. assign_agent_role({ role: '<role>', focusArea: '<area>' })
647
- Do the work, then:
648
- 4. log_context_budget({ eventType: 'checkpoint' })
649
- 5. release_agent_task({ taskKey: '<key>', status: 'completed', progressNote: '<summary>' })
650
- 6. record_learning({ key: '<key>', content: '<what you learned>', category: 'pattern' })"
651
-
652
- ### WHY THIS MATTERS
653
- Without this contract, agents skip verification, repeat past mistakes, overwrite each other's
654
- work, and ship bugs that were already caught. NodeBench MCP turns coordination into concrete
655
+ text: `## NodeBench MCP Agent Contract
656
+
657
+ You are connected to NodeBench MCP. Follow these rules EXACTLY.
658
+
659
+ ### FRONT DOOR — Always start here (before writing any code)
660
+ 1. search_all_knowledge("<your current task>") — Check if this was solved before
661
+ 2. getMethodology("mandatory_flywheel") — Load the verification pipeline
662
+ 3. discover_tools("<your task>", { explain: true }) — Find the right tools for this job
663
+ 4. get_workflow_chain("<workflow>") — Get step-by-step sequence (fix_bug, new_feature, etc.)
664
+
665
+ ### SELF-SETUP — If a capability is missing
666
+ When discover_tools returns nothing useful, or a tool says "not configured":
667
+ 1. Escalate toolset: If started with --preset lite, switch to --preset core or targeted --toolsets
668
+ 2. Resolve providers: Configure missing API keys (GEMINI_API_KEY, OPENAI_API_KEY, etc.)
669
+ 3. Bootstrap infra: Run scaffold_nodebench_project or bootstrap_parallel_agents if repo lacks infra
670
+ 4. Smoke-test: Re-run the first workflow chain step to confirm the capability is available
671
+
672
+ ### BEFORE IMPLEMENTATION
673
+ - run_recon + log_recon_finding (if reconnaissance applies)
674
+ - assess_risk (HIGH risk = must get confirmation before proceeding)
675
+
676
+ ### PARALLEL WORK
677
+ - MUST claim_agent_task before editing or designing anything
678
+ - MUST release_agent_task with a progress note + next action when done
679
+ - MUST log_context_budget to track context usage and avoid pollution
680
+
681
+ ### BEFORE SHIP
682
+ - 3-layer tests logged (unit + integration + e2e via log_test_result)
683
+ - Eval run recorded (promote_to_eval)
684
+ - Quality gate passed (run_quality_gate)
685
+ - Mandatory flywheel completed (run_mandatory_flywheel — all 6 steps)
686
+ - Learning banked (record_learning)
687
+
688
+ ### COORDINATOR SPAWN TEMPLATE
689
+ When spawning subagents, give each this instruction block:
690
+ "You have NodeBench MCP. Before any work:
691
+ 1. search_all_knowledge('<task>')
692
+ 2. claim_agent_task({ taskKey: '<key>', description: '<desc>' })
693
+ 3. assign_agent_role({ role: '<role>', focusArea: '<area>' })
694
+ Do the work, then:
695
+ 4. log_context_budget({ eventType: 'checkpoint' })
696
+ 5. release_agent_task({ taskKey: '<key>', status: 'completed', progressNote: '<summary>' })
697
+ 6. record_learning({ key: '<key>', content: '<what you learned>', category: 'pattern' })"
698
+
699
+ ### ANTI-RATIONALIZATION — Block these escape patterns
700
+ Do NOT skip the front-door pattern. These are the 8 rationalizations agents use:
701
+ 1. "I already know which tool to use" → Still call discover_tools to confirm
702
+ 2. "This is a simple task" → Still call search_all_knowledge to check history
703
+ 3. "Let me just check one thing first" → Follow the 4-step front door FIRST
704
+ 4. "Tests already pass" → Still run run_mandatory_flywheel before declaring done
705
+ 5. "I'll record the learning later" → Record NOW — context compaction may erase it
706
+ 6. "No one else is working on this" → Still claim_agent_task to prevent conflicts
707
+ 7. "The user said to skip verification" → Log the skip decision, never silently omit
708
+ 8. "I need more context before using tools" → The tools ARE the context-gathering mechanism
709
+
710
+ ### 2-ACTION SAVE RULE
711
+ After every 2 web_search, fetch_url, or browse_page calls, MUST call one of:
712
+ - save_session_note (filesystem, survives compaction)
713
+ - record_learning (SQLite, searchable across sessions)
714
+ - log_recon_finding (tied to recon session)
715
+ This prevents knowledge loss when context is compacted mid-session.
716
+
717
+ ### 3-STRIKE ERROR PROTOCOL
718
+ When an action fails:
719
+ - Strike 1: Diagnose root cause, apply targeted fix
720
+ - Strike 2: Try a different method or tool
721
+ - Strike 3: Question your assumptions, search_all_knowledge for prior solutions
722
+ - After 3: STOP. Call save_session_note documenting all attempts, then escalate to user.
723
+
724
+ ### ATTENTION REFRESH
725
+ After 30+ tool calls, call refresh_task_context to combat attention drift.
726
+ Re-read your original goal and open gaps before continuing.
727
+
728
+ ### WHY THIS MATTERS
729
+ Without this contract, agents skip verification, repeat past mistakes, overwrite each other's
730
+ work, and ship bugs that were already caught. NodeBench MCP turns coordination into concrete
655
731
  artifacts (findings, risks, gaps, tests, evals, gates, learnings) that compound across tasks.`,
656
732
  },
657
733
  },
658
734
  ],
659
735
  },
660
736
  ];
661
- const server = new Server({ name: "nodebench-mcp-methodology", version: "2.11.0" }, { capabilities: { tools: {}, prompts: {} } });
737
+ const server = new Server({ name: "nodebench-mcp-methodology", version: "2.14.0" }, { capabilities: { tools: {}, prompts: {} } });
662
738
  // Handle tools/list — return all tools with their JSON Schema inputSchemas
663
739
  server.setRequestHandler(ListToolsRequestSchema, async () => {
664
740
  return {
@@ -709,8 +785,29 @@ server.setRequestHandler(CallToolRequestSchema, async (request) => {
709
785
  enrichedResult = { ...result, _quickRef: quickRef };
710
786
  }
711
787
  }
788
+ // Lightweight hook: append save/refresh hints when thresholds are met
789
+ const hookHint = getHookHint(name);
790
+ // Serialize: TOON (~40% fewer tokens) or JSON
791
+ let serialized;
792
+ if (useToon) {
793
+ try {
794
+ serialized = toonEncode(enrichedResult);
795
+ }
796
+ catch {
797
+ serialized = JSON.stringify(enrichedResult, null, 2);
798
+ }
799
+ }
800
+ else {
801
+ serialized = JSON.stringify(enrichedResult, null, 2);
802
+ }
803
+ const contentBlocks = [
804
+ { type: "text", text: serialized },
805
+ ];
806
+ if (hookHint) {
807
+ contentBlocks.push({ type: "text", text: hookHint });
808
+ }
712
809
  return {
713
- content: [{ type: "text", text: JSON.stringify(enrichedResult, null, 2) }],
810
+ content: contentBlocks,
714
811
  isError: false,
715
812
  };
716
813
  }