nodebench-mcp 2.15.0 → 2.18.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (77) hide show
  1. package/LICENSE +21 -0
  2. package/NODEBENCH_AGENTS.md +2 -2
  3. package/README.md +514 -82
  4. package/dist/__tests__/analytics.test.d.ts +11 -0
  5. package/dist/__tests__/analytics.test.js +546 -0
  6. package/dist/__tests__/analytics.test.js.map +1 -0
  7. package/dist/__tests__/architectComplex.test.d.ts +1 -0
  8. package/dist/__tests__/architectComplex.test.js +375 -0
  9. package/dist/__tests__/architectComplex.test.js.map +1 -0
  10. package/dist/__tests__/architectSmoke.test.d.ts +1 -0
  11. package/dist/__tests__/architectSmoke.test.js +92 -0
  12. package/dist/__tests__/architectSmoke.test.js.map +1 -0
  13. package/dist/__tests__/dynamicLoading.test.d.ts +1 -0
  14. package/dist/__tests__/dynamicLoading.test.js +278 -0
  15. package/dist/__tests__/dynamicLoading.test.js.map +1 -0
  16. package/dist/__tests__/evalHarness.test.js +7 -2
  17. package/dist/__tests__/evalHarness.test.js.map +1 -1
  18. package/dist/__tests__/gaiaCapabilityEval.test.js +229 -12
  19. package/dist/__tests__/gaiaCapabilityEval.test.js.map +1 -1
  20. package/dist/__tests__/gaiaCapabilityMediaEval.test.js +194 -109
  21. package/dist/__tests__/gaiaCapabilityMediaEval.test.js.map +1 -1
  22. package/dist/__tests__/helpers/answerMatch.js +22 -22
  23. package/dist/__tests__/presetRealWorldBench.test.js +11 -2
  24. package/dist/__tests__/presetRealWorldBench.test.js.map +1 -1
  25. package/dist/__tests__/tools.test.js +10 -4
  26. package/dist/__tests__/tools.test.js.map +1 -1
  27. package/dist/__tests__/toolsetGatingEval.test.js +12 -4
  28. package/dist/__tests__/toolsetGatingEval.test.js.map +1 -1
  29. package/dist/analytics/index.d.ts +10 -0
  30. package/dist/analytics/index.js +11 -0
  31. package/dist/analytics/index.js.map +1 -0
  32. package/dist/analytics/projectDetector.d.ts +19 -0
  33. package/dist/analytics/projectDetector.js +259 -0
  34. package/dist/analytics/projectDetector.js.map +1 -0
  35. package/dist/analytics/schema.d.ts +57 -0
  36. package/dist/analytics/schema.js +157 -0
  37. package/dist/analytics/schema.js.map +1 -0
  38. package/dist/analytics/smartPreset.d.ts +63 -0
  39. package/dist/analytics/smartPreset.js +300 -0
  40. package/dist/analytics/smartPreset.js.map +1 -0
  41. package/dist/analytics/toolTracker.d.ts +59 -0
  42. package/dist/analytics/toolTracker.js +163 -0
  43. package/dist/analytics/toolTracker.js.map +1 -0
  44. package/dist/analytics/usageStats.d.ts +64 -0
  45. package/dist/analytics/usageStats.js +252 -0
  46. package/dist/analytics/usageStats.js.map +1 -0
  47. package/dist/db.js +359 -321
  48. package/dist/db.js.map +1 -1
  49. package/dist/index.d.ts +2 -1
  50. package/dist/index.js +653 -84
  51. package/dist/index.js.map +1 -1
  52. package/dist/tools/architectTools.d.ts +15 -0
  53. package/dist/tools/architectTools.js +304 -0
  54. package/dist/tools/architectTools.js.map +1 -0
  55. package/dist/tools/critterTools.js +14 -14
  56. package/dist/tools/emailTools.d.ts +15 -0
  57. package/dist/tools/emailTools.js +664 -0
  58. package/dist/tools/emailTools.js.map +1 -0
  59. package/dist/tools/metaTools.js +660 -0
  60. package/dist/tools/metaTools.js.map +1 -1
  61. package/dist/tools/parallelAgentTools.js +176 -176
  62. package/dist/tools/patternTools.js +11 -11
  63. package/dist/tools/progressiveDiscoveryTools.d.ts +5 -1
  64. package/dist/tools/progressiveDiscoveryTools.js +113 -21
  65. package/dist/tools/progressiveDiscoveryTools.js.map +1 -1
  66. package/dist/tools/researchWritingTools.js +42 -42
  67. package/dist/tools/rssTools.d.ts +8 -0
  68. package/dist/tools/rssTools.js +833 -0
  69. package/dist/tools/rssTools.js.map +1 -0
  70. package/dist/tools/toolRegistry.d.ts +17 -0
  71. package/dist/tools/toolRegistry.js +236 -17
  72. package/dist/tools/toolRegistry.js.map +1 -1
  73. package/dist/tools/voiceBridgeTools.js +498 -498
  74. package/dist/toolsetRegistry.d.ts +10 -0
  75. package/dist/toolsetRegistry.js +84 -0
  76. package/dist/toolsetRegistry.js.map +1 -0
  77. package/package.json +12 -5
package/dist/index.js CHANGED
@@ -20,39 +20,14 @@ import { Server } from "@modelcontextprotocol/sdk/server/index.js";
20
20
  import { StdioServerTransport } from "@modelcontextprotocol/sdk/server/stdio.js";
21
21
  import { ListToolsRequestSchema, CallToolRequestSchema, ListPromptsRequestSchema, GetPromptRequestSchema, } from "@modelcontextprotocol/sdk/types.js";
22
22
  import { getDb, genId } from "./db.js";
23
- import { verificationTools } from "./tools/verificationTools.js";
24
- import { evalTools } from "./tools/evalTools.js";
25
- import { qualityGateTools } from "./tools/qualityGateTools.js";
26
- import { learningTools } from "./tools/learningTools.js";
27
- import { flywheelTools } from "./tools/flywheelTools.js";
28
- import { reconTools } from "./tools/reconTools.js";
29
- import { uiCaptureTools } from "./tools/uiCaptureTools.js";
30
- import { visionTools } from "./tools/visionTools.js";
31
- import { webTools } from "./tools/webTools.js";
32
- import { githubTools } from "./tools/githubTools.js";
33
- import { documentationTools } from "./tools/documentationTools.js";
34
- import { agentBootstrapTools } from "./tools/agentBootstrapTools.js";
35
- import { selfEvalTools } from "./tools/selfEvalTools.js";
36
- import { parallelAgentTools } from "./tools/parallelAgentTools.js";
37
- import { llmTools } from "./tools/llmTools.js";
38
- import { securityTools } from "./tools/securityTools.js";
39
- import { platformTools } from "./tools/platformTools.js";
40
- import { researchWritingTools } from "./tools/researchWritingTools.js";
41
- import { flickerDetectionTools } from "./tools/flickerDetectionTools.js";
42
- import { figmaFlowTools } from "./tools/figmaFlowTools.js";
23
+ import { getAnalyticsDb, closeAnalyticsDb, clearOldRecords } from "./analytics/index.js";
24
+ import { AnalyticsTracker } from "./analytics/toolTracker.js";
25
+ import { generateSmartPreset, formatPresetRecommendation, listPresets } from "./analytics/index.js";
26
+ import { getProjectUsageSummary, exportUsageStats, formatStatsDisplay } from "./analytics/index.js";
27
+ import { TOOLSET_MAP, TOOL_TO_TOOLSET } from "./toolsetRegistry.js";
43
28
  import { createMetaTools } from "./tools/metaTools.js";
44
- import { localFileTools, gaiaMediaSolvers } from "./tools/localFileTools.js";
45
29
  import { createProgressiveDiscoveryTools } from "./tools/progressiveDiscoveryTools.js";
46
- import { boilerplateTools } from "./tools/boilerplateTools.js";
47
- import { cCompilerBenchmarkTools } from "./tools/cCompilerBenchmarkTools.js";
48
- import { sessionMemoryTools } from "./tools/sessionMemoryTools.js";
49
- import { patternTools } from "./tools/patternTools.js";
50
- import { gitWorkflowTools } from "./tools/gitWorkflowTools.js";
51
- import { seoTools } from "./tools/seoTools.js";
52
- import { voiceBridgeTools } from "./tools/voiceBridgeTools.js";
53
- import { critterTools } from "./tools/critterTools.js";
54
- import { getQuickRef, ALL_REGISTRY_ENTRIES, TOOL_REGISTRY, getToolComplexity, _setDbAccessor } from "./tools/toolRegistry.js";
55
- import { toonTools } from "./tools/toonTools.js";
30
+ import { getQuickRef, ALL_REGISTRY_ENTRIES, TOOL_REGISTRY, getToolComplexity, _setDbAccessor, hybridSearch } from "./tools/toolRegistry.js";
56
31
  // TOON format — ~40% token savings on tool responses
57
32
  import { encode as toonEncode } from "@toon-format/toon";
58
33
  // Embedding provider — neural semantic search
@@ -61,56 +36,55 @@ import { initEmbeddingIndex } from "./tools/embeddingProvider.js";
61
36
  const cliArgs = process.argv.slice(2);
62
37
  const useToon = !cliArgs.includes("--no-toon");
63
38
  const useEmbedding = !cliArgs.includes("--no-embedding");
64
- const TOOLSET_MAP = {
65
- verification: verificationTools,
66
- eval: evalTools,
67
- quality_gate: qualityGateTools,
68
- learning: learningTools,
69
- flywheel: flywheelTools,
70
- recon: reconTools,
71
- ui_capture: uiCaptureTools,
72
- vision: visionTools,
73
- local_file: localFileTools,
74
- web: webTools,
75
- github: githubTools,
76
- docs: documentationTools,
77
- bootstrap: agentBootstrapTools,
78
- self_eval: selfEvalTools,
79
- parallel: parallelAgentTools,
80
- llm: llmTools,
81
- security: securityTools,
82
- platform: platformTools,
83
- research_writing: researchWritingTools,
84
- flicker_detection: flickerDetectionTools,
85
- figma_flow: figmaFlowTools,
86
- boilerplate: boilerplateTools,
87
- benchmark: cCompilerBenchmarkTools,
88
- session_memory: sessionMemoryTools,
89
- gaia_solvers: gaiaMediaSolvers,
90
- toon: toonTools,
91
- pattern: patternTools,
92
- git_workflow: gitWorkflowTools,
93
- seo: seoTools,
94
- voice_bridge: voiceBridgeTools,
95
- critter: critterTools,
96
- };
39
+ const useSmartPreset = cliArgs.includes("--smart-preset");
40
+ const showStats = cliArgs.includes("--stats");
41
+ const exportStats = cliArgs.includes("--export-stats");
42
+ const resetStats = cliArgs.includes("--reset-stats");
43
+ const listPresetsFlag = cliArgs.includes("--list-presets");
44
+ export { TOOLSET_MAP };
45
+ const DEFAULT_TOOLSETS = ["verification", "eval", "quality_gate", "learning", "flywheel", "recon", "security", "boilerplate"];
97
46
  const PRESETS = {
98
- meta: [],
99
- lite: ["verification", "eval", "quality_gate", "learning", "flywheel", "recon", "security", "boilerplate"],
100
- core: ["verification", "eval", "quality_gate", "learning", "flywheel", "recon", "bootstrap", "self_eval", "llm", "security", "platform", "research_writing", "flicker_detection", "figma_flow", "boilerplate", "benchmark", "session_memory", "toon", "pattern", "git_workflow", "seo", "voice_bridge", "critter"],
47
+ default: DEFAULT_TOOLSETS,
48
+ // Themed presets bridge between default (39 tools) and full (175 tools)
49
+ web_dev: [...DEFAULT_TOOLSETS, "ui_capture", "vision", "web", "seo", "git_workflow", "architect"],
50
+ research: [...DEFAULT_TOOLSETS, "web", "llm", "rss", "email", "docs"],
51
+ data: [...DEFAULT_TOOLSETS, "local_file", "llm", "web"],
52
+ devops: [...DEFAULT_TOOLSETS, "git_workflow", "session_memory", "benchmark", "pattern"],
53
+ mobile: [...DEFAULT_TOOLSETS, "ui_capture", "vision", "flicker_detection"],
54
+ academic: [...DEFAULT_TOOLSETS, "research_writing", "llm", "web", "local_file"],
55
+ multi_agent: [...DEFAULT_TOOLSETS, "parallel", "self_eval", "session_memory", "pattern", "toon"],
56
+ content: [...DEFAULT_TOOLSETS, "llm", "critter", "email", "rss", "platform", "architect"],
101
57
  full: Object.keys(TOOLSET_MAP),
102
58
  };
59
+ const PRESET_DESCRIPTIONS = {
60
+ default: "Core AI Flywheel — verification, eval, quality gates, learning, recon",
61
+ web_dev: "Web projects — adds visual QA, SEO audit, git workflow, code architecture",
62
+ research: "Research workflows — adds web search, LLM calls, RSS feeds, email, docs",
63
+ data: "Data analysis — adds CSV/XLSX/PDF/JSON parsing, LLM extraction, web fetch",
64
+ devops: "CI/CD & ops — adds git compliance, session memory, benchmarks, pattern mining",
65
+ mobile: "Mobile apps — adds screenshot capture, vision analysis, flicker detection",
66
+ academic: "Academic papers — adds polish, review, translate, logic check, data analysis",
67
+ multi_agent: "Multi-agent teams — adds task locking, messaging, roles, oracle testing",
68
+ content: "Content & publishing — adds LLM, accountability, email, RSS, platform queue",
69
+ full: "Everything — all toolsets for maximum coverage",
70
+ };
103
71
  function parseToolsets() {
104
72
  if (cliArgs.includes("--help")) {
105
73
  const lines = [
106
- "nodebench-mcp v2.15.0 — Development Methodology MCP Server",
74
+ "nodebench-mcp v2.17.0 — Development Methodology MCP Server",
107
75
  "",
108
76
  "Usage: nodebench-mcp [options]",
109
77
  "",
110
78
  "Options:",
111
- " --toolsets <list> Comma-separated toolsets to enable (default: all)",
79
+ " --toolsets <list> Comma-separated toolsets to enable (default: default)",
112
80
  " --exclude <list> Comma-separated toolsets to exclude",
113
- " --preset <name> Use a preset: meta, lite, core, or full",
81
+ " --preset <name> Use a preset: default or full",
82
+ " --smart-preset Generate smart preset recommendation based on project type and usage history",
83
+ " --stats Show usage statistics for current project",
84
+ " --export-stats Export usage statistics to JSON",
85
+ " --reset-stats Clear all usage analytics data",
86
+ " --list-presets List all available presets with descriptions",
87
+ " --dynamic Enable dynamic toolset loading (Search+Load pattern from arxiv 2509.20386)",
114
88
  " --no-toon Disable TOON encoding (TOON is on by default for ~40% token savings)",
115
89
  " --no-embedding Disable neural embedding search (uses local HuggingFace model or API keys)",
116
90
  " --help Show this help and exit",
@@ -119,10 +93,20 @@ function parseToolsets() {
119
93
  ...Object.entries(TOOLSET_MAP).map(([k, v]) => ` ${k.padEnd(16)} ${v.length} tools`),
120
94
  "",
121
95
  "Presets:",
122
- ...Object.entries(PRESETS).map(([k, v]) => ` ${k.padEnd(16)} ${v.join(", ")}`),
96
+ ...Object.entries(PRESETS).map(([k, v]) => {
97
+ const count = v.reduce((s, ts) => s + (TOOLSET_MAP[ts]?.length ?? 0), 0) + 6;
98
+ return ` ${k.padEnd(14)} ${String(count).padStart(3)} tools ${PRESET_DESCRIPTIONS[k] ?? ''}`;
99
+ }),
123
100
  "",
124
101
  "Examples:",
125
- " npx nodebench-mcp --preset core",
102
+ " npx nodebench-mcp # Default (39 tools) - core AI Flywheel",
103
+ " npx nodebench-mcp --preset web_dev # Web development (+ vision, SEO, git)",
104
+ " npx nodebench-mcp --preset research # Research workflows (+ web, LLM, RSS, email)",
105
+ " npx nodebench-mcp --preset data # Data analysis (+ local file parsing, LLM)",
106
+ " npx nodebench-mcp --preset academic # Academic writing (+ paper tools, LLM)",
107
+ " npx nodebench-mcp --preset full # All 175 tools",
108
+ " npx nodebench-mcp --smart-preset # Get AI-powered preset recommendation",
109
+ " npx nodebench-mcp --stats # Show usage statistics",
126
110
  " npx nodebench-mcp --toolsets verification,eval,recon",
127
111
  " npx nodebench-mcp --exclude vision,ui_capture,parallel",
128
112
  "",
@@ -164,19 +148,533 @@ function parseToolsets() {
164
148
  .filter(([k]) => !excluded.has(k))
165
149
  .flatMap(([, v]) => v);
166
150
  }
167
- return Object.values(TOOLSET_MAP).flat();
151
+ // Default to default preset (39 tools - complete AI Flywheel)
152
+ return PRESETS.default.flatMap((k) => TOOLSET_MAP[k] ?? []);
153
+ }
154
+ // ── Analytics CLI flag handling ─────────────────────────────────────────
155
+ // Handle --list-presets
156
+ if (listPresetsFlag) {
157
+ const presets = listPresets(TOOLSET_MAP);
158
+ console.log(JSON.stringify(presets, null, 2));
159
+ process.exit(0);
160
+ }
161
+ // ── Analytics CLI handlers (run-and-exit) ───────────────────────────────
162
+ if (resetStats || useSmartPreset || showStats || exportStats) {
163
+ const aDb = getAnalyticsDb();
164
+ try {
165
+ if (resetStats) {
166
+ clearOldRecords(aDb, 0);
167
+ console.error("Usage analytics data cleared (tool_usage + cache). Project context and preset history preserved.");
168
+ }
169
+ else if (useSmartPreset) {
170
+ const recommendation = generateSmartPreset(aDb, TOOLSET_MAP);
171
+ console.error(formatPresetRecommendation(recommendation, TOOLSET_MAP));
172
+ }
173
+ else if (showStats) {
174
+ const summary = getProjectUsageSummary(aDb, process.cwd(), 30);
175
+ if (summary) {
176
+ console.error(formatStatsDisplay(summary, process.cwd()));
177
+ }
178
+ else {
179
+ console.error("No usage data available for this project in the last 30 days.");
180
+ }
181
+ }
182
+ else if (exportStats) {
183
+ console.log(exportUsageStats(aDb, process.cwd(), 30));
184
+ }
185
+ }
186
+ finally {
187
+ closeAnalyticsDb(aDb);
188
+ }
189
+ process.exit(0);
168
190
  }
169
191
  // Initialize DB (creates ~/.nodebench/ and schema on first run)
170
192
  getDb();
171
193
  // Wire up DB accessor for execution trace edges (avoids circular import)
172
194
  _setDbAccessor(getDb);
173
195
  // Assemble tools (filtered by --toolsets / --exclude / --preset if provided)
174
- const domainTools = parseToolsets();
196
+ let domainTools = parseToolsets();
197
+ // Determine current preset name for analytics
198
+ let currentPreset = 'default';
199
+ const presetIdx = cliArgs.indexOf("--preset");
200
+ if (presetIdx !== -1 && cliArgs[presetIdx + 1]) {
201
+ currentPreset = cliArgs[presetIdx + 1];
202
+ }
203
+ else if (cliArgs.includes("--toolsets") || cliArgs.includes("--exclude")) {
204
+ currentPreset = 'custom';
205
+ }
206
+ // Dynamic loading: --dynamic flag enables Search+Load architecture
207
+ // (arxiv 2509.20386 "Dynamic ReAct" winning pattern)
208
+ const useDynamicLoading = cliArgs.includes("--dynamic");
209
+ // Track which toolsets are currently active (mutable for dynamic loading)
210
+ const initialToolsetNames = new Set(PRESETS[currentPreset] ?? PRESETS.default);
211
+ const activeToolsets = new Set(initialToolsetNames);
212
+ // Tools to skip auto-logging (avoid infinite recursion and noise)
213
+ const SKIP_AUTO_LOG = new Set(["log_tool_call", "get_trajectory_analysis", "get_self_eval_report", "get_improvement_recommendations", "cleanup_stale_runs", "synthesize_recon_to_learnings", "load_toolset", "unload_toolset", "list_available_toolsets"]);
214
+ // Initialize analytics tracker singleton (handles DB, project context, retention cleanup)
215
+ const tracker = AnalyticsTracker.init({
216
+ projectPath: process.cwd(),
217
+ preset: currentPreset,
218
+ toolCount: domainTools.length + 6,
219
+ toolToToolset: TOOL_TO_TOOLSET,
220
+ skipTools: SKIP_AUTO_LOG,
221
+ });
175
222
  const metaTools = createMetaTools(domainTools);
176
- const allToolsWithoutDiscovery = [...domainTools, ...metaTools];
223
+ let allToolsWithoutDiscovery = [...domainTools, ...metaTools];
177
224
  // Progressive discovery tools need the full tool list for hybrid search
178
- const discoveryTools = createProgressiveDiscoveryTools(allToolsWithoutDiscovery.map((t) => ({ name: t.name, description: t.description })));
179
- const allTools = [...allToolsWithoutDiscovery, ...discoveryTools];
225
+ // Pass dynamic loading callbacks so discover_tools can suggest load_toolset for unloaded toolsets
226
+ const discoveryTools = createProgressiveDiscoveryTools(allToolsWithoutDiscovery.map((t) => ({ name: t.name, description: t.description })), {
227
+ getLoadedToolNames: () => new Set(allTools.map(t => t.name)),
228
+ getToolToToolset: () => TOOL_TO_TOOLSET,
229
+ });
230
+ // ── Dynamic Loading Tools (Search+Load pattern) ────────────────────────
231
+ // Based on Dynamic ReAct (arxiv 2509.20386) — the winning architecture.
232
+ // Agent starts with default preset, discovers tools via discover_tools,
233
+ // then calls load_toolset to activate them. Server sends
234
+ // notifications/tools/list_changed so the client re-fetches the tool list.
235
+ const dynamicLoadingTools = [
236
+ {
237
+ name: "load_toolset",
238
+ description: 'Dynamically load a toolset into the current session. After loading, the tools become immediately available for use. Based on the "Search+Load" architecture from Dynamic ReAct (arxiv 2509.20386) — the winning pattern for scalable MCP tool selection. Use discover_tools first to find which toolset you need, then call this to activate it.',
239
+ inputSchema: {
240
+ type: "object",
241
+ properties: {
242
+ toolset: {
243
+ type: "string",
244
+ description: `Toolset name to load. Available: ${Object.keys(TOOLSET_MAP).filter(k => !activeToolsets.has(k)).join(", ") || "(all loaded)"}`,
245
+ },
246
+ },
247
+ required: ["toolset"],
248
+ },
249
+ handler: async (args) => {
250
+ const { toolset } = args;
251
+ if (!TOOLSET_MAP[toolset]) {
252
+ return { error: true, message: `Unknown toolset: ${toolset}`, available: Object.keys(TOOLSET_MAP) };
253
+ }
254
+ if (activeToolsets.has(toolset)) {
255
+ return { alreadyLoaded: true, toolset, message: `Toolset '${toolset}' is already active.`, activeToolCount: allTools.length };
256
+ }
257
+ const startMs = Date.now();
258
+ const toolsBefore = allTools.length;
259
+ // Add toolset to active set
260
+ activeToolsets.add(toolset);
261
+ const newTools = TOOLSET_MAP[toolset];
262
+ // Rebuild domain tools from active toolsets
263
+ domainTools = [...activeToolsets].flatMap(k => TOOLSET_MAP[k] ?? []);
264
+ const newMetaTools = createMetaTools(domainTools);
265
+ allToolsWithoutDiscovery = [...domainTools, ...newMetaTools];
266
+ // Rebuild allTools (keep discovery + dynamic loading tools stable)
267
+ rebuildAllTools();
268
+ // Track A/B event
269
+ try {
270
+ const db = getDb();
271
+ db.prepare("INSERT INTO ab_tool_events (id, session_id, event_type, toolset_name, tools_before, tools_after, latency_ms, created_at) VALUES (?, ?, 'load', ?, ?, ?, ?, datetime('now'))").run(genId("abe"), SESSION_ID, toolset, toolsBefore, allTools.length, Date.now() - startMs);
272
+ }
273
+ catch { /* instrumentation must not break tool dispatch */ }
274
+ // Notify client that tool list changed (MCP spec)
275
+ try {
276
+ await server.notification({ method: "notifications/tools/list_changed" });
277
+ }
278
+ catch { /* client may not support notifications */ }
279
+ return {
280
+ loaded: true,
281
+ toolset,
282
+ toolsAdded: newTools.length,
283
+ toolNames: newTools.map(t => t.name),
284
+ activeToolCount: allTools.length,
285
+ activeToolsets: [...activeToolsets],
286
+ _hint: `${newTools.length} tools from '${toolset}' are now available. You can use them directly.`,
287
+ };
288
+ },
289
+ },
290
+ {
291
+ name: "unload_toolset",
292
+ description: "Remove a dynamically loaded toolset from the current session to free up context. Cannot unload toolsets from the initial preset.",
293
+ inputSchema: {
294
+ type: "object",
295
+ properties: {
296
+ toolset: {
297
+ type: "string",
298
+ description: "Toolset name to unload.",
299
+ },
300
+ },
301
+ required: ["toolset"],
302
+ },
303
+ handler: async (args) => {
304
+ const { toolset } = args;
305
+ if (!activeToolsets.has(toolset)) {
306
+ return { error: true, message: `Toolset '${toolset}' is not currently loaded.` };
307
+ }
308
+ if (initialToolsetNames.has(toolset)) {
309
+ return { error: true, message: `Cannot unload '${toolset}' — it's part of the initial preset (${currentPreset}).` };
310
+ }
311
+ const toolsBefore = allTools.length;
312
+ activeToolsets.delete(toolset);
313
+ // Rebuild
314
+ domainTools = [...activeToolsets].flatMap(k => TOOLSET_MAP[k] ?? []);
315
+ const newMetaTools = createMetaTools(domainTools);
316
+ allToolsWithoutDiscovery = [...domainTools, ...newMetaTools];
317
+ rebuildAllTools();
318
+ try {
319
+ const db = getDb();
320
+ db.prepare("INSERT INTO ab_tool_events (id, session_id, event_type, toolset_name, tools_before, tools_after, created_at) VALUES (?, ?, 'unload', ?, ?, ?, datetime('now'))").run(genId("abe"), SESSION_ID, toolset, toolsBefore, allTools.length);
321
+ }
322
+ catch { /* instrumentation */ }
323
+ try {
324
+ await server.notification({ method: "notifications/tools/list_changed" });
325
+ }
326
+ catch { /* client may not support notifications */ }
327
+ return {
328
+ unloaded: true,
329
+ toolset,
330
+ activeToolCount: allTools.length,
331
+ activeToolsets: [...activeToolsets],
332
+ };
333
+ },
334
+ },
335
+ {
336
+ name: "list_available_toolsets",
337
+ description: "List all available toolsets showing which are currently loaded and which can be dynamically added. Includes tool counts and descriptions for each toolset.",
338
+ inputSchema: { type: "object", properties: {} },
339
+ handler: async () => {
340
+ const toolsets = Object.entries(TOOLSET_MAP).map(([name, tools]) => ({
341
+ name,
342
+ toolCount: tools.length,
343
+ loaded: activeToolsets.has(name),
344
+ isInitialPreset: initialToolsetNames.has(name),
345
+ description: PRESET_DESCRIPTIONS[name] ?? null,
346
+ tools: tools.map(t => t.name),
347
+ }));
348
+ const loaded = toolsets.filter(t => t.loaded);
349
+ const available = toolsets.filter(t => !t.loaded);
350
+ return {
351
+ mode: useDynamicLoading ? "dynamic" : "static",
352
+ currentPreset,
353
+ activeToolCount: allTools.length,
354
+ loaded: { count: loaded.length, toolsets: loaded },
355
+ available: { count: available.length, toolsets: available },
356
+ _hint: available.length > 0
357
+ ? `${available.length} toolsets available to load. Call load_toolset("<name>") to activate.`
358
+ : "All toolsets are loaded.",
359
+ };
360
+ },
361
+ },
362
+ {
363
+ name: "call_loaded_tool",
364
+ description: 'Call a dynamically loaded tool by name. Use this after load_toolset when your client does not automatically refresh the tool list. Pass the tool name and its arguments. Example: call_loaded_tool({ tool: "analyze_screenshot", args: { imagePath: "screenshot.png" } }). This is a fallback — if the loaded tool appears in your tool list directly, call it directly instead.',
365
+ inputSchema: {
366
+ type: "object",
367
+ properties: {
368
+ tool: {
369
+ type: "string",
370
+ description: "Name of the dynamically loaded tool to call.",
371
+ },
372
+ args: {
373
+ type: "object",
374
+ description: "Arguments to pass to the tool (same as its inputSchema).",
375
+ additionalProperties: true,
376
+ },
377
+ },
378
+ required: ["tool"],
379
+ },
380
+ handler: async (callArgs) => {
381
+ const { tool: toolName, args: toolArgs } = callArgs;
382
+ const target = allTools.find(t => t.name === toolName);
383
+ if (!target) {
384
+ return {
385
+ error: true,
386
+ message: `Tool '${toolName}' not found. It may not be loaded yet.`,
387
+ _hint: "Call list_available_toolsets to see what's available, then load_toolset to activate it.",
388
+ loadedTools: allTools.map(t => t.name),
389
+ };
390
+ }
391
+ // Dispatch to the target tool's handler
392
+ return target.handler(toolArgs ?? {});
393
+ },
394
+ },
395
+ {
396
+ name: "smart_select_tools",
397
+ description: 'LLM-powered tool selection: sends your task description + a compact tool catalog to a fast model (Gemini Flash, GPT-4o-mini, or Claude Haiku) to pick the best 5-10 tools. Much more accurate than keyword search for ambiguous queries like "call an AI model" or "analyze my data". Requires GEMINI_API_KEY, OPENAI_API_KEY, or ANTHROPIC_API_KEY. Falls back to heuristic discover_tools if no API key is set.',
398
+ inputSchema: {
399
+ type: "object",
400
+ properties: {
401
+ task: {
402
+ type: "string",
403
+ description: "Describe what you want to accomplish. Be specific. Example: 'I need to parse a PDF, extract tables, and email a summary'",
404
+ },
405
+ maxTools: {
406
+ type: "number",
407
+ description: "Maximum tools to return (default: 8)",
408
+ },
409
+ provider: {
410
+ type: "string",
411
+ enum: ["auto", "gemini", "openai", "anthropic"],
412
+ description: "Which LLM provider to use. 'auto' (default) picks the first available API key.",
413
+ },
414
+ },
415
+ required: ["task"],
416
+ },
417
+ handler: async (args) => {
418
+ const task = args.task;
419
+ const maxTools = args.maxTools ?? 8;
420
+ const provider = args.provider ?? "auto";
421
+ // Build compact tool catalog: name + category + tags (no descriptions — saves tokens)
422
+ const catalog = ALL_REGISTRY_ENTRIES.map(e => `${e.name} [${e.category}] ${e.tags.slice(0, 5).join(",")}`).join("\n");
423
+ const systemPrompt = `You are a tool selection assistant. Given a task description and a catalog of ${ALL_REGISTRY_ENTRIES.length} tools, pick the ${maxTools} most relevant tools. Return ONLY a JSON array of tool names, nothing else. Example: ["tool_a","tool_b"]`;
424
+ const userPrompt = `Task: ${task}\n\nTool catalog (name [category] tags):\n${catalog}`;
425
+ // Try LLM providers in order
426
+ const geminiKey = process.env.GEMINI_API_KEY;
427
+ const openaiKey = process.env.OPENAI_API_KEY;
428
+ const anthropicKey = process.env.ANTHROPIC_API_KEY;
429
+ let selectedProvider = provider;
430
+ if (selectedProvider === "auto") {
431
+ if (geminiKey)
432
+ selectedProvider = "gemini";
433
+ else if (openaiKey)
434
+ selectedProvider = "openai";
435
+ else if (anthropicKey)
436
+ selectedProvider = "anthropic";
437
+ else
438
+ selectedProvider = "none";
439
+ }
440
+ if (selectedProvider === "none") {
441
+ // Fallback: run heuristic discover_tools (search full registry for dynamic mode)
442
+ const heuristicResults = hybridSearch(task, allTools.map(t => ({ name: t.name, description: t.description })), {
443
+ limit: maxTools,
444
+ mode: "hybrid",
445
+ searchFullRegistry: useDynamicLoading,
446
+ });
447
+ return {
448
+ method: "heuristic_fallback",
449
+ reason: "No API key found. Set GEMINI_API_KEY, OPENAI_API_KEY, or ANTHROPIC_API_KEY for LLM-powered selection.",
450
+ tools: heuristicResults.map((r) => ({
451
+ name: r.name,
452
+ category: r.category,
453
+ score: r.score,
454
+ quickRef: r.quickRef,
455
+ })),
456
+ _hint: "For better accuracy on ambiguous queries, set an API key to enable LLM-powered selection.",
457
+ };
458
+ }
459
+ try {
460
+ let responseText = "";
461
+ if (selectedProvider === "gemini" && geminiKey) {
462
+ const resp = await fetch(`https://generativelanguage.googleapis.com/v1beta/models/gemini-2.0-flash:generateContent?key=${geminiKey}`, {
463
+ method: "POST",
464
+ headers: { "Content-Type": "application/json" },
465
+ body: JSON.stringify({
466
+ contents: [{ parts: [{ text: `${systemPrompt}\n\n${userPrompt}` }] }],
467
+ generationConfig: { temperature: 0, maxOutputTokens: 512 },
468
+ }),
469
+ });
470
+ const data = await resp.json();
471
+ responseText = data?.candidates?.[0]?.content?.parts?.[0]?.text ?? "";
472
+ }
473
+ else if (selectedProvider === "openai" && openaiKey) {
474
+ const resp = await fetch("https://api.openai.com/v1/chat/completions", {
475
+ method: "POST",
476
+ headers: { "Content-Type": "application/json", Authorization: `Bearer ${openaiKey}` },
477
+ body: JSON.stringify({
478
+ model: "gpt-4o-mini",
479
+ messages: [
480
+ { role: "system", content: systemPrompt },
481
+ { role: "user", content: userPrompt },
482
+ ],
483
+ temperature: 0,
484
+ max_tokens: 512,
485
+ }),
486
+ });
487
+ const data = await resp.json();
488
+ responseText = data?.choices?.[0]?.message?.content ?? "";
489
+ }
490
+ else if (selectedProvider === "anthropic" && anthropicKey) {
491
+ const resp = await fetch("https://api.anthropic.com/v1/messages", {
492
+ method: "POST",
493
+ headers: {
494
+ "Content-Type": "application/json",
495
+ "x-api-key": anthropicKey,
496
+ "anthropic-version": "2023-06-01",
497
+ },
498
+ body: JSON.stringify({
499
+ model: "claude-3-5-haiku-latest",
500
+ max_tokens: 512,
501
+ system: systemPrompt,
502
+ messages: [{ role: "user", content: userPrompt }],
503
+ }),
504
+ });
505
+ const data = await resp.json();
506
+ responseText = data?.content?.[0]?.text ?? "";
507
+ }
508
+ // Parse the JSON array from the response
509
+ const jsonMatch = responseText.match(/\[[\s\S]*?\]/);
510
+ if (!jsonMatch) {
511
+ return { error: true, message: "LLM did not return a valid JSON array", raw: responseText.slice(0, 200) };
512
+ }
513
+ const selectedNames = JSON.parse(jsonMatch[0]);
514
+ // Enrich with registry metadata
515
+ const enriched = selectedNames
516
+ .map(name => {
517
+ const entry = TOOL_REGISTRY.get(name);
518
+ if (!entry)
519
+ return null;
520
+ return {
521
+ name: entry.name,
522
+ category: entry.category,
523
+ phase: entry.phase,
524
+ tags: entry.tags,
525
+ quickRef: entry.quickRef,
526
+ loaded: allTools.some(t => t.name === name),
527
+ };
528
+ })
529
+ .filter(Boolean);
530
+ // Identify toolsets to load
531
+ const unloadedToolsets = new Map();
532
+ for (const tool of enriched) {
533
+ if (tool && !tool.loaded) {
534
+ const ts = TOOL_TO_TOOLSET.get(tool.name);
535
+ if (ts) {
536
+ const list = unloadedToolsets.get(ts) ?? [];
537
+ list.push(tool.name);
538
+ unloadedToolsets.set(ts, list);
539
+ }
540
+ }
541
+ }
542
+ return {
543
+ method: `llm_${selectedProvider}`,
544
+ task,
545
+ selectedTools: enriched,
546
+ toolCount: enriched.length,
547
+ ...(unloadedToolsets.size > 0 ? {
548
+ _loadSuggestions: [...unloadedToolsets.entries()].map(([ts, tools]) => ({
549
+ toolset: ts,
550
+ matchingTools: tools,
551
+ action: `Call load_toolset("${ts}") to activate ${tools.length} tool(s).`,
552
+ })),
553
+ } : {}),
554
+ _hint: enriched.length > 0
555
+ ? `Top pick: ${enriched[0].name}. ${enriched[0].quickRef.nextAction}`
556
+ : "No tools selected. Try rephrasing your task.",
557
+ };
558
+ }
559
+ catch (err) {
560
+ return {
561
+ error: true,
562
+ method: `llm_${selectedProvider}`,
563
+ message: `LLM call failed: ${err.message}`,
564
+ _hint: "Falling back to heuristic search. Check your API key.",
565
+ };
566
+ }
567
+ },
568
+ },
569
+ {
570
+ name: "get_ab_test_report",
571
+ description: "Generate an A/B test comparison report for static vs dynamic toolset loading. Shows session counts, tool counts, load events, error rates, and per-toolset load frequency. Use after running sessions in both modes to evaluate the impact of dynamic loading.",
572
+ inputSchema: {
573
+ type: "object",
574
+ properties: {
575
+ detailed: {
576
+ type: "boolean",
577
+ description: "Include per-session breakdown (default: false, summary only)",
578
+ },
579
+ },
580
+ },
581
+ handler: async (args) => {
582
+ const db = getDb();
583
+ const detailed = args.detailed === true;
584
+ // Session-level aggregates by mode
585
+ const sessionSummary = db.prepare(`
586
+ SELECT
587
+ mode,
588
+ COUNT(*) as sessions,
589
+ ROUND(AVG(initial_tool_count), 1) as avg_initial_tools,
590
+ ROUND(AVG(COALESCE(final_tool_count, initial_tool_count)), 1) as avg_final_tools,
591
+ ROUND(AVG(COALESCE(total_tool_calls, 0)), 1) as avg_tool_calls,
592
+ ROUND(AVG(COALESCE(total_load_events, 0)), 1) as avg_load_events,
593
+ ROUND(AVG(COALESCE(session_duration_ms, 0)) / 1000.0, 1) as avg_duration_sec,
594
+ SUM(COALESCE(total_tool_calls, 0)) as total_calls,
595
+ SUM(COALESCE(total_load_events, 0)) as total_loads
596
+ FROM ab_test_sessions
597
+ GROUP BY mode
598
+ `).all();
599
+ // Error rate by mode (join with tool_call_log)
600
+ const errorRates = db.prepare(`
601
+ SELECT
602
+ s.mode,
603
+ COUNT(CASE WHEN t.result_status = 'error' THEN 1 END) as errors,
604
+ COUNT(*) as total_calls,
605
+ ROUND(100.0 * COUNT(CASE WHEN t.result_status = 'error' THEN 1 END) / MAX(COUNT(*), 1), 2) as error_pct
606
+ FROM tool_call_log t
607
+ JOIN ab_test_sessions s ON t.session_id = s.id
608
+ GROUP BY s.mode
609
+ `).all();
610
+ // Top loaded toolsets (dynamic mode)
611
+ const topToolsets = db.prepare(`
612
+ SELECT
613
+ toolset_name,
614
+ COUNT(*) as load_count,
615
+ ROUND(AVG(latency_ms), 1) as avg_latency_ms
616
+ FROM ab_tool_events
617
+ WHERE event_type = 'load'
618
+ GROUP BY toolset_name
619
+ ORDER BY load_count DESC
620
+ LIMIT 10
621
+ `).all();
622
+ // Current session info
623
+ const currentSession = {
624
+ sessionId: SESSION_ID,
625
+ mode: useDynamicLoading ? "dynamic" : "static",
626
+ preset: currentPreset,
627
+ toolCalls: _abToolCallCount,
628
+ loadEvents: _abLoadEventCount,
629
+ activeTools: allTools.length,
630
+ durationSec: Math.round((Date.now() - _abStartMs) / 1000),
631
+ dynamicallyLoaded: [...activeToolsets].filter(ts => !initialToolsetNames.has(ts)),
632
+ };
633
+ // Optional per-session detail
634
+ let sessions = [];
635
+ if (detailed) {
636
+ sessions = db.prepare(`
637
+ SELECT id, mode, initial_preset, initial_tool_count, final_tool_count,
638
+ toolsets_loaded, total_tool_calls, total_load_events,
639
+ session_duration_ms, created_at, ended_at
640
+ FROM ab_test_sessions
641
+ ORDER BY created_at DESC
642
+ LIMIT 50
643
+ `).all();
644
+ }
645
+ // Build verdict
646
+ const staticSummary = sessionSummary.find((s) => s.mode === "static");
647
+ const dynamicSummary = sessionSummary.find((s) => s.mode === "dynamic");
648
+ let verdict = "Insufficient data. Run sessions in both modes to compare.";
649
+ if (staticSummary && dynamicSummary) {
650
+ const toolDiff = (staticSummary.avg_final_tools ?? 0) - (dynamicSummary.avg_final_tools ?? 0);
651
+ const staticErr = errorRates.find((e) => e.mode === "static");
652
+ const dynamicErr = errorRates.find((e) => e.mode === "dynamic");
653
+ const errDiff = (staticErr?.error_pct ?? 0) - (dynamicErr?.error_pct ?? 0);
654
+ verdict = [
655
+ `Static: ${staticSummary.sessions} sessions, avg ${staticSummary.avg_final_tools} tools, ${staticErr?.error_pct ?? "?"}% error rate.`,
656
+ `Dynamic: ${dynamicSummary.sessions} sessions, avg ${dynamicSummary.avg_final_tools} tools, ${dynamicErr?.error_pct ?? "?"}% error rate.`,
657
+ toolDiff > 0 ? `Dynamic uses ${toolDiff.toFixed(1)} fewer tools on average.` : "",
658
+ errDiff > 0 ? `Dynamic has ${errDiff.toFixed(2)}pp lower error rate.` : errDiff < 0 ? `Static has ${(-errDiff).toFixed(2)}pp lower error rate.` : "",
659
+ dynamicSummary.avg_load_events > 0 ? `Agents loaded ${dynamicSummary.avg_load_events} toolsets per session on average.` : "",
660
+ ].filter(Boolean).join(" ");
661
+ }
662
+ return {
663
+ verdict,
664
+ sessionSummary,
665
+ errorRates,
666
+ topLoadedToolsets: topToolsets,
667
+ currentSession,
668
+ ...(detailed ? { sessions } : {}),
669
+ _hint: sessionSummary.length < 2
670
+ ? "Run sessions with both `npx nodebench-mcp` (static) and `npx nodebench-mcp --dynamic` (dynamic) to compare."
671
+ : "Compare avg_final_tools and error_pct between modes to evaluate dynamic loading impact.",
672
+ };
673
+ },
674
+ },
675
+ ];
676
+ // Combine all tools (mutable for dynamic loading)
677
+ let allTools = [...allToolsWithoutDiscovery, ...discoveryTools, ...dynamicLoadingTools];
180
678
  // Background: initialize embedding index for semantic search (non-blocking)
181
679
  // Uses Agent-as-a-Graph bipartite corpus: tool nodes + domain nodes for graph-aware retrieval
182
680
  if (useEmbedding) {
@@ -218,15 +716,25 @@ if (useEmbedding) {
218
716
  /* Embedding init failed — semantic search stays disabled, no impact on other features */
219
717
  });
220
718
  }
221
- // Build a lookup map for fast tool dispatch
222
- const toolMap = new Map();
719
+ // Build a lookup map for fast tool dispatch (mutable for dynamic loading)
720
+ let toolMap = new Map();
223
721
  for (const tool of allTools) {
224
722
  toolMap.set(tool.name, tool);
225
723
  }
724
/**
 * Rebuild hook for dynamic loading.
 *
 * Reassigns the module-level `allTools` to the concatenation of
 * `allToolsWithoutDiscovery`, `discoveryTools` and `dynamicLoadingTools`
 * (in that order), then replaces `toolMap` with a fresh name → tool Map
 * derived from it. When two tools share a name, the later one wins.
 */
function rebuildAllTools() {
    allTools = [...allToolsWithoutDiscovery, ...discoveryTools, ...dynamicLoadingTools];
    toolMap = new Map(allTools.map((entry) => [entry.name, entry]));
}
226
732
  // Auto-instrumentation: generate a session ID per MCP connection
227
733
  const SESSION_ID = genId("mcp");
228
- // Tools to skip auto-logging (avoid infinite recursion and noise)
229
- const SKIP_AUTO_LOG = new Set(["log_tool_call", "get_trajectory_analysis", "get_self_eval_report", "get_improvement_recommendations", "cleanup_stale_runs", "synthesize_recon_to_learnings"]);
734
+ // A/B test session-level counters (mutable, finalized on exit)
735
+ let _abToolCallCount = 0;
736
+ let _abLoadEventCount = 0;
737
+ const _abStartMs = Date.now();
230
738
  // ── Lightweight hooks: auto-save + attention refresh reminders ─────────
231
739
  const _hookState = {
232
740
  totalCalls: 0,
@@ -783,7 +1291,34 @@ artifacts (findings, risks, gaps, tests, evals, gates, learnings) that compound
783
1291
  ],
784
1292
  },
785
1293
  ];
786
- const server = new Server({ name: "nodebench-mcp-methodology", version: "2.15.0" }, { capabilities: { tools: {}, prompts: {} } });
1294
+ // Server instructions tell Claude Code Tool Search (and other clients) when to search
1295
+ // for NodeBench tools. This is the key integration point for lazy loading compatibility.
1296
+ // See: https://www.anthropic.com/engineering/advanced-tool-use
1297
+ const SERVER_INSTRUCTIONS = `NodeBench MCP provides structured AI development methodology tools.
1298
+ Use NodeBench tools when you need to:
1299
+ - Verify implementations (verification cycles, gap tracking, 6-phase flywheel)
1300
+ - Run evaluations and quality gates before shipping code
1301
+ - Search prior knowledge and record learnings across sessions
1302
+ - Assess risk before taking actions
1303
+ - Coordinate parallel agents (task locks, roles, context budget)
1304
+ - Research with structured recon (web search, GitHub, RSS feeds)
1305
+ - Analyze files (CSV, PDF, XLSX, images, audio, ZIP)
1306
+ - Run security audits (dependency scanning, code analysis, secrets detection)
1307
+ - Write and polish academic papers
1308
+ - Audit SEO, analyze Figma flows, detect Android flicker
1309
+ - Call LLMs (GPT, Claude, Gemini) for analysis and extraction
1310
+ Start with discover_tools("<your task>") to find the right tool.`;
1311
+ const server = new Server({ name: "nodebench-mcp-methodology", version: "2.18.0" }, {
1312
+ capabilities: { tools: { listChanged: true }, prompts: {} },
1313
+ instructions: SERVER_INSTRUCTIONS,
1314
+ });
1315
// ── A/B Test Session Tracking ─────────────────────────────────────────
// Insert one ab_test_sessions row for this process at startup, tagged with
// the loading mode, the active preset and the initial tool count, so static
// and dynamic sessions can be compared later. Best-effort: any failure is
// ignored so instrumentation cannot prevent the server from starting.
try {
    const sessionMode = useDynamicLoading ? 'dynamic' : 'static';
    getDb()
        .prepare("INSERT INTO ab_test_sessions (id, mode, initial_preset, initial_tool_count, created_at) VALUES (?, ?, ?, ?, datetime('now'))")
        .run(SESSION_ID, sessionMode, currentPreset, allTools.length);
}
catch { /* instrumentation must not block server start */ }
787
1322
  // Handle tools/list — return all tools with their JSON Schema inputSchemas
788
1323
  // Includes MCP 2025-11-25 spec annotations: category, phase, complexity (model tier hint)
789
1324
  server.setRequestHandler(ListToolsRequestSchema, async () => {
@@ -809,6 +1344,9 @@ server.setRequestHandler(ListToolsRequestSchema, async () => {
809
1344
  // Handle tools/call — dispatch to the matching tool handler (auto-instrumented)
810
1345
  server.setRequestHandler(CallToolRequestSchema, async (request) => {
811
1346
  const { name, arguments: args } = request.params;
1347
+ _abToolCallCount++;
1348
+ if (name === "load_toolset" || name === "unload_toolset")
1349
+ _abLoadEventCount++;
812
1350
  const tool = toolMap.get(name);
813
1351
  if (!tool) {
814
1352
  return {
@@ -826,7 +1364,7 @@ server.setRequestHandler(CallToolRequestSchema, async (request) => {
826
1364
  resultStatus = "error";
827
1365
  errorMsg = result.message ?? "soft error";
828
1366
  }
829
- // Auto-log (skip self-eval tools to avoid recursion/noise)
1367
+ // Auto-log to main DB (skip self-eval tools to avoid recursion/noise)
830
1368
  if (!SKIP_AUTO_LOG.has(name)) {
831
1369
  try {
832
1370
  const db = getDb();
@@ -834,6 +1372,17 @@ server.setRequestHandler(CallToolRequestSchema, async (request) => {
834
1372
  }
835
1373
  catch { /* never let instrumentation break tool dispatch */ }
836
1374
  }
1375
+ // Auto-log to analytics tracker
1376
+ tracker.record(name, startMs, resultStatus === "success", errorMsg, args);
1377
+ // Inline A/B session counter update (every 5 calls — amortized cost)
1378
+ if (_abToolCallCount % 5 === 0) {
1379
+ try {
1380
+ const db2 = getDb();
1381
+ const dynamicallyLoaded = [...activeToolsets].filter(ts => !initialToolsetNames.has(ts));
1382
+ db2.prepare("UPDATE ab_test_sessions SET total_tool_calls = ?, total_load_events = ?, final_tool_count = ?, toolsets_loaded = ? WHERE id = ?").run(_abToolCallCount, _abLoadEventCount, allTools.length, JSON.stringify(dynamicallyLoaded), SESSION_ID);
1383
+ }
1384
+ catch { /* instrumentation */ }
1385
+ }
837
1386
  // Tools with rawContent return ContentBlock[] directly (e.g. image captures)
838
1387
  if (tool.rawContent && Array.isArray(result)) {
839
1388
  return { content: result, isError: false };
@@ -875,7 +1424,7 @@ server.setRequestHandler(CallToolRequestSchema, async (request) => {
875
1424
  catch (err) {
876
1425
  resultStatus = "error";
877
1426
  errorMsg = err?.message || "Internal error";
878
- // Auto-log errors
1427
+ // Auto-log errors to main DB
879
1428
  if (!SKIP_AUTO_LOG.has(name)) {
880
1429
  try {
881
1430
  const db = getDb();
@@ -883,6 +1432,8 @@ server.setRequestHandler(CallToolRequestSchema, async (request) => {
883
1432
  }
884
1433
  catch { /* never let instrumentation break tool dispatch */ }
885
1434
  }
1435
+ // Auto-log error to analytics tracker
1436
+ tracker.record(name, startMs, false, errorMsg, args);
886
1437
  return {
887
1438
  content: [{ type: "text", text: errorMsg }],
888
1439
  isError: true,
@@ -914,6 +1465,24 @@ server.setRequestHandler(GetPromptRequestSchema, async (request) => {
914
1465
  messages,
915
1466
  };
916
1467
  });
1468
// Graceful shutdown: close analytics tracker + finalize A/B session on exit.
// The two steps are isolated from each other: a failure while closing the
// tracker must not skip the A/B finalisation, and neither may throw out of
// the exit listener. Everything here is invoked synchronously (no awaits).
process.on('exit', () => {
    try {
        tracker.close();
    }
    catch { /* instrumentation must not block shutdown */ }
    // Finalize A/B test session with aggregate metrics
    try {
        const db = getDb();
        // Toolsets active now that were not part of the initial set
        const dynamicallyLoaded = [...activeToolsets].filter(ts => !initialToolsetNames.has(ts));
        db.prepare(`UPDATE ab_test_sessions SET
            final_tool_count = ?,
            toolsets_loaded = ?,
            total_tool_calls = ?,
            total_load_events = ?,
            session_duration_ms = ?,
            ended_at = datetime('now')
            WHERE id = ?`).run(allTools.length, JSON.stringify(dynamicallyLoaded), _abToolCallCount, _abLoadEventCount, Date.now() - _abStartMs, SESSION_ID);
    }
    catch { /* instrumentation must not block shutdown */ }
});
917
1486
  // Connect via stdio
918
1487
  const transport = new StdioServerTransport();
919
1488
  await server.connect(transport);