nodebench-mcp 2.17.0 → 2.18.1

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (57)
  1. package/LICENSE +21 -0
  2. package/NODEBENCH_AGENTS.md +2 -2
  3. package/README.md +516 -82
  4. package/dist/__tests__/analytics.test.d.ts +11 -0
  5. package/dist/__tests__/analytics.test.js +546 -0
  6. package/dist/__tests__/analytics.test.js.map +1 -0
  7. package/dist/__tests__/dynamicLoading.test.d.ts +1 -0
  8. package/dist/__tests__/dynamicLoading.test.js +278 -0
  9. package/dist/__tests__/dynamicLoading.test.js.map +1 -0
  10. package/dist/__tests__/evalHarness.test.js +1 -1
  11. package/dist/__tests__/evalHarness.test.js.map +1 -1
  12. package/dist/__tests__/helpers/answerMatch.js +22 -22
  13. package/dist/__tests__/presetRealWorldBench.test.js +9 -0
  14. package/dist/__tests__/presetRealWorldBench.test.js.map +1 -1
  15. package/dist/__tests__/tools.test.js +1 -1
  16. package/dist/__tests__/toolsetGatingEval.test.js +9 -1
  17. package/dist/__tests__/toolsetGatingEval.test.js.map +1 -1
  18. package/dist/analytics/index.d.ts +10 -0
  19. package/dist/analytics/index.js +11 -0
  20. package/dist/analytics/index.js.map +1 -0
  21. package/dist/analytics/projectDetector.d.ts +19 -0
  22. package/dist/analytics/projectDetector.js +259 -0
  23. package/dist/analytics/projectDetector.js.map +1 -0
  24. package/dist/analytics/schema.d.ts +57 -0
  25. package/dist/analytics/schema.js +157 -0
  26. package/dist/analytics/schema.js.map +1 -0
  27. package/dist/analytics/smartPreset.d.ts +63 -0
  28. package/dist/analytics/smartPreset.js +300 -0
  29. package/dist/analytics/smartPreset.js.map +1 -0
  30. package/dist/analytics/toolTracker.d.ts +59 -0
  31. package/dist/analytics/toolTracker.js +163 -0
  32. package/dist/analytics/toolTracker.js.map +1 -0
  33. package/dist/analytics/usageStats.d.ts +64 -0
  34. package/dist/analytics/usageStats.js +252 -0
  35. package/dist/analytics/usageStats.js.map +1 -0
  36. package/dist/db.js +359 -321
  37. package/dist/db.js.map +1 -1
  38. package/dist/index.d.ts +2 -1
  39. package/dist/index.js +652 -89
  40. package/dist/index.js.map +1 -1
  41. package/dist/tools/architectTools.js +13 -13
  42. package/dist/tools/critterTools.js +14 -14
  43. package/dist/tools/parallelAgentTools.js +176 -176
  44. package/dist/tools/patternTools.js +11 -11
  45. package/dist/tools/progressiveDiscoveryTools.d.ts +5 -1
  46. package/dist/tools/progressiveDiscoveryTools.js +111 -19
  47. package/dist/tools/progressiveDiscoveryTools.js.map +1 -1
  48. package/dist/tools/researchWritingTools.js +42 -42
  49. package/dist/tools/rssTools.js +396 -396
  50. package/dist/tools/toolRegistry.d.ts +17 -0
  51. package/dist/tools/toolRegistry.js +65 -17
  52. package/dist/tools/toolRegistry.js.map +1 -1
  53. package/dist/tools/voiceBridgeTools.js +498 -498
  54. package/dist/toolsetRegistry.d.ts +10 -0
  55. package/dist/toolsetRegistry.js +84 -0
  56. package/dist/toolsetRegistry.js.map +1 -0
  57. package/package.json +4 -4
package/dist/index.js CHANGED
@@ -20,42 +20,14 @@ import { Server } from "@modelcontextprotocol/sdk/server/index.js";
20
20
  import { StdioServerTransport } from "@modelcontextprotocol/sdk/server/stdio.js";
21
21
  import { ListToolsRequestSchema, CallToolRequestSchema, ListPromptsRequestSchema, GetPromptRequestSchema, } from "@modelcontextprotocol/sdk/types.js";
22
22
  import { getDb, genId } from "./db.js";
23
- import { verificationTools } from "./tools/verificationTools.js";
24
- import { evalTools } from "./tools/evalTools.js";
25
- import { qualityGateTools } from "./tools/qualityGateTools.js";
26
- import { learningTools } from "./tools/learningTools.js";
27
- import { flywheelTools } from "./tools/flywheelTools.js";
28
- import { reconTools } from "./tools/reconTools.js";
29
- import { uiCaptureTools } from "./tools/uiCaptureTools.js";
30
- import { visionTools } from "./tools/visionTools.js";
31
- import { webTools } from "./tools/webTools.js";
32
- import { githubTools } from "./tools/githubTools.js";
33
- import { documentationTools } from "./tools/documentationTools.js";
34
- import { agentBootstrapTools } from "./tools/agentBootstrapTools.js";
35
- import { selfEvalTools } from "./tools/selfEvalTools.js";
36
- import { parallelAgentTools } from "./tools/parallelAgentTools.js";
37
- import { llmTools } from "./tools/llmTools.js";
38
- import { securityTools } from "./tools/securityTools.js";
39
- import { platformTools } from "./tools/platformTools.js";
40
- import { researchWritingTools } from "./tools/researchWritingTools.js";
41
- import { flickerDetectionTools } from "./tools/flickerDetectionTools.js";
42
- import { figmaFlowTools } from "./tools/figmaFlowTools.js";
23
+ import { getAnalyticsDb, closeAnalyticsDb, clearOldRecords } from "./analytics/index.js";
24
+ import { AnalyticsTracker } from "./analytics/toolTracker.js";
25
+ import { generateSmartPreset, formatPresetRecommendation, listPresets } from "./analytics/index.js";
26
+ import { getProjectUsageSummary, exportUsageStats, formatStatsDisplay } from "./analytics/index.js";
27
+ import { TOOLSET_MAP, TOOL_TO_TOOLSET } from "./toolsetRegistry.js";
43
28
  import { createMetaTools } from "./tools/metaTools.js";
44
- import { localFileTools, gaiaMediaSolvers } from "./tools/localFileTools.js";
45
29
  import { createProgressiveDiscoveryTools } from "./tools/progressiveDiscoveryTools.js";
46
- import { boilerplateTools } from "./tools/boilerplateTools.js";
47
- import { cCompilerBenchmarkTools } from "./tools/cCompilerBenchmarkTools.js";
48
- import { sessionMemoryTools } from "./tools/sessionMemoryTools.js";
49
- import { patternTools } from "./tools/patternTools.js";
50
- import { gitWorkflowTools } from "./tools/gitWorkflowTools.js";
51
- import { seoTools } from "./tools/seoTools.js";
52
- import { voiceBridgeTools } from "./tools/voiceBridgeTools.js";
53
- import { critterTools } from "./tools/critterTools.js";
54
- import { emailTools } from "./tools/emailTools.js";
55
- import { rssTools } from "./tools/rssTools.js";
56
- import { architectTools } from "./tools/architectTools.js";
57
- import { getQuickRef, ALL_REGISTRY_ENTRIES, TOOL_REGISTRY, getToolComplexity, _setDbAccessor } from "./tools/toolRegistry.js";
58
- import { toonTools } from "./tools/toonTools.js";
30
+ import { getQuickRef, ALL_REGISTRY_ENTRIES, TOOL_REGISTRY, getToolComplexity, _setDbAccessor, hybridSearch } from "./tools/toolRegistry.js";
59
31
  // TOON format — ~40% token savings on tool responses
60
32
  import { encode as toonEncode } from "@toon-format/toon";
61
33
  // Embedding provider — neural semantic search
@@ -64,48 +36,38 @@ import { initEmbeddingIndex } from "./tools/embeddingProvider.js";
64
36
  const cliArgs = process.argv.slice(2);
65
37
  const useToon = !cliArgs.includes("--no-toon");
66
38
  const useEmbedding = !cliArgs.includes("--no-embedding");
67
- const TOOLSET_MAP = {
68
- verification: verificationTools,
69
- eval: evalTools,
70
- quality_gate: qualityGateTools,
71
- learning: learningTools,
72
- flywheel: flywheelTools,
73
- recon: reconTools,
74
- ui_capture: uiCaptureTools,
75
- vision: visionTools,
76
- local_file: localFileTools,
77
- web: webTools,
78
- github: githubTools,
79
- docs: documentationTools,
80
- bootstrap: agentBootstrapTools,
81
- self_eval: selfEvalTools,
82
- parallel: parallelAgentTools,
83
- llm: llmTools,
84
- security: securityTools,
85
- platform: platformTools,
86
- research_writing: researchWritingTools,
87
- flicker_detection: flickerDetectionTools,
88
- figma_flow: figmaFlowTools,
89
- boilerplate: boilerplateTools,
90
- benchmark: cCompilerBenchmarkTools,
91
- session_memory: sessionMemoryTools,
92
- gaia_solvers: gaiaMediaSolvers,
93
- toon: toonTools,
94
- pattern: patternTools,
95
- git_workflow: gitWorkflowTools,
96
- seo: seoTools,
97
- voice_bridge: voiceBridgeTools,
98
- critter: critterTools,
99
- email: emailTools,
100
- rss: rssTools,
101
- architect: architectTools,
102
- };
39
+ const useSmartPreset = cliArgs.includes("--smart-preset");
40
+ const showStats = cliArgs.includes("--stats");
41
+ const exportStats = cliArgs.includes("--export-stats");
42
+ const resetStats = cliArgs.includes("--reset-stats");
43
+ const listPresetsFlag = cliArgs.includes("--list-presets");
44
+ export { TOOLSET_MAP };
45
+ const DEFAULT_TOOLSETS = ["verification", "eval", "quality_gate", "learning", "flywheel", "recon", "security", "boilerplate"];
103
46
  const PRESETS = {
104
- meta: [],
105
- lite: ["verification", "eval", "quality_gate", "learning", "flywheel", "recon", "security", "boilerplate"],
106
- core: ["verification", "eval", "quality_gate", "learning", "flywheel", "recon", "bootstrap", "self_eval", "llm", "security", "platform", "research_writing", "flicker_detection", "figma_flow", "boilerplate", "benchmark", "session_memory", "toon", "pattern", "git_workflow", "seo", "voice_bridge", "critter", "email", "rss", "architect"],
47
+ default: DEFAULT_TOOLSETS,
48
+ // Themed presets bridge between default (50 tools) and full (175 tools)
49
+ web_dev: [...DEFAULT_TOOLSETS, "ui_capture", "vision", "web", "seo", "git_workflow", "architect"],
50
+ research: [...DEFAULT_TOOLSETS, "web", "llm", "rss", "email", "docs"],
51
+ data: [...DEFAULT_TOOLSETS, "local_file", "llm", "web"],
52
+ devops: [...DEFAULT_TOOLSETS, "git_workflow", "session_memory", "benchmark", "pattern"],
53
+ mobile: [...DEFAULT_TOOLSETS, "ui_capture", "vision", "flicker_detection"],
54
+ academic: [...DEFAULT_TOOLSETS, "research_writing", "llm", "web", "local_file"],
55
+ multi_agent: [...DEFAULT_TOOLSETS, "parallel", "self_eval", "session_memory", "pattern", "toon"],
56
+ content: [...DEFAULT_TOOLSETS, "llm", "critter", "email", "rss", "platform", "architect"],
107
57
  full: Object.keys(TOOLSET_MAP),
108
58
  };
59
+ const PRESET_DESCRIPTIONS = {
60
+ default: "Core AI Flywheel — verification, eval, quality gates, learning, recon",
61
+ web_dev: "Web projects — adds visual QA, SEO audit, git workflow, code architecture",
62
+ research: "Research workflows — adds web search, LLM calls, RSS feeds, email, docs",
63
+ data: "Data analysis — adds CSV/XLSX/PDF/JSON parsing, LLM extraction, web fetch",
64
+ devops: "CI/CD & ops — adds git compliance, session memory, benchmarks, pattern mining",
65
+ mobile: "Mobile apps — adds screenshot capture, vision analysis, flicker detection",
66
+ academic: "Academic papers — adds polish, review, translate, logic check, data analysis",
67
+ multi_agent: "Multi-agent teams — adds task locking, messaging, roles, oracle testing",
68
+ content: "Content & publishing — adds LLM, accountability, email, RSS, platform queue",
69
+ full: "Everything — all toolsets for maximum coverage",
70
+ };
109
71
  function parseToolsets() {
110
72
  if (cliArgs.includes("--help")) {
111
73
  const lines = [
@@ -114,9 +76,15 @@ function parseToolsets() {
114
76
  "Usage: nodebench-mcp [options]",
115
77
  "",
116
78
  "Options:",
117
- " --toolsets <list> Comma-separated toolsets to enable (default: all)",
79
+ " --toolsets <list> Comma-separated toolsets to enable (default: default)",
118
80
  " --exclude <list> Comma-separated toolsets to exclude",
119
- " --preset <name> Use a preset: meta, lite, core, or full",
81
+ " --preset <name> Use a preset: default or full",
82
+ " --smart-preset Generate smart preset recommendation based on project type and usage history",
83
+ " --stats Show usage statistics for current project",
84
+ " --export-stats Export usage statistics to JSON",
85
+ " --reset-stats Clear all usage analytics data",
86
+ " --list-presets List all available presets with descriptions",
87
+ " --dynamic Enable dynamic toolset loading (Search+Load pattern from arxiv 2509.20386)",
120
88
  " --no-toon Disable TOON encoding (TOON is on by default for ~40% token savings)",
121
89
  " --no-embedding Disable neural embedding search (uses local HuggingFace model or API keys)",
122
90
  " --help Show this help and exit",
@@ -125,10 +93,20 @@ function parseToolsets() {
125
93
  ...Object.entries(TOOLSET_MAP).map(([k, v]) => ` ${k.padEnd(16)} ${v.length} tools`),
126
94
  "",
127
95
  "Presets:",
128
- ...Object.entries(PRESETS).map(([k, v]) => ` ${k.padEnd(16)} ${v.join(", ")}`),
96
+ ...Object.entries(PRESETS).map(([k, v]) => {
97
+ const count = v.reduce((s, ts) => s + (TOOLSET_MAP[ts]?.length ?? 0), 0) + 12;
98
+ return ` ${k.padEnd(14)} ${String(count).padStart(3)} tools ${PRESET_DESCRIPTIONS[k] ?? ''}`;
99
+ }),
129
100
  "",
130
101
  "Examples:",
131
- " npx nodebench-mcp --preset core",
102
+ " npx nodebench-mcp # Default (50 tools) - core AI Flywheel",
103
+ " npx nodebench-mcp --preset web_dev # Web development (+ vision, SEO, git)",
104
+ " npx nodebench-mcp --preset research # Research workflows (+ web, LLM, RSS, email)",
105
+ " npx nodebench-mcp --preset data # Data analysis (+ local file parsing, LLM)",
106
+ " npx nodebench-mcp --preset academic # Academic writing (+ paper tools, LLM)",
107
+ " npx nodebench-mcp --preset full # All 175 tools",
108
+ " npx nodebench-mcp --smart-preset # Get AI-powered preset recommendation",
109
+ " npx nodebench-mcp --stats # Show usage statistics",
132
110
  " npx nodebench-mcp --toolsets verification,eval,recon",
133
111
  " npx nodebench-mcp --exclude vision,ui_capture,parallel",
134
112
  "",
@@ -170,19 +148,533 @@ function parseToolsets() {
170
148
  .filter(([k]) => !excluded.has(k))
171
149
  .flatMap(([, v]) => v);
172
150
  }
173
- return Object.values(TOOLSET_MAP).flat();
151
+ // Default to default preset (50 tools - complete AI Flywheel)
152
+ return PRESETS.default.flatMap((k) => TOOLSET_MAP[k] ?? []);
153
+ }
154
+ // ── Analytics CLI flag handling ─────────────────────────────────────────
155
+ // Handle --list-presets
156
+ if (listPresetsFlag) {
157
+ const presets = listPresets(TOOLSET_MAP);
158
+ console.log(JSON.stringify(presets, null, 2));
159
+ process.exit(0);
160
+ }
161
+ // ── Analytics CLI handlers (run-and-exit) ───────────────────────────────
162
+ if (resetStats || useSmartPreset || showStats || exportStats) {
163
+ const aDb = getAnalyticsDb();
164
+ try {
165
+ if (resetStats) {
166
+ clearOldRecords(aDb, 0);
167
+ console.error("Usage analytics data cleared (tool_usage + cache). Project context and preset history preserved.");
168
+ }
169
+ else if (useSmartPreset) {
170
+ const recommendation = generateSmartPreset(aDb, TOOLSET_MAP);
171
+ console.error(formatPresetRecommendation(recommendation, TOOLSET_MAP));
172
+ }
173
+ else if (showStats) {
174
+ const summary = getProjectUsageSummary(aDb, process.cwd(), 30);
175
+ if (summary) {
176
+ console.error(formatStatsDisplay(summary, process.cwd()));
177
+ }
178
+ else {
179
+ console.error("No usage data available for this project in the last 30 days.");
180
+ }
181
+ }
182
+ else if (exportStats) {
183
+ console.log(exportUsageStats(aDb, process.cwd(), 30));
184
+ }
185
+ }
186
+ finally {
187
+ closeAnalyticsDb(aDb);
188
+ }
189
+ process.exit(0);
174
190
  }
175
191
  // Initialize DB (creates ~/.nodebench/ and schema on first run)
176
192
  getDb();
177
193
  // Wire up DB accessor for execution trace edges (avoids circular import)
178
194
  _setDbAccessor(getDb);
179
195
  // Assemble tools (filtered by --toolsets / --exclude / --preset if provided)
180
- const domainTools = parseToolsets();
196
+ let domainTools = parseToolsets();
197
+ // Determine current preset name for analytics
198
+ let currentPreset = 'default';
199
+ const presetIdx = cliArgs.indexOf("--preset");
200
+ if (presetIdx !== -1 && cliArgs[presetIdx + 1]) {
201
+ currentPreset = cliArgs[presetIdx + 1];
202
+ }
203
+ else if (cliArgs.includes("--toolsets") || cliArgs.includes("--exclude")) {
204
+ currentPreset = 'custom';
205
+ }
206
+ // Dynamic loading: --dynamic flag enables Search+Load architecture
207
+ // (arxiv 2509.20386 "Dynamic ReAct" winning pattern)
208
+ const useDynamicLoading = cliArgs.includes("--dynamic");
209
+ // Track which toolsets are currently active (mutable for dynamic loading)
210
+ const initialToolsetNames = new Set(PRESETS[currentPreset] ?? PRESETS.default);
211
+ const activeToolsets = new Set(initialToolsetNames);
212
+ // Tools to skip auto-logging (avoid infinite recursion and noise)
213
+ const SKIP_AUTO_LOG = new Set(["log_tool_call", "get_trajectory_analysis", "get_self_eval_report", "get_improvement_recommendations", "cleanup_stale_runs", "synthesize_recon_to_learnings", "load_toolset", "unload_toolset", "list_available_toolsets"]);
214
+ // Initialize analytics tracker singleton (handles DB, project context, retention cleanup)
215
+ const tracker = AnalyticsTracker.init({
216
+ projectPath: process.cwd(),
217
+ preset: currentPreset,
218
+ toolCount: domainTools.length + 6,
219
+ toolToToolset: TOOL_TO_TOOLSET,
220
+ skipTools: SKIP_AUTO_LOG,
221
+ });
181
222
  const metaTools = createMetaTools(domainTools);
182
- const allToolsWithoutDiscovery = [...domainTools, ...metaTools];
223
+ let allToolsWithoutDiscovery = [...domainTools, ...metaTools];
183
224
  // Progressive discovery tools need the full tool list for hybrid search
184
- const discoveryTools = createProgressiveDiscoveryTools(allToolsWithoutDiscovery.map((t) => ({ name: t.name, description: t.description })));
185
- const allTools = [...allToolsWithoutDiscovery, ...discoveryTools];
225
+ // Pass dynamic loading callbacks so discover_tools can suggest load_toolset for unloaded toolsets
226
+ const discoveryTools = createProgressiveDiscoveryTools(allToolsWithoutDiscovery.map((t) => ({ name: t.name, description: t.description })), {
227
+ getLoadedToolNames: () => new Set(allTools.map(t => t.name)),
228
+ getToolToToolset: () => TOOL_TO_TOOLSET,
229
+ });
230
+ // ── Dynamic Loading Tools (Search+Load pattern) ────────────────────────
231
+ // Based on Dynamic ReAct (arxiv 2509.20386) — the winning architecture.
232
+ // Agent starts with default preset, discovers tools via discover_tools,
233
+ // then calls load_toolset to activate them. Server sends
234
+ // notifications/tools/list_changed so the client re-fetches the tool list.
235
+ const dynamicLoadingTools = [
236
+ {
237
+ name: "load_toolset",
238
+ description: 'Dynamically load a toolset into the current session. After loading, the tools become immediately available for use. Based on the "Search+Load" architecture from Dynamic ReAct (arxiv 2509.20386) — the winning pattern for scalable MCP tool selection. Use discover_tools first to find which toolset you need, then call this to activate it.',
239
+ inputSchema: {
240
+ type: "object",
241
+ properties: {
242
+ toolset: {
243
+ type: "string",
244
+ description: `Toolset name to load. Available: ${Object.keys(TOOLSET_MAP).filter(k => !activeToolsets.has(k)).join(", ") || "(all loaded)"}`,
245
+ },
246
+ },
247
+ required: ["toolset"],
248
+ },
249
+ handler: async (args) => {
250
+ const { toolset } = args;
251
+ if (!TOOLSET_MAP[toolset]) {
252
+ return { error: true, message: `Unknown toolset: ${toolset}`, available: Object.keys(TOOLSET_MAP) };
253
+ }
254
+ if (activeToolsets.has(toolset)) {
255
+ return { alreadyLoaded: true, toolset, message: `Toolset '${toolset}' is already active.`, activeToolCount: allTools.length };
256
+ }
257
+ const startMs = Date.now();
258
+ const toolsBefore = allTools.length;
259
+ // Add toolset to active set
260
+ activeToolsets.add(toolset);
261
+ const newTools = TOOLSET_MAP[toolset];
262
+ // Rebuild domain tools from active toolsets
263
+ domainTools = [...activeToolsets].flatMap(k => TOOLSET_MAP[k] ?? []);
264
+ const newMetaTools = createMetaTools(domainTools);
265
+ allToolsWithoutDiscovery = [...domainTools, ...newMetaTools];
266
+ // Rebuild allTools (keep discovery + dynamic loading tools stable)
267
+ rebuildAllTools();
268
+ // Track A/B event
269
+ try {
270
+ const db = getDb();
271
+ db.prepare("INSERT INTO ab_tool_events (id, session_id, event_type, toolset_name, tools_before, tools_after, latency_ms, created_at) VALUES (?, ?, 'load', ?, ?, ?, ?, datetime('now'))").run(genId("abe"), SESSION_ID, toolset, toolsBefore, allTools.length, Date.now() - startMs);
272
+ }
273
+ catch { /* instrumentation must not break tool dispatch */ }
274
+ // Notify client that tool list changed (MCP spec)
275
+ try {
276
+ await server.notification({ method: "notifications/tools/list_changed" });
277
+ }
278
+ catch { /* client may not support notifications */ }
279
+ return {
280
+ loaded: true,
281
+ toolset,
282
+ toolsAdded: newTools.length,
283
+ toolNames: newTools.map(t => t.name),
284
+ activeToolCount: allTools.length,
285
+ activeToolsets: [...activeToolsets],
286
+ _hint: `${newTools.length} tools from '${toolset}' are now available. You can use them directly.`,
287
+ };
288
+ },
289
+ },
290
+ {
291
+ name: "unload_toolset",
292
+ description: "Remove a dynamically loaded toolset from the current session to free up context. Cannot unload toolsets from the initial preset.",
293
+ inputSchema: {
294
+ type: "object",
295
+ properties: {
296
+ toolset: {
297
+ type: "string",
298
+ description: "Toolset name to unload.",
299
+ },
300
+ },
301
+ required: ["toolset"],
302
+ },
303
+ handler: async (args) => {
304
+ const { toolset } = args;
305
+ if (!activeToolsets.has(toolset)) {
306
+ return { error: true, message: `Toolset '${toolset}' is not currently loaded.` };
307
+ }
308
+ if (initialToolsetNames.has(toolset)) {
309
+ return { error: true, message: `Cannot unload '${toolset}' — it's part of the initial preset (${currentPreset}).` };
310
+ }
311
+ const toolsBefore = allTools.length;
312
+ activeToolsets.delete(toolset);
313
+ // Rebuild
314
+ domainTools = [...activeToolsets].flatMap(k => TOOLSET_MAP[k] ?? []);
315
+ const newMetaTools = createMetaTools(domainTools);
316
+ allToolsWithoutDiscovery = [...domainTools, ...newMetaTools];
317
+ rebuildAllTools();
318
+ try {
319
+ const db = getDb();
320
+ db.prepare("INSERT INTO ab_tool_events (id, session_id, event_type, toolset_name, tools_before, tools_after, created_at) VALUES (?, ?, 'unload', ?, ?, ?, datetime('now'))").run(genId("abe"), SESSION_ID, toolset, toolsBefore, allTools.length);
321
+ }
322
+ catch { /* instrumentation */ }
323
+ try {
324
+ await server.notification({ method: "notifications/tools/list_changed" });
325
+ }
326
+ catch { /* client may not support notifications */ }
327
+ return {
328
+ unloaded: true,
329
+ toolset,
330
+ activeToolCount: allTools.length,
331
+ activeToolsets: [...activeToolsets],
332
+ };
333
+ },
334
+ },
335
+ {
336
+ name: "list_available_toolsets",
337
+ description: "List all available toolsets showing which are currently loaded and which can be dynamically added. Includes tool counts and descriptions for each toolset.",
338
+ inputSchema: { type: "object", properties: {} },
339
+ handler: async () => {
340
+ const toolsets = Object.entries(TOOLSET_MAP).map(([name, tools]) => ({
341
+ name,
342
+ toolCount: tools.length,
343
+ loaded: activeToolsets.has(name),
344
+ isInitialPreset: initialToolsetNames.has(name),
345
+ description: PRESET_DESCRIPTIONS[name] ?? null,
346
+ tools: tools.map(t => t.name),
347
+ }));
348
+ const loaded = toolsets.filter(t => t.loaded);
349
+ const available = toolsets.filter(t => !t.loaded);
350
+ return {
351
+ mode: useDynamicLoading ? "dynamic" : "static",
352
+ currentPreset,
353
+ activeToolCount: allTools.length,
354
+ loaded: { count: loaded.length, toolsets: loaded },
355
+ available: { count: available.length, toolsets: available },
356
+ _hint: available.length > 0
357
+ ? `${available.length} toolsets available to load. Call load_toolset("<name>") to activate.`
358
+ : "All toolsets are loaded.",
359
+ };
360
+ },
361
+ },
362
+ {
363
+ name: "call_loaded_tool",
364
+ description: 'Call a dynamically loaded tool by name. Use this after load_toolset when your client does not automatically refresh the tool list. Pass the tool name and its arguments. Example: call_loaded_tool({ tool: "analyze_screenshot", args: { imagePath: "screenshot.png" } }). This is a fallback — if the loaded tool appears in your tool list directly, call it directly instead.',
365
+ inputSchema: {
366
+ type: "object",
367
+ properties: {
368
+ tool: {
369
+ type: "string",
370
+ description: "Name of the dynamically loaded tool to call.",
371
+ },
372
+ args: {
373
+ type: "object",
374
+ description: "Arguments to pass to the tool (same as its inputSchema).",
375
+ additionalProperties: true,
376
+ },
377
+ },
378
+ required: ["tool"],
379
+ },
380
+ handler: async (callArgs) => {
381
+ const { tool: toolName, args: toolArgs } = callArgs;
382
+ const target = allTools.find(t => t.name === toolName);
383
+ if (!target) {
384
+ return {
385
+ error: true,
386
+ message: `Tool '${toolName}' not found. It may not be loaded yet.`,
387
+ _hint: "Call list_available_toolsets to see what's available, then load_toolset to activate it.",
388
+ loadedTools: allTools.map(t => t.name),
389
+ };
390
+ }
391
+ // Dispatch to the target tool's handler
392
+ return target.handler(toolArgs ?? {});
393
+ },
394
+ },
395
+ {
396
+ name: "smart_select_tools",
397
+ description: 'LLM-powered tool selection: sends your task description + a compact tool catalog to a fast model (Gemini Flash, GPT-4o-mini, or Claude Haiku) to pick the best 5-10 tools. Much more accurate than keyword search for ambiguous queries like "call an AI model" or "analyze my data". Requires GEMINI_API_KEY, OPENAI_API_KEY, or ANTHROPIC_API_KEY. Falls back to heuristic discover_tools if no API key is set.',
398
+ inputSchema: {
399
+ type: "object",
400
+ properties: {
401
+ task: {
402
+ type: "string",
403
+ description: "Describe what you want to accomplish. Be specific. Example: 'I need to parse a PDF, extract tables, and email a summary'",
404
+ },
405
+ maxTools: {
406
+ type: "number",
407
+ description: "Maximum tools to return (default: 8)",
408
+ },
409
+ provider: {
410
+ type: "string",
411
+ enum: ["auto", "gemini", "openai", "anthropic"],
412
+ description: "Which LLM provider to use. 'auto' (default) picks the first available API key.",
413
+ },
414
+ },
415
+ required: ["task"],
416
+ },
417
+ handler: async (args) => {
418
+ const task = args.task;
419
+ const maxTools = args.maxTools ?? 8;
420
+ const provider = args.provider ?? "auto";
421
+ // Build compact tool catalog: name + category + tags (no descriptions — saves tokens)
422
+ const catalog = ALL_REGISTRY_ENTRIES.map(e => `${e.name} [${e.category}] ${e.tags.slice(0, 5).join(",")}`).join("\n");
423
+ const systemPrompt = `You are a tool selection assistant. Given a task description and a catalog of ${ALL_REGISTRY_ENTRIES.length} tools, pick the ${maxTools} most relevant tools. Return ONLY a JSON array of tool names, nothing else. Example: ["tool_a","tool_b"]`;
424
+ const userPrompt = `Task: ${task}\n\nTool catalog (name [category] tags):\n${catalog}`;
425
+ // Try LLM providers in order
426
+ const geminiKey = process.env.GEMINI_API_KEY;
427
+ const openaiKey = process.env.OPENAI_API_KEY;
428
+ const anthropicKey = process.env.ANTHROPIC_API_KEY;
429
+ let selectedProvider = provider;
430
+ if (selectedProvider === "auto") {
431
+ if (geminiKey)
432
+ selectedProvider = "gemini";
433
+ else if (openaiKey)
434
+ selectedProvider = "openai";
435
+ else if (anthropicKey)
436
+ selectedProvider = "anthropic";
437
+ else
438
+ selectedProvider = "none";
439
+ }
440
+ if (selectedProvider === "none") {
441
+ // Fallback: run heuristic discover_tools (search full registry for dynamic mode)
442
+ const heuristicResults = hybridSearch(task, allTools.map(t => ({ name: t.name, description: t.description })), {
443
+ limit: maxTools,
444
+ mode: "hybrid",
445
+ searchFullRegistry: useDynamicLoading,
446
+ });
447
+ return {
448
+ method: "heuristic_fallback",
449
+ reason: "No API key found. Set GEMINI_API_KEY, OPENAI_API_KEY, or ANTHROPIC_API_KEY for LLM-powered selection.",
450
+ tools: heuristicResults.map((r) => ({
451
+ name: r.name,
452
+ category: r.category,
453
+ score: r.score,
454
+ quickRef: r.quickRef,
455
+ })),
456
+ _hint: "For better accuracy on ambiguous queries, set an API key to enable LLM-powered selection.",
457
+ };
458
+ }
459
+ try {
460
+ let responseText = "";
461
+ if (selectedProvider === "gemini" && geminiKey) {
462
+ const resp = await fetch(`https://generativelanguage.googleapis.com/v1beta/models/gemini-2.0-flash:generateContent?key=${geminiKey}`, {
463
+ method: "POST",
464
+ headers: { "Content-Type": "application/json" },
465
+ body: JSON.stringify({
466
+ contents: [{ parts: [{ text: `${systemPrompt}\n\n${userPrompt}` }] }],
467
+ generationConfig: { temperature: 0, maxOutputTokens: 512 },
468
+ }),
469
+ });
470
+ const data = await resp.json();
471
+ responseText = data?.candidates?.[0]?.content?.parts?.[0]?.text ?? "";
472
+ }
473
+ else if (selectedProvider === "openai" && openaiKey) {
474
+ const resp = await fetch("https://api.openai.com/v1/chat/completions", {
475
+ method: "POST",
476
+ headers: { "Content-Type": "application/json", Authorization: `Bearer ${openaiKey}` },
477
+ body: JSON.stringify({
478
+ model: "gpt-4o-mini",
479
+ messages: [
480
+ { role: "system", content: systemPrompt },
481
+ { role: "user", content: userPrompt },
482
+ ],
483
+ temperature: 0,
484
+ max_tokens: 512,
485
+ }),
486
+ });
487
+ const data = await resp.json();
488
+ responseText = data?.choices?.[0]?.message?.content ?? "";
489
+ }
490
+ else if (selectedProvider === "anthropic" && anthropicKey) {
491
+ const resp = await fetch("https://api.anthropic.com/v1/messages", {
492
+ method: "POST",
493
+ headers: {
494
+ "Content-Type": "application/json",
495
+ "x-api-key": anthropicKey,
496
+ "anthropic-version": "2023-06-01",
497
+ },
498
+ body: JSON.stringify({
499
+ model: "claude-3-5-haiku-latest",
500
+ max_tokens: 512,
501
+ system: systemPrompt,
502
+ messages: [{ role: "user", content: userPrompt }],
503
+ }),
504
+ });
505
+ const data = await resp.json();
506
+ responseText = data?.content?.[0]?.text ?? "";
507
+ }
508
+ // Parse the JSON array from the response
509
+ const jsonMatch = responseText.match(/\[[\s\S]*?\]/);
510
+ if (!jsonMatch) {
511
+ return { error: true, message: "LLM did not return a valid JSON array", raw: responseText.slice(0, 200) };
512
+ }
513
+ const selectedNames = JSON.parse(jsonMatch[0]);
514
+ // Enrich with registry metadata
515
+ const enriched = selectedNames
516
+ .map(name => {
517
+ const entry = TOOL_REGISTRY.get(name);
518
+ if (!entry)
519
+ return null;
520
+ return {
521
+ name: entry.name,
522
+ category: entry.category,
523
+ phase: entry.phase,
524
+ tags: entry.tags,
525
+ quickRef: entry.quickRef,
526
+ loaded: allTools.some(t => t.name === name),
527
+ };
528
+ })
529
+ .filter(Boolean);
530
+ // Identify toolsets to load
531
+ const unloadedToolsets = new Map();
532
+ for (const tool of enriched) {
533
+ if (tool && !tool.loaded) {
534
+ const ts = TOOL_TO_TOOLSET.get(tool.name);
535
+ if (ts) {
536
+ const list = unloadedToolsets.get(ts) ?? [];
537
+ list.push(tool.name);
538
+ unloadedToolsets.set(ts, list);
539
+ }
540
+ }
541
+ }
542
+ return {
543
+ method: `llm_${selectedProvider}`,
544
+ task,
545
+ selectedTools: enriched,
546
+ toolCount: enriched.length,
547
+ ...(unloadedToolsets.size > 0 ? {
548
+ _loadSuggestions: [...unloadedToolsets.entries()].map(([ts, tools]) => ({
549
+ toolset: ts,
550
+ matchingTools: tools,
551
+ action: `Call load_toolset("${ts}") to activate ${tools.length} tool(s).`,
552
+ })),
553
+ } : {}),
554
+ _hint: enriched.length > 0
555
+ ? `Top pick: ${enriched[0].name}. ${enriched[0].quickRef.nextAction}`
556
+ : "No tools selected. Try rephrasing your task.",
557
+ };
558
+ }
559
+ catch (err) {
560
+ return {
561
+ error: true,
562
+ method: `llm_${selectedProvider}`,
563
+ message: `LLM call failed: ${err.message}`,
564
+ _hint: "Falling back to heuristic search. Check your API key.",
565
+ };
566
+ }
567
+ },
568
+ },
569
+ {
570
+ name: "get_ab_test_report",
571
+ description: "Generate an A/B test comparison report for static vs dynamic toolset loading. Shows session counts, tool counts, load events, error rates, and per-toolset load frequency. Use after running sessions in both modes to evaluate the impact of dynamic loading.",
572
+ inputSchema: {
573
+ type: "object",
574
+ properties: {
575
+ detailed: {
576
+ type: "boolean",
577
+ description: "Include per-session breakdown (default: false, summary only)",
578
+ },
579
+ },
580
+ },
581
+ handler: async (args) => {
582
+ const db = getDb();
583
+ const detailed = args.detailed === true;
584
+ // Session-level aggregates by mode
585
+ const sessionSummary = db.prepare(`
586
+ SELECT
587
+ mode,
588
+ COUNT(*) as sessions,
589
+ ROUND(AVG(initial_tool_count), 1) as avg_initial_tools,
590
+ ROUND(AVG(COALESCE(final_tool_count, initial_tool_count)), 1) as avg_final_tools,
591
+ ROUND(AVG(COALESCE(total_tool_calls, 0)), 1) as avg_tool_calls,
592
+ ROUND(AVG(COALESCE(total_load_events, 0)), 1) as avg_load_events,
593
+ ROUND(AVG(COALESCE(session_duration_ms, 0)) / 1000.0, 1) as avg_duration_sec,
594
+ SUM(COALESCE(total_tool_calls, 0)) as total_calls,
595
+ SUM(COALESCE(total_load_events, 0)) as total_loads
596
+ FROM ab_test_sessions
597
+ GROUP BY mode
598
+ `).all();
599
+ // Error rate by mode (join with tool_call_log)
600
+ const errorRates = db.prepare(`
601
+ SELECT
602
+ s.mode,
603
+ COUNT(CASE WHEN t.result_status = 'error' THEN 1 END) as errors,
604
+ COUNT(*) as total_calls,
605
+ ROUND(100.0 * COUNT(CASE WHEN t.result_status = 'error' THEN 1 END) / MAX(COUNT(*), 1), 2) as error_pct
606
+ FROM tool_call_log t
607
+ JOIN ab_test_sessions s ON t.session_id = s.id
608
+ GROUP BY s.mode
609
+ `).all();
610
+ // Top loaded toolsets (dynamic mode)
611
+ const topToolsets = db.prepare(`
612
+ SELECT
613
+ toolset_name,
614
+ COUNT(*) as load_count,
615
+ ROUND(AVG(latency_ms), 1) as avg_latency_ms
616
+ FROM ab_tool_events
617
+ WHERE event_type = 'load'
618
+ GROUP BY toolset_name
619
+ ORDER BY load_count DESC
620
+ LIMIT 10
621
+ `).all();
622
+ // Current session info
623
+ const currentSession = {
624
+ sessionId: SESSION_ID,
625
+ mode: useDynamicLoading ? "dynamic" : "static",
626
+ preset: currentPreset,
627
+ toolCalls: _abToolCallCount,
628
+ loadEvents: _abLoadEventCount,
629
+ activeTools: allTools.length,
630
+ durationSec: Math.round((Date.now() - _abStartMs) / 1000),
631
+ dynamicallyLoaded: [...activeToolsets].filter(ts => !initialToolsetNames.has(ts)),
632
+ };
633
+ // Optional per-session detail
634
+ let sessions = [];
635
+ if (detailed) {
636
+ sessions = db.prepare(`
637
+ SELECT id, mode, initial_preset, initial_tool_count, final_tool_count,
638
+ toolsets_loaded, total_tool_calls, total_load_events,
639
+ session_duration_ms, created_at, ended_at
640
+ FROM ab_test_sessions
641
+ ORDER BY created_at DESC
642
+ LIMIT 50
643
+ `).all();
644
+ }
645
+ // Build verdict
646
+ const staticSummary = sessionSummary.find((s) => s.mode === "static");
647
+ const dynamicSummary = sessionSummary.find((s) => s.mode === "dynamic");
648
+ let verdict = "Insufficient data. Run sessions in both modes to compare.";
649
+ if (staticSummary && dynamicSummary) {
650
+ const toolDiff = (staticSummary.avg_final_tools ?? 0) - (dynamicSummary.avg_final_tools ?? 0);
651
+ const staticErr = errorRates.find((e) => e.mode === "static");
652
+ const dynamicErr = errorRates.find((e) => e.mode === "dynamic");
653
+ const errDiff = (staticErr?.error_pct ?? 0) - (dynamicErr?.error_pct ?? 0);
654
+ verdict = [
655
+ `Static: ${staticSummary.sessions} sessions, avg ${staticSummary.avg_final_tools} tools, ${staticErr?.error_pct ?? "?"}% error rate.`,
656
+ `Dynamic: ${dynamicSummary.sessions} sessions, avg ${dynamicSummary.avg_final_tools} tools, ${dynamicErr?.error_pct ?? "?"}% error rate.`,
657
+ toolDiff > 0 ? `Dynamic uses ${toolDiff.toFixed(1)} fewer tools on average.` : "",
658
+ errDiff > 0 ? `Dynamic has ${errDiff.toFixed(2)}pp lower error rate.` : errDiff < 0 ? `Static has ${(-errDiff).toFixed(2)}pp lower error rate.` : "",
659
+ dynamicSummary.avg_load_events > 0 ? `Agents loaded ${dynamicSummary.avg_load_events} toolsets per session on average.` : "",
660
+ ].filter(Boolean).join(" ");
661
+ }
662
+ return {
663
+ verdict,
664
+ sessionSummary,
665
+ errorRates,
666
+ topLoadedToolsets: topToolsets,
667
+ currentSession,
668
+ ...(detailed ? { sessions } : {}),
669
+ _hint: sessionSummary.length < 2
670
+ ? "Run sessions with both `npx nodebench-mcp` (static) and `npx nodebench-mcp --dynamic` (dynamic) to compare."
671
+ : "Compare avg_final_tools and error_pct between modes to evaluate dynamic loading impact.",
672
+ };
673
+ },
674
+ },
675
+ ];
676
+ // Combined list of every registered tool; declared with `let` so rebuildAllTools() can reassign it for dynamic loading
677
+ let allTools = [...allToolsWithoutDiscovery, ...discoveryTools, ...dynamicLoadingTools];
186
678
  // Background: initialize embedding index for semantic search (non-blocking)
187
679
  // Uses Agent-as-a-Graph bipartite corpus: tool nodes + domain nodes for graph-aware retrieval
188
680
  if (useEmbedding) {
@@ -224,15 +716,25 @@ if (useEmbedding) {
224
716
  /* Embedding init failed — semantic search stays disabled, no impact on other features */
225
717
  });
226
718
  }
227
- // Build a lookup map for fast tool dispatch
228
- const toolMap = new Map();
719
// Name → tool index used by the tools/call dispatcher. Declared with `let`
// because rebuildAllTools() swaps in a fresh Map when the tool list changes.
// On duplicate names the later tool in allTools wins.
let toolMap = new Map(allTools.map((entry) => [entry.name, entry]));
724
/**
 * Recompute the module-level `allTools` list and `toolMap` index from the
 * three source arrays (base tools, discovery tools, dynamic-loading tools).
 * Both bindings are reassigned to fresh objects; on duplicate names the
 * later tool wins in the map.
 */
function rebuildAllTools() {
    allTools = [...allToolsWithoutDiscovery, ...discoveryTools, ...dynamicLoadingTools];
    toolMap = new Map();
    allTools.forEach((entry) => {
        toolMap.set(entry.name, entry);
    });
}
232
732
  // Auto-instrumentation: generate a session ID per MCP connection
233
733
  const SESSION_ID = genId("mcp");
234
- // Tools to skip auto-logging (avoid infinite recursion and noise)
235
- const SKIP_AUTO_LOG = new Set(["log_tool_call", "get_trajectory_analysis", "get_self_eval_report", "get_improvement_recommendations", "cleanup_stale_runs", "synthesize_recon_to_learnings"]);
734
+ // A/B test counters for this session: tool calls, load/unload_toolset calls, and start time (written to ab_test_sessions every 5 calls and on exit)
735
+ let _abToolCallCount = 0;
736
+ let _abLoadEventCount = 0;
737
+ const _abStartMs = Date.now();
236
738
  // ── Lightweight hooks: auto-save + attention refresh reminders ─────────
237
739
  const _hookState = {
238
740
  totalCalls: 0,
@@ -789,7 +1291,34 @@ artifacts (findings, risks, gaps, tests, evals, gates, learnings) that compound
789
1291
  ],
790
1292
  },
791
1293
  ];
792
- const server = new Server({ name: "nodebench-mcp-methodology", version: "2.16.0" }, { capabilities: { tools: {}, prompts: {} } });
1294
+ // Server instructions tell Claude Code Tool Search (and other clients) when to search
1295
+ // for NodeBench tools; the string is handed to the Server constructor as `instructions`. This is the key integration point for lazy loading compatibility.
1296
+ // See: https://www.anthropic.com/engineering/advanced-tool-use
1297
+ const SERVER_INSTRUCTIONS = `NodeBench MCP provides structured AI development methodology tools.
1298
+ Use NodeBench tools when you need to:
1299
+ - Verify implementations (verification cycles, gap tracking, 6-phase flywheel)
1300
+ - Run evaluations and quality gates before shipping code
1301
+ - Search prior knowledge and record learnings across sessions
1302
+ - Assess risk before taking actions
1303
+ - Coordinate parallel agents (task locks, roles, context budget)
1304
+ - Research with structured recon (web search, GitHub, RSS feeds)
1305
+ - Analyze files (CSV, PDF, XLSX, images, audio, ZIP)
1306
+ - Run security audits (dependency scanning, code analysis, secrets detection)
1307
+ - Write and polish academic papers
1308
+ - Audit SEO, analyze Figma flows, detect Android flicker
1309
+ - Call LLMs (GPT, Claude, Gemini) for analysis and extraction
1310
+ Start with discover_tools("<your task>") to find the right tool.`;
1311
+ const server = new Server({ name: "nodebench-mcp-methodology", version: "2.18.1" }, {
1312
+ capabilities: { tools: { listChanged: true }, prompts: {} },
1313
+ instructions: SERVER_INSTRUCTIONS,
1314
+ });
1315
// ── A/B Test Session Tracking ─────────────────────────────────────────
// Persist a start-of-session row (id, mode, preset, initial tool count) so
// static and dynamic runs can be compared later. Best-effort only: any
// failure is ignored so the server still starts.
try {
    const sessionMode = useDynamicLoading ? 'dynamic' : 'static';
    const insertSession = getDb().prepare("INSERT INTO ab_test_sessions (id, mode, initial_preset, initial_tool_count, created_at) VALUES (?, ?, ?, ?, datetime('now'))");
    insertSession.run(SESSION_ID, sessionMode, currentPreset, allTools.length);
}
catch { /* instrumentation must not block server start */ }
793
1322
  // Handle tools/list — return all tools with their JSON Schema inputSchemas
794
1323
  // Includes MCP 2025-11-25 spec annotations: category, phase, complexity (model tier hint)
795
1324
  server.setRequestHandler(ListToolsRequestSchema, async () => {
@@ -815,6 +1344,9 @@ server.setRequestHandler(ListToolsRequestSchema, async () => {
815
1344
  // Handle tools/call — dispatch to the matching tool handler (auto-instrumented)
816
1345
  server.setRequestHandler(CallToolRequestSchema, async (request) => {
817
1346
  const { name, arguments: args } = request.params;
1347
+ _abToolCallCount++;
1348
+ if (name === "load_toolset" || name === "unload_toolset")
1349
+ _abLoadEventCount++;
818
1350
  const tool = toolMap.get(name);
819
1351
  if (!tool) {
820
1352
  return {
@@ -832,7 +1364,7 @@ server.setRequestHandler(CallToolRequestSchema, async (request) => {
832
1364
  resultStatus = "error";
833
1365
  errorMsg = result.message ?? "soft error";
834
1366
  }
835
- // Auto-log (skip self-eval tools to avoid recursion/noise)
1367
+ // Auto-log to main DB (skip self-eval tools to avoid recursion/noise)
836
1368
  if (!SKIP_AUTO_LOG.has(name)) {
837
1369
  try {
838
1370
  const db = getDb();
@@ -840,6 +1372,17 @@ server.setRequestHandler(CallToolRequestSchema, async (request) => {
840
1372
  }
841
1373
  catch { /* never let instrumentation break tool dispatch */ }
842
1374
  }
1375
+ // Auto-log to analytics tracker
1376
+ tracker.record(name, startMs, resultStatus === "success", errorMsg, args);
1377
+ // Inline A/B session counter update (every 5 calls — amortized cost)
1378
+ if (_abToolCallCount % 5 === 0) {
1379
+ try {
1380
+ const db2 = getDb();
1381
+ const dynamicallyLoaded = [...activeToolsets].filter(ts => !initialToolsetNames.has(ts));
1382
+ db2.prepare("UPDATE ab_test_sessions SET total_tool_calls = ?, total_load_events = ?, final_tool_count = ?, toolsets_loaded = ? WHERE id = ?").run(_abToolCallCount, _abLoadEventCount, allTools.length, JSON.stringify(dynamicallyLoaded), SESSION_ID);
1383
+ }
1384
+ catch { /* instrumentation */ }
1385
+ }
843
1386
  // Tools with rawContent return ContentBlock[] directly (e.g. image captures)
844
1387
  if (tool.rawContent && Array.isArray(result)) {
845
1388
  return { content: result, isError: false };
@@ -881,7 +1424,7 @@ server.setRequestHandler(CallToolRequestSchema, async (request) => {
881
1424
  catch (err) {
882
1425
  resultStatus = "error";
883
1426
  errorMsg = err?.message || "Internal error";
884
- // Auto-log errors
1427
+ // Auto-log errors to main DB
885
1428
  if (!SKIP_AUTO_LOG.has(name)) {
886
1429
  try {
887
1430
  const db = getDb();
@@ -889,6 +1432,8 @@ server.setRequestHandler(CallToolRequestSchema, async (request) => {
889
1432
  }
890
1433
  catch { /* never let instrumentation break tool dispatch */ }
891
1434
  }
1435
+ // Auto-log error to analytics tracker
1436
+ tracker.record(name, startMs, false, errorMsg, args);
892
1437
  return {
893
1438
  content: [{ type: "text", text: errorMsg }],
894
1439
  isError: true,
@@ -920,6 +1465,24 @@ server.setRequestHandler(GetPromptRequestSchema, async (request) => {
920
1465
  messages,
921
1466
  };
922
1467
  });
1468
// Graceful shutdown: close analytics tracker + finalize A/B session on exit.
// Only synchronous work is possible inside an 'exit' listener.
// NOTE(review): Node emits 'exit' on process.exit() or when the event loop
// drains, not on default signal termination — confirm SIGINT/SIGTERM are
// routed to process.exit() elsewhere if the finalizer must run in that case.
process.on('exit', () => {
    // Guarded separately so a failing tracker cannot skip the session
    // finalization below or surface as an uncaught exception during exit.
    try {
        tracker.close();
    }
    catch { /* instrumentation must not block shutdown */ }
    // Finalize A/B test session with aggregate metrics
    try {
        const db = getDb();
        // Toolsets active now that were not part of the initial preset.
        const dynamicallyLoaded = [...activeToolsets].filter(ts => !initialToolsetNames.has(ts));
        db.prepare(`UPDATE ab_test_sessions SET
            final_tool_count = ?,
            toolsets_loaded = ?,
            total_tool_calls = ?,
            total_load_events = ?,
            session_duration_ms = ?,
            ended_at = datetime('now')
            WHERE id = ?`).run(allTools.length, JSON.stringify(dynamicallyLoaded), _abToolCallCount, _abLoadEventCount, Date.now() - _abStartMs, SESSION_ID);
    }
    catch { /* instrumentation must not block shutdown */ }
});
923
1486
  // Connect via stdio
924
1487
  const transport = new StdioServerTransport();
925
1488
  await server.connect(transport);