nodebench-mcp 2.17.0 → 2.18.1
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/LICENSE +21 -0
- package/NODEBENCH_AGENTS.md +2 -2
- package/README.md +516 -82
- package/dist/__tests__/analytics.test.d.ts +11 -0
- package/dist/__tests__/analytics.test.js +546 -0
- package/dist/__tests__/analytics.test.js.map +1 -0
- package/dist/__tests__/dynamicLoading.test.d.ts +1 -0
- package/dist/__tests__/dynamicLoading.test.js +278 -0
- package/dist/__tests__/dynamicLoading.test.js.map +1 -0
- package/dist/__tests__/evalHarness.test.js +1 -1
- package/dist/__tests__/evalHarness.test.js.map +1 -1
- package/dist/__tests__/helpers/answerMatch.js +22 -22
- package/dist/__tests__/presetRealWorldBench.test.js +9 -0
- package/dist/__tests__/presetRealWorldBench.test.js.map +1 -1
- package/dist/__tests__/tools.test.js +1 -1
- package/dist/__tests__/toolsetGatingEval.test.js +9 -1
- package/dist/__tests__/toolsetGatingEval.test.js.map +1 -1
- package/dist/analytics/index.d.ts +10 -0
- package/dist/analytics/index.js +11 -0
- package/dist/analytics/index.js.map +1 -0
- package/dist/analytics/projectDetector.d.ts +19 -0
- package/dist/analytics/projectDetector.js +259 -0
- package/dist/analytics/projectDetector.js.map +1 -0
- package/dist/analytics/schema.d.ts +57 -0
- package/dist/analytics/schema.js +157 -0
- package/dist/analytics/schema.js.map +1 -0
- package/dist/analytics/smartPreset.d.ts +63 -0
- package/dist/analytics/smartPreset.js +300 -0
- package/dist/analytics/smartPreset.js.map +1 -0
- package/dist/analytics/toolTracker.d.ts +59 -0
- package/dist/analytics/toolTracker.js +163 -0
- package/dist/analytics/toolTracker.js.map +1 -0
- package/dist/analytics/usageStats.d.ts +64 -0
- package/dist/analytics/usageStats.js +252 -0
- package/dist/analytics/usageStats.js.map +1 -0
- package/dist/db.js +359 -321
- package/dist/db.js.map +1 -1
- package/dist/index.d.ts +2 -1
- package/dist/index.js +652 -89
- package/dist/index.js.map +1 -1
- package/dist/tools/architectTools.js +13 -13
- package/dist/tools/critterTools.js +14 -14
- package/dist/tools/parallelAgentTools.js +176 -176
- package/dist/tools/patternTools.js +11 -11
- package/dist/tools/progressiveDiscoveryTools.d.ts +5 -1
- package/dist/tools/progressiveDiscoveryTools.js +111 -19
- package/dist/tools/progressiveDiscoveryTools.js.map +1 -1
- package/dist/tools/researchWritingTools.js +42 -42
- package/dist/tools/rssTools.js +396 -396
- package/dist/tools/toolRegistry.d.ts +17 -0
- package/dist/tools/toolRegistry.js +65 -17
- package/dist/tools/toolRegistry.js.map +1 -1
- package/dist/tools/voiceBridgeTools.js +498 -498
- package/dist/toolsetRegistry.d.ts +10 -0
- package/dist/toolsetRegistry.js +84 -0
- package/dist/toolsetRegistry.js.map +1 -0
- package/package.json +4 -4
package/dist/index.js
CHANGED
|
@@ -20,42 +20,14 @@ import { Server } from "@modelcontextprotocol/sdk/server/index.js";
|
|
|
20
20
|
import { StdioServerTransport } from "@modelcontextprotocol/sdk/server/stdio.js";
|
|
21
21
|
import { ListToolsRequestSchema, CallToolRequestSchema, ListPromptsRequestSchema, GetPromptRequestSchema, } from "@modelcontextprotocol/sdk/types.js";
|
|
22
22
|
import { getDb, genId } from "./db.js";
|
|
23
|
-
import {
|
|
24
|
-
import {
|
|
25
|
-
import {
|
|
26
|
-
import {
|
|
27
|
-
import {
|
|
28
|
-
import { reconTools } from "./tools/reconTools.js";
|
|
29
|
-
import { uiCaptureTools } from "./tools/uiCaptureTools.js";
|
|
30
|
-
import { visionTools } from "./tools/visionTools.js";
|
|
31
|
-
import { webTools } from "./tools/webTools.js";
|
|
32
|
-
import { githubTools } from "./tools/githubTools.js";
|
|
33
|
-
import { documentationTools } from "./tools/documentationTools.js";
|
|
34
|
-
import { agentBootstrapTools } from "./tools/agentBootstrapTools.js";
|
|
35
|
-
import { selfEvalTools } from "./tools/selfEvalTools.js";
|
|
36
|
-
import { parallelAgentTools } from "./tools/parallelAgentTools.js";
|
|
37
|
-
import { llmTools } from "./tools/llmTools.js";
|
|
38
|
-
import { securityTools } from "./tools/securityTools.js";
|
|
39
|
-
import { platformTools } from "./tools/platformTools.js";
|
|
40
|
-
import { researchWritingTools } from "./tools/researchWritingTools.js";
|
|
41
|
-
import { flickerDetectionTools } from "./tools/flickerDetectionTools.js";
|
|
42
|
-
import { figmaFlowTools } from "./tools/figmaFlowTools.js";
|
|
23
|
+
import { getAnalyticsDb, closeAnalyticsDb, clearOldRecords } from "./analytics/index.js";
|
|
24
|
+
import { AnalyticsTracker } from "./analytics/toolTracker.js";
|
|
25
|
+
import { generateSmartPreset, formatPresetRecommendation, listPresets } from "./analytics/index.js";
|
|
26
|
+
import { getProjectUsageSummary, exportUsageStats, formatStatsDisplay } from "./analytics/index.js";
|
|
27
|
+
import { TOOLSET_MAP, TOOL_TO_TOOLSET } from "./toolsetRegistry.js";
|
|
43
28
|
import { createMetaTools } from "./tools/metaTools.js";
|
|
44
|
-
import { localFileTools, gaiaMediaSolvers } from "./tools/localFileTools.js";
|
|
45
29
|
import { createProgressiveDiscoveryTools } from "./tools/progressiveDiscoveryTools.js";
|
|
46
|
-
import {
|
|
47
|
-
import { cCompilerBenchmarkTools } from "./tools/cCompilerBenchmarkTools.js";
|
|
48
|
-
import { sessionMemoryTools } from "./tools/sessionMemoryTools.js";
|
|
49
|
-
import { patternTools } from "./tools/patternTools.js";
|
|
50
|
-
import { gitWorkflowTools } from "./tools/gitWorkflowTools.js";
|
|
51
|
-
import { seoTools } from "./tools/seoTools.js";
|
|
52
|
-
import { voiceBridgeTools } from "./tools/voiceBridgeTools.js";
|
|
53
|
-
import { critterTools } from "./tools/critterTools.js";
|
|
54
|
-
import { emailTools } from "./tools/emailTools.js";
|
|
55
|
-
import { rssTools } from "./tools/rssTools.js";
|
|
56
|
-
import { architectTools } from "./tools/architectTools.js";
|
|
57
|
-
import { getQuickRef, ALL_REGISTRY_ENTRIES, TOOL_REGISTRY, getToolComplexity, _setDbAccessor } from "./tools/toolRegistry.js";
|
|
58
|
-
import { toonTools } from "./tools/toonTools.js";
|
|
30
|
+
import { getQuickRef, ALL_REGISTRY_ENTRIES, TOOL_REGISTRY, getToolComplexity, _setDbAccessor, hybridSearch } from "./tools/toolRegistry.js";
|
|
59
31
|
// TOON format — ~40% token savings on tool responses
|
|
60
32
|
import { encode as toonEncode } from "@toon-format/toon";
|
|
61
33
|
// Embedding provider — neural semantic search
|
|
@@ -64,48 +36,38 @@ import { initEmbeddingIndex } from "./tools/embeddingProvider.js";
|
|
|
64
36
|
const cliArgs = process.argv.slice(2);
|
|
65
37
|
const useToon = !cliArgs.includes("--no-toon");
|
|
66
38
|
const useEmbedding = !cliArgs.includes("--no-embedding");
|
|
67
|
-
const
|
|
68
|
-
|
|
69
|
-
|
|
70
|
-
|
|
71
|
-
|
|
72
|
-
|
|
73
|
-
|
|
74
|
-
ui_capture: uiCaptureTools,
|
|
75
|
-
vision: visionTools,
|
|
76
|
-
local_file: localFileTools,
|
|
77
|
-
web: webTools,
|
|
78
|
-
github: githubTools,
|
|
79
|
-
docs: documentationTools,
|
|
80
|
-
bootstrap: agentBootstrapTools,
|
|
81
|
-
self_eval: selfEvalTools,
|
|
82
|
-
parallel: parallelAgentTools,
|
|
83
|
-
llm: llmTools,
|
|
84
|
-
security: securityTools,
|
|
85
|
-
platform: platformTools,
|
|
86
|
-
research_writing: researchWritingTools,
|
|
87
|
-
flicker_detection: flickerDetectionTools,
|
|
88
|
-
figma_flow: figmaFlowTools,
|
|
89
|
-
boilerplate: boilerplateTools,
|
|
90
|
-
benchmark: cCompilerBenchmarkTools,
|
|
91
|
-
session_memory: sessionMemoryTools,
|
|
92
|
-
gaia_solvers: gaiaMediaSolvers,
|
|
93
|
-
toon: toonTools,
|
|
94
|
-
pattern: patternTools,
|
|
95
|
-
git_workflow: gitWorkflowTools,
|
|
96
|
-
seo: seoTools,
|
|
97
|
-
voice_bridge: voiceBridgeTools,
|
|
98
|
-
critter: critterTools,
|
|
99
|
-
email: emailTools,
|
|
100
|
-
rss: rssTools,
|
|
101
|
-
architect: architectTools,
|
|
102
|
-
};
|
|
39
|
+
const useSmartPreset = cliArgs.includes("--smart-preset");
|
|
40
|
+
const showStats = cliArgs.includes("--stats");
|
|
41
|
+
const exportStats = cliArgs.includes("--export-stats");
|
|
42
|
+
const resetStats = cliArgs.includes("--reset-stats");
|
|
43
|
+
const listPresetsFlag = cliArgs.includes("--list-presets");
|
|
44
|
+
export { TOOLSET_MAP };
|
|
45
|
+
const DEFAULT_TOOLSETS = ["verification", "eval", "quality_gate", "learning", "flywheel", "recon", "security", "boilerplate"];
|
|
103
46
|
const PRESETS = {
|
|
104
|
-
|
|
105
|
-
|
|
106
|
-
|
|
47
|
+
default: DEFAULT_TOOLSETS,
|
|
48
|
+
// Themed presets — bridge between default (50 tools) and full (175 tools)
|
|
49
|
+
web_dev: [...DEFAULT_TOOLSETS, "ui_capture", "vision", "web", "seo", "git_workflow", "architect"],
|
|
50
|
+
research: [...DEFAULT_TOOLSETS, "web", "llm", "rss", "email", "docs"],
|
|
51
|
+
data: [...DEFAULT_TOOLSETS, "local_file", "llm", "web"],
|
|
52
|
+
devops: [...DEFAULT_TOOLSETS, "git_workflow", "session_memory", "benchmark", "pattern"],
|
|
53
|
+
mobile: [...DEFAULT_TOOLSETS, "ui_capture", "vision", "flicker_detection"],
|
|
54
|
+
academic: [...DEFAULT_TOOLSETS, "research_writing", "llm", "web", "local_file"],
|
|
55
|
+
multi_agent: [...DEFAULT_TOOLSETS, "parallel", "self_eval", "session_memory", "pattern", "toon"],
|
|
56
|
+
content: [...DEFAULT_TOOLSETS, "llm", "critter", "email", "rss", "platform", "architect"],
|
|
107
57
|
full: Object.keys(TOOLSET_MAP),
|
|
108
58
|
};
|
|
59
|
+
/**
 * One-line, human-readable summary per preset name.
 * Read by key when the --help preset table is rendered; a missing key
 * is tolerated there via a `?? ''` fallback.
 */
const PRESET_DESCRIPTIONS = Object.fromEntries([
    ["default", "Core AI Flywheel — verification, eval, quality gates, learning, recon"],
    ["web_dev", "Web projects — adds visual QA, SEO audit, git workflow, code architecture"],
    ["research", "Research workflows — adds web search, LLM calls, RSS feeds, email, docs"],
    ["data", "Data analysis — adds CSV/XLSX/PDF/JSON parsing, LLM extraction, web fetch"],
    ["devops", "CI/CD & ops — adds git compliance, session memory, benchmarks, pattern mining"],
    ["mobile", "Mobile apps — adds screenshot capture, vision analysis, flicker detection"],
    ["academic", "Academic papers — adds polish, review, translate, logic check, data analysis"],
    ["multi_agent", "Multi-agent teams — adds task locking, messaging, roles, oracle testing"],
    ["content", "Content & publishing — adds LLM, accountability, email, RSS, platform queue"],
    ["full", "Everything — all toolsets for maximum coverage"],
]);
|
|
109
71
|
function parseToolsets() {
|
|
110
72
|
if (cliArgs.includes("--help")) {
|
|
111
73
|
const lines = [
|
|
@@ -114,9 +76,15 @@ function parseToolsets() {
|
|
|
114
76
|
"Usage: nodebench-mcp [options]",
|
|
115
77
|
"",
|
|
116
78
|
"Options:",
|
|
117
|
-
" --toolsets <list> Comma-separated toolsets to enable (default:
|
|
79
|
+
" --toolsets <list> Comma-separated toolsets to enable (default: default)",
|
|
118
80
|
" --exclude <list> Comma-separated toolsets to exclude",
|
|
119
|
-
" --preset <name> Use a preset:
|
|
81
|
+
" --preset <name> Use a preset: default or full",
|
|
82
|
+
" --smart-preset Generate smart preset recommendation based on project type and usage history",
|
|
83
|
+
" --stats Show usage statistics for current project",
|
|
84
|
+
" --export-stats Export usage statistics to JSON",
|
|
85
|
+
" --reset-stats Clear all usage analytics data",
|
|
86
|
+
" --list-presets List all available presets with descriptions",
|
|
87
|
+
" --dynamic Enable dynamic toolset loading (Search+Load pattern from arxiv 2509.20386)",
|
|
120
88
|
" --no-toon Disable TOON encoding (TOON is on by default for ~40% token savings)",
|
|
121
89
|
" --no-embedding Disable neural embedding search (uses local HuggingFace model or API keys)",
|
|
122
90
|
" --help Show this help and exit",
|
|
@@ -125,10 +93,20 @@ function parseToolsets() {
|
|
|
125
93
|
...Object.entries(TOOLSET_MAP).map(([k, v]) => ` ${k.padEnd(16)} ${v.length} tools`),
|
|
126
94
|
"",
|
|
127
95
|
"Presets:",
|
|
128
|
-
...Object.entries(PRESETS).map(([k, v]) =>
|
|
96
|
+
...Object.entries(PRESETS).map(([k, v]) => {
|
|
97
|
+
const count = v.reduce((s, ts) => s + (TOOLSET_MAP[ts]?.length ?? 0), 0) + 12;
|
|
98
|
+
return ` ${k.padEnd(14)} ${String(count).padStart(3)} tools ${PRESET_DESCRIPTIONS[k] ?? ''}`;
|
|
99
|
+
}),
|
|
129
100
|
"",
|
|
130
101
|
"Examples:",
|
|
131
|
-
" npx nodebench-mcp
|
|
102
|
+
" npx nodebench-mcp # Default (50 tools) - core AI Flywheel",
|
|
103
|
+
" npx nodebench-mcp --preset web_dev # Web development (+ vision, SEO, git)",
|
|
104
|
+
" npx nodebench-mcp --preset research # Research workflows (+ web, LLM, RSS, email)",
|
|
105
|
+
" npx nodebench-mcp --preset data # Data analysis (+ local file parsing, LLM)",
|
|
106
|
+
" npx nodebench-mcp --preset academic # Academic writing (+ paper tools, LLM)",
|
|
107
|
+
" npx nodebench-mcp --preset full # All 175 tools",
|
|
108
|
+
" npx nodebench-mcp --smart-preset # Get AI-powered preset recommendation",
|
|
109
|
+
" npx nodebench-mcp --stats # Show usage statistics",
|
|
132
110
|
" npx nodebench-mcp --toolsets verification,eval,recon",
|
|
133
111
|
" npx nodebench-mcp --exclude vision,ui_capture,parallel",
|
|
134
112
|
"",
|
|
@@ -170,19 +148,533 @@ function parseToolsets() {
|
|
|
170
148
|
.filter(([k]) => !excluded.has(k))
|
|
171
149
|
.flatMap(([, v]) => v);
|
|
172
150
|
}
|
|
173
|
-
|
|
151
|
+
// Default to default preset (50 tools - complete AI Flywheel)
|
|
152
|
+
return PRESETS.default.flatMap((k) => TOOLSET_MAP[k] ?? []);
|
|
153
|
+
}
|
|
154
|
+
// ── Analytics CLI flag handling ─────────────────────────────────────────
|
|
155
|
+
// Handle --list-presets
|
|
156
|
+
// --list-presets is a run-and-exit command: dump the preset catalogue
// as pretty-printed JSON on stdout and stop before any server setup.
if (listPresetsFlag) {
    console.log(JSON.stringify(listPresets(TOOLSET_MAP), null, 2));
    process.exit(0);
}
|
|
161
|
+
// ── Analytics CLI handlers (run-and-exit) ───────────────────────────────
|
|
162
|
+
// Run-and-exit analytics commands. Exactly one action runs per invocation,
// chosen in this priority order: --reset-stats, --smart-preset, --stats,
// --export-stats. The analytics DB is always closed before exiting.
if (resetStats || useSmartPreset || showStats || exportStats) {
    const analyticsDb = getAnalyticsDb();
    const runAnalyticsCommand = () => {
        if (resetStats) {
            clearOldRecords(analyticsDb, 0);
            console.error("Usage analytics data cleared (tool_usage + cache). Project context and preset history preserved.");
            return;
        }
        if (useSmartPreset) {
            const recommendation = generateSmartPreset(analyticsDb, TOOLSET_MAP);
            console.error(formatPresetRecommendation(recommendation, TOOLSET_MAP));
            return;
        }
        if (showStats) {
            // Stats cover the current working directory over a 30-day window.
            const summary = getProjectUsageSummary(analyticsDb, process.cwd(), 30);
            console.error(summary
                ? formatStatsDisplay(summary, process.cwd())
                : "No usage data available for this project in the last 30 days.");
            return;
        }
        // Only --export-stats can remain here; its output is the one that goes to stdout.
        console.log(exportUsageStats(analyticsDb, process.cwd(), 30));
    };
    try {
        runAnalyticsCommand();
    }
    finally {
        closeAnalyticsDb(analyticsDb);
    }
    process.exit(0);
}
|
|
175
191
|
// Initialize DB (creates ~/.nodebench/ and schema on first run)
|
|
176
192
|
getDb();
|
|
177
193
|
// Wire up DB accessor for execution trace edges (avoids circular import)
|
|
178
194
|
_setDbAccessor(getDb);
|
|
179
195
|
// Assemble tools (filtered by --toolsets / --exclude / --preset if provided)
|
|
180
|
-
|
|
196
|
+
// Domain tools selected by --toolsets / --exclude / --preset.
// Declared with `let` because the dynamic-loading handlers reassign it
// whenever a toolset is loaded or unloaded.
let domainTools = parseToolsets();
// Preset label for this session: the value following --preset, 'custom'
// when --toolsets/--exclude is used without a preset, otherwise 'default'.
let currentPreset = 'default';
const presetIdx = cliArgs.indexOf("--preset");
// A token starting with "--" is the next flag, not a preset name
// (e.g. `--preset --dynamic`), so treat it as a missing value instead of
// recording the flag text as the preset.
if (presetIdx !== -1 && cliArgs[presetIdx + 1] && !cliArgs[presetIdx + 1].startsWith("--")) {
    currentPreset = cliArgs[presetIdx + 1];
}
else if (cliArgs.includes("--toolsets") || cliArgs.includes("--exclude")) {
    currentPreset = 'custom';
}
|
|
206
|
+
// Dynamic loading: --dynamic flag enables Search+Load architecture
|
|
207
|
+
// (arxiv 2509.20386 "Dynamic ReAct" winning pattern)
|
|
208
|
+
// --dynamic enables the Search+Load flow (load_toolset / unload_toolset).
const useDynamicLoading = cliArgs.includes("--dynamic");
// Toolsets active at startup. For a named preset this is the preset's list.
// For 'custom' (or any name that is not a key of PRESETS) it is derived from
// the tools parseToolsets() actually returned. Falling back to PRESETS.default
// here would make the first load_toolset/unload_toolset call rebuild
// domainTools from the default toolsets and silently drop the user's
// --toolsets / --exclude selection.
const initialToolsetNames = new Set(PRESETS[currentPreset]
    ?? domainTools.map((t) => TOOL_TO_TOOLSET.get(t.name)).filter(Boolean));
// Mutable copy: load_toolset adds to it, unload_toolset removes from it.
const activeToolsets = new Set(initialToolsetNames);
// Tools to skip auto-logging (avoid infinite recursion and noise)
const SKIP_AUTO_LOG = new Set(["log_tool_call", "get_trajectory_analysis", "get_self_eval_report", "get_improvement_recommendations", "cleanup_stale_runs", "synthesize_recon_to_learnings", "load_toolset", "unload_toolset", "list_available_toolsets"]);
// Initialize analytics tracker singleton (handles DB, project context, retention cleanup)
const tracker = AnalyticsTracker.init({
    projectPath: process.cwd(),
    preset: currentPreset,
    // NOTE(review): the "+ 6" presumably accounts for always-registered
    // non-domain tools; the --help preset table adds 12 instead — confirm
    // which constant is current.
    toolCount: domainTools.length + 6,
    toolToToolset: TOOL_TO_TOOLSET,
    skipTools: SKIP_AUTO_LOG,
});
|
|
181
222
|
const metaTools = createMetaTools(domainTools);
|
|
182
|
-
|
|
223
|
+
let allToolsWithoutDiscovery = [...domainTools, ...metaTools];
|
|
183
224
|
// Progressive discovery tools need the full tool list for hybrid search
|
|
184
|
-
|
|
185
|
-
const
|
|
225
|
+
// Pass dynamic loading callbacks so discover_tools can suggest load_toolset for unloaded toolsets
|
|
226
|
+
const discoveryTools = createProgressiveDiscoveryTools(allToolsWithoutDiscovery.map((t) => ({ name: t.name, description: t.description })), {
|
|
227
|
+
getLoadedToolNames: () => new Set(allTools.map(t => t.name)),
|
|
228
|
+
getToolToToolset: () => TOOL_TO_TOOLSET,
|
|
229
|
+
});
|
|
230
|
+
// ── Dynamic Loading Tools (Search+Load pattern) ────────────────────────
|
|
231
|
+
// Based on Dynamic ReAct (arxiv 2509.20386) — the winning architecture.
|
|
232
|
+
// Agent starts with default preset, discovers tools via discover_tools,
|
|
233
|
+
// then calls load_toolset to activate them. Server sends
|
|
234
|
+
// notifications/tools/list_changed so the client re-fetches the tool list.
|
|
235
|
+
const dynamicLoadingTools = [
|
|
236
|
+
{
|
|
237
|
+
name: "load_toolset",
|
|
238
|
+
description: 'Dynamically load a toolset into the current session. After loading, the tools become immediately available for use. Based on the "Search+Load" architecture from Dynamic ReAct (arxiv 2509.20386) — the winning pattern for scalable MCP tool selection. Use discover_tools first to find which toolset you need, then call this to activate it.',
|
|
239
|
+
inputSchema: {
|
|
240
|
+
type: "object",
|
|
241
|
+
properties: {
|
|
242
|
+
toolset: {
|
|
243
|
+
type: "string",
|
|
244
|
+
description: `Toolset name to load. Available: ${Object.keys(TOOLSET_MAP).filter(k => !activeToolsets.has(k)).join(", ") || "(all loaded)"}`,
|
|
245
|
+
},
|
|
246
|
+
},
|
|
247
|
+
required: ["toolset"],
|
|
248
|
+
},
|
|
249
|
+
handler: async (args) => {
|
|
250
|
+
const { toolset } = args;
|
|
251
|
+
if (!TOOLSET_MAP[toolset]) {
|
|
252
|
+
return { error: true, message: `Unknown toolset: ${toolset}`, available: Object.keys(TOOLSET_MAP) };
|
|
253
|
+
}
|
|
254
|
+
if (activeToolsets.has(toolset)) {
|
|
255
|
+
return { alreadyLoaded: true, toolset, message: `Toolset '${toolset}' is already active.`, activeToolCount: allTools.length };
|
|
256
|
+
}
|
|
257
|
+
const startMs = Date.now();
|
|
258
|
+
const toolsBefore = allTools.length;
|
|
259
|
+
// Add toolset to active set
|
|
260
|
+
activeToolsets.add(toolset);
|
|
261
|
+
const newTools = TOOLSET_MAP[toolset];
|
|
262
|
+
// Rebuild domain tools from active toolsets
|
|
263
|
+
domainTools = [...activeToolsets].flatMap(k => TOOLSET_MAP[k] ?? []);
|
|
264
|
+
const newMetaTools = createMetaTools(domainTools);
|
|
265
|
+
allToolsWithoutDiscovery = [...domainTools, ...newMetaTools];
|
|
266
|
+
// Rebuild allTools (keep discovery + dynamic loading tools stable)
|
|
267
|
+
rebuildAllTools();
|
|
268
|
+
// Track A/B event
|
|
269
|
+
try {
|
|
270
|
+
const db = getDb();
|
|
271
|
+
db.prepare("INSERT INTO ab_tool_events (id, session_id, event_type, toolset_name, tools_before, tools_after, latency_ms, created_at) VALUES (?, ?, 'load', ?, ?, ?, ?, datetime('now'))").run(genId("abe"), SESSION_ID, toolset, toolsBefore, allTools.length, Date.now() - startMs);
|
|
272
|
+
}
|
|
273
|
+
catch { /* instrumentation must not break tool dispatch */ }
|
|
274
|
+
// Notify client that tool list changed (MCP spec)
|
|
275
|
+
try {
|
|
276
|
+
await server.notification({ method: "notifications/tools/list_changed" });
|
|
277
|
+
}
|
|
278
|
+
catch { /* client may not support notifications */ }
|
|
279
|
+
return {
|
|
280
|
+
loaded: true,
|
|
281
|
+
toolset,
|
|
282
|
+
toolsAdded: newTools.length,
|
|
283
|
+
toolNames: newTools.map(t => t.name),
|
|
284
|
+
activeToolCount: allTools.length,
|
|
285
|
+
activeToolsets: [...activeToolsets],
|
|
286
|
+
_hint: `${newTools.length} tools from '${toolset}' are now available. You can use them directly.`,
|
|
287
|
+
};
|
|
288
|
+
},
|
|
289
|
+
},
|
|
290
|
+
{
|
|
291
|
+
name: "unload_toolset",
|
|
292
|
+
description: "Remove a dynamically loaded toolset from the current session to free up context. Cannot unload toolsets from the initial preset.",
|
|
293
|
+
inputSchema: {
|
|
294
|
+
type: "object",
|
|
295
|
+
properties: {
|
|
296
|
+
toolset: {
|
|
297
|
+
type: "string",
|
|
298
|
+
description: "Toolset name to unload.",
|
|
299
|
+
},
|
|
300
|
+
},
|
|
301
|
+
required: ["toolset"],
|
|
302
|
+
},
|
|
303
|
+
handler: async (args) => {
|
|
304
|
+
const { toolset } = args;
|
|
305
|
+
if (!activeToolsets.has(toolset)) {
|
|
306
|
+
return { error: true, message: `Toolset '${toolset}' is not currently loaded.` };
|
|
307
|
+
}
|
|
308
|
+
if (initialToolsetNames.has(toolset)) {
|
|
309
|
+
return { error: true, message: `Cannot unload '${toolset}' — it's part of the initial preset (${currentPreset}).` };
|
|
310
|
+
}
|
|
311
|
+
const toolsBefore = allTools.length;
|
|
312
|
+
activeToolsets.delete(toolset);
|
|
313
|
+
// Rebuild
|
|
314
|
+
domainTools = [...activeToolsets].flatMap(k => TOOLSET_MAP[k] ?? []);
|
|
315
|
+
const newMetaTools = createMetaTools(domainTools);
|
|
316
|
+
allToolsWithoutDiscovery = [...domainTools, ...newMetaTools];
|
|
317
|
+
rebuildAllTools();
|
|
318
|
+
try {
|
|
319
|
+
const db = getDb();
|
|
320
|
+
db.prepare("INSERT INTO ab_tool_events (id, session_id, event_type, toolset_name, tools_before, tools_after, created_at) VALUES (?, ?, 'unload', ?, ?, ?, datetime('now'))").run(genId("abe"), SESSION_ID, toolset, toolsBefore, allTools.length);
|
|
321
|
+
}
|
|
322
|
+
catch { /* instrumentation */ }
|
|
323
|
+
try {
|
|
324
|
+
await server.notification({ method: "notifications/tools/list_changed" });
|
|
325
|
+
}
|
|
326
|
+
catch { /* client may not support notifications */ }
|
|
327
|
+
return {
|
|
328
|
+
unloaded: true,
|
|
329
|
+
toolset,
|
|
330
|
+
activeToolCount: allTools.length,
|
|
331
|
+
activeToolsets: [...activeToolsets],
|
|
332
|
+
};
|
|
333
|
+
},
|
|
334
|
+
},
|
|
335
|
+
{
|
|
336
|
+
name: "list_available_toolsets",
|
|
337
|
+
description: "List all available toolsets showing which are currently loaded and which can be dynamically added. Includes tool counts and descriptions for each toolset.",
|
|
338
|
+
inputSchema: { type: "object", properties: {} },
|
|
339
|
+
handler: async () => {
|
|
340
|
+
const toolsets = Object.entries(TOOLSET_MAP).map(([name, tools]) => ({
|
|
341
|
+
name,
|
|
342
|
+
toolCount: tools.length,
|
|
343
|
+
loaded: activeToolsets.has(name),
|
|
344
|
+
isInitialPreset: initialToolsetNames.has(name),
|
|
345
|
+
description: PRESET_DESCRIPTIONS[name] ?? null,
|
|
346
|
+
tools: tools.map(t => t.name),
|
|
347
|
+
}));
|
|
348
|
+
const loaded = toolsets.filter(t => t.loaded);
|
|
349
|
+
const available = toolsets.filter(t => !t.loaded);
|
|
350
|
+
return {
|
|
351
|
+
mode: useDynamicLoading ? "dynamic" : "static",
|
|
352
|
+
currentPreset,
|
|
353
|
+
activeToolCount: allTools.length,
|
|
354
|
+
loaded: { count: loaded.length, toolsets: loaded },
|
|
355
|
+
available: { count: available.length, toolsets: available },
|
|
356
|
+
_hint: available.length > 0
|
|
357
|
+
? `${available.length} toolsets available to load. Call load_toolset("<name>") to activate.`
|
|
358
|
+
: "All toolsets are loaded.",
|
|
359
|
+
};
|
|
360
|
+
},
|
|
361
|
+
},
|
|
362
|
+
{
|
|
363
|
+
name: "call_loaded_tool",
|
|
364
|
+
description: 'Call a dynamically loaded tool by name. Use this after load_toolset when your client does not automatically refresh the tool list. Pass the tool name and its arguments. Example: call_loaded_tool({ tool: "analyze_screenshot", args: { imagePath: "screenshot.png" } }). This is a fallback — if the loaded tool appears in your tool list directly, call it directly instead.',
|
|
365
|
+
inputSchema: {
|
|
366
|
+
type: "object",
|
|
367
|
+
properties: {
|
|
368
|
+
tool: {
|
|
369
|
+
type: "string",
|
|
370
|
+
description: "Name of the dynamically loaded tool to call.",
|
|
371
|
+
},
|
|
372
|
+
args: {
|
|
373
|
+
type: "object",
|
|
374
|
+
description: "Arguments to pass to the tool (same as its inputSchema).",
|
|
375
|
+
additionalProperties: true,
|
|
376
|
+
},
|
|
377
|
+
},
|
|
378
|
+
required: ["tool"],
|
|
379
|
+
},
|
|
380
|
+
handler: async (callArgs) => {
|
|
381
|
+
const { tool: toolName, args: toolArgs } = callArgs;
|
|
382
|
+
const target = allTools.find(t => t.name === toolName);
|
|
383
|
+
if (!target) {
|
|
384
|
+
return {
|
|
385
|
+
error: true,
|
|
386
|
+
message: `Tool '${toolName}' not found. It may not be loaded yet.`,
|
|
387
|
+
_hint: "Call list_available_toolsets to see what's available, then load_toolset to activate it.",
|
|
388
|
+
loadedTools: allTools.map(t => t.name),
|
|
389
|
+
};
|
|
390
|
+
}
|
|
391
|
+
// Dispatch to the target tool's handler
|
|
392
|
+
return target.handler(toolArgs ?? {});
|
|
393
|
+
},
|
|
394
|
+
},
|
|
395
|
+
{
|
|
396
|
+
name: "smart_select_tools",
|
|
397
|
+
description: 'LLM-powered tool selection: sends your task description + a compact tool catalog to a fast model (Gemini Flash, GPT-4o-mini, or Claude Haiku) to pick the best 5-10 tools. Much more accurate than keyword search for ambiguous queries like "call an AI model" or "analyze my data". Requires GEMINI_API_KEY, OPENAI_API_KEY, or ANTHROPIC_API_KEY. Falls back to heuristic discover_tools if no API key is set.',
|
|
398
|
+
inputSchema: {
|
|
399
|
+
type: "object",
|
|
400
|
+
properties: {
|
|
401
|
+
task: {
|
|
402
|
+
type: "string",
|
|
403
|
+
description: "Describe what you want to accomplish. Be specific. Example: 'I need to parse a PDF, extract tables, and email a summary'",
|
|
404
|
+
},
|
|
405
|
+
maxTools: {
|
|
406
|
+
type: "number",
|
|
407
|
+
description: "Maximum tools to return (default: 8)",
|
|
408
|
+
},
|
|
409
|
+
provider: {
|
|
410
|
+
type: "string",
|
|
411
|
+
enum: ["auto", "gemini", "openai", "anthropic"],
|
|
412
|
+
description: "Which LLM provider to use. 'auto' (default) picks the first available API key.",
|
|
413
|
+
},
|
|
414
|
+
},
|
|
415
|
+
required: ["task"],
|
|
416
|
+
},
|
|
417
|
+
handler: async (args) => {
|
|
418
|
+
const task = args.task;
|
|
419
|
+
const maxTools = args.maxTools ?? 8;
|
|
420
|
+
const provider = args.provider ?? "auto";
|
|
421
|
+
// Build compact tool catalog: name + category + tags (no descriptions — saves tokens)
|
|
422
|
+
const catalog = ALL_REGISTRY_ENTRIES.map(e => `${e.name} [${e.category}] ${e.tags.slice(0, 5).join(",")}`).join("\n");
|
|
423
|
+
const systemPrompt = `You are a tool selection assistant. Given a task description and a catalog of ${ALL_REGISTRY_ENTRIES.length} tools, pick the ${maxTools} most relevant tools. Return ONLY a JSON array of tool names, nothing else. Example: ["tool_a","tool_b"]`;
|
|
424
|
+
const userPrompt = `Task: ${task}\n\nTool catalog (name [category] tags):\n${catalog}`;
|
|
425
|
+
// Try LLM providers in order
|
|
426
|
+
const geminiKey = process.env.GEMINI_API_KEY;
|
|
427
|
+
const openaiKey = process.env.OPENAI_API_KEY;
|
|
428
|
+
const anthropicKey = process.env.ANTHROPIC_API_KEY;
|
|
429
|
+
let selectedProvider = provider;
|
|
430
|
+
if (selectedProvider === "auto") {
|
|
431
|
+
if (geminiKey)
|
|
432
|
+
selectedProvider = "gemini";
|
|
433
|
+
else if (openaiKey)
|
|
434
|
+
selectedProvider = "openai";
|
|
435
|
+
else if (anthropicKey)
|
|
436
|
+
selectedProvider = "anthropic";
|
|
437
|
+
else
|
|
438
|
+
selectedProvider = "none";
|
|
439
|
+
}
|
|
440
|
+
if (selectedProvider === "none") {
|
|
441
|
+
// Fallback: run heuristic discover_tools (search full registry for dynamic mode)
|
|
442
|
+
const heuristicResults = hybridSearch(task, allTools.map(t => ({ name: t.name, description: t.description })), {
|
|
443
|
+
limit: maxTools,
|
|
444
|
+
mode: "hybrid",
|
|
445
|
+
searchFullRegistry: useDynamicLoading,
|
|
446
|
+
});
|
|
447
|
+
return {
|
|
448
|
+
method: "heuristic_fallback",
|
|
449
|
+
reason: "No API key found. Set GEMINI_API_KEY, OPENAI_API_KEY, or ANTHROPIC_API_KEY for LLM-powered selection.",
|
|
450
|
+
tools: heuristicResults.map((r) => ({
|
|
451
|
+
name: r.name,
|
|
452
|
+
category: r.category,
|
|
453
|
+
score: r.score,
|
|
454
|
+
quickRef: r.quickRef,
|
|
455
|
+
})),
|
|
456
|
+
_hint: "For better accuracy on ambiguous queries, set an API key to enable LLM-powered selection.",
|
|
457
|
+
};
|
|
458
|
+
}
|
|
459
|
+
try {
|
|
460
|
+
let responseText = "";
|
|
461
|
+
if (selectedProvider === "gemini" && geminiKey) {
|
|
462
|
+
const resp = await fetch(`https://generativelanguage.googleapis.com/v1beta/models/gemini-2.0-flash:generateContent?key=${geminiKey}`, {
|
|
463
|
+
method: "POST",
|
|
464
|
+
headers: { "Content-Type": "application/json" },
|
|
465
|
+
body: JSON.stringify({
|
|
466
|
+
contents: [{ parts: [{ text: `${systemPrompt}\n\n${userPrompt}` }] }],
|
|
467
|
+
generationConfig: { temperature: 0, maxOutputTokens: 512 },
|
|
468
|
+
}),
|
|
469
|
+
});
|
|
470
|
+
const data = await resp.json();
|
|
471
|
+
responseText = data?.candidates?.[0]?.content?.parts?.[0]?.text ?? "";
|
|
472
|
+
}
|
|
473
|
+
else if (selectedProvider === "openai" && openaiKey) {
|
|
474
|
+
const resp = await fetch("https://api.openai.com/v1/chat/completions", {
|
|
475
|
+
method: "POST",
|
|
476
|
+
headers: { "Content-Type": "application/json", Authorization: `Bearer ${openaiKey}` },
|
|
477
|
+
body: JSON.stringify({
|
|
478
|
+
model: "gpt-4o-mini",
|
|
479
|
+
messages: [
|
|
480
|
+
{ role: "system", content: systemPrompt },
|
|
481
|
+
{ role: "user", content: userPrompt },
|
|
482
|
+
],
|
|
483
|
+
temperature: 0,
|
|
484
|
+
max_tokens: 512,
|
|
485
|
+
}),
|
|
486
|
+
});
|
|
487
|
+
const data = await resp.json();
|
|
488
|
+
responseText = data?.choices?.[0]?.message?.content ?? "";
|
|
489
|
+
}
|
|
490
|
+
else if (selectedProvider === "anthropic" && anthropicKey) {
|
|
491
|
+
const resp = await fetch("https://api.anthropic.com/v1/messages", {
|
|
492
|
+
method: "POST",
|
|
493
|
+
headers: {
|
|
494
|
+
"Content-Type": "application/json",
|
|
495
|
+
"x-api-key": anthropicKey,
|
|
496
|
+
"anthropic-version": "2023-06-01",
|
|
497
|
+
},
|
|
498
|
+
body: JSON.stringify({
|
|
499
|
+
model: "claude-3-5-haiku-latest",
|
|
500
|
+
max_tokens: 512,
|
|
501
|
+
system: systemPrompt,
|
|
502
|
+
messages: [{ role: "user", content: userPrompt }],
|
|
503
|
+
}),
|
|
504
|
+
});
|
|
505
|
+
const data = await resp.json();
|
|
506
|
+
responseText = data?.content?.[0]?.text ?? "";
|
|
507
|
+
}
|
|
508
|
+
// Parse the JSON array from the response
|
|
509
|
+
const jsonMatch = responseText.match(/\[[\s\S]*?\]/);
|
|
510
|
+
if (!jsonMatch) {
|
|
511
|
+
return { error: true, message: "LLM did not return a valid JSON array", raw: responseText.slice(0, 200) };
|
|
512
|
+
}
|
|
513
|
+
const selectedNames = JSON.parse(jsonMatch[0]);
|
|
514
|
+
// Enrich with registry metadata
|
|
515
|
+
const enriched = selectedNames
|
|
516
|
+
.map(name => {
|
|
517
|
+
const entry = TOOL_REGISTRY.get(name);
|
|
518
|
+
if (!entry)
|
|
519
|
+
return null;
|
|
520
|
+
return {
|
|
521
|
+
name: entry.name,
|
|
522
|
+
category: entry.category,
|
|
523
|
+
phase: entry.phase,
|
|
524
|
+
tags: entry.tags,
|
|
525
|
+
quickRef: entry.quickRef,
|
|
526
|
+
loaded: allTools.some(t => t.name === name),
|
|
527
|
+
};
|
|
528
|
+
})
|
|
529
|
+
.filter(Boolean);
|
|
530
|
+
// Identify toolsets to load
|
|
531
|
+
const unloadedToolsets = new Map();
|
|
532
|
+
for (const tool of enriched) {
|
|
533
|
+
if (tool && !tool.loaded) {
|
|
534
|
+
const ts = TOOL_TO_TOOLSET.get(tool.name);
|
|
535
|
+
if (ts) {
|
|
536
|
+
const list = unloadedToolsets.get(ts) ?? [];
|
|
537
|
+
list.push(tool.name);
|
|
538
|
+
unloadedToolsets.set(ts, list);
|
|
539
|
+
}
|
|
540
|
+
}
|
|
541
|
+
}
|
|
542
|
+
return {
|
|
543
|
+
method: `llm_${selectedProvider}`,
|
|
544
|
+
task,
|
|
545
|
+
selectedTools: enriched,
|
|
546
|
+
toolCount: enriched.length,
|
|
547
|
+
...(unloadedToolsets.size > 0 ? {
|
|
548
|
+
_loadSuggestions: [...unloadedToolsets.entries()].map(([ts, tools]) => ({
|
|
549
|
+
toolset: ts,
|
|
550
|
+
matchingTools: tools,
|
|
551
|
+
action: `Call load_toolset("${ts}") to activate ${tools.length} tool(s).`,
|
|
552
|
+
})),
|
|
553
|
+
} : {}),
|
|
554
|
+
_hint: enriched.length > 0
|
|
555
|
+
? `Top pick: ${enriched[0].name}. ${enriched[0].quickRef.nextAction}`
|
|
556
|
+
: "No tools selected. Try rephrasing your task.",
|
|
557
|
+
};
|
|
558
|
+
}
|
|
559
|
+
catch (err) {
|
|
560
|
+
return {
|
|
561
|
+
error: true,
|
|
562
|
+
method: `llm_${selectedProvider}`,
|
|
563
|
+
message: `LLM call failed: ${err.message}`,
|
|
564
|
+
_hint: "Falling back to heuristic search. Check your API key.",
|
|
565
|
+
};
|
|
566
|
+
}
|
|
567
|
+
},
|
|
568
|
+
},
|
|
569
|
+
{
|
|
570
|
+
name: "get_ab_test_report",
|
|
571
|
+
description: "Generate an A/B test comparison report for static vs dynamic toolset loading. Shows session counts, tool counts, load events, error rates, and per-toolset load frequency. Use after running sessions in both modes to evaluate the impact of dynamic loading.",
|
|
572
|
+
inputSchema: {
|
|
573
|
+
type: "object",
|
|
574
|
+
properties: {
|
|
575
|
+
detailed: {
|
|
576
|
+
type: "boolean",
|
|
577
|
+
description: "Include per-session breakdown (default: false, summary only)",
|
|
578
|
+
},
|
|
579
|
+
},
|
|
580
|
+
},
|
|
581
|
+
handler: async (args) => {
|
|
582
|
+
const db = getDb();
|
|
583
|
+
const detailed = args.detailed === true;
|
|
584
|
+
// Session-level aggregates by mode
|
|
585
|
+
const sessionSummary = db.prepare(`
|
|
586
|
+
SELECT
|
|
587
|
+
mode,
|
|
588
|
+
COUNT(*) as sessions,
|
|
589
|
+
ROUND(AVG(initial_tool_count), 1) as avg_initial_tools,
|
|
590
|
+
ROUND(AVG(COALESCE(final_tool_count, initial_tool_count)), 1) as avg_final_tools,
|
|
591
|
+
ROUND(AVG(COALESCE(total_tool_calls, 0)), 1) as avg_tool_calls,
|
|
592
|
+
ROUND(AVG(COALESCE(total_load_events, 0)), 1) as avg_load_events,
|
|
593
|
+
ROUND(AVG(COALESCE(session_duration_ms, 0)) / 1000.0, 1) as avg_duration_sec,
|
|
594
|
+
SUM(COALESCE(total_tool_calls, 0)) as total_calls,
|
|
595
|
+
SUM(COALESCE(total_load_events, 0)) as total_loads
|
|
596
|
+
FROM ab_test_sessions
|
|
597
|
+
GROUP BY mode
|
|
598
|
+
`).all();
|
|
599
|
+
// Error rate by mode (join with tool_call_log)
|
|
600
|
+
const errorRates = db.prepare(`
|
|
601
|
+
SELECT
|
|
602
|
+
s.mode,
|
|
603
|
+
COUNT(CASE WHEN t.result_status = 'error' THEN 1 END) as errors,
|
|
604
|
+
COUNT(*) as total_calls,
|
|
605
|
+
ROUND(100.0 * COUNT(CASE WHEN t.result_status = 'error' THEN 1 END) / MAX(COUNT(*), 1), 2) as error_pct
|
|
606
|
+
FROM tool_call_log t
|
|
607
|
+
JOIN ab_test_sessions s ON t.session_id = s.id
|
|
608
|
+
GROUP BY s.mode
|
|
609
|
+
`).all();
|
|
610
|
+
// Top loaded toolsets (dynamic mode)
|
|
611
|
+
const topToolsets = db.prepare(`
|
|
612
|
+
SELECT
|
|
613
|
+
toolset_name,
|
|
614
|
+
COUNT(*) as load_count,
|
|
615
|
+
ROUND(AVG(latency_ms), 1) as avg_latency_ms
|
|
616
|
+
FROM ab_tool_events
|
|
617
|
+
WHERE event_type = 'load'
|
|
618
|
+
GROUP BY toolset_name
|
|
619
|
+
ORDER BY load_count DESC
|
|
620
|
+
LIMIT 10
|
|
621
|
+
`).all();
|
|
622
|
+
// Current session info
|
|
623
|
+
const currentSession = {
|
|
624
|
+
sessionId: SESSION_ID,
|
|
625
|
+
mode: useDynamicLoading ? "dynamic" : "static",
|
|
626
|
+
preset: currentPreset,
|
|
627
|
+
toolCalls: _abToolCallCount,
|
|
628
|
+
loadEvents: _abLoadEventCount,
|
|
629
|
+
activeTools: allTools.length,
|
|
630
|
+
durationSec: Math.round((Date.now() - _abStartMs) / 1000),
|
|
631
|
+
dynamicallyLoaded: [...activeToolsets].filter(ts => !initialToolsetNames.has(ts)),
|
|
632
|
+
};
|
|
633
|
+
// Optional per-session detail
|
|
634
|
+
let sessions = [];
|
|
635
|
+
if (detailed) {
|
|
636
|
+
sessions = db.prepare(`
|
|
637
|
+
SELECT id, mode, initial_preset, initial_tool_count, final_tool_count,
|
|
638
|
+
toolsets_loaded, total_tool_calls, total_load_events,
|
|
639
|
+
session_duration_ms, created_at, ended_at
|
|
640
|
+
FROM ab_test_sessions
|
|
641
|
+
ORDER BY created_at DESC
|
|
642
|
+
LIMIT 50
|
|
643
|
+
`).all();
|
|
644
|
+
}
|
|
645
|
+
// Build verdict
|
|
646
|
+
const staticSummary = sessionSummary.find((s) => s.mode === "static");
|
|
647
|
+
const dynamicSummary = sessionSummary.find((s) => s.mode === "dynamic");
|
|
648
|
+
let verdict = "Insufficient data. Run sessions in both modes to compare.";
|
|
649
|
+
if (staticSummary && dynamicSummary) {
|
|
650
|
+
const toolDiff = (staticSummary.avg_final_tools ?? 0) - (dynamicSummary.avg_final_tools ?? 0);
|
|
651
|
+
const staticErr = errorRates.find((e) => e.mode === "static");
|
|
652
|
+
const dynamicErr = errorRates.find((e) => e.mode === "dynamic");
|
|
653
|
+
const errDiff = (staticErr?.error_pct ?? 0) - (dynamicErr?.error_pct ?? 0);
|
|
654
|
+
verdict = [
|
|
655
|
+
`Static: ${staticSummary.sessions} sessions, avg ${staticSummary.avg_final_tools} tools, ${staticErr?.error_pct ?? "?"}% error rate.`,
|
|
656
|
+
`Dynamic: ${dynamicSummary.sessions} sessions, avg ${dynamicSummary.avg_final_tools} tools, ${dynamicErr?.error_pct ?? "?"}% error rate.`,
|
|
657
|
+
toolDiff > 0 ? `Dynamic uses ${toolDiff.toFixed(1)} fewer tools on average.` : "",
|
|
658
|
+
errDiff > 0 ? `Dynamic has ${errDiff.toFixed(2)}pp lower error rate.` : errDiff < 0 ? `Static has ${(-errDiff).toFixed(2)}pp lower error rate.` : "",
|
|
659
|
+
dynamicSummary.avg_load_events > 0 ? `Agents loaded ${dynamicSummary.avg_load_events} toolsets per session on average.` : "",
|
|
660
|
+
].filter(Boolean).join(" ");
|
|
661
|
+
}
|
|
662
|
+
return {
|
|
663
|
+
verdict,
|
|
664
|
+
sessionSummary,
|
|
665
|
+
errorRates,
|
|
666
|
+
topLoadedToolsets: topToolsets,
|
|
667
|
+
currentSession,
|
|
668
|
+
...(detailed ? { sessions } : {}),
|
|
669
|
+
_hint: sessionSummary.length < 2
|
|
670
|
+
? "Run sessions with both `npx nodebench-mcp` (static) and `npx nodebench-mcp --dynamic` (dynamic) to compare."
|
|
671
|
+
: "Compare avg_final_tools and error_pct between modes to evaluate dynamic loading impact.",
|
|
672
|
+
};
|
|
673
|
+
},
|
|
674
|
+
},
|
|
675
|
+
];
|
|
676
|
+
// Combine all tools (mutable for dynamic loading)
|
|
677
|
+
let allTools = [...allToolsWithoutDiscovery, ...discoveryTools, ...dynamicLoadingTools];
|
|
186
678
|
// Background: initialize embedding index for semantic search (non-blocking)
|
|
187
679
|
// Uses Agent-as-a-Graph bipartite corpus: tool nodes + domain nodes for graph-aware retrieval
|
|
188
680
|
if (useEmbedding) {
|
|
@@ -224,15 +716,25 @@ if (useEmbedding) {
|
|
|
224
716
|
/* Embedding init failed — semantic search stays disabled, no impact on other features */
|
|
225
717
|
});
|
|
226
718
|
}
|
|
227
|
-
// Build a lookup map for fast tool dispatch
|
|
228
|
-
|
|
719
|
+
// Build a lookup map for fast tool dispatch (mutable for dynamic loading)
|
|
720
|
+
let toolMap = new Map();
|
|
229
721
|
for (const tool of allTools) {
|
|
230
722
|
toolMap.set(tool.name, tool);
|
|
231
723
|
}
|
|
724
|
+
// Dynamic-loading support: recompute the flat tool list and the
// name → tool dispatch map from the three source collections.
function rebuildAllTools() {
    allTools = [...allToolsWithoutDiscovery, ...discoveryTools, ...dynamicLoadingTools];
    // The Map constructor inserts entries in order, so a later tool with the
    // same name replaces an earlier one.
    toolMap = new Map(allTools.map((entry) => [entry.name, entry]));
}
|
|
232
732
|
// Auto-instrumentation: generate a session ID per MCP connection
|
|
233
733
|
const SESSION_ID = genId("mcp");
|
|
234
|
-
//
|
|
235
|
-
|
|
734
|
+
// A/B test session-level counters (mutable, finalized on exit)
|
|
735
|
+
let _abToolCallCount = 0;
|
|
736
|
+
let _abLoadEventCount = 0;
|
|
737
|
+
const _abStartMs = Date.now();
|
|
236
738
|
// ── Lightweight hooks: auto-save + attention refresh reminders ─────────
|
|
237
739
|
const _hookState = {
|
|
238
740
|
totalCalls: 0,
|
|
@@ -789,7 +1291,34 @@ artifacts (findings, risks, gaps, tests, evals, gates, learnings) that compound
|
|
|
789
1291
|
],
|
|
790
1292
|
},
|
|
791
1293
|
];
|
|
792
|
-
|
|
1294
|
+
// Server-level instructions passed to the MCP client at construction time.
// They describe when NodeBench tools are relevant so that clients with lazy
// tool search (e.g. Claude Code Tool Search) know when to look for them.
// See: https://www.anthropic.com/engineering/advanced-tool-use
const SERVER_INSTRUCTIONS = [
    "NodeBench MCP provides structured AI development methodology tools.",
    "Use NodeBench tools when you need to:",
    "- Verify implementations (verification cycles, gap tracking, 6-phase flywheel)",
    "- Run evaluations and quality gates before shipping code",
    "- Search prior knowledge and record learnings across sessions",
    "- Assess risk before taking actions",
    "- Coordinate parallel agents (task locks, roles, context budget)",
    "- Research with structured recon (web search, GitHub, RSS feeds)",
    "- Analyze files (CSV, PDF, XLSX, images, audio, ZIP)",
    "- Run security audits (dependency scanning, code analysis, secrets detection)",
    "- Write and polish academic papers",
    "- Audit SEO, analyze Figma flows, detect Android flicker",
    "- Call LLMs (GPT, Claude, Gemini) for analysis and extraction",
    'Start with discover_tools("<your task>") to find the right tool.',
].join("\n");
// The server advertises tools (with list-changed notifications) and prompts.
const server = new Server(
    { name: "nodebench-mcp-methodology", version: "2.18.1" },
    {
        capabilities: { tools: { listChanged: true }, prompts: {} },
        instructions: SERVER_INSTRUCTIONS,
    },
);
|
|
1315
|
+
// ── A/B Test Session Tracking ─────────────────────────────────────────
// Insert one row for this session at startup so static and dynamic runs
// can be compared later. Failures are ignored on purpose.
try {
    const sessionMode = useDynamicLoading ? 'dynamic' : 'static';
    const insertSession = getDb().prepare("INSERT INTO ab_test_sessions (id, mode, initial_preset, initial_tool_count, created_at) VALUES (?, ?, ?, ?, datetime('now'))");
    insertSession.run(SESSION_ID, sessionMode, currentPreset, allTools.length);
}
catch { /* instrumentation must not block server start */ }
|
|
793
1322
|
// Handle tools/list — return all tools with their JSON Schema inputSchemas
|
|
794
1323
|
// Includes MCP 2025-11-25 spec annotations: category, phase, complexity (model tier hint)
|
|
795
1324
|
server.setRequestHandler(ListToolsRequestSchema, async () => {
|
|
@@ -815,6 +1344,9 @@ server.setRequestHandler(ListToolsRequestSchema, async () => {
|
|
|
815
1344
|
// Handle tools/call — dispatch to the matching tool handler (auto-instrumented)
|
|
816
1345
|
server.setRequestHandler(CallToolRequestSchema, async (request) => {
|
|
817
1346
|
const { name, arguments: args } = request.params;
|
|
1347
|
+
_abToolCallCount++;
|
|
1348
|
+
if (name === "load_toolset" || name === "unload_toolset")
|
|
1349
|
+
_abLoadEventCount++;
|
|
818
1350
|
const tool = toolMap.get(name);
|
|
819
1351
|
if (!tool) {
|
|
820
1352
|
return {
|
|
@@ -832,7 +1364,7 @@ server.setRequestHandler(CallToolRequestSchema, async (request) => {
|
|
|
832
1364
|
resultStatus = "error";
|
|
833
1365
|
errorMsg = result.message ?? "soft error";
|
|
834
1366
|
}
|
|
835
|
-
// Auto-log (skip self-eval tools to avoid recursion/noise)
|
|
1367
|
+
// Auto-log to main DB (skip self-eval tools to avoid recursion/noise)
|
|
836
1368
|
if (!SKIP_AUTO_LOG.has(name)) {
|
|
837
1369
|
try {
|
|
838
1370
|
const db = getDb();
|
|
@@ -840,6 +1372,17 @@ server.setRequestHandler(CallToolRequestSchema, async (request) => {
|
|
|
840
1372
|
}
|
|
841
1373
|
catch { /* never let instrumentation break tool dispatch */ }
|
|
842
1374
|
}
|
|
1375
|
+
// Auto-log to analytics tracker
|
|
1376
|
+
tracker.record(name, startMs, resultStatus === "success", errorMsg, args);
|
|
1377
|
+
// Inline A/B session counter update (every 5 calls — amortized cost)
|
|
1378
|
+
if (_abToolCallCount % 5 === 0) {
|
|
1379
|
+
try {
|
|
1380
|
+
const db2 = getDb();
|
|
1381
|
+
const dynamicallyLoaded = [...activeToolsets].filter(ts => !initialToolsetNames.has(ts));
|
|
1382
|
+
db2.prepare("UPDATE ab_test_sessions SET total_tool_calls = ?, total_load_events = ?, final_tool_count = ?, toolsets_loaded = ? WHERE id = ?").run(_abToolCallCount, _abLoadEventCount, allTools.length, JSON.stringify(dynamicallyLoaded), SESSION_ID);
|
|
1383
|
+
}
|
|
1384
|
+
catch { /* instrumentation */ }
|
|
1385
|
+
}
|
|
843
1386
|
// Tools with rawContent return ContentBlock[] directly (e.g. image captures)
|
|
844
1387
|
if (tool.rawContent && Array.isArray(result)) {
|
|
845
1388
|
return { content: result, isError: false };
|
|
@@ -881,7 +1424,7 @@ server.setRequestHandler(CallToolRequestSchema, async (request) => {
|
|
|
881
1424
|
catch (err) {
|
|
882
1425
|
resultStatus = "error";
|
|
883
1426
|
errorMsg = err?.message || "Internal error";
|
|
884
|
-
// Auto-log errors
|
|
1427
|
+
// Auto-log errors to main DB
|
|
885
1428
|
if (!SKIP_AUTO_LOG.has(name)) {
|
|
886
1429
|
try {
|
|
887
1430
|
const db = getDb();
|
|
@@ -889,6 +1432,8 @@ server.setRequestHandler(CallToolRequestSchema, async (request) => {
|
|
|
889
1432
|
}
|
|
890
1433
|
catch { /* never let instrumentation break tool dispatch */ }
|
|
891
1434
|
}
|
|
1435
|
+
// Auto-log error to analytics tracker
|
|
1436
|
+
tracker.record(name, startMs, false, errorMsg, args);
|
|
892
1437
|
return {
|
|
893
1438
|
content: [{ type: "text", text: errorMsg }],
|
|
894
1439
|
isError: true,
|
|
@@ -920,6 +1465,24 @@ server.setRequestHandler(GetPromptRequestSchema, async (request) => {
|
|
|
920
1465
|
messages,
|
|
921
1466
|
};
|
|
922
1467
|
});
|
|
1468
|
+
// Graceful shutdown: close analytics tracker + finalize A/B session on exit.
// Listeners for 'exit' must do synchronous work only, and anything thrown
// here would surface as an uncaught exception during exit. Each step has
// its own guard so that a failure in one does not skip the other.
process.on('exit', () => {
    try {
        tracker.close();
    }
    catch { /* instrumentation must not block shutdown */ }
    // Finalize A/B test session with aggregate metrics
    try {
        const db = getDb();
        // Toolsets active now that were not part of the initial set.
        const dynamicallyLoaded = [...activeToolsets].filter(ts => !initialToolsetNames.has(ts));
        db.prepare(`UPDATE ab_test_sessions SET
            final_tool_count = ?,
            toolsets_loaded = ?,
            total_tool_calls = ?,
            total_load_events = ?,
            session_duration_ms = ?,
            ended_at = datetime('now')
            WHERE id = ?`).run(allTools.length, JSON.stringify(dynamicallyLoaded), _abToolCallCount, _abLoadEventCount, Date.now() - _abStartMs, SESSION_ID);
    }
    catch { /* instrumentation must not block shutdown */ }
});
|
|
923
1486
|
// Connect via stdio
|
|
924
1487
|
const transport = new StdioServerTransport();
|
|
925
1488
|
await server.connect(transport);
|