nodebench-mcp 2.15.0 → 2.18.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/LICENSE +21 -0
- package/NODEBENCH_AGENTS.md +2 -2
- package/README.md +514 -82
- package/dist/__tests__/analytics.test.d.ts +11 -0
- package/dist/__tests__/analytics.test.js +546 -0
- package/dist/__tests__/analytics.test.js.map +1 -0
- package/dist/__tests__/architectComplex.test.d.ts +1 -0
- package/dist/__tests__/architectComplex.test.js +375 -0
- package/dist/__tests__/architectComplex.test.js.map +1 -0
- package/dist/__tests__/architectSmoke.test.d.ts +1 -0
- package/dist/__tests__/architectSmoke.test.js +92 -0
- package/dist/__tests__/architectSmoke.test.js.map +1 -0
- package/dist/__tests__/dynamicLoading.test.d.ts +1 -0
- package/dist/__tests__/dynamicLoading.test.js +278 -0
- package/dist/__tests__/dynamicLoading.test.js.map +1 -0
- package/dist/__tests__/evalHarness.test.js +7 -2
- package/dist/__tests__/evalHarness.test.js.map +1 -1
- package/dist/__tests__/gaiaCapabilityEval.test.js +229 -12
- package/dist/__tests__/gaiaCapabilityEval.test.js.map +1 -1
- package/dist/__tests__/gaiaCapabilityMediaEval.test.js +194 -109
- package/dist/__tests__/gaiaCapabilityMediaEval.test.js.map +1 -1
- package/dist/__tests__/helpers/answerMatch.js +22 -22
- package/dist/__tests__/presetRealWorldBench.test.js +11 -2
- package/dist/__tests__/presetRealWorldBench.test.js.map +1 -1
- package/dist/__tests__/tools.test.js +10 -4
- package/dist/__tests__/tools.test.js.map +1 -1
- package/dist/__tests__/toolsetGatingEval.test.js +12 -4
- package/dist/__tests__/toolsetGatingEval.test.js.map +1 -1
- package/dist/analytics/index.d.ts +10 -0
- package/dist/analytics/index.js +11 -0
- package/dist/analytics/index.js.map +1 -0
- package/dist/analytics/projectDetector.d.ts +19 -0
- package/dist/analytics/projectDetector.js +259 -0
- package/dist/analytics/projectDetector.js.map +1 -0
- package/dist/analytics/schema.d.ts +57 -0
- package/dist/analytics/schema.js +157 -0
- package/dist/analytics/schema.js.map +1 -0
- package/dist/analytics/smartPreset.d.ts +63 -0
- package/dist/analytics/smartPreset.js +300 -0
- package/dist/analytics/smartPreset.js.map +1 -0
- package/dist/analytics/toolTracker.d.ts +59 -0
- package/dist/analytics/toolTracker.js +163 -0
- package/dist/analytics/toolTracker.js.map +1 -0
- package/dist/analytics/usageStats.d.ts +64 -0
- package/dist/analytics/usageStats.js +252 -0
- package/dist/analytics/usageStats.js.map +1 -0
- package/dist/db.js +359 -321
- package/dist/db.js.map +1 -1
- package/dist/index.d.ts +2 -1
- package/dist/index.js +653 -84
- package/dist/index.js.map +1 -1
- package/dist/tools/architectTools.d.ts +15 -0
- package/dist/tools/architectTools.js +304 -0
- package/dist/tools/architectTools.js.map +1 -0
- package/dist/tools/critterTools.js +14 -14
- package/dist/tools/emailTools.d.ts +15 -0
- package/dist/tools/emailTools.js +664 -0
- package/dist/tools/emailTools.js.map +1 -0
- package/dist/tools/metaTools.js +660 -0
- package/dist/tools/metaTools.js.map +1 -1
- package/dist/tools/parallelAgentTools.js +176 -176
- package/dist/tools/patternTools.js +11 -11
- package/dist/tools/progressiveDiscoveryTools.d.ts +5 -1
- package/dist/tools/progressiveDiscoveryTools.js +113 -21
- package/dist/tools/progressiveDiscoveryTools.js.map +1 -1
- package/dist/tools/researchWritingTools.js +42 -42
- package/dist/tools/rssTools.d.ts +8 -0
- package/dist/tools/rssTools.js +833 -0
- package/dist/tools/rssTools.js.map +1 -0
- package/dist/tools/toolRegistry.d.ts +17 -0
- package/dist/tools/toolRegistry.js +236 -17
- package/dist/tools/toolRegistry.js.map +1 -1
- package/dist/tools/voiceBridgeTools.js +498 -498
- package/dist/toolsetRegistry.d.ts +10 -0
- package/dist/toolsetRegistry.js +84 -0
- package/dist/toolsetRegistry.js.map +1 -0
- package/package.json +12 -5
package/dist/index.js
CHANGED
|
@@ -20,39 +20,14 @@ import { Server } from "@modelcontextprotocol/sdk/server/index.js";
|
|
|
20
20
|
import { StdioServerTransport } from "@modelcontextprotocol/sdk/server/stdio.js";
|
|
21
21
|
import { ListToolsRequestSchema, CallToolRequestSchema, ListPromptsRequestSchema, GetPromptRequestSchema, } from "@modelcontextprotocol/sdk/types.js";
|
|
22
22
|
import { getDb, genId } from "./db.js";
|
|
23
|
-
import {
|
|
24
|
-
import {
|
|
25
|
-
import {
|
|
26
|
-
import {
|
|
27
|
-
import {
|
|
28
|
-
import { reconTools } from "./tools/reconTools.js";
|
|
29
|
-
import { uiCaptureTools } from "./tools/uiCaptureTools.js";
|
|
30
|
-
import { visionTools } from "./tools/visionTools.js";
|
|
31
|
-
import { webTools } from "./tools/webTools.js";
|
|
32
|
-
import { githubTools } from "./tools/githubTools.js";
|
|
33
|
-
import { documentationTools } from "./tools/documentationTools.js";
|
|
34
|
-
import { agentBootstrapTools } from "./tools/agentBootstrapTools.js";
|
|
35
|
-
import { selfEvalTools } from "./tools/selfEvalTools.js";
|
|
36
|
-
import { parallelAgentTools } from "./tools/parallelAgentTools.js";
|
|
37
|
-
import { llmTools } from "./tools/llmTools.js";
|
|
38
|
-
import { securityTools } from "./tools/securityTools.js";
|
|
39
|
-
import { platformTools } from "./tools/platformTools.js";
|
|
40
|
-
import { researchWritingTools } from "./tools/researchWritingTools.js";
|
|
41
|
-
import { flickerDetectionTools } from "./tools/flickerDetectionTools.js";
|
|
42
|
-
import { figmaFlowTools } from "./tools/figmaFlowTools.js";
|
|
23
|
+
import { getAnalyticsDb, closeAnalyticsDb, clearOldRecords } from "./analytics/index.js";
|
|
24
|
+
import { AnalyticsTracker } from "./analytics/toolTracker.js";
|
|
25
|
+
import { generateSmartPreset, formatPresetRecommendation, listPresets } from "./analytics/index.js";
|
|
26
|
+
import { getProjectUsageSummary, exportUsageStats, formatStatsDisplay } from "./analytics/index.js";
|
|
27
|
+
import { TOOLSET_MAP, TOOL_TO_TOOLSET } from "./toolsetRegistry.js";
|
|
43
28
|
import { createMetaTools } from "./tools/metaTools.js";
|
|
44
|
-
import { localFileTools, gaiaMediaSolvers } from "./tools/localFileTools.js";
|
|
45
29
|
import { createProgressiveDiscoveryTools } from "./tools/progressiveDiscoveryTools.js";
|
|
46
|
-
import {
|
|
47
|
-
import { cCompilerBenchmarkTools } from "./tools/cCompilerBenchmarkTools.js";
|
|
48
|
-
import { sessionMemoryTools } from "./tools/sessionMemoryTools.js";
|
|
49
|
-
import { patternTools } from "./tools/patternTools.js";
|
|
50
|
-
import { gitWorkflowTools } from "./tools/gitWorkflowTools.js";
|
|
51
|
-
import { seoTools } from "./tools/seoTools.js";
|
|
52
|
-
import { voiceBridgeTools } from "./tools/voiceBridgeTools.js";
|
|
53
|
-
import { critterTools } from "./tools/critterTools.js";
|
|
54
|
-
import { getQuickRef, ALL_REGISTRY_ENTRIES, TOOL_REGISTRY, getToolComplexity, _setDbAccessor } from "./tools/toolRegistry.js";
|
|
55
|
-
import { toonTools } from "./tools/toonTools.js";
|
|
30
|
+
import { getQuickRef, ALL_REGISTRY_ENTRIES, TOOL_REGISTRY, getToolComplexity, _setDbAccessor, hybridSearch } from "./tools/toolRegistry.js";
|
|
56
31
|
// TOON format — ~40% token savings on tool responses
|
|
57
32
|
import { encode as toonEncode } from "@toon-format/toon";
|
|
58
33
|
// Embedding provider — neural semantic search
|
|
@@ -61,56 +36,55 @@ import { initEmbeddingIndex } from "./tools/embeddingProvider.js";
|
|
|
61
36
|
const cliArgs = process.argv.slice(2);
|
|
62
37
|
const useToon = !cliArgs.includes("--no-toon");
|
|
63
38
|
const useEmbedding = !cliArgs.includes("--no-embedding");
|
|
64
|
-
const
|
|
65
|
-
|
|
66
|
-
|
|
67
|
-
|
|
68
|
-
|
|
69
|
-
|
|
70
|
-
|
|
71
|
-
ui_capture: uiCaptureTools,
|
|
72
|
-
vision: visionTools,
|
|
73
|
-
local_file: localFileTools,
|
|
74
|
-
web: webTools,
|
|
75
|
-
github: githubTools,
|
|
76
|
-
docs: documentationTools,
|
|
77
|
-
bootstrap: agentBootstrapTools,
|
|
78
|
-
self_eval: selfEvalTools,
|
|
79
|
-
parallel: parallelAgentTools,
|
|
80
|
-
llm: llmTools,
|
|
81
|
-
security: securityTools,
|
|
82
|
-
platform: platformTools,
|
|
83
|
-
research_writing: researchWritingTools,
|
|
84
|
-
flicker_detection: flickerDetectionTools,
|
|
85
|
-
figma_flow: figmaFlowTools,
|
|
86
|
-
boilerplate: boilerplateTools,
|
|
87
|
-
benchmark: cCompilerBenchmarkTools,
|
|
88
|
-
session_memory: sessionMemoryTools,
|
|
89
|
-
gaia_solvers: gaiaMediaSolvers,
|
|
90
|
-
toon: toonTools,
|
|
91
|
-
pattern: patternTools,
|
|
92
|
-
git_workflow: gitWorkflowTools,
|
|
93
|
-
seo: seoTools,
|
|
94
|
-
voice_bridge: voiceBridgeTools,
|
|
95
|
-
critter: critterTools,
|
|
96
|
-
};
|
|
39
|
+
const useSmartPreset = cliArgs.includes("--smart-preset");
|
|
40
|
+
const showStats = cliArgs.includes("--stats");
|
|
41
|
+
const exportStats = cliArgs.includes("--export-stats");
|
|
42
|
+
const resetStats = cliArgs.includes("--reset-stats");
|
|
43
|
+
const listPresetsFlag = cliArgs.includes("--list-presets");
|
|
44
|
+
export { TOOLSET_MAP };
|
|
45
|
+
const DEFAULT_TOOLSETS = ["verification", "eval", "quality_gate", "learning", "flywheel", "recon", "security", "boilerplate"];
|
|
97
46
|
const PRESETS = {
|
|
98
|
-
|
|
99
|
-
|
|
100
|
-
|
|
47
|
+
default: DEFAULT_TOOLSETS,
|
|
48
|
+
// Themed presets — bridge between default (39 tools) and full (175 tools)
|
|
49
|
+
web_dev: [...DEFAULT_TOOLSETS, "ui_capture", "vision", "web", "seo", "git_workflow", "architect"],
|
|
50
|
+
research: [...DEFAULT_TOOLSETS, "web", "llm", "rss", "email", "docs"],
|
|
51
|
+
data: [...DEFAULT_TOOLSETS, "local_file", "llm", "web"],
|
|
52
|
+
devops: [...DEFAULT_TOOLSETS, "git_workflow", "session_memory", "benchmark", "pattern"],
|
|
53
|
+
mobile: [...DEFAULT_TOOLSETS, "ui_capture", "vision", "flicker_detection"],
|
|
54
|
+
academic: [...DEFAULT_TOOLSETS, "research_writing", "llm", "web", "local_file"],
|
|
55
|
+
multi_agent: [...DEFAULT_TOOLSETS, "parallel", "self_eval", "session_memory", "pattern", "toon"],
|
|
56
|
+
content: [...DEFAULT_TOOLSETS, "llm", "critter", "email", "rss", "platform", "architect"],
|
|
101
57
|
full: Object.keys(TOOLSET_MAP),
|
|
102
58
|
};
|
|
59
|
+
const PRESET_DESCRIPTIONS = {
|
|
60
|
+
default: "Core AI Flywheel — verification, eval, quality gates, learning, recon",
|
|
61
|
+
web_dev: "Web projects — adds visual QA, SEO audit, git workflow, code architecture",
|
|
62
|
+
research: "Research workflows — adds web search, LLM calls, RSS feeds, email, docs",
|
|
63
|
+
data: "Data analysis — adds CSV/XLSX/PDF/JSON parsing, LLM extraction, web fetch",
|
|
64
|
+
devops: "CI/CD & ops — adds git compliance, session memory, benchmarks, pattern mining",
|
|
65
|
+
mobile: "Mobile apps — adds screenshot capture, vision analysis, flicker detection",
|
|
66
|
+
academic: "Academic papers — adds polish, review, translate, logic check, data analysis",
|
|
67
|
+
multi_agent: "Multi-agent teams — adds task locking, messaging, roles, oracle testing",
|
|
68
|
+
content: "Content & publishing — adds LLM, accountability, email, RSS, platform queue",
|
|
69
|
+
full: "Everything — all toolsets for maximum coverage",
|
|
70
|
+
};
|
|
103
71
|
function parseToolsets() {
|
|
104
72
|
if (cliArgs.includes("--help")) {
|
|
105
73
|
const lines = [
|
|
106
|
-
"nodebench-mcp v2.
|
|
74
|
+
"nodebench-mcp v2.17.0 — Development Methodology MCP Server",
|
|
107
75
|
"",
|
|
108
76
|
"Usage: nodebench-mcp [options]",
|
|
109
77
|
"",
|
|
110
78
|
"Options:",
|
|
111
|
-
" --toolsets <list> Comma-separated toolsets to enable (default:
|
|
79
|
+
" --toolsets <list> Comma-separated toolsets to enable (default: default)",
|
|
112
80
|
" --exclude <list> Comma-separated toolsets to exclude",
|
|
113
|
-
" --preset <name> Use a preset:
|
|
81
|
+
" --preset <name> Use a preset: default or full",
|
|
82
|
+
" --smart-preset Generate smart preset recommendation based on project type and usage history",
|
|
83
|
+
" --stats Show usage statistics for current project",
|
|
84
|
+
" --export-stats Export usage statistics to JSON",
|
|
85
|
+
" --reset-stats Clear all usage analytics data",
|
|
86
|
+
" --list-presets List all available presets with descriptions",
|
|
87
|
+
" --dynamic Enable dynamic toolset loading (Search+Load pattern from arxiv 2509.20386)",
|
|
114
88
|
" --no-toon Disable TOON encoding (TOON is on by default for ~40% token savings)",
|
|
115
89
|
" --no-embedding Disable neural embedding search (uses local HuggingFace model or API keys)",
|
|
116
90
|
" --help Show this help and exit",
|
|
@@ -119,10 +93,20 @@ function parseToolsets() {
|
|
|
119
93
|
...Object.entries(TOOLSET_MAP).map(([k, v]) => ` ${k.padEnd(16)} ${v.length} tools`),
|
|
120
94
|
"",
|
|
121
95
|
"Presets:",
|
|
122
|
-
...Object.entries(PRESETS).map(([k, v]) =>
|
|
96
|
+
...Object.entries(PRESETS).map(([k, v]) => {
|
|
97
|
+
const count = v.reduce((s, ts) => s + (TOOLSET_MAP[ts]?.length ?? 0), 0) + 6;
|
|
98
|
+
return ` ${k.padEnd(14)} ${String(count).padStart(3)} tools ${PRESET_DESCRIPTIONS[k] ?? ''}`;
|
|
99
|
+
}),
|
|
123
100
|
"",
|
|
124
101
|
"Examples:",
|
|
125
|
-
" npx nodebench-mcp
|
|
102
|
+
" npx nodebench-mcp # Default (39 tools) - core AI Flywheel",
|
|
103
|
+
" npx nodebench-mcp --preset web_dev # Web development (+ vision, SEO, git)",
|
|
104
|
+
" npx nodebench-mcp --preset research # Research workflows (+ web, LLM, RSS, email)",
|
|
105
|
+
" npx nodebench-mcp --preset data # Data analysis (+ local file parsing, LLM)",
|
|
106
|
+
" npx nodebench-mcp --preset academic # Academic writing (+ paper tools, LLM)",
|
|
107
|
+
" npx nodebench-mcp --preset full # All 175 tools",
|
|
108
|
+
" npx nodebench-mcp --smart-preset # Get AI-powered preset recommendation",
|
|
109
|
+
" npx nodebench-mcp --stats # Show usage statistics",
|
|
126
110
|
" npx nodebench-mcp --toolsets verification,eval,recon",
|
|
127
111
|
" npx nodebench-mcp --exclude vision,ui_capture,parallel",
|
|
128
112
|
"",
|
|
@@ -164,19 +148,533 @@ function parseToolsets() {
|
|
|
164
148
|
.filter(([k]) => !excluded.has(k))
|
|
165
149
|
.flatMap(([, v]) => v);
|
|
166
150
|
}
|
|
167
|
-
|
|
151
|
+
// Otherwise fall back to the default preset (39 tools — complete AI Flywheel)
|
|
152
|
+
return PRESETS.default.flatMap((k) => TOOLSET_MAP[k] ?? []);
|
|
153
|
+
}
|
|
154
|
+
// ── Analytics CLI flag handling ─────────────────────────────────────────
|
|
155
|
+
// Handle --list-presets
|
|
156
|
+
if (listPresetsFlag) {
|
|
157
|
+
const presets = listPresets(TOOLSET_MAP);
|
|
158
|
+
console.log(JSON.stringify(presets, null, 2));
|
|
159
|
+
process.exit(0);
|
|
160
|
+
}
|
|
161
|
+
// ── Analytics CLI handlers (run-and-exit) ───────────────────────────────
|
|
162
|
+
if (resetStats || useSmartPreset || showStats || exportStats) {
|
|
163
|
+
const aDb = getAnalyticsDb();
|
|
164
|
+
try {
|
|
165
|
+
if (resetStats) {
|
|
166
|
+
clearOldRecords(aDb, 0);
|
|
167
|
+
console.error("Usage analytics data cleared (tool_usage + cache). Project context and preset history preserved.");
|
|
168
|
+
}
|
|
169
|
+
else if (useSmartPreset) {
|
|
170
|
+
const recommendation = generateSmartPreset(aDb, TOOLSET_MAP);
|
|
171
|
+
console.error(formatPresetRecommendation(recommendation, TOOLSET_MAP));
|
|
172
|
+
}
|
|
173
|
+
else if (showStats) {
|
|
174
|
+
const summary = getProjectUsageSummary(aDb, process.cwd(), 30);
|
|
175
|
+
if (summary) {
|
|
176
|
+
console.error(formatStatsDisplay(summary, process.cwd()));
|
|
177
|
+
}
|
|
178
|
+
else {
|
|
179
|
+
console.error("No usage data available for this project in the last 30 days.");
|
|
180
|
+
}
|
|
181
|
+
}
|
|
182
|
+
else if (exportStats) {
|
|
183
|
+
console.log(exportUsageStats(aDb, process.cwd(), 30));
|
|
184
|
+
}
|
|
185
|
+
}
|
|
186
|
+
finally {
|
|
187
|
+
closeAnalyticsDb(aDb);
|
|
188
|
+
}
|
|
189
|
+
process.exit(0);
|
|
168
190
|
}
|
|
169
191
|
// Initialize DB (creates ~/.nodebench/ and schema on first run)
|
|
170
192
|
getDb();
|
|
171
193
|
// Wire up DB accessor for execution trace edges (avoids circular import)
|
|
172
194
|
_setDbAccessor(getDb);
|
|
173
195
|
// Assemble tools (filtered by --toolsets / --exclude / --preset if provided)
|
|
174
|
-
|
|
196
|
+
let domainTools = parseToolsets();
|
|
197
|
+
// Determine current preset name for analytics
|
|
198
|
+
let currentPreset = 'default';
|
|
199
|
+
const presetIdx = cliArgs.indexOf("--preset");
|
|
200
|
+
if (presetIdx !== -1 && cliArgs[presetIdx + 1]) {
|
|
201
|
+
currentPreset = cliArgs[presetIdx + 1];
|
|
202
|
+
}
|
|
203
|
+
else if (cliArgs.includes("--toolsets") || cliArgs.includes("--exclude")) {
|
|
204
|
+
currentPreset = 'custom';
|
|
205
|
+
}
|
|
206
|
+
// Dynamic loading: --dynamic flag enables Search+Load architecture
|
|
207
|
+
// (arxiv 2509.20386 "Dynamic ReAct" winning pattern)
|
|
208
|
+
const useDynamicLoading = cliArgs.includes("--dynamic");
|
|
209
|
+
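// Track which toolsets are active (mutable for dynamic loading). NOTE(review): seeded from PRESETS[currentPreset]; for 'custom' (--toolsets/--exclude) or an unknown preset name this falls back to PRESETS.default rather than the toolsets actually selected — confirm this is intended.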
|
|
210
|
+
const initialToolsetNames = new Set(PRESETS[currentPreset] ?? PRESETS.default);
|
|
211
|
+
const activeToolsets = new Set(initialToolsetNames);
|
|
212
|
+
// Tools to skip auto-logging (avoid infinite recursion and noise)
|
|
213
|
+
const SKIP_AUTO_LOG = new Set(["log_tool_call", "get_trajectory_analysis", "get_self_eval_report", "get_improvement_recommendations", "cleanup_stale_runs", "synthesize_recon_to_learnings", "load_toolset", "unload_toolset", "list_available_toolsets"]);
|
|
214
|
+
// Initialize analytics tracker singleton (handles DB, project context, retention cleanup)
|
|
215
|
+
const tracker = AnalyticsTracker.init({
|
|
216
|
+
projectPath: process.cwd(),
|
|
217
|
+
preset: currentPreset,
|
|
218
|
+
toolCount: domainTools.length + 6,
|
|
219
|
+
toolToToolset: TOOL_TO_TOOLSET,
|
|
220
|
+
skipTools: SKIP_AUTO_LOG,
|
|
221
|
+
});
|
|
175
222
|
const metaTools = createMetaTools(domainTools);
|
|
176
|
-
|
|
223
|
+
let allToolsWithoutDiscovery = [...domainTools, ...metaTools];
|
|
177
224
|
// Progressive discovery tools need the full tool list for hybrid search
|
|
178
|
-
|
|
179
|
-
const
|
|
225
|
+
// Pass dynamic loading callbacks so discover_tools can suggest load_toolset for unloaded toolsets
|
|
226
|
+
const discoveryTools = createProgressiveDiscoveryTools(allToolsWithoutDiscovery.map((t) => ({ name: t.name, description: t.description })), {
|
|
227
|
+
getLoadedToolNames: () => new Set(allTools.map(t => t.name)),
|
|
228
|
+
getToolToToolset: () => TOOL_TO_TOOLSET,
|
|
229
|
+
});
|
|
230
|
+
// ── Dynamic Loading Tools (Search+Load pattern) ────────────────────────
|
|
231
|
+
// Based on Dynamic ReAct (arxiv 2509.20386) — the winning architecture.
|
|
232
|
+
// Agent starts with default preset, discovers tools via discover_tools,
|
|
233
|
+
// then calls load_toolset to activate them. Server sends
|
|
234
|
+
// notifications/tools/list_changed so the client re-fetches the tool list.
|
|
235
|
+
const dynamicLoadingTools = [
|
|
236
|
+
{
|
|
237
|
+
name: "load_toolset",
|
|
238
|
+
description: 'Dynamically load a toolset into the current session. After loading, the tools become immediately available for use. Based on the "Search+Load" architecture from Dynamic ReAct (arxiv 2509.20386) — the winning pattern for scalable MCP tool selection. Use discover_tools first to find which toolset you need, then call this to activate it.',
|
|
239
|
+
inputSchema: {
|
|
240
|
+
type: "object",
|
|
241
|
+
properties: {
|
|
242
|
+
toolset: {
|
|
243
|
+
type: "string",
|
|
244
|
+
description: `Toolset name to load. Available: ${Object.keys(TOOLSET_MAP).filter(k => !activeToolsets.has(k)).join(", ") || "(all loaded)"}`,
|
|
245
|
+
},
|
|
246
|
+
},
|
|
247
|
+
required: ["toolset"],
|
|
248
|
+
},
|
|
249
|
+
handler: async (args) => {
|
|
250
|
+
const { toolset } = args;
|
|
251
|
+
if (!TOOLSET_MAP[toolset]) {
|
|
252
|
+
return { error: true, message: `Unknown toolset: ${toolset}`, available: Object.keys(TOOLSET_MAP) };
|
|
253
|
+
}
|
|
254
|
+
if (activeToolsets.has(toolset)) {
|
|
255
|
+
return { alreadyLoaded: true, toolset, message: `Toolset '${toolset}' is already active.`, activeToolCount: allTools.length };
|
|
256
|
+
}
|
|
257
|
+
const startMs = Date.now();
|
|
258
|
+
const toolsBefore = allTools.length;
|
|
259
|
+
// Add toolset to active set
|
|
260
|
+
activeToolsets.add(toolset);
|
|
261
|
+
const newTools = TOOLSET_MAP[toolset];
|
|
262
|
+
// Rebuild domain tools from active toolsets
|
|
263
|
+
domainTools = [...activeToolsets].flatMap(k => TOOLSET_MAP[k] ?? []);
|
|
264
|
+
const newMetaTools = createMetaTools(domainTools);
|
|
265
|
+
allToolsWithoutDiscovery = [...domainTools, ...newMetaTools];
|
|
266
|
+
// Rebuild allTools (keep discovery + dynamic loading tools stable)
|
|
267
|
+
rebuildAllTools();
|
|
268
|
+
// Track A/B event
|
|
269
|
+
try {
|
|
270
|
+
const db = getDb();
|
|
271
|
+
db.prepare("INSERT INTO ab_tool_events (id, session_id, event_type, toolset_name, tools_before, tools_after, latency_ms, created_at) VALUES (?, ?, 'load', ?, ?, ?, ?, datetime('now'))").run(genId("abe"), SESSION_ID, toolset, toolsBefore, allTools.length, Date.now() - startMs);
|
|
272
|
+
}
|
|
273
|
+
catch { /* instrumentation must not break tool dispatch */ }
|
|
274
|
+
// Notify client that tool list changed (MCP spec)
|
|
275
|
+
try {
|
|
276
|
+
await server.notification({ method: "notifications/tools/list_changed" });
|
|
277
|
+
}
|
|
278
|
+
catch { /* client may not support notifications */ }
|
|
279
|
+
return {
|
|
280
|
+
loaded: true,
|
|
281
|
+
toolset,
|
|
282
|
+
toolsAdded: newTools.length,
|
|
283
|
+
toolNames: newTools.map(t => t.name),
|
|
284
|
+
activeToolCount: allTools.length,
|
|
285
|
+
activeToolsets: [...activeToolsets],
|
|
286
|
+
_hint: `${newTools.length} tools from '${toolset}' are now available. You can use them directly.`,
|
|
287
|
+
};
|
|
288
|
+
},
|
|
289
|
+
},
|
|
290
|
+
{
|
|
291
|
+
name: "unload_toolset",
|
|
292
|
+
description: "Remove a dynamically loaded toolset from the current session to free up context. Cannot unload toolsets from the initial preset.",
|
|
293
|
+
inputSchema: {
|
|
294
|
+
type: "object",
|
|
295
|
+
properties: {
|
|
296
|
+
toolset: {
|
|
297
|
+
type: "string",
|
|
298
|
+
description: "Toolset name to unload.",
|
|
299
|
+
},
|
|
300
|
+
},
|
|
301
|
+
required: ["toolset"],
|
|
302
|
+
},
|
|
303
|
+
handler: async (args) => {
|
|
304
|
+
const { toolset } = args;
|
|
305
|
+
if (!activeToolsets.has(toolset)) {
|
|
306
|
+
return { error: true, message: `Toolset '${toolset}' is not currently loaded.` };
|
|
307
|
+
}
|
|
308
|
+
if (initialToolsetNames.has(toolset)) {
|
|
309
|
+
return { error: true, message: `Cannot unload '${toolset}' — it's part of the initial preset (${currentPreset}).` };
|
|
310
|
+
}
|
|
311
|
+
const toolsBefore = allTools.length;
|
|
312
|
+
activeToolsets.delete(toolset);
|
|
313
|
+
// Rebuild
|
|
314
|
+
domainTools = [...activeToolsets].flatMap(k => TOOLSET_MAP[k] ?? []);
|
|
315
|
+
const newMetaTools = createMetaTools(domainTools);
|
|
316
|
+
allToolsWithoutDiscovery = [...domainTools, ...newMetaTools];
|
|
317
|
+
rebuildAllTools();
|
|
318
|
+
try {
|
|
319
|
+
const db = getDb();
|
|
320
|
+
db.prepare("INSERT INTO ab_tool_events (id, session_id, event_type, toolset_name, tools_before, tools_after, created_at) VALUES (?, ?, 'unload', ?, ?, ?, datetime('now'))").run(genId("abe"), SESSION_ID, toolset, toolsBefore, allTools.length);
|
|
321
|
+
}
|
|
322
|
+
catch { /* instrumentation */ }
|
|
323
|
+
try {
|
|
324
|
+
await server.notification({ method: "notifications/tools/list_changed" });
|
|
325
|
+
}
|
|
326
|
+
catch { /* client may not support notifications */ }
|
|
327
|
+
return {
|
|
328
|
+
unloaded: true,
|
|
329
|
+
toolset,
|
|
330
|
+
activeToolCount: allTools.length,
|
|
331
|
+
activeToolsets: [...activeToolsets],
|
|
332
|
+
};
|
|
333
|
+
},
|
|
334
|
+
},
|
|
335
|
+
{
|
|
336
|
+
name: "list_available_toolsets",
|
|
337
|
+
description: "List all available toolsets showing which are currently loaded and which can be dynamically added. Includes tool counts and descriptions for each toolset.",
|
|
338
|
+
inputSchema: { type: "object", properties: {} },
|
|
339
|
+
handler: async () => {
|
|
340
|
+
const toolsets = Object.entries(TOOLSET_MAP).map(([name, tools]) => ({
|
|
341
|
+
name,
|
|
342
|
+
toolCount: tools.length,
|
|
343
|
+
loaded: activeToolsets.has(name),
|
|
344
|
+
isInitialPreset: initialToolsetNames.has(name),
|
|
345
|
+
description: PRESET_DESCRIPTIONS[name] ?? null,
|
|
346
|
+
tools: tools.map(t => t.name),
|
|
347
|
+
}));
|
|
348
|
+
const loaded = toolsets.filter(t => t.loaded);
|
|
349
|
+
const available = toolsets.filter(t => !t.loaded);
|
|
350
|
+
return {
|
|
351
|
+
mode: useDynamicLoading ? "dynamic" : "static",
|
|
352
|
+
currentPreset,
|
|
353
|
+
activeToolCount: allTools.length,
|
|
354
|
+
loaded: { count: loaded.length, toolsets: loaded },
|
|
355
|
+
available: { count: available.length, toolsets: available },
|
|
356
|
+
_hint: available.length > 0
|
|
357
|
+
? `${available.length} toolsets available to load. Call load_toolset("<name>") to activate.`
|
|
358
|
+
: "All toolsets are loaded.",
|
|
359
|
+
};
|
|
360
|
+
},
|
|
361
|
+
},
|
|
362
|
+
{
|
|
363
|
+
name: "call_loaded_tool",
|
|
364
|
+
description: 'Call a dynamically loaded tool by name. Use this after load_toolset when your client does not automatically refresh the tool list. Pass the tool name and its arguments. Example: call_loaded_tool({ tool: "analyze_screenshot", args: { imagePath: "screenshot.png" } }). This is a fallback — if the loaded tool appears in your tool list directly, call it directly instead.',
|
|
365
|
+
inputSchema: {
|
|
366
|
+
type: "object",
|
|
367
|
+
properties: {
|
|
368
|
+
tool: {
|
|
369
|
+
type: "string",
|
|
370
|
+
description: "Name of the dynamically loaded tool to call.",
|
|
371
|
+
},
|
|
372
|
+
args: {
|
|
373
|
+
type: "object",
|
|
374
|
+
description: "Arguments to pass to the tool (same as its inputSchema).",
|
|
375
|
+
additionalProperties: true,
|
|
376
|
+
},
|
|
377
|
+
},
|
|
378
|
+
required: ["tool"],
|
|
379
|
+
},
|
|
380
|
+
handler: async (callArgs) => {
|
|
381
|
+
const { tool: toolName, args: toolArgs } = callArgs;
|
|
382
|
+
const target = allTools.find(t => t.name === toolName);
|
|
383
|
+
if (!target) {
|
|
384
|
+
return {
|
|
385
|
+
error: true,
|
|
386
|
+
message: `Tool '${toolName}' not found. It may not be loaded yet.`,
|
|
387
|
+
_hint: "Call list_available_toolsets to see what's available, then load_toolset to activate it.",
|
|
388
|
+
loadedTools: allTools.map(t => t.name),
|
|
389
|
+
};
|
|
390
|
+
}
|
|
391
|
+
// Dispatch to the target tool's handler
|
|
392
|
+
return target.handler(toolArgs ?? {});
|
|
393
|
+
},
|
|
394
|
+
},
|
|
395
|
+
{
|
|
396
|
+
name: "smart_select_tools",
|
|
397
|
+
description: 'LLM-powered tool selection: sends your task description + a compact tool catalog to a fast model (Gemini Flash, GPT-4o-mini, or Claude Haiku) to pick the best 5-10 tools. Much more accurate than keyword search for ambiguous queries like "call an AI model" or "analyze my data". Requires GEMINI_API_KEY, OPENAI_API_KEY, or ANTHROPIC_API_KEY. Falls back to heuristic discover_tools if no API key is set.',
|
|
398
|
+
inputSchema: {
|
|
399
|
+
type: "object",
|
|
400
|
+
properties: {
|
|
401
|
+
task: {
|
|
402
|
+
type: "string",
|
|
403
|
+
description: "Describe what you want to accomplish. Be specific. Example: 'I need to parse a PDF, extract tables, and email a summary'",
|
|
404
|
+
},
|
|
405
|
+
maxTools: {
|
|
406
|
+
type: "number",
|
|
407
|
+
description: "Maximum tools to return (default: 8)",
|
|
408
|
+
},
|
|
409
|
+
provider: {
|
|
410
|
+
type: "string",
|
|
411
|
+
enum: ["auto", "gemini", "openai", "anthropic"],
|
|
412
|
+
description: "Which LLM provider to use. 'auto' (default) picks the first available API key.",
|
|
413
|
+
},
|
|
414
|
+
},
|
|
415
|
+
required: ["task"],
|
|
416
|
+
},
|
|
417
|
+
handler: async (args) => {
|
|
418
|
+
const task = args.task;
|
|
419
|
+
const maxTools = args.maxTools ?? 8;
|
|
420
|
+
const provider = args.provider ?? "auto";
|
|
421
|
+
// Build compact tool catalog: name + category + tags (no descriptions — saves tokens)
|
|
422
|
+
const catalog = ALL_REGISTRY_ENTRIES.map(e => `${e.name} [${e.category}] ${e.tags.slice(0, 5).join(",")}`).join("\n");
|
|
423
|
+
const systemPrompt = `You are a tool selection assistant. Given a task description and a catalog of ${ALL_REGISTRY_ENTRIES.length} tools, pick the ${maxTools} most relevant tools. Return ONLY a JSON array of tool names, nothing else. Example: ["tool_a","tool_b"]`;
|
|
424
|
+
const userPrompt = `Task: ${task}\n\nTool catalog (name [category] tags):\n${catalog}`;
|
|
425
|
+
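// Resolve the provider: an explicit choice is used as-is; "auto" picks the first API key present (Gemini, then OpenAI, then Anthropic), or "none" if no key is set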
|
|
426
|
+
const geminiKey = process.env.GEMINI_API_KEY;
|
|
427
|
+
const openaiKey = process.env.OPENAI_API_KEY;
|
|
428
|
+
const anthropicKey = process.env.ANTHROPIC_API_KEY;
|
|
429
|
+
let selectedProvider = provider;
|
|
430
|
+
if (selectedProvider === "auto") {
|
|
431
|
+
if (geminiKey)
|
|
432
|
+
selectedProvider = "gemini";
|
|
433
|
+
else if (openaiKey)
|
|
434
|
+
selectedProvider = "openai";
|
|
435
|
+
else if (anthropicKey)
|
|
436
|
+
selectedProvider = "anthropic";
|
|
437
|
+
else
|
|
438
|
+
selectedProvider = "none";
|
|
439
|
+
}
|
|
440
|
+
if (selectedProvider === "none") {
|
|
441
|
+
// Fallback: heuristic hybridSearch over the currently loaded tools (searchFullRegistry is enabled in dynamic mode)
|
|
442
|
+
const heuristicResults = hybridSearch(task, allTools.map(t => ({ name: t.name, description: t.description })), {
|
|
443
|
+
limit: maxTools,
|
|
444
|
+
mode: "hybrid",
|
|
445
|
+
searchFullRegistry: useDynamicLoading,
|
|
446
|
+
});
|
|
447
|
+
return {
|
|
448
|
+
method: "heuristic_fallback",
|
|
449
|
+
reason: "No API key found. Set GEMINI_API_KEY, OPENAI_API_KEY, or ANTHROPIC_API_KEY for LLM-powered selection.",
|
|
450
|
+
tools: heuristicResults.map((r) => ({
|
|
451
|
+
name: r.name,
|
|
452
|
+
category: r.category,
|
|
453
|
+
score: r.score,
|
|
454
|
+
quickRef: r.quickRef,
|
|
455
|
+
})),
|
|
456
|
+
_hint: "For better accuracy on ambiguous queries, set an API key to enable LLM-powered selection.",
|
|
457
|
+
};
|
|
458
|
+
}
|
|
459
|
+
try {
|
|
460
|
+
let responseText = "";
|
|
461
|
+
if (selectedProvider === "gemini" && geminiKey) {
|
|
462
|
+
const resp = await fetch(`https://generativelanguage.googleapis.com/v1beta/models/gemini-2.0-flash:generateContent?key=${geminiKey}`, {
|
|
463
|
+
method: "POST",
|
|
464
|
+
headers: { "Content-Type": "application/json" },
|
|
465
|
+
body: JSON.stringify({
|
|
466
|
+
contents: [{ parts: [{ text: `${systemPrompt}\n\n${userPrompt}` }] }],
|
|
467
|
+
generationConfig: { temperature: 0, maxOutputTokens: 512 },
|
|
468
|
+
}),
|
|
469
|
+
});
|
|
470
|
+
const data = await resp.json();
|
|
471
|
+
responseText = data?.candidates?.[0]?.content?.parts?.[0]?.text ?? "";
|
|
472
|
+
}
|
|
473
|
+
else if (selectedProvider === "openai" && openaiKey) {
|
|
474
|
+
const resp = await fetch("https://api.openai.com/v1/chat/completions", {
|
|
475
|
+
method: "POST",
|
|
476
|
+
headers: { "Content-Type": "application/json", Authorization: `Bearer ${openaiKey}` },
|
|
477
|
+
body: JSON.stringify({
|
|
478
|
+
model: "gpt-4o-mini",
|
|
479
|
+
messages: [
|
|
480
|
+
{ role: "system", content: systemPrompt },
|
|
481
|
+
{ role: "user", content: userPrompt },
|
|
482
|
+
],
|
|
483
|
+
temperature: 0,
|
|
484
|
+
max_tokens: 512,
|
|
485
|
+
}),
|
|
486
|
+
});
|
|
487
|
+
const data = await resp.json();
|
|
488
|
+
responseText = data?.choices?.[0]?.message?.content ?? "";
|
|
489
|
+
}
|
|
490
|
+
else if (selectedProvider === "anthropic" && anthropicKey) {
|
|
491
|
+
const resp = await fetch("https://api.anthropic.com/v1/messages", {
|
|
492
|
+
method: "POST",
|
|
493
|
+
headers: {
|
|
494
|
+
"Content-Type": "application/json",
|
|
495
|
+
"x-api-key": anthropicKey,
|
|
496
|
+
"anthropic-version": "2023-06-01",
|
|
497
|
+
},
|
|
498
|
+
body: JSON.stringify({
|
|
499
|
+
model: "claude-3-5-haiku-latest",
|
|
500
|
+
max_tokens: 512,
|
|
501
|
+
system: systemPrompt,
|
|
502
|
+
messages: [{ role: "user", content: userPrompt }],
|
|
503
|
+
}),
|
|
504
|
+
});
|
|
505
|
+
const data = await resp.json();
|
|
506
|
+
responseText = data?.content?.[0]?.text ?? "";
|
|
507
|
+
}
|
|
508
|
+
// Parse the JSON array from the response
|
|
509
|
+
const jsonMatch = responseText.match(/\[[\s\S]*?\]/);
|
|
510
|
+
if (!jsonMatch) {
|
|
511
|
+
return { error: true, message: "LLM did not return a valid JSON array", raw: responseText.slice(0, 200) };
|
|
512
|
+
}
|
|
513
|
+
const selectedNames = JSON.parse(jsonMatch[0]);
|
|
514
|
+
// Enrich with registry metadata
|
|
515
|
+
const enriched = selectedNames
|
|
516
|
+
.map(name => {
|
|
517
|
+
const entry = TOOL_REGISTRY.get(name);
|
|
518
|
+
if (!entry)
|
|
519
|
+
return null;
|
|
520
|
+
return {
|
|
521
|
+
name: entry.name,
|
|
522
|
+
category: entry.category,
|
|
523
|
+
phase: entry.phase,
|
|
524
|
+
tags: entry.tags,
|
|
525
|
+
quickRef: entry.quickRef,
|
|
526
|
+
loaded: allTools.some(t => t.name === name),
|
|
527
|
+
};
|
|
528
|
+
})
|
|
529
|
+
.filter(Boolean);
|
|
530
|
+
// Identify toolsets to load
|
|
531
|
+
const unloadedToolsets = new Map();
|
|
532
|
+
for (const tool of enriched) {
|
|
533
|
+
if (tool && !tool.loaded) {
|
|
534
|
+
const ts = TOOL_TO_TOOLSET.get(tool.name);
|
|
535
|
+
if (ts) {
|
|
536
|
+
const list = unloadedToolsets.get(ts) ?? [];
|
|
537
|
+
list.push(tool.name);
|
|
538
|
+
unloadedToolsets.set(ts, list);
|
|
539
|
+
}
|
|
540
|
+
}
|
|
541
|
+
}
|
|
542
|
+
return {
|
|
543
|
+
method: `llm_${selectedProvider}`,
|
|
544
|
+
task,
|
|
545
|
+
selectedTools: enriched,
|
|
546
|
+
toolCount: enriched.length,
|
|
547
|
+
...(unloadedToolsets.size > 0 ? {
|
|
548
|
+
_loadSuggestions: [...unloadedToolsets.entries()].map(([ts, tools]) => ({
|
|
549
|
+
toolset: ts,
|
|
550
|
+
matchingTools: tools,
|
|
551
|
+
action: `Call load_toolset("${ts}") to activate ${tools.length} tool(s).`,
|
|
552
|
+
})),
|
|
553
|
+
} : {}),
|
|
554
|
+
_hint: enriched.length > 0
|
|
555
|
+
? `Top pick: ${enriched[0].name}. ${enriched[0].quickRef.nextAction}`
|
|
556
|
+
: "No tools selected. Try rephrasing your task.",
|
|
557
|
+
};
|
|
558
|
+
}
|
|
559
|
+
catch (err) {
|
|
560
|
+
return {
|
|
561
|
+
error: true,
|
|
562
|
+
method: `llm_${selectedProvider}`,
|
|
563
|
+
message: `LLM call failed: ${err.message}`,
|
|
564
|
+
_hint: "Falling back to heuristic search. Check your API key.",
|
|
565
|
+
};
|
|
566
|
+
}
|
|
567
|
+
},
|
|
568
|
+
},
|
|
569
|
+
{
|
|
570
|
+
name: "get_ab_test_report",
|
|
571
|
+
description: "Generate an A/B test comparison report for static vs dynamic toolset loading. Shows session counts, tool counts, load events, error rates, and per-toolset load frequency. Use after running sessions in both modes to evaluate the impact of dynamic loading.",
|
|
572
|
+
inputSchema: {
|
|
573
|
+
type: "object",
|
|
574
|
+
properties: {
|
|
575
|
+
detailed: {
|
|
576
|
+
type: "boolean",
|
|
577
|
+
description: "Include per-session breakdown (default: false, summary only)",
|
|
578
|
+
},
|
|
579
|
+
},
|
|
580
|
+
},
|
|
581
|
+
handler: async (args) => {
|
|
582
|
+
const db = getDb();
|
|
583
|
+
const detailed = args.detailed === true;
|
|
584
|
+
// Session-level aggregates by mode
|
|
585
|
+
const sessionSummary = db.prepare(`
|
|
586
|
+
SELECT
|
|
587
|
+
mode,
|
|
588
|
+
COUNT(*) as sessions,
|
|
589
|
+
ROUND(AVG(initial_tool_count), 1) as avg_initial_tools,
|
|
590
|
+
ROUND(AVG(COALESCE(final_tool_count, initial_tool_count)), 1) as avg_final_tools,
|
|
591
|
+
ROUND(AVG(COALESCE(total_tool_calls, 0)), 1) as avg_tool_calls,
|
|
592
|
+
ROUND(AVG(COALESCE(total_load_events, 0)), 1) as avg_load_events,
|
|
593
|
+
ROUND(AVG(COALESCE(session_duration_ms, 0)) / 1000.0, 1) as avg_duration_sec,
|
|
594
|
+
SUM(COALESCE(total_tool_calls, 0)) as total_calls,
|
|
595
|
+
SUM(COALESCE(total_load_events, 0)) as total_loads
|
|
596
|
+
FROM ab_test_sessions
|
|
597
|
+
GROUP BY mode
|
|
598
|
+
`).all();
|
|
599
|
+
// Error rate by mode (join with tool_call_log)
|
|
600
|
+
const errorRates = db.prepare(`
|
|
601
|
+
SELECT
|
|
602
|
+
s.mode,
|
|
603
|
+
COUNT(CASE WHEN t.result_status = 'error' THEN 1 END) as errors,
|
|
604
|
+
COUNT(*) as total_calls,
|
|
605
|
+
ROUND(100.0 * COUNT(CASE WHEN t.result_status = 'error' THEN 1 END) / MAX(COUNT(*), 1), 2) as error_pct
|
|
606
|
+
FROM tool_call_log t
|
|
607
|
+
JOIN ab_test_sessions s ON t.session_id = s.id
|
|
608
|
+
GROUP BY s.mode
|
|
609
|
+
`).all();
|
|
610
|
+
// Top loaded toolsets (dynamic mode)
|
|
611
|
+
const topToolsets = db.prepare(`
|
|
612
|
+
SELECT
|
|
613
|
+
toolset_name,
|
|
614
|
+
COUNT(*) as load_count,
|
|
615
|
+
ROUND(AVG(latency_ms), 1) as avg_latency_ms
|
|
616
|
+
FROM ab_tool_events
|
|
617
|
+
WHERE event_type = 'load'
|
|
618
|
+
GROUP BY toolset_name
|
|
619
|
+
ORDER BY load_count DESC
|
|
620
|
+
LIMIT 10
|
|
621
|
+
`).all();
|
|
622
|
+
// Current session info
|
|
623
|
+
const currentSession = {
|
|
624
|
+
sessionId: SESSION_ID,
|
|
625
|
+
mode: useDynamicLoading ? "dynamic" : "static",
|
|
626
|
+
preset: currentPreset,
|
|
627
|
+
toolCalls: _abToolCallCount,
|
|
628
|
+
loadEvents: _abLoadEventCount,
|
|
629
|
+
activeTools: allTools.length,
|
|
630
|
+
durationSec: Math.round((Date.now() - _abStartMs) / 1000),
|
|
631
|
+
dynamicallyLoaded: [...activeToolsets].filter(ts => !initialToolsetNames.has(ts)),
|
|
632
|
+
};
|
|
633
|
+
// Optional per-session detail
|
|
634
|
+
let sessions = [];
|
|
635
|
+
if (detailed) {
|
|
636
|
+
sessions = db.prepare(`
|
|
637
|
+
SELECT id, mode, initial_preset, initial_tool_count, final_tool_count,
|
|
638
|
+
toolsets_loaded, total_tool_calls, total_load_events,
|
|
639
|
+
session_duration_ms, created_at, ended_at
|
|
640
|
+
FROM ab_test_sessions
|
|
641
|
+
ORDER BY created_at DESC
|
|
642
|
+
LIMIT 50
|
|
643
|
+
`).all();
|
|
644
|
+
}
|
|
645
|
+
// Build verdict
|
|
646
|
+
const staticSummary = sessionSummary.find((s) => s.mode === "static");
|
|
647
|
+
const dynamicSummary = sessionSummary.find((s) => s.mode === "dynamic");
|
|
648
|
+
let verdict = "Insufficient data. Run sessions in both modes to compare.";
|
|
649
|
+
if (staticSummary && dynamicSummary) {
|
|
650
|
+
const toolDiff = (staticSummary.avg_final_tools ?? 0) - (dynamicSummary.avg_final_tools ?? 0);
|
|
651
|
+
const staticErr = errorRates.find((e) => e.mode === "static");
|
|
652
|
+
const dynamicErr = errorRates.find((e) => e.mode === "dynamic");
|
|
653
|
+
const errDiff = (staticErr?.error_pct ?? 0) - (dynamicErr?.error_pct ?? 0);
|
|
654
|
+
verdict = [
|
|
655
|
+
`Static: ${staticSummary.sessions} sessions, avg ${staticSummary.avg_final_tools} tools, ${staticErr?.error_pct ?? "?"}% error rate.`,
|
|
656
|
+
`Dynamic: ${dynamicSummary.sessions} sessions, avg ${dynamicSummary.avg_final_tools} tools, ${dynamicErr?.error_pct ?? "?"}% error rate.`,
|
|
657
|
+
toolDiff > 0 ? `Dynamic uses ${toolDiff.toFixed(1)} fewer tools on average.` : "",
|
|
658
|
+
errDiff > 0 ? `Dynamic has ${errDiff.toFixed(2)}pp lower error rate.` : errDiff < 0 ? `Static has ${(-errDiff).toFixed(2)}pp lower error rate.` : "",
|
|
659
|
+
dynamicSummary.avg_load_events > 0 ? `Agents loaded ${dynamicSummary.avg_load_events} toolsets per session on average.` : "",
|
|
660
|
+
].filter(Boolean).join(" ");
|
|
661
|
+
}
|
|
662
|
+
return {
|
|
663
|
+
verdict,
|
|
664
|
+
sessionSummary,
|
|
665
|
+
errorRates,
|
|
666
|
+
topLoadedToolsets: topToolsets,
|
|
667
|
+
currentSession,
|
|
668
|
+
...(detailed ? { sessions } : {}),
|
|
669
|
+
_hint: sessionSummary.length < 2
|
|
670
|
+
? "Run sessions with both `npx nodebench-mcp` (static) and `npx nodebench-mcp --dynamic` (dynamic) to compare."
|
|
671
|
+
: "Compare avg_final_tools and error_pct between modes to evaluate dynamic loading impact.",
|
|
672
|
+
};
|
|
673
|
+
},
|
|
674
|
+
},
|
|
675
|
+
];
|
|
676
|
+
// Combine all tools (mutable for dynamic loading)
|
|
677
|
+
let allTools = [...allToolsWithoutDiscovery, ...discoveryTools, ...dynamicLoadingTools];
|
|
180
678
|
// Background: initialize embedding index for semantic search (non-blocking)
|
|
181
679
|
// Uses Agent-as-a-Graph bipartite corpus: tool nodes + domain nodes for graph-aware retrieval
|
|
182
680
|
if (useEmbedding) {
|
|
@@ -218,15 +716,25 @@ if (useEmbedding) {
|
|
|
218
716
|
/* Embedding init failed — semantic search stays disabled, no impact on other features */
|
|
219
717
|
});
|
|
220
718
|
}
|
|
221
|
-
// Build a lookup map for fast tool dispatch
|
|
222
|
-
|
|
719
|
+
// Build a lookup map for fast tool dispatch (mutable for dynamic loading)
|
|
720
|
+
let toolMap = new Map();
|
|
223
721
|
for (const tool of allTools) {
|
|
224
722
|
toolMap.set(tool.name, tool);
|
|
225
723
|
}
|
|
724
|
+
/**
 * Rebuild the module-level tool registry used for dynamic loading.
 *
 * Re-reads the three source lists (`allToolsWithoutDiscovery`,
 * `discoveryTools`, `dynamicLoadingTools`), replaces `allTools` with their
 * concatenation, and replaces `toolMap` with a fresh name → tool lookup.
 * When two tools share a name, the later one wins.
 */
function rebuildAllTools() {
    allTools = [...allToolsWithoutDiscovery, ...discoveryTools, ...dynamicLoadingTools];
    // The Map constructor inserts entries in order, the same as calling set() per tool.
    toolMap = new Map(allTools.map((entry) => [entry.name, entry]));
}
|
|
226
732
|
// Auto-instrumentation: generate a session ID per MCP connection
|
|
227
733
|
const SESSION_ID = genId("mcp");
|
|
228
|
-
//
|
|
229
|
-
|
|
734
|
+
// A/B test session-level counters (mutable, finalized on exit)
|
|
735
|
+
let _abToolCallCount = 0;
|
|
736
|
+
let _abLoadEventCount = 0;
|
|
737
|
+
const _abStartMs = Date.now();
|
|
230
738
|
// ── Lightweight hooks: auto-save + attention refresh reminders ─────────
|
|
231
739
|
const _hookState = {
|
|
232
740
|
totalCalls: 0,
|
|
@@ -783,7 +1291,34 @@ artifacts (findings, risks, gaps, tests, evals, gates, learnings) that compound
|
|
|
783
1291
|
],
|
|
784
1292
|
},
|
|
785
1293
|
];
|
|
786
|
-
|
|
1294
|
+
// Server instructions: tells Claude Code Tool Search (and other clients) when to
// search for NodeBench tools. This is the key integration point for lazy loading
// compatibility. See: https://www.anthropic.com/engineering/advanced-tool-use
// Kept as one entry per line so individual capabilities are easy to add or remove.
const SERVER_INSTRUCTIONS = [
    "NodeBench MCP provides structured AI development methodology tools.",
    "Use NodeBench tools when you need to:",
    "- Verify implementations (verification cycles, gap tracking, 6-phase flywheel)",
    "- Run evaluations and quality gates before shipping code",
    "- Search prior knowledge and record learnings across sessions",
    "- Assess risk before taking actions",
    "- Coordinate parallel agents (task locks, roles, context budget)",
    "- Research with structured recon (web search, GitHub, RSS feeds)",
    "- Analyze files (CSV, PDF, XLSX, images, audio, ZIP)",
    "- Run security audits (dependency scanning, code analysis, secrets detection)",
    "- Write and polish academic papers",
    "- Audit SEO, analyze Figma flows, detect Android flicker",
    "- Call LLMs (GPT, Claude, Gemini) for analysis and extraction",
    "Start with discover_tools(\"<your task>\") to find the right tool.",
].join("\n");
// The server advertises tools (with list-changed notifications) and prompts,
// and passes the instructions above to the client.
const server = new Server(
    { name: "nodebench-mcp-methodology", version: "2.18.0" },
    {
        capabilities: { tools: { listChanged: true }, prompts: {} },
        instructions: SERVER_INSTRUCTIONS,
    },
);
|
|
1315
|
+
// ── A/B Test Session Tracking ─────────────────────────────────────────
// Insert one ab_test_sessions row for this process so static and dynamic
// loading runs can be compared later. Best-effort: any failure is ignored.
try {
    const insertSession = getDb().prepare("INSERT INTO ab_test_sessions (id, mode, initial_preset, initial_tool_count, created_at) VALUES (?, ?, ?, ?, datetime('now'))");
    const sessionMode = useDynamicLoading ? 'dynamic' : 'static';
    insertSession.run(SESSION_ID, sessionMode, currentPreset, allTools.length);
}
catch {
    /* instrumentation must not block server start */
}
|
|
787
1322
|
// Handle tools/list — return all tools with their JSON Schema inputSchemas
|
|
788
1323
|
// Includes MCP 2025-11-25 spec annotations: category, phase, complexity (model tier hint)
|
|
789
1324
|
server.setRequestHandler(ListToolsRequestSchema, async () => {
|
|
@@ -809,6 +1344,9 @@ server.setRequestHandler(ListToolsRequestSchema, async () => {
|
|
|
809
1344
|
// Handle tools/call — dispatch to the matching tool handler (auto-instrumented)
|
|
810
1345
|
server.setRequestHandler(CallToolRequestSchema, async (request) => {
|
|
811
1346
|
const { name, arguments: args } = request.params;
|
|
1347
|
+
_abToolCallCount++;
|
|
1348
|
+
if (name === "load_toolset" || name === "unload_toolset")
|
|
1349
|
+
_abLoadEventCount++;
|
|
812
1350
|
const tool = toolMap.get(name);
|
|
813
1351
|
if (!tool) {
|
|
814
1352
|
return {
|
|
@@ -826,7 +1364,7 @@ server.setRequestHandler(CallToolRequestSchema, async (request) => {
|
|
|
826
1364
|
resultStatus = "error";
|
|
827
1365
|
errorMsg = result.message ?? "soft error";
|
|
828
1366
|
}
|
|
829
|
-
// Auto-log (skip self-eval tools to avoid recursion/noise)
|
|
1367
|
+
// Auto-log to main DB (skip self-eval tools to avoid recursion/noise)
|
|
830
1368
|
if (!SKIP_AUTO_LOG.has(name)) {
|
|
831
1369
|
try {
|
|
832
1370
|
const db = getDb();
|
|
@@ -834,6 +1372,17 @@ server.setRequestHandler(CallToolRequestSchema, async (request) => {
|
|
|
834
1372
|
}
|
|
835
1373
|
catch { /* never let instrumentation break tool dispatch */ }
|
|
836
1374
|
}
|
|
1375
|
+
// Auto-log to analytics tracker
|
|
1376
|
+
tracker.record(name, startMs, resultStatus === "success", errorMsg, args);
|
|
1377
|
+
// Inline A/B session counter update (every 5 calls — amortized cost)
|
|
1378
|
+
if (_abToolCallCount % 5 === 0) {
|
|
1379
|
+
try {
|
|
1380
|
+
const db2 = getDb();
|
|
1381
|
+
const dynamicallyLoaded = [...activeToolsets].filter(ts => !initialToolsetNames.has(ts));
|
|
1382
|
+
db2.prepare("UPDATE ab_test_sessions SET total_tool_calls = ?, total_load_events = ?, final_tool_count = ?, toolsets_loaded = ? WHERE id = ?").run(_abToolCallCount, _abLoadEventCount, allTools.length, JSON.stringify(dynamicallyLoaded), SESSION_ID);
|
|
1383
|
+
}
|
|
1384
|
+
catch { /* instrumentation */ }
|
|
1385
|
+
}
|
|
837
1386
|
// Tools with rawContent return ContentBlock[] directly (e.g. image captures)
|
|
838
1387
|
if (tool.rawContent && Array.isArray(result)) {
|
|
839
1388
|
return { content: result, isError: false };
|
|
@@ -875,7 +1424,7 @@ server.setRequestHandler(CallToolRequestSchema, async (request) => {
|
|
|
875
1424
|
catch (err) {
|
|
876
1425
|
resultStatus = "error";
|
|
877
1426
|
errorMsg = err?.message || "Internal error";
|
|
878
|
-
// Auto-log errors
|
|
1427
|
+
// Auto-log errors to main DB
|
|
879
1428
|
if (!SKIP_AUTO_LOG.has(name)) {
|
|
880
1429
|
try {
|
|
881
1430
|
const db = getDb();
|
|
@@ -883,6 +1432,8 @@ server.setRequestHandler(CallToolRequestSchema, async (request) => {
|
|
|
883
1432
|
}
|
|
884
1433
|
catch { /* never let instrumentation break tool dispatch */ }
|
|
885
1434
|
}
|
|
1435
|
+
// Auto-log error to analytics tracker
|
|
1436
|
+
tracker.record(name, startMs, false, errorMsg, args);
|
|
886
1437
|
return {
|
|
887
1438
|
content: [{ type: "text", text: errorMsg }],
|
|
888
1439
|
isError: true,
|
|
@@ -914,6 +1465,24 @@ server.setRequestHandler(GetPromptRequestSchema, async (request) => {
|
|
|
914
1465
|
messages,
|
|
915
1466
|
};
|
|
916
1467
|
});
|
|
1468
|
+
// Graceful shutdown: close analytics tracker + finalize A/B session on exit.
// Everything here is written as plain synchronous calls (no await).
// NOTE(review): assumes tracker.close() and the DB API are synchronous; confirm.
process.on('exit', () => {
    try {
        tracker.close();
    }
    finally {
        // Finalize the A/B test session with aggregate metrics. This runs in
        // `finally` so a throwing tracker.close() cannot leave the session row
        // without its final counters; the original error still propagates.
        try {
            const db = getDb();
            // Toolsets active now that were not part of the initial set.
            const dynamicallyLoaded = [...activeToolsets].filter(ts => !initialToolsetNames.has(ts));
            db.prepare(`UPDATE ab_test_sessions SET
                final_tool_count = ?,
                toolsets_loaded = ?,
                total_tool_calls = ?,
                total_load_events = ?,
                session_duration_ms = ?,
                ended_at = datetime('now')
                WHERE id = ?`).run(allTools.length, JSON.stringify(dynamicallyLoaded), _abToolCallCount, _abLoadEventCount, Date.now() - _abStartMs, SESSION_ID);
        }
        catch { /* instrumentation must not block shutdown */ }
    }
});
|
|
917
1486
|
// Connect via stdio
|
|
918
1487
|
const transport = new StdioServerTransport();
|
|
919
1488
|
await server.connect(transport);
|