nodebench-mcp 3.0.0 → 3.0.1
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/dist/dashboard/operatingDashboardHtml.js +2 -1
- package/dist/dashboard/operatingDashboardHtml.js.map +1 -1
- package/dist/dashboard/operatingServer.js +3 -2
- package/dist/dashboard/operatingServer.js.map +1 -1
- package/dist/db.js +51 -3
- package/dist/db.js.map +1 -1
- package/dist/index.js +13 -16
- package/dist/index.js.map +1 -1
- package/dist/packageInfo.d.ts +3 -0
- package/dist/packageInfo.js +32 -0
- package/dist/packageInfo.js.map +1 -0
- package/dist/sandboxApi.js +2 -1
- package/dist/sandboxApi.js.map +1 -1
- package/dist/tools/boilerplateTools.js +10 -9
- package/dist/tools/boilerplateTools.js.map +1 -1
- package/dist/tools/documentationTools.js +2 -1
- package/dist/tools/documentationTools.js.map +1 -1
- package/dist/tools/progressiveDiscoveryTools.js +2 -1
- package/dist/tools/progressiveDiscoveryTools.js.map +1 -1
- package/dist/tools/toolRegistry.js +11 -0
- package/dist/tools/toolRegistry.js.map +1 -1
- package/dist/toolsetRegistry.js +74 -1
- package/dist/toolsetRegistry.js.map +1 -1
- package/package.json +4 -3
- package/dist/__tests__/analytics.test.d.ts +0 -11
- package/dist/__tests__/analytics.test.js +0 -546
- package/dist/__tests__/analytics.test.js.map +0 -1
- package/dist/__tests__/architectComplex.test.d.ts +0 -1
- package/dist/__tests__/architectComplex.test.js +0 -373
- package/dist/__tests__/architectComplex.test.js.map +0 -1
- package/dist/__tests__/architectSmoke.test.d.ts +0 -1
- package/dist/__tests__/architectSmoke.test.js +0 -92
- package/dist/__tests__/architectSmoke.test.js.map +0 -1
- package/dist/__tests__/audit-registry.d.ts +0 -1
- package/dist/__tests__/audit-registry.js +0 -60
- package/dist/__tests__/audit-registry.js.map +0 -1
- package/dist/__tests__/batchAutopilot.test.d.ts +0 -8
- package/dist/__tests__/batchAutopilot.test.js +0 -218
- package/dist/__tests__/batchAutopilot.test.js.map +0 -1
- package/dist/__tests__/cliSubcommands.test.d.ts +0 -1
- package/dist/__tests__/cliSubcommands.test.js +0 -138
- package/dist/__tests__/cliSubcommands.test.js.map +0 -1
- package/dist/__tests__/comparativeBench.test.d.ts +0 -1
- package/dist/__tests__/comparativeBench.test.js +0 -722
- package/dist/__tests__/comparativeBench.test.js.map +0 -1
- package/dist/__tests__/critterCalibrationEval.d.ts +0 -8
- package/dist/__tests__/critterCalibrationEval.js +0 -370
- package/dist/__tests__/critterCalibrationEval.js.map +0 -1
- package/dist/__tests__/dynamicLoading.test.d.ts +0 -1
- package/dist/__tests__/dynamicLoading.test.js +0 -280
- package/dist/__tests__/dynamicLoading.test.js.map +0 -1
- package/dist/__tests__/embeddingProvider.test.d.ts +0 -1
- package/dist/__tests__/embeddingProvider.test.js +0 -86
- package/dist/__tests__/embeddingProvider.test.js.map +0 -1
- package/dist/__tests__/evalDatasetBench.test.d.ts +0 -1
- package/dist/__tests__/evalDatasetBench.test.js +0 -738
- package/dist/__tests__/evalDatasetBench.test.js.map +0 -1
- package/dist/__tests__/evalHarness.test.d.ts +0 -1
- package/dist/__tests__/evalHarness.test.js +0 -1107
- package/dist/__tests__/evalHarness.test.js.map +0 -1
- package/dist/__tests__/fixtures/bfcl_v3_long_context.sample.json +0 -264
- package/dist/__tests__/fixtures/generateBfclLongContextFixture.d.ts +0 -10
- package/dist/__tests__/fixtures/generateBfclLongContextFixture.js +0 -135
- package/dist/__tests__/fixtures/generateBfclLongContextFixture.js.map +0 -1
- package/dist/__tests__/fixtures/generateSwebenchVerifiedFixture.d.ts +0 -14
- package/dist/__tests__/fixtures/generateSwebenchVerifiedFixture.js +0 -189
- package/dist/__tests__/fixtures/generateSwebenchVerifiedFixture.js.map +0 -1
- package/dist/__tests__/fixtures/generateToolbenchInstructionFixture.d.ts +0 -16
- package/dist/__tests__/fixtures/generateToolbenchInstructionFixture.js +0 -154
- package/dist/__tests__/fixtures/generateToolbenchInstructionFixture.js.map +0 -1
- package/dist/__tests__/fixtures/swebench_verified.sample.json +0 -162
- package/dist/__tests__/fixtures/toolbench_instruction.sample.json +0 -109
- package/dist/__tests__/forecastingDogfood.test.d.ts +0 -9
- package/dist/__tests__/forecastingDogfood.test.js +0 -284
- package/dist/__tests__/forecastingDogfood.test.js.map +0 -1
- package/dist/__tests__/forecastingScoring.test.d.ts +0 -9
- package/dist/__tests__/forecastingScoring.test.js +0 -202
- package/dist/__tests__/forecastingScoring.test.js.map +0 -1
- package/dist/__tests__/gaiaCapabilityAudioEval.test.d.ts +0 -15
- package/dist/__tests__/gaiaCapabilityAudioEval.test.js +0 -265
- package/dist/__tests__/gaiaCapabilityAudioEval.test.js.map +0 -1
- package/dist/__tests__/gaiaCapabilityEval.test.d.ts +0 -14
- package/dist/__tests__/gaiaCapabilityEval.test.js +0 -1259
- package/dist/__tests__/gaiaCapabilityEval.test.js.map +0 -1
- package/dist/__tests__/gaiaCapabilityFilesEval.test.d.ts +0 -15
- package/dist/__tests__/gaiaCapabilityFilesEval.test.js +0 -914
- package/dist/__tests__/gaiaCapabilityFilesEval.test.js.map +0 -1
- package/dist/__tests__/gaiaCapabilityMediaEval.test.d.ts +0 -15
- package/dist/__tests__/gaiaCapabilityMediaEval.test.js +0 -1101
- package/dist/__tests__/gaiaCapabilityMediaEval.test.js.map +0 -1
- package/dist/__tests__/helpers/answerMatch.d.ts +0 -41
- package/dist/__tests__/helpers/answerMatch.js +0 -267
- package/dist/__tests__/helpers/answerMatch.js.map +0 -1
- package/dist/__tests__/helpers/textLlm.d.ts +0 -25
- package/dist/__tests__/helpers/textLlm.js +0 -214
- package/dist/__tests__/helpers/textLlm.js.map +0 -1
- package/dist/__tests__/localDashboard.test.d.ts +0 -1
- package/dist/__tests__/localDashboard.test.js +0 -226
- package/dist/__tests__/localDashboard.test.js.map +0 -1
- package/dist/__tests__/multiHopDogfood.test.d.ts +0 -12
- package/dist/__tests__/multiHopDogfood.test.js +0 -303
- package/dist/__tests__/multiHopDogfood.test.js.map +0 -1
- package/dist/__tests__/openDatasetParallelEval.test.d.ts +0 -7
- package/dist/__tests__/openDatasetParallelEval.test.js +0 -209
- package/dist/__tests__/openDatasetParallelEval.test.js.map +0 -1
- package/dist/__tests__/openDatasetParallelEvalGaia.test.d.ts +0 -7
- package/dist/__tests__/openDatasetParallelEvalGaia.test.js +0 -279
- package/dist/__tests__/openDatasetParallelEvalGaia.test.js.map +0 -1
- package/dist/__tests__/openDatasetParallelEvalSwebench.test.d.ts +0 -7
- package/dist/__tests__/openDatasetParallelEvalSwebench.test.js +0 -220
- package/dist/__tests__/openDatasetParallelEvalSwebench.test.js.map +0 -1
- package/dist/__tests__/openDatasetParallelEvalToolbench.test.d.ts +0 -7
- package/dist/__tests__/openDatasetParallelEvalToolbench.test.js +0 -218
- package/dist/__tests__/openDatasetParallelEvalToolbench.test.js.map +0 -1
- package/dist/__tests__/openDatasetPerfComparison.test.d.ts +0 -10
- package/dist/__tests__/openDatasetPerfComparison.test.js +0 -318
- package/dist/__tests__/openDatasetPerfComparison.test.js.map +0 -1
- package/dist/__tests__/openclawDogfood.test.d.ts +0 -23
- package/dist/__tests__/openclawDogfood.test.js +0 -535
- package/dist/__tests__/openclawDogfood.test.js.map +0 -1
- package/dist/__tests__/openclawMessaging.test.d.ts +0 -14
- package/dist/__tests__/openclawMessaging.test.js +0 -232
- package/dist/__tests__/openclawMessaging.test.js.map +0 -1
- package/dist/__tests__/presetRealWorldBench.test.d.ts +0 -1
- package/dist/__tests__/presetRealWorldBench.test.js +0 -859
- package/dist/__tests__/presetRealWorldBench.test.js.map +0 -1
- package/dist/__tests__/tools.test.d.ts +0 -1
- package/dist/__tests__/tools.test.js +0 -3201
- package/dist/__tests__/tools.test.js.map +0 -1
- package/dist/__tests__/toolsetGatingEval.test.d.ts +0 -1
- package/dist/__tests__/toolsetGatingEval.test.js +0 -1099
- package/dist/__tests__/toolsetGatingEval.test.js.map +0 -1
- package/dist/__tests__/traceabilityDogfood.test.d.ts +0 -12
- package/dist/__tests__/traceabilityDogfood.test.js +0 -241
- package/dist/__tests__/traceabilityDogfood.test.js.map +0 -1
- package/dist/__tests__/webmcpTools.test.d.ts +0 -7
- package/dist/__tests__/webmcpTools.test.js +0 -195
- package/dist/__tests__/webmcpTools.test.js.map +0 -1
- package/dist/benchmarks/testProviderBus.d.ts +0 -7
- package/dist/benchmarks/testProviderBus.js +0 -272
- package/dist/benchmarks/testProviderBus.js.map +0 -1
- package/dist/hooks/postCompaction.d.ts +0 -14
- package/dist/hooks/postCompaction.js +0 -51
- package/dist/hooks/postCompaction.js.map +0 -1
- package/dist/security/__tests__/security.test.d.ts +0 -8
- package/dist/security/__tests__/security.test.js +0 -295
- package/dist/security/__tests__/security.test.js.map +0 -1
- package/dist/sync/hyperloopEval.test.d.ts +0 -4
- package/dist/sync/hyperloopEval.test.js +0 -60
- package/dist/sync/hyperloopEval.test.js.map +0 -1
- package/dist/sync/store.test.d.ts +0 -4
- package/dist/sync/store.test.js +0 -43
- package/dist/sync/store.test.js.map +0 -1
- package/dist/tools/documentTools.d.ts +0 -5
- package/dist/tools/documentTools.js +0 -524
- package/dist/tools/documentTools.js.map +0 -1
- package/dist/tools/financialTools.d.ts +0 -10
- package/dist/tools/financialTools.js +0 -403
- package/dist/tools/financialTools.js.map +0 -1
- package/dist/tools/memoryTools.d.ts +0 -5
- package/dist/tools/memoryTools.js +0 -137
- package/dist/tools/memoryTools.js.map +0 -1
- package/dist/tools/planningTools.d.ts +0 -5
- package/dist/tools/planningTools.js +0 -147
- package/dist/tools/planningTools.js.map +0 -1
- package/dist/tools/searchTools.d.ts +0 -5
- package/dist/tools/searchTools.js +0 -145
- package/dist/tools/searchTools.js.map +0 -1
|
@@ -1,3201 +0,0 @@
|
|
|
1
|
-
/**
|
|
2
|
-
* Automated tests for NodeBench MCP tools.
|
|
3
|
-
* Covers: static, unit, integration layers.
|
|
4
|
-
* Live E2E layer is tested via bash pipe in the flywheel step.
|
|
5
|
-
*/
|
|
6
|
-
import { describe, it, expect, afterEach } from "vitest";
|
|
7
|
-
import os from "node:os";
|
|
8
|
-
import path from "node:path";
|
|
9
|
-
import { mkdtemp, writeFile } from "node:fs/promises";
|
|
10
|
-
import { existsSync } from "node:fs";
|
|
11
|
-
import { verificationTools } from "../tools/verificationTools.js";
|
|
12
|
-
import { reconTools } from "../tools/reconTools.js";
|
|
13
|
-
import { uiCaptureTools } from "../tools/uiCaptureTools.js";
|
|
14
|
-
import { visionTools } from "../tools/visionTools.js";
|
|
15
|
-
import { evalTools } from "../tools/evalTools.js";
|
|
16
|
-
import { qualityGateTools } from "../tools/qualityGateTools.js";
|
|
17
|
-
import { flywheelTools } from "../tools/flywheelTools.js";
|
|
18
|
-
import { learningTools } from "../tools/learningTools.js";
|
|
19
|
-
import { createMetaTools } from "../tools/metaTools.js";
|
|
20
|
-
import { webTools } from "../tools/webTools.js";
|
|
21
|
-
import { githubTools } from "../tools/githubTools.js";
|
|
22
|
-
import { documentationTools } from "../tools/documentationTools.js";
|
|
23
|
-
import { agentBootstrapTools } from "../tools/agentBootstrapTools.js";
|
|
24
|
-
import { selfEvalTools } from "../tools/selfEvalTools.js";
|
|
25
|
-
import { parallelAgentTools } from "../tools/parallelAgentTools.js";
|
|
26
|
-
import { llmTools } from "../tools/llmTools.js";
|
|
27
|
-
import { securityTools } from "../tools/securityTools.js";
|
|
28
|
-
import { platformTools } from "../tools/platformTools.js";
|
|
29
|
-
import { localFileTools, gaiaMediaSolvers } from "../tools/localFileTools.js";
|
|
30
|
-
import { researchWritingTools } from "../tools/researchWritingTools.js";
|
|
31
|
-
import { flickerDetectionTools } from "../tools/flickerDetectionTools.js";
|
|
32
|
-
import { figmaFlowTools } from "../tools/figmaFlowTools.js";
|
|
33
|
-
import { createProgressiveDiscoveryTools } from "../tools/progressiveDiscoveryTools.js";
|
|
34
|
-
import { boilerplateTools } from "../tools/boilerplateTools.js";
|
|
35
|
-
import { cCompilerBenchmarkTools } from "../tools/cCompilerBenchmarkTools.js";
|
|
36
|
-
import { sessionMemoryTools } from "../tools/sessionMemoryTools.js";
|
|
37
|
-
import { toonTools } from "../tools/toonTools.js";
|
|
38
|
-
import { patternTools } from "../tools/patternTools.js";
|
|
39
|
-
import { gitWorkflowTools } from "../tools/gitWorkflowTools.js";
|
|
40
|
-
import { seoTools } from "../tools/seoTools.js";
|
|
41
|
-
import { voiceBridgeTools } from "../tools/voiceBridgeTools.js";
|
|
42
|
-
import { critterTools } from "../tools/critterTools.js";
|
|
43
|
-
import { emailTools } from "../tools/emailTools.js";
|
|
44
|
-
import { rssTools } from "../tools/rssTools.js";
|
|
45
|
-
import { architectTools } from "../tools/architectTools.js";
|
|
46
|
-
import { uiUxDiveTools } from "../tools/uiUxDiveTools.js";
|
|
47
|
-
import { mcpBridgeTools } from "../tools/mcpBridgeTools.js";
|
|
48
|
-
import { uiUxDiveAdvancedTools } from "../tools/uiUxDiveAdvancedTools.js";
|
|
49
|
-
import { skillUpdateTools } from "../tools/skillUpdateTools.js";
|
|
50
|
-
import { overstoryTools } from "../tools/overstoryTools.js";
|
|
51
|
-
import { getQuickRef, hybridSearch, TOOL_REGISTRY, SEARCH_MODES, ALL_REGISTRY_ENTRIES, WORKFLOW_CHAINS, tokenize, buildDenseIndex, getToolComplexity } from "../tools/toolRegistry.js";
|
|
52
|
-
// Assemble all tools like index.ts does
|
|
53
|
-
const domainTools = [
|
|
54
|
-
...verificationTools,
|
|
55
|
-
...evalTools,
|
|
56
|
-
...qualityGateTools,
|
|
57
|
-
...learningTools,
|
|
58
|
-
...flywheelTools,
|
|
59
|
-
...reconTools,
|
|
60
|
-
...uiCaptureTools,
|
|
61
|
-
...visionTools,
|
|
62
|
-
...localFileTools,
|
|
63
|
-
...gaiaMediaSolvers,
|
|
64
|
-
...webTools,
|
|
65
|
-
...githubTools,
|
|
66
|
-
...documentationTools,
|
|
67
|
-
...agentBootstrapTools,
|
|
68
|
-
...selfEvalTools,
|
|
69
|
-
...parallelAgentTools,
|
|
70
|
-
...llmTools,
|
|
71
|
-
...securityTools,
|
|
72
|
-
...platformTools,
|
|
73
|
-
...researchWritingTools,
|
|
74
|
-
...flickerDetectionTools,
|
|
75
|
-
...figmaFlowTools,
|
|
76
|
-
...boilerplateTools,
|
|
77
|
-
...cCompilerBenchmarkTools,
|
|
78
|
-
...sessionMemoryTools,
|
|
79
|
-
...toonTools,
|
|
80
|
-
...patternTools,
|
|
81
|
-
...gitWorkflowTools,
|
|
82
|
-
...seoTools,
|
|
83
|
-
...voiceBridgeTools,
|
|
84
|
-
...critterTools,
|
|
85
|
-
...emailTools,
|
|
86
|
-
...rssTools,
|
|
87
|
-
...architectTools,
|
|
88
|
-
...uiUxDiveTools,
|
|
89
|
-
...mcpBridgeTools,
|
|
90
|
-
...uiUxDiveAdvancedTools,
|
|
91
|
-
...skillUpdateTools,
|
|
92
|
-
...overstoryTools,
|
|
93
|
-
];
|
|
94
|
-
const metaTools = createMetaTools(domainTools);
|
|
95
|
-
const allToolsWithoutDiscovery = [...domainTools, ...metaTools];
|
|
96
|
-
const discoveryTools = createProgressiveDiscoveryTools(allToolsWithoutDiscovery.map((t) => ({ name: t.name, description: t.description })));
|
|
97
|
-
const allTools = [...allToolsWithoutDiscovery, ...discoveryTools];
|
|
98
|
-
// ═══════════════════════════════════════════════════════════════════════════
|
|
99
|
-
// STATIC LAYER — structure validation
|
|
100
|
-
// ═══════════════════════════════════════════════════════════════════════════
|
|
101
|
-
describe("Static: tool structure", () => {
|
|
102
|
-
it("should have 175 tools total", () => {
|
|
103
|
-
// domain tools + 3 meta tools (findTools, getMethodology, check_mcp_setup) + 3 progressive discovery tools
|
|
104
|
-
expect(allTools.length).toBe(213);
|
|
105
|
-
});
|
|
106
|
-
it("every tool has name, description, inputSchema, handler", () => {
|
|
107
|
-
for (const tool of allTools) {
|
|
108
|
-
expect(tool.name).toBeTruthy();
|
|
109
|
-
expect(typeof tool.name).toBe("string");
|
|
110
|
-
expect(tool.description).toBeTruthy();
|
|
111
|
-
expect(typeof tool.description).toBe("string");
|
|
112
|
-
expect(tool.inputSchema).toBeTruthy();
|
|
113
|
-
expect(typeof tool.handler).toBe("function");
|
|
114
|
-
}
|
|
115
|
-
});
|
|
116
|
-
it("tool names are unique", () => {
|
|
117
|
-
const names = allTools.map((t) => t.name);
|
|
118
|
-
expect(new Set(names).size).toBe(names.length);
|
|
119
|
-
});
|
|
120
|
-
it("every registered tool has MCP annotations (category, phase, complexity)", () => {
|
|
121
|
-
// MCP 2025-11-25 spec: annotations field provides behavior hints for models.
|
|
122
|
-
// We surface category, phase, complexity from the registry as annotations.
|
|
123
|
-
for (const tool of allTools) {
|
|
124
|
-
const entry = TOOL_REGISTRY.get(tool.name);
|
|
125
|
-
expect(entry, `Missing registry entry for ${tool.name}`).toBeDefined();
|
|
126
|
-
expect(entry.category).toBeTruthy();
|
|
127
|
-
expect(entry.phase).toBeTruthy();
|
|
128
|
-
const complexity = getToolComplexity(tool.name);
|
|
129
|
-
expect(["low", "medium", "high"]).toContain(complexity);
|
|
130
|
-
}
|
|
131
|
-
});
|
|
132
|
-
it("abandon_cycle tool exists in verificationTools", () => {
|
|
133
|
-
const tool = verificationTools.find((t) => t.name === "abandon_cycle");
|
|
134
|
-
expect(tool).toBeDefined();
|
|
135
|
-
expect(tool.inputSchema.required).toContain("cycleId");
|
|
136
|
-
});
|
|
137
|
-
});
|
|
138
|
-
describe("Static: ui_ux_qa preset", () => {
|
|
139
|
-
it("should return 8 rules from get_gate_preset", async () => {
|
|
140
|
-
const tool = findTool("get_gate_preset");
|
|
141
|
-
const result = (await tool.handler({ preset: "ui_ux_qa" }));
|
|
142
|
-
expect(result.preset).toBe("ui_ux_qa");
|
|
143
|
-
expect(result.ruleCount).toBe(8);
|
|
144
|
-
expect(result.rules.map((r) => r.name)).toContain("component_renders");
|
|
145
|
-
expect(result.rules.map((r) => r.name)).toContain("keyboard_navigable");
|
|
146
|
-
expect(result.rules.map((r) => r.name)).toContain("aria_labels_present");
|
|
147
|
-
expect(result.rules.map((r) => r.name)).toContain("storybook_story_exists");
|
|
148
|
-
});
|
|
149
|
-
it("should accept ui_ux_qa gate results via run_quality_gate", async () => {
|
|
150
|
-
const tool = findTool("run_quality_gate");
|
|
151
|
-
const result = (await tool.handler({
|
|
152
|
-
gateName: "ui_ux_qa",
|
|
153
|
-
target: "TestComponent",
|
|
154
|
-
rules: [
|
|
155
|
-
{ name: "component_renders", passed: true },
|
|
156
|
-
{ name: "responsive_check", passed: true },
|
|
157
|
-
{ name: "keyboard_navigable", passed: false },
|
|
158
|
-
{ name: "aria_labels_present", passed: true },
|
|
159
|
-
{ name: "loading_states_handled", passed: true },
|
|
160
|
-
{ name: "no_console_errors", passed: true },
|
|
161
|
-
{ name: "visual_consistency", passed: true },
|
|
162
|
-
{ name: "storybook_story_exists", passed: false },
|
|
163
|
-
],
|
|
164
|
-
}));
|
|
165
|
-
expect(result.passed).toBe(false);
|
|
166
|
-
expect(result.totalRules).toBe(8);
|
|
167
|
-
expect(result.passedCount).toBe(6);
|
|
168
|
-
expect(result.failures).toContain("keyboard_navigable");
|
|
169
|
-
expect(result.failures).toContain("storybook_story_exists");
|
|
170
|
-
});
|
|
171
|
-
});
|
|
172
|
-
describe("Static: ui capture tools", () => {
|
|
173
|
-
it("should include capture_ui_screenshot and capture_responsive_suite", () => {
|
|
174
|
-
const names = allTools.map((t) => t.name);
|
|
175
|
-
expect(names).toContain("capture_ui_screenshot");
|
|
176
|
-
expect(names).toContain("capture_responsive_suite");
|
|
177
|
-
});
|
|
178
|
-
it("capture_ui_screenshot requires url parameter", () => {
|
|
179
|
-
const tool = allTools.find((t) => t.name === "capture_ui_screenshot");
|
|
180
|
-
expect(tool.inputSchema.required).toContain("url");
|
|
181
|
-
});
|
|
182
|
-
it("capture_responsive_suite requires url and label", () => {
|
|
183
|
-
const tool = allTools.find((t) => t.name === "capture_responsive_suite");
|
|
184
|
-
expect(tool.inputSchema.required).toContain("url");
|
|
185
|
-
expect(tool.inputSchema.required).toContain("label");
|
|
186
|
-
});
|
|
187
|
-
it("capture_ui_screenshot has viewport enum with expected presets", () => {
|
|
188
|
-
const tool = allTools.find((t) => t.name === "capture_ui_screenshot");
|
|
189
|
-
const viewportProp = tool.inputSchema.properties.viewport;
|
|
190
|
-
expect(viewportProp.enum).toContain("mobile");
|
|
191
|
-
expect(viewportProp.enum).toContain("tablet");
|
|
192
|
-
expect(viewportProp.enum).toContain("desktop");
|
|
193
|
-
expect(viewportProp.enum).toContain("wide");
|
|
194
|
-
expect(viewportProp.enum).toContain("custom");
|
|
195
|
-
});
|
|
196
|
-
});
|
|
197
|
-
describe("Static: vision tools", () => {
|
|
198
|
-
it("should include discover_vision_env, analyze_screenshot, manipulate_screenshot", () => {
|
|
199
|
-
const names = allTools.map((t) => t.name);
|
|
200
|
-
expect(names).toContain("discover_vision_env");
|
|
201
|
-
expect(names).toContain("analyze_screenshot");
|
|
202
|
-
expect(names).toContain("manipulate_screenshot");
|
|
203
|
-
});
|
|
204
|
-
it("analyze_screenshot requires imageBase64 and has rawContent", () => {
|
|
205
|
-
const tool = allTools.find((t) => t.name === "analyze_screenshot");
|
|
206
|
-
expect(tool.inputSchema.required).toContain("imageBase64");
|
|
207
|
-
expect(tool.rawContent).toBe(true);
|
|
208
|
-
});
|
|
209
|
-
it("manipulate_screenshot requires imageBase64 and operation", () => {
|
|
210
|
-
const tool = allTools.find((t) => t.name === "manipulate_screenshot");
|
|
211
|
-
expect(tool.inputSchema.required).toContain("imageBase64");
|
|
212
|
-
expect(tool.inputSchema.required).toContain("operation");
|
|
213
|
-
expect(tool.rawContent).toBe(true);
|
|
214
|
-
});
|
|
215
|
-
it("discover_vision_env has no required params", () => {
|
|
216
|
-
const tool = allTools.find((t) => t.name === "discover_vision_env");
|
|
217
|
-
const required = tool.inputSchema.required;
|
|
218
|
-
expect(required ?? []).toEqual([]);
|
|
219
|
-
});
|
|
220
|
-
});
|
|
221
|
-
describe("Unit: discover_vision_env", () => {
|
|
222
|
-
// Skip in CI - dynamic imports for SDK detection can timeout unpredictably
|
|
223
|
-
it.skip("should return environment scan without errors", async () => {
|
|
224
|
-
const tool = allTools.find((t) => t.name === "discover_vision_env");
|
|
225
|
-
const result = (await tool.handler({}));
|
|
226
|
-
expect(result).toHaveProperty("apiKeys");
|
|
227
|
-
expect(result).toHaveProperty("sdks");
|
|
228
|
-
expect(result).toHaveProperty("providers");
|
|
229
|
-
expect(result).toHaveProperty("canAnalyze");
|
|
230
|
-
expect(result).toHaveProperty("canManipulate");
|
|
231
|
-
expect(result).toHaveProperty("canCapture");
|
|
232
|
-
expect(typeof result.canAnalyze).toBe("boolean");
|
|
233
|
-
}, 30000);
|
|
234
|
-
});
|
|
235
|
-
describe("Static: agentic_vision methodology", () => {
|
|
236
|
-
it("should return agentic_vision methodology from getMethodology", async () => {
|
|
237
|
-
const tool = allTools.find((t) => t.name === "getMethodology");
|
|
238
|
-
const result = (await tool.handler({ topic: "agentic_vision" }));
|
|
239
|
-
expect(result.title).toContain("Agentic Vision");
|
|
240
|
-
expect(result.steps.length).toBe(6);
|
|
241
|
-
expect(result.steps[0].name).toBe("Discover");
|
|
242
|
-
expect(result.steps[2].name).toBe("Analyze");
|
|
243
|
-
});
|
|
244
|
-
});
|
|
245
|
-
describe("Static: web tools", () => {
|
|
246
|
-
it("should include web_search and fetch_url tools", () => {
|
|
247
|
-
const names = allTools.map((t) => t.name);
|
|
248
|
-
expect(names).toContain("web_search");
|
|
249
|
-
expect(names).toContain("fetch_url");
|
|
250
|
-
});
|
|
251
|
-
it("web_search requires query parameter", () => {
|
|
252
|
-
const tool = allTools.find((t) => t.name === "web_search");
|
|
253
|
-
expect(tool.inputSchema.required).toContain("query");
|
|
254
|
-
});
|
|
255
|
-
it("fetch_url requires url parameter", () => {
|
|
256
|
-
const tool = allTools.find((t) => t.name === "fetch_url");
|
|
257
|
-
expect(tool.inputSchema.required).toContain("url");
|
|
258
|
-
});
|
|
259
|
-
});
|
|
260
|
-
describe("Static: github tools", () => {
|
|
261
|
-
it("should include search_github and analyze_repo tools", () => {
|
|
262
|
-
const names = allTools.map((t) => t.name);
|
|
263
|
-
expect(names).toContain("search_github");
|
|
264
|
-
expect(names).toContain("analyze_repo");
|
|
265
|
-
});
|
|
266
|
-
it("search_github requires query parameter", () => {
|
|
267
|
-
const tool = allTools.find((t) => t.name === "search_github");
|
|
268
|
-
expect(tool.inputSchema.required).toContain("query");
|
|
269
|
-
});
|
|
270
|
-
it("analyze_repo requires repoUrl parameter", () => {
|
|
271
|
-
const tool = allTools.find((t) => t.name === "analyze_repo");
|
|
272
|
-
expect(tool.inputSchema.required).toContain("repoUrl");
|
|
273
|
-
});
|
|
274
|
-
});
|
|
275
|
-
describe("Static: documentation tools", () => {
|
|
276
|
-
it("should include update_agents_md, research_job_market, and setup_local_env tools", () => {
|
|
277
|
-
const names = allTools.map((t) => t.name);
|
|
278
|
-
expect(names).toContain("update_agents_md");
|
|
279
|
-
expect(names).toContain("research_job_market");
|
|
280
|
-
expect(names).toContain("setup_local_env");
|
|
281
|
-
});
|
|
282
|
-
it("update_agents_md requires operation parameter", () => {
|
|
283
|
-
const tool = allTools.find((t) => t.name === "update_agents_md");
|
|
284
|
-
expect(tool.inputSchema.required).toContain("operation");
|
|
285
|
-
});
|
|
286
|
-
it("research_job_market requires role parameter", () => {
|
|
287
|
-
const tool = allTools.find((t) => t.name === "research_job_market");
|
|
288
|
-
expect(tool.inputSchema.required).toContain("role");
|
|
289
|
-
});
|
|
290
|
-
});
|
|
291
|
-
describe("Static: new methodology topics", () => {
|
|
292
|
-
it("should return project_ideation methodology with 6 steps", async () => {
|
|
293
|
-
const tool = allTools.find((t) => t.name === "getMethodology");
|
|
294
|
-
const result = (await tool.handler({ topic: "project_ideation" }));
|
|
295
|
-
expect(result.title).toContain("Project Ideation");
|
|
296
|
-
expect(result.steps.length).toBe(6);
|
|
297
|
-
expect(result.steps[0].name).toBe("Define Concept");
|
|
298
|
-
});
|
|
299
|
-
it("should return tech_stack_2026 methodology with 5 steps", async () => {
|
|
300
|
-
const tool = allTools.find((t) => t.name === "getMethodology");
|
|
301
|
-
const result = (await tool.handler({ topic: "tech_stack_2026" }));
|
|
302
|
-
expect(result.title).toContain("Tech Stack");
|
|
303
|
-
expect(result.steps.length).toBe(5);
|
|
304
|
-
});
|
|
305
|
-
it("should return telemetry_setup methodology with 5 steps", async () => {
|
|
306
|
-
const tool = allTools.find((t) => t.name === "getMethodology");
|
|
307
|
-
const result = (await tool.handler({ topic: "telemetry_setup" }));
|
|
308
|
-
expect(result.title).toContain("Telemetry");
|
|
309
|
-
expect(result.steps.length).toBe(5);
|
|
310
|
-
});
|
|
311
|
-
it("should return agents_md_maintenance methodology with 5 steps", async () => {
|
|
312
|
-
const tool = allTools.find((t) => t.name === "getMethodology");
|
|
313
|
-
const result = (await tool.handler({ topic: "agents_md_maintenance" }));
|
|
314
|
-
expect(result.title).toContain("AGENTS.md");
|
|
315
|
-
expect(result.steps.length).toBe(5);
|
|
316
|
-
});
|
|
317
|
-
it("overview should include all 16 methodology topics", async () => {
|
|
318
|
-
const tool = allTools.find((t) => t.name === "getMethodology");
|
|
319
|
-
const result = (await tool.handler({ topic: "overview" }));
|
|
320
|
-
const topics = Object.keys(result.steps[0].topics);
|
|
321
|
-
expect(topics).toContain("project_ideation");
|
|
322
|
-
expect(topics).toContain("tech_stack_2026");
|
|
323
|
-
expect(topics).toContain("telemetry_setup");
|
|
324
|
-
expect(topics).toContain("agents_md_maintenance");
|
|
325
|
-
expect(topics).toContain("agent_bootstrap");
|
|
326
|
-
expect(topics).toContain("autonomous_maintenance");
|
|
327
|
-
expect(topics).toContain("parallel_agent_teams");
|
|
328
|
-
expect(topics.length).toBe(26); // All topics listed in overview (includes scenario_testing added v2.26.0)
|
|
329
|
-
});
|
|
330
|
-
});
|
|
331
|
-
describe("Unit: setup_local_env", () => {
|
|
332
|
-
it("should return environment status without errors", async () => {
|
|
333
|
-
const tool = allTools.find((t) => t.name === "setup_local_env");
|
|
334
|
-
// Skip SDK checks to avoid timeout from dynamic imports
|
|
335
|
-
const result = (await tool.handler({ checkSdks: false }));
|
|
336
|
-
expect(result).toHaveProperty("environment");
|
|
337
|
-
expect(result).toHaveProperty("apiKeys");
|
|
338
|
-
expect(result).toHaveProperty("capabilities");
|
|
339
|
-
expect(result).toHaveProperty("recommendation");
|
|
340
|
-
expect(result.environment).toHaveProperty("nodeVersion");
|
|
341
|
-
expect(result.environment).toHaveProperty("packageManager");
|
|
342
|
-
});
|
|
343
|
-
});
|
|
344
|
-
describe("Unit: research_job_market", () => {
|
|
345
|
-
it("should return job market data for known roles", async () => {
|
|
346
|
-
const tool = allTools.find((t) => t.name === "research_job_market");
|
|
347
|
-
const result = (await tool.handler({ role: "AI Engineer" }));
|
|
348
|
-
expect(result.role).toBe("AI Engineer");
|
|
349
|
-
expect(result).toHaveProperty("commonRequirements");
|
|
350
|
-
expect(result).toHaveProperty("emergingSkills");
|
|
351
|
-
expect(result).toHaveProperty("salaryRange");
|
|
352
|
-
expect(result).toHaveProperty("recommendation");
|
|
353
|
-
expect(result.commonRequirements.length).toBeGreaterThan(0);
|
|
354
|
-
});
|
|
355
|
-
});
|
|
356
|
-
describe("Static: autonomous maintenance tools", () => {
|
|
357
|
-
it("should include all autonomous tools", () => {
|
|
358
|
-
const names = allTools.map((t) => t.name);
|
|
359
|
-
expect(names).toContain("assess_risk");
|
|
360
|
-
expect(names).toContain("decide_re_update");
|
|
361
|
-
expect(names).toContain("run_self_maintenance");
|
|
362
|
-
expect(names).toContain("scaffold_directory");
|
|
363
|
-
expect(names).toContain("run_autonomous_loop");
|
|
364
|
-
});
|
|
365
|
-
it("assess_risk requires action parameter", () => {
|
|
366
|
-
const tool = allTools.find((t) => t.name === "assess_risk");
|
|
367
|
-
expect(tool.inputSchema.required).toContain("action");
|
|
368
|
-
});
|
|
369
|
-
it("decide_re_update requires targetContent and contentType", () => {
|
|
370
|
-
const tool = allTools.find((t) => t.name === "decide_re_update");
|
|
371
|
-
expect(tool.inputSchema.required).toContain("targetContent");
|
|
372
|
-
expect(tool.inputSchema.required).toContain("contentType");
|
|
373
|
-
});
|
|
374
|
-
it("scaffold_directory requires component", () => {
|
|
375
|
-
const tool = allTools.find((t) => t.name === "scaffold_directory");
|
|
376
|
-
expect(tool.inputSchema.required).toContain("component");
|
|
377
|
-
});
|
|
378
|
-
it("run_autonomous_loop requires goal", () => {
|
|
379
|
-
const tool = allTools.find((t) => t.name === "run_autonomous_loop");
|
|
380
|
-
expect(tool.inputSchema.required).toContain("goal");
|
|
381
|
-
});
|
|
382
|
-
});
|
|
383
|
-
describe("Unit: assess_risk", () => {
|
|
384
|
-
it("should classify known high-risk actions", async () => {
|
|
385
|
-
const tool = allTools.find((t) => t.name === "assess_risk");
|
|
386
|
-
const result = (await tool.handler({ action: "push_to_remote" }));
|
|
387
|
-
expect(result.assessment.tier).toBe("high");
|
|
388
|
-
expect(result.assessment.recommendation).toBe("require_confirmation");
|
|
389
|
-
});
|
|
390
|
-
it("should classify known low-risk actions", async () => {
|
|
391
|
-
const tool = allTools.find((t) => t.name === "assess_risk");
|
|
392
|
-
const result = (await tool.handler({ action: "read_file" }));
|
|
393
|
-
expect(result.assessment.tier).toBe("low");
|
|
394
|
-
expect(result.assessment.recommendation).toBe("auto_approve");
|
|
395
|
-
});
|
|
396
|
-
it("should use heuristics for unknown actions", async () => {
|
|
397
|
-
const tool = allTools.find((t) => t.name === "assess_risk");
|
|
398
|
-
const result = (await tool.handler({ action: "delete everything" }));
|
|
399
|
-
expect(result.assessment.tier).toBe("high");
|
|
400
|
-
expect(result.reasoning).toContain("Heuristic");
|
|
401
|
-
});
|
|
402
|
-
});
|
|
403
|
-
describe("Unit: decide_re_update", () => {
|
|
404
|
-
it("should recommend update_existing for instruction files", async () => {
|
|
405
|
-
const tool = allTools.find((t) => t.name === "decide_re_update");
|
|
406
|
-
const result = (await tool.handler({
|
|
407
|
-
targetContent: "New agent instructions",
|
|
408
|
-
contentType: "instructions",
|
|
409
|
-
existingFiles: ["AGENTS.md", "README.md"],
|
|
410
|
-
}));
|
|
411
|
-
expect(result.action).toBe("update_existing");
|
|
412
|
-
expect(result.existingFile).toBe("AGENTS.md");
|
|
413
|
-
});
|
|
414
|
-
it("should recommend create_new when no matching files exist", async () => {
|
|
415
|
-
const tool = allTools.find((t) => t.name === "decide_re_update");
|
|
416
|
-
const result = (await tool.handler({
|
|
417
|
-
targetContent: "Some random config",
|
|
418
|
-
contentType: "config",
|
|
419
|
-
existingFiles: [],
|
|
420
|
-
}));
|
|
421
|
-
expect(result.action).toBe("create_new");
|
|
422
|
-
});
|
|
423
|
-
});
|
|
424
|
-
describe("Unit: run_self_maintenance", () => {
|
|
425
|
-
it("should return maintenance report with quick scope", async () => {
|
|
426
|
-
const tool = allTools.find((t) => t.name === "run_self_maintenance");
|
|
427
|
-
const result = (await tool.handler({ scope: "quick" }));
|
|
428
|
-
expect(result).toHaveProperty("checksPerformed");
|
|
429
|
-
expect(result).toHaveProperty("issuesFound");
|
|
430
|
-
expect(result).toHaveProperty("actionsExecuted");
|
|
431
|
-
expect(result).toHaveProperty("updatesRecommended");
|
|
432
|
-
expect(result).toHaveProperty("nextScheduledCheck");
|
|
433
|
-
expect(result.checksPerformed.length).toBeGreaterThan(0);
|
|
434
|
-
});
|
|
435
|
-
});
|
|
436
|
-
describe("Unit: scaffold_directory", () => {
|
|
437
|
-
it("should return scaffold structure for agent_loop", async () => {
|
|
438
|
-
const tool = allTools.find((t) => t.name === "scaffold_directory");
|
|
439
|
-
const result = (await tool.handler({ component: "agent_loop" }));
|
|
440
|
-
expect(result.component).toBe("agent_loop");
|
|
441
|
-
expect(result.structure.files.length).toBeGreaterThan(0);
|
|
442
|
-
expect(result.createCommands.length).toBeGreaterThan(0);
|
|
443
|
-
expect(result.nextSteps.length).toBeGreaterThan(0);
|
|
444
|
-
});
|
|
445
|
-
it("should throw for unknown component", async () => {
|
|
446
|
-
const tool = allTools.find((t) => t.name === "scaffold_directory");
|
|
447
|
-
await expect(tool.handler({ component: "unknown_component" })).rejects.toThrow("Unknown component");
|
|
448
|
-
});
|
|
449
|
-
});
|
|
450
|
-
describe("Unit: run_autonomous_loop", () => {
|
|
451
|
-
it("should complete loop with goal", async () => {
|
|
452
|
-
const tool = allTools.find((t) => t.name === "run_autonomous_loop");
|
|
453
|
-
const result = (await tool.handler({
|
|
454
|
-
goal: "Test autonomous verification",
|
|
455
|
-
maxIterations: 3,
|
|
456
|
-
maxDurationMs: 5000,
|
|
457
|
-
}));
|
|
458
|
-
expect(result.goal).toBe("Test autonomous verification");
|
|
459
|
-
expect(result.iterations).toBeGreaterThan(0);
|
|
460
|
-
expect(result.iterations).toBeLessThanOrEqual(3);
|
|
461
|
-
expect(["completed", "stopped", "timeout", "failed"]).toContain(result.status);
|
|
462
|
-
expect(result.results.length).toBeGreaterThan(0);
|
|
463
|
-
});
|
|
464
|
-
});
|
|
465
|
-
describe("Static: autonomous_maintenance methodology", () => {
|
|
466
|
-
it("should return autonomous_maintenance methodology with 5 steps", async () => {
|
|
467
|
-
const tool = allTools.find((t) => t.name === "getMethodology");
|
|
468
|
-
const result = (await tool.handler({ topic: "autonomous_maintenance" }));
|
|
469
|
-
expect(result.title).toContain("Autonomous Self-Maintenance");
|
|
470
|
-
expect(result.steps.length).toBe(5);
|
|
471
|
-
expect(result.steps[0].name).toBe("Assess Risk Before Action");
|
|
472
|
-
expect(result.steps[1].name).toBe("Re-Update Before Create");
|
|
473
|
-
expect(result).toHaveProperty("riskTiers");
|
|
474
|
-
expect(result).toHaveProperty("patterns");
|
|
475
|
-
});
|
|
476
|
-
});
|
|
477
|
-
describe("Static: self-eval tools", () => {
|
|
478
|
-
it("should include all 6 self-eval tools", () => {
|
|
479
|
-
const names = allTools.map((t) => t.name);
|
|
480
|
-
expect(names).toContain("log_tool_call");
|
|
481
|
-
expect(names).toContain("get_trajectory_analysis");
|
|
482
|
-
expect(names).toContain("get_self_eval_report");
|
|
483
|
-
expect(names).toContain("get_improvement_recommendations");
|
|
484
|
-
expect(names).toContain("cleanup_stale_runs");
|
|
485
|
-
expect(names).toContain("synthesize_recon_to_learnings");
|
|
486
|
-
});
|
|
487
|
-
it("log_tool_call requires sessionId and toolName", () => {
|
|
488
|
-
const tool = allTools.find((t) => t.name === "log_tool_call");
|
|
489
|
-
expect(tool.inputSchema.required).toContain("sessionId");
|
|
490
|
-
expect(tool.inputSchema.required).toContain("toolName");
|
|
491
|
-
});
|
|
492
|
-
it("get_improvement_recommendations has focus enum", () => {
|
|
493
|
-
const tool = allTools.find((t) => t.name === "get_improvement_recommendations");
|
|
494
|
-
const focusProp = tool.inputSchema.properties.focus;
|
|
495
|
-
expect(focusProp.enum).toContain("tools");
|
|
496
|
-
expect(focusProp.enum).toContain("process");
|
|
497
|
-
expect(focusProp.enum).toContain("quality");
|
|
498
|
-
expect(focusProp.enum).toContain("knowledge");
|
|
499
|
-
expect(focusProp.enum).toContain("all");
|
|
500
|
-
});
|
|
501
|
-
});
|
|
502
|
-
describe("Static: self_reinforced_learning methodology", () => {
|
|
503
|
-
it("should return self_reinforced_learning methodology with 6 steps", async () => {
|
|
504
|
-
const tool = allTools.find((t) => t.name === "getMethodology");
|
|
505
|
-
const result = (await tool.handler({ topic: "self_reinforced_learning" }));
|
|
506
|
-
expect(result.title).toContain("Self-Reinforced Learning");
|
|
507
|
-
expect(result.steps.length).toBe(6);
|
|
508
|
-
expect(result.steps[0].name).toBe("Instrument");
|
|
509
|
-
expect(result.steps[4].name).toBe("Clean & Synthesize");
|
|
510
|
-
expect(result.steps[5].name).toBe("Apply & Re-Analyze");
|
|
511
|
-
});
|
|
512
|
-
});
|
|
513
|
-
// ═══════════════════════════════════════════════════════════════════════════
|
|
514
|
-
// UNIT LAYER — individual tool behavior
|
|
515
|
-
// ═══════════════════════════════════════════════════════════════════════════
|
|
516
|
-
const findTool = (name) => allTools.find((t) => t.name === name);
|
|
517
|
-
describe("Unit: local file tools", () => {
|
|
518
|
-
const findRepoFile = (relPath) => {
|
|
519
|
-
let dir = process.cwd();
|
|
520
|
-
for (let i = 0; i < 10; i++) {
|
|
521
|
-
const candidate = path.join(dir, relPath);
|
|
522
|
-
if (existsSync(candidate))
|
|
523
|
-
return candidate;
|
|
524
|
-
const parent = path.dirname(dir);
|
|
525
|
-
if (parent === dir)
|
|
526
|
-
break;
|
|
527
|
-
dir = parent;
|
|
528
|
-
}
|
|
529
|
-
throw new Error(`Fixture not found: ${relPath}`);
|
|
530
|
-
};
|
|
531
|
-
it("tool registry should include quickRefs for all local_file tools", () => {
|
|
532
|
-
const missing = localFileTools
|
|
533
|
-
.map((t) => t.name)
|
|
534
|
-
.filter((name) => !getQuickRef(name));
|
|
535
|
-
expect(missing).toEqual([]);
|
|
536
|
-
});
|
|
537
|
-
it("read_csv_file should parse a bounded table", async () => {
|
|
538
|
-
const tmpDir = await mkdtemp(path.join(os.tmpdir(), "nodebench-mcp-"));
|
|
539
|
-
const csvPath = path.join(tmpDir, "sample.csv");
|
|
540
|
-
await writeFile(csvPath, "name,age\nAlice,30\nBob,25\n", "utf8");
|
|
541
|
-
const tool = findTool("read_csv_file");
|
|
542
|
-
const result = (await tool.handler({
|
|
543
|
-
path: csvPath,
|
|
544
|
-
hasHeader: true,
|
|
545
|
-
maxRows: 10,
|
|
546
|
-
maxCols: 10,
|
|
547
|
-
}));
|
|
548
|
-
expect(result.headers).toEqual(["name", "age"]);
|
|
549
|
-
expect(result.rows.length).toBe(2);
|
|
550
|
-
expect(result.rows[0][0]).toBe("Alice");
|
|
551
|
-
expect(result.rows[0][1]).toBe("30");
|
|
552
|
-
});
|
|
553
|
-
it("read_xlsx_file should parse a bounded sheet preview", async () => {
|
|
554
|
-
const tmpDir = await mkdtemp(path.join(os.tmpdir(), "nodebench-mcp-"));
|
|
555
|
-
const xlsxPath = path.join(tmpDir, "sample.xlsx");
|
|
556
|
-
const mod = await import("xlsx");
|
|
557
|
-
const XLSX = mod.default ?? mod;
|
|
558
|
-
const wb = XLSX.utils.book_new();
|
|
559
|
-
const sheet = XLSX.utils.aoa_to_sheet([
|
|
560
|
-
["Title", "Year"],
|
|
561
|
-
["Movie A", 2009],
|
|
562
|
-
["Movie B", 2011],
|
|
563
|
-
]);
|
|
564
|
-
XLSX.utils.book_append_sheet(wb, sheet, "Sheet1");
|
|
565
|
-
XLSX.writeFile(wb, xlsxPath);
|
|
566
|
-
const tool = findTool("read_xlsx_file");
|
|
567
|
-
const result = (await tool.handler({
|
|
568
|
-
path: xlsxPath,
|
|
569
|
-
headerRow: 1,
|
|
570
|
-
maxRows: 10,
|
|
571
|
-
maxCols: 10,
|
|
572
|
-
}));
|
|
573
|
-
expect(result.sheets).toContain("Sheet1");
|
|
574
|
-
expect(result.sheetName).toBe("Sheet1");
|
|
575
|
-
expect(result.headers).toEqual(["Title", "Year"]);
|
|
576
|
-
expect(result.rows.length).toBe(2);
|
|
577
|
-
expect(result.rows[0][0]).toBe("Movie A");
|
|
578
|
-
expect(result.rows[0][1]).toBe(2009);
|
|
579
|
-
});
|
|
580
|
-
it("csv_select_rows should filter rows and select columns", async () => {
|
|
581
|
-
const tmpDir = await mkdtemp(path.join(os.tmpdir(), "nodebench-mcp-"));
|
|
582
|
-
const csvPath = path.join(tmpDir, "sample.csv");
|
|
583
|
-
await writeFile(csvPath, "name,age\nAlice,30\nBob,25\nCara,40\n", "utf8");
|
|
584
|
-
const tool = findTool("csv_select_rows");
|
|
585
|
-
const result = (await tool.handler({
|
|
586
|
-
path: csvPath,
|
|
587
|
-
hasHeader: true,
|
|
588
|
-
where: [{ column: "age", op: "gt", value: 25 }],
|
|
589
|
-
returnColumns: ["name"],
|
|
590
|
-
limit: 10,
|
|
591
|
-
}));
|
|
592
|
-
expect(result.headers).toEqual(["name"]);
|
|
593
|
-
expect(result.rows.length).toBe(2);
|
|
594
|
-
expect(result.rows[0].row[0]).toBe("Alice");
|
|
595
|
-
expect(result.rows[1].row[0]).toBe("Cara");
|
|
596
|
-
});
|
|
597
|
-
it("csv_select_rows should support is_even on address-like strings", async () => {
|
|
598
|
-
const tmpDir = await mkdtemp(path.join(os.tmpdir(), "nodebench-mcp-"));
|
|
599
|
-
const csvPath = path.join(tmpDir, "sample.csv");
|
|
600
|
-
await writeFile(csvPath, "name,address\nAlice,101 Main St\nBob,102 Main St\nCara,103 Main St\n", "utf8");
|
|
601
|
-
const tool = findTool("csv_select_rows");
|
|
602
|
-
const result = (await tool.handler({
|
|
603
|
-
path: csvPath,
|
|
604
|
-
hasHeader: true,
|
|
605
|
-
where: [{ column: "address", op: "is_even" }],
|
|
606
|
-
returnColumns: ["name"],
|
|
607
|
-
limit: 10,
|
|
608
|
-
}));
|
|
609
|
-
expect(result.headers).toEqual(["name"]);
|
|
610
|
-
expect(result.rows.length).toBe(1);
|
|
611
|
-
expect(result.rows[0].row[0]).toBe("Bob");
|
|
612
|
-
});
|
|
613
|
-
it("csv_aggregate should compute min and return bestRow", async () => {
|
|
614
|
-
const tmpDir = await mkdtemp(path.join(os.tmpdir(), "nodebench-mcp-"));
|
|
615
|
-
const csvPath = path.join(tmpDir, "sample.csv");
|
|
616
|
-
await writeFile(csvPath, "name,age\nAlice,30\nBob,25\nCara,40\n", "utf8");
|
|
617
|
-
const tool = findTool("csv_aggregate");
|
|
618
|
-
const result = (await tool.handler({
|
|
619
|
-
path: csvPath,
|
|
620
|
-
hasHeader: true,
|
|
621
|
-
operation: "min",
|
|
622
|
-
value: { type: "column", column: "age" },
|
|
623
|
-
returnColumns: ["name", "age"],
|
|
624
|
-
}));
|
|
625
|
-
expect(result.result).toBe(25);
|
|
626
|
-
expect(result.bestRow.headers).toEqual(["name", "age"]);
|
|
627
|
-
expect(result.bestRow.row[0]).toBe("Bob");
|
|
628
|
-
});
|
|
629
|
-
it("xlsx_select_rows should filter rows and select columns", async () => {
|
|
630
|
-
const tmpDir = await mkdtemp(path.join(os.tmpdir(), "nodebench-mcp-"));
|
|
631
|
-
const xlsxPath = path.join(tmpDir, "sample.xlsx");
|
|
632
|
-
const mod = await import("xlsx");
|
|
633
|
-
const XLSX = mod.default ?? mod;
|
|
634
|
-
const wb = XLSX.utils.book_new();
|
|
635
|
-
const sheet = XLSX.utils.aoa_to_sheet([
|
|
636
|
-
["Title", "Year"],
|
|
637
|
-
["Movie A", 2009],
|
|
638
|
-
["Movie B", 2011],
|
|
639
|
-
]);
|
|
640
|
-
XLSX.utils.book_append_sheet(wb, sheet, "Sheet1");
|
|
641
|
-
XLSX.writeFile(wb, xlsxPath);
|
|
642
|
-
const tool = findTool("xlsx_select_rows");
|
|
643
|
-
const result = (await tool.handler({
|
|
644
|
-
path: xlsxPath,
|
|
645
|
-
sheetName: "Sheet1",
|
|
646
|
-
headerRow: 1,
|
|
647
|
-
where: [{ column: "Year", op: "eq", value: 2009 }],
|
|
648
|
-
returnColumns: ["Title"],
|
|
649
|
-
limit: 10,
|
|
650
|
-
}));
|
|
651
|
-
expect(result.headers).toEqual(["Title"]);
|
|
652
|
-
expect(result.rows.length).toBe(1);
|
|
653
|
-
expect(result.rows[0].row[0]).toBe("Movie A");
|
|
654
|
-
});
|
|
655
|
-
it("xlsx_select_rows should support is_odd on numeric columns", async () => {
|
|
656
|
-
const tmpDir = await mkdtemp(path.join(os.tmpdir(), "nodebench-mcp-"));
|
|
657
|
-
const xlsxPath = path.join(tmpDir, "sample.xlsx");
|
|
658
|
-
const mod = await import("xlsx");
|
|
659
|
-
const XLSX = mod.default ?? mod;
|
|
660
|
-
const wb = XLSX.utils.book_new();
|
|
661
|
-
const sheet = XLSX.utils.aoa_to_sheet([
|
|
662
|
-
["Title", "Year"],
|
|
663
|
-
["Movie A", 2009],
|
|
664
|
-
["Movie B", 2010],
|
|
665
|
-
["Movie C", 2011],
|
|
666
|
-
]);
|
|
667
|
-
XLSX.utils.book_append_sheet(wb, sheet, "Sheet1");
|
|
668
|
-
XLSX.writeFile(wb, xlsxPath);
|
|
669
|
-
const tool = findTool("xlsx_select_rows");
|
|
670
|
-
const result = (await tool.handler({
|
|
671
|
-
path: xlsxPath,
|
|
672
|
-
sheetName: "Sheet1",
|
|
673
|
-
headerRow: 1,
|
|
674
|
-
where: [{ column: "Year", op: "is_odd" }],
|
|
675
|
-
returnColumns: ["Title"],
|
|
676
|
-
limit: 10,
|
|
677
|
-
}));
|
|
678
|
-
expect(result.headers).toEqual(["Title"]);
|
|
679
|
-
expect(result.rows.length).toBe(2);
|
|
680
|
-
expect(result.rows[0].row[0]).toBe("Movie A");
|
|
681
|
-
expect(result.rows[1].row[0]).toBe("Movie C");
|
|
682
|
-
});
|
|
683
|
-
it("xlsx_aggregate should compute min and return bestRow", async () => {
|
|
684
|
-
const tmpDir = await mkdtemp(path.join(os.tmpdir(), "nodebench-mcp-"));
|
|
685
|
-
const xlsxPath = path.join(tmpDir, "sample.xlsx");
|
|
686
|
-
const mod = await import("xlsx");
|
|
687
|
-
const XLSX = mod.default ?? mod;
|
|
688
|
-
const wb = XLSX.utils.book_new();
|
|
689
|
-
const sheet = XLSX.utils.aoa_to_sheet([
|
|
690
|
-
["Title", "Year"],
|
|
691
|
-
["Movie A", 2009],
|
|
692
|
-
["Movie B", 2011],
|
|
693
|
-
]);
|
|
694
|
-
XLSX.utils.book_append_sheet(wb, sheet, "Sheet1");
|
|
695
|
-
XLSX.writeFile(wb, xlsxPath);
|
|
696
|
-
const tool = findTool("xlsx_aggregate");
|
|
697
|
-
const result = (await tool.handler({
|
|
698
|
-
path: xlsxPath,
|
|
699
|
-
sheetName: "Sheet1",
|
|
700
|
-
headerRow: 1,
|
|
701
|
-
operation: "min",
|
|
702
|
-
value: { type: "column", column: "Year" },
|
|
703
|
-
returnColumns: ["Title", "Year"],
|
|
704
|
-
}));
|
|
705
|
-
expect(result.result).toBe(2009);
|
|
706
|
-
expect(result.bestRow.headers).toEqual(["Title", "Year"]);
|
|
707
|
-
expect(result.bestRow.row[0]).toBe("Movie A");
|
|
708
|
-
});
|
|
709
|
-
it("read_pdf_text should extract page text", async () => {
|
|
710
|
-
const pdfPath = findRepoFile(path.join("test_assets", "Report_2025-12-25.pdf"));
|
|
711
|
-
const tool = findTool("read_pdf_text");
|
|
712
|
-
const result = (await tool.handler({
|
|
713
|
-
path: pdfPath,
|
|
714
|
-
pageStart: 1,
|
|
715
|
-
pageEnd: 1,
|
|
716
|
-
maxChars: 2000,
|
|
717
|
-
}));
|
|
718
|
-
expect(result.pagesIncluded).toEqual([1]);
|
|
719
|
-
expect(String(result.text)).toContain("Hello World");
|
|
720
|
-
}, 20_000);
|
|
721
|
-
it("pdf_search_text should find matches with snippets", async () => {
|
|
722
|
-
const pdfPath = findRepoFile(path.join("test_assets", "Report_2025-12-25.pdf"));
|
|
723
|
-
const tool = findTool("pdf_search_text");
|
|
724
|
-
const result = (await tool.handler({
|
|
725
|
-
path: pdfPath,
|
|
726
|
-
query: "Hello",
|
|
727
|
-
maxMatches: 5,
|
|
728
|
-
}));
|
|
729
|
-
expect(result.matchCount).toBeGreaterThan(0);
|
|
730
|
-
expect(result.matches[0].page).toBe(1);
|
|
731
|
-
expect(String(result.matches[0].snippet)).toContain("Hello");
|
|
732
|
-
});
|
|
733
|
-
it("read_text_file should return bounded text slices", async () => {
|
|
734
|
-
const tmpDir = await mkdtemp(path.join(os.tmpdir(), "nodebench-mcp-"));
|
|
735
|
-
const filePath = path.join(tmpDir, "notes.txt");
|
|
736
|
-
await writeFile(filePath, "Line1\nLine2\nLine3\n", "utf8");
|
|
737
|
-
const tool = findTool("read_text_file");
|
|
738
|
-
const result = (await tool.handler({
|
|
739
|
-
path: filePath,
|
|
740
|
-
startChar: 0,
|
|
741
|
-
maxChars: 10,
|
|
742
|
-
}));
|
|
743
|
-
expect(result.truncated).toBe(true);
|
|
744
|
-
expect(String(result.text)).toContain("Line1");
|
|
745
|
-
});
|
|
746
|
-
it("read_json_file and json_select should parse and select values", async () => {
|
|
747
|
-
const tmpDir = await mkdtemp(path.join(os.tmpdir(), "nodebench-mcp-"));
|
|
748
|
-
const filePath = path.join(tmpDir, "data.json");
|
|
749
|
-
await writeFile(filePath, JSON.stringify({ a: { b: [{ name: "alpha" }, { name: "beta" }] } }), "utf8");
|
|
750
|
-
const readTool = findTool("read_json_file");
|
|
751
|
-
const readResult = (await readTool.handler({
|
|
752
|
-
path: filePath,
|
|
753
|
-
maxDepth: 6,
|
|
754
|
-
maxItems: 50,
|
|
755
|
-
maxStringChars: 1000,
|
|
756
|
-
}));
|
|
757
|
-
expect(readResult.rootType).toBe("object");
|
|
758
|
-
expect(readResult.value.a.b.length).toBe(2);
|
|
759
|
-
const selectTool = findTool("json_select");
|
|
760
|
-
const selectResult = (await selectTool.handler({
|
|
761
|
-
path: filePath,
|
|
762
|
-
pointer: "/a/b/1/name",
|
|
763
|
-
maxDepth: 3,
|
|
764
|
-
maxItems: 10,
|
|
765
|
-
maxStringChars: 100,
|
|
766
|
-
}));
|
|
767
|
-
expect(selectResult.found).toBe(true);
|
|
768
|
-
expect(selectResult.value).toBe("beta");
|
|
769
|
-
});
|
|
770
|
-
it("read_jsonl_file should parse lines and report errors", async () => {
|
|
771
|
-
const tmpDir = await mkdtemp(path.join(os.tmpdir(), "nodebench-mcp-"));
|
|
772
|
-
const filePath = path.join(tmpDir, "data.jsonl");
|
|
773
|
-
await writeFile(filePath, '{"ok":1}\nnot-json\n{"ok":2}\n', "utf8");
|
|
774
|
-
const tool = findTool("read_jsonl_file");
|
|
775
|
-
const result = (await tool.handler({
|
|
776
|
-
path: filePath,
|
|
777
|
-
limitLines: 10,
|
|
778
|
-
parseJson: true,
|
|
779
|
-
maxDepth: 4,
|
|
780
|
-
maxItems: 20,
|
|
781
|
-
maxStringChars: 100,
|
|
782
|
-
}));
|
|
783
|
-
expect(result.returnedLines).toBe(2);
|
|
784
|
-
expect(result.errorCount).toBe(1);
|
|
785
|
-
expect(result.lines[0].value.ok).toBe(1);
|
|
786
|
-
expect(result.lines[1].value.ok).toBe(2);
|
|
787
|
-
});
|
|
788
|
-
it("zip_list_files and zip_read_text_file should read entries", async () => {
|
|
789
|
-
const zipPath = findRepoFile(path.join("test_assets", "zip_fixture.zip"));
|
|
790
|
-
const listTool = findTool("zip_list_files");
|
|
791
|
-
const listResult = (await listTool.handler({ path: zipPath, maxEntries: 50 }));
|
|
792
|
-
const names = (listResult.entries ?? []).map((e) => e.fileName);
|
|
793
|
-
expect(names).toContain("hello.txt");
|
|
794
|
-
expect(names).toContain("folder/data.csv");
|
|
795
|
-
const readTool = findTool("zip_read_text_file");
|
|
796
|
-
const readResult = (await readTool.handler({
|
|
797
|
-
path: zipPath,
|
|
798
|
-
innerPath: "hello.txt",
|
|
799
|
-
maxChars: 2000,
|
|
800
|
-
}));
|
|
801
|
-
expect(String(readResult.text)).toContain("Hello from zip fixture");
|
|
802
|
-
});
|
|
803
|
-
it("zip_extract_file should safely extract to outputDir", async () => {
|
|
804
|
-
const zipPath = findRepoFile(path.join("test_assets", "zip_fixture.zip"));
|
|
805
|
-
const tmpDir = await mkdtemp(path.join(os.tmpdir(), "nodebench-mcp-zip-"));
|
|
806
|
-
const extractTool = findTool("zip_extract_file");
|
|
807
|
-
const extracted = (await extractTool.handler({
|
|
808
|
-
path: zipPath,
|
|
809
|
-
innerPath: "folder/data.csv",
|
|
810
|
-
outputDir: tmpDir,
|
|
811
|
-
overwrite: true,
|
|
812
|
-
}));
|
|
813
|
-
expect(typeof extracted.extractedPath).toBe("string");
|
|
814
|
-
expect(existsSync(extracted.extractedPath)).toBe(true);
|
|
815
|
-
const readTool = findTool("read_text_file");
|
|
816
|
-
const text = (await readTool.handler({ path: extracted.extractedPath, maxChars: 2000 }));
|
|
817
|
-
expect(String(text.text)).toContain("alpha,1");
|
|
818
|
-
});
|
|
819
|
-
it("read_docx_text should extract document text", async () => {
|
|
820
|
-
const docxPath = findRepoFile(path.join("test_assets", "docx_fixture.docx"));
|
|
821
|
-
const tool = findTool("read_docx_text");
|
|
822
|
-
const result = (await tool.handler({ path: docxPath, maxChars: 5000 }));
|
|
823
|
-
expect(String(result.text)).toContain("Hello DOCX");
|
|
824
|
-
expect(String(result.text)).toContain("Second paragraph");
|
|
825
|
-
});
|
|
826
|
-
it("read_pptx_text should extract slide text with markers", async () => {
|
|
827
|
-
const pptxPath = findRepoFile(path.join("test_assets", "pptx_fixture.pptx"));
|
|
828
|
-
const tool = findTool("read_pptx_text");
|
|
829
|
-
const result = (await tool.handler({ path: pptxPath, maxChars: 10000 }));
|
|
830
|
-
expect(result.slideCount).toBe(2);
|
|
831
|
-
expect(String(result.text)).toContain("[SLIDE 1]");
|
|
832
|
-
expect(String(result.text)).toContain("Hello PPTX Slide1");
|
|
833
|
-
expect(String(result.text)).toContain("[SLIDE 2]");
|
|
834
|
-
expect(String(result.text)).toContain("Slide2 Text");
|
|
835
|
-
});
|
|
836
|
-
});
|
|
837
|
-
describe("Unit: abandon_cycle", () => {
|
|
838
|
-
it("should abandon an active cycle", async () => {
|
|
839
|
-
// Create a cycle first
|
|
840
|
-
const startTool = findTool("start_verification_cycle");
|
|
841
|
-
const cycle = (await startTool.handler({
|
|
842
|
-
title: "test-abandon-cycle",
|
|
843
|
-
description: "test cycle for abandon",
|
|
844
|
-
}));
|
|
845
|
-
expect(cycle.cycleId).toBeTruthy();
|
|
846
|
-
// Abandon it
|
|
847
|
-
const abandonTool = findTool("abandon_cycle");
|
|
848
|
-
const result = (await abandonTool.handler({
|
|
849
|
-
cycleId: cycle.cycleId,
|
|
850
|
-
reason: "test cleanup",
|
|
851
|
-
}));
|
|
852
|
-
expect(result.abandoned).toBe(true);
|
|
853
|
-
expect(result.reason).toBe("test cleanup");
|
|
854
|
-
});
|
|
855
|
-
it("should skip already-abandoned cycles", async () => {
|
|
856
|
-
const startTool = findTool("start_verification_cycle");
|
|
857
|
-
const cycle = (await startTool.handler({
|
|
858
|
-
title: "test-double-abandon",
|
|
859
|
-
description: "test",
|
|
860
|
-
}));
|
|
861
|
-
const abandonTool = findTool("abandon_cycle");
|
|
862
|
-
await abandonTool.handler({ cycleId: cycle.cycleId });
|
|
863
|
-
const result2 = (await abandonTool.handler({
|
|
864
|
-
cycleId: cycle.cycleId,
|
|
865
|
-
}));
|
|
866
|
-
expect(result2.skipped).toBe(true);
|
|
867
|
-
});
|
|
868
|
-
it("should throw on nonexistent cycle", async () => {
|
|
869
|
-
const abandonTool = findTool("abandon_cycle");
|
|
870
|
-
await expect(abandonTool.handler({ cycleId: "nonexistent_cycle_id" })).rejects.toThrow("Cycle not found");
|
|
871
|
-
});
|
|
872
|
-
});
|
|
873
|
-
describe("Unit: search_all_knowledge", () => {
|
|
874
|
-
it("should return results structure with gaps field", async () => {
|
|
875
|
-
const tool = findTool("search_all_knowledge");
|
|
876
|
-
const result = (await tool.handler({
|
|
877
|
-
query: "test",
|
|
878
|
-
}));
|
|
879
|
-
expect(result).toHaveProperty("query");
|
|
880
|
-
expect(result).toHaveProperty("totalResults");
|
|
881
|
-
expect(result).toHaveProperty("learnings");
|
|
882
|
-
expect(result).toHaveProperty("reconFindings");
|
|
883
|
-
expect(result).toHaveProperty("gaps");
|
|
884
|
-
expect(result).toHaveProperty("_contributeBack");
|
|
885
|
-
});
|
|
886
|
-
});
|
|
887
|
-
// ═══════════════════════════════════════════════════════════════════════════
|
|
888
|
-
// INTEGRATION LAYER — multi-tool chain
|
|
889
|
-
// ═══════════════════════════════════════════════════════════════════════════
|
|
890
|
-
describe("Integration: full verification cycle chain", () => {
|
|
891
|
-
it("start → log_phase → log_gap → resolve_gap → abandon", async () => {
|
|
892
|
-
// 1. Start cycle
|
|
893
|
-
const cycle = (await findTool("start_verification_cycle").handler({
|
|
894
|
-
title: "integration-test-cycle",
|
|
895
|
-
description: "full chain test",
|
|
896
|
-
}));
|
|
897
|
-
expect(cycle.cycleId).toBeTruthy();
|
|
898
|
-
// 2. Log phase 1 findings
|
|
899
|
-
const phase1 = (await findTool("log_phase_findings").handler({
|
|
900
|
-
cycleId: cycle.cycleId,
|
|
901
|
-
phaseNumber: 1,
|
|
902
|
-
status: "passed",
|
|
903
|
-
findings: { summary: "context gathered" },
|
|
904
|
-
}));
|
|
905
|
-
expect(phase1.phaseRecorded).toBe(1);
|
|
906
|
-
expect(phase1.phaseStatus).toBe("passed");
|
|
907
|
-
// 3. Log a gap
|
|
908
|
-
const gap = (await findTool("log_gap").handler({
|
|
909
|
-
cycleId: cycle.cycleId,
|
|
910
|
-
severity: "LOW",
|
|
911
|
-
title: "test gap for integration",
|
|
912
|
-
description: "This is a test gap",
|
|
913
|
-
rootCause: "testing",
|
|
914
|
-
fixStrategy: "resolve in test",
|
|
915
|
-
}));
|
|
916
|
-
expect(gap.gapId).toBeTruthy();
|
|
917
|
-
// 4. Resolve the gap
|
|
918
|
-
const resolved = (await findTool("resolve_gap").handler({
|
|
919
|
-
gapId: gap.gapId,
|
|
920
|
-
}));
|
|
921
|
-
expect(resolved.status).toBe("resolved");
|
|
922
|
-
// 5. Abandon the cycle (cleanup)
|
|
923
|
-
const abandoned = (await findTool("abandon_cycle").handler({
|
|
924
|
-
cycleId: cycle.cycleId,
|
|
925
|
-
reason: "integration test cleanup",
|
|
926
|
-
}));
|
|
927
|
-
expect(abandoned.abandoned).toBe(true);
|
|
928
|
-
});
|
|
929
|
-
});
|
|
930
|
-
describe("Unit: log_tool_call", () => {
|
|
931
|
-
it("should log a tool call and return confirmation", async () => {
|
|
932
|
-
const tool = findTool("log_tool_call");
|
|
933
|
-
const result = (await tool.handler({
|
|
934
|
-
sessionId: "test-session-001",
|
|
935
|
-
toolName: "run_recon",
|
|
936
|
-
durationMs: 42,
|
|
937
|
-
resultStatus: "success",
|
|
938
|
-
phase: "recon",
|
|
939
|
-
}));
|
|
940
|
-
expect(result.logged).toBe(true);
|
|
941
|
-
expect(result.sessionId).toBe("test-session-001");
|
|
942
|
-
expect(result.toolName).toBe("run_recon");
|
|
943
|
-
expect(result.resultStatus).toBe("success");
|
|
944
|
-
});
|
|
945
|
-
it("should log error tool calls", async () => {
|
|
946
|
-
const tool = findTool("log_tool_call");
|
|
947
|
-
const result = (await tool.handler({
|
|
948
|
-
sessionId: "test-session-001",
|
|
949
|
-
toolName: "web_search",
|
|
950
|
-
durationMs: 1500,
|
|
951
|
-
resultStatus: "error",
|
|
952
|
-
error: "API key not configured",
|
|
953
|
-
phase: "recon",
|
|
954
|
-
}));
|
|
955
|
-
expect(result.logged).toBe(true);
|
|
956
|
-
expect(result.resultStatus).toBe("error");
|
|
957
|
-
});
|
|
958
|
-
});
|
|
959
|
-
describe("Unit: get_trajectory_analysis", () => {
|
|
960
|
-
it("should return trajectory analysis with logged data", async () => {
|
|
961
|
-
// Log a few calls first
|
|
962
|
-
const logTool = findTool("log_tool_call");
|
|
963
|
-
await logTool.handler({ sessionId: "traj-test", toolName: "findTools", durationMs: 10, phase: "meta" });
|
|
964
|
-
await logTool.handler({ sessionId: "traj-test", toolName: "run_recon", durationMs: 20, phase: "recon" });
|
|
965
|
-
await logTool.handler({ sessionId: "traj-test", toolName: "log_recon_finding", durationMs: 15, phase: "recon" });
|
|
966
|
-
const tool = findTool("get_trajectory_analysis");
|
|
967
|
-
const result = (await tool.handler({ sessionId: "traj-test" }));
|
|
968
|
-
expect(result.totalCalls).toBeGreaterThanOrEqual(3);
|
|
969
|
-
expect(result.uniqueTools).toBeGreaterThanOrEqual(3);
|
|
970
|
-
expect(result.topTools.length).toBeGreaterThan(0);
|
|
971
|
-
});
|
|
972
|
-
it("should return empty message when no data exists for session", async () => {
|
|
973
|
-
const tool = findTool("get_trajectory_analysis");
|
|
974
|
-
const result = (await tool.handler({ sessionId: "nonexistent-session-xyz" }));
|
|
975
|
-
expect(result.totalCalls).toBe(0);
|
|
976
|
-
expect(result.message).toBeTruthy();
|
|
977
|
-
});
|
|
978
|
-
});
|
|
979
|
-
describe("Unit: get_self_eval_report", () => {
|
|
980
|
-
it("should return health report with all sections", async () => {
|
|
981
|
-
const tool = findTool("get_self_eval_report");
|
|
982
|
-
const result = (await tool.handler({ sinceDaysAgo: 30 }));
|
|
983
|
-
expect(result).toHaveProperty("healthScore");
|
|
984
|
-
expect(result).toHaveProperty("healthGrade");
|
|
985
|
-
expect(result).toHaveProperty("verification");
|
|
986
|
-
expect(result).toHaveProperty("gaps");
|
|
987
|
-
expect(result).toHaveProperty("evalRuns");
|
|
988
|
-
expect(result).toHaveProperty("qualityGates");
|
|
989
|
-
expect(result).toHaveProperty("knowledge");
|
|
990
|
-
expect(result).toHaveProperty("toolTrajectory");
|
|
991
|
-
expect(typeof result.healthScore).toBe("number");
|
|
992
|
-
expect(["A", "B", "C", "D", "F"]).toContain(result.healthGrade);
|
|
993
|
-
});
|
|
994
|
-
it("should include details when requested", async () => {
|
|
995
|
-
const tool = findTool("get_self_eval_report");
|
|
996
|
-
const result = (await tool.handler({ sinceDaysAgo: 30, includeDetails: true }));
|
|
997
|
-
expect(result).toHaveProperty("cycleDetails");
|
|
998
|
-
expect(result).toHaveProperty("openGapDetails");
|
|
999
|
-
});
|
|
1000
|
-
});
|
|
1001
|
-
describe("Unit: get_improvement_recommendations", () => {
|
|
1002
|
-
it("should return structured recommendations", async () => {
|
|
1003
|
-
const tool = findTool("get_improvement_recommendations");
|
|
1004
|
-
const result = (await tool.handler({ sinceDaysAgo: 30 }));
|
|
1005
|
-
expect(typeof result.totalRecommendations).toBe("number");
|
|
1006
|
-
expect(typeof result.highPriority).toBe("number");
|
|
1007
|
-
expect(typeof result.mediumPriority).toBe("number");
|
|
1008
|
-
expect(typeof result.lowPriority).toBe("number");
|
|
1009
|
-
expect(Array.isArray(result.recommendations)).toBe(true);
|
|
1010
|
-
expect(result).toHaveProperty("_selfReinforcement");
|
|
1011
|
-
expect(result._selfReinforcement.nextSteps.length).toBe(4);
|
|
1012
|
-
});
|
|
1013
|
-
it("should filter by focus area", async () => {
|
|
1014
|
-
const tool = findTool("get_improvement_recommendations");
|
|
1015
|
-
const result = (await tool.handler({ sinceDaysAgo: 30, focus: "quality" }));
|
|
1016
|
-
expect(result.focus).toBe("quality");
|
|
1017
|
-
for (const rec of result.recommendations) {
|
|
1018
|
-
expect(rec.category).toBe("quality");
|
|
1019
|
-
}
|
|
1020
|
-
});
|
|
1021
|
-
});
|
|
1022
|
-
describe("Unit: cleanup_stale_runs", () => {
|
|
1023
|
-
it("should return dry run preview without modifying data", async () => {
|
|
1024
|
-
const tool = findTool("cleanup_stale_runs");
|
|
1025
|
-
const result = (await tool.handler({ staleDays: 7, dryRun: true }));
|
|
1026
|
-
expect(result.dryRun).toBe(true);
|
|
1027
|
-
expect(result).toHaveProperty("staleEvalRuns");
|
|
1028
|
-
expect(result).toHaveProperty("staleCycles");
|
|
1029
|
-
expect(result).toHaveProperty("staleGaps");
|
|
1030
|
-
expect(result.staleEvalRuns).toHaveProperty("count");
|
|
1031
|
-
expect(result.staleCycles).toHaveProperty("count");
|
|
1032
|
-
expect(result.nextStep).toContain("dryRun=false");
|
|
1033
|
-
});
|
|
1034
|
-
it("should support closeStaleGaps option", async () => {
|
|
1035
|
-
const tool = findTool("cleanup_stale_runs");
|
|
1036
|
-
const result = (await tool.handler({ staleDays: 7, closeStaleGaps: true, dryRun: true }));
|
|
1037
|
-
expect(result.staleGaps).toHaveProperty("count");
|
|
1038
|
-
expect(result.staleGaps.skipped).toBeUndefined();
|
|
1039
|
-
});
|
|
1040
|
-
it("should skip stale gaps by default", async () => {
|
|
1041
|
-
const tool = findTool("cleanup_stale_runs");
|
|
1042
|
-
const result = (await tool.handler({ staleDays: 7, dryRun: true }));
|
|
1043
|
-
expect(result.staleGaps.skipped).toBe(true);
|
|
1044
|
-
});
|
|
1045
|
-
});
|
|
1046
|
-
describe("Unit: synthesize_recon_to_learnings", () => {
|
|
1047
|
-
it("should return dry run preview", async () => {
|
|
1048
|
-
const tool = findTool("synthesize_recon_to_learnings");
|
|
1049
|
-
const result = (await tool.handler({ sinceDaysAgo: 30, dryRun: true }));
|
|
1050
|
-
expect(result.dryRun).toBe(true);
|
|
1051
|
-
expect(result).toHaveProperty("totalFindings");
|
|
1052
|
-
expect(result).toHaveProperty("alreadySynthesized");
|
|
1053
|
-
expect(result).toHaveProperty("newLearnings");
|
|
1054
|
-
expect(result).toHaveProperty("created");
|
|
1055
|
-
expect(result.created).toBe(0); // dry run doesn't create
|
|
1056
|
-
expect(result).toHaveProperty("preview");
|
|
1057
|
-
expect(result.nextStep).toContain("dryRun=false");
|
|
1058
|
-
});
|
|
1059
|
-
it("should support sessionId filter", async () => {
|
|
1060
|
-
const tool = findTool("synthesize_recon_to_learnings");
|
|
1061
|
-
const result = (await tool.handler({ sessionId: "nonexistent-session", dryRun: true }));
|
|
1062
|
-
expect(result.totalFindings).toBe(0);
|
|
1063
|
-
expect(result.newLearnings).toBe(0);
|
|
1064
|
-
});
|
|
1065
|
-
});
|
|
1066
|
-
describe("Unit: get_self_eval_report excludeTestSessions", () => {
|
|
1067
|
-
it("should accept excludeTestSessions parameter", async () => {
|
|
1068
|
-
const tool = findTool("get_self_eval_report");
|
|
1069
|
-
const result = (await tool.handler({ sinceDaysAgo: 30, excludeTestSessions: true }));
|
|
1070
|
-
expect(result).toHaveProperty("healthScore");
|
|
1071
|
-
expect(typeof result.healthScore).toBe("number");
|
|
1072
|
-
});
|
|
1073
|
-
it("should have excludeTestSessions default to true", async () => {
|
|
1074
|
-
const tool = findTool("get_self_eval_report");
|
|
1075
|
-
const schema = tool.inputSchema;
|
|
1076
|
-
expect(schema.properties.excludeTestSessions).toBeDefined();
|
|
1077
|
-
expect(schema.properties.excludeTestSessions.type).toBe("boolean");
|
|
1078
|
-
});
|
|
1079
|
-
});
|
|
1080
|
-
describe("Unit: web_search graceful fallback", () => {
|
|
1081
|
-
it("should return empty results with setup info when no provider", async () => {
|
|
1082
|
-
// Save and clear all API keys
|
|
1083
|
-
const saved = {
|
|
1084
|
-
GEMINI_API_KEY: process.env.GEMINI_API_KEY,
|
|
1085
|
-
GOOGLE_AI_API_KEY: process.env.GOOGLE_AI_API_KEY,
|
|
1086
|
-
OPENAI_API_KEY: process.env.OPENAI_API_KEY,
|
|
1087
|
-
PERPLEXITY_API_KEY: process.env.PERPLEXITY_API_KEY,
|
|
1088
|
-
};
|
|
1089
|
-
delete process.env.GEMINI_API_KEY;
|
|
1090
|
-
delete process.env.GOOGLE_AI_API_KEY;
|
|
1091
|
-
delete process.env.OPENAI_API_KEY;
|
|
1092
|
-
delete process.env.PERPLEXITY_API_KEY;
|
|
1093
|
-
try {
|
|
1094
|
-
const tool = findTool("web_search");
|
|
1095
|
-
const result = (await tool.handler({ query: "test query" }));
|
|
1096
|
-
expect(result.results).toEqual([]);
|
|
1097
|
-
expect(result.provider).toBe("none");
|
|
1098
|
-
expect(result.resultCount).toBe(0);
|
|
1099
|
-
expect(result).toHaveProperty("setup");
|
|
1100
|
-
expect(result.setup.options.length).toBe(3);
|
|
1101
|
-
// Verify no error flag
|
|
1102
|
-
expect(result.error).toBeUndefined();
|
|
1103
|
-
}
|
|
1104
|
-
finally {
|
|
1105
|
-
// Restore API keys
|
|
1106
|
-
for (const [key, val] of Object.entries(saved)) {
|
|
1107
|
-
if (val !== undefined)
|
|
1108
|
-
process.env[key] = val;
|
|
1109
|
-
}
|
|
1110
|
-
}
|
|
1111
|
-
});
|
|
1112
|
-
});
|
|
1113
|
-
// ═══════════════════════════════════════════════════════════════════════════
|
|
1114
|
-
// v2.2.0 — LLM, Security, and Diff tools
|
|
1115
|
-
// ═══════════════════════════════════════════════════════════════════════════
|
|
1116
|
-
describe("Static: llm tools", () => {
|
|
1117
|
-
it("should include call_llm and extract_structured_data", () => {
|
|
1118
|
-
const names = allTools.map((t) => t.name);
|
|
1119
|
-
expect(names).toContain("call_llm");
|
|
1120
|
-
expect(names).toContain("extract_structured_data");
|
|
1121
|
-
});
|
|
1122
|
-
it("call_llm requires prompt parameter", () => {
|
|
1123
|
-
const tool = findTool("call_llm");
|
|
1124
|
-
expect(tool.inputSchema.required).toContain("prompt");
|
|
1125
|
-
});
|
|
1126
|
-
it("extract_structured_data requires text and fields parameters", () => {
|
|
1127
|
-
const tool = findTool("extract_structured_data");
|
|
1128
|
-
expect(tool.inputSchema.required).toContain("text");
|
|
1129
|
-
expect(tool.inputSchema.required).toContain("fields");
|
|
1130
|
-
});
|
|
1131
|
-
});
|
|
1132
|
-
describe("Static: security tools", () => {
|
|
1133
|
-
it("should include scan_dependencies and run_code_analysis", () => {
|
|
1134
|
-
const names = allTools.map((t) => t.name);
|
|
1135
|
-
expect(names).toContain("scan_dependencies");
|
|
1136
|
-
expect(names).toContain("run_code_analysis");
|
|
1137
|
-
});
|
|
1138
|
-
it("run_code_analysis requires content parameter", () => {
|
|
1139
|
-
const tool = findTool("run_code_analysis");
|
|
1140
|
-
expect(tool.inputSchema.required).toContain("content");
|
|
1141
|
-
});
|
|
1142
|
-
it("scan_dependencies has no required parameters", () => {
|
|
1143
|
-
const tool = findTool("scan_dependencies");
|
|
1144
|
-
const required = tool.inputSchema.required;
|
|
1145
|
-
expect(required ?? []).toEqual([]);
|
|
1146
|
-
});
|
|
1147
|
-
});
|
|
1148
|
-
describe("Unit: run_code_analysis", () => {
|
|
1149
|
-
it("should detect hardcoded API key in code", async () => {
|
|
1150
|
-
const tool = findTool("run_code_analysis");
|
|
1151
|
-
const result = (await tool.handler({
|
|
1152
|
-
content: 'const api_key = "FAKE_TEST_KEY_abcdefghijklmnopqrstuvwxyz1234567890";',
|
|
1153
|
-
checks: ["secrets"],
|
|
1154
|
-
}));
|
|
1155
|
-
expect(result.totalFindings).toBeGreaterThanOrEqual(1);
|
|
1156
|
-
expect(result.bySeverity.HIGH).toBeGreaterThanOrEqual(1);
|
|
1157
|
-
expect(result.findings[0].check).toBe("secrets");
|
|
1158
|
-
});
|
|
1159
|
-
it("should detect zero-width characters (homograph check)", async () => {
|
|
1160
|
-
const tool = findTool("run_code_analysis");
|
|
1161
|
-
const result = (await tool.handler({
|
|
1162
|
-
content: "export API_KEY=sk-\u200bsecret123",
|
|
1163
|
-
checks: ["homograph"],
|
|
1164
|
-
}));
|
|
1165
|
-
expect(result.totalFindings).toBeGreaterThanOrEqual(1);
|
|
1166
|
-
expect(result.bySeverity.HIGH).toBeGreaterThanOrEqual(1);
|
|
1167
|
-
});
|
|
1168
|
-
it("should return clean for safe code", async () => {
|
|
1169
|
-
const tool = findTool("run_code_analysis");
|
|
1170
|
-
const result = (await tool.handler({
|
|
1171
|
-
content: 'function add(a: number, b: number): number { return a + b; }',
|
|
1172
|
-
checks: ["secrets", "homograph", "urls"],
|
|
1173
|
-
}));
|
|
1174
|
-
expect(result.totalFindings).toBe(0);
|
|
1175
|
-
});
|
|
1176
|
-
});
|
|
1177
|
-
describe("Unit: scan_dependencies", () => {
|
|
1178
|
-
it("should scan the mcp-local package.json", async () => {
|
|
1179
|
-
const tool = findTool("scan_dependencies");
|
|
1180
|
-
const result = (await tool.handler({
|
|
1181
|
-
projectRoot: path.resolve(__dirname, "../.."),
|
|
1182
|
-
}));
|
|
1183
|
-
expect(result.totalPackages).toBeGreaterThan(0);
|
|
1184
|
-
expect(result.manifests.length).toBeGreaterThan(0);
|
|
1185
|
-
expect(result).toHaveProperty("summary");
|
|
1186
|
-
expect(result).toHaveProperty("dependencies");
|
|
1187
|
-
});
|
|
1188
|
-
it("should return error when no manifest found", async () => {
|
|
1189
|
-
const tool = findTool("scan_dependencies");
|
|
1190
|
-
const result = (await tool.handler({
|
|
1191
|
-
projectRoot: os.tmpdir(),
|
|
1192
|
-
}));
|
|
1193
|
-
expect(result.error).toBe(true);
|
|
1194
|
-
expect(result.message).toContain("No package manifest");
|
|
1195
|
-
});
|
|
1196
|
-
});
|
|
1197
|
-
describe("Static: diff_outputs tool", () => {
|
|
1198
|
-
it("should exist in eval tools", () => {
|
|
1199
|
-
const names = allTools.map((t) => t.name);
|
|
1200
|
-
expect(names).toContain("diff_outputs");
|
|
1201
|
-
});
|
|
1202
|
-
it("requires baseline and candidate parameters", () => {
|
|
1203
|
-
const tool = findTool("diff_outputs");
|
|
1204
|
-
expect(tool.inputSchema.required).toContain("baseline");
|
|
1205
|
-
expect(tool.inputSchema.required).toContain("candidate");
|
|
1206
|
-
});
|
|
1207
|
-
});
|
|
1208
|
-
describe("Unit: diff_outputs", () => {
|
|
1209
|
-
it("should compute text diff with similarity score", async () => {
|
|
1210
|
-
const tool = findTool("diff_outputs");
|
|
1211
|
-
const result = (await tool.handler({
|
|
1212
|
-
baseline: "line one\nline two\nline three",
|
|
1213
|
-
candidate: "line one\nline TWO\nline three\nline four",
|
|
1214
|
-
}));
|
|
1215
|
-
expect(result).toHaveProperty("similarity");
|
|
1216
|
-
expect(result.similarity).toBeGreaterThan(0);
|
|
1217
|
-
expect(result.similarity).toBeLessThan(1);
|
|
1218
|
-
expect(result.added.length).toBeGreaterThan(0);
|
|
1219
|
-
expect(result).toHaveProperty("summary");
|
|
1220
|
-
});
|
|
1221
|
-
it("should return 1.0 similarity for identical text", async () => {
|
|
1222
|
-
const tool = findTool("diff_outputs");
|
|
1223
|
-
const result = (await tool.handler({
|
|
1224
|
-
baseline: "identical content",
|
|
1225
|
-
candidate: "identical content",
|
|
1226
|
-
}));
|
|
1227
|
-
expect(result.similarity).toBe(1);
|
|
1228
|
-
expect(result.added.length).toBe(0);
|
|
1229
|
-
expect(result.removed.length).toBe(0);
|
|
1230
|
-
});
|
|
1231
|
-
it("should diff JSON objects with field-level changes", async () => {
|
|
1232
|
-
const tool = findTool("diff_outputs");
|
|
1233
|
-
const result = (await tool.handler({
|
|
1234
|
-
baseline: '{"name":"Alice","age":30,"city":"NYC"}',
|
|
1235
|
-
candidate: '{"name":"Alice","age":31,"country":"USA"}',
|
|
1236
|
-
format: "json",
|
|
1237
|
-
}));
|
|
1238
|
-
expect(result).toHaveProperty("changed");
|
|
1239
|
-
expect(result.changed.length).toBeGreaterThan(0);
|
|
1240
|
-
expect(result).toHaveProperty("removed");
|
|
1241
|
-
expect(result).toHaveProperty("added");
|
|
1242
|
-
});
|
|
1243
|
-
});
|
|
1244
|
-
// ═══════════════════════════════════════════════════════════════════════════
|
|
1245
|
-
// PLATFORM TOOLS — Convex bridge validation
|
|
1246
|
-
// ═══════════════════════════════════════════════════════════════════════════
|
|
1247
|
-
describe("Static: platform tools", () => {
|
|
1248
|
-
it("should export 4 platform tools", () => {
|
|
1249
|
-
expect(platformTools.length).toBe(4);
|
|
1250
|
-
});
|
|
1251
|
-
const expectedTools = [
|
|
1252
|
-
{ name: "query_daily_brief", requiredParams: [] },
|
|
1253
|
-
{ name: "query_funding_entities", requiredParams: [] },
|
|
1254
|
-
{ name: "query_research_queue", requiredParams: [] },
|
|
1255
|
-
{ name: "publish_to_queue", requiredParams: ["content", "postType"] },
|
|
1256
|
-
];
|
|
1257
|
-
for (const { name, requiredParams } of expectedTools) {
|
|
1258
|
-
it(`${name} has valid schema`, () => {
|
|
1259
|
-
const tool = platformTools.find((t) => t.name === name);
|
|
1260
|
-
expect(tool).toBeDefined();
|
|
1261
|
-
expect(tool.description.length).toBeGreaterThan(10);
|
|
1262
|
-
expect(tool.inputSchema.type).toBe("object");
|
|
1263
|
-
if (requiredParams.length > 0) {
|
|
1264
|
-
expect(tool.inputSchema.required).toEqual(expect.arrayContaining(requiredParams));
|
|
1265
|
-
}
|
|
1266
|
-
});
|
|
1267
|
-
}
|
|
1268
|
-
});
|
|
1269
|
-
describe("Unit: platform tools graceful fallback", () => {
|
|
1270
|
-
it("query_daily_brief returns error when CONVEX_SITE_URL not set", async () => {
|
|
1271
|
-
const tool = findTool("query_daily_brief");
|
|
1272
|
-
const result = (await tool.handler({}));
|
|
1273
|
-
// Without CONVEX_SITE_URL, should return a platform-not-configured error
|
|
1274
|
-
expect(result).toHaveProperty("error");
|
|
1275
|
-
expect(result.message).toContain("Platform not configured");
|
|
1276
|
-
});
|
|
1277
|
-
it("query_funding_entities returns error when CONVEX_SITE_URL not set", async () => {
|
|
1278
|
-
const tool = findTool("query_funding_entities");
|
|
1279
|
-
const result = (await tool.handler({ query: "test" }));
|
|
1280
|
-
expect(result).toHaveProperty("error");
|
|
1281
|
-
expect(result.message).toContain("Platform not configured");
|
|
1282
|
-
});
|
|
1283
|
-
it("query_research_queue returns error when CONVEX_SITE_URL not set", async () => {
|
|
1284
|
-
const tool = findTool("query_research_queue");
|
|
1285
|
-
const result = (await tool.handler({}));
|
|
1286
|
-
expect(result).toHaveProperty("error");
|
|
1287
|
-
expect(result.message).toContain("Platform not configured");
|
|
1288
|
-
});
|
|
1289
|
-
it("publish_to_queue returns error when CONVEX_SITE_URL not set", async () => {
|
|
1290
|
-
const tool = findTool("publish_to_queue");
|
|
1291
|
-
const result = (await tool.handler({ content: "test", postType: "insight" }));
|
|
1292
|
-
expect(result).toHaveProperty("error");
|
|
1293
|
-
expect(result.message).toContain("Platform not configured");
|
|
1294
|
-
});
|
|
1295
|
-
});
|
|
1296
|
-
// ═══════════════════════════════════════════════════════════════════════════
|
|
1297
|
-
// TIER 3 CAPABILITY TOOLS — new domains
|
|
1298
|
-
// ═══════════════════════════════════════════════════════════════════════════
|
|
1299
|
-
describe("Static: benchmark_models tool", () => {
|
|
1300
|
-
it("has valid schema with required prompt", () => {
|
|
1301
|
-
const tool = findTool("benchmark_models");
|
|
1302
|
-
expect(tool.inputSchema.required).toEqual(["prompt"]);
|
|
1303
|
-
expect(tool.inputSchema.properties).toHaveProperty("prompt");
|
|
1304
|
-
expect(tool.inputSchema.properties).toHaveProperty("system");
|
|
1305
|
-
});
|
|
1306
|
-
it("returns error when no providers available", async () => {
|
|
1307
|
-
const tool = findTool("benchmark_models");
|
|
1308
|
-
const result = (await tool.handler({ prompt: "test" }));
|
|
1309
|
-
// No API keys set in test env
|
|
1310
|
-
expect(result).toHaveProperty("error");
|
|
1311
|
-
expect(result.message).toContain("No LLM providers available");
|
|
1312
|
-
});
|
|
1313
|
-
});
|
|
1314
|
-
describe("Static: generate_report tool", () => {
|
|
1315
|
-
it("has valid schema with required title and sections", () => {
|
|
1316
|
-
const tool = findTool("generate_report");
|
|
1317
|
-
expect(tool.inputSchema.required).toEqual(["title", "sections"]);
|
|
1318
|
-
});
|
|
1319
|
-
it("generates markdown report from sections", async () => {
|
|
1320
|
-
const tool = findTool("generate_report");
|
|
1321
|
-
const result = (await tool.handler({
|
|
1322
|
-
title: "Test Report",
|
|
1323
|
-
sections: [
|
|
1324
|
-
{ heading: "Overview", content: "This is a test report." },
|
|
1325
|
-
{ heading: "Findings", content: "- Finding 1\n- Finding 2" },
|
|
1326
|
-
],
|
|
1327
|
-
metadata: { author: "test", project: "nodebench" },
|
|
1328
|
-
}));
|
|
1329
|
-
expect(result).toHaveProperty("markdown");
|
|
1330
|
-
expect(result.markdown).toContain("# Test Report");
|
|
1331
|
-
expect(result.markdown).toContain("## Overview");
|
|
1332
|
-
expect(result.markdown).toContain("## Findings");
|
|
1333
|
-
expect(result.markdown).toContain("Table of Contents");
|
|
1334
|
-
expect(result.sections).toBe(2);
|
|
1335
|
-
expect(result.characters).toBeGreaterThan(100);
|
|
1336
|
-
});
|
|
1337
|
-
});
|
|
1338
|
-
describe("Static: monitor_repo tool", () => {
|
|
1339
|
-
it("has valid schema with required repo", () => {
|
|
1340
|
-
const tool = findTool("monitor_repo");
|
|
1341
|
-
expect(tool.inputSchema.required).toEqual(["repo"]);
|
|
1342
|
-
});
|
|
1343
|
-
it("rejects invalid repo format", async () => {
|
|
1344
|
-
const tool = findTool("monitor_repo");
|
|
1345
|
-
const result = (await tool.handler({ repo: "not-a-valid-repo" }));
|
|
1346
|
-
expect(result).toHaveProperty("error");
|
|
1347
|
-
expect(result.message).toContain("Invalid repo format");
|
|
1348
|
-
});
|
|
1349
|
-
});
|
|
1350
|
-
describe("Static: run_tests_cli tool", () => {
|
|
1351
|
-
it("has valid schema with required command", () => {
|
|
1352
|
-
const tool = findTool("run_tests_cli");
|
|
1353
|
-
expect(tool.inputSchema.required).toEqual(["command"]);
|
|
1354
|
-
expect(tool.inputSchema.properties).toHaveProperty("cwd");
|
|
1355
|
-
expect(tool.inputSchema.properties).toHaveProperty("timeoutMs");
|
|
1356
|
-
});
|
|
1357
|
-
it("blocks dangerous commands", async () => {
|
|
1358
|
-
const tool = findTool("run_tests_cli");
|
|
1359
|
-
const result = (await tool.handler({ command: "rm -rf /" }));
|
|
1360
|
-
expect(result).toHaveProperty("error");
|
|
1361
|
-
expect(result.message).toContain("blocked");
|
|
1362
|
-
});
|
|
1363
|
-
it("runs a simple command successfully", async () => {
|
|
1364
|
-
const tool = findTool("run_tests_cli");
|
|
1365
|
-
const result = (await tool.handler({ command: "node -e \"console.log('hello')\"" }));
|
|
1366
|
-
expect(result.exitCode).toBe(0);
|
|
1367
|
-
expect(result.passed).toBe(true);
|
|
1368
|
-
expect(result.stdout).toContain("hello");
|
|
1369
|
-
});
|
|
1370
|
-
});
|
|
1371
|
-
describe("Static: diff_screenshots tool", () => {
|
|
1372
|
-
it("has valid schema with required baseline and candidate", () => {
|
|
1373
|
-
const tool = findTool("diff_screenshots");
|
|
1374
|
-
expect(tool.inputSchema.required).toEqual(["baseline", "candidate"]);
|
|
1375
|
-
expect(tool.inputSchema.properties).toHaveProperty("threshold");
|
|
1376
|
-
expect(tool.inputSchema.properties).toHaveProperty("outputPath");
|
|
1377
|
-
});
|
|
1378
|
-
});
|
|
1379
|
-
describe("Integration: search finds logged gaps", () => {
|
|
1380
|
-
it("should find gaps via search_all_knowledge after logging", async () => {
|
|
1381
|
-
const uniqueMarker = `vitest-marker-${Date.now()}`;
|
|
1382
|
-
// Create a cycle and gap with a unique marker
|
|
1383
|
-
const cycle = (await findTool("start_verification_cycle").handler({
|
|
1384
|
-
title: `search-test-${uniqueMarker}`,
|
|
1385
|
-
description: "test",
|
|
1386
|
-
}));
|
|
1387
|
-
await findTool("log_gap").handler({
|
|
1388
|
-
cycleId: cycle.cycleId,
|
|
1389
|
-
severity: "LOW",
|
|
1390
|
-
title: `gap-${uniqueMarker}`,
|
|
1391
|
-
description: `Testing search finds this gap ${uniqueMarker}`,
|
|
1392
|
-
rootCause: "test",
|
|
1393
|
-
fixStrategy: "none",
|
|
1394
|
-
});
|
|
1395
|
-
// Search for it
|
|
1396
|
-
const results = (await findTool("search_all_knowledge").handler({
|
|
1397
|
-
query: uniqueMarker,
|
|
1398
|
-
}));
|
|
1399
|
-
expect(results.gaps.length).toBeGreaterThanOrEqual(1);
|
|
1400
|
-
expect(results.gaps[0].title).toContain(uniqueMarker);
|
|
1401
|
-
expect(results.gaps[0].status).toBe("open");
|
|
1402
|
-
// Cleanup
|
|
1403
|
-
await findTool("abandon_cycle").handler({
|
|
1404
|
-
cycleId: cycle.cycleId,
|
|
1405
|
-
reason: "test cleanup",
|
|
1406
|
-
});
|
|
1407
|
-
});
|
|
1408
|
-
});
|
|
1409
|
-
// ═══════════════════════════════════════════════════════════════════════════
|
|
1410
|
-
// RESEARCH WRITING TOOLS — academic paper polishing
|
|
1411
|
-
// ═══════════════════════════════════════════════════════════════════════════
|
|
1412
|
-
describe("Static: research writing tools", () => {
|
|
1413
|
-
it("should export 8 research writing tools", () => {
|
|
1414
|
-
expect(researchWritingTools.length).toBe(8);
|
|
1415
|
-
});
|
|
1416
|
-
it("should include all 8 research writing tools in allTools", () => {
|
|
1417
|
-
const names = allTools.map((t) => t.name);
|
|
1418
|
-
expect(names).toContain("polish_academic_text");
|
|
1419
|
-
expect(names).toContain("translate_academic");
|
|
1420
|
-
expect(names).toContain("compress_or_expand_text");
|
|
1421
|
-
expect(names).toContain("remove_ai_signatures");
|
|
1422
|
-
expect(names).toContain("check_paper_logic");
|
|
1423
|
-
expect(names).toContain("generate_academic_caption");
|
|
1424
|
-
expect(names).toContain("analyze_experiment_data");
|
|
1425
|
-
expect(names).toContain("review_paper_as_reviewer");
|
|
1426
|
-
});
|
|
1427
|
-
it("polish_academic_text requires text parameter", () => {
|
|
1428
|
-
const tool = findTool("polish_academic_text");
|
|
1429
|
-
expect(tool.inputSchema.required).toContain("text");
|
|
1430
|
-
expect(tool.inputSchema.properties).toHaveProperty("targetVenue");
|
|
1431
|
-
expect(tool.inputSchema.properties).toHaveProperty("language");
|
|
1432
|
-
});
|
|
1433
|
-
it("translate_academic requires text, from, and to parameters", () => {
|
|
1434
|
-
const tool = findTool("translate_academic");
|
|
1435
|
-
expect(tool.inputSchema.required).toContain("text");
|
|
1436
|
-
expect(tool.inputSchema.required).toContain("from");
|
|
1437
|
-
expect(tool.inputSchema.required).toContain("to");
|
|
1438
|
-
});
|
|
1439
|
-
it("compress_or_expand_text requires text and mode parameters", () => {
|
|
1440
|
-
const tool = findTool("compress_or_expand_text");
|
|
1441
|
-
expect(tool.inputSchema.required).toContain("text");
|
|
1442
|
-
expect(tool.inputSchema.required).toContain("mode");
|
|
1443
|
-
const modeProp = tool.inputSchema.properties.mode;
|
|
1444
|
-
expect(modeProp.enum).toContain("compress");
|
|
1445
|
-
expect(modeProp.enum).toContain("expand");
|
|
1446
|
-
});
|
|
1447
|
-
it("remove_ai_signatures requires text parameter", () => {
|
|
1448
|
-
const tool = findTool("remove_ai_signatures");
|
|
1449
|
-
expect(tool.inputSchema.required).toContain("text");
|
|
1450
|
-
});
|
|
1451
|
-
it("check_paper_logic requires text parameter", () => {
|
|
1452
|
-
const tool = findTool("check_paper_logic");
|
|
1453
|
-
expect(tool.inputSchema.required).toContain("text");
|
|
1454
|
-
expect(tool.inputSchema.properties).toHaveProperty("checkType");
|
|
1455
|
-
});
|
|
1456
|
-
it("generate_academic_caption requires description and figureType", () => {
|
|
1457
|
-
const tool = findTool("generate_academic_caption");
|
|
1458
|
-
expect(tool.inputSchema.required).toContain("description");
|
|
1459
|
-
expect(tool.inputSchema.required).toContain("figureType");
|
|
1460
|
-
const ftProp = tool.inputSchema.properties.figureType;
|
|
1461
|
-
expect(ftProp.enum).toContain("figure");
|
|
1462
|
-
expect(ftProp.enum).toContain("table");
|
|
1463
|
-
});
|
|
1464
|
-
it("analyze_experiment_data requires data and goal parameters", () => {
|
|
1465
|
-
const tool = findTool("analyze_experiment_data");
|
|
1466
|
-
expect(tool.inputSchema.required).toContain("data");
|
|
1467
|
-
expect(tool.inputSchema.required).toContain("goal");
|
|
1468
|
-
expect(tool.inputSchema.properties).toHaveProperty("format");
|
|
1469
|
-
});
|
|
1470
|
-
it("review_paper_as_reviewer requires text and venue parameters", () => {
|
|
1471
|
-
const tool = findTool("review_paper_as_reviewer");
|
|
1472
|
-
expect(tool.inputSchema.required).toContain("text");
|
|
1473
|
-
expect(tool.inputSchema.required).toContain("venue");
|
|
1474
|
-
const strictProp = tool.inputSchema.properties.strictness;
|
|
1475
|
-
expect(strictProp.enum).toContain("lenient");
|
|
1476
|
-
expect(strictProp.enum).toContain("moderate");
|
|
1477
|
-
expect(strictProp.enum).toContain("harsh");
|
|
1478
|
-
});
|
|
1479
|
-
});
|
|
1480
|
-
describe("Unit: remove_ai_signatures pattern detection", () => {
|
|
1481
|
-
it("should detect AI patterns in text with known signatures", async () => {
|
|
1482
|
-
const tool = findTool("remove_ai_signatures");
|
|
1483
|
-
const result = (await tool.handler({
|
|
1484
|
-
text: "We leverage advanced techniques to delve into the multifaceted landscape of deep learning. Furthermore, it is worth noting that our comprehensive approach utilizes a robust framework.",
|
|
1485
|
-
}));
|
|
1486
|
-
expect(result.patternsFound).toBeGreaterThan(0);
|
|
1487
|
-
expect(result.detectedPatterns.length).toBeGreaterThan(0);
|
|
1488
|
-
expect(result.detectedPatterns.some((p) => p.label.includes("leverage"))).toBe(true);
|
|
1489
|
-
});
|
|
1490
|
-
it("should return clean verdict for natural text", async () => {
|
|
1491
|
-
const tool = findTool("remove_ai_signatures");
|
|
1492
|
-
const result = (await tool.handler({
|
|
1493
|
-
text: "We train a convolutional network on ImageNet for 90 epochs using SGD with momentum 0.9.",
|
|
1494
|
-
}));
|
|
1495
|
-
expect(result.patternsFound).toBe(0);
|
|
1496
|
-
expect(result.verdict).toContain("No significant AI signatures");
|
|
1497
|
-
});
|
|
1498
|
-
});
|
|
1499
|
-
describe("Static: academic_paper_writing methodology", () => {
|
|
1500
|
-
it("should return academic_paper_writing methodology with 8 steps", async () => {
|
|
1501
|
-
const tool = allTools.find((t) => t.name === "getMethodology");
|
|
1502
|
-
const result = (await tool.handler({ topic: "academic_paper_writing" }));
|
|
1503
|
-
expect(result.title).toContain("Academic Paper Writing");
|
|
1504
|
-
expect(result.steps.length).toBe(8);
|
|
1505
|
-
expect(result.steps[0].name).toBe("Polish Draft");
|
|
1506
|
-
expect(result.steps[6].name).toBe("Simulate Review");
|
|
1507
|
-
});
|
|
1508
|
-
});
|
|
1509
|
-
describe("Static: scan_terminal_security tool", () => {
|
|
1510
|
-
const tool = domainTools.find((t) => t.name === "scan_terminal_security");
|
|
1511
|
-
it("should exist", () => {
|
|
1512
|
-
expect(tool).toBeDefined();
|
|
1513
|
-
});
|
|
1514
|
-
it("should accept projectRoot and checks", () => {
|
|
1515
|
-
const props = tool.inputSchema.properties;
|
|
1516
|
-
expect(props).toHaveProperty("projectRoot");
|
|
1517
|
-
expect(props).toHaveProperty("checks");
|
|
1518
|
-
});
|
|
1519
|
-
it("should accept scanHome and verbose flags", () => {
|
|
1520
|
-
const props = tool.inputSchema.properties;
|
|
1521
|
-
expect(props).toHaveProperty("scanHome");
|
|
1522
|
-
expect(props).toHaveProperty("verbose");
|
|
1523
|
-
});
|
|
1524
|
-
});
|
|
1525
|
-
// ═══════════════════════════════════════════════════════════════════════════
|
|
1526
|
-
// v2.8.0 — Progressive Discovery, Boilerplate, Benchmark tools
|
|
1527
|
-
// ═══════════════════════════════════════════════════════════════════════════
|
|
1528
|
-
describe("Static: progressive discovery tools", () => {
|
|
1529
|
-
it("should include discover_tools, get_tool_quick_ref, get_workflow_chain", () => {
|
|
1530
|
-
const names = allTools.map((t) => t.name);
|
|
1531
|
-
expect(names).toContain("discover_tools");
|
|
1532
|
-
expect(names).toContain("get_tool_quick_ref");
|
|
1533
|
-
expect(names).toContain("get_workflow_chain");
|
|
1534
|
-
});
|
|
1535
|
-
it("discover_tools requires query parameter", () => {
|
|
1536
|
-
const tool = findTool("discover_tools");
|
|
1537
|
-
expect(tool.inputSchema.required).toContain("query");
|
|
1538
|
-
expect(tool.inputSchema.properties).toHaveProperty("category");
|
|
1539
|
-
expect(tool.inputSchema.properties).toHaveProperty("phase");
|
|
1540
|
-
expect(tool.inputSchema.properties).toHaveProperty("limit");
|
|
1541
|
-
});
|
|
1542
|
-
it("get_tool_quick_ref requires toolName parameter", () => {
|
|
1543
|
-
const tool = findTool("get_tool_quick_ref");
|
|
1544
|
-
expect(tool.inputSchema.required).toContain("toolName");
|
|
1545
|
-
});
|
|
1546
|
-
it("get_workflow_chain requires chain parameter", () => {
|
|
1547
|
-
const tool = findTool("get_workflow_chain");
|
|
1548
|
-
expect(tool.inputSchema.required).toContain("chain");
|
|
1549
|
-
});
|
|
1550
|
-
});
|
|
1551
|
-
describe("Unit: discover_tools hybrid search", () => {
|
|
1552
|
-
it("should return ranked results for verification query", async () => {
|
|
1553
|
-
const tool = findTool("discover_tools");
|
|
1554
|
-
const result = (await tool.handler({ query: "verify implementation" }));
|
|
1555
|
-
expect(result.resultCount).toBeGreaterThan(0);
|
|
1556
|
-
expect(result.results[0]).toHaveProperty("relevanceScore");
|
|
1557
|
-
expect(result.results[0]).toHaveProperty("quickRef");
|
|
1558
|
-
expect(result.results[0].relevanceScore).toBeGreaterThan(0);
|
|
1559
|
-
});
|
|
1560
|
-
it("should filter by category", async () => {
|
|
1561
|
-
const tool = findTool("discover_tools");
|
|
1562
|
-
const result = (await tool.handler({ query: "test", category: "eval" }));
|
|
1563
|
-
for (const r of result.results) {
|
|
1564
|
-
expect(r.category).toBe("eval");
|
|
1565
|
-
}
|
|
1566
|
-
});
|
|
1567
|
-
it("should filter by phase", async () => {
|
|
1568
|
-
const tool = findTool("discover_tools");
|
|
1569
|
-
const result = (await tool.handler({ query: "search find", phase: "research" }));
|
|
1570
|
-
for (const r of result.results) {
|
|
1571
|
-
expect(r.phase).toBe("research");
|
|
1572
|
-
}
|
|
1573
|
-
});
|
|
1574
|
-
it("should include matching workflow chains", async () => {
|
|
1575
|
-
const tool = findTool("discover_tools");
|
|
1576
|
-
const result = (await tool.handler({ query: "new feature build" }));
|
|
1577
|
-
expect(result.matchingWorkflows.length).toBeGreaterThan(0);
|
|
1578
|
-
});
|
|
1579
|
-
it("should return progressive hint", async () => {
|
|
1580
|
-
const tool = findTool("discover_tools");
|
|
1581
|
-
const result = (await tool.handler({ query: "verify" }));
|
|
1582
|
-
expect(result._progressiveHint).toBeTruthy();
|
|
1583
|
-
});
|
|
1584
|
-
});
|
|
1585
|
-
describe("Unit: get_tool_quick_ref", () => {
|
|
1586
|
-
it("should return quick ref for known tool", async () => {
|
|
1587
|
-
const tool = findTool("get_tool_quick_ref");
|
|
1588
|
-
const result = (await tool.handler({ toolName: "start_verification_cycle" }));
|
|
1589
|
-
expect(result.tool).toBe("start_verification_cycle");
|
|
1590
|
-
expect(result.category).toBe("verification");
|
|
1591
|
-
expect(result.quickRef).toHaveProperty("nextAction");
|
|
1592
|
-
expect(result.quickRef).toHaveProperty("nextTools");
|
|
1593
|
-
expect(result.quickRef.nextTools.length).toBeGreaterThan(0);
|
|
1594
|
-
});
|
|
1595
|
-
it("should return error for unknown tool with suggestions", async () => {
|
|
1596
|
-
const tool = findTool("get_tool_quick_ref");
|
|
1597
|
-
const result = (await tool.handler({ toolName: "nonexistent_tool_xyz" }));
|
|
1598
|
-
expect(result.error).toBe(true);
|
|
1599
|
-
expect(result).toHaveProperty("didYouMean");
|
|
1600
|
-
});
|
|
1601
|
-
it("should include related tool details when requested", async () => {
|
|
1602
|
-
const tool = findTool("get_tool_quick_ref");
|
|
1603
|
-
const result = (await tool.handler({
|
|
1604
|
-
toolName: "run_mandatory_flywheel",
|
|
1605
|
-
includeRelatedDetails: true,
|
|
1606
|
-
}));
|
|
1607
|
-
expect(result).toHaveProperty("relatedToolDetails");
|
|
1608
|
-
expect(Object.keys(result.relatedToolDetails).length).toBeGreaterThan(0);
|
|
1609
|
-
});
|
|
1610
|
-
});
|
|
1611
|
-
describe("Unit: get_workflow_chain", () => {
|
|
1612
|
-
it("should list all available chains", async () => {
|
|
1613
|
-
const tool = findTool("get_workflow_chain");
|
|
1614
|
-
const result = (await tool.handler({ chain: "list" }));
|
|
1615
|
-
expect(result.availableChains.length).toBeGreaterThan(0);
|
|
1616
|
-
const keys = result.availableChains.map((c) => c.key);
|
|
1617
|
-
expect(keys).toContain("new_feature");
|
|
1618
|
-
expect(keys).toContain("fix_bug");
|
|
1619
|
-
expect(keys).toContain("c_compiler_benchmark");
|
|
1620
|
-
});
|
|
1621
|
-
it("should return enriched chain steps", async () => {
|
|
1622
|
-
const tool = findTool("get_workflow_chain");
|
|
1623
|
-
const result = (await tool.handler({ chain: "new_feature" }));
|
|
1624
|
-
expect(result.name).toBe("Build a New Feature");
|
|
1625
|
-
expect(result.totalSteps).toBeGreaterThan(5);
|
|
1626
|
-
expect(result.steps[0]).toHaveProperty("tool");
|
|
1627
|
-
expect(result.steps[0]).toHaveProperty("action");
|
|
1628
|
-
expect(result.steps[0]).toHaveProperty("quickRef");
|
|
1629
|
-
});
|
|
1630
|
-
it("should return error for unknown chain", async () => {
|
|
1631
|
-
const tool = findTool("get_workflow_chain");
|
|
1632
|
-
const result = (await tool.handler({ chain: "nonexistent_chain" }));
|
|
1633
|
-
expect(result.error).toBe(true);
|
|
1634
|
-
});
|
|
1635
|
-
});
|
|
1636
|
-
describe("Static: boilerplate tools", () => {
|
|
1637
|
-
it("should include scaffold_nodebench_project and get_boilerplate_status", () => {
|
|
1638
|
-
const names = allTools.map((t) => t.name);
|
|
1639
|
-
expect(names).toContain("scaffold_nodebench_project");
|
|
1640
|
-
expect(names).toContain("get_boilerplate_status");
|
|
1641
|
-
});
|
|
1642
|
-
it("scaffold_nodebench_project requires projectPath, projectName, techStack", () => {
|
|
1643
|
-
const tool = findTool("scaffold_nodebench_project");
|
|
1644
|
-
expect(tool.inputSchema.required).toContain("projectPath");
|
|
1645
|
-
expect(tool.inputSchema.required).toContain("projectName");
|
|
1646
|
-
expect(tool.inputSchema.required).toContain("techStack");
|
|
1647
|
-
});
|
|
1648
|
-
it("get_boilerplate_status requires projectPath", () => {
|
|
1649
|
-
const tool = findTool("get_boilerplate_status");
|
|
1650
|
-
expect(tool.inputSchema.required).toContain("projectPath");
|
|
1651
|
-
});
|
|
1652
|
-
});
|
|
1653
|
-
describe("Unit: scaffold_nodebench_project dry run", () => {
|
|
1654
|
-
it("should preview files without creating them", async () => {
|
|
1655
|
-
const tmpDir = await mkdtemp(path.join(os.tmpdir(), "nodebench-scaffold-"));
|
|
1656
|
-
const tool = findTool("scaffold_nodebench_project");
|
|
1657
|
-
const result = (await tool.handler({
|
|
1658
|
-
projectPath: tmpDir,
|
|
1659
|
-
projectName: "test-project",
|
|
1660
|
-
techStack: "TypeScript, Node.js",
|
|
1661
|
-
dryRun: true,
|
|
1662
|
-
}));
|
|
1663
|
-
expect(result.dryRun).toBe(true);
|
|
1664
|
-
expect(result.summary.totalFiles).toBeGreaterThan(5);
|
|
1665
|
-
expect(result.willCreate.length).toBeGreaterThan(0);
|
|
1666
|
-
expect(result.willCreate).toContain("AGENTS.md");
|
|
1667
|
-
expect(result.willCreate).toContain("package.json");
|
|
1668
|
-
expect(result.willCreate).toContain(".mcp.json");
|
|
1669
|
-
expect(result._quickRef).toBeDefined();
|
|
1670
|
-
});
|
|
1671
|
-
});
|
|
1672
|
-
describe("Unit: scaffold_nodebench_project actual creation", () => {
|
|
1673
|
-
it("should create all project files", async () => {
|
|
1674
|
-
const tmpDir = await mkdtemp(path.join(os.tmpdir(), "nodebench-scaffold-"));
|
|
1675
|
-
const tool = findTool("scaffold_nodebench_project");
|
|
1676
|
-
const result = (await tool.handler({
|
|
1677
|
-
projectPath: tmpDir,
|
|
1678
|
-
projectName: "real-project",
|
|
1679
|
-
techStack: "TypeScript, React",
|
|
1680
|
-
dryRun: false,
|
|
1681
|
-
includeParallelAgents: true,
|
|
1682
|
-
includeGithubActions: true,
|
|
1683
|
-
}));
|
|
1684
|
-
expect(result.dryRun).toBe(false);
|
|
1685
|
-
expect(result.summary.created).toBeGreaterThan(5);
|
|
1686
|
-
// Verify key files exist
|
|
1687
|
-
const { existsSync } = await import("node:fs");
|
|
1688
|
-
expect(existsSync(path.join(tmpDir, "AGENTS.md"))).toBe(true);
|
|
1689
|
-
expect(existsSync(path.join(tmpDir, "package.json"))).toBe(true);
|
|
1690
|
-
expect(existsSync(path.join(tmpDir, ".mcp.json"))).toBe(true);
|
|
1691
|
-
expect(existsSync(path.join(tmpDir, ".parallel-agents"))).toBe(true);
|
|
1692
|
-
expect(existsSync(path.join(tmpDir, ".github", "workflows"))).toBe(true);
|
|
1693
|
-
});
|
|
1694
|
-
});
|
|
1695
|
-
describe("Unit: get_boilerplate_status", () => {
|
|
1696
|
-
it("should scan an empty directory and find everything missing", async () => {
|
|
1697
|
-
const tmpDir = await mkdtemp(path.join(os.tmpdir(), "nodebench-status-"));
|
|
1698
|
-
const tool = findTool("get_boilerplate_status");
|
|
1699
|
-
const result = (await tool.handler({ projectPath: tmpDir }));
|
|
1700
|
-
expect(result.completionPercentage).toBe(0);
|
|
1701
|
-
expect(result.missing).toBeGreaterThan(0);
|
|
1702
|
-
expect(result.missingFiles).toContain("AGENTS.md");
|
|
1703
|
-
expect(result.recommendations.length).toBeGreaterThan(0);
|
|
1704
|
-
});
|
|
1705
|
-
it("should detect existing files after scaffolding", async () => {
|
|
1706
|
-
const tmpDir = await mkdtemp(path.join(os.tmpdir(), "nodebench-status-"));
|
|
1707
|
-
// Scaffold first
|
|
1708
|
-
await findTool("scaffold_nodebench_project").handler({
|
|
1709
|
-
projectPath: tmpDir,
|
|
1710
|
-
projectName: "status-test",
|
|
1711
|
-
techStack: "TypeScript",
|
|
1712
|
-
dryRun: false,
|
|
1713
|
-
});
|
|
1714
|
-
// Then check status
|
|
1715
|
-
const tool = findTool("get_boilerplate_status");
|
|
1716
|
-
const result = (await tool.handler({ projectPath: tmpDir }));
|
|
1717
|
-
expect(result.completionPercentage).toBeGreaterThan(50);
|
|
1718
|
-
expect(result.found).toBeGreaterThan(5);
|
|
1719
|
-
});
|
|
1720
|
-
it("should throw for nonexistent path", async () => {
|
|
1721
|
-
const tool = findTool("get_boilerplate_status");
|
|
1722
|
-
await expect(tool.handler({ projectPath: "/nonexistent/path/xyz123" })).rejects.toThrow("does not exist");
|
|
1723
|
-
});
|
|
1724
|
-
});
|
|
1725
|
-
describe("Static: C-compiler benchmark tools", () => {
|
|
1726
|
-
it("should include all 3 benchmark tools", () => {
|
|
1727
|
-
const names = allTools.map((t) => t.name);
|
|
1728
|
-
expect(names).toContain("start_autonomy_benchmark");
|
|
1729
|
-
expect(names).toContain("log_benchmark_milestone");
|
|
1730
|
-
expect(names).toContain("complete_autonomy_benchmark");
|
|
1731
|
-
});
|
|
1732
|
-
it("start_autonomy_benchmark requires challenge parameter", () => {
|
|
1733
|
-
const tool = findTool("start_autonomy_benchmark");
|
|
1734
|
-
expect(tool.inputSchema.required).toContain("challenge");
|
|
1735
|
-
const challengeProp = tool.inputSchema.properties.challenge;
|
|
1736
|
-
expect(challengeProp.enum).toContain("c_compiler");
|
|
1737
|
-
expect(challengeProp.enum).toContain("rest_api");
|
|
1738
|
-
expect(challengeProp.enum).toContain("fullstack_app");
|
|
1739
|
-
expect(challengeProp.enum).toContain("list");
|
|
1740
|
-
});
|
|
1741
|
-
it("log_benchmark_milestone requires benchmarkId, milestoneId, verificationPassed", () => {
|
|
1742
|
-
const tool = findTool("log_benchmark_milestone");
|
|
1743
|
-
expect(tool.inputSchema.required).toContain("benchmarkId");
|
|
1744
|
-
expect(tool.inputSchema.required).toContain("milestoneId");
|
|
1745
|
-
expect(tool.inputSchema.required).toContain("verificationPassed");
|
|
1746
|
-
});
|
|
1747
|
-
it("complete_autonomy_benchmark requires benchmarkId and reason", () => {
|
|
1748
|
-
const tool = findTool("complete_autonomy_benchmark");
|
|
1749
|
-
expect(tool.inputSchema.required).toContain("benchmarkId");
|
|
1750
|
-
expect(tool.inputSchema.required).toContain("reason");
|
|
1751
|
-
});
|
|
1752
|
-
});
|
|
1753
|
-
describe("Unit: start_autonomy_benchmark", () => {
|
|
1754
|
-
it("should list all available challenges", async () => {
|
|
1755
|
-
const tool = findTool("start_autonomy_benchmark");
|
|
1756
|
-
const result = (await tool.handler({ challenge: "list" }));
|
|
1757
|
-
expect(result.availableChallenges.length).toBe(5);
|
|
1758
|
-
const keys = result.availableChallenges.map((c) => c.key);
|
|
1759
|
-
expect(keys).toContain("c_compiler");
|
|
1760
|
-
expect(keys).toContain("rest_api");
|
|
1761
|
-
expect(keys).toContain("fullstack_app");
|
|
1762
|
-
expect(keys).toContain("cli_tool");
|
|
1763
|
-
expect(keys).toContain("data_pipeline");
|
|
1764
|
-
});
|
|
1765
|
-
it("should start a cli_tool benchmark", async () => {
|
|
1766
|
-
const tool = findTool("start_autonomy_benchmark");
|
|
1767
|
-
const result = (await tool.handler({
|
|
1768
|
-
challenge: "cli_tool",
|
|
1769
|
-
notes: "test benchmark",
|
|
1770
|
-
}));
|
|
1771
|
-
expect(result.benchmarkId).toBeTruthy();
|
|
1772
|
-
expect(result.challenge).toBe("cli_tool");
|
|
1773
|
-
expect(result.difficulty).toBe("easy");
|
|
1774
|
-
expect(result.totalPoints).toBe(100);
|
|
1775
|
-
expect(result.milestones.length).toBe(8);
|
|
1776
|
-
expect(result._quickRef).toBeDefined();
|
|
1777
|
-
});
|
|
1778
|
-
it("should throw for unknown challenge", async () => {
|
|
1779
|
-
const tool = findTool("start_autonomy_benchmark");
|
|
1780
|
-
await expect(tool.handler({ challenge: "nonexistent_challenge" })).rejects.toThrow("Unknown challenge");
|
|
1781
|
-
});
|
|
1782
|
-
});
|
|
1783
|
-
describe("Integration: full benchmark lifecycle", () => {
|
|
1784
|
-
it("start → log milestone → complete", async () => {
|
|
1785
|
-
// 1. Start benchmark
|
|
1786
|
-
const benchmark = (await findTool("start_autonomy_benchmark").handler({
|
|
1787
|
-
challenge: "cli_tool",
|
|
1788
|
-
notes: "integration test",
|
|
1789
|
-
}));
|
|
1790
|
-
expect(benchmark.benchmarkId).toBeTruthy();
|
|
1791
|
-
// 2. Log a milestone
|
|
1792
|
-
const milestone = (await findTool("log_benchmark_milestone").handler({
|
|
1793
|
-
benchmarkId: benchmark.benchmarkId,
|
|
1794
|
-
milestoneId: "project_setup",
|
|
1795
|
-
verificationPassed: true,
|
|
1796
|
-
toolsUsed: ["run_closed_loop", "bootstrap_project"],
|
|
1797
|
-
notes: "Project initialized",
|
|
1798
|
-
}));
|
|
1799
|
-
expect(milestone.points).toBe(15);
|
|
1800
|
-
expect(milestone.progress.earnedPoints).toBe(15);
|
|
1801
|
-
expect(milestone.progress.milestonesCompleted).toBe(1);
|
|
1802
|
-
// 3. Log another milestone (failed)
|
|
1803
|
-
const milestone2 = (await findTool("log_benchmark_milestone").handler({
|
|
1804
|
-
benchmarkId: benchmark.benchmarkId,
|
|
1805
|
-
milestoneId: "arg_parsing",
|
|
1806
|
-
verificationPassed: false,
|
|
1807
|
-
notes: "Arg parsing failed tests",
|
|
1808
|
-
}));
|
|
1809
|
-
expect(milestone2.points).toBe(0);
|
|
1810
|
-
expect(milestone2.progress.earnedPoints).toBe(15); // unchanged
|
|
1811
|
-
// 4. Complete benchmark
|
|
1812
|
-
const completed = (await findTool("complete_autonomy_benchmark").handler({
|
|
1813
|
-
benchmarkId: benchmark.benchmarkId,
|
|
1814
|
-
reason: "stuck",
|
|
1815
|
-
notes: "Integration test complete",
|
|
1816
|
-
}));
|
|
1817
|
-
expect(completed.score.earnedPoints).toBe(15);
|
|
1818
|
-
expect(completed.score.percentage).toBe(15);
|
|
1819
|
-
expect(completed.score.grade).toContain("F");
|
|
1820
|
-
expect(completed.milestones.completed).toBe(1);
|
|
1821
|
-
expect(completed.milestones.failed).toBe(1);
|
|
1822
|
-
expect(completed.milestones.pending).toBe(6);
|
|
1823
|
-
expect(completed.analysis.strengths).toContain("Project Setup");
|
|
1824
|
-
expect(completed._quickRef).toBeDefined();
|
|
1825
|
-
});
|
|
1826
|
-
});
|
|
1827
|
-
// ═══════════════════════════════════════════════════════════════════════
// Multi-modal search engine quality tests
// ═══════════════════════════════════════════════════════════════════════
// Lightweight {name, description} view of the registry for search tests.
const toolDescs = allTools.map(({ name, description }) => ({ name, description }));
|
|
1831
|
-
describe("Search engine: registry coverage", () => {
|
|
1832
|
-
it("should have a registry entry for every tool (198/198)", () => {
|
|
1833
|
-
const missing = allTools.filter((t) => !TOOL_REGISTRY.has(t.name));
|
|
1834
|
-
expect(missing.map((t) => t.name)).toEqual([]);
|
|
1835
|
-
expect(TOOL_REGISTRY.size).toBe(allTools.length);
|
|
1836
|
-
});
|
|
1837
|
-
it("should expose all 8 search modes", () => {
|
|
1838
|
-
expect(SEARCH_MODES).toEqual(["hybrid", "fuzzy", "regex", "prefix", "semantic", "exact", "dense", "embedding"]);
|
|
1839
|
-
});
|
|
1840
|
-
it("discover_tools category enum covers every registry category", () => {
|
|
1841
|
-
const discoverTool = allTools.find((t) => t.name === "discover_tools");
|
|
1842
|
-
expect(discoverTool).toBeDefined();
|
|
1843
|
-
const categoryEnum = discoverTool.inputSchema.properties.category.enum;
|
|
1844
|
-
const registryCategories = new Set(ALL_REGISTRY_ENTRIES.map((e) => e.category));
|
|
1845
|
-
for (const cat of registryCategories) {
|
|
1846
|
-
expect(categoryEnum, `category "${cat}" missing from discover_tools enum`).toContain(cat);
|
|
1847
|
-
}
|
|
1848
|
-
});
|
|
1849
|
-
it("get_workflow_chain enum covers every WORKFLOW_CHAINS key", () => {
|
|
1850
|
-
const chainTool = allTools.find((t) => t.name === "get_workflow_chain");
|
|
1851
|
-
expect(chainTool).toBeDefined();
|
|
1852
|
-
const chainEnum = chainTool.inputSchema.properties.chain.enum;
|
|
1853
|
-
for (const key of Object.keys(WORKFLOW_CHAINS)) {
|
|
1854
|
-
expect(chainEnum, `chain "${key}" missing from get_workflow_chain enum`).toContain(key);
|
|
1855
|
-
}
|
|
1856
|
-
expect(chainEnum).toContain("list");
|
|
1857
|
-
});
|
|
1858
|
-
it("should have quickRef for every registered tool", () => {
|
|
1859
|
-
for (const tool of allTools) {
|
|
1860
|
-
const qr = getQuickRef(tool.name);
|
|
1861
|
-
expect(qr, `Missing quickRef for ${tool.name}`).not.toBeNull();
|
|
1862
|
-
expect(qr.nextAction.length).toBeGreaterThan(10);
|
|
1863
|
-
expect(qr.nextTools.length).toBeGreaterThan(0);
|
|
1864
|
-
}
|
|
1865
|
-
});
|
|
1866
|
-
});
|
|
1867
|
-
describe("Search engine: hybrid mode (default)", () => {
|
|
1868
|
-
it("should find benchmark tools when searching 'benchmark'", () => {
|
|
1869
|
-
const results = hybridSearch("benchmark", toolDescs, { limit: 10 });
|
|
1870
|
-
const names = results.map((r) => r.name);
|
|
1871
|
-
expect(names).toContain("start_autonomy_benchmark");
|
|
1872
|
-
expect(names).toContain("complete_autonomy_benchmark");
|
|
1873
|
-
expect(names).toContain("benchmark_models");
|
|
1874
|
-
expect(names).toContain("log_benchmark_milestone");
|
|
1875
|
-
});
|
|
1876
|
-
it("should find scaffold tools when searching 'scaffold'", () => {
|
|
1877
|
-
const results = hybridSearch("scaffold", toolDescs, { limit: 10 });
|
|
1878
|
-
const names = results.map((r) => r.name);
|
|
1879
|
-
expect(names).toContain("scaffold_directory");
|
|
1880
|
-
expect(names).toContain("scaffold_nodebench_project");
|
|
1881
|
-
});
|
|
1882
|
-
it("should rank exact name matches highest", () => {
|
|
1883
|
-
const results = hybridSearch("web_search", toolDescs, { limit: 5 });
|
|
1884
|
-
expect(results[0].name).toBe("web_search");
|
|
1885
|
-
});
|
|
1886
|
-
it("should filter by category", () => {
|
|
1887
|
-
const results = hybridSearch("test", toolDescs, { category: "eval", limit: 10 });
|
|
1888
|
-
for (const r of results) {
|
|
1889
|
-
expect(r.category).toBe("eval");
|
|
1890
|
-
}
|
|
1891
|
-
});
|
|
1892
|
-
it("should filter by phase", () => {
|
|
1893
|
-
const results = hybridSearch("verify", toolDescs, { phase: "verify", limit: 10 });
|
|
1894
|
-
for (const r of results) {
|
|
1895
|
-
expect(r.phase).toBe("verify");
|
|
1896
|
-
}
|
|
1897
|
-
});
|
|
1898
|
-
it("should include matchReasons when explain=true", () => {
|
|
1899
|
-
const results = hybridSearch("verify", toolDescs, { limit: 3, explain: true });
|
|
1900
|
-
expect(results.length).toBeGreaterThan(0);
|
|
1901
|
-
expect(results[0].matchReasons.length).toBeGreaterThan(0);
|
|
1902
|
-
expect(results[0].matchReasons[0]).toMatch(/keyword|prefix|fuzzy|semantic|ngram|bigram|regex|domain/);
|
|
1903
|
-
});
|
|
1904
|
-
it("should return empty matchReasons when explain=false", () => {
|
|
1905
|
-
const results = hybridSearch("verify", toolDescs, { limit: 3, explain: false });
|
|
1906
|
-
expect(results[0].matchReasons).toEqual([]);
|
|
1907
|
-
});
|
|
1908
|
-
});
|
|
1909
|
-
describe("Search engine: fuzzy mode (typo tolerance)", () => {
|
|
1910
|
-
it("should find 'verify' tools when searching 'verifiy' (typo)", () => {
|
|
1911
|
-
const results = hybridSearch("verifiy", toolDescs, { mode: "fuzzy", limit: 10 });
|
|
1912
|
-
const names = results.map((r) => r.name);
|
|
1913
|
-
expect(names.some((n) => n.includes("verif"))).toBe(true);
|
|
1914
|
-
});
|
|
1915
|
-
it("should find 'benchmark' tools when searching 'benchmrk' (typo)", () => {
|
|
1916
|
-
const results = hybridSearch("benchmrk", toolDescs, { mode: "fuzzy", limit: 10 });
|
|
1917
|
-
const names = results.map((r) => r.name);
|
|
1918
|
-
expect(names.some((n) => n.includes("benchmark"))).toBe(true);
|
|
1919
|
-
});
|
|
1920
|
-
it("should find 'scaffold' when searching 'scafold' (typo)", () => {
|
|
1921
|
-
const results = hybridSearch("scafold", toolDescs, { mode: "fuzzy", limit: 10 });
|
|
1922
|
-
const names = results.map((r) => r.name);
|
|
1923
|
-
expect(names.some((n) => n.includes("scaffold"))).toBe(true);
|
|
1924
|
-
});
|
|
1925
|
-
});
|
|
1926
|
-
describe("Search engine: regex mode", () => {
|
|
1927
|
-
it("should match tools by regex pattern on name", () => {
|
|
1928
|
-
const results = hybridSearch("^capture_.*screenshot$", toolDescs, { mode: "regex", limit: 10 });
|
|
1929
|
-
expect(results.length).toBeGreaterThan(0);
|
|
1930
|
-
expect(results[0].name).toBe("capture_ui_screenshot");
|
|
1931
|
-
});
|
|
1932
|
-
it("should match tools by regex on tags", () => {
|
|
1933
|
-
const results = hybridSearch("c-compiler", toolDescs, { mode: "regex", limit: 10 });
|
|
1934
|
-
const names = results.map((r) => r.name);
|
|
1935
|
-
expect(names).toContain("start_autonomy_benchmark");
|
|
1936
|
-
});
|
|
1937
|
-
it("should handle invalid regex gracefully", () => {
|
|
1938
|
-
const results = hybridSearch("[invalid(", toolDescs, { mode: "regex", limit: 10 });
|
|
1939
|
-
expect(results).toEqual([]);
|
|
1940
|
-
});
|
|
1941
|
-
});
|
|
1942
|
-
describe("Search engine: prefix mode", () => {
|
|
1943
|
-
it("should find all 'run_' prefixed tools", () => {
|
|
1944
|
-
const results = hybridSearch("run_", toolDescs, { mode: "prefix", limit: 20 });
|
|
1945
|
-
for (const r of results) {
|
|
1946
|
-
expect(r.name.startsWith("run_")).toBe(true);
|
|
1947
|
-
}
|
|
1948
|
-
expect(results.length).toBeGreaterThanOrEqual(5);
|
|
1949
|
-
});
|
|
1950
|
-
it("should find 'cap' → capture_* tools", () => {
|
|
1951
|
-
const results = hybridSearch("cap", toolDescs, { mode: "prefix", limit: 10 });
|
|
1952
|
-
const names = results.map((r) => r.name);
|
|
1953
|
-
expect(names).toContain("capture_ui_screenshot");
|
|
1954
|
-
expect(names).toContain("capture_responsive_suite");
|
|
1955
|
-
});
|
|
1956
|
-
});
|
|
1957
|
-
describe("Search engine: semantic mode (synonym expansion)", () => {
|
|
1958
|
-
it("should expand 'check' to find 'verify' tools", () => {
|
|
1959
|
-
const results = hybridSearch("check", toolDescs, { mode: "semantic", limit: 10 });
|
|
1960
|
-
const names = results.map((r) => r.name);
|
|
1961
|
-
expect(names.some((n) => n.includes("verif") || n.includes("gate") || n.includes("quality") || n.includes("check"))).toBe(true);
|
|
1962
|
-
});
|
|
1963
|
-
it("should expand 'fix' to find 'resolve' tools", () => {
|
|
1964
|
-
const results = hybridSearch("fix", toolDescs, { mode: "semantic", limit: 10 });
|
|
1965
|
-
const names = results.map((r) => r.name);
|
|
1966
|
-
expect(names).toContain("resolve_gap");
|
|
1967
|
-
});
|
|
1968
|
-
it("should expand 'deploy' to find 'ship' phase tools", () => {
|
|
1969
|
-
const results = hybridSearch("deploy", toolDescs, { mode: "semantic", limit: 15 });
|
|
1970
|
-
const names = results.map((r) => r.name);
|
|
1971
|
-
expect(names.some((n) => n.includes("mandatory_flywheel") || n.includes("quality_gate"))).toBe(true);
|
|
1972
|
-
});
|
|
1973
|
-
});
|
|
1974
|
-
describe("Search engine: exact mode", () => {
|
|
1975
|
-
it("should return only exact name match", () => {
|
|
1976
|
-
const results = hybridSearch("web_search", toolDescs, { mode: "exact", limit: 5 });
|
|
1977
|
-
expect(results.length).toBeGreaterThan(0);
|
|
1978
|
-
expect(results[0].name).toBe("web_search");
|
|
1979
|
-
expect(results[0].score).toBeGreaterThanOrEqual(100);
|
|
1980
|
-
});
|
|
1981
|
-
});
|
|
1982
|
-
describe("Search engine: bigram phrase matching", () => {
|
|
1983
|
-
it("should match 'quality gate' as a phrase", () => {
|
|
1984
|
-
const results = hybridSearch("quality gate", toolDescs, { limit: 5 });
|
|
1985
|
-
const names = results.map((r) => r.name);
|
|
1986
|
-
expect(names).toContain("run_quality_gate");
|
|
1987
|
-
});
|
|
1988
|
-
it("should match 'parallel agents' as a phrase", () => {
|
|
1989
|
-
const results = hybridSearch("parallel agents", toolDescs, { limit: 5 });
|
|
1990
|
-
const names = results.map((r) => r.name);
|
|
1991
|
-
expect(names.some((n) => n.includes("parallel") || n.includes("agent"))).toBe(true);
|
|
1992
|
-
});
|
|
1993
|
-
});
|
|
1994
|
-
// ── Dense search NDCG regression guard ──────────────────────────────────
// Tested BM25 vs TF-IDF cosine (v2.14.2): TF-IDF won 0.692 vs 0.691.
// BM25's length normalization adds no value for short tool descriptions.
// Keeping TF-IDF cosine. This test guards against ranking regressions.
describe("Search engine: dense search NDCG@5 regression guard", () => {
  // NDCG@k with graded relevance: a name's grade is its (reversed)
  // position in the ideal ranking; names absent from the ideal list
  // contribute 0.
  function ndcg(rankedNames, idealNames, k) {
    const gradeOf = new Map(idealNames.map((name, i) => [name, idealNames.length - i]));
    let dcg = 0;
    for (let pos = 0; pos < Math.min(k, rankedNames.length); pos++) {
      dcg += (gradeOf.get(rankedNames[pos]) ?? 0) / Math.log2(pos + 2);
    }
    // Ideal DCG: grades in descending order at the same positions.
    let idcg = 0;
    const descendingGrades = idealNames.map((_, i) => idealNames.length - i).sort((a, b) => b - a);
    for (let pos = 0; pos < Math.min(k, descendingGrades.length); pos++) {
      idcg += descendingGrades[pos] / Math.log2(pos + 2);
    }
    return idcg === 0 ? 0 : dcg / idcg;
  }
  const EVAL_QUERIES = [
    { query: "verify my implementation", ideal: ["start_verification_cycle", "get_verification_status", "log_test_result", "run_quality_gate", "triple_verify"] },
    { query: "security audit", ideal: ["scan_dependencies", "run_code_analysis", "scan_terminal_security", "assess_risk", "check_git_compliance"] },
    { query: "write an academic paper", ideal: ["polish_academic_text", "check_paper_logic", "generate_academic_caption", "review_paper_as_reviewer", "compress_or_expand_text"] },
    { query: "deploy my changes", ideal: ["run_mandatory_flywheel", "run_quality_gate", "assess_risk", "run_closed_loop", "log_test_result"] },
    { query: "parallel agent coordination", ideal: ["claim_agent_task", "get_parallel_status", "assign_agent_role", "bootstrap_parallel_agents", "release_agent_task"] },
    { query: "seo website performance", ideal: ["seo_audit_url", "check_page_performance", "analyze_seo_content", "check_wordpress_site", "scan_wordpress_updates"] },
    { query: "voice pipeline latency", ideal: ["benchmark_voice_latency", "design_voice_pipeline", "analyze_voice_config", "generate_voice_scaffold"] },
    { query: "session notes context", ideal: ["save_session_note", "load_session_notes", "refresh_task_context"] },
    { query: "git compliance merge", ideal: ["check_git_compliance", "enforce_merge_gate", "review_pr_checklist"] },
    { query: "benchmark autonomous capability", ideal: ["start_autonomy_benchmark", "complete_autonomy_benchmark", "log_benchmark_milestone", "benchmark_models"] },
    { query: "find tools for testing", ideal: ["discover_tools", "findTools", "log_test_result", "run_tests_cli", "start_eval_run"] },
    { query: "knowledge learning record", ideal: ["record_learning", "search_all_knowledge", "save_session_note"] },
  ];
  it("TF-IDF cosine dense search should maintain NDCG@5 >= 0.60 across eval queries", () => {
    const { vectors, idf } = buildDenseIndex();
    const K = 5;
    let totalNDCG = 0;
    for (const { query, ideal } of EVAL_QUERIES) {
      // Build a max-frequency-normalized TF-IDF vector for the query.
      const termCounts = new Map();
      for (const tok of tokenize(query.toLowerCase())) {
        termCounts.set(tok, (termCounts.get(tok) ?? 0) + 1);
      }
      const peak = Math.max(...termCounts.values(), 1);
      const queryVec = new Map();
      for (const [term, count] of termCounts) {
        queryVec.set(term, (count / peak) * (idf.get(term) ?? 1));
      }
      // Score every document vector by cosine similarity.
      const scored = [];
      for (const [name, docVec] of vectors) {
        let dot = 0;
        let qNorm = 0;
        for (const [term, weight] of queryVec) {
          qNorm += weight * weight;
          const docWeight = docVec.get(term);
          if (docWeight !== undefined) {
            dot += weight * docWeight;
          }
        }
        let dNorm = 0;
        for (const weight of docVec.values()) {
          dNorm += weight * weight;
        }
        const sim = (qNorm === 0 || dNorm === 0) ? 0 : dot / (Math.sqrt(qNorm) * Math.sqrt(dNorm));
        if (sim > 0) {
          scored.push({ name, sim });
        }
      }
      scored.sort((a, b) => b.sim - a.sim);
      totalNDCG += ndcg(scored.slice(0, K).map(({ name }) => name), ideal, K);
    }
    const avgNDCG = totalNDCG / EVAL_QUERIES.length;
    expect(avgNDCG).toBeGreaterThanOrEqual(0.60);
  });
});
|
|
2067
|
-
// ── FTS5+BM25 A/B test: search_all_knowledge (recon_findings + gaps) ────
// Verifies that FTS5 BM25 ranking produces relevance-ordered results
// for variable-length recon findings and gaps, compared to LIKE (recency-only).
describe("FTS5 BM25: search_all_knowledge relevance ranking", () => {
  const searchTool = reconTools.find(({ name }) => name === "search_all_knowledge");
  const logFinding = reconTools.find(({ name }) => name === "log_recon_finding");
  const runRecon = reconTools.find(({ name }) => name === "run_recon");
  const startCycle = verificationTools.find(({ name }) => name === "start_verification_cycle");
  const logGap = verificationTools.find(({ name }) => name === "log_gap");
  it("should rank recon findings by BM25 relevance (term-specific > generic mentions)", async () => {
    // Setup: create a recon session with varied findings
    const session = await runRecon.handler({ target: "BM25 FTS5 test session" });
    const sid = session.sessionId;
    // Insert findings — the "MCP SDK breaking change" finding is highly relevant
    const seededFindings = [
      {
        category: "breaking_change",
        summary: "MCP SDK v2.0 introduces breaking changes to the transport layer requiring migration",
        relevance: "All MCP servers must update their transport initialization code",
        actionItems: "Update transport from stdio to new StreamableHTTP pattern",
      },
      {
        category: "best_practice",
        summary: "React 19 compiler optimizations reduce bundle size by 15%",
        relevance: "Frontend build pipeline could benefit from upgrade",
        actionItems: "Evaluate React 19 migration path",
      },
      {
        category: "new_feature",
        summary: "New MCP SDK sampling API enables server-initiated LLM requests",
        relevance: "MCP servers can now call LLMs directly through the protocol",
        actionItems: "Integrate sampling API into MCP tool handlers",
      },
    ];
    for (const finding of seededFindings) {
      await logFinding.handler({ sessionId: sid, ...finding });
    }
    // Query for "MCP SDK breaking" — should rank MCP findings above React
    const searched = await searchTool.handler({ query: "MCP SDK breaking" });
    const findings = searched.reconFindings;
    // At minimum, MCP-related findings should appear (FTS5 MATCH or LIKE fallback)
    expect(findings.length).toBeGreaterThan(0);
    // If FTS5 BM25 is working, the breaking_change finding should rank first
    // (it has the most term overlap with "MCP SDK breaking")
    if (findings.length >= 2) {
      expect(findings[0].summary.toLowerCase()).toContain("breaking");
    }
  });
  it("should rank gaps by BM25 relevance (specific match > loose mention)", async () => {
    // Setup: create a verification cycle with varied gaps
    const cycle = await startCycle.handler({
      title: "BM25 gaps FTS5 test cycle",
    });
    const cid = cycle.cycleId;
    const seededGaps = [
      {
        severity: "HIGH",
        title: "SQLite WAL mode lock contention under parallel writes",
        description: "When multiple agents write to SQLite simultaneously, WAL mode lock contention causes timeout errors after 5 seconds",
        fixStrategy: "Implement write queue with retry backoff for SQLite parallel access",
      },
      {
        severity: "MEDIUM",
        title: "API rate limiting not implemented",
        description: "External API calls have no rate limiting or retry logic",
        fixStrategy: "Add exponential backoff with jitter for API calls",
      },
      {
        severity: "LOW",
        title: "Test coverage below 80% for SQLite module",
        description: "SQLite database module has only 60% test coverage, missing edge cases for concurrent access",
        fixStrategy: "Add integration tests for SQLite concurrent write scenarios",
      },
    ];
    for (const gap of seededGaps) {
      await logGap.handler({ cycleId: cid, ...gap });
    }
    // Query for "SQLite parallel" — should rank SQLite-specific gaps above API gap
    const searched = await searchTool.handler({ query: "SQLite parallel" });
    const gaps = searched.gaps;
    expect(gaps.length).toBeGreaterThan(0);
    // If FTS5 BM25 is working, the WAL lock contention gap (HIGH severity, most term overlap) ranks first
    if (gaps.length >= 2) {
      expect(gaps[0].title.toLowerCase()).toContain("sqlite");
    }
  });
});
|
|
2152
|
-
// ── Gateway BM25 meta-tool A/B test ────────────────────────────────────
// Tests BM25 scoring in the gateway metaTools findTools — verifies that
// IDF-weighted scoring ranks specific tools higher than generic matches.
describe("Gateway BM25: findTools IDF-weighted ranking", () => {
    // Lowercase alpha/underscore tokens only — mirrors the gateway tokenizer.
    function tokenize(text) {
        return text.toLowerCase().match(/[a-z_]+/g) ?? [];
    }
    // Word-count baseline (old approach): rank purely by how many query words
    // appear anywhere in the tool's name + description.
    function wordCountSearch(query, tools) {
        const queryWords = query.toLowerCase().split(/\s+/).filter(Boolean);
        const scored = [];
        for (const tool of tools) {
            const haystack = `${tool.name} ${tool.description}`.toLowerCase();
            let hits = 0;
            for (const word of queryWords) {
                if (haystack.includes(word))
                    hits += 1;
            }
            if (hits > 0)
                scored.push({ name: tool.name, hits });
        }
        scored.sort((a, b) => b.hits - a.hits);
        return scored.map((entry) => entry.name);
    }
    // BM25 search (new approach): IDF-weighted, document-length-normalized.
    function bm25Search(query, tools) {
        // Tokenized document per tool name.
        const docs = new Map();
        for (const tool of tools) {
            docs.set(tool.name, tokenize(`${tool.name} ${tool.description}`));
        }
        let totalTokens = 0;
        for (const tokens of docs.values()) {
            totalTokens += tokens.length;
        }
        const avgDocLen = docs.size > 0 ? totalTokens / docs.size : 1;
        // Document frequency: number of docs containing each unique term.
        const docFreq = new Map();
        for (const tokens of docs.values()) {
            for (const term of new Set(tokens)) {
                docFreq.set(term, (docFreq.get(term) ?? 0) + 1);
            }
        }
        // Standard BM25 IDF with +1 smoothing so values stay positive.
        const numDocs = docs.size;
        const idf = new Map();
        for (const [term, df] of docFreq) {
            idf.set(term, Math.log((numDocs - df + 0.5) / (df + 0.5) + 1));
        }
        const queryTokens = tokenize(query);
        const k1 = 1.2;
        const b = 0.75;
        const ranked = [];
        for (const tool of tools) {
            const docTokens = docs.get(tool.name) ?? [];
            const docLen = docTokens.length;
            const termCounts = new Map();
            for (const token of docTokens) {
                termCounts.set(token, (termCounts.get(token) ?? 0) + 1);
            }
            let score = 0;
            for (const qt of queryTokens) {
                const tf = termCounts.get(qt) ?? 0;
                if (tf === 0)
                    continue;
                const weight = idf.get(qt) ?? 0;
                // Okapi BM25 term contribution: tf saturation + length normalization.
                score += weight * (tf * (k1 + 1)) / (tf + k1 * (1 - b + b * (docLen / avgDocLen)));
            }
            if (score > 0)
                ranked.push({ name: tool.name, score });
        }
        ranked.sort((a, b) => b.score - a.score);
        return ranked.map((entry) => entry.name);
    }
    // Use the real tool list from allTools
    const toolEntries = allTools.map((t) => ({ name: t.name, description: t.description }));
    // Queries where IDF matters — rare terms should beat common ones
    const IDF_QUERIES = [
        {
            query: "flicker detection android",
            mustRankHigher: "start_flicker_analysis",
            mustRankLower: "web_search",
            reason: "'flicker' is rare (high IDF), 'search' is common (low IDF)",
        },
        {
            query: "autonomous benchmark c compiler",
            mustRankHigher: "start_autonomy_benchmark",
            mustRankLower: "run_quality_gate",
            reason: "'autonomy' and 'benchmark' are specific (high IDF)",
        },
        {
            query: "toon encode token",
            mustRankHigher: "toon_encode",
            mustRankLower: "record_learning",
            reason: "'toon' is extremely rare (high IDF), should dominate scoring",
        },
    ];
    it("BM25 should outperform word-count on IDF-sensitive queries", () => {
        // A ranking "wins" when the specific tool appears and outranks the
        // generic one (or the generic one is absent entirely).
        const ranksCorrectly = (results, high, low) => {
            const highIdx = results.indexOf(high);
            const lowIdx = results.indexOf(low);
            return highIdx !== -1 && (lowIdx === -1 || highIdx < lowIdx);
        };
        let bm25Wins = 0;
        let wordCountWins = 0;
        for (const { query, mustRankHigher, mustRankLower } of IDF_QUERIES) {
            if (ranksCorrectly(bm25Search(query, toolEntries), mustRankHigher, mustRankLower))
                bm25Wins += 1;
            if (ranksCorrectly(wordCountSearch(query, toolEntries), mustRankHigher, mustRankLower))
                wordCountWins += 1;
        }
        // BM25 should win at least as many IDF-sensitive queries as word-count
        expect(bm25Wins).toBeGreaterThanOrEqual(wordCountWins);
        // BM25 should get at least 2 of 3 IDF-sensitive queries correct
        expect(bm25Wins).toBeGreaterThanOrEqual(2);
    });
    it("BM25 should return results for all eval queries (no regressions)", () => {
        const queries = ["verify implementation", "search the web", "create document", "find stock prices", "security audit"];
        for (const q of queries) {
            expect(bm25Search(q, toolEntries).length).toBeGreaterThan(0);
        }
    });
});
|
|
2269
|
-
// ── Contract Compliance Tool Tests ──────────────────────────────────────
describe("check_contract_compliance", () => {
    // Replay a list of tool calls (all successful) into the session log.
    const logSuccesses = async (sessionId, toolNames) => {
        const logTool = findTool("log_tool_call");
        for (const toolName of toolNames) {
            await logTool.handler({ sessionId, toolName, resultStatus: "success" });
        }
    };
    // Run the compliance checker for a session and return its report.
    const checkCompliance = (sessionId, extra = {}) => {
        const tool = findTool("check_contract_compliance");
        return tool.handler({ sessionId, ...extra });
    };
    it("should return N/A score when no tool call data exists", async () => {
        const result = await checkCompliance("nonexistent-session-xyz-" + Date.now());
        expect(result.score).toBe(0);
        expect(result.grade).toBe("N/A");
    });
    it("should score a perfect session with all contract phases", async () => {
        const sessionId = `compliance-test-perfect-${Date.now()}`;
        // Simulate a perfect agent session following the contract
        await logSuccesses(sessionId, [
            // Front door (25pts)
            "search_all_knowledge",
            "getMethodology",
            "discover_tools",
            "get_workflow_chain",
            // Pre-impl (15pts)
            "run_recon",
            "log_recon_finding",
            "assess_risk",
            // Implementation
            "start_verification_cycle",
            "log_phase_findings",
            // Ship gates (30pts)
            "run_closed_loop",
            "log_test_result",
            "start_eval_run",
            "record_eval_result",
            "run_quality_gate",
            "run_mandatory_flywheel",
            "record_learning",
        ]);
        const result = await checkCompliance(sessionId);
        expect(result.score).toBeGreaterThanOrEqual(85);
        expect(result.grade).toMatch(/^[AB]/);
        expect(result.violations.length).toBeLessThanOrEqual(2);
        expect(result.dimensions.front_door.score).toBeGreaterThanOrEqual(20);
        expect(result.dimensions.ship_gates.score).toBeGreaterThanOrEqual(25);
    });
    it("should flag violations when agent skips front-door protocol", async () => {
        const sessionId = `compliance-test-no-frontdoor-${Date.now()}`;
        // Simulate an agent that jumps straight to implementation
        await logSuccesses(sessionId, [
            "run_closed_loop",
            "log_test_result",
            "log_gap",
            "resolve_gap",
        ]);
        const result = await checkCompliance(sessionId);
        expect(result.score).toBeLessThan(50);
        expect(result.grade).toMatch(/^[DF]/);
        expect(result.dimensions.front_door.score).toBeLessThanOrEqual(5);
        expect(result.violations.some((v) => v.dimension === "front_door")).toBe(true);
        expect(result.recommendations.length).toBeGreaterThan(0);
    });
    it("should detect self-setup recovery from errors", async () => {
        const sessionId = `compliance-test-selfsetup-${Date.now()}`;
        const logTool = findTool("log_tool_call");
        await logTool.handler({ sessionId, toolName: "search_all_knowledge", resultStatus: "success" });
        await logTool.handler({ sessionId, toolName: "discover_tools", resultStatus: "error", error: "No provider available" });
        await logTool.handler({ sessionId, toolName: "setup_local_env", resultStatus: "success" });
        await logTool.handler({ sessionId, toolName: "bootstrap_project", resultStatus: "success" });
        await logTool.handler({ sessionId, toolName: "discover_tools", resultStatus: "success" });
        const result = await checkCompliance(sessionId);
        // Self-setup should get full credit since agent recovered from errors
        expect(result.dimensions.self_setup.score).toBe(10);
    });
    it("should give full parallel credit when no parallel tools used (N/A)", async () => {
        const sessionId = `compliance-test-noparallel-${Date.now()}`;
        await logSuccesses(sessionId, ["search_all_knowledge", "run_closed_loop"]);
        const result = await checkCompliance(sessionId);
        // No parallel tools = full credit (not applicable)
        expect(result.dimensions.parallel_coordination.score).toBe(10);
    });
    it("should support verbose mode with timeline", async () => {
        const sessionId = `compliance-test-verbose-${Date.now()}`;
        await logSuccesses(sessionId, ["search_all_knowledge", "getMethodology"]);
        const result = await checkCompliance(sessionId, { verbose: true });
        expect(result.timeline).toBeDefined();
        expect(result.timeline.length).toBe(2);
        expect(result.timeline[0].tool).toBe("search_all_knowledge");
        expect(result.timeline[1].tool).toBe("getMethodology");
    });
});
|
|
2371
|
-
describe("Registry: check_contract_compliance has quickRef", () => {
    it("should have quickRef with methodology agent_evaluation", () => {
        // The compliance checker must be registered with full quickRef metadata.
        const entry = ALL_REGISTRY_ENTRIES.find((candidate) => candidate.name === "check_contract_compliance");
        expect(entry).toBeDefined();
        expect(entry.quickRef).toBeDefined();
        expect(entry.quickRef.methodology).toBe("agent_evaluation");
        expect(entry.category).toBe("self_eval");
    });
});
|
|
2380
|
-
describe("Workflow chains: agent_eval and contract_compliance", () => {
    it("should have agent_eval chain with 9 steps", () => {
        const chain = WORKFLOW_CHAINS.agent_eval;
        expect(chain).toBeDefined();
        expect(chain.steps.length).toBe(9);
        // The chain must lead with the compliance check.
        expect(chain.steps[0].tool).toBe("check_contract_compliance");
    });
    it("should have contract_compliance chain with 5 steps", () => {
        const chain = WORKFLOW_CHAINS.contract_compliance;
        expect(chain).toBeDefined();
        expect(chain.steps.length).toBe(5);
        // The compliance check is the second step of this chain.
        expect(chain.steps[1].tool).toBe("check_contract_compliance");
    });
});
|
|
2392
|
-
// ── Controlled Evaluation Tool Tests ────────────────────────────────────
describe("create_task_bank", () => {
    it("should create a new task in the bank", async () => {
        const createTask = findTool("create_task_bank");
        const result = await createTask.handler({
            taskId: `test-task-${Date.now()}`,
            title: "Fix login redirect",
            category: "bugfix",
            difficulty: "easy",
            prompt: "Fix the login page redirect loop",
            successCriteria: ["tests pass", "no redirect loop"],
            forbiddenBehaviors: ["hardcode URL"],
            timeBudgetMinutes: 15,
        });
        expect(result.action).toBe("created");
        expect(result.totalTasksInBank).toBeGreaterThanOrEqual(1);
        expect(result.successCriteriaCount).toBe(2);
        expect(result.forbiddenBehaviorCount).toBe(1);
    });
    it("should update an existing task", async () => {
        const taskId = `test-task-update-${Date.now()}`;
        const createTask = findTool("create_task_bank");
        // Seed the bank with the original version of the task.
        await createTask.handler({
            taskId,
            title: "Original title",
            category: "bugfix",
            difficulty: "easy",
            prompt: "Original prompt",
            successCriteria: ["tests pass"],
        });
        // Re-submitting the same taskId should update rather than create.
        const result = await createTask.handler({
            taskId,
            title: "Updated title",
            category: "refactor",
            difficulty: "medium",
            prompt: "Updated prompt",
            successCriteria: ["tests pass", "lint clean"],
        });
        expect(result.action).toBe("updated");
    });
});
|
|
2433
|
-
describe("grade_agent_run", () => {
    it("should grade a bare run with outcome-only scoring", async () => {
        const grader = findTool("grade_agent_run");
        const result = await grader.handler({
            taskId: `grade-test-bare-${Date.now()}`,
            condition: "bare",
            outcomeResults: [
                { criterion: "tests pass", passed: true },
                { criterion: "lint clean", passed: true },
                { criterion: "feature works", passed: true },
            ],
        });
        expect(result.grade).toBeDefined();
        expect(result.scores.outcome.score).toBe(50); // 40 criteria + 10 budget
        expect(result.scores.process.score).toBe(25); // No session = half credit
        expect(result.scores.combined.score).toBe(75);
    });
    it("should apply forbidden behavior penalties", async () => {
        const grader = findTool("grade_agent_run");
        const result = await grader.handler({
            taskId: `grade-test-penalty-${Date.now()}`,
            condition: "bare",
            outcomeResults: [
                { criterion: "tests pass", passed: true },
            ],
            forbiddenViolations: ["hardcoded API key", "skipped tests"],
        });
        expect(result.scores.outcome.breakdown.forbiddenPenalty).toBe(-10);
        expect(result.outcomeDetails.forbiddenViolations.length).toBe(2);
    });
    it("should produce ablation comparison when multiple conditions exist", async () => {
        const taskId = `grade-test-ablation-${Date.now()}`;
        const grader = findTool("grade_agent_run");
        // First run: the "bare" condition fails its single criterion.
        await grader.handler({
            taskId,
            condition: "bare",
            outcomeResults: [{ criterion: "tests pass", passed: false }],
        });
        // Second run: the "full" condition passes — comparison should favor it.
        const result = await grader.handler({
            taskId,
            condition: "full",
            outcomeResults: [{ criterion: "tests pass", passed: true }],
        });
        expect(result.ablationComparison).toBeDefined();
        expect(result.ablationComparison.length).toBe(2);
        const byCondition = (cond) => result.ablationComparison.find((c) => c.condition === cond);
        expect(byCondition("full").avgScore).toBeGreaterThan(byCondition("bare").avgScore);
    });
});
|
|
2483
|
-
describe("Registry: controlled evaluation tools", () => {
    // Look up a registry entry by tool name, assert it exists, and return
    // its quickRef methodology tag.
    const methodologyOf = (toolName) => {
        const entry = ALL_REGISTRY_ENTRIES.find((e) => e.name === toolName);
        expect(entry).toBeDefined();
        return entry.quickRef.methodology;
    };
    it("create_task_bank has quickRef with methodology controlled_evaluation", () => {
        expect(methodologyOf("create_task_bank")).toBe("controlled_evaluation");
    });
    it("grade_agent_run has quickRef with methodology controlled_evaluation", () => {
        expect(methodologyOf("grade_agent_run")).toBe("controlled_evaluation");
    });
});
|
|
2495
|
-
describe("Workflow chains: ablation_eval and task_bank_setup", () => {
    it("should have ablation_eval chain with 10 steps", () => {
        const chain = WORKFLOW_CHAINS.ablation_eval;
        expect(chain).toBeDefined();
        expect(chain.steps.length).toBe(10);
        // The ablation workflow starts by establishing the task bank.
        expect(chain.steps[0].tool).toBe("create_task_bank");
    });
    it("should have task_bank_setup chain with 9 steps", () => {
        const chain = WORKFLOW_CHAINS.task_bank_setup;
        expect(chain).toBeDefined();
        expect(chain.steps.length).toBe(9);
    });
});
|
|
2506
|
-
// ── Embedding search A/B: natural language queries where synonym map misses ──
// ═══════════════════════════════════════════════════════════════════════════
// CRITTER TOOL — intentionality check
// ═══════════════════════════════════════════════════════════════════════════
describe("Unit: critter_check", () => {
    const critterCheck = critterTools.find((candidate) => candidate.name === "critter_check");
    it("scores a well-intentioned task as proceed", async () => {
        const result = await critterCheck.handler({
            task: "Add embedding-based semantic search to discover_tools",
            why: "Natural language queries like 'keep track of what I learned' miss record_learning because lexical search can't bridge vocabulary gaps",
            who: "AI agents using the MCP server who think in natural language, not tool names",
            success_looks_like: "A/B eval shows 60% lexical → 85%+ hybrid hit rate with zero drops",
        });
        expect(result.score).toBeGreaterThanOrEqual(70);
        expect(result.verdict).toBe("proceed");
    });
    it("catches circular reasoning and vague audience", async () => {
        const result = await critterCheck.handler({
            task: "Add user authentication and login system to the application",
            why: "Because we need user authentication and login system in the application",
            who: "users",
        });
        // Circular (-30) + vague audience (-20) = 50, well under 70
        expect(result.score).toBeLessThan(70);
        const flagged = result.feedback.some((note) => {
            const lower = note.toLowerCase();
            return lower.includes("circular") || lower.includes("vague");
        });
        expect(flagged).toBe(true);
    });
    it("catches deference over understanding", async () => {
        const result = await critterCheck.handler({
            task: "Refactor the database layer",
            why: "I was told to refactor it in the ticket",
            who: "Backend developers maintaining the codebase",
        });
        const flagged = result.feedback.some((note) => {
            const lower = note.toLowerCase();
            return lower.includes("deference") || lower.includes("authority");
        });
        expect(flagged).toBe(true);
    });
    it("rewards specificity bonuses", async () => {
        const result = await critterCheck.handler({
            task: "Migrate from REST to GraphQL",
            why: "Our mobile app makes 12 API calls per screen load because REST endpoints return fixed shapes — GraphQL lets us fetch exactly what each screen needs in one round trip",
            who: "Mobile team (3 iOS + 2 Android devs) who spend 40% of sprint time on API pagination workarounds",
            success_looks_like: "Screen load API calls drop from 12 to 1-2, mobile team velocity increases by at least 20%",
            simplest_version: "Start with the 3 highest-traffic screens, keep REST endpoints alive for backwards compat",
        });
        expect(result.score).toBeGreaterThanOrEqual(90);
        expect(result.verdict).toBe("proceed");
    });
    it("persists the check to SQLite", async () => {
        const result = await critterCheck.handler({
            task: "Test persistence",
            why: "Verifying that critter checks are saved for accountability",
            who: "The test suite validating the critter tool",
        });
        expect(result.id).toBeDefined();
        expect(result.id).toMatch(/^crit_/);
    });
});
|
|
2561
|
-
// These tests verify that when a neural embedding provider IS available,
|
|
2562
|
-
// natural language queries that lexical search struggles with get boosted.
|
|
2563
|
-
// When no provider is available, they gracefully skip.
|
|
2564
|
-
import { _setIndexForTesting, _resetForTesting as resetEmbedding } from "../tools/embeddingProvider.js";
|
|
2565
|
-
import { _resetCooccurrenceCache, _setCooccurrenceForTesting, _setWrrfParamsForTesting, _resetWrrfParamsForTesting } from "../tools/toolRegistry.js";
|
|
2566
|
-
describe("Embedding search: RRF integration with hybridSearch", () => {
    // Build a full mock embedding index over toolDescs; `vectorFor` decides
    // each tool's vector so tests can place specific tools near the query.
    const buildToolIndex = (vectorFor) => toolDescs.map((t) => ({
        name: t.name,
        vector: vectorFor(t.name),
        nodeType: "tool",
    }));
    it("hybridSearch accepts embeddingQueryVec option without error", () => {
        // Even without an embedding index loaded, hybridSearch should not throw
        const results = hybridSearch("verify code", toolDescs, {
            mode: "hybrid",
            limit: 5,
            embeddingQueryVec: new Float32Array([0.5, 0.3, 0.1]),
        });
        // Should still return results from lexical strategies
        expect(results.length).toBeGreaterThan(0);
    });
    it("embedding mode without index has no embedding reasons", () => {
        resetEmbedding();
        const results = hybridSearch("keep track of lessons", toolDescs, {
            mode: "embedding",
            limit: 5,
            explain: true,
        });
        // Without an embedding index, no results should have embedding reasons
        for (const entry of results) {
            const hasEmbeddingReason = entry.matchReasons.some((reason) => reason.startsWith("embedding:"));
            expect(hasEmbeddingReason).toBe(false);
        }
    });
    it("embedding RRF adds score when index is loaded with mock vectors", () => {
        // Build a simple mock index: record_learning gets a vector close to the
        // query, everything else a distant one.
        _setIndexForTesting(buildToolIndex((name) => (name === "record_learning"
            ? new Float32Array([0.9, 0.1, 0.0])
            : new Float32Array([0.1, 0.1, 0.9]))));
        const queryVec = new Float32Array([1.0, 0.0, 0.0]);
        const results = hybridSearch("remember what I learned", toolDescs, {
            mode: "hybrid",
            limit: 10,
            explain: true,
            embeddingQueryVec: queryVec,
        });
        // record_learning should appear and have an embedding:tool_rrf reason
        const recordLearning = results.find((entry) => entry.name === "record_learning");
        expect(recordLearning).toBeDefined();
        expect(recordLearning.matchReasons.some((reason) => reason.startsWith("embedding:tool_rrf"))).toBe(true);
        // Clean up
        resetEmbedding();
    });
    it("embedding-only mode with mock index ranks by RRF", () => {
        // Set up mock where start_verification_cycle is closest to query
        _setIndexForTesting(buildToolIndex((name) => {
            if (name === "start_verification_cycle")
                return new Float32Array([0.95, 0.05, 0.0]);
            if (name === "run_quality_gate")
                return new Float32Array([0.7, 0.3, 0.0]);
            return new Float32Array([0.05, 0.05, 0.9]);
        }));
        const queryVec = new Float32Array([1.0, 0.0, 0.0]);
        const results = hybridSearch("ensure correctness", toolDescs, {
            mode: "embedding",
            limit: 5,
            explain: true,
            embeddingQueryVec: queryVec,
        });
        // In embedding-only mode, results should come from embedding RRF only
        expect(results.length).toBeGreaterThan(0);
        expect(results[0].name).toBe("start_verification_cycle");
        expect(results[0].matchReasons.some((reason) => reason.startsWith("embedding:tool_rrf"))).toBe(true);
        resetEmbedding();
    });
});
|
|
2640
|
-
// ── Agent-as-a-Graph: structural property tests ──────────────────────────
|
|
2641
|
-
// These tests verify the STRUCTURAL properties of the bipartite graph search:
|
|
2642
|
-
// 1. Domain-only proximity lifts siblings (upward traversal)
|
|
2643
|
-
// 2. Type-specific wRRF weight asymmetry (α_D=1.5 > α_T=1.0, per paper + ablation)
|
|
2644
|
-
// 3. Strong lexical matches survive noisy embeddings (non-regression)
|
|
2645
|
-
// 4. Execution trace edges boost co-occurring tools
|
|
2646
|
-
//
|
|
2647
|
-
// Unlike tautological tests that mock the "right answer" as close, these tests
|
|
2648
|
-
// prove the ALGORITHM works by testing its structural invariants.
|
|
2649
|
-
describe("Agent-as-a-Graph: bipartite wRRF structural properties", () => {
|
|
2650
|
-
// Helper: build a bipartite index where specific domains are close but NO tools are
|
|
2651
|
-
function buildDomainOnlyIndex(closeDomains) {
|
|
2652
|
-
const categories = new Set(ALL_REGISTRY_ENTRIES.map((e) => e.category));
|
|
2653
|
-
// ALL tool nodes are distant from query — no direct tool match
|
|
2654
|
-
const toolEntries = toolDescs.map((t) => ({
|
|
2655
|
-
name: t.name,
|
|
2656
|
-
vector: new Float32Array([0.1, 0.1, 0.8]),
|
|
2657
|
-
nodeType: "tool",
|
|
2658
|
-
}));
|
|
2659
|
-
// Only specified domains are close to query
|
|
2660
|
-
const domainEntries = [...categories].map((cat) => ({
|
|
2661
|
-
name: `domain:${cat}`,
|
|
2662
|
-
vector: closeDomains.has(cat)
|
|
2663
|
-
? new Float32Array([0.85, 0.15, 0.0])
|
|
2664
|
-
: new Float32Array([0.05, 0.05, 0.9]),
|
|
2665
|
-
nodeType: "domain",
|
|
2666
|
-
}));
|
|
2667
|
-
return [...toolEntries, ...domainEntries];
|
|
2668
|
-
}
|
|
2669
|
-
afterEach(() => {
|
|
2670
|
-
resetEmbedding();
|
|
2671
|
-
_resetCooccurrenceCache();
|
|
2672
|
-
});
|
|
2673
|
-
it("domain-only embedding proximity causes measurable rank lift for sibling tools", () => {
|
|
2674
|
-
// Prove CAUSATION, not just presence: compare ranks WITH vs WITHOUT domain proximity.
|
|
2675
|
-
// Use a query that gives moderate lexical scores to research_writing tools,
|
|
2676
|
-
// then show domain_rrf lifts them higher.
|
|
2677
|
-
const query = "polish text for submission";
|
|
2678
|
-
// Step 1: Baseline — lexical only (no embeddings)
|
|
2679
|
-
resetEmbedding();
|
|
2680
|
-
const baseline = hybridSearch(query, toolDescs, {
|
|
2681
|
-
mode: "hybrid",
|
|
2682
|
-
limit: 30,
|
|
2683
|
-
explain: true,
|
|
2684
|
-
});
|
|
2685
|
-
// Find a research_writing tool in baseline and record its rank
|
|
2686
|
-
const rwToolBaseline = baseline.findIndex((r) => r.category === "research_writing");
|
|
2687
|
-
// It should exist somewhere (polish/text/submission have some keyword overlap)
|
|
2688
|
-
expect(rwToolBaseline).toBeGreaterThanOrEqual(0);
|
|
2689
|
-
const rwToolName = baseline[rwToolBaseline].name;
|
|
2690
|
-
const rwBaselineScore = baseline[rwToolBaseline].score;
|
|
2691
|
-
// Step 2: With domain-only embeddings (research_writing domain close, NO tools close)
|
|
2692
|
-
const mockIndex = buildDomainOnlyIndex(new Set(["research_writing"]));
|
|
2693
|
-
_setIndexForTesting(mockIndex);
|
|
2694
|
-
const queryVec = new Float32Array([1.0, 0.0, 0.0]);
|
|
2695
|
-
const enhanced = hybridSearch(query, toolDescs, {
|
|
2696
|
-
mode: "hybrid",
|
|
2697
|
-
limit: 30,
|
|
2698
|
-
explain: true,
|
|
2699
|
-
embeddingQueryVec: queryVec,
|
|
2700
|
-
});
|
|
2701
|
-
const rwToolEnhanced = enhanced.find((r) => r.name === rwToolName);
|
|
2702
|
-
expect(rwToolEnhanced).toBeDefined();
|
|
2703
|
-
// CAUSATION: score increased due to domain_rrf
|
|
2704
|
-
expect(rwToolEnhanced.score).toBeGreaterThan(rwBaselineScore);
|
|
2705
|
-
expect(rwToolEnhanced.matchReasons.some((r) => r.includes("domain_rrf"))).toBe(true);
|
|
2706
|
-
// No tool_rrf (all tools are equally distant)
|
|
2707
|
-
expect(rwToolEnhanced.matchReasons.some((r) => r.includes("tool_rrf"))).toBe(false);
|
|
2708
|
-
// Rank should improve (lower index = higher rank)
|
|
2709
|
-
const rwEnhancedIdx = enhanced.findIndex((r) => r.name === rwToolName);
|
|
2710
|
-
expect(rwEnhancedIdx).toBeLessThanOrEqual(rwToolBaseline);
|
|
2711
|
-
});
|
|
2712
|
-
it("multiple close domains each lift their own sibling tools independently", () => {
|
|
2713
|
-
// Setup: security AND vision domains close, but no tools close
|
|
2714
|
-
const mockIndex = buildDomainOnlyIndex(new Set(["security", "vision"]));
|
|
2715
|
-
_setIndexForTesting(mockIndex);
|
|
2716
|
-
const queryVec = new Float32Array([1.0, 0.0, 0.0]);
|
|
2717
|
-
const results = hybridSearch("analyze security visual", toolDescs, {
|
|
2718
|
-
mode: "embedding",
|
|
2719
|
-
limit: 30,
|
|
2720
|
-
explain: true,
|
|
2721
|
-
embeddingQueryVec: queryVec,
|
|
2722
|
-
});
|
|
2723
|
-
const securityTools = results.filter((r) => r.category === "security" && r.matchReasons.some((mr) => mr.includes("domain_rrf(security")));
|
|
2724
|
-
const visionTools = results.filter((r) => r.category === "vision" && r.matchReasons.some((mr) => mr.includes("domain_rrf(vision")));
|
|
2725
|
-
// Both categories should have siblings lifted
|
|
2726
|
-
expect(securityTools.length).toBeGreaterThanOrEqual(1);
|
|
2727
|
-
expect(visionTools.length).toBeGreaterThanOrEqual(1);
|
|
2728
|
-
});
|
|
2729
|
-
it("type-specific wRRF: domain_rrf score exceeds tool_rrf (paper calibration α_D=1.5 > α_T=1.0)", () => {
|
|
2730
|
-
// After ablation (see "wRRF α ratio ablation" test), paper's domain emphasis wins.
|
|
2731
|
-
// At rank 1: α_D * 1000/(K+1) = 1.5 * 1000/61 ≈ 25, α_T * 1000/(K+1) = 1.0 * 1000/61 ≈ 16.
|
|
2732
|
-
// Domain emphasis means category-level matches contribute MORE than individual tool matches,
|
|
2733
|
-
// which helps surface all tools in a matching domain (upward traversal).
|
|
2734
|
-
const categories = new Set(ALL_REGISTRY_ENTRIES.map((e) => e.category));
|
|
2735
|
-
const targetTool = "polish_academic_text";
|
|
2736
|
-
const toolEntries = toolDescs.map((t) => ({
|
|
2737
|
-
name: t.name,
|
|
2738
|
-
vector: t.name === targetTool
|
|
2739
|
-
? new Float32Array([0.95, 0.05, 0.0])
|
|
2740
|
-
: new Float32Array([0.1, 0.1, 0.8]),
|
|
2741
|
-
nodeType: "tool",
|
|
2742
|
-
}));
|
|
2743
|
-
const domainEntries = [...categories].map((cat) => ({
|
|
2744
|
-
name: `domain:${cat}`,
|
|
2745
|
-
vector: cat === "research_writing"
|
|
2746
|
-
? new Float32Array([0.90, 0.10, 0.0])
|
|
2747
|
-
: new Float32Array([0.05, 0.05, 0.9]),
|
|
2748
|
-
nodeType: "domain",
|
|
2749
|
-
}));
|
|
2750
|
-
_setIndexForTesting([...toolEntries, ...domainEntries]);
|
|
2751
|
-
const queryVec = new Float32Array([1.0, 0.0, 0.0]);
|
|
2752
|
-
const results = hybridSearch("academic writing", toolDescs, {
|
|
2753
|
-
mode: "embedding",
|
|
2754
|
-
limit: 20,
|
|
2755
|
-
explain: true,
|
|
2756
|
-
embeddingQueryVec: queryVec,
|
|
2757
|
-
});
|
|
2758
|
-
const target = results.find((r) => r.name === targetTool);
|
|
2759
|
-
expect(target).toBeDefined();
|
|
2760
|
-
// Extract individual RRF scores from matchReasons
|
|
2761
|
-
const toolRrfReason = target.matchReasons.find((r) => r.startsWith("embedding:tool_rrf"));
|
|
2762
|
-
const domainRrfReason = target.matchReasons.find((r) => r.startsWith("embedding:domain_rrf"));
|
|
2763
|
-
expect(toolRrfReason).toBeDefined();
|
|
2764
|
-
expect(domainRrfReason).toBeDefined();
|
|
2765
|
-
const toolScore = parseInt(toolRrfReason.match(/\+(\d+)/)?.[1] ?? "0");
|
|
2766
|
-
const domainScore = parseInt(domainRrfReason.match(/\+(\d+)/)?.[1] ?? "0");
|
|
2767
|
-
// α_D=1.5 > α_T=1.0 → domain_rrf contributes more than tool_rrf at similar ranks
|
|
2768
|
-
expect(domainScore).toBeGreaterThan(toolScore);
|
|
2769
|
-
});
|
|
2770
|
-
it("strong lexical matches are not displaced by noisy embeddings", () => {
|
|
2771
|
-
// "start verification cycle" should easily find start_verification_cycle lexically.
|
|
2772
|
-
// Adding uniformly noisy embeddings should NOT knock it from #1.
|
|
2773
|
-
resetEmbedding();
|
|
2774
|
-
const lexicalResults = hybridSearch("start verification cycle", toolDescs, {
|
|
2775
|
-
mode: "hybrid",
|
|
2776
|
-
limit: 5,
|
|
2777
|
-
});
|
|
2778
|
-
expect(lexicalResults[0].name).toBe("start_verification_cycle");
|
|
2779
|
-
// Add noisy embeddings — all vectors point roughly the same direction
|
|
2780
|
-
const categories = new Set(ALL_REGISTRY_ENTRIES.map((e) => e.category));
|
|
2781
|
-
const toolEntries = toolDescs.map((t, i) => ({
|
|
2782
|
-
name: t.name,
|
|
2783
|
-
vector: new Float32Array([0.2 + (i % 10) * 0.01, 0.3, 0.7]),
|
|
2784
|
-
nodeType: "tool",
|
|
2785
|
-
}));
|
|
2786
|
-
const domainEntries = [...categories].map((cat, i) => ({
|
|
2787
|
-
name: `domain:${cat}`,
|
|
2788
|
-
vector: new Float32Array([0.15 + i * 0.02, 0.25, 0.7]),
|
|
2789
|
-
nodeType: "domain",
|
|
2790
|
-
}));
|
|
2791
|
-
_setIndexForTesting([...toolEntries, ...domainEntries]);
|
|
2792
|
-
const queryVec = new Float32Array([1.0, 0.0, 0.0]);
|
|
2793
|
-
const graphResults = hybridSearch("start verification cycle", toolDescs, {
|
|
2794
|
-
mode: "hybrid",
|
|
2795
|
-
limit: 5,
|
|
2796
|
-
embeddingQueryVec: queryVec,
|
|
2797
|
-
});
|
|
2798
|
-
// Lexical dominance should preserve #1 position
|
|
2799
|
-
expect(graphResults[0].name).toBe("start_verification_cycle");
|
|
2800
|
-
});
|
|
2801
|
-
});
|
|
2802
|
-
// ── Agent-as-a-Graph: execution trace edge tests ──────────────────────────
|
|
2803
|
-
// Validates that co-occurrence edges mined from tool_call_log boost results.
|
|
2804
|
-
// Uses _setCooccurrenceForTesting to inject deterministic edges.
|
|
2805
|
-
//
|
|
2806
|
-
// Key insight: trace edges only boost tools that ALREADY scored > 0 from
|
|
2807
|
-
// lexical matching. They lift borderline tools, not create results from nothing.
|
|
2808
|
-
// Tests use a data-driven approach: run baseline first, then inject edges
|
|
2809
|
-
// targeting actual result entries.
|
|
2810
|
-
describe("Agent-as-a-Graph: execution trace edges", () => {
|
|
2811
|
-
const TRACE_QUERY = "verify test quality";
|
|
2812
|
-
afterEach(() => {
|
|
2813
|
-
resetEmbedding();
|
|
2814
|
-
_resetCooccurrenceCache();
|
|
2815
|
-
});
|
|
2816
|
-
it("co-occurrence edges boost a non-top-5 tool by exactly +4", () => {
|
|
2817
|
-
// Step 1: Get natural ranking without trace edges
|
|
2818
|
-
_setCooccurrenceForTesting(new Map());
|
|
2819
|
-
const baseline = hybridSearch(TRACE_QUERY, toolDescs, {
|
|
2820
|
-
mode: "hybrid",
|
|
2821
|
-
limit: 15,
|
|
2822
|
-
explain: true,
|
|
2823
|
-
});
|
|
2824
|
-
expect(baseline.length).toBeGreaterThanOrEqual(6);
|
|
2825
|
-
const topTool = baseline[0].name;
|
|
2826
|
-
const boostTarget = baseline[5].name; // position 6 — NOT in top 5
|
|
2827
|
-
const baselineScore = baseline[5].score;
|
|
2828
|
-
// Step 2: Inject trace edge from top tool → boost target
|
|
2829
|
-
_resetCooccurrenceCache();
|
|
2830
|
-
const edges = new Map();
|
|
2831
|
-
edges.set(topTool, [boostTarget]);
|
|
2832
|
-
_setCooccurrenceForTesting(edges);
|
|
2833
|
-
const boosted = hybridSearch(TRACE_QUERY, toolDescs, {
|
|
2834
|
-
mode: "hybrid",
|
|
2835
|
-
limit: 15,
|
|
2836
|
-
explain: true,
|
|
2837
|
-
});
|
|
2838
|
-
const result = boosted.find((r) => r.name === boostTarget);
|
|
2839
|
-
expect(result).toBeDefined();
|
|
2840
|
-
expect(result.score).toBe(baselineScore + 4);
|
|
2841
|
-
expect(result.matchReasons.some((r) => r === "trace_edge:+4")).toBe(true);
|
|
2842
|
-
});
|
|
2843
|
-
it("top-5 tools do NOT receive trace edge self-boost", () => {
|
|
2844
|
-
// Get natural ranking
|
|
2845
|
-
_setCooccurrenceForTesting(new Map());
|
|
2846
|
-
const baseline = hybridSearch(TRACE_QUERY, toolDescs, {
|
|
2847
|
-
mode: "hybrid",
|
|
2848
|
-
limit: 15,
|
|
2849
|
-
explain: true,
|
|
2850
|
-
});
|
|
2851
|
-
const topTool = baseline[0].name;
|
|
2852
|
-
const topScore = baseline[0].score;
|
|
2853
|
-
const secondTool = baseline[1].name;
|
|
2854
|
-
// Set edge FROM secondTool TO topTool — topTool is already top-5
|
|
2855
|
-
_resetCooccurrenceCache();
|
|
2856
|
-
const edges = new Map();
|
|
2857
|
-
edges.set(secondTool, [topTool]);
|
|
2858
|
-
_setCooccurrenceForTesting(edges);
|
|
2859
|
-
const results = hybridSearch(TRACE_QUERY, toolDescs, {
|
|
2860
|
-
mode: "hybrid",
|
|
2861
|
-
limit: 15,
|
|
2862
|
-
explain: true,
|
|
2863
|
-
});
|
|
2864
|
-
const top = results.find((r) => r.name === topTool);
|
|
2865
|
-
expect(top).toBeDefined();
|
|
2866
|
-
// Score should NOT increase — top-5 tools are excluded from trace boost
|
|
2867
|
-
expect(top.score).toBe(topScore);
|
|
2868
|
-
expect(top.matchReasons.some((r) => r === "trace_edge:+4")).toBe(false);
|
|
2869
|
-
});
|
|
2870
|
-
it("empty co-occurrence map produces no trace_edge boosts", () => {
|
|
2871
|
-
_setCooccurrenceForTesting(new Map());
|
|
2872
|
-
const results = hybridSearch(TRACE_QUERY, toolDescs, {
|
|
2873
|
-
mode: "hybrid",
|
|
2874
|
-
limit: 15,
|
|
2875
|
-
explain: true,
|
|
2876
|
-
});
|
|
2877
|
-
for (const r of results) {
|
|
2878
|
-
expect(r.matchReasons.some((mr) => mr.includes("trace_edge"))).toBe(false);
|
|
2879
|
-
}
|
|
2880
|
-
});
|
|
2881
|
-
it("trace edges from multiple top tools merge — both targets get +4", () => {
|
|
2882
|
-
// Get natural ranking
|
|
2883
|
-
_setCooccurrenceForTesting(new Map());
|
|
2884
|
-
const baseline = hybridSearch(TRACE_QUERY, toolDescs, {
|
|
2885
|
-
mode: "hybrid",
|
|
2886
|
-
limit: 15,
|
|
2887
|
-
explain: true,
|
|
2888
|
-
});
|
|
2889
|
-
expect(baseline.length).toBeGreaterThanOrEqual(8);
|
|
2890
|
-
const topTool1 = baseline[0].name;
|
|
2891
|
-
const topTool2 = baseline[1].name;
|
|
2892
|
-
const target1 = baseline[6].name;
|
|
2893
|
-
const target2 = baseline[7].name;
|
|
2894
|
-
const target1BaseScore = baseline[6].score;
|
|
2895
|
-
const target2BaseScore = baseline[7].score;
|
|
2896
|
-
// Two top tools each point to a different target
|
|
2897
|
-
_resetCooccurrenceCache();
|
|
2898
|
-
const edges = new Map();
|
|
2899
|
-
edges.set(topTool1, [target1]);
|
|
2900
|
-
edges.set(topTool2, [target2]);
|
|
2901
|
-
_setCooccurrenceForTesting(edges);
|
|
2902
|
-
const results = hybridSearch(TRACE_QUERY, toolDescs, {
|
|
2903
|
-
mode: "hybrid",
|
|
2904
|
-
limit: 15,
|
|
2905
|
-
explain: true,
|
|
2906
|
-
});
|
|
2907
|
-
const boosted1 = results.find((r) => r.name === target1);
|
|
2908
|
-
const boosted2 = results.find((r) => r.name === target2);
|
|
2909
|
-
expect(boosted1).toBeDefined();
|
|
2910
|
-
expect(boosted2).toBeDefined();
|
|
2911
|
-
expect(boosted1.score).toBe(target1BaseScore + 4);
|
|
2912
|
-
expect(boosted2.score).toBe(target2BaseScore + 4);
|
|
2913
|
-
expect(boosted1.matchReasons.some((r) => r === "trace_edge:+4")).toBe(true);
|
|
2914
|
-
expect(boosted2.matchReasons.some((r) => r === "trace_edge:+4")).toBe(true);
|
|
2915
|
-
});
|
|
2916
|
-
});
|
|
2917
|
-
// ── Industry-Standard IR Metrics: Recall@K, mAP@K, NDCG@K ──────────────
|
|
2918
|
-
// Every tool retrieval paper (ToolBench, AnyTool, Agent-as-a-Graph, TOOLRET)
|
|
2919
|
-
// reports these metrics. We evaluate hybrid search against 15 intent-based
|
|
2920
|
-
// queries with ground-truth relevant tool sets.
|
|
2921
|
-
//
|
|
2922
|
-
// Standards compared against:
|
|
2923
|
-
// - Agent-as-a-Graph (arxiv:2511.18194): Recall@5=0.85, NDCG@5=0.47
|
|
2924
|
-
// - TOOLRET (ACL 2025): best NDCG@10=33.83 (bi-encoder only)
|
|
2925
|
-
// - ToolBench: NDCG@5=84.9 (contrastive-trained Sentence-BERT)
|
|
2926
|
-
//
|
|
2927
|
-
// Our system is different (single MCP server, 163 tools, 14-strategy ensemble)
|
|
2928
|
-
// so absolute numbers aren't comparable, but we should track and not regress.
|
|
2929
|
-
describe("Industry-standard IR metrics: Recall@K, mAP@K, NDCG@K", () => {
|
|
2930
|
-
// Ground truth: query → set of relevant tools (any order).
|
|
2931
|
-
// Each query has 3-6 relevant tools, reflecting realistic intent breadth.
|
|
2932
|
-
const EVAL_QUERIES = [
|
|
2933
|
-
{ query: "verify my implementation is correct", relevant: ["start_verification_cycle", "get_verification_status", "log_test_result", "run_quality_gate", "triple_verify"] },
|
|
2934
|
-
{ query: "search past findings and lessons", relevant: ["search_all_knowledge", "record_learning", "load_session_notes"] },
|
|
2935
|
-
{ query: "run security audit on codebase", relevant: ["scan_dependencies", "run_code_analysis", "scan_terminal_security", "assess_risk"] },
|
|
2936
|
-
{ query: "write and polish academic paper", relevant: ["polish_academic_text", "check_paper_logic", "generate_academic_caption", "review_paper_as_reviewer"] },
|
|
2937
|
-
{ query: "coordinate parallel agent tasks", relevant: ["claim_agent_task", "get_parallel_status", "assign_agent_role", "bootstrap_parallel_agents", "release_agent_task"] },
|
|
2938
|
-
{ query: "check website performance and SEO", relevant: ["seo_audit_url", "check_page_performance", "analyze_seo_content"] },
|
|
2939
|
-
{ query: "save and recall context between sessions", relevant: ["save_session_note", "load_session_notes", "refresh_task_context"] },
|
|
2940
|
-
{ query: "review git compliance before merge", relevant: ["check_git_compliance", "enforce_merge_gate", "review_pr_checklist"] },
|
|
2941
|
-
{ query: "benchmark model autonomy", relevant: ["start_autonomy_benchmark", "complete_autonomy_benchmark", "log_benchmark_milestone"] },
|
|
2942
|
-
{ query: "capture screenshot of UI state", relevant: ["capture_screenshot", "capture_full_page", "compare_screenshots"] },
|
|
2943
|
-
{ query: "encode data in compact token format", relevant: ["toon_encode", "toon_decode"] },
|
|
2944
|
-
{ query: "mine patterns from past sessions", relevant: ["mine_session_patterns", "predict_risks_from_patterns"] },
|
|
2945
|
-
{ query: "detect video flicker artifacts", relevant: ["analyze_video_flicker", "compare_video_segments", "get_flicker_report"] },
|
|
2946
|
-
{ query: "design voice interaction pipeline", relevant: ["design_voice_pipeline", "analyze_voice_config", "generate_voice_scaffold", "benchmark_voice_latency"] },
|
|
2947
|
-
{ query: "check if this task is worth doing", relevant: ["critter_check"] },
|
|
2948
|
-
];
|
|
2949
|
-
function recallAtK(ranked, relevant, k) {
|
|
2950
|
-
const topK = ranked.slice(0, k);
|
|
2951
|
-
const found = topK.filter((name) => relevant.has(name)).length;
|
|
2952
|
-
return found / relevant.size;
|
|
2953
|
-
}
|
|
2954
|
-
function averagePrecisionAtK(ranked, relevant, k) {
|
|
2955
|
-
let hits = 0;
|
|
2956
|
-
let sumPrecision = 0;
|
|
2957
|
-
for (let i = 0; i < Math.min(k, ranked.length); i++) {
|
|
2958
|
-
if (relevant.has(ranked[i])) {
|
|
2959
|
-
hits++;
|
|
2960
|
-
sumPrecision += hits / (i + 1);
|
|
2961
|
-
}
|
|
2962
|
-
}
|
|
2963
|
-
return relevant.size === 0 ? 0 : sumPrecision / relevant.size;
|
|
2964
|
-
}
|
|
2965
|
-
function ndcgAtK(ranked, relevant, k) {
|
|
2966
|
-
// Binary relevance: 1 if relevant, 0 otherwise
|
|
2967
|
-
let dcg = 0;
|
|
2968
|
-
for (let i = 0; i < Math.min(k, ranked.length); i++) {
|
|
2969
|
-
if (relevant.has(ranked[i]))
|
|
2970
|
-
dcg += 1 / Math.log2(i + 2);
|
|
2971
|
-
}
|
|
2972
|
-
let idcg = 0;
|
|
2973
|
-
const idealCount = Math.min(k, relevant.size);
|
|
2974
|
-
for (let i = 0; i < idealCount; i++) {
|
|
2975
|
-
idcg += 1 / Math.log2(i + 2);
|
|
2976
|
-
}
|
|
2977
|
-
return idcg === 0 ? 0 : dcg / idcg;
|
|
2978
|
-
}
|
|
2979
|
-
function evaluateConfig(configLabel, searchFn) {
|
|
2980
|
-
let totalRecall1 = 0, totalRecall3 = 0, totalRecall5 = 0, totalMap5 = 0, totalNdcg5 = 0;
|
|
2981
|
-
for (const { query, relevant } of EVAL_QUERIES) {
|
|
2982
|
-
const relevantSet = new Set(relevant);
|
|
2983
|
-
const ranked = searchFn(query);
|
|
2984
|
-
totalRecall1 += recallAtK(ranked, relevantSet, 1);
|
|
2985
|
-
totalRecall3 += recallAtK(ranked, relevantSet, 3);
|
|
2986
|
-
totalRecall5 += recallAtK(ranked, relevantSet, 5);
|
|
2987
|
-
totalMap5 += averagePrecisionAtK(ranked, relevantSet, 5);
|
|
2988
|
-
totalNdcg5 += ndcgAtK(ranked, relevantSet, 5);
|
|
2989
|
-
}
|
|
2990
|
-
const n = EVAL_QUERIES.length;
|
|
2991
|
-
return {
|
|
2992
|
-
recall1: totalRecall1 / n,
|
|
2993
|
-
recall3: totalRecall3 / n,
|
|
2994
|
-
recall5: totalRecall5 / n,
|
|
2995
|
-
map5: totalMap5 / n,
|
|
2996
|
-
ndcg5: totalNdcg5 / n,
|
|
2997
|
-
};
|
|
2998
|
-
}
|
|
2999
|
-
afterEach(() => {
|
|
3000
|
-
resetEmbedding();
|
|
3001
|
-
_resetCooccurrenceCache();
|
|
3002
|
-
_resetWrrfParamsForTesting();
|
|
3003
|
-
});
|
|
3004
|
-
it("hybrid search (lexical only) meets minimum IR thresholds", () => {
|
|
3005
|
-
// Baseline: no embeddings, pure lexical ensemble (keyword + fuzzy + n-gram + semantic + dense)
|
|
3006
|
-
resetEmbedding();
|
|
3007
|
-
const metrics = evaluateConfig("lexical-only", (query) => {
|
|
3008
|
-
const results = hybridSearch(query, toolDescs, { mode: "hybrid", limit: 10 });
|
|
3009
|
-
return results.map((r) => r.name);
|
|
3010
|
-
});
|
|
3011
|
-
// Minimum thresholds for our 14-strategy lexical ensemble
|
|
3012
|
-
// These are regression guards — if we drop below, something broke.
|
|
3013
|
-
expect(metrics.recall5).toBeGreaterThanOrEqual(0.55);
|
|
3014
|
-
expect(metrics.map5).toBeGreaterThanOrEqual(0.40);
|
|
3015
|
-
expect(metrics.ndcg5).toBeGreaterThanOrEqual(0.50);
|
|
3016
|
-
});
|
|
3017
|
-
it("hybrid + embedding search improves over lexical-only baseline", () => {
|
|
3018
|
-
// Build a realistic mock index: tools close to their own category
|
|
3019
|
-
const categories = new Set(ALL_REGISTRY_ENTRIES.map((e) => e.category));
|
|
3020
|
-
const catList = [...categories];
|
|
3021
|
-
// Each category gets a unique direction in a high-dim space
|
|
3022
|
-
const mockIndex = toolDescs.map((t) => {
|
|
3023
|
-
const entry = TOOL_REGISTRY.get(t.name);
|
|
3024
|
-
const catIdx = catList.indexOf(entry?.category ?? "");
|
|
3025
|
-
// Tools in same category share a similar vector direction
|
|
3026
|
-
const vec = new Float32Array(catList.length + 1);
|
|
3027
|
-
if (catIdx >= 0)
|
|
3028
|
-
vec[catIdx] = 0.8;
|
|
3029
|
-
vec[catList.length] = 0.2; // small shared component
|
|
3030
|
-
// Normalize
|
|
3031
|
-
let norm = 0;
|
|
3032
|
-
for (let i = 0; i < vec.length; i++)
|
|
3033
|
-
norm += vec[i] * vec[i];
|
|
3034
|
-
norm = Math.sqrt(norm);
|
|
3035
|
-
for (let i = 0; i < vec.length; i++)
|
|
3036
|
-
vec[i] /= norm;
|
|
3037
|
-
return { name: t.name, vector: vec, nodeType: "tool" };
|
|
3038
|
-
});
|
|
3039
|
-
const domainIndex = catList.map((cat, catIdx) => {
|
|
3040
|
-
const vec = new Float32Array(catList.length + 1);
|
|
3041
|
-
vec[catIdx] = 0.9;
|
|
3042
|
-
vec[catList.length] = 0.1;
|
|
3043
|
-
let norm = 0;
|
|
3044
|
-
for (let i = 0; i < vec.length; i++)
|
|
3045
|
-
norm += vec[i] * vec[i];
|
|
3046
|
-
norm = Math.sqrt(norm);
|
|
3047
|
-
for (let i = 0; i < vec.length; i++)
|
|
3048
|
-
vec[i] /= norm;
|
|
3049
|
-
return { name: `domain:${cat}`, vector: vec, nodeType: "domain" };
|
|
3050
|
-
});
|
|
3051
|
-
_setIndexForTesting([...mockIndex, ...domainIndex]);
|
|
3052
|
-
// Lexical baseline
|
|
3053
|
-
resetEmbedding();
|
|
3054
|
-
const lexicalMetrics = evaluateConfig("lexical", (query) => {
|
|
3055
|
-
const results = hybridSearch(query, toolDescs, { mode: "hybrid", limit: 10 });
|
|
3056
|
-
return results.map((r) => r.name);
|
|
3057
|
-
});
|
|
3058
|
-
// Hybrid + embedding
|
|
3059
|
-
_setIndexForTesting([...mockIndex, ...domainIndex]);
|
|
3060
|
-
const embeddingMetrics = evaluateConfig("hybrid+embedding", (query) => {
|
|
3061
|
-
// Simulate query embedding: average of relevant category vectors
|
|
3062
|
-
const queryWords = query.toLowerCase().split(/\s+/);
|
|
3063
|
-
const queryVec = new Float32Array(catList.length + 1);
|
|
3064
|
-
for (const cat of catList) {
|
|
3065
|
-
if (queryWords.some((w) => cat.includes(w) || w.includes(cat.slice(0, 4)))) {
|
|
3066
|
-
queryVec[catList.indexOf(cat)] = 0.7;
|
|
3067
|
-
}
|
|
3068
|
-
}
|
|
3069
|
-
queryVec[catList.length] = 0.3;
|
|
3070
|
-
let norm = 0;
|
|
3071
|
-
for (let i = 0; i < queryVec.length; i++)
|
|
3072
|
-
norm += queryVec[i] * queryVec[i];
|
|
3073
|
-
norm = Math.sqrt(norm) || 1;
|
|
3074
|
-
for (let i = 0; i < queryVec.length; i++)
|
|
3075
|
-
queryVec[i] /= norm;
|
|
3076
|
-
const results = hybridSearch(query, toolDescs, {
|
|
3077
|
-
mode: "hybrid",
|
|
3078
|
-
limit: 10,
|
|
3079
|
-
embeddingQueryVec: queryVec,
|
|
3080
|
-
});
|
|
3081
|
-
return results.map((r) => r.name);
|
|
3082
|
-
});
|
|
3083
|
-
// Embedding should not degrade any metric (non-regression)
|
|
3084
|
-
expect(embeddingMetrics.ndcg5).toBeGreaterThanOrEqual(lexicalMetrics.ndcg5 - 0.02);
|
|
3085
|
-
});
|
|
3086
|
-
});
|
|
3087
|
-
// ── wRRF α ratio ablation: paper vs our calibration ──────────────────────
|
|
3088
|
-
// Agent-as-a-Graph (arxiv:2511.18194) optimal: α_A=1.5, α_T=1.0, K=60
|
|
3089
|
-
// Our calibration: α_T=1.0, α_D=0.6, K=20
|
|
3090
|
-
//
|
|
3091
|
-
// The paper optimizes for agent SELECTION across 70 MCP servers.
|
|
3092
|
-
// We optimize for tool RETRIEVAL within a single server.
|
|
3093
|
-
// This ablation verifies our deviation is justified by measuring Recall@5.
|
|
3094
|
-
describe("wRRF α ratio ablation: paper vs NodeBench calibration", () => {
|
|
3095
|
-
const ABLATION_QUERIES = [
|
|
3096
|
-
{ query: "verify my implementation", relevant: ["start_verification_cycle", "get_verification_status", "log_test_result"] },
|
|
3097
|
-
{ query: "search past findings", relevant: ["search_all_knowledge", "record_learning", "load_session_notes"] },
|
|
3098
|
-
{ query: "run security checks", relevant: ["scan_dependencies", "run_code_analysis", "scan_terminal_security"] },
|
|
3099
|
-
{ query: "coordinate parallel work", relevant: ["claim_agent_task", "get_parallel_status", "assign_agent_role"] },
|
|
3100
|
-
{ query: "capture UI screenshots", relevant: ["capture_screenshot", "capture_full_page", "compare_screenshots"] },
|
|
3101
|
-
{ query: "review git compliance", relevant: ["check_git_compliance", "enforce_merge_gate", "review_pr_checklist"] },
|
|
3102
|
-
{ query: "write academic paper", relevant: ["polish_academic_text", "check_paper_logic", "generate_academic_caption"] },
|
|
3103
|
-
{ query: "check website performance", relevant: ["seo_audit_url", "check_page_performance", "analyze_seo_content"] },
|
|
3104
|
-
];
|
|
3105
|
-
function buildCategoryAwareIndex() {
|
|
3106
|
-
const categories = new Set(ALL_REGISTRY_ENTRIES.map((e) => e.category));
|
|
3107
|
-
const catList = [...categories];
|
|
3108
|
-
const toolEntries = toolDescs.map((t) => {
|
|
3109
|
-
const entry = TOOL_REGISTRY.get(t.name);
|
|
3110
|
-
const catIdx = catList.indexOf(entry?.category ?? "");
|
|
3111
|
-
const vec = new Float32Array(catList.length);
|
|
3112
|
-
if (catIdx >= 0)
|
|
3113
|
-
vec[catIdx] = 0.85;
|
|
3114
|
-
// Add small noise per tool so not all tools in same cat have identical vectors
|
|
3115
|
-
const nameHash = t.name.split("").reduce((h, c) => ((h << 5) - h + c.charCodeAt(0)) | 0, 0);
|
|
3116
|
-
vec[Math.abs(nameHash) % catList.length] += 0.1;
|
|
3117
|
-
let norm = 0;
|
|
3118
|
-
for (let i = 0; i < vec.length; i++)
|
|
3119
|
-
norm += vec[i] * vec[i];
|
|
3120
|
-
norm = Math.sqrt(norm);
|
|
3121
|
-
for (let i = 0; i < vec.length; i++)
|
|
3122
|
-
vec[i] /= norm;
|
|
3123
|
-
return { name: t.name, vector: vec, nodeType: "tool" };
|
|
3124
|
-
});
|
|
3125
|
-
const domainEntries = catList.map((cat, catIdx) => {
|
|
3126
|
-
const vec = new Float32Array(catList.length);
|
|
3127
|
-
vec[catIdx] = 0.95;
|
|
3128
|
-
let norm = 0;
|
|
3129
|
-
for (let i = 0; i < vec.length; i++)
|
|
3130
|
-
norm += vec[i] * vec[i];
|
|
3131
|
-
norm = Math.sqrt(norm);
|
|
3132
|
-
for (let i = 0; i < vec.length; i++)
|
|
3133
|
-
vec[i] /= norm;
|
|
3134
|
-
return { name: `domain:${cat}`, vector: vec, nodeType: "domain" };
|
|
3135
|
-
});
|
|
3136
|
-
return [...toolEntries, ...domainEntries];
|
|
3137
|
-
}
|
|
3138
|
-
function makeQueryVec(query, catList) {
|
|
3139
|
-
const words = query.toLowerCase().split(/\s+/);
|
|
3140
|
-
const vec = new Float32Array(catList.length);
|
|
3141
|
-
for (const cat of catList) {
|
|
3142
|
-
if (words.some((w) => cat.includes(w) || w.includes(cat.slice(0, 4)))) {
|
|
3143
|
-
vec[catList.indexOf(cat)] = 0.8;
|
|
3144
|
-
}
|
|
3145
|
-
}
|
|
3146
|
-
let norm = 0;
|
|
3147
|
-
for (let i = 0; i < vec.length; i++)
|
|
3148
|
-
norm += vec[i] * vec[i];
|
|
3149
|
-
norm = Math.sqrt(norm) || 1;
|
|
3150
|
-
for (let i = 0; i < vec.length; i++)
|
|
3151
|
-
vec[i] /= norm;
|
|
3152
|
-
return vec;
|
|
3153
|
-
}
|
|
3154
|
-
function runAblation(label) {
|
|
3155
|
-
const catList = [...new Set(ALL_REGISTRY_ENTRIES.map((e) => e.category))];
|
|
3156
|
-
let totalRecall = 0;
|
|
3157
|
-
for (const { query, relevant } of ABLATION_QUERIES) {
|
|
3158
|
-
const relevantSet = new Set(relevant);
|
|
3159
|
-
const queryVec = makeQueryVec(query, catList);
|
|
3160
|
-
const results = hybridSearch(query, toolDescs, {
|
|
3161
|
-
mode: "hybrid",
|
|
3162
|
-
limit: 10,
|
|
3163
|
-
embeddingQueryVec: queryVec,
|
|
3164
|
-
});
|
|
3165
|
-
const topK = results.slice(0, 5).map((r) => r.name);
|
|
3166
|
-
const found = topK.filter((n) => relevantSet.has(n)).length;
|
|
3167
|
-
totalRecall += found / relevantSet.size;
|
|
3168
|
-
}
|
|
3169
|
-
return totalRecall / ABLATION_QUERIES.length;
|
|
3170
|
-
}
|
|
3171
|
-
afterEach(() => {
|
|
3172
|
-
resetEmbedding();
|
|
3173
|
-
_resetWrrfParamsForTesting();
|
|
3174
|
-
});
|
|
3175
|
-
it("ablation grid: find optimal α_D and K for single-server tool retrieval", () => {
|
|
3176
|
-
const mockIndex = buildCategoryAwareIndex();
|
|
3177
|
-
const configs = [
|
|
3178
|
-
{ label: "old(T=1.0,D=0.6,K=20)", alphaT: 1.0, alphaD: 0.6, k: 20 },
|
|
3179
|
-
{ label: "paper(T=1.0,D=1.5,K=60)", alphaT: 1.0, alphaD: 1.5, k: 60 },
|
|
3180
|
-
{ label: "paperK20(T=1.0,D=1.5,K=20)", alphaT: 1.0, alphaD: 1.5, k: 20 },
|
|
3181
|
-
{ label: "balanced(T=1.0,D=1.0,K=20)", alphaT: 1.0, alphaD: 1.0, k: 20 },
|
|
3182
|
-
{ label: "gentleDom(T=1.0,D=1.2,K=20)", alphaT: 1.0, alphaD: 1.2, k: 20 },
|
|
3183
|
-
{ label: "strongDom(T=1.0,D=2.0,K=20)", alphaT: 1.0, alphaD: 2.0, k: 20 },
|
|
3184
|
-
];
|
|
3185
|
-
const results = [];
|
|
3186
|
-
for (const cfg of configs) {
|
|
3187
|
-
_setIndexForTesting(mockIndex);
|
|
3188
|
-
_setWrrfParamsForTesting({ alphaT: cfg.alphaT, alphaD: cfg.alphaD, k: cfg.k });
|
|
3189
|
-
results.push({ label: cfg.label, recall: runAblation(cfg.label) });
|
|
3190
|
-
}
|
|
3191
|
-
// Sort by recall descending to find winner
|
|
3192
|
-
results.sort((a, b) => b.recall - a.recall);
|
|
3193
|
-
console.log(`wRRF ablation grid — Recall@5:\n${results.map((r) => ` ${r.label}: ${r.recall.toFixed(3)}`).join("\n")}`);
|
|
3194
|
-
// The winning config should be used as our production default.
|
|
3195
|
-
// Assert the winner beats the old default by at least not being worse.
|
|
3196
|
-
const oldResult = results.find((r) => r.label.startsWith("old"));
|
|
3197
|
-
const bestResult = results[0];
|
|
3198
|
-
expect(bestResult.recall).toBeGreaterThanOrEqual(oldResult.recall);
|
|
3199
|
-
});
|
|
3200
|
-
});
|
|
3201
|
-
//# sourceMappingURL=tools.test.js.map
|