nodebench-mcp 1.2.0 → 1.4.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/NODEBENCH_AGENTS.md +253 -20
- package/STYLE_GUIDE.md +477 -0
- package/dist/__tests__/evalDatasetBench.test.d.ts +1 -0
- package/dist/__tests__/evalDatasetBench.test.js +738 -0
- package/dist/__tests__/evalDatasetBench.test.js.map +1 -0
- package/dist/__tests__/evalHarness.test.d.ts +1 -0
- package/dist/__tests__/evalHarness.test.js +830 -0
- package/dist/__tests__/evalHarness.test.js.map +1 -0
- package/dist/__tests__/fixtures/bfcl_v3_long_context.sample.json +264 -0
- package/dist/__tests__/fixtures/generateBfclLongContextFixture.d.ts +10 -0
- package/dist/__tests__/fixtures/generateBfclLongContextFixture.js +135 -0
- package/dist/__tests__/fixtures/generateBfclLongContextFixture.js.map +1 -0
- package/dist/__tests__/fixtures/generateSwebenchVerifiedFixture.d.ts +14 -0
- package/dist/__tests__/fixtures/generateSwebenchVerifiedFixture.js +189 -0
- package/dist/__tests__/fixtures/generateSwebenchVerifiedFixture.js.map +1 -0
- package/dist/__tests__/fixtures/generateToolbenchInstructionFixture.d.ts +16 -0
- package/dist/__tests__/fixtures/generateToolbenchInstructionFixture.js +154 -0
- package/dist/__tests__/fixtures/generateToolbenchInstructionFixture.js.map +1 -0
- package/dist/__tests__/fixtures/swebench_verified.sample.json +162 -0
- package/dist/__tests__/fixtures/toolbench_instruction.sample.json +109 -0
- package/dist/__tests__/openDatasetParallelEval.test.d.ts +7 -0
- package/dist/__tests__/openDatasetParallelEval.test.js +209 -0
- package/dist/__tests__/openDatasetParallelEval.test.js.map +1 -0
- package/dist/__tests__/openDatasetParallelEvalSwebench.test.d.ts +7 -0
- package/dist/__tests__/openDatasetParallelEvalSwebench.test.js +220 -0
- package/dist/__tests__/openDatasetParallelEvalSwebench.test.js.map +1 -0
- package/dist/__tests__/openDatasetParallelEvalToolbench.test.d.ts +7 -0
- package/dist/__tests__/openDatasetParallelEvalToolbench.test.js +218 -0
- package/dist/__tests__/openDatasetParallelEvalToolbench.test.js.map +1 -0
- package/dist/__tests__/tools.test.js +252 -3
- package/dist/__tests__/tools.test.js.map +1 -1
- package/dist/db.js +20 -0
- package/dist/db.js.map +1 -1
- package/dist/index.js +2 -0
- package/dist/index.js.map +1 -1
- package/dist/tools/agentBootstrapTools.d.ts +5 -1
- package/dist/tools/agentBootstrapTools.js +566 -1
- package/dist/tools/agentBootstrapTools.js.map +1 -1
- package/dist/tools/documentationTools.js +102 -8
- package/dist/tools/documentationTools.js.map +1 -1
- package/dist/tools/learningTools.js +6 -2
- package/dist/tools/learningTools.js.map +1 -1
- package/dist/tools/metaTools.js +112 -1
- package/dist/tools/metaTools.js.map +1 -1
- package/dist/tools/selfEvalTools.d.ts +12 -0
- package/dist/tools/selfEvalTools.js +568 -0
- package/dist/tools/selfEvalTools.js.map +1 -0
- package/package.json +11 -3
|
@@ -0,0 +1,830 @@
|
|
|
1
|
+
/**
|
|
2
|
+
* Eval Harness for NodeBench MCP Tools
|
|
3
|
+
*
|
|
4
|
+
* Tests REAL agent scenarios to prove tools work in practice.
|
|
5
|
+
* Each scenario exercises multiple tools in realistic workflows.
|
|
6
|
+
*
|
|
7
|
+
* Coverage Goals:
|
|
8
|
+
* - Every tool called at least once
|
|
9
|
+
* - Every methodology workflow tested
|
|
10
|
+
* - Cross-tool integration verified
|
|
11
|
+
*/
|
|
12
|
+
import { describe, it, expect } from "vitest";
|
|
13
|
+
import { verificationTools } from "../tools/verificationTools.js";
|
|
14
|
+
import { reconTools } from "../tools/reconTools.js";
|
|
15
|
+
import { evalTools } from "../tools/evalTools.js";
|
|
16
|
+
import { qualityGateTools } from "../tools/qualityGateTools.js";
|
|
17
|
+
import { flywheelTools } from "../tools/flywheelTools.js";
|
|
18
|
+
import { learningTools } from "../tools/learningTools.js";
|
|
19
|
+
import { documentationTools } from "../tools/documentationTools.js";
|
|
20
|
+
import { agentBootstrapTools } from "../tools/agentBootstrapTools.js";
|
|
21
|
+
import { selfEvalTools } from "../tools/selfEvalTools.js";
|
|
22
|
+
import { createMetaTools } from "../tools/metaTools.js";
|
|
23
|
+
// Assemble all tools
|
|
24
|
+
// Every domain tool group, flattened one level into a single list.
// Group order is preserved, so tool order matches a manual spread of each group.
const domainTools = [
    verificationTools,
    evalTools,
    qualityGateTools,
    learningTools,
    flywheelTools,
    reconTools,
    documentationTools,
    agentBootstrapTools,
    selfEvalTools,
].flatMap((group) => [...group]);
// Meta tools are built from the domain tools and appended after them.
const metaTools = createMetaTools(domainTools);
const allTools = [...domainTools, ...metaTools];
|
|
36
|
+
/**
 * Look up a registered tool by its exact name.
 *
 * @param {string} name - Tool name to search for in `allTools`.
 * @returns {object} The first tool whose `name` matches.
 * @throws {Error} When no tool with that name is registered.
 */
const findTool = (name) => {
    for (const candidate of allTools) {
        if (candidate.name === name) {
            return candidate;
        }
    }
    throw new Error(`Tool not found: ${name}`);
};
|
|
42
|
+
// Track which tools are called
|
|
43
|
+
// One entry per handler invocation: { tool, scenario, success }.
const toolCallLog = [];
/**
 * Invoke a tool's handler and record the outcome in `toolCallLog`.
 *
 * @param {string} name - Name of the tool to call (resolved through `findTool`).
 * @param {object} args - Arguments passed to the tool handler unchanged.
 * @param {string} scenario - Label of the scenario issuing the call.
 * @returns {Promise<*>} Whatever the handler resolves to.
 * @throws Rethrows any error raised by the handler after logging the failure.
 */
async function callTool(name, args, scenario) {
    // Lookup happens before logging: an unknown tool name throws without a log entry.
    const tool = findTool(name);
    const record = (success) => {
        toolCallLog.push({ tool: name, scenario, success });
    };
    let outcome;
    try {
        outcome = await tool.handler(args);
    }
    catch (error) {
        record(false);
        throw error;
    }
    record(true);
    return outcome;
}
|
|
56
|
+
// ═══════════════════════════════════════════════════════════════════════════
|
|
57
|
+
// SCENARIO 1: New Feature Development (verification methodology)
|
|
58
|
+
// ═══════════════════════════════════════════════════════════════════════════
|
|
59
|
+
describe("Scenario: New Feature Development", () => {
    const SCENARIO = "feature-dev";
    // Ids produced by earlier steps and consumed by later ones; the steps
    // therefore depend on running in declaration order.
    let cycleId;
    let gapId;
    it("Step 1: Start verification cycle", async () => {
        const cycleArgs = {
            title: "eval-feature-development",
            description: "Implementing user authentication",
        };
        const started = await callTool("start_verification_cycle", cycleArgs, SCENARIO);
        expect(started.cycleId).toBeTruthy();
        cycleId = started.cycleId;
    });
    it("Step 2: Log context gathering (Phase 1)", async () => {
        const phaseArgs = {
            cycleId,
            phaseNumber: 1,
            status: "passed",
            findings: { patterns: ["JWT", "session-based"], recommendation: "use JWT" },
        };
        const logged = await callTool("log_phase_findings", phaseArgs, SCENARIO);
        expect(logged.phaseRecorded).toBe(1);
    });
    it("Step 3: Log a gap found during implementation", async () => {
        const gapArgs = {
            cycleId,
            severity: "MEDIUM",
            title: "Missing rate limiting",
            description: "Auth endpoint needs rate limiting",
            rootCause: "Security oversight",
            fixStrategy: "Add express-rate-limit middleware",
        };
        const gap = await callTool("log_gap", gapArgs, SCENARIO);
        expect(gap.gapId).toBeTruthy();
        gapId = gap.gapId;
    });
    it("Step 4: Get verification status", async () => {
        const status = await callTool("get_verification_status", { cycleId }, SCENARIO);
        // The tool reports a status (active/completed/abandoned), currentPhase, etc.
        expect(status.status).toBeTruthy();
    });
    it("Step 5: Resolve the gap", async () => {
        const resolution = await callTool("resolve_gap", { gapId }, SCENARIO);
        expect(resolution.status).toBe("resolved");
    });
    it("Step 6: Log test result", async () => {
        const testArgs = {
            cycleId,
            layer: "integration", // required by the tool
            label: "auth-integration-test", // required by the tool (the field is not named testName)
            passed: true,
            output: "All auth flows passing",
        };
        const recorded = await callTool("log_test_result", testArgs, SCENARIO);
        expect(recorded.testId).toBeTruthy();
    });
    it("Step 7: Cleanup - abandon cycle", async () => {
        const abandonArgs = {
            cycleId,
            reason: "eval harness cleanup",
        };
        const cleanup = await callTool("abandon_cycle", abandonArgs, SCENARIO);
        expect(cleanup.abandoned).toBe(true);
    });
});
|
|
122
|
+
// ═══════════════════════════════════════════════════════════════════════════
|
|
123
|
+
// SCENARIO 2: Eval-Driven Development (eval methodology)
|
|
124
|
+
// ═══════════════════════════════════════════════════════════════════════════
|
|
125
|
+
describe("Scenario: Eval-Driven Development", () => {
    const SCENARIO = "eval-driven";
    // Populated by Step 1 and reused by the following steps.
    let evalRunId;
    let caseIds;
    it("Step 1: Start eval run with test cases", async () => {
        // Tool schema: name, cases (each with input, intent, expected)
        const evalCases = [
            { input: "Hello", intent: "greeting" },
            { input: "Help me code", intent: "assistance" },
        ];
        const started = await callTool("start_eval_run", {
            name: "eval-harness-run",
            description: "Testing prompt quality",
            cases: evalCases,
        }, SCENARIO);
        expect(started.runId).toBeTruthy();
        evalRunId = started.runId;
        caseIds = started.caseIds;
    });
    it("Step 2: Record eval results", async () => {
        // Tool schema: caseId, verdict (pass/fail/partial), actual, score
        const greetingOutcome = await callTool("record_eval_result", {
            caseId: caseIds[0],
            actual: "greeting response",
            verdict: "pass",
            score: 0.9,
        }, SCENARIO);
        expect(greetingOutcome.caseId).toBe(caseIds[0]);
        expect(greetingOutcome.verdict).toBe("pass");
        const assistanceOutcome = await callTool("record_eval_result", {
            caseId: caseIds[1],
            actual: "help response",
            verdict: "pass",
            score: 0.85,
        }, SCENARIO);
        expect(assistanceOutcome.caseId).toBe(caseIds[1]);
    });
    it("Step 3: Complete eval run", async () => {
        const completion = await callTool("complete_eval_run", { runId: evalRunId }, SCENARIO);
        expect(completion.runId).toBe(evalRunId);
        expect(completion.status).toBe("completed");
        expect(completion.summary).toBeDefined();
    });
    it("Step 4: List eval runs", async () => {
        const listing = await callTool("list_eval_runs", { limit: 10 }, SCENARIO);
        expect(listing.runs).toBeDefined();
        expect(listing.runs.length).toBeGreaterThan(0);
    });
});
|
|
176
|
+
// ═══════════════════════════════════════════════════════════════════════════
|
|
177
|
+
// SCENARIO 3: Knowledge Management (learning methodology)
|
|
178
|
+
// ═══════════════════════════════════════════════════════════════════════════
|
|
179
|
+
describe("Scenario: Knowledge Management", () => {
    const SCENARIO = "knowledge";
    // Timestamp-suffixed key; the same key is recorded in Step 1 and deleted in Step 4.
    const uniqueKey = `eval-learning-${Date.now()}`;
    it("Step 1: Record a learning", async () => {
        const learning = {
            key: uniqueKey,
            category: "pattern",
            content: "Use scenario-based testing to verify tool chains work together",
            tags: ["testing", "eval", "integration"],
        };
        const recorded = await callTool("record_learning", learning, SCENARIO);
        expect(recorded.key).toBe(uniqueKey);
        expect(recorded.success).toBe(true);
    });
    it("Step 2: Search for the learning", async () => {
        // Response shape: { query, count, learnings: [...] }
        const found = await callTool("search_learnings", { query: "scenario testing" }, SCENARIO);
        expect(found.learnings).toBeDefined();
    });
    it("Step 3: List all learnings", async () => {
        const listing = await callTool("list_learnings", { limit: 20 }, SCENARIO);
        expect(listing.learnings).toBeDefined();
    });
    it("Step 4: Delete the learning", async () => {
        // Response shape: { success: true, message }
        const removal = await callTool("delete_learning", { key: uniqueKey }, SCENARIO);
        expect(removal.success).toBe(true);
    });
});
|
|
212
|
+
// ═══════════════════════════════════════════════════════════════════════════
|
|
213
|
+
// SCENARIO 4: Quality Gates (quality_gates methodology)
|
|
214
|
+
// ═══════════════════════════════════════════════════════════════════════════
|
|
215
|
+
describe("Scenario: Quality Gates", () => {
    const SCENARIO = "quality-gates";
    it("Step 1: Get deploy_readiness preset", async () => {
        const preset = await callTool("get_gate_preset", { preset: "deploy_readiness" }, SCENARIO);
        expect(preset.preset).toBe("deploy_readiness");
        expect(preset.rules.length).toBeGreaterThan(0);
    });
    it("Step 2: Run quality gate", async () => {
        // One rule is reported as failed on purpose so the gate must fail.
        const ruleOutcomes = [
            { name: "tests_pass", passed: true },
            { name: "no_type_errors", passed: true },
            { name: "no_lint_errors", passed: true },
            { name: "coverage_threshold", passed: false },
        ];
        const gateRun = await callTool("run_quality_gate", {
            gateName: "deploy_readiness",
            target: "eval-harness-test",
            rules: ruleOutcomes,
        }, SCENARIO);
        expect(gateRun.passed).toBe(false);
        expect(gateRun.failures).toContain("coverage_threshold");
    });
    it("Step 3: Get gate history", async () => {
        // Response shape: { gateName, runs: [...], trend }
        const history = await callTool("get_gate_history", {
            gateName: "deploy_readiness",
            limit: 10,
        }, SCENARIO);
        expect(history.gateName).toBe("deploy_readiness");
        expect(history.runs).toBeDefined();
    });
    it("Step 4: Run closed loop verification", async () => {
        // Tool schema: steps with { step: enum, passed: boolean }
        const loopSteps = [
            { step: "compile", passed: true },
            { step: "lint", passed: true },
            { step: "test", passed: true },
        ];
        const loopRun = await callTool("run_closed_loop", { steps: loopSteps }, SCENARIO);
        expect(loopRun.allPassed).toBe(true);
    });
});
|
|
258
|
+
// ═══════════════════════════════════════════════════════════════════════════
|
|
259
|
+
// SCENARIO 5: Flywheel Orchestration (flywheel methodology)
|
|
260
|
+
// ═══════════════════════════════════════════════════════════════════════════
|
|
261
|
+
describe("Scenario: Flywheel Orchestration", () => {
    const SCENARIO = "flywheel";
    it("Step 1: Get flywheel status", async () => {
        // Response shape: { innerLoop, outerLoop, connections }
        const flywheel = await callTool("get_flywheel_status", {}, SCENARIO);
        expect(flywheel).toHaveProperty("innerLoop");
        expect(flywheel).toHaveProperty("outerLoop");
        expect(flywheel).toHaveProperty("connections");
    });
    it("Step 2: Run mandatory flywheel check", async () => {
        // Tool schema: target, steps array with stepName enum
        const flywheelSteps = [
            { stepName: "static_analysis", passed: true },
            { stepName: "happy_path_test", passed: true },
            { stepName: "failure_path_test", passed: true },
            { stepName: "gap_analysis", passed: true },
            { stepName: "fix_and_reverify", passed: true },
            { stepName: "deploy_and_document", passed: true },
        ];
        const check = await callTool("run_mandatory_flywheel", {
            target: "Added new auth feature",
            steps: flywheelSteps,
        }, SCENARIO);
        expect(check).toHaveProperty("passed");
        expect(check.passed).toBe(true);
    });
});
|
|
286
|
+
// ═══════════════════════════════════════════════════════════════════════════
|
|
287
|
+
// SCENARIO 5.5: Flywheel Integration (promote, investigate, compare)
|
|
288
|
+
// Tests the 4 previously untested flywheel integration tools
|
|
289
|
+
// ═══════════════════════════════════════════════════════════════════════════
|
|
290
|
+
describe("Scenario: Flywheel Integration", () => {
    // Exercises the flywheel integration tools in self-contained tests: each
    // test creates the records it needs and abandons any verification cycle it
    // opened.
    const SCENARIO = "flywheel-integration";
    /**
     * Abandon a verification cycle created by a test in this scenario.
     *
     * @param {*} cycleId - Id of the cycle to abandon.
     * @returns {Promise<*>} Result of the `abandon_cycle` tool.
     */
    const abandonCycle = (cycleId) => callTool("abandon_cycle", {
        cycleId,
        reason: "Test cleanup",
    }, SCENARIO);
    it("Step 1: list_verification_cycles - lists cycles", async () => {
        // First create a cycle so we have something to list
        const createResult = await callTool("start_verification_cycle", {
            title: "List test cycle", // Note: title, not goal
            description: "Testing list_verification_cycles",
        }, SCENARIO);
        const testCycleId = createResult.cycleId;
        try {
            // Now list cycles
            const result = await callTool("list_verification_cycles", {
                limit: 10,
            }, SCENARIO);
            expect(result).toHaveProperty("count");
            expect(result).toHaveProperty("cycles");
            expect(Array.isArray(result.cycles)).toBe(true);
            // Each cycle has cycleId property (not id)
            expect(result.cycles.some((c) => c.cycleId === testCycleId)).toBe(true);
        }
        finally {
            // Cleanup runs even when an assertion above fails, so the cycle is not left behind.
            await abandonCycle(testCycleId);
        }
    });
    it("Step 2: promote_to_eval - promotes verification to eval suite", async () => {
        // Create a cycle to promote from
        const cycleResult = await callTool("start_verification_cycle", {
            title: "Promote test cycle",
            description: "Testing promote_to_eval",
        }, SCENARIO);
        try {
            // Promote with explicit cases (required)
            const result = await callTool("promote_to_eval", {
                cycleId: cycleResult.cycleId,
                evalRunName: "promoted-eval-test",
                cases: [
                    { input: "test input", intent: "Test intent" },
                ],
            }, SCENARIO);
            expect(result).toHaveProperty("evalRunId");
            expect(result).toHaveProperty("caseIds");
            expect(result.caseCount).toBe(1);
        }
        finally {
            // Cleanup runs even when promotion or an assertion fails.
            await abandonCycle(cycleResult.cycleId);
        }
    });
    it("Step 3: compare_eval_runs - compares two completed evals", async () => {
        // Create and complete baseline eval
        const baseline = await callTool("start_eval_run", {
            name: "baseline-for-compare",
            cases: [{ input: "test", intent: "baseline" }],
        }, SCENARIO);
        await callTool("record_eval_result", {
            caseId: baseline.caseIds[0],
            actual: "result",
            verdict: "pass",
        }, SCENARIO);
        await callTool("complete_eval_run", {
            runId: baseline.runId,
        }, SCENARIO);
        // Create and complete candidate eval
        const candidate = await callTool("start_eval_run", {
            name: "candidate-for-compare",
            cases: [{ input: "test", intent: "candidate" }],
        }, SCENARIO);
        await callTool("record_eval_result", {
            caseId: candidate.caseIds[0],
            actual: "result",
            verdict: "pass",
        }, SCENARIO);
        await callTool("complete_eval_run", {
            runId: candidate.runId,
        }, SCENARIO);
        // Compare them
        const result = await callTool("compare_eval_runs", {
            baselineRunId: baseline.runId,
            candidateRunId: candidate.runId,
        }, SCENARIO);
        expect(result).toHaveProperty("recommendation");
        expect(["DEPLOY", "REVERT", "INVESTIGATE"]).toContain(result.recommendation);
    });
    it("Step 4: trigger_investigation - creates investigation cycle", async () => {
        // Create and complete an eval run to investigate
        const eval1 = await callTool("start_eval_run", {
            name: "eval-to-investigate",
            cases: [{ input: "test", intent: "investigate" }],
        }, SCENARIO);
        await callTool("record_eval_result", {
            caseId: eval1.caseIds[0],
            actual: "failed",
            verdict: "fail",
        }, SCENARIO);
        await callTool("complete_eval_run", {
            runId: eval1.runId,
        }, SCENARIO);
        // Trigger investigation
        const result = await callTool("trigger_investigation", {
            evalRunId: eval1.runId,
            regressionDescription: "Test failure detected",
        }, SCENARIO);
        try {
            // Returns cycleId, title, linkedEvalRun, phase1Instructions
            expect(result).toHaveProperty("cycleId");
            expect(result).toHaveProperty("title");
            expect(result).toHaveProperty("linkedEvalRun");
        }
        finally {
            // Only abandon when the tool actually returned a cycle id; otherwise
            // the cleanup call would fail and hide the assertion error above.
            if (result?.cycleId !== undefined) {
                await abandonCycle(result.cycleId);
            }
        }
    });
});
|
|
402
|
+
// ═══════════════════════════════════════════════════════════════════════════
|
|
403
|
+
// SCENARIO 6: Research & Discovery (recon methodology)
|
|
404
|
+
// ═══════════════════════════════════════════════════════════════════════════
|
|
405
|
+
describe("Scenario: Research & Discovery", () => {
    const SCENARIO = "research";
    // Session id returned by Step 1 and reused by Steps 2 and 3.
    let reconSessionId;
    it("Step 1: Start recon session", async () => {
        // Tool schema: target (required), description, projectContext
        const session = await callTool("run_recon", {
            target: "MCP server best practices",
            description: "Research for eval harness",
        }, SCENARIO);
        expect(session.sessionId).toBeTruthy();
        reconSessionId = session.sessionId;
    });
    it("Step 2: Log recon finding", async () => {
        // Tool schema: sessionId, category (enum), summary, sourceUrl, relevance
        const finding = {
            sessionId: reconSessionId,
            category: "best_practice",
            summary: "Organize tools by domain for better discoverability",
            sourceUrl: "https://docs.anthropic.com",
            relevance: "Applies to MCP tool organization",
        };
        const logged = await callTool("log_recon_finding", finding, SCENARIO);
        expect(logged.findingId).toBeTruthy();
        expect(logged.findingCount).toBeGreaterThan(0);
    });
    it("Step 3: Get recon summary", async () => {
        // Response shape: { sessionId, target, status, totalFindings, findingsByCategory, ... }
        const summary = await callTool("get_recon_summary", { sessionId: reconSessionId }, SCENARIO);
        expect(summary.sessionId).toBe(reconSessionId);
        expect(summary.totalFindings).toBeGreaterThan(0);
    });
    it("Step 4: Check framework updates", async () => {
        // Tool schema: ecosystem (enum)
        const updates = await callTool("check_framework_updates", { ecosystem: "mcp" }, SCENARIO);
        expect(updates.ecosystem).toBe("mcp");
        expect(updates.sources).toBeDefined();
    });
    it("Step 5: Bootstrap project context", async () => {
        // Tool schema: projectName (required), techStack, architecture, etc.
        const project = {
            projectName: "eval-harness-project",
            techStack: "TypeScript, Vitest, MCP",
            architecture: "Modular tool system",
        };
        const bootstrapped = await callTool("bootstrap_project", project, SCENARIO);
        expect(bootstrapped.projectName).toBe("eval-harness-project");
        expect(bootstrapped.storedFields).toBeDefined();
    });
    it("Step 6: Get project context", async () => {
        // Response shape: { context: {}, knowledgeBase: {} }
        const projectContext = await callTool("get_project_context", {}, SCENARIO);
        expect(projectContext).toHaveProperty("context");
        expect(projectContext).toHaveProperty("knowledgeBase");
    });
    it("Step 7: Search all knowledge", async () => {
        const hits = await callTool("search_all_knowledge", { query: "MCP tools" }, SCENARIO);
        expect(hits).toHaveProperty("learnings");
        expect(hits).toHaveProperty("reconFindings");
        expect(hits).toHaveProperty("gaps");
    });
});
|
|
469
|
+
// ═══════════════════════════════════════════════════════════════════════════
|
|
470
|
+
// SCENARIO 7: Agent Self-Bootstrap (agent_bootstrap methodology)
|
|
471
|
+
// ═══════════════════════════════════════════════════════════════════════════
|
|
472
|
+
describe("Scenario: Agent Self-Bootstrap", () => {
    const SCENARIO = "bootstrap";
    it("Step 1: Discover infrastructure", async () => {
        const discovery = await callTool("discover_infrastructure", {
            categories: ["agent_loop", "telemetry"],
            depth: "shallow",
        }, SCENARIO);
        expect(discovery).toHaveProperty("discovered");
        expect(discovery).toHaveProperty("missing");
    });
    it("Step 2: Triple verify a component", async () => {
        const verifyArgs = {
            target: "verification-tools",
            scope: "implementation", // accepted values: implementation|integration|deployment|full
            includeWebSearch: false,
        };
        const verification = await callTool("triple_verify", verifyArgs, SCENARIO);
        // The response carries one section per verification pass.
        expect(verification).toHaveProperty("verification1_internal");
        expect(verification).toHaveProperty("verification2_external");
        expect(verification).toHaveProperty("verification3_synthesis");
    });
    it("Step 3: Self-implement missing component", async () => {
        const implementation = await callTool("self_implement", {
            component: "telemetry",
            dryRun: true,
        }, SCENARIO);
        expect(implementation).toHaveProperty("component");
        // Response also includes plan, files, nextSteps
        expect(implementation).toHaveProperty("plan");
        expect(implementation).toHaveProperty("files");
    });
    it("Step 4: Generate self-instructions", async () => {
        const instructions = await callTool("generate_self_instructions", {
            format: "claude_md",
            includeExternalSources: false,
        }, SCENARIO);
        expect(instructions).toHaveProperty("format");
        // The generated text is exposed as `content` (not `instructions`)
        expect(instructions).toHaveProperty("content");
    });
    it("Step 5: Connect channels", async () => {
        const channelArgs = {
            channels: ["web", "github"],
            query: "mcp tools",
            aggressive: false,
        };
        const connection = await callTool("connect_channels", channelArgs, SCENARIO);
        // Response shape: query, results (array of {channel, findings, sources})
        expect(connection).toHaveProperty("query");
        expect(connection).toHaveProperty("results");
        expect(Array.isArray(connection.results)).toBe(true);
    });
});
|
|
523
|
+
// ═══════════════════════════════════════════════════════════════════════════
|
|
524
|
+
// SCENARIO 8: Autonomous Maintenance (autonomous_maintenance methodology)
|
|
525
|
+
// ═══════════════════════════════════════════════════════════════════════════
|
|
526
|
+
describe("Scenario: Autonomous Maintenance", () => {
    const SCENARIO = "autonomous";
    it("Step 1: Assess risk before action", async () => {
        const risk = await callTool("assess_risk", {
            action: "update_agents_md",
            context: "Adding new documentation",
        }, SCENARIO);
        expect(risk.assessment.tier).toBe("medium");
        expect(risk.assessment.recommendation).toBe("log_and_proceed");
    });
    it("Step 2: Decide re-update vs create", async () => {
        const allowedActions = ["update_existing", "create_new", "merge"];
        const decision = await callTool("decide_re_update", {
            targetContent: "New methodology documentation",
            contentType: "documentation",
            existingFiles: ["README.md", "AGENTS.md"],
        }, SCENARIO);
        expect(allowedActions).toContain(decision.action);
    });
    it("Step 3: Run self-maintenance", async () => {
        const maintenanceArgs = {
            scope: "quick",
            autoFix: false,
            dryRun: true,
        };
        const maintenance = await callTool("run_self_maintenance", maintenanceArgs, SCENARIO);
        expect(maintenance).toHaveProperty("checksPerformed");
        expect(maintenance).toHaveProperty("issuesFound");
    });
    it("Step 4: Scaffold directory structure", async () => {
        const scaffoldArgs = {
            component: "agent_loop",
            includeTests: true,
            dryRun: true,
        };
        const scaffold = await callTool("scaffold_directory", scaffoldArgs, SCENARIO);
        expect(scaffold.component).toBe("agent_loop");
        expect(scaffold.structure.files.length).toBeGreaterThan(0);
    });
    it("Step 5: Run autonomous loop with guardrails", async () => {
        const knownStatuses = ["completed", "stopped", "timeout", "failed"];
        const loopRun = await callTool("run_autonomous_loop", {
            goal: "Verify all documentation is in sync",
            maxIterations: 3,
            maxDurationMs: 5000,
            stopOnFirstFailure: true,
        }, SCENARIO);
        expect(loopRun.goal).toBeTruthy();
        // The iteration count must respect the maxIterations guardrail passed above.
        expect(loopRun.iterations).toBeLessThanOrEqual(3);
        expect(knownStatuses).toContain(loopRun.status);
    });
});
|
|
573
|
+
// ═══════════════════════════════════════════════════════════════════════════
|
|
574
|
+
// SCENARIO 9: Meta Tools (tool discovery)
|
|
575
|
+
// ═══════════════════════════════════════════════════════════════════════════
|
|
576
|
+
describe("Scenario: Meta Tool Discovery", () => {
    const SCENARIO = "meta";
    it("Step 1: Find tools by keyword", async () => {
        const matches = await callTool("findTools", { query: "verification" }, SCENARIO);
        expect(matches.tools.length).toBeGreaterThan(0);
    });
    it("Step 2: Find tools by category", async () => {
        const matches = await callTool("findTools", { category: "bootstrap" }, SCENARIO);
        expect(matches.tools.length).toBeGreaterThan(0);
    });
    it("Step 3: Get methodology overview", async () => {
        const overview = await callTool("getMethodology", { topic: "overview" }, SCENARIO);
        expect(overview.title).toContain("Overview");
        // The first step of the overview carries the topic index.
        const topicNames = Object.keys(overview.steps[0].topics);
        expect(topicNames.length).toBe(17);
    });
    it("Step 4: Get specific methodology", async () => {
        const methodologies = [
            "verification",
            "eval",
            "flywheel",
            "mandatory_flywheel",
            "reconnaissance",
            "quality_gates",
            "ui_ux_qa",
            "agentic_vision",
            "closed_loop",
            "learnings",
            "project_ideation",
            "tech_stack_2026",
            "telemetry_setup",
            "agents_md_maintenance",
            "agent_bootstrap",
            "autonomous_maintenance",
            "self_reinforced_learning",
        ];
        // Topics are fetched one at a time, in list order.
        for (const topic of methodologies) {
            const methodology = await callTool("getMethodology", { topic }, SCENARIO);
            expect(methodology.title).toBeTruthy();
            expect(methodology.steps.length).toBeGreaterThan(0);
        }
    });
});
|
|
613
|
+
// ═══════════════════════════════════════════════════════════════════════════
|
|
614
|
+
// SCENARIO 10: Self-Reinforced Learning (trajectory analysis)
|
|
615
|
+
// ═══════════════════════════════════════════════════════════════════════════
|
|
616
|
+
// Logs three tool calls under one session id, then reads them back through the
// trajectory-analysis, self-eval-report and recommendation tools. Step 2 reads
// the session that Step 1 writes, so the steps are order-dependent.
describe("Scenario: Self-Reinforced Learning", () => {
    const sessionId = "eval-harness-self-eval";
    const callSelfEval = (toolName, args) => callTool(toolName, args, "self-eval");

    it("Step 1: Log tool calls to build trajectory data", async () => {
        const firstEntry = await callSelfEval("log_tool_call", {
            sessionId,
            toolName: "start_verification_cycle",
            durationMs: 25,
            resultStatus: "success",
            phase: "verification",
        });
        expect(firstEntry.logged).toBe(true);

        // The remaining two entries are logged without inspecting the response.
        const followUps = [
            { toolName: "log_phase_findings", durationMs: 12, phase: "verification" },
            { toolName: "run_mandatory_flywheel", durationMs: 35, phase: "flywheel" },
        ];
        for (const entry of followUps) {
            await callSelfEval("log_tool_call", {
                sessionId,
                toolName: entry.toolName,
                durationMs: entry.durationMs,
                resultStatus: "success",
                phase: entry.phase,
            });
        }
    });

    it("Step 2: Analyze trajectory patterns", async () => {
        const analysis = await callSelfEval("get_trajectory_analysis", { sessionId });
        expect(analysis.totalCalls).toBeGreaterThanOrEqual(3);
        expect(analysis.uniqueTools).toBeGreaterThanOrEqual(3);
        expect(analysis.topTools.length).toBeGreaterThan(0);
    });

    it("Step 3: Generate self-eval health report", async () => {
        const report = await callSelfEval("get_self_eval_report", { sinceDaysAgo: 30 });
        expect(typeof report.healthScore).toBe("number");
        const requiredSections = ["verification", "gaps", "evalRuns", "toolTrajectory"];
        for (const section of requiredSections) {
            expect(report).toHaveProperty(section);
        }
    });

    it("Step 4: Get improvement recommendations", async () => {
        const advice = await callSelfEval("get_improvement_recommendations", {
            sinceDaysAgo: 30,
            focus: "all",
        });
        expect(typeof advice.totalRecommendations).toBe("number");
        expect(Array.isArray(advice.recommendations)).toBe(true);
        expect(advice._selfReinforcement.nextSteps.length).toBe(4);
    });
});
|
|
669
|
+
// ═══════════════════════════════════════════════════════════════════════════
|
|
670
|
+
// COVERAGE REPORT
|
|
671
|
+
// ═══════════════════════════════════════════════════════════════════════════
|
|
672
|
+
// Prints a "Proof of Work" report built from toolCallLog (the calls recorded by
// the scenarios in this file) and allTools (the registered tool list), then
// asserts minimum coverage: enough distinct tools exercised, no untested
// non-external tools, and no failed calls.
describe("Coverage Report", () => {
    it("should generate comprehensive Proof of Work report", () => {
        // Box rules shared by every section of the report.
        const BOX_TOP = "┌─────────────────────────────────────────────────────────────────────────────┐";
        const BOX_MID = "├─────────────────────────────────────────────────────────────────────────────┤";
        const BOX_BOTTOM = "└─────────────────────────────────────────────────────────────────────────────┘";
        // Whole-number percentage; yields 0 instead of NaN when `whole` is 0
        // (empty call log or empty tool list).
        const percent = (part, whole) => (whole > 0 ? Math.round(part / whole * 100) : 0);
        // Pads a row to the box width and appends the right-hand border.
        const boxRow = (text) => text.padEnd(78) + "│";

        const testedTools = new Set(toolCallLog.map(l => l.tool));
        const allToolNames = allTools.map(t => t.name);
        // Tools that require external dependencies (skip in automated tests)
        const externalDependencyTools = [
            "capture_ui_screenshot", // Requires Playwright
            "capture_responsive_suite", // Requires Playwright
            "discover_vision_env", // Dynamic SDK imports
            "analyze_screenshot", // Requires AI API key
            "manipulate_screenshot", // Requires Sharp
            "web_search", // Requires AI API key
            "fetch_url", // External network calls
            "search_github", // Requires GitHub API
            "analyze_repo", // Requires GitHub API
            "update_agents_md", // File system - tested separately
            "research_job_market", // Covered in tools.test.ts
            "setup_local_env", // Covered in tools.test.ts
        ];
        // Deprecated tools (kept for backwards compatibility, but flagged)
        const deprecatedTools = [
            { tool: "search_learnings", reason: "DEPRECATED: Use search_all_knowledge instead" },
            { tool: "list_learnings", reason: "DEPRECATED: Use search_all_knowledge instead" },
        ];
        // Set lookup instead of Array.includes inside the filter.
        const externalToolSet = new Set(externalDependencyTools);
        const untestedTools = allToolNames.filter(name => !testedTools.has(name) && !externalToolSet.has(name));

        // Count successes and failures
        const successCount = toolCallLog.filter(l => l.success).length;
        const failureCount = toolCallLog.length - successCount;

        // Build scenario summary: distinct tools plus pass/fail call counts,
        // keyed by scenario in first-seen order.
        const byScenario = new Map();
        for (const entry of toolCallLog) {
            let summary = byScenario.get(entry.scenario);
            if (summary === undefined) {
                summary = { tools: [], success: 0, fail: 0 };
                byScenario.set(entry.scenario, summary);
            }
            if (!summary.tools.includes(entry.tool)) {
                summary.tools.push(entry.tool);
            }
            if (entry.success) {
                summary.success++;
            }
            else {
                summary.fail++;
            }
        }

        console.log("\n");
        console.log("╔═══════════════════════════════════════════════════════════════════════════╗");
        console.log("║ NODEBENCH MCP - PROOF OF WORK REPORT ║");
        console.log("╚═══════════════════════════════════════════════════════════════════════════╝");
        console.log("");
        console.log(BOX_TOP);
        console.log("│ SUMMARY │");
        console.log(BOX_MID);
        console.log(`│ Total Tools in MCP: ${String(allToolNames.length).padStart(3)} │`);
        console.log(`│ Tools Tested in Scenarios: ${String(testedTools.size).padStart(3)} (${percent(testedTools.size, allToolNames.length)}%) │`);
        console.log(`│ External Dependency (skip): ${String(externalDependencyTools.length).padStart(3)} (require API keys/network) │`);
        console.log(`│ Untested (GAPS): ${String(untestedTools.length).padStart(3)} │`);
        console.log(`│ Total Tool Calls: ${String(toolCallLog.length).padStart(3)} │`);
        console.log(`│ Success Rate: ${successCount}/${toolCallLog.length} (${percent(successCount, toolCallLog.length)}%) │`);
        console.log(BOX_BOTTOM);
        console.log("");

        // Scenario breakdown
        console.log(BOX_TOP);
        console.log("│ SCENARIOS TESTED │");
        console.log(BOX_MID);
        for (const [scenario, data] of byScenario) {
            const status = data.fail === 0 ? "✓" : "✗";
            console.log(boxRow(`│ ${status} ${scenario.padEnd(25)} ${String(data.tools.length).padStart(2)} tools, ${String(data.success).padStart(2)} calls`));
        }
        console.log(BOX_BOTTOM);
        console.log("");

        // Tool coverage matrix (grouped by domain)
        const domainMap = {
            "Verification": ["start_verification_cycle", "log_phase_findings", "log_gap", "resolve_gap", "log_test_result", "get_verification_status", "list_verification_cycles", "abandon_cycle"],
            "Eval": ["start_eval_run", "record_eval_result", "complete_eval_run", "compare_eval_runs", "list_eval_runs"],
            "Quality Gates": ["run_quality_gate", "get_gate_preset", "get_gate_history", "run_closed_loop"],
            "Learning": ["record_learning", "search_learnings", "list_learnings", "delete_learning"],
            "Flywheel": ["get_flywheel_status", "promote_to_eval", "trigger_investigation", "run_mandatory_flywheel"],
            "Recon": ["run_recon", "log_recon_finding", "get_recon_summary", "check_framework_updates", "search_all_knowledge", "bootstrap_project", "get_project_context"],
            "Bootstrap": ["discover_infrastructure", "triple_verify", "self_implement", "generate_self_instructions", "connect_channels"],
            "Autonomous": ["assess_risk", "decide_re_update", "run_self_maintenance", "scaffold_directory", "run_autonomous_loop"],
            "Self-Eval": ["log_tool_call", "get_trajectory_analysis", "get_self_eval_report", "get_improvement_recommendations"],
            "Meta": ["findTools", "getMethodology"],
            "External (skip)": externalDependencyTools,
        };
        console.log(BOX_TOP);
        console.log("│ TOOL COVERAGE BY DOMAIN │");
        console.log(BOX_MID);
        for (const [domain, tools] of Object.entries(domainMap)) {
            const tested = tools.filter(t => testedTools.has(t)).length;
            const total = tools.length;
            const pct = percent(tested, total);
            // Ten-cell bar, one filled cell per (rounded) 10%.
            const filledCells = Math.round(pct / 10);
            const bar = "█".repeat(filledCells) + "░".repeat(10 - filledCells);
            console.log(boxRow(`│ ${domain.padEnd(18)} ${bar} ${String(tested).padStart(2)}/${String(total).padStart(2)} (${String(pct).padStart(3)}%)`));
        }
        console.log(BOX_BOTTOM);
        console.log("");

        // Gaps
        if (untestedTools.length > 0) {
            console.log(BOX_TOP);
            console.log("│ ⚠ GAPS (Untested Tools) │");
            console.log(BOX_MID);
            for (const toolName of untestedTools) {
                console.log(boxRow(`│ - ${toolName}`));
            }
            console.log(BOX_BOTTOM);
            console.log("");
        }

        // Deprecated tools analysis
        console.log(BOX_TOP);
        console.log("│ ⚠️ DEPRECATED TOOLS │");
        console.log(BOX_MID);
        if (deprecatedTools.length === 0) {
            console.log(boxRow("│ No deprecated tools."));
        }
        else {
            for (const d of deprecatedTools) {
                // Truncate first so an over-long reason cannot push the border out.
                console.log(boxRow(`│ - ${d.tool}: ${d.reason}`.slice(0, 77)));
            }
        }
        console.log("│ │");
        console.log("│ These tools are kept for backwards compatibility but return a │");
        console.log("│ deprecation notice. Use search_all_knowledge for unified search. │");
        console.log(BOX_BOTTOM);
        console.log("");

        // Final verdict
        const allCovered = untestedTools.length === 0;
        const allPassed = failureCount === 0;
        console.log(BOX_TOP);
        console.log("│ VERDICT │");
        console.log(BOX_MID);
        if (allCovered && allPassed) {
            console.log("│ ✅ ALL TOOLS TESTED AND WORKING │");
            console.log("│ \"Yah it definitely works!\" │");
        }
        else if (allPassed) {
            console.log("│ ✅ ALL TESTED TOOLS WORKING │");
            console.log(boxRow(`│ ⚠ ${untestedTools.length} tools not covered in scenario tests (see gaps above)`));
        }
        else {
            console.log(boxRow(`│ ❌ ${failureCount} tool calls failed - investigate before shipping`));
        }
        console.log(BOX_BOTTOM);
        console.log("");

        // Assert minimum coverage
        expect(testedTools.size).toBeGreaterThan(35); // Strictly more than 35 distinct tools must be exercised
        expect(untestedTools.length).toBe(0); // All non-external tools should be tested
        expect(failureCount).toBe(0); // No failures allowed
    });
});
|
|
830
|
+
//# sourceMappingURL=evalHarness.test.js.map
|