nodebench-mcp 1.2.0 → 1.4.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (48)
  1. package/NODEBENCH_AGENTS.md +253 -20
  2. package/STYLE_GUIDE.md +477 -0
  3. package/dist/__tests__/evalDatasetBench.test.d.ts +1 -0
  4. package/dist/__tests__/evalDatasetBench.test.js +738 -0
  5. package/dist/__tests__/evalDatasetBench.test.js.map +1 -0
  6. package/dist/__tests__/evalHarness.test.d.ts +1 -0
  7. package/dist/__tests__/evalHarness.test.js +830 -0
  8. package/dist/__tests__/evalHarness.test.js.map +1 -0
  9. package/dist/__tests__/fixtures/bfcl_v3_long_context.sample.json +264 -0
  10. package/dist/__tests__/fixtures/generateBfclLongContextFixture.d.ts +10 -0
  11. package/dist/__tests__/fixtures/generateBfclLongContextFixture.js +135 -0
  12. package/dist/__tests__/fixtures/generateBfclLongContextFixture.js.map +1 -0
  13. package/dist/__tests__/fixtures/generateSwebenchVerifiedFixture.d.ts +14 -0
  14. package/dist/__tests__/fixtures/generateSwebenchVerifiedFixture.js +189 -0
  15. package/dist/__tests__/fixtures/generateSwebenchVerifiedFixture.js.map +1 -0
  16. package/dist/__tests__/fixtures/generateToolbenchInstructionFixture.d.ts +16 -0
  17. package/dist/__tests__/fixtures/generateToolbenchInstructionFixture.js +154 -0
  18. package/dist/__tests__/fixtures/generateToolbenchInstructionFixture.js.map +1 -0
  19. package/dist/__tests__/fixtures/swebench_verified.sample.json +162 -0
  20. package/dist/__tests__/fixtures/toolbench_instruction.sample.json +109 -0
  21. package/dist/__tests__/openDatasetParallelEval.test.d.ts +7 -0
  22. package/dist/__tests__/openDatasetParallelEval.test.js +209 -0
  23. package/dist/__tests__/openDatasetParallelEval.test.js.map +1 -0
  24. package/dist/__tests__/openDatasetParallelEvalSwebench.test.d.ts +7 -0
  25. package/dist/__tests__/openDatasetParallelEvalSwebench.test.js +220 -0
  26. package/dist/__tests__/openDatasetParallelEvalSwebench.test.js.map +1 -0
  27. package/dist/__tests__/openDatasetParallelEvalToolbench.test.d.ts +7 -0
  28. package/dist/__tests__/openDatasetParallelEvalToolbench.test.js +218 -0
  29. package/dist/__tests__/openDatasetParallelEvalToolbench.test.js.map +1 -0
  30. package/dist/__tests__/tools.test.js +252 -3
  31. package/dist/__tests__/tools.test.js.map +1 -1
  32. package/dist/db.js +20 -0
  33. package/dist/db.js.map +1 -1
  34. package/dist/index.js +2 -0
  35. package/dist/index.js.map +1 -1
  36. package/dist/tools/agentBootstrapTools.d.ts +5 -1
  37. package/dist/tools/agentBootstrapTools.js +566 -1
  38. package/dist/tools/agentBootstrapTools.js.map +1 -1
  39. package/dist/tools/documentationTools.js +102 -8
  40. package/dist/tools/documentationTools.js.map +1 -1
  41. package/dist/tools/learningTools.js +6 -2
  42. package/dist/tools/learningTools.js.map +1 -1
  43. package/dist/tools/metaTools.js +112 -1
  44. package/dist/tools/metaTools.js.map +1 -1
  45. package/dist/tools/selfEvalTools.d.ts +12 -0
  46. package/dist/tools/selfEvalTools.js +568 -0
  47. package/dist/tools/selfEvalTools.js.map +1 -0
  48. package/package.json +11 -3
@@ -0,0 +1,830 @@
1
+ /**
2
+ * Eval Harness for NodeBench MCP Tools
3
+ *
4
+ * Tests REAL agent scenarios to prove tools work in practice.
5
+ * Each scenario exercises multiple tools in realistic workflows.
6
+ *
7
+ * Coverage Goals:
8
+ * - Every tool called at least once
9
+ * - Every methodology workflow tested
10
+ * - Cross-tool integration verified
11
+ */
12
+ import { describe, it, expect } from "vitest";
13
+ import { verificationTools } from "../tools/verificationTools.js";
14
+ import { reconTools } from "../tools/reconTools.js";
15
+ import { evalTools } from "../tools/evalTools.js";
16
+ import { qualityGateTools } from "../tools/qualityGateTools.js";
17
+ import { flywheelTools } from "../tools/flywheelTools.js";
18
+ import { learningTools } from "../tools/learningTools.js";
19
+ import { documentationTools } from "../tools/documentationTools.js";
20
+ import { agentBootstrapTools } from "../tools/agentBootstrapTools.js";
21
+ import { selfEvalTools } from "../tools/selfEvalTools.js";
22
+ import { createMetaTools } from "../tools/metaTools.js";
23
+ // Assemble all tools
24
+ const domainTools = [
25
+ ...verificationTools,
26
+ ...evalTools,
27
+ ...qualityGateTools,
28
+ ...learningTools,
29
+ ...flywheelTools,
30
+ ...reconTools,
31
+ ...documentationTools,
32
+ ...agentBootstrapTools,
33
+ ...selfEvalTools,
34
+ ];
35
+ const allTools = [...domainTools, ...createMetaTools(domainTools)];
36
+ const findTool = (name) => {
37
+ const tool = allTools.find((t) => t.name === name);
38
+ if (!tool)
39
+ throw new Error(`Tool not found: ${name}`);
40
+ return tool;
41
+ };
42
+ // Track which tools are called
43
+ const toolCallLog = [];
44
+ async function callTool(name, args, scenario) {
45
+ const tool = findTool(name);
46
+ try {
47
+ const result = await tool.handler(args);
48
+ toolCallLog.push({ tool: name, scenario, success: true });
49
+ return result;
50
+ }
51
+ catch (error) {
52
+ toolCallLog.push({ tool: name, scenario, success: false });
53
+ throw error;
54
+ }
55
+ }
56
+ // ═══════════════════════════════════════════════════════════════════════════
57
+ // SCENARIO 1: New Feature Development (verification methodology)
58
+ // ═══════════════════════════════════════════════════════════════════════════
59
+ describe("Scenario: New Feature Development", () => {
60
+ let cycleId;
61
+ let gapId;
62
+ it("Step 1: Start verification cycle", async () => {
63
+ const result = await callTool("start_verification_cycle", {
64
+ title: "eval-feature-development",
65
+ description: "Implementing user authentication",
66
+ }, "feature-dev");
67
+ expect(result.cycleId).toBeTruthy();
68
+ cycleId = result.cycleId;
69
+ });
70
+ it("Step 2: Log context gathering (Phase 1)", async () => {
71
+ const result = await callTool("log_phase_findings", {
72
+ cycleId,
73
+ phaseNumber: 1,
74
+ status: "passed",
75
+ findings: { patterns: ["JWT", "session-based"], recommendation: "use JWT" },
76
+ }, "feature-dev");
77
+ expect(result.phaseRecorded).toBe(1);
78
+ });
79
+ it("Step 3: Log a gap found during implementation", async () => {
80
+ const result = await callTool("log_gap", {
81
+ cycleId,
82
+ severity: "MEDIUM",
83
+ title: "Missing rate limiting",
84
+ description: "Auth endpoint needs rate limiting",
85
+ rootCause: "Security oversight",
86
+ fixStrategy: "Add express-rate-limit middleware",
87
+ }, "feature-dev");
88
+ expect(result.gapId).toBeTruthy();
89
+ gapId = result.gapId;
90
+ });
91
+ it("Step 4: Get verification status", async () => {
92
+ const result = await callTool("get_verification_status", {
93
+ cycleId,
94
+ }, "feature-dev");
95
+ // Tool returns status (active/completed/abandoned), currentPhase, etc.
96
+ expect(result.status).toBeTruthy();
97
+ });
98
+ it("Step 5: Resolve the gap", async () => {
99
+ const result = await callTool("resolve_gap", {
100
+ gapId,
101
+ }, "feature-dev");
102
+ expect(result.status).toBe("resolved");
103
+ });
104
+ it("Step 6: Log test result", async () => {
105
+ const result = await callTool("log_test_result", {
106
+ cycleId,
107
+ layer: "integration", // Required field
108
+ label: "auth-integration-test", // Required field (not testName)
109
+ passed: true,
110
+ output: "All auth flows passing",
111
+ }, "feature-dev");
112
+ expect(result.testId).toBeTruthy();
113
+ });
114
+ it("Step 7: Cleanup - abandon cycle", async () => {
115
+ const result = await callTool("abandon_cycle", {
116
+ cycleId,
117
+ reason: "eval harness cleanup",
118
+ }, "feature-dev");
119
+ expect(result.abandoned).toBe(true);
120
+ });
121
+ });
122
+ // ═══════════════════════════════════════════════════════════════════════════
123
+ // SCENARIO 2: Eval-Driven Development (eval methodology)
124
+ // ═══════════════════════════════════════════════════════════════════════════
125
+ describe("Scenario: Eval-Driven Development", () => {
126
+ let evalRunId;
127
+ let caseIds;
128
+ it("Step 1: Start eval run with test cases", async () => {
129
+ // Actual schema: name, cases (with input, intent, expected)
130
+ const result = await callTool("start_eval_run", {
131
+ name: "eval-harness-run",
132
+ description: "Testing prompt quality",
133
+ cases: [
134
+ { input: "Hello", intent: "greeting" },
135
+ { input: "Help me code", intent: "assistance" },
136
+ ],
137
+ }, "eval-driven");
138
+ expect(result.runId).toBeTruthy();
139
+ evalRunId = result.runId;
140
+ caseIds = result.caseIds;
141
+ });
142
+ it("Step 2: Record eval results", async () => {
143
+ // Actual schema: caseId, verdict (pass/fail/partial), actual, score
144
+ const result1 = await callTool("record_eval_result", {
145
+ caseId: caseIds[0],
146
+ actual: "greeting response",
147
+ verdict: "pass",
148
+ score: 0.9,
149
+ }, "eval-driven");
150
+ expect(result1.caseId).toBe(caseIds[0]);
151
+ expect(result1.verdict).toBe("pass");
152
+ const result2 = await callTool("record_eval_result", {
153
+ caseId: caseIds[1],
154
+ actual: "help response",
155
+ verdict: "pass",
156
+ score: 0.85,
157
+ }, "eval-driven");
158
+ expect(result2.caseId).toBe(caseIds[1]);
159
+ });
160
+ it("Step 3: Complete eval run", async () => {
161
+ const result = await callTool("complete_eval_run", {
162
+ runId: evalRunId,
163
+ }, "eval-driven");
164
+ expect(result.runId).toBe(evalRunId);
165
+ expect(result.status).toBe("completed");
166
+ expect(result.summary).toBeDefined();
167
+ });
168
+ it("Step 4: List eval runs", async () => {
169
+ const result = await callTool("list_eval_runs", {
170
+ limit: 10,
171
+ }, "eval-driven");
172
+ expect(result.runs).toBeDefined();
173
+ expect(result.runs.length).toBeGreaterThan(0);
174
+ });
175
+ });
176
+ // ═══════════════════════════════════════════════════════════════════════════
177
+ // SCENARIO 3: Knowledge Management (learning methodology)
178
+ // ═══════════════════════════════════════════════════════════════════════════
179
+ describe("Scenario: Knowledge Management", () => {
180
+ const uniqueKey = `eval-learning-${Date.now()}`;
181
+ it("Step 1: Record a learning", async () => {
182
+ const result = await callTool("record_learning", {
183
+ key: uniqueKey,
184
+ category: "pattern",
185
+ content: "Use scenario-based testing to verify tool chains work together",
186
+ tags: ["testing", "eval", "integration"],
187
+ }, "knowledge");
188
+ expect(result.key).toBe(uniqueKey);
189
+ expect(result.success).toBe(true);
190
+ });
191
+ it("Step 2: Search for the learning", async () => {
192
+ // Returns { query, count, learnings: [...] }
193
+ const result = await callTool("search_learnings", {
194
+ query: "scenario testing",
195
+ }, "knowledge");
196
+ expect(result.learnings).toBeDefined();
197
+ });
198
+ it("Step 3: List all learnings", async () => {
199
+ const result = await callTool("list_learnings", {
200
+ limit: 20,
201
+ }, "knowledge");
202
+ expect(result.learnings).toBeDefined();
203
+ });
204
+ it("Step 4: Delete the learning", async () => {
205
+ // Returns { success: true, message }
206
+ const result = await callTool("delete_learning", {
207
+ key: uniqueKey,
208
+ }, "knowledge");
209
+ expect(result.success).toBe(true);
210
+ });
211
+ });
212
+ // ═══════════════════════════════════════════════════════════════════════════
213
+ // SCENARIO 4: Quality Gates (quality_gates methodology)
214
+ // ═══════════════════════════════════════════════════════════════════════════
215
+ describe("Scenario: Quality Gates", () => {
216
+ it("Step 1: Get deploy_readiness preset", async () => {
217
+ const result = await callTool("get_gate_preset", {
218
+ preset: "deploy_readiness",
219
+ }, "quality-gates");
220
+ expect(result.preset).toBe("deploy_readiness");
221
+ expect(result.rules.length).toBeGreaterThan(0);
222
+ });
223
+ it("Step 2: Run quality gate", async () => {
224
+ const result = await callTool("run_quality_gate", {
225
+ gateName: "deploy_readiness",
226
+ target: "eval-harness-test",
227
+ rules: [
228
+ { name: "tests_pass", passed: true },
229
+ { name: "no_type_errors", passed: true },
230
+ { name: "no_lint_errors", passed: true },
231
+ { name: "coverage_threshold", passed: false },
232
+ ],
233
+ }, "quality-gates");
234
+ expect(result.passed).toBe(false);
235
+ expect(result.failures).toContain("coverage_threshold");
236
+ });
237
+ it("Step 3: Get gate history", async () => {
238
+ // Returns { gateName, runs: [...], trend }
239
+ const result = await callTool("get_gate_history", {
240
+ gateName: "deploy_readiness",
241
+ limit: 10,
242
+ }, "quality-gates");
243
+ expect(result.gateName).toBe("deploy_readiness");
244
+ expect(result.runs).toBeDefined();
245
+ });
246
+ it("Step 4: Run closed loop verification", async () => {
247
+ // Actual schema: steps with { step: enum, passed: boolean }
248
+ const result = await callTool("run_closed_loop", {
249
+ steps: [
250
+ { step: "compile", passed: true },
251
+ { step: "lint", passed: true },
252
+ { step: "test", passed: true },
253
+ ],
254
+ }, "quality-gates");
255
+ expect(result.allPassed).toBe(true);
256
+ });
257
+ });
258
+ // ═══════════════════════════════════════════════════════════════════════════
259
+ // SCENARIO 5: Flywheel Orchestration (flywheel methodology)
260
+ // ═══════════════════════════════════════════════════════════════════════════
261
+ describe("Scenario: Flywheel Orchestration", () => {
262
+ it("Step 1: Get flywheel status", async () => {
263
+ // Returns { innerLoop, outerLoop, connections }
264
+ const result = await callTool("get_flywheel_status", {}, "flywheel");
265
+ expect(result).toHaveProperty("innerLoop");
266
+ expect(result).toHaveProperty("outerLoop");
267
+ expect(result).toHaveProperty("connections");
268
+ });
269
+ it("Step 2: Run mandatory flywheel check", async () => {
270
+ // Actual schema: target, steps array with stepName enum
271
+ const result = await callTool("run_mandatory_flywheel", {
272
+ target: "Added new auth feature",
273
+ steps: [
274
+ { stepName: "static_analysis", passed: true },
275
+ { stepName: "happy_path_test", passed: true },
276
+ { stepName: "failure_path_test", passed: true },
277
+ { stepName: "gap_analysis", passed: true },
278
+ { stepName: "fix_and_reverify", passed: true },
279
+ { stepName: "deploy_and_document", passed: true },
280
+ ],
281
+ }, "flywheel");
282
+ expect(result).toHaveProperty("passed");
283
+ expect(result.passed).toBe(true);
284
+ });
285
+ });
286
+ // ═══════════════════════════════════════════════════════════════════════════
287
+ // SCENARIO 5.5: Flywheel Integration (promote, investigate, compare)
288
+ // Tests the 4 previously untested flywheel integration tools
289
+ // ═══════════════════════════════════════════════════════════════════════════
290
+ describe("Scenario: Flywheel Integration", () => {
291
+ // Test the 4 previously untested flywheel tools in isolated tests
292
+ it("Step 1: list_verification_cycles - lists cycles", async () => {
293
+ // First create a cycle so we have something to list
294
+ const createResult = await callTool("start_verification_cycle", {
295
+ title: "List test cycle", // Note: title, not goal
296
+ description: "Testing list_verification_cycles",
297
+ }, "flywheel-integration");
298
+ const testCycleId = createResult.cycleId;
299
+ // Now list cycles
300
+ const result = await callTool("list_verification_cycles", {
301
+ limit: 10,
302
+ }, "flywheel-integration");
303
+ expect(result).toHaveProperty("count");
304
+ expect(result).toHaveProperty("cycles");
305
+ expect(Array.isArray(result.cycles)).toBe(true);
306
+ // Each cycle has cycleId property (not id)
307
+ expect(result.cycles.some((c) => c.cycleId === testCycleId)).toBe(true);
308
+ // Cleanup
309
+ await callTool("abandon_cycle", {
310
+ cycleId: testCycleId,
311
+ reason: "Test cleanup",
312
+ }, "flywheel-integration");
313
+ });
314
+ it("Step 2: promote_to_eval - promotes verification to eval suite", async () => {
315
+ // Create a cycle to promote from
316
+ const cycleResult = await callTool("start_verification_cycle", {
317
+ title: "Promote test cycle",
318
+ description: "Testing promote_to_eval",
319
+ }, "flywheel-integration");
320
+ // Promote with explicit cases (required)
321
+ const result = await callTool("promote_to_eval", {
322
+ cycleId: cycleResult.cycleId,
323
+ evalRunName: "promoted-eval-test",
324
+ cases: [
325
+ { input: "test input", intent: "Test intent" },
326
+ ],
327
+ }, "flywheel-integration");
328
+ expect(result).toHaveProperty("evalRunId");
329
+ expect(result).toHaveProperty("caseIds");
330
+ expect(result.caseCount).toBe(1);
331
+ // Cleanup
332
+ await callTool("abandon_cycle", {
333
+ cycleId: cycleResult.cycleId,
334
+ reason: "Test cleanup",
335
+ }, "flywheel-integration");
336
+ });
337
+ it("Step 3: compare_eval_runs - compares two completed evals", async () => {
338
+ // Create and complete baseline eval
339
+ const baseline = await callTool("start_eval_run", {
340
+ name: "baseline-for-compare",
341
+ cases: [{ input: "test", intent: "baseline" }],
342
+ }, "flywheel-integration");
343
+ await callTool("record_eval_result", {
344
+ caseId: baseline.caseIds[0],
345
+ actual: "result",
346
+ verdict: "pass",
347
+ }, "flywheel-integration");
348
+ await callTool("complete_eval_run", {
349
+ runId: baseline.runId,
350
+ }, "flywheel-integration");
351
+ // Create and complete candidate eval
352
+ const candidate = await callTool("start_eval_run", {
353
+ name: "candidate-for-compare",
354
+ cases: [{ input: "test", intent: "candidate" }],
355
+ }, "flywheel-integration");
356
+ await callTool("record_eval_result", {
357
+ caseId: candidate.caseIds[0],
358
+ actual: "result",
359
+ verdict: "pass",
360
+ }, "flywheel-integration");
361
+ await callTool("complete_eval_run", {
362
+ runId: candidate.runId,
363
+ }, "flywheel-integration");
364
+ // Compare them
365
+ const result = await callTool("compare_eval_runs", {
366
+ baselineRunId: baseline.runId,
367
+ candidateRunId: candidate.runId,
368
+ }, "flywheel-integration");
369
+ expect(result).toHaveProperty("recommendation");
370
+ expect(["DEPLOY", "REVERT", "INVESTIGATE"]).toContain(result.recommendation);
371
+ });
372
+ it("Step 4: trigger_investigation - creates investigation cycle", async () => {
373
+ // Create and complete an eval run to investigate
374
+ const eval1 = await callTool("start_eval_run", {
375
+ name: "eval-to-investigate",
376
+ cases: [{ input: "test", intent: "investigate" }],
377
+ }, "flywheel-integration");
378
+ await callTool("record_eval_result", {
379
+ caseId: eval1.caseIds[0],
380
+ actual: "failed",
381
+ verdict: "fail",
382
+ }, "flywheel-integration");
383
+ await callTool("complete_eval_run", {
384
+ runId: eval1.runId,
385
+ }, "flywheel-integration");
386
+ // Trigger investigation
387
+ const result = await callTool("trigger_investigation", {
388
+ evalRunId: eval1.runId,
389
+ regressionDescription: "Test failure detected",
390
+ }, "flywheel-integration");
391
+ // Returns cycleId, title, linkedEvalRun, phase1Instructions
392
+ expect(result).toHaveProperty("cycleId");
393
+ expect(result).toHaveProperty("title");
394
+ expect(result).toHaveProperty("linkedEvalRun");
395
+ // Cleanup
396
+ await callTool("abandon_cycle", {
397
+ cycleId: result.cycleId,
398
+ reason: "Test cleanup",
399
+ }, "flywheel-integration");
400
+ });
401
+ });
402
+ // ═══════════════════════════════════════════════════════════════════════════
403
+ // SCENARIO 6: Research & Discovery (recon methodology)
404
+ // ═══════════════════════════════════════════════════════════════════════════
405
+ describe("Scenario: Research & Discovery", () => {
406
+ let reconSessionId;
407
+ it("Step 1: Start recon session", async () => {
408
+ // Actual schema: target (required), description, projectContext
409
+ const result = await callTool("run_recon", {
410
+ target: "MCP server best practices",
411
+ description: "Research for eval harness",
412
+ }, "research");
413
+ expect(result.sessionId).toBeTruthy();
414
+ reconSessionId = result.sessionId;
415
+ });
416
+ it("Step 2: Log recon finding", async () => {
417
+ // Actual schema: sessionId, category (enum), summary, sourceUrl, relevance
418
+ const result = await callTool("log_recon_finding", {
419
+ sessionId: reconSessionId,
420
+ category: "best_practice",
421
+ summary: "Organize tools by domain for better discoverability",
422
+ sourceUrl: "https://docs.anthropic.com",
423
+ relevance: "Applies to MCP tool organization",
424
+ }, "research");
425
+ expect(result.findingId).toBeTruthy();
426
+ expect(result.findingCount).toBeGreaterThan(0);
427
+ });
428
+ it("Step 3: Get recon summary", async () => {
429
+ // Returns { sessionId, target, status, totalFindings, findingsByCategory, ... }
430
+ const result = await callTool("get_recon_summary", {
431
+ sessionId: reconSessionId,
432
+ }, "research");
433
+ expect(result.sessionId).toBe(reconSessionId);
434
+ expect(result.totalFindings).toBeGreaterThan(0);
435
+ });
436
+ it("Step 4: Check framework updates", async () => {
437
+ // Actual schema: ecosystem (enum)
438
+ const result = await callTool("check_framework_updates", {
439
+ ecosystem: "mcp",
440
+ }, "research");
441
+ expect(result.ecosystem).toBe("mcp");
442
+ expect(result.sources).toBeDefined();
443
+ });
444
+ it("Step 5: Bootstrap project context", async () => {
445
+ // Actual schema: projectName (required), techStack, architecture, etc.
446
+ const result = await callTool("bootstrap_project", {
447
+ projectName: "eval-harness-project",
448
+ techStack: "TypeScript, Vitest, MCP",
449
+ architecture: "Modular tool system",
450
+ }, "research");
451
+ expect(result.projectName).toBe("eval-harness-project");
452
+ expect(result.storedFields).toBeDefined();
453
+ });
454
+ it("Step 6: Get project context", async () => {
455
+ // Returns { context: {}, knowledgeBase: {} }
456
+ const result = await callTool("get_project_context", {}, "research");
457
+ expect(result).toHaveProperty("context");
458
+ expect(result).toHaveProperty("knowledgeBase");
459
+ });
460
+ it("Step 7: Search all knowledge", async () => {
461
+ const result = await callTool("search_all_knowledge", {
462
+ query: "MCP tools",
463
+ }, "research");
464
+ expect(result).toHaveProperty("learnings");
465
+ expect(result).toHaveProperty("reconFindings");
466
+ expect(result).toHaveProperty("gaps");
467
+ });
468
+ });
469
+ // ═══════════════════════════════════════════════════════════════════════════
470
+ // SCENARIO 7: Agent Self-Bootstrap (agent_bootstrap methodology)
471
+ // ═══════════════════════════════════════════════════════════════════════════
472
+ describe("Scenario: Agent Self-Bootstrap", () => {
473
+ it("Step 1: Discover infrastructure", async () => {
474
+ const result = await callTool("discover_infrastructure", {
475
+ categories: ["agent_loop", "telemetry"],
476
+ depth: "shallow",
477
+ }, "bootstrap");
478
+ expect(result).toHaveProperty("discovered");
479
+ expect(result).toHaveProperty("missing");
480
+ });
481
+ it("Step 2: Triple verify a component", async () => {
482
+ const result = await callTool("triple_verify", {
483
+ target: "verification-tools",
484
+ scope: "implementation", // Valid: implementation|integration|deployment|full
485
+ includeWebSearch: false,
486
+ }, "bootstrap");
487
+ // Returns verification1_internal, verification2_external, verification3_synthesis
488
+ expect(result).toHaveProperty("verification1_internal");
489
+ expect(result).toHaveProperty("verification2_external");
490
+ expect(result).toHaveProperty("verification3_synthesis");
491
+ });
492
+ it("Step 3: Self-implement missing component", async () => {
493
+ const result = await callTool("self_implement", {
494
+ component: "telemetry",
495
+ dryRun: true,
496
+ }, "bootstrap");
497
+ expect(result).toHaveProperty("component");
498
+ // Returns plan, files, nextSteps
499
+ expect(result).toHaveProperty("plan");
500
+ expect(result).toHaveProperty("files");
501
+ });
502
+ it("Step 4: Generate self-instructions", async () => {
503
+ const result = await callTool("generate_self_instructions", {
504
+ format: "claude_md",
505
+ includeExternalSources: false,
506
+ }, "bootstrap");
507
+ expect(result).toHaveProperty("format");
508
+ // Returns content (not instructions)
509
+ expect(result).toHaveProperty("content");
510
+ });
511
+ it("Step 5: Connect channels", async () => {
512
+ const result = await callTool("connect_channels", {
513
+ channels: ["web", "github"],
514
+ query: "mcp tools",
515
+ aggressive: false,
516
+ }, "bootstrap");
517
+ // Returns query, results (array of {channel, findings, sources})
518
+ expect(result).toHaveProperty("query");
519
+ expect(result).toHaveProperty("results");
520
+ expect(Array.isArray(result.results)).toBe(true);
521
+ });
522
+ });
523
+ // ═══════════════════════════════════════════════════════════════════════════
524
+ // SCENARIO 8: Autonomous Maintenance (autonomous_maintenance methodology)
525
+ // ═══════════════════════════════════════════════════════════════════════════
526
+ describe("Scenario: Autonomous Maintenance", () => {
527
+ it("Step 1: Assess risk before action", async () => {
528
+ const result = await callTool("assess_risk", {
529
+ action: "update_agents_md",
530
+ context: "Adding new documentation",
531
+ }, "autonomous");
532
+ expect(result.assessment.tier).toBe("medium");
533
+ expect(result.assessment.recommendation).toBe("log_and_proceed");
534
+ });
535
+ it("Step 2: Decide re-update vs create", async () => {
536
+ const result = await callTool("decide_re_update", {
537
+ targetContent: "New methodology documentation",
538
+ contentType: "documentation",
539
+ existingFiles: ["README.md", "AGENTS.md"],
540
+ }, "autonomous");
541
+ expect(["update_existing", "create_new", "merge"]).toContain(result.action);
542
+ });
543
+ it("Step 3: Run self-maintenance", async () => {
544
+ const result = await callTool("run_self_maintenance", {
545
+ scope: "quick",
546
+ autoFix: false,
547
+ dryRun: true,
548
+ }, "autonomous");
549
+ expect(result).toHaveProperty("checksPerformed");
550
+ expect(result).toHaveProperty("issuesFound");
551
+ });
552
+ it("Step 4: Scaffold directory structure", async () => {
553
+ const result = await callTool("scaffold_directory", {
554
+ component: "agent_loop",
555
+ includeTests: true,
556
+ dryRun: true,
557
+ }, "autonomous");
558
+ expect(result.component).toBe("agent_loop");
559
+ expect(result.structure.files.length).toBeGreaterThan(0);
560
+ });
561
+ it("Step 5: Run autonomous loop with guardrails", async () => {
562
+ const result = await callTool("run_autonomous_loop", {
563
+ goal: "Verify all documentation is in sync",
564
+ maxIterations: 3,
565
+ maxDurationMs: 5000,
566
+ stopOnFirstFailure: true,
567
+ }, "autonomous");
568
+ expect(result.goal).toBeTruthy();
569
+ expect(result.iterations).toBeLessThanOrEqual(3);
570
+ expect(["completed", "stopped", "timeout", "failed"]).toContain(result.status);
571
+ });
572
+ });
573
+ // ═══════════════════════════════════════════════════════════════════════════
574
+ // SCENARIO 9: Meta Tools (tool discovery)
575
+ // ═══════════════════════════════════════════════════════════════════════════
576
+ describe("Scenario: Meta Tool Discovery", () => {
577
+ it("Step 1: Find tools by keyword", async () => {
578
+ const result = await callTool("findTools", {
579
+ query: "verification",
580
+ }, "meta");
581
+ expect(result.tools.length).toBeGreaterThan(0);
582
+ });
583
+ it("Step 2: Find tools by category", async () => {
584
+ const result = await callTool("findTools", {
585
+ category: "bootstrap",
586
+ }, "meta");
587
+ expect(result.tools.length).toBeGreaterThan(0);
588
+ });
589
+ it("Step 3: Get methodology overview", async () => {
590
+ const result = await callTool("getMethodology", {
591
+ topic: "overview",
592
+ }, "meta");
593
+ expect(result.title).toContain("Overview");
594
+ const topics = Object.keys(result.steps[0].topics);
595
+ expect(topics.length).toBe(17);
596
+ });
597
+ it("Step 4: Get specific methodology", async () => {
598
+ const methodologies = [
599
+ "verification", "eval", "flywheel", "mandatory_flywheel",
600
+ "reconnaissance", "quality_gates", "ui_ux_qa", "agentic_vision",
601
+ "closed_loop", "learnings", "project_ideation", "tech_stack_2026",
602
+ "telemetry_setup", "agents_md_maintenance", "agent_bootstrap",
603
+ "autonomous_maintenance",
604
+ "self_reinforced_learning",
605
+ ];
606
+ for (const topic of methodologies) {
607
+ const result = await callTool("getMethodology", { topic }, "meta");
608
+ expect(result.title).toBeTruthy();
609
+ expect(result.steps.length).toBeGreaterThan(0);
610
+ }
611
+ });
612
+ });
613
+ // ═══════════════════════════════════════════════════════════════════════════
614
+ // SCENARIO 10: Self-Reinforced Learning (trajectory analysis)
615
+ // ═══════════════════════════════════════════════════════════════════════════
616
+ describe("Scenario: Self-Reinforced Learning", () => {
617
+ it("Step 1: Log tool calls to build trajectory data", async () => {
618
+ const result = await callTool("log_tool_call", {
619
+ sessionId: "eval-harness-self-eval",
620
+ toolName: "start_verification_cycle",
621
+ durationMs: 25,
622
+ resultStatus: "success",
623
+ phase: "verification",
624
+ }, "self-eval");
625
+ expect(result.logged).toBe(true);
626
+ await callTool("log_tool_call", {
627
+ sessionId: "eval-harness-self-eval",
628
+ toolName: "log_phase_findings",
629
+ durationMs: 12,
630
+ resultStatus: "success",
631
+ phase: "verification",
632
+ }, "self-eval");
633
+ await callTool("log_tool_call", {
634
+ sessionId: "eval-harness-self-eval",
635
+ toolName: "run_mandatory_flywheel",
636
+ durationMs: 35,
637
+ resultStatus: "success",
638
+ phase: "flywheel",
639
+ }, "self-eval");
640
+ });
641
+ it("Step 2: Analyze trajectory patterns", async () => {
642
+ const result = await callTool("get_trajectory_analysis", {
643
+ sessionId: "eval-harness-self-eval",
644
+ }, "self-eval");
645
+ expect(result.totalCalls).toBeGreaterThanOrEqual(3);
646
+ expect(result.uniqueTools).toBeGreaterThanOrEqual(3);
647
+ expect(result.topTools.length).toBeGreaterThan(0);
648
+ });
649
+ it("Step 3: Generate self-eval health report", async () => {
650
+ const result = await callTool("get_self_eval_report", {
651
+ sinceDaysAgo: 30,
652
+ }, "self-eval");
653
+ expect(typeof result.healthScore).toBe("number");
654
+ expect(result).toHaveProperty("verification");
655
+ expect(result).toHaveProperty("gaps");
656
+ expect(result).toHaveProperty("evalRuns");
657
+ expect(result).toHaveProperty("toolTrajectory");
658
+ });
659
+ it("Step 4: Get improvement recommendations", async () => {
660
+ const result = await callTool("get_improvement_recommendations", {
661
+ sinceDaysAgo: 30,
662
+ focus: "all",
663
+ }, "self-eval");
664
+ expect(typeof result.totalRecommendations).toBe("number");
665
+ expect(Array.isArray(result.recommendations)).toBe(true);
666
+ expect(result._selfReinforcement.nextSteps.length).toBe(4);
667
+ });
668
+ });
669
// ═══════════════════════════════════════════════════════════════════════════
// COVERAGE REPORT
// ═══════════════════════════════════════════════════════════════════════════
describe("Coverage Report", () => {
    it("should generate comprehensive Proof of Work report", () => {
        // Integer percentage that tolerates an empty denominator (the raw
        // `Math.round(a / b * 100)` form printed "NaN%" on an empty log).
        const pct = (num, den) => (den === 0 ? 0 : Math.round((num / den) * 100));
        const testedTools = new Set(toolCallLog.map(l => l.tool));
        const allToolNames = allTools.map(t => t.name);
        // Tools that require external dependencies (skip in automated tests)
        const externalDependencyTools = [
            "capture_ui_screenshot", // Requires Playwright
            "capture_responsive_suite", // Requires Playwright
            "discover_vision_env", // Dynamic SDK imports
            "analyze_screenshot", // Requires AI API key
            "manipulate_screenshot", // Requires Sharp
            "web_search", // Requires AI API key
            "fetch_url", // External network calls
            "search_github", // Requires GitHub API
            "analyze_repo", // Requires GitHub API
            "update_agents_md", // File system - tested separately
            "research_job_market", // Covered in tools.test.ts
            "setup_local_env", // Covered in tools.test.ts
        ];
        // Deprecated tools (kept for backwards compatibility, but flagged)
        const deprecatedTools = [
            { tool: "search_learnings", reason: "DEPRECATED: Use search_all_knowledge instead" },
            { tool: "list_learnings", reason: "DEPRECATED: Use search_all_knowledge instead" },
        ];
        const untestedTools = allToolNames.filter(name => !testedTools.has(name) && !externalDependencyTools.includes(name));
        // NOTE: a `toolScenarioMap` tool-by-scenario matrix was previously
        // built here but never read anywhere below — removed as dead code.
        // Count successes and failures
        const successCount = toolCallLog.filter(l => l.success).length;
        const failureCount = toolCallLog.filter(l => !l.success).length;
        // Build scenario summary: scenario -> { unique tools touched, success/fail tallies }
        const byScenario = new Map();
        toolCallLog.forEach(l => {
            if (!byScenario.has(l.scenario)) {
                byScenario.set(l.scenario, { tools: [], success: 0, fail: 0 });
            }
            const s = byScenario.get(l.scenario);
            if (!s.tools.includes(l.tool))
                s.tools.push(l.tool);
            if (l.success)
                s.success++;
            else
                s.fail++;
        });
        console.log("\n");
        console.log("╔═══════════════════════════════════════════════════════════════════════════╗");
        console.log("║              NODEBENCH MCP - PROOF OF WORK REPORT                           ║");
        console.log("╚═══════════════════════════════════════════════════════════════════════════╝");
        console.log("");
        console.log("┌─────────────────────────────────────────────────────────────────────────────┐");
        console.log("│ SUMMARY                                                                     │");
        console.log("├─────────────────────────────────────────────────────────────────────────────┤");
        console.log(`│ Total Tools in MCP:        ${String(allToolNames.length).padStart(3)}                                              │`);
        console.log(`│ Tools Tested in Scenarios: ${String(testedTools.size).padStart(3)} (${pct(testedTools.size, allToolNames.length)}%)                                        │`);
        console.log(`│ External Dependency (skip): ${String(externalDependencyTools.length).padStart(3)} (require API keys/network)                     │`);
        console.log(`│ Untested (GAPS):           ${String(untestedTools.length).padStart(3)}                                              │`);
        console.log(`│ Total Tool Calls:          ${String(toolCallLog.length).padStart(3)}                                              │`);
        console.log(`│ Success Rate:              ${successCount}/${toolCallLog.length} (${pct(successCount, toolCallLog.length)}%)                                     │`);
        console.log("└─────────────────────────────────────────────────────────────────────────────┘");
        console.log("");
        // Scenario breakdown
        console.log("┌─────────────────────────────────────────────────────────────────────────────┐");
        console.log("│ SCENARIOS TESTED                                                            │");
        console.log("├─────────────────────────────────────────────────────────────────────────────┤");
        byScenario.forEach((data, scenario) => {
            const status = data.fail === 0 ? "✓" : "✗";
            // FIX: this line previously printed only `data.success` while
            // labelling it "calls"; the call count for a scenario is
            // successes plus failures.
            const line = `│ ${status} ${scenario.padEnd(25)} ${String(data.tools.length).padStart(2)} tools, ${String(data.success + data.fail).padStart(2)} calls`;
            console.log(line.padEnd(78) + "│");
        });
        console.log("└─────────────────────────────────────────────────────────────────────────────┘");
        console.log("");
        // Tool coverage matrix (grouped by domain)
        const domainMap = {
            "Verification": ["start_verification_cycle", "log_phase_findings", "log_gap", "resolve_gap", "log_test_result", "get_verification_status", "list_verification_cycles", "abandon_cycle"],
            "Eval": ["start_eval_run", "record_eval_result", "complete_eval_run", "compare_eval_runs", "list_eval_runs"],
            "Quality Gates": ["run_quality_gate", "get_gate_preset", "get_gate_history", "run_closed_loop"],
            "Learning": ["record_learning", "search_learnings", "list_learnings", "delete_learning"],
            "Flywheel": ["get_flywheel_status", "promote_to_eval", "trigger_investigation", "run_mandatory_flywheel"],
            "Recon": ["run_recon", "log_recon_finding", "get_recon_summary", "check_framework_updates", "search_all_knowledge", "bootstrap_project", "get_project_context"],
            "Bootstrap": ["discover_infrastructure", "triple_verify", "self_implement", "generate_self_instructions", "connect_channels"],
            "Autonomous": ["assess_risk", "decide_re_update", "run_self_maintenance", "scaffold_directory", "run_autonomous_loop"],
            "Self-Eval": ["log_tool_call", "get_trajectory_analysis", "get_self_eval_report", "get_improvement_recommendations"],
            "Meta": ["findTools", "getMethodology"],
            "External (skip)": externalDependencyTools,
        };
        console.log("┌─────────────────────────────────────────────────────────────────────────────┐");
        console.log("│ TOOL COVERAGE BY DOMAIN                                                     │");
        console.log("├─────────────────────────────────────────────────────────────────────────────┤");
        for (const [domain, tools] of Object.entries(domainMap)) {
            const tested = tools.filter(t => testedTools.has(t)).length;
            const total = tools.length;
            const coverage = pct(tested, total);
            // 10-segment progress bar: one filled cell per 10 percentage points.
            const bar = "█".repeat(Math.round(coverage / 10)) + "░".repeat(10 - Math.round(coverage / 10));
            const line = `│ ${domain.padEnd(18)} ${bar} ${String(tested).padStart(2)}/${String(total).padStart(2)} (${String(coverage).padStart(3)}%)`;
            console.log(line.padEnd(78) + "│");
        }
        console.log("└─────────────────────────────────────────────────────────────────────────────┘");
        console.log("");
        // Gaps
        if (untestedTools.length > 0) {
            console.log("┌─────────────────────────────────────────────────────────────────────────────┐");
            console.log("│ ⚠ GAPS (Untested Tools)                                                     │");
            console.log("├─────────────────────────────────────────────────────────────────────────────┤");
            untestedTools.forEach(t => {
                console.log(`│   - ${t}`.padEnd(78) + "│");
            });
            console.log("└─────────────────────────────────────────────────────────────────────────────┘");
            console.log("");
        }
        // Deprecated tools analysis
        console.log("┌─────────────────────────────────────────────────────────────────────────────┐");
        console.log("│ ⚠️ DEPRECATED TOOLS                                                          │");
        console.log("├─────────────────────────────────────────────────────────────────────────────┤");
        if (deprecatedTools.length === 0) {
            console.log("│   No deprecated tools.".padEnd(78) + "│");
        }
        else {
            deprecatedTools.forEach(d => {
                console.log(`│   - ${d.tool}: ${d.reason}`.slice(0, 77).padEnd(78) + "│");
            });
        }
        console.log("│                                                                             │");
        console.log("│   These tools are kept for backwards compatibility but return a             │");
        console.log("│   deprecation notice. Use search_all_knowledge for unified search.          │");
        console.log("└─────────────────────────────────────────────────────────────────────────────┘");
        console.log("");
        // Final verdict
        const allCovered = untestedTools.length === 0;
        const allPassed = failureCount === 0;
        console.log("┌─────────────────────────────────────────────────────────────────────────────┐");
        console.log("│ VERDICT                                                                     │");
        console.log("├─────────────────────────────────────────────────────────────────────────────┤");
        if (allCovered && allPassed) {
            console.log("│ ✅ ALL TOOLS TESTED AND WORKING                                             │");
            console.log("│    \"Yah it definitely works!\"                                               │");
        }
        else if (allPassed) {
            console.log("│ ✅ ALL TESTED TOOLS WORKING                                                 │");
            console.log(`│ ⚠  ${untestedTools.length} tools not covered in scenario tests (see gaps above)`.padEnd(78) + "│");
        }
        else {
            console.log(`│ ❌ ${failureCount} tool calls failed - investigate before shipping`.padEnd(78) + "│");
        }
        console.log("└─────────────────────────────────────────────────────────────────────────────┘");
        console.log("");
        // Assert minimum coverage
        expect(testedTools.size).toBeGreaterThan(35); // Should test at least 35 tools
        expect(untestedTools.length).toBe(0); // All non-external tools should be tested
        expect(failureCount).toBe(0); // No failures allowed
    });
});
830
+ //# sourceMappingURL=evalHarness.test.js.map