nodebench-mcp 3.0.0 → 3.0.1

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (168)
  1. package/dist/dashboard/operatingDashboardHtml.js +2 -1
  2. package/dist/dashboard/operatingDashboardHtml.js.map +1 -1
  3. package/dist/dashboard/operatingServer.js +3 -2
  4. package/dist/dashboard/operatingServer.js.map +1 -1
  5. package/dist/db.js +51 -3
  6. package/dist/db.js.map +1 -1
  7. package/dist/index.js +13 -16
  8. package/dist/index.js.map +1 -1
  9. package/dist/packageInfo.d.ts +3 -0
  10. package/dist/packageInfo.js +32 -0
  11. package/dist/packageInfo.js.map +1 -0
  12. package/dist/sandboxApi.js +2 -1
  13. package/dist/sandboxApi.js.map +1 -1
  14. package/dist/tools/boilerplateTools.js +10 -9
  15. package/dist/tools/boilerplateTools.js.map +1 -1
  16. package/dist/tools/documentationTools.js +2 -1
  17. package/dist/tools/documentationTools.js.map +1 -1
  18. package/dist/tools/progressiveDiscoveryTools.js +2 -1
  19. package/dist/tools/progressiveDiscoveryTools.js.map +1 -1
  20. package/dist/tools/toolRegistry.js +11 -0
  21. package/dist/tools/toolRegistry.js.map +1 -1
  22. package/dist/toolsetRegistry.js +74 -1
  23. package/dist/toolsetRegistry.js.map +1 -1
  24. package/package.json +4 -3
  25. package/dist/__tests__/analytics.test.d.ts +0 -11
  26. package/dist/__tests__/analytics.test.js +0 -546
  27. package/dist/__tests__/analytics.test.js.map +0 -1
  28. package/dist/__tests__/architectComplex.test.d.ts +0 -1
  29. package/dist/__tests__/architectComplex.test.js +0 -373
  30. package/dist/__tests__/architectComplex.test.js.map +0 -1
  31. package/dist/__tests__/architectSmoke.test.d.ts +0 -1
  32. package/dist/__tests__/architectSmoke.test.js +0 -92
  33. package/dist/__tests__/architectSmoke.test.js.map +0 -1
  34. package/dist/__tests__/audit-registry.d.ts +0 -1
  35. package/dist/__tests__/audit-registry.js +0 -60
  36. package/dist/__tests__/audit-registry.js.map +0 -1
  37. package/dist/__tests__/batchAutopilot.test.d.ts +0 -8
  38. package/dist/__tests__/batchAutopilot.test.js +0 -218
  39. package/dist/__tests__/batchAutopilot.test.js.map +0 -1
  40. package/dist/__tests__/cliSubcommands.test.d.ts +0 -1
  41. package/dist/__tests__/cliSubcommands.test.js +0 -138
  42. package/dist/__tests__/cliSubcommands.test.js.map +0 -1
  43. package/dist/__tests__/comparativeBench.test.d.ts +0 -1
  44. package/dist/__tests__/comparativeBench.test.js +0 -722
  45. package/dist/__tests__/comparativeBench.test.js.map +0 -1
  46. package/dist/__tests__/critterCalibrationEval.d.ts +0 -8
  47. package/dist/__tests__/critterCalibrationEval.js +0 -370
  48. package/dist/__tests__/critterCalibrationEval.js.map +0 -1
  49. package/dist/__tests__/dynamicLoading.test.d.ts +0 -1
  50. package/dist/__tests__/dynamicLoading.test.js +0 -280
  51. package/dist/__tests__/dynamicLoading.test.js.map +0 -1
  52. package/dist/__tests__/embeddingProvider.test.d.ts +0 -1
  53. package/dist/__tests__/embeddingProvider.test.js +0 -86
  54. package/dist/__tests__/embeddingProvider.test.js.map +0 -1
  55. package/dist/__tests__/evalDatasetBench.test.d.ts +0 -1
  56. package/dist/__tests__/evalDatasetBench.test.js +0 -738
  57. package/dist/__tests__/evalDatasetBench.test.js.map +0 -1
  58. package/dist/__tests__/evalHarness.test.d.ts +0 -1
  59. package/dist/__tests__/evalHarness.test.js +0 -1107
  60. package/dist/__tests__/evalHarness.test.js.map +0 -1
  61. package/dist/__tests__/fixtures/bfcl_v3_long_context.sample.json +0 -264
  62. package/dist/__tests__/fixtures/generateBfclLongContextFixture.d.ts +0 -10
  63. package/dist/__tests__/fixtures/generateBfclLongContextFixture.js +0 -135
  64. package/dist/__tests__/fixtures/generateBfclLongContextFixture.js.map +0 -1
  65. package/dist/__tests__/fixtures/generateSwebenchVerifiedFixture.d.ts +0 -14
  66. package/dist/__tests__/fixtures/generateSwebenchVerifiedFixture.js +0 -189
  67. package/dist/__tests__/fixtures/generateSwebenchVerifiedFixture.js.map +0 -1
  68. package/dist/__tests__/fixtures/generateToolbenchInstructionFixture.d.ts +0 -16
  69. package/dist/__tests__/fixtures/generateToolbenchInstructionFixture.js +0 -154
  70. package/dist/__tests__/fixtures/generateToolbenchInstructionFixture.js.map +0 -1
  71. package/dist/__tests__/fixtures/swebench_verified.sample.json +0 -162
  72. package/dist/__tests__/fixtures/toolbench_instruction.sample.json +0 -109
  73. package/dist/__tests__/forecastingDogfood.test.d.ts +0 -9
  74. package/dist/__tests__/forecastingDogfood.test.js +0 -284
  75. package/dist/__tests__/forecastingDogfood.test.js.map +0 -1
  76. package/dist/__tests__/forecastingScoring.test.d.ts +0 -9
  77. package/dist/__tests__/forecastingScoring.test.js +0 -202
  78. package/dist/__tests__/forecastingScoring.test.js.map +0 -1
  79. package/dist/__tests__/gaiaCapabilityAudioEval.test.d.ts +0 -15
  80. package/dist/__tests__/gaiaCapabilityAudioEval.test.js +0 -265
  81. package/dist/__tests__/gaiaCapabilityAudioEval.test.js.map +0 -1
  82. package/dist/__tests__/gaiaCapabilityEval.test.d.ts +0 -14
  83. package/dist/__tests__/gaiaCapabilityEval.test.js +0 -1259
  84. package/dist/__tests__/gaiaCapabilityEval.test.js.map +0 -1
  85. package/dist/__tests__/gaiaCapabilityFilesEval.test.d.ts +0 -15
  86. package/dist/__tests__/gaiaCapabilityFilesEval.test.js +0 -914
  87. package/dist/__tests__/gaiaCapabilityFilesEval.test.js.map +0 -1
  88. package/dist/__tests__/gaiaCapabilityMediaEval.test.d.ts +0 -15
  89. package/dist/__tests__/gaiaCapabilityMediaEval.test.js +0 -1101
  90. package/dist/__tests__/gaiaCapabilityMediaEval.test.js.map +0 -1
  91. package/dist/__tests__/helpers/answerMatch.d.ts +0 -41
  92. package/dist/__tests__/helpers/answerMatch.js +0 -267
  93. package/dist/__tests__/helpers/answerMatch.js.map +0 -1
  94. package/dist/__tests__/helpers/textLlm.d.ts +0 -25
  95. package/dist/__tests__/helpers/textLlm.js +0 -214
  96. package/dist/__tests__/helpers/textLlm.js.map +0 -1
  97. package/dist/__tests__/localDashboard.test.d.ts +0 -1
  98. package/dist/__tests__/localDashboard.test.js +0 -226
  99. package/dist/__tests__/localDashboard.test.js.map +0 -1
  100. package/dist/__tests__/multiHopDogfood.test.d.ts +0 -12
  101. package/dist/__tests__/multiHopDogfood.test.js +0 -303
  102. package/dist/__tests__/multiHopDogfood.test.js.map +0 -1
  103. package/dist/__tests__/openDatasetParallelEval.test.d.ts +0 -7
  104. package/dist/__tests__/openDatasetParallelEval.test.js +0 -209
  105. package/dist/__tests__/openDatasetParallelEval.test.js.map +0 -1
  106. package/dist/__tests__/openDatasetParallelEvalGaia.test.d.ts +0 -7
  107. package/dist/__tests__/openDatasetParallelEvalGaia.test.js +0 -279
  108. package/dist/__tests__/openDatasetParallelEvalGaia.test.js.map +0 -1
  109. package/dist/__tests__/openDatasetParallelEvalSwebench.test.d.ts +0 -7
  110. package/dist/__tests__/openDatasetParallelEvalSwebench.test.js +0 -220
  111. package/dist/__tests__/openDatasetParallelEvalSwebench.test.js.map +0 -1
  112. package/dist/__tests__/openDatasetParallelEvalToolbench.test.d.ts +0 -7
  113. package/dist/__tests__/openDatasetParallelEvalToolbench.test.js +0 -218
  114. package/dist/__tests__/openDatasetParallelEvalToolbench.test.js.map +0 -1
  115. package/dist/__tests__/openDatasetPerfComparison.test.d.ts +0 -10
  116. package/dist/__tests__/openDatasetPerfComparison.test.js +0 -318
  117. package/dist/__tests__/openDatasetPerfComparison.test.js.map +0 -1
  118. package/dist/__tests__/openclawDogfood.test.d.ts +0 -23
  119. package/dist/__tests__/openclawDogfood.test.js +0 -535
  120. package/dist/__tests__/openclawDogfood.test.js.map +0 -1
  121. package/dist/__tests__/openclawMessaging.test.d.ts +0 -14
  122. package/dist/__tests__/openclawMessaging.test.js +0 -232
  123. package/dist/__tests__/openclawMessaging.test.js.map +0 -1
  124. package/dist/__tests__/presetRealWorldBench.test.d.ts +0 -1
  125. package/dist/__tests__/presetRealWorldBench.test.js +0 -859
  126. package/dist/__tests__/presetRealWorldBench.test.js.map +0 -1
  127. package/dist/__tests__/tools.test.d.ts +0 -1
  128. package/dist/__tests__/tools.test.js +0 -3201
  129. package/dist/__tests__/tools.test.js.map +0 -1
  130. package/dist/__tests__/toolsetGatingEval.test.d.ts +0 -1
  131. package/dist/__tests__/toolsetGatingEval.test.js +0 -1099
  132. package/dist/__tests__/toolsetGatingEval.test.js.map +0 -1
  133. package/dist/__tests__/traceabilityDogfood.test.d.ts +0 -12
  134. package/dist/__tests__/traceabilityDogfood.test.js +0 -241
  135. package/dist/__tests__/traceabilityDogfood.test.js.map +0 -1
  136. package/dist/__tests__/webmcpTools.test.d.ts +0 -7
  137. package/dist/__tests__/webmcpTools.test.js +0 -195
  138. package/dist/__tests__/webmcpTools.test.js.map +0 -1
  139. package/dist/benchmarks/testProviderBus.d.ts +0 -7
  140. package/dist/benchmarks/testProviderBus.js +0 -272
  141. package/dist/benchmarks/testProviderBus.js.map +0 -1
  142. package/dist/hooks/postCompaction.d.ts +0 -14
  143. package/dist/hooks/postCompaction.js +0 -51
  144. package/dist/hooks/postCompaction.js.map +0 -1
  145. package/dist/security/__tests__/security.test.d.ts +0 -8
  146. package/dist/security/__tests__/security.test.js +0 -295
  147. package/dist/security/__tests__/security.test.js.map +0 -1
  148. package/dist/sync/hyperloopEval.test.d.ts +0 -4
  149. package/dist/sync/hyperloopEval.test.js +0 -60
  150. package/dist/sync/hyperloopEval.test.js.map +0 -1
  151. package/dist/sync/store.test.d.ts +0 -4
  152. package/dist/sync/store.test.js +0 -43
  153. package/dist/sync/store.test.js.map +0 -1
  154. package/dist/tools/documentTools.d.ts +0 -5
  155. package/dist/tools/documentTools.js +0 -524
  156. package/dist/tools/documentTools.js.map +0 -1
  157. package/dist/tools/financialTools.d.ts +0 -10
  158. package/dist/tools/financialTools.js +0 -403
  159. package/dist/tools/financialTools.js.map +0 -1
  160. package/dist/tools/memoryTools.d.ts +0 -5
  161. package/dist/tools/memoryTools.js +0 -137
  162. package/dist/tools/memoryTools.js.map +0 -1
  163. package/dist/tools/planningTools.d.ts +0 -5
  164. package/dist/tools/planningTools.js +0 -147
  165. package/dist/tools/planningTools.js.map +0 -1
  166. package/dist/tools/searchTools.d.ts +0 -5
  167. package/dist/tools/searchTools.js +0 -145
  168. package/dist/tools/searchTools.js.map +0 -1
@@ -1,1107 +0,0 @@
1
- /**
2
- * Eval Harness for NodeBench MCP Tools
3
- *
4
- * Tests REAL agent scenarios to prove tools work in practice.
5
- * Each scenario exercises multiple tools in realistic workflows.
6
- *
7
- * Coverage Goals:
8
- * - Every tool called at least once
9
- * - Every methodology workflow tested
10
- * - Cross-tool integration verified
11
- */
12
- import { describe, it, expect } from "vitest";
13
- import { verificationTools } from "../tools/verificationTools.js";
14
- import { reconTools } from "../tools/reconTools.js";
15
- import { evalTools } from "../tools/evalTools.js";
16
- import { qualityGateTools } from "../tools/qualityGateTools.js";
17
- import { flywheelTools } from "../tools/flywheelTools.js";
18
- import { learningTools } from "../tools/learningTools.js";
19
- import { documentationTools } from "../tools/documentationTools.js";
20
- import { agentBootstrapTools } from "../tools/agentBootstrapTools.js";
21
- import { selfEvalTools } from "../tools/selfEvalTools.js";
22
- import { flickerDetectionTools } from "../tools/flickerDetectionTools.js";
23
- import { figmaFlowTools } from "../tools/figmaFlowTools.js";
24
- import { boilerplateTools } from "../tools/boilerplateTools.js";
25
- import { cCompilerBenchmarkTools } from "../tools/cCompilerBenchmarkTools.js";
26
- import { createMetaTools } from "../tools/metaTools.js";
27
- // Assemble all tools
28
- const domainTools = [
29
- ...verificationTools,
30
- ...evalTools,
31
- ...qualityGateTools,
32
- ...learningTools,
33
- ...flywheelTools,
34
- ...reconTools,
35
- ...documentationTools,
36
- ...agentBootstrapTools,
37
- ...selfEvalTools,
38
- ...flickerDetectionTools,
39
- ...figmaFlowTools,
40
- ...boilerplateTools,
41
- ...cCompilerBenchmarkTools,
42
- ];
43
- const allTools = [...domainTools, ...createMetaTools(domainTools)];
44
- const findTool = (name) => {
45
- const tool = allTools.find((t) => t.name === name);
46
- if (!tool)
47
- throw new Error(`Tool not found: ${name}`);
48
- return tool;
49
- };
50
- // Track which tools are called
51
- const toolCallLog = [];
52
- async function callTool(name, args, scenario) {
53
- const tool = findTool(name);
54
- try {
55
- const result = await tool.handler(args);
56
- toolCallLog.push({ tool: name, scenario, success: true });
57
- return result;
58
- }
59
- catch (error) {
60
- toolCallLog.push({ tool: name, scenario, success: false });
61
- throw error;
62
- }
63
- }
64
- // ═══════════════════════════════════════════════════════════════════════════
65
- // SCENARIO 1: New Feature Development (verification methodology)
66
- // ═══════════════════════════════════════════════════════════════════════════
67
- describe("Scenario: New Feature Development", () => {
68
- let cycleId;
69
- let gapId;
70
- it("Step 1: Start verification cycle", async () => {
71
- const result = await callTool("start_verification_cycle", {
72
- title: "eval-feature-development",
73
- description: "Implementing user authentication",
74
- }, "feature-dev");
75
- expect(result.cycleId).toBeTruthy();
76
- cycleId = result.cycleId;
77
- });
78
- it("Step 2: Log context gathering (Phase 1)", async () => {
79
- const result = await callTool("log_phase_findings", {
80
- cycleId,
81
- phaseNumber: 1,
82
- status: "passed",
83
- findings: { patterns: ["JWT", "session-based"], recommendation: "use JWT" },
84
- }, "feature-dev");
85
- expect(result.phaseRecorded).toBe(1);
86
- });
87
- it("Step 3: Log a gap found during implementation", async () => {
88
- const result = await callTool("log_gap", {
89
- cycleId,
90
- severity: "MEDIUM",
91
- title: "Missing rate limiting",
92
- description: "Auth endpoint needs rate limiting",
93
- rootCause: "Security oversight",
94
- fixStrategy: "Add express-rate-limit middleware",
95
- }, "feature-dev");
96
- expect(result.gapId).toBeTruthy();
97
- gapId = result.gapId;
98
- });
99
- it("Step 4: Get verification status", async () => {
100
- const result = await callTool("get_verification_status", {
101
- cycleId,
102
- }, "feature-dev");
103
- // Tool returns status (active/completed/abandoned), currentPhase, etc.
104
- expect(result.status).toBeTruthy();
105
- });
106
- it("Step 5: Resolve the gap", async () => {
107
- const result = await callTool("resolve_gap", {
108
- gapId,
109
- }, "feature-dev");
110
- expect(result.status).toBe("resolved");
111
- });
112
- it("Step 6: Log test result", async () => {
113
- const result = await callTool("log_test_result", {
114
- cycleId,
115
- layer: "integration", // Required field
116
- label: "auth-integration-test", // Required field (not testName)
117
- passed: true,
118
- output: "All auth flows passing",
119
- }, "feature-dev");
120
- expect(result.testId).toBeTruthy();
121
- });
122
- it("Step 7: Cleanup - abandon cycle", async () => {
123
- const result = await callTool("abandon_cycle", {
124
- cycleId,
125
- reason: "eval harness cleanup",
126
- }, "feature-dev");
127
- expect(result.abandoned).toBe(true);
128
- });
129
- });
130
- // ═══════════════════════════════════════════════════════════════════════════
131
- // SCENARIO 2: Eval-Driven Development (eval methodology)
132
- // ═══════════════════════════════════════════════════════════════════════════
133
- describe("Scenario: Eval-Driven Development", () => {
134
- let evalRunId;
135
- let caseIds;
136
- it("Step 1: Start eval run with test cases", async () => {
137
- // Actual schema: name, cases (with input, intent, expected)
138
- const result = await callTool("start_eval_run", {
139
- name: "eval-harness-run",
140
- description: "Testing prompt quality",
141
- cases: [
142
- { input: "Hello", intent: "greeting" },
143
- { input: "Help me code", intent: "assistance" },
144
- ],
145
- }, "eval-driven");
146
- expect(result.runId).toBeTruthy();
147
- evalRunId = result.runId;
148
- caseIds = result.caseIds;
149
- });
150
- it("Step 2: Record eval results", async () => {
151
- // Actual schema: caseId, verdict (pass/fail/partial), actual, score
152
- const result1 = await callTool("record_eval_result", {
153
- caseId: caseIds[0],
154
- actual: "greeting response",
155
- verdict: "pass",
156
- score: 0.9,
157
- }, "eval-driven");
158
- expect(result1.caseId).toBe(caseIds[0]);
159
- expect(result1.verdict).toBe("pass");
160
- const result2 = await callTool("record_eval_result", {
161
- caseId: caseIds[1],
162
- actual: "help response",
163
- verdict: "pass",
164
- score: 0.85,
165
- }, "eval-driven");
166
- expect(result2.caseId).toBe(caseIds[1]);
167
- });
168
- it("Step 3: Complete eval run", async () => {
169
- const result = await callTool("complete_eval_run", {
170
- runId: evalRunId,
171
- }, "eval-driven");
172
- expect(result.runId).toBe(evalRunId);
173
- expect(result.status).toBe("completed");
174
- expect(result.summary).toBeDefined();
175
- });
176
- it("Step 4: List eval runs", async () => {
177
- const result = await callTool("list_eval_runs", {
178
- limit: 10,
179
- }, "eval-driven");
180
- expect(result.runs).toBeDefined();
181
- expect(result.runs.length).toBeGreaterThan(0);
182
- });
183
- });
184
- // ═══════════════════════════════════════════════════════════════════════════
185
- // SCENARIO 3: Knowledge Management (learning methodology)
186
- // ═══════════════════════════════════════════════════════════════════════════
187
- describe("Scenario: Knowledge Management", () => {
188
- const uniqueKey = `eval-learning-${Date.now()}`;
189
- it("Step 1: Record a learning", async () => {
190
- const result = await callTool("record_learning", {
191
- key: uniqueKey,
192
- category: "pattern",
193
- content: "Use scenario-based testing to verify tool chains work together",
194
- tags: ["testing", "eval", "integration"],
195
- }, "knowledge");
196
- expect(result.key).toBe(uniqueKey);
197
- expect(result.success).toBe(true);
198
- });
199
- it("Step 2: Search for the learning", async () => {
200
- // Returns { query, count, learnings: [...] }
201
- const result = await callTool("search_learnings", {
202
- query: "scenario testing",
203
- }, "knowledge");
204
- expect(result.learnings).toBeDefined();
205
- });
206
- it("Step 3: List all learnings", async () => {
207
- const result = await callTool("list_learnings", {
208
- limit: 20,
209
- }, "knowledge");
210
- expect(result.learnings).toBeDefined();
211
- });
212
- it("Step 4: Delete the learning", async () => {
213
- // Returns { success: true, message }
214
- const result = await callTool("delete_learning", {
215
- key: uniqueKey,
216
- }, "knowledge");
217
- expect(result.success).toBe(true);
218
- });
219
- });
220
- // ═══════════════════════════════════════════════════════════════════════════
221
- // SCENARIO 4: Quality Gates (quality_gates methodology)
222
- // ═══════════════════════════════════════════════════════════════════════════
223
- describe("Scenario: Quality Gates", () => {
224
- it("Step 1: Get deploy_readiness preset", async () => {
225
- const result = await callTool("get_gate_preset", {
226
- preset: "deploy_readiness",
227
- }, "quality-gates");
228
- expect(result.preset).toBe("deploy_readiness");
229
- expect(result.rules.length).toBeGreaterThan(0);
230
- });
231
- it("Step 2: Run quality gate", async () => {
232
- const result = await callTool("run_quality_gate", {
233
- gateName: "deploy_readiness",
234
- target: "eval-harness-test",
235
- rules: [
236
- { name: "tests_pass", passed: true },
237
- { name: "no_type_errors", passed: true },
238
- { name: "no_lint_errors", passed: true },
239
- { name: "coverage_threshold", passed: false },
240
- ],
241
- }, "quality-gates");
242
- expect(result.passed).toBe(false);
243
- expect(result.failures).toContain("coverage_threshold");
244
- });
245
- it("Step 3: Get gate history", async () => {
246
- // Returns { gateName, runs: [...], trend }
247
- const result = await callTool("get_gate_history", {
248
- gateName: "deploy_readiness",
249
- limit: 10,
250
- }, "quality-gates");
251
- expect(result.gateName).toBe("deploy_readiness");
252
- expect(result.runs).toBeDefined();
253
- });
254
- it("Step 4: Run closed loop verification", async () => {
255
- // Actual schema: steps with { step: enum, passed: boolean }
256
- const result = await callTool("run_closed_loop", {
257
- steps: [
258
- { step: "compile", passed: true },
259
- { step: "lint", passed: true },
260
- { step: "test", passed: true },
261
- ],
262
- }, "quality-gates");
263
- expect(result.allPassed).toBe(true);
264
- });
265
- });
266
- // ═══════════════════════════════════════════════════════════════════════════
267
- // SCENARIO 5: Flywheel Orchestration (flywheel methodology)
268
- // ═══════════════════════════════════════════════════════════════════════════
269
- describe("Scenario: Flywheel Orchestration", () => {
270
- it("Step 1: Get flywheel status", async () => {
271
- // Returns { innerLoop, outerLoop, connections }
272
- const result = await callTool("get_flywheel_status", {}, "flywheel");
273
- expect(result).toHaveProperty("innerLoop");
274
- expect(result).toHaveProperty("outerLoop");
275
- expect(result).toHaveProperty("connections");
276
- }, 15_000);
277
- it("Step 2: Run mandatory flywheel check", async () => {
278
- // Actual schema: target, steps array with stepName enum
279
- const result = await callTool("run_mandatory_flywheel", {
280
- target: "Added new auth feature",
281
- steps: [
282
- { stepName: "static_analysis", passed: true },
283
- { stepName: "happy_path_test", passed: true },
284
- { stepName: "failure_path_test", passed: true },
285
- { stepName: "gap_analysis", passed: true },
286
- { stepName: "fix_and_reverify", passed: true },
287
- { stepName: "deploy_and_document", passed: true },
288
- ],
289
- }, "flywheel");
290
- expect(result).toHaveProperty("passed");
291
- expect(result.passed).toBe(true);
292
- });
293
- });
294
- // ═══════════════════════════════════════════════════════════════════════════
295
- // SCENARIO 5.5: Flywheel Integration (promote, investigate, compare)
296
- // Tests the 4 previously untested flywheel integration tools
297
- // ═══════════════════════════════════════════════════════════════════════════
298
- describe("Scenario: Flywheel Integration", () => {
299
- // Test the 4 previously untested flywheel tools in isolated tests
300
- it("Step 1: list_verification_cycles - lists cycles", async () => {
301
- // First create a cycle so we have something to list
302
- const createResult = await callTool("start_verification_cycle", {
303
- title: "List test cycle", // Note: title, not goal
304
- description: "Testing list_verification_cycles",
305
- }, "flywheel-integration");
306
- const testCycleId = createResult.cycleId;
307
- // Now list cycles
308
- const result = await callTool("list_verification_cycles", {
309
- limit: 10,
310
- }, "flywheel-integration");
311
- expect(result).toHaveProperty("count");
312
- expect(result).toHaveProperty("cycles");
313
- expect(Array.isArray(result.cycles)).toBe(true);
314
- // Each cycle has cycleId property (not id)
315
- expect(result.cycles.some((c) => c.cycleId === testCycleId)).toBe(true);
316
- // Cleanup
317
- await callTool("abandon_cycle", {
318
- cycleId: testCycleId,
319
- reason: "Test cleanup",
320
- }, "flywheel-integration");
321
- });
322
- it("Step 2: promote_to_eval - promotes verification to eval suite", async () => {
323
- // Create a cycle to promote from
324
- const cycleResult = await callTool("start_verification_cycle", {
325
- title: "Promote test cycle",
326
- description: "Testing promote_to_eval",
327
- }, "flywheel-integration");
328
- // Promote with explicit cases (required)
329
- const result = await callTool("promote_to_eval", {
330
- cycleId: cycleResult.cycleId,
331
- evalRunName: "promoted-eval-test",
332
- cases: [
333
- { input: "test input", intent: "Test intent" },
334
- ],
335
- }, "flywheel-integration");
336
- expect(result).toHaveProperty("evalRunId");
337
- expect(result).toHaveProperty("caseIds");
338
- expect(result.caseCount).toBe(1);
339
- // Cleanup
340
- await callTool("abandon_cycle", {
341
- cycleId: cycleResult.cycleId,
342
- reason: "Test cleanup",
343
- }, "flywheel-integration");
344
- });
345
- it("Step 3: compare_eval_runs - compares two completed evals", async () => {
346
- // Create and complete baseline eval
347
- const baseline = await callTool("start_eval_run", {
348
- name: "baseline-for-compare",
349
- cases: [{ input: "test", intent: "baseline" }],
350
- }, "flywheel-integration");
351
- await callTool("record_eval_result", {
352
- caseId: baseline.caseIds[0],
353
- actual: "result",
354
- verdict: "pass",
355
- }, "flywheel-integration");
356
- await callTool("complete_eval_run", {
357
- runId: baseline.runId,
358
- }, "flywheel-integration");
359
- // Create and complete candidate eval
360
- const candidate = await callTool("start_eval_run", {
361
- name: "candidate-for-compare",
362
- cases: [{ input: "test", intent: "candidate" }],
363
- }, "flywheel-integration");
364
- await callTool("record_eval_result", {
365
- caseId: candidate.caseIds[0],
366
- actual: "result",
367
- verdict: "pass",
368
- }, "flywheel-integration");
369
- await callTool("complete_eval_run", {
370
- runId: candidate.runId,
371
- }, "flywheel-integration");
372
- // Compare them
373
- const result = await callTool("compare_eval_runs", {
374
- baselineRunId: baseline.runId,
375
- candidateRunId: candidate.runId,
376
- }, "flywheel-integration");
377
- expect(result).toHaveProperty("recommendation");
378
- expect(["DEPLOY", "REVERT", "INVESTIGATE"]).toContain(result.recommendation);
379
- });
380
- it("Step 4: trigger_investigation - creates investigation cycle", async () => {
381
- // Create and complete an eval run to investigate
382
- const eval1 = await callTool("start_eval_run", {
383
- name: "eval-to-investigate",
384
- cases: [{ input: "test", intent: "investigate" }],
385
- }, "flywheel-integration");
386
- await callTool("record_eval_result", {
387
- caseId: eval1.caseIds[0],
388
- actual: "failed",
389
- verdict: "fail",
390
- }, "flywheel-integration");
391
- await callTool("complete_eval_run", {
392
- runId: eval1.runId,
393
- }, "flywheel-integration");
394
- // Trigger investigation
395
- const result = await callTool("trigger_investigation", {
396
- evalRunId: eval1.runId,
397
- regressionDescription: "Test failure detected",
398
- }, "flywheel-integration");
399
- // Returns cycleId, title, linkedEvalRun, phase1Instructions
400
- expect(result).toHaveProperty("cycleId");
401
- expect(result).toHaveProperty("title");
402
- expect(result).toHaveProperty("linkedEvalRun");
403
- // Cleanup
404
- await callTool("abandon_cycle", {
405
- cycleId: result.cycleId,
406
- reason: "Test cleanup",
407
- }, "flywheel-integration");
408
- });
409
- });
410
- // ═══════════════════════════════════════════════════════════════════════════
411
- // SCENARIO 6: Research & Discovery (recon methodology)
412
- // ═══════════════════════════════════════════════════════════════════════════
413
- describe("Scenario: Research & Discovery", () => {
414
- let reconSessionId;
415
- it("Step 1: Start recon session", async () => {
416
- // Actual schema: target (required), description, projectContext
417
- const result = await callTool("run_recon", {
418
- target: "MCP server best practices",
419
- description: "Research for eval harness",
420
- }, "research");
421
- expect(result.sessionId).toBeTruthy();
422
- reconSessionId = result.sessionId;
423
- });
424
- it("Step 2: Log recon finding", async () => {
425
- // Actual schema: sessionId, category (enum), summary, sourceUrl, relevance
426
- const result = await callTool("log_recon_finding", {
427
- sessionId: reconSessionId,
428
- category: "best_practice",
429
- summary: "Organize tools by domain for better discoverability",
430
- sourceUrl: "https://docs.anthropic.com",
431
- relevance: "Applies to MCP tool organization",
432
- }, "research");
433
- expect(result.findingId).toBeTruthy();
434
- expect(result.findingCount).toBeGreaterThan(0);
435
- });
436
- it("Step 3: Get recon summary", async () => {
437
- // Returns { sessionId, target, status, totalFindings, findingsByCategory, ... }
438
- const result = await callTool("get_recon_summary", {
439
- sessionId: reconSessionId,
440
- }, "research");
441
- expect(result.sessionId).toBe(reconSessionId);
442
- expect(result.totalFindings).toBeGreaterThan(0);
443
- });
444
- it("Step 4: Check framework updates", async () => {
445
- // Actual schema: ecosystem (enum)
446
- const result = await callTool("check_framework_updates", {
447
- ecosystem: "mcp",
448
- }, "research");
449
- expect(result.ecosystem).toBe("mcp");
450
- expect(result.sources).toBeDefined();
451
- });
452
- it("Step 5: Bootstrap project context", async () => {
453
- // Actual schema: projectName (required), techStack, architecture, etc.
454
- const result = await callTool("bootstrap_project", {
455
- projectName: "eval-harness-project",
456
- techStack: "TypeScript, Vitest, MCP",
457
- architecture: "Modular tool system",
458
- }, "research");
459
- expect(result.projectName).toBe("eval-harness-project");
460
- expect(result.storedFields).toBeDefined();
461
- });
462
- it("Step 6: Get project context", async () => {
463
- // Returns { context: {}, knowledgeBase: {} }
464
- const result = await callTool("get_project_context", {}, "research");
465
- expect(result).toHaveProperty("context");
466
- expect(result).toHaveProperty("knowledgeBase");
467
- });
468
- it("Step 7: Search all knowledge", async () => {
469
- const result = await callTool("search_all_knowledge", {
470
- query: "MCP tools",
471
- }, "research");
472
- expect(result).toHaveProperty("learnings");
473
- expect(result).toHaveProperty("reconFindings");
474
- expect(result).toHaveProperty("gaps");
475
- });
476
- });
477
- // ═══════════════════════════════════════════════════════════════════════════
478
- // SCENARIO 7: Agent Self-Bootstrap (agent_bootstrap methodology)
479
- // ═══════════════════════════════════════════════════════════════════════════
480
/**
 * Scenario 7 - agent self-bootstrap.
 * Drives the five bootstrap tools through callTool under the "bootstrap"
 * scenario label; every step only verifies the shape of the response.
 */
describe("Scenario: Agent Self-Bootstrap", () => {
    const SCENARIO = "bootstrap";
    // Asserts, in the order given, that `value` exposes each named property.
    const expectProps = (value, ...names) => {
        for (const name of names) {
            expect(value).toHaveProperty(name);
        }
    };
    it("Step 1: Discover infrastructure", async () => {
        const args = {
            categories: ["agent_loop", "telemetry"],
            depth: "shallow",
        };
        const infra = await callTool("discover_infrastructure", args, SCENARIO);
        expectProps(infra, "discovered", "missing");
    });
    it("Step 2: Triple verify a component", async () => {
        const args = {
            target: "verification-tools",
            scope: "implementation", // accepted: implementation | integration | deployment | full
            includeWebSearch: false,
        };
        const report = await callTool("triple_verify", args, SCENARIO);
        // One property is expected per verification pass.
        expectProps(report, "verification1_internal", "verification2_external", "verification3_synthesis");
    });
    it("Step 3: Self-implement missing component", async () => {
        const args = {
            component: "telemetry",
            dryRun: true,
        };
        const proposal = await callTool("self_implement", args, SCENARIO);
        // Besides echoing the component, the response should carry a plan and a file list.
        expectProps(proposal, "component", "plan", "files");
    });
    it("Step 4: Generate self-instructions", async () => {
        const args = {
            format: "claude_md",
            includeExternalSources: false,
        };
        const instructions = await callTool("generate_self_instructions", args, SCENARIO);
        // The generated text lives under `content` (not `instructions`).
        expectProps(instructions, "format", "content");
    });
    it("Step 5: Connect channels", async () => {
        const args = {
            channels: ["web", "github"],
            query: "mcp tools",
            aggressive: false,
        };
        const connection = await callTool("connect_channels", args, SCENARIO);
        // `results` must be an array (one entry per channel is expected).
        expectProps(connection, "query", "results");
        expect(Array.isArray(connection.results)).toBe(true);
    });
});
531
- // ═══════════════════════════════════════════════════════════════════════════
532
- // SCENARIO 8: Autonomous Maintenance (autonomous_maintenance methodology)
533
- // ═══════════════════════════════════════════════════════════════════════════
534
/**
 * Scenario 8 - autonomous maintenance.
 * Calls the risk, update-decision, self-maintenance, scaffolding and
 * autonomous-loop tools through callTool with the "autonomous" label.
 */
describe("Scenario: Autonomous Maintenance", () => {
    // Every step in this scenario is logged under the same scenario label.
    const runAutonomous = (toolName, args) => callTool(toolName, args, "autonomous");
    it("Step 1: Assess risk before action", async () => {
        const outcome = await runAutonomous("assess_risk", {
            action: "update_agents_md",
            context: "Adding new documentation",
        });
        const verdict = outcome.assessment;
        expect(verdict.tier).toBe("medium");
        expect(verdict.recommendation).toBe("log_and_proceed");
    });
    it("Step 2: Decide re-update vs create", async () => {
        const allowedActions = ["update_existing", "create_new", "merge"];
        const decision = await runAutonomous("decide_re_update", {
            targetContent: "New methodology documentation",
            contentType: "documentation",
            existingFiles: ["README.md", "AGENTS.md"],
        });
        expect(allowedActions).toContain(decision.action);
    });
    it("Step 3: Run self-maintenance", async () => {
        const maintenance = await runAutonomous("run_self_maintenance", {
            scope: "quick",
            autoFix: false,
            dryRun: true,
        });
        for (const field of ["checksPerformed", "issuesFound"]) {
            expect(maintenance).toHaveProperty(field);
        }
    });
    it("Step 4: Scaffold directory structure", async () => {
        const scaffold = await runAutonomous("scaffold_directory", {
            component: "agent_loop",
            includeTests: true,
            dryRun: true,
        });
        expect(scaffold.component).toBe("agent_loop");
        const plannedFiles = scaffold.structure.files;
        expect(plannedFiles.length).toBeGreaterThan(0);
    });
    it("Step 5: Run autonomous loop with guardrails", async () => {
        const terminalStates = ["completed", "stopped", "timeout", "failed"];
        const loop = await runAutonomous("run_autonomous_loop", {
            goal: "Verify all documentation is in sync",
            maxIterations: 3,
            maxDurationMs: 5000,
            stopOnFirstFailure: true,
        });
        expect(loop.goal).toBeTruthy();
        // The loop must respect the maxIterations guardrail passed above.
        expect(loop.iterations).toBeLessThanOrEqual(3);
        expect(terminalStates).toContain(loop.status);
    });
});
581
- // ═══════════════════════════════════════════════════════════════════════════
582
- // SCENARIO 9: Meta Tools (tool discovery)
583
- // ═══════════════════════════════════════════════════════════════════════════
584
/**
 * Scenario 9 - meta tools.
 * Checks tool lookup by keyword and by category, then the methodology
 * overview and a fixed list of individual methodology topics.
 */
describe("Scenario: Meta Tool Discovery", () => {
    const asMeta = (toolName, args) => callTool(toolName, args, "meta");
    it("Step 1: Find tools by keyword", async () => {
        const found = await asMeta("findTools", { query: "verification" });
        expect(found.tools.length).toBeGreaterThan(0);
    });
    it("Step 2: Find tools by category", async () => {
        const found = await asMeta("findTools", { category: "bootstrap" });
        expect(found.tools.length).toBeGreaterThan(0);
    });
    it("Step 3: Get methodology overview", async () => {
        const overview = await asMeta("getMethodology", { topic: "overview" });
        expect(overview.title).toContain("Overview");
        // The first step of the overview lists the topics as object keys.
        const topicNames = Object.keys(overview.steps[0].topics);
        expect(topicNames.length).toBe(26);
    });
    it("Step 4: Get specific methodology", async () => {
        const topicList = [
            "verification",
            "eval",
            "flywheel",
            "mandatory_flywheel",
            "reconnaissance",
            "quality_gates",
            "ui_ux_qa",
            "agentic_vision",
            "closed_loop",
            "learnings",
            "project_ideation",
            "tech_stack_2026",
            "telemetry_setup",
            "agents_md_maintenance",
            "agent_bootstrap",
            "autonomous_maintenance",
            "self_reinforced_learning",
        ];
        // Topics are fetched one at a time, in list order.
        for (const topic of topicList) {
            const methodology = await asMeta("getMethodology", { topic });
            expect(methodology.title).toBeTruthy();
            expect(methodology.steps.length).toBeGreaterThan(0);
        }
    });
});
621
- // ═══════════════════════════════════════════════════════════════════════════
622
- // SCENARIO 10: Self-Reinforced Learning (trajectory analysis)
623
- // ═══════════════════════════════════════════════════════════════════════════
624
/**
 * Scenario 10 - self-reinforced learning.
 * Logs three tool calls into one session, then exercises the trajectory,
 * self-eval, recommendation, cleanup and recon-synthesis tools under the
 * "self-eval" scenario label. Step 2 reads the session that Step 1 writes.
 */
describe("Scenario: Self-Reinforced Learning", () => {
    const SESSION_ID = "eval-harness-self-eval";
    const selfEval = (toolName, args) => callTool(toolName, args, "self-eval");
    // Records one successful tool call against the shared session.
    const logCall = (toolName, durationMs, phase) => selfEval("log_tool_call", {
        sessionId: SESSION_ID,
        toolName,
        durationMs,
        resultStatus: "success",
        phase,
    });
    // Asserts, in the order given, that `value` exposes each named property.
    const expectProps = (value, names) => {
        names.forEach((name) => {
            expect(value).toHaveProperty(name);
        });
    };
    it("Step 1: Log tool calls to build trajectory data", async () => {
        const firstEntry = await logCall("start_verification_cycle", 25, "verification");
        // Only the first response is inspected; the other two just seed data.
        expect(firstEntry.logged).toBe(true);
        await logCall("log_phase_findings", 12, "verification");
        await logCall("run_mandatory_flywheel", 35, "flywheel");
    });
    it("Step 2: Analyze trajectory patterns", async () => {
        const trajectory = await selfEval("get_trajectory_analysis", {
            sessionId: SESSION_ID,
        });
        expect(trajectory.totalCalls).toBeGreaterThanOrEqual(3);
        expect(trajectory.uniqueTools).toBeGreaterThanOrEqual(3);
        expect(trajectory.topTools.length).toBeGreaterThan(0);
    });
    it("Step 3: Generate self-eval health report", async () => {
        const report = await selfEval("get_self_eval_report", {
            sinceDaysAgo: 30,
        });
        expect(typeof report.healthScore).toBe("number");
        expectProps(report, ["verification", "gaps", "evalRuns", "toolTrajectory"]);
    });
    it("Step 4: Get improvement recommendations", async () => {
        const advice = await selfEval("get_improvement_recommendations", {
            sinceDaysAgo: 30,
            focus: "all",
        });
        expect(typeof advice.totalRecommendations).toBe("number");
        expect(Array.isArray(advice.recommendations)).toBe(true);
        expect(advice._selfReinforcement.nextSteps.length).toBe(4);
    });
    it("Step 5: Cleanup stale runs (dry run)", async () => {
        const cleanup = await selfEval("cleanup_stale_runs", {
            staleDays: 7,
            dryRun: true,
        });
        expect(cleanup.dryRun).toBe(true);
        expectProps(cleanup, ["staleEvalRuns", "staleCycles", "staleGaps"]);
        expect(cleanup.staleEvalRuns).toHaveProperty("count");
    });
    it("Step 6: Synthesize recon to learnings (dry run)", async () => {
        const synthesis = await selfEval("synthesize_recon_to_learnings", {
            sinceDaysAgo: 30,
            dryRun: true,
        });
        expect(synthesis.dryRun).toBe(true);
        expectProps(synthesis, ["totalFindings", "newLearnings", "preview"]);
        // A dry run must not create any learnings.
        expect(synthesis.created).toBe(0);
    });
});
699
- // ═══════════════════════════════════════════════════════════════════════════
700
- // SCENARIO 11: Flicker Detection (env-gated — returns "not configured")
701
- // ═══════════════════════════════════════════════════════════════════════════
702
/**
 * Scenario 11 - flicker detection pipeline.
 * Each tool is invoked once under the "flicker-detection" label and is
 * expected to answer with an error payload whose message mentions
 * "not configured".
 */
describe("Scenario: Flicker Detection Pipeline", () => {
    // [test title, tool name, arguments] - registered in this order.
    const steps = [
        [
            "Step 1: run_flicker_detection returns not-configured when no server",
            "run_flicker_detection",
            { durationS: 5 },
        ],
        [
            "Step 2: capture_surface_stats returns not-configured",
            "capture_surface_stats",
            {},
        ],
        [
            "Step 3: extract_video_frames returns not-configured",
            "extract_video_frames",
            {},
        ],
        [
            "Step 4: compute_ssim_analysis returns not-configured",
            "compute_ssim_analysis",
            { framePaths: ["/tmp/frame1.jpg", "/tmp/frame2.jpg"] },
        ],
        [
            "Step 5: generate_flicker_report returns not-configured",
            "generate_flicker_report",
            { ssimScores: [0.95, 0.93, 0.88, 0.91], threshold: 0.90 },
        ],
    ];
    for (const [title, toolName, args] of steps) {
        it(title, async () => {
            const response = await callTool(toolName, args, "flicker-detection");
            expect(response.error).toBe(true);
            expect(response.message).toContain("not configured");
        });
    }
});
736
- // ═══════════════════════════════════════════════════════════════════════════
737
- // SCENARIO 12: Figma Flow Analysis (env-gated — returns "not configured")
738
- // ═══════════════════════════════════════════════════════════════════════════
739
/**
 * Scenario 12 - Figma flow analysis pipeline.
 * Each tool is invoked once under the "figma-flow" label and is expected
 * to answer with an error payload whose message mentions "not configured".
 */
describe("Scenario: Figma Flow Analysis Pipeline", () => {
    // [test title, tool name, arguments] - registered in this order.
    const steps = [
        [
            "Step 1: analyze_figma_flows returns not-configured when no server",
            "analyze_figma_flows",
            { fileKey: "abc123" },
        ],
        [
            "Step 2: extract_figma_frames returns not-configured",
            "extract_figma_frames",
            { fileKey: "abc123" },
        ],
        [
            "Step 3: cluster_figma_flows returns not-configured",
            "cluster_figma_flows",
            { frames: [] },
        ],
        [
            "Step 4: render_flow_visualization returns not-configured",
            "render_flow_visualization",
            { flowGroups: [] },
        ],
    ];
    for (const [title, toolName, args] of steps) {
        it(title, async () => {
            const response = await callTool(toolName, args, "figma-flow");
            expect(response.error).toBe(true);
            expect(response.message).toContain("not configured");
        });
    }
});
769
- // ═══════════════════════════════════════════════════════════════════════════
770
- // SCENARIO 13: Boilerplate Scaffolding
771
- // ═══════════════════════════════════════════════════════════════════════════
772
/**
 * Scenario 13 - boilerplate scaffolding.
 * Runs the project scaffolder in dry-run mode and queries the boilerplate
 * status of the current working directory, both under the "boilerplate" label.
 */
describe("Scenario: Boilerplate Scaffolding", () => {
    const LABEL = "boilerplate";
    it("Step 1: scaffold_nodebench_project dry run", async () => {
        const scaffoldArgs = {
            projectPath: "/tmp/eval-harness-scaffold-test",
            projectName: "eval-test-project",
            techStack: "TypeScript/Node.js",
            dryRun: true,
        };
        const scaffold = await callTool("scaffold_nodebench_project", scaffoldArgs, LABEL);
        expect(scaffold.dryRun).toBe(true);
        expect(scaffold.summary.totalFiles).toBeGreaterThan(5);
    });
    it("Step 2: get_boilerplate_status on empty dir", async () => {
        // The path passed is whatever directory the test process runs in.
        const statusArgs = { projectPath: process.cwd() };
        const status = await callTool("get_boilerplate_status", statusArgs, LABEL);
        expect(typeof status.completionPercentage).toBe("number");
        expect(status.total).toBeGreaterThan(0);
    });
});
791
- // ═══════════════════════════════════════════════════════════════════════════
792
- // SCENARIO 14: C-Compiler Benchmark
793
- // ═══════════════════════════════════════════════════════════════════════════
794
/**
 * Scenario 14 - C-compiler autonomy benchmark.
 * Lists the challenges, starts the c_compiler benchmark, logs one
 * milestone and completes the run. Steps 3 and 4 reuse the benchmark id
 * captured at the end of Step 2, so the steps must run in order.
 */
describe("Scenario: C-Compiler Benchmark", () => {
    const LABEL = "benchmark";
    // Populated by Step 2 once its assertions have passed.
    let benchmarkId;
    it("Step 1: start_autonomy_benchmark with challenge list", async () => {
        const listing = await callTool("start_autonomy_benchmark", { challenge: "list" }, LABEL);
        expect(listing.availableChallenges.length).toBe(5);
    });
    it("Step 2: start_autonomy_benchmark with c_compiler", async () => {
        const started = await callTool("start_autonomy_benchmark", { challenge: "c_compiler" }, LABEL);
        expect(started.totalPoints).toBe(100);
        expect(started.milestones.length).toBe(10);
        ({ benchmarkId } = started);
    });
    it("Step 3: log_benchmark_milestone", async () => {
        const milestoneArgs = {
            benchmarkId,
            milestoneId: "lexer",
            verificationPassed: true,
            notes: "Lexer tokenizes all C keywords correctly",
        };
        const milestone = await callTool("log_benchmark_milestone", milestoneArgs, LABEL);
        expect(milestone.milestoneId).toBe("lexer");
        expect(milestone.points).toBe(15);
    });
    it("Step 4: complete_autonomy_benchmark", async () => {
        const completionArgs = {
            benchmarkId,
            reason: "completed",
        };
        const summary = await callTool("complete_autonomy_benchmark", completionArgs, LABEL);
        // Only the lexer milestone was logged, so the totals reflect that one entry.
        expect(summary.score.earnedPoints).toBe(15);
        expect(summary.milestones.completed).toBe(1);
    });
});
829
- // ═══════════════════════════════════════════════════════════════════════════
830
- // SCENARIO 15: Contract Compliance
831
- // ═══════════════════════════════════════════════════════════════════════════
832
/**
 * Scenario 15 - contract compliance.
 * Scores an empty session, then a session seeded with a fixed sequence of
 * successful tool calls. Seeding invokes the log_tool_call handler from
 * allTools directly instead of going through callTool.
 */
describe("Scenario: Contract Compliance", () => {
    const LABEL = "self_eval";
    it("Step 1: check_contract_compliance with empty session", async () => {
        const emptySession = `evalharness-empty-${Date.now()}`;
        const verdict = await callTool("check_contract_compliance", { sessionId: emptySession }, LABEL);
        expect(verdict.score).toBe(0);
        expect(verdict.grade).toBe("N/A");
    });
    it("Step 2: check_contract_compliance scores a compliant session", async () => {
        const sessionId = `evalharness-compliant-${Date.now()}`;
        const logger = allTools.find(({ name }) => name === "log_tool_call");
        // Tool calls recorded for the session before it is scored, in order.
        const seededCalls = [
            "search_all_knowledge",
            "getMethodology",
            "discover_tools",
            "run_recon",
            "assess_risk",
            "run_closed_loop",
            "log_test_result",
            "start_eval_run",
            "run_quality_gate",
            "run_mandatory_flywheel",
            "record_learning",
        ];
        for (const toolName of seededCalls) {
            await logger.handler({ sessionId, toolName, resultStatus: "success" });
        }
        const verdict = await callTool("check_contract_compliance", { sessionId }, LABEL);
        expect(verdict.score).toBeGreaterThanOrEqual(80);
        expect(verdict.grade).toMatch(/^[AB]/);
        expect(verdict.dimensions).toBeDefined();
        const { dimensions } = verdict;
        expect(dimensions.front_door.score).toBeGreaterThanOrEqual(15);
        expect(dimensions.ship_gates.score).toBeGreaterThanOrEqual(20);
    });
});
863
- // ═══════════════════════════════════════════════════════════════════════════
864
- // SCENARIO 16: Controlled Evaluation (Task Bank + Ablation Grading)
865
- // ═══════════════════════════════════════════════════════════════════════════
866
/**
 * Scenario 16 - controlled evaluation (task bank + ablation grading).
 * Creates one task, grades a "bare" run without a session, then grades a
 * "full" run backed by a seeded session. All three steps share the task id
 * generated when the suite is defined.
 */
describe("Scenario: Controlled Evaluation", () => {
    const LABEL = "self_eval";
    const taskId = `evalharness-bugfix-${Date.now()}`;
    // Builds one outcomeResults entry.
    const outcome = (criterion, passed) => ({ criterion, passed });
    it("Step 1: create_task_bank creates a task", async () => {
        const created = await callTool("create_task_bank", {
            taskId,
            title: "Fix JWT token expiry bug",
            category: "bugfix",
            difficulty: "medium",
            prompt: "Fix the bug where JWT tokens expire 1 hour early due to timezone offset",
            successCriteria: ["tests pass", "no lint errors", "token expiry is correct"],
            forbiddenBehaviors: ["hardcode timezone", "skip tests"],
            timeBudgetMinutes: 20,
        }, LABEL);
        expect(created.action).toBe("created");
        expect(created.taskId).toBe(taskId);
        expect(created.totalTasksInBank).toBeGreaterThanOrEqual(1);
    });
    it("Step 2: grade_agent_run grades a bare condition", async () => {
        const graded = await callTool("grade_agent_run", {
            taskId,
            condition: "bare",
            outcomeResults: [
                outcome("tests pass", true),
                outcome("no lint errors", true),
                outcome("token expiry is correct", false),
            ],
            durationMinutes: 15,
        }, LABEL);
        expect(graded.grade).toBeDefined();
        expect(graded.scores.outcome.score).toBeGreaterThan(0);
        // No sessionId was supplied, so the process score is expected to be half credit.
        expect(graded.scores.process.score).toBe(25);
        expect(graded.outcomeDetails.passed).toBe(2);
        expect(graded.outcomeDetails.total).toBe(3);
    });
    it("Step 3: grade_agent_run grades a full condition with session", async () => {
        const sessionId = `evalharness-full-${Date.now()}`;
        const logger = allTools.find(({ name }) => name === "log_tool_call");
        // Seed the session by calling the log_tool_call handler directly.
        const seededCalls = [
            "search_all_knowledge",
            "assess_risk",
            "run_closed_loop",
            "log_test_result",
            "run_quality_gate",
            "record_learning",
        ];
        for (const toolName of seededCalls) {
            await logger.handler({ sessionId, toolName, resultStatus: "success" });
        }
        const graded = await callTool("grade_agent_run", {
            taskId,
            sessionId,
            condition: "full",
            outcomeResults: [
                outcome("tests pass", true),
                outcome("no lint errors", true),
                outcome("token expiry is correct", true),
            ],
            durationMinutes: 12,
        }, LABEL);
        expect(graded.scores.outcome.score).toBeGreaterThan(40);
        expect(graded.scores.process.score).toBeGreaterThan(20);
        expect(graded.ablationComparison).toBeDefined();
        // One comparison row per graded condition: bare + full.
        expect(graded.ablationComparison.length).toBe(2);
    });
});
923
- // ═══════════════════════════════════════════════════════════════════════════
924
- // COVERAGE REPORT
925
- // ═══════════════════════════════════════════════════════════════════════════
926
/**
 * Coverage report.
 * Summarises every call recorded in toolCallLog, prints a boxed
 * "proof of work" report to the console, and finally asserts minimum
 * coverage: enough distinct tools exercised, no untested non-external
 * tools, and no failed calls.
 *
 * Reads `tool`, `scenario` and `success` from each toolCallLog entry and
 * `name` from each allTools entry.
 */
describe("Coverage Report", () => {
    it("should generate comprehensive Proof of Work report", () => {
        // Shared box-drawing rows for every section of the report.
        const BOX_TOP = "┌─────────────────────────────────────────────────────────────────────────────┐";
        const BOX_RULE = "├─────────────────────────────────────────────────────────────────────────────┤";
        const BOX_BOTTOM = "└─────────────────────────────────────────────────────────────────────────────┘";
        // Whole-number percentage; yields 0 instead of NaN when the denominator is 0
        // (e.g. when no tool call has been logged).
        const percent = (part, whole) => (whole === 0 ? 0 : Math.round(part / whole * 100));
        // Pads a row to the box's inner width and closes it with the right border.
        const boxRow = (text) => text.padEnd(78) + "│";
        const testedTools = new Set(toolCallLog.map(l => l.tool));
        const allToolNames = allTools.map(t => t.name);
        // Tools that require external dependencies (skip in automated tests)
        const externalDependencyTools = [
            "capture_ui_screenshot", // Requires Playwright
            "capture_responsive_suite", // Requires Playwright
            "discover_vision_env", // Dynamic SDK imports
            "analyze_screenshot", // Requires AI API key
            "manipulate_screenshot", // Requires Sharp
            "web_search", // Requires AI API key
            "fetch_url", // External network calls
            "search_github", // Requires GitHub API
            "analyze_repo", // Requires GitHub API
            "update_agents_md", // File system - tested separately
            "research_job_market", // Covered in tools.test.ts
            "setup_local_env", // Covered in tools.test.ts
            "call_llm", // Requires AI API key
            "extract_structured_data", // Requires AI API key
            "scan_dependencies", // Runs npm audit - covered in tools.test.ts
            "run_code_analysis", // Covered in tools.test.ts
            "diff_outputs", // Covered in tools.test.ts
            "query_daily_brief", // Requires CONVEX_SITE_URL - covered in tools.test.ts
            "query_funding_entities", // Requires CONVEX_SITE_URL - covered in tools.test.ts
            "query_research_queue", // Requires CONVEX_SITE_URL - covered in tools.test.ts
            "publish_to_queue", // Requires CONVEX_SITE_URL - covered in tools.test.ts
            "benchmark_models", // Requires AI API keys - covered in tools.test.ts
            "diff_screenshots", // Requires sharp - covered in tools.test.ts
            "generate_report", // Covered in tools.test.ts
            "monitor_repo", // Requires GitHub API - covered in tools.test.ts
            "run_tests_cli", // Covered in tools.test.ts
            "check_mcp_setup", // Env-dependent diagnostic wizard - covered in tools.test.ts
            "scan_capabilities", // Requires file path - covered in tools.test.ts
            "verify_concept_support", // Requires file path - covered in tools.test.ts
            "generate_implementation_plan", // Depends on verify_concept_support output - covered in tools.test.ts
        ];
        // Deprecated tools (kept for backwards compatibility, but flagged)
        const deprecatedTools = [
            { tool: "search_learnings", reason: "DEPRECATED: Use search_all_knowledge instead" },
            { tool: "list_learnings", reason: "DEPRECATED: Use search_all_knowledge instead" },
        ];
        // Set lookup keeps the gap filter linear in the number of tools.
        const externalToolSet = new Set(externalDependencyTools);
        const untestedTools = allToolNames.filter(name => !testedTools.has(name) && !externalToolSet.has(name));
        // Count successes and failures
        const successCount = toolCallLog.filter(l => l.success).length;
        const failureCount = toolCallLog.length - successCount;
        // Build scenario summary: distinct tools plus success/failure counts per scenario.
        const byScenario = new Map();
        for (const entry of toolCallLog) {
            if (!byScenario.has(entry.scenario)) {
                byScenario.set(entry.scenario, { tools: [], success: 0, fail: 0 });
            }
            const summary = byScenario.get(entry.scenario);
            if (!summary.tools.includes(entry.tool)) {
                summary.tools.push(entry.tool);
            }
            if (entry.success) {
                summary.success++;
            }
            else {
                summary.fail++;
            }
        }
        console.log("\n");
        console.log("╔═══════════════════════════════════════════════════════════════════════════╗");
        console.log("║ NODEBENCH MCP - PROOF OF WORK REPORT ║");
        console.log("╚═══════════════════════════════════════════════════════════════════════════╝");
        console.log("");
        console.log(BOX_TOP);
        console.log("│ SUMMARY │");
        console.log(BOX_RULE);
        console.log(`│ Total Tools in MCP: ${String(allToolNames.length).padStart(3)} │`);
        console.log(`│ Tools Tested in Scenarios: ${String(testedTools.size).padStart(3)} (${percent(testedTools.size, allToolNames.length)}%) │`);
        console.log(`│ External Dependency (skip): ${String(externalDependencyTools.length).padStart(3)} (require API keys/network) │`);
        console.log(`│ Untested (GAPS): ${String(untestedTools.length).padStart(3)} │`);
        console.log(`│ Total Tool Calls: ${String(toolCallLog.length).padStart(3)} │`);
        console.log(`│ Success Rate: ${successCount}/${toolCallLog.length} (${percent(successCount, toolCallLog.length)}%) │`);
        console.log(BOX_BOTTOM);
        console.log("");
        // Scenario breakdown
        console.log(BOX_TOP);
        console.log("│ SCENARIOS TESTED │");
        console.log(BOX_RULE);
        byScenario.forEach((data, scenario) => {
            const status = data.fail === 0 ? "✓" : "✗";
            const line = `│ ${status} ${scenario.padEnd(25)} ${String(data.tools.length).padStart(2)} tools, ${String(data.success).padStart(2)} calls`;
            console.log(boxRow(line));
        });
        console.log(BOX_BOTTOM);
        console.log("");
        // Tool coverage matrix (grouped by domain)
        const domainMap = {
            "Verification": ["start_verification_cycle", "log_phase_findings", "log_gap", "resolve_gap", "log_test_result", "get_verification_status", "list_verification_cycles", "abandon_cycle"],
            "Eval": ["start_eval_run", "record_eval_result", "complete_eval_run", "compare_eval_runs", "list_eval_runs"],
            "Quality Gates": ["run_quality_gate", "get_gate_preset", "get_gate_history", "run_closed_loop"],
            "Learning": ["record_learning", "search_learnings", "list_learnings", "delete_learning"],
            "Flywheel": ["get_flywheel_status", "promote_to_eval", "trigger_investigation", "run_mandatory_flywheel"],
            "Recon": ["run_recon", "log_recon_finding", "get_recon_summary", "check_framework_updates", "search_all_knowledge", "bootstrap_project", "get_project_context"],
            "Bootstrap": ["discover_infrastructure", "triple_verify", "self_implement", "generate_self_instructions", "connect_channels"],
            "Autonomous": ["assess_risk", "decide_re_update", "run_self_maintenance", "scaffold_directory", "run_autonomous_loop"],
            "Self-Eval": ["log_tool_call", "get_trajectory_analysis", "get_self_eval_report", "get_improvement_recommendations", "cleanup_stale_runs", "synthesize_recon_to_learnings", "check_contract_compliance", "create_task_bank", "grade_agent_run"],
            "Flicker Detection": ["run_flicker_detection", "capture_surface_stats", "extract_video_frames", "compute_ssim_analysis", "generate_flicker_report"],
            "Figma Flow": ["analyze_figma_flows", "extract_figma_frames", "cluster_figma_flows", "render_flow_visualization"],
            "Boilerplate": ["scaffold_nodebench_project", "get_boilerplate_status"],
            "Benchmark": ["start_autonomy_benchmark", "log_benchmark_milestone", "complete_autonomy_benchmark"],
            "Meta": ["findTools", "getMethodology", "check_mcp_setup"],
            "Architect": ["scan_capabilities", "verify_concept_support", "generate_implementation_plan"],
            "External (skip)": externalDependencyTools,
        };
        console.log(BOX_TOP);
        console.log("│ TOOL COVERAGE BY DOMAIN │");
        console.log(BOX_RULE);
        for (const [domain, tools] of Object.entries(domainMap)) {
            const tested = tools.filter(t => testedTools.has(t)).length;
            const total = tools.length;
            const pct = percent(tested, total);
            // Ten-segment bar: one filled segment per (rounded) 10% of coverage.
            const filled = Math.round(pct / 10);
            const bar = "█".repeat(filled) + "░".repeat(10 - filled);
            const line = `│ ${domain.padEnd(18)} ${bar} ${String(tested).padStart(2)}/${String(total).padStart(2)} (${String(pct).padStart(3)}%)`;
            console.log(boxRow(line));
        }
        console.log(BOX_BOTTOM);
        console.log("");
        // Gaps
        if (untestedTools.length > 0) {
            console.log(BOX_TOP);
            console.log("│ ⚠ GAPS (Untested Tools) │");
            console.log(BOX_RULE);
            untestedTools.forEach(t => {
                console.log(boxRow(`│ - ${t}`));
            });
            console.log(BOX_BOTTOM);
            console.log("");
        }
        // Deprecated tools analysis
        console.log(BOX_TOP);
        console.log("│ ⚠️ DEPRECATED TOOLS │");
        console.log(BOX_RULE);
        if (deprecatedTools.length === 0) {
            console.log(boxRow("│ No deprecated tools."));
        }
        else {
            deprecatedTools.forEach(d => {
                // Truncate long reasons so the row never overruns the box.
                console.log(boxRow(`│ - ${d.tool}: ${d.reason}`.slice(0, 77)));
            });
        }
        console.log("│ │");
        console.log("│ These tools are kept for backwards compatibility but return a │");
        console.log("│ deprecation notice. Use search_all_knowledge for unified search. │");
        console.log(BOX_BOTTOM);
        console.log("");
        // Final verdict
        const allCovered = untestedTools.length === 0;
        const allPassed = failureCount === 0;
        console.log(BOX_TOP);
        console.log("│ VERDICT │");
        console.log(BOX_RULE);
        if (allCovered && allPassed) {
            console.log("│ ✅ ALL TOOLS TESTED AND WORKING │");
            console.log("│ \"Yah it definitely works!\" │");
        }
        else if (allPassed) {
            console.log("│ ✅ ALL TESTED TOOLS WORKING │");
            console.log(boxRow(`│ ⚠ ${untestedTools.length} tools not covered in scenario tests (see gaps above)`));
        }
        else {
            console.log(boxRow(`│ ❌ ${failureCount} tool calls failed - investigate before shipping`));
        }
        console.log(BOX_BOTTOM);
        console.log("");
        // Assert minimum coverage
        expect(testedTools.size).toBeGreaterThan(35); // Strictly more than 35 distinct tools must be exercised
        expect(untestedTools.length).toBe(0); // All non-external tools should be tested
        expect(failureCount).toBe(0); // No failures allowed
    });
});
1107
- //# sourceMappingURL=evalHarness.test.js.map