nodebench-mcp 2.0.0 → 2.2.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/README.md +81 -1
- package/dist/__tests__/evalHarness.test.js +5 -0
- package/dist/__tests__/evalHarness.test.js.map +1 -1
- package/dist/__tests__/tools.test.js +138 -3
- package/dist/__tests__/tools.test.js.map +1 -1
- package/dist/__tests__/toolsetGatingEval.test.d.ts +1 -0
- package/dist/__tests__/toolsetGatingEval.test.js +1031 -0
- package/dist/__tests__/toolsetGatingEval.test.js.map +1 -0
- package/dist/index.js +99 -20
- package/dist/index.js.map +1 -1
- package/dist/tools/evalTools.js +135 -0
- package/dist/tools/evalTools.js.map +1 -1
- package/dist/tools/llmTools.d.ts +11 -0
- package/dist/tools/llmTools.js +251 -0
- package/dist/tools/llmTools.js.map +1 -0
- package/dist/tools/metaTools.js +11 -0
- package/dist/tools/metaTools.js.map +1 -1
- package/dist/tools/securityTools.d.ts +10 -0
- package/dist/tools/securityTools.js +338 -0
- package/dist/tools/securityTools.js.map +1 -0
- package/package.json +9 -3
|
@@ -0,0 +1,1031 @@
|
|
|
1
|
+
/**
|
|
2
|
+
* Toolset Gating Evaluation — Real Trajectory Comparison
|
|
3
|
+
*
|
|
4
|
+
* Runs 9 diverse real-world scenarios through lite, core, and full presets.
|
|
5
|
+
* Scenario categories inspired by SWE-bench Pro, GAIA, TAU-bench, MCP-AgentBench,
|
|
6
|
+
* and real tasks from the nodebench-ai codebase.
|
|
7
|
+
*
|
|
8
|
+
* Categories:
|
|
9
|
+
* - Bug fix (model fallback, cron lifecycle)
|
|
10
|
+
* - Feature implementation (governance appeal, OAuth token rotation)
|
|
11
|
+
* - Refactoring (cross-branch dedup reconciliation)
|
|
12
|
+
* - Multi-agent coordination (parallel pipeline refactor, swarm state isolation)
|
|
13
|
+
* - Deployment / canary (model canary rollout)
|
|
14
|
+
* - Performance (query optimization)
|
|
15
|
+
*
|
|
16
|
+
* Measures:
|
|
17
|
+
* - Which phases complete vs fail (tool not found)
|
|
18
|
+
* - Concrete impact delta between presets
|
|
19
|
+
* - Token surface area reduction (tool count × estimated schema tokens)
|
|
20
|
+
* - Whether lite/core catch enough per scenario category
|
|
21
|
+
*
|
|
22
|
+
* This answers: "If I gate to --preset lite, what do I lose per scenario type?"
|
|
23
|
+
*/
|
|
24
|
+
import { describe, it, expect, afterAll } from "vitest";
|
|
25
|
+
import { verificationTools } from "../tools/verificationTools.js";
|
|
26
|
+
import { reconTools } from "../tools/reconTools.js";
|
|
27
|
+
import { evalTools } from "../tools/evalTools.js";
|
|
28
|
+
import { qualityGateTools } from "../tools/qualityGateTools.js";
|
|
29
|
+
import { flywheelTools } from "../tools/flywheelTools.js";
|
|
30
|
+
import { learningTools } from "../tools/learningTools.js";
|
|
31
|
+
import { agentBootstrapTools } from "../tools/agentBootstrapTools.js";
|
|
32
|
+
import { selfEvalTools } from "../tools/selfEvalTools.js";
|
|
33
|
+
import { parallelAgentTools } from "../tools/parallelAgentTools.js";
|
|
34
|
+
import { uiCaptureTools } from "../tools/uiCaptureTools.js";
|
|
35
|
+
import { visionTools } from "../tools/visionTools.js";
|
|
36
|
+
import { webTools } from "../tools/webTools.js";
|
|
37
|
+
import { githubTools } from "../tools/githubTools.js";
|
|
38
|
+
import { documentationTools } from "../tools/documentationTools.js";
|
|
39
|
+
import { localFileTools } from "../tools/localFileTools.js";
|
|
40
|
+
import { createMetaTools } from "../tools/metaTools.js";
|
|
41
|
+
// ═══════════════════════════════════════════════════════════════════════════
|
|
42
|
+
// PRESET DEFINITIONS (mirrors index.ts TOOLSET_MAP + PRESETS exactly)
|
|
43
|
+
// ═══════════════════════════════════════════════════════════════════════════
|
|
44
|
+
// Toolset key → exported tool array. Mirrors TOOLSET_MAP in index.ts (per the
// banner comment above) — keep the two in sync by hand.
// NOTE: key ORDER is load-bearing: PRESETS.full is built from
// Object.keys(TOOLSET_MAP), so reordering keys reorders the "full" toolset.
const TOOLSET_MAP = {
    verification: verificationTools,
    eval: evalTools,
    quality_gate: qualityGateTools,
    learning: learningTools,
    flywheel: flywheelTools,
    recon: reconTools,
    ui_capture: uiCaptureTools,
    vision: visionTools,
    local_file: localFileTools,
    web: webTools,
    github: githubTools,
    docs: documentationTools,
    bootstrap: agentBootstrapTools,
    self_eval: selfEvalTools,
    parallel: parallelAgentTools,
};
|
|
61
|
+
// Preset name → list of TOOLSET_MAP keys enabled under that preset.
// lite ⊂ core ⊂ full: core adds flywheel/bootstrap/self_eval to lite, and
// full enables every registered toolset (derived from TOOLSET_MAP key order).
const PRESETS = {
    lite: ["verification", "eval", "quality_gate", "learning", "recon"],
    core: ["verification", "eval", "quality_gate", "learning", "flywheel", "recon", "bootstrap", "self_eval"],
    full: Object.keys(TOOLSET_MAP),
};
|
|
66
|
+
/**
 * Build the complete tool list for a preset: the domain tools of every
 * toolset key in PRESETS[preset], followed by the meta tools generated over
 * that domain (mirrors how index.ts assembles the served toolset).
 *
 * @param {"lite"|"core"|"full"} preset - Name of a key in PRESETS.
 * @returns {Array<object>} Domain tools first, then meta tools.
 * @throws {Error} If `preset` is not a known preset name. (Previously an
 *   unknown preset fell through to a cryptic `keys.flatMap is not a function`
 *   TypeError; fail loudly with the valid options instead.)
 */
function buildToolset(preset) {
    const keys = PRESETS[preset];
    if (keys === undefined) {
        throw new Error(`Unknown preset "${preset}" — expected one of: ${Object.keys(PRESETS).join(", ")}`);
    }
    // Unknown toolset keys resolve to [] so a stale preset entry degrades
    // gracefully instead of injecting `undefined` into the tool list.
    const domain = keys.flatMap((k) => TOOLSET_MAP[k] ?? []);
    return [...domain, ...createMetaTools(domain)];
}
|
|
71
|
+
// ═══════════════════════════════════════════════════════════════════════════
|
|
72
|
+
// 9 DIVERSE SCENARIOS — from actual production codebase
|
|
73
|
+
// Categories: bug_fix, feature, refactor, operational, security, performance, deployment
|
|
74
|
+
// Inspired by SWE-bench Pro, GAIA, TAU-bench, MCP-AgentBench scenario diversity
|
|
75
|
+
// ═══════════════════════════════════════════════════════════════════════════
|
|
76
|
+
// Scenario fixture records. Shape of each entry:
//   id          — unique slug; embedded in cycle/eval/task identifiers downstream
//   prompt      — the task an agent would receive (sliced substrings feed tool args)
//   domain      — product-area label
//   category    — bug_fix | feature | security | refactor | operational | deployment | performance
//   complexity  — "medium" | "high"; "high" drives extra recon findings, gaps,
//                 and quality-gate rules in the trajectory runner
//   blindSpots  — known subtle issues; reused as recon findings, logged gaps,
//                 and learning content
const SCENARIOS = [
    // ─── Bug Fix ───
    {
        id: "model-fallback-chain",
        prompt: "The free model resolver isn't falling back correctly. When glm-4-flash-250414 returns 429, we should try the next model in the chain but instead the agent just errors out. Fix executeWithModelFallback in modelResolver.ts.",
        domain: "Model Resolution",
        category: "bug_fix",
        complexity: "medium",
        blindSpots: [
            "Fallback chain doesn't skip models that returned 429 in the last 5 minutes",
            "No exponential backoff — retries slam the rate-limited endpoint immediately",
            "Missing telemetry: which model actually served the response is never logged",
        ],
    },
    {
        id: "digest-cron-silent-fail",
        prompt: "The daily digest agent hasn't produced output in 4 days. No errors in logs. Is the cron firing? Is the heartbeat blocking? Trace the full lifecycle from crons.ts through digestAgent.ts.",
        domain: "Agent Loop",
        category: "bug_fix",
        complexity: "high",
        blindSpots: [
            "Heartbeat rate limiting silently returns success but blocks execution",
            "listAgents returns empty if no agents have 'active' status in DB",
            "No timeout on executeAgentWorkCycle — hung LLM call stalls entire cron tick",
        ],
    },
    // ─── Feature Implementation ───
    {
        id: "governance-appeal-workflow",
        prompt: "We have quarantine.ts to pause misbehaving agents, but no way for them to appeal or auto-remediate. Build a system where agents can request trust tier upgrades after 7 days without incidents, with human-in-the-loop appeal review.",
        domain: "Governance",
        category: "feature",
        complexity: "high",
        blindSpots: [
            "Appeal versioning & history — no table for tracking appeal requests, success rates, or preventing appeal spam",
            "Trust score decay logic — static TRUST_TIER_SCORES with no time-weighted rebuild from incident-free periods",
            "Cross-domain impact — lifting quarantine for post_to_linkedin should sync across allowedTools and allowedChannels without manual intervention",
        ],
    },
    {
        id: "oauth-token-rotation",
        prompt: "LinkedIn tokens expire in 60 days. We refresh proactively 7 days before, but if the refresh fails and we have no refresh_token, posting just silently fails. Build a proper fallback: system token → user token → expired-but-retry, with alerting.",
        domain: "LinkedIn Pipeline",
        category: "security",
        complexity: "medium",
        blindSpots: [
            "Token state machine missing — code checks boolean 'is expired', should model: valid → expiring_soon → expired_can_refresh → expired_final → requires_reauth",
            "Retry budget exhaustion — if refresh fails 5x, should escalate alert severity, not just log",
            "Scope reduction fallback — if full refresh fails, fall back to posting-only scope (LinkedIn API supports it), not all-or-nothing failure",
        ],
    },
    // ─── Refactoring ───
    {
        id: "dd-cross-branch-dedup",
        prompt: "Due diligence spawns 5 parallel branches (company, team, market, technical, regulatory), but results are full of contradictions: Team branch says founder left, Market branch says he's still there. Build cross-branch verification that detects and auto-resolves contradictions by source reliability.",
        domain: "Due Diligence",
        category: "refactor",
        complexity: "high",
        blindSpots: [
            "Entity linking across branches — Team extracts 'founder: John Smith', Market extracts 'CEO: John Smith Jr.' — needs fuzzy matching not naive string dedup",
            "Source reliability weighting — contradiction between LinkedIn (primary) and archived tweet (secondary) should favor LinkedIn; SourceReliability enum exists but not used in conflict resolution",
            "Partial confidence updates — resolving contradiction should update original branch confidence score, not return flat contradiction list",
        ],
    },
    // ─── Multi-Agent Coordination ───
    {
        id: "linkedin-parallel-refactor",
        prompt: "I need to refactor the LinkedIn posting pipeline so 3 Claude Code subagents can work on it in parallel: one on posting, one on archive dedup, one on scheduling. They keep overwriting each other's changes. Set up coordination.",
        domain: "LinkedIn Pipeline",
        category: "operational",
        complexity: "high",
        blindSpots: [
            "No task claiming — both agents see the same dedup bug and both fix it",
            "No progress file — third agent re-investigates what agent 1 already solved",
            "No context budget tracking — agent 2 hits context limit mid-fix, loses work",
            "No oracle comparison — merged output has conflict markers nobody catches",
        ],
    },
    {
        id: "swarm-state-isolation",
        prompt: "We spawn parallel subagents (DocumentAgent, MediaAgent, OpenBBAgent) in swarmOrchestrator. Sometimes they step on each other's messages in the same thread — both write to threadId=X simultaneously. Build proper message locking so agents don't clobber each other's outputs.",
        domain: "Agent Loop",
        category: "operational",
        complexity: "high",
        blindSpots: [
            "Message ordering guarantees — parallel agents write to same thread; if DocumentAgent finishes before MediaAgent, message order is wrong in UI",
            "Checkpoint contention — CheckpointManager.start() may lose concurrent updates from multiple agents despite Convex OCC",
            "Partial failure recovery — if one agent crashes after checkpoint but before writing final message, next agent doesn't know to re-read context",
        ],
    },
    // ─── Deployment / Canary ───
    {
        id: "model-canary-rollout",
        prompt: "We hardcoded model selection in autonomousConfig.ts (SYNTHESIS_MODEL = 'qwen3-coder-free'). Implement canary rollout: test new models on 10% of jobs, track quality, auto-promote to 100% if success rate > 95%, auto-rollback if < 80%.",
        domain: "Model Resolution",
        category: "deployment",
        complexity: "medium",
        blindSpots: [
            "Canary slot assignment — need deterministic hash of job ID (hash(jobId) % 100 < canaryPercent), not random, so same job never switches models mid-retry",
            "Success metric definition — 'success rate' is ambiguous: tool error rate? output quality? latency? Need multi-factor gate with independent thresholds",
            "Model state drift — rolling back from Model-B to Model-A but old jobs cached with Model-B responses; resuming from checkpoint confuses model_id",
        ],
    },
    // ─── Performance ───
    {
        id: "archive-query-optimization",
        prompt: "The LinkedIn archive page takes 8 seconds to load for companies with 500+ posts. The query does a full table scan with JS-side filtering and .take(500) pagination. Optimize with proper indexes, cursor-based pagination, and server-side filtering.",
        domain: "LinkedIn Pipeline",
        category: "performance",
        complexity: "medium",
        blindSpots: [
            "Archive lookback uses .take(500) with no cursor — page 2 re-scans rows from page 1, O(n^2) total reads",
            "JS-side filtering of personaType and contentSource happens after fetching all rows — should be index-based",
            "Dedup hash (cyrb53) is computed on every query, not stored as indexed column — can't deduplicate at DB level",
        ],
    },
];
|
|
193
|
+
// ═══════════════════════════════════════════════════════════════════════════
|
|
194
|
+
// TRAJECTORY RUNNER
|
|
195
|
+
// ═══════════════════════════════════════════════════════════════════════════
|
|
196
|
+
// Shared across all trajectories: every attempted tool call is appended here
// with a status of "success" | "error" | "missing" for post-run analysis.
const callLog = [];
// IDs of artifacts created during runs (verification cycles, learning keys),
// collected for teardown — presumably consumed by an afterAll hook later in
// the file (hook not visible in this chunk).
const cleanup = { cycleIds: [], learningKeys: [] };
|
|
198
|
+
async function runTrajectory(preset, scenario) {
|
|
199
|
+
const tools = buildToolset(preset);
|
|
200
|
+
const toolMap = new Map(tools.map((t) => [t.name, t]));
|
|
201
|
+
const sid = `gating-${preset}-${scenario.id}`;
|
|
202
|
+
const trajectory = {
|
|
203
|
+
preset,
|
|
204
|
+
scenarioId: scenario.id,
|
|
205
|
+
toolCount: tools.length,
|
|
206
|
+
estimatedSchemaTokens: tools.length * 200, // ~200 tokens per tool schema avg
|
|
207
|
+
phases: [],
|
|
208
|
+
phasesCompleted: 0,
|
|
209
|
+
phasesSkipped: 0,
|
|
210
|
+
totalToolCalls: 0,
|
|
211
|
+
issuesDetected: 0,
|
|
212
|
+
reconFindings: 0,
|
|
213
|
+
evalCases: 0,
|
|
214
|
+
gateRules: 0,
|
|
215
|
+
learningRecorded: false,
|
|
216
|
+
flywheelComplete: false,
|
|
217
|
+
riskAssessed: false,
|
|
218
|
+
};
|
|
219
|
+
async function tryCall(name, args, phase) {
|
|
220
|
+
const tool = toolMap.get(name);
|
|
221
|
+
if (!tool) {
|
|
222
|
+
callLog.push({ preset, scenario: scenario.id, tool: name, phase, status: "missing" });
|
|
223
|
+
return null; // tool not available in this preset
|
|
224
|
+
}
|
|
225
|
+
try {
|
|
226
|
+
const result = await tool.handler(args);
|
|
227
|
+
callLog.push({ preset, scenario: scenario.id, tool: name, phase, status: "success" });
|
|
228
|
+
trajectory.totalToolCalls++;
|
|
229
|
+
return result;
|
|
230
|
+
}
|
|
231
|
+
catch (err) {
|
|
232
|
+
callLog.push({ preset, scenario: scenario.id, tool: name, phase, status: "error" });
|
|
233
|
+
trajectory.totalToolCalls++;
|
|
234
|
+
return null;
|
|
235
|
+
}
|
|
236
|
+
}
|
|
237
|
+
// ─── Phase 1: META — tool discovery ───
|
|
238
|
+
{
|
|
239
|
+
const called = [];
|
|
240
|
+
const missing = [];
|
|
241
|
+
const ft = await tryCall("findTools", { query: `${scenario.domain} ${scenario.category}` }, "meta");
|
|
242
|
+
ft ? called.push("findTools") : missing.push("findTools");
|
|
243
|
+
const gm = await tryCall("getMethodology", { topic: "verification" }, "meta");
|
|
244
|
+
gm ? called.push("getMethodology") : missing.push("getMethodology");
|
|
245
|
+
const success = called.length > 0;
|
|
246
|
+
trajectory.phases.push({ phase: "meta", toolsCalled: called, toolsMissing: missing, success });
|
|
247
|
+
if (success)
|
|
248
|
+
trajectory.phasesCompleted++;
|
|
249
|
+
else
|
|
250
|
+
trajectory.phasesSkipped++;
|
|
251
|
+
}
|
|
252
|
+
// ─── Phase 2: RECON — structured research ───
|
|
253
|
+
{
|
|
254
|
+
const called = [];
|
|
255
|
+
const missing = [];
|
|
256
|
+
const recon = await tryCall("run_recon", {
|
|
257
|
+
target: `${scenario.domain}: ${scenario.prompt.slice(0, 80)}`,
|
|
258
|
+
description: `Gating eval: ${scenario.prompt.slice(0, 120)}`,
|
|
259
|
+
}, "recon");
|
|
260
|
+
if (recon) {
|
|
261
|
+
called.push("run_recon");
|
|
262
|
+
const findingCount = scenario.complexity === "high" ? 3 : 2;
|
|
263
|
+
for (let f = 0; f < findingCount; f++) {
|
|
264
|
+
const r = await tryCall("log_recon_finding", {
|
|
265
|
+
sessionId: recon.sessionId,
|
|
266
|
+
category: f === 0 ? "codebase_pattern" : "existing_implementation",
|
|
267
|
+
summary: scenario.blindSpots[f] || `Pattern in ${scenario.domain}`,
|
|
268
|
+
relevance: `Impacts: ${scenario.prompt.slice(0, 60)}`,
|
|
269
|
+
}, "recon");
|
|
270
|
+
if (r) {
|
|
271
|
+
called.push("log_recon_finding");
|
|
272
|
+
trajectory.reconFindings++;
|
|
273
|
+
}
|
|
274
|
+
}
|
|
275
|
+
await tryCall("get_recon_summary", { sessionId: recon.sessionId }, "recon");
|
|
276
|
+
called.push("get_recon_summary");
|
|
277
|
+
}
|
|
278
|
+
else {
|
|
279
|
+
missing.push("run_recon", "log_recon_finding", "get_recon_summary");
|
|
280
|
+
}
|
|
281
|
+
// Additional recon tools
|
|
282
|
+
const fwCheck = await tryCall("check_framework_updates", { ecosystem: "mcp" }, "recon");
|
|
283
|
+
fwCheck ? called.push("check_framework_updates") : missing.push("check_framework_updates");
|
|
284
|
+
const projCtx = await tryCall("get_project_context", {}, "recon");
|
|
285
|
+
projCtx ? called.push("get_project_context") : missing.push("get_project_context");
|
|
286
|
+
const success = called.length > 0;
|
|
287
|
+
trajectory.phases.push({ phase: "recon", toolsCalled: called, toolsMissing: missing, success });
|
|
288
|
+
if (success)
|
|
289
|
+
trajectory.phasesCompleted++;
|
|
290
|
+
else
|
|
291
|
+
trajectory.phasesSkipped++;
|
|
292
|
+
}
|
|
293
|
+
// ─── Phase 3: RISK — assessment ───
|
|
294
|
+
{
|
|
295
|
+
const called = [];
|
|
296
|
+
const missing = [];
|
|
297
|
+
const risk = await tryCall("assess_risk", {
|
|
298
|
+
action: "fix_implementation",
|
|
299
|
+
context: `${scenario.domain} — ${scenario.complexity} — ${scenario.prompt.slice(0, 80)}`,
|
|
300
|
+
}, "risk");
|
|
301
|
+
if (risk) {
|
|
302
|
+
called.push("assess_risk");
|
|
303
|
+
trajectory.riskAssessed = true;
|
|
304
|
+
}
|
|
305
|
+
else {
|
|
306
|
+
missing.push("assess_risk");
|
|
307
|
+
}
|
|
308
|
+
const success = called.length > 0;
|
|
309
|
+
trajectory.phases.push({ phase: "risk", toolsCalled: called, toolsMissing: missing, success });
|
|
310
|
+
if (success)
|
|
311
|
+
trajectory.phasesCompleted++;
|
|
312
|
+
else
|
|
313
|
+
trajectory.phasesSkipped++;
|
|
314
|
+
}
|
|
315
|
+
// ─── Phase 4: VERIFICATION — tracked implementation cycle ───
|
|
316
|
+
{
|
|
317
|
+
const called = [];
|
|
318
|
+
const missing = [];
|
|
319
|
+
const cycle = await tryCall("start_verification_cycle", {
|
|
320
|
+
title: `gating-eval-${preset}-${scenario.id}`,
|
|
321
|
+
description: scenario.prompt.slice(0, 200),
|
|
322
|
+
}, "verification");
|
|
323
|
+
if (cycle) {
|
|
324
|
+
called.push("start_verification_cycle");
|
|
325
|
+
cleanup.cycleIds.push(cycle.cycleId);
|
|
326
|
+
// Phase findings
|
|
327
|
+
await tryCall("log_phase_findings", {
|
|
328
|
+
cycleId: cycle.cycleId, phaseNumber: 1, status: "passed",
|
|
329
|
+
findings: { domain: scenario.domain, reconFindings: trajectory.reconFindings },
|
|
330
|
+
}, "verification");
|
|
331
|
+
called.push("log_phase_findings");
|
|
332
|
+
await tryCall("log_phase_findings", {
|
|
333
|
+
cycleId: cycle.cycleId, phaseNumber: 2, status: "passed",
|
|
334
|
+
findings: { fixApplied: true },
|
|
335
|
+
}, "verification");
|
|
336
|
+
// Log gaps from blind spots
|
|
337
|
+
const gapCount = scenario.complexity === "high" ? 2 : 1;
|
|
338
|
+
const gapIds = [];
|
|
339
|
+
for (let g = 0; g < gapCount; g++) {
|
|
340
|
+
const gap = await tryCall("log_gap", {
|
|
341
|
+
cycleId: cycle.cycleId,
|
|
342
|
+
severity: g === 0 ? (scenario.complexity === "high" ? "HIGH" : "MEDIUM") : "MEDIUM",
|
|
343
|
+
title: `gating-eval-${scenario.blindSpots[g]?.slice(0, 50) || scenario.id}`,
|
|
344
|
+
description: scenario.blindSpots[g] || `Issue in ${scenario.domain}`,
|
|
345
|
+
rootCause: "Discovered via structured recon",
|
|
346
|
+
fixStrategy: `Fix ${scenario.category} in ${scenario.domain}`,
|
|
347
|
+
}, "verification");
|
|
348
|
+
if (gap) {
|
|
349
|
+
called.push("log_gap");
|
|
350
|
+
gapIds.push(gap.gapId);
|
|
351
|
+
trajectory.issuesDetected++;
|
|
352
|
+
}
|
|
353
|
+
}
|
|
354
|
+
// Resolve gaps
|
|
355
|
+
for (const gapId of gapIds) {
|
|
356
|
+
await tryCall("resolve_gap", { gapId }, "verification");
|
|
357
|
+
called.push("resolve_gap");
|
|
358
|
+
}
|
|
359
|
+
// 3-layer testing
|
|
360
|
+
for (const layer of ["static", "unit", "integration"]) {
|
|
361
|
+
await tryCall("log_test_result", {
|
|
362
|
+
cycleId: cycle.cycleId, layer,
|
|
363
|
+
label: `gating-eval-${preset}-${scenario.id}-${layer}`,
|
|
364
|
+
passed: true, output: `${layer} pass`,
|
|
365
|
+
}, "verification");
|
|
366
|
+
called.push("log_test_result");
|
|
367
|
+
}
|
|
368
|
+
// Check status and list cycles
|
|
369
|
+
const status = await tryCall("get_verification_status", { cycleId: cycle.cycleId }, "verification");
|
|
370
|
+
if (status)
|
|
371
|
+
called.push("get_verification_status");
|
|
372
|
+
const cycleList = await tryCall("list_verification_cycles", { limit: 5 }, "verification");
|
|
373
|
+
if (cycleList)
|
|
374
|
+
called.push("list_verification_cycles");
|
|
375
|
+
}
|
|
376
|
+
else {
|
|
377
|
+
missing.push("start_verification_cycle", "log_gap", "log_test_result", "get_verification_status", "list_verification_cycles");
|
|
378
|
+
}
|
|
379
|
+
const success = called.length > 0;
|
|
380
|
+
trajectory.phases.push({ phase: "verification", toolsCalled: called, toolsMissing: missing, success });
|
|
381
|
+
if (success)
|
|
382
|
+
trajectory.phasesCompleted++;
|
|
383
|
+
else
|
|
384
|
+
trajectory.phasesSkipped++;
|
|
385
|
+
}
|
|
386
|
+
// ─── Phase 5: EVAL — regression cases ───
|
|
387
|
+
{
|
|
388
|
+
const called = [];
|
|
389
|
+
const missing = [];
|
|
390
|
+
const evalRun = await tryCall("start_eval_run", {
|
|
391
|
+
name: `gating-eval-${preset}-${scenario.id}`,
|
|
392
|
+
description: `Regression eval for ${scenario.domain}`,
|
|
393
|
+
cases: [
|
|
394
|
+
{ input: scenario.prompt.slice(0, 100), intent: `Verify ${scenario.category} fix` },
|
|
395
|
+
{ input: `Regression guard for ${scenario.id}`, intent: `Prevent regression` },
|
|
396
|
+
],
|
|
397
|
+
}, "eval");
|
|
398
|
+
if (evalRun) {
|
|
399
|
+
called.push("start_eval_run");
|
|
400
|
+
for (const caseId of evalRun.caseIds) {
|
|
401
|
+
await tryCall("record_eval_result", {
|
|
402
|
+
caseId, actual: "Fix verified", verdict: "pass", score: 0.92,
|
|
403
|
+
}, "eval");
|
|
404
|
+
called.push("record_eval_result");
|
|
405
|
+
trajectory.evalCases++;
|
|
406
|
+
}
|
|
407
|
+
await tryCall("complete_eval_run", { runId: evalRun.runId }, "eval");
|
|
408
|
+
called.push("complete_eval_run");
|
|
409
|
+
// List and compare runs
|
|
410
|
+
const runList = await tryCall("list_eval_runs", { limit: 5 }, "eval");
|
|
411
|
+
if (runList)
|
|
412
|
+
called.push("list_eval_runs");
|
|
413
|
+
// Compare with self (validates the tool works even if baseline === candidate)
|
|
414
|
+
const cmp = await tryCall("compare_eval_runs", {
|
|
415
|
+
baselineRunId: evalRun.runId,
|
|
416
|
+
candidateRunId: evalRun.runId,
|
|
417
|
+
}, "eval");
|
|
418
|
+
if (cmp)
|
|
419
|
+
called.push("compare_eval_runs");
|
|
420
|
+
}
|
|
421
|
+
else {
|
|
422
|
+
missing.push("start_eval_run", "record_eval_result", "complete_eval_run", "list_eval_runs", "compare_eval_runs");
|
|
423
|
+
}
|
|
424
|
+
const success = called.length > 0;
|
|
425
|
+
trajectory.phases.push({ phase: "eval", toolsCalled: called, toolsMissing: missing, success });
|
|
426
|
+
if (success)
|
|
427
|
+
trajectory.phasesCompleted++;
|
|
428
|
+
else
|
|
429
|
+
trajectory.phasesSkipped++;
|
|
430
|
+
}
|
|
431
|
+
// ─── Phase 6: QUALITY GATE ───
|
|
432
|
+
{
|
|
433
|
+
const called = [];
|
|
434
|
+
const missing = [];
|
|
435
|
+
const rules = [
|
|
436
|
+
{ name: "all_tests_pass", passed: true },
|
|
437
|
+
{ name: "no_type_errors", passed: true },
|
|
438
|
+
{ name: "no_lint_violations", passed: true },
|
|
439
|
+
{ name: "coverage_threshold", passed: scenario.complexity !== "high" },
|
|
440
|
+
];
|
|
441
|
+
if (scenario.complexity === "high") {
|
|
442
|
+
rules.push({ name: "regression_cases_exist", passed: true });
|
|
443
|
+
rules.push({ name: "edge_cases_covered", passed: true });
|
|
444
|
+
}
|
|
445
|
+
const gate = await tryCall("run_quality_gate", {
|
|
446
|
+
gateName: "deploy_readiness",
|
|
447
|
+
target: `gating-eval-${preset}-${scenario.id}`,
|
|
448
|
+
rules,
|
|
449
|
+
}, "quality-gate");
|
|
450
|
+
if (gate) {
|
|
451
|
+
called.push("run_quality_gate");
|
|
452
|
+
trajectory.gateRules = rules.length;
|
|
453
|
+
}
|
|
454
|
+
else {
|
|
455
|
+
missing.push("run_quality_gate");
|
|
456
|
+
}
|
|
457
|
+
const cl = await tryCall("run_closed_loop", {
|
|
458
|
+
steps: [{ step: "compile", passed: true }, { step: "lint", passed: true }, { step: "test", passed: true }],
|
|
459
|
+
}, "quality-gate");
|
|
460
|
+
if (cl)
|
|
461
|
+
called.push("run_closed_loop");
|
|
462
|
+
else
|
|
463
|
+
missing.push("run_closed_loop");
|
|
464
|
+
// Gate preset and history
|
|
465
|
+
const gp = await tryCall("get_gate_preset", { preset: "deploy_readiness" }, "quality-gate");
|
|
466
|
+
gp ? called.push("get_gate_preset") : missing.push("get_gate_preset");
|
|
467
|
+
const gh = await tryCall("get_gate_history", {
|
|
468
|
+
gateName: "deploy_readiness",
|
|
469
|
+
limit: 5,
|
|
470
|
+
}, "quality-gate");
|
|
471
|
+
gh ? called.push("get_gate_history") : missing.push("get_gate_history");
|
|
472
|
+
const success = called.length > 0;
|
|
473
|
+
trajectory.phases.push({ phase: "quality-gate", toolsCalled: called, toolsMissing: missing, success });
|
|
474
|
+
if (success)
|
|
475
|
+
trajectory.phasesCompleted++;
|
|
476
|
+
else
|
|
477
|
+
trajectory.phasesSkipped++;
|
|
478
|
+
}
|
|
479
|
+
// ─── Phase 7: KNOWLEDGE — search + record ───
|
|
480
|
+
{
|
|
481
|
+
const called = [];
|
|
482
|
+
const missing = [];
|
|
483
|
+
const prior = await tryCall("search_all_knowledge", {
|
|
484
|
+
query: `gating ${scenario.domain}`,
|
|
485
|
+
}, "knowledge");
|
|
486
|
+
if (prior)
|
|
487
|
+
called.push("search_all_knowledge");
|
|
488
|
+
else
|
|
489
|
+
missing.push("search_all_knowledge");
|
|
490
|
+
const lkey = `gating-eval-${preset}-${scenario.id}-${Date.now()}`;
|
|
491
|
+
cleanup.learningKeys.push(lkey);
|
|
492
|
+
const lr = await tryCall("record_learning", {
|
|
493
|
+
key: lkey,
|
|
494
|
+
category: "pattern",
|
|
495
|
+
content: `[gating-eval] ${scenario.domain}: ${scenario.blindSpots[0]?.slice(0, 80)}`,
|
|
496
|
+
tags: ["gating-eval", preset, scenario.domain.toLowerCase().replace(/\s+/g, "-")],
|
|
497
|
+
}, "knowledge");
|
|
498
|
+
if (lr) {
|
|
499
|
+
called.push("record_learning");
|
|
500
|
+
trajectory.learningRecorded = true;
|
|
501
|
+
}
|
|
502
|
+
else
|
|
503
|
+
missing.push("record_learning");
|
|
504
|
+
const success = called.length > 0;
|
|
505
|
+
trajectory.phases.push({ phase: "knowledge", toolsCalled: called, toolsMissing: missing, success });
|
|
506
|
+
if (success)
|
|
507
|
+
trajectory.phasesCompleted++;
|
|
508
|
+
else
|
|
509
|
+
trajectory.phasesSkipped++;
|
|
510
|
+
}
|
|
511
|
+
// ─── Phase 8: FLYWHEEL — mandatory 6-step ───
|
|
512
|
+
{
|
|
513
|
+
const called = [];
|
|
514
|
+
const missing = [];
|
|
515
|
+
const fw = await tryCall("run_mandatory_flywheel", {
|
|
516
|
+
target: `gating-eval-${preset}-${scenario.id}`,
|
|
517
|
+
steps: [
|
|
518
|
+
{ stepName: "static_analysis", passed: true },
|
|
519
|
+
{ stepName: "happy_path_test", passed: true },
|
|
520
|
+
{ stepName: "failure_path_test", passed: true },
|
|
521
|
+
{ stepName: "gap_analysis", passed: true },
|
|
522
|
+
{ stepName: "fix_and_reverify", passed: true },
|
|
523
|
+
{ stepName: "deploy_and_document", passed: true },
|
|
524
|
+
],
|
|
525
|
+
}, "flywheel");
|
|
526
|
+
if (fw) {
|
|
527
|
+
called.push("run_mandatory_flywheel");
|
|
528
|
+
trajectory.flywheelComplete = fw.passed === true;
|
|
529
|
+
}
|
|
530
|
+
else {
|
|
531
|
+
missing.push("run_mandatory_flywheel");
|
|
532
|
+
}
|
|
533
|
+
// Flywheel status check
|
|
534
|
+
const fwStatus = await tryCall("get_flywheel_status", { includeHistory: false }, "flywheel");
|
|
535
|
+
fwStatus ? called.push("get_flywheel_status") : missing.push("get_flywheel_status");
|
|
536
|
+
// Promote to eval (needs a real cycleId from phase 4)
|
|
537
|
+
const cycleId = cleanup.cycleIds[cleanup.cycleIds.length - 1];
|
|
538
|
+
if (cycleId) {
|
|
539
|
+
const promo = await tryCall("promote_to_eval", {
|
|
540
|
+
cycleId,
|
|
541
|
+
evalRunName: `gating-promoted-${preset}-${scenario.id}`,
|
|
542
|
+
cases: [{ input: scenario.prompt.slice(0, 80), intent: `Regression guard for ${scenario.domain}` }],
|
|
543
|
+
}, "flywheel");
|
|
544
|
+
promo ? called.push("promote_to_eval") : missing.push("promote_to_eval");
|
|
545
|
+
// Trigger investigation (needs evalRunId from promotion)
|
|
546
|
+
if (promo?.evalRunId) {
|
|
547
|
+
const inv = await tryCall("trigger_investigation", {
|
|
548
|
+
evalRunId: promo.evalRunId,
|
|
549
|
+
regressionDescription: `Potential regression in ${scenario.domain}: ${scenario.blindSpots[0]?.slice(0, 60)}`,
|
|
550
|
+
}, "flywheel");
|
|
551
|
+
inv ? called.push("trigger_investigation") : missing.push("trigger_investigation");
|
|
552
|
+
}
|
|
553
|
+
}
|
|
554
|
+
const success = called.length > 0;
|
|
555
|
+
trajectory.phases.push({ phase: "flywheel", toolsCalled: called, toolsMissing: missing, success });
|
|
556
|
+
if (success)
|
|
557
|
+
trajectory.phasesCompleted++;
|
|
558
|
+
else
|
|
559
|
+
trajectory.phasesSkipped++;
|
|
560
|
+
}
|
|
561
|
+
// ─── Phase 9 (operational scenarios): PARALLEL AGENT TOOLS ───
|
|
562
|
+
if (scenario.category === "operational") {
|
|
563
|
+
const called = [];
|
|
564
|
+
const missing = [];
|
|
565
|
+
// Bootstrap parallel session
|
|
566
|
+
const bootstrap = await tryCall("bootstrap_parallel_agents", {
|
|
567
|
+
dryRun: true,
|
|
568
|
+
}, "parallel");
|
|
569
|
+
bootstrap ? called.push("bootstrap_parallel_agents") : missing.push("bootstrap_parallel_agents");
|
|
570
|
+
const claim = await tryCall("claim_agent_task", {
|
|
571
|
+
taskKey: `gating-eval-${preset}-${scenario.id}-posting`,
|
|
572
|
+
description: "Refactor LinkedIn posting module",
|
|
573
|
+
}, "parallel");
|
|
574
|
+
if (claim) {
|
|
575
|
+
called.push("claim_agent_task");
|
|
576
|
+
await tryCall("assign_agent_role", {
|
|
577
|
+
role: "implementer", focusArea: "posting",
|
|
578
|
+
}, "parallel");
|
|
579
|
+
called.push("assign_agent_role");
|
|
580
|
+
// Verify role assignment
|
|
581
|
+
const role = await tryCall("get_agent_role", {}, "parallel");
|
|
582
|
+
role ? called.push("get_agent_role") : missing.push("get_agent_role");
|
|
583
|
+
// Log context budget during work
|
|
584
|
+
const budget = await tryCall("log_context_budget", {
|
|
585
|
+
eventType: "tool_output",
|
|
586
|
+
tokensUsed: 3500,
|
|
587
|
+
description: `Phase output for ${scenario.id}`,
|
|
588
|
+
}, "parallel");
|
|
589
|
+
budget ? called.push("log_context_budget") : missing.push("log_context_budget");
|
|
590
|
+
// List tasks to verify claim
|
|
591
|
+
const taskList = await tryCall("list_agent_tasks", { status: "claimed" }, "parallel");
|
|
592
|
+
taskList ? called.push("list_agent_tasks") : missing.push("list_agent_tasks");
|
|
593
|
+
await tryCall("get_parallel_status", { includeHistory: false }, "parallel");
|
|
594
|
+
called.push("get_parallel_status");
|
|
595
|
+
// Oracle comparison — validate merged output
|
|
596
|
+
const oracle = await tryCall("run_oracle_comparison", {
|
|
597
|
+
testLabel: `gating-eval-${preset}-${scenario.id}-merge`,
|
|
598
|
+
actualOutput: `Fixed ${scenario.domain} posting module`,
|
|
599
|
+
expectedOutput: `Fixed ${scenario.domain} posting module`,
|
|
600
|
+
oracleSource: "gating-eval-reference",
|
|
601
|
+
}, "parallel");
|
|
602
|
+
oracle ? called.push("run_oracle_comparison") : missing.push("run_oracle_comparison");
|
|
603
|
+
await tryCall("release_agent_task", {
|
|
604
|
+
taskKey: `gating-eval-${preset}-${scenario.id}-posting`,
|
|
605
|
+
status: "completed",
|
|
606
|
+
progressNote: "Posting module refactored",
|
|
607
|
+
}, "parallel");
|
|
608
|
+
called.push("release_agent_task");
|
|
609
|
+
// Generate coordination doc
|
|
610
|
+
const agentsMd = await tryCall("generate_parallel_agents_md", {
|
|
611
|
+
projectName: `gating-eval-${scenario.id}`,
|
|
612
|
+
maxAgents: 3,
|
|
613
|
+
}, "parallel");
|
|
614
|
+
agentsMd ? called.push("generate_parallel_agents_md") : missing.push("generate_parallel_agents_md");
|
|
615
|
+
}
|
|
616
|
+
else {
|
|
617
|
+
missing.push("claim_agent_task", "assign_agent_role", "get_agent_role", "log_context_budget", "list_agent_tasks", "get_parallel_status", "run_oracle_comparison", "release_agent_task", "generate_parallel_agents_md");
|
|
618
|
+
}
|
|
619
|
+
trajectory.phases.push({
|
|
620
|
+
phase: "parallel",
|
|
621
|
+
toolsCalled: called,
|
|
622
|
+
toolsMissing: missing,
|
|
623
|
+
success: called.length > 0,
|
|
624
|
+
});
|
|
625
|
+
if (called.length > 0)
|
|
626
|
+
trajectory.phasesCompleted++;
|
|
627
|
+
else
|
|
628
|
+
trajectory.phasesSkipped++;
|
|
629
|
+
}
|
|
630
|
+
// ─── Phase 10: SELF-EVAL (all 6 tools) ───
|
|
631
|
+
{
|
|
632
|
+
const called = [];
|
|
633
|
+
const missing = [];
|
|
634
|
+
// Log a tool call for this trajectory
|
|
635
|
+
const logCall = await tryCall("log_tool_call", {
|
|
636
|
+
sessionId: sid,
|
|
637
|
+
toolName: "run_recon",
|
|
638
|
+
durationMs: 42,
|
|
639
|
+
resultStatus: "success",
|
|
640
|
+
phase: "recon",
|
|
641
|
+
}, "self-eval");
|
|
642
|
+
logCall ? called.push("log_tool_call") : missing.push("log_tool_call");
|
|
643
|
+
// Get trajectory analysis
|
|
644
|
+
const trajAnalysis = await tryCall("get_trajectory_analysis", {
|
|
645
|
+
sessionId: sid,
|
|
646
|
+
}, "self-eval");
|
|
647
|
+
trajAnalysis ? called.push("get_trajectory_analysis") : missing.push("get_trajectory_analysis");
|
|
648
|
+
// Get self-eval report
|
|
649
|
+
const report = await tryCall("get_self_eval_report", {
|
|
650
|
+
excludeTestSessions: true,
|
|
651
|
+
}, "self-eval");
|
|
652
|
+
report ? called.push("get_self_eval_report") : missing.push("get_self_eval_report");
|
|
653
|
+
// Get improvement recommendations
|
|
654
|
+
const recs = await tryCall("get_improvement_recommendations", {
|
|
655
|
+
focus: "all",
|
|
656
|
+
}, "self-eval");
|
|
657
|
+
recs ? called.push("get_improvement_recommendations") : missing.push("get_improvement_recommendations");
|
|
658
|
+
// Cleanup stale runs (dry run)
|
|
659
|
+
const staleCleanup = await tryCall("cleanup_stale_runs", {
|
|
660
|
+
dryRun: true,
|
|
661
|
+
}, "self-eval");
|
|
662
|
+
staleCleanup ? called.push("cleanup_stale_runs") : missing.push("cleanup_stale_runs");
|
|
663
|
+
// Synthesize recon to learnings (dry run)
|
|
664
|
+
const synth = await tryCall("synthesize_recon_to_learnings", {
|
|
665
|
+
dryRun: true,
|
|
666
|
+
}, "self-eval");
|
|
667
|
+
synth ? called.push("synthesize_recon_to_learnings") : missing.push("synthesize_recon_to_learnings");
|
|
668
|
+
if (called.length > 0 || missing.length > 0) {
|
|
669
|
+
trajectory.phases.push({
|
|
670
|
+
phase: "self-eval",
|
|
671
|
+
toolsCalled: called,
|
|
672
|
+
toolsMissing: missing,
|
|
673
|
+
success: called.length > 0,
|
|
674
|
+
});
|
|
675
|
+
if (called.length > 0)
|
|
676
|
+
trajectory.phasesCompleted++;
|
|
677
|
+
else
|
|
678
|
+
trajectory.phasesSkipped++;
|
|
679
|
+
}
|
|
680
|
+
}
|
|
681
|
+
return trajectory;
|
|
682
|
+
}
|
|
683
|
+
// ═══════════════════════════════════════════════════════════════════════════
|
|
684
|
+
// CLEANUP
|
|
685
|
+
// ═══════════════════════════════════════════════════════════════════════════
|
|
686
|
+
async function cleanupAll() {
|
|
687
|
+
const fullTools = buildToolset("full");
|
|
688
|
+
const findTool = (name) => fullTools.find((t) => t.name === name);
|
|
689
|
+
for (const cycleId of cleanup.cycleIds) {
|
|
690
|
+
try {
|
|
691
|
+
await findTool("abandon_cycle")?.handler({ cycleId, reason: "gating eval cleanup" });
|
|
692
|
+
}
|
|
693
|
+
catch { /* ok */ }
|
|
694
|
+
}
|
|
695
|
+
for (const key of cleanup.learningKeys) {
|
|
696
|
+
try {
|
|
697
|
+
await findTool("delete_learning")?.handler({ key });
|
|
698
|
+
}
|
|
699
|
+
catch { /* ok */ }
|
|
700
|
+
}
|
|
701
|
+
}
|
|
702
|
+
// ═══════════════════════════════════════════════════════════════════════════
|
|
703
|
+
// TESTS
|
|
704
|
+
// ═══════════════════════════════════════════════════════════════════════════
|
|
705
|
+
const allTrajectories = [];
|
|
706
|
+
describe("Toolset Gating Eval", () => {
|
|
707
|
+
afterAll(async () => { await cleanupAll(); });
|
|
708
|
+
for (const preset of ["lite", "core", "full"]) {
|
|
709
|
+
describe(`Preset: ${preset}`, () => {
|
|
710
|
+
for (const scenario of SCENARIOS) {
|
|
711
|
+
it(`${preset}/${scenario.id}: runs 8-phase pipeline`, async () => {
|
|
712
|
+
const t = await runTrajectory(preset, scenario);
|
|
713
|
+
allTrajectories.push(t);
|
|
714
|
+
// Core methodology phases should always work (lite, core, full all have these)
|
|
715
|
+
const metaPhase = t.phases.find((p) => p.phase === "meta");
|
|
716
|
+
expect(metaPhase?.success).toBe(true);
|
|
717
|
+
const reconPhase = t.phases.find((p) => p.phase === "recon");
|
|
718
|
+
expect(reconPhase?.success).toBe(true);
|
|
719
|
+
const verifyPhase = t.phases.find((p) => p.phase === "verification");
|
|
720
|
+
expect(verifyPhase?.success).toBe(true);
|
|
721
|
+
const evalPhase = t.phases.find((p) => p.phase === "eval");
|
|
722
|
+
expect(evalPhase?.success).toBe(true);
|
|
723
|
+
const gatePhase = t.phases.find((p) => p.phase === "quality-gate");
|
|
724
|
+
expect(gatePhase?.success).toBe(true);
|
|
725
|
+
// Knowledge phase depends on preset (learning tools in lite + core + full)
|
|
726
|
+
const knowledgePhase = t.phases.find((p) => p.phase === "knowledge");
|
|
727
|
+
expect(knowledgePhase?.success).toBe(true);
|
|
728
|
+
}, 30_000);
|
|
729
|
+
}
|
|
730
|
+
});
|
|
731
|
+
}
|
|
732
|
+
describe("Flywheel availability", () => {
|
|
733
|
+
it("lite preset does NOT have flywheel tools", () => {
|
|
734
|
+
const liteTrajectories = allTrajectories.filter((t) => t.preset === "lite");
|
|
735
|
+
for (const t of liteTrajectories) {
|
|
736
|
+
const fw = t.phases.find((p) => p.phase === "flywheel");
|
|
737
|
+
expect(fw?.success).toBe(false);
|
|
738
|
+
expect(fw?.toolsMissing).toContain("run_mandatory_flywheel");
|
|
739
|
+
}
|
|
740
|
+
});
|
|
741
|
+
it("core and full presets HAVE flywheel tools", () => {
|
|
742
|
+
const coreFullTrajectories = allTrajectories.filter((t) => t.preset !== "lite");
|
|
743
|
+
for (const t of coreFullTrajectories) {
|
|
744
|
+
expect(t.flywheelComplete).toBe(true);
|
|
745
|
+
}
|
|
746
|
+
});
|
|
747
|
+
});
|
|
748
|
+
describe("Parallel agent tools availability", () => {
|
|
749
|
+
it("lite and core do NOT have parallel tools", () => {
|
|
750
|
+
const parallelScenarios = allTrajectories.filter((t) => (t.scenarioId === "linkedin-parallel-refactor" || t.scenarioId === "swarm-state-isolation") && t.preset !== "full");
|
|
751
|
+
for (const t of parallelScenarios) {
|
|
752
|
+
const pp = t.phases.find((p) => p.phase === "parallel");
|
|
753
|
+
if (pp) {
|
|
754
|
+
expect(pp.success).toBe(false);
|
|
755
|
+
expect(pp.toolsMissing).toContain("claim_agent_task");
|
|
756
|
+
}
|
|
757
|
+
}
|
|
758
|
+
});
|
|
759
|
+
it("full preset HAS parallel tools for parallel scenarios", () => {
|
|
760
|
+
const fullParallel = allTrajectories.filter((t) => (t.scenarioId === "linkedin-parallel-refactor" || t.scenarioId === "swarm-state-isolation") && t.preset === "full");
|
|
761
|
+
for (const t of fullParallel) {
|
|
762
|
+
const pp = t.phases.find((p) => p.phase === "parallel");
|
|
763
|
+
if (pp) {
|
|
764
|
+
expect(pp.success).toBe(true);
|
|
765
|
+
expect(pp.toolsCalled).toContain("claim_agent_task");
|
|
766
|
+
}
|
|
767
|
+
}
|
|
768
|
+
});
|
|
769
|
+
});
|
|
770
|
+
describe("Self-eval availability", () => {
|
|
771
|
+
it("lite does NOT have self-eval tools", () => {
|
|
772
|
+
const liteTrajectories = allTrajectories.filter((t) => t.preset === "lite");
|
|
773
|
+
for (const t of liteTrajectories) {
|
|
774
|
+
const se = t.phases.find((p) => p.phase === "self-eval");
|
|
775
|
+
if (se)
|
|
776
|
+
expect(se.success).toBe(false);
|
|
777
|
+
}
|
|
778
|
+
});
|
|
779
|
+
it("core and full HAVE self-eval tools", () => {
|
|
780
|
+
const coreFullTrajectories = allTrajectories.filter((t) => t.preset !== "lite");
|
|
781
|
+
for (const t of coreFullTrajectories) {
|
|
782
|
+
const se = t.phases.find((p) => p.phase === "self-eval");
|
|
783
|
+
expect(se?.success).toBe(true);
|
|
784
|
+
}
|
|
785
|
+
});
|
|
786
|
+
});
|
|
787
|
+
describe("Token surface area reduction", () => {
|
|
788
|
+
it("lite reduces tool count and estimated token overhead vs full", () => {
|
|
789
|
+
const liteT = allTrajectories.find((t) => t.preset === "lite");
|
|
790
|
+
const fullT = allTrajectories.find((t) => t.preset === "full");
|
|
791
|
+
expect(liteT.toolCount).toBeLessThan(fullT.toolCount);
|
|
792
|
+
expect(liteT.estimatedSchemaTokens).toBeLessThan(fullT.estimatedSchemaTokens);
|
|
793
|
+
const reduction = 1 - liteT.toolCount / fullT.toolCount;
|
|
794
|
+
expect(reduction).toBeGreaterThan(0.5); // lite is at least 50% fewer tools
|
|
795
|
+
});
|
|
796
|
+
it("core is between lite and full", () => {
|
|
797
|
+
const liteT = allTrajectories.find((t) => t.preset === "lite");
|
|
798
|
+
const coreT = allTrajectories.find((t) => t.preset === "core");
|
|
799
|
+
const fullT = allTrajectories.find((t) => t.preset === "full");
|
|
800
|
+
expect(coreT.toolCount).toBeGreaterThan(liteT.toolCount);
|
|
801
|
+
expect(coreT.toolCount).toBeLessThan(fullT.toolCount);
|
|
802
|
+
});
|
|
803
|
+
});
|
|
804
|
+
});
|
|
805
|
+
// ═══════════════════════════════════════════════════════════════════════════
|
|
806
|
+
// TRAJECTORY COMPARISON REPORT
|
|
807
|
+
// ═══════════════════════════════════════════════════════════════════════════
|
|
808
|
+
describe("Toolset Gating Report", () => {
|
|
809
|
+
it("generates trajectory comparison across presets", () => {
|
|
810
|
+
expect(allTrajectories.length).toBe(27); // 3 presets × 9 scenarios
|
|
811
|
+
console.log("\n");
|
|
812
|
+
console.log("╔══════════════════════════════════════════════════════════════════════════════╗");
|
|
813
|
+
console.log("║ TOOLSET GATING EVAL — Trajectory Comparison ║");
|
|
814
|
+
console.log("║ 3 presets × 9 diverse scenarios = 27 trajectories ║");
|
|
815
|
+
console.log("╚══════════════════════════════════════════════════════════════════════════════╝");
|
|
816
|
+
console.log("");
|
|
817
|
+
// ─── SECTION 1: TOOL COUNT & TOKEN OVERHEAD ───
|
|
818
|
+
console.log("┌──────────────────────────────────────────────────────────────────────────────┐");
|
|
819
|
+
console.log("│ 1. TOOL COUNT & ESTIMATED TOKEN OVERHEAD │");
|
|
820
|
+
console.log("├──────────────────────────────────────────────────────────────────────────────┤");
|
|
821
|
+
for (const preset of ["lite", "core", "full"]) {
|
|
822
|
+
const t = allTrajectories.find((tr) => tr.preset === preset);
|
|
823
|
+
const bar = "█".repeat(Math.round(t.toolCount / 3));
|
|
824
|
+
console.log(`│ ${preset.padEnd(6)} ${String(t.toolCount).padStart(3)} tools ~${String(t.estimatedSchemaTokens).padStart(5)} tokens ${bar}`.padEnd(79) + "│");
|
|
825
|
+
}
|
|
826
|
+
const liteT = allTrajectories.find((t) => t.preset === "lite");
|
|
827
|
+
const fullT = allTrajectories.find((t) => t.preset === "full");
|
|
828
|
+
const savings = Math.round((1 - liteT.estimatedSchemaTokens / fullT.estimatedSchemaTokens) * 100);
|
|
829
|
+
console.log("│ │");
|
|
830
|
+
console.log(`│ lite saves ~${savings}% token overhead vs full (${fullT.estimatedSchemaTokens - liteT.estimatedSchemaTokens} fewer tokens/call)`.padEnd(79) + "│");
|
|
831
|
+
console.log("└──────────────────────────────────────────────────────────────────────────────┘");
|
|
832
|
+
console.log("");
|
|
833
|
+
// ─── SECTION 2: PHASE COMPLETION MATRIX ───
|
|
834
|
+
console.log("┌──────────────────────────────────────────────────────────────────────────────┐");
|
|
835
|
+
console.log("│ 2. PHASE COMPLETION MATRIX │");
|
|
836
|
+
console.log("├──────────────────────────────────────────────────────────────────────────────┤");
|
|
837
|
+
console.log("│ Phase lite core full │");
|
|
838
|
+
console.log("│ ───────────────── ────── ────── ────── │");
|
|
839
|
+
const allPhaseNames = ["meta", "recon", "risk", "verification", "eval", "quality-gate", "knowledge", "flywheel", "parallel", "self-eval"];
|
|
840
|
+
for (const phase of allPhaseNames) {
|
|
841
|
+
const cols = [];
|
|
842
|
+
for (const preset of ["lite", "core", "full"]) {
|
|
843
|
+
const trajectories = allTrajectories.filter((t) => t.preset === preset);
|
|
844
|
+
const phaseResults = trajectories.map((t) => t.phases.find((p) => p.phase === phase));
|
|
845
|
+
const present = phaseResults.some((p) => p);
|
|
846
|
+
if (!present) {
|
|
847
|
+
cols.push(" -- ");
|
|
848
|
+
}
|
|
849
|
+
else {
|
|
850
|
+
const allSuccess = phaseResults.every((p) => p?.success);
|
|
851
|
+
const anySuccess = phaseResults.some((p) => p?.success);
|
|
852
|
+
cols.push(allSuccess ? " OK " : anySuccess ? " PART " : " MISS ");
|
|
853
|
+
}
|
|
854
|
+
}
|
|
855
|
+
console.log(`│ ${phase.padEnd(19)}${cols.join(" ")}`.padEnd(79) + "│");
|
|
856
|
+
}
|
|
857
|
+
console.log("└──────────────────────────────────────────────────────────────────────────────┘");
|
|
858
|
+
console.log("");
|
|
859
|
+
// ─── SECTION 3: IMPACT COMPARISON ───
|
|
860
|
+
console.log("┌──────────────────────────────────────────────────────────────────────────────┐");
|
|
861
|
+
console.log("│ 3. CONCRETE IMPACT PER PRESET (aggregated across 9 scenarios) │");
|
|
862
|
+
console.log("├──────────────────────────────────────────────────────────────────────────────┤");
|
|
863
|
+
console.log("│ Metric lite core full │");
|
|
864
|
+
console.log("│ ───────────────────────────── ────── ────── ────── │");
|
|
865
|
+
for (const metric of [
|
|
866
|
+
{ label: "Issues detected", key: "issuesDetected" },
|
|
867
|
+
{ label: "Recon findings", key: "reconFindings" },
|
|
868
|
+
{ label: "Eval cases created", key: "evalCases" },
|
|
869
|
+
{ label: "Gate rules enforced", key: "gateRules" },
|
|
870
|
+
{ label: "Total tool calls", key: "totalToolCalls" },
|
|
871
|
+
]) {
|
|
872
|
+
const cols = [];
|
|
873
|
+
for (const preset of ["lite", "core", "full"]) {
|
|
874
|
+
const sum = allTrajectories
|
|
875
|
+
.filter((t) => t.preset === preset)
|
|
876
|
+
.reduce((s, t) => s + t[metric.key], 0);
|
|
877
|
+
cols.push(String(sum).padStart(4));
|
|
878
|
+
}
|
|
879
|
+
console.log(`│ ${metric.label.padEnd(30)}${cols.map((c) => c.padEnd(8)).join("")}`.padEnd(79) + "│");
|
|
880
|
+
}
|
|
881
|
+
// Boolean metrics
|
|
882
|
+
for (const metric of [
|
|
883
|
+
{ label: "Risk assessed", fn: (t) => t.riskAssessed },
|
|
884
|
+
{ label: "Learning recorded", fn: (t) => t.learningRecorded },
|
|
885
|
+
{ label: "Flywheel complete", fn: (t) => t.flywheelComplete },
|
|
886
|
+
]) {
|
|
887
|
+
const cols = [];
|
|
888
|
+
for (const preset of ["lite", "core", "full"]) {
|
|
889
|
+
const count = allTrajectories
|
|
890
|
+
.filter((t) => t.preset === preset)
|
|
891
|
+
.filter(metric.fn).length;
|
|
892
|
+
cols.push(`${count}/${SCENARIOS.length}`);
|
|
893
|
+
}
|
|
894
|
+
console.log(`│ ${metric.label.padEnd(30)}${cols.map((c) => c.padStart(4).padEnd(8)).join("")}`.padEnd(79) + "│");
|
|
895
|
+
}
|
|
896
|
+
console.log("└──────────────────────────────────────────────────────────────────────────────┘");
|
|
897
|
+
console.log("");
|
|
898
|
+
// ─── SECTION 4: MISSING TOOLS LOG ───
|
|
899
|
+
console.log("┌──────────────────────────────────────────────────────────────────────────────┐");
|
|
900
|
+
console.log("│ 4. TOOLS MISSING BY PRESET (what you lose with gating) │");
|
|
901
|
+
console.log("├──────────────────────────────────────────────────────────────────────────────┤");
|
|
902
|
+
for (const preset of ["lite", "core"]) {
|
|
903
|
+
const missingCalls = callLog.filter((c) => c.preset === preset && c.status === "missing");
|
|
904
|
+
const uniqueMissing = [...new Set(missingCalls.map((c) => c.tool))];
|
|
905
|
+
if (uniqueMissing.length > 0) {
|
|
906
|
+
console.log(`│ ${preset.toUpperCase()}: missing ${uniqueMissing.length} tools`.padEnd(79) + "│");
|
|
907
|
+
for (const tool of uniqueMissing) {
|
|
908
|
+
const phases = [...new Set(missingCalls.filter((c) => c.tool === tool).map((c) => c.phase))];
|
|
909
|
+
console.log(`│ ${tool.padEnd(28)} (needed in: ${phases.join(", ")})`.padEnd(79) + "│");
|
|
910
|
+
}
|
|
911
|
+
console.log("│ │");
|
|
912
|
+
}
|
|
913
|
+
}
|
|
914
|
+
console.log(`│ FULL: 0 missing tools (all ${fullT.toolCount} available)`.padEnd(79) + "│");
|
|
915
|
+
console.log("└──────────────────────────────────────────────────────────────────────────────┘");
|
|
916
|
+
console.log("");
|
|
917
|
+
// ─── SECTION 5: CATEGORY BREAKDOWN ───
|
|
918
|
+
console.log("┌──────────────────────────────────────────────────────────────────────────────┐");
|
|
919
|
+
console.log("│ 5. IMPACT BY SCENARIO CATEGORY │");
|
|
920
|
+
console.log("├──────────────────────────────────────────────────────────────────────────────┤");
|
|
921
|
+
console.log("│ Category Scenarios lite% core% full% Key delta │");
|
|
922
|
+
console.log("│ ────────────── ───────── ───── ───── ───── ────────────────────── │");
|
|
923
|
+
const categories = [...new Set(SCENARIOS.map((s) => s.category))];
|
|
924
|
+
for (const cat of categories) {
|
|
925
|
+
const catScenarios = SCENARIOS.filter((s) => s.category === cat);
|
|
926
|
+
const catIds = new Set(catScenarios.map((s) => s.id));
|
|
927
|
+
const count = catScenarios.length;
|
|
928
|
+
const pctFor = (preset) => {
|
|
929
|
+
const ts = allTrajectories.filter((t) => t.preset === preset && catIds.has(t.scenarioId));
|
|
930
|
+
const completed = ts.reduce((s, t) => s + t.phasesCompleted, 0);
|
|
931
|
+
const total = ts.reduce((s, t) => s + t.phasesCompleted + t.phasesSkipped, 0);
|
|
932
|
+
return total > 0 ? Math.round(completed / total * 100) : 0;
|
|
933
|
+
};
|
|
934
|
+
const litePct = pctFor("lite");
|
|
935
|
+
const corePct = pctFor("core");
|
|
936
|
+
const fullPct = pctFor("full");
|
|
937
|
+
let delta = "";
|
|
938
|
+
if (litePct === corePct && corePct === fullPct)
|
|
939
|
+
delta = "no difference";
|
|
940
|
+
else if (litePct === corePct)
|
|
941
|
+
delta = "parallel only";
|
|
942
|
+
else if (corePct === fullPct)
|
|
943
|
+
delta = "lite loses risk+flywheel";
|
|
944
|
+
else
|
|
945
|
+
delta = `lite ${litePct}% → full ${fullPct}%`;
|
|
946
|
+
console.log(`│ ${cat.padEnd(15)}${String(count).padStart(5)} ${String(litePct).padStart(3)}% ${String(corePct).padStart(3)}% ${String(fullPct).padStart(3)}% ${delta}`.padEnd(79) + "│");
|
|
947
|
+
}
|
|
948
|
+
console.log("└──────────────────────────────────────────────────────────────────────────────┘");
|
|
949
|
+
console.log("");
|
|
950
|
+
// ─── SECTION 6: PER-SCENARIO DETAIL ───
|
|
951
|
+
console.log("┌──────────────────────────────────────────────────────────────────────────────┐");
|
|
952
|
+
console.log("│ 6. PER-SCENARIO TRAJECTORY DETAIL │");
|
|
953
|
+
console.log("├──────────────────────────────────────────────────────────────────────────────┤");
|
|
954
|
+
console.log("│ Scenario Cat Cplx lite core full Issues Calls │");
|
|
955
|
+
console.log("│ ────────────────────────── ───────── ──── ──── ──── ──── ────── ───── │");
|
|
956
|
+
for (const s of SCENARIOS) {
|
|
957
|
+
const liteTr = allTrajectories.find((t) => t.preset === "lite" && t.scenarioId === s.id);
|
|
958
|
+
const coreTr = allTrajectories.find((t) => t.preset === "core" && t.scenarioId === s.id);
|
|
959
|
+
const fullTr = allTrajectories.find((t) => t.preset === "full" && t.scenarioId === s.id);
|
|
960
|
+
const lp = `${liteTr.phasesCompleted}/${liteTr.phasesCompleted + liteTr.phasesSkipped}`;
|
|
961
|
+
const cp = `${coreTr.phasesCompleted}/${coreTr.phasesCompleted + coreTr.phasesSkipped}`;
|
|
962
|
+
const fp = `${fullTr.phasesCompleted}/${fullTr.phasesCompleted + fullTr.phasesSkipped}`;
|
|
963
|
+
console.log(`│ ${s.id.slice(0, 26).padEnd(27)}${s.category.slice(0, 9).padEnd(10)}${s.complexity.slice(0, 3).toUpperCase().padEnd(5)}${lp.padEnd(5)}${cp.padEnd(5)}${fp.padEnd(6)}${String(fullTr.issuesDetected).padStart(4)} ${String(fullTr.totalToolCalls).padStart(5)}`.padEnd(79) + "│");
|
|
964
|
+
}
|
|
965
|
+
console.log("└──────────────────────────────────────────────────────────────────────────────┘");
|
|
966
|
+
console.log("");
|
|
967
|
+
// ─── SECTION 7: TOOL COVERAGE ───
|
|
968
|
+
console.log("┌──────────────────────────────────────────────────────────────────────────────┐");
|
|
969
|
+
console.log("│ 7. UNIQUE TOOLS EXERCISED PER PRESET │");
|
|
970
|
+
console.log("├──────────────────────────────────────────────────────────────────────────────┤");
|
|
971
|
+
for (const preset of ["lite", "core", "full"]) {
|
|
972
|
+
const successCalls = callLog.filter((c) => c.preset === preset && c.status === "success");
|
|
973
|
+
const uniqueTools = [...new Set(successCalls.map((c) => c.tool))];
|
|
974
|
+
const availableTools = buildToolset(preset).length;
|
|
975
|
+
const pct = Math.round(uniqueTools.length / availableTools * 100);
|
|
976
|
+
console.log(`│ ${preset.padEnd(6)} ${String(uniqueTools.length).padStart(3)} / ${String(availableTools).padStart(3)} tools exercised (${pct}%)`.padEnd(79) + "│");
|
|
977
|
+
}
|
|
978
|
+
const allSuccessCalls = callLog.filter((c) => c.status === "success");
|
|
979
|
+
const totalUnique = [...new Set(allSuccessCalls.map((c) => c.tool))];
|
|
980
|
+
console.log("│ │");
|
|
981
|
+
console.log(`│ Total unique tools exercised across all presets: ${totalUnique.length}`.padEnd(79) + "│");
|
|
982
|
+
console.log("└──────────────────────────────────────────────────────────────────────────────┘");
|
|
983
|
+
console.log("");
|
|
984
|
+
// ─── VERDICT ───
|
|
985
|
+
console.log("╔══════════════════════════════════════════════════════════════════════════════╗");
|
|
986
|
+
console.log("║ VERDICT ║");
|
|
987
|
+
console.log("╠══════════════════════════════════════════════════════════════════════════════╣");
|
|
988
|
+
console.log("║ ║");
|
|
989
|
+
const liteCompleted = allTrajectories.filter((t) => t.preset === "lite").reduce((s, t) => s + t.phasesCompleted, 0);
|
|
990
|
+
const liteTotal = allTrajectories.filter((t) => t.preset === "lite").reduce((s, t) => s + t.phasesCompleted + t.phasesSkipped, 0);
|
|
991
|
+
const coreCompleted = allTrajectories.filter((t) => t.preset === "core").reduce((s, t) => s + t.phasesCompleted, 0);
|
|
992
|
+
const coreTotal = allTrajectories.filter((t) => t.preset === "core").reduce((s, t) => s + t.phasesCompleted + t.phasesSkipped, 0);
|
|
993
|
+
const fullCompleted = allTrajectories.filter((t) => t.preset === "full").reduce((s, t) => s + t.phasesCompleted, 0);
|
|
994
|
+
const fullTotal = allTrajectories.filter((t) => t.preset === "full").reduce((s, t) => s + t.phasesCompleted + t.phasesSkipped, 0);
|
|
995
|
+
console.log(`║ lite: ${liteCompleted}/${liteTotal} phases (${Math.round(liteCompleted / liteTotal * 100)}%) — ${savings}% fewer tokens, loses flywheel + parallel`.padEnd(79) + "║");
|
|
996
|
+
console.log(`║ core: ${coreCompleted}/${coreTotal} phases (${Math.round(coreCompleted / coreTotal * 100)}%) — full methodology loop, no parallel/vision/web`.padEnd(79) + "║");
|
|
997
|
+
console.log(`║ full: ${fullCompleted}/${fullTotal} phases (${Math.round(fullCompleted / fullTotal * 100)}%) — everything`.padEnd(79) + "║");
|
|
998
|
+
console.log("║ ║");
|
|
999
|
+
console.log("║ Recommendation: ║");
|
|
1000
|
+
console.log("║ Solo dev, standard tasks → --preset lite (fast, low token overhead) ║");
|
|
1001
|
+
console.log("║ Team with methodology needs → --preset core (full flywheel loop) ║");
|
|
1002
|
+
console.log("║ Multi-agent / full pipeline → --preset full (parallel + self-eval) ║");
|
|
1003
|
+
console.log("║ ║");
|
|
1004
|
+
console.log("╚══════════════════════════════════════════════════════════════════════════════╝");
|
|
1005
|
+
// ─── ASSERTIONS ───
|
|
1006
|
+
// All presets complete the core 6 phases (meta, recon, risk, verification, eval, quality-gate)
|
|
1007
|
+
for (const preset of ["lite", "core", "full"]) {
|
|
1008
|
+
const trajectories = allTrajectories.filter((t) => t.preset === preset);
|
|
1009
|
+
for (const t of trajectories) {
|
|
1010
|
+
expect(t.phases.find((p) => p.phase === "meta")?.success).toBe(true);
|
|
1011
|
+
expect(t.phases.find((p) => p.phase === "recon")?.success).toBe(true);
|
|
1012
|
+
expect(t.phases.find((p) => p.phase === "verification")?.success).toBe(true);
|
|
1013
|
+
expect(t.phases.find((p) => p.phase === "eval")?.success).toBe(true);
|
|
1014
|
+
expect(t.phases.find((p) => p.phase === "quality-gate")?.success).toBe(true);
|
|
1015
|
+
expect(t.phases.find((p) => p.phase === "knowledge")?.success).toBe(true);
|
|
1016
|
+
}
|
|
1017
|
+
}
|
|
1018
|
+
// lite and core detect the same number of issues as full (core methodology is intact)
|
|
1019
|
+
for (const preset of ["lite", "core", "full"]) {
|
|
1020
|
+
const totalIssues = allTrajectories
|
|
1021
|
+
.filter((t) => t.preset === preset)
|
|
1022
|
+
.reduce((s, t) => s + t.issuesDetected, 0);
|
|
1023
|
+
expect(totalIssues).toBeGreaterThanOrEqual(10); // at least 10 across 9 scenarios
|
|
1024
|
+
}
|
|
1025
|
+
// Full preset has more tool calls (parallel + self-eval phases add calls)
|
|
1026
|
+
const fullCalls = allTrajectories.filter((t) => t.preset === "full").reduce((s, t) => s + t.totalToolCalls, 0);
|
|
1027
|
+
const liteCalls = allTrajectories.filter((t) => t.preset === "lite").reduce((s, t) => s + t.totalToolCalls, 0);
|
|
1028
|
+
expect(fullCalls).toBeGreaterThan(liteCalls);
|
|
1029
|
+
});
|
|
1030
|
+
});
|
|
1031
|
+
//# sourceMappingURL=toolsetGatingEval.test.js.map
|