nodebench-mcp 2.67.0 → 2.69.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/dist/benchmarks/chainEval.d.ts +21 -0
- package/dist/benchmarks/chainEval.js +683 -0
- package/dist/benchmarks/chainEval.js.map +1 -0
- package/dist/benchmarks/llmJudgeEval.js +90 -7
- package/dist/benchmarks/llmJudgeEval.js.map +1 -1
- package/dist/benchmarks/pipelineEval.d.ts +63 -0
- package/dist/benchmarks/pipelineEval.js +1035 -0
- package/dist/benchmarks/pipelineEval.js.map +1 -0
- package/dist/benchmarks/searchQualityEval.js +4 -4
- package/dist/benchmarks/searchQualityEval.js.map +1 -1
- package/dist/tools/founderTools.js +2 -1
- package/dist/tools/founderTools.js.map +1 -1
- package/package.json +1 -1
|
@@ -0,0 +1,1035 @@
|
|
|
1
|
+
#!/usr/bin/env npx tsx
|
|
2
|
+
/**
|
|
3
|
+
* pipelineEval.ts — Multi-step pipeline eval harness for NodeBench MCP
|
|
4
|
+
*
|
|
5
|
+
* Unlike llmJudgeEval.ts which tests tools independently, this harness tests
|
|
6
|
+
* realistic multi-step chains where Tool A's output feeds Tool B's input.
|
|
7
|
+
*
|
|
8
|
+
* Architecture:
|
|
9
|
+
* 1. Pipeline Definitions — 6 canonical pipelines modeling real agent workflows
|
|
10
|
+
* 2. Chaining Engine — executes steps sequentially, extracts fields from output
|
|
11
|
+
* 3. LLM Judge — Gemini Flash Lite evaluates 5 boolean criteria on full trace
|
|
12
|
+
* 4. Per-step tracking — tool name, args, output size, duration, pass/fail
|
|
13
|
+
* 5. SQLite persistence — pipeline_eval_runs + pipeline_eval_steps tables
|
|
14
|
+
*
|
|
15
|
+
* Usage:
|
|
16
|
+
* cd packages/mcp-local
|
|
17
|
+
* npx tsx src/benchmarks/pipelineEval.ts [--pipeline NAME] [--all]
|
|
18
|
+
*/
|
|
19
|
+
import { getDb, genId } from "../db.js";
|
|
20
|
+
import { _setDbAccessor } from "../tools/toolRegistry.js";
|
|
21
|
+
import { loadToolsets, ALL_DOMAIN_KEYS } from "../toolsetRegistry.js";
|
|
22
|
+
// ══════════════════════════════════════════════════════════════════════════════
|
|
23
|
+
// SCHEMA
|
|
24
|
+
// ══════════════════════════════════════════════════════════════════════════════
|
|
25
|
+
const PIPELINE_EVAL_SCHEMA = `
|
|
26
|
+
CREATE TABLE IF NOT EXISTS pipeline_eval_runs (
|
|
27
|
+
run_id TEXT PRIMARY KEY,
|
|
28
|
+
pipeline_name TEXT NOT NULL,
|
|
29
|
+
timestamp TEXT NOT NULL DEFAULT (datetime('now')),
|
|
30
|
+
step_count INTEGER NOT NULL DEFAULT 0,
|
|
31
|
+
steps_passed INTEGER NOT NULL DEFAULT 0,
|
|
32
|
+
overall_pass INTEGER NOT NULL DEFAULT 0,
|
|
33
|
+
criteria_json TEXT,
|
|
34
|
+
total_ms INTEGER NOT NULL DEFAULT 0
|
|
35
|
+
);
|
|
36
|
+
|
|
37
|
+
CREATE TABLE IF NOT EXISTS pipeline_eval_steps (
|
|
38
|
+
id TEXT PRIMARY KEY,
|
|
39
|
+
run_id TEXT NOT NULL,
|
|
40
|
+
step_index INTEGER NOT NULL,
|
|
41
|
+
tool TEXT NOT NULL,
|
|
42
|
+
description TEXT,
|
|
43
|
+
args_json TEXT,
|
|
44
|
+
ok INTEGER NOT NULL DEFAULT 0,
|
|
45
|
+
output_size INTEGER NOT NULL DEFAULT 0,
|
|
46
|
+
output_preview TEXT,
|
|
47
|
+
error TEXT,
|
|
48
|
+
ms INTEGER NOT NULL DEFAULT 0
|
|
49
|
+
);
|
|
50
|
+
|
|
51
|
+
CREATE INDEX IF NOT EXISTS idx_pipeline_eval_steps_run ON pipeline_eval_steps(run_id);
|
|
52
|
+
`;
|
|
53
|
+
/** Create the pipeline-eval tables and index if they don't exist (idempotent). */
function ensureSchema() {
    const db = getDb();
    db.exec(PIPELINE_EVAL_SCHEMA);
}
|
|
57
|
+
// ══════════════════════════════════════════════════════════════════════════════
|
|
58
|
+
// TEST DATA SEEDING — populate SQLite before pipeline execution
|
|
59
|
+
// ══════════════════════════════════════════════════════════════════════════════
|
|
60
|
+
// Prefix stamped on every seeded row's key column so cleanupPipelineData can
// target only fixture data (via LIKE 'pipe_eval_%') and leave real rows alone.
const SEED_PREFIX = "pipe_eval_";
|
|
61
|
+
/**
 * Create the application tables the pipeline tools read/write (causal events,
 * important changes, founder packets, tracking actions/milestones, intents,
 * session summaries, recon sessions) when they are absent. Idempotent via
 * IF NOT EXISTS, so it is safe to call before every eval run.
 *
 * NOTE(review): these CREATEs mirror what the real tools are assumed to expect;
 * in particular recon_sessions here uses sessionId/scope/createdAt — confirm
 * this matches the schema run_recon itself creates.
 */
function ensureToolSchemas() {
    const db = getDb();
    db.exec(`
  CREATE TABLE IF NOT EXISTS causal_events (
    id INTEGER PRIMARY KEY AUTOINCREMENT,
    userId TEXT NOT NULL,
    eventType TEXT NOT NULL,
    payload TEXT,
    createdAt TEXT NOT NULL DEFAULT (datetime('now'))
  );
  CREATE TABLE IF NOT EXISTS causal_important_changes (
    id INTEGER PRIMARY KEY AUTOINCREMENT,
    changeId TEXT UNIQUE NOT NULL,
    changeCategory TEXT NOT NULL,
    impactScore REAL NOT NULL DEFAULT 0,
    impactReason TEXT,
    affectedEntities TEXT,
    suggestedAction TEXT,
    status TEXT NOT NULL DEFAULT 'detected',
    timestampMs INTEGER,
    createdAt TEXT NOT NULL DEFAULT (datetime('now'))
  );
  CREATE TABLE IF NOT EXISTS founder_packets (
    id INTEGER PRIMARY KEY AUTOINCREMENT,
    packetId TEXT UNIQUE NOT NULL,
    sessionId TEXT,
    packetType TEXT NOT NULL,
    role TEXT NOT NULL DEFAULT 'founder',
    content TEXT NOT NULL,
    metadata TEXT,
    createdAt TEXT NOT NULL DEFAULT (datetime('now'))
  );
  CREATE TABLE IF NOT EXISTS tracking_actions (
    id INTEGER PRIMARY KEY AUTOINCREMENT,
    actionId TEXT UNIQUE NOT NULL,
    sessionId TEXT NOT NULL,
    timestamp TEXT NOT NULL,
    action TEXT NOT NULL,
    category TEXT NOT NULL,
    beforeState TEXT,
    afterState TEXT,
    reasoning TEXT,
    filesChanged TEXT,
    impactLevel TEXT NOT NULL,
    dayOfWeek TEXT NOT NULL,
    weekNumber INTEGER NOT NULL,
    month TEXT NOT NULL,
    quarter TEXT NOT NULL,
    year INTEGER NOT NULL
  );
  CREATE TABLE IF NOT EXISTS tracking_milestones (
    id INTEGER PRIMARY KEY AUTOINCREMENT,
    milestoneId TEXT UNIQUE NOT NULL,
    title TEXT NOT NULL,
    category TEXT NOT NULL,
    description TEXT,
    evidence TEXT,
    createdAt TEXT NOT NULL DEFAULT (datetime('now'))
  );
  CREATE TABLE IF NOT EXISTS intent_residuals (
    id INTEGER PRIMARY KEY AUTOINCREMENT,
    intentId TEXT UNIQUE NOT NULL,
    intent TEXT NOT NULL,
    status TEXT NOT NULL DEFAULT 'active',
    context TEXT,
    createdAt TEXT NOT NULL,
    updatedAt TEXT NOT NULL
  );
  CREATE TABLE IF NOT EXISTS session_summaries (
    id INTEGER PRIMARY KEY AUTOINCREMENT,
    sessionId TEXT UNIQUE NOT NULL,
    summary TEXT NOT NULL,
    toolCount INTEGER NOT NULL DEFAULT 0,
    entityCount INTEGER NOT NULL DEFAULT 0,
    createdAt TEXT NOT NULL DEFAULT (datetime('now'))
  );
  CREATE TABLE IF NOT EXISTS recon_sessions (
    id INTEGER PRIMARY KEY AUTOINCREMENT,
    sessionId TEXT UNIQUE NOT NULL,
    target TEXT NOT NULL,
    scope TEXT NOT NULL DEFAULT 'quick',
    status TEXT NOT NULL DEFAULT 'active',
    findings TEXT,
    metadata TEXT,
    createdAt TEXT NOT NULL DEFAULT (datetime('now'))
  );
  `);
}
|
|
149
|
+
/**
 * Seed deterministic fixture rows (3 causal events, 3 important changes, and a
 * recon session) so the pipelines have data to read. Every seeded row carries
 * SEED_PREFIX in its key column so cleanupPipelineData can remove it later.
 * Idempotent: all inserts use INSERT OR IGNORE.
 */
function seedPipelineData() {
    const db = getDb();
    ensureToolSchemas();
    const iso = new Date().toISOString();
    const dayAgo = Date.now() - 86_400_000;
    // Seed events
    const events = [
        { userId: `${SEED_PREFIX}user`, eventType: "packet.generated", payload: JSON.stringify({ summary: "Weekly founder briefing generated", entityId: "nodebench" }), createdAt: iso },
        { userId: `${SEED_PREFIX}user`, eventType: "recon.completed", payload: JSON.stringify({ target: "Anthropic", findings: 5 }), createdAt: iso },
        { userId: `${SEED_PREFIX}user`, eventType: "strategy.review", payload: JSON.stringify({ topic: "MCP distribution", status: "active" }), createdAt: iso },
    ];
    const evtStmt = db.prepare(`INSERT OR IGNORE INTO causal_events (userId, eventType, payload, createdAt) VALUES (?, ?, ?, ?)`);
    for (const e of events)
        evtStmt.run(e.userId, e.eventType, e.payload, e.createdAt);
    // Seed important changes
    const changes = [
        { changeId: `${SEED_PREFIX}chg_001`, changeCategory: "competitive", impactScore: 0.85, impactReason: "Anthropic launched Model Context Protocol marketplace with 200+ servers", affectedEntities: JSON.stringify(["anthropic", "nodebench"]), suggestedAction: "Evaluate marketplace listing for NodeBench", status: "detected", timestampMs: dayAgo },
        { changeId: `${SEED_PREFIX}chg_002`, changeCategory: "product", impactScore: 0.72, impactReason: "Supermemory raised $4M seed for memory-layer MCP server", affectedEntities: JSON.stringify(["supermemory"]), suggestedAction: "Analyze differentiation vs Supermemory approach", status: "detected", timestampMs: dayAgo + 3600_000 },
        { changeId: `${SEED_PREFIX}chg_003`, changeCategory: "strategy", impactScore: 0.68, impactReason: "NodeBench eval pass rate reached 100% — ready for distribution push", affectedEntities: JSON.stringify(["nodebench"]), suggestedAction: "Publish to npm and MCP registry", status: "detected", timestampMs: dayAgo + 7200_000 },
    ];
    const chgStmt = db.prepare(`INSERT OR IGNORE INTO causal_important_changes (changeId, changeCategory, impactScore, impactReason, affectedEntities, suggestedAction, status, timestampMs) VALUES (?, ?, ?, ?, ?, ?, ?, ?)`);
    for (const c of changes)
        chgStmt.run(c.changeId, c.changeCategory, c.impactScore, c.impactReason, c.affectedEntities, c.suggestedAction, c.status, c.timestampMs);
    // Seed a recon session for the competitor_brief pipeline.
    // FIX: column names must match the recon_sessions table created by
    // ensureToolSchemas above (sessionId/scope/metadata/createdAt). The previous
    // insert targeted non-existent columns (id/description/created_at), so the
    // prepare threw, the catch swallowed it, and no recon row was ever seeded.
    let reconSeeded = 0;
    try {
        db.prepare(`INSERT OR IGNORE INTO recon_sessions (sessionId, target, scope, status, metadata, createdAt) VALUES (?, ?, ?, ?, ?, ?)`)
            .run(`${SEED_PREFIX}recon_supermemory`, "Supermemory", "full", "completed", JSON.stringify({ description: "Memory-layer MCP server competitor analysis" }), iso);
        reconSeeded = 1;
        // Also seed a finding for this session. recon_findings is created by
        // run_recon itself, so this insert may legitimately fail on a fresh DB.
        db.prepare(`INSERT OR IGNORE INTO recon_findings (id, session_id, category, finding, severity, source, created_at) VALUES (?, ?, ?, ?, ?, ?, ?)`)
            .run(`${SEED_PREFIX}finding_001`, `${SEED_PREFIX}recon_supermemory`, "new_feature", "Supermemory raised $4M seed for universal memory infrastructure with MCP distribution", "medium", "web_search", iso);
    }
    catch { /* recon_findings may not exist yet — will be created by run_recon */ }
    // Report what actually landed instead of unconditionally claiming success.
    console.log(`[seedPipelineData] Seeded: 3 events, 3 important_changes, ${reconSeeded} recon session`);
}
|
|
184
|
+
/**
 * Delete every row seeded by seedPipelineData, matched by SEED_PREFIX on each
 * table's seed-key column.
 *
 * FIX: the original wrapped all deletes in ONE try/catch, so the first missing
 * table aborted cleanup of every remaining table. Each table is now cleaned
 * independently, and the LIKE pattern is bound as a parameter instead of being
 * interpolated into the SQL text.
 */
function cleanupPipelineData() {
    const db = getDb();
    const prefix = `${SEED_PREFIX}%`;
    // [table, column holding the seeded key] pairs.
    const targets = [
        ["causal_events", "userId"],
        ["causal_important_changes", "changeId"],
        ["recon_sessions", "sessionId"],
        ["tracking_actions", "sessionId"],
        ["tracking_milestones", "milestoneId"],
        ["intent_residuals", "intentId"],
    ];
    for (const [table, column] of targets) {
        try {
            db.prepare(`DELETE FROM ${table} WHERE ${column} LIKE ?`).run(prefix);
        }
        catch {
            /* table may not exist yet — skip it, keep cleaning the rest */
        }
    }
}
|
|
199
|
+
// ══════════════════════════════════════════════════════════════════════════════
|
|
200
|
+
// OUTPUT EXTRACTION HELPERS
|
|
201
|
+
// ══════════════════════════════════════════════════════════════════════════════
|
|
202
|
+
/**
 * Normalize any tool result — MCP content-block arrays, plain strings, objects,
 * or primitives — into a single display string. Falsy inputs become "(null)".
 */
function extractText(result) {
    if (!result)
        return "(null)";
    if (typeof result === "string")
        return result;
    if (Array.isArray(result)) {
        const pieces = [];
        for (const block of result) {
            if (typeof block === "string") {
                pieces.push(block);
            }
            else if (block?.type === "text") {
                pieces.push(block.text);
            }
            else if (block?.text) {
                pieces.push(block.text);
            }
            else {
                pieces.push(JSON.stringify(block));
            }
        }
        const joined = pieces.filter(Boolean).join("\n");
        return joined || JSON.stringify(result);
    }
    return typeof result === "object" ? JSON.stringify(result) : String(result);
}
|
|
226
|
+
/**
 * Best-effort JSON extraction from a string. Returns the parsed value when the
 * whole text is valid JSON and object-like (object or array); otherwise tries
 * the first {...} span embedded in the text. Returns null when nothing
 * object-like can be parsed (including JSON primitives like "42").
 */
function tryParseJson(text) {
    try {
        const value = JSON.parse(text);
        return typeof value === "object" && value !== null ? value : null;
    }
    catch {
        // Whole string wasn't JSON — look for an embedded object literal.
        const embedded = text.match(/\{[\s\S]*\}/);
        if (!embedded)
            return null;
        try {
            return JSON.parse(embedded[0]);
        }
        catch {
            return null;
        }
    }
}
|
|
245
|
+
/**
 * Walk a dot-separated path (e.g. "a.b.c") through nested objects/arrays.
 * Returns undefined as soon as any hop is missing or non-object.
 */
function extractField(obj, path) {
    if (!obj || typeof obj !== "object")
        return undefined;
    return path.split(".").reduce((node, key) => {
        if (node == null || typeof node !== "object")
            return undefined;
        return node[key];
    }, obj);
}
|
|
258
|
+
// ══════════════════════════════════════════════════════════════════════════════
|
|
259
|
+
// PIPELINE DEFINITIONS — 6 canonical multi-step chains
|
|
260
|
+
// ══════════════════════════════════════════════════════════════════════════════
|
|
261
|
+
// Six canonical multi-step pipelines. Each step has:
//   tool         — registry name resolved at runtime via findTool
//   staticArgs   — fixed arguments merged first
//   dynamicArgs  — { argName: (prevOutput) => value } extractors that chain the
//                  previous step's raw result into this step's arguments
//                  (dynamic values override static ones on key collision)
//   description  — human-readable step intent, shown to the LLM judge
const PIPELINES = [
    // ── Pipeline 1: founder_weekly_reset ──────────────────────────────
    {
        name: "founder_weekly_reset",
        description: "Full weekly reset: discover → load → synthesize → track → milestone",
        steps: [
            {
                tool: "discover_tools",
                staticArgs: { query: "founder weekly reset", limit: 10 },
                description: "Discover tools for founder weekly reset workflow",
            },
            {
                tool: "founder_local_weekly_reset",
                staticArgs: { query: "NodeBench weekly founder reset" },
                description: "Run the weekly reset pipeline",
            },
            {
                tool: "track_action",
                staticArgs: {
                    category: "strategy",
                    action: "weekly_reset",
                    beforeState: "{}",
                },
                dynamicArgs: {
                    // afterState = first 500 chars of the reset output
                    afterState: (prev) => {
                        const text = extractText(prev);
                        return text.slice(0, 500);
                    },
                },
                description: "Track the weekly reset action with before/after state",
            },
            {
                tool: "track_milestone",
                staticArgs: {
                    title: "Weekly reset completed",
                    category: "operations",
                },
                dynamicArgs: {
                    description: (prev) => {
                        const text = extractText(prev);
                        return `Weekly reset tracked: ${text.slice(0, 200)}`;
                    },
                },
                description: "Record a milestone for the completed weekly reset",
            },
        ],
    },
    // ── Pipeline 2: company_intelligence ──────────────────────────────
    {
        name: "company_intelligence",
        description: "Company analysis: discover → synthesize → export → track action",
        steps: [
            {
                tool: "discover_tools",
                staticArgs: { query: "company analysis Anthropic", limit: 10 },
                description: "Discover tools for company analysis",
            },
            {
                tool: "founder_local_synthesize",
                staticArgs: {
                    query: "Analyze Anthropic competitive position",
                    includeWeb: false,
                    packetType: "company_search",
                    daysBack: 7,
                },
                description: "Synthesize Anthropic competitive analysis",
            },
            {
                tool: "export_artifact_packet",
                staticArgs: { format: "markdown", audience: "banker", title: "Anthropic Competitive Analysis" },
                dynamicArgs: {
                    content: (prev) => {
                        const text = extractText(prev);
                        const parsed = tryParseJson(text);
                        // Build a rich content object for the export
                        if (parsed) {
                            const syn = parsed.synthesis;
                            return {
                                summary: syn?.summary ?? parsed.summary ?? text.slice(0, 500),
                                keyFindings: syn?.keyFindings ?? parsed.keyFindings ?? [],
                                entities: syn?.entities ?? parsed.entities ?? ["Anthropic"],
                                metrics: syn?.metrics ?? parsed.metrics ?? [],
                                risks: syn?.risks ?? parsed.risks ?? [],
                                nextSteps: syn?.nextSteps ?? parsed.nextSteps ?? [],
                            };
                        }
                        // Non-JSON synthesis output: fall back to raw text
                        return { summary: text.slice(0, 2000), entities: ["Anthropic"] };
                    },
                },
                description: "Export the synthesis as a markdown artifact for banker audience",
            },
            {
                tool: "track_action",
                staticArgs: {
                    category: "research",
                    action: "anthropic_analysis_exported",
                },
                dynamicArgs: {
                    afterState: (prev) => extractText(prev).slice(0, 300),
                },
                description: "Track the analysis export action",
            },
        ],
    },
    // ── Pipeline 3: competitor_brief ──────────────────────────────────
    {
        name: "competitor_brief",
        description: "Competitor research: discover → recon → synthesize → export → track",
        steps: [
            {
                tool: "discover_tools",
                staticArgs: { query: "competitor analysis", limit: 10 },
                description: "Discover tools for competitor analysis",
            },
            {
                tool: "run_recon",
                staticArgs: { target: "Supermemory", scope: "full", webEnrich: false },
                description: "Run recon on Supermemory competitor",
            },
            {
                tool: "founder_local_synthesize",
                dynamicArgs: {
                    // Feed recon findings into the synthesis query
                    query: (prev) => {
                        const text = extractText(prev);
                        return `Analyze Supermemory as competitor based on recon findings: ${text.slice(0, 500)}`;
                    },
                },
                staticArgs: { packetType: "competitor_brief", daysBack: 7 },
                description: "Synthesize recon findings into a competitor brief",
            },
            {
                tool: "export_artifact_packet",
                staticArgs: { format: "markdown", audience: "founder" },
                dynamicArgs: {
                    content: (prev) => {
                        const text = extractText(prev);
                        const parsed = tryParseJson(text);
                        if (parsed?.synthesis)
                            return String(parsed.synthesis);
                        return text.slice(0, 2000);
                    },
                },
                description: "Export the competitor brief as a markdown artifact",
            },
            {
                tool: "track_action",
                staticArgs: { category: "research", action: "competitor_brief_exported" },
                dynamicArgs: {
                    afterState: (prev) => extractText(prev).slice(0, 300),
                },
                description: "Track the competitor brief export action",
            },
        ],
    },
    // ── Pipeline 4: pre_delegation ────────────────────────────────────
    {
        name: "pre_delegation",
        description: "Delegation prep: synthesize → export → track intent → track action",
        steps: [
            {
                tool: "founder_local_synthesize",
                staticArgs: {
                    query: "Prepare delegation packet for improving NodeBench suppression quality",
                    packetType: "pre_delegation",
                    daysBack: 7,
                },
                description: "Synthesize a delegation packet for suppression quality improvement",
            },
            {
                tool: "export_artifact_packet",
                staticArgs: { format: "markdown", audience: "teammate" },
                dynamicArgs: {
                    content: (prev) => {
                        const text = extractText(prev);
                        const parsed = tryParseJson(text);
                        if (parsed?.synthesis)
                            return String(parsed.synthesis);
                        if (parsed?.content)
                            return String(parsed.content);
                        return text.slice(0, 2000);
                    },
                },
                description: "Export the delegation packet as markdown for a teammate",
            },
            {
                tool: "track_intent",
                staticArgs: {
                    intent: "Improve suppression quality",
                    status: "active",
                },
                description: "Track the delegation intent as active",
            },
            {
                tool: "track_action",
                staticArgs: {
                    category: "delegation",
                    action: "packet_created",
                },
                dynamicArgs: {
                    afterState: (prev) => {
                        const text = extractText(prev);
                        return text.slice(0, 300);
                    },
                },
                description: "Record the delegation packet creation action",
            },
        ],
    },
    // ── Pipeline 5: important_change_review ───────────────────────────
    {
        name: "important_change_review",
        description: "Change review: get alerts → synthesize → track action → track milestone",
        steps: [
            {
                tool: "get_proactive_alerts",
                staticArgs: { lookbackDays: 7, limit: 10 },
                description: "Get proactive alerts for recent changes",
            },
            {
                tool: "founder_local_synthesize",
                dynamicArgs: {
                    query: (prev) => {
                        const text = extractText(prev);
                        return `Summarize these alerts and recommend next actions: ${text.slice(0, 500)}`;
                    },
                },
                staticArgs: { packetType: "important_change", daysBack: 7 },
                description: "Synthesize a summary of important changes with action recommendations",
            },
            {
                tool: "track_action",
                staticArgs: { category: "strategy", action: "change_review_completed" },
                dynamicArgs: {
                    afterState: (prev) => extractText(prev).slice(0, 300),
                },
                description: "Track the change review action",
            },
            {
                tool: "track_milestone",
                staticArgs: { title: "Important change review completed", category: "strategy" },
                dynamicArgs: {
                    description: (prev) => `Change review: ${extractText(prev).slice(0, 200)}`,
                },
                description: "Record a milestone for the change review",
            },
        ],
    },
    // ── Pipeline 6: session_memory_cycle ──────────────────────────────
    {
        name: "session_memory_cycle",
        description: "Memory lifecycle: track intent → synthesize → summarize → recover → complete intent",
        steps: [
            {
                tool: "track_intent",
                staticArgs: {
                    intent: "Investigate Anthropic valuation",
                    status: "active",
                },
                description: "Track the investigation intent as active",
            },
            {
                tool: "founder_local_synthesize",
                staticArgs: {
                    query: "What is Anthropic's current valuation?",
                    packetType: "company_search",
                    daysBack: 7,
                },
                description: "Synthesize information about Anthropic valuation",
            },
            {
                tool: "summarize_session",
                staticArgs: { sessionId: "pipeline_eval_session" },
                description: "Summarize the current session",
            },
            {
                tool: "get_compaction_recovery",
                staticArgs: {},
                description: "Recover compacted session context",
            },
            {
                tool: "track_intent",
                staticArgs: {
                    intent: "Investigate Anthropic valuation",
                    status: "completed",
                },
                description: "Mark the investigation intent as completed",
            },
        ],
    },
];
|
|
551
|
+
// ══════════════════════════════════════════════════════════════════════════════
|
|
552
|
+
// TOOL EXECUTOR
|
|
553
|
+
// ══════════════════════════════════════════════════════════════════════════════
|
|
554
|
+
/** Resolve a tool by exact registry name; null when the toolsets don't provide it. */
function findTool(tools, name) {
    for (const candidate of tools) {
        if (candidate.name === name)
            return candidate;
    }
    return null;
}
|
|
557
|
+
/**
 * Invoke a tool handler and normalize the outcome. Never throws: failures are
 * captured as { ok: false, error }. Always reports wall-clock duration in ms.
 * @returns {{ok: boolean, result: any, error?: string, ms: number}}
 */
async function callTool(tool, args = {}) {
    const startedAt = Date.now();
    try {
        const result = await tool.handler(args);
        return { ok: true, result, ms: Date.now() - startedAt };
    }
    catch (err) {
        const message = err?.message ?? String(err);
        return { ok: false, result: null, error: message, ms: Date.now() - startedAt };
    }
}
|
|
567
|
+
// ══════════════════════════════════════════════════════════════════════════════
|
|
568
|
+
// PIPELINE RUNNER — sequential execution with output chaining
|
|
569
|
+
// ══════════════════════════════════════════════════════════════════════════════
|
|
570
|
+
/**
 * Execute a pipeline's steps in order, feeding each step's raw result into the
 * next step's dynamicArgs extractors. A missing tool records a failed step but
 * does not abort the run; prevOutput is left untouched in that case so later
 * extractors still chain from the last tool that actually ran.
 * @returns array of per-step records: stepIndex, tool, description, args, ok,
 *          outputSize, outputPreview (first 500 chars), error, ms
 */
async function runPipeline(pipeline, allTools) {
    const stepResults = [];
    let prevOutput = null;
    let index = 0;
    for (const step of pipeline.steps) {
        const tool = findTool(allTools, step.tool);
        if (!tool) {
            stepResults.push({
                stepIndex: index,
                tool: step.tool,
                description: step.description,
                args: step.staticArgs,
                ok: false,
                outputSize: 0,
                outputPreview: "",
                error: `Tool "${step.tool}" not found in loaded toolsets`,
                ms: 0,
            });
            // Continue to next step — don't break the chain entirely
            index++;
            continue;
        }
        // Merge static args with values extracted from the previous output.
        const args = { ...step.staticArgs };
        if (step.dynamicArgs && prevOutput !== null) {
            for (const [key, extractor] of Object.entries(step.dynamicArgs)) {
                try {
                    const value = extractor(prevOutput);
                    if (value !== undefined && value !== null)
                        args[key] = value;
                }
                catch {
                    // Extractor failed — fall back to the static arg (or omit the key)
                }
            }
        }
        const { ok, result, error, ms } = await callTool(tool, args);
        const outputText = ok ? extractText(result) : (error ?? "(error)");
        stepResults.push({
            stepIndex: index,
            tool: step.tool,
            description: step.description,
            args,
            ok,
            outputSize: outputText.length,
            outputPreview: outputText.slice(0, 500),
            error: ok ? undefined : error,
            ms,
        });
        // Pass the raw (untruncated) output forward for chaining
        prevOutput = result;
        index++;
    }
    return stepResults;
}
|
|
624
|
+
// Five weighted boolean criteria scored for every pipeline run. Indexes [1]
// and [4] are deterministic and graded in code (codeGradeCriteria); the
// remaining three are judged by the LLM on the full execution trace.
const PIPELINE_CRITERIA = [
    {
        criterion: "Final output contains structured data derived from earlier pipeline steps",
        weight: 2,
    },
    {
        // Code-graded: every step ok with >10 bytes of output
        criterion: "No step produced only errors or empty results",
        weight: 2,
    },
    {
        criterion: "Output entity names are consistent across pipeline steps",
        weight: 1,
    },
    {
        criterion: "Final artifact is usable without re-running the pipeline",
        weight: 1,
    },
    {
        // Code-graded: step count matches the pipeline definition
        criterion: "Pipeline completed all steps (no step was skipped due to missing input)",
        weight: 2,
    },
];
|
|
646
|
+
/**
 * Deterministically grade the two structural criteria (PIPELINE_CRITERIA[1]
 * "no errors/empty" and [4] "all steps completed") from the step records alone.
 * The remaining criteria are left for the LLM judge.
 * @returns [{criterion, weight, pass, evidence}] — one entry per graded criterion
 */
function codeGradeCriteria(steps, pipeline) {
    const graded = [];
    // Criterion [1]: every step succeeded AND produced >10 bytes of output.
    const failedOrEmpty = steps.filter((s) => !s.ok || s.outputSize <= 10);
    const noneFailed = failedOrEmpty.length === 0;
    graded.push({
        criterion: PIPELINE_CRITERIA[1].criterion,
        weight: PIPELINE_CRITERIA[1].weight,
        pass: noneFailed,
        evidence: noneFailed
            ? `All ${steps.length} steps produced non-empty output`
            : `${failedOrEmpty.length} step(s) failed or empty: ${failedOrEmpty.map((s) => s.tool).join(", ")}`,
    });
    // Criterion [4]: step count matches the definition and each step either
    // succeeded or at least emitted something.
    const allCompleted = steps.length === pipeline.steps.length
        && steps.every((s) => s.outputSize > 0 || s.ok);
    const missingTools = [];
    pipeline.steps.forEach((stepDef, i) => {
        if (!steps[i] || steps[i].error?.includes("not found"))
            missingTools.push(stepDef.tool);
    });
    graded.push({
        criterion: PIPELINE_CRITERIA[4].criterion,
        weight: PIPELINE_CRITERIA[4].weight,
        pass: allCompleted,
        evidence: allCompleted
            ? `All ${pipeline.steps.length} steps executed`
            : `Missing/skipped: ${missingTools.join(", ") || failedOrEmpty.map((s) => s.tool).join(", ")}`,
    });
    return graded;
}
|
|
675
|
+
// ══════════════════════════════════════════════════════════════════════════════
|
|
676
|
+
// LLM JUDGE — Gemini evaluates remaining criteria on full pipeline trace
|
|
677
|
+
// ══════════════════════════════════════════════════════════════════════════════
|
|
678
|
+
// Judge model id for the generativelanguage REST API (a flash-lite preview
// tier, per the name). Change here to swap judge models.
const GEMINI_MODEL = "gemini-3.1-flash-lite-preview";
|
|
679
|
+
async function geminiJudgePipeline(pipeline, steps, codeGraded) {
|
|
680
|
+
const apiKey = process.env.GEMINI_API_KEY;
|
|
681
|
+
if (!apiKey) {
|
|
682
|
+
// No API key — return heuristic-based results for LLM criteria
|
|
683
|
+
return llmCriteriaHeuristic(steps);
|
|
684
|
+
}
|
|
685
|
+
// Build the full trace for the judge
|
|
686
|
+
const traceText = steps
|
|
687
|
+
.map((s) => `Step ${s.stepIndex + 1} [${s.tool}]: ${s.description}\n` +
|
|
688
|
+
` Args: ${JSON.stringify(s.args).slice(0, 300)}\n` +
|
|
689
|
+
` OK: ${s.ok}, Size: ${s.outputSize}b, Time: ${s.ms}ms\n` +
|
|
690
|
+
` Output: ${s.outputPreview}`)
|
|
691
|
+
.join("\n\n");
|
|
692
|
+
// Only ask Gemini about criteria NOT already code-graded
|
|
693
|
+
const codeGradedCriteria = new Set(codeGraded.map((c) => c.criterion));
|
|
694
|
+
const llmCriteria = PIPELINE_CRITERIA.filter((c) => !codeGradedCriteria.has(c.criterion));
|
|
695
|
+
if (llmCriteria.length === 0)
|
|
696
|
+
return [];
|
|
697
|
+
const criteriaList = llmCriteria.map((c, i) => `${i + 1}. ${c.criterion} (weight: ${c.weight})`).join("\n");
|
|
698
|
+
const prompt = `You are an evaluation judge for NodeBench MCP pipeline chains.
|
|
699
|
+
|
|
700
|
+
Pipeline: "${pipeline.name}" — ${pipeline.description}
|
|
701
|
+
|
|
702
|
+
Full execution trace:
|
|
703
|
+
|
|
704
|
+
${traceText.slice(0, 8000)}
|
|
705
|
+
|
|
706
|
+
Evaluate these criteria:
|
|
707
|
+
${criteriaList}
|
|
708
|
+
|
|
709
|
+
RULES:
|
|
710
|
+
- A pipeline step that returns structured JSON or prose analysis is valid output
|
|
711
|
+
- "Derived from earlier steps" means the final output references entities, data, or concepts from previous steps
|
|
712
|
+
- "Entity names consistent" means if step 1 mentions "Anthropic", later steps should too (not switch to a different entity)
|
|
713
|
+
- "Usable without re-running" means the final output has enough context to be read standalone
|
|
714
|
+
|
|
715
|
+
Respond in this exact JSON format (no markdown, no explanation):
|
|
716
|
+
{
|
|
717
|
+
"criteria": [
|
|
718
|
+
{"criterion": "exact criterion text", "pass": true, "evidence": "brief explanation"}
|
|
719
|
+
]
|
|
720
|
+
}`;
|
|
721
|
+
const url = `https://generativelanguage.googleapis.com/v1beta/models/${GEMINI_MODEL}:generateContent`;
|
|
722
|
+
try {
|
|
723
|
+
const resp = await fetch(url, {
|
|
724
|
+
method: "POST",
|
|
725
|
+
headers: {
|
|
726
|
+
"Content-Type": "application/json",
|
|
727
|
+
"x-goog-api-key": apiKey,
|
|
728
|
+
},
|
|
729
|
+
body: JSON.stringify({
|
|
730
|
+
contents: [{ parts: [{ text: prompt }] }],
|
|
731
|
+
generationConfig: { temperature: 0.1, maxOutputTokens: 1024 },
|
|
732
|
+
}),
|
|
733
|
+
signal: AbortSignal.timeout(30_000),
|
|
734
|
+
});
|
|
735
|
+
if (!resp.ok) {
|
|
736
|
+
console.warn(`[gemini] HTTP ${resp.status}: ${resp.statusText}`);
|
|
737
|
+
return llmCriteriaHeuristic(steps);
|
|
738
|
+
}
|
|
739
|
+
const json = (await resp.json());
|
|
740
|
+
const text = json?.candidates?.[0]?.content?.parts?.[0]?.text ?? "";
|
|
741
|
+
// Parse JSON from response
|
|
742
|
+
const jsonMatch = text.match(/\{[\s\S]*\}/);
|
|
743
|
+
if (!jsonMatch)
|
|
744
|
+
return llmCriteriaHeuristic(steps);
|
|
745
|
+
const parsed = JSON.parse(jsonMatch[0]);
|
|
746
|
+
return parsed.criteria.map((c) => {
|
|
747
|
+
const def = llmCriteria.find((lc) => c.criterion.includes(lc.criterion.slice(0, 30)));
|
|
748
|
+
return {
|
|
749
|
+
criterion: c.criterion,
|
|
750
|
+
weight: def?.weight ?? 1,
|
|
751
|
+
pass: c.pass,
|
|
752
|
+
evidence: c.evidence,
|
|
753
|
+
};
|
|
754
|
+
});
|
|
755
|
+
}
|
|
756
|
+
catch (err) {
|
|
757
|
+
console.warn(`[gemini] Judge error: ${err?.message ?? err}`);
|
|
758
|
+
return llmCriteriaHeuristic(steps);
|
|
759
|
+
}
|
|
760
|
+
}
|
|
761
|
+
/**
 * Heuristic fallback when Gemini is unavailable.
 *
 * Grades the three LLM-judged criteria (PIPELINE_CRITERIA[0], [2] and [3])
 * with cheap text heuristics over the step output previews:
 *  - final output is structured and references earlier steps,
 *  - capitalized entity names overlap across step outputs,
 *  - final artifact is substantive enough to be usable standalone.
 *
 * @param steps Executed step results (ok, outputSize, outputPreview, ...).
 * @returns Array of { criterion, weight, pass, evidence } results.
 */
function llmCriteriaHeuristic(steps) {
    const results = [];
    // FIX: removed the unused `allOutputs` concatenation — it was computed but
    // never read anywhere in this function.
    const lastOutput = steps[steps.length - 1]?.outputPreview ?? "";
    // Hoisted out of the word-overlap loop below (was re-lowercased per word).
    const lastOutputLower = lastOutput.toLowerCase();
    // "Final output contains structured data derived from earlier pipeline steps"
    const hasStructuredFinal = lastOutput.length > 50 && (lastOutput.includes("{") || lastOutput.includes("##") || lastOutput.includes("- "));
    const referencesEarlier = steps.length > 1 && steps.slice(0, -1).some((s) => {
        // Check if any entity/keyword from earlier steps appears in final output
        const words = s.outputPreview.split(/\s+/).filter((w) => w.length > 5);
        return words.some((w) => lastOutputLower.includes(w.toLowerCase()));
    });
    results.push({
        criterion: PIPELINE_CRITERIA[0].criterion,
        weight: PIPELINE_CRITERIA[0].weight,
        pass: hasStructuredFinal && (referencesEarlier || steps.length <= 2),
        evidence: hasStructuredFinal
            ? `Final output is ${lastOutput.length}b with structure; references earlier: ${referencesEarlier}`
            : `Final output too short (${lastOutput.length}b) or unstructured`,
    });
    // "Output entity names are consistent across pipeline steps"
    // Extract capitalized multi-word names from outputs
    const entitySets = steps
        .filter((s) => s.ok && s.outputSize > 20)
        .map((s) => {
        const matches = s.outputPreview.match(/\b[A-Z][a-z]+(?:\s+[A-Z][a-z]+)*\b/g) ?? [];
        return new Set(matches.filter((m) => m.length > 3));
    });
    const firstEntities = entitySets[0] ?? new Set();
    // Consistent when any entity from the first step reappears in a later step
    // (trivially consistent when fewer than two steps produced entities).
    const consistentEntities = entitySets.length <= 1 ||
        [...firstEntities].some((e) => entitySets.slice(1).some((s) => s.has(e)));
    results.push({
        criterion: PIPELINE_CRITERIA[2].criterion,
        weight: PIPELINE_CRITERIA[2].weight,
        pass: consistentEntities,
        evidence: consistentEntities
            ? `Entity names consistent across steps`
            : `No overlapping entities found between step outputs`,
    });
    // "Final artifact is usable without re-running the pipeline"
    const isUsable = lastOutput.length > 100 && !lastOutput.startsWith("ERROR") && !lastOutput.startsWith("(null)");
    results.push({
        criterion: PIPELINE_CRITERIA[3].criterion,
        weight: PIPELINE_CRITERIA[3].weight,
        pass: isUsable,
        evidence: isUsable
            ? `Final output is ${lastOutput.length}b, substantive and self-contained`
            : `Final output is too short or is an error`,
    });
    return results;
}
|
|
812
|
+
// ══════════════════════════════════════════════════════════════════════════════
// PERSISTENCE — SQLite storage
// ══════════════════════════════════════════════════════════════════════════════
/**
 * Write a completed pipeline run to SQLite: one parent row in
 * pipeline_eval_runs plus one child row per executed step in
 * pipeline_eval_steps. Ensures the eval schema exists first.
 */
function persistRun(result) {
    const db = getDb();
    ensureSchema();
    // Parent row: aggregate stats for the whole run.
    const runStmt = db.prepare(`INSERT INTO pipeline_eval_runs (run_id, pipeline_name, timestamp, step_count, steps_passed, overall_pass, criteria_json, total_ms)
              VALUES (?, ?, ?, ?, ?, ?, ?, ?)`);
    const passedCount = result.steps.filter((s) => s.ok).length;
    runStmt.run(result.runId, result.pipelineName, result.timestamp, result.steps.length, passedCount, result.overallPass ? 1 : 0, JSON.stringify(result.criteria), result.totalMs);
    // Child rows: per-step details, keyed back to the run via run_id.
    const stepStmt = db.prepare(`INSERT INTO pipeline_eval_steps (id, run_id, step_index, tool, description, args_json, ok, output_size, output_preview, error, ms)
              VALUES (?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?)`);
    result.steps.forEach((step) => {
        stepStmt.run(genId("step"), result.runId, step.stepIndex, step.tool, step.description, JSON.stringify(step.args), step.ok ? 1 : 0, step.outputSize, step.outputPreview, step.error ?? null, step.ms);
    });
}
|
|
826
|
+
// ══════════════════════════════════════════════════════════════════════════════
// MAIN EVAL RUNNER
// ══════════════════════════════════════════════════════════════════════════════
/**
 * Load every tool domain, seed test data, execute the selected pipelines,
 * grade each trace (code-graded criteria first, then the LLM judge), persist
 * each run to SQLite, and clean up the seeded data.
 *
 * @param pipelineName Optional pipeline name; when omitted, all PIPELINES run.
 *                     An unknown name prints the available pipelines and exits 1.
 * @returns Array of per-pipeline run results.
 */
async function runPipelineEval(pipelineName) {
    // 1. Load all tools
    console.log("[pipeline-eval] Loading all tool domains...");
    const allTools = await loadToolsets(ALL_DOMAIN_KEYS);
    console.log(`[pipeline-eval] Loaded ${allTools.length} tools across ${ALL_DOMAIN_KEYS.length} domains`);
    // Also create progressive discovery tools
    const { createProgressiveDiscoveryTools } = await import("../tools/progressiveDiscoveryTools.js");
    const discoveryTools = createProgressiveDiscoveryTools(allTools.map((t) => ({ name: t.name, description: t.description })));
    const fullToolset = [...allTools, ...discoveryTools];
    // 2. Seed test data
    console.log("[pipeline-eval] Seeding test data...");
    seedPipelineData();
    // 3. Select pipelines to run
    const pipelines = pipelineName
        ? PIPELINES.filter((p) => p.name === pipelineName)
        : PIPELINES;
    if (pipelines.length === 0) {
        console.error(`[pipeline-eval] Unknown pipeline: ${pipelineName}`);
        console.error(`  Available: ${PIPELINES.map((p) => p.name).join(", ")}`);
        process.exit(1);
    }
    // 4. Run each pipeline
    const results = [];
    // FIX: execution is wrapped in try/finally so the seeded test data is
    // cleaned up even when a pipeline step or the judge throws. Previously an
    // exception propagated past cleanupPipelineData() and leaked seed rows.
    try {
        for (const pipeline of pipelines) {
            console.log(`\n${"═".repeat(60)}`);
            console.log(`PIPELINE: ${pipeline.name}`);
            console.log(`  ${pipeline.description}`);
            console.log(`  Steps: ${pipeline.steps.length}`);
            console.log("═".repeat(60));
            const runId = genId("pipe");
            const startMs = Date.now();
            // Execute the pipeline
            const steps = await runPipeline(pipeline, fullToolset);
            const totalMs = Date.now() - startMs;
            // Print step results
            for (const step of steps) {
                const status = step.ok ? "PASS" : "FAIL";
                const icon = step.ok ? "[+]" : "[-]";
                console.log(`  ${icon} Step ${step.stepIndex + 1}: ${step.tool} (${step.ms}ms) — ${status}`);
                if (step.error) {
                    console.log(`      Error: ${step.error.slice(0, 150)}`);
                }
                else {
                    console.log(`      Output: ${step.outputSize}b — ${step.outputPreview.slice(0, 120)}...`);
                }
            }
            // Grade criteria — code first, then LLM
            console.log(`\n  Grading criteria...`);
            const codeGraded = codeGradeCriteria(steps, pipeline);
            const llmGraded = await geminiJudgePipeline(pipeline, steps, codeGraded);
            const allCriteria = [...codeGraded, ...llmGraded];
            // Overall pass: weighted score
            const totalWeight = allCriteria.reduce((sum, c) => sum + c.weight, 0);
            const passedWeight = allCriteria.filter((c) => c.pass).reduce((sum, c) => sum + c.weight, 0);
            const weightedPassRate = totalWeight > 0 ? passedWeight / totalWeight : 0;
            const overallPass = weightedPassRate >= 0.6; // 60% weighted threshold
            // Print criteria results
            for (const c of allCriteria) {
                const icon = c.pass ? "[+]" : "[-]";
                console.log(`  ${icon} (w=${c.weight}) ${c.criterion}`);
                console.log(`      ${c.evidence}`);
            }
            console.log(`\n  RESULT: ${overallPass ? "PASS" : "FAIL"} (${(weightedPassRate * 100).toFixed(1)}% weighted, ${totalMs}ms)`);
            const runResult = {
                pipelineName: pipeline.name,
                runId,
                steps,
                criteria: allCriteria,
                overallPass,
                totalMs,
                timestamp: new Date().toISOString(),
            };
            // Persist to SQLite (best-effort: a storage failure must not abort the eval)
            try {
                persistRun(runResult);
            }
            catch (err) {
                console.warn(`  [persist] Warning: ${err?.message}`);
            }
            results.push(runResult);
        }
    }
    finally {
        // Cleanup seed data (runs on both success and failure)
        console.log("\n[pipeline-eval] Cleaning up seed data...");
        cleanupPipelineData();
    }
    return results;
}
|
|
915
|
+
// ══════════════════════════════════════════════════════════════════════════════
// SUMMARY PRINTER
// ══════════════════════════════════════════════════════════════════════════════
/**
 * Print an aggregate summary of all pipeline runs: a per-pipeline table
 * (steps passed, criteria passed, wall time, result), a per-criterion
 * pass-rate breakdown, and overall totals.
 *
 * @param results Array of pipeline run results from runPipelineEval().
 */
function printSummary(results) {
    console.log("\n" + "═".repeat(60));
    console.log("PIPELINE EVAL SUMMARY");
    console.log("═".repeat(60));
    const passed = results.filter((r) => r.overallPass).length;
    const total = results.length;
    console.log(`\n  Pipelines: ${passed}/${total} passed`);
    console.log("");
    // Per-pipeline table (column widths match the padEnd calls below)
    console.log("  Pipeline                      Steps  Criteria  Time     Result");
    console.log("  " + "-".repeat(70));
    for (const r of results) {
        const stepsPassed = r.steps.filter((s) => s.ok).length;
        const critPassed = r.criteria.filter((c) => c.pass).length;
        const status = r.overallPass ? "PASS" : "FAIL";
        const name = r.pipelineName.padEnd(28);
        const stepsStr = `${stepsPassed}/${r.steps.length}`.padEnd(6);
        const critStr = `${critPassed}/${r.criteria.length}`.padEnd(9);
        const timeStr = `${r.totalMs}ms`.padEnd(7);
        console.log(`  ${name} ${stepsStr} ${critStr} ${timeStr} ${status}`);
    }
    // Aggregate criteria: pass/total tallies keyed by criterion text
    console.log("\n  Criteria Breakdown:");
    const criteriaMap = new Map();
    for (const r of results) {
        for (const c of r.criteria) {
            const entry = criteriaMap.get(c.criterion) ?? { pass: 0, total: 0 };
            entry.total++;
            if (c.pass)
                entry.pass++;
            criteriaMap.set(c.criterion, entry);
        }
    }
    for (const [criterion, stats] of criteriaMap) {
        const rate = ((stats.pass / stats.total) * 100).toFixed(0);
        const icon = stats.pass === stats.total ? "[+]" : "[-]";
        console.log(`  ${icon} ${rate}% — ${criterion.slice(0, 65)}`);
    }
    // Total time
    const totalTime = results.reduce((sum, r) => sum + r.totalMs, 0);
    console.log(`\n  Total time: ${totalTime}ms`);
    // FIX: guard the division — the original printed "NaN%" for zero results.
    const overallRate = total > 0 ? ((passed / total) * 100).toFixed(1) : "0.0";
    console.log(`  Overall pass rate: ${overallRate}%`);
    console.log("");
}
|
|
962
|
+
// ══════════════════════════════════════════════════════════════════════════════
// CLI
// ══════════════════════════════════════════════════════════════════════════════
/**
 * Parse CLI flags into an options object.
 * Supported: `--pipeline NAME` (run one pipeline) and `--all` (run all).
 * Any other `--flag` prints usage and exits 1; bare positional tokens are ignored.
 */
function parseArgs(argv) {
    const options = {};
    let i = 0;
    while (i < argv.length) {
        const token = argv[i];
        if (token === "--pipeline") {
            // Consume the flag's value (may be undefined if the flag is last).
            i += 1;
            options.pipeline = argv[i];
        }
        else if (token === "--all") {
            options.all = true;
        }
        else if (token.startsWith("--")) {
            console.error(`Unknown flag: ${token}`);
            console.error(`Usage: npx tsx src/benchmarks/pipelineEval.ts [--pipeline NAME] [--all]`);
            console.error(`Pipelines: ${PIPELINES.map((p) => p.name).join(", ")}`);
            process.exit(1);
        }
        i += 1;
    }
    return options;
}
|
|
987
|
+
/**
 * CLI entry point: best-effort loads GEMINI_API_KEY from the nearest
 * `.env.local` (cwd, two and three levels up) when it is not already set,
 * wires the DB accessor, runs the selected pipelines, prints the summary,
 * and exits 0 only when at least half of the pipelines passed.
 */
async function main() {
    // Load .env.local for GEMINI_API_KEY
    if (!process.env.GEMINI_API_KEY) {
        try {
            const fs = await import("fs");
            const path = await import("path");
            const candidates = [
                path.resolve(process.cwd(), ".env.local"),
                path.resolve(process.cwd(), "../../.env.local"),
                path.resolve(process.cwd(), "../../../.env.local"),
            ];
            for (const envPath of candidates) {
                if (fs.existsSync(envPath)) {
                    const content = fs.readFileSync(envPath, "utf-8");
                    for (const line of content.split("\n")) {
                        const match = line.match(/^([^#=]+)=(.*)$/);
                        if (!match)
                            continue;
                        const key = match[1].trim();
                        // FIX: strip surrounding quotes — KEY="value" / KEY='value' are
                        // standard dotenv syntax; previously the quotes leaked into the value.
                        let value = match[2].trim();
                        if (value.length >= 2 &&
                            ((value.startsWith('"') && value.endsWith('"')) ||
                                (value.startsWith("'") && value.endsWith("'")))) {
                            value = value.slice(1, -1);
                        }
                        // FIX: don't clobber variables already set in the real environment
                        // (matches dotenv semantics; the original overwrote them).
                        if (process.env[key] === undefined)
                            process.env[key] = value;
                    }
                    if (process.env.GEMINI_API_KEY) {
                        console.log(`[env] Loaded GEMINI_API_KEY from ${envPath}`);
                        break;
                    }
                }
            }
        }
        catch {
            /* ignore env loading errors — the heuristic judge fallback still works */
        }
    }
    const options = parseArgs(process.argv.slice(2));
    console.log("NodeBench Pipeline Eval Harness");
    console.log("═".repeat(40));
    console.log(`  Gemini: ${process.env.GEMINI_API_KEY ? "available" : "unavailable (heuristic fallback)"}`);
    console.log(`  Pipeline: ${options.pipeline ?? "all"}`);
    console.log("");
    // Wire up DB accessor for toolRegistry
    _setDbAccessor(getDb);
    const results = await runPipelineEval(options.pipeline);
    printSummary(results);
    // Exit with code based on pass rate.
    // FIX: guard the division — zero results previously produced NaN (treated
    // as failure by accident); make the failure explicit.
    const passRate = results.length > 0
        ? results.filter((r) => r.overallPass).length / results.length
        : 0;
    process.exit(passRate >= 0.5 ? 0 : 1);
}
|
|
1031
|
+
// Entry point — any uncaught failure is reported and converted to exit code 1.
(async () => {
    try {
        await main();
    }
    catch (err) {
        console.error("[pipeline-eval] Fatal error:", err);
        process.exit(1);
    }
})();
//# sourceMappingURL=pipelineEval.js.map
|