nodebench-mcp 2.66.0 → 2.68.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -0,0 +1,1053 @@
1
+ #!/usr/bin/env npx tsx
2
+ /**
3
+ * pipelineEval.ts — Multi-step pipeline eval harness for NodeBench MCP
4
+ *
5
+ * Unlike llmJudgeEval.ts which tests tools independently, this harness tests
6
+ * realistic multi-step chains where Tool A's output feeds Tool B's input.
7
+ *
8
+ * Architecture:
9
+ * 1. Pipeline Definitions — 6 canonical pipelines modeling real agent workflows
10
+ * 2. Chaining Engine — executes steps sequentially, extracts fields from output
11
+ * 3. LLM Judge — Gemini Flash Lite evaluates 5 boolean criteria on full trace
12
+ * 4. Per-step tracking — tool name, args, output size, duration, pass/fail
13
+ * 5. SQLite persistence — pipeline_eval_runs + pipeline_eval_steps tables
14
+ *
15
+ * Usage:
16
+ * cd packages/mcp-local
17
+ * npx tsx src/benchmarks/pipelineEval.ts [--pipeline NAME] [--all]
18
+ */
19
+ import { getDb, genId } from "../db.js";
20
+ import { _setDbAccessor } from "../tools/toolRegistry.js";
21
+ import { loadToolsets, ALL_DOMAIN_KEYS } from "../toolsetRegistry.js";
22
+ // ══════════════════════════════════════════════════════════════════════════════
23
+ // SCHEMA
24
+ // ══════════════════════════════════════════════════════════════════════════════
25
// DDL for pipeline-eval persistence: one row per run, one row per executed
// step, plus an index on the step table's run_id foreign key. Every
// statement is IF NOT EXISTS so ensureSchema() can run it idempotently.
const PIPELINE_EVAL_SCHEMA = `
CREATE TABLE IF NOT EXISTS pipeline_eval_runs (
  run_id TEXT PRIMARY KEY,
  pipeline_name TEXT NOT NULL,
  timestamp TEXT NOT NULL DEFAULT (datetime('now')),
  step_count INTEGER NOT NULL DEFAULT 0,
  steps_passed INTEGER NOT NULL DEFAULT 0,
  overall_pass INTEGER NOT NULL DEFAULT 0,
  criteria_json TEXT,
  total_ms INTEGER NOT NULL DEFAULT 0
);

CREATE TABLE IF NOT EXISTS pipeline_eval_steps (
  id TEXT PRIMARY KEY,
  run_id TEXT NOT NULL,
  step_index INTEGER NOT NULL,
  tool TEXT NOT NULL,
  description TEXT,
  args_json TEXT,
  ok INTEGER NOT NULL DEFAULT 0,
  output_size INTEGER NOT NULL DEFAULT 0,
  output_preview TEXT,
  error TEXT,
  ms INTEGER NOT NULL DEFAULT 0
);

CREATE INDEX IF NOT EXISTS idx_pipeline_eval_steps_run ON pipeline_eval_steps(run_id);
`;
53
/** Idempotently create the pipeline-eval tables; safe to call on every run. */
function ensureSchema() {
  getDb().exec(PIPELINE_EVAL_SCHEMA);
}
57
+ // ══════════════════════════════════════════════════════════════════════════════
58
+ // TEST DATA SEEDING — populate SQLite before pipeline execution
59
+ // ══════════════════════════════════════════════════════════════════════════════
60
+ const SEED_PREFIX = "pipe_eval_";
61
/**
 * Create the SQLite tables that the pipeline tools read and write, so that
 * seeding can happen before any tool has lazily created them itself.
 * All DDL is IF NOT EXISTS, so repeated calls are harmless.
 */
function ensureToolSchemas() {
  getDb().exec(`
    CREATE TABLE IF NOT EXISTS causal_events (
      id INTEGER PRIMARY KEY AUTOINCREMENT,
      userId TEXT NOT NULL,
      eventType TEXT NOT NULL,
      payload TEXT,
      createdAt TEXT NOT NULL DEFAULT (datetime('now'))
    );
    CREATE TABLE IF NOT EXISTS causal_important_changes (
      id INTEGER PRIMARY KEY AUTOINCREMENT,
      changeId TEXT UNIQUE NOT NULL,
      changeCategory TEXT NOT NULL,
      impactScore REAL NOT NULL DEFAULT 0,
      impactReason TEXT,
      affectedEntities TEXT,
      suggestedAction TEXT,
      status TEXT NOT NULL DEFAULT 'detected',
      timestampMs INTEGER,
      createdAt TEXT NOT NULL DEFAULT (datetime('now'))
    );
    CREATE TABLE IF NOT EXISTS founder_packets (
      id INTEGER PRIMARY KEY AUTOINCREMENT,
      packetId TEXT UNIQUE NOT NULL,
      sessionId TEXT,
      packetType TEXT NOT NULL,
      role TEXT NOT NULL DEFAULT 'founder',
      content TEXT NOT NULL,
      metadata TEXT,
      createdAt TEXT NOT NULL DEFAULT (datetime('now'))
    );
    CREATE TABLE IF NOT EXISTS tracking_actions (
      id INTEGER PRIMARY KEY AUTOINCREMENT,
      actionId TEXT UNIQUE NOT NULL,
      sessionId TEXT NOT NULL,
      timestamp TEXT NOT NULL,
      action TEXT NOT NULL,
      category TEXT NOT NULL,
      beforeState TEXT,
      afterState TEXT,
      reasoning TEXT,
      filesChanged TEXT,
      impactLevel TEXT NOT NULL,
      dayOfWeek TEXT NOT NULL,
      weekNumber INTEGER NOT NULL,
      month TEXT NOT NULL,
      quarter TEXT NOT NULL,
      year INTEGER NOT NULL
    );
    CREATE TABLE IF NOT EXISTS tracking_milestones (
      id INTEGER PRIMARY KEY AUTOINCREMENT,
      milestoneId TEXT UNIQUE NOT NULL,
      title TEXT NOT NULL,
      category TEXT NOT NULL,
      description TEXT,
      evidence TEXT,
      createdAt TEXT NOT NULL DEFAULT (datetime('now'))
    );
    CREATE TABLE IF NOT EXISTS intent_residuals (
      id INTEGER PRIMARY KEY AUTOINCREMENT,
      intentId TEXT UNIQUE NOT NULL,
      intent TEXT NOT NULL,
      status TEXT NOT NULL DEFAULT 'active',
      context TEXT,
      createdAt TEXT NOT NULL,
      updatedAt TEXT NOT NULL
    );
    CREATE TABLE IF NOT EXISTS session_summaries (
      id INTEGER PRIMARY KEY AUTOINCREMENT,
      sessionId TEXT UNIQUE NOT NULL,
      summary TEXT NOT NULL,
      toolCount INTEGER NOT NULL DEFAULT 0,
      entityCount INTEGER NOT NULL DEFAULT 0,
      createdAt TEXT NOT NULL DEFAULT (datetime('now'))
    );
    CREATE TABLE IF NOT EXISTS recon_sessions (
      id INTEGER PRIMARY KEY AUTOINCREMENT,
      sessionId TEXT UNIQUE NOT NULL,
      target TEXT NOT NULL,
      scope TEXT NOT NULL DEFAULT 'quick',
      status TEXT NOT NULL DEFAULT 'active',
      findings TEXT,
      metadata TEXT,
      createdAt TEXT NOT NULL DEFAULT (datetime('now'))
    );
  `);
}
149
/**
 * Seed deterministic fixture rows (all keyed with SEED_PREFIX) so the
 * pipelines have data to read. Creates the tool tables first.
 *
 * NOTE: causal_events has no UNIQUE column, so INSERT OR IGNORE cannot
 * dedupe it — repeated runs append rows; cleanupPipelineData removes them.
 */
function seedPipelineData() {
  const db = getDb();
  ensureToolSchemas();
  const iso = new Date().toISOString();
  const dayAgo = Date.now() - 86_400_000;
  // Seed events
  const events = [
    { userId: `${SEED_PREFIX}user`, eventType: "packet.generated", payload: JSON.stringify({ summary: "Weekly founder briefing generated", entityId: "nodebench" }), createdAt: iso },
    { userId: `${SEED_PREFIX}user`, eventType: "recon.completed", payload: JSON.stringify({ target: "Anthropic", findings: 5 }), createdAt: iso },
    { userId: `${SEED_PREFIX}user`, eventType: "strategy.review", payload: JSON.stringify({ topic: "MCP distribution", status: "active" }), createdAt: iso },
  ];
  const evtStmt = db.prepare(`INSERT OR IGNORE INTO causal_events (userId, eventType, payload, createdAt) VALUES (?, ?, ?, ?)`);
  for (const e of events)
    evtStmt.run(e.userId, e.eventType, e.payload, e.createdAt);
  // Seed important changes
  const changes = [
    { changeId: `${SEED_PREFIX}chg_001`, changeCategory: "competitive", impactScore: 0.85, impactReason: "Anthropic launched Model Context Protocol marketplace with 200+ servers", affectedEntities: JSON.stringify(["anthropic", "nodebench"]), suggestedAction: "Evaluate marketplace listing for NodeBench", status: "detected", timestampMs: dayAgo },
    { changeId: `${SEED_PREFIX}chg_002`, changeCategory: "product", impactScore: 0.72, impactReason: "Supermemory raised $4M seed for memory-layer MCP server", affectedEntities: JSON.stringify(["supermemory"]), suggestedAction: "Analyze differentiation vs Supermemory approach", status: "detected", timestampMs: dayAgo + 3600_000 },
    { changeId: `${SEED_PREFIX}chg_003`, changeCategory: "strategy", impactScore: 0.68, impactReason: "NodeBench eval pass rate reached 100% — ready for distribution push", affectedEntities: JSON.stringify(["nodebench"]), suggestedAction: "Publish to npm and MCP registry", status: "detected", timestampMs: dayAgo + 7200_000 },
  ];
  const chgStmt = db.prepare(`INSERT OR IGNORE INTO causal_important_changes (changeId, changeCategory, impactScore, impactReason, affectedEntities, suggestedAction, status, timestampMs) VALUES (?, ?, ?, ?, ?, ?, ?, ?)`);
  for (const c of changes)
    chgStmt.run(c.changeId, c.changeCategory, c.impactScore, c.impactReason, c.affectedEntities, c.suggestedAction, c.status, c.timestampMs);
  // Seed a recon session for the competitor_brief pipeline.
  // BUG FIX: the previous insert used columns (id, target, description,
  // status, created_at) which do not exist on the recon_sessions table
  // created by ensureToolSchemas() above (sessionId/scope/metadata/createdAt),
  // so it always threw and was silently swallowed — the session was never
  // seeded. Use the actual column names so the fallback sessionId used by
  // competitor_brief's dynamicArgs resolves to a real row.
  try {
    db.prepare(`INSERT OR IGNORE INTO recon_sessions (sessionId, target, scope, status, metadata, createdAt) VALUES (?, ?, ?, ?, ?, ?)`)
      .run(`${SEED_PREFIX}recon_supermemory`, "Supermemory", "full", "completed", "Memory-layer MCP server competitor analysis", iso);
  }
  catch { /* run_recon may own this table with a different schema — best effort */ }
  // recon_findings is not created by ensureToolSchemas (run_recon owns it),
  // so attempt the insert separately and tolerate a missing table.
  try {
    db.prepare(`INSERT OR IGNORE INTO recon_findings (id, session_id, category, finding, severity, source, created_at) VALUES (?, ?, ?, ?, ?, ?, ?)`)
      .run(`${SEED_PREFIX}finding_001`, `${SEED_PREFIX}recon_supermemory`, "new_feature", "Supermemory raised $4M seed for universal memory infrastructure with MCP distribution", "medium", "web_search", iso);
  }
  catch { /* table may not exist yet — will be created by run_recon */ }
  console.log(`[seedPipelineData] Seeded: 3 events, 3 important_changes, 1 recon session`);
}
184
/**
 * Remove every seeded fixture row (anything matching SEED_PREFIX).
 *
 * BUG FIX: the original wrapped all DELETEs in a single try/catch, so the
 * first missing table silently skipped every later DELETE. Each statement
 * now gets its own try, and the LIKE pattern is bound as a parameter
 * instead of being interpolated into the SQL text.
 */
function cleanupPipelineData() {
  const db = getDb();
  const prefix = `${SEED_PREFIX}%`;
  // [table, key column] pairs seeded or written by the pipelines.
  const targets = [
    ["causal_events", "userId"],
    ["causal_important_changes", "changeId"],
    ["recon_sessions", "sessionId"],
    ["tracking_actions", "sessionId"],
    ["tracking_milestones", "milestoneId"],
    ["intent_residuals", "intentId"],
  ];
  for (const [table, column] of targets) {
    try {
      db.prepare(`DELETE FROM ${table} WHERE ${column} LIKE ?`).run(prefix);
    }
    catch {
      /* table may not exist yet */
    }
  }
}
199
+ // ══════════════════════════════════════════════════════════════════════════════
200
+ // OUTPUT EXTRACTION HELPERS
201
+ // ══════════════════════════════════════════════════════════════════════════════
202
/**
 * Flatten an MCP tool result into a single display string.
 * Handles plain strings, arrays of content blocks ({type:"text"} or any
 * object with a .text field), and arbitrary objects. Falsy inputs
 * (null/undefined/""/0) yield the sentinel "(null)".
 */
function extractText(result) {
  if (!result)
    return "(null)";
  if (typeof result === "string")
    return result;
  if (Array.isArray(result)) {
    const pieces = [];
    for (const block of result) {
      let piece;
      if (typeof block === "string")
        piece = block;
      else if (block?.type === "text")
        piece = block.text;
      else if (block?.text)
        piece = block.text;
      else
        piece = JSON.stringify(block);
      if (piece)
        pieces.push(piece);
    }
    // Empty/all-falsy blocks: fall back to the raw JSON of the array.
    return pieces.join("\n") || JSON.stringify(result);
  }
  return typeof result === "object" ? JSON.stringify(result) : String(result);
}
226
/**
 * Parse `text` as JSON and return the resulting object/array, or null.
 * When a direct parse throws, attempt to pull the first {...} span out of
 * the text (e.g. JSON embedded in prose) and parse that instead.
 * A direct parse that yields a primitive also returns null.
 */
function tryParseJson(text) {
  let direct;
  let failed = false;
  try {
    direct = JSON.parse(text);
  }
  catch {
    failed = true;
  }
  if (!failed) {
    return typeof direct === "object" && direct !== null ? direct : null;
  }
  // Direct parse threw — look for an embedded JSON object.
  const embedded = text.match(/\{[\s\S]*\}/);
  if (embedded) {
    try {
      return JSON.parse(embedded[0]);
    }
    catch { /* not JSON */ }
  }
  return null;
}
245
/**
 * Walk a dot-separated `path` (e.g. "a.b.c") into `obj`.
 * Returns undefined as soon as any hop lands on a missing or non-object
 * value; arrays work too since numeric segments index them.
 */
function extractField(obj, path) {
  let node = obj;
  for (const segment of path.split(".")) {
    if (node == null || typeof node !== "object")
      return undefined;
    node = node[segment];
  }
  return node;
}
258
+ // ══════════════════════════════════════════════════════════════════════════════
259
+ // PIPELINE DEFINITIONS — 6 canonical multi-step chains
260
+ // ══════════════════════════════════════════════════════════════════════════════
261
+ const PIPELINES = [
262
+ // ── Pipeline 1: founder_weekly_reset ──────────────────────────────
263
+ {
264
+ name: "founder_weekly_reset",
265
+ description: "Full weekly reset: discover → load → synthesize → track → milestone",
266
+ steps: [
267
+ {
268
+ tool: "discover_tools",
269
+ staticArgs: { query: "founder weekly reset", limit: 10 },
270
+ description: "Discover tools for founder weekly reset workflow",
271
+ },
272
+ {
273
+ tool: "load_toolset",
274
+ staticArgs: { toolset: "founder" },
275
+ description: "Load the founder toolset",
276
+ },
277
+ {
278
+ tool: "founder_local_weekly_reset",
279
+ staticArgs: { query: "NodeBench weekly founder reset" },
280
+ description: "Run the weekly reset pipeline",
281
+ },
282
+ {
283
+ tool: "track_action",
284
+ staticArgs: {
285
+ category: "strategy",
286
+ action: "weekly_reset",
287
+ beforeState: "{}",
288
+ },
289
+ dynamicArgs: {
290
+ afterState: (prev) => {
291
+ const text = extractText(prev);
292
+ return text.slice(0, 500);
293
+ },
294
+ },
295
+ description: "Track the weekly reset action with before/after state",
296
+ },
297
+ {
298
+ tool: "track_milestone",
299
+ staticArgs: {
300
+ title: "Weekly reset completed",
301
+ category: "operations",
302
+ },
303
+ dynamicArgs: {
304
+ description: (prev) => {
305
+ const text = extractText(prev);
306
+ return `Weekly reset tracked: ${text.slice(0, 200)}`;
307
+ },
308
+ },
309
+ description: "Record a milestone for the completed weekly reset",
310
+ },
311
+ ],
312
+ },
313
+ // ── Pipeline 2: company_intelligence ──────────────────────────────
314
+ {
315
+ name: "company_intelligence",
316
+ description: "Company analysis: discover → load → synthesize → export → record event",
317
+ steps: [
318
+ {
319
+ tool: "discover_tools",
320
+ staticArgs: { query: "company analysis Anthropic", limit: 10 },
321
+ description: "Discover tools for company analysis",
322
+ },
323
+ {
324
+ tool: "load_toolset",
325
+ staticArgs: { toolset: "founder" },
326
+ description: "Load the founder toolset",
327
+ },
328
+ {
329
+ tool: "founder_local_synthesize",
330
+ staticArgs: {
331
+ query: "Analyze Anthropic competitive position",
332
+ includeWeb: false,
333
+ packetType: "company_search",
334
+ daysBack: 7,
335
+ },
336
+ description: "Synthesize Anthropic competitive analysis",
337
+ },
338
+ {
339
+ tool: "export_artifact_packet",
340
+ staticArgs: { format: "markdown", audience: "banker" },
341
+ dynamicArgs: {
342
+ content: (prev) => {
343
+ const text = extractText(prev);
344
+ const parsed = tryParseJson(text);
345
+ if (parsed?.synthesis)
346
+ return String(parsed.synthesis);
347
+ if (parsed?.content)
348
+ return String(parsed.content);
349
+ return text.slice(0, 2000);
350
+ },
351
+ },
352
+ description: "Export the synthesis as a markdown artifact for banker audience",
353
+ },
354
+ {
355
+ tool: "record_event",
356
+ staticArgs: {
357
+ eventType: "packet.generated",
358
+ summary: "Anthropic analysis",
359
+ entityId: "anthropic",
360
+ },
361
+ description: "Record the analysis generation event",
362
+ },
363
+ ],
364
+ },
365
+ // ── Pipeline 3: competitor_brief ──────────────────────────────────
366
+ {
367
+ name: "competitor_brief",
368
+ description: "Competitor research: discover → load → recon → extract vars → memo → record learning",
369
+ steps: [
370
+ {
371
+ tool: "discover_tools",
372
+ staticArgs: { query: "competitor analysis", limit: 10 },
373
+ description: "Discover tools for competitor analysis",
374
+ },
375
+ {
376
+ tool: "load_toolset",
377
+ staticArgs: { toolset: "founder" },
378
+ description: "Load the founder toolset",
379
+ },
380
+ {
381
+ tool: "run_recon",
382
+ staticArgs: { target: "Supermemory", scope: "full", webEnrich: true },
383
+ description: "Run recon on Supermemory competitor",
384
+ },
385
+ {
386
+ tool: "extract_variables",
387
+ staticArgs: {},
388
+ dynamicArgs: {
389
+ sessionId: (prev) => {
390
+ const text = extractText(prev);
391
+ const parsed = tryParseJson(text);
392
+ if (parsed?.sessionId)
393
+ return String(parsed.sessionId);
394
+ // Fallback: look for sessionId pattern in text
395
+ const match = text.match(/sessionId["\s:]+["']?([a-zA-Z0-9_-]+)/);
396
+ return match?.[1] ?? `${SEED_PREFIX}recon_supermemory`;
397
+ },
398
+ },
399
+ description: "Extract key variables from the recon session",
400
+ },
401
+ {
402
+ tool: "render_decision_memo",
403
+ staticArgs: { format: "brief" },
404
+ dynamicArgs: {
405
+ sessionId: (prev) => {
406
+ const text = extractText(prev);
407
+ const parsed = tryParseJson(text);
408
+ if (parsed?.sessionId)
409
+ return String(parsed.sessionId);
410
+ const match = text.match(/sessionId["\s:]+["']?([a-zA-Z0-9_-]+)/);
411
+ return match?.[1] ?? `${SEED_PREFIX}recon_supermemory`;
412
+ },
413
+ },
414
+ description: "Render a decision memo from the extracted variables",
415
+ },
416
+ {
417
+ tool: "record_learning",
418
+ staticArgs: { topic: "supermemory_competitive" },
419
+ dynamicArgs: {
420
+ insight: (prev) => {
421
+ const text = extractText(prev);
422
+ return text.slice(0, 500);
423
+ },
424
+ },
425
+ description: "Record the competitive insight as a learning",
426
+ },
427
+ ],
428
+ },
429
+ // ── Pipeline 4: pre_delegation ────────────────────────────────────
430
+ {
431
+ name: "pre_delegation",
432
+ description: "Delegation prep: synthesize → export → track intent → track action",
433
+ steps: [
434
+ {
435
+ tool: "founder_local_synthesize",
436
+ staticArgs: {
437
+ query: "Prepare delegation packet for improving NodeBench suppression quality",
438
+ packetType: "pre_delegation",
439
+ daysBack: 7,
440
+ },
441
+ description: "Synthesize a delegation packet for suppression quality improvement",
442
+ },
443
+ {
444
+ tool: "export_artifact_packet",
445
+ staticArgs: { format: "markdown", audience: "teammate" },
446
+ dynamicArgs: {
447
+ content: (prev) => {
448
+ const text = extractText(prev);
449
+ const parsed = tryParseJson(text);
450
+ if (parsed?.synthesis)
451
+ return String(parsed.synthesis);
452
+ if (parsed?.content)
453
+ return String(parsed.content);
454
+ return text.slice(0, 2000);
455
+ },
456
+ },
457
+ description: "Export the delegation packet as markdown for a teammate",
458
+ },
459
+ {
460
+ tool: "track_intent",
461
+ staticArgs: {
462
+ intent: "Improve suppression quality",
463
+ status: "active",
464
+ },
465
+ description: "Track the delegation intent as active",
466
+ },
467
+ {
468
+ tool: "track_action",
469
+ staticArgs: {
470
+ category: "delegation",
471
+ action: "packet_created",
472
+ },
473
+ dynamicArgs: {
474
+ afterState: (prev) => {
475
+ const text = extractText(prev);
476
+ return text.slice(0, 300);
477
+ },
478
+ },
479
+ description: "Record the delegation packet creation action",
480
+ },
481
+ ],
482
+ },
483
+ // ── Pipeline 5: important_change_review ───────────────────────────
484
+ {
485
+ name: "important_change_review",
486
+ description: "Change review: get ledger → get changes → synthesize → flag change",
487
+ steps: [
488
+ {
489
+ tool: "get_event_ledger",
490
+ staticArgs: { limit: 20 },
491
+ description: "Retrieve the recent event ledger",
492
+ },
493
+ {
494
+ tool: "get_important_changes",
495
+ staticArgs: { status: "detected", limit: 10 },
496
+ description: "Get detected important changes",
497
+ },
498
+ {
499
+ tool: "founder_local_synthesize",
500
+ staticArgs: {
501
+ query: "Summarize important changes and recommend next actions",
502
+ packetType: "important_change",
503
+ daysBack: 7,
504
+ },
505
+ description: "Synthesize a summary of important changes with action recommendations",
506
+ },
507
+ {
508
+ tool: "flag_important_change",
509
+ staticArgs: {
510
+ changeCategory: "strategy",
511
+ impactScore: 7,
512
+ },
513
+ dynamicArgs: {
514
+ summary: (prev) => {
515
+ const text = extractText(prev);
516
+ const parsed = tryParseJson(text);
517
+ if (parsed?.synthesis)
518
+ return String(parsed.synthesis).slice(0, 500);
519
+ return text.slice(0, 500);
520
+ },
521
+ },
522
+ description: "Flag the synthesized change as a strategy-level important change",
523
+ },
524
+ ],
525
+ },
526
+ // ── Pipeline 6: session_memory_cycle ──────────────────────────────
527
+ {
528
+ name: "session_memory_cycle",
529
+ description: "Memory lifecycle: track intent → synthesize → summarize → recover → complete intent",
530
+ steps: [
531
+ {
532
+ tool: "track_intent",
533
+ staticArgs: {
534
+ intent: "Investigate Anthropic valuation",
535
+ status: "active",
536
+ },
537
+ description: "Track the investigation intent as active",
538
+ },
539
+ {
540
+ tool: "founder_local_synthesize",
541
+ staticArgs: {
542
+ query: "What is Anthropic's current valuation?",
543
+ packetType: "company_search",
544
+ daysBack: 7,
545
+ },
546
+ description: "Synthesize information about Anthropic valuation",
547
+ },
548
+ {
549
+ tool: "summarize_session",
550
+ staticArgs: { sessionId: "pipeline_eval_session" },
551
+ description: "Summarize the current session",
552
+ },
553
+ {
554
+ tool: "get_compaction_recovery",
555
+ staticArgs: {},
556
+ description: "Recover compacted session context",
557
+ },
558
+ {
559
+ tool: "track_intent",
560
+ staticArgs: {
561
+ intent: "Investigate Anthropic valuation",
562
+ status: "completed",
563
+ },
564
+ description: "Mark the investigation intent as completed",
565
+ },
566
+ ],
567
+ },
568
+ ];
569
+ // ══════════════════════════════════════════════════════════════════════════════
570
+ // TOOL EXECUTOR
571
+ // ══════════════════════════════════════════════════════════════════════════════
572
/** Look up a tool by exact name; returns null (never undefined) when absent. */
function findTool(tools, name) {
  for (const candidate of tools) {
    if (candidate.name === name)
      return candidate;
  }
  return null;
}
575
/**
 * Invoke a tool handler and time it. Never throws: failures come back as
 * { ok: false, result: null, error } with the elapsed ms either way.
 */
async function callTool(tool, args = {}) {
  const startedAt = Date.now();
  try {
    const result = await tool.handler(args);
    return { ok: true, result, ms: Date.now() - startedAt };
  }
  catch (err) {
    const error = err?.message ?? String(err);
    return { ok: false, result: null, error, ms: Date.now() - startedAt };
  }
}
585
+ // ══════════════════════════════════════════════════════════════════════════════
586
+ // PIPELINE RUNNER — sequential execution with output chaining
587
+ // ══════════════════════════════════════════════════════════════════════════════
588
/**
 * Execute a pipeline's steps in order, feeding each step's raw result into
 * the next step's dynamicArgs extractors. A step whose tool is missing is
 * recorded as failed but does not stop the chain; its predecessor's output
 * is carried forward for the next step's extractors.
 */
async function runPipeline(pipeline, allTools) {
  const stepResults = [];
  let prevOutput = null;
  for (let i = 0; i < pipeline.steps.length; i++) {
    const step = pipeline.steps[i];
    const tool = findTool(allTools, step.tool);
    if (!tool) {
      // Record the miss and keep going — later steps may still run.
      stepResults.push({
        stepIndex: i,
        tool: step.tool,
        description: step.description,
        args: step.staticArgs,
        ok: false,
        outputSize: 0,
        outputPreview: "",
        error: `Tool "${step.tool}" not found in loaded toolsets`,
        ms: 0,
      });
      continue;
    }
    // Merge static args with values extracted from the previous output.
    const args = { ...step.staticArgs };
    if (step.dynamicArgs && prevOutput !== null) {
      for (const [key, extractor] of Object.entries(step.dynamicArgs)) {
        try {
          const value = extractor(prevOutput);
          if (value != null)
            args[key] = value;
        }
        catch {
          // Extractor blew up — fall back to whatever staticArgs provided.
        }
      }
    }
    const { ok, result, error, ms } = await callTool(tool, args);
    const outputText = ok ? extractText(result) : (error ?? "(error)");
    stepResults.push({
      stepIndex: i,
      tool: step.tool,
      description: step.description,
      args,
      ok,
      outputSize: outputText.length,
      outputPreview: outputText.slice(0, 500),
      error: ok ? undefined : error,
      ms,
    });
    // Chain the raw (un-flattened) result forward.
    prevOutput = result;
  }
  return stepResults;
}
642
// Boolean criteria judged for every pipeline run. Indexes [1] and [4] are
// graded deterministically in codeGradeCriteria(); the remainder go to the
// LLM judge (or the heuristic fallback). Weights feed the overall score.
const PIPELINE_CRITERIA = [
  { criterion: "Final output contains structured data derived from earlier pipeline steps", weight: 2 },
  { criterion: "No step produced only errors or empty results", weight: 2 },
  { criterion: "Output entity names are consistent across pipeline steps", weight: 1 },
  { criterion: "Final artifact is usable without re-running the pipeline", weight: 1 },
  { criterion: "Pipeline completed all steps (no step was skipped due to missing input)", weight: 2 },
];
664
/**
 * Deterministic grading for the two structural criteria (indexes 1 and 4 of
 * PIPELINE_CRITERIA) — no LLM involved. Returns one graded entry per
 * criterion with pass/fail and human-readable evidence.
 */
function codeGradeCriteria(steps, pipeline) {
  const graded = [];
  // Criterion [1]: "No step produced only errors or empty results".
  const emptyOrFailed = steps.filter((s) => !s.ok || s.outputSize <= 10);
  const allProduced = emptyOrFailed.length === 0;
  graded.push({
    criterion: PIPELINE_CRITERIA[1].criterion,
    weight: PIPELINE_CRITERIA[1].weight,
    pass: allProduced,
    evidence: allProduced
      ? `All ${steps.length} steps produced non-empty output`
      : `${emptyOrFailed.length} step(s) failed or empty: ${emptyOrFailed.map((s) => s.tool).join(", ")}`,
  });
  // Criterion [4]: "Pipeline completed all steps (no step was skipped)".
  const allCompleted =
    steps.length === pipeline.steps.length &&
    steps.every((s) => s.outputSize > 0 || s.ok);
  const missingTools = [];
  pipeline.steps.forEach((def, i) => {
    if (!steps[i] || steps[i].error?.includes("not found"))
      missingTools.push(def.tool);
  });
  graded.push({
    criterion: PIPELINE_CRITERIA[4].criterion,
    weight: PIPELINE_CRITERIA[4].weight,
    pass: allCompleted,
    evidence: allCompleted
      ? `All ${pipeline.steps.length} steps executed`
      : `Missing/skipped: ${missingTools.join(", ") || emptyOrFailed.map((s) => s.tool).join(", ")}`,
  });
  return graded;
}
693
+ // ══════════════════════════════════════════════════════════════════════════════
694
+ // LLM JUDGE — Gemini evaluates remaining criteria on full pipeline trace
695
+ // ══════════════════════════════════════════════════════════════════════════════
696
+ const GEMINI_MODEL = "gemini-3.1-flash-lite-preview";
697
/**
 * Ask Gemini to judge the criteria not already code-graded, using the full
 * step trace as evidence. Falls back to llmCriteriaHeuristic() when the API
 * key is missing, the HTTP call fails, or the reply is unusable.
 *
 * @param pipeline   Pipeline definition (name/description used in the prompt).
 * @param steps      Per-step execution records from runPipeline().
 * @param codeGraded Criteria already graded by codeGradeCriteria() — excluded here.
 * @returns Array of { criterion, weight, pass, evidence }.
 */
async function geminiJudgePipeline(pipeline, steps, codeGraded) {
  const apiKey = process.env.GEMINI_API_KEY;
  if (!apiKey) {
    // No API key — return heuristic-based results for LLM criteria
    return llmCriteriaHeuristic(steps);
  }
  // Build the full trace for the judge
  const traceText = steps
    .map((s) => `Step ${s.stepIndex + 1} [${s.tool}]: ${s.description}\n` +
      `  Args: ${JSON.stringify(s.args).slice(0, 300)}\n` +
      `  OK: ${s.ok}, Size: ${s.outputSize}b, Time: ${s.ms}ms\n` +
      `  Output: ${s.outputPreview}`)
    .join("\n\n");
  // Only ask Gemini about criteria NOT already code-graded
  const codeGradedCriteria = new Set(codeGraded.map((c) => c.criterion));
  const llmCriteria = PIPELINE_CRITERIA.filter((c) => !codeGradedCriteria.has(c.criterion));
  if (llmCriteria.length === 0)
    return [];
  const criteriaList = llmCriteria.map((c, i) => `${i + 1}. ${c.criterion} (weight: ${c.weight})`).join("\n");
  const prompt = `You are an evaluation judge for NodeBench MCP pipeline chains.

Pipeline: "${pipeline.name}" — ${pipeline.description}

Full execution trace:

${traceText.slice(0, 8000)}

Evaluate these criteria:
${criteriaList}

RULES:
- A pipeline step that returns structured JSON or prose analysis is valid output
- "Derived from earlier steps" means the final output references entities, data, or concepts from previous steps
- "Entity names consistent" means if step 1 mentions "Anthropic", later steps should too (not switch to a different entity)
- "Usable without re-running" means the final output has enough context to be read standalone

Respond in this exact JSON format (no markdown, no explanation):
{
  "criteria": [
    {"criterion": "exact criterion text", "pass": true, "evidence": "brief explanation"}
  ]
}`;
  const url = `https://generativelanguage.googleapis.com/v1beta/models/${GEMINI_MODEL}:generateContent`;
  try {
    const resp = await fetch(url, {
      method: "POST",
      headers: {
        "Content-Type": "application/json",
        "x-goog-api-key": apiKey,
      },
      body: JSON.stringify({
        contents: [{ parts: [{ text: prompt }] }],
        generationConfig: { temperature: 0.1, maxOutputTokens: 1024 },
      }),
      signal: AbortSignal.timeout(30_000),
    });
    if (!resp.ok) {
      console.warn(`[gemini] HTTP ${resp.status}: ${resp.statusText}`);
      return llmCriteriaHeuristic(steps);
    }
    const json = await resp.json();
    const text = json?.candidates?.[0]?.content?.parts?.[0]?.text ?? "";
    // Parse JSON from response
    const jsonMatch = text.match(/\{[\s\S]*\}/);
    if (!jsonMatch)
      return llmCriteriaHeuristic(steps);
    const parsed = JSON.parse(jsonMatch[0]);
    // BUG FIX: the original called parsed.criteria.map() unguarded — a
    // well-formed JSON reply without a "criteria" array threw a TypeError
    // into the catch below, logging a misleading "Judge error". Guard
    // explicitly and fall back cleanly instead.
    if (!Array.isArray(parsed?.criteria))
      return llmCriteriaHeuristic(steps);
    return parsed.criteria.map((c) => {
      // Match the (possibly paraphrased) criterion back to its definition
      // for the weight; unknown criteria default to weight 1.
      const def = llmCriteria.find((lc) => c.criterion.includes(lc.criterion.slice(0, 30)));
      return {
        criterion: c.criterion,
        weight: def?.weight ?? 1,
        pass: c.pass,
        evidence: c.evidence,
      };
    });
  }
  catch (err) {
    console.warn(`[gemini] Judge error: ${err?.message ?? err}`);
    return llmCriteriaHeuristic(steps);
  }
}
779
/**
 * Deterministic stand-in for the LLM judge, used when GEMINI_API_KEY is
 * unset or the API call fails. Grades the three non-code-graded criteria
 * with text-shape heuristics over the step output previews.
 *
 * FIX: removed the unused `allOutputs` local the original computed and
 * never read.
 */
function llmCriteriaHeuristic(steps) {
  const results = [];
  const lastOutput = steps[steps.length - 1]?.outputPreview ?? "";
  // "Final output contains structured data derived from earlier pipeline steps"
  const hasStructuredFinal = lastOutput.length > 50 && (lastOutput.includes("{") || lastOutput.includes("##") || lastOutput.includes("- "));
  const referencesEarlier = steps.length > 1 && steps.slice(0, -1).some((s) => {
    // Check if any entity/keyword from earlier steps appears in final output
    const words = s.outputPreview.split(/\s+/).filter((w) => w.length > 5);
    return words.some((w) => lastOutput.toLowerCase().includes(w.toLowerCase()));
  });
  results.push({
    criterion: PIPELINE_CRITERIA[0].criterion,
    weight: PIPELINE_CRITERIA[0].weight,
    pass: hasStructuredFinal && (referencesEarlier || steps.length <= 2),
    evidence: hasStructuredFinal
      ? `Final output is ${lastOutput.length}b with structure; references earlier: ${referencesEarlier}`
      : `Final output too short (${lastOutput.length}b) or unstructured`,
  });
  // "Output entity names are consistent across pipeline steps"
  // Extract capitalized multi-word names from outputs
  const entitySets = steps
    .filter((s) => s.ok && s.outputSize > 20)
    .map((s) => {
      const matches = s.outputPreview.match(/\b[A-Z][a-z]+(?:\s+[A-Z][a-z]+)*\b/g) ?? [];
      return new Set(matches.filter((m) => m.length > 3));
    });
  const firstEntities = entitySets[0] ?? new Set();
  const consistentEntities = entitySets.length <= 1 ||
    [...firstEntities].some((e) => entitySets.slice(1).some((set) => set.has(e)));
  results.push({
    criterion: PIPELINE_CRITERIA[2].criterion,
    weight: PIPELINE_CRITERIA[2].weight,
    pass: consistentEntities,
    evidence: consistentEntities
      ? `Entity names consistent across steps`
      : `No overlapping entities found between step outputs`,
  });
  // "Final artifact is usable without re-running the pipeline"
  const isUsable = lastOutput.length > 100 && !lastOutput.startsWith("ERROR") && !lastOutput.startsWith("(null)");
  results.push({
    criterion: PIPELINE_CRITERIA[3].criterion,
    weight: PIPELINE_CRITERIA[3].weight,
    pass: isUsable,
    evidence: isUsable
      ? `Final output is ${lastOutput.length}b, substantive and self-contained`
      : `Final output is too short or is an error`,
  });
  return results;
}
830
+ // ══════════════════════════════════════════════════════════════════════════════
831
+ // PERSISTENCE — SQLite storage
832
+ // ══════════════════════════════════════════════════════════════════════════════
833
/**
 * Persists one completed pipeline run to SQLite: a single row in
 * pipeline_eval_runs plus one pipeline_eval_steps row per executed step.
 * The schema is (re)ensured first so a fresh database works out of the box.
 * NOTE(review): inserts are not wrapped in a transaction, so a mid-loop
 * failure can leave a run row with a partial step set — acceptable for an
 * eval harness.
 */
function persistRun(result) {
    const db = getDb();
    ensureSchema();
    const passedSteps = result.steps.filter((s) => s.ok).length;
    const runStmt = db.prepare(`INSERT INTO pipeline_eval_runs (run_id, pipeline_name, timestamp, step_count, steps_passed, overall_pass, criteria_json, total_ms)
 VALUES (?, ?, ?, ?, ?, ?, ?, ?)`);
    runStmt.run(result.runId, result.pipelineName, result.timestamp, result.steps.length, passedSteps, result.overallPass ? 1 : 0, JSON.stringify(result.criteria), result.totalMs);
    const stepStmt = db.prepare(`INSERT INTO pipeline_eval_steps (id, run_id, step_index, tool, description, args_json, ok, output_size, output_preview, error, ms)
 VALUES (?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?)`);
    result.steps.forEach((step) => {
        stepStmt.run(genId("step"), result.runId, step.stepIndex, step.tool, step.description, JSON.stringify(step.args), step.ok ? 1 : 0, step.outputSize, step.outputPreview, step.error ?? null, step.ms);
    });
}
844
+ // ══════════════════════════════════════════════════════════════════════════════
845
+ // MAIN EVAL RUNNER
846
+ // ══════════════════════════════════════════════════════════════════════════════
847
/**
 * Executes the selected pipeline(s) end-to-end and returns one result object
 * per pipeline.
 *
 * Flow per pipeline: load every tool domain (plus progressive-discovery
 * wrappers), seed test data, run the step chain, grade the trace with
 * code-based criteria and then the LLM judge, compute a weighted pass/fail,
 * and persist the run to SQLite (best-effort). Seed data is cleaned up once
 * at the end.
 *
 * @param pipelineName Optional pipeline name filter; when omitted/undefined,
 *   every pipeline in PIPELINES runs. An unknown name exits the process with
 *   code 1 after listing the available names.
 * @returns Array of run-result objects ({ pipelineName, runId, steps,
 *   criteria, overallPass, totalMs, timestamp }).
 */
async function runPipelineEval(pipelineName) {
    // 1. Load all tools
    console.log("[pipeline-eval] Loading all tool domains...");
    const allTools = await loadToolsets(ALL_DOMAIN_KEYS);
    console.log(`[pipeline-eval] Loaded ${allTools.length} tools across ${ALL_DOMAIN_KEYS.length} domains`);
    // Also create progressive discovery tools; they only need name/description
    // metadata from the concrete tools.
    const { createProgressiveDiscoveryTools } = await import("../tools/progressiveDiscoveryTools.js");
    const discoveryTools = createProgressiveDiscoveryTools(allTools.map((t) => ({ name: t.name, description: t.description })));
    const fullToolset = [...allTools, ...discoveryTools];
    // 2. Seed test data
    console.log("[pipeline-eval] Seeding test data...");
    seedPipelineData();
    // 3. Select pipelines to run
    const pipelines = pipelineName
        ? PIPELINES.filter((p) => p.name === pipelineName)
        : PIPELINES;
    if (pipelines.length === 0) {
        console.error(`[pipeline-eval] Unknown pipeline: ${pipelineName}`);
        console.error(` Available: ${PIPELINES.map((p) => p.name).join(", ")}`);
        process.exit(1);
    }
    // 4. Run each pipeline
    const results = [];
    for (const pipeline of pipelines) {
        console.log(`\n${"═".repeat(60)}`);
        console.log(`PIPELINE: ${pipeline.name}`);
        console.log(` ${pipeline.description}`);
        console.log(` Steps: ${pipeline.steps.length}`);
        console.log("═".repeat(60));
        const runId = genId("pipe");
        const startMs = Date.now();
        // Execute the pipeline (chaining engine runs steps sequentially)
        const steps = await runPipeline(pipeline, fullToolset);
        const totalMs = Date.now() - startMs;
        // Print step results
        for (const step of steps) {
            const status = step.ok ? "PASS" : "FAIL";
            const icon = step.ok ? "[+]" : "[-]";
            console.log(` ${icon} Step ${step.stepIndex + 1}: ${step.tool} (${step.ms}ms) — ${status}`);
            if (step.error) {
                console.log(` Error: ${step.error.slice(0, 150)}`);
            }
            else {
                console.log(` Output: ${step.outputSize}b — ${step.outputPreview.slice(0, 120)}...`);
            }
        }
        // Grade criteria — cheap code-based checks first, then the LLM judge
        // (which receives the code-graded results as context).
        console.log(`\n Grading criteria...`);
        const codeGraded = codeGradeCriteria(steps, pipeline);
        const llmGraded = await geminiJudgePipeline(pipeline, steps, codeGraded);
        const allCriteria = [...codeGraded, ...llmGraded];
        // Overall pass: weighted score across all criteria
        const totalWeight = allCriteria.reduce((sum, c) => sum + c.weight, 0);
        const passedWeight = allCriteria.filter((c) => c.pass).reduce((sum, c) => sum + c.weight, 0);
        const weightedPassRate = totalWeight > 0 ? passedWeight / totalWeight : 0;
        const overallPass = weightedPassRate >= 0.6; // 60% weighted threshold
        // Print criteria results
        for (const c of allCriteria) {
            const icon = c.pass ? "[+]" : "[-]";
            console.log(` ${icon} (w=${c.weight}) ${c.criterion}`);
            console.log(` ${c.evidence}`);
        }
        console.log(`\n RESULT: ${overallPass ? "PASS" : "FAIL"} (${(weightedPassRate * 100).toFixed(1)}% weighted, ${totalMs}ms)`);
        const runResult = {
            pipelineName: pipeline.name,
            runId,
            steps,
            criteria: allCriteria,
            overallPass,
            totalMs,
            timestamp: new Date().toISOString(),
        };
        // Persist to SQLite — best-effort; a storage failure must not abort the
        // remaining pipelines.
        try {
            persistRun(runResult);
        }
        catch (err) {
            console.warn(` [persist] Warning: ${err?.message}`);
        }
        results.push(runResult);
    }
    // Cleanup seed data
    console.log("\n[pipeline-eval] Cleaning up seed data...");
    cleanupPipelineData();
    return results;
}
933
+ // ══════════════════════════════════════════════════════════════════════════════
934
+ // SUMMARY PRINTER
935
+ // ══════════════════════════════════════════════════════════════════════════════
936
/**
 * Prints the end-of-run report to stdout: a per-pipeline results table, a
 * per-criterion pass-rate breakdown, and aggregate time / pass-rate totals.
 * Purely presentational — reads the result objects, produces no value.
 */
function printSummary(results) {
    const rule = "═".repeat(60);
    console.log("\n" + rule);
    console.log("PIPELINE EVAL SUMMARY");
    console.log(rule);
    const passCount = results.filter((r) => r.overallPass).length;
    const runCount = results.length;
    console.log(`\n Pipelines: ${passCount}/${runCount} passed`);
    console.log("");
    // Per-pipeline table
    console.log(" Pipeline Steps Criteria Time Result");
    console.log(" " + "-".repeat(70));
    results.forEach((r) => {
        const okSteps = r.steps.filter((s) => s.ok).length;
        const okCriteria = r.criteria.filter((c) => c.pass).length;
        const cells = [
            r.pipelineName.padEnd(28),
            `${okSteps}/${r.steps.length}`.padEnd(6),
            `${okCriteria}/${r.criteria.length}`.padEnd(9),
            `${r.totalMs}ms`.padEnd(7),
            r.overallPass ? "PASS" : "FAIL",
        ];
        console.log(` ${cells.join(" ")}`);
    });
    // Aggregate criteria across all runs
    console.log("\n Criteria Breakdown:");
    const tally = new Map();
    for (const r of results) {
        for (const c of r.criteria) {
            const bucket = tally.get(c.criterion) ?? { pass: 0, total: 0 };
            bucket.total += 1;
            if (c.pass) {
                bucket.pass += 1;
            }
            tally.set(c.criterion, bucket);
        }
    }
    for (const [criterion, stats] of tally) {
        const pct = ((stats.pass / stats.total) * 100).toFixed(0);
        const marker = stats.pass === stats.total ? "[+]" : "[-]";
        console.log(` ${marker} ${pct}% — ${criterion.slice(0, 65)}`);
    }
    // Total time
    const elapsed = results.reduce((sum, r) => sum + r.totalMs, 0);
    console.log(`\n Total time: ${elapsed}ms`);
    console.log(` Overall pass rate: ${((passCount / runCount) * 100).toFixed(1)}%`);
    console.log("");
}
980
+ // ══════════════════════════════════════════════════════════════════════════════
981
+ // CLI
982
+ // ══════════════════════════════════════════════════════════════════════════════
983
/**
 * Parses the CLI argument vector into an options object.
 * Recognized flags: --pipeline NAME (select a single pipeline) and --all.
 * Unknown `--`-prefixed flags print usage and terminate the process with
 * exit code 1; bare positional arguments are silently ignored.
 *
 * @param argv Argument list (process.argv.slice(2)).
 * @returns {{ pipeline?: string, all?: boolean }}
 */
function parseArgs(argv) {
    const parsed = {};
    let i = 0;
    while (i < argv.length) {
        const token = argv[i];
        if (token === "--pipeline") {
            i += 1;
            parsed.pipeline = argv[i];
        }
        else if (token === "--all") {
            parsed.all = true;
        }
        else if (token.startsWith("--")) {
            console.error(`Unknown flag: ${token}`);
            console.error(`Usage: npx tsx src/benchmarks/pipelineEval.ts [--pipeline NAME] [--all]`);
            console.error(`Pipelines: ${PIPELINES.map((p) => p.name).join(", ")}`);
            process.exit(1);
        }
        i += 1;
    }
    return parsed;
}
1005
/**
 * CLI entry point: backfills GEMINI_API_KEY from the nearest .env.local when
 * the caller's environment did not supply one, parses flags, wires the DB
 * accessor into the tool registry, runs the selected pipeline(s), prints the
 * summary, and exits 0 when at least half of the pipelines pass.
 *
 * Fixes vs. previous version: the .env.local loader no longer clobbers
 * environment variables the caller already set (it previously assigned every
 * KEY=VALUE line unconditionally), and values wrapped in matching single or
 * double quotes are unquoted, per standard dotenv semantics.
 */
async function main() {
    // Load .env.local for GEMINI_API_KEY — skipped entirely when the key is
    // already present in the environment.
    if (!process.env.GEMINI_API_KEY) {
        try {
            const fs = await import("fs");
            const path = await import("path");
            // Checked in order: package dir, two levels up, three levels up.
            const candidates = [
                path.resolve(process.cwd(), ".env.local"),
                path.resolve(process.cwd(), "../../.env.local"),
                path.resolve(process.cwd(), "../../../.env.local"),
            ];
            for (const envPath of candidates) {
                if (fs.existsSync(envPath)) {
                    const content = fs.readFileSync(envPath, "utf-8");
                    for (const line of content.split("\n")) {
                        // KEY=VALUE lines; a '#' before '=' marks a comment line.
                        const match = line.match(/^([^#=]+)=(.*)$/);
                        if (match) {
                            const key = match[1].trim();
                            let value = match[2].trim();
                            // dotenv convention: strip one pair of matching surrounding quotes.
                            if (value.length >= 2 &&
                                ((value.startsWith('"') && value.endsWith('"')) ||
                                    (value.startsWith("'") && value.endsWith("'")))) {
                                value = value.slice(1, -1);
                            }
                            // Never override variables the caller already set.
                            if (process.env[key] === undefined) {
                                process.env[key] = value;
                            }
                        }
                    }
                    if (process.env.GEMINI_API_KEY) {
                        console.log(`[env] Loaded GEMINI_API_KEY from ${envPath}`);
                        break;
                    }
                }
            }
        }
        catch {
            /* best-effort: an unreadable .env.local just means heuristic-only grading */
        }
    }
    const options = parseArgs(process.argv.slice(2));
    console.log("NodeBench Pipeline Eval Harness");
    console.log("═".repeat(40));
    console.log(` Gemini: ${process.env.GEMINI_API_KEY ? "available" : "unavailable (heuristic fallback)"}`);
    console.log(` Pipeline: ${options.pipeline ?? "all"}`);
    console.log("");
    // Wire up DB accessor for toolRegistry
    _setDbAccessor(getDb);
    const results = await runPipelineEval(options.pipeline);
    printSummary(results);
    // Exit with code based on pass rate (>= 50% of pipelines passing => 0)
    const passRate = results.filter((r) => r.overallPass).length / results.length;
    process.exit(passRate >= 0.5 ? 0 : 1);
}
1049
// Top-level launch: any unhandled rejection is fatal for the harness.
main().catch((error) => {
    console.error("[pipeline-eval] Fatal error:", error);
    process.exit(1);
});
1053
+ //# sourceMappingURL=pipelineEval.js.map