nodebench-mcp 2.67.0 → 2.69.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/dist/benchmarks/chainEval.d.ts +21 -0
- package/dist/benchmarks/chainEval.js +683 -0
- package/dist/benchmarks/chainEval.js.map +1 -0
- package/dist/benchmarks/llmJudgeEval.js +90 -7
- package/dist/benchmarks/llmJudgeEval.js.map +1 -1
- package/dist/benchmarks/pipelineEval.d.ts +63 -0
- package/dist/benchmarks/pipelineEval.js +1035 -0
- package/dist/benchmarks/pipelineEval.js.map +1 -0
- package/dist/benchmarks/searchQualityEval.js +4 -4
- package/dist/benchmarks/searchQualityEval.js.map +1 -1
- package/dist/tools/founderTools.js +2 -1
- package/dist/tools/founderTools.js.map +1 -1
- package/package.json +1 -1
|
@@ -0,0 +1,683 @@
|
|
|
1
|
+
/**
|
|
2
|
+
* chainEval.ts — Multi-step chain eval for NodeBench.
|
|
3
|
+
*
|
|
4
|
+
* Tests REAL agent workflows where Tool A's output feeds into Tool B's input.
|
|
5
|
+
* Single-tool eval proves "can each tool respond."
|
|
6
|
+
* Chain eval proves "can the pipeline produce a real result."
|
|
7
|
+
*
|
|
8
|
+
* 8 canonical chains matching the dogfood runbook:
|
|
9
|
+
* 1. Founder Weekly Reset (5 steps)
|
|
10
|
+
* 2. Pre-Delegation Packet (4 steps)
|
|
11
|
+
* 3. Important-Change Review (4 steps)
|
|
12
|
+
* 4. Competitor Intelligence (5 steps)
|
|
13
|
+
* 5. Banker Company Search (4 steps)
|
|
14
|
+
* 6. Student Strategy Brief (3 steps)
|
|
15
|
+
* 7. Setup + Discovery (3 steps)
|
|
16
|
+
* 8. Operator Causal Replay (4 steps)
|
|
17
|
+
*
|
|
18
|
+
* Usage:
|
|
19
|
+
* GEMINI_API_KEY=... npx tsx src/benchmarks/chainEval.ts [--chains=N]
|
|
20
|
+
*/
|
|
21
|
+
import { loadAllToolsets } from "../toolsetRegistry.js";
|
|
22
|
+
import { getDb } from "../db.js";
|
|
23
|
+
// Meta tools are registered separately from domain toolsets
|
|
24
|
+
/**
 * Load the domain toolsets plus the optional meta and progressive-discovery tools.
 *
 * Both optional modules are imported dynamically and best-effort: if a module
 * cannot be imported, or its factory throws, that group is skipped and the
 * remaining tools are still returned.
 *
 * @returns {Promise<Array>} Domain tools followed by whichever optional tools loaded.
 */
async function loadAllToolsIncludingMeta() {
    const baseTools = await loadAllToolsets();
    const optionalTools = [];
    // Meta tools (check_mcp_setup, list_available_toolsets, etc.)
    try {
        const { createMetaTools } = await import("../tools/metaTools.js");
        const metaTools = createMetaTools(baseTools);
        optionalTools.push(...metaTools);
    }
    catch {
        /* metaTools not available */
    }
    // Progressive discovery tools (discover_tools, get_tool_quick_ref, etc.).
    // The factory receives everything loaded so far, including the meta tools.
    try {
        const { createProgressiveDiscoveryTools } = await import("../tools/progressiveDiscoveryTools.js");
        const knownTools = [...baseTools, ...optionalTools];
        const discoveryTools = createProgressiveDiscoveryTools(knownTools);
        optionalTools.push(...discoveryTools);
    }
    catch {
        /* progressiveDiscoveryTools not available */
    }
    return [...baseTools, ...optionalTools];
}
|
|
41
|
+
/* ─── Helpers ────────────────────────────────────────────────────────────── */
|
|
42
|
+
/**
 * Report whether `obj` carries a usable value under at least one of `keys`.
 *
 * A value counts as usable when it is neither null nor undefined; strings and
 * arrays must additionally be non-empty. Any other value (numbers, booleans,
 * plain objects, ...) counts as usable as soon as it is present.
 *
 * @param {*} obj - Candidate object; anything that is not a non-null object yields false.
 * @param {...string} keys - Property names to probe, in order.
 * @returns {boolean} True if any key holds a usable value.
 */
function hasField(obj, ...keys) {
    const isContainer = Boolean(obj) && typeof obj === "object";
    if (!isContainer) {
        return false;
    }
    for (const key of keys) {
        if (!(key in obj)) {
            continue;
        }
        const value = obj[key];
        if (value === null || value === undefined) {
            continue;
        }
        // Only strings and arrays have an "empty" state that disqualifies them.
        const hasLength = typeof value === "string" || Array.isArray(value);
        if (!hasLength || value.length > 0) {
            return true;
        }
    }
    return false;
}
|
|
56
|
+
/**
 * Report whether `obj[key]` holds a substantive value.
 *
 * Rules by value type: strings must be longer than 5 characters, arrays must
 * have at least one element, other objects must have at least one own
 * enumerable key, and every remaining non-nullish value counts as non-empty.
 *
 * @param {*} obj - Candidate object; anything that is not a non-null object yields false.
 * @param {string} key - Property name to inspect.
 * @returns {boolean} True when the property is present and substantive.
 */
function hasNonEmpty(obj, key) {
    if (obj === null || typeof obj !== "object") {
        return false;
    }
    const candidate = obj[key];
    if (candidate == null) {
        return false;
    }
    if (Array.isArray(candidate)) {
        return candidate.length !== 0;
    }
    switch (typeof candidate) {
        case "string":
            // Very short strings are treated as placeholders rather than content.
            return candidate.length > 5;
        case "object":
            return Object.keys(candidate).length > 0;
        default:
            return true;
    }
}
|
|
70
|
+
/**
 * Case-insensitively check whether the JSON serialisation of `obj` contains `substring`.
 *
 * @param {*} obj - Any value; it is serialised with JSON.stringify before searching.
 * @param {string} substring - Text to look for.
 * @returns {boolean} True when the serialised value contains the text.
 */
function outputContains(obj, substring) {
    // JSON.stringify yields undefined (not a string) for undefined, functions and
    // symbols; treat those as empty output instead of throwing a TypeError.
    const serialized = JSON.stringify(obj) ?? "";
    return serialized.toLowerCase().includes(substring.toLowerCase());
}
|
|
74
|
+
/* ─── Chain Definitions ──────────────────────────────────────────────────── */
|
|
75
|
+
/** True when a tool result is a non-null object. */
function isObjectResult(r) {
    return r !== null && r !== undefined && typeof r === "object";
}
/** Build a step verdict whose reason always matches the pass/fail outcome. */
function verdict(pass, okReason, failReason) {
    return { pass, reason: pass ? okReason : failReason };
}
/** get_context_bundle step that requires a non-empty `pinned` field. */
function contextBundleStep(query) {
    return {
        tool: "get_context_bundle",
        buildArgs: () => ({ query }),
        validate: (r) => verdict(hasField(r, "pinned"), "Context loaded", "Missing pinned context"),
    };
}
/**
 * founder_local_synthesize step.
 * Passes when any of `accept` is present; the reason names which output shape was seen
 * (`summary` first, then `altKey`, then any other accepted field).
 */
function synthesisStep({ packetType, daysBack, query, accept, altKey, summaryReason, altReason, emptyReason = "No output" }) {
    return {
        tool: "founder_local_synthesize",
        buildArgs: () => ({ packetType, daysBack, query }),
        validate: (r) => {
            if (hasField(r, "summary")) {
                return { pass: true, reason: summaryReason };
            }
            if (hasField(r, altKey)) {
                return { pass: true, reason: altReason };
            }
            return verdict(hasField(r, ...accept), `Output produced without summary or ${altKey}`, emptyReason);
        },
    };
}
/** Step that only requires the tool to return an object. */
function objectStep(tool, buildArgs, okReason) {
    return {
        tool,
        buildArgs,
        validate: (r) => verdict(isObjectResult(r), okReason, `${tool} returned no object`),
    };
}
/** Step whose validation always passes (used for best-effort tracking calls). */
function lenientStep(tool, buildArgs, reason) {
    return {
        tool,
        buildArgs,
        validate: () => ({ pass: true, reason }),
    };
}
/**
 * Build a finalValidation function from `[isSatisfied, failureReason]` pairs.
 * Every unsatisfied check contributes its failure reason; when all are satisfied
 * the single `passReason` is reported instead.
 */
function finalGate(checks, passReason) {
    return (all) => {
        const reasons = checks.filter(([isSatisfied]) => !isSatisfied(all)).map(([, failure]) => failure);
        const pass = reasons.length === 0;
        if (pass) {
            reasons.push(passReason);
        }
        return { pass, reasons };
    };
}
/**
 * Canonical multi-step chains. Each chain lists its steps in execution order:
 * `buildArgs(prior)` receives earlier step outputs keyed by step index,
 * `validate(result)` judges a single step, and `finalValidation(all)` judges the
 * chain as a whole from all recorded outputs.
 */
const CHAINS = [
    // Chain 1: Founder Weekly Reset (the #1 habit)
    {
        id: "founder_weekly_reset",
        name: "Founder Weekly Reset",
        scenario: "weekly_reset",
        lens: "founder",
        steps: [
            {
                tool: "get_context_bundle",
                buildArgs: () => ({ query: "founder weekly reset" }),
                validate: (r) => verdict(hasField(r, "systemPromptPrefix", "pinned"), "Context bundle loaded", "Missing systemPromptPrefix and pinned"),
            },
            {
                tool: "founder_local_gather",
                buildArgs: () => ({ daysBack: 7 }),
                validate: (r) => {
                    if (hasField(r, "identity")) {
                        return { pass: true, reason: "Context gathered with identity" };
                    }
                    return verdict(hasField(r, "gitActivity", "sessionMemory"), "Context gathered without identity", "Missing identity in gather");
                },
            },
            // LLM synthesis returns: summary, keyFindings, risks, nextSteps, entities
            // Local pipeline returns: memo, canonicalEntity, whatChanged, contradictions, nextActions
            synthesisStep({
                packetType: "weekly_reset",
                daysBack: 7,
                query: "Generate my founder weekly reset — what company we're building, what changed, main contradiction, next 3 moves",
                accept: ["summary", "memo", "keyFindings", "whatChanged"],
                altKey: "memo",
                summaryReason: "LLM synthesis produced",
                altReason: "Local packet produced",
                emptyReason: "No synthesis or memo",
            }),
            {
                tool: "track_action",
                buildArgs: (prior) => {
                    const entity = prior[2]?.summary?.slice(0, 40) ?? prior[2]?.canonicalEntity?.canonicalMission?.slice(0, 40) ?? "weekly reset";
                    return { action: `Weekly reset: ${entity}`, category: "founder", impact: "significant" };
                },
                validate: (r) => verdict(hasField(r, "actionId", "tracked", "action"), "Action tracked", "Action not tracked"),
            },
            {
                tool: "track_milestone",
                buildArgs: (prior) => ({
                    title: "Weekly founder reset generated",
                    description: `Findings: ${prior[2]?.keyFindings?.length ?? prior[2]?.contradictions?.length ?? 0}`,
                    category: "founder_habit",
                }),
                validate: (r) => verdict(hasField(r, "milestoneId", "tracked", "title"), "Milestone tracked", "Milestone not tracked"),
            },
        ],
        finalValidation: (all) => {
            const reasons = [];
            let pass = true;
            if (!all[0]?.systemPromptPrefix && !all[0]?.pinned) {
                reasons.push("FAIL: No context bundle");
                pass = false;
            }
            if (!all[1]?.identity) {
                reasons.push("FAIL: No identity in gather");
                pass = false;
            }
            // Accept either LLM synthesis or local memo
            if (!all[2]?.summary && !all[2]?.memo) {
                reasons.push("FAIL: No synthesis or memo");
                pass = false;
            }
            else {
                // Content warnings are advisory: they are reported but do not fail the chain.
                const output = JSON.stringify(all[2]).toLowerCase();
                if (!output.includes("contradiction") && !output.includes("risk")) {
                    reasons.push("WARN: Missing contradictions/risks");
                }
                if (!output.includes("next") && !output.includes("action") && !output.includes("step")) {
                    reasons.push("WARN: Missing next actions");
                }
            }
            if (reasons.length === 0) {
                reasons.push("PASS: Complete weekly reset chain");
            }
            return { pass, reasons };
        },
    },
    // Chain 2: Pre-Delegation Packet
    {
        id: "pre_delegation",
        name: "Pre-Delegation Packet for Claude Code",
        scenario: "delegation",
        lens: "founder",
        steps: [
            contextBundleStep("pre-delegation packet for Claude Code"),
            {
                tool: "founder_local_gather",
                buildArgs: () => ({ daysBack: 7 }),
                validate: (r) => verdict(hasField(r, "identity"), "Gathered", "No identity"),
            },
            synthesisStep({
                packetType: "pre_delegation",
                daysBack: 7,
                query: "Create a pre-delegation packet for Claude Code focusing on no-bandage fixes, weekly founder reset, and packet lineage",
                accept: ["summary", "memo", "keyFindings", "nextSteps"],
                altKey: "memo",
                summaryReason: "Delegation synthesis produced",
                altReason: "Delegation memo produced",
            }),
            lenientStep("track_action", () => ({ action: "Pre-delegation packet created for Claude Code", category: "founder", impact: "significant" }), "Action tracked"),
        ],
        finalValidation: finalGate([
            [(all) => Boolean(all[2]?.summary || all[2]?.memo), "FAIL: No delegation output"],
        ], "PASS: Complete delegation chain"),
    },
    // Chain 3: Important-Change Review
    {
        id: "important_change",
        name: "Important-Change Review",
        scenario: "important_change",
        lens: "operator",
        steps: [
            contextBundleStep("important changes since last session"),
            {
                tool: "founder_local_gather",
                buildArgs: () => ({ daysBack: 14 }),
                validate: (r) => {
                    if (hasField(r, "identity")) {
                        return { pass: true, reason: "Context gathered" };
                    }
                    return verdict(hasField(r, "gitActivity", "sessionMemory"), "Partial gather", "Nothing gathered");
                },
            },
            synthesisStep({
                packetType: "important_change",
                daysBack: 14,
                query: "Show me only the important changes since my last meaningful session — strategy, positioning, architecture, competitor changes",
                accept: ["summary", "memo", "keyFindings", "whatChanged"],
                altKey: "whatChanged",
                summaryReason: "Changes synthesized",
                altReason: "Changes detected",
            }),
            lenientStep("track_action", (prior) => ({
                action: `Important-change review: ${prior[2]?.keyFindings?.length ?? prior[2]?.whatChanged?.length ?? 0} findings`,
                category: "operator",
                impact: "moderate",
            }), "Tracked"),
        ],
        finalValidation: finalGate([
            [(all) => Boolean(all[2]?.summary || all[2]?.whatChanged), "FAIL: No changes detected"],
            [(all) => Boolean(all[2]?.summary || all[2]?.memo), "FAIL: No output"],
        ], "PASS: Complete important-change chain"),
    },
    // Chain 4: Competitor Intelligence Brief
    {
        id: "competitor_brief",
        name: "Competitor Intelligence: Supermemory",
        scenario: "competitor_brief",
        lens: "researcher",
        steps: [
            contextBundleStep("Supermemory competitor analysis"),
            objectStep("discover_tools", () => ({ query: "competitor intelligence analysis brief", limit: 5 }), "Discovery executed"),
            synthesisStep({
                packetType: "competitor_brief",
                daysBack: 7,
                query: "Analyze Supermemory as a competitor — what category they own, distribution advantages, what to absorb vs avoid",
                accept: ["summary", "memo", "keyFindings"],
                altKey: "memo",
                summaryReason: "Competitor synthesis produced",
                altReason: "Competitor memo produced",
            }),
            lenientStep("track_action", () => ({ action: "Competitor brief: Supermemory", category: "research", impact: "significant" }), "Tracked"),
            lenientStep("track_milestone", () => ({
                title: "Competitor brief generated: Supermemory",
                description: "Analyzed competitive position, distribution advantages, absorb vs avoid",
                category: "research",
            }), "Milestone tracked"),
        ],
        finalValidation: finalGate([
            [(all) => Boolean(all[2]?.summary || all[2]?.memo), "FAIL: No competitor output"],
        ], "PASS: Complete competitor chain"),
    },
    // Chain 5: Banker Company Search (Anthropic)
    {
        id: "banker_company_search",
        name: "Banker Company Search: Anthropic",
        scenario: "company_search",
        lens: "banker",
        steps: [
            contextBundleStep("Analyze Anthropic for a banker lens"),
            objectStep("run_recon", () => ({ target: "Anthropic", focus: "company profile, valuation, revenue, risks" }), "Recon executed"),
            synthesisStep({
                packetType: "competitor_brief",
                daysBack: 7,
                query: "Analyze Anthropic — company snapshot, strategic position, business quality, risks, 3 next diligence questions",
                accept: ["summary", "memo", "keyFindings"],
                altKey: "memo",
                summaryReason: "Banker synthesis produced",
                altReason: "Banker memo produced",
            }),
            lenientStep("track_action", () => ({ action: "Banker company search: Anthropic", category: "research", impact: "significant" }), "Tracked"),
        ],
        finalValidation: finalGate([
            [(all) => Boolean(all[2]?.summary || all[2]?.memo), "FAIL: No banker output"],
        ], "PASS: Complete banker chain"),
    },
    // Chain 6: Student Strategy Brief (Shopify)
    {
        id: "student_strategy",
        name: "Student Strategy Brief: Shopify",
        scenario: "company_search",
        lens: "student",
        steps: [
            contextBundleStep("Shopify AI commerce strategy"),
            synthesisStep({
                packetType: "competitor_brief",
                daysBack: 7,
                query: "Help me understand Shopify's AI commerce strategy — plain-language summary, strategic upside, risks, 3 comparables, study brief",
                accept: ["summary", "memo", "keyFindings"],
                altKey: "memo",
                summaryReason: "Study synthesis produced",
                altReason: "Study brief produced",
            }),
            lenientStep("track_action", () => ({ action: "Student strategy brief: Shopify AI commerce", category: "research", impact: "moderate" }), "Tracked"),
        ],
        // The synthesis is step index 1 here because this chain has no gather/recon step.
        finalValidation: finalGate([
            [(all) => Boolean(all[1]?.summary || all[1]?.memo), "FAIL: No study output"],
        ], "PASS: Complete student chain"),
    },
    // Chain 7: Setup + Discovery
    {
        id: "setup_discovery",
        name: "Setup + Discovery Sanity",
        scenario: "weekly_reset",
        lens: "founder",
        steps: [
            {
                tool: "check_mcp_setup",
                buildArgs: () => ({}),
                validate: (r) => {
                    if (!isObjectResult(r)) {
                        return { pass: false, reason: "check_mcp_setup returned no object" };
                    }
                    return { pass: true, reason: outputContains(r, "healthy") ? "Setup healthy" : "Setup responded" };
                },
            },
            objectStep("discover_tools", () => ({ query: "founder weekly reset company analysis", limit: 10 }), "Discovery executed"),
            objectStep("get_tool_quick_ref", () => ({ toolName: "founder_local_synthesize" }), "Quick ref loaded"),
        ],
        finalValidation: finalGate([
            [(all) => all[0] !== null && all[0] !== undefined, "FAIL: Setup failed"],
            [(all) => all[1] !== null && all[1] !== undefined, "FAIL: Discovery failed"],
            [(all) => all[2] !== null && all[2] !== undefined, "FAIL: Quick ref failed"],
        ], "PASS: Setup + discovery chain"),
    },
    // Chain 8: Operator Causal Replay
    {
        id: "operator_causal",
        name: "Operator Causal Memory Replay",
        scenario: "important_change",
        lens: "operator",
        steps: [
            contextBundleStep("causal chain product evolution"),
            objectStep("get_session_journal", () => ({ daysBack: 7 }), "Journal loaded"),
            synthesisStep({
                packetType: "important_change",
                daysBack: 7,
                query: "Reconstruct the causal chain for this week's product evolution — before/after state, important changes, trajectory",
                accept: ["summary", "memo", "keyFindings", "whatChanged"],
                altKey: "whatChanged",
                summaryReason: "Causal synthesis produced",
                altReason: "Changes detected",
            }),
            lenientStep("track_action", (prior) => ({
                action: `Causal replay: ${prior[2]?.keyFindings?.length ?? prior[2]?.whatChanged?.length ?? 0} findings`,
                category: "operator",
                impact: "moderate",
            }), "Tracked"),
        ],
        finalValidation: finalGate([
            [(all) => Boolean(all[2]?.summary || all[2]?.whatChanged), "FAIL: No causal replay output"],
        ], "PASS: Complete causal replay chain"),
    },
];
|
|
525
|
+
/* ─── Chain Runner ───────────────────────────────────────────────────────── */
|
|
526
|
+
/**
 * Execute one chain's steps in order and summarise the outcome.
 *
 * Each step's arguments are built from the outputs of earlier steps (keyed by
 * step index). A missing tool, a thrown error, or a failed validation on any
 * step other than the last marks the chain as broken, and the remaining steps
 * are recorded as skipped. Steps whose tool name starts with "track_" may fail
 * or throw without breaking the chain.
 *
 * @param {object} chain - Chain definition: id, name, scenario, lens, steps[], finalValidation(all).
 * @param {Array<{name: string, handler: Function}>} tools - Available tools, looked up by name.
 * @returns {Promise<object>} Per-step records plus chain-level pass/broken flags and latency totals.
 */
async function runChain(chain, tools) {
    const stepResults = [];
    // Step outputs keyed by step index; passed to every buildArgs and to finalValidation.
    const allResults = {};
    let chainBroken = false;
    const chainStart = Date.now();
    for (let i = 0; i < chain.steps.length; i++) {
        const step = chain.steps[i];
        const stepStart = Date.now();
        if (chainBroken) {
            stepResults.push({
                stepIndex: i,
                tool: step.tool,
                args: {},
                output: null,
                validation: { pass: false, reason: "SKIPPED: prior step failed" },
                latencyMs: 0,
            });
            continue;
        }
        const tool = tools.find(t => t.name === step.tool);
        if (!tool) {
            stepResults.push({
                stepIndex: i,
                tool: step.tool,
                args: {},
                output: null,
                validation: { pass: false, reason: `Tool not found: ${step.tool}` },
                latencyMs: 0,
                error: `Tool not found: ${step.tool}`,
            });
            chainBroken = true;
            continue;
        }
        const isTrackingStep = step.tool.startsWith("track_");
        // Declared outside the try so the failure record can report the arguments
        // without calling buildArgs again (it may be the call that threw).
        let args = {};
        try {
            args = step.buildArgs(allResults);
            const result = await tool.handler(args);
            const latencyMs = Date.now() - stepStart;
            const validation = step.validate(result);
            allResults[i] = result;
            stepResults.push({ stepIndex: i, tool: step.tool, args, output: result, validation, latencyMs });
            // A failed validation breaks the chain unless this is the final step
            // or a tracking step.
            const isLastStep = i === chain.steps.length - 1;
            if (!validation.pass && !isLastStep && !isTrackingStep) {
                chainBroken = true;
            }
        }
        catch (err) {
            const latencyMs = Date.now() - stepStart;
            // Thrown values are not guaranteed to be Error instances.
            const message = typeof err?.message === "string" ? err.message : String(err);
            stepResults.push({
                stepIndex: i,
                tool: step.tool,
                args,
                output: null,
                validation: { pass: false, reason: `Error: ${message.slice(0, 100)}` },
                latencyMs,
                error: message,
            });
            if (!isTrackingStep) {
                chainBroken = true;
            }
        }
    }
    const finalValidation = chain.finalValidation(allResults);
    const totalLatencyMs = Date.now() - chainStart;
    const stepsCompleted = stepResults.filter(s => s.validation.pass).length;
    return {
        chainId: chain.id,
        chainName: chain.name,
        scenario: chain.scenario,
        lens: chain.lens,
        steps: stepResults,
        finalValidation,
        totalLatencyMs,
        stepsCompleted,
        stepsTotal: chain.steps.length,
        chainBroken,
        overallPass: finalValidation.pass && !chainBroken,
    };
}
|
|
607
|
+
/* ─── SQLite Persistence ─────────────────────────────────────────────────── */
|
|
608
|
+
/**
 * Record one chain run in the `chain_eval_runs` table, creating the table on first use.
 *
 * Persistence is best-effort: any failure while obtaining the database,
 * creating the table, or inserting the row is ignored so the eval run continues.
 *
 * @param {object} result - Chain result as returned by runChain.
 */
function persistChainResult(result) {
    try {
        const database = getDb();
        database.exec(`CREATE TABLE IF NOT EXISTS chain_eval_runs (
id INTEGER PRIMARY KEY AUTOINCREMENT,
chainId TEXT NOT NULL,
chainName TEXT NOT NULL,
scenario TEXT NOT NULL,
lens TEXT NOT NULL,
stepsCompleted INTEGER NOT NULL,
stepsTotal INTEGER NOT NULL,
chainBroken INTEGER NOT NULL,
overallPass INTEGER NOT NULL,
totalLatencyMs INTEGER NOT NULL,
finalReasons TEXT NOT NULL,
stepsJson TEXT NOT NULL,
timestamp TEXT NOT NULL DEFAULT (datetime('now'))
)`);
        const insertRun = database.prepare(`INSERT INTO chain_eval_runs (chainId, chainName, scenario, lens, stepsCompleted, stepsTotal, chainBroken, overallPass, totalLatencyMs, finalReasons, stepsJson) VALUES (?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?)`);
        // Only a compact per-step summary is stored, not the raw arguments or tool outputs.
        const stepSummaries = result.steps.map(({ tool, validation, latencyMs, error }) => ({
            tool,
            pass: validation.pass,
            reason: validation.reason,
            latencyMs,
            error,
        }));
        // Booleans are written as 0/1 to match the INTEGER columns.
        const brokenFlag = result.chainBroken ? 1 : 0;
        const passFlag = result.overallPass ? 1 : 0;
        insertRun.run(
            result.chainId,
            result.chainName,
            result.scenario,
            result.lens,
            result.stepsCompleted,
            result.stepsTotal,
            brokenFlag,
            passFlag,
            result.totalLatencyMs,
            JSON.stringify(result.finalValidation.reasons),
            JSON.stringify(stepSummaries),
        );
    }
    catch {
        /* SQLite not critical */
    }
}
|
|
632
|
+
/* ─── Main ───────────────────────────────────────────────────────────────── */
|
|
633
|
+
/**
 * Work out how many chains to run from the CLI arguments.
 *
 * Accepts `--chains=N` and `--chains N`. Without the flag every available
 * chain runs; a value above `available` is clamped to it.
 *
 * @param {string[]} argv - CLI arguments after the script name.
 * @param {number} available - Number of chains defined.
 * @returns {number} Number of chains to run.
 * @throws {Error} When the flag value is not a positive integer.
 */
function resolveChainLimit(argv, available) {
    const flagIndex = argv.findIndex(a => a === "--chains" || a.startsWith("--chains="));
    if (flagIndex === -1) {
        return available;
    }
    const flag = argv[flagIndex];
    const raw = flag === "--chains" ? argv[flagIndex + 1] : flag.slice("--chains=".length);
    const requested = Number.parseInt(raw, 10);
    // Reject NaN, zero and negatives: running no chains would otherwise be
    // reported as a fully passing evaluation.
    if (!Number.isInteger(requested) || requested < 1) {
        throw new Error(`Invalid --chains value "${raw}": expected a positive integer`);
    }
    return Math.min(requested, available);
}
/** Whole-number percentage of `part` in `whole`; 0 when `whole` is 0. */
function toPercent(part, whole) {
    return whole > 0 ? Math.round(part / whole * 100) : 0;
}
/**
 * CLI entry point: run the selected chains sequentially, persist and print
 * each result, print a summary, then exit with 0 only if at least one chain
 * ran and every chain passed.
 */
async function main() {
    const maxChains = resolveChainLimit(process.argv.slice(2), CHAINS.length);
    console.log(`\n NodeBench Chain Eval — ${maxChains} multi-step workflows\n`);
    // Load all tools including meta/discovery tools
    const tools = await loadAllToolsIncludingMeta();
    console.log(` Tools loaded: ${tools.length}`);
    const results = [];
    for (let i = 0; i < maxChains; i++) {
        const chain = CHAINS[i];
        process.stdout.write(` [${i + 1}/${maxChains}] ${chain.name} (${chain.steps.length} steps)... `);
        const result = await runChain(chain, tools);
        results.push(result);
        persistChainResult(result);
        const status = result.overallPass ? "✓ PASS" : result.chainBroken ? "✗ BROKEN" : "✗ FAIL";
        console.log(`${status} (${result.stepsCompleted}/${result.stepsTotal} steps, ${result.totalLatencyMs}ms)`);
        // Print step details for failures
        if (!result.overallPass) {
            for (const step of result.steps) {
                const icon = step.validation.pass ? " ✓" : " ✗";
                console.log(` ${icon} ${step.tool}: ${step.validation.reason}${step.error ? ` [${step.error.slice(0, 60)}]` : ""} (${step.latencyMs}ms)`);
            }
            for (const reason of result.finalValidation.reasons) {
                console.log(` → ${reason}`);
            }
        }
    }
    // Summary
    const passed = results.filter(r => r.overallPass).length;
    const broken = results.filter(r => r.chainBroken).length;
    const totalSteps = results.reduce((s, r) => s + r.stepsTotal, 0);
    const completedSteps = results.reduce((s, r) => s + r.stepsCompleted, 0);
    const totalLatency = results.reduce((s, r) => s + r.totalLatencyMs, 0);
    const avgLatency = results.length > 0 ? Math.round(totalLatency / results.length) : 0;
    console.log(`\n ═══════════════════════════════════════════`);
    console.log(` Chain Pass Rate: ${passed}/${results.length} (${toPercent(passed, results.length)}%)`);
    console.log(` Step Completion: ${completedSteps}/${totalSteps} (${toPercent(completedSteps, totalSteps)}%)`);
    console.log(` Chains Broken: ${broken}`);
    console.log(` Avg Latency: ${avgLatency}ms`);
    console.log(` ═══════════════════════════════════════════\n`);
    // Per-chain summary table
    console.log(` BY CHAIN:`);
    for (const r of results) {
        const icon = r.overallPass ? "✓" : "✗";
        console.log(` ${icon} ${r.chainName.padEnd(40)} ${r.stepsCompleted}/${r.stepsTotal} steps ${r.totalLatencyMs}ms ${r.lens}`);
    }
    console.log();
    // An empty run is never a success.
    process.exit(results.length > 0 && passed === results.length ? 0 : 1);
}
|
|
682
|
+
// Run the CLI; any error escaping main() is logged and turned into a non-zero exit code.
main().catch((failure) => {
    console.error("Chain eval error:", failure);
    process.exit(1);
});
|
|
683
|
+
//# sourceMappingURL=chainEval.js.map
|