nodebench-mcp 2.67.0 → 2.68.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/dist/benchmarks/chainEval.d.ts +21 -0
- package/dist/benchmarks/chainEval.js +683 -0
- package/dist/benchmarks/chainEval.js.map +1 -0
- package/dist/benchmarks/llmJudgeEval.js +90 -7
- package/dist/benchmarks/llmJudgeEval.js.map +1 -1
- package/dist/benchmarks/pipelineEval.d.ts +63 -0
- package/dist/benchmarks/pipelineEval.js +1053 -0
- package/dist/benchmarks/pipelineEval.js.map +1 -0
- package/dist/benchmarks/searchQualityEval.js +4 -4
- package/dist/benchmarks/searchQualityEval.js.map +1 -1
- package/package.json +1 -1
|
@@ -0,0 +1,63 @@
|
|
|
1
|
+
#!/usr/bin/env npx tsx
|
|
2
|
+
/**
|
|
3
|
+
* pipelineEval.ts — Multi-step pipeline eval harness for NodeBench MCP
|
|
4
|
+
*
|
|
5
|
+
* Unlike llmJudgeEval.ts which tests tools independently, this harness tests
|
|
6
|
+
* realistic multi-step chains where Tool A's output feeds Tool B's input.
|
|
7
|
+
*
|
|
8
|
+
* Architecture:
|
|
9
|
+
* 1. Pipeline Definitions — 6 canonical pipelines modeling real agent workflows
|
|
10
|
+
* 2. Chaining Engine — executes steps sequentially, extracts fields from output
|
|
11
|
+
* 3. LLM Judge — Gemini Flash Lite evaluates 5 boolean criteria on full trace
|
|
12
|
+
* 4. Per-step tracking — tool name, args, output size, duration, pass/fail
|
|
13
|
+
* 5. SQLite persistence — pipeline_eval_runs + pipeline_eval_steps tables
|
|
14
|
+
*
|
|
15
|
+
* Usage:
|
|
16
|
+
* cd packages/mcp-local
|
|
17
|
+
* npx tsx src/benchmarks/pipelineEval.ts [--pipeline NAME] [--all]
|
|
18
|
+
*/
|
|
19
|
+
/**
 * Identifier of a pipeline known to this harness.
 * The union lists six names, matching the "6 canonical pipelines" mentioned
 * in the file header.
 */
export type PipelineName =
  | "founder_weekly_reset"
  | "company_intelligence"
  | "competitor_brief"
  | "pre_delegation"
  | "important_change_review"
  | "session_memory_cycle";
|
|
20
|
+
/**
 * One step of a pipeline: which tool to call, the fixed arguments for the
 * call, and (optionally) extractors that derive further arguments from the
 * previous step's output.
 */
export interface PipelineStep {
|
|
21
|
+
/** Tool name to call */
|
|
22
|
+
tool: string;
|
|
23
|
+
/** Static args merged with dynamic args from previous steps */
|
|
24
|
+
staticArgs: Record<string, unknown>;
|
|
25
|
+
/**
|
|
26
|
+
* Functions that extract values from the previous step's output to build
|
|
27
|
+
* dynamic args. Key = arg name, value = extractor function.
|
|
28
|
+
*/
|
|
29
|
+
// Optional: a step may omit this and declare only static args.
// NOTE(review): which side wins when a key appears in both staticArgs and
// dynamicArgs is not visible in this declaration file — confirm in the implementation.
dynamicArgs?: Record<string, (prevOutput: unknown) => unknown>;
|
|
30
|
+
/** Human-readable description of what this step does */
|
|
31
|
+
description: string;
|
|
32
|
+
}
|
|
33
|
+
/**
 * A named pipeline: an identifier, a description, and the list of steps it
 * is made of.
 */
export interface PipelineDefinition {
|
|
34
|
+
/** Which of the known pipelines this definition describes. */
name: PipelineName;
|
|
35
|
+
/** Free-text description of the pipeline. */
description: string;
|
|
36
|
+
/** Steps of the pipeline; the file header describes execution as sequential. */
steps: PipelineStep[];
|
|
37
|
+
}
|
|
38
|
+
/**
 * Record of a single executed step. Field names line up with the
 * "Per-step tracking — tool name, args, output size, duration, pass/fail"
 * item in the file header.
 */
export interface StepResult {
|
|
39
|
+
/** Position of the step within its pipeline. NOTE(review): 0- vs 1-based is not visible here — confirm. */
stepIndex: number;
|
|
40
|
+
/** Name of the tool this result belongs to. */
tool: string;
|
|
41
|
+
/** Description of the step (presumably copied from PipelineStep.description — verify). */
description: string;
|
|
42
|
+
/** Arguments recorded for the step. */
args: Record<string, unknown>;
|
|
43
|
+
/** Pass/fail flag for the step. */
ok: boolean;
|
|
44
|
+
/** Size of the step's output. NOTE(review): unit (bytes vs. characters) is not visible here — confirm. */
outputSize: number;
|
|
45
|
+
/** Preview of the step's output (presumably truncated — verify against the implementation). */
outputPreview: string;
|
|
46
|
+
/** Optional error text; presumably set only when the step failed — verify. */
error?: string;
|
|
47
|
+
/** Duration of the step; the name suggests milliseconds. */
ms: number;
|
|
48
|
+
}
|
|
49
|
+
/**
 * One judged criterion for a pipeline run. The file header describes an LLM
 * judge that evaluates boolean criteria on the full trace.
 */
export interface PipelineCriterion {
|
|
50
|
+
/** Text of the criterion being judged. */
criterion: string;
|
|
51
|
+
/** Numeric weight of the criterion. NOTE(review): scale and how it is used are not visible here — confirm. */
weight: number;
|
|
52
|
+
/** Boolean verdict for this criterion. */
pass: boolean;
|
|
53
|
+
/** Evidence string accompanying the verdict. */
evidence: string;
|
|
54
|
+
}
|
|
55
|
+
/**
 * Result of one pipeline run: the per-step records, the judged criteria,
 * and run-level summary fields.
 */
export interface PipelineRunResult {
|
|
56
|
+
/** Name of the pipeline that was run. */
pipelineName: PipelineName;
|
|
57
|
+
/** Identifier of this run. NOTE(review): format is not visible here — confirm. */
runId: string;
|
|
58
|
+
/** Per-step results for the run. */
steps: StepResult[];
|
|
59
|
+
/** Criteria verdicts for the run. */
criteria: PipelineCriterion[];
|
|
60
|
+
/** Overall verdict. NOTE(review): how it is derived from steps/criteria is not visible here — confirm. */
overallPass: boolean;
|
|
61
|
+
/** Total duration of the run; the name suggests milliseconds. */
totalMs: number;
|
|
62
|
+
/** Timestamp of the run as a string. NOTE(review): presumably ISO-8601 — verify. */
timestamp: string;
|
|
63
|
+
}
|