nodebench-mcp 2.67.0 → 2.68.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -0,0 +1,63 @@
1
+ #!/usr/bin/env npx tsx
2
+ /**
3
+ * pipelineEval.ts — Multi-step pipeline eval harness for NodeBench MCP
4
+ *
5
+ * Unlike llmJudgeEval.ts, which tests tools independently, this harness tests
6
+ * realistic multi-step chains where Tool A's output feeds Tool B's input.
7
+ *
8
+ * Architecture:
9
+ * 1. Pipeline Definitions — 6 canonical pipelines modeling real agent workflows
10
+ * 2. Chaining Engine — executes steps sequentially, extracts fields from output
11
+ * 3. LLM Judge — Gemini Flash Lite evaluates 5 boolean criteria on full trace
12
+ * 4. Per-step tracking — tool name, args, output size, duration, pass/fail
13
+ * 5. SQLite persistence — pipeline_eval_runs + pipeline_eval_steps tables
14
+ *
15
+ * Usage:
16
+ * cd packages/mcp-local
17
+ * npx tsx src/benchmarks/pipelineEval.ts [--pipeline NAME] [--all]
18
+ */
19
+ export type PipelineName = "founder_weekly_reset" | "company_intelligence" | "competitor_brief" | "pre_delegation" | "important_change_review" | "session_memory_cycle"; // Closed set of six pipeline identifiers; types PipelineDefinition.name and PipelineRunResult.pipelineName.
20
+ export interface PipelineStep { // One tool call within a pipeline; PipelineDefinition.steps is an array of these.
21
+ /** Tool name to call */
22
+ tool: string;
23
+ /** Static args merged with dynamic args from previous steps */
24
+ staticArgs: Record<string, unknown>; // NOTE(review): which side wins when a key appears in both staticArgs and dynamicArgs is not visible here — confirm in the chaining engine.
25
+ /**
26
+ * Functions that extract values from the previous step's output to build
27
+ * dynamic args. Key = arg name, value = extractor function.
28
+ */
29
+ dynamicArgs?: Record<string, (prevOutput: unknown) => unknown>; // Optional. prevOutput is typed unknown, so each extractor has to narrow it before reading fields.
30
+ /** Human-readable description of what this step does */
31
+ description: string;
32
+ }
33
+ export interface PipelineDefinition { // A named pipeline: its identifier, a description, and its list of steps.
34
+ name: PipelineName; // Must be one of the PipelineName literals.
35
+ description: string; // Free-form text; presumably human-readable like PipelineStep.description — confirm.
36
+ steps: PipelineStep[]; // NOTE(review): the file header says steps run sequentially, so array order is presumably execution order — confirm.
37
+ }
38
+ export interface StepResult { // Record of one executed step; PipelineRunResult.steps is an array of these.
39
+ stepIndex: number; // Presumably the step's position in PipelineDefinition.steps; whether it is 0- or 1-based is not shown here — confirm.
40
+ tool: string; // Presumably copied from PipelineStep.tool — confirm.
41
+ description: string; // Presumably copied from PipelineStep.description — confirm.
42
+ args: Record<string, unknown>; // Presumably the final args sent to the tool (static + resolved dynamic) — confirm.
43
+ ok: boolean; // Per-step pass/fail flag (the file header lists "pass/fail" under per-step tracking).
44
+ outputSize: number; // NOTE(review): unit (bytes vs. characters) is not shown here — confirm.
45
+ outputPreview: string; // Presumably a shortened form of the tool output; length limit not shown here — confirm.
46
+ error?: string; // Optional; presumably populated only when the step failed — confirm.
47
+ ms: number; // Step duration; name suggests milliseconds — confirm.
48
+ }
49
+ export interface PipelineCriterion { // One judged criterion for a run; PipelineRunResult.criteria is an array of these.
50
+ criterion: string; // Text identifying the criterion; exact wording/source not shown here.
51
+ weight: number; // NOTE(review): scale and how weights combine into the overall result are not shown here — confirm.
52
+ pass: boolean; // Whether this criterion passed (the file header describes the criteria as boolean).
53
+ evidence: string; // Presumably the judge's justification for the verdict — confirm.
54
+ }
55
+ export interface PipelineRunResult { // Result of one pipeline run: per-step records, judged criteria, and an overall verdict.
56
+ pipelineName: PipelineName; // Which pipeline produced this result; one of the PipelineName literals.
57
+ runId: string; // Identifier for this run; generation scheme not shown here.
58
+ steps: StepResult[]; // One StepResult per recorded step; presumably in execution order — confirm.
59
+ criteria: PipelineCriterion[]; // Judged criteria for the run (the file header mentions 5; not enforced by this type).
60
+ overallPass: boolean; // NOTE(review): how this is derived from criteria/weights or step results is not shown here — confirm.
61
+ totalMs: number; // Total run duration; name suggests milliseconds — confirm.
62
+ timestamp: string; // NOTE(review): format is not shown here (presumably ISO 8601) — confirm.
63
+ }