mcp-eval-runner 1.0.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (48) hide show
  1. package/.env.example +39 -0
  2. package/CHANGELOG.md +67 -0
  3. package/LICENSE +21 -0
  4. package/README.md +328 -0
  5. package/dist/assertions.d.ts +63 -0
  6. package/dist/assertions.js +187 -0
  7. package/dist/audit-log.d.ts +26 -0
  8. package/dist/audit-log.js +57 -0
  9. package/dist/auth.d.ts +15 -0
  10. package/dist/auth.js +83 -0
  11. package/dist/db.d.ts +40 -0
  12. package/dist/db.js +94 -0
  13. package/dist/deployment-gate.d.ts +27 -0
  14. package/dist/deployment-gate.js +43 -0
  15. package/dist/fixture-library.d.ts +26 -0
  16. package/dist/fixture-library.js +85 -0
  17. package/dist/fixture.d.ts +87 -0
  18. package/dist/fixture.js +170 -0
  19. package/dist/http-server.d.ts +7 -0
  20. package/dist/http-server.js +34 -0
  21. package/dist/index.d.ts +15 -0
  22. package/dist/index.js +158 -0
  23. package/dist/llm-judge.d.ts +24 -0
  24. package/dist/llm-judge.js +139 -0
  25. package/dist/rate-limiter.d.ts +13 -0
  26. package/dist/rate-limiter.js +36 -0
  27. package/dist/reporter.d.ts +8 -0
  28. package/dist/reporter.js +163 -0
  29. package/dist/runner.d.ts +57 -0
  30. package/dist/runner.js +339 -0
  31. package/dist/server.d.ts +22 -0
  32. package/dist/server.js +583 -0
  33. package/dist/tools/html_report.d.ts +8 -0
  34. package/dist/tools/html_report.js +188 -0
  35. package/dist/tools/manage.d.ts +11 -0
  36. package/dist/tools/manage.js +41 -0
  37. package/dist/tools/report.d.ts +12 -0
  38. package/dist/tools/report.js +120 -0
  39. package/dist/tools/run.d.ts +20 -0
  40. package/dist/tools/run.js +166 -0
  41. package/dist/tools/scaffold.d.ts +11 -0
  42. package/dist/tools/scaffold.js +90 -0
  43. package/evals/reference/mcp-fetch.yaml +46 -0
  44. package/evals/reference/mcp-filesystem.yaml +63 -0
  45. package/evals/reference/mcp-memory.yaml +70 -0
  46. package/evals/reference/step-piping-example.yaml +25 -0
  47. package/evals/smoke.yaml +12 -0
  48. package/package.json +67 -0
@@ -0,0 +1,188 @@
1
+ /**
2
+ * generate_html_report tool implementation.
3
+ *
4
+ * Generates a full single-file HTML report for a given run_id.
5
+ * All styles are inlined — no external CDN required.
6
+ */
7
/**
 * Escape a string for safe interpolation into HTML markup.
 *
 * Bug fix: the ampersand replacement previously substituted "&" with "&"
 * (a no-op), so raw ampersands leaked into the report and pre-existing
 * entity-like text could render incorrectly. It now emits "&amp;".
 *
 * @param {string} s - Raw text that may contain HTML metacharacters.
 * @returns {string} Text with &, <, > and " replaced by HTML entities.
 */
function escapeHtml(s) {
    // "&" must be escaped first so the entities produced by the
    // later replacements are not themselves re-escaped.
    return s
        .replace(/&/g, "&amp;")
        .replace(/</g, "&lt;")
        .replace(/>/g, "&gt;")
        .replace(/"/g, "&quot;");
}
14
/**
 * Render a colored inline badge for a case/run status.
 * "pass" is green, "fail" is red, anything else amber.
 *
 * @param {string} status - Status keyword to display.
 * @returns {string} Inline-styled <span> HTML.
 */
function statusBadge(status) {
    let color;
    if (status === "pass") {
        color = "#22c55e";
    }
    else if (status === "fail") {
        color = "#ef4444";
    }
    else {
        color = "#f59e0b";
    }
    return `<span style="display:inline-block;padding:2px 8px;border-radius:4px;background:${color};color:#fff;font-weight:bold;font-size:.85em">${escapeHtml(status)}</span>`;
}
18
/**
 * Build <tr> rows for a list of assertion results; emits a single
 * placeholder row when the list is empty.
 *
 * @param {Array<{passed: boolean, type: string, message: string}>} assertions
 * @returns {string} Concatenated table-row HTML, newline-joined.
 */
function assertionRows(assertions) {
    if (assertions.length === 0) {
        return `<tr><td colspan="3" style="color:#888;font-style:italic">no assertions</td></tr>`;
    }
    const rows = [];
    for (const a of assertions) {
        const color = a.passed ? "#22c55e" : "#ef4444";
        rows.push(`<tr>
            <td style="color:${color}">${a.passed ? "✓" : "✗"}</td>
            <td><code>${escapeHtml(a.type)}</code></td>
            <td style="color:${color}">${escapeHtml(a.message)}</td>
          </tr>`);
    }
    return rows.join("\n");
}
33
/**
 * generate_html_report tool — render a complete, self-contained HTML
 * report for one eval run. All styles are inlined; no external CDN
 * or asset is referenced.
 *
 * @param {string} runId - ID of the run to render.
 * @param {object} db - DB handle exposing getRunById / getCaseResultsForRun.
 * @returns {string} Full HTML document as a string.
 * @throws {Error} When no run exists with the given runId.
 */
export function generateHtmlReportTool(runId, db) {
    const run = db.getRunById(runId);
    if (!run) {
        throw new Error(`Run not found: ${runId}`);
    }
    const cases = db.getCaseResultsForRun(runId);
    // ended_at is null for a run that never finished; render a dash then.
    const duration = run.ended_at !== null ? `${run.ended_at - run.started_at}ms` : "—";
    const overallColor = run.failed === 0 ? "#22c55e" : "#ef4444";
    const caseRows = cases
        .map((c) => {
        const caseColor = c.status === "pass" ? "#22c55e" : c.status === "fail" ? "#ef4444" : "#f59e0b";
        // Parse assertions_json — it is an array of AssertionResult[][]
        // (one array of assertion results per step)
        let stepsAssertions = [];
        try {
            if (c.assertions_json) {
                stepsAssertions = JSON.parse(c.assertions_json);
            }
        }
        catch {
            // ignore parse errors — a malformed payload simply renders
            // with no per-step assertion details
        }
        // One collapsible <details> section per step that has assertions;
        // steps with none contribute an empty string.
        const assertionSections = stepsAssertions
            .map((stepAssertions, idx) => {
            if (stepAssertions.length === 0)
                return "";
            return `<details style="margin-top:8px">
          <summary style="cursor:pointer;color:#aaa">Step ${idx + 1} assertions (${stepAssertions.length})</summary>
          <table style="width:100%;margin-top:4px;border-collapse:collapse">
            <thead><tr>
              <th style="width:2em;text-align:left"></th>
              <th style="text-align:left;color:#aaa">Type</th>
              <th style="text-align:left;color:#aaa">Message</th>
            </tr></thead>
            <tbody>${assertionRows(stepAssertions)}</tbody>
          </table>
        </details>`;
        })
            .join("");
        return `<tr>
        <td style="padding:12px 8px;font-weight:bold;color:${caseColor}">${escapeHtml(c.case_name)}</td>
        <td style="padding:12px 8px">${statusBadge(c.status)}</td>
        <td style="padding:12px 8px;color:#888">${c.duration_ms !== null ? `${c.duration_ms}ms` : "—"}</td>
        <td style="padding:12px 8px">
          ${c.error_message ? `<span style="color:#ef4444">${escapeHtml(c.error_message)}</span>` : ""}
          ${assertionSections}
        </td>
      </tr>`;
    })
        .join("\n");
    // Final document: header/meta, four summary stat cards, then the
    // per-case table (or a placeholder row when the run has no cases).
    return `<!DOCTYPE html>
<html lang="en">
<head>
<meta charset="UTF-8">
<meta name="viewport" content="width=device-width, initial-scale=1">
<title>MCP Eval Report — ${escapeHtml(run.suite_name)}</title>
<style>
  *, *::before, *::after { box-sizing: border-box; margin: 0; padding: 0; }
  body {
    font-family: ui-monospace, SFMono-Regular, Menlo, monospace;
    background: #0f1117;
    color: #e2e8f0;
    padding: 2rem;
    line-height: 1.6;
  }
  h1 { font-size: 1.5rem; margin-bottom: .5rem; }
  h2 { font-size: 1.1rem; margin: 1.5rem 0 .75rem; color: #94a3b8; }
  .meta { color: #64748b; font-size: .85rem; margin-bottom: 1.5rem; }
  .summary-grid {
    display: grid;
    grid-template-columns: repeat(auto-fill, minmax(140px, 1fr));
    gap: 1rem;
    margin-bottom: 2rem;
  }
  .stat-card {
    background: #1e2433;
    border-radius: 8px;
    padding: 1rem;
    text-align: center;
  }
  .stat-card .value {
    font-size: 2rem;
    font-weight: bold;
    display: block;
  }
  .stat-card .label { font-size: .75rem; color: #64748b; }
  table {
    width: 100%;
    border-collapse: collapse;
    background: #1e2433;
    border-radius: 8px;
    overflow: hidden;
  }
  thead tr { background: #252d3d; }
  th {
    padding: 10px 8px;
    text-align: left;
    font-size: .8rem;
    color: #94a3b8;
    text-transform: uppercase;
    letter-spacing: .05em;
  }
  tbody tr:hover { background: #252d3d; }
  tbody tr + tr td { border-top: 1px solid #2d3748; }
  td { vertical-align: top; }
  code { background: #2d3748; padding: 1px 4px; border-radius: 3px; font-size: .85em; }
  details summary { list-style: none; }
  details summary::-webkit-details-marker { display: none; }
  details[open] summary { margin-bottom: 4px; }
</style>
</head>
<body>
<h1>MCP Eval Runner — Report</h1>
<p class="meta">
  Suite: <strong>${escapeHtml(run.suite_name)}</strong> &nbsp;|&nbsp;
  Run ID: <code>${escapeHtml(run.id)}</code> &nbsp;|&nbsp;
  Started: ${new Date(run.started_at).toISOString()} &nbsp;|&nbsp;
  Duration: ${duration}
</p>

<div class="summary-grid">
  <div class="stat-card">
    <span class="value" style="color:#94a3b8">${run.total_cases}</span>
    <span class="label">Total Cases</span>
  </div>
  <div class="stat-card">
    <span class="value" style="color:#22c55e">${run.passed}</span>
    <span class="label">Passed</span>
  </div>
  <div class="stat-card">
    <span class="value" style="color:#ef4444">${run.failed}</span>
    <span class="label">Failed</span>
  </div>
  <div class="stat-card">
    <span class="value" style="color:${overallColor}">${run.failed === 0 ? "PASS" : "FAIL"}</span>
    <span class="label">Overall</span>
  </div>
</div>

<h2>Test Cases</h2>
<table>
  <thead>
    <tr>
      <th>Case</th>
      <th>Status</th>
      <th>Duration</th>
      <th>Details</th>
    </tr>
  </thead>
  <tbody>
    ${cases.length > 0 ? caseRows : `<tr><td colspan="4" style="padding:16px;text-align:center;color:#64748b">No cases found for this run.</td></tr>`}
  </tbody>
</table>
</body>
</html>`;
}
@@ -0,0 +1,11 @@
1
/**
 * list_cases and create_test_case MCP tool implementations.
 */
/**
 * list_cases — enumerate available fixtures with their step counts.
 * Returns a newline-joined, human-readable listing (one bullet per
 * fixture, one sub-item per step).
 */
export declare function listCasesTool(fixturesDir: string): string;
/**
 * create_test_case — create a new YAML fixture file.
 * Validates the steps before writing; returns a confirmation message
 * that includes the created file path.
 */
export declare function createTestCaseTool(name: string, steps: unknown[], fixturesDir: string): string;
@@ -0,0 +1,41 @@
1
+ /**
2
+ * list_cases and create_test_case MCP tool implementations.
3
+ */
4
+ import { loadFixturesFromDir, writeFixture, validateFixture } from "../fixture.js";
5
+ /**
6
+ * list_cases — enumerate available fixtures with their step counts.
7
+ */
8
/**
 * list_cases — enumerate available fixtures with their step counts.
 *
 * @param {string} fixturesDir - Directory scanned for fixture files.
 * @returns {string} Human-readable listing, or a hint when empty.
 */
export function listCasesTool(fixturesDir) {
    const fixtures = loadFixturesFromDir(fixturesDir);
    if (fixtures.length === 0) {
        return [
            "No fixtures found in: " + fixturesDir,
            "",
            "Use create_test_case to add your first test case.",
        ].join("\n");
    }
    // One bullet per fixture, one indented sub-item per step.
    const body = fixtures.flatMap((f) => [
        `  • ${f.name} — ${f.steps.length} step(s)${f.description ? ` (${f.description})` : ""}`,
        ...f.steps.map((step) => `      - [${step.id}] tool: ${step.tool}${step.description ? ` — ${step.description}` : ""}`),
    ]);
    return [`Found ${fixtures.length} fixture(s) in: ${fixturesDir}`, "", ...body].join("\n");
}
26
+ /**
27
+ * create_test_case — create a new YAML fixture file.
28
+ */
29
/**
 * create_test_case — create a new YAML fixture file.
 *
 * @param {string} name - Fixture name.
 * @param {unknown[]} steps - Raw step definitions.
 * @param {string} fixturesDir - Directory the fixture is written into.
 * @returns {string} Confirmation message including the created path.
 */
export function createTestCaseTool(name, steps, fixturesDir) {
    // Round-trip the raw input through validateFixture so invalid
    // definitions fail loudly before anything touches disk.
    const fixture = validateFixture({ name, steps }, "create_test_case input");
    const filePath = writeFixture(fixturesDir, fixture);
    const summary = [
        `Created fixture: ${filePath}`,
        `  Name: ${fixture.name}`,
        `  Steps: ${fixture.steps.length}`,
        "",
        "Use run_case to execute it, or run_suite to run all fixtures.",
    ];
    return summary.join("\n");
}
@@ -0,0 +1,12 @@
1
/**
 * regression_report and compare_results MCP tool implementations.
 */
import type { EvalDb } from "../db.js";
/**
 * regression_report — compare current state to the last run; returns what changed.
 * Reads at most the two most recent runs from the DB; with fewer than
 * two runs it returns a baseline hint instead of a diff.
 */
export declare function regressionReportTool(fixturesDir: string, db: EvalDb): Promise<string>;
/**
 * compare_results — diff two named run results by run ID.
 * Throws when either run ID does not exist.
 */
export declare function compareResultsTool(runIdA: string, runIdB: string, db: EvalDb): Promise<string>;
@@ -0,0 +1,120 @@
1
+ /**
2
+ * regression_report and compare_results MCP tool implementations.
3
+ */
4
+ /**
5
+ * regression_report — compare current state to the last run; returns what changed.
6
+ */
7
/**
 * regression_report — compare current state to the last run; returns what changed.
 * With no history it asks for a baseline; with a single run it prints a
 * summary of that run; otherwise it diffs the two most recent runs.
 *
 * @param {string} fixturesDir - Fixtures directory (part of the tool interface).
 * @param {object} db - Run-history database.
 * @returns {Promise<string>} Human-readable report text.
 */
export async function regressionReportTool(fixturesDir, db) {
    const runs = db.getAllRuns(2);
    if (runs.length === 0) {
        return ["No run history found.", "", "Run run_suite first to establish a baseline."].join("\n");
    }
    if (runs.length === 1) {
        const latest = runs[0];
        const cases = db.getCaseResultsForRun(latest.id);
        const summary = [
            `Only one run found (Run ID: ${latest.id}).`,
            `Started: ${new Date(latest.started_at).toISOString()}`,
            `Results: ${latest.passed} passed, ${latest.failed} failed`,
            "",
            "Run again after making changes to see regressions.",
            "",
            "Cases:",
            ...cases.map((c) => `  ${c.status === "pass" ? "✓" : "✗"} ${c.case_name} — ${c.status}`),
        ];
        return summary.join("\n");
    }
    const [latest, previous] = runs;
    return diffRuns(previous, latest, db);
}
32
+ /**
33
+ * compare_results — diff two named run results by run ID.
34
+ */
35
/**
 * compare_results — diff two named run results by run ID.
 * Both runs are looked up first; a missing run (A checked before B)
 * raises an error naming the missing ID.
 *
 * @param {string} runIdA - Baseline run ID.
 * @param {string} runIdB - Run ID to compare against the baseline.
 * @param {object} db - Run-history database.
 * @returns {Promise<string>} Formatted diff report.
 * @throws {Error} When either run ID is unknown.
 */
export async function compareResultsTool(runIdA, runIdB, db) {
    const runA = db.getRunById(runIdA);
    const runB = db.getRunById(runIdB);
    for (const [id, run] of [[runIdA, runA], [runIdB, runB]]) {
        if (!run) {
            throw new Error(`Run not found: ${id}`);
        }
    }
    return diffRuns(runA, runB, db);
}
46
/**
 * Produce a human-readable diff between two runs' case results.
 * Cases are bucketed as: regressions (pass → non-pass), fixes
 * (non-pass → pass), new, removed, and unchanged. A final
 * "No changes detected" line is appended only when the first four
 * buckets are all empty.
 *
 * @param {object} previous - Baseline run record.
 * @param {object} latest - Newer run record.
 * @param {object} db - Database exposing getCaseResultsForRun.
 * @returns {string} Formatted report text.
 */
function diffRuns(previous, latest, db) {
    const baseline = new Map(db.getCaseResultsForRun(previous.id).map((c) => [c.case_name, c]));
    const current = new Map(db.getCaseResultsForRun(latest.id).map((c) => [c.case_name, c]));
    const regressions = [];
    const fixes = [];
    const newCases = [];
    const removed = [];
    const unchanged = [];
    // Bucket every case present in the latest run.
    for (const [name, cur] of current) {
        const prev = baseline.get(name);
        if (!prev) {
            newCases.push(`  + ${name} (new, ${cur.status})`);
        }
        else if (prev.status === "pass" && cur.status !== "pass") {
            regressions.push(`  ✗ ${name}: ${prev.status} → ${cur.status}`);
        }
        else if (prev.status !== "pass" && cur.status === "pass") {
            fixes.push(`  ✓ ${name}: ${prev.status} → ${cur.status}`);
        }
        else {
            unchanged.push(`  = ${name}: ${cur.status}`);
        }
    }
    // Cases that existed before but are gone now.
    for (const name of baseline.keys()) {
        if (!current.has(name)) {
            removed.push(`  - ${name} (removed)`);
        }
    }
    const lines = [
        "=== Regression Report ===",
        "",
        `Previous run: ${previous.id} (${new Date(previous.started_at).toISOString()})`,
        `  ${previous.passed}/${previous.total_cases} passed`,
        "",
        `Latest run: ${latest.id} (${new Date(latest.started_at).toISOString()})`,
        `  ${latest.passed}/${latest.total_cases} passed`,
        "",
    ];
    const emitSection = (label, items) => {
        if (items.length > 0) {
            lines.push(`${label} (${items.length}):`, ...items, "");
        }
    };
    emitSection("REGRESSIONS", regressions);
    emitSection("FIXED", fixes);
    emitSection("NEW CASES", newCases);
    emitSection("REMOVED", removed);
    emitSection("UNCHANGED", unchanged);
    if (regressions.length === 0 &&
        fixes.length === 0 &&
        newCases.length === 0 &&
        removed.length === 0) {
        lines.push("No changes detected between runs.");
    }
    return lines.join("\n");
}
@@ -0,0 +1,20 @@
1
/**
 * run_suite and run_case MCP tool implementations.
 */
import type { EvalDb } from "../db.js";
import type { RunnerOptions } from "../runner.js";
import type { McpServer } from "@modelcontextprotocol/sdk/server/mcp.js";
/**
 * Shared options for the run tools.
 */
export interface RunToolOptions {
    /** Directory containing fixture files. */
    fixturesDir: string;
    /** Run-history database used to persist results. */
    db: EvalDb;
    /** Options forwarded to the case/suite runner. */
    runnerOptions: RunnerOptions;
    /** Optional MCP server used for progress/log notifications. */
    server?: McpServer;
}
/**
 * run_suite — execute all fixtures in the fixtures directory.
 */
export declare function runSuiteTool(opts: RunToolOptions): Promise<string>;
/**
 * run_case — run a single named test case.
 */
export declare function runCaseTool(name: string, opts: RunToolOptions): Promise<string>;
@@ -0,0 +1,166 @@
1
+ /**
2
+ * run_suite and run_case MCP tool implementations.
3
+ */
4
+ import { loadFixturesFromDir, loadFixture } from "../fixture.js";
5
+ import { runSuite, runCase } from "../runner.js";
6
+ import { formatResult } from "../reporter.js";
7
+ import path from "path";
8
+ import fs from "fs";
9
+ import crypto from "crypto";
10
+ /**
11
+ * Send a progress notification for the current suite run.
12
+ */
13
/**
 * Send a progress notification for the current suite run.
 * Best-effort: silently returns when no server is attached and
 * swallows any transport error while sending.
 *
 * @param {object|undefined} server - MCP server wrapper (may be absent).
 * @param {string} suiteId - Unique ID of this suite execution.
 * @param {number} progress - Completed case count so far.
 * @param {number} total - Total number of cases.
 */
async function sendProgress(server, suiteId, progress, total) {
    if (!server) {
        return;
    }
    const payload = {
        method: "notifications/progress",
        params: {
            progressToken: `eval-run-${suiteId}`,
            progress,
            total,
        },
    };
    try {
        await server.server.notification(payload);
    }
    catch {
        // Notifications are best-effort; ignore send errors
    }
}
30
+ /**
31
+ * Send an MCP logging notification for an assertion result.
32
+ */
33
/**
 * Send an MCP logging notification for a single assertion result.
 * Best-effort: silently returns when no server is attached and
 * swallows any transport error while sending.
 *
 * @param {object|undefined} server - MCP server wrapper (may be absent).
 * @param {string} assertionType - Assertion kind (e.g. output_contains).
 * @param {string} caseId - Name of the case the assertion belongs to.
 * @param {boolean} passed - Whether the assertion passed.
 */
async function sendAssertionLog(server, assertionType, caseId, passed) {
    if (!server) {
        return;
    }
    const verdict = passed ? "PASS" : "FAIL";
    const message = {
        method: "notifications/message",
        params: {
            level: "info",
            logger: "eval-runner",
            data: `Assertion ${assertionType} on case ${caseId}: ${verdict}`,
        },
    };
    try {
        await server.server.notification(message);
    }
    catch {
        // Notifications are best-effort; ignore send errors
    }
}
50
+ /**
51
+ * run_suite — execute all fixtures in the fixtures directory.
52
+ */
53
/**
 * run_suite — execute all fixtures in the fixtures directory.
 *
 * Iterates the fixtures once to emit per-assertion log notifications
 * and per-case progress notifications, then calls runSuite for
 * persistence and the final aggregated result.
 *
 * NOTE(review): each fixture is executed here in the loop AND again
 * inside runSuite() below — every case appears to run twice per
 * invocation, and `caseResultsAcc` is accumulated but never read.
 * Confirm whether runSuite can accept pre-computed results or whether
 * the loop should only emit notifications.
 *
 * @param {RunToolOptions} opts - Fixtures dir, DB, runner options, optional server.
 * @returns {Promise<string>} Formatted suite result.
 */
export async function runSuiteTool(opts) {
    const fixtures = loadFixturesFromDir(opts.fixturesDir);
    if (fixtures.length === 0) {
        return [
            "No fixtures found in: " + opts.fixturesDir,
            "",
            "NOTE: This runs in simulation mode (Phase 1).",
            "Create fixtures with create_test_case or add YAML files to the fixtures directory.",
        ].join("\n");
    }
    const total = fixtures.length;
    const suiteId = crypto.randomUUID();
    let completed = 0;
    // Run fixtures one at a time, emitting progress + assertion logs per case
    const caseResultsAcc = [];
    for (const fixture of fixtures) {
        const caseResult = await runCase(fixture, opts.runnerOptions);
        caseResultsAcc.push(caseResult);
        // Emit assertion-level logging notifications
        for (const step of caseResult.steps) {
            for (const assertion of step.assertions) {
                await sendAssertionLog(opts.server, assertion.type, fixture.name, assertion.passed);
            }
        }
        completed++;
        await sendProgress(opts.server, suiteId, completed, total);
    }
    // Persist results through runSuite (which also recalculates pass/fail totals)
    const result = await runSuite(fixtures, "default", opts.runnerOptions, opts.db);
    const formatted = formatResult(result, opts.runnerOptions.format);
    return formatted;
}
85
+ /**
86
+ * run_case — run a single named test case.
87
+ */
88
/**
 * run_case — run a single named test case.
 *
 * Resolution order: first match a loaded fixture by its `name` field;
 * failing that, fall back to <name>.yaml / .yml / .json in the
 * fixtures directory.
 *
 * NOTE(review): the filename-fallback branch returns a formatted
 * result but does NOT persist a run/case record to the DB, unlike
 * the named-fixture path below — confirm whether that asymmetry is
 * intentional.
 *
 * @param {string} name - Fixture name (or base filename) to run.
 * @param {RunToolOptions} opts - Fixtures dir, DB, runner options, optional server.
 * @returns {Promise<string>} Formatted single-case suite result.
 * @throws {Error} When no matching fixture or fixture file exists.
 */
export async function runCaseTool(name, opts) {
    // Find the fixture file by name
    const fixtures = loadFixturesFromDir(opts.fixturesDir);
    const fixture = fixtures.find((f) => f.name === name);
    if (!fixture) {
        // Try to find by filename as well
        const candidates = [
            path.join(opts.fixturesDir, `${name}.yaml`),
            path.join(opts.fixturesDir, `${name}.yml`),
            path.join(opts.fixturesDir, `${name}.json`),
        ];
        for (const c of candidates) {
            if (fs.existsSync(c)) {
                const loaded = loadFixture(c);
                const caseResult = await runCase(loaded, opts.runnerOptions);
                // Emit assertion-level logging notifications
                for (const step of caseResult.steps) {
                    for (const assertion of step.assertions) {
                        await sendAssertionLog(opts.server, assertion.type, loaded.name, assertion.passed);
                    }
                }
                // Synthesize a one-case suite result for formatting;
                // started_at is reconstructed from the measured duration.
                const suiteResult = {
                    run_id: crypto.randomUUID(),
                    suite_name: name,
                    started_at: Date.now() - caseResult.duration_ms,
                    ended_at: Date.now(),
                    total_cases: 1,
                    passed: caseResult.status === "pass" ? 1 : 0,
                    failed: caseResult.status !== "pass" ? 1 : 0,
                    cases: [caseResult],
                };
                return formatResult(suiteResult, opts.runnerOptions.format);
            }
        }
        throw new Error(`No fixture named "${name}" found in ${opts.fixturesDir}`);
    }
    const caseResult = await runCase(fixture, opts.runnerOptions);
    // Emit assertion-level logging notifications
    for (const step of caseResult.steps) {
        for (const assertion of step.assertions) {
            await sendAssertionLog(opts.server, assertion.type, fixture.name, assertion.passed);
        }
    }
    // Persist to DB — run record must exist before case_result (foreign key)
    const runId = crypto.randomUUID();
    const now = Date.now();
    opts.db.insertRun({
        id: runId,
        suite_name: name,
        started_at: now - caseResult.duration_ms,
        ended_at: now,
        total_cases: 1,
        passed: caseResult.status === "pass" ? 1 : 0,
        failed: caseResult.status !== "pass" ? 1 : 0,
        format: opts.runnerOptions.format ?? "console",
    });
    opts.db.insertCaseResult({
        id: crypto.randomUUID(),
        run_id: runId,
        case_name: caseResult.case_name,
        status: caseResult.status,
        duration_ms: caseResult.duration_ms,
        error_message: caseResult.error ?? null,
        // One array of assertion results per step, serialized as JSON.
        assertions_json: JSON.stringify(caseResult.steps.map((s) => s.assertions)),
        created_at: now,
    });
    const suiteResult = {
        run_id: runId,
        suite_name: name,
        started_at: now - caseResult.duration_ms,
        ended_at: now,
        total_cases: 1,
        passed: caseResult.status === "pass" ? 1 : 0,
        failed: caseResult.status !== "pass" ? 1 : 0,
        cases: [caseResult],
    };
    const formatted = formatResult(suiteResult, opts.runnerOptions.format);
    return formatted;
}
@@ -0,0 +1,11 @@
1
/**
 * scaffold_fixture tool implementation.
 *
 * Generates a boilerplate YAML fixture file given a name and a list of tool names.
 * Each tool name becomes a documented step with placeholder input and assertions.
 */
/**
 * Scaffold a YAML fixture file and write it to the fixtures directory.
 * Throws on an empty name, an empty tool list, or an existing file.
 * Returns the path to the created file.
 */
export declare function scaffoldFixtureTool(name: string, toolNames: string[], fixturesDir: string): string;
@@ -0,0 +1,90 @@
1
+ /**
2
+ * scaffold_fixture tool implementation.
3
+ *
4
+ * Generates a boilerplate YAML fixture file given a name and a list of tool names.
5
+ * Each tool name becomes a documented step with placeholder input and assertions.
6
+ */
7
+ import fs from "fs";
8
+ import path from "path";
9
+ /**
10
+ * Build a YAML fixture template string from a name and list of tool names.
11
+ */
12
/**
 * Build a YAML fixture template string from a name and list of tool names.
 * Each tool becomes one step with placeholder input, an expected-output
 * TODO, and a commented menu of assertion options. Every step after the
 * first gets a comment showing how to pipe the previous step's output.
 *
 * @param {string} name - Fixture name.
 * @param {string[]} toolNames - Tool names, one scaffolded step each.
 * @returns {string} Complete YAML document text.
 */
function buildFixtureYaml(name, toolNames) {
    const safeId = (s) => s.replace(/[^a-z0-9_]/gi, "_");
    const stepBlocks = toolNames.map((tool, idx) => {
        const id = `step_${idx + 1}_${safeId(tool)}`;
        const stepLines = [
            `  - id: ${id}`,
            `    description: "Call ${tool} and verify the response"`,
            `    tool: ${tool}`,
            `    input:`,
            `      # TODO: fill in tool-specific input parameters`,
            `      example_param: "example_value"`,
        ];
        if (idx > 0) {
            // Previous step's id follows the same step_<n>_<tool> scheme.
            const prevId = `step_${idx}_${safeId(toolNames[idx - 1])}`;
            stepLines.push(`    # Pipe previous step output: "{{steps.${prevId}.output}}"`);
        }
        stepLines.push(
            `    expected_output: "TODO: paste or describe expected output here"`,
            `    expect:`,
            `      output_contains: "TODO: expected substring"`,
            `      # output_not_contains: "error"`,
            `      # output_matches: "^result:"`,
            `      tool_called: ${tool}`,
            `      latency_under: 5000`,
            `      # schema_match:`,
            `      #   type: object`,
            `      #   required: [result]`,
            `      #   properties:`,
            `      #     result:`,
            `      #       type: string`,
        );
        return stepLines.join("\n");
    });
    return [
        `# Fixture: ${name}`,
        `# Generated by mcp-eval-runner scaffold_fixture`,
        `# Edit the TODO fields before running.`,
        `#`,
        `# Live execution: add a 'server' block to run against a real MCP server:`,
        `#   server:`,
        `#     command: node`,
        `#     args: ["/path/to/your/server.js"]`,
        `#     env:`,
        `#       MY_ENV_VAR: "value"`,
        ``,
        `name: ${name}`,
        `description: "Auto-scaffolded fixture for tools: ${toolNames.join(", ")}"`,
        ``,
        `# server:`,
        `#   command: node`,
        `#   args: ["/path/to/server.js"]`,
        ``,
        `steps:`,
        stepBlocks.join("\n\n"),
        ``,
    ].join("\n");
}
69
+ /**
70
+ * Scaffold a YAML fixture file and write it to the fixtures directory.
71
+ * Returns the path to the created file.
72
+ */
73
/**
 * Scaffold a YAML fixture file and write it to the fixtures directory.
 * Creates the directory if needed and refuses to overwrite an
 * existing fixture file.
 *
 * @param {string} name - Fixture name (sanitized for the filename).
 * @param {string[]} toolNames - Tool names to scaffold steps for.
 * @param {string} fixturesDir - Target directory.
 * @returns {string} Path to the created file.
 * @throws {Error} On empty name, empty tool list, or an existing file.
 */
export function scaffoldFixtureTool(name, toolNames, fixturesDir) {
    if (!name || name.trim() === "") {
        throw new Error('"name" must be a non-empty string');
    }
    if (!toolNames || toolNames.length === 0) {
        throw new Error('"tool_names" must be a non-empty array');
    }
    fs.mkdirSync(fixturesDir, { recursive: true });
    // Filename is the name with anything outside [a-z0-9_-] replaced.
    const safeName = name.replace(/[^a-z0-9_-]/gi, "_");
    const filePath = path.join(fixturesDir, `${safeName}.yaml`);
    if (fs.existsSync(filePath)) {
        throw new Error(`Fixture already exists: ${filePath}`);
    }
    fs.writeFileSync(filePath, buildFixtureYaml(name, toolNames), "utf-8");
    return filePath;
}