mcp-eval-runner 1.0.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/.env.example +39 -0
- package/CHANGELOG.md +67 -0
- package/LICENSE +21 -0
- package/README.md +328 -0
- package/dist/assertions.d.ts +63 -0
- package/dist/assertions.js +187 -0
- package/dist/audit-log.d.ts +26 -0
- package/dist/audit-log.js +57 -0
- package/dist/auth.d.ts +15 -0
- package/dist/auth.js +83 -0
- package/dist/db.d.ts +40 -0
- package/dist/db.js +94 -0
- package/dist/deployment-gate.d.ts +27 -0
- package/dist/deployment-gate.js +43 -0
- package/dist/fixture-library.d.ts +26 -0
- package/dist/fixture-library.js +85 -0
- package/dist/fixture.d.ts +87 -0
- package/dist/fixture.js +170 -0
- package/dist/http-server.d.ts +7 -0
- package/dist/http-server.js +34 -0
- package/dist/index.d.ts +15 -0
- package/dist/index.js +158 -0
- package/dist/llm-judge.d.ts +24 -0
- package/dist/llm-judge.js +139 -0
- package/dist/rate-limiter.d.ts +13 -0
- package/dist/rate-limiter.js +36 -0
- package/dist/reporter.d.ts +8 -0
- package/dist/reporter.js +163 -0
- package/dist/runner.d.ts +57 -0
- package/dist/runner.js +339 -0
- package/dist/server.d.ts +22 -0
- package/dist/server.js +583 -0
- package/dist/tools/html_report.d.ts +8 -0
- package/dist/tools/html_report.js +188 -0
- package/dist/tools/manage.d.ts +11 -0
- package/dist/tools/manage.js +41 -0
- package/dist/tools/report.d.ts +12 -0
- package/dist/tools/report.js +120 -0
- package/dist/tools/run.d.ts +20 -0
- package/dist/tools/run.js +166 -0
- package/dist/tools/scaffold.d.ts +11 -0
- package/dist/tools/scaffold.js +90 -0
- package/evals/reference/mcp-fetch.yaml +46 -0
- package/evals/reference/mcp-filesystem.yaml +63 -0
- package/evals/reference/mcp-memory.yaml +70 -0
- package/evals/reference/step-piping-example.yaml +25 -0
- package/evals/smoke.yaml +12 -0
- package/package.json +67 -0
package/dist/runner.js
ADDED
|
@@ -0,0 +1,339 @@
|
|
|
1
|
+
/**
|
|
2
|
+
* Test execution engine for MCP Eval Runner.
|
|
3
|
+
*
|
|
4
|
+
* Supports two execution modes:
|
|
5
|
+
*
|
|
6
|
+
* 1. Live mode (default when server config is present in fixture):
|
|
7
|
+
* Spawns the target MCP server as a child process (stdio transport) or
|
|
8
|
+
* connects via HTTP, calls each step's tool with the provided input, and
|
|
9
|
+
* evaluates assertions against the real response.
|
|
10
|
+
*
|
|
11
|
+
* 2. Simulation mode (fallback when no server config is present):
|
|
12
|
+
* Evaluates assertions against `expected_output` from the fixture.
|
|
13
|
+
* Useful for authoring and CI dry-runs without a running server.
|
|
14
|
+
*
|
|
15
|
+
* Step output piping:
|
|
16
|
+
* Steps can reference the output of a previous step using the
|
|
17
|
+
* `{{steps.<step_id>.output}}` placeholder in their `input` values.
|
|
18
|
+
*/
|
|
19
|
+
import crypto from "crypto";
|
|
20
|
+
import { spawn } from "child_process";
|
|
21
|
+
import { evaluateAllAssertions, evaluateAllAssertionsAsync, } from "./assertions.js";
|
|
22
|
+
/**
 * Resolve `{{steps.<id>.output}}` placeholders in input values.
 *
 * Every string found in `input` - at the top level or nested inside arrays
 * and plain objects - has each placeholder replaced by the output stored for
 * that step id in `context`. A placeholder whose step id is not in `context`
 * resolves to an empty string. Non-string leaves are passed through as-is and
 * `input` itself is never mutated.
 *
 * @param input   Step input object. `null`/`undefined` (a step declared
 *                without input) is treated as an empty object.
 * @param context Map of step id to the output recorded for that step.
 * @returns A new object mirroring `input` with placeholders substituted.
 */
function resolveInputPlaceholders(input, context) {
    const substitute = (text) => text.replace(/\{\{steps\.([^}]+)\.output\}\}/g, (_match, stepId) => context.get(stepId) ?? "");
    const resolveValue = (value) => {
        if (typeof value === "string") {
            return substitute(value);
        }
        if (Array.isArray(value)) {
            return value.map(resolveValue);
        }
        if (value !== null && typeof value === "object") {
            // Only descend into plain objects; class instances (Date, Buffer, ...)
            // are opaque values and must be forwarded untouched.
            const proto = Object.getPrototypeOf(value);
            if (proto === Object.prototype || proto === null) {
                return Object.fromEntries(Object.entries(value).map(([key, inner]) => [key, resolveValue(inner)]));
            }
        }
        return value;
    };
    // The top level always yields a plain object, even when no input was given.
    return Object.fromEntries(Object.entries(input ?? {}).map(([key, value]) => [key, resolveValue(value)]));
}
|
|
39
|
+
/**
 * Minimal MCP stdio client.
 * Spawns the server command and exchanges newline-delimited JSON-RPC 2.0
 * messages with it over stdin/stdout.
 *
 * Failure handling: a spawn error, a stdin/stdout stream error, the server
 * process closing, or an explicit `close()` rejects every in-flight request
 * and makes later requests reject immediately, so callers never hang on a
 * dead server and never crash on an unhandled `error` event.
 */
class McpStdioClient {
    proc;
    buffer = "";
    pending = new Map();
    nextId = 1;
    ready = false;
    initPromise;
    /** First fatal error seen for this client; unset while the client is healthy. */
    failure;
    /**
     * @param command Executable used to start the server under test.
     * @param args    Arguments passed to `command`.
     * @param env     Extra environment variables layered over `process.env`.
     */
    constructor(command, args, env) {
        this.proc = spawn(command, args, {
            env: { ...process.env, ...env },
            stdio: ["pipe", "pipe", "pipe"],
        });
        const onFailure = (err) => this.fail(err instanceof Error ? err : new Error(String(err)));
        // Without these listeners a missing binary or a broken pipe surfaces as
        // an unhandled 'error' event and takes the whole runner down.
        this.proc.on("error", onFailure);
        this.proc.stdin.on("error", onFailure);
        this.proc.stdout.on("error", onFailure);
        // 'close' fires after the stdio streams have ended, so any response
        // written just before the server exited has already been delivered.
        this.proc.on("close", (code, signal) => {
            this.fail(new Error(`MCP server exited (code ${code}, signal ${signal})`));
        });
        // Decode at the stream level so a multi-byte character split across two
        // chunks is reassembled instead of being corrupted.
        this.proc.stdout.setEncoding("utf8");
        this.proc.stdout.on("data", (chunk) => {
            this.buffer += chunk;
            this.flushBuffer();
        });
        this.proc.stderr.on("data", () => {
            // Ignore stderr from the server under test (still drained so the pipe never fills)
        });
        this.initPromise = this.initialize();
        // The rejection is observed by callTool(); this no-op handler only stops
        // it from being reported as unhandled when no call is ever made.
        this.initPromise.catch(() => { });
    }
    /** Record the first fatal error and reject every request still waiting on a response. */
    fail(err) {
        if (!this.failure) {
            this.failure = err;
        }
        const waiting = [...this.pending.values()];
        this.pending.clear();
        for (const handler of waiting) {
            handler.reject(this.failure);
        }
    }
    /** Race `promise` against a timer that is always cleared once the race settles. */
    withTimeout(promise, timeoutMs, message) {
        let timer;
        const timeout = new Promise((_, reject) => {
            timer = setTimeout(() => reject(new Error(message)), timeoutMs);
        });
        return Promise.race([promise, timeout]).finally(() => clearTimeout(timer));
    }
    /** Split buffered stdout into complete lines and dispatch JSON-RPC responses. */
    flushBuffer() {
        const lines = this.buffer.split("\n");
        // The last element is an incomplete line (or ""); keep it for the next chunk.
        this.buffer = lines.pop() ?? "";
        for (const line of lines) {
            const trimmed = line.trim();
            if (!trimmed)
                continue;
            let msg;
            try {
                msg = JSON.parse(trimmed);
            }
            catch {
                // Non-JSON line from server — ignore
                continue;
            }
            // Skip non-objects and server-initiated requests/notifications: their
            // ids live in the server's own id space and may collide with ours.
            if (msg === null || typeof msg !== "object" || "method" in msg)
                continue;
            const handler = this.pending.get(msg.id);
            if (handler) {
                this.pending.delete(msg.id);
                handler.resolve(msg);
            }
        }
    }
    /**
     * Write one JSON-RPC request and return a promise for its response message.
     * Rejects immediately when the client has already failed or been closed.
     */
    send(req) {
        return new Promise((resolve, reject) => {
            if (this.failure) {
                reject(this.failure);
                return;
            }
            this.pending.set(req.id, { resolve, reject });
            this.proc.stdin.write(JSON.stringify(req) + "\n");
        });
    }
    /** Perform the MCP `initialize` handshake followed by the `initialized` notification. */
    async initialize() {
        const initReq = {
            jsonrpc: "2.0",
            id: this.nextId++,
            method: "initialize",
            params: {
                protocolVersion: "2024-11-05",
                capabilities: {},
                clientInfo: { name: "mcp-eval-runner", version: "1.0.0" },
            },
        };
        await this.send(initReq);
        // Send initialized notification (no response expected)
        this.proc.stdin.write(JSON.stringify({ jsonrpc: "2.0", method: "notifications/initialized", params: {} }) + "\n");
        this.ready = true;
    }
    /**
     * Invoke a tool on the server and return its output as a string.
     *
     * @param toolName  Name of the tool to call.
     * @param toolInput Arguments object forwarded as `params.arguments`.
     * @param timeoutMs Budget, applied separately to waiting for the handshake
     *                  and to the tool call itself.
     * @returns The `text` items of `result.content` joined with newlines, or the
     *          JSON-serialised result when it carries no content array.
     * @throws Error on timeout, on a JSON-RPC error response, or when the server
     *         could not be started / exited / the client was closed.
     */
    async callTool(toolName, toolInput, timeoutMs) {
        // A server that never answers `initialize` must not block the step forever.
        await this.withTimeout(this.initPromise, timeoutMs, `MCP server initialization timed out after ${timeoutMs}ms`);
        if (!this.ready)
            throw new Error("MCP client not initialized");
        const req = {
            jsonrpc: "2.0",
            id: this.nextId++,
            method: "tools/call",
            params: { name: toolName, arguments: toolInput },
        };
        let response;
        try {
            response = await this.withTimeout(this.send(req), timeoutMs, `Tool call timed out after ${timeoutMs}ms`);
        }
        finally {
            // After a timeout the entry would otherwise stay registered forever.
            this.pending.delete(req.id);
        }
        if (response.error) {
            throw new Error(`MCP error ${response.error.code}: ${response.error.message}`);
        }
        // Extract text content from the result
        const result = response.result;
        if (Array.isArray(result?.content)) {
            return result.content
                .filter((c) => c.type === "text")
                .map((c) => c.text ?? "")
                .join("\n");
        }
        // `?? null` keeps the return value a string even when `result` is absent.
        return JSON.stringify(result ?? null);
    }
    /** Stop the server process (best-effort) and reject anything still in flight. */
    close() {
        try {
            this.proc.stdin.end();
            this.proc.kill();
        }
        catch {
            // best-effort
        }
        this.fail(new Error("MCP client closed"));
    }
}
|
|
142
|
+
// ── Live step execution ───────────────────────────────────────────────────────
|
|
143
|
+
/**
 * Run one fixture step against a live server through `client`.
 *
 * Placeholders in the step input are resolved from `context`, the tool is
 * called with the given timeout, and - when the step declares `expect` - the
 * assertions are evaluated against the real output and measured latency.
 * A failed tool call yields an "error" result with empty output and no
 * assertions; a failed assertion set yields "fail".
 */
async function executeStepLive(step, client, timeoutMs, context) {
    const startedAt = Date.now();
    const toolInput = resolveInputPlaceholders(step.input, context);
    let output;
    try {
        output = await client.callTool(step.tool, toolInput, timeoutMs);
    }
    catch (err) {
        // The call itself failed: report it and skip assertion evaluation.
        const message = err instanceof Error ? err.message : String(err);
        const elapsedOnError = Date.now() - startedAt;
        return {
            step_id: step.id,
            tool: step.tool,
            status: "error",
            duration_ms: elapsedOnError,
            output: "",
            assertions: [],
            error: message,
            mode: "live",
        };
    }
    const elapsed = Date.now() - startedAt;
    let outcome = "pass";
    let assertions = [];
    if (step.expect) {
        const evaluation = await evaluateAllAssertionsAsync(step.expect, {
            tool: step.tool,
            output,
            latency_ms: elapsed,
        });
        assertions = [...evaluation.results];
        outcome = evaluation.passed ? "pass" : "fail";
    }
    return {
        step_id: step.id,
        tool: step.tool,
        status: outcome,
        duration_ms: elapsed,
        output,
        assertions,
        error: undefined,
        mode: "live",
    };
}
|
|
191
|
+
// ── Simulation step execution ─────────────────────────────────────────────────
|
|
192
|
+
/**
 * Evaluate one fixture step without contacting a server.
 *
 * The step's `expected_output` (or "" when absent) stands in for the tool
 * output and a random latency of 1-10 ms stands in for the call time. The
 * reported duration is the real elapsed time plus that simulated latency.
 */
function simulateStep(step, context) {
    const startedAt = Date.now();
    const fakeLatency = 1 + Math.floor(Math.random() * 10);
    // Placeholders are still resolved in simulation mode; the resolved input
    // is not used for anything else here.
    resolveInputPlaceholders(step.input, context);
    const output = step.expected_output ?? "";
    let assertions = [];
    let outcome = "pass";
    if (step.expect) {
        const evaluation = evaluateAllAssertions(step.expect, {
            tool: step.tool,
            output,
            latency_ms: fakeLatency,
        });
        assertions = [...evaluation.results];
        if (!evaluation.passed) {
            outcome = "fail";
        }
    }
    return {
        step_id: step.id,
        tool: step.tool,
        status: outcome,
        duration_ms: Date.now() - startedAt + fakeLatency,
        output,
        assertions,
        mode: "simulation",
    };
}
|
|
222
|
+
// ── Case runner ───────────────────────────────────────────────────────────────
|
|
223
|
+
/**
 * Run every step of one fixture and summarise the outcome.
 *
 * When the fixture's `server` config has a `command`, a stdio client is
 * started and steps run live; otherwise steps are simulated. Each step's
 * output is stored under its id so later steps can reference it. In live
 * mode the first step that ends in "error" stops the case.
 *
 * @returns `{ case_name, status, duration_ms, steps }`, plus `error` when the
 *          server could not be started or a step threw unexpectedly. `status`
 *          is "pass" when all steps passed, "error" when any step errored,
 *          otherwise "fail".
 */
export async function runCase(fixture, options) {
    const startedAt = Date.now();
    const completedSteps = [];
    const outputsByStep = new Map();
    const describeError = (err) => (err instanceof Error ? err.message : String(err));
    // Determine if live execution is possible
    const server = fixture.server;
    let client = null;
    if (server?.command) {
        try {
            client = new McpStdioClient(server.command, server.args ?? [], server.env);
        }
        catch (err) {
            return {
                case_name: fixture.name,
                status: "error",
                duration_ms: Date.now() - startedAt,
                steps: [],
                error: `Failed to start server: ${describeError(err)}`,
            };
        }
    }
    try {
        for (const step of fixture.steps) {
            const result = client
                ? await executeStepLive(step, client, options.timeoutMs, outputsByStep)
                : simulateStep(step, outputsByStep);
            completedSteps.push(result);
            // Store output for downstream step piping
            outputsByStep.set(step.id, result.output);
            // Stop on first error in live mode to avoid cascading failures
            if (client && result.status === "error") {
                break;
            }
        }
    }
    catch (err) {
        return {
            case_name: fixture.name,
            status: "error",
            duration_ms: Date.now() - startedAt,
            steps: completedSteps,
            error: describeError(err),
        };
    }
    finally {
        // Runs on both the normal and the failure path.
        client?.close();
    }
    const duration_ms = Date.now() - startedAt;
    let status = "pass";
    if (!completedSteps.every((s) => s.status === "pass")) {
        status = completedSteps.some((s) => s.status === "error") ? "error" : "fail";
    }
    return { case_name: fixture.name, status, duration_ms, steps: completedSteps };
}
|
|
284
|
+
// ── Suite runner ──────────────────────────────────────────────────────────────
|
|
285
|
+
/**
 * Run a list of fixtures as one suite and persist the results through `db`.
 *
 * A run row is inserted up front, fixtures are executed in consecutive
 * batches of `options.concurrency` (at least 1, default 1), one case row is
 * inserted per result in fixture order, and the run row is finally updated
 * with the end time and counters. Any case whose status is not "pass" counts
 * as failed.
 *
 * @returns Run summary: ids, timestamps, counters and the per-case results.
 */
export async function runSuite(fixtures, suiteName, options, db) {
    const run_id = crypto.randomUUID();
    const started_at = Date.now();
    const batchSize = Math.max(1, options.concurrency ?? 1);
    db.insertRun({
        id: run_id,
        suite_name: suiteName,
        started_at,
        ended_at: null,
        total_cases: fixtures.length,
        passed: 0,
        failed: 0,
        format: options.format,
    });
    const cases = new Array(fixtures.length);
    let passed = 0;
    let failed = 0;
    // Process in batches of `concurrency`
    for (let offset = 0; offset < fixtures.length; offset += batchSize) {
        const slice = fixtures.slice(offset, offset + batchSize);
        const settled = await Promise.all(slice.map((fixture) => runCase(fixture, options)));
        settled.forEach((outcome, position) => {
            cases[offset + position] = outcome;
            if (outcome.status === "pass") {
                passed += 1;
            }
            else {
                failed += 1;
            }
            db.insertCaseResult({
                id: crypto.randomUUID(),
                run_id,
                case_name: outcome.case_name,
                status: outcome.status,
                duration_ms: outcome.duration_ms,
                error_message: outcome.error ?? null,
                assertions_json: JSON.stringify(outcome.steps.map((s) => s.assertions)),
                created_at: Date.now(),
            });
        });
    }
    const ended_at = Date.now();
    const total_cases = fixtures.length;
    db.updateRun(run_id, { ended_at, total_cases, passed, failed });
    return {
        run_id,
        suite_name: suiteName,
        started_at,
        ended_at,
        total_cases,
        passed,
        failed,
        cases,
    };
}
|
package/dist/server.d.ts
ADDED
|
@@ -0,0 +1,22 @@
|
|
|
1
|
+
/**
|
|
2
|
+
* MCP Server for mcp-eval-runner.
|
|
3
|
+
* Exposes tools: run_suite, run_case, list_cases, create_test_case,
|
|
4
|
+
* regression_report, compare_results, generate_html_report, scaffold_fixture.
|
|
5
|
+
* Exposes resources: eval://{fixture_name}
|
|
6
|
+
* Exposes prompts: write-test-case
|
|
7
|
+
*/
|
|
8
|
+
import { McpServer } from "@modelcontextprotocol/sdk/server/mcp.js";
|
|
9
|
+
/**
 * Options accepted by `createServer` and `startServer`.
 *
 * All fields are required except `concurrency`. `format` is restricted to
 * "console", "json" or "html".
 *
 * NOTE(review): only the declaration is visible here. Presumably `fixturesDir`
 * is the directory holding fixture files, `dbPath` the results database file,
 * `timeoutMs` a per-tool-call timeout in milliseconds and `concurrency` the
 * number of cases run in parallel - confirm against server.js.
 */
export interface ServerOptions {
|
|
10
|
+
fixturesDir: string;
|
|
11
|
+
dbPath: string;
|
|
12
|
+
timeoutMs: number;
|
|
13
|
+
format: "console" | "json" | "html";
|
|
14
|
+
watch: boolean;
|
|
15
|
+
concurrency?: number;
|
|
16
|
+
}
|
|
17
|
+
/**
 * Check whether the given requestId has been cancelled by the client.
 *
 * @param requestId Identifier of the request to look up.
 * @returns `true` when the request is considered cancelled, `false` otherwise.
 *
 * NOTE(review): only the declaration is visible here; how cancellations are
 * recorded and when they are forgotten should be confirmed in server.js.
 */
|
|
20
|
+
export declare function isCancelled(requestId: string): boolean;
|
|
21
|
+
/**
 * Build an `McpServer` from the given options.
 *
 * @param opts Server configuration.
 * @returns Promise resolving to the `McpServer` instance.
 *
 * NOTE(review): presumably registers the tools, resources and prompts listed
 * in this file's header comment without connecting a transport - confirm.
 */
export declare function createServer(opts: ServerOptions): Promise<McpServer>;
|
|
22
|
+
/**
 * Start the server with the given options.
 *
 * @param opts Server configuration.
 * @returns Promise that resolves with no value.
 *
 * NOTE(review): presumably creates the server and connects it to a transport;
 * what the returned promise's resolution signifies is not visible here - confirm.
 */
export declare function startServer(opts: ServerOptions): Promise<void>;
|