mcp-eval-runner 1.0.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (48) hide show
  1. package/.env.example +39 -0
  2. package/CHANGELOG.md +67 -0
  3. package/LICENSE +21 -0
  4. package/README.md +328 -0
  5. package/dist/assertions.d.ts +63 -0
  6. package/dist/assertions.js +187 -0
  7. package/dist/audit-log.d.ts +26 -0
  8. package/dist/audit-log.js +57 -0
  9. package/dist/auth.d.ts +15 -0
  10. package/dist/auth.js +83 -0
  11. package/dist/db.d.ts +40 -0
  12. package/dist/db.js +94 -0
  13. package/dist/deployment-gate.d.ts +27 -0
  14. package/dist/deployment-gate.js +43 -0
  15. package/dist/fixture-library.d.ts +26 -0
  16. package/dist/fixture-library.js +85 -0
  17. package/dist/fixture.d.ts +87 -0
  18. package/dist/fixture.js +170 -0
  19. package/dist/http-server.d.ts +7 -0
  20. package/dist/http-server.js +34 -0
  21. package/dist/index.d.ts +15 -0
  22. package/dist/index.js +158 -0
  23. package/dist/llm-judge.d.ts +24 -0
  24. package/dist/llm-judge.js +139 -0
  25. package/dist/rate-limiter.d.ts +13 -0
  26. package/dist/rate-limiter.js +36 -0
  27. package/dist/reporter.d.ts +8 -0
  28. package/dist/reporter.js +163 -0
  29. package/dist/runner.d.ts +57 -0
  30. package/dist/runner.js +339 -0
  31. package/dist/server.d.ts +22 -0
  32. package/dist/server.js +583 -0
  33. package/dist/tools/html_report.d.ts +8 -0
  34. package/dist/tools/html_report.js +188 -0
  35. package/dist/tools/manage.d.ts +11 -0
  36. package/dist/tools/manage.js +41 -0
  37. package/dist/tools/report.d.ts +12 -0
  38. package/dist/tools/report.js +120 -0
  39. package/dist/tools/run.d.ts +20 -0
  40. package/dist/tools/run.js +166 -0
  41. package/dist/tools/scaffold.d.ts +11 -0
  42. package/dist/tools/scaffold.js +90 -0
  43. package/evals/reference/mcp-fetch.yaml +46 -0
  44. package/evals/reference/mcp-filesystem.yaml +63 -0
  45. package/evals/reference/mcp-memory.yaml +70 -0
  46. package/evals/reference/step-piping-example.yaml +25 -0
  47. package/evals/smoke.yaml +12 -0
  48. package/package.json +67 -0
package/dist/runner.js ADDED
@@ -0,0 +1,339 @@
1
+ /**
2
+ * Test execution engine for MCP Eval Runner.
3
+ *
4
+ * Supports two execution modes:
5
+ *
6
+ * 1. Live mode (default when server config is present in fixture):
7
+ * Spawns the target MCP server as a child process (stdio transport) or
8
+ * connects via HTTP, calls each step's tool with the provided input, and
9
+ * evaluates assertions against the real response.
10
+ *
11
+ * 2. Simulation mode (fallback when no server config is present):
12
+ * Evaluates assertions against `expected_output` from the fixture.
13
+ * Useful for authoring and CI dry-runs without a running server.
14
+ *
15
+ * Step output piping:
16
+ * Steps can reference the output of a previous step using the
17
+ * `{{steps.<step_id>.output}}` placeholder in their `input` values.
18
+ */
19
+ import crypto from "crypto";
20
+ import { spawn } from "child_process";
21
+ import { evaluateAllAssertions, evaluateAllAssertionsAsync, } from "./assertions.js";
22
+ /**
23
+ * Resolve `{{steps.<id>.output}}` placeholders in input values.
24
+ */
25
+ function resolveInputPlaceholders(input, context) {
26
+ const resolved = {};
27
+ for (const [key, value] of Object.entries(input)) {
28
+ if (typeof value === "string") {
29
+ resolved[key] = value.replace(/\{\{steps\.([^}]+)\.output\}\}/g, (_match, stepId) => {
30
+ return context.get(stepId) ?? "";
31
+ });
32
+ }
33
+ else {
34
+ resolved[key] = value;
35
+ }
36
+ }
37
+ return resolved;
38
+ }
39
/**
 * Minimal MCP stdio client.
 *
 * Spawns the server command and exchanges newline-delimited JSON-RPC 2.0
 * messages with it over stdin/stdout. The `initialize` handshake is started
 * from the constructor; `callTool` waits for it before sending anything.
 *
 * Transport failures (spawn error, stdin error, process exit) are recorded
 * in `failure` and reject every in-flight request, so callers get an error
 * instead of an uncaught `'error'` event or a promise that never settles.
 */
class McpStdioClient {
    proc;
    buffer = "";
    pending = new Map();
    nextId = 1;
    ready = false;
    initPromise;
    /** First fatal transport error; stays undefined while the transport is healthy. */
    failure;
    /**
     * @param command Executable to spawn.
     * @param args    Arguments passed to the executable.
     * @param env     Extra environment variables, merged over `process.env`.
     */
    constructor(command, args, env) {
        this.proc = spawn(command, args, {
            env: { ...process.env, ...env },
            stdio: ["pipe", "pipe", "pipe"],
        });
        // Without these listeners a failed spawn (e.g. ENOENT) or a write to a
        // dead server surfaces as an unhandled 'error' event and kills the host.
        this.proc.on("error", (err) => {
            this.failAll(new Error(`MCP server process error: ${err.message}`));
        });
        this.proc.stdin.on("error", (err) => {
            this.failAll(new Error(`MCP server stdin error: ${err.message}`));
        });
        // 'close' fires after stdout has been drained, so a response written
        // just before the server exits is still delivered first.
        this.proc.on("close", (code, signal) => {
            this.failAll(new Error(`MCP server exited (code ${code}, signal ${signal})`));
        });
        // Let the stream decode UTF-8 so a multi-byte character split across
        // two chunks is not corrupted.
        this.proc.stdout.setEncoding("utf8");
        this.proc.stdout.on("data", (chunk) => {
            this.buffer += chunk;
            this.flushBuffer();
        });
        this.proc.stderr.on("data", () => {
            // Ignore stderr from the server under test
        });
        this.initPromise = this.initialize();
        // callTool re-awaits this promise; the no-op handler only prevents an
        // unhandled-rejection report when init fails before anyone awaits it.
        this.initPromise.catch(() => { });
    }
    /**
     * Record a fatal transport error and reject every in-flight request.
     * Only the first error is kept; later calls reuse it.
     */
    failAll(err) {
        if (!this.failure) {
            this.failure = err;
        }
        for (const handler of this.pending.values()) {
            handler.reject(this.failure);
        }
        this.pending.clear();
    }
    /**
     * Split the receive buffer into complete lines and resolve the pending
     * request matching each response. The trailing partial line is kept for
     * the next chunk.
     */
    flushBuffer() {
        const lines = this.buffer.split("\n");
        this.buffer = lines.pop() ?? "";
        for (const line of lines) {
            const trimmed = line.trim();
            if (!trimmed)
                continue;
            let msg;
            try {
                msg = JSON.parse(trimmed);
            }
            catch {
                // Non-JSON line from server — ignore
                continue;
            }
            // Messages carrying `method` are requests/notifications sent by the
            // server; their ids live in the server's id space and must not be
            // mistaken for responses to our requests.
            if (msg === null || typeof msg !== "object" || "method" in msg)
                continue;
            const handler = this.pending.get(msg.id);
            if (handler) {
                this.pending.delete(msg.id);
                handler.resolve(msg);
            }
        }
    }
    /**
     * Write one JSON-RPC request and return a promise for its response.
     * Rejects immediately if the transport has already failed.
     */
    send(req) {
        return new Promise((resolve, reject) => {
            if (this.failure) {
                reject(this.failure);
                return;
            }
            this.pending.set(req.id, { resolve, reject });
            try {
                this.proc.stdin.write(JSON.stringify(req) + "\n");
            }
            catch (err) {
                this.pending.delete(req.id);
                reject(err);
            }
        });
    }
    /**
     * Perform the MCP `initialize` handshake and mark the client ready.
     * @throws If the transport fails or the server answers with an error.
     */
    async initialize() {
        const initReq = {
            jsonrpc: "2.0",
            id: this.nextId++,
            method: "initialize",
            params: {
                protocolVersion: "2024-11-05",
                capabilities: {},
                clientInfo: { name: "mcp-eval-runner", version: "1.0.0" },
            },
        };
        const response = await this.send(initReq);
        if (response.error) {
            throw new Error(`MCP error ${response.error.code}: ${response.error.message}`);
        }
        // Send initialized notification (no response expected)
        this.proc.stdin.write(JSON.stringify({ jsonrpc: "2.0", method: "notifications/initialized", params: {} }) + "\n");
        this.ready = true;
    }
    /**
     * Call a tool on the server and return its textual output.
     *
     * `timeoutMs` bounds the tool call only; the wait for the handshake ends
     * when the server answers, fails to start, or exits.
     *
     * @param toolName  Name of the tool to invoke.
     * @param toolInput Arguments object sent as `params.arguments`.
     * @param timeoutMs Maximum time to wait for the tool response.
     * @returns The `text` parts of `result.content` joined with newlines, or
     *          the JSON-serialised result when there is no content array.
     * @throws On handshake/transport failure, timeout, or a JSON-RPC error.
     */
    async callTool(toolName, toolInput, timeoutMs) {
        await this.initPromise;
        if (!this.ready)
            throw new Error("MCP client not initialized");
        const req = {
            jsonrpc: "2.0",
            id: this.nextId++,
            method: "tools/call",
            params: { name: toolName, arguments: toolInput },
        };
        const responsePromise = this.send(req);
        let timer;
        const timeoutPromise = new Promise((_, reject) => {
            timer = setTimeout(() => {
                // Forget the request so a late response is simply dropped.
                this.pending.delete(req.id);
                reject(new Error(`Tool call timed out after ${timeoutMs}ms`));
            }, timeoutMs);
        });
        let response;
        try {
            response = await Promise.race([responsePromise, timeoutPromise]);
        }
        finally {
            // A live timer would keep the event loop running until it fires.
            clearTimeout(timer);
        }
        if (response.error) {
            throw new Error(`MCP error ${response.error.code}: ${response.error.message}`);
        }
        // Extract text content from the result
        const result = response.result;
        if (Array.isArray(result?.content)) {
            return result.content
                .filter((c) => c.type === "text")
                .map((c) => c.text ?? "")
                .join("\n");
        }
        // JSON.stringify(undefined) is undefined; always hand back a string.
        return JSON.stringify(result) ?? "";
    }
    /** Close stdin and kill the server process; errors are ignored. */
    close() {
        try {
            this.proc.stdin.end();
            this.proc.kill();
        }
        catch {
            // best-effort
        }
    }
}
142
+ // ── Live step execution ───────────────────────────────────────────────────────
143
/**
 * Run one fixture step against a live server.
 *
 * Resolves placeholders in the step input, calls the tool through `client`,
 * and, when the step declares `expect`, evaluates those assertions against
 * the tool output and the measured latency.
 *
 * @param step      Fixture step (`id`, `tool`, `input`, optional `expect`).
 * @param client    Client used to call the tool.
 * @param timeoutMs Timeout forwarded to `client.callTool`.
 * @param context   Outputs of earlier steps, keyed by step id.
 * @returns A step result with `mode: "live"`. A failed tool call yields
 *          status "error", empty output, and no assertions.
 */
async function executeStepLive(step, client, timeoutMs, context) {
    const startedAt = Date.now();
    const toolArgs = resolveInputPlaceholders(step.input, context);
    // Both outcomes share one result shape; only these five fields vary.
    const toResult = (status, duration_ms, output, assertions, error) => ({
        step_id: step.id,
        tool: step.tool,
        status,
        duration_ms,
        output,
        assertions,
        error,
        mode: "live",
    });
    let output;
    try {
        output = await client.callTool(step.tool, toolArgs, timeoutMs);
    }
    catch (err) {
        const message = err instanceof Error ? err.message : String(err);
        return toResult("error", Date.now() - startedAt, "", [], message);
    }
    // Latency is measured before assertions run so they do not inflate it.
    const duration_ms = Date.now() - startedAt;
    if (!step.expect) {
        return toResult("pass", duration_ms, output, [], undefined);
    }
    const verdict = await evaluateAllAssertionsAsync(step.expect, {
        tool: step.tool,
        output,
        latency_ms: duration_ms,
    });
    const status = verdict.passed ? "pass" : "fail";
    return toResult(status, duration_ms, output, [...verdict.results], undefined);
}
191
+ // ── Simulation step execution ─────────────────────────────────────────────────
192
/**
 * Evaluate one fixture step without contacting a server.
 *
 * The step's `expected_output` (empty string when absent) stands in for the
 * tool output, and a random latency of 1–10 ms is reported to assertions.
 *
 * @param step    Fixture step (`id`, `tool`, `input`, optional `expect` and
 *                `expected_output`).
 * @param context Outputs of earlier steps, keyed by step id.
 * @returns A step result with `mode: "simulation"`.
 */
function simulateStep(step, context) {
    const startedAt = Date.now();
    const simulatedLatency = 1 + Math.floor(Math.random() * 10);
    // The resolved input is not used by the simulation; the call is kept so
    // the input is still traversed here, and its return value is discarded.
    resolveInputPlaceholders(step.input, context);
    const output = step.expected_output ?? "";
    const assertionResults = [];
    let status = "pass";
    if (step.expect) {
        const verdict = evaluateAllAssertions(step.expect, {
            tool: step.tool,
            output,
            latency_ms: simulatedLatency,
        });
        assertionResults.push(...verdict.results);
        if (!verdict.passed) {
            status = "fail";
        }
    }
    // Reported duration is real elapsed time plus the simulated latency.
    const elapsed = Date.now() - startedAt;
    return {
        step_id: step.id,
        tool: step.tool,
        status,
        duration_ms: elapsed + simulatedLatency,
        output,
        assertions: assertionResults,
        mode: "simulation",
    };
}
222
+ // ── Case runner ───────────────────────────────────────────────────────────────
223
/**
 * Run every step of one fixture and summarise the outcome.
 *
 * When `fixture.server.command` is set, a stdio client is spawned and steps
 * run live; otherwise every step is simulated. Each step's output is stored
 * under its id so later steps can reference it. In live mode the first step
 * with status "error" stops the case.
 *
 * @param fixture Fixture with `name`, `steps`, and optional `server` config.
 * @param options Run options; `timeoutMs` is forwarded to live steps.
 * @returns Case result with `case_name`, `status` ("pass" | "fail" |
 *          "error"), `duration_ms`, `steps`, and `error` on failure paths.
 */
export async function runCase(fixture, options) {
    const startedAt = Date.now();
    const describe = (err) => (err instanceof Error ? err.message : String(err));
    const stepResults = [];
    const context = new Map();
    // Determine if live execution is possible
    let client = null;
    const serverConfig = fixture.server;
    if (serverConfig?.command) {
        try {
            client = new McpStdioClient(serverConfig.command, serverConfig.args ?? [], serverConfig.env);
        }
        catch (err) {
            const duration_ms = Date.now() - startedAt;
            return {
                case_name: fixture.name,
                status: "error",
                duration_ms,
                steps: [],
                error: `Failed to start server: ${describe(err)}`,
            };
        }
    }
    try {
        for (const step of fixture.steps) {
            const stepResult = client
                ? await executeStepLive(step, client, options.timeoutMs, context)
                : simulateStep(step, context);
            stepResults.push(stepResult);
            // Store output for downstream step piping
            context.set(step.id, stepResult.output);
            // Stop on first error in live mode to avoid cascading failures
            if (client && stepResult.status === "error") {
                break;
            }
        }
    }
    catch (err) {
        const duration_ms = Date.now() - startedAt;
        client?.close();
        return {
            case_name: fixture.name,
            status: "error",
            duration_ms,
            steps: stepResults,
            error: describe(err),
        };
    }
    client?.close();
    const duration_ms = Date.now() - startedAt;
    // "error" outranks "fail"; "pass" requires every step to have passed.
    let status = "pass";
    if (stepResults.some((s) => s.status === "error")) {
        status = "error";
    }
    else if (!stepResults.every((s) => s.status === "pass")) {
        status = "fail";
    }
    return { case_name: fixture.name, status, duration_ms, steps: stepResults };
}
284
+ // ── Suite runner ──────────────────────────────────────────────────────────────
285
+ export async function runSuite(fixtures, suiteName, options, db) {
286
+ const run_id = crypto.randomUUID();
287
+ const started_at = Date.now();
288
+ const concurrency = Math.max(1, options.concurrency ?? 1);
289
+ db.insertRun({
290
+ id: run_id,
291
+ suite_name: suiteName,
292
+ started_at,
293
+ ended_at: null,
294
+ total_cases: fixtures.length,
295
+ passed: 0,
296
+ failed: 0,
297
+ format: options.format,
298
+ });
299
+ const caseResults = new Array(fixtures.length);
300
+ let passed = 0;
301
+ let failed = 0;
302
+ // Process in batches of `concurrency`
303
+ for (let i = 0; i < fixtures.length; i += concurrency) {
304
+ const batch = fixtures.slice(i, i + concurrency);
305
+ const batchResults = await Promise.all(batch.map((fixture) => runCase(fixture, options)));
306
+ for (let j = 0; j < batchResults.length; j++) {
307
+ const caseResult = batchResults[j];
308
+ caseResults[i + j] = caseResult;
309
+ if (caseResult.status === "pass") {
310
+ passed++;
311
+ }
312
+ else {
313
+ failed++;
314
+ }
315
+ db.insertCaseResult({
316
+ id: crypto.randomUUID(),
317
+ run_id,
318
+ case_name: caseResult.case_name,
319
+ status: caseResult.status,
320
+ duration_ms: caseResult.duration_ms,
321
+ error_message: caseResult.error ?? null,
322
+ assertions_json: JSON.stringify(caseResult.steps.map((s) => s.assertions)),
323
+ created_at: Date.now(),
324
+ });
325
+ }
326
+ }
327
+ const ended_at = Date.now();
328
+ db.updateRun(run_id, { ended_at, total_cases: fixtures.length, passed, failed });
329
+ return {
330
+ run_id,
331
+ suite_name: suiteName,
332
+ started_at,
333
+ ended_at,
334
+ total_cases: fixtures.length,
335
+ passed,
336
+ failed,
337
+ cases: caseResults,
338
+ };
339
+ }
@@ -0,0 +1,22 @@
1
+ /**
2
+ * MCP Server for mcp-eval-runner.
3
+ * Exposes tools: run_suite, run_case, list_cases, create_test_case,
4
+ * regression_report, compare_results, generate_html_report, scaffold_fixture.
5
+ * Exposes resources: eval://{fixture_name}
6
+ * Exposes prompts: write-test-case
7
+ */
8
+ import { McpServer } from "@modelcontextprotocol/sdk/server/mcp.js";
9
/**
 * Options accepted by `createServer` and `startServer`.
 *
 * NOTE(review): the per-field notes below are read off the field names and
 * types only; confirm them against the server implementation.
 */
export interface ServerOptions {
    /** Presumably the directory fixtures are read from — confirm. */
    fixturesDir: string;
    /** Presumably the path of the results database — confirm. */
    dbPath: string;
    /** A duration in milliseconds; presumably the per-call timeout — confirm. */
    timeoutMs: number;
    /** Output format; one of the three listed literals. */
    format: "console" | "json" | "html";
    /** Presumably enables watching fixtures for changes — confirm. */
    watch: boolean;
    /** Optional; presumably how many cases may run at once — confirm. */
    concurrency?: number;
}
17
/**
 * Check whether the given requestId has been cancelled by the client.
 *
 * @param requestId Identifier of the request to look up.
 * @returns `true` when the request has been cancelled, `false` otherwise.
 */
export declare function isCancelled(requestId: string): boolean;
/**
 * Asynchronously produce an `McpServer` for the given options.
 *
 * @param opts Server options.
 * @returns A promise for the server instance.
 */
export declare function createServer(opts: ServerOptions): Promise<McpServer>;
/**
 * Start the server for the given options.
 *
 * @param opts Server options.
 * @returns A promise with no value. NOTE(review): when it settles (after
 *          startup vs. after shutdown) is not visible here — confirm.
 */
export declare function startServer(opts: ServerOptions): Promise<void>;