mcp-eval-runner 1.0.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (48)
  1. package/.env.example +39 -0
  2. package/CHANGELOG.md +67 -0
  3. package/LICENSE +21 -0
  4. package/README.md +328 -0
  5. package/dist/assertions.d.ts +63 -0
  6. package/dist/assertions.js +187 -0
  7. package/dist/audit-log.d.ts +26 -0
  8. package/dist/audit-log.js +57 -0
  9. package/dist/auth.d.ts +15 -0
  10. package/dist/auth.js +83 -0
  11. package/dist/db.d.ts +40 -0
  12. package/dist/db.js +94 -0
  13. package/dist/deployment-gate.d.ts +27 -0
  14. package/dist/deployment-gate.js +43 -0
  15. package/dist/fixture-library.d.ts +26 -0
  16. package/dist/fixture-library.js +85 -0
  17. package/dist/fixture.d.ts +87 -0
  18. package/dist/fixture.js +170 -0
  19. package/dist/http-server.d.ts +7 -0
  20. package/dist/http-server.js +34 -0
  21. package/dist/index.d.ts +15 -0
  22. package/dist/index.js +158 -0
  23. package/dist/llm-judge.d.ts +24 -0
  24. package/dist/llm-judge.js +139 -0
  25. package/dist/rate-limiter.d.ts +13 -0
  26. package/dist/rate-limiter.js +36 -0
  27. package/dist/reporter.d.ts +8 -0
  28. package/dist/reporter.js +163 -0
  29. package/dist/runner.d.ts +57 -0
  30. package/dist/runner.js +339 -0
  31. package/dist/server.d.ts +22 -0
  32. package/dist/server.js +583 -0
  33. package/dist/tools/html_report.d.ts +8 -0
  34. package/dist/tools/html_report.js +188 -0
  35. package/dist/tools/manage.d.ts +11 -0
  36. package/dist/tools/manage.js +41 -0
  37. package/dist/tools/report.d.ts +12 -0
  38. package/dist/tools/report.js +120 -0
  39. package/dist/tools/run.d.ts +20 -0
  40. package/dist/tools/run.js +166 -0
  41. package/dist/tools/scaffold.d.ts +11 -0
  42. package/dist/tools/scaffold.js +90 -0
  43. package/evals/reference/mcp-fetch.yaml +46 -0
  44. package/evals/reference/mcp-filesystem.yaml +63 -0
  45. package/evals/reference/mcp-memory.yaml +70 -0
  46. package/evals/reference/step-piping-example.yaml +25 -0
  47. package/evals/smoke.yaml +12 -0
  48. package/package.json +67 -0
@@ -0,0 +1,139 @@
1
+ /**
2
+ * LLM-as-judge assertion for mcp-eval-runner.
3
+ * Calls an external LLM API via HTTP POST to score semantic similarity
4
+ * between actual and expected outputs.
5
+ *
6
+ * Credentials: LLM_JUDGE_API_KEY + LLM_JUDGE_BASE_URL env vars.
7
+ * Assertion type: llm_judge with prompt_template, min_score, model fields.
8
+ */
9
+ import https from "https";
10
+ import http from "http";
11
/**
 * Send `body` as JSON in an HTTP/HTTPS POST request and resolve with the
 * parsed JSON response.
 *
 * The transport is selected from the URL protocol: `https:` uses the https
 * module, anything else uses http. The HTTP status code is not inspected;
 * any response whose body parses as JSON resolves the promise.
 *
 * @param url - Absolute endpoint URL.
 * @param body - Value serialized with JSON.stringify as the request body.
 * @param headers - Extra request headers; they are spread last, so they
 *   override the default Content-Type / Content-Length.
 * @param timeoutMs - Socket inactivity timeout in milliseconds
 *   (default: 60000). Pass 0 to disable the timeout.
 * @returns Promise resolving to the JSON-decoded response body. It rejects
 *   when the URL is invalid, the request or response stream errors, the
 *   timeout elapses, or the body is not valid JSON.
 */
function httpPost(url, body, headers, timeoutMs = 60000) {
    return new Promise((resolve, reject) => {
        const target = new URL(url);
        const isHttps = target.protocol === "https:";
        const transport = isHttps ? https : http;
        const payload = JSON.stringify(body);
        const options = {
            hostname: target.hostname,
            port: target.port || (isHttps ? 443 : 80),
            path: target.pathname + (target.search || ""),
            method: "POST",
            headers: {
                "Content-Type": "application/json",
                // Byte length, not string length: the payload may contain multi-byte characters.
                "Content-Length": Buffer.byteLength(payload).toString(),
                ...headers,
            },
        };
        const req = transport.request(options, (res) => {
            const chunks = [];
            res.on("data", (chunk) => chunks.push(chunk));
            // An aborted response emits "error" and never "end"; without this
            // handler the promise would stay pending forever.
            res.on("error", reject);
            res.on("end", () => {
                try {
                    const text = Buffer.concat(chunks).toString("utf-8");
                    resolve(JSON.parse(text));
                }
                catch (err) {
                    reject(new Error(`Failed to parse LLM API response: ${err}`));
                }
            });
        });
        req.on("error", reject);
        if (timeoutMs > 0) {
            // Destroying the request with an error surfaces it through the
            // "error" listener above, which rejects the promise.
            req.setTimeout(timeoutMs, () => {
                req.destroy(new Error(`LLM API request timed out after ${timeoutMs}ms`));
            });
        }
        req.write(payload);
        req.end();
    });
}
50
/**
 * Run an LLM-as-judge assertion.
 *
 * Renders the prompt template by substituting the {actual} and {expected}
 * placeholders, POSTs it to `${LLM_JUDGE_BASE_URL}/chat/completions`, and
 * extracts a score from the first choice's message content. The score is
 * taken from an embedded JSON object with a numeric "score" field, or else
 * from a leading plain number. It is clamped to [0, 1] and compared with
 * `assertion.min_score`.
 *
 * This function does not throw for configuration, transport, or API
 * problems; those are reported as a failed assertion result.
 *
 * @param assertion - Object providing `prompt_template`, `model` and `min_score`.
 * @param actual - Text substituted for every {actual} placeholder.
 * @param expected - Text substituted for every {expected} placeholder.
 * @returns Promise of `{ type: "llm_judge", passed, message }`.
 */
export async function runLlmJudge(assertion, actual, expected) {
    // Every non-passing outcome shares this result shape.
    const failure = (message) => ({ type: "llm_judge", passed: false, message });
    const apiKey = process.env.LLM_JUDGE_API_KEY;
    const baseUrl = process.env.LLM_JUDGE_BASE_URL;
    if (!apiKey || !baseUrl) {
        return failure("LLM judge not configured: missing LLM_JUDGE_API_KEY or LLM_JUDGE_BASE_URL env vars");
    }
    // Substitute both placeholders in a single pass using a replacer function.
    // A string replacement would interpret "$&", "$1", "$$" inside the
    // substituted text, and a second pass would rewrite any "{expected}" that
    // happens to appear inside `actual`.
    const prompt = assertion.prompt_template.replace(/\{(actual|expected)\}/g, (_placeholder, name) => (name === "actual" ? actual : expected));
    const requestBody = {
        model: assertion.model,
        messages: [{ role: "user", content: prompt }],
        max_tokens: 256,
    };
    // Tolerate a single trailing slash on the configured base URL.
    const endpoint = baseUrl.replace(/\/$/, "") + "/chat/completions";
    let response;
    try {
        response = await httpPost(endpoint, requestBody, {
            Authorization: `Bearer ${apiKey}`,
        });
    }
    catch (err) {
        return failure(`LLM judge HTTP error: ${err instanceof Error ? err.message : String(err)}`);
    }
    // A body such as `null` or a bare number is valid JSON but not a usable reply.
    if (response === null || typeof response !== "object") {
        return failure(`LLM judge API error: unexpected response ${JSON.stringify(response)}`);
    }
    if (response.error) {
        // The error may be an object with a message or a plain string.
        const detail = typeof response.error === "string"
            ? response.error
            : response.error.message ?? JSON.stringify(response.error);
        return failure(`LLM judge API error: ${detail}`);
    }
    const rawContent = response.choices?.[0]?.message?.content;
    // Non-string content is treated as empty so the string operations below cannot throw.
    const content = typeof rawContent === "string" ? rawContent : "";
    let score;
    // First preference: a JSON object with a "score" field, possibly embedded in prose.
    try {
        const jsonMatch = content.match(/\{[^}]*"score"\s*:\s*([0-9.]+)[^}]*\}/);
        if (jsonMatch) {
            const parsed = JSON.parse(jsonMatch[0]);
            if (typeof parsed.score === "number") {
                score = parsed.score;
            }
        }
    }
    catch {
        // The matched fragment was not valid JSON; fall through to numeric parsing.
    }
    // Fallback: content that begins with a plain number.
    if (score === undefined) {
        const num = parseFloat(content.trim());
        if (!isNaN(num)) {
            score = num;
        }
    }
    if (score === undefined) {
        return failure(`LLM judge could not extract a score from response: "${content}"`);
    }
    // Clamp score to [0, 1]
    const clampedScore = Math.max(0, Math.min(1, score));
    const passed = clampedScore >= assertion.min_score;
    return {
        type: "llm_judge",
        passed,
        message: passed
            ? `LLM judge score ${clampedScore.toFixed(3)} >= threshold ${assertion.min_score}`
            : `LLM judge score ${clampedScore.toFixed(3)} < threshold ${assertion.min_score}`,
    };
}
@@ -0,0 +1,13 @@
1
+ /**
2
+ * Sliding window in-memory rate limiter middleware for mcp-eval-runner.
3
+ * Tracks requests per IP address or API key (X-API-Key header).
4
+ * Returns 429 Too Many Requests when the limit is exceeded.
5
+ */
6
+ import type { RequestHandler } from "express";
7
/**
 * Create a sliding window rate limiter Express middleware.
 *
 * Requests are tracked per client, identified by the X-API-Key header or the
 * IP address (see the module header above).
 *
 * @param maxRequests - Maximum number of requests allowed per window (default: 60)
 * @param windowMs - Window duration in milliseconds (default: 60000 = 60s)
 * @returns An Express request handler; per the module header it responds with
 *   429 Too Many Requests once a client exceeds the limit.
 */
export declare function createRateLimiter(maxRequests?: number, windowMs?: number): RequestHandler;
@@ -0,0 +1,36 @@
1
+ /**
2
+ * Sliding window in-memory rate limiter middleware for mcp-eval-runner.
3
+ * Tracks requests per IP address or API key (X-API-Key header).
4
+ * Returns 429 Too Many Requests when the limit is exceeded.
5
+ */
6
/**
 * Create a sliding window rate limiter Express middleware.
 *
 * Each client gets its own list of request timestamps, held in memory for the
 * lifetime of the returned middleware. A client is identified by the first
 * non-empty X-API-Key header value, otherwise by `req.ip`, otherwise by the
 * literal "unknown". API-key and IP buckets are kept in separate namespaces,
 * so a spoofed header equal to another client's IP cannot consume that
 * client's quota. Idle clients are evicted at most once per window so the
 * store does not grow without bound.
 *
 * NOTE(review): the X-API-Key value is not authenticated here, so a caller
 * can obtain a fresh bucket by rotating the header value. Confirm that
 * authentication runs before this middleware.
 *
 * @param maxRequests - Maximum number of requests allowed per window (default: 60)
 * @param windowMs - Window duration in milliseconds (default: 60000 = 60s)
 * @returns Middleware that calls `next()` for allowed requests and responds
 *   with HTTP 429 and a JSON error body when the limit is exceeded.
 */
export function createRateLimiter(maxRequests = 60, windowMs = 60000) {
    const store = new Map();
    let lastSweep = Date.now();
    return (req, res, next) => {
        const now = Date.now();
        const windowStart = now - windowMs;
        // Evict clients whose newest request has left the window. An evicted
        // entry is equivalent to a missing one, so limiting is unaffected.
        if (now - lastSweep >= windowMs) {
            for (const [key, seen] of store) {
                const newest = seen.timestamps[seen.timestamps.length - 1];
                if (newest === undefined || newest <= windowStart) {
                    store.delete(key);
                }
            }
            lastSweep = now;
        }
        // Identify the client by API key or IP address. An empty header counts
        // as absent so that empty keys do not all share one bucket.
        const header = req.headers["x-api-key"];
        const apiKey = Array.isArray(header) ? header[0] : header;
        const clientKey = apiKey ? `key:${apiKey}` : `ip:${req.ip || "unknown"}`;
        // Get or create the window entry for this client
        let entry = store.get(clientKey);
        if (!entry) {
            entry = { timestamps: [] };
            store.set(clientKey, entry);
        }
        // Prune timestamps outside the current window
        entry.timestamps = entry.timestamps.filter((ts) => ts > windowStart);
        if (entry.timestamps.length >= maxRequests) {
            res.status(429).json({ error: "Too Many Requests: rate limit exceeded" });
            return;
        }
        // Record this request
        entry.timestamps.push(now);
        next();
    };
}
@@ -0,0 +1,8 @@
1
+ /**
2
+ * Console/JSON/HTML output formatting for MCP Eval Runner.
3
+ */
4
+ import type { SuiteRunResult } from "./runner.js";
5
/** Render a suite run as ANSI-colored, human-readable text for a terminal. */
export declare function formatConsole(result: SuiteRunResult): string;
/** Serialize a suite run as JSON. */
export declare function formatJson(result: SuiteRunResult): string;
/** Render a suite run as an HTML report. */
export declare function formatHtml(result: SuiteRunResult): string;
/**
 * Format a suite run in the requested output format.
 *
 * @param result - The suite run to format.
 * @param format - Which formatter to use: "console", "json" or "html".
 */
export declare function formatResult(result: SuiteRunResult, format: "console" | "json" | "html"): string;
@@ -0,0 +1,163 @@
1
+ /**
2
+ * Console/JSON/HTML output formatting for MCP Eval Runner.
3
+ */
4
// ANSI SGR escape sequences used to style terminal output
const RESET = "\x1b[0m";
const GREEN = "\x1b[32m";
const RED = "\x1b[31m";
const YELLOW = "\x1b[33m";
const BOLD = "\x1b[1m";
const DIM = "\x1b[2m";
/** Prefix `text` with an SGR code and append the reset sequence. */
function sgr(code, text) {
    return `${code}${text}${RESET}`;
}
/** Green text, used for successes. */
function pass(s) {
    return sgr(GREEN, s);
}
/** Red text, used for failures. */
function fail(s) {
    return sgr(RED, s);
}
/** Yellow text, used for warnings. */
function warn(s) {
    return sgr(YELLOW, s);
}
/** Bold text. */
function bold(s) {
    return sgr(BOLD, s);
}
/** Dimmed text, used for secondary details. */
function dim(s) {
    return sgr(DIM, s);
}
26
/**
 * Map a run status to its colored one-character icon.
 *
 * @param status - "pass", "fail" or "error".
 * @returns The colored icon, or undefined for any other status value.
 */
function statusIcon(status) {
    if (status === "pass") {
        return pass("✓");
    }
    if (status === "fail") {
        return fail("✗");
    }
    if (status === "error") {
        return warn("!");
    }
    return undefined;
}
36
/**
 * Render assertion results, one per line.
 *
 * Passed assertions get a green tick and a dimmed message; failed ones get a
 * red cross and a red message.
 *
 * @param assertions - Assertion results providing `passed`, `type` and `message`.
 * @param indent - String prepended to every line.
 * @returns The lines joined with "\n" (an empty string for no assertions).
 */
function formatAssertions(assertions, indent) {
    const rendered = [];
    for (const assertion of assertions) {
        let marker;
        let text;
        if (assertion.passed) {
            marker = pass(" ✓");
            text = dim(assertion.message);
        }
        else {
            marker = fail(" ✗");
            text = fail(assertion.message);
        }
        rendered.push(`${indent}${marker} [${assertion.type}] ${text}`);
    }
    return rendered.join("\n");
}
44
/**
 * Render one step result: a header line, then its assertions (if any), then
 * its error (if set).
 *
 * @param step - Step result providing `status`, `step_id`, `tool`,
 *   `duration_ms`, `assertions` and optionally `error`.
 * @param indent - String prepended to the header line; nested lines are
 *   indented one space further.
 * @returns The lines joined with "\n".
 */
function formatStep(step, indent = " ") {
    const timing = dim(`${step.duration_ms}ms`);
    const header = `${indent}${statusIcon(step.status)} step:${step.step_id} (tool: ${step.tool}) ${timing}`;
    const output = [header];
    const hasAssertions = step.assertions.length > 0;
    if (hasAssertions) {
        output.push(formatAssertions(step.assertions, indent + " "));
    }
    if (step.error) {
        const errorText = fail("Error: " + step.error);
        output.push(`${indent} ${errorText}`);
    }
    return output.join("\n");
}
57
/**
 * Render one case result: a header line, one block per step, then the case
 * error (if set).
 *
 * @param c - Case result providing `status`, `case_name`, `duration_ms`,
 *   `steps` and optionally `error`.
 * @returns The lines joined with "\n".
 */
function formatCase(c) {
    const header = ` ${statusIcon(c.status)} ${bold(c.case_name)} ${dim(`(${c.duration_ms}ms)`)}`;
    // Call formatStep with the step only, so the default indent always applies
    // (passing formatStep straight to map would hand it the index as indent).
    const stepBlocks = c.steps.map((step) => formatStep(step));
    const output = [header, ...stepBlocks];
    if (c.error) {
        output.push(` ${fail("Error: " + c.error)}`);
    }
    return output.join("\n");
}
68
/**
 * Render a whole suite run as ANSI-colored text for a terminal.
 *
 * The output is a header (suite name, run ID, start time and duration), one
 * block per case, and a summary line, with blank lines between sections.
 *
 * @param result - Suite result providing `suite_name`, `run_id`,
 *   `started_at`, `ended_at`, `cases`, `passed`, `failed` and `total_cases`.
 * @returns The report text, beginning and ending with a newline.
 */
export function formatConsole(result) {
    const startedIso = new Date(result.started_at).toISOString();
    const elapsedMs = result.ended_at - result.started_at;
    let summary;
    if (result.failed === 0) {
        summary = pass(`✓ ${result.passed}/${result.total_cases} passed`);
    }
    else {
        summary = fail(`✗ ${result.failed}/${result.total_cases} failed`);
        // Mention the passed count only when at least one case passed.
        if (result.passed > 0) {
            summary += `, ${pass(String(result.passed))} passed`;
        }
    }
    const output = [
        "",
        bold(`MCP Eval Runner — Suite: ${result.suite_name}`),
        dim(`Run ID: ${result.run_id}`),
        dim(`Started: ${startedIso} Duration: ${elapsedMs}ms`),
        "",
        ...result.cases.map((c) => formatCase(c)),
        "",
        bold("Summary: ") + summary,
        "",
    ];
    return output.join("\n");
}
87
/**
 * Serialize a suite run as pretty-printed JSON.
 *
 * @param result - The suite result to serialize.
 * @returns JSON text indented with two spaces.
 */
export function formatJson(result) {
    const indentWidth = 2;
    const serialized = JSON.stringify(result, undefined, indentWidth);
    return serialized;
}
90
/**
 * Render a suite run as a standalone HTML document.
 *
 * The report has a summary header followed by a table holding one row per
 * case and, beneath it, one row per step with that step's assertions. All
 * free-text fields (suite name, run ID, case name, step ID, tool, assertion
 * type and message) are passed through escapeHtml before interpolation.
 *
 * @param result - Suite result providing `suite_name`, `run_id`,
 *   `started_at`, `ended_at`, `cases`, `passed`, `failed` and `total_cases`.
 * @returns The complete HTML document as a string.
 */
export function formatHtml(result) {
    const rows = result.cases
        .map((c) => {
            // Cases use three colors: green for pass, red for fail, amber otherwise.
            const color = c.status === "pass" ? "#22c55e" : c.status === "fail" ? "#ef4444" : "#f59e0b";
            const stepsHtml = c.steps
                .map((s) => {
                    // Steps use two colors only: anything other than "pass" is red.
                    const sc = s.status === "pass" ? "#22c55e" : "#ef4444";
                    const assertionsHtml = s.assertions
                        // The assertion type is escaped like the message, so
                        // markup in a fixture cannot reach the report unescaped.
                        .map((a) => `<li style="color:${a.passed ? "#22c55e" : "#ef4444"}">${escapeHtml(a.type)}: ${escapeHtml(a.message)}</li>`)
                        .join("");
                    return `
<tr>
<td style="padding-left:2em;color:${sc}">${escapeHtml(s.step_id)}</td>
<td>${escapeHtml(s.tool)}</td>
<td style="color:${sc}">${s.status}</td>
<td>${s.duration_ms}ms</td>
<td><ul>${assertionsHtml}</ul></td>
</tr>`;
                })
                .join("");
            return `
<tr>
<td colspan="5" style="font-weight:bold;color:${color}">${escapeHtml(c.case_name)} — ${c.status} (${c.duration_ms}ms)</td>
</tr>
${stepsHtml}`;
        })
        .join("");
    return `<!DOCTYPE html>
<html lang="en">
<head>
<meta charset="UTF-8">
<title>MCP Eval Runner Report</title>
<style>
body { font-family: monospace; background: #111; color: #eee; padding: 2em; }
table { border-collapse: collapse; width: 100%; }
th, td { text-align: left; padding: 4px 8px; border-bottom: 1px solid #333; }
th { background: #222; }
.pass { color: #22c55e; }
.fail { color: #ef4444; }
</style>
</head>
<body>
<h1>MCP Eval Runner</h1>
<p>Suite: <strong>${escapeHtml(result.suite_name)}</strong> | Run ID: ${escapeHtml(result.run_id)}</p>
<p>Started: ${new Date(result.started_at).toISOString()} | Duration: ${result.ended_at - result.started_at}ms</p>
<p class="${result.failed === 0 ? "pass" : "fail"}">
${result.passed}/${result.total_cases} passed, ${result.failed} failed
</p>
<table>
<thead>
<tr><th>Case / Step</th><th>Tool</th><th>Status</th><th>Duration</th><th>Assertions</th></tr>
</thead>
<tbody>${rows}</tbody>
</table>
</body>
</html>`;
}
147
/**
 * Escape text for safe inclusion in HTML element content or a quoted
 * attribute value.
 *
 * Replaces `&`, `<`, `>`, `"` and `'` with character references in a single
 * pass. Non-string input is converted with String() instead of throwing (for
 * example a numeric step ID); null and undefined become an empty string.
 *
 * @param s - Value to escape.
 * @returns The escaped string.
 */
function escapeHtml(s) {
    if (s === null || s === undefined) {
        return "";
    }
    const entities = {
        "&": "&amp;",
        "<": "&lt;",
        ">": "&gt;",
        '"': "&quot;",
        "'": "&#39;",
    };
    return String(s).replace(/[&<>"']/g, (ch) => entities[ch]);
}
154
/**
 * Format a suite run in the requested output format.
 *
 * @param result - The suite result to format.
 * @param format - "json" or "html"; any other value produces console output.
 * @returns The formatted report.
 */
export function formatResult(result, format) {
    if (format === "json") {
        return formatJson(result);
    }
    if (format === "html") {
        return formatHtml(result);
    }
    return formatConsole(result);
}
@@ -0,0 +1,57 @@
1
+ /**
2
+ * Test execution engine for MCP Eval Runner.
3
+ *
4
+ * Supports two execution modes:
5
+ *
6
+ * 1. Live mode (default when server config is present in fixture):
7
+ * Spawns the target MCP server as a child process (stdio transport) or
8
+ * connects via HTTP, calls each step's tool with the provided input, and
9
+ * evaluates assertions against the real response.
10
+ *
11
+ * 2. Simulation mode (fallback when no server config is present):
12
+ * Evaluates assertions against `expected_output` from the fixture.
13
+ * Useful for authoring and CI dry-runs without a running server.
14
+ *
15
+ * Step output piping:
16
+ * Steps can reference the output of a previous step using the
17
+ * `{{steps.<step_id>.output}}` placeholder in their `input` values.
18
+ */
19
+ import { type AssertionResult } from "./assertions.js";
20
+ import type { Fixture } from "./fixture.js";
21
+ import type { EvalDb } from "./db.js";
22
/** Result of running a single step of a case. */
export interface StepRunResult {
    /** Identifier of the step. */
    step_id: string;
    /** Name of the tool the step refers to. */
    tool: string;
    /** Outcome of the step. */
    status: "pass" | "fail" | "error";
    /** Duration of the step; presumably milliseconds, as the name suggests. */
    duration_ms: number;
    // NOTE(review): presumably the tool response in live mode and the fixture's
    // expected_output in simulation mode - confirm against runner.js.
    output: string;
    /** Results of the assertions evaluated for this step. */
    assertions: AssertionResult[];
    /** Error text, when present. */
    error?: string;
    /** Which of the two execution modes described in the file header applied. */
    mode: "live" | "simulation";
}
32
/** Result of running one case: its overall outcome plus its step results. */
export interface CaseRunResult {
    /** Name of the case. */
    case_name: string;
    /** Outcome of the case. */
    status: "pass" | "fail" | "error";
    /** Duration of the case; presumably milliseconds, as the name suggests. */
    duration_ms: number;
    /** Results of the steps belonging to this case. */
    steps: StepRunResult[];
    /** Error text, when present. */
    error?: string;
}
39
/** Result of running a suite of cases. */
export interface SuiteRunResult {
    /** Identifier of this run. */
    run_id: string;
    /** Name of the suite. */
    suite_name: string;
    // NOTE(review): started_at / ended_at look like epoch-millisecond
    // timestamps - confirm against runner.js.
    started_at: number;
    ended_at: number;
    /** Number of cases in the run. */
    total_cases: number;
    /** Number of cases counted as passed. */
    passed: number;
    // NOTE(review): unclear from this declaration whether cases with status
    // "error" are counted here - confirm against runner.js.
    failed: number;
    /** Per-case results. */
    cases: CaseRunResult[];
}
49
/** Options accepted by the runner. */
export interface RunnerOptions {
    // NOTE(review): presumably the directory fixtures are loaded from - confirm.
    fixturesDir: string;
    // NOTE(review): presumably the path of the results database - confirm.
    dbPath: string;
    // NOTE(review): a timeout in milliseconds; what it bounds (step, case or
    // server call) is not visible in this declaration - confirm in runner.js.
    timeoutMs: number;
    /** Output format for results. */
    format: "console" | "json" | "html";
    // NOTE(review): optional; presumably how many cases run in parallel - confirm.
    concurrency?: number;
}
56
/**
 * Run a single fixture and resolve with its case result.
 *
 * @param fixture - The fixture to execute.
 * @param options - Only `timeoutMs` is taken from RunnerOptions.
 * @returns Promise of the case result.
 */
export declare function runCase(fixture: Fixture, options: Pick<RunnerOptions, "timeoutMs">): Promise<CaseRunResult>;
57
/**
 * Run a set of fixtures as one named suite and resolve with the suite result.
 *
 * @param fixtures - The fixtures to execute.
 * @param suiteName - Name of the suite.
 * @param options - Runner options.
 * @param db - Database handle. NOTE(review): presumably used to persist the
 *   run - confirm against runner.js.
 * @returns Promise of the suite result.
 */
export declare function runSuite(fixtures: Fixture[], suiteName: string, options: RunnerOptions, db: EvalDb): Promise<SuiteRunResult>;