mcp-eval-runner 1.0.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/.env.example +39 -0
- package/CHANGELOG.md +67 -0
- package/LICENSE +21 -0
- package/README.md +328 -0
- package/dist/assertions.d.ts +63 -0
- package/dist/assertions.js +187 -0
- package/dist/audit-log.d.ts +26 -0
- package/dist/audit-log.js +57 -0
- package/dist/auth.d.ts +15 -0
- package/dist/auth.js +83 -0
- package/dist/db.d.ts +40 -0
- package/dist/db.js +94 -0
- package/dist/deployment-gate.d.ts +27 -0
- package/dist/deployment-gate.js +43 -0
- package/dist/fixture-library.d.ts +26 -0
- package/dist/fixture-library.js +85 -0
- package/dist/fixture.d.ts +87 -0
- package/dist/fixture.js +170 -0
- package/dist/http-server.d.ts +7 -0
- package/dist/http-server.js +34 -0
- package/dist/index.d.ts +15 -0
- package/dist/index.js +158 -0
- package/dist/llm-judge.d.ts +24 -0
- package/dist/llm-judge.js +139 -0
- package/dist/rate-limiter.d.ts +13 -0
- package/dist/rate-limiter.js +36 -0
- package/dist/reporter.d.ts +8 -0
- package/dist/reporter.js +163 -0
- package/dist/runner.d.ts +57 -0
- package/dist/runner.js +339 -0
- package/dist/server.d.ts +22 -0
- package/dist/server.js +583 -0
- package/dist/tools/html_report.d.ts +8 -0
- package/dist/tools/html_report.js +188 -0
- package/dist/tools/manage.d.ts +11 -0
- package/dist/tools/manage.js +41 -0
- package/dist/tools/report.d.ts +12 -0
- package/dist/tools/report.js +120 -0
- package/dist/tools/run.d.ts +20 -0
- package/dist/tools/run.js +166 -0
- package/dist/tools/scaffold.d.ts +11 -0
- package/dist/tools/scaffold.js +90 -0
- package/evals/reference/mcp-fetch.yaml +46 -0
- package/evals/reference/mcp-filesystem.yaml +63 -0
- package/evals/reference/mcp-memory.yaml +70 -0
- package/evals/reference/step-piping-example.yaml +25 -0
- package/evals/smoke.yaml +12 -0
- package/package.json +67 -0
|
@@ -0,0 +1,85 @@
|
|
|
1
|
+
/**
|
|
2
|
+
* Shared fixture discovery and publishing for mcp-eval-runner.
|
|
3
|
+
* Supports discovering fixtures across multiple directories and publishing
|
|
4
|
+
* (copying) fixtures to a destination directory.
|
|
5
|
+
*/
|
|
6
|
+
import fs from "fs";
|
|
7
|
+
import path from "path";
|
|
8
|
+
import { loadFixturesFromDir } from "./fixture.js";
|
|
9
|
+
/**
 * Discover fixtures across the given directories.
 *
 * Each directory is loaded with `loadFixturesFromDir`. Fixtures are
 * deduplicated by `name`; the first directory (in `dirs` order) that provides
 * a given name wins.
 *
 * Paths that do not exist, or that exist but are not directories, are skipped
 * so that one bad entry cannot abort the whole discovery.
 *
 * @param dirs - Array of directory paths to scan for fixtures
 * @returns One entry per unique fixture name with `name`, `path`,
 *   `suite_count` (always 1) and `case_count` (number of steps). `path` is the
 *   first existing file named after the fixture (raw name first, then the
 *   sanitized name, each tried with `.yaml`, `.yml`, `.json`). When no such
 *   file exists it falls back to the directory the fixture was loaded from.
 */
export function discoverFixtures(dirs) {
    const extensions = [".yaml", ".yml", ".json"];
    const seen = new Set();
    const entries = [];
    for (const dir of dirs) {
        if (!isExistingDirectory(dir)) {
            continue;
        }
        for (const fixture of loadFixturesFromDir(dir)) {
            if (seen.has(fixture.name)) {
                continue;
            }
            seen.add(fixture.name);
            // Fixtures do not carry their source path, so it is reconstructed
            // from the name: raw name first, then the sanitized form.
            const sanitized = fixture.name.replace(/[^a-z0-9_-]/gi, "_");
            const baseNames = sanitized === fixture.name ? [fixture.name] : [fixture.name, sanitized];
            const candidates = baseNames.flatMap((base) => extensions.map((ext) => path.join(dir, `${base}${ext}`)));
            const filePath = candidates.find((candidate) => fs.existsSync(candidate)) ?? dir;
            entries.push({
                name: fixture.name,
                path: filePath,
                suite_count: 1, // Each fixture file is one suite
                case_count: fixture.steps.length,
            });
        }
    }
    return entries;
}
/**
 * Report whether `candidate` exists and is a directory.
 * Any stat failure (missing path, permission error, ...) counts as "no".
 */
function isExistingDirectory(candidate) {
    try {
        return fs.statSync(candidate).isDirectory();
    }
    catch {
        return false;
    }
}
|
|
55
|
+
/**
 * Publish a fixture into the destination directory.
 *
 * - A string is treated as the path of an existing fixture file, which is
 *   copied into `dest` under its original base name.
 * - An object is serialized as pretty-printed JSON into
 *   `<dest>/<sanitized name>.json`. Characters other than letters, digits,
 *   `_` and `-` are replaced by `_`; a missing or empty name becomes `fixture`.
 *
 * The destination directory is created (recursively) only after the input has
 * been validated, so a rejected call leaves nothing behind on disk.
 *
 * @param fixture - The fixture object or a path to a fixture file
 * @param dest - Destination directory path
 * @throws Error if the source file does not exist, or if `fixture` is neither
 *   a string nor a non-null object
 */
export function publishFixture(fixture, dest) {
    // If fixture is a string, treat it as a file path to copy
    if (typeof fixture === "string") {
        if (!fs.existsSync(fixture)) {
            throw new Error(`Fixture file not found: ${fixture}`);
        }
        fs.mkdirSync(dest, { recursive: true });
        fs.copyFileSync(fixture, path.join(dest, path.basename(fixture)));
        return;
    }
    // If fixture is an object, serialize it as JSON and write it
    if (typeof fixture === "object" && fixture !== null) {
        const sanitized = String(fixture.name ?? "fixture").replace(/[^a-z0-9_-]/gi, "_");
        // An empty name would otherwise produce the hidden file ".json".
        const safeName = sanitized === "" ? "fixture" : sanitized;
        fs.mkdirSync(dest, { recursive: true });
        fs.writeFileSync(path.join(dest, `${safeName}.json`), JSON.stringify(fixture, null, 2), "utf-8");
        return;
    }
    throw new Error(`Invalid fixture: expected a file path string or fixture object`);
}
|
|
@@ -0,0 +1,87 @@
|
|
|
1
|
+
/**
|
|
2
|
+
* YAML fixture loading and validation for MCP Eval Runner.
|
|
3
|
+
*
|
|
4
|
+
* Fixture JSON Schema:
|
|
5
|
+
* {
|
|
6
|
+
* "type": "object",
|
|
7
|
+
* "required": ["name", "steps"],
|
|
8
|
+
* "properties": {
|
|
9
|
+
* "name": { "type": "string" },
|
|
10
|
+
* "description": { "type": "string" },
|
|
11
|
+
* "server": {
|
|
12
|
+
* "type": "object",
|
|
13
|
+
* "description": "Optional server config for live execution mode",
|
|
14
|
+
* "properties": {
|
|
15
|
+
* "command": { "type": "string" },
|
|
16
|
+
* "args": { "type": "array", "items": { "type": "string" } },
|
|
17
|
+
* "env": { "type": "object" }
|
|
18
|
+
* },
|
|
19
|
+
* "required": ["command"]
|
|
20
|
+
* },
|
|
21
|
+
* "steps": {
|
|
22
|
+
* "type": "array",
|
|
23
|
+
* "items": {
|
|
24
|
+
* "type": "object",
|
|
25
|
+
* "required": ["id", "tool"],
|
|
26
|
+
* "properties": {
|
|
27
|
+
* "id": { "type": "string" },
|
|
28
|
+
* "description": { "type": "string" },
|
|
29
|
+
* "tool": { "type": "string" },
|
|
30
|
+
* "input": { "type": "object" },
|
|
31
|
+
* "expected_output": { "type": "string" },
|
|
32
|
+
* "expect": {
|
|
33
|
+
* "type": "object",
|
|
34
|
+
* "properties": {
|
|
35
|
+
* "output_contains": { "type": "string" },
|
|
36
|
+
* "output_not_contains": { "type": "string" },
|
|
37
|
+
* "output_equals": { "type": "string" },
|
|
38
|
+
* "output_matches": { "type": "string" },
|
|
39
|
+
* "tool_called": { "type": "string" },
|
|
40
|
+
* "latency_under": { "type": "number" }
|
|
41
|
+
* }
|
|
42
|
+
* }
|
|
43
|
+
* }
|
|
44
|
+
* }
|
|
45
|
+
* }
|
|
46
|
+
* }
|
|
47
|
+
* }
|
|
48
|
+
*/
|
|
49
|
+
import type { Assertion } from "./assertions.js";
|
|
50
|
+
/**
 * Optional server configuration of a fixture, used for live execution mode.
 */
export interface ServerConfig {
    /** Command of the server; the fixture parser requires a non-blank string. */
    command: string;
    /** Arguments for `command`. */
    args?: Array<string>;
    /** Environment entries for the server, keyed by variable name. */
    env?: { [key: string]: string };
}
|
|
55
|
+
/**
 * One step of a fixture: a tool invocation plus the checks applied to its output.
 */
export interface FixtureStep {
    /** Identifier of the step; validated as a non-empty string. */
    id: string;
    /** Optional human-readable description. */
    description?: string;
    /** Name of the tool to invoke; validated as a non-empty string. */
    tool: string;
    /** Tool input; validation substitutes `{}` when the fixture omits it. */
    input: { [key: string]: unknown };
    /** Optional expected output string. */
    expected_output?: string;
    /** Optional assertion block (`output_contains`, `latency_under`, ...). */
    expect?: Assertion;
}
|
|
63
|
+
/**
 * A validated fixture: a named, ordered list of steps with an optional
 * server configuration.
 */
export interface Fixture {
    /** Fixture name; validated as a non-empty string. */
    name: string;
    /** Optional human-readable description. */
    description?: string;
    /** Optional server configuration for live execution mode. */
    server?: ServerConfig;
    /** Steps in the order they appear in the fixture file. */
    steps: Array<FixtureStep>;
}
|
|
69
|
+
/**
 * Error thrown when a fixture file or parsed object fails validation
 * (unsupported file format, wrong top-level type, invalid `name`, `steps` or step fields).
 */
export declare class FixtureValidationError extends Error {
    /**
     * @param message - Description of the validation failure
     */
    constructor(message: string);
}
|
|
72
|
+
/**
|
|
73
|
+
* Load and validate a fixture from a YAML or JSON file.
|
|
74
|
+
*/
|
|
75
|
+
/**
 * Load and validate a fixture from a YAML or JSON file.
 *
 * The file is read as UTF-8 and parsed according to its extension
 * (`.yaml` / `.yml` as YAML, `.json` as JSON); the parsed value is then
 * passed to `validateFixture` with the file path as the error-message prefix.
 *
 * @param filePath - Path of the fixture file
 * @returns The validated fixture
 * @throws FixtureValidationError if the extension is not `.yaml`, `.yml` or
 *   `.json`, or if the parsed content fails validation
 */
export declare function loadFixture(filePath: string): Fixture;
|
|
76
|
+
/**
|
|
77
|
+
* Validate a raw parsed object against the fixture schema.
|
|
78
|
+
*/
|
|
79
|
+
/**
 * Validate a raw parsed object against the fixture schema.
 *
 * Requires an object with a non-empty string `name` and an array `steps`
 * whose items are objects with non-empty string `id` and `tool`. A missing
 * step `input` becomes `{}`; optional fields of the wrong type become `undefined`.
 *
 * @param raw - Parsed fixture content
 * @param source - Prefix for error messages (defaults to `"Fixture"`)
 * @returns The normalized fixture
 * @throws FixtureValidationError on the first violation found
 */
export declare function validateFixture(raw: unknown, source?: string): Fixture;
|
|
80
|
+
/**
|
|
81
|
+
* Load all fixtures from a directory.
|
|
82
|
+
*/
|
|
83
|
+
/**
 * Load all fixtures from a directory.
 *
 * Considers entries ending in `.yaml`, `.yml` or `.json`, in sorted name
 * order. A file that fails to load is reported with `console.error` and
 * skipped; the remaining files are still loaded.
 *
 * @param fixturesDir - Directory to scan
 * @returns The fixtures that loaded successfully; `[]` if the directory does not exist
 */
export declare function loadFixturesFromDir(fixturesDir: string): Fixture[];
|
|
84
|
+
/**
|
|
85
|
+
* Write a fixture to a YAML file in the fixtures directory.
|
|
86
|
+
*/
|
|
87
|
+
/**
 * Write a fixture to a YAML file in the fixtures directory.
 *
 * The directory is created recursively if needed. The file name is the
 * fixture name with every character other than letters, digits, `_` and `-`
 * replaced by `_`, plus the `.yaml` extension.
 *
 * @param fixturesDir - Directory to write into
 * @param fixture - Fixture to serialize
 * @returns Path of the written file
 */
export declare function writeFixture(fixturesDir: string, fixture: Fixture): string;
|
package/dist/fixture.js
ADDED
|
@@ -0,0 +1,170 @@
|
|
|
1
|
+
/**
|
|
2
|
+
* YAML fixture loading and validation for MCP Eval Runner.
|
|
3
|
+
*
|
|
4
|
+
* Fixture JSON Schema:
|
|
5
|
+
* {
|
|
6
|
+
* "type": "object",
|
|
7
|
+
* "required": ["name", "steps"],
|
|
8
|
+
* "properties": {
|
|
9
|
+
* "name": { "type": "string" },
|
|
10
|
+
* "description": { "type": "string" },
|
|
11
|
+
* "server": {
|
|
12
|
+
* "type": "object",
|
|
13
|
+
* "description": "Optional server config for live execution mode",
|
|
14
|
+
* "properties": {
|
|
15
|
+
* "command": { "type": "string" },
|
|
16
|
+
* "args": { "type": "array", "items": { "type": "string" } },
|
|
17
|
+
* "env": { "type": "object" }
|
|
18
|
+
* },
|
|
19
|
+
* "required": ["command"]
|
|
20
|
+
* },
|
|
21
|
+
* "steps": {
|
|
22
|
+
* "type": "array",
|
|
23
|
+
* "items": {
|
|
24
|
+
* "type": "object",
|
|
25
|
+
* "required": ["id", "tool"],
|
|
26
|
+
* "properties": {
|
|
27
|
+
* "id": { "type": "string" },
|
|
28
|
+
* "description": { "type": "string" },
|
|
29
|
+
* "tool": { "type": "string" },
|
|
30
|
+
* "input": { "type": "object" },
|
|
31
|
+
* "expected_output": { "type": "string" },
|
|
32
|
+
* "expect": {
|
|
33
|
+
* "type": "object",
|
|
34
|
+
* "properties": {
|
|
35
|
+
* "output_contains": { "type": "string" },
|
|
36
|
+
* "output_not_contains": { "type": "string" },
|
|
37
|
+
* "output_equals": { "type": "string" },
|
|
38
|
+
* "output_matches": { "type": "string" },
|
|
39
|
+
* "tool_called": { "type": "string" },
|
|
40
|
+
* "latency_under": { "type": "number" }
|
|
41
|
+
* }
|
|
42
|
+
* }
|
|
43
|
+
* }
|
|
44
|
+
* }
|
|
45
|
+
* }
|
|
46
|
+
* }
|
|
47
|
+
* }
|
|
48
|
+
*/
|
|
49
|
+
import fs from "fs";
|
|
50
|
+
import path from "path";
|
|
51
|
+
import yaml from "js-yaml";
|
|
52
|
+
/**
 * Error type for fixtures that fail loading or validation.
 * Instances report `name === "FixtureValidationError"` so they can be told
 * apart from generic errors in logs.
 */
export class FixtureValidationError extends Error {
    // Own property that shadows the inherited "Error" label.
    name = "FixtureValidationError";
    /**
     * @param message - Description of the validation failure
     */
    constructor(message) {
        super(message);
    }
}
|
|
58
|
+
/**
 * Parse the optional `server` block of a fixture.
 *
 * The result matches the declared `ServerConfig` shape (`args: string[]`,
 * `env: Record<string, string>`). YAML turns unquoted scalars such as `8080`
 * or `true` into numbers/booleans, so scalar `args` items and `env` values
 * are converted to strings. Non-scalar entries (objects, arrays, null) are dropped.
 *
 * @param raw - Value of the fixture's `server` key (any type)
 * @returns The normalized config, or `undefined` when `raw` is not an object
 *   or has no non-blank string `command`. `args` defaults to `[]`; `env` is
 *   `undefined` unless `raw.env` is a non-array object.
 */
function parseServerConfig(raw) {
    if (typeof raw !== "object" || raw === null)
        return undefined;
    const command = raw["command"];
    if (typeof command !== "string" || !command.trim())
        return undefined;
    const isScalar = (value) => typeof value === "string" || typeof value === "number" || typeof value === "boolean";
    const args = Array.isArray(raw["args"])
        ? raw["args"].filter(isScalar).map(String)
        : [];
    const rawEnv = raw["env"];
    let env;
    // An array is technically an object but is not a usable environment map.
    if (typeof rawEnv === "object" && rawEnv !== null && !Array.isArray(rawEnv)) {
        env = {};
        for (const [key, value] of Object.entries(rawEnv)) {
            if (isScalar(value)) {
                env[key] = String(value);
            }
        }
    }
    return { command, args, env };
}
|
|
75
|
+
/**
 * Read a fixture file, parse it according to its extension and validate it.
 *
 * `.yaml` / `.yml` files are parsed with js-yaml, `.json` files with
 * `JSON.parse`. The file is read before the extension is inspected, so a
 * missing file fails with the read error regardless of its extension.
 *
 * @param filePath - Path of the fixture file
 * @returns The validated fixture
 * @throws FixtureValidationError for any other extension, or when validation fails
 */
export function loadFixture(filePath) {
    const text = fs.readFileSync(filePath, "utf-8");
    const hasExtension = (...extensions) => extensions.some((ext) => filePath.endsWith(ext));
    if (hasExtension(".yaml", ".yml")) {
        return validateFixture(yaml.load(text), filePath);
    }
    if (hasExtension(".json")) {
        return validateFixture(JSON.parse(text), filePath);
    }
    throw new FixtureValidationError(`Unsupported fixture format: ${filePath}. Use .yaml, .yml, or .json`);
}
|
|
92
|
+
/**
 * Check a parsed value against the fixture schema and return it in normalized form.
 *
 * Checks run in this order and stop at the first failure: top-level object,
 * non-empty string `name`, array `steps`, then each step in order (object,
 * non-empty string `id`, non-empty string `tool`). Optional fields of the
 * wrong type are normalized to `undefined`; a missing or non-object step
 * `input` becomes `{}`.
 *
 * @param raw - Parsed fixture content
 * @param source - Prefix for error messages; `"Fixture"` when omitted
 * @returns The normalized fixture (`name`, `description`, `server`, `steps`)
 * @throws FixtureValidationError describing the first violation
 */
export function validateFixture(raw, source) {
    const label = source ?? "Fixture";
    const reject = (detail) => {
        throw new FixtureValidationError(`${label}: ${detail}`);
    };
    const isObject = (value) => typeof value === "object" && value !== null;
    const isBlank = (value) => typeof value !== "string" || value.trim() === "";
    const stringOrUndefined = (value) => (typeof value === "string" ? value : undefined);
    const objectOr = (value, fallback) => (isObject(value) ? value : fallback);
    if (!isObject(raw)) {
        reject("must be an object");
    }
    if (isBlank(raw["name"])) {
        reject(`"name" must be a non-empty string`);
    }
    if (!Array.isArray(raw["steps"])) {
        reject(`"steps" must be an array`);
    }
    const steps = raw["steps"].map((candidate, idx) => {
        if (!isObject(candidate)) {
            reject(`step[${idx}] must be an object`);
        }
        if (isBlank(candidate["id"])) {
            reject(`step[${idx}].id must be a non-empty string`);
        }
        if (isBlank(candidate["tool"])) {
            reject(`step[${idx}].tool must be a non-empty string`);
        }
        return {
            id: candidate["id"],
            description: stringOrUndefined(candidate["description"]),
            tool: candidate["tool"],
            input: objectOr(candidate["input"], {}),
            expected_output: stringOrUndefined(candidate["expected_output"]),
            expect: objectOr(candidate["expect"], undefined),
        };
    });
    return {
        name: raw["name"],
        description: stringOrUndefined(raw["description"]),
        server: parseServerConfig(raw["server"]),
        steps,
    };
}
|
|
137
|
+
/**
 * Load every fixture file found directly inside a directory.
 *
 * Entries ending in `.yaml`, `.yml` or `.json` are loaded in sorted name
 * order. Loading is best-effort: a file that fails to load is reported on
 * stderr and skipped, so one broken fixture does not hide the others.
 *
 * @param fixturesDir - Directory to scan
 * @returns The fixtures that loaded successfully; `[]` if the directory does not exist
 */
export function loadFixturesFromDir(fixturesDir) {
    if (!fs.existsSync(fixturesDir)) {
        return [];
    }
    const supportedExtensions = [".yaml", ".yml", ".json"];
    const fileNames = fs
        .readdirSync(fixturesDir)
        .filter((entry) => supportedExtensions.some((ext) => entry.endsWith(ext)));
    fileNames.sort();
    const loaded = [];
    fileNames.forEach((file) => {
        try {
            loaded.push(loadFixture(path.join(fixturesDir, file)));
        }
        catch (err) {
            // Report the failure and keep going with the remaining files.
            console.error(`Warning: failed to load fixture ${file}: ${err}`);
        }
    });
    return loaded;
}
|
|
160
|
+
/**
 * Serialize a fixture as YAML into the fixtures directory.
 *
 * The directory is created recursively first. The file is named after the
 * fixture, with every character other than letters, digits, `_` and `-`
 * replaced by `_`, and gets the `.yaml` extension.
 *
 * @param fixturesDir - Directory to write into
 * @param fixture - Fixture to serialize
 * @returns Path of the written file
 */
export function writeFixture(fixturesDir, fixture) {
    fs.mkdirSync(fixturesDir, { recursive: true });
    const safeBaseName = fixture.name.replace(/[^a-z0-9_-]/gi, "_");
    const target = path.join(fixturesDir, `${safeBaseName}.yaml`);
    fs.writeFileSync(target, yaml.dump(fixture, { lineWidth: 100 }), "utf-8");
    return target;
}
|
|
@@ -0,0 +1,7 @@
|
|
|
1
|
+
/**
|
|
2
|
+
* Streamable HTTP transport server for mcp-eval-runner.
|
|
3
|
+
* Starts an Express HTTP server that handles MCP requests via
|
|
4
|
+
* the StreamableHTTPServerTransport in stateless mode.
|
|
5
|
+
*/
|
|
6
|
+
import type { ServerOptions } from "./server.js";
|
|
7
|
+
/**
 * Create the MCP server from `opts` and expose it on an Express app under
 * `/mcp`, with the auth middleware and rate limiter applied to that route.
 *
 * @param port - TCP port to listen on
 * @param opts - Options forwarded to `createServer`
 * @returns Promise for completion of the start-up sequence
 */
export declare function startHttpServer(port: number, opts: ServerOptions): Promise<void>;
|
|
@@ -0,0 +1,34 @@
|
|
|
1
|
+
/**
|
|
2
|
+
* Streamable HTTP transport server for mcp-eval-runner.
|
|
3
|
+
* Starts an Express HTTP server that handles MCP requests via
|
|
4
|
+
* the StreamableHTTPServerTransport in stateless mode.
|
|
5
|
+
*/
|
|
6
|
+
import express from "express";
|
|
7
|
+
import { StreamableHTTPServerTransport } from "@modelcontextprotocol/sdk/server/streamableHttp.js";
|
|
8
|
+
import { createServer } from "./server.js";
|
|
9
|
+
import { createAuthMiddleware } from "./auth.js";
|
|
10
|
+
import { createRateLimiter } from "./rate-limiter.js";
|
|
11
|
+
/**
 * Start the Streamable HTTP front end for the MCP server.
 *
 * Creates the MCP server from `opts`, mounts auth and rate-limiting
 * middleware on `/mcp`, and serves POST and GET requests through a fresh
 * stateless `StreamableHTTPServerTransport` per request. DELETE is answered
 * with 405.
 *
 * The returned promise resolves once the port is bound and rejects if
 * listening fails (for example when the port is already in use), so the
 * caller's `.catch` sees start-up failures.
 *
 * @param port - TCP port to listen on
 * @param opts - Options forwarded to `createServer`
 */
export async function startHttpServer(port, opts) {
    const app = express();
    app.use(express.json());
    // NOTE(review): one MCP server instance is shared by all requests and is
    // connected to a new transport each time; confirm the SDK version in use
    // supports this under concurrent requests.
    const server = await createServer(opts);
    // Apply auth and rate limiting before /mcp route
    app.use("/mcp", createAuthMiddleware());
    app.use("/mcp", createRateLimiter(60, 60000));
    // Shared POST/GET handling. Failures are caught here because a rejected
    // promise from an async route handler would otherwise go unhandled.
    const handleMcpRequest = async (req, res, body) => {
        const transport = new StreamableHTTPServerTransport({ sessionIdGenerator: undefined });
        // Release the per-request transport once the response is done.
        res.on("close", () => {
            Promise.resolve(transport.close()).catch((err) => {
                console.error(`MCP Eval Runner: failed to close transport: ${err}`);
            });
        });
        try {
            await server.connect(transport);
            await transport.handleRequest(req, res, body);
        }
        catch (err) {
            console.error(`MCP Eval Runner: request handling failed: ${err}`);
            if (!res.headersSent) {
                res.status(500).json({
                    jsonrpc: "2.0",
                    error: { code: -32603, message: "Internal server error" },
                    id: null,
                });
            }
        }
    };
    app.post("/mcp", (req, res) => handleMcpRequest(req, res, req.body));
    app.get("/mcp", (req, res) => handleMcpRequest(req, res));
    app.delete("/mcp", (_req, res) => {
        res.status(405).json({ error: "Method not allowed" });
    });
    await new Promise((resolve, reject) => {
        // Depending on the Express version the listen callback may receive the
        // bind error; the "error" listener covers versions where it does not.
        const httpServer = app.listen(port, (err) => {
            if (err) {
                reject(err);
                return;
            }
            console.error(`MCP Eval Runner HTTP server listening on port ${port}`);
            resolve();
        });
        httpServer.once("error", reject);
    });
}
|
package/dist/index.d.ts
ADDED
|
@@ -0,0 +1,15 @@
|
|
|
1
|
+
#!/usr/bin/env node
|
|
2
|
+
/**
|
|
3
|
+
* CLI entry point for mcp-eval-runner.
|
|
4
|
+
*
|
|
5
|
+
* Usage:
|
|
6
|
+
* mcp-eval-runner [options]
|
|
7
|
+
*
|
|
8
|
+
* Options:
|
|
9
|
+
* --fixtures, --fixtures-dir Path to fixtures directory (default: ./evals)
|
|
10
|
+
* --db, --db-path Path to SQLite database (default: ~/.mcp/evals.db)
|
|
11
|
+
* --timeout Timeout per step in ms (default: 30000)
|
|
12
|
+
* --format Output format: console|json|html (default: console)
|
|
13
|
+
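* --watch Watch fixtures directory for changes and re-run affected fixtures
* --concurrency Number of test cases to run in parallel (default: 1)
* --http-port Start an HTTP server on this port instead of stdio transport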
|
|
14
|
+
*/
|
|
15
|
+
export {};
|
package/dist/index.js
ADDED
|
@@ -0,0 +1,158 @@
|
|
|
1
|
+
#!/usr/bin/env node
|
|
2
|
+
/**
|
|
3
|
+
* CLI entry point for mcp-eval-runner.
|
|
4
|
+
*
|
|
5
|
+
* Usage:
|
|
6
|
+
* mcp-eval-runner [options]
|
|
7
|
+
*
|
|
8
|
+
* Options:
|
|
9
|
+
* --fixtures, --fixtures-dir Path to fixtures directory (default: ./evals)
|
|
10
|
+
* --db, --db-path Path to SQLite database (default: ~/.mcp/evals.db)
|
|
11
|
+
* --timeout Timeout per step in ms (default: 30000)
|
|
12
|
+
* --format Output format: console|json|html (default: console)
|
|
13
|
+
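* --watch Watch fixtures directory for changes and re-run affected fixtures
* --concurrency Number of test cases to run in parallel (default: 1)
* --http-port Start an HTTP server on this port instead of stdio transport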
|
|
14
|
+
*/
|
|
15
|
+
import yargs from "yargs";
|
|
16
|
+
import { hideBin } from "yargs/helpers";
|
|
17
|
+
import path from "path";
|
|
18
|
+
import os from "os";
|
|
19
|
+
import { startServer } from "./server.js";
|
|
20
|
+
import { startHttpServer } from "./http-server.js";
|
|
21
|
+
import { EvalDb } from "./db.js";
|
|
22
|
+
import { loadFixture } from "./fixture.js";
|
|
23
|
+
import { runCase } from "./runner.js";
|
|
24
|
+
import { formatResult } from "./reporter.js";
|
|
25
|
+
// Command-line options. Every option has a default except --http-port, whose
// absence selects the stdio transport.
const argv = yargs(hideBin(process.argv))
    .options({
        fixtures: {
            alias: "fixtures-dir",
            type: "string",
            description: "Path to fixtures directory",
            default: "./evals",
        },
        db: {
            alias: "db-path",
            type: "string",
            description: "Path to SQLite database",
            default: "~/.mcp/evals.db",
        },
        timeout: {
            type: "number",
            description: "Timeout per step in milliseconds",
            default: 30000,
        },
        format: {
            type: "string",
            choices: ["console", "json", "html"],
            description: "Output format",
            default: "console",
        },
        watch: {
            type: "boolean",
            description: "Watch fixtures directory for changes and re-run affected fixtures",
            default: false,
        },
        concurrency: {
            type: "number",
            description: "Number of test cases to run in parallel (default: 1)",
            default: 1,
        },
        "http-port": {
            type: "number",
            description: "Start an HTTP server on this port instead of stdio transport",
        },
    })
    .help()
    .parseSync();
// The fixtures directory is interpreted relative to the current working directory.
const fixturesDir = path.resolve(process.cwd(), argv.fixtures);
// A leading "~/" in the database path stands for the user's home directory.
const rawDb = argv.db;
const homePrefix = "~/";
const dbPath = rawDb.startsWith(homePrefix)
    ? path.join(os.homedir(), rawDb.slice(homePrefix.length))
    : rawDb;
const format = argv.format;
|
|
71
|
+
// Enhanced watch mode: re-run the affected fixture when a file changes.
// Results are persisted to the eval database and printed to stderr.
if (argv.watch) {
    // `randomUUID` is imported from node:crypto explicitly: the bare `crypto`
    // global is not defined on every Node.js release line, and a missing
    // global would make every re-run fail inside the handler's catch block.
    Promise.all([import("chokidar"), import("node:crypto")])
        .then(([{ default: chokidar }, { randomUUID }]) => {
            const watcher = chokidar.watch(fixturesDir, {
                ignoreInitial: true,
                persistent: true,
            });
            const db = new EvalDb(dbPath);
            watcher.on("all", async (event, filePath) => {
                process.stderr.write(`[mcp-eval-runner] watch: ${event} ${filePath}\n`);
                // Only re-run on add / change of fixture files
                const isFixture = filePath.endsWith(".yaml") || filePath.endsWith(".yml") || filePath.endsWith(".json");
                if (!isFixture || (event !== "add" && event !== "change")) {
                    return;
                }
                try {
                    process.stderr.write(`[mcp-eval-runner] Re-running fixture: ${filePath}\n`);
                    const fixture = loadFixture(filePath);
                    const caseResult = await runCase(fixture, { timeoutMs: argv.timeout });
                    // One timestamp for both bounds keeps the recorded
                    // duration equal to caseResult.duration_ms.
                    const endedAt = Date.now();
                    const suiteResult = {
                        run_id: randomUUID(),
                        suite_name: fixture.name,
                        started_at: endedAt - caseResult.duration_ms,
                        ended_at: endedAt,
                        total_cases: 1,
                        passed: caseResult.status === "pass" ? 1 : 0,
                        failed: caseResult.status !== "pass" ? 1 : 0,
                        cases: [caseResult],
                    };
                    // Persist result
                    db.insertRun({
                        id: suiteResult.run_id,
                        suite_name: suiteResult.suite_name,
                        started_at: suiteResult.started_at,
                        ended_at: suiteResult.ended_at,
                        total_cases: 1,
                        passed: suiteResult.passed,
                        failed: suiteResult.failed,
                        format,
                    });
                    db.insertCaseResult({
                        id: randomUUID(),
                        run_id: suiteResult.run_id,
                        case_name: caseResult.case_name,
                        status: caseResult.status,
                        duration_ms: caseResult.duration_ms,
                        error_message: caseResult.error ?? null,
                        assertions_json: JSON.stringify(caseResult.steps.map((s) => s.assertions)),
                        created_at: Date.now(),
                    });
                    const formatted = formatResult(suiteResult, format);
                    // Write results to stderr so they don't interfere with MCP stdio transport
                    process.stderr.write(formatted + "\n");
                }
                catch (err) {
                    process.stderr.write(`[mcp-eval-runner] Error re-running fixture ${filePath}: ${err}\n`);
                }
            });
            process.stderr.write(`[mcp-eval-runner] Watching ${fixturesDir} for changes...\n`);
        })
        .catch((err) => {
            // Covers a failed chokidar import and a failure to open the
            // database. The server itself keeps running without watch mode.
            process.stderr.write(`[mcp-eval-runner] Failed to start watch mode: ${err}\n`);
        });
}
|
|
138
|
+
// Options handed to whichever transport is started below.
const serverOpts = {
    fixturesDir,
    dbPath,
    timeoutMs: argv.timeout,
    format,
    watch: argv.watch,
    concurrency: argv.concurrency,
};
const httpPort = argv["http-port"];
// Any start-up failure is fatal: report it on stderr and exit non-zero.
const exitOnFatal = (err) => {
    process.stderr.write(`Fatal error: ${err}\n`);
    process.exit(1);
};
// --http-port selects the HTTP server; otherwise the default server is started.
const startup = httpPort !== undefined
    ? startHttpServer(httpPort, serverOpts)
    : startServer(serverOpts);
startup.catch(exitOnFatal);
|
|
@@ -0,0 +1,24 @@
|
|
|
1
|
+
/**
|
|
2
|
+
* LLM-as-judge assertion for mcp-eval-runner.
|
|
3
|
+
* Calls an external LLM API via HTTP POST to score semantic similarity
|
|
4
|
+
* between actual and expected outputs.
|
|
5
|
+
*
|
|
6
|
+
* Credentials: LLM_JUDGE_API_KEY + LLM_JUDGE_BASE_URL env vars.
|
|
7
|
+
* Assertion type: llm_judge with prompt_template, min_score, model fields.
|
|
8
|
+
*/
|
|
9
|
+
import type { AssertionResult } from "./assertions.js";
|
|
10
|
+
/**
 * Fields of an `llm_judge` assertion.
 */
export interface LlmJudgeAssertion {
    /** Prompt template; `{actual}` and `{expected}` placeholders are rendered before the call. */
    prompt_template: string;
    /** Score threshold. NOTE(review): presumably the minimum score (0.0–1.0) required to pass; confirm in llm-judge.js. */
    min_score: number;
    /** Model to use for the judge call. */
    model: string;
    /** Optional expected text. NOTE(review): how it relates to the `expected` argument of `runLlmJudge` is not visible here. */
    expected?: string;
}
|
|
16
|
+
/**
|
|
17
|
+
* Run an LLM-as-judge assertion.
|
|
18
|
+
*
|
|
19
|
+
* Renders the prompt template with {actual} and {expected} placeholders,
|
|
20
|
+
* calls the LLM API, and extracts a score from the response (0.0–1.0).
|
|
21
|
+
* Expects the LLM to respond with a JSON object containing a "score" field,
|
|
22
|
+
* or a plain number.
|
|
23
|
+
*/
|
|
24
|
+
/**
 * Run an LLM-as-judge assertion.
 *
 * Renders the prompt template with the `{actual}` and `{expected}`
 * placeholders, calls the LLM API and extracts a score (0.0–1.0) from the
 * response, which is expected to be a JSON object with a `score` field or a
 * plain number. Credentials come from the `LLM_JUDGE_API_KEY` and
 * `LLM_JUDGE_BASE_URL` environment variables.
 *
 * @param assertion - Judge configuration (prompt template, minimum score, model)
 * @param actual - Actual output, substituted for `{actual}`
 * @param expected - Expected output, substituted for `{expected}`
 * @returns Promise of the assertion result. NOTE(review): the implementation
 *   is not visible here; confirm how API and parsing failures are reported.
 */
export declare function runLlmJudge(assertion: LlmJudgeAssertion, actual: string, expected: string): Promise<AssertionResult>;
|