mcp-eval-runner 1.0.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (48)
  1. package/.env.example +39 -0
  2. package/CHANGELOG.md +67 -0
  3. package/LICENSE +21 -0
  4. package/README.md +328 -0
  5. package/dist/assertions.d.ts +63 -0
  6. package/dist/assertions.js +187 -0
  7. package/dist/audit-log.d.ts +26 -0
  8. package/dist/audit-log.js +57 -0
  9. package/dist/auth.d.ts +15 -0
  10. package/dist/auth.js +83 -0
  11. package/dist/db.d.ts +40 -0
  12. package/dist/db.js +94 -0
  13. package/dist/deployment-gate.d.ts +27 -0
  14. package/dist/deployment-gate.js +43 -0
  15. package/dist/fixture-library.d.ts +26 -0
  16. package/dist/fixture-library.js +85 -0
  17. package/dist/fixture.d.ts +87 -0
  18. package/dist/fixture.js +170 -0
  19. package/dist/http-server.d.ts +7 -0
  20. package/dist/http-server.js +34 -0
  21. package/dist/index.d.ts +15 -0
  22. package/dist/index.js +158 -0
  23. package/dist/llm-judge.d.ts +24 -0
  24. package/dist/llm-judge.js +139 -0
  25. package/dist/rate-limiter.d.ts +13 -0
  26. package/dist/rate-limiter.js +36 -0
  27. package/dist/reporter.d.ts +8 -0
  28. package/dist/reporter.js +163 -0
  29. package/dist/runner.d.ts +57 -0
  30. package/dist/runner.js +339 -0
  31. package/dist/server.d.ts +22 -0
  32. package/dist/server.js +583 -0
  33. package/dist/tools/html_report.d.ts +8 -0
  34. package/dist/tools/html_report.js +188 -0
  35. package/dist/tools/manage.d.ts +11 -0
  36. package/dist/tools/manage.js +41 -0
  37. package/dist/tools/report.d.ts +12 -0
  38. package/dist/tools/report.js +120 -0
  39. package/dist/tools/run.d.ts +20 -0
  40. package/dist/tools/run.js +166 -0
  41. package/dist/tools/scaffold.d.ts +11 -0
  42. package/dist/tools/scaffold.js +90 -0
  43. package/evals/reference/mcp-fetch.yaml +46 -0
  44. package/evals/reference/mcp-filesystem.yaml +63 -0
  45. package/evals/reference/mcp-memory.yaml +70 -0
  46. package/evals/reference/step-piping-example.yaml +25 -0
  47. package/evals/smoke.yaml +12 -0
  48. package/package.json +67 -0
@@ -0,0 +1,187 @@
1
+ /**
2
+ * Assertion evaluators for MCP Eval Runner.
3
+ * Each assertion checks a specific property of a step result.
4
+ *
5
+ * Supported assertion types:
6
+ * output_contains: "substring" — output includes substring
7
+ * output_not_contains: "substring" — output must NOT include substring
8
+ * output_equals: "exact string" — output exactly matches
9
+ * output_matches: "regex" — output matches a regular expression
10
+ * tool_called: "tool_name" — step used the named tool
11
+ * latency_under: 500 — latency in ms must be below threshold
12
+ * schema_match: { type: "object", properties: {...}, required: [...] }
13
+ * — output (parsed as JSON) matches JSON Schema
14
+ * llm_judge: { prompt_template, min_score, model, expected }
15
+ * — semantic similarity via LLM judge
16
+ */
17
+ import { runLlmJudge } from "./llm-judge.js";
18
+ // ── Minimal inline JSON Schema validator ─────────────────────────────────────
19
/**
 * Validate `value` against a small subset of JSON Schema.
 *
 * Supported keywords: `type` (one name or an array of names; "integer" and
 * "null" are recognised), `required`, `properties`,
 * `additionalProperties: false` and `items`. Every other keyword is ignored.
 *
 * @param schema - Schema object to validate against.
 * @param value - Already-parsed JSON value.
 * @param path - Location of `value` inside the document; used only to build error messages.
 * @returns `null` when the value conforms, otherwise a message describing the first violation found.
 */
function validateJsonSchema(schema, value, path = "") {
    // Own-property test: `in` would also match inherited members such as
    // "toString" or "constructor" and let them satisfy `required`.
    const hasOwn = (target, key) => Object.prototype.hasOwnProperty.call(target, key);
    const label = path || "value";
    const declaredTypes = schema.type === undefined ? [] : Array.isArray(schema.type) ? schema.type : [schema.type];
    if (schema.type !== undefined) {
        // typeof reports "object" for both null and arrays, so classify those explicitly.
        const effectiveType = value === null ? "null" : Array.isArray(value) ? "array" : typeof value;
        const typeMatches = declaredTypes.some((t) => t === effectiveType || (t === "integer" && Number.isInteger(value)));
        if (!typeMatches) {
            return `${label}: expected type "${schema.type}", got "${effectiveType}"`;
        }
    }
    const isPlainObject = value !== null && typeof value === "object" && !Array.isArray(value);
    if (isPlainObject && (declaredTypes.includes("object") || schema.properties)) {
        const obj = value;
        // required fields
        if (schema.required) {
            for (const key of schema.required) {
                if (!hasOwn(obj, key)) {
                    return `${label}: missing required property "${key}"`;
                }
            }
        }
        // properties present on the value are validated recursively
        if (schema.properties) {
            for (const [key, subSchema] of Object.entries(schema.properties)) {
                if (hasOwn(obj, key)) {
                    const err = validateJsonSchema(subSchema, obj[key], path ? `${path}.${key}` : key);
                    if (err)
                        return err;
                }
            }
        }
        // additionalProperties: false rejects keys the schema does not list
        if (schema.additionalProperties === false && schema.properties) {
            for (const key of Object.keys(obj)) {
                if (!hasOwn(schema.properties, key)) {
                    return `${label}: unexpected additional property "${key}"`;
                }
            }
        }
    }
    if (declaredTypes.includes("array") && Array.isArray(value) && schema.items) {
        for (let i = 0; i < value.length; i++) {
            const err = validateJsonSchema(schema.items, value[i], `${label}[${i}]`);
            if (err)
                return err;
        }
    }
    return null;
}
68
/**
 * Run every synchronous check that is present on `assertion` against `result`.
 *
 * Checks are evaluated in a fixed order (output_contains, output_not_contains,
 * output_matches, output_equals, tool_called, latency_under, schema_match) and
 * each one that is defined contributes exactly one entry to the returned list.
 * `llm_judge` is not handled here.
 *
 * @param assertion - Object whose defined keys select the checks to run.
 * @param result - Step result; `output`, `tool` and `latency_ms` are read.
 * @returns One `{ type, passed, message }` record per evaluated check.
 */
export function evaluateAssertion(assertion, result) {
    const outcomes = [];
    const report = (type, passed, message) => {
        outcomes.push({ type, passed, message });
    };
    if (assertion.output_contains !== undefined) {
        const needle = assertion.output_contains;
        const found = result.output.includes(needle);
        report("output_contains", found, found
            ? `Output contains "${needle}"`
            : `Expected output to contain "${needle}", but got: "${result.output}"`);
    }
    if (assertion.output_not_contains !== undefined) {
        const forbidden = assertion.output_not_contains;
        const absent = !result.output.includes(forbidden);
        report("output_not_contains", absent, absent
            ? `Output does not contain "${forbidden}"`
            : `Expected output NOT to contain "${forbidden}"`);
    }
    if (assertion.output_matches !== undefined) {
        const pattern = assertion.output_matches;
        let matched = false;
        let note;
        try {
            matched = new RegExp(pattern).test(result.output);
            note = matched
                ? `Output matches pattern /${pattern}/`
                : `Output does not match pattern /${pattern}/`;
        }
        catch {
            // An uncompilable pattern counts as a failed assertion rather than a crash.
            note = `Invalid regex pattern: "${pattern}"`;
        }
        report("output_matches", matched, note);
    }
    if (assertion.output_equals !== undefined) {
        const wanted = assertion.output_equals;
        const identical = result.output === wanted;
        report("output_equals", identical, identical
            ? `Output equals "${wanted}"`
            : `Expected output to equal "${wanted}", but got: "${result.output}"`);
    }
    if (assertion.tool_called !== undefined) {
        const wantedTool = assertion.tool_called;
        const sameTool = result.tool === wantedTool;
        report("tool_called", sameTool, sameTool
            ? `Tool "${wantedTool}" was called`
            : `Expected tool "${wantedTool}" to be called, but got "${result.tool}"`);
    }
    if (assertion.latency_under !== undefined) {
        const limit = assertion.latency_under;
        const fastEnough = result.latency_ms < limit;
        report("latency_under", fastEnough, fastEnough
            ? `Latency ${result.latency_ms}ms < ${limit}ms`
            : `Expected latency under ${limit}ms, but got ${result.latency_ms}ms`);
    }
    if (assertion.schema_match !== undefined) {
        let parsedOutput;
        let parseFailure;
        try {
            parsedOutput = JSON.parse(result.output);
        }
        catch {
            parseFailure = `Output is not valid JSON: "${result.output}"`;
        }
        if (parseFailure) {
            report("schema_match", false, parseFailure);
        }
        else {
            const violation = validateJsonSchema(assertion.schema_match, parsedOutput);
            const conforms = violation === null;
            report("schema_match", conforms, conforms ? `Output matches JSON schema` : `Schema validation failed: ${violation}`);
        }
    }
    return outcomes;
}
166
/**
 * Run the synchronous assertions for a step and summarise them.
 *
 * @param assertions - Assertion object forwarded to `evaluateAssertion`.
 * @param result - Step result forwarded to `evaluateAssertion`.
 * @returns `{ passed, results }` where `passed` is true only when no
 *   individual result failed (vacuously true when nothing was evaluated).
 */
export function evaluateAllAssertions(assertions, result) {
    const outcomes = evaluateAssertion(assertions, result);
    const anyFailed = outcomes.some((outcome) => !outcome.passed);
    return { passed: !anyFailed, results: outcomes };
}
174
/**
 * Asynchronous variant of the step evaluation: runs the synchronous
 * assertions first and then, when `assertions.llm_judge` is defined, awaits
 * the LLM judge and appends its result.
 *
 * When the judge config carries no `expected` value, the step output itself is
 * passed to the judge as the expected text.
 *
 * @param assertions - Assertion object; may contain an `llm_judge` config.
 * @param result - Step result whose `output` is judged.
 * @returns Promise of `{ passed, results }`; `passed` is true only when every result passed.
 */
export async function evaluateAllAssertionsAsync(assertions, result) {
    const outcomes = evaluateAssertion(assertions, result);
    const judgeConfig = assertions.llm_judge;
    if (judgeConfig !== undefined) {
        const reference = judgeConfig.expected ?? result.output;
        outcomes.push(await runLlmJudge(judgeConfig, result.output, reference));
    }
    const anyFailed = outcomes.some((outcome) => !outcome.passed);
    return { passed: !anyFailed, results: outcomes };
}
@@ -0,0 +1,26 @@
1
+ /**
2
+ * Append-only audit log for mcp-eval-runner eval runs.
3
+ * Logs are written to ~/.mcp/eval-runner-audit.jsonl (one JSON object per line).
4
+ * Supports recording individual entries and exporting filtered ranges.
5
+ */
6
/**
 * One record of the audit log; each record is serialised as a single JSON line.
 */
export interface AuditEntry {
    /** Date string for the entry; range filtering on export parses it with `Date`. */
    timestamp: string;
    /** Identifier of the run this entry belongs to. */
    run_id: string;
    /** Name of the fixture that was evaluated. */
    fixture_name: string;
    /** Outcome flag recorded for the fixture. */
    passed: boolean;
    /** Recorded duration in milliseconds. */
    duration_ms: number;
    /** Optional identifier of the user associated with the run. */
    user_id?: string;
}
14
/**
 * File-backed, append-only audit log (one JSON object per line).
 */
export declare class AuditLog {
    private filePath;
    /**
     * @param filePath - Location of the log file; when omitted the default
     *   `~/.mcp/eval-runner-audit.jsonl` is used.
     */
    constructor(filePath?: string);
    /**
     * Write `entry` as one new line at the end of the log file.
     */
    record(entry: AuditEntry): void;
    /**
     * Read the log back, optionally restricted to a timestamp window.
     *
     * @param from - Inclusive lower bound (ISO date string).
     * @param to - Inclusive upper bound (ISO date string).
     */
    export(from?: string, to?: string): AuditEntry[];
}
@@ -0,0 +1,57 @@
1
+ /**
2
+ * Append-only audit log for mcp-eval-runner eval runs.
3
+ * Logs are written to ~/.mcp/eval-runner-audit.jsonl (one JSON object per line).
4
+ * Supports recording individual entries and exporting filtered ranges.
5
+ */
6
+ import fs from "fs";
7
+ import path from "path";
8
+ import os from "os";
9
/** Log location used when the constructor receives no explicit path. */
const DEFAULT_AUDIT_PATH = path.join(os.homedir(), ".mcp", "eval-runner-audit.jsonl");
/**
 * Append-only audit log stored as JSON Lines (one JSON object per line).
 */
export class AuditLog {
    filePath;
    /**
     * @param filePath - Log file to use; defaults to `~/.mcp/eval-runner-audit.jsonl`.
     *   The parent directory is created if it does not exist yet.
     */
    constructor(filePath) {
        this.filePath = filePath ?? DEFAULT_AUDIT_PATH;
        // Ensure parent directory exists
        fs.mkdirSync(path.dirname(this.filePath), { recursive: true });
    }
    /**
     * Append a single audit entry to the log file.
     *
     * @param entry - Entry to serialise; written synchronously as one line.
     */
    record(entry) {
        const line = JSON.stringify(entry) + "\n";
        fs.appendFileSync(this.filePath, line, "utf-8");
    }
    /**
     * Export audit entries, optionally filtered by timestamp range [from, to].
     *
     * Lines that are blank, are not valid JSON, or decode to something other
     * than a JSON object are skipped. When a bound is given, entries whose
     * `timestamp` cannot be parsed are excluded, and a bound that cannot be
     * parsed itself matches nothing.
     *
     * @param from - Inclusive lower bound (ISO date string).
     * @param to - Inclusive upper bound (ISO date string).
     * @returns Matching entries in file order; empty when the file does not exist.
     */
    export(from, to) {
        if (!fs.existsSync(this.filePath)) {
            return [];
        }
        const entries = [];
        for (const rawLine of fs.readFileSync(this.filePath, "utf-8").split("\n")) {
            const line = rawLine.trim();
            if (!line)
                continue;
            let parsed;
            try {
                parsed = JSON.parse(line);
            }
            catch {
                // Skip malformed lines
                continue;
            }
            // `null`, numbers, strings and arrays are valid JSON but not audit
            // entries; keeping them would make the timestamp filter throw.
            if (parsed !== null && typeof parsed === "object" && !Array.isArray(parsed)) {
                entries.push(parsed);
            }
        }
        if (from === undefined && to === undefined) {
            return entries;
        }
        const fromTs = from !== undefined ? new Date(from).getTime() : undefined;
        const toTs = to !== undefined ? new Date(to).getTime() : undefined;
        return entries.filter((entry) => {
            const ts = new Date(entry.timestamp).getTime();
            return (fromTs === undefined || ts >= fromTs) && (toTs === undefined || ts <= toTs);
        });
    }
}
package/dist/auth.d.ts ADDED
@@ -0,0 +1,15 @@
1
+ /**
2
+ * Authentication middleware for mcp-eval-runner HTTP server.
3
+ * Supports X-API-Key header validation and HMAC-SHA256 JWT Bearer token validation.
4
+ * Pass-through when neither MCP_API_KEY nor MCP_JWT_SECRET env vars are set.
5
+ */
6
+ import type { RequestHandler } from "express";
7
/**
 * Build the Express middleware that guards the HTTP server.
 *
 * The environment is consulted on every request:
 * - `MCP_API_KEY` set: the `X-API-Key` header must equal it, otherwise 401.
 * - `MCP_JWT_SECRET` set: `Authorization: Bearer <token>` must carry a JWT
 *   signed with HMAC-SHA256 using that secret, otherwise 401.
 * - Both set: both checks must succeed.
 * - Neither set: every request is passed through.
 */
export declare function createAuthMiddleware(): RequestHandler;
package/dist/auth.js ADDED
@@ -0,0 +1,83 @@
1
+ /**
2
+ * Authentication middleware for mcp-eval-runner HTTP server.
3
+ * Supports X-API-Key header validation and HMAC-SHA256 JWT Bearer token validation.
4
+ * Pass-through when neither MCP_API_KEY nor MCP_JWT_SECRET env vars are set.
5
+ */
6
+ import crypto from "crypto";
7
/**
 * Turn a base64url string into the UTF-8 text it encodes.
 *
 * The URL-safe characters are mapped back to the standard alphabet and the
 * `=` padding that base64url omits is restored before decoding.
 *
 * @param input - base64url text, with or without padding.
 * @returns The decoded bytes interpreted as UTF-8.
 */
function _base64urlDecode(input) {
    const standardAlphabet = input.replace(/_/g, "/").replace(/-/g, "+");
    const missingPadding = (4 - (standardAlphabet.length % 4)) % 4;
    const padded = standardAlphabet + "=".repeat(missingPadding);
    return Buffer.from(padded, "base64").toString("utf-8");
}
16
+ /**
17
+ * Verify a JWT token using HMAC-SHA256 with the given secret.
18
+ * JWT format: base64url(header).base64url(payload).base64url(signature)
19
+ * Signature = HMAC-SHA256(secret, "header.payload")
20
+ */
21
+ function verifyJwt(token, secret) {
22
+ const parts = token.split(".");
23
+ if (parts.length !== 3) {
24
+ return false;
25
+ }
26
+ const [headerB64, payloadB64, signatureB64] = parts;
27
+ const signingInput = `${headerB64}.${payloadB64}`;
28
+ // Compute expected signature
29
+ const expectedSig = crypto.createHmac("sha256", secret).update(signingInput).digest("base64url");
30
+ // Constant-time comparison to prevent timing attacks
31
+ try {
32
+ const expectedBuf = Buffer.from(expectedSig, "base64url");
33
+ const actualBuf = Buffer.from(signatureB64, "base64url");
34
+ if (expectedBuf.length !== actualBuf.length) {
35
+ return false;
36
+ }
37
+ return crypto.timingSafeEqual(expectedBuf, actualBuf);
38
+ }
39
+ catch {
40
+ return false;
41
+ }
42
+ }
43
+ /**
44
+ * Create Express authentication middleware.
45
+ *
46
+ * Behavior:
47
+ * - If MCP_API_KEY is set, validates X-API-Key header; returns 401 if missing or mismatched.
48
+ * - If MCP_JWT_SECRET is set, validates Authorization: Bearer <token> as HMAC-SHA256 JWT.
49
+ * - If neither env var is set, passes through all requests.
50
+ */
51
+ export function createAuthMiddleware() {
52
+ return (req, res, next) => {
53
+ const apiKey = process.env.MCP_API_KEY;
54
+ const jwtSecret = process.env.MCP_JWT_SECRET;
55
+ // If neither auth mechanism is configured, pass through
56
+ if (!apiKey && !jwtSecret) {
57
+ next();
58
+ return;
59
+ }
60
+ // Validate X-API-Key if MCP_API_KEY is set
61
+ if (apiKey) {
62
+ const providedKey = req.headers["x-api-key"];
63
+ if (!providedKey || providedKey !== apiKey) {
64
+ res.status(401).json({ error: "Unauthorized: invalid or missing API key" });
65
+ return;
66
+ }
67
+ }
68
+ // Validate JWT Bearer token if MCP_JWT_SECRET is set
69
+ if (jwtSecret) {
70
+ const authHeader = req.headers["authorization"];
71
+ if (!authHeader || !authHeader.startsWith("Bearer ")) {
72
+ res.status(401).json({ error: "Unauthorized: missing Bearer token" });
73
+ return;
74
+ }
75
+ const token = authHeader.slice("Bearer ".length).trim();
76
+ if (!verifyJwt(token, jwtSecret)) {
77
+ res.status(401).json({ error: "Unauthorized: invalid JWT signature" });
78
+ return;
79
+ }
80
+ }
81
+ next();
82
+ };
83
+ }
package/dist/db.d.ts ADDED
@@ -0,0 +1,40 @@
1
+ /**
2
+ * SQLite schema and queries for MCP Eval Runner run history.
3
+ * Uses the built-in node:sqlite module (Node.js >= 22.5).
4
+ */
5
/**
 * Shape of one row of the `runs` table.
 */
export interface RunRecord {
    /** Primary key of the run. */
    id: string;
    /** Suite the run belongs to; used to filter run history. */
    suite_name: string;
    /** Numeric start timestamp; runs are ordered by it (unit presumably epoch ms — TODO confirm). */
    started_at: number;
    /** Numeric end timestamp, or `null` when none has been stored. */
    ended_at: number | null;
    /** Number of cases in the run. */
    total_cases: number;
    /** Number of cases that passed. */
    passed: number;
    /** Number of cases that failed. */
    failed: number;
    /** Report format recorded for the run. */
    format: string;
}
15
/**
 * Shape of one row of the `case_results` table.
 */
export interface CaseResultRecord {
    /** Primary key of the case result. */
    id: string;
    /** Identifier of the owning run (`runs.id`). */
    run_id: string;
    /** Name of the evaluated case. */
    case_name: string;
    /** Outcome of the case. */
    status: "pass" | "fail" | "error";
    /** Duration in milliseconds, or `null` when not recorded. */
    duration_ms: number | null;
    /** Error text, or `null` when there is none. */
    error_message: string | null;
    /** Serialised assertion data, or `null` (presumably a JSON string — TODO confirm against the writer). */
    assertions_json: string | null;
    /** Numeric creation timestamp; case results are listed in ascending order of it. */
    created_at: number;
}
25
/**
 * Replace a leading `~` (the path `~` itself or a `~/` prefix) with the
 * current user's home directory; any other path is returned as given.
 */
export declare function expandHome(p: string): string;
29
/**
 * Synchronous SQLite-backed store for eval run history.
 */
export declare class EvalDb {
    private db;
    /** Open (creating if needed) the database at `dbPath`; a leading `~` is expanded. */
    constructor(dbPath: string);
    /** Insert a new row into `runs`. */
    insertRun(run: RunRecord): void;
    /** Update the given summary columns of the run with this id. */
    updateRun(id: string, updates: Partial<Pick<RunRecord, "ended_at" | "total_cases" | "passed" | "failed">>): void;
    /** Insert a new row into `case_results`. */
    insertCaseResult(result: CaseResultRecord): void;
    /** Most recently started run, optionally restricted to one suite. */
    getLastRun(suiteName?: string): RunRecord | undefined;
    /** Look a run up by primary key. */
    getRunById(id: string): RunRecord | undefined;
    /** Case results of a run, oldest first. */
    getCaseResultsForRun(runId: string): CaseResultRecord[];
    /** Most recently started runs, newest first, capped at `limit`. */
    getAllRuns(limit?: number): RunRecord[];
    /** Close the underlying database handle. */
    close(): void;
}
package/dist/db.js ADDED
@@ -0,0 +1,94 @@
1
+ /**
2
+ * SQLite schema and queries for MCP Eval Runner run history.
3
+ * Uses the built-in node:sqlite module (Node.js >= 22.5).
4
+ */
5
+ // node:sqlite is a stable built-in since Node 22.5
6
+ import { DatabaseSync } from "node:sqlite";
7
+ import path from "path";
8
+ import os from "os";
9
+ import fs from "fs";
10
// DDL executed each time a database is opened; IF NOT EXISTS makes it safe to
// run against an already-initialised file.
const SCHEMA = `
  CREATE TABLE IF NOT EXISTS runs (
    id TEXT PRIMARY KEY,
    suite_name TEXT NOT NULL,
    started_at INTEGER NOT NULL,
    ended_at INTEGER,
    total_cases INTEGER NOT NULL DEFAULT 0,
    passed INTEGER NOT NULL DEFAULT 0,
    failed INTEGER NOT NULL DEFAULT 0,
    format TEXT NOT NULL DEFAULT 'console'
  );

  CREATE TABLE IF NOT EXISTS case_results (
    id TEXT PRIMARY KEY,
    run_id TEXT NOT NULL,
    case_name TEXT NOT NULL,
    status TEXT NOT NULL,
    duration_ms INTEGER,
    error_message TEXT,
    assertions_json TEXT,
    created_at INTEGER NOT NULL,
    FOREIGN KEY (run_id) REFERENCES runs(id)
  );
`;
34
/**
 * Resolve a leading `~` to the current user's home directory.
 *
 * Only the bare path `~` and paths starting with `~/` are rewritten; every
 * other input (including `~name/...`) is returned unchanged.
 *
 * @param p - Path that may start with `~`.
 * @returns The path with the home directory substituted, or `p` itself.
 */
export function expandHome(p) {
    const refersToHome = p === "~" || p.startsWith("~/");
    // Dropping the "~" leaves "" or "/rest", which path.join appends to the home dir.
    return refersToHome ? path.join(os.homedir(), p.slice(1)) : p;
}
43
+ export class EvalDb {
44
+ db;
45
+ constructor(dbPath) {
46
+ const resolved = expandHome(dbPath);
47
+ const dir = path.dirname(resolved);
48
+ fs.mkdirSync(dir, { recursive: true });
49
+ this.db = new DatabaseSync(resolved);
50
+ this.db.exec(SCHEMA);
51
+ }
52
+ insertRun(run) {
53
+ const stmt = this.db.prepare(`INSERT INTO runs (id, suite_name, started_at, ended_at, total_cases, passed, failed, format)
54
+ VALUES (?, ?, ?, ?, ?, ?, ?, ?)`);
55
+ stmt.run(run.id, run.suite_name, run.started_at, run.ended_at, run.total_cases, run.passed, run.failed, run.format);
56
+ }
57
+ updateRun(id, updates) {
58
+ const entries = Object.entries(updates);
59
+ if (entries.length === 0)
60
+ return;
61
+ const fields = entries.map(([k]) => `${k} = ?`).join(", ");
62
+ const values = entries.map(([, v]) => v);
63
+ this.db.prepare(`UPDATE runs SET ${fields} WHERE id = ?`).run(...values, id);
64
+ }
65
+ insertCaseResult(result) {
66
+ const stmt = this.db.prepare(`INSERT INTO case_results (id, run_id, case_name, status, duration_ms, error_message, assertions_json, created_at)
67
+ VALUES (?, ?, ?, ?, ?, ?, ?, ?)`);
68
+ stmt.run(result.id, result.run_id, result.case_name, result.status, result.duration_ms, result.error_message, result.assertions_json, result.created_at);
69
+ }
70
+ getLastRun(suiteName) {
71
+ if (suiteName) {
72
+ return this.db
73
+ .prepare(`SELECT * FROM runs WHERE suite_name = ? ORDER BY started_at DESC LIMIT 1`)
74
+ .get(suiteName);
75
+ }
76
+ return this.db.prepare(`SELECT * FROM runs ORDER BY started_at DESC LIMIT 1`).get();
77
+ }
78
+ getRunById(id) {
79
+ return this.db.prepare(`SELECT * FROM runs WHERE id = ?`).get(id);
80
+ }
81
+ getCaseResultsForRun(runId) {
82
+ return this.db
83
+ .prepare(`SELECT * FROM case_results WHERE run_id = ? ORDER BY created_at ASC`)
84
+ .all(runId);
85
+ }
86
+ getAllRuns(limit = 50) {
87
+ return this.db
88
+ .prepare(`SELECT * FROM runs ORDER BY started_at DESC LIMIT ?`)
89
+ .all(limit);
90
+ }
91
+ close() {
92
+ this.db.close();
93
+ }
94
+ }
@@ -0,0 +1,27 @@
1
+ /**
2
+ * CI deployment gate for mcp-eval-runner.
3
+ * Evaluates whether a recent set of runs meets a minimum pass rate threshold.
4
+ * Intended for use in CI pipelines to block deploys on regressions.
5
+ */
6
+ import type { EvalDb } from "./db.js";
7
/**
 * Settings for one deployment-gate evaluation.
 */
export interface GateConfig {
    /** When set, only runs whose suite name equals this value are counted. */
    workflow_name?: string;
    /** Lowest pass rate that still lets the gate pass, as a fraction between 0.0 and 1.0. */
    min_pass_rate: number;
    /** How many of the most recent runs to aggregate; 10 when omitted. */
    lookback_runs?: number;
}
15
/**
 * Outcome of a deployment-gate evaluation.
 */
export interface GateResult {
    /** Whether the gate passed. */
    passed: boolean;
    /** Aggregate pass rate over the runs that were considered. */
    current_rate: number;
    /** The minimum pass rate the evaluation was configured with. */
    threshold: number;
    /** Number of runs that were considered. */
    run_count: number;
}
21
/**
 * Decide whether recent run history clears the configured pass-rate bar.
 *
 * The most recent `lookback_runs` runs (optionally limited to one suite) are
 * aggregated into a single pass rate, which is then compared with
 * `min_pass_rate`.
 */
export declare function evaluateGate(db: EvalDb, config: GateConfig): GateResult;
@@ -0,0 +1,43 @@
1
+ /**
2
+ * CI deployment gate for mcp-eval-runner.
3
+ * Evaluates whether a recent set of runs meets a minimum pass rate threshold.
4
+ * Intended for use in CI pipelines to block deploys on regressions.
5
+ */
6
+ /**
7
+ * Evaluate the deployment gate against recent run history.
8
+ *
9
+ * Queries the last `lookback_runs` runs (optionally filtered by suite name),
10
+ * computes the aggregate pass rate, and compares it to `min_pass_rate`.
11
+ */
12
+ export function evaluateGate(db, config) {
13
+ const lookback = config.lookback_runs ?? 10;
14
+ const threshold = config.min_pass_rate;
15
+ // Fetch recent runs from the database
16
+ const allRuns = db.getAllRuns(lookback * 10); // over-fetch then filter
17
+ const relevantRuns = config.workflow_name
18
+ ? allRuns.filter((r) => r.suite_name === config.workflow_name).slice(0, lookback)
19
+ : allRuns.slice(0, lookback);
20
+ const runCount = relevantRuns.length;
21
+ if (runCount === 0) {
22
+ return {
23
+ passed: false,
24
+ current_rate: 0,
25
+ threshold,
26
+ run_count: 0,
27
+ };
28
+ }
29
+ // Aggregate total_cases and passed across all relevant runs
30
+ let totalCases = 0;
31
+ let totalPassed = 0;
32
+ for (const run of relevantRuns) {
33
+ totalCases += run.total_cases;
34
+ totalPassed += run.passed;
35
+ }
36
+ const currentRate = totalCases > 0 ? totalPassed / totalCases : 0;
37
+ return {
38
+ passed: currentRate >= threshold,
39
+ current_rate: currentRate,
40
+ threshold,
41
+ run_count: runCount,
42
+ };
43
+ }
@@ -0,0 +1,26 @@
1
+ /**
2
+ * Shared fixture discovery and publishing for mcp-eval-runner.
3
+ * Supports discovering fixtures across multiple directories and publishing
4
+ * (copying) fixtures to a destination directory.
5
+ */
6
/**
 * Summary of one discovered fixture file.
 */
export interface FixtureEntry {
    /** Fixture name; discovery de-duplicates on it. */
    name: string;
    /** Location of the fixture file. */
    path: string;
    /** Number of suites counted for the fixture. */
    suite_count: number;
    /** Number of cases counted for the fixture. */
    case_count: number;
}
12
/**
 * Scan the given directories for fixture files.
 *
 * Entries are de-duplicated by name; when the same name appears more than
 * once, the first occurrence is kept.
 *
 * @param dirs - Directory paths to scan.
 * @returns One entry per distinct fixture name.
 */
export declare function discoverFixtures(dirs: string[]): FixtureEntry[];
19
/**
 * Publish a fixture by copying its YAML/JSON file into `dest`.
 * The destination directory is created when it does not exist yet.
 *
 * @param fixture - Either a fixture object or the path of a fixture file.
 * @param dest - Directory that receives the published fixture.
 */
export declare function publishFixture(fixture: unknown, dest: string): void;