mcp-eval-runner 1.0.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/.env.example +39 -0
- package/CHANGELOG.md +67 -0
- package/LICENSE +21 -0
- package/README.md +328 -0
- package/dist/assertions.d.ts +63 -0
- package/dist/assertions.js +187 -0
- package/dist/audit-log.d.ts +26 -0
- package/dist/audit-log.js +57 -0
- package/dist/auth.d.ts +15 -0
- package/dist/auth.js +83 -0
- package/dist/db.d.ts +40 -0
- package/dist/db.js +94 -0
- package/dist/deployment-gate.d.ts +27 -0
- package/dist/deployment-gate.js +43 -0
- package/dist/fixture-library.d.ts +26 -0
- package/dist/fixture-library.js +85 -0
- package/dist/fixture.d.ts +87 -0
- package/dist/fixture.js +170 -0
- package/dist/http-server.d.ts +7 -0
- package/dist/http-server.js +34 -0
- package/dist/index.d.ts +15 -0
- package/dist/index.js +158 -0
- package/dist/llm-judge.d.ts +24 -0
- package/dist/llm-judge.js +139 -0
- package/dist/rate-limiter.d.ts +13 -0
- package/dist/rate-limiter.js +36 -0
- package/dist/reporter.d.ts +8 -0
- package/dist/reporter.js +163 -0
- package/dist/runner.d.ts +57 -0
- package/dist/runner.js +339 -0
- package/dist/server.d.ts +22 -0
- package/dist/server.js +583 -0
- package/dist/tools/html_report.d.ts +8 -0
- package/dist/tools/html_report.js +188 -0
- package/dist/tools/manage.d.ts +11 -0
- package/dist/tools/manage.js +41 -0
- package/dist/tools/report.d.ts +12 -0
- package/dist/tools/report.js +120 -0
- package/dist/tools/run.d.ts +20 -0
- package/dist/tools/run.js +166 -0
- package/dist/tools/scaffold.d.ts +11 -0
- package/dist/tools/scaffold.js +90 -0
- package/evals/reference/mcp-fetch.yaml +46 -0
- package/evals/reference/mcp-filesystem.yaml +63 -0
- package/evals/reference/mcp-memory.yaml +70 -0
- package/evals/reference/step-piping-example.yaml +25 -0
- package/evals/smoke.yaml +12 -0
- package/package.json +67 -0
package/.env.example
ADDED
|
@@ -0,0 +1,39 @@
|
|
|
1
|
+
# mcp-eval-runner environment variables
|
|
2
|
+
# Copy this file to .env and fill in values as needed.
|
|
3
|
+
# All variables are optional — the server runs without them,
|
|
4
|
+
# but certain features (auth, LLM judge) will be disabled.
|
|
5
|
+
|
|
6
|
+
# ---------------------------------------------------------------------------
|
|
7
|
+
# Authentication (src/auth.ts)
|
|
8
|
+
# ---------------------------------------------------------------------------
|
|
9
|
+
|
|
10
|
+
# Static API key for X-API-Key header validation.
|
|
11
|
+
# When set, every request to /mcp must include the header:
|
|
12
|
+
# X-API-Key: <value>
|
|
13
|
+
MCP_API_KEY=your-api-key-here
|
|
14
|
+
|
|
15
|
+
# Secret for HMAC-SHA256 JWT Bearer token validation.
|
|
16
|
+
# When set, requests to /mcp must include:
|
|
17
|
+
# Authorization: Bearer <jwt>
|
|
18
|
+
MCP_JWT_SECRET=your-jwt-secret-here
|
|
19
|
+
|
|
20
|
+
# ---------------------------------------------------------------------------
|
|
21
|
+
# LLM Judge (src/llm-judge.ts)
|
|
22
|
+
# ---------------------------------------------------------------------------
|
|
23
|
+
|
|
24
|
+
# API key passed as a Bearer token to the external LLM API.
|
|
25
|
+
# Required to use the `llm_judge` assertion type in fixture files.
|
|
26
|
+
LLM_JUDGE_API_KEY=your-llm-judge-api-key-here
|
|
27
|
+
|
|
28
|
+
# Base URL of the OpenAI-compatible chat completions endpoint.
|
|
29
|
+
# Example: https://api.openai.com/v1/chat/completions
|
|
30
|
+
LLM_JUDGE_BASE_URL=https://api.openai.com/v1/chat/completions
|
|
31
|
+
|
|
32
|
+
# ---------------------------------------------------------------------------
|
|
33
|
+
# Fixture Library (src/server.ts)
|
|
34
|
+
# ---------------------------------------------------------------------------
|
|
35
|
+
|
|
36
|
+
# Colon-separated list of additional directories to scan for fixture files
|
|
37
|
+
# when using the list_fixtures tool without explicit dirs.
|
|
38
|
+
# Example: /home/user/fixtures:/opt/shared-fixtures
|
|
39
|
+
FIXTURE_LIBRARY_DIRS=/path/to/fixtures:/path/to/more/fixtures
|
package/CHANGELOG.md
ADDED
|
@@ -0,0 +1,67 @@
|
|
|
1
|
+
# Changelog
|
|
2
|
+
|
|
3
|
+
All notable changes to MCP Eval Runner will be documented in this file.
|
|
4
|
+
|
|
5
|
+
The format is based on [Keep a Changelog](https://keepachangelog.com/en/1.1.0/),
|
|
6
|
+
and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0.html).
|
|
7
|
+
|
|
8
|
+
## [Unreleased]
|
|
9
|
+
|
|
10
|
+
## [1.0.1] - 2026-03-23
|
|
11
|
+
|
|
12
|
+
### Fixed
|
|
13
|
+
|
|
14
|
+
- **smoke fixture** (`evals/smoke.yaml`): the fixture previously ran in simulation mode because it had no `server` block. In simulation mode, `output_contains` assertions evaluate against `expected_output`, which was empty, so the assertion always failed. Fixed by adding a `server` block (`command: node`, `args: ["dist/index.js"]`) so the fixture now runs in live mode against the eval runner itself and the `output_contains: "smoke"` assertion passes against the real `list_cases` response.
|
|
15
|
+
|
|
16
|
+
## [1.0.0] - 2026-03-12
|
|
17
|
+
|
|
18
|
+
### Added
|
|
19
|
+
|
|
20
|
+
- `.env.example` documenting `MCP_API_KEY`, `MCP_JWT_SECRET`, `LLM_JUDGE_API_KEY`, `LLM_JUDGE_BASE_URL`, and `FIXTURE_LIBRARY_DIRS`.
|
|
21
|
+
|
|
22
|
+
### Changed
|
|
23
|
+
|
|
24
|
+
- `@modelcontextprotocol/sdk` upgraded from `^1.0.0` to `^1.12.0`.
|
|
25
|
+
- `chokidar` upgraded from `^3.x` to `^5.0.0` (no API changes required — already ESM).
|
|
26
|
+
- `@types/node` upgraded from `^22.x` to `^24.12.0` (Node 24 LTS).
|
|
27
|
+
- `eslint` upgraded from `^9.x` to `^10.0.3`; `eslint-config-prettier` from `^9.x` to `^10.1.8`.
|
|
28
|
+
- `yargs` upgraded from `^17.x` to `^18.0.0`.
|
|
29
|
+
- Added `author`, `license`, `repository`, and `homepage` fields to `package.json`.
|
|
30
|
+
|
|
31
|
+
### Fixed
|
|
32
|
+
|
|
33
|
+
- Prefixed unused `base64urlDecode` in `src/auth.ts` and `writeFixture` in `tests/fixture-library.test.ts` with `_` to satisfy `no-unused-vars` lint rule.
|
|
34
|
+
|
|
35
|
+
### Security
|
|
36
|
+
|
|
37
|
+
- Resolved **GHSA-67mh-4wv8-2f99** (`esbuild` ≤ 0.24.2 dev-server cross-origin exposure) by upgrading `vitest` and `@vitest/coverage-v8` to `^4.1.0`. Affects local development only; not a production runtime concern.
|
|
38
|
+
|
|
39
|
+
## [0.2.0] - 2026-03-12
|
|
40
|
+
|
|
41
|
+
### Added
|
|
42
|
+
|
|
43
|
+
- **Extended assertions** (`src/assertions.ts`): additional assertion types beyond string equality — numeric comparisons, regex matching, and JSON path checks.
|
|
44
|
+
- **LLM judge** (`src/llm-judge.ts`): AI-powered assertion scoring via any OpenAI-compatible completions endpoint, configured via `LLM_JUDGE_API_KEY` and `LLM_JUDGE_BASE_URL`.
|
|
45
|
+
- **Deployment gate** (`src/deployment-gate.ts`): block CI promotion if the eval pass rate falls below a configurable threshold. Integrates with `run_deployment_gate` tool.
|
|
46
|
+
- **Fixture library** (`src/fixture-library.ts`): centralized fixture discovery across multiple directories; supports `FIXTURE_LIBRARY_DIRS` env var for additional scan paths.
|
|
47
|
+
- **Audit log** (`src/audit-log.ts`): append-only JSONL audit trail of every eval run.
|
|
48
|
+
- **JWT / API-key auth middleware** (`src/auth.ts`): HTTP transport protected via `MCP_API_KEY` or `MCP_JWT_SECRET`. stdio is unaffected.
|
|
49
|
+
- **Per-client rate limiter** (`src/rate-limiter.ts`): sliding-window request throttle on the HTTP transport.
|
|
50
|
+
- **New tools**: `run_deployment_gate`, `search_fixtures`, `add_fixture`, `llm_judge`.
|
|
51
|
+
- **`npm run inspect` script**: launches MCP Inspector for interactive pre-publish verification.
|
|
52
|
+
- MCP Inspector verification instructions added to README.
|
|
53
|
+
- Tests for assertions, audit log, auth, deployment gate, fixture library, LLM judge, and rate limiter.
|
|
54
|
+
|
|
55
|
+
## [0.1.0] - 2026-03-12
|
|
56
|
+
|
|
57
|
+
### Added
|
|
58
|
+
|
|
59
|
+
- Initial public release of `mcp-eval-runner`.
|
|
60
|
+
- YAML fixture format: `prompt → expected tool calls → expected outputs`.
|
|
61
|
+
- Regression suite execution directly from MCP clients (Claude Code, Cursor, etc.).
|
|
62
|
+
- Pass/fail results with structured diffs for failed assertions.
|
|
63
|
+
- Support for running individual fixtures or full suites.
|
|
64
|
+
- Watch mode for continuous re-evaluation during development (`chokidar`-based).
|
|
65
|
+
- Streamable HTTP transport via `--http-port` flag (default: disabled, uses stdio).
|
|
66
|
+
- GitHub Actions CI workflow running build, test, and lint on push/PR to `main`.
|
|
67
|
+
- Vitest test suite with coverage via `@vitest/coverage-v8`.
|
package/LICENSE
ADDED
|
@@ -0,0 +1,21 @@
|
|
|
1
|
+
MIT License
|
|
2
|
+
|
|
3
|
+
Copyright (c) 2026 MCP Eval Runner contributors
|
|
4
|
+
|
|
5
|
+
Permission is hereby granted, free of charge, to any person obtaining a copy
|
|
6
|
+
of this software and associated documentation files (the "Software"), to deal
|
|
7
|
+
in the Software without restriction, including without limitation the rights
|
|
8
|
+
to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
|
|
9
|
+
copies of the Software, and to permit persons to whom the Software is
|
|
10
|
+
furnished to do so, subject to the following conditions:
|
|
11
|
+
|
|
12
|
+
The above copyright notice and this permission notice shall be included in all
|
|
13
|
+
copies or substantial portions of the Software.
|
|
14
|
+
|
|
15
|
+
THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
|
|
16
|
+
IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
|
|
17
|
+
FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
|
|
18
|
+
AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
|
|
19
|
+
LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
|
|
20
|
+
OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
|
|
21
|
+
SOFTWARE.
|
package/README.md
ADDED
|
@@ -0,0 +1,328 @@
|
|
|
1
|
+
# MCP Eval Runner
|
|
2
|
+
|
|
3
|
+
npm `mcp-eval-runner` package
|
|
4
|
+
|
|
5
|
+
A standardized testing harness for MCP servers and agent workflows. Define test cases as YAML fixtures (steps → expected tool calls → expected outputs), run regression suites directly from your MCP client, and get pass/fail results with diffs — without leaving Claude Code or Cursor.
|
|
6
|
+
|
|
7
|
+
[Tool reference](#tools) | [Configuration](#configuration) | [Fixture format](#fixture-format) | [Contributing](#contributing) | [Troubleshooting](#troubleshooting) | [Design principles](#design-principles)
|
|
8
|
+
|
|
9
|
+
## Key features
|
|
10
|
+
|
|
11
|
+
- **YAML fixtures**: Test cases are plain files in version control — diffable, reviewable, and shareable.
|
|
12
|
+
- **Two execution modes**: Live mode spawns a real MCP server and calls tools via stdio; simulation mode runs assertions against `expected_output` without a server.
|
|
13
|
+
- **Composable assertions**: Combine `output_contains`, `output_not_contains`, `output_equals`, `output_matches`, `schema_match`, `tool_called`, `latency_under`, and `llm_judge` per step.
|
|
14
|
+
- **Step output piping**: Reference a previous step's output in downstream inputs via `{{steps.<step_id>.output}}`.
|
|
15
|
+
- **Regression reports**: Compare the current run to any past run and surface what changed.
|
|
16
|
+
- **Watch mode**: Automatically reruns the affected fixture when files change.
|
|
17
|
+
- **CI-ready**: Includes a GitHub Action for running evals on every config change.
|
|
18
|
+
|
|
19
|
+
## Requirements
|
|
20
|
+
|
|
21
|
+
- Node.js v22.5.0 or newer.
|
|
22
|
+
- npm.
|
|
23
|
+
|
|
24
|
+
## Getting started
|
|
25
|
+
|
|
26
|
+
Add the following config to your MCP client:
|
|
27
|
+
|
|
28
|
+
```json
|
|
29
|
+
{
|
|
30
|
+
"mcpServers": {
|
|
31
|
+
"eval-runner": {
|
|
32
|
+
"command": "npx",
|
|
33
|
+
"args": ["-y", "mcp-eval-runner@latest"]
|
|
34
|
+
}
|
|
35
|
+
}
|
|
36
|
+
}
|
|
37
|
+
```
|
|
38
|
+
|
|
39
|
+
By default, eval fixtures are loaded from `./evals/` in the current working directory. To use a different path:
|
|
40
|
+
|
|
41
|
+
```json
|
|
42
|
+
{
|
|
43
|
+
"mcpServers": {
|
|
44
|
+
"eval-runner": {
|
|
45
|
+
"command": "npx",
|
|
46
|
+
"args": ["-y", "mcp-eval-runner@latest", "--fixtures=~/my-project/evals"]
|
|
47
|
+
}
|
|
48
|
+
}
|
|
49
|
+
}
|
|
50
|
+
```
|
|
51
|
+
|
|
52
|
+
### MCP Client configuration
|
|
53
|
+
|
|
54
|
+
Amp · Claude Code · Cline · Cursor · VS Code · Windsurf · Zed
|
|
55
|
+
|
|
56
|
+
## Your first prompt
|
|
57
|
+
|
|
58
|
+
Create a file at `evals/smoke.yaml`. Use **live mode** (recommended) by including a `server` block:
|
|
59
|
+
|
|
60
|
+
```yaml
|
|
61
|
+
name: smoke
|
|
62
|
+
description: "Verify eval runner itself is working"
|
|
63
|
+
server:
|
|
64
|
+
command: node
|
|
65
|
+
args: ["dist/index.js"]
|
|
66
|
+
steps:
|
|
67
|
+
- id: list_check
|
|
68
|
+
description: "List available test cases"
|
|
69
|
+
tool: list_cases
|
|
70
|
+
input: {}
|
|
71
|
+
expect:
|
|
72
|
+
output_contains: "smoke"
|
|
73
|
+
```
|
|
74
|
+
|
|
75
|
+
Then enter the following in your MCP client:
|
|
76
|
+
|
|
77
|
+
```
|
|
78
|
+
Run the eval suite.
|
|
79
|
+
```
|
|
80
|
+
|
|
81
|
+
Your client should return a pass/fail result for the smoke test.
|
|
82
|
+
|
|
83
|
+
## Fixture format
|
|
84
|
+
|
|
85
|
+
Fixtures are YAML (or JSON) files placed in the fixtures directory. Each file defines one test case.
|
|
86
|
+
|
|
87
|
+
### Top-level fields
|
|
88
|
+
|
|
89
|
+
| Field | Required | Description |
|
|
90
|
+
| ------------- | -------- | ----------------------------------------------------------------------------------------- |
|
|
91
|
+
| `name` | Yes | Unique name for the test case |
|
|
92
|
+
| `description` | No | Human-readable description |
|
|
93
|
+
| `server` | No | Server config — if present, runs in **live mode**; if absent, runs in **simulation mode** |
|
|
94
|
+
| `steps` | Yes | Array of steps to execute |
|
|
95
|
+
|
|
96
|
+
### `server` block (live mode)
|
|
97
|
+
|
|
98
|
+
```yaml
|
|
99
|
+
server:
|
|
100
|
+
command: node # executable to spawn
|
|
101
|
+
args: ["dist/index.js"] # arguments
|
|
102
|
+
env: # optional environment variables
|
|
103
|
+
MY_VAR: "value"
|
|
104
|
+
```
|
|
105
|
+
|
|
106
|
+
When `server` is present the eval runner spawns the server as a child process, connects via MCP stdio transport, and calls each step's tool against the live server.
|
|
107
|
+
|
|
108
|
+
### `steps` array
|
|
109
|
+
|
|
110
|
+
Each step has the following fields:
|
|
111
|
+
|
|
112
|
+
| Field | Required | Description |
|
|
113
|
+
| ----------------- | -------- | ------------------------------------------------------------- |
|
|
114
|
+
| `id` | Yes | Unique identifier within the fixture (used for output piping) |
|
|
115
|
+
| `tool` | Yes | MCP tool name to call |
|
|
116
|
+
| `description` | No | Human-readable step description |
|
|
117
|
+
| `input` | No | Key-value map of arguments passed to the tool (default: `{}`) |
|
|
118
|
+
| `expected_output` | No | Literal string used as output in simulation mode |
|
|
119
|
+
| `expect` | No | Assertions evaluated against the step output |
|
|
120
|
+
|
|
121
|
+
### Execution modes
|
|
122
|
+
|
|
123
|
+
**Live mode** — fixture has a `server` block:
|
|
124
|
+
|
|
125
|
+
- The server is spawned and each step calls the named tool via MCP stdio.
|
|
126
|
+
- Assertions run against the real tool response.
|
|
127
|
+
- Errors from the server cause the step (and by default the case) to fail immediately.
|
|
128
|
+
|
|
129
|
+
**Simulation mode** — no `server` block:
|
|
130
|
+
|
|
131
|
+
- No server is started.
|
|
132
|
+
- Each step's output is taken from `expected_output` (or empty string if absent).
|
|
133
|
+
- Assertions run against that static output.
|
|
134
|
+
- Useful for authoring and CI dry-runs, but `output_contains` assertions will always fail if `expected_output` is not set.
|
|
135
|
+
|
|
136
|
+
### Assertion types
|
|
137
|
+
|
|
138
|
+
All assertions go inside a step's `expect` block:
|
|
139
|
+
|
|
140
|
+
```yaml
|
|
141
|
+
expect:
|
|
142
|
+
output_contains: "substring" # output includes this text
|
|
143
|
+
output_not_contains: "error" # output must NOT include this text
|
|
144
|
+
output_equals: "exact string" # output exactly matches
|
|
145
|
+
output_matches: "regex pattern" # output matches a regular expression
|
|
146
|
+
tool_called: "tool_name" # verifies which tool was called
|
|
147
|
+
latency_under: 500 # latency in ms must be below this threshold
|
|
148
|
+
schema_match: # output (parsed as JSON) matches JSON Schema
|
|
149
|
+
type: object
|
|
150
|
+
required: [id]
|
|
151
|
+
properties:
|
|
152
|
+
id:
|
|
153
|
+
type: number
|
|
154
|
+
```
|
|
155
|
+
|
|
156
|
+
Multiple assertions in one `expect` block are all evaluated; the step fails if any assertion fails.
|
|
157
|
+
|
|
158
|
+
### Step output piping
|
|
159
|
+
|
|
160
|
+
Reference the output of a previous step in a downstream step's `input` using `{{steps.<step_id>.output}}`:
|
|
161
|
+
|
|
162
|
+
```yaml
|
|
163
|
+
steps:
|
|
164
|
+
- id: search_step
|
|
165
|
+
tool: search
|
|
166
|
+
input:
|
|
167
|
+
query: "mcp eval runner"
|
|
168
|
+
expected_output: "result: mcp-eval-runner v1.0"
|
|
169
|
+
expect:
|
|
170
|
+
output_contains: "mcp-eval-runner"
|
|
171
|
+
|
|
172
|
+
- id: summarize_step
|
|
173
|
+
tool: summarize
|
|
174
|
+
input:
|
|
175
|
+
text: "{{steps.search_step.output}}"
|
|
176
|
+
expected_output: "Summary: mcp-eval-runner v1.0"
|
|
177
|
+
expect:
|
|
178
|
+
output_contains: "Summary"
|
|
179
|
+
```
|
|
180
|
+
|
|
181
|
+
Piping works in both live mode and simulation mode.
|
|
182
|
+
|
|
183
|
+
### Note on `create_test_case`
|
|
184
|
+
|
|
185
|
+
Fixtures created with the `create_test_case` tool do not include a `server` block. They always run in simulation mode. To use live mode, add a `server` block manually to the generated YAML file.
|
|
186
|
+
|
|
187
|
+
## Tools
|
|
188
|
+
|
|
189
|
+
### Running
|
|
190
|
+
|
|
191
|
+
- `run_suite` — execute all fixtures in the fixtures directory; returns a pass/fail summary
|
|
192
|
+
- `run_case` — run a single fixture by name
|
|
193
|
+
- `list_cases` — enumerate available fixtures with step counts and descriptions
|
|
194
|
+
|
|
195
|
+
### Authoring
|
|
196
|
+
|
|
197
|
+
- `create_test_case` — create a new YAML fixture file (simulation mode; no `server` block)
|
|
198
|
+
- `scaffold_fixture` — generate a boilerplate fixture with placeholder steps and pre-filled assertion comments
|
|
199
|
+
|
|
200
|
+
### Reporting
|
|
201
|
+
|
|
202
|
+
- `regression_report` — compare the current fixture state to the last run; surfaces regressions and fixes
|
|
203
|
+
- `compare_results` — diff two specific runs by run ID
|
|
204
|
+
- `generate_html_report` — generate a single-file HTML report for a completed run
|
|
205
|
+
|
|
206
|
+
### Operations
|
|
207
|
+
|
|
208
|
+
- `evaluate_deployment_gate` — CI gate; fails if recent pass rate drops below a configurable threshold
|
|
209
|
+
- `discover_fixtures` — discover fixture files across one or more directories (respects `FIXTURE_LIBRARY_DIRS`)
|
|
210
|
+
|
|
211
|
+
## Configuration
|
|
212
|
+
|
|
213
|
+
### `--fixtures` / `--fixtures-dir`
|
|
214
|
+
|
|
215
|
+
Directory to load YAML/JSON eval fixture files from.
|
|
216
|
+
|
|
217
|
+
Type: `string`
|
|
218
|
+
Default: `./evals`
|
|
219
|
+
|
|
220
|
+
### `--db` / `--db-path`
|
|
221
|
+
|
|
222
|
+
Path to the SQLite database file used to store run history.
|
|
223
|
+
|
|
224
|
+
Type: `string`
|
|
225
|
+
Default: `~/.mcp/evals.db`
|
|
226
|
+
|
|
227
|
+
### `--timeout`
|
|
228
|
+
|
|
229
|
+
Maximum time in milliseconds to wait for a single step before marking it as failed.
|
|
230
|
+
|
|
231
|
+
Type: `number`
|
|
232
|
+
Default: `30000`
|
|
233
|
+
|
|
234
|
+
### `--watch`
|
|
235
|
+
|
|
236
|
+
Watch the fixtures directory and rerun the affected fixture automatically when files change.
|
|
237
|
+
|
|
238
|
+
Type: `boolean`
|
|
239
|
+
Default: `false`
|
|
240
|
+
|
|
241
|
+
### `--format`
|
|
242
|
+
|
|
243
|
+
Output format for eval results.
|
|
244
|
+
|
|
245
|
+
Type: `string`
|
|
246
|
+
Choices: `console`, `json`, `html`
|
|
247
|
+
Default: `console`
|
|
248
|
+
|
|
249
|
+
### `--concurrency`
|
|
250
|
+
|
|
251
|
+
Number of test cases to run in parallel.
|
|
252
|
+
|
|
253
|
+
Type: `number`
|
|
254
|
+
Default: `1`
|
|
255
|
+
|
|
256
|
+
### `--http-port`
|
|
257
|
+
|
|
258
|
+
Start an HTTP server on this port instead of stdio transport.
|
|
259
|
+
|
|
260
|
+
Type: `number`
|
|
261
|
+
Default: disabled (uses stdio)
|
|
262
|
+
|
|
263
|
+
Pass flags via the `args` property in your JSON config:
|
|
264
|
+
|
|
265
|
+
```json
|
|
266
|
+
{
|
|
267
|
+
"mcpServers": {
|
|
268
|
+
"eval-runner": {
|
|
269
|
+
"command": "npx",
|
|
270
|
+
"args": ["-y", "mcp-eval-runner@latest", "--watch", "--timeout=60000"]
|
|
271
|
+
}
|
|
272
|
+
}
|
|
273
|
+
}
|
|
274
|
+
```
|
|
275
|
+
|
|
276
|
+
## Design principles
|
|
277
|
+
|
|
278
|
+
- **No mocking**: Live mode evals run against real servers. Correctness is non-negotiable.
|
|
279
|
+
- **Fixtures are text**: YAML/JSON in version control; no proprietary formats or databases.
|
|
280
|
+
- **Dogfood-first**: The eval runner's own smoke fixture tests the eval runner itself.
|
|
281
|
+
|
|
282
|
+
## Verification
|
|
283
|
+
|
|
284
|
+
Before publishing a new version, verify the server with MCP Inspector to confirm all tools are exposed correctly and the protocol handshake succeeds.
|
|
285
|
+
|
|
286
|
+
**Interactive UI** (opens browser):
|
|
287
|
+
|
|
288
|
+
```bash
|
|
289
|
+
npm run build && npm run inspect
|
|
290
|
+
```
|
|
291
|
+
|
|
292
|
+
**CLI mode** (scripted / CI-friendly):
|
|
293
|
+
|
|
294
|
+
```bash
|
|
295
|
+
# List all tools
|
|
296
|
+
npx @modelcontextprotocol/inspector --cli node dist/index.js --method tools/list
|
|
297
|
+
|
|
298
|
+
# List resources and prompts
|
|
299
|
+
npx @modelcontextprotocol/inspector --cli node dist/index.js --method resources/list
|
|
300
|
+
npx @modelcontextprotocol/inspector --cli node dist/index.js --method prompts/list
|
|
301
|
+
|
|
302
|
+
# Call a tool (example — replace with a relevant read-only tool for this plugin)
|
|
303
|
+
npx @modelcontextprotocol/inspector --cli node dist/index.js \
|
|
304
|
+
--method tools/call --tool-name list_cases
|
|
305
|
+
|
|
306
|
+
# Call a tool with arguments
|
|
307
|
+
npx @modelcontextprotocol/inspector --cli node dist/index.js \
|
|
308
|
+
--method tools/call --tool-name run_case --tool-arg name=smoke
|
|
309
|
+
```
|
|
310
|
+
|
|
311
|
+
Run before publishing to catch regressions in tool registration and runtime startup.
|
|
312
|
+
|
|
313
|
+
## Contributing
|
|
314
|
+
|
|
315
|
+
New assertion types go in `src/assertions.ts` — implement the `Assertion` interface and add a test. Unit tests live under `tests/`; eval fixtures live under `evals/`.
|
|
316
|
+
|
|
317
|
+
```bash
|
|
318
|
+
npm install && npm test
|
|
319
|
+
```
|
|
320
|
+
|
|
321
|
+
## MCP Registry & Marketplace
|
|
322
|
+
|
|
323
|
+
This plugin is available on:
|
|
324
|
+
|
|
325
|
+
- [MCP Registry](https://registry.modelcontextprotocol.io)
|
|
326
|
+
- [MCP Marketplace](https://marketplace.modelcontextprotocol.io)
|
|
327
|
+
|
|
328
|
+
Search for `mcp-eval-runner`.
|
|
@@ -0,0 +1,63 @@
|
|
|
1
|
+
/**
|
|
2
|
+
* Assertion evaluators for MCP Eval Runner.
|
|
3
|
+
* Each assertion checks a specific property of a step result.
|
|
4
|
+
*
|
|
5
|
+
* Supported assertion types:
|
|
6
|
+
* output_contains: "substring" — output includes substring
|
|
7
|
+
* output_not_contains: "substring" — output must NOT include substring
|
|
8
|
+
* output_equals: "exact string" — output exactly matches
|
|
9
|
+
* output_matches: "regex" — output matches a regular expression
|
|
10
|
+
* tool_called: "tool_name" — step used the named tool
|
|
11
|
+
* latency_under: 500 — latency in ms must be below threshold
|
|
12
|
+
* schema_match: { type: "object", properties: {...}, required: [...] }
|
|
13
|
+
* — output (parsed as JSON) matches JSON Schema
|
|
14
|
+
* llm_judge: { prompt_template, min_score, model, expected }
|
|
15
|
+
* — semantic similarity via LLM judge
|
|
16
|
+
*/
|
|
17
|
+
import type { LlmJudgeAssertion } from "./llm-judge.js";
|
|
18
|
+
/**
 * A set of assertions to evaluate against one step result.
 * All fields are optional; every field that is present is checked,
 * and the step passes only if all present assertions pass.
 */
export interface Assertion {
    /** Output must include this substring. */
    output_contains?: string;
    /** Output must NOT include this substring. */
    output_not_contains?: string;
    /** Output must exactly equal this string. */
    output_equals?: string;
    /** Output must match this regular expression (given as a pattern string). */
    output_matches?: string;
    /** The step must have called the tool with this name. */
    tool_called?: string;
    /** Step latency in milliseconds must be below this threshold. */
    latency_under?: number;
    /** Output, parsed as JSON, must match this JSON Schema. */
    schema_match?: JsonSchema;
    /** Semantic scoring of the output via an external LLM judge (async-only). */
    llm_judge?: LlmJudgeAssertion;
}
|
|
28
|
+
/**
 * Minimal JSON Schema shape accepted by the `schema_match` assertion.
 * Only a subset of keywords is modeled explicitly; any other JSON Schema
 * keyword is permitted via the index signature.
 */
export interface JsonSchema {
    /** JSON type name, e.g. "object", "string", "number". */
    type?: string;
    /** Per-property subschemas (used with type "object"). */
    properties?: Record<string, JsonSchema>;
    /** Property names that must be present (used with type "object"). */
    required?: string[];
    /** Whether extra properties are allowed, or a subschema they must match. */
    additionalProperties?: boolean | JsonSchema;
    /** Subschema applied to array elements (used with type "array"). */
    items?: JsonSchema;
    /** Escape hatch for any other JSON Schema keyword. */
    [key: string]: unknown;
}
|
|
36
|
+
/** Outcome of evaluating a single assertion against a step result. */
export interface AssertionResult {
    /** Assertion type that was evaluated, e.g. "output_contains". */
    type: string;
    /** True when the assertion held. */
    passed: boolean;
    /** Human-readable explanation of the pass/fail outcome. */
    message: string;
}
|
|
41
|
+
/** Observed result of executing one step, fed to the assertion evaluators. */
export interface StepResult {
    /** Name of the tool that was called for this step. */
    tool: string;
    /** Raw textual output produced by the step. */
    output: string;
    /** Wall-clock duration of the step in milliseconds. */
    latency_ms: number;
}
|
|
46
|
+
/**
 * Evaluate a single assertion against a step result.
 *
 * @param assertion - Assertion fields to check; absent fields are skipped.
 * @param result - The step result to check the assertions against.
 * @returns An array of per-assertion outcomes — presumably one entry per
 *   assertion field that is set; confirm against dist/assertions.js.
 */
export declare function evaluateAssertion(assertion: Assertion, result: StepResult): AssertionResult[];
|
|
50
|
+
/**
 * Evaluate all assertions for a step, returning aggregate pass/fail.
 * Synchronous variant; the async-only `llm_judge` assertion type is
 * handled by {@link evaluateAllAssertionsAsync} instead.
 *
 * @param assertions - The step's assertion set.
 * @param result - The step result to evaluate.
 * @returns `passed` is the aggregate outcome; `results` holds the
 *   individual per-assertion outcomes.
 */
export declare function evaluateAllAssertions(assertions: Assertion, result: StepResult): {
    passed: boolean;
    results: AssertionResult[];
};
|
|
57
|
+
/**
 * Evaluate all assertions for a step including async assertion types
 * (e.g. `llm_judge`, which calls an external LLM API).
 *
 * @param assertions - The step's assertion set.
 * @param result - The step result to evaluate.
 * @returns Promise of `{ passed, results }`, where `passed` is true only
 *   when every evaluated assertion passed.
 */
export declare function evaluateAllAssertionsAsync(assertions: Assertion, result: StepResult): Promise<{
    passed: boolean;
    results: AssertionResult[];
}>;
|