mcp-eval-runner 1.0.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (48) hide show
  1. package/.env.example +39 -0
  2. package/CHANGELOG.md +67 -0
  3. package/LICENSE +21 -0
  4. package/README.md +328 -0
  5. package/dist/assertions.d.ts +63 -0
  6. package/dist/assertions.js +187 -0
  7. package/dist/audit-log.d.ts +26 -0
  8. package/dist/audit-log.js +57 -0
  9. package/dist/auth.d.ts +15 -0
  10. package/dist/auth.js +83 -0
  11. package/dist/db.d.ts +40 -0
  12. package/dist/db.js +94 -0
  13. package/dist/deployment-gate.d.ts +27 -0
  14. package/dist/deployment-gate.js +43 -0
  15. package/dist/fixture-library.d.ts +26 -0
  16. package/dist/fixture-library.js +85 -0
  17. package/dist/fixture.d.ts +87 -0
  18. package/dist/fixture.js +170 -0
  19. package/dist/http-server.d.ts +7 -0
  20. package/dist/http-server.js +34 -0
  21. package/dist/index.d.ts +15 -0
  22. package/dist/index.js +158 -0
  23. package/dist/llm-judge.d.ts +24 -0
  24. package/dist/llm-judge.js +139 -0
  25. package/dist/rate-limiter.d.ts +13 -0
  26. package/dist/rate-limiter.js +36 -0
  27. package/dist/reporter.d.ts +8 -0
  28. package/dist/reporter.js +163 -0
  29. package/dist/runner.d.ts +57 -0
  30. package/dist/runner.js +339 -0
  31. package/dist/server.d.ts +22 -0
  32. package/dist/server.js +583 -0
  33. package/dist/tools/html_report.d.ts +8 -0
  34. package/dist/tools/html_report.js +188 -0
  35. package/dist/tools/manage.d.ts +11 -0
  36. package/dist/tools/manage.js +41 -0
  37. package/dist/tools/report.d.ts +12 -0
  38. package/dist/tools/report.js +120 -0
  39. package/dist/tools/run.d.ts +20 -0
  40. package/dist/tools/run.js +166 -0
  41. package/dist/tools/scaffold.d.ts +11 -0
  42. package/dist/tools/scaffold.js +90 -0
  43. package/evals/reference/mcp-fetch.yaml +46 -0
  44. package/evals/reference/mcp-filesystem.yaml +63 -0
  45. package/evals/reference/mcp-memory.yaml +70 -0
  46. package/evals/reference/step-piping-example.yaml +25 -0
  47. package/evals/smoke.yaml +12 -0
  48. package/package.json +67 -0
package/.env.example ADDED
@@ -0,0 +1,39 @@
1
+ # mcp-eval-runner environment variables
2
+ # Copy this file to .env and fill in values as needed.
3
+ # All variables are optional — the server runs without them,
4
+ # but certain features (auth, LLM judge) will be disabled.
5
+
6
+ # ---------------------------------------------------------------------------
7
+ # Authentication (src/auth.ts)
8
+ # ---------------------------------------------------------------------------
9
+
10
+ # Static API key for X-API-Key header validation.
11
+ # When set, every request to /mcp must include the header:
12
+ # X-API-Key: <value>
13
+ MCP_API_KEY=your-api-key-here
14
+
15
+ # Secret for HMAC-SHA256 JWT Bearer token validation.
16
+ # When set, requests to /mcp must include:
17
+ # Authorization: Bearer <jwt>
18
+ MCP_JWT_SECRET=your-jwt-secret-here
19
+
20
+ # ---------------------------------------------------------------------------
21
+ # LLM Judge (src/llm-judge.ts)
22
+ # ---------------------------------------------------------------------------
23
+
24
+ # API key passed as a Bearer token to the external LLM API.
25
+ # Required to use the `llm_judge` assertion type in fixture files.
26
+ LLM_JUDGE_API_KEY=your-llm-judge-api-key-here
27
+
28
+ # Base URL of the OpenAI-compatible chat completions endpoint.
29
+ # Example: https://api.openai.com/v1/chat/completions
30
+ LLM_JUDGE_BASE_URL=https://api.openai.com/v1/chat/completions
31
+
32
+ # ---------------------------------------------------------------------------
33
+ # Fixture Library (src/server.ts)
34
+ # ---------------------------------------------------------------------------
35
+
36
+ # Colon-separated list of additional directories to scan for fixture files
37
+ # when using the list_fixtures tool without explicit dirs.
38
+ # Example: /home/user/fixtures:/opt/shared-fixtures
39
+ FIXTURE_LIBRARY_DIRS=/path/to/fixtures:/path/to/more/fixtures
package/CHANGELOG.md ADDED
@@ -0,0 +1,67 @@
1
+ # Changelog
2
+
3
+ All notable changes to MCP Eval Runner will be documented in this file.
4
+
5
+ The format is based on [Keep a Changelog](https://keepachangelog.com/en/1.1.0/),
6
+ and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0.html).
7
+
8
+ ## [Unreleased]
9
+
10
+ ## [1.0.1] - 2026-03-23
11
+
12
+ ### Fixed
13
+
14
+ - **smoke fixture** (`evals/smoke.yaml`): the fixture previously ran in simulation mode because it had no `server` block. In simulation mode, `output_contains` assertions evaluate against `expected_output`, which was empty, so the assertion always failed. Fixed by adding a `server` block (`command: node`, `args: ["dist/index.js"]`) so the fixture now runs in live mode against the eval runner itself and the `output_contains: "smoke"` assertion passes against the real `list_cases` response.
15
+
16
+ ## [1.0.0] - 2026-03-12
17
+
18
+ ### Added
19
+
20
+ - `.env.example` documenting `MCP_API_KEY`, `MCP_JWT_SECRET`, `LLM_JUDGE_API_KEY`, `LLM_JUDGE_BASE_URL`, and `FIXTURE_LIBRARY_DIRS`.
21
+
22
+ ### Changed
23
+
24
+ - `@modelcontextprotocol/sdk` upgraded from `^1.0.0` to `^1.12.0`.
25
+ - `chokidar` upgraded from `^3.x` to `^5.0.0` (no API changes required — already ESM).
26
+ - `@types/node` upgraded from `^22.x` to `^24.12.0` (Node 24 LTS).
27
+ - `eslint` upgraded from `^9.x` to `^10.0.3`; `eslint-config-prettier` from `^9.x` to `^10.1.8`.
28
+ - `yargs` upgraded from `^17.x` to `^18.0.0`.
29
+ - Added `author`, `license`, `repository`, and `homepage` fields to `package.json`.
30
+
31
+ ### Fixed
32
+
33
+ - Prefixed unused `base64urlDecode` in `src/auth.ts` and `writeFixture` in `tests/fixture-library.test.ts` with `_` to satisfy `no-unused-vars` lint rule.
34
+
35
+ ### Security
36
+
37
+ - Resolved **GHSA-67mh-4wv8-2f99** (`esbuild` ≤ 0.24.2 dev-server cross-origin exposure) by upgrading `vitest` and `@vitest/coverage-v8` to `^4.1.0`. Affects local development only; not a production runtime concern.
38
+
39
+ ## [0.2.0] - 2026-03-12
40
+
41
+ ### Added
42
+
43
+ - **Extended assertions** (`src/assertions.ts`): additional assertion types beyond string equality — numeric comparisons, regex matching, and JSON path checks.
44
+ - **LLM judge** (`src/llm-judge.ts`): AI-powered assertion scoring via any OpenAI-compatible completions endpoint, configured via `LLM_JUDGE_API_KEY` and `LLM_JUDGE_BASE_URL`.
45
+ - **Deployment gate** (`src/deployment-gate.ts`): block CI promotion if the eval pass rate falls below a configurable threshold. Integrates with `run_deployment_gate` tool.
46
+ - **Fixture library** (`src/fixture-library.ts`): centralized fixture discovery across multiple directories; supports `FIXTURE_LIBRARY_DIRS` env var for additional scan paths.
47
+ - **Audit log** (`src/audit-log.ts`): append-only JSONL audit trail of every eval run.
48
+ - **JWT / API-key auth middleware** (`src/auth.ts`): HTTP transport protected via `MCP_API_KEY` or `MCP_JWT_SECRET`. stdio is unaffected.
49
+ - **Per-client rate limiter** (`src/rate-limiter.ts`): sliding-window request throttle on the HTTP transport.
50
+ - **New tools**: `run_deployment_gate`, `search_fixtures`, `add_fixture`, `llm_judge`.
51
+ - **`npm run inspect` script**: launches MCP Inspector for interactive pre-publish verification.
52
+ - MCP Inspector verification instructions added to README.
53
+ - Tests for assertions, audit log, auth, deployment gate, fixture library, LLM judge, and rate limiter.
54
+
55
+ ## [0.1.0] - 2026-03-12
56
+
57
+ ### Added
58
+
59
+ - Initial public release of `mcp-eval-runner`.
60
+ - YAML fixture format: `prompt → expected tool calls → expected outputs`.
61
+ - Regression suite execution directly from MCP clients (Claude Code, Cursor, etc.).
62
+ - Pass/fail results with structured diffs for failed assertions.
63
+ - Support for running individual fixtures or full suites.
64
+ - Watch mode for continuous re-evaluation during development (`chokidar`-based).
65
+ - Streamable HTTP transport via `--http-port` flag (default: disabled, uses stdio).
66
+ - GitHub Actions CI workflow running build, test, and lint on push/PR to `main`.
67
+ - Vitest test suite with coverage via `@vitest/coverage-v8`.
package/LICENSE ADDED
@@ -0,0 +1,21 @@
1
+ MIT License
2
+
3
+ Copyright (c) 2026 MCP Eval Runner contributors
4
+
5
+ Permission is hereby granted, free of charge, to any person obtaining a copy
6
+ of this software and associated documentation files (the "Software"), to deal
7
+ in the Software without restriction, including without limitation the rights
8
+ to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
9
+ copies of the Software, and to permit persons to whom the Software is
10
+ furnished to do so, subject to the following conditions:
11
+
12
+ The above copyright notice and this permission notice shall be included in all
13
+ copies or substantial portions of the Software.
14
+
15
+ THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
16
+ IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
17
+ FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
18
+ AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
19
+ LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
20
+ OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
21
+ SOFTWARE.
package/README.md ADDED
@@ -0,0 +1,328 @@
1
+ # MCP Eval Runner
2
+
3
+ npm `mcp-eval-runner` package
4
+
5
+ A standardized testing harness for MCP servers and agent workflows. Define test cases as YAML fixtures (steps → expected tool calls → expected outputs), run regression suites directly from your MCP client, and get pass/fail results with diffs — without leaving Claude Code or Cursor.
6
+
7
+ [Tool reference](#tools) | [Configuration](#configuration) | [Fixture format](#fixture-format) | [Contributing](#contributing) | [Troubleshooting](#troubleshooting) | [Design principles](#design-principles)
8
+
9
+ ## Key features
10
+
11
+ - **YAML fixtures**: Test cases are plain files in version control — diffable, reviewable, and shareable.
12
+ - **Two execution modes**: Live mode spawns a real MCP server and calls tools via stdio; simulation mode runs assertions against `expected_output` without a server.
13
+ - **Composable assertions**: Combine `output_contains`, `output_not_contains`, `output_equals`, `output_matches`, `schema_match`, `tool_called`, and `latency_under` per step.
14
+ - **Step output piping**: Reference a previous step's output in downstream inputs via `{{steps.<step_id>.output}}`.
15
+ - **Regression reports**: Compare the current run to any past run and surface what changed.
16
+ - **Watch mode**: Automatically reruns the affected fixture when files change.
17
+ - **CI-ready**: Includes a GitHub Action for running evals on every config change.
18
+
19
+ ## Requirements
20
+
21
+ - Node.js v22.5.0 or newer.
22
+ - npm.
23
+
24
+ ## Getting started
25
+
26
+ Add the following config to your MCP client:
27
+
28
+ ```json
29
+ {
30
+ "mcpServers": {
31
+ "eval-runner": {
32
+ "command": "npx",
33
+ "args": ["-y", "mcp-eval-runner@latest"]
34
+ }
35
+ }
36
+ }
37
+ ```
38
+
39
+ By default, eval fixtures are loaded from `./evals/` in the current working directory. To use a different path:
40
+
41
+ ```json
42
+ {
43
+ "mcpServers": {
44
+ "eval-runner": {
45
+ "command": "npx",
46
+ "args": ["-y", "mcp-eval-runner@latest", "--fixtures=~/my-project/evals"]
47
+ }
48
+ }
49
+ }
50
+ ```
51
+
52
+ ### MCP Client configuration
53
+
54
+ Amp · Claude Code · Cline · Cursor · VS Code · Windsurf · Zed
55
+
56
+ ## Your first prompt
57
+
58
+ Create a file at `evals/smoke.yaml`. Use **live mode** (recommended) by including a `server` block:
59
+
60
+ ```yaml
61
+ name: smoke
62
+ description: "Verify eval runner itself is working"
63
+ server:
64
+ command: node
65
+ args: ["dist/index.js"]
66
+ steps:
67
+ - id: list_check
68
+ description: "List available test cases"
69
+ tool: list_cases
70
+ input: {}
71
+ expect:
72
+ output_contains: "smoke"
73
+ ```
74
+
75
+ Then enter the following in your MCP client:
76
+
77
+ ```
78
+ Run the eval suite.
79
+ ```
80
+
81
+ Your client should return a pass/fail result for the smoke test.
82
+
83
+ ## Fixture format
84
+
85
+ Fixtures are YAML (or JSON) files placed in the fixtures directory. Each file defines one test case.
86
+
87
+ ### Top-level fields
88
+
89
+ | Field | Required | Description |
90
+ | ------------- | -------- | ----------------------------------------------------------------------------------------- |
91
+ | `name` | Yes | Unique name for the test case |
92
+ | `description` | No | Human-readable description |
93
+ | `server` | No | Server config — if present, runs in **live mode**; if absent, runs in **simulation mode** |
94
+ | `steps` | Yes | Array of steps to execute |
95
+
96
+ ### `server` block (live mode)
97
+
98
+ ```yaml
99
+ server:
100
+ command: node # executable to spawn
101
+ args: ["dist/index.js"] # arguments
102
+ env: # optional environment variables
103
+ MY_VAR: "value"
104
+ ```
105
+
106
+ When `server` is present the eval runner spawns the server as a child process, connects via MCP stdio transport, and calls each step's tool against the live server.
107
+
108
+ ### `steps` array
109
+
110
+ Each step has the following fields:
111
+
112
+ | Field | Required | Description |
113
+ | ----------------- | -------- | ------------------------------------------------------------- |
114
+ | `id` | Yes | Unique identifier within the fixture (used for output piping) |
115
+ | `tool` | Yes | MCP tool name to call |
116
+ | `description` | No | Human-readable step description |
117
+ | `input` | No | Key-value map of arguments passed to the tool (default: `{}`) |
118
+ | `expected_output` | No | Literal string used as output in simulation mode |
119
+ | `expect` | No | Assertions evaluated against the step output |
120
+
121
+ ### Execution modes
122
+
123
+ **Live mode** — fixture has a `server` block:
124
+
125
+ - The server is spawned and each step calls the named tool via MCP stdio.
126
+ - Assertions run against the real tool response.
127
+ - Errors from the server cause the step (and by default the case) to fail immediately.
128
+
129
+ **Simulation mode** — no `server` block:
130
+
131
+ - No server is started.
132
+ - Each step's output is taken from `expected_output` (or empty string if absent).
133
+ - Assertions run against that static output.
134
+ - Useful for authoring and CI dry-runs, but `output_contains` assertions will always fail if `expected_output` is not set.
135
+
136
+ ### Assertion types
137
+
138
+ All assertions go inside a step's `expect` block:
139
+
140
+ ```yaml
141
+ expect:
142
+ output_contains: "substring" # output includes this text
143
+ output_not_contains: "error" # output must NOT include this text
144
+ output_equals: "exact string" # output exactly matches
145
+ output_matches: "regex pattern" # output matches a regular expression
146
+ tool_called: "tool_name" # verifies which tool was called
147
+ latency_under: 500 # latency in ms must be below this threshold
148
+ schema_match: # output (parsed as JSON) matches JSON Schema
149
+ type: object
150
+ required: [id]
151
+ properties:
152
+ id:
153
+ type: number
154
+ ```
155
+
156
+ Multiple assertions in one `expect` block are all evaluated; the step fails if any assertion fails.
157
+
158
+ ### Step output piping
159
+
160
+ Reference the output of a previous step in a downstream step's `input` using `{{steps.<step_id>.output}}`:
161
+
162
+ ```yaml
163
+ steps:
164
+ - id: search_step
165
+ tool: search
166
+ input:
167
+ query: "mcp eval runner"
168
+ expected_output: "result: mcp-eval-runner v1.0"
169
+ expect:
170
+ output_contains: "mcp-eval-runner"
171
+
172
+ - id: summarize_step
173
+ tool: summarize
174
+ input:
175
+ text: "{{steps.search_step.output}}"
176
+ expected_output: "Summary: mcp-eval-runner v1.0"
177
+ expect:
178
+ output_contains: "Summary"
179
+ ```
180
+
181
+ Piping works in both live mode and simulation mode.
182
+
183
+ ### Note on `create_test_case`
184
+
185
+ Fixtures created with the `create_test_case` tool do not include a `server` block. They always run in simulation mode. To use live mode, add a `server` block manually to the generated YAML file.
186
+
187
+ ## Tools
188
+
189
+ ### Running
190
+
191
+ - `run_suite` — execute all fixtures in the fixtures directory; returns a pass/fail summary
192
+ - `run_case` — run a single named fixture by name
193
+ - `list_cases` — enumerate available fixtures with step counts and descriptions
194
+
195
+ ### Authoring
196
+
197
+ - `create_test_case` — create a new YAML fixture file (simulation mode; no `server` block)
198
+ - `scaffold_fixture` — generate a boilerplate fixture with placeholder steps and pre-filled assertion comments
199
+
200
+ ### Reporting
201
+
202
+ - `regression_report` — compare the current fixture state to the last run; surfaces regressions and fixes
203
+ - `compare_results` — diff two specific runs by run ID
204
+ - `generate_html_report` — generate a single-file HTML report for a completed run
205
+
206
+ ### Operations
207
+
208
+ - `evaluate_deployment_gate` — CI gate; fails if recent pass rate drops below a configurable threshold
209
+ - `discover_fixtures` — discover fixture files across one or more directories (respects `FIXTURE_LIBRARY_DIRS`)
210
+
211
+ ## Configuration
212
+
213
+ ### `--fixtures` / `--fixtures-dir`
214
+
215
+ Directory to load YAML/JSON eval fixture files from.
216
+
217
+ Type: `string`
218
+ Default: `./evals`
219
+
220
+ ### `--db` / `--db-path`
221
+
222
+ Path to the SQLite database file used to store run history.
223
+
224
+ Type: `string`
225
+ Default: `~/.mcp/evals.db`
226
+
227
+ ### `--timeout`
228
+
229
+ Maximum time in milliseconds to wait for a single step before marking it as failed.
230
+
231
+ Type: `number`
232
+ Default: `30000`
233
+
234
+ ### `--watch`
235
+
236
+ Watch the fixtures directory and rerun the affected fixture automatically when files change.
237
+
238
+ Type: `boolean`
239
+ Default: `false`
240
+
241
+ ### `--format`
242
+
243
+ Output format for eval results.
244
+
245
+ Type: `string`
246
+ Choices: `console`, `json`, `html`
247
+ Default: `console`
248
+
249
+ ### `--concurrency`
250
+
251
+ Number of test cases to run in parallel.
252
+
253
+ Type: `number`
254
+ Default: `1`
255
+
256
+ ### `--http-port`
257
+
258
+ Start an HTTP server on this port instead of stdio transport.
259
+
260
+ Type: `number`
261
+ Default: disabled (uses stdio)
262
+
263
+ Pass flags via the `args` property in your JSON config:
264
+
265
+ ```json
266
+ {
267
+ "mcpServers": {
268
+ "eval-runner": {
269
+ "command": "npx",
270
+ "args": ["-y", "mcp-eval-runner@latest", "--watch", "--timeout=60000"]
271
+ }
272
+ }
273
+ }
274
+ ```
275
+
276
+ ## Design principles
277
+
278
+ - **No mocking**: Live mode evals run against real servers. Correctness is non-negotiable.
279
+ - **Fixtures are text**: YAML/JSON in version control; no proprietary formats or databases.
280
+ - **Dogfood-first**: The eval runner's own smoke fixture tests the eval runner itself.
281
+
282
+ ## Verification
283
+
284
+ Before publishing a new version, verify the server with MCP Inspector to confirm all tools are exposed correctly and the protocol handshake succeeds.
285
+
286
+ **Interactive UI** (opens browser):
287
+
288
+ ```bash
289
+ npm run build && npm run inspect
290
+ ```
291
+
292
+ **CLI mode** (scripted / CI-friendly):
293
+
294
+ ```bash
295
+ # List all tools
296
+ npx @modelcontextprotocol/inspector --cli node dist/index.js --method tools/list
297
+
298
+ # List resources and prompts
299
+ npx @modelcontextprotocol/inspector --cli node dist/index.js --method resources/list
300
+ npx @modelcontextprotocol/inspector --cli node dist/index.js --method prompts/list
301
+
302
+ # Call a tool (example — replace with a relevant read-only tool for this plugin)
303
+ npx @modelcontextprotocol/inspector --cli node dist/index.js \
304
+ --method tools/call --tool-name list_cases
305
+
306
+ # Call a tool with arguments
307
+ npx @modelcontextprotocol/inspector --cli node dist/index.js \
308
+ --method tools/call --tool-name run_case --tool-arg name=smoke
309
+ ```
310
+
311
+ Run before publishing to catch regressions in tool registration and runtime startup.
312
+
313
+ ## Contributing
314
+
315
+ New assertion types go in `src/assertions.ts` — implement the `Assertion` interface and add a test. Unit tests live under `tests/`; eval fixtures that exercise the runner end to end live under `evals/`.
316
+
317
+ ```bash
318
+ npm install && npm test
319
+ ```
320
+
321
+ ## MCP Registry & Marketplace
322
+
323
+ This plugin is available on:
324
+
325
+ - [MCP Registry](https://registry.modelcontextprotocol.io)
326
+ - [MCP Marketplace](https://marketplace.modelcontextprotocol.io)
327
+
328
+ Search for `mcp-eval-runner`.
package/dist/assertions.d.ts ADDED
@@ -0,0 +1,63 @@
1
+ /**
2
+ * Assertion evaluators for MCP Eval Runner.
3
+ * Each assertion checks a specific property of a step result.
4
+ *
5
+ * Supported assertion types:
6
+ * output_contains: "substring" — output includes substring
7
+ * output_not_contains: "substring" — output must NOT include substring
8
+ * output_equals: "exact string" — output exactly matches
9
+ * output_matches: "regex" — output matches a regular expression
10
+ * tool_called: "tool_name" — step used the named tool
11
+ * latency_under: 500 — latency in ms must be below threshold
12
+ * schema_match: { type: "object", properties: {...}, required: [...] }
13
+ * — output (parsed as JSON) matches JSON Schema
14
+ * llm_judge: { prompt_template, min_score, model, expected }
15
+ * — semantic similarity via LLM judge
16
+ */
17
+ import type { LlmJudgeAssertion } from "./llm-judge.js";
18
+ export interface Assertion {
19
+ output_contains?: string;
20
+ output_not_contains?: string;
21
+ output_equals?: string;
22
+ output_matches?: string;
23
+ tool_called?: string;
24
+ latency_under?: number;
25
+ schema_match?: JsonSchema;
26
+ llm_judge?: LlmJudgeAssertion;
27
+ }
28
+ export interface JsonSchema {
29
+ type?: string;
30
+ properties?: Record<string, JsonSchema>;
31
+ required?: string[];
32
+ additionalProperties?: boolean | JsonSchema;
33
+ items?: JsonSchema;
34
+ [key: string]: unknown;
35
+ }
36
+ export interface AssertionResult {
37
+ type: string;
38
+ passed: boolean;
39
+ message: string;
40
+ }
41
+ export interface StepResult {
42
+ tool: string;
43
+ output: string;
44
+ latency_ms: number;
45
+ }
46
+ /**
47
+ * Evaluate a single assertion against a step result.
48
+ */
49
+ export declare function evaluateAssertion(assertion: Assertion, result: StepResult): AssertionResult[];
50
+ /**
51
+ * Evaluate all assertions for a step, returning aggregate pass/fail.
52
+ */
53
+ export declare function evaluateAllAssertions(assertions: Assertion, result: StepResult): {
54
+ passed: boolean;
55
+ results: AssertionResult[];
56
+ };
57
+ /**
58
+ * Evaluate all assertions for a step including async assertion types (e.g. llm_judge).
59
+ */
60
+ export declare function evaluateAllAssertionsAsync(assertions: Assertion, result: StepResult): Promise<{
61
+ passed: boolean;
62
+ results: AssertionResult[];
63
+ }>;