mcp-eval-runner 1.0.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/.env.example +39 -0
- package/CHANGELOG.md +67 -0
- package/LICENSE +21 -0
- package/README.md +328 -0
- package/dist/assertions.d.ts +63 -0
- package/dist/assertions.js +187 -0
- package/dist/audit-log.d.ts +26 -0
- package/dist/audit-log.js +57 -0
- package/dist/auth.d.ts +15 -0
- package/dist/auth.js +83 -0
- package/dist/db.d.ts +40 -0
- package/dist/db.js +94 -0
- package/dist/deployment-gate.d.ts +27 -0
- package/dist/deployment-gate.js +43 -0
- package/dist/fixture-library.d.ts +26 -0
- package/dist/fixture-library.js +85 -0
- package/dist/fixture.d.ts +87 -0
- package/dist/fixture.js +170 -0
- package/dist/http-server.d.ts +7 -0
- package/dist/http-server.js +34 -0
- package/dist/index.d.ts +15 -0
- package/dist/index.js +158 -0
- package/dist/llm-judge.d.ts +24 -0
- package/dist/llm-judge.js +139 -0
- package/dist/rate-limiter.d.ts +13 -0
- package/dist/rate-limiter.js +36 -0
- package/dist/reporter.d.ts +8 -0
- package/dist/reporter.js +163 -0
- package/dist/runner.d.ts +57 -0
- package/dist/runner.js +339 -0
- package/dist/server.d.ts +22 -0
- package/dist/server.js +583 -0
- package/dist/tools/html_report.d.ts +8 -0
- package/dist/tools/html_report.js +188 -0
- package/dist/tools/manage.d.ts +11 -0
- package/dist/tools/manage.js +41 -0
- package/dist/tools/report.d.ts +12 -0
- package/dist/tools/report.js +120 -0
- package/dist/tools/run.d.ts +20 -0
- package/dist/tools/run.js +166 -0
- package/dist/tools/scaffold.d.ts +11 -0
- package/dist/tools/scaffold.js +90 -0
- package/evals/reference/mcp-fetch.yaml +46 -0
- package/evals/reference/mcp-filesystem.yaml +63 -0
- package/evals/reference/mcp-memory.yaml +70 -0
- package/evals/reference/step-piping-example.yaml +25 -0
- package/evals/smoke.yaml +12 -0
- package/package.json +67 -0
|
@@ -0,0 +1,46 @@
|
|
|
1
|
+
name: mcp_fetch_reference
|
|
2
|
+
description: >
|
|
3
|
+
Reference eval suite for the official MCP Fetch server
|
|
4
|
+
(github.com/modelcontextprotocol/servers/tree/main/src/fetch).
|
|
5
|
+
Covers URL fetching in multiple formats and robots.txt handling.
|
|
6
|
+
|
|
7
|
+
server:
|
|
8
|
+
command: uvx
|
|
9
|
+
args: ["mcp-server-fetch"]
|
|
10
|
+
|
|
11
|
+
steps:
|
|
12
|
+
- id: fetch_html
|
|
13
|
+
description: Fetch a URL and return as markdown
|
|
14
|
+
tool: fetch
|
|
15
|
+
input:
|
|
16
|
+
url: "https://example.com"
|
|
17
|
+
max_length: 1000
|
|
18
|
+
expected_output: "Example Domain"
|
|
19
|
+
expect:
|
|
20
|
+
output_contains: "Example Domain"
|
|
21
|
+
output_not_contains: "error"
|
|
22
|
+
latency_under: 10000
|
|
23
|
+
|
|
24
|
+
- id: fetch_raw
|
|
25
|
+
description: Fetch raw HTML content
|
|
26
|
+
tool: fetch
|
|
27
|
+
input:
|
|
28
|
+
url: "https://example.com"
|
|
29
|
+
raw: true
|
|
30
|
+
max_length: 500
|
|
31
|
+
expected_output: "<html"
|
|
32
|
+
expect:
|
|
33
|
+
output_matches: "<html|<!DOCTYPE"
|
|
34
|
+
latency_under: 10000
|
|
35
|
+
|
|
36
|
+
- id: fetch_with_start_index
|
|
37
|
+
description: Fetch with pagination offset
|
|
38
|
+
tool: fetch
|
|
39
|
+
input:
|
|
40
|
+
url: "https://example.com"
|
|
41
|
+
start_index: 100
|
|
42
|
+
max_length: 200
|
|
43
|
+
expected_output: ""
|
|
44
|
+
expect:
|
|
45
|
+
output_not_contains: "error"
|
|
46
|
+
latency_under: 10000
|
|
@@ -0,0 +1,63 @@
|
|
|
1
|
+
name: mcp_filesystem_reference
|
|
2
|
+
description: >
|
|
3
|
+
Reference eval suite for the official MCP Filesystem server
|
|
4
|
+
(github.com/modelcontextprotocol/servers/tree/main/src/filesystem).
|
|
5
|
+
Covers read, write, list, and search operations.
|
|
6
|
+
Set MCP_FS_CMD and MCP_FS_ROOT, or remove the server block for simulation mode.
|
|
7
|
+
|
|
8
|
+
server:
|
|
9
|
+
command: node
|
|
10
|
+
args: ["${MCP_FS_CMD:-/usr/local/bin/mcp-filesystem}", "${MCP_FS_ROOT:-/tmp}"]
|
|
11
|
+
|
|
12
|
+
steps:
|
|
13
|
+
- id: list_root
|
|
14
|
+
description: List the allowed root directory
|
|
15
|
+
tool: list_directory
|
|
16
|
+
input:
|
|
17
|
+
path: "/"
|
|
18
|
+
expected_output: "directory"
|
|
19
|
+
expect:
|
|
20
|
+
output_not_contains: "error"
|
|
21
|
+
latency_under: 3000
|
|
22
|
+
|
|
23
|
+
- id: write_file
|
|
24
|
+
description: Write a test file
|
|
25
|
+
tool: write_file
|
|
26
|
+
input:
|
|
27
|
+
path: "/eval-test-file.txt"
|
|
28
|
+
content: "mcp-eval-runner test content"
|
|
29
|
+
expected_output: "written"
|
|
30
|
+
expect:
|
|
31
|
+
output_not_contains: "error"
|
|
32
|
+
latency_under: 3000
|
|
33
|
+
|
|
34
|
+
- id: read_file
|
|
35
|
+
description: Read back the file written by the write_file step
|
|
36
|
+
tool: read_file
|
|
37
|
+
input:
|
|
38
|
+
path: "/eval-test-file.txt"
|
|
39
|
+
expected_output: "mcp-eval-runner test content"
|
|
40
|
+
expect:
|
|
41
|
+
output_contains: "mcp-eval-runner test content"
|
|
42
|
+
latency_under: 3000
|
|
43
|
+
|
|
44
|
+
- id: search_files
|
|
45
|
+
description: Search for the test file by name
|
|
46
|
+
tool: search_files
|
|
47
|
+
input:
|
|
48
|
+
path: "/"
|
|
49
|
+
pattern: "eval-test-file"
|
|
50
|
+
expected_output: "eval-test-file.txt"
|
|
51
|
+
expect:
|
|
52
|
+
output_contains: "eval-test-file"
|
|
53
|
+
latency_under: 5000
|
|
54
|
+
|
|
55
|
+
- id: delete_file
|
|
56
|
+
description: Clean up — delete the test file
|
|
57
|
+
tool: delete_file
|
|
58
|
+
input:
|
|
59
|
+
path: "/eval-test-file.txt"
|
|
60
|
+
expected_output: "deleted"
|
|
61
|
+
expect:
|
|
62
|
+
output_not_contains: "error"
|
|
63
|
+
latency_under: 3000
|
|
@@ -0,0 +1,70 @@
|
|
|
1
|
+
name: mcp_memory_reference
|
|
2
|
+
description: >
|
|
3
|
+
Reference eval suite for the official MCP Memory server
|
|
4
|
+
(github.com/modelcontextprotocol/servers/tree/main/src/memory).
|
|
5
|
+
Covers entity creation, relation creation, search, and deletion.
|
|
6
|
+
Set MCP_MEMORY_CMD to the path of your built memory server binary,
|
|
7
|
+
or remove the server block to run in simulation mode.
|
|
8
|
+
|
|
9
|
+
server:
|
|
10
|
+
command: node
|
|
11
|
+
args: ["${MCP_MEMORY_CMD:-/usr/local/bin/mcp-memory}"]
|
|
12
|
+
|
|
13
|
+
steps:
|
|
14
|
+
- id: create_entity
|
|
15
|
+
description: Create a test entity in the knowledge graph
|
|
16
|
+
tool: create_entities
|
|
17
|
+
input:
|
|
18
|
+
entities:
|
|
19
|
+
- name: TestProject
|
|
20
|
+
entityType: Project
|
|
21
|
+
observations:
|
|
22
|
+
- "A test project created by mcp-eval-runner"
|
|
23
|
+
expected_output: "TestProject"
|
|
24
|
+
expect:
|
|
25
|
+
output_contains: "TestProject"
|
|
26
|
+
latency_under: 5000
|
|
27
|
+
|
|
28
|
+
- id: create_relation
|
|
29
|
+
description: Create a relation between entities
|
|
30
|
+
tool: create_relations
|
|
31
|
+
input:
|
|
32
|
+
relations:
|
|
33
|
+
- from: TestProject
|
|
34
|
+
to: mcp-eval-runner
|
|
35
|
+
relationType: tested_by
|
|
36
|
+
expected_output: "tested_by"
|
|
37
|
+
expect:
|
|
38
|
+
output_contains: "tested_by"
|
|
39
|
+
latency_under: 5000
|
|
40
|
+
|
|
41
|
+
- id: search_nodes
|
|
42
|
+
description: Search for the entity we just created
|
|
43
|
+
tool: search_nodes
|
|
44
|
+
input:
|
|
45
|
+
query: TestProject
|
|
46
|
+
expected_output: "TestProject"
|
|
47
|
+
expect:
|
|
48
|
+
output_contains: "TestProject"
|
|
49
|
+
output_not_contains: "error"
|
|
50
|
+
latency_under: 5000
|
|
51
|
+
|
|
52
|
+
- id: read_graph
|
|
53
|
+
description: Read the full graph and verify structure
|
|
54
|
+
tool: read_graph
|
|
55
|
+
input: {}
|
|
56
|
+
expected_output: "TestProject"
|
|
57
|
+
expect:
|
|
58
|
+
output_contains: "TestProject"
|
|
59
|
+
latency_under: 5000
|
|
60
|
+
|
|
61
|
+
- id: delete_entity
|
|
62
|
+
description: Clean up — delete the test entity
|
|
63
|
+
tool: delete_entities
|
|
64
|
+
input:
|
|
65
|
+
entityNames:
|
|
66
|
+
- TestProject
|
|
67
|
+
expected_output: "deleted"
|
|
68
|
+
expect:
|
|
69
|
+
output_not_contains: "error"
|
|
70
|
+
latency_under: 5000
|
|
@@ -0,0 +1,25 @@
|
|
|
1
|
+
name: step_piping_example
|
|
2
|
+
description: >
|
|
3
|
+
Demonstrates step output piping: the output of step_1 is injected into
|
|
4
|
+
step_2's input using the {{steps.<id>.output}} placeholder syntax.
|
|
5
|
+
This fixture runs in simulation mode (no server block).
|
|
6
|
+
|
|
7
|
+
steps:
|
|
8
|
+
- id: step_1
|
|
9
|
+
description: First step — produces output that step_2 will consume
|
|
10
|
+
tool: search
|
|
11
|
+
input:
|
|
12
|
+
query: "mcp eval runner"
|
|
13
|
+
expected_output: "result: mcp-eval-runner v1.0"
|
|
14
|
+
expect:
|
|
15
|
+
output_contains: "mcp-eval-runner"
|
|
16
|
+
|
|
17
|
+
- id: step_2
|
|
18
|
+
description: Second step — input references step_1's output via placeholder
|
|
19
|
+
tool: summarize
|
|
20
|
+
input:
|
|
21
|
+
# {{steps.step_1.output}} is replaced at runtime with step_1's actual output
|
|
22
|
+
text: "{{steps.step_1.output}}"
|
|
23
|
+
expected_output: "Summary: mcp-eval-runner v1.0"
|
|
24
|
+
expect:
|
|
25
|
+
output_contains: "Summary"
|
package/evals/smoke.yaml
ADDED
|
@@ -0,0 +1,12 @@
|
|
|
1
|
+
name: smoke
|
|
2
|
+
description: "Verify eval runner itself is working"
|
|
3
|
+
server:
|
|
4
|
+
command: node
|
|
5
|
+
args: ["dist/index.js"]
|
|
6
|
+
steps:
|
|
7
|
+
- id: list_check
|
|
8
|
+
description: "List available test cases and verify smoke fixture appears"
|
|
9
|
+
tool: list_cases
|
|
10
|
+
input: {}
|
|
11
|
+
expect:
|
|
12
|
+
output_contains: "smoke"
|
package/package.json
ADDED
|
@@ -0,0 +1,67 @@
|
|
|
1
|
+
{
|
|
2
|
+
"name": "mcp-eval-runner",
|
|
3
|
+
"mcpName": "io.github.dbsectrainer/mcp-eval-runner",
|
|
4
|
+
"version": "1.0.0",
|
|
5
|
+
"description": "A standardized testing harness for MCP servers and agent workflows",
|
|
6
|
+
"author": "dbsectrainer",
|
|
7
|
+
"license": "MIT",
|
|
8
|
+
"repository": {
|
|
9
|
+
"type": "git",
|
|
10
|
+
"url": "https://github.com/dbsectrainer/mcp-eval-runner.git"
|
|
11
|
+
},
|
|
12
|
+
"homepage": "https://github.com/dbsectrainer/mcp-eval-runner#readme",
|
|
13
|
+
"type": "module",
|
|
14
|
+
"bin": {
|
|
15
|
+
"mcp-eval-runner": "./dist/index.js"
|
|
16
|
+
},
|
|
17
|
+
"files": [
|
|
18
|
+
"dist",
|
|
19
|
+
"evals",
|
|
20
|
+
"README.md",
|
|
21
|
+
"CHANGELOG.md",
|
|
22
|
+
"LICENSE",
|
|
23
|
+
".env.example"
|
|
24
|
+
],
|
|
25
|
+
"keywords": [
|
|
26
|
+
"mcp",
|
|
27
|
+
"mcp-server",
|
|
28
|
+
"testing",
|
|
29
|
+
"evals"
|
|
30
|
+
],
|
|
31
|
+
"scripts": {
|
|
32
|
+
"build": "tsc",
|
|
33
|
+
"test": "vitest run",
|
|
34
|
+
"coverage": "vitest run --coverage",
|
|
35
|
+
"dev": "tsx src/index.ts",
|
|
36
|
+
"inspect": "npx @modelcontextprotocol/inspector node dist/index.js",
|
|
37
|
+
"lint": "eslint src tests",
|
|
38
|
+
"format": "prettier --write src tests",
|
|
39
|
+
"format:check": "prettier --check src tests",
|
|
40
|
+
"prepublishOnly": "npm run build"
|
|
41
|
+
},
|
|
42
|
+
"dependencies": {
|
|
43
|
+
"@modelcontextprotocol/sdk": "^1.12.0",
|
|
44
|
+
"chokidar": "^5.0.0",
|
|
45
|
+
"express": "^5.2.1",
|
|
46
|
+
"js-yaml": "^4.1.0"
|
|
47
|
+
},
|
|
48
|
+
"devDependencies": {
|
|
49
|
+
"@types/express": "^5.0.0",
|
|
50
|
+
"@types/js-yaml": "^4.0.0",
|
|
51
|
+
"@types/node": "^24.12.0",
|
|
52
|
+
"@types/yargs": "^17.0.0",
|
|
53
|
+
"@typescript-eslint/eslint-plugin": "^8.0.0",
|
|
54
|
+
"@typescript-eslint/parser": "^8.0.0",
|
|
55
|
+
"@vitest/coverage-v8": "^4.1.0",
|
|
56
|
+
"eslint": "^10.0.3",
|
|
57
|
+
"eslint-config-prettier": "^10.1.8",
|
|
58
|
+
"prettier": "^3.0.0",
|
|
59
|
+
"tsx": "^4.0.0",
|
|
60
|
+
"typescript": "^5.0.0",
|
|
61
|
+
"vitest": "^4.1.0",
|
|
62
|
+
"yargs": "^18.0.0"
|
|
63
|
+
},
|
|
64
|
+
"engines": {
|
|
65
|
+
"node": ">=22.5.0"
|
|
66
|
+
}
|
|
67
|
+
}
|