mcp-eval-runner 1.0.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (48)
  1. package/.env.example +39 -0
  2. package/CHANGELOG.md +67 -0
  3. package/LICENSE +21 -0
  4. package/README.md +328 -0
  5. package/dist/assertions.d.ts +63 -0
  6. package/dist/assertions.js +187 -0
  7. package/dist/audit-log.d.ts +26 -0
  8. package/dist/audit-log.js +57 -0
  9. package/dist/auth.d.ts +15 -0
  10. package/dist/auth.js +83 -0
  11. package/dist/db.d.ts +40 -0
  12. package/dist/db.js +94 -0
  13. package/dist/deployment-gate.d.ts +27 -0
  14. package/dist/deployment-gate.js +43 -0
  15. package/dist/fixture-library.d.ts +26 -0
  16. package/dist/fixture-library.js +85 -0
  17. package/dist/fixture.d.ts +87 -0
  18. package/dist/fixture.js +170 -0
  19. package/dist/http-server.d.ts +7 -0
  20. package/dist/http-server.js +34 -0
  21. package/dist/index.d.ts +15 -0
  22. package/dist/index.js +158 -0
  23. package/dist/llm-judge.d.ts +24 -0
  24. package/dist/llm-judge.js +139 -0
  25. package/dist/rate-limiter.d.ts +13 -0
  26. package/dist/rate-limiter.js +36 -0
  27. package/dist/reporter.d.ts +8 -0
  28. package/dist/reporter.js +163 -0
  29. package/dist/runner.d.ts +57 -0
  30. package/dist/runner.js +339 -0
  31. package/dist/server.d.ts +22 -0
  32. package/dist/server.js +583 -0
  33. package/dist/tools/html_report.d.ts +8 -0
  34. package/dist/tools/html_report.js +188 -0
  35. package/dist/tools/manage.d.ts +11 -0
  36. package/dist/tools/manage.js +41 -0
  37. package/dist/tools/report.d.ts +12 -0
  38. package/dist/tools/report.js +120 -0
  39. package/dist/tools/run.d.ts +20 -0
  40. package/dist/tools/run.js +166 -0
  41. package/dist/tools/scaffold.d.ts +11 -0
  42. package/dist/tools/scaffold.js +90 -0
  43. package/evals/reference/mcp-fetch.yaml +46 -0
  44. package/evals/reference/mcp-filesystem.yaml +63 -0
  45. package/evals/reference/mcp-memory.yaml +70 -0
  46. package/evals/reference/step-piping-example.yaml +25 -0
  47. package/evals/smoke.yaml +12 -0
  48. package/package.json +67 -0
package/evals/reference/mcp-fetch.yaml ADDED
@@ -0,0 +1,46 @@
1
+ name: mcp_fetch_reference
2
+ description: >
3
+ Reference eval suite for the official MCP Fetch server
4
+ (github.com/modelcontextprotocol/servers/tree/main/src/fetch).
5
+ Covers URL fetching in multiple formats and robots.txt handling.
6
+
7
+ server:
8
+ command: uvx
9
+ args: ["mcp-server-fetch"]
10
+
11
+ steps:
12
+ - id: fetch_html
13
+ description: Fetch a URL and return as markdown
14
+ tool: fetch
15
+ input:
16
+ url: "https://example.com"
17
+ max_length: 1000
18
+ expected_output: "Example Domain"
19
+ expect:
20
+ output_contains: "Example Domain"
21
+ output_not_contains: "error"
22
+ latency_under: 10000
23
+
24
+ - id: fetch_raw
25
+ description: Fetch raw HTML content
26
+ tool: fetch
27
+ input:
28
+ url: "https://example.com"
29
+ raw: true
30
+ max_length: 500
31
+ expected_output: "<html"
32
+ expect:
33
+ output_matches: "<html|<!DOCTYPE"
34
+ latency_under: 10000
35
+
36
+ - id: fetch_with_start_index
37
+ description: Fetch with pagination offset
38
+ tool: fetch
39
+ input:
40
+ url: "https://example.com"
41
+ start_index: 100
42
+ max_length: 200
43
+ expected_output: ""
44
+ expect:
45
+ output_not_contains: "error"
46
+ latency_under: 10000
package/evals/reference/mcp-filesystem.yaml ADDED
@@ -0,0 +1,63 @@
1
+ name: mcp_filesystem_reference
2
+ description: >
3
+ Reference eval suite for the official MCP Filesystem server
4
+ (github.com/modelcontextprotocol/servers/tree/main/src/filesystem).
5
+ Covers read, write, list, and search operations.
6
+ Set MCP_FS_CMD and MCP_FS_ROOT, or remove the server block for simulation mode.
7
+
8
+ server:
9
+ command: node
10
+ args: ["${MCP_FS_CMD:-/usr/local/bin/mcp-filesystem}", "${MCP_FS_ROOT:-/tmp}"]
11
+
12
+ steps:
13
+ - id: list_root
14
+ description: List the allowed root directory
15
+ tool: list_directory
16
+ input:
17
+ path: "/"
18
+ expected_output: "directory"
19
+ expect:
20
+ output_not_contains: "error"
21
+ latency_under: 3000
22
+
23
+ - id: write_file
24
+ description: Write a test file
25
+ tool: write_file
26
+ input:
27
+ path: "/eval-test-file.txt"
28
+ content: "mcp-eval-runner test content"
29
+ expected_output: "written"
30
+ expect:
31
+ output_not_contains: "error"
32
+ latency_under: 3000
33
+
34
+ - id: read_file
35
+ description: Read back the file we just wrote — same hard-coded path as write_file, no step output piping
36
+ tool: read_file
37
+ input:
38
+ path: "/eval-test-file.txt"
39
+ expected_output: "mcp-eval-runner test content"
40
+ expect:
41
+ output_contains: "mcp-eval-runner test content"
42
+ latency_under: 3000
43
+
44
+ - id: search_files
45
+ description: Search for the test file by name
46
+ tool: search_files
47
+ input:
48
+ path: "/"
49
+ pattern: "eval-test-file"
50
+ expected_output: "eval-test-file.txt"
51
+ expect:
52
+ output_contains: "eval-test-file"
53
+ latency_under: 5000
54
+
55
+ - id: delete_file
56
+ description: Clean up — delete the test file
57
+ tool: delete_file
58
+ input:
59
+ path: "/eval-test-file.txt"
60
+ expected_output: "deleted"
61
+ expect:
62
+ output_not_contains: "error"
63
+ latency_under: 3000
package/evals/reference/mcp-memory.yaml ADDED
@@ -0,0 +1,70 @@
1
+ name: mcp_memory_reference
2
+ description: >
3
+ Reference eval suite for the official MCP Memory server
4
+ (github.com/modelcontextprotocol/servers/tree/main/src/memory).
5
+ Covers entity creation, relation creation, search, and deletion.
6
+ Set MCP_MEMORY_CMD to the path of your built memory server binary,
7
+ or remove the server block to run in simulation mode.
8
+
9
+ server:
10
+ command: node
11
+ args: ["${MCP_MEMORY_CMD:-/usr/local/bin/mcp-memory}"]
12
+
13
+ steps:
14
+ - id: create_entity
15
+ description: Create a test entity in the knowledge graph
16
+ tool: create_entities
17
+ input:
18
+ entities:
19
+ - name: TestProject
20
+ entityType: Project
21
+ observations:
22
+ - "A test project created by mcp-eval-runner"
23
+ expected_output: "TestProject"
24
+ expect:
25
+ output_contains: "TestProject"
26
+ latency_under: 5000
27
+
28
+ - id: create_relation
29
+ description: Create a relation between entities
30
+ tool: create_relations
31
+ input:
32
+ relations:
33
+ - from: TestProject
34
+ to: mcp-eval-runner
35
+ relationType: tested_by
36
+ expected_output: "tested_by"
37
+ expect:
38
+ output_contains: "tested_by"
39
+ latency_under: 5000
40
+
41
+ - id: search_nodes
42
+ description: Search for the entity we just created
43
+ tool: search_nodes
44
+ input:
45
+ query: TestProject
46
+ expected_output: "TestProject"
47
+ expect:
48
+ output_contains: "TestProject"
49
+ output_not_contains: "error"
50
+ latency_under: 5000
51
+
52
+ - id: read_graph
53
+ description: Read the full graph and verify structure
54
+ tool: read_graph
55
+ input: {}
56
+ expected_output: "TestProject"
57
+ expect:
58
+ output_contains: "TestProject"
59
+ latency_under: 5000
60
+
61
+ - id: delete_entity
62
+ description: Clean up — delete the test entity
63
+ tool: delete_entities
64
+ input:
65
+ entityNames:
66
+ - TestProject
67
+ expected_output: "deleted"
68
+ expect:
69
+ output_not_contains: "error"
70
+ latency_under: 5000
package/evals/reference/step-piping-example.yaml ADDED
@@ -0,0 +1,25 @@
1
+ name: step_piping_example
2
+ description: >
3
+ Demonstrates step output piping: the output of step_1 is injected into
4
+ step_2's input using the {{steps.<id>.output}} placeholder syntax.
5
+ This fixture runs in simulation mode (no server block).
6
+
7
+ steps:
8
+ - id: step_1
9
+ description: First step — produces output that step_2 will consume
10
+ tool: search
11
+ input:
12
+ query: "mcp eval runner"
13
+ expected_output: "result: mcp-eval-runner v1.0"
14
+ expect:
15
+ output_contains: "mcp-eval-runner"
16
+
17
+ - id: step_2
18
+ description: Second step — input references step_1's output via placeholder
19
+ tool: summarize
20
+ input:
21
+ # {{steps.step_1.output}} is replaced at runtime with step_1's actual output
22
+ text: "{{steps.step_1.output}}"
23
+ expected_output: "Summary: mcp-eval-runner v1.0"
24
+ expect:
25
+ output_contains: "Summary"
package/evals/smoke.yaml ADDED
@@ -0,0 +1,12 @@
1
+ name: smoke
2
+ description: "Verify eval runner itself is working"
3
+ server:
4
+ command: node
5
+ args: ["dist/index.js"]
6
+ steps:
7
+ - id: list_check
8
+ description: "List available test cases and verify smoke fixture appears"
9
+ tool: list_cases
10
+ input: {}
11
+ expect:
12
+ output_contains: "smoke"
package/package.json ADDED
@@ -0,0 +1,67 @@
1
+ {
2
+ "name": "mcp-eval-runner",
3
+ "mcpName": "io.github.dbsectrainer/mcp-eval-runner",
4
+ "version": "1.0.0",
5
+ "description": "A standardized testing harness for MCP servers and agent workflows",
6
+ "author": "dbsectrainer",
7
+ "license": "MIT",
8
+ "repository": {
9
+ "type": "git",
10
+ "url": "https://github.com/dbsectrainer/mcp-eval-runner.git"
11
+ },
12
+ "homepage": "https://github.com/dbsectrainer/mcp-eval-runner#readme",
13
+ "type": "module",
14
+ "bin": {
15
+ "mcp-eval-runner": "./dist/index.js"
16
+ },
17
+ "files": [
18
+ "dist",
19
+ "evals",
20
+ "README.md",
21
+ "CHANGELOG.md",
22
+ "LICENSE",
23
+ ".env.example"
24
+ ],
25
+ "keywords": [
26
+ "mcp",
27
+ "mcp-server",
28
+ "testing",
29
+ "evals"
30
+ ],
31
+ "scripts": {
32
+ "build": "tsc",
33
+ "test": "vitest run",
34
+ "coverage": "vitest run --coverage",
35
+ "dev": "tsx src/index.ts",
36
+ "inspect": "npx @modelcontextprotocol/inspector node dist/index.js",
37
+ "lint": "eslint src tests",
38
+ "format": "prettier --write src tests",
39
+ "format:check": "prettier --check src tests",
40
+ "prepublishOnly": "npm run build"
41
+ },
42
+ "dependencies": {
43
+ "@modelcontextprotocol/sdk": "^1.12.0",
44
+ "chokidar": "^5.0.0",
45
+ "express": "^5.2.1",
46
+ "js-yaml": "^4.1.0"
47
+ },
48
+ "devDependencies": {
49
+ "@types/express": "^5.0.0",
50
+ "@types/js-yaml": "^4.0.0",
51
+ "@types/node": "^24.12.0",
52
+ "@types/yargs": "^17.0.0",
53
+ "@typescript-eslint/eslint-plugin": "^8.0.0",
54
+ "@typescript-eslint/parser": "^8.0.0",
55
+ "@vitest/coverage-v8": "^4.1.0",
56
+ "eslint": "^10.0.3",
57
+ "eslint-config-prettier": "^10.1.8",
58
+ "prettier": "^3.0.0",
59
+ "tsx": "^4.0.0",
60
+ "typescript": "^5.0.0",
61
+ "vitest": "^4.1.0",
62
+ "yargs": "^18.0.0"
63
+ },
64
+ "engines": {
65
+ "node": ">=22.5.0"
66
+ }
67
+ }