@gleanwork/mcp-server-tester 1.0.0-beta.2 → 1.0.0-beta.4
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/README.md +24 -22
- package/dist/cli/index.js +38 -12
- package/dist/fixtures/mcp.d.ts +14 -6
- package/dist/fixtures/mcp.js +9 -6
- package/dist/fixtures/mcp.js.map +1 -1
- package/dist/index.cjs +69 -47
- package/dist/index.cjs.map +1 -1
- package/dist/index.d.cts +208 -1175
- package/dist/index.d.ts +208 -1175
- package/dist/index.js +69 -47
- package/dist/index.js.map +1 -1
- package/dist/reporters/mcpReporter.cjs.map +1 -1
- package/dist/reporters/mcpReporter.js.map +1 -1
- package/dist/reporters/ui-dist/app.js +107 -7
- package/dist/reporters/ui-dist/styles.css +1 -1
- package/package.json +11 -8
- package/src/reporters/ui-dist/app.js +0 -174
- package/src/reporters/ui-dist/index.html +0 -28
- package/src/reporters/ui-dist/styles.css +0 -1
package/README.md
CHANGED
|
@@ -1,6 +1,6 @@
|
|
|
1
1
|
# @gleanwork/mcp-server-tester
|
|
2
2
|
|
|
3
|
-
[](https://github.com/gleanwork/.github/blob/main/docs/repository-stability.md#ga)
|
|
4
4
|
[](https://www.npmjs.com/package/@gleanwork/mcp-server-tester)
|
|
5
5
|
[](https://github.com/gleanwork/mcp-server-tester/actions/workflows/ci.yml)
|
|
6
6
|
[](https://opensource.org/licenses/MIT)
|
|
@@ -11,7 +11,7 @@ A testing and evaluation framework for [Model Context Protocol (MCP)](https://mo
|
|
|
11
11
|
|
|
12
12
|
The `mcp` Playwright fixture connects to your MCP server (stdio or HTTP) and exposes a high-level API for calling tools and asserting responses. Custom matchers keep assertions readable.
|
|
13
13
|
|
|
14
|
-
```typescript
|
|
14
|
+
```typescript snippet=snippets/basic-test.ts
|
|
15
15
|
import { test, expect } from '@gleanwork/mcp-server-tester/fixtures/mcp';
|
|
16
16
|
|
|
17
17
|
test('read_file returns file contents', async ({ mcp }) => {
|
|
@@ -30,24 +30,25 @@ Playwright tests are fast, deterministic, and designed for CI. Use them for regr
|
|
|
30
30
|
|
|
31
31
|
Available matchers:
|
|
32
32
|
|
|
33
|
-
| Matcher | Description
|
|
34
|
-
| ------------------------ |
|
|
35
|
-
| `
|
|
36
|
-
| `
|
|
37
|
-
| `
|
|
38
|
-
| `
|
|
39
|
-
| `
|
|
40
|
-
| `
|
|
41
|
-
| `
|
|
42
|
-
| `
|
|
43
|
-
| `
|
|
44
|
-
| `
|
|
33
|
+
| Matcher | Description |
|
|
34
|
+
| ------------------------ | ---------------------------------------------------- |
|
|
35
|
+
| `toMatchToolResponse` | Response exactly matches expected value (deep equal) |
|
|
36
|
+
| `toContainToolText` | Response contains expected substrings |
|
|
37
|
+
| `toMatchToolSchema` | Response validates against a Zod schema |
|
|
38
|
+
| `toMatchToolPattern` | Response matches a regex pattern |
|
|
39
|
+
| `toMatchToolSnapshot` | Response matches a saved baseline |
|
|
40
|
+
| `toBeToolError` | Response is (or is not) an error |
|
|
41
|
+
| `toHaveToolResponseSize` | Response size is within bounds |
|
|
42
|
+
| `toSatisfyToolPredicate` | Response satisfies a custom function |
|
|
43
|
+
| `toHaveToolCalls` | LLM called the expected tools |
|
|
44
|
+
| `toHaveToolCallCount` | LLM made N tool calls |
|
|
45
|
+
| `toPassToolJudge` | LLM evaluates response quality against a rubric |
|
|
45
46
|
|
|
46
47
|
## Eval Datasets
|
|
47
48
|
|
|
48
49
|
Eval datasets let you define test cases as JSON files and run them with `runEvalDataset()`. Each case specifies a tool call and one or more assertions.
|
|
49
50
|
|
|
50
|
-
```json
|
|
51
|
+
```json snippet=snippets/eval-dataset.json
|
|
51
52
|
{
|
|
52
53
|
"name": "file-ops",
|
|
53
54
|
"cases": [
|
|
@@ -72,7 +73,7 @@ Eval datasets let you define test cases as JSON files and run them with `runEval
|
|
|
72
73
|
}
|
|
73
74
|
```
|
|
74
75
|
|
|
75
|
-
```typescript
|
|
76
|
+
```typescript snippet=snippets/run-eval-dataset.ts
|
|
76
77
|
import { test, expect } from '@gleanwork/mcp-server-tester/fixtures/mcp';
|
|
77
78
|
import { loadEvalDataset, runEvalDataset } from '@gleanwork/mcp-server-tester';
|
|
78
79
|
import { z } from 'zod';
|
|
@@ -101,12 +102,12 @@ Supported assertion types:
|
|
|
101
102
|
|
|
102
103
|
In LLM host mode, a real LLM receives your server's tool list and a natural language prompt, then decides which tools to call. This tests whether your tool names, descriptions, and input schemas are clear enough for autonomous use — a different question from whether the tools return correct output.
|
|
103
104
|
|
|
104
|
-
```json
|
|
105
|
+
```json snippet=snippets/mcp-host-dataset.json
|
|
105
106
|
{
|
|
106
107
|
"id": "find-config",
|
|
107
|
-
"mode": "
|
|
108
|
+
"mode": "mcp_host",
|
|
108
109
|
"scenario": "Find the application config file and return its contents",
|
|
109
|
-
"
|
|
110
|
+
"mcpHostConfig": {
|
|
110
111
|
"provider": "anthropic",
|
|
111
112
|
"model": "claude-opus-4-20250514"
|
|
112
113
|
},
|
|
@@ -118,7 +119,7 @@ In LLM host mode, a real LLM receives your server's tool list and a natural lang
|
|
|
118
119
|
}
|
|
119
120
|
```
|
|
120
121
|
|
|
121
|
-
LLM host mode makes real API calls and produces non-deterministic results. Use `iterations` to run a case multiple times and measure pass rate rather than expecting 100% on a single run. See the [LLM Host Guide](docs/
|
|
122
|
+
LLM host mode makes real API calls and produces non-deterministic results. Use `iterations` to run a case multiple times and measure pass rate rather than expecting 100% on a single run. See the [LLM Host Guide](docs/mcp-host.md) for configuration and cost management.
|
|
122
123
|
|
|
123
124
|
## Installation
|
|
124
125
|
|
|
@@ -146,7 +147,7 @@ The CLI wizard creates a `playwright.config.ts`, example tests, and a sample eva
|
|
|
146
147
|
|
|
147
148
|
Point the framework at your MCP server in `playwright.config.ts`:
|
|
148
149
|
|
|
149
|
-
```typescript
|
|
150
|
+
```typescript snippet=snippets/playwright-config.ts
|
|
150
151
|
import { defineConfig } from '@playwright/test';
|
|
151
152
|
|
|
152
153
|
export default defineConfig({
|
|
@@ -173,12 +174,13 @@ For HTTP servers, set `transport: 'http'` and `serverUrl`. For servers that requ
|
|
|
173
174
|
|
|
174
175
|
- [Quick Start](./docs/quickstart.md) — detailed setup and configuration
|
|
175
176
|
- [Expectations](./docs/expectations.md) — all assertion types including snapshot sanitizers
|
|
176
|
-
- [LLM Host Simulation](docs/
|
|
177
|
+
- [LLM Host Simulation](docs/mcp-host.md) — tool discoverability testing
|
|
177
178
|
- [API Reference](./docs/api-reference.md)
|
|
178
179
|
- [Transports](./docs/transports.md) — stdio and HTTP configuration, OAuth
|
|
179
180
|
- [CLI Commands](./docs/cli.md) — init, generate, login, token
|
|
180
181
|
- [UI Reporter](./docs/ui-reporter.md) — interactive web UI for test results
|
|
181
182
|
- [Development](./docs/development.md) — contributing and building
|
|
183
|
+
- [Migration Guide (v0.12 → v1.0)](./docs/migrations/migration-1.0.md) — upgrading from pre-1.0 releases
|
|
182
184
|
|
|
183
185
|
## Examples
|
|
184
186
|
|
package/dist/cli/index.js
CHANGED
|
@@ -17,7 +17,7 @@ import { SSEClientTransport } from '@modelcontextprotocol/sdk/client/sse.js';
|
|
|
17
17
|
import { z } from 'zod';
|
|
18
18
|
import createDebug from 'debug';
|
|
19
19
|
import { ProxyAgent, Agent } from 'undici';
|
|
20
|
-
import { readFileSync } from 'fs';
|
|
20
|
+
import { existsSync, readFileSync } from 'fs';
|
|
21
21
|
import * as oauth from 'oauth4webapi';
|
|
22
22
|
import { homedir } from 'os';
|
|
23
23
|
import * as http from 'http';
|
|
@@ -80,7 +80,7 @@ function JsonPreview({ data, maxLines = 15 }) {
|
|
|
80
80
|
|
|
81
81
|
// package.json
|
|
82
82
|
var package_default = {
|
|
83
|
-
version: "1.0.0-beta.
|
|
83
|
+
version: "1.0.0-beta.4"};
|
|
84
84
|
|
|
85
85
|
// src/cli/templates/index.ts
|
|
86
86
|
function getPlaywrightConfigTemplate(answers) {
|
|
@@ -255,10 +255,10 @@ function getPackageJsonTemplate(projectName) {
|
|
|
255
255
|
"evals"
|
|
256
256
|
],
|
|
257
257
|
"dependencies": {
|
|
258
|
-
"@modelcontextprotocol/sdk": "^1.0
|
|
258
|
+
"@modelcontextprotocol/sdk": "^1.27.0",
|
|
259
259
|
"@playwright/test": "^1.49.0",
|
|
260
260
|
"@gleanwork/mcp-server-tester": "^${package_default.version}",
|
|
261
|
-
"zod": "^
|
|
261
|
+
"zod": "^4.0.0"
|
|
262
262
|
},
|
|
263
263
|
"devDependencies": {
|
|
264
264
|
"typescript": "^5.7.2"
|
|
@@ -531,7 +531,7 @@ async function init(options) {
|
|
|
531
531
|
await waitUntilExit();
|
|
532
532
|
}
|
|
533
533
|
var MCPHostCapabilitiesSchema = z.object({
|
|
534
|
-
sampling: z.record(z.unknown()).optional(),
|
|
534
|
+
sampling: z.record(z.string(), z.unknown()).optional(),
|
|
535
535
|
roots: z.object({
|
|
536
536
|
listChanged: z.boolean()
|
|
537
537
|
}).optional()
|
|
@@ -590,7 +590,7 @@ var HttpConfigSchema = z.object({
|
|
|
590
590
|
}
|
|
591
591
|
return true;
|
|
592
592
|
}),
|
|
593
|
-
headers: z.record(z.string()).optional(),
|
|
593
|
+
headers: z.record(z.string(), z.string()).optional(),
|
|
594
594
|
capabilities: MCPHostCapabilitiesSchema.optional(),
|
|
595
595
|
connectTimeoutMs: z.number().positive().optional(),
|
|
596
596
|
requestTimeoutMs: z.number().positive().optional(),
|
|
@@ -827,7 +827,7 @@ async function retryWithBackoff(fn, maxAttempts) {
|
|
|
827
827
|
delayMs,
|
|
828
828
|
err.message
|
|
829
829
|
);
|
|
830
|
-
await new Promise((
|
|
830
|
+
await new Promise((resolve4) => setTimeout(resolve4, delayMs));
|
|
831
831
|
} else {
|
|
832
832
|
throw err;
|
|
833
833
|
}
|
|
@@ -877,7 +877,10 @@ async function createMCPClientForConfig(config, options) {
|
|
|
877
877
|
validatedConfig.connectTimeoutMs !== void 0 ? { timeout: validatedConfig.connectTimeoutMs } : void 0
|
|
878
878
|
);
|
|
879
879
|
} else if (isHttpConfig(validatedConfig)) {
|
|
880
|
-
const headers = {
|
|
880
|
+
const headers = {
|
|
881
|
+
"User-Agent": `@gleanwork/mcp-server-tester/${package_default.version}`,
|
|
882
|
+
...validatedConfig.headers
|
|
883
|
+
};
|
|
881
884
|
if (validatedConfig.auth?.clientCredentials && true) {
|
|
882
885
|
const ccConfig = validatedConfig.auth.clientCredentials;
|
|
883
886
|
const clientId = ccConfig.clientId ?? process.env["MCP_CLIENT_ID"];
|
|
@@ -1680,7 +1683,7 @@ ${errorText}`
|
|
|
1680
1683
|
*/
|
|
1681
1684
|
async startCallbackServer(expectedState) {
|
|
1682
1685
|
const timeoutMs = this.config.timeoutMs ?? DEFAULT_TIMEOUT_MS;
|
|
1683
|
-
return new Promise((
|
|
1686
|
+
return new Promise((resolve4, reject) => {
|
|
1684
1687
|
const server = http.createServer();
|
|
1685
1688
|
const connections = /* @__PURE__ */ new Set();
|
|
1686
1689
|
server.on("connection", (socket) => {
|
|
@@ -1753,7 +1756,7 @@ ${errorText}`
|
|
|
1753
1756
|
server.listen(preferredPort, "127.0.0.1", () => {
|
|
1754
1757
|
const address = server.address();
|
|
1755
1758
|
debug2("Callback server listening on port", address.port);
|
|
1756
|
-
|
|
1759
|
+
resolve4({ port: address.port, codePromise, close: forceClose });
|
|
1757
1760
|
});
|
|
1758
1761
|
server.on("error", (err) => {
|
|
1759
1762
|
reject(err);
|
|
@@ -1774,8 +1777,8 @@ ${errorText}`
|
|
|
1774
1777
|
return;
|
|
1775
1778
|
}
|
|
1776
1779
|
try {
|
|
1777
|
-
const
|
|
1778
|
-
await
|
|
1780
|
+
const open2 = await import('open');
|
|
1781
|
+
await open2.default(url.toString());
|
|
1779
1782
|
debug2("Opened browser for authentication");
|
|
1780
1783
|
} catch (error) {
|
|
1781
1784
|
debug2("Failed to open browser:", error);
|
|
@@ -3059,6 +3062,24 @@ async function token(serverUrl, options) {
|
|
|
3059
3062
|
);
|
|
3060
3063
|
await waitUntilExit();
|
|
3061
3064
|
}
|
|
3065
|
+
async function open(options) {
|
|
3066
|
+
const outputDir = resolve(options.dir ?? ".mcp-test-results");
|
|
3067
|
+
const reportPath = join(outputDir, "latest", "index.html");
|
|
3068
|
+
if (!existsSync(reportPath)) {
|
|
3069
|
+
console.error(`No report found at ${reportPath}`);
|
|
3070
|
+
console.error("Run your Playwright tests first to generate a report.");
|
|
3071
|
+
process.exit(1);
|
|
3072
|
+
}
|
|
3073
|
+
console.log(`Opening report: ${reportPath}`);
|
|
3074
|
+
try {
|
|
3075
|
+
const { default: openBrowser } = await import('open');
|
|
3076
|
+
await openBrowser(reportPath);
|
|
3077
|
+
} catch (error) {
|
|
3078
|
+
console.error("Failed to open report in browser:", error);
|
|
3079
|
+
console.error(`Open manually: file://${reportPath}`);
|
|
3080
|
+
process.exit(1);
|
|
3081
|
+
}
|
|
3082
|
+
}
|
|
3062
3083
|
|
|
3063
3084
|
// src/cli/index.ts
|
|
3064
3085
|
var program = new Command();
|
|
@@ -3074,4 +3095,9 @@ program.command("token").description("Output stored OAuth tokens for CI/CD use")
|
|
|
3074
3095
|
"Output format: env, json, or gh (default: env)",
|
|
3075
3096
|
"env"
|
|
3076
3097
|
).option("--state-dir <dir>", "Custom directory for token storage").action(token);
|
|
3098
|
+
program.command("open").description("Open the MCP eval reporter UI in your browser").option(
|
|
3099
|
+
"-d, --dir <directory>",
|
|
3100
|
+
"Report output directory",
|
|
3101
|
+
".mcp-test-results"
|
|
3102
|
+
).action(open);
|
|
3077
3103
|
program.parse();
|
package/dist/fixtures/mcp.d.ts
CHANGED
|
@@ -169,6 +169,14 @@ declare function toMatchToolPattern(this: {
|
|
|
169
169
|
/**
|
|
170
170
|
* Creates the toMatchToolSnapshot matcher function
|
|
171
171
|
*
|
|
172
|
+
* @remarks
|
|
173
|
+
* **Requires Playwright test context.** This matcher calls `expect(content).toMatchSnapshot()`
|
|
174
|
+
* internally, which only works inside a Playwright test (i.e., when `testInfo` is available).
|
|
175
|
+
* Calling it outside a Playwright test will throw a cryptic context error.
|
|
176
|
+
*
|
|
177
|
+
* To test sanitizer logic without a Playwright context, use the exported `applySanitizers`
|
|
178
|
+
* function directly.
|
|
179
|
+
*
|
|
172
180
|
* Note: This is an async matcher that uses Playwright's snapshot testing.
|
|
173
181
|
*/
|
|
174
182
|
declare function toMatchToolSnapshot(this: {
|
|
@@ -209,9 +217,9 @@ type RubricSpec = BuiltInRubric | {
|
|
|
209
217
|
type ProviderKind = 'anthropic' | 'openai' | 'google';
|
|
210
218
|
|
|
211
219
|
/**
|
|
212
|
-
* Tool call validators for
|
|
220
|
+
* Tool call validators for mcp_host simulation results.
|
|
213
221
|
*
|
|
214
|
-
* These validators extract the tool call trace from an
|
|
222
|
+
* These validators extract the tool call trace from an MCPHostSimulationResult
|
|
215
223
|
* and apply assertions against expected call lists and counts.
|
|
216
224
|
*/
|
|
217
225
|
|
|
@@ -400,7 +408,7 @@ declare global {
|
|
|
400
408
|
*/
|
|
401
409
|
toSatisfyToolPredicate(predicate: ToolPredicate, description?: string): Promise<R>;
|
|
402
410
|
/**
|
|
403
|
-
* Validates which tools the LLM called during
|
|
411
|
+
* Validates which tools the LLM called during a mcp_host simulation.
|
|
404
412
|
*
|
|
405
413
|
* @example
|
|
406
414
|
* ```typescript
|
|
@@ -412,7 +420,7 @@ declare global {
|
|
|
412
420
|
*/
|
|
413
421
|
toHaveToolCalls(expectation: ToolCallExpectation): R;
|
|
414
422
|
/**
|
|
415
|
-
* Validates the number of tool calls made during
|
|
423
|
+
* Validates the number of tool calls made during a mcp_host simulation.
|
|
416
424
|
*
|
|
417
425
|
* @example
|
|
418
426
|
* ```typescript
|
|
@@ -523,7 +531,7 @@ declare function toSatisfyToolPredicate(this: {
|
|
|
523
531
|
/**
|
|
524
532
|
* toHaveToolCalls Matcher
|
|
525
533
|
*
|
|
526
|
-
* Validates which tools the LLM called during
|
|
534
|
+
* Validates which tools the LLM called during a mcp_host simulation.
|
|
527
535
|
*/
|
|
528
536
|
|
|
529
537
|
/**
|
|
@@ -539,7 +547,7 @@ declare function toHaveToolCalls(this: {
|
|
|
539
547
|
/**
|
|
540
548
|
* toHaveToolCallCount Matcher
|
|
541
549
|
*
|
|
542
|
-
* Validates the number of tool calls made during
|
|
550
|
+
* Validates the number of tool calls made during a mcp_host simulation.
|
|
543
551
|
*/
|
|
544
552
|
|
|
545
553
|
/**
|
package/dist/fixtures/mcp.js
CHANGED
|
@@ -1215,7 +1215,7 @@ function validateToolCalls(response, expectation) {
|
|
|
1215
1215
|
if (!isSimulationResult(response)) {
|
|
1216
1216
|
return {
|
|
1217
1217
|
pass: false,
|
|
1218
|
-
message: "toolsTriggered expectation requires
|
|
1218
|
+
message: "toolsTriggered expectation requires mcp_host mode \u2014 response must be an MCPHostSimulationResult"
|
|
1219
1219
|
};
|
|
1220
1220
|
}
|
|
1221
1221
|
const actual = response.toolCalls;
|
|
@@ -1275,7 +1275,7 @@ function validateToolCallCount(response, options) {
|
|
|
1275
1275
|
if (!isSimulationResult(response)) {
|
|
1276
1276
|
return {
|
|
1277
1277
|
pass: false,
|
|
1278
|
-
message: "toolCallCount expectation requires
|
|
1278
|
+
message: "toolCallCount expectation requires mcp_host mode \u2014 response must be an MCPHostSimulationResult"
|
|
1279
1279
|
};
|
|
1280
1280
|
}
|
|
1281
1281
|
const count = response.toolCalls.length;
|
|
@@ -1337,7 +1337,7 @@ var expect = expect$1.extend({
|
|
|
1337
1337
|
toHaveToolCallCount
|
|
1338
1338
|
});
|
|
1339
1339
|
var MCPHostCapabilitiesSchema = z.object({
|
|
1340
|
-
sampling: z.record(z.unknown()).optional(),
|
|
1340
|
+
sampling: z.record(z.string(), z.unknown()).optional(),
|
|
1341
1341
|
roots: z.object({
|
|
1342
1342
|
listChanged: z.boolean()
|
|
1343
1343
|
}).optional()
|
|
@@ -1396,7 +1396,7 @@ var HttpConfigSchema = z.object({
|
|
|
1396
1396
|
}
|
|
1397
1397
|
return true;
|
|
1398
1398
|
}),
|
|
1399
|
-
headers: z.record(z.string()).optional(),
|
|
1399
|
+
headers: z.record(z.string(), z.string()).optional(),
|
|
1400
1400
|
capabilities: MCPHostCapabilitiesSchema.optional(),
|
|
1401
1401
|
connectTimeoutMs: z.number().positive().optional(),
|
|
1402
1402
|
requestTimeoutMs: z.number().positive().optional(),
|
|
@@ -1434,7 +1434,7 @@ var debugHttp = createDebug(`${NAMESPACE}:http`);
|
|
|
1434
1434
|
|
|
1435
1435
|
// package.json
|
|
1436
1436
|
var package_default = {
|
|
1437
|
-
version: "1.0.0-beta.
|
|
1437
|
+
version: "1.0.0-beta.4"};
|
|
1438
1438
|
var debug = createDebug("mcp-server-tester:oauth-flow");
|
|
1439
1439
|
async function generatePKCE() {
|
|
1440
1440
|
const codeVerifier = oauth.generateRandomCodeVerifier();
|
|
@@ -1687,7 +1687,10 @@ async function createMCPClientForConfig(config, options) {
|
|
|
1687
1687
|
validatedConfig.connectTimeoutMs !== void 0 ? { timeout: validatedConfig.connectTimeoutMs } : void 0
|
|
1688
1688
|
);
|
|
1689
1689
|
} else if (isHttpConfig(validatedConfig)) {
|
|
1690
|
-
const headers = {
|
|
1690
|
+
const headers = {
|
|
1691
|
+
"User-Agent": `@gleanwork/mcp-server-tester/${package_default.version}`,
|
|
1692
|
+
...validatedConfig.headers
|
|
1693
|
+
};
|
|
1691
1694
|
if (validatedConfig.auth?.clientCredentials && !options?.authProvider) {
|
|
1692
1695
|
const ccConfig = validatedConfig.auth.clientCredentials;
|
|
1693
1696
|
const clientId = ccConfig.clientId ?? process.env["MCP_CLIENT_ID"];
|