@gleanwork/mcp-server-tester 1.0.0-beta.3 → 1.0.0-beta.5
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/README.md +11 -10
- package/dist/cli/index.js +34 -11
- package/dist/fixtures/mcp.d.ts +6 -6
- package/dist/fixtures/mcp.js +5 -5
- package/dist/fixtures/mcp.js.map +1 -1
- package/dist/index.cjs +79 -45
- package/dist/index.cjs.map +1 -1
- package/dist/index.d.cts +215 -1168
- package/dist/index.d.ts +215 -1168
- package/dist/index.js +79 -45
- package/dist/index.js.map +1 -1
- package/dist/reporters/mcpReporter.cjs.map +1 -1
- package/dist/reporters/mcpReporter.js.map +1 -1
- package/dist/reporters/ui-dist/app.js +107 -7
- package/dist/reporters/ui-dist/styles.css +1 -1
- package/package.json +9 -6
- package/src/reporters/ui-dist/app.js +0 -174
- package/src/reporters/ui-dist/index.html +0 -28
- package/src/reporters/ui-dist/styles.css +0 -1
package/README.md
CHANGED
|
@@ -1,6 +1,6 @@
|
|
|
1
1
|
# @gleanwork/mcp-server-tester
|
|
2
2
|
|
|
3
|
-
[](https://github.com/gleanwork/.github/blob/main/docs/repository-stability.md#ga)
|
|
4
4
|
[](https://www.npmjs.com/package/@gleanwork/mcp-server-tester)
|
|
5
5
|
[](https://github.com/gleanwork/mcp-server-tester/actions/workflows/ci.yml)
|
|
6
6
|
[](https://opensource.org/licenses/MIT)
|
|
@@ -11,7 +11,7 @@ A testing and evaluation framework for [Model Context Protocol (MCP)](https://mo
|
|
|
11
11
|
|
|
12
12
|
The `mcp` Playwright fixture connects to your MCP server (stdio or HTTP) and exposes a high-level API for calling tools and asserting responses. Custom matchers keep assertions readable.
|
|
13
13
|
|
|
14
|
-
```typescript
|
|
14
|
+
```typescript snippet=snippets/basic-test.ts
|
|
15
15
|
import { test, expect } from '@gleanwork/mcp-server-tester/fixtures/mcp';
|
|
16
16
|
|
|
17
17
|
test('read_file returns file contents', async ({ mcp }) => {
|
|
@@ -48,7 +48,7 @@ Available matchers:
|
|
|
48
48
|
|
|
49
49
|
Eval datasets let you define test cases as JSON files and run them with `runEvalDataset()`. Each case specifies a tool call and one or more assertions.
|
|
50
50
|
|
|
51
|
-
```json
|
|
51
|
+
```json snippet=snippets/eval-dataset.json
|
|
52
52
|
{
|
|
53
53
|
"name": "file-ops",
|
|
54
54
|
"cases": [
|
|
@@ -73,7 +73,7 @@ Eval datasets let you define test cases as JSON files and run them with `runEval
|
|
|
73
73
|
}
|
|
74
74
|
```
|
|
75
75
|
|
|
76
|
-
```typescript
|
|
76
|
+
```typescript snippet=snippets/run-eval-dataset.ts
|
|
77
77
|
import { test, expect } from '@gleanwork/mcp-server-tester/fixtures/mcp';
|
|
78
78
|
import { loadEvalDataset, runEvalDataset } from '@gleanwork/mcp-server-tester';
|
|
79
79
|
import { z } from 'zod';
|
|
@@ -102,12 +102,12 @@ Supported assertion types:
|
|
|
102
102
|
|
|
103
103
|
In LLM host mode, a real LLM receives your server's tool list and a natural language prompt, then decides which tools to call. This tests whether your tool names, descriptions, and input schemas are clear enough for autonomous use — a different question from whether the tools return correct output.
|
|
104
104
|
|
|
105
|
-
```json
|
|
105
|
+
```json snippet=snippets/mcp-host-dataset.json
|
|
106
106
|
{
|
|
107
107
|
"id": "find-config",
|
|
108
|
-
"mode": "
|
|
108
|
+
"mode": "mcp_host",
|
|
109
109
|
"scenario": "Find the application config file and return its contents",
|
|
110
|
-
"
|
|
110
|
+
"mcpHostConfig": {
|
|
111
111
|
"provider": "anthropic",
|
|
112
112
|
"model": "claude-opus-4-20250514"
|
|
113
113
|
},
|
|
@@ -119,7 +119,7 @@ In LLM host mode, a real LLM receives your server's tool list and a natural lang
|
|
|
119
119
|
}
|
|
120
120
|
```
|
|
121
121
|
|
|
122
|
-
LLM host mode makes real API calls and produces non-deterministic results. Use `iterations` to run a case multiple times and measure pass rate rather than expecting 100% on a single run. See the [LLM Host Guide](docs/
|
|
122
|
+
LLM host mode makes real API calls and produces non-deterministic results. Use `iterations` to run a case multiple times and measure pass rate rather than expecting 100% on a single run. See the [LLM Host Guide](docs/mcp-host.md) for configuration and cost management.
|
|
123
123
|
|
|
124
124
|
## Installation
|
|
125
125
|
|
|
@@ -147,7 +147,7 @@ The CLI wizard creates a `playwright.config.ts`, example tests, and a sample eva
|
|
|
147
147
|
|
|
148
148
|
Point the framework at your MCP server in `playwright.config.ts`:
|
|
149
149
|
|
|
150
|
-
```typescript
|
|
150
|
+
```typescript snippet=snippets/playwright-config.ts
|
|
151
151
|
import { defineConfig } from '@playwright/test';
|
|
152
152
|
|
|
153
153
|
export default defineConfig({
|
|
@@ -174,12 +174,13 @@ For HTTP servers, set `transport: 'http'` and `serverUrl`. For servers that requ
|
|
|
174
174
|
|
|
175
175
|
- [Quick Start](./docs/quickstart.md) — detailed setup and configuration
|
|
176
176
|
- [Expectations](./docs/expectations.md) — all assertion types including snapshot sanitizers
|
|
177
|
-
- [LLM Host Simulation](docs/
|
|
177
|
+
- [LLM Host Simulation](docs/mcp-host.md) — tool discoverability testing
|
|
178
178
|
- [API Reference](./docs/api-reference.md)
|
|
179
179
|
- [Transports](./docs/transports.md) — stdio and HTTP configuration, OAuth
|
|
180
180
|
- [CLI Commands](./docs/cli.md) — init, generate, login, token
|
|
181
181
|
- [UI Reporter](./docs/ui-reporter.md) — interactive web UI for test results
|
|
182
182
|
- [Development](./docs/development.md) — contributing and building
|
|
183
|
+
- [Migration Guide (v0.12 → v1.0)](./docs/migrations/migration-1.0.md) — upgrading from pre-1.0 releases
|
|
183
184
|
|
|
184
185
|
## Examples
|
|
185
186
|
|
package/dist/cli/index.js
CHANGED
|
@@ -17,7 +17,7 @@ import { SSEClientTransport } from '@modelcontextprotocol/sdk/client/sse.js';
|
|
|
17
17
|
import { z } from 'zod';
|
|
18
18
|
import createDebug from 'debug';
|
|
19
19
|
import { ProxyAgent, Agent } from 'undici';
|
|
20
|
-
import { readFileSync } from 'fs';
|
|
20
|
+
import { existsSync, readFileSync } from 'fs';
|
|
21
21
|
import * as oauth from 'oauth4webapi';
|
|
22
22
|
import { homedir } from 'os';
|
|
23
23
|
import * as http from 'http';
|
|
@@ -80,7 +80,7 @@ function JsonPreview({ data, maxLines = 15 }) {
|
|
|
80
80
|
|
|
81
81
|
// package.json
|
|
82
82
|
var package_default = {
|
|
83
|
-
version: "1.0.0-beta.
|
|
83
|
+
version: "1.0.0-beta.5"};
|
|
84
84
|
|
|
85
85
|
// src/cli/templates/index.ts
|
|
86
86
|
function getPlaywrightConfigTemplate(answers) {
|
|
@@ -255,10 +255,10 @@ function getPackageJsonTemplate(projectName) {
|
|
|
255
255
|
"evals"
|
|
256
256
|
],
|
|
257
257
|
"dependencies": {
|
|
258
|
-
"@modelcontextprotocol/sdk": "^1.0
|
|
258
|
+
"@modelcontextprotocol/sdk": "^1.27.0",
|
|
259
259
|
"@playwright/test": "^1.49.0",
|
|
260
260
|
"@gleanwork/mcp-server-tester": "^${package_default.version}",
|
|
261
|
-
"zod": "^
|
|
261
|
+
"zod": "^4.0.0"
|
|
262
262
|
},
|
|
263
263
|
"devDependencies": {
|
|
264
264
|
"typescript": "^5.7.2"
|
|
@@ -531,7 +531,7 @@ async function init(options) {
|
|
|
531
531
|
await waitUntilExit();
|
|
532
532
|
}
|
|
533
533
|
var MCPHostCapabilitiesSchema = z.object({
|
|
534
|
-
sampling: z.record(z.unknown()).optional(),
|
|
534
|
+
sampling: z.record(z.string(), z.unknown()).optional(),
|
|
535
535
|
roots: z.object({
|
|
536
536
|
listChanged: z.boolean()
|
|
537
537
|
}).optional()
|
|
@@ -590,7 +590,7 @@ var HttpConfigSchema = z.object({
|
|
|
590
590
|
}
|
|
591
591
|
return true;
|
|
592
592
|
}),
|
|
593
|
-
headers: z.record(z.string()).optional(),
|
|
593
|
+
headers: z.record(z.string(), z.string()).optional(),
|
|
594
594
|
capabilities: MCPHostCapabilitiesSchema.optional(),
|
|
595
595
|
connectTimeoutMs: z.number().positive().optional(),
|
|
596
596
|
requestTimeoutMs: z.number().positive().optional(),
|
|
@@ -827,7 +827,7 @@ async function retryWithBackoff(fn, maxAttempts) {
|
|
|
827
827
|
delayMs,
|
|
828
828
|
err.message
|
|
829
829
|
);
|
|
830
|
-
await new Promise((
|
|
830
|
+
await new Promise((resolve4) => setTimeout(resolve4, delayMs));
|
|
831
831
|
} else {
|
|
832
832
|
throw err;
|
|
833
833
|
}
|
|
@@ -1683,7 +1683,7 @@ ${errorText}`
|
|
|
1683
1683
|
*/
|
|
1684
1684
|
async startCallbackServer(expectedState) {
|
|
1685
1685
|
const timeoutMs = this.config.timeoutMs ?? DEFAULT_TIMEOUT_MS;
|
|
1686
|
-
return new Promise((
|
|
1686
|
+
return new Promise((resolve4, reject) => {
|
|
1687
1687
|
const server = http.createServer();
|
|
1688
1688
|
const connections = /* @__PURE__ */ new Set();
|
|
1689
1689
|
server.on("connection", (socket) => {
|
|
@@ -1756,7 +1756,7 @@ ${errorText}`
|
|
|
1756
1756
|
server.listen(preferredPort, "127.0.0.1", () => {
|
|
1757
1757
|
const address = server.address();
|
|
1758
1758
|
debug2("Callback server listening on port", address.port);
|
|
1759
|
-
|
|
1759
|
+
resolve4({ port: address.port, codePromise, close: forceClose });
|
|
1760
1760
|
});
|
|
1761
1761
|
server.on("error", (err) => {
|
|
1762
1762
|
reject(err);
|
|
@@ -1777,8 +1777,8 @@ ${errorText}`
|
|
|
1777
1777
|
return;
|
|
1778
1778
|
}
|
|
1779
1779
|
try {
|
|
1780
|
-
const
|
|
1781
|
-
await
|
|
1780
|
+
const open2 = await import('open');
|
|
1781
|
+
await open2.default(url.toString());
|
|
1782
1782
|
debug2("Opened browser for authentication");
|
|
1783
1783
|
} catch (error) {
|
|
1784
1784
|
debug2("Failed to open browser:", error);
|
|
@@ -3062,6 +3062,24 @@ async function token(serverUrl, options) {
|
|
|
3062
3062
|
);
|
|
3063
3063
|
await waitUntilExit();
|
|
3064
3064
|
}
|
|
3065
|
+
async function open(options) {
|
|
3066
|
+
const outputDir = resolve(options.dir ?? ".mcp-test-results");
|
|
3067
|
+
const reportPath = join(outputDir, "latest", "index.html");
|
|
3068
|
+
if (!existsSync(reportPath)) {
|
|
3069
|
+
console.error(`No report found at ${reportPath}`);
|
|
3070
|
+
console.error("Run your Playwright tests first to generate a report.");
|
|
3071
|
+
process.exit(1);
|
|
3072
|
+
}
|
|
3073
|
+
console.log(`Opening report: ${reportPath}`);
|
|
3074
|
+
try {
|
|
3075
|
+
const { default: openBrowser } = await import('open');
|
|
3076
|
+
await openBrowser(reportPath);
|
|
3077
|
+
} catch (error) {
|
|
3078
|
+
console.error("Failed to open report in browser:", error);
|
|
3079
|
+
console.error(`Open manually: file://${reportPath}`);
|
|
3080
|
+
process.exit(1);
|
|
3081
|
+
}
|
|
3082
|
+
}
|
|
3065
3083
|
|
|
3066
3084
|
// src/cli/index.ts
|
|
3067
3085
|
var program = new Command();
|
|
@@ -3077,4 +3095,9 @@ program.command("token").description("Output stored OAuth tokens for CI/CD use")
|
|
|
3077
3095
|
"Output format: env, json, or gh (default: env)",
|
|
3078
3096
|
"env"
|
|
3079
3097
|
).option("--state-dir <dir>", "Custom directory for token storage").action(token);
|
|
3098
|
+
program.command("open").description("Open the MCP eval reporter UI in your browser").option(
|
|
3099
|
+
"-d, --dir <directory>",
|
|
3100
|
+
"Report output directory",
|
|
3101
|
+
".mcp-test-results"
|
|
3102
|
+
).action(open);
|
|
3080
3103
|
program.parse();
|
package/dist/fixtures/mcp.d.ts
CHANGED
|
@@ -217,9 +217,9 @@ type RubricSpec = BuiltInRubric | {
|
|
|
217
217
|
type ProviderKind = 'anthropic' | 'openai' | 'google';
|
|
218
218
|
|
|
219
219
|
/**
|
|
220
|
-
* Tool call validators for
|
|
220
|
+
* Tool call validators for mcp_host simulation results.
|
|
221
221
|
*
|
|
222
|
-
* These validators extract the tool call trace from an
|
|
222
|
+
* These validators extract the tool call trace from an MCPHostSimulationResult
|
|
223
223
|
* and apply assertions against expected call lists and counts.
|
|
224
224
|
*/
|
|
225
225
|
|
|
@@ -408,7 +408,7 @@ declare global {
|
|
|
408
408
|
*/
|
|
409
409
|
toSatisfyToolPredicate(predicate: ToolPredicate, description?: string): Promise<R>;
|
|
410
410
|
/**
|
|
411
|
-
* Validates which tools the LLM called during
|
|
411
|
+
* Validates which tools the LLM called during a mcp_host simulation.
|
|
412
412
|
*
|
|
413
413
|
* @example
|
|
414
414
|
* ```typescript
|
|
@@ -420,7 +420,7 @@ declare global {
|
|
|
420
420
|
*/
|
|
421
421
|
toHaveToolCalls(expectation: ToolCallExpectation): R;
|
|
422
422
|
/**
|
|
423
|
-
* Validates the number of tool calls made during
|
|
423
|
+
* Validates the number of tool calls made during a mcp_host simulation.
|
|
424
424
|
*
|
|
425
425
|
* @example
|
|
426
426
|
* ```typescript
|
|
@@ -531,7 +531,7 @@ declare function toSatisfyToolPredicate(this: {
|
|
|
531
531
|
/**
|
|
532
532
|
* toHaveToolCalls Matcher
|
|
533
533
|
*
|
|
534
|
-
* Validates which tools the LLM called during
|
|
534
|
+
* Validates which tools the LLM called during a mcp_host simulation.
|
|
535
535
|
*/
|
|
536
536
|
|
|
537
537
|
/**
|
|
@@ -547,7 +547,7 @@ declare function toHaveToolCalls(this: {
|
|
|
547
547
|
/**
|
|
548
548
|
* toHaveToolCallCount Matcher
|
|
549
549
|
*
|
|
550
|
-
* Validates the number of tool calls made during
|
|
550
|
+
* Validates the number of tool calls made during a mcp_host simulation.
|
|
551
551
|
*/
|
|
552
552
|
|
|
553
553
|
/**
|
package/dist/fixtures/mcp.js
CHANGED
|
@@ -1215,7 +1215,7 @@ function validateToolCalls(response, expectation) {
|
|
|
1215
1215
|
if (!isSimulationResult(response)) {
|
|
1216
1216
|
return {
|
|
1217
1217
|
pass: false,
|
|
1218
|
-
message: "toolsTriggered expectation requires
|
|
1218
|
+
message: "toolsTriggered expectation requires mcp_host mode \u2014 response must be an MCPHostSimulationResult"
|
|
1219
1219
|
};
|
|
1220
1220
|
}
|
|
1221
1221
|
const actual = response.toolCalls;
|
|
@@ -1275,7 +1275,7 @@ function validateToolCallCount(response, options) {
|
|
|
1275
1275
|
if (!isSimulationResult(response)) {
|
|
1276
1276
|
return {
|
|
1277
1277
|
pass: false,
|
|
1278
|
-
message: "toolCallCount expectation requires
|
|
1278
|
+
message: "toolCallCount expectation requires mcp_host mode \u2014 response must be an MCPHostSimulationResult"
|
|
1279
1279
|
};
|
|
1280
1280
|
}
|
|
1281
1281
|
const count = response.toolCalls.length;
|
|
@@ -1337,7 +1337,7 @@ var expect = expect$1.extend({
|
|
|
1337
1337
|
toHaveToolCallCount
|
|
1338
1338
|
});
|
|
1339
1339
|
var MCPHostCapabilitiesSchema = z.object({
|
|
1340
|
-
sampling: z.record(z.unknown()).optional(),
|
|
1340
|
+
sampling: z.record(z.string(), z.unknown()).optional(),
|
|
1341
1341
|
roots: z.object({
|
|
1342
1342
|
listChanged: z.boolean()
|
|
1343
1343
|
}).optional()
|
|
@@ -1396,7 +1396,7 @@ var HttpConfigSchema = z.object({
|
|
|
1396
1396
|
}
|
|
1397
1397
|
return true;
|
|
1398
1398
|
}),
|
|
1399
|
-
headers: z.record(z.string()).optional(),
|
|
1399
|
+
headers: z.record(z.string(), z.string()).optional(),
|
|
1400
1400
|
capabilities: MCPHostCapabilitiesSchema.optional(),
|
|
1401
1401
|
connectTimeoutMs: z.number().positive().optional(),
|
|
1402
1402
|
requestTimeoutMs: z.number().positive().optional(),
|
|
@@ -1434,7 +1434,7 @@ var debugHttp = createDebug(`${NAMESPACE}:http`);
|
|
|
1434
1434
|
|
|
1435
1435
|
// package.json
|
|
1436
1436
|
var package_default = {
|
|
1437
|
-
version: "1.0.0-beta.
|
|
1437
|
+
version: "1.0.0-beta.5"};
|
|
1438
1438
|
var debug = createDebug("mcp-server-tester:oauth-flow");
|
|
1439
1439
|
async function generatePKCE() {
|
|
1440
1440
|
const codeVerifier = oauth.generateRandomCodeVerifier();
|