@gleanwork/mcp-server-tester 1.0.0-beta.7 → 1.0.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/README.md +20 -1
- package/dist/cli/index.js +12 -1
- package/dist/fixtures/mcp.d.ts +33 -8
- package/dist/fixtures/mcp.js +354 -37
- package/dist/fixtures/mcp.js.map +1 -1
- package/dist/index.cjs +721 -76
- package/dist/index.cjs.map +1 -1
- package/dist/index.d.cts +533 -116
- package/dist/index.d.ts +533 -116
- package/dist/index.js +719 -78
- package/dist/index.js.map +1 -1
- package/dist/reporters/ui-dist/app.js +8 -134
- package/dist/reporters/ui-dist/styles.css +1 -1
- package/package.json +11 -6
- package/dist/reporters/mcpReporter.d.cts +0 -90
- package/dist/reporters/mcpReporter.d.ts +0 -90
package/README.md
CHANGED
|
@@ -126,7 +126,7 @@ LLM host mode makes real API calls and produces non-deterministic results. Use `
|
|
|
126
126
|
Requires Node.js 22+.
|
|
127
127
|
|
|
128
128
|
```bash
|
|
129
|
-
npm install --save-dev @gleanwork/mcp-server-tester @playwright/test
|
|
129
|
+
npm install --save-dev @gleanwork/mcp-server-tester @playwright/test
|
|
130
130
|
```
|
|
131
131
|
|
|
132
132
|
The Anthropic SDK is only needed for LLM-as-judge assertions or LLM host mode with the Anthropic provider:
|
|
@@ -182,6 +182,25 @@ For HTTP servers, set `transport: 'http'` and `serverUrl`. For servers that requ
|
|
|
182
182
|
- [Development](./docs/development.md) — contributing and building
|
|
183
183
|
- [Migration Guide (v0.12 → v1.0)](./docs/migrations/migration-1.0.md) — upgrading from pre-1.0 releases
|
|
184
184
|
|
|
185
|
+
## AI Skills
|
|
186
|
+
|
|
187
|
+
Install AI skills to help your coding assistant generate tests, eval datasets, and MCP host evals:
|
|
188
|
+
|
|
189
|
+
```bash
|
|
190
|
+
npx skills add -g gleanwork/mcp-server-tester
|
|
191
|
+
```
|
|
192
|
+
|
|
193
|
+
This installs skills globally so they're available across all your projects. Four skills are included:
|
|
194
|
+
|
|
195
|
+
| Skill | Description |
|
|
196
|
+
| --------------------- | ----------------------------------------------------------- |
|
|
197
|
+
| `mcp-tester-guide` | Framework reference — matchers, config, auth, anti-patterns |
|
|
198
|
+
| `write-mcp-test` | Generate direct-mode Playwright tests |
|
|
199
|
+
| `write-mcp-eval` | Generate data-driven eval datasets |
|
|
200
|
+
| `write-mcp-host-eval` | Generate LLM host simulation evals |
|
|
201
|
+
|
|
202
|
+
Compatible with Claude Code, Cursor, Windsurf, Copilot, and [40+ other AI agents](https://github.com/nicepkg/nice-skills).
|
|
203
|
+
|
|
185
204
|
## Examples
|
|
186
205
|
|
|
187
206
|
The `examples/` directory contains complete working examples:
|
package/dist/cli/index.js
CHANGED
|
@@ -80,7 +80,7 @@ function JsonPreview({ data, maxLines = 15 }) {
|
|
|
80
80
|
|
|
81
81
|
// package.json
|
|
82
82
|
var package_default = {
|
|
83
|
-
version: "1.0.0
|
|
83
|
+
version: "1.0.0"};
|
|
84
84
|
|
|
85
85
|
// src/cli/templates/index.ts
|
|
86
86
|
function getPlaywrightConfigTemplate(answers) {
|
|
@@ -1005,6 +1005,17 @@ async function createMCPClientForConfig(config, options) {
|
|
|
1005
1005
|
}
|
|
1006
1006
|
async function closeMCPClient(client) {
|
|
1007
1007
|
try {
|
|
1008
|
+
const transport = client.transport;
|
|
1009
|
+
if (transport instanceof StreamableHTTPClientTransport) {
|
|
1010
|
+
try {
|
|
1011
|
+
await transport.terminateSession();
|
|
1012
|
+
} catch (sessionError) {
|
|
1013
|
+
debugClient(
|
|
1014
|
+
"Error terminating session: %s",
|
|
1015
|
+
sessionError instanceof Error ? sessionError.message : String(sessionError)
|
|
1016
|
+
);
|
|
1017
|
+
}
|
|
1018
|
+
}
|
|
1008
1019
|
await client.close();
|
|
1009
1020
|
} catch (error) {
|
|
1010
1021
|
debugClient(
|
package/dist/fixtures/mcp.d.ts
CHANGED
|
@@ -214,7 +214,7 @@ type RubricSpec = BuiltInRubric | {
|
|
|
214
214
|
};
|
|
215
215
|
|
|
216
216
|
/** Valid LLM judge provider kinds. */
|
|
217
|
-
type ProviderKind = 'anthropic' | 'openai' | 'google';
|
|
217
|
+
type ProviderKind = 'anthropic' | 'vertex-anthropic' | 'anthropic-agent-sdk' | 'openai' | 'google';
|
|
218
218
|
|
|
219
219
|
/**
|
|
220
220
|
* Tool call validators for mcp_host simulation results.
|
|
@@ -258,6 +258,12 @@ interface JudgeMatcherOptions {
|
|
|
258
258
|
provider?: ProviderKind;
|
|
259
259
|
/** Override the judge model */
|
|
260
260
|
model?: string;
|
|
261
|
+
/**
|
|
262
|
+
* Name of a registered custom judge executor.
|
|
263
|
+
* When set, the named judge handles the entire evaluation pipeline
|
|
264
|
+
* and its `pass` result is authoritative.
|
|
265
|
+
*/
|
|
266
|
+
judge?: string;
|
|
261
267
|
}
|
|
262
268
|
/**
|
|
263
269
|
* Declaration merging for Playwright matchers
|
|
@@ -348,21 +354,30 @@ declare global {
|
|
|
348
354
|
*/
|
|
349
355
|
toBeToolError(expected?: boolean | string | string[]): R;
|
|
350
356
|
/**
|
|
351
|
-
* Validates that a response passes LLM-as-judge evaluation
|
|
357
|
+
* Validates that a response passes LLM-as-judge evaluation.
|
|
352
358
|
*
|
|
353
|
-
*
|
|
354
|
-
*
|
|
359
|
+
* Two call signatures:
|
|
360
|
+
* - With rubric: `toPassToolJudge(rubric, options?)` — built-in LLM judge
|
|
361
|
+
* - With named judge: `toPassToolJudge({ judge: 'name' })` — custom judge executor
|
|
355
362
|
*
|
|
356
363
|
* @example
|
|
357
364
|
* ```typescript
|
|
365
|
+
* // Built-in LLM judge with rubric
|
|
358
366
|
* expect(result).toPassToolJudge('Response should be helpful and accurate');
|
|
359
|
-
* expect(result).toPassToolJudge('
|
|
367
|
+
* expect(result).toPassToolJudge('correctness', {
|
|
360
368
|
* reference: expectedOutput,
|
|
361
369
|
* passingThreshold: 0.8,
|
|
362
370
|
* });
|
|
371
|
+
*
|
|
372
|
+
* // Named custom judge (registered via registerJudge)
|
|
373
|
+
* expect(result).toPassToolJudge({ judge: 'glean-completeness' });
|
|
363
374
|
* ```
|
|
364
375
|
*/
|
|
365
376
|
toPassToolJudge(rubric: RubricSpec, options?: JudgeMatcherOptions): Promise<R>;
|
|
377
|
+
toPassToolJudge(options: JudgeMatcherOptions): Promise<R>;
|
|
378
|
+
toPassToolJudge(judges: Array<JudgeMatcherOptions & {
|
|
379
|
+
rubric?: RubricSpec;
|
|
380
|
+
}>): Promise<R>;
|
|
366
381
|
/**
|
|
367
382
|
* Validates that a response meets size constraints
|
|
368
383
|
*
|
|
@@ -452,16 +467,26 @@ type ToolPredicate = (response: unknown, text: string) => boolean | PredicateRes
|
|
|
452
467
|
* Validates that a response passes LLM-as-judge evaluation.
|
|
453
468
|
* Delegates evaluation logic to validateJudge() for consistency
|
|
454
469
|
* with the validator/matcher duality pattern.
|
|
470
|
+
*
|
|
471
|
+
* Supports three call signatures:
|
|
472
|
+
* - toPassToolJudge(rubric, options?) — built-in LLM judge with rubric
|
|
473
|
+
* - toPassToolJudge({ judge: 'name', ... }) — named custom judge
|
|
474
|
+
* - toPassToolJudge([...judges]) — multi-judge (all must pass)
|
|
455
475
|
*/
|
|
456
476
|
|
|
457
477
|
/**
|
|
458
|
-
*
|
|
478
|
+
* The toPassToolJudge matcher function.
|
|
459
479
|
*
|
|
460
|
-
*
|
|
480
|
+
* Accepts either:
|
|
481
|
+
* (received, rubric, options?) — rubric-based LLM judge
|
|
482
|
+
* (received, options) — named custom judge (options.judge required)
|
|
483
|
+
* (received, judges[]) — multi-judge (all must pass)
|
|
461
484
|
*/
|
|
462
485
|
declare function toPassToolJudge(this: {
|
|
463
486
|
isNot: boolean;
|
|
464
|
-
}, received: unknown,
|
|
487
|
+
}, received: unknown, rubricOrOptions: RubricSpec | JudgeMatcherOptions | Array<JudgeMatcherOptions & {
|
|
488
|
+
rubric?: RubricSpec;
|
|
489
|
+
}>, maybeOptions?: JudgeMatcherOptions): Promise<{
|
|
465
490
|
pass: boolean;
|
|
466
491
|
message: () => string;
|
|
467
492
|
}>;
|