@gleanwork/mcp-server-tester 1.0.0-beta.7 → 1.0.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
package/README.md CHANGED
@@ -126,7 +126,7 @@ LLM host mode makes real API calls and produces non-deterministic results. Use `
126
126
  Requires Node.js 22+.
127
127
 
128
128
  ```bash
129
- npm install --save-dev @gleanwork/mcp-server-tester @playwright/test zod
129
+ npm install --save-dev @gleanwork/mcp-server-tester @playwright/test
130
130
  ```
131
131
 
132
132
  The Anthropic SDK is only needed for LLM-as-judge assertions or LLM host mode with the Anthropic provider:
@@ -182,6 +182,25 @@ For HTTP servers, set `transport: 'http'` and `serverUrl`. For servers that requ
182
182
  - [Development](./docs/development.md) — contributing and building
183
183
  - [Migration Guide (v0.12 → v1.0)](./docs/migrations/migration-1.0.md) — upgrading from pre-1.0 releases
184
184
 
185
+ ## AI Skills
186
+
187
+ Install AI skills to help your coding assistant generate tests, eval datasets, and MCP host evals:
188
+
189
+ ```bash
190
+ npx skills add -g gleanwork/mcp-server-tester
191
+ ```
192
+
193
+ This installs skills globally so they're available across all your projects. Four skills are included:
194
+
195
+ | Skill | Description |
196
+ | --------------------- | ----------------------------------------------------------- |
197
+ | `mcp-tester-guide` | Framework reference — matchers, config, auth, anti-patterns |
198
+ | `write-mcp-test` | Generate direct-mode Playwright tests |
199
+ | `write-mcp-eval` | Generate data-driven eval datasets |
200
+ | `write-mcp-host-eval` | Generate LLM host simulation evals |
201
+
202
+ Compatible with Claude Code, Cursor, Windsurf, Copilot, and [40+ other AI agents](https://github.com/nicepkg/nice-skills).
203
+
185
204
  ## Examples
186
205
 
187
206
  The `examples/` directory contains complete working examples:
package/dist/cli/index.js CHANGED
@@ -80,7 +80,7 @@ function JsonPreview({ data, maxLines = 15 }) {
80
80
 
81
81
  // package.json
82
82
  var package_default = {
83
- version: "1.0.0-beta.7"};
83
+ version: "1.0.0"};
84
84
 
85
85
  // src/cli/templates/index.ts
86
86
  function getPlaywrightConfigTemplate(answers) {
@@ -1005,6 +1005,17 @@ async function createMCPClientForConfig(config, options) {
1005
1005
  }
1006
1006
  async function closeMCPClient(client) {
1007
1007
  try {
1008
+ const transport = client.transport;
1009
+ if (transport instanceof StreamableHTTPClientTransport) {
1010
+ try {
1011
+ await transport.terminateSession();
1012
+ } catch (sessionError) {
1013
+ debugClient(
1014
+ "Error terminating session: %s",
1015
+ sessionError instanceof Error ? sessionError.message : String(sessionError)
1016
+ );
1017
+ }
1018
+ }
1008
1019
  await client.close();
1009
1020
  } catch (error) {
1010
1021
  debugClient(
@@ -214,7 +214,7 @@ type RubricSpec = BuiltInRubric | {
214
214
  };
215
215
 
216
216
  /** Valid LLM judge provider kinds. */
217
- type ProviderKind = 'anthropic' | 'openai' | 'google';
217
+ type ProviderKind = 'anthropic' | 'vertex-anthropic' | 'anthropic-agent-sdk' | 'openai' | 'google';
218
218
 
219
219
  /**
220
220
  * Tool call validators for mcp_host simulation results.
@@ -258,6 +258,12 @@ interface JudgeMatcherOptions {
258
258
  provider?: ProviderKind;
259
259
  /** Override the judge model */
260
260
  model?: string;
261
+ /**
262
+ * Name of a registered custom judge executor.
263
+ * When set, the named judge handles the entire evaluation pipeline
264
+ * and its `pass` result is authoritative.
265
+ */
266
+ judge?: string;
261
267
  }
262
268
  /**
263
269
  * Declaration merging for Playwright matchers
@@ -348,21 +354,30 @@ declare global {
348
354
  */
349
355
  toBeToolError(expected?: boolean | string | string[]): R;
350
356
  /**
351
- * Validates that a response passes LLM-as-judge evaluation
357
+ * Validates that a response passes LLM-as-judge evaluation.
352
358
  *
353
- * @param rubric - Evaluation rubric/criteria
354
- * @param options - Judge options
359
+ * Call signatures (an array of judge options is also accepted — all must pass):
360
+ * - With rubric: `toPassToolJudge(rubric, options?)` — built-in LLM judge
361
+ * - With named judge: `toPassToolJudge({ judge: 'name' })` — custom judge executor
355
362
  *
356
363
  * @example
357
364
  * ```typescript
365
+ * // Built-in LLM judge with rubric
358
366
  * expect(result).toPassToolJudge('Response should be helpful and accurate');
359
- * expect(result).toPassToolJudge('Response should match reference', {
367
+ * expect(result).toPassToolJudge('correctness', {
360
368
  * reference: expectedOutput,
361
369
  * passingThreshold: 0.8,
362
370
  * });
371
+ *
372
+ * // Named custom judge (registered via registerJudge)
373
+ * expect(result).toPassToolJudge({ judge: 'glean-completeness' });
363
374
  * ```
364
375
  */
365
376
  toPassToolJudge(rubric: RubricSpec, options?: JudgeMatcherOptions): Promise<R>;
377
+ toPassToolJudge(options: JudgeMatcherOptions): Promise<R>;
378
+ toPassToolJudge(judges: Array<JudgeMatcherOptions & {
379
+ rubric?: RubricSpec;
380
+ }>): Promise<R>;
366
381
  /**
367
382
  * Validates that a response meets size constraints
368
383
  *
@@ -452,16 +467,26 @@ type ToolPredicate = (response: unknown, text: string) => boolean | PredicateRes
452
467
  * Validates that a response passes LLM-as-judge evaluation.
453
468
  * Delegates evaluation logic to validateJudge() for consistency
454
469
  * with the validator/matcher duality pattern.
470
+ *
471
+ * Supports three call signatures:
472
+ * - toPassToolJudge(rubric, options?) — built-in LLM judge with rubric
473
+ * - toPassToolJudge({ judge: 'name', ... }) — named custom judge
474
+ * - toPassToolJudge([...judges]) — multi-judge (all must pass)
455
475
  */
456
476
 
457
477
  /**
458
- * Creates the toPassToolJudge matcher function
478
+ * The toPassToolJudge matcher function.
459
479
  *
460
- * Note: This is an async matcher that calls an LLM for evaluation.
480
+ * Accepts one of:
481
+ * (received, rubric, options?) — rubric-based LLM judge
482
+ * (received, options) — named custom judge (options.judge required)
483
+ * (received, judges[]) — multi-judge (all must pass)
461
484
  */
462
485
  declare function toPassToolJudge(this: {
463
486
  isNot: boolean;
464
- }, received: unknown, rubric: RubricSpec, options?: JudgeMatcherOptions): Promise<{
487
+ }, received: unknown, rubricOrOptions: RubricSpec | JudgeMatcherOptions | Array<JudgeMatcherOptions & {
488
+ rubric?: RubricSpec;
489
+ }>, maybeOptions?: JudgeMatcherOptions): Promise<{
465
490
  pass: boolean;
466
491
  message: () => string;
467
492
  }>;