@gleanwork/mcp-server-tester 0.12.0 → 1.0.0-beta.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/README.md +120 -337
- package/dist/cli/index.js +455 -174
- package/dist/fixtures/mcp.d.ts +121 -44
- package/dist/fixtures/mcp.js +974 -244
- package/dist/fixtures/mcp.js.map +1 -1
- package/dist/fixtures/mcpAuth.js +6 -2
- package/dist/fixtures/mcpAuth.js.map +1 -1
- package/dist/index.cjs +4936 -1292
- package/dist/index.cjs.map +1 -1
- package/dist/index.d.cts +1660 -570
- package/dist/index.d.ts +1660 -570
- package/dist/index.js +4923 -1288
- package/dist/index.js.map +1 -1
- package/dist/reporters/mcpReporter.cjs +35 -16
- package/dist/reporters/mcpReporter.cjs.map +1 -1
- package/dist/reporters/mcpReporter.d.cts +8 -3
- package/dist/reporters/mcpReporter.d.ts +8 -3
- package/dist/reporters/mcpReporter.js +36 -17
- package/dist/reporters/mcpReporter.js.map +1 -1
- package/dist/reporters/ui-dist/app.js +5 -5
- package/dist/reporters/ui-dist/styles.css +1 -1
- package/package.json +63 -8
- package/src/reporters/ui-dist/app.js +5 -5
- package/src/reporters/ui-dist/styles.css +1 -1
package/dist/fixtures/mcp.d.ts
CHANGED
|
@@ -55,10 +55,33 @@ interface PatternValidatorOptions {
|
|
|
55
55
|
/** Whether to perform case-sensitive matching (default: true) */
|
|
56
56
|
caseSensitive?: boolean;
|
|
57
57
|
}
|
|
58
|
+
/**
|
|
59
|
+
* Built-in snapshot sanitizer names for use with toMatchToolSnapshot.
|
|
60
|
+
* Pass these values in the sanitizers array to replace non-deterministic
|
|
61
|
+
* values with stable placeholders before snapshot comparison.
|
|
62
|
+
*
|
|
63
|
+
* @example
|
|
64
|
+
* expect(result).toMatchToolSnapshot('my-snapshot', [
|
|
65
|
+
* SnapshotSanitizers.UUID,
|
|
66
|
+
* SnapshotSanitizers.ISO_DATE,
|
|
67
|
+
* ]);
|
|
68
|
+
*/
|
|
69
|
+
declare const SnapshotSanitizers: {
|
|
70
|
+
/** Replaces Unix timestamps (seconds and milliseconds) with a stable placeholder */
|
|
71
|
+
readonly TIMESTAMP: "timestamp";
|
|
72
|
+
/** Replaces UUID v1-v5 strings with a stable placeholder */
|
|
73
|
+
readonly UUID: "uuid";
|
|
74
|
+
/** Replaces ISO 8601 date/datetime strings with a stable placeholder */
|
|
75
|
+
readonly ISO_DATE: "iso-date";
|
|
76
|
+
/** Replaces MongoDB ObjectId strings with a stable placeholder */
|
|
77
|
+
readonly OBJECT_ID: "objectId";
|
|
78
|
+
/** Replaces JWT tokens with a stable placeholder */
|
|
79
|
+
readonly JWT: "jwt";
|
|
80
|
+
};
|
|
58
81
|
/**
|
|
59
82
|
* Built-in sanitizer names for common variable patterns
|
|
60
83
|
*/
|
|
61
|
-
type BuiltInSanitizer =
|
|
84
|
+
type BuiltInSanitizer = (typeof SnapshotSanitizers)[keyof typeof SnapshotSanitizers];
|
|
62
85
|
/**
|
|
63
86
|
* Custom regex-based sanitizer
|
|
64
87
|
*/
|
|
@@ -171,48 +194,40 @@ declare function toBeToolError(this: {
|
|
|
171
194
|
};
|
|
172
195
|
|
|
173
196
|
/**
|
|
174
|
-
*
|
|
197
|
+
* Built-in judge rubrics matching Glean EvalV2's named judge types.
|
|
198
|
+
* Use these for consistent, standardized evaluations across teams.
|
|
199
|
+
*
|
|
200
|
+
* All built-in rubrics use a 5-point scale: 0.0 / 0.25 / 0.5 / 0.75 / 1.0
|
|
175
201
|
*/
|
|
176
|
-
type
|
|
202
|
+
type BuiltInRubric = 'correctness' | 'completeness' | 'groundedness' | 'instruction-following' | 'conciseness';
|
|
203
|
+
/** A rubric specification: either a built-in named rubric or custom text. */
|
|
204
|
+
type RubricSpec = BuiltInRubric | {
|
|
205
|
+
text: string;
|
|
206
|
+
};
|
|
207
|
+
|
|
208
|
+
/** Valid LLM judge provider kinds. */
|
|
209
|
+
type ProviderKind = 'claude' | 'anthropic' | 'openai' | 'google';
|
|
210
|
+
|
|
177
211
|
/**
|
|
178
|
-
*
|
|
212
|
+
* Tool call validators for llm_host simulation results.
|
|
213
|
+
*
|
|
214
|
+
* These validators extract the tool call trace from an LLMHostSimulationResult
|
|
215
|
+
* and apply assertions against expected call lists and counts.
|
|
179
216
|
*/
|
|
180
|
-
|
|
181
|
-
|
|
182
|
-
|
|
183
|
-
|
|
184
|
-
|
|
185
|
-
|
|
186
|
-
|
|
187
|
-
|
|
188
|
-
|
|
189
|
-
|
|
190
|
-
|
|
191
|
-
|
|
192
|
-
|
|
193
|
-
|
|
194
|
-
*/
|
|
195
|
-
model?: string;
|
|
196
|
-
/**
|
|
197
|
-
* Maximum tokens for response
|
|
198
|
-
* @default 1000
|
|
199
|
-
*/
|
|
200
|
-
maxTokens?: number;
|
|
201
|
-
/**
|
|
202
|
-
* Temperature (0-1, lower is more deterministic)
|
|
203
|
-
* @default 0.0
|
|
204
|
-
*/
|
|
205
|
-
temperature?: number;
|
|
206
|
-
/**
|
|
207
|
-
* Maximum budget in USD for the judge evaluation
|
|
208
|
-
* @default 0.10
|
|
209
|
-
*/
|
|
210
|
-
maxBudgetUsd?: number;
|
|
211
|
-
/**
|
|
212
|
-
* Maximum size (in bytes) for tool output before failing the test
|
|
213
|
-
* When set, the judge will fail if the candidate response exceeds this size
|
|
214
|
-
*/
|
|
215
|
-
maxToolOutputSize?: number;
|
|
217
|
+
|
|
218
|
+
interface ToolCallExpectation {
|
|
219
|
+
calls: Array<{
|
|
220
|
+
name: string;
|
|
221
|
+
arguments?: Record<string, unknown>;
|
|
222
|
+
required?: boolean;
|
|
223
|
+
}>;
|
|
224
|
+
order?: 'strict' | 'any';
|
|
225
|
+
exclusive?: boolean;
|
|
226
|
+
}
|
|
227
|
+
interface ToolCallCountOptions {
|
|
228
|
+
min?: number;
|
|
229
|
+
max?: number;
|
|
230
|
+
exact?: number;
|
|
216
231
|
}
|
|
217
232
|
|
|
218
233
|
/**
|
|
@@ -229,8 +244,12 @@ interface JudgeMatcherOptions {
|
|
|
229
244
|
reference?: unknown;
|
|
230
245
|
/** Score threshold for passing (default: 0.7) */
|
|
231
246
|
passingThreshold?: number;
|
|
232
|
-
/**
|
|
233
|
-
|
|
247
|
+
/** Number of judge evaluations (scores averaged) */
|
|
248
|
+
reps?: number;
|
|
249
|
+
/** Override the judge provider */
|
|
250
|
+
provider?: ProviderKind;
|
|
251
|
+
/** Override the judge model */
|
|
252
|
+
model?: string;
|
|
234
253
|
}
|
|
235
254
|
/**
|
|
236
255
|
* Declaration merging for Playwright matchers
|
|
@@ -335,7 +354,7 @@ declare global {
|
|
|
335
354
|
* });
|
|
336
355
|
* ```
|
|
337
356
|
*/
|
|
338
|
-
toPassToolJudge(rubric:
|
|
357
|
+
toPassToolJudge(rubric: RubricSpec, options?: JudgeMatcherOptions): Promise<R>;
|
|
339
358
|
/**
|
|
340
359
|
* Validates that a response meets size constraints
|
|
341
360
|
*
|
|
@@ -380,6 +399,28 @@ declare global {
|
|
|
380
399
|
* ```
|
|
381
400
|
*/
|
|
382
401
|
toSatisfyToolPredicate(predicate: ToolPredicate, description?: string): Promise<R>;
|
|
402
|
+
/**
|
|
403
|
+
* Validates which tools the LLM called during an llm_host simulation.
|
|
404
|
+
*
|
|
405
|
+
* @example
|
|
406
|
+
* ```typescript
|
|
407
|
+
* expect(simulationResult).toHaveToolCalls({
|
|
408
|
+
* calls: [{ name: 'search', arguments: { query: 'hello' }, required: true }],
|
|
409
|
+
* order: 'any',
|
|
410
|
+
* });
|
|
411
|
+
* ```
|
|
412
|
+
*/
|
|
413
|
+
toHaveToolCalls(expectation: ToolCallExpectation): R;
|
|
414
|
+
/**
|
|
415
|
+
* Validates the number of tool calls made during an llm_host simulation.
|
|
416
|
+
*
|
|
417
|
+
* @example
|
|
418
|
+
* ```typescript
|
|
419
|
+
* expect(simulationResult).toHaveToolCallCount({ min: 1, max: 3 });
|
|
420
|
+
* expect(simulationResult).toHaveToolCallCount({ exact: 2 });
|
|
421
|
+
* ```
|
|
422
|
+
*/
|
|
423
|
+
toHaveToolCallCount(options: ToolCallCountOptions): R;
|
|
383
424
|
}
|
|
384
425
|
}
|
|
385
426
|
}
|
|
@@ -401,6 +442,8 @@ type ToolPredicate = (response: unknown, text: string) => boolean | PredicateRes
|
|
|
401
442
|
* toPassToolJudge Matcher
|
|
402
443
|
*
|
|
403
444
|
* Validates that a response passes LLM-as-judge evaluation.
|
|
445
|
+
* Delegates evaluation logic to validateJudge() for consistency
|
|
446
|
+
* with the validator/matcher duality pattern.
|
|
404
447
|
*/
|
|
405
448
|
|
|
406
449
|
/**
|
|
@@ -410,7 +453,7 @@ type ToolPredicate = (response: unknown, text: string) => boolean | PredicateRes
|
|
|
410
453
|
*/
|
|
411
454
|
declare function toPassToolJudge(this: {
|
|
412
455
|
isNot: boolean;
|
|
413
|
-
}, received: unknown, rubric:
|
|
456
|
+
}, received: unknown, rubric: RubricSpec, options?: JudgeMatcherOptions): Promise<{
|
|
414
457
|
pass: boolean;
|
|
415
458
|
message: () => string;
|
|
416
459
|
}>;
|
|
@@ -477,6 +520,38 @@ declare function toSatisfyToolPredicate(this: {
|
|
|
477
520
|
message: () => string;
|
|
478
521
|
}>;
|
|
479
522
|
|
|
523
|
+
/**
|
|
524
|
+
* toHaveToolCalls Matcher
|
|
525
|
+
*
|
|
526
|
+
* Validates which tools the LLM called during an llm_host simulation.
|
|
527
|
+
*/
|
|
528
|
+
|
|
529
|
+
/**
|
|
530
|
+
* Creates the toHaveToolCalls matcher function
|
|
531
|
+
*/
|
|
532
|
+
declare function toHaveToolCalls(this: {
|
|
533
|
+
isNot: boolean;
|
|
534
|
+
}, received: unknown, expectation: ToolCallExpectation): {
|
|
535
|
+
pass: boolean;
|
|
536
|
+
message: () => string;
|
|
537
|
+
};
|
|
538
|
+
|
|
539
|
+
/**
|
|
540
|
+
* toHaveToolCallCount Matcher
|
|
541
|
+
*
|
|
542
|
+
* Validates the number of tool calls made during an llm_host simulation.
|
|
543
|
+
*/
|
|
544
|
+
|
|
545
|
+
/**
|
|
546
|
+
* Creates the toHaveToolCallCount matcher function
|
|
547
|
+
*/
|
|
548
|
+
declare function toHaveToolCallCount(this: {
|
|
549
|
+
isNot: boolean;
|
|
550
|
+
}, received: unknown, options: ToolCallCountOptions): {
|
|
551
|
+
pass: boolean;
|
|
552
|
+
message: () => string;
|
|
553
|
+
};
|
|
554
|
+
|
|
480
555
|
/**
|
|
481
556
|
* Extended Playwright expect with MCP tool matchers
|
|
482
557
|
*
|
|
@@ -503,6 +578,8 @@ declare const expect: playwright_test.Expect<{
|
|
|
503
578
|
toPassToolJudge: typeof toPassToolJudge;
|
|
504
579
|
toHaveToolResponseSize: typeof toHaveToolResponseSize;
|
|
505
580
|
toSatisfyToolPredicate: typeof toSatisfyToolPredicate;
|
|
581
|
+
toHaveToolCalls: typeof toHaveToolCalls;
|
|
582
|
+
toHaveToolCallCount: typeof toHaveToolCallCount;
|
|
506
583
|
}>;
|
|
507
584
|
|
|
508
585
|
/**
|