@gleanwork/mcp-server-tester 0.12.0 → 1.0.0-beta.1

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -55,10 +55,33 @@ interface PatternValidatorOptions {
55
55
  /** Whether to perform case-sensitive matching (default: true) */
56
56
  caseSensitive?: boolean;
57
57
  }
58
+ /**
59
+ * Built-in snapshot sanitizer names for use with toMatchToolSnapshot.
60
+ * Pass these values in the sanitizers array to replace non-deterministic
61
+ * values with stable placeholders before snapshot comparison.
62
+ *
63
+ * @example
64
+ * expect(result).toMatchToolSnapshot('my-snapshot', [
65
+ * SnapshotSanitizers.UUID,
66
+ * SnapshotSanitizers.ISO_DATE,
67
+ * ]);
68
+ */
69
+ declare const SnapshotSanitizers: {
70
+ /** Replaces Unix timestamps (seconds and milliseconds) with a stable placeholder */
71
+ readonly TIMESTAMP: "timestamp";
72
+ /** Replaces UUID v1-v5 strings with a stable placeholder */
73
+ readonly UUID: "uuid";
74
+ /** Replaces ISO 8601 date/datetime strings with a stable placeholder */
75
+ readonly ISO_DATE: "iso-date";
76
+ /** Replaces MongoDB ObjectId strings with a stable placeholder */
77
+ readonly OBJECT_ID: "objectId";
78
+ /** Replaces JWT tokens with a stable placeholder */
79
+ readonly JWT: "jwt";
80
+ };
58
81
  /**
59
82
  * Built-in sanitizer names for common variable patterns
60
83
  */
61
- type BuiltInSanitizer = 'timestamp' | 'uuid' | 'iso-date' | 'objectId' | 'jwt';
84
+ type BuiltInSanitizer = (typeof SnapshotSanitizers)[keyof typeof SnapshotSanitizers];
62
85
  /**
63
86
  * Custom regex-based sanitizer
64
87
  */
@@ -171,48 +194,40 @@ declare function toBeToolError(this: {
171
194
  };
172
195
 
173
196
  /**
174
- * Supported LLM provider types
197
+ * Built-in judge rubrics matching Glean EvalV2's named judge types.
198
+ * Use these for consistent, standardized evaluations across teams.
199
+ *
200
+ * All built-in rubrics use a 5-point scale: 0.0 / 0.25 / 0.5 / 0.75 / 1.0
175
201
  */
176
- type ProviderKind = 'claude' | 'anthropic' | 'openai' | 'custom-http';
202
+ type BuiltInRubric = 'correctness' | 'completeness' | 'groundedness' | 'instruction-following' | 'conciseness';
203
+ /** A rubric specification: either a built-in named rubric or custom text. */
204
+ type RubricSpec = BuiltInRubric | {
205
+ text: string;
206
+ };
207
+
208
+ /** Valid LLM judge provider kinds. */
209
+ type ProviderKind = 'anthropic' | 'openai' | 'google';
210
+
177
211
  /**
178
- * Configuration for an LLM judge
212
+ * Tool call validators for llm_host simulation results.
213
+ *
214
+ * These validators extract the tool call trace from an LLMHostSimulationResult
215
+ * and apply assertions against expected call lists and counts.
179
216
  */
180
- interface JudgeConfig {
181
- /**
182
- * LLM provider to use
183
- * @default 'claude'
184
- */
185
- provider?: ProviderKind;
186
- /**
187
- * Environment variable name containing the API key
188
- * @default 'ANTHROPIC_API_KEY'
189
- */
190
- apiKeyEnvVar?: string;
191
- /**
192
- * Model to use for judging
193
- * @default 'claude-sonnet-4-20250514'
194
- */
195
- model?: string;
196
- /**
197
- * Maximum tokens for response
198
- * @default 1000
199
- */
200
- maxTokens?: number;
201
- /**
202
- * Temperature (0-1, lower is more deterministic)
203
- * @default 0.0
204
- */
205
- temperature?: number;
206
- /**
207
- * Maximum budget in USD for the judge evaluation
208
- * @default 0.10
209
- */
210
- maxBudgetUsd?: number;
211
- /**
212
- * Maximum size (in bytes) for tool output before failing the test
213
- * When set, the judge will fail if the candidate response exceeds this size
214
- */
215
- maxToolOutputSize?: number;
217
+
218
+ interface ToolCallExpectation {
219
+ calls: Array<{
220
+ name: string;
221
+ arguments?: Record<string, unknown>;
222
+ required?: boolean;
223
+ }>;
224
+ order?: 'strict' | 'any';
225
+ exclusive?: boolean;
226
+ }
227
+ interface ToolCallCountOptions {
228
+ min?: number;
229
+ max?: number;
230
+ exact?: number;
216
231
  }
217
232
 
218
233
  /**
@@ -229,8 +244,12 @@ interface JudgeMatcherOptions {
229
244
  reference?: unknown;
230
245
  /** Score threshold for passing (default: 0.7) */
231
246
  passingThreshold?: number;
232
- /** Judge configuration override */
233
- judgeConfig?: JudgeConfig;
247
+ /** Number of judge evaluations (scores averaged) */
248
+ reps?: number;
249
+ /** Override the judge provider */
250
+ provider?: ProviderKind;
251
+ /** Override the judge model */
252
+ model?: string;
234
253
  }
235
254
  /**
236
255
  * Declaration merging for Playwright matchers
@@ -335,7 +354,7 @@ declare global {
335
354
  * });
336
355
  * ```
337
356
  */
338
- toPassToolJudge(rubric: string, options?: JudgeMatcherOptions): Promise<R>;
357
+ toPassToolJudge(rubric: RubricSpec, options?: JudgeMatcherOptions): Promise<R>;
339
358
  /**
340
359
  * Validates that a response meets size constraints
341
360
  *
@@ -380,6 +399,28 @@ declare global {
380
399
  * ```
381
400
  */
382
401
  toSatisfyToolPredicate(predicate: ToolPredicate, description?: string): Promise<R>;
402
+ /**
403
+ * Validates which tools the LLM called during an llm_host simulation.
404
+ *
405
+ * @example
406
+ * ```typescript
407
+ * expect(simulationResult).toHaveToolCalls({
408
+ * calls: [{ name: 'search', arguments: { query: 'hello' }, required: true }],
409
+ * order: 'any',
410
+ * });
411
+ * ```
412
+ */
413
+ toHaveToolCalls(expectation: ToolCallExpectation): R;
414
+ /**
415
+ * Validates the number of tool calls made during an llm_host simulation.
416
+ *
417
+ * @example
418
+ * ```typescript
419
+ * expect(simulationResult).toHaveToolCallCount({ min: 1, max: 3 });
420
+ * expect(simulationResult).toHaveToolCallCount({ exact: 2 });
421
+ * ```
422
+ */
423
+ toHaveToolCallCount(options: ToolCallCountOptions): R;
383
424
  }
384
425
  }
385
426
  }
@@ -401,6 +442,8 @@ type ToolPredicate = (response: unknown, text: string) => boolean | PredicateRes
401
442
  * toPassToolJudge Matcher
402
443
  *
403
444
  * Validates that a response passes LLM-as-judge evaluation.
445
+ * Delegates evaluation logic to validateJudge() for consistency
446
+ * with the validator/matcher duality pattern.
404
447
  */
405
448
 
406
449
  /**
@@ -410,7 +453,7 @@ type ToolPredicate = (response: unknown, text: string) => boolean | PredicateRes
410
453
  */
411
454
  declare function toPassToolJudge(this: {
412
455
  isNot: boolean;
413
- }, received: unknown, rubric: string, options?: JudgeMatcherOptions): Promise<{
456
+ }, received: unknown, rubric: RubricSpec, options?: JudgeMatcherOptions): Promise<{
414
457
  pass: boolean;
415
458
  message: () => string;
416
459
  }>;
@@ -477,6 +520,38 @@ declare function toSatisfyToolPredicate(this: {
477
520
  message: () => string;
478
521
  }>;
479
522
 
523
+ /**
524
+ * toHaveToolCalls Matcher
525
+ *
526
+ * Validates which tools the LLM called during an llm_host simulation.
527
+ */
528
+
529
+ /**
530
+ * Creates the toHaveToolCalls matcher function
531
+ */
532
+ declare function toHaveToolCalls(this: {
533
+ isNot: boolean;
534
+ }, received: unknown, expectation: ToolCallExpectation): {
535
+ pass: boolean;
536
+ message: () => string;
537
+ };
538
+
539
+ /**
540
+ * toHaveToolCallCount Matcher
541
+ *
542
+ * Validates the number of tool calls made during an llm_host simulation.
543
+ */
544
+
545
+ /**
546
+ * Creates the toHaveToolCallCount matcher function
547
+ */
548
+ declare function toHaveToolCallCount(this: {
549
+ isNot: boolean;
550
+ }, received: unknown, options: ToolCallCountOptions): {
551
+ pass: boolean;
552
+ message: () => string;
553
+ };
554
+
480
555
  /**
481
556
  * Extended Playwright expect with MCP tool matchers
482
557
  *
@@ -503,6 +578,8 @@ declare const expect: playwright_test.Expect<{
503
578
  toPassToolJudge: typeof toPassToolJudge;
504
579
  toHaveToolResponseSize: typeof toHaveToolResponseSize;
505
580
  toSatisfyToolPredicate: typeof toSatisfyToolPredicate;
581
+ toHaveToolCalls: typeof toHaveToolCalls;
582
+ toHaveToolCallCount: typeof toHaveToolCallCount;
506
583
  }>;
507
584
 
508
585
  /**