@gleanwork/mcp-server-tester 0.12.0 → 1.0.0-beta.0

This diff shows the changes between publicly available versions of the package that have been released to one of the supported registries. It is provided for informational purposes only and reflects the package versions as they appear in their respective public registries.
package/dist/index.d.ts CHANGED
@@ -42,6 +42,28 @@ interface MCPOAuthConfig {
42
42
  */
43
43
  redirectUri?: string;
44
44
  }
45
+ /**
46
+ * OAuth 2.1 client credentials configuration for machine-to-machine (CI/CD) authentication.
47
+ * Credentials can be provided here or via MCP_CLIENT_ID/MCP_CLIENT_SECRET environment variables.
48
+ */
49
+ interface MCPClientCredentialsConfig {
50
+ /**
51
+ * OAuth client ID (falls back to MCP_CLIENT_ID env var)
52
+ */
53
+ clientId?: string;
54
+ /**
55
+ * OAuth client secret (falls back to MCP_CLIENT_SECRET env var)
56
+ */
57
+ clientSecret?: string;
58
+ /**
59
+ * Token endpoint URL (required to perform the client credentials flow, although the property is typed as optional)
60
+ */
61
+ tokenEndpoint?: string;
62
+ /**
63
+ * Scopes to request
64
+ */
65
+ scopes?: string[];
66
+ }
45
67
  /**
46
68
  * Authentication configuration for MCP connections
47
69
  */
@@ -54,6 +76,10 @@ interface MCPAuthConfig {
54
76
  * Full OAuth configuration for browser-based authentication
55
77
  */
56
78
  oauth?: MCPOAuthConfig;
79
+ /**
80
+ * OAuth 2.1 client credentials grant for machine-to-machine authentication
81
+ */
82
+ clientCredentials?: MCPClientCredentialsConfig;
57
83
  }
58
84
  /**
59
85
  * MCP host capabilities that can be registered with the server
@@ -74,35 +100,67 @@ interface MCPHostCapabilities {
74
100
  };
75
101
  }
76
102
  /**
77
- * Configuration for MCP client connection
78
- *
79
- * Supports both stdio (local) and HTTP (remote) transports
103
+ * Configuration for MCP client connection via stdio transport (local process)
80
104
  */
81
- interface MCPConfig {
105
+ interface StdioMCPConfig {
82
106
  /**
83
- * Transport type
107
+ * Transport type discriminant
84
108
  */
85
- transport: 'http' | 'stdio';
109
+ transport: 'stdio';
86
110
  /**
87
- * Server URL (required when transport === 'http')
111
+ * Command to execute (required for stdio transport)
88
112
  */
89
- serverUrl?: string;
113
+ command: string;
90
114
  /**
91
- * HTTP headers (optional for http transport, e.g., Authorization)
115
+ * Command arguments
92
116
  */
93
- headers?: Record<string, string>;
117
+ args?: Array<string>;
94
118
  /**
95
- * Command to execute (required when transport === 'stdio')
119
+ * Working directory for the command
96
120
  */
97
- command?: string;
121
+ cwd?: string;
98
122
  /**
99
- * Command arguments (optional for stdio)
123
+ * Suppress stderr output from the server process.
124
+ * When true, server stderr is ignored instead of inherited.
100
125
  */
101
- args?: Array<string>;
126
+ quiet?: boolean;
102
127
  /**
103
- * Working directory for the command (optional for stdio)
128
+ * Host capabilities to register with the server
104
129
  */
105
- cwd?: string;
130
+ capabilities?: MCPHostCapabilities;
131
+ /**
132
+ * Connection timeout in milliseconds
133
+ */
134
+ connectTimeoutMs?: number;
135
+ /**
136
+ * Request timeout in milliseconds
137
+ */
138
+ requestTimeoutMs?: number;
139
+ /**
140
+ * Timeout in milliseconds for MCP tool/list operations. Default: 30000
141
+ */
142
+ callTimeoutMs?: number;
143
+ }
144
+ /**
145
+ * Configuration for MCP client connection via HTTP transport (remote server)
146
+ */
147
+ interface HttpMCPConfig {
148
+ /**
149
+ * Transport type discriminant
150
+ */
151
+ transport: 'http';
152
+ /**
153
+ * Server URL (required for http transport)
154
+ */
155
+ serverUrl: string;
156
+ /**
157
+ * HTTP headers (e.g., Authorization)
158
+ */
159
+ headers?: Record<string, string>;
160
+ /**
161
+ * Authentication configuration
162
+ */
163
+ auth?: MCPAuthConfig;
106
164
  /**
107
165
  * Host capabilities to register with the server
108
166
  */
@@ -116,15 +174,57 @@ interface MCPConfig {
116
174
  */
117
175
  requestTimeoutMs?: number;
118
176
  /**
119
- * Suppress stderr output from the server process (stdio only)
120
- * When true, server stderr is ignored instead of inherited
177
+ * Timeout in milliseconds for MCP tool/list operations. Default: 30000
121
178
  */
122
- quiet?: boolean;
179
+ callTimeoutMs?: number;
123
180
  /**
124
- * Authentication configuration (optional for http transport)
181
+ * HTTP proxy configuration. Falls back to HTTPS_PROXY/HTTP_PROXY environment variables.
125
182
  */
126
- auth?: MCPAuthConfig;
183
+ proxy?: {
184
+ /**
185
+ * Proxy URL. Credentials can be embedded directly if required:
186
+ * `http://user:pass@proxy.example.com:8080`
187
+ */
188
+ url: string;
189
+ };
190
+ /**
191
+ * Number of retry attempts for transient connection failures and 429 rate limit responses.
192
+ * Uses exponential backoff with Retry-After header awareness. Defaults to 0 (no retries).
193
+ */
194
+ retryAttempts?: number;
195
+ /**
196
+ * TLS/mTLS configuration for custom certificates or disabling cert validation.
197
+ * File paths should point to PEM-encoded certificate files.
198
+ */
199
+ tls?: {
200
+ /**
201
+ * Path to CA certificate PEM file (for custom/self-signed CAs)
202
+ */
203
+ ca?: string;
204
+ /**
205
+ * Path to client certificate PEM file (for mutual TLS)
206
+ */
207
+ cert?: string;
208
+ /**
209
+ * Path to client private key PEM file (for mutual TLS)
210
+ */
211
+ key?: string;
212
+ /**
213
+ * Whether to reject unauthorized certificates. Defaults to true.
214
+ * Set to false to disable certificate validation (not recommended for production).
215
+ */
216
+ rejectUnauthorized?: boolean;
217
+ };
127
218
  }
219
+ /**
220
+ * Configuration for MCP client connection.
221
+ *
222
+ * This is a discriminated union — narrow with `isStdioConfig()` or `isHttpConfig()`
223
+ * before accessing transport-specific fields.
224
+ *
225
+ * Supports both stdio (local) and HTTP (remote) transports.
226
+ */
227
+ type MCPConfig = StdioMCPConfig | HttpMCPConfig;
128
228
  /**
129
229
  * Union schema for MCPConfig (validates based on transport type)
130
230
  */
@@ -155,6 +255,7 @@ declare const MCPConfigSchema: z.ZodDiscriminatedUnion<"transport", [z.ZodObject
155
255
  }>>;
156
256
  connectTimeoutMs: z.ZodOptional<z.ZodNumber>;
157
257
  requestTimeoutMs: z.ZodOptional<z.ZodNumber>;
258
+ callTimeoutMs: z.ZodOptional<z.ZodNumber>;
158
259
  quiet: z.ZodOptional<z.ZodBoolean>;
159
260
  }, "strip", z.ZodTypeAny, {
160
261
  transport: "stdio";
@@ -169,6 +270,7 @@ declare const MCPConfigSchema: z.ZodDiscriminatedUnion<"transport", [z.ZodObject
169
270
  } | undefined;
170
271
  connectTimeoutMs?: number | undefined;
171
272
  requestTimeoutMs?: number | undefined;
273
+ callTimeoutMs?: number | undefined;
172
274
  quiet?: boolean | undefined;
173
275
  }, {
174
276
  transport: "stdio";
@@ -183,10 +285,11 @@ declare const MCPConfigSchema: z.ZodDiscriminatedUnion<"transport", [z.ZodObject
183
285
  } | undefined;
184
286
  connectTimeoutMs?: number | undefined;
185
287
  requestTimeoutMs?: number | undefined;
288
+ callTimeoutMs?: number | undefined;
186
289
  quiet?: boolean | undefined;
187
290
  }>, z.ZodObject<{
188
291
  transport: z.ZodLiteral<"http">;
189
- serverUrl: z.ZodString;
292
+ serverUrl: z.ZodEffects<z.ZodString, string, string>;
190
293
  headers: z.ZodOptional<z.ZodRecord<z.ZodString, z.ZodString>>;
191
294
  capabilities: z.ZodOptional<z.ZodObject<{
192
295
  sampling: z.ZodOptional<z.ZodRecord<z.ZodString, z.ZodUnknown>>;
@@ -210,6 +313,7 @@ declare const MCPConfigSchema: z.ZodDiscriminatedUnion<"transport", [z.ZodObject
210
313
  }>>;
211
314
  connectTimeoutMs: z.ZodOptional<z.ZodNumber>;
212
315
  requestTimeoutMs: z.ZodOptional<z.ZodNumber>;
316
+ callTimeoutMs: z.ZodOptional<z.ZodNumber>;
213
317
  auth: z.ZodOptional<z.ZodEffects<z.ZodObject<{
214
318
  accessToken: z.ZodOptional<z.ZodString>;
215
319
  oauth: z.ZodOptional<z.ZodObject<{
@@ -237,6 +341,22 @@ declare const MCPConfigSchema: z.ZodDiscriminatedUnion<"transport", [z.ZodObject
237
341
  clientSecret?: string | undefined;
238
342
  redirectUri?: string | undefined;
239
343
  }>>;
344
+ clientCredentials: z.ZodOptional<z.ZodObject<{
345
+ clientId: z.ZodOptional<z.ZodString>;
346
+ clientSecret: z.ZodOptional<z.ZodString>;
347
+ tokenEndpoint: z.ZodOptional<z.ZodString>;
348
+ scopes: z.ZodOptional<z.ZodArray<z.ZodString, "many">>;
349
+ }, "strip", z.ZodTypeAny, {
350
+ scopes?: string[] | undefined;
351
+ clientId?: string | undefined;
352
+ clientSecret?: string | undefined;
353
+ tokenEndpoint?: string | undefined;
354
+ }, {
355
+ scopes?: string[] | undefined;
356
+ clientId?: string | undefined;
357
+ clientSecret?: string | undefined;
358
+ tokenEndpoint?: string | undefined;
359
+ }>>;
240
360
  }, "strip", z.ZodTypeAny, {
241
361
  accessToken?: string | undefined;
242
362
  oauth?: {
@@ -248,6 +368,12 @@ declare const MCPConfigSchema: z.ZodDiscriminatedUnion<"transport", [z.ZodObject
248
368
  clientSecret?: string | undefined;
249
369
  redirectUri?: string | undefined;
250
370
  } | undefined;
371
+ clientCredentials?: {
372
+ scopes?: string[] | undefined;
373
+ clientId?: string | undefined;
374
+ clientSecret?: string | undefined;
375
+ tokenEndpoint?: string | undefined;
376
+ } | undefined;
251
377
  }, {
252
378
  accessToken?: string | undefined;
253
379
  oauth?: {
@@ -259,6 +385,12 @@ declare const MCPConfigSchema: z.ZodDiscriminatedUnion<"transport", [z.ZodObject
259
385
  clientSecret?: string | undefined;
260
386
  redirectUri?: string | undefined;
261
387
  } | undefined;
388
+ clientCredentials?: {
389
+ scopes?: string[] | undefined;
390
+ clientId?: string | undefined;
391
+ clientSecret?: string | undefined;
392
+ tokenEndpoint?: string | undefined;
393
+ } | undefined;
262
394
  }>, {
263
395
  accessToken?: string | undefined;
264
396
  oauth?: {
@@ -270,6 +402,12 @@ declare const MCPConfigSchema: z.ZodDiscriminatedUnion<"transport", [z.ZodObject
270
402
  clientSecret?: string | undefined;
271
403
  redirectUri?: string | undefined;
272
404
  } | undefined;
405
+ clientCredentials?: {
406
+ scopes?: string[] | undefined;
407
+ clientId?: string | undefined;
408
+ clientSecret?: string | undefined;
409
+ tokenEndpoint?: string | undefined;
410
+ } | undefined;
273
411
  }, {
274
412
  accessToken?: string | undefined;
275
413
  oauth?: {
@@ -281,6 +419,36 @@ declare const MCPConfigSchema: z.ZodDiscriminatedUnion<"transport", [z.ZodObject
281
419
  clientSecret?: string | undefined;
282
420
  redirectUri?: string | undefined;
283
421
  } | undefined;
422
+ clientCredentials?: {
423
+ scopes?: string[] | undefined;
424
+ clientId?: string | undefined;
425
+ clientSecret?: string | undefined;
426
+ tokenEndpoint?: string | undefined;
427
+ } | undefined;
428
+ }>>;
429
+ proxy: z.ZodOptional<z.ZodObject<{
430
+ url: z.ZodString;
431
+ }, "strip", z.ZodTypeAny, {
432
+ url: string;
433
+ }, {
434
+ url: string;
435
+ }>>;
436
+ retryAttempts: z.ZodOptional<z.ZodNumber>;
437
+ tls: z.ZodOptional<z.ZodObject<{
438
+ ca: z.ZodOptional<z.ZodString>;
439
+ cert: z.ZodOptional<z.ZodString>;
440
+ key: z.ZodOptional<z.ZodString>;
441
+ rejectUnauthorized: z.ZodOptional<z.ZodBoolean>;
442
+ }, "strip", z.ZodTypeAny, {
443
+ ca?: string | undefined;
444
+ cert?: string | undefined;
445
+ key?: string | undefined;
446
+ rejectUnauthorized?: boolean | undefined;
447
+ }, {
448
+ ca?: string | undefined;
449
+ cert?: string | undefined;
450
+ key?: string | undefined;
451
+ rejectUnauthorized?: boolean | undefined;
284
452
  }>>;
285
453
  }, "strip", z.ZodTypeAny, {
286
454
  serverUrl: string;
@@ -293,6 +461,7 @@ declare const MCPConfigSchema: z.ZodDiscriminatedUnion<"transport", [z.ZodObject
293
461
  } | undefined;
294
462
  connectTimeoutMs?: number | undefined;
295
463
  requestTimeoutMs?: number | undefined;
464
+ callTimeoutMs?: number | undefined;
296
465
  headers?: Record<string, string> | undefined;
297
466
  auth?: {
298
467
  accessToken?: string | undefined;
@@ -305,6 +474,22 @@ declare const MCPConfigSchema: z.ZodDiscriminatedUnion<"transport", [z.ZodObject
305
474
  clientSecret?: string | undefined;
306
475
  redirectUri?: string | undefined;
307
476
  } | undefined;
477
+ clientCredentials?: {
478
+ scopes?: string[] | undefined;
479
+ clientId?: string | undefined;
480
+ clientSecret?: string | undefined;
481
+ tokenEndpoint?: string | undefined;
482
+ } | undefined;
483
+ } | undefined;
484
+ proxy?: {
485
+ url: string;
486
+ } | undefined;
487
+ retryAttempts?: number | undefined;
488
+ tls?: {
489
+ ca?: string | undefined;
490
+ cert?: string | undefined;
491
+ key?: string | undefined;
492
+ rejectUnauthorized?: boolean | undefined;
308
493
  } | undefined;
309
494
  }, {
310
495
  serverUrl: string;
@@ -317,6 +502,7 @@ declare const MCPConfigSchema: z.ZodDiscriminatedUnion<"transport", [z.ZodObject
317
502
  } | undefined;
318
503
  connectTimeoutMs?: number | undefined;
319
504
  requestTimeoutMs?: number | undefined;
505
+ callTimeoutMs?: number | undefined;
320
506
  headers?: Record<string, string> | undefined;
321
507
  auth?: {
322
508
  accessToken?: string | undefined;
@@ -329,6 +515,22 @@ declare const MCPConfigSchema: z.ZodDiscriminatedUnion<"transport", [z.ZodObject
329
515
  clientSecret?: string | undefined;
330
516
  redirectUri?: string | undefined;
331
517
  } | undefined;
518
+ clientCredentials?: {
519
+ scopes?: string[] | undefined;
520
+ clientId?: string | undefined;
521
+ clientSecret?: string | undefined;
522
+ tokenEndpoint?: string | undefined;
523
+ } | undefined;
524
+ } | undefined;
525
+ proxy?: {
526
+ url: string;
527
+ } | undefined;
528
+ retryAttempts?: number | undefined;
529
+ tls?: {
530
+ ca?: string | undefined;
531
+ cert?: string | undefined;
532
+ key?: string | undefined;
533
+ rejectUnauthorized?: boolean | undefined;
332
534
  } | undefined;
333
535
  }>]>;
334
536
  /**
@@ -342,17 +544,11 @@ declare function validateMCPConfig(config: unknown): MCPConfig;
342
544
  /**
343
545
  * Type guard to check if a config is for stdio transport
344
546
  */
345
- declare function isStdioConfig(config: MCPConfig): config is MCPConfig & {
346
- transport: 'stdio';
347
- command: string;
348
- };
547
+ declare function isStdioConfig(config: MCPConfig): config is StdioMCPConfig;
349
548
  /**
350
549
  * Type guard to check if a config is for HTTP transport
351
550
  */
352
- declare function isHttpConfig(config: MCPConfig): config is MCPConfig & {
353
- transport: 'http';
354
- serverUrl: string;
355
- };
551
+ declare function isHttpConfig(config: MCPConfig): config is HttpMCPConfig;
356
552
 
357
553
  /**
358
554
  * Auth types for MCP OAuth integration
@@ -601,6 +797,9 @@ declare class PlaywrightOAuthClientProvider implements OAuthClientProvider {
601
797
  tokens(): Promise<OAuthTokens | undefined>;
602
798
  /**
603
799
  * Stores new OAuth tokens for the current session
800
+ *
801
+ * The code verifier is cleared after a successful token exchange — it is
802
+ * single-use per PKCE spec and must not persist beyond the exchange.
604
803
  */
605
804
  saveTokens(tokens: OAuthTokens): Promise<void>;
606
805
  /**
@@ -757,6 +956,38 @@ interface AuthServerMetadata {
757
956
  */
758
957
  issuer: string;
759
958
  }
959
+ /**
960
+ * Configuration for client credentials grant
961
+ */
962
+ interface ClientCredentialsConfig {
963
+ /**
964
+ * Token endpoint URL
965
+ */
966
+ tokenEndpoint: string;
967
+ /**
968
+ * OAuth client ID
969
+ */
970
+ clientId: string;
971
+ /**
972
+ * OAuth client secret
973
+ */
974
+ clientSecret: string;
975
+ /**
976
+ * Scopes to request (optional)
977
+ */
978
+ scopes?: string[];
979
+ }
980
+ /**
981
+ * Performs the OAuth 2.1 client credentials grant to obtain an access token.
982
+ * Suitable for CI/CD machine-to-machine authentication.
983
+ *
984
+ * Uses oauth4webapi for spec-compliant request construction and response validation,
985
+ * consistent with how the rest of this module handles OAuth flows.
986
+ *
987
+ * @param config - Client credentials configuration
988
+ * @returns Token result
989
+ */
990
+ declare function performClientCredentialsFlow(config: ClientCredentialsConfig): Promise<TokenResult>;
760
991
 
761
992
  /**
762
993
  * OAuth Protected Resource and Authorization Server discovery
@@ -915,8 +1146,9 @@ declare function injectTokens(serverUrl: string, tokens: StoredTokens, stateDir?
915
1146
  * ```typescript
916
1147
  * // After running: npx mcp-server-tester login https://api.example.com/mcp
917
1148
  * const tokens = await loadTokens('https://api.example.com/mcp');
918
- * if (tokens) {
919
- * console.log('Access token:', tokens.accessToken);
1149
+ * if (tokens?.accessToken) {
1150
+ * // Use the token — never log raw token values
1151
+ * headers.Authorization = `Bearer ${tokens.accessToken}`;
920
1152
  * }
921
1153
  * ```
922
1154
  */
@@ -1127,6 +1359,14 @@ interface CreateMCPClientOptions {
1127
1359
  * This takes precedence over static token auth in config.auth.accessToken.
1128
1360
  */
1129
1361
  authProvider?: OAuthClientProvider;
1362
+ /**
1363
+ * Sampling handler callback for LLM sampling requests from the server.
1364
+ *
1365
+ * When provided, the client will advertise sampling capability to the server.
1366
+ * When absent, sampling is removed from declared capabilities so the client
1367
+ * does not falsely advertise support it cannot fulfill.
1368
+ */
1369
+ samplingHandler?: unknown;
1130
1370
  }
1131
1371
  /**
1132
1372
  * Creates and connects an MCP client based on the provided configuration
@@ -1251,6 +1491,14 @@ interface ValidationResult {
1251
1491
  message: string;
1252
1492
  /** Additional structured details about the validation */
1253
1493
  details?: Record<string, unknown>;
1494
+ /**
1495
+ * Optional quantitative metrics from the validation.
1496
+ * Populated by validateToolCalls for precision/recall.
1497
+ */
1498
+ metrics?: {
1499
+ precision?: number;
1500
+ recall?: number;
1501
+ };
1254
1502
  }
1255
1503
  /**
1256
1504
  * Options for text validation
@@ -1282,10 +1530,33 @@ interface PatternValidatorOptions {
1282
1530
  /** Whether to perform case-sensitive matching (default: true) */
1283
1531
  caseSensitive?: boolean;
1284
1532
  }
1533
+ /**
1534
+ * Built-in snapshot sanitizer names for use with toMatchToolSnapshot.
1535
+ * Pass these values in the sanitizers array to replace non-deterministic
1536
+ * values with stable placeholders before snapshot comparison.
1537
+ *
1538
+ * @example
1539
+ * expect(result).toMatchToolSnapshot('my-snapshot', [
1540
+ * SnapshotSanitizers.UUID,
1541
+ * SnapshotSanitizers.ISO_DATE,
1542
+ * ]);
1543
+ */
1544
+ declare const SnapshotSanitizers: {
1545
+ /** Replaces Unix timestamps (seconds and milliseconds) with a stable placeholder */
1546
+ readonly TIMESTAMP: "timestamp";
1547
+ /** Replaces UUID v1-v5 strings with a stable placeholder */
1548
+ readonly UUID: "uuid";
1549
+ /** Replaces ISO 8601 date/datetime strings with a stable placeholder */
1550
+ readonly ISO_DATE: "iso-date";
1551
+ /** Replaces MongoDB ObjectId strings with a stable placeholder */
1552
+ readonly OBJECT_ID: "objectId";
1553
+ /** Replaces JWT tokens with a stable placeholder */
1554
+ readonly JWT: "jwt";
1555
+ };
1285
1556
  /**
1286
1557
  * Built-in sanitizer names for common variable patterns
1287
1558
  */
1288
- type BuiltInSanitizer = 'timestamp' | 'uuid' | 'iso-date' | 'objectId' | 'jwt';
1559
+ type BuiltInSanitizer = (typeof SnapshotSanitizers)[keyof typeof SnapshotSanitizers];
1289
1560
  /**
1290
1561
  * Custom regex-based sanitizer
1291
1562
  */
@@ -1511,38 +1782,63 @@ declare function validateError(response: unknown, expected?: boolean | string |
1511
1782
  declare function validateSize(response: unknown, options: SizeValidatorOptions): ValidationResult;
1512
1783
 
1513
1784
  /**
1514
- * Validator Utilities
1785
+ * Tool call validators for llm_host simulation results.
1515
1786
  *
1516
- * Shared utility functions for validation operations.
1517
- * Re-exports core utilities from mcp/response.ts and adds validation-specific helpers.
1787
+ * These validators extract the tool call trace from an LLMHostSimulationResult
1788
+ * and apply assertions against expected call lists and counts.
1518
1789
  */
1519
1790
 
1791
+ interface ToolCallExpectation {
1792
+ calls: Array<{
1793
+ name: string;
1794
+ arguments?: Record<string, unknown>;
1795
+ required?: boolean;
1796
+ }>;
1797
+ order?: 'strict' | 'any';
1798
+ exclusive?: boolean;
1799
+ }
1800
+ interface ToolCallCountOptions {
1801
+ min?: number;
1802
+ max?: number;
1803
+ exact?: number;
1804
+ }
1520
1805
  /**
1521
- * Gets the size of a response in bytes
1522
- *
1523
- * Serializes the response to JSON (with pretty printing for consistency)
1524
- * and returns the byte length using UTF-8 encoding.
1806
+ * Validates tool calls made during an LLM host simulation.
1525
1807
  *
1526
- * @param response - Response in any format
1527
- * @returns Size in bytes
1808
+ * @param response - Must be an LLMHostSimulationResult (from llm_host mode)
1809
+ * @param expectation - Expected tool call specification
1528
1810
  */
1529
- declare function getResponseSizeBytes(response: unknown): number;
1811
+ declare function validateToolCalls(response: unknown, expectation: ToolCallExpectation): ValidationResult;
1530
1812
  /**
1531
- * Normalizes whitespace in text for consistent comparison
1532
- *
1533
- * Collapses multiple whitespace characters (spaces, tabs, newlines) into single spaces
1534
- * and trims leading/trailing whitespace.
1813
+ * Validates the number of tool calls made during an LLM host simulation.
1535
1814
  *
1536
- * @param text - Text to normalize
1537
- * @returns Normalized text with collapsed whitespace
1815
+ * @param response - Must be an LLMHostSimulationResult (from llm_host mode)
1816
+ * @param options - Count constraints (min, max, exact)
1817
+ */
1818
+ declare function validateToolCallCount(response: unknown, options: ToolCallCountOptions): ValidationResult;
1819
+
1820
+ /**
1821
+ * Built-in judge rubrics matching Glean EvalV2's named judge types.
1822
+ * Use these for consistent, standardized evaluations across teams.
1538
1823
  *
1539
- * @example
1540
- * ```typescript
1541
- * normalizeWhitespace(' hello\n\n world ');
1542
- * // Returns: "hello world"
1543
- * ```
1824
+ * All built-in rubrics use a 5-point scale: 0.0 / 0.25 / 0.5 / 0.75 / 1.0
1544
1825
  */
1545
- declare function normalizeWhitespace(text: string): string;
1826
+ type BuiltInRubric = 'correctness' | 'completeness' | 'groundedness' | 'instruction-following' | 'conciseness';
1827
+ declare const BUILT_IN_RUBRICS: Record<BuiltInRubric, string>;
1828
+ /** A rubric specification: either a built-in named rubric or custom text. */
1829
+ type RubricSpec = BuiltInRubric | {
1830
+ text: string;
1831
+ };
1832
+ /**
1833
+ * Returns true if `s` is a built-in rubric name.
1834
+ */
1835
+ declare function isBuiltInRubric(s: unknown): s is BuiltInRubric;
1836
+ /**
1837
+ * Resolves a RubricSpec to its full rubric text.
1838
+ * - Built-in name → returns the expanded rubric text from BUILT_IN_RUBRICS
1839
+ * - Custom object → returns rubric.text as-is
1840
+ */
1841
+ declare function resolveRubric(rubric: RubricSpec): string;
1546
1842
 
1547
1843
  /**
1548
1844
  * Usage metrics from Claude Agent SDK response
@@ -1577,10 +1873,8 @@ interface UsageMetrics {
1577
1873
  */
1578
1874
  cacheCreationInputTokens?: number;
1579
1875
  }
1580
- /**
1581
- * Supported LLM provider types
1582
- */
1583
- type ProviderKind = 'claude' | 'anthropic' | 'openai' | 'custom-http';
1876
+ /** Valid LLM judge provider kinds. */
1877
+ type ProviderKind = 'claude' | 'anthropic' | 'openai' | 'google';
1584
1878
  /**
1585
1879
  * Configuration for an LLM judge
1586
1880
  */
@@ -1649,7 +1943,24 @@ interface JudgeResult {
1649
1943
  * Whether the candidate exceeded maxToolOutputSize
1650
1944
  */
1651
1945
  exceedsMaxToolOutputSize?: boolean;
1946
+ /**
1947
+ * Standard deviation of individual rep scores.
1948
+ * Only populated when the judge was run with reps > 1.
1949
+ */
1950
+ scoreStdDev?: number;
1951
+ /**
1952
+ * True when the standard deviation across reps exceeds 0.2, indicating
1953
+ * that the rubric may be ambiguous or the judge is non-deterministic.
1954
+ * Only populated when the judge was run with reps > 1.
1955
+ */
1956
+ highVariance?: boolean;
1957
+ /**
1958
+ * Individual scores from each judge rep.
1959
+ * Only populated when the judge was run with reps > 1.
1960
+ */
1961
+ scores?: number[];
1652
1962
  }
1963
+
1653
1964
  /**
1654
1965
  * LLM judge client interface
1655
1966
  */
@@ -1665,6 +1976,75 @@ interface Judge {
1665
1976
  evaluate(candidate: unknown, reference: unknown, rubric: string): Promise<JudgeResult>;
1666
1977
  }
1667
1978
 
1979
+ /**
1980
+ * Judge Validator
1981
+ *
1982
+ * Validates a response using an LLM-as-a-judge evaluation.
1983
+ */
1984
+
1985
+ /**
1986
+ * Configuration for the judge validator
1987
+ */
1988
+ interface JudgeValidatorConfig {
1989
+ /** The evaluation rubric: a built-in name or custom { text: string } */
1990
+ rubric: RubricSpec;
1991
+ /** Optional reference response to compare against */
1992
+ reference?: unknown;
1993
+ /** Minimum score required to pass (0-1, default: 0.7) */
1994
+ threshold?: number;
1995
+ /** Number of judge evaluations to run. Scores averaged. @default 1 */
1996
+ reps?: number;
1997
+ /** Judge provider. @default 'claude' */
1998
+ provider?: ProviderKind;
1999
+ /** Model override (e.g., 'claude-opus-4-20250514') */
2000
+ model?: string;
2001
+ /** Environment variable name for API key */
2002
+ apiKeyEnvVar?: string;
2003
+ /** Max tokens for judge response */
2004
+ maxTokens?: number;
2005
+ /** Temperature for judge LLM (0–1) */
2006
+ temperature?: number;
2007
+ /** Max budget in USD per evaluation */
2008
+ maxBudgetUsd?: number;
2009
+ /** Fail if response exceeds this size in bytes before judging */
2010
+ maxToolOutputSize?: number;
2011
+ }
2012
+ declare function validateJudge(response: unknown, config: JudgeValidatorConfig): Promise<ValidationResult>;
2013
+
2014
+ /**
2015
+ * Validator Utilities
2016
+ *
2017
+ * Shared utility functions for validation operations.
2018
+ * Re-exports core utilities from mcp/response.ts and adds validation-specific helpers.
2019
+ */
2020
+
2021
+ /**
2022
+ * Gets the size of a response in bytes
2023
+ *
2024
+ * Serializes the response to JSON (with pretty printing for consistency)
2025
+ * and returns the byte length using UTF-8 encoding.
2026
+ *
2027
+ * @param response - Response in any format
2028
+ * @returns Size in bytes
2029
+ */
2030
+ declare function getResponseSizeBytes(response: unknown): number;
2031
+ /**
2032
+ * Normalizes whitespace in text for consistent comparison
2033
+ *
2034
+ * Collapses multiple whitespace characters (spaces, tabs, newlines) into single spaces
2035
+ * and trims leading/trailing whitespace.
2036
+ *
2037
+ * @param text - Text to normalize
2038
+ * @returns Normalized text with collapsed whitespace
2039
+ *
2040
+ * @example
2041
+ * ```typescript
2042
+ * normalizeWhitespace(' hello\n\n world ');
2043
+ * // Returns: "hello world"
2044
+ * ```
2045
+ */
2046
+ declare function normalizeWhitespace(text: string): string;
2047
+
1668
2048
  /**
1669
2049
  * Matcher Types
1670
2050
  *
@@ -1679,8 +2059,12 @@ interface JudgeMatcherOptions {
1679
2059
  reference?: unknown;
1680
2060
  /** Score threshold for passing (default: 0.7) */
1681
2061
  passingThreshold?: number;
1682
- /** Judge configuration override */
1683
- judgeConfig?: JudgeConfig;
2062
+ /** Number of judge evaluations (scores averaged) */
2063
+ reps?: number;
2064
+ /** Override the judge provider */
2065
+ provider?: ProviderKind;
2066
+ /** Override the judge model */
2067
+ model?: string;
1684
2068
  }
1685
2069
  /**
1686
2070
  * Declaration merging for Playwright matchers
@@ -1785,7 +2169,7 @@ declare global {
1785
2169
  * });
1786
2170
  * ```
1787
2171
  */
1788
- toPassToolJudge(rubric: string, options?: JudgeMatcherOptions): Promise<R>;
2172
+ toPassToolJudge(rubric: RubricSpec, options?: JudgeMatcherOptions): Promise<R>;
1789
2173
  /**
1790
2174
  * Validates that a response meets size constraints
1791
2175
  *
@@ -1830,11 +2214,33 @@ declare global {
1830
2214
  * ```
1831
2215
  */
1832
2216
  toSatisfyToolPredicate(predicate: ToolPredicate, description?: string): Promise<R>;
1833
- }
1834
- }
1835
- }
1836
- /**
1837
- * Predicate result returned by the user's predicate function
2217
+ /**
2218
+ * Validates which tools the LLM called during an llm_host simulation.
2219
+ *
2220
+ * @example
2221
+ * ```typescript
2222
+ * expect(simulationResult).toHaveToolCalls({
2223
+ * calls: [{ name: 'search', arguments: { query: 'hello' }, required: true }],
2224
+ * order: 'any',
2225
+ * });
2226
+ * ```
2227
+ */
2228
+ toHaveToolCalls(expectation: ToolCallExpectation): R;
2229
+ /**
2230
+ * Validates the number of tool calls made during an llm_host simulation.
2231
+ *
2232
+ * @example
2233
+ * ```typescript
2234
+ * expect(simulationResult).toHaveToolCallCount({ min: 1, max: 3 });
2235
+ * expect(simulationResult).toHaveToolCallCount({ exact: 2 });
2236
+ * ```
2237
+ */
2238
+ toHaveToolCallCount(options: ToolCallCountOptions): R;
2239
+ }
2240
+ }
2241
+ }
2242
+ /**
2243
+ * Predicate result returned by the user's predicate function
1838
2244
  */
1839
2245
  interface PredicateResult {
1840
2246
  /** Whether the predicate passed */
@@ -1873,7 +2279,7 @@ type ResultSource = 'eval' | 'test';
1873
2279
  /**
1874
2280
  * Known expectation types supported by the framework
1875
2281
  */
1876
- type ExpectationType = 'exact' | 'schema' | 'textContains' | 'regex' | 'snapshot' | 'judge' | 'error' | 'size';
2282
+ type ExpectationType = 'exact' | 'schema' | 'textContains' | 'regex' | 'snapshot' | 'judge' | 'error' | 'size' | 'toolsTriggered' | 'toolCallCount';
1877
2283
  /**
1878
2284
  * Result of an expectation check
1879
2285
  */
@@ -1912,6 +2318,10 @@ interface MCPFixtureOptions {
1912
2318
  * Used for filtering and grouping in the reporter
1913
2319
  */
1914
2320
  project?: string;
2321
+ /**
2322
+ * Timeout in milliseconds for MCP tool/list operations. Default: 30000
2323
+ */
2324
+ callTimeoutMs?: number;
1915
2325
  }
1916
2326
  /**
1917
2327
  * High-level API for interacting with MCP servers in tests
@@ -1954,29 +2364,43 @@ interface MCPFixtureApi {
1954
2364
  } | null;
1955
2365
  }
1956
2366
  /**
1957
- * Creates an MCP fixture wrapper around a Client
2367
+ * Creates an MCP fixture wrapper around a Client, providing a high-level
2368
+ * {@link MCPFixtureApi} without requiring Playwright's `test.extend` pattern.
1958
2369
  *
1959
- * When testInfo is provided, automatically tracks all MCP operations with test.step()
1960
- * and creates attachments for the MCP Test Reporter.
2370
+ * Use this when you need to set up an MCP fixture manually — for example in
2371
+ * custom fixture hierarchies, non-Playwright test runners (e.g. Vitest,
2372
+ * Jest), or when you want to compose the fixture with other lifecycle
2373
+ * management logic that doesn't fit the standard `test.extend` model.
1961
2374
  *
1962
- * @param client - The MCP client to wrap
1963
- * @param testInfo - Optional Playwright TestInfo for auto-tracking
2375
+ * For the typical Playwright use case, prefer importing `test` and `mcp`
2376
+ * directly from `@gleanwork/mcp-server-tester/fixtures/mcp`, which wires
2377
+ * this function up automatically.
2378
+ *
2379
+ * When `testInfo` is provided, all MCP operations are automatically wrapped
2380
+ * in `test.step()` calls and attachments are created for the MCP Test
2381
+ * Reporter. Omit `testInfo` for lightweight usage outside Playwright.
2382
+ *
2383
+ * @param client - The MCP client to wrap (created via `createMCPClientForConfig`)
2384
+ * @param testInfo - Optional Playwright TestInfo for auto-tracking and reporter attachments
2385
+ * @param options - Optional fixture options (authType, project, callTimeoutMs)
1964
2386
  * @returns MCPFixtureApi instance
1965
2387
  *
1966
2388
  * @example
1967
2389
  * ```typescript
1968
- * // With tracking (recommended)
2390
+ * // Advanced: custom fixture setup inside test.extend
1969
2391
  * const test = base.extend<{ mcp: MCPFixtureApi }>({
1970
2392
  * mcp: async ({}, use, testInfo) => {
1971
2393
  * const client = await createMCPClientForConfig(config);
1972
- * const api = createMCPFixture(client, testInfo);
2394
+ * const api = createMCPFixture(client, testInfo, { authType: 'api-token' });
1973
2395
  * await use(api);
1974
2396
  * await closeMCPClient(client);
1975
2397
  * }
1976
2398
  * });
1977
2399
  *
1978
- * // Without tracking
2400
+ * // Non-Playwright usage (no reporter attachments)
2401
+ * const client = await createMCPClientForConfig(config);
1979
2402
  * const api = createMCPFixture(client);
2403
+ * const tools = await api.listTools();
1980
2404
  * ```
1981
2405
  */
1982
2406
  declare function createMCPFixture(client: Client, testInfo?: TestInfo, options?: MCPFixtureOptions): MCPFixtureApi;
@@ -2082,6 +2506,8 @@ declare function toBeToolError(this: {
2082
2506
  * toPassToolJudge Matcher
2083
2507
  *
2084
2508
  * Validates that a response passes LLM-as-judge evaluation.
2509
+ * Delegates evaluation logic to validateJudge() for consistency
2510
+ * with the validator/matcher duality pattern.
2085
2511
  */
2086
2512
 
2087
2513
  /**
@@ -2091,7 +2517,7 @@ declare function toBeToolError(this: {
2091
2517
  */
2092
2518
  declare function toPassToolJudge(this: {
2093
2519
  isNot: boolean;
2094
- }, received: unknown, rubric: string, options?: JudgeMatcherOptions): Promise<{
2520
+ }, received: unknown, rubric: RubricSpec, options?: JudgeMatcherOptions): Promise<{
2095
2521
  pass: boolean;
2096
2522
  message: () => string;
2097
2523
  }>;
@@ -2158,6 +2584,38 @@ declare function toSatisfyToolPredicate(this: {
2158
2584
  message: () => string;
2159
2585
  }>;
2160
2586
 
2587
+ /**
2588
+ * toHaveToolCalls Matcher
2589
+ *
2590
+ * Validates which tools the LLM called during an llm_host simulation.
2591
+ */
2592
+
2593
+ /**
2594
+ * Creates the toHaveToolCalls matcher function
2595
+ */
2596
+ declare function toHaveToolCalls(this: {
2597
+ isNot: boolean;
2598
+ }, received: unknown, expectation: ToolCallExpectation): {
2599
+ pass: boolean;
2600
+ message: () => string;
2601
+ };
2602
+
2603
+ /**
2604
+ * toHaveToolCallCount Matcher
2605
+ *
2606
+ * Validates the number of tool calls made during an llm_host simulation.
2607
+ */
2608
+
2609
+ /**
2610
+ * Creates the toHaveToolCallCount matcher function
2611
+ */
2612
+ declare function toHaveToolCallCount(this: {
2613
+ isNot: boolean;
2614
+ }, received: unknown, options: ToolCallCountOptions): {
2615
+ pass: boolean;
2616
+ message: () => string;
2617
+ };
2618
+
2161
2619
  /**
2162
2620
  * Extended Playwright expect with MCP tool matchers
2163
2621
  *
@@ -2184,6 +2642,8 @@ declare const expect: playwright_test.Expect<{
2184
2642
  toPassToolJudge: typeof toPassToolJudge;
2185
2643
  toHaveToolResponseSize: typeof toHaveToolResponseSize;
2186
2644
  toSatisfyToolPredicate: typeof toSatisfyToolPredicate;
2645
+ toHaveToolCalls: typeof toHaveToolCalls;
2646
+ toHaveToolCallCount: typeof toHaveToolCallCount;
2187
2647
  }>;
2188
2648
 
2189
2649
  /**
@@ -2233,9 +2693,30 @@ declare const test: playwright_test.TestType<playwright_test.PlaywrightTestArgs
2233
2693
  */
2234
2694
 
2235
2695
  /**
2236
- * LLM provider for host simulation
2696
+ * LLM provider for host simulation.
2697
+ *
2698
+ * All providers run through the Vercel AI SDK (`ai` package).
2699
+ * Each provider requires its corresponding @ai-sdk/* package:
2700
+ *
2701
+ * openai → npm install ai @ai-sdk/openai
2702
+ * anthropic → npm install ai @ai-sdk/anthropic
2703
+ * google → npm install ai @ai-sdk/google
2704
+ * azure → npm install ai @ai-sdk/azure
2705
+ * mistral → npm install ai @ai-sdk/mistral
2706
+ * ollama → npm install ai @ai-sdk/ollama (local, no API key)
2707
+ * deepseek → npm install ai @ai-sdk/deepseek
2708
+ * openrouter → npm install ai @openrouter/ai-sdk-provider
2709
+ * xai → npm install ai @ai-sdk/xai
2710
+ */
2711
+ type LLMProvider = 'openai' | 'anthropic' | 'azure' | 'google' | 'mistral' | 'ollama' | 'deepseek' | 'openrouter' | 'xai'
2712
+ /**
2713
+ * Anthropic Claude via Google Vertex AI.
2714
+ * Requires @ai-sdk/google-vertex and Application Default Credentials (gcloud auth).
2715
+ * Set GOOGLE_VERTEX_PROJECT and GOOGLE_VERTEX_LOCATION env vars.
2716
+ * Use this instead of 'anthropic' in environments where api.anthropic.com is blocked.
2717
+ * @example model: 'claude-3-5-haiku@20241022'
2237
2718
  */
2238
- type LLMProvider = 'openai' | 'anthropic';
2719
+ | 'vertex-anthropic';
2239
2720
  /**
2240
2721
  * Configuration for LLM host simulation
2241
2722
  */
@@ -2246,12 +2727,10 @@ interface LLMHostConfig {
2246
2727
  provider: LLMProvider;
2247
2728
  /**
2248
2729
  * Environment variable name containing the API key
2249
- * @default 'OPENAI_API_KEY' for openai, 'ANTHROPIC_API_KEY' for anthropic
2250
2730
  */
2251
2731
  apiKeyEnvVar?: string;
2252
2732
  /**
2253
- * Model to use
2254
- * @default 'gpt-4' for openai, 'claude-3-5-sonnet-20241022' for anthropic
2733
+ * Model to use (provider-specific default if omitted)
2255
2734
  */
2256
2735
  model?: string;
2257
2736
  /**
@@ -2260,11 +2739,11 @@ interface LLMHostConfig {
2260
2739
  maxTokens?: number;
2261
2740
  /**
2262
2741
  * Temperature (0-1, lower is more deterministic)
2263
- * @default 0.0
2742
+ * @default 0
2264
2743
  */
2265
2744
  temperature?: number;
2266
2745
  /**
2267
- * Maximum number of tool calls to allow in a single conversation
2746
+ * Maximum number of tool call steps to allow in a single conversation
2268
2747
  * @default 10
2269
2748
  */
2270
2749
  maxToolCalls?: number;
@@ -2273,72 +2752,49 @@ interface LLMHostConfig {
2273
2752
  * A tool call made by the LLM
2274
2753
  */
2275
2754
  interface LLMToolCall {
2276
- /**
2277
- * Tool name
2278
- */
2755
+ /** Tool name */
2279
2756
  name: string;
2280
- /**
2281
- * Tool arguments (as provided by LLM)
2282
- */
2757
+ /** Tool arguments (as provided by LLM) */
2283
2758
  arguments: Record<string, unknown>;
2284
- /**
2285
- * Optional tool call ID (for tracking)
2286
- */
2759
+ /** Optional tool call ID (for tracking) */
2287
2760
  id?: string;
2288
2761
  }
2289
- /**
2290
- * Result of a tool call validation
2291
- */
2292
- interface ToolCallValidationResult {
2293
- /**
2294
- * Whether the tool call was valid
2295
- */
2296
- valid: boolean;
2297
- /**
2298
- * List of actual tool calls made
2299
- */
2300
- actualCalls: Array<LLMToolCall>;
2301
- /**
2302
- * Expected tool calls (if specified in eval case)
2303
- */
2304
- expectedCalls?: Array<LLMToolCall>;
2305
- /**
2306
- * Details about validation (e.g., missing calls, incorrect arguments)
2307
- */
2308
- details?: string;
2309
- }
2310
2762
  /**
2311
2763
  * Result from an LLM host simulation
2312
2764
  */
2313
2765
  interface LLMHostSimulationResult {
2314
- /**
2315
- * Whether the simulation succeeded
2316
- */
2766
+ /** Whether the simulation succeeded */
2317
2767
  success: boolean;
2318
- /**
2319
- * Tool calls made by the LLM
2320
- */
2768
+ /** Tool calls made by the LLM */
2321
2769
  toolCalls: Array<LLMToolCall>;
2322
- /**
2323
- * Final response from the LLM
2324
- */
2770
+ /** Final response from the LLM */
2325
2771
  response?: string;
2326
- /**
2327
- * Error message if simulation failed
2328
- */
2772
+ /** Error message if simulation failed */
2329
2773
  error?: string;
2330
- /**
2331
- * Full conversation history (for debugging)
2332
- */
2774
+ /** The scenario prompt that was given to the LLM */
2775
+ scenario?: string;
2776
+ /** The conversation turns for attribution analysis */
2333
2777
  conversationHistory?: Array<{
2334
2778
  role: 'user' | 'assistant' | 'tool';
2335
2779
  content: string;
2336
2780
  }>;
2781
+ /**
2782
+ * Milliseconds spent waiting for LLM responses
2783
+ * (excludes MCP tool execution time)
2784
+ */
2785
+ llmDurationMs?: number;
2786
+ /**
2787
+ * Milliseconds spent executing MCP tool calls
2788
+ * (excludes LLM response time)
2789
+ */
2790
+ mcpDurationMs?: number;
2337
2791
  }
2338
2792
  /**
2339
- * Interface for LLM host simulators
2793
+ * Interface for LLM host simulators.
2340
2794
  *
2341
- * Implementations communicate with MCP servers via the actual MCP protocol
2795
+ * The only built-in implementation is the Vercel AI SDK orchestrator
2796
+ * (src/evals/llmHost/adapters/vercel.ts). Custom implementations can be
2797
+ * created for specialised testing needs.
2342
2798
  */
2343
2799
  interface LLMHostSimulator {
2344
2800
  /**
@@ -2351,24 +2807,6 @@ interface LLMHostSimulator {
2351
2807
  */
2352
2808
  simulate(mcp: MCPFixtureApi, scenario: string, config: LLMHostConfig): Promise<LLMHostSimulationResult>;
2353
2809
  }
2354
- /**
2355
- * Expected tool call specification (for validation)
2356
- */
2357
- interface ExpectedToolCall {
2358
- /**
2359
- * Tool name
2360
- */
2361
- name: string;
2362
- /**
2363
- * Expected arguments (partial match)
2364
- */
2365
- arguments?: Record<string, unknown>;
2366
- /**
2367
- * Whether this call is required
2368
- * @default true
2369
- */
2370
- required?: boolean;
2371
- }
2372
2810
 
2373
2811
  /**
2374
2812
  * Evaluation mode
@@ -2423,6 +2861,41 @@ interface EvalCase {
2423
2861
  * For 'llm_host' mode, can include 'expectedToolCalls' for validation
2424
2862
  */
2425
2863
  metadata?: Record<string, unknown>;
2864
+ /**
2865
+ * Number of times to run this case and compute an accuracy score.
2866
+ * When > 1, `EvalCaseResult.accuracy` is populated and `pass` is determined
2867
+ * by `accuracyThreshold` rather than a single run.
2868
+ * @default 1
2869
+ */
2870
+ iterations?: number;
2871
+ /**
2872
+ * Minimum accuracy (0–1) required to pass when `iterations > 1`.
2873
+ * @default 1.0 (all iterations must pass)
2874
+ */
2875
+ accuracyThreshold?: number;
2876
+ /**
2877
+ * Number of times to invoke the LLM judge per `passesJudge` assertion.
2878
+ * Scores are averaged; the mean must meet the threshold to pass.
2879
+ * Reduces judge variance caused by non-determinism.
2880
+ * Per-assertion `passesJudge.reps` overrides this value.
2881
+ * @default 1
2882
+ */
2883
+ judgeReps?: number;
2884
+ /**
2885
+ * Golden/expected answer for this case.
2886
+ * When set, automatically passed as `reference` to the LLM judge
2887
+ * (unless passesJudge.reference is explicitly provided).
2888
+ * Mirrors EvalV2's `canonical_answer` field.
2889
+ */
2890
+ canonicalAnswer?: string;
2891
+ /**
2892
+ * Arbitrary string labels for this case.
2893
+ * Use for filtering eval runs with `EvalRunnerOptions.filterTags`
2894
+ * and for slicing results by category.
2895
+ *
2896
+ * @example ['tool-finding', 'multi-hop', 'search']
2897
+ */
2898
+ tags?: string[];
2426
2899
  /**
2427
2900
  * Expectations to validate against the tool response
2428
2901
  *
@@ -2486,14 +2959,30 @@ interface EvalExpectBlock {
2486
2959
  * LLM-as-judge evaluation (toPassToolJudge)
2487
2960
  */
2488
2961
  passesJudge?: {
2489
- /** Evaluation rubric/criteria */
2490
- rubric: string;
2962
+ /** Built-in rubric name or custom rubric object */
2963
+ rubric: BuiltInRubric | {
2964
+ text: string;
2965
+ };
2491
2966
  /** Reference response to compare against */
2492
2967
  reference?: unknown;
2493
2968
  /** Score threshold for passing (0-1, default: 0.7) */
2494
2969
  threshold?: number;
2495
- /** Judge configuration ID */
2496
- configId?: string;
2970
+ /** Number of judge evaluations for this assertion. Overrides EvalCase.judgeReps. */
2971
+ reps?: number;
2972
+ /** Judge provider. @default 'claude' */
2973
+ provider?: 'claude' | 'anthropic' | 'openai' | 'google';
2974
+ /** Model override (e.g., 'claude-opus-4-20250514') */
2975
+ model?: string;
2976
+ /** Environment variable name for API key */
2977
+ apiKeyEnvVar?: string;
2978
+ /** Max tokens for judge response */
2979
+ maxTokens?: number;
2980
+ /** Temperature for judge LLM (0–1) */
2981
+ temperature?: number;
2982
+ /** Max budget in USD per evaluation */
2983
+ maxBudgetUsd?: number;
2984
+ /** Fail if response exceeds this size in bytes before judging */
2985
+ maxToolOutputSize?: number;
2497
2986
  };
2498
2987
  /**
2499
2988
  * Response size validation (toHaveToolResponseSize)
@@ -2504,6 +2993,39 @@ interface EvalExpectBlock {
2504
2993
  /** Minimum required size in bytes */
2505
2994
  minBytes?: number;
2506
2995
  };
2996
+ /**
2997
+ * Asserts which tools the LLM called during an llm_host simulation.
2998
+ * Only meaningful for llm_host mode — direct mode has no tool call trace.
2999
+ */
3000
+ toolsTriggered?: {
3001
+ /** Expected tool calls */
3002
+ calls: Array<{
3003
+ /** Tool name */
3004
+ name: string;
3005
+ /** Expected arguments (partial match — extra keys are allowed) */
3006
+ arguments?: Record<string, unknown>;
3007
+ /** Whether this call MUST have been made (default: true) */
3008
+ required?: boolean;
3009
+ }>;
3010
+ /**
3011
+ * 'strict': calls must appear in the exact order listed
3012
+ * 'any': calls can appear in any order (default)
3013
+ */
3014
+ order?: 'strict' | 'any';
3015
+ /** If true, no tool calls outside the `calls` list are allowed */
3016
+ exclusive?: boolean;
3017
+ };
3018
+ /**
3019
+ * Asserts the number of tool calls made during an llm_host simulation.
3020
+ */
3021
+ toolCallCount?: {
3022
+ /** Minimum number of tool calls */
3023
+ min?: number;
3024
+ /** Maximum number of tool calls */
3025
+ max?: number;
3026
+ /** Exact number of tool calls */
3027
+ exact?: number;
3028
+ };
2507
3029
  }
2508
3030
  /**
2509
3031
  * A complete eval dataset containing multiple test cases
@@ -2543,21 +3065,21 @@ declare const EvalCaseSchema: z.ZodObject<{
2543
3065
  args: z.ZodOptional<z.ZodRecord<z.ZodString, z.ZodUnknown>>;
2544
3066
  scenario: z.ZodOptional<z.ZodString>;
2545
3067
  llmHostConfig: z.ZodOptional<z.ZodObject<{
2546
- provider: z.ZodEnum<["openai", "anthropic"]>;
3068
+ provider: z.ZodEnum<["openai", "anthropic", "azure", "google", "mistral", "ollama", "deepseek", "openrouter", "xai", "vertex-anthropic"]>;
2547
3069
  apiKeyEnvVar: z.ZodOptional<z.ZodString>;
2548
3070
  model: z.ZodOptional<z.ZodString>;
2549
3071
  maxTokens: z.ZodOptional<z.ZodNumber>;
2550
3072
  temperature: z.ZodOptional<z.ZodNumber>;
2551
3073
  maxToolCalls: z.ZodOptional<z.ZodNumber>;
2552
3074
  }, "strip", z.ZodTypeAny, {
2553
- provider: "anthropic" | "openai";
3075
+ provider: "openai" | "anthropic" | "azure" | "google" | "mistral" | "ollama" | "deepseek" | "openrouter" | "xai" | "vertex-anthropic";
2554
3076
  model?: string | undefined;
2555
3077
  maxTokens?: number | undefined;
2556
3078
  apiKeyEnvVar?: string | undefined;
2557
3079
  temperature?: number | undefined;
2558
3080
  maxToolCalls?: number | undefined;
2559
3081
  }, {
2560
- provider: "anthropic" | "openai";
3082
+ provider: "openai" | "anthropic" | "azure" | "google" | "mistral" | "ollama" | "deepseek" | "openrouter" | "xai" | "vertex-anthropic";
2561
3083
  model?: string | undefined;
2562
3084
  maxTokens?: number | undefined;
2563
3085
  apiKeyEnvVar?: string | undefined;
@@ -2565,6 +3087,11 @@ declare const EvalCaseSchema: z.ZodObject<{
2565
3087
  maxToolCalls?: number | undefined;
2566
3088
  }>>;
2567
3089
  metadata: z.ZodOptional<z.ZodRecord<z.ZodString, z.ZodUnknown>>;
3090
+ iterations: z.ZodOptional<z.ZodNumber>;
3091
+ accuracyThreshold: z.ZodOptional<z.ZodNumber>;
3092
+ judgeReps: z.ZodOptional<z.ZodNumber>;
3093
+ canonicalAnswer: z.ZodOptional<z.ZodString>;
3094
+ tags: z.ZodOptional<z.ZodArray<z.ZodString, "many">>;
2568
3095
  expect: z.ZodOptional<z.ZodObject<{
2569
3096
  response: z.ZodOptional<z.ZodUnknown>;
2570
3097
  schema: z.ZodOptional<z.ZodString>;
@@ -2589,20 +3116,51 @@ declare const EvalCaseSchema: z.ZodObject<{
2589
3116
  }>]>, "many">>;
2590
3117
  isError: z.ZodOptional<z.ZodUnion<[z.ZodBoolean, z.ZodString, z.ZodArray<z.ZodString, "many">]>>;
2591
3118
  passesJudge: z.ZodOptional<z.ZodObject<{
2592
- rubric: z.ZodString;
3119
+ rubric: z.ZodUnion<[z.ZodEnum<["correctness", "completeness", "groundedness", "instruction-following", "conciseness"]>, z.ZodObject<{
3120
+ text: z.ZodString;
3121
+ }, "strip", z.ZodTypeAny, {
3122
+ text: string;
3123
+ }, {
3124
+ text: string;
3125
+ }>]>;
2593
3126
  reference: z.ZodOptional<z.ZodUnknown>;
2594
3127
  threshold: z.ZodOptional<z.ZodNumber>;
2595
- configId: z.ZodOptional<z.ZodString>;
3128
+ reps: z.ZodOptional<z.ZodNumber>;
3129
+ provider: z.ZodOptional<z.ZodEnum<["claude", "anthropic", "openai", "google"]>>;
3130
+ model: z.ZodOptional<z.ZodString>;
3131
+ apiKeyEnvVar: z.ZodOptional<z.ZodString>;
3132
+ maxTokens: z.ZodOptional<z.ZodNumber>;
3133
+ temperature: z.ZodOptional<z.ZodNumber>;
3134
+ maxBudgetUsd: z.ZodOptional<z.ZodNumber>;
3135
+ maxToolOutputSize: z.ZodOptional<z.ZodNumber>;
2596
3136
  }, "strip", z.ZodTypeAny, {
2597
- rubric: string;
3137
+ rubric: "correctness" | "completeness" | "groundedness" | "instruction-following" | "conciseness" | {
3138
+ text: string;
3139
+ };
3140
+ model?: string | undefined;
3141
+ maxTokens?: number | undefined;
3142
+ maxBudgetUsd?: number | undefined;
2598
3143
  reference?: unknown;
2599
3144
  threshold?: number | undefined;
2600
- configId?: string | undefined;
3145
+ reps?: number | undefined;
3146
+ provider?: "openai" | "anthropic" | "google" | "claude" | undefined;
3147
+ apiKeyEnvVar?: string | undefined;
3148
+ temperature?: number | undefined;
3149
+ maxToolOutputSize?: number | undefined;
2601
3150
  }, {
2602
- rubric: string;
3151
+ rubric: "correctness" | "completeness" | "groundedness" | "instruction-following" | "conciseness" | {
3152
+ text: string;
3153
+ };
3154
+ model?: string | undefined;
3155
+ maxTokens?: number | undefined;
3156
+ maxBudgetUsd?: number | undefined;
2603
3157
  reference?: unknown;
2604
3158
  threshold?: number | undefined;
2605
- configId?: string | undefined;
3159
+ reps?: number | undefined;
3160
+ provider?: "openai" | "anthropic" | "google" | "claude" | undefined;
3161
+ apiKeyEnvVar?: string | undefined;
3162
+ temperature?: number | undefined;
3163
+ maxToolOutputSize?: number | undefined;
2606
3164
  }>>;
2607
3165
  responseSize: z.ZodOptional<z.ZodObject<{
2608
3166
  maxBytes: z.ZodOptional<z.ZodNumber>;
@@ -2614,11 +3172,71 @@ declare const EvalCaseSchema: z.ZodObject<{
2614
3172
  maxBytes?: number | undefined;
2615
3173
  minBytes?: number | undefined;
2616
3174
  }>>;
3175
+ toolsTriggered: z.ZodOptional<z.ZodObject<{
3176
+ calls: z.ZodArray<z.ZodObject<{
3177
+ name: z.ZodString;
3178
+ arguments: z.ZodOptional<z.ZodRecord<z.ZodString, z.ZodUnknown>>;
3179
+ required: z.ZodOptional<z.ZodBoolean>;
3180
+ }, "strip", z.ZodTypeAny, {
3181
+ name: string;
3182
+ required?: boolean | undefined;
3183
+ arguments?: Record<string, unknown> | undefined;
3184
+ }, {
3185
+ name: string;
3186
+ required?: boolean | undefined;
3187
+ arguments?: Record<string, unknown> | undefined;
3188
+ }>, "many">;
3189
+ order: z.ZodOptional<z.ZodEnum<["strict", "any"]>>;
3190
+ exclusive: z.ZodOptional<z.ZodBoolean>;
3191
+ }, "strip", z.ZodTypeAny, {
3192
+ calls: {
3193
+ name: string;
3194
+ required?: boolean | undefined;
3195
+ arguments?: Record<string, unknown> | undefined;
3196
+ }[];
3197
+ order?: "strict" | "any" | undefined;
3198
+ exclusive?: boolean | undefined;
3199
+ }, {
3200
+ calls: {
3201
+ name: string;
3202
+ required?: boolean | undefined;
3203
+ arguments?: Record<string, unknown> | undefined;
3204
+ }[];
3205
+ order?: "strict" | "any" | undefined;
3206
+ exclusive?: boolean | undefined;
3207
+ }>>;
3208
+ toolCallCount: z.ZodOptional<z.ZodObject<{
3209
+ min: z.ZodOptional<z.ZodNumber>;
3210
+ max: z.ZodOptional<z.ZodNumber>;
3211
+ exact: z.ZodOptional<z.ZodNumber>;
3212
+ }, "strip", z.ZodTypeAny, {
3213
+ exact?: number | undefined;
3214
+ min?: number | undefined;
3215
+ max?: number | undefined;
3216
+ }, {
3217
+ exact?: number | undefined;
3218
+ min?: number | undefined;
3219
+ max?: number | undefined;
3220
+ }>>;
2617
3221
  }, "strip", z.ZodTypeAny, {
3222
+ response?: unknown;
2618
3223
  isError?: string | boolean | string[] | undefined;
2619
3224
  schema?: string | undefined;
2620
3225
  snapshot?: string | undefined;
2621
- response?: unknown;
3226
+ toolsTriggered?: {
3227
+ calls: {
3228
+ name: string;
3229
+ required?: boolean | undefined;
3230
+ arguments?: Record<string, unknown> | undefined;
3231
+ }[];
3232
+ order?: "strict" | "any" | undefined;
3233
+ exclusive?: boolean | undefined;
3234
+ } | undefined;
3235
+ toolCallCount?: {
3236
+ exact?: number | undefined;
3237
+ min?: number | undefined;
3238
+ max?: number | undefined;
3239
+ } | undefined;
2622
3240
  containsText?: string | string[] | undefined;
2623
3241
  matchesPattern?: string | string[] | undefined;
2624
3242
  snapshotSanitizers?: ("timestamp" | "uuid" | "iso-date" | "objectId" | "jwt" | {
@@ -2628,20 +3246,43 @@ declare const EvalCaseSchema: z.ZodObject<{
2628
3246
  remove: string[];
2629
3247
  })[] | undefined;
2630
3248
  passesJudge?: {
2631
- rubric: string;
3249
+ rubric: "correctness" | "completeness" | "groundedness" | "instruction-following" | "conciseness" | {
3250
+ text: string;
3251
+ };
3252
+ model?: string | undefined;
3253
+ maxTokens?: number | undefined;
3254
+ maxBudgetUsd?: number | undefined;
2632
3255
  reference?: unknown;
2633
3256
  threshold?: number | undefined;
2634
- configId?: string | undefined;
3257
+ reps?: number | undefined;
3258
+ provider?: "openai" | "anthropic" | "google" | "claude" | undefined;
3259
+ apiKeyEnvVar?: string | undefined;
3260
+ temperature?: number | undefined;
3261
+ maxToolOutputSize?: number | undefined;
2635
3262
  } | undefined;
2636
3263
  responseSize?: {
2637
3264
  maxBytes?: number | undefined;
2638
3265
  minBytes?: number | undefined;
2639
3266
  } | undefined;
2640
3267
  }, {
3268
+ response?: unknown;
2641
3269
  isError?: string | boolean | string[] | undefined;
2642
3270
  schema?: string | undefined;
2643
3271
  snapshot?: string | undefined;
2644
- response?: unknown;
3272
+ toolsTriggered?: {
3273
+ calls: {
3274
+ name: string;
3275
+ required?: boolean | undefined;
3276
+ arguments?: Record<string, unknown> | undefined;
3277
+ }[];
3278
+ order?: "strict" | "any" | undefined;
3279
+ exclusive?: boolean | undefined;
3280
+ } | undefined;
3281
+ toolCallCount?: {
3282
+ exact?: number | undefined;
3283
+ min?: number | undefined;
3284
+ max?: number | undefined;
3285
+ } | undefined;
2645
3286
  containsText?: string | string[] | undefined;
2646
3287
  matchesPattern?: string | string[] | undefined;
2647
3288
  snapshotSanitizers?: ("timestamp" | "uuid" | "iso-date" | "objectId" | "jwt" | {
@@ -2651,10 +3292,19 @@ declare const EvalCaseSchema: z.ZodObject<{
2651
3292
  remove: string[];
2652
3293
  })[] | undefined;
2653
3294
  passesJudge?: {
2654
- rubric: string;
3295
+ rubric: "correctness" | "completeness" | "groundedness" | "instruction-following" | "conciseness" | {
3296
+ text: string;
3297
+ };
3298
+ model?: string | undefined;
3299
+ maxTokens?: number | undefined;
3300
+ maxBudgetUsd?: number | undefined;
2655
3301
  reference?: unknown;
2656
3302
  threshold?: number | undefined;
2657
- configId?: string | undefined;
3303
+ reps?: number | undefined;
3304
+ provider?: "openai" | "anthropic" | "google" | "claude" | undefined;
3305
+ apiKeyEnvVar?: string | undefined;
3306
+ temperature?: number | undefined;
3307
+ maxToolOutputSize?: number | undefined;
2658
3308
  } | undefined;
2659
3309
  responseSize?: {
2660
3310
  maxBytes?: number | undefined;
@@ -2664,24 +3314,43 @@ declare const EvalCaseSchema: z.ZodObject<{
2664
3314
  }, "strip", z.ZodTypeAny, {
2665
3315
  id: string;
2666
3316
  args?: Record<string, unknown> | undefined;
2667
- metadata?: Record<string, unknown> | undefined;
2668
3317
  mode?: "direct" | "llm_host" | undefined;
3318
+ metadata?: Record<string, unknown> | undefined;
2669
3319
  description?: string | undefined;
2670
3320
  toolName?: string | undefined;
2671
3321
  scenario?: string | undefined;
2672
3322
  llmHostConfig?: {
2673
- provider: "anthropic" | "openai";
3323
+ provider: "openai" | "anthropic" | "azure" | "google" | "mistral" | "ollama" | "deepseek" | "openrouter" | "xai" | "vertex-anthropic";
2674
3324
  model?: string | undefined;
2675
3325
  maxTokens?: number | undefined;
2676
3326
  apiKeyEnvVar?: string | undefined;
2677
3327
  temperature?: number | undefined;
2678
3328
  maxToolCalls?: number | undefined;
2679
3329
  } | undefined;
3330
+ iterations?: number | undefined;
3331
+ accuracyThreshold?: number | undefined;
3332
+ judgeReps?: number | undefined;
3333
+ canonicalAnswer?: string | undefined;
3334
+ tags?: string[] | undefined;
2680
3335
  expect?: {
3336
+ response?: unknown;
2681
3337
  isError?: string | boolean | string[] | undefined;
2682
3338
  schema?: string | undefined;
2683
3339
  snapshot?: string | undefined;
2684
- response?: unknown;
3340
+ toolsTriggered?: {
3341
+ calls: {
3342
+ name: string;
3343
+ required?: boolean | undefined;
3344
+ arguments?: Record<string, unknown> | undefined;
3345
+ }[];
3346
+ order?: "strict" | "any" | undefined;
3347
+ exclusive?: boolean | undefined;
3348
+ } | undefined;
3349
+ toolCallCount?: {
3350
+ exact?: number | undefined;
3351
+ min?: number | undefined;
3352
+ max?: number | undefined;
3353
+ } | undefined;
2685
3354
  containsText?: string | string[] | undefined;
2686
3355
  matchesPattern?: string | string[] | undefined;
2687
3356
  snapshotSanitizers?: ("timestamp" | "uuid" | "iso-date" | "objectId" | "jwt" | {
@@ -2691,10 +3360,19 @@ declare const EvalCaseSchema: z.ZodObject<{
2691
3360
  remove: string[];
2692
3361
  })[] | undefined;
2693
3362
  passesJudge?: {
2694
- rubric: string;
3363
+ rubric: "correctness" | "completeness" | "groundedness" | "instruction-following" | "conciseness" | {
3364
+ text: string;
3365
+ };
3366
+ model?: string | undefined;
3367
+ maxTokens?: number | undefined;
3368
+ maxBudgetUsd?: number | undefined;
2695
3369
  reference?: unknown;
2696
3370
  threshold?: number | undefined;
2697
- configId?: string | undefined;
3371
+ reps?: number | undefined;
3372
+ provider?: "openai" | "anthropic" | "google" | "claude" | undefined;
3373
+ apiKeyEnvVar?: string | undefined;
3374
+ temperature?: number | undefined;
3375
+ maxToolOutputSize?: number | undefined;
2698
3376
  } | undefined;
2699
3377
  responseSize?: {
2700
3378
  maxBytes?: number | undefined;
@@ -2704,24 +3382,43 @@ declare const EvalCaseSchema: z.ZodObject<{
2704
3382
  }, {
2705
3383
  id: string;
2706
3384
  args?: Record<string, unknown> | undefined;
2707
- metadata?: Record<string, unknown> | undefined;
2708
3385
  mode?: "direct" | "llm_host" | undefined;
3386
+ metadata?: Record<string, unknown> | undefined;
2709
3387
  description?: string | undefined;
2710
3388
  toolName?: string | undefined;
2711
3389
  scenario?: string | undefined;
2712
3390
  llmHostConfig?: {
2713
- provider: "anthropic" | "openai";
3391
+ provider: "openai" | "anthropic" | "azure" | "google" | "mistral" | "ollama" | "deepseek" | "openrouter" | "xai" | "vertex-anthropic";
2714
3392
  model?: string | undefined;
2715
3393
  maxTokens?: number | undefined;
2716
3394
  apiKeyEnvVar?: string | undefined;
2717
3395
  temperature?: number | undefined;
2718
3396
  maxToolCalls?: number | undefined;
2719
3397
  } | undefined;
3398
+ iterations?: number | undefined;
3399
+ accuracyThreshold?: number | undefined;
3400
+ judgeReps?: number | undefined;
3401
+ canonicalAnswer?: string | undefined;
3402
+ tags?: string[] | undefined;
2720
3403
  expect?: {
3404
+ response?: unknown;
2721
3405
  isError?: string | boolean | string[] | undefined;
2722
3406
  schema?: string | undefined;
2723
3407
  snapshot?: string | undefined;
2724
- response?: unknown;
3408
+ toolsTriggered?: {
3409
+ calls: {
3410
+ name: string;
3411
+ required?: boolean | undefined;
3412
+ arguments?: Record<string, unknown> | undefined;
3413
+ }[];
3414
+ order?: "strict" | "any" | undefined;
3415
+ exclusive?: boolean | undefined;
3416
+ } | undefined;
3417
+ toolCallCount?: {
3418
+ exact?: number | undefined;
3419
+ min?: number | undefined;
3420
+ max?: number | undefined;
3421
+ } | undefined;
2725
3422
  containsText?: string | string[] | undefined;
2726
3423
  matchesPattern?: string | string[] | undefined;
2727
3424
  snapshotSanitizers?: ("timestamp" | "uuid" | "iso-date" | "objectId" | "jwt" | {
@@ -2731,10 +3428,19 @@ declare const EvalCaseSchema: z.ZodObject<{
2731
3428
  remove: string[];
2732
3429
  })[] | undefined;
2733
3430
  passesJudge?: {
2734
- rubric: string;
3431
+ rubric: "correctness" | "completeness" | "groundedness" | "instruction-following" | "conciseness" | {
3432
+ text: string;
3433
+ };
3434
+ model?: string | undefined;
3435
+ maxTokens?: number | undefined;
3436
+ maxBudgetUsd?: number | undefined;
2735
3437
  reference?: unknown;
2736
3438
  threshold?: number | undefined;
2737
- configId?: string | undefined;
3439
+ reps?: number | undefined;
3440
+ provider?: "openai" | "anthropic" | "google" | "claude" | undefined;
3441
+ apiKeyEnvVar?: string | undefined;
3442
+ temperature?: number | undefined;
3443
+ maxToolOutputSize?: number | undefined;
2738
3444
  } | undefined;
2739
3445
  responseSize?: {
2740
3446
  maxBytes?: number | undefined;
@@ -2756,21 +3462,21 @@ declare const EvalDatasetSchema: z.ZodObject<{
2756
3462
  args: z.ZodOptional<z.ZodRecord<z.ZodString, z.ZodUnknown>>;
2757
3463
  scenario: z.ZodOptional<z.ZodString>;
2758
3464
  llmHostConfig: z.ZodOptional<z.ZodObject<{
2759
- provider: z.ZodEnum<["openai", "anthropic"]>;
3465
+ provider: z.ZodEnum<["openai", "anthropic", "azure", "google", "mistral", "ollama", "deepseek", "openrouter", "xai", "vertex-anthropic"]>;
2760
3466
  apiKeyEnvVar: z.ZodOptional<z.ZodString>;
2761
3467
  model: z.ZodOptional<z.ZodString>;
2762
3468
  maxTokens: z.ZodOptional<z.ZodNumber>;
2763
3469
  temperature: z.ZodOptional<z.ZodNumber>;
2764
3470
  maxToolCalls: z.ZodOptional<z.ZodNumber>;
2765
3471
  }, "strip", z.ZodTypeAny, {
2766
- provider: "anthropic" | "openai";
3472
+ provider: "openai" | "anthropic" | "azure" | "google" | "mistral" | "ollama" | "deepseek" | "openrouter" | "xai" | "vertex-anthropic";
2767
3473
  model?: string | undefined;
2768
3474
  maxTokens?: number | undefined;
2769
3475
  apiKeyEnvVar?: string | undefined;
2770
3476
  temperature?: number | undefined;
2771
3477
  maxToolCalls?: number | undefined;
2772
3478
  }, {
2773
- provider: "anthropic" | "openai";
3479
+ provider: "openai" | "anthropic" | "azure" | "google" | "mistral" | "ollama" | "deepseek" | "openrouter" | "xai" | "vertex-anthropic";
2774
3480
  model?: string | undefined;
2775
3481
  maxTokens?: number | undefined;
2776
3482
  apiKeyEnvVar?: string | undefined;
@@ -2778,6 +3484,11 @@ declare const EvalDatasetSchema: z.ZodObject<{
2778
3484
  maxToolCalls?: number | undefined;
2779
3485
  }>>;
2780
3486
  metadata: z.ZodOptional<z.ZodRecord<z.ZodString, z.ZodUnknown>>;
3487
+ iterations: z.ZodOptional<z.ZodNumber>;
3488
+ accuracyThreshold: z.ZodOptional<z.ZodNumber>;
3489
+ judgeReps: z.ZodOptional<z.ZodNumber>;
3490
+ canonicalAnswer: z.ZodOptional<z.ZodString>;
3491
+ tags: z.ZodOptional<z.ZodArray<z.ZodString, "many">>;
2781
3492
  expect: z.ZodOptional<z.ZodObject<{
2782
3493
  response: z.ZodOptional<z.ZodUnknown>;
2783
3494
  schema: z.ZodOptional<z.ZodString>;
@@ -2802,20 +3513,51 @@ declare const EvalDatasetSchema: z.ZodObject<{
2802
3513
  }>]>, "many">>;
2803
3514
  isError: z.ZodOptional<z.ZodUnion<[z.ZodBoolean, z.ZodString, z.ZodArray<z.ZodString, "many">]>>;
2804
3515
  passesJudge: z.ZodOptional<z.ZodObject<{
2805
- rubric: z.ZodString;
3516
+ rubric: z.ZodUnion<[z.ZodEnum<["correctness", "completeness", "groundedness", "instruction-following", "conciseness"]>, z.ZodObject<{
3517
+ text: z.ZodString;
3518
+ }, "strip", z.ZodTypeAny, {
3519
+ text: string;
3520
+ }, {
3521
+ text: string;
3522
+ }>]>;
2806
3523
  reference: z.ZodOptional<z.ZodUnknown>;
2807
3524
  threshold: z.ZodOptional<z.ZodNumber>;
2808
- configId: z.ZodOptional<z.ZodString>;
3525
+ reps: z.ZodOptional<z.ZodNumber>;
3526
+ provider: z.ZodOptional<z.ZodEnum<["claude", "anthropic", "openai", "google"]>>;
3527
+ model: z.ZodOptional<z.ZodString>;
3528
+ apiKeyEnvVar: z.ZodOptional<z.ZodString>;
3529
+ maxTokens: z.ZodOptional<z.ZodNumber>;
3530
+ temperature: z.ZodOptional<z.ZodNumber>;
3531
+ maxBudgetUsd: z.ZodOptional<z.ZodNumber>;
3532
+ maxToolOutputSize: z.ZodOptional<z.ZodNumber>;
2809
3533
  }, "strip", z.ZodTypeAny, {
2810
- rubric: string;
3534
+ rubric: "correctness" | "completeness" | "groundedness" | "instruction-following" | "conciseness" | {
3535
+ text: string;
3536
+ };
3537
+ model?: string | undefined;
3538
+ maxTokens?: number | undefined;
3539
+ maxBudgetUsd?: number | undefined;
2811
3540
  reference?: unknown;
2812
3541
  threshold?: number | undefined;
2813
- configId?: string | undefined;
3542
+ reps?: number | undefined;
3543
+ provider?: "openai" | "anthropic" | "google" | "claude" | undefined;
3544
+ apiKeyEnvVar?: string | undefined;
3545
+ temperature?: number | undefined;
3546
+ maxToolOutputSize?: number | undefined;
2814
3547
  }, {
2815
- rubric: string;
3548
+ rubric: "correctness" | "completeness" | "groundedness" | "instruction-following" | "conciseness" | {
3549
+ text: string;
3550
+ };
3551
+ model?: string | undefined;
3552
+ maxTokens?: number | undefined;
3553
+ maxBudgetUsd?: number | undefined;
2816
3554
  reference?: unknown;
2817
3555
  threshold?: number | undefined;
2818
- configId?: string | undefined;
3556
+ reps?: number | undefined;
3557
+ provider?: "openai" | "anthropic" | "google" | "claude" | undefined;
3558
+ apiKeyEnvVar?: string | undefined;
3559
+ temperature?: number | undefined;
3560
+ maxToolOutputSize?: number | undefined;
2819
3561
  }>>;
2820
3562
  responseSize: z.ZodOptional<z.ZodObject<{
2821
3563
  maxBytes: z.ZodOptional<z.ZodNumber>;
@@ -2827,11 +3569,71 @@ declare const EvalDatasetSchema: z.ZodObject<{
2827
3569
  maxBytes?: number | undefined;
2828
3570
  minBytes?: number | undefined;
2829
3571
  }>>;
3572
+ toolsTriggered: z.ZodOptional<z.ZodObject<{
3573
+ calls: z.ZodArray<z.ZodObject<{
3574
+ name: z.ZodString;
3575
+ arguments: z.ZodOptional<z.ZodRecord<z.ZodString, z.ZodUnknown>>;
3576
+ required: z.ZodOptional<z.ZodBoolean>;
3577
+ }, "strip", z.ZodTypeAny, {
3578
+ name: string;
3579
+ required?: boolean | undefined;
3580
+ arguments?: Record<string, unknown> | undefined;
3581
+ }, {
3582
+ name: string;
3583
+ required?: boolean | undefined;
3584
+ arguments?: Record<string, unknown> | undefined;
3585
+ }>, "many">;
3586
+ order: z.ZodOptional<z.ZodEnum<["strict", "any"]>>;
3587
+ exclusive: z.ZodOptional<z.ZodBoolean>;
3588
+ }, "strip", z.ZodTypeAny, {
3589
+ calls: {
3590
+ name: string;
3591
+ required?: boolean | undefined;
3592
+ arguments?: Record<string, unknown> | undefined;
3593
+ }[];
3594
+ order?: "strict" | "any" | undefined;
3595
+ exclusive?: boolean | undefined;
3596
+ }, {
3597
+ calls: {
3598
+ name: string;
3599
+ required?: boolean | undefined;
3600
+ arguments?: Record<string, unknown> | undefined;
3601
+ }[];
3602
+ order?: "strict" | "any" | undefined;
3603
+ exclusive?: boolean | undefined;
3604
+ }>>;
3605
+ toolCallCount: z.ZodOptional<z.ZodObject<{
3606
+ min: z.ZodOptional<z.ZodNumber>;
3607
+ max: z.ZodOptional<z.ZodNumber>;
3608
+ exact: z.ZodOptional<z.ZodNumber>;
3609
+ }, "strip", z.ZodTypeAny, {
3610
+ exact?: number | undefined;
3611
+ min?: number | undefined;
3612
+ max?: number | undefined;
3613
+ }, {
3614
+ exact?: number | undefined;
3615
+ min?: number | undefined;
3616
+ max?: number | undefined;
3617
+ }>>;
2830
3618
  }, "strip", z.ZodTypeAny, {
3619
+ response?: unknown;
2831
3620
  isError?: string | boolean | string[] | undefined;
2832
3621
  schema?: string | undefined;
2833
3622
  snapshot?: string | undefined;
2834
- response?: unknown;
3623
+ toolsTriggered?: {
3624
+ calls: {
3625
+ name: string;
3626
+ required?: boolean | undefined;
3627
+ arguments?: Record<string, unknown> | undefined;
3628
+ }[];
3629
+ order?: "strict" | "any" | undefined;
3630
+ exclusive?: boolean | undefined;
3631
+ } | undefined;
3632
+ toolCallCount?: {
3633
+ exact?: number | undefined;
3634
+ min?: number | undefined;
3635
+ max?: number | undefined;
3636
+ } | undefined;
2835
3637
  containsText?: string | string[] | undefined;
2836
3638
  matchesPattern?: string | string[] | undefined;
2837
3639
  snapshotSanitizers?: ("timestamp" | "uuid" | "iso-date" | "objectId" | "jwt" | {
@@ -2841,20 +3643,43 @@ declare const EvalDatasetSchema: z.ZodObject<{
2841
3643
  remove: string[];
2842
3644
  })[] | undefined;
2843
3645
  passesJudge?: {
2844
- rubric: string;
3646
+ rubric: "correctness" | "completeness" | "groundedness" | "instruction-following" | "conciseness" | {
3647
+ text: string;
3648
+ };
3649
+ model?: string | undefined;
3650
+ maxTokens?: number | undefined;
3651
+ maxBudgetUsd?: number | undefined;
2845
3652
  reference?: unknown;
2846
3653
  threshold?: number | undefined;
2847
- configId?: string | undefined;
3654
+ reps?: number | undefined;
3655
+ provider?: "openai" | "anthropic" | "google" | "claude" | undefined;
3656
+ apiKeyEnvVar?: string | undefined;
3657
+ temperature?: number | undefined;
3658
+ maxToolOutputSize?: number | undefined;
2848
3659
  } | undefined;
2849
3660
  responseSize?: {
2850
3661
  maxBytes?: number | undefined;
2851
3662
  minBytes?: number | undefined;
2852
3663
  } | undefined;
2853
3664
  }, {
3665
+ response?: unknown;
2854
3666
  isError?: string | boolean | string[] | undefined;
2855
3667
  schema?: string | undefined;
2856
3668
  snapshot?: string | undefined;
2857
- response?: unknown;
3669
+ toolsTriggered?: {
3670
+ calls: {
3671
+ name: string;
3672
+ required?: boolean | undefined;
3673
+ arguments?: Record<string, unknown> | undefined;
3674
+ }[];
3675
+ order?: "strict" | "any" | undefined;
3676
+ exclusive?: boolean | undefined;
3677
+ } | undefined;
3678
+ toolCallCount?: {
3679
+ exact?: number | undefined;
3680
+ min?: number | undefined;
3681
+ max?: number | undefined;
3682
+ } | undefined;
2858
3683
  containsText?: string | string[] | undefined;
2859
3684
  matchesPattern?: string | string[] | undefined;
2860
3685
  snapshotSanitizers?: ("timestamp" | "uuid" | "iso-date" | "objectId" | "jwt" | {
@@ -2864,10 +3689,19 @@ declare const EvalDatasetSchema: z.ZodObject<{
2864
3689
  remove: string[];
2865
3690
  })[] | undefined;
2866
3691
  passesJudge?: {
2867
- rubric: string;
3692
+ rubric: "correctness" | "completeness" | "groundedness" | "instruction-following" | "conciseness" | {
3693
+ text: string;
3694
+ };
3695
+ model?: string | undefined;
3696
+ maxTokens?: number | undefined;
3697
+ maxBudgetUsd?: number | undefined;
2868
3698
  reference?: unknown;
2869
3699
  threshold?: number | undefined;
2870
- configId?: string | undefined;
3700
+ reps?: number | undefined;
3701
+ provider?: "openai" | "anthropic" | "google" | "claude" | undefined;
3702
+ apiKeyEnvVar?: string | undefined;
3703
+ temperature?: number | undefined;
3704
+ maxToolOutputSize?: number | undefined;
2871
3705
  } | undefined;
2872
3706
  responseSize?: {
2873
3707
  maxBytes?: number | undefined;
@@ -2877,24 +3711,43 @@ declare const EvalDatasetSchema: z.ZodObject<{
2877
3711
  }, "strip", z.ZodTypeAny, {
2878
3712
  id: string;
2879
3713
  args?: Record<string, unknown> | undefined;
2880
- metadata?: Record<string, unknown> | undefined;
2881
3714
  mode?: "direct" | "llm_host" | undefined;
3715
+ metadata?: Record<string, unknown> | undefined;
2882
3716
  description?: string | undefined;
2883
3717
  toolName?: string | undefined;
2884
3718
  scenario?: string | undefined;
2885
3719
  llmHostConfig?: {
2886
- provider: "anthropic" | "openai";
3720
+ provider: "openai" | "anthropic" | "azure" | "google" | "mistral" | "ollama" | "deepseek" | "openrouter" | "xai" | "vertex-anthropic";
2887
3721
  model?: string | undefined;
2888
3722
  maxTokens?: number | undefined;
2889
3723
  apiKeyEnvVar?: string | undefined;
2890
3724
  temperature?: number | undefined;
2891
3725
  maxToolCalls?: number | undefined;
2892
3726
  } | undefined;
3727
+ iterations?: number | undefined;
3728
+ accuracyThreshold?: number | undefined;
3729
+ judgeReps?: number | undefined;
3730
+ canonicalAnswer?: string | undefined;
3731
+ tags?: string[] | undefined;
2893
3732
  expect?: {
3733
+ response?: unknown;
2894
3734
  isError?: string | boolean | string[] | undefined;
2895
3735
  schema?: string | undefined;
2896
3736
  snapshot?: string | undefined;
2897
- response?: unknown;
3737
+ toolsTriggered?: {
3738
+ calls: {
3739
+ name: string;
3740
+ required?: boolean | undefined;
3741
+ arguments?: Record<string, unknown> | undefined;
3742
+ }[];
3743
+ order?: "strict" | "any" | undefined;
3744
+ exclusive?: boolean | undefined;
3745
+ } | undefined;
3746
+ toolCallCount?: {
3747
+ exact?: number | undefined;
3748
+ min?: number | undefined;
3749
+ max?: number | undefined;
3750
+ } | undefined;
2898
3751
  containsText?: string | string[] | undefined;
2899
3752
  matchesPattern?: string | string[] | undefined;
2900
3753
  snapshotSanitizers?: ("timestamp" | "uuid" | "iso-date" | "objectId" | "jwt" | {
@@ -2904,10 +3757,19 @@ declare const EvalDatasetSchema: z.ZodObject<{
2904
3757
  remove: string[];
2905
3758
  })[] | undefined;
2906
3759
  passesJudge?: {
2907
- rubric: string;
3760
+ rubric: "correctness" | "completeness" | "groundedness" | "instruction-following" | "conciseness" | {
3761
+ text: string;
3762
+ };
3763
+ model?: string | undefined;
3764
+ maxTokens?: number | undefined;
3765
+ maxBudgetUsd?: number | undefined;
2908
3766
  reference?: unknown;
2909
3767
  threshold?: number | undefined;
2910
- configId?: string | undefined;
3768
+ reps?: number | undefined;
3769
+ provider?: "openai" | "anthropic" | "google" | "claude" | undefined;
3770
+ apiKeyEnvVar?: string | undefined;
3771
+ temperature?: number | undefined;
3772
+ maxToolOutputSize?: number | undefined;
2911
3773
  } | undefined;
2912
3774
  responseSize?: {
2913
3775
  maxBytes?: number | undefined;
@@ -2917,24 +3779,43 @@ declare const EvalDatasetSchema: z.ZodObject<{
2917
3779
  }, {
2918
3780
  id: string;
2919
3781
  args?: Record<string, unknown> | undefined;
2920
- metadata?: Record<string, unknown> | undefined;
2921
3782
  mode?: "direct" | "llm_host" | undefined;
3783
+ metadata?: Record<string, unknown> | undefined;
2922
3784
  description?: string | undefined;
2923
3785
  toolName?: string | undefined;
2924
3786
  scenario?: string | undefined;
2925
3787
  llmHostConfig?: {
2926
- provider: "anthropic" | "openai";
3788
+ provider: "openai" | "anthropic" | "azure" | "google" | "mistral" | "ollama" | "deepseek" | "openrouter" | "xai" | "vertex-anthropic";
2927
3789
  model?: string | undefined;
2928
3790
  maxTokens?: number | undefined;
2929
3791
  apiKeyEnvVar?: string | undefined;
2930
3792
  temperature?: number | undefined;
2931
3793
  maxToolCalls?: number | undefined;
2932
3794
  } | undefined;
3795
+ iterations?: number | undefined;
3796
+ accuracyThreshold?: number | undefined;
3797
+ judgeReps?: number | undefined;
3798
+ canonicalAnswer?: string | undefined;
3799
+ tags?: string[] | undefined;
2933
3800
  expect?: {
3801
+ response?: unknown;
2934
3802
  isError?: string | boolean | string[] | undefined;
2935
3803
  schema?: string | undefined;
2936
3804
  snapshot?: string | undefined;
2937
- response?: unknown;
3805
+ toolsTriggered?: {
3806
+ calls: {
3807
+ name: string;
3808
+ required?: boolean | undefined;
3809
+ arguments?: Record<string, unknown> | undefined;
3810
+ }[];
3811
+ order?: "strict" | "any" | undefined;
3812
+ exclusive?: boolean | undefined;
3813
+ } | undefined;
3814
+ toolCallCount?: {
3815
+ exact?: number | undefined;
3816
+ min?: number | undefined;
3817
+ max?: number | undefined;
3818
+ } | undefined;
2938
3819
  containsText?: string | string[] | undefined;
2939
3820
  matchesPattern?: string | string[] | undefined;
2940
3821
  snapshotSanitizers?: ("timestamp" | "uuid" | "iso-date" | "objectId" | "jwt" | {
@@ -2944,10 +3825,19 @@ declare const EvalDatasetSchema: z.ZodObject<{
2944
3825
  remove: string[];
2945
3826
  })[] | undefined;
2946
3827
  passesJudge?: {
2947
- rubric: string;
3828
+ rubric: "correctness" | "completeness" | "groundedness" | "instruction-following" | "conciseness" | {
3829
+ text: string;
3830
+ };
3831
+ model?: string | undefined;
3832
+ maxTokens?: number | undefined;
3833
+ maxBudgetUsd?: number | undefined;
2948
3834
  reference?: unknown;
2949
3835
  threshold?: number | undefined;
2950
- configId?: string | undefined;
3836
+ reps?: number | undefined;
3837
+ provider?: "openai" | "anthropic" | "google" | "claude" | undefined;
3838
+ apiKeyEnvVar?: string | undefined;
3839
+ temperature?: number | undefined;
3840
+ maxToolOutputSize?: number | undefined;
2951
3841
  } | undefined;
2952
3842
  responseSize?: {
2953
3843
  maxBytes?: number | undefined;
@@ -2961,24 +3851,43 @@ declare const EvalDatasetSchema: z.ZodObject<{
2961
3851
  cases: {
2962
3852
  id: string;
2963
3853
  args?: Record<string, unknown> | undefined;
2964
- metadata?: Record<string, unknown> | undefined;
2965
3854
  mode?: "direct" | "llm_host" | undefined;
3855
+ metadata?: Record<string, unknown> | undefined;
2966
3856
  description?: string | undefined;
2967
3857
  toolName?: string | undefined;
2968
3858
  scenario?: string | undefined;
2969
3859
  llmHostConfig?: {
2970
- provider: "anthropic" | "openai";
3860
+ provider: "openai" | "anthropic" | "azure" | "google" | "mistral" | "ollama" | "deepseek" | "openrouter" | "xai" | "vertex-anthropic";
2971
3861
  model?: string | undefined;
2972
3862
  maxTokens?: number | undefined;
2973
3863
  apiKeyEnvVar?: string | undefined;
2974
3864
  temperature?: number | undefined;
2975
3865
  maxToolCalls?: number | undefined;
2976
3866
  } | undefined;
3867
+ iterations?: number | undefined;
3868
+ accuracyThreshold?: number | undefined;
3869
+ judgeReps?: number | undefined;
3870
+ canonicalAnswer?: string | undefined;
3871
+ tags?: string[] | undefined;
2977
3872
  expect?: {
3873
+ response?: unknown;
2978
3874
  isError?: string | boolean | string[] | undefined;
2979
3875
  schema?: string | undefined;
2980
3876
  snapshot?: string | undefined;
2981
- response?: unknown;
3877
+ toolsTriggered?: {
3878
+ calls: {
3879
+ name: string;
3880
+ required?: boolean | undefined;
3881
+ arguments?: Record<string, unknown> | undefined;
3882
+ }[];
3883
+ order?: "strict" | "any" | undefined;
3884
+ exclusive?: boolean | undefined;
3885
+ } | undefined;
3886
+ toolCallCount?: {
3887
+ exact?: number | undefined;
3888
+ min?: number | undefined;
3889
+ max?: number | undefined;
3890
+ } | undefined;
2982
3891
  containsText?: string | string[] | undefined;
2983
3892
  matchesPattern?: string | string[] | undefined;
2984
3893
  snapshotSanitizers?: ("timestamp" | "uuid" | "iso-date" | "objectId" | "jwt" | {
@@ -2988,10 +3897,19 @@ declare const EvalDatasetSchema: z.ZodObject<{
2988
3897
  remove: string[];
2989
3898
  })[] | undefined;
2990
3899
  passesJudge?: {
2991
- rubric: string;
3900
+ rubric: "correctness" | "completeness" | "groundedness" | "instruction-following" | "conciseness" | {
3901
+ text: string;
3902
+ };
3903
+ model?: string | undefined;
3904
+ maxTokens?: number | undefined;
3905
+ maxBudgetUsd?: number | undefined;
2992
3906
  reference?: unknown;
2993
3907
  threshold?: number | undefined;
2994
- configId?: string | undefined;
3908
+ reps?: number | undefined;
3909
+ provider?: "openai" | "anthropic" | "google" | "claude" | undefined;
3910
+ apiKeyEnvVar?: string | undefined;
3911
+ temperature?: number | undefined;
3912
+ maxToolOutputSize?: number | undefined;
2995
3913
  } | undefined;
2996
3914
  responseSize?: {
2997
3915
  maxBytes?: number | undefined;
@@ -3006,24 +3924,43 @@ declare const EvalDatasetSchema: z.ZodObject<{
3006
3924
  cases: {
3007
3925
  id: string;
3008
3926
  args?: Record<string, unknown> | undefined;
3009
- metadata?: Record<string, unknown> | undefined;
3010
3927
  mode?: "direct" | "llm_host" | undefined;
3928
+ metadata?: Record<string, unknown> | undefined;
3011
3929
  description?: string | undefined;
3012
3930
  toolName?: string | undefined;
3013
3931
  scenario?: string | undefined;
3014
3932
  llmHostConfig?: {
3015
- provider: "anthropic" | "openai";
3933
+ provider: "openai" | "anthropic" | "azure" | "google" | "mistral" | "ollama" | "deepseek" | "openrouter" | "xai" | "vertex-anthropic";
3016
3934
  model?: string | undefined;
3017
3935
  maxTokens?: number | undefined;
3018
3936
  apiKeyEnvVar?: string | undefined;
3019
3937
  temperature?: number | undefined;
3020
3938
  maxToolCalls?: number | undefined;
3021
3939
  } | undefined;
3940
+ iterations?: number | undefined;
3941
+ accuracyThreshold?: number | undefined;
3942
+ judgeReps?: number | undefined;
3943
+ canonicalAnswer?: string | undefined;
3944
+ tags?: string[] | undefined;
3022
3945
  expect?: {
3946
+ response?: unknown;
3023
3947
  isError?: string | boolean | string[] | undefined;
3024
3948
  schema?: string | undefined;
3025
3949
  snapshot?: string | undefined;
3026
- response?: unknown;
3950
+ toolsTriggered?: {
3951
+ calls: {
3952
+ name: string;
3953
+ required?: boolean | undefined;
3954
+ arguments?: Record<string, unknown> | undefined;
3955
+ }[];
3956
+ order?: "strict" | "any" | undefined;
3957
+ exclusive?: boolean | undefined;
3958
+ } | undefined;
3959
+ toolCallCount?: {
3960
+ exact?: number | undefined;
3961
+ min?: number | undefined;
3962
+ max?: number | undefined;
3963
+ } | undefined;
3027
3964
  containsText?: string | string[] | undefined;
3028
3965
  matchesPattern?: string | string[] | undefined;
3029
3966
  snapshotSanitizers?: ("timestamp" | "uuid" | "iso-date" | "objectId" | "jwt" | {
@@ -3033,10 +3970,19 @@ declare const EvalDatasetSchema: z.ZodObject<{
3033
3970
  remove: string[];
3034
3971
  })[] | undefined;
3035
3972
  passesJudge?: {
3036
- rubric: string;
3973
+ rubric: "correctness" | "completeness" | "groundedness" | "instruction-following" | "conciseness" | {
3974
+ text: string;
3975
+ };
3976
+ model?: string | undefined;
3977
+ maxTokens?: number | undefined;
3978
+ maxBudgetUsd?: number | undefined;
3037
3979
  reference?: unknown;
3038
3980
  threshold?: number | undefined;
3039
- configId?: string | undefined;
3981
+ reps?: number | undefined;
3982
+ provider?: "openai" | "anthropic" | "google" | "claude" | undefined;
3983
+ apiKeyEnvVar?: string | undefined;
3984
+ temperature?: number | undefined;
3985
+ maxToolOutputSize?: number | undefined;
3040
3986
  } | undefined;
3041
3987
  responseSize?: {
3042
3988
  maxBytes?: number | undefined;
@@ -3126,50 +4072,140 @@ declare function loadEvalDataset(filePath: string, options?: LoadDatasetOptions)
3126
4072
  declare function loadEvalDatasetFromObject(data: unknown, options?: LoadDatasetOptions): EvalDataset;
3127
4073
 
3128
4074
  /**
3129
- * Context passed to the eval runner
4075
+ * Reporter-specific type definitions
4076
+ *
4077
+ * These types are used by the MCP reporter and UI.
4078
+ *
4079
+ * @packageDocumentation
3130
4080
  */
3131
- interface EvalContext {
4081
+
4082
+ /**
4083
+ * Experiment tracking metadata for an eval run
4084
+ */
4085
+ interface EvalRunMetadata {
4086
+ /** Git commit hash at time of run */
4087
+ gitHash?: string;
4088
+ /** ISO timestamp of the run */
4089
+ timestamp: string;
4090
+ /** Package version from package.json */
4091
+ packageVersion: string;
4092
+ /** LLM host model identifier (if llm_host mode) */
4093
+ llmHostModel?: string;
4094
+ /** Judge model identifier (if judge was used) */
4095
+ judgeModel?: string;
4096
+ }
4097
+ /**
4098
+ * Individual conformance check result
4099
+ */
4100
+ interface MCPConformanceCheck$1 {
3132
4101
  /**
3133
- * MCP fixture API for interacting with the server
4102
+ * Check name (e.g., 'server_info_present', 'list_tools_succeeds')
3134
4103
  */
3135
- mcp: MCPFixtureApi;
4104
+ name: string;
3136
4105
  /**
3137
- * Optional Playwright TestInfo for reporter integration
3138
- * When provided, eval results will be attached to the test for the MCP reporter
4106
+ * Whether the check passed
3139
4107
  */
3140
- testInfo?: TestInfo;
4108
+ pass: boolean;
3141
4109
  /**
3142
- * Optional Playwright expect function for snapshot testing
3143
- * Required for snapshot expectations to work properly
4110
+ * Human-readable message describing the result
3144
4111
  */
3145
- expect?: Expect;
4112
+ message: string;
3146
4113
  }
3147
-
3148
4114
  /**
3149
- * Result of a single eval case
4115
+ * Conformance check result as stored in reporter data
3150
4116
  */
3151
- interface EvalCaseResult$1 {
4117
+ interface MCPConformanceResultData {
3152
4118
  /**
3153
- * Case ID
4119
+ * Test title where conformance check was run
3154
4120
  */
3155
- id: string;
4121
+ testTitle: string;
3156
4122
  /**
3157
- * Dataset name this case belongs to
4123
+ * Whether all checks passed
3158
4124
  */
3159
- datasetName: string;
4125
+ pass: boolean;
3160
4126
  /**
3161
- * MCP tool name that was called
4127
+ * Individual check results
3162
4128
  */
3163
- toolName: string;
4129
+ checks: MCPConformanceCheck$1[];
3164
4130
  /**
3165
- * Evaluation mode (direct or llm_host)
3166
- * @deprecated Mode is inferred from test context, not displayed in reports
4131
+ * Server info if available
3167
4132
  */
3168
- mode?: 'direct' | 'llm_host';
3169
- /**
3170
- * Source of this result
3171
- * - 'eval': From runEvalDataset() using JSON eval datasets
3172
- * - 'test': From direct API test tracking (MCP fixture calls)
4133
+ serverInfo?: {
4134
+ name?: string;
4135
+ version?: string;
4136
+ };
4137
+ /**
4138
+ * Number of tools discovered
4139
+ */
4140
+ toolCount: number;
4141
+ /**
4142
+ * Auth type used for this check
4143
+ */
4144
+ authType?: AuthType;
4145
+ /**
4146
+ * Project name
4147
+ */
4148
+ project?: string;
4149
+ }
4150
+ /**
4151
+ * Server capabilities data from mcp-list-tools attachment
4152
+ */
4153
+ interface MCPServerCapabilitiesData {
4154
+ /**
4155
+ * Test title where listTools was called
4156
+ */
4157
+ testTitle: string;
4158
+ /**
4159
+ * List of tools available on the server
4160
+ */
4161
+ tools: Array<{
4162
+ name: string;
4163
+ description?: string;
4164
+ }>;
4165
+ /**
4166
+ * Total number of tools
4167
+ */
4168
+ toolCount: number;
4169
+ /**
4170
+ * Auth type used for this test
4171
+ */
4172
+ authType?: AuthType;
4173
+ /**
4174
+ * Project name
4175
+ */
4176
+ project?: string;
4177
+ }
4178
+ /**
4179
+ * Result of a single iteration within a multi-iteration eval case
4180
+ */
4181
+ interface IterationResult {
4182
+ /** Whether this iteration passed */
4183
+ pass: boolean;
4184
+ /** Execution time for this iteration */
4185
+ durationMs: number;
4186
+ /** Error message if the iteration failed with an exception */
4187
+ error?: string;
4188
+ /** When true, this iteration failed due to network/infrastructure issues rather than an assertion failure */
4189
+ isInfrastructureError?: boolean;
4190
+ }
4191
+ /**
4192
+ * Result of a single eval case
4193
+ */
4194
+ interface EvalCaseResult {
4195
+ /**
4196
+ * Case ID
4197
+ */
4198
+ id: string;
4199
+ /**
4200
+ * Dataset name this case belongs to
4201
+ */
4202
+ datasetName: string;
4203
+ /**
4204
+ * MCP tool name that was called
4205
+ */
4206
+ toolName: string;
4207
+ /**
4208
+ * Source of this result
3173
4209
  */
3174
4210
  source: ResultSource;
3175
4211
  /**
@@ -3194,14 +4230,164 @@ interface EvalCaseResult$1 {
3194
4230
  authType?: AuthType;
3195
4231
  /**
3196
4232
  * Playwright project name this test belongs to
3197
- * Used for filtering/grouping results by project in the reporter
3198
4233
  */
3199
4234
  project?: string;
3200
4235
  /**
3201
4236
  * Execution time in milliseconds
3202
4237
  */
3203
4238
  durationMs: number;
4239
+ /**
4240
+ * Assertion pass rate (0–1): passes divided by non-infrastructure iterations.
4241
+ * Only present when the case was run with `iterations > 1`.
4242
+ *
4243
+ * Infrastructure errors (network timeouts, rate limits, etc.) are excluded from
4244
+ * the denominator so that environment unreliability does not distort this metric.
4245
+ */
4246
+ assertionPassRate?: number;
4247
+ /**
4248
+ * Infrastructure error rate (0–1): infra errors divided by total iterations.
4249
+ * Only present when the case was run with `iterations > 1`.
4250
+ */
4251
+ infrastructureErrorRate?: number;
4252
+ /**
4253
+ * Accuracy score (0–1): passes divided by non-infrastructure iterations.
4254
+ * Alias for `assertionPassRate`. Only present when the case was run with `iterations > 1`.
4255
+ * @deprecated Use `assertionPassRate` for clarity; this field is kept for backward compatibility.
4256
+ */
4257
+ accuracy?: number;
4258
+ /**
4259
+ * Per-iteration pass/fail breakdown.
4260
+ * Only present when the case was run with `iterations > 1`.
4261
+ */
4262
+ iterationResults?: Array<IterationResult>;
4263
+ /**
4264
+ * Tags from the source eval case, for filtering and slicing reports.
4265
+ */
4266
+ tags?: string[];
4267
+ /**
4268
+ * Precision of tool calls made (0–1).
4269
+ * 1.0 means every tool called was expected; <1.0 means unexpected tools were called.
4270
+ * Only populated when `exclusive: true` is set in `toolsTriggered` and the expectation was evaluated.
4271
+ */
4272
+ toolPrecision?: number;
4273
+ /**
4274
+ * Recall of required tool calls (0–1).
4275
+ * 1.0 means all required tools were called; <1.0 means some were missed.
4276
+ * Only populated when toolsTriggered expectation was evaluated.
4277
+ */
4278
+ toolRecall?: number;
4279
+ /**
4280
+ * Pass/fail status of this case in the baseline run.
4281
+ * Only present when a baseline was provided to runEvalDataset.
4282
+ */
4283
+ baselinePass?: boolean;
4284
+ /**
4285
+ * Number of iterations that failed due to infrastructure errors (network, rate limits, etc.).
4286
+ * Only present when the case was run with `iterations > 1`.
4287
+ */
4288
+ infrastructureErrorCount?: number;
3204
4289
  }
4290
+ /**
4291
+ * Aggregated MCP eval run data
4292
+ */
4293
+ interface MCPEvalRunData {
4294
+ /**
4295
+ * Run timestamp (ISO 8601)
4296
+ */
4297
+ timestamp: string;
4298
+ /**
4299
+ * Total duration in milliseconds
4300
+ */
4301
+ durationMs: number;
4302
+ /**
4303
+ * Environment info
4304
+ */
4305
+ environment: {
4306
+ ci: boolean;
4307
+ node: string;
4308
+ platform: string;
4309
+ };
4310
+ /**
4311
+ * Aggregate metrics
4312
+ */
4313
+ metrics: {
4314
+ /**
4315
+ * Total number of eval cases
4316
+ */
4317
+ total: number;
4318
+ /**
4319
+ * Number of passed cases
4320
+ */
4321
+ passed: number;
4322
+ /**
4323
+ * Number of failed cases
4324
+ */
4325
+ failed: number;
4326
+ /**
4327
+ * Pass rate (0-1)
4328
+ */
4329
+ passRate: number;
4330
+ /**
4331
+ * Dataset breakdown: dataset name -> count
4332
+ */
4333
+ datasetBreakdown: Record<string, number>;
4334
+ /**
4335
+ * Expectation type breakdown
4336
+ */
4337
+ expectationBreakdown: ExpectationBreakdown;
4338
+ };
4339
+ /**
4340
+ * All eval results from this run
4341
+ */
4342
+ results: EvalCaseResult[];
4343
+ /**
4344
+ * Conformance check results (optional)
4345
+ */
4346
+ conformanceChecks?: MCPConformanceResultData[];
4347
+ /**
4348
+ * Server capabilities discovered via listTools (optional)
4349
+ */
4350
+ serverCapabilities?: MCPServerCapabilitiesData[];
4351
+ }
4352
+ /**
4353
+ * Historical summary for trend charts
4354
+ */
4355
+ interface MCPEvalHistoricalSummary {
4356
+ timestamp: string;
4357
+ total: number;
4358
+ passed: number;
4359
+ failed: number;
4360
+ passRate: number;
4361
+ durationMs: number;
4362
+ }
4363
+ /**
4364
+ * Complete data structure passed to UI
4365
+ */
4366
+ interface MCPEvalData {
4367
+ runData: MCPEvalRunData;
4368
+ historical: MCPEvalHistoricalSummary[];
4369
+ }
4370
+
4371
+ /**
4372
+ * Context passed to the eval runner
4373
+ */
4374
+ interface EvalContext {
4375
+ /**
4376
+ * MCP fixture API for interacting with the server
4377
+ */
4378
+ mcp: MCPFixtureApi;
4379
+ /**
4380
+ * Optional Playwright TestInfo for reporter integration
4381
+ * When provided, eval results will be attached to the test for the MCP reporter
4382
+ */
4383
+ testInfo?: TestInfo;
4384
+ /**
4385
+ * Optional Playwright expect function for snapshot testing
4386
+ * Required for snapshot expectations to work properly
4387
+ */
4388
+ expect?: Expect;
4389
+ }
4390
+
3205
4391
  /**
3206
4392
  * Overall result of running an eval dataset
3207
4393
  */
@@ -3221,11 +4407,48 @@ interface EvalRunnerResult {
3221
4407
  /**
3222
4408
  * Individual case results
3223
4409
  */
3224
- caseResults: Array<EvalCaseResult$1>;
4410
+ caseResults: Array<EvalCaseResult>;
3225
4411
  /**
3226
4412
  * Overall execution time in milliseconds
3227
4413
  */
3228
4414
  durationMs: number;
4415
+ /**
4416
+ * Difference between current pass rate and baseline pass rate.
4417
+ * Positive = improvement, negative = regression.
4418
+ * Only present when `baselineResultsFrom` was provided.
4419
+ */
4420
+ deltaPassRate?: number;
4421
+ /**
4422
+ * Number of cases that regressed: passed in baseline, failed now.
4423
+ * Only present when `baselineResultsFrom` was provided.
4424
+ */
4425
+ regressions?: number;
4426
+ /**
4427
+ * Number of cases that improved: failed in baseline, passed now.
4428
+ * Only present when `baselineResultsFrom` was provided.
4429
+ */
4430
+ improvements?: number;
4431
+ /**
4432
+ * Average tool precision across all llm_host cases that have a
4433
+ * `toolsTriggered` expectation (precision = fraction of called tools
4434
+ * that were expected). Only present when at least one such case ran.
4435
+ */
4436
+ datasetToolPrecision?: number;
4437
+ /**
4438
+ * Average tool recall across all llm_host cases that have a
4439
+ * `toolsTriggered` expectation (recall = fraction of required tools
4440
+ * that were actually called). Only present when at least one such case ran.
4441
+ */
4442
+ datasetToolRecall?: number;
4443
+ /**
4444
+ * Harmonic mean of `datasetToolPrecision` and `datasetToolRecall`.
4445
+ * Only present when at least one case contributes precision/recall data.
4446
+ */
4447
+ datasetToolF1?: number;
4448
+ /**
4449
+ * Experiment tracking metadata captured at run time.
4450
+ */
4451
+ metadata?: EvalRunMetadata;
3229
4452
  }
3230
4453
  /**
3231
4454
  * Options for running eval dataset
@@ -3251,12 +4474,6 @@ interface EvalRunnerOptions {
3251
4474
  * ```
3252
4475
  */
3253
4476
  schemas?: Record<string, ZodType>;
3254
- /**
3255
- * Judge configuration registry by ID
3256
- *
3257
- * Maps config IDs to JudgeConfig for use with expect.passesJudge.configId
3258
- */
3259
- judgeConfigs?: Record<string, JudgeConfig>;
3260
4477
  /**
3261
4478
  * Whether to stop on first failure
3262
4479
  * @default false
@@ -3265,7 +4482,71 @@ interface EvalRunnerOptions {
3265
4482
  /**
3266
4483
  * Optional callback called after each case
3267
4484
  */
3268
- onCaseComplete?: (result: EvalCaseResult$1) => void | Promise<void>;
4485
+ onCaseComplete?: (result: EvalCaseResult) => void | Promise<void>;
4486
+ /**
4487
+ * Maximum number of eval cases to run concurrently.
4488
+ * When > 1, cases run in parallel (ignores stopOnFailure ordering).
4489
+ * @default 1 (sequential)
4490
+ */
4491
+ concurrency?: number;
4492
+ /**
4493
+ * Default iteration count for `llm_host` mode cases that do not specify
4494
+ * `iterations` explicitly. Has no effect on `direct` mode cases (which are
4495
+ * deterministic and always default to 1 iteration).
4496
+ *
4497
+ * Set to 10 for standard runs or 20 for release gates. Individual cases can
4498
+ * still override this with their own `iterations` field.
4499
+ *
4500
+ * @default 1 (preserves historical behaviour when not set)
4501
+ *
4502
+ * @example
4503
+ * ```typescript
4504
+ * // Run all llm_host cases 10 times each by default
4505
+ * await runEvalDataset({ dataset, defaultLlmIterations: 10 }, { mcp });
4506
+ * ```
4507
+ */
4508
+ defaultLlmIterations?: number;
4509
+ /**
4510
+ * Default number of judge evaluations for cases that do not specify
4511
+ * `judgeReps` explicitly. Applies to any case with a `passesJudge`
4512
+ * expectation. Per-case `judgeReps` overrides this.
4513
+ *
4514
+ * @default 1 (single judge run)
4515
+ */
4516
+ defaultJudgeReps?: number;
4517
+ /**
4518
+ * When set, only eval cases whose `tags` array contains at least one of
4519
+ * the specified tags are run. Cases without a `tags` field are excluded.
4520
+ * When undefined or empty, all cases run (default behavior).
4521
+ */
4522
+ filterTags?: string[];
4523
+ /**
4524
+ * If set, saves the run results to this file path after completion.
4525
+ * Use with `baselineResultsFrom` on the next run for regression detection.
4526
+ *
4527
+ * @example '.mcp-test-results/baseline.json'
4528
+ */
4529
+ saveResultsTo?: string;
4530
+ /**
4531
+ * If set, loads this file as the baseline and computes delta metrics vs the current run.
4532
+ * Populates `EvalRunnerResult.deltaPassRate`, `.regressions`, `.improvements`,
4533
+ * and tags each `EvalCaseResult.baselinePass`.
4534
+ */
4535
+ baselineResultsFrom?: string;
4536
+ /**
4537
+ * LLM host model identifier to record in run metadata.
4538
+ * Use this to identify which model was used when running llm_host cases.
4539
+ *
4540
+ * @example 'claude-opus-4-20250514'
4541
+ */
4542
+ llmHostModel?: string;
4543
+ /**
4544
+ * Judge model identifier to record in run metadata.
4545
+ * Use this to identify which model was used for judge evaluations.
4546
+ *
4547
+ * @example 'claude-sonnet-4-20250514'
4548
+ */
4549
+ judgeModel?: string;
3269
4550
  }
3270
4551
  /**
3271
4552
  * Options for running a single eval case
@@ -3279,17 +4560,14 @@ interface EvalCaseOptions {
3279
4560
  * Schema registry for schema validation by name
3280
4561
  */
3281
4562
  schemas?: Record<string, ZodType>;
3282
- /**
3283
- * Judge configuration registry by ID
3284
- */
3285
- judgeConfigs?: Record<string, JudgeConfig>;
3286
4563
  }
3287
4564
  /**
3288
- * Runs a single eval case and returns the result
4565
+ * Runs a single eval case and returns the result.
4566
+ * When `evalCase.iterations > 1`, runs the case N times and returns accuracy.
3289
4567
  *
3290
4568
  * @param evalCase - The eval case to run
3291
4569
  * @param context - Context containing mcp, testInfo, expect
3292
- * @param options - Optional configuration (datasetName, schemas, judgeConfigs)
4570
+ * @param options - Optional configuration (datasetName, schemas)
3293
4571
  * @returns The result of running the eval case
3294
4572
  *
3295
4573
  * @example
@@ -3303,131 +4581,166 @@ interface EvalCaseOptions {
3303
4581
  * expect(result.pass).toBe(true);
3304
4582
  * ```
3305
4583
  */
3306
- declare function runEvalCase(evalCase: EvalCase, context: EvalContext, options?: EvalCaseOptions): Promise<EvalCaseResult$1>;
4584
+ declare function runEvalCase(evalCase: EvalCase, context: EvalContext, options?: EvalCaseOptions): Promise<EvalCaseResult>;
4585
+ declare function runEvalDataset(options: EvalRunnerOptions, context: EvalContext): Promise<EvalRunnerResult>;
4586
+
3307
4587
  /**
3308
- * Runs an eval dataset against an MCP server
4588
+ * Saves eval results to a JSON file for use as a baseline in future runs.
3309
4589
  *
3310
- * This function composes runEvalCase() for each case in the dataset,
3311
- * adding dataset-level features like stopOnFailure and callbacks.
4590
+ * @param result - The eval run result to save
4591
+ * @param filePath - Path to write the JSON file (parent dirs created automatically)
4592
+ */
4593
+ declare function saveBaseline(result: EvalRunnerResult, filePath: string): Promise<void>;
4594
+ /**
4595
+ * Loads a previously saved baseline from a JSON file.
3312
4596
  *
3313
- * @param options - Eval runner options (dataset, schemas, judgeConfigs)
3314
- * @param context - Eval context (mcp fixture, optional testInfo, optional expect)
3315
- * @returns Eval results
4597
+ * @param filePath - Path to the JSON file written by saveBaseline
4598
+ * @returns The saved EvalRunnerResult
4599
+ * @throws If the file cannot be read or parsed
4600
+ */
4601
+ declare function loadBaseline(filePath: string): Promise<EvalRunnerResult>;
4602
+
4603
+ /** Outcome of comparing two servers on a single eval case. */
4604
+ type ComparisonOutcome = 'A_WINS' | 'B_WINS' | 'TIE' | 'BOTH_FAIL';
4605
+ /** Result of comparing a single eval case across two servers. */
4606
+ interface CaseComparisonResult {
4607
+ /** Case ID */
4608
+ id: string;
4609
+ /** Comparison outcome */
4610
+ outcome: ComparisonOutcome;
4611
+ /** Result from server A */
4612
+ serverA: EvalCaseResult;
4613
+ /** Result from server B */
4614
+ serverB: EvalCaseResult;
4615
+ }
4616
+ /** Aggregated result of running a dataset against two servers. */
4617
+ interface ServerComparisonResult {
4618
+ /** Dataset name */
4619
+ dataset: string;
4620
+ /** Total cases compared (cases present in both runs) */
4621
+ total: number;
4622
+ /** Cases where server A passed and server B failed */
4623
+ aWins: number;
4624
+ /** Cases where server B passed and server A failed */
4625
+ bWins: number;
4626
+ /** Cases where both passed */
4627
+ ties: number;
4628
+ /** Cases where both failed */
4629
+ bothFail: number;
4630
+ /** Raw count of cases where both servers failed (same as bothFail) */
4631
+ bothFailCount: number;
4632
+ /** Cases with a decisive outcome (aWins + bWins + ties, excludes BOTH_FAIL) */
4633
+ decidedCases: number;
4634
+ /** Fraction of total cases where both servers failed (bothFail / total) */
4635
+ failureAlignment: number;
4636
+ /** A win rate (aWins / decidedCases, excludes BOTH_FAIL) */
4637
+ aWinRate: number;
4638
+ /** B win rate (bWins / decidedCases, excludes BOTH_FAIL) */
4639
+ bWinRate: number;
4640
+ /** Tie rate (ties / decidedCases, excludes BOTH_FAIL) */
4641
+ tieRate: number;
4642
+ /** Per-case comparison results */
4643
+ cases: CaseComparisonResult[];
4644
+ /** Full result from server A */
4645
+ serverAResult: EvalRunnerResult;
4646
+ /** Full result from server B */
4647
+ serverBResult: EvalRunnerResult;
4648
+ /** Total duration in milliseconds */
4649
+ durationMs: number;
4650
+ }
4651
+ /**
4652
+ * Options for `runServerComparison`.
4653
+ * Same as `EvalRunnerOptions` without baseline-specific fields.
4654
+ */
4655
+ type ServerComparisonOptions = Omit<EvalRunnerOptions, 'saveResultsTo' | 'baselineResultsFrom'>;
4656
+ /**
4657
+ * Runs the same eval dataset against two MCP servers in parallel and
4658
+ * returns a detailed per-case comparison of results.
3316
4659
  *
3317
- * @example
3318
- * // Basic usage
3319
- * const result = await runEvalDataset(
3320
- * {
3321
- * dataset,
3322
- * schemas: { WeatherResponse: WeatherSchema },
3323
- * },
3324
- * { mcp }
3325
- * );
4660
+ * Both servers receive identical cases and options. The comparison uses
4661
+ * simple pass/fail per case: A_WINS means A passed and B failed, etc.
4662
+ *
4663
+ * @param options - Eval dataset and runner options (shared between both servers)
4664
+ * @param contextA - MCP context for server A (e.g., Glean MCP)
4665
+ * @param contextB - MCP context for server B (e.g., native MCP)
4666
+ * @returns Comparison result with per-case outcomes and aggregate win rates
3326
4667
  *
3327
4668
  * @example
3328
- * // With MCP reporter integration
3329
- * test('eval dataset', async ({ mcp }, testInfo) => {
3330
- * const result = await runEvalDataset(
3331
- * { dataset },
3332
- * { mcp, testInfo } // testInfo enables MCP reporter
3333
- * );
3334
- * });
4669
+ * ```typescript
4670
+ * const comparison = await runServerComparison(
4671
+ * { dataset },
4672
+ * { mcp: gleanMcpFixture },
4673
+ * { mcp: nativeMcpFixture }
4674
+ * );
4675
+ * console.log(`Glean MCP wins: ${(comparison.aWinRate * 100).toFixed(1)}%`);
4676
+ * console.log(`Native MCP wins: ${(comparison.bWinRate * 100).toFixed(1)}%`);
4677
+ * ```
3335
4678
  */
3336
- declare function runEvalDataset(options: EvalRunnerOptions, context: EvalContext): Promise<EvalRunnerResult>;
4679
+ declare function runServerComparison(options: ServerComparisonOptions, contextA: EvalContext, contextB: EvalContext): Promise<ServerComparisonResult>;
3337
4680
 
3338
4681
  /**
3339
4682
  * LLM Host Simulation - Main entry point
3340
4683
  *
3341
- * Provides the public API for simulating LLM hosts interacting
3342
- * with MCP servers through actual LLM providers.
4684
+ * All providers (openai, anthropic, google, azure, mistral, ollama, deepseek,
4685
+ * openrouter, xai) run through the Vercel AI SDK orchestrator, which uses
4686
+ * generateText + stopWhen for a uniform multi-turn tool-calling loop with
4687
+ * built-in latency decomposition.
4688
+ *
4689
+ * Required packages per provider:
4690
+ * openai → npm install ai @ai-sdk/openai
4691
+ * anthropic → npm install ai @ai-sdk/anthropic
4692
+ * google → npm install ai @ai-sdk/google
4693
+ * azure → npm install ai @ai-sdk/azure
4694
+ * mistral → npm install ai @ai-sdk/mistral
4695
+ * ollama → npm install ai @ai-sdk/ollama (local, no API key)
4696
+ * deepseek → npm install ai @ai-sdk/deepseek
4697
+ * openrouter → npm install ai @openrouter/ai-sdk-provider
4698
+ * xai → npm install ai @ai-sdk/xai
3343
4699
  */
3344
4700
 
3345
4701
  /**
3346
- * Simulates an LLM host interacting with an MCP server
4702
+ * Simulates an LLM host interacting with an MCP server.
4703
+ *
4704
+ * The LLM chooses which tools to call based solely on their descriptions and
4705
+ * schemas, testing discoverability and parameter clarity at the level a real
4706
+ * user (via Claude Desktop, ChatGPT, etc.) would experience.
3347
4707
  *
3348
- * This function uses actual LLM providers (OpenAI or Anthropic) to test
3349
- * MCP servers through natural language scenarios. The LLM chooses which
3350
- * tools to call based on their descriptions, testing discoverability and
3351
- * parameter clarity.
4708
+ * All providers run through the Vercel AI SDK's generateText with stopWhen,
4709
+ * which handles multi-turn tool calling natively and provides per-step latency
4710
+ * decomposition (llmDurationMs vs. mcpDurationMs).
3352
4711
  *
3353
4712
  * @param mcp - MCP fixture API
3354
- * @param scenario - Natural language prompt describing what to do
3355
- * @param config - LLM host configuration
3356
- * @returns Simulation result with tool calls and final response
4713
+ * @param scenario - Natural language prompt describing what the LLM should do
4714
+ * @param config - LLM host configuration (provider, model, temperature, etc.)
4715
+ * @returns Simulation result with tool calls, final response, and latency data
3357
4716
  *
3358
4717
  * @example
3359
4718
  * ```typescript
3360
4719
  * const result = await simulateLLMHost(mcp,
3361
- * "Get the weather for London",
3362
- * {
3363
- * provider: 'openai',
3364
- * model: 'gpt-4o'
3365
- * }
4720
+ * "Find recent documents about MCP testing frameworks",
4721
+ * { provider: 'anthropic', model: 'claude-3-5-sonnet-20241022' }
3366
4722
  * );
3367
4723
  *
3368
4724
  * expect(result.success).toBe(true);
3369
- * expect(result.toolCalls).toContainEqual({
3370
- * name: 'get_weather',
3371
- * arguments: { city: 'London' }
3372
- * });
4725
+ * expect(result.toolCalls.map(c => c.name)).toContain('search');
3373
4726
  * ```
3374
4727
  */
3375
4728
  declare function simulateLLMHost(mcp: MCPFixtureApi, scenario: string, config: LLMHostConfig): Promise<LLMHostSimulationResult>;
3376
4729
  /**
3377
- * Checks if the required SDK is available for a given provider
4730
+ * Returns true if the given provider is supported.
3378
4731
  *
3379
- * This performs a quick check without actually loading the SDK.
3380
- * The actual SDK loading happens in the adapter when simulation runs.
3381
- *
3382
- * @param provider - LLM provider to check
3383
- * @returns true if an adapter is registered for the provider
4732
+ * Note: this does not check whether the required @ai-sdk/* package is
4733
+ * installed — that is validated at simulation time with a helpful error.
3384
4734
  */
3385
4735
  declare function isProviderAvailable(provider: LLMProvider): boolean;
3386
4736
  /**
3387
- * Gets a helpful error message for missing dependencies
4737
+ * Returns a human-readable installation message for a given provider.
3388
4738
  *
3389
- * @param provider - LLM provider
3390
- * @returns Error message with installation instructions
4739
+ * @remarks This is a diagnostic utility for checking whether optional
4740
+ * @ai-sdk/* packages are installed. Not part of the primary usage path.
3391
4741
  */
3392
4742
  declare function getMissingDependencyMessage(provider: LLMProvider): string;
3393
4743
 
3394
- /**
3395
- * Tool call validator for LLM host mode
3396
- *
3397
- * Validates that the LLM made the expected tool calls with correct arguments
3398
- */
3399
-
3400
- /**
3401
- * Tool call validation function signature
3402
- */
3403
- type ToolCallValidator = (evalCase: EvalCase, response: unknown) => Promise<EvalExpectationResult>;
3404
- /**
3405
- * Creates a tool call validator for LLM host mode
3406
- *
3407
- * Validates that the LLM made the expected tool calls with correct arguments.
3408
- * Supports partial argument matching and optional calls.
3409
- *
3410
- * @returns Validator function
3411
- *
3412
- * @example
3413
- * ```typescript
3414
- * // In your eval case:
3415
- * {
3416
- * "id": "weather-london",
3417
- * "mode": "llm_host",
3418
- * "scenario": "Get the weather for London",
3419
- * "expectedToolCalls": [
3420
- * {
3421
- * "name": "get_weather",
3422
- * "arguments": { "city": "London" },
3423
- * "required": true
3424
- * }
3425
- * ]
3426
- * }
3427
- * ```
3428
- */
3429
- declare function createToolCallValidator(): ToolCallValidator;
3430
-
3431
4744
  /**
3432
4745
  * Creates an LLM judge for evaluating tool responses
3433
4746
  *
@@ -3494,7 +4807,7 @@ interface MCPConformanceOptions {
3494
4807
  /**
3495
4808
  * Individual check result
3496
4809
  */
3497
- interface MCPConformanceCheck$1 {
4810
+ interface MCPConformanceCheck {
3498
4811
  name: string;
3499
4812
  pass: boolean;
3500
4813
  message: string;
@@ -3539,7 +4852,7 @@ interface MCPConformanceResult {
3539
4852
  /**
3540
4853
  * List of check results
3541
4854
  */
3542
- checks: MCPConformanceCheck$1[];
4855
+ checks: MCPConformanceCheck[];
3543
4856
  /**
3544
4857
  * Raw MCP responses for snapshotting
3545
4858
  *
@@ -3588,229 +4901,6 @@ interface MCPConformanceResult {
3588
4901
  */
3589
4902
  declare function runConformanceChecks(mcp: MCPFixtureApi, options?: MCPConformanceOptions, testInfo?: TestInfo): Promise<MCPConformanceResult>;
3590
4903
 
3591
- /**
3592
- * Reporter-specific type definitions
3593
- *
3594
- * These types are used by the MCP reporter and UI.
3595
- *
3596
- * @packageDocumentation
3597
- */
3598
-
3599
- /**
3600
- * Individual conformance check result
3601
- */
3602
- interface MCPConformanceCheck {
3603
- /**
3604
- * Check name (e.g., 'server_info_present', 'list_tools_succeeds')
3605
- */
3606
- name: string;
3607
- /**
3608
- * Whether the check passed
3609
- */
3610
- pass: boolean;
3611
- /**
3612
- * Human-readable message describing the result
3613
- */
3614
- message: string;
3615
- }
3616
- /**
3617
- * Conformance check result as stored in reporter data
3618
- */
3619
- interface MCPConformanceResultData {
3620
- /**
3621
- * Test title where conformance check was run
3622
- */
3623
- testTitle: string;
3624
- /**
3625
- * Whether all checks passed
3626
- */
3627
- pass: boolean;
3628
- /**
3629
- * Individual check results
3630
- */
3631
- checks: MCPConformanceCheck[];
3632
- /**
3633
- * Server info if available
3634
- */
3635
- serverInfo?: {
3636
- name?: string;
3637
- version?: string;
3638
- };
3639
- /**
3640
- * Number of tools discovered
3641
- */
3642
- toolCount: number;
3643
- /**
3644
- * Auth type used for this check
3645
- */
3646
- authType?: AuthType;
3647
- /**
3648
- * Project name
3649
- */
3650
- project?: string;
3651
- }
3652
- /**
3653
- * Server capabilities data from mcp-list-tools attachment
3654
- */
3655
- interface MCPServerCapabilitiesData {
3656
- /**
3657
- * Test title where listTools was called
3658
- */
3659
- testTitle: string;
3660
- /**
3661
- * List of tools available on the server
3662
- */
3663
- tools: Array<{
3664
- name: string;
3665
- description?: string;
3666
- }>;
3667
- /**
3668
- * Total number of tools
3669
- */
3670
- toolCount: number;
3671
- /**
3672
- * Auth type used for this test
3673
- */
3674
- authType?: AuthType;
3675
- /**
3676
- * Project name
3677
- */
3678
- project?: string;
3679
- }
3680
- /**
3681
- * Result of a single eval case
3682
- */
3683
- interface EvalCaseResult {
3684
- /**
3685
- * Case ID
3686
- */
3687
- id: string;
3688
- /**
3689
- * Dataset name this case belongs to
3690
- */
3691
- datasetName: string;
3692
- /**
3693
- * MCP tool name that was called
3694
- */
3695
- toolName: string;
3696
- /**
3697
- * Source of this result
3698
- */
3699
- source: ResultSource;
3700
- /**
3701
- * Overall pass/fail status
3702
- */
3703
- pass: boolean;
3704
- /**
3705
- * Tool response
3706
- */
3707
- response?: unknown;
3708
- /**
3709
- * Error if tool call failed
3710
- */
3711
- error?: string;
3712
- /**
3713
- * Expectation results
3714
- */
3715
- expectations: Partial<Record<ExpectationType, EvalExpectationResult>>;
3716
- /**
3717
- * Authentication type used for this test
3718
- */
3719
- authType?: AuthType;
3720
- /**
3721
- * Playwright project name this test belongs to
3722
- */
3723
- project?: string;
3724
- /**
3725
- * Execution time in milliseconds
3726
- */
3727
- durationMs: number;
3728
- /**
3729
- * @deprecated Mode is inferred from test context, not displayed in reports
3730
- */
3731
- mode?: 'direct' | 'llm_host';
3732
- }
3733
- /**
3734
- * Aggregated MCP eval run data
3735
- */
3736
- interface MCPEvalRunData {
3737
- /**
3738
- * Run timestamp (ISO 8601)
3739
- */
3740
- timestamp: string;
3741
- /**
3742
- * Total duration in milliseconds
3743
- */
3744
- durationMs: number;
3745
- /**
3746
- * Environment info
3747
- */
3748
- environment: {
3749
- ci: boolean;
3750
- node: string;
3751
- platform: string;
3752
- };
3753
- /**
3754
- * Aggregate metrics
3755
- */
3756
- metrics: {
3757
- /**
3758
- * Total number of eval cases
3759
- */
3760
- total: number;
3761
- /**
3762
- * Number of passed cases
3763
- */
3764
- passed: number;
3765
- /**
3766
- * Number of failed cases
3767
- */
3768
- failed: number;
3769
- /**
3770
- * Pass rate (0-1)
3771
- */
3772
- passRate: number;
3773
- /**
3774
- * Dataset breakdown: dataset name -> count
3775
- */
3776
- datasetBreakdown: Record<string, number>;
3777
- /**
3778
- * Expectation type breakdown
3779
- */
3780
- expectationBreakdown: ExpectationBreakdown;
3781
- };
3782
- /**
3783
- * All eval results from this run
3784
- */
3785
- results: EvalCaseResult[];
3786
- /**
3787
- * Conformance check results (optional)
3788
- */
3789
- conformanceChecks?: MCPConformanceResultData[];
3790
- /**
3791
- * Server capabilities discovered via listTools (optional)
3792
- */
3793
- serverCapabilities?: MCPServerCapabilitiesData[];
3794
- }
3795
- /**
3796
- * Historical summary for trend charts
3797
- */
3798
- interface MCPEvalHistoricalSummary {
3799
- timestamp: string;
3800
- total: number;
3801
- passed: number;
3802
- failed: number;
3803
- passRate: number;
3804
- durationMs: number;
3805
- }
3806
- /**
3807
- * Complete data structure passed to UI
3808
- */
3809
- interface MCPEvalData {
3810
- runData: MCPEvalRunData;
3811
- historical: MCPEvalHistoricalSummary[];
3812
- }
3813
-
3814
4904
  /**
3815
4905
  * Reporter types - re-exported from canonical source
3816
4906
  *
@@ -3831,7 +4921,7 @@ interface MCPEvalReporterConfig {
3831
4921
  outputDir?: string;
3832
4922
  /**
3833
4923
  * Auto-open report in browser after test run
3834
- * @default true (disabled in CI)
4924
+ * @default false
3835
4925
  */
3836
4926
  autoOpen?: boolean;
3837
4927
  /**
@@ -3854,4 +4944,4 @@ interface MCPEvalReporterConfig {
3854
4944
  includeAutoTracking?: boolean;
3855
4945
  }
3856
4946
 
3857
- export { type AuthType, type BuiltInSanitizer, CLIOAuthClient, type CLIOAuthClientConfig, type CLIOAuthResult, type ContentBlock, type CreateMCPClientOptions, DiscoveryError, ENV_VAR_NAMES, type EvalCase, type EvalCaseResult$1 as EvalCaseResult, EvalCaseSchema, type EvalContext, type EvalDataset, EvalDatasetSchema, type EvalExpectBlock, type EvalExpectationResult, type EvalMode, type EvalRunnerOptions, type EvalRunnerResult, type ExpectationBreakdown, type ExpectationResultMap, type ExpectationType, type ExpectedToolCall, type FieldRemovalSanitizer, type Judge, type JudgeConfig, type JudgeMatcherOptions, type JudgeResult, type LLMHostConfig, type LLMHostSimulationResult, type LLMHostSimulator, type LLMProvider, type LLMToolCall, type LoadDatasetOptions, type MCPAuthConfig, type MCPConfig, MCPConfigSchema, type MCPConformanceCheck$1 as MCPConformanceCheck, type MCPConformanceOptions, type MCPConformanceRaw, type MCPConformanceResult, type MCPConformanceResultData, type MCPEvalData, type MCPEvalHistoricalSummary, type MCPEvalReporterConfig, type MCPEvalRunData, type MCPFixtureApi, type MCPFixtureOptions, type MCPHostCapabilities, type MCPOAuthConfig, type MCPServerCapabilitiesData, MCP_PROTOCOL_VERSION, type NormalizedToolResponse, type OAuthSetupConfig, type PatternValidatorOptions, PlaywrightOAuthClientProvider, type PlaywrightOAuthClientProviderConfig, type PredicateResult, type ProtectedResourceDiscoveryResult, type ProtectedResourceMetadata, type ProviderKind, type RegexSanitizer, type ResultSource, type SchemaRegistry, type SchemaValidatorOptions, type SerializedEvalDataset, type SizeValidatorOptions, type SnapshotSanitizer, type StoredClientInfo, type StoredOAuthState, type StoredServerMetadata, type StoredTokens, type TextValidatorOptions, type TokenResult, type ToolCallValidationResult, type ToolCallValidator, type ToolPredicate, type UsageMetrics, type ValidationResult, closeMCPClient, createJudge, createMCPClientForConfig, createMCPFixture, 
createTokenAuthHeaders, createToolCallValidator, discoverAuthorizationServer, discoverProtectedResource, expect, extractText, extractText as extractTextFromResponse, getMissingDependencyMessage, getResponseSizeBytes, hasValidTokens, injectTokens, isHttpConfig, isProviderAvailable, isStdioConfig, isTokenExpired, isTokenExpiringSoon, loadEvalDataset, loadEvalDatasetFromObject, loadTokens, loadTokensFromEnv, normalizeToolResponse, normalizeWhitespace, performOAuthSetup, performOAuthSetupIfNeeded, runConformanceChecks, runEvalCase, runEvalDataset, simulateLLMHost, test, validateAccessToken, validateError, validateEvalCase, validateEvalDataset, validateMCPConfig, validatePattern, validateResponse, validateSchema, validateSize, validateText };
4947
+ export { type AuthType, BUILT_IN_RUBRICS, type BuiltInRubric, type BuiltInSanitizer, CLIOAuthClient, type CLIOAuthClientConfig, type CLIOAuthResult, type CaseComparisonResult, type ClientCredentialsConfig, type ComparisonOutcome, type ContentBlock, type CreateMCPClientOptions, DiscoveryError, ENV_VAR_NAMES, type EvalCase, type EvalCaseResult, EvalCaseSchema, type EvalContext, type EvalDataset, EvalDatasetSchema, type EvalExpectBlock, type EvalExpectationResult, type EvalMode, type EvalRunnerOptions, type EvalRunnerResult, type ExpectationBreakdown, type ExpectationResultMap, type ExpectationType, type FieldRemovalSanitizer, type HttpMCPConfig, type IterationResult, type Judge, type JudgeConfig, type JudgeMatcherOptions, type JudgeResult, type JudgeValidatorConfig, type LLMHostConfig, type LLMHostSimulationResult, type LLMHostSimulator, type LLMProvider, type LLMToolCall, type LoadDatasetOptions, type MCPAuthConfig, type MCPClientCredentialsConfig, type MCPConfig, MCPConfigSchema, type MCPConformanceCheck, type MCPConformanceOptions, type MCPConformanceRaw, type MCPConformanceResult, type MCPConformanceResultData, type MCPEvalData, type MCPEvalHistoricalSummary, type MCPEvalReporterConfig, type MCPEvalRunData, type MCPFixtureApi, type MCPFixtureOptions, type MCPHostCapabilities, type MCPOAuthConfig, type MCPServerCapabilitiesData, MCP_PROTOCOL_VERSION, type NormalizedToolResponse, type OAuthSetupConfig, type PatternValidatorOptions, PlaywrightOAuthClientProvider, type PlaywrightOAuthClientProviderConfig, type PredicateResult, type ProtectedResourceDiscoveryResult, type ProtectedResourceMetadata, type ProviderKind, type RegexSanitizer, type ResultSource, type RubricSpec, type SchemaRegistry, type SchemaValidatorOptions, type SerializedEvalDataset, type ServerComparisonOptions, type ServerComparisonResult, type SizeValidatorOptions, type SnapshotSanitizer, SnapshotSanitizers, type StdioMCPConfig, type StoredClientInfo, type StoredOAuthState, type 
StoredServerMetadata, type StoredTokens, type TextValidatorOptions, type TokenResult, type ToolCallCountOptions, type ToolCallExpectation, type ToolPredicate, type UsageMetrics, type ValidationResult, closeMCPClient, createJudge, createMCPClientForConfig, createMCPFixture, createTokenAuthHeaders, discoverAuthorizationServer, discoverProtectedResource, expect, extractText, getMissingDependencyMessage, getResponseSizeBytes, hasValidTokens, injectTokens, isBuiltInRubric, isHttpConfig, isProviderAvailable, isStdioConfig, isTokenExpired, isTokenExpiringSoon, loadBaseline, loadEvalDataset, loadEvalDatasetFromObject, loadTokens, loadTokensFromEnv, normalizeToolResponse, normalizeWhitespace, performClientCredentialsFlow, performOAuthSetup, performOAuthSetupIfNeeded, resolveRubric, runConformanceChecks, runEvalCase, runEvalDataset, runServerComparison, saveBaseline, simulateLLMHost, test, validateAccessToken, validateError, validateEvalCase, validateEvalDataset, validateJudge, validateMCPConfig, validatePattern, validateResponse, validateSchema, validateSize, validateText, validateToolCallCount, validateToolCalls };