@gleanwork/mcp-server-tester 0.12.0 → 1.0.0-beta.1

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
package/dist/index.d.cts CHANGED
@@ -42,6 +42,28 @@ interface MCPOAuthConfig {
42
42
  */
43
43
  redirectUri?: string;
44
44
  }
45
+ /**
46
+ * OAuth 2.1 client credentials configuration for machine-to-machine (CI/CD) authentication.
47
+ * Credentials can be provided here or via MCP_CLIENT_ID/MCP_CLIENT_SECRET environment variables.
48
+ */
49
+ interface MCPClientCredentialsConfig {
50
+ /**
51
+ * OAuth client ID (falls back to MCP_CLIENT_ID env var)
52
+ */
53
+ clientId?: string;
54
+ /**
55
+ * OAuth client secret (falls back to MCP_CLIENT_SECRET env var)
56
+ */
57
+ clientSecret?: string;
58
+ /**
59
+ * Token endpoint URL (optional in this type, but needed to perform the client credentials flow)
60
+ */
61
+ tokenEndpoint?: string;
62
+ /**
63
+ * Scopes to request
64
+ */
65
+ scopes?: string[];
66
+ }
45
67
  /**
46
68
  * Authentication configuration for MCP connections
47
69
  */
@@ -54,6 +76,10 @@ interface MCPAuthConfig {
54
76
  * Full OAuth configuration for browser-based authentication
55
77
  */
56
78
  oauth?: MCPOAuthConfig;
79
+ /**
80
+ * OAuth 2.1 client credentials grant for machine-to-machine authentication
81
+ */
82
+ clientCredentials?: MCPClientCredentialsConfig;
57
83
  }
58
84
  /**
59
85
  * MCP host capabilities that can be registered with the server
@@ -74,35 +100,35 @@ interface MCPHostCapabilities {
74
100
  };
75
101
  }
76
102
  /**
77
- * Configuration for MCP client connection
78
- *
79
- * Supports both stdio (local) and HTTP (remote) transports
103
+ * Configuration for MCP client connection via stdio transport (local process)
80
104
  */
81
- interface MCPConfig {
105
+ interface StdioMCPConfig {
82
106
  /**
83
- * Transport type
107
+ * Transport type discriminant
84
108
  */
85
- transport: 'http' | 'stdio';
109
+ transport: 'stdio';
86
110
  /**
87
- * Server URL (required when transport === 'http')
111
+ * Command to execute (required for stdio transport)
88
112
  */
89
- serverUrl?: string;
113
+ command: string;
90
114
  /**
91
- * HTTP headers (optional for http transport, e.g., Authorization)
115
+ * Command arguments
92
116
  */
93
- headers?: Record<string, string>;
117
+ args?: Array<string>;
94
118
  /**
95
- * Command to execute (required when transport === 'stdio')
119
+ * Working directory for the command
96
120
  */
97
- command?: string;
121
+ cwd?: string;
98
122
  /**
99
- * Command arguments (optional for stdio)
123
+ * Environment variables to pass to the subprocess.
124
+ * Merged with the current process environment.
100
125
  */
101
- args?: Array<string>;
126
+ env?: Record<string, string>;
102
127
  /**
103
- * Working directory for the command (optional for stdio)
128
+ * Suppress stderr output from the server process.
129
+ * When true, server stderr is ignored instead of inherited.
104
130
  */
105
- cwd?: string;
131
+ quiet?: boolean;
106
132
  /**
107
133
  * Host capabilities to register with the server
108
134
  */
@@ -116,15 +142,94 @@ interface MCPConfig {
116
142
  */
117
143
  requestTimeoutMs?: number;
118
144
  /**
119
- * Suppress stderr output from the server process (stdio only)
120
- * When true, server stderr is ignored instead of inherited
145
+ * Timeout in milliseconds for MCP tool/list operations. Default: 30000
121
146
  */
122
- quiet?: boolean;
147
+ callTimeoutMs?: number;
148
+ }
149
+ /**
150
+ * Configuration for MCP client connection via HTTP transport (remote server)
151
+ */
152
+ interface HttpMCPConfig {
153
+ /**
154
+ * Transport type discriminant
155
+ */
156
+ transport: 'http';
123
157
  /**
124
- * Authentication configuration (optional for http transport)
158
+ * Server URL (required for http transport)
159
+ */
160
+ serverUrl: string;
161
+ /**
162
+ * HTTP headers (e.g., Authorization)
163
+ */
164
+ headers?: Record<string, string>;
165
+ /**
166
+ * Authentication configuration
125
167
  */
126
168
  auth?: MCPAuthConfig;
169
+ /**
170
+ * Host capabilities to register with the server
171
+ */
172
+ capabilities?: MCPHostCapabilities;
173
+ /**
174
+ * Connection timeout in milliseconds
175
+ */
176
+ connectTimeoutMs?: number;
177
+ /**
178
+ * Request timeout in milliseconds
179
+ */
180
+ requestTimeoutMs?: number;
181
+ /**
182
+ * Timeout in milliseconds for MCP tool/list operations. Default: 30000
183
+ */
184
+ callTimeoutMs?: number;
185
+ /**
186
+ * HTTP proxy configuration. Falls back to HTTPS_PROXY/HTTP_PROXY environment variables.
187
+ */
188
+ proxy?: {
189
+ /**
190
+ * Proxy URL. Credentials can be embedded directly if required:
191
+ * `http://user:pass@proxy.example.com:8080`
192
+ */
193
+ url: string;
194
+ };
195
+ /**
196
+ * Number of retry attempts for transient connection failures and 429 rate limit responses.
197
+ * Uses exponential backoff with Retry-After header awareness. Defaults to 0 (no retries).
198
+ */
199
+ retryAttempts?: number;
200
+ /**
201
+ * TLS/mTLS configuration for custom certificates or disabling cert validation.
202
+ * File paths should point to PEM-encoded certificate files.
203
+ */
204
+ tls?: {
205
+ /**
206
+ * Path to CA certificate PEM file (for custom/self-signed CAs)
207
+ */
208
+ ca?: string;
209
+ /**
210
+ * Path to client certificate PEM file (for mutual TLS)
211
+ */
212
+ cert?: string;
213
+ /**
214
+ * Path to client private key PEM file (for mutual TLS)
215
+ */
216
+ key?: string;
217
+ /**
218
+ * Whether to reject unauthorized certificates. Defaults to true.
219
+ * Set to false to disable certificate validation (not recommended for production).
220
+ */
221
+ rejectUnauthorized?: boolean;
222
+ };
127
223
  }
224
+ /**
225
+ * Configuration for MCP client connection.
226
+ *
227
+ * This is a discriminated union — narrow with `isStdioConfig()` or `isHttpConfig()`
228
+ * before accessing transport-specific fields.
229
+ *
230
+ * Supports both stdio (local) and HTTP (remote) transports.
231
+ */
232
+ type MCPConfig = StdioMCPConfig | HttpMCPConfig;
128
233
  /**
129
234
  * Union schema for MCPConfig (validates based on transport type)
130
235
  */
@@ -133,6 +238,7 @@ declare const MCPConfigSchema: z.ZodDiscriminatedUnion<"transport", [z.ZodObject
133
238
  command: z.ZodString;
134
239
  args: z.ZodOptional<z.ZodArray<z.ZodString, "many">>;
135
240
  cwd: z.ZodOptional<z.ZodString>;
241
+ env: z.ZodOptional<z.ZodRecord<z.ZodString, z.ZodString>>;
136
242
  capabilities: z.ZodOptional<z.ZodObject<{
137
243
  sampling: z.ZodOptional<z.ZodRecord<z.ZodString, z.ZodUnknown>>;
138
244
  roots: z.ZodOptional<z.ZodObject<{
@@ -155,12 +261,14 @@ declare const MCPConfigSchema: z.ZodDiscriminatedUnion<"transport", [z.ZodObject
155
261
  }>>;
156
262
  connectTimeoutMs: z.ZodOptional<z.ZodNumber>;
157
263
  requestTimeoutMs: z.ZodOptional<z.ZodNumber>;
264
+ callTimeoutMs: z.ZodOptional<z.ZodNumber>;
158
265
  quiet: z.ZodOptional<z.ZodBoolean>;
159
266
  }, "strip", z.ZodTypeAny, {
160
267
  transport: "stdio";
161
268
  command: string;
162
269
  args?: string[] | undefined;
163
270
  cwd?: string | undefined;
271
+ env?: Record<string, string> | undefined;
164
272
  capabilities?: {
165
273
  sampling?: Record<string, unknown> | undefined;
166
274
  roots?: {
@@ -169,12 +277,14 @@ declare const MCPConfigSchema: z.ZodDiscriminatedUnion<"transport", [z.ZodObject
169
277
  } | undefined;
170
278
  connectTimeoutMs?: number | undefined;
171
279
  requestTimeoutMs?: number | undefined;
280
+ callTimeoutMs?: number | undefined;
172
281
  quiet?: boolean | undefined;
173
282
  }, {
174
283
  transport: "stdio";
175
284
  command: string;
176
285
  args?: string[] | undefined;
177
286
  cwd?: string | undefined;
287
+ env?: Record<string, string> | undefined;
178
288
  capabilities?: {
179
289
  sampling?: Record<string, unknown> | undefined;
180
290
  roots?: {
@@ -183,10 +293,11 @@ declare const MCPConfigSchema: z.ZodDiscriminatedUnion<"transport", [z.ZodObject
183
293
  } | undefined;
184
294
  connectTimeoutMs?: number | undefined;
185
295
  requestTimeoutMs?: number | undefined;
296
+ callTimeoutMs?: number | undefined;
186
297
  quiet?: boolean | undefined;
187
298
  }>, z.ZodObject<{
188
299
  transport: z.ZodLiteral<"http">;
189
- serverUrl: z.ZodString;
300
+ serverUrl: z.ZodEffects<z.ZodString, string, string>;
190
301
  headers: z.ZodOptional<z.ZodRecord<z.ZodString, z.ZodString>>;
191
302
  capabilities: z.ZodOptional<z.ZodObject<{
192
303
  sampling: z.ZodOptional<z.ZodRecord<z.ZodString, z.ZodUnknown>>;
@@ -210,6 +321,7 @@ declare const MCPConfigSchema: z.ZodDiscriminatedUnion<"transport", [z.ZodObject
210
321
  }>>;
211
322
  connectTimeoutMs: z.ZodOptional<z.ZodNumber>;
212
323
  requestTimeoutMs: z.ZodOptional<z.ZodNumber>;
324
+ callTimeoutMs: z.ZodOptional<z.ZodNumber>;
213
325
  auth: z.ZodOptional<z.ZodEffects<z.ZodObject<{
214
326
  accessToken: z.ZodOptional<z.ZodString>;
215
327
  oauth: z.ZodOptional<z.ZodObject<{
@@ -237,6 +349,22 @@ declare const MCPConfigSchema: z.ZodDiscriminatedUnion<"transport", [z.ZodObject
237
349
  clientSecret?: string | undefined;
238
350
  redirectUri?: string | undefined;
239
351
  }>>;
352
+ clientCredentials: z.ZodOptional<z.ZodObject<{
353
+ clientId: z.ZodOptional<z.ZodString>;
354
+ clientSecret: z.ZodOptional<z.ZodString>;
355
+ tokenEndpoint: z.ZodOptional<z.ZodString>;
356
+ scopes: z.ZodOptional<z.ZodArray<z.ZodString, "many">>;
357
+ }, "strip", z.ZodTypeAny, {
358
+ scopes?: string[] | undefined;
359
+ clientId?: string | undefined;
360
+ clientSecret?: string | undefined;
361
+ tokenEndpoint?: string | undefined;
362
+ }, {
363
+ scopes?: string[] | undefined;
364
+ clientId?: string | undefined;
365
+ clientSecret?: string | undefined;
366
+ tokenEndpoint?: string | undefined;
367
+ }>>;
240
368
  }, "strip", z.ZodTypeAny, {
241
369
  accessToken?: string | undefined;
242
370
  oauth?: {
@@ -248,6 +376,12 @@ declare const MCPConfigSchema: z.ZodDiscriminatedUnion<"transport", [z.ZodObject
248
376
  clientSecret?: string | undefined;
249
377
  redirectUri?: string | undefined;
250
378
  } | undefined;
379
+ clientCredentials?: {
380
+ scopes?: string[] | undefined;
381
+ clientId?: string | undefined;
382
+ clientSecret?: string | undefined;
383
+ tokenEndpoint?: string | undefined;
384
+ } | undefined;
251
385
  }, {
252
386
  accessToken?: string | undefined;
253
387
  oauth?: {
@@ -259,6 +393,12 @@ declare const MCPConfigSchema: z.ZodDiscriminatedUnion<"transport", [z.ZodObject
259
393
  clientSecret?: string | undefined;
260
394
  redirectUri?: string | undefined;
261
395
  } | undefined;
396
+ clientCredentials?: {
397
+ scopes?: string[] | undefined;
398
+ clientId?: string | undefined;
399
+ clientSecret?: string | undefined;
400
+ tokenEndpoint?: string | undefined;
401
+ } | undefined;
262
402
  }>, {
263
403
  accessToken?: string | undefined;
264
404
  oauth?: {
@@ -270,6 +410,12 @@ declare const MCPConfigSchema: z.ZodDiscriminatedUnion<"transport", [z.ZodObject
270
410
  clientSecret?: string | undefined;
271
411
  redirectUri?: string | undefined;
272
412
  } | undefined;
413
+ clientCredentials?: {
414
+ scopes?: string[] | undefined;
415
+ clientId?: string | undefined;
416
+ clientSecret?: string | undefined;
417
+ tokenEndpoint?: string | undefined;
418
+ } | undefined;
273
419
  }, {
274
420
  accessToken?: string | undefined;
275
421
  oauth?: {
@@ -281,6 +427,36 @@ declare const MCPConfigSchema: z.ZodDiscriminatedUnion<"transport", [z.ZodObject
281
427
  clientSecret?: string | undefined;
282
428
  redirectUri?: string | undefined;
283
429
  } | undefined;
430
+ clientCredentials?: {
431
+ scopes?: string[] | undefined;
432
+ clientId?: string | undefined;
433
+ clientSecret?: string | undefined;
434
+ tokenEndpoint?: string | undefined;
435
+ } | undefined;
436
+ }>>;
437
+ proxy: z.ZodOptional<z.ZodObject<{
438
+ url: z.ZodString;
439
+ }, "strip", z.ZodTypeAny, {
440
+ url: string;
441
+ }, {
442
+ url: string;
443
+ }>>;
444
+ retryAttempts: z.ZodOptional<z.ZodNumber>;
445
+ tls: z.ZodOptional<z.ZodObject<{
446
+ ca: z.ZodOptional<z.ZodString>;
447
+ cert: z.ZodOptional<z.ZodString>;
448
+ key: z.ZodOptional<z.ZodString>;
449
+ rejectUnauthorized: z.ZodOptional<z.ZodBoolean>;
450
+ }, "strip", z.ZodTypeAny, {
451
+ ca?: string | undefined;
452
+ cert?: string | undefined;
453
+ key?: string | undefined;
454
+ rejectUnauthorized?: boolean | undefined;
455
+ }, {
456
+ ca?: string | undefined;
457
+ cert?: string | undefined;
458
+ key?: string | undefined;
459
+ rejectUnauthorized?: boolean | undefined;
284
460
  }>>;
285
461
  }, "strip", z.ZodTypeAny, {
286
462
  serverUrl: string;
@@ -293,6 +469,7 @@ declare const MCPConfigSchema: z.ZodDiscriminatedUnion<"transport", [z.ZodObject
293
469
  } | undefined;
294
470
  connectTimeoutMs?: number | undefined;
295
471
  requestTimeoutMs?: number | undefined;
472
+ callTimeoutMs?: number | undefined;
296
473
  headers?: Record<string, string> | undefined;
297
474
  auth?: {
298
475
  accessToken?: string | undefined;
@@ -305,6 +482,22 @@ declare const MCPConfigSchema: z.ZodDiscriminatedUnion<"transport", [z.ZodObject
305
482
  clientSecret?: string | undefined;
306
483
  redirectUri?: string | undefined;
307
484
  } | undefined;
485
+ clientCredentials?: {
486
+ scopes?: string[] | undefined;
487
+ clientId?: string | undefined;
488
+ clientSecret?: string | undefined;
489
+ tokenEndpoint?: string | undefined;
490
+ } | undefined;
491
+ } | undefined;
492
+ proxy?: {
493
+ url: string;
494
+ } | undefined;
495
+ retryAttempts?: number | undefined;
496
+ tls?: {
497
+ ca?: string | undefined;
498
+ cert?: string | undefined;
499
+ key?: string | undefined;
500
+ rejectUnauthorized?: boolean | undefined;
308
501
  } | undefined;
309
502
  }, {
310
503
  serverUrl: string;
@@ -317,6 +510,7 @@ declare const MCPConfigSchema: z.ZodDiscriminatedUnion<"transport", [z.ZodObject
317
510
  } | undefined;
318
511
  connectTimeoutMs?: number | undefined;
319
512
  requestTimeoutMs?: number | undefined;
513
+ callTimeoutMs?: number | undefined;
320
514
  headers?: Record<string, string> | undefined;
321
515
  auth?: {
322
516
  accessToken?: string | undefined;
@@ -329,6 +523,22 @@ declare const MCPConfigSchema: z.ZodDiscriminatedUnion<"transport", [z.ZodObject
329
523
  clientSecret?: string | undefined;
330
524
  redirectUri?: string | undefined;
331
525
  } | undefined;
526
+ clientCredentials?: {
527
+ scopes?: string[] | undefined;
528
+ clientId?: string | undefined;
529
+ clientSecret?: string | undefined;
530
+ tokenEndpoint?: string | undefined;
531
+ } | undefined;
532
+ } | undefined;
533
+ proxy?: {
534
+ url: string;
535
+ } | undefined;
536
+ retryAttempts?: number | undefined;
537
+ tls?: {
538
+ ca?: string | undefined;
539
+ cert?: string | undefined;
540
+ key?: string | undefined;
541
+ rejectUnauthorized?: boolean | undefined;
332
542
  } | undefined;
333
543
  }>]>;
334
544
  /**
@@ -342,17 +552,11 @@ declare function validateMCPConfig(config: unknown): MCPConfig;
342
552
  /**
343
553
  * Type guard to check if a config is for stdio transport
344
554
  */
345
- declare function isStdioConfig(config: MCPConfig): config is MCPConfig & {
346
- transport: 'stdio';
347
- command: string;
348
- };
555
+ declare function isStdioConfig(config: MCPConfig): config is StdioMCPConfig;
349
556
  /**
350
557
  * Type guard to check if a config is for HTTP transport
351
558
  */
352
- declare function isHttpConfig(config: MCPConfig): config is MCPConfig & {
353
- transport: 'http';
354
- serverUrl: string;
355
- };
559
+ declare function isHttpConfig(config: MCPConfig): config is HttpMCPConfig;
356
560
 
357
561
  /**
358
562
  * Auth types for MCP OAuth integration
@@ -601,6 +805,9 @@ declare class PlaywrightOAuthClientProvider implements OAuthClientProvider {
601
805
  tokens(): Promise<OAuthTokens | undefined>;
602
806
  /**
603
807
  * Stores new OAuth tokens for the current session
808
+ *
809
+ * The code verifier is cleared after a successful token exchange — it is
810
+ * single-use per PKCE spec and must not persist beyond the exchange.
604
811
  */
605
812
  saveTokens(tokens: OAuthTokens): Promise<void>;
606
813
  /**
@@ -757,6 +964,38 @@ interface AuthServerMetadata {
757
964
  */
758
965
  issuer: string;
759
966
  }
967
+ /**
968
+ * Configuration for client credentials grant
969
+ */
970
+ interface ClientCredentialsConfig {
971
+ /**
972
+ * Token endpoint URL
973
+ */
974
+ tokenEndpoint: string;
975
+ /**
976
+ * OAuth client ID
977
+ */
978
+ clientId: string;
979
+ /**
980
+ * OAuth client secret
981
+ */
982
+ clientSecret: string;
983
+ /**
984
+ * Scopes to request (optional)
985
+ */
986
+ scopes?: string[];
987
+ }
988
+ /**
989
+ * Performs the OAuth 2.1 client credentials grant to obtain an access token.
990
+ * Suitable for CI/CD machine-to-machine authentication.
991
+ *
992
+ * Uses oauth4webapi for spec-compliant request construction and response validation,
993
+ * consistent with how the rest of this module handles OAuth flows.
994
+ *
995
+ * @param config - Client credentials configuration
996
+ * @returns Token result
997
+ */
998
+ declare function performClientCredentialsFlow(config: ClientCredentialsConfig): Promise<TokenResult>;
760
999
 
761
1000
  /**
762
1001
  * OAuth Protected Resource and Authorization Server discovery
@@ -915,8 +1154,9 @@ declare function injectTokens(serverUrl: string, tokens: StoredTokens, stateDir?
915
1154
  * ```typescript
916
1155
  * // After running: npx mcp-server-tester login https://api.example.com/mcp
917
1156
  * const tokens = await loadTokens('https://api.example.com/mcp');
918
- * if (tokens) {
919
- * console.log('Access token:', tokens.accessToken);
1157
+ * if (tokens?.accessToken) {
1158
+ * // Use the token — never log raw token values
1159
+ * headers.Authorization = `Bearer ${tokens.accessToken}`;
920
1160
  * }
921
1161
  * ```
922
1162
  */
@@ -1127,6 +1367,14 @@ interface CreateMCPClientOptions {
1127
1367
  * This takes precedence over static token auth in config.auth.accessToken.
1128
1368
  */
1129
1369
  authProvider?: OAuthClientProvider;
1370
+ /**
1371
+ * Sampling handler callback for LLM sampling requests from the server.
1372
+ *
1373
+ * When provided, the client will advertise sampling capability to the server.
1374
+ * When absent, sampling is removed from declared capabilities so the client
1375
+ * does not falsely advertise support it cannot fulfill.
1376
+ */
1377
+ samplingHandler?: (...args: unknown[]) => unknown;
1130
1378
  }
1131
1379
  /**
1132
1380
  * Creates and connects an MCP client based on the provided configuration
@@ -1251,6 +1499,14 @@ interface ValidationResult {
1251
1499
  message: string;
1252
1500
  /** Additional structured details about the validation */
1253
1501
  details?: Record<string, unknown>;
1502
+ /**
1503
+ * Optional quantitative metrics from the validation.
1504
+ * Populated by validateToolCalls for precision/recall.
1505
+ */
1506
+ metrics?: {
1507
+ precision?: number;
1508
+ recall?: number;
1509
+ };
1254
1510
  }
1255
1511
  /**
1256
1512
  * Options for text validation
@@ -1282,10 +1538,33 @@ interface PatternValidatorOptions {
1282
1538
  /** Whether to perform case-sensitive matching (default: true) */
1283
1539
  caseSensitive?: boolean;
1284
1540
  }
1541
+ /**
1542
+ * Built-in snapshot sanitizer names for use with toMatchToolSnapshot.
1543
+ * Pass these values in the sanitizers array to replace non-deterministic
1544
+ * values with stable placeholders before snapshot comparison.
1545
+ *
1546
+ * @example
1547
+ * expect(result).toMatchToolSnapshot('my-snapshot', [
1548
+ * SnapshotSanitizers.UUID,
1549
+ * SnapshotSanitizers.ISO_DATE,
1550
+ * ]);
1551
+ */
1552
+ declare const SnapshotSanitizers: {
1553
+ /** Replaces Unix timestamps (seconds and milliseconds) with a stable placeholder */
1554
+ readonly TIMESTAMP: "timestamp";
1555
+ /** Replaces UUID v1-v5 strings with a stable placeholder */
1556
+ readonly UUID: "uuid";
1557
+ /** Replaces ISO 8601 date/datetime strings with a stable placeholder */
1558
+ readonly ISO_DATE: "iso-date";
1559
+ /** Replaces MongoDB ObjectId strings with a stable placeholder */
1560
+ readonly OBJECT_ID: "objectId";
1561
+ /** Replaces JWT tokens with a stable placeholder */
1562
+ readonly JWT: "jwt";
1563
+ };
1285
1564
  /**
1286
1565
  * Built-in sanitizer names for common variable patterns
1287
1566
  */
1288
- type BuiltInSanitizer = 'timestamp' | 'uuid' | 'iso-date' | 'objectId' | 'jwt';
1567
+ type BuiltInSanitizer = (typeof SnapshotSanitizers)[keyof typeof SnapshotSanitizers];
1289
1568
  /**
1290
1569
  * Custom regex-based sanitizer
1291
1570
  */
@@ -1511,38 +1790,63 @@ declare function validateError(response: unknown, expected?: boolean | string |
1511
1790
  declare function validateSize(response: unknown, options: SizeValidatorOptions): ValidationResult;
1512
1791
 
1513
1792
  /**
1514
- * Validator Utilities
1793
+ * Tool call validators for llm_host simulation results.
1515
1794
  *
1516
- * Shared utility functions for validation operations.
1517
- * Re-exports core utilities from mcp/response.ts and adds validation-specific helpers.
1795
+ * These validators extract the tool call trace from an LLMHostSimulationResult
1796
+ * and apply assertions against expected call lists and counts.
1518
1797
  */
1519
1798
 
1799
+ interface ToolCallExpectation {
1800
+ calls: Array<{
1801
+ name: string;
1802
+ arguments?: Record<string, unknown>;
1803
+ required?: boolean;
1804
+ }>;
1805
+ order?: 'strict' | 'any';
1806
+ exclusive?: boolean;
1807
+ }
1808
+ interface ToolCallCountOptions {
1809
+ min?: number;
1810
+ max?: number;
1811
+ exact?: number;
1812
+ }
1520
1813
  /**
1521
- * Gets the size of a response in bytes
1522
- *
1523
- * Serializes the response to JSON (with pretty printing for consistency)
1524
- * and returns the byte length using UTF-8 encoding.
1814
+ * Validates tool calls made during an LLM host simulation.
1525
1815
  *
1526
- * @param response - Response in any format
1527
- * @returns Size in bytes
1816
+ * @param response - Must be an LLMHostSimulationResult (from llm_host mode)
1817
+ * @param expectation - Expected tool call specification
1528
1818
  */
1529
- declare function getResponseSizeBytes(response: unknown): number;
1819
+ declare function validateToolCalls(response: unknown, expectation: ToolCallExpectation): ValidationResult;
1530
1820
  /**
1531
- * Normalizes whitespace in text for consistent comparison
1532
- *
1533
- * Collapses multiple whitespace characters (spaces, tabs, newlines) into single spaces
1534
- * and trims leading/trailing whitespace.
1821
+ * Validates the number of tool calls made during an LLM host simulation.
1535
1822
  *
1536
- * @param text - Text to normalize
1537
- * @returns Normalized text with collapsed whitespace
1823
+ * @param response - Must be an LLMHostSimulationResult (from llm_host mode)
1824
+ * @param options - Count constraints (min, max, exact)
1825
+ */
1826
+ declare function validateToolCallCount(response: unknown, options: ToolCallCountOptions): ValidationResult;
1827
+
1828
+ /**
1829
+ * Built-in judge rubrics matching Glean EvalV2's named judge types.
1830
+ * Use these for consistent, standardized evaluations across teams.
1538
1831
  *
1539
- * @example
1540
- * ```typescript
1541
- * normalizeWhitespace(' hello\n\n world ');
1542
- * // Returns: "hello world"
1543
- * ```
1832
+ * All built-in rubrics use a 5-point scale: 0.0 / 0.25 / 0.5 / 0.75 / 1.0
1544
1833
  */
1545
- declare function normalizeWhitespace(text: string): string;
1834
+ type BuiltInRubric = 'correctness' | 'completeness' | 'groundedness' | 'instruction-following' | 'conciseness';
1835
+ declare const BUILT_IN_RUBRICS: Record<BuiltInRubric, string>;
1836
+ /** A rubric specification: either a built-in named rubric or custom text. */
1837
+ type RubricSpec = BuiltInRubric | {
1838
+ text: string;
1839
+ };
1840
+ /**
1841
+ * Returns true if `s` is a built-in rubric name.
1842
+ */
1843
+ declare function isBuiltInRubric(s: unknown): s is BuiltInRubric;
1844
+ /**
1845
+ * Resolves a RubricSpec to its full rubric text.
1846
+ * - Built-in name → returns the expanded rubric text from BUILT_IN_RUBRICS
1847
+ * - Custom object → returns rubric.text as-is
1848
+ */
1849
+ declare function resolveRubric(rubric: RubricSpec): string;
1546
1850
 
1547
1851
  /**
1548
1852
  * Usage metrics from Claude Agent SDK response
@@ -1577,17 +1881,15 @@ interface UsageMetrics {
1577
1881
  */
1578
1882
  cacheCreationInputTokens?: number;
1579
1883
  }
1580
- /**
1581
- * Supported LLM provider types
1582
- */
1583
- type ProviderKind = 'claude' | 'anthropic' | 'openai' | 'custom-http';
1884
+ /** Valid LLM judge provider kinds. */
1885
+ type ProviderKind = 'anthropic' | 'openai' | 'google';
1584
1886
  /**
1585
1887
  * Configuration for an LLM judge
1586
1888
  */
1587
1889
  interface JudgeConfig {
1588
1890
  /**
1589
1891
  * LLM provider to use
1590
- * @default 'claude'
1892
+ * @default 'anthropic'
1591
1893
  */
1592
1894
  provider?: ProviderKind;
1593
1895
  /**
@@ -1649,7 +1951,24 @@ interface JudgeResult {
1649
1951
  * Whether the candidate exceeded maxToolOutputSize
1650
1952
  */
1651
1953
  exceedsMaxToolOutputSize?: boolean;
1954
+ /**
1955
+ * Standard deviation of individual rep scores.
1956
+ * Only populated when the judge was run with reps > 1.
1957
+ */
1958
+ scoreStdDev?: number;
1959
+ /**
1960
+ * True when the standard deviation across reps exceeds 0.2, indicating
1961
+ * that the rubric may be ambiguous or the judge is non-deterministic.
1962
+ * Only populated when the judge was run with reps > 1.
1963
+ */
1964
+ highVariance?: boolean;
1965
+ /**
1966
+ * Individual scores from each judge rep.
1967
+ * Only populated when the judge was run with reps > 1.
1968
+ */
1969
+ scores?: number[];
1652
1970
  }
1971
+
1653
1972
  /**
1654
1973
  * LLM judge client interface
1655
1974
  */
@@ -1665,6 +1984,75 @@ interface Judge {
1665
1984
  evaluate(candidate: unknown, reference: unknown, rubric: string): Promise<JudgeResult>;
1666
1985
  }
1667
1986
 
1987
+ /**
1988
+ * Judge Validator
1989
+ *
1990
+ * Validates a response using an LLM-as-a-judge evaluation.
1991
+ */
1992
+
1993
+ /**
1994
+ * Configuration for the judge validator
1995
+ */
1996
+ interface JudgeValidatorConfig {
1997
+ /** The evaluation rubric: a built-in name or custom { text: string } */
1998
+ rubric: RubricSpec;
1999
+ /** Optional reference response to compare against */
2000
+ reference?: unknown;
2001
+ /** Minimum score required to pass (0-1, default: 0.7) */
2002
+ threshold?: number;
2003
+ /** Number of judge evaluations to run. Scores averaged. @default 1 */
2004
+ reps?: number;
2005
+ /** Judge provider. @default 'claude' */
2006
+ provider?: ProviderKind;
2007
+ /** Model override (e.g., 'claude-opus-4-20250514') */
2008
+ model?: string;
2009
+ /** Environment variable name for API key */
2010
+ apiKeyEnvVar?: string;
2011
+ /** Max tokens for judge response */
2012
+ maxTokens?: number;
2013
+ /** Temperature for judge LLM (0–1) */
2014
+ temperature?: number;
2015
+ /** Max budget in USD per evaluation */
2016
+ maxBudgetUsd?: number;
2017
+ /** Fail if response exceeds this size in bytes before judging */
2018
+ maxToolOutputSize?: number;
2019
+ }
2020
+ declare function validateJudge(response: unknown, config: JudgeValidatorConfig): Promise<ValidationResult>;
2021
+
2022
+ /**
2023
+ * Validator Utilities
2024
+ *
2025
+ * Shared utility functions for validation operations.
2026
+ * Re-exports core utilities from mcp/response.ts and adds validation-specific helpers.
2027
+ */
2028
+
2029
+ /**
2030
+ * Gets the size of a response in bytes
2031
+ *
2032
+ * Serializes the response to JSON (with pretty printing for consistency)
2033
+ * and returns the byte length using UTF-8 encoding.
2034
+ *
2035
+ * @param response - Response in any format
2036
+ * @returns Size in bytes
2037
+ */
2038
+ declare function getResponseSizeBytes(response: unknown): number;
2039
+ /**
2040
+ * Normalizes whitespace in text for consistent comparison
2041
+ *
2042
+ * Collapses multiple whitespace characters (spaces, tabs, newlines) into single spaces
2043
+ * and trims leading/trailing whitespace.
2044
+ *
2045
+ * @param text - Text to normalize
2046
+ * @returns Normalized text with collapsed whitespace
2047
+ *
2048
+ * @example
2049
+ * ```typescript
2050
+ * normalizeWhitespace(' hello\n\n world ');
2051
+ * // Returns: "hello world"
2052
+ * ```
2053
+ */
2054
+ declare function normalizeWhitespace(text: string): string;
2055
+
1668
2056
  /**
1669
2057
  * Matcher Types
1670
2058
  *
@@ -1679,8 +2067,12 @@ interface JudgeMatcherOptions {
1679
2067
  reference?: unknown;
1680
2068
  /** Score threshold for passing (default: 0.7) */
1681
2069
  passingThreshold?: number;
1682
- /** Judge configuration override */
1683
- judgeConfig?: JudgeConfig;
2070
+ /** Number of judge evaluations (scores averaged) */
2071
+ reps?: number;
2072
+ /** Override the judge provider */
2073
+ provider?: ProviderKind;
2074
+ /** Override the judge model */
2075
+ model?: string;
1684
2076
  }
1685
2077
  /**
1686
2078
  * Declaration merging for Playwright matchers
@@ -1785,7 +2177,7 @@ declare global {
1785
2177
  * });
1786
2178
  * ```
1787
2179
  */
1788
- toPassToolJudge(rubric: string, options?: JudgeMatcherOptions): Promise<R>;
2180
+ toPassToolJudge(rubric: RubricSpec, options?: JudgeMatcherOptions): Promise<R>;
1789
2181
  /**
1790
2182
  * Validates that a response meets size constraints
1791
2183
  *
@@ -1830,11 +2222,33 @@ declare global {
1830
2222
  * ```
1831
2223
  */
1832
2224
  toSatisfyToolPredicate(predicate: ToolPredicate, description?: string): Promise<R>;
1833
- }
1834
- }
1835
- }
1836
- /**
1837
- * Predicate result returned by the user's predicate function
2225
+ /**
2226
+ * Validates which tools the LLM called during an llm_host simulation.
2227
+ *
2228
+ * @example
2229
+ * ```typescript
2230
+ * expect(simulationResult).toHaveToolCalls({
2231
+ * calls: [{ name: 'search', arguments: { query: 'hello' }, required: true }],
2232
+ * order: 'any',
2233
+ * });
2234
+ * ```
2235
+ */
2236
+ toHaveToolCalls(expectation: ToolCallExpectation): R;
2237
+ /**
2238
+ * Validates the number of tool calls made during an llm_host simulation.
2239
+ *
2240
+ * @example
2241
+ * ```typescript
2242
+ * expect(simulationResult).toHaveToolCallCount({ min: 1, max: 3 });
2243
+ * expect(simulationResult).toHaveToolCallCount({ exact: 2 });
2244
+ * ```
2245
+ */
2246
+ toHaveToolCallCount(options: ToolCallCountOptions): R;
2247
+ }
2248
+ }
2249
+ }
2250
+ /**
2251
+ * Predicate result returned by the user's predicate function
1838
2252
  */
1839
2253
  interface PredicateResult {
1840
2254
  /** Whether the predicate passed */
@@ -1873,7 +2287,7 @@ type ResultSource = 'eval' | 'test';
1873
2287
  /**
1874
2288
  * Known expectation types supported by the framework
1875
2289
  */
1876
- type ExpectationType = 'exact' | 'schema' | 'textContains' | 'regex' | 'snapshot' | 'judge' | 'error' | 'size';
2290
+ type ExpectationType = 'exact' | 'schema' | 'textContains' | 'regex' | 'snapshot' | 'judge' | 'error' | 'size' | 'toolsTriggered' | 'toolCallCount';
1877
2291
  /**
1878
2292
  * Result of an expectation check
1879
2293
  */
@@ -1912,6 +2326,10 @@ interface MCPFixtureOptions {
1912
2326
  * Used for filtering and grouping in the reporter
1913
2327
  */
1914
2328
  project?: string;
2329
+ /**
2330
+ * Timeout in milliseconds for MCP tool/list operations. Default: 30000
2331
+ */
2332
+ callTimeoutMs?: number;
1915
2333
  }
1916
2334
  /**
1917
2335
  * High-level API for interacting with MCP servers in tests
@@ -1954,29 +2372,43 @@ interface MCPFixtureApi {
1954
2372
  } | null;
1955
2373
  }
1956
2374
  /**
1957
- * Creates an MCP fixture wrapper around a Client
2375
+ * Creates an MCP fixture wrapper around a Client, providing a high-level
2376
+ * {@link MCPFixtureApi} without requiring Playwright's `test.extend` pattern.
1958
2377
  *
1959
- * When testInfo is provided, automatically tracks all MCP operations with test.step()
1960
- * and creates attachments for the MCP Test Reporter.
2378
+ * Use this when you need to set up an MCP fixture manually — for example in
2379
+ * custom fixture hierarchies, non-Playwright test runners (e.g. Vitest,
2380
+ * Jest), or when you want to compose the fixture with other lifecycle
2381
+ * management logic that doesn't fit the standard `test.extend` model.
1961
2382
  *
1962
- * @param client - The MCP client to wrap
1963
- * @param testInfo - Optional Playwright TestInfo for auto-tracking
2383
+ * For the typical Playwright use case, prefer importing `test` and `mcp`
2384
+ * directly from `@gleanwork/mcp-server-tester/fixtures/mcp`, which wires
2385
+ * this function up automatically.
2386
+ *
2387
+ * When `testInfo` is provided, all MCP operations are automatically wrapped
2388
+ * in `test.step()` calls and attachments are created for the MCP Test
2389
+ * Reporter. Omit `testInfo` for lightweight usage outside Playwright.
2390
+ *
2391
+ * @param client - The MCP client to wrap (created via `createMCPClientForConfig`)
2392
+ * @param testInfo - Optional Playwright TestInfo for auto-tracking and reporter attachments
2393
+ * @param options - Optional fixture options (authType, project, callTimeoutMs)
1964
2394
  * @returns MCPFixtureApi instance
1965
2395
  *
1966
2396
  * @example
1967
2397
  * ```typescript
1968
- * // With tracking (recommended)
2398
+ * // Advanced: custom fixture setup inside test.extend
1969
2399
  * const test = base.extend<{ mcp: MCPFixtureApi }>({
1970
2400
  * mcp: async ({}, use, testInfo) => {
1971
2401
  * const client = await createMCPClientForConfig(config);
1972
- * const api = createMCPFixture(client, testInfo);
2402
+ * const api = createMCPFixture(client, testInfo, { authType: 'api-token' });
1973
2403
  * await use(api);
1974
2404
  * await closeMCPClient(client);
1975
2405
  * }
1976
2406
  * });
1977
2407
  *
1978
- * // Without tracking
2408
+ * // Non-Playwright usage (no reporter attachments)
2409
+ * const client = await createMCPClientForConfig(config);
1979
2410
  * const api = createMCPFixture(client);
2411
+ * const tools = await api.listTools();
1980
2412
  * ```
1981
2413
  */
1982
2414
  declare function createMCPFixture(client: Client, testInfo?: TestInfo, options?: MCPFixtureOptions): MCPFixtureApi;
@@ -2082,6 +2514,8 @@ declare function toBeToolError(this: {
2082
2514
  * toPassToolJudge Matcher
2083
2515
  *
2084
2516
  * Validates that a response passes LLM-as-judge evaluation.
2517
+ * Delegates evaluation logic to validateJudge() for consistency
2518
+ * with the validator/matcher duality pattern.
2085
2519
  */
2086
2520
 
2087
2521
  /**
@@ -2091,7 +2525,7 @@ declare function toBeToolError(this: {
2091
2525
  */
2092
2526
  declare function toPassToolJudge(this: {
2093
2527
  isNot: boolean;
2094
- }, received: unknown, rubric: string, options?: JudgeMatcherOptions): Promise<{
2528
+ }, received: unknown, rubric: RubricSpec, options?: JudgeMatcherOptions): Promise<{
2095
2529
  pass: boolean;
2096
2530
  message: () => string;
2097
2531
  }>;
@@ -2158,6 +2592,38 @@ declare function toSatisfyToolPredicate(this: {
2158
2592
  message: () => string;
2159
2593
  }>;
2160
2594
 
2595
+ /**
2596
+ * toHaveToolCalls Matcher
2597
+ *
2598
+ * Validates which tools the LLM called during an llm_host simulation.
2599
+ */
2600
+
2601
+ /**
2602
+ * Creates the toHaveToolCalls matcher function
2603
+ */
2604
+ declare function toHaveToolCalls(this: {
2605
+ isNot: boolean;
2606
+ }, received: unknown, expectation: ToolCallExpectation): {
2607
+ pass: boolean;
2608
+ message: () => string;
2609
+ };
2610
+
2611
+ /**
2612
+ * toHaveToolCallCount Matcher
2613
+ *
2614
+ * Validates the number of tool calls made during an llm_host simulation.
2615
+ */
2616
+
2617
+ /**
2618
+ * Creates the toHaveToolCallCount matcher function
2619
+ */
2620
+ declare function toHaveToolCallCount(this: {
2621
+ isNot: boolean;
2622
+ }, received: unknown, options: ToolCallCountOptions): {
2623
+ pass: boolean;
2624
+ message: () => string;
2625
+ };
2626
+
2161
2627
  /**
2162
2628
  * Extended Playwright expect with MCP tool matchers
2163
2629
  *
@@ -2184,6 +2650,8 @@ declare const expect: playwright_test.Expect<{
2184
2650
  toPassToolJudge: typeof toPassToolJudge;
2185
2651
  toHaveToolResponseSize: typeof toHaveToolResponseSize;
2186
2652
  toSatisfyToolPredicate: typeof toSatisfyToolPredicate;
2653
+ toHaveToolCalls: typeof toHaveToolCalls;
2654
+ toHaveToolCallCount: typeof toHaveToolCallCount;
2187
2655
  }>;
2188
2656
 
2189
2657
  /**
@@ -2223,7 +2691,33 @@ type MCPFixtures = {
2223
2691
  * expect(tools.length).toBeGreaterThan(0);
2224
2692
  * });
2225
2693
  */
2226
- declare const test: playwright_test.TestType<playwright_test.PlaywrightTestArgs & playwright_test.PlaywrightTestOptions & MCPFixtures, playwright_test.PlaywrightWorkerArgs & playwright_test.PlaywrightWorkerOptions>;
2694
+ declare const test$1: playwright_test.TestType<playwright_test.PlaywrightTestArgs & playwright_test.PlaywrightTestOptions & MCPFixtures, playwright_test.PlaywrightWorkerArgs & playwright_test.PlaywrightWorkerOptions>;
2695
+
2696
+ /**
2697
+ * Test-scoped auth fixtures interface
2698
+ */
2699
+ interface MCPAuthFixtures {
2700
+ /**
2701
+ * OAuth client provider for MCP authentication
2702
+ */
2703
+ mcpAuthProvider: OAuthClientProvider | undefined;
2704
+ }
2705
+ /**
2706
+ * Extended Playwright test with MCP auth fixtures
2707
+ *
2708
+ * Use this when you need OAuth authentication for MCP server testing.
2709
+ *
2710
+ * @example
2711
+ * ```typescript
2712
+ * // test.ts
2713
+ * import { test } from '@gleanwork/mcp-server-tester/fixtures/mcpAuth';
2714
+ *
2715
+ * test('authenticated MCP call', async ({ mcpAuthProvider }) => {
2716
+ * // mcpAuthProvider can be passed to createMCPClientForConfig
2717
+ * });
2718
+ * ```
2719
+ */
2720
+ declare const test: playwright_test.TestType<playwright_test.PlaywrightTestArgs & playwright_test.PlaywrightTestOptions & MCPAuthFixtures, playwright_test.PlaywrightWorkerArgs & playwright_test.PlaywrightWorkerOptions>;
2227
2721
 
2228
2722
  /**
2229
2723
  * Types and interfaces for LLM host simulation mode
@@ -2233,9 +2727,29 @@ declare const test: playwright_test.TestType<playwright_test.PlaywrightTestArgs
2233
2727
  */
2234
2728
 
2235
2729
  /**
2236
- * LLM provider for host simulation
2730
+ * LLM provider for host simulation.
2731
+ *
2732
+ * All providers run through the Vercel AI SDK (`ai` package).
2733
+ * Each provider requires its corresponding provider package (usually @ai-sdk/*):
2734
+ *
2735
+ * openai → npm install ai @ai-sdk/openai
2736
+ * anthropic → npm install ai @ai-sdk/anthropic
2737
+ * google → npm install ai @ai-sdk/google
2738
+ * azure → npm install ai @ai-sdk/azure
2739
+ * mistral → npm install ai @ai-sdk/mistral
2740
+ * deepseek → npm install ai @ai-sdk/deepseek
2741
+ * openrouter → npm install ai @openrouter/ai-sdk-provider
2742
+ * xai → npm install ai @ai-sdk/xai
2743
+ */
2744
+ type LLMProvider = 'openai' | 'anthropic' | 'azure' | 'google' | 'mistral' | 'deepseek' | 'openrouter' | 'xai'
2745
+ /**
2746
+ * Anthropic Claude via Google Vertex AI.
2747
+ * Requires @ai-sdk/google-vertex and Application Default Credentials (gcloud auth).
2748
+ * Set GOOGLE_VERTEX_PROJECT and GOOGLE_VERTEX_LOCATION env vars.
2749
+ * Use this instead of 'anthropic' in environments where api.anthropic.com is blocked.
2750
+ * @example model: 'claude-3-5-haiku@20241022'
2237
2751
  */
2238
- type LLMProvider = 'openai' | 'anthropic';
2752
+ | 'vertex-anthropic';
2239
2753
  /**
2240
2754
  * Configuration for LLM host simulation
2241
2755
  */
@@ -2246,12 +2760,10 @@ interface LLMHostConfig {
2246
2760
  provider: LLMProvider;
2247
2761
  /**
2248
2762
  * Environment variable name containing the API key
2249
- * @default 'OPENAI_API_KEY' for openai, 'ANTHROPIC_API_KEY' for anthropic
2250
2763
  */
2251
2764
  apiKeyEnvVar?: string;
2252
2765
  /**
2253
- * Model to use
2254
- * @default 'gpt-4' for openai, 'claude-3-5-sonnet-20241022' for anthropic
2766
+ * Model to use (provider-specific default if omitted)
2255
2767
  */
2256
2768
  model?: string;
2257
2769
  /**
@@ -2260,11 +2772,11 @@ interface LLMHostConfig {
2260
2772
  maxTokens?: number;
2261
2773
  /**
2262
2774
  * Temperature (0-1, lower is more deterministic)
2263
- * @default 0.0
2775
+ * @default 0
2264
2776
  */
2265
2777
  temperature?: number;
2266
2778
  /**
2267
- * Maximum number of tool calls to allow in a single conversation
2779
+ * Maximum number of tool call steps to allow in a single conversation
2268
2780
  * @default 10
2269
2781
  */
2270
2782
  maxToolCalls?: number;
@@ -2273,72 +2785,49 @@ interface LLMHostConfig {
2273
2785
  * A tool call made by the LLM
2274
2786
  */
2275
2787
  interface LLMToolCall {
2276
- /**
2277
- * Tool name
2278
- */
2788
+ /** Tool name */
2279
2789
  name: string;
2280
- /**
2281
- * Tool arguments (as provided by LLM)
2282
- */
2790
+ /** Tool arguments (as provided by LLM) */
2283
2791
  arguments: Record<string, unknown>;
2284
- /**
2285
- * Optional tool call ID (for tracking)
2286
- */
2792
+ /** Optional tool call ID (for tracking) */
2287
2793
  id?: string;
2288
2794
  }
2289
- /**
2290
- * Result of a tool call validation
2291
- */
2292
- interface ToolCallValidationResult {
2293
- /**
2294
- * Whether the tool call was valid
2295
- */
2296
- valid: boolean;
2297
- /**
2298
- * List of actual tool calls made
2299
- */
2300
- actualCalls: Array<LLMToolCall>;
2301
- /**
2302
- * Expected tool calls (if specified in eval case)
2303
- */
2304
- expectedCalls?: Array<LLMToolCall>;
2305
- /**
2306
- * Details about validation (e.g., missing calls, incorrect arguments)
2307
- */
2308
- details?: string;
2309
- }
2310
2795
  /**
2311
2796
  * Result from an LLM host simulation
2312
2797
  */
2313
2798
  interface LLMHostSimulationResult {
2314
- /**
2315
- * Whether the simulation succeeded
2316
- */
2799
+ /** Whether the simulation succeeded */
2317
2800
  success: boolean;
2318
- /**
2319
- * Tool calls made by the LLM
2320
- */
2801
+ /** Tool calls made by the LLM */
2321
2802
  toolCalls: Array<LLMToolCall>;
2322
- /**
2323
- * Final response from the LLM
2324
- */
2803
+ /** Final response from the LLM */
2325
2804
  response?: string;
2326
- /**
2327
- * Error message if simulation failed
2328
- */
2805
+ /** Error message if simulation failed */
2329
2806
  error?: string;
2330
- /**
2331
- * Full conversation history (for debugging)
2332
- */
2807
+ /** The scenario prompt that was given to the LLM */
2808
+ scenario?: string;
2809
+ /** The conversation turns for attribution analysis */
2333
2810
  conversationHistory?: Array<{
2334
2811
  role: 'user' | 'assistant' | 'tool';
2335
2812
  content: string;
2336
2813
  }>;
2814
+ /**
2815
+ * Milliseconds spent waiting for LLM responses
2816
+ * (excludes MCP tool execution time)
2817
+ */
2818
+ llmDurationMs?: number;
2819
+ /**
2820
+ * Milliseconds spent executing MCP tool calls
2821
+ * (excludes LLM response time)
2822
+ */
2823
+ mcpDurationMs?: number;
2337
2824
  }
2338
2825
  /**
2339
- * Interface for LLM host simulators
2826
+ * Interface for LLM host simulators.
2340
2827
  *
2341
- * Implementations communicate with MCP servers via the actual MCP protocol
2828
+ * The only built-in implementation is the Vercel AI SDK orchestrator
2829
+ * (src/evals/llmHost/adapters/vercel.ts). Custom implementations can be
2830
+ * created for specialised testing needs.
2342
2831
  */
2343
2832
  interface LLMHostSimulator {
2344
2833
  /**
@@ -2351,24 +2840,6 @@ interface LLMHostSimulator {
2351
2840
  */
2352
2841
  simulate(mcp: MCPFixtureApi, scenario: string, config: LLMHostConfig): Promise<LLMHostSimulationResult>;
2353
2842
  }
2354
- /**
2355
- * Expected tool call specification (for validation)
2356
- */
2357
- interface ExpectedToolCall {
2358
- /**
2359
- * Tool name
2360
- */
2361
- name: string;
2362
- /**
2363
- * Expected arguments (partial match)
2364
- */
2365
- arguments?: Record<string, unknown>;
2366
- /**
2367
- * Whether this call is required
2368
- * @default true
2369
- */
2370
- required?: boolean;
2371
- }
2372
2843
 
2373
2844
  /**
2374
2845
  * Evaluation mode
@@ -2423,6 +2894,41 @@ interface EvalCase {
2423
2894
  * For 'llm_host' mode, can include 'expectedToolCalls' for validation
2424
2895
  */
2425
2896
  metadata?: Record<string, unknown>;
2897
+ /**
2898
+ * Number of times to run this case and compute an accuracy score.
2899
+ * When > 1, `EvalCaseResult.accuracy` is populated and `pass` is determined
2900
+ * by `accuracyThreshold` rather than a single run.
2901
+ * @default 1
2902
+ */
2903
+ iterations?: number;
2904
+ /**
2905
+ * Minimum accuracy (0–1) required to pass when `iterations > 1`.
2906
+ * @default 1.0 (all iterations must pass)
2907
+ */
2908
+ accuracyThreshold?: number;
2909
+ /**
2910
+ * Number of times to invoke the LLM judge per `passesJudge` assertion.
2911
+ * Scores are averaged; the mean must meet the threshold to pass.
2912
+ * Reduces judge variance caused by non-determinism.
2913
+ * Per-assertion `passesJudge.reps` overrides this value.
2914
+ * @default 1
2915
+ */
2916
+ judgeReps?: number;
2917
+ /**
2918
+ * Golden/expected answer for this case.
2919
+ * When set, automatically passed as `reference` to the LLM judge
2920
+ * (unless passesJudge.reference is explicitly provided).
2921
+ * Mirrors EvalV2's `canonical_answer` field.
2922
+ */
2923
+ canonicalAnswer?: string;
2924
+ /**
2925
+ * Arbitrary string labels for this case.
2926
+ * Use for filtering eval runs with `EvalRunnerOptions.filterTags`
2927
+ * and for slicing results by category.
2928
+ *
2929
+ * @example ['tool-finding', 'multi-hop', 'search']
2930
+ */
2931
+ tags?: string[];
2426
2932
  /**
2427
2933
  * Expectations to validate against the tool response
2428
2934
  *
@@ -2486,14 +2992,30 @@ interface EvalExpectBlock {
2486
2992
  * LLM-as-judge evaluation (toPassToolJudge)
2487
2993
  */
2488
2994
  passesJudge?: {
2489
- /** Evaluation rubric/criteria */
2490
- rubric: string;
2995
+ /** Built-in rubric name or custom rubric object */
2996
+ rubric: BuiltInRubric | {
2997
+ text: string;
2998
+ };
2491
2999
  /** Reference response to compare against */
2492
3000
  reference?: unknown;
2493
3001
  /** Score threshold for passing (0-1, default: 0.7) */
2494
3002
  threshold?: number;
2495
- /** Judge configuration ID */
2496
- configId?: string;
3003
+ /** Number of judge evaluations for this assertion. Overrides EvalCase.judgeReps. */
3004
+ reps?: number;
3005
+ /** Judge provider. @default 'anthropic' */
3006
+ provider?: 'anthropic' | 'openai' | 'google';
3007
+ /** Model override (e.g., 'claude-opus-4-20250514') */
3008
+ model?: string;
3009
+ /** Environment variable name for API key */
3010
+ apiKeyEnvVar?: string;
3011
+ /** Max tokens for judge response */
3012
+ maxTokens?: number;
3013
+ /** Temperature for judge LLM (0–1) */
3014
+ temperature?: number;
3015
+ /** Max budget in USD per evaluation */
3016
+ maxBudgetUsd?: number;
3017
+ /** Fail if response exceeds this size in bytes before judging */
3018
+ maxToolOutputSize?: number;
2497
3019
  };
2498
3020
  /**
2499
3021
  * Response size validation (toHaveToolResponseSize)
@@ -2504,6 +3026,39 @@ interface EvalExpectBlock {
2504
3026
  /** Minimum required size in bytes */
2505
3027
  minBytes?: number;
2506
3028
  };
3029
+ /**
3030
+ * Asserts which tools the LLM called during an llm_host simulation.
3031
+ * Only meaningful for llm_host mode — direct mode has no tool call trace.
3032
+ */
3033
+ toolsTriggered?: {
3034
+ /** Expected tool calls */
3035
+ calls: Array<{
3036
+ /** Tool name */
3037
+ name: string;
3038
+ /** Expected arguments (partial match — extra keys are allowed) */
3039
+ arguments?: Record<string, unknown>;
3040
+ /** Whether this call MUST have been made (default: true) */
3041
+ required?: boolean;
3042
+ }>;
3043
+ /**
3044
+ * 'strict': calls must appear in the exact order listed
3045
+ * 'any': calls can appear in any order (default)
3046
+ */
3047
+ order?: 'strict' | 'any';
3048
+ /** If true, no tool calls outside the `calls` list are allowed */
3049
+ exclusive?: boolean;
3050
+ };
3051
+ /**
3052
+ * Asserts the number of tool calls made during an llm_host simulation.
3053
+ */
3054
+ toolCallCount?: {
3055
+ /** Minimum number of tool calls */
3056
+ min?: number;
3057
+ /** Maximum number of tool calls */
3058
+ max?: number;
3059
+ /** Exact number of tool calls */
3060
+ exact?: number;
3061
+ };
2507
3062
  }
2508
3063
  /**
2509
3064
  * A complete eval dataset containing multiple test cases
@@ -2543,21 +3098,21 @@ declare const EvalCaseSchema: z.ZodObject<{
2543
3098
  args: z.ZodOptional<z.ZodRecord<z.ZodString, z.ZodUnknown>>;
2544
3099
  scenario: z.ZodOptional<z.ZodString>;
2545
3100
  llmHostConfig: z.ZodOptional<z.ZodObject<{
2546
- provider: z.ZodEnum<["openai", "anthropic"]>;
3101
+ provider: z.ZodEnum<["openai", "anthropic", "azure", "google", "mistral", "deepseek", "openrouter", "xai", "vertex-anthropic"]>;
2547
3102
  apiKeyEnvVar: z.ZodOptional<z.ZodString>;
2548
3103
  model: z.ZodOptional<z.ZodString>;
2549
3104
  maxTokens: z.ZodOptional<z.ZodNumber>;
2550
3105
  temperature: z.ZodOptional<z.ZodNumber>;
2551
3106
  maxToolCalls: z.ZodOptional<z.ZodNumber>;
2552
3107
  }, "strip", z.ZodTypeAny, {
2553
- provider: "anthropic" | "openai";
3108
+ provider: "openai" | "anthropic" | "azure" | "google" | "mistral" | "deepseek" | "openrouter" | "xai" | "vertex-anthropic";
2554
3109
  model?: string | undefined;
2555
3110
  maxTokens?: number | undefined;
2556
3111
  apiKeyEnvVar?: string | undefined;
2557
3112
  temperature?: number | undefined;
2558
3113
  maxToolCalls?: number | undefined;
2559
3114
  }, {
2560
- provider: "anthropic" | "openai";
3115
+ provider: "openai" | "anthropic" | "azure" | "google" | "mistral" | "deepseek" | "openrouter" | "xai" | "vertex-anthropic";
2561
3116
  model?: string | undefined;
2562
3117
  maxTokens?: number | undefined;
2563
3118
  apiKeyEnvVar?: string | undefined;
@@ -2565,6 +3120,11 @@ declare const EvalCaseSchema: z.ZodObject<{
2565
3120
  maxToolCalls?: number | undefined;
2566
3121
  }>>;
2567
3122
  metadata: z.ZodOptional<z.ZodRecord<z.ZodString, z.ZodUnknown>>;
3123
+ iterations: z.ZodOptional<z.ZodNumber>;
3124
+ accuracyThreshold: z.ZodOptional<z.ZodNumber>;
3125
+ judgeReps: z.ZodOptional<z.ZodNumber>;
3126
+ canonicalAnswer: z.ZodOptional<z.ZodString>;
3127
+ tags: z.ZodOptional<z.ZodArray<z.ZodString, "many">>;
2568
3128
  expect: z.ZodOptional<z.ZodObject<{
2569
3129
  response: z.ZodOptional<z.ZodUnknown>;
2570
3130
  schema: z.ZodOptional<z.ZodString>;
@@ -2589,20 +3149,51 @@ declare const EvalCaseSchema: z.ZodObject<{
2589
3149
  }>]>, "many">>;
2590
3150
  isError: z.ZodOptional<z.ZodUnion<[z.ZodBoolean, z.ZodString, z.ZodArray<z.ZodString, "many">]>>;
2591
3151
  passesJudge: z.ZodOptional<z.ZodObject<{
2592
- rubric: z.ZodString;
3152
+ rubric: z.ZodUnion<[z.ZodEnum<["correctness", "completeness", "groundedness", "instruction-following", "conciseness"]>, z.ZodObject<{
3153
+ text: z.ZodString;
3154
+ }, "strip", z.ZodTypeAny, {
3155
+ text: string;
3156
+ }, {
3157
+ text: string;
3158
+ }>]>;
2593
3159
  reference: z.ZodOptional<z.ZodUnknown>;
2594
3160
  threshold: z.ZodOptional<z.ZodNumber>;
2595
- configId: z.ZodOptional<z.ZodString>;
3161
+ reps: z.ZodOptional<z.ZodNumber>;
3162
+ provider: z.ZodOptional<z.ZodEnum<["anthropic", "openai", "google"]>>;
3163
+ model: z.ZodOptional<z.ZodString>;
3164
+ apiKeyEnvVar: z.ZodOptional<z.ZodString>;
3165
+ maxTokens: z.ZodOptional<z.ZodNumber>;
3166
+ temperature: z.ZodOptional<z.ZodNumber>;
3167
+ maxBudgetUsd: z.ZodOptional<z.ZodNumber>;
3168
+ maxToolOutputSize: z.ZodOptional<z.ZodNumber>;
2596
3169
  }, "strip", z.ZodTypeAny, {
2597
- rubric: string;
3170
+ rubric: "correctness" | "completeness" | "groundedness" | "instruction-following" | "conciseness" | {
3171
+ text: string;
3172
+ };
3173
+ model?: string | undefined;
3174
+ maxTokens?: number | undefined;
3175
+ maxBudgetUsd?: number | undefined;
2598
3176
  reference?: unknown;
2599
3177
  threshold?: number | undefined;
2600
- configId?: string | undefined;
3178
+ reps?: number | undefined;
3179
+ provider?: "openai" | "anthropic" | "google" | undefined;
3180
+ apiKeyEnvVar?: string | undefined;
3181
+ temperature?: number | undefined;
3182
+ maxToolOutputSize?: number | undefined;
2601
3183
  }, {
2602
- rubric: string;
3184
+ rubric: "correctness" | "completeness" | "groundedness" | "instruction-following" | "conciseness" | {
3185
+ text: string;
3186
+ };
3187
+ model?: string | undefined;
3188
+ maxTokens?: number | undefined;
3189
+ maxBudgetUsd?: number | undefined;
2603
3190
  reference?: unknown;
2604
3191
  threshold?: number | undefined;
2605
- configId?: string | undefined;
3192
+ reps?: number | undefined;
3193
+ provider?: "openai" | "anthropic" | "google" | undefined;
3194
+ apiKeyEnvVar?: string | undefined;
3195
+ temperature?: number | undefined;
3196
+ maxToolOutputSize?: number | undefined;
2606
3197
  }>>;
2607
3198
  responseSize: z.ZodOptional<z.ZodObject<{
2608
3199
  maxBytes: z.ZodOptional<z.ZodNumber>;
@@ -2614,47 +3205,139 @@ declare const EvalCaseSchema: z.ZodObject<{
2614
3205
  maxBytes?: number | undefined;
2615
3206
  minBytes?: number | undefined;
2616
3207
  }>>;
3208
+ toolsTriggered: z.ZodOptional<z.ZodObject<{
3209
+ calls: z.ZodArray<z.ZodObject<{
3210
+ name: z.ZodString;
3211
+ arguments: z.ZodOptional<z.ZodRecord<z.ZodString, z.ZodUnknown>>;
3212
+ required: z.ZodOptional<z.ZodBoolean>;
3213
+ }, "strip", z.ZodTypeAny, {
3214
+ name: string;
3215
+ required?: boolean | undefined;
3216
+ arguments?: Record<string, unknown> | undefined;
3217
+ }, {
3218
+ name: string;
3219
+ required?: boolean | undefined;
3220
+ arguments?: Record<string, unknown> | undefined;
3221
+ }>, "many">;
3222
+ order: z.ZodOptional<z.ZodEnum<["strict", "any"]>>;
3223
+ exclusive: z.ZodOptional<z.ZodBoolean>;
3224
+ }, "strip", z.ZodTypeAny, {
3225
+ calls: {
3226
+ name: string;
3227
+ required?: boolean | undefined;
3228
+ arguments?: Record<string, unknown> | undefined;
3229
+ }[];
3230
+ order?: "strict" | "any" | undefined;
3231
+ exclusive?: boolean | undefined;
3232
+ }, {
3233
+ calls: {
3234
+ name: string;
3235
+ required?: boolean | undefined;
3236
+ arguments?: Record<string, unknown> | undefined;
3237
+ }[];
3238
+ order?: "strict" | "any" | undefined;
3239
+ exclusive?: boolean | undefined;
3240
+ }>>;
3241
+ toolCallCount: z.ZodOptional<z.ZodObject<{
3242
+ min: z.ZodOptional<z.ZodNumber>;
3243
+ max: z.ZodOptional<z.ZodNumber>;
3244
+ exact: z.ZodOptional<z.ZodNumber>;
3245
+ }, "strip", z.ZodTypeAny, {
3246
+ exact?: number | undefined;
3247
+ min?: number | undefined;
3248
+ max?: number | undefined;
3249
+ }, {
3250
+ exact?: number | undefined;
3251
+ min?: number | undefined;
3252
+ max?: number | undefined;
3253
+ }>>;
2617
3254
  }, "strip", z.ZodTypeAny, {
3255
+ response?: unknown;
2618
3256
  isError?: string | boolean | string[] | undefined;
2619
3257
  schema?: string | undefined;
2620
3258
  snapshot?: string | undefined;
2621
- response?: unknown;
3259
+ toolsTriggered?: {
3260
+ calls: {
3261
+ name: string;
3262
+ required?: boolean | undefined;
3263
+ arguments?: Record<string, unknown> | undefined;
3264
+ }[];
3265
+ order?: "strict" | "any" | undefined;
3266
+ exclusive?: boolean | undefined;
3267
+ } | undefined;
3268
+ toolCallCount?: {
3269
+ exact?: number | undefined;
3270
+ min?: number | undefined;
3271
+ max?: number | undefined;
3272
+ } | undefined;
2622
3273
  containsText?: string | string[] | undefined;
2623
3274
  matchesPattern?: string | string[] | undefined;
2624
- snapshotSanitizers?: ("timestamp" | "uuid" | "iso-date" | "objectId" | "jwt" | {
3275
+ snapshotSanitizers?: ("uuid" | "jwt" | "timestamp" | "iso-date" | "objectId" | {
2625
3276
  pattern: string;
2626
3277
  replacement?: string | undefined;
2627
3278
  } | {
2628
3279
  remove: string[];
2629
3280
  })[] | undefined;
2630
3281
  passesJudge?: {
2631
- rubric: string;
3282
+ rubric: "correctness" | "completeness" | "groundedness" | "instruction-following" | "conciseness" | {
3283
+ text: string;
3284
+ };
3285
+ model?: string | undefined;
3286
+ maxTokens?: number | undefined;
3287
+ maxBudgetUsd?: number | undefined;
2632
3288
  reference?: unknown;
2633
3289
  threshold?: number | undefined;
2634
- configId?: string | undefined;
3290
+ reps?: number | undefined;
3291
+ provider?: "openai" | "anthropic" | "google" | undefined;
3292
+ apiKeyEnvVar?: string | undefined;
3293
+ temperature?: number | undefined;
3294
+ maxToolOutputSize?: number | undefined;
2635
3295
  } | undefined;
2636
3296
  responseSize?: {
2637
3297
  maxBytes?: number | undefined;
2638
3298
  minBytes?: number | undefined;
2639
3299
  } | undefined;
2640
3300
  }, {
3301
+ response?: unknown;
2641
3302
  isError?: string | boolean | string[] | undefined;
2642
3303
  schema?: string | undefined;
2643
3304
  snapshot?: string | undefined;
2644
- response?: unknown;
3305
+ toolsTriggered?: {
3306
+ calls: {
3307
+ name: string;
3308
+ required?: boolean | undefined;
3309
+ arguments?: Record<string, unknown> | undefined;
3310
+ }[];
3311
+ order?: "strict" | "any" | undefined;
3312
+ exclusive?: boolean | undefined;
3313
+ } | undefined;
3314
+ toolCallCount?: {
3315
+ exact?: number | undefined;
3316
+ min?: number | undefined;
3317
+ max?: number | undefined;
3318
+ } | undefined;
2645
3319
  containsText?: string | string[] | undefined;
2646
3320
  matchesPattern?: string | string[] | undefined;
2647
- snapshotSanitizers?: ("timestamp" | "uuid" | "iso-date" | "objectId" | "jwt" | {
3321
+ snapshotSanitizers?: ("uuid" | "jwt" | "timestamp" | "iso-date" | "objectId" | {
2648
3322
  pattern: string;
2649
3323
  replacement?: string | undefined;
2650
3324
  } | {
2651
3325
  remove: string[];
2652
3326
  })[] | undefined;
2653
3327
  passesJudge?: {
2654
- rubric: string;
3328
+ rubric: "correctness" | "completeness" | "groundedness" | "instruction-following" | "conciseness" | {
3329
+ text: string;
3330
+ };
3331
+ model?: string | undefined;
3332
+ maxTokens?: number | undefined;
3333
+ maxBudgetUsd?: number | undefined;
2655
3334
  reference?: unknown;
2656
3335
  threshold?: number | undefined;
2657
- configId?: string | undefined;
3336
+ reps?: number | undefined;
3337
+ provider?: "openai" | "anthropic" | "google" | undefined;
3338
+ apiKeyEnvVar?: string | undefined;
3339
+ temperature?: number | undefined;
3340
+ maxToolOutputSize?: number | undefined;
2658
3341
  } | undefined;
2659
3342
  responseSize?: {
2660
3343
  maxBytes?: number | undefined;
@@ -2664,37 +3347,65 @@ declare const EvalCaseSchema: z.ZodObject<{
2664
3347
  }, "strip", z.ZodTypeAny, {
2665
3348
  id: string;
2666
3349
  args?: Record<string, unknown> | undefined;
2667
- metadata?: Record<string, unknown> | undefined;
2668
3350
  mode?: "direct" | "llm_host" | undefined;
3351
+ metadata?: Record<string, unknown> | undefined;
2669
3352
  description?: string | undefined;
2670
3353
  toolName?: string | undefined;
2671
3354
  scenario?: string | undefined;
2672
3355
  llmHostConfig?: {
2673
- provider: "anthropic" | "openai";
3356
+ provider: "openai" | "anthropic" | "azure" | "google" | "mistral" | "deepseek" | "openrouter" | "xai" | "vertex-anthropic";
2674
3357
  model?: string | undefined;
2675
3358
  maxTokens?: number | undefined;
2676
3359
  apiKeyEnvVar?: string | undefined;
2677
3360
  temperature?: number | undefined;
2678
3361
  maxToolCalls?: number | undefined;
2679
3362
  } | undefined;
3363
+ iterations?: number | undefined;
3364
+ accuracyThreshold?: number | undefined;
3365
+ judgeReps?: number | undefined;
3366
+ canonicalAnswer?: string | undefined;
3367
+ tags?: string[] | undefined;
2680
3368
  expect?: {
3369
+ response?: unknown;
2681
3370
  isError?: string | boolean | string[] | undefined;
2682
3371
  schema?: string | undefined;
2683
3372
  snapshot?: string | undefined;
2684
- response?: unknown;
3373
+ toolsTriggered?: {
3374
+ calls: {
3375
+ name: string;
3376
+ required?: boolean | undefined;
3377
+ arguments?: Record<string, unknown> | undefined;
3378
+ }[];
3379
+ order?: "strict" | "any" | undefined;
3380
+ exclusive?: boolean | undefined;
3381
+ } | undefined;
3382
+ toolCallCount?: {
3383
+ exact?: number | undefined;
3384
+ min?: number | undefined;
3385
+ max?: number | undefined;
3386
+ } | undefined;
2685
3387
  containsText?: string | string[] | undefined;
2686
3388
  matchesPattern?: string | string[] | undefined;
2687
- snapshotSanitizers?: ("timestamp" | "uuid" | "iso-date" | "objectId" | "jwt" | {
3389
+ snapshotSanitizers?: ("uuid" | "jwt" | "timestamp" | "iso-date" | "objectId" | {
2688
3390
  pattern: string;
2689
3391
  replacement?: string | undefined;
2690
3392
  } | {
2691
3393
  remove: string[];
2692
3394
  })[] | undefined;
2693
3395
  passesJudge?: {
2694
- rubric: string;
3396
+ rubric: "correctness" | "completeness" | "groundedness" | "instruction-following" | "conciseness" | {
3397
+ text: string;
3398
+ };
3399
+ model?: string | undefined;
3400
+ maxTokens?: number | undefined;
3401
+ maxBudgetUsd?: number | undefined;
2695
3402
  reference?: unknown;
2696
3403
  threshold?: number | undefined;
2697
- configId?: string | undefined;
3404
+ reps?: number | undefined;
3405
+ provider?: "openai" | "anthropic" | "google" | undefined;
3406
+ apiKeyEnvVar?: string | undefined;
3407
+ temperature?: number | undefined;
3408
+ maxToolOutputSize?: number | undefined;
2698
3409
  } | undefined;
2699
3410
  responseSize?: {
2700
3411
  maxBytes?: number | undefined;
@@ -2704,37 +3415,65 @@ declare const EvalCaseSchema: z.ZodObject<{
2704
3415
  }, {
2705
3416
  id: string;
2706
3417
  args?: Record<string, unknown> | undefined;
2707
- metadata?: Record<string, unknown> | undefined;
2708
3418
  mode?: "direct" | "llm_host" | undefined;
3419
+ metadata?: Record<string, unknown> | undefined;
2709
3420
  description?: string | undefined;
2710
3421
  toolName?: string | undefined;
2711
3422
  scenario?: string | undefined;
2712
3423
  llmHostConfig?: {
2713
- provider: "anthropic" | "openai";
3424
+ provider: "openai" | "anthropic" | "azure" | "google" | "mistral" | "deepseek" | "openrouter" | "xai" | "vertex-anthropic";
2714
3425
  model?: string | undefined;
2715
3426
  maxTokens?: number | undefined;
2716
3427
  apiKeyEnvVar?: string | undefined;
2717
3428
  temperature?: number | undefined;
2718
3429
  maxToolCalls?: number | undefined;
2719
3430
  } | undefined;
3431
+ iterations?: number | undefined;
3432
+ accuracyThreshold?: number | undefined;
3433
+ judgeReps?: number | undefined;
3434
+ canonicalAnswer?: string | undefined;
3435
+ tags?: string[] | undefined;
2720
3436
  expect?: {
3437
+ response?: unknown;
2721
3438
  isError?: string | boolean | string[] | undefined;
2722
3439
  schema?: string | undefined;
2723
3440
  snapshot?: string | undefined;
2724
- response?: unknown;
3441
+ toolsTriggered?: {
3442
+ calls: {
3443
+ name: string;
3444
+ required?: boolean | undefined;
3445
+ arguments?: Record<string, unknown> | undefined;
3446
+ }[];
3447
+ order?: "strict" | "any" | undefined;
3448
+ exclusive?: boolean | undefined;
3449
+ } | undefined;
3450
+ toolCallCount?: {
3451
+ exact?: number | undefined;
3452
+ min?: number | undefined;
3453
+ max?: number | undefined;
3454
+ } | undefined;
2725
3455
  containsText?: string | string[] | undefined;
2726
3456
  matchesPattern?: string | string[] | undefined;
2727
- snapshotSanitizers?: ("timestamp" | "uuid" | "iso-date" | "objectId" | "jwt" | {
3457
+ snapshotSanitizers?: ("uuid" | "jwt" | "timestamp" | "iso-date" | "objectId" | {
2728
3458
  pattern: string;
2729
3459
  replacement?: string | undefined;
2730
3460
  } | {
2731
3461
  remove: string[];
2732
3462
  })[] | undefined;
2733
3463
  passesJudge?: {
2734
- rubric: string;
3464
+ rubric: "correctness" | "completeness" | "groundedness" | "instruction-following" | "conciseness" | {
3465
+ text: string;
3466
+ };
3467
+ model?: string | undefined;
3468
+ maxTokens?: number | undefined;
3469
+ maxBudgetUsd?: number | undefined;
2735
3470
  reference?: unknown;
2736
3471
  threshold?: number | undefined;
2737
- configId?: string | undefined;
3472
+ reps?: number | undefined;
3473
+ provider?: "openai" | "anthropic" | "google" | undefined;
3474
+ apiKeyEnvVar?: string | undefined;
3475
+ temperature?: number | undefined;
3476
+ maxToolOutputSize?: number | undefined;
2738
3477
  } | undefined;
2739
3478
  responseSize?: {
2740
3479
  maxBytes?: number | undefined;
@@ -2756,21 +3495,21 @@ declare const EvalDatasetSchema: z.ZodObject<{
2756
3495
  args: z.ZodOptional<z.ZodRecord<z.ZodString, z.ZodUnknown>>;
2757
3496
  scenario: z.ZodOptional<z.ZodString>;
2758
3497
  llmHostConfig: z.ZodOptional<z.ZodObject<{
2759
- provider: z.ZodEnum<["openai", "anthropic"]>;
3498
+ provider: z.ZodEnum<["openai", "anthropic", "azure", "google", "mistral", "deepseek", "openrouter", "xai", "vertex-anthropic"]>;
2760
3499
  apiKeyEnvVar: z.ZodOptional<z.ZodString>;
2761
3500
  model: z.ZodOptional<z.ZodString>;
2762
3501
  maxTokens: z.ZodOptional<z.ZodNumber>;
2763
3502
  temperature: z.ZodOptional<z.ZodNumber>;
2764
3503
  maxToolCalls: z.ZodOptional<z.ZodNumber>;
2765
3504
  }, "strip", z.ZodTypeAny, {
2766
- provider: "anthropic" | "openai";
3505
+ provider: "openai" | "anthropic" | "azure" | "google" | "mistral" | "deepseek" | "openrouter" | "xai" | "vertex-anthropic";
2767
3506
  model?: string | undefined;
2768
3507
  maxTokens?: number | undefined;
2769
3508
  apiKeyEnvVar?: string | undefined;
2770
3509
  temperature?: number | undefined;
2771
3510
  maxToolCalls?: number | undefined;
2772
3511
  }, {
2773
- provider: "anthropic" | "openai";
3512
+ provider: "openai" | "anthropic" | "azure" | "google" | "mistral" | "deepseek" | "openrouter" | "xai" | "vertex-anthropic";
2774
3513
  model?: string | undefined;
2775
3514
  maxTokens?: number | undefined;
2776
3515
  apiKeyEnvVar?: string | undefined;
@@ -2778,6 +3517,11 @@ declare const EvalDatasetSchema: z.ZodObject<{
2778
3517
  maxToolCalls?: number | undefined;
2779
3518
  }>>;
2780
3519
  metadata: z.ZodOptional<z.ZodRecord<z.ZodString, z.ZodUnknown>>;
3520
+ iterations: z.ZodOptional<z.ZodNumber>;
3521
+ accuracyThreshold: z.ZodOptional<z.ZodNumber>;
3522
+ judgeReps: z.ZodOptional<z.ZodNumber>;
3523
+ canonicalAnswer: z.ZodOptional<z.ZodString>;
3524
+ tags: z.ZodOptional<z.ZodArray<z.ZodString, "many">>;
2781
3525
  expect: z.ZodOptional<z.ZodObject<{
2782
3526
  response: z.ZodOptional<z.ZodUnknown>;
2783
3527
  schema: z.ZodOptional<z.ZodString>;
@@ -2802,20 +3546,51 @@ declare const EvalDatasetSchema: z.ZodObject<{
2802
3546
  }>]>, "many">>;
2803
3547
  isError: z.ZodOptional<z.ZodUnion<[z.ZodBoolean, z.ZodString, z.ZodArray<z.ZodString, "many">]>>;
2804
3548
  passesJudge: z.ZodOptional<z.ZodObject<{
2805
- rubric: z.ZodString;
3549
+ rubric: z.ZodUnion<[z.ZodEnum<["correctness", "completeness", "groundedness", "instruction-following", "conciseness"]>, z.ZodObject<{
3550
+ text: z.ZodString;
3551
+ }, "strip", z.ZodTypeAny, {
3552
+ text: string;
3553
+ }, {
3554
+ text: string;
3555
+ }>]>;
2806
3556
  reference: z.ZodOptional<z.ZodUnknown>;
2807
3557
  threshold: z.ZodOptional<z.ZodNumber>;
2808
- configId: z.ZodOptional<z.ZodString>;
3558
+ reps: z.ZodOptional<z.ZodNumber>;
3559
+ provider: z.ZodOptional<z.ZodEnum<["anthropic", "openai", "google"]>>;
3560
+ model: z.ZodOptional<z.ZodString>;
3561
+ apiKeyEnvVar: z.ZodOptional<z.ZodString>;
3562
+ maxTokens: z.ZodOptional<z.ZodNumber>;
3563
+ temperature: z.ZodOptional<z.ZodNumber>;
3564
+ maxBudgetUsd: z.ZodOptional<z.ZodNumber>;
3565
+ maxToolOutputSize: z.ZodOptional<z.ZodNumber>;
2809
3566
  }, "strip", z.ZodTypeAny, {
2810
- rubric: string;
3567
+ rubric: "correctness" | "completeness" | "groundedness" | "instruction-following" | "conciseness" | {
3568
+ text: string;
3569
+ };
3570
+ model?: string | undefined;
3571
+ maxTokens?: number | undefined;
3572
+ maxBudgetUsd?: number | undefined;
2811
3573
  reference?: unknown;
2812
3574
  threshold?: number | undefined;
2813
- configId?: string | undefined;
3575
+ reps?: number | undefined;
3576
+ provider?: "openai" | "anthropic" | "google" | undefined;
3577
+ apiKeyEnvVar?: string | undefined;
3578
+ temperature?: number | undefined;
3579
+ maxToolOutputSize?: number | undefined;
2814
3580
  }, {
2815
- rubric: string;
3581
+ rubric: "correctness" | "completeness" | "groundedness" | "instruction-following" | "conciseness" | {
3582
+ text: string;
3583
+ };
3584
+ model?: string | undefined;
3585
+ maxTokens?: number | undefined;
3586
+ maxBudgetUsd?: number | undefined;
2816
3587
  reference?: unknown;
2817
3588
  threshold?: number | undefined;
2818
- configId?: string | undefined;
3589
+ reps?: number | undefined;
3590
+ provider?: "openai" | "anthropic" | "google" | undefined;
3591
+ apiKeyEnvVar?: string | undefined;
3592
+ temperature?: number | undefined;
3593
+ maxToolOutputSize?: number | undefined;
2819
3594
  }>>;
2820
3595
  responseSize: z.ZodOptional<z.ZodObject<{
2821
3596
  maxBytes: z.ZodOptional<z.ZodNumber>;
@@ -2827,47 +3602,139 @@ declare const EvalDatasetSchema: z.ZodObject<{
2827
3602
  maxBytes?: number | undefined;
2828
3603
  minBytes?: number | undefined;
2829
3604
  }>>;
3605
+ toolsTriggered: z.ZodOptional<z.ZodObject<{
3606
+ calls: z.ZodArray<z.ZodObject<{
3607
+ name: z.ZodString;
3608
+ arguments: z.ZodOptional<z.ZodRecord<z.ZodString, z.ZodUnknown>>;
3609
+ required: z.ZodOptional<z.ZodBoolean>;
3610
+ }, "strip", z.ZodTypeAny, {
3611
+ name: string;
3612
+ required?: boolean | undefined;
3613
+ arguments?: Record<string, unknown> | undefined;
3614
+ }, {
3615
+ name: string;
3616
+ required?: boolean | undefined;
3617
+ arguments?: Record<string, unknown> | undefined;
3618
+ }>, "many">;
3619
+ order: z.ZodOptional<z.ZodEnum<["strict", "any"]>>;
3620
+ exclusive: z.ZodOptional<z.ZodBoolean>;
3621
+ }, "strip", z.ZodTypeAny, {
3622
+ calls: {
3623
+ name: string;
3624
+ required?: boolean | undefined;
3625
+ arguments?: Record<string, unknown> | undefined;
3626
+ }[];
3627
+ order?: "strict" | "any" | undefined;
3628
+ exclusive?: boolean | undefined;
3629
+ }, {
3630
+ calls: {
3631
+ name: string;
3632
+ required?: boolean | undefined;
3633
+ arguments?: Record<string, unknown> | undefined;
3634
+ }[];
3635
+ order?: "strict" | "any" | undefined;
3636
+ exclusive?: boolean | undefined;
3637
+ }>>;
3638
+ toolCallCount: z.ZodOptional<z.ZodObject<{
3639
+ min: z.ZodOptional<z.ZodNumber>;
3640
+ max: z.ZodOptional<z.ZodNumber>;
3641
+ exact: z.ZodOptional<z.ZodNumber>;
3642
+ }, "strip", z.ZodTypeAny, {
3643
+ exact?: number | undefined;
3644
+ min?: number | undefined;
3645
+ max?: number | undefined;
3646
+ }, {
3647
+ exact?: number | undefined;
3648
+ min?: number | undefined;
3649
+ max?: number | undefined;
3650
+ }>>;
2830
3651
  }, "strip", z.ZodTypeAny, {
3652
+ response?: unknown;
2831
3653
  isError?: string | boolean | string[] | undefined;
2832
3654
  schema?: string | undefined;
2833
3655
  snapshot?: string | undefined;
2834
- response?: unknown;
3656
+ toolsTriggered?: {
3657
+ calls: {
3658
+ name: string;
3659
+ required?: boolean | undefined;
3660
+ arguments?: Record<string, unknown> | undefined;
3661
+ }[];
3662
+ order?: "strict" | "any" | undefined;
3663
+ exclusive?: boolean | undefined;
3664
+ } | undefined;
3665
+ toolCallCount?: {
3666
+ exact?: number | undefined;
3667
+ min?: number | undefined;
3668
+ max?: number | undefined;
3669
+ } | undefined;
2835
3670
  containsText?: string | string[] | undefined;
2836
3671
  matchesPattern?: string | string[] | undefined;
2837
- snapshotSanitizers?: ("timestamp" | "uuid" | "iso-date" | "objectId" | "jwt" | {
3672
+ snapshotSanitizers?: ("uuid" | "jwt" | "timestamp" | "iso-date" | "objectId" | {
2838
3673
  pattern: string;
2839
3674
  replacement?: string | undefined;
2840
3675
  } | {
2841
3676
  remove: string[];
2842
3677
  })[] | undefined;
2843
3678
  passesJudge?: {
2844
- rubric: string;
3679
+ rubric: "correctness" | "completeness" | "groundedness" | "instruction-following" | "conciseness" | {
3680
+ text: string;
3681
+ };
3682
+ model?: string | undefined;
3683
+ maxTokens?: number | undefined;
3684
+ maxBudgetUsd?: number | undefined;
2845
3685
  reference?: unknown;
2846
3686
  threshold?: number | undefined;
2847
- configId?: string | undefined;
3687
+ reps?: number | undefined;
3688
+ provider?: "openai" | "anthropic" | "google" | undefined;
3689
+ apiKeyEnvVar?: string | undefined;
3690
+ temperature?: number | undefined;
3691
+ maxToolOutputSize?: number | undefined;
2848
3692
  } | undefined;
2849
3693
  responseSize?: {
2850
3694
  maxBytes?: number | undefined;
2851
3695
  minBytes?: number | undefined;
2852
3696
  } | undefined;
2853
3697
  }, {
3698
+ response?: unknown;
2854
3699
  isError?: string | boolean | string[] | undefined;
2855
3700
  schema?: string | undefined;
2856
3701
  snapshot?: string | undefined;
2857
- response?: unknown;
3702
+ toolsTriggered?: {
3703
+ calls: {
3704
+ name: string;
3705
+ required?: boolean | undefined;
3706
+ arguments?: Record<string, unknown> | undefined;
3707
+ }[];
3708
+ order?: "strict" | "any" | undefined;
3709
+ exclusive?: boolean | undefined;
3710
+ } | undefined;
3711
+ toolCallCount?: {
3712
+ exact?: number | undefined;
3713
+ min?: number | undefined;
3714
+ max?: number | undefined;
3715
+ } | undefined;
2858
3716
  containsText?: string | string[] | undefined;
2859
3717
  matchesPattern?: string | string[] | undefined;
2860
- snapshotSanitizers?: ("timestamp" | "uuid" | "iso-date" | "objectId" | "jwt" | {
3718
+ snapshotSanitizers?: ("uuid" | "jwt" | "timestamp" | "iso-date" | "objectId" | {
2861
3719
  pattern: string;
2862
3720
  replacement?: string | undefined;
2863
3721
  } | {
2864
3722
  remove: string[];
2865
3723
  })[] | undefined;
2866
3724
  passesJudge?: {
2867
- rubric: string;
3725
+ rubric: "correctness" | "completeness" | "groundedness" | "instruction-following" | "conciseness" | {
3726
+ text: string;
3727
+ };
3728
+ model?: string | undefined;
3729
+ maxTokens?: number | undefined;
3730
+ maxBudgetUsd?: number | undefined;
2868
3731
  reference?: unknown;
2869
3732
  threshold?: number | undefined;
2870
- configId?: string | undefined;
3733
+ reps?: number | undefined;
3734
+ provider?: "openai" | "anthropic" | "google" | undefined;
3735
+ apiKeyEnvVar?: string | undefined;
3736
+ temperature?: number | undefined;
3737
+ maxToolOutputSize?: number | undefined;
2871
3738
  } | undefined;
2872
3739
  responseSize?: {
2873
3740
  maxBytes?: number | undefined;
@@ -2877,37 +3744,65 @@ declare const EvalDatasetSchema: z.ZodObject<{
2877
3744
  }, "strip", z.ZodTypeAny, {
2878
3745
  id: string;
2879
3746
  args?: Record<string, unknown> | undefined;
2880
- metadata?: Record<string, unknown> | undefined;
2881
3747
  mode?: "direct" | "llm_host" | undefined;
3748
+ metadata?: Record<string, unknown> | undefined;
2882
3749
  description?: string | undefined;
2883
3750
  toolName?: string | undefined;
2884
3751
  scenario?: string | undefined;
2885
3752
  llmHostConfig?: {
2886
- provider: "anthropic" | "openai";
3753
+ provider: "openai" | "anthropic" | "azure" | "google" | "mistral" | "deepseek" | "openrouter" | "xai" | "vertex-anthropic";
2887
3754
  model?: string | undefined;
2888
3755
  maxTokens?: number | undefined;
2889
3756
  apiKeyEnvVar?: string | undefined;
2890
3757
  temperature?: number | undefined;
2891
3758
  maxToolCalls?: number | undefined;
2892
3759
  } | undefined;
3760
+ iterations?: number | undefined;
3761
+ accuracyThreshold?: number | undefined;
3762
+ judgeReps?: number | undefined;
3763
+ canonicalAnswer?: string | undefined;
3764
+ tags?: string[] | undefined;
2893
3765
  expect?: {
3766
+ response?: unknown;
2894
3767
  isError?: string | boolean | string[] | undefined;
2895
3768
  schema?: string | undefined;
2896
3769
  snapshot?: string | undefined;
2897
- response?: unknown;
3770
+ toolsTriggered?: {
3771
+ calls: {
3772
+ name: string;
3773
+ required?: boolean | undefined;
3774
+ arguments?: Record<string, unknown> | undefined;
3775
+ }[];
3776
+ order?: "strict" | "any" | undefined;
3777
+ exclusive?: boolean | undefined;
3778
+ } | undefined;
3779
+ toolCallCount?: {
3780
+ exact?: number | undefined;
3781
+ min?: number | undefined;
3782
+ max?: number | undefined;
3783
+ } | undefined;
2898
3784
  containsText?: string | string[] | undefined;
2899
3785
  matchesPattern?: string | string[] | undefined;
2900
- snapshotSanitizers?: ("timestamp" | "uuid" | "iso-date" | "objectId" | "jwt" | {
3786
+ snapshotSanitizers?: ("uuid" | "jwt" | "timestamp" | "iso-date" | "objectId" | {
2901
3787
  pattern: string;
2902
3788
  replacement?: string | undefined;
2903
3789
  } | {
2904
3790
  remove: string[];
2905
3791
  })[] | undefined;
2906
3792
  passesJudge?: {
2907
- rubric: string;
3793
+ rubric: "correctness" | "completeness" | "groundedness" | "instruction-following" | "conciseness" | {
3794
+ text: string;
3795
+ };
3796
+ model?: string | undefined;
3797
+ maxTokens?: number | undefined;
3798
+ maxBudgetUsd?: number | undefined;
2908
3799
  reference?: unknown;
2909
3800
  threshold?: number | undefined;
2910
- configId?: string | undefined;
3801
+ reps?: number | undefined;
3802
+ provider?: "openai" | "anthropic" | "google" | undefined;
3803
+ apiKeyEnvVar?: string | undefined;
3804
+ temperature?: number | undefined;
3805
+ maxToolOutputSize?: number | undefined;
2911
3806
  } | undefined;
2912
3807
  responseSize?: {
2913
3808
  maxBytes?: number | undefined;
@@ -2917,37 +3812,65 @@ declare const EvalDatasetSchema: z.ZodObject<{
2917
3812
  }, {
2918
3813
  id: string;
2919
3814
  args?: Record<string, unknown> | undefined;
2920
- metadata?: Record<string, unknown> | undefined;
2921
3815
  mode?: "direct" | "llm_host" | undefined;
3816
+ metadata?: Record<string, unknown> | undefined;
2922
3817
  description?: string | undefined;
2923
3818
  toolName?: string | undefined;
2924
3819
  scenario?: string | undefined;
2925
3820
  llmHostConfig?: {
2926
- provider: "anthropic" | "openai";
3821
+ provider: "openai" | "anthropic" | "azure" | "google" | "mistral" | "deepseek" | "openrouter" | "xai" | "vertex-anthropic";
2927
3822
  model?: string | undefined;
2928
3823
  maxTokens?: number | undefined;
2929
3824
  apiKeyEnvVar?: string | undefined;
2930
3825
  temperature?: number | undefined;
2931
3826
  maxToolCalls?: number | undefined;
2932
3827
  } | undefined;
3828
+ iterations?: number | undefined;
3829
+ accuracyThreshold?: number | undefined;
3830
+ judgeReps?: number | undefined;
3831
+ canonicalAnswer?: string | undefined;
3832
+ tags?: string[] | undefined;
2933
3833
  expect?: {
3834
+ response?: unknown;
2934
3835
  isError?: string | boolean | string[] | undefined;
2935
3836
  schema?: string | undefined;
2936
3837
  snapshot?: string | undefined;
2937
- response?: unknown;
3838
+ toolsTriggered?: {
3839
+ calls: {
3840
+ name: string;
3841
+ required?: boolean | undefined;
3842
+ arguments?: Record<string, unknown> | undefined;
3843
+ }[];
3844
+ order?: "strict" | "any" | undefined;
3845
+ exclusive?: boolean | undefined;
3846
+ } | undefined;
3847
+ toolCallCount?: {
3848
+ exact?: number | undefined;
3849
+ min?: number | undefined;
3850
+ max?: number | undefined;
3851
+ } | undefined;
2938
3852
  containsText?: string | string[] | undefined;
2939
3853
  matchesPattern?: string | string[] | undefined;
2940
- snapshotSanitizers?: ("timestamp" | "uuid" | "iso-date" | "objectId" | "jwt" | {
3854
+ snapshotSanitizers?: ("uuid" | "jwt" | "timestamp" | "iso-date" | "objectId" | {
2941
3855
  pattern: string;
2942
3856
  replacement?: string | undefined;
2943
3857
  } | {
2944
3858
  remove: string[];
2945
3859
  })[] | undefined;
2946
3860
  passesJudge?: {
2947
- rubric: string;
3861
+ rubric: "correctness" | "completeness" | "groundedness" | "instruction-following" | "conciseness" | {
3862
+ text: string;
3863
+ };
3864
+ model?: string | undefined;
3865
+ maxTokens?: number | undefined;
3866
+ maxBudgetUsd?: number | undefined;
2948
3867
  reference?: unknown;
2949
3868
  threshold?: number | undefined;
2950
- configId?: string | undefined;
3869
+ reps?: number | undefined;
3870
+ provider?: "openai" | "anthropic" | "google" | undefined;
3871
+ apiKeyEnvVar?: string | undefined;
3872
+ temperature?: number | undefined;
3873
+ maxToolOutputSize?: number | undefined;
2951
3874
  } | undefined;
2952
3875
  responseSize?: {
2953
3876
  maxBytes?: number | undefined;
@@ -2961,37 +3884,65 @@ declare const EvalDatasetSchema: z.ZodObject<{
2961
3884
  cases: {
2962
3885
  id: string;
2963
3886
  args?: Record<string, unknown> | undefined;
2964
- metadata?: Record<string, unknown> | undefined;
2965
3887
  mode?: "direct" | "llm_host" | undefined;
3888
+ metadata?: Record<string, unknown> | undefined;
2966
3889
  description?: string | undefined;
2967
3890
  toolName?: string | undefined;
2968
3891
  scenario?: string | undefined;
2969
3892
  llmHostConfig?: {
2970
- provider: "anthropic" | "openai";
3893
+ provider: "openai" | "anthropic" | "azure" | "google" | "mistral" | "deepseek" | "openrouter" | "xai" | "vertex-anthropic";
2971
3894
  model?: string | undefined;
2972
3895
  maxTokens?: number | undefined;
2973
3896
  apiKeyEnvVar?: string | undefined;
2974
3897
  temperature?: number | undefined;
2975
3898
  maxToolCalls?: number | undefined;
2976
3899
  } | undefined;
3900
+ iterations?: number | undefined;
3901
+ accuracyThreshold?: number | undefined;
3902
+ judgeReps?: number | undefined;
3903
+ canonicalAnswer?: string | undefined;
3904
+ tags?: string[] | undefined;
2977
3905
  expect?: {
3906
+ response?: unknown;
2978
3907
  isError?: string | boolean | string[] | undefined;
2979
3908
  schema?: string | undefined;
2980
3909
  snapshot?: string | undefined;
2981
- response?: unknown;
3910
+ toolsTriggered?: {
3911
+ calls: {
3912
+ name: string;
3913
+ required?: boolean | undefined;
3914
+ arguments?: Record<string, unknown> | undefined;
3915
+ }[];
3916
+ order?: "strict" | "any" | undefined;
3917
+ exclusive?: boolean | undefined;
3918
+ } | undefined;
3919
+ toolCallCount?: {
3920
+ exact?: number | undefined;
3921
+ min?: number | undefined;
3922
+ max?: number | undefined;
3923
+ } | undefined;
2982
3924
  containsText?: string | string[] | undefined;
2983
3925
  matchesPattern?: string | string[] | undefined;
2984
- snapshotSanitizers?: ("timestamp" | "uuid" | "iso-date" | "objectId" | "jwt" | {
3926
+ snapshotSanitizers?: ("uuid" | "jwt" | "timestamp" | "iso-date" | "objectId" | {
2985
3927
  pattern: string;
2986
3928
  replacement?: string | undefined;
2987
3929
  } | {
2988
3930
  remove: string[];
2989
3931
  })[] | undefined;
2990
3932
  passesJudge?: {
2991
- rubric: string;
3933
+ rubric: "correctness" | "completeness" | "groundedness" | "instruction-following" | "conciseness" | {
3934
+ text: string;
3935
+ };
3936
+ model?: string | undefined;
3937
+ maxTokens?: number | undefined;
3938
+ maxBudgetUsd?: number | undefined;
2992
3939
  reference?: unknown;
2993
3940
  threshold?: number | undefined;
2994
- configId?: string | undefined;
3941
+ reps?: number | undefined;
3942
+ provider?: "openai" | "anthropic" | "google" | undefined;
3943
+ apiKeyEnvVar?: string | undefined;
3944
+ temperature?: number | undefined;
3945
+ maxToolOutputSize?: number | undefined;
2995
3946
  } | undefined;
2996
3947
  responseSize?: {
2997
3948
  maxBytes?: number | undefined;
@@ -3006,37 +3957,65 @@ declare const EvalDatasetSchema: z.ZodObject<{
3006
3957
  cases: {
3007
3958
  id: string;
3008
3959
  args?: Record<string, unknown> | undefined;
3009
- metadata?: Record<string, unknown> | undefined;
3010
3960
  mode?: "direct" | "llm_host" | undefined;
3961
+ metadata?: Record<string, unknown> | undefined;
3011
3962
  description?: string | undefined;
3012
3963
  toolName?: string | undefined;
3013
3964
  scenario?: string | undefined;
3014
3965
  llmHostConfig?: {
3015
- provider: "anthropic" | "openai";
3966
+ provider: "openai" | "anthropic" | "azure" | "google" | "mistral" | "deepseek" | "openrouter" | "xai" | "vertex-anthropic";
3016
3967
  model?: string | undefined;
3017
3968
  maxTokens?: number | undefined;
3018
3969
  apiKeyEnvVar?: string | undefined;
3019
3970
  temperature?: number | undefined;
3020
3971
  maxToolCalls?: number | undefined;
3021
3972
  } | undefined;
3973
+ iterations?: number | undefined;
3974
+ accuracyThreshold?: number | undefined;
3975
+ judgeReps?: number | undefined;
3976
+ canonicalAnswer?: string | undefined;
3977
+ tags?: string[] | undefined;
3022
3978
  expect?: {
3979
+ response?: unknown;
3023
3980
  isError?: string | boolean | string[] | undefined;
3024
3981
  schema?: string | undefined;
3025
3982
  snapshot?: string | undefined;
3026
- response?: unknown;
3983
+ toolsTriggered?: {
3984
+ calls: {
3985
+ name: string;
3986
+ required?: boolean | undefined;
3987
+ arguments?: Record<string, unknown> | undefined;
3988
+ }[];
3989
+ order?: "strict" | "any" | undefined;
3990
+ exclusive?: boolean | undefined;
3991
+ } | undefined;
3992
+ toolCallCount?: {
3993
+ exact?: number | undefined;
3994
+ min?: number | undefined;
3995
+ max?: number | undefined;
3996
+ } | undefined;
3027
3997
  containsText?: string | string[] | undefined;
3028
3998
  matchesPattern?: string | string[] | undefined;
3029
- snapshotSanitizers?: ("timestamp" | "uuid" | "iso-date" | "objectId" | "jwt" | {
3999
+ snapshotSanitizers?: ("uuid" | "jwt" | "timestamp" | "iso-date" | "objectId" | {
3030
4000
  pattern: string;
3031
4001
  replacement?: string | undefined;
3032
4002
  } | {
3033
4003
  remove: string[];
3034
4004
  })[] | undefined;
3035
4005
  passesJudge?: {
3036
- rubric: string;
4006
+ rubric: "correctness" | "completeness" | "groundedness" | "instruction-following" | "conciseness" | {
4007
+ text: string;
4008
+ };
4009
+ model?: string | undefined;
4010
+ maxTokens?: number | undefined;
4011
+ maxBudgetUsd?: number | undefined;
3037
4012
  reference?: unknown;
3038
4013
  threshold?: number | undefined;
3039
- configId?: string | undefined;
4014
+ reps?: number | undefined;
4015
+ provider?: "openai" | "anthropic" | "google" | undefined;
4016
+ apiKeyEnvVar?: string | undefined;
4017
+ temperature?: number | undefined;
4018
+ maxToolOutputSize?: number | undefined;
3040
4019
  } | undefined;
3041
4020
  responseSize?: {
3042
4021
  maxBytes?: number | undefined;
@@ -3126,29 +4105,126 @@ declare function loadEvalDataset(filePath: string, options?: LoadDatasetOptions)
3126
4105
  declare function loadEvalDatasetFromObject(data: unknown, options?: LoadDatasetOptions): EvalDataset;
3127
4106
 
3128
4107
  /**
3129
- * Context passed to the eval runner
4108
+ * Reporter-specific type definitions
4109
+ *
4110
+ * These types are used by the MCP reporter and UI.
4111
+ *
4112
+ * @packageDocumentation
3130
4113
  */
3131
- interface EvalContext {
4114
+
4115
+ /**
4116
+ * Experiment tracking metadata for an eval run
4117
+ */
4118
+ interface EvalRunMetadata {
4119
+ /** Git commit hash at time of run */
4120
+ gitHash?: string;
4121
+ /** ISO timestamp of the run */
4122
+ timestamp: string;
4123
+ /** Package version from package.json */
4124
+ packageVersion: string;
4125
+ /** LLM host model identifier (if llm_host mode) */
4126
+ llmHostModel?: string;
4127
+ /** Judge model identifier (if judge was used) */
4128
+ judgeModel?: string;
4129
+ }
4130
+ /**
4131
+ * Individual conformance check result
4132
+ */
4133
+ interface MCPConformanceCheck$1 {
3132
4134
  /**
3133
- * MCP fixture API for interacting with the server
4135
+ * Check name (e.g., 'server_info_present', 'list_tools_succeeds')
3134
4136
  */
3135
- mcp: MCPFixtureApi;
4137
+ name: string;
3136
4138
  /**
3137
- * Optional Playwright TestInfo for reporter integration
3138
- * When provided, eval results will be attached to the test for the MCP reporter
4139
+ * Whether the check passed
3139
4140
  */
3140
- testInfo?: TestInfo;
4141
+ pass: boolean;
3141
4142
  /**
3142
- * Optional Playwright expect function for snapshot testing
3143
- * Required for snapshot expectations to work properly
4143
+ * Human-readable message describing the result
3144
4144
  */
3145
- expect?: Expect;
4145
+ message: string;
3146
4146
  }
3147
-
3148
4147
  /**
3149
- * Result of a single eval case
4148
+ * Conformance check result as stored in reporter data
3150
4149
  */
3151
- interface EvalCaseResult$1 {
4150
+ interface MCPConformanceResultData {
4151
+ /**
4152
+ * Test title where conformance check was run
4153
+ */
4154
+ testTitle: string;
4155
+ /**
4156
+ * Whether all checks passed
4157
+ */
4158
+ pass: boolean;
4159
+ /**
4160
+ * Individual check results
4161
+ */
4162
+ checks: MCPConformanceCheck$1[];
4163
+ /**
4164
+ * Server info if available
4165
+ */
4166
+ serverInfo?: {
4167
+ name?: string;
4168
+ version?: string;
4169
+ };
4170
+ /**
4171
+ * Number of tools discovered
4172
+ */
4173
+ toolCount: number;
4174
+ /**
4175
+ * Auth type used for this check
4176
+ */
4177
+ authType?: AuthType;
4178
+ /**
4179
+ * Project name
4180
+ */
4181
+ project?: string;
4182
+ }
4183
+ /**
4184
+ * Server capabilities data from mcp-list-tools attachment
4185
+ */
4186
+ interface MCPServerCapabilitiesData {
4187
+ /**
4188
+ * Test title where listTools was called
4189
+ */
4190
+ testTitle: string;
4191
+ /**
4192
+ * List of tools available on the server
4193
+ */
4194
+ tools: Array<{
4195
+ name: string;
4196
+ description?: string;
4197
+ }>;
4198
+ /**
4199
+ * Total number of tools
4200
+ */
4201
+ toolCount: number;
4202
+ /**
4203
+ * Auth type used for this test
4204
+ */
4205
+ authType?: AuthType;
4206
+ /**
4207
+ * Project name
4208
+ */
4209
+ project?: string;
4210
+ }
4211
+ /**
4212
+ * Result of a single iteration within a multi-iteration eval case
4213
+ */
4214
+ interface IterationResult {
4215
+ /** Whether this iteration passed */
4216
+ pass: boolean;
4217
+ /** Execution time for this iteration */
4218
+ durationMs: number;
4219
+ /** Error message if the iteration failed with an exception */
4220
+ error?: string;
4221
+ /** When true, this iteration failed due to network/infrastructure issues rather than an assertion failure */
4222
+ isInfrastructureError?: boolean;
4223
+ }
4224
+ /**
4225
+ * Result of a single eval case
4226
+ */
4227
+ interface EvalCaseResult {
3152
4228
  /**
3153
4229
  * Case ID
3154
4230
  */
@@ -3161,15 +4237,8 @@ interface EvalCaseResult$1 {
3161
4237
  * MCP tool name that was called
3162
4238
  */
3163
4239
  toolName: string;
3164
- /**
3165
- * Evaluation mode (direct or llm_host)
3166
- * @deprecated Mode is inferred from test context, not displayed in reports
3167
- */
3168
- mode?: 'direct' | 'llm_host';
3169
4240
  /**
3170
4241
  * Source of this result
3171
- * - 'eval': From runEvalDataset() using JSON eval datasets
3172
- * - 'test': From direct API test tracking (MCP fixture calls)
3173
4242
  */
3174
4243
  source: ResultSource;
3175
4244
  /**
@@ -3194,14 +4263,164 @@ interface EvalCaseResult$1 {
3194
4263
  authType?: AuthType;
3195
4264
  /**
3196
4265
  * Playwright project name this test belongs to
3197
- * Used for filtering/grouping results by project in the reporter
3198
4266
  */
3199
4267
  project?: string;
3200
4268
  /**
3201
4269
  * Execution time in milliseconds
3202
4270
  */
3203
4271
  durationMs: number;
4272
+ /**
4273
+ * Assertion pass rate (0–1): passes divided by non-infrastructure iterations.
4274
+ * Only present when the case was run with `iterations > 1`.
4275
+ *
4276
+ * Infrastructure errors (network timeouts, rate limits, etc.) are excluded from
4277
+ * the denominator so that environment unreliability does not skew this metric.
4278
+ */
4279
+ assertionPassRate?: number;
4280
+ /**
4281
+ * Infrastructure error rate (0–1): infra errors divided by total iterations.
4282
+ * Only present when the case was run with `iterations > 1`.
4283
+ */
4284
+ infrastructureErrorRate?: number;
4285
+ /**
4286
+ * Accuracy score (0–1) across non-infrastructure iterations.
4287
+ * Alias for `assertionPassRate`. Only present when the case was run with `iterations > 1`.
4288
+ * @deprecated Use `assertionPassRate` for clarity; this field is kept for backward compatibility.
4289
+ */
4290
+ accuracy?: number;
4291
+ /**
4292
+ * Per-iteration pass/fail breakdown.
4293
+ * Only present when the case was run with `iterations > 1`.
4294
+ */
4295
+ iterationResults?: Array<IterationResult>;
4296
+ /**
4297
+ * Tags from the source eval case, for filtering and slicing reports.
4298
+ */
4299
+ tags?: string[];
4300
+ /**
4301
+ * Precision of tool calls made (0–1).
4302
+ * 1.0 means every tool called was expected; <1.0 means unexpected tools were called.
4303
+ * Only populated when `exclusive: true` is set in `toolsTriggered` and the expectation was evaluated.
4304
+ */
4305
+ toolPrecision?: number;
4306
+ /**
4307
+ * Recall of required tool calls (0–1).
4308
+ * 1.0 means all required tools were called; <1.0 means some were missed.
4309
+ * Only populated when the `toolsTriggered` expectation was evaluated.
4310
+ */
4311
+ toolRecall?: number;
4312
+ /**
4313
+ * Pass/fail status of this case in the baseline run.
4314
+ * Only present when a baseline was provided to runEvalDataset.
4315
+ */
4316
+ baselinePass?: boolean;
4317
+ /**
4318
+ * Number of iterations that failed due to infrastructure errors (network, rate limits, etc.).
4319
+ * Only present when the case was run with `iterations > 1`.
4320
+ */
4321
+ infrastructureErrorCount?: number;
4322
+ }
4323
+ /**
4324
+ * Aggregated MCP eval run data
4325
+ */
4326
+ interface MCPEvalRunData {
4327
+ /**
4328
+ * Run timestamp (ISO 8601)
4329
+ */
4330
+ timestamp: string;
4331
+ /**
4332
+ * Total duration in milliseconds
4333
+ */
4334
+ durationMs: number;
4335
+ /**
4336
+ * Environment info
4337
+ */
4338
+ environment: {
4339
+ ci: boolean;
4340
+ node: string;
4341
+ platform: string;
4342
+ };
4343
+ /**
4344
+ * Aggregate metrics
4345
+ */
4346
+ metrics: {
4347
+ /**
4348
+ * Total number of eval cases
4349
+ */
4350
+ total: number;
4351
+ /**
4352
+ * Number of passed cases
4353
+ */
4354
+ passed: number;
4355
+ /**
4356
+ * Number of failed cases
4357
+ */
4358
+ failed: number;
4359
+ /**
4360
+ * Pass rate (0-1)
4361
+ */
4362
+ passRate: number;
4363
+ /**
4364
+ * Dataset breakdown: dataset name -> count
4365
+ */
4366
+ datasetBreakdown: Record<string, number>;
4367
+ /**
4368
+ * Expectation type breakdown
4369
+ */
4370
+ expectationBreakdown: ExpectationBreakdown;
4371
+ };
4372
+ /**
4373
+ * All eval results from this run
4374
+ */
4375
+ results: EvalCaseResult[];
4376
+ /**
4377
+ * Conformance check results (optional)
4378
+ */
4379
+ conformanceChecks?: MCPConformanceResultData[];
4380
+ /**
4381
+ * Server capabilities discovered via listTools (optional)
4382
+ */
4383
+ serverCapabilities?: MCPServerCapabilitiesData[];
4384
+ }
4385
+ /**
4386
+ * Historical summary for trend charts
4387
+ */
4388
+ interface MCPEvalHistoricalSummary {
4389
+ timestamp: string;
4390
+ total: number;
4391
+ passed: number;
4392
+ failed: number;
4393
+ passRate: number;
4394
+ durationMs: number;
4395
+ }
4396
+ /**
4397
+ * Complete data structure passed to UI
4398
+ */
4399
+ interface MCPEvalData {
4400
+ runData: MCPEvalRunData;
4401
+ historical: MCPEvalHistoricalSummary[];
3204
4402
  }
4403
+
4404
+ /**
4405
+ * Context passed to the eval runner
4406
+ */
4407
+ interface EvalContext {
4408
+ /**
4409
+ * MCP fixture API for interacting with the server
4410
+ */
4411
+ mcp: MCPFixtureApi;
4412
+ /**
4413
+ * Optional Playwright TestInfo for reporter integration
4414
+ * When provided, eval results will be attached to the test for the MCP reporter
4415
+ */
4416
+ testInfo?: TestInfo;
4417
+ /**
4418
+ * Optional Playwright expect function for snapshot testing
4419
+ * Required for snapshot expectations to work properly
4420
+ */
4421
+ expect?: Expect;
4422
+ }
4423
+
3205
4424
  /**
3206
4425
  * Overall result of running an eval dataset
3207
4426
  */
@@ -3221,11 +4440,48 @@ interface EvalRunnerResult {
3221
4440
  /**
3222
4441
  * Individual case results
3223
4442
  */
3224
- caseResults: Array<EvalCaseResult$1>;
4443
+ caseResults: Array<EvalCaseResult>;
3225
4444
  /**
3226
4445
  * Overall execution time in milliseconds
3227
4446
  */
3228
4447
  durationMs: number;
4448
+ /**
4449
+ * Difference between current pass rate and baseline pass rate.
4450
+ * Positive = improvement, negative = regression.
4451
+ * Only present when `baselineResultsFrom` was provided.
4452
+ */
4453
+ deltaPassRate?: number;
4454
+ /**
4455
+ * Number of cases that regressed: passed in baseline, failed now.
4456
+ * Only present when `baselineResultsFrom` was provided.
4457
+ */
4458
+ regressions?: number;
4459
+ /**
4460
+ * Number of cases that improved: failed in baseline, passed now.
4461
+ * Only present when `baselineResultsFrom` was provided.
4462
+ */
4463
+ improvements?: number;
4464
+ /**
4465
+ * Average tool precision across all llm_host cases that have a
4466
+ * `toolsTriggered` expectation (precision = fraction of called tools
4467
+ * that were expected). Only present when at least one such case ran.
4468
+ */
4469
+ datasetToolPrecision?: number;
4470
+ /**
4471
+ * Average tool recall across all llm_host cases that have a
4472
+ * `toolsTriggered` expectation (recall = fraction of required tools
4473
+ * that were actually called). Only present when at least one such case ran.
4474
+ */
4475
+ datasetToolRecall?: number;
4476
+ /**
4477
+ * Harmonic mean of `datasetToolPrecision` and `datasetToolRecall`.
4478
+ * Only present when at least one case contributes precision/recall data.
4479
+ */
4480
+ datasetToolF1?: number;
4481
+ /**
4482
+ * Experiment tracking metadata captured at run time.
4483
+ */
4484
+ metadata?: EvalRunMetadata;
3229
4485
  }
3230
4486
  /**
3231
4487
  * Options for running eval dataset
@@ -3251,12 +4507,6 @@ interface EvalRunnerOptions {
3251
4507
  * ```
3252
4508
  */
3253
4509
  schemas?: Record<string, ZodType>;
3254
- /**
3255
- * Judge configuration registry by ID
3256
- *
3257
- * Maps config IDs to JudgeConfig for use with expect.passesJudge.configId
3258
- */
3259
- judgeConfigs?: Record<string, JudgeConfig>;
3260
4510
  /**
3261
4511
  * Whether to stop on first failure
3262
4512
  * @default false
@@ -3265,7 +4515,71 @@ interface EvalRunnerOptions {
3265
4515
  /**
3266
4516
  * Optional callback called after each case
3267
4517
  */
3268
- onCaseComplete?: (result: EvalCaseResult$1) => void | Promise<void>;
4518
+ onCaseComplete?: (result: EvalCaseResult) => void | Promise<void>;
4519
+ /**
4520
+ * Maximum number of eval cases to run concurrently.
4521
+ * When > 1, cases run in parallel (ignores stopOnFailure ordering).
4522
+ * @default 1 (sequential)
4523
+ */
4524
+ concurrency?: number;
4525
+ /**
4526
+ * Default iteration count for `llm_host` mode cases that do not specify
4527
+ * `iterations` explicitly. Has no effect on `direct` mode cases (which are
4528
+ * deterministic and always default to 1 iteration).
4529
+ *
4530
+ * Set to 10 for standard runs or 20 for release gates. Individual cases can
4531
+ * still override this with their own `iterations` field.
4532
+ *
4533
+ * @default 1 (preserves historical behaviour when not set)
4534
+ *
4535
+ * @example
4536
+ * ```typescript
4537
+ * // Run all llm_host cases 10 times each by default
4538
+ * await runEvalDataset({ dataset, defaultLlmIterations: 10 }, { mcp });
4539
+ * ```
4540
+ */
4541
+ defaultLlmIterations?: number;
4542
+ /**
4543
+ * Default number of judge evaluations for cases that do not specify
4544
+ * `judgeReps` explicitly. Applies to any case with a `passesJudge`
4545
+ * expectation. Per-case `judgeReps` overrides this.
4546
+ *
4547
+ * @default 1 (single judge run)
4548
+ */
4549
+ defaultJudgeReps?: number;
4550
+ /**
4551
+ * When set, only eval cases whose `tags` array contains at least one of
4552
+ * the specified tags are run. Cases without a `tags` field are excluded.
4553
+ * When undefined or empty, all cases run (default behavior).
4554
+ */
4555
+ filterTags?: string[];
4556
+ /**
4557
+ * If set, saves the run results to this file path after completion.
4558
+ * Use with `baselineResultsFrom` on the next run for regression detection.
4559
+ *
4560
+ * @example '.mcp-test-results/baseline.json'
4561
+ */
4562
+ saveResultsTo?: string;
4563
+ /**
4564
+ * If set, loads this file as the baseline and computes delta metrics vs the current run.
4565
+ * Populates `EvalRunnerResult.deltaPassRate`, `.regressions`, `.improvements`,
4566
+ * and sets `baselinePass` on each `EvalCaseResult`.
4567
+ */
4568
+ baselineResultsFrom?: string;
4569
+ /**
4570
+ * LLM host model identifier to record in run metadata.
4571
+ * Use this to identify which model was used when running llm_host cases.
4572
+ *
4573
+ * @example 'claude-opus-4-20250514'
4574
+ */
4575
+ llmHostModel?: string;
4576
+ /**
4577
+ * Judge model identifier to record in run metadata.
4578
+ * Use this to identify which model was used for judge evaluations.
4579
+ *
4580
+ * @example 'claude-sonnet-4-20250514'
4581
+ */
4582
+ judgeModel?: string;
3269
4583
  }
3270
4584
  /**
3271
4585
  * Options for running a single eval case
@@ -3279,17 +4593,14 @@ interface EvalCaseOptions {
3279
4593
  * Schema registry for schema validation by name
3280
4594
  */
3281
4595
  schemas?: Record<string, ZodType>;
3282
- /**
3283
- * Judge configuration registry by ID
3284
- */
3285
- judgeConfigs?: Record<string, JudgeConfig>;
3286
4596
  }
3287
4597
  /**
3288
- * Runs a single eval case and returns the result
4598
+ * Runs a single eval case and returns the result.
4599
+ * When `evalCase.iterations > 1`, runs the case N times and returns accuracy.
3289
4600
  *
3290
4601
  * @param evalCase - The eval case to run
3291
4602
  * @param context - Context containing mcp, testInfo, expect
3292
- * @param options - Optional configuration (datasetName, schemas, judgeConfigs)
4603
+ * @param options - Optional configuration (datasetName, schemas)
3293
4604
  * @returns The result of running the eval case
3294
4605
  *
3295
4606
  * @example
@@ -3303,131 +4614,165 @@ interface EvalCaseOptions {
3303
4614
  * expect(result.pass).toBe(true);
3304
4615
  * ```
3305
4616
  */
3306
- declare function runEvalCase(evalCase: EvalCase, context: EvalContext, options?: EvalCaseOptions): Promise<EvalCaseResult$1>;
4617
+ declare function runEvalCase(evalCase: EvalCase, context: EvalContext, options?: EvalCaseOptions): Promise<EvalCaseResult>;
4618
+ declare function runEvalDataset(options: EvalRunnerOptions, context: EvalContext): Promise<EvalRunnerResult>;
4619
+
3307
4620
  /**
3308
- * Runs an eval dataset against an MCP server
4621
+ * Saves eval results to a JSON file for use as a baseline in future runs.
3309
4622
  *
3310
- * This function composes runEvalCase() for each case in the dataset,
3311
- * adding dataset-level features like stopOnFailure and callbacks.
4623
+ * @param result - The eval run result to save
4624
+ * @param filePath - Path to write the JSON file (parent dirs created automatically)
4625
+ */
4626
+ declare function saveBaseline(result: EvalRunnerResult, filePath: string): Promise<void>;
4627
+ /**
4628
+ * Loads a previously saved baseline from a JSON file.
3312
4629
  *
3313
- * @param options - Eval runner options (dataset, schemas, judgeConfigs)
3314
- * @param context - Eval context (mcp fixture, optional testInfo, optional expect)
3315
- * @returns Eval results
4630
+ * @param filePath - Path to the JSON file written by saveBaseline
4631
+ * @returns The saved EvalRunnerResult
4632
+ * @throws If the file cannot be read or parsed
4633
+ */
4634
+ declare function loadBaseline(filePath: string): Promise<EvalRunnerResult>;
4635
+
4636
+ /** Outcome of comparing two servers on a single eval case. */
4637
+ type ComparisonOutcome = 'A_WINS' | 'B_WINS' | 'TIE' | 'BOTH_FAIL';
4638
+ /** Result of comparing a single eval case across two servers. */
4639
+ interface CaseComparisonResult {
4640
+ /** Case ID */
4641
+ id: string;
4642
+ /** Comparison outcome */
4643
+ outcome: ComparisonOutcome;
4644
+ /** Result from server A */
4645
+ serverA: EvalCaseResult;
4646
+ /** Result from server B */
4647
+ serverB: EvalCaseResult;
4648
+ }
4649
+ /** Aggregated result of running a dataset against two servers. */
4650
+ interface ServerComparisonResult {
4651
+ /** Dataset name */
4652
+ dataset: string;
4653
+ /** Total cases compared (cases present in both runs) */
4654
+ total: number;
4655
+ /** Cases where server A passed and server B failed */
4656
+ aWins: number;
4657
+ /** Cases where server B passed and server A failed */
4658
+ bWins: number;
4659
+ /** Cases where both passed */
4660
+ ties: number;
4661
+ /** Cases where both failed */
4662
+ bothFail: number;
4663
+ /** Raw count of cases where both servers failed (same as bothFail) */
4664
+ bothFailCount: number;
4665
+ /** Cases with a decisive outcome (aWins + bWins + ties, excludes BOTH_FAIL) */
4666
+ decidedCases: number;
4667
+ /** Fraction of total cases where both servers failed (bothFail / total) */
4668
+ failureAlignment: number;
4669
+ /** A win rate (aWins / decidedCases, excludes BOTH_FAIL) */
4670
+ aWinRate: number;
4671
+ /** B win rate (bWins / decidedCases, excludes BOTH_FAIL) */
4672
+ bWinRate: number;
4673
+ /** Tie rate (ties / decidedCases, excludes BOTH_FAIL) */
4674
+ tieRate: number;
4675
+ /** Per-case comparison results */
4676
+ cases: CaseComparisonResult[];
4677
+ /** Full result from server A */
4678
+ serverAResult: EvalRunnerResult;
4679
+ /** Full result from server B */
4680
+ serverBResult: EvalRunnerResult;
4681
+ /** Total duration in milliseconds */
4682
+ durationMs: number;
4683
+ }
4684
+ /**
4685
+ * Options for `runServerComparison`.
4686
+ * Same as `EvalRunnerOptions` without baseline-specific fields.
4687
+ */
4688
+ type ServerComparisonOptions = Omit<EvalRunnerOptions, 'saveResultsTo' | 'baselineResultsFrom'>;
4689
+ /**
4690
+ * Runs the same eval dataset against two MCP servers in parallel and
4691
+ * returns a detailed per-case comparison of results.
3316
4692
  *
3317
- * @example
3318
- * // Basic usage
3319
- * const result = await runEvalDataset(
3320
- * {
3321
- * dataset,
3322
- * schemas: { WeatherResponse: WeatherSchema },
3323
- * },
3324
- * { mcp }
3325
- * );
4693
+ * Both servers receive identical cases and options. The comparison uses
4694
+ * simple pass/fail per case: A_WINS means A passed and B failed, etc.
4695
+ *
4696
+ * @param options - Eval dataset and runner options (shared between both servers)
4697
+ * @param contextA - MCP context for server A (e.g., Glean MCP)
4698
+ * @param contextB - MCP context for server B (e.g., native MCP)
4699
+ * @returns Comparison result with per-case outcomes and aggregate win rates
3326
4700
  *
3327
4701
  * @example
3328
- * // With MCP reporter integration
3329
- * test('eval dataset', async ({ mcp }, testInfo) => {
3330
- * const result = await runEvalDataset(
3331
- * { dataset },
3332
- * { mcp, testInfo } // testInfo enables MCP reporter
3333
- * );
3334
- * });
4702
+ * ```typescript
4703
+ * const comparison = await runServerComparison(
4704
+ * { dataset },
4705
+ * { mcp: gleanMcpFixture },
4706
+ * { mcp: nativeMcpFixture }
4707
+ * );
4708
+ * console.log(`Glean MCP wins: ${(comparison.aWinRate * 100).toFixed(1)}%`);
4709
+ * console.log(`Native MCP wins: ${(comparison.bWinRate * 100).toFixed(1)}%`);
4710
+ * ```
3335
4711
  */
3336
- declare function runEvalDataset(options: EvalRunnerOptions, context: EvalContext): Promise<EvalRunnerResult>;
4712
+ declare function runServerComparison(options: ServerComparisonOptions, contextA: EvalContext, contextB: EvalContext): Promise<ServerComparisonResult>;
3337
4713
 
3338
4714
  /**
3339
4715
  * LLM Host Simulation - Main entry point
3340
4716
  *
3341
- * Provides the public API for simulating LLM hosts interacting
3342
- * with MCP servers through actual LLM providers.
4717
+ * All providers (openai, anthropic, google, azure, mistral, deepseek,
4718
+ * openrouter, xai) run through the Vercel AI SDK orchestrator, which uses
4719
+ * generateText + stopWhen for a uniform multi-turn tool-calling loop with
4720
+ * built-in latency decomposition.
4721
+ *
4722
+ * Required packages per provider:
4723
+ * openai → npm install ai @ai-sdk/openai
4724
+ * anthropic → npm install ai @ai-sdk/anthropic
4725
+ * google → npm install ai @ai-sdk/google
4726
+ * azure → npm install ai @ai-sdk/azure
4727
+ * mistral → npm install ai @ai-sdk/mistral
4728
+ * deepseek → npm install ai @ai-sdk/deepseek
4729
+ * openrouter → npm install ai @openrouter/ai-sdk-provider
4730
+ * xai → npm install ai @ai-sdk/xai
3343
4731
  */
3344
4732
 
3345
4733
  /**
3346
- * Simulates an LLM host interacting with an MCP server
4734
+ * Simulates an LLM host interacting with an MCP server.
4735
+ *
4736
+ * The LLM chooses which tools to call based solely on their descriptions and
4737
+ * schemas, testing discoverability and parameter clarity at the level a real
4738
+ * user (via Claude Desktop, ChatGPT, etc.) would experience.
3347
4739
  *
3348
- * This function uses actual LLM providers (OpenAI or Anthropic) to test
3349
- * MCP servers through natural language scenarios. The LLM chooses which
3350
- * tools to call based on their descriptions, testing discoverability and
3351
- * parameter clarity.
4740
+ * All providers run through the Vercel AI SDK's generateText with stopWhen,
4741
+ * which handles multi-turn tool calling natively and provides per-step latency
4742
+ * decomposition (llmDurationMs vs. mcpDurationMs).
3352
4743
  *
3353
4744
  * @param mcp - MCP fixture API
3354
- * @param scenario - Natural language prompt describing what to do
3355
- * @param config - LLM host configuration
3356
- * @returns Simulation result with tool calls and final response
4745
+ * @param scenario - Natural language prompt describing what the LLM should do
4746
+ * @param config - LLM host configuration (provider, model, temperature, etc.)
4747
+ * @returns Simulation result with tool calls, final response, and latency data
3357
4748
  *
3358
4749
  * @example
3359
4750
  * ```typescript
3360
4751
  * const result = await simulateLLMHost(mcp,
3361
- * "Get the weather for London",
3362
- * {
3363
- * provider: 'openai',
3364
- * model: 'gpt-4o'
3365
- * }
4752
+ * "Find recent documents about MCP testing frameworks",
4753
+ * { provider: 'anthropic', model: 'claude-3-5-sonnet-20241022' }
3366
4754
  * );
3367
4755
  *
3368
4756
  * expect(result.success).toBe(true);
3369
- * expect(result.toolCalls).toContainEqual({
3370
- * name: 'get_weather',
3371
- * arguments: { city: 'London' }
3372
- * });
4757
+ * expect(result.toolCalls.map(c => c.name)).toContain('search');
3373
4758
  * ```
3374
4759
  */
3375
4760
  declare function simulateLLMHost(mcp: MCPFixtureApi, scenario: string, config: LLMHostConfig): Promise<LLMHostSimulationResult>;
3376
4761
  /**
3377
- * Checks if the required SDK is available for a given provider
4762
+ * Returns true if the given provider is supported.
3378
4763
  *
3379
- * This performs a quick check without actually loading the SDK.
3380
- * The actual SDK loading happens in the adapter when simulation runs.
3381
- *
3382
- * @param provider - LLM provider to check
3383
- * @returns true if an adapter is registered for the provider
4764
+ * Note: this does not check whether the required @ai-sdk/* package is
4765
+ * installed — that is validated at simulation time with a helpful error.
3384
4766
  */
3385
4767
  declare function isProviderAvailable(provider: LLMProvider): boolean;
3386
4768
  /**
3387
- * Gets a helpful error message for missing dependencies
4769
+ * Returns a human-readable installation message for a given provider.
3388
4770
  *
3389
- * @param provider - LLM provider
3390
- * @returns Error message with installation instructions
4771
+ * @remarks This is a diagnostic utility for checking whether optional
4772
+ * @ai-sdk/* packages are installed. Not part of the primary usage path.
3391
4773
  */
3392
4774
  declare function getMissingDependencyMessage(provider: LLMProvider): string;
3393
4775
 
3394
- /**
3395
- * Tool call validator for LLM host mode
3396
- *
3397
- * Validates that the LLM made the expected tool calls with correct arguments
3398
- */
3399
-
3400
- /**
3401
- * Tool call validation function signature
3402
- */
3403
- type ToolCallValidator = (evalCase: EvalCase, response: unknown) => Promise<EvalExpectationResult>;
3404
- /**
3405
- * Creates a tool call validator for LLM host mode
3406
- *
3407
- * Validates that the LLM made the expected tool calls with correct arguments.
3408
- * Supports partial argument matching and optional calls.
3409
- *
3410
- * @returns Validator function
3411
- *
3412
- * @example
3413
- * ```typescript
3414
- * // In your eval case:
3415
- * {
3416
- * "id": "weather-london",
3417
- * "mode": "llm_host",
3418
- * "scenario": "Get the weather for London",
3419
- * "expectedToolCalls": [
3420
- * {
3421
- * "name": "get_weather",
3422
- * "arguments": { "city": "London" },
3423
- * "required": true
3424
- * }
3425
- * ]
3426
- * }
3427
- * ```
3428
- */
3429
- declare function createToolCallValidator(): ToolCallValidator;
3430
-
3431
4776
  /**
3432
4777
  * Creates an LLM judge for evaluating tool responses
3433
4778
  *
@@ -3494,7 +4839,7 @@ interface MCPConformanceOptions {
3494
4839
  /**
3495
4840
  * Individual check result
3496
4841
  */
3497
- interface MCPConformanceCheck$1 {
4842
+ interface MCPConformanceCheck {
3498
4843
  name: string;
3499
4844
  pass: boolean;
3500
4845
  message: string;
@@ -3539,7 +4884,7 @@ interface MCPConformanceResult {
3539
4884
  /**
3540
4885
  * List of check results
3541
4886
  */
3542
- checks: MCPConformanceCheck$1[];
4887
+ checks: MCPConformanceCheck[];
3543
4888
  /**
3544
4889
  * Raw MCP responses for snapshotting
3545
4890
  *
@@ -3588,229 +4933,6 @@ interface MCPConformanceResult {
3588
4933
  */
3589
4934
  declare function runConformanceChecks(mcp: MCPFixtureApi, options?: MCPConformanceOptions, testInfo?: TestInfo): Promise<MCPConformanceResult>;
3590
4935
 
3591
- /**
3592
- * Reporter-specific type definitions
3593
- *
3594
- * These types are used by the MCP reporter and UI.
3595
- *
3596
- * @packageDocumentation
3597
- */
3598
-
3599
- /**
3600
- * Individual conformance check result
3601
- */
3602
- interface MCPConformanceCheck {
3603
- /**
3604
- * Check name (e.g., 'server_info_present', 'list_tools_succeeds')
3605
- */
3606
- name: string;
3607
- /**
3608
- * Whether the check passed
3609
- */
3610
- pass: boolean;
3611
- /**
3612
- * Human-readable message describing the result
3613
- */
3614
- message: string;
3615
- }
3616
- /**
3617
- * Conformance check result as stored in reporter data
3618
- */
3619
- interface MCPConformanceResultData {
3620
- /**
3621
- * Test title where conformance check was run
3622
- */
3623
- testTitle: string;
3624
- /**
3625
- * Whether all checks passed
3626
- */
3627
- pass: boolean;
3628
- /**
3629
- * Individual check results
3630
- */
3631
- checks: MCPConformanceCheck[];
3632
- /**
3633
- * Server info if available
3634
- */
3635
- serverInfo?: {
3636
- name?: string;
3637
- version?: string;
3638
- };
3639
- /**
3640
- * Number of tools discovered
3641
- */
3642
- toolCount: number;
3643
- /**
3644
- * Auth type used for this check
3645
- */
3646
- authType?: AuthType;
3647
- /**
3648
- * Project name
3649
- */
3650
- project?: string;
3651
- }
3652
- /**
3653
- * Server capabilities data from mcp-list-tools attachment
3654
- */
3655
- interface MCPServerCapabilitiesData {
3656
- /**
3657
- * Test title where listTools was called
3658
- */
3659
- testTitle: string;
3660
- /**
3661
- * List of tools available on the server
3662
- */
3663
- tools: Array<{
3664
- name: string;
3665
- description?: string;
3666
- }>;
3667
- /**
3668
- * Total number of tools
3669
- */
3670
- toolCount: number;
3671
- /**
3672
- * Auth type used for this test
3673
- */
3674
- authType?: AuthType;
3675
- /**
3676
- * Project name
3677
- */
3678
- project?: string;
3679
- }
3680
- /**
3681
- * Result of a single eval case
3682
- */
3683
- interface EvalCaseResult {
3684
- /**
3685
- * Case ID
3686
- */
3687
- id: string;
3688
- /**
3689
- * Dataset name this case belongs to
3690
- */
3691
- datasetName: string;
3692
- /**
3693
- * MCP tool name that was called
3694
- */
3695
- toolName: string;
3696
- /**
3697
- * Source of this result
3698
- */
3699
- source: ResultSource;
3700
- /**
3701
- * Overall pass/fail status
3702
- */
3703
- pass: boolean;
3704
- /**
3705
- * Tool response
3706
- */
3707
- response?: unknown;
3708
- /**
3709
- * Error if tool call failed
3710
- */
3711
- error?: string;
3712
- /**
3713
- * Expectation results
3714
- */
3715
- expectations: Partial<Record<ExpectationType, EvalExpectationResult>>;
3716
- /**
3717
- * Authentication type used for this test
3718
- */
3719
- authType?: AuthType;
3720
- /**
3721
- * Playwright project name this test belongs to
3722
- */
3723
- project?: string;
3724
- /**
3725
- * Execution time in milliseconds
3726
- */
3727
- durationMs: number;
3728
- /**
3729
- * @deprecated Mode is inferred from test context, not displayed in reports
3730
- */
3731
- mode?: 'direct' | 'llm_host';
3732
- }
3733
- /**
3734
- * Aggregated MCP eval run data
3735
- */
3736
- interface MCPEvalRunData {
3737
- /**
3738
- * Run timestamp (ISO 8601)
3739
- */
3740
- timestamp: string;
3741
- /**
3742
- * Total duration in milliseconds
3743
- */
3744
- durationMs: number;
3745
- /**
3746
- * Environment info
3747
- */
3748
- environment: {
3749
- ci: boolean;
3750
- node: string;
3751
- platform: string;
3752
- };
3753
- /**
3754
- * Aggregate metrics
3755
- */
3756
- metrics: {
3757
- /**
3758
- * Total number of eval cases
3759
- */
3760
- total: number;
3761
- /**
3762
- * Number of passed cases
3763
- */
3764
- passed: number;
3765
- /**
3766
- * Number of failed cases
3767
- */
3768
- failed: number;
3769
- /**
3770
- * Pass rate (0-1)
3771
- */
3772
- passRate: number;
3773
- /**
3774
- * Dataset breakdown: dataset name -> count
3775
- */
3776
- datasetBreakdown: Record<string, number>;
3777
- /**
3778
- * Expectation type breakdown
3779
- */
3780
- expectationBreakdown: ExpectationBreakdown;
3781
- };
3782
- /**
3783
- * All eval results from this run
3784
- */
3785
- results: EvalCaseResult[];
3786
- /**
3787
- * Conformance check results (optional)
3788
- */
3789
- conformanceChecks?: MCPConformanceResultData[];
3790
- /**
3791
- * Server capabilities discovered via listTools (optional)
3792
- */
3793
- serverCapabilities?: MCPServerCapabilitiesData[];
3794
- }
3795
- /**
3796
- * Historical summary for trend charts
3797
- */
3798
- interface MCPEvalHistoricalSummary {
3799
- timestamp: string;
3800
- total: number;
3801
- passed: number;
3802
- failed: number;
3803
- passRate: number;
3804
- durationMs: number;
3805
- }
3806
- /**
3807
- * Complete data structure passed to UI
3808
- */
3809
- interface MCPEvalData {
3810
- runData: MCPEvalRunData;
3811
- historical: MCPEvalHistoricalSummary[];
3812
- }
3813
-
3814
4936
  /**
3815
4937
  * Reporter types - re-exported from canonical source
3816
4938
  *
@@ -3831,7 +4953,7 @@ interface MCPEvalReporterConfig {
3831
4953
  outputDir?: string;
3832
4954
  /**
3833
4955
  * Auto-open report in browser after test run
3834
- * @default true (disabled in CI)
4956
+ * @default false
3835
4957
  */
3836
4958
  autoOpen?: boolean;
3837
4959
  /**
@@ -3854,4 +4976,4 @@ interface MCPEvalReporterConfig {
3854
4976
  includeAutoTracking?: boolean;
3855
4977
  }
3856
4978
 
3857
- export { type AuthType, type BuiltInSanitizer, CLIOAuthClient, type CLIOAuthClientConfig, type CLIOAuthResult, type ContentBlock, type CreateMCPClientOptions, DiscoveryError, ENV_VAR_NAMES, type EvalCase, type EvalCaseResult$1 as EvalCaseResult, EvalCaseSchema, type EvalContext, type EvalDataset, EvalDatasetSchema, type EvalExpectBlock, type EvalExpectationResult, type EvalMode, type EvalRunnerOptions, type EvalRunnerResult, type ExpectationBreakdown, type ExpectationResultMap, type ExpectationType, type ExpectedToolCall, type FieldRemovalSanitizer, type Judge, type JudgeConfig, type JudgeMatcherOptions, type JudgeResult, type LLMHostConfig, type LLMHostSimulationResult, type LLMHostSimulator, type LLMProvider, type LLMToolCall, type LoadDatasetOptions, type MCPAuthConfig, type MCPConfig, MCPConfigSchema, type MCPConformanceCheck$1 as MCPConformanceCheck, type MCPConformanceOptions, type MCPConformanceRaw, type MCPConformanceResult, type MCPConformanceResultData, type MCPEvalData, type MCPEvalHistoricalSummary, type MCPEvalReporterConfig, type MCPEvalRunData, type MCPFixtureApi, type MCPFixtureOptions, type MCPHostCapabilities, type MCPOAuthConfig, type MCPServerCapabilitiesData, MCP_PROTOCOL_VERSION, type NormalizedToolResponse, type OAuthSetupConfig, type PatternValidatorOptions, PlaywrightOAuthClientProvider, type PlaywrightOAuthClientProviderConfig, type PredicateResult, type ProtectedResourceDiscoveryResult, type ProtectedResourceMetadata, type ProviderKind, type RegexSanitizer, type ResultSource, type SchemaRegistry, type SchemaValidatorOptions, type SerializedEvalDataset, type SizeValidatorOptions, type SnapshotSanitizer, type StoredClientInfo, type StoredOAuthState, type StoredServerMetadata, type StoredTokens, type TextValidatorOptions, type TokenResult, type ToolCallValidationResult, type ToolCallValidator, type ToolPredicate, type UsageMetrics, type ValidationResult, closeMCPClient, createJudge, createMCPClientForConfig, createMCPFixture, 
createTokenAuthHeaders, createToolCallValidator, discoverAuthorizationServer, discoverProtectedResource, expect, extractText, extractText as extractTextFromResponse, getMissingDependencyMessage, getResponseSizeBytes, hasValidTokens, injectTokens, isHttpConfig, isProviderAvailable, isStdioConfig, isTokenExpired, isTokenExpiringSoon, loadEvalDataset, loadEvalDatasetFromObject, loadTokens, loadTokensFromEnv, normalizeToolResponse, normalizeWhitespace, performOAuthSetup, performOAuthSetupIfNeeded, runConformanceChecks, runEvalCase, runEvalDataset, simulateLLMHost, test, validateAccessToken, validateError, validateEvalCase, validateEvalDataset, validateMCPConfig, validatePattern, validateResponse, validateSchema, validateSize, validateText };
4979
+ export { type AuthType, BUILT_IN_RUBRICS, type BuiltInRubric, type BuiltInSanitizer, CLIOAuthClient, type CLIOAuthClientConfig, type CLIOAuthResult, type CaseComparisonResult, type ClientCredentialsConfig, type ComparisonOutcome, type ContentBlock, type CreateMCPClientOptions, DiscoveryError, ENV_VAR_NAMES, type EvalCase, type EvalCaseResult, EvalCaseSchema, type EvalContext, type EvalDataset, EvalDatasetSchema, type EvalExpectBlock, type EvalExpectationResult, type EvalMode, type EvalRunnerOptions, type EvalRunnerResult, type ExpectationBreakdown, type ExpectationResultMap, type ExpectationType, type FieldRemovalSanitizer, type HttpMCPConfig, type IterationResult, type Judge, type JudgeConfig, type JudgeMatcherOptions, type JudgeResult, type JudgeValidatorConfig, type LLMHostConfig, type LLMHostSimulationResult, type LLMHostSimulator, type LLMProvider, type LLMToolCall, type LoadDatasetOptions, type MCPAuthConfig, type MCPAuthFixtures, type MCPClientCredentialsConfig, type MCPConfig, MCPConfigSchema, type MCPConformanceCheck, type MCPConformanceOptions, type MCPConformanceRaw, type MCPConformanceResult, type MCPConformanceResultData, type MCPEvalData, type MCPEvalHistoricalSummary, type MCPEvalReporterConfig, type MCPEvalRunData, type MCPFixtureApi, type MCPFixtureOptions, type MCPHostCapabilities, type MCPOAuthConfig, type MCPServerCapabilitiesData, MCP_PROTOCOL_VERSION, type NormalizedToolResponse, type OAuthSetupConfig, type PatternValidatorOptions, PlaywrightOAuthClientProvider, type PlaywrightOAuthClientProviderConfig, type PredicateResult, type ProtectedResourceDiscoveryResult, type ProtectedResourceMetadata, type ProviderKind, type RegexSanitizer, type ResultSource, type RubricSpec, type SchemaRegistry, type SchemaValidatorOptions, type SerializedEvalDataset, type ServerComparisonOptions, type ServerComparisonResult, type SizeValidatorOptions, type SnapshotSanitizer, SnapshotSanitizers, type StdioMCPConfig, type StoredClientInfo, type StoredOAuthState, 
type StoredServerMetadata, type StoredTokens, type TextValidatorOptions, type TokenResult, type ToolCallCountOptions, type ToolCallExpectation, type ToolPredicate, type UsageMetrics, type ValidationResult, closeMCPClient, createJudge, createMCPClientForConfig, createMCPFixture, createTokenAuthHeaders, discoverAuthorizationServer, discoverProtectedResource, expect, extractText, getMissingDependencyMessage, getResponseSizeBytes, hasValidTokens, injectTokens, isBuiltInRubric, isHttpConfig, isProviderAvailable, isStdioConfig, isTokenExpired, isTokenExpiringSoon, loadBaseline, loadEvalDataset, loadEvalDatasetFromObject, loadTokens, loadTokensFromEnv, test as mcpAuthTest, normalizeToolResponse, normalizeWhitespace, performClientCredentialsFlow, performOAuthSetup, performOAuthSetupIfNeeded, resolveRubric, runConformanceChecks, runEvalCase, runEvalDataset, runServerComparison, saveBaseline, simulateLLMHost, test$1 as test, validateAccessToken, validateError, validateEvalCase, validateEvalDataset, validateJudge, validateMCPConfig, validatePattern, validateResponse, validateSchema, validateSize, validateText, validateToolCallCount, validateToolCalls };