@gleanwork/mcp-server-tester 0.12.0 → 1.0.0-beta.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/README.md +120 -337
- package/dist/cli/index.js +455 -174
- package/dist/fixtures/mcp.d.ts +121 -44
- package/dist/fixtures/mcp.js +974 -244
- package/dist/fixtures/mcp.js.map +1 -1
- package/dist/fixtures/mcpAuth.js +6 -2
- package/dist/fixtures/mcpAuth.js.map +1 -1
- package/dist/index.cjs +4936 -1292
- package/dist/index.cjs.map +1 -1
- package/dist/index.d.cts +1660 -570
- package/dist/index.d.ts +1660 -570
- package/dist/index.js +4923 -1288
- package/dist/index.js.map +1 -1
- package/dist/reporters/mcpReporter.cjs +35 -16
- package/dist/reporters/mcpReporter.cjs.map +1 -1
- package/dist/reporters/mcpReporter.d.cts +8 -3
- package/dist/reporters/mcpReporter.d.ts +8 -3
- package/dist/reporters/mcpReporter.js +36 -17
- package/dist/reporters/mcpReporter.js.map +1 -1
- package/dist/reporters/ui-dist/app.js +5 -5
- package/dist/reporters/ui-dist/styles.css +1 -1
- package/package.json +63 -8
- package/src/reporters/ui-dist/app.js +5 -5
- package/src/reporters/ui-dist/styles.css +1 -1
package/dist/index.d.ts
CHANGED
|
@@ -42,6 +42,28 @@ interface MCPOAuthConfig {
|
|
|
42
42
|
*/
|
|
43
43
|
redirectUri?: string;
|
|
44
44
|
}
|
|
45
|
+
/**
|
|
46
|
+
* OAuth 2.1 client credentials configuration for machine-to-machine (CI/CD) authentication.
|
|
47
|
+
* Credentials can be provided here or via MCP_CLIENT_ID/MCP_CLIENT_SECRET environment variables.
|
|
48
|
+
*/
|
|
49
|
+
interface MCPClientCredentialsConfig {
|
|
50
|
+
/**
|
|
51
|
+
* OAuth client ID (falls back to MCP_CLIENT_ID env var)
|
|
52
|
+
*/
|
|
53
|
+
clientId?: string;
|
|
54
|
+
/**
|
|
55
|
+
* OAuth client secret (falls back to MCP_CLIENT_SECRET env var)
|
|
56
|
+
*/
|
|
57
|
+
clientSecret?: string;
|
|
58
|
+
/**
|
|
59
|
+
* Token endpoint URL (required to perform the client credentials grant, even though the property is typed as optional)
|
|
60
|
+
*/
|
|
61
|
+
tokenEndpoint?: string;
|
|
62
|
+
/**
|
|
63
|
+
* Scopes to request
|
|
64
|
+
*/
|
|
65
|
+
scopes?: string[];
|
|
66
|
+
}
|
|
45
67
|
/**
|
|
46
68
|
* Authentication configuration for MCP connections
|
|
47
69
|
*/
|
|
@@ -54,6 +76,10 @@ interface MCPAuthConfig {
|
|
|
54
76
|
* Full OAuth configuration for browser-based authentication
|
|
55
77
|
*/
|
|
56
78
|
oauth?: MCPOAuthConfig;
|
|
79
|
+
/**
|
|
80
|
+
* OAuth 2.1 client credentials grant for machine-to-machine authentication
|
|
81
|
+
*/
|
|
82
|
+
clientCredentials?: MCPClientCredentialsConfig;
|
|
57
83
|
}
|
|
58
84
|
/**
|
|
59
85
|
* MCP host capabilities that can be registered with the server
|
|
@@ -74,35 +100,67 @@ interface MCPHostCapabilities {
|
|
|
74
100
|
};
|
|
75
101
|
}
|
|
76
102
|
/**
|
|
77
|
-
* Configuration for MCP client connection
|
|
78
|
-
*
|
|
79
|
-
* Supports both stdio (local) and HTTP (remote) transports
|
|
103
|
+
* Configuration for MCP client connection via stdio transport (local process)
|
|
80
104
|
*/
|
|
81
|
-
interface MCPConfig {
|
|
105
|
+
interface StdioMCPConfig {
|
|
82
106
|
/**
|
|
83
|
-
* Transport type
|
|
107
|
+
* Transport type discriminant
|
|
84
108
|
*/
|
|
85
|
-
transport: 'stdio' | 'http';
|
|
109
|
+
transport: 'stdio';
|
|
86
110
|
/**
|
|
87
|
-
*
|
|
111
|
+
* Command to execute (required for stdio transport)
|
|
88
112
|
*/
|
|
89
|
-
|
|
113
|
+
command: string;
|
|
90
114
|
/**
|
|
91
|
-
*
|
|
115
|
+
* Command arguments
|
|
92
116
|
*/
|
|
93
|
-
|
|
117
|
+
args?: Array<string>;
|
|
94
118
|
/**
|
|
95
|
-
*
|
|
119
|
+
* Working directory for the command
|
|
96
120
|
*/
|
|
97
|
-
|
|
121
|
+
cwd?: string;
|
|
98
122
|
/**
|
|
99
|
-
*
|
|
123
|
+
* Suppress stderr output from the server process.
|
|
124
|
+
* When true, server stderr is ignored instead of inherited.
|
|
100
125
|
*/
|
|
101
|
-
|
|
126
|
+
quiet?: boolean;
|
|
102
127
|
/**
|
|
103
|
-
*
|
|
128
|
+
* Host capabilities to register with the server
|
|
104
129
|
*/
|
|
105
|
-
|
|
130
|
+
capabilities?: MCPHostCapabilities;
|
|
131
|
+
/**
|
|
132
|
+
* Connection timeout in milliseconds
|
|
133
|
+
*/
|
|
134
|
+
connectTimeoutMs?: number;
|
|
135
|
+
/**
|
|
136
|
+
* Request timeout in milliseconds
|
|
137
|
+
*/
|
|
138
|
+
requestTimeoutMs?: number;
|
|
139
|
+
/**
|
|
140
|
+
* Timeout in milliseconds for MCP tool/list operations. Default: 30000
|
|
141
|
+
*/
|
|
142
|
+
callTimeoutMs?: number;
|
|
143
|
+
}
|
|
144
|
+
/**
|
|
145
|
+
* Configuration for MCP client connection via HTTP transport (remote server)
|
|
146
|
+
*/
|
|
147
|
+
interface HttpMCPConfig {
|
|
148
|
+
/**
|
|
149
|
+
* Transport type discriminant
|
|
150
|
+
*/
|
|
151
|
+
transport: 'http';
|
|
152
|
+
/**
|
|
153
|
+
* Server URL (required for http transport)
|
|
154
|
+
*/
|
|
155
|
+
serverUrl: string;
|
|
156
|
+
/**
|
|
157
|
+
* HTTP headers (e.g., Authorization)
|
|
158
|
+
*/
|
|
159
|
+
headers?: Record<string, string>;
|
|
160
|
+
/**
|
|
161
|
+
* Authentication configuration
|
|
162
|
+
*/
|
|
163
|
+
auth?: MCPAuthConfig;
|
|
106
164
|
/**
|
|
107
165
|
* Host capabilities to register with the server
|
|
108
166
|
*/
|
|
@@ -116,15 +174,57 @@ interface MCPConfig {
|
|
|
116
174
|
*/
|
|
117
175
|
requestTimeoutMs?: number;
|
|
118
176
|
/**
|
|
119
|
-
*
|
|
120
|
-
* When true, server stderr is ignored instead of inherited
|
|
177
|
+
* Timeout in milliseconds for MCP tool/list operations. Default: 30000
|
|
121
178
|
*/
|
|
122
|
-
|
|
179
|
+
callTimeoutMs?: number;
|
|
123
180
|
/**
|
|
124
|
-
*
|
|
181
|
+
* HTTP proxy configuration. Falls back to HTTPS_PROXY/HTTP_PROXY environment variables.
|
|
125
182
|
*/
|
|
126
|
-
|
|
183
|
+
proxy?: {
|
|
184
|
+
/**
|
|
185
|
+
* Proxy URL. Credentials can be embedded directly if required:
|
|
186
|
+
* `http://user:pass@proxy.example.com:8080`
|
|
187
|
+
*/
|
|
188
|
+
url: string;
|
|
189
|
+
};
|
|
190
|
+
/**
|
|
191
|
+
* Number of retry attempts for transient connection failures and 429 rate limit responses.
|
|
192
|
+
* Uses exponential backoff with Retry-After header awareness. Defaults to 0 (no retries).
|
|
193
|
+
*/
|
|
194
|
+
retryAttempts?: number;
|
|
195
|
+
/**
|
|
196
|
+
* TLS/mTLS configuration for custom certificates or disabling cert validation.
|
|
197
|
+
* File paths should point to PEM-encoded certificate files.
|
|
198
|
+
*/
|
|
199
|
+
tls?: {
|
|
200
|
+
/**
|
|
201
|
+
* Path to CA certificate PEM file (for custom/self-signed CAs)
|
|
202
|
+
*/
|
|
203
|
+
ca?: string;
|
|
204
|
+
/**
|
|
205
|
+
* Path to client certificate PEM file (for mutual TLS)
|
|
206
|
+
*/
|
|
207
|
+
cert?: string;
|
|
208
|
+
/**
|
|
209
|
+
* Path to client private key PEM file (for mutual TLS)
|
|
210
|
+
*/
|
|
211
|
+
key?: string;
|
|
212
|
+
/**
|
|
213
|
+
* Whether to reject unauthorized certificates. Defaults to true.
|
|
214
|
+
* Set to false to disable certificate validation (not recommended for production).
|
|
215
|
+
*/
|
|
216
|
+
rejectUnauthorized?: boolean;
|
|
217
|
+
};
|
|
127
218
|
}
|
|
219
|
+
/**
|
|
220
|
+
* Configuration for MCP client connection.
|
|
221
|
+
*
|
|
222
|
+
* This is a discriminated union — narrow with `isStdioConfig()` or `isHttpConfig()`
|
|
223
|
+
* before accessing transport-specific fields.
|
|
224
|
+
*
|
|
225
|
+
* Supports both stdio (local) and HTTP (remote) transports.
|
|
226
|
+
*/
|
|
227
|
+
type MCPConfig = StdioMCPConfig | HttpMCPConfig;
|
|
128
228
|
/**
|
|
129
229
|
* Union schema for MCPConfig (validates based on transport type)
|
|
130
230
|
*/
|
|
@@ -155,6 +255,7 @@ declare const MCPConfigSchema: z.ZodDiscriminatedUnion<"transport", [z.ZodObject
|
|
|
155
255
|
}>>;
|
|
156
256
|
connectTimeoutMs: z.ZodOptional<z.ZodNumber>;
|
|
157
257
|
requestTimeoutMs: z.ZodOptional<z.ZodNumber>;
|
|
258
|
+
callTimeoutMs: z.ZodOptional<z.ZodNumber>;
|
|
158
259
|
quiet: z.ZodOptional<z.ZodBoolean>;
|
|
159
260
|
}, "strip", z.ZodTypeAny, {
|
|
160
261
|
transport: "stdio";
|
|
@@ -169,6 +270,7 @@ declare const MCPConfigSchema: z.ZodDiscriminatedUnion<"transport", [z.ZodObject
|
|
|
169
270
|
} | undefined;
|
|
170
271
|
connectTimeoutMs?: number | undefined;
|
|
171
272
|
requestTimeoutMs?: number | undefined;
|
|
273
|
+
callTimeoutMs?: number | undefined;
|
|
172
274
|
quiet?: boolean | undefined;
|
|
173
275
|
}, {
|
|
174
276
|
transport: "stdio";
|
|
@@ -183,10 +285,11 @@ declare const MCPConfigSchema: z.ZodDiscriminatedUnion<"transport", [z.ZodObject
|
|
|
183
285
|
} | undefined;
|
|
184
286
|
connectTimeoutMs?: number | undefined;
|
|
185
287
|
requestTimeoutMs?: number | undefined;
|
|
288
|
+
callTimeoutMs?: number | undefined;
|
|
186
289
|
quiet?: boolean | undefined;
|
|
187
290
|
}>, z.ZodObject<{
|
|
188
291
|
transport: z.ZodLiteral<"http">;
|
|
189
|
-
serverUrl: z.ZodString
|
|
292
|
+
serverUrl: z.ZodEffects<z.ZodString, string, string>;
|
|
190
293
|
headers: z.ZodOptional<z.ZodRecord<z.ZodString, z.ZodString>>;
|
|
191
294
|
capabilities: z.ZodOptional<z.ZodObject<{
|
|
192
295
|
sampling: z.ZodOptional<z.ZodRecord<z.ZodString, z.ZodUnknown>>;
|
|
@@ -210,6 +313,7 @@ declare const MCPConfigSchema: z.ZodDiscriminatedUnion<"transport", [z.ZodObject
|
|
|
210
313
|
}>>;
|
|
211
314
|
connectTimeoutMs: z.ZodOptional<z.ZodNumber>;
|
|
212
315
|
requestTimeoutMs: z.ZodOptional<z.ZodNumber>;
|
|
316
|
+
callTimeoutMs: z.ZodOptional<z.ZodNumber>;
|
|
213
317
|
auth: z.ZodOptional<z.ZodEffects<z.ZodObject<{
|
|
214
318
|
accessToken: z.ZodOptional<z.ZodString>;
|
|
215
319
|
oauth: z.ZodOptional<z.ZodObject<{
|
|
@@ -237,6 +341,22 @@ declare const MCPConfigSchema: z.ZodDiscriminatedUnion<"transport", [z.ZodObject
|
|
|
237
341
|
clientSecret?: string | undefined;
|
|
238
342
|
redirectUri?: string | undefined;
|
|
239
343
|
}>>;
|
|
344
|
+
clientCredentials: z.ZodOptional<z.ZodObject<{
|
|
345
|
+
clientId: z.ZodOptional<z.ZodString>;
|
|
346
|
+
clientSecret: z.ZodOptional<z.ZodString>;
|
|
347
|
+
tokenEndpoint: z.ZodOptional<z.ZodString>;
|
|
348
|
+
scopes: z.ZodOptional<z.ZodArray<z.ZodString, "many">>;
|
|
349
|
+
}, "strip", z.ZodTypeAny, {
|
|
350
|
+
scopes?: string[] | undefined;
|
|
351
|
+
clientId?: string | undefined;
|
|
352
|
+
clientSecret?: string | undefined;
|
|
353
|
+
tokenEndpoint?: string | undefined;
|
|
354
|
+
}, {
|
|
355
|
+
scopes?: string[] | undefined;
|
|
356
|
+
clientId?: string | undefined;
|
|
357
|
+
clientSecret?: string | undefined;
|
|
358
|
+
tokenEndpoint?: string | undefined;
|
|
359
|
+
}>>;
|
|
240
360
|
}, "strip", z.ZodTypeAny, {
|
|
241
361
|
accessToken?: string | undefined;
|
|
242
362
|
oauth?: {
|
|
@@ -248,6 +368,12 @@ declare const MCPConfigSchema: z.ZodDiscriminatedUnion<"transport", [z.ZodObject
|
|
|
248
368
|
clientSecret?: string | undefined;
|
|
249
369
|
redirectUri?: string | undefined;
|
|
250
370
|
} | undefined;
|
|
371
|
+
clientCredentials?: {
|
|
372
|
+
scopes?: string[] | undefined;
|
|
373
|
+
clientId?: string | undefined;
|
|
374
|
+
clientSecret?: string | undefined;
|
|
375
|
+
tokenEndpoint?: string | undefined;
|
|
376
|
+
} | undefined;
|
|
251
377
|
}, {
|
|
252
378
|
accessToken?: string | undefined;
|
|
253
379
|
oauth?: {
|
|
@@ -259,6 +385,12 @@ declare const MCPConfigSchema: z.ZodDiscriminatedUnion<"transport", [z.ZodObject
|
|
|
259
385
|
clientSecret?: string | undefined;
|
|
260
386
|
redirectUri?: string | undefined;
|
|
261
387
|
} | undefined;
|
|
388
|
+
clientCredentials?: {
|
|
389
|
+
scopes?: string[] | undefined;
|
|
390
|
+
clientId?: string | undefined;
|
|
391
|
+
clientSecret?: string | undefined;
|
|
392
|
+
tokenEndpoint?: string | undefined;
|
|
393
|
+
} | undefined;
|
|
262
394
|
}>, {
|
|
263
395
|
accessToken?: string | undefined;
|
|
264
396
|
oauth?: {
|
|
@@ -270,6 +402,12 @@ declare const MCPConfigSchema: z.ZodDiscriminatedUnion<"transport", [z.ZodObject
|
|
|
270
402
|
clientSecret?: string | undefined;
|
|
271
403
|
redirectUri?: string | undefined;
|
|
272
404
|
} | undefined;
|
|
405
|
+
clientCredentials?: {
|
|
406
|
+
scopes?: string[] | undefined;
|
|
407
|
+
clientId?: string | undefined;
|
|
408
|
+
clientSecret?: string | undefined;
|
|
409
|
+
tokenEndpoint?: string | undefined;
|
|
410
|
+
} | undefined;
|
|
273
411
|
}, {
|
|
274
412
|
accessToken?: string | undefined;
|
|
275
413
|
oauth?: {
|
|
@@ -281,6 +419,36 @@ declare const MCPConfigSchema: z.ZodDiscriminatedUnion<"transport", [z.ZodObject
|
|
|
281
419
|
clientSecret?: string | undefined;
|
|
282
420
|
redirectUri?: string | undefined;
|
|
283
421
|
} | undefined;
|
|
422
|
+
clientCredentials?: {
|
|
423
|
+
scopes?: string[] | undefined;
|
|
424
|
+
clientId?: string | undefined;
|
|
425
|
+
clientSecret?: string | undefined;
|
|
426
|
+
tokenEndpoint?: string | undefined;
|
|
427
|
+
} | undefined;
|
|
428
|
+
}>>;
|
|
429
|
+
proxy: z.ZodOptional<z.ZodObject<{
|
|
430
|
+
url: z.ZodString;
|
|
431
|
+
}, "strip", z.ZodTypeAny, {
|
|
432
|
+
url: string;
|
|
433
|
+
}, {
|
|
434
|
+
url: string;
|
|
435
|
+
}>>;
|
|
436
|
+
retryAttempts: z.ZodOptional<z.ZodNumber>;
|
|
437
|
+
tls: z.ZodOptional<z.ZodObject<{
|
|
438
|
+
ca: z.ZodOptional<z.ZodString>;
|
|
439
|
+
cert: z.ZodOptional<z.ZodString>;
|
|
440
|
+
key: z.ZodOptional<z.ZodString>;
|
|
441
|
+
rejectUnauthorized: z.ZodOptional<z.ZodBoolean>;
|
|
442
|
+
}, "strip", z.ZodTypeAny, {
|
|
443
|
+
ca?: string | undefined;
|
|
444
|
+
cert?: string | undefined;
|
|
445
|
+
key?: string | undefined;
|
|
446
|
+
rejectUnauthorized?: boolean | undefined;
|
|
447
|
+
}, {
|
|
448
|
+
ca?: string | undefined;
|
|
449
|
+
cert?: string | undefined;
|
|
450
|
+
key?: string | undefined;
|
|
451
|
+
rejectUnauthorized?: boolean | undefined;
|
|
284
452
|
}>>;
|
|
285
453
|
}, "strip", z.ZodTypeAny, {
|
|
286
454
|
serverUrl: string;
|
|
@@ -293,6 +461,7 @@ declare const MCPConfigSchema: z.ZodDiscriminatedUnion<"transport", [z.ZodObject
|
|
|
293
461
|
} | undefined;
|
|
294
462
|
connectTimeoutMs?: number | undefined;
|
|
295
463
|
requestTimeoutMs?: number | undefined;
|
|
464
|
+
callTimeoutMs?: number | undefined;
|
|
296
465
|
headers?: Record<string, string> | undefined;
|
|
297
466
|
auth?: {
|
|
298
467
|
accessToken?: string | undefined;
|
|
@@ -305,6 +474,22 @@ declare const MCPConfigSchema: z.ZodDiscriminatedUnion<"transport", [z.ZodObject
|
|
|
305
474
|
clientSecret?: string | undefined;
|
|
306
475
|
redirectUri?: string | undefined;
|
|
307
476
|
} | undefined;
|
|
477
|
+
clientCredentials?: {
|
|
478
|
+
scopes?: string[] | undefined;
|
|
479
|
+
clientId?: string | undefined;
|
|
480
|
+
clientSecret?: string | undefined;
|
|
481
|
+
tokenEndpoint?: string | undefined;
|
|
482
|
+
} | undefined;
|
|
483
|
+
} | undefined;
|
|
484
|
+
proxy?: {
|
|
485
|
+
url: string;
|
|
486
|
+
} | undefined;
|
|
487
|
+
retryAttempts?: number | undefined;
|
|
488
|
+
tls?: {
|
|
489
|
+
ca?: string | undefined;
|
|
490
|
+
cert?: string | undefined;
|
|
491
|
+
key?: string | undefined;
|
|
492
|
+
rejectUnauthorized?: boolean | undefined;
|
|
308
493
|
} | undefined;
|
|
309
494
|
}, {
|
|
310
495
|
serverUrl: string;
|
|
@@ -317,6 +502,7 @@ declare const MCPConfigSchema: z.ZodDiscriminatedUnion<"transport", [z.ZodObject
|
|
|
317
502
|
} | undefined;
|
|
318
503
|
connectTimeoutMs?: number | undefined;
|
|
319
504
|
requestTimeoutMs?: number | undefined;
|
|
505
|
+
callTimeoutMs?: number | undefined;
|
|
320
506
|
headers?: Record<string, string> | undefined;
|
|
321
507
|
auth?: {
|
|
322
508
|
accessToken?: string | undefined;
|
|
@@ -329,6 +515,22 @@ declare const MCPConfigSchema: z.ZodDiscriminatedUnion<"transport", [z.ZodObject
|
|
|
329
515
|
clientSecret?: string | undefined;
|
|
330
516
|
redirectUri?: string | undefined;
|
|
331
517
|
} | undefined;
|
|
518
|
+
clientCredentials?: {
|
|
519
|
+
scopes?: string[] | undefined;
|
|
520
|
+
clientId?: string | undefined;
|
|
521
|
+
clientSecret?: string | undefined;
|
|
522
|
+
tokenEndpoint?: string | undefined;
|
|
523
|
+
} | undefined;
|
|
524
|
+
} | undefined;
|
|
525
|
+
proxy?: {
|
|
526
|
+
url: string;
|
|
527
|
+
} | undefined;
|
|
528
|
+
retryAttempts?: number | undefined;
|
|
529
|
+
tls?: {
|
|
530
|
+
ca?: string | undefined;
|
|
531
|
+
cert?: string | undefined;
|
|
532
|
+
key?: string | undefined;
|
|
533
|
+
rejectUnauthorized?: boolean | undefined;
|
|
332
534
|
} | undefined;
|
|
333
535
|
}>]>;
|
|
334
536
|
/**
|
|
@@ -342,17 +544,11 @@ declare function validateMCPConfig(config: unknown): MCPConfig;
|
|
|
342
544
|
/**
|
|
343
545
|
* Type guard to check if a config is for stdio transport
|
|
344
546
|
*/
|
|
345
|
-
declare function isStdioConfig(config: MCPConfig): config is
|
|
346
|
-
transport: 'stdio';
|
|
347
|
-
command: string;
|
|
348
|
-
};
|
|
547
|
+
declare function isStdioConfig(config: MCPConfig): config is StdioMCPConfig;
|
|
349
548
|
/**
|
|
350
549
|
* Type guard to check if a config is for HTTP transport
|
|
351
550
|
*/
|
|
352
|
-
declare function isHttpConfig(config: MCPConfig): config is
|
|
353
|
-
transport: 'http';
|
|
354
|
-
serverUrl: string;
|
|
355
|
-
};
|
|
551
|
+
declare function isHttpConfig(config: MCPConfig): config is HttpMCPConfig;
|
|
356
552
|
|
|
357
553
|
/**
|
|
358
554
|
* Auth types for MCP OAuth integration
|
|
@@ -601,6 +797,9 @@ declare class PlaywrightOAuthClientProvider implements OAuthClientProvider {
|
|
|
601
797
|
tokens(): Promise<OAuthTokens | undefined>;
|
|
602
798
|
/**
|
|
603
799
|
* Stores new OAuth tokens for the current session
|
|
800
|
+
*
|
|
801
|
+
* The code verifier is cleared after a successful token exchange — it is
|
|
802
|
+
* single-use per PKCE spec and must not persist beyond the exchange.
|
|
604
803
|
*/
|
|
605
804
|
saveTokens(tokens: OAuthTokens): Promise<void>;
|
|
606
805
|
/**
|
|
@@ -757,6 +956,38 @@ interface AuthServerMetadata {
|
|
|
757
956
|
*/
|
|
758
957
|
issuer: string;
|
|
759
958
|
}
|
|
959
|
+
/**
|
|
960
|
+
* Configuration for client credentials grant
|
|
961
|
+
*/
|
|
962
|
+
interface ClientCredentialsConfig {
|
|
963
|
+
/**
|
|
964
|
+
* Token endpoint URL
|
|
965
|
+
*/
|
|
966
|
+
tokenEndpoint: string;
|
|
967
|
+
/**
|
|
968
|
+
* OAuth client ID
|
|
969
|
+
*/
|
|
970
|
+
clientId: string;
|
|
971
|
+
/**
|
|
972
|
+
* OAuth client secret
|
|
973
|
+
*/
|
|
974
|
+
clientSecret: string;
|
|
975
|
+
/**
|
|
976
|
+
* Scopes to request (optional)
|
|
977
|
+
*/
|
|
978
|
+
scopes?: string[];
|
|
979
|
+
}
|
|
980
|
+
/**
|
|
981
|
+
* Performs the OAuth 2.1 client credentials grant to obtain an access token.
|
|
982
|
+
* Suitable for CI/CD machine-to-machine authentication.
|
|
983
|
+
*
|
|
984
|
+
* Uses oauth4webapi for spec-compliant request construction and response validation,
|
|
985
|
+
* consistent with how the rest of this module handles OAuth flows.
|
|
986
|
+
*
|
|
987
|
+
* @param config - Client credentials configuration
|
|
988
|
+
* @returns Token result
|
|
989
|
+
*/
|
|
990
|
+
declare function performClientCredentialsFlow(config: ClientCredentialsConfig): Promise<TokenResult>;
|
|
760
991
|
|
|
761
992
|
/**
|
|
762
993
|
* OAuth Protected Resource and Authorization Server discovery
|
|
@@ -915,8 +1146,9 @@ declare function injectTokens(serverUrl: string, tokens: StoredTokens, stateDir?
|
|
|
915
1146
|
* ```typescript
|
|
916
1147
|
* // After running: npx mcp-server-tester login https://api.example.com/mcp
|
|
917
1148
|
* const tokens = await loadTokens('https://api.example.com/mcp');
|
|
918
|
-
* if (tokens) {
|
|
919
|
-
*
|
|
1149
|
+
* if (tokens?.accessToken) {
|
|
1150
|
+
* // Use the token — never log raw token values
|
|
1151
|
+
* headers.Authorization = `Bearer ${tokens.accessToken}`;
|
|
920
1152
|
* }
|
|
921
1153
|
* ```
|
|
922
1154
|
*/
|
|
@@ -1127,6 +1359,14 @@ interface CreateMCPClientOptions {
|
|
|
1127
1359
|
* This takes precedence over static token auth in config.auth.accessToken.
|
|
1128
1360
|
*/
|
|
1129
1361
|
authProvider?: OAuthClientProvider;
|
|
1362
|
+
/**
|
|
1363
|
+
* Sampling handler callback for LLM sampling requests from the server.
|
|
1364
|
+
*
|
|
1365
|
+
* When provided, the client will advertise sampling capability to the server.
|
|
1366
|
+
* When absent, sampling is removed from declared capabilities so the client
|
|
1367
|
+
* does not falsely advertise support it cannot fulfill.
|
|
1368
|
+
*/
|
|
1369
|
+
samplingHandler?: unknown;
|
|
1130
1370
|
}
|
|
1131
1371
|
/**
|
|
1132
1372
|
* Creates and connects an MCP client based on the provided configuration
|
|
@@ -1251,6 +1491,14 @@ interface ValidationResult {
|
|
|
1251
1491
|
message: string;
|
|
1252
1492
|
/** Additional structured details about the validation */
|
|
1253
1493
|
details?: Record<string, unknown>;
|
|
1494
|
+
/**
|
|
1495
|
+
* Optional quantitative metrics from the validation.
|
|
1496
|
+
* Populated by validateToolCalls for precision/recall.
|
|
1497
|
+
*/
|
|
1498
|
+
metrics?: {
|
|
1499
|
+
precision?: number;
|
|
1500
|
+
recall?: number;
|
|
1501
|
+
};
|
|
1254
1502
|
}
|
|
1255
1503
|
/**
|
|
1256
1504
|
* Options for text validation
|
|
@@ -1282,10 +1530,33 @@ interface PatternValidatorOptions {
|
|
|
1282
1530
|
/** Whether to perform case-sensitive matching (default: true) */
|
|
1283
1531
|
caseSensitive?: boolean;
|
|
1284
1532
|
}
|
|
1533
|
+
/**
|
|
1534
|
+
* Built-in snapshot sanitizer names for use with toMatchToolSnapshot.
|
|
1535
|
+
* Pass these values in the sanitizers array to replace non-deterministic
|
|
1536
|
+
* values with stable placeholders before snapshot comparison.
|
|
1537
|
+
*
|
|
1538
|
+
* @example
|
|
1539
|
+
* expect(result).toMatchToolSnapshot('my-snapshot', [
|
|
1540
|
+
* SnapshotSanitizers.UUID,
|
|
1541
|
+
* SnapshotSanitizers.ISO_DATE,
|
|
1542
|
+
* ]);
|
|
1543
|
+
*/
|
|
1544
|
+
declare const SnapshotSanitizers: {
|
|
1545
|
+
/** Replaces Unix timestamps (seconds and milliseconds) with a stable placeholder */
|
|
1546
|
+
readonly TIMESTAMP: "timestamp";
|
|
1547
|
+
/** Replaces UUID v1-v5 strings with a stable placeholder */
|
|
1548
|
+
readonly UUID: "uuid";
|
|
1549
|
+
/** Replaces ISO 8601 date/datetime strings with a stable placeholder */
|
|
1550
|
+
readonly ISO_DATE: "iso-date";
|
|
1551
|
+
/** Replaces MongoDB ObjectId strings with a stable placeholder */
|
|
1552
|
+
readonly OBJECT_ID: "objectId";
|
|
1553
|
+
/** Replaces JWT tokens with a stable placeholder */
|
|
1554
|
+
readonly JWT: "jwt";
|
|
1555
|
+
};
|
|
1285
1556
|
/**
|
|
1286
1557
|
* Built-in sanitizer names for common variable patterns
|
|
1287
1558
|
*/
|
|
1288
|
-
type BuiltInSanitizer =
|
|
1559
|
+
type BuiltInSanitizer = (typeof SnapshotSanitizers)[keyof typeof SnapshotSanitizers];
|
|
1289
1560
|
/**
|
|
1290
1561
|
* Custom regex-based sanitizer
|
|
1291
1562
|
*/
|
|
@@ -1511,38 +1782,63 @@ declare function validateError(response: unknown, expected?: boolean | string |
|
|
|
1511
1782
|
declare function validateSize(response: unknown, options: SizeValidatorOptions): ValidationResult;
|
|
1512
1783
|
|
|
1513
1784
|
/**
|
|
1514
|
-
*
|
|
1785
|
+
* Tool call validators for llm_host simulation results.
|
|
1515
1786
|
*
|
|
1516
|
-
*
|
|
1517
|
-
*
|
|
1787
|
+
* These validators extract the tool call trace from an LLMHostSimulationResult
|
|
1788
|
+
* and apply assertions against expected call lists and counts.
|
|
1518
1789
|
*/
|
|
1519
1790
|
|
|
1791
|
+
interface ToolCallExpectation {
|
|
1792
|
+
calls: Array<{
|
|
1793
|
+
name: string;
|
|
1794
|
+
arguments?: Record<string, unknown>;
|
|
1795
|
+
required?: boolean;
|
|
1796
|
+
}>;
|
|
1797
|
+
order?: 'strict' | 'any';
|
|
1798
|
+
exclusive?: boolean;
|
|
1799
|
+
}
|
|
1800
|
+
interface ToolCallCountOptions {
|
|
1801
|
+
min?: number;
|
|
1802
|
+
max?: number;
|
|
1803
|
+
exact?: number;
|
|
1804
|
+
}
|
|
1520
1805
|
/**
|
|
1521
|
-
*
|
|
1522
|
-
*
|
|
1523
|
-
* Serializes the response to JSON (with pretty printing for consistency)
|
|
1524
|
-
* and returns the byte length using UTF-8 encoding.
|
|
1806
|
+
* Validates tool calls made during an LLM host simulation.
|
|
1525
1807
|
*
|
|
1526
|
-
* @param response -
|
|
1527
|
-
* @
|
|
1808
|
+
* @param response - Must be an LLMHostSimulationResult (from llm_host mode)
|
|
1809
|
+
* @param expectation - Expected tool call specification
|
|
1528
1810
|
*/
|
|
1529
|
-
declare function
|
|
1811
|
+
declare function validateToolCalls(response: unknown, expectation: ToolCallExpectation): ValidationResult;
|
|
1530
1812
|
/**
|
|
1531
|
-
*
|
|
1532
|
-
*
|
|
1533
|
-
* Collapses multiple whitespace characters (spaces, tabs, newlines) into single spaces
|
|
1534
|
-
* and trims leading/trailing whitespace.
|
|
1813
|
+
* Validates the number of tool calls made during an LLM host simulation.
|
|
1535
1814
|
*
|
|
1536
|
-
* @param
|
|
1537
|
-
* @
|
|
1815
|
+
* @param response - Must be an LLMHostSimulationResult (from llm_host mode)
|
|
1816
|
+
* @param options - Count constraints (min, max, exact)
|
|
1817
|
+
*/
|
|
1818
|
+
declare function validateToolCallCount(response: unknown, options: ToolCallCountOptions): ValidationResult;
|
|
1819
|
+
|
|
1820
|
+
/**
|
|
1821
|
+
* Built-in judge rubrics matching Glean EvalV2's named judge types.
|
|
1822
|
+
* Use these for consistent, standardized evaluations across teams.
|
|
1538
1823
|
*
|
|
1539
|
-
*
|
|
1540
|
-
* ```typescript
|
|
1541
|
-
* normalizeWhitespace(' hello\n\n world ');
|
|
1542
|
-
* // Returns: "hello world"
|
|
1543
|
-
* ```
|
|
1824
|
+
* All built-in rubrics use a 5-point scale: 0.0 / 0.25 / 0.5 / 0.75 / 1.0
|
|
1544
1825
|
*/
|
|
1545
|
-
|
|
1826
|
+
type BuiltInRubric = 'correctness' | 'completeness' | 'groundedness' | 'instruction-following' | 'conciseness';
|
|
1827
|
+
declare const BUILT_IN_RUBRICS: Record<BuiltInRubric, string>;
|
|
1828
|
+
/** A rubric specification: either a built-in named rubric or custom text. */
|
|
1829
|
+
type RubricSpec = BuiltInRubric | {
|
|
1830
|
+
text: string;
|
|
1831
|
+
};
|
|
1832
|
+
/**
|
|
1833
|
+
* Returns true if `s` is a built-in rubric name.
|
|
1834
|
+
*/
|
|
1835
|
+
declare function isBuiltInRubric(s: unknown): s is BuiltInRubric;
|
|
1836
|
+
/**
|
|
1837
|
+
* Resolves a RubricSpec to its full rubric text.
|
|
1838
|
+
* - Built-in name → returns the expanded rubric text from BUILT_IN_RUBRICS
|
|
1839
|
+
* - Custom object → returns rubric.text as-is
|
|
1840
|
+
*/
|
|
1841
|
+
declare function resolveRubric(rubric: RubricSpec): string;
|
|
1546
1842
|
|
|
1547
1843
|
/**
|
|
1548
1844
|
* Usage metrics from Claude Agent SDK response
|
|
@@ -1577,10 +1873,8 @@ interface UsageMetrics {
|
|
|
1577
1873
|
*/
|
|
1578
1874
|
cacheCreationInputTokens?: number;
|
|
1579
1875
|
}
|
|
1580
|
-
/**
|
|
1581
|
-
|
|
1582
|
-
*/
|
|
1583
|
-
type ProviderKind = 'claude' | 'anthropic' | 'openai' | 'custom-http';
|
|
1876
|
+
/** Valid LLM judge provider kinds. */
|
|
1877
|
+
type ProviderKind = 'claude' | 'anthropic' | 'openai' | 'google';
|
|
1584
1878
|
/**
|
|
1585
1879
|
* Configuration for an LLM judge
|
|
1586
1880
|
*/
|
|
@@ -1649,7 +1943,24 @@ interface JudgeResult {
|
|
|
1649
1943
|
* Whether the candidate exceeded maxToolOutputSize
|
|
1650
1944
|
*/
|
|
1651
1945
|
exceedsMaxToolOutputSize?: boolean;
|
|
1946
|
+
/**
|
|
1947
|
+
* Standard deviation of individual rep scores.
|
|
1948
|
+
* Only populated when the judge was run with reps > 1.
|
|
1949
|
+
*/
|
|
1950
|
+
scoreStdDev?: number;
|
|
1951
|
+
/**
|
|
1952
|
+
* True when the standard deviation across reps exceeds 0.2, indicating
|
|
1953
|
+
* that the rubric may be ambiguous or the judge is non-deterministic.
|
|
1954
|
+
* Only populated when the judge was run with reps > 1.
|
|
1955
|
+
*/
|
|
1956
|
+
highVariance?: boolean;
|
|
1957
|
+
/**
|
|
1958
|
+
* Individual scores from each judge rep.
|
|
1959
|
+
* Only populated when the judge was run with reps > 1.
|
|
1960
|
+
*/
|
|
1961
|
+
scores?: number[];
|
|
1652
1962
|
}
|
|
1963
|
+
|
|
1653
1964
|
/**
|
|
1654
1965
|
* LLM judge client interface
|
|
1655
1966
|
*/
|
|
@@ -1665,6 +1976,75 @@ interface Judge {
|
|
|
1665
1976
|
evaluate(candidate: unknown, reference: unknown, rubric: string): Promise<JudgeResult>;
|
|
1666
1977
|
}
|
|
1667
1978
|
|
|
1979
|
+
/**
|
|
1980
|
+
* Judge Validator
|
|
1981
|
+
*
|
|
1982
|
+
* Validates a response using an LLM-as-a-judge evaluation.
|
|
1983
|
+
*/
|
|
1984
|
+
|
|
1985
|
+
/**
|
|
1986
|
+
* Configuration for the judge validator
|
|
1987
|
+
*/
|
|
1988
|
+
interface JudgeValidatorConfig {
|
|
1989
|
+
/** The evaluation rubric: a built-in name or custom { text: string } */
|
|
1990
|
+
rubric: RubricSpec;
|
|
1991
|
+
/** Optional reference response to compare against */
|
|
1992
|
+
reference?: unknown;
|
|
1993
|
+
/** Minimum score required to pass (0-1, default: 0.7) */
|
|
1994
|
+
threshold?: number;
|
|
1995
|
+
/** Number of judge evaluations to run. Scores averaged. @default 1 */
|
|
1996
|
+
reps?: number;
|
|
1997
|
+
/** Judge provider. @default 'claude' */
|
|
1998
|
+
provider?: ProviderKind;
|
|
1999
|
+
/** Model override (e.g., 'claude-opus-4-20250514') */
|
|
2000
|
+
model?: string;
|
|
2001
|
+
/** Environment variable name for API key */
|
|
2002
|
+
apiKeyEnvVar?: string;
|
|
2003
|
+
/** Max tokens for judge response */
|
|
2004
|
+
maxTokens?: number;
|
|
2005
|
+
/** Temperature for judge LLM (0–1) */
|
|
2006
|
+
temperature?: number;
|
|
2007
|
+
/** Max budget in USD per evaluation */
|
|
2008
|
+
maxBudgetUsd?: number;
|
|
2009
|
+
/** Fail if response exceeds this size in bytes before judging */
|
|
2010
|
+
maxToolOutputSize?: number;
|
|
2011
|
+
}
|
|
2012
|
+
declare function validateJudge(response: unknown, config: JudgeValidatorConfig): Promise<ValidationResult>;
|
|
2013
|
+
|
|
2014
|
+
/**
|
|
2015
|
+
* Validator Utilities
|
|
2016
|
+
*
|
|
2017
|
+
* Shared utility functions for validation operations.
|
|
2018
|
+
* Re-exports core utilities from mcp/response.ts and adds validation-specific helpers.
|
|
2019
|
+
*/
|
|
2020
|
+
|
|
2021
|
+
/**
|
|
2022
|
+
* Gets the size of a response in bytes
|
|
2023
|
+
*
|
|
2024
|
+
* Serializes the response to JSON (with pretty printing for consistency)
|
|
2025
|
+
* and returns the byte length using UTF-8 encoding.
|
|
2026
|
+
*
|
|
2027
|
+
* @param response - Response in any format
|
|
2028
|
+
* @returns Size in bytes
|
|
2029
|
+
*/
|
|
2030
|
+
declare function getResponseSizeBytes(response: unknown): number;
|
|
2031
|
+
/**
|
|
2032
|
+
* Normalizes whitespace in text for consistent comparison
|
|
2033
|
+
*
|
|
2034
|
+
* Collapses multiple whitespace characters (spaces, tabs, newlines) into single spaces
|
|
2035
|
+
* and trims leading/trailing whitespace.
|
|
2036
|
+
*
|
|
2037
|
+
* @param text - Text to normalize
|
|
2038
|
+
* @returns Normalized text with collapsed whitespace
|
|
2039
|
+
*
|
|
2040
|
+
* @example
|
|
2041
|
+
* ```typescript
|
|
2042
|
+
* normalizeWhitespace(' hello\n\n world ');
|
|
2043
|
+
* // Returns: "hello world"
|
|
2044
|
+
* ```
|
|
2045
|
+
*/
|
|
2046
|
+
declare function normalizeWhitespace(text: string): string;
|
|
2047
|
+
|
|
1668
2048
|
/**
|
|
1669
2049
|
* Matcher Types
|
|
1670
2050
|
*
|
|
@@ -1679,8 +2059,12 @@ interface JudgeMatcherOptions {
|
|
|
1679
2059
|
reference?: unknown;
|
|
1680
2060
|
/** Score threshold for passing (default: 0.7) */
|
|
1681
2061
|
passingThreshold?: number;
|
|
1682
|
-
/**
|
|
1683
|
-
|
|
2062
|
+
/** Number of judge evaluations (scores averaged) */
|
|
2063
|
+
reps?: number;
|
|
2064
|
+
/** Override the judge provider */
|
|
2065
|
+
provider?: ProviderKind;
|
|
2066
|
+
/** Override the judge model */
|
|
2067
|
+
model?: string;
|
|
1684
2068
|
}
|
|
1685
2069
|
/**
|
|
1686
2070
|
* Declaration merging for Playwright matchers
|
|
@@ -1785,7 +2169,7 @@ declare global {
|
|
|
1785
2169
|
* });
|
|
1786
2170
|
* ```
|
|
1787
2171
|
*/
|
|
1788
|
-
toPassToolJudge(rubric:
|
|
2172
|
+
toPassToolJudge(rubric: RubricSpec, options?: JudgeMatcherOptions): Promise<R>;
|
|
1789
2173
|
/**
|
|
1790
2174
|
* Validates that a response meets size constraints
|
|
1791
2175
|
*
|
|
@@ -1830,11 +2214,33 @@ declare global {
|
|
|
1830
2214
|
* ```
|
|
1831
2215
|
*/
|
|
1832
2216
|
toSatisfyToolPredicate(predicate: ToolPredicate, description?: string): Promise<R>;
|
|
1833
|
-
|
|
1834
|
-
|
|
1835
|
-
|
|
1836
|
-
|
|
1837
|
-
|
|
2217
|
+
/**
|
|
2218
|
+
* Validates which tools the LLM called during an llm_host simulation.
|
|
2219
|
+
*
|
|
2220
|
+
* @example
|
|
2221
|
+
* ```typescript
|
|
2222
|
+
* expect(simulationResult).toHaveToolCalls({
|
|
2223
|
+
* calls: [{ name: 'search', arguments: { query: 'hello' }, required: true }],
|
|
2224
|
+
* order: 'any',
|
|
2225
|
+
* });
|
|
2226
|
+
* ```
|
|
2227
|
+
*/
|
|
2228
|
+
toHaveToolCalls(expectation: ToolCallExpectation): R;
|
|
2229
|
+
/**
|
|
2230
|
+
* Validates the number of tool calls made during an llm_host simulation.
|
|
2231
|
+
*
|
|
2232
|
+
* @example
|
|
2233
|
+
* ```typescript
|
|
2234
|
+
* expect(simulationResult).toHaveToolCallCount({ min: 1, max: 3 });
|
|
2235
|
+
* expect(simulationResult).toHaveToolCallCount({ exact: 2 });
|
|
2236
|
+
* ```
|
|
2237
|
+
*/
|
|
2238
|
+
toHaveToolCallCount(options: ToolCallCountOptions): R;
|
|
2239
|
+
}
|
|
2240
|
+
}
|
|
2241
|
+
}
|
|
2242
|
+
/**
|
|
2243
|
+
* Predicate result returned by the user's predicate function
|
|
1838
2244
|
*/
|
|
1839
2245
|
interface PredicateResult {
|
|
1840
2246
|
/** Whether the predicate passed */
|
|
@@ -1873,7 +2279,7 @@ type ResultSource = 'eval' | 'test';
|
|
|
1873
2279
|
/**
|
|
1874
2280
|
* Known expectation types supported by the framework
|
|
1875
2281
|
*/
|
|
1876
|
-
type ExpectationType = 'exact' | 'schema' | 'textContains' | 'regex' | 'snapshot' | 'judge' | 'error' | 'size';
|
|
2282
|
+
type ExpectationType = 'exact' | 'schema' | 'textContains' | 'regex' | 'snapshot' | 'judge' | 'error' | 'size' | 'toolsTriggered' | 'toolCallCount';
|
|
1877
2283
|
/**
|
|
1878
2284
|
* Result of an expectation check
|
|
1879
2285
|
*/
|
|
@@ -1912,6 +2318,10 @@ interface MCPFixtureOptions {
|
|
|
1912
2318
|
* Used for filtering and grouping in the reporter
|
|
1913
2319
|
*/
|
|
1914
2320
|
project?: string;
|
|
2321
|
+
/**
|
|
2322
|
+
* Timeout in milliseconds for MCP tool/list operations. Default: 30000
|
|
2323
|
+
*/
|
|
2324
|
+
callTimeoutMs?: number;
|
|
1915
2325
|
}
|
|
1916
2326
|
/**
|
|
1917
2327
|
* High-level API for interacting with MCP servers in tests
|
|
@@ -1954,29 +2364,43 @@ interface MCPFixtureApi {
|
|
|
1954
2364
|
} | null;
|
|
1955
2365
|
}
|
|
1956
2366
|
/**
|
|
1957
|
-
* Creates an MCP fixture wrapper around a Client
|
|
2367
|
+
* Creates an MCP fixture wrapper around a Client, providing a high-level
|
|
2368
|
+
* {@link MCPFixtureApi} without requiring Playwright's `test.extend` pattern.
|
|
1958
2369
|
*
|
|
1959
|
-
*
|
|
1960
|
-
*
|
|
2370
|
+
* Use this when you need to set up an MCP fixture manually — for example in
|
|
2371
|
+
* custom fixture hierarchies, non-Playwright test runners (e.g. Vitest,
|
|
2372
|
+
* Jest), or when you want to compose the fixture with other lifecycle
|
|
2373
|
+
* management logic that doesn't fit the standard `test.extend` model.
|
|
1961
2374
|
*
|
|
1962
|
-
*
|
|
1963
|
-
*
|
|
2375
|
+
* For the typical Playwright use case, prefer importing `test` and `mcp`
|
|
2376
|
+
* directly from `@gleanwork/mcp-server-tester/fixtures/mcp`, which wires
|
|
2377
|
+
* this function up automatically.
|
|
2378
|
+
*
|
|
2379
|
+
* When `testInfo` is provided, all MCP operations are automatically wrapped
|
|
2380
|
+
* in `test.step()` calls and attachments are created for the MCP Test
|
|
2381
|
+
* Reporter. Omit `testInfo` for lightweight usage outside Playwright.
|
|
2382
|
+
*
|
|
2383
|
+
* @param client - The MCP client to wrap (created via `createMCPClientForConfig`)
|
|
2384
|
+
* @param testInfo - Optional Playwright TestInfo for auto-tracking and reporter attachments
|
|
2385
|
+
* @param options - Optional fixture options (authType, project)
|
|
1964
2386
|
* @returns MCPFixtureApi instance
|
|
1965
2387
|
*
|
|
1966
2388
|
* @example
|
|
1967
2389
|
* ```typescript
|
|
1968
|
-
* //
|
|
2390
|
+
* // Advanced: custom fixture setup inside test.extend
|
|
1969
2391
|
* const test = base.extend<{ mcp: MCPFixtureApi }>({
|
|
1970
2392
|
* mcp: async ({}, use, testInfo) => {
|
|
1971
2393
|
* const client = await createMCPClientForConfig(config);
|
|
1972
|
-
* const api = createMCPFixture(client, testInfo);
|
|
2394
|
+
* const api = createMCPFixture(client, testInfo, { authType: 'api-token' });
|
|
1973
2395
|
* await use(api);
|
|
1974
2396
|
* await closeMCPClient(client);
|
|
1975
2397
|
* }
|
|
1976
2398
|
* });
|
|
1977
2399
|
*
|
|
1978
|
-
* //
|
|
2400
|
+
* // Non-Playwright usage (no reporter attachments)
|
|
2401
|
+
* const client = await createMCPClientForConfig(config);
|
|
1979
2402
|
* const api = createMCPFixture(client);
|
|
2403
|
+
* const tools = await api.listTools();
|
|
1980
2404
|
* ```
|
|
1981
2405
|
*/
|
|
1982
2406
|
declare function createMCPFixture(client: Client, testInfo?: TestInfo, options?: MCPFixtureOptions): MCPFixtureApi;
|
|
@@ -2082,6 +2506,8 @@ declare function toBeToolError(this: {
|
|
|
2082
2506
|
* toPassToolJudge Matcher
|
|
2083
2507
|
*
|
|
2084
2508
|
* Validates that a response passes LLM-as-judge evaluation.
|
|
2509
|
+
* Delegates evaluation logic to validateJudge() for consistency
|
|
2510
|
+
* with the validator/matcher duality pattern.
|
|
2085
2511
|
*/
|
|
2086
2512
|
|
|
2087
2513
|
/**
|
|
@@ -2091,7 +2517,7 @@ declare function toBeToolError(this: {
|
|
|
2091
2517
|
*/
|
|
2092
2518
|
declare function toPassToolJudge(this: {
|
|
2093
2519
|
isNot: boolean;
|
|
2094
|
-
}, received: unknown, rubric:
|
|
2520
|
+
}, received: unknown, rubric: RubricSpec, options?: JudgeMatcherOptions): Promise<{
|
|
2095
2521
|
pass: boolean;
|
|
2096
2522
|
message: () => string;
|
|
2097
2523
|
}>;
|
|
@@ -2158,6 +2584,38 @@ declare function toSatisfyToolPredicate(this: {
|
|
|
2158
2584
|
message: () => string;
|
|
2159
2585
|
}>;
|
|
2160
2586
|
|
|
2587
|
+
/**
|
|
2588
|
+
* toHaveToolCalls Matcher
|
|
2589
|
+
*
|
|
2590
|
+
* Validates which tools the LLM called during an llm_host simulation.
|
|
2591
|
+
*/
|
|
2592
|
+
|
|
2593
|
+
/**
|
|
2594
|
+
* Creates the toHaveToolCalls matcher function
|
|
2595
|
+
*/
|
|
2596
|
+
declare function toHaveToolCalls(this: {
|
|
2597
|
+
isNot: boolean;
|
|
2598
|
+
}, received: unknown, expectation: ToolCallExpectation): {
|
|
2599
|
+
pass: boolean;
|
|
2600
|
+
message: () => string;
|
|
2601
|
+
};
|
|
2602
|
+
|
|
2603
|
+
/**
|
|
2604
|
+
* toHaveToolCallCount Matcher
|
|
2605
|
+
*
|
|
2606
|
+
* Validates the number of tool calls made during an llm_host simulation.
|
|
2607
|
+
*/
|
|
2608
|
+
|
|
2609
|
+
/**
|
|
2610
|
+
* Creates the toHaveToolCallCount matcher function
|
|
2611
|
+
*/
|
|
2612
|
+
declare function toHaveToolCallCount(this: {
|
|
2613
|
+
isNot: boolean;
|
|
2614
|
+
}, received: unknown, options: ToolCallCountOptions): {
|
|
2615
|
+
pass: boolean;
|
|
2616
|
+
message: () => string;
|
|
2617
|
+
};
|
|
2618
|
+
|
|
2161
2619
|
/**
|
|
2162
2620
|
* Extended Playwright expect with MCP tool matchers
|
|
2163
2621
|
*
|
|
@@ -2184,6 +2642,8 @@ declare const expect: playwright_test.Expect<{
|
|
|
2184
2642
|
toPassToolJudge: typeof toPassToolJudge;
|
|
2185
2643
|
toHaveToolResponseSize: typeof toHaveToolResponseSize;
|
|
2186
2644
|
toSatisfyToolPredicate: typeof toSatisfyToolPredicate;
|
|
2645
|
+
toHaveToolCalls: typeof toHaveToolCalls;
|
|
2646
|
+
toHaveToolCallCount: typeof toHaveToolCallCount;
|
|
2187
2647
|
}>;
|
|
2188
2648
|
|
|
2189
2649
|
/**
|
|
@@ -2233,9 +2693,30 @@ declare const test: playwright_test.TestType<playwright_test.PlaywrightTestArgs
|
|
|
2233
2693
|
*/
|
|
2234
2694
|
|
|
2235
2695
|
/**
|
|
2236
|
-
* LLM provider for host simulation
|
|
2696
|
+
* LLM provider for host simulation.
|
|
2697
|
+
*
|
|
2698
|
+
* All providers run through the Vercel AI SDK (`ai` package).
|
|
2699
|
+
* Each provider requires its corresponding @ai-sdk/* package:
|
|
2700
|
+
*
|
|
2701
|
+
* openai → npm install ai @ai-sdk/openai
|
|
2702
|
+
* anthropic → npm install ai @ai-sdk/anthropic
|
|
2703
|
+
* google → npm install ai @ai-sdk/google
|
|
2704
|
+
* azure → npm install ai @ai-sdk/azure
|
|
2705
|
+
* mistral → npm install ai @ai-sdk/mistral
|
|
2706
|
+
* ollama → npm install ai @ai-sdk/ollama (local, no API key)
|
|
2707
|
+
* deepseek → npm install ai @ai-sdk/deepseek
|
|
2708
|
+
* openrouter → npm install ai @openrouter/ai-sdk-provider
|
|
2709
|
+
* xai → npm install ai @ai-sdk/xai
|
|
2710
|
+
*/
|
|
2711
|
+
type LLMProvider = 'openai' | 'anthropic' | 'azure' | 'google' | 'mistral' | 'ollama' | 'deepseek' | 'openrouter' | 'xai'
|
|
2712
|
+
/**
|
|
2713
|
+
* Anthropic Claude via Google Vertex AI.
|
|
2714
|
+
* Requires @ai-sdk/google-vertex and Application Default Credentials (gcloud auth).
|
|
2715
|
+
* Set GOOGLE_VERTEX_PROJECT and GOOGLE_VERTEX_LOCATION env vars.
|
|
2716
|
+
* Use this instead of 'anthropic' in environments where api.anthropic.com is blocked.
|
|
2717
|
+
* @example model: 'claude-3-5-haiku@20241022'
|
|
2237
2718
|
*/
|
|
2238
|
-
|
|
2719
|
+
| 'vertex-anthropic';
|
|
2239
2720
|
/**
|
|
2240
2721
|
* Configuration for LLM host simulation
|
|
2241
2722
|
*/
|
|
@@ -2246,12 +2727,10 @@ interface LLMHostConfig {
|
|
|
2246
2727
|
provider: LLMProvider;
|
|
2247
2728
|
/**
|
|
2248
2729
|
* Environment variable name containing the API key
|
|
2249
|
-
* @default 'OPENAI_API_KEY' for openai, 'ANTHROPIC_API_KEY' for anthropic
|
|
2250
2730
|
*/
|
|
2251
2731
|
apiKeyEnvVar?: string;
|
|
2252
2732
|
/**
|
|
2253
|
-
* Model to use
|
|
2254
|
-
* @default 'gpt-4' for openai, 'claude-3-5-sonnet-20241022' for anthropic
|
|
2733
|
+
* Model to use (provider-specific default if omitted)
|
|
2255
2734
|
*/
|
|
2256
2735
|
model?: string;
|
|
2257
2736
|
/**
|
|
@@ -2260,11 +2739,11 @@ interface LLMHostConfig {
|
|
|
2260
2739
|
maxTokens?: number;
|
|
2261
2740
|
/**
|
|
2262
2741
|
* Temperature (0-1, lower is more deterministic)
|
|
2263
|
-
* @default 0
|
|
2742
|
+
* @default 0
|
|
2264
2743
|
*/
|
|
2265
2744
|
temperature?: number;
|
|
2266
2745
|
/**
|
|
2267
|
-
* Maximum number of tool
|
|
2746
|
+
* Maximum number of tool call steps to allow in a single conversation
|
|
2268
2747
|
* @default 10
|
|
2269
2748
|
*/
|
|
2270
2749
|
maxToolCalls?: number;
|
|
@@ -2273,72 +2752,49 @@ interface LLMHostConfig {
|
|
|
2273
2752
|
* A tool call made by the LLM
|
|
2274
2753
|
*/
|
|
2275
2754
|
interface LLMToolCall {
|
|
2276
|
-
/**
|
|
2277
|
-
* Tool name
|
|
2278
|
-
*/
|
|
2755
|
+
/** Tool name */
|
|
2279
2756
|
name: string;
|
|
2280
|
-
/**
|
|
2281
|
-
* Tool arguments (as provided by LLM)
|
|
2282
|
-
*/
|
|
2757
|
+
/** Tool arguments (as provided by LLM) */
|
|
2283
2758
|
arguments: Record<string, unknown>;
|
|
2284
|
-
/**
|
|
2285
|
-
* Optional tool call ID (for tracking)
|
|
2286
|
-
*/
|
|
2759
|
+
/** Optional tool call ID (for tracking) */
|
|
2287
2760
|
id?: string;
|
|
2288
2761
|
}
|
|
2289
|
-
/**
|
|
2290
|
-
* Result of a tool call validation
|
|
2291
|
-
*/
|
|
2292
|
-
interface ToolCallValidationResult {
|
|
2293
|
-
/**
|
|
2294
|
-
* Whether the tool call was valid
|
|
2295
|
-
*/
|
|
2296
|
-
valid: boolean;
|
|
2297
|
-
/**
|
|
2298
|
-
* List of actual tool calls made
|
|
2299
|
-
*/
|
|
2300
|
-
actualCalls: Array<LLMToolCall>;
|
|
2301
|
-
/**
|
|
2302
|
-
* Expected tool calls (if specified in eval case)
|
|
2303
|
-
*/
|
|
2304
|
-
expectedCalls?: Array<LLMToolCall>;
|
|
2305
|
-
/**
|
|
2306
|
-
* Details about validation (e.g., missing calls, incorrect arguments)
|
|
2307
|
-
*/
|
|
2308
|
-
details?: string;
|
|
2309
|
-
}
|
|
2310
2762
|
/**
|
|
2311
2763
|
* Result from an LLM host simulation
|
|
2312
2764
|
*/
|
|
2313
2765
|
interface LLMHostSimulationResult {
|
|
2314
|
-
/**
|
|
2315
|
-
* Whether the simulation succeeded
|
|
2316
|
-
*/
|
|
2766
|
+
/** Whether the simulation succeeded */
|
|
2317
2767
|
success: boolean;
|
|
2318
|
-
/**
|
|
2319
|
-
* Tool calls made by the LLM
|
|
2320
|
-
*/
|
|
2768
|
+
/** Tool calls made by the LLM */
|
|
2321
2769
|
toolCalls: Array<LLMToolCall>;
|
|
2322
|
-
/**
|
|
2323
|
-
* Final response from the LLM
|
|
2324
|
-
*/
|
|
2770
|
+
/** Final response from the LLM */
|
|
2325
2771
|
response?: string;
|
|
2326
|
-
/**
|
|
2327
|
-
* Error message if simulation failed
|
|
2328
|
-
*/
|
|
2772
|
+
/** Error message if simulation failed */
|
|
2329
2773
|
error?: string;
|
|
2330
|
-
/**
|
|
2331
|
-
|
|
2332
|
-
|
|
2774
|
+
/** The scenario prompt that was given to the LLM */
|
|
2775
|
+
scenario?: string;
|
|
2776
|
+
/** The conversation turns for attribution analysis */
|
|
2333
2777
|
conversationHistory?: Array<{
|
|
2334
2778
|
role: 'user' | 'assistant' | 'tool';
|
|
2335
2779
|
content: string;
|
|
2336
2780
|
}>;
|
|
2781
|
+
/**
|
|
2782
|
+
* Milliseconds spent waiting for LLM responses
|
|
2783
|
+
* (excludes MCP tool execution time)
|
|
2784
|
+
*/
|
|
2785
|
+
llmDurationMs?: number;
|
|
2786
|
+
/**
|
|
2787
|
+
* Milliseconds spent executing MCP tool calls
|
|
2788
|
+
* (excludes LLM response time)
|
|
2789
|
+
*/
|
|
2790
|
+
mcpDurationMs?: number;
|
|
2337
2791
|
}
|
|
2338
2792
|
/**
|
|
2339
|
-
* Interface for LLM host simulators
|
|
2793
|
+
* Interface for LLM host simulators.
|
|
2340
2794
|
*
|
|
2341
|
-
*
|
|
2795
|
+
* The only built-in implementation is the Vercel AI SDK orchestrator
|
|
2796
|
+
* (src/evals/llmHost/adapters/vercel.ts). Custom implementations can be
|
|
2797
|
+
* created for specialised testing needs.
|
|
2342
2798
|
*/
|
|
2343
2799
|
interface LLMHostSimulator {
|
|
2344
2800
|
/**
|
|
@@ -2351,24 +2807,6 @@ interface LLMHostSimulator {
|
|
|
2351
2807
|
*/
|
|
2352
2808
|
simulate(mcp: MCPFixtureApi, scenario: string, config: LLMHostConfig): Promise<LLMHostSimulationResult>;
|
|
2353
2809
|
}
|
|
2354
|
-
/**
|
|
2355
|
-
* Expected tool call specification (for validation)
|
|
2356
|
-
*/
|
|
2357
|
-
interface ExpectedToolCall {
|
|
2358
|
-
/**
|
|
2359
|
-
* Tool name
|
|
2360
|
-
*/
|
|
2361
|
-
name: string;
|
|
2362
|
-
/**
|
|
2363
|
-
* Expected arguments (partial match)
|
|
2364
|
-
*/
|
|
2365
|
-
arguments?: Record<string, unknown>;
|
|
2366
|
-
/**
|
|
2367
|
-
* Whether this call is required
|
|
2368
|
-
* @default true
|
|
2369
|
-
*/
|
|
2370
|
-
required?: boolean;
|
|
2371
|
-
}
|
|
2372
2810
|
|
|
2373
2811
|
/**
|
|
2374
2812
|
* Evaluation mode
|
|
@@ -2423,6 +2861,41 @@ interface EvalCase {
|
|
|
2423
2861
|
* For 'llm_host' mode, can include 'expectedToolCalls' for validation
|
|
2424
2862
|
*/
|
|
2425
2863
|
metadata?: Record<string, unknown>;
|
|
2864
|
+
/**
|
|
2865
|
+
* Number of times to run this case and compute an accuracy score.
|
|
2866
|
+
* When > 1, `EvalCaseResult.accuracy` is populated and `pass` is determined
|
|
2867
|
+
* by `accuracyThreshold` rather than a single run.
|
|
2868
|
+
* @default 1
|
|
2869
|
+
*/
|
|
2870
|
+
iterations?: number;
|
|
2871
|
+
/**
|
|
2872
|
+
* Minimum accuracy (0–1) required to pass when `iterations > 1`.
|
|
2873
|
+
* @default 1.0 (all iterations must pass)
|
|
2874
|
+
*/
|
|
2875
|
+
accuracyThreshold?: number;
|
|
2876
|
+
/**
|
|
2877
|
+
* Number of times to invoke the LLM judge per `passesJudge` assertion.
|
|
2878
|
+
* Scores are averaged; the mean must meet the threshold to pass.
|
|
2879
|
+
* Reduces judge variance caused by non-determinism.
|
|
2880
|
+
* Per-assertion `passesJudge.reps` overrides this value.
|
|
2881
|
+
* @default 1
|
|
2882
|
+
*/
|
|
2883
|
+
judgeReps?: number;
|
|
2884
|
+
/**
|
|
2885
|
+
* Golden/expected answer for this case.
|
|
2886
|
+
* When set, automatically passed as `reference` to the LLM judge
|
|
2887
|
+
* (unless passesJudge.reference is explicitly provided).
|
|
2888
|
+
* Mirrors EvalV2's `canonical_answer` field.
|
|
2889
|
+
*/
|
|
2890
|
+
canonicalAnswer?: string;
|
|
2891
|
+
/**
|
|
2892
|
+
* Arbitrary string labels for this case.
|
|
2893
|
+
* Use for filtering eval runs with `EvalRunnerOptions.filterTags`
|
|
2894
|
+
* and for slicing results by category.
|
|
2895
|
+
*
|
|
2896
|
+
* @example ['tool-finding', 'multi-hop', 'search']
|
|
2897
|
+
*/
|
|
2898
|
+
tags?: string[];
|
|
2426
2899
|
/**
|
|
2427
2900
|
* Expectations to validate against the tool response
|
|
2428
2901
|
*
|
|
@@ -2486,14 +2959,30 @@ interface EvalExpectBlock {
|
|
|
2486
2959
|
* LLM-as-judge evaluation (toPassToolJudge)
|
|
2487
2960
|
*/
|
|
2488
2961
|
passesJudge?: {
|
|
2489
|
-
/**
|
|
2490
|
-
rubric:
|
|
2962
|
+
/** Built-in rubric name or custom rubric object */
|
|
2963
|
+
rubric: BuiltInRubric | {
|
|
2964
|
+
text: string;
|
|
2965
|
+
};
|
|
2491
2966
|
/** Reference response to compare against */
|
|
2492
2967
|
reference?: unknown;
|
|
2493
2968
|
/** Score threshold for passing (0-1, default: 0.7) */
|
|
2494
2969
|
threshold?: number;
|
|
2495
|
-
/**
|
|
2496
|
-
|
|
2970
|
+
/** Number of judge evaluations for this assertion. Overrides EvalCase.judgeReps. */
|
|
2971
|
+
reps?: number;
|
|
2972
|
+
/** Judge provider. @default 'claude' */
|
|
2973
|
+
provider?: 'claude' | 'anthropic' | 'openai' | 'google';
|
|
2974
|
+
/** Model override (e.g., 'claude-opus-4-20250514') */
|
|
2975
|
+
model?: string;
|
|
2976
|
+
/** Environment variable name for API key */
|
|
2977
|
+
apiKeyEnvVar?: string;
|
|
2978
|
+
/** Max tokens for judge response */
|
|
2979
|
+
maxTokens?: number;
|
|
2980
|
+
/** Temperature for judge LLM (0–1) */
|
|
2981
|
+
temperature?: number;
|
|
2982
|
+
/** Max budget in USD per evaluation */
|
|
2983
|
+
maxBudgetUsd?: number;
|
|
2984
|
+
/** Fail if response exceeds this size in bytes before judging */
|
|
2985
|
+
maxToolOutputSize?: number;
|
|
2497
2986
|
};
|
|
2498
2987
|
/**
|
|
2499
2988
|
* Response size validation (toHaveToolResponseSize)
|
|
@@ -2504,6 +2993,39 @@ interface EvalExpectBlock {
|
|
|
2504
2993
|
/** Minimum required size in bytes */
|
|
2505
2994
|
minBytes?: number;
|
|
2506
2995
|
};
|
|
2996
|
+
/**
|
|
2997
|
+
* Asserts which tools the LLM called during an llm_host simulation.
|
|
2998
|
+
* Only meaningful for llm_host mode — direct mode has no tool call trace.
|
|
2999
|
+
*/
|
|
3000
|
+
toolsTriggered?: {
|
|
3001
|
+
/** Expected tool calls */
|
|
3002
|
+
calls: Array<{
|
|
3003
|
+
/** Tool name */
|
|
3004
|
+
name: string;
|
|
3005
|
+
/** Expected arguments (partial match — extra keys are allowed) */
|
|
3006
|
+
arguments?: Record<string, unknown>;
|
|
3007
|
+
/** Whether this call MUST have been made (default: true) */
|
|
3008
|
+
required?: boolean;
|
|
3009
|
+
}>;
|
|
3010
|
+
/**
|
|
3011
|
+
* 'strict': calls must appear in the exact order listed
|
|
3012
|
+
* 'any': calls can appear in any order (default)
|
|
3013
|
+
*/
|
|
3014
|
+
order?: 'strict' | 'any';
|
|
3015
|
+
/** If true, no tool calls outside the `calls` list are allowed */
|
|
3016
|
+
exclusive?: boolean;
|
|
3017
|
+
};
|
|
3018
|
+
/**
|
|
3019
|
+
* Asserts the number of tool calls made during an llm_host simulation.
|
|
3020
|
+
*/
|
|
3021
|
+
toolCallCount?: {
|
|
3022
|
+
/** Minimum number of tool calls */
|
|
3023
|
+
min?: number;
|
|
3024
|
+
/** Maximum number of tool calls */
|
|
3025
|
+
max?: number;
|
|
3026
|
+
/** Exact number of tool calls */
|
|
3027
|
+
exact?: number;
|
|
3028
|
+
};
|
|
2507
3029
|
}
|
|
2508
3030
|
/**
|
|
2509
3031
|
* A complete eval dataset containing multiple test cases
|
|
@@ -2543,21 +3065,21 @@ declare const EvalCaseSchema: z.ZodObject<{
|
|
|
2543
3065
|
args: z.ZodOptional<z.ZodRecord<z.ZodString, z.ZodUnknown>>;
|
|
2544
3066
|
scenario: z.ZodOptional<z.ZodString>;
|
|
2545
3067
|
llmHostConfig: z.ZodOptional<z.ZodObject<{
|
|
2546
|
-
provider: z.ZodEnum<["openai", "anthropic"]>;
|
|
3068
|
+
provider: z.ZodEnum<["openai", "anthropic", "azure", "google", "mistral", "ollama", "deepseek", "openrouter", "xai", "vertex-anthropic"]>;
|
|
2547
3069
|
apiKeyEnvVar: z.ZodOptional<z.ZodString>;
|
|
2548
3070
|
model: z.ZodOptional<z.ZodString>;
|
|
2549
3071
|
maxTokens: z.ZodOptional<z.ZodNumber>;
|
|
2550
3072
|
temperature: z.ZodOptional<z.ZodNumber>;
|
|
2551
3073
|
maxToolCalls: z.ZodOptional<z.ZodNumber>;
|
|
2552
3074
|
}, "strip", z.ZodTypeAny, {
|
|
2553
|
-
provider: "anthropic" | "
|
|
3075
|
+
provider: "openai" | "anthropic" | "azure" | "google" | "mistral" | "ollama" | "deepseek" | "openrouter" | "xai" | "vertex-anthropic";
|
|
2554
3076
|
model?: string | undefined;
|
|
2555
3077
|
maxTokens?: number | undefined;
|
|
2556
3078
|
apiKeyEnvVar?: string | undefined;
|
|
2557
3079
|
temperature?: number | undefined;
|
|
2558
3080
|
maxToolCalls?: number | undefined;
|
|
2559
3081
|
}, {
|
|
2560
|
-
provider: "anthropic" | "
|
|
3082
|
+
provider: "openai" | "anthropic" | "azure" | "google" | "mistral" | "ollama" | "deepseek" | "openrouter" | "xai" | "vertex-anthropic";
|
|
2561
3083
|
model?: string | undefined;
|
|
2562
3084
|
maxTokens?: number | undefined;
|
|
2563
3085
|
apiKeyEnvVar?: string | undefined;
|
|
@@ -2565,6 +3087,11 @@ declare const EvalCaseSchema: z.ZodObject<{
|
|
|
2565
3087
|
maxToolCalls?: number | undefined;
|
|
2566
3088
|
}>>;
|
|
2567
3089
|
metadata: z.ZodOptional<z.ZodRecord<z.ZodString, z.ZodUnknown>>;
|
|
3090
|
+
iterations: z.ZodOptional<z.ZodNumber>;
|
|
3091
|
+
accuracyThreshold: z.ZodOptional<z.ZodNumber>;
|
|
3092
|
+
judgeReps: z.ZodOptional<z.ZodNumber>;
|
|
3093
|
+
canonicalAnswer: z.ZodOptional<z.ZodString>;
|
|
3094
|
+
tags: z.ZodOptional<z.ZodArray<z.ZodString, "many">>;
|
|
2568
3095
|
expect: z.ZodOptional<z.ZodObject<{
|
|
2569
3096
|
response: z.ZodOptional<z.ZodUnknown>;
|
|
2570
3097
|
schema: z.ZodOptional<z.ZodString>;
|
|
@@ -2589,20 +3116,51 @@ declare const EvalCaseSchema: z.ZodObject<{
|
|
|
2589
3116
|
}>]>, "many">>;
|
|
2590
3117
|
isError: z.ZodOptional<z.ZodUnion<[z.ZodBoolean, z.ZodString, z.ZodArray<z.ZodString, "many">]>>;
|
|
2591
3118
|
passesJudge: z.ZodOptional<z.ZodObject<{
|
|
2592
|
-
rubric: z.
|
|
3119
|
+
rubric: z.ZodUnion<[z.ZodEnum<["correctness", "completeness", "groundedness", "instruction-following", "conciseness"]>, z.ZodObject<{
|
|
3120
|
+
text: z.ZodString;
|
|
3121
|
+
}, "strip", z.ZodTypeAny, {
|
|
3122
|
+
text: string;
|
|
3123
|
+
}, {
|
|
3124
|
+
text: string;
|
|
3125
|
+
}>]>;
|
|
2593
3126
|
reference: z.ZodOptional<z.ZodUnknown>;
|
|
2594
3127
|
threshold: z.ZodOptional<z.ZodNumber>;
|
|
2595
|
-
|
|
3128
|
+
reps: z.ZodOptional<z.ZodNumber>;
|
|
3129
|
+
provider: z.ZodOptional<z.ZodEnum<["claude", "anthropic", "openai", "google"]>>;
|
|
3130
|
+
model: z.ZodOptional<z.ZodString>;
|
|
3131
|
+
apiKeyEnvVar: z.ZodOptional<z.ZodString>;
|
|
3132
|
+
maxTokens: z.ZodOptional<z.ZodNumber>;
|
|
3133
|
+
temperature: z.ZodOptional<z.ZodNumber>;
|
|
3134
|
+
maxBudgetUsd: z.ZodOptional<z.ZodNumber>;
|
|
3135
|
+
maxToolOutputSize: z.ZodOptional<z.ZodNumber>;
|
|
2596
3136
|
}, "strip", z.ZodTypeAny, {
|
|
2597
|
-
rubric:
|
|
3137
|
+
rubric: "correctness" | "completeness" | "groundedness" | "instruction-following" | "conciseness" | {
|
|
3138
|
+
text: string;
|
|
3139
|
+
};
|
|
3140
|
+
model?: string | undefined;
|
|
3141
|
+
maxTokens?: number | undefined;
|
|
3142
|
+
maxBudgetUsd?: number | undefined;
|
|
2598
3143
|
reference?: unknown;
|
|
2599
3144
|
threshold?: number | undefined;
|
|
2600
|
-
|
|
3145
|
+
reps?: number | undefined;
|
|
3146
|
+
provider?: "openai" | "anthropic" | "google" | "claude" | undefined;
|
|
3147
|
+
apiKeyEnvVar?: string | undefined;
|
|
3148
|
+
temperature?: number | undefined;
|
|
3149
|
+
maxToolOutputSize?: number | undefined;
|
|
2601
3150
|
}, {
|
|
2602
|
-
rubric:
|
|
3151
|
+
rubric: "correctness" | "completeness" | "groundedness" | "instruction-following" | "conciseness" | {
|
|
3152
|
+
text: string;
|
|
3153
|
+
};
|
|
3154
|
+
model?: string | undefined;
|
|
3155
|
+
maxTokens?: number | undefined;
|
|
3156
|
+
maxBudgetUsd?: number | undefined;
|
|
2603
3157
|
reference?: unknown;
|
|
2604
3158
|
threshold?: number | undefined;
|
|
2605
|
-
|
|
3159
|
+
reps?: number | undefined;
|
|
3160
|
+
provider?: "openai" | "anthropic" | "google" | "claude" | undefined;
|
|
3161
|
+
apiKeyEnvVar?: string | undefined;
|
|
3162
|
+
temperature?: number | undefined;
|
|
3163
|
+
maxToolOutputSize?: number | undefined;
|
|
2606
3164
|
}>>;
|
|
2607
3165
|
responseSize: z.ZodOptional<z.ZodObject<{
|
|
2608
3166
|
maxBytes: z.ZodOptional<z.ZodNumber>;
|
|
@@ -2614,11 +3172,71 @@ declare const EvalCaseSchema: z.ZodObject<{
|
|
|
2614
3172
|
maxBytes?: number | undefined;
|
|
2615
3173
|
minBytes?: number | undefined;
|
|
2616
3174
|
}>>;
|
|
3175
|
+
toolsTriggered: z.ZodOptional<z.ZodObject<{
|
|
3176
|
+
calls: z.ZodArray<z.ZodObject<{
|
|
3177
|
+
name: z.ZodString;
|
|
3178
|
+
arguments: z.ZodOptional<z.ZodRecord<z.ZodString, z.ZodUnknown>>;
|
|
3179
|
+
required: z.ZodOptional<z.ZodBoolean>;
|
|
3180
|
+
}, "strip", z.ZodTypeAny, {
|
|
3181
|
+
name: string;
|
|
3182
|
+
required?: boolean | undefined;
|
|
3183
|
+
arguments?: Record<string, unknown> | undefined;
|
|
3184
|
+
}, {
|
|
3185
|
+
name: string;
|
|
3186
|
+
required?: boolean | undefined;
|
|
3187
|
+
arguments?: Record<string, unknown> | undefined;
|
|
3188
|
+
}>, "many">;
|
|
3189
|
+
order: z.ZodOptional<z.ZodEnum<["strict", "any"]>>;
|
|
3190
|
+
exclusive: z.ZodOptional<z.ZodBoolean>;
|
|
3191
|
+
}, "strip", z.ZodTypeAny, {
|
|
3192
|
+
calls: {
|
|
3193
|
+
name: string;
|
|
3194
|
+
required?: boolean | undefined;
|
|
3195
|
+
arguments?: Record<string, unknown> | undefined;
|
|
3196
|
+
}[];
|
|
3197
|
+
order?: "strict" | "any" | undefined;
|
|
3198
|
+
exclusive?: boolean | undefined;
|
|
3199
|
+
}, {
|
|
3200
|
+
calls: {
|
|
3201
|
+
name: string;
|
|
3202
|
+
required?: boolean | undefined;
|
|
3203
|
+
arguments?: Record<string, unknown> | undefined;
|
|
3204
|
+
}[];
|
|
3205
|
+
order?: "strict" | "any" | undefined;
|
|
3206
|
+
exclusive?: boolean | undefined;
|
|
3207
|
+
}>>;
|
|
3208
|
+
toolCallCount: z.ZodOptional<z.ZodObject<{
|
|
3209
|
+
min: z.ZodOptional<z.ZodNumber>;
|
|
3210
|
+
max: z.ZodOptional<z.ZodNumber>;
|
|
3211
|
+
exact: z.ZodOptional<z.ZodNumber>;
|
|
3212
|
+
}, "strip", z.ZodTypeAny, {
|
|
3213
|
+
exact?: number | undefined;
|
|
3214
|
+
min?: number | undefined;
|
|
3215
|
+
max?: number | undefined;
|
|
3216
|
+
}, {
|
|
3217
|
+
exact?: number | undefined;
|
|
3218
|
+
min?: number | undefined;
|
|
3219
|
+
max?: number | undefined;
|
|
3220
|
+
}>>;
|
|
2617
3221
|
}, "strip", z.ZodTypeAny, {
|
|
3222
|
+
response?: unknown;
|
|
2618
3223
|
isError?: string | boolean | string[] | undefined;
|
|
2619
3224
|
schema?: string | undefined;
|
|
2620
3225
|
snapshot?: string | undefined;
|
|
2621
|
-
|
|
3226
|
+
toolsTriggered?: {
|
|
3227
|
+
calls: {
|
|
3228
|
+
name: string;
|
|
3229
|
+
required?: boolean | undefined;
|
|
3230
|
+
arguments?: Record<string, unknown> | undefined;
|
|
3231
|
+
}[];
|
|
3232
|
+
order?: "strict" | "any" | undefined;
|
|
3233
|
+
exclusive?: boolean | undefined;
|
|
3234
|
+
} | undefined;
|
|
3235
|
+
toolCallCount?: {
|
|
3236
|
+
exact?: number | undefined;
|
|
3237
|
+
min?: number | undefined;
|
|
3238
|
+
max?: number | undefined;
|
|
3239
|
+
} | undefined;
|
|
2622
3240
|
containsText?: string | string[] | undefined;
|
|
2623
3241
|
matchesPattern?: string | string[] | undefined;
|
|
2624
3242
|
snapshotSanitizers?: ("timestamp" | "uuid" | "iso-date" | "objectId" | "jwt" | {
|
|
@@ -2628,20 +3246,43 @@ declare const EvalCaseSchema: z.ZodObject<{
|
|
|
2628
3246
|
remove: string[];
|
|
2629
3247
|
})[] | undefined;
|
|
2630
3248
|
passesJudge?: {
|
|
2631
|
-
rubric:
|
|
3249
|
+
rubric: "correctness" | "completeness" | "groundedness" | "instruction-following" | "conciseness" | {
|
|
3250
|
+
text: string;
|
|
3251
|
+
};
|
|
3252
|
+
model?: string | undefined;
|
|
3253
|
+
maxTokens?: number | undefined;
|
|
3254
|
+
maxBudgetUsd?: number | undefined;
|
|
2632
3255
|
reference?: unknown;
|
|
2633
3256
|
threshold?: number | undefined;
|
|
2634
|
-
|
|
3257
|
+
reps?: number | undefined;
|
|
3258
|
+
provider?: "openai" | "anthropic" | "google" | "claude" | undefined;
|
|
3259
|
+
apiKeyEnvVar?: string | undefined;
|
|
3260
|
+
temperature?: number | undefined;
|
|
3261
|
+
maxToolOutputSize?: number | undefined;
|
|
2635
3262
|
} | undefined;
|
|
2636
3263
|
responseSize?: {
|
|
2637
3264
|
maxBytes?: number | undefined;
|
|
2638
3265
|
minBytes?: number | undefined;
|
|
2639
3266
|
} | undefined;
|
|
2640
3267
|
}, {
|
|
3268
|
+
response?: unknown;
|
|
2641
3269
|
isError?: string | boolean | string[] | undefined;
|
|
2642
3270
|
schema?: string | undefined;
|
|
2643
3271
|
snapshot?: string | undefined;
|
|
2644
|
-
|
|
3272
|
+
toolsTriggered?: {
|
|
3273
|
+
calls: {
|
|
3274
|
+
name: string;
|
|
3275
|
+
required?: boolean | undefined;
|
|
3276
|
+
arguments?: Record<string, unknown> | undefined;
|
|
3277
|
+
}[];
|
|
3278
|
+
order?: "strict" | "any" | undefined;
|
|
3279
|
+
exclusive?: boolean | undefined;
|
|
3280
|
+
} | undefined;
|
|
3281
|
+
toolCallCount?: {
|
|
3282
|
+
exact?: number | undefined;
|
|
3283
|
+
min?: number | undefined;
|
|
3284
|
+
max?: number | undefined;
|
|
3285
|
+
} | undefined;
|
|
2645
3286
|
containsText?: string | string[] | undefined;
|
|
2646
3287
|
matchesPattern?: string | string[] | undefined;
|
|
2647
3288
|
snapshotSanitizers?: ("timestamp" | "uuid" | "iso-date" | "objectId" | "jwt" | {
|
|
@@ -2651,10 +3292,19 @@ declare const EvalCaseSchema: z.ZodObject<{
|
|
|
2651
3292
|
remove: string[];
|
|
2652
3293
|
})[] | undefined;
|
|
2653
3294
|
passesJudge?: {
|
|
2654
|
-
rubric:
|
|
3295
|
+
rubric: "correctness" | "completeness" | "groundedness" | "instruction-following" | "conciseness" | {
|
|
3296
|
+
text: string;
|
|
3297
|
+
};
|
|
3298
|
+
model?: string | undefined;
|
|
3299
|
+
maxTokens?: number | undefined;
|
|
3300
|
+
maxBudgetUsd?: number | undefined;
|
|
2655
3301
|
reference?: unknown;
|
|
2656
3302
|
threshold?: number | undefined;
|
|
2657
|
-
|
|
3303
|
+
reps?: number | undefined;
|
|
3304
|
+
provider?: "openai" | "anthropic" | "google" | "claude" | undefined;
|
|
3305
|
+
apiKeyEnvVar?: string | undefined;
|
|
3306
|
+
temperature?: number | undefined;
|
|
3307
|
+
maxToolOutputSize?: number | undefined;
|
|
2658
3308
|
} | undefined;
|
|
2659
3309
|
responseSize?: {
|
|
2660
3310
|
maxBytes?: number | undefined;
|
|
@@ -2664,24 +3314,43 @@ declare const EvalCaseSchema: z.ZodObject<{
|
|
|
2664
3314
|
}, "strip", z.ZodTypeAny, {
|
|
2665
3315
|
id: string;
|
|
2666
3316
|
args?: Record<string, unknown> | undefined;
|
|
2667
|
-
metadata?: Record<string, unknown> | undefined;
|
|
2668
3317
|
mode?: "direct" | "llm_host" | undefined;
|
|
3318
|
+
metadata?: Record<string, unknown> | undefined;
|
|
2669
3319
|
description?: string | undefined;
|
|
2670
3320
|
toolName?: string | undefined;
|
|
2671
3321
|
scenario?: string | undefined;
|
|
2672
3322
|
llmHostConfig?: {
|
|
2673
|
-
provider: "anthropic" | "
|
|
3323
|
+
provider: "openai" | "anthropic" | "azure" | "google" | "mistral" | "ollama" | "deepseek" | "openrouter" | "xai" | "vertex-anthropic";
|
|
2674
3324
|
model?: string | undefined;
|
|
2675
3325
|
maxTokens?: number | undefined;
|
|
2676
3326
|
apiKeyEnvVar?: string | undefined;
|
|
2677
3327
|
temperature?: number | undefined;
|
|
2678
3328
|
maxToolCalls?: number | undefined;
|
|
2679
3329
|
} | undefined;
|
|
3330
|
+
iterations?: number | undefined;
|
|
3331
|
+
accuracyThreshold?: number | undefined;
|
|
3332
|
+
judgeReps?: number | undefined;
|
|
3333
|
+
canonicalAnswer?: string | undefined;
|
|
3334
|
+
tags?: string[] | undefined;
|
|
2680
3335
|
expect?: {
|
|
3336
|
+
response?: unknown;
|
|
2681
3337
|
isError?: string | boolean | string[] | undefined;
|
|
2682
3338
|
schema?: string | undefined;
|
|
2683
3339
|
snapshot?: string | undefined;
|
|
2684
|
-
|
|
3340
|
+
toolsTriggered?: {
|
|
3341
|
+
calls: {
|
|
3342
|
+
name: string;
|
|
3343
|
+
required?: boolean | undefined;
|
|
3344
|
+
arguments?: Record<string, unknown> | undefined;
|
|
3345
|
+
}[];
|
|
3346
|
+
order?: "strict" | "any" | undefined;
|
|
3347
|
+
exclusive?: boolean | undefined;
|
|
3348
|
+
} | undefined;
|
|
3349
|
+
toolCallCount?: {
|
|
3350
|
+
exact?: number | undefined;
|
|
3351
|
+
min?: number | undefined;
|
|
3352
|
+
max?: number | undefined;
|
|
3353
|
+
} | undefined;
|
|
2685
3354
|
containsText?: string | string[] | undefined;
|
|
2686
3355
|
matchesPattern?: string | string[] | undefined;
|
|
2687
3356
|
snapshotSanitizers?: ("timestamp" | "uuid" | "iso-date" | "objectId" | "jwt" | {
|
|
@@ -2691,10 +3360,19 @@ declare const EvalCaseSchema: z.ZodObject<{
|
|
|
2691
3360
|
remove: string[];
|
|
2692
3361
|
})[] | undefined;
|
|
2693
3362
|
passesJudge?: {
|
|
2694
|
-
rubric:
|
|
3363
|
+
rubric: "correctness" | "completeness" | "groundedness" | "instruction-following" | "conciseness" | {
|
|
3364
|
+
text: string;
|
|
3365
|
+
};
|
|
3366
|
+
model?: string | undefined;
|
|
3367
|
+
maxTokens?: number | undefined;
|
|
3368
|
+
maxBudgetUsd?: number | undefined;
|
|
2695
3369
|
reference?: unknown;
|
|
2696
3370
|
threshold?: number | undefined;
|
|
2697
|
-
|
|
3371
|
+
reps?: number | undefined;
|
|
3372
|
+
provider?: "openai" | "anthropic" | "google" | "claude" | undefined;
|
|
3373
|
+
apiKeyEnvVar?: string | undefined;
|
|
3374
|
+
temperature?: number | undefined;
|
|
3375
|
+
maxToolOutputSize?: number | undefined;
|
|
2698
3376
|
} | undefined;
|
|
2699
3377
|
responseSize?: {
|
|
2700
3378
|
maxBytes?: number | undefined;
|
|
@@ -2704,24 +3382,43 @@ declare const EvalCaseSchema: z.ZodObject<{
|
|
|
2704
3382
|
}, {
|
|
2705
3383
|
id: string;
|
|
2706
3384
|
args?: Record<string, unknown> | undefined;
|
|
2707
|
-
metadata?: Record<string, unknown> | undefined;
|
|
2708
3385
|
mode?: "direct" | "llm_host" | undefined;
|
|
3386
|
+
metadata?: Record<string, unknown> | undefined;
|
|
2709
3387
|
description?: string | undefined;
|
|
2710
3388
|
toolName?: string | undefined;
|
|
2711
3389
|
scenario?: string | undefined;
|
|
2712
3390
|
llmHostConfig?: {
|
|
2713
|
-
provider: "anthropic" | "
|
|
3391
|
+
provider: "openai" | "anthropic" | "azure" | "google" | "mistral" | "ollama" | "deepseek" | "openrouter" | "xai" | "vertex-anthropic";
|
|
2714
3392
|
model?: string | undefined;
|
|
2715
3393
|
maxTokens?: number | undefined;
|
|
2716
3394
|
apiKeyEnvVar?: string | undefined;
|
|
2717
3395
|
temperature?: number | undefined;
|
|
2718
3396
|
maxToolCalls?: number | undefined;
|
|
2719
3397
|
} | undefined;
|
|
3398
|
+
iterations?: number | undefined;
|
|
3399
|
+
accuracyThreshold?: number | undefined;
|
|
3400
|
+
judgeReps?: number | undefined;
|
|
3401
|
+
canonicalAnswer?: string | undefined;
|
|
3402
|
+
tags?: string[] | undefined;
|
|
2720
3403
|
expect?: {
|
|
3404
|
+
response?: unknown;
|
|
2721
3405
|
isError?: string | boolean | string[] | undefined;
|
|
2722
3406
|
schema?: string | undefined;
|
|
2723
3407
|
snapshot?: string | undefined;
|
|
2724
|
-
|
|
3408
|
+
toolsTriggered?: {
|
|
3409
|
+
calls: {
|
|
3410
|
+
name: string;
|
|
3411
|
+
required?: boolean | undefined;
|
|
3412
|
+
arguments?: Record<string, unknown> | undefined;
|
|
3413
|
+
}[];
|
|
3414
|
+
order?: "strict" | "any" | undefined;
|
|
3415
|
+
exclusive?: boolean | undefined;
|
|
3416
|
+
} | undefined;
|
|
3417
|
+
toolCallCount?: {
|
|
3418
|
+
exact?: number | undefined;
|
|
3419
|
+
min?: number | undefined;
|
|
3420
|
+
max?: number | undefined;
|
|
3421
|
+
} | undefined;
|
|
2725
3422
|
containsText?: string | string[] | undefined;
|
|
2726
3423
|
matchesPattern?: string | string[] | undefined;
|
|
2727
3424
|
snapshotSanitizers?: ("timestamp" | "uuid" | "iso-date" | "objectId" | "jwt" | {
|
|
@@ -2731,10 +3428,19 @@ declare const EvalCaseSchema: z.ZodObject<{
|
|
|
2731
3428
|
remove: string[];
|
|
2732
3429
|
})[] | undefined;
|
|
2733
3430
|
passesJudge?: {
|
|
2734
|
-
rubric:
|
|
3431
|
+
rubric: "correctness" | "completeness" | "groundedness" | "instruction-following" | "conciseness" | {
|
|
3432
|
+
text: string;
|
|
3433
|
+
};
|
|
3434
|
+
model?: string | undefined;
|
|
3435
|
+
maxTokens?: number | undefined;
|
|
3436
|
+
maxBudgetUsd?: number | undefined;
|
|
2735
3437
|
reference?: unknown;
|
|
2736
3438
|
threshold?: number | undefined;
|
|
2737
|
-
|
|
3439
|
+
reps?: number | undefined;
|
|
3440
|
+
provider?: "openai" | "anthropic" | "google" | "claude" | undefined;
|
|
3441
|
+
apiKeyEnvVar?: string | undefined;
|
|
3442
|
+
temperature?: number | undefined;
|
|
3443
|
+
maxToolOutputSize?: number | undefined;
|
|
2738
3444
|
} | undefined;
|
|
2739
3445
|
responseSize?: {
|
|
2740
3446
|
maxBytes?: number | undefined;
|
|
@@ -2756,21 +3462,21 @@ declare const EvalDatasetSchema: z.ZodObject<{
|
|
|
2756
3462
|
args: z.ZodOptional<z.ZodRecord<z.ZodString, z.ZodUnknown>>;
|
|
2757
3463
|
scenario: z.ZodOptional<z.ZodString>;
|
|
2758
3464
|
llmHostConfig: z.ZodOptional<z.ZodObject<{
|
|
2759
|
-
provider: z.ZodEnum<["openai", "anthropic"]>;
|
|
3465
|
+
provider: z.ZodEnum<["openai", "anthropic", "azure", "google", "mistral", "ollama", "deepseek", "openrouter", "xai", "vertex-anthropic"]>;
|
|
2760
3466
|
apiKeyEnvVar: z.ZodOptional<z.ZodString>;
|
|
2761
3467
|
model: z.ZodOptional<z.ZodString>;
|
|
2762
3468
|
maxTokens: z.ZodOptional<z.ZodNumber>;
|
|
2763
3469
|
temperature: z.ZodOptional<z.ZodNumber>;
|
|
2764
3470
|
maxToolCalls: z.ZodOptional<z.ZodNumber>;
|
|
2765
3471
|
}, "strip", z.ZodTypeAny, {
|
|
2766
|
-
provider: "anthropic" | "
|
|
3472
|
+
provider: "openai" | "anthropic" | "azure" | "google" | "mistral" | "ollama" | "deepseek" | "openrouter" | "xai" | "vertex-anthropic";
|
|
2767
3473
|
model?: string | undefined;
|
|
2768
3474
|
maxTokens?: number | undefined;
|
|
2769
3475
|
apiKeyEnvVar?: string | undefined;
|
|
2770
3476
|
temperature?: number | undefined;
|
|
2771
3477
|
maxToolCalls?: number | undefined;
|
|
2772
3478
|
}, {
|
|
2773
|
-
provider: "anthropic" | "
|
|
3479
|
+
provider: "openai" | "anthropic" | "azure" | "google" | "mistral" | "ollama" | "deepseek" | "openrouter" | "xai" | "vertex-anthropic";
|
|
2774
3480
|
model?: string | undefined;
|
|
2775
3481
|
maxTokens?: number | undefined;
|
|
2776
3482
|
apiKeyEnvVar?: string | undefined;
|
|
@@ -2778,6 +3484,11 @@ declare const EvalDatasetSchema: z.ZodObject<{
|
|
|
2778
3484
|
maxToolCalls?: number | undefined;
|
|
2779
3485
|
}>>;
|
|
2780
3486
|
metadata: z.ZodOptional<z.ZodRecord<z.ZodString, z.ZodUnknown>>;
|
|
3487
|
+
iterations: z.ZodOptional<z.ZodNumber>;
|
|
3488
|
+
accuracyThreshold: z.ZodOptional<z.ZodNumber>;
|
|
3489
|
+
judgeReps: z.ZodOptional<z.ZodNumber>;
|
|
3490
|
+
canonicalAnswer: z.ZodOptional<z.ZodString>;
|
|
3491
|
+
tags: z.ZodOptional<z.ZodArray<z.ZodString, "many">>;
|
|
2781
3492
|
expect: z.ZodOptional<z.ZodObject<{
|
|
2782
3493
|
response: z.ZodOptional<z.ZodUnknown>;
|
|
2783
3494
|
schema: z.ZodOptional<z.ZodString>;
|
|
@@ -2802,20 +3513,51 @@ declare const EvalDatasetSchema: z.ZodObject<{
|
|
|
2802
3513
|
}>]>, "many">>;
|
|
2803
3514
|
isError: z.ZodOptional<z.ZodUnion<[z.ZodBoolean, z.ZodString, z.ZodArray<z.ZodString, "many">]>>;
|
|
2804
3515
|
passesJudge: z.ZodOptional<z.ZodObject<{
|
|
2805
|
-
rubric: z.
|
|
3516
|
+
rubric: z.ZodUnion<[z.ZodEnum<["correctness", "completeness", "groundedness", "instruction-following", "conciseness"]>, z.ZodObject<{
|
|
3517
|
+
text: z.ZodString;
|
|
3518
|
+
}, "strip", z.ZodTypeAny, {
|
|
3519
|
+
text: string;
|
|
3520
|
+
}, {
|
|
3521
|
+
text: string;
|
|
3522
|
+
}>]>;
|
|
2806
3523
|
reference: z.ZodOptional<z.ZodUnknown>;
|
|
2807
3524
|
threshold: z.ZodOptional<z.ZodNumber>;
|
|
2808
|
-
|
|
3525
|
+
reps: z.ZodOptional<z.ZodNumber>;
|
|
3526
|
+
provider: z.ZodOptional<z.ZodEnum<["claude", "anthropic", "openai", "google"]>>;
|
|
3527
|
+
model: z.ZodOptional<z.ZodString>;
|
|
3528
|
+
apiKeyEnvVar: z.ZodOptional<z.ZodString>;
|
|
3529
|
+
maxTokens: z.ZodOptional<z.ZodNumber>;
|
|
3530
|
+
temperature: z.ZodOptional<z.ZodNumber>;
|
|
3531
|
+
maxBudgetUsd: z.ZodOptional<z.ZodNumber>;
|
|
3532
|
+
maxToolOutputSize: z.ZodOptional<z.ZodNumber>;
|
|
2809
3533
|
}, "strip", z.ZodTypeAny, {
|
|
2810
|
-
rubric:
|
|
3534
|
+
rubric: "correctness" | "completeness" | "groundedness" | "instruction-following" | "conciseness" | {
|
|
3535
|
+
text: string;
|
|
3536
|
+
};
|
|
3537
|
+
model?: string | undefined;
|
|
3538
|
+
maxTokens?: number | undefined;
|
|
3539
|
+
maxBudgetUsd?: number | undefined;
|
|
2811
3540
|
reference?: unknown;
|
|
2812
3541
|
threshold?: number | undefined;
|
|
2813
|
-
|
|
3542
|
+
reps?: number | undefined;
|
|
3543
|
+
provider?: "openai" | "anthropic" | "google" | "claude" | undefined;
|
|
3544
|
+
apiKeyEnvVar?: string | undefined;
|
|
3545
|
+
temperature?: number | undefined;
|
|
3546
|
+
maxToolOutputSize?: number | undefined;
|
|
2814
3547
|
}, {
|
|
2815
|
-
rubric:
|
|
3548
|
+
rubric: "correctness" | "completeness" | "groundedness" | "instruction-following" | "conciseness" | {
|
|
3549
|
+
text: string;
|
|
3550
|
+
};
|
|
3551
|
+
model?: string | undefined;
|
|
3552
|
+
maxTokens?: number | undefined;
|
|
3553
|
+
maxBudgetUsd?: number | undefined;
|
|
2816
3554
|
reference?: unknown;
|
|
2817
3555
|
threshold?: number | undefined;
|
|
2818
|
-
|
|
3556
|
+
reps?: number | undefined;
|
|
3557
|
+
provider?: "openai" | "anthropic" | "google" | "claude" | undefined;
|
|
3558
|
+
apiKeyEnvVar?: string | undefined;
|
|
3559
|
+
temperature?: number | undefined;
|
|
3560
|
+
maxToolOutputSize?: number | undefined;
|
|
2819
3561
|
}>>;
|
|
2820
3562
|
responseSize: z.ZodOptional<z.ZodObject<{
|
|
2821
3563
|
maxBytes: z.ZodOptional<z.ZodNumber>;
|
|
@@ -2827,11 +3569,71 @@ declare const EvalDatasetSchema: z.ZodObject<{
|
|
|
2827
3569
|
maxBytes?: number | undefined;
|
|
2828
3570
|
minBytes?: number | undefined;
|
|
2829
3571
|
}>>;
|
|
3572
|
+
toolsTriggered: z.ZodOptional<z.ZodObject<{
|
|
3573
|
+
calls: z.ZodArray<z.ZodObject<{
|
|
3574
|
+
name: z.ZodString;
|
|
3575
|
+
arguments: z.ZodOptional<z.ZodRecord<z.ZodString, z.ZodUnknown>>;
|
|
3576
|
+
required: z.ZodOptional<z.ZodBoolean>;
|
|
3577
|
+
}, "strip", z.ZodTypeAny, {
|
|
3578
|
+
name: string;
|
|
3579
|
+
required?: boolean | undefined;
|
|
3580
|
+
arguments?: Record<string, unknown> | undefined;
|
|
3581
|
+
}, {
|
|
3582
|
+
name: string;
|
|
3583
|
+
required?: boolean | undefined;
|
|
3584
|
+
arguments?: Record<string, unknown> | undefined;
|
|
3585
|
+
}>, "many">;
|
|
3586
|
+
order: z.ZodOptional<z.ZodEnum<["strict", "any"]>>;
|
|
3587
|
+
exclusive: z.ZodOptional<z.ZodBoolean>;
|
|
3588
|
+
}, "strip", z.ZodTypeAny, {
|
|
3589
|
+
calls: {
|
|
3590
|
+
name: string;
|
|
3591
|
+
required?: boolean | undefined;
|
|
3592
|
+
arguments?: Record<string, unknown> | undefined;
|
|
3593
|
+
}[];
|
|
3594
|
+
order?: "strict" | "any" | undefined;
|
|
3595
|
+
exclusive?: boolean | undefined;
|
|
3596
|
+
}, {
|
|
3597
|
+
calls: {
|
|
3598
|
+
name: string;
|
|
3599
|
+
required?: boolean | undefined;
|
|
3600
|
+
arguments?: Record<string, unknown> | undefined;
|
|
3601
|
+
}[];
|
|
3602
|
+
order?: "strict" | "any" | undefined;
|
|
3603
|
+
exclusive?: boolean | undefined;
|
|
3604
|
+
}>>;
|
|
3605
|
+
toolCallCount: z.ZodOptional<z.ZodObject<{
|
|
3606
|
+
min: z.ZodOptional<z.ZodNumber>;
|
|
3607
|
+
max: z.ZodOptional<z.ZodNumber>;
|
|
3608
|
+
exact: z.ZodOptional<z.ZodNumber>;
|
|
3609
|
+
}, "strip", z.ZodTypeAny, {
|
|
3610
|
+
exact?: number | undefined;
|
|
3611
|
+
min?: number | undefined;
|
|
3612
|
+
max?: number | undefined;
|
|
3613
|
+
}, {
|
|
3614
|
+
exact?: number | undefined;
|
|
3615
|
+
min?: number | undefined;
|
|
3616
|
+
max?: number | undefined;
|
|
3617
|
+
}>>;
|
|
2830
3618
|
}, "strip", z.ZodTypeAny, {
|
|
3619
|
+
response?: unknown;
|
|
2831
3620
|
isError?: string | boolean | string[] | undefined;
|
|
2832
3621
|
schema?: string | undefined;
|
|
2833
3622
|
snapshot?: string | undefined;
|
|
2834
|
-
|
|
3623
|
+
toolsTriggered?: {
|
|
3624
|
+
calls: {
|
|
3625
|
+
name: string;
|
|
3626
|
+
required?: boolean | undefined;
|
|
3627
|
+
arguments?: Record<string, unknown> | undefined;
|
|
3628
|
+
}[];
|
|
3629
|
+
order?: "strict" | "any" | undefined;
|
|
3630
|
+
exclusive?: boolean | undefined;
|
|
3631
|
+
} | undefined;
|
|
3632
|
+
toolCallCount?: {
|
|
3633
|
+
exact?: number | undefined;
|
|
3634
|
+
min?: number | undefined;
|
|
3635
|
+
max?: number | undefined;
|
|
3636
|
+
} | undefined;
|
|
2835
3637
|
containsText?: string | string[] | undefined;
|
|
2836
3638
|
matchesPattern?: string | string[] | undefined;
|
|
2837
3639
|
snapshotSanitizers?: ("timestamp" | "uuid" | "iso-date" | "objectId" | "jwt" | {
|
|
@@ -2841,20 +3643,43 @@ declare const EvalDatasetSchema: z.ZodObject<{
|
|
|
2841
3643
|
remove: string[];
|
|
2842
3644
|
})[] | undefined;
|
|
2843
3645
|
passesJudge?: {
|
|
2844
|
-
rubric:
|
|
3646
|
+
rubric: "correctness" | "completeness" | "groundedness" | "instruction-following" | "conciseness" | {
|
|
3647
|
+
text: string;
|
|
3648
|
+
};
|
|
3649
|
+
model?: string | undefined;
|
|
3650
|
+
maxTokens?: number | undefined;
|
|
3651
|
+
maxBudgetUsd?: number | undefined;
|
|
2845
3652
|
reference?: unknown;
|
|
2846
3653
|
threshold?: number | undefined;
|
|
2847
|
-
|
|
3654
|
+
reps?: number | undefined;
|
|
3655
|
+
provider?: "openai" | "anthropic" | "google" | "claude" | undefined;
|
|
3656
|
+
apiKeyEnvVar?: string | undefined;
|
|
3657
|
+
temperature?: number | undefined;
|
|
3658
|
+
maxToolOutputSize?: number | undefined;
|
|
2848
3659
|
} | undefined;
|
|
2849
3660
|
responseSize?: {
|
|
2850
3661
|
maxBytes?: number | undefined;
|
|
2851
3662
|
minBytes?: number | undefined;
|
|
2852
3663
|
} | undefined;
|
|
2853
3664
|
}, {
|
|
3665
|
+
response?: unknown;
|
|
2854
3666
|
isError?: string | boolean | string[] | undefined;
|
|
2855
3667
|
schema?: string | undefined;
|
|
2856
3668
|
snapshot?: string | undefined;
|
|
2857
|
-
|
|
3669
|
+
toolsTriggered?: {
|
|
3670
|
+
calls: {
|
|
3671
|
+
name: string;
|
|
3672
|
+
required?: boolean | undefined;
|
|
3673
|
+
arguments?: Record<string, unknown> | undefined;
|
|
3674
|
+
}[];
|
|
3675
|
+
order?: "strict" | "any" | undefined;
|
|
3676
|
+
exclusive?: boolean | undefined;
|
|
3677
|
+
} | undefined;
|
|
3678
|
+
toolCallCount?: {
|
|
3679
|
+
exact?: number | undefined;
|
|
3680
|
+
min?: number | undefined;
|
|
3681
|
+
max?: number | undefined;
|
|
3682
|
+
} | undefined;
|
|
2858
3683
|
containsText?: string | string[] | undefined;
|
|
2859
3684
|
matchesPattern?: string | string[] | undefined;
|
|
2860
3685
|
snapshotSanitizers?: ("timestamp" | "uuid" | "iso-date" | "objectId" | "jwt" | {
|
|
@@ -2864,10 +3689,19 @@ declare const EvalDatasetSchema: z.ZodObject<{
|
|
|
2864
3689
|
remove: string[];
|
|
2865
3690
|
})[] | undefined;
|
|
2866
3691
|
passesJudge?: {
|
|
2867
|
-
rubric:
|
|
3692
|
+
rubric: "correctness" | "completeness" | "groundedness" | "instruction-following" | "conciseness" | {
|
|
3693
|
+
text: string;
|
|
3694
|
+
};
|
|
3695
|
+
model?: string | undefined;
|
|
3696
|
+
maxTokens?: number | undefined;
|
|
3697
|
+
maxBudgetUsd?: number | undefined;
|
|
2868
3698
|
reference?: unknown;
|
|
2869
3699
|
threshold?: number | undefined;
|
|
2870
|
-
|
|
3700
|
+
reps?: number | undefined;
|
|
3701
|
+
provider?: "openai" | "anthropic" | "google" | "claude" | undefined;
|
|
3702
|
+
apiKeyEnvVar?: string | undefined;
|
|
3703
|
+
temperature?: number | undefined;
|
|
3704
|
+
maxToolOutputSize?: number | undefined;
|
|
2871
3705
|
} | undefined;
|
|
2872
3706
|
responseSize?: {
|
|
2873
3707
|
maxBytes?: number | undefined;
|
|
@@ -2877,24 +3711,43 @@ declare const EvalDatasetSchema: z.ZodObject<{
|
|
|
2877
3711
|
}, "strip", z.ZodTypeAny, {
|
|
2878
3712
|
id: string;
|
|
2879
3713
|
args?: Record<string, unknown> | undefined;
|
|
2880
|
-
metadata?: Record<string, unknown> | undefined;
|
|
2881
3714
|
mode?: "direct" | "llm_host" | undefined;
|
|
3715
|
+
metadata?: Record<string, unknown> | undefined;
|
|
2882
3716
|
description?: string | undefined;
|
|
2883
3717
|
toolName?: string | undefined;
|
|
2884
3718
|
scenario?: string | undefined;
|
|
2885
3719
|
llmHostConfig?: {
|
|
2886
|
-
provider: "anthropic" | "
|
|
3720
|
+
provider: "openai" | "anthropic" | "azure" | "google" | "mistral" | "ollama" | "deepseek" | "openrouter" | "xai" | "vertex-anthropic";
|
|
2887
3721
|
model?: string | undefined;
|
|
2888
3722
|
maxTokens?: number | undefined;
|
|
2889
3723
|
apiKeyEnvVar?: string | undefined;
|
|
2890
3724
|
temperature?: number | undefined;
|
|
2891
3725
|
maxToolCalls?: number | undefined;
|
|
2892
3726
|
} | undefined;
|
|
3727
|
+
iterations?: number | undefined;
|
|
3728
|
+
accuracyThreshold?: number | undefined;
|
|
3729
|
+
judgeReps?: number | undefined;
|
|
3730
|
+
canonicalAnswer?: string | undefined;
|
|
3731
|
+
tags?: string[] | undefined;
|
|
2893
3732
|
expect?: {
|
|
3733
|
+
response?: unknown;
|
|
2894
3734
|
isError?: string | boolean | string[] | undefined;
|
|
2895
3735
|
schema?: string | undefined;
|
|
2896
3736
|
snapshot?: string | undefined;
|
|
2897
|
-
|
|
3737
|
+
toolsTriggered?: {
|
|
3738
|
+
calls: {
|
|
3739
|
+
name: string;
|
|
3740
|
+
required?: boolean | undefined;
|
|
3741
|
+
arguments?: Record<string, unknown> | undefined;
|
|
3742
|
+
}[];
|
|
3743
|
+
order?: "strict" | "any" | undefined;
|
|
3744
|
+
exclusive?: boolean | undefined;
|
|
3745
|
+
} | undefined;
|
|
3746
|
+
toolCallCount?: {
|
|
3747
|
+
exact?: number | undefined;
|
|
3748
|
+
min?: number | undefined;
|
|
3749
|
+
max?: number | undefined;
|
|
3750
|
+
} | undefined;
|
|
2898
3751
|
containsText?: string | string[] | undefined;
|
|
2899
3752
|
matchesPattern?: string | string[] | undefined;
|
|
2900
3753
|
snapshotSanitizers?: ("timestamp" | "uuid" | "iso-date" | "objectId" | "jwt" | {
|
|
@@ -2904,10 +3757,19 @@ declare const EvalDatasetSchema: z.ZodObject<{
|
|
|
2904
3757
|
remove: string[];
|
|
2905
3758
|
})[] | undefined;
|
|
2906
3759
|
passesJudge?: {
|
|
2907
|
-
rubric:
|
|
3760
|
+
rubric: "correctness" | "completeness" | "groundedness" | "instruction-following" | "conciseness" | {
|
|
3761
|
+
text: string;
|
|
3762
|
+
};
|
|
3763
|
+
model?: string | undefined;
|
|
3764
|
+
maxTokens?: number | undefined;
|
|
3765
|
+
maxBudgetUsd?: number | undefined;
|
|
2908
3766
|
reference?: unknown;
|
|
2909
3767
|
threshold?: number | undefined;
|
|
2910
|
-
|
|
3768
|
+
reps?: number | undefined;
|
|
3769
|
+
provider?: "openai" | "anthropic" | "google" | "claude" | undefined;
|
|
3770
|
+
apiKeyEnvVar?: string | undefined;
|
|
3771
|
+
temperature?: number | undefined;
|
|
3772
|
+
maxToolOutputSize?: number | undefined;
|
|
2911
3773
|
} | undefined;
|
|
2912
3774
|
responseSize?: {
|
|
2913
3775
|
maxBytes?: number | undefined;
|
|
@@ -2917,24 +3779,43 @@ declare const EvalDatasetSchema: z.ZodObject<{
|
|
|
2917
3779
|
}, {
|
|
2918
3780
|
id: string;
|
|
2919
3781
|
args?: Record<string, unknown> | undefined;
|
|
2920
|
-
metadata?: Record<string, unknown> | undefined;
|
|
2921
3782
|
mode?: "direct" | "llm_host" | undefined;
|
|
3783
|
+
metadata?: Record<string, unknown> | undefined;
|
|
2922
3784
|
description?: string | undefined;
|
|
2923
3785
|
toolName?: string | undefined;
|
|
2924
3786
|
scenario?: string | undefined;
|
|
2925
3787
|
llmHostConfig?: {
|
|
2926
|
-
provider: "anthropic" | "
|
|
3788
|
+
provider: "openai" | "anthropic" | "azure" | "google" | "mistral" | "ollama" | "deepseek" | "openrouter" | "xai" | "vertex-anthropic";
|
|
2927
3789
|
model?: string | undefined;
|
|
2928
3790
|
maxTokens?: number | undefined;
|
|
2929
3791
|
apiKeyEnvVar?: string | undefined;
|
|
2930
3792
|
temperature?: number | undefined;
|
|
2931
3793
|
maxToolCalls?: number | undefined;
|
|
2932
3794
|
} | undefined;
|
|
3795
|
+
iterations?: number | undefined;
|
|
3796
|
+
accuracyThreshold?: number | undefined;
|
|
3797
|
+
judgeReps?: number | undefined;
|
|
3798
|
+
canonicalAnswer?: string | undefined;
|
|
3799
|
+
tags?: string[] | undefined;
|
|
2933
3800
|
expect?: {
|
|
3801
|
+
response?: unknown;
|
|
2934
3802
|
isError?: string | boolean | string[] | undefined;
|
|
2935
3803
|
schema?: string | undefined;
|
|
2936
3804
|
snapshot?: string | undefined;
|
|
2937
|
-
|
|
3805
|
+
toolsTriggered?: {
|
|
3806
|
+
calls: {
|
|
3807
|
+
name: string;
|
|
3808
|
+
required?: boolean | undefined;
|
|
3809
|
+
arguments?: Record<string, unknown> | undefined;
|
|
3810
|
+
}[];
|
|
3811
|
+
order?: "strict" | "any" | undefined;
|
|
3812
|
+
exclusive?: boolean | undefined;
|
|
3813
|
+
} | undefined;
|
|
3814
|
+
toolCallCount?: {
|
|
3815
|
+
exact?: number | undefined;
|
|
3816
|
+
min?: number | undefined;
|
|
3817
|
+
max?: number | undefined;
|
|
3818
|
+
} | undefined;
|
|
2938
3819
|
containsText?: string | string[] | undefined;
|
|
2939
3820
|
matchesPattern?: string | string[] | undefined;
|
|
2940
3821
|
snapshotSanitizers?: ("timestamp" | "uuid" | "iso-date" | "objectId" | "jwt" | {
|
|
@@ -2944,10 +3825,19 @@ declare const EvalDatasetSchema: z.ZodObject<{
|
|
|
2944
3825
|
remove: string[];
|
|
2945
3826
|
})[] | undefined;
|
|
2946
3827
|
passesJudge?: {
|
|
2947
|
-
rubric:
|
|
3828
|
+
rubric: "correctness" | "completeness" | "groundedness" | "instruction-following" | "conciseness" | {
|
|
3829
|
+
text: string;
|
|
3830
|
+
};
|
|
3831
|
+
model?: string | undefined;
|
|
3832
|
+
maxTokens?: number | undefined;
|
|
3833
|
+
maxBudgetUsd?: number | undefined;
|
|
2948
3834
|
reference?: unknown;
|
|
2949
3835
|
threshold?: number | undefined;
|
|
2950
|
-
|
|
3836
|
+
reps?: number | undefined;
|
|
3837
|
+
provider?: "openai" | "anthropic" | "google" | "claude" | undefined;
|
|
3838
|
+
apiKeyEnvVar?: string | undefined;
|
|
3839
|
+
temperature?: number | undefined;
|
|
3840
|
+
maxToolOutputSize?: number | undefined;
|
|
2951
3841
|
} | undefined;
|
|
2952
3842
|
responseSize?: {
|
|
2953
3843
|
maxBytes?: number | undefined;
|
|
@@ -2961,24 +3851,43 @@ declare const EvalDatasetSchema: z.ZodObject<{
|
|
|
2961
3851
|
cases: {
|
|
2962
3852
|
id: string;
|
|
2963
3853
|
args?: Record<string, unknown> | undefined;
|
|
2964
|
-
metadata?: Record<string, unknown> | undefined;
|
|
2965
3854
|
mode?: "direct" | "llm_host" | undefined;
|
|
3855
|
+
metadata?: Record<string, unknown> | undefined;
|
|
2966
3856
|
description?: string | undefined;
|
|
2967
3857
|
toolName?: string | undefined;
|
|
2968
3858
|
scenario?: string | undefined;
|
|
2969
3859
|
llmHostConfig?: {
|
|
2970
|
-
provider: "anthropic" | "
|
|
3860
|
+
provider: "openai" | "anthropic" | "azure" | "google" | "mistral" | "ollama" | "deepseek" | "openrouter" | "xai" | "vertex-anthropic";
|
|
2971
3861
|
model?: string | undefined;
|
|
2972
3862
|
maxTokens?: number | undefined;
|
|
2973
3863
|
apiKeyEnvVar?: string | undefined;
|
|
2974
3864
|
temperature?: number | undefined;
|
|
2975
3865
|
maxToolCalls?: number | undefined;
|
|
2976
3866
|
} | undefined;
|
|
3867
|
+
iterations?: number | undefined;
|
|
3868
|
+
accuracyThreshold?: number | undefined;
|
|
3869
|
+
judgeReps?: number | undefined;
|
|
3870
|
+
canonicalAnswer?: string | undefined;
|
|
3871
|
+
tags?: string[] | undefined;
|
|
2977
3872
|
expect?: {
|
|
3873
|
+
response?: unknown;
|
|
2978
3874
|
isError?: string | boolean | string[] | undefined;
|
|
2979
3875
|
schema?: string | undefined;
|
|
2980
3876
|
snapshot?: string | undefined;
|
|
2981
|
-
|
|
3877
|
+
toolsTriggered?: {
|
|
3878
|
+
calls: {
|
|
3879
|
+
name: string;
|
|
3880
|
+
required?: boolean | undefined;
|
|
3881
|
+
arguments?: Record<string, unknown> | undefined;
|
|
3882
|
+
}[];
|
|
3883
|
+
order?: "strict" | "any" | undefined;
|
|
3884
|
+
exclusive?: boolean | undefined;
|
|
3885
|
+
} | undefined;
|
|
3886
|
+
toolCallCount?: {
|
|
3887
|
+
exact?: number | undefined;
|
|
3888
|
+
min?: number | undefined;
|
|
3889
|
+
max?: number | undefined;
|
|
3890
|
+
} | undefined;
|
|
2982
3891
|
containsText?: string | string[] | undefined;
|
|
2983
3892
|
matchesPattern?: string | string[] | undefined;
|
|
2984
3893
|
snapshotSanitizers?: ("timestamp" | "uuid" | "iso-date" | "objectId" | "jwt" | {
|
|
@@ -2988,10 +3897,19 @@ declare const EvalDatasetSchema: z.ZodObject<{
|
|
|
2988
3897
|
remove: string[];
|
|
2989
3898
|
})[] | undefined;
|
|
2990
3899
|
passesJudge?: {
|
|
2991
|
-
rubric:
|
|
3900
|
+
rubric: "correctness" | "completeness" | "groundedness" | "instruction-following" | "conciseness" | {
|
|
3901
|
+
text: string;
|
|
3902
|
+
};
|
|
3903
|
+
model?: string | undefined;
|
|
3904
|
+
maxTokens?: number | undefined;
|
|
3905
|
+
maxBudgetUsd?: number | undefined;
|
|
2992
3906
|
reference?: unknown;
|
|
2993
3907
|
threshold?: number | undefined;
|
|
2994
|
-
|
|
3908
|
+
reps?: number | undefined;
|
|
3909
|
+
provider?: "openai" | "anthropic" | "google" | "claude" | undefined;
|
|
3910
|
+
apiKeyEnvVar?: string | undefined;
|
|
3911
|
+
temperature?: number | undefined;
|
|
3912
|
+
maxToolOutputSize?: number | undefined;
|
|
2995
3913
|
} | undefined;
|
|
2996
3914
|
responseSize?: {
|
|
2997
3915
|
maxBytes?: number | undefined;
|
|
@@ -3006,24 +3924,43 @@ declare const EvalDatasetSchema: z.ZodObject<{
|
|
|
3006
3924
|
cases: {
|
|
3007
3925
|
id: string;
|
|
3008
3926
|
args?: Record<string, unknown> | undefined;
|
|
3009
|
-
metadata?: Record<string, unknown> | undefined;
|
|
3010
3927
|
mode?: "direct" | "llm_host" | undefined;
|
|
3928
|
+
metadata?: Record<string, unknown> | undefined;
|
|
3011
3929
|
description?: string | undefined;
|
|
3012
3930
|
toolName?: string | undefined;
|
|
3013
3931
|
scenario?: string | undefined;
|
|
3014
3932
|
llmHostConfig?: {
|
|
3015
|
-
provider: "anthropic" | "
|
|
3933
|
+
provider: "openai" | "anthropic" | "azure" | "google" | "mistral" | "ollama" | "deepseek" | "openrouter" | "xai" | "vertex-anthropic";
|
|
3016
3934
|
model?: string | undefined;
|
|
3017
3935
|
maxTokens?: number | undefined;
|
|
3018
3936
|
apiKeyEnvVar?: string | undefined;
|
|
3019
3937
|
temperature?: number | undefined;
|
|
3020
3938
|
maxToolCalls?: number | undefined;
|
|
3021
3939
|
} | undefined;
|
|
3940
|
+
iterations?: number | undefined;
|
|
3941
|
+
accuracyThreshold?: number | undefined;
|
|
3942
|
+
judgeReps?: number | undefined;
|
|
3943
|
+
canonicalAnswer?: string | undefined;
|
|
3944
|
+
tags?: string[] | undefined;
|
|
3022
3945
|
expect?: {
|
|
3946
|
+
response?: unknown;
|
|
3023
3947
|
isError?: string | boolean | string[] | undefined;
|
|
3024
3948
|
schema?: string | undefined;
|
|
3025
3949
|
snapshot?: string | undefined;
|
|
3026
|
-
|
|
3950
|
+
toolsTriggered?: {
|
|
3951
|
+
calls: {
|
|
3952
|
+
name: string;
|
|
3953
|
+
required?: boolean | undefined;
|
|
3954
|
+
arguments?: Record<string, unknown> | undefined;
|
|
3955
|
+
}[];
|
|
3956
|
+
order?: "strict" | "any" | undefined;
|
|
3957
|
+
exclusive?: boolean | undefined;
|
|
3958
|
+
} | undefined;
|
|
3959
|
+
toolCallCount?: {
|
|
3960
|
+
exact?: number | undefined;
|
|
3961
|
+
min?: number | undefined;
|
|
3962
|
+
max?: number | undefined;
|
|
3963
|
+
} | undefined;
|
|
3027
3964
|
containsText?: string | string[] | undefined;
|
|
3028
3965
|
matchesPattern?: string | string[] | undefined;
|
|
3029
3966
|
snapshotSanitizers?: ("timestamp" | "uuid" | "iso-date" | "objectId" | "jwt" | {
|
|
@@ -3033,10 +3970,19 @@ declare const EvalDatasetSchema: z.ZodObject<{
|
|
|
3033
3970
|
remove: string[];
|
|
3034
3971
|
})[] | undefined;
|
|
3035
3972
|
passesJudge?: {
|
|
3036
|
-
rubric:
|
|
3973
|
+
rubric: "correctness" | "completeness" | "groundedness" | "instruction-following" | "conciseness" | {
|
|
3974
|
+
text: string;
|
|
3975
|
+
};
|
|
3976
|
+
model?: string | undefined;
|
|
3977
|
+
maxTokens?: number | undefined;
|
|
3978
|
+
maxBudgetUsd?: number | undefined;
|
|
3037
3979
|
reference?: unknown;
|
|
3038
3980
|
threshold?: number | undefined;
|
|
3039
|
-
|
|
3981
|
+
reps?: number | undefined;
|
|
3982
|
+
provider?: "openai" | "anthropic" | "google" | "claude" | undefined;
|
|
3983
|
+
apiKeyEnvVar?: string | undefined;
|
|
3984
|
+
temperature?: number | undefined;
|
|
3985
|
+
maxToolOutputSize?: number | undefined;
|
|
3040
3986
|
} | undefined;
|
|
3041
3987
|
responseSize?: {
|
|
3042
3988
|
maxBytes?: number | undefined;
|
|
@@ -3126,50 +4072,140 @@ declare function loadEvalDataset(filePath: string, options?: LoadDatasetOptions)
|
|
|
3126
4072
|
declare function loadEvalDatasetFromObject(data: unknown, options?: LoadDatasetOptions): EvalDataset;
|
|
3127
4073
|
|
|
3128
4074
|
/**
|
|
3129
|
-
*
|
|
4075
|
+
* Reporter-specific type definitions
|
|
4076
|
+
*
|
|
4077
|
+
* These types are used by the MCP reporter and UI.
|
|
4078
|
+
*
|
|
4079
|
+
* @packageDocumentation
|
|
3130
4080
|
*/
|
|
3131
|
-
|
|
4081
|
+
|
|
4082
|
+
/**
|
|
4083
|
+
* Experiment tracking metadata for an eval run
|
|
4084
|
+
*/
|
|
4085
|
+
interface EvalRunMetadata {
|
|
4086
|
+
/** Git commit hash at time of run */
|
|
4087
|
+
gitHash?: string;
|
|
4088
|
+
/** ISO timestamp of the run */
|
|
4089
|
+
timestamp: string;
|
|
4090
|
+
/** Package version from package.json */
|
|
4091
|
+
packageVersion: string;
|
|
4092
|
+
/** LLM host model identifier (if llm_host mode) */
|
|
4093
|
+
llmHostModel?: string;
|
|
4094
|
+
/** Judge model identifier (if judge was used) */
|
|
4095
|
+
judgeModel?: string;
|
|
4096
|
+
}
|
|
4097
|
+
/**
|
|
4098
|
+
* Individual conformance check result
|
|
4099
|
+
*/
|
|
4100
|
+
interface MCPConformanceCheck$1 {
|
|
3132
4101
|
/**
|
|
3133
|
-
*
|
|
4102
|
+
* Check name (e.g., 'server_info_present', 'list_tools_succeeds')
|
|
3134
4103
|
*/
|
|
3135
|
-
|
|
4104
|
+
name: string;
|
|
3136
4105
|
/**
|
|
3137
|
-
*
|
|
3138
|
-
* When provided, eval results will be attached to the test for the MCP reporter
|
|
4106
|
+
* Whether the check passed
|
|
3139
4107
|
*/
|
|
3140
|
-
|
|
4108
|
+
pass: boolean;
|
|
3141
4109
|
/**
|
|
3142
|
-
*
|
|
3143
|
-
* Required for snapshot expectations to work properly
|
|
4110
|
+
* Human-readable message describing the result
|
|
3144
4111
|
*/
|
|
3145
|
-
|
|
4112
|
+
message: string;
|
|
3146
4113
|
}
|
|
3147
|
-
|
|
3148
4114
|
/**
|
|
3149
|
-
*
|
|
4115
|
+
* Conformance check result as stored in reporter data
|
|
3150
4116
|
*/
|
|
3151
|
-
interface
|
|
4117
|
+
interface MCPConformanceResultData {
|
|
3152
4118
|
/**
|
|
3153
|
-
*
|
|
4119
|
+
* Test title where conformance check was run
|
|
3154
4120
|
*/
|
|
3155
|
-
|
|
4121
|
+
testTitle: string;
|
|
3156
4122
|
/**
|
|
3157
|
-
*
|
|
4123
|
+
* Whether all checks passed
|
|
3158
4124
|
*/
|
|
3159
|
-
|
|
4125
|
+
pass: boolean;
|
|
3160
4126
|
/**
|
|
3161
|
-
*
|
|
4127
|
+
* Individual check results
|
|
3162
4128
|
*/
|
|
3163
|
-
|
|
4129
|
+
checks: MCPConformanceCheck$1[];
|
|
3164
4130
|
/**
|
|
3165
|
-
*
|
|
3166
|
-
* @deprecated Mode is inferred from test context, not displayed in reports
|
|
4131
|
+
* Server info if available
|
|
3167
4132
|
*/
|
|
3168
|
-
|
|
3169
|
-
|
|
3170
|
-
|
|
3171
|
-
|
|
3172
|
-
|
|
4133
|
+
serverInfo?: {
|
|
4134
|
+
name?: string;
|
|
4135
|
+
version?: string;
|
|
4136
|
+
};
|
|
4137
|
+
/**
|
|
4138
|
+
* Number of tools discovered
|
|
4139
|
+
*/
|
|
4140
|
+
toolCount: number;
|
|
4141
|
+
/**
|
|
4142
|
+
* Auth type used for this check
|
|
4143
|
+
*/
|
|
4144
|
+
authType?: AuthType;
|
|
4145
|
+
/**
|
|
4146
|
+
* Project name
|
|
4147
|
+
*/
|
|
4148
|
+
project?: string;
|
|
4149
|
+
}
|
|
4150
|
+
/**
|
|
4151
|
+
* Server capabilities data from mcp-list-tools attachment
|
|
4152
|
+
*/
|
|
4153
|
+
interface MCPServerCapabilitiesData {
|
|
4154
|
+
/**
|
|
4155
|
+
* Test title where listTools was called
|
|
4156
|
+
*/
|
|
4157
|
+
testTitle: string;
|
|
4158
|
+
/**
|
|
4159
|
+
* List of tools available on the server
|
|
4160
|
+
*/
|
|
4161
|
+
tools: Array<{
|
|
4162
|
+
name: string;
|
|
4163
|
+
description?: string;
|
|
4164
|
+
}>;
|
|
4165
|
+
/**
|
|
4166
|
+
* Total number of tools
|
|
4167
|
+
*/
|
|
4168
|
+
toolCount: number;
|
|
4169
|
+
/**
|
|
4170
|
+
* Auth type used for this test
|
|
4171
|
+
*/
|
|
4172
|
+
authType?: AuthType;
|
|
4173
|
+
/**
|
|
4174
|
+
* Project name
|
|
4175
|
+
*/
|
|
4176
|
+
project?: string;
|
|
4177
|
+
}
|
|
4178
|
+
/**
|
|
4179
|
+
* Result of a single iteration within a multi-iteration eval case
|
|
4180
|
+
*/
|
|
4181
|
+
interface IterationResult {
|
|
4182
|
+
/** Whether this iteration passed */
|
|
4183
|
+
pass: boolean;
|
|
4184
|
+
/** Execution time for this iteration */
|
|
4185
|
+
durationMs: number;
|
|
4186
|
+
/** Error message if the iteration failed with an exception */
|
|
4187
|
+
error?: string;
|
|
4188
|
+
/** When true, this iteration failed due to network/infrastructure issues rather than an assertion failure */
|
|
4189
|
+
isInfrastructureError?: boolean;
|
|
4190
|
+
}
|
|
4191
|
+
/**
|
|
4192
|
+
* Result of a single eval case
|
|
4193
|
+
*/
|
|
4194
|
+
interface EvalCaseResult {
|
|
4195
|
+
/**
|
|
4196
|
+
* Case ID
|
|
4197
|
+
*/
|
|
4198
|
+
id: string;
|
|
4199
|
+
/**
|
|
4200
|
+
* Dataset name this case belongs to
|
|
4201
|
+
*/
|
|
4202
|
+
datasetName: string;
|
|
4203
|
+
/**
|
|
4204
|
+
* MCP tool name that was called
|
|
4205
|
+
*/
|
|
4206
|
+
toolName: string;
|
|
4207
|
+
/**
|
|
4208
|
+
* Source of this result
|
|
3173
4209
|
*/
|
|
3174
4210
|
source: ResultSource;
|
|
3175
4211
|
/**
|
|
@@ -3194,14 +4230,164 @@ interface EvalCaseResult$1 {
|
|
|
3194
4230
|
authType?: AuthType;
|
|
3195
4231
|
/**
|
|
3196
4232
|
* Playwright project name this test belongs to
|
|
3197
|
-
* Used for filtering/grouping results by project in the reporter
|
|
3198
4233
|
*/
|
|
3199
4234
|
project?: string;
|
|
3200
4235
|
/**
|
|
3201
4236
|
* Execution time in milliseconds
|
|
3202
4237
|
*/
|
|
3203
4238
|
durationMs: number;
|
|
4239
|
+
/**
|
|
4240
|
+
* Assertion pass rate (0–1): passes divided by non-infrastructure iterations.
|
|
4241
|
+
* Only present when the case was run with `iterations > 1`.
|
|
4242
|
+
*
|
|
4243
|
+
* Infrastructure errors (network timeouts, rate limits, etc.) are excluded from
|
|
4244
|
+
* the denominator so that environment reliability does not inflate this metric.
|
|
4245
|
+
*/
|
|
4246
|
+
assertionPassRate?: number;
|
|
4247
|
+
/**
|
|
4248
|
+
* Infrastructure error rate (0–1): infra errors divided by total iterations.
|
|
4249
|
+
* Only present when the case was run with `iterations > 1`.
|
|
4250
|
+
*/
|
|
4251
|
+
infrastructureErrorRate?: number;
|
|
4252
|
+
/**
|
|
4253
|
+
* Accuracy score (0–1) across all iterations.
|
|
4254
|
+
* Alias for `assertionPassRate`. Only present when the case was run with `iterations > 1`.
|
|
4255
|
+
* @deprecated Use `assertionPassRate` for clarity; this field is kept for backward compatibility.
|
|
4256
|
+
*/
|
|
4257
|
+
accuracy?: number;
|
|
4258
|
+
/**
|
|
4259
|
+
* Per-iteration pass/fail breakdown.
|
|
4260
|
+
* Only present when the case was run with `iterations > 1`.
|
|
4261
|
+
*/
|
|
4262
|
+
iterationResults?: Array<IterationResult>;
|
|
4263
|
+
/**
|
|
4264
|
+
* Tags from the source eval case, for filtering and slicing reports.
|
|
4265
|
+
*/
|
|
4266
|
+
tags?: string[];
|
|
4267
|
+
/**
|
|
4268
|
+
* Precision of tool calls made (0–1).
|
|
4269
|
+
* 1.0 means every tool called was expected; <1.0 means unexpected tools were called.
|
|
4270
|
+
* Only populated when exclusive: true in toolsTriggered and the expectation was evaluated.
|
|
4271
|
+
*/
|
|
4272
|
+
toolPrecision?: number;
|
|
4273
|
+
/**
|
|
4274
|
+
* Recall of required tool calls (0–1).
|
|
4275
|
+
* 1.0 means all required tools were called; <1.0 means some were missed.
|
|
4276
|
+
* Only populated when toolsTriggered expectation was evaluated.
|
|
4277
|
+
*/
|
|
4278
|
+
toolRecall?: number;
|
|
4279
|
+
/**
|
|
4280
|
+
* Pass/fail status of this case in the baseline run.
|
|
4281
|
+
* Only present when a baseline was provided to runEvalDataset.
|
|
4282
|
+
*/
|
|
4283
|
+
baselinePass?: boolean;
|
|
4284
|
+
/**
|
|
4285
|
+
* Number of iterations that failed due to infrastructure errors (network, rate limits, etc.)
|
|
4286
|
+
* Only present when the case was run with `iterations > 1`.
|
|
4287
|
+
*/
|
|
4288
|
+
infrastructureErrorCount?: number;
|
|
3204
4289
|
}
|
|
4290
|
+
/**
|
|
4291
|
+
* Aggregated MCP eval run data
|
|
4292
|
+
*/
|
|
4293
|
+
interface MCPEvalRunData {
|
|
4294
|
+
/**
|
|
4295
|
+
* Run timestamp (ISO 8601)
|
|
4296
|
+
*/
|
|
4297
|
+
timestamp: string;
|
|
4298
|
+
/**
|
|
4299
|
+
* Total duration in milliseconds
|
|
4300
|
+
*/
|
|
4301
|
+
durationMs: number;
|
|
4302
|
+
/**
|
|
4303
|
+
* Environment info
|
|
4304
|
+
*/
|
|
4305
|
+
environment: {
|
|
4306
|
+
ci: boolean;
|
|
4307
|
+
node: string;
|
|
4308
|
+
platform: string;
|
|
4309
|
+
};
|
|
4310
|
+
/**
|
|
4311
|
+
* Aggregate metrics
|
|
4312
|
+
*/
|
|
4313
|
+
metrics: {
|
|
4314
|
+
/**
|
|
4315
|
+
* Total number of eval cases
|
|
4316
|
+
*/
|
|
4317
|
+
total: number;
|
|
4318
|
+
/**
|
|
4319
|
+
* Number of passed cases
|
|
4320
|
+
*/
|
|
4321
|
+
passed: number;
|
|
4322
|
+
/**
|
|
4323
|
+
* Number of failed cases
|
|
4324
|
+
*/
|
|
4325
|
+
failed: number;
|
|
4326
|
+
/**
|
|
4327
|
+
* Pass rate (0-1)
|
|
4328
|
+
*/
|
|
4329
|
+
passRate: number;
|
|
4330
|
+
/**
|
|
4331
|
+
* Dataset breakdown: dataset name -> count
|
|
4332
|
+
*/
|
|
4333
|
+
datasetBreakdown: Record<string, number>;
|
|
4334
|
+
/**
|
|
4335
|
+
* Expectation type breakdown
|
|
4336
|
+
*/
|
|
4337
|
+
expectationBreakdown: ExpectationBreakdown;
|
|
4338
|
+
};
|
|
4339
|
+
/**
|
|
4340
|
+
* All eval results from this run
|
|
4341
|
+
*/
|
|
4342
|
+
results: EvalCaseResult[];
|
|
4343
|
+
/**
|
|
4344
|
+
* Conformance check results (optional)
|
|
4345
|
+
*/
|
|
4346
|
+
conformanceChecks?: MCPConformanceResultData[];
|
|
4347
|
+
/**
|
|
4348
|
+
* Server capabilities discovered via listTools (optional)
|
|
4349
|
+
*/
|
|
4350
|
+
serverCapabilities?: MCPServerCapabilitiesData[];
|
|
4351
|
+
}
|
|
4352
|
+
/**
|
|
4353
|
+
* Historical summary for trend charts
|
|
4354
|
+
*/
|
|
4355
|
+
interface MCPEvalHistoricalSummary {
|
|
4356
|
+
timestamp: string;
|
|
4357
|
+
total: number;
|
|
4358
|
+
passed: number;
|
|
4359
|
+
failed: number;
|
|
4360
|
+
passRate: number;
|
|
4361
|
+
durationMs: number;
|
|
4362
|
+
}
|
|
4363
|
+
/**
|
|
4364
|
+
* Complete data structure passed to UI
|
|
4365
|
+
*/
|
|
4366
|
+
interface MCPEvalData {
|
|
4367
|
+
runData: MCPEvalRunData;
|
|
4368
|
+
historical: MCPEvalHistoricalSummary[];
|
|
4369
|
+
}
|
|
4370
|
+
|
|
4371
|
+
/**
|
|
4372
|
+
* Context passed to the eval runner
|
|
4373
|
+
*/
|
|
4374
|
+
interface EvalContext {
|
|
4375
|
+
/**
|
|
4376
|
+
* MCP fixture API for interacting with the server
|
|
4377
|
+
*/
|
|
4378
|
+
mcp: MCPFixtureApi;
|
|
4379
|
+
/**
|
|
4380
|
+
* Optional Playwright TestInfo for reporter integration
|
|
4381
|
+
* When provided, eval results will be attached to the test for the MCP reporter
|
|
4382
|
+
*/
|
|
4383
|
+
testInfo?: TestInfo;
|
|
4384
|
+
/**
|
|
4385
|
+
* Optional Playwright expect function for snapshot testing
|
|
4386
|
+
* Required for snapshot expectations to work properly
|
|
4387
|
+
*/
|
|
4388
|
+
expect?: Expect;
|
|
4389
|
+
}
|
|
4390
|
+
|
|
3205
4391
|
/**
|
|
3206
4392
|
* Overall result of running an eval dataset
|
|
3207
4393
|
*/
|
|
@@ -3221,11 +4407,48 @@ interface EvalRunnerResult {
|
|
|
3221
4407
|
/**
|
|
3222
4408
|
* Individual case results
|
|
3223
4409
|
*/
|
|
3224
|
-
caseResults: Array<EvalCaseResult
|
|
4410
|
+
caseResults: Array<EvalCaseResult>;
|
|
3225
4411
|
/**
|
|
3226
4412
|
* Overall execution time in milliseconds
|
|
3227
4413
|
*/
|
|
3228
4414
|
durationMs: number;
|
|
4415
|
+
/**
|
|
4416
|
+
* Difference between current pass rate and baseline pass rate.
|
|
4417
|
+
* Positive = improvement, negative = regression.
|
|
4418
|
+
* Only present when `baselineResultsFrom` was provided.
|
|
4419
|
+
*/
|
|
4420
|
+
deltaPassRate?: number;
|
|
4421
|
+
/**
|
|
4422
|
+
* Number of cases that regressed: passed in baseline, failed now.
|
|
4423
|
+
* Only present when `baselineResultsFrom` was provided.
|
|
4424
|
+
*/
|
|
4425
|
+
regressions?: number;
|
|
4426
|
+
/**
|
|
4427
|
+
* Number of cases that improved: failed in baseline, passed now.
|
|
4428
|
+
* Only present when `baselineResultsFrom` was provided.
|
|
4429
|
+
*/
|
|
4430
|
+
improvements?: number;
|
|
4431
|
+
/**
|
|
4432
|
+
* Average tool precision across all llm_host cases that have a
|
|
4433
|
+
* `toolsTriggered` expectation (precision = fraction of called tools
|
|
4434
|
+
* that were expected). Only present when at least one such case ran.
|
|
4435
|
+
*/
|
|
4436
|
+
datasetToolPrecision?: number;
|
|
4437
|
+
/**
|
|
4438
|
+
* Average tool recall across all llm_host cases that have a
|
|
4439
|
+
* `toolsTriggered` expectation (recall = fraction of required tools
|
|
4440
|
+
* that were actually called). Only present when at least one such case ran.
|
|
4441
|
+
*/
|
|
4442
|
+
datasetToolRecall?: number;
|
|
4443
|
+
/**
|
|
4444
|
+
* Harmonic mean of `datasetToolPrecision` and `datasetToolRecall`.
|
|
4445
|
+
* Only present when at least one case contributes precision/recall data.
|
|
4446
|
+
*/
|
|
4447
|
+
datasetToolF1?: number;
|
|
4448
|
+
/**
|
|
4449
|
+
* Experiment tracking metadata captured at run time.
|
|
4450
|
+
*/
|
|
4451
|
+
metadata?: EvalRunMetadata;
|
|
3229
4452
|
}
|
|
3230
4453
|
/**
|
|
3231
4454
|
* Options for running eval dataset
|
|
@@ -3251,12 +4474,6 @@ interface EvalRunnerOptions {
|
|
|
3251
4474
|
* ```
|
|
3252
4475
|
*/
|
|
3253
4476
|
schemas?: Record<string, ZodType>;
|
|
3254
|
-
/**
|
|
3255
|
-
* Judge configuration registry by ID
|
|
3256
|
-
*
|
|
3257
|
-
* Maps config IDs to JudgeConfig for use with expect.passesJudge.configId
|
|
3258
|
-
*/
|
|
3259
|
-
judgeConfigs?: Record<string, JudgeConfig>;
|
|
3260
4477
|
/**
|
|
3261
4478
|
* Whether to stop on first failure
|
|
3262
4479
|
* @default false
|
|
@@ -3265,7 +4482,71 @@ interface EvalRunnerOptions {
|
|
|
3265
4482
|
/**
|
|
3266
4483
|
* Optional callback called after each case
|
|
3267
4484
|
*/
|
|
3268
|
-
onCaseComplete?: (result: EvalCaseResult
|
|
4485
|
+
onCaseComplete?: (result: EvalCaseResult) => void | Promise<void>;
|
|
4486
|
+
/**
|
|
4487
|
+
* Maximum number of eval cases to run concurrently.
|
|
4488
|
+
* When > 1, cases run in parallel (ignores stopOnFailure ordering).
|
|
4489
|
+
* @default 1 (sequential)
|
|
4490
|
+
*/
|
|
4491
|
+
concurrency?: number;
|
|
4492
|
+
/**
|
|
4493
|
+
* Default iteration count for `llm_host` mode cases that do not specify
|
|
4494
|
+
* `iterations` explicitly. Has no effect on `direct` mode cases (which are
|
|
4495
|
+
* deterministic and always default to 1 iteration).
|
|
4496
|
+
*
|
|
4497
|
+
* Set to 10 for standard runs or 20 for release gates. Individual cases can
|
|
4498
|
+
* still override this with their own `iterations` field.
|
|
4499
|
+
*
|
|
4500
|
+
* @default 1 (preserves historical behaviour when not set)
|
|
4501
|
+
*
|
|
4502
|
+
* @example
|
|
4503
|
+
* ```typescript
|
|
4504
|
+
* // Run all llm_host cases 10 times each by default
|
|
4505
|
+
* await runEvalDataset({ dataset, defaultLlmIterations: 10 }, { mcp });
|
|
4506
|
+
* ```
|
|
4507
|
+
*/
|
|
4508
|
+
defaultLlmIterations?: number;
|
|
4509
|
+
/**
|
|
4510
|
+
* Default number of judge evaluations for cases that do not specify
|
|
4511
|
+
* `judgeReps` explicitly. Applies to any case with a `passesJudge`
|
|
4512
|
+
* expectation. Per-case `judgeReps` overrides this.
|
|
4513
|
+
*
|
|
4514
|
+
* @default 1 (single judge run)
|
|
4515
|
+
*/
|
|
4516
|
+
defaultJudgeReps?: number;
|
|
4517
|
+
/**
|
|
4518
|
+
* When set, only eval cases whose `tags` array contains at least one of
|
|
4519
|
+
* the specified tags are run. Cases without a `tags` field are excluded.
|
|
4520
|
+
* When undefined or empty, all cases run (default behavior).
|
|
4521
|
+
*/
|
|
4522
|
+
filterTags?: string[];
|
|
4523
|
+
/**
|
|
4524
|
+
* If set, saves the run results to this file path after completion.
|
|
4525
|
+
* Use with `baselineResultsFrom` on the next run for regression detection.
|
|
4526
|
+
*
|
|
4527
|
+
* @example '.mcp-test-results/baseline.json'
|
|
4528
|
+
*/
|
|
4529
|
+
saveResultsTo?: string;
|
|
4530
|
+
/**
|
|
4531
|
+
* If set, loads this file as the baseline and computes delta metrics vs the current run.
|
|
4532
|
+
* Populates `EvalRunnerResult.deltaPassRate`, `.regressions`, `.improvements`,
|
|
4533
|
+
* and tags each `EvalCaseResult.baselinePass`.
|
|
4534
|
+
*/
|
|
4535
|
+
baselineResultsFrom?: string;
|
|
4536
|
+
/**
|
|
4537
|
+
* LLM host model identifier to record in run metadata.
|
|
4538
|
+
* Use this to identify which model was used when running llm_host cases.
|
|
4539
|
+
*
|
|
4540
|
+
* @example 'claude-opus-4-20250514'
|
|
4541
|
+
*/
|
|
4542
|
+
llmHostModel?: string;
|
|
4543
|
+
/**
|
|
4544
|
+
* Judge model identifier to record in run metadata.
|
|
4545
|
+
* Use this to identify which model was used for judge evaluations.
|
|
4546
|
+
*
|
|
4547
|
+
* @example 'claude-sonnet-4-20250514'
|
|
4548
|
+
*/
|
|
4549
|
+
judgeModel?: string;
|
|
3269
4550
|
}
|
|
3270
4551
|
/**
|
|
3271
4552
|
* Options for running a single eval case
|
|
@@ -3279,17 +4560,14 @@ interface EvalCaseOptions {
|
|
|
3279
4560
|
* Schema registry for schema validation by name
|
|
3280
4561
|
*/
|
|
3281
4562
|
schemas?: Record<string, ZodType>;
|
|
3282
|
-
/**
|
|
3283
|
-
* Judge configuration registry by ID
|
|
3284
|
-
*/
|
|
3285
|
-
judgeConfigs?: Record<string, JudgeConfig>;
|
|
3286
4563
|
}
|
|
3287
4564
|
/**
|
|
3288
|
-
* Runs a single eval case and returns the result
|
|
4565
|
+
* Runs a single eval case and returns the result.
|
|
4566
|
+
* When `evalCase.iterations > 1`, runs the case N times and returns accuracy.
|
|
3289
4567
|
*
|
|
3290
4568
|
* @param evalCase - The eval case to run
|
|
3291
4569
|
* @param context - Context containing mcp, testInfo, expect
|
|
3292
|
-
* @param options - Optional configuration (datasetName, schemas
|
|
4570
|
+
* @param options - Optional configuration (datasetName, schemas)
|
|
3293
4571
|
* @returns The result of running the eval case
|
|
3294
4572
|
*
|
|
3295
4573
|
* @example
|
|
@@ -3303,131 +4581,166 @@ interface EvalCaseOptions {
|
|
|
3303
4581
|
* expect(result.pass).toBe(true);
|
|
3304
4582
|
* ```
|
|
3305
4583
|
*/
|
|
3306
|
-
declare function runEvalCase(evalCase: EvalCase, context: EvalContext, options?: EvalCaseOptions): Promise<EvalCaseResult
|
|
4584
|
+
declare function runEvalCase(evalCase: EvalCase, context: EvalContext, options?: EvalCaseOptions): Promise<EvalCaseResult>;
|
|
4585
|
+
declare function runEvalDataset(options: EvalRunnerOptions, context: EvalContext): Promise<EvalRunnerResult>;
|
|
4586
|
+
|
|
3307
4587
|
/**
|
|
3308
|
-
*
|
|
4588
|
+
* Saves eval results to a JSON file for use as a baseline in future runs.
|
|
3309
4589
|
*
|
|
3310
|
-
*
|
|
3311
|
-
*
|
|
4590
|
+
* @param result - The eval run result to save
|
|
4591
|
+
* @param filePath - Path to write the JSON file (parent dirs created automatically)
|
|
4592
|
+
*/
|
|
4593
|
+
declare function saveBaseline(result: EvalRunnerResult, filePath: string): Promise<void>;
|
|
4594
|
+
/**
|
|
4595
|
+
* Loads a previously saved baseline from a JSON file.
|
|
3312
4596
|
*
|
|
3313
|
-
* @param
|
|
3314
|
-
* @
|
|
3315
|
-
* @
|
|
4597
|
+
* @param filePath - Path to the JSON file written by saveBaseline
|
|
4598
|
+
* @returns The saved EvalRunnerResult
|
|
4599
|
+
* @throws If the file cannot be read or parsed
|
|
4600
|
+
*/
|
|
4601
|
+
declare function loadBaseline(filePath: string): Promise<EvalRunnerResult>;
|
|
4602
|
+
|
|
4603
|
+
/** Outcome of comparing two servers on a single eval case. */
|
|
4604
|
+
type ComparisonOutcome = 'A_WINS' | 'B_WINS' | 'TIE' | 'BOTH_FAIL';
|
|
4605
|
+
/** Result of comparing a single eval case across two servers. */
|
|
4606
|
+
interface CaseComparisonResult {
|
|
4607
|
+
/** Case ID */
|
|
4608
|
+
id: string;
|
|
4609
|
+
/** Comparison outcome */
|
|
4610
|
+
outcome: ComparisonOutcome;
|
|
4611
|
+
/** Result from server A */
|
|
4612
|
+
serverA: EvalCaseResult;
|
|
4613
|
+
/** Result from server B */
|
|
4614
|
+
serverB: EvalCaseResult;
|
|
4615
|
+
}
|
|
4616
|
+
/** Aggregated result of running a dataset against two servers. */
|
|
4617
|
+
interface ServerComparisonResult {
|
|
4618
|
+
/** Dataset name */
|
|
4619
|
+
dataset: string;
|
|
4620
|
+
/** Total cases compared (cases present in both runs) */
|
|
4621
|
+
total: number;
|
|
4622
|
+
/** Cases where server A passed and server B failed */
|
|
4623
|
+
aWins: number;
|
|
4624
|
+
/** Cases where server B passed and server A failed */
|
|
4625
|
+
bWins: number;
|
|
4626
|
+
/** Cases where both passed */
|
|
4627
|
+
ties: number;
|
|
4628
|
+
/** Cases where both failed */
|
|
4629
|
+
bothFail: number;
|
|
4630
|
+
/** Raw count of cases where both servers failed (same as bothFail) */
|
|
4631
|
+
bothFailCount: number;
|
|
4632
|
+
/** Cases with a decisive outcome (aWins + bWins + ties, excludes BOTH_FAIL) */
|
|
4633
|
+
decidedCases: number;
|
|
4634
|
+
/** Fraction of total cases where both servers failed (bothFail / total) */
|
|
4635
|
+
failureAlignment: number;
|
|
4636
|
+
/** A win rate (aWins / decidedCases, excludes BOTH_FAIL) */
|
|
4637
|
+
aWinRate: number;
|
|
4638
|
+
/** B win rate (bWins / decidedCases, excludes BOTH_FAIL) */
|
|
4639
|
+
bWinRate: number;
|
|
4640
|
+
/** Tie rate (ties / decidedCases, excludes BOTH_FAIL) */
|
|
4641
|
+
tieRate: number;
|
|
4642
|
+
/** Per-case comparison results */
|
|
4643
|
+
cases: CaseComparisonResult[];
|
|
4644
|
+
/** Full result from server A */
|
|
4645
|
+
serverAResult: EvalRunnerResult;
|
|
4646
|
+
/** Full result from server B */
|
|
4647
|
+
serverBResult: EvalRunnerResult;
|
|
4648
|
+
/** Total duration in milliseconds */
|
|
4649
|
+
durationMs: number;
|
|
4650
|
+
}
|
|
4651
|
+
/**
|
|
4652
|
+
* Options for `runServerComparison`.
|
|
4653
|
+
* Same as `EvalRunnerOptions` without baseline-specific fields.
|
|
4654
|
+
*/
|
|
4655
|
+
type ServerComparisonOptions = Omit<EvalRunnerOptions, 'saveResultsTo' | 'baselineResultsFrom'>;
|
|
4656
|
+
/**
|
|
4657
|
+
* Runs the same eval dataset against two MCP servers in parallel and
|
|
4658
|
+
* returns a detailed per-case comparison of results.
|
|
3316
4659
|
*
|
|
3317
|
-
*
|
|
3318
|
-
*
|
|
3319
|
-
*
|
|
3320
|
-
*
|
|
3321
|
-
*
|
|
3322
|
-
*
|
|
3323
|
-
*
|
|
3324
|
-
* { mcp }
|
|
3325
|
-
* );
|
|
4660
|
+
* Both servers receive identical cases and options. The comparison uses
|
|
4661
|
+
* simple pass/fail per case: A_WINS means A passed and B failed, etc.
|
|
4662
|
+
*
|
|
4663
|
+
* @param options - Eval dataset and runner options (shared between both servers)
|
|
4664
|
+
* @param contextA - MCP context for server A (e.g., Glean MCP)
|
|
4665
|
+
* @param contextB - MCP context for server B (e.g., native MCP)
|
|
4666
|
+
* @returns Comparison result with per-case outcomes and aggregate win rates
|
|
3326
4667
|
*
|
|
3327
4668
|
* @example
|
|
3328
|
-
*
|
|
3329
|
-
*
|
|
3330
|
-
*
|
|
3331
|
-
*
|
|
3332
|
-
*
|
|
3333
|
-
*
|
|
3334
|
-
* });
|
|
4669
|
+
* ```typescript
|
|
4670
|
+
* const comparison = await runServerComparison(
|
|
4671
|
+
* { dataset },
|
|
4672
|
+
* { mcp: gleanMcpFixture },
|
|
4673
|
+
* { mcp: nativeMcpFixture }
|
|
4674
|
+
* );
|
|
4675
|
+
* console.log(`Glean MCP wins: ${(comparison.aWinRate * 100).toFixed(1)}%`);
|
|
4676
|
+
* console.log(`Native MCP wins: ${(comparison.bWinRate * 100).toFixed(1)}%`);
|
|
4677
|
+
* ```
|
|
3335
4678
|
*/
|
|
3336
|
-
declare function
|
|
4679
|
+
declare function runServerComparison(options: ServerComparisonOptions, contextA: EvalContext, contextB: EvalContext): Promise<ServerComparisonResult>;
|
|
3337
4680
|
|
|
3338
4681
|
/**
|
|
3339
4682
|
* LLM Host Simulation - Main entry point
|
|
3340
4683
|
*
|
|
3341
|
-
*
|
|
3342
|
-
*
|
|
4684
|
+
* All providers (openai, anthropic, google, azure, mistral, ollama, deepseek,
|
|
4685
|
+
* openrouter, xai) run through the Vercel AI SDK orchestrator, which uses
|
|
4686
|
+
* generateText + stopWhen for a uniform multi-turn tool-calling loop with
|
|
4687
|
+
* built-in latency decomposition.
|
|
4688
|
+
*
|
|
4689
|
+
* Required packages per provider:
|
|
4690
|
+
* openai → npm install ai @ai-sdk/openai
|
|
4691
|
+
* anthropic → npm install ai @ai-sdk/anthropic
|
|
4692
|
+
* google → npm install ai @ai-sdk/google
|
|
4693
|
+
* azure → npm install ai @ai-sdk/azure
|
|
4694
|
+
* mistral → npm install ai @ai-sdk/mistral
|
|
4695
|
+
* ollama → npm install ai @ai-sdk/ollama (local, no API key)
|
|
4696
|
+
* deepseek → npm install ai @ai-sdk/deepseek
|
|
4697
|
+
* openrouter → npm install ai @openrouter/ai-sdk-provider
|
|
4698
|
+
* xai → npm install ai @ai-sdk/xai
|
|
3343
4699
|
*/
|
|
3344
4700
|
|
|
3345
4701
|
/**
|
|
3346
|
-
* Simulates an LLM host interacting with an MCP server
|
|
4702
|
+
* Simulates an LLM host interacting with an MCP server.
|
|
4703
|
+
*
|
|
4704
|
+
* The LLM chooses which tools to call based solely on their descriptions and
|
|
4705
|
+
* schemas, testing discoverability and parameter clarity at the level a real
|
|
4706
|
+
* user (via Claude Desktop, ChatGPT, etc.) would experience.
|
|
3347
4707
|
*
|
|
3348
|
-
*
|
|
3349
|
-
*
|
|
3350
|
-
*
|
|
3351
|
-
* parameter clarity.
|
|
4708
|
+
* All providers run through the Vercel AI SDK's generateText with maxSteps,
|
|
4709
|
+
* which handles multi-turn tool calling natively and provides per-step latency
|
|
4710
|
+
* decomposition (llmDurationMs vs. mcpDurationMs).
|
|
3352
4711
|
*
|
|
3353
4712
|
* @param mcp - MCP fixture API
|
|
3354
|
-
* @param scenario - Natural language prompt describing what
|
|
3355
|
-
* @param config - LLM host configuration
|
|
3356
|
-
* @returns Simulation result with tool calls and
|
|
4713
|
+
* @param scenario - Natural language prompt describing what the LLM should do
|
|
4714
|
+
* @param config - LLM host configuration (provider, model, temperature, etc.)
|
|
4715
|
+
* @returns Simulation result with tool calls, final response, and latency data
|
|
3357
4716
|
*
|
|
3358
4717
|
* @example
|
|
3359
4718
|
* ```typescript
|
|
3360
4719
|
* const result = await simulateLLMHost(mcp,
|
|
3361
|
-
* "
|
|
3362
|
-
* {
|
|
3363
|
-
* provider: 'openai',
|
|
3364
|
-
* model: 'gpt-4o'
|
|
3365
|
-
* }
|
|
4720
|
+
* "Find recent documents about MCP testing frameworks",
|
|
4721
|
+
* { provider: 'anthropic', model: 'claude-3-5-sonnet-20241022' }
|
|
3366
4722
|
* );
|
|
3367
4723
|
*
|
|
3368
4724
|
* expect(result.success).toBe(true);
|
|
3369
|
-
* expect(result.toolCalls).
|
|
3370
|
-
* name: 'get_weather',
|
|
3371
|
-
* arguments: { city: 'London' }
|
|
3372
|
-
* });
|
|
4725
|
+
* expect(result.toolCalls.map(c => c.name)).toContain('search');
|
|
3373
4726
|
* ```
|
|
3374
4727
|
*/
|
|
3375
4728
|
declare function simulateLLMHost(mcp: MCPFixtureApi, scenario: string, config: LLMHostConfig): Promise<LLMHostSimulationResult>;
|
|
3376
4729
|
/**
|
|
3377
|
-
*
|
|
4730
|
+
* Returns true if the given provider is supported.
|
|
3378
4731
|
*
|
|
3379
|
-
*
|
|
3380
|
-
*
|
|
3381
|
-
*
|
|
3382
|
-
* @param provider - LLM provider to check
|
|
3383
|
-
* @returns true if an adapter is registered for the provider
|
|
4732
|
+
* Note: this does not check whether the required @ai-sdk/* package is
|
|
4733
|
+
* installed — that is validated at simulation time with a helpful error.
|
|
3384
4734
|
*/
|
|
3385
4735
|
declare function isProviderAvailable(provider: LLMProvider): boolean;
|
|
3386
4736
|
/**
|
|
3387
|
-
*
|
|
4737
|
+
* Returns a human-readable installation message for a given provider.
|
|
3388
4738
|
*
|
|
3389
|
-
* @
|
|
3390
|
-
* @
|
|
4739
|
+
* @remarks This is a diagnostic utility for checking whether optional
|
|
4740
|
+
* @ai-sdk/* packages are installed. Not part of the primary usage path.
|
|
3391
4741
|
*/
|
|
3392
4742
|
declare function getMissingDependencyMessage(provider: LLMProvider): string;
|
|
3393
4743
|
|
|
3394
|
-
/**
|
|
3395
|
-
* Tool call validator for LLM host mode
|
|
3396
|
-
*
|
|
3397
|
-
* Validates that the LLM made the expected tool calls with correct arguments
|
|
3398
|
-
*/
|
|
3399
|
-
|
|
3400
|
-
/**
|
|
3401
|
-
* Tool call validation function signature
|
|
3402
|
-
*/
|
|
3403
|
-
type ToolCallValidator = (evalCase: EvalCase, response: unknown) => Promise<EvalExpectationResult>;
|
|
3404
|
-
/**
|
|
3405
|
-
* Creates a tool call validator for LLM host mode
|
|
3406
|
-
*
|
|
3407
|
-
* Validates that the LLM made the expected tool calls with correct arguments.
|
|
3408
|
-
* Supports partial argument matching and optional calls.
|
|
3409
|
-
*
|
|
3410
|
-
* @returns Validator function
|
|
3411
|
-
*
|
|
3412
|
-
* @example
|
|
3413
|
-
* ```typescript
|
|
3414
|
-
* // In your eval case:
|
|
3415
|
-
* {
|
|
3416
|
-
* "id": "weather-london",
|
|
3417
|
-
* "mode": "llm_host",
|
|
3418
|
-
* "scenario": "Get the weather for London",
|
|
3419
|
-
* "expectedToolCalls": [
|
|
3420
|
-
* {
|
|
3421
|
-
* "name": "get_weather",
|
|
3422
|
-
* "arguments": { "city": "London" },
|
|
3423
|
-
* "required": true
|
|
3424
|
-
* }
|
|
3425
|
-
* ]
|
|
3426
|
-
* }
|
|
3427
|
-
* ```
|
|
3428
|
-
*/
|
|
3429
|
-
declare function createToolCallValidator(): ToolCallValidator;
|
|
3430
|
-
|
|
3431
4744
|
/**
|
|
3432
4745
|
* Creates an LLM judge for evaluating tool responses
|
|
3433
4746
|
*
|
|
@@ -3494,7 +4807,7 @@ interface MCPConformanceOptions {
|
|
|
3494
4807
|
/**
|
|
3495
4808
|
* Individual check result
|
|
3496
4809
|
*/
|
|
3497
|
-
interface MCPConformanceCheck
|
|
4810
|
+
interface MCPConformanceCheck {
|
|
3498
4811
|
name: string;
|
|
3499
4812
|
pass: boolean;
|
|
3500
4813
|
message: string;
|
|
@@ -3539,7 +4852,7 @@ interface MCPConformanceResult {
|
|
|
3539
4852
|
/**
|
|
3540
4853
|
* List of check results
|
|
3541
4854
|
*/
|
|
3542
|
-
checks: MCPConformanceCheck
|
|
4855
|
+
checks: MCPConformanceCheck[];
|
|
3543
4856
|
/**
|
|
3544
4857
|
* Raw MCP responses for snapshotting
|
|
3545
4858
|
*
|
|
@@ -3588,229 +4901,6 @@ interface MCPConformanceResult {
|
|
|
3588
4901
|
*/
|
|
3589
4902
|
declare function runConformanceChecks(mcp: MCPFixtureApi, options?: MCPConformanceOptions, testInfo?: TestInfo): Promise<MCPConformanceResult>;
|
|
3590
4903
|
|
|
3591
|
-
/**
|
|
3592
|
-
* Reporter-specific type definitions
|
|
3593
|
-
*
|
|
3594
|
-
* These types are used by the MCP reporter and UI.
|
|
3595
|
-
*
|
|
3596
|
-
* @packageDocumentation
|
|
3597
|
-
*/
|
|
3598
|
-
|
|
3599
|
-
/**
|
|
3600
|
-
* Individual conformance check result
|
|
3601
|
-
*/
|
|
3602
|
-
interface MCPConformanceCheck {
|
|
3603
|
-
/**
|
|
3604
|
-
* Check name (e.g., 'server_info_present', 'list_tools_succeeds')
|
|
3605
|
-
*/
|
|
3606
|
-
name: string;
|
|
3607
|
-
/**
|
|
3608
|
-
* Whether the check passed
|
|
3609
|
-
*/
|
|
3610
|
-
pass: boolean;
|
|
3611
|
-
/**
|
|
3612
|
-
* Human-readable message describing the result
|
|
3613
|
-
*/
|
|
3614
|
-
message: string;
|
|
3615
|
-
}
|
|
3616
|
-
/**
|
|
3617
|
-
* Conformance check result as stored in reporter data
|
|
3618
|
-
*/
|
|
3619
|
-
interface MCPConformanceResultData {
|
|
3620
|
-
/**
|
|
3621
|
-
* Test title where conformance check was run
|
|
3622
|
-
*/
|
|
3623
|
-
testTitle: string;
|
|
3624
|
-
/**
|
|
3625
|
-
* Whether all checks passed
|
|
3626
|
-
*/
|
|
3627
|
-
pass: boolean;
|
|
3628
|
-
/**
|
|
3629
|
-
* Individual check results
|
|
3630
|
-
*/
|
|
3631
|
-
checks: MCPConformanceCheck[];
|
|
3632
|
-
/**
|
|
3633
|
-
* Server info if available
|
|
3634
|
-
*/
|
|
3635
|
-
serverInfo?: {
|
|
3636
|
-
name?: string;
|
|
3637
|
-
version?: string;
|
|
3638
|
-
};
|
|
3639
|
-
/**
|
|
3640
|
-
* Number of tools discovered
|
|
3641
|
-
*/
|
|
3642
|
-
toolCount: number;
|
|
3643
|
-
/**
|
|
3644
|
-
* Auth type used for this check
|
|
3645
|
-
*/
|
|
3646
|
-
authType?: AuthType;
|
|
3647
|
-
/**
|
|
3648
|
-
* Project name
|
|
3649
|
-
*/
|
|
3650
|
-
project?: string;
|
|
3651
|
-
}
|
|
3652
|
-
/**
|
|
3653
|
-
* Server capabilities data from mcp-list-tools attachment
|
|
3654
|
-
*/
|
|
3655
|
-
interface MCPServerCapabilitiesData {
|
|
3656
|
-
/**
|
|
3657
|
-
* Test title where listTools was called
|
|
3658
|
-
*/
|
|
3659
|
-
testTitle: string;
|
|
3660
|
-
/**
|
|
3661
|
-
* List of tools available on the server
|
|
3662
|
-
*/
|
|
3663
|
-
tools: Array<{
|
|
3664
|
-
name: string;
|
|
3665
|
-
description?: string;
|
|
3666
|
-
}>;
|
|
3667
|
-
/**
|
|
3668
|
-
* Total number of tools
|
|
3669
|
-
*/
|
|
3670
|
-
toolCount: number;
|
|
3671
|
-
/**
|
|
3672
|
-
* Auth type used for this test
|
|
3673
|
-
*/
|
|
3674
|
-
authType?: AuthType;
|
|
3675
|
-
/**
|
|
3676
|
-
* Project name
|
|
3677
|
-
*/
|
|
3678
|
-
project?: string;
|
|
3679
|
-
}
|
|
3680
|
-
/**
|
|
3681
|
-
* Result of a single eval case
|
|
3682
|
-
*/
|
|
3683
|
-
interface EvalCaseResult {
|
|
3684
|
-
/**
|
|
3685
|
-
* Case ID
|
|
3686
|
-
*/
|
|
3687
|
-
id: string;
|
|
3688
|
-
/**
|
|
3689
|
-
* Dataset name this case belongs to
|
|
3690
|
-
*/
|
|
3691
|
-
datasetName: string;
|
|
3692
|
-
/**
|
|
3693
|
-
* MCP tool name that was called
|
|
3694
|
-
*/
|
|
3695
|
-
toolName: string;
|
|
3696
|
-
/**
|
|
3697
|
-
* Source of this result
|
|
3698
|
-
*/
|
|
3699
|
-
source: ResultSource;
|
|
3700
|
-
/**
|
|
3701
|
-
* Overall pass/fail status
|
|
3702
|
-
*/
|
|
3703
|
-
pass: boolean;
|
|
3704
|
-
/**
|
|
3705
|
-
* Tool response
|
|
3706
|
-
*/
|
|
3707
|
-
response?: unknown;
|
|
3708
|
-
/**
|
|
3709
|
-
* Error if tool call failed
|
|
3710
|
-
*/
|
|
3711
|
-
error?: string;
|
|
3712
|
-
/**
|
|
3713
|
-
* Expectation results
|
|
3714
|
-
*/
|
|
3715
|
-
expectations: Partial<Record<ExpectationType, EvalExpectationResult>>;
|
|
3716
|
-
/**
|
|
3717
|
-
* Authentication type used for this test
|
|
3718
|
-
*/
|
|
3719
|
-
authType?: AuthType;
|
|
3720
|
-
/**
|
|
3721
|
-
* Playwright project name this test belongs to
|
|
3722
|
-
*/
|
|
3723
|
-
project?: string;
|
|
3724
|
-
/**
|
|
3725
|
-
* Execution time in milliseconds
|
|
3726
|
-
*/
|
|
3727
|
-
durationMs: number;
|
|
3728
|
-
/**
|
|
3729
|
-
* @deprecated Mode is inferred from test context, not displayed in reports
|
|
3730
|
-
*/
|
|
3731
|
-
mode?: 'direct' | 'llm_host';
|
|
3732
|
-
}
|
|
3733
|
-
/**
|
|
3734
|
-
* Aggregated MCP eval run data
|
|
3735
|
-
*/
|
|
3736
|
-
interface MCPEvalRunData {
|
|
3737
|
-
/**
|
|
3738
|
-
* Run timestamp (ISO 8601)
|
|
3739
|
-
*/
|
|
3740
|
-
timestamp: string;
|
|
3741
|
-
/**
|
|
3742
|
-
* Total duration in milliseconds
|
|
3743
|
-
*/
|
|
3744
|
-
durationMs: number;
|
|
3745
|
-
/**
|
|
3746
|
-
* Environment info
|
|
3747
|
-
*/
|
|
3748
|
-
environment: {
|
|
3749
|
-
ci: boolean;
|
|
3750
|
-
node: string;
|
|
3751
|
-
platform: string;
|
|
3752
|
-
};
|
|
3753
|
-
/**
|
|
3754
|
-
* Aggregate metrics
|
|
3755
|
-
*/
|
|
3756
|
-
metrics: {
|
|
3757
|
-
/**
|
|
3758
|
-
* Total number of eval cases
|
|
3759
|
-
*/
|
|
3760
|
-
total: number;
|
|
3761
|
-
/**
|
|
3762
|
-
* Number of passed cases
|
|
3763
|
-
*/
|
|
3764
|
-
passed: number;
|
|
3765
|
-
/**
|
|
3766
|
-
* Number of failed cases
|
|
3767
|
-
*/
|
|
3768
|
-
failed: number;
|
|
3769
|
-
/**
|
|
3770
|
-
* Pass rate (0-1)
|
|
3771
|
-
*/
|
|
3772
|
-
passRate: number;
|
|
3773
|
-
/**
|
|
3774
|
-
* Dataset breakdown: dataset name -> count
|
|
3775
|
-
*/
|
|
3776
|
-
datasetBreakdown: Record<string, number>;
|
|
3777
|
-
/**
|
|
3778
|
-
* Expectation type breakdown
|
|
3779
|
-
*/
|
|
3780
|
-
expectationBreakdown: ExpectationBreakdown;
|
|
3781
|
-
};
|
|
3782
|
-
/**
|
|
3783
|
-
* All eval results from this run
|
|
3784
|
-
*/
|
|
3785
|
-
results: EvalCaseResult[];
|
|
3786
|
-
/**
|
|
3787
|
-
* Conformance check results (optional)
|
|
3788
|
-
*/
|
|
3789
|
-
conformanceChecks?: MCPConformanceResultData[];
|
|
3790
|
-
/**
|
|
3791
|
-
* Server capabilities discovered via listTools (optional)
|
|
3792
|
-
*/
|
|
3793
|
-
serverCapabilities?: MCPServerCapabilitiesData[];
|
|
3794
|
-
}
|
|
3795
|
-
/**
|
|
3796
|
-
* Historical summary for trend charts
|
|
3797
|
-
*/
|
|
3798
|
-
interface MCPEvalHistoricalSummary {
|
|
3799
|
-
timestamp: string;
|
|
3800
|
-
total: number;
|
|
3801
|
-
passed: number;
|
|
3802
|
-
failed: number;
|
|
3803
|
-
passRate: number;
|
|
3804
|
-
durationMs: number;
|
|
3805
|
-
}
|
|
3806
|
-
/**
|
|
3807
|
-
* Complete data structure passed to UI
|
|
3808
|
-
*/
|
|
3809
|
-
interface MCPEvalData {
|
|
3810
|
-
runData: MCPEvalRunData;
|
|
3811
|
-
historical: MCPEvalHistoricalSummary[];
|
|
3812
|
-
}
|
|
3813
|
-
|
|
3814
4904
|
/**
|
|
3815
4905
|
* Reporter types - re-exported from canonical source
|
|
3816
4906
|
*
|
|
@@ -3831,7 +4921,7 @@ interface MCPEvalReporterConfig {
|
|
|
3831
4921
|
outputDir?: string;
|
|
3832
4922
|
/**
|
|
3833
4923
|
* Auto-open report in browser after test run
|
|
3834
|
-
* @default
|
|
4924
|
+
* @default false
|
|
3835
4925
|
*/
|
|
3836
4926
|
autoOpen?: boolean;
|
|
3837
4927
|
/**
|
|
@@ -3854,4 +4944,4 @@ interface MCPEvalReporterConfig {
|
|
|
3854
4944
|
includeAutoTracking?: boolean;
|
|
3855
4945
|
}
|
|
3856
4946
|
|
|
3857
|
-
export { type AuthType, type BuiltInSanitizer, CLIOAuthClient, type CLIOAuthClientConfig, type CLIOAuthResult, type ContentBlock, type CreateMCPClientOptions, DiscoveryError, ENV_VAR_NAMES, type EvalCase, type EvalCaseResult
|
|
4947
|
+
export { type AuthType, BUILT_IN_RUBRICS, type BuiltInRubric, type BuiltInSanitizer, CLIOAuthClient, type CLIOAuthClientConfig, type CLIOAuthResult, type CaseComparisonResult, type ClientCredentialsConfig, type ComparisonOutcome, type ContentBlock, type CreateMCPClientOptions, DiscoveryError, ENV_VAR_NAMES, type EvalCase, type EvalCaseResult, EvalCaseSchema, type EvalContext, type EvalDataset, EvalDatasetSchema, type EvalExpectBlock, type EvalExpectationResult, type EvalMode, type EvalRunnerOptions, type EvalRunnerResult, type ExpectationBreakdown, type ExpectationResultMap, type ExpectationType, type FieldRemovalSanitizer, type HttpMCPConfig, type IterationResult, type Judge, type JudgeConfig, type JudgeMatcherOptions, type JudgeResult, type JudgeValidatorConfig, type LLMHostConfig, type LLMHostSimulationResult, type LLMHostSimulator, type LLMProvider, type LLMToolCall, type LoadDatasetOptions, type MCPAuthConfig, type MCPClientCredentialsConfig, type MCPConfig, MCPConfigSchema, type MCPConformanceCheck, type MCPConformanceOptions, type MCPConformanceRaw, type MCPConformanceResult, type MCPConformanceResultData, type MCPEvalData, type MCPEvalHistoricalSummary, type MCPEvalReporterConfig, type MCPEvalRunData, type MCPFixtureApi, type MCPFixtureOptions, type MCPHostCapabilities, type MCPOAuthConfig, type MCPServerCapabilitiesData, MCP_PROTOCOL_VERSION, type NormalizedToolResponse, type OAuthSetupConfig, type PatternValidatorOptions, PlaywrightOAuthClientProvider, type PlaywrightOAuthClientProviderConfig, type PredicateResult, type ProtectedResourceDiscoveryResult, type ProtectedResourceMetadata, type ProviderKind, type RegexSanitizer, type ResultSource, type RubricSpec, type SchemaRegistry, type SchemaValidatorOptions, type SerializedEvalDataset, type ServerComparisonOptions, type ServerComparisonResult, type SizeValidatorOptions, type SnapshotSanitizer, SnapshotSanitizers, type StdioMCPConfig, type StoredClientInfo, type StoredOAuthState, type StoredServerMetadata, 
type StoredTokens, type TextValidatorOptions, type TokenResult, type ToolCallCountOptions, type ToolCallExpectation, type ToolPredicate, type UsageMetrics, type ValidationResult, closeMCPClient, createJudge, createMCPClientForConfig, createMCPFixture, createTokenAuthHeaders, discoverAuthorizationServer, discoverProtectedResource, expect, extractText, getMissingDependencyMessage, getResponseSizeBytes, hasValidTokens, injectTokens, isBuiltInRubric, isHttpConfig, isProviderAvailable, isStdioConfig, isTokenExpired, isTokenExpiringSoon, loadBaseline, loadEvalDataset, loadEvalDatasetFromObject, loadTokens, loadTokensFromEnv, normalizeToolResponse, normalizeWhitespace, performClientCredentialsFlow, performOAuthSetup, performOAuthSetupIfNeeded, resolveRubric, runConformanceChecks, runEvalCase, runEvalDataset, runServerComparison, saveBaseline, simulateLLMHost, test, validateAccessToken, validateError, validateEvalCase, validateEvalDataset, validateJudge, validateMCPConfig, validatePattern, validateResponse, validateSchema, validateSize, validateText, validateToolCallCount, validateToolCalls };
|