@gleanwork/mcp-server-tester 0.12.0 → 1.0.0-beta.1
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/README.md +120 -337
- package/dist/cli/index.js +468 -176
- package/dist/fixtures/mcp.d.ts +121 -44
- package/dist/fixtures/mcp.js +988 -248
- package/dist/fixtures/mcp.js.map +1 -1
- package/dist/fixtures/mcpAuth.js +6 -2
- package/dist/fixtures/mcpAuth.js.map +1 -1
- package/dist/index.cjs +5034 -1284
- package/dist/index.cjs.map +1 -1
- package/dist/index.d.cts +1697 -575
- package/dist/index.d.ts +1697 -575
- package/dist/index.js +5020 -1280
- package/dist/index.js.map +1 -1
- package/dist/reporters/mcpReporter.cjs +35 -16
- package/dist/reporters/mcpReporter.cjs.map +1 -1
- package/dist/reporters/mcpReporter.d.cts +8 -3
- package/dist/reporters/mcpReporter.d.ts +8 -3
- package/dist/reporters/mcpReporter.js +36 -17
- package/dist/reporters/mcpReporter.js.map +1 -1
- package/dist/reporters/ui-dist/app.js +5 -5
- package/dist/reporters/ui-dist/styles.css +1 -1
- package/package.json +64 -8
- package/src/reporters/ui-dist/app.js +5 -5
- package/src/reporters/ui-dist/styles.css +1 -1
package/dist/index.d.cts
CHANGED
|
@@ -42,6 +42,28 @@ interface MCPOAuthConfig {
|
|
|
42
42
|
*/
|
|
43
43
|
redirectUri?: string;
|
|
44
44
|
}
|
|
45
|
+
/**
|
|
46
|
+
* OAuth 2.1 client credentials configuration for machine-to-machine (CI/CD) authentication.
|
|
47
|
+
* Credentials can be provided here or via MCP_CLIENT_ID/MCP_CLIENT_SECRET environment variables.
|
|
48
|
+
*/
|
|
49
|
+
interface MCPClientCredentialsConfig {
|
|
50
|
+
/**
|
|
51
|
+
* OAuth client ID (falls back to MCP_CLIENT_ID env var)
|
|
52
|
+
*/
|
|
53
|
+
clientId?: string;
|
|
54
|
+
/**
|
|
55
|
+
* OAuth client secret (falls back to MCP_CLIENT_SECRET env var)
|
|
56
|
+
*/
|
|
57
|
+
clientSecret?: string;
|
|
58
|
+
/**
|
|
59
|
+
* Token endpoint URL (required for the client credentials grant, even though this property is declared optional)
|
|
60
|
+
*/
|
|
61
|
+
tokenEndpoint?: string;
|
|
62
|
+
/**
|
|
63
|
+
* Scopes to request
|
|
64
|
+
*/
|
|
65
|
+
scopes?: string[];
|
|
66
|
+
}
|
|
45
67
|
/**
|
|
46
68
|
* Authentication configuration for MCP connections
|
|
47
69
|
*/
|
|
@@ -54,6 +76,10 @@ interface MCPAuthConfig {
|
|
|
54
76
|
* Full OAuth configuration for browser-based authentication
|
|
55
77
|
*/
|
|
56
78
|
oauth?: MCPOAuthConfig;
|
|
79
|
+
/**
|
|
80
|
+
* OAuth 2.1 client credentials grant for machine-to-machine authentication
|
|
81
|
+
*/
|
|
82
|
+
clientCredentials?: MCPClientCredentialsConfig;
|
|
57
83
|
}
|
|
58
84
|
/**
|
|
59
85
|
* MCP host capabilities that can be registered with the server
|
|
@@ -74,35 +100,35 @@ interface MCPHostCapabilities {
|
|
|
74
100
|
};
|
|
75
101
|
}
|
|
76
102
|
/**
|
|
77
|
-
* Configuration for MCP client connection
|
|
78
|
-
*
|
|
79
|
-
* Supports both stdio (local) and HTTP (remote) transports
|
|
103
|
+
* Configuration for MCP client connection via stdio transport (local process)
|
|
80
104
|
*/
|
|
81
|
-
interface
|
|
105
|
+
interface StdioMCPConfig {
|
|
82
106
|
/**
|
|
83
|
-
* Transport type
|
|
107
|
+
* Transport type discriminant
|
|
84
108
|
*/
|
|
85
|
-
transport: '
|
|
109
|
+
transport: 'stdio';
|
|
86
110
|
/**
|
|
87
|
-
*
|
|
111
|
+
* Command to execute (required for stdio transport)
|
|
88
112
|
*/
|
|
89
|
-
|
|
113
|
+
command: string;
|
|
90
114
|
/**
|
|
91
|
-
*
|
|
115
|
+
* Command arguments
|
|
92
116
|
*/
|
|
93
|
-
|
|
117
|
+
args?: Array<string>;
|
|
94
118
|
/**
|
|
95
|
-
*
|
|
119
|
+
* Working directory for the command
|
|
96
120
|
*/
|
|
97
|
-
|
|
121
|
+
cwd?: string;
|
|
98
122
|
/**
|
|
99
|
-
*
|
|
123
|
+
* Environment variables to pass to the subprocess.
|
|
124
|
+
* Merged with the current process environment.
|
|
100
125
|
*/
|
|
101
|
-
|
|
126
|
+
env?: Record<string, string>;
|
|
102
127
|
/**
|
|
103
|
-
*
|
|
128
|
+
* Suppress stderr output from the server process.
|
|
129
|
+
* When true, server stderr is ignored instead of inherited.
|
|
104
130
|
*/
|
|
105
|
-
|
|
131
|
+
quiet?: boolean;
|
|
106
132
|
/**
|
|
107
133
|
* Host capabilities to register with the server
|
|
108
134
|
*/
|
|
@@ -116,15 +142,94 @@ interface MCPConfig {
|
|
|
116
142
|
*/
|
|
117
143
|
requestTimeoutMs?: number;
|
|
118
144
|
/**
|
|
119
|
-
*
|
|
120
|
-
* When true, server stderr is ignored instead of inherited
|
|
145
|
+
* Timeout in milliseconds for MCP tool/list operations. Default: 30000
|
|
121
146
|
*/
|
|
122
|
-
|
|
147
|
+
callTimeoutMs?: number;
|
|
148
|
+
}
|
|
149
|
+
/**
|
|
150
|
+
* Configuration for MCP client connection via HTTP transport (remote server)
|
|
151
|
+
*/
|
|
152
|
+
interface HttpMCPConfig {
|
|
153
|
+
/**
|
|
154
|
+
* Transport type discriminant
|
|
155
|
+
*/
|
|
156
|
+
transport: 'http';
|
|
123
157
|
/**
|
|
124
|
-
*
|
|
158
|
+
* Server URL (required for http transport)
|
|
159
|
+
*/
|
|
160
|
+
serverUrl: string;
|
|
161
|
+
/**
|
|
162
|
+
* HTTP headers (e.g., Authorization)
|
|
163
|
+
*/
|
|
164
|
+
headers?: Record<string, string>;
|
|
165
|
+
/**
|
|
166
|
+
* Authentication configuration
|
|
125
167
|
*/
|
|
126
168
|
auth?: MCPAuthConfig;
|
|
169
|
+
/**
|
|
170
|
+
* Host capabilities to register with the server
|
|
171
|
+
*/
|
|
172
|
+
capabilities?: MCPHostCapabilities;
|
|
173
|
+
/**
|
|
174
|
+
* Connection timeout in milliseconds
|
|
175
|
+
*/
|
|
176
|
+
connectTimeoutMs?: number;
|
|
177
|
+
/**
|
|
178
|
+
* Request timeout in milliseconds
|
|
179
|
+
*/
|
|
180
|
+
requestTimeoutMs?: number;
|
|
181
|
+
/**
|
|
182
|
+
* Timeout in milliseconds for MCP tool/list operations. Default: 30000
|
|
183
|
+
*/
|
|
184
|
+
callTimeoutMs?: number;
|
|
185
|
+
/**
|
|
186
|
+
* HTTP proxy configuration. Falls back to HTTPS_PROXY/HTTP_PROXY environment variables.
|
|
187
|
+
*/
|
|
188
|
+
proxy?: {
|
|
189
|
+
/**
|
|
190
|
+
* Proxy URL. Credentials can be embedded directly if required:
|
|
191
|
+
* `http://user:pass@proxy.example.com:8080`
|
|
192
|
+
*/
|
|
193
|
+
url: string;
|
|
194
|
+
};
|
|
195
|
+
/**
|
|
196
|
+
* Number of retry attempts for transient connection failures and 429 rate limit responses.
|
|
197
|
+
* Uses exponential backoff with Retry-After header awareness. Defaults to 0 (no retries).
|
|
198
|
+
*/
|
|
199
|
+
retryAttempts?: number;
|
|
200
|
+
/**
|
|
201
|
+
* TLS/mTLS configuration for custom certificates or disabling cert validation.
|
|
202
|
+
* File paths should point to PEM-encoded certificate files.
|
|
203
|
+
*/
|
|
204
|
+
tls?: {
|
|
205
|
+
/**
|
|
206
|
+
* Path to CA certificate PEM file (for custom/self-signed CAs)
|
|
207
|
+
*/
|
|
208
|
+
ca?: string;
|
|
209
|
+
/**
|
|
210
|
+
* Path to client certificate PEM file (for mutual TLS)
|
|
211
|
+
*/
|
|
212
|
+
cert?: string;
|
|
213
|
+
/**
|
|
214
|
+
* Path to client private key PEM file (for mutual TLS)
|
|
215
|
+
*/
|
|
216
|
+
key?: string;
|
|
217
|
+
/**
|
|
218
|
+
* Whether to reject unauthorized certificates. Defaults to true.
|
|
219
|
+
* Set to false to disable certificate validation (not recommended for production).
|
|
220
|
+
*/
|
|
221
|
+
rejectUnauthorized?: boolean;
|
|
222
|
+
};
|
|
127
223
|
}
|
|
224
|
+
/**
|
|
225
|
+
* Configuration for MCP client connection.
|
|
226
|
+
*
|
|
227
|
+
* This is a discriminated union — narrow with `isStdioConfig()` or `isHttpConfig()`
|
|
228
|
+
* before accessing transport-specific fields.
|
|
229
|
+
*
|
|
230
|
+
* Supports both stdio (local) and HTTP (remote) transports.
|
|
231
|
+
*/
|
|
232
|
+
type MCPConfig = StdioMCPConfig | HttpMCPConfig;
|
|
128
233
|
/**
|
|
129
234
|
* Union schema for MCPConfig (validates based on transport type)
|
|
130
235
|
*/
|
|
@@ -133,6 +238,7 @@ declare const MCPConfigSchema: z.ZodDiscriminatedUnion<"transport", [z.ZodObject
|
|
|
133
238
|
command: z.ZodString;
|
|
134
239
|
args: z.ZodOptional<z.ZodArray<z.ZodString, "many">>;
|
|
135
240
|
cwd: z.ZodOptional<z.ZodString>;
|
|
241
|
+
env: z.ZodOptional<z.ZodRecord<z.ZodString, z.ZodString>>;
|
|
136
242
|
capabilities: z.ZodOptional<z.ZodObject<{
|
|
137
243
|
sampling: z.ZodOptional<z.ZodRecord<z.ZodString, z.ZodUnknown>>;
|
|
138
244
|
roots: z.ZodOptional<z.ZodObject<{
|
|
@@ -155,12 +261,14 @@ declare const MCPConfigSchema: z.ZodDiscriminatedUnion<"transport", [z.ZodObject
|
|
|
155
261
|
}>>;
|
|
156
262
|
connectTimeoutMs: z.ZodOptional<z.ZodNumber>;
|
|
157
263
|
requestTimeoutMs: z.ZodOptional<z.ZodNumber>;
|
|
264
|
+
callTimeoutMs: z.ZodOptional<z.ZodNumber>;
|
|
158
265
|
quiet: z.ZodOptional<z.ZodBoolean>;
|
|
159
266
|
}, "strip", z.ZodTypeAny, {
|
|
160
267
|
transport: "stdio";
|
|
161
268
|
command: string;
|
|
162
269
|
args?: string[] | undefined;
|
|
163
270
|
cwd?: string | undefined;
|
|
271
|
+
env?: Record<string, string> | undefined;
|
|
164
272
|
capabilities?: {
|
|
165
273
|
sampling?: Record<string, unknown> | undefined;
|
|
166
274
|
roots?: {
|
|
@@ -169,12 +277,14 @@ declare const MCPConfigSchema: z.ZodDiscriminatedUnion<"transport", [z.ZodObject
|
|
|
169
277
|
} | undefined;
|
|
170
278
|
connectTimeoutMs?: number | undefined;
|
|
171
279
|
requestTimeoutMs?: number | undefined;
|
|
280
|
+
callTimeoutMs?: number | undefined;
|
|
172
281
|
quiet?: boolean | undefined;
|
|
173
282
|
}, {
|
|
174
283
|
transport: "stdio";
|
|
175
284
|
command: string;
|
|
176
285
|
args?: string[] | undefined;
|
|
177
286
|
cwd?: string | undefined;
|
|
287
|
+
env?: Record<string, string> | undefined;
|
|
178
288
|
capabilities?: {
|
|
179
289
|
sampling?: Record<string, unknown> | undefined;
|
|
180
290
|
roots?: {
|
|
@@ -183,10 +293,11 @@ declare const MCPConfigSchema: z.ZodDiscriminatedUnion<"transport", [z.ZodObject
|
|
|
183
293
|
} | undefined;
|
|
184
294
|
connectTimeoutMs?: number | undefined;
|
|
185
295
|
requestTimeoutMs?: number | undefined;
|
|
296
|
+
callTimeoutMs?: number | undefined;
|
|
186
297
|
quiet?: boolean | undefined;
|
|
187
298
|
}>, z.ZodObject<{
|
|
188
299
|
transport: z.ZodLiteral<"http">;
|
|
189
|
-
serverUrl: z.ZodString
|
|
300
|
+
serverUrl: z.ZodEffects<z.ZodString, string, string>;
|
|
190
301
|
headers: z.ZodOptional<z.ZodRecord<z.ZodString, z.ZodString>>;
|
|
191
302
|
capabilities: z.ZodOptional<z.ZodObject<{
|
|
192
303
|
sampling: z.ZodOptional<z.ZodRecord<z.ZodString, z.ZodUnknown>>;
|
|
@@ -210,6 +321,7 @@ declare const MCPConfigSchema: z.ZodDiscriminatedUnion<"transport", [z.ZodObject
|
|
|
210
321
|
}>>;
|
|
211
322
|
connectTimeoutMs: z.ZodOptional<z.ZodNumber>;
|
|
212
323
|
requestTimeoutMs: z.ZodOptional<z.ZodNumber>;
|
|
324
|
+
callTimeoutMs: z.ZodOptional<z.ZodNumber>;
|
|
213
325
|
auth: z.ZodOptional<z.ZodEffects<z.ZodObject<{
|
|
214
326
|
accessToken: z.ZodOptional<z.ZodString>;
|
|
215
327
|
oauth: z.ZodOptional<z.ZodObject<{
|
|
@@ -237,6 +349,22 @@ declare const MCPConfigSchema: z.ZodDiscriminatedUnion<"transport", [z.ZodObject
|
|
|
237
349
|
clientSecret?: string | undefined;
|
|
238
350
|
redirectUri?: string | undefined;
|
|
239
351
|
}>>;
|
|
352
|
+
clientCredentials: z.ZodOptional<z.ZodObject<{
|
|
353
|
+
clientId: z.ZodOptional<z.ZodString>;
|
|
354
|
+
clientSecret: z.ZodOptional<z.ZodString>;
|
|
355
|
+
tokenEndpoint: z.ZodOptional<z.ZodString>;
|
|
356
|
+
scopes: z.ZodOptional<z.ZodArray<z.ZodString, "many">>;
|
|
357
|
+
}, "strip", z.ZodTypeAny, {
|
|
358
|
+
scopes?: string[] | undefined;
|
|
359
|
+
clientId?: string | undefined;
|
|
360
|
+
clientSecret?: string | undefined;
|
|
361
|
+
tokenEndpoint?: string | undefined;
|
|
362
|
+
}, {
|
|
363
|
+
scopes?: string[] | undefined;
|
|
364
|
+
clientId?: string | undefined;
|
|
365
|
+
clientSecret?: string | undefined;
|
|
366
|
+
tokenEndpoint?: string | undefined;
|
|
367
|
+
}>>;
|
|
240
368
|
}, "strip", z.ZodTypeAny, {
|
|
241
369
|
accessToken?: string | undefined;
|
|
242
370
|
oauth?: {
|
|
@@ -248,6 +376,12 @@ declare const MCPConfigSchema: z.ZodDiscriminatedUnion<"transport", [z.ZodObject
|
|
|
248
376
|
clientSecret?: string | undefined;
|
|
249
377
|
redirectUri?: string | undefined;
|
|
250
378
|
} | undefined;
|
|
379
|
+
clientCredentials?: {
|
|
380
|
+
scopes?: string[] | undefined;
|
|
381
|
+
clientId?: string | undefined;
|
|
382
|
+
clientSecret?: string | undefined;
|
|
383
|
+
tokenEndpoint?: string | undefined;
|
|
384
|
+
} | undefined;
|
|
251
385
|
}, {
|
|
252
386
|
accessToken?: string | undefined;
|
|
253
387
|
oauth?: {
|
|
@@ -259,6 +393,12 @@ declare const MCPConfigSchema: z.ZodDiscriminatedUnion<"transport", [z.ZodObject
|
|
|
259
393
|
clientSecret?: string | undefined;
|
|
260
394
|
redirectUri?: string | undefined;
|
|
261
395
|
} | undefined;
|
|
396
|
+
clientCredentials?: {
|
|
397
|
+
scopes?: string[] | undefined;
|
|
398
|
+
clientId?: string | undefined;
|
|
399
|
+
clientSecret?: string | undefined;
|
|
400
|
+
tokenEndpoint?: string | undefined;
|
|
401
|
+
} | undefined;
|
|
262
402
|
}>, {
|
|
263
403
|
accessToken?: string | undefined;
|
|
264
404
|
oauth?: {
|
|
@@ -270,6 +410,12 @@ declare const MCPConfigSchema: z.ZodDiscriminatedUnion<"transport", [z.ZodObject
|
|
|
270
410
|
clientSecret?: string | undefined;
|
|
271
411
|
redirectUri?: string | undefined;
|
|
272
412
|
} | undefined;
|
|
413
|
+
clientCredentials?: {
|
|
414
|
+
scopes?: string[] | undefined;
|
|
415
|
+
clientId?: string | undefined;
|
|
416
|
+
clientSecret?: string | undefined;
|
|
417
|
+
tokenEndpoint?: string | undefined;
|
|
418
|
+
} | undefined;
|
|
273
419
|
}, {
|
|
274
420
|
accessToken?: string | undefined;
|
|
275
421
|
oauth?: {
|
|
@@ -281,6 +427,36 @@ declare const MCPConfigSchema: z.ZodDiscriminatedUnion<"transport", [z.ZodObject
|
|
|
281
427
|
clientSecret?: string | undefined;
|
|
282
428
|
redirectUri?: string | undefined;
|
|
283
429
|
} | undefined;
|
|
430
|
+
clientCredentials?: {
|
|
431
|
+
scopes?: string[] | undefined;
|
|
432
|
+
clientId?: string | undefined;
|
|
433
|
+
clientSecret?: string | undefined;
|
|
434
|
+
tokenEndpoint?: string | undefined;
|
|
435
|
+
} | undefined;
|
|
436
|
+
}>>;
|
|
437
|
+
proxy: z.ZodOptional<z.ZodObject<{
|
|
438
|
+
url: z.ZodString;
|
|
439
|
+
}, "strip", z.ZodTypeAny, {
|
|
440
|
+
url: string;
|
|
441
|
+
}, {
|
|
442
|
+
url: string;
|
|
443
|
+
}>>;
|
|
444
|
+
retryAttempts: z.ZodOptional<z.ZodNumber>;
|
|
445
|
+
tls: z.ZodOptional<z.ZodObject<{
|
|
446
|
+
ca: z.ZodOptional<z.ZodString>;
|
|
447
|
+
cert: z.ZodOptional<z.ZodString>;
|
|
448
|
+
key: z.ZodOptional<z.ZodString>;
|
|
449
|
+
rejectUnauthorized: z.ZodOptional<z.ZodBoolean>;
|
|
450
|
+
}, "strip", z.ZodTypeAny, {
|
|
451
|
+
ca?: string | undefined;
|
|
452
|
+
cert?: string | undefined;
|
|
453
|
+
key?: string | undefined;
|
|
454
|
+
rejectUnauthorized?: boolean | undefined;
|
|
455
|
+
}, {
|
|
456
|
+
ca?: string | undefined;
|
|
457
|
+
cert?: string | undefined;
|
|
458
|
+
key?: string | undefined;
|
|
459
|
+
rejectUnauthorized?: boolean | undefined;
|
|
284
460
|
}>>;
|
|
285
461
|
}, "strip", z.ZodTypeAny, {
|
|
286
462
|
serverUrl: string;
|
|
@@ -293,6 +469,7 @@ declare const MCPConfigSchema: z.ZodDiscriminatedUnion<"transport", [z.ZodObject
|
|
|
293
469
|
} | undefined;
|
|
294
470
|
connectTimeoutMs?: number | undefined;
|
|
295
471
|
requestTimeoutMs?: number | undefined;
|
|
472
|
+
callTimeoutMs?: number | undefined;
|
|
296
473
|
headers?: Record<string, string> | undefined;
|
|
297
474
|
auth?: {
|
|
298
475
|
accessToken?: string | undefined;
|
|
@@ -305,6 +482,22 @@ declare const MCPConfigSchema: z.ZodDiscriminatedUnion<"transport", [z.ZodObject
|
|
|
305
482
|
clientSecret?: string | undefined;
|
|
306
483
|
redirectUri?: string | undefined;
|
|
307
484
|
} | undefined;
|
|
485
|
+
clientCredentials?: {
|
|
486
|
+
scopes?: string[] | undefined;
|
|
487
|
+
clientId?: string | undefined;
|
|
488
|
+
clientSecret?: string | undefined;
|
|
489
|
+
tokenEndpoint?: string | undefined;
|
|
490
|
+
} | undefined;
|
|
491
|
+
} | undefined;
|
|
492
|
+
proxy?: {
|
|
493
|
+
url: string;
|
|
494
|
+
} | undefined;
|
|
495
|
+
retryAttempts?: number | undefined;
|
|
496
|
+
tls?: {
|
|
497
|
+
ca?: string | undefined;
|
|
498
|
+
cert?: string | undefined;
|
|
499
|
+
key?: string | undefined;
|
|
500
|
+
rejectUnauthorized?: boolean | undefined;
|
|
308
501
|
} | undefined;
|
|
309
502
|
}, {
|
|
310
503
|
serverUrl: string;
|
|
@@ -317,6 +510,7 @@ declare const MCPConfigSchema: z.ZodDiscriminatedUnion<"transport", [z.ZodObject
|
|
|
317
510
|
} | undefined;
|
|
318
511
|
connectTimeoutMs?: number | undefined;
|
|
319
512
|
requestTimeoutMs?: number | undefined;
|
|
513
|
+
callTimeoutMs?: number | undefined;
|
|
320
514
|
headers?: Record<string, string> | undefined;
|
|
321
515
|
auth?: {
|
|
322
516
|
accessToken?: string | undefined;
|
|
@@ -329,6 +523,22 @@ declare const MCPConfigSchema: z.ZodDiscriminatedUnion<"transport", [z.ZodObject
|
|
|
329
523
|
clientSecret?: string | undefined;
|
|
330
524
|
redirectUri?: string | undefined;
|
|
331
525
|
} | undefined;
|
|
526
|
+
clientCredentials?: {
|
|
527
|
+
scopes?: string[] | undefined;
|
|
528
|
+
clientId?: string | undefined;
|
|
529
|
+
clientSecret?: string | undefined;
|
|
530
|
+
tokenEndpoint?: string | undefined;
|
|
531
|
+
} | undefined;
|
|
532
|
+
} | undefined;
|
|
533
|
+
proxy?: {
|
|
534
|
+
url: string;
|
|
535
|
+
} | undefined;
|
|
536
|
+
retryAttempts?: number | undefined;
|
|
537
|
+
tls?: {
|
|
538
|
+
ca?: string | undefined;
|
|
539
|
+
cert?: string | undefined;
|
|
540
|
+
key?: string | undefined;
|
|
541
|
+
rejectUnauthorized?: boolean | undefined;
|
|
332
542
|
} | undefined;
|
|
333
543
|
}>]>;
|
|
334
544
|
/**
|
|
@@ -342,17 +552,11 @@ declare function validateMCPConfig(config: unknown): MCPConfig;
|
|
|
342
552
|
/**
|
|
343
553
|
* Type guard to check if a config is for stdio transport
|
|
344
554
|
*/
|
|
345
|
-
declare function isStdioConfig(config: MCPConfig): config is
|
|
346
|
-
transport: 'stdio';
|
|
347
|
-
command: string;
|
|
348
|
-
};
|
|
555
|
+
declare function isStdioConfig(config: MCPConfig): config is StdioMCPConfig;
|
|
349
556
|
/**
|
|
350
557
|
* Type guard to check if a config is for HTTP transport
|
|
351
558
|
*/
|
|
352
|
-
declare function isHttpConfig(config: MCPConfig): config is
|
|
353
|
-
transport: 'http';
|
|
354
|
-
serverUrl: string;
|
|
355
|
-
};
|
|
559
|
+
declare function isHttpConfig(config: MCPConfig): config is HttpMCPConfig;
|
|
356
560
|
|
|
357
561
|
/**
|
|
358
562
|
* Auth types for MCP OAuth integration
|
|
@@ -601,6 +805,9 @@ declare class PlaywrightOAuthClientProvider implements OAuthClientProvider {
|
|
|
601
805
|
tokens(): Promise<OAuthTokens | undefined>;
|
|
602
806
|
/**
|
|
603
807
|
* Stores new OAuth tokens for the current session
|
|
808
|
+
*
|
|
809
|
+
* The code verifier is cleared after a successful token exchange — it is
|
|
810
|
+
* single-use per PKCE spec and must not persist beyond the exchange.
|
|
604
811
|
*/
|
|
605
812
|
saveTokens(tokens: OAuthTokens): Promise<void>;
|
|
606
813
|
/**
|
|
@@ -757,6 +964,38 @@ interface AuthServerMetadata {
|
|
|
757
964
|
*/
|
|
758
965
|
issuer: string;
|
|
759
966
|
}
|
|
967
|
+
/**
|
|
968
|
+
* Configuration for client credentials grant
|
|
969
|
+
*/
|
|
970
|
+
interface ClientCredentialsConfig {
|
|
971
|
+
/**
|
|
972
|
+
* Token endpoint URL
|
|
973
|
+
*/
|
|
974
|
+
tokenEndpoint: string;
|
|
975
|
+
/**
|
|
976
|
+
* OAuth client ID
|
|
977
|
+
*/
|
|
978
|
+
clientId: string;
|
|
979
|
+
/**
|
|
980
|
+
* OAuth client secret
|
|
981
|
+
*/
|
|
982
|
+
clientSecret: string;
|
|
983
|
+
/**
|
|
984
|
+
* Scopes to request (optional)
|
|
985
|
+
*/
|
|
986
|
+
scopes?: string[];
|
|
987
|
+
}
|
|
988
|
+
/**
|
|
989
|
+
* Performs the OAuth 2.1 client credentials grant to obtain an access token.
|
|
990
|
+
* Suitable for CI/CD machine-to-machine authentication.
|
|
991
|
+
*
|
|
992
|
+
* Uses oauth4webapi for spec-compliant request construction and response validation,
|
|
993
|
+
* consistent with how the rest of this module handles OAuth flows.
|
|
994
|
+
*
|
|
995
|
+
* @param config - Client credentials configuration
|
|
996
|
+
* @returns Token result
|
|
997
|
+
*/
|
|
998
|
+
declare function performClientCredentialsFlow(config: ClientCredentialsConfig): Promise<TokenResult>;
|
|
760
999
|
|
|
761
1000
|
/**
|
|
762
1001
|
* OAuth Protected Resource and Authorization Server discovery
|
|
@@ -915,8 +1154,9 @@ declare function injectTokens(serverUrl: string, tokens: StoredTokens, stateDir?
|
|
|
915
1154
|
* ```typescript
|
|
916
1155
|
* // After running: npx mcp-server-tester login https://api.example.com/mcp
|
|
917
1156
|
* const tokens = await loadTokens('https://api.example.com/mcp');
|
|
918
|
-
* if (tokens) {
|
|
919
|
-
*
|
|
1157
|
+
* if (tokens?.accessToken) {
|
|
1158
|
+
* // Use the token — never log raw token values
|
|
1159
|
+
* headers.Authorization = `Bearer ${tokens.accessToken}`;
|
|
920
1160
|
* }
|
|
921
1161
|
* ```
|
|
922
1162
|
*/
|
|
@@ -1127,6 +1367,14 @@ interface CreateMCPClientOptions {
|
|
|
1127
1367
|
* This takes precedence over static token auth in config.auth.accessToken.
|
|
1128
1368
|
*/
|
|
1129
1369
|
authProvider?: OAuthClientProvider;
|
|
1370
|
+
/**
|
|
1371
|
+
* Sampling handler callback for LLM sampling requests from the server.
|
|
1372
|
+
*
|
|
1373
|
+
* When provided, the client will advertise sampling capability to the server.
|
|
1374
|
+
* When absent, sampling is removed from declared capabilities so the client
|
|
1375
|
+
* does not falsely advertise support it cannot fulfill.
|
|
1376
|
+
*/
|
|
1377
|
+
samplingHandler?: (...args: unknown[]) => unknown;
|
|
1130
1378
|
}
|
|
1131
1379
|
/**
|
|
1132
1380
|
* Creates and connects an MCP client based on the provided configuration
|
|
@@ -1251,6 +1499,14 @@ interface ValidationResult {
|
|
|
1251
1499
|
message: string;
|
|
1252
1500
|
/** Additional structured details about the validation */
|
|
1253
1501
|
details?: Record<string, unknown>;
|
|
1502
|
+
/**
|
|
1503
|
+
* Optional quantitative metrics from the validation.
|
|
1504
|
+
* Populated by validateToolCalls for precision/recall.
|
|
1505
|
+
*/
|
|
1506
|
+
metrics?: {
|
|
1507
|
+
precision?: number;
|
|
1508
|
+
recall?: number;
|
|
1509
|
+
};
|
|
1254
1510
|
}
|
|
1255
1511
|
/**
|
|
1256
1512
|
* Options for text validation
|
|
@@ -1282,10 +1538,33 @@ interface PatternValidatorOptions {
|
|
|
1282
1538
|
/** Whether to perform case-sensitive matching (default: true) */
|
|
1283
1539
|
caseSensitive?: boolean;
|
|
1284
1540
|
}
|
|
1541
|
+
/**
|
|
1542
|
+
* Built-in snapshot sanitizer names for use with toMatchToolSnapshot.
|
|
1543
|
+
* Pass these values in the sanitizers array to replace non-deterministic
|
|
1544
|
+
* values with stable placeholders before snapshot comparison.
|
|
1545
|
+
*
|
|
1546
|
+
* @example
|
|
1547
|
+
* expect(result).toMatchToolSnapshot('my-snapshot', [
|
|
1548
|
+
* SnapshotSanitizers.UUID,
|
|
1549
|
+
* SnapshotSanitizers.ISO_DATE,
|
|
1550
|
+
* ]);
|
|
1551
|
+
*/
|
|
1552
|
+
declare const SnapshotSanitizers: {
|
|
1553
|
+
/** Replaces Unix timestamps (seconds and milliseconds) with a stable placeholder */
|
|
1554
|
+
readonly TIMESTAMP: "timestamp";
|
|
1555
|
+
/** Replaces UUID v1-v5 strings with a stable placeholder */
|
|
1556
|
+
readonly UUID: "uuid";
|
|
1557
|
+
/** Replaces ISO 8601 date/datetime strings with a stable placeholder */
|
|
1558
|
+
readonly ISO_DATE: "iso-date";
|
|
1559
|
+
/** Replaces MongoDB ObjectId strings with a stable placeholder */
|
|
1560
|
+
readonly OBJECT_ID: "objectId";
|
|
1561
|
+
/** Replaces JWT tokens with a stable placeholder */
|
|
1562
|
+
readonly JWT: "jwt";
|
|
1563
|
+
};
|
|
1285
1564
|
/**
|
|
1286
1565
|
* Built-in sanitizer names for common variable patterns
|
|
1287
1566
|
*/
|
|
1288
|
-
type BuiltInSanitizer =
|
|
1567
|
+
type BuiltInSanitizer = (typeof SnapshotSanitizers)[keyof typeof SnapshotSanitizers];
|
|
1289
1568
|
/**
|
|
1290
1569
|
* Custom regex-based sanitizer
|
|
1291
1570
|
*/
|
|
@@ -1511,38 +1790,63 @@ declare function validateError(response: unknown, expected?: boolean | string |
|
|
|
1511
1790
|
declare function validateSize(response: unknown, options: SizeValidatorOptions): ValidationResult;
|
|
1512
1791
|
|
|
1513
1792
|
/**
|
|
1514
|
-
*
|
|
1793
|
+
* Tool call validators for llm_host simulation results.
|
|
1515
1794
|
*
|
|
1516
|
-
*
|
|
1517
|
-
*
|
|
1795
|
+
* These validators extract the tool call trace from an LLMHostSimulationResult
|
|
1796
|
+
* and apply assertions against expected call lists and counts.
|
|
1518
1797
|
*/
|
|
1519
1798
|
|
|
1799
|
+
interface ToolCallExpectation {
|
|
1800
|
+
calls: Array<{
|
|
1801
|
+
name: string;
|
|
1802
|
+
arguments?: Record<string, unknown>;
|
|
1803
|
+
required?: boolean;
|
|
1804
|
+
}>;
|
|
1805
|
+
order?: 'strict' | 'any';
|
|
1806
|
+
exclusive?: boolean;
|
|
1807
|
+
}
|
|
1808
|
+
interface ToolCallCountOptions {
|
|
1809
|
+
min?: number;
|
|
1810
|
+
max?: number;
|
|
1811
|
+
exact?: number;
|
|
1812
|
+
}
|
|
1520
1813
|
/**
|
|
1521
|
-
*
|
|
1522
|
-
*
|
|
1523
|
-
* Serializes the response to JSON (with pretty printing for consistency)
|
|
1524
|
-
* and returns the byte length using UTF-8 encoding.
|
|
1814
|
+
* Validates tool calls made during an LLM host simulation.
|
|
1525
1815
|
*
|
|
1526
|
-
* @param response -
|
|
1527
|
-
* @
|
|
1816
|
+
* @param response - Must be an LLMHostSimulationResult (from llm_host mode)
|
|
1817
|
+
* @param expectation - Expected tool call specification
|
|
1528
1818
|
*/
|
|
1529
|
-
declare function
|
|
1819
|
+
declare function validateToolCalls(response: unknown, expectation: ToolCallExpectation): ValidationResult;
|
|
1530
1820
|
/**
|
|
1531
|
-
*
|
|
1532
|
-
*
|
|
1533
|
-
* Collapses multiple whitespace characters (spaces, tabs, newlines) into single spaces
|
|
1534
|
-
* and trims leading/trailing whitespace.
|
|
1821
|
+
* Validates the number of tool calls made during an LLM host simulation.
|
|
1535
1822
|
*
|
|
1536
|
-
* @param
|
|
1537
|
-
* @
|
|
1823
|
+
* @param response - Must be an LLMHostSimulationResult (from llm_host mode)
|
|
1824
|
+
* @param options - Count constraints (min, max, exact)
|
|
1825
|
+
*/
|
|
1826
|
+
declare function validateToolCallCount(response: unknown, options: ToolCallCountOptions): ValidationResult;
|
|
1827
|
+
|
|
1828
|
+
/**
|
|
1829
|
+
* Built-in judge rubrics matching Glean EvalV2's named judge types.
|
|
1830
|
+
* Use these for consistent, standardized evaluations across teams.
|
|
1538
1831
|
*
|
|
1539
|
-
*
|
|
1540
|
-
* ```typescript
|
|
1541
|
-
* normalizeWhitespace(' hello\n\n world ');
|
|
1542
|
-
* // Returns: "hello world"
|
|
1543
|
-
* ```
|
|
1832
|
+
* All built-in rubrics use a 5-point scale: 0.0 / 0.25 / 0.5 / 0.75 / 1.0
|
|
1544
1833
|
*/
|
|
1545
|
-
|
|
1834
|
+
type BuiltInRubric = 'correctness' | 'completeness' | 'groundedness' | 'instruction-following' | 'conciseness';
|
|
1835
|
+
declare const BUILT_IN_RUBRICS: Record<BuiltInRubric, string>;
|
|
1836
|
+
/** A rubric specification: either a built-in named rubric or custom text. */
|
|
1837
|
+
type RubricSpec = BuiltInRubric | {
|
|
1838
|
+
text: string;
|
|
1839
|
+
};
|
|
1840
|
+
/**
|
|
1841
|
+
* Returns true if `s` is a built-in rubric name.
|
|
1842
|
+
*/
|
|
1843
|
+
declare function isBuiltInRubric(s: unknown): s is BuiltInRubric;
|
|
1844
|
+
/**
|
|
1845
|
+
* Resolves a RubricSpec to its full rubric text.
|
|
1846
|
+
* - Built-in name → returns the expanded rubric text from BUILT_IN_RUBRICS
|
|
1847
|
+
* - Custom object → returns rubric.text as-is
|
|
1848
|
+
*/
|
|
1849
|
+
declare function resolveRubric(rubric: RubricSpec): string;
|
|
1546
1850
|
|
|
1547
1851
|
/**
|
|
1548
1852
|
* Usage metrics from Claude Agent SDK response
|
|
@@ -1577,17 +1881,15 @@ interface UsageMetrics {
|
|
|
1577
1881
|
*/
|
|
1578
1882
|
cacheCreationInputTokens?: number;
|
|
1579
1883
|
}
|
|
1580
|
-
/**
|
|
1581
|
-
|
|
1582
|
-
*/
|
|
1583
|
-
type ProviderKind = 'claude' | 'anthropic' | 'openai' | 'custom-http';
|
|
1884
|
+
/** Valid LLM judge provider kinds. */
|
|
1885
|
+
type ProviderKind = 'anthropic' | 'openai' | 'google';
|
|
1584
1886
|
/**
|
|
1585
1887
|
* Configuration for an LLM judge
|
|
1586
1888
|
*/
|
|
1587
1889
|
interface JudgeConfig {
|
|
1588
1890
|
/**
|
|
1589
1891
|
* LLM provider to use
|
|
1590
|
-
* @default '
|
|
1892
|
+
* @default 'anthropic'
|
|
1591
1893
|
*/
|
|
1592
1894
|
provider?: ProviderKind;
|
|
1593
1895
|
/**
|
|
@@ -1649,7 +1951,24 @@ interface JudgeResult {
|
|
|
1649
1951
|
* Whether the candidate exceeded maxToolOutputSize
|
|
1650
1952
|
*/
|
|
1651
1953
|
exceedsMaxToolOutputSize?: boolean;
|
|
1954
|
+
/**
|
|
1955
|
+
* Standard deviation of individual rep scores.
|
|
1956
|
+
* Only populated when the judge was run with reps > 1.
|
|
1957
|
+
*/
|
|
1958
|
+
scoreStdDev?: number;
|
|
1959
|
+
/**
|
|
1960
|
+
* True when the standard deviation across reps exceeds 0.2, indicating
|
|
1961
|
+
* that the rubric may be ambiguous or the judge is non-deterministic.
|
|
1962
|
+
* Only populated when the judge was run with reps > 1.
|
|
1963
|
+
*/
|
|
1964
|
+
highVariance?: boolean;
|
|
1965
|
+
/**
|
|
1966
|
+
* Individual scores from each judge rep.
|
|
1967
|
+
* Only populated when the judge was run with reps > 1.
|
|
1968
|
+
*/
|
|
1969
|
+
scores?: number[];
|
|
1652
1970
|
}
|
|
1971
|
+
|
|
1653
1972
|
/**
|
|
1654
1973
|
* LLM judge client interface
|
|
1655
1974
|
*/
|
|
@@ -1665,6 +1984,75 @@ interface Judge {
|
|
|
1665
1984
|
evaluate(candidate: unknown, reference: unknown, rubric: string): Promise<JudgeResult>;
|
|
1666
1985
|
}
|
|
1667
1986
|
|
|
1987
|
+
/**
|
|
1988
|
+
* Judge Validator
|
|
1989
|
+
*
|
|
1990
|
+
* Validates a response using an LLM-as-a-judge evaluation.
|
|
1991
|
+
*/
|
|
1992
|
+
|
|
1993
|
+
/**
|
|
1994
|
+
* Configuration for the judge validator
|
|
1995
|
+
*/
|
|
1996
|
+
interface JudgeValidatorConfig {
|
|
1997
|
+
/** The evaluation rubric: a built-in name or custom { text: string } */
|
|
1998
|
+
rubric: RubricSpec;
|
|
1999
|
+
/** Optional reference response to compare against */
|
|
2000
|
+
reference?: unknown;
|
|
2001
|
+
/** Minimum score required to pass (0-1, default: 0.7) */
|
|
2002
|
+
threshold?: number;
|
|
2003
|
+
/** Number of judge evaluations to run. Scores averaged. @default 1 */
|
|
2004
|
+
reps?: number;
|
|
2005
|
+
/** Judge provider. @default 'anthropic' */
|
|
2006
|
+
provider?: ProviderKind;
|
|
2007
|
+
/** Model override (e.g., 'claude-opus-4-20250514') */
|
|
2008
|
+
model?: string;
|
|
2009
|
+
/** Environment variable name for API key */
|
|
2010
|
+
apiKeyEnvVar?: string;
|
|
2011
|
+
/** Max tokens for judge response */
|
|
2012
|
+
maxTokens?: number;
|
|
2013
|
+
/** Temperature for judge LLM (0–1) */
|
|
2014
|
+
temperature?: number;
|
|
2015
|
+
/** Max budget in USD per evaluation */
|
|
2016
|
+
maxBudgetUsd?: number;
|
|
2017
|
+
/** Fail if response exceeds this size in bytes before judging */
|
|
2018
|
+
maxToolOutputSize?: number;
|
|
2019
|
+
}
|
|
2020
|
+
declare function validateJudge(response: unknown, config: JudgeValidatorConfig): Promise<ValidationResult>;
|
|
2021
|
+
|
|
2022
|
+
/**
|
|
2023
|
+
* Validator Utilities
|
|
2024
|
+
*
|
|
2025
|
+
* Shared utility functions for validation operations.
|
|
2026
|
+
* Re-exports core utilities from mcp/response.ts and adds validation-specific helpers.
|
|
2027
|
+
*/
|
|
2028
|
+
|
|
2029
|
+
/**
|
|
2030
|
+
* Gets the size of a response in bytes
|
|
2031
|
+
*
|
|
2032
|
+
* Serializes the response to JSON (with pretty printing for consistency)
|
|
2033
|
+
* and returns the byte length using UTF-8 encoding.
|
|
2034
|
+
*
|
|
2035
|
+
* @param response - Response in any format
|
|
2036
|
+
* @returns Size in bytes
|
|
2037
|
+
*/
|
|
2038
|
+
declare function getResponseSizeBytes(response: unknown): number;
|
|
2039
|
+
/**
|
|
2040
|
+
* Normalizes whitespace in text for consistent comparison
|
|
2041
|
+
*
|
|
2042
|
+
* Collapses multiple whitespace characters (spaces, tabs, newlines) into single spaces
|
|
2043
|
+
* and trims leading/trailing whitespace.
|
|
2044
|
+
*
|
|
2045
|
+
* @param text - Text to normalize
|
|
2046
|
+
* @returns Normalized text with collapsed whitespace
|
|
2047
|
+
*
|
|
2048
|
+
* @example
|
|
2049
|
+
* ```typescript
|
|
2050
|
+
* normalizeWhitespace(' hello\n\n world ');
|
|
2051
|
+
* // Returns: "hello world"
|
|
2052
|
+
* ```
|
|
2053
|
+
*/
|
|
2054
|
+
declare function normalizeWhitespace(text: string): string;
|
|
2055
|
+
|
|
1668
2056
|
/**
|
|
1669
2057
|
* Matcher Types
|
|
1670
2058
|
*
|
|
@@ -1679,8 +2067,12 @@ interface JudgeMatcherOptions {
|
|
|
1679
2067
|
reference?: unknown;
|
|
1680
2068
|
/** Score threshold for passing (default: 0.7) */
|
|
1681
2069
|
passingThreshold?: number;
|
|
1682
|
-
/**
|
|
1683
|
-
|
|
2070
|
+
/** Number of judge evaluations (scores averaged) */
|
|
2071
|
+
reps?: number;
|
|
2072
|
+
/** Override the judge provider */
|
|
2073
|
+
provider?: ProviderKind;
|
|
2074
|
+
/** Override the judge model */
|
|
2075
|
+
model?: string;
|
|
1684
2076
|
}
|
|
1685
2077
|
/**
|
|
1686
2078
|
* Declaration merging for Playwright matchers
|
|
@@ -1785,7 +2177,7 @@ declare global {
|
|
|
1785
2177
|
* });
|
|
1786
2178
|
* ```
|
|
1787
2179
|
*/
|
|
1788
|
-
toPassToolJudge(rubric:
|
|
2180
|
+
toPassToolJudge(rubric: RubricSpec, options?: JudgeMatcherOptions): Promise<R>;
|
|
1789
2181
|
/**
|
|
1790
2182
|
* Validates that a response meets size constraints
|
|
1791
2183
|
*
|
|
@@ -1830,11 +2222,33 @@ declare global {
|
|
|
1830
2222
|
* ```
|
|
1831
2223
|
*/
|
|
1832
2224
|
toSatisfyToolPredicate(predicate: ToolPredicate, description?: string): Promise<R>;
|
|
1833
|
-
|
|
1834
|
-
|
|
1835
|
-
|
|
1836
|
-
|
|
1837
|
-
|
|
2225
|
+
/**
|
|
2226
|
+
* Validates which tools the LLM called during an llm_host simulation.
|
|
2227
|
+
*
|
|
2228
|
+
* @example
|
|
2229
|
+
* ```typescript
|
|
2230
|
+
* expect(simulationResult).toHaveToolCalls({
|
|
2231
|
+
* calls: [{ name: 'search', arguments: { query: 'hello' }, required: true }],
|
|
2232
|
+
* order: 'any',
|
|
2233
|
+
* });
|
|
2234
|
+
* ```
|
|
2235
|
+
*/
|
|
2236
|
+
toHaveToolCalls(expectation: ToolCallExpectation): R;
|
|
2237
|
+
/**
|
|
2238
|
+
* Validates the number of tool calls made during an llm_host simulation.
|
|
2239
|
+
*
|
|
2240
|
+
* @example
|
|
2241
|
+
* ```typescript
|
|
2242
|
+
* expect(simulationResult).toHaveToolCallCount({ min: 1, max: 3 });
|
|
2243
|
+
* expect(simulationResult).toHaveToolCallCount({ exact: 2 });
|
|
2244
|
+
* ```
|
|
2245
|
+
*/
|
|
2246
|
+
toHaveToolCallCount(options: ToolCallCountOptions): R;
|
|
2247
|
+
}
|
|
2248
|
+
}
|
|
2249
|
+
}
|
|
2250
|
+
/**
|
|
2251
|
+
* Predicate result returned by the user's predicate function
|
|
1838
2252
|
*/
|
|
1839
2253
|
interface PredicateResult {
|
|
1840
2254
|
/** Whether the predicate passed */
|
|
@@ -1873,7 +2287,7 @@ type ResultSource = 'eval' | 'test';
|
|
|
1873
2287
|
/**
|
|
1874
2288
|
* Known expectation types supported by the framework
|
|
1875
2289
|
*/
|
|
1876
|
-
type ExpectationType = 'exact' | 'schema' | 'textContains' | 'regex' | 'snapshot' | 'judge' | 'error' | 'size';
|
|
2290
|
+
type ExpectationType = 'exact' | 'schema' | 'textContains' | 'regex' | 'snapshot' | 'judge' | 'error' | 'size' | 'toolsTriggered' | 'toolCallCount';
|
|
1877
2291
|
/**
|
|
1878
2292
|
* Result of an expectation check
|
|
1879
2293
|
*/
|
|
@@ -1912,6 +2326,10 @@ interface MCPFixtureOptions {
|
|
|
1912
2326
|
* Used for filtering and grouping in the reporter
|
|
1913
2327
|
*/
|
|
1914
2328
|
project?: string;
|
|
2329
|
+
/**
|
|
2330
|
+
* Timeout in milliseconds for MCP tool/list operations. Default: 30000
|
|
2331
|
+
*/
|
|
2332
|
+
callTimeoutMs?: number;
|
|
1915
2333
|
}
|
|
1916
2334
|
/**
|
|
1917
2335
|
* High-level API for interacting with MCP servers in tests
|
|
@@ -1954,29 +2372,43 @@ interface MCPFixtureApi {
|
|
|
1954
2372
|
} | null;
|
|
1955
2373
|
}
|
|
1956
2374
|
/**
|
|
1957
|
-
* Creates an MCP fixture wrapper around a Client
|
|
2375
|
+
* Creates an MCP fixture wrapper around a Client, providing a high-level
|
|
2376
|
+
* {@link MCPFixtureApi} without requiring Playwright's `test.extend` pattern.
|
|
1958
2377
|
*
|
|
1959
|
-
*
|
|
1960
|
-
*
|
|
2378
|
+
* Use this when you need to set up an MCP fixture manually — for example in
|
|
2379
|
+
* custom fixture hierarchies, non-Playwright test runners (e.g. Vitest,
|
|
2380
|
+
* Jest), or when you want to compose the fixture with other lifecycle
|
|
2381
|
+
* management logic that doesn't fit the standard `test.extend` model.
|
|
1961
2382
|
*
|
|
1962
|
-
*
|
|
1963
|
-
*
|
|
2383
|
+
* For the typical Playwright use case, prefer importing `test` and `mcp`
|
|
2384
|
+
* directly from `@gleanwork/mcp-server-tester/fixtures/mcp`, which wires
|
|
2385
|
+
* this function up automatically.
|
|
2386
|
+
*
|
|
2387
|
+
* When `testInfo` is provided, all MCP operations are automatically wrapped
|
|
2388
|
+
* in `test.step()` calls and attachments are created for the MCP Test
|
|
2389
|
+
* Reporter. Omit `testInfo` for lightweight usage outside Playwright.
|
|
2390
|
+
*
|
|
2391
|
+
* @param client - The MCP client to wrap (created via `createMCPClientForConfig`)
|
|
2392
|
+
* @param testInfo - Optional Playwright TestInfo for auto-tracking and reporter attachments
|
|
2393
|
+
* @param options - Optional fixture options (authType, project)
|
|
1964
2394
|
* @returns MCPFixtureApi instance
|
|
1965
2395
|
*
|
|
1966
2396
|
* @example
|
|
1967
2397
|
* ```typescript
|
|
1968
|
-
* //
|
|
2398
|
+
* // Advanced: custom fixture setup inside test.extend
|
|
1969
2399
|
* const test = base.extend<{ mcp: MCPFixtureApi }>({
|
|
1970
2400
|
* mcp: async ({}, use, testInfo) => {
|
|
1971
2401
|
* const client = await createMCPClientForConfig(config);
|
|
1972
|
-
* const api = createMCPFixture(client, testInfo);
|
|
2402
|
+
* const api = createMCPFixture(client, testInfo, { authType: 'api-token' });
|
|
1973
2403
|
* await use(api);
|
|
1974
2404
|
* await closeMCPClient(client);
|
|
1975
2405
|
* }
|
|
1976
2406
|
* });
|
|
1977
2407
|
*
|
|
1978
|
-
* //
|
|
2408
|
+
* // Non-Playwright usage (no reporter attachments)
|
|
2409
|
+
* const client = await createMCPClientForConfig(config);
|
|
1979
2410
|
* const api = createMCPFixture(client);
|
|
2411
|
+
* const tools = await api.listTools();
|
|
1980
2412
|
* ```
|
|
1981
2413
|
*/
|
|
1982
2414
|
declare function createMCPFixture(client: Client, testInfo?: TestInfo, options?: MCPFixtureOptions): MCPFixtureApi;
|
|
@@ -2082,6 +2514,8 @@ declare function toBeToolError(this: {
|
|
|
2082
2514
|
* toPassToolJudge Matcher
|
|
2083
2515
|
*
|
|
2084
2516
|
* Validates that a response passes LLM-as-judge evaluation.
|
|
2517
|
+
* Delegates evaluation logic to validateJudge() for consistency
|
|
2518
|
+
* with the validator/matcher duality pattern.
|
|
2085
2519
|
*/
|
|
2086
2520
|
|
|
2087
2521
|
/**
|
|
@@ -2091,7 +2525,7 @@ declare function toBeToolError(this: {
|
|
|
2091
2525
|
*/
|
|
2092
2526
|
declare function toPassToolJudge(this: {
|
|
2093
2527
|
isNot: boolean;
|
|
2094
|
-
}, received: unknown, rubric:
|
|
2528
|
+
}, received: unknown, rubric: RubricSpec, options?: JudgeMatcherOptions): Promise<{
|
|
2095
2529
|
pass: boolean;
|
|
2096
2530
|
message: () => string;
|
|
2097
2531
|
}>;
|
|
@@ -2158,6 +2592,38 @@ declare function toSatisfyToolPredicate(this: {
|
|
|
2158
2592
|
message: () => string;
|
|
2159
2593
|
}>;
|
|
2160
2594
|
|
|
2595
|
+
/**
|
|
2596
|
+
* toHaveToolCalls Matcher
|
|
2597
|
+
*
|
|
2598
|
+
* Validates which tools the LLM called during an llm_host simulation.
|
|
2599
|
+
*/
|
|
2600
|
+
|
|
2601
|
+
/**
|
|
2602
|
+
* Creates the toHaveToolCalls matcher function
|
|
2603
|
+
*/
|
|
2604
|
+
declare function toHaveToolCalls(this: {
|
|
2605
|
+
isNot: boolean;
|
|
2606
|
+
}, received: unknown, expectation: ToolCallExpectation): {
|
|
2607
|
+
pass: boolean;
|
|
2608
|
+
message: () => string;
|
|
2609
|
+
};
|
|
2610
|
+
|
|
2611
|
+
/**
|
|
2612
|
+
* toHaveToolCallCount Matcher
|
|
2613
|
+
*
|
|
2614
|
+
* Validates the number of tool calls made during an llm_host simulation.
|
|
2615
|
+
*/
|
|
2616
|
+
|
|
2617
|
+
/**
|
|
2618
|
+
* Creates the toHaveToolCallCount matcher function
|
|
2619
|
+
*/
|
|
2620
|
+
declare function toHaveToolCallCount(this: {
|
|
2621
|
+
isNot: boolean;
|
|
2622
|
+
}, received: unknown, options: ToolCallCountOptions): {
|
|
2623
|
+
pass: boolean;
|
|
2624
|
+
message: () => string;
|
|
2625
|
+
};
|
|
2626
|
+
|
|
2161
2627
|
/**
|
|
2162
2628
|
* Extended Playwright expect with MCP tool matchers
|
|
2163
2629
|
*
|
|
@@ -2184,6 +2650,8 @@ declare const expect: playwright_test.Expect<{
|
|
|
2184
2650
|
toPassToolJudge: typeof toPassToolJudge;
|
|
2185
2651
|
toHaveToolResponseSize: typeof toHaveToolResponseSize;
|
|
2186
2652
|
toSatisfyToolPredicate: typeof toSatisfyToolPredicate;
|
|
2653
|
+
toHaveToolCalls: typeof toHaveToolCalls;
|
|
2654
|
+
toHaveToolCallCount: typeof toHaveToolCallCount;
|
|
2187
2655
|
}>;
|
|
2188
2656
|
|
|
2189
2657
|
/**
|
|
@@ -2223,7 +2691,33 @@ type MCPFixtures = {
|
|
|
2223
2691
|
* expect(tools.length).toBeGreaterThan(0);
|
|
2224
2692
|
* });
|
|
2225
2693
|
*/
|
|
2226
|
-
declare const test: playwright_test.TestType<playwright_test.PlaywrightTestArgs & playwright_test.PlaywrightTestOptions & MCPFixtures, playwright_test.PlaywrightWorkerArgs & playwright_test.PlaywrightWorkerOptions>;
|
|
2694
|
+
declare const test$1: playwright_test.TestType<playwright_test.PlaywrightTestArgs & playwright_test.PlaywrightTestOptions & MCPFixtures, playwright_test.PlaywrightWorkerArgs & playwright_test.PlaywrightWorkerOptions>;
|
|
2695
|
+
|
|
2696
|
+
/**
|
|
2697
|
+
* Test-scoped auth fixtures interface
|
|
2698
|
+
*/
|
|
2699
|
+
interface MCPAuthFixtures {
|
|
2700
|
+
/**
|
|
2701
|
+
* OAuth client provider for MCP authentication
|
|
2702
|
+
*/
|
|
2703
|
+
mcpAuthProvider: OAuthClientProvider | undefined;
|
|
2704
|
+
}
|
|
2705
|
+
/**
|
|
2706
|
+
* Extended Playwright test with MCP auth fixtures
|
|
2707
|
+
*
|
|
2708
|
+
* Use this when you need OAuth authentication for MCP server testing.
|
|
2709
|
+
*
|
|
2710
|
+
* @example
|
|
2711
|
+
* ```typescript
|
|
2712
|
+
* // test.ts
|
|
2713
|
+
* import { test } from '@gleanwork/mcp-server-tester/fixtures/mcpAuth';
|
|
2714
|
+
*
|
|
2715
|
+
* test('authenticated MCP call', async ({ mcpAuthProvider }) => {
|
|
2716
|
+
* // mcpAuthProvider can be passed to createMCPClientForConfig
|
|
2717
|
+
* });
|
|
2718
|
+
* ```
|
|
2719
|
+
*/
|
|
2720
|
+
declare const test: playwright_test.TestType<playwright_test.PlaywrightTestArgs & playwright_test.PlaywrightTestOptions & MCPAuthFixtures, playwright_test.PlaywrightWorkerArgs & playwright_test.PlaywrightWorkerOptions>;
|
|
2227
2721
|
|
|
2228
2722
|
/**
|
|
2229
2723
|
* Types and interfaces for LLM host simulation mode
|
|
@@ -2233,9 +2727,29 @@ declare const test: playwright_test.TestType<playwright_test.PlaywrightTestArgs
|
|
|
2233
2727
|
*/
|
|
2234
2728
|
|
|
2235
2729
|
/**
|
|
2236
|
-
* LLM provider for host simulation
|
|
2730
|
+
* LLM provider for host simulation.
|
|
2731
|
+
*
|
|
2732
|
+
* All providers run through the Vercel AI SDK (`ai` package).
|
|
2733
|
+
* Each provider requires its corresponding @ai-sdk/* package:
|
|
2734
|
+
*
|
|
2735
|
+
* openai → npm install ai @ai-sdk/openai
|
|
2736
|
+
* anthropic → npm install ai @ai-sdk/anthropic
|
|
2737
|
+
* google → npm install ai @ai-sdk/google
|
|
2738
|
+
* azure → npm install ai @ai-sdk/azure
|
|
2739
|
+
* mistral → npm install ai @ai-sdk/mistral
|
|
2740
|
+
* deepseek → npm install ai @ai-sdk/deepseek
|
|
2741
|
+
* openrouter → npm install ai @openrouter/ai-sdk-provider
|
|
2742
|
+
* xai → npm install ai @ai-sdk/xai
|
|
2743
|
+
*/
|
|
2744
|
+
type LLMProvider = 'openai' | 'anthropic' | 'azure' | 'google' | 'mistral' | 'deepseek' | 'openrouter' | 'xai'
|
|
2745
|
+
/**
|
|
2746
|
+
* Anthropic Claude via Google Vertex AI.
|
|
2747
|
+
* Requires @ai-sdk/google-vertex and Application Default Credentials (gcloud auth).
|
|
2748
|
+
* Set GOOGLE_VERTEX_PROJECT and GOOGLE_VERTEX_LOCATION env vars.
|
|
2749
|
+
* Use this instead of 'anthropic' in environments where api.anthropic.com is blocked.
|
|
2750
|
+
* @example model: 'claude-3-5-haiku@20241022'
|
|
2237
2751
|
*/
|
|
2238
|
-
|
|
2752
|
+
| 'vertex-anthropic';
|
|
2239
2753
|
/**
|
|
2240
2754
|
* Configuration for LLM host simulation
|
|
2241
2755
|
*/
|
|
@@ -2246,12 +2760,10 @@ interface LLMHostConfig {
|
|
|
2246
2760
|
provider: LLMProvider;
|
|
2247
2761
|
/**
|
|
2248
2762
|
* Environment variable name containing the API key
|
|
2249
|
-
* @default 'OPENAI_API_KEY' for openai, 'ANTHROPIC_API_KEY' for anthropic
|
|
2250
2763
|
*/
|
|
2251
2764
|
apiKeyEnvVar?: string;
|
|
2252
2765
|
/**
|
|
2253
|
-
* Model to use
|
|
2254
|
-
* @default 'gpt-4' for openai, 'claude-3-5-sonnet-20241022' for anthropic
|
|
2766
|
+
* Model to use (provider-specific default if omitted)
|
|
2255
2767
|
*/
|
|
2256
2768
|
model?: string;
|
|
2257
2769
|
/**
|
|
@@ -2260,11 +2772,11 @@ interface LLMHostConfig {
|
|
|
2260
2772
|
maxTokens?: number;
|
|
2261
2773
|
/**
|
|
2262
2774
|
* Temperature (0-1, lower is more deterministic)
|
|
2263
|
-
* @default 0
|
|
2775
|
+
* @default 0
|
|
2264
2776
|
*/
|
|
2265
2777
|
temperature?: number;
|
|
2266
2778
|
/**
|
|
2267
|
-
* Maximum number of tool
|
|
2779
|
+
* Maximum number of tool call steps to allow in a single conversation
|
|
2268
2780
|
* @default 10
|
|
2269
2781
|
*/
|
|
2270
2782
|
maxToolCalls?: number;
|
|
@@ -2273,72 +2785,49 @@ interface LLMHostConfig {
|
|
|
2273
2785
|
* A tool call made by the LLM
|
|
2274
2786
|
*/
|
|
2275
2787
|
interface LLMToolCall {
|
|
2276
|
-
/**
|
|
2277
|
-
* Tool name
|
|
2278
|
-
*/
|
|
2788
|
+
/** Tool name */
|
|
2279
2789
|
name: string;
|
|
2280
|
-
/**
|
|
2281
|
-
* Tool arguments (as provided by LLM)
|
|
2282
|
-
*/
|
|
2790
|
+
/** Tool arguments (as provided by LLM) */
|
|
2283
2791
|
arguments: Record<string, unknown>;
|
|
2284
|
-
/**
|
|
2285
|
-
* Optional tool call ID (for tracking)
|
|
2286
|
-
*/
|
|
2792
|
+
/** Optional tool call ID (for tracking) */
|
|
2287
2793
|
id?: string;
|
|
2288
2794
|
}
|
|
2289
|
-
/**
|
|
2290
|
-
* Result of a tool call validation
|
|
2291
|
-
*/
|
|
2292
|
-
interface ToolCallValidationResult {
|
|
2293
|
-
/**
|
|
2294
|
-
* Whether the tool call was valid
|
|
2295
|
-
*/
|
|
2296
|
-
valid: boolean;
|
|
2297
|
-
/**
|
|
2298
|
-
* List of actual tool calls made
|
|
2299
|
-
*/
|
|
2300
|
-
actualCalls: Array<LLMToolCall>;
|
|
2301
|
-
/**
|
|
2302
|
-
* Expected tool calls (if specified in eval case)
|
|
2303
|
-
*/
|
|
2304
|
-
expectedCalls?: Array<LLMToolCall>;
|
|
2305
|
-
/**
|
|
2306
|
-
* Details about validation (e.g., missing calls, incorrect arguments)
|
|
2307
|
-
*/
|
|
2308
|
-
details?: string;
|
|
2309
|
-
}
|
|
2310
2795
|
/**
|
|
2311
2796
|
* Result from an LLM host simulation
|
|
2312
2797
|
*/
|
|
2313
2798
|
interface LLMHostSimulationResult {
|
|
2314
|
-
/**
|
|
2315
|
-
* Whether the simulation succeeded
|
|
2316
|
-
*/
|
|
2799
|
+
/** Whether the simulation succeeded */
|
|
2317
2800
|
success: boolean;
|
|
2318
|
-
/**
|
|
2319
|
-
* Tool calls made by the LLM
|
|
2320
|
-
*/
|
|
2801
|
+
/** Tool calls made by the LLM */
|
|
2321
2802
|
toolCalls: Array<LLMToolCall>;
|
|
2322
|
-
/**
|
|
2323
|
-
* Final response from the LLM
|
|
2324
|
-
*/
|
|
2803
|
+
/** Final response from the LLM */
|
|
2325
2804
|
response?: string;
|
|
2326
|
-
/**
|
|
2327
|
-
* Error message if simulation failed
|
|
2328
|
-
*/
|
|
2805
|
+
/** Error message if simulation failed */
|
|
2329
2806
|
error?: string;
|
|
2330
|
-
/**
|
|
2331
|
-
|
|
2332
|
-
|
|
2807
|
+
/** The scenario prompt that was given to the LLM */
|
|
2808
|
+
scenario?: string;
|
|
2809
|
+
/** The conversation turns for attribution analysis */
|
|
2333
2810
|
conversationHistory?: Array<{
|
|
2334
2811
|
role: 'user' | 'assistant' | 'tool';
|
|
2335
2812
|
content: string;
|
|
2336
2813
|
}>;
|
|
2814
|
+
/**
|
|
2815
|
+
* Milliseconds spent waiting for LLM responses
|
|
2816
|
+
* (excludes MCP tool execution time)
|
|
2817
|
+
*/
|
|
2818
|
+
llmDurationMs?: number;
|
|
2819
|
+
/**
|
|
2820
|
+
* Milliseconds spent executing MCP tool calls
|
|
2821
|
+
* (excludes LLM response time)
|
|
2822
|
+
*/
|
|
2823
|
+
mcpDurationMs?: number;
|
|
2337
2824
|
}
|
|
2338
2825
|
/**
|
|
2339
|
-
* Interface for LLM host simulators
|
|
2826
|
+
* Interface for LLM host simulators.
|
|
2340
2827
|
*
|
|
2341
|
-
*
|
|
2828
|
+
* The only built-in implementation is the Vercel AI SDK orchestrator
|
|
2829
|
+
* (src/evals/llmHost/adapters/vercel.ts). Custom implementations can be
|
|
2830
|
+
* created for specialised testing needs.
|
|
2342
2831
|
*/
|
|
2343
2832
|
interface LLMHostSimulator {
|
|
2344
2833
|
/**
|
|
@@ -2351,24 +2840,6 @@ interface LLMHostSimulator {
|
|
|
2351
2840
|
*/
|
|
2352
2841
|
simulate(mcp: MCPFixtureApi, scenario: string, config: LLMHostConfig): Promise<LLMHostSimulationResult>;
|
|
2353
2842
|
}
|
|
2354
|
-
/**
|
|
2355
|
-
* Expected tool call specification (for validation)
|
|
2356
|
-
*/
|
|
2357
|
-
interface ExpectedToolCall {
|
|
2358
|
-
/**
|
|
2359
|
-
* Tool name
|
|
2360
|
-
*/
|
|
2361
|
-
name: string;
|
|
2362
|
-
/**
|
|
2363
|
-
* Expected arguments (partial match)
|
|
2364
|
-
*/
|
|
2365
|
-
arguments?: Record<string, unknown>;
|
|
2366
|
-
/**
|
|
2367
|
-
* Whether this call is required
|
|
2368
|
-
* @default true
|
|
2369
|
-
*/
|
|
2370
|
-
required?: boolean;
|
|
2371
|
-
}
|
|
2372
2843
|
|
|
2373
2844
|
/**
|
|
2374
2845
|
* Evaluation mode
|
|
@@ -2423,6 +2894,41 @@ interface EvalCase {
|
|
|
2423
2894
|
* For 'llm_host' mode, can include 'expectedToolCalls' for validation
|
|
2424
2895
|
*/
|
|
2425
2896
|
metadata?: Record<string, unknown>;
|
|
2897
|
+
/**
|
|
2898
|
+
* Number of times to run this case and compute an accuracy score.
|
|
2899
|
+
* When > 1, `EvalCaseResult.accuracy` is populated and `pass` is determined
|
|
2900
|
+
* by `accuracyThreshold` rather than a single run.
|
|
2901
|
+
* @default 1
|
|
2902
|
+
*/
|
|
2903
|
+
iterations?: number;
|
|
2904
|
+
/**
|
|
2905
|
+
* Minimum accuracy (0–1) required to pass when `iterations > 1`.
|
|
2906
|
+
* @default 1.0 (all iterations must pass)
|
|
2907
|
+
*/
|
|
2908
|
+
accuracyThreshold?: number;
|
|
2909
|
+
/**
|
|
2910
|
+
* Number of times to invoke the LLM judge per `passesJudge` assertion.
|
|
2911
|
+
* Scores are averaged; the mean must meet the threshold to pass.
|
|
2912
|
+
* Reduces judge variance caused by non-determinism.
|
|
2913
|
+
* Per-assertion `passesJudge.reps` overrides this value.
|
|
2914
|
+
* @default 1
|
|
2915
|
+
*/
|
|
2916
|
+
judgeReps?: number;
|
|
2917
|
+
/**
|
|
2918
|
+
* Golden/expected answer for this case.
|
|
2919
|
+
* When set, automatically passed as `reference` to the LLM judge
|
|
2920
|
+
* (unless passesJudge.reference is explicitly provided).
|
|
2921
|
+
* Mirrors EvalV2's `canonical_answer` field.
|
|
2922
|
+
*/
|
|
2923
|
+
canonicalAnswer?: string;
|
|
2924
|
+
/**
|
|
2925
|
+
* Arbitrary string labels for this case.
|
|
2926
|
+
* Use for filtering eval runs with `EvalRunnerOptions.filterTags`
|
|
2927
|
+
* and for slicing results by category.
|
|
2928
|
+
*
|
|
2929
|
+
* @example ['tool-finding', 'multi-hop', 'search']
|
|
2930
|
+
*/
|
|
2931
|
+
tags?: string[];
|
|
2426
2932
|
/**
|
|
2427
2933
|
* Expectations to validate against the tool response
|
|
2428
2934
|
*
|
|
@@ -2486,14 +2992,30 @@ interface EvalExpectBlock {
|
|
|
2486
2992
|
* LLM-as-judge evaluation (toPassToolJudge)
|
|
2487
2993
|
*/
|
|
2488
2994
|
passesJudge?: {
|
|
2489
|
-
/**
|
|
2490
|
-
rubric:
|
|
2995
|
+
/** Built-in rubric name or custom rubric object */
|
|
2996
|
+
rubric: BuiltInRubric | {
|
|
2997
|
+
text: string;
|
|
2998
|
+
};
|
|
2491
2999
|
/** Reference response to compare against */
|
|
2492
3000
|
reference?: unknown;
|
|
2493
3001
|
/** Score threshold for passing (0-1, default: 0.7) */
|
|
2494
3002
|
threshold?: number;
|
|
2495
|
-
/**
|
|
2496
|
-
|
|
3003
|
+
/** Number of judge evaluations for this assertion. Overrides EvalCase.judgeReps. */
|
|
3004
|
+
reps?: number;
|
|
3005
|
+
/** Judge provider. @default 'anthropic' */
|
|
3006
|
+
provider?: 'anthropic' | 'openai' | 'google';
|
|
3007
|
+
/** Model override (e.g., 'claude-opus-4-20250514') */
|
|
3008
|
+
model?: string;
|
|
3009
|
+
/** Environment variable name for API key */
|
|
3010
|
+
apiKeyEnvVar?: string;
|
|
3011
|
+
/** Max tokens for judge response */
|
|
3012
|
+
maxTokens?: number;
|
|
3013
|
+
/** Temperature for judge LLM (0–1) */
|
|
3014
|
+
temperature?: number;
|
|
3015
|
+
/** Max budget in USD per evaluation */
|
|
3016
|
+
maxBudgetUsd?: number;
|
|
3017
|
+
/** Fail if response exceeds this size in bytes before judging */
|
|
3018
|
+
maxToolOutputSize?: number;
|
|
2497
3019
|
};
|
|
2498
3020
|
/**
|
|
2499
3021
|
* Response size validation (toHaveToolResponseSize)
|
|
@@ -2504,6 +3026,39 @@ interface EvalExpectBlock {
|
|
|
2504
3026
|
/** Minimum required size in bytes */
|
|
2505
3027
|
minBytes?: number;
|
|
2506
3028
|
};
|
|
3029
|
+
/**
|
|
3030
|
+
* Asserts which tools the LLM called during an llm_host simulation.
|
|
3031
|
+
* Only meaningful for llm_host mode — direct mode has no tool call trace.
|
|
3032
|
+
*/
|
|
3033
|
+
toolsTriggered?: {
|
|
3034
|
+
/** Expected tool calls */
|
|
3035
|
+
calls: Array<{
|
|
3036
|
+
/** Tool name */
|
|
3037
|
+
name: string;
|
|
3038
|
+
/** Expected arguments (partial match — extra keys are allowed) */
|
|
3039
|
+
arguments?: Record<string, unknown>;
|
|
3040
|
+
/** Whether this call MUST have been made (default: true) */
|
|
3041
|
+
required?: boolean;
|
|
3042
|
+
}>;
|
|
3043
|
+
/**
|
|
3044
|
+
* 'strict': calls must appear in the exact order listed
|
|
3045
|
+
* 'any': calls can appear in any order (default)
|
|
3046
|
+
*/
|
|
3047
|
+
order?: 'strict' | 'any';
|
|
3048
|
+
/** If true, no tool calls outside the `calls` list are allowed */
|
|
3049
|
+
exclusive?: boolean;
|
|
3050
|
+
};
|
|
3051
|
+
/**
|
|
3052
|
+
* Asserts the number of tool calls made during an llm_host simulation.
|
|
3053
|
+
*/
|
|
3054
|
+
toolCallCount?: {
|
|
3055
|
+
/** Minimum number of tool calls */
|
|
3056
|
+
min?: number;
|
|
3057
|
+
/** Maximum number of tool calls */
|
|
3058
|
+
max?: number;
|
|
3059
|
+
/** Exact number of tool calls */
|
|
3060
|
+
exact?: number;
|
|
3061
|
+
};
|
|
2507
3062
|
}
|
|
2508
3063
|
/**
|
|
2509
3064
|
* A complete eval dataset containing multiple test cases
|
|
@@ -2543,21 +3098,21 @@ declare const EvalCaseSchema: z.ZodObject<{
|
|
|
2543
3098
|
args: z.ZodOptional<z.ZodRecord<z.ZodString, z.ZodUnknown>>;
|
|
2544
3099
|
scenario: z.ZodOptional<z.ZodString>;
|
|
2545
3100
|
llmHostConfig: z.ZodOptional<z.ZodObject<{
|
|
2546
|
-
provider: z.ZodEnum<["openai", "anthropic"]>;
|
|
3101
|
+
provider: z.ZodEnum<["openai", "anthropic", "azure", "google", "mistral", "deepseek", "openrouter", "xai", "vertex-anthropic"]>;
|
|
2547
3102
|
apiKeyEnvVar: z.ZodOptional<z.ZodString>;
|
|
2548
3103
|
model: z.ZodOptional<z.ZodString>;
|
|
2549
3104
|
maxTokens: z.ZodOptional<z.ZodNumber>;
|
|
2550
3105
|
temperature: z.ZodOptional<z.ZodNumber>;
|
|
2551
3106
|
maxToolCalls: z.ZodOptional<z.ZodNumber>;
|
|
2552
3107
|
}, "strip", z.ZodTypeAny, {
|
|
2553
|
-
provider: "anthropic" | "
|
|
3108
|
+
provider: "openai" | "anthropic" | "azure" | "google" | "mistral" | "deepseek" | "openrouter" | "xai" | "vertex-anthropic";
|
|
2554
3109
|
model?: string | undefined;
|
|
2555
3110
|
maxTokens?: number | undefined;
|
|
2556
3111
|
apiKeyEnvVar?: string | undefined;
|
|
2557
3112
|
temperature?: number | undefined;
|
|
2558
3113
|
maxToolCalls?: number | undefined;
|
|
2559
3114
|
}, {
|
|
2560
|
-
provider: "anthropic" | "
|
|
3115
|
+
provider: "openai" | "anthropic" | "azure" | "google" | "mistral" | "deepseek" | "openrouter" | "xai" | "vertex-anthropic";
|
|
2561
3116
|
model?: string | undefined;
|
|
2562
3117
|
maxTokens?: number | undefined;
|
|
2563
3118
|
apiKeyEnvVar?: string | undefined;
|
|
@@ -2565,6 +3120,11 @@ declare const EvalCaseSchema: z.ZodObject<{
|
|
|
2565
3120
|
maxToolCalls?: number | undefined;
|
|
2566
3121
|
}>>;
|
|
2567
3122
|
metadata: z.ZodOptional<z.ZodRecord<z.ZodString, z.ZodUnknown>>;
|
|
3123
|
+
iterations: z.ZodOptional<z.ZodNumber>;
|
|
3124
|
+
accuracyThreshold: z.ZodOptional<z.ZodNumber>;
|
|
3125
|
+
judgeReps: z.ZodOptional<z.ZodNumber>;
|
|
3126
|
+
canonicalAnswer: z.ZodOptional<z.ZodString>;
|
|
3127
|
+
tags: z.ZodOptional<z.ZodArray<z.ZodString, "many">>;
|
|
2568
3128
|
expect: z.ZodOptional<z.ZodObject<{
|
|
2569
3129
|
response: z.ZodOptional<z.ZodUnknown>;
|
|
2570
3130
|
schema: z.ZodOptional<z.ZodString>;
|
|
@@ -2589,20 +3149,51 @@ declare const EvalCaseSchema: z.ZodObject<{
|
|
|
2589
3149
|
}>]>, "many">>;
|
|
2590
3150
|
isError: z.ZodOptional<z.ZodUnion<[z.ZodBoolean, z.ZodString, z.ZodArray<z.ZodString, "many">]>>;
|
|
2591
3151
|
passesJudge: z.ZodOptional<z.ZodObject<{
|
|
2592
|
-
rubric: z.
|
|
3152
|
+
rubric: z.ZodUnion<[z.ZodEnum<["correctness", "completeness", "groundedness", "instruction-following", "conciseness"]>, z.ZodObject<{
|
|
3153
|
+
text: z.ZodString;
|
|
3154
|
+
}, "strip", z.ZodTypeAny, {
|
|
3155
|
+
text: string;
|
|
3156
|
+
}, {
|
|
3157
|
+
text: string;
|
|
3158
|
+
}>]>;
|
|
2593
3159
|
reference: z.ZodOptional<z.ZodUnknown>;
|
|
2594
3160
|
threshold: z.ZodOptional<z.ZodNumber>;
|
|
2595
|
-
|
|
3161
|
+
reps: z.ZodOptional<z.ZodNumber>;
|
|
3162
|
+
provider: z.ZodOptional<z.ZodEnum<["anthropic", "openai", "google"]>>;
|
|
3163
|
+
model: z.ZodOptional<z.ZodString>;
|
|
3164
|
+
apiKeyEnvVar: z.ZodOptional<z.ZodString>;
|
|
3165
|
+
maxTokens: z.ZodOptional<z.ZodNumber>;
|
|
3166
|
+
temperature: z.ZodOptional<z.ZodNumber>;
|
|
3167
|
+
maxBudgetUsd: z.ZodOptional<z.ZodNumber>;
|
|
3168
|
+
maxToolOutputSize: z.ZodOptional<z.ZodNumber>;
|
|
2596
3169
|
}, "strip", z.ZodTypeAny, {
|
|
2597
|
-
rubric:
|
|
3170
|
+
rubric: "correctness" | "completeness" | "groundedness" | "instruction-following" | "conciseness" | {
|
|
3171
|
+
text: string;
|
|
3172
|
+
};
|
|
3173
|
+
model?: string | undefined;
|
|
3174
|
+
maxTokens?: number | undefined;
|
|
3175
|
+
maxBudgetUsd?: number | undefined;
|
|
2598
3176
|
reference?: unknown;
|
|
2599
3177
|
threshold?: number | undefined;
|
|
2600
|
-
|
|
3178
|
+
reps?: number | undefined;
|
|
3179
|
+
provider?: "openai" | "anthropic" | "google" | undefined;
|
|
3180
|
+
apiKeyEnvVar?: string | undefined;
|
|
3181
|
+
temperature?: number | undefined;
|
|
3182
|
+
maxToolOutputSize?: number | undefined;
|
|
2601
3183
|
}, {
|
|
2602
|
-
rubric:
|
|
3184
|
+
rubric: "correctness" | "completeness" | "groundedness" | "instruction-following" | "conciseness" | {
|
|
3185
|
+
text: string;
|
|
3186
|
+
};
|
|
3187
|
+
model?: string | undefined;
|
|
3188
|
+
maxTokens?: number | undefined;
|
|
3189
|
+
maxBudgetUsd?: number | undefined;
|
|
2603
3190
|
reference?: unknown;
|
|
2604
3191
|
threshold?: number | undefined;
|
|
2605
|
-
|
|
3192
|
+
reps?: number | undefined;
|
|
3193
|
+
provider?: "openai" | "anthropic" | "google" | undefined;
|
|
3194
|
+
apiKeyEnvVar?: string | undefined;
|
|
3195
|
+
temperature?: number | undefined;
|
|
3196
|
+
maxToolOutputSize?: number | undefined;
|
|
2606
3197
|
}>>;
|
|
2607
3198
|
responseSize: z.ZodOptional<z.ZodObject<{
|
|
2608
3199
|
maxBytes: z.ZodOptional<z.ZodNumber>;
|
|
@@ -2614,47 +3205,139 @@ declare const EvalCaseSchema: z.ZodObject<{
|
|
|
2614
3205
|
maxBytes?: number | undefined;
|
|
2615
3206
|
minBytes?: number | undefined;
|
|
2616
3207
|
}>>;
|
|
3208
|
+
toolsTriggered: z.ZodOptional<z.ZodObject<{
|
|
3209
|
+
calls: z.ZodArray<z.ZodObject<{
|
|
3210
|
+
name: z.ZodString;
|
|
3211
|
+
arguments: z.ZodOptional<z.ZodRecord<z.ZodString, z.ZodUnknown>>;
|
|
3212
|
+
required: z.ZodOptional<z.ZodBoolean>;
|
|
3213
|
+
}, "strip", z.ZodTypeAny, {
|
|
3214
|
+
name: string;
|
|
3215
|
+
required?: boolean | undefined;
|
|
3216
|
+
arguments?: Record<string, unknown> | undefined;
|
|
3217
|
+
}, {
|
|
3218
|
+
name: string;
|
|
3219
|
+
required?: boolean | undefined;
|
|
3220
|
+
arguments?: Record<string, unknown> | undefined;
|
|
3221
|
+
}>, "many">;
|
|
3222
|
+
order: z.ZodOptional<z.ZodEnum<["strict", "any"]>>;
|
|
3223
|
+
exclusive: z.ZodOptional<z.ZodBoolean>;
|
|
3224
|
+
}, "strip", z.ZodTypeAny, {
|
|
3225
|
+
calls: {
|
|
3226
|
+
name: string;
|
|
3227
|
+
required?: boolean | undefined;
|
|
3228
|
+
arguments?: Record<string, unknown> | undefined;
|
|
3229
|
+
}[];
|
|
3230
|
+
order?: "strict" | "any" | undefined;
|
|
3231
|
+
exclusive?: boolean | undefined;
|
|
3232
|
+
}, {
|
|
3233
|
+
calls: {
|
|
3234
|
+
name: string;
|
|
3235
|
+
required?: boolean | undefined;
|
|
3236
|
+
arguments?: Record<string, unknown> | undefined;
|
|
3237
|
+
}[];
|
|
3238
|
+
order?: "strict" | "any" | undefined;
|
|
3239
|
+
exclusive?: boolean | undefined;
|
|
3240
|
+
}>>;
|
|
3241
|
+
toolCallCount: z.ZodOptional<z.ZodObject<{
|
|
3242
|
+
min: z.ZodOptional<z.ZodNumber>;
|
|
3243
|
+
max: z.ZodOptional<z.ZodNumber>;
|
|
3244
|
+
exact: z.ZodOptional<z.ZodNumber>;
|
|
3245
|
+
}, "strip", z.ZodTypeAny, {
|
|
3246
|
+
exact?: number | undefined;
|
|
3247
|
+
min?: number | undefined;
|
|
3248
|
+
max?: number | undefined;
|
|
3249
|
+
}, {
|
|
3250
|
+
exact?: number | undefined;
|
|
3251
|
+
min?: number | undefined;
|
|
3252
|
+
max?: number | undefined;
|
|
3253
|
+
}>>;
|
|
2617
3254
|
}, "strip", z.ZodTypeAny, {
|
|
3255
|
+
response?: unknown;
|
|
2618
3256
|
isError?: string | boolean | string[] | undefined;
|
|
2619
3257
|
schema?: string | undefined;
|
|
2620
3258
|
snapshot?: string | undefined;
|
|
2621
|
-
|
|
3259
|
+
toolsTriggered?: {
|
|
3260
|
+
calls: {
|
|
3261
|
+
name: string;
|
|
3262
|
+
required?: boolean | undefined;
|
|
3263
|
+
arguments?: Record<string, unknown> | undefined;
|
|
3264
|
+
}[];
|
|
3265
|
+
order?: "strict" | "any" | undefined;
|
|
3266
|
+
exclusive?: boolean | undefined;
|
|
3267
|
+
} | undefined;
|
|
3268
|
+
toolCallCount?: {
|
|
3269
|
+
exact?: number | undefined;
|
|
3270
|
+
min?: number | undefined;
|
|
3271
|
+
max?: number | undefined;
|
|
3272
|
+
} | undefined;
|
|
2622
3273
|
containsText?: string | string[] | undefined;
|
|
2623
3274
|
matchesPattern?: string | string[] | undefined;
|
|
2624
|
-
snapshotSanitizers?: ("
|
|
3275
|
+
snapshotSanitizers?: ("uuid" | "jwt" | "timestamp" | "iso-date" | "objectId" | {
|
|
2625
3276
|
pattern: string;
|
|
2626
3277
|
replacement?: string | undefined;
|
|
2627
3278
|
} | {
|
|
2628
3279
|
remove: string[];
|
|
2629
3280
|
})[] | undefined;
|
|
2630
3281
|
passesJudge?: {
|
|
2631
|
-
rubric:
|
|
3282
|
+
rubric: "correctness" | "completeness" | "groundedness" | "instruction-following" | "conciseness" | {
|
|
3283
|
+
text: string;
|
|
3284
|
+
};
|
|
3285
|
+
model?: string | undefined;
|
|
3286
|
+
maxTokens?: number | undefined;
|
|
3287
|
+
maxBudgetUsd?: number | undefined;
|
|
2632
3288
|
reference?: unknown;
|
|
2633
3289
|
threshold?: number | undefined;
|
|
2634
|
-
|
|
3290
|
+
reps?: number | undefined;
|
|
3291
|
+
provider?: "openai" | "anthropic" | "google" | undefined;
|
|
3292
|
+
apiKeyEnvVar?: string | undefined;
|
|
3293
|
+
temperature?: number | undefined;
|
|
3294
|
+
maxToolOutputSize?: number | undefined;
|
|
2635
3295
|
} | undefined;
|
|
2636
3296
|
responseSize?: {
|
|
2637
3297
|
maxBytes?: number | undefined;
|
|
2638
3298
|
minBytes?: number | undefined;
|
|
2639
3299
|
} | undefined;
|
|
2640
3300
|
}, {
|
|
3301
|
+
response?: unknown;
|
|
2641
3302
|
isError?: string | boolean | string[] | undefined;
|
|
2642
3303
|
schema?: string | undefined;
|
|
2643
3304
|
snapshot?: string | undefined;
|
|
2644
|
-
|
|
3305
|
+
toolsTriggered?: {
|
|
3306
|
+
calls: {
|
|
3307
|
+
name: string;
|
|
3308
|
+
required?: boolean | undefined;
|
|
3309
|
+
arguments?: Record<string, unknown> | undefined;
|
|
3310
|
+
}[];
|
|
3311
|
+
order?: "strict" | "any" | undefined;
|
|
3312
|
+
exclusive?: boolean | undefined;
|
|
3313
|
+
} | undefined;
|
|
3314
|
+
toolCallCount?: {
|
|
3315
|
+
exact?: number | undefined;
|
|
3316
|
+
min?: number | undefined;
|
|
3317
|
+
max?: number | undefined;
|
|
3318
|
+
} | undefined;
|
|
2645
3319
|
containsText?: string | string[] | undefined;
|
|
2646
3320
|
matchesPattern?: string | string[] | undefined;
|
|
2647
|
-
snapshotSanitizers?: ("
|
|
3321
|
+
snapshotSanitizers?: ("uuid" | "jwt" | "timestamp" | "iso-date" | "objectId" | {
|
|
2648
3322
|
pattern: string;
|
|
2649
3323
|
replacement?: string | undefined;
|
|
2650
3324
|
} | {
|
|
2651
3325
|
remove: string[];
|
|
2652
3326
|
})[] | undefined;
|
|
2653
3327
|
passesJudge?: {
|
|
2654
|
-
rubric:
|
|
3328
|
+
rubric: "correctness" | "completeness" | "groundedness" | "instruction-following" | "conciseness" | {
|
|
3329
|
+
text: string;
|
|
3330
|
+
};
|
|
3331
|
+
model?: string | undefined;
|
|
3332
|
+
maxTokens?: number | undefined;
|
|
3333
|
+
maxBudgetUsd?: number | undefined;
|
|
2655
3334
|
reference?: unknown;
|
|
2656
3335
|
threshold?: number | undefined;
|
|
2657
|
-
|
|
3336
|
+
reps?: number | undefined;
|
|
3337
|
+
provider?: "openai" | "anthropic" | "google" | undefined;
|
|
3338
|
+
apiKeyEnvVar?: string | undefined;
|
|
3339
|
+
temperature?: number | undefined;
|
|
3340
|
+
maxToolOutputSize?: number | undefined;
|
|
2658
3341
|
} | undefined;
|
|
2659
3342
|
responseSize?: {
|
|
2660
3343
|
maxBytes?: number | undefined;
|
|
@@ -2664,37 +3347,65 @@ declare const EvalCaseSchema: z.ZodObject<{
|
|
|
2664
3347
|
}, "strip", z.ZodTypeAny, {
|
|
2665
3348
|
id: string;
|
|
2666
3349
|
args?: Record<string, unknown> | undefined;
|
|
2667
|
-
metadata?: Record<string, unknown> | undefined;
|
|
2668
3350
|
mode?: "direct" | "llm_host" | undefined;
|
|
3351
|
+
metadata?: Record<string, unknown> | undefined;
|
|
2669
3352
|
description?: string | undefined;
|
|
2670
3353
|
toolName?: string | undefined;
|
|
2671
3354
|
scenario?: string | undefined;
|
|
2672
3355
|
llmHostConfig?: {
|
|
2673
|
-
provider: "anthropic" | "
|
|
3356
|
+
provider: "openai" | "anthropic" | "azure" | "google" | "mistral" | "deepseek" | "openrouter" | "xai" | "vertex-anthropic";
|
|
2674
3357
|
model?: string | undefined;
|
|
2675
3358
|
maxTokens?: number | undefined;
|
|
2676
3359
|
apiKeyEnvVar?: string | undefined;
|
|
2677
3360
|
temperature?: number | undefined;
|
|
2678
3361
|
maxToolCalls?: number | undefined;
|
|
2679
3362
|
} | undefined;
|
|
3363
|
+
iterations?: number | undefined;
|
|
3364
|
+
accuracyThreshold?: number | undefined;
|
|
3365
|
+
judgeReps?: number | undefined;
|
|
3366
|
+
canonicalAnswer?: string | undefined;
|
|
3367
|
+
tags?: string[] | undefined;
|
|
2680
3368
|
expect?: {
|
|
3369
|
+
response?: unknown;
|
|
2681
3370
|
isError?: string | boolean | string[] | undefined;
|
|
2682
3371
|
schema?: string | undefined;
|
|
2683
3372
|
snapshot?: string | undefined;
|
|
2684
|
-
|
|
3373
|
+
toolsTriggered?: {
|
|
3374
|
+
calls: {
|
|
3375
|
+
name: string;
|
|
3376
|
+
required?: boolean | undefined;
|
|
3377
|
+
arguments?: Record<string, unknown> | undefined;
|
|
3378
|
+
}[];
|
|
3379
|
+
order?: "strict" | "any" | undefined;
|
|
3380
|
+
exclusive?: boolean | undefined;
|
|
3381
|
+
} | undefined;
|
|
3382
|
+
toolCallCount?: {
|
|
3383
|
+
exact?: number | undefined;
|
|
3384
|
+
min?: number | undefined;
|
|
3385
|
+
max?: number | undefined;
|
|
3386
|
+
} | undefined;
|
|
2685
3387
|
containsText?: string | string[] | undefined;
|
|
2686
3388
|
matchesPattern?: string | string[] | undefined;
|
|
2687
|
-
snapshotSanitizers?: ("
|
|
3389
|
+
snapshotSanitizers?: ("uuid" | "jwt" | "timestamp" | "iso-date" | "objectId" | {
|
|
2688
3390
|
pattern: string;
|
|
2689
3391
|
replacement?: string | undefined;
|
|
2690
3392
|
} | {
|
|
2691
3393
|
remove: string[];
|
|
2692
3394
|
})[] | undefined;
|
|
2693
3395
|
passesJudge?: {
|
|
2694
|
-
rubric:
|
|
3396
|
+
rubric: "correctness" | "completeness" | "groundedness" | "instruction-following" | "conciseness" | {
|
|
3397
|
+
text: string;
|
|
3398
|
+
};
|
|
3399
|
+
model?: string | undefined;
|
|
3400
|
+
maxTokens?: number | undefined;
|
|
3401
|
+
maxBudgetUsd?: number | undefined;
|
|
2695
3402
|
reference?: unknown;
|
|
2696
3403
|
threshold?: number | undefined;
|
|
2697
|
-
|
|
3404
|
+
reps?: number | undefined;
|
|
3405
|
+
provider?: "openai" | "anthropic" | "google" | undefined;
|
|
3406
|
+
apiKeyEnvVar?: string | undefined;
|
|
3407
|
+
temperature?: number | undefined;
|
|
3408
|
+
maxToolOutputSize?: number | undefined;
|
|
2698
3409
|
} | undefined;
|
|
2699
3410
|
responseSize?: {
|
|
2700
3411
|
maxBytes?: number | undefined;
|
|
@@ -2704,37 +3415,65 @@ declare const EvalCaseSchema: z.ZodObject<{
|
|
|
2704
3415
|
}, {
|
|
2705
3416
|
id: string;
|
|
2706
3417
|
args?: Record<string, unknown> | undefined;
|
|
2707
|
-
metadata?: Record<string, unknown> | undefined;
|
|
2708
3418
|
mode?: "direct" | "llm_host" | undefined;
|
|
3419
|
+
metadata?: Record<string, unknown> | undefined;
|
|
2709
3420
|
description?: string | undefined;
|
|
2710
3421
|
toolName?: string | undefined;
|
|
2711
3422
|
scenario?: string | undefined;
|
|
2712
3423
|
llmHostConfig?: {
|
|
2713
|
-
provider: "anthropic" | "
|
|
3424
|
+
provider: "openai" | "anthropic" | "azure" | "google" | "mistral" | "deepseek" | "openrouter" | "xai" | "vertex-anthropic";
|
|
2714
3425
|
model?: string | undefined;
|
|
2715
3426
|
maxTokens?: number | undefined;
|
|
2716
3427
|
apiKeyEnvVar?: string | undefined;
|
|
2717
3428
|
temperature?: number | undefined;
|
|
2718
3429
|
maxToolCalls?: number | undefined;
|
|
2719
3430
|
} | undefined;
|
|
3431
|
+
iterations?: number | undefined;
|
|
3432
|
+
accuracyThreshold?: number | undefined;
|
|
3433
|
+
judgeReps?: number | undefined;
|
|
3434
|
+
canonicalAnswer?: string | undefined;
|
|
3435
|
+
tags?: string[] | undefined;
|
|
2720
3436
|
expect?: {
|
|
3437
|
+
response?: unknown;
|
|
2721
3438
|
isError?: string | boolean | string[] | undefined;
|
|
2722
3439
|
schema?: string | undefined;
|
|
2723
3440
|
snapshot?: string | undefined;
|
|
2724
|
-
|
|
3441
|
+
toolsTriggered?: {
|
|
3442
|
+
calls: {
|
|
3443
|
+
name: string;
|
|
3444
|
+
required?: boolean | undefined;
|
|
3445
|
+
arguments?: Record<string, unknown> | undefined;
|
|
3446
|
+
}[];
|
|
3447
|
+
order?: "strict" | "any" | undefined;
|
|
3448
|
+
exclusive?: boolean | undefined;
|
|
3449
|
+
} | undefined;
|
|
3450
|
+
toolCallCount?: {
|
|
3451
|
+
exact?: number | undefined;
|
|
3452
|
+
min?: number | undefined;
|
|
3453
|
+
max?: number | undefined;
|
|
3454
|
+
} | undefined;
|
|
2725
3455
|
containsText?: string | string[] | undefined;
|
|
2726
3456
|
matchesPattern?: string | string[] | undefined;
|
|
2727
|
-
snapshotSanitizers?: ("
|
|
3457
|
+
snapshotSanitizers?: ("uuid" | "jwt" | "timestamp" | "iso-date" | "objectId" | {
|
|
2728
3458
|
pattern: string;
|
|
2729
3459
|
replacement?: string | undefined;
|
|
2730
3460
|
} | {
|
|
2731
3461
|
remove: string[];
|
|
2732
3462
|
})[] | undefined;
|
|
2733
3463
|
passesJudge?: {
|
|
2734
|
-
rubric:
|
|
3464
|
+
rubric: "correctness" | "completeness" | "groundedness" | "instruction-following" | "conciseness" | {
|
|
3465
|
+
text: string;
|
|
3466
|
+
};
|
|
3467
|
+
model?: string | undefined;
|
|
3468
|
+
maxTokens?: number | undefined;
|
|
3469
|
+
maxBudgetUsd?: number | undefined;
|
|
2735
3470
|
reference?: unknown;
|
|
2736
3471
|
threshold?: number | undefined;
|
|
2737
|
-
|
|
3472
|
+
reps?: number | undefined;
|
|
3473
|
+
provider?: "openai" | "anthropic" | "google" | undefined;
|
|
3474
|
+
apiKeyEnvVar?: string | undefined;
|
|
3475
|
+
temperature?: number | undefined;
|
|
3476
|
+
maxToolOutputSize?: number | undefined;
|
|
2738
3477
|
} | undefined;
|
|
2739
3478
|
responseSize?: {
|
|
2740
3479
|
maxBytes?: number | undefined;
|
|
@@ -2756,21 +3495,21 @@ declare const EvalDatasetSchema: z.ZodObject<{
|
|
|
2756
3495
|
args: z.ZodOptional<z.ZodRecord<z.ZodString, z.ZodUnknown>>;
|
|
2757
3496
|
scenario: z.ZodOptional<z.ZodString>;
|
|
2758
3497
|
llmHostConfig: z.ZodOptional<z.ZodObject<{
|
|
2759
|
-
provider: z.ZodEnum<["openai", "anthropic"]>;
|
|
3498
|
+
provider: z.ZodEnum<["openai", "anthropic", "azure", "google", "mistral", "deepseek", "openrouter", "xai", "vertex-anthropic"]>;
|
|
2760
3499
|
apiKeyEnvVar: z.ZodOptional<z.ZodString>;
|
|
2761
3500
|
model: z.ZodOptional<z.ZodString>;
|
|
2762
3501
|
maxTokens: z.ZodOptional<z.ZodNumber>;
|
|
2763
3502
|
temperature: z.ZodOptional<z.ZodNumber>;
|
|
2764
3503
|
maxToolCalls: z.ZodOptional<z.ZodNumber>;
|
|
2765
3504
|
}, "strip", z.ZodTypeAny, {
|
|
2766
|
-
provider: "anthropic" | "
|
|
3505
|
+
provider: "openai" | "anthropic" | "azure" | "google" | "mistral" | "deepseek" | "openrouter" | "xai" | "vertex-anthropic";
|
|
2767
3506
|
model?: string | undefined;
|
|
2768
3507
|
maxTokens?: number | undefined;
|
|
2769
3508
|
apiKeyEnvVar?: string | undefined;
|
|
2770
3509
|
temperature?: number | undefined;
|
|
2771
3510
|
maxToolCalls?: number | undefined;
|
|
2772
3511
|
}, {
|
|
2773
|
-
provider: "anthropic" | "
|
|
3512
|
+
provider: "openai" | "anthropic" | "azure" | "google" | "mistral" | "deepseek" | "openrouter" | "xai" | "vertex-anthropic";
|
|
2774
3513
|
model?: string | undefined;
|
|
2775
3514
|
maxTokens?: number | undefined;
|
|
2776
3515
|
apiKeyEnvVar?: string | undefined;
|
|
@@ -2778,6 +3517,11 @@ declare const EvalDatasetSchema: z.ZodObject<{
|
|
|
2778
3517
|
maxToolCalls?: number | undefined;
|
|
2779
3518
|
}>>;
|
|
2780
3519
|
metadata: z.ZodOptional<z.ZodRecord<z.ZodString, z.ZodUnknown>>;
|
|
3520
|
+
iterations: z.ZodOptional<z.ZodNumber>;
|
|
3521
|
+
accuracyThreshold: z.ZodOptional<z.ZodNumber>;
|
|
3522
|
+
judgeReps: z.ZodOptional<z.ZodNumber>;
|
|
3523
|
+
canonicalAnswer: z.ZodOptional<z.ZodString>;
|
|
3524
|
+
tags: z.ZodOptional<z.ZodArray<z.ZodString, "many">>;
|
|
2781
3525
|
expect: z.ZodOptional<z.ZodObject<{
|
|
2782
3526
|
response: z.ZodOptional<z.ZodUnknown>;
|
|
2783
3527
|
schema: z.ZodOptional<z.ZodString>;
|
|
@@ -2802,20 +3546,51 @@ declare const EvalDatasetSchema: z.ZodObject<{
|
|
|
2802
3546
|
}>]>, "many">>;
|
|
2803
3547
|
isError: z.ZodOptional<z.ZodUnion<[z.ZodBoolean, z.ZodString, z.ZodArray<z.ZodString, "many">]>>;
|
|
2804
3548
|
passesJudge: z.ZodOptional<z.ZodObject<{
|
|
2805
|
-
rubric: z.
|
|
3549
|
+
rubric: z.ZodUnion<[z.ZodEnum<["correctness", "completeness", "groundedness", "instruction-following", "conciseness"]>, z.ZodObject<{
|
|
3550
|
+
text: z.ZodString;
|
|
3551
|
+
}, "strip", z.ZodTypeAny, {
|
|
3552
|
+
text: string;
|
|
3553
|
+
}, {
|
|
3554
|
+
text: string;
|
|
3555
|
+
}>]>;
|
|
2806
3556
|
reference: z.ZodOptional<z.ZodUnknown>;
|
|
2807
3557
|
threshold: z.ZodOptional<z.ZodNumber>;
|
|
2808
|
-
|
|
3558
|
+
reps: z.ZodOptional<z.ZodNumber>;
|
|
3559
|
+
provider: z.ZodOptional<z.ZodEnum<["anthropic", "openai", "google"]>>;
|
|
3560
|
+
model: z.ZodOptional<z.ZodString>;
|
|
3561
|
+
apiKeyEnvVar: z.ZodOptional<z.ZodString>;
|
|
3562
|
+
maxTokens: z.ZodOptional<z.ZodNumber>;
|
|
3563
|
+
temperature: z.ZodOptional<z.ZodNumber>;
|
|
3564
|
+
maxBudgetUsd: z.ZodOptional<z.ZodNumber>;
|
|
3565
|
+
maxToolOutputSize: z.ZodOptional<z.ZodNumber>;
|
|
2809
3566
|
}, "strip", z.ZodTypeAny, {
|
|
2810
|
-
rubric:
|
|
3567
|
+
rubric: "correctness" | "completeness" | "groundedness" | "instruction-following" | "conciseness" | {
|
|
3568
|
+
text: string;
|
|
3569
|
+
};
|
|
3570
|
+
model?: string | undefined;
|
|
3571
|
+
maxTokens?: number | undefined;
|
|
3572
|
+
maxBudgetUsd?: number | undefined;
|
|
2811
3573
|
reference?: unknown;
|
|
2812
3574
|
threshold?: number | undefined;
|
|
2813
|
-
|
|
3575
|
+
reps?: number | undefined;
|
|
3576
|
+
provider?: "openai" | "anthropic" | "google" | undefined;
|
|
3577
|
+
apiKeyEnvVar?: string | undefined;
|
|
3578
|
+
temperature?: number | undefined;
|
|
3579
|
+
maxToolOutputSize?: number | undefined;
|
|
2814
3580
|
}, {
|
|
2815
|
-
rubric:
|
|
3581
|
+
rubric: "correctness" | "completeness" | "groundedness" | "instruction-following" | "conciseness" | {
|
|
3582
|
+
text: string;
|
|
3583
|
+
};
|
|
3584
|
+
model?: string | undefined;
|
|
3585
|
+
maxTokens?: number | undefined;
|
|
3586
|
+
maxBudgetUsd?: number | undefined;
|
|
2816
3587
|
reference?: unknown;
|
|
2817
3588
|
threshold?: number | undefined;
|
|
2818
|
-
|
|
3589
|
+
reps?: number | undefined;
|
|
3590
|
+
provider?: "openai" | "anthropic" | "google" | undefined;
|
|
3591
|
+
apiKeyEnvVar?: string | undefined;
|
|
3592
|
+
temperature?: number | undefined;
|
|
3593
|
+
maxToolOutputSize?: number | undefined;
|
|
2819
3594
|
}>>;
|
|
2820
3595
|
responseSize: z.ZodOptional<z.ZodObject<{
|
|
2821
3596
|
maxBytes: z.ZodOptional<z.ZodNumber>;
|
|
@@ -2827,47 +3602,139 @@ declare const EvalDatasetSchema: z.ZodObject<{
|
|
|
2827
3602
|
maxBytes?: number | undefined;
|
|
2828
3603
|
minBytes?: number | undefined;
|
|
2829
3604
|
}>>;
|
|
3605
|
+
toolsTriggered: z.ZodOptional<z.ZodObject<{
|
|
3606
|
+
calls: z.ZodArray<z.ZodObject<{
|
|
3607
|
+
name: z.ZodString;
|
|
3608
|
+
arguments: z.ZodOptional<z.ZodRecord<z.ZodString, z.ZodUnknown>>;
|
|
3609
|
+
required: z.ZodOptional<z.ZodBoolean>;
|
|
3610
|
+
}, "strip", z.ZodTypeAny, {
|
|
3611
|
+
name: string;
|
|
3612
|
+
required?: boolean | undefined;
|
|
3613
|
+
arguments?: Record<string, unknown> | undefined;
|
|
3614
|
+
}, {
|
|
3615
|
+
name: string;
|
|
3616
|
+
required?: boolean | undefined;
|
|
3617
|
+
arguments?: Record<string, unknown> | undefined;
|
|
3618
|
+
}>, "many">;
|
|
3619
|
+
order: z.ZodOptional<z.ZodEnum<["strict", "any"]>>;
|
|
3620
|
+
exclusive: z.ZodOptional<z.ZodBoolean>;
|
|
3621
|
+
}, "strip", z.ZodTypeAny, {
|
|
3622
|
+
calls: {
|
|
3623
|
+
name: string;
|
|
3624
|
+
required?: boolean | undefined;
|
|
3625
|
+
arguments?: Record<string, unknown> | undefined;
|
|
3626
|
+
}[];
|
|
3627
|
+
order?: "strict" | "any" | undefined;
|
|
3628
|
+
exclusive?: boolean | undefined;
|
|
3629
|
+
}, {
|
|
3630
|
+
calls: {
|
|
3631
|
+
name: string;
|
|
3632
|
+
required?: boolean | undefined;
|
|
3633
|
+
arguments?: Record<string, unknown> | undefined;
|
|
3634
|
+
}[];
|
|
3635
|
+
order?: "strict" | "any" | undefined;
|
|
3636
|
+
exclusive?: boolean | undefined;
|
|
3637
|
+
}>>;
|
|
3638
|
+
toolCallCount: z.ZodOptional<z.ZodObject<{
|
|
3639
|
+
min: z.ZodOptional<z.ZodNumber>;
|
|
3640
|
+
max: z.ZodOptional<z.ZodNumber>;
|
|
3641
|
+
exact: z.ZodOptional<z.ZodNumber>;
|
|
3642
|
+
}, "strip", z.ZodTypeAny, {
|
|
3643
|
+
exact?: number | undefined;
|
|
3644
|
+
min?: number | undefined;
|
|
3645
|
+
max?: number | undefined;
|
|
3646
|
+
}, {
|
|
3647
|
+
exact?: number | undefined;
|
|
3648
|
+
min?: number | undefined;
|
|
3649
|
+
max?: number | undefined;
|
|
3650
|
+
}>>;
|
|
2830
3651
|
}, "strip", z.ZodTypeAny, {
|
|
3652
|
+
response?: unknown;
|
|
2831
3653
|
isError?: string | boolean | string[] | undefined;
|
|
2832
3654
|
schema?: string | undefined;
|
|
2833
3655
|
snapshot?: string | undefined;
|
|
2834
|
-
|
|
3656
|
+
toolsTriggered?: {
|
|
3657
|
+
calls: {
|
|
3658
|
+
name: string;
|
|
3659
|
+
required?: boolean | undefined;
|
|
3660
|
+
arguments?: Record<string, unknown> | undefined;
|
|
3661
|
+
}[];
|
|
3662
|
+
order?: "strict" | "any" | undefined;
|
|
3663
|
+
exclusive?: boolean | undefined;
|
|
3664
|
+
} | undefined;
|
|
3665
|
+
toolCallCount?: {
|
|
3666
|
+
exact?: number | undefined;
|
|
3667
|
+
min?: number | undefined;
|
|
3668
|
+
max?: number | undefined;
|
|
3669
|
+
} | undefined;
|
|
2835
3670
|
containsText?: string | string[] | undefined;
|
|
2836
3671
|
matchesPattern?: string | string[] | undefined;
|
|
2837
|
-
snapshotSanitizers?: ("
|
|
3672
|
+
snapshotSanitizers?: ("uuid" | "jwt" | "timestamp" | "iso-date" | "objectId" | {
|
|
2838
3673
|
pattern: string;
|
|
2839
3674
|
replacement?: string | undefined;
|
|
2840
3675
|
} | {
|
|
2841
3676
|
remove: string[];
|
|
2842
3677
|
})[] | undefined;
|
|
2843
3678
|
passesJudge?: {
|
|
2844
|
-
rubric:
|
|
3679
|
+
rubric: "correctness" | "completeness" | "groundedness" | "instruction-following" | "conciseness" | {
|
|
3680
|
+
text: string;
|
|
3681
|
+
};
|
|
3682
|
+
model?: string | undefined;
|
|
3683
|
+
maxTokens?: number | undefined;
|
|
3684
|
+
maxBudgetUsd?: number | undefined;
|
|
2845
3685
|
reference?: unknown;
|
|
2846
3686
|
threshold?: number | undefined;
|
|
2847
|
-
|
|
3687
|
+
reps?: number | undefined;
|
|
3688
|
+
provider?: "openai" | "anthropic" | "google" | undefined;
|
|
3689
|
+
apiKeyEnvVar?: string | undefined;
|
|
3690
|
+
temperature?: number | undefined;
|
|
3691
|
+
maxToolOutputSize?: number | undefined;
|
|
2848
3692
|
} | undefined;
|
|
2849
3693
|
responseSize?: {
|
|
2850
3694
|
maxBytes?: number | undefined;
|
|
2851
3695
|
minBytes?: number | undefined;
|
|
2852
3696
|
} | undefined;
|
|
2853
3697
|
}, {
|
|
3698
|
+
response?: unknown;
|
|
2854
3699
|
isError?: string | boolean | string[] | undefined;
|
|
2855
3700
|
schema?: string | undefined;
|
|
2856
3701
|
snapshot?: string | undefined;
|
|
2857
|
-
|
|
3702
|
+
toolsTriggered?: {
|
|
3703
|
+
calls: {
|
|
3704
|
+
name: string;
|
|
3705
|
+
required?: boolean | undefined;
|
|
3706
|
+
arguments?: Record<string, unknown> | undefined;
|
|
3707
|
+
}[];
|
|
3708
|
+
order?: "strict" | "any" | undefined;
|
|
3709
|
+
exclusive?: boolean | undefined;
|
|
3710
|
+
} | undefined;
|
|
3711
|
+
toolCallCount?: {
|
|
3712
|
+
exact?: number | undefined;
|
|
3713
|
+
min?: number | undefined;
|
|
3714
|
+
max?: number | undefined;
|
|
3715
|
+
} | undefined;
|
|
2858
3716
|
containsText?: string | string[] | undefined;
|
|
2859
3717
|
matchesPattern?: string | string[] | undefined;
|
|
2860
|
-
snapshotSanitizers?: ("
|
|
3718
|
+
snapshotSanitizers?: ("uuid" | "jwt" | "timestamp" | "iso-date" | "objectId" | {
|
|
2861
3719
|
pattern: string;
|
|
2862
3720
|
replacement?: string | undefined;
|
|
2863
3721
|
} | {
|
|
2864
3722
|
remove: string[];
|
|
2865
3723
|
})[] | undefined;
|
|
2866
3724
|
passesJudge?: {
|
|
2867
|
-
rubric:
|
|
3725
|
+
rubric: "correctness" | "completeness" | "groundedness" | "instruction-following" | "conciseness" | {
|
|
3726
|
+
text: string;
|
|
3727
|
+
};
|
|
3728
|
+
model?: string | undefined;
|
|
3729
|
+
maxTokens?: number | undefined;
|
|
3730
|
+
maxBudgetUsd?: number | undefined;
|
|
2868
3731
|
reference?: unknown;
|
|
2869
3732
|
threshold?: number | undefined;
|
|
2870
|
-
|
|
3733
|
+
reps?: number | undefined;
|
|
3734
|
+
provider?: "openai" | "anthropic" | "google" | undefined;
|
|
3735
|
+
apiKeyEnvVar?: string | undefined;
|
|
3736
|
+
temperature?: number | undefined;
|
|
3737
|
+
maxToolOutputSize?: number | undefined;
|
|
2871
3738
|
} | undefined;
|
|
2872
3739
|
responseSize?: {
|
|
2873
3740
|
maxBytes?: number | undefined;
|
|
@@ -2877,37 +3744,65 @@ declare const EvalDatasetSchema: z.ZodObject<{
|
|
|
2877
3744
|
}, "strip", z.ZodTypeAny, {
|
|
2878
3745
|
id: string;
|
|
2879
3746
|
args?: Record<string, unknown> | undefined;
|
|
2880
|
-
metadata?: Record<string, unknown> | undefined;
|
|
2881
3747
|
mode?: "direct" | "llm_host" | undefined;
|
|
3748
|
+
metadata?: Record<string, unknown> | undefined;
|
|
2882
3749
|
description?: string | undefined;
|
|
2883
3750
|
toolName?: string | undefined;
|
|
2884
3751
|
scenario?: string | undefined;
|
|
2885
3752
|
llmHostConfig?: {
|
|
2886
|
-
provider: "anthropic" | "
|
|
3753
|
+
provider: "openai" | "anthropic" | "azure" | "google" | "mistral" | "deepseek" | "openrouter" | "xai" | "vertex-anthropic";
|
|
2887
3754
|
model?: string | undefined;
|
|
2888
3755
|
maxTokens?: number | undefined;
|
|
2889
3756
|
apiKeyEnvVar?: string | undefined;
|
|
2890
3757
|
temperature?: number | undefined;
|
|
2891
3758
|
maxToolCalls?: number | undefined;
|
|
2892
3759
|
} | undefined;
|
|
3760
|
+
iterations?: number | undefined;
|
|
3761
|
+
accuracyThreshold?: number | undefined;
|
|
3762
|
+
judgeReps?: number | undefined;
|
|
3763
|
+
canonicalAnswer?: string | undefined;
|
|
3764
|
+
tags?: string[] | undefined;
|
|
2893
3765
|
expect?: {
|
|
3766
|
+
response?: unknown;
|
|
2894
3767
|
isError?: string | boolean | string[] | undefined;
|
|
2895
3768
|
schema?: string | undefined;
|
|
2896
3769
|
snapshot?: string | undefined;
|
|
2897
|
-
|
|
3770
|
+
toolsTriggered?: {
|
|
3771
|
+
calls: {
|
|
3772
|
+
name: string;
|
|
3773
|
+
required?: boolean | undefined;
|
|
3774
|
+
arguments?: Record<string, unknown> | undefined;
|
|
3775
|
+
}[];
|
|
3776
|
+
order?: "strict" | "any" | undefined;
|
|
3777
|
+
exclusive?: boolean | undefined;
|
|
3778
|
+
} | undefined;
|
|
3779
|
+
toolCallCount?: {
|
|
3780
|
+
exact?: number | undefined;
|
|
3781
|
+
min?: number | undefined;
|
|
3782
|
+
max?: number | undefined;
|
|
3783
|
+
} | undefined;
|
|
2898
3784
|
containsText?: string | string[] | undefined;
|
|
2899
3785
|
matchesPattern?: string | string[] | undefined;
|
|
2900
|
-
snapshotSanitizers?: ("
|
|
3786
|
+
snapshotSanitizers?: ("uuid" | "jwt" | "timestamp" | "iso-date" | "objectId" | {
|
|
2901
3787
|
pattern: string;
|
|
2902
3788
|
replacement?: string | undefined;
|
|
2903
3789
|
} | {
|
|
2904
3790
|
remove: string[];
|
|
2905
3791
|
})[] | undefined;
|
|
2906
3792
|
passesJudge?: {
|
|
2907
|
-
rubric:
|
|
3793
|
+
rubric: "correctness" | "completeness" | "groundedness" | "instruction-following" | "conciseness" | {
|
|
3794
|
+
text: string;
|
|
3795
|
+
};
|
|
3796
|
+
model?: string | undefined;
|
|
3797
|
+
maxTokens?: number | undefined;
|
|
3798
|
+
maxBudgetUsd?: number | undefined;
|
|
2908
3799
|
reference?: unknown;
|
|
2909
3800
|
threshold?: number | undefined;
|
|
2910
|
-
|
|
3801
|
+
reps?: number | undefined;
|
|
3802
|
+
provider?: "openai" | "anthropic" | "google" | undefined;
|
|
3803
|
+
apiKeyEnvVar?: string | undefined;
|
|
3804
|
+
temperature?: number | undefined;
|
|
3805
|
+
maxToolOutputSize?: number | undefined;
|
|
2911
3806
|
} | undefined;
|
|
2912
3807
|
responseSize?: {
|
|
2913
3808
|
maxBytes?: number | undefined;
|
|
@@ -2917,37 +3812,65 @@ declare const EvalDatasetSchema: z.ZodObject<{
|
|
|
2917
3812
|
}, {
|
|
2918
3813
|
id: string;
|
|
2919
3814
|
args?: Record<string, unknown> | undefined;
|
|
2920
|
-
metadata?: Record<string, unknown> | undefined;
|
|
2921
3815
|
mode?: "direct" | "llm_host" | undefined;
|
|
3816
|
+
metadata?: Record<string, unknown> | undefined;
|
|
2922
3817
|
description?: string | undefined;
|
|
2923
3818
|
toolName?: string | undefined;
|
|
2924
3819
|
scenario?: string | undefined;
|
|
2925
3820
|
llmHostConfig?: {
|
|
2926
|
-
provider: "anthropic" | "
|
|
3821
|
+
provider: "openai" | "anthropic" | "azure" | "google" | "mistral" | "deepseek" | "openrouter" | "xai" | "vertex-anthropic";
|
|
2927
3822
|
model?: string | undefined;
|
|
2928
3823
|
maxTokens?: number | undefined;
|
|
2929
3824
|
apiKeyEnvVar?: string | undefined;
|
|
2930
3825
|
temperature?: number | undefined;
|
|
2931
3826
|
maxToolCalls?: number | undefined;
|
|
2932
3827
|
} | undefined;
|
|
3828
|
+
iterations?: number | undefined;
|
|
3829
|
+
accuracyThreshold?: number | undefined;
|
|
3830
|
+
judgeReps?: number | undefined;
|
|
3831
|
+
canonicalAnswer?: string | undefined;
|
|
3832
|
+
tags?: string[] | undefined;
|
|
2933
3833
|
expect?: {
|
|
3834
|
+
response?: unknown;
|
|
2934
3835
|
isError?: string | boolean | string[] | undefined;
|
|
2935
3836
|
schema?: string | undefined;
|
|
2936
3837
|
snapshot?: string | undefined;
|
|
2937
|
-
|
|
3838
|
+
toolsTriggered?: {
|
|
3839
|
+
calls: {
|
|
3840
|
+
name: string;
|
|
3841
|
+
required?: boolean | undefined;
|
|
3842
|
+
arguments?: Record<string, unknown> | undefined;
|
|
3843
|
+
}[];
|
|
3844
|
+
order?: "strict" | "any" | undefined;
|
|
3845
|
+
exclusive?: boolean | undefined;
|
|
3846
|
+
} | undefined;
|
|
3847
|
+
toolCallCount?: {
|
|
3848
|
+
exact?: number | undefined;
|
|
3849
|
+
min?: number | undefined;
|
|
3850
|
+
max?: number | undefined;
|
|
3851
|
+
} | undefined;
|
|
2938
3852
|
containsText?: string | string[] | undefined;
|
|
2939
3853
|
matchesPattern?: string | string[] | undefined;
|
|
2940
|
-
snapshotSanitizers?: ("
|
|
3854
|
+
snapshotSanitizers?: ("uuid" | "jwt" | "timestamp" | "iso-date" | "objectId" | {
|
|
2941
3855
|
pattern: string;
|
|
2942
3856
|
replacement?: string | undefined;
|
|
2943
3857
|
} | {
|
|
2944
3858
|
remove: string[];
|
|
2945
3859
|
})[] | undefined;
|
|
2946
3860
|
passesJudge?: {
|
|
2947
|
-
rubric:
|
|
3861
|
+
rubric: "correctness" | "completeness" | "groundedness" | "instruction-following" | "conciseness" | {
|
|
3862
|
+
text: string;
|
|
3863
|
+
};
|
|
3864
|
+
model?: string | undefined;
|
|
3865
|
+
maxTokens?: number | undefined;
|
|
3866
|
+
maxBudgetUsd?: number | undefined;
|
|
2948
3867
|
reference?: unknown;
|
|
2949
3868
|
threshold?: number | undefined;
|
|
2950
|
-
|
|
3869
|
+
reps?: number | undefined;
|
|
3870
|
+
provider?: "openai" | "anthropic" | "google" | undefined;
|
|
3871
|
+
apiKeyEnvVar?: string | undefined;
|
|
3872
|
+
temperature?: number | undefined;
|
|
3873
|
+
maxToolOutputSize?: number | undefined;
|
|
2951
3874
|
} | undefined;
|
|
2952
3875
|
responseSize?: {
|
|
2953
3876
|
maxBytes?: number | undefined;
|
|
@@ -2961,37 +3884,65 @@ declare const EvalDatasetSchema: z.ZodObject<{
|
|
|
2961
3884
|
cases: {
|
|
2962
3885
|
id: string;
|
|
2963
3886
|
args?: Record<string, unknown> | undefined;
|
|
2964
|
-
metadata?: Record<string, unknown> | undefined;
|
|
2965
3887
|
mode?: "direct" | "llm_host" | undefined;
|
|
3888
|
+
metadata?: Record<string, unknown> | undefined;
|
|
2966
3889
|
description?: string | undefined;
|
|
2967
3890
|
toolName?: string | undefined;
|
|
2968
3891
|
scenario?: string | undefined;
|
|
2969
3892
|
llmHostConfig?: {
|
|
2970
|
-
provider: "anthropic" | "
|
|
3893
|
+
provider: "openai" | "anthropic" | "azure" | "google" | "mistral" | "deepseek" | "openrouter" | "xai" | "vertex-anthropic";
|
|
2971
3894
|
model?: string | undefined;
|
|
2972
3895
|
maxTokens?: number | undefined;
|
|
2973
3896
|
apiKeyEnvVar?: string | undefined;
|
|
2974
3897
|
temperature?: number | undefined;
|
|
2975
3898
|
maxToolCalls?: number | undefined;
|
|
2976
3899
|
} | undefined;
|
|
3900
|
+
iterations?: number | undefined;
|
|
3901
|
+
accuracyThreshold?: number | undefined;
|
|
3902
|
+
judgeReps?: number | undefined;
|
|
3903
|
+
canonicalAnswer?: string | undefined;
|
|
3904
|
+
tags?: string[] | undefined;
|
|
2977
3905
|
expect?: {
|
|
3906
|
+
response?: unknown;
|
|
2978
3907
|
isError?: string | boolean | string[] | undefined;
|
|
2979
3908
|
schema?: string | undefined;
|
|
2980
3909
|
snapshot?: string | undefined;
|
|
2981
|
-
|
|
3910
|
+
toolsTriggered?: {
|
|
3911
|
+
calls: {
|
|
3912
|
+
name: string;
|
|
3913
|
+
required?: boolean | undefined;
|
|
3914
|
+
arguments?: Record<string, unknown> | undefined;
|
|
3915
|
+
}[];
|
|
3916
|
+
order?: "strict" | "any" | undefined;
|
|
3917
|
+
exclusive?: boolean | undefined;
|
|
3918
|
+
} | undefined;
|
|
3919
|
+
toolCallCount?: {
|
|
3920
|
+
exact?: number | undefined;
|
|
3921
|
+
min?: number | undefined;
|
|
3922
|
+
max?: number | undefined;
|
|
3923
|
+
} | undefined;
|
|
2982
3924
|
containsText?: string | string[] | undefined;
|
|
2983
3925
|
matchesPattern?: string | string[] | undefined;
|
|
2984
|
-
snapshotSanitizers?: ("
|
|
3926
|
+
snapshotSanitizers?: ("uuid" | "jwt" | "timestamp" | "iso-date" | "objectId" | {
|
|
2985
3927
|
pattern: string;
|
|
2986
3928
|
replacement?: string | undefined;
|
|
2987
3929
|
} | {
|
|
2988
3930
|
remove: string[];
|
|
2989
3931
|
})[] | undefined;
|
|
2990
3932
|
passesJudge?: {
|
|
2991
|
-
rubric:
|
|
3933
|
+
rubric: "correctness" | "completeness" | "groundedness" | "instruction-following" | "conciseness" | {
|
|
3934
|
+
text: string;
|
|
3935
|
+
};
|
|
3936
|
+
model?: string | undefined;
|
|
3937
|
+
maxTokens?: number | undefined;
|
|
3938
|
+
maxBudgetUsd?: number | undefined;
|
|
2992
3939
|
reference?: unknown;
|
|
2993
3940
|
threshold?: number | undefined;
|
|
2994
|
-
|
|
3941
|
+
reps?: number | undefined;
|
|
3942
|
+
provider?: "openai" | "anthropic" | "google" | undefined;
|
|
3943
|
+
apiKeyEnvVar?: string | undefined;
|
|
3944
|
+
temperature?: number | undefined;
|
|
3945
|
+
maxToolOutputSize?: number | undefined;
|
|
2995
3946
|
} | undefined;
|
|
2996
3947
|
responseSize?: {
|
|
2997
3948
|
maxBytes?: number | undefined;
|
|
@@ -3006,37 +3957,65 @@ declare const EvalDatasetSchema: z.ZodObject<{
|
|
|
3006
3957
|
cases: {
|
|
3007
3958
|
id: string;
|
|
3008
3959
|
args?: Record<string, unknown> | undefined;
|
|
3009
|
-
metadata?: Record<string, unknown> | undefined;
|
|
3010
3960
|
mode?: "direct" | "llm_host" | undefined;
|
|
3961
|
+
metadata?: Record<string, unknown> | undefined;
|
|
3011
3962
|
description?: string | undefined;
|
|
3012
3963
|
toolName?: string | undefined;
|
|
3013
3964
|
scenario?: string | undefined;
|
|
3014
3965
|
llmHostConfig?: {
|
|
3015
|
-
provider: "anthropic" | "
|
|
3966
|
+
provider: "openai" | "anthropic" | "azure" | "google" | "mistral" | "deepseek" | "openrouter" | "xai" | "vertex-anthropic";
|
|
3016
3967
|
model?: string | undefined;
|
|
3017
3968
|
maxTokens?: number | undefined;
|
|
3018
3969
|
apiKeyEnvVar?: string | undefined;
|
|
3019
3970
|
temperature?: number | undefined;
|
|
3020
3971
|
maxToolCalls?: number | undefined;
|
|
3021
3972
|
} | undefined;
|
|
3973
|
+
iterations?: number | undefined;
|
|
3974
|
+
accuracyThreshold?: number | undefined;
|
|
3975
|
+
judgeReps?: number | undefined;
|
|
3976
|
+
canonicalAnswer?: string | undefined;
|
|
3977
|
+
tags?: string[] | undefined;
|
|
3022
3978
|
expect?: {
|
|
3979
|
+
response?: unknown;
|
|
3023
3980
|
isError?: string | boolean | string[] | undefined;
|
|
3024
3981
|
schema?: string | undefined;
|
|
3025
3982
|
snapshot?: string | undefined;
|
|
3026
|
-
|
|
3983
|
+
toolsTriggered?: {
|
|
3984
|
+
calls: {
|
|
3985
|
+
name: string;
|
|
3986
|
+
required?: boolean | undefined;
|
|
3987
|
+
arguments?: Record<string, unknown> | undefined;
|
|
3988
|
+
}[];
|
|
3989
|
+
order?: "strict" | "any" | undefined;
|
|
3990
|
+
exclusive?: boolean | undefined;
|
|
3991
|
+
} | undefined;
|
|
3992
|
+
toolCallCount?: {
|
|
3993
|
+
exact?: number | undefined;
|
|
3994
|
+
min?: number | undefined;
|
|
3995
|
+
max?: number | undefined;
|
|
3996
|
+
} | undefined;
|
|
3027
3997
|
containsText?: string | string[] | undefined;
|
|
3028
3998
|
matchesPattern?: string | string[] | undefined;
|
|
3029
|
-
snapshotSanitizers?: ("
|
|
3999
|
+
snapshotSanitizers?: ("uuid" | "jwt" | "timestamp" | "iso-date" | "objectId" | {
|
|
3030
4000
|
pattern: string;
|
|
3031
4001
|
replacement?: string | undefined;
|
|
3032
4002
|
} | {
|
|
3033
4003
|
remove: string[];
|
|
3034
4004
|
})[] | undefined;
|
|
3035
4005
|
passesJudge?: {
|
|
3036
|
-
rubric:
|
|
4006
|
+
rubric: "correctness" | "completeness" | "groundedness" | "instruction-following" | "conciseness" | {
|
|
4007
|
+
text: string;
|
|
4008
|
+
};
|
|
4009
|
+
model?: string | undefined;
|
|
4010
|
+
maxTokens?: number | undefined;
|
|
4011
|
+
maxBudgetUsd?: number | undefined;
|
|
3037
4012
|
reference?: unknown;
|
|
3038
4013
|
threshold?: number | undefined;
|
|
3039
|
-
|
|
4014
|
+
reps?: number | undefined;
|
|
4015
|
+
provider?: "openai" | "anthropic" | "google" | undefined;
|
|
4016
|
+
apiKeyEnvVar?: string | undefined;
|
|
4017
|
+
temperature?: number | undefined;
|
|
4018
|
+
maxToolOutputSize?: number | undefined;
|
|
3040
4019
|
} | undefined;
|
|
3041
4020
|
responseSize?: {
|
|
3042
4021
|
maxBytes?: number | undefined;
|
|
@@ -3126,29 +4105,126 @@ declare function loadEvalDataset(filePath: string, options?: LoadDatasetOptions)
|
|
|
3126
4105
|
declare function loadEvalDatasetFromObject(data: unknown, options?: LoadDatasetOptions): EvalDataset;
|
|
3127
4106
|
|
|
3128
4107
|
/**
|
|
3129
|
-
*
|
|
4108
|
+
* Reporter-specific type definitions
|
|
4109
|
+
*
|
|
4110
|
+
* These types are used by the MCP reporter and UI.
|
|
4111
|
+
*
|
|
4112
|
+
* @packageDocumentation
|
|
3130
4113
|
*/
|
|
3131
|
-
|
|
4114
|
+
|
|
4115
|
+
/**
|
|
4116
|
+
* Experiment tracking metadata for an eval run
|
|
4117
|
+
*/
|
|
4118
|
+
interface EvalRunMetadata {
|
|
4119
|
+
/** Git commit hash at time of run */
|
|
4120
|
+
gitHash?: string;
|
|
4121
|
+
/** ISO timestamp of the run */
|
|
4122
|
+
timestamp: string;
|
|
4123
|
+
/** Package version from package.json */
|
|
4124
|
+
packageVersion: string;
|
|
4125
|
+
/** LLM host model identifier (if llm_host mode) */
|
|
4126
|
+
llmHostModel?: string;
|
|
4127
|
+
/** Judge model identifier (if judge was used) */
|
|
4128
|
+
judgeModel?: string;
|
|
4129
|
+
}
|
|
4130
|
+
/**
|
|
4131
|
+
* Individual conformance check result
|
|
4132
|
+
*/
|
|
4133
|
+
interface MCPConformanceCheck$1 {
|
|
3132
4134
|
/**
|
|
3133
|
-
*
|
|
4135
|
+
* Check name (e.g., 'server_info_present', 'list_tools_succeeds')
|
|
3134
4136
|
*/
|
|
3135
|
-
|
|
4137
|
+
name: string;
|
|
3136
4138
|
/**
|
|
3137
|
-
*
|
|
3138
|
-
* When provided, eval results will be attached to the test for the MCP reporter
|
|
4139
|
+
* Whether the check passed
|
|
3139
4140
|
*/
|
|
3140
|
-
|
|
4141
|
+
pass: boolean;
|
|
3141
4142
|
/**
|
|
3142
|
-
*
|
|
3143
|
-
* Required for snapshot expectations to work properly
|
|
4143
|
+
* Human-readable message describing the result
|
|
3144
4144
|
*/
|
|
3145
|
-
|
|
4145
|
+
message: string;
|
|
3146
4146
|
}
|
|
3147
|
-
|
|
3148
4147
|
/**
|
|
3149
|
-
*
|
|
4148
|
+
* Conformance check result as stored in reporter data
|
|
3150
4149
|
*/
|
|
3151
|
-
interface
|
|
4150
|
+
interface MCPConformanceResultData {
|
|
4151
|
+
/**
|
|
4152
|
+
* Test title where conformance check was run
|
|
4153
|
+
*/
|
|
4154
|
+
testTitle: string;
|
|
4155
|
+
/**
|
|
4156
|
+
* Whether all checks passed
|
|
4157
|
+
*/
|
|
4158
|
+
pass: boolean;
|
|
4159
|
+
/**
|
|
4160
|
+
* Individual check results
|
|
4161
|
+
*/
|
|
4162
|
+
checks: MCPConformanceCheck$1[];
|
|
4163
|
+
/**
|
|
4164
|
+
* Server info if available
|
|
4165
|
+
*/
|
|
4166
|
+
serverInfo?: {
|
|
4167
|
+
name?: string;
|
|
4168
|
+
version?: string;
|
|
4169
|
+
};
|
|
4170
|
+
/**
|
|
4171
|
+
* Number of tools discovered
|
|
4172
|
+
*/
|
|
4173
|
+
toolCount: number;
|
|
4174
|
+
/**
|
|
4175
|
+
* Auth type used for this check
|
|
4176
|
+
*/
|
|
4177
|
+
authType?: AuthType;
|
|
4178
|
+
/**
|
|
4179
|
+
* Project name
|
|
4180
|
+
*/
|
|
4181
|
+
project?: string;
|
|
4182
|
+
}
|
|
4183
|
+
/**
|
|
4184
|
+
* Server capabilities data from mcp-list-tools attachment
|
|
4185
|
+
*/
|
|
4186
|
+
interface MCPServerCapabilitiesData {
|
|
4187
|
+
/**
|
|
4188
|
+
* Test title where listTools was called
|
|
4189
|
+
*/
|
|
4190
|
+
testTitle: string;
|
|
4191
|
+
/**
|
|
4192
|
+
* List of tools available on the server
|
|
4193
|
+
*/
|
|
4194
|
+
tools: Array<{
|
|
4195
|
+
name: string;
|
|
4196
|
+
description?: string;
|
|
4197
|
+
}>;
|
|
4198
|
+
/**
|
|
4199
|
+
* Total number of tools
|
|
4200
|
+
*/
|
|
4201
|
+
toolCount: number;
|
|
4202
|
+
/**
|
|
4203
|
+
* Auth type used for this test
|
|
4204
|
+
*/
|
|
4205
|
+
authType?: AuthType;
|
|
4206
|
+
/**
|
|
4207
|
+
* Project name
|
|
4208
|
+
*/
|
|
4209
|
+
project?: string;
|
|
4210
|
+
}
|
|
4211
|
+
/**
|
|
4212
|
+
* Result of a single iteration within a multi-iteration eval case
|
|
4213
|
+
*/
|
|
4214
|
+
interface IterationResult {
|
|
4215
|
+
/** Whether this iteration passed */
|
|
4216
|
+
pass: boolean;
|
|
4217
|
+
/** Execution time for this iteration */
|
|
4218
|
+
durationMs: number;
|
|
4219
|
+
/** Error message if the iteration failed with an exception */
|
|
4220
|
+
error?: string;
|
|
4221
|
+
/** When true, this iteration failed due to network/infrastructure issues rather than an assertion failure */
|
|
4222
|
+
isInfrastructureError?: boolean;
|
|
4223
|
+
}
|
|
4224
|
+
/**
|
|
4225
|
+
* Result of a single eval case
|
|
4226
|
+
*/
|
|
4227
|
+
interface EvalCaseResult {
|
|
3152
4228
|
/**
|
|
3153
4229
|
* Case ID
|
|
3154
4230
|
*/
|
|
@@ -3161,15 +4237,8 @@ interface EvalCaseResult$1 {
|
|
|
3161
4237
|
* MCP tool name that was called
|
|
3162
4238
|
*/
|
|
3163
4239
|
toolName: string;
|
|
3164
|
-
/**
|
|
3165
|
-
* Evaluation mode (direct or llm_host)
|
|
3166
|
-
* @deprecated Mode is inferred from test context, not displayed in reports
|
|
3167
|
-
*/
|
|
3168
|
-
mode?: 'direct' | 'llm_host';
|
|
3169
4240
|
/**
|
|
3170
4241
|
* Source of this result
|
|
3171
|
-
* - 'eval': From runEvalDataset() using JSON eval datasets
|
|
3172
|
-
* - 'test': From direct API test tracking (MCP fixture calls)
|
|
3173
4242
|
*/
|
|
3174
4243
|
source: ResultSource;
|
|
3175
4244
|
/**
|
|
@@ -3194,14 +4263,164 @@ interface EvalCaseResult$1 {
|
|
|
3194
4263
|
authType?: AuthType;
|
|
3195
4264
|
/**
|
|
3196
4265
|
* Playwright project name this test belongs to
|
|
3197
|
-
* Used for filtering/grouping results by project in the reporter
|
|
3198
4266
|
*/
|
|
3199
4267
|
project?: string;
|
|
3200
4268
|
/**
|
|
3201
4269
|
* Execution time in milliseconds
|
|
3202
4270
|
*/
|
|
3203
4271
|
durationMs: number;
|
|
4272
|
+
/**
|
|
4273
|
+
* Assertion pass rate (0–1): passes divided by non-infrastructure iterations.
|
|
4274
|
+
* Only present when the case was run with `iterations > 1`.
|
|
4275
|
+
*
|
|
4276
|
+
* Infrastructure errors (network timeouts, rate limits, etc.) are excluded from
|
|
4277
|
+
* the denominator so that environment unreliability does not skew this metric.
|
|
4278
|
+
*/
|
|
4279
|
+
assertionPassRate?: number;
|
|
4280
|
+
/**
|
|
4281
|
+
* Infrastructure error rate (0–1): infra errors divided by total iterations.
|
|
4282
|
+
* Only present when the case was run with `iterations > 1`.
|
|
4283
|
+
*/
|
|
4284
|
+
infrastructureErrorRate?: number;
|
|
4285
|
+
/**
|
|
4286
|
+
* Accuracy score (0–1) across all non-infrastructure iterations.
|
|
4287
|
+
* Alias for `assertionPassRate`. Only present when the case was run with `iterations > 1`.
|
|
4288
|
+
* @deprecated Use `assertionPassRate` for clarity; this field is kept for backward compatibility.
|
|
4289
|
+
*/
|
|
4290
|
+
accuracy?: number;
|
|
4291
|
+
/**
|
|
4292
|
+
* Per-iteration pass/fail breakdown.
|
|
4293
|
+
* Only present when the case was run with `iterations > 1`.
|
|
4294
|
+
*/
|
|
4295
|
+
iterationResults?: Array<IterationResult>;
|
|
4296
|
+
/**
|
|
4297
|
+
* Tags from the source eval case, for filtering and slicing reports.
|
|
4298
|
+
*/
|
|
4299
|
+
tags?: string[];
|
|
4300
|
+
/**
|
|
4301
|
+
* Precision of tool calls made (0–1).
|
|
4302
|
+
* 1.0 means every tool called was expected; <1.0 means unexpected tools were called.
|
|
4303
|
+
* Only populated when exclusive: true in toolsTriggered and the expectation was evaluated.
|
|
4304
|
+
*/
|
|
4305
|
+
toolPrecision?: number;
|
|
4306
|
+
/**
|
|
4307
|
+
* Recall of required tool calls (0–1).
|
|
4308
|
+
* 1.0 means all required tools were called; <1.0 means some were missed.
|
|
4309
|
+
* Only populated when toolsTriggered expectation was evaluated.
|
|
4310
|
+
*/
|
|
4311
|
+
toolRecall?: number;
|
|
4312
|
+
/**
|
|
4313
|
+
* Pass/fail status of this case in the baseline run.
|
|
4314
|
+
* Only present when a baseline was provided to runEvalDataset.
|
|
4315
|
+
*/
|
|
4316
|
+
baselinePass?: boolean;
|
|
4317
|
+
/**
|
|
4318
|
+
* Number of iterations that failed due to infrastructure errors (network, rate limits, etc.)
|
|
4319
|
+
* Only present when the case was run with `iterations > 1`.
|
|
4320
|
+
*/
|
|
4321
|
+
infrastructureErrorCount?: number;
|
|
4322
|
+
}
|
|
4323
|
+
/**
|
|
4324
|
+
* Aggregated MCP eval run data
|
|
4325
|
+
*/
|
|
4326
|
+
interface MCPEvalRunData {
|
|
4327
|
+
/**
|
|
4328
|
+
* Run timestamp (ISO 8601)
|
|
4329
|
+
*/
|
|
4330
|
+
timestamp: string;
|
|
4331
|
+
/**
|
|
4332
|
+
* Total duration in milliseconds
|
|
4333
|
+
*/
|
|
4334
|
+
durationMs: number;
|
|
4335
|
+
/**
|
|
4336
|
+
* Environment info
|
|
4337
|
+
*/
|
|
4338
|
+
environment: {
|
|
4339
|
+
ci: boolean;
|
|
4340
|
+
node: string;
|
|
4341
|
+
platform: string;
|
|
4342
|
+
};
|
|
4343
|
+
/**
|
|
4344
|
+
* Aggregate metrics
|
|
4345
|
+
*/
|
|
4346
|
+
metrics: {
|
|
4347
|
+
/**
|
|
4348
|
+
* Total number of eval cases
|
|
4349
|
+
*/
|
|
4350
|
+
total: number;
|
|
4351
|
+
/**
|
|
4352
|
+
* Number of passed cases
|
|
4353
|
+
*/
|
|
4354
|
+
passed: number;
|
|
4355
|
+
/**
|
|
4356
|
+
* Number of failed cases
|
|
4357
|
+
*/
|
|
4358
|
+
failed: number;
|
|
4359
|
+
/**
|
|
4360
|
+
* Pass rate (0-1)
|
|
4361
|
+
*/
|
|
4362
|
+
passRate: number;
|
|
4363
|
+
/**
|
|
4364
|
+
* Dataset breakdown: dataset name -> count
|
|
4365
|
+
*/
|
|
4366
|
+
datasetBreakdown: Record<string, number>;
|
|
4367
|
+
/**
|
|
4368
|
+
* Expectation type breakdown
|
|
4369
|
+
*/
|
|
4370
|
+
expectationBreakdown: ExpectationBreakdown;
|
|
4371
|
+
};
|
|
4372
|
+
/**
|
|
4373
|
+
* All eval results from this run
|
|
4374
|
+
*/
|
|
4375
|
+
results: EvalCaseResult[];
|
|
4376
|
+
/**
|
|
4377
|
+
* Conformance check results (optional)
|
|
4378
|
+
*/
|
|
4379
|
+
conformanceChecks?: MCPConformanceResultData[];
|
|
4380
|
+
/**
|
|
4381
|
+
* Server capabilities discovered via listTools (optional)
|
|
4382
|
+
*/
|
|
4383
|
+
serverCapabilities?: MCPServerCapabilitiesData[];
|
|
4384
|
+
}
|
|
4385
|
+
/**
|
|
4386
|
+
* Historical summary for trend charts
|
|
4387
|
+
*/
|
|
4388
|
+
interface MCPEvalHistoricalSummary {
|
|
4389
|
+
timestamp: string;
|
|
4390
|
+
total: number;
|
|
4391
|
+
passed: number;
|
|
4392
|
+
failed: number;
|
|
4393
|
+
passRate: number;
|
|
4394
|
+
durationMs: number;
|
|
4395
|
+
}
|
|
4396
|
+
/**
|
|
4397
|
+
* Complete data structure passed to UI
|
|
4398
|
+
*/
|
|
4399
|
+
interface MCPEvalData {
|
|
4400
|
+
runData: MCPEvalRunData;
|
|
4401
|
+
historical: MCPEvalHistoricalSummary[];
|
|
3204
4402
|
}
|
|
4403
|
+
|
|
4404
|
+
/**
|
|
4405
|
+
* Context passed to the eval runner
|
|
4406
|
+
*/
|
|
4407
|
+
interface EvalContext {
|
|
4408
|
+
/**
|
|
4409
|
+
* MCP fixture API for interacting with the server
|
|
4410
|
+
*/
|
|
4411
|
+
mcp: MCPFixtureApi;
|
|
4412
|
+
/**
|
|
4413
|
+
* Optional Playwright TestInfo for reporter integration
|
|
4414
|
+
* When provided, eval results will be attached to the test for the MCP reporter
|
|
4415
|
+
*/
|
|
4416
|
+
testInfo?: TestInfo;
|
|
4417
|
+
/**
|
|
4418
|
+
* Optional Playwright expect function for snapshot testing
|
|
4419
|
+
* Required for snapshot expectations to work properly
|
|
4420
|
+
*/
|
|
4421
|
+
expect?: Expect;
|
|
4422
|
+
}
|
|
4423
|
+
|
|
3205
4424
|
/**
|
|
3206
4425
|
* Overall result of running an eval dataset
|
|
3207
4426
|
*/
|
|
@@ -3221,11 +4440,48 @@ interface EvalRunnerResult {
|
|
|
3221
4440
|
/**
|
|
3222
4441
|
* Individual case results
|
|
3223
4442
|
*/
|
|
3224
|
-
caseResults: Array<EvalCaseResult
|
|
4443
|
+
caseResults: Array<EvalCaseResult>;
|
|
3225
4444
|
/**
|
|
3226
4445
|
* Overall execution time in milliseconds
|
|
3227
4446
|
*/
|
|
3228
4447
|
durationMs: number;
|
|
4448
|
+
/**
|
|
4449
|
+
* Difference between current pass rate and baseline pass rate.
|
|
4450
|
+
* Positive = improvement, negative = regression.
|
|
4451
|
+
* Only present when `baselineResultsFrom` was provided.
|
|
4452
|
+
*/
|
|
4453
|
+
deltaPassRate?: number;
|
|
4454
|
+
/**
|
|
4455
|
+
* Number of cases that regressed: passed in baseline, failed now.
|
|
4456
|
+
* Only present when `baselineResultsFrom` was provided.
|
|
4457
|
+
*/
|
|
4458
|
+
regressions?: number;
|
|
4459
|
+
/**
|
|
4460
|
+
* Number of cases that improved: failed in baseline, passed now.
|
|
4461
|
+
* Only present when `baselineResultsFrom` was provided.
|
|
4462
|
+
*/
|
|
4463
|
+
improvements?: number;
|
|
4464
|
+
/**
|
|
4465
|
+
* Average tool precision across all llm_host cases that have a
|
|
4466
|
+
* `toolsTriggered` expectation (precision = fraction of called tools
|
|
4467
|
+
* that were expected). Only present when at least one such case ran.
|
|
4468
|
+
*/
|
|
4469
|
+
datasetToolPrecision?: number;
|
|
4470
|
+
/**
|
|
4471
|
+
* Average tool recall across all llm_host cases that have a
|
|
4472
|
+
* `toolsTriggered` expectation (recall = fraction of required tools
|
|
4473
|
+
* that were actually called). Only present when at least one such case ran.
|
|
4474
|
+
*/
|
|
4475
|
+
datasetToolRecall?: number;
|
|
4476
|
+
/**
|
|
4477
|
+
* Harmonic mean of `datasetToolPrecision` and `datasetToolRecall`.
|
|
4478
|
+
* Only present when at least one case contributes precision/recall data.
|
|
4479
|
+
*/
|
|
4480
|
+
datasetToolF1?: number;
|
|
4481
|
+
/**
|
|
4482
|
+
* Experiment tracking metadata captured at run time.
|
|
4483
|
+
*/
|
|
4484
|
+
metadata?: EvalRunMetadata;
|
|
3229
4485
|
}
|
|
3230
4486
|
/**
|
|
3231
4487
|
* Options for running eval dataset
|
|
@@ -3251,12 +4507,6 @@ interface EvalRunnerOptions {
|
|
|
3251
4507
|
* ```
|
|
3252
4508
|
*/
|
|
3253
4509
|
schemas?: Record<string, ZodType>;
|
|
3254
|
-
/**
|
|
3255
|
-
* Judge configuration registry by ID
|
|
3256
|
-
*
|
|
3257
|
-
* Maps config IDs to JudgeConfig for use with expect.passesJudge.configId
|
|
3258
|
-
*/
|
|
3259
|
-
judgeConfigs?: Record<string, JudgeConfig>;
|
|
3260
4510
|
/**
|
|
3261
4511
|
* Whether to stop on first failure
|
|
3262
4512
|
* @default false
|
|
@@ -3265,7 +4515,71 @@ interface EvalRunnerOptions {
|
|
|
3265
4515
|
/**
|
|
3266
4516
|
* Optional callback called after each case
|
|
3267
4517
|
*/
|
|
3268
|
-
onCaseComplete?: (result: EvalCaseResult
|
|
4518
|
+
onCaseComplete?: (result: EvalCaseResult) => void | Promise<void>;
|
|
4519
|
+
/**
|
|
4520
|
+
* Maximum number of eval cases to run concurrently.
|
|
4521
|
+
* When > 1, cases run in parallel (ignores stopOnFailure ordering).
|
|
4522
|
+
* @default 1 (sequential)
|
|
4523
|
+
*/
|
|
4524
|
+
concurrency?: number;
|
|
4525
|
+
/**
|
|
4526
|
+
* Default iteration count for `llm_host` mode cases that do not specify
|
|
4527
|
+
* `iterations` explicitly. Has no effect on `direct` mode cases (which are
|
|
4528
|
+
* deterministic and always default to 1 iteration).
|
|
4529
|
+
*
|
|
4530
|
+
* Set to 10 for standard runs or 20 for release gates. Individual cases can
|
|
4531
|
+
* still override this with their own `iterations` field.
|
|
4532
|
+
*
|
|
4533
|
+
* @default 1 (preserves historical behaviour when not set)
|
|
4534
|
+
*
|
|
4535
|
+
* @example
|
|
4536
|
+
* ```typescript
|
|
4537
|
+
* // Run all llm_host cases 10 times each by default
|
|
4538
|
+
* await runEvalDataset({ dataset, defaultLlmIterations: 10 }, { mcp });
|
|
4539
|
+
* ```
|
|
4540
|
+
*/
|
|
4541
|
+
defaultLlmIterations?: number;
|
|
4542
|
+
/**
|
|
4543
|
+
* Default number of judge evaluations for cases that do not specify
|
|
4544
|
+
* `judgeReps` explicitly. Applies to any case with a `passesJudge`
|
|
4545
|
+
* expectation. Per-case `judgeReps` overrides this.
|
|
4546
|
+
*
|
|
4547
|
+
* @default 1 (single judge run)
|
|
4548
|
+
*/
|
|
4549
|
+
defaultJudgeReps?: number;
|
|
4550
|
+
/**
|
|
4551
|
+
* When set, only eval cases whose `tags` array contains at least one of
|
|
4552
|
+
* the specified tags are run. Cases without a `tags` field are excluded.
|
|
4553
|
+
* When undefined or empty, all cases run (default behavior).
|
|
4554
|
+
*/
|
|
4555
|
+
filterTags?: string[];
|
|
4556
|
+
/**
|
|
4557
|
+
* If set, saves the run results to this file path after completion.
|
|
4558
|
+
* Use with `baselineResultsFrom` on the next run for regression detection.
|
|
4559
|
+
*
|
|
4560
|
+
* @example '.mcp-test-results/baseline.json'
|
|
4561
|
+
*/
|
|
4562
|
+
saveResultsTo?: string;
|
|
4563
|
+
/**
|
|
4564
|
+
* If set, loads this file as the baseline and computes delta metrics vs the current run.
|
|
4565
|
+
* Populates `EvalRunnerResult.deltaPassRate`, `.regressions`, `.improvements`,
|
|
4566
|
+
* and tags each `EvalCaseResult.baselinePass`.
|
|
4567
|
+
*/
|
|
4568
|
+
baselineResultsFrom?: string;
|
|
4569
|
+
/**
|
|
4570
|
+
* LLM host model identifier to record in run metadata.
|
|
4571
|
+
* Use this to identify which model was used when running llm_host cases.
|
|
4572
|
+
*
|
|
4573
|
+
* @example 'claude-opus-4-20250514'
|
|
4574
|
+
*/
|
|
4575
|
+
llmHostModel?: string;
|
|
4576
|
+
/**
|
|
4577
|
+
* Judge model identifier to record in run metadata.
|
|
4578
|
+
* Use this to identify which model was used for judge evaluations.
|
|
4579
|
+
*
|
|
4580
|
+
* @example 'claude-sonnet-4-20250514'
|
|
4581
|
+
*/
|
|
4582
|
+
judgeModel?: string;
|
|
3269
4583
|
}
|
|
3270
4584
|
/**
|
|
3271
4585
|
* Options for running a single eval case
|
|
@@ -3279,17 +4593,14 @@ interface EvalCaseOptions {
|
|
|
3279
4593
|
* Schema registry for schema validation by name
|
|
3280
4594
|
*/
|
|
3281
4595
|
schemas?: Record<string, ZodType>;
|
|
3282
|
-
/**
|
|
3283
|
-
* Judge configuration registry by ID
|
|
3284
|
-
*/
|
|
3285
|
-
judgeConfigs?: Record<string, JudgeConfig>;
|
|
3286
4596
|
}
|
|
3287
4597
|
/**
|
|
3288
|
-
* Runs a single eval case and returns the result
|
|
4598
|
+
* Runs a single eval case and returns the result.
|
|
4599
|
+
* When `evalCase.iterations > 1`, runs the case N times and returns accuracy.
|
|
3289
4600
|
*
|
|
3290
4601
|
* @param evalCase - The eval case to run
|
|
3291
4602
|
* @param context - Context containing mcp, testInfo, expect
|
|
3292
|
-
* @param options - Optional configuration (datasetName, schemas
|
|
4603
|
+
* @param options - Optional configuration (datasetName, schemas)
|
|
3293
4604
|
* @returns The result of running the eval case
|
|
3294
4605
|
*
|
|
3295
4606
|
* @example
|
|
@@ -3303,131 +4614,165 @@ interface EvalCaseOptions {
|
|
|
3303
4614
|
* expect(result.pass).toBe(true);
|
|
3304
4615
|
* ```
|
|
3305
4616
|
*/
|
|
3306
|
-
declare function runEvalCase(evalCase: EvalCase, context: EvalContext, options?: EvalCaseOptions): Promise<EvalCaseResult
|
|
4617
|
+
declare function runEvalCase(evalCase: EvalCase, context: EvalContext, options?: EvalCaseOptions): Promise<EvalCaseResult>;
|
|
4618
|
+
declare function runEvalDataset(options: EvalRunnerOptions, context: EvalContext): Promise<EvalRunnerResult>;
|
|
4619
|
+
|
|
3307
4620
|
/**
|
|
3308
|
-
*
|
|
4621
|
+
* Saves eval results to a JSON file for use as a baseline in future runs.
|
|
3309
4622
|
*
|
|
3310
|
-
*
|
|
3311
|
-
*
|
|
4623
|
+
* @param result - The eval run result to save
|
|
4624
|
+
* @param filePath - Path to write the JSON file (parent dirs created automatically)
|
|
4625
|
+
*/
|
|
4626
|
+
declare function saveBaseline(result: EvalRunnerResult, filePath: string): Promise<void>;
|
|
4627
|
+
/**
|
|
4628
|
+
* Loads a previously saved baseline from a JSON file.
|
|
3312
4629
|
*
|
|
3313
|
-
* @param
|
|
3314
|
-
* @
|
|
3315
|
-
* @
|
|
4630
|
+
* @param filePath - Path to the JSON file written by saveBaseline
|
|
4631
|
+
* @returns The saved EvalRunnerResult
|
|
4632
|
+
* @throws If the file cannot be read or parsed
|
|
4633
|
+
*/
|
|
4634
|
+
declare function loadBaseline(filePath: string): Promise<EvalRunnerResult>;
|
|
4635
|
+
|
|
4636
|
+
/** Outcome of comparing two servers on a single eval case. */
|
|
4637
|
+
type ComparisonOutcome = 'A_WINS' | 'B_WINS' | 'TIE' | 'BOTH_FAIL';
|
|
4638
|
+
/** Result of comparing a single eval case across two servers. */
|
|
4639
|
+
interface CaseComparisonResult {
|
|
4640
|
+
/** Case ID */
|
|
4641
|
+
id: string;
|
|
4642
|
+
/** Comparison outcome */
|
|
4643
|
+
outcome: ComparisonOutcome;
|
|
4644
|
+
/** Result from server A */
|
|
4645
|
+
serverA: EvalCaseResult;
|
|
4646
|
+
/** Result from server B */
|
|
4647
|
+
serverB: EvalCaseResult;
|
|
4648
|
+
}
|
|
4649
|
+
/** Aggregated result of running a dataset against two servers. */
|
|
4650
|
+
interface ServerComparisonResult {
|
|
4651
|
+
/** Dataset name */
|
|
4652
|
+
dataset: string;
|
|
4653
|
+
/** Total cases compared (cases present in both runs) */
|
|
4654
|
+
total: number;
|
|
4655
|
+
/** Cases where server A passed and server B failed */
|
|
4656
|
+
aWins: number;
|
|
4657
|
+
/** Cases where server B passed and server A failed */
|
|
4658
|
+
bWins: number;
|
|
4659
|
+
/** Cases where both passed */
|
|
4660
|
+
ties: number;
|
|
4661
|
+
/** Cases where both failed */
|
|
4662
|
+
bothFail: number;
|
|
4663
|
+
/** Raw count of cases where both servers failed (same as bothFail) */
|
|
4664
|
+
bothFailCount: number;
|
|
4665
|
+
/** Cases with a decisive outcome (aWins + bWins + ties, excludes BOTH_FAIL) */
|
|
4666
|
+
decidedCases: number;
|
|
4667
|
+
/** Fraction of total cases where both servers failed (bothFail / total) */
|
|
4668
|
+
failureAlignment: number;
|
|
4669
|
+
/** A win rate (aWins / decidedCases, excludes BOTH_FAIL) */
|
|
4670
|
+
aWinRate: number;
|
|
4671
|
+
/** B win rate (bWins / decidedCases, excludes BOTH_FAIL) */
|
|
4672
|
+
bWinRate: number;
|
|
4673
|
+
/** Tie rate (ties / decidedCases, excludes BOTH_FAIL) */
|
|
4674
|
+
tieRate: number;
|
|
4675
|
+
/** Per-case comparison results */
|
|
4676
|
+
cases: CaseComparisonResult[];
|
|
4677
|
+
/** Full result from server A */
|
|
4678
|
+
serverAResult: EvalRunnerResult;
|
|
4679
|
+
/** Full result from server B */
|
|
4680
|
+
serverBResult: EvalRunnerResult;
|
|
4681
|
+
/** Total duration in milliseconds */
|
|
4682
|
+
durationMs: number;
|
|
4683
|
+
}
|
|
4684
|
+
/**
|
|
4685
|
+
* Options for `runServerComparison`.
|
|
4686
|
+
* Same as `EvalRunnerOptions` without baseline-specific fields.
|
|
4687
|
+
*/
|
|
4688
|
+
type ServerComparisonOptions = Omit<EvalRunnerOptions, 'saveResultsTo' | 'baselineResultsFrom'>;
|
|
4689
|
+
/**
|
|
4690
|
+
* Runs the same eval dataset against two MCP servers in parallel and
|
|
4691
|
+
* returns a detailed per-case comparison of results.
|
|
3316
4692
|
*
|
|
3317
|
-
*
|
|
3318
|
-
*
|
|
3319
|
-
*
|
|
3320
|
-
*
|
|
3321
|
-
*
|
|
3322
|
-
*
|
|
3323
|
-
*
|
|
3324
|
-
* { mcp }
|
|
3325
|
-
* );
|
|
4693
|
+
* Both servers receive identical cases and options. The comparison uses
|
|
4694
|
+
* simple pass/fail per case: A_WINS means A passed and B failed, etc.
|
|
4695
|
+
*
|
|
4696
|
+
* @param options - Eval dataset and runner options (shared between both servers)
|
|
4697
|
+
* @param contextA - MCP context for server A (e.g., Glean MCP)
|
|
4698
|
+
* @param contextB - MCP context for server B (e.g., native MCP)
|
|
4699
|
+
* @returns Comparison result with per-case outcomes and aggregate win rates
|
|
3326
4700
|
*
|
|
3327
4701
|
* @example
|
|
3328
|
-
*
|
|
3329
|
-
*
|
|
3330
|
-
*
|
|
3331
|
-
*
|
|
3332
|
-
*
|
|
3333
|
-
*
|
|
3334
|
-
* });
|
|
4702
|
+
* ```typescript
|
|
4703
|
+
* const comparison = await runServerComparison(
|
|
4704
|
+
* { dataset },
|
|
4705
|
+
* { mcp: gleanMcpFixture },
|
|
4706
|
+
* { mcp: nativeMcpFixture }
|
|
4707
|
+
* );
|
|
4708
|
+
* console.log(`Glean MCP wins: ${(comparison.aWinRate * 100).toFixed(1)}%`);
|
|
4709
|
+
* console.log(`Native MCP wins: ${(comparison.bWinRate * 100).toFixed(1)}%`);
|
|
4710
|
+
* ```
|
|
3335
4711
|
*/
|
|
3336
|
-
declare function
|
|
4712
|
+
declare function runServerComparison(options: ServerComparisonOptions, contextA: EvalContext, contextB: EvalContext): Promise<ServerComparisonResult>;
|
|
3337
4713
|
|
|
3338
4714
|
/**
|
|
3339
4715
|
* LLM Host Simulation - Main entry point
|
|
3340
4716
|
*
|
|
3341
|
-
*
|
|
3342
|
-
*
|
|
4717
|
+
* All providers (openai, anthropic, google, azure, mistral, deepseek,
|
|
4718
|
+
* openrouter, xai) run through the Vercel AI SDK orchestrator, which uses
|
|
4719
|
+
* generateText + stopWhen for a uniform multi-turn tool-calling loop with
|
|
4720
|
+
* built-in latency decomposition.
|
|
4721
|
+
*
|
|
4722
|
+
* Required packages per provider:
|
|
4723
|
+
* openai → npm install ai @ai-sdk/openai
|
|
4724
|
+
* anthropic → npm install ai @ai-sdk/anthropic
|
|
4725
|
+
* google → npm install ai @ai-sdk/google
|
|
4726
|
+
* azure → npm install ai @ai-sdk/azure
|
|
4727
|
+
* mistral → npm install ai @ai-sdk/mistral
|
|
4728
|
+
* deepseek → npm install ai @ai-sdk/deepseek
|
|
4729
|
+
* openrouter → npm install ai @openrouter/ai-sdk-provider
|
|
4730
|
+
* xai → npm install ai @ai-sdk/xai
|
|
3343
4731
|
*/
|
|
3344
4732
|
|
|
3345
4733
|
/**
|
|
3346
|
-
* Simulates an LLM host interacting with an MCP server
|
|
4734
|
+
* Simulates an LLM host interacting with an MCP server.
|
|
4735
|
+
*
|
|
4736
|
+
* The LLM chooses which tools to call based solely on their descriptions and
|
|
4737
|
+
* schemas, testing discoverability and parameter clarity at the level a real
|
|
4738
|
+
* user (via Claude Desktop, ChatGPT, etc.) would experience.
|
|
3347
4739
|
*
|
|
3348
|
-
*
|
|
3349
|
-
*
|
|
3350
|
-
*
|
|
3351
|
-
* parameter clarity.
|
|
4740
|
+
* All providers run through the Vercel AI SDK's generateText with stopWhen,
|
|
4741
|
+
* which handles multi-turn tool calling natively and provides per-step latency
|
|
4742
|
+
* decomposition (llmDurationMs vs. mcpDurationMs).
|
|
3352
4743
|
*
|
|
3353
4744
|
* @param mcp - MCP fixture API
|
|
3354
|
-
* @param scenario - Natural language prompt describing what
|
|
3355
|
-
* @param config - LLM host configuration
|
|
3356
|
-
* @returns Simulation result with tool calls and
|
|
4745
|
+
* @param scenario - Natural language prompt describing what the LLM should do
|
|
4746
|
+
* @param config - LLM host configuration (provider, model, temperature, etc.)
|
|
4747
|
+
* @returns Simulation result with tool calls, final response, and latency data
|
|
3357
4748
|
*
|
|
3358
4749
|
* @example
|
|
3359
4750
|
* ```typescript
|
|
3360
4751
|
* const result = await simulateLLMHost(mcp,
|
|
3361
|
-
* "
|
|
3362
|
-
* {
|
|
3363
|
-
* provider: 'openai',
|
|
3364
|
-
* model: 'gpt-4o'
|
|
3365
|
-
* }
|
|
4752
|
+
* "Find recent documents about MCP testing frameworks",
|
|
4753
|
+
* { provider: 'anthropic', model: 'claude-3-5-sonnet-20241022' }
|
|
3366
4754
|
* );
|
|
3367
4755
|
*
|
|
3368
4756
|
* expect(result.success).toBe(true);
|
|
3369
|
-
* expect(result.toolCalls).
|
|
3370
|
-
* name: 'get_weather',
|
|
3371
|
-
* arguments: { city: 'London' }
|
|
3372
|
-
* });
|
|
4757
|
+
* expect(result.toolCalls.map(c => c.name)).toContain('search');
|
|
3373
4758
|
* ```
|
|
3374
4759
|
*/
|
|
3375
4760
|
declare function simulateLLMHost(mcp: MCPFixtureApi, scenario: string, config: LLMHostConfig): Promise<LLMHostSimulationResult>;
|
|
3376
4761
|
/**
|
|
3377
|
-
*
|
|
4762
|
+
* Returns true if the given provider is supported.
|
|
3378
4763
|
*
|
|
3379
|
-
*
|
|
3380
|
-
*
|
|
3381
|
-
*
|
|
3382
|
-
* @param provider - LLM provider to check
|
|
3383
|
-
* @returns true if an adapter is registered for the provider
|
|
4764
|
+
* Note: this does not check whether the required @ai-sdk/* package is
|
|
4765
|
+
* installed — that is validated at simulation time with a helpful error.
|
|
3384
4766
|
*/
|
|
3385
4767
|
declare function isProviderAvailable(provider: LLMProvider): boolean;
|
|
3386
4768
|
/**
|
|
3387
|
-
*
|
|
4769
|
+
* Returns a human-readable installation message for a given provider.
|
|
3388
4770
|
*
|
|
3389
|
-
* @
|
|
3390
|
-
* @
|
|
4771
|
+
* @remarks This is a diagnostic utility for checking whether optional
|
|
4772
|
+
* @ai-sdk/* packages are installed. Not part of the primary usage path.
|
|
3391
4773
|
*/
|
|
3392
4774
|
declare function getMissingDependencyMessage(provider: LLMProvider): string;
|
|
3393
4775
|
|
|
3394
|
-
/**
|
|
3395
|
-
* Tool call validator for LLM host mode
|
|
3396
|
-
*
|
|
3397
|
-
* Validates that the LLM made the expected tool calls with correct arguments
|
|
3398
|
-
*/
|
|
3399
|
-
|
|
3400
|
-
/**
|
|
3401
|
-
* Tool call validation function signature
|
|
3402
|
-
*/
|
|
3403
|
-
type ToolCallValidator = (evalCase: EvalCase, response: unknown) => Promise<EvalExpectationResult>;
|
|
3404
|
-
/**
|
|
3405
|
-
* Creates a tool call validator for LLM host mode
|
|
3406
|
-
*
|
|
3407
|
-
* Validates that the LLM made the expected tool calls with correct arguments.
|
|
3408
|
-
* Supports partial argument matching and optional calls.
|
|
3409
|
-
*
|
|
3410
|
-
* @returns Validator function
|
|
3411
|
-
*
|
|
3412
|
-
* @example
|
|
3413
|
-
* ```typescript
|
|
3414
|
-
* // In your eval case:
|
|
3415
|
-
* {
|
|
3416
|
-
* "id": "weather-london",
|
|
3417
|
-
* "mode": "llm_host",
|
|
3418
|
-
* "scenario": "Get the weather for London",
|
|
3419
|
-
* "expectedToolCalls": [
|
|
3420
|
-
* {
|
|
3421
|
-
* "name": "get_weather",
|
|
3422
|
-
* "arguments": { "city": "London" },
|
|
3423
|
-
* "required": true
|
|
3424
|
-
* }
|
|
3425
|
-
* ]
|
|
3426
|
-
* }
|
|
3427
|
-
* ```
|
|
3428
|
-
*/
|
|
3429
|
-
declare function createToolCallValidator(): ToolCallValidator;
|
|
3430
|
-
|
|
3431
4776
|
/**
|
|
3432
4777
|
* Creates an LLM judge for evaluating tool responses
|
|
3433
4778
|
*
|
|
@@ -3494,7 +4839,7 @@ interface MCPConformanceOptions {
|
|
|
3494
4839
|
/**
|
|
3495
4840
|
* Individual check result
|
|
3496
4841
|
*/
|
|
3497
|
-
interface MCPConformanceCheck
|
|
4842
|
+
interface MCPConformanceCheck {
|
|
3498
4843
|
name: string;
|
|
3499
4844
|
pass: boolean;
|
|
3500
4845
|
message: string;
|
|
@@ -3539,7 +4884,7 @@ interface MCPConformanceResult {
|
|
|
3539
4884
|
/**
|
|
3540
4885
|
* List of check results
|
|
3541
4886
|
*/
|
|
3542
|
-
checks: MCPConformanceCheck
|
|
4887
|
+
checks: MCPConformanceCheck[];
|
|
3543
4888
|
/**
|
|
3544
4889
|
* Raw MCP responses for snapshotting
|
|
3545
4890
|
*
|
|
@@ -3588,229 +4933,6 @@ interface MCPConformanceResult {
|
|
|
3588
4933
|
*/
|
|
3589
4934
|
declare function runConformanceChecks(mcp: MCPFixtureApi, options?: MCPConformanceOptions, testInfo?: TestInfo): Promise<MCPConformanceResult>;
|
|
3590
4935
|
|
|
3591
|
-
/**
|
|
3592
|
-
* Reporter-specific type definitions
|
|
3593
|
-
*
|
|
3594
|
-
* These types are used by the MCP reporter and UI.
|
|
3595
|
-
*
|
|
3596
|
-
* @packageDocumentation
|
|
3597
|
-
*/
|
|
3598
|
-
|
|
3599
|
-
/**
|
|
3600
|
-
* Individual conformance check result
|
|
3601
|
-
*/
|
|
3602
|
-
interface MCPConformanceCheck {
|
|
3603
|
-
/**
|
|
3604
|
-
* Check name (e.g., 'server_info_present', 'list_tools_succeeds')
|
|
3605
|
-
*/
|
|
3606
|
-
name: string;
|
|
3607
|
-
/**
|
|
3608
|
-
* Whether the check passed
|
|
3609
|
-
*/
|
|
3610
|
-
pass: boolean;
|
|
3611
|
-
/**
|
|
3612
|
-
* Human-readable message describing the result
|
|
3613
|
-
*/
|
|
3614
|
-
message: string;
|
|
3615
|
-
}
|
|
3616
|
-
/**
|
|
3617
|
-
* Conformance check result as stored in reporter data
|
|
3618
|
-
*/
|
|
3619
|
-
interface MCPConformanceResultData {
|
|
3620
|
-
/**
|
|
3621
|
-
* Test title where conformance check was run
|
|
3622
|
-
*/
|
|
3623
|
-
testTitle: string;
|
|
3624
|
-
/**
|
|
3625
|
-
* Whether all checks passed
|
|
3626
|
-
*/
|
|
3627
|
-
pass: boolean;
|
|
3628
|
-
/**
|
|
3629
|
-
* Individual check results
|
|
3630
|
-
*/
|
|
3631
|
-
checks: MCPConformanceCheck[];
|
|
3632
|
-
/**
|
|
3633
|
-
* Server info if available
|
|
3634
|
-
*/
|
|
3635
|
-
serverInfo?: {
|
|
3636
|
-
name?: string;
|
|
3637
|
-
version?: string;
|
|
3638
|
-
};
|
|
3639
|
-
/**
|
|
3640
|
-
* Number of tools discovered
|
|
3641
|
-
*/
|
|
3642
|
-
toolCount: number;
|
|
3643
|
-
/**
|
|
3644
|
-
* Auth type used for this check
|
|
3645
|
-
*/
|
|
3646
|
-
authType?: AuthType;
|
|
3647
|
-
/**
|
|
3648
|
-
* Project name
|
|
3649
|
-
*/
|
|
3650
|
-
project?: string;
|
|
3651
|
-
}
|
|
3652
|
-
/**
|
|
3653
|
-
* Server capabilities data from mcp-list-tools attachment
|
|
3654
|
-
*/
|
|
3655
|
-
interface MCPServerCapabilitiesData {
|
|
3656
|
-
/**
|
|
3657
|
-
* Test title where listTools was called
|
|
3658
|
-
*/
|
|
3659
|
-
testTitle: string;
|
|
3660
|
-
/**
|
|
3661
|
-
* List of tools available on the server
|
|
3662
|
-
*/
|
|
3663
|
-
tools: Array<{
|
|
3664
|
-
name: string;
|
|
3665
|
-
description?: string;
|
|
3666
|
-
}>;
|
|
3667
|
-
/**
|
|
3668
|
-
* Total number of tools
|
|
3669
|
-
*/
|
|
3670
|
-
toolCount: number;
|
|
3671
|
-
/**
|
|
3672
|
-
* Auth type used for this test
|
|
3673
|
-
*/
|
|
3674
|
-
authType?: AuthType;
|
|
3675
|
-
/**
|
|
3676
|
-
* Project name
|
|
3677
|
-
*/
|
|
3678
|
-
project?: string;
|
|
3679
|
-
}
|
|
3680
|
-
/**
|
|
3681
|
-
* Result of a single eval case
|
|
3682
|
-
*/
|
|
3683
|
-
interface EvalCaseResult {
|
|
3684
|
-
/**
|
|
3685
|
-
* Case ID
|
|
3686
|
-
*/
|
|
3687
|
-
id: string;
|
|
3688
|
-
/**
|
|
3689
|
-
* Dataset name this case belongs to
|
|
3690
|
-
*/
|
|
3691
|
-
datasetName: string;
|
|
3692
|
-
/**
|
|
3693
|
-
* MCP tool name that was called
|
|
3694
|
-
*/
|
|
3695
|
-
toolName: string;
|
|
3696
|
-
/**
|
|
3697
|
-
* Source of this result
|
|
3698
|
-
*/
|
|
3699
|
-
source: ResultSource;
|
|
3700
|
-
/**
|
|
3701
|
-
* Overall pass/fail status
|
|
3702
|
-
*/
|
|
3703
|
-
pass: boolean;
|
|
3704
|
-
/**
|
|
3705
|
-
* Tool response
|
|
3706
|
-
*/
|
|
3707
|
-
response?: unknown;
|
|
3708
|
-
/**
|
|
3709
|
-
* Error if tool call failed
|
|
3710
|
-
*/
|
|
3711
|
-
error?: string;
|
|
3712
|
-
/**
|
|
3713
|
-
* Expectation results
|
|
3714
|
-
*/
|
|
3715
|
-
expectations: Partial<Record<ExpectationType, EvalExpectationResult>>;
|
|
3716
|
-
/**
|
|
3717
|
-
* Authentication type used for this test
|
|
3718
|
-
*/
|
|
3719
|
-
authType?: AuthType;
|
|
3720
|
-
/**
|
|
3721
|
-
* Playwright project name this test belongs to
|
|
3722
|
-
*/
|
|
3723
|
-
project?: string;
|
|
3724
|
-
/**
|
|
3725
|
-
* Execution time in milliseconds
|
|
3726
|
-
*/
|
|
3727
|
-
durationMs: number;
|
|
3728
|
-
/**
|
|
3729
|
-
* @deprecated Mode is inferred from test context, not displayed in reports
|
|
3730
|
-
*/
|
|
3731
|
-
mode?: 'direct' | 'llm_host';
|
|
3732
|
-
}
|
|
3733
|
-
/**
|
|
3734
|
-
* Aggregated MCP eval run data
|
|
3735
|
-
*/
|
|
3736
|
-
interface MCPEvalRunData {
|
|
3737
|
-
/**
|
|
3738
|
-
* Run timestamp (ISO 8601)
|
|
3739
|
-
*/
|
|
3740
|
-
timestamp: string;
|
|
3741
|
-
/**
|
|
3742
|
-
* Total duration in milliseconds
|
|
3743
|
-
*/
|
|
3744
|
-
durationMs: number;
|
|
3745
|
-
/**
|
|
3746
|
-
* Environment info
|
|
3747
|
-
*/
|
|
3748
|
-
environment: {
|
|
3749
|
-
ci: boolean;
|
|
3750
|
-
node: string;
|
|
3751
|
-
platform: string;
|
|
3752
|
-
};
|
|
3753
|
-
/**
|
|
3754
|
-
* Aggregate metrics
|
|
3755
|
-
*/
|
|
3756
|
-
metrics: {
|
|
3757
|
-
/**
|
|
3758
|
-
* Total number of eval cases
|
|
3759
|
-
*/
|
|
3760
|
-
total: number;
|
|
3761
|
-
/**
|
|
3762
|
-
* Number of passed cases
|
|
3763
|
-
*/
|
|
3764
|
-
passed: number;
|
|
3765
|
-
/**
|
|
3766
|
-
* Number of failed cases
|
|
3767
|
-
*/
|
|
3768
|
-
failed: number;
|
|
3769
|
-
/**
|
|
3770
|
-
* Pass rate (0-1)
|
|
3771
|
-
*/
|
|
3772
|
-
passRate: number;
|
|
3773
|
-
/**
|
|
3774
|
-
* Dataset breakdown: dataset name -> count
|
|
3775
|
-
*/
|
|
3776
|
-
datasetBreakdown: Record<string, number>;
|
|
3777
|
-
/**
|
|
3778
|
-
* Expectation type breakdown
|
|
3779
|
-
*/
|
|
3780
|
-
expectationBreakdown: ExpectationBreakdown;
|
|
3781
|
-
};
|
|
3782
|
-
/**
|
|
3783
|
-
* All eval results from this run
|
|
3784
|
-
*/
|
|
3785
|
-
results: EvalCaseResult[];
|
|
3786
|
-
/**
|
|
3787
|
-
* Conformance check results (optional)
|
|
3788
|
-
*/
|
|
3789
|
-
conformanceChecks?: MCPConformanceResultData[];
|
|
3790
|
-
/**
|
|
3791
|
-
* Server capabilities discovered via listTools (optional)
|
|
3792
|
-
*/
|
|
3793
|
-
serverCapabilities?: MCPServerCapabilitiesData[];
|
|
3794
|
-
}
|
|
3795
|
-
/**
|
|
3796
|
-
* Historical summary for trend charts
|
|
3797
|
-
*/
|
|
3798
|
-
interface MCPEvalHistoricalSummary {
|
|
3799
|
-
timestamp: string;
|
|
3800
|
-
total: number;
|
|
3801
|
-
passed: number;
|
|
3802
|
-
failed: number;
|
|
3803
|
-
passRate: number;
|
|
3804
|
-
durationMs: number;
|
|
3805
|
-
}
|
|
3806
|
-
/**
|
|
3807
|
-
* Complete data structure passed to UI
|
|
3808
|
-
*/
|
|
3809
|
-
interface MCPEvalData {
|
|
3810
|
-
runData: MCPEvalRunData;
|
|
3811
|
-
historical: MCPEvalHistoricalSummary[];
|
|
3812
|
-
}
|
|
3813
|
-
|
|
3814
4936
|
/**
|
|
3815
4937
|
* Reporter types - re-exported from canonical source
|
|
3816
4938
|
*
|
|
@@ -3831,7 +4953,7 @@ interface MCPEvalReporterConfig {
|
|
|
3831
4953
|
outputDir?: string;
|
|
3832
4954
|
/**
|
|
3833
4955
|
* Auto-open report in browser after test run
|
|
3834
|
-
* @default
|
|
4956
|
+
* @default false
|
|
3835
4957
|
*/
|
|
3836
4958
|
autoOpen?: boolean;
|
|
3837
4959
|
/**
|
|
@@ -3854,4 +4976,4 @@ interface MCPEvalReporterConfig {
|
|
|
3854
4976
|
includeAutoTracking?: boolean;
|
|
3855
4977
|
}
|
|
3856
4978
|
|
|
3857
|
-
export { type AuthType, type BuiltInSanitizer, CLIOAuthClient, type CLIOAuthClientConfig, type CLIOAuthResult, type ContentBlock, type CreateMCPClientOptions, DiscoveryError, ENV_VAR_NAMES, type EvalCase, type EvalCaseResult
|
|
4979
|
+
export { type AuthType, BUILT_IN_RUBRICS, type BuiltInRubric, type BuiltInSanitizer, CLIOAuthClient, type CLIOAuthClientConfig, type CLIOAuthResult, type CaseComparisonResult, type ClientCredentialsConfig, type ComparisonOutcome, type ContentBlock, type CreateMCPClientOptions, DiscoveryError, ENV_VAR_NAMES, type EvalCase, type EvalCaseResult, EvalCaseSchema, type EvalContext, type EvalDataset, EvalDatasetSchema, type EvalExpectBlock, type EvalExpectationResult, type EvalMode, type EvalRunnerOptions, type EvalRunnerResult, type ExpectationBreakdown, type ExpectationResultMap, type ExpectationType, type FieldRemovalSanitizer, type HttpMCPConfig, type IterationResult, type Judge, type JudgeConfig, type JudgeMatcherOptions, type JudgeResult, type JudgeValidatorConfig, type LLMHostConfig, type LLMHostSimulationResult, type LLMHostSimulator, type LLMProvider, type LLMToolCall, type LoadDatasetOptions, type MCPAuthConfig, type MCPAuthFixtures, type MCPClientCredentialsConfig, type MCPConfig, MCPConfigSchema, type MCPConformanceCheck, type MCPConformanceOptions, type MCPConformanceRaw, type MCPConformanceResult, type MCPConformanceResultData, type MCPEvalData, type MCPEvalHistoricalSummary, type MCPEvalReporterConfig, type MCPEvalRunData, type MCPFixtureApi, type MCPFixtureOptions, type MCPHostCapabilities, type MCPOAuthConfig, type MCPServerCapabilitiesData, MCP_PROTOCOL_VERSION, type NormalizedToolResponse, type OAuthSetupConfig, type PatternValidatorOptions, PlaywrightOAuthClientProvider, type PlaywrightOAuthClientProviderConfig, type PredicateResult, type ProtectedResourceDiscoveryResult, type ProtectedResourceMetadata, type ProviderKind, type RegexSanitizer, type ResultSource, type RubricSpec, type SchemaRegistry, type SchemaValidatorOptions, type SerializedEvalDataset, type ServerComparisonOptions, type ServerComparisonResult, type SizeValidatorOptions, type SnapshotSanitizer, SnapshotSanitizers, type StdioMCPConfig, type StoredClientInfo, type StoredOAuthState, type 
StoredServerMetadata, type StoredTokens, type TextValidatorOptions, type TokenResult, type ToolCallCountOptions, type ToolCallExpectation, type ToolPredicate, type UsageMetrics, type ValidationResult, closeMCPClient, createJudge, createMCPClientForConfig, createMCPFixture, createTokenAuthHeaders, discoverAuthorizationServer, discoverProtectedResource, expect, extractText, getMissingDependencyMessage, getResponseSizeBytes, hasValidTokens, injectTokens, isBuiltInRubric, isHttpConfig, isProviderAvailable, isStdioConfig, isTokenExpired, isTokenExpiringSoon, loadBaseline, loadEvalDataset, loadEvalDatasetFromObject, loadTokens, loadTokensFromEnv, test as mcpAuthTest, normalizeToolResponse, normalizeWhitespace, performClientCredentialsFlow, performOAuthSetup, performOAuthSetupIfNeeded, resolveRubric, runConformanceChecks, runEvalCase, runEvalDataset, runServerComparison, saveBaseline, simulateLLMHost, test$1 as test, validateAccessToken, validateError, validateEvalCase, validateEvalDataset, validateJudge, validateMCPConfig, validatePattern, validateResponse, validateSchema, validateSize, validateText, validateToolCallCount, validateToolCalls };
|