@gleanwork/mcp-server-tester 1.0.0-beta.2 → 1.0.0-beta.4
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/README.md +24 -22
- package/dist/cli/index.js +38 -12
- package/dist/fixtures/mcp.d.ts +14 -6
- package/dist/fixtures/mcp.js +9 -6
- package/dist/fixtures/mcp.js.map +1 -1
- package/dist/index.cjs +69 -47
- package/dist/index.cjs.map +1 -1
- package/dist/index.d.cts +208 -1175
- package/dist/index.d.ts +208 -1175
- package/dist/index.js +69 -47
- package/dist/index.js.map +1 -1
- package/dist/reporters/mcpReporter.cjs.map +1 -1
- package/dist/reporters/mcpReporter.js.map +1 -1
- package/dist/reporters/ui-dist/app.js +107 -7
- package/dist/reporters/ui-dist/styles.css +1 -1
- package/package.json +11 -8
- package/src/reporters/ui-dist/app.js +0 -174
- package/src/reporters/ui-dist/index.html +0 -28
- package/src/reporters/ui-dist/styles.css +0 -1
package/dist/index.d.ts
CHANGED
|
@@ -233,314 +233,64 @@ type MCPConfig = StdioMCPConfig | HttpMCPConfig;
|
|
|
233
233
|
/**
|
|
234
234
|
* Union schema for MCPConfig (validates based on transport type)
|
|
235
235
|
*/
|
|
236
|
-
declare const MCPConfigSchema: z.ZodDiscriminatedUnion<"transport", [z.ZodObject<{
|
|
236
|
+
declare const MCPConfigSchema: z.ZodDiscriminatedUnion<[z.ZodObject<{
|
|
237
237
|
transport: z.ZodLiteral<"stdio">;
|
|
238
238
|
command: z.ZodString;
|
|
239
|
-
args: z.ZodOptional<z.ZodArray<z.ZodString, "many">>;
|
|
239
|
+
args: z.ZodOptional<z.ZodArray<z.ZodString>>;
|
|
240
240
|
cwd: z.ZodOptional<z.ZodString>;
|
|
241
241
|
env: z.ZodOptional<z.ZodRecord<z.ZodString, z.ZodString>>;
|
|
242
242
|
capabilities: z.ZodOptional<z.ZodObject<{
|
|
243
243
|
sampling: z.ZodOptional<z.ZodRecord<z.ZodString, z.ZodUnknown>>;
|
|
244
244
|
roots: z.ZodOptional<z.ZodObject<{
|
|
245
245
|
listChanged: z.ZodBoolean;
|
|
246
|
-
}, "strip", z.ZodTypeAny, {
|
|
247
|
-
listChanged: boolean;
|
|
248
|
-
}, {
|
|
249
|
-
listChanged: boolean;
|
|
250
|
-
}>>;
|
|
251
|
-
}, "strip", z.ZodTypeAny, {
|
|
252
|
-
sampling?: Record<string, unknown> | undefined;
|
|
253
|
-
roots?: {
|
|
254
|
-
listChanged: boolean;
|
|
255
|
-
} | undefined;
|
|
256
|
-
}, {
|
|
257
|
-
sampling?: Record<string, unknown> | undefined;
|
|
258
|
-
roots?: {
|
|
259
|
-
listChanged: boolean;
|
|
260
|
-
} | undefined;
|
|
261
|
-
}>>;
|
|
246
|
+
}, z.core.$strip>>;
|
|
247
|
+
}, z.core.$strip>>;
|
|
262
248
|
connectTimeoutMs: z.ZodOptional<z.ZodNumber>;
|
|
263
249
|
requestTimeoutMs: z.ZodOptional<z.ZodNumber>;
|
|
264
250
|
callTimeoutMs: z.ZodOptional<z.ZodNumber>;
|
|
265
251
|
quiet: z.ZodOptional<z.ZodBoolean>;
|
|
266
|
-
}, "strip", z.ZodTypeAny, {
|
|
267
|
-
transport: "stdio";
|
|
268
|
-
command: string;
|
|
269
|
-
args?: string[] | undefined;
|
|
270
|
-
cwd?: string | undefined;
|
|
271
|
-
env?: Record<string, string> | undefined;
|
|
272
|
-
capabilities?: {
|
|
273
|
-
sampling?: Record<string, unknown> | undefined;
|
|
274
|
-
roots?: {
|
|
275
|
-
listChanged: boolean;
|
|
276
|
-
} | undefined;
|
|
277
|
-
} | undefined;
|
|
278
|
-
connectTimeoutMs?: number | undefined;
|
|
279
|
-
requestTimeoutMs?: number | undefined;
|
|
280
|
-
callTimeoutMs?: number | undefined;
|
|
281
|
-
quiet?: boolean | undefined;
|
|
282
|
-
}, {
|
|
283
|
-
transport: "stdio";
|
|
284
|
-
command: string;
|
|
285
|
-
args?: string[] | undefined;
|
|
286
|
-
cwd?: string | undefined;
|
|
287
|
-
env?: Record<string, string> | undefined;
|
|
288
|
-
capabilities?: {
|
|
289
|
-
sampling?: Record<string, unknown> | undefined;
|
|
290
|
-
roots?: {
|
|
291
|
-
listChanged: boolean;
|
|
292
|
-
} | undefined;
|
|
293
|
-
} | undefined;
|
|
294
|
-
connectTimeoutMs?: number | undefined;
|
|
295
|
-
requestTimeoutMs?: number | undefined;
|
|
296
|
-
callTimeoutMs?: number | undefined;
|
|
297
|
-
quiet?: boolean | undefined;
|
|
298
|
-
}>, z.ZodObject<{
|
|
252
|
+
}, z.core.$strip>, z.ZodObject<{
|
|
299
253
|
transport: z.ZodLiteral<"http">;
|
|
300
|
-
serverUrl: z.
|
|
254
|
+
serverUrl: z.ZodString;
|
|
301
255
|
headers: z.ZodOptional<z.ZodRecord<z.ZodString, z.ZodString>>;
|
|
302
256
|
capabilities: z.ZodOptional<z.ZodObject<{
|
|
303
257
|
sampling: z.ZodOptional<z.ZodRecord<z.ZodString, z.ZodUnknown>>;
|
|
304
258
|
roots: z.ZodOptional<z.ZodObject<{
|
|
305
259
|
listChanged: z.ZodBoolean;
|
|
306
|
-
}, "strip", z.ZodTypeAny, {
|
|
307
|
-
listChanged: boolean;
|
|
308
|
-
}, {
|
|
309
|
-
listChanged: boolean;
|
|
310
|
-
}>>;
|
|
311
|
-
}, "strip", z.ZodTypeAny, {
|
|
312
|
-
sampling?: Record<string, unknown> | undefined;
|
|
313
|
-
roots?: {
|
|
314
|
-
listChanged: boolean;
|
|
315
|
-
} | undefined;
|
|
316
|
-
}, {
|
|
317
|
-
sampling?: Record<string, unknown> | undefined;
|
|
318
|
-
roots?: {
|
|
319
|
-
listChanged: boolean;
|
|
320
|
-
} | undefined;
|
|
321
|
-
}>>;
|
|
260
|
+
}, z.core.$strip>>;
|
|
261
|
+
}, z.core.$strip>>;
|
|
322
262
|
connectTimeoutMs: z.ZodOptional<z.ZodNumber>;
|
|
323
263
|
requestTimeoutMs: z.ZodOptional<z.ZodNumber>;
|
|
324
264
|
callTimeoutMs: z.ZodOptional<z.ZodNumber>;
|
|
325
|
-
auth: z.ZodOptional<z.ZodEffects<z.ZodObject<{
|
|
265
|
+
auth: z.ZodOptional<z.ZodObject<{
|
|
326
266
|
accessToken: z.ZodOptional<z.ZodString>;
|
|
327
267
|
oauth: z.ZodOptional<z.ZodObject<{
|
|
328
268
|
serverUrl: z.ZodString;
|
|
329
|
-
scopes: z.ZodOptional<z.ZodArray<z.ZodString, "many">>;
|
|
269
|
+
scopes: z.ZodOptional<z.ZodArray<z.ZodString>>;
|
|
330
270
|
resource: z.ZodOptional<z.ZodString>;
|
|
331
271
|
authStatePath: z.ZodOptional<z.ZodString>;
|
|
332
272
|
clientId: z.ZodOptional<z.ZodString>;
|
|
333
273
|
clientSecret: z.ZodOptional<z.ZodString>;
|
|
334
274
|
redirectUri: z.ZodOptional<z.ZodString>;
|
|
335
|
-
}, "strip", z.ZodTypeAny, {
|
|
336
|
-
serverUrl: string;
|
|
337
|
-
scopes?: string[] | undefined;
|
|
338
|
-
resource?: string | undefined;
|
|
339
|
-
authStatePath?: string | undefined;
|
|
340
|
-
clientId?: string | undefined;
|
|
341
|
-
clientSecret?: string | undefined;
|
|
342
|
-
redirectUri?: string | undefined;
|
|
343
|
-
}, {
|
|
344
|
-
serverUrl: string;
|
|
345
|
-
scopes?: string[] | undefined;
|
|
346
|
-
resource?: string | undefined;
|
|
347
|
-
authStatePath?: string | undefined;
|
|
348
|
-
clientId?: string | undefined;
|
|
349
|
-
clientSecret?: string | undefined;
|
|
350
|
-
redirectUri?: string | undefined;
|
|
351
|
-
}>>;
|
|
275
|
+
}, z.core.$strip>>;
|
|
352
276
|
clientCredentials: z.ZodOptional<z.ZodObject<{
|
|
353
277
|
clientId: z.ZodOptional<z.ZodString>;
|
|
354
278
|
clientSecret: z.ZodOptional<z.ZodString>;
|
|
355
279
|
tokenEndpoint: z.ZodOptional<z.ZodString>;
|
|
356
|
-
scopes: z.ZodOptional<z.ZodArray<z.ZodString, "many">>;
|
|
357
|
-
}, "strip", z.ZodTypeAny, {
|
|
358
|
-
scopes?: string[] | undefined;
|
|
359
|
-
clientId?: string | undefined;
|
|
360
|
-
clientSecret?: string | undefined;
|
|
361
|
-
tokenEndpoint?: string | undefined;
|
|
362
|
-
}, {
|
|
363
|
-
scopes?: string[] | undefined;
|
|
364
|
-
clientId?: string | undefined;
|
|
365
|
-
clientSecret?: string | undefined;
|
|
366
|
-
tokenEndpoint?: string | undefined;
|
|
367
|
-
}>>;
|
|
368
|
-
}, "strip", z.ZodTypeAny, {
|
|
369
|
-
accessToken?: string | undefined;
|
|
370
|
-
oauth?: {
|
|
371
|
-
serverUrl: string;
|
|
372
|
-
scopes?: string[] | undefined;
|
|
373
|
-
resource?: string | undefined;
|
|
374
|
-
authStatePath?: string | undefined;
|
|
375
|
-
clientId?: string | undefined;
|
|
376
|
-
clientSecret?: string | undefined;
|
|
377
|
-
redirectUri?: string | undefined;
|
|
378
|
-
} | undefined;
|
|
379
|
-
clientCredentials?: {
|
|
380
|
-
scopes?: string[] | undefined;
|
|
381
|
-
clientId?: string | undefined;
|
|
382
|
-
clientSecret?: string | undefined;
|
|
383
|
-
tokenEndpoint?: string | undefined;
|
|
384
|
-
} | undefined;
|
|
385
|
-
}, {
|
|
386
|
-
accessToken?: string | undefined;
|
|
387
|
-
oauth?: {
|
|
388
|
-
serverUrl: string;
|
|
389
|
-
scopes?: string[] | undefined;
|
|
390
|
-
resource?: string | undefined;
|
|
391
|
-
authStatePath?: string | undefined;
|
|
392
|
-
clientId?: string | undefined;
|
|
393
|
-
clientSecret?: string | undefined;
|
|
394
|
-
redirectUri?: string | undefined;
|
|
395
|
-
} | undefined;
|
|
396
|
-
clientCredentials?: {
|
|
397
|
-
scopes?: string[] | undefined;
|
|
398
|
-
clientId?: string | undefined;
|
|
399
|
-
clientSecret?: string | undefined;
|
|
400
|
-
tokenEndpoint?: string | undefined;
|
|
401
|
-
} | undefined;
|
|
402
|
-
}>, {
|
|
403
|
-
accessToken?: string | undefined;
|
|
404
|
-
oauth?: {
|
|
405
|
-
serverUrl: string;
|
|
406
|
-
scopes?: string[] | undefined;
|
|
407
|
-
resource?: string | undefined;
|
|
408
|
-
authStatePath?: string | undefined;
|
|
409
|
-
clientId?: string | undefined;
|
|
410
|
-
clientSecret?: string | undefined;
|
|
411
|
-
redirectUri?: string | undefined;
|
|
412
|
-
} | undefined;
|
|
413
|
-
clientCredentials?: {
|
|
414
|
-
scopes?: string[] | undefined;
|
|
415
|
-
clientId?: string | undefined;
|
|
416
|
-
clientSecret?: string | undefined;
|
|
417
|
-
tokenEndpoint?: string | undefined;
|
|
418
|
-
} | undefined;
|
|
419
|
-
}, {
|
|
420
|
-
accessToken?: string | undefined;
|
|
421
|
-
oauth?: {
|
|
422
|
-
serverUrl: string;
|
|
423
|
-
scopes?: string[] | undefined;
|
|
424
|
-
resource?: string | undefined;
|
|
425
|
-
authStatePath?: string | undefined;
|
|
426
|
-
clientId?: string | undefined;
|
|
427
|
-
clientSecret?: string | undefined;
|
|
428
|
-
redirectUri?: string | undefined;
|
|
429
|
-
} | undefined;
|
|
430
|
-
clientCredentials?: {
|
|
431
|
-
scopes?: string[] | undefined;
|
|
432
|
-
clientId?: string | undefined;
|
|
433
|
-
clientSecret?: string | undefined;
|
|
434
|
-
tokenEndpoint?: string | undefined;
|
|
435
|
-
} | undefined;
|
|
436
|
-
}>>;
|
|
280
|
+
scopes: z.ZodOptional<z.ZodArray<z.ZodString>>;
|
|
281
|
+
}, z.core.$strip>>;
|
|
282
|
+
}, z.core.$strip>>;
|
|
437
283
|
proxy: z.ZodOptional<z.ZodObject<{
|
|
438
284
|
url: z.ZodString;
|
|
439
|
-
}, "strip", z.ZodTypeAny, {
|
|
440
|
-
url: string;
|
|
441
|
-
}, {
|
|
442
|
-
url: string;
|
|
443
|
-
}>>;
|
|
285
|
+
}, z.core.$strip>>;
|
|
444
286
|
retryAttempts: z.ZodOptional<z.ZodNumber>;
|
|
445
287
|
tls: z.ZodOptional<z.ZodObject<{
|
|
446
288
|
ca: z.ZodOptional<z.ZodString>;
|
|
447
289
|
cert: z.ZodOptional<z.ZodString>;
|
|
448
290
|
key: z.ZodOptional<z.ZodString>;
|
|
449
291
|
rejectUnauthorized: z.ZodOptional<z.ZodBoolean>;
|
|
450
|
-
}, "strip", z.ZodTypeAny, {
|
|
451
|
-
ca?: string | undefined;
|
|
452
|
-
cert?: string | undefined;
|
|
453
|
-
key?: string | undefined;
|
|
454
|
-
rejectUnauthorized?: boolean | undefined;
|
|
455
|
-
}, {
|
|
456
|
-
ca?: string | undefined;
|
|
457
|
-
cert?: string | undefined;
|
|
458
|
-
key?: string | undefined;
|
|
459
|
-
rejectUnauthorized?: boolean | undefined;
|
|
460
|
-
}>>;
|
|
461
|
-
}, "strip", z.ZodTypeAny, {
|
|
462
|
-
serverUrl: string;
|
|
463
|
-
transport: "http";
|
|
464
|
-
capabilities?: {
|
|
465
|
-
sampling?: Record<string, unknown> | undefined;
|
|
466
|
-
roots?: {
|
|
467
|
-
listChanged: boolean;
|
|
468
|
-
} | undefined;
|
|
469
|
-
} | undefined;
|
|
470
|
-
connectTimeoutMs?: number | undefined;
|
|
471
|
-
requestTimeoutMs?: number | undefined;
|
|
472
|
-
callTimeoutMs?: number | undefined;
|
|
473
|
-
headers?: Record<string, string> | undefined;
|
|
474
|
-
auth?: {
|
|
475
|
-
accessToken?: string | undefined;
|
|
476
|
-
oauth?: {
|
|
477
|
-
serverUrl: string;
|
|
478
|
-
scopes?: string[] | undefined;
|
|
479
|
-
resource?: string | undefined;
|
|
480
|
-
authStatePath?: string | undefined;
|
|
481
|
-
clientId?: string | undefined;
|
|
482
|
-
clientSecret?: string | undefined;
|
|
483
|
-
redirectUri?: string | undefined;
|
|
484
|
-
} | undefined;
|
|
485
|
-
clientCredentials?: {
|
|
486
|
-
scopes?: string[] | undefined;
|
|
487
|
-
clientId?: string | undefined;
|
|
488
|
-
clientSecret?: string | undefined;
|
|
489
|
-
tokenEndpoint?: string | undefined;
|
|
490
|
-
} | undefined;
|
|
491
|
-
} | undefined;
|
|
492
|
-
proxy?: {
|
|
493
|
-
url: string;
|
|
494
|
-
} | undefined;
|
|
495
|
-
retryAttempts?: number | undefined;
|
|
496
|
-
tls?: {
|
|
497
|
-
ca?: string | undefined;
|
|
498
|
-
cert?: string | undefined;
|
|
499
|
-
key?: string | undefined;
|
|
500
|
-
rejectUnauthorized?: boolean | undefined;
|
|
501
|
-
} | undefined;
|
|
502
|
-
}, {
|
|
503
|
-
serverUrl: string;
|
|
504
|
-
transport: "http";
|
|
505
|
-
capabilities?: {
|
|
506
|
-
sampling?: Record<string, unknown> | undefined;
|
|
507
|
-
roots?: {
|
|
508
|
-
listChanged: boolean;
|
|
509
|
-
} | undefined;
|
|
510
|
-
} | undefined;
|
|
511
|
-
connectTimeoutMs?: number | undefined;
|
|
512
|
-
requestTimeoutMs?: number | undefined;
|
|
513
|
-
callTimeoutMs?: number | undefined;
|
|
514
|
-
headers?: Record<string, string> | undefined;
|
|
515
|
-
auth?: {
|
|
516
|
-
accessToken?: string | undefined;
|
|
517
|
-
oauth?: {
|
|
518
|
-
serverUrl: string;
|
|
519
|
-
scopes?: string[] | undefined;
|
|
520
|
-
resource?: string | undefined;
|
|
521
|
-
authStatePath?: string | undefined;
|
|
522
|
-
clientId?: string | undefined;
|
|
523
|
-
clientSecret?: string | undefined;
|
|
524
|
-
redirectUri?: string | undefined;
|
|
525
|
-
} | undefined;
|
|
526
|
-
clientCredentials?: {
|
|
527
|
-
scopes?: string[] | undefined;
|
|
528
|
-
clientId?: string | undefined;
|
|
529
|
-
clientSecret?: string | undefined;
|
|
530
|
-
tokenEndpoint?: string | undefined;
|
|
531
|
-
} | undefined;
|
|
532
|
-
} | undefined;
|
|
533
|
-
proxy?: {
|
|
534
|
-
url: string;
|
|
535
|
-
} | undefined;
|
|
536
|
-
retryAttempts?: number | undefined;
|
|
537
|
-
tls?: {
|
|
538
|
-
ca?: string | undefined;
|
|
539
|
-
cert?: string | undefined;
|
|
540
|
-
key?: string | undefined;
|
|
541
|
-
rejectUnauthorized?: boolean | undefined;
|
|
542
|
-
} | undefined;
|
|
543
|
-
}>]>;
|
|
292
|
+
}, z.core.$strip>>;
|
|
293
|
+
}, z.core.$strip>], "transport">;
|
|
544
294
|
/**
|
|
545
295
|
* Validates an MCPConfig object
|
|
546
296
|
*
|
|
@@ -1790,9 +1540,9 @@ declare function validateError(response: unknown, expected?: boolean | string |
|
|
|
1790
1540
|
declare function validateSize(response: unknown, options: SizeValidatorOptions): ValidationResult;
|
|
1791
1541
|
|
|
1792
1542
|
/**
|
|
1793
|
-
* Tool call validators for
|
|
1543
|
+
* Tool call validators for mcp_host simulation results.
|
|
1794
1544
|
*
|
|
1795
|
-
* These validators extract the tool call trace from an
|
|
1545
|
+
* These validators extract the tool call trace from an MCPHostSimulationResult
|
|
1796
1546
|
* and apply assertions against expected call lists and counts.
|
|
1797
1547
|
*/
|
|
1798
1548
|
|
|
@@ -1811,16 +1561,16 @@ interface ToolCallCountOptions {
|
|
|
1811
1561
|
exact?: number;
|
|
1812
1562
|
}
|
|
1813
1563
|
/**
|
|
1814
|
-
* Validates tool calls made during an
|
|
1564
|
+
* Validates tool calls made during an MCP host simulation.
|
|
1815
1565
|
*
|
|
1816
|
-
* @param response - Must be an
|
|
1566
|
+
* @param response - Must be an MCPHostSimulationResult (from mcp_host mode)
|
|
1817
1567
|
* @param expectation - Expected tool call specification
|
|
1818
1568
|
*/
|
|
1819
1569
|
declare function validateToolCalls(response: unknown, expectation: ToolCallExpectation): ValidationResult;
|
|
1820
1570
|
/**
|
|
1821
|
-
* Validates the number of tool calls made during an
|
|
1571
|
+
* Validates the number of tool calls made during an MCP host simulation.
|
|
1822
1572
|
*
|
|
1823
|
-
* @param response - Must be an
|
|
1573
|
+
* @param response - Must be an MCPHostSimulationResult (from mcp_host mode)
|
|
1824
1574
|
* @param options - Count constraints (min, max, exact)
|
|
1825
1575
|
*/
|
|
1826
1576
|
declare function validateToolCallCount(response: unknown, options: ToolCallCountOptions): ValidationResult;
|
|
@@ -2223,7 +1973,7 @@ declare global {
|
|
|
2223
1973
|
*/
|
|
2224
1974
|
toSatisfyToolPredicate(predicate: ToolPredicate, description?: string): Promise<R>;
|
|
2225
1975
|
/**
|
|
2226
|
-
* Validates which tools the LLM called during
|
|
1976
|
+
* Validates which tools the LLM called during a mcp_host simulation.
|
|
2227
1977
|
*
|
|
2228
1978
|
* @example
|
|
2229
1979
|
* ```typescript
|
|
@@ -2235,7 +1985,7 @@ declare global {
|
|
|
2235
1985
|
*/
|
|
2236
1986
|
toHaveToolCalls(expectation: ToolCallExpectation): R;
|
|
2237
1987
|
/**
|
|
2238
|
-
* Validates the number of tool calls made during
|
|
1988
|
+
* Validates the number of tool calls made during a mcp_host simulation.
|
|
2239
1989
|
*
|
|
2240
1990
|
* @example
|
|
2241
1991
|
* ```typescript
|
|
@@ -2486,6 +2236,14 @@ declare function toMatchToolPattern(this: {
|
|
|
2486
2236
|
/**
|
|
2487
2237
|
* Creates the toMatchToolSnapshot matcher function
|
|
2488
2238
|
*
|
|
2239
|
+
* @remarks
|
|
2240
|
+
* **Requires Playwright test context.** This matcher calls `expect(content).toMatchSnapshot()`
|
|
2241
|
+
* internally, which only works inside a Playwright test (i.e., when `testInfo` is available).
|
|
2242
|
+
* Calling it outside a Playwright test will throw a cryptic context error.
|
|
2243
|
+
*
|
|
2244
|
+
* To test sanitizer logic without a Playwright context, use the exported `applySanitizers`
|
|
2245
|
+
* function directly.
|
|
2246
|
+
*
|
|
2489
2247
|
* Note: This is an async matcher that uses Playwright's snapshot testing.
|
|
2490
2248
|
*/
|
|
2491
2249
|
declare function toMatchToolSnapshot(this: {
|
|
@@ -2595,7 +2353,7 @@ declare function toSatisfyToolPredicate(this: {
|
|
|
2595
2353
|
/**
|
|
2596
2354
|
* toHaveToolCalls Matcher
|
|
2597
2355
|
*
|
|
2598
|
-
* Validates which tools the LLM called during
|
|
2356
|
+
* Validates which tools the LLM called during a mcp_host simulation.
|
|
2599
2357
|
*/
|
|
2600
2358
|
|
|
2601
2359
|
/**
|
|
@@ -2611,7 +2369,7 @@ declare function toHaveToolCalls(this: {
|
|
|
2611
2369
|
/**
|
|
2612
2370
|
* toHaveToolCallCount Matcher
|
|
2613
2371
|
*
|
|
2614
|
-
* Validates the number of tool calls made during
|
|
2372
|
+
* Validates the number of tool calls made during a mcp_host simulation.
|
|
2615
2373
|
*/
|
|
2616
2374
|
|
|
2617
2375
|
/**
|
|
@@ -2720,9 +2478,9 @@ interface MCPAuthFixtures {
|
|
|
2720
2478
|
declare const test: playwright_test.TestType<playwright_test.PlaywrightTestArgs & playwright_test.PlaywrightTestOptions & MCPAuthFixtures, playwright_test.PlaywrightWorkerArgs & playwright_test.PlaywrightWorkerOptions>;
|
|
2721
2479
|
|
|
2722
2480
|
/**
|
|
2723
|
-
* Types and interfaces for
|
|
2481
|
+
* Types and interfaces for MCP host simulation mode
|
|
2724
2482
|
*
|
|
2725
|
-
* This module provides types for testing MCP servers through
|
|
2483
|
+
* This module provides types for testing MCP servers through MCP hosts,
|
|
2726
2484
|
* validating tool descriptions, parameter clarity, and discoverability.
|
|
2727
2485
|
*/
|
|
2728
2486
|
|
|
@@ -2751,9 +2509,9 @@ type LLMProvider = 'openai' | 'anthropic' | 'azure' | 'google' | 'mistral' | 'de
|
|
|
2751
2509
|
*/
|
|
2752
2510
|
| 'vertex-anthropic';
|
|
2753
2511
|
/**
|
|
2754
|
-
* Configuration for
|
|
2512
|
+
* Configuration for MCP host simulation
|
|
2755
2513
|
*/
|
|
2756
|
-
interface
|
|
2514
|
+
interface MCPHostConfig {
|
|
2757
2515
|
/**
|
|
2758
2516
|
* LLM provider to use
|
|
2759
2517
|
*/
|
|
@@ -2793,9 +2551,9 @@ interface LLMToolCall {
|
|
|
2793
2551
|
id?: string;
|
|
2794
2552
|
}
|
|
2795
2553
|
/**
|
|
2796
|
-
* Result from an
|
|
2554
|
+
* Result from an MCP host simulation
|
|
2797
2555
|
*/
|
|
2798
|
-
interface
|
|
2556
|
+
interface MCPHostSimulationResult {
|
|
2799
2557
|
/** Whether the simulation succeeded */
|
|
2800
2558
|
success: boolean;
|
|
2801
2559
|
/** Tool calls made by the LLM */
|
|
@@ -2823,33 +2581,33 @@ interface LLMHostSimulationResult {
|
|
|
2823
2581
|
mcpDurationMs?: number;
|
|
2824
2582
|
}
|
|
2825
2583
|
/**
|
|
2826
|
-
* Interface for
|
|
2584
|
+
* Interface for MCP host simulators.
|
|
2827
2585
|
*
|
|
2828
2586
|
* The only built-in implementation is the Vercel AI SDK orchestrator
|
|
2829
|
-
* (src/evals/
|
|
2587
|
+
* (src/evals/mcpHost/adapters/vercel.ts). Custom implementations can be
|
|
2830
2588
|
* created for specialised testing needs.
|
|
2831
2589
|
*/
|
|
2832
|
-
interface
|
|
2590
|
+
interface MCPHostSimulator {
|
|
2833
2591
|
/**
|
|
2834
|
-
* Simulates an
|
|
2592
|
+
* Simulates an MCP host interacting with an MCP server
|
|
2835
2593
|
*
|
|
2836
2594
|
* @param mcp - MCP fixture API
|
|
2837
2595
|
* @param scenario - Natural language prompt describing what the LLM should do
|
|
2838
|
-
* @param config -
|
|
2596
|
+
* @param config - MCP host configuration
|
|
2839
2597
|
* @returns Simulation result with tool calls and response
|
|
2840
2598
|
*/
|
|
2841
|
-
simulate(mcp: MCPFixtureApi, scenario: string, config:
|
|
2599
|
+
simulate(mcp: MCPFixtureApi, scenario: string, config: MCPHostConfig): Promise<MCPHostSimulationResult>;
|
|
2842
2600
|
}
|
|
2843
2601
|
|
|
2844
2602
|
/**
|
|
2845
2603
|
* Evaluation mode
|
|
2846
2604
|
*/
|
|
2847
|
-
type EvalMode = 'direct' | '
|
|
2605
|
+
type EvalMode = 'direct' | 'mcp_host';
|
|
2848
2606
|
/**
|
|
2849
2607
|
* A single eval test case
|
|
2850
2608
|
*
|
|
2851
2609
|
* For 'direct' mode: toolName and args are required
|
|
2852
|
-
* For '
|
|
2610
|
+
* For 'mcp_host' mode: scenario and mcpHostConfig are required
|
|
2853
2611
|
*/
|
|
2854
2612
|
interface EvalCase {
|
|
2855
2613
|
/**
|
|
@@ -2863,40 +2621,40 @@ interface EvalCase {
|
|
|
2863
2621
|
/**
|
|
2864
2622
|
* Evaluation mode
|
|
2865
2623
|
* - 'direct': Direct API calls to MCP tools (default)
|
|
2866
|
-
* - '
|
|
2624
|
+
* - 'mcp_host': LLM-driven tool selection via natural language
|
|
2867
2625
|
*
|
|
2868
2626
|
* @default 'direct'
|
|
2869
2627
|
*/
|
|
2870
2628
|
mode?: EvalMode;
|
|
2871
2629
|
/**
|
|
2872
|
-
* Name of the MCP tool to call (required for 'direct' mode, optional for '
|
|
2630
|
+
* Name of the MCP tool to call (required for 'direct' mode, optional for 'mcp_host' mode)
|
|
2873
2631
|
*/
|
|
2874
2632
|
toolName?: string;
|
|
2875
2633
|
/**
|
|
2876
|
-
* Arguments to pass to the tool (required for 'direct' mode, optional for '
|
|
2634
|
+
* Arguments to pass to the tool (required for 'direct' mode, optional for 'mcp_host' mode)
|
|
2877
2635
|
*/
|
|
2878
2636
|
args?: Record<string, unknown>;
|
|
2879
2637
|
/**
|
|
2880
|
-
* Natural language scenario for LLM to execute (optional, required for '
|
|
2638
|
+
* Natural language scenario for LLM to execute (optional, required for 'mcp_host' mode)
|
|
2881
2639
|
*
|
|
2882
2640
|
* @example "Get the weather for London and tell me if I need an umbrella"
|
|
2883
2641
|
*/
|
|
2884
2642
|
scenario?: string;
|
|
2885
2643
|
/**
|
|
2886
|
-
*
|
|
2644
|
+
* MCP host configuration (optional for 'mcp_host' mode)
|
|
2887
2645
|
*
|
|
2888
2646
|
* If not specified, uses default configuration from test environment
|
|
2889
2647
|
*/
|
|
2890
|
-
|
|
2648
|
+
mcpHostConfig?: MCPHostConfig;
|
|
2891
2649
|
/**
|
|
2892
2650
|
* Additional metadata for this test case
|
|
2893
2651
|
*
|
|
2894
|
-
* For '
|
|
2652
|
+
* For 'mcp_host' mode, can include 'expectedToolCalls' for validation
|
|
2895
2653
|
*/
|
|
2896
2654
|
metadata?: Record<string, unknown>;
|
|
2897
2655
|
/**
|
|
2898
|
-
* Number of times to run this case and compute an
|
|
2899
|
-
* When > 1, `EvalCaseResult.
|
|
2656
|
+
* Number of times to run this case and compute an assertion pass rate.
|
|
2657
|
+
* When > 1, `EvalCaseResult.assertionPassRate` is populated and `pass` is determined
|
|
2900
2658
|
* by `accuracyThreshold` rather than a single run.
|
|
2901
2659
|
* @default 1
|
|
2902
2660
|
*/
|
|
@@ -3027,8 +2785,8 @@ interface EvalExpectBlock {
|
|
|
3027
2785
|
minBytes?: number;
|
|
3028
2786
|
};
|
|
3029
2787
|
/**
|
|
3030
|
-
* Asserts which tools the LLM called during
|
|
3031
|
-
* Only meaningful for
|
|
2788
|
+
* Asserts which tools the LLM called during a mcp_host simulation.
|
|
2789
|
+
* Only meaningful for mcp_host mode — direct mode has no tool call trace.
|
|
3032
2790
|
*/
|
|
3033
2791
|
toolsTriggered?: {
|
|
3034
2792
|
/** Expected tool calls */
|
|
@@ -3049,7 +2807,7 @@ interface EvalExpectBlock {
|
|
|
3049
2807
|
exclusive?: boolean;
|
|
3050
2808
|
};
|
|
3051
2809
|
/**
|
|
3052
|
-
* Asserts the number of tool calls made during
|
|
2810
|
+
* Asserts the number of tool calls made during a mcp_host simulation.
|
|
3053
2811
|
*/
|
|
3054
2812
|
toolCallCount?: {
|
|
3055
2813
|
/** Minimum number of tool calls */
|
|
@@ -3088,399 +2846,109 @@ interface EvalDataset {
|
|
|
3088
2846
|
/**
|
|
3089
2847
|
* Zod schema for EvalCase
|
|
3090
2848
|
*
|
|
3091
|
-
* toolName and args are optional for
|
|
2849
|
+
* toolName and args are optional for mcp_host mode (which uses scenario instead)
|
|
3092
2850
|
*/
|
|
3093
2851
|
declare const EvalCaseSchema: z.ZodObject<{
|
|
3094
2852
|
id: z.ZodString;
|
|
3095
2853
|
description: z.ZodOptional<z.ZodString>;
|
|
3096
|
-
mode: z.ZodOptional<z.ZodEnum<
|
|
2854
|
+
mode: z.ZodOptional<z.ZodEnum<{
|
|
2855
|
+
direct: "direct";
|
|
2856
|
+
mcp_host: "mcp_host";
|
|
2857
|
+
}>>;
|
|
3097
2858
|
toolName: z.ZodOptional<z.ZodString>;
|
|
3098
2859
|
args: z.ZodOptional<z.ZodRecord<z.ZodString, z.ZodUnknown>>;
|
|
3099
2860
|
scenario: z.ZodOptional<z.ZodString>;
|
|
3100
|
-
|
|
3101
|
-
provider: z.ZodEnum<
|
|
2861
|
+
mcpHostConfig: z.ZodOptional<z.ZodObject<{
|
|
2862
|
+
provider: z.ZodEnum<{
|
|
2863
|
+
openai: "openai";
|
|
2864
|
+
anthropic: "anthropic";
|
|
2865
|
+
azure: "azure";
|
|
2866
|
+
google: "google";
|
|
2867
|
+
mistral: "mistral";
|
|
2868
|
+
deepseek: "deepseek";
|
|
2869
|
+
openrouter: "openrouter";
|
|
2870
|
+
xai: "xai";
|
|
2871
|
+
"vertex-anthropic": "vertex-anthropic";
|
|
2872
|
+
}>;
|
|
3102
2873
|
apiKeyEnvVar: z.ZodOptional<z.ZodString>;
|
|
3103
2874
|
model: z.ZodOptional<z.ZodString>;
|
|
3104
2875
|
maxTokens: z.ZodOptional<z.ZodNumber>;
|
|
3105
2876
|
temperature: z.ZodOptional<z.ZodNumber>;
|
|
3106
2877
|
maxToolCalls: z.ZodOptional<z.ZodNumber>;
|
|
3107
|
-
}, "strip", z.ZodTypeAny, {
|
|
3108
|
-
provider: "openai" | "anthropic" | "azure" | "google" | "mistral" | "deepseek" | "openrouter" | "xai" | "vertex-anthropic";
|
|
3109
|
-
model?: string | undefined;
|
|
3110
|
-
maxTokens?: number | undefined;
|
|
3111
|
-
apiKeyEnvVar?: string | undefined;
|
|
3112
|
-
temperature?: number | undefined;
|
|
3113
|
-
maxToolCalls?: number | undefined;
|
|
3114
|
-
}, {
|
|
3115
|
-
provider: "openai" | "anthropic" | "azure" | "google" | "mistral" | "deepseek" | "openrouter" | "xai" | "vertex-anthropic";
|
|
3116
|
-
model?: string | undefined;
|
|
3117
|
-
maxTokens?: number | undefined;
|
|
3118
|
-
apiKeyEnvVar?: string | undefined;
|
|
3119
|
-
temperature?: number | undefined;
|
|
3120
|
-
maxToolCalls?: number | undefined;
|
|
3121
|
-
}>>;
|
|
2878
|
+
}, z.core.$strip>>;
|
|
3122
2879
|
metadata: z.ZodOptional<z.ZodRecord<z.ZodString, z.ZodUnknown>>;
|
|
3123
2880
|
iterations: z.ZodOptional<z.ZodNumber>;
|
|
3124
2881
|
accuracyThreshold: z.ZodOptional<z.ZodNumber>;
|
|
3125
2882
|
judgeReps: z.ZodOptional<z.ZodNumber>;
|
|
3126
2883
|
canonicalAnswer: z.ZodOptional<z.ZodString>;
|
|
3127
|
-
tags: z.ZodOptional<z.ZodArray<z.ZodString, "many">>;
|
|
2884
|
+
tags: z.ZodOptional<z.ZodArray<z.ZodString>>;
|
|
3128
2885
|
expect: z.ZodOptional<z.ZodObject<{
|
|
3129
2886
|
response: z.ZodOptional<z.ZodUnknown>;
|
|
3130
2887
|
schema: z.ZodOptional<z.ZodString>;
|
|
3131
|
-
containsText: z.ZodOptional<z.ZodUnion<[z.ZodString, z.ZodArray<z.ZodString, "many">]>>;
|
|
3132
|
-
matchesPattern: z.ZodOptional<z.ZodUnion<[z.ZodString, z.ZodArray<z.ZodString, "many">]>>;
|
|
2888
|
+
containsText: z.ZodOptional<z.ZodUnion<readonly [z.ZodString, z.ZodArray<z.ZodString>]>>;
|
|
2889
|
+
matchesPattern: z.ZodOptional<z.ZodUnion<readonly [z.ZodString, z.ZodArray<z.ZodString>]>>;
|
|
3133
2890
|
snapshot: z.ZodOptional<z.ZodString>;
|
|
3134
|
-
snapshotSanitizers: z.ZodOptional<z.ZodArray<z.ZodUnion<[z.ZodEnum<
|
|
2891
|
+
snapshotSanitizers: z.ZodOptional<z.ZodArray<z.ZodUnion<readonly [z.ZodEnum<{
|
|
2892
|
+
timestamp: "timestamp";
|
|
2893
|
+
uuid: "uuid";
|
|
2894
|
+
"iso-date": "iso-date";
|
|
2895
|
+
objectId: "objectId";
|
|
2896
|
+
jwt: "jwt";
|
|
2897
|
+
}>, z.ZodObject<{
|
|
3135
2898
|
pattern: z.ZodString;
|
|
3136
2899
|
replacement: z.ZodOptional<z.ZodString>;
|
|
3137
|
-
}, "strip", z.ZodTypeAny, {
|
|
3138
|
-
pattern: string;
|
|
3139
|
-
replacement?: string | undefined;
|
|
3140
|
-
}, {
|
|
3141
|
-
pattern: string;
|
|
3142
|
-
replacement?: string | undefined;
|
|
3143
|
-
}>, z.ZodObject<{
|
|
3144
|
-
remove: z.ZodArray<z.ZodString, "many">;
|
|
3145
|
-
}, "strip", z.ZodTypeAny, {
|
|
3146
|
-
remove: string[];
|
|
3147
|
-
}, {
|
|
3148
|
-
remove: string[];
|
|
3149
|
-
}>]>, "many">>;
|
|
3150
|
-
isError: z.ZodOptional<z.ZodUnion<[z.ZodBoolean, z.ZodString, z.ZodArray<z.ZodString, "many">]>>;
|
|
2900
|
+
}, z.core.$strip>, z.ZodObject<{
|
|
2901
|
+
remove: z.ZodArray<z.ZodString>;
|
|
2902
|
+
}, z.core.$strip>]>>>;
|
|
2903
|
+
isError: z.ZodOptional<z.ZodUnion<readonly [z.ZodBoolean, z.ZodString, z.ZodArray<z.ZodString>]>>;
|
|
3151
2904
|
passesJudge: z.ZodOptional<z.ZodObject<{
|
|
3152
|
-
rubric: z.ZodUnion<[z.ZodEnum<
|
|
2905
|
+
rubric: z.ZodUnion<readonly [z.ZodEnum<{
|
|
2906
|
+
correctness: "correctness";
|
|
2907
|
+
completeness: "completeness";
|
|
2908
|
+
groundedness: "groundedness";
|
|
2909
|
+
"instruction-following": "instruction-following";
|
|
2910
|
+
conciseness: "conciseness";
|
|
2911
|
+
}>, z.ZodObject<{
|
|
3153
2912
|
text: z.ZodString;
|
|
3154
|
-
}, "strip", z.ZodTypeAny, {
|
|
3155
|
-
text: string;
|
|
3156
|
-
}, {
|
|
3157
|
-
text: string;
|
|
3158
|
-
}>]>;
|
|
2913
|
+
}, z.core.$strip>]>;
|
|
3159
2914
|
reference: z.ZodOptional<z.ZodUnknown>;
|
|
3160
2915
|
threshold: z.ZodOptional<z.ZodNumber>;
|
|
3161
2916
|
reps: z.ZodOptional<z.ZodNumber>;
|
|
3162
|
-
provider: z.ZodOptional<z.ZodEnum<
|
|
2917
|
+
provider: z.ZodOptional<z.ZodEnum<{
|
|
2918
|
+
openai: "openai";
|
|
2919
|
+
anthropic: "anthropic";
|
|
2920
|
+
google: "google";
|
|
2921
|
+
}>>;
|
|
3163
2922
|
model: z.ZodOptional<z.ZodString>;
|
|
3164
2923
|
apiKeyEnvVar: z.ZodOptional<z.ZodString>;
|
|
3165
2924
|
maxTokens: z.ZodOptional<z.ZodNumber>;
|
|
3166
2925
|
temperature: z.ZodOptional<z.ZodNumber>;
|
|
3167
2926
|
maxBudgetUsd: z.ZodOptional<z.ZodNumber>;
|
|
3168
2927
|
maxToolOutputSize: z.ZodOptional<z.ZodNumber>;
|
|
3169
|
-
}, "strip", z.ZodTypeAny, {
|
|
3170
|
-
rubric: "correctness" | "completeness" | "groundedness" | "instruction-following" | "conciseness" | {
|
|
3171
|
-
text: string;
|
|
3172
|
-
};
|
|
3173
|
-
model?: string | undefined;
|
|
3174
|
-
maxTokens?: number | undefined;
|
|
3175
|
-
maxBudgetUsd?: number | undefined;
|
|
3176
|
-
reference?: unknown;
|
|
3177
|
-
threshold?: number | undefined;
|
|
3178
|
-
reps?: number | undefined;
|
|
3179
|
-
provider?: "openai" | "anthropic" | "google" | undefined;
|
|
3180
|
-
apiKeyEnvVar?: string | undefined;
|
|
3181
|
-
temperature?: number | undefined;
|
|
3182
|
-
maxToolOutputSize?: number | undefined;
|
|
3183
|
-
}, {
|
|
3184
|
-
rubric: "correctness" | "completeness" | "groundedness" | "instruction-following" | "conciseness" | {
|
|
3185
|
-
text: string;
|
|
3186
|
-
};
|
|
3187
|
-
model?: string | undefined;
|
|
3188
|
-
maxTokens?: number | undefined;
|
|
3189
|
-
maxBudgetUsd?: number | undefined;
|
|
3190
|
-
reference?: unknown;
|
|
3191
|
-
threshold?: number | undefined;
|
|
3192
|
-
reps?: number | undefined;
|
|
3193
|
-
provider?: "openai" | "anthropic" | "google" | undefined;
|
|
3194
|
-
apiKeyEnvVar?: string | undefined;
|
|
3195
|
-
temperature?: number | undefined;
|
|
3196
|
-
maxToolOutputSize?: number | undefined;
|
|
3197
|
-
}>>;
|
|
2928
|
+
}, z.core.$strip>>;
|
|
3198
2929
|
responseSize: z.ZodOptional<z.ZodObject<{
|
|
3199
2930
|
maxBytes: z.ZodOptional<z.ZodNumber>;
|
|
3200
2931
|
minBytes: z.ZodOptional<z.ZodNumber>;
|
|
3201
|
-
},
|
|
3202
|
-
maxBytes?: number | undefined;
|
|
3203
|
-
minBytes?: number | undefined;
|
|
3204
|
-
}, {
|
|
3205
|
-
maxBytes?: number | undefined;
|
|
3206
|
-
minBytes?: number | undefined;
|
|
3207
|
-
}>>;
|
|
2932
|
+
}, z.core.$strip>>;
|
|
3208
2933
|
toolsTriggered: z.ZodOptional<z.ZodObject<{
|
|
3209
2934
|
calls: z.ZodArray<z.ZodObject<{
|
|
3210
2935
|
name: z.ZodString;
|
|
3211
2936
|
arguments: z.ZodOptional<z.ZodRecord<z.ZodString, z.ZodUnknown>>;
|
|
3212
2937
|
required: z.ZodOptional<z.ZodBoolean>;
|
|
3213
|
-
},
|
|
3214
|
-
|
|
3215
|
-
|
|
3216
|
-
|
|
3217
|
-
}
|
|
3218
|
-
name: string;
|
|
3219
|
-
required?: boolean | undefined;
|
|
3220
|
-
arguments?: Record<string, unknown> | undefined;
|
|
3221
|
-
}>, "many">;
|
|
3222
|
-
order: z.ZodOptional<z.ZodEnum<["strict", "any"]>>;
|
|
2938
|
+
}, z.core.$strip>>;
|
|
2939
|
+
order: z.ZodOptional<z.ZodEnum<{
|
|
2940
|
+
any: "any";
|
|
2941
|
+
strict: "strict";
|
|
2942
|
+
}>>;
|
|
3223
2943
|
exclusive: z.ZodOptional<z.ZodBoolean>;
|
|
3224
|
-
},
|
|
3225
|
-
calls: {
|
|
3226
|
-
name: string;
|
|
3227
|
-
required?: boolean | undefined;
|
|
3228
|
-
arguments?: Record<string, unknown> | undefined;
|
|
3229
|
-
}[];
|
|
3230
|
-
order?: "strict" | "any" | undefined;
|
|
3231
|
-
exclusive?: boolean | undefined;
|
|
3232
|
-
}, {
|
|
3233
|
-
calls: {
|
|
3234
|
-
name: string;
|
|
3235
|
-
required?: boolean | undefined;
|
|
3236
|
-
arguments?: Record<string, unknown> | undefined;
|
|
3237
|
-
}[];
|
|
3238
|
-
order?: "strict" | "any" | undefined;
|
|
3239
|
-
exclusive?: boolean | undefined;
|
|
3240
|
-
}>>;
|
|
2944
|
+
}, z.core.$strip>>;
|
|
3241
2945
|
toolCallCount: z.ZodOptional<z.ZodObject<{
|
|
3242
2946
|
min: z.ZodOptional<z.ZodNumber>;
|
|
3243
2947
|
max: z.ZodOptional<z.ZodNumber>;
|
|
3244
2948
|
exact: z.ZodOptional<z.ZodNumber>;
|
|
3245
|
-
},
|
|
3246
|
-
|
|
3247
|
-
|
|
3248
|
-
max?: number | undefined;
|
|
3249
|
-
}, {
|
|
3250
|
-
exact?: number | undefined;
|
|
3251
|
-
min?: number | undefined;
|
|
3252
|
-
max?: number | undefined;
|
|
3253
|
-
}>>;
|
|
3254
|
-
}, "strip", z.ZodTypeAny, {
|
|
3255
|
-
response?: unknown;
|
|
3256
|
-
isError?: string | boolean | string[] | undefined;
|
|
3257
|
-
schema?: string | undefined;
|
|
3258
|
-
snapshot?: string | undefined;
|
|
3259
|
-
toolsTriggered?: {
|
|
3260
|
-
calls: {
|
|
3261
|
-
name: string;
|
|
3262
|
-
required?: boolean | undefined;
|
|
3263
|
-
arguments?: Record<string, unknown> | undefined;
|
|
3264
|
-
}[];
|
|
3265
|
-
order?: "strict" | "any" | undefined;
|
|
3266
|
-
exclusive?: boolean | undefined;
|
|
3267
|
-
} | undefined;
|
|
3268
|
-
toolCallCount?: {
|
|
3269
|
-
exact?: number | undefined;
|
|
3270
|
-
min?: number | undefined;
|
|
3271
|
-
max?: number | undefined;
|
|
3272
|
-
} | undefined;
|
|
3273
|
-
containsText?: string | string[] | undefined;
|
|
3274
|
-
matchesPattern?: string | string[] | undefined;
|
|
3275
|
-
snapshotSanitizers?: ("uuid" | "jwt" | "timestamp" | "iso-date" | "objectId" | {
|
|
3276
|
-
pattern: string;
|
|
3277
|
-
replacement?: string | undefined;
|
|
3278
|
-
} | {
|
|
3279
|
-
remove: string[];
|
|
3280
|
-
})[] | undefined;
|
|
3281
|
-
passesJudge?: {
|
|
3282
|
-
rubric: "correctness" | "completeness" | "groundedness" | "instruction-following" | "conciseness" | {
|
|
3283
|
-
text: string;
|
|
3284
|
-
};
|
|
3285
|
-
model?: string | undefined;
|
|
3286
|
-
maxTokens?: number | undefined;
|
|
3287
|
-
maxBudgetUsd?: number | undefined;
|
|
3288
|
-
reference?: unknown;
|
|
3289
|
-
threshold?: number | undefined;
|
|
3290
|
-
reps?: number | undefined;
|
|
3291
|
-
provider?: "openai" | "anthropic" | "google" | undefined;
|
|
3292
|
-
apiKeyEnvVar?: string | undefined;
|
|
3293
|
-
temperature?: number | undefined;
|
|
3294
|
-
maxToolOutputSize?: number | undefined;
|
|
3295
|
-
} | undefined;
|
|
3296
|
-
responseSize?: {
|
|
3297
|
-
maxBytes?: number | undefined;
|
|
3298
|
-
minBytes?: number | undefined;
|
|
3299
|
-
} | undefined;
|
|
3300
|
-
}, {
|
|
3301
|
-
response?: unknown;
|
|
3302
|
-
isError?: string | boolean | string[] | undefined;
|
|
3303
|
-
schema?: string | undefined;
|
|
3304
|
-
snapshot?: string | undefined;
|
|
3305
|
-
toolsTriggered?: {
|
|
3306
|
-
calls: {
|
|
3307
|
-
name: string;
|
|
3308
|
-
required?: boolean | undefined;
|
|
3309
|
-
arguments?: Record<string, unknown> | undefined;
|
|
3310
|
-
}[];
|
|
3311
|
-
order?: "strict" | "any" | undefined;
|
|
3312
|
-
exclusive?: boolean | undefined;
|
|
3313
|
-
} | undefined;
|
|
3314
|
-
toolCallCount?: {
|
|
3315
|
-
exact?: number | undefined;
|
|
3316
|
-
min?: number | undefined;
|
|
3317
|
-
max?: number | undefined;
|
|
3318
|
-
} | undefined;
|
|
3319
|
-
containsText?: string | string[] | undefined;
|
|
3320
|
-
matchesPattern?: string | string[] | undefined;
|
|
3321
|
-
snapshotSanitizers?: ("uuid" | "jwt" | "timestamp" | "iso-date" | "objectId" | {
|
|
3322
|
-
pattern: string;
|
|
3323
|
-
replacement?: string | undefined;
|
|
3324
|
-
} | {
|
|
3325
|
-
remove: string[];
|
|
3326
|
-
})[] | undefined;
|
|
3327
|
-
passesJudge?: {
|
|
3328
|
-
rubric: "correctness" | "completeness" | "groundedness" | "instruction-following" | "conciseness" | {
|
|
3329
|
-
text: string;
|
|
3330
|
-
};
|
|
3331
|
-
model?: string | undefined;
|
|
3332
|
-
maxTokens?: number | undefined;
|
|
3333
|
-
maxBudgetUsd?: number | undefined;
|
|
3334
|
-
reference?: unknown;
|
|
3335
|
-
threshold?: number | undefined;
|
|
3336
|
-
reps?: number | undefined;
|
|
3337
|
-
provider?: "openai" | "anthropic" | "google" | undefined;
|
|
3338
|
-
apiKeyEnvVar?: string | undefined;
|
|
3339
|
-
temperature?: number | undefined;
|
|
3340
|
-
maxToolOutputSize?: number | undefined;
|
|
3341
|
-
} | undefined;
|
|
3342
|
-
responseSize?: {
|
|
3343
|
-
maxBytes?: number | undefined;
|
|
3344
|
-
minBytes?: number | undefined;
|
|
3345
|
-
} | undefined;
|
|
3346
|
-
}>>;
|
|
3347
|
-
}, "strip", z.ZodTypeAny, {
|
|
3348
|
-
id: string;
|
|
3349
|
-
args?: Record<string, unknown> | undefined;
|
|
3350
|
-
mode?: "direct" | "llm_host" | undefined;
|
|
3351
|
-
metadata?: Record<string, unknown> | undefined;
|
|
3352
|
-
description?: string | undefined;
|
|
3353
|
-
toolName?: string | undefined;
|
|
3354
|
-
scenario?: string | undefined;
|
|
3355
|
-
llmHostConfig?: {
|
|
3356
|
-
provider: "openai" | "anthropic" | "azure" | "google" | "mistral" | "deepseek" | "openrouter" | "xai" | "vertex-anthropic";
|
|
3357
|
-
model?: string | undefined;
|
|
3358
|
-
maxTokens?: number | undefined;
|
|
3359
|
-
apiKeyEnvVar?: string | undefined;
|
|
3360
|
-
temperature?: number | undefined;
|
|
3361
|
-
maxToolCalls?: number | undefined;
|
|
3362
|
-
} | undefined;
|
|
3363
|
-
iterations?: number | undefined;
|
|
3364
|
-
accuracyThreshold?: number | undefined;
|
|
3365
|
-
judgeReps?: number | undefined;
|
|
3366
|
-
canonicalAnswer?: string | undefined;
|
|
3367
|
-
tags?: string[] | undefined;
|
|
3368
|
-
expect?: {
|
|
3369
|
-
response?: unknown;
|
|
3370
|
-
isError?: string | boolean | string[] | undefined;
|
|
3371
|
-
schema?: string | undefined;
|
|
3372
|
-
snapshot?: string | undefined;
|
|
3373
|
-
toolsTriggered?: {
|
|
3374
|
-
calls: {
|
|
3375
|
-
name: string;
|
|
3376
|
-
required?: boolean | undefined;
|
|
3377
|
-
arguments?: Record<string, unknown> | undefined;
|
|
3378
|
-
}[];
|
|
3379
|
-
order?: "strict" | "any" | undefined;
|
|
3380
|
-
exclusive?: boolean | undefined;
|
|
3381
|
-
} | undefined;
|
|
3382
|
-
toolCallCount?: {
|
|
3383
|
-
exact?: number | undefined;
|
|
3384
|
-
min?: number | undefined;
|
|
3385
|
-
max?: number | undefined;
|
|
3386
|
-
} | undefined;
|
|
3387
|
-
containsText?: string | string[] | undefined;
|
|
3388
|
-
matchesPattern?: string | string[] | undefined;
|
|
3389
|
-
snapshotSanitizers?: ("uuid" | "jwt" | "timestamp" | "iso-date" | "objectId" | {
|
|
3390
|
-
pattern: string;
|
|
3391
|
-
replacement?: string | undefined;
|
|
3392
|
-
} | {
|
|
3393
|
-
remove: string[];
|
|
3394
|
-
})[] | undefined;
|
|
3395
|
-
passesJudge?: {
|
|
3396
|
-
rubric: "correctness" | "completeness" | "groundedness" | "instruction-following" | "conciseness" | {
|
|
3397
|
-
text: string;
|
|
3398
|
-
};
|
|
3399
|
-
model?: string | undefined;
|
|
3400
|
-
maxTokens?: number | undefined;
|
|
3401
|
-
maxBudgetUsd?: number | undefined;
|
|
3402
|
-
reference?: unknown;
|
|
3403
|
-
threshold?: number | undefined;
|
|
3404
|
-
reps?: number | undefined;
|
|
3405
|
-
provider?: "openai" | "anthropic" | "google" | undefined;
|
|
3406
|
-
apiKeyEnvVar?: string | undefined;
|
|
3407
|
-
temperature?: number | undefined;
|
|
3408
|
-
maxToolOutputSize?: number | undefined;
|
|
3409
|
-
} | undefined;
|
|
3410
|
-
responseSize?: {
|
|
3411
|
-
maxBytes?: number | undefined;
|
|
3412
|
-
minBytes?: number | undefined;
|
|
3413
|
-
} | undefined;
|
|
3414
|
-
} | undefined;
|
|
3415
|
-
}, {
|
|
3416
|
-
id: string;
|
|
3417
|
-
args?: Record<string, unknown> | undefined;
|
|
3418
|
-
mode?: "direct" | "llm_host" | undefined;
|
|
3419
|
-
metadata?: Record<string, unknown> | undefined;
|
|
3420
|
-
description?: string | undefined;
|
|
3421
|
-
toolName?: string | undefined;
|
|
3422
|
-
scenario?: string | undefined;
|
|
3423
|
-
llmHostConfig?: {
|
|
3424
|
-
provider: "openai" | "anthropic" | "azure" | "google" | "mistral" | "deepseek" | "openrouter" | "xai" | "vertex-anthropic";
|
|
3425
|
-
model?: string | undefined;
|
|
3426
|
-
maxTokens?: number | undefined;
|
|
3427
|
-
apiKeyEnvVar?: string | undefined;
|
|
3428
|
-
temperature?: number | undefined;
|
|
3429
|
-
maxToolCalls?: number | undefined;
|
|
3430
|
-
} | undefined;
|
|
3431
|
-
iterations?: number | undefined;
|
|
3432
|
-
accuracyThreshold?: number | undefined;
|
|
3433
|
-
judgeReps?: number | undefined;
|
|
3434
|
-
canonicalAnswer?: string | undefined;
|
|
3435
|
-
tags?: string[] | undefined;
|
|
3436
|
-
expect?: {
|
|
3437
|
-
response?: unknown;
|
|
3438
|
-
isError?: string | boolean | string[] | undefined;
|
|
3439
|
-
schema?: string | undefined;
|
|
3440
|
-
snapshot?: string | undefined;
|
|
3441
|
-
toolsTriggered?: {
|
|
3442
|
-
calls: {
|
|
3443
|
-
name: string;
|
|
3444
|
-
required?: boolean | undefined;
|
|
3445
|
-
arguments?: Record<string, unknown> | undefined;
|
|
3446
|
-
}[];
|
|
3447
|
-
order?: "strict" | "any" | undefined;
|
|
3448
|
-
exclusive?: boolean | undefined;
|
|
3449
|
-
} | undefined;
|
|
3450
|
-
toolCallCount?: {
|
|
3451
|
-
exact?: number | undefined;
|
|
3452
|
-
min?: number | undefined;
|
|
3453
|
-
max?: number | undefined;
|
|
3454
|
-
} | undefined;
|
|
3455
|
-
containsText?: string | string[] | undefined;
|
|
3456
|
-
matchesPattern?: string | string[] | undefined;
|
|
3457
|
-
snapshotSanitizers?: ("uuid" | "jwt" | "timestamp" | "iso-date" | "objectId" | {
|
|
3458
|
-
pattern: string;
|
|
3459
|
-
replacement?: string | undefined;
|
|
3460
|
-
} | {
|
|
3461
|
-
remove: string[];
|
|
3462
|
-
})[] | undefined;
|
|
3463
|
-
passesJudge?: {
|
|
3464
|
-
rubric: "correctness" | "completeness" | "groundedness" | "instruction-following" | "conciseness" | {
|
|
3465
|
-
text: string;
|
|
3466
|
-
};
|
|
3467
|
-
model?: string | undefined;
|
|
3468
|
-
maxTokens?: number | undefined;
|
|
3469
|
-
maxBudgetUsd?: number | undefined;
|
|
3470
|
-
reference?: unknown;
|
|
3471
|
-
threshold?: number | undefined;
|
|
3472
|
-
reps?: number | undefined;
|
|
3473
|
-
provider?: "openai" | "anthropic" | "google" | undefined;
|
|
3474
|
-
apiKeyEnvVar?: string | undefined;
|
|
3475
|
-
temperature?: number | undefined;
|
|
3476
|
-
maxToolOutputSize?: number | undefined;
|
|
3477
|
-
} | undefined;
|
|
3478
|
-
responseSize?: {
|
|
3479
|
-
maxBytes?: number | undefined;
|
|
3480
|
-
minBytes?: number | undefined;
|
|
3481
|
-
} | undefined;
|
|
3482
|
-
} | undefined;
|
|
3483
|
-
}>;
|
|
2949
|
+
}, z.core.$strip>>;
|
|
2950
|
+
}, z.core.$strip>>;
|
|
2951
|
+
}, z.core.$strip>;
|
|
3484
2952
|
/**
|
|
3485
2953
|
* Zod schema for EvalDataset (without schemas field, as schemas aren't serializable)
|
|
3486
2954
|
*/
|
|
@@ -3490,542 +2958,106 @@ declare const EvalDatasetSchema: z.ZodObject<{
|
|
|
3490
2958
|
cases: z.ZodArray<z.ZodObject<{
|
|
3491
2959
|
id: z.ZodString;
|
|
3492
2960
|
description: z.ZodOptional<z.ZodString>;
|
|
3493
|
-
mode: z.ZodOptional<z.ZodEnum<
|
|
2961
|
+
mode: z.ZodOptional<z.ZodEnum<{
|
|
2962
|
+
direct: "direct";
|
|
2963
|
+
mcp_host: "mcp_host";
|
|
2964
|
+
}>>;
|
|
3494
2965
|
toolName: z.ZodOptional<z.ZodString>;
|
|
3495
2966
|
args: z.ZodOptional<z.ZodRecord<z.ZodString, z.ZodUnknown>>;
|
|
3496
2967
|
scenario: z.ZodOptional<z.ZodString>;
|
|
3497
|
-
|
|
3498
|
-
provider: z.ZodEnum<
|
|
2968
|
+
mcpHostConfig: z.ZodOptional<z.ZodObject<{
|
|
2969
|
+
provider: z.ZodEnum<{
|
|
2970
|
+
openai: "openai";
|
|
2971
|
+
anthropic: "anthropic";
|
|
2972
|
+
azure: "azure";
|
|
2973
|
+
google: "google";
|
|
2974
|
+
mistral: "mistral";
|
|
2975
|
+
deepseek: "deepseek";
|
|
2976
|
+
openrouter: "openrouter";
|
|
2977
|
+
xai: "xai";
|
|
2978
|
+
"vertex-anthropic": "vertex-anthropic";
|
|
2979
|
+
}>;
|
|
3499
2980
|
apiKeyEnvVar: z.ZodOptional<z.ZodString>;
|
|
3500
2981
|
model: z.ZodOptional<z.ZodString>;
|
|
3501
2982
|
maxTokens: z.ZodOptional<z.ZodNumber>;
|
|
3502
2983
|
temperature: z.ZodOptional<z.ZodNumber>;
|
|
3503
2984
|
maxToolCalls: z.ZodOptional<z.ZodNumber>;
|
|
3504
|
-
},
|
|
3505
|
-
provider: "openai" | "anthropic" | "azure" | "google" | "mistral" | "deepseek" | "openrouter" | "xai" | "vertex-anthropic";
|
|
3506
|
-
model?: string | undefined;
|
|
3507
|
-
maxTokens?: number | undefined;
|
|
3508
|
-
apiKeyEnvVar?: string | undefined;
|
|
3509
|
-
temperature?: number | undefined;
|
|
3510
|
-
maxToolCalls?: number | undefined;
|
|
3511
|
-
}, {
|
|
3512
|
-
provider: "openai" | "anthropic" | "azure" | "google" | "mistral" | "deepseek" | "openrouter" | "xai" | "vertex-anthropic";
|
|
3513
|
-
model?: string | undefined;
|
|
3514
|
-
maxTokens?: number | undefined;
|
|
3515
|
-
apiKeyEnvVar?: string | undefined;
|
|
3516
|
-
temperature?: number | undefined;
|
|
3517
|
-
maxToolCalls?: number | undefined;
|
|
3518
|
-
}>>;
|
|
2985
|
+
}, z.core.$strip>>;
|
|
3519
2986
|
metadata: z.ZodOptional<z.ZodRecord<z.ZodString, z.ZodUnknown>>;
|
|
3520
2987
|
iterations: z.ZodOptional<z.ZodNumber>;
|
|
3521
2988
|
accuracyThreshold: z.ZodOptional<z.ZodNumber>;
|
|
3522
2989
|
judgeReps: z.ZodOptional<z.ZodNumber>;
|
|
3523
2990
|
canonicalAnswer: z.ZodOptional<z.ZodString>;
|
|
3524
|
-
tags: z.ZodOptional<z.ZodArray<z.ZodString
|
|
2991
|
+
tags: z.ZodOptional<z.ZodArray<z.ZodString>>;
|
|
3525
2992
|
expect: z.ZodOptional<z.ZodObject<{
|
|
3526
2993
|
response: z.ZodOptional<z.ZodUnknown>;
|
|
3527
2994
|
schema: z.ZodOptional<z.ZodString>;
|
|
3528
|
-
containsText: z.ZodOptional<z.ZodUnion<[z.ZodString, z.ZodArray<z.ZodString
|
|
3529
|
-
matchesPattern: z.ZodOptional<z.ZodUnion<[z.ZodString, z.ZodArray<z.ZodString
|
|
2995
|
+
containsText: z.ZodOptional<z.ZodUnion<readonly [z.ZodString, z.ZodArray<z.ZodString>]>>;
|
|
2996
|
+
matchesPattern: z.ZodOptional<z.ZodUnion<readonly [z.ZodString, z.ZodArray<z.ZodString>]>>;
|
|
3530
2997
|
snapshot: z.ZodOptional<z.ZodString>;
|
|
3531
|
-
snapshotSanitizers: z.ZodOptional<z.ZodArray<z.ZodUnion<[z.ZodEnum<
|
|
2998
|
+
snapshotSanitizers: z.ZodOptional<z.ZodArray<z.ZodUnion<readonly [z.ZodEnum<{
|
|
2999
|
+
timestamp: "timestamp";
|
|
3000
|
+
uuid: "uuid";
|
|
3001
|
+
"iso-date": "iso-date";
|
|
3002
|
+
objectId: "objectId";
|
|
3003
|
+
jwt: "jwt";
|
|
3004
|
+
}>, z.ZodObject<{
|
|
3532
3005
|
pattern: z.ZodString;
|
|
3533
3006
|
replacement: z.ZodOptional<z.ZodString>;
|
|
3534
|
-
},
|
|
3535
|
-
|
|
3536
|
-
|
|
3537
|
-
|
|
3538
|
-
pattern: string;
|
|
3539
|
-
replacement?: string | undefined;
|
|
3540
|
-
}>, z.ZodObject<{
|
|
3541
|
-
remove: z.ZodArray<z.ZodString, "many">;
|
|
3542
|
-
}, "strip", z.ZodTypeAny, {
|
|
3543
|
-
remove: string[];
|
|
3544
|
-
}, {
|
|
3545
|
-
remove: string[];
|
|
3546
|
-
}>]>, "many">>;
|
|
3547
|
-
isError: z.ZodOptional<z.ZodUnion<[z.ZodBoolean, z.ZodString, z.ZodArray<z.ZodString, "many">]>>;
|
|
3007
|
+
}, z.core.$strip>, z.ZodObject<{
|
|
3008
|
+
remove: z.ZodArray<z.ZodString>;
|
|
3009
|
+
}, z.core.$strip>]>>>;
|
|
3010
|
+
isError: z.ZodOptional<z.ZodUnion<readonly [z.ZodBoolean, z.ZodString, z.ZodArray<z.ZodString>]>>;
|
|
3548
3011
|
passesJudge: z.ZodOptional<z.ZodObject<{
|
|
3549
|
-
rubric: z.ZodUnion<[z.ZodEnum<
|
|
3012
|
+
rubric: z.ZodUnion<readonly [z.ZodEnum<{
|
|
3013
|
+
correctness: "correctness";
|
|
3014
|
+
completeness: "completeness";
|
|
3015
|
+
groundedness: "groundedness";
|
|
3016
|
+
"instruction-following": "instruction-following";
|
|
3017
|
+
conciseness: "conciseness";
|
|
3018
|
+
}>, z.ZodObject<{
|
|
3550
3019
|
text: z.ZodString;
|
|
3551
|
-
},
|
|
3552
|
-
text: string;
|
|
3553
|
-
}, {
|
|
3554
|
-
text: string;
|
|
3555
|
-
}>]>;
|
|
3020
|
+
}, z.core.$strip>]>;
|
|
3556
3021
|
reference: z.ZodOptional<z.ZodUnknown>;
|
|
3557
3022
|
threshold: z.ZodOptional<z.ZodNumber>;
|
|
3558
3023
|
reps: z.ZodOptional<z.ZodNumber>;
|
|
3559
|
-
provider: z.ZodOptional<z.ZodEnum<
|
|
3024
|
+
provider: z.ZodOptional<z.ZodEnum<{
|
|
3025
|
+
openai: "openai";
|
|
3026
|
+
anthropic: "anthropic";
|
|
3027
|
+
google: "google";
|
|
3028
|
+
}>>;
|
|
3560
3029
|
model: z.ZodOptional<z.ZodString>;
|
|
3561
3030
|
apiKeyEnvVar: z.ZodOptional<z.ZodString>;
|
|
3562
3031
|
maxTokens: z.ZodOptional<z.ZodNumber>;
|
|
3563
3032
|
temperature: z.ZodOptional<z.ZodNumber>;
|
|
3564
3033
|
maxBudgetUsd: z.ZodOptional<z.ZodNumber>;
|
|
3565
3034
|
maxToolOutputSize: z.ZodOptional<z.ZodNumber>;
|
|
3566
|
-
},
|
|
3567
|
-
rubric: "correctness" | "completeness" | "groundedness" | "instruction-following" | "conciseness" | {
|
|
3568
|
-
text: string;
|
|
3569
|
-
};
|
|
3570
|
-
model?: string | undefined;
|
|
3571
|
-
maxTokens?: number | undefined;
|
|
3572
|
-
maxBudgetUsd?: number | undefined;
|
|
3573
|
-
reference?: unknown;
|
|
3574
|
-
threshold?: number | undefined;
|
|
3575
|
-
reps?: number | undefined;
|
|
3576
|
-
provider?: "openai" | "anthropic" | "google" | undefined;
|
|
3577
|
-
apiKeyEnvVar?: string | undefined;
|
|
3578
|
-
temperature?: number | undefined;
|
|
3579
|
-
maxToolOutputSize?: number | undefined;
|
|
3580
|
-
}, {
|
|
3581
|
-
rubric: "correctness" | "completeness" | "groundedness" | "instruction-following" | "conciseness" | {
|
|
3582
|
-
text: string;
|
|
3583
|
-
};
|
|
3584
|
-
model?: string | undefined;
|
|
3585
|
-
maxTokens?: number | undefined;
|
|
3586
|
-
maxBudgetUsd?: number | undefined;
|
|
3587
|
-
reference?: unknown;
|
|
3588
|
-
threshold?: number | undefined;
|
|
3589
|
-
reps?: number | undefined;
|
|
3590
|
-
provider?: "openai" | "anthropic" | "google" | undefined;
|
|
3591
|
-
apiKeyEnvVar?: string | undefined;
|
|
3592
|
-
temperature?: number | undefined;
|
|
3593
|
-
maxToolOutputSize?: number | undefined;
|
|
3594
|
-
}>>;
|
|
3035
|
+
}, z.core.$strip>>;
|
|
3595
3036
|
responseSize: z.ZodOptional<z.ZodObject<{
|
|
3596
3037
|
maxBytes: z.ZodOptional<z.ZodNumber>;
|
|
3597
3038
|
minBytes: z.ZodOptional<z.ZodNumber>;
|
|
3598
|
-
},
|
|
3599
|
-
maxBytes?: number | undefined;
|
|
3600
|
-
minBytes?: number | undefined;
|
|
3601
|
-
}, {
|
|
3602
|
-
maxBytes?: number | undefined;
|
|
3603
|
-
minBytes?: number | undefined;
|
|
3604
|
-
}>>;
|
|
3039
|
+
}, z.core.$strip>>;
|
|
3605
3040
|
toolsTriggered: z.ZodOptional<z.ZodObject<{
|
|
3606
3041
|
calls: z.ZodArray<z.ZodObject<{
|
|
3607
3042
|
name: z.ZodString;
|
|
3608
3043
|
arguments: z.ZodOptional<z.ZodRecord<z.ZodString, z.ZodUnknown>>;
|
|
3609
3044
|
required: z.ZodOptional<z.ZodBoolean>;
|
|
3610
|
-
},
|
|
3611
|
-
|
|
3612
|
-
|
|
3613
|
-
|
|
3614
|
-
}
|
|
3615
|
-
name: string;
|
|
3616
|
-
required?: boolean | undefined;
|
|
3617
|
-
arguments?: Record<string, unknown> | undefined;
|
|
3618
|
-
}>, "many">;
|
|
3619
|
-
order: z.ZodOptional<z.ZodEnum<["strict", "any"]>>;
|
|
3045
|
+
}, z.core.$strip>>;
|
|
3046
|
+
order: z.ZodOptional<z.ZodEnum<{
|
|
3047
|
+
any: "any";
|
|
3048
|
+
strict: "strict";
|
|
3049
|
+
}>>;
|
|
3620
3050
|
exclusive: z.ZodOptional<z.ZodBoolean>;
|
|
3621
|
-
},
|
|
3622
|
-
calls: {
|
|
3623
|
-
name: string;
|
|
3624
|
-
required?: boolean | undefined;
|
|
3625
|
-
arguments?: Record<string, unknown> | undefined;
|
|
3626
|
-
}[];
|
|
3627
|
-
order?: "strict" | "any" | undefined;
|
|
3628
|
-
exclusive?: boolean | undefined;
|
|
3629
|
-
}, {
|
|
3630
|
-
calls: {
|
|
3631
|
-
name: string;
|
|
3632
|
-
required?: boolean | undefined;
|
|
3633
|
-
arguments?: Record<string, unknown> | undefined;
|
|
3634
|
-
}[];
|
|
3635
|
-
order?: "strict" | "any" | undefined;
|
|
3636
|
-
exclusive?: boolean | undefined;
|
|
3637
|
-
}>>;
|
|
3051
|
+
}, z.core.$strip>>;
|
|
3638
3052
|
toolCallCount: z.ZodOptional<z.ZodObject<{
|
|
3639
3053
|
min: z.ZodOptional<z.ZodNumber>;
|
|
3640
3054
|
max: z.ZodOptional<z.ZodNumber>;
|
|
3641
3055
|
exact: z.ZodOptional<z.ZodNumber>;
|
|
3642
|
-
},
|
|
3643
|
-
|
|
3644
|
-
|
|
3645
|
-
max?: number | undefined;
|
|
3646
|
-
}, {
|
|
3647
|
-
exact?: number | undefined;
|
|
3648
|
-
min?: number | undefined;
|
|
3649
|
-
max?: number | undefined;
|
|
3650
|
-
}>>;
|
|
3651
|
-
}, "strip", z.ZodTypeAny, {
|
|
3652
|
-
response?: unknown;
|
|
3653
|
-
isError?: string | boolean | string[] | undefined;
|
|
3654
|
-
schema?: string | undefined;
|
|
3655
|
-
snapshot?: string | undefined;
|
|
3656
|
-
toolsTriggered?: {
|
|
3657
|
-
calls: {
|
|
3658
|
-
name: string;
|
|
3659
|
-
required?: boolean | undefined;
|
|
3660
|
-
arguments?: Record<string, unknown> | undefined;
|
|
3661
|
-
}[];
|
|
3662
|
-
order?: "strict" | "any" | undefined;
|
|
3663
|
-
exclusive?: boolean | undefined;
|
|
3664
|
-
} | undefined;
|
|
3665
|
-
toolCallCount?: {
|
|
3666
|
-
exact?: number | undefined;
|
|
3667
|
-
min?: number | undefined;
|
|
3668
|
-
max?: number | undefined;
|
|
3669
|
-
} | undefined;
|
|
3670
|
-
containsText?: string | string[] | undefined;
|
|
3671
|
-
matchesPattern?: string | string[] | undefined;
|
|
3672
|
-
snapshotSanitizers?: ("uuid" | "jwt" | "timestamp" | "iso-date" | "objectId" | {
|
|
3673
|
-
pattern: string;
|
|
3674
|
-
replacement?: string | undefined;
|
|
3675
|
-
} | {
|
|
3676
|
-
remove: string[];
|
|
3677
|
-
})[] | undefined;
|
|
3678
|
-
passesJudge?: {
|
|
3679
|
-
rubric: "correctness" | "completeness" | "groundedness" | "instruction-following" | "conciseness" | {
|
|
3680
|
-
text: string;
|
|
3681
|
-
};
|
|
3682
|
-
model?: string | undefined;
|
|
3683
|
-
maxTokens?: number | undefined;
|
|
3684
|
-
maxBudgetUsd?: number | undefined;
|
|
3685
|
-
reference?: unknown;
|
|
3686
|
-
threshold?: number | undefined;
|
|
3687
|
-
reps?: number | undefined;
|
|
3688
|
-
provider?: "openai" | "anthropic" | "google" | undefined;
|
|
3689
|
-
apiKeyEnvVar?: string | undefined;
|
|
3690
|
-
temperature?: number | undefined;
|
|
3691
|
-
maxToolOutputSize?: number | undefined;
|
|
3692
|
-
} | undefined;
|
|
3693
|
-
responseSize?: {
|
|
3694
|
-
maxBytes?: number | undefined;
|
|
3695
|
-
minBytes?: number | undefined;
|
|
3696
|
-
} | undefined;
|
|
3697
|
-
}, {
|
|
3698
|
-
response?: unknown;
|
|
3699
|
-
isError?: string | boolean | string[] | undefined;
|
|
3700
|
-
schema?: string | undefined;
|
|
3701
|
-
snapshot?: string | undefined;
|
|
3702
|
-
toolsTriggered?: {
|
|
3703
|
-
calls: {
|
|
3704
|
-
name: string;
|
|
3705
|
-
required?: boolean | undefined;
|
|
3706
|
-
arguments?: Record<string, unknown> | undefined;
|
|
3707
|
-
}[];
|
|
3708
|
-
order?: "strict" | "any" | undefined;
|
|
3709
|
-
exclusive?: boolean | undefined;
|
|
3710
|
-
} | undefined;
|
|
3711
|
-
toolCallCount?: {
|
|
3712
|
-
exact?: number | undefined;
|
|
3713
|
-
min?: number | undefined;
|
|
3714
|
-
max?: number | undefined;
|
|
3715
|
-
} | undefined;
|
|
3716
|
-
containsText?: string | string[] | undefined;
|
|
3717
|
-
matchesPattern?: string | string[] | undefined;
|
|
3718
|
-
snapshotSanitizers?: ("uuid" | "jwt" | "timestamp" | "iso-date" | "objectId" | {
|
|
3719
|
-
pattern: string;
|
|
3720
|
-
replacement?: string | undefined;
|
|
3721
|
-
} | {
|
|
3722
|
-
remove: string[];
|
|
3723
|
-
})[] | undefined;
|
|
3724
|
-
passesJudge?: {
|
|
3725
|
-
rubric: "correctness" | "completeness" | "groundedness" | "instruction-following" | "conciseness" | {
|
|
3726
|
-
text: string;
|
|
3727
|
-
};
|
|
3728
|
-
model?: string | undefined;
|
|
3729
|
-
maxTokens?: number | undefined;
|
|
3730
|
-
maxBudgetUsd?: number | undefined;
|
|
3731
|
-
reference?: unknown;
|
|
3732
|
-
threshold?: number | undefined;
|
|
3733
|
-
reps?: number | undefined;
|
|
3734
|
-
provider?: "openai" | "anthropic" | "google" | undefined;
|
|
3735
|
-
apiKeyEnvVar?: string | undefined;
|
|
3736
|
-
temperature?: number | undefined;
|
|
3737
|
-
maxToolOutputSize?: number | undefined;
|
|
3738
|
-
} | undefined;
|
|
3739
|
-
responseSize?: {
|
|
3740
|
-
maxBytes?: number | undefined;
|
|
3741
|
-
minBytes?: number | undefined;
|
|
3742
|
-
} | undefined;
|
|
3743
|
-
}>>;
|
|
3744
|
-
}, "strip", z.ZodTypeAny, {
|
|
3745
|
-
id: string;
|
|
3746
|
-
args?: Record<string, unknown> | undefined;
|
|
3747
|
-
mode?: "direct" | "llm_host" | undefined;
|
|
3748
|
-
metadata?: Record<string, unknown> | undefined;
|
|
3749
|
-
description?: string | undefined;
|
|
3750
|
-
toolName?: string | undefined;
|
|
3751
|
-
scenario?: string | undefined;
|
|
3752
|
-
llmHostConfig?: {
|
|
3753
|
-
provider: "openai" | "anthropic" | "azure" | "google" | "mistral" | "deepseek" | "openrouter" | "xai" | "vertex-anthropic";
|
|
3754
|
-
model?: string | undefined;
|
|
3755
|
-
maxTokens?: number | undefined;
|
|
3756
|
-
apiKeyEnvVar?: string | undefined;
|
|
3757
|
-
temperature?: number | undefined;
|
|
3758
|
-
maxToolCalls?: number | undefined;
|
|
3759
|
-
} | undefined;
|
|
3760
|
-
iterations?: number | undefined;
|
|
3761
|
-
accuracyThreshold?: number | undefined;
|
|
3762
|
-
judgeReps?: number | undefined;
|
|
3763
|
-
canonicalAnswer?: string | undefined;
|
|
3764
|
-
tags?: string[] | undefined;
|
|
3765
|
-
expect?: {
|
|
3766
|
-
response?: unknown;
|
|
3767
|
-
isError?: string | boolean | string[] | undefined;
|
|
3768
|
-
schema?: string | undefined;
|
|
3769
|
-
snapshot?: string | undefined;
|
|
3770
|
-
toolsTriggered?: {
|
|
3771
|
-
calls: {
|
|
3772
|
-
name: string;
|
|
3773
|
-
required?: boolean | undefined;
|
|
3774
|
-
arguments?: Record<string, unknown> | undefined;
|
|
3775
|
-
}[];
|
|
3776
|
-
order?: "strict" | "any" | undefined;
|
|
3777
|
-
exclusive?: boolean | undefined;
|
|
3778
|
-
} | undefined;
|
|
3779
|
-
toolCallCount?: {
|
|
3780
|
-
exact?: number | undefined;
|
|
3781
|
-
min?: number | undefined;
|
|
3782
|
-
max?: number | undefined;
|
|
3783
|
-
} | undefined;
|
|
3784
|
-
containsText?: string | string[] | undefined;
|
|
3785
|
-
matchesPattern?: string | string[] | undefined;
|
|
3786
|
-
snapshotSanitizers?: ("uuid" | "jwt" | "timestamp" | "iso-date" | "objectId" | {
|
|
3787
|
-
pattern: string;
|
|
3788
|
-
replacement?: string | undefined;
|
|
3789
|
-
} | {
|
|
3790
|
-
remove: string[];
|
|
3791
|
-
})[] | undefined;
|
|
3792
|
-
passesJudge?: {
|
|
3793
|
-
rubric: "correctness" | "completeness" | "groundedness" | "instruction-following" | "conciseness" | {
|
|
3794
|
-
text: string;
|
|
3795
|
-
};
|
|
3796
|
-
model?: string | undefined;
|
|
3797
|
-
maxTokens?: number | undefined;
|
|
3798
|
-
maxBudgetUsd?: number | undefined;
|
|
3799
|
-
reference?: unknown;
|
|
3800
|
-
threshold?: number | undefined;
|
|
3801
|
-
reps?: number | undefined;
|
|
3802
|
-
provider?: "openai" | "anthropic" | "google" | undefined;
|
|
3803
|
-
apiKeyEnvVar?: string | undefined;
|
|
3804
|
-
temperature?: number | undefined;
|
|
3805
|
-
maxToolOutputSize?: number | undefined;
|
|
3806
|
-
} | undefined;
|
|
3807
|
-
responseSize?: {
|
|
3808
|
-
maxBytes?: number | undefined;
|
|
3809
|
-
minBytes?: number | undefined;
|
|
3810
|
-
} | undefined;
|
|
3811
|
-
} | undefined;
|
|
3812
|
-
}, {
|
|
3813
|
-
id: string;
|
|
3814
|
-
args?: Record<string, unknown> | undefined;
|
|
3815
|
-
mode?: "direct" | "llm_host" | undefined;
|
|
3816
|
-
metadata?: Record<string, unknown> | undefined;
|
|
3817
|
-
description?: string | undefined;
|
|
3818
|
-
toolName?: string | undefined;
|
|
3819
|
-
scenario?: string | undefined;
|
|
3820
|
-
llmHostConfig?: {
|
|
3821
|
-
provider: "openai" | "anthropic" | "azure" | "google" | "mistral" | "deepseek" | "openrouter" | "xai" | "vertex-anthropic";
|
|
3822
|
-
model?: string | undefined;
|
|
3823
|
-
maxTokens?: number | undefined;
|
|
3824
|
-
apiKeyEnvVar?: string | undefined;
|
|
3825
|
-
temperature?: number | undefined;
|
|
3826
|
-
maxToolCalls?: number | undefined;
|
|
3827
|
-
} | undefined;
|
|
3828
|
-
iterations?: number | undefined;
|
|
3829
|
-
accuracyThreshold?: number | undefined;
|
|
3830
|
-
judgeReps?: number | undefined;
|
|
3831
|
-
canonicalAnswer?: string | undefined;
|
|
3832
|
-
tags?: string[] | undefined;
|
|
3833
|
-
expect?: {
|
|
3834
|
-
response?: unknown;
|
|
3835
|
-
isError?: string | boolean | string[] | undefined;
|
|
3836
|
-
schema?: string | undefined;
|
|
3837
|
-
snapshot?: string | undefined;
|
|
3838
|
-
toolsTriggered?: {
|
|
3839
|
-
calls: {
|
|
3840
|
-
name: string;
|
|
3841
|
-
required?: boolean | undefined;
|
|
3842
|
-
arguments?: Record<string, unknown> | undefined;
|
|
3843
|
-
}[];
|
|
3844
|
-
order?: "strict" | "any" | undefined;
|
|
3845
|
-
exclusive?: boolean | undefined;
|
|
3846
|
-
} | undefined;
|
|
3847
|
-
toolCallCount?: {
|
|
3848
|
-
exact?: number | undefined;
|
|
3849
|
-
min?: number | undefined;
|
|
3850
|
-
max?: number | undefined;
|
|
3851
|
-
} | undefined;
|
|
3852
|
-
containsText?: string | string[] | undefined;
|
|
3853
|
-
matchesPattern?: string | string[] | undefined;
|
|
3854
|
-
snapshotSanitizers?: ("uuid" | "jwt" | "timestamp" | "iso-date" | "objectId" | {
|
|
3855
|
-
pattern: string;
|
|
3856
|
-
replacement?: string | undefined;
|
|
3857
|
-
} | {
|
|
3858
|
-
remove: string[];
|
|
3859
|
-
})[] | undefined;
|
|
3860
|
-
passesJudge?: {
|
|
3861
|
-
rubric: "correctness" | "completeness" | "groundedness" | "instruction-following" | "conciseness" | {
|
|
3862
|
-
text: string;
|
|
3863
|
-
};
|
|
3864
|
-
model?: string | undefined;
|
|
3865
|
-
maxTokens?: number | undefined;
|
|
3866
|
-
maxBudgetUsd?: number | undefined;
|
|
3867
|
-
reference?: unknown;
|
|
3868
|
-
threshold?: number | undefined;
|
|
3869
|
-
reps?: number | undefined;
|
|
3870
|
-
provider?: "openai" | "anthropic" | "google" | undefined;
|
|
3871
|
-
apiKeyEnvVar?: string | undefined;
|
|
3872
|
-
temperature?: number | undefined;
|
|
3873
|
-
maxToolOutputSize?: number | undefined;
|
|
3874
|
-
} | undefined;
|
|
3875
|
-
responseSize?: {
|
|
3876
|
-
maxBytes?: number | undefined;
|
|
3877
|
-
minBytes?: number | undefined;
|
|
3878
|
-
} | undefined;
|
|
3879
|
-
} | undefined;
|
|
3880
|
-
}>, "many">;
|
|
3056
|
+
}, z.core.$strip>>;
|
|
3057
|
+
}, z.core.$strip>>;
|
|
3058
|
+
}, z.core.$strip>>;
|
|
3881
3059
|
metadata: z.ZodOptional<z.ZodRecord<z.ZodString, z.ZodUnknown>>;
|
|
3882
|
-
},
|
|
3883
|
-
name: string;
|
|
3884
|
-
cases: {
|
|
3885
|
-
id: string;
|
|
3886
|
-
args?: Record<string, unknown> | undefined;
|
|
3887
|
-
mode?: "direct" | "llm_host" | undefined;
|
|
3888
|
-
metadata?: Record<string, unknown> | undefined;
|
|
3889
|
-
description?: string | undefined;
|
|
3890
|
-
toolName?: string | undefined;
|
|
3891
|
-
scenario?: string | undefined;
|
|
3892
|
-
llmHostConfig?: {
|
|
3893
|
-
provider: "openai" | "anthropic" | "azure" | "google" | "mistral" | "deepseek" | "openrouter" | "xai" | "vertex-anthropic";
|
|
3894
|
-
model?: string | undefined;
|
|
3895
|
-
maxTokens?: number | undefined;
|
|
3896
|
-
apiKeyEnvVar?: string | undefined;
|
|
3897
|
-
temperature?: number | undefined;
|
|
3898
|
-
maxToolCalls?: number | undefined;
|
|
3899
|
-
} | undefined;
|
|
3900
|
-
iterations?: number | undefined;
|
|
3901
|
-
accuracyThreshold?: number | undefined;
|
|
3902
|
-
judgeReps?: number | undefined;
|
|
3903
|
-
canonicalAnswer?: string | undefined;
|
|
3904
|
-
tags?: string[] | undefined;
|
|
3905
|
-
expect?: {
|
|
3906
|
-
response?: unknown;
|
|
3907
|
-
isError?: string | boolean | string[] | undefined;
|
|
3908
|
-
schema?: string | undefined;
|
|
3909
|
-
snapshot?: string | undefined;
|
|
3910
|
-
toolsTriggered?: {
|
|
3911
|
-
calls: {
|
|
3912
|
-
name: string;
|
|
3913
|
-
required?: boolean | undefined;
|
|
3914
|
-
arguments?: Record<string, unknown> | undefined;
|
|
3915
|
-
}[];
|
|
3916
|
-
order?: "strict" | "any" | undefined;
|
|
3917
|
-
exclusive?: boolean | undefined;
|
|
3918
|
-
} | undefined;
|
|
3919
|
-
toolCallCount?: {
|
|
3920
|
-
exact?: number | undefined;
|
|
3921
|
-
min?: number | undefined;
|
|
3922
|
-
max?: number | undefined;
|
|
3923
|
-
} | undefined;
|
|
3924
|
-
containsText?: string | string[] | undefined;
|
|
3925
|
-
matchesPattern?: string | string[] | undefined;
|
|
3926
|
-
snapshotSanitizers?: ("uuid" | "jwt" | "timestamp" | "iso-date" | "objectId" | {
|
|
3927
|
-
pattern: string;
|
|
3928
|
-
replacement?: string | undefined;
|
|
3929
|
-
} | {
|
|
3930
|
-
remove: string[];
|
|
3931
|
-
})[] | undefined;
|
|
3932
|
-
passesJudge?: {
|
|
3933
|
-
rubric: "correctness" | "completeness" | "groundedness" | "instruction-following" | "conciseness" | {
|
|
3934
|
-
text: string;
|
|
3935
|
-
};
|
|
3936
|
-
model?: string | undefined;
|
|
3937
|
-
maxTokens?: number | undefined;
|
|
3938
|
-
maxBudgetUsd?: number | undefined;
|
|
3939
|
-
reference?: unknown;
|
|
3940
|
-
threshold?: number | undefined;
|
|
3941
|
-
reps?: number | undefined;
|
|
3942
|
-
provider?: "openai" | "anthropic" | "google" | undefined;
|
|
3943
|
-
apiKeyEnvVar?: string | undefined;
|
|
3944
|
-
temperature?: number | undefined;
|
|
3945
|
-
maxToolOutputSize?: number | undefined;
|
|
3946
|
-
} | undefined;
|
|
3947
|
-
responseSize?: {
|
|
3948
|
-
maxBytes?: number | undefined;
|
|
3949
|
-
minBytes?: number | undefined;
|
|
3950
|
-
} | undefined;
|
|
3951
|
-
} | undefined;
|
|
3952
|
-
}[];
|
|
3953
|
-
metadata?: Record<string, unknown> | undefined;
|
|
3954
|
-
description?: string | undefined;
|
|
3955
|
-
}, {
|
|
3956
|
-
name: string;
|
|
3957
|
-
cases: {
|
|
3958
|
-
id: string;
|
|
3959
|
-
args?: Record<string, unknown> | undefined;
|
|
3960
|
-
mode?: "direct" | "llm_host" | undefined;
|
|
3961
|
-
metadata?: Record<string, unknown> | undefined;
|
|
3962
|
-
description?: string | undefined;
|
|
3963
|
-
toolName?: string | undefined;
|
|
3964
|
-
scenario?: string | undefined;
|
|
3965
|
-
llmHostConfig?: {
|
|
3966
|
-
provider: "openai" | "anthropic" | "azure" | "google" | "mistral" | "deepseek" | "openrouter" | "xai" | "vertex-anthropic";
|
|
3967
|
-
model?: string | undefined;
|
|
3968
|
-
maxTokens?: number | undefined;
|
|
3969
|
-
apiKeyEnvVar?: string | undefined;
|
|
3970
|
-
temperature?: number | undefined;
|
|
3971
|
-
maxToolCalls?: number | undefined;
|
|
3972
|
-
} | undefined;
|
|
3973
|
-
iterations?: number | undefined;
|
|
3974
|
-
accuracyThreshold?: number | undefined;
|
|
3975
|
-
judgeReps?: number | undefined;
|
|
3976
|
-
canonicalAnswer?: string | undefined;
|
|
3977
|
-
tags?: string[] | undefined;
|
|
3978
|
-
expect?: {
|
|
3979
|
-
response?: unknown;
|
|
3980
|
-
isError?: string | boolean | string[] | undefined;
|
|
3981
|
-
schema?: string | undefined;
|
|
3982
|
-
snapshot?: string | undefined;
|
|
3983
|
-
toolsTriggered?: {
|
|
3984
|
-
calls: {
|
|
3985
|
-
name: string;
|
|
3986
|
-
required?: boolean | undefined;
|
|
3987
|
-
arguments?: Record<string, unknown> | undefined;
|
|
3988
|
-
}[];
|
|
3989
|
-
order?: "strict" | "any" | undefined;
|
|
3990
|
-
exclusive?: boolean | undefined;
|
|
3991
|
-
} | undefined;
|
|
3992
|
-
toolCallCount?: {
|
|
3993
|
-
exact?: number | undefined;
|
|
3994
|
-
min?: number | undefined;
|
|
3995
|
-
max?: number | undefined;
|
|
3996
|
-
} | undefined;
|
|
3997
|
-
containsText?: string | string[] | undefined;
|
|
3998
|
-
matchesPattern?: string | string[] | undefined;
|
|
3999
|
-
snapshotSanitizers?: ("uuid" | "jwt" | "timestamp" | "iso-date" | "objectId" | {
|
|
4000
|
-
pattern: string;
|
|
4001
|
-
replacement?: string | undefined;
|
|
4002
|
-
} | {
|
|
4003
|
-
remove: string[];
|
|
4004
|
-
})[] | undefined;
|
|
4005
|
-
passesJudge?: {
|
|
4006
|
-
rubric: "correctness" | "completeness" | "groundedness" | "instruction-following" | "conciseness" | {
|
|
4007
|
-
text: string;
|
|
4008
|
-
};
|
|
4009
|
-
model?: string | undefined;
|
|
4010
|
-
maxTokens?: number | undefined;
|
|
4011
|
-
maxBudgetUsd?: number | undefined;
|
|
4012
|
-
reference?: unknown;
|
|
4013
|
-
threshold?: number | undefined;
|
|
4014
|
-
reps?: number | undefined;
|
|
4015
|
-
provider?: "openai" | "anthropic" | "google" | undefined;
|
|
4016
|
-
apiKeyEnvVar?: string | undefined;
|
|
4017
|
-
temperature?: number | undefined;
|
|
4018
|
-
maxToolOutputSize?: number | undefined;
|
|
4019
|
-
} | undefined;
|
|
4020
|
-
responseSize?: {
|
|
4021
|
-
maxBytes?: number | undefined;
|
|
4022
|
-
minBytes?: number | undefined;
|
|
4023
|
-
} | undefined;
|
|
4024
|
-
} | undefined;
|
|
4025
|
-
}[];
|
|
4026
|
-
metadata?: Record<string, unknown> | undefined;
|
|
4027
|
-
description?: string | undefined;
|
|
4028
|
-
}>;
|
|
3060
|
+
}, z.core.$strip>;
|
|
4029
3061
|
/**
|
|
4030
3062
|
* Type for serialized eval dataset (without Zod schemas)
|
|
4031
3063
|
*/
|
|
@@ -4122,15 +3154,15 @@ interface EvalRunMetadata {
|
|
|
4122
3154
|
timestamp: string;
|
|
4123
3155
|
/** Package version from package.json */
|
|
4124
3156
|
packageVersion: string;
|
|
4125
|
-
/**
|
|
4126
|
-
|
|
3157
|
+
/** MCP host model identifier (if mcp_host mode) */
|
|
3158
|
+
mcpHostModel?: string;
|
|
4127
3159
|
/** Judge model identifier (if judge was used) */
|
|
4128
3160
|
judgeModel?: string;
|
|
4129
3161
|
}
|
|
4130
3162
|
/**
|
|
4131
3163
|
* Individual conformance check result
|
|
4132
3164
|
*/
|
|
4133
|
-
interface MCPConformanceCheck
|
|
3165
|
+
interface MCPConformanceCheck {
|
|
4134
3166
|
/**
|
|
4135
3167
|
* Check name (e.g., 'server_info_present', 'list_tools_succeeds')
|
|
4136
3168
|
*/
|
|
@@ -4159,7 +3191,7 @@ interface MCPConformanceResultData {
|
|
|
4159
3191
|
/**
|
|
4160
3192
|
* Individual check results
|
|
4161
3193
|
*/
|
|
4162
|
-
checks: MCPConformanceCheck
|
|
3194
|
+
checks: MCPConformanceCheck[];
|
|
4163
3195
|
/**
|
|
4164
3196
|
* Server info if available
|
|
4165
3197
|
*/
|
|
@@ -4282,12 +3314,6 @@ interface EvalCaseResult {
|
|
|
4282
3314
|
* Only present when the case was run with `iterations > 1`.
|
|
4283
3315
|
*/
|
|
4284
3316
|
infrastructureErrorRate?: number;
|
|
4285
|
-
/**
|
|
4286
|
-
* Accuracy score (0–1) across all iterations.
|
|
4287
|
-
* Alias for `assertionPassRate`. Only present when the case was run with `iterations > 1`.
|
|
4288
|
-
* @deprecated Use `assertionPassRate` for clarity; this field is kept for backward compatibility.
|
|
4289
|
-
*/
|
|
4290
|
-
accuracy?: number;
|
|
4291
3317
|
/**
|
|
4292
3318
|
* Per-iteration pass/fail breakdown.
|
|
4293
3319
|
* Only present when the case was run with `iterations > 1`.
|
|
@@ -4300,7 +3326,7 @@ interface EvalCaseResult {
|
|
|
4300
3326
|
/**
|
|
4301
3327
|
* Precision of tool calls made (0–1).
|
|
4302
3328
|
* 1.0 means every tool called was expected; <1.0 means unexpected tools were called.
|
|
4303
|
-
*
|
|
3329
|
+
* Populated whenever a `toolsTriggered` expectation is evaluated.
|
|
4304
3330
|
*/
|
|
4305
3331
|
toolPrecision?: number;
|
|
4306
3332
|
/**
|
|
@@ -4319,6 +3345,23 @@ interface EvalCaseResult {
|
|
|
4319
3345
|
* Only present when the case was run with `iterations > 1`.
|
|
4320
3346
|
*/
|
|
4321
3347
|
infrastructureErrorCount?: number;
|
|
3348
|
+
/**
|
|
3349
|
+
* Ordered trace of tool calls made by the LLM in mcp_host mode.
|
|
3350
|
+
* Only populated when the eval case uses toolsTriggered expectations.
|
|
3351
|
+
*/
|
|
3352
|
+
mcpHostTrace?: {
|
|
3353
|
+
/** The ordered sequence of tool calls made by the LLM */
|
|
3354
|
+
calls: Array<{
|
|
3355
|
+
name: string;
|
|
3356
|
+
arguments: Record<string, unknown>;
|
|
3357
|
+
/** 'expected' = was in the expected set, 'unexpected' = was not expected */
|
|
3358
|
+
status: 'expected' | 'unexpected';
|
|
3359
|
+
}>;
|
|
3360
|
+
/** Tools that were required but never called */
|
|
3361
|
+
missed: Array<{
|
|
3362
|
+
name: string;
|
|
3363
|
+
}>;
|
|
3364
|
+
};
|
|
4322
3365
|
}
|
|
4323
3366
|
/**
|
|
4324
3367
|
* Aggregated MCP eval run data
|
|
@@ -4462,13 +3505,13 @@ interface EvalRunnerResult {
|
|
|
4462
3505
|
*/
|
|
4463
3506
|
improvements?: number;
|
|
4464
3507
|
/**
|
|
4465
|
-
* Average tool precision across all
|
|
3508
|
+
* Average tool precision across all mcp_host cases that have a
|
|
4466
3509
|
* `toolsTriggered` expectation (precision = fraction of called tools
|
|
4467
3510
|
* that were expected). Only present when at least one such case ran.
|
|
4468
3511
|
*/
|
|
4469
3512
|
datasetToolPrecision?: number;
|
|
4470
3513
|
/**
|
|
4471
|
-
* Average tool recall across all
|
|
3514
|
+
* Average tool recall across all mcp_host cases that have a
|
|
4472
3515
|
* `toolsTriggered` expectation (recall = fraction of required tools
|
|
4473
3516
|
* that were actually called). Only present when at least one such case ran.
|
|
4474
3517
|
*/
|
|
@@ -4523,7 +3566,7 @@ interface EvalRunnerOptions {
|
|
|
4523
3566
|
*/
|
|
4524
3567
|
concurrency?: number;
|
|
4525
3568
|
/**
|
|
4526
|
-
* Default iteration count for `
|
|
3569
|
+
* Default iteration count for `mcp_host` mode cases that do not specify
|
|
4527
3570
|
* `iterations` explicitly. Has no effect on `direct` mode cases (which are
|
|
4528
3571
|
* deterministic and always default to 1 iteration).
|
|
4529
3572
|
*
|
|
@@ -4534,7 +3577,7 @@ interface EvalRunnerOptions {
|
|
|
4534
3577
|
*
|
|
4535
3578
|
* @example
|
|
4536
3579
|
* ```typescript
|
|
4537
|
-
* // Run all
|
|
3580
|
+
* // Run all mcp_host cases 10 times each by default
|
|
4538
3581
|
* await runEvalDataset({ dataset, defaultLlmIterations: 10 }, { mcp });
|
|
4539
3582
|
* ```
|
|
4540
3583
|
*/
|
|
@@ -4567,12 +3610,12 @@ interface EvalRunnerOptions {
|
|
|
4567
3610
|
*/
|
|
4568
3611
|
baselineResultsFrom?: string;
|
|
4569
3612
|
/**
|
|
4570
|
-
*
|
|
4571
|
-
* Use this to identify which model was used when running
|
|
3613
|
+
* MCP host model identifier to record in run metadata.
|
|
3614
|
+
* Use this to identify which model was used when running mcp_host cases.
|
|
4572
3615
|
*
|
|
4573
3616
|
* @example 'claude-opus-4-20250514'
|
|
4574
3617
|
*/
|
|
4575
|
-
|
|
3618
|
+
mcpHostModel?: string;
|
|
4576
3619
|
/**
|
|
4577
3620
|
* Judge model identifier to record in run metadata.
|
|
4578
3621
|
* Use this to identify which model was used for judge evaluations.
|
|
@@ -4660,8 +3703,6 @@ interface ServerComparisonResult {
|
|
|
4660
3703
|
ties: number;
|
|
4661
3704
|
/** Cases where both failed */
|
|
4662
3705
|
bothFail: number;
|
|
4663
|
-
/** Raw count of cases where both servers failed (same as bothFail) */
|
|
4664
|
-
bothFailCount: number;
|
|
4665
3706
|
/** Cases with a decisive outcome (aWins + bWins + ties, excludes BOTH_FAIL) */
|
|
4666
3707
|
decidedCases: number;
|
|
4667
3708
|
/** Fraction of total cases where both servers failed (bothFail / total) */
|
|
@@ -4712,7 +3753,7 @@ type ServerComparisonOptions = Omit<EvalRunnerOptions, 'saveResultsTo' | 'baseli
|
|
|
4712
3753
|
declare function runServerComparison(options: ServerComparisonOptions, contextA: EvalContext, contextB: EvalContext): Promise<ServerComparisonResult>;
|
|
4713
3754
|
|
|
4714
3755
|
/**
|
|
4715
|
-
*
|
|
3756
|
+
* MCP Host Simulation - Main entry point
|
|
4716
3757
|
*
|
|
4717
3758
|
* All providers (openai, anthropic, google, azure, mistral, deepseek,
|
|
4718
3759
|
* openrouter, xai) run through the Vercel AI SDK orchestrator, which uses
|
|
@@ -4731,7 +3772,7 @@ declare function runServerComparison(options: ServerComparisonOptions, contextA:
|
|
|
4731
3772
|
*/
|
|
4732
3773
|
|
|
4733
3774
|
/**
|
|
4734
|
-
* Simulates an
|
|
3775
|
+
* Simulates an MCP host interacting with an MCP server.
|
|
4735
3776
|
*
|
|
4736
3777
|
* The LLM chooses which tools to call based solely on their descriptions and
|
|
4737
3778
|
* schemas, testing discoverability and parameter clarity at the level a real
|
|
@@ -4743,12 +3784,12 @@ declare function runServerComparison(options: ServerComparisonOptions, contextA:
|
|
|
4743
3784
|
*
|
|
4744
3785
|
* @param mcp - MCP fixture API
|
|
4745
3786
|
* @param scenario - Natural language prompt describing what the LLM should do
|
|
4746
|
-
* @param config -
|
|
3787
|
+
* @param config - MCP host configuration (provider, model, temperature, etc.)
|
|
4747
3788
|
* @returns Simulation result with tool calls, final response, and latency data
|
|
4748
3789
|
*
|
|
4749
3790
|
* @example
|
|
4750
3791
|
* ```typescript
|
|
4751
|
-
* const result = await
|
|
3792
|
+
* const result = await simulateMCPHost(mcp,
|
|
4752
3793
|
* "Find recent documents about MCP testing frameworks",
|
|
4753
3794
|
* { provider: 'anthropic', model: 'claude-3-5-sonnet-20241022' }
|
|
4754
3795
|
* );
|
|
@@ -4757,7 +3798,7 @@ declare function runServerComparison(options: ServerComparisonOptions, contextA:
|
|
|
4757
3798
|
* expect(result.toolCalls.map(c => c.name)).toContain('search');
|
|
4758
3799
|
* ```
|
|
4759
3800
|
*/
|
|
4760
|
-
declare function
|
|
3801
|
+
declare function simulateMCPHost(mcp: MCPFixtureApi, scenario: string, config: MCPHostConfig): Promise<MCPHostSimulationResult>;
|
|
4761
3802
|
/**
|
|
4762
3803
|
* Returns true if the given provider is supported.
|
|
4763
3804
|
*
|
|
@@ -4836,14 +3877,6 @@ interface MCPConformanceOptions {
|
|
|
4836
3877
|
*/
|
|
4837
3878
|
checkPrompts?: boolean;
|
|
4838
3879
|
}
|
|
4839
|
-
/**
|
|
4840
|
-
* Individual check result
|
|
4841
|
-
*/
|
|
4842
|
-
interface MCPConformanceCheck {
|
|
4843
|
-
name: string;
|
|
4844
|
-
pass: boolean;
|
|
4845
|
-
message: string;
|
|
4846
|
-
}
|
|
4847
3880
|
/**
|
|
4848
3881
|
* Raw MCP responses for snapshotting
|
|
4849
3882
|
*/
|
|
@@ -4976,4 +4009,4 @@ interface MCPEvalReporterConfig {
|
|
|
4976
4009
|
includeAutoTracking?: boolean;
|
|
4977
4010
|
}
|
|
4978
4011
|
|
|
4979
|
-
export { type AuthType, BUILT_IN_RUBRICS, type BuiltInRubric, type BuiltInSanitizer, CLIOAuthClient, type CLIOAuthClientConfig, type CLIOAuthResult, type CaseComparisonResult, type ClientCredentialsConfig, type ComparisonOutcome, type ContentBlock, type CreateMCPClientOptions, DiscoveryError, ENV_VAR_NAMES, type EvalCase, type EvalCaseResult, EvalCaseSchema, type EvalContext, type EvalDataset, EvalDatasetSchema, type EvalExpectBlock, type EvalExpectationResult, type EvalMode, type EvalRunnerOptions, type EvalRunnerResult, type ExpectationBreakdown, type ExpectationResultMap, type ExpectationType, type FieldRemovalSanitizer, type HttpMCPConfig, type IterationResult, type Judge, type JudgeConfig, type JudgeMatcherOptions, type JudgeResult, type JudgeValidatorConfig, type
|
|
4012
|
+
export { type AuthType, BUILT_IN_RUBRICS, type BuiltInRubric, type BuiltInSanitizer, CLIOAuthClient, type CLIOAuthClientConfig, type CLIOAuthResult, type CaseComparisonResult, type ClientCredentialsConfig, type ComparisonOutcome, type ContentBlock, type CreateMCPClientOptions, DiscoveryError, ENV_VAR_NAMES, type EvalCase, type EvalCaseResult, EvalCaseSchema, type EvalContext, type EvalDataset, EvalDatasetSchema, type EvalExpectBlock, type EvalExpectationResult, type EvalMode, type EvalRunnerOptions, type EvalRunnerResult, type ExpectationBreakdown, type ExpectationResultMap, type ExpectationType, type FieldRemovalSanitizer, type HttpMCPConfig, type IterationResult, type Judge, type JudgeConfig, type JudgeMatcherOptions, type JudgeResult, type JudgeValidatorConfig, type LLMProvider, type LLMToolCall, type LoadDatasetOptions, type MCPAuthConfig, type MCPAuthFixtures, type MCPClientCredentialsConfig, type MCPConfig, MCPConfigSchema, type MCPConformanceCheck, type MCPConformanceOptions, type MCPConformanceRaw, type MCPConformanceResult, type MCPConformanceResultData, type MCPEvalData, type MCPEvalHistoricalSummary, type MCPEvalReporterConfig, type MCPEvalRunData, type MCPFixtureApi, type MCPFixtureOptions, type MCPHostCapabilities, type MCPHostConfig, type MCPHostSimulationResult, type MCPHostSimulator, type MCPOAuthConfig, type MCPServerCapabilitiesData, MCP_PROTOCOL_VERSION, type NormalizedToolResponse, type OAuthSetupConfig, type PatternValidatorOptions, PlaywrightOAuthClientProvider, type PlaywrightOAuthClientProviderConfig, type PredicateResult, type ProtectedResourceDiscoveryResult, type ProtectedResourceMetadata, type ProviderKind, type RegexSanitizer, type ResultSource, type RubricSpec, type SchemaRegistry, type SchemaValidatorOptions, type SerializedEvalDataset, type ServerComparisonOptions, type ServerComparisonResult, type SizeValidatorOptions, type SnapshotSanitizer, SnapshotSanitizers, type StdioMCPConfig, type StoredClientInfo, type StoredOAuthState, type 
StoredServerMetadata, type StoredTokens, type TextValidatorOptions, type TokenResult, type ToolCallCountOptions, type ToolCallExpectation, type ToolPredicate, type UsageMetrics, type ValidationResult, closeMCPClient, createJudge, createMCPClientForConfig, createMCPFixture, createTokenAuthHeaders, discoverAuthorizationServer, discoverProtectedResource, expect, extractText, getMissingDependencyMessage, getResponseSizeBytes, hasValidTokens, injectTokens, isBuiltInRubric, isHttpConfig, isProviderAvailable, isStdioConfig, isTokenExpired, isTokenExpiringSoon, loadBaseline, loadEvalDataset, loadEvalDatasetFromObject, loadTokens, loadTokensFromEnv, test as mcpAuthTest, normalizeToolResponse, normalizeWhitespace, performClientCredentialsFlow, performOAuthSetup, performOAuthSetupIfNeeded, resolveRubric, runConformanceChecks, runEvalCase, runEvalDataset, runServerComparison, saveBaseline, simulateMCPHost, test$1 as test, validateAccessToken, validateError, validateEvalCase, validateEvalDataset, validateJudge, validateMCPConfig, validatePattern, validateResponse, validateSchema, validateSize, validateText, validateToolCallCount, validateToolCalls };
|