@gleanwork/mcp-server-tester 1.0.0-beta.3 → 1.0.0-beta.4
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/README.md +11 -10
- package/dist/cli/index.js +34 -11
- package/dist/fixtures/mcp.d.ts +6 -6
- package/dist/fixtures/mcp.js +5 -5
- package/dist/fixtures/mcp.js.map +1 -1
- package/dist/index.cjs +64 -43
- package/dist/index.cjs.map +1 -1
- package/dist/index.d.cts +199 -1168
- package/dist/index.d.ts +199 -1168
- package/dist/index.js +64 -43
- package/dist/index.js.map +1 -1
- package/dist/reporters/mcpReporter.cjs.map +1 -1
- package/dist/reporters/mcpReporter.js.map +1 -1
- package/dist/reporters/ui-dist/app.js +107 -7
- package/dist/reporters/ui-dist/styles.css +1 -1
- package/package.json +9 -6
- package/src/reporters/ui-dist/app.js +0 -174
- package/src/reporters/ui-dist/index.html +0 -28
- package/src/reporters/ui-dist/styles.css +0 -1
package/dist/index.d.cts
CHANGED
|
@@ -233,314 +233,64 @@ type MCPConfig = StdioMCPConfig | HttpMCPConfig;
|
|
|
233
233
|
/**
|
|
234
234
|
* Union schema for MCPConfig (validates based on transport type)
|
|
235
235
|
*/
|
|
236
|
-
declare const MCPConfigSchema: z.ZodDiscriminatedUnion<
|
|
236
|
+
declare const MCPConfigSchema: z.ZodDiscriminatedUnion<[z.ZodObject<{
|
|
237
237
|
transport: z.ZodLiteral<"stdio">;
|
|
238
238
|
command: z.ZodString;
|
|
239
|
-
args: z.ZodOptional<z.ZodArray<z.ZodString
|
|
239
|
+
args: z.ZodOptional<z.ZodArray<z.ZodString>>;
|
|
240
240
|
cwd: z.ZodOptional<z.ZodString>;
|
|
241
241
|
env: z.ZodOptional<z.ZodRecord<z.ZodString, z.ZodString>>;
|
|
242
242
|
capabilities: z.ZodOptional<z.ZodObject<{
|
|
243
243
|
sampling: z.ZodOptional<z.ZodRecord<z.ZodString, z.ZodUnknown>>;
|
|
244
244
|
roots: z.ZodOptional<z.ZodObject<{
|
|
245
245
|
listChanged: z.ZodBoolean;
|
|
246
|
-
},
|
|
247
|
-
|
|
248
|
-
}, {
|
|
249
|
-
listChanged: boolean;
|
|
250
|
-
}>>;
|
|
251
|
-
}, "strip", z.ZodTypeAny, {
|
|
252
|
-
sampling?: Record<string, unknown> | undefined;
|
|
253
|
-
roots?: {
|
|
254
|
-
listChanged: boolean;
|
|
255
|
-
} | undefined;
|
|
256
|
-
}, {
|
|
257
|
-
sampling?: Record<string, unknown> | undefined;
|
|
258
|
-
roots?: {
|
|
259
|
-
listChanged: boolean;
|
|
260
|
-
} | undefined;
|
|
261
|
-
}>>;
|
|
246
|
+
}, z.core.$strip>>;
|
|
247
|
+
}, z.core.$strip>>;
|
|
262
248
|
connectTimeoutMs: z.ZodOptional<z.ZodNumber>;
|
|
263
249
|
requestTimeoutMs: z.ZodOptional<z.ZodNumber>;
|
|
264
250
|
callTimeoutMs: z.ZodOptional<z.ZodNumber>;
|
|
265
251
|
quiet: z.ZodOptional<z.ZodBoolean>;
|
|
266
|
-
},
|
|
267
|
-
transport: "stdio";
|
|
268
|
-
command: string;
|
|
269
|
-
args?: string[] | undefined;
|
|
270
|
-
cwd?: string | undefined;
|
|
271
|
-
env?: Record<string, string> | undefined;
|
|
272
|
-
capabilities?: {
|
|
273
|
-
sampling?: Record<string, unknown> | undefined;
|
|
274
|
-
roots?: {
|
|
275
|
-
listChanged: boolean;
|
|
276
|
-
} | undefined;
|
|
277
|
-
} | undefined;
|
|
278
|
-
connectTimeoutMs?: number | undefined;
|
|
279
|
-
requestTimeoutMs?: number | undefined;
|
|
280
|
-
callTimeoutMs?: number | undefined;
|
|
281
|
-
quiet?: boolean | undefined;
|
|
282
|
-
}, {
|
|
283
|
-
transport: "stdio";
|
|
284
|
-
command: string;
|
|
285
|
-
args?: string[] | undefined;
|
|
286
|
-
cwd?: string | undefined;
|
|
287
|
-
env?: Record<string, string> | undefined;
|
|
288
|
-
capabilities?: {
|
|
289
|
-
sampling?: Record<string, unknown> | undefined;
|
|
290
|
-
roots?: {
|
|
291
|
-
listChanged: boolean;
|
|
292
|
-
} | undefined;
|
|
293
|
-
} | undefined;
|
|
294
|
-
connectTimeoutMs?: number | undefined;
|
|
295
|
-
requestTimeoutMs?: number | undefined;
|
|
296
|
-
callTimeoutMs?: number | undefined;
|
|
297
|
-
quiet?: boolean | undefined;
|
|
298
|
-
}>, z.ZodObject<{
|
|
252
|
+
}, z.core.$strip>, z.ZodObject<{
|
|
299
253
|
transport: z.ZodLiteral<"http">;
|
|
300
|
-
serverUrl: z.
|
|
254
|
+
serverUrl: z.ZodString;
|
|
301
255
|
headers: z.ZodOptional<z.ZodRecord<z.ZodString, z.ZodString>>;
|
|
302
256
|
capabilities: z.ZodOptional<z.ZodObject<{
|
|
303
257
|
sampling: z.ZodOptional<z.ZodRecord<z.ZodString, z.ZodUnknown>>;
|
|
304
258
|
roots: z.ZodOptional<z.ZodObject<{
|
|
305
259
|
listChanged: z.ZodBoolean;
|
|
306
|
-
},
|
|
307
|
-
|
|
308
|
-
}, {
|
|
309
|
-
listChanged: boolean;
|
|
310
|
-
}>>;
|
|
311
|
-
}, "strip", z.ZodTypeAny, {
|
|
312
|
-
sampling?: Record<string, unknown> | undefined;
|
|
313
|
-
roots?: {
|
|
314
|
-
listChanged: boolean;
|
|
315
|
-
} | undefined;
|
|
316
|
-
}, {
|
|
317
|
-
sampling?: Record<string, unknown> | undefined;
|
|
318
|
-
roots?: {
|
|
319
|
-
listChanged: boolean;
|
|
320
|
-
} | undefined;
|
|
321
|
-
}>>;
|
|
260
|
+
}, z.core.$strip>>;
|
|
261
|
+
}, z.core.$strip>>;
|
|
322
262
|
connectTimeoutMs: z.ZodOptional<z.ZodNumber>;
|
|
323
263
|
requestTimeoutMs: z.ZodOptional<z.ZodNumber>;
|
|
324
264
|
callTimeoutMs: z.ZodOptional<z.ZodNumber>;
|
|
325
|
-
auth: z.ZodOptional<z.
|
|
265
|
+
auth: z.ZodOptional<z.ZodObject<{
|
|
326
266
|
accessToken: z.ZodOptional<z.ZodString>;
|
|
327
267
|
oauth: z.ZodOptional<z.ZodObject<{
|
|
328
268
|
serverUrl: z.ZodString;
|
|
329
|
-
scopes: z.ZodOptional<z.ZodArray<z.ZodString
|
|
269
|
+
scopes: z.ZodOptional<z.ZodArray<z.ZodString>>;
|
|
330
270
|
resource: z.ZodOptional<z.ZodString>;
|
|
331
271
|
authStatePath: z.ZodOptional<z.ZodString>;
|
|
332
272
|
clientId: z.ZodOptional<z.ZodString>;
|
|
333
273
|
clientSecret: z.ZodOptional<z.ZodString>;
|
|
334
274
|
redirectUri: z.ZodOptional<z.ZodString>;
|
|
335
|
-
},
|
|
336
|
-
serverUrl: string;
|
|
337
|
-
scopes?: string[] | undefined;
|
|
338
|
-
resource?: string | undefined;
|
|
339
|
-
authStatePath?: string | undefined;
|
|
340
|
-
clientId?: string | undefined;
|
|
341
|
-
clientSecret?: string | undefined;
|
|
342
|
-
redirectUri?: string | undefined;
|
|
343
|
-
}, {
|
|
344
|
-
serverUrl: string;
|
|
345
|
-
scopes?: string[] | undefined;
|
|
346
|
-
resource?: string | undefined;
|
|
347
|
-
authStatePath?: string | undefined;
|
|
348
|
-
clientId?: string | undefined;
|
|
349
|
-
clientSecret?: string | undefined;
|
|
350
|
-
redirectUri?: string | undefined;
|
|
351
|
-
}>>;
|
|
275
|
+
}, z.core.$strip>>;
|
|
352
276
|
clientCredentials: z.ZodOptional<z.ZodObject<{
|
|
353
277
|
clientId: z.ZodOptional<z.ZodString>;
|
|
354
278
|
clientSecret: z.ZodOptional<z.ZodString>;
|
|
355
279
|
tokenEndpoint: z.ZodOptional<z.ZodString>;
|
|
356
|
-
scopes: z.ZodOptional<z.ZodArray<z.ZodString
|
|
357
|
-
},
|
|
358
|
-
|
|
359
|
-
clientId?: string | undefined;
|
|
360
|
-
clientSecret?: string | undefined;
|
|
361
|
-
tokenEndpoint?: string | undefined;
|
|
362
|
-
}, {
|
|
363
|
-
scopes?: string[] | undefined;
|
|
364
|
-
clientId?: string | undefined;
|
|
365
|
-
clientSecret?: string | undefined;
|
|
366
|
-
tokenEndpoint?: string | undefined;
|
|
367
|
-
}>>;
|
|
368
|
-
}, "strip", z.ZodTypeAny, {
|
|
369
|
-
accessToken?: string | undefined;
|
|
370
|
-
oauth?: {
|
|
371
|
-
serverUrl: string;
|
|
372
|
-
scopes?: string[] | undefined;
|
|
373
|
-
resource?: string | undefined;
|
|
374
|
-
authStatePath?: string | undefined;
|
|
375
|
-
clientId?: string | undefined;
|
|
376
|
-
clientSecret?: string | undefined;
|
|
377
|
-
redirectUri?: string | undefined;
|
|
378
|
-
} | undefined;
|
|
379
|
-
clientCredentials?: {
|
|
380
|
-
scopes?: string[] | undefined;
|
|
381
|
-
clientId?: string | undefined;
|
|
382
|
-
clientSecret?: string | undefined;
|
|
383
|
-
tokenEndpoint?: string | undefined;
|
|
384
|
-
} | undefined;
|
|
385
|
-
}, {
|
|
386
|
-
accessToken?: string | undefined;
|
|
387
|
-
oauth?: {
|
|
388
|
-
serverUrl: string;
|
|
389
|
-
scopes?: string[] | undefined;
|
|
390
|
-
resource?: string | undefined;
|
|
391
|
-
authStatePath?: string | undefined;
|
|
392
|
-
clientId?: string | undefined;
|
|
393
|
-
clientSecret?: string | undefined;
|
|
394
|
-
redirectUri?: string | undefined;
|
|
395
|
-
} | undefined;
|
|
396
|
-
clientCredentials?: {
|
|
397
|
-
scopes?: string[] | undefined;
|
|
398
|
-
clientId?: string | undefined;
|
|
399
|
-
clientSecret?: string | undefined;
|
|
400
|
-
tokenEndpoint?: string | undefined;
|
|
401
|
-
} | undefined;
|
|
402
|
-
}>, {
|
|
403
|
-
accessToken?: string | undefined;
|
|
404
|
-
oauth?: {
|
|
405
|
-
serverUrl: string;
|
|
406
|
-
scopes?: string[] | undefined;
|
|
407
|
-
resource?: string | undefined;
|
|
408
|
-
authStatePath?: string | undefined;
|
|
409
|
-
clientId?: string | undefined;
|
|
410
|
-
clientSecret?: string | undefined;
|
|
411
|
-
redirectUri?: string | undefined;
|
|
412
|
-
} | undefined;
|
|
413
|
-
clientCredentials?: {
|
|
414
|
-
scopes?: string[] | undefined;
|
|
415
|
-
clientId?: string | undefined;
|
|
416
|
-
clientSecret?: string | undefined;
|
|
417
|
-
tokenEndpoint?: string | undefined;
|
|
418
|
-
} | undefined;
|
|
419
|
-
}, {
|
|
420
|
-
accessToken?: string | undefined;
|
|
421
|
-
oauth?: {
|
|
422
|
-
serverUrl: string;
|
|
423
|
-
scopes?: string[] | undefined;
|
|
424
|
-
resource?: string | undefined;
|
|
425
|
-
authStatePath?: string | undefined;
|
|
426
|
-
clientId?: string | undefined;
|
|
427
|
-
clientSecret?: string | undefined;
|
|
428
|
-
redirectUri?: string | undefined;
|
|
429
|
-
} | undefined;
|
|
430
|
-
clientCredentials?: {
|
|
431
|
-
scopes?: string[] | undefined;
|
|
432
|
-
clientId?: string | undefined;
|
|
433
|
-
clientSecret?: string | undefined;
|
|
434
|
-
tokenEndpoint?: string | undefined;
|
|
435
|
-
} | undefined;
|
|
436
|
-
}>>;
|
|
280
|
+
scopes: z.ZodOptional<z.ZodArray<z.ZodString>>;
|
|
281
|
+
}, z.core.$strip>>;
|
|
282
|
+
}, z.core.$strip>>;
|
|
437
283
|
proxy: z.ZodOptional<z.ZodObject<{
|
|
438
284
|
url: z.ZodString;
|
|
439
|
-
},
|
|
440
|
-
url: string;
|
|
441
|
-
}, {
|
|
442
|
-
url: string;
|
|
443
|
-
}>>;
|
|
285
|
+
}, z.core.$strip>>;
|
|
444
286
|
retryAttempts: z.ZodOptional<z.ZodNumber>;
|
|
445
287
|
tls: z.ZodOptional<z.ZodObject<{
|
|
446
288
|
ca: z.ZodOptional<z.ZodString>;
|
|
447
289
|
cert: z.ZodOptional<z.ZodString>;
|
|
448
290
|
key: z.ZodOptional<z.ZodString>;
|
|
449
291
|
rejectUnauthorized: z.ZodOptional<z.ZodBoolean>;
|
|
450
|
-
},
|
|
451
|
-
|
|
452
|
-
cert?: string | undefined;
|
|
453
|
-
key?: string | undefined;
|
|
454
|
-
rejectUnauthorized?: boolean | undefined;
|
|
455
|
-
}, {
|
|
456
|
-
ca?: string | undefined;
|
|
457
|
-
cert?: string | undefined;
|
|
458
|
-
key?: string | undefined;
|
|
459
|
-
rejectUnauthorized?: boolean | undefined;
|
|
460
|
-
}>>;
|
|
461
|
-
}, "strip", z.ZodTypeAny, {
|
|
462
|
-
serverUrl: string;
|
|
463
|
-
transport: "http";
|
|
464
|
-
capabilities?: {
|
|
465
|
-
sampling?: Record<string, unknown> | undefined;
|
|
466
|
-
roots?: {
|
|
467
|
-
listChanged: boolean;
|
|
468
|
-
} | undefined;
|
|
469
|
-
} | undefined;
|
|
470
|
-
connectTimeoutMs?: number | undefined;
|
|
471
|
-
requestTimeoutMs?: number | undefined;
|
|
472
|
-
callTimeoutMs?: number | undefined;
|
|
473
|
-
headers?: Record<string, string> | undefined;
|
|
474
|
-
auth?: {
|
|
475
|
-
accessToken?: string | undefined;
|
|
476
|
-
oauth?: {
|
|
477
|
-
serverUrl: string;
|
|
478
|
-
scopes?: string[] | undefined;
|
|
479
|
-
resource?: string | undefined;
|
|
480
|
-
authStatePath?: string | undefined;
|
|
481
|
-
clientId?: string | undefined;
|
|
482
|
-
clientSecret?: string | undefined;
|
|
483
|
-
redirectUri?: string | undefined;
|
|
484
|
-
} | undefined;
|
|
485
|
-
clientCredentials?: {
|
|
486
|
-
scopes?: string[] | undefined;
|
|
487
|
-
clientId?: string | undefined;
|
|
488
|
-
clientSecret?: string | undefined;
|
|
489
|
-
tokenEndpoint?: string | undefined;
|
|
490
|
-
} | undefined;
|
|
491
|
-
} | undefined;
|
|
492
|
-
proxy?: {
|
|
493
|
-
url: string;
|
|
494
|
-
} | undefined;
|
|
495
|
-
retryAttempts?: number | undefined;
|
|
496
|
-
tls?: {
|
|
497
|
-
ca?: string | undefined;
|
|
498
|
-
cert?: string | undefined;
|
|
499
|
-
key?: string | undefined;
|
|
500
|
-
rejectUnauthorized?: boolean | undefined;
|
|
501
|
-
} | undefined;
|
|
502
|
-
}, {
|
|
503
|
-
serverUrl: string;
|
|
504
|
-
transport: "http";
|
|
505
|
-
capabilities?: {
|
|
506
|
-
sampling?: Record<string, unknown> | undefined;
|
|
507
|
-
roots?: {
|
|
508
|
-
listChanged: boolean;
|
|
509
|
-
} | undefined;
|
|
510
|
-
} | undefined;
|
|
511
|
-
connectTimeoutMs?: number | undefined;
|
|
512
|
-
requestTimeoutMs?: number | undefined;
|
|
513
|
-
callTimeoutMs?: number | undefined;
|
|
514
|
-
headers?: Record<string, string> | undefined;
|
|
515
|
-
auth?: {
|
|
516
|
-
accessToken?: string | undefined;
|
|
517
|
-
oauth?: {
|
|
518
|
-
serverUrl: string;
|
|
519
|
-
scopes?: string[] | undefined;
|
|
520
|
-
resource?: string | undefined;
|
|
521
|
-
authStatePath?: string | undefined;
|
|
522
|
-
clientId?: string | undefined;
|
|
523
|
-
clientSecret?: string | undefined;
|
|
524
|
-
redirectUri?: string | undefined;
|
|
525
|
-
} | undefined;
|
|
526
|
-
clientCredentials?: {
|
|
527
|
-
scopes?: string[] | undefined;
|
|
528
|
-
clientId?: string | undefined;
|
|
529
|
-
clientSecret?: string | undefined;
|
|
530
|
-
tokenEndpoint?: string | undefined;
|
|
531
|
-
} | undefined;
|
|
532
|
-
} | undefined;
|
|
533
|
-
proxy?: {
|
|
534
|
-
url: string;
|
|
535
|
-
} | undefined;
|
|
536
|
-
retryAttempts?: number | undefined;
|
|
537
|
-
tls?: {
|
|
538
|
-
ca?: string | undefined;
|
|
539
|
-
cert?: string | undefined;
|
|
540
|
-
key?: string | undefined;
|
|
541
|
-
rejectUnauthorized?: boolean | undefined;
|
|
542
|
-
} | undefined;
|
|
543
|
-
}>]>;
|
|
292
|
+
}, z.core.$strip>>;
|
|
293
|
+
}, z.core.$strip>], "transport">;
|
|
544
294
|
/**
|
|
545
295
|
* Validates an MCPConfig object
|
|
546
296
|
*
|
|
@@ -1790,9 +1540,9 @@ declare function validateError(response: unknown, expected?: boolean | string |
|
|
|
1790
1540
|
declare function validateSize(response: unknown, options: SizeValidatorOptions): ValidationResult;
|
|
1791
1541
|
|
|
1792
1542
|
/**
|
|
1793
|
-
* Tool call validators for
|
|
1543
|
+
* Tool call validators for mcp_host simulation results.
|
|
1794
1544
|
*
|
|
1795
|
-
* These validators extract the tool call trace from an
|
|
1545
|
+
* These validators extract the tool call trace from an MCPHostSimulationResult
|
|
1796
1546
|
* and apply assertions against expected call lists and counts.
|
|
1797
1547
|
*/
|
|
1798
1548
|
|
|
@@ -1811,16 +1561,16 @@ interface ToolCallCountOptions {
|
|
|
1811
1561
|
exact?: number;
|
|
1812
1562
|
}
|
|
1813
1563
|
/**
|
|
1814
|
-
* Validates tool calls made during an
|
|
1564
|
+
* Validates tool calls made during an MCP host simulation.
|
|
1815
1565
|
*
|
|
1816
|
-
* @param response - Must be an
|
|
1566
|
+
* @param response - Must be an MCPHostSimulationResult (from mcp_host mode)
|
|
1817
1567
|
* @param expectation - Expected tool call specification
|
|
1818
1568
|
*/
|
|
1819
1569
|
declare function validateToolCalls(response: unknown, expectation: ToolCallExpectation): ValidationResult;
|
|
1820
1570
|
/**
|
|
1821
|
-
* Validates the number of tool calls made during an
|
|
1571
|
+
* Validates the number of tool calls made during an MCP host simulation.
|
|
1822
1572
|
*
|
|
1823
|
-
* @param response - Must be an
|
|
1573
|
+
* @param response - Must be an MCPHostSimulationResult (from mcp_host mode)
|
|
1824
1574
|
* @param options - Count constraints (min, max, exact)
|
|
1825
1575
|
*/
|
|
1826
1576
|
declare function validateToolCallCount(response: unknown, options: ToolCallCountOptions): ValidationResult;
|
|
@@ -2223,7 +1973,7 @@ declare global {
|
|
|
2223
1973
|
*/
|
|
2224
1974
|
toSatisfyToolPredicate(predicate: ToolPredicate, description?: string): Promise<R>;
|
|
2225
1975
|
/**
|
|
2226
|
-
* Validates which tools the LLM called during
|
|
1976
|
+
* Validates which tools the LLM called during a mcp_host simulation.
|
|
2227
1977
|
*
|
|
2228
1978
|
* @example
|
|
2229
1979
|
* ```typescript
|
|
@@ -2235,7 +1985,7 @@ declare global {
|
|
|
2235
1985
|
*/
|
|
2236
1986
|
toHaveToolCalls(expectation: ToolCallExpectation): R;
|
|
2237
1987
|
/**
|
|
2238
|
-
* Validates the number of tool calls made during
|
|
1988
|
+
* Validates the number of tool calls made during a mcp_host simulation.
|
|
2239
1989
|
*
|
|
2240
1990
|
* @example
|
|
2241
1991
|
* ```typescript
|
|
@@ -2603,7 +2353,7 @@ declare function toSatisfyToolPredicate(this: {
|
|
|
2603
2353
|
/**
|
|
2604
2354
|
* toHaveToolCalls Matcher
|
|
2605
2355
|
*
|
|
2606
|
-
* Validates which tools the LLM called during
|
|
2356
|
+
* Validates which tools the LLM called during a mcp_host simulation.
|
|
2607
2357
|
*/
|
|
2608
2358
|
|
|
2609
2359
|
/**
|
|
@@ -2619,7 +2369,7 @@ declare function toHaveToolCalls(this: {
|
|
|
2619
2369
|
/**
|
|
2620
2370
|
* toHaveToolCallCount Matcher
|
|
2621
2371
|
*
|
|
2622
|
-
* Validates the number of tool calls made during
|
|
2372
|
+
* Validates the number of tool calls made during a mcp_host simulation.
|
|
2623
2373
|
*/
|
|
2624
2374
|
|
|
2625
2375
|
/**
|
|
@@ -2728,9 +2478,9 @@ interface MCPAuthFixtures {
|
|
|
2728
2478
|
declare const test: playwright_test.TestType<playwright_test.PlaywrightTestArgs & playwright_test.PlaywrightTestOptions & MCPAuthFixtures, playwright_test.PlaywrightWorkerArgs & playwright_test.PlaywrightWorkerOptions>;
|
|
2729
2479
|
|
|
2730
2480
|
/**
|
|
2731
|
-
* Types and interfaces for
|
|
2481
|
+
* Types and interfaces for MCP host simulation mode
|
|
2732
2482
|
*
|
|
2733
|
-
* This module provides types for testing MCP servers through
|
|
2483
|
+
* This module provides types for testing MCP servers through MCP hosts,
|
|
2734
2484
|
* validating tool descriptions, parameter clarity, and discoverability.
|
|
2735
2485
|
*/
|
|
2736
2486
|
|
|
@@ -2759,9 +2509,9 @@ type LLMProvider = 'openai' | 'anthropic' | 'azure' | 'google' | 'mistral' | 'de
|
|
|
2759
2509
|
*/
|
|
2760
2510
|
| 'vertex-anthropic';
|
|
2761
2511
|
/**
|
|
2762
|
-
* Configuration for
|
|
2512
|
+
* Configuration for MCP host simulation
|
|
2763
2513
|
*/
|
|
2764
|
-
interface
|
|
2514
|
+
interface MCPHostConfig {
|
|
2765
2515
|
/**
|
|
2766
2516
|
* LLM provider to use
|
|
2767
2517
|
*/
|
|
@@ -2801,9 +2551,9 @@ interface LLMToolCall {
|
|
|
2801
2551
|
id?: string;
|
|
2802
2552
|
}
|
|
2803
2553
|
/**
|
|
2804
|
-
* Result from an
|
|
2554
|
+
* Result from an MCP host simulation
|
|
2805
2555
|
*/
|
|
2806
|
-
interface
|
|
2556
|
+
interface MCPHostSimulationResult {
|
|
2807
2557
|
/** Whether the simulation succeeded */
|
|
2808
2558
|
success: boolean;
|
|
2809
2559
|
/** Tool calls made by the LLM */
|
|
@@ -2831,33 +2581,33 @@ interface LLMHostSimulationResult {
|
|
|
2831
2581
|
mcpDurationMs?: number;
|
|
2832
2582
|
}
|
|
2833
2583
|
/**
|
|
2834
|
-
* Interface for
|
|
2584
|
+
* Interface for MCP host simulators.
|
|
2835
2585
|
*
|
|
2836
2586
|
* The only built-in implementation is the Vercel AI SDK orchestrator
|
|
2837
|
-
* (src/evals/
|
|
2587
|
+
* (src/evals/mcpHost/adapters/vercel.ts). Custom implementations can be
|
|
2838
2588
|
* created for specialised testing needs.
|
|
2839
2589
|
*/
|
|
2840
|
-
interface
|
|
2590
|
+
interface MCPHostSimulator {
|
|
2841
2591
|
/**
|
|
2842
|
-
* Simulates an
|
|
2592
|
+
* Simulates an MCP host interacting with an MCP server
|
|
2843
2593
|
*
|
|
2844
2594
|
* @param mcp - MCP fixture API
|
|
2845
2595
|
* @param scenario - Natural language prompt describing what the LLM should do
|
|
2846
|
-
* @param config -
|
|
2596
|
+
* @param config - MCP host configuration
|
|
2847
2597
|
* @returns Simulation result with tool calls and response
|
|
2848
2598
|
*/
|
|
2849
|
-
simulate(mcp: MCPFixtureApi, scenario: string, config:
|
|
2599
|
+
simulate(mcp: MCPFixtureApi, scenario: string, config: MCPHostConfig): Promise<MCPHostSimulationResult>;
|
|
2850
2600
|
}
|
|
2851
2601
|
|
|
2852
2602
|
/**
|
|
2853
2603
|
* Evaluation mode
|
|
2854
2604
|
*/
|
|
2855
|
-
type EvalMode = 'direct' | '
|
|
2605
|
+
type EvalMode = 'direct' | 'mcp_host';
|
|
2856
2606
|
/**
|
|
2857
2607
|
* A single eval test case
|
|
2858
2608
|
*
|
|
2859
2609
|
* For 'direct' mode: toolName and args are required
|
|
2860
|
-
* For '
|
|
2610
|
+
* For 'mcp_host' mode: scenario and mcpHostConfig are required
|
|
2861
2611
|
*/
|
|
2862
2612
|
interface EvalCase {
|
|
2863
2613
|
/**
|
|
@@ -2871,39 +2621,39 @@ interface EvalCase {
|
|
|
2871
2621
|
/**
|
|
2872
2622
|
* Evaluation mode
|
|
2873
2623
|
* - 'direct': Direct API calls to MCP tools (default)
|
|
2874
|
-
* - '
|
|
2624
|
+
* - 'mcp_host': LLM-driven tool selection via natural language
|
|
2875
2625
|
*
|
|
2876
2626
|
* @default 'direct'
|
|
2877
2627
|
*/
|
|
2878
2628
|
mode?: EvalMode;
|
|
2879
2629
|
/**
|
|
2880
|
-
* Name of the MCP tool to call (required for 'direct' mode, optional for '
|
|
2630
|
+
* Name of the MCP tool to call (required for 'direct' mode, optional for 'mcp_host' mode)
|
|
2881
2631
|
*/
|
|
2882
2632
|
toolName?: string;
|
|
2883
2633
|
/**
|
|
2884
|
-
* Arguments to pass to the tool (required for 'direct' mode, optional for '
|
|
2634
|
+
* Arguments to pass to the tool (required for 'direct' mode, optional for 'mcp_host' mode)
|
|
2885
2635
|
*/
|
|
2886
2636
|
args?: Record<string, unknown>;
|
|
2887
2637
|
/**
|
|
2888
|
-
* Natural language scenario for LLM to execute (optional, required for '
|
|
2638
|
+
* Natural language scenario for LLM to execute (optional, required for 'mcp_host' mode)
|
|
2889
2639
|
*
|
|
2890
2640
|
* @example "Get the weather for London and tell me if I need an umbrella"
|
|
2891
2641
|
*/
|
|
2892
2642
|
scenario?: string;
|
|
2893
2643
|
/**
|
|
2894
|
-
*
|
|
2644
|
+
* MCP host configuration (optional for 'mcp_host' mode)
|
|
2895
2645
|
*
|
|
2896
2646
|
* If not specified, uses default configuration from test environment
|
|
2897
2647
|
*/
|
|
2898
|
-
|
|
2648
|
+
mcpHostConfig?: MCPHostConfig;
|
|
2899
2649
|
/**
|
|
2900
2650
|
* Additional metadata for this test case
|
|
2901
2651
|
*
|
|
2902
|
-
* For '
|
|
2652
|
+
* For 'mcp_host' mode, can include 'expectedToolCalls' for validation
|
|
2903
2653
|
*/
|
|
2904
2654
|
metadata?: Record<string, unknown>;
|
|
2905
2655
|
/**
|
|
2906
|
-
* Number of times to run this case and compute an
|
|
2656
|
+
* Number of times to run this case and compute an assertion pass rate.
|
|
2907
2657
|
* When > 1, `EvalCaseResult.assertionPassRate` is populated and `pass` is determined
|
|
2908
2658
|
* by `accuracyThreshold` rather than a single run.
|
|
2909
2659
|
* @default 1
|
|
@@ -3035,8 +2785,8 @@ interface EvalExpectBlock {
|
|
|
3035
2785
|
minBytes?: number;
|
|
3036
2786
|
};
|
|
3037
2787
|
/**
|
|
3038
|
-
* Asserts which tools the LLM called during
|
|
3039
|
-
* Only meaningful for
|
|
2788
|
+
* Asserts which tools the LLM called during a mcp_host simulation.
|
|
2789
|
+
* Only meaningful for mcp_host mode — direct mode has no tool call trace.
|
|
3040
2790
|
*/
|
|
3041
2791
|
toolsTriggered?: {
|
|
3042
2792
|
/** Expected tool calls */
|
|
@@ -3057,7 +2807,7 @@ interface EvalExpectBlock {
|
|
|
3057
2807
|
exclusive?: boolean;
|
|
3058
2808
|
};
|
|
3059
2809
|
/**
|
|
3060
|
-
* Asserts the number of tool calls made during
|
|
2810
|
+
* Asserts the number of tool calls made during a mcp_host simulation.
|
|
3061
2811
|
*/
|
|
3062
2812
|
toolCallCount?: {
|
|
3063
2813
|
/** Minimum number of tool calls */
|
|
@@ -3096,399 +2846,109 @@ interface EvalDataset {
|
|
|
3096
2846
|
/**
|
|
3097
2847
|
* Zod schema for EvalCase
|
|
3098
2848
|
*
|
|
3099
|
-
* toolName and args are optional for
|
|
2849
|
+
* toolName and args are optional for mcp_host mode (which uses scenario instead)
|
|
3100
2850
|
*/
|
|
3101
2851
|
declare const EvalCaseSchema: z.ZodObject<{
|
|
3102
2852
|
id: z.ZodString;
|
|
3103
2853
|
description: z.ZodOptional<z.ZodString>;
|
|
3104
|
-
mode: z.ZodOptional<z.ZodEnum<
|
|
2854
|
+
mode: z.ZodOptional<z.ZodEnum<{
|
|
2855
|
+
direct: "direct";
|
|
2856
|
+
mcp_host: "mcp_host";
|
|
2857
|
+
}>>;
|
|
3105
2858
|
toolName: z.ZodOptional<z.ZodString>;
|
|
3106
2859
|
args: z.ZodOptional<z.ZodRecord<z.ZodString, z.ZodUnknown>>;
|
|
3107
2860
|
scenario: z.ZodOptional<z.ZodString>;
|
|
3108
|
-
|
|
3109
|
-
provider: z.ZodEnum<
|
|
2861
|
+
mcpHostConfig: z.ZodOptional<z.ZodObject<{
|
|
2862
|
+
provider: z.ZodEnum<{
|
|
2863
|
+
openai: "openai";
|
|
2864
|
+
anthropic: "anthropic";
|
|
2865
|
+
azure: "azure";
|
|
2866
|
+
google: "google";
|
|
2867
|
+
mistral: "mistral";
|
|
2868
|
+
deepseek: "deepseek";
|
|
2869
|
+
openrouter: "openrouter";
|
|
2870
|
+
xai: "xai";
|
|
2871
|
+
"vertex-anthropic": "vertex-anthropic";
|
|
2872
|
+
}>;
|
|
3110
2873
|
apiKeyEnvVar: z.ZodOptional<z.ZodString>;
|
|
3111
2874
|
model: z.ZodOptional<z.ZodString>;
|
|
3112
2875
|
maxTokens: z.ZodOptional<z.ZodNumber>;
|
|
3113
2876
|
temperature: z.ZodOptional<z.ZodNumber>;
|
|
3114
2877
|
maxToolCalls: z.ZodOptional<z.ZodNumber>;
|
|
3115
|
-
},
|
|
3116
|
-
provider: "openai" | "anthropic" | "azure" | "google" | "mistral" | "deepseek" | "openrouter" | "xai" | "vertex-anthropic";
|
|
3117
|
-
model?: string | undefined;
|
|
3118
|
-
maxTokens?: number | undefined;
|
|
3119
|
-
apiKeyEnvVar?: string | undefined;
|
|
3120
|
-
temperature?: number | undefined;
|
|
3121
|
-
maxToolCalls?: number | undefined;
|
|
3122
|
-
}, {
|
|
3123
|
-
provider: "openai" | "anthropic" | "azure" | "google" | "mistral" | "deepseek" | "openrouter" | "xai" | "vertex-anthropic";
|
|
3124
|
-
model?: string | undefined;
|
|
3125
|
-
maxTokens?: number | undefined;
|
|
3126
|
-
apiKeyEnvVar?: string | undefined;
|
|
3127
|
-
temperature?: number | undefined;
|
|
3128
|
-
maxToolCalls?: number | undefined;
|
|
3129
|
-
}>>;
|
|
2878
|
+
}, z.core.$strip>>;
|
|
3130
2879
|
metadata: z.ZodOptional<z.ZodRecord<z.ZodString, z.ZodUnknown>>;
|
|
3131
2880
|
iterations: z.ZodOptional<z.ZodNumber>;
|
|
3132
2881
|
accuracyThreshold: z.ZodOptional<z.ZodNumber>;
|
|
3133
2882
|
judgeReps: z.ZodOptional<z.ZodNumber>;
|
|
3134
2883
|
canonicalAnswer: z.ZodOptional<z.ZodString>;
|
|
3135
|
-
tags: z.ZodOptional<z.ZodArray<z.ZodString
|
|
2884
|
+
tags: z.ZodOptional<z.ZodArray<z.ZodString>>;
|
|
3136
2885
|
expect: z.ZodOptional<z.ZodObject<{
|
|
3137
2886
|
response: z.ZodOptional<z.ZodUnknown>;
|
|
3138
2887
|
schema: z.ZodOptional<z.ZodString>;
|
|
3139
|
-
containsText: z.ZodOptional<z.ZodUnion<[z.ZodString, z.ZodArray<z.ZodString
|
|
3140
|
-
matchesPattern: z.ZodOptional<z.ZodUnion<[z.ZodString, z.ZodArray<z.ZodString
|
|
2888
|
+
containsText: z.ZodOptional<z.ZodUnion<readonly [z.ZodString, z.ZodArray<z.ZodString>]>>;
|
|
2889
|
+
matchesPattern: z.ZodOptional<z.ZodUnion<readonly [z.ZodString, z.ZodArray<z.ZodString>]>>;
|
|
3141
2890
|
snapshot: z.ZodOptional<z.ZodString>;
|
|
3142
|
-
snapshotSanitizers: z.ZodOptional<z.ZodArray<z.ZodUnion<[z.ZodEnum<
|
|
2891
|
+
snapshotSanitizers: z.ZodOptional<z.ZodArray<z.ZodUnion<readonly [z.ZodEnum<{
|
|
2892
|
+
timestamp: "timestamp";
|
|
2893
|
+
uuid: "uuid";
|
|
2894
|
+
"iso-date": "iso-date";
|
|
2895
|
+
objectId: "objectId";
|
|
2896
|
+
jwt: "jwt";
|
|
2897
|
+
}>, z.ZodObject<{
|
|
3143
2898
|
pattern: z.ZodString;
|
|
3144
2899
|
replacement: z.ZodOptional<z.ZodString>;
|
|
3145
|
-
},
|
|
3146
|
-
|
|
3147
|
-
|
|
3148
|
-
|
|
3149
|
-
pattern: string;
|
|
3150
|
-
replacement?: string | undefined;
|
|
3151
|
-
}>, z.ZodObject<{
|
|
3152
|
-
remove: z.ZodArray<z.ZodString, "many">;
|
|
3153
|
-
}, "strip", z.ZodTypeAny, {
|
|
3154
|
-
remove: string[];
|
|
3155
|
-
}, {
|
|
3156
|
-
remove: string[];
|
|
3157
|
-
}>]>, "many">>;
|
|
3158
|
-
isError: z.ZodOptional<z.ZodUnion<[z.ZodBoolean, z.ZodString, z.ZodArray<z.ZodString, "many">]>>;
|
|
2900
|
+
}, z.core.$strip>, z.ZodObject<{
|
|
2901
|
+
remove: z.ZodArray<z.ZodString>;
|
|
2902
|
+
}, z.core.$strip>]>>>;
|
|
2903
|
+
isError: z.ZodOptional<z.ZodUnion<readonly [z.ZodBoolean, z.ZodString, z.ZodArray<z.ZodString>]>>;
|
|
3159
2904
|
passesJudge: z.ZodOptional<z.ZodObject<{
|
|
3160
|
-
rubric: z.ZodUnion<[z.ZodEnum<
|
|
2905
|
+
rubric: z.ZodUnion<readonly [z.ZodEnum<{
|
|
2906
|
+
correctness: "correctness";
|
|
2907
|
+
completeness: "completeness";
|
|
2908
|
+
groundedness: "groundedness";
|
|
2909
|
+
"instruction-following": "instruction-following";
|
|
2910
|
+
conciseness: "conciseness";
|
|
2911
|
+
}>, z.ZodObject<{
|
|
3161
2912
|
text: z.ZodString;
|
|
3162
|
-
},
|
|
3163
|
-
text: string;
|
|
3164
|
-
}, {
|
|
3165
|
-
text: string;
|
|
3166
|
-
}>]>;
|
|
2913
|
+
}, z.core.$strip>]>;
|
|
3167
2914
|
reference: z.ZodOptional<z.ZodUnknown>;
|
|
3168
2915
|
threshold: z.ZodOptional<z.ZodNumber>;
|
|
3169
2916
|
reps: z.ZodOptional<z.ZodNumber>;
|
|
3170
|
-
provider: z.ZodOptional<z.ZodEnum<
|
|
2917
|
+
provider: z.ZodOptional<z.ZodEnum<{
|
|
2918
|
+
openai: "openai";
|
|
2919
|
+
anthropic: "anthropic";
|
|
2920
|
+
google: "google";
|
|
2921
|
+
}>>;
|
|
3171
2922
|
model: z.ZodOptional<z.ZodString>;
|
|
3172
2923
|
apiKeyEnvVar: z.ZodOptional<z.ZodString>;
|
|
3173
2924
|
maxTokens: z.ZodOptional<z.ZodNumber>;
|
|
3174
2925
|
temperature: z.ZodOptional<z.ZodNumber>;
|
|
3175
2926
|
maxBudgetUsd: z.ZodOptional<z.ZodNumber>;
|
|
3176
2927
|
maxToolOutputSize: z.ZodOptional<z.ZodNumber>;
|
|
3177
|
-
},
|
|
3178
|
-
rubric: "correctness" | "completeness" | "groundedness" | "instruction-following" | "conciseness" | {
|
|
3179
|
-
text: string;
|
|
3180
|
-
};
|
|
3181
|
-
model?: string | undefined;
|
|
3182
|
-
maxTokens?: number | undefined;
|
|
3183
|
-
maxBudgetUsd?: number | undefined;
|
|
3184
|
-
reference?: unknown;
|
|
3185
|
-
threshold?: number | undefined;
|
|
3186
|
-
reps?: number | undefined;
|
|
3187
|
-
provider?: "openai" | "anthropic" | "google" | undefined;
|
|
3188
|
-
apiKeyEnvVar?: string | undefined;
|
|
3189
|
-
temperature?: number | undefined;
|
|
3190
|
-
maxToolOutputSize?: number | undefined;
|
|
3191
|
-
}, {
|
|
3192
|
-
rubric: "correctness" | "completeness" | "groundedness" | "instruction-following" | "conciseness" | {
|
|
3193
|
-
text: string;
|
|
3194
|
-
};
|
|
3195
|
-
model?: string | undefined;
|
|
3196
|
-
maxTokens?: number | undefined;
|
|
3197
|
-
maxBudgetUsd?: number | undefined;
|
|
3198
|
-
reference?: unknown;
|
|
3199
|
-
threshold?: number | undefined;
|
|
3200
|
-
reps?: number | undefined;
|
|
3201
|
-
provider?: "openai" | "anthropic" | "google" | undefined;
|
|
3202
|
-
apiKeyEnvVar?: string | undefined;
|
|
3203
|
-
temperature?: number | undefined;
|
|
3204
|
-
maxToolOutputSize?: number | undefined;
|
|
3205
|
-
}>>;
|
|
2928
|
+
}, z.core.$strip>>;
|
|
3206
2929
|
responseSize: z.ZodOptional<z.ZodObject<{
|
|
3207
2930
|
maxBytes: z.ZodOptional<z.ZodNumber>;
|
|
3208
2931
|
minBytes: z.ZodOptional<z.ZodNumber>;
|
|
3209
|
-
},
|
|
3210
|
-
maxBytes?: number | undefined;
|
|
3211
|
-
minBytes?: number | undefined;
|
|
3212
|
-
}, {
|
|
3213
|
-
maxBytes?: number | undefined;
|
|
3214
|
-
minBytes?: number | undefined;
|
|
3215
|
-
}>>;
|
|
2932
|
+
}, z.core.$strip>>;
|
|
3216
2933
|
toolsTriggered: z.ZodOptional<z.ZodObject<{
|
|
3217
2934
|
calls: z.ZodArray<z.ZodObject<{
|
|
3218
2935
|
name: z.ZodString;
|
|
3219
2936
|
arguments: z.ZodOptional<z.ZodRecord<z.ZodString, z.ZodUnknown>>;
|
|
3220
2937
|
required: z.ZodOptional<z.ZodBoolean>;
|
|
3221
|
-
},
|
|
3222
|
-
|
|
3223
|
-
|
|
3224
|
-
|
|
3225
|
-
}
|
|
3226
|
-
name: string;
|
|
3227
|
-
required?: boolean | undefined;
|
|
3228
|
-
arguments?: Record<string, unknown> | undefined;
|
|
3229
|
-
}>, "many">;
|
|
3230
|
-
order: z.ZodOptional<z.ZodEnum<["strict", "any"]>>;
|
|
2938
|
+
}, z.core.$strip>>;
|
|
2939
|
+
order: z.ZodOptional<z.ZodEnum<{
|
|
2940
|
+
any: "any";
|
|
2941
|
+
strict: "strict";
|
|
2942
|
+
}>>;
|
|
3231
2943
|
exclusive: z.ZodOptional<z.ZodBoolean>;
|
|
3232
|
-
},
|
|
3233
|
-
calls: {
|
|
3234
|
-
name: string;
|
|
3235
|
-
required?: boolean | undefined;
|
|
3236
|
-
arguments?: Record<string, unknown> | undefined;
|
|
3237
|
-
}[];
|
|
3238
|
-
order?: "strict" | "any" | undefined;
|
|
3239
|
-
exclusive?: boolean | undefined;
|
|
3240
|
-
}, {
|
|
3241
|
-
calls: {
|
|
3242
|
-
name: string;
|
|
3243
|
-
required?: boolean | undefined;
|
|
3244
|
-
arguments?: Record<string, unknown> | undefined;
|
|
3245
|
-
}[];
|
|
3246
|
-
order?: "strict" | "any" | undefined;
|
|
3247
|
-
exclusive?: boolean | undefined;
|
|
3248
|
-
}>>;
|
|
2944
|
+
}, z.core.$strip>>;
|
|
3249
2945
|
toolCallCount: z.ZodOptional<z.ZodObject<{
|
|
3250
2946
|
min: z.ZodOptional<z.ZodNumber>;
|
|
3251
2947
|
max: z.ZodOptional<z.ZodNumber>;
|
|
3252
2948
|
exact: z.ZodOptional<z.ZodNumber>;
|
|
3253
|
-
},
|
|
3254
|
-
|
|
3255
|
-
|
|
3256
|
-
max?: number | undefined;
|
|
3257
|
-
}, {
|
|
3258
|
-
exact?: number | undefined;
|
|
3259
|
-
min?: number | undefined;
|
|
3260
|
-
max?: number | undefined;
|
|
3261
|
-
}>>;
|
|
3262
|
-
}, "strip", z.ZodTypeAny, {
|
|
3263
|
-
response?: unknown;
|
|
3264
|
-
isError?: string | boolean | string[] | undefined;
|
|
3265
|
-
schema?: string | undefined;
|
|
3266
|
-
snapshot?: string | undefined;
|
|
3267
|
-
toolsTriggered?: {
|
|
3268
|
-
calls: {
|
|
3269
|
-
name: string;
|
|
3270
|
-
required?: boolean | undefined;
|
|
3271
|
-
arguments?: Record<string, unknown> | undefined;
|
|
3272
|
-
}[];
|
|
3273
|
-
order?: "strict" | "any" | undefined;
|
|
3274
|
-
exclusive?: boolean | undefined;
|
|
3275
|
-
} | undefined;
|
|
3276
|
-
toolCallCount?: {
|
|
3277
|
-
exact?: number | undefined;
|
|
3278
|
-
min?: number | undefined;
|
|
3279
|
-
max?: number | undefined;
|
|
3280
|
-
} | undefined;
|
|
3281
|
-
containsText?: string | string[] | undefined;
|
|
3282
|
-
matchesPattern?: string | string[] | undefined;
|
|
3283
|
-
snapshotSanitizers?: ("uuid" | "jwt" | "timestamp" | "iso-date" | "objectId" | {
|
|
3284
|
-
pattern: string;
|
|
3285
|
-
replacement?: string | undefined;
|
|
3286
|
-
} | {
|
|
3287
|
-
remove: string[];
|
|
3288
|
-
})[] | undefined;
|
|
3289
|
-
passesJudge?: {
|
|
3290
|
-
rubric: "correctness" | "completeness" | "groundedness" | "instruction-following" | "conciseness" | {
|
|
3291
|
-
text: string;
|
|
3292
|
-
};
|
|
3293
|
-
model?: string | undefined;
|
|
3294
|
-
maxTokens?: number | undefined;
|
|
3295
|
-
maxBudgetUsd?: number | undefined;
|
|
3296
|
-
reference?: unknown;
|
|
3297
|
-
threshold?: number | undefined;
|
|
3298
|
-
reps?: number | undefined;
|
|
3299
|
-
provider?: "openai" | "anthropic" | "google" | undefined;
|
|
3300
|
-
apiKeyEnvVar?: string | undefined;
|
|
3301
|
-
temperature?: number | undefined;
|
|
3302
|
-
maxToolOutputSize?: number | undefined;
|
|
3303
|
-
} | undefined;
|
|
3304
|
-
responseSize?: {
|
|
3305
|
-
maxBytes?: number | undefined;
|
|
3306
|
-
minBytes?: number | undefined;
|
|
3307
|
-
} | undefined;
|
|
3308
|
-
}, {
|
|
3309
|
-
response?: unknown;
|
|
3310
|
-
isError?: string | boolean | string[] | undefined;
|
|
3311
|
-
schema?: string | undefined;
|
|
3312
|
-
snapshot?: string | undefined;
|
|
3313
|
-
toolsTriggered?: {
|
|
3314
|
-
calls: {
|
|
3315
|
-
name: string;
|
|
3316
|
-
required?: boolean | undefined;
|
|
3317
|
-
arguments?: Record<string, unknown> | undefined;
|
|
3318
|
-
}[];
|
|
3319
|
-
order?: "strict" | "any" | undefined;
|
|
3320
|
-
exclusive?: boolean | undefined;
|
|
3321
|
-
} | undefined;
|
|
3322
|
-
toolCallCount?: {
|
|
3323
|
-
exact?: number | undefined;
|
|
3324
|
-
min?: number | undefined;
|
|
3325
|
-
max?: number | undefined;
|
|
3326
|
-
} | undefined;
|
|
3327
|
-
containsText?: string | string[] | undefined;
|
|
3328
|
-
matchesPattern?: string | string[] | undefined;
|
|
3329
|
-
snapshotSanitizers?: ("uuid" | "jwt" | "timestamp" | "iso-date" | "objectId" | {
|
|
3330
|
-
pattern: string;
|
|
3331
|
-
replacement?: string | undefined;
|
|
3332
|
-
} | {
|
|
3333
|
-
remove: string[];
|
|
3334
|
-
})[] | undefined;
|
|
3335
|
-
passesJudge?: {
|
|
3336
|
-
rubric: "correctness" | "completeness" | "groundedness" | "instruction-following" | "conciseness" | {
|
|
3337
|
-
text: string;
|
|
3338
|
-
};
|
|
3339
|
-
model?: string | undefined;
|
|
3340
|
-
maxTokens?: number | undefined;
|
|
3341
|
-
maxBudgetUsd?: number | undefined;
|
|
3342
|
-
reference?: unknown;
|
|
3343
|
-
threshold?: number | undefined;
|
|
3344
|
-
reps?: number | undefined;
|
|
3345
|
-
provider?: "openai" | "anthropic" | "google" | undefined;
|
|
3346
|
-
apiKeyEnvVar?: string | undefined;
|
|
3347
|
-
temperature?: number | undefined;
|
|
3348
|
-
maxToolOutputSize?: number | undefined;
|
|
3349
|
-
} | undefined;
|
|
3350
|
-
responseSize?: {
|
|
3351
|
-
maxBytes?: number | undefined;
|
|
3352
|
-
minBytes?: number | undefined;
|
|
3353
|
-
} | undefined;
|
|
3354
|
-
}>>;
|
|
3355
|
-
}, "strip", z.ZodTypeAny, {
|
|
3356
|
-
id: string;
|
|
3357
|
-
args?: Record<string, unknown> | undefined;
|
|
3358
|
-
mode?: "direct" | "llm_host" | undefined;
|
|
3359
|
-
metadata?: Record<string, unknown> | undefined;
|
|
3360
|
-
description?: string | undefined;
|
|
3361
|
-
toolName?: string | undefined;
|
|
3362
|
-
scenario?: string | undefined;
|
|
3363
|
-
llmHostConfig?: {
|
|
3364
|
-
provider: "openai" | "anthropic" | "azure" | "google" | "mistral" | "deepseek" | "openrouter" | "xai" | "vertex-anthropic";
|
|
3365
|
-
model?: string | undefined;
|
|
3366
|
-
maxTokens?: number | undefined;
|
|
3367
|
-
apiKeyEnvVar?: string | undefined;
|
|
3368
|
-
temperature?: number | undefined;
|
|
3369
|
-
maxToolCalls?: number | undefined;
|
|
3370
|
-
} | undefined;
|
|
3371
|
-
iterations?: number | undefined;
|
|
3372
|
-
accuracyThreshold?: number | undefined;
|
|
3373
|
-
judgeReps?: number | undefined;
|
|
3374
|
-
canonicalAnswer?: string | undefined;
|
|
3375
|
-
tags?: string[] | undefined;
|
|
3376
|
-
expect?: {
|
|
3377
|
-
response?: unknown;
|
|
3378
|
-
isError?: string | boolean | string[] | undefined;
|
|
3379
|
-
schema?: string | undefined;
|
|
3380
|
-
snapshot?: string | undefined;
|
|
3381
|
-
toolsTriggered?: {
|
|
3382
|
-
calls: {
|
|
3383
|
-
name: string;
|
|
3384
|
-
required?: boolean | undefined;
|
|
3385
|
-
arguments?: Record<string, unknown> | undefined;
|
|
3386
|
-
}[];
|
|
3387
|
-
order?: "strict" | "any" | undefined;
|
|
3388
|
-
exclusive?: boolean | undefined;
|
|
3389
|
-
} | undefined;
|
|
3390
|
-
toolCallCount?: {
|
|
3391
|
-
exact?: number | undefined;
|
|
3392
|
-
min?: number | undefined;
|
|
3393
|
-
max?: number | undefined;
|
|
3394
|
-
} | undefined;
|
|
3395
|
-
containsText?: string | string[] | undefined;
|
|
3396
|
-
matchesPattern?: string | string[] | undefined;
|
|
3397
|
-
snapshotSanitizers?: ("uuid" | "jwt" | "timestamp" | "iso-date" | "objectId" | {
|
|
3398
|
-
pattern: string;
|
|
3399
|
-
replacement?: string | undefined;
|
|
3400
|
-
} | {
|
|
3401
|
-
remove: string[];
|
|
3402
|
-
})[] | undefined;
|
|
3403
|
-
passesJudge?: {
|
|
3404
|
-
rubric: "correctness" | "completeness" | "groundedness" | "instruction-following" | "conciseness" | {
|
|
3405
|
-
text: string;
|
|
3406
|
-
};
|
|
3407
|
-
model?: string | undefined;
|
|
3408
|
-
maxTokens?: number | undefined;
|
|
3409
|
-
maxBudgetUsd?: number | undefined;
|
|
3410
|
-
reference?: unknown;
|
|
3411
|
-
threshold?: number | undefined;
|
|
3412
|
-
reps?: number | undefined;
|
|
3413
|
-
provider?: "openai" | "anthropic" | "google" | undefined;
|
|
3414
|
-
apiKeyEnvVar?: string | undefined;
|
|
3415
|
-
temperature?: number | undefined;
|
|
3416
|
-
maxToolOutputSize?: number | undefined;
|
|
3417
|
-
} | undefined;
|
|
3418
|
-
responseSize?: {
|
|
3419
|
-
maxBytes?: number | undefined;
|
|
3420
|
-
minBytes?: number | undefined;
|
|
3421
|
-
} | undefined;
|
|
3422
|
-
} | undefined;
|
|
3423
|
-
}, {
|
|
3424
|
-
id: string;
|
|
3425
|
-
args?: Record<string, unknown> | undefined;
|
|
3426
|
-
mode?: "direct" | "llm_host" | undefined;
|
|
3427
|
-
metadata?: Record<string, unknown> | undefined;
|
|
3428
|
-
description?: string | undefined;
|
|
3429
|
-
toolName?: string | undefined;
|
|
3430
|
-
scenario?: string | undefined;
|
|
3431
|
-
llmHostConfig?: {
|
|
3432
|
-
provider: "openai" | "anthropic" | "azure" | "google" | "mistral" | "deepseek" | "openrouter" | "xai" | "vertex-anthropic";
|
|
3433
|
-
model?: string | undefined;
|
|
3434
|
-
maxTokens?: number | undefined;
|
|
3435
|
-
apiKeyEnvVar?: string | undefined;
|
|
3436
|
-
temperature?: number | undefined;
|
|
3437
|
-
maxToolCalls?: number | undefined;
|
|
3438
|
-
} | undefined;
|
|
3439
|
-
iterations?: number | undefined;
|
|
3440
|
-
accuracyThreshold?: number | undefined;
|
|
3441
|
-
judgeReps?: number | undefined;
|
|
3442
|
-
canonicalAnswer?: string | undefined;
|
|
3443
|
-
tags?: string[] | undefined;
|
|
3444
|
-
expect?: {
|
|
3445
|
-
response?: unknown;
|
|
3446
|
-
isError?: string | boolean | string[] | undefined;
|
|
3447
|
-
schema?: string | undefined;
|
|
3448
|
-
snapshot?: string | undefined;
|
|
3449
|
-
toolsTriggered?: {
|
|
3450
|
-
calls: {
|
|
3451
|
-
name: string;
|
|
3452
|
-
required?: boolean | undefined;
|
|
3453
|
-
arguments?: Record<string, unknown> | undefined;
|
|
3454
|
-
}[];
|
|
3455
|
-
order?: "strict" | "any" | undefined;
|
|
3456
|
-
exclusive?: boolean | undefined;
|
|
3457
|
-
} | undefined;
|
|
3458
|
-
toolCallCount?: {
|
|
3459
|
-
exact?: number | undefined;
|
|
3460
|
-
min?: number | undefined;
|
|
3461
|
-
max?: number | undefined;
|
|
3462
|
-
} | undefined;
|
|
3463
|
-
containsText?: string | string[] | undefined;
|
|
3464
|
-
matchesPattern?: string | string[] | undefined;
|
|
3465
|
-
snapshotSanitizers?: ("uuid" | "jwt" | "timestamp" | "iso-date" | "objectId" | {
|
|
3466
|
-
pattern: string;
|
|
3467
|
-
replacement?: string | undefined;
|
|
3468
|
-
} | {
|
|
3469
|
-
remove: string[];
|
|
3470
|
-
})[] | undefined;
|
|
3471
|
-
passesJudge?: {
|
|
3472
|
-
rubric: "correctness" | "completeness" | "groundedness" | "instruction-following" | "conciseness" | {
|
|
3473
|
-
text: string;
|
|
3474
|
-
};
|
|
3475
|
-
model?: string | undefined;
|
|
3476
|
-
maxTokens?: number | undefined;
|
|
3477
|
-
maxBudgetUsd?: number | undefined;
|
|
3478
|
-
reference?: unknown;
|
|
3479
|
-
threshold?: number | undefined;
|
|
3480
|
-
reps?: number | undefined;
|
|
3481
|
-
provider?: "openai" | "anthropic" | "google" | undefined;
|
|
3482
|
-
apiKeyEnvVar?: string | undefined;
|
|
3483
|
-
temperature?: number | undefined;
|
|
3484
|
-
maxToolOutputSize?: number | undefined;
|
|
3485
|
-
} | undefined;
|
|
3486
|
-
responseSize?: {
|
|
3487
|
-
maxBytes?: number | undefined;
|
|
3488
|
-
minBytes?: number | undefined;
|
|
3489
|
-
} | undefined;
|
|
3490
|
-
} | undefined;
|
|
3491
|
-
}>;
|
|
2949
|
+
}, z.core.$strip>>;
|
|
2950
|
+
}, z.core.$strip>>;
|
|
2951
|
+
}, z.core.$strip>;
|
|
3492
2952
|
/**
|
|
3493
2953
|
* Zod schema for EvalDataset (without schemas field, as schemas aren't serializable)
|
|
3494
2954
|
*/
|
|
@@ -3498,542 +2958,106 @@ declare const EvalDatasetSchema: z.ZodObject<{
|
|
|
3498
2958
|
cases: z.ZodArray<z.ZodObject<{
|
|
3499
2959
|
id: z.ZodString;
|
|
3500
2960
|
description: z.ZodOptional<z.ZodString>;
|
|
3501
|
-
mode: z.ZodOptional<z.ZodEnum<
|
|
2961
|
+
mode: z.ZodOptional<z.ZodEnum<{
|
|
2962
|
+
direct: "direct";
|
|
2963
|
+
mcp_host: "mcp_host";
|
|
2964
|
+
}>>;
|
|
3502
2965
|
toolName: z.ZodOptional<z.ZodString>;
|
|
3503
2966
|
args: z.ZodOptional<z.ZodRecord<z.ZodString, z.ZodUnknown>>;
|
|
3504
2967
|
scenario: z.ZodOptional<z.ZodString>;
|
|
3505
|
-
|
|
3506
|
-
provider: z.ZodEnum<
|
|
2968
|
+
mcpHostConfig: z.ZodOptional<z.ZodObject<{
|
|
2969
|
+
provider: z.ZodEnum<{
|
|
2970
|
+
openai: "openai";
|
|
2971
|
+
anthropic: "anthropic";
|
|
2972
|
+
azure: "azure";
|
|
2973
|
+
google: "google";
|
|
2974
|
+
mistral: "mistral";
|
|
2975
|
+
deepseek: "deepseek";
|
|
2976
|
+
openrouter: "openrouter";
|
|
2977
|
+
xai: "xai";
|
|
2978
|
+
"vertex-anthropic": "vertex-anthropic";
|
|
2979
|
+
}>;
|
|
3507
2980
|
apiKeyEnvVar: z.ZodOptional<z.ZodString>;
|
|
3508
2981
|
model: z.ZodOptional<z.ZodString>;
|
|
3509
2982
|
maxTokens: z.ZodOptional<z.ZodNumber>;
|
|
3510
2983
|
temperature: z.ZodOptional<z.ZodNumber>;
|
|
3511
2984
|
maxToolCalls: z.ZodOptional<z.ZodNumber>;
|
|
3512
|
-
},
|
|
3513
|
-
provider: "openai" | "anthropic" | "azure" | "google" | "mistral" | "deepseek" | "openrouter" | "xai" | "vertex-anthropic";
|
|
3514
|
-
model?: string | undefined;
|
|
3515
|
-
maxTokens?: number | undefined;
|
|
3516
|
-
apiKeyEnvVar?: string | undefined;
|
|
3517
|
-
temperature?: number | undefined;
|
|
3518
|
-
maxToolCalls?: number | undefined;
|
|
3519
|
-
}, {
|
|
3520
|
-
provider: "openai" | "anthropic" | "azure" | "google" | "mistral" | "deepseek" | "openrouter" | "xai" | "vertex-anthropic";
|
|
3521
|
-
model?: string | undefined;
|
|
3522
|
-
maxTokens?: number | undefined;
|
|
3523
|
-
apiKeyEnvVar?: string | undefined;
|
|
3524
|
-
temperature?: number | undefined;
|
|
3525
|
-
maxToolCalls?: number | undefined;
|
|
3526
|
-
}>>;
|
|
2985
|
+
}, z.core.$strip>>;
|
|
3527
2986
|
metadata: z.ZodOptional<z.ZodRecord<z.ZodString, z.ZodUnknown>>;
|
|
3528
2987
|
iterations: z.ZodOptional<z.ZodNumber>;
|
|
3529
2988
|
accuracyThreshold: z.ZodOptional<z.ZodNumber>;
|
|
3530
2989
|
judgeReps: z.ZodOptional<z.ZodNumber>;
|
|
3531
2990
|
canonicalAnswer: z.ZodOptional<z.ZodString>;
|
|
3532
|
-
tags: z.ZodOptional<z.ZodArray<z.ZodString
|
|
2991
|
+
tags: z.ZodOptional<z.ZodArray<z.ZodString>>;
|
|
3533
2992
|
expect: z.ZodOptional<z.ZodObject<{
|
|
3534
2993
|
response: z.ZodOptional<z.ZodUnknown>;
|
|
3535
2994
|
schema: z.ZodOptional<z.ZodString>;
|
|
3536
|
-
containsText: z.ZodOptional<z.ZodUnion<[z.ZodString, z.ZodArray<z.ZodString
|
|
3537
|
-
matchesPattern: z.ZodOptional<z.ZodUnion<[z.ZodString, z.ZodArray<z.ZodString
|
|
2995
|
+
containsText: z.ZodOptional<z.ZodUnion<readonly [z.ZodString, z.ZodArray<z.ZodString>]>>;
|
|
2996
|
+
matchesPattern: z.ZodOptional<z.ZodUnion<readonly [z.ZodString, z.ZodArray<z.ZodString>]>>;
|
|
3538
2997
|
snapshot: z.ZodOptional<z.ZodString>;
|
|
3539
|
-
snapshotSanitizers: z.ZodOptional<z.ZodArray<z.ZodUnion<[z.ZodEnum<
|
|
2998
|
+
snapshotSanitizers: z.ZodOptional<z.ZodArray<z.ZodUnion<readonly [z.ZodEnum<{
|
|
2999
|
+
timestamp: "timestamp";
|
|
3000
|
+
uuid: "uuid";
|
|
3001
|
+
"iso-date": "iso-date";
|
|
3002
|
+
objectId: "objectId";
|
|
3003
|
+
jwt: "jwt";
|
|
3004
|
+
}>, z.ZodObject<{
|
|
3540
3005
|
pattern: z.ZodString;
|
|
3541
3006
|
replacement: z.ZodOptional<z.ZodString>;
|
|
3542
|
-
},
|
|
3543
|
-
|
|
3544
|
-
|
|
3545
|
-
|
|
3546
|
-
pattern: string;
|
|
3547
|
-
replacement?: string | undefined;
|
|
3548
|
-
}>, z.ZodObject<{
|
|
3549
|
-
remove: z.ZodArray<z.ZodString, "many">;
|
|
3550
|
-
}, "strip", z.ZodTypeAny, {
|
|
3551
|
-
remove: string[];
|
|
3552
|
-
}, {
|
|
3553
|
-
remove: string[];
|
|
3554
|
-
}>]>, "many">>;
|
|
3555
|
-
isError: z.ZodOptional<z.ZodUnion<[z.ZodBoolean, z.ZodString, z.ZodArray<z.ZodString, "many">]>>;
|
|
3007
|
+
}, z.core.$strip>, z.ZodObject<{
|
|
3008
|
+
remove: z.ZodArray<z.ZodString>;
|
|
3009
|
+
}, z.core.$strip>]>>>;
|
|
3010
|
+
isError: z.ZodOptional<z.ZodUnion<readonly [z.ZodBoolean, z.ZodString, z.ZodArray<z.ZodString>]>>;
|
|
3556
3011
|
passesJudge: z.ZodOptional<z.ZodObject<{
|
|
3557
|
-
rubric: z.ZodUnion<[z.ZodEnum<
|
|
3012
|
+
rubric: z.ZodUnion<readonly [z.ZodEnum<{
|
|
3013
|
+
correctness: "correctness";
|
|
3014
|
+
completeness: "completeness";
|
|
3015
|
+
groundedness: "groundedness";
|
|
3016
|
+
"instruction-following": "instruction-following";
|
|
3017
|
+
conciseness: "conciseness";
|
|
3018
|
+
}>, z.ZodObject<{
|
|
3558
3019
|
text: z.ZodString;
|
|
3559
|
-
},
|
|
3560
|
-
text: string;
|
|
3561
|
-
}, {
|
|
3562
|
-
text: string;
|
|
3563
|
-
}>]>;
|
|
3020
|
+
}, z.core.$strip>]>;
|
|
3564
3021
|
reference: z.ZodOptional<z.ZodUnknown>;
|
|
3565
3022
|
threshold: z.ZodOptional<z.ZodNumber>;
|
|
3566
3023
|
reps: z.ZodOptional<z.ZodNumber>;
|
|
3567
|
-
provider: z.ZodOptional<z.ZodEnum<
|
|
3024
|
+
provider: z.ZodOptional<z.ZodEnum<{
|
|
3025
|
+
openai: "openai";
|
|
3026
|
+
anthropic: "anthropic";
|
|
3027
|
+
google: "google";
|
|
3028
|
+
}>>;
|
|
3568
3029
|
model: z.ZodOptional<z.ZodString>;
|
|
3569
3030
|
apiKeyEnvVar: z.ZodOptional<z.ZodString>;
|
|
3570
3031
|
maxTokens: z.ZodOptional<z.ZodNumber>;
|
|
3571
3032
|
temperature: z.ZodOptional<z.ZodNumber>;
|
|
3572
3033
|
maxBudgetUsd: z.ZodOptional<z.ZodNumber>;
|
|
3573
3034
|
maxToolOutputSize: z.ZodOptional<z.ZodNumber>;
|
|
3574
|
-
},
|
|
3575
|
-
rubric: "correctness" | "completeness" | "groundedness" | "instruction-following" | "conciseness" | {
|
|
3576
|
-
text: string;
|
|
3577
|
-
};
|
|
3578
|
-
model?: string | undefined;
|
|
3579
|
-
maxTokens?: number | undefined;
|
|
3580
|
-
maxBudgetUsd?: number | undefined;
|
|
3581
|
-
reference?: unknown;
|
|
3582
|
-
threshold?: number | undefined;
|
|
3583
|
-
reps?: number | undefined;
|
|
3584
|
-
provider?: "openai" | "anthropic" | "google" | undefined;
|
|
3585
|
-
apiKeyEnvVar?: string | undefined;
|
|
3586
|
-
temperature?: number | undefined;
|
|
3587
|
-
maxToolOutputSize?: number | undefined;
|
|
3588
|
-
}, {
|
|
3589
|
-
rubric: "correctness" | "completeness" | "groundedness" | "instruction-following" | "conciseness" | {
|
|
3590
|
-
text: string;
|
|
3591
|
-
};
|
|
3592
|
-
model?: string | undefined;
|
|
3593
|
-
maxTokens?: number | undefined;
|
|
3594
|
-
maxBudgetUsd?: number | undefined;
|
|
3595
|
-
reference?: unknown;
|
|
3596
|
-
threshold?: number | undefined;
|
|
3597
|
-
reps?: number | undefined;
|
|
3598
|
-
provider?: "openai" | "anthropic" | "google" | undefined;
|
|
3599
|
-
apiKeyEnvVar?: string | undefined;
|
|
3600
|
-
temperature?: number | undefined;
|
|
3601
|
-
maxToolOutputSize?: number | undefined;
|
|
3602
|
-
}>>;
|
|
3035
|
+
}, z.core.$strip>>;
|
|
3603
3036
|
responseSize: z.ZodOptional<z.ZodObject<{
|
|
3604
3037
|
maxBytes: z.ZodOptional<z.ZodNumber>;
|
|
3605
3038
|
minBytes: z.ZodOptional<z.ZodNumber>;
|
|
3606
|
-
},
|
|
3607
|
-
maxBytes?: number | undefined;
|
|
3608
|
-
minBytes?: number | undefined;
|
|
3609
|
-
}, {
|
|
3610
|
-
maxBytes?: number | undefined;
|
|
3611
|
-
minBytes?: number | undefined;
|
|
3612
|
-
}>>;
|
|
3039
|
+
}, z.core.$strip>>;
|
|
3613
3040
|
toolsTriggered: z.ZodOptional<z.ZodObject<{
|
|
3614
3041
|
calls: z.ZodArray<z.ZodObject<{
|
|
3615
3042
|
name: z.ZodString;
|
|
3616
3043
|
arguments: z.ZodOptional<z.ZodRecord<z.ZodString, z.ZodUnknown>>;
|
|
3617
3044
|
required: z.ZodOptional<z.ZodBoolean>;
|
|
3618
|
-
},
|
|
3619
|
-
|
|
3620
|
-
|
|
3621
|
-
|
|
3622
|
-
}
|
|
3623
|
-
name: string;
|
|
3624
|
-
required?: boolean | undefined;
|
|
3625
|
-
arguments?: Record<string, unknown> | undefined;
|
|
3626
|
-
}>, "many">;
|
|
3627
|
-
order: z.ZodOptional<z.ZodEnum<["strict", "any"]>>;
|
|
3045
|
+
}, z.core.$strip>>;
|
|
3046
|
+
order: z.ZodOptional<z.ZodEnum<{
|
|
3047
|
+
any: "any";
|
|
3048
|
+
strict: "strict";
|
|
3049
|
+
}>>;
|
|
3628
3050
|
exclusive: z.ZodOptional<z.ZodBoolean>;
|
|
3629
|
-
},
|
|
3630
|
-
calls: {
|
|
3631
|
-
name: string;
|
|
3632
|
-
required?: boolean | undefined;
|
|
3633
|
-
arguments?: Record<string, unknown> | undefined;
|
|
3634
|
-
}[];
|
|
3635
|
-
order?: "strict" | "any" | undefined;
|
|
3636
|
-
exclusive?: boolean | undefined;
|
|
3637
|
-
}, {
|
|
3638
|
-
calls: {
|
|
3639
|
-
name: string;
|
|
3640
|
-
required?: boolean | undefined;
|
|
3641
|
-
arguments?: Record<string, unknown> | undefined;
|
|
3642
|
-
}[];
|
|
3643
|
-
order?: "strict" | "any" | undefined;
|
|
3644
|
-
exclusive?: boolean | undefined;
|
|
3645
|
-
}>>;
|
|
3051
|
+
}, z.core.$strip>>;
|
|
3646
3052
|
toolCallCount: z.ZodOptional<z.ZodObject<{
|
|
3647
3053
|
min: z.ZodOptional<z.ZodNumber>;
|
|
3648
3054
|
max: z.ZodOptional<z.ZodNumber>;
|
|
3649
3055
|
exact: z.ZodOptional<z.ZodNumber>;
|
|
3650
|
-
},
|
|
3651
|
-
|
|
3652
|
-
|
|
3653
|
-
max?: number | undefined;
|
|
3654
|
-
}, {
|
|
3655
|
-
exact?: number | undefined;
|
|
3656
|
-
min?: number | undefined;
|
|
3657
|
-
max?: number | undefined;
|
|
3658
|
-
}>>;
|
|
3659
|
-
}, "strip", z.ZodTypeAny, {
|
|
3660
|
-
response?: unknown;
|
|
3661
|
-
isError?: string | boolean | string[] | undefined;
|
|
3662
|
-
schema?: string | undefined;
|
|
3663
|
-
snapshot?: string | undefined;
|
|
3664
|
-
toolsTriggered?: {
|
|
3665
|
-
calls: {
|
|
3666
|
-
name: string;
|
|
3667
|
-
required?: boolean | undefined;
|
|
3668
|
-
arguments?: Record<string, unknown> | undefined;
|
|
3669
|
-
}[];
|
|
3670
|
-
order?: "strict" | "any" | undefined;
|
|
3671
|
-
exclusive?: boolean | undefined;
|
|
3672
|
-
} | undefined;
|
|
3673
|
-
toolCallCount?: {
|
|
3674
|
-
exact?: number | undefined;
|
|
3675
|
-
min?: number | undefined;
|
|
3676
|
-
max?: number | undefined;
|
|
3677
|
-
} | undefined;
|
|
3678
|
-
containsText?: string | string[] | undefined;
|
|
3679
|
-
matchesPattern?: string | string[] | undefined;
|
|
3680
|
-
snapshotSanitizers?: ("uuid" | "jwt" | "timestamp" | "iso-date" | "objectId" | {
|
|
3681
|
-
pattern: string;
|
|
3682
|
-
replacement?: string | undefined;
|
|
3683
|
-
} | {
|
|
3684
|
-
remove: string[];
|
|
3685
|
-
})[] | undefined;
|
|
3686
|
-
passesJudge?: {
|
|
3687
|
-
rubric: "correctness" | "completeness" | "groundedness" | "instruction-following" | "conciseness" | {
|
|
3688
|
-
text: string;
|
|
3689
|
-
};
|
|
3690
|
-
model?: string | undefined;
|
|
3691
|
-
maxTokens?: number | undefined;
|
|
3692
|
-
maxBudgetUsd?: number | undefined;
|
|
3693
|
-
reference?: unknown;
|
|
3694
|
-
threshold?: number | undefined;
|
|
3695
|
-
reps?: number | undefined;
|
|
3696
|
-
provider?: "openai" | "anthropic" | "google" | undefined;
|
|
3697
|
-
apiKeyEnvVar?: string | undefined;
|
|
3698
|
-
temperature?: number | undefined;
|
|
3699
|
-
maxToolOutputSize?: number | undefined;
|
|
3700
|
-
} | undefined;
|
|
3701
|
-
responseSize?: {
|
|
3702
|
-
maxBytes?: number | undefined;
|
|
3703
|
-
minBytes?: number | undefined;
|
|
3704
|
-
} | undefined;
|
|
3705
|
-
}, {
|
|
3706
|
-
response?: unknown;
|
|
3707
|
-
isError?: string | boolean | string[] | undefined;
|
|
3708
|
-
schema?: string | undefined;
|
|
3709
|
-
snapshot?: string | undefined;
|
|
3710
|
-
toolsTriggered?: {
|
|
3711
|
-
calls: {
|
|
3712
|
-
name: string;
|
|
3713
|
-
required?: boolean | undefined;
|
|
3714
|
-
arguments?: Record<string, unknown> | undefined;
|
|
3715
|
-
}[];
|
|
3716
|
-
order?: "strict" | "any" | undefined;
|
|
3717
|
-
exclusive?: boolean | undefined;
|
|
3718
|
-
} | undefined;
|
|
3719
|
-
toolCallCount?: {
|
|
3720
|
-
exact?: number | undefined;
|
|
3721
|
-
min?: number | undefined;
|
|
3722
|
-
max?: number | undefined;
|
|
3723
|
-
} | undefined;
|
|
3724
|
-
containsText?: string | string[] | undefined;
|
|
3725
|
-
matchesPattern?: string | string[] | undefined;
|
|
3726
|
-
snapshotSanitizers?: ("uuid" | "jwt" | "timestamp" | "iso-date" | "objectId" | {
|
|
3727
|
-
pattern: string;
|
|
3728
|
-
replacement?: string | undefined;
|
|
3729
|
-
} | {
|
|
3730
|
-
remove: string[];
|
|
3731
|
-
})[] | undefined;
|
|
3732
|
-
passesJudge?: {
|
|
3733
|
-
rubric: "correctness" | "completeness" | "groundedness" | "instruction-following" | "conciseness" | {
|
|
3734
|
-
text: string;
|
|
3735
|
-
};
|
|
3736
|
-
model?: string | undefined;
|
|
3737
|
-
maxTokens?: number | undefined;
|
|
3738
|
-
maxBudgetUsd?: number | undefined;
|
|
3739
|
-
reference?: unknown;
|
|
3740
|
-
threshold?: number | undefined;
|
|
3741
|
-
reps?: number | undefined;
|
|
3742
|
-
provider?: "openai" | "anthropic" | "google" | undefined;
|
|
3743
|
-
apiKeyEnvVar?: string | undefined;
|
|
3744
|
-
temperature?: number | undefined;
|
|
3745
|
-
maxToolOutputSize?: number | undefined;
|
|
3746
|
-
} | undefined;
|
|
3747
|
-
responseSize?: {
|
|
3748
|
-
maxBytes?: number | undefined;
|
|
3749
|
-
minBytes?: number | undefined;
|
|
3750
|
-
} | undefined;
|
|
3751
|
-
}>>;
|
|
3752
|
-
}, "strip", z.ZodTypeAny, {
|
|
3753
|
-
id: string;
|
|
3754
|
-
args?: Record<string, unknown> | undefined;
|
|
3755
|
-
mode?: "direct" | "llm_host" | undefined;
|
|
3756
|
-
metadata?: Record<string, unknown> | undefined;
|
|
3757
|
-
description?: string | undefined;
|
|
3758
|
-
toolName?: string | undefined;
|
|
3759
|
-
scenario?: string | undefined;
|
|
3760
|
-
llmHostConfig?: {
|
|
3761
|
-
provider: "openai" | "anthropic" | "azure" | "google" | "mistral" | "deepseek" | "openrouter" | "xai" | "vertex-anthropic";
|
|
3762
|
-
model?: string | undefined;
|
|
3763
|
-
maxTokens?: number | undefined;
|
|
3764
|
-
apiKeyEnvVar?: string | undefined;
|
|
3765
|
-
temperature?: number | undefined;
|
|
3766
|
-
maxToolCalls?: number | undefined;
|
|
3767
|
-
} | undefined;
|
|
3768
|
-
iterations?: number | undefined;
|
|
3769
|
-
accuracyThreshold?: number | undefined;
|
|
3770
|
-
judgeReps?: number | undefined;
|
|
3771
|
-
canonicalAnswer?: string | undefined;
|
|
3772
|
-
tags?: string[] | undefined;
|
|
3773
|
-
expect?: {
|
|
3774
|
-
response?: unknown;
|
|
3775
|
-
isError?: string | boolean | string[] | undefined;
|
|
3776
|
-
schema?: string | undefined;
|
|
3777
|
-
snapshot?: string | undefined;
|
|
3778
|
-
toolsTriggered?: {
|
|
3779
|
-
calls: {
|
|
3780
|
-
name: string;
|
|
3781
|
-
required?: boolean | undefined;
|
|
3782
|
-
arguments?: Record<string, unknown> | undefined;
|
|
3783
|
-
}[];
|
|
3784
|
-
order?: "strict" | "any" | undefined;
|
|
3785
|
-
exclusive?: boolean | undefined;
|
|
3786
|
-
} | undefined;
|
|
3787
|
-
toolCallCount?: {
|
|
3788
|
-
exact?: number | undefined;
|
|
3789
|
-
min?: number | undefined;
|
|
3790
|
-
max?: number | undefined;
|
|
3791
|
-
} | undefined;
|
|
3792
|
-
containsText?: string | string[] | undefined;
|
|
3793
|
-
matchesPattern?: string | string[] | undefined;
|
|
3794
|
-
snapshotSanitizers?: ("uuid" | "jwt" | "timestamp" | "iso-date" | "objectId" | {
|
|
3795
|
-
pattern: string;
|
|
3796
|
-
replacement?: string | undefined;
|
|
3797
|
-
} | {
|
|
3798
|
-
remove: string[];
|
|
3799
|
-
})[] | undefined;
|
|
3800
|
-
passesJudge?: {
|
|
3801
|
-
rubric: "correctness" | "completeness" | "groundedness" | "instruction-following" | "conciseness" | {
|
|
3802
|
-
text: string;
|
|
3803
|
-
};
|
|
3804
|
-
model?: string | undefined;
|
|
3805
|
-
maxTokens?: number | undefined;
|
|
3806
|
-
maxBudgetUsd?: number | undefined;
|
|
3807
|
-
reference?: unknown;
|
|
3808
|
-
threshold?: number | undefined;
|
|
3809
|
-
reps?: number | undefined;
|
|
3810
|
-
provider?: "openai" | "anthropic" | "google" | undefined;
|
|
3811
|
-
apiKeyEnvVar?: string | undefined;
|
|
3812
|
-
temperature?: number | undefined;
|
|
3813
|
-
maxToolOutputSize?: number | undefined;
|
|
3814
|
-
} | undefined;
|
|
3815
|
-
responseSize?: {
|
|
3816
|
-
maxBytes?: number | undefined;
|
|
3817
|
-
minBytes?: number | undefined;
|
|
3818
|
-
} | undefined;
|
|
3819
|
-
} | undefined;
|
|
3820
|
-
}, {
|
|
3821
|
-
id: string;
|
|
3822
|
-
args?: Record<string, unknown> | undefined;
|
|
3823
|
-
mode?: "direct" | "llm_host" | undefined;
|
|
3824
|
-
metadata?: Record<string, unknown> | undefined;
|
|
3825
|
-
description?: string | undefined;
|
|
3826
|
-
toolName?: string | undefined;
|
|
3827
|
-
scenario?: string | undefined;
|
|
3828
|
-
llmHostConfig?: {
|
|
3829
|
-
provider: "openai" | "anthropic" | "azure" | "google" | "mistral" | "deepseek" | "openrouter" | "xai" | "vertex-anthropic";
|
|
3830
|
-
model?: string | undefined;
|
|
3831
|
-
maxTokens?: number | undefined;
|
|
3832
|
-
apiKeyEnvVar?: string | undefined;
|
|
3833
|
-
temperature?: number | undefined;
|
|
3834
|
-
maxToolCalls?: number | undefined;
|
|
3835
|
-
} | undefined;
|
|
3836
|
-
iterations?: number | undefined;
|
|
3837
|
-
accuracyThreshold?: number | undefined;
|
|
3838
|
-
judgeReps?: number | undefined;
|
|
3839
|
-
canonicalAnswer?: string | undefined;
|
|
3840
|
-
tags?: string[] | undefined;
|
|
3841
|
-
expect?: {
|
|
3842
|
-
response?: unknown;
|
|
3843
|
-
isError?: string | boolean | string[] | undefined;
|
|
3844
|
-
schema?: string | undefined;
|
|
3845
|
-
snapshot?: string | undefined;
|
|
3846
|
-
toolsTriggered?: {
|
|
3847
|
-
calls: {
|
|
3848
|
-
name: string;
|
|
3849
|
-
required?: boolean | undefined;
|
|
3850
|
-
arguments?: Record<string, unknown> | undefined;
|
|
3851
|
-
}[];
|
|
3852
|
-
order?: "strict" | "any" | undefined;
|
|
3853
|
-
exclusive?: boolean | undefined;
|
|
3854
|
-
} | undefined;
|
|
3855
|
-
toolCallCount?: {
|
|
3856
|
-
exact?: number | undefined;
|
|
3857
|
-
min?: number | undefined;
|
|
3858
|
-
max?: number | undefined;
|
|
3859
|
-
} | undefined;
|
|
3860
|
-
containsText?: string | string[] | undefined;
|
|
3861
|
-
matchesPattern?: string | string[] | undefined;
|
|
3862
|
-
snapshotSanitizers?: ("uuid" | "jwt" | "timestamp" | "iso-date" | "objectId" | {
|
|
3863
|
-
pattern: string;
|
|
3864
|
-
replacement?: string | undefined;
|
|
3865
|
-
} | {
|
|
3866
|
-
remove: string[];
|
|
3867
|
-
})[] | undefined;
|
|
3868
|
-
passesJudge?: {
|
|
3869
|
-
rubric: "correctness" | "completeness" | "groundedness" | "instruction-following" | "conciseness" | {
|
|
3870
|
-
text: string;
|
|
3871
|
-
};
|
|
3872
|
-
model?: string | undefined;
|
|
3873
|
-
maxTokens?: number | undefined;
|
|
3874
|
-
maxBudgetUsd?: number | undefined;
|
|
3875
|
-
reference?: unknown;
|
|
3876
|
-
threshold?: number | undefined;
|
|
3877
|
-
reps?: number | undefined;
|
|
3878
|
-
provider?: "openai" | "anthropic" | "google" | undefined;
|
|
3879
|
-
apiKeyEnvVar?: string | undefined;
|
|
3880
|
-
temperature?: number | undefined;
|
|
3881
|
-
maxToolOutputSize?: number | undefined;
|
|
3882
|
-
} | undefined;
|
|
3883
|
-
responseSize?: {
|
|
3884
|
-
maxBytes?: number | undefined;
|
|
3885
|
-
minBytes?: number | undefined;
|
|
3886
|
-
} | undefined;
|
|
3887
|
-
} | undefined;
|
|
3888
|
-
}>, "many">;
|
|
3056
|
+
}, z.core.$strip>>;
|
|
3057
|
+
}, z.core.$strip>>;
|
|
3058
|
+
}, z.core.$strip>>;
|
|
3889
3059
|
metadata: z.ZodOptional<z.ZodRecord<z.ZodString, z.ZodUnknown>>;
|
|
3890
|
-
},
|
|
3891
|
-
name: string;
|
|
3892
|
-
cases: {
|
|
3893
|
-
id: string;
|
|
3894
|
-
args?: Record<string, unknown> | undefined;
|
|
3895
|
-
mode?: "direct" | "llm_host" | undefined;
|
|
3896
|
-
metadata?: Record<string, unknown> | undefined;
|
|
3897
|
-
description?: string | undefined;
|
|
3898
|
-
toolName?: string | undefined;
|
|
3899
|
-
scenario?: string | undefined;
|
|
3900
|
-
llmHostConfig?: {
|
|
3901
|
-
provider: "openai" | "anthropic" | "azure" | "google" | "mistral" | "deepseek" | "openrouter" | "xai" | "vertex-anthropic";
|
|
3902
|
-
model?: string | undefined;
|
|
3903
|
-
maxTokens?: number | undefined;
|
|
3904
|
-
apiKeyEnvVar?: string | undefined;
|
|
3905
|
-
temperature?: number | undefined;
|
|
3906
|
-
maxToolCalls?: number | undefined;
|
|
3907
|
-
} | undefined;
|
|
3908
|
-
iterations?: number | undefined;
|
|
3909
|
-
accuracyThreshold?: number | undefined;
|
|
3910
|
-
judgeReps?: number | undefined;
|
|
3911
|
-
canonicalAnswer?: string | undefined;
|
|
3912
|
-
tags?: string[] | undefined;
|
|
3913
|
-
expect?: {
|
|
3914
|
-
response?: unknown;
|
|
3915
|
-
isError?: string | boolean | string[] | undefined;
|
|
3916
|
-
schema?: string | undefined;
|
|
3917
|
-
snapshot?: string | undefined;
|
|
3918
|
-
toolsTriggered?: {
|
|
3919
|
-
calls: {
|
|
3920
|
-
name: string;
|
|
3921
|
-
required?: boolean | undefined;
|
|
3922
|
-
arguments?: Record<string, unknown> | undefined;
|
|
3923
|
-
}[];
|
|
3924
|
-
order?: "strict" | "any" | undefined;
|
|
3925
|
-
exclusive?: boolean | undefined;
|
|
3926
|
-
} | undefined;
|
|
3927
|
-
toolCallCount?: {
|
|
3928
|
-
exact?: number | undefined;
|
|
3929
|
-
min?: number | undefined;
|
|
3930
|
-
max?: number | undefined;
|
|
3931
|
-
} | undefined;
|
|
3932
|
-
containsText?: string | string[] | undefined;
|
|
3933
|
-
matchesPattern?: string | string[] | undefined;
|
|
3934
|
-
snapshotSanitizers?: ("uuid" | "jwt" | "timestamp" | "iso-date" | "objectId" | {
|
|
3935
|
-
pattern: string;
|
|
3936
|
-
replacement?: string | undefined;
|
|
3937
|
-
} | {
|
|
3938
|
-
remove: string[];
|
|
3939
|
-
})[] | undefined;
|
|
3940
|
-
passesJudge?: {
|
|
3941
|
-
rubric: "correctness" | "completeness" | "groundedness" | "instruction-following" | "conciseness" | {
|
|
3942
|
-
text: string;
|
|
3943
|
-
};
|
|
3944
|
-
model?: string | undefined;
|
|
3945
|
-
maxTokens?: number | undefined;
|
|
3946
|
-
maxBudgetUsd?: number | undefined;
|
|
3947
|
-
reference?: unknown;
|
|
3948
|
-
threshold?: number | undefined;
|
|
3949
|
-
reps?: number | undefined;
|
|
3950
|
-
provider?: "openai" | "anthropic" | "google" | undefined;
|
|
3951
|
-
apiKeyEnvVar?: string | undefined;
|
|
3952
|
-
temperature?: number | undefined;
|
|
3953
|
-
maxToolOutputSize?: number | undefined;
|
|
3954
|
-
} | undefined;
|
|
3955
|
-
responseSize?: {
|
|
3956
|
-
maxBytes?: number | undefined;
|
|
3957
|
-
minBytes?: number | undefined;
|
|
3958
|
-
} | undefined;
|
|
3959
|
-
} | undefined;
|
|
3960
|
-
}[];
|
|
3961
|
-
metadata?: Record<string, unknown> | undefined;
|
|
3962
|
-
description?: string | undefined;
|
|
3963
|
-
}, {
|
|
3964
|
-
name: string;
|
|
3965
|
-
cases: {
|
|
3966
|
-
id: string;
|
|
3967
|
-
args?: Record<string, unknown> | undefined;
|
|
3968
|
-
mode?: "direct" | "llm_host" | undefined;
|
|
3969
|
-
metadata?: Record<string, unknown> | undefined;
|
|
3970
|
-
description?: string | undefined;
|
|
3971
|
-
toolName?: string | undefined;
|
|
3972
|
-
scenario?: string | undefined;
|
|
3973
|
-
llmHostConfig?: {
|
|
3974
|
-
provider: "openai" | "anthropic" | "azure" | "google" | "mistral" | "deepseek" | "openrouter" | "xai" | "vertex-anthropic";
|
|
3975
|
-
model?: string | undefined;
|
|
3976
|
-
maxTokens?: number | undefined;
|
|
3977
|
-
apiKeyEnvVar?: string | undefined;
|
|
3978
|
-
temperature?: number | undefined;
|
|
3979
|
-
maxToolCalls?: number | undefined;
|
|
3980
|
-
} | undefined;
|
|
3981
|
-
iterations?: number | undefined;
|
|
3982
|
-
accuracyThreshold?: number | undefined;
|
|
3983
|
-
judgeReps?: number | undefined;
|
|
3984
|
-
canonicalAnswer?: string | undefined;
|
|
3985
|
-
tags?: string[] | undefined;
|
|
3986
|
-
expect?: {
|
|
3987
|
-
response?: unknown;
|
|
3988
|
-
isError?: string | boolean | string[] | undefined;
|
|
3989
|
-
schema?: string | undefined;
|
|
3990
|
-
snapshot?: string | undefined;
|
|
3991
|
-
toolsTriggered?: {
|
|
3992
|
-
calls: {
|
|
3993
|
-
name: string;
|
|
3994
|
-
required?: boolean | undefined;
|
|
3995
|
-
arguments?: Record<string, unknown> | undefined;
|
|
3996
|
-
}[];
|
|
3997
|
-
order?: "strict" | "any" | undefined;
|
|
3998
|
-
exclusive?: boolean | undefined;
|
|
3999
|
-
} | undefined;
|
|
4000
|
-
toolCallCount?: {
|
|
4001
|
-
exact?: number | undefined;
|
|
4002
|
-
min?: number | undefined;
|
|
4003
|
-
max?: number | undefined;
|
|
4004
|
-
} | undefined;
|
|
4005
|
-
containsText?: string | string[] | undefined;
|
|
4006
|
-
matchesPattern?: string | string[] | undefined;
|
|
4007
|
-
snapshotSanitizers?: ("uuid" | "jwt" | "timestamp" | "iso-date" | "objectId" | {
|
|
4008
|
-
pattern: string;
|
|
4009
|
-
replacement?: string | undefined;
|
|
4010
|
-
} | {
|
|
4011
|
-
remove: string[];
|
|
4012
|
-
})[] | undefined;
|
|
4013
|
-
passesJudge?: {
|
|
4014
|
-
rubric: "correctness" | "completeness" | "groundedness" | "instruction-following" | "conciseness" | {
|
|
4015
|
-
text: string;
|
|
4016
|
-
};
|
|
4017
|
-
model?: string | undefined;
|
|
4018
|
-
maxTokens?: number | undefined;
|
|
4019
|
-
maxBudgetUsd?: number | undefined;
|
|
4020
|
-
reference?: unknown;
|
|
4021
|
-
threshold?: number | undefined;
|
|
4022
|
-
reps?: number | undefined;
|
|
4023
|
-
provider?: "openai" | "anthropic" | "google" | undefined;
|
|
4024
|
-
apiKeyEnvVar?: string | undefined;
|
|
4025
|
-
temperature?: number | undefined;
|
|
4026
|
-
maxToolOutputSize?: number | undefined;
|
|
4027
|
-
} | undefined;
|
|
4028
|
-
responseSize?: {
|
|
4029
|
-
maxBytes?: number | undefined;
|
|
4030
|
-
minBytes?: number | undefined;
|
|
4031
|
-
} | undefined;
|
|
4032
|
-
} | undefined;
|
|
4033
|
-
}[];
|
|
4034
|
-
metadata?: Record<string, unknown> | undefined;
|
|
4035
|
-
description?: string | undefined;
|
|
4036
|
-
}>;
|
|
3060
|
+
}, z.core.$strip>;
|
|
4037
3061
|
/**
|
|
4038
3062
|
* Type for serialized eval dataset (without Zod schemas)
|
|
4039
3063
|
*/
|
|
@@ -4130,15 +3154,15 @@ interface EvalRunMetadata {
|
|
|
4130
3154
|
timestamp: string;
|
|
4131
3155
|
/** Package version from package.json */
|
|
4132
3156
|
packageVersion: string;
|
|
4133
|
-
/**
|
|
4134
|
-
|
|
3157
|
+
/** MCP host model identifier (if mcp_host mode) */
|
|
3158
|
+
mcpHostModel?: string;
|
|
4135
3159
|
/** Judge model identifier (if judge was used) */
|
|
4136
3160
|
judgeModel?: string;
|
|
4137
3161
|
}
|
|
4138
3162
|
/**
|
|
4139
3163
|
* Individual conformance check result
|
|
4140
3164
|
*/
|
|
4141
|
-
interface MCPConformanceCheck
|
|
3165
|
+
interface MCPConformanceCheck {
|
|
4142
3166
|
/**
|
|
4143
3167
|
* Check name (e.g., 'server_info_present', 'list_tools_succeeds')
|
|
4144
3168
|
*/
|
|
@@ -4167,7 +3191,7 @@ interface MCPConformanceResultData {
|
|
|
4167
3191
|
/**
|
|
4168
3192
|
* Individual check results
|
|
4169
3193
|
*/
|
|
4170
|
-
checks: MCPConformanceCheck
|
|
3194
|
+
checks: MCPConformanceCheck[];
|
|
4171
3195
|
/**
|
|
4172
3196
|
* Server info if available
|
|
4173
3197
|
*/
|
|
@@ -4302,7 +3326,7 @@ interface EvalCaseResult {
|
|
|
4302
3326
|
/**
|
|
4303
3327
|
* Precision of tool calls made (0–1).
|
|
4304
3328
|
* 1.0 means every tool called was expected; <1.0 means unexpected tools were called.
|
|
4305
|
-
*
|
|
3329
|
+
* Populated whenever a `toolsTriggered` expectation is evaluated.
|
|
4306
3330
|
*/
|
|
4307
3331
|
toolPrecision?: number;
|
|
4308
3332
|
/**
|
|
@@ -4321,6 +3345,23 @@ interface EvalCaseResult {
|
|
|
4321
3345
|
* Only present when the case was run with `iterations > 1`.
|
|
4322
3346
|
*/
|
|
4323
3347
|
infrastructureErrorCount?: number;
|
|
3348
|
+
/**
|
|
3349
|
+
* Ordered trace of tool calls made by the LLM in mcp_host mode.
|
|
3350
|
+
* Only populated when the eval case uses toolsTriggered expectations.
|
|
3351
|
+
*/
|
|
3352
|
+
mcpHostTrace?: {
|
|
3353
|
+
/** The ordered sequence of tool calls made by the LLM */
|
|
3354
|
+
calls: Array<{
|
|
3355
|
+
name: string;
|
|
3356
|
+
arguments: Record<string, unknown>;
|
|
3357
|
+
/** 'expected' = was in the expected set, 'unexpected' = was not expected */
|
|
3358
|
+
status: 'expected' | 'unexpected';
|
|
3359
|
+
}>;
|
|
3360
|
+
/** Tools that were required but never called */
|
|
3361
|
+
missed: Array<{
|
|
3362
|
+
name: string;
|
|
3363
|
+
}>;
|
|
3364
|
+
};
|
|
4324
3365
|
}
|
|
4325
3366
|
/**
|
|
4326
3367
|
* Aggregated MCP eval run data
|
|
@@ -4464,13 +3505,13 @@ interface EvalRunnerResult {
|
|
|
4464
3505
|
*/
|
|
4465
3506
|
improvements?: number;
|
|
4466
3507
|
/**
|
|
4467
|
-
* Average tool precision across all
|
|
3508
|
+
* Average tool precision across all mcp_host cases that have a
|
|
4468
3509
|
* `toolsTriggered` expectation (precision = fraction of called tools
|
|
4469
3510
|
* that were expected). Only present when at least one such case ran.
|
|
4470
3511
|
*/
|
|
4471
3512
|
datasetToolPrecision?: number;
|
|
4472
3513
|
/**
|
|
4473
|
-
* Average tool recall across all
|
|
3514
|
+
* Average tool recall across all mcp_host cases that have a
|
|
4474
3515
|
* `toolsTriggered` expectation (recall = fraction of required tools
|
|
4475
3516
|
* that were actually called). Only present when at least one such case ran.
|
|
4476
3517
|
*/
|
|
@@ -4525,7 +3566,7 @@ interface EvalRunnerOptions {
|
|
|
4525
3566
|
*/
|
|
4526
3567
|
concurrency?: number;
|
|
4527
3568
|
/**
|
|
4528
|
-
* Default iteration count for `
|
|
3569
|
+
* Default iteration count for `mcp_host` mode cases that do not specify
|
|
4529
3570
|
* `iterations` explicitly. Has no effect on `direct` mode cases (which are
|
|
4530
3571
|
* deterministic and always default to 1 iteration).
|
|
4531
3572
|
*
|
|
@@ -4536,7 +3577,7 @@ interface EvalRunnerOptions {
|
|
|
4536
3577
|
*
|
|
4537
3578
|
* @example
|
|
4538
3579
|
* ```typescript
|
|
4539
|
-
* // Run all
|
|
3580
|
+
* // Run all mcp_host cases 10 times each by default
|
|
4540
3581
|
* await runEvalDataset({ dataset, defaultLlmIterations: 10 }, { mcp });
|
|
4541
3582
|
* ```
|
|
4542
3583
|
*/
|
|
@@ -4569,12 +3610,12 @@ interface EvalRunnerOptions {
|
|
|
4569
3610
|
*/
|
|
4570
3611
|
baselineResultsFrom?: string;
|
|
4571
3612
|
/**
|
|
4572
|
-
*
|
|
4573
|
-
* Use this to identify which model was used when running
|
|
3613
|
+
* MCP host model identifier to record in run metadata.
|
|
3614
|
+
* Use this to identify which model was used when running mcp_host cases.
|
|
4574
3615
|
*
|
|
4575
3616
|
* @example 'claude-opus-4-20250514'
|
|
4576
3617
|
*/
|
|
4577
|
-
|
|
3618
|
+
mcpHostModel?: string;
|
|
4578
3619
|
/**
|
|
4579
3620
|
* Judge model identifier to record in run metadata.
|
|
4580
3621
|
* Use this to identify which model was used for judge evaluations.
|
|
@@ -4662,8 +3703,6 @@ interface ServerComparisonResult {
|
|
|
4662
3703
|
ties: number;
|
|
4663
3704
|
/** Cases where both failed */
|
|
4664
3705
|
bothFail: number;
|
|
4665
|
-
/** Raw count of cases where both servers failed (same as bothFail) */
|
|
4666
|
-
bothFailCount: number;
|
|
4667
3706
|
/** Cases with a decisive outcome (aWins + bWins + ties, excludes BOTH_FAIL) */
|
|
4668
3707
|
decidedCases: number;
|
|
4669
3708
|
/** Fraction of total cases where both servers failed (bothFail / total) */
|
|
@@ -4714,7 +3753,7 @@ type ServerComparisonOptions = Omit<EvalRunnerOptions, 'saveResultsTo' | 'baseli
|
|
|
4714
3753
|
declare function runServerComparison(options: ServerComparisonOptions, contextA: EvalContext, contextB: EvalContext): Promise<ServerComparisonResult>;
|
|
4715
3754
|
|
|
4716
3755
|
/**
|
|
4717
|
-
*
|
|
3756
|
+
* MCP Host Simulation - Main entry point
|
|
4718
3757
|
*
|
|
4719
3758
|
* All providers (openai, anthropic, google, azure, mistral, deepseek,
|
|
4720
3759
|
* openrouter, xai) run through the Vercel AI SDK orchestrator, which uses
|
|
@@ -4733,7 +3772,7 @@ declare function runServerComparison(options: ServerComparisonOptions, contextA:
|
|
|
4733
3772
|
*/
|
|
4734
3773
|
|
|
4735
3774
|
/**
|
|
4736
|
-
* Simulates an
|
|
3775
|
+
* Simulates an MCP host interacting with an MCP server.
|
|
4737
3776
|
*
|
|
4738
3777
|
* The LLM chooses which tools to call based solely on their descriptions and
|
|
4739
3778
|
* schemas, testing discoverability and parameter clarity at the level a real
|
|
@@ -4745,12 +3784,12 @@ declare function runServerComparison(options: ServerComparisonOptions, contextA:
|
|
|
4745
3784
|
*
|
|
4746
3785
|
* @param mcp - MCP fixture API
|
|
4747
3786
|
* @param scenario - Natural language prompt describing what the LLM should do
|
|
4748
|
-
* @param config -
|
|
3787
|
+
* @param config - MCP host configuration (provider, model, temperature, etc.)
|
|
4749
3788
|
* @returns Simulation result with tool calls, final response, and latency data
|
|
4750
3789
|
*
|
|
4751
3790
|
* @example
|
|
4752
3791
|
* ```typescript
|
|
4753
|
-
* const result = await
|
|
3792
|
+
* const result = await simulateMCPHost(mcp,
|
|
4754
3793
|
* "Find recent documents about MCP testing frameworks",
|
|
4755
3794
|
* { provider: 'anthropic', model: 'claude-3-5-sonnet-20241022' }
|
|
4756
3795
|
* );
|
|
@@ -4759,7 +3798,7 @@ declare function runServerComparison(options: ServerComparisonOptions, contextA:
|
|
|
4759
3798
|
* expect(result.toolCalls.map(c => c.name)).toContain('search');
|
|
4760
3799
|
* ```
|
|
4761
3800
|
*/
|
|
4762
|
-
declare function
|
|
3801
|
+
declare function simulateMCPHost(mcp: MCPFixtureApi, scenario: string, config: MCPHostConfig): Promise<MCPHostSimulationResult>;
|
|
4763
3802
|
/**
|
|
4764
3803
|
* Returns true if the given provider is supported.
|
|
4765
3804
|
*
|
|
@@ -4838,14 +3877,6 @@ interface MCPConformanceOptions {
|
|
|
4838
3877
|
*/
|
|
4839
3878
|
checkPrompts?: boolean;
|
|
4840
3879
|
}
|
|
4841
|
-
/**
|
|
4842
|
-
* Individual check result
|
|
4843
|
-
*/
|
|
4844
|
-
interface MCPConformanceCheck {
|
|
4845
|
-
name: string;
|
|
4846
|
-
pass: boolean;
|
|
4847
|
-
message: string;
|
|
4848
|
-
}
|
|
4849
3880
|
/**
|
|
4850
3881
|
* Raw MCP responses for snapshotting
|
|
4851
3882
|
*/
|
|
@@ -4978,4 +4009,4 @@ interface MCPEvalReporterConfig {
|
|
|
4978
4009
|
includeAutoTracking?: boolean;
|
|
4979
4010
|
}
|
|
4980
4011
|
|
|
4981
|
-
export { type AuthType, BUILT_IN_RUBRICS, type BuiltInRubric, type BuiltInSanitizer, CLIOAuthClient, type CLIOAuthClientConfig, type CLIOAuthResult, type CaseComparisonResult, type ClientCredentialsConfig, type ComparisonOutcome, type ContentBlock, type CreateMCPClientOptions, DiscoveryError, ENV_VAR_NAMES, type EvalCase, type EvalCaseResult, EvalCaseSchema, type EvalContext, type EvalDataset, EvalDatasetSchema, type EvalExpectBlock, type EvalExpectationResult, type EvalMode, type EvalRunnerOptions, type EvalRunnerResult, type ExpectationBreakdown, type ExpectationResultMap, type ExpectationType, type FieldRemovalSanitizer, type HttpMCPConfig, type IterationResult, type Judge, type JudgeConfig, type JudgeMatcherOptions, type JudgeResult, type JudgeValidatorConfig, type
|
|
4012
|
+
export { type AuthType, BUILT_IN_RUBRICS, type BuiltInRubric, type BuiltInSanitizer, CLIOAuthClient, type CLIOAuthClientConfig, type CLIOAuthResult, type CaseComparisonResult, type ClientCredentialsConfig, type ComparisonOutcome, type ContentBlock, type CreateMCPClientOptions, DiscoveryError, ENV_VAR_NAMES, type EvalCase, type EvalCaseResult, EvalCaseSchema, type EvalContext, type EvalDataset, EvalDatasetSchema, type EvalExpectBlock, type EvalExpectationResult, type EvalMode, type EvalRunnerOptions, type EvalRunnerResult, type ExpectationBreakdown, type ExpectationResultMap, type ExpectationType, type FieldRemovalSanitizer, type HttpMCPConfig, type IterationResult, type Judge, type JudgeConfig, type JudgeMatcherOptions, type JudgeResult, type JudgeValidatorConfig, type LLMProvider, type LLMToolCall, type LoadDatasetOptions, type MCPAuthConfig, type MCPAuthFixtures, type MCPClientCredentialsConfig, type MCPConfig, MCPConfigSchema, type MCPConformanceCheck, type MCPConformanceOptions, type MCPConformanceRaw, type MCPConformanceResult, type MCPConformanceResultData, type MCPEvalData, type MCPEvalHistoricalSummary, type MCPEvalReporterConfig, type MCPEvalRunData, type MCPFixtureApi, type MCPFixtureOptions, type MCPHostCapabilities, type MCPHostConfig, type MCPHostSimulationResult, type MCPHostSimulator, type MCPOAuthConfig, type MCPServerCapabilitiesData, MCP_PROTOCOL_VERSION, type NormalizedToolResponse, type OAuthSetupConfig, type PatternValidatorOptions, PlaywrightOAuthClientProvider, type PlaywrightOAuthClientProviderConfig, type PredicateResult, type ProtectedResourceDiscoveryResult, type ProtectedResourceMetadata, type ProviderKind, type RegexSanitizer, type ResultSource, type RubricSpec, type SchemaRegistry, type SchemaValidatorOptions, type SerializedEvalDataset, type ServerComparisonOptions, type ServerComparisonResult, type SizeValidatorOptions, type SnapshotSanitizer, SnapshotSanitizers, type StdioMCPConfig, type StoredClientInfo, type StoredOAuthState, type StoredServerMetadata, type StoredTokens, type TextValidatorOptions, type TokenResult, type ToolCallCountOptions, type ToolCallExpectation, type ToolPredicate, type UsageMetrics, type ValidationResult, closeMCPClient, createJudge, createMCPClientForConfig, createMCPFixture, createTokenAuthHeaders, discoverAuthorizationServer, discoverProtectedResource, expect, extractText, getMissingDependencyMessage, getResponseSizeBytes, hasValidTokens, injectTokens, isBuiltInRubric, isHttpConfig, isProviderAvailable, isStdioConfig, isTokenExpired, isTokenExpiringSoon, loadBaseline, loadEvalDataset, loadEvalDatasetFromObject, loadTokens, loadTokensFromEnv, test as mcpAuthTest, normalizeToolResponse, normalizeWhitespace, performClientCredentialsFlow, performOAuthSetup, performOAuthSetupIfNeeded, resolveRubric, runConformanceChecks, runEvalCase, runEvalDataset, runServerComparison, saveBaseline, simulateMCPHost, test$1 as test, validateAccessToken, validateError, validateEvalCase, validateEvalDataset, validateJudge, validateMCPConfig, validatePattern, validateResponse, validateSchema, validateSize, validateText, validateToolCallCount, validateToolCalls };
|