@gleanwork/mcp-server-tester 1.0.0-beta.3 → 1.0.0-beta.4

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
package/dist/index.d.ts CHANGED
@@ -233,314 +233,64 @@ type MCPConfig = StdioMCPConfig | HttpMCPConfig;
233
233
  /**
234
234
  * Union schema for MCPConfig (validates based on transport type)
235
235
  */
236
- declare const MCPConfigSchema: z.ZodDiscriminatedUnion<"transport", [z.ZodObject<{
236
+ declare const MCPConfigSchema: z.ZodDiscriminatedUnion<[z.ZodObject<{
237
237
  transport: z.ZodLiteral<"stdio">;
238
238
  command: z.ZodString;
239
- args: z.ZodOptional<z.ZodArray<z.ZodString, "many">>;
239
+ args: z.ZodOptional<z.ZodArray<z.ZodString>>;
240
240
  cwd: z.ZodOptional<z.ZodString>;
241
241
  env: z.ZodOptional<z.ZodRecord<z.ZodString, z.ZodString>>;
242
242
  capabilities: z.ZodOptional<z.ZodObject<{
243
243
  sampling: z.ZodOptional<z.ZodRecord<z.ZodString, z.ZodUnknown>>;
244
244
  roots: z.ZodOptional<z.ZodObject<{
245
245
  listChanged: z.ZodBoolean;
246
- }, "strip", z.ZodTypeAny, {
247
- listChanged: boolean;
248
- }, {
249
- listChanged: boolean;
250
- }>>;
251
- }, "strip", z.ZodTypeAny, {
252
- sampling?: Record<string, unknown> | undefined;
253
- roots?: {
254
- listChanged: boolean;
255
- } | undefined;
256
- }, {
257
- sampling?: Record<string, unknown> | undefined;
258
- roots?: {
259
- listChanged: boolean;
260
- } | undefined;
261
- }>>;
246
+ }, z.core.$strip>>;
247
+ }, z.core.$strip>>;
262
248
  connectTimeoutMs: z.ZodOptional<z.ZodNumber>;
263
249
  requestTimeoutMs: z.ZodOptional<z.ZodNumber>;
264
250
  callTimeoutMs: z.ZodOptional<z.ZodNumber>;
265
251
  quiet: z.ZodOptional<z.ZodBoolean>;
266
- }, "strip", z.ZodTypeAny, {
267
- transport: "stdio";
268
- command: string;
269
- args?: string[] | undefined;
270
- cwd?: string | undefined;
271
- env?: Record<string, string> | undefined;
272
- capabilities?: {
273
- sampling?: Record<string, unknown> | undefined;
274
- roots?: {
275
- listChanged: boolean;
276
- } | undefined;
277
- } | undefined;
278
- connectTimeoutMs?: number | undefined;
279
- requestTimeoutMs?: number | undefined;
280
- callTimeoutMs?: number | undefined;
281
- quiet?: boolean | undefined;
282
- }, {
283
- transport: "stdio";
284
- command: string;
285
- args?: string[] | undefined;
286
- cwd?: string | undefined;
287
- env?: Record<string, string> | undefined;
288
- capabilities?: {
289
- sampling?: Record<string, unknown> | undefined;
290
- roots?: {
291
- listChanged: boolean;
292
- } | undefined;
293
- } | undefined;
294
- connectTimeoutMs?: number | undefined;
295
- requestTimeoutMs?: number | undefined;
296
- callTimeoutMs?: number | undefined;
297
- quiet?: boolean | undefined;
298
- }>, z.ZodObject<{
252
+ }, z.core.$strip>, z.ZodObject<{
299
253
  transport: z.ZodLiteral<"http">;
300
- serverUrl: z.ZodEffects<z.ZodString, string, string>;
254
+ serverUrl: z.ZodString;
301
255
  headers: z.ZodOptional<z.ZodRecord<z.ZodString, z.ZodString>>;
302
256
  capabilities: z.ZodOptional<z.ZodObject<{
303
257
  sampling: z.ZodOptional<z.ZodRecord<z.ZodString, z.ZodUnknown>>;
304
258
  roots: z.ZodOptional<z.ZodObject<{
305
259
  listChanged: z.ZodBoolean;
306
- }, "strip", z.ZodTypeAny, {
307
- listChanged: boolean;
308
- }, {
309
- listChanged: boolean;
310
- }>>;
311
- }, "strip", z.ZodTypeAny, {
312
- sampling?: Record<string, unknown> | undefined;
313
- roots?: {
314
- listChanged: boolean;
315
- } | undefined;
316
- }, {
317
- sampling?: Record<string, unknown> | undefined;
318
- roots?: {
319
- listChanged: boolean;
320
- } | undefined;
321
- }>>;
260
+ }, z.core.$strip>>;
261
+ }, z.core.$strip>>;
322
262
  connectTimeoutMs: z.ZodOptional<z.ZodNumber>;
323
263
  requestTimeoutMs: z.ZodOptional<z.ZodNumber>;
324
264
  callTimeoutMs: z.ZodOptional<z.ZodNumber>;
325
- auth: z.ZodOptional<z.ZodEffects<z.ZodObject<{
265
+ auth: z.ZodOptional<z.ZodObject<{
326
266
  accessToken: z.ZodOptional<z.ZodString>;
327
267
  oauth: z.ZodOptional<z.ZodObject<{
328
268
  serverUrl: z.ZodString;
329
- scopes: z.ZodOptional<z.ZodArray<z.ZodString, "many">>;
269
+ scopes: z.ZodOptional<z.ZodArray<z.ZodString>>;
330
270
  resource: z.ZodOptional<z.ZodString>;
331
271
  authStatePath: z.ZodOptional<z.ZodString>;
332
272
  clientId: z.ZodOptional<z.ZodString>;
333
273
  clientSecret: z.ZodOptional<z.ZodString>;
334
274
  redirectUri: z.ZodOptional<z.ZodString>;
335
- }, "strip", z.ZodTypeAny, {
336
- serverUrl: string;
337
- scopes?: string[] | undefined;
338
- resource?: string | undefined;
339
- authStatePath?: string | undefined;
340
- clientId?: string | undefined;
341
- clientSecret?: string | undefined;
342
- redirectUri?: string | undefined;
343
- }, {
344
- serverUrl: string;
345
- scopes?: string[] | undefined;
346
- resource?: string | undefined;
347
- authStatePath?: string | undefined;
348
- clientId?: string | undefined;
349
- clientSecret?: string | undefined;
350
- redirectUri?: string | undefined;
351
- }>>;
275
+ }, z.core.$strip>>;
352
276
  clientCredentials: z.ZodOptional<z.ZodObject<{
353
277
  clientId: z.ZodOptional<z.ZodString>;
354
278
  clientSecret: z.ZodOptional<z.ZodString>;
355
279
  tokenEndpoint: z.ZodOptional<z.ZodString>;
356
- scopes: z.ZodOptional<z.ZodArray<z.ZodString, "many">>;
357
- }, "strip", z.ZodTypeAny, {
358
- scopes?: string[] | undefined;
359
- clientId?: string | undefined;
360
- clientSecret?: string | undefined;
361
- tokenEndpoint?: string | undefined;
362
- }, {
363
- scopes?: string[] | undefined;
364
- clientId?: string | undefined;
365
- clientSecret?: string | undefined;
366
- tokenEndpoint?: string | undefined;
367
- }>>;
368
- }, "strip", z.ZodTypeAny, {
369
- accessToken?: string | undefined;
370
- oauth?: {
371
- serverUrl: string;
372
- scopes?: string[] | undefined;
373
- resource?: string | undefined;
374
- authStatePath?: string | undefined;
375
- clientId?: string | undefined;
376
- clientSecret?: string | undefined;
377
- redirectUri?: string | undefined;
378
- } | undefined;
379
- clientCredentials?: {
380
- scopes?: string[] | undefined;
381
- clientId?: string | undefined;
382
- clientSecret?: string | undefined;
383
- tokenEndpoint?: string | undefined;
384
- } | undefined;
385
- }, {
386
- accessToken?: string | undefined;
387
- oauth?: {
388
- serverUrl: string;
389
- scopes?: string[] | undefined;
390
- resource?: string | undefined;
391
- authStatePath?: string | undefined;
392
- clientId?: string | undefined;
393
- clientSecret?: string | undefined;
394
- redirectUri?: string | undefined;
395
- } | undefined;
396
- clientCredentials?: {
397
- scopes?: string[] | undefined;
398
- clientId?: string | undefined;
399
- clientSecret?: string | undefined;
400
- tokenEndpoint?: string | undefined;
401
- } | undefined;
402
- }>, {
403
- accessToken?: string | undefined;
404
- oauth?: {
405
- serverUrl: string;
406
- scopes?: string[] | undefined;
407
- resource?: string | undefined;
408
- authStatePath?: string | undefined;
409
- clientId?: string | undefined;
410
- clientSecret?: string | undefined;
411
- redirectUri?: string | undefined;
412
- } | undefined;
413
- clientCredentials?: {
414
- scopes?: string[] | undefined;
415
- clientId?: string | undefined;
416
- clientSecret?: string | undefined;
417
- tokenEndpoint?: string | undefined;
418
- } | undefined;
419
- }, {
420
- accessToken?: string | undefined;
421
- oauth?: {
422
- serverUrl: string;
423
- scopes?: string[] | undefined;
424
- resource?: string | undefined;
425
- authStatePath?: string | undefined;
426
- clientId?: string | undefined;
427
- clientSecret?: string | undefined;
428
- redirectUri?: string | undefined;
429
- } | undefined;
430
- clientCredentials?: {
431
- scopes?: string[] | undefined;
432
- clientId?: string | undefined;
433
- clientSecret?: string | undefined;
434
- tokenEndpoint?: string | undefined;
435
- } | undefined;
436
- }>>;
280
+ scopes: z.ZodOptional<z.ZodArray<z.ZodString>>;
281
+ }, z.core.$strip>>;
282
+ }, z.core.$strip>>;
437
283
  proxy: z.ZodOptional<z.ZodObject<{
438
284
  url: z.ZodString;
439
- }, "strip", z.ZodTypeAny, {
440
- url: string;
441
- }, {
442
- url: string;
443
- }>>;
285
+ }, z.core.$strip>>;
444
286
  retryAttempts: z.ZodOptional<z.ZodNumber>;
445
287
  tls: z.ZodOptional<z.ZodObject<{
446
288
  ca: z.ZodOptional<z.ZodString>;
447
289
  cert: z.ZodOptional<z.ZodString>;
448
290
  key: z.ZodOptional<z.ZodString>;
449
291
  rejectUnauthorized: z.ZodOptional<z.ZodBoolean>;
450
- }, "strip", z.ZodTypeAny, {
451
- ca?: string | undefined;
452
- cert?: string | undefined;
453
- key?: string | undefined;
454
- rejectUnauthorized?: boolean | undefined;
455
- }, {
456
- ca?: string | undefined;
457
- cert?: string | undefined;
458
- key?: string | undefined;
459
- rejectUnauthorized?: boolean | undefined;
460
- }>>;
461
- }, "strip", z.ZodTypeAny, {
462
- serverUrl: string;
463
- transport: "http";
464
- capabilities?: {
465
- sampling?: Record<string, unknown> | undefined;
466
- roots?: {
467
- listChanged: boolean;
468
- } | undefined;
469
- } | undefined;
470
- connectTimeoutMs?: number | undefined;
471
- requestTimeoutMs?: number | undefined;
472
- callTimeoutMs?: number | undefined;
473
- headers?: Record<string, string> | undefined;
474
- auth?: {
475
- accessToken?: string | undefined;
476
- oauth?: {
477
- serverUrl: string;
478
- scopes?: string[] | undefined;
479
- resource?: string | undefined;
480
- authStatePath?: string | undefined;
481
- clientId?: string | undefined;
482
- clientSecret?: string | undefined;
483
- redirectUri?: string | undefined;
484
- } | undefined;
485
- clientCredentials?: {
486
- scopes?: string[] | undefined;
487
- clientId?: string | undefined;
488
- clientSecret?: string | undefined;
489
- tokenEndpoint?: string | undefined;
490
- } | undefined;
491
- } | undefined;
492
- proxy?: {
493
- url: string;
494
- } | undefined;
495
- retryAttempts?: number | undefined;
496
- tls?: {
497
- ca?: string | undefined;
498
- cert?: string | undefined;
499
- key?: string | undefined;
500
- rejectUnauthorized?: boolean | undefined;
501
- } | undefined;
502
- }, {
503
- serverUrl: string;
504
- transport: "http";
505
- capabilities?: {
506
- sampling?: Record<string, unknown> | undefined;
507
- roots?: {
508
- listChanged: boolean;
509
- } | undefined;
510
- } | undefined;
511
- connectTimeoutMs?: number | undefined;
512
- requestTimeoutMs?: number | undefined;
513
- callTimeoutMs?: number | undefined;
514
- headers?: Record<string, string> | undefined;
515
- auth?: {
516
- accessToken?: string | undefined;
517
- oauth?: {
518
- serverUrl: string;
519
- scopes?: string[] | undefined;
520
- resource?: string | undefined;
521
- authStatePath?: string | undefined;
522
- clientId?: string | undefined;
523
- clientSecret?: string | undefined;
524
- redirectUri?: string | undefined;
525
- } | undefined;
526
- clientCredentials?: {
527
- scopes?: string[] | undefined;
528
- clientId?: string | undefined;
529
- clientSecret?: string | undefined;
530
- tokenEndpoint?: string | undefined;
531
- } | undefined;
532
- } | undefined;
533
- proxy?: {
534
- url: string;
535
- } | undefined;
536
- retryAttempts?: number | undefined;
537
- tls?: {
538
- ca?: string | undefined;
539
- cert?: string | undefined;
540
- key?: string | undefined;
541
- rejectUnauthorized?: boolean | undefined;
542
- } | undefined;
543
- }>]>;
292
+ }, z.core.$strip>>;
293
+ }, z.core.$strip>], "transport">;
544
294
  /**
545
295
  * Validates an MCPConfig object
546
296
  *
@@ -1790,9 +1540,9 @@ declare function validateError(response: unknown, expected?: boolean | string |
1790
1540
  declare function validateSize(response: unknown, options: SizeValidatorOptions): ValidationResult;
1791
1541
 
1792
1542
  /**
1793
- * Tool call validators for llm_host simulation results.
1543
+ * Tool call validators for mcp_host simulation results.
1794
1544
  *
1795
- * These validators extract the tool call trace from an LLMHostSimulationResult
1545
+ * These validators extract the tool call trace from an MCPHostSimulationResult
1796
1546
  * and apply assertions against expected call lists and counts.
1797
1547
  */
1798
1548
 
@@ -1811,16 +1561,16 @@ interface ToolCallCountOptions {
1811
1561
  exact?: number;
1812
1562
  }
1813
1563
  /**
1814
- * Validates tool calls made during an LLM host simulation.
1564
+ * Validates tool calls made during an MCP host simulation.
1815
1565
  *
1816
- * @param response - Must be an LLMHostSimulationResult (from llm_host mode)
1566
+ * @param response - Must be an MCPHostSimulationResult (from mcp_host mode)
1817
1567
  * @param expectation - Expected tool call specification
1818
1568
  */
1819
1569
  declare function validateToolCalls(response: unknown, expectation: ToolCallExpectation): ValidationResult;
1820
1570
  /**
1821
- * Validates the number of tool calls made during an LLM host simulation.
1571
+ * Validates the number of tool calls made during an MCP host simulation.
1822
1572
  *
1823
- * @param response - Must be an LLMHostSimulationResult (from llm_host mode)
1573
+ * @param response - Must be an MCPHostSimulationResult (from mcp_host mode)
1824
1574
  * @param options - Count constraints (min, max, exact)
1825
1575
  */
1826
1576
  declare function validateToolCallCount(response: unknown, options: ToolCallCountOptions): ValidationResult;
@@ -2223,7 +1973,7 @@ declare global {
2223
1973
  */
2224
1974
  toSatisfyToolPredicate(predicate: ToolPredicate, description?: string): Promise<R>;
2225
1975
  /**
2226
- * Validates which tools the LLM called during an llm_host simulation.
1976
+ * Validates which tools the LLM called during an mcp_host simulation.
2227
1977
  *
2228
1978
  * @example
2229
1979
  * ```typescript
@@ -2235,7 +1985,7 @@ declare global {
2235
1985
  */
2236
1986
  toHaveToolCalls(expectation: ToolCallExpectation): R;
2237
1987
  /**
2238
- * Validates the number of tool calls made during an llm_host simulation.
1988
+ * Validates the number of tool calls made during an mcp_host simulation.
2239
1989
  *
2240
1990
  * @example
2241
1991
  * ```typescript
@@ -2603,7 +2353,7 @@ declare function toSatisfyToolPredicate(this: {
2603
2353
  /**
2604
2354
  * toHaveToolCalls Matcher
2605
2355
  *
2606
- * Validates which tools the LLM called during an llm_host simulation.
2356
+ * Validates which tools the LLM called during an mcp_host simulation.
2607
2357
  */
2608
2358
 
2609
2359
  /**
@@ -2619,7 +2369,7 @@ declare function toHaveToolCalls(this: {
2619
2369
  /**
2620
2370
  * toHaveToolCallCount Matcher
2621
2371
  *
2622
- * Validates the number of tool calls made during an llm_host simulation.
2372
+ * Validates the number of tool calls made during an mcp_host simulation.
2623
2373
  */
2624
2374
 
2625
2375
  /**
@@ -2728,9 +2478,9 @@ interface MCPAuthFixtures {
2728
2478
  declare const test: playwright_test.TestType<playwright_test.PlaywrightTestArgs & playwright_test.PlaywrightTestOptions & MCPAuthFixtures, playwright_test.PlaywrightWorkerArgs & playwright_test.PlaywrightWorkerOptions>;
2729
2479
 
2730
2480
  /**
2731
- * Types and interfaces for LLM host simulation mode
2481
+ * Types and interfaces for MCP host simulation mode
2732
2482
  *
2733
- * This module provides types for testing MCP servers through LLM hosts,
2483
+ * This module provides types for testing MCP servers through MCP hosts,
2734
2484
  * validating tool descriptions, parameter clarity, and discoverability.
2735
2485
  */
2736
2486
 
@@ -2759,9 +2509,9 @@ type LLMProvider = 'openai' | 'anthropic' | 'azure' | 'google' | 'mistral' | 'de
2759
2509
  */
2760
2510
  | 'vertex-anthropic';
2761
2511
  /**
2762
- * Configuration for LLM host simulation
2512
+ * Configuration for MCP host simulation
2763
2513
  */
2764
- interface LLMHostConfig {
2514
+ interface MCPHostConfig {
2765
2515
  /**
2766
2516
  * LLM provider to use
2767
2517
  */
@@ -2801,9 +2551,9 @@ interface LLMToolCall {
2801
2551
  id?: string;
2802
2552
  }
2803
2553
  /**
2804
- * Result from an LLM host simulation
2554
+ * Result from an MCP host simulation
2805
2555
  */
2806
- interface LLMHostSimulationResult {
2556
+ interface MCPHostSimulationResult {
2807
2557
  /** Whether the simulation succeeded */
2808
2558
  success: boolean;
2809
2559
  /** Tool calls made by the LLM */
@@ -2831,33 +2581,33 @@ interface LLMHostSimulationResult {
2831
2581
  mcpDurationMs?: number;
2832
2582
  }
2833
2583
  /**
2834
- * Interface for LLM host simulators.
2584
+ * Interface for MCP host simulators.
2835
2585
  *
2836
2586
  * The only built-in implementation is the Vercel AI SDK orchestrator
2837
- * (src/evals/llmHost/adapters/vercel.ts). Custom implementations can be
2587
+ * (src/evals/mcpHost/adapters/vercel.ts). Custom implementations can be
2838
2588
  * created for specialised testing needs.
2839
2589
  */
2840
- interface LLMHostSimulator {
2590
+ interface MCPHostSimulator {
2841
2591
  /**
2842
- * Simulates an LLM host interacting with an MCP server
2592
+ * Simulates an MCP host interacting with an MCP server
2843
2593
  *
2844
2594
  * @param mcp - MCP fixture API
2845
2595
  * @param scenario - Natural language prompt describing what the LLM should do
2846
- * @param config - LLM host configuration
2596
+ * @param config - MCP host configuration
2847
2597
  * @returns Simulation result with tool calls and response
2848
2598
  */
2849
- simulate(mcp: MCPFixtureApi, scenario: string, config: LLMHostConfig): Promise<LLMHostSimulationResult>;
2599
+ simulate(mcp: MCPFixtureApi, scenario: string, config: MCPHostConfig): Promise<MCPHostSimulationResult>;
2850
2600
  }
2851
2601
 
2852
2602
  /**
2853
2603
  * Evaluation mode
2854
2604
  */
2855
- type EvalMode = 'direct' | 'llm_host';
2605
+ type EvalMode = 'direct' | 'mcp_host';
2856
2606
  /**
2857
2607
  * A single eval test case
2858
2608
  *
2859
2609
  * For 'direct' mode: toolName and args are required
2860
- * For 'llm_host' mode: scenario and llmHostConfig are required
2610
+ * For 'mcp_host' mode: scenario and mcpHostConfig are required
2861
2611
  */
2862
2612
  interface EvalCase {
2863
2613
  /**
@@ -2871,39 +2621,39 @@ interface EvalCase {
2871
2621
  /**
2872
2622
  * Evaluation mode
2873
2623
  * - 'direct': Direct API calls to MCP tools (default)
2874
- * - 'llm_host': LLM-driven tool selection via natural language
2624
+ * - 'mcp_host': LLM-driven tool selection via natural language
2875
2625
  *
2876
2626
  * @default 'direct'
2877
2627
  */
2878
2628
  mode?: EvalMode;
2879
2629
  /**
2880
- * Name of the MCP tool to call (required for 'direct' mode, optional for 'llm_host' mode)
2630
+ * Name of the MCP tool to call (required for 'direct' mode, optional for 'mcp_host' mode)
2881
2631
  */
2882
2632
  toolName?: string;
2883
2633
  /**
2884
- * Arguments to pass to the tool (required for 'direct' mode, optional for 'llm_host' mode)
2634
+ * Arguments to pass to the tool (required for 'direct' mode, optional for 'mcp_host' mode)
2885
2635
  */
2886
2636
  args?: Record<string, unknown>;
2887
2637
  /**
2888
- * Natural language scenario for LLM to execute (optional, required for 'llm_host' mode)
2638
+ * Natural language scenario for the LLM to execute (required for 'mcp_host' mode, optional otherwise)
2889
2639
  *
2890
2640
  * @example "Get the weather for London and tell me if I need an umbrella"
2891
2641
  */
2892
2642
  scenario?: string;
2893
2643
  /**
2894
- * LLM host configuration (optional for 'llm_host' mode)
2644
+ * MCP host configuration (optional for 'mcp_host' mode)
2895
2645
  *
2896
2646
  * If not specified, uses default configuration from test environment
2897
2647
  */
2898
- llmHostConfig?: LLMHostConfig;
2648
+ mcpHostConfig?: MCPHostConfig;
2899
2649
  /**
2900
2650
  * Additional metadata for this test case
2901
2651
  *
2902
- * For 'llm_host' mode, can include 'expectedToolCalls' for validation
2652
+ * For 'mcp_host' mode, can include 'expectedToolCalls' for validation
2903
2653
  */
2904
2654
  metadata?: Record<string, unknown>;
2905
2655
  /**
2906
- * Number of times to run this case and compute an accuracy score.
2656
+ * Number of times to run this case and compute an assertion pass rate.
2907
2657
  * When > 1, `EvalCaseResult.assertionPassRate` is populated and `pass` is determined
2908
2658
  * by `accuracyThreshold` rather than a single run.
2909
2659
  * @default 1
@@ -3035,8 +2785,8 @@ interface EvalExpectBlock {
3035
2785
  minBytes?: number;
3036
2786
  };
3037
2787
  /**
3038
- * Asserts which tools the LLM called during an llm_host simulation.
3039
- * Only meaningful for llm_host mode — direct mode has no tool call trace.
2788
+ * Asserts which tools the LLM called during an mcp_host simulation.
2789
+ * Only meaningful for mcp_host mode — direct mode has no tool call trace.
3040
2790
  */
3041
2791
  toolsTriggered?: {
3042
2792
  /** Expected tool calls */
@@ -3057,7 +2807,7 @@ interface EvalExpectBlock {
3057
2807
  exclusive?: boolean;
3058
2808
  };
3059
2809
  /**
3060
- * Asserts the number of tool calls made during an llm_host simulation.
2810
+ * Asserts the number of tool calls made during an mcp_host simulation.
3061
2811
  */
3062
2812
  toolCallCount?: {
3063
2813
  /** Minimum number of tool calls */
@@ -3096,399 +2846,109 @@ interface EvalDataset {
3096
2846
  /**
3097
2847
  * Zod schema for EvalCase
3098
2848
  *
3099
- * toolName and args are optional for llm_host mode (which uses scenario instead)
2849
+ * toolName and args are optional for mcp_host mode (which uses scenario instead)
3100
2850
  */
3101
2851
  declare const EvalCaseSchema: z.ZodObject<{
3102
2852
  id: z.ZodString;
3103
2853
  description: z.ZodOptional<z.ZodString>;
3104
- mode: z.ZodOptional<z.ZodEnum<["direct", "llm_host"]>>;
2854
+ mode: z.ZodOptional<z.ZodEnum<{
2855
+ direct: "direct";
2856
+ mcp_host: "mcp_host";
2857
+ }>>;
3105
2858
  toolName: z.ZodOptional<z.ZodString>;
3106
2859
  args: z.ZodOptional<z.ZodRecord<z.ZodString, z.ZodUnknown>>;
3107
2860
  scenario: z.ZodOptional<z.ZodString>;
3108
- llmHostConfig: z.ZodOptional<z.ZodObject<{
3109
- provider: z.ZodEnum<["openai", "anthropic", "azure", "google", "mistral", "deepseek", "openrouter", "xai", "vertex-anthropic"]>;
2861
+ mcpHostConfig: z.ZodOptional<z.ZodObject<{
2862
+ provider: z.ZodEnum<{
2863
+ openai: "openai";
2864
+ anthropic: "anthropic";
2865
+ azure: "azure";
2866
+ google: "google";
2867
+ mistral: "mistral";
2868
+ deepseek: "deepseek";
2869
+ openrouter: "openrouter";
2870
+ xai: "xai";
2871
+ "vertex-anthropic": "vertex-anthropic";
2872
+ }>;
3110
2873
  apiKeyEnvVar: z.ZodOptional<z.ZodString>;
3111
2874
  model: z.ZodOptional<z.ZodString>;
3112
2875
  maxTokens: z.ZodOptional<z.ZodNumber>;
3113
2876
  temperature: z.ZodOptional<z.ZodNumber>;
3114
2877
  maxToolCalls: z.ZodOptional<z.ZodNumber>;
3115
- }, "strip", z.ZodTypeAny, {
3116
- provider: "openai" | "anthropic" | "azure" | "google" | "mistral" | "deepseek" | "openrouter" | "xai" | "vertex-anthropic";
3117
- model?: string | undefined;
3118
- maxTokens?: number | undefined;
3119
- apiKeyEnvVar?: string | undefined;
3120
- temperature?: number | undefined;
3121
- maxToolCalls?: number | undefined;
3122
- }, {
3123
- provider: "openai" | "anthropic" | "azure" | "google" | "mistral" | "deepseek" | "openrouter" | "xai" | "vertex-anthropic";
3124
- model?: string | undefined;
3125
- maxTokens?: number | undefined;
3126
- apiKeyEnvVar?: string | undefined;
3127
- temperature?: number | undefined;
3128
- maxToolCalls?: number | undefined;
3129
- }>>;
2878
+ }, z.core.$strip>>;
3130
2879
  metadata: z.ZodOptional<z.ZodRecord<z.ZodString, z.ZodUnknown>>;
3131
2880
  iterations: z.ZodOptional<z.ZodNumber>;
3132
2881
  accuracyThreshold: z.ZodOptional<z.ZodNumber>;
3133
2882
  judgeReps: z.ZodOptional<z.ZodNumber>;
3134
2883
  canonicalAnswer: z.ZodOptional<z.ZodString>;
3135
- tags: z.ZodOptional<z.ZodArray<z.ZodString, "many">>;
2884
+ tags: z.ZodOptional<z.ZodArray<z.ZodString>>;
3136
2885
  expect: z.ZodOptional<z.ZodObject<{
3137
2886
  response: z.ZodOptional<z.ZodUnknown>;
3138
2887
  schema: z.ZodOptional<z.ZodString>;
3139
- containsText: z.ZodOptional<z.ZodUnion<[z.ZodString, z.ZodArray<z.ZodString, "many">]>>;
3140
- matchesPattern: z.ZodOptional<z.ZodUnion<[z.ZodString, z.ZodArray<z.ZodString, "many">]>>;
2888
+ containsText: z.ZodOptional<z.ZodUnion<readonly [z.ZodString, z.ZodArray<z.ZodString>]>>;
2889
+ matchesPattern: z.ZodOptional<z.ZodUnion<readonly [z.ZodString, z.ZodArray<z.ZodString>]>>;
3141
2890
  snapshot: z.ZodOptional<z.ZodString>;
3142
- snapshotSanitizers: z.ZodOptional<z.ZodArray<z.ZodUnion<[z.ZodEnum<["timestamp", "uuid", "iso-date", "objectId", "jwt"]>, z.ZodObject<{
2891
+ snapshotSanitizers: z.ZodOptional<z.ZodArray<z.ZodUnion<readonly [z.ZodEnum<{
2892
+ timestamp: "timestamp";
2893
+ uuid: "uuid";
2894
+ "iso-date": "iso-date";
2895
+ objectId: "objectId";
2896
+ jwt: "jwt";
2897
+ }>, z.ZodObject<{
3143
2898
  pattern: z.ZodString;
3144
2899
  replacement: z.ZodOptional<z.ZodString>;
3145
- }, "strip", z.ZodTypeAny, {
3146
- pattern: string;
3147
- replacement?: string | undefined;
3148
- }, {
3149
- pattern: string;
3150
- replacement?: string | undefined;
3151
- }>, z.ZodObject<{
3152
- remove: z.ZodArray<z.ZodString, "many">;
3153
- }, "strip", z.ZodTypeAny, {
3154
- remove: string[];
3155
- }, {
3156
- remove: string[];
3157
- }>]>, "many">>;
3158
- isError: z.ZodOptional<z.ZodUnion<[z.ZodBoolean, z.ZodString, z.ZodArray<z.ZodString, "many">]>>;
2900
+ }, z.core.$strip>, z.ZodObject<{
2901
+ remove: z.ZodArray<z.ZodString>;
2902
+ }, z.core.$strip>]>>>;
2903
+ isError: z.ZodOptional<z.ZodUnion<readonly [z.ZodBoolean, z.ZodString, z.ZodArray<z.ZodString>]>>;
3159
2904
  passesJudge: z.ZodOptional<z.ZodObject<{
3160
- rubric: z.ZodUnion<[z.ZodEnum<["correctness", "completeness", "groundedness", "instruction-following", "conciseness"]>, z.ZodObject<{
2905
+ rubric: z.ZodUnion<readonly [z.ZodEnum<{
2906
+ correctness: "correctness";
2907
+ completeness: "completeness";
2908
+ groundedness: "groundedness";
2909
+ "instruction-following": "instruction-following";
2910
+ conciseness: "conciseness";
2911
+ }>, z.ZodObject<{
3161
2912
  text: z.ZodString;
3162
- }, "strip", z.ZodTypeAny, {
3163
- text: string;
3164
- }, {
3165
- text: string;
3166
- }>]>;
2913
+ }, z.core.$strip>]>;
3167
2914
  reference: z.ZodOptional<z.ZodUnknown>;
3168
2915
  threshold: z.ZodOptional<z.ZodNumber>;
3169
2916
  reps: z.ZodOptional<z.ZodNumber>;
3170
- provider: z.ZodOptional<z.ZodEnum<["anthropic", "openai", "google"]>>;
2917
+ provider: z.ZodOptional<z.ZodEnum<{
2918
+ openai: "openai";
2919
+ anthropic: "anthropic";
2920
+ google: "google";
2921
+ }>>;
3171
2922
  model: z.ZodOptional<z.ZodString>;
3172
2923
  apiKeyEnvVar: z.ZodOptional<z.ZodString>;
3173
2924
  maxTokens: z.ZodOptional<z.ZodNumber>;
3174
2925
  temperature: z.ZodOptional<z.ZodNumber>;
3175
2926
  maxBudgetUsd: z.ZodOptional<z.ZodNumber>;
3176
2927
  maxToolOutputSize: z.ZodOptional<z.ZodNumber>;
3177
- }, "strip", z.ZodTypeAny, {
3178
- rubric: "correctness" | "completeness" | "groundedness" | "instruction-following" | "conciseness" | {
3179
- text: string;
3180
- };
3181
- model?: string | undefined;
3182
- maxTokens?: number | undefined;
3183
- maxBudgetUsd?: number | undefined;
3184
- reference?: unknown;
3185
- threshold?: number | undefined;
3186
- reps?: number | undefined;
3187
- provider?: "openai" | "anthropic" | "google" | undefined;
3188
- apiKeyEnvVar?: string | undefined;
3189
- temperature?: number | undefined;
3190
- maxToolOutputSize?: number | undefined;
3191
- }, {
3192
- rubric: "correctness" | "completeness" | "groundedness" | "instruction-following" | "conciseness" | {
3193
- text: string;
3194
- };
3195
- model?: string | undefined;
3196
- maxTokens?: number | undefined;
3197
- maxBudgetUsd?: number | undefined;
3198
- reference?: unknown;
3199
- threshold?: number | undefined;
3200
- reps?: number | undefined;
3201
- provider?: "openai" | "anthropic" | "google" | undefined;
3202
- apiKeyEnvVar?: string | undefined;
3203
- temperature?: number | undefined;
3204
- maxToolOutputSize?: number | undefined;
3205
- }>>;
2928
+ }, z.core.$strip>>;
3206
2929
  responseSize: z.ZodOptional<z.ZodObject<{
3207
2930
  maxBytes: z.ZodOptional<z.ZodNumber>;
3208
2931
  minBytes: z.ZodOptional<z.ZodNumber>;
3209
- }, "strip", z.ZodTypeAny, {
3210
- maxBytes?: number | undefined;
3211
- minBytes?: number | undefined;
3212
- }, {
3213
- maxBytes?: number | undefined;
3214
- minBytes?: number | undefined;
3215
- }>>;
2932
+ }, z.core.$strip>>;
3216
2933
  toolsTriggered: z.ZodOptional<z.ZodObject<{
3217
2934
  calls: z.ZodArray<z.ZodObject<{
3218
2935
  name: z.ZodString;
3219
2936
  arguments: z.ZodOptional<z.ZodRecord<z.ZodString, z.ZodUnknown>>;
3220
2937
  required: z.ZodOptional<z.ZodBoolean>;
3221
- }, "strip", z.ZodTypeAny, {
3222
- name: string;
3223
- required?: boolean | undefined;
3224
- arguments?: Record<string, unknown> | undefined;
3225
- }, {
3226
- name: string;
3227
- required?: boolean | undefined;
3228
- arguments?: Record<string, unknown> | undefined;
3229
- }>, "many">;
3230
- order: z.ZodOptional<z.ZodEnum<["strict", "any"]>>;
2938
+ }, z.core.$strip>>;
2939
+ order: z.ZodOptional<z.ZodEnum<{
2940
+ any: "any";
2941
+ strict: "strict";
2942
+ }>>;
3231
2943
  exclusive: z.ZodOptional<z.ZodBoolean>;
3232
- }, "strip", z.ZodTypeAny, {
3233
- calls: {
3234
- name: string;
3235
- required?: boolean | undefined;
3236
- arguments?: Record<string, unknown> | undefined;
3237
- }[];
3238
- order?: "strict" | "any" | undefined;
3239
- exclusive?: boolean | undefined;
3240
- }, {
3241
- calls: {
3242
- name: string;
3243
- required?: boolean | undefined;
3244
- arguments?: Record<string, unknown> | undefined;
3245
- }[];
3246
- order?: "strict" | "any" | undefined;
3247
- exclusive?: boolean | undefined;
3248
- }>>;
2944
+ }, z.core.$strip>>;
3249
2945
  toolCallCount: z.ZodOptional<z.ZodObject<{
3250
2946
  min: z.ZodOptional<z.ZodNumber>;
3251
2947
  max: z.ZodOptional<z.ZodNumber>;
3252
2948
  exact: z.ZodOptional<z.ZodNumber>;
3253
- }, "strip", z.ZodTypeAny, {
3254
- exact?: number | undefined;
3255
- min?: number | undefined;
3256
- max?: number | undefined;
3257
- }, {
3258
- exact?: number | undefined;
3259
- min?: number | undefined;
3260
- max?: number | undefined;
3261
- }>>;
3262
- }, "strip", z.ZodTypeAny, {
3263
- response?: unknown;
3264
- isError?: string | boolean | string[] | undefined;
3265
- schema?: string | undefined;
3266
- snapshot?: string | undefined;
3267
- toolsTriggered?: {
3268
- calls: {
3269
- name: string;
3270
- required?: boolean | undefined;
3271
- arguments?: Record<string, unknown> | undefined;
3272
- }[];
3273
- order?: "strict" | "any" | undefined;
3274
- exclusive?: boolean | undefined;
3275
- } | undefined;
3276
- toolCallCount?: {
3277
- exact?: number | undefined;
3278
- min?: number | undefined;
3279
- max?: number | undefined;
3280
- } | undefined;
3281
- containsText?: string | string[] | undefined;
3282
- matchesPattern?: string | string[] | undefined;
3283
- snapshotSanitizers?: ("uuid" | "jwt" | "timestamp" | "iso-date" | "objectId" | {
3284
- pattern: string;
3285
- replacement?: string | undefined;
3286
- } | {
3287
- remove: string[];
3288
- })[] | undefined;
3289
- passesJudge?: {
3290
- rubric: "correctness" | "completeness" | "groundedness" | "instruction-following" | "conciseness" | {
3291
- text: string;
3292
- };
3293
- model?: string | undefined;
3294
- maxTokens?: number | undefined;
3295
- maxBudgetUsd?: number | undefined;
3296
- reference?: unknown;
3297
- threshold?: number | undefined;
3298
- reps?: number | undefined;
3299
- provider?: "openai" | "anthropic" | "google" | undefined;
3300
- apiKeyEnvVar?: string | undefined;
3301
- temperature?: number | undefined;
3302
- maxToolOutputSize?: number | undefined;
3303
- } | undefined;
3304
- responseSize?: {
3305
- maxBytes?: number | undefined;
3306
- minBytes?: number | undefined;
3307
- } | undefined;
3308
- }, {
3309
- response?: unknown;
3310
- isError?: string | boolean | string[] | undefined;
3311
- schema?: string | undefined;
3312
- snapshot?: string | undefined;
3313
- toolsTriggered?: {
3314
- calls: {
3315
- name: string;
3316
- required?: boolean | undefined;
3317
- arguments?: Record<string, unknown> | undefined;
3318
- }[];
3319
- order?: "strict" | "any" | undefined;
3320
- exclusive?: boolean | undefined;
3321
- } | undefined;
3322
- toolCallCount?: {
3323
- exact?: number | undefined;
3324
- min?: number | undefined;
3325
- max?: number | undefined;
3326
- } | undefined;
3327
- containsText?: string | string[] | undefined;
3328
- matchesPattern?: string | string[] | undefined;
3329
- snapshotSanitizers?: ("uuid" | "jwt" | "timestamp" | "iso-date" | "objectId" | {
3330
- pattern: string;
3331
- replacement?: string | undefined;
3332
- } | {
3333
- remove: string[];
3334
- })[] | undefined;
3335
- passesJudge?: {
3336
- rubric: "correctness" | "completeness" | "groundedness" | "instruction-following" | "conciseness" | {
3337
- text: string;
3338
- };
3339
- model?: string | undefined;
3340
- maxTokens?: number | undefined;
3341
- maxBudgetUsd?: number | undefined;
3342
- reference?: unknown;
3343
- threshold?: number | undefined;
3344
- reps?: number | undefined;
3345
- provider?: "openai" | "anthropic" | "google" | undefined;
3346
- apiKeyEnvVar?: string | undefined;
3347
- temperature?: number | undefined;
3348
- maxToolOutputSize?: number | undefined;
3349
- } | undefined;
3350
- responseSize?: {
3351
- maxBytes?: number | undefined;
3352
- minBytes?: number | undefined;
3353
- } | undefined;
3354
- }>>;
3355
- }, "strip", z.ZodTypeAny, {
3356
- id: string;
3357
- args?: Record<string, unknown> | undefined;
3358
- mode?: "direct" | "llm_host" | undefined;
3359
- metadata?: Record<string, unknown> | undefined;
3360
- description?: string | undefined;
3361
- toolName?: string | undefined;
3362
- scenario?: string | undefined;
3363
- llmHostConfig?: {
3364
- provider: "openai" | "anthropic" | "azure" | "google" | "mistral" | "deepseek" | "openrouter" | "xai" | "vertex-anthropic";
3365
- model?: string | undefined;
3366
- maxTokens?: number | undefined;
3367
- apiKeyEnvVar?: string | undefined;
3368
- temperature?: number | undefined;
3369
- maxToolCalls?: number | undefined;
3370
- } | undefined;
3371
- iterations?: number | undefined;
3372
- accuracyThreshold?: number | undefined;
3373
- judgeReps?: number | undefined;
3374
- canonicalAnswer?: string | undefined;
3375
- tags?: string[] | undefined;
3376
- expect?: {
3377
- response?: unknown;
3378
- isError?: string | boolean | string[] | undefined;
3379
- schema?: string | undefined;
3380
- snapshot?: string | undefined;
3381
- toolsTriggered?: {
3382
- calls: {
3383
- name: string;
3384
- required?: boolean | undefined;
3385
- arguments?: Record<string, unknown> | undefined;
3386
- }[];
3387
- order?: "strict" | "any" | undefined;
3388
- exclusive?: boolean | undefined;
3389
- } | undefined;
3390
- toolCallCount?: {
3391
- exact?: number | undefined;
3392
- min?: number | undefined;
3393
- max?: number | undefined;
3394
- } | undefined;
3395
- containsText?: string | string[] | undefined;
3396
- matchesPattern?: string | string[] | undefined;
3397
- snapshotSanitizers?: ("uuid" | "jwt" | "timestamp" | "iso-date" | "objectId" | {
3398
- pattern: string;
3399
- replacement?: string | undefined;
3400
- } | {
3401
- remove: string[];
3402
- })[] | undefined;
3403
- passesJudge?: {
3404
- rubric: "correctness" | "completeness" | "groundedness" | "instruction-following" | "conciseness" | {
3405
- text: string;
3406
- };
3407
- model?: string | undefined;
3408
- maxTokens?: number | undefined;
3409
- maxBudgetUsd?: number | undefined;
3410
- reference?: unknown;
3411
- threshold?: number | undefined;
3412
- reps?: number | undefined;
3413
- provider?: "openai" | "anthropic" | "google" | undefined;
3414
- apiKeyEnvVar?: string | undefined;
3415
- temperature?: number | undefined;
3416
- maxToolOutputSize?: number | undefined;
3417
- } | undefined;
3418
- responseSize?: {
3419
- maxBytes?: number | undefined;
3420
- minBytes?: number | undefined;
3421
- } | undefined;
3422
- } | undefined;
3423
- }, {
3424
- id: string;
3425
- args?: Record<string, unknown> | undefined;
3426
- mode?: "direct" | "llm_host" | undefined;
3427
- metadata?: Record<string, unknown> | undefined;
3428
- description?: string | undefined;
3429
- toolName?: string | undefined;
3430
- scenario?: string | undefined;
3431
- llmHostConfig?: {
3432
- provider: "openai" | "anthropic" | "azure" | "google" | "mistral" | "deepseek" | "openrouter" | "xai" | "vertex-anthropic";
3433
- model?: string | undefined;
3434
- maxTokens?: number | undefined;
3435
- apiKeyEnvVar?: string | undefined;
3436
- temperature?: number | undefined;
3437
- maxToolCalls?: number | undefined;
3438
- } | undefined;
3439
- iterations?: number | undefined;
3440
- accuracyThreshold?: number | undefined;
3441
- judgeReps?: number | undefined;
3442
- canonicalAnswer?: string | undefined;
3443
- tags?: string[] | undefined;
3444
- expect?: {
3445
- response?: unknown;
3446
- isError?: string | boolean | string[] | undefined;
3447
- schema?: string | undefined;
3448
- snapshot?: string | undefined;
3449
- toolsTriggered?: {
3450
- calls: {
3451
- name: string;
3452
- required?: boolean | undefined;
3453
- arguments?: Record<string, unknown> | undefined;
3454
- }[];
3455
- order?: "strict" | "any" | undefined;
3456
- exclusive?: boolean | undefined;
3457
- } | undefined;
3458
- toolCallCount?: {
3459
- exact?: number | undefined;
3460
- min?: number | undefined;
3461
- max?: number | undefined;
3462
- } | undefined;
3463
- containsText?: string | string[] | undefined;
3464
- matchesPattern?: string | string[] | undefined;
3465
- snapshotSanitizers?: ("uuid" | "jwt" | "timestamp" | "iso-date" | "objectId" | {
3466
- pattern: string;
3467
- replacement?: string | undefined;
3468
- } | {
3469
- remove: string[];
3470
- })[] | undefined;
3471
- passesJudge?: {
3472
- rubric: "correctness" | "completeness" | "groundedness" | "instruction-following" | "conciseness" | {
3473
- text: string;
3474
- };
3475
- model?: string | undefined;
3476
- maxTokens?: number | undefined;
3477
- maxBudgetUsd?: number | undefined;
3478
- reference?: unknown;
3479
- threshold?: number | undefined;
3480
- reps?: number | undefined;
3481
- provider?: "openai" | "anthropic" | "google" | undefined;
3482
- apiKeyEnvVar?: string | undefined;
3483
- temperature?: number | undefined;
3484
- maxToolOutputSize?: number | undefined;
3485
- } | undefined;
3486
- responseSize?: {
3487
- maxBytes?: number | undefined;
3488
- minBytes?: number | undefined;
3489
- } | undefined;
3490
- } | undefined;
3491
- }>;
2949
+ }, z.core.$strip>>;
2950
+ }, z.core.$strip>>;
2951
+ }, z.core.$strip>;
3492
2952
  /**
3493
2953
  * Zod schema for EvalDataset (without schemas field, as schemas aren't serializable)
3494
2954
  */
@@ -3498,542 +2958,106 @@ declare const EvalDatasetSchema: z.ZodObject<{
3498
2958
  cases: z.ZodArray<z.ZodObject<{
3499
2959
  id: z.ZodString;
3500
2960
  description: z.ZodOptional<z.ZodString>;
3501
- mode: z.ZodOptional<z.ZodEnum<["direct", "llm_host"]>>;
2961
+ mode: z.ZodOptional<z.ZodEnum<{
2962
+ direct: "direct";
2963
+ mcp_host: "mcp_host";
2964
+ }>>;
3502
2965
  toolName: z.ZodOptional<z.ZodString>;
3503
2966
  args: z.ZodOptional<z.ZodRecord<z.ZodString, z.ZodUnknown>>;
3504
2967
  scenario: z.ZodOptional<z.ZodString>;
3505
- llmHostConfig: z.ZodOptional<z.ZodObject<{
3506
- provider: z.ZodEnum<["openai", "anthropic", "azure", "google", "mistral", "deepseek", "openrouter", "xai", "vertex-anthropic"]>;
2968
+ mcpHostConfig: z.ZodOptional<z.ZodObject<{
2969
+ provider: z.ZodEnum<{
2970
+ openai: "openai";
2971
+ anthropic: "anthropic";
2972
+ azure: "azure";
2973
+ google: "google";
2974
+ mistral: "mistral";
2975
+ deepseek: "deepseek";
2976
+ openrouter: "openrouter";
2977
+ xai: "xai";
2978
+ "vertex-anthropic": "vertex-anthropic";
2979
+ }>;
3507
2980
  apiKeyEnvVar: z.ZodOptional<z.ZodString>;
3508
2981
  model: z.ZodOptional<z.ZodString>;
3509
2982
  maxTokens: z.ZodOptional<z.ZodNumber>;
3510
2983
  temperature: z.ZodOptional<z.ZodNumber>;
3511
2984
  maxToolCalls: z.ZodOptional<z.ZodNumber>;
3512
- }, "strip", z.ZodTypeAny, {
3513
- provider: "openai" | "anthropic" | "azure" | "google" | "mistral" | "deepseek" | "openrouter" | "xai" | "vertex-anthropic";
3514
- model?: string | undefined;
3515
- maxTokens?: number | undefined;
3516
- apiKeyEnvVar?: string | undefined;
3517
- temperature?: number | undefined;
3518
- maxToolCalls?: number | undefined;
3519
- }, {
3520
- provider: "openai" | "anthropic" | "azure" | "google" | "mistral" | "deepseek" | "openrouter" | "xai" | "vertex-anthropic";
3521
- model?: string | undefined;
3522
- maxTokens?: number | undefined;
3523
- apiKeyEnvVar?: string | undefined;
3524
- temperature?: number | undefined;
3525
- maxToolCalls?: number | undefined;
3526
- }>>;
2985
+ }, z.core.$strip>>;
3527
2986
  metadata: z.ZodOptional<z.ZodRecord<z.ZodString, z.ZodUnknown>>;
3528
2987
  iterations: z.ZodOptional<z.ZodNumber>;
3529
2988
  accuracyThreshold: z.ZodOptional<z.ZodNumber>;
3530
2989
  judgeReps: z.ZodOptional<z.ZodNumber>;
3531
2990
  canonicalAnswer: z.ZodOptional<z.ZodString>;
3532
- tags: z.ZodOptional<z.ZodArray<z.ZodString, "many">>;
2991
+ tags: z.ZodOptional<z.ZodArray<z.ZodString>>;
3533
2992
  expect: z.ZodOptional<z.ZodObject<{
3534
2993
  response: z.ZodOptional<z.ZodUnknown>;
3535
2994
  schema: z.ZodOptional<z.ZodString>;
3536
- containsText: z.ZodOptional<z.ZodUnion<[z.ZodString, z.ZodArray<z.ZodString, "many">]>>;
3537
- matchesPattern: z.ZodOptional<z.ZodUnion<[z.ZodString, z.ZodArray<z.ZodString, "many">]>>;
2995
+ containsText: z.ZodOptional<z.ZodUnion<readonly [z.ZodString, z.ZodArray<z.ZodString>]>>;
2996
+ matchesPattern: z.ZodOptional<z.ZodUnion<readonly [z.ZodString, z.ZodArray<z.ZodString>]>>;
3538
2997
  snapshot: z.ZodOptional<z.ZodString>;
3539
- snapshotSanitizers: z.ZodOptional<z.ZodArray<z.ZodUnion<[z.ZodEnum<["timestamp", "uuid", "iso-date", "objectId", "jwt"]>, z.ZodObject<{
2998
+ snapshotSanitizers: z.ZodOptional<z.ZodArray<z.ZodUnion<readonly [z.ZodEnum<{
2999
+ timestamp: "timestamp";
3000
+ uuid: "uuid";
3001
+ "iso-date": "iso-date";
3002
+ objectId: "objectId";
3003
+ jwt: "jwt";
3004
+ }>, z.ZodObject<{
3540
3005
  pattern: z.ZodString;
3541
3006
  replacement: z.ZodOptional<z.ZodString>;
3542
- }, "strip", z.ZodTypeAny, {
3543
- pattern: string;
3544
- replacement?: string | undefined;
3545
- }, {
3546
- pattern: string;
3547
- replacement?: string | undefined;
3548
- }>, z.ZodObject<{
3549
- remove: z.ZodArray<z.ZodString, "many">;
3550
- }, "strip", z.ZodTypeAny, {
3551
- remove: string[];
3552
- }, {
3553
- remove: string[];
3554
- }>]>, "many">>;
3555
- isError: z.ZodOptional<z.ZodUnion<[z.ZodBoolean, z.ZodString, z.ZodArray<z.ZodString, "many">]>>;
3007
+ }, z.core.$strip>, z.ZodObject<{
3008
+ remove: z.ZodArray<z.ZodString>;
3009
+ }, z.core.$strip>]>>>;
3010
+ isError: z.ZodOptional<z.ZodUnion<readonly [z.ZodBoolean, z.ZodString, z.ZodArray<z.ZodString>]>>;
3556
3011
  passesJudge: z.ZodOptional<z.ZodObject<{
3557
- rubric: z.ZodUnion<[z.ZodEnum<["correctness", "completeness", "groundedness", "instruction-following", "conciseness"]>, z.ZodObject<{
3012
+ rubric: z.ZodUnion<readonly [z.ZodEnum<{
3013
+ correctness: "correctness";
3014
+ completeness: "completeness";
3015
+ groundedness: "groundedness";
3016
+ "instruction-following": "instruction-following";
3017
+ conciseness: "conciseness";
3018
+ }>, z.ZodObject<{
3558
3019
  text: z.ZodString;
3559
- }, "strip", z.ZodTypeAny, {
3560
- text: string;
3561
- }, {
3562
- text: string;
3563
- }>]>;
3020
+ }, z.core.$strip>]>;
3564
3021
  reference: z.ZodOptional<z.ZodUnknown>;
3565
3022
  threshold: z.ZodOptional<z.ZodNumber>;
3566
3023
  reps: z.ZodOptional<z.ZodNumber>;
3567
- provider: z.ZodOptional<z.ZodEnum<["anthropic", "openai", "google"]>>;
3024
+ provider: z.ZodOptional<z.ZodEnum<{
3025
+ openai: "openai";
3026
+ anthropic: "anthropic";
3027
+ google: "google";
3028
+ }>>;
3568
3029
  model: z.ZodOptional<z.ZodString>;
3569
3030
  apiKeyEnvVar: z.ZodOptional<z.ZodString>;
3570
3031
  maxTokens: z.ZodOptional<z.ZodNumber>;
3571
3032
  temperature: z.ZodOptional<z.ZodNumber>;
3572
3033
  maxBudgetUsd: z.ZodOptional<z.ZodNumber>;
3573
3034
  maxToolOutputSize: z.ZodOptional<z.ZodNumber>;
3574
- }, "strip", z.ZodTypeAny, {
3575
- rubric: "correctness" | "completeness" | "groundedness" | "instruction-following" | "conciseness" | {
3576
- text: string;
3577
- };
3578
- model?: string | undefined;
3579
- maxTokens?: number | undefined;
3580
- maxBudgetUsd?: number | undefined;
3581
- reference?: unknown;
3582
- threshold?: number | undefined;
3583
- reps?: number | undefined;
3584
- provider?: "openai" | "anthropic" | "google" | undefined;
3585
- apiKeyEnvVar?: string | undefined;
3586
- temperature?: number | undefined;
3587
- maxToolOutputSize?: number | undefined;
3588
- }, {
3589
- rubric: "correctness" | "completeness" | "groundedness" | "instruction-following" | "conciseness" | {
3590
- text: string;
3591
- };
3592
- model?: string | undefined;
3593
- maxTokens?: number | undefined;
3594
- maxBudgetUsd?: number | undefined;
3595
- reference?: unknown;
3596
- threshold?: number | undefined;
3597
- reps?: number | undefined;
3598
- provider?: "openai" | "anthropic" | "google" | undefined;
3599
- apiKeyEnvVar?: string | undefined;
3600
- temperature?: number | undefined;
3601
- maxToolOutputSize?: number | undefined;
3602
- }>>;
3035
+ }, z.core.$strip>>;
3603
3036
  responseSize: z.ZodOptional<z.ZodObject<{
3604
3037
  maxBytes: z.ZodOptional<z.ZodNumber>;
3605
3038
  minBytes: z.ZodOptional<z.ZodNumber>;
3606
- }, "strip", z.ZodTypeAny, {
3607
- maxBytes?: number | undefined;
3608
- minBytes?: number | undefined;
3609
- }, {
3610
- maxBytes?: number | undefined;
3611
- minBytes?: number | undefined;
3612
- }>>;
3039
+ }, z.core.$strip>>;
3613
3040
  toolsTriggered: z.ZodOptional<z.ZodObject<{
3614
3041
  calls: z.ZodArray<z.ZodObject<{
3615
3042
  name: z.ZodString;
3616
3043
  arguments: z.ZodOptional<z.ZodRecord<z.ZodString, z.ZodUnknown>>;
3617
3044
  required: z.ZodOptional<z.ZodBoolean>;
3618
- }, "strip", z.ZodTypeAny, {
3619
- name: string;
3620
- required?: boolean | undefined;
3621
- arguments?: Record<string, unknown> | undefined;
3622
- }, {
3623
- name: string;
3624
- required?: boolean | undefined;
3625
- arguments?: Record<string, unknown> | undefined;
3626
- }>, "many">;
3627
- order: z.ZodOptional<z.ZodEnum<["strict", "any"]>>;
3045
+ }, z.core.$strip>>;
3046
+ order: z.ZodOptional<z.ZodEnum<{
3047
+ any: "any";
3048
+ strict: "strict";
3049
+ }>>;
3628
3050
  exclusive: z.ZodOptional<z.ZodBoolean>;
3629
- }, "strip", z.ZodTypeAny, {
3630
- calls: {
3631
- name: string;
3632
- required?: boolean | undefined;
3633
- arguments?: Record<string, unknown> | undefined;
3634
- }[];
3635
- order?: "strict" | "any" | undefined;
3636
- exclusive?: boolean | undefined;
3637
- }, {
3638
- calls: {
3639
- name: string;
3640
- required?: boolean | undefined;
3641
- arguments?: Record<string, unknown> | undefined;
3642
- }[];
3643
- order?: "strict" | "any" | undefined;
3644
- exclusive?: boolean | undefined;
3645
- }>>;
3051
+ }, z.core.$strip>>;
3646
3052
  toolCallCount: z.ZodOptional<z.ZodObject<{
3647
3053
  min: z.ZodOptional<z.ZodNumber>;
3648
3054
  max: z.ZodOptional<z.ZodNumber>;
3649
3055
  exact: z.ZodOptional<z.ZodNumber>;
3650
- }, "strip", z.ZodTypeAny, {
3651
- exact?: number | undefined;
3652
- min?: number | undefined;
3653
- max?: number | undefined;
3654
- }, {
3655
- exact?: number | undefined;
3656
- min?: number | undefined;
3657
- max?: number | undefined;
3658
- }>>;
3659
- }, "strip", z.ZodTypeAny, {
3660
- response?: unknown;
3661
- isError?: string | boolean | string[] | undefined;
3662
- schema?: string | undefined;
3663
- snapshot?: string | undefined;
3664
- toolsTriggered?: {
3665
- calls: {
3666
- name: string;
3667
- required?: boolean | undefined;
3668
- arguments?: Record<string, unknown> | undefined;
3669
- }[];
3670
- order?: "strict" | "any" | undefined;
3671
- exclusive?: boolean | undefined;
3672
- } | undefined;
3673
- toolCallCount?: {
3674
- exact?: number | undefined;
3675
- min?: number | undefined;
3676
- max?: number | undefined;
3677
- } | undefined;
3678
- containsText?: string | string[] | undefined;
3679
- matchesPattern?: string | string[] | undefined;
3680
- snapshotSanitizers?: ("uuid" | "jwt" | "timestamp" | "iso-date" | "objectId" | {
3681
- pattern: string;
3682
- replacement?: string | undefined;
3683
- } | {
3684
- remove: string[];
3685
- })[] | undefined;
3686
- passesJudge?: {
3687
- rubric: "correctness" | "completeness" | "groundedness" | "instruction-following" | "conciseness" | {
3688
- text: string;
3689
- };
3690
- model?: string | undefined;
3691
- maxTokens?: number | undefined;
3692
- maxBudgetUsd?: number | undefined;
3693
- reference?: unknown;
3694
- threshold?: number | undefined;
3695
- reps?: number | undefined;
3696
- provider?: "openai" | "anthropic" | "google" | undefined;
3697
- apiKeyEnvVar?: string | undefined;
3698
- temperature?: number | undefined;
3699
- maxToolOutputSize?: number | undefined;
3700
- } | undefined;
3701
- responseSize?: {
3702
- maxBytes?: number | undefined;
3703
- minBytes?: number | undefined;
3704
- } | undefined;
3705
- }, {
3706
- response?: unknown;
3707
- isError?: string | boolean | string[] | undefined;
3708
- schema?: string | undefined;
3709
- snapshot?: string | undefined;
3710
- toolsTriggered?: {
3711
- calls: {
3712
- name: string;
3713
- required?: boolean | undefined;
3714
- arguments?: Record<string, unknown> | undefined;
3715
- }[];
3716
- order?: "strict" | "any" | undefined;
3717
- exclusive?: boolean | undefined;
3718
- } | undefined;
3719
- toolCallCount?: {
3720
- exact?: number | undefined;
3721
- min?: number | undefined;
3722
- max?: number | undefined;
3723
- } | undefined;
3724
- containsText?: string | string[] | undefined;
3725
- matchesPattern?: string | string[] | undefined;
3726
- snapshotSanitizers?: ("uuid" | "jwt" | "timestamp" | "iso-date" | "objectId" | {
3727
- pattern: string;
3728
- replacement?: string | undefined;
3729
- } | {
3730
- remove: string[];
3731
- })[] | undefined;
3732
- passesJudge?: {
3733
- rubric: "correctness" | "completeness" | "groundedness" | "instruction-following" | "conciseness" | {
3734
- text: string;
3735
- };
3736
- model?: string | undefined;
3737
- maxTokens?: number | undefined;
3738
- maxBudgetUsd?: number | undefined;
3739
- reference?: unknown;
3740
- threshold?: number | undefined;
3741
- reps?: number | undefined;
3742
- provider?: "openai" | "anthropic" | "google" | undefined;
3743
- apiKeyEnvVar?: string | undefined;
3744
- temperature?: number | undefined;
3745
- maxToolOutputSize?: number | undefined;
3746
- } | undefined;
3747
- responseSize?: {
3748
- maxBytes?: number | undefined;
3749
- minBytes?: number | undefined;
3750
- } | undefined;
3751
- }>>;
3752
- }, "strip", z.ZodTypeAny, {
3753
- id: string;
3754
- args?: Record<string, unknown> | undefined;
3755
- mode?: "direct" | "llm_host" | undefined;
3756
- metadata?: Record<string, unknown> | undefined;
3757
- description?: string | undefined;
3758
- toolName?: string | undefined;
3759
- scenario?: string | undefined;
3760
- llmHostConfig?: {
3761
- provider: "openai" | "anthropic" | "azure" | "google" | "mistral" | "deepseek" | "openrouter" | "xai" | "vertex-anthropic";
3762
- model?: string | undefined;
3763
- maxTokens?: number | undefined;
3764
- apiKeyEnvVar?: string | undefined;
3765
- temperature?: number | undefined;
3766
- maxToolCalls?: number | undefined;
3767
- } | undefined;
3768
- iterations?: number | undefined;
3769
- accuracyThreshold?: number | undefined;
3770
- judgeReps?: number | undefined;
3771
- canonicalAnswer?: string | undefined;
3772
- tags?: string[] | undefined;
3773
- expect?: {
3774
- response?: unknown;
3775
- isError?: string | boolean | string[] | undefined;
3776
- schema?: string | undefined;
3777
- snapshot?: string | undefined;
3778
- toolsTriggered?: {
3779
- calls: {
3780
- name: string;
3781
- required?: boolean | undefined;
3782
- arguments?: Record<string, unknown> | undefined;
3783
- }[];
3784
- order?: "strict" | "any" | undefined;
3785
- exclusive?: boolean | undefined;
3786
- } | undefined;
3787
- toolCallCount?: {
3788
- exact?: number | undefined;
3789
- min?: number | undefined;
3790
- max?: number | undefined;
3791
- } | undefined;
3792
- containsText?: string | string[] | undefined;
3793
- matchesPattern?: string | string[] | undefined;
3794
- snapshotSanitizers?: ("uuid" | "jwt" | "timestamp" | "iso-date" | "objectId" | {
3795
- pattern: string;
3796
- replacement?: string | undefined;
3797
- } | {
3798
- remove: string[];
3799
- })[] | undefined;
3800
- passesJudge?: {
3801
- rubric: "correctness" | "completeness" | "groundedness" | "instruction-following" | "conciseness" | {
3802
- text: string;
3803
- };
3804
- model?: string | undefined;
3805
- maxTokens?: number | undefined;
3806
- maxBudgetUsd?: number | undefined;
3807
- reference?: unknown;
3808
- threshold?: number | undefined;
3809
- reps?: number | undefined;
3810
- provider?: "openai" | "anthropic" | "google" | undefined;
3811
- apiKeyEnvVar?: string | undefined;
3812
- temperature?: number | undefined;
3813
- maxToolOutputSize?: number | undefined;
3814
- } | undefined;
3815
- responseSize?: {
3816
- maxBytes?: number | undefined;
3817
- minBytes?: number | undefined;
3818
- } | undefined;
3819
- } | undefined;
3820
- }, {
3821
- id: string;
3822
- args?: Record<string, unknown> | undefined;
3823
- mode?: "direct" | "llm_host" | undefined;
3824
- metadata?: Record<string, unknown> | undefined;
3825
- description?: string | undefined;
3826
- toolName?: string | undefined;
3827
- scenario?: string | undefined;
3828
- llmHostConfig?: {
3829
- provider: "openai" | "anthropic" | "azure" | "google" | "mistral" | "deepseek" | "openrouter" | "xai" | "vertex-anthropic";
3830
- model?: string | undefined;
3831
- maxTokens?: number | undefined;
3832
- apiKeyEnvVar?: string | undefined;
3833
- temperature?: number | undefined;
3834
- maxToolCalls?: number | undefined;
3835
- } | undefined;
3836
- iterations?: number | undefined;
3837
- accuracyThreshold?: number | undefined;
3838
- judgeReps?: number | undefined;
3839
- canonicalAnswer?: string | undefined;
3840
- tags?: string[] | undefined;
3841
- expect?: {
3842
- response?: unknown;
3843
- isError?: string | boolean | string[] | undefined;
3844
- schema?: string | undefined;
3845
- snapshot?: string | undefined;
3846
- toolsTriggered?: {
3847
- calls: {
3848
- name: string;
3849
- required?: boolean | undefined;
3850
- arguments?: Record<string, unknown> | undefined;
3851
- }[];
3852
- order?: "strict" | "any" | undefined;
3853
- exclusive?: boolean | undefined;
3854
- } | undefined;
3855
- toolCallCount?: {
3856
- exact?: number | undefined;
3857
- min?: number | undefined;
3858
- max?: number | undefined;
3859
- } | undefined;
3860
- containsText?: string | string[] | undefined;
3861
- matchesPattern?: string | string[] | undefined;
3862
- snapshotSanitizers?: ("uuid" | "jwt" | "timestamp" | "iso-date" | "objectId" | {
3863
- pattern: string;
3864
- replacement?: string | undefined;
3865
- } | {
3866
- remove: string[];
3867
- })[] | undefined;
3868
- passesJudge?: {
3869
- rubric: "correctness" | "completeness" | "groundedness" | "instruction-following" | "conciseness" | {
3870
- text: string;
3871
- };
3872
- model?: string | undefined;
3873
- maxTokens?: number | undefined;
3874
- maxBudgetUsd?: number | undefined;
3875
- reference?: unknown;
3876
- threshold?: number | undefined;
3877
- reps?: number | undefined;
3878
- provider?: "openai" | "anthropic" | "google" | undefined;
3879
- apiKeyEnvVar?: string | undefined;
3880
- temperature?: number | undefined;
3881
- maxToolOutputSize?: number | undefined;
3882
- } | undefined;
3883
- responseSize?: {
3884
- maxBytes?: number | undefined;
3885
- minBytes?: number | undefined;
3886
- } | undefined;
3887
- } | undefined;
3888
- }>, "many">;
3056
+ }, z.core.$strip>>;
3057
+ }, z.core.$strip>>;
3058
+ }, z.core.$strip>>;
3889
3059
  metadata: z.ZodOptional<z.ZodRecord<z.ZodString, z.ZodUnknown>>;
3890
- }, "strip", z.ZodTypeAny, {
3891
- name: string;
3892
- cases: {
3893
- id: string;
3894
- args?: Record<string, unknown> | undefined;
3895
- mode?: "direct" | "llm_host" | undefined;
3896
- metadata?: Record<string, unknown> | undefined;
3897
- description?: string | undefined;
3898
- toolName?: string | undefined;
3899
- scenario?: string | undefined;
3900
- llmHostConfig?: {
3901
- provider: "openai" | "anthropic" | "azure" | "google" | "mistral" | "deepseek" | "openrouter" | "xai" | "vertex-anthropic";
3902
- model?: string | undefined;
3903
- maxTokens?: number | undefined;
3904
- apiKeyEnvVar?: string | undefined;
3905
- temperature?: number | undefined;
3906
- maxToolCalls?: number | undefined;
3907
- } | undefined;
3908
- iterations?: number | undefined;
3909
- accuracyThreshold?: number | undefined;
3910
- judgeReps?: number | undefined;
3911
- canonicalAnswer?: string | undefined;
3912
- tags?: string[] | undefined;
3913
- expect?: {
3914
- response?: unknown;
3915
- isError?: string | boolean | string[] | undefined;
3916
- schema?: string | undefined;
3917
- snapshot?: string | undefined;
3918
- toolsTriggered?: {
3919
- calls: {
3920
- name: string;
3921
- required?: boolean | undefined;
3922
- arguments?: Record<string, unknown> | undefined;
3923
- }[];
3924
- order?: "strict" | "any" | undefined;
3925
- exclusive?: boolean | undefined;
3926
- } | undefined;
3927
- toolCallCount?: {
3928
- exact?: number | undefined;
3929
- min?: number | undefined;
3930
- max?: number | undefined;
3931
- } | undefined;
3932
- containsText?: string | string[] | undefined;
3933
- matchesPattern?: string | string[] | undefined;
3934
- snapshotSanitizers?: ("uuid" | "jwt" | "timestamp" | "iso-date" | "objectId" | {
3935
- pattern: string;
3936
- replacement?: string | undefined;
3937
- } | {
3938
- remove: string[];
3939
- })[] | undefined;
3940
- passesJudge?: {
3941
- rubric: "correctness" | "completeness" | "groundedness" | "instruction-following" | "conciseness" | {
3942
- text: string;
3943
- };
3944
- model?: string | undefined;
3945
- maxTokens?: number | undefined;
3946
- maxBudgetUsd?: number | undefined;
3947
- reference?: unknown;
3948
- threshold?: number | undefined;
3949
- reps?: number | undefined;
3950
- provider?: "openai" | "anthropic" | "google" | undefined;
3951
- apiKeyEnvVar?: string | undefined;
3952
- temperature?: number | undefined;
3953
- maxToolOutputSize?: number | undefined;
3954
- } | undefined;
3955
- responseSize?: {
3956
- maxBytes?: number | undefined;
3957
- minBytes?: number | undefined;
3958
- } | undefined;
3959
- } | undefined;
3960
- }[];
3961
- metadata?: Record<string, unknown> | undefined;
3962
- description?: string | undefined;
3963
- }, {
3964
- name: string;
3965
- cases: {
3966
- id: string;
3967
- args?: Record<string, unknown> | undefined;
3968
- mode?: "direct" | "llm_host" | undefined;
3969
- metadata?: Record<string, unknown> | undefined;
3970
- description?: string | undefined;
3971
- toolName?: string | undefined;
3972
- scenario?: string | undefined;
3973
- llmHostConfig?: {
3974
- provider: "openai" | "anthropic" | "azure" | "google" | "mistral" | "deepseek" | "openrouter" | "xai" | "vertex-anthropic";
3975
- model?: string | undefined;
3976
- maxTokens?: number | undefined;
3977
- apiKeyEnvVar?: string | undefined;
3978
- temperature?: number | undefined;
3979
- maxToolCalls?: number | undefined;
3980
- } | undefined;
3981
- iterations?: number | undefined;
3982
- accuracyThreshold?: number | undefined;
3983
- judgeReps?: number | undefined;
3984
- canonicalAnswer?: string | undefined;
3985
- tags?: string[] | undefined;
3986
- expect?: {
3987
- response?: unknown;
3988
- isError?: string | boolean | string[] | undefined;
3989
- schema?: string | undefined;
3990
- snapshot?: string | undefined;
3991
- toolsTriggered?: {
3992
- calls: {
3993
- name: string;
3994
- required?: boolean | undefined;
3995
- arguments?: Record<string, unknown> | undefined;
3996
- }[];
3997
- order?: "strict" | "any" | undefined;
3998
- exclusive?: boolean | undefined;
3999
- } | undefined;
4000
- toolCallCount?: {
4001
- exact?: number | undefined;
4002
- min?: number | undefined;
4003
- max?: number | undefined;
4004
- } | undefined;
4005
- containsText?: string | string[] | undefined;
4006
- matchesPattern?: string | string[] | undefined;
4007
- snapshotSanitizers?: ("uuid" | "jwt" | "timestamp" | "iso-date" | "objectId" | {
4008
- pattern: string;
4009
- replacement?: string | undefined;
4010
- } | {
4011
- remove: string[];
4012
- })[] | undefined;
4013
- passesJudge?: {
4014
- rubric: "correctness" | "completeness" | "groundedness" | "instruction-following" | "conciseness" | {
4015
- text: string;
4016
- };
4017
- model?: string | undefined;
4018
- maxTokens?: number | undefined;
4019
- maxBudgetUsd?: number | undefined;
4020
- reference?: unknown;
4021
- threshold?: number | undefined;
4022
- reps?: number | undefined;
4023
- provider?: "openai" | "anthropic" | "google" | undefined;
4024
- apiKeyEnvVar?: string | undefined;
4025
- temperature?: number | undefined;
4026
- maxToolOutputSize?: number | undefined;
4027
- } | undefined;
4028
- responseSize?: {
4029
- maxBytes?: number | undefined;
4030
- minBytes?: number | undefined;
4031
- } | undefined;
4032
- } | undefined;
4033
- }[];
4034
- metadata?: Record<string, unknown> | undefined;
4035
- description?: string | undefined;
4036
- }>;
3060
+ }, z.core.$strip>;
4037
3061
  /**
4038
3062
  * Type for serialized eval dataset (without Zod schemas)
4039
3063
  */
@@ -4130,15 +3154,15 @@ interface EvalRunMetadata {
4130
3154
  timestamp: string;
4131
3155
  /** Package version from package.json */
4132
3156
  packageVersion: string;
4133
- /** LLM host model identifier (if llm_host mode) */
4134
- llmHostModel?: string;
3157
+ /** MCP host model identifier (if mcp_host mode) */
3158
+ mcpHostModel?: string;
4135
3159
  /** Judge model identifier (if judge was used) */
4136
3160
  judgeModel?: string;
4137
3161
  }
4138
3162
  /**
4139
3163
  * Individual conformance check result
4140
3164
  */
4141
- interface MCPConformanceCheck$1 {
3165
+ interface MCPConformanceCheck {
4142
3166
  /**
4143
3167
  * Check name (e.g., 'server_info_present', 'list_tools_succeeds')
4144
3168
  */
@@ -4167,7 +3191,7 @@ interface MCPConformanceResultData {
4167
3191
  /**
4168
3192
  * Individual check results
4169
3193
  */
4170
- checks: MCPConformanceCheck$1[];
3194
+ checks: MCPConformanceCheck[];
4171
3195
  /**
4172
3196
  * Server info if available
4173
3197
  */
@@ -4302,7 +3326,7 @@ interface EvalCaseResult {
4302
3326
  /**
4303
3327
  * Precision of tool calls made (0–1).
4304
3328
  * 1.0 means every tool called was expected; <1.0 means unexpected tools were called.
4305
- * Only populated when exclusive: true in toolsTriggered and the expectation was evaluated.
3329
+ * Populated whenever a `toolsTriggered` expectation is evaluated.
4306
3330
  */
4307
3331
  toolPrecision?: number;
4308
3332
  /**
@@ -4321,6 +3345,23 @@ interface EvalCaseResult {
4321
3345
  * Only present when the case was run with `iterations > 1`.
4322
3346
  */
4323
3347
  infrastructureErrorCount?: number;
3348
+ /**
3349
+ * Ordered trace of tool calls made by the LLM in mcp_host mode.
3350
+ * Only populated when the eval case uses toolsTriggered expectations.
3351
+ */
3352
+ mcpHostTrace?: {
3353
+ /** The ordered sequence of tool calls made by the LLM */
3354
+ calls: Array<{
3355
+ name: string;
3356
+ arguments: Record<string, unknown>;
3357
+ /** 'expected' = was in the expected set, 'unexpected' = was not expected */
3358
+ status: 'expected' | 'unexpected';
3359
+ }>;
3360
+ /** Tools that were required but never called */
3361
+ missed: Array<{
3362
+ name: string;
3363
+ }>;
3364
+ };
4324
3365
  }
4325
3366
  /**
4326
3367
  * Aggregated MCP eval run data
@@ -4464,13 +3505,13 @@ interface EvalRunnerResult {
4464
3505
  */
4465
3506
  improvements?: number;
4466
3507
  /**
4467
- * Average tool precision across all llm_host cases that have a
3508
+ * Average tool precision across all mcp_host cases that have a
4468
3509
  * `toolsTriggered` expectation (precision = fraction of called tools
4469
3510
  * that were expected). Only present when at least one such case ran.
4470
3511
  */
4471
3512
  datasetToolPrecision?: number;
4472
3513
  /**
4473
- * Average tool recall across all llm_host cases that have a
3514
+ * Average tool recall across all mcp_host cases that have a
4474
3515
  * `toolsTriggered` expectation (recall = fraction of required tools
4475
3516
  * that were actually called). Only present when at least one such case ran.
4476
3517
  */
@@ -4525,7 +3566,7 @@ interface EvalRunnerOptions {
4525
3566
  */
4526
3567
  concurrency?: number;
4527
3568
  /**
4528
- * Default iteration count for `llm_host` mode cases that do not specify
3569
+ * Default iteration count for `mcp_host` mode cases that do not specify
4529
3570
  * `iterations` explicitly. Has no effect on `direct` mode cases (which are
4530
3571
  * deterministic and always default to 1 iteration).
4531
3572
  *
@@ -4536,7 +3577,7 @@ interface EvalRunnerOptions {
4536
3577
  *
4537
3578
  * @example
4538
3579
  * ```typescript
4539
- * // Run all llm_host cases 10 times each by default
3580
+ * // Run all mcp_host cases 10 times each by default
4540
3581
  * await runEvalDataset({ dataset, defaultLlmIterations: 10 }, { mcp });
4541
3582
  * ```
4542
3583
  */
@@ -4569,12 +3610,12 @@ interface EvalRunnerOptions {
4569
3610
  */
4570
3611
  baselineResultsFrom?: string;
4571
3612
  /**
4572
- * LLM host model identifier to record in run metadata.
4573
- * Use this to identify which model was used when running llm_host cases.
3613
+ * MCP host model identifier to record in run metadata.
3614
+ * Use this to identify which model was used when running mcp_host cases.
4574
3615
  *
4575
3616
  * @example 'claude-opus-4-20250514'
4576
3617
  */
4577
- llmHostModel?: string;
3618
+ mcpHostModel?: string;
4578
3619
  /**
4579
3620
  * Judge model identifier to record in run metadata.
4580
3621
  * Use this to identify which model was used for judge evaluations.
@@ -4662,8 +3703,6 @@ interface ServerComparisonResult {
4662
3703
  ties: number;
4663
3704
  /** Cases where both failed */
4664
3705
  bothFail: number;
4665
- /** Raw count of cases where both servers failed (same as bothFail) */
4666
- bothFailCount: number;
4667
3706
  /** Cases with a decisive outcome (aWins + bWins + ties, excludes BOTH_FAIL) */
4668
3707
  decidedCases: number;
4669
3708
  /** Fraction of total cases where both servers failed (bothFail / total) */
@@ -4714,7 +3753,7 @@ type ServerComparisonOptions = Omit<EvalRunnerOptions, 'saveResultsTo' | 'baseli
4714
3753
  declare function runServerComparison(options: ServerComparisonOptions, contextA: EvalContext, contextB: EvalContext): Promise<ServerComparisonResult>;
4715
3754
 
4716
3755
  /**
4717
- * LLM Host Simulation - Main entry point
3756
+ * MCP Host Simulation - Main entry point
4718
3757
  *
4719
3758
  * All providers (openai, anthropic, google, azure, mistral, deepseek,
4720
3759
  * openrouter, xai) run through the Vercel AI SDK orchestrator, which uses
@@ -4733,7 +3772,7 @@ declare function runServerComparison(options: ServerComparisonOptions, contextA:
4733
3772
  */
4734
3773
 
4735
3774
  /**
4736
- * Simulates an LLM host interacting with an MCP server.
3775
+ * Simulates an MCP host interacting with an MCP server.
4737
3776
  *
4738
3777
  * The LLM chooses which tools to call based solely on their descriptions and
4739
3778
  * schemas, testing discoverability and parameter clarity at the level a real
@@ -4745,12 +3784,12 @@ declare function runServerComparison(options: ServerComparisonOptions, contextA:
4745
3784
  *
4746
3785
  * @param mcp - MCP fixture API
4747
3786
  * @param scenario - Natural language prompt describing what the LLM should do
4748
- * @param config - LLM host configuration (provider, model, temperature, etc.)
3787
+ * @param config - MCP host configuration (provider, model, temperature, etc.)
4749
3788
  * @returns Simulation result with tool calls, final response, and latency data
4750
3789
  *
4751
3790
  * @example
4752
3791
  * ```typescript
4753
- * const result = await simulateLLMHost(mcp,
3792
+ * const result = await simulateMCPHost(mcp,
4754
3793
  * "Find recent documents about MCP testing frameworks",
4755
3794
  * { provider: 'anthropic', model: 'claude-3-5-sonnet-20241022' }
4756
3795
  * );
@@ -4759,7 +3798,7 @@ declare function runServerComparison(options: ServerComparisonOptions, contextA:
4759
3798
  * expect(result.toolCalls.map(c => c.name)).toContain('search');
4760
3799
  * ```
4761
3800
  */
4762
- declare function simulateLLMHost(mcp: MCPFixtureApi, scenario: string, config: LLMHostConfig): Promise<LLMHostSimulationResult>;
3801
+ declare function simulateMCPHost(mcp: MCPFixtureApi, scenario: string, config: MCPHostConfig): Promise<MCPHostSimulationResult>;
4763
3802
  /**
4764
3803
  * Returns true if the given provider is supported.
4765
3804
  *
@@ -4838,14 +3877,6 @@ interface MCPConformanceOptions {
4838
3877
  */
4839
3878
  checkPrompts?: boolean;
4840
3879
  }
4841
- /**
4842
- * Individual check result
4843
- */
4844
- interface MCPConformanceCheck {
4845
- name: string;
4846
- pass: boolean;
4847
- message: string;
4848
- }
4849
3880
  /**
4850
3881
  * Raw MCP responses for snapshotting
4851
3882
  */
@@ -4978,4 +4009,4 @@ interface MCPEvalReporterConfig {
4978
4009
  includeAutoTracking?: boolean;
4979
4010
  }
4980
4011
 
4981
- export { type AuthType, BUILT_IN_RUBRICS, type BuiltInRubric, type BuiltInSanitizer, CLIOAuthClient, type CLIOAuthClientConfig, type CLIOAuthResult, type CaseComparisonResult, type ClientCredentialsConfig, type ComparisonOutcome, type ContentBlock, type CreateMCPClientOptions, DiscoveryError, ENV_VAR_NAMES, type EvalCase, type EvalCaseResult, EvalCaseSchema, type EvalContext, type EvalDataset, EvalDatasetSchema, type EvalExpectBlock, type EvalExpectationResult, type EvalMode, type EvalRunnerOptions, type EvalRunnerResult, type ExpectationBreakdown, type ExpectationResultMap, type ExpectationType, type FieldRemovalSanitizer, type HttpMCPConfig, type IterationResult, type Judge, type JudgeConfig, type JudgeMatcherOptions, type JudgeResult, type JudgeValidatorConfig, type LLMHostConfig, type LLMHostSimulationResult, type LLMHostSimulator, type LLMProvider, type LLMToolCall, type LoadDatasetOptions, type MCPAuthConfig, type MCPAuthFixtures, type MCPClientCredentialsConfig, type MCPConfig, MCPConfigSchema, type MCPConformanceCheck, type MCPConformanceOptions, type MCPConformanceRaw, type MCPConformanceResult, type MCPConformanceResultData, type MCPEvalData, type MCPEvalHistoricalSummary, type MCPEvalReporterConfig, type MCPEvalRunData, type MCPFixtureApi, type MCPFixtureOptions, type MCPHostCapabilities, type MCPOAuthConfig, type MCPServerCapabilitiesData, MCP_PROTOCOL_VERSION, type NormalizedToolResponse, type OAuthSetupConfig, type PatternValidatorOptions, PlaywrightOAuthClientProvider, type PlaywrightOAuthClientProviderConfig, type PredicateResult, type ProtectedResourceDiscoveryResult, type ProtectedResourceMetadata, type ProviderKind, type RegexSanitizer, type ResultSource, type RubricSpec, type SchemaRegistry, type SchemaValidatorOptions, type SerializedEvalDataset, type ServerComparisonOptions, type ServerComparisonResult, type SizeValidatorOptions, type SnapshotSanitizer, SnapshotSanitizers, type StdioMCPConfig, type StoredClientInfo, type StoredOAuthState, 
type StoredServerMetadata, type StoredTokens, type TextValidatorOptions, type TokenResult, type ToolCallCountOptions, type ToolCallExpectation, type ToolPredicate, type UsageMetrics, type ValidationResult, closeMCPClient, createJudge, createMCPClientForConfig, createMCPFixture, createTokenAuthHeaders, discoverAuthorizationServer, discoverProtectedResource, expect, extractText, getMissingDependencyMessage, getResponseSizeBytes, hasValidTokens, injectTokens, isBuiltInRubric, isHttpConfig, isProviderAvailable, isStdioConfig, isTokenExpired, isTokenExpiringSoon, loadBaseline, loadEvalDataset, loadEvalDatasetFromObject, loadTokens, loadTokensFromEnv, test as mcpAuthTest, normalizeToolResponse, normalizeWhitespace, performClientCredentialsFlow, performOAuthSetup, performOAuthSetupIfNeeded, resolveRubric, runConformanceChecks, runEvalCase, runEvalDataset, runServerComparison, saveBaseline, simulateLLMHost, test$1 as test, validateAccessToken, validateError, validateEvalCase, validateEvalDataset, validateJudge, validateMCPConfig, validatePattern, validateResponse, validateSchema, validateSize, validateText, validateToolCallCount, validateToolCalls };
4012
+ export { type AuthType, BUILT_IN_RUBRICS, type BuiltInRubric, type BuiltInSanitizer, CLIOAuthClient, type CLIOAuthClientConfig, type CLIOAuthResult, type CaseComparisonResult, type ClientCredentialsConfig, type ComparisonOutcome, type ContentBlock, type CreateMCPClientOptions, DiscoveryError, ENV_VAR_NAMES, type EvalCase, type EvalCaseResult, EvalCaseSchema, type EvalContext, type EvalDataset, EvalDatasetSchema, type EvalExpectBlock, type EvalExpectationResult, type EvalMode, type EvalRunnerOptions, type EvalRunnerResult, type ExpectationBreakdown, type ExpectationResultMap, type ExpectationType, type FieldRemovalSanitizer, type HttpMCPConfig, type IterationResult, type Judge, type JudgeConfig, type JudgeMatcherOptions, type JudgeResult, type JudgeValidatorConfig, type LLMProvider, type LLMToolCall, type LoadDatasetOptions, type MCPAuthConfig, type MCPAuthFixtures, type MCPClientCredentialsConfig, type MCPConfig, MCPConfigSchema, type MCPConformanceCheck, type MCPConformanceOptions, type MCPConformanceRaw, type MCPConformanceResult, type MCPConformanceResultData, type MCPEvalData, type MCPEvalHistoricalSummary, type MCPEvalReporterConfig, type MCPEvalRunData, type MCPFixtureApi, type MCPFixtureOptions, type MCPHostCapabilities, type MCPHostConfig, type MCPHostSimulationResult, type MCPHostSimulator, type MCPOAuthConfig, type MCPServerCapabilitiesData, MCP_PROTOCOL_VERSION, type NormalizedToolResponse, type OAuthSetupConfig, type PatternValidatorOptions, PlaywrightOAuthClientProvider, type PlaywrightOAuthClientProviderConfig, type PredicateResult, type ProtectedResourceDiscoveryResult, type ProtectedResourceMetadata, type ProviderKind, type RegexSanitizer, type ResultSource, type RubricSpec, type SchemaRegistry, type SchemaValidatorOptions, type SerializedEvalDataset, type ServerComparisonOptions, type ServerComparisonResult, type SizeValidatorOptions, type SnapshotSanitizer, SnapshotSanitizers, type StdioMCPConfig, type StoredClientInfo, type StoredOAuthState, 
type StoredServerMetadata, type StoredTokens, type TextValidatorOptions, type TokenResult, type ToolCallCountOptions, type ToolCallExpectation, type ToolPredicate, type UsageMetrics, type ValidationResult, closeMCPClient, createJudge, createMCPClientForConfig, createMCPFixture, createTokenAuthHeaders, discoverAuthorizationServer, discoverProtectedResource, expect, extractText, getMissingDependencyMessage, getResponseSizeBytes, hasValidTokens, injectTokens, isBuiltInRubric, isHttpConfig, isProviderAvailable, isStdioConfig, isTokenExpired, isTokenExpiringSoon, loadBaseline, loadEvalDataset, loadEvalDatasetFromObject, loadTokens, loadTokensFromEnv, test as mcpAuthTest, normalizeToolResponse, normalizeWhitespace, performClientCredentialsFlow, performOAuthSetup, performOAuthSetupIfNeeded, resolveRubric, runConformanceChecks, runEvalCase, runEvalDataset, runServerComparison, saveBaseline, simulateMCPHost, test$1 as test, validateAccessToken, validateError, validateEvalCase, validateEvalDataset, validateJudge, validateMCPConfig, validatePattern, validateResponse, validateSchema, validateSize, validateText, validateToolCallCount, validateToolCalls };