@gleanwork/mcp-server-tester 1.0.0-beta.2 → 1.0.0-beta.4

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
package/dist/index.d.ts CHANGED
@@ -233,314 +233,64 @@ type MCPConfig = StdioMCPConfig | HttpMCPConfig;
233
233
  /**
234
234
  * Union schema for MCPConfig (validates based on transport type)
235
235
  */
236
- declare const MCPConfigSchema: z.ZodDiscriminatedUnion<"transport", [z.ZodObject<{
236
+ declare const MCPConfigSchema: z.ZodDiscriminatedUnion<[z.ZodObject<{
237
237
  transport: z.ZodLiteral<"stdio">;
238
238
  command: z.ZodString;
239
- args: z.ZodOptional<z.ZodArray<z.ZodString, "many">>;
239
+ args: z.ZodOptional<z.ZodArray<z.ZodString>>;
240
240
  cwd: z.ZodOptional<z.ZodString>;
241
241
  env: z.ZodOptional<z.ZodRecord<z.ZodString, z.ZodString>>;
242
242
  capabilities: z.ZodOptional<z.ZodObject<{
243
243
  sampling: z.ZodOptional<z.ZodRecord<z.ZodString, z.ZodUnknown>>;
244
244
  roots: z.ZodOptional<z.ZodObject<{
245
245
  listChanged: z.ZodBoolean;
246
- }, "strip", z.ZodTypeAny, {
247
- listChanged: boolean;
248
- }, {
249
- listChanged: boolean;
250
- }>>;
251
- }, "strip", z.ZodTypeAny, {
252
- sampling?: Record<string, unknown> | undefined;
253
- roots?: {
254
- listChanged: boolean;
255
- } | undefined;
256
- }, {
257
- sampling?: Record<string, unknown> | undefined;
258
- roots?: {
259
- listChanged: boolean;
260
- } | undefined;
261
- }>>;
246
+ }, z.core.$strip>>;
247
+ }, z.core.$strip>>;
262
248
  connectTimeoutMs: z.ZodOptional<z.ZodNumber>;
263
249
  requestTimeoutMs: z.ZodOptional<z.ZodNumber>;
264
250
  callTimeoutMs: z.ZodOptional<z.ZodNumber>;
265
251
  quiet: z.ZodOptional<z.ZodBoolean>;
266
- }, "strip", z.ZodTypeAny, {
267
- transport: "stdio";
268
- command: string;
269
- args?: string[] | undefined;
270
- cwd?: string | undefined;
271
- env?: Record<string, string> | undefined;
272
- capabilities?: {
273
- sampling?: Record<string, unknown> | undefined;
274
- roots?: {
275
- listChanged: boolean;
276
- } | undefined;
277
- } | undefined;
278
- connectTimeoutMs?: number | undefined;
279
- requestTimeoutMs?: number | undefined;
280
- callTimeoutMs?: number | undefined;
281
- quiet?: boolean | undefined;
282
- }, {
283
- transport: "stdio";
284
- command: string;
285
- args?: string[] | undefined;
286
- cwd?: string | undefined;
287
- env?: Record<string, string> | undefined;
288
- capabilities?: {
289
- sampling?: Record<string, unknown> | undefined;
290
- roots?: {
291
- listChanged: boolean;
292
- } | undefined;
293
- } | undefined;
294
- connectTimeoutMs?: number | undefined;
295
- requestTimeoutMs?: number | undefined;
296
- callTimeoutMs?: number | undefined;
297
- quiet?: boolean | undefined;
298
- }>, z.ZodObject<{
252
+ }, z.core.$strip>, z.ZodObject<{
299
253
  transport: z.ZodLiteral<"http">;
300
- serverUrl: z.ZodEffects<z.ZodString, string, string>;
254
+ serverUrl: z.ZodString;
301
255
  headers: z.ZodOptional<z.ZodRecord<z.ZodString, z.ZodString>>;
302
256
  capabilities: z.ZodOptional<z.ZodObject<{
303
257
  sampling: z.ZodOptional<z.ZodRecord<z.ZodString, z.ZodUnknown>>;
304
258
  roots: z.ZodOptional<z.ZodObject<{
305
259
  listChanged: z.ZodBoolean;
306
- }, "strip", z.ZodTypeAny, {
307
- listChanged: boolean;
308
- }, {
309
- listChanged: boolean;
310
- }>>;
311
- }, "strip", z.ZodTypeAny, {
312
- sampling?: Record<string, unknown> | undefined;
313
- roots?: {
314
- listChanged: boolean;
315
- } | undefined;
316
- }, {
317
- sampling?: Record<string, unknown> | undefined;
318
- roots?: {
319
- listChanged: boolean;
320
- } | undefined;
321
- }>>;
260
+ }, z.core.$strip>>;
261
+ }, z.core.$strip>>;
322
262
  connectTimeoutMs: z.ZodOptional<z.ZodNumber>;
323
263
  requestTimeoutMs: z.ZodOptional<z.ZodNumber>;
324
264
  callTimeoutMs: z.ZodOptional<z.ZodNumber>;
325
- auth: z.ZodOptional<z.ZodEffects<z.ZodObject<{
265
+ auth: z.ZodOptional<z.ZodObject<{
326
266
  accessToken: z.ZodOptional<z.ZodString>;
327
267
  oauth: z.ZodOptional<z.ZodObject<{
328
268
  serverUrl: z.ZodString;
329
- scopes: z.ZodOptional<z.ZodArray<z.ZodString, "many">>;
269
+ scopes: z.ZodOptional<z.ZodArray<z.ZodString>>;
330
270
  resource: z.ZodOptional<z.ZodString>;
331
271
  authStatePath: z.ZodOptional<z.ZodString>;
332
272
  clientId: z.ZodOptional<z.ZodString>;
333
273
  clientSecret: z.ZodOptional<z.ZodString>;
334
274
  redirectUri: z.ZodOptional<z.ZodString>;
335
- }, "strip", z.ZodTypeAny, {
336
- serverUrl: string;
337
- scopes?: string[] | undefined;
338
- resource?: string | undefined;
339
- authStatePath?: string | undefined;
340
- clientId?: string | undefined;
341
- clientSecret?: string | undefined;
342
- redirectUri?: string | undefined;
343
- }, {
344
- serverUrl: string;
345
- scopes?: string[] | undefined;
346
- resource?: string | undefined;
347
- authStatePath?: string | undefined;
348
- clientId?: string | undefined;
349
- clientSecret?: string | undefined;
350
- redirectUri?: string | undefined;
351
- }>>;
275
+ }, z.core.$strip>>;
352
276
  clientCredentials: z.ZodOptional<z.ZodObject<{
353
277
  clientId: z.ZodOptional<z.ZodString>;
354
278
  clientSecret: z.ZodOptional<z.ZodString>;
355
279
  tokenEndpoint: z.ZodOptional<z.ZodString>;
356
- scopes: z.ZodOptional<z.ZodArray<z.ZodString, "many">>;
357
- }, "strip", z.ZodTypeAny, {
358
- scopes?: string[] | undefined;
359
- clientId?: string | undefined;
360
- clientSecret?: string | undefined;
361
- tokenEndpoint?: string | undefined;
362
- }, {
363
- scopes?: string[] | undefined;
364
- clientId?: string | undefined;
365
- clientSecret?: string | undefined;
366
- tokenEndpoint?: string | undefined;
367
- }>>;
368
- }, "strip", z.ZodTypeAny, {
369
- accessToken?: string | undefined;
370
- oauth?: {
371
- serverUrl: string;
372
- scopes?: string[] | undefined;
373
- resource?: string | undefined;
374
- authStatePath?: string | undefined;
375
- clientId?: string | undefined;
376
- clientSecret?: string | undefined;
377
- redirectUri?: string | undefined;
378
- } | undefined;
379
- clientCredentials?: {
380
- scopes?: string[] | undefined;
381
- clientId?: string | undefined;
382
- clientSecret?: string | undefined;
383
- tokenEndpoint?: string | undefined;
384
- } | undefined;
385
- }, {
386
- accessToken?: string | undefined;
387
- oauth?: {
388
- serverUrl: string;
389
- scopes?: string[] | undefined;
390
- resource?: string | undefined;
391
- authStatePath?: string | undefined;
392
- clientId?: string | undefined;
393
- clientSecret?: string | undefined;
394
- redirectUri?: string | undefined;
395
- } | undefined;
396
- clientCredentials?: {
397
- scopes?: string[] | undefined;
398
- clientId?: string | undefined;
399
- clientSecret?: string | undefined;
400
- tokenEndpoint?: string | undefined;
401
- } | undefined;
402
- }>, {
403
- accessToken?: string | undefined;
404
- oauth?: {
405
- serverUrl: string;
406
- scopes?: string[] | undefined;
407
- resource?: string | undefined;
408
- authStatePath?: string | undefined;
409
- clientId?: string | undefined;
410
- clientSecret?: string | undefined;
411
- redirectUri?: string | undefined;
412
- } | undefined;
413
- clientCredentials?: {
414
- scopes?: string[] | undefined;
415
- clientId?: string | undefined;
416
- clientSecret?: string | undefined;
417
- tokenEndpoint?: string | undefined;
418
- } | undefined;
419
- }, {
420
- accessToken?: string | undefined;
421
- oauth?: {
422
- serverUrl: string;
423
- scopes?: string[] | undefined;
424
- resource?: string | undefined;
425
- authStatePath?: string | undefined;
426
- clientId?: string | undefined;
427
- clientSecret?: string | undefined;
428
- redirectUri?: string | undefined;
429
- } | undefined;
430
- clientCredentials?: {
431
- scopes?: string[] | undefined;
432
- clientId?: string | undefined;
433
- clientSecret?: string | undefined;
434
- tokenEndpoint?: string | undefined;
435
- } | undefined;
436
- }>>;
280
+ scopes: z.ZodOptional<z.ZodArray<z.ZodString>>;
281
+ }, z.core.$strip>>;
282
+ }, z.core.$strip>>;
437
283
  proxy: z.ZodOptional<z.ZodObject<{
438
284
  url: z.ZodString;
439
- }, "strip", z.ZodTypeAny, {
440
- url: string;
441
- }, {
442
- url: string;
443
- }>>;
285
+ }, z.core.$strip>>;
444
286
  retryAttempts: z.ZodOptional<z.ZodNumber>;
445
287
  tls: z.ZodOptional<z.ZodObject<{
446
288
  ca: z.ZodOptional<z.ZodString>;
447
289
  cert: z.ZodOptional<z.ZodString>;
448
290
  key: z.ZodOptional<z.ZodString>;
449
291
  rejectUnauthorized: z.ZodOptional<z.ZodBoolean>;
450
- }, "strip", z.ZodTypeAny, {
451
- ca?: string | undefined;
452
- cert?: string | undefined;
453
- key?: string | undefined;
454
- rejectUnauthorized?: boolean | undefined;
455
- }, {
456
- ca?: string | undefined;
457
- cert?: string | undefined;
458
- key?: string | undefined;
459
- rejectUnauthorized?: boolean | undefined;
460
- }>>;
461
- }, "strip", z.ZodTypeAny, {
462
- serverUrl: string;
463
- transport: "http";
464
- capabilities?: {
465
- sampling?: Record<string, unknown> | undefined;
466
- roots?: {
467
- listChanged: boolean;
468
- } | undefined;
469
- } | undefined;
470
- connectTimeoutMs?: number | undefined;
471
- requestTimeoutMs?: number | undefined;
472
- callTimeoutMs?: number | undefined;
473
- headers?: Record<string, string> | undefined;
474
- auth?: {
475
- accessToken?: string | undefined;
476
- oauth?: {
477
- serverUrl: string;
478
- scopes?: string[] | undefined;
479
- resource?: string | undefined;
480
- authStatePath?: string | undefined;
481
- clientId?: string | undefined;
482
- clientSecret?: string | undefined;
483
- redirectUri?: string | undefined;
484
- } | undefined;
485
- clientCredentials?: {
486
- scopes?: string[] | undefined;
487
- clientId?: string | undefined;
488
- clientSecret?: string | undefined;
489
- tokenEndpoint?: string | undefined;
490
- } | undefined;
491
- } | undefined;
492
- proxy?: {
493
- url: string;
494
- } | undefined;
495
- retryAttempts?: number | undefined;
496
- tls?: {
497
- ca?: string | undefined;
498
- cert?: string | undefined;
499
- key?: string | undefined;
500
- rejectUnauthorized?: boolean | undefined;
501
- } | undefined;
502
- }, {
503
- serverUrl: string;
504
- transport: "http";
505
- capabilities?: {
506
- sampling?: Record<string, unknown> | undefined;
507
- roots?: {
508
- listChanged: boolean;
509
- } | undefined;
510
- } | undefined;
511
- connectTimeoutMs?: number | undefined;
512
- requestTimeoutMs?: number | undefined;
513
- callTimeoutMs?: number | undefined;
514
- headers?: Record<string, string> | undefined;
515
- auth?: {
516
- accessToken?: string | undefined;
517
- oauth?: {
518
- serverUrl: string;
519
- scopes?: string[] | undefined;
520
- resource?: string | undefined;
521
- authStatePath?: string | undefined;
522
- clientId?: string | undefined;
523
- clientSecret?: string | undefined;
524
- redirectUri?: string | undefined;
525
- } | undefined;
526
- clientCredentials?: {
527
- scopes?: string[] | undefined;
528
- clientId?: string | undefined;
529
- clientSecret?: string | undefined;
530
- tokenEndpoint?: string | undefined;
531
- } | undefined;
532
- } | undefined;
533
- proxy?: {
534
- url: string;
535
- } | undefined;
536
- retryAttempts?: number | undefined;
537
- tls?: {
538
- ca?: string | undefined;
539
- cert?: string | undefined;
540
- key?: string | undefined;
541
- rejectUnauthorized?: boolean | undefined;
542
- } | undefined;
543
- }>]>;
292
+ }, z.core.$strip>>;
293
+ }, z.core.$strip>], "transport">;
544
294
  /**
545
295
  * Validates an MCPConfig object
546
296
  *
@@ -1790,9 +1540,9 @@ declare function validateError(response: unknown, expected?: boolean | string |
1790
1540
  declare function validateSize(response: unknown, options: SizeValidatorOptions): ValidationResult;
1791
1541
 
1792
1542
  /**
1793
- * Tool call validators for llm_host simulation results.
1543
+ * Tool call validators for mcp_host simulation results.
1794
1544
  *
1795
- * These validators extract the tool call trace from an LLMHostSimulationResult
1545
+ * These validators extract the tool call trace from an MCPHostSimulationResult
1796
1546
  * and apply assertions against expected call lists and counts.
1797
1547
  */
1798
1548
 
@@ -1811,16 +1561,16 @@ interface ToolCallCountOptions {
1811
1561
  exact?: number;
1812
1562
  }
1813
1563
  /**
1814
- * Validates tool calls made during an LLM host simulation.
1564
+ * Validates tool calls made during an MCP host simulation.
1815
1565
  *
1816
- * @param response - Must be an LLMHostSimulationResult (from llm_host mode)
1566
+ * @param response - Must be an MCPHostSimulationResult (from mcp_host mode)
1817
1567
  * @param expectation - Expected tool call specification
1818
1568
  */
1819
1569
  declare function validateToolCalls(response: unknown, expectation: ToolCallExpectation): ValidationResult;
1820
1570
  /**
1821
- * Validates the number of tool calls made during an LLM host simulation.
1571
+ * Validates the number of tool calls made during an MCP host simulation.
1822
1572
  *
1823
- * @param response - Must be an LLMHostSimulationResult (from llm_host mode)
1573
+ * @param response - Must be an MCPHostSimulationResult (from mcp_host mode)
1824
1574
  * @param options - Count constraints (min, max, exact)
1825
1575
  */
1826
1576
  declare function validateToolCallCount(response: unknown, options: ToolCallCountOptions): ValidationResult;
@@ -2223,7 +1973,7 @@ declare global {
2223
1973
  */
2224
1974
  toSatisfyToolPredicate(predicate: ToolPredicate, description?: string): Promise<R>;
2225
1975
  /**
2226
- * Validates which tools the LLM called during an llm_host simulation.
1976
+ * Validates which tools the LLM called during an mcp_host simulation.
2227
1977
  *
2228
1978
  * @example
2229
1979
  * ```typescript
@@ -2235,7 +1985,7 @@ declare global {
2235
1985
  */
2236
1986
  toHaveToolCalls(expectation: ToolCallExpectation): R;
2237
1987
  /**
2238
- * Validates the number of tool calls made during an llm_host simulation.
1988
+ * Validates the number of tool calls made during an mcp_host simulation.
2239
1989
  *
2240
1990
  * @example
2241
1991
  * ```typescript
@@ -2486,6 +2236,14 @@ declare function toMatchToolPattern(this: {
2486
2236
  /**
2487
2237
  * Creates the toMatchToolSnapshot matcher function
2488
2238
  *
2239
+ * @remarks
2240
+ * **Requires Playwright test context.** This matcher calls `expect(content).toMatchSnapshot()`
2241
+ * internally, which only works inside a Playwright test (i.e., when `testInfo` is available).
2242
+ * Calling it outside a Playwright test will throw a cryptic context error.
2243
+ *
2244
+ * To test sanitizer logic without a Playwright context, use the exported `applySanitizers`
2245
+ * function directly.
2246
+ *
2489
2247
  * Note: This is an async matcher that uses Playwright's snapshot testing.
2490
2248
  */
2491
2249
  declare function toMatchToolSnapshot(this: {
@@ -2595,7 +2353,7 @@ declare function toSatisfyToolPredicate(this: {
2595
2353
  /**
2596
2354
  * toHaveToolCalls Matcher
2597
2355
  *
2598
- * Validates which tools the LLM called during an llm_host simulation.
2356
+ * Validates which tools the LLM called during an mcp_host simulation.
2599
2357
  */
2600
2358
 
2601
2359
  /**
@@ -2611,7 +2369,7 @@ declare function toHaveToolCalls(this: {
2611
2369
  /**
2612
2370
  * toHaveToolCallCount Matcher
2613
2371
  *
2614
- * Validates the number of tool calls made during an llm_host simulation.
2372
+ * Validates the number of tool calls made during an mcp_host simulation.
2615
2373
  */
2616
2374
 
2617
2375
  /**
@@ -2720,9 +2478,9 @@ interface MCPAuthFixtures {
2720
2478
  declare const test: playwright_test.TestType<playwright_test.PlaywrightTestArgs & playwright_test.PlaywrightTestOptions & MCPAuthFixtures, playwright_test.PlaywrightWorkerArgs & playwright_test.PlaywrightWorkerOptions>;
2721
2479
 
2722
2480
  /**
2723
- * Types and interfaces for LLM host simulation mode
2481
+ * Types and interfaces for MCP host simulation mode
2724
2482
  *
2725
- * This module provides types for testing MCP servers through LLM hosts,
2483
+ * This module provides types for testing MCP servers through MCP hosts,
2726
2484
  * validating tool descriptions, parameter clarity, and discoverability.
2727
2485
  */
2728
2486
 
@@ -2751,9 +2509,9 @@ type LLMProvider = 'openai' | 'anthropic' | 'azure' | 'google' | 'mistral' | 'de
2751
2509
  */
2752
2510
  | 'vertex-anthropic';
2753
2511
  /**
2754
- * Configuration for LLM host simulation
2512
+ * Configuration for MCP host simulation
2755
2513
  */
2756
- interface LLMHostConfig {
2514
+ interface MCPHostConfig {
2757
2515
  /**
2758
2516
  * LLM provider to use
2759
2517
  */
@@ -2793,9 +2551,9 @@ interface LLMToolCall {
2793
2551
  id?: string;
2794
2552
  }
2795
2553
  /**
2796
- * Result from an LLM host simulation
2554
+ * Result from an MCP host simulation
2797
2555
  */
2798
- interface LLMHostSimulationResult {
2556
+ interface MCPHostSimulationResult {
2799
2557
  /** Whether the simulation succeeded */
2800
2558
  success: boolean;
2801
2559
  /** Tool calls made by the LLM */
@@ -2823,33 +2581,33 @@ interface LLMHostSimulationResult {
2823
2581
  mcpDurationMs?: number;
2824
2582
  }
2825
2583
  /**
2826
- * Interface for LLM host simulators.
2584
+ * Interface for MCP host simulators.
2827
2585
  *
2828
2586
  * The only built-in implementation is the Vercel AI SDK orchestrator
2829
- * (src/evals/llmHost/adapters/vercel.ts). Custom implementations can be
2587
+ * (src/evals/mcpHost/adapters/vercel.ts). Custom implementations can be
2830
2588
  * created for specialised testing needs.
2831
2589
  */
2832
- interface LLMHostSimulator {
2590
+ interface MCPHostSimulator {
2833
2591
  /**
2834
- * Simulates an LLM host interacting with an MCP server
2592
+ * Simulates an MCP host interacting with an MCP server
2835
2593
  *
2836
2594
  * @param mcp - MCP fixture API
2837
2595
  * @param scenario - Natural language prompt describing what the LLM should do
2838
- * @param config - LLM host configuration
2596
+ * @param config - MCP host configuration
2839
2597
  * @returns Simulation result with tool calls and response
2840
2598
  */
2841
- simulate(mcp: MCPFixtureApi, scenario: string, config: LLMHostConfig): Promise<LLMHostSimulationResult>;
2599
+ simulate(mcp: MCPFixtureApi, scenario: string, config: MCPHostConfig): Promise<MCPHostSimulationResult>;
2842
2600
  }
2843
2601
 
2844
2602
  /**
2845
2603
  * Evaluation mode
2846
2604
  */
2847
- type EvalMode = 'direct' | 'llm_host';
2605
+ type EvalMode = 'direct' | 'mcp_host';
2848
2606
  /**
2849
2607
  * A single eval test case
2850
2608
  *
2851
2609
  * For 'direct' mode: toolName and args are required
2852
- * For 'llm_host' mode: scenario and llmHostConfig are required
2610
+ * For 'mcp_host' mode: scenario and mcpHostConfig are required
2853
2611
  */
2854
2612
  interface EvalCase {
2855
2613
  /**
@@ -2863,40 +2621,40 @@ interface EvalCase {
2863
2621
  /**
2864
2622
  * Evaluation mode
2865
2623
  * - 'direct': Direct API calls to MCP tools (default)
2866
- * - 'llm_host': LLM-driven tool selection via natural language
2624
+ * - 'mcp_host': LLM-driven tool selection via natural language
2867
2625
  *
2868
2626
  * @default 'direct'
2869
2627
  */
2870
2628
  mode?: EvalMode;
2871
2629
  /**
2872
- * Name of the MCP tool to call (required for 'direct' mode, optional for 'llm_host' mode)
2630
+ * Name of the MCP tool to call (required for 'direct' mode, optional for 'mcp_host' mode)
2873
2631
  */
2874
2632
  toolName?: string;
2875
2633
  /**
2876
- * Arguments to pass to the tool (required for 'direct' mode, optional for 'llm_host' mode)
2634
+ * Arguments to pass to the tool (required for 'direct' mode, optional for 'mcp_host' mode)
2877
2635
  */
2878
2636
  args?: Record<string, unknown>;
2879
2637
  /**
2880
- * Natural language scenario for LLM to execute (optional, required for 'llm_host' mode)
2638
+ * Natural language scenario for LLM to execute (required for 'mcp_host' mode; optional otherwise)
2881
2639
  *
2882
2640
  * @example "Get the weather for London and tell me if I need an umbrella"
2883
2641
  */
2884
2642
  scenario?: string;
2885
2643
  /**
2886
- * LLM host configuration (optional for 'llm_host' mode)
2644
+ * MCP host configuration (optional for 'mcp_host' mode)
2887
2645
  *
2888
2646
  * If not specified, uses default configuration from test environment
2889
2647
  */
2890
- llmHostConfig?: LLMHostConfig;
2648
+ mcpHostConfig?: MCPHostConfig;
2891
2649
  /**
2892
2650
  * Additional metadata for this test case
2893
2651
  *
2894
- * For 'llm_host' mode, can include 'expectedToolCalls' for validation
2652
+ * For 'mcp_host' mode, can include 'expectedToolCalls' for validation
2895
2653
  */
2896
2654
  metadata?: Record<string, unknown>;
2897
2655
  /**
2898
- * Number of times to run this case and compute an accuracy score.
2899
- * When > 1, `EvalCaseResult.accuracy` is populated and `pass` is determined
2656
+ * Number of times to run this case and compute an assertion pass rate.
2657
+ * When > 1, `EvalCaseResult.assertionPassRate` is populated and `pass` is determined
2900
2658
  * by `accuracyThreshold` rather than a single run.
2901
2659
  * @default 1
2902
2660
  */
@@ -3027,8 +2785,8 @@ interface EvalExpectBlock {
3027
2785
  minBytes?: number;
3028
2786
  };
3029
2787
  /**
3030
- * Asserts which tools the LLM called during an llm_host simulation.
3031
- * Only meaningful for llm_host mode — direct mode has no tool call trace.
2788
+ * Asserts which tools the LLM called during an mcp_host simulation.
2789
+ * Only meaningful for mcp_host mode — direct mode has no tool call trace.
3032
2790
  */
3033
2791
  toolsTriggered?: {
3034
2792
  /** Expected tool calls */
@@ -3049,7 +2807,7 @@ interface EvalExpectBlock {
3049
2807
  exclusive?: boolean;
3050
2808
  };
3051
2809
  /**
3052
- * Asserts the number of tool calls made during an llm_host simulation.
2810
+ * Asserts the number of tool calls made during an mcp_host simulation.
3053
2811
  */
3054
2812
  toolCallCount?: {
3055
2813
  /** Minimum number of tool calls */
@@ -3088,399 +2846,109 @@ interface EvalDataset {
3088
2846
  /**
3089
2847
  * Zod schema for EvalCase
3090
2848
  *
3091
- * toolName and args are optional for llm_host mode (which uses scenario instead)
2849
+ * toolName and args are optional for mcp_host mode (which uses scenario instead)
3092
2850
  */
3093
2851
  declare const EvalCaseSchema: z.ZodObject<{
3094
2852
  id: z.ZodString;
3095
2853
  description: z.ZodOptional<z.ZodString>;
3096
- mode: z.ZodOptional<z.ZodEnum<["direct", "llm_host"]>>;
2854
+ mode: z.ZodOptional<z.ZodEnum<{
2855
+ direct: "direct";
2856
+ mcp_host: "mcp_host";
2857
+ }>>;
3097
2858
  toolName: z.ZodOptional<z.ZodString>;
3098
2859
  args: z.ZodOptional<z.ZodRecord<z.ZodString, z.ZodUnknown>>;
3099
2860
  scenario: z.ZodOptional<z.ZodString>;
3100
- llmHostConfig: z.ZodOptional<z.ZodObject<{
3101
- provider: z.ZodEnum<["openai", "anthropic", "azure", "google", "mistral", "deepseek", "openrouter", "xai", "vertex-anthropic"]>;
2861
+ mcpHostConfig: z.ZodOptional<z.ZodObject<{
2862
+ provider: z.ZodEnum<{
2863
+ openai: "openai";
2864
+ anthropic: "anthropic";
2865
+ azure: "azure";
2866
+ google: "google";
2867
+ mistral: "mistral";
2868
+ deepseek: "deepseek";
2869
+ openrouter: "openrouter";
2870
+ xai: "xai";
2871
+ "vertex-anthropic": "vertex-anthropic";
2872
+ }>;
3102
2873
  apiKeyEnvVar: z.ZodOptional<z.ZodString>;
3103
2874
  model: z.ZodOptional<z.ZodString>;
3104
2875
  maxTokens: z.ZodOptional<z.ZodNumber>;
3105
2876
  temperature: z.ZodOptional<z.ZodNumber>;
3106
2877
  maxToolCalls: z.ZodOptional<z.ZodNumber>;
3107
- }, "strip", z.ZodTypeAny, {
3108
- provider: "openai" | "anthropic" | "azure" | "google" | "mistral" | "deepseek" | "openrouter" | "xai" | "vertex-anthropic";
3109
- model?: string | undefined;
3110
- maxTokens?: number | undefined;
3111
- apiKeyEnvVar?: string | undefined;
3112
- temperature?: number | undefined;
3113
- maxToolCalls?: number | undefined;
3114
- }, {
3115
- provider: "openai" | "anthropic" | "azure" | "google" | "mistral" | "deepseek" | "openrouter" | "xai" | "vertex-anthropic";
3116
- model?: string | undefined;
3117
- maxTokens?: number | undefined;
3118
- apiKeyEnvVar?: string | undefined;
3119
- temperature?: number | undefined;
3120
- maxToolCalls?: number | undefined;
3121
- }>>;
2878
+ }, z.core.$strip>>;
3122
2879
  metadata: z.ZodOptional<z.ZodRecord<z.ZodString, z.ZodUnknown>>;
3123
2880
  iterations: z.ZodOptional<z.ZodNumber>;
3124
2881
  accuracyThreshold: z.ZodOptional<z.ZodNumber>;
3125
2882
  judgeReps: z.ZodOptional<z.ZodNumber>;
3126
2883
  canonicalAnswer: z.ZodOptional<z.ZodString>;
3127
- tags: z.ZodOptional<z.ZodArray<z.ZodString, "many">>;
2884
+ tags: z.ZodOptional<z.ZodArray<z.ZodString>>;
3128
2885
  expect: z.ZodOptional<z.ZodObject<{
3129
2886
  response: z.ZodOptional<z.ZodUnknown>;
3130
2887
  schema: z.ZodOptional<z.ZodString>;
3131
- containsText: z.ZodOptional<z.ZodUnion<[z.ZodString, z.ZodArray<z.ZodString, "many">]>>;
3132
- matchesPattern: z.ZodOptional<z.ZodUnion<[z.ZodString, z.ZodArray<z.ZodString, "many">]>>;
2888
+ containsText: z.ZodOptional<z.ZodUnion<readonly [z.ZodString, z.ZodArray<z.ZodString>]>>;
2889
+ matchesPattern: z.ZodOptional<z.ZodUnion<readonly [z.ZodString, z.ZodArray<z.ZodString>]>>;
3133
2890
  snapshot: z.ZodOptional<z.ZodString>;
3134
- snapshotSanitizers: z.ZodOptional<z.ZodArray<z.ZodUnion<[z.ZodEnum<["timestamp", "uuid", "iso-date", "objectId", "jwt"]>, z.ZodObject<{
2891
+ snapshotSanitizers: z.ZodOptional<z.ZodArray<z.ZodUnion<readonly [z.ZodEnum<{
2892
+ timestamp: "timestamp";
2893
+ uuid: "uuid";
2894
+ "iso-date": "iso-date";
2895
+ objectId: "objectId";
2896
+ jwt: "jwt";
2897
+ }>, z.ZodObject<{
3135
2898
  pattern: z.ZodString;
3136
2899
  replacement: z.ZodOptional<z.ZodString>;
3137
- }, "strip", z.ZodTypeAny, {
3138
- pattern: string;
3139
- replacement?: string | undefined;
3140
- }, {
3141
- pattern: string;
3142
- replacement?: string | undefined;
3143
- }>, z.ZodObject<{
3144
- remove: z.ZodArray<z.ZodString, "many">;
3145
- }, "strip", z.ZodTypeAny, {
3146
- remove: string[];
3147
- }, {
3148
- remove: string[];
3149
- }>]>, "many">>;
3150
- isError: z.ZodOptional<z.ZodUnion<[z.ZodBoolean, z.ZodString, z.ZodArray<z.ZodString, "many">]>>;
2900
+ }, z.core.$strip>, z.ZodObject<{
2901
+ remove: z.ZodArray<z.ZodString>;
2902
+ }, z.core.$strip>]>>>;
2903
+ isError: z.ZodOptional<z.ZodUnion<readonly [z.ZodBoolean, z.ZodString, z.ZodArray<z.ZodString>]>>;
3151
2904
  passesJudge: z.ZodOptional<z.ZodObject<{
3152
- rubric: z.ZodUnion<[z.ZodEnum<["correctness", "completeness", "groundedness", "instruction-following", "conciseness"]>, z.ZodObject<{
2905
+ rubric: z.ZodUnion<readonly [z.ZodEnum<{
2906
+ correctness: "correctness";
2907
+ completeness: "completeness";
2908
+ groundedness: "groundedness";
2909
+ "instruction-following": "instruction-following";
2910
+ conciseness: "conciseness";
2911
+ }>, z.ZodObject<{
3153
2912
  text: z.ZodString;
3154
- }, "strip", z.ZodTypeAny, {
3155
- text: string;
3156
- }, {
3157
- text: string;
3158
- }>]>;
2913
+ }, z.core.$strip>]>;
3159
2914
  reference: z.ZodOptional<z.ZodUnknown>;
3160
2915
  threshold: z.ZodOptional<z.ZodNumber>;
3161
2916
  reps: z.ZodOptional<z.ZodNumber>;
3162
- provider: z.ZodOptional<z.ZodEnum<["anthropic", "openai", "google"]>>;
2917
+ provider: z.ZodOptional<z.ZodEnum<{
2918
+ openai: "openai";
2919
+ anthropic: "anthropic";
2920
+ google: "google";
2921
+ }>>;
3163
2922
  model: z.ZodOptional<z.ZodString>;
3164
2923
  apiKeyEnvVar: z.ZodOptional<z.ZodString>;
3165
2924
  maxTokens: z.ZodOptional<z.ZodNumber>;
3166
2925
  temperature: z.ZodOptional<z.ZodNumber>;
3167
2926
  maxBudgetUsd: z.ZodOptional<z.ZodNumber>;
3168
2927
  maxToolOutputSize: z.ZodOptional<z.ZodNumber>;
3169
- }, "strip", z.ZodTypeAny, {
3170
- rubric: "correctness" | "completeness" | "groundedness" | "instruction-following" | "conciseness" | {
3171
- text: string;
3172
- };
3173
- model?: string | undefined;
3174
- maxTokens?: number | undefined;
3175
- maxBudgetUsd?: number | undefined;
3176
- reference?: unknown;
3177
- threshold?: number | undefined;
3178
- reps?: number | undefined;
3179
- provider?: "openai" | "anthropic" | "google" | undefined;
3180
- apiKeyEnvVar?: string | undefined;
3181
- temperature?: number | undefined;
3182
- maxToolOutputSize?: number | undefined;
3183
- }, {
3184
- rubric: "correctness" | "completeness" | "groundedness" | "instruction-following" | "conciseness" | {
3185
- text: string;
3186
- };
3187
- model?: string | undefined;
3188
- maxTokens?: number | undefined;
3189
- maxBudgetUsd?: number | undefined;
3190
- reference?: unknown;
3191
- threshold?: number | undefined;
3192
- reps?: number | undefined;
3193
- provider?: "openai" | "anthropic" | "google" | undefined;
3194
- apiKeyEnvVar?: string | undefined;
3195
- temperature?: number | undefined;
3196
- maxToolOutputSize?: number | undefined;
3197
- }>>;
2928
+ }, z.core.$strip>>;
3198
2929
  responseSize: z.ZodOptional<z.ZodObject<{
3199
2930
  maxBytes: z.ZodOptional<z.ZodNumber>;
3200
2931
  minBytes: z.ZodOptional<z.ZodNumber>;
3201
- }, "strip", z.ZodTypeAny, {
3202
- maxBytes?: number | undefined;
3203
- minBytes?: number | undefined;
3204
- }, {
3205
- maxBytes?: number | undefined;
3206
- minBytes?: number | undefined;
3207
- }>>;
2932
+ }, z.core.$strip>>;
3208
2933
  toolsTriggered: z.ZodOptional<z.ZodObject<{
3209
2934
  calls: z.ZodArray<z.ZodObject<{
3210
2935
  name: z.ZodString;
3211
2936
  arguments: z.ZodOptional<z.ZodRecord<z.ZodString, z.ZodUnknown>>;
3212
2937
  required: z.ZodOptional<z.ZodBoolean>;
3213
- }, "strip", z.ZodTypeAny, {
3214
- name: string;
3215
- required?: boolean | undefined;
3216
- arguments?: Record<string, unknown> | undefined;
3217
- }, {
3218
- name: string;
3219
- required?: boolean | undefined;
3220
- arguments?: Record<string, unknown> | undefined;
3221
- }>, "many">;
3222
- order: z.ZodOptional<z.ZodEnum<["strict", "any"]>>;
2938
+ }, z.core.$strip>>;
2939
+ order: z.ZodOptional<z.ZodEnum<{
2940
+ any: "any";
2941
+ strict: "strict";
2942
+ }>>;
3223
2943
  exclusive: z.ZodOptional<z.ZodBoolean>;
3224
- }, "strip", z.ZodTypeAny, {
3225
- calls: {
3226
- name: string;
3227
- required?: boolean | undefined;
3228
- arguments?: Record<string, unknown> | undefined;
3229
- }[];
3230
- order?: "strict" | "any" | undefined;
3231
- exclusive?: boolean | undefined;
3232
- }, {
3233
- calls: {
3234
- name: string;
3235
- required?: boolean | undefined;
3236
- arguments?: Record<string, unknown> | undefined;
3237
- }[];
3238
- order?: "strict" | "any" | undefined;
3239
- exclusive?: boolean | undefined;
3240
- }>>;
2944
+ }, z.core.$strip>>;
3241
2945
  toolCallCount: z.ZodOptional<z.ZodObject<{
3242
2946
  min: z.ZodOptional<z.ZodNumber>;
3243
2947
  max: z.ZodOptional<z.ZodNumber>;
3244
2948
  exact: z.ZodOptional<z.ZodNumber>;
3245
- }, "strip", z.ZodTypeAny, {
3246
- exact?: number | undefined;
3247
- min?: number | undefined;
3248
- max?: number | undefined;
3249
- }, {
3250
- exact?: number | undefined;
3251
- min?: number | undefined;
3252
- max?: number | undefined;
3253
- }>>;
3254
- }, "strip", z.ZodTypeAny, {
3255
- response?: unknown;
3256
- isError?: string | boolean | string[] | undefined;
3257
- schema?: string | undefined;
3258
- snapshot?: string | undefined;
3259
- toolsTriggered?: {
3260
- calls: {
3261
- name: string;
3262
- required?: boolean | undefined;
3263
- arguments?: Record<string, unknown> | undefined;
3264
- }[];
3265
- order?: "strict" | "any" | undefined;
3266
- exclusive?: boolean | undefined;
3267
- } | undefined;
3268
- toolCallCount?: {
3269
- exact?: number | undefined;
3270
- min?: number | undefined;
3271
- max?: number | undefined;
3272
- } | undefined;
3273
- containsText?: string | string[] | undefined;
3274
- matchesPattern?: string | string[] | undefined;
3275
- snapshotSanitizers?: ("uuid" | "jwt" | "timestamp" | "iso-date" | "objectId" | {
3276
- pattern: string;
3277
- replacement?: string | undefined;
3278
- } | {
3279
- remove: string[];
3280
- })[] | undefined;
3281
- passesJudge?: {
3282
- rubric: "correctness" | "completeness" | "groundedness" | "instruction-following" | "conciseness" | {
3283
- text: string;
3284
- };
3285
- model?: string | undefined;
3286
- maxTokens?: number | undefined;
3287
- maxBudgetUsd?: number | undefined;
3288
- reference?: unknown;
3289
- threshold?: number | undefined;
3290
- reps?: number | undefined;
3291
- provider?: "openai" | "anthropic" | "google" | undefined;
3292
- apiKeyEnvVar?: string | undefined;
3293
- temperature?: number | undefined;
3294
- maxToolOutputSize?: number | undefined;
3295
- } | undefined;
3296
- responseSize?: {
3297
- maxBytes?: number | undefined;
3298
- minBytes?: number | undefined;
3299
- } | undefined;
3300
- }, {
3301
- response?: unknown;
3302
- isError?: string | boolean | string[] | undefined;
3303
- schema?: string | undefined;
3304
- snapshot?: string | undefined;
3305
- toolsTriggered?: {
3306
- calls: {
3307
- name: string;
3308
- required?: boolean | undefined;
3309
- arguments?: Record<string, unknown> | undefined;
3310
- }[];
3311
- order?: "strict" | "any" | undefined;
3312
- exclusive?: boolean | undefined;
3313
- } | undefined;
3314
- toolCallCount?: {
3315
- exact?: number | undefined;
3316
- min?: number | undefined;
3317
- max?: number | undefined;
3318
- } | undefined;
3319
- containsText?: string | string[] | undefined;
3320
- matchesPattern?: string | string[] | undefined;
3321
- snapshotSanitizers?: ("uuid" | "jwt" | "timestamp" | "iso-date" | "objectId" | {
3322
- pattern: string;
3323
- replacement?: string | undefined;
3324
- } | {
3325
- remove: string[];
3326
- })[] | undefined;
3327
- passesJudge?: {
3328
- rubric: "correctness" | "completeness" | "groundedness" | "instruction-following" | "conciseness" | {
3329
- text: string;
3330
- };
3331
- model?: string | undefined;
3332
- maxTokens?: number | undefined;
3333
- maxBudgetUsd?: number | undefined;
3334
- reference?: unknown;
3335
- threshold?: number | undefined;
3336
- reps?: number | undefined;
3337
- provider?: "openai" | "anthropic" | "google" | undefined;
3338
- apiKeyEnvVar?: string | undefined;
3339
- temperature?: number | undefined;
3340
- maxToolOutputSize?: number | undefined;
3341
- } | undefined;
3342
- responseSize?: {
3343
- maxBytes?: number | undefined;
3344
- minBytes?: number | undefined;
3345
- } | undefined;
3346
- }>>;
3347
- }, "strip", z.ZodTypeAny, {
3348
- id: string;
3349
- args?: Record<string, unknown> | undefined;
3350
- mode?: "direct" | "llm_host" | undefined;
3351
- metadata?: Record<string, unknown> | undefined;
3352
- description?: string | undefined;
3353
- toolName?: string | undefined;
3354
- scenario?: string | undefined;
3355
- llmHostConfig?: {
3356
- provider: "openai" | "anthropic" | "azure" | "google" | "mistral" | "deepseek" | "openrouter" | "xai" | "vertex-anthropic";
3357
- model?: string | undefined;
3358
- maxTokens?: number | undefined;
3359
- apiKeyEnvVar?: string | undefined;
3360
- temperature?: number | undefined;
3361
- maxToolCalls?: number | undefined;
3362
- } | undefined;
3363
- iterations?: number | undefined;
3364
- accuracyThreshold?: number | undefined;
3365
- judgeReps?: number | undefined;
3366
- canonicalAnswer?: string | undefined;
3367
- tags?: string[] | undefined;
3368
- expect?: {
3369
- response?: unknown;
3370
- isError?: string | boolean | string[] | undefined;
3371
- schema?: string | undefined;
3372
- snapshot?: string | undefined;
3373
- toolsTriggered?: {
3374
- calls: {
3375
- name: string;
3376
- required?: boolean | undefined;
3377
- arguments?: Record<string, unknown> | undefined;
3378
- }[];
3379
- order?: "strict" | "any" | undefined;
3380
- exclusive?: boolean | undefined;
3381
- } | undefined;
3382
- toolCallCount?: {
3383
- exact?: number | undefined;
3384
- min?: number | undefined;
3385
- max?: number | undefined;
3386
- } | undefined;
3387
- containsText?: string | string[] | undefined;
3388
- matchesPattern?: string | string[] | undefined;
3389
- snapshotSanitizers?: ("uuid" | "jwt" | "timestamp" | "iso-date" | "objectId" | {
3390
- pattern: string;
3391
- replacement?: string | undefined;
3392
- } | {
3393
- remove: string[];
3394
- })[] | undefined;
3395
- passesJudge?: {
3396
- rubric: "correctness" | "completeness" | "groundedness" | "instruction-following" | "conciseness" | {
3397
- text: string;
3398
- };
3399
- model?: string | undefined;
3400
- maxTokens?: number | undefined;
3401
- maxBudgetUsd?: number | undefined;
3402
- reference?: unknown;
3403
- threshold?: number | undefined;
3404
- reps?: number | undefined;
3405
- provider?: "openai" | "anthropic" | "google" | undefined;
3406
- apiKeyEnvVar?: string | undefined;
3407
- temperature?: number | undefined;
3408
- maxToolOutputSize?: number | undefined;
3409
- } | undefined;
3410
- responseSize?: {
3411
- maxBytes?: number | undefined;
3412
- minBytes?: number | undefined;
3413
- } | undefined;
3414
- } | undefined;
3415
- }, {
3416
- id: string;
3417
- args?: Record<string, unknown> | undefined;
3418
- mode?: "direct" | "llm_host" | undefined;
3419
- metadata?: Record<string, unknown> | undefined;
3420
- description?: string | undefined;
3421
- toolName?: string | undefined;
3422
- scenario?: string | undefined;
3423
- llmHostConfig?: {
3424
- provider: "openai" | "anthropic" | "azure" | "google" | "mistral" | "deepseek" | "openrouter" | "xai" | "vertex-anthropic";
3425
- model?: string | undefined;
3426
- maxTokens?: number | undefined;
3427
- apiKeyEnvVar?: string | undefined;
3428
- temperature?: number | undefined;
3429
- maxToolCalls?: number | undefined;
3430
- } | undefined;
3431
- iterations?: number | undefined;
3432
- accuracyThreshold?: number | undefined;
3433
- judgeReps?: number | undefined;
3434
- canonicalAnswer?: string | undefined;
3435
- tags?: string[] | undefined;
3436
- expect?: {
3437
- response?: unknown;
3438
- isError?: string | boolean | string[] | undefined;
3439
- schema?: string | undefined;
3440
- snapshot?: string | undefined;
3441
- toolsTriggered?: {
3442
- calls: {
3443
- name: string;
3444
- required?: boolean | undefined;
3445
- arguments?: Record<string, unknown> | undefined;
3446
- }[];
3447
- order?: "strict" | "any" | undefined;
3448
- exclusive?: boolean | undefined;
3449
- } | undefined;
3450
- toolCallCount?: {
3451
- exact?: number | undefined;
3452
- min?: number | undefined;
3453
- max?: number | undefined;
3454
- } | undefined;
3455
- containsText?: string | string[] | undefined;
3456
- matchesPattern?: string | string[] | undefined;
3457
- snapshotSanitizers?: ("uuid" | "jwt" | "timestamp" | "iso-date" | "objectId" | {
3458
- pattern: string;
3459
- replacement?: string | undefined;
3460
- } | {
3461
- remove: string[];
3462
- })[] | undefined;
3463
- passesJudge?: {
3464
- rubric: "correctness" | "completeness" | "groundedness" | "instruction-following" | "conciseness" | {
3465
- text: string;
3466
- };
3467
- model?: string | undefined;
3468
- maxTokens?: number | undefined;
3469
- maxBudgetUsd?: number | undefined;
3470
- reference?: unknown;
3471
- threshold?: number | undefined;
3472
- reps?: number | undefined;
3473
- provider?: "openai" | "anthropic" | "google" | undefined;
3474
- apiKeyEnvVar?: string | undefined;
3475
- temperature?: number | undefined;
3476
- maxToolOutputSize?: number | undefined;
3477
- } | undefined;
3478
- responseSize?: {
3479
- maxBytes?: number | undefined;
3480
- minBytes?: number | undefined;
3481
- } | undefined;
3482
- } | undefined;
3483
- }>;
2949
+ }, z.core.$strip>>;
2950
+ }, z.core.$strip>>;
2951
+ }, z.core.$strip>;
3484
2952
  /**
3485
2953
  * Zod schema for EvalDataset (without schemas field, as schemas aren't serializable)
3486
2954
  */
@@ -3490,542 +2958,106 @@ declare const EvalDatasetSchema: z.ZodObject<{
3490
2958
  cases: z.ZodArray<z.ZodObject<{
3491
2959
  id: z.ZodString;
3492
2960
  description: z.ZodOptional<z.ZodString>;
3493
- mode: z.ZodOptional<z.ZodEnum<["direct", "llm_host"]>>;
2961
+ mode: z.ZodOptional<z.ZodEnum<{
2962
+ direct: "direct";
2963
+ mcp_host: "mcp_host";
2964
+ }>>;
3494
2965
  toolName: z.ZodOptional<z.ZodString>;
3495
2966
  args: z.ZodOptional<z.ZodRecord<z.ZodString, z.ZodUnknown>>;
3496
2967
  scenario: z.ZodOptional<z.ZodString>;
3497
- llmHostConfig: z.ZodOptional<z.ZodObject<{
3498
- provider: z.ZodEnum<["openai", "anthropic", "azure", "google", "mistral", "deepseek", "openrouter", "xai", "vertex-anthropic"]>;
2968
+ mcpHostConfig: z.ZodOptional<z.ZodObject<{
2969
+ provider: z.ZodEnum<{
2970
+ openai: "openai";
2971
+ anthropic: "anthropic";
2972
+ azure: "azure";
2973
+ google: "google";
2974
+ mistral: "mistral";
2975
+ deepseek: "deepseek";
2976
+ openrouter: "openrouter";
2977
+ xai: "xai";
2978
+ "vertex-anthropic": "vertex-anthropic";
2979
+ }>;
3499
2980
  apiKeyEnvVar: z.ZodOptional<z.ZodString>;
3500
2981
  model: z.ZodOptional<z.ZodString>;
3501
2982
  maxTokens: z.ZodOptional<z.ZodNumber>;
3502
2983
  temperature: z.ZodOptional<z.ZodNumber>;
3503
2984
  maxToolCalls: z.ZodOptional<z.ZodNumber>;
3504
- }, "strip", z.ZodTypeAny, {
3505
- provider: "openai" | "anthropic" | "azure" | "google" | "mistral" | "deepseek" | "openrouter" | "xai" | "vertex-anthropic";
3506
- model?: string | undefined;
3507
- maxTokens?: number | undefined;
3508
- apiKeyEnvVar?: string | undefined;
3509
- temperature?: number | undefined;
3510
- maxToolCalls?: number | undefined;
3511
- }, {
3512
- provider: "openai" | "anthropic" | "azure" | "google" | "mistral" | "deepseek" | "openrouter" | "xai" | "vertex-anthropic";
3513
- model?: string | undefined;
3514
- maxTokens?: number | undefined;
3515
- apiKeyEnvVar?: string | undefined;
3516
- temperature?: number | undefined;
3517
- maxToolCalls?: number | undefined;
3518
- }>>;
2985
+ }, z.core.$strip>>;
3519
2986
  metadata: z.ZodOptional<z.ZodRecord<z.ZodString, z.ZodUnknown>>;
3520
2987
  iterations: z.ZodOptional<z.ZodNumber>;
3521
2988
  accuracyThreshold: z.ZodOptional<z.ZodNumber>;
3522
2989
  judgeReps: z.ZodOptional<z.ZodNumber>;
3523
2990
  canonicalAnswer: z.ZodOptional<z.ZodString>;
3524
- tags: z.ZodOptional<z.ZodArray<z.ZodString, "many">>;
2991
+ tags: z.ZodOptional<z.ZodArray<z.ZodString>>;
3525
2992
  expect: z.ZodOptional<z.ZodObject<{
3526
2993
  response: z.ZodOptional<z.ZodUnknown>;
3527
2994
  schema: z.ZodOptional<z.ZodString>;
3528
- containsText: z.ZodOptional<z.ZodUnion<[z.ZodString, z.ZodArray<z.ZodString, "many">]>>;
3529
- matchesPattern: z.ZodOptional<z.ZodUnion<[z.ZodString, z.ZodArray<z.ZodString, "many">]>>;
2995
+ containsText: z.ZodOptional<z.ZodUnion<readonly [z.ZodString, z.ZodArray<z.ZodString>]>>;
2996
+ matchesPattern: z.ZodOptional<z.ZodUnion<readonly [z.ZodString, z.ZodArray<z.ZodString>]>>;
3530
2997
  snapshot: z.ZodOptional<z.ZodString>;
3531
- snapshotSanitizers: z.ZodOptional<z.ZodArray<z.ZodUnion<[z.ZodEnum<["timestamp", "uuid", "iso-date", "objectId", "jwt"]>, z.ZodObject<{
2998
+ snapshotSanitizers: z.ZodOptional<z.ZodArray<z.ZodUnion<readonly [z.ZodEnum<{
2999
+ timestamp: "timestamp";
3000
+ uuid: "uuid";
3001
+ "iso-date": "iso-date";
3002
+ objectId: "objectId";
3003
+ jwt: "jwt";
3004
+ }>, z.ZodObject<{
3532
3005
  pattern: z.ZodString;
3533
3006
  replacement: z.ZodOptional<z.ZodString>;
3534
- }, "strip", z.ZodTypeAny, {
3535
- pattern: string;
3536
- replacement?: string | undefined;
3537
- }, {
3538
- pattern: string;
3539
- replacement?: string | undefined;
3540
- }>, z.ZodObject<{
3541
- remove: z.ZodArray<z.ZodString, "many">;
3542
- }, "strip", z.ZodTypeAny, {
3543
- remove: string[];
3544
- }, {
3545
- remove: string[];
3546
- }>]>, "many">>;
3547
- isError: z.ZodOptional<z.ZodUnion<[z.ZodBoolean, z.ZodString, z.ZodArray<z.ZodString, "many">]>>;
3007
+ }, z.core.$strip>, z.ZodObject<{
3008
+ remove: z.ZodArray<z.ZodString>;
3009
+ }, z.core.$strip>]>>>;
3010
+ isError: z.ZodOptional<z.ZodUnion<readonly [z.ZodBoolean, z.ZodString, z.ZodArray<z.ZodString>]>>;
3548
3011
  passesJudge: z.ZodOptional<z.ZodObject<{
3549
- rubric: z.ZodUnion<[z.ZodEnum<["correctness", "completeness", "groundedness", "instruction-following", "conciseness"]>, z.ZodObject<{
3012
+ rubric: z.ZodUnion<readonly [z.ZodEnum<{
3013
+ correctness: "correctness";
3014
+ completeness: "completeness";
3015
+ groundedness: "groundedness";
3016
+ "instruction-following": "instruction-following";
3017
+ conciseness: "conciseness";
3018
+ }>, z.ZodObject<{
3550
3019
  text: z.ZodString;
3551
- }, "strip", z.ZodTypeAny, {
3552
- text: string;
3553
- }, {
3554
- text: string;
3555
- }>]>;
3020
+ }, z.core.$strip>]>;
3556
3021
  reference: z.ZodOptional<z.ZodUnknown>;
3557
3022
  threshold: z.ZodOptional<z.ZodNumber>;
3558
3023
  reps: z.ZodOptional<z.ZodNumber>;
3559
- provider: z.ZodOptional<z.ZodEnum<["anthropic", "openai", "google"]>>;
3024
+ provider: z.ZodOptional<z.ZodEnum<{
3025
+ openai: "openai";
3026
+ anthropic: "anthropic";
3027
+ google: "google";
3028
+ }>>;
3560
3029
  model: z.ZodOptional<z.ZodString>;
3561
3030
  apiKeyEnvVar: z.ZodOptional<z.ZodString>;
3562
3031
  maxTokens: z.ZodOptional<z.ZodNumber>;
3563
3032
  temperature: z.ZodOptional<z.ZodNumber>;
3564
3033
  maxBudgetUsd: z.ZodOptional<z.ZodNumber>;
3565
3034
  maxToolOutputSize: z.ZodOptional<z.ZodNumber>;
3566
- }, "strip", z.ZodTypeAny, {
3567
- rubric: "correctness" | "completeness" | "groundedness" | "instruction-following" | "conciseness" | {
3568
- text: string;
3569
- };
3570
- model?: string | undefined;
3571
- maxTokens?: number | undefined;
3572
- maxBudgetUsd?: number | undefined;
3573
- reference?: unknown;
3574
- threshold?: number | undefined;
3575
- reps?: number | undefined;
3576
- provider?: "openai" | "anthropic" | "google" | undefined;
3577
- apiKeyEnvVar?: string | undefined;
3578
- temperature?: number | undefined;
3579
- maxToolOutputSize?: number | undefined;
3580
- }, {
3581
- rubric: "correctness" | "completeness" | "groundedness" | "instruction-following" | "conciseness" | {
3582
- text: string;
3583
- };
3584
- model?: string | undefined;
3585
- maxTokens?: number | undefined;
3586
- maxBudgetUsd?: number | undefined;
3587
- reference?: unknown;
3588
- threshold?: number | undefined;
3589
- reps?: number | undefined;
3590
- provider?: "openai" | "anthropic" | "google" | undefined;
3591
- apiKeyEnvVar?: string | undefined;
3592
- temperature?: number | undefined;
3593
- maxToolOutputSize?: number | undefined;
3594
- }>>;
3035
+ }, z.core.$strip>>;
3595
3036
  responseSize: z.ZodOptional<z.ZodObject<{
3596
3037
  maxBytes: z.ZodOptional<z.ZodNumber>;
3597
3038
  minBytes: z.ZodOptional<z.ZodNumber>;
3598
- }, "strip", z.ZodTypeAny, {
3599
- maxBytes?: number | undefined;
3600
- minBytes?: number | undefined;
3601
- }, {
3602
- maxBytes?: number | undefined;
3603
- minBytes?: number | undefined;
3604
- }>>;
3039
+ }, z.core.$strip>>;
3605
3040
  toolsTriggered: z.ZodOptional<z.ZodObject<{
3606
3041
  calls: z.ZodArray<z.ZodObject<{
3607
3042
  name: z.ZodString;
3608
3043
  arguments: z.ZodOptional<z.ZodRecord<z.ZodString, z.ZodUnknown>>;
3609
3044
  required: z.ZodOptional<z.ZodBoolean>;
3610
- }, "strip", z.ZodTypeAny, {
3611
- name: string;
3612
- required?: boolean | undefined;
3613
- arguments?: Record<string, unknown> | undefined;
3614
- }, {
3615
- name: string;
3616
- required?: boolean | undefined;
3617
- arguments?: Record<string, unknown> | undefined;
3618
- }>, "many">;
3619
- order: z.ZodOptional<z.ZodEnum<["strict", "any"]>>;
3045
+ }, z.core.$strip>>;
3046
+ order: z.ZodOptional<z.ZodEnum<{
3047
+ any: "any";
3048
+ strict: "strict";
3049
+ }>>;
3620
3050
  exclusive: z.ZodOptional<z.ZodBoolean>;
3621
- }, "strip", z.ZodTypeAny, {
3622
- calls: {
3623
- name: string;
3624
- required?: boolean | undefined;
3625
- arguments?: Record<string, unknown> | undefined;
3626
- }[];
3627
- order?: "strict" | "any" | undefined;
3628
- exclusive?: boolean | undefined;
3629
- }, {
3630
- calls: {
3631
- name: string;
3632
- required?: boolean | undefined;
3633
- arguments?: Record<string, unknown> | undefined;
3634
- }[];
3635
- order?: "strict" | "any" | undefined;
3636
- exclusive?: boolean | undefined;
3637
- }>>;
3051
+ }, z.core.$strip>>;
3638
3052
  toolCallCount: z.ZodOptional<z.ZodObject<{
3639
3053
  min: z.ZodOptional<z.ZodNumber>;
3640
3054
  max: z.ZodOptional<z.ZodNumber>;
3641
3055
  exact: z.ZodOptional<z.ZodNumber>;
3642
- }, "strip", z.ZodTypeAny, {
3643
- exact?: number | undefined;
3644
- min?: number | undefined;
3645
- max?: number | undefined;
3646
- }, {
3647
- exact?: number | undefined;
3648
- min?: number | undefined;
3649
- max?: number | undefined;
3650
- }>>;
3651
- }, "strip", z.ZodTypeAny, {
3652
- response?: unknown;
3653
- isError?: string | boolean | string[] | undefined;
3654
- schema?: string | undefined;
3655
- snapshot?: string | undefined;
3656
- toolsTriggered?: {
3657
- calls: {
3658
- name: string;
3659
- required?: boolean | undefined;
3660
- arguments?: Record<string, unknown> | undefined;
3661
- }[];
3662
- order?: "strict" | "any" | undefined;
3663
- exclusive?: boolean | undefined;
3664
- } | undefined;
3665
- toolCallCount?: {
3666
- exact?: number | undefined;
3667
- min?: number | undefined;
3668
- max?: number | undefined;
3669
- } | undefined;
3670
- containsText?: string | string[] | undefined;
3671
- matchesPattern?: string | string[] | undefined;
3672
- snapshotSanitizers?: ("uuid" | "jwt" | "timestamp" | "iso-date" | "objectId" | {
3673
- pattern: string;
3674
- replacement?: string | undefined;
3675
- } | {
3676
- remove: string[];
3677
- })[] | undefined;
3678
- passesJudge?: {
3679
- rubric: "correctness" | "completeness" | "groundedness" | "instruction-following" | "conciseness" | {
3680
- text: string;
3681
- };
3682
- model?: string | undefined;
3683
- maxTokens?: number | undefined;
3684
- maxBudgetUsd?: number | undefined;
3685
- reference?: unknown;
3686
- threshold?: number | undefined;
3687
- reps?: number | undefined;
3688
- provider?: "openai" | "anthropic" | "google" | undefined;
3689
- apiKeyEnvVar?: string | undefined;
3690
- temperature?: number | undefined;
3691
- maxToolOutputSize?: number | undefined;
3692
- } | undefined;
3693
- responseSize?: {
3694
- maxBytes?: number | undefined;
3695
- minBytes?: number | undefined;
3696
- } | undefined;
3697
- }, {
3698
- response?: unknown;
3699
- isError?: string | boolean | string[] | undefined;
3700
- schema?: string | undefined;
3701
- snapshot?: string | undefined;
3702
- toolsTriggered?: {
3703
- calls: {
3704
- name: string;
3705
- required?: boolean | undefined;
3706
- arguments?: Record<string, unknown> | undefined;
3707
- }[];
3708
- order?: "strict" | "any" | undefined;
3709
- exclusive?: boolean | undefined;
3710
- } | undefined;
3711
- toolCallCount?: {
3712
- exact?: number | undefined;
3713
- min?: number | undefined;
3714
- max?: number | undefined;
3715
- } | undefined;
3716
- containsText?: string | string[] | undefined;
3717
- matchesPattern?: string | string[] | undefined;
3718
- snapshotSanitizers?: ("uuid" | "jwt" | "timestamp" | "iso-date" | "objectId" | {
3719
- pattern: string;
3720
- replacement?: string | undefined;
3721
- } | {
3722
- remove: string[];
3723
- })[] | undefined;
3724
- passesJudge?: {
3725
- rubric: "correctness" | "completeness" | "groundedness" | "instruction-following" | "conciseness" | {
3726
- text: string;
3727
- };
3728
- model?: string | undefined;
3729
- maxTokens?: number | undefined;
3730
- maxBudgetUsd?: number | undefined;
3731
- reference?: unknown;
3732
- threshold?: number | undefined;
3733
- reps?: number | undefined;
3734
- provider?: "openai" | "anthropic" | "google" | undefined;
3735
- apiKeyEnvVar?: string | undefined;
3736
- temperature?: number | undefined;
3737
- maxToolOutputSize?: number | undefined;
3738
- } | undefined;
3739
- responseSize?: {
3740
- maxBytes?: number | undefined;
3741
- minBytes?: number | undefined;
3742
- } | undefined;
3743
- }>>;
3744
- }, "strip", z.ZodTypeAny, {
3745
- id: string;
3746
- args?: Record<string, unknown> | undefined;
3747
- mode?: "direct" | "llm_host" | undefined;
3748
- metadata?: Record<string, unknown> | undefined;
3749
- description?: string | undefined;
3750
- toolName?: string | undefined;
3751
- scenario?: string | undefined;
3752
- llmHostConfig?: {
3753
- provider: "openai" | "anthropic" | "azure" | "google" | "mistral" | "deepseek" | "openrouter" | "xai" | "vertex-anthropic";
3754
- model?: string | undefined;
3755
- maxTokens?: number | undefined;
3756
- apiKeyEnvVar?: string | undefined;
3757
- temperature?: number | undefined;
3758
- maxToolCalls?: number | undefined;
3759
- } | undefined;
3760
- iterations?: number | undefined;
3761
- accuracyThreshold?: number | undefined;
3762
- judgeReps?: number | undefined;
3763
- canonicalAnswer?: string | undefined;
3764
- tags?: string[] | undefined;
3765
- expect?: {
3766
- response?: unknown;
3767
- isError?: string | boolean | string[] | undefined;
3768
- schema?: string | undefined;
3769
- snapshot?: string | undefined;
3770
- toolsTriggered?: {
3771
- calls: {
3772
- name: string;
3773
- required?: boolean | undefined;
3774
- arguments?: Record<string, unknown> | undefined;
3775
- }[];
3776
- order?: "strict" | "any" | undefined;
3777
- exclusive?: boolean | undefined;
3778
- } | undefined;
3779
- toolCallCount?: {
3780
- exact?: number | undefined;
3781
- min?: number | undefined;
3782
- max?: number | undefined;
3783
- } | undefined;
3784
- containsText?: string | string[] | undefined;
3785
- matchesPattern?: string | string[] | undefined;
3786
- snapshotSanitizers?: ("uuid" | "jwt" | "timestamp" | "iso-date" | "objectId" | {
3787
- pattern: string;
3788
- replacement?: string | undefined;
3789
- } | {
3790
- remove: string[];
3791
- })[] | undefined;
3792
- passesJudge?: {
3793
- rubric: "correctness" | "completeness" | "groundedness" | "instruction-following" | "conciseness" | {
3794
- text: string;
3795
- };
3796
- model?: string | undefined;
3797
- maxTokens?: number | undefined;
3798
- maxBudgetUsd?: number | undefined;
3799
- reference?: unknown;
3800
- threshold?: number | undefined;
3801
- reps?: number | undefined;
3802
- provider?: "openai" | "anthropic" | "google" | undefined;
3803
- apiKeyEnvVar?: string | undefined;
3804
- temperature?: number | undefined;
3805
- maxToolOutputSize?: number | undefined;
3806
- } | undefined;
3807
- responseSize?: {
3808
- maxBytes?: number | undefined;
3809
- minBytes?: number | undefined;
3810
- } | undefined;
3811
- } | undefined;
3812
- }, {
3813
- id: string;
3814
- args?: Record<string, unknown> | undefined;
3815
- mode?: "direct" | "llm_host" | undefined;
3816
- metadata?: Record<string, unknown> | undefined;
3817
- description?: string | undefined;
3818
- toolName?: string | undefined;
3819
- scenario?: string | undefined;
3820
- llmHostConfig?: {
3821
- provider: "openai" | "anthropic" | "azure" | "google" | "mistral" | "deepseek" | "openrouter" | "xai" | "vertex-anthropic";
3822
- model?: string | undefined;
3823
- maxTokens?: number | undefined;
3824
- apiKeyEnvVar?: string | undefined;
3825
- temperature?: number | undefined;
3826
- maxToolCalls?: number | undefined;
3827
- } | undefined;
3828
- iterations?: number | undefined;
3829
- accuracyThreshold?: number | undefined;
3830
- judgeReps?: number | undefined;
3831
- canonicalAnswer?: string | undefined;
3832
- tags?: string[] | undefined;
3833
- expect?: {
3834
- response?: unknown;
3835
- isError?: string | boolean | string[] | undefined;
3836
- schema?: string | undefined;
3837
- snapshot?: string | undefined;
3838
- toolsTriggered?: {
3839
- calls: {
3840
- name: string;
3841
- required?: boolean | undefined;
3842
- arguments?: Record<string, unknown> | undefined;
3843
- }[];
3844
- order?: "strict" | "any" | undefined;
3845
- exclusive?: boolean | undefined;
3846
- } | undefined;
3847
- toolCallCount?: {
3848
- exact?: number | undefined;
3849
- min?: number | undefined;
3850
- max?: number | undefined;
3851
- } | undefined;
3852
- containsText?: string | string[] | undefined;
3853
- matchesPattern?: string | string[] | undefined;
3854
- snapshotSanitizers?: ("uuid" | "jwt" | "timestamp" | "iso-date" | "objectId" | {
3855
- pattern: string;
3856
- replacement?: string | undefined;
3857
- } | {
3858
- remove: string[];
3859
- })[] | undefined;
3860
- passesJudge?: {
3861
- rubric: "correctness" | "completeness" | "groundedness" | "instruction-following" | "conciseness" | {
3862
- text: string;
3863
- };
3864
- model?: string | undefined;
3865
- maxTokens?: number | undefined;
3866
- maxBudgetUsd?: number | undefined;
3867
- reference?: unknown;
3868
- threshold?: number | undefined;
3869
- reps?: number | undefined;
3870
- provider?: "openai" | "anthropic" | "google" | undefined;
3871
- apiKeyEnvVar?: string | undefined;
3872
- temperature?: number | undefined;
3873
- maxToolOutputSize?: number | undefined;
3874
- } | undefined;
3875
- responseSize?: {
3876
- maxBytes?: number | undefined;
3877
- minBytes?: number | undefined;
3878
- } | undefined;
3879
- } | undefined;
3880
- }>, "many">;
3056
+ }, z.core.$strip>>;
3057
+ }, z.core.$strip>>;
3058
+ }, z.core.$strip>>;
3881
3059
  metadata: z.ZodOptional<z.ZodRecord<z.ZodString, z.ZodUnknown>>;
3882
- }, "strip", z.ZodTypeAny, {
3883
- name: string;
3884
- cases: {
3885
- id: string;
3886
- args?: Record<string, unknown> | undefined;
3887
- mode?: "direct" | "llm_host" | undefined;
3888
- metadata?: Record<string, unknown> | undefined;
3889
- description?: string | undefined;
3890
- toolName?: string | undefined;
3891
- scenario?: string | undefined;
3892
- llmHostConfig?: {
3893
- provider: "openai" | "anthropic" | "azure" | "google" | "mistral" | "deepseek" | "openrouter" | "xai" | "vertex-anthropic";
3894
- model?: string | undefined;
3895
- maxTokens?: number | undefined;
3896
- apiKeyEnvVar?: string | undefined;
3897
- temperature?: number | undefined;
3898
- maxToolCalls?: number | undefined;
3899
- } | undefined;
3900
- iterations?: number | undefined;
3901
- accuracyThreshold?: number | undefined;
3902
- judgeReps?: number | undefined;
3903
- canonicalAnswer?: string | undefined;
3904
- tags?: string[] | undefined;
3905
- expect?: {
3906
- response?: unknown;
3907
- isError?: string | boolean | string[] | undefined;
3908
- schema?: string | undefined;
3909
- snapshot?: string | undefined;
3910
- toolsTriggered?: {
3911
- calls: {
3912
- name: string;
3913
- required?: boolean | undefined;
3914
- arguments?: Record<string, unknown> | undefined;
3915
- }[];
3916
- order?: "strict" | "any" | undefined;
3917
- exclusive?: boolean | undefined;
3918
- } | undefined;
3919
- toolCallCount?: {
3920
- exact?: number | undefined;
3921
- min?: number | undefined;
3922
- max?: number | undefined;
3923
- } | undefined;
3924
- containsText?: string | string[] | undefined;
3925
- matchesPattern?: string | string[] | undefined;
3926
- snapshotSanitizers?: ("uuid" | "jwt" | "timestamp" | "iso-date" | "objectId" | {
3927
- pattern: string;
3928
- replacement?: string | undefined;
3929
- } | {
3930
- remove: string[];
3931
- })[] | undefined;
3932
- passesJudge?: {
3933
- rubric: "correctness" | "completeness" | "groundedness" | "instruction-following" | "conciseness" | {
3934
- text: string;
3935
- };
3936
- model?: string | undefined;
3937
- maxTokens?: number | undefined;
3938
- maxBudgetUsd?: number | undefined;
3939
- reference?: unknown;
3940
- threshold?: number | undefined;
3941
- reps?: number | undefined;
3942
- provider?: "openai" | "anthropic" | "google" | undefined;
3943
- apiKeyEnvVar?: string | undefined;
3944
- temperature?: number | undefined;
3945
- maxToolOutputSize?: number | undefined;
3946
- } | undefined;
3947
- responseSize?: {
3948
- maxBytes?: number | undefined;
3949
- minBytes?: number | undefined;
3950
- } | undefined;
3951
- } | undefined;
3952
- }[];
3953
- metadata?: Record<string, unknown> | undefined;
3954
- description?: string | undefined;
3955
- }, {
3956
- name: string;
3957
- cases: {
3958
- id: string;
3959
- args?: Record<string, unknown> | undefined;
3960
- mode?: "direct" | "llm_host" | undefined;
3961
- metadata?: Record<string, unknown> | undefined;
3962
- description?: string | undefined;
3963
- toolName?: string | undefined;
3964
- scenario?: string | undefined;
3965
- llmHostConfig?: {
3966
- provider: "openai" | "anthropic" | "azure" | "google" | "mistral" | "deepseek" | "openrouter" | "xai" | "vertex-anthropic";
3967
- model?: string | undefined;
3968
- maxTokens?: number | undefined;
3969
- apiKeyEnvVar?: string | undefined;
3970
- temperature?: number | undefined;
3971
- maxToolCalls?: number | undefined;
3972
- } | undefined;
3973
- iterations?: number | undefined;
3974
- accuracyThreshold?: number | undefined;
3975
- judgeReps?: number | undefined;
3976
- canonicalAnswer?: string | undefined;
3977
- tags?: string[] | undefined;
3978
- expect?: {
3979
- response?: unknown;
3980
- isError?: string | boolean | string[] | undefined;
3981
- schema?: string | undefined;
3982
- snapshot?: string | undefined;
3983
- toolsTriggered?: {
3984
- calls: {
3985
- name: string;
3986
- required?: boolean | undefined;
3987
- arguments?: Record<string, unknown> | undefined;
3988
- }[];
3989
- order?: "strict" | "any" | undefined;
3990
- exclusive?: boolean | undefined;
3991
- } | undefined;
3992
- toolCallCount?: {
3993
- exact?: number | undefined;
3994
- min?: number | undefined;
3995
- max?: number | undefined;
3996
- } | undefined;
3997
- containsText?: string | string[] | undefined;
3998
- matchesPattern?: string | string[] | undefined;
3999
- snapshotSanitizers?: ("uuid" | "jwt" | "timestamp" | "iso-date" | "objectId" | {
4000
- pattern: string;
4001
- replacement?: string | undefined;
4002
- } | {
4003
- remove: string[];
4004
- })[] | undefined;
4005
- passesJudge?: {
4006
- rubric: "correctness" | "completeness" | "groundedness" | "instruction-following" | "conciseness" | {
4007
- text: string;
4008
- };
4009
- model?: string | undefined;
4010
- maxTokens?: number | undefined;
4011
- maxBudgetUsd?: number | undefined;
4012
- reference?: unknown;
4013
- threshold?: number | undefined;
4014
- reps?: number | undefined;
4015
- provider?: "openai" | "anthropic" | "google" | undefined;
4016
- apiKeyEnvVar?: string | undefined;
4017
- temperature?: number | undefined;
4018
- maxToolOutputSize?: number | undefined;
4019
- } | undefined;
4020
- responseSize?: {
4021
- maxBytes?: number | undefined;
4022
- minBytes?: number | undefined;
4023
- } | undefined;
4024
- } | undefined;
4025
- }[];
4026
- metadata?: Record<string, unknown> | undefined;
4027
- description?: string | undefined;
4028
- }>;
3060
+ }, z.core.$strip>;
4029
3061
  /**
4030
3062
  * Type for serialized eval dataset (without Zod schemas)
4031
3063
  */
@@ -4122,15 +3154,15 @@ interface EvalRunMetadata {
4122
3154
  timestamp: string;
4123
3155
  /** Package version from package.json */
4124
3156
  packageVersion: string;
4125
- /** LLM host model identifier (if llm_host mode) */
4126
- llmHostModel?: string;
3157
+ /** MCP host model identifier (if mcp_host mode) */
3158
+ mcpHostModel?: string;
4127
3159
  /** Judge model identifier (if judge was used) */
4128
3160
  judgeModel?: string;
4129
3161
  }
4130
3162
  /**
4131
3163
  * Individual conformance check result
4132
3164
  */
4133
- interface MCPConformanceCheck$1 {
3165
+ interface MCPConformanceCheck {
4134
3166
  /**
4135
3167
  * Check name (e.g., 'server_info_present', 'list_tools_succeeds')
4136
3168
  */
@@ -4159,7 +3191,7 @@ interface MCPConformanceResultData {
4159
3191
  /**
4160
3192
  * Individual check results
4161
3193
  */
4162
- checks: MCPConformanceCheck$1[];
3194
+ checks: MCPConformanceCheck[];
4163
3195
  /**
4164
3196
  * Server info if available
4165
3197
  */
@@ -4282,12 +3314,6 @@ interface EvalCaseResult {
4282
3314
  * Only present when the case was run with `iterations > 1`.
4283
3315
  */
4284
3316
  infrastructureErrorRate?: number;
4285
- /**
4286
- * Accuracy score (0–1) across all iterations.
4287
- * Alias for `assertionPassRate`. Only present when the case was run with `iterations > 1`.
4288
- * @deprecated Use `assertionPassRate` for clarity; this field is kept for backward compatibility.
4289
- */
4290
- accuracy?: number;
4291
3317
  /**
4292
3318
  * Per-iteration pass/fail breakdown.
4293
3319
  * Only present when the case was run with `iterations > 1`.
@@ -4300,7 +3326,7 @@ interface EvalCaseResult {
4300
3326
  /**
4301
3327
  * Precision of tool calls made (0–1).
4302
3328
  * 1.0 means every tool called was expected; <1.0 means unexpected tools were called.
4303
- * Only populated when exclusive: true in toolsTriggered and the expectation was evaluated.
3329
+ * Populated whenever a `toolsTriggered` expectation is evaluated.
4304
3330
  */
4305
3331
  toolPrecision?: number;
4306
3332
  /**
@@ -4319,6 +3345,23 @@ interface EvalCaseResult {
4319
3345
  * Only present when the case was run with `iterations > 1`.
4320
3346
  */
4321
3347
  infrastructureErrorCount?: number;
3348
+ /**
3349
+ * Ordered trace of tool calls made by the LLM in mcp_host mode.
3350
+ * Only populated when the eval case uses toolsTriggered expectations.
3351
+ */
3352
+ mcpHostTrace?: {
3353
+ /** The ordered sequence of tool calls made by the LLM */
3354
+ calls: Array<{
3355
+ name: string;
3356
+ arguments: Record<string, unknown>;
3357
+ /** 'expected' = was in the expected set, 'unexpected' = was not expected */
3358
+ status: 'expected' | 'unexpected';
3359
+ }>;
3360
+ /** Tools that were required but never called */
3361
+ missed: Array<{
3362
+ name: string;
3363
+ }>;
3364
+ };
4322
3365
  }
4323
3366
  /**
4324
3367
  * Aggregated MCP eval run data
@@ -4462,13 +3505,13 @@ interface EvalRunnerResult {
4462
3505
  */
4463
3506
  improvements?: number;
4464
3507
  /**
4465
- * Average tool precision across all llm_host cases that have a
3508
+ * Average tool precision across all mcp_host cases that have a
4466
3509
  * `toolsTriggered` expectation (precision = fraction of called tools
4467
3510
  * that were expected). Only present when at least one such case ran.
4468
3511
  */
4469
3512
  datasetToolPrecision?: number;
4470
3513
  /**
4471
- * Average tool recall across all llm_host cases that have a
3514
+ * Average tool recall across all mcp_host cases that have a
4472
3515
  * `toolsTriggered` expectation (recall = fraction of required tools
4473
3516
  * that were actually called). Only present when at least one such case ran.
4474
3517
  */
@@ -4523,7 +3566,7 @@ interface EvalRunnerOptions {
4523
3566
  */
4524
3567
  concurrency?: number;
4525
3568
  /**
4526
- * Default iteration count for `llm_host` mode cases that do not specify
3569
+ * Default iteration count for `mcp_host` mode cases that do not specify
4527
3570
  * `iterations` explicitly. Has no effect on `direct` mode cases (which are
4528
3571
  * deterministic and always default to 1 iteration).
4529
3572
  *
@@ -4534,7 +3577,7 @@ interface EvalRunnerOptions {
4534
3577
  *
4535
3578
  * @example
4536
3579
  * ```typescript
4537
- * // Run all llm_host cases 10 times each by default
3580
+ * // Run all mcp_host cases 10 times each by default
4538
3581
  * await runEvalDataset({ dataset, defaultLlmIterations: 10 }, { mcp });
4539
3582
  * ```
4540
3583
  */
@@ -4567,12 +3610,12 @@ interface EvalRunnerOptions {
4567
3610
  */
4568
3611
  baselineResultsFrom?: string;
4569
3612
  /**
4570
- * LLM host model identifier to record in run metadata.
4571
- * Use this to identify which model was used when running llm_host cases.
3613
+ * MCP host model identifier to record in run metadata.
3614
+ * Use this to identify which model was used when running mcp_host cases.
4572
3615
  *
4573
3616
  * @example 'claude-opus-4-20250514'
4574
3617
  */
4575
- llmHostModel?: string;
3618
+ mcpHostModel?: string;
4576
3619
  /**
4577
3620
  * Judge model identifier to record in run metadata.
4578
3621
  * Use this to identify which model was used for judge evaluations.
@@ -4660,8 +3703,6 @@ interface ServerComparisonResult {
4660
3703
  ties: number;
4661
3704
  /** Cases where both failed */
4662
3705
  bothFail: number;
4663
- /** Raw count of cases where both servers failed (same as bothFail) */
4664
- bothFailCount: number;
4665
3706
  /** Cases with a decisive outcome (aWins + bWins + ties, excludes BOTH_FAIL) */
4666
3707
  decidedCases: number;
4667
3708
  /** Fraction of total cases where both servers failed (bothFail / total) */
@@ -4712,7 +3753,7 @@ type ServerComparisonOptions = Omit<EvalRunnerOptions, 'saveResultsTo' | 'baseli
4712
3753
  declare function runServerComparison(options: ServerComparisonOptions, contextA: EvalContext, contextB: EvalContext): Promise<ServerComparisonResult>;
4713
3754
 
4714
3755
  /**
4715
- * LLM Host Simulation - Main entry point
3756
+ * MCP Host Simulation - Main entry point
4716
3757
  *
4717
3758
  * All providers (openai, anthropic, google, azure, mistral, deepseek,
4718
3759
  * openrouter, xai) run through the Vercel AI SDK orchestrator, which uses
@@ -4731,7 +3772,7 @@ declare function runServerComparison(options: ServerComparisonOptions, contextA:
4731
3772
  */
4732
3773
 
4733
3774
  /**
4734
- * Simulates an LLM host interacting with an MCP server.
3775
+ * Simulates an MCP host interacting with an MCP server.
4735
3776
  *
4736
3777
  * The LLM chooses which tools to call based solely on their descriptions and
4737
3778
  * schemas, testing discoverability and parameter clarity at the level a real
@@ -4743,12 +3784,12 @@ declare function runServerComparison(options: ServerComparisonOptions, contextA:
4743
3784
  *
4744
3785
  * @param mcp - MCP fixture API
4745
3786
  * @param scenario - Natural language prompt describing what the LLM should do
4746
- * @param config - LLM host configuration (provider, model, temperature, etc.)
3787
+ * @param config - MCP host configuration (provider, model, temperature, etc.)
4747
3788
  * @returns Simulation result with tool calls, final response, and latency data
4748
3789
  *
4749
3790
  * @example
4750
3791
  * ```typescript
4751
- * const result = await simulateLLMHost(mcp,
3792
+ * const result = await simulateMCPHost(mcp,
4752
3793
  * "Find recent documents about MCP testing frameworks",
4753
3794
  * { provider: 'anthropic', model: 'claude-3-5-sonnet-20241022' }
4754
3795
  * );
@@ -4757,7 +3798,7 @@ declare function runServerComparison(options: ServerComparisonOptions, contextA:
4757
3798
  * expect(result.toolCalls.map(c => c.name)).toContain('search');
4758
3799
  * ```
4759
3800
  */
4760
- declare function simulateLLMHost(mcp: MCPFixtureApi, scenario: string, config: LLMHostConfig): Promise<LLMHostSimulationResult>;
3801
+ declare function simulateMCPHost(mcp: MCPFixtureApi, scenario: string, config: MCPHostConfig): Promise<MCPHostSimulationResult>;
4761
3802
  /**
4762
3803
  * Returns true if the given provider is supported.
4763
3804
  *
@@ -4836,14 +3877,6 @@ interface MCPConformanceOptions {
4836
3877
  */
4837
3878
  checkPrompts?: boolean;
4838
3879
  }
4839
- /**
4840
- * Individual check result
4841
- */
4842
- interface MCPConformanceCheck {
4843
- name: string;
4844
- pass: boolean;
4845
- message: string;
4846
- }
4847
3880
  /**
4848
3881
  * Raw MCP responses for snapshotting
4849
3882
  */
@@ -4976,4 +4009,4 @@ interface MCPEvalReporterConfig {
4976
4009
  includeAutoTracking?: boolean;
4977
4010
  }
4978
4011
 
4979
- export { type AuthType, BUILT_IN_RUBRICS, type BuiltInRubric, type BuiltInSanitizer, CLIOAuthClient, type CLIOAuthClientConfig, type CLIOAuthResult, type CaseComparisonResult, type ClientCredentialsConfig, type ComparisonOutcome, type ContentBlock, type CreateMCPClientOptions, DiscoveryError, ENV_VAR_NAMES, type EvalCase, type EvalCaseResult, EvalCaseSchema, type EvalContext, type EvalDataset, EvalDatasetSchema, type EvalExpectBlock, type EvalExpectationResult, type EvalMode, type EvalRunnerOptions, type EvalRunnerResult, type ExpectationBreakdown, type ExpectationResultMap, type ExpectationType, type FieldRemovalSanitizer, type HttpMCPConfig, type IterationResult, type Judge, type JudgeConfig, type JudgeMatcherOptions, type JudgeResult, type JudgeValidatorConfig, type LLMHostConfig, type LLMHostSimulationResult, type LLMHostSimulator, type LLMProvider, type LLMToolCall, type LoadDatasetOptions, type MCPAuthConfig, type MCPAuthFixtures, type MCPClientCredentialsConfig, type MCPConfig, MCPConfigSchema, type MCPConformanceCheck, type MCPConformanceOptions, type MCPConformanceRaw, type MCPConformanceResult, type MCPConformanceResultData, type MCPEvalData, type MCPEvalHistoricalSummary, type MCPEvalReporterConfig, type MCPEvalRunData, type MCPFixtureApi, type MCPFixtureOptions, type MCPHostCapabilities, type MCPOAuthConfig, type MCPServerCapabilitiesData, MCP_PROTOCOL_VERSION, type NormalizedToolResponse, type OAuthSetupConfig, type PatternValidatorOptions, PlaywrightOAuthClientProvider, type PlaywrightOAuthClientProviderConfig, type PredicateResult, type ProtectedResourceDiscoveryResult, type ProtectedResourceMetadata, type ProviderKind, type RegexSanitizer, type ResultSource, type RubricSpec, type SchemaRegistry, type SchemaValidatorOptions, type SerializedEvalDataset, type ServerComparisonOptions, type ServerComparisonResult, type SizeValidatorOptions, type SnapshotSanitizer, SnapshotSanitizers, type StdioMCPConfig, type StoredClientInfo, type StoredOAuthState, 
type StoredServerMetadata, type StoredTokens, type TextValidatorOptions, type TokenResult, type ToolCallCountOptions, type ToolCallExpectation, type ToolPredicate, type UsageMetrics, type ValidationResult, closeMCPClient, createJudge, createMCPClientForConfig, createMCPFixture, createTokenAuthHeaders, discoverAuthorizationServer, discoverProtectedResource, expect, extractText, getMissingDependencyMessage, getResponseSizeBytes, hasValidTokens, injectTokens, isBuiltInRubric, isHttpConfig, isProviderAvailable, isStdioConfig, isTokenExpired, isTokenExpiringSoon, loadBaseline, loadEvalDataset, loadEvalDatasetFromObject, loadTokens, loadTokensFromEnv, test as mcpAuthTest, normalizeToolResponse, normalizeWhitespace, performClientCredentialsFlow, performOAuthSetup, performOAuthSetupIfNeeded, resolveRubric, runConformanceChecks, runEvalCase, runEvalDataset, runServerComparison, saveBaseline, simulateLLMHost, test$1 as test, validateAccessToken, validateError, validateEvalCase, validateEvalDataset, validateJudge, validateMCPConfig, validatePattern, validateResponse, validateSchema, validateSize, validateText, validateToolCallCount, validateToolCalls };
4012
+ export { type AuthType, BUILT_IN_RUBRICS, type BuiltInRubric, type BuiltInSanitizer, CLIOAuthClient, type CLIOAuthClientConfig, type CLIOAuthResult, type CaseComparisonResult, type ClientCredentialsConfig, type ComparisonOutcome, type ContentBlock, type CreateMCPClientOptions, DiscoveryError, ENV_VAR_NAMES, type EvalCase, type EvalCaseResult, EvalCaseSchema, type EvalContext, type EvalDataset, EvalDatasetSchema, type EvalExpectBlock, type EvalExpectationResult, type EvalMode, type EvalRunnerOptions, type EvalRunnerResult, type ExpectationBreakdown, type ExpectationResultMap, type ExpectationType, type FieldRemovalSanitizer, type HttpMCPConfig, type IterationResult, type Judge, type JudgeConfig, type JudgeMatcherOptions, type JudgeResult, type JudgeValidatorConfig, type LLMProvider, type LLMToolCall, type LoadDatasetOptions, type MCPAuthConfig, type MCPAuthFixtures, type MCPClientCredentialsConfig, type MCPConfig, MCPConfigSchema, type MCPConformanceCheck, type MCPConformanceOptions, type MCPConformanceRaw, type MCPConformanceResult, type MCPConformanceResultData, type MCPEvalData, type MCPEvalHistoricalSummary, type MCPEvalReporterConfig, type MCPEvalRunData, type MCPFixtureApi, type MCPFixtureOptions, type MCPHostCapabilities, type MCPHostConfig, type MCPHostSimulationResult, type MCPHostSimulator, type MCPOAuthConfig, type MCPServerCapabilitiesData, MCP_PROTOCOL_VERSION, type NormalizedToolResponse, type OAuthSetupConfig, type PatternValidatorOptions, PlaywrightOAuthClientProvider, type PlaywrightOAuthClientProviderConfig, type PredicateResult, type ProtectedResourceDiscoveryResult, type ProtectedResourceMetadata, type ProviderKind, type RegexSanitizer, type ResultSource, type RubricSpec, type SchemaRegistry, type SchemaValidatorOptions, type SerializedEvalDataset, type ServerComparisonOptions, type ServerComparisonResult, type SizeValidatorOptions, type SnapshotSanitizer, SnapshotSanitizers, type StdioMCPConfig, type StoredClientInfo, type StoredOAuthState, 
type StoredServerMetadata, type StoredTokens, type TextValidatorOptions, type TokenResult, type ToolCallCountOptions, type ToolCallExpectation, type ToolPredicate, type UsageMetrics, type ValidationResult, closeMCPClient, createJudge, createMCPClientForConfig, createMCPFixture, createTokenAuthHeaders, discoverAuthorizationServer, discoverProtectedResource, expect, extractText, getMissingDependencyMessage, getResponseSizeBytes, hasValidTokens, injectTokens, isBuiltInRubric, isHttpConfig, isProviderAvailable, isStdioConfig, isTokenExpired, isTokenExpiringSoon, loadBaseline, loadEvalDataset, loadEvalDatasetFromObject, loadTokens, loadTokensFromEnv, test as mcpAuthTest, normalizeToolResponse, normalizeWhitespace, performClientCredentialsFlow, performOAuthSetup, performOAuthSetupIfNeeded, resolveRubric, runConformanceChecks, runEvalCase, runEvalDataset, runServerComparison, saveBaseline, simulateMCPHost, test$1 as test, validateAccessToken, validateError, validateEvalCase, validateEvalDataset, validateJudge, validateMCPConfig, validatePattern, validateResponse, validateSchema, validateSize, validateText, validateToolCallCount, validateToolCalls };