@gleanwork/mcp-server-tester 1.0.0-beta.7 → 1.0.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
package/dist/index.d.cts CHANGED
@@ -1,10 +1,10 @@
1
1
  import { z, ZodType } from 'zod';
2
+ import { Page, TestInfo, Expect } from '@playwright/test';
2
3
  import { OAuthClientProvider } from '@modelcontextprotocol/sdk/client/auth.js';
3
4
  import { OAuthClientMetadata, OAuthClientInformationFull, OAuthTokens } from '@modelcontextprotocol/sdk/shared/auth.js';
4
5
  import * as oauth from 'oauth4webapi';
5
6
  import { Client } from '@modelcontextprotocol/sdk/client/index.js';
6
7
  import { CallToolResult, Tool, Implementation, ServerCapabilities, Resource, Prompt } from '@modelcontextprotocol/sdk/types.js';
7
- import { TestInfo, Expect } from '@playwright/test';
8
8
  import * as playwright_test from 'playwright/test';
9
9
 
10
10
  /**
@@ -311,6 +311,7 @@ declare function isHttpConfig(config: MCPConfig): config is HttpMCPConfig;
311
311
  /**
312
312
  * Auth types for MCP OAuth integration
313
313
  */
314
+
314
315
  /**
315
316
  * Stored OAuth tokens
316
317
  */
@@ -384,70 +385,90 @@ interface StoredOAuthState {
384
385
  savedAt: number;
385
386
  }
386
387
  /**
387
- * Configuration for OAuth setup flow
388
+ * Login form selectors for standard OAuth login automation
389
+ */
390
+ interface OAuthLoginSelectors {
391
+ /** Selector for username/email input field */
392
+ usernameInput: string;
393
+ /** Selector for password input field */
394
+ passwordInput: string;
395
+ /** Selector for login submit button */
396
+ submitButton: string;
397
+ /** Selector for consent/authorize button (optional) */
398
+ consentButton?: string;
399
+ }
400
+ /**
401
+ * Base configuration shared by all OAuth setup strategies
388
402
  */
389
- interface OAuthSetupConfig {
390
- /**
391
- * OAuth authorization server metadata URL
392
- */
403
+ interface OAuthSetupBaseConfig {
404
+ /** OAuth authorization server metadata URL */
393
405
  authServerUrl: string;
394
- /**
395
- * Scopes to request
396
- */
406
+ /** Scopes to request */
397
407
  scopes: Array<string>;
398
- /**
399
- * Resource indicator (RFC 8707)
400
- */
408
+ /** Path to save OAuth state file */
409
+ outputPath: string;
410
+ /** Pre-registered client ID (optional, uses DCR if not provided) */
411
+ clientId?: string;
412
+ /** Pre-registered client secret (optional) */
413
+ clientSecret?: string;
414
+ /** Redirect URI for OAuth callback */
415
+ redirectUri?: string;
416
+ /** Resource indicator (RFC 8707) */
401
417
  resource?: string;
402
- /**
403
- * Login form selectors for automation
404
- */
405
- loginSelectors: {
406
- /**
407
- * Selector for username/email input field
408
- */
409
- usernameInput: string;
410
- /**
411
- * Selector for password input field
412
- */
413
- passwordInput: string;
414
- /**
415
- * Selector for login submit button
416
- */
417
- submitButton: string;
418
- /**
419
- * Selector for consent/authorize button (optional)
420
- */
421
- consentButton?: string;
422
- };
423
- /**
424
- * Test user credentials
425
- */
418
+ /** Timeout for login flow in milliseconds (default: 30000) */
419
+ timeoutMs?: number;
420
+ }
421
+ /**
422
+ * Standard login strategy: automates a form with username, password, and submit button.
423
+ * Use when the IdP presents all login fields on a single page.
424
+ */
425
+ interface StandardLoginConfig {
426
+ /** Login form selectors for Playwright automation */
427
+ loginSelectors: OAuthLoginSelectors;
428
+ /** Test user credentials */
426
429
  credentials: {
427
430
  username: string;
428
431
  password: string;
429
432
  };
433
+ customLoginFlow?: never;
434
+ }
435
+ /**
436
+ * Custom login strategy: full control over the browser-based login flow.
437
+ * Use for multi-step logins, MFA, custom consent screens, or any flow
438
+ * that doesn't fit the standard username/password/submit pattern.
439
+ *
440
+ * The callback receives a Playwright Page already navigated to the OAuth
441
+ * authorization URL. Complete the login so the IdP redirects to the
442
+ * callback URL — `performOAuthSetup` handles PKCE, token exchange,
443
+ * and state persistence automatically.
444
+ */
445
+ interface CustomLoginConfig {
430
446
  /**
431
- * Path to save OAuth state file
432
- */
433
- outputPath: string;
434
- /**
435
- * Pre-registered client ID (optional, uses DCR if not provided)
436
- */
437
- clientId?: string;
438
- /**
439
- * Pre-registered client secret (optional)
440
- */
441
- clientSecret?: string;
442
- /**
443
- * Redirect URI for OAuth callback
444
- */
445
- redirectUri?: string;
446
- /**
447
- * Timeout for login flow in milliseconds (default: 30000)
447
+ * Custom Playwright automation for the IdP login flow.
448
+ *
449
+ * @param page - Playwright Page already navigated to the OAuth authorization URL
450
+ *
451
+ * @example
452
+ * ```typescript
453
+ * customLoginFlow: async (page) => {
454
+ * await page.fill('#username', process.env.TEST_USER!);
455
+ * await page.click('#continue');
456
+ * await page.fill('#password', process.env.TEST_PASS!);
457
+ * await page.click('#submit');
458
+ * }
459
+ * ```
448
460
  */
449
- timeoutMs?: number;
461
+ customLoginFlow: (page: Page) => Promise<void>;
462
+ loginSelectors?: never;
463
+ credentials?: never;
450
464
  }
465
+ /**
466
+ * Configuration for OAuth setup flow.
467
+ *
468
+ * Provide either `loginSelectors` + `credentials` for standard form-based login,
469
+ * or `customLoginFlow` for full control over the browser automation.
470
+ */
471
+ type OAuthSetupConfig = OAuthSetupBaseConfig & (StandardLoginConfig | CustomLoginConfig);
451
472
  /**
452
473
  * Result of token exchange or refresh
453
474
  */
@@ -714,6 +735,34 @@ interface AuthServerMetadata {
714
735
  */
715
736
  issuer: string;
716
737
  }
738
+ /**
739
+ * Configuration for token refresh
740
+ */
741
+ interface TokenRefreshConfig {
742
+ /**
743
+ * Authorization server metadata
744
+ */
745
+ authServer: AuthServerMetadata;
746
+ /**
747
+ * Client ID
748
+ */
749
+ clientId: string;
750
+ /**
751
+ * Client secret (for confidential clients)
752
+ */
753
+ clientSecret?: string;
754
+ /**
755
+ * Refresh token
756
+ */
757
+ refreshToken: string;
758
+ }
759
+ /**
760
+ * Refreshes an access token using a refresh token
761
+ *
762
+ * @param config - Token refresh configuration
763
+ * @returns New token result
764
+ */
765
+ declare function refreshAccessToken(config: TokenRefreshConfig): Promise<TokenResult>;
717
766
  /**
718
767
  * Configuration for client credentials grant
719
768
  */
@@ -1632,7 +1681,7 @@ interface UsageMetrics {
1632
1681
  cacheCreationInputTokens?: number;
1633
1682
  }
1634
1683
  /** Valid LLM judge provider kinds. */
1635
- type ProviderKind = 'anthropic' | 'openai' | 'google';
1684
+ type ProviderKind = 'anthropic' | 'vertex-anthropic' | 'anthropic-agent-sdk' | 'openai' | 'google';
1636
1685
  /**
1637
1686
  * Configuration for an LLM judge
1638
1687
  */
@@ -1744,8 +1793,11 @@ interface Judge {
1744
1793
  * Configuration for the judge validator
1745
1794
  */
1746
1795
  interface JudgeValidatorConfig {
1747
- /** The evaluation rubric: a built-in name or custom { text: string } */
1748
- rubric: RubricSpec;
1796
+ /**
1797
+ * The evaluation rubric: a built-in name or custom { text: string }.
1798
+ * Required when no named `judge` is specified.
1799
+ */
1800
+ rubric?: RubricSpec;
1749
1801
  /** Optional reference response to compare against */
1750
1802
  reference?: unknown;
1751
1803
  /** Minimum score required to pass (0-1, default: 0.7) */
@@ -1766,6 +1818,13 @@ interface JudgeValidatorConfig {
1766
1818
  maxBudgetUsd?: number;
1767
1819
  /** Fail if response exceeds this size in bytes before judging */
1768
1820
  maxToolOutputSize?: number;
1821
+ /**
1822
+ * Name of a registered custom judge executor.
1823
+ * When set, the named judge handles the entire evaluation pipeline
1824
+ * and returns a normalized score. The `threshold` determines pass/fail.
1825
+ * Register judges with `registerJudge()` before tests run.
1826
+ */
1827
+ judge?: string;
1769
1828
  }
1770
1829
  declare function validateJudge(response: unknown, config: JudgeValidatorConfig): Promise<ValidationResult>;
1771
1830
 
@@ -1823,6 +1882,12 @@ interface JudgeMatcherOptions {
1823
1882
  provider?: ProviderKind;
1824
1883
  /** Override the judge model */
1825
1884
  model?: string;
1885
+ /**
1886
+ * Name of a registered custom judge executor.
1887
+ * When set, the named judge handles the entire evaluation pipeline
1888
+ * and its `pass` result is authoritative.
1889
+ */
1890
+ judge?: string;
1826
1891
  }
1827
1892
  /**
1828
1893
  * Declaration merging for Playwright matchers
@@ -1913,21 +1978,30 @@ declare global {
1913
1978
  */
1914
1979
  toBeToolError(expected?: boolean | string | string[]): R;
1915
1980
  /**
1916
- * Validates that a response passes LLM-as-judge evaluation
1981
+ * Validates that a response passes LLM-as-judge evaluation.
1917
1982
  *
1918
- * @param rubric - Evaluation rubric/criteria
1919
- * @param options - Judge options
1983
+ * Call signatures (a third form, `toPassToolJudge([...judges])`, runs multiple judges and requires all of them to pass):
1984
+ * - With rubric: `toPassToolJudge(rubric, options?)` — built-in LLM judge
1985
+ * - With named judge: `toPassToolJudge({ judge: 'name' })` — custom judge executor
1920
1986
  *
1921
1987
  * @example
1922
1988
  * ```typescript
1989
+ * // Built-in LLM judge with rubric
1923
1990
  * expect(result).toPassToolJudge('Response should be helpful and accurate');
1924
- * expect(result).toPassToolJudge('Response should match reference', {
1991
+ * expect(result).toPassToolJudge('correctness', {
1925
1992
  * reference: expectedOutput,
1926
1993
  * passingThreshold: 0.8,
1927
1994
  * });
1995
+ *
1996
+ * // Named custom judge (registered via registerJudge)
1997
+ * expect(result).toPassToolJudge({ judge: 'glean-completeness' });
1928
1998
  * ```
1929
1999
  */
1930
2000
  toPassToolJudge(rubric: RubricSpec, options?: JudgeMatcherOptions): Promise<R>;
2001
+ toPassToolJudge(options: JudgeMatcherOptions): Promise<R>;
2002
+ toPassToolJudge(judges: Array<JudgeMatcherOptions & {
2003
+ rubric?: RubricSpec;
2004
+ }>): Promise<R>;
1931
2005
  /**
1932
2006
  * Validates that a response meets size constraints
1933
2007
  *
@@ -2050,6 +2124,33 @@ interface EvalExpectationResult {
2050
2124
  * Optional details about the result
2051
2125
  */
2052
2126
  details?: string;
2127
+ /**
2128
+ * Judge score (0-1). Populated for passesJudge expectations.
2129
+ */
2130
+ score?: number;
2131
+ /**
2132
+ * Judge reasoning. Populated for passesJudge expectations.
2133
+ */
2134
+ reasoning?: string;
2135
+ /**
2136
+ * Judge name — rubric name (e.g. 'correctness') or custom judge name.
2137
+ * Populated for passesJudge expectations.
2138
+ */
2139
+ judgeName?: string;
2140
+ /**
2141
+ * Judge provider used. Populated for passesJudge expectations.
2142
+ */
2143
+ judgeProvider?: string;
2144
+ /**
2145
+ * Judge model used. Populated for passesJudge expectations.
2146
+ */
2147
+ judgeModel?: string;
2148
+ /**
2149
+ * Per-judge breakdown when multiple judges are used.
2150
+ * Each entry contains the individual judge's result.
2151
+ * Only populated when passesJudge is an array with 2+ entries.
2152
+ */
2153
+ judgeResults?: EvalExpectationResult[];
2053
2154
  }
2054
2155
  /**
2055
2156
  * Map of expectation type to result
@@ -2058,7 +2159,7 @@ type ExpectationResultMap = Partial<Record<ExpectationType, EvalExpectationResul
2058
2159
  /**
2059
2160
  * Breakdown of expectation types used in a run
2060
2161
  */
2061
- type ExpectationBreakdown = Record<ExpectationType, number>;
2162
+ type ExpectationBreakdown = Partial<Record<ExpectationType, number>>;
2062
2163
 
2063
2164
  /**
2064
2165
  * Options for creating an MCP fixture
@@ -2274,16 +2375,26 @@ declare function toBeToolError(this: {
2274
2375
  * Validates that a response passes LLM-as-judge evaluation.
2275
2376
  * Delegates evaluation logic to validateJudge() for consistency
2276
2377
  * with the validator/matcher duality pattern.
2378
+ *
2379
+ * Supports three call signatures:
2380
+ * - toPassToolJudge(rubric, options?) — built-in LLM judge with rubric
2381
+ * - toPassToolJudge({ judge: 'name', ... }) — named custom judge
2382
+ * - toPassToolJudge([...judges]) — multi-judge (all must pass)
2277
2383
  */
2278
2384
 
2279
2385
  /**
2280
- * Creates the toPassToolJudge matcher function
2386
+ * The toPassToolJudge matcher function.
2281
2387
  *
2282
- * Note: This is an async matcher that calls an LLM for evaluation.
2388
+ * Accepts one of three argument forms:
2389
+ * (received, rubric, options?) — rubric-based LLM judge
2390
+ * (received, options) — named custom judge (options.judge required)
2391
+ * (received, judges[]) — multi-judge (all must pass)
2283
2392
  */
2284
2393
  declare function toPassToolJudge(this: {
2285
2394
  isNot: boolean;
2286
- }, received: unknown, rubric: RubricSpec, options?: JudgeMatcherOptions): Promise<{
2395
+ }, received: unknown, rubricOrOptions: RubricSpec | JudgeMatcherOptions | Array<JudgeMatcherOptions & {
2396
+ rubric?: RubricSpec;
2397
+ }>, maybeOptions?: JudgeMatcherOptions): Promise<{
2287
2398
  pass: boolean;
2288
2399
  message: () => string;
2289
2400
  }>;
@@ -2485,10 +2596,19 @@ declare const test: playwright_test.TestType<playwright_test.PlaywrightTestArgs
2485
2596
  */
2486
2597
 
2487
2598
  /**
2488
- * LLM provider for host simulation.
2599
+ * Host type for MCP host simulation.
2489
2600
  *
2490
- * All providers run through the Vercel AI SDK (`ai` package).
2491
- * Each provider requires its corresponding @ai-sdk/* package:
2601
+ * - 'sdk': Programmatic via Vercel AI SDK (default). The framework's MCP connection is reused.
2602
+ * - 'cli': CLI-based hosts (e.g., Claude Code, Codex). Spawns a process with its own MCP connection.
2603
+ * - 'browser': Web-based hosts (e.g., claude.ai). Uses Playwright/CDP. (Not yet implemented.)
2604
+ * - 'desktop': Desktop app hosts (e.g., Claude Desktop). Uses computer use. (Not yet implemented.)
2605
+ */
2606
+ type HostType = 'sdk' | 'cli' | 'browser' | 'desktop';
2607
+ /**
2608
+ * LLM provider for SDK-based host simulation.
2609
+ *
2610
+ * Each provider runs through the Vercel AI SDK (`ai` package)
2611
+ * and requires its corresponding @ai-sdk/* package:
2492
2612
  *
2493
2613
  * openai → npm install ai @ai-sdk/openai
2494
2614
  * anthropic → npm install ai @ai-sdk/anthropic
@@ -2508,14 +2628,81 @@ type LLMProvider = 'openai' | 'anthropic' | 'azure' | 'google' | 'mistral' | 'de
2508
2628
  * @example model: 'claude-3-5-haiku@20241022'
2509
2629
  */
2510
2630
  | 'vertex-anthropic';
2631
+ /**
2632
+ * Output format for CLI host processes.
2633
+ *
2634
+ * - 'stream-json': NDJSON (one JSON object per line). Used by Claude Code (`--output-format stream-json`).
2635
+ * - 'json': Single JSON object on stdout.
2636
+ */
2637
+ type CLIOutputFormat = 'stream-json' | 'json';
2638
+ /**
2639
+ * Configuration for a CLI host process.
2640
+ *
2641
+ * The process is spawned directly (no shell) with `command` and `args`.
2642
+ * Use `{{scenario}}` in any args entry as a placeholder for the natural
2643
+ * language prompt — the framework replaces it before spawning.
2644
+ *
2645
+ * Because args are passed directly to the process (not through a shell),
2646
+ * special characters in the scenario (quotes, newlines, `$`, etc.) are
2647
+ * handled safely without escaping.
2648
+ *
2649
+ * @example Claude Code
2650
+ * ```json
2651
+ * {
2652
+ * "command": "claude",
2653
+ * "args": ["-p", "{{scenario}}", "--output-format", "stream-json",
2654
+ * "--verbose", "--mcp-config", "{...}"]
2655
+ * }
2656
+ * ```
2657
+ *
2658
+ * @example Custom CLI
2659
+ * ```json
2660
+ * {
2661
+ * "command": "my-agent",
2662
+ * "args": ["--prompt", "{{scenario}}", "--config", "./mcp.json"],
2663
+ * "outputFormat": "json"
2664
+ * }
2665
+ * ```
2666
+ */
2667
+ interface CLIConfig {
2668
+ /**
2669
+ * CLI binary to invoke.
2670
+ */
2671
+ command: string;
2672
+ /**
2673
+ * Arguments to pass. Use `{{scenario}}` as a placeholder for the prompt.
2674
+ */
2675
+ args: string[];
2676
+ /**
2677
+ * How to parse stdout.
2678
+ * @default 'stream-json'
2679
+ */
2680
+ outputFormat?: CLIOutputFormat;
2681
+ /**
2682
+ * Timeout in milliseconds.
2683
+ * @default 120000 (2 minutes)
2684
+ */
2685
+ timeout?: number;
2686
+ }
2511
2687
  /**
2512
2688
  * Configuration for MCP host simulation
2513
2689
  */
2514
2690
  interface MCPHostConfig {
2515
2691
  /**
2516
- * LLM provider to use
2692
+ * Host type for the simulation.
2693
+ *
2694
+ * - 'sdk': Programmatic via Vercel AI SDK (default). The framework's MCP connection is reused.
2695
+ * - 'cli': CLI-based hosts (e.g., Claude Code, Codex). Spawns a process with its own MCP connection.
2696
+ * - 'browser': Web-based hosts (not yet implemented).
2697
+ * - 'desktop': Desktop app hosts (not yet implemented).
2698
+ *
2699
+ * @default 'sdk'
2700
+ */
2701
+ hostType?: HostType;
2702
+ /**
2703
+ * LLM provider (required for 'sdk' host type, ignored for 'cli')
2517
2704
  */
2518
- provider: LLMProvider;
2705
+ provider?: LLMProvider;
2519
2706
  /**
2520
2707
  * Environment variable name containing the API key
2521
2708
  */
@@ -2538,6 +2725,10 @@ interface MCPHostConfig {
2538
2725
  * @default 10
2539
2726
  */
2540
2727
  maxToolCalls?: number;
2728
+ /**
2729
+ * CLI host configuration (required for 'cli' host type).
2730
+ */
2731
+ cli?: CLIConfig;
2541
2732
  }
2542
2733
  /**
2543
2734
  * A tool call made by the LLM
@@ -2709,6 +2900,42 @@ interface EvalCase {
2709
2900
  */
2710
2901
  expect?: EvalExpectBlock;
2711
2902
  }
2903
+ /**
2904
+ * Configuration for a single LLM-as-judge evaluation
2905
+ */
2906
+ interface JudgeExpectConfig {
2907
+ /**
2908
+ * Name of a registered custom judge executor.
2909
+ * When set, the named judge handles evaluation and returns a normalized score.
2910
+ * The `threshold` determines pass/fail. `reps` and LLM config fields
2911
+ * (provider, model, etc.) are ignored.
2912
+ */
2913
+ judge?: string;
2914
+ /** Built-in rubric name or custom rubric object. Required when no `judge` is specified. */
2915
+ rubric?: BuiltInRubric | {
2916
+ text: string;
2917
+ };
2918
+ /** Reference response to compare against */
2919
+ reference?: unknown;
2920
+ /** Score threshold for passing (0-1, default: 0.7) */
2921
+ threshold?: number;
2922
+ /** Number of judge evaluations for this assertion. Overrides EvalCase.judgeReps. */
2923
+ reps?: number;
2924
+ /** Judge provider. @default 'anthropic' */
2925
+ provider?: 'anthropic' | 'vertex-anthropic' | 'anthropic-agent-sdk' | 'openai' | 'google';
2926
+ /** Model override (e.g., 'claude-opus-4-20250514') */
2927
+ model?: string;
2928
+ /** Environment variable name for API key */
2929
+ apiKeyEnvVar?: string;
2930
+ /** Max tokens for judge response */
2931
+ maxTokens?: number;
2932
+ /** Temperature for judge LLM (0–1) */
2933
+ temperature?: number;
2934
+ /** Max budget in USD per evaluation */
2935
+ maxBudgetUsd?: number;
2936
+ /** Fail if response exceeds this size in bytes before judging */
2937
+ maxToolOutputSize?: number;
2938
+ }
2712
2939
  /**
2713
2940
  * Unified expectation block for eval cases
2714
2941
  *
@@ -2748,33 +2975,11 @@ interface EvalExpectBlock {
2748
2975
  isError?: boolean | string | string[];
2749
2976
  /**
2750
2977
  * LLM-as-judge evaluation (toPassToolJudge)
2978
+ *
2979
+ * Accepts a single judge config or an array for multi-judge evaluation.
2980
+ * When an array is provided, all judges must pass (AND semantics).
2751
2981
  */
2752
- passesJudge?: {
2753
- /** Built-in rubric name or custom rubric object */
2754
- rubric: BuiltInRubric | {
2755
- text: string;
2756
- };
2757
- /** Reference response to compare against */
2758
- reference?: unknown;
2759
- /** Score threshold for passing (0-1, default: 0.7) */
2760
- threshold?: number;
2761
- /** Number of judge evaluations for this assertion. Overrides EvalCase.judgeReps. */
2762
- reps?: number;
2763
- /** Judge provider. @default 'anthropic' */
2764
- provider?: 'anthropic' | 'openai' | 'google';
2765
- /** Model override (e.g., 'claude-opus-4-20250514') */
2766
- model?: string;
2767
- /** Environment variable name for API key */
2768
- apiKeyEnvVar?: string;
2769
- /** Max tokens for judge response */
2770
- maxTokens?: number;
2771
- /** Temperature for judge LLM (0–1) */
2772
- temperature?: number;
2773
- /** Max budget in USD per evaluation */
2774
- maxBudgetUsd?: number;
2775
- /** Fail if response exceeds this size in bytes before judging */
2776
- maxToolOutputSize?: number;
2777
- };
2982
+ passesJudge?: JudgeExpectConfig | JudgeExpectConfig[];
2778
2983
  /**
2779
2984
  * Response size validation (toHaveToolResponseSize)
2780
2985
  */
@@ -2859,7 +3064,13 @@ declare const EvalCaseSchema: z.ZodObject<{
2859
3064
  args: z.ZodOptional<z.ZodRecord<z.ZodString, z.ZodUnknown>>;
2860
3065
  scenario: z.ZodOptional<z.ZodString>;
2861
3066
  mcpHostConfig: z.ZodOptional<z.ZodObject<{
2862
- provider: z.ZodEnum<{
3067
+ hostType: z.ZodOptional<z.ZodEnum<{
3068
+ sdk: "sdk";
3069
+ cli: "cli";
3070
+ browser: "browser";
3071
+ desktop: "desktop";
3072
+ }>>;
3073
+ provider: z.ZodOptional<z.ZodEnum<{
2863
3074
  openai: "openai";
2864
3075
  anthropic: "anthropic";
2865
3076
  azure: "azure";
@@ -2869,12 +3080,21 @@ declare const EvalCaseSchema: z.ZodObject<{
2869
3080
  openrouter: "openrouter";
2870
3081
  xai: "xai";
2871
3082
  "vertex-anthropic": "vertex-anthropic";
2872
- }>;
3083
+ }>>;
2873
3084
  apiKeyEnvVar: z.ZodOptional<z.ZodString>;
2874
3085
  model: z.ZodOptional<z.ZodString>;
2875
3086
  maxTokens: z.ZodOptional<z.ZodNumber>;
2876
3087
  temperature: z.ZodOptional<z.ZodNumber>;
2877
3088
  maxToolCalls: z.ZodOptional<z.ZodNumber>;
3089
+ cli: z.ZodOptional<z.ZodObject<{
3090
+ command: z.ZodString;
3091
+ args: z.ZodArray<z.ZodString>;
3092
+ outputFormat: z.ZodOptional<z.ZodEnum<{
3093
+ json: "json";
3094
+ "stream-json": "stream-json";
3095
+ }>>;
3096
+ timeout: z.ZodOptional<z.ZodNumber>;
3097
+ }, z.core.$strip>>;
2878
3098
  }, z.core.$strip>>;
2879
3099
  metadata: z.ZodOptional<z.ZodRecord<z.ZodString, z.ZodUnknown>>;
2880
3100
  iterations: z.ZodOptional<z.ZodNumber>;
@@ -2901,8 +3121,9 @@ declare const EvalCaseSchema: z.ZodObject<{
2901
3121
  remove: z.ZodArray<z.ZodString>;
2902
3122
  }, z.core.$strip>]>>>;
2903
3123
  isError: z.ZodOptional<z.ZodUnion<readonly [z.ZodBoolean, z.ZodString, z.ZodArray<z.ZodString>]>>;
2904
- passesJudge: z.ZodOptional<z.ZodObject<{
2905
- rubric: z.ZodUnion<readonly [z.ZodEnum<{
3124
+ passesJudge: z.ZodOptional<z.ZodUnion<readonly [z.ZodObject<{
3125
+ judge: z.ZodOptional<z.ZodString>;
3126
+ rubric: z.ZodOptional<z.ZodUnion<readonly [z.ZodEnum<{
2906
3127
  correctness: "correctness";
2907
3128
  completeness: "completeness";
2908
3129
  groundedness: "groundedness";
@@ -2910,7 +3131,7 @@ declare const EvalCaseSchema: z.ZodObject<{
2910
3131
  conciseness: "conciseness";
2911
3132
  }>, z.ZodObject<{
2912
3133
  text: z.ZodString;
2913
- }, z.core.$strip>]>;
3134
+ }, z.core.$strip>]>>;
2914
3135
  reference: z.ZodOptional<z.ZodUnknown>;
2915
3136
  threshold: z.ZodOptional<z.ZodNumber>;
2916
3137
  reps: z.ZodOptional<z.ZodNumber>;
@@ -2918,6 +3139,8 @@ declare const EvalCaseSchema: z.ZodObject<{
2918
3139
  openai: "openai";
2919
3140
  anthropic: "anthropic";
2920
3141
  google: "google";
3142
+ "vertex-anthropic": "vertex-anthropic";
3143
+ "anthropic-agent-sdk": "anthropic-agent-sdk";
2921
3144
  }>>;
2922
3145
  model: z.ZodOptional<z.ZodString>;
2923
3146
  apiKeyEnvVar: z.ZodOptional<z.ZodString>;
@@ -2925,7 +3148,34 @@ declare const EvalCaseSchema: z.ZodObject<{
2925
3148
  temperature: z.ZodOptional<z.ZodNumber>;
2926
3149
  maxBudgetUsd: z.ZodOptional<z.ZodNumber>;
2927
3150
  maxToolOutputSize: z.ZodOptional<z.ZodNumber>;
2928
- }, z.core.$strip>>;
3151
+ }, z.core.$strip>, z.ZodArray<z.ZodObject<{
3152
+ judge: z.ZodOptional<z.ZodString>;
3153
+ rubric: z.ZodOptional<z.ZodUnion<readonly [z.ZodEnum<{
3154
+ correctness: "correctness";
3155
+ completeness: "completeness";
3156
+ groundedness: "groundedness";
3157
+ "instruction-following": "instruction-following";
3158
+ conciseness: "conciseness";
3159
+ }>, z.ZodObject<{
3160
+ text: z.ZodString;
3161
+ }, z.core.$strip>]>>;
3162
+ reference: z.ZodOptional<z.ZodUnknown>;
3163
+ threshold: z.ZodOptional<z.ZodNumber>;
3164
+ reps: z.ZodOptional<z.ZodNumber>;
3165
+ provider: z.ZodOptional<z.ZodEnum<{
3166
+ openai: "openai";
3167
+ anthropic: "anthropic";
3168
+ google: "google";
3169
+ "vertex-anthropic": "vertex-anthropic";
3170
+ "anthropic-agent-sdk": "anthropic-agent-sdk";
3171
+ }>>;
3172
+ model: z.ZodOptional<z.ZodString>;
3173
+ apiKeyEnvVar: z.ZodOptional<z.ZodString>;
3174
+ maxTokens: z.ZodOptional<z.ZodNumber>;
3175
+ temperature: z.ZodOptional<z.ZodNumber>;
3176
+ maxBudgetUsd: z.ZodOptional<z.ZodNumber>;
3177
+ maxToolOutputSize: z.ZodOptional<z.ZodNumber>;
3178
+ }, z.core.$strip>>]>>;
2929
3179
  responseSize: z.ZodOptional<z.ZodObject<{
2930
3180
  maxBytes: z.ZodOptional<z.ZodNumber>;
2931
3181
  minBytes: z.ZodOptional<z.ZodNumber>;
@@ -2966,7 +3216,13 @@ declare const EvalDatasetSchema: z.ZodObject<{
2966
3216
  args: z.ZodOptional<z.ZodRecord<z.ZodString, z.ZodUnknown>>;
2967
3217
  scenario: z.ZodOptional<z.ZodString>;
2968
3218
  mcpHostConfig: z.ZodOptional<z.ZodObject<{
2969
- provider: z.ZodEnum<{
3219
+ hostType: z.ZodOptional<z.ZodEnum<{
3220
+ sdk: "sdk";
3221
+ cli: "cli";
3222
+ browser: "browser";
3223
+ desktop: "desktop";
3224
+ }>>;
3225
+ provider: z.ZodOptional<z.ZodEnum<{
2970
3226
  openai: "openai";
2971
3227
  anthropic: "anthropic";
2972
3228
  azure: "azure";
@@ -2976,12 +3232,21 @@ declare const EvalDatasetSchema: z.ZodObject<{
2976
3232
  openrouter: "openrouter";
2977
3233
  xai: "xai";
2978
3234
  "vertex-anthropic": "vertex-anthropic";
2979
- }>;
3235
+ }>>;
2980
3236
  apiKeyEnvVar: z.ZodOptional<z.ZodString>;
2981
3237
  model: z.ZodOptional<z.ZodString>;
2982
3238
  maxTokens: z.ZodOptional<z.ZodNumber>;
2983
3239
  temperature: z.ZodOptional<z.ZodNumber>;
2984
3240
  maxToolCalls: z.ZodOptional<z.ZodNumber>;
3241
+ cli: z.ZodOptional<z.ZodObject<{
3242
+ command: z.ZodString;
3243
+ args: z.ZodArray<z.ZodString>;
3244
+ outputFormat: z.ZodOptional<z.ZodEnum<{
3245
+ json: "json";
3246
+ "stream-json": "stream-json";
3247
+ }>>;
3248
+ timeout: z.ZodOptional<z.ZodNumber>;
3249
+ }, z.core.$strip>>;
2985
3250
  }, z.core.$strip>>;
2986
3251
  metadata: z.ZodOptional<z.ZodRecord<z.ZodString, z.ZodUnknown>>;
2987
3252
  iterations: z.ZodOptional<z.ZodNumber>;
@@ -3008,8 +3273,9 @@ declare const EvalDatasetSchema: z.ZodObject<{
3008
3273
  remove: z.ZodArray<z.ZodString>;
3009
3274
  }, z.core.$strip>]>>>;
3010
3275
  isError: z.ZodOptional<z.ZodUnion<readonly [z.ZodBoolean, z.ZodString, z.ZodArray<z.ZodString>]>>;
3011
- passesJudge: z.ZodOptional<z.ZodObject<{
3012
- rubric: z.ZodUnion<readonly [z.ZodEnum<{
3276
+ passesJudge: z.ZodOptional<z.ZodUnion<readonly [z.ZodObject<{
3277
+ judge: z.ZodOptional<z.ZodString>;
3278
+ rubric: z.ZodOptional<z.ZodUnion<readonly [z.ZodEnum<{
3013
3279
  correctness: "correctness";
3014
3280
  completeness: "completeness";
3015
3281
  groundedness: "groundedness";
@@ -3017,7 +3283,7 @@ declare const EvalDatasetSchema: z.ZodObject<{
3017
3283
  conciseness: "conciseness";
3018
3284
  }>, z.ZodObject<{
3019
3285
  text: z.ZodString;
3020
- }, z.core.$strip>]>;
3286
+ }, z.core.$strip>]>>;
3021
3287
  reference: z.ZodOptional<z.ZodUnknown>;
3022
3288
  threshold: z.ZodOptional<z.ZodNumber>;
3023
3289
  reps: z.ZodOptional<z.ZodNumber>;
@@ -3025,6 +3291,8 @@ declare const EvalDatasetSchema: z.ZodObject<{
3025
3291
  openai: "openai";
3026
3292
  anthropic: "anthropic";
3027
3293
  google: "google";
3294
+ "vertex-anthropic": "vertex-anthropic";
3295
+ "anthropic-agent-sdk": "anthropic-agent-sdk";
3028
3296
  }>>;
3029
3297
  model: z.ZodOptional<z.ZodString>;
3030
3298
  apiKeyEnvVar: z.ZodOptional<z.ZodString>;
@@ -3032,7 +3300,34 @@ declare const EvalDatasetSchema: z.ZodObject<{
3032
3300
  temperature: z.ZodOptional<z.ZodNumber>;
3033
3301
  maxBudgetUsd: z.ZodOptional<z.ZodNumber>;
3034
3302
  maxToolOutputSize: z.ZodOptional<z.ZodNumber>;
3035
- }, z.core.$strip>>;
3303
+ }, z.core.$strip>, z.ZodArray<z.ZodObject<{
3304
+ judge: z.ZodOptional<z.ZodString>;
3305
+ rubric: z.ZodOptional<z.ZodUnion<readonly [z.ZodEnum<{
3306
+ correctness: "correctness";
3307
+ completeness: "completeness";
3308
+ groundedness: "groundedness";
3309
+ "instruction-following": "instruction-following";
3310
+ conciseness: "conciseness";
3311
+ }>, z.ZodObject<{
3312
+ text: z.ZodString;
3313
+ }, z.core.$strip>]>>;
3314
+ reference: z.ZodOptional<z.ZodUnknown>;
3315
+ threshold: z.ZodOptional<z.ZodNumber>;
3316
+ reps: z.ZodOptional<z.ZodNumber>;
3317
+ provider: z.ZodOptional<z.ZodEnum<{
3318
+ openai: "openai";
3319
+ anthropic: "anthropic";
3320
+ google: "google";
3321
+ "vertex-anthropic": "vertex-anthropic";
3322
+ "anthropic-agent-sdk": "anthropic-agent-sdk";
3323
+ }>>;
3324
+ model: z.ZodOptional<z.ZodString>;
3325
+ apiKeyEnvVar: z.ZodOptional<z.ZodString>;
3326
+ maxTokens: z.ZodOptional<z.ZodNumber>;
3327
+ temperature: z.ZodOptional<z.ZodNumber>;
3328
+ maxBudgetUsd: z.ZodOptional<z.ZodNumber>;
3329
+ maxToolOutputSize: z.ZodOptional<z.ZodNumber>;
3330
+ }, z.core.$strip>>]>>;
3036
3331
  responseSize: z.ZodOptional<z.ZodObject<{
3037
3332
  maxBytes: z.ZodOptional<z.ZodNumber>;
3038
3333
  minBytes: z.ZodOptional<z.ZodNumber>;
@@ -3268,6 +3563,23 @@ interface IterationResult {
3268
3563
  }>;
3269
3564
  };
3270
3565
  }
3566
+ /**
3567
+ * Request data captured from the eval case input.
3568
+ * Preserves what was sent so results are self-contained for debugging.
3569
+ */
3570
+ interface EvalCaseRequest {
3571
+ /** Human-readable description of the case */
3572
+ description?: string;
3573
+ /** Tool arguments (direct mode) */
3574
+ args?: Record<string, unknown>;
3575
+ /** Natural language scenario sent to the LLM (mcp_host mode) */
3576
+ scenario?: string;
3577
+ /** LLM provider/model configuration (mcp_host mode) */
3578
+ mcpHostConfig?: {
3579
+ provider?: string;
3580
+ model?: string;
3581
+ };
3582
+ }
3271
3583
  /**
3272
3584
  * Result of a single eval case
3273
3585
  */
@@ -3292,6 +3604,11 @@ interface EvalCaseResult {
3292
3604
  * Overall pass/fail status
3293
3605
  */
3294
3606
  pass: boolean;
3607
+ /**
3608
+ * Request data from the eval case input (tool args, scenario, LLM config).
3609
+ * Populated so results are self-contained for debugging without the original dataset.
3610
+ */
3611
+ request?: EvalCaseRequest;
3295
3612
  /**
3296
3613
  * Tool response
3297
3614
  */
@@ -3835,24 +4152,31 @@ declare function runServerComparison(options: ServerComparisonOptions, contextA:
3835
4152
  * schemas, testing discoverability and parameter clarity at the level a real
3836
4153
  * user (via Claude Desktop, ChatGPT, etc.) would experience.
3837
4154
  *
3838
- * All providers run through the Vercel AI SDK's generateText with maxSteps,
3839
- * which handles multi-turn tool calling natively and provides per-step latency
3840
- * decomposition (llmDurationMs vs. mcpDurationMs).
3841
- *
3842
- * @param mcp - MCP fixture API
4155
+ * @param mcp - MCP fixture API (used by SDK hosts; ignored by CLI/browser hosts, which establish their own connections)
3843
4156
  * @param scenario - Natural language prompt describing what the LLM should do
3844
4157
  * @param config - MCP host configuration (provider, model, temperature, etc.)
3845
4158
  * @returns Simulation result with tool calls, final response, and latency data
3846
4159
  *
3847
4160
  * @example
3848
4161
  * ```typescript
4162
+ * // SDK host (default) — uses the framework's existing MCP connection
3849
4163
  * const result = await simulateMCPHost(mcp,
3850
4164
  * "Find recent documents about MCP testing frameworks",
3851
4165
  * { provider: 'anthropic', model: 'claude-3-5-sonnet-20241022' }
3852
4166
  * );
3853
4167
  *
3854
- * expect(result.success).toBe(true);
3855
- * expect(result.toolCalls.map(c => c.name)).toContain('search');
4168
+ * // CLI host — spawns a CLI process with its own MCP connection
4169
+ * const cliResult = await simulateMCPHost(mcp,
4170
+ * "Find recent documents about MCP testing frameworks",
4171
+ * {
4172
+ * hostType: 'cli',
4173
+ * provider: 'anthropic',
4174
+ * cli: {
4175
+ * command: 'claude',
4176
+ * args: ['-p', '{{scenario}}', '--output-format', 'stream-json', '--verbose'],
4177
+ * },
4178
+ * }
4179
+ * );
3856
4180
  * ```
3857
4181
  */
3858
4182
  declare function simulateMCPHost(mcp: MCPFixtureApi, scenario: string, config: MCPHostConfig): Promise<MCPHostSimulationResult>;
@@ -3905,6 +4229,99 @@ declare function getMissingDependencyMessage(provider: LLMProvider): string;
3905
4229
  */
3906
4230
  declare function createJudge(config?: JudgeConfig): Judge;
3907
4231
 
4232
+ /**
4233
+ * Custom Judge Registry
4234
+ *
4235
+ * Allows consumers to register named judge executors that can be referenced
4236
+ * by string ID in eval fixtures and programmatic tests. This enables
4237
+ * multi-step judge pipelines (LLM call + post-processing), custom scoring
4238
+ * logic, and reusable judge configurations without duplicating rubrics.
4239
+ */
4240
+ /**
4241
+ * Result returned by a custom judge executor.
4242
+ *
4243
+ * Custom judges must return a normalized score (0–1). The framework applies
4244
+ * the caller's `threshold` (default 0.7) to determine pass/fail. This keeps
4245
+ * judges reusable — the same judge can be used with different thresholds in
4246
+ * different tests.
4247
+ */
4248
+ interface CustomJudgeResult {
4249
+ /** Normalized score (0–1, where 1 is best) */
4250
+ score: number;
4251
+ /** Optional reasoning/explanation */
4252
+ reasoning?: string;
4253
+ }
4254
+ /**
4255
+ * A user-defined judge executor function.
4256
+ *
4257
+ * Custom executors own their entire evaluation pipeline — prompt construction,
4258
+ * LLM calls, and post-processing — but return a normalized score. The framework
4259
+ * determines pass/fail by comparing the score against the caller's threshold.
4260
+ *
4261
+ * @param candidate - The actual response to evaluate
4262
+ * @param reference - Optional reference/expected response
4263
+ * @returns Evaluation result with a normalized score and optional reasoning
4264
+ *
4265
+ * @example
4266
+ * ```typescript
4267
+ * const completenessJudge: CustomJudgeExecutor = async (candidate, reference) => {
4268
+ * // Step 1: LLM call with your own prompt and schema
4269
+ * const llmResult = await callLLM(COMPLETENESS_PROMPT, candidate);
4270
+ * const { verdict, reasoning } = JSON.parse(llmResult);
4271
+ *
4272
+ * // Step 2: Deterministic post-processing into a normalized score
4273
+ * const score = { Complete: 1.0, Incomplete: 0.5 }[verdict] ?? 0.0;
4274
+ *
4275
+ * return { score, reasoning };
4276
+ * };
4277
+ * ```
4278
+ */
4279
+ type CustomJudgeExecutor = (candidate: unknown, reference?: unknown) => Promise<CustomJudgeResult>;
4280
+ /**
4281
+ * Registers a named custom judge executor.
4282
+ *
4283
+ * Call this in your test setup (e.g., `playwright.config.ts` or a global setup file)
4284
+ * before tests run. The name can then be referenced in JSON eval fixtures via the
4285
+ * `judge` field on `passesJudge`.
4286
+ *
4287
+ * @param name - Unique identifier for the judge
4288
+ * @param executor - The judge executor function
4289
+ * @throws {Error} If a judge with the same name is already registered
4290
+ *
4291
+ * @example
4292
+ * ```typescript
4293
+ * import { registerJudge } from '@gleanwork/mcp-server-tester';
4294
+ *
4295
+ * registerJudge('glean-completeness', async (candidate, reference) => {
4296
+ * // Step 1: LLM call with your own prompt and schema
4297
+ * const llmResult = await callLLM(COMPLETENESS_PROMPT, candidate);
4298
+ * const { verdict, reasoning } = JSON.parse(llmResult);
4299
+ *
4300
+ * // Step 2: Deterministic post-processing into a normalized score
4301
+ * const score = { Complete: 1.0, Incomplete: 0.5 }[verdict] ?? 0.0;
4302
+ *
4303
+ * return { score, reasoning };
4304
+ * });
4305
+ *
4306
+ * // Then in tests — same judge, different thresholds:
4307
+ * // expect(result).toPassToolJudge({ judge: 'glean-completeness', passingThreshold: 0.8 });
4308
+ * // expect(result).toPassToolJudge({ judge: 'glean-completeness', passingThreshold: 0.5 });
4309
+ * ```
4310
+ */
4311
+ declare function registerJudge(name: string, executor: CustomJudgeExecutor): void;
4312
+ /**
4313
+ * Retrieves a registered custom judge executor by name.
4314
+ *
4315
+ * @param name - The judge name to look up
4316
+ * @returns The registered executor
4317
+ * @throws {Error} If no judge with the given name is registered
4318
+ */
4319
+ declare function getRegisteredJudge(name: string): CustomJudgeExecutor;
4320
+ /**
4321
+ * Clears all registered judges. Intended for test teardown.
4322
+ */
4323
+ declare function clearJudgeRegistry(): void;
4324
+
3908
4325
  /**
3909
4326
  * Options for conformance checks
3910
4327
  */
@@ -4066,4 +4483,4 @@ interface MCPEvalReporterConfig {
4066
4483
  includeAutoTracking?: boolean;
4067
4484
  }
4068
4485
 
4069
- export { type AuthType, BUILT_IN_RUBRICS, type BuiltInRubric, type BuiltInSanitizer, CLIOAuthClient, type CLIOAuthClientConfig, type CLIOAuthResult, type CaseComparisonResult, type ClientCredentialsConfig, type ComparisonOutcome, type ContentBlock, type CreateMCPClientOptions, DiscoveryError, ENV_VAR_NAMES, type EvalCase, type EvalCaseResult, EvalCaseSchema, type EvalContext, type EvalDataset, EvalDatasetSchema, type EvalExpectBlock, type EvalExpectationResult, type EvalMode, type EvalRunnerOptions, type EvalRunnerResult, type ExpectationBreakdown, type ExpectationResultMap, type ExpectationType, type FieldRemovalSanitizer, type HttpMCPConfig, type IterationResult, type Judge, type JudgeConfig, type JudgeMatcherOptions, type JudgeResult, type JudgeValidatorConfig, type LLMProvider, type LLMToolCall, type LoadDatasetOptions, type MCPAuthConfig, type MCPAuthFixtures, type MCPClientCredentialsConfig, type MCPConfig, MCPConfigSchema, type MCPConformanceCheck, type MCPConformanceOptions, type MCPConformanceRaw, type MCPConformanceResult, type MCPConformanceResultData, type MCPEvalData, type MCPEvalHistoricalSummary, type MCPEvalReporterConfig, type MCPEvalRunData, type MCPFixtureApi, type MCPFixtureOptions, type MCPHostCapabilities, type MCPHostConfig, type MCPHostSimulationResult, type MCPHostSimulator, type MCPOAuthConfig, type MCPServerCapabilitiesData, MCP_PROTOCOL_VERSION, type NormalizedToolResponse, type OAuthSetupConfig, type PatternValidatorOptions, PlaywrightOAuthClientProvider, type PlaywrightOAuthClientProviderConfig, type PredicateResult, type ProtectedResourceDiscoveryResult, type ProtectedResourceMetadata, type ProviderKind, type RegexSanitizer, type ResultSource, type RubricSpec, type SaveBaselineOptions, type SchemaRegistry, type SchemaValidatorOptions, type SerializedEvalDataset, type ServerComparisonOptions, type ServerComparisonResult, type SizeValidatorOptions, type SnapshotSanitizer, SnapshotSanitizers, type StdioMCPConfig, type StoredClientInfo, 
type StoredOAuthState, type StoredServerMetadata, type StoredTokens, type TextValidatorOptions, type TokenResult, type ToolCallCountOptions, type ToolCallExpectation, type ToolPredicate, type UsageMetrics, type ValidationResult, closeMCPClient, createJudge, createMCPClientForConfig, createMCPFixture, createTokenAuthHeaders, discoverAuthorizationServer, discoverProtectedResource, expect, extractText, getMissingDependencyMessage, getResponseSizeBytes, hasValidTokens, injectTokens, isBuiltInRubric, isHttpConfig, isProviderAvailable, isStdioConfig, isTokenExpired, isTokenExpiringSoon, loadBaseline, loadEvalDataset, loadEvalDatasetFromObject, loadTokens, loadTokensFromEnv, test as mcpAuthTest, normalizeToolResponse, normalizeWhitespace, performClientCredentialsFlow, performOAuthSetup, performOAuthSetupIfNeeded, resolveRubric, runConformanceChecks, runEvalCase, runEvalDataset, runServerComparison, saveBaseline, simulateMCPHost, test$1 as test, validateAccessToken, validateError, validateEvalCase, validateEvalDataset, validateJudge, validateMCPConfig, validatePattern, validateResponse, validateSchema, validateSize, validateText, validateToolCallCount, validateToolCalls };
4486
+ export { type AuthType, BUILT_IN_RUBRICS, type BuiltInRubric, type BuiltInSanitizer, type CLIConfig, CLIOAuthClient, type CLIOAuthClientConfig, type CLIOAuthResult, type CLIOutputFormat, type CaseComparisonResult, type ClientCredentialsConfig, type ComparisonOutcome, type ContentBlock, type CreateMCPClientOptions, type CustomJudgeExecutor, type CustomJudgeResult, DiscoveryError, ENV_VAR_NAMES, type EvalCase, type EvalCaseRequest, type EvalCaseResult, EvalCaseSchema, type EvalContext, type EvalDataset, EvalDatasetSchema, type EvalExpectBlock, type EvalExpectationResult, type EvalMode, type EvalRunMetadata, type EvalRunnerOptions, type EvalRunnerResult, type ExpectationBreakdown, type ExpectationResultMap, type ExpectationType, type FieldRemovalSanitizer, type HostType, type HttpMCPConfig, type IterationResult, type Judge, type JudgeConfig, type JudgeExpectConfig, type JudgeMatcherOptions, type JudgeResult, type JudgeValidatorConfig, type LLMProvider, type LLMToolCall, type LoadDatasetOptions, type MCPAuthConfig, type MCPAuthFixtures, type MCPClientCredentialsConfig, type MCPConfig, MCPConfigSchema, type MCPConformanceCheck, type MCPConformanceOptions, type MCPConformanceRaw, type MCPConformanceResult, type MCPConformanceResultData, type MCPEvalData, type MCPEvalHistoricalSummary, type MCPEvalReporterConfig, type MCPEvalRunData, type MCPFixtureApi, type MCPFixtureOptions, type MCPHostCapabilities, type MCPHostConfig, type MCPHostSimulationResult, type MCPHostSimulator, type MCPOAuthConfig, type MCPServerCapabilitiesData, MCP_PROTOCOL_VERSION, type NormalizedToolResponse, type OAuthSetupConfig, type PatternValidatorOptions, PlaywrightOAuthClientProvider, type PlaywrightOAuthClientProviderConfig, type PredicateResult, type ProtectedResourceDiscoveryResult, type ProtectedResourceMetadata, type ProviderKind, type RegexSanitizer, type ResultSource, type RubricSpec, type SaveBaselineOptions, type SchemaRegistry, type SchemaValidatorOptions, type SerializedEvalDataset, 
type ServerComparisonOptions, type ServerComparisonResult, type SizeValidatorOptions, type SnapshotSanitizer, SnapshotSanitizers, type StdioMCPConfig, type StoredClientInfo, type StoredOAuthState, type StoredServerMetadata, type StoredTokens, type TextValidatorOptions, type TokenResult, type ToolCallCountOptions, type ToolCallExpectation, type ToolPredicate, type UsageMetrics, type ValidationResult, clearJudgeRegistry, closeMCPClient, createJudge, createMCPClientForConfig, createMCPFixture, createTokenAuthHeaders, discoverAuthorizationServer, discoverProtectedResource, expect, extractText, getMissingDependencyMessage, getRegisteredJudge, getResponseSizeBytes, hasValidTokens, injectTokens, isBuiltInRubric, isHttpConfig, isProviderAvailable, isStdioConfig, isTokenExpired, isTokenExpiringSoon, loadBaseline, loadEvalDataset, loadEvalDatasetFromObject, loadTokens, loadTokensFromEnv, test as mcpAuthTest, normalizeToolResponse, normalizeWhitespace, performClientCredentialsFlow, performOAuthSetup, performOAuthSetupIfNeeded, refreshAccessToken, registerJudge, resolveRubric, runConformanceChecks, runEvalCase, runEvalDataset, runServerComparison, saveBaseline, simulateMCPHost, test$1 as test, validateAccessToken, validateError, validateEvalCase, validateEvalDataset, validateJudge, validateMCPConfig, validatePattern, validateResponse, validateSchema, validateSize, validateText, validateToolCallCount, validateToolCalls };