@gleanwork/mcp-server-tester 1.0.0-beta.6 → 1.0.0-beta.8

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
package/dist/index.d.cts CHANGED
@@ -1,10 +1,10 @@
1
1
  import { z, ZodType } from 'zod';
2
+ import { Page, TestInfo, Expect } from '@playwright/test';
2
3
  import { OAuthClientProvider } from '@modelcontextprotocol/sdk/client/auth.js';
3
4
  import { OAuthClientMetadata, OAuthClientInformationFull, OAuthTokens } from '@modelcontextprotocol/sdk/shared/auth.js';
4
5
  import * as oauth from 'oauth4webapi';
5
6
  import { Client } from '@modelcontextprotocol/sdk/client/index.js';
6
7
  import { CallToolResult, Tool, Implementation, ServerCapabilities, Resource, Prompt } from '@modelcontextprotocol/sdk/types.js';
7
- import { TestInfo, Expect } from '@playwright/test';
8
8
  import * as playwright_test from 'playwright/test';
9
9
 
10
10
  /**
@@ -311,6 +311,7 @@ declare function isHttpConfig(config: MCPConfig): config is HttpMCPConfig;
311
311
  /**
312
312
  * Auth types for MCP OAuth integration
313
313
  */
314
+
314
315
  /**
315
316
  * Stored OAuth tokens
316
317
  */
@@ -384,70 +385,90 @@ interface StoredOAuthState {
384
385
  savedAt: number;
385
386
  }
386
387
  /**
387
- * Configuration for OAuth setup flow
388
+ * Login form selectors for standard OAuth login automation
389
+ */
390
+ interface OAuthLoginSelectors {
391
+ /** Selector for username/email input field */
392
+ usernameInput: string;
393
+ /** Selector for password input field */
394
+ passwordInput: string;
395
+ /** Selector for login submit button */
396
+ submitButton: string;
397
+ /** Selector for consent/authorize button (optional) */
398
+ consentButton?: string;
399
+ }
400
+ /**
401
+ * Base configuration shared by all OAuth setup strategies
388
402
  */
389
- interface OAuthSetupConfig {
390
- /**
391
- * OAuth authorization server metadata URL
392
- */
403
+ interface OAuthSetupBaseConfig {
404
+ /** OAuth authorization server metadata URL */
393
405
  authServerUrl: string;
394
- /**
395
- * Scopes to request
396
- */
406
+ /** Scopes to request */
397
407
  scopes: Array<string>;
398
- /**
399
- * Resource indicator (RFC 8707)
400
- */
408
+ /** Path to save OAuth state file */
409
+ outputPath: string;
410
+ /** Pre-registered client ID (optional, uses DCR if not provided) */
411
+ clientId?: string;
412
+ /** Pre-registered client secret (optional) */
413
+ clientSecret?: string;
414
+ /** Redirect URI for OAuth callback */
415
+ redirectUri?: string;
416
+ /** Resource indicator (RFC 8707) */
401
417
  resource?: string;
402
- /**
403
- * Login form selectors for automation
404
- */
405
- loginSelectors: {
406
- /**
407
- * Selector for username/email input field
408
- */
409
- usernameInput: string;
410
- /**
411
- * Selector for password input field
412
- */
413
- passwordInput: string;
414
- /**
415
- * Selector for login submit button
416
- */
417
- submitButton: string;
418
- /**
419
- * Selector for consent/authorize button (optional)
420
- */
421
- consentButton?: string;
422
- };
423
- /**
424
- * Test user credentials
425
- */
418
+ /** Timeout for login flow in milliseconds (default: 30000) */
419
+ timeoutMs?: number;
420
+ }
421
+ /**
422
+ * Standard login strategy: automates a form with username, password, and submit button.
423
+ * Use when the IdP presents all login fields on a single page.
424
+ */
425
+ interface StandardLoginConfig {
426
+ /** Login form selectors for Playwright automation */
427
+ loginSelectors: OAuthLoginSelectors;
428
+ /** Test user credentials */
426
429
  credentials: {
427
430
  username: string;
428
431
  password: string;
429
432
  };
433
+ customLoginFlow?: never;
434
+ }
435
+ /**
436
+ * Custom login strategy: full control over the browser-based login flow.
437
+ * Use for multi-step logins, MFA, custom consent screens, or any flow
438
+ * that doesn't fit the standard username/password/submit pattern.
439
+ *
440
+ * The callback receives a Playwright Page already navigated to the OAuth
441
+ * authorization URL. Complete the login so the IdP redirects to the
442
+ * callback URL — `performOAuthSetup` handles PKCE, token exchange,
443
+ * and state persistence automatically.
444
+ */
445
+ interface CustomLoginConfig {
430
446
  /**
431
- * Path to save OAuth state file
432
- */
433
- outputPath: string;
434
- /**
435
- * Pre-registered client ID (optional, uses DCR if not provided)
436
- */
437
- clientId?: string;
438
- /**
439
- * Pre-registered client secret (optional)
440
- */
441
- clientSecret?: string;
442
- /**
443
- * Redirect URI for OAuth callback
444
- */
445
- redirectUri?: string;
446
- /**
447
- * Timeout for login flow in milliseconds (default: 30000)
447
+ * Custom Playwright automation for the IdP login flow.
448
+ *
449
+ * @param page - Playwright Page already navigated to the OAuth authorization URL
450
+ *
451
+ * @example
452
+ * ```typescript
453
+ * customLoginFlow: async (page) => {
454
+ * await page.fill('#username', process.env.TEST_USER!);
455
+ * await page.click('#continue');
456
+ * await page.fill('#password', process.env.TEST_PASS!);
457
+ * await page.click('#submit');
458
+ * }
459
+ * ```
448
460
  */
449
- timeoutMs?: number;
461
+ customLoginFlow: (page: Page) => Promise<void>;
462
+ loginSelectors?: never;
463
+ credentials?: never;
450
464
  }
465
+ /**
466
+ * Configuration for OAuth setup flow.
467
+ *
468
+ * Provide either `loginSelectors` + `credentials` for standard form-based login,
469
+ * or `customLoginFlow` for full control over the browser automation.
470
+ */
471
+ type OAuthSetupConfig = OAuthSetupBaseConfig & (StandardLoginConfig | CustomLoginConfig);
451
472
  /**
452
473
  * Result of token exchange or refresh
453
474
  */
@@ -1632,7 +1653,7 @@ interface UsageMetrics {
1632
1653
  cacheCreationInputTokens?: number;
1633
1654
  }
1634
1655
  /** Valid LLM judge provider kinds. */
1635
- type ProviderKind = 'anthropic' | 'openai' | 'google';
1656
+ type ProviderKind = 'anthropic' | 'vertex-anthropic' | 'anthropic-agent-sdk' | 'openai' | 'google';
1636
1657
  /**
1637
1658
  * Configuration for an LLM judge
1638
1659
  */
@@ -1744,8 +1765,11 @@ interface Judge {
1744
1765
  * Configuration for the judge validator
1745
1766
  */
1746
1767
  interface JudgeValidatorConfig {
1747
- /** The evaluation rubric: a built-in name or custom { text: string } */
1748
- rubric: RubricSpec;
1768
+ /**
1769
+ * The evaluation rubric: a built-in name or custom { text: string }.
1770
+ * Required when no named `judge` is specified.
1771
+ */
1772
+ rubric?: RubricSpec;
1749
1773
  /** Optional reference response to compare against */
1750
1774
  reference?: unknown;
1751
1775
  /** Minimum score required to pass (0-1, default: 0.7) */
@@ -1766,6 +1790,13 @@ interface JudgeValidatorConfig {
1766
1790
  maxBudgetUsd?: number;
1767
1791
  /** Fail if response exceeds this size in bytes before judging */
1768
1792
  maxToolOutputSize?: number;
1793
+ /**
1794
+ * Name of a registered custom judge executor.
1795
+ * When set, the named judge handles the entire evaluation pipeline
1796
+ * and returns a normalized score. The `threshold` determines pass/fail.
1797
+ * Register judges with `registerJudge()` before tests run.
1798
+ */
1799
+ judge?: string;
1769
1800
  }
1770
1801
  declare function validateJudge(response: unknown, config: JudgeValidatorConfig): Promise<ValidationResult>;
1771
1802
 
@@ -1823,6 +1854,12 @@ interface JudgeMatcherOptions {
1823
1854
  provider?: ProviderKind;
1824
1855
  /** Override the judge model */
1825
1856
  model?: string;
1857
+ /**
1858
+ * Name of a registered custom judge executor.
1859
+ * When set, the named judge handles the entire evaluation pipeline
1860
+ * and returns a normalized score; the framework applies the passing threshold to determine pass/fail.
1861
+ */
1862
+ judge?: string;
1826
1863
  }
1827
1864
  /**
1828
1865
  * Declaration merging for Playwright matchers
@@ -1913,21 +1950,30 @@ declare global {
1913
1950
  */
1914
1951
  toBeToolError(expected?: boolean | string | string[]): R;
1915
1952
  /**
1916
- * Validates that a response passes LLM-as-judge evaluation
1953
+ * Validates that a response passes LLM-as-judge evaluation.
1917
1954
  *
1918
- * @param rubric - Evaluation rubric/criteria
1919
- * @param options - Judge options
1955
+ * Call signatures (a third overload accepts an array of judge configs; all must pass):
1956
+ * - With rubric: `toPassToolJudge(rubric, options?)` — built-in LLM judge
1957
+ * - With named judge: `toPassToolJudge({ judge: 'name' })` — custom judge executor
1920
1958
  *
1921
1959
  * @example
1922
1960
  * ```typescript
1961
+ * // Built-in LLM judge with rubric
1923
1962
  * expect(result).toPassToolJudge('Response should be helpful and accurate');
1924
- * expect(result).toPassToolJudge('Response should match reference', {
1963
+ * expect(result).toPassToolJudge('correctness', {
1925
1964
  * reference: expectedOutput,
1926
1965
  * passingThreshold: 0.8,
1927
1966
  * });
1967
+ *
1968
+ * // Named custom judge (registered via registerJudge)
1969
+ * expect(result).toPassToolJudge({ judge: 'glean-completeness' });
1928
1970
  * ```
1929
1971
  */
1930
1972
  toPassToolJudge(rubric: RubricSpec, options?: JudgeMatcherOptions): Promise<R>;
1973
+ toPassToolJudge(options: JudgeMatcherOptions): Promise<R>;
1974
+ toPassToolJudge(judges: Array<JudgeMatcherOptions & {
1975
+ rubric?: RubricSpec;
1976
+ }>): Promise<R>;
1931
1977
  /**
1932
1978
  * Validates that a response meets size constraints
1933
1979
  *
@@ -2050,6 +2096,33 @@ interface EvalExpectationResult {
2050
2096
  * Optional details about the result
2051
2097
  */
2052
2098
  details?: string;
2099
+ /**
2100
+ * Judge score (0-1). Populated for passesJudge expectations.
2101
+ */
2102
+ score?: number;
2103
+ /**
2104
+ * Judge reasoning. Populated for passesJudge expectations.
2105
+ */
2106
+ reasoning?: string;
2107
+ /**
2108
+ * Judge name — rubric name (e.g. 'correctness') or custom judge name.
2109
+ * Populated for passesJudge expectations.
2110
+ */
2111
+ judgeName?: string;
2112
+ /**
2113
+ * Judge provider used. Populated for passesJudge expectations.
2114
+ */
2115
+ judgeProvider?: string;
2116
+ /**
2117
+ * Judge model used. Populated for passesJudge expectations.
2118
+ */
2119
+ judgeModel?: string;
2120
+ /**
2121
+ * Per-judge breakdown when multiple judges are used.
2122
+ * Each entry contains the individual judge's result.
2123
+ * Only populated when passesJudge is an array with 2+ entries.
2124
+ */
2125
+ judgeResults?: EvalExpectationResult[];
2053
2126
  }
2054
2127
  /**
2055
2128
  * Map of expectation type to result
@@ -2274,16 +2347,26 @@ declare function toBeToolError(this: {
2274
2347
  * Validates that a response passes LLM-as-judge evaluation.
2275
2348
  * Delegates evaluation logic to validateJudge() for consistency
2276
2349
  * with the validator/matcher duality pattern.
2350
+ *
2351
+ * Supports three call signatures:
2352
+ * - toPassToolJudge(rubric, options?) — built-in LLM judge with rubric
2353
+ * - toPassToolJudge({ judge: 'name', ... }) — named custom judge
2354
+ * - toPassToolJudge([...judges]) — multi-judge (all must pass)
2277
2355
  */
2278
2356
 
2279
2357
  /**
2280
- * Creates the toPassToolJudge matcher function
2358
+ * The toPassToolJudge matcher function.
2281
2359
  *
2282
- * Note: This is an async matcher that calls an LLM for evaluation.
2360
+ * Accepts one of:
2361
+ * (received, rubric, options?) — rubric-based LLM judge
2362
+ * (received, options) — named custom judge (options.judge required)
2363
+ * (received, judges[]) — multi-judge (all must pass)
2283
2364
  */
2284
2365
  declare function toPassToolJudge(this: {
2285
2366
  isNot: boolean;
2286
- }, received: unknown, rubric: RubricSpec, options?: JudgeMatcherOptions): Promise<{
2367
+ }, received: unknown, rubricOrOptions: RubricSpec | JudgeMatcherOptions | Array<JudgeMatcherOptions & {
2368
+ rubric?: RubricSpec;
2369
+ }>, maybeOptions?: JudgeMatcherOptions): Promise<{
2287
2370
  pass: boolean;
2288
2371
  message: () => string;
2289
2372
  }>;
@@ -2485,10 +2568,19 @@ declare const test: playwright_test.TestType<playwright_test.PlaywrightTestArgs
2485
2568
  */
2486
2569
 
2487
2570
  /**
2488
- * LLM provider for host simulation.
2571
+ * Host type for MCP host simulation.
2572
+ *
2573
+ * - 'sdk': Programmatic via Vercel AI SDK (default). The framework's MCP connection is reused.
2574
+ * - 'cli': CLI-based hosts (e.g., Claude Code, Codex). Spawns a process with its own MCP connection.
2575
+ * - 'browser': Web-based hosts (e.g., claude.ai). Uses Playwright/CDP. (Not yet implemented.)
2576
+ * - 'desktop': Desktop app hosts (e.g., Claude Desktop). Uses computer use. (Not yet implemented.)
2577
+ */
2578
+ type HostType = 'sdk' | 'cli' | 'browser' | 'desktop';
2579
+ /**
2580
+ * LLM provider for SDK-based host simulation.
2489
2581
  *
2490
- * All providers run through the Vercel AI SDK (`ai` package).
2491
- * Each provider requires its corresponding @ai-sdk/* package:
2582
+ * Each provider runs through the Vercel AI SDK (`ai` package)
2583
+ * and requires its corresponding @ai-sdk/* package:
2492
2584
  *
2493
2585
  * openai → npm install ai @ai-sdk/openai
2494
2586
  * anthropic → npm install ai @ai-sdk/anthropic
@@ -2508,14 +2600,81 @@ type LLMProvider = 'openai' | 'anthropic' | 'azure' | 'google' | 'mistral' | 'de
2508
2600
  * @example model: 'claude-3-5-haiku@20241022'
2509
2601
  */
2510
2602
  | 'vertex-anthropic';
2603
+ /**
2604
+ * Output format for CLI host processes.
2605
+ *
2606
+ * - 'stream-json': NDJSON (one JSON object per line). Used by Claude Code (`--output-format stream-json`).
2607
+ * - 'json': Single JSON object on stdout.
2608
+ */
2609
+ type CLIOutputFormat = 'stream-json' | 'json';
2610
+ /**
2611
+ * Configuration for a CLI host process.
2612
+ *
2613
+ * The process is spawned directly (no shell) with `command` and `args`.
2614
+ * Use `{{scenario}}` in any args entry as a placeholder for the natural
2615
+ * language prompt — the framework replaces it before spawning.
2616
+ *
2617
+ * Because args are passed directly to the process (not through a shell),
2618
+ * special characters in the scenario (quotes, newlines, `$`, etc.) are
2619
+ * handled safely without escaping.
2620
+ *
2621
+ * @example Claude Code
2622
+ * ```json
2623
+ * {
2624
+ * "command": "claude",
2625
+ * "args": ["-p", "{{scenario}}", "--output-format", "stream-json",
2626
+ * "--verbose", "--mcp-config", "{...}"]
2627
+ * }
2628
+ * ```
2629
+ *
2630
+ * @example Custom CLI
2631
+ * ```json
2632
+ * {
2633
+ * "command": "my-agent",
2634
+ * "args": ["--prompt", "{{scenario}}", "--config", "./mcp.json"],
2635
+ * "outputFormat": "json"
2636
+ * }
2637
+ * ```
2638
+ */
2639
+ interface CLIConfig {
2640
+ /**
2641
+ * CLI binary to invoke.
2642
+ */
2643
+ command: string;
2644
+ /**
2645
+ * Arguments to pass. Use `{{scenario}}` as a placeholder for the prompt.
2646
+ */
2647
+ args: string[];
2648
+ /**
2649
+ * How to parse stdout.
2650
+ * @default 'stream-json'
2651
+ */
2652
+ outputFormat?: CLIOutputFormat;
2653
+ /**
2654
+ * Timeout in milliseconds.
2655
+ * @default 120000 (2 minutes)
2656
+ */
2657
+ timeout?: number;
2658
+ }
2511
2659
  /**
2512
2660
  * Configuration for MCP host simulation
2513
2661
  */
2514
2662
  interface MCPHostConfig {
2515
2663
  /**
2516
- * LLM provider to use
2664
+ * Host type for the simulation.
2665
+ *
2666
+ * - 'sdk': Programmatic via Vercel AI SDK (default). The framework's MCP connection is reused.
2667
+ * - 'cli': CLI-based hosts (e.g., Claude Code, Codex). Spawns a process with its own MCP connection.
2668
+ * - 'browser': Web-based hosts (not yet implemented).
2669
+ * - 'desktop': Desktop app hosts (not yet implemented).
2670
+ *
2671
+ * @default 'sdk'
2672
+ */
2673
+ hostType?: HostType;
2674
+ /**
2675
+ * LLM provider (required for 'sdk' host type, ignored for 'cli')
2517
2676
  */
2518
- provider: LLMProvider;
2677
+ provider?: LLMProvider;
2519
2678
  /**
2520
2679
  * Environment variable name containing the API key
2521
2680
  */
@@ -2538,6 +2697,10 @@ interface MCPHostConfig {
2538
2697
  * @default 10
2539
2698
  */
2540
2699
  maxToolCalls?: number;
2700
+ /**
2701
+ * CLI host configuration (required for 'cli' host type).
2702
+ */
2703
+ cli?: CLIConfig;
2541
2704
  }
2542
2705
  /**
2543
2706
  * A tool call made by the LLM
@@ -2709,6 +2872,42 @@ interface EvalCase {
2709
2872
  */
2710
2873
  expect?: EvalExpectBlock;
2711
2874
  }
2875
+ /**
2876
+ * Configuration for a single LLM-as-judge evaluation
2877
+ */
2878
+ interface JudgeExpectConfig {
2879
+ /**
2880
+ * Name of a registered custom judge executor.
2881
+ * When set, the named judge handles evaluation and returns a normalized score.
2882
+ * The `threshold` determines pass/fail. `reps` and LLM config fields
2883
+ * (provider, model, etc.) are ignored.
2884
+ */
2885
+ judge?: string;
2886
+ /** Built-in rubric name or custom rubric object. Required when no `judge` is specified. */
2887
+ rubric?: BuiltInRubric | {
2888
+ text: string;
2889
+ };
2890
+ /** Reference response to compare against */
2891
+ reference?: unknown;
2892
+ /** Score threshold for passing (0-1, default: 0.7) */
2893
+ threshold?: number;
2894
+ /** Number of judge evaluations for this assertion. Overrides EvalCase.judgeReps. */
2895
+ reps?: number;
2896
+ /** Judge provider. @default 'anthropic' */
2897
+ provider?: 'anthropic' | 'vertex-anthropic' | 'anthropic-agent-sdk' | 'openai' | 'google';
2898
+ /** Model override (e.g., 'claude-opus-4-20250514') */
2899
+ model?: string;
2900
+ /** Environment variable name for API key */
2901
+ apiKeyEnvVar?: string;
2902
+ /** Max tokens for judge response */
2903
+ maxTokens?: number;
2904
+ /** Temperature for judge LLM (0–1) */
2905
+ temperature?: number;
2906
+ /** Max budget in USD per evaluation */
2907
+ maxBudgetUsd?: number;
2908
+ /** Fail if response exceeds this size in bytes before judging */
2909
+ maxToolOutputSize?: number;
2910
+ }
2712
2911
  /**
2713
2912
  * Unified expectation block for eval cases
2714
2913
  *
@@ -2748,33 +2947,11 @@ interface EvalExpectBlock {
2748
2947
  isError?: boolean | string | string[];
2749
2948
  /**
2750
2949
  * LLM-as-judge evaluation (toPassToolJudge)
2950
+ *
2951
+ * Accepts a single judge config or an array for multi-judge evaluation.
2952
+ * When an array is provided, all judges must pass (AND semantics).
2751
2953
  */
2752
- passesJudge?: {
2753
- /** Built-in rubric name or custom rubric object */
2754
- rubric: BuiltInRubric | {
2755
- text: string;
2756
- };
2757
- /** Reference response to compare against */
2758
- reference?: unknown;
2759
- /** Score threshold for passing (0-1, default: 0.7) */
2760
- threshold?: number;
2761
- /** Number of judge evaluations for this assertion. Overrides EvalCase.judgeReps. */
2762
- reps?: number;
2763
- /** Judge provider. @default 'anthropic' */
2764
- provider?: 'anthropic' | 'openai' | 'google';
2765
- /** Model override (e.g., 'claude-opus-4-20250514') */
2766
- model?: string;
2767
- /** Environment variable name for API key */
2768
- apiKeyEnvVar?: string;
2769
- /** Max tokens for judge response */
2770
- maxTokens?: number;
2771
- /** Temperature for judge LLM (0–1) */
2772
- temperature?: number;
2773
- /** Max budget in USD per evaluation */
2774
- maxBudgetUsd?: number;
2775
- /** Fail if response exceeds this size in bytes before judging */
2776
- maxToolOutputSize?: number;
2777
- };
2954
+ passesJudge?: JudgeExpectConfig | JudgeExpectConfig[];
2778
2955
  /**
2779
2956
  * Response size validation (toHaveToolResponseSize)
2780
2957
  */
@@ -2859,7 +3036,13 @@ declare const EvalCaseSchema: z.ZodObject<{
2859
3036
  args: z.ZodOptional<z.ZodRecord<z.ZodString, z.ZodUnknown>>;
2860
3037
  scenario: z.ZodOptional<z.ZodString>;
2861
3038
  mcpHostConfig: z.ZodOptional<z.ZodObject<{
2862
- provider: z.ZodEnum<{
3039
+ hostType: z.ZodOptional<z.ZodEnum<{
3040
+ sdk: "sdk";
3041
+ cli: "cli";
3042
+ browser: "browser";
3043
+ desktop: "desktop";
3044
+ }>>;
3045
+ provider: z.ZodOptional<z.ZodEnum<{
2863
3046
  openai: "openai";
2864
3047
  anthropic: "anthropic";
2865
3048
  azure: "azure";
@@ -2869,12 +3052,21 @@ declare const EvalCaseSchema: z.ZodObject<{
2869
3052
  openrouter: "openrouter";
2870
3053
  xai: "xai";
2871
3054
  "vertex-anthropic": "vertex-anthropic";
2872
- }>;
3055
+ }>>;
2873
3056
  apiKeyEnvVar: z.ZodOptional<z.ZodString>;
2874
3057
  model: z.ZodOptional<z.ZodString>;
2875
3058
  maxTokens: z.ZodOptional<z.ZodNumber>;
2876
3059
  temperature: z.ZodOptional<z.ZodNumber>;
2877
3060
  maxToolCalls: z.ZodOptional<z.ZodNumber>;
3061
+ cli: z.ZodOptional<z.ZodObject<{
3062
+ command: z.ZodString;
3063
+ args: z.ZodArray<z.ZodString>;
3064
+ outputFormat: z.ZodOptional<z.ZodEnum<{
3065
+ json: "json";
3066
+ "stream-json": "stream-json";
3067
+ }>>;
3068
+ timeout: z.ZodOptional<z.ZodNumber>;
3069
+ }, z.core.$strip>>;
2878
3070
  }, z.core.$strip>>;
2879
3071
  metadata: z.ZodOptional<z.ZodRecord<z.ZodString, z.ZodUnknown>>;
2880
3072
  iterations: z.ZodOptional<z.ZodNumber>;
@@ -2901,8 +3093,9 @@ declare const EvalCaseSchema: z.ZodObject<{
2901
3093
  remove: z.ZodArray<z.ZodString>;
2902
3094
  }, z.core.$strip>]>>>;
2903
3095
  isError: z.ZodOptional<z.ZodUnion<readonly [z.ZodBoolean, z.ZodString, z.ZodArray<z.ZodString>]>>;
2904
- passesJudge: z.ZodOptional<z.ZodObject<{
2905
- rubric: z.ZodUnion<readonly [z.ZodEnum<{
3096
+ passesJudge: z.ZodOptional<z.ZodUnion<readonly [z.ZodObject<{
3097
+ judge: z.ZodOptional<z.ZodString>;
3098
+ rubric: z.ZodOptional<z.ZodUnion<readonly [z.ZodEnum<{
2906
3099
  correctness: "correctness";
2907
3100
  completeness: "completeness";
2908
3101
  groundedness: "groundedness";
@@ -2910,7 +3103,7 @@ declare const EvalCaseSchema: z.ZodObject<{
2910
3103
  conciseness: "conciseness";
2911
3104
  }>, z.ZodObject<{
2912
3105
  text: z.ZodString;
2913
- }, z.core.$strip>]>;
3106
+ }, z.core.$strip>]>>;
2914
3107
  reference: z.ZodOptional<z.ZodUnknown>;
2915
3108
  threshold: z.ZodOptional<z.ZodNumber>;
2916
3109
  reps: z.ZodOptional<z.ZodNumber>;
@@ -2918,6 +3111,8 @@ declare const EvalCaseSchema: z.ZodObject<{
2918
3111
  openai: "openai";
2919
3112
  anthropic: "anthropic";
2920
3113
  google: "google";
3114
+ "vertex-anthropic": "vertex-anthropic";
3115
+ "anthropic-agent-sdk": "anthropic-agent-sdk";
2921
3116
  }>>;
2922
3117
  model: z.ZodOptional<z.ZodString>;
2923
3118
  apiKeyEnvVar: z.ZodOptional<z.ZodString>;
@@ -2925,7 +3120,34 @@ declare const EvalCaseSchema: z.ZodObject<{
2925
3120
  temperature: z.ZodOptional<z.ZodNumber>;
2926
3121
  maxBudgetUsd: z.ZodOptional<z.ZodNumber>;
2927
3122
  maxToolOutputSize: z.ZodOptional<z.ZodNumber>;
2928
- }, z.core.$strip>>;
3123
+ }, z.core.$strip>, z.ZodArray<z.ZodObject<{
3124
+ judge: z.ZodOptional<z.ZodString>;
3125
+ rubric: z.ZodOptional<z.ZodUnion<readonly [z.ZodEnum<{
3126
+ correctness: "correctness";
3127
+ completeness: "completeness";
3128
+ groundedness: "groundedness";
3129
+ "instruction-following": "instruction-following";
3130
+ conciseness: "conciseness";
3131
+ }>, z.ZodObject<{
3132
+ text: z.ZodString;
3133
+ }, z.core.$strip>]>>;
3134
+ reference: z.ZodOptional<z.ZodUnknown>;
3135
+ threshold: z.ZodOptional<z.ZodNumber>;
3136
+ reps: z.ZodOptional<z.ZodNumber>;
3137
+ provider: z.ZodOptional<z.ZodEnum<{
3138
+ openai: "openai";
3139
+ anthropic: "anthropic";
3140
+ google: "google";
3141
+ "vertex-anthropic": "vertex-anthropic";
3142
+ "anthropic-agent-sdk": "anthropic-agent-sdk";
3143
+ }>>;
3144
+ model: z.ZodOptional<z.ZodString>;
3145
+ apiKeyEnvVar: z.ZodOptional<z.ZodString>;
3146
+ maxTokens: z.ZodOptional<z.ZodNumber>;
3147
+ temperature: z.ZodOptional<z.ZodNumber>;
3148
+ maxBudgetUsd: z.ZodOptional<z.ZodNumber>;
3149
+ maxToolOutputSize: z.ZodOptional<z.ZodNumber>;
3150
+ }, z.core.$strip>>]>>;
2929
3151
  responseSize: z.ZodOptional<z.ZodObject<{
2930
3152
  maxBytes: z.ZodOptional<z.ZodNumber>;
2931
3153
  minBytes: z.ZodOptional<z.ZodNumber>;
@@ -2966,7 +3188,13 @@ declare const EvalDatasetSchema: z.ZodObject<{
2966
3188
  args: z.ZodOptional<z.ZodRecord<z.ZodString, z.ZodUnknown>>;
2967
3189
  scenario: z.ZodOptional<z.ZodString>;
2968
3190
  mcpHostConfig: z.ZodOptional<z.ZodObject<{
2969
- provider: z.ZodEnum<{
3191
+ hostType: z.ZodOptional<z.ZodEnum<{
3192
+ sdk: "sdk";
3193
+ cli: "cli";
3194
+ browser: "browser";
3195
+ desktop: "desktop";
3196
+ }>>;
3197
+ provider: z.ZodOptional<z.ZodEnum<{
2970
3198
  openai: "openai";
2971
3199
  anthropic: "anthropic";
2972
3200
  azure: "azure";
@@ -2976,12 +3204,21 @@ declare const EvalDatasetSchema: z.ZodObject<{
2976
3204
  openrouter: "openrouter";
2977
3205
  xai: "xai";
2978
3206
  "vertex-anthropic": "vertex-anthropic";
2979
- }>;
3207
+ }>>;
2980
3208
  apiKeyEnvVar: z.ZodOptional<z.ZodString>;
2981
3209
  model: z.ZodOptional<z.ZodString>;
2982
3210
  maxTokens: z.ZodOptional<z.ZodNumber>;
2983
3211
  temperature: z.ZodOptional<z.ZodNumber>;
2984
3212
  maxToolCalls: z.ZodOptional<z.ZodNumber>;
3213
+ cli: z.ZodOptional<z.ZodObject<{
3214
+ command: z.ZodString;
3215
+ args: z.ZodArray<z.ZodString>;
3216
+ outputFormat: z.ZodOptional<z.ZodEnum<{
3217
+ json: "json";
3218
+ "stream-json": "stream-json";
3219
+ }>>;
3220
+ timeout: z.ZodOptional<z.ZodNumber>;
3221
+ }, z.core.$strip>>;
2985
3222
  }, z.core.$strip>>;
2986
3223
  metadata: z.ZodOptional<z.ZodRecord<z.ZodString, z.ZodUnknown>>;
2987
3224
  iterations: z.ZodOptional<z.ZodNumber>;
@@ -3008,8 +3245,9 @@ declare const EvalDatasetSchema: z.ZodObject<{
3008
3245
  remove: z.ZodArray<z.ZodString>;
3009
3246
  }, z.core.$strip>]>>>;
3010
3247
  isError: z.ZodOptional<z.ZodUnion<readonly [z.ZodBoolean, z.ZodString, z.ZodArray<z.ZodString>]>>;
3011
- passesJudge: z.ZodOptional<z.ZodObject<{
3012
- rubric: z.ZodUnion<readonly [z.ZodEnum<{
3248
+ passesJudge: z.ZodOptional<z.ZodUnion<readonly [z.ZodObject<{
3249
+ judge: z.ZodOptional<z.ZodString>;
3250
+ rubric: z.ZodOptional<z.ZodUnion<readonly [z.ZodEnum<{
3013
3251
  correctness: "correctness";
3014
3252
  completeness: "completeness";
3015
3253
  groundedness: "groundedness";
@@ -3017,7 +3255,7 @@ declare const EvalDatasetSchema: z.ZodObject<{
3017
3255
  conciseness: "conciseness";
3018
3256
  }>, z.ZodObject<{
3019
3257
  text: z.ZodString;
3020
- }, z.core.$strip>]>;
3258
+ }, z.core.$strip>]>>;
3021
3259
  reference: z.ZodOptional<z.ZodUnknown>;
3022
3260
  threshold: z.ZodOptional<z.ZodNumber>;
3023
3261
  reps: z.ZodOptional<z.ZodNumber>;
@@ -3025,6 +3263,8 @@ declare const EvalDatasetSchema: z.ZodObject<{
3025
3263
  openai: "openai";
3026
3264
  anthropic: "anthropic";
3027
3265
  google: "google";
3266
+ "vertex-anthropic": "vertex-anthropic";
3267
+ "anthropic-agent-sdk": "anthropic-agent-sdk";
3028
3268
  }>>;
3029
3269
  model: z.ZodOptional<z.ZodString>;
3030
3270
  apiKeyEnvVar: z.ZodOptional<z.ZodString>;
@@ -3032,7 +3272,34 @@ declare const EvalDatasetSchema: z.ZodObject<{
3032
3272
  temperature: z.ZodOptional<z.ZodNumber>;
3033
3273
  maxBudgetUsd: z.ZodOptional<z.ZodNumber>;
3034
3274
  maxToolOutputSize: z.ZodOptional<z.ZodNumber>;
3035
- }, z.core.$strip>>;
3275
+ }, z.core.$strip>, z.ZodArray<z.ZodObject<{
3276
+ judge: z.ZodOptional<z.ZodString>;
3277
+ rubric: z.ZodOptional<z.ZodUnion<readonly [z.ZodEnum<{
3278
+ correctness: "correctness";
3279
+ completeness: "completeness";
3280
+ groundedness: "groundedness";
3281
+ "instruction-following": "instruction-following";
3282
+ conciseness: "conciseness";
3283
+ }>, z.ZodObject<{
3284
+ text: z.ZodString;
3285
+ }, z.core.$strip>]>>;
3286
+ reference: z.ZodOptional<z.ZodUnknown>;
3287
+ threshold: z.ZodOptional<z.ZodNumber>;
3288
+ reps: z.ZodOptional<z.ZodNumber>;
3289
+ provider: z.ZodOptional<z.ZodEnum<{
3290
+ openai: "openai";
3291
+ anthropic: "anthropic";
3292
+ google: "google";
3293
+ "vertex-anthropic": "vertex-anthropic";
3294
+ "anthropic-agent-sdk": "anthropic-agent-sdk";
3295
+ }>>;
3296
+ model: z.ZodOptional<z.ZodString>;
3297
+ apiKeyEnvVar: z.ZodOptional<z.ZodString>;
3298
+ maxTokens: z.ZodOptional<z.ZodNumber>;
3299
+ temperature: z.ZodOptional<z.ZodNumber>;
3300
+ maxBudgetUsd: z.ZodOptional<z.ZodNumber>;
3301
+ maxToolOutputSize: z.ZodOptional<z.ZodNumber>;
3302
+ }, z.core.$strip>>]>>;
3036
3303
  responseSize: z.ZodOptional<z.ZodObject<{
3037
3304
  maxBytes: z.ZodOptional<z.ZodNumber>;
3038
3305
  minBytes: z.ZodOptional<z.ZodNumber>;
@@ -3268,6 +3535,23 @@ interface IterationResult {
3268
3535
  }>;
3269
3536
  };
3270
3537
  }
3538
+ /**
3539
+ * Request data captured from the eval case input.
3540
+ * Preserves what was sent so results are self-contained for debugging.
3541
+ */
3542
+ interface EvalCaseRequest {
3543
+ /** Human-readable description of the case */
3544
+ description?: string;
3545
+ /** Tool arguments (direct mode) */
3546
+ args?: Record<string, unknown>;
3547
+ /** Natural language scenario sent to the LLM (mcp_host mode) */
3548
+ scenario?: string;
3549
+ /** LLM provider/model configuration (mcp_host mode) */
3550
+ mcpHostConfig?: {
3551
+ provider?: string;
3552
+ model?: string;
3553
+ };
3554
+ }
3271
3555
  /**
3272
3556
  * Result of a single eval case
3273
3557
  */
@@ -3292,6 +3576,11 @@ interface EvalCaseResult {
3292
3576
  * Overall pass/fail status
3293
3577
  */
3294
3578
  pass: boolean;
3579
+ /**
3580
+ * Request data from the eval case input (tool args, scenario, LLM config).
3581
+ * Populated so results are self-contained for debugging without the original dataset.
3582
+ */
3583
+ request?: EvalCaseRequest;
3295
3584
  /**
3296
3585
  * Tool response
3297
3586
  */
@@ -3835,24 +4124,31 @@ declare function runServerComparison(options: ServerComparisonOptions, contextA:
3835
4124
  * schemas, testing discoverability and parameter clarity at the level a real
3836
4125
  * user (via Claude Desktop, ChatGPT, etc.) would experience.
3837
4126
  *
3838
- * All providers run through the Vercel AI SDK's generateText with maxSteps,
3839
- * which handles multi-turn tool calling natively and provides per-step latency
3840
- * decomposition (llmDurationMs vs. mcpDurationMs).
3841
- *
3842
- * @param mcp - MCP fixture API
4127
+ * @param mcp - MCP fixture API (used by SDK hosts; ignored by CLI/browser hosts which establish their own connections)
3843
4128
  * @param scenario - Natural language prompt describing what the LLM should do
3844
4129
  * @param config - MCP host configuration (provider, model, temperature, etc.)
3845
4130
  * @returns Simulation result with tool calls, final response, and latency data
3846
4131
  *
3847
4132
  * @example
3848
4133
  * ```typescript
4134
+ * // SDK host (default) — uses the framework's existing MCP connection
3849
4135
  * const result = await simulateMCPHost(mcp,
3850
4136
  * "Find recent documents about MCP testing frameworks",
3851
4137
  * { provider: 'anthropic', model: 'claude-3-5-sonnet-20241022' }
3852
4138
  * );
3853
4139
  *
3854
- * expect(result.success).toBe(true);
3855
- * expect(result.toolCalls.map(c => c.name)).toContain('search');
4140
+ * // CLI host — spawns a CLI process with its own MCP connection
4141
+ * const result = await simulateMCPHost(mcp,
4142
+ * "Find recent documents about MCP testing frameworks",
4143
+ * {
4144
+ * hostType: 'cli',
4145
+ * provider: 'anthropic',
4146
+ * cli: {
4147
+ * command: 'claude',
4148
+ * args: ['-p', '{{scenario}}', '--output-format', 'stream-json', '--verbose'],
4149
+ * },
4150
+ * }
4151
+ * );
3856
4152
  * ```
3857
4153
  */
3858
4154
  declare function simulateMCPHost(mcp: MCPFixtureApi, scenario: string, config: MCPHostConfig): Promise<MCPHostSimulationResult>;
@@ -3905,6 +4201,99 @@ declare function getMissingDependencyMessage(provider: LLMProvider): string;
3905
4201
  */
3906
4202
  declare function createJudge(config?: JudgeConfig): Judge;
3907
4203
 
4204
+ /**
4205
+ * Custom Judge Registry
4206
+ *
4207
+ * Allows consumers to register named judge executors that can be referenced
4208
+ * by string ID in eval fixtures and programmatic tests. This enables
4209
+ * multi-step judge pipelines (LLM call + post-processing), custom scoring
4210
+ * logic, and reusable judge configurations without duplicating rubrics.
4211
+ */
4212
+ /**
4213
+ * Result returned by a custom judge executor.
4214
+ *
4215
+ * Custom judges must return a normalized score (0–1). The framework applies
4216
+ * the caller's `threshold` (default 0.7) to determine pass/fail. This keeps
4217
+ * judges reusable — the same judge can be used with different thresholds in
4218
+ * different tests.
4219
+ */
4220
+ interface CustomJudgeResult {
4221
+ /** Normalized score (0–1, where 1 is best) */
4222
+ score: number;
4223
+ /** Optional reasoning/explanation */
4224
+ reasoning?: string;
4225
+ }
4226
+ /**
4227
+ * A user-defined judge executor function.
4228
+ *
4229
+ * Custom executors own their entire evaluation pipeline — prompt construction,
4230
+ * LLM calls, and post-processing — but return a normalized score. The framework
4231
+ * determines pass/fail by comparing the score against the caller's threshold.
4232
+ *
4233
+ * @param candidate - The actual response to evaluate
4234
+ * @param reference - Optional reference/expected response
4235
+ * @returns Evaluation result with a normalized score and optional reasoning
4236
+ *
4237
+ * @example
4238
+ * ```typescript
4239
+ * const completenessJudge: CustomJudgeExecutor = async (candidate, reference) => {
4240
+ * // Step 1: LLM call with your own prompt and schema
4241
+ * const llmResult = await callLLM(COMPLETENESS_PROMPT, candidate);
4242
+ * const { verdict, reasoning } = JSON.parse(llmResult);
4243
+ *
4244
+ * // Step 2: Deterministic post-processing into a normalized score
4245
+ * const score = { Complete: 1.0, Incomplete: 0.5 }[verdict] ?? 0.0;
4246
+ *
4247
+ * return { score, reasoning };
4248
+ * };
4249
+ * ```
4250
+ */
4251
+ type CustomJudgeExecutor = (candidate: unknown, reference?: unknown) => Promise<CustomJudgeResult>;
4252
+ /**
4253
+ * Registers a named custom judge executor.
4254
+ *
4255
+ * Call this in your test setup (e.g., `playwright.config.ts` or a global setup file)
4256
+ * before tests run. The name can then be referenced in JSON eval fixtures via the
4257
+ * `judge` field on `passesJudge`.
4258
+ *
4259
+ * @param name - Unique identifier for the judge
4260
+ * @param executor - The judge executor function
4261
+ * @throws {Error} If a judge with the same name is already registered
4262
+ *
4263
+ * @example
4264
+ * ```typescript
4265
+ * import { registerJudge } from '@gleanwork/mcp-server-tester';
4266
+ *
4267
+ * registerJudge('glean-completeness', async (candidate, reference) => {
4268
+ * // Step 1: LLM call with your own prompt and schema
4269
+ * const llmResult = await callLLM(COMPLETENESS_PROMPT, candidate);
4270
+ * const { verdict, reasoning } = JSON.parse(llmResult);
4271
+ *
4272
+ * // Step 2: Deterministic post-processing into a normalized score
4273
+ * const score = { Complete: 1.0, Incomplete: 0.5 }[verdict] ?? 0.0;
4274
+ *
4275
+ * return { score, reasoning };
4276
+ * });
4277
+ *
4278
+ * // Then in tests — same judge, different thresholds:
4279
+ * // expect(result).toPassToolJudge({ judge: 'glean-completeness', passingThreshold: 0.8 });
4280
+ * // expect(result).toPassToolJudge({ judge: 'glean-completeness', passingThreshold: 0.5 });
4281
+ * ```
4282
+ */
4283
+ declare function registerJudge(name: string, executor: CustomJudgeExecutor): void;
4284
+ /**
4285
+ * Retrieves a registered custom judge executor by name.
4286
+ *
4287
+ * @param name - The judge name to look up
4288
+ * @returns The registered executor
4289
+ * @throws {Error} If no judge with the given name is registered
4290
+ */
4291
+ declare function getRegisteredJudge(name: string): CustomJudgeExecutor;
4292
+ /**
4293
+ * Clears all registered judges. Intended for test teardown.
4294
+ */
4295
+ declare function clearJudgeRegistry(): void;
4296
+
3908
4297
  /**
3909
4298
  * Options for conformance checks
3910
4299
  */
@@ -4066,4 +4455,4 @@ interface MCPEvalReporterConfig {
4066
4455
  includeAutoTracking?: boolean;
4067
4456
  }
4068
4457
 
4069
- export { type AuthType, BUILT_IN_RUBRICS, type BuiltInRubric, type BuiltInSanitizer, CLIOAuthClient, type CLIOAuthClientConfig, type CLIOAuthResult, type CaseComparisonResult, type ClientCredentialsConfig, type ComparisonOutcome, type ContentBlock, type CreateMCPClientOptions, DiscoveryError, ENV_VAR_NAMES, type EvalCase, type EvalCaseResult, EvalCaseSchema, type EvalContext, type EvalDataset, EvalDatasetSchema, type EvalExpectBlock, type EvalExpectationResult, type EvalMode, type EvalRunnerOptions, type EvalRunnerResult, type ExpectationBreakdown, type ExpectationResultMap, type ExpectationType, type FieldRemovalSanitizer, type HttpMCPConfig, type IterationResult, type Judge, type JudgeConfig, type JudgeMatcherOptions, type JudgeResult, type JudgeValidatorConfig, type LLMProvider, type LLMToolCall, type LoadDatasetOptions, type MCPAuthConfig, type MCPAuthFixtures, type MCPClientCredentialsConfig, type MCPConfig, MCPConfigSchema, type MCPConformanceCheck, type MCPConformanceOptions, type MCPConformanceRaw, type MCPConformanceResult, type MCPConformanceResultData, type MCPEvalData, type MCPEvalHistoricalSummary, type MCPEvalReporterConfig, type MCPEvalRunData, type MCPFixtureApi, type MCPFixtureOptions, type MCPHostCapabilities, type MCPHostConfig, type MCPHostSimulationResult, type MCPHostSimulator, type MCPOAuthConfig, type MCPServerCapabilitiesData, MCP_PROTOCOL_VERSION, type NormalizedToolResponse, type OAuthSetupConfig, type PatternValidatorOptions, PlaywrightOAuthClientProvider, type PlaywrightOAuthClientProviderConfig, type PredicateResult, type ProtectedResourceDiscoveryResult, type ProtectedResourceMetadata, type ProviderKind, type RegexSanitizer, type ResultSource, type RubricSpec, type SaveBaselineOptions, type SchemaRegistry, type SchemaValidatorOptions, type SerializedEvalDataset, type ServerComparisonOptions, type ServerComparisonResult, type SizeValidatorOptions, type SnapshotSanitizer, SnapshotSanitizers, type StdioMCPConfig, type StoredClientInfo, 
type StoredOAuthState, type StoredServerMetadata, type StoredTokens, type TextValidatorOptions, type TokenResult, type ToolCallCountOptions, type ToolCallExpectation, type ToolPredicate, type UsageMetrics, type ValidationResult, closeMCPClient, createJudge, createMCPClientForConfig, createMCPFixture, createTokenAuthHeaders, discoverAuthorizationServer, discoverProtectedResource, expect, extractText, getMissingDependencyMessage, getResponseSizeBytes, hasValidTokens, injectTokens, isBuiltInRubric, isHttpConfig, isProviderAvailable, isStdioConfig, isTokenExpired, isTokenExpiringSoon, loadBaseline, loadEvalDataset, loadEvalDatasetFromObject, loadTokens, loadTokensFromEnv, test as mcpAuthTest, normalizeToolResponse, normalizeWhitespace, performClientCredentialsFlow, performOAuthSetup, performOAuthSetupIfNeeded, resolveRubric, runConformanceChecks, runEvalCase, runEvalDataset, runServerComparison, saveBaseline, simulateMCPHost, test$1 as test, validateAccessToken, validateError, validateEvalCase, validateEvalDataset, validateJudge, validateMCPConfig, validatePattern, validateResponse, validateSchema, validateSize, validateText, validateToolCallCount, validateToolCalls };
4458
+ export { type AuthType, BUILT_IN_RUBRICS, type BuiltInRubric, type BuiltInSanitizer, type CLIConfig, CLIOAuthClient, type CLIOAuthClientConfig, type CLIOAuthResult, type CLIOutputFormat, type CaseComparisonResult, type ClientCredentialsConfig, type ComparisonOutcome, type ContentBlock, type CreateMCPClientOptions, type CustomJudgeExecutor, type CustomJudgeResult, DiscoveryError, ENV_VAR_NAMES, type EvalCase, type EvalCaseResult, EvalCaseSchema, type EvalContext, type EvalDataset, EvalDatasetSchema, type EvalExpectBlock, type EvalExpectationResult, type EvalMode, type EvalRunnerOptions, type EvalRunnerResult, type ExpectationBreakdown, type ExpectationResultMap, type ExpectationType, type FieldRemovalSanitizer, type HostType, type HttpMCPConfig, type IterationResult, type Judge, type JudgeConfig, type JudgeExpectConfig, type JudgeMatcherOptions, type JudgeResult, type JudgeValidatorConfig, type LLMProvider, type LLMToolCall, type LoadDatasetOptions, type MCPAuthConfig, type MCPAuthFixtures, type MCPClientCredentialsConfig, type MCPConfig, MCPConfigSchema, type MCPConformanceCheck, type MCPConformanceOptions, type MCPConformanceRaw, type MCPConformanceResult, type MCPConformanceResultData, type MCPEvalData, type MCPEvalHistoricalSummary, type MCPEvalReporterConfig, type MCPEvalRunData, type MCPFixtureApi, type MCPFixtureOptions, type MCPHostCapabilities, type MCPHostConfig, type MCPHostSimulationResult, type MCPHostSimulator, type MCPOAuthConfig, type MCPServerCapabilitiesData, MCP_PROTOCOL_VERSION, type NormalizedToolResponse, type OAuthSetupConfig, type PatternValidatorOptions, PlaywrightOAuthClientProvider, type PlaywrightOAuthClientProviderConfig, type PredicateResult, type ProtectedResourceDiscoveryResult, type ProtectedResourceMetadata, type ProviderKind, type RegexSanitizer, type ResultSource, type RubricSpec, type SaveBaselineOptions, type SchemaRegistry, type SchemaValidatorOptions, type SerializedEvalDataset, type ServerComparisonOptions, type 
ServerComparisonResult, type SizeValidatorOptions, type SnapshotSanitizer, SnapshotSanitizers, type StdioMCPConfig, type StoredClientInfo, type StoredOAuthState, type StoredServerMetadata, type StoredTokens, type TextValidatorOptions, type TokenResult, type ToolCallCountOptions, type ToolCallExpectation, type ToolPredicate, type UsageMetrics, type ValidationResult, clearJudgeRegistry, closeMCPClient, createJudge, createMCPClientForConfig, createMCPFixture, createTokenAuthHeaders, discoverAuthorizationServer, discoverProtectedResource, expect, extractText, getMissingDependencyMessage, getRegisteredJudge, getResponseSizeBytes, hasValidTokens, injectTokens, isBuiltInRubric, isHttpConfig, isProviderAvailable, isStdioConfig, isTokenExpired, isTokenExpiringSoon, loadBaseline, loadEvalDataset, loadEvalDatasetFromObject, loadTokens, loadTokensFromEnv, test as mcpAuthTest, normalizeToolResponse, normalizeWhitespace, performClientCredentialsFlow, performOAuthSetup, performOAuthSetupIfNeeded, registerJudge, resolveRubric, runConformanceChecks, runEvalCase, runEvalDataset, runServerComparison, saveBaseline, simulateMCPHost, test$1 as test, validateAccessToken, validateError, validateEvalCase, validateEvalDataset, validateJudge, validateMCPConfig, validatePattern, validateResponse, validateSchema, validateSize, validateText, validateToolCallCount, validateToolCalls };