npm - @gleanwork/mcp-server-tester - Versions diffs - 1.0.0-beta.6 → 1.0.0-beta.8 - Mend

@gleanwork/mcp-server-tester 1.0.0-beta.6 → 1.0.0-beta.8

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (15)

package/dist/cli/index.js +1 -1
package/dist/fixtures/mcp.d.ts +33 -8
package/dist/fixtures/mcp.js +284 -24
package/dist/fixtures/mcp.js.map +1 -1
package/dist/index.cjs +649 -62
package/dist/index.cjs.map +1 -1
package/dist/index.d.cts +504 -115
package/dist/index.d.ts +504 -115
package/dist/index.js +648 -64
package/dist/index.js.map +1 -1
package/dist/reporters/ui-dist/app.js +8 -134
package/dist/reporters/ui-dist/styles.css +1 -1
package/package.json +12 -7
package/dist/reporters/mcpReporter.d.cts +0 -90
package/dist/reporters/mcpReporter.d.ts +0 -90

package/dist/index.d.cts CHANGED Viewed

@@ -1,10 +1,10 @@
 import { z, ZodType } from 'zod';
+import { Page, TestInfo, Expect } from '@playwright/test';
 import { OAuthClientProvider } from '@modelcontextprotocol/sdk/client/auth.js';
 import { OAuthClientMetadata, OAuthClientInformationFull, OAuthTokens } from '@modelcontextprotocol/sdk/shared/auth.js';
 import * as oauth from 'oauth4webapi';
 import { Client } from '@modelcontextprotocol/sdk/client/index.js';
 import { CallToolResult, Tool, Implementation, ServerCapabilities, Resource, Prompt } from '@modelcontextprotocol/sdk/types.js';
-import { TestInfo, Expect } from '@playwright/test';
 import * as playwright_test from 'playwright/test';
 /**
@@ -311,6 +311,7 @@ declare function isHttpConfig(config: MCPConfig): config is HttpMCPConfig;
 /**
  * Auth types for MCP OAuth integration
  */
 /**
  * Stored OAuth tokens
  */
@@ -384,70 +385,90 @@ interface StoredOAuthState {
     savedAt: number;
 }
 /**
- * Configuration for OAuth setup flow
+ * Login form selectors for standard OAuth login automation
+ */
+interface OAuthLoginSelectors {
+    /** Selector for username/email input field */
+    usernameInput: string;
+    /** Selector for password input field */
+    passwordInput: string;
+    /** Selector for login submit button */
+    submitButton: string;
+    /** Selector for consent/authorize button (optional) */
+    consentButton?: string;
+}
+/**
+ * Base configuration shared by all OAuth setup strategies
  */
-interface OAuthSetupConfig {
-    /**
-     * OAuth authorization server metadata URL
-     */
+interface OAuthSetupBaseConfig {
+    /** OAuth authorization server metadata URL */
     authServerUrl: string;
-    /**
-     * Scopes to request
-     */
+    /** Scopes to request */
     scopes: Array<string>;
-    /**
-     * Resource indicator (RFC 8707)
-     */
+    /** Path to save OAuth state file */
+    outputPath: string;
+    /** Pre-registered client ID (optional, uses DCR if not provided) */
+    clientId?: string;
+    /** Pre-registered client secret (optional) */
+    clientSecret?: string;
+    /** Redirect URI for OAuth callback */
+    redirectUri?: string;
+    /** Resource indicator (RFC 8707) */
     resource?: string;
-    /**
-     * Login form selectors for automation
-     */
-    loginSelectors: {
-        /**
-         * Selector for username/email input field
-         */
-        usernameInput: string;
-        /**
-         * Selector for password input field
-         */
-        passwordInput: string;
-        /**
-         * Selector for login submit button
-         */
-        submitButton: string;
-        /**
-         * Selector for consent/authorize button (optional)
-         */
-        consentButton?: string;
-    };
-    /**
-     * Test user credentials
-     */
+    /** Timeout for login flow in milliseconds (default: 30000) */
+    timeoutMs?: number;
+}
+/**
+ * Standard login strategy: automates a form with username, password, and submit button.
+ * Use when the IdP presents all login fields on a single page.
+ */
+interface StandardLoginConfig {
+    /** Login form selectors for Playwright automation */
+    loginSelectors: OAuthLoginSelectors;
+    /** Test user credentials */
     credentials: {
         username: string;
         password: string;
     };
+    customLoginFlow?: never;
+}
+/**
+ * Custom login strategy: full control over the browser-based login flow.
+ * Use for multi-step logins, MFA, custom consent screens, or any flow
+ * that doesn't fit the standard username/password/submit pattern.
+ *
+ * The callback receives a Playwright Page already navigated to the OAuth
+ * authorization URL. Complete the login so the IdP redirects to the
+ * callback URL — `performOAuthSetup` handles PKCE, token exchange,
+ * and state persistence automatically.
+ */
+interface CustomLoginConfig {
     /**
-     * Path to save OAuth state file
-     */
-    outputPath: string;
-    /**
-     * Pre-registered client ID (optional, uses DCR if not provided)
-     */
-    clientId?: string;
-    /**
-     * Pre-registered client secret (optional)
-     */
-    clientSecret?: string;
-    /**
-     * Redirect URI for OAuth callback
-     */
-    redirectUri?: string;
-    /**
-     * Timeout for login flow in milliseconds (default: 30000)
+     * Custom Playwright automation for the IdP login flow.
+     *
+     * @param page - Playwright Page already navigated to the OAuth authorization URL
+     *
+     * @example
+     * ```typescript
+     * customLoginFlow: async (page) => {
+     *   await page.fill('#username', process.env.TEST_USER!);
+     *   await page.click('#continue');
+     *   await page.fill('#password', process.env.TEST_PASS!);
+     *   await page.click('#submit');
+     * }
+     * ```
      */
-    timeoutMs?: number;
+    customLoginFlow: (page: Page) => Promise<void>;
+    loginSelectors?: never;
+    credentials?: never;
 }
+/**
+ * Configuration for OAuth setup flow.
+ *
+ * Provide either `loginSelectors` + `credentials` for standard form-based login,
+ * or `customLoginFlow` for full control over the browser automation.
+ */
+type OAuthSetupConfig = OAuthSetupBaseConfig & (StandardLoginConfig | CustomLoginConfig);
 /**
  * Result of token exchange or refresh
  */
@@ -1632,7 +1653,7 @@ interface UsageMetrics {
     cacheCreationInputTokens?: number;
 }
 /** Valid LLM judge provider kinds. */
-type ProviderKind = 'anthropic' | 'openai' | 'google';
+type ProviderKind = 'anthropic' | 'vertex-anthropic' | 'anthropic-agent-sdk' | 'openai' | 'google';
 /**
  * Configuration for an LLM judge
  */
@@ -1744,8 +1765,11 @@ interface Judge {
  * Configuration for the judge validator
  */
 interface JudgeValidatorConfig {
-    /** The evaluation rubric: a built-in name or custom { text: string } */
-    rubric: RubricSpec;
+    /**
+     * The evaluation rubric: a built-in name or custom { text: string }.
+     * Required when no named `judge` is specified.
+     */
+    rubric?: RubricSpec;
     /** Optional reference response to compare against */
     reference?: unknown;
     /** Minimum score required to pass (0-1, default: 0.7) */
@@ -1766,6 +1790,13 @@ interface JudgeValidatorConfig {
     maxBudgetUsd?: number;
     /** Fail if response exceeds this size in bytes before judging */
     maxToolOutputSize?: number;
+    /**
+     * Name of a registered custom judge executor.
+     * When set, the named judge handles the entire evaluation pipeline
+     * and returns a normalized score. The `threshold` determines pass/fail.
+     * Register judges with `registerJudge()` before tests run.
+     */
+    judge?: string;
 }
 declare function validateJudge(response: unknown, config: JudgeValidatorConfig): Promise<ValidationResult>;
@@ -1823,6 +1854,12 @@ interface JudgeMatcherOptions {
     provider?: ProviderKind;
     /** Override the judge model */
     model?: string;
+    /**
+     * Name of a registered custom judge executor.
+     * When set, the named judge handles the entire evaluation pipeline
+     * and its `pass` result is authoritative.
+     */
+    judge?: string;
 }
 /**
  * Declaration merging for Playwright matchers
@@ -1913,21 +1950,30 @@ declare global {
              */
             toBeToolError(expected?: boolean | string | string[]): R;
             /**
-             * Validates that a response passes LLM-as-judge evaluation
+             * Validates that a response passes LLM-as-judge evaluation.
              *
-             * @param rubric - Evaluation rubric/criteria
-             * @param options - Judge options
+             * Three call signatures (the third takes an array of judge configs, all of which must pass):
+             * - With rubric: `toPassToolJudge(rubric, options?)` — built-in LLM judge
+             * - With named judge: `toPassToolJudge({ judge: 'name' })` — custom judge executor
              *
              * @example
              * ```typescript
+             * // Built-in LLM judge with rubric
              * expect(result).toPassToolJudge('Response should be helpful and accurate');
-             * expect(result).toPassToolJudge('Response should match reference', {
+             * expect(result).toPassToolJudge('correctness', {
              *   reference: expectedOutput,
              *   passingThreshold: 0.8,
              * });
+             *
+             * // Named custom judge (registered via registerJudge)
+             * expect(result).toPassToolJudge({ judge: 'glean-completeness' });
              * ```
              */
             toPassToolJudge(rubric: RubricSpec, options?: JudgeMatcherOptions): Promise<R>;
+            toPassToolJudge(options: JudgeMatcherOptions): Promise<R>;
+            toPassToolJudge(judges: Array<JudgeMatcherOptions & {
+                rubric?: RubricSpec;
+            }>): Promise<R>;
             /**
              * Validates that a response meets size constraints
              *
@@ -2050,6 +2096,33 @@ interface EvalExpectationResult {
      * Optional details about the result
      */
     details?: string;
+    /**
+     * Judge score (0-1). Populated for passesJudge expectations.
+     */
+    score?: number;
+    /**
+     * Judge reasoning. Populated for passesJudge expectations.
+     */
+    reasoning?: string;
+    /**
+     * Judge name — rubric name (e.g. 'correctness') or custom judge name.
+     * Populated for passesJudge expectations.
+     */
+    judgeName?: string;
+    /**
+     * Judge provider used. Populated for passesJudge expectations.
+     */
+    judgeProvider?: string;
+    /**
+     * Judge model used. Populated for passesJudge expectations.
+     */
+    judgeModel?: string;
+    /**
+     * Per-judge breakdown when multiple judges are used.
+     * Each entry contains the individual judge's result.
+     * Only populated when passesJudge is an array with 2+ entries.
+     */
+    judgeResults?: EvalExpectationResult[];
 }
 /**
  * Map of expectation type to result
@@ -2274,16 +2347,26 @@ declare function toBeToolError(this: {
  * Validates that a response passes LLM-as-judge evaluation.
  * Delegates evaluation logic to validateJudge() for consistency
  * with the validator/matcher duality pattern.
+ *
+ * Supports three call signatures:
+ *   - toPassToolJudge(rubric, options?)        — built-in LLM judge with rubric
+ *   - toPassToolJudge({ judge: 'name', ... })  — named custom judge
+ *   - toPassToolJudge([...judges])             — multi-judge (all must pass)
  */
 /**
- * Creates the toPassToolJudge matcher function
+ * The toPassToolJudge matcher function.
  *
- * Note: This is an async matcher that calls an LLM for evaluation.
+ * Accepts one of:
+ *   (received, rubric, options?) — rubric-based LLM judge
+ *   (received, options)          — named custom judge (options.judge required)
+ *   (received, judges[])         — multi-judge (all must pass)
  */
 declare function toPassToolJudge(this: {
     isNot: boolean;
-}, received: unknown, rubric: RubricSpec, options?: JudgeMatcherOptions): Promise<{
+}, received: unknown, rubricOrOptions: RubricSpec | JudgeMatcherOptions | Array<JudgeMatcherOptions & {
+    rubric?: RubricSpec;
+}>, maybeOptions?: JudgeMatcherOptions): Promise<{
     pass: boolean;
     message: () => string;
 }>;
@@ -2485,10 +2568,19 @@ declare const test: playwright_test.TestType<playwright_test.PlaywrightTestArgs
  */
 /**
- * LLM provider for host simulation.
+ * Host type for MCP host simulation.
+ *
+ * - 'sdk': Programmatic via Vercel AI SDK (default). The framework's MCP connection is reused.
+ * - 'cli': CLI-based hosts (e.g., Claude Code, Codex). Spawns a process with its own MCP connection.
+ * - 'browser': Web-based hosts (e.g., claude.ai). Uses Playwright/CDP. (Not yet implemented.)
+ * - 'desktop': Desktop app hosts (e.g., Claude Desktop). Uses computer use. (Not yet implemented.)
+ */
+type HostType = 'sdk' | 'cli' | 'browser' | 'desktop';
+/**
+ * LLM provider for SDK-based host simulation.
  *
- * All providers run through the Vercel AI SDK (`ai` package).
- * Each provider requires its corresponding @ai-sdk/* package:
+ * Each provider runs through the Vercel AI SDK (`ai` package)
+ * and requires its corresponding @ai-sdk/* package:
  *
  *   openai      → npm install ai @ai-sdk/openai
  *   anthropic   → npm install ai @ai-sdk/anthropic
@@ -2508,14 +2600,81 @@ type LLMProvider = 'openai' | 'anthropic' | 'azure' | 'google' | 'mistral' | 'de
  * @example model: 'claude-3-5-haiku@20241022'
  */
  | 'vertex-anthropic';
+/**
+ * Output format for CLI host processes.
+ *
+ * - 'stream-json': NDJSON (one JSON object per line). Used by Claude Code (`--output-format stream-json`).
+ * - 'json': Single JSON object on stdout.
+ */
+type CLIOutputFormat = 'stream-json' | 'json';
+/**
+ * Configuration for a CLI host process.
+ *
+ * The process is spawned directly (no shell) with `command` and `args`.
+ * Use `{{scenario}}` in any args entry as a placeholder for the natural
+ * language prompt — the framework replaces it before spawning.
+ *
+ * Because args are passed directly to the process (not through a shell),
+ * special characters in the scenario (quotes, newlines, `$`, etc.) are
+ * handled safely without escaping.
+ *
+ * @example Claude Code
+ * ```json
+ * {
+ *   "command": "claude",
+ *   "args": ["-p", "{{scenario}}", "--output-format", "stream-json",
+ *            "--verbose", "--mcp-config", "{...}"]
+ * }
+ * ```
+ *
+ * @example Custom CLI
+ * ```json
+ * {
+ *   "command": "my-agent",
+ *   "args": ["--prompt", "{{scenario}}", "--config", "./mcp.json"],
+ *   "outputFormat": "json"
+ * }
+ * ```
+ */
+interface CLIConfig {
+    /**
+     * CLI binary to invoke.
+     */
+    command: string;
+    /**
+     * Arguments to pass. Use `{{scenario}}` as a placeholder for the prompt.
+     */
+    args: string[];
+    /**
+     * How to parse stdout.
+     * @default 'stream-json'
+     */
+    outputFormat?: CLIOutputFormat;
+    /**
+     * Timeout in milliseconds.
+     * @default 120000 (2 minutes)
+     */
+    timeout?: number;
+}
 /**
  * Configuration for MCP host simulation
  */
 interface MCPHostConfig {
     /**
-     * LLM provider to use
+     * Host type for the simulation.
+     *
+     * - 'sdk': Programmatic via Vercel AI SDK (default). The framework's MCP connection is reused.
+     * - 'cli': CLI-based hosts (e.g., Claude Code, Codex). Spawns a process with its own MCP connection.
+     * - 'browser': Web-based hosts (not yet implemented).
+     * - 'desktop': Desktop app hosts (not yet implemented).
+     *
+     * @default 'sdk'
+     */
+    hostType?: HostType;
+    /**
+     * LLM provider (required for 'sdk' host type, ignored for 'cli')
      */
-    provider: LLMProvider;
+    provider?: LLMProvider;
     /**
      * Environment variable name containing the API key
      */
@@ -2538,6 +2697,10 @@ interface MCPHostConfig {
      * @default 10
      */
     maxToolCalls?: number;
+    /**
+     * CLI host configuration (required for 'cli' host type).
+     */
+    cli?: CLIConfig;
 }
 /**
  * A tool call made by the LLM
@@ -2709,6 +2872,42 @@ interface EvalCase {
      */
     expect?: EvalExpectBlock;
 }
+/**
+ * Configuration for a single LLM-as-judge evaluation
+ */
+interface JudgeExpectConfig {
+    /**
+     * Name of a registered custom judge executor.
+     * When set, the named judge handles evaluation and returns a normalized score.
+     * The `threshold` determines pass/fail. `reps` and LLM config fields
+     * (provider, model, etc.) are ignored.
+     */
+    judge?: string;
+    /** Built-in rubric name or custom rubric object. Required when no `judge` is specified. */
+    rubric?: BuiltInRubric | {
+        text: string;
+    };
+    /** Reference response to compare against */
+    reference?: unknown;
+    /** Score threshold for passing (0-1, default: 0.7) */
+    threshold?: number;
+    /** Number of judge evaluations for this assertion. Overrides EvalCase.judgeReps. */
+    reps?: number;
+    /** Judge provider. @default 'anthropic' */
+    provider?: 'anthropic' | 'vertex-anthropic' | 'anthropic-agent-sdk' | 'openai' | 'google';
+    /** Model override (e.g., 'claude-opus-4-20250514') */
+    model?: string;
+    /** Environment variable name for API key */
+    apiKeyEnvVar?: string;
+    /** Max tokens for judge response */
+    maxTokens?: number;
+    /** Temperature for judge LLM (0–1) */
+    temperature?: number;
+    /** Max budget in USD per evaluation */
+    maxBudgetUsd?: number;
+    /** Fail if response exceeds this size in bytes before judging */
+    maxToolOutputSize?: number;
+}
 /**
  * Unified expectation block for eval cases
  *
@@ -2748,33 +2947,11 @@ interface EvalExpectBlock {
     isError?: boolean | string | string[];
     /**
      * LLM-as-judge evaluation (toPassToolJudge)
+     *
+     * Accepts a single judge config or an array for multi-judge evaluation.
+     * When an array is provided, all judges must pass (AND semantics).
      */
-    passesJudge?: {
-        /** Built-in rubric name or custom rubric object */
-        rubric: BuiltInRubric | {
-            text: string;
-        };
-        /** Reference response to compare against */
-        reference?: unknown;
-        /** Score threshold for passing (0-1, default: 0.7) */
-        threshold?: number;
-        /** Number of judge evaluations for this assertion. Overrides EvalCase.judgeReps. */
-        reps?: number;
-        /** Judge provider. @default 'anthropic' */
-        provider?: 'anthropic' | 'openai' | 'google';
-        /** Model override (e.g., 'claude-opus-4-20250514') */
-        model?: string;
-        /** Environment variable name for API key */
-        apiKeyEnvVar?: string;
-        /** Max tokens for judge response */
-        maxTokens?: number;
-        /** Temperature for judge LLM (0–1) */
-        temperature?: number;
-        /** Max budget in USD per evaluation */
-        maxBudgetUsd?: number;
-        /** Fail if response exceeds this size in bytes before judging */
-        maxToolOutputSize?: number;
-    };
+    passesJudge?: JudgeExpectConfig | JudgeExpectConfig[];
     /**
      * Response size validation (toHaveToolResponseSize)
      */
@@ -2859,7 +3036,13 @@ declare const EvalCaseSchema: z.ZodObject<{
     args: z.ZodOptional<z.ZodRecord<z.ZodString, z.ZodUnknown>>;
     scenario: z.ZodOptional<z.ZodString>;
     mcpHostConfig: z.ZodOptional<z.ZodObject<{
-        provider: z.ZodEnum<{
+        hostType: z.ZodOptional<z.ZodEnum<{
+            sdk: "sdk";
+            cli: "cli";
+            browser: "browser";
+            desktop: "desktop";
+        }>>;
+        provider: z.ZodOptional<z.ZodEnum<{
             openai: "openai";
             anthropic: "anthropic";
             azure: "azure";
@@ -2869,12 +3052,21 @@ declare const EvalCaseSchema: z.ZodObject<{
             openrouter: "openrouter";
             xai: "xai";
             "vertex-anthropic": "vertex-anthropic";
-        }>;
+        }>>;
         apiKeyEnvVar: z.ZodOptional<z.ZodString>;
         model: z.ZodOptional<z.ZodString>;
         maxTokens: z.ZodOptional<z.ZodNumber>;
         temperature: z.ZodOptional<z.ZodNumber>;
         maxToolCalls: z.ZodOptional<z.ZodNumber>;
+        cli: z.ZodOptional<z.ZodObject<{
+            command: z.ZodString;
+            args: z.ZodArray<z.ZodString>;
+            outputFormat: z.ZodOptional<z.ZodEnum<{
+                json: "json";
+                "stream-json": "stream-json";
+            }>>;
+            timeout: z.ZodOptional<z.ZodNumber>;
+        }, z.core.$strip>>;
     }, z.core.$strip>>;
     metadata: z.ZodOptional<z.ZodRecord<z.ZodString, z.ZodUnknown>>;
     iterations: z.ZodOptional<z.ZodNumber>;
@@ -2901,8 +3093,9 @@ declare const EvalCaseSchema: z.ZodObject<{
             remove: z.ZodArray<z.ZodString>;
         }, z.core.$strip>]>>>;
         isError: z.ZodOptional<z.ZodUnion<readonly [z.ZodBoolean, z.ZodString, z.ZodArray<z.ZodString>]>>;
-        passesJudge: z.ZodOptional<z.ZodObject<{
-            rubric: z.ZodUnion<readonly [z.ZodEnum<{
+        passesJudge: z.ZodOptional<z.ZodUnion<readonly [z.ZodObject<{
+            judge: z.ZodOptional<z.ZodString>;
+            rubric: z.ZodOptional<z.ZodUnion<readonly [z.ZodEnum<{
                 correctness: "correctness";
                 completeness: "completeness";
                 groundedness: "groundedness";
@@ -2910,7 +3103,7 @@ declare const EvalCaseSchema: z.ZodObject<{
                 conciseness: "conciseness";
             }>, z.ZodObject<{
                 text: z.ZodString;
-            }, z.core.$strip>]>;
+            }, z.core.$strip>]>>;
             reference: z.ZodOptional<z.ZodUnknown>;
             threshold: z.ZodOptional<z.ZodNumber>;
             reps: z.ZodOptional<z.ZodNumber>;
@@ -2918,6 +3111,8 @@ declare const EvalCaseSchema: z.ZodObject<{
                 openai: "openai";
                 anthropic: "anthropic";
                 google: "google";
+                "vertex-anthropic": "vertex-anthropic";
+                "anthropic-agent-sdk": "anthropic-agent-sdk";
             }>>;
             model: z.ZodOptional<z.ZodString>;
             apiKeyEnvVar: z.ZodOptional<z.ZodString>;
@@ -2925,7 +3120,34 @@ declare const EvalCaseSchema: z.ZodObject<{
             temperature: z.ZodOptional<z.ZodNumber>;
             maxBudgetUsd: z.ZodOptional<z.ZodNumber>;
             maxToolOutputSize: z.ZodOptional<z.ZodNumber>;
-        }, z.core.$strip>>;
+        }, z.core.$strip>, z.ZodArray<z.ZodObject<{
+            judge: z.ZodOptional<z.ZodString>;
+            rubric: z.ZodOptional<z.ZodUnion<readonly [z.ZodEnum<{
+                correctness: "correctness";
+                completeness: "completeness";
+                groundedness: "groundedness";
+                "instruction-following": "instruction-following";
+                conciseness: "conciseness";
+            }>, z.ZodObject<{
+                text: z.ZodString;
+            }, z.core.$strip>]>>;
+            reference: z.ZodOptional<z.ZodUnknown>;
+            threshold: z.ZodOptional<z.ZodNumber>;
+            reps: z.ZodOptional<z.ZodNumber>;
+            provider: z.ZodOptional<z.ZodEnum<{
+                openai: "openai";
+                anthropic: "anthropic";
+                google: "google";
+                "vertex-anthropic": "vertex-anthropic";
+                "anthropic-agent-sdk": "anthropic-agent-sdk";
+            }>>;
+            model: z.ZodOptional<z.ZodString>;
+            apiKeyEnvVar: z.ZodOptional<z.ZodString>;
+            maxTokens: z.ZodOptional<z.ZodNumber>;
+            temperature: z.ZodOptional<z.ZodNumber>;
+            maxBudgetUsd: z.ZodOptional<z.ZodNumber>;
+            maxToolOutputSize: z.ZodOptional<z.ZodNumber>;
+        }, z.core.$strip>>]>>;
         responseSize: z.ZodOptional<z.ZodObject<{
             maxBytes: z.ZodOptional<z.ZodNumber>;
             minBytes: z.ZodOptional<z.ZodNumber>;
@@ -2966,7 +3188,13 @@ declare const EvalDatasetSchema: z.ZodObject<{
         args: z.ZodOptional<z.ZodRecord<z.ZodString, z.ZodUnknown>>;
         scenario: z.ZodOptional<z.ZodString>;
         mcpHostConfig: z.ZodOptional<z.ZodObject<{
-            provider: z.ZodEnum<{
+            hostType: z.ZodOptional<z.ZodEnum<{
+                sdk: "sdk";
+                cli: "cli";
+                browser: "browser";
+                desktop: "desktop";
+            }>>;
+            provider: z.ZodOptional<z.ZodEnum<{
                 openai: "openai";
                 anthropic: "anthropic";
                 azure: "azure";
@@ -2976,12 +3204,21 @@ declare const EvalDatasetSchema: z.ZodObject<{
                 openrouter: "openrouter";
                 xai: "xai";
                 "vertex-anthropic": "vertex-anthropic";
-            }>;
+            }>>;
             apiKeyEnvVar: z.ZodOptional<z.ZodString>;
             model: z.ZodOptional<z.ZodString>;
             maxTokens: z.ZodOptional<z.ZodNumber>;
             temperature: z.ZodOptional<z.ZodNumber>;
             maxToolCalls: z.ZodOptional<z.ZodNumber>;
+            cli: z.ZodOptional<z.ZodObject<{
+                command: z.ZodString;
+                args: z.ZodArray<z.ZodString>;
+                outputFormat: z.ZodOptional<z.ZodEnum<{
+                    json: "json";
+                    "stream-json": "stream-json";
+                }>>;
+                timeout: z.ZodOptional<z.ZodNumber>;
+            }, z.core.$strip>>;
         }, z.core.$strip>>;
         metadata: z.ZodOptional<z.ZodRecord<z.ZodString, z.ZodUnknown>>;
         iterations: z.ZodOptional<z.ZodNumber>;
@@ -3008,8 +3245,9 @@ declare const EvalDatasetSchema: z.ZodObject<{
                 remove: z.ZodArray<z.ZodString>;
             }, z.core.$strip>]>>>;
             isError: z.ZodOptional<z.ZodUnion<readonly [z.ZodBoolean, z.ZodString, z.ZodArray<z.ZodString>]>>;
-            passesJudge: z.ZodOptional<z.ZodObject<{
-                rubric: z.ZodUnion<readonly [z.ZodEnum<{
+            passesJudge: z.ZodOptional<z.ZodUnion<readonly [z.ZodObject<{
+                judge: z.ZodOptional<z.ZodString>;
+                rubric: z.ZodOptional<z.ZodUnion<readonly [z.ZodEnum<{
                     correctness: "correctness";
                     completeness: "completeness";
                     groundedness: "groundedness";
@@ -3017,7 +3255,7 @@ declare const EvalDatasetSchema: z.ZodObject<{
                     conciseness: "conciseness";
                 }>, z.ZodObject<{
                     text: z.ZodString;
-                }, z.core.$strip>]>;
+                }, z.core.$strip>]>>;
                 reference: z.ZodOptional<z.ZodUnknown>;
                 threshold: z.ZodOptional<z.ZodNumber>;
                 reps: z.ZodOptional<z.ZodNumber>;
@@ -3025,6 +3263,8 @@ declare const EvalDatasetSchema: z.ZodObject<{
                     openai: "openai";
                     anthropic: "anthropic";
                     google: "google";
+                    "vertex-anthropic": "vertex-anthropic";
+                    "anthropic-agent-sdk": "anthropic-agent-sdk";
                 }>>;
                 model: z.ZodOptional<z.ZodString>;
                 apiKeyEnvVar: z.ZodOptional<z.ZodString>;
@@ -3032,7 +3272,34 @@ declare const EvalDatasetSchema: z.ZodObject<{
                 temperature: z.ZodOptional<z.ZodNumber>;
                 maxBudgetUsd: z.ZodOptional<z.ZodNumber>;
                 maxToolOutputSize: z.ZodOptional<z.ZodNumber>;
-            }, z.core.$strip>>;
+            }, z.core.$strip>, z.ZodArray<z.ZodObject<{
+                judge: z.ZodOptional<z.ZodString>;
+                rubric: z.ZodOptional<z.ZodUnion<readonly [z.ZodEnum<{
+                    correctness: "correctness";
+                    completeness: "completeness";
+                    groundedness: "groundedness";
+                    "instruction-following": "instruction-following";
+                    conciseness: "conciseness";
+                }>, z.ZodObject<{
+                    text: z.ZodString;
+                }, z.core.$strip>]>>;
+                reference: z.ZodOptional<z.ZodUnknown>;
+                threshold: z.ZodOptional<z.ZodNumber>;
+                reps: z.ZodOptional<z.ZodNumber>;
+                provider: z.ZodOptional<z.ZodEnum<{
+                    openai: "openai";
+                    anthropic: "anthropic";
+                    google: "google";
+                    "vertex-anthropic": "vertex-anthropic";
+                    "anthropic-agent-sdk": "anthropic-agent-sdk";
+                }>>;
+                model: z.ZodOptional<z.ZodString>;
+                apiKeyEnvVar: z.ZodOptional<z.ZodString>;
+                maxTokens: z.ZodOptional<z.ZodNumber>;
+                temperature: z.ZodOptional<z.ZodNumber>;
+                maxBudgetUsd: z.ZodOptional<z.ZodNumber>;
+                maxToolOutputSize: z.ZodOptional<z.ZodNumber>;
+            }, z.core.$strip>>]>>;
             responseSize: z.ZodOptional<z.ZodObject<{
                 maxBytes: z.ZodOptional<z.ZodNumber>;
                 minBytes: z.ZodOptional<z.ZodNumber>;
@@ -3268,6 +3535,23 @@ interface IterationResult {
         }>;
     };
 }
+/**
+ * Request data captured from the eval case input.
+ * Preserves what was sent so results are self-contained for debugging.
+ */
+interface EvalCaseRequest {
+    /** Human-readable description of the case */
+    description?: string;
+    /** Tool arguments (direct mode) */
+    args?: Record<string, unknown>;
+    /** Natural language scenario sent to the LLM (mcp_host mode) */
+    scenario?: string;
+    /** LLM provider/model configuration (mcp_host mode) */
+    mcpHostConfig?: {
+        provider?: string;
+        model?: string;
+    };
+}
 /**
  * Result of a single eval case
  */
@@ -3292,6 +3576,11 @@ interface EvalCaseResult {
      * Overall pass/fail status
      */
     pass: boolean;
+    /**
+     * Request data from the eval case input (tool args, scenario, LLM config).
+     * Populated so results are self-contained for debugging without the original dataset.
+     */
+    request?: EvalCaseRequest;
     /**
      * Tool response
      */
@@ -3835,24 +4124,31 @@ declare function runServerComparison(options: ServerComparisonOptions, contextA:
  * schemas, testing discoverability and parameter clarity at the level a real
  * user (via Claude Desktop, ChatGPT, etc.) would experience.
  *
- * All providers run through the Vercel AI SDK's generateText with maxSteps,
- * which handles multi-turn tool calling natively and provides per-step latency
- * decomposition (llmDurationMs vs. mcpDurationMs).
- *
- * @param mcp - MCP fixture API
+ * @param mcp - MCP fixture API (used by SDK hosts; ignored by CLI/browser hosts which establish their own connections)
  * @param scenario - Natural language prompt describing what the LLM should do
  * @param config - MCP host configuration (provider, model, temperature, etc.)
  * @returns Simulation result with tool calls, final response, and latency data
  *
  * @example
  * ```typescript
+ * // SDK host (default) — uses the framework's existing MCP connection
  * const result = await simulateMCPHost(mcp,
  *   "Find recent documents about MCP testing frameworks",
  *   { provider: 'anthropic', model: 'claude-3-5-sonnet-20241022' }
  * );
  *
- * expect(result.success).toBe(true);
- * expect(result.toolCalls.map(c => c.name)).toContain('search');
+ * // CLI host — spawns a CLI process with its own MCP connection
+ * const result = await simulateMCPHost(mcp,
+ *   "Find recent documents about MCP testing frameworks",
+ *   {
+ *     hostType: 'cli',
+ *     provider: 'anthropic',
+ *     cli: {
+ *       command: 'claude',
+ *       args: ['-p', '{{scenario}}', '--output-format', 'stream-json', '--verbose'],
+ *     },
+ *   }
+ * );
  * ```
  */
 declare function simulateMCPHost(mcp: MCPFixtureApi, scenario: string, config: MCPHostConfig): Promise<MCPHostSimulationResult>;
@@ -3905,6 +4201,99 @@ declare function getMissingDependencyMessage(provider: LLMProvider): string;
  */
 declare function createJudge(config?: JudgeConfig): Judge;
+/**
+ * Custom Judge Registry
+ *
+ * Allows consumers to register named judge executors that can be referenced
+ * by string ID in eval fixtures and programmatic tests. This enables
+ * multi-step judge pipelines (LLM call + post-processing), custom scoring
+ * logic, and reusable judge configurations without duplicating rubrics.
+ */
+/**
+ * Result returned by a custom judge executor.
+ *
+ * Custom judges must return a normalized score (0–1). The framework applies
+ * the caller's `threshold` (default 0.7) to determine pass/fail. This keeps
+ * judges reusable — the same judge can be used with different thresholds in
+ * different tests.
+ */
+interface CustomJudgeResult {
+    /** Normalized score (0–1, where 1 is best) */
+    score: number;
+    /** Optional reasoning/explanation */
+    reasoning?: string;
+}
+/**
+ * A user-defined judge executor function.
+ *
+ * Custom executors own their entire evaluation pipeline — prompt construction,
+ * LLM calls, and post-processing — but return a normalized score. The framework
+ * determines pass/fail by comparing the score against the caller's threshold.
+ *
+ * @param candidate - The actual response to evaluate
+ * @param reference - Optional reference/expected response
+ * @returns Evaluation result with a normalized score and optional reasoning
+ *
+ * @example
+ * ```typescript
+ * const completenessJudge: CustomJudgeExecutor = async (candidate, reference) => {
+ *   // Step 1: LLM call with your own prompt and schema
+ *   const llmResult = await callLLM(COMPLETENESS_PROMPT, candidate);
+ *   const { verdict, reasoning } = JSON.parse(llmResult);
+ *
+ *   // Step 2: Deterministic post-processing into a normalized score
+ *   const score = { Complete: 1.0, Incomplete: 0.5 }[verdict] ?? 0.0;
+ *
+ *   return { score, reasoning };
+ * };
+ * ```
+ */
+type CustomJudgeExecutor = (candidate: unknown, reference?: unknown) => Promise<CustomJudgeResult>;
+/**
+ * Registers a named custom judge executor.
+ *
+ * Call this in your test setup (e.g., `playwright.config.ts` or a global setup file)
+ * before tests run. The name can then be referenced in JSON eval fixtures via the
+ * `judge` field on `passesJudge`.
+ *
+ * @param name - Unique identifier for the judge
+ * @param executor - The judge executor function
+ * @throws {Error} If a judge with the same name is already registered
+ *
+ * @example
+ * ```typescript
+ * import { registerJudge } from '@gleanwork/mcp-server-tester';
+ *
+ * registerJudge('glean-completeness', async (candidate, reference) => {
+ *   // Step 1: LLM call with your own prompt and schema
+ *   const llmResult = await callLLM(COMPLETENESS_PROMPT, candidate);
+ *   const { verdict, reasoning } = JSON.parse(llmResult);
+ *
+ *   // Step 2: Deterministic post-processing into a normalized score
+ *   const score = { Complete: 1.0, Incomplete: 0.5 }[verdict] ?? 0.0;
+ *
+ *   return { score, reasoning };
+ * });
+ *
+ * // Then in tests — same judge, different thresholds:
+ * // expect(result).toPassToolJudge({ judge: 'glean-completeness', passingThreshold: 0.8 });
+ * // expect(result).toPassToolJudge({ judge: 'glean-completeness', passingThreshold: 0.5 });
+ * ```
+ */
+declare function registerJudge(name: string, executor: CustomJudgeExecutor): void;
+/**
+ * Retrieves a registered custom judge executor by name.
+ *
+ * @param name - The judge name to look up
+ * @returns The registered executor
+ * @throws {Error} If no judge with the given name is registered
+ */
+declare function getRegisteredJudge(name: string): CustomJudgeExecutor;
+/**
+ * Clears all registered judges. Intended for test teardown.
+ */
+declare function clearJudgeRegistry(): void;
 /**
  * Options for conformance checks
  */
@@ -4066,4 +4455,4 @@ interface MCPEvalReporterConfig {
     includeAutoTracking?: boolean;
 }
-export { type AuthType, BUILT_IN_RUBRICS, type BuiltInRubric, type BuiltInSanitizer, CLIOAuthClient, type CLIOAuthClientConfig, type CLIOAuthResult, type CaseComparisonResult, type ClientCredentialsConfig, type ComparisonOutcome, type ContentBlock, type CreateMCPClientOptions, DiscoveryError, ENV_VAR_NAMES, type EvalCase, type EvalCaseResult, EvalCaseSchema, type EvalContext, type EvalDataset, EvalDatasetSchema, type EvalExpectBlock, type EvalExpectationResult, type EvalMode, type EvalRunnerOptions, type EvalRunnerResult, type ExpectationBreakdown, type ExpectationResultMap, type ExpectationType, type FieldRemovalSanitizer, type HttpMCPConfig, type IterationResult, type Judge, type JudgeConfig, type JudgeMatcherOptions, type JudgeResult, type JudgeValidatorConfig, type LLMProvider, type LLMToolCall, type LoadDatasetOptions, type MCPAuthConfig, type MCPAuthFixtures, type MCPClientCredentialsConfig, type MCPConfig, MCPConfigSchema, type MCPConformanceCheck, type MCPConformanceOptions, type MCPConformanceRaw, type MCPConformanceResult, type MCPConformanceResultData, type MCPEvalData, type MCPEvalHistoricalSummary, type MCPEvalReporterConfig, type MCPEvalRunData, type MCPFixtureApi, type MCPFixtureOptions, type MCPHostCapabilities, type MCPHostConfig, type MCPHostSimulationResult, type MCPHostSimulator, type MCPOAuthConfig, type MCPServerCapabilitiesData, MCP_PROTOCOL_VERSION, type NormalizedToolResponse, type OAuthSetupConfig, type PatternValidatorOptions, PlaywrightOAuthClientProvider, type PlaywrightOAuthClientProviderConfig, type PredicateResult, type ProtectedResourceDiscoveryResult, type ProtectedResourceMetadata, type ProviderKind, type RegexSanitizer, type ResultSource, type RubricSpec, type SaveBaselineOptions, type SchemaRegistry, type SchemaValidatorOptions, type SerializedEvalDataset, type ServerComparisonOptions, type ServerComparisonResult, type SizeValidatorOptions, type SnapshotSanitizer, SnapshotSanitizers, type StdioMCPConfig, type StoredClientInfo, 
type StoredOAuthState, type StoredServerMetadata, type StoredTokens, type TextValidatorOptions, type TokenResult, type ToolCallCountOptions, type ToolCallExpectation, type ToolPredicate, type UsageMetrics, type ValidationResult, closeMCPClient, createJudge, createMCPClientForConfig, createMCPFixture, createTokenAuthHeaders, discoverAuthorizationServer, discoverProtectedResource, expect, extractText, getMissingDependencyMessage, getResponseSizeBytes, hasValidTokens, injectTokens, isBuiltInRubric, isHttpConfig, isProviderAvailable, isStdioConfig, isTokenExpired, isTokenExpiringSoon, loadBaseline, loadEvalDataset, loadEvalDatasetFromObject, loadTokens, loadTokensFromEnv, test as mcpAuthTest, normalizeToolResponse, normalizeWhitespace, performClientCredentialsFlow, performOAuthSetup, performOAuthSetupIfNeeded, resolveRubric, runConformanceChecks, runEvalCase, runEvalDataset, runServerComparison, saveBaseline, simulateMCPHost, test$1 as test, validateAccessToken, validateError, validateEvalCase, validateEvalDataset, validateJudge, validateMCPConfig, validatePattern, validateResponse, validateSchema, validateSize, validateText, validateToolCallCount, validateToolCalls };
+export { type AuthType, BUILT_IN_RUBRICS, type BuiltInRubric, type BuiltInSanitizer, type CLIConfig, CLIOAuthClient, type CLIOAuthClientConfig, type CLIOAuthResult, type CLIOutputFormat, type CaseComparisonResult, type ClientCredentialsConfig, type ComparisonOutcome, type ContentBlock, type CreateMCPClientOptions, type CustomJudgeExecutor, type CustomJudgeResult, DiscoveryError, ENV_VAR_NAMES, type EvalCase, type EvalCaseResult, EvalCaseSchema, type EvalContext, type EvalDataset, EvalDatasetSchema, type EvalExpectBlock, type EvalExpectationResult, type EvalMode, type EvalRunnerOptions, type EvalRunnerResult, type ExpectationBreakdown, type ExpectationResultMap, type ExpectationType, type FieldRemovalSanitizer, type HostType, type HttpMCPConfig, type IterationResult, type Judge, type JudgeConfig, type JudgeExpectConfig, type JudgeMatcherOptions, type JudgeResult, type JudgeValidatorConfig, type LLMProvider, type LLMToolCall, type LoadDatasetOptions, type MCPAuthConfig, type MCPAuthFixtures, type MCPClientCredentialsConfig, type MCPConfig, MCPConfigSchema, type MCPConformanceCheck, type MCPConformanceOptions, type MCPConformanceRaw, type MCPConformanceResult, type MCPConformanceResultData, type MCPEvalData, type MCPEvalHistoricalSummary, type MCPEvalReporterConfig, type MCPEvalRunData, type MCPFixtureApi, type MCPFixtureOptions, type MCPHostCapabilities, type MCPHostConfig, type MCPHostSimulationResult, type MCPHostSimulator, type MCPOAuthConfig, type MCPServerCapabilitiesData, MCP_PROTOCOL_VERSION, type NormalizedToolResponse, type OAuthSetupConfig, type PatternValidatorOptions, PlaywrightOAuthClientProvider, type PlaywrightOAuthClientProviderConfig, type PredicateResult, type ProtectedResourceDiscoveryResult, type ProtectedResourceMetadata, type ProviderKind, type RegexSanitizer, type ResultSource, type RubricSpec, type SaveBaselineOptions, type SchemaRegistry, type SchemaValidatorOptions, type SerializedEvalDataset, type ServerComparisonOptions, type 
ServerComparisonResult, type SizeValidatorOptions, type SnapshotSanitizer, SnapshotSanitizers, type StdioMCPConfig, type StoredClientInfo, type StoredOAuthState, type StoredServerMetadata, type StoredTokens, type TextValidatorOptions, type TokenResult, type ToolCallCountOptions, type ToolCallExpectation, type ToolPredicate, type UsageMetrics, type ValidationResult, clearJudgeRegistry, closeMCPClient, createJudge, createMCPClientForConfig, createMCPFixture, createTokenAuthHeaders, discoverAuthorizationServer, discoverProtectedResource, expect, extractText, getMissingDependencyMessage, getRegisteredJudge, getResponseSizeBytes, hasValidTokens, injectTokens, isBuiltInRubric, isHttpConfig, isProviderAvailable, isStdioConfig, isTokenExpired, isTokenExpiringSoon, loadBaseline, loadEvalDataset, loadEvalDatasetFromObject, loadTokens, loadTokensFromEnv, test as mcpAuthTest, normalizeToolResponse, normalizeWhitespace, performClientCredentialsFlow, performOAuthSetup, performOAuthSetupIfNeeded, registerJudge, resolveRubric, runConformanceChecks, runEvalCase, runEvalDataset, runServerComparison, saveBaseline, simulateMCPHost, test$1 as test, validateAccessToken, validateError, validateEvalCase, validateEvalDataset, validateJudge, validateMCPConfig, validatePattern, validateResponse, validateSchema, validateSize, validateText, validateToolCallCount, validateToolCalls };