npm - @gleanwork/mcp-server-tester - Versions diffs - 0.12.0 → 1.0.0-beta.0 - Mend

@gleanwork/mcp-server-tester 0.12.0 → 1.0.0-beta.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (24)

package/README.md +120 -337
package/dist/cli/index.js +455 -174
package/dist/fixtures/mcp.d.ts +121 -44
package/dist/fixtures/mcp.js +974 -244
package/dist/fixtures/mcp.js.map +1 -1
package/dist/fixtures/mcpAuth.js +6 -2
package/dist/fixtures/mcpAuth.js.map +1 -1
package/dist/index.cjs +4936 -1292
package/dist/index.cjs.map +1 -1
package/dist/index.d.cts +1660 -570
package/dist/index.d.ts +1660 -570
package/dist/index.js +4923 -1288
package/dist/index.js.map +1 -1
package/dist/reporters/mcpReporter.cjs +35 -16
package/dist/reporters/mcpReporter.cjs.map +1 -1
package/dist/reporters/mcpReporter.d.cts +8 -3
package/dist/reporters/mcpReporter.d.ts +8 -3
package/dist/reporters/mcpReporter.js +36 -17
package/dist/reporters/mcpReporter.js.map +1 -1
package/dist/reporters/ui-dist/app.js +5 -5
package/dist/reporters/ui-dist/styles.css +1 -1
package/package.json +63 -8
package/src/reporters/ui-dist/app.js +5 -5
package/src/reporters/ui-dist/styles.css +1 -1

package/dist/index.d.ts CHANGED

@@ -42,6 +42,28 @@ interface MCPOAuthConfig {
      */
     redirectUri?: string;
 }
+/**
+ * OAuth 2.1 client credentials configuration for machine-to-machine (CI/CD) authentication.
+ * Credentials can be provided here or via MCP_CLIENT_ID/MCP_CLIENT_SECRET environment variables.
+ */
+interface MCPClientCredentialsConfig {
+    /**
+     * OAuth client ID (falls back to MCP_CLIENT_ID env var)
+     */
+    clientId?: string;
+    /**
+     * OAuth client secret (falls back to MCP_CLIENT_SECRET env var)
+     */
+    clientSecret?: string;
+    /**
+     * Token endpoint URL (required)
+     */
+    tokenEndpoint?: string;
+    /**
+     * Scopes to request
+     */
+    scopes?: string[];
+}
 /**
  * Authentication configuration for MCP connections
  */
@@ -54,6 +76,10 @@ interface MCPAuthConfig {
      * Full OAuth configuration for browser-based authentication
      */
     oauth?: MCPOAuthConfig;
+    /**
+     * OAuth 2.1 client credentials grant for machine-to-machine authentication
+     */
+    clientCredentials?: MCPClientCredentialsConfig;
 }
 /**
  * MCP host capabilities that can be registered with the server
@@ -74,35 +100,67 @@ interface MCPHostCapabilities {
     };
 }
 /**
- * Configuration for MCP client connection
- *
- * Supports both stdio (local) and HTTP (remote) transports
+ * Configuration for MCP client connection via stdio transport (local process)
  */
-interface MCPConfig {
+interface StdioMCPConfig {
     /**
-     * Transport type
+     * Transport type discriminant
      */
-    transport: 'http' | 'stdio';
+    transport: 'stdio';
     /**
-     * Server URL (required when transport === 'http')
+     * Command to execute (required for stdio transport)
      */
-    serverUrl?: string;
+    command: string;
     /**
-     * HTTP headers (optional for http transport, e.g., Authorization)
+     * Command arguments
      */
-    headers?: Record<string, string>;
+    args?: Array<string>;
     /**
-     * Command to execute (required when transport === 'stdio')
+     * Working directory for the command
      */
-    command?: string;
+    cwd?: string;
     /**
-     * Command arguments (optional for stdio)
+     * Suppress stderr output from the server process.
+     * When true, server stderr is ignored instead of inherited.
      */
-    args?: Array<string>;
+    quiet?: boolean;
     /**
-     * Working directory for the command (optional for stdio)
+     * Host capabilities to register with the server
      */
-    cwd?: string;
+    capabilities?: MCPHostCapabilities;
+    /**
+     * Connection timeout in milliseconds
+     */
+    connectTimeoutMs?: number;
+    /**
+     * Request timeout in milliseconds
+     */
+    requestTimeoutMs?: number;
+    /**
+     * Timeout in milliseconds for MCP tool/list operations. Default: 30000
+     */
+    callTimeoutMs?: number;
+}
+/**
+ * Configuration for MCP client connection via HTTP transport (remote server)
+ */
+interface HttpMCPConfig {
+    /**
+     * Transport type discriminant
+     */
+    transport: 'http';
+    /**
+     * Server URL (required for http transport)
+     */
+    serverUrl: string;
+    /**
+     * HTTP headers (e.g., Authorization)
+     */
+    headers?: Record<string, string>;
+    /**
+     * Authentication configuration
+     */
+    auth?: MCPAuthConfig;
     /**
      * Host capabilities to register with the server
      */
@@ -116,15 +174,57 @@ interface MCPConfig {
      */
     requestTimeoutMs?: number;
     /**
-     * Suppress stderr output from the server process (stdio only)
-     * When true, server stderr is ignored instead of inherited
+     * Timeout in milliseconds for MCP tool/list operations. Default: 30000
      */
-    quiet?: boolean;
+    callTimeoutMs?: number;
     /**
-     * Authentication configuration (optional for http transport)
+     * HTTP proxy configuration. Falls back to HTTPS_PROXY/HTTP_PROXY environment variables.
      */
-    auth?: MCPAuthConfig;
+    proxy?: {
+        /**
+         * Proxy URL. Credentials can be embedded directly if required:
+         * `http://user:pass@proxy.example.com:8080`
+         */
+        url: string;
+    };
+    /**
+     * Number of retry attempts for transient connection failures and 429 rate limit responses.
+     * Uses exponential backoff with Retry-After header awareness. Defaults to 0 (no retries).
+     */
+    retryAttempts?: number;
+    /**
+     * TLS/mTLS configuration for custom certificates or disabling cert validation.
+     * File paths should point to PEM-encoded certificate files.
+     */
+    tls?: {
+        /**
+         * Path to CA certificate PEM file (for custom/self-signed CAs)
+         */
+        ca?: string;
+        /**
+         * Path to client certificate PEM file (for mutual TLS)
+         */
+        cert?: string;
+        /**
+         * Path to client private key PEM file (for mutual TLS)
+         */
+        key?: string;
+        /**
+         * Whether to reject unauthorized certificates. Defaults to true.
+         * Set to false to disable certificate validation (not recommended for production).
+         */
+        rejectUnauthorized?: boolean;
+    };
 }
+/**
+ * Configuration for MCP client connection.
+ *
+ * This is a discriminated union — narrow with `isStdioConfig()` or `isHttpConfig()`
+ * before accessing transport-specific fields.
+ *
+ * Supports both stdio (local) and HTTP (remote) transports.
+ */
+type MCPConfig = StdioMCPConfig | HttpMCPConfig;
 /**
  * Union schema for MCPConfig (validates based on transport type)
  */
@@ -155,6 +255,7 @@ declare const MCPConfigSchema: z.ZodDiscriminatedUnion<"transport", [z.ZodObject
     }>>;
     connectTimeoutMs: z.ZodOptional<z.ZodNumber>;
     requestTimeoutMs: z.ZodOptional<z.ZodNumber>;
+    callTimeoutMs: z.ZodOptional<z.ZodNumber>;
     quiet: z.ZodOptional<z.ZodBoolean>;
 }, "strip", z.ZodTypeAny, {
     transport: "stdio";
@@ -169,6 +270,7 @@ declare const MCPConfigSchema: z.ZodDiscriminatedUnion<"transport", [z.ZodObject
     } | undefined;
     connectTimeoutMs?: number | undefined;
     requestTimeoutMs?: number | undefined;
+    callTimeoutMs?: number | undefined;
     quiet?: boolean | undefined;
 }, {
     transport: "stdio";
@@ -183,10 +285,11 @@ declare const MCPConfigSchema: z.ZodDiscriminatedUnion<"transport", [z.ZodObject
     } | undefined;
     connectTimeoutMs?: number | undefined;
     requestTimeoutMs?: number | undefined;
+    callTimeoutMs?: number | undefined;
     quiet?: boolean | undefined;
 }>, z.ZodObject<{
     transport: z.ZodLiteral<"http">;
-    serverUrl: z.ZodString;
+    serverUrl: z.ZodEffects<z.ZodString, string, string>;
     headers: z.ZodOptional<z.ZodRecord<z.ZodString, z.ZodString>>;
     capabilities: z.ZodOptional<z.ZodObject<{
         sampling: z.ZodOptional<z.ZodRecord<z.ZodString, z.ZodUnknown>>;
@@ -210,6 +313,7 @@ declare const MCPConfigSchema: z.ZodDiscriminatedUnion<"transport", [z.ZodObject
     }>>;
     connectTimeoutMs: z.ZodOptional<z.ZodNumber>;
     requestTimeoutMs: z.ZodOptional<z.ZodNumber>;
+    callTimeoutMs: z.ZodOptional<z.ZodNumber>;
     auth: z.ZodOptional<z.ZodEffects<z.ZodObject<{
         accessToken: z.ZodOptional<z.ZodString>;
         oauth: z.ZodOptional<z.ZodObject<{
@@ -237,6 +341,22 @@ declare const MCPConfigSchema: z.ZodDiscriminatedUnion<"transport", [z.ZodObject
             clientSecret?: string | undefined;
             redirectUri?: string | undefined;
         }>>;
+        clientCredentials: z.ZodOptional<z.ZodObject<{
+            clientId: z.ZodOptional<z.ZodString>;
+            clientSecret: z.ZodOptional<z.ZodString>;
+            tokenEndpoint: z.ZodOptional<z.ZodString>;
+            scopes: z.ZodOptional<z.ZodArray<z.ZodString, "many">>;
+        }, "strip", z.ZodTypeAny, {
+            scopes?: string[] | undefined;
+            clientId?: string | undefined;
+            clientSecret?: string | undefined;
+            tokenEndpoint?: string | undefined;
+        }, {
+            scopes?: string[] | undefined;
+            clientId?: string | undefined;
+            clientSecret?: string | undefined;
+            tokenEndpoint?: string | undefined;
+        }>>;
     }, "strip", z.ZodTypeAny, {
         accessToken?: string | undefined;
         oauth?: {
@@ -248,6 +368,12 @@ declare const MCPConfigSchema: z.ZodDiscriminatedUnion<"transport", [z.ZodObject
             clientSecret?: string | undefined;
             redirectUri?: string | undefined;
         } | undefined;
+        clientCredentials?: {
+            scopes?: string[] | undefined;
+            clientId?: string | undefined;
+            clientSecret?: string | undefined;
+            tokenEndpoint?: string | undefined;
+        } | undefined;
     }, {
         accessToken?: string | undefined;
         oauth?: {
@@ -259,6 +385,12 @@ declare const MCPConfigSchema: z.ZodDiscriminatedUnion<"transport", [z.ZodObject
             clientSecret?: string | undefined;
             redirectUri?: string | undefined;
         } | undefined;
+        clientCredentials?: {
+            scopes?: string[] | undefined;
+            clientId?: string | undefined;
+            clientSecret?: string | undefined;
+            tokenEndpoint?: string | undefined;
+        } | undefined;
     }>, {
         accessToken?: string | undefined;
         oauth?: {
@@ -270,6 +402,12 @@ declare const MCPConfigSchema: z.ZodDiscriminatedUnion<"transport", [z.ZodObject
             clientSecret?: string | undefined;
             redirectUri?: string | undefined;
         } | undefined;
+        clientCredentials?: {
+            scopes?: string[] | undefined;
+            clientId?: string | undefined;
+            clientSecret?: string | undefined;
+            tokenEndpoint?: string | undefined;
+        } | undefined;
     }, {
         accessToken?: string | undefined;
         oauth?: {
@@ -281,6 +419,36 @@ declare const MCPConfigSchema: z.ZodDiscriminatedUnion<"transport", [z.ZodObject
             clientSecret?: string | undefined;
             redirectUri?: string | undefined;
         } | undefined;
+        clientCredentials?: {
+            scopes?: string[] | undefined;
+            clientId?: string | undefined;
+            clientSecret?: string | undefined;
+            tokenEndpoint?: string | undefined;
+        } | undefined;
+    }>>;
+    proxy: z.ZodOptional<z.ZodObject<{
+        url: z.ZodString;
+    }, "strip", z.ZodTypeAny, {
+        url: string;
+    }, {
+        url: string;
+    }>>;
+    retryAttempts: z.ZodOptional<z.ZodNumber>;
+    tls: z.ZodOptional<z.ZodObject<{
+        ca: z.ZodOptional<z.ZodString>;
+        cert: z.ZodOptional<z.ZodString>;
+        key: z.ZodOptional<z.ZodString>;
+        rejectUnauthorized: z.ZodOptional<z.ZodBoolean>;
+    }, "strip", z.ZodTypeAny, {
+        ca?: string | undefined;
+        cert?: string | undefined;
+        key?: string | undefined;
+        rejectUnauthorized?: boolean | undefined;
+    }, {
+        ca?: string | undefined;
+        cert?: string | undefined;
+        key?: string | undefined;
+        rejectUnauthorized?: boolean | undefined;
     }>>;
 }, "strip", z.ZodTypeAny, {
     serverUrl: string;
@@ -293,6 +461,7 @@ declare const MCPConfigSchema: z.ZodDiscriminatedUnion<"transport", [z.ZodObject
     } | undefined;
     connectTimeoutMs?: number | undefined;
     requestTimeoutMs?: number | undefined;
+    callTimeoutMs?: number | undefined;
     headers?: Record<string, string> | undefined;
     auth?: {
         accessToken?: string | undefined;
@@ -305,6 +474,22 @@ declare const MCPConfigSchema: z.ZodDiscriminatedUnion<"transport", [z.ZodObject
             clientSecret?: string | undefined;
             redirectUri?: string | undefined;
         } | undefined;
+        clientCredentials?: {
+            scopes?: string[] | undefined;
+            clientId?: string | undefined;
+            clientSecret?: string | undefined;
+            tokenEndpoint?: string | undefined;
+        } | undefined;
+    } | undefined;
+    proxy?: {
+        url: string;
+    } | undefined;
+    retryAttempts?: number | undefined;
+    tls?: {
+        ca?: string | undefined;
+        cert?: string | undefined;
+        key?: string | undefined;
+        rejectUnauthorized?: boolean | undefined;
     } | undefined;
 }, {
     serverUrl: string;
@@ -317,6 +502,7 @@ declare const MCPConfigSchema: z.ZodDiscriminatedUnion<"transport", [z.ZodObject
     } | undefined;
     connectTimeoutMs?: number | undefined;
     requestTimeoutMs?: number | undefined;
+    callTimeoutMs?: number | undefined;
     headers?: Record<string, string> | undefined;
     auth?: {
         accessToken?: string | undefined;
@@ -329,6 +515,22 @@ declare const MCPConfigSchema: z.ZodDiscriminatedUnion<"transport", [z.ZodObject
             clientSecret?: string | undefined;
             redirectUri?: string | undefined;
         } | undefined;
+        clientCredentials?: {
+            scopes?: string[] | undefined;
+            clientId?: string | undefined;
+            clientSecret?: string | undefined;
+            tokenEndpoint?: string | undefined;
+        } | undefined;
+    } | undefined;
+    proxy?: {
+        url: string;
+    } | undefined;
+    retryAttempts?: number | undefined;
+    tls?: {
+        ca?: string | undefined;
+        cert?: string | undefined;
+        key?: string | undefined;
+        rejectUnauthorized?: boolean | undefined;
     } | undefined;
 }>]>;
 /**
@@ -342,17 +544,11 @@ declare function validateMCPConfig(config: unknown): MCPConfig;
 /**
  * Type guard to check if a config is for stdio transport
  */
-declare function isStdioConfig(config: MCPConfig): config is MCPConfig & {
-    transport: 'stdio';
-    command: string;
-};
+declare function isStdioConfig(config: MCPConfig): config is StdioMCPConfig;
 /**
  * Type guard to check if a config is for HTTP transport
  */
-declare function isHttpConfig(config: MCPConfig): config is MCPConfig & {
-    transport: 'http';
-    serverUrl: string;
-};
+declare function isHttpConfig(config: MCPConfig): config is HttpMCPConfig;
 /**
  * Auth types for MCP OAuth integration
@@ -601,6 +797,9 @@ declare class PlaywrightOAuthClientProvider implements OAuthClientProvider {
     tokens(): Promise<OAuthTokens | undefined>;
     /**
      * Stores new OAuth tokens for the current session
+     *
+     * The code verifier is cleared after a successful token exchange — it is
+     * single-use per PKCE spec and must not persist beyond the exchange.
      */
     saveTokens(tokens: OAuthTokens): Promise<void>;
     /**
@@ -757,6 +956,38 @@ interface AuthServerMetadata {
      */
     issuer: string;
 }
+/**
+ * Configuration for client credentials grant
+ */
+interface ClientCredentialsConfig {
+    /**
+     * Token endpoint URL
+     */
+    tokenEndpoint: string;
+    /**
+     * OAuth client ID
+     */
+    clientId: string;
+    /**
+     * OAuth client secret
+     */
+    clientSecret: string;
+    /**
+     * Scopes to request (optional)
+     */
+    scopes?: string[];
+}
+/**
+ * Performs the OAuth 2.1 client credentials grant to obtain an access token.
+ * Suitable for CI/CD machine-to-machine authentication.
+ *
+ * Uses oauth4webapi for spec-compliant request construction and response validation,
+ * consistent with how the rest of this module handles OAuth flows.
+ *
+ * @param config - Client credentials configuration
+ * @returns Token result
+ */
+declare function performClientCredentialsFlow(config: ClientCredentialsConfig): Promise<TokenResult>;
 /**
  * OAuth Protected Resource and Authorization Server discovery
@@ -915,8 +1146,9 @@ declare function injectTokens(serverUrl: string, tokens: StoredTokens, stateDir?
  * ```typescript
  * // After running: npx mcp-server-tester login https://api.example.com/mcp
  * const tokens = await loadTokens('https://api.example.com/mcp');
- * if (tokens) {
- *   console.log('Access token:', tokens.accessToken);
+ * if (tokens?.accessToken) {
+ *   // Use the token — never log raw token values
+ *   headers.Authorization = `Bearer ${tokens.accessToken}`;
  * }
  * ```
  */
@@ -1127,6 +1359,14 @@ interface CreateMCPClientOptions {
      * This takes precedence over static token auth in config.auth.accessToken.
      */
     authProvider?: OAuthClientProvider;
+    /**
+     * Sampling handler callback for LLM sampling requests from the server.
+     *
+     * When provided, the client will advertise sampling capability to the server.
+     * When absent, sampling is removed from declared capabilities so the client
+     * does not falsely advertise support it cannot fulfill.
+     */
+    samplingHandler?: unknown;
 }
 /**
  * Creates and connects an MCP client based on the provided configuration
@@ -1251,6 +1491,14 @@ interface ValidationResult {
     message: string;
     /** Additional structured details about the validation */
     details?: Record<string, unknown>;
+    /**
+     * Optional quantitative metrics from the validation.
+     * Populated by validateToolCalls for precision/recall.
+     */
+    metrics?: {
+        precision?: number;
+        recall?: number;
+    };
 }
 /**
  * Options for text validation
@@ -1282,10 +1530,33 @@ interface PatternValidatorOptions {
     /** Whether to perform case-sensitive matching (default: true) */
     caseSensitive?: boolean;
 }
+/**
+ * Built-in snapshot sanitizer names for use with toMatchToolSnapshot.
+ * Pass these values in the sanitizers array to replace non-deterministic
+ * values with stable placeholders before snapshot comparison.
+ *
+ * @example
+ * expect(result).toMatchToolSnapshot('my-snapshot', [
+ *   SnapshotSanitizers.UUID,
+ *   SnapshotSanitizers.ISO_DATE,
+ * ]);
+ */
+declare const SnapshotSanitizers: {
+    /** Replaces Unix timestamps (seconds and milliseconds) with a stable placeholder */
+    readonly TIMESTAMP: "timestamp";
+    /** Replaces UUID v1-v5 strings with a stable placeholder */
+    readonly UUID: "uuid";
+    /** Replaces ISO 8601 date/datetime strings with a stable placeholder */
+    readonly ISO_DATE: "iso-date";
+    /** Replaces MongoDB ObjectId strings with a stable placeholder */
+    readonly OBJECT_ID: "objectId";
+    /** Replaces JWT tokens with a stable placeholder */
+    readonly JWT: "jwt";
+};
 /**
  * Built-in sanitizer names for common variable patterns
  */
-type BuiltInSanitizer = 'timestamp' | 'uuid' | 'iso-date' | 'objectId' | 'jwt';
+type BuiltInSanitizer = (typeof SnapshotSanitizers)[keyof typeof SnapshotSanitizers];
 /**
  * Custom regex-based sanitizer
  */
@@ -1511,38 +1782,63 @@ declare function validateError(response: unknown, expected?: boolean | string |
 declare function validateSize(response: unknown, options: SizeValidatorOptions): ValidationResult;
 /**
- * Validator Utilities
+ * Tool call validators for llm_host simulation results.
  *
- * Shared utility functions for validation operations.
- * Re-exports core utilities from mcp/response.ts and adds validation-specific helpers.
+ * These validators extract the tool call trace from an LLMHostSimulationResult
+ * and apply assertions against expected call lists and counts.
  */
+interface ToolCallExpectation {
+    calls: Array<{
+        name: string;
+        arguments?: Record<string, unknown>;
+        required?: boolean;
+    }>;
+    order?: 'strict' | 'any';
+    exclusive?: boolean;
+}
+interface ToolCallCountOptions {
+    min?: number;
+    max?: number;
+    exact?: number;
+}
 /**
- * Gets the size of a response in bytes
- *
- * Serializes the response to JSON (with pretty printing for consistency)
- * and returns the byte length using UTF-8 encoding.
+ * Validates tool calls made during an LLM host simulation.
  *
- * @param response - Response in any format
- * @returns Size in bytes
+ * @param response - Must be an LLMHostSimulationResult (from llm_host mode)
+ * @param expectation - Expected tool call specification
  */
-declare function getResponseSizeBytes(response: unknown): number;
+declare function validateToolCalls(response: unknown, expectation: ToolCallExpectation): ValidationResult;
 /**
- * Normalizes whitespace in text for consistent comparison
- *
- * Collapses multiple whitespace characters (spaces, tabs, newlines) into single spaces
- * and trims leading/trailing whitespace.
+ * Validates the number of tool calls made during an LLM host simulation.
  *
- * @param text - Text to normalize
- * @returns Normalized text with collapsed whitespace
+ * @param response - Must be an LLMHostSimulationResult (from llm_host mode)
+ * @param options - Count constraints (min, max, exact)
+ */
+declare function validateToolCallCount(response: unknown, options: ToolCallCountOptions): ValidationResult;
+/**
+ * Built-in judge rubrics matching Glean EvalV2's named judge types.
+ * Use these for consistent, standardized evaluations across teams.
  *
- * @example
- * ```typescript
- * normalizeWhitespace('  hello\n\n  world  ');
- * // Returns: "hello world"
- * ```
+ * All built-in rubrics use a 5-point scale: 0.0 / 0.25 / 0.5 / 0.75 / 1.0
  */
-declare function normalizeWhitespace(text: string): string;
+type BuiltInRubric = 'correctness' | 'completeness' | 'groundedness' | 'instruction-following' | 'conciseness';
+declare const BUILT_IN_RUBRICS: Record<BuiltInRubric, string>;
+/** A rubric specification: either a built-in named rubric or custom text. */
+type RubricSpec = BuiltInRubric | {
+    text: string;
+};
+/**
+ * Returns true if `s` is a built-in rubric name.
+ */
+declare function isBuiltInRubric(s: unknown): s is BuiltInRubric;
+/**
+ * Resolves a RubricSpec to its full rubric text.
+ * - Built-in name → returns the expanded rubric text from BUILT_IN_RUBRICS
+ * - Custom object → returns rubric.text as-is
+ */
+declare function resolveRubric(rubric: RubricSpec): string;
 /**
  * Usage metrics from Claude Agent SDK response
@@ -1577,10 +1873,8 @@ interface UsageMetrics {
      */
     cacheCreationInputTokens?: number;
 }
-/**
- * Supported LLM provider types
- */
-type ProviderKind = 'claude' | 'anthropic' | 'openai' | 'custom-http';
+/** Valid LLM judge provider kinds. */
+type ProviderKind = 'claude' | 'anthropic' | 'openai' | 'google';
 /**
  * Configuration for an LLM judge
  */
@@ -1649,7 +1943,24 @@ interface JudgeResult {
      * Whether the candidate exceeded maxToolOutputSize
      */
     exceedsMaxToolOutputSize?: boolean;
+    /**
+     * Standard deviation of individual rep scores.
+     * Only populated when the judge was run with reps > 1.
+     */
+    scoreStdDev?: number;
+    /**
+     * True when the standard deviation across reps exceeds 0.2, indicating
+     * that the rubric may be ambiguous or the judge is non-deterministic.
+     * Only populated when the judge was run with reps > 1.
+     */
+    highVariance?: boolean;
+    /**
+     * Individual scores from each judge rep.
+     * Only populated when the judge was run with reps > 1.
+     */
+    scores?: number[];
 }
 /**
  * LLM judge client interface
  */
@@ -1665,6 +1976,75 @@ interface Judge {
     evaluate(candidate: unknown, reference: unknown, rubric: string): Promise<JudgeResult>;
 }
+/**
+ * Judge Validator
+ *
+ * Validates a response using an LLM-as-a-judge evaluation.
+ */
+/**
+ * Configuration for the judge validator
+ */
+interface JudgeValidatorConfig {
+    /** The evaluation rubric: a built-in name or custom { text: string } */
+    rubric: RubricSpec;
+    /** Optional reference response to compare against */
+    reference?: unknown;
+    /** Minimum score required to pass (0-1, default: 0.7) */
+    threshold?: number;
+    /** Number of judge evaluations to run. Scores averaged. @default 1 */
+    reps?: number;
+    /** Judge provider. @default 'claude' */
+    provider?: ProviderKind;
+    /** Model override (e.g., 'claude-opus-4-20250514') */
+    model?: string;
+    /** Environment variable name for API key */
+    apiKeyEnvVar?: string;
+    /** Max tokens for judge response */
+    maxTokens?: number;
+    /** Temperature for judge LLM (0–1) */
+    temperature?: number;
+    /** Max budget in USD per evaluation */
+    maxBudgetUsd?: number;
+    /** Fail if response exceeds this size in bytes before judging */
+    maxToolOutputSize?: number;
+}
+declare function validateJudge(response: unknown, config: JudgeValidatorConfig): Promise<ValidationResult>;
+/**
+ * Validator Utilities
+ *
+ * Shared utility functions for validation operations.
+ * Re-exports core utilities from mcp/response.ts and adds validation-specific helpers.
+ */
+/**
+ * Gets the size of a response in bytes
+ *
+ * Serializes the response to JSON (with pretty printing for consistency)
+ * and returns the byte length using UTF-8 encoding.
+ *
+ * @param response - Response in any format
+ * @returns Size in bytes
+ */
+declare function getResponseSizeBytes(response: unknown): number;
+/**
+ * Normalizes whitespace in text for consistent comparison
+ *
+ * Collapses multiple whitespace characters (spaces, tabs, newlines) into single spaces
+ * and trims leading/trailing whitespace.
+ *
+ * @param text - Text to normalize
+ * @returns Normalized text with collapsed whitespace
+ *
+ * @example
+ * ```typescript
+ * normalizeWhitespace('  hello\n\n  world  ');
+ * // Returns: "hello world"
+ * ```
+ */
+declare function normalizeWhitespace(text: string): string;
 /**
  * Matcher Types
  *
@@ -1679,8 +2059,12 @@ interface JudgeMatcherOptions {
     reference?: unknown;
     /** Score threshold for passing (default: 0.7) */
     passingThreshold?: number;
-    /** Judge configuration override */
-    judgeConfig?: JudgeConfig;
+    /** Number of judge evaluations (scores averaged) */
+    reps?: number;
+    /** Override the judge provider */
+    provider?: ProviderKind;
+    /** Override the judge model */
+    model?: string;
 }
 /**
  * Declaration merging for Playwright matchers
@@ -1785,7 +2169,7 @@ declare global {
              * });
              * ```
              */
-            toPassToolJudge(rubric: string, options?: JudgeMatcherOptions): Promise<R>;
+            toPassToolJudge(rubric: RubricSpec, options?: JudgeMatcherOptions): Promise<R>;
             /**
              * Validates that a response meets size constraints
              *
@@ -1830,11 +2214,33 @@ declare global {
              * ```
              */
             toSatisfyToolPredicate(predicate: ToolPredicate, description?: string): Promise<R>;
-        }
-    }
-}
-/**
- * Predicate result returned by the user's predicate function
+            /**
+             * Validates which tools the LLM called during an llm_host simulation.
+             *
+             * @example
+             * ```typescript
+             * expect(simulationResult).toHaveToolCalls({
+             *   calls: [{ name: 'search', arguments: { query: 'hello' }, required: true }],
+             *   order: 'any',
+             * });
+             * ```
+             */
+            toHaveToolCalls(expectation: ToolCallExpectation): R;
+            /**
+             * Validates the number of tool calls made during an llm_host simulation.
+             *
+             * @example
+             * ```typescript
+             * expect(simulationResult).toHaveToolCallCount({ min: 1, max: 3 });
+             * expect(simulationResult).toHaveToolCallCount({ exact: 2 });
+             * ```
+             */
+            toHaveToolCallCount(options: ToolCallCountOptions): R;
+        }
+    }
+}
+/**
+ * Predicate result returned by the user's predicate function
  */
 interface PredicateResult {
     /** Whether the predicate passed */
@@ -1873,7 +2279,7 @@ type ResultSource = 'eval' | 'test';
 /**
  * Known expectation types supported by the framework
  */
-type ExpectationType = 'exact' | 'schema' | 'textContains' | 'regex' | 'snapshot' | 'judge' | 'error' | 'size';
+type ExpectationType = 'exact' | 'schema' | 'textContains' | 'regex' | 'snapshot' | 'judge' | 'error' | 'size' | 'toolsTriggered' | 'toolCallCount';
 /**
  * Result of an expectation check
  */
@@ -1912,6 +2318,10 @@ interface MCPFixtureOptions {
      * Used for filtering and grouping in the reporter
      */
     project?: string;
+    /**
+     * Timeout in milliseconds for MCP tool/list operations. Default: 30000
+     */
+    callTimeoutMs?: number;
 }
 /**
  * High-level API for interacting with MCP servers in tests
@@ -1954,29 +2364,43 @@ interface MCPFixtureApi {
     } | null;
 }
 /**
- * Creates an MCP fixture wrapper around a Client
+ * Creates an MCP fixture wrapper around a Client, providing a high-level
+ * {@link MCPFixtureApi} without requiring Playwright's `test.extend` pattern.
  *
- * When testInfo is provided, automatically tracks all MCP operations with test.step()
- * and creates attachments for the MCP Test Reporter.
+ * Use this when you need to set up an MCP fixture manually — for example in
+ * custom fixture hierarchies, non-Playwright test runners (e.g. Vitest,
+ * Jest), or when you want to compose the fixture with other lifecycle
+ * management logic that doesn't fit the standard `test.extend` model.
  *
- * @param client - The MCP client to wrap
- * @param testInfo - Optional Playwright TestInfo for auto-tracking
+ * For the typical Playwright use case, prefer importing `test` and `mcp`
+ * directly from `@gleanwork/mcp-server-tester/fixtures/mcp`, which wires
+ * this function up automatically.
+ *
+ * When `testInfo` is provided, all MCP operations are automatically wrapped
+ * in `test.step()` calls and attachments are created for the MCP Test
+ * Reporter. Omit `testInfo` for lightweight usage outside Playwright.
+ *
+ * @param client - The MCP client to wrap (created via `createMCPClientForConfig`)
+ * @param testInfo - Optional Playwright TestInfo for auto-tracking and reporter attachments
+ * @param options - Optional fixture options (authType, project)
  * @returns MCPFixtureApi instance
  *
  * @example
  * ```typescript
- * // With tracking (recommended)
+ * // Advanced: custom fixture setup inside test.extend
  * const test = base.extend<{ mcp: MCPFixtureApi }>({
  *   mcp: async ({}, use, testInfo) => {
  *     const client = await createMCPClientForConfig(config);
- *     const api = createMCPFixture(client, testInfo);
+ *     const api = createMCPFixture(client, testInfo, { authType: 'api-token' });
  *     await use(api);
  *     await closeMCPClient(client);
  *   }
  * });
  *
- * // Without tracking
+ * // Non-Playwright usage (no reporter attachments)
+ * const client = await createMCPClientForConfig(config);
  * const api = createMCPFixture(client);
+ * const tools = await api.listTools();
  * ```
  */
 declare function createMCPFixture(client: Client, testInfo?: TestInfo, options?: MCPFixtureOptions): MCPFixtureApi;
@@ -2082,6 +2506,8 @@ declare function toBeToolError(this: {
  * toPassToolJudge Matcher
  *
  * Validates that a response passes LLM-as-judge evaluation.
+ * Delegates evaluation logic to validateJudge() for consistency
+ * with the validator/matcher duality pattern.
  */
 /**
@@ -2091,7 +2517,7 @@ declare function toBeToolError(this: {
  */
 declare function toPassToolJudge(this: {
     isNot: boolean;
-}, received: unknown, rubric: string, options?: JudgeMatcherOptions): Promise<{
+}, received: unknown, rubric: RubricSpec, options?: JudgeMatcherOptions): Promise<{
     pass: boolean;
     message: () => string;
 }>;
@@ -2158,6 +2584,38 @@ declare function toSatisfyToolPredicate(this: {
     message: () => string;
 }>;
+/**
+ * toHaveToolCalls Matcher
+ *
+ * Validates which tools the LLM called during an llm_host simulation.
+ */
+/**
+ * Creates the toHaveToolCalls matcher function
+ *
+ * `this.isNot` is presumably the matcher-context negation flag set when the
+ * matcher is used via `.not` — confirm against the implementation.
+ *
+ * @param received - Value under test; typed `unknown`, so the declaration alone
+ *   does not say what shape is accepted (presumably an llm_host simulation
+ *   result — confirm against the implementation)
+ * @param expectation - Tool-call expectation to check `received` against
+ * @returns Matcher result, returned synchronously (not a Promise): `pass`
+ *   plus a `message` callback that produces the report text on demand
+ */
+declare function toHaveToolCalls(this: {
+    isNot: boolean;
+}, received: unknown, expectation: ToolCallExpectation): {
+    pass: boolean;
+    message: () => string;
+};
+/**
+ * toHaveToolCallCount Matcher
+ *
+ * Validates the number of tool calls made during an llm_host simulation.
+ */
+/**
+ * Creates the toHaveToolCallCount matcher function
+ *
+ * `this.isNot` is presumably the matcher-context negation flag set when the
+ * matcher is used via `.not` — confirm against the implementation.
+ *
+ * @param received - Value under test; typed `unknown`, so the declaration alone
+ *   does not say what shape is accepted (presumably an llm_host simulation
+ *   result — confirm against the implementation)
+ * @param options - Count constraints to check the number of tool calls against
+ *   (required, unlike the optional `options` of some other matchers)
+ * @returns Matcher result, returned synchronously (not a Promise): `pass`
+ *   plus a `message` callback that produces the report text on demand
+ */
+declare function toHaveToolCallCount(this: {
+    isNot: boolean;
+}, received: unknown, options: ToolCallCountOptions): {
+    pass: boolean;
+    message: () => string;
+};
 /**
  * Extended Playwright expect with MCP tool matchers
  *
@@ -2184,6 +2642,8 @@ declare const expect: playwright_test.Expect<{
     toPassToolJudge: typeof toPassToolJudge;
     toHaveToolResponseSize: typeof toHaveToolResponseSize;
     toSatisfyToolPredicate: typeof toSatisfyToolPredicate;
+    toHaveToolCalls: typeof toHaveToolCalls;
+    toHaveToolCallCount: typeof toHaveToolCallCount;
 }>;
 /**
@@ -2233,9 +2693,30 @@ declare const test: playwright_test.TestType<playwright_test.PlaywrightTestArgs
  */
 /**
- * LLM provider for host simulation
+ * LLM provider for host simulation.
+ *
+ * All providers run through the Vercel AI SDK (`ai` package).
+ * Each provider requires its corresponding @ai-sdk/* package:
+ *
+ *   openai      → npm install ai @ai-sdk/openai
+ *   anthropic   → npm install ai @ai-sdk/anthropic
+ *   google      → npm install ai @ai-sdk/google
+ *   azure       → npm install ai @ai-sdk/azure
+ *   mistral     → npm install ai @ai-sdk/mistral
+ *   ollama      → npm install ai @ai-sdk/ollama  (local, no API key)
+ *   deepseek    → npm install ai @ai-sdk/deepseek
+ *   openrouter  → npm install ai @openrouter/ai-sdk-provider
+ *   xai         → npm install ai @ai-sdk/xai
+ *   vertex-anthropic → npm install ai @ai-sdk/google-vertex  (Claude via Google Vertex AI)
+ *
+ * The union continues past the first line of the alias: the final member,
+ * 'vertex-anthropic', carries its own doc comment placed inside the union.
+ */
+type LLMProvider = 'openai' | 'anthropic' | 'azure' | 'google' | 'mistral' | 'ollama' | 'deepseek' | 'openrouter' | 'xai'
+/**
+ * Anthropic Claude via Google Vertex AI.
+ * Requires @ai-sdk/google-vertex and Application Default Credentials (gcloud auth).
+ * Set GOOGLE_VERTEX_PROJECT and GOOGLE_VERTEX_LOCATION env vars.
+ * Use this instead of 'anthropic' in environments where api.anthropic.com is blocked.
+ * @example model: 'claude-3-5-haiku@20241022'
  */
-type LLMProvider = 'openai' | 'anthropic';
+ | 'vertex-anthropic';
 /**
  * Configuration for LLM host simulation
  */
@@ -2246,12 +2727,10 @@ interface LLMHostConfig {
     provider: LLMProvider;
     /**
      * Environment variable name containing the API key
-     * @default 'OPENAI_API_KEY' for openai, 'ANTHROPIC_API_KEY' for anthropic
      */
     apiKeyEnvVar?: string;
     /**
-     * Model to use
-     * @default 'gpt-4' for openai, 'claude-3-5-sonnet-20241022' for anthropic
+     * Model to use (provider-specific default if omitted)
      */
     model?: string;
     /**
@@ -2260,11 +2739,11 @@ interface LLMHostConfig {
     maxTokens?: number;
     /**
      * Temperature (0-1, lower is more deterministic)
-     * @default 0.0
+     * @default 0
      */
     temperature?: number;
     /**
-     * Maximum number of tool calls to allow in a single conversation
+     * Maximum number of tool call steps to allow in a single conversation
      * @default 10
      */
     maxToolCalls?: number;
@@ -2273,72 +2752,49 @@ interface LLMHostConfig {
  * A tool call made by the LLM
  */
 interface LLMToolCall {
-    /**
-     * Tool name
-     */
+    /** Tool name */
     name: string;
-    /**
-     * Tool arguments (as provided by LLM)
-     */
+    /** Tool arguments (as provided by LLM) */
     arguments: Record<string, unknown>;
-    /**
-     * Optional tool call ID (for tracking)
-     */
+    /** Optional tool call ID (for tracking) */
     id?: string;
 }
-/**
- * Result of a tool call validation
- */
-interface ToolCallValidationResult {
-    /**
-     * Whether the tool call was valid
-     */
-    valid: boolean;
-    /**
-     * List of actual tool calls made
-     */
-    actualCalls: Array<LLMToolCall>;
-    /**
-     * Expected tool calls (if specified in eval case)
-     */
-    expectedCalls?: Array<LLMToolCall>;
-    /**
-     * Details about validation (e.g., missing calls, incorrect arguments)
-     */
-    details?: string;
-}
 /**
  * Result from an LLM host simulation
+ *
+ * Only `success` and `toolCalls` are always present; every other field is
+ * optional and may be absent from a given result.
  */
 interface LLMHostSimulationResult {
-    /**
-     * Whether the simulation succeeded
-     */
+    /** Whether the simulation succeeded */
     success: boolean;
-    /**
-     * Tool calls made by the LLM
-     */
+    /** Tool calls made by the LLM */
     toolCalls: Array<LLMToolCall>;
-    /**
-     * Final response from the LLM
-     */
+    /** Final response from the LLM */
     response?: string;
-    /**
-     * Error message if simulation failed
-     */
+    /** Error message if simulation failed */
     error?: string;
-    /**
-     * Full conversation history (for debugging)
-     */
+    /** The scenario prompt that was given to the LLM */
+    scenario?: string;
+    /** The conversation turns for attribution analysis */
     conversationHistory?: Array<{
+        /** Which participant produced this turn */
         role: 'user' | 'assistant' | 'tool';
+        /** Content of the turn, as a string */
         content: string;
     }>;
+    /**
+     * Milliseconds spent waiting for LLM responses
+     * (excludes MCP tool execution time)
+     */
+    llmDurationMs?: number;
+    /**
+     * Milliseconds spent executing MCP tool calls
+     * (excludes LLM response time)
+     */
+    mcpDurationMs?: number;
 }
 /**
- * Interface for LLM host simulators
+ * Interface for LLM host simulators.
  *
- * Implementations communicate with MCP servers via the actual MCP protocol
+ * The only built-in implementation is the Vercel AI SDK orchestrator
+ * (src/evals/llmHost/adapters/vercel.ts). Custom implementations can be
+ * created for specialised testing needs.
  */
 interface LLMHostSimulator {
     /**
@@ -2351,24 +2807,6 @@ interface LLMHostSimulator {
      */
     simulate(mcp: MCPFixtureApi, scenario: string, config: LLMHostConfig): Promise<LLMHostSimulationResult>;
 }
-/**
- * Expected tool call specification (for validation)
- */
-interface ExpectedToolCall {
-    /**
-     * Tool name
-     */
-    name: string;
-    /**
-     * Expected arguments (partial match)
-     */
-    arguments?: Record<string, unknown>;
-    /**
-     * Whether this call is required
-     * @default true
-     */
-    required?: boolean;
-}
 /**
  * Evaluation mode
@@ -2423,6 +2861,41 @@ interface EvalCase {
      * For 'llm_host' mode, can include 'expectedToolCalls' for validation
      */
     metadata?: Record<string, unknown>;
+    /**
+     * Number of times to run this case and compute an accuracy score.
+     * When > 1, `EvalCaseResult.accuracy` is populated and `pass` is determined
+     * by `accuracyThreshold` rather than a single run.
+     * @default 1
+     */
+    iterations?: number;
+    /**
+     * Minimum accuracy (0–1) required to pass when `iterations > 1`.
+     * @default 1.0 (all iterations must pass)
+     */
+    accuracyThreshold?: number;
+    /**
+     * Number of times to invoke the LLM judge per `passesJudge` assertion.
+     * Scores are averaged; the mean must meet the threshold to pass.
+     * Reduces judge variance caused by non-determinism.
+     * Per-assertion `passesJudge.reps` overrides this value.
+     * @default 1
+     */
+    judgeReps?: number;
+    /**
+     * Golden/expected answer for this case.
+     * When set, automatically passed as `reference` to the LLM judge
+     * (unless passesJudge.reference is explicitly provided).
+     * Mirrors EvalV2's `canonical_answer` field.
+     */
+    canonicalAnswer?: string;
+    /**
+     * Arbitrary string labels for this case.
+     * Use for filtering eval runs with `EvalRunnerOptions.filterTags`
+     * and for slicing results by category.
+     *
+     * @example ['tool-finding', 'multi-hop', 'search']
+     */
+    tags?: string[];
     /**
      * Expectations to validate against the tool response
      *
@@ -2486,14 +2959,30 @@ interface EvalExpectBlock {
      * LLM-as-judge evaluation (toPassToolJudge)
      */
     passesJudge?: {
-        /** Evaluation rubric/criteria */
-        rubric: string;
+        /** Built-in rubric name or custom rubric object */
+        rubric: BuiltInRubric | {
+            text: string;
+        };
         /** Reference response to compare against */
         reference?: unknown;
         /** Score threshold for passing (0-1, default: 0.7) */
         threshold?: number;
-        /** Judge configuration ID */
-        configId?: string;
+        /** Number of judge evaluations for this assertion. Overrides EvalCase.judgeReps. */
+        reps?: number;
+        /** Judge provider. @default 'claude' */
+        provider?: 'claude' | 'anthropic' | 'openai' | 'google';
+        /** Model override (e.g., 'claude-opus-4-20250514') */
+        model?: string;
+        /** Environment variable name for API key */
+        apiKeyEnvVar?: string;
+        /** Max tokens for judge response */
+        maxTokens?: number;
+        /** Temperature for judge LLM (0–1) */
+        temperature?: number;
+        /** Max budget in USD per evaluation */
+        maxBudgetUsd?: number;
+        /** Fail if response exceeds this size in bytes before judging */
+        maxToolOutputSize?: number;
     };
     /**
      * Response size validation (toHaveToolResponseSize)
@@ -2504,6 +2993,39 @@ interface EvalExpectBlock {
         /** Minimum required size in bytes */
         minBytes?: number;
     };
+    /**
+     * Asserts which tools the LLM called during an llm_host simulation.
+     * Only meaningful for llm_host mode — direct mode has no tool call trace.
+     */
+    toolsTriggered?: {
+        /** Expected tool calls */
+        calls: Array<{
+            /** Tool name */
+            name: string;
+            /** Expected arguments (partial match — extra keys are allowed) */
+            arguments?: Record<string, unknown>;
+            /** Whether this call MUST have been made (default: true) */
+            required?: boolean;
+        }>;
+        /**
+         * 'strict': calls must appear in the exact order listed
+         * 'any': calls can appear in any order (default)
+         */
+        order?: 'strict' | 'any';
+        /** If true, no tool calls outside the `calls` list are allowed */
+        exclusive?: boolean;
+    };
+    /**
+     * Asserts the number of tool calls made during an llm_host simulation.
+     */
+    toolCallCount?: {
+        /** Minimum number of tool calls */
+        min?: number;
+        /** Maximum number of tool calls */
+        max?: number;
+        /** Exact number of tool calls */
+        exact?: number;
+    };
 }
 /**
  * A complete eval dataset containing multiple test cases
@@ -2543,21 +3065,21 @@ declare const EvalCaseSchema: z.ZodObject<{
     args: z.ZodOptional<z.ZodRecord<z.ZodString, z.ZodUnknown>>;
     scenario: z.ZodOptional<z.ZodString>;
     llmHostConfig: z.ZodOptional<z.ZodObject<{
-        provider: z.ZodEnum<["openai", "anthropic"]>;
+        provider: z.ZodEnum<["openai", "anthropic", "azure", "google", "mistral", "ollama", "deepseek", "openrouter", "xai", "vertex-anthropic"]>;
         apiKeyEnvVar: z.ZodOptional<z.ZodString>;
         model: z.ZodOptional<z.ZodString>;
         maxTokens: z.ZodOptional<z.ZodNumber>;
         temperature: z.ZodOptional<z.ZodNumber>;
         maxToolCalls: z.ZodOptional<z.ZodNumber>;
     }, "strip", z.ZodTypeAny, {
-        provider: "anthropic" | "openai";
+        provider: "openai" | "anthropic" | "azure" | "google" | "mistral" | "ollama" | "deepseek" | "openrouter" | "xai" | "vertex-anthropic";
         model?: string | undefined;
         maxTokens?: number | undefined;
         apiKeyEnvVar?: string | undefined;
         temperature?: number | undefined;
         maxToolCalls?: number | undefined;
     }, {
-        provider: "anthropic" | "openai";
+        provider: "openai" | "anthropic" | "azure" | "google" | "mistral" | "ollama" | "deepseek" | "openrouter" | "xai" | "vertex-anthropic";
         model?: string | undefined;
         maxTokens?: number | undefined;
         apiKeyEnvVar?: string | undefined;
@@ -2565,6 +3087,11 @@ declare const EvalCaseSchema: z.ZodObject<{
         maxToolCalls?: number | undefined;
     }>>;
     metadata: z.ZodOptional<z.ZodRecord<z.ZodString, z.ZodUnknown>>;
+    iterations: z.ZodOptional<z.ZodNumber>;
+    accuracyThreshold: z.ZodOptional<z.ZodNumber>;
+    judgeReps: z.ZodOptional<z.ZodNumber>;
+    canonicalAnswer: z.ZodOptional<z.ZodString>;
+    tags: z.ZodOptional<z.ZodArray<z.ZodString, "many">>;
     expect: z.ZodOptional<z.ZodObject<{
         response: z.ZodOptional<z.ZodUnknown>;
         schema: z.ZodOptional<z.ZodString>;
@@ -2589,20 +3116,51 @@ declare const EvalCaseSchema: z.ZodObject<{
         }>]>, "many">>;
         isError: z.ZodOptional<z.ZodUnion<[z.ZodBoolean, z.ZodString, z.ZodArray<z.ZodString, "many">]>>;
         passesJudge: z.ZodOptional<z.ZodObject<{
-            rubric: z.ZodString;
+            rubric: z.ZodUnion<[z.ZodEnum<["correctness", "completeness", "groundedness", "instruction-following", "conciseness"]>, z.ZodObject<{
+                text: z.ZodString;
+            }, "strip", z.ZodTypeAny, {
+                text: string;
+            }, {
+                text: string;
+            }>]>;
             reference: z.ZodOptional<z.ZodUnknown>;
             threshold: z.ZodOptional<z.ZodNumber>;
-            configId: z.ZodOptional<z.ZodString>;
+            reps: z.ZodOptional<z.ZodNumber>;
+            provider: z.ZodOptional<z.ZodEnum<["claude", "anthropic", "openai", "google"]>>;
+            model: z.ZodOptional<z.ZodString>;
+            apiKeyEnvVar: z.ZodOptional<z.ZodString>;
+            maxTokens: z.ZodOptional<z.ZodNumber>;
+            temperature: z.ZodOptional<z.ZodNumber>;
+            maxBudgetUsd: z.ZodOptional<z.ZodNumber>;
+            maxToolOutputSize: z.ZodOptional<z.ZodNumber>;
         }, "strip", z.ZodTypeAny, {
-            rubric: string;
+            rubric: "correctness" | "completeness" | "groundedness" | "instruction-following" | "conciseness" | {
+                text: string;
+            };
+            model?: string | undefined;
+            maxTokens?: number | undefined;
+            maxBudgetUsd?: number | undefined;
             reference?: unknown;
             threshold?: number | undefined;
-            configId?: string | undefined;
+            reps?: number | undefined;
+            provider?: "openai" | "anthropic" | "google" | "claude" | undefined;
+            apiKeyEnvVar?: string | undefined;
+            temperature?: number | undefined;
+            maxToolOutputSize?: number | undefined;
         }, {
-            rubric: string;
+            rubric: "correctness" | "completeness" | "groundedness" | "instruction-following" | "conciseness" | {
+                text: string;
+            };
+            model?: string | undefined;
+            maxTokens?: number | undefined;
+            maxBudgetUsd?: number | undefined;
             reference?: unknown;
             threshold?: number | undefined;
-            configId?: string | undefined;
+            reps?: number | undefined;
+            provider?: "openai" | "anthropic" | "google" | "claude" | undefined;
+            apiKeyEnvVar?: string | undefined;
+            temperature?: number | undefined;
+            maxToolOutputSize?: number | undefined;
         }>>;
         responseSize: z.ZodOptional<z.ZodObject<{
             maxBytes: z.ZodOptional<z.ZodNumber>;
@@ -2614,11 +3172,71 @@ declare const EvalCaseSchema: z.ZodObject<{
             maxBytes?: number | undefined;
             minBytes?: number | undefined;
         }>>;
+        toolsTriggered: z.ZodOptional<z.ZodObject<{
+            calls: z.ZodArray<z.ZodObject<{
+                name: z.ZodString;
+                arguments: z.ZodOptional<z.ZodRecord<z.ZodString, z.ZodUnknown>>;
+                required: z.ZodOptional<z.ZodBoolean>;
+            }, "strip", z.ZodTypeAny, {
+                name: string;
+                required?: boolean | undefined;
+                arguments?: Record<string, unknown> | undefined;
+            }, {
+                name: string;
+                required?: boolean | undefined;
+                arguments?: Record<string, unknown> | undefined;
+            }>, "many">;
+            order: z.ZodOptional<z.ZodEnum<["strict", "any"]>>;
+            exclusive: z.ZodOptional<z.ZodBoolean>;
+        }, "strip", z.ZodTypeAny, {
+            calls: {
+                name: string;
+                required?: boolean | undefined;
+                arguments?: Record<string, unknown> | undefined;
+            }[];
+            order?: "strict" | "any" | undefined;
+            exclusive?: boolean | undefined;
+        }, {
+            calls: {
+                name: string;
+                required?: boolean | undefined;
+                arguments?: Record<string, unknown> | undefined;
+            }[];
+            order?: "strict" | "any" | undefined;
+            exclusive?: boolean | undefined;
+        }>>;
+        toolCallCount: z.ZodOptional<z.ZodObject<{
+            min: z.ZodOptional<z.ZodNumber>;
+            max: z.ZodOptional<z.ZodNumber>;
+            exact: z.ZodOptional<z.ZodNumber>;
+        }, "strip", z.ZodTypeAny, {
+            exact?: number | undefined;
+            min?: number | undefined;
+            max?: number | undefined;
+        }, {
+            exact?: number | undefined;
+            min?: number | undefined;
+            max?: number | undefined;
+        }>>;
     }, "strip", z.ZodTypeAny, {
+        response?: unknown;
         isError?: string | boolean | string[] | undefined;
         schema?: string | undefined;
         snapshot?: string | undefined;
-        response?: unknown;
+        toolsTriggered?: {
+            calls: {
+                name: string;
+                required?: boolean | undefined;
+                arguments?: Record<string, unknown> | undefined;
+            }[];
+            order?: "strict" | "any" | undefined;
+            exclusive?: boolean | undefined;
+        } | undefined;
+        toolCallCount?: {
+            exact?: number | undefined;
+            min?: number | undefined;
+            max?: number | undefined;
+        } | undefined;
         containsText?: string | string[] | undefined;
         matchesPattern?: string | string[] | undefined;
         snapshotSanitizers?: ("timestamp" | "uuid" | "iso-date" | "objectId" | "jwt" | {
@@ -2628,20 +3246,43 @@ declare const EvalCaseSchema: z.ZodObject<{
             remove: string[];
         })[] | undefined;
         passesJudge?: {
-            rubric: string;
+            rubric: "correctness" | "completeness" | "groundedness" | "instruction-following" | "conciseness" | {
+                text: string;
+            };
+            model?: string | undefined;
+            maxTokens?: number | undefined;
+            maxBudgetUsd?: number | undefined;
             reference?: unknown;
             threshold?: number | undefined;
-            configId?: string | undefined;
+            reps?: number | undefined;
+            provider?: "openai" | "anthropic" | "google" | "claude" | undefined;
+            apiKeyEnvVar?: string | undefined;
+            temperature?: number | undefined;
+            maxToolOutputSize?: number | undefined;
         } | undefined;
         responseSize?: {
             maxBytes?: number | undefined;
             minBytes?: number | undefined;
         } | undefined;
     }, {
+        response?: unknown;
         isError?: string | boolean | string[] | undefined;
         schema?: string | undefined;
         snapshot?: string | undefined;
-        response?: unknown;
+        toolsTriggered?: {
+            calls: {
+                name: string;
+                required?: boolean | undefined;
+                arguments?: Record<string, unknown> | undefined;
+            }[];
+            order?: "strict" | "any" | undefined;
+            exclusive?: boolean | undefined;
+        } | undefined;
+        toolCallCount?: {
+            exact?: number | undefined;
+            min?: number | undefined;
+            max?: number | undefined;
+        } | undefined;
         containsText?: string | string[] | undefined;
         matchesPattern?: string | string[] | undefined;
         snapshotSanitizers?: ("timestamp" | "uuid" | "iso-date" | "objectId" | "jwt" | {
@@ -2651,10 +3292,19 @@ declare const EvalCaseSchema: z.ZodObject<{
             remove: string[];
         })[] | undefined;
         passesJudge?: {
-            rubric: string;
+            rubric: "correctness" | "completeness" | "groundedness" | "instruction-following" | "conciseness" | {
+                text: string;
+            };
+            model?: string | undefined;
+            maxTokens?: number | undefined;
+            maxBudgetUsd?: number | undefined;
             reference?: unknown;
             threshold?: number | undefined;
-            configId?: string | undefined;
+            reps?: number | undefined;
+            provider?: "openai" | "anthropic" | "google" | "claude" | undefined;
+            apiKeyEnvVar?: string | undefined;
+            temperature?: number | undefined;
+            maxToolOutputSize?: number | undefined;
         } | undefined;
         responseSize?: {
             maxBytes?: number | undefined;
@@ -2664,24 +3314,43 @@ declare const EvalCaseSchema: z.ZodObject<{
 }, "strip", z.ZodTypeAny, {
     id: string;
     args?: Record<string, unknown> | undefined;
-    metadata?: Record<string, unknown> | undefined;
     mode?: "direct" | "llm_host" | undefined;
+    metadata?: Record<string, unknown> | undefined;
     description?: string | undefined;
     toolName?: string | undefined;
     scenario?: string | undefined;
     llmHostConfig?: {
-        provider: "anthropic" | "openai";
+        provider: "openai" | "anthropic" | "azure" | "google" | "mistral" | "ollama" | "deepseek" | "openrouter" | "xai" | "vertex-anthropic";
         model?: string | undefined;
         maxTokens?: number | undefined;
         apiKeyEnvVar?: string | undefined;
         temperature?: number | undefined;
         maxToolCalls?: number | undefined;
     } | undefined;
+    iterations?: number | undefined;
+    accuracyThreshold?: number | undefined;
+    judgeReps?: number | undefined;
+    canonicalAnswer?: string | undefined;
+    tags?: string[] | undefined;
     expect?: {
+        response?: unknown;
         isError?: string | boolean | string[] | undefined;
         schema?: string | undefined;
         snapshot?: string | undefined;
-        response?: unknown;
+        toolsTriggered?: {
+            calls: {
+                name: string;
+                required?: boolean | undefined;
+                arguments?: Record<string, unknown> | undefined;
+            }[];
+            order?: "strict" | "any" | undefined;
+            exclusive?: boolean | undefined;
+        } | undefined;
+        toolCallCount?: {
+            exact?: number | undefined;
+            min?: number | undefined;
+            max?: number | undefined;
+        } | undefined;
         containsText?: string | string[] | undefined;
         matchesPattern?: string | string[] | undefined;
         snapshotSanitizers?: ("timestamp" | "uuid" | "iso-date" | "objectId" | "jwt" | {
@@ -2691,10 +3360,19 @@ declare const EvalCaseSchema: z.ZodObject<{
             remove: string[];
         })[] | undefined;
         passesJudge?: {
-            rubric: string;
+            rubric: "correctness" | "completeness" | "groundedness" | "instruction-following" | "conciseness" | {
+                text: string;
+            };
+            model?: string | undefined;
+            maxTokens?: number | undefined;
+            maxBudgetUsd?: number | undefined;
             reference?: unknown;
             threshold?: number | undefined;
-            configId?: string | undefined;
+            reps?: number | undefined;
+            provider?: "openai" | "anthropic" | "google" | "claude" | undefined;
+            apiKeyEnvVar?: string | undefined;
+            temperature?: number | undefined;
+            maxToolOutputSize?: number | undefined;
         } | undefined;
         responseSize?: {
             maxBytes?: number | undefined;
@@ -2704,24 +3382,43 @@ declare const EvalCaseSchema: z.ZodObject<{
 }, {
     id: string;
     args?: Record<string, unknown> | undefined;
-    metadata?: Record<string, unknown> | undefined;
     mode?: "direct" | "llm_host" | undefined;
+    metadata?: Record<string, unknown> | undefined;
     description?: string | undefined;
     toolName?: string | undefined;
     scenario?: string | undefined;
     llmHostConfig?: {
-        provider: "anthropic" | "openai";
+        provider: "openai" | "anthropic" | "azure" | "google" | "mistral" | "ollama" | "deepseek" | "openrouter" | "xai" | "vertex-anthropic";
         model?: string | undefined;
         maxTokens?: number | undefined;
         apiKeyEnvVar?: string | undefined;
         temperature?: number | undefined;
         maxToolCalls?: number | undefined;
     } | undefined;
+    iterations?: number | undefined;
+    accuracyThreshold?: number | undefined;
+    judgeReps?: number | undefined;
+    canonicalAnswer?: string | undefined;
+    tags?: string[] | undefined;
     expect?: {
+        response?: unknown;
         isError?: string | boolean | string[] | undefined;
         schema?: string | undefined;
         snapshot?: string | undefined;
-        response?: unknown;
+        toolsTriggered?: {
+            calls: {
+                name: string;
+                required?: boolean | undefined;
+                arguments?: Record<string, unknown> | undefined;
+            }[];
+            order?: "strict" | "any" | undefined;
+            exclusive?: boolean | undefined;
+        } | undefined;
+        toolCallCount?: {
+            exact?: number | undefined;
+            min?: number | undefined;
+            max?: number | undefined;
+        } | undefined;
         containsText?: string | string[] | undefined;
         matchesPattern?: string | string[] | undefined;
         snapshotSanitizers?: ("timestamp" | "uuid" | "iso-date" | "objectId" | "jwt" | {
@@ -2731,10 +3428,19 @@ declare const EvalCaseSchema: z.ZodObject<{
             remove: string[];
         })[] | undefined;
         passesJudge?: {
-            rubric: string;
+            rubric: "correctness" | "completeness" | "groundedness" | "instruction-following" | "conciseness" | {
+                text: string;
+            };
+            model?: string | undefined;
+            maxTokens?: number | undefined;
+            maxBudgetUsd?: number | undefined;
             reference?: unknown;
             threshold?: number | undefined;
-            configId?: string | undefined;
+            reps?: number | undefined;
+            provider?: "openai" | "anthropic" | "google" | "claude" | undefined;
+            apiKeyEnvVar?: string | undefined;
+            temperature?: number | undefined;
+            maxToolOutputSize?: number | undefined;
         } | undefined;
         responseSize?: {
             maxBytes?: number | undefined;
@@ -2756,21 +3462,21 @@ declare const EvalDatasetSchema: z.ZodObject<{
         args: z.ZodOptional<z.ZodRecord<z.ZodString, z.ZodUnknown>>;
         scenario: z.ZodOptional<z.ZodString>;
         llmHostConfig: z.ZodOptional<z.ZodObject<{
-            provider: z.ZodEnum<["openai", "anthropic"]>;
+            provider: z.ZodEnum<["openai", "anthropic", "azure", "google", "mistral", "ollama", "deepseek", "openrouter", "xai", "vertex-anthropic"]>;
             apiKeyEnvVar: z.ZodOptional<z.ZodString>;
             model: z.ZodOptional<z.ZodString>;
             maxTokens: z.ZodOptional<z.ZodNumber>;
             temperature: z.ZodOptional<z.ZodNumber>;
             maxToolCalls: z.ZodOptional<z.ZodNumber>;
         }, "strip", z.ZodTypeAny, {
-            provider: "anthropic" | "openai";
+            provider: "openai" | "anthropic" | "azure" | "google" | "mistral" | "ollama" | "deepseek" | "openrouter" | "xai" | "vertex-anthropic";
             model?: string | undefined;
             maxTokens?: number | undefined;
             apiKeyEnvVar?: string | undefined;
             temperature?: number | undefined;
             maxToolCalls?: number | undefined;
         }, {
-            provider: "anthropic" | "openai";
+            provider: "openai" | "anthropic" | "azure" | "google" | "mistral" | "ollama" | "deepseek" | "openrouter" | "xai" | "vertex-anthropic";
             model?: string | undefined;
             maxTokens?: number | undefined;
             apiKeyEnvVar?: string | undefined;
@@ -2778,6 +3484,11 @@ declare const EvalDatasetSchema: z.ZodObject<{
             maxToolCalls?: number | undefined;
         }>>;
         metadata: z.ZodOptional<z.ZodRecord<z.ZodString, z.ZodUnknown>>;
+        iterations: z.ZodOptional<z.ZodNumber>;
+        accuracyThreshold: z.ZodOptional<z.ZodNumber>;
+        judgeReps: z.ZodOptional<z.ZodNumber>;
+        canonicalAnswer: z.ZodOptional<z.ZodString>;
+        tags: z.ZodOptional<z.ZodArray<z.ZodString, "many">>;
         expect: z.ZodOptional<z.ZodObject<{
             response: z.ZodOptional<z.ZodUnknown>;
             schema: z.ZodOptional<z.ZodString>;
@@ -2802,20 +3513,51 @@ declare const EvalDatasetSchema: z.ZodObject<{
             }>]>, "many">>;
             isError: z.ZodOptional<z.ZodUnion<[z.ZodBoolean, z.ZodString, z.ZodArray<z.ZodString, "many">]>>;
             passesJudge: z.ZodOptional<z.ZodObject<{
-                rubric: z.ZodString;
+                rubric: z.ZodUnion<[z.ZodEnum<["correctness", "completeness", "groundedness", "instruction-following", "conciseness"]>, z.ZodObject<{
+                    text: z.ZodString;
+                }, "strip", z.ZodTypeAny, {
+                    text: string;
+                }, {
+                    text: string;
+                }>]>;
                 reference: z.ZodOptional<z.ZodUnknown>;
                 threshold: z.ZodOptional<z.ZodNumber>;
-                configId: z.ZodOptional<z.ZodString>;
+                reps: z.ZodOptional<z.ZodNumber>;
+                provider: z.ZodOptional<z.ZodEnum<["claude", "anthropic", "openai", "google"]>>;
+                model: z.ZodOptional<z.ZodString>;
+                apiKeyEnvVar: z.ZodOptional<z.ZodString>;
+                maxTokens: z.ZodOptional<z.ZodNumber>;
+                temperature: z.ZodOptional<z.ZodNumber>;
+                maxBudgetUsd: z.ZodOptional<z.ZodNumber>;
+                maxToolOutputSize: z.ZodOptional<z.ZodNumber>;
             }, "strip", z.ZodTypeAny, {
-                rubric: string;
+                rubric: "correctness" | "completeness" | "groundedness" | "instruction-following" | "conciseness" | {
+                    text: string;
+                };
+                model?: string | undefined;
+                maxTokens?: number | undefined;
+                maxBudgetUsd?: number | undefined;
                 reference?: unknown;
                 threshold?: number | undefined;
-                configId?: string | undefined;
+                reps?: number | undefined;
+                provider?: "openai" | "anthropic" | "google" | "claude" | undefined;
+                apiKeyEnvVar?: string | undefined;
+                temperature?: number | undefined;
+                maxToolOutputSize?: number | undefined;
             }, {
-                rubric: string;
+                rubric: "correctness" | "completeness" | "groundedness" | "instruction-following" | "conciseness" | {
+                    text: string;
+                };
+                model?: string | undefined;
+                maxTokens?: number | undefined;
+                maxBudgetUsd?: number | undefined;
                 reference?: unknown;
                 threshold?: number | undefined;
-                configId?: string | undefined;
+                reps?: number | undefined;
+                provider?: "openai" | "anthropic" | "google" | "claude" | undefined;
+                apiKeyEnvVar?: string | undefined;
+                temperature?: number | undefined;
+                maxToolOutputSize?: number | undefined;
             }>>;
             responseSize: z.ZodOptional<z.ZodObject<{
                 maxBytes: z.ZodOptional<z.ZodNumber>;
@@ -2827,11 +3569,71 @@ declare const EvalDatasetSchema: z.ZodObject<{
                 maxBytes?: number | undefined;
                 minBytes?: number | undefined;
             }>>;
+            toolsTriggered: z.ZodOptional<z.ZodObject<{
+                calls: z.ZodArray<z.ZodObject<{
+                    name: z.ZodString;
+                    arguments: z.ZodOptional<z.ZodRecord<z.ZodString, z.ZodUnknown>>;
+                    required: z.ZodOptional<z.ZodBoolean>;
+                }, "strip", z.ZodTypeAny, {
+                    name: string;
+                    required?: boolean | undefined;
+                    arguments?: Record<string, unknown> | undefined;
+                }, {
+                    name: string;
+                    required?: boolean | undefined;
+                    arguments?: Record<string, unknown> | undefined;
+                }>, "many">;
+                order: z.ZodOptional<z.ZodEnum<["strict", "any"]>>;
+                exclusive: z.ZodOptional<z.ZodBoolean>;
+            }, "strip", z.ZodTypeAny, {
+                calls: {
+                    name: string;
+                    required?: boolean | undefined;
+                    arguments?: Record<string, unknown> | undefined;
+                }[];
+                order?: "strict" | "any" | undefined;
+                exclusive?: boolean | undefined;
+            }, {
+                calls: {
+                    name: string;
+                    required?: boolean | undefined;
+                    arguments?: Record<string, unknown> | undefined;
+                }[];
+                order?: "strict" | "any" | undefined;
+                exclusive?: boolean | undefined;
+            }>>;
+            toolCallCount: z.ZodOptional<z.ZodObject<{
+                min: z.ZodOptional<z.ZodNumber>;
+                max: z.ZodOptional<z.ZodNumber>;
+                exact: z.ZodOptional<z.ZodNumber>;
+            }, "strip", z.ZodTypeAny, {
+                exact?: number | undefined;
+                min?: number | undefined;
+                max?: number | undefined;
+            }, {
+                exact?: number | undefined;
+                min?: number | undefined;
+                max?: number | undefined;
+            }>>;
         }, "strip", z.ZodTypeAny, {
+            response?: unknown;
             isError?: string | boolean | string[] | undefined;
             schema?: string | undefined;
             snapshot?: string | undefined;
-            response?: unknown;
+            toolsTriggered?: {
+                calls: {
+                    name: string;
+                    required?: boolean | undefined;
+                    arguments?: Record<string, unknown> | undefined;
+                }[];
+                order?: "strict" | "any" | undefined;
+                exclusive?: boolean | undefined;
+            } | undefined;
+            toolCallCount?: {
+                exact?: number | undefined;
+                min?: number | undefined;
+                max?: number | undefined;
+            } | undefined;
             containsText?: string | string[] | undefined;
             matchesPattern?: string | string[] | undefined;
             snapshotSanitizers?: ("timestamp" | "uuid" | "iso-date" | "objectId" | "jwt" | {
@@ -2841,20 +3643,43 @@ declare const EvalDatasetSchema: z.ZodObject<{
                 remove: string[];
             })[] | undefined;
             passesJudge?: {
-                rubric: string;
+                rubric: "correctness" | "completeness" | "groundedness" | "instruction-following" | "conciseness" | {
+                    text: string;
+                };
+                model?: string | undefined;
+                maxTokens?: number | undefined;
+                maxBudgetUsd?: number | undefined;
                 reference?: unknown;
                 threshold?: number | undefined;
-                configId?: string | undefined;
+                reps?: number | undefined;
+                provider?: "openai" | "anthropic" | "google" | "claude" | undefined;
+                apiKeyEnvVar?: string | undefined;
+                temperature?: number | undefined;
+                maxToolOutputSize?: number | undefined;
             } | undefined;
             responseSize?: {
                 maxBytes?: number | undefined;
                 minBytes?: number | undefined;
             } | undefined;
         }, {
+            response?: unknown;
             isError?: string | boolean | string[] | undefined;
             schema?: string | undefined;
             snapshot?: string | undefined;
-            response?: unknown;
+            toolsTriggered?: {
+                calls: {
+                    name: string;
+                    required?: boolean | undefined;
+                    arguments?: Record<string, unknown> | undefined;
+                }[];
+                order?: "strict" | "any" | undefined;
+                exclusive?: boolean | undefined;
+            } | undefined;
+            toolCallCount?: {
+                exact?: number | undefined;
+                min?: number | undefined;
+                max?: number | undefined;
+            } | undefined;
             containsText?: string | string[] | undefined;
             matchesPattern?: string | string[] | undefined;
             snapshotSanitizers?: ("timestamp" | "uuid" | "iso-date" | "objectId" | "jwt" | {
@@ -2864,10 +3689,19 @@ declare const EvalDatasetSchema: z.ZodObject<{
                 remove: string[];
             })[] | undefined;
             passesJudge?: {
-                rubric: string;
+                rubric: "correctness" | "completeness" | "groundedness" | "instruction-following" | "conciseness" | {
+                    text: string;
+                };
+                model?: string | undefined;
+                maxTokens?: number | undefined;
+                maxBudgetUsd?: number | undefined;
                 reference?: unknown;
                 threshold?: number | undefined;
-                configId?: string | undefined;
+                reps?: number | undefined;
+                provider?: "openai" | "anthropic" | "google" | "claude" | undefined;
+                apiKeyEnvVar?: string | undefined;
+                temperature?: number | undefined;
+                maxToolOutputSize?: number | undefined;
             } | undefined;
             responseSize?: {
                 maxBytes?: number | undefined;
@@ -2877,24 +3711,43 @@ declare const EvalDatasetSchema: z.ZodObject<{
     }, "strip", z.ZodTypeAny, {
         id: string;
         args?: Record<string, unknown> | undefined;
-        metadata?: Record<string, unknown> | undefined;
         mode?: "direct" | "llm_host" | undefined;
+        metadata?: Record<string, unknown> | undefined;
         description?: string | undefined;
         toolName?: string | undefined;
         scenario?: string | undefined;
         llmHostConfig?: {
-            provider: "anthropic" | "openai";
+            provider: "openai" | "anthropic" | "azure" | "google" | "mistral" | "ollama" | "deepseek" | "openrouter" | "xai" | "vertex-anthropic";
             model?: string | undefined;
             maxTokens?: number | undefined;
             apiKeyEnvVar?: string | undefined;
             temperature?: number | undefined;
             maxToolCalls?: number | undefined;
         } | undefined;
+        iterations?: number | undefined;
+        accuracyThreshold?: number | undefined;
+        judgeReps?: number | undefined;
+        canonicalAnswer?: string | undefined;
+        tags?: string[] | undefined;
         expect?: {
+            response?: unknown;
             isError?: string | boolean | string[] | undefined;
             schema?: string | undefined;
             snapshot?: string | undefined;
-            response?: unknown;
+            toolsTriggered?: {
+                calls: {
+                    name: string;
+                    required?: boolean | undefined;
+                    arguments?: Record<string, unknown> | undefined;
+                }[];
+                order?: "strict" | "any" | undefined;
+                exclusive?: boolean | undefined;
+            } | undefined;
+            toolCallCount?: {
+                exact?: number | undefined;
+                min?: number | undefined;
+                max?: number | undefined;
+            } | undefined;
             containsText?: string | string[] | undefined;
             matchesPattern?: string | string[] | undefined;
             snapshotSanitizers?: ("timestamp" | "uuid" | "iso-date" | "objectId" | "jwt" | {
@@ -2904,10 +3757,19 @@ declare const EvalDatasetSchema: z.ZodObject<{
                 remove: string[];
             })[] | undefined;
             passesJudge?: {
-                rubric: string;
+                rubric: "correctness" | "completeness" | "groundedness" | "instruction-following" | "conciseness" | {
+                    text: string;
+                };
+                model?: string | undefined;
+                maxTokens?: number | undefined;
+                maxBudgetUsd?: number | undefined;
                 reference?: unknown;
                 threshold?: number | undefined;
-                configId?: string | undefined;
+                reps?: number | undefined;
+                provider?: "openai" | "anthropic" | "google" | "claude" | undefined;
+                apiKeyEnvVar?: string | undefined;
+                temperature?: number | undefined;
+                maxToolOutputSize?: number | undefined;
             } | undefined;
             responseSize?: {
                 maxBytes?: number | undefined;
@@ -2917,24 +3779,43 @@ declare const EvalDatasetSchema: z.ZodObject<{
     }, {
         id: string;
         args?: Record<string, unknown> | undefined;
-        metadata?: Record<string, unknown> | undefined;
         mode?: "direct" | "llm_host" | undefined;
+        metadata?: Record<string, unknown> | undefined;
         description?: string | undefined;
         toolName?: string | undefined;
         scenario?: string | undefined;
         llmHostConfig?: {
-            provider: "anthropic" | "openai";
+            provider: "openai" | "anthropic" | "azure" | "google" | "mistral" | "ollama" | "deepseek" | "openrouter" | "xai" | "vertex-anthropic";
             model?: string | undefined;
             maxTokens?: number | undefined;
             apiKeyEnvVar?: string | undefined;
             temperature?: number | undefined;
             maxToolCalls?: number | undefined;
         } | undefined;
+        iterations?: number | undefined;
+        accuracyThreshold?: number | undefined;
+        judgeReps?: number | undefined;
+        canonicalAnswer?: string | undefined;
+        tags?: string[] | undefined;
         expect?: {
+            response?: unknown;
             isError?: string | boolean | string[] | undefined;
             schema?: string | undefined;
             snapshot?: string | undefined;
-            response?: unknown;
+            toolsTriggered?: {
+                calls: {
+                    name: string;
+                    required?: boolean | undefined;
+                    arguments?: Record<string, unknown> | undefined;
+                }[];
+                order?: "strict" | "any" | undefined;
+                exclusive?: boolean | undefined;
+            } | undefined;
+            toolCallCount?: {
+                exact?: number | undefined;
+                min?: number | undefined;
+                max?: number | undefined;
+            } | undefined;
             containsText?: string | string[] | undefined;
             matchesPattern?: string | string[] | undefined;
             snapshotSanitizers?: ("timestamp" | "uuid" | "iso-date" | "objectId" | "jwt" | {
@@ -2944,10 +3825,19 @@ declare const EvalDatasetSchema: z.ZodObject<{
                 remove: string[];
             })[] | undefined;
             passesJudge?: {
-                rubric: string;
+                rubric: "correctness" | "completeness" | "groundedness" | "instruction-following" | "conciseness" | {
+                    text: string;
+                };
+                model?: string | undefined;
+                maxTokens?: number | undefined;
+                maxBudgetUsd?: number | undefined;
                 reference?: unknown;
                 threshold?: number | undefined;
-                configId?: string | undefined;
+                reps?: number | undefined;
+                provider?: "openai" | "anthropic" | "google" | "claude" | undefined;
+                apiKeyEnvVar?: string | undefined;
+                temperature?: number | undefined;
+                maxToolOutputSize?: number | undefined;
             } | undefined;
             responseSize?: {
                 maxBytes?: number | undefined;
@@ -2961,24 +3851,43 @@ declare const EvalDatasetSchema: z.ZodObject<{
     cases: {
         id: string;
         args?: Record<string, unknown> | undefined;
-        metadata?: Record<string, unknown> | undefined;
         mode?: "direct" | "llm_host" | undefined;
+        metadata?: Record<string, unknown> | undefined;
         description?: string | undefined;
         toolName?: string | undefined;
         scenario?: string | undefined;
         llmHostConfig?: {
-            provider: "anthropic" | "openai";
+            provider: "openai" | "anthropic" | "azure" | "google" | "mistral" | "ollama" | "deepseek" | "openrouter" | "xai" | "vertex-anthropic";
             model?: string | undefined;
             maxTokens?: number | undefined;
             apiKeyEnvVar?: string | undefined;
             temperature?: number | undefined;
             maxToolCalls?: number | undefined;
         } | undefined;
+        iterations?: number | undefined;
+        accuracyThreshold?: number | undefined;
+        judgeReps?: number | undefined;
+        canonicalAnswer?: string | undefined;
+        tags?: string[] | undefined;
         expect?: {
+            response?: unknown;
             isError?: string | boolean | string[] | undefined;
             schema?: string | undefined;
             snapshot?: string | undefined;
-            response?: unknown;
+            toolsTriggered?: {
+                calls: {
+                    name: string;
+                    required?: boolean | undefined;
+                    arguments?: Record<string, unknown> | undefined;
+                }[];
+                order?: "strict" | "any" | undefined;
+                exclusive?: boolean | undefined;
+            } | undefined;
+            toolCallCount?: {
+                exact?: number | undefined;
+                min?: number | undefined;
+                max?: number | undefined;
+            } | undefined;
             containsText?: string | string[] | undefined;
             matchesPattern?: string | string[] | undefined;
             snapshotSanitizers?: ("timestamp" | "uuid" | "iso-date" | "objectId" | "jwt" | {
@@ -2988,10 +3897,19 @@ declare const EvalDatasetSchema: z.ZodObject<{
                 remove: string[];
             })[] | undefined;
             passesJudge?: {
-                rubric: string;
+                rubric: "correctness" | "completeness" | "groundedness" | "instruction-following" | "conciseness" | {
+                    text: string;
+                };
+                model?: string | undefined;
+                maxTokens?: number | undefined;
+                maxBudgetUsd?: number | undefined;
                 reference?: unknown;
                 threshold?: number | undefined;
-                configId?: string | undefined;
+                reps?: number | undefined;
+                provider?: "openai" | "anthropic" | "google" | "claude" | undefined;
+                apiKeyEnvVar?: string | undefined;
+                temperature?: number | undefined;
+                maxToolOutputSize?: number | undefined;
             } | undefined;
             responseSize?: {
                 maxBytes?: number | undefined;
@@ -3006,24 +3924,43 @@ declare const EvalDatasetSchema: z.ZodObject<{
     cases: {
         id: string;
         args?: Record<string, unknown> | undefined;
-        metadata?: Record<string, unknown> | undefined;
         mode?: "direct" | "llm_host" | undefined;
+        metadata?: Record<string, unknown> | undefined;
         description?: string | undefined;
         toolName?: string | undefined;
         scenario?: string | undefined;
         llmHostConfig?: {
-            provider: "anthropic" | "openai";
+            provider: "openai" | "anthropic" | "azure" | "google" | "mistral" | "ollama" | "deepseek" | "openrouter" | "xai" | "vertex-anthropic";
             model?: string | undefined;
             maxTokens?: number | undefined;
             apiKeyEnvVar?: string | undefined;
             temperature?: number | undefined;
             maxToolCalls?: number | undefined;
         } | undefined;
+        iterations?: number | undefined;
+        accuracyThreshold?: number | undefined;
+        judgeReps?: number | undefined;
+        canonicalAnswer?: string | undefined;
+        tags?: string[] | undefined;
         expect?: {
+            response?: unknown;
             isError?: string | boolean | string[] | undefined;
             schema?: string | undefined;
             snapshot?: string | undefined;
-            response?: unknown;
+            toolsTriggered?: {
+                calls: {
+                    name: string;
+                    required?: boolean | undefined;
+                    arguments?: Record<string, unknown> | undefined;
+                }[];
+                order?: "strict" | "any" | undefined;
+                exclusive?: boolean | undefined;
+            } | undefined;
+            toolCallCount?: {
+                exact?: number | undefined;
+                min?: number | undefined;
+                max?: number | undefined;
+            } | undefined;
             containsText?: string | string[] | undefined;
             matchesPattern?: string | string[] | undefined;
             snapshotSanitizers?: ("timestamp" | "uuid" | "iso-date" | "objectId" | "jwt" | {
@@ -3033,10 +3970,19 @@ declare const EvalDatasetSchema: z.ZodObject<{
                 remove: string[];
             })[] | undefined;
             passesJudge?: {
-                rubric: string;
+                rubric: "correctness" | "completeness" | "groundedness" | "instruction-following" | "conciseness" | {
+                    text: string;
+                };
+                model?: string | undefined;
+                maxTokens?: number | undefined;
+                maxBudgetUsd?: number | undefined;
                 reference?: unknown;
                 threshold?: number | undefined;
-                configId?: string | undefined;
+                reps?: number | undefined;
+                provider?: "openai" | "anthropic" | "google" | "claude" | undefined;
+                apiKeyEnvVar?: string | undefined;
+                temperature?: number | undefined;
+                maxToolOutputSize?: number | undefined;
             } | undefined;
             responseSize?: {
                 maxBytes?: number | undefined;
@@ -3126,50 +4072,140 @@ declare function loadEvalDataset(filePath: string, options?: LoadDatasetOptions)
 declare function loadEvalDatasetFromObject(data: unknown, options?: LoadDatasetOptions): EvalDataset;
 /**
- * Context passed to the eval runner
+ * Reporter-specific type definitions
+ *
+ * These types are used by the MCP reporter and UI.
+ *
+ * @packageDocumentation
  */
-interface EvalContext {
+/**
+ * Experiment tracking metadata for an eval run.
+ *
+ * Surfaced to callers through the optional `EvalRunnerResult.metadata` field.
+ * `timestamp` and `packageVersion` are always present; every other field is
+ * optional.
+ */
+interface EvalRunMetadata {
+    /** Git commit hash at time of run */
+    gitHash?: string;
+    /** ISO timestamp of the run */
+    timestamp: string;
+    /** Package version from package.json */
+    packageVersion: string;
+    /**
+     * LLM host model identifier (if llm_host mode).
+     * NOTE(review): presumably copied from `EvalRunnerOptions.llmHostModel` — confirm.
+     */
+    llmHostModel?: string;
+    /**
+     * Judge model identifier (if judge was used).
+     * NOTE(review): presumably copied from `EvalRunnerOptions.judgeModel` — confirm.
+     */
+    judgeModel?: string;
+}
+/**
+ * Individual conformance check result
+ */
+interface MCPConformanceCheck$1 {
     /**
-     * MCP fixture API for interacting with the server
+     * Check name (e.g., 'server_info_present', 'list_tools_succeeds')
      */
-    mcp: MCPFixtureApi;
+    name: string;
     /**
-     * Optional Playwright TestInfo for reporter integration
-     * When provided, eval results will be attached to the test for the MCP reporter
+     * Whether the check passed
      */
-    testInfo?: TestInfo;
+    pass: boolean;
     /**
-     * Optional Playwright expect function for snapshot testing
-     * Required for snapshot expectations to work properly
+     * Human-readable message describing the result
      */
-    expect?: Expect;
+    message: string;
 }
 /**
- * Result of a single eval case
+ * Conformance check result as stored in reporter data
  */
-interface EvalCaseResult$1 {
+interface MCPConformanceResultData {
     /**
-     * Case ID
+     * Test title where conformance check was run
      */
-    id: string;
+    testTitle: string;
     /**
-     * Dataset name this case belongs to
+     * Whether all checks passed
      */
-    datasetName: string;
+    pass: boolean;
     /**
-     * MCP tool name that was called
+     * Individual check results
      */
-    toolName: string;
+    checks: MCPConformanceCheck$1[];
     /**
-     * Evaluation mode (direct or llm_host)
-     * @deprecated Mode is inferred from test context, not displayed in reports
+     * Server info if available
      */
-    mode?: 'direct' | 'llm_host';
-    /**
-     * Source of this result
-     * - 'eval': From runEvalDataset() using JSON eval datasets
-     * - 'test': From direct API test tracking (MCP fixture calls)
+    serverInfo?: {
+        name?: string;
+        version?: string;
+    };
+    /**
+     * Number of tools discovered
+     */
+    toolCount: number;
+    /**
+     * Auth type used for this check
+     */
+    authType?: AuthType;
+    /**
+     * Project name
+     */
+    project?: string;
+}
+/**
+ * Server capabilities data from mcp-list-tools attachment.
+ *
+ * Entries of this shape are collected in `MCPEvalRunData.serverCapabilities`.
+ */
+interface MCPServerCapabilitiesData {
+    /**
+     * Test title where listTools was called
+     */
+    testTitle: string;
+    /**
+     * List of tools available on the server.
+     * Each entry carries the tool `name` and, when available, its `description`.
+     */
+    tools: Array<{
+        name: string;
+        description?: string;
+    }>;
+    /**
+     * Total number of tools.
+     * NOTE(review): presumably equal to `tools.length` — confirm against the reporter.
+     */
+    toolCount: number;
+    /**
+     * Auth type used for this test
+     */
+    authType?: AuthType;
+    /**
+     * Project name.
+     * NOTE(review): presumably the Playwright project name, as on `EvalCaseResult.project` — confirm.
+     */
+    project?: string;
+}
+/**
+ * Result of a single iteration within a multi-iteration eval case.
+ *
+ * Used as the element type of `EvalCaseResult.iterationResults`.
+ */
+interface IterationResult {
+    /** Whether this iteration passed */
+    pass: boolean;
+    /** Execution time for this iteration, in milliseconds */
+    durationMs: number;
+    /** Error message if the iteration failed with an exception */
+    error?: string;
+    /** When true, this iteration failed due to network/infrastructure issues rather than an assertion failure */
+    isInfrastructureError?: boolean;
+}
+/**
+ * Result of a single eval case
+ */
+interface EvalCaseResult {
+    /**
+     * Case ID
+     */
+    id: string;
+    /**
+     * Dataset name this case belongs to
+     */
+    datasetName: string;
+    /**
+     * MCP tool name that was called
+     */
+    toolName: string;
+    /**
+     * Source of this result
      */
     source: ResultSource;
     /**
@@ -3194,14 +4230,164 @@ interface EvalCaseResult$1 {
     authType?: AuthType;
     /**
      * Playwright project name this test belongs to
-     * Used for filtering/grouping results by project in the reporter
      */
     project?: string;
     /**
      * Execution time in milliseconds
      */
     durationMs: number;
+    /**
+     * Assertion pass rate (0–1): passes divided by non-infrastructure iterations.
+     * Only present when the case was run with `iterations > 1`.
+     *
+     * Infrastructure errors (network timeouts, rate limits, etc.) are excluded from
+     * the denominator so that environment flakiness does not skew this metric.
+     */
+    assertionPassRate?: number;
+    /**
+     * Infrastructure error rate (0–1): infra errors divided by total iterations.
+     * Only present when the case was run with `iterations > 1`.
+     */
+    infrastructureErrorRate?: number;
+    /**
+     * Accuracy score (0–1) across all iterations.
+     * Alias for `assertionPassRate`. Only present when the case was run with `iterations > 1`.
+     * @deprecated Use `assertionPassRate` for clarity; this field is kept for backward compatibility.
+     */
+    accuracy?: number;
+    /**
+     * Per-iteration pass/fail breakdown.
+     * Only present when the case was run with `iterations > 1`.
+     */
+    iterationResults?: Array<IterationResult>;
+    /**
+     * Tags from the source eval case, for filtering and slicing reports.
+     */
+    tags?: string[];
+    /**
+     * Precision of tool calls made (0–1).
+     * 1.0 means every tool called was expected; <1.0 means unexpected tools were called.
+     * Only populated when exclusive: true in toolsTriggered and the expectation was evaluated.
+     */
+    toolPrecision?: number;
+    /**
+     * Recall of required tool calls (0–1).
+     * 1.0 means all required tools were called; <1.0 means some were missed.
+     * Only populated when toolsTriggered expectation was evaluated.
+     */
+    toolRecall?: number;
+    /**
+     * Pass/fail status of this case in the baseline run.
+     * Only present when a baseline was provided to runEvalDataset.
+     */
+    baselinePass?: boolean;
+    /**
+     * Number of iterations that failed due to infrastructure errors (network, rate limits, etc.)
+     * Only present when the case was run with `iterations > 1`.
+     */
+    infrastructureErrorCount?: number;
 }
+/**
+ * Aggregated MCP eval run data.
+ *
+ * Bundles run-level timing and environment info, aggregate metrics, the
+ * per-case results, and optional conformance / server-capability data.
+ * Exposed to the UI as `MCPEvalData.runData`.
+ */
+interface MCPEvalRunData {
+    /**
+     * Run timestamp (ISO 8601)
+     */
+    timestamp: string;
+    /**
+     * Total duration in milliseconds
+     */
+    durationMs: number;
+    /**
+     * Environment info.
+     *
+     * NOTE(review): the members are not documented in the declaration;
+     * presumably `ci` flags a CI run, `node` is the Node.js version string and
+     * `platform` is the operating-system platform — confirm against the reporter.
+     */
+    environment: {
+        ci: boolean;
+        node: string;
+        platform: string;
+    };
+    /**
+     * Aggregate metrics
+     */
+    metrics: {
+        /**
+         * Total number of eval cases
+         */
+        total: number;
+        /**
+         * Number of passed cases
+         */
+        passed: number;
+        /**
+         * Number of failed cases
+         */
+        failed: number;
+        /**
+         * Pass rate (0-1).
+         * NOTE(review): presumably `passed / total` — confirm.
+         */
+        passRate: number;
+        /**
+         * Dataset breakdown: dataset name -> count
+         */
+        datasetBreakdown: Record<string, number>;
+        /**
+         * Expectation type breakdown
+         */
+        expectationBreakdown: ExpectationBreakdown;
+    };
+    /**
+     * All eval results from this run
+     */
+    results: EvalCaseResult[];
+    /**
+     * Conformance check results (optional)
+     */
+    conformanceChecks?: MCPConformanceResultData[];
+    /**
+     * Server capabilities discovered via listTools (optional)
+     */
+    serverCapabilities?: MCPServerCapabilitiesData[];
+}
+/**
+ * Historical summary for trend charts.
+ *
+ * A flat, per-run record; `MCPEvalData.historical` holds an array of these.
+ *
+ * The field names mirror `MCPEvalRunData` (`timestamp`, `durationMs`) and
+ * `MCPEvalRunData.metrics` (`total`, `passed`, `failed`, `passRate`).
+ * NOTE(review): presumably they share the same meaning and units (ISO 8601
+ * timestamp, case counts, pass rate in 0-1, duration in milliseconds) —
+ * confirm against the reporter that writes them.
+ */
+interface MCPEvalHistoricalSummary {
+    timestamp: string;
+    total: number;
+    passed: number;
+    failed: number;
+    passRate: number;
+    durationMs: number;
+}
+/**
+ * Complete data structure passed to UI.
+ *
+ * - `runData`: aggregated data for a single eval run (`MCPEvalRunData`).
+ * - `historical`: per-run summaries (`MCPEvalHistoricalSummary`) intended for
+ *   trend charts. NOTE(review): ordering and whether the current run is
+ *   included are not visible from this declaration — confirm.
+ */
+interface MCPEvalData {
+    runData: MCPEvalRunData;
+    historical: MCPEvalHistoricalSummary[];
+}
+/**
+ * Context passed to the eval runner.
+ *
+ * This is the type of the `context` argument of both `runEvalCase` and
+ * `runEvalDataset`. Only `mcp` is required.
+ */
+interface EvalContext {
+    /**
+     * MCP fixture API for interacting with the server
+     */
+    mcp: MCPFixtureApi;
+    /**
+     * Optional Playwright TestInfo for reporter integration
+     * When provided, eval results will be attached to the test for the MCP reporter
+     */
+    testInfo?: TestInfo;
+    /**
+     * Optional Playwright expect function for snapshot testing
+     * Required for snapshot expectations to work properly
+     */
+    expect?: Expect;
+}
 /**
  * Overall result of running an eval dataset
  */
@@ -3221,11 +4407,48 @@ interface EvalRunnerResult {
     /**
      * Individual case results
      */
-    caseResults: Array<EvalCaseResult$1>;
+    caseResults: Array<EvalCaseResult>;
     /**
      * Overall execution time in milliseconds
      */
     durationMs: number;
+    /**
+     * Difference between current pass rate and baseline pass rate.
+     * Positive = improvement, negative = regression.
+     * Only present when `baselineResultsFrom` was provided.
+     */
+    deltaPassRate?: number;
+    /**
+     * Number of cases that regressed: passed in baseline, failed now.
+     * Only present when `baselineResultsFrom` was provided.
+     */
+    regressions?: number;
+    /**
+     * Number of cases that improved: failed in baseline, passed now.
+     * Only present when `baselineResultsFrom` was provided.
+     */
+    improvements?: number;
+    /**
+     * Average tool precision across all llm_host cases that have a
+     * `toolsTriggered` expectation (precision = fraction of called tools
+     * that were expected). Only present when at least one such case ran.
+     */
+    datasetToolPrecision?: number;
+    /**
+     * Average tool recall across all llm_host cases that have a
+     * `toolsTriggered` expectation (recall = fraction of required tools
+     * that were actually called). Only present when at least one such case ran.
+     */
+    datasetToolRecall?: number;
+    /**
+     * Harmonic mean of `datasetToolPrecision` and `datasetToolRecall`.
+     * Only present when at least one case contributes precision/recall data.
+     */
+    datasetToolF1?: number;
+    /**
+     * Experiment tracking metadata captured at run time.
+     */
+    metadata?: EvalRunMetadata;
 }
 /**
  * Options for running eval dataset
@@ -3251,12 +4474,6 @@ interface EvalRunnerOptions {
      * ```
      */
     schemas?: Record<string, ZodType>;
-    /**
-     * Judge configuration registry by ID
-     *
-     * Maps config IDs to JudgeConfig for use with expect.passesJudge.configId
-     */
-    judgeConfigs?: Record<string, JudgeConfig>;
     /**
      * Whether to stop on first failure
      * @default false
@@ -3265,7 +4482,71 @@ interface EvalRunnerOptions {
     /**
      * Optional callback called after each case
      */
-    onCaseComplete?: (result: EvalCaseResult$1) => void | Promise<void>;
+    onCaseComplete?: (result: EvalCaseResult) => void | Promise<void>;
+    /**
+     * Maximum number of eval cases to run concurrently.
+     * When > 1, cases run in parallel (ignores stopOnFailure ordering).
+     * @default 1 (sequential)
+     */
+    concurrency?: number;
+    /**
+     * Default iteration count for `llm_host` mode cases that do not specify
+     * `iterations` explicitly. Has no effect on `direct` mode cases (which are
+     * deterministic and always default to 1 iteration).
+     *
+     * Set to 10 for standard runs or 20 for release gates. Individual cases can
+     * still override this with their own `iterations` field.
+     *
+     * @default 1 (preserves historical behaviour when not set)
+     *
+     * @example
+     * ```typescript
+     * // Run all llm_host cases 10 times each by default
+     * await runEvalDataset({ dataset, defaultLlmIterations: 10 }, { mcp });
+     * ```
+     */
+    defaultLlmIterations?: number;
+    /**
+     * Default number of judge evaluations for cases that do not specify
+     * `judgeReps` explicitly. Applies to any case with a `passesJudge`
+     * expectation. Per-case `judgeReps` overrides this.
+     *
+     * @default 1 (single judge run)
+     */
+    defaultJudgeReps?: number;
+    /**
+     * When set, only eval cases whose `tags` array contains at least one of
+     * the specified tags are run. Cases without a `tags` field are excluded.
+     * When undefined or empty, all cases run (default behavior).
+     */
+    filterTags?: string[];
+    /**
+     * If set, saves the run results to this file path after completion.
+     * Use with `baselineResultsFrom` on the next run for regression detection.
+     *
+     * @example '.mcp-test-results/baseline.json'
+     */
+    saveResultsTo?: string;
+    /**
+     * If set, loads this file as the baseline and computes delta metrics vs the current run.
+     * Populates `EvalRunnerResult.deltaPassRate`, `.regressions`, `.improvements`,
+     * and tags each `EvalCaseResult.baselinePass`.
+     */
+    baselineResultsFrom?: string;
+    /**
+     * LLM host model identifier to record in run metadata.
+     * Use this to identify which model was used when running llm_host cases.
+     *
+     * @example 'claude-opus-4-20250514'
+     */
+    llmHostModel?: string;
+    /**
+     * Judge model identifier to record in run metadata.
+     * Use this to identify which model was used for judge evaluations.
+     *
+     * @example 'claude-sonnet-4-20250514'
+     */
+    judgeModel?: string;
 }
 /**
  * Options for running a single eval case
@@ -3279,17 +4560,14 @@ interface EvalCaseOptions {
      * Schema registry for schema validation by name
      */
     schemas?: Record<string, ZodType>;
-    /**
-     * Judge configuration registry by ID
-     */
-    judgeConfigs?: Record<string, JudgeConfig>;
 }
 /**
- * Runs a single eval case and returns the result
+ * Runs a single eval case and returns the result.
+ * When `evalCase.iterations > 1`, runs the case `iterations` times and reports accuracy in the returned result.
  *
  * @param evalCase - The eval case to run
  * @param context - Context containing mcp, testInfo, expect
- * @param options - Optional configuration (datasetName, schemas, judgeConfigs)
+ * @param options - Optional configuration (datasetName, schemas)
  * @returns The result of running the eval case
  *
  * @example
@@ -3303,131 +4581,166 @@ interface EvalCaseOptions {
  * expect(result.pass).toBe(true);
  * ```
  */
-declare function runEvalCase(evalCase: EvalCase, context: EvalContext, options?: EvalCaseOptions): Promise<EvalCaseResult$1>;
+declare function runEvalCase(evalCase: EvalCase, context: EvalContext, options?: EvalCaseOptions): Promise<EvalCaseResult>;
+declare function runEvalDataset(options: EvalRunnerOptions, context: EvalContext): Promise<EvalRunnerResult>;
 /**
- * Runs an eval dataset against an MCP server
+ * Saves eval results to a JSON file for use as a baseline in future runs.
  *
- * This function composes runEvalCase() for each case in the dataset,
- * adding dataset-level features like stopOnFailure and callbacks.
+ * @param result - The eval run result to save
+ * @param filePath - Path to write the JSON file (parent dirs created automatically)
+ */
+declare function saveBaseline(result: EvalRunnerResult, filePath: string): Promise<void>;
+/**
+ * Loads a previously saved baseline from a JSON file.
  *
- * @param options - Eval runner options (dataset, schemas, judgeConfigs)
- * @param context - Eval context (mcp fixture, optional testInfo, optional expect)
- * @returns Eval results
+ * @param filePath - Path to the JSON file written by saveBaseline
+ * @returns The saved EvalRunnerResult
+ * @throws If the file cannot be read or parsed
+ */
+declare function loadBaseline(filePath: string): Promise<EvalRunnerResult>;
+/** Outcome of comparing two servers on a single eval case. */
+type ComparisonOutcome = 'A_WINS' | 'B_WINS' | 'TIE' | 'BOTH_FAIL';
+/** Result of comparing a single eval case across two servers. */
+interface CaseComparisonResult {
+    /** Case ID */
+    id: string;
+    /** Comparison outcome */
+    outcome: ComparisonOutcome;
+    /** Result from server A */
+    serverA: EvalCaseResult;
+    /** Result from server B */
+    serverB: EvalCaseResult;
+}
+/** Aggregated result of running a dataset against two servers. */
+interface ServerComparisonResult {
+    /** Dataset name */
+    dataset: string;
+    /** Total cases compared (cases present in both runs) */
+    total: number;
+    /** Cases where server A passed and server B failed */
+    aWins: number;
+    /** Cases where server B passed and server A failed */
+    bWins: number;
+    /** Cases where both passed */
+    ties: number;
+    /** Cases where both failed */
+    bothFail: number;
+    /** Raw count of cases where both servers failed (same as bothFail) */
+    bothFailCount: number;
+    /** Number of cases where at least one server passed (aWins + bWins + ties; excludes BOTH_FAIL) */
+    decidedCases: number;
+    /** Fraction of total cases where both servers failed (bothFail / total) */
+    failureAlignment: number;
+    /** A win rate (aWins / decidedCases, excludes BOTH_FAIL) */
+    aWinRate: number;
+    /** B win rate (bWins / decidedCases, excludes BOTH_FAIL) */
+    bWinRate: number;
+    /** Tie rate (ties / decidedCases, excludes BOTH_FAIL) */
+    tieRate: number;
+    /** Per-case comparison results */
+    cases: CaseComparisonResult[];
+    /** Full result from server A */
+    serverAResult: EvalRunnerResult;
+    /** Full result from server B */
+    serverBResult: EvalRunnerResult;
+    /** Total duration in milliseconds */
+    durationMs: number;
+}
+/**
+ * Options for `runServerComparison`.
+ * Same as `EvalRunnerOptions` without baseline-specific fields.
+ */
+type ServerComparisonOptions = Omit<EvalRunnerOptions, 'saveResultsTo' | 'baselineResultsFrom'>;
+/**
+ * Runs the same eval dataset against two MCP servers in parallel and
+ * returns a detailed per-case comparison of results.
  *
- * @example
- * // Basic usage
- * const result = await runEvalDataset(
- *   {
- *     dataset,
- *     schemas: { WeatherResponse: WeatherSchema },
- *   },
- *   { mcp }
- * );
+ * Both servers receive identical cases and options. The comparison uses
+ * simple pass/fail per case: A_WINS means A passed and B failed, etc.
+ *
+ * @param options - Eval dataset and runner options (shared between both servers)
+ * @param contextA - MCP context for server A (e.g., Glean MCP)
+ * @param contextB - MCP context for server B (e.g., native MCP)
+ * @returns Comparison result with per-case outcomes and aggregate win rates
  *
  * @example
- * // With MCP reporter integration
- * test('eval dataset', async ({ mcp }, testInfo) => {
- *   const result = await runEvalDataset(
- *     { dataset },
- *     { mcp, testInfo }  // testInfo enables MCP reporter
- *   );
- * });
+ * ```typescript
+ * const comparison = await runServerComparison(
+ *   { dataset },
+ *   { mcp: gleanMcpFixture },
+ *   { mcp: nativeMcpFixture }
+ * );
+ * console.log(`Glean MCP wins: ${(comparison.aWinRate * 100).toFixed(1)}%`);
+ * console.log(`Native MCP wins: ${(comparison.bWinRate * 100).toFixed(1)}%`);
+ * ```
  */
-declare function runEvalDataset(options: EvalRunnerOptions, context: EvalContext): Promise<EvalRunnerResult>;
+declare function runServerComparison(options: ServerComparisonOptions, contextA: EvalContext, contextB: EvalContext): Promise<ServerComparisonResult>;
 /**
  * LLM Host Simulation - Main entry point
  *
- * Provides the public API for simulating LLM hosts interacting
- * with MCP servers through actual LLM providers.
+ * All providers (openai, anthropic, google, azure, mistral, ollama, deepseek,
+ * openrouter, xai) run through the Vercel AI SDK orchestrator, which uses
+ * generateText + stopWhen for a uniform multi-turn tool-calling loop with
+ * built-in latency decomposition.
+ *
+ * Required packages per provider:
+ *   openai      → npm install ai @ai-sdk/openai
+ *   anthropic   → npm install ai @ai-sdk/anthropic
+ *   google      → npm install ai @ai-sdk/google
+ *   azure       → npm install ai @ai-sdk/azure
+ *   mistral     → npm install ai @ai-sdk/mistral
+ *   ollama      → npm install ai @ai-sdk/ollama  (local, no API key)
+ *   deepseek    → npm install ai @ai-sdk/deepseek
+ *   openrouter  → npm install ai @openrouter/ai-sdk-provider
+ *   xai         → npm install ai @ai-sdk/xai
  */
 /**
- * Simulates an LLM host interacting with an MCP server
+ * Simulates an LLM host interacting with an MCP server.
+ *
+ * The LLM chooses which tools to call based solely on their descriptions and
+ * schemas, testing discoverability and parameter clarity at the level a real
+ * user (via Claude Desktop, ChatGPT, etc.) would experience.
  *
- * This function uses actual LLM providers (OpenAI or Anthropic) to test
- * MCP servers through natural language scenarios. The LLM chooses which
- * tools to call based on their descriptions, testing discoverability and
- * parameter clarity.
+ * All providers run through the Vercel AI SDK's `generateText` (with `stopWhen`),
+ * which handles multi-turn tool calling natively and provides per-step latency
+ * decomposition (llmDurationMs vs. mcpDurationMs).
  *
  * @param mcp - MCP fixture API
- * @param scenario - Natural language prompt describing what to do
- * @param config - LLM host configuration
- * @returns Simulation result with tool calls and final response
+ * @param scenario - Natural language prompt describing what the LLM should do
+ * @param config - LLM host configuration (provider, model, temperature, etc.)
+ * @returns Simulation result with tool calls, final response, and latency data
  *
  * @example
  * ```typescript
  * const result = await simulateLLMHost(mcp,
- *   "Get the weather for London",
- *   {
- *     provider: 'openai',
- *     model: 'gpt-4o'
- *   }
+ *   "Find recent documents about MCP testing frameworks",
+ *   { provider: 'anthropic', model: 'claude-3-5-sonnet-20241022' }
  * );
  *
  * expect(result.success).toBe(true);
- * expect(result.toolCalls).toContainEqual({
- *   name: 'get_weather',
- *   arguments: { city: 'London' }
- * });
+ * expect(result.toolCalls.map(c => c.name)).toContain('search');
  * ```
  */
 declare function simulateLLMHost(mcp: MCPFixtureApi, scenario: string, config: LLMHostConfig): Promise<LLMHostSimulationResult>;
 /**
- * Checks if the required SDK is available for a given provider
+ * Returns true if the given provider is supported.
  *
- * This performs a quick check without actually loading the SDK.
- * The actual SDK loading happens in the adapter when simulation runs.
- *
- * @param provider - LLM provider to check
- * @returns true if an adapter is registered for the provider
+ * Note: this does not check whether the required `@ai-sdk/*` package is
+ * installed — that is validated at simulation time with a helpful error.
  */
 declare function isProviderAvailable(provider: LLMProvider): boolean;
 /**
- * Gets a helpful error message for missing dependencies
+ * Returns a human-readable installation message for a given provider.
  *
- * @param provider - LLM provider
- * @returns Error message with installation instructions
+ * @remarks This is a diagnostic utility for checking whether the optional
+ * `@ai-sdk/*` packages are installed. Not part of the primary usage path.
  */
 declare function getMissingDependencyMessage(provider: LLMProvider): string;
-/**
- * Tool call validator for LLM host mode
- *
- * Validates that the LLM made the expected tool calls with correct arguments
- */
-/**
- * Tool call validation function signature
- */
-type ToolCallValidator = (evalCase: EvalCase, response: unknown) => Promise<EvalExpectationResult>;
-/**
- * Creates a tool call validator for LLM host mode
- *
- * Validates that the LLM made the expected tool calls with correct arguments.
- * Supports partial argument matching and optional calls.
- *
- * @returns Validator function
- *
- * @example
- * ```typescript
- * // In your eval case:
- * {
- *   "id": "weather-london",
- *   "mode": "llm_host",
- *   "scenario": "Get the weather for London",
- *   "expectedToolCalls": [
- *     {
- *       "name": "get_weather",
- *       "arguments": { "city": "London" },
- *       "required": true
- *     }
- *   ]
- * }
- * ```
- */
-declare function createToolCallValidator(): ToolCallValidator;
 /**
  * Creates an LLM judge for evaluating tool responses
  *
@@ -3494,7 +4807,7 @@ interface MCPConformanceOptions {
 /**
  * Individual check result
  */
-interface MCPConformanceCheck$1 {
+interface MCPConformanceCheck {
     name: string;
     pass: boolean;
     message: string;
@@ -3539,7 +4852,7 @@ interface MCPConformanceResult {
     /**
      * List of check results
      */
-    checks: MCPConformanceCheck$1[];
+    checks: MCPConformanceCheck[];
     /**
      * Raw MCP responses for snapshotting
      *
@@ -3588,229 +4901,6 @@ interface MCPConformanceResult {
  */
 declare function runConformanceChecks(mcp: MCPFixtureApi, options?: MCPConformanceOptions, testInfo?: TestInfo): Promise<MCPConformanceResult>;
-/**
- * Reporter-specific type definitions
- *
- * These types are used by the MCP reporter and UI.
- *
- * @packageDocumentation
- */
-/**
- * Individual conformance check result
- */
-interface MCPConformanceCheck {
-    /**
-     * Check name (e.g., 'server_info_present', 'list_tools_succeeds')
-     */
-    name: string;
-    /**
-     * Whether the check passed
-     */
-    pass: boolean;
-    /**
-     * Human-readable message describing the result
-     */
-    message: string;
-}
-/**
- * Conformance check result as stored in reporter data
- */
-interface MCPConformanceResultData {
-    /**
-     * Test title where conformance check was run
-     */
-    testTitle: string;
-    /**
-     * Whether all checks passed
-     */
-    pass: boolean;
-    /**
-     * Individual check results
-     */
-    checks: MCPConformanceCheck[];
-    /**
-     * Server info if available
-     */
-    serverInfo?: {
-        name?: string;
-        version?: string;
-    };
-    /**
-     * Number of tools discovered
-     */
-    toolCount: number;
-    /**
-     * Auth type used for this check
-     */
-    authType?: AuthType;
-    /**
-     * Project name
-     */
-    project?: string;
-}
-/**
- * Server capabilities data from mcp-list-tools attachment
- */
-interface MCPServerCapabilitiesData {
-    /**
-     * Test title where listTools was called
-     */
-    testTitle: string;
-    /**
-     * List of tools available on the server
-     */
-    tools: Array<{
-        name: string;
-        description?: string;
-    }>;
-    /**
-     * Total number of tools
-     */
-    toolCount: number;
-    /**
-     * Auth type used for this test
-     */
-    authType?: AuthType;
-    /**
-     * Project name
-     */
-    project?: string;
-}
-/**
- * Result of a single eval case
- */
-interface EvalCaseResult {
-    /**
-     * Case ID
-     */
-    id: string;
-    /**
-     * Dataset name this case belongs to
-     */
-    datasetName: string;
-    /**
-     * MCP tool name that was called
-     */
-    toolName: string;
-    /**
-     * Source of this result
-     */
-    source: ResultSource;
-    /**
-     * Overall pass/fail status
-     */
-    pass: boolean;
-    /**
-     * Tool response
-     */
-    response?: unknown;
-    /**
-     * Error if tool call failed
-     */
-    error?: string;
-    /**
-     * Expectation results
-     */
-    expectations: Partial<Record<ExpectationType, EvalExpectationResult>>;
-    /**
-     * Authentication type used for this test
-     */
-    authType?: AuthType;
-    /**
-     * Playwright project name this test belongs to
-     */
-    project?: string;
-    /**
-     * Execution time in milliseconds
-     */
-    durationMs: number;
-    /**
-     * @deprecated Mode is inferred from test context, not displayed in reports
-     */
-    mode?: 'direct' | 'llm_host';
-}
-/**
- * Aggregated MCP eval run data
- */
-interface MCPEvalRunData {
-    /**
-     * Run timestamp (ISO 8601)
-     */
-    timestamp: string;
-    /**
-     * Total duration in milliseconds
-     */
-    durationMs: number;
-    /**
-     * Environment info
-     */
-    environment: {
-        ci: boolean;
-        node: string;
-        platform: string;
-    };
-    /**
-     * Aggregate metrics
-     */
-    metrics: {
-        /**
-         * Total number of eval cases
-         */
-        total: number;
-        /**
-         * Number of passed cases
-         */
-        passed: number;
-        /**
-         * Number of failed cases
-         */
-        failed: number;
-        /**
-         * Pass rate (0-1)
-         */
-        passRate: number;
-        /**
-         * Dataset breakdown: dataset name -> count
-         */
-        datasetBreakdown: Record<string, number>;
-        /**
-         * Expectation type breakdown
-         */
-        expectationBreakdown: ExpectationBreakdown;
-    };
-    /**
-     * All eval results from this run
-     */
-    results: EvalCaseResult[];
-    /**
-     * Conformance check results (optional)
-     */
-    conformanceChecks?: MCPConformanceResultData[];
-    /**
-     * Server capabilities discovered via listTools (optional)
-     */
-    serverCapabilities?: MCPServerCapabilitiesData[];
-}
-/**
- * Historical summary for trend charts
- */
-interface MCPEvalHistoricalSummary {
-    timestamp: string;
-    total: number;
-    passed: number;
-    failed: number;
-    passRate: number;
-    durationMs: number;
-}
-/**
- * Complete data structure passed to UI
- */
-interface MCPEvalData {
-    runData: MCPEvalRunData;
-    historical: MCPEvalHistoricalSummary[];
-}
 /**
  * Reporter types - re-exported from canonical source
  *
@@ -3831,7 +4921,7 @@ interface MCPEvalReporterConfig {
     outputDir?: string;
     /**
      * Auto-open report in browser after test run
-     * @default true (disabled in CI)
+     * @default false
      */
     autoOpen?: boolean;
     /**
@@ -3854,4 +4944,4 @@ interface MCPEvalReporterConfig {
     includeAutoTracking?: boolean;
 }
-export { type AuthType, type BuiltInSanitizer, CLIOAuthClient, type CLIOAuthClientConfig, type CLIOAuthResult, type ContentBlock, type CreateMCPClientOptions, DiscoveryError, ENV_VAR_NAMES, type EvalCase, type EvalCaseResult$1 as EvalCaseResult, EvalCaseSchema, type EvalContext, type EvalDataset, EvalDatasetSchema, type EvalExpectBlock, type EvalExpectationResult, type EvalMode, type EvalRunnerOptions, type EvalRunnerResult, type ExpectationBreakdown, type ExpectationResultMap, type ExpectationType, type ExpectedToolCall, type FieldRemovalSanitizer, type Judge, type JudgeConfig, type JudgeMatcherOptions, type JudgeResult, type LLMHostConfig, type LLMHostSimulationResult, type LLMHostSimulator, type LLMProvider, type LLMToolCall, type LoadDatasetOptions, type MCPAuthConfig, type MCPConfig, MCPConfigSchema, type MCPConformanceCheck$1 as MCPConformanceCheck, type MCPConformanceOptions, type MCPConformanceRaw, type MCPConformanceResult, type MCPConformanceResultData, type MCPEvalData, type MCPEvalHistoricalSummary, type MCPEvalReporterConfig, type MCPEvalRunData, type MCPFixtureApi, type MCPFixtureOptions, type MCPHostCapabilities, type MCPOAuthConfig, type MCPServerCapabilitiesData, MCP_PROTOCOL_VERSION, type NormalizedToolResponse, type OAuthSetupConfig, type PatternValidatorOptions, PlaywrightOAuthClientProvider, type PlaywrightOAuthClientProviderConfig, type PredicateResult, type ProtectedResourceDiscoveryResult, type ProtectedResourceMetadata, type ProviderKind, type RegexSanitizer, type ResultSource, type SchemaRegistry, type SchemaValidatorOptions, type SerializedEvalDataset, type SizeValidatorOptions, type SnapshotSanitizer, type StoredClientInfo, type StoredOAuthState, type StoredServerMetadata, type StoredTokens, type TextValidatorOptions, type TokenResult, type ToolCallValidationResult, type ToolCallValidator, type ToolPredicate, type UsageMetrics, type ValidationResult, closeMCPClient, createJudge, createMCPClientForConfig, createMCPFixture, 
createTokenAuthHeaders, createToolCallValidator, discoverAuthorizationServer, discoverProtectedResource, expect, extractText, extractText as extractTextFromResponse, getMissingDependencyMessage, getResponseSizeBytes, hasValidTokens, injectTokens, isHttpConfig, isProviderAvailable, isStdioConfig, isTokenExpired, isTokenExpiringSoon, loadEvalDataset, loadEvalDatasetFromObject, loadTokens, loadTokensFromEnv, normalizeToolResponse, normalizeWhitespace, performOAuthSetup, performOAuthSetupIfNeeded, runConformanceChecks, runEvalCase, runEvalDataset, simulateLLMHost, test, validateAccessToken, validateError, validateEvalCase, validateEvalDataset, validateMCPConfig, validatePattern, validateResponse, validateSchema, validateSize, validateText };
+export { type AuthType, BUILT_IN_RUBRICS, type BuiltInRubric, type BuiltInSanitizer, CLIOAuthClient, type CLIOAuthClientConfig, type CLIOAuthResult, type CaseComparisonResult, type ClientCredentialsConfig, type ComparisonOutcome, type ContentBlock, type CreateMCPClientOptions, DiscoveryError, ENV_VAR_NAMES, type EvalCase, type EvalCaseResult, EvalCaseSchema, type EvalContext, type EvalDataset, EvalDatasetSchema, type EvalExpectBlock, type EvalExpectationResult, type EvalMode, type EvalRunnerOptions, type EvalRunnerResult, type ExpectationBreakdown, type ExpectationResultMap, type ExpectationType, type FieldRemovalSanitizer, type HttpMCPConfig, type IterationResult, type Judge, type JudgeConfig, type JudgeMatcherOptions, type JudgeResult, type JudgeValidatorConfig, type LLMHostConfig, type LLMHostSimulationResult, type LLMHostSimulator, type LLMProvider, type LLMToolCall, type LoadDatasetOptions, type MCPAuthConfig, type MCPClientCredentialsConfig, type MCPConfig, MCPConfigSchema, type MCPConformanceCheck, type MCPConformanceOptions, type MCPConformanceRaw, type MCPConformanceResult, type MCPConformanceResultData, type MCPEvalData, type MCPEvalHistoricalSummary, type MCPEvalReporterConfig, type MCPEvalRunData, type MCPFixtureApi, type MCPFixtureOptions, type MCPHostCapabilities, type MCPOAuthConfig, type MCPServerCapabilitiesData, MCP_PROTOCOL_VERSION, type NormalizedToolResponse, type OAuthSetupConfig, type PatternValidatorOptions, PlaywrightOAuthClientProvider, type PlaywrightOAuthClientProviderConfig, type PredicateResult, type ProtectedResourceDiscoveryResult, type ProtectedResourceMetadata, type ProviderKind, type RegexSanitizer, type ResultSource, type RubricSpec, type SchemaRegistry, type SchemaValidatorOptions, type SerializedEvalDataset, type ServerComparisonOptions, type ServerComparisonResult, type SizeValidatorOptions, type SnapshotSanitizer, SnapshotSanitizers, type StdioMCPConfig, type StoredClientInfo, type StoredOAuthState, type 
StoredServerMetadata, type StoredTokens, type TextValidatorOptions, type TokenResult, type ToolCallCountOptions, type ToolCallExpectation, type ToolPredicate, type UsageMetrics, type ValidationResult, closeMCPClient, createJudge, createMCPClientForConfig, createMCPFixture, createTokenAuthHeaders, discoverAuthorizationServer, discoverProtectedResource, expect, extractText, getMissingDependencyMessage, getResponseSizeBytes, hasValidTokens, injectTokens, isBuiltInRubric, isHttpConfig, isProviderAvailable, isStdioConfig, isTokenExpired, isTokenExpiringSoon, loadBaseline, loadEvalDataset, loadEvalDatasetFromObject, loadTokens, loadTokensFromEnv, normalizeToolResponse, normalizeWhitespace, performClientCredentialsFlow, performOAuthSetup, performOAuthSetupIfNeeded, resolveRubric, runConformanceChecks, runEvalCase, runEvalDataset, runServerComparison, saveBaseline, simulateLLMHost, test, validateAccessToken, validateError, validateEvalCase, validateEvalDataset, validateJudge, validateMCPConfig, validatePattern, validateResponse, validateSchema, validateSize, validateText, validateToolCallCount, validateToolCalls };