npm - @agentv/core - Versions diffs - 2.5.8 → 2.7.1-next.1 - Mend

@agentv/core 2.5.8 → 2.7.1-next.1

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (24) hide show

package/dist/{chunk-LGQ5OPJD.js → chunk-6W5E3VR6.js} +383 -54
package/dist/chunk-6W5E3VR6.js.map +1 -0
package/dist/chunk-HFSYZHGF.js +82 -0
package/dist/chunk-HFSYZHGF.js.map +1 -0
package/dist/chunk-HMXZ2AX4.js +112 -0
package/dist/chunk-HMXZ2AX4.js.map +1 -0
package/dist/esm-5Q4BZALM.js +968 -0
package/dist/esm-5Q4BZALM.js.map +1 -0
package/dist/evaluation/validation/index.cjs +337 -70
package/dist/evaluation/validation/index.cjs.map +1 -1
package/dist/evaluation/validation/index.js +294 -69
package/dist/evaluation/validation/index.js.map +1 -1
package/dist/index.cjs +9221 -4040
package/dist/index.cjs.map +1 -1
package/dist/index.d.cts +1717 -234
package/dist/index.d.ts +1717 -234
package/dist/index.js +6563 -3147
package/dist/index.js.map +1 -1
package/dist/otlp-json-file-exporter-77FDBRSY.js +7 -0
package/dist/otlp-json-file-exporter-77FDBRSY.js.map +1 -0
package/dist/simple-trace-file-exporter-S76DMABU.js +7 -0
package/dist/simple-trace-file-exporter-S76DMABU.js.map +1 -0
package/package.json +18 -5
package/dist/chunk-LGQ5OPJD.js.map +0 -1

package/dist/index.d.ts CHANGED Viewed

@@ -1,5 +1,217 @@
-import { z } from 'zod';
 import * as ai from 'ai';
+import { z } from 'zod';
+type ChatMessageRole = 'system' | 'user' | 'assistant' | 'tool' | 'function';
+interface ChatMessage {
+    readonly role: ChatMessageRole;
+    readonly content: string;
+    readonly name?: string;
+}
+type ChatPrompt = readonly ChatMessage[];
+type ProviderKind = 'azure' | 'anthropic' | 'gemini' | 'codex' | 'copilot' | 'copilot-cli' | 'pi-coding-agent' | 'pi-agent-sdk' | 'claude' | 'cli' | 'mock' | 'vscode' | 'vscode-insiders';
+/** Callbacks for real-time observability during provider execution */
+interface ProviderStreamCallbacks {
+    onToolCallStart?: (toolName: string, toolCallId?: string) => void;
+    onToolCallEnd?: (toolName: string, input: unknown, output: unknown, durationMs: number, toolCallId?: string) => void;
+    onLlmCallEnd?: (model: string, tokenUsage?: ProviderTokenUsage) => void;
+}
+interface ProviderRequest {
+    readonly question: string;
+    readonly systemPrompt?: string;
+    readonly guidelines?: string;
+    readonly guideline_patterns?: readonly string[];
+    readonly chatPrompt?: ChatPrompt;
+    readonly inputFiles?: readonly string[];
+    readonly evalCaseId?: string;
+    readonly attempt?: number;
+    readonly maxOutputTokens?: number;
+    readonly temperature?: number;
+    readonly metadata?: JsonObject;
+    readonly signal?: AbortSignal;
+    /** Working directory override (e.g., from workspace_template) */
+    readonly cwd?: string;
+    /** VS Code .code-workspace file (resolved from workspace.template) */
+    readonly workspaceFile?: string;
+    /** When true, AgentV captures file changes from workspace — provider should skip forced diff prompt */
+    readonly captureFileChanges?: boolean;
+    /** Real-time observability callbacks (optional) */
+    readonly streamCallbacks?: ProviderStreamCallbacks;
+}
+/**
+ * A tool call within an output message.
+ * Represents a single tool invocation with its input and optional output.
+ */
+interface ToolCall {
+    /** Tool name */
+    readonly tool: string;
+    /** Tool input arguments */
+    readonly input?: unknown;
+    /** Tool output result */
+    readonly output?: unknown;
+    /** Stable identifier for pairing tool calls */
+    readonly id?: string;
+    /** ISO 8601 timestamp when the tool call started */
+    readonly startTime?: string;
+    /** ISO 8601 timestamp when the tool call ended */
+    readonly endTime?: string;
+    /** Duration of the tool call in milliseconds */
+    readonly durationMs?: number;
+}
+/**
+ * An output message from agent execution.
+ * Represents a single message in the conversation with optional tool calls.
+ */
+interface Message {
+    /** Message role (e.g., 'assistant', 'user', 'tool') */
+    readonly role: string;
+    /** Optional name for the message sender */
+    readonly name?: string;
+    /** Message content */
+    readonly content?: unknown;
+    /** Tool calls made in this message */
+    readonly toolCalls?: readonly ToolCall[];
+    /** ISO 8601 timestamp when the message started */
+    readonly startTime?: string;
+    /** ISO 8601 timestamp when the message ended */
+    readonly endTime?: string;
+    /** Duration of the message in milliseconds */
+    readonly durationMs?: number;
+    /** Provider-specific metadata */
+    readonly metadata?: Record<string, unknown>;
+    /** Per-message token usage metrics (optional) */
+    readonly tokenUsage?: ProviderTokenUsage;
+}
+/** @deprecated Use Message instead */
+type OutputMessage = Message;
+/**
+ * Token usage metrics reported by provider.
+ */
+interface ProviderTokenUsage {
+    /** Input/prompt tokens consumed */
+    readonly input: number;
+    /** Output/completion tokens generated */
+    readonly output: number;
+    /** Cached tokens (optional, provider-specific) */
+    readonly cached?: number;
+}
+interface ProviderResponse {
+    readonly raw?: unknown;
+    readonly usage?: JsonObject;
+    /** Output messages from agent execution (primary source for tool trajectory) */
+    readonly output?: readonly Message[];
+    /** Token usage metrics (optional) */
+    readonly tokenUsage?: ProviderTokenUsage;
+    /** Total cost in USD (optional) */
+    readonly costUsd?: number;
+    /** Execution duration in milliseconds (optional) */
+    readonly durationMs?: number;
+    /** ISO 8601 timestamp when execution started (optional) */
+    readonly startTime?: string;
+    /** ISO 8601 timestamp when execution ended (optional) */
+    readonly endTime?: string;
+}
+interface Provider {
+    readonly id: string;
+    readonly kind: ProviderKind;
+    readonly targetName: string;
+    invoke(request: ProviderRequest): Promise<ProviderResponse>;
+    /**
+     * Optional capability marker for provider-managed batching (single session handling multiple requests).
+     */
+    readonly supportsBatch?: boolean;
+    /**
+     * Optional batch invocation hook. When defined alongside supportsBatch=true,
+     * the orchestrator may send multiple requests in a single provider session.
+     */
+    invokeBatch?(requests: readonly ProviderRequest[]): Promise<readonly ProviderResponse[]>;
+    /**
+     * Optional method to get a Vercel AI SDK LanguageModel instance for structured output generation.
+     * Used by evaluators that need generateObject/generateText from the AI SDK.
+     */
+    asLanguageModel?(): ai.LanguageModel;
+}
+type EnvLookup = Readonly<Record<string, string | undefined>>;
+interface TargetDefinition {
+    readonly name: string;
+    readonly provider: ProviderKind | string;
+    readonly judge_target?: string | undefined;
+    readonly workers?: number | undefined;
+    readonly provider_batching?: boolean | undefined;
+    readonly providerBatching?: boolean | undefined;
+    readonly endpoint?: string | unknown | undefined;
+    readonly resource?: string | unknown | undefined;
+    readonly resourceName?: string | unknown | undefined;
+    readonly api_key?: string | unknown | undefined;
+    readonly apiKey?: string | unknown | undefined;
+    readonly deployment?: string | unknown | undefined;
+    readonly deploymentName?: string | unknown | undefined;
+    readonly model?: string | unknown | undefined;
+    readonly version?: string | unknown | undefined;
+    readonly api_version?: string | unknown | undefined;
+    readonly variant?: string | unknown | undefined;
+    readonly thinking_budget?: number | unknown | undefined;
+    readonly thinkingBudget?: number | unknown | undefined;
+    readonly temperature?: number | unknown | undefined;
+    readonly max_output_tokens?: number | unknown | undefined;
+    readonly maxTokens?: number | unknown | undefined;
+    readonly executable?: string | unknown | undefined;
+    readonly command?: string | unknown | undefined;
+    readonly binary?: string | unknown | undefined;
+    readonly args?: unknown | undefined;
+    readonly arguments?: unknown | undefined;
+    readonly cwd?: string | unknown | undefined;
+    readonly timeout_seconds?: number | unknown | undefined;
+    readonly timeoutSeconds?: number | unknown | undefined;
+    readonly log_dir?: string | unknown | undefined;
+    readonly logDir?: string | unknown | undefined;
+    readonly log_directory?: string | unknown | undefined;
+    readonly logDirectory?: string | unknown | undefined;
+    readonly log_format?: string | unknown | undefined;
+    readonly logFormat?: string | unknown | undefined;
+    readonly log_output_format?: string | unknown | undefined;
+    readonly logOutputFormat?: string | unknown | undefined;
+    readonly system_prompt?: string | unknown | undefined;
+    readonly systemPrompt?: string | unknown | undefined;
+    readonly max_turns?: number | unknown | undefined;
+    readonly maxTurns?: number | unknown | undefined;
+    readonly max_budget_usd?: number | unknown | undefined;
+    readonly maxBudgetUsd?: number | unknown | undefined;
+    readonly response?: string | unknown | undefined;
+    readonly delayMs?: number | unknown | undefined;
+    readonly delayMinMs?: number | unknown | undefined;
+    readonly delayMaxMs?: number | unknown | undefined;
+    readonly wait?: boolean | unknown | undefined;
+    readonly dry_run?: boolean | unknown | undefined;
+    readonly dryRun?: boolean | unknown | undefined;
+    readonly subagent_root?: string | unknown | undefined;
+    readonly subagentRoot?: string | unknown | undefined;
+    readonly workspace_template?: string | unknown | undefined;
+    readonly workspaceTemplate?: string | unknown | undefined;
+    readonly command_template?: string | unknown | undefined;
+    readonly commandTemplate?: string | unknown | undefined;
+    readonly files_format?: string | unknown | undefined;
+    readonly filesFormat?: string | unknown | undefined;
+    readonly attachments_format?: string | unknown | undefined;
+    readonly attachmentsFormat?: string | unknown | undefined;
+    readonly env?: unknown | undefined;
+    readonly healthcheck?: unknown | undefined;
+    readonly cli_url?: string | unknown | undefined;
+    readonly cliUrl?: string | unknown | undefined;
+    readonly cli_path?: string | unknown | undefined;
+    readonly cliPath?: string | unknown | undefined;
+    readonly github_token?: string | unknown | undefined;
+    readonly githubToken?: string | unknown | undefined;
+    readonly max_retries?: number | unknown | undefined;
+    readonly maxRetries?: number | unknown | undefined;
+    readonly retry_initial_delay_ms?: number | unknown | undefined;
+    readonly retryInitialDelayMs?: number | unknown | undefined;
+    readonly retry_max_delay_ms?: number | unknown | undefined;
+    readonly retryMaxDelayMs?: number | unknown | undefined;
+    readonly retry_backoff_factor?: number | unknown | undefined;
+    readonly retryBackoffFactor?: number | unknown | undefined;
+    readonly retry_status_codes?: unknown | undefined;
+    readonly retryStatusCodes?: unknown | undefined;
+}
 /**
  * Trace event types for capturing agent execution traces.
@@ -37,7 +249,21 @@ interface TraceSummary {
     readonly durationMs?: number;
     /** Per-tool duration arrays in milliseconds (optional) */
     readonly toolDurations?: Readonly<Record<string, readonly number[]>>;
+    /** ISO 8601 timestamp when execution started (derived from earliest span) */
+    readonly startTime?: string;
+    /** ISO 8601 timestamp when execution ended (derived from latest span) */
+    readonly endTime?: string;
+    /** Number of LLM calls (assistant messages) */
+    readonly llmCallCount?: number;
 }
+/**
+ * Argument matching mode for tool_trajectory expected items.
+ * - 'exact': bidirectional deep equality, no extra keys allowed (default)
+ * - 'superset': actual args must contain all expected keys (extras OK)
+ * - 'subset': actual args must be a subset of expected keys (no unexpected keys)
+ * - 'ignore': skip argument checking entirely
+ */
+type ArgsMatchMode = 'exact' | 'ignore' | 'subset' | 'superset';
 /**
  * Configuration for tool_trajectory evaluator.
  */
@@ -45,13 +271,18 @@ interface ToolTrajectoryEvaluatorConfig {
     readonly name: string;
     readonly type: 'tool_trajectory';
     /** Matching mode */
-    readonly mode: 'any_order' | 'in_order' | 'exact';
+    readonly mode: 'any_order' | 'in_order' | 'exact' | 'subset' | 'superset';
     /** Minimum call counts per tool (for any_order mode) */
     readonly minimums?: Readonly<Record<string, number>>;
-    /** Expected tool sequence (for in_order/exact modes) */
+    /** Expected tool sequence (for in_order/exact/subset/superset modes) */
     readonly expected?: readonly ToolTrajectoryExpectedItem[];
     /** Optional weight for top-level aggregation (defaults to 1.0) */
     readonly weight?: number;
+    readonly required?: boolean | number;
+    /** When true, inverts the evaluator score (1 - score) and swaps pass/fail verdict */
+    readonly negate?: boolean;
+    /** Default argument matching mode for all expected items (defaults to 'exact') */
+    readonly argsMatch?: ArgsMatchMode | readonly string[];
 }
 /**
  * Expected tool call item in a trajectory sequence.
@@ -62,21 +293,35 @@ interface ToolTrajectoryExpectedItem {
     readonly args?: 'any' | Record<string, unknown>;
     /** Optional maximum duration in milliseconds for latency assertions */
     readonly maxDurationMs?: number;
+    /** Per-item argument matching mode override (takes precedence over evaluator-level argsMatch) */
+    readonly argsMatch?: ArgsMatchMode | readonly string[];
 }
 /**
  * Simplified input type for computeTraceSummary.
- * Matches OutputMessage structure without requiring full provider/types import.
+ * Matches Message structure without requiring full provider/types import.
  */
-interface OutputMessageLike {
+interface MessageLike {
+    readonly role?: string;
+    readonly startTime?: string;
+    readonly endTime?: string;
     readonly toolCalls?: readonly {
         readonly tool: string;
+        readonly startTime?: string;
+        readonly endTime?: string;
+        readonly durationMs?: number;
     }[];
 }
 /**
  * Compute a lightweight summary from output messages.
  * Used for default result persistence without payload bloat.
+ *
+ * Derives timing information from span boundaries:
+ * - startTime: earliest startTime across all messages and tool calls
+ * - endTime: latest endTime across all messages and tool calls
+ * - toolDurations: per-tool duration arrays (from durationMs or computed from start/end)
+ * - llmCallCount: count of assistant messages
  */
-declare function computeTraceSummary(messages: readonly OutputMessageLike[]): TraceSummary;
+declare function computeTraceSummary(messages: readonly MessageLike[]): TraceSummary;
 /**
  * Default tool names considered as exploration/read-only operations.
  * Can be overridden per-evaluation via config.
@@ -114,10 +359,15 @@ interface ExecutionMetrics {
     readonly tokenUsage?: TokenUsage;
     readonly costUsd?: number;
     readonly durationMs?: number;
+    /** ISO 8601 timestamp when execution started */
+    readonly startTime?: string;
+    /** ISO 8601 timestamp when execution ended */
+    readonly endTime?: string;
 }
 /**
  * Merge execution metrics from provider response into a trace summary.
  * Returns a new TraceSummary with metrics fields populated.
+ * Provider-level timing takes precedence over span-derived timing.
  *
  * @param summary - Base trace summary from computeTraceSummary
  * @param metrics - Optional execution metrics from provider
@@ -203,7 +453,7 @@ declare function isJsonValue(value: unknown): value is JsonValue;
  * - Either content (string or array of objects) OR tool_calls (for assistant messages)
  */
 declare function isTestMessage(value: unknown): value is TestMessage;
-declare const EVALUATOR_KIND_VALUES: readonly ["code_judge", "llm_judge", "rubric", "composite", "tool_trajectory", "field_accuracy", "latency", "cost", "token_usage"];
+declare const EVALUATOR_KIND_VALUES: readonly ["code_judge", "llm_judge", "rubric", "composite", "tool_trajectory", "field_accuracy", "latency", "cost", "token_usage", "execution_metrics", "agent_judge", "contains", "regex", "is_json", "equals", "rubrics"];
 type EvaluatorKind = (typeof EVALUATOR_KIND_VALUES)[number];
 declare function isEvaluatorKind(value: unknown): value is EvaluatorKind;
 /**
@@ -215,6 +465,43 @@ type TargetAccessConfig = {
     /** Maximum number of target invocations allowed per execution (default: 50) */
     readonly max_calls?: number;
 };
+/**
+ * Configuration for workspace lifecycle scripts (before_all, after_all, before_each, after_each).
+ * Scripts are executed with workspace context passed via stdin.
+ */
+type WorkspaceScriptConfig = {
+    /** Command array to execute (e.g., ["bun", "run", "setup.ts"]) */
+    readonly script: readonly string[];
+    /** Optional timeout in milliseconds (default: 60000 for setup, 30000 for teardown) */
+    readonly timeout_ms?: number;
+    readonly timeoutMs?: number;
+    /** Optional working directory for script execution */
+    readonly cwd?: string;
+};
+/**
+ * Workspace configuration for eval tests.
+ * Can be specified at suite level and overridden per-case.
+ * Merge strategy: template/scripts replaced, env deep-merged.
+ *
+ * Lifecycle hooks follow bun:test/Vitest naming:
+ * - before_all: runs ONCE before first test, creates shared workspace
+ * - after_all: runs ONCE after last test, final cleanup
+ * - before_each: runs before each test (optional)
+ * - after_each: runs after each test (e.g., reset git state)
+ */
+type WorkspaceConfig = {
+    /** Template directory or .code-workspace file. Directories are copied to temp workspace.
+     *  .code-workspace files are used by VS Code providers; CLI providers use the parent directory. */
+    readonly template?: string;
+    /** Script to run once before first test (after workspace creation, before git baseline) */
+    readonly before_all?: WorkspaceScriptConfig;
+    /** Script to run once after last test (before workspace cleanup) */
+    readonly after_all?: WorkspaceScriptConfig;
+    /** Script to run before each test */
+    readonly before_each?: WorkspaceScriptConfig;
+    /** Script to run after each test (e.g., git reset for workspace reuse) */
+    readonly after_each?: WorkspaceScriptConfig;
+};
 type CodeEvaluatorConfig = {
     readonly name: string;
     readonly type: 'code';
@@ -223,6 +510,9 @@ type CodeEvaluatorConfig = {
     readonly cwd?: string;
     readonly resolvedCwd?: string;
     readonly weight?: number;
+    readonly required?: boolean | number;
+    /** When true, inverts the evaluator score (1 - score) and swaps pass/fail verdict */
+    readonly negate?: boolean;
     /** Pass-through configuration for the code_judge script (any unrecognized YAML properties) */
     readonly config?: JsonObject;
     /** When present, enables target access for the script via local proxy */
@@ -250,32 +540,35 @@ type LlmJudgeEvaluatorConfig = {
     readonly resolvedPromptScript?: readonly string[];
     readonly rubrics?: readonly RubricItem[];
     readonly weight?: number;
+    readonly required?: boolean | number;
+    /** When true, inverts the evaluator score (1 - score) and swaps pass/fail verdict */
+    readonly negate?: boolean;
     /** Pass-through configuration for custom evaluator prompts (legacy, prefer prompt.config) */
     readonly config?: Record<string, unknown>;
 };
 /**
  * Score range definition for analytic rubric scoring.
- * Each range maps an integer score band (0-10) to an expected outcome description.
+ * Each range maps an integer score band (0-10) to an outcome description.
  */
 type ScoreRange = {
     /** Inclusive integer range [min, max] within 0-10 */
     readonly score_range: readonly [number, number];
     /** Description of what this score range represents */
-    readonly expected_outcome: string;
+    readonly outcome: string;
 };
 /**
  * Rubric item for LLM judge evaluation.
  * Supports two modes:
- * - Checklist mode: boolean satisfied/not-satisfied with `expected_outcome`
+ * - Checklist mode: boolean satisfied/not-satisfied with `outcome`
  * - Score-range mode: 0-10 integer scoring with `score_ranges`
  */
 type RubricItem = {
     readonly id: string;
     /**
-     * For checklist rubrics: the expected outcome text (required).
+     * For checklist rubrics: the outcome text (required).
      * For score-range rubrics: optional overall criterion description.
      */
-    readonly expected_outcome?: string;
+    readonly outcome?: string;
     readonly weight: number;
     /**
      * Legacy boolean gating (deprecated, treated as required_min_score: 10).
@@ -306,6 +599,9 @@ type CompositeAggregatorConfig = {
     readonly prompt?: string;
     readonly promptPath?: string;
     readonly model?: string;
+} | {
+    readonly type: 'threshold';
+    readonly threshold: number;
 };
 type CompositeEvaluatorConfig = {
     readonly name: string;
@@ -313,6 +609,9 @@ type CompositeEvaluatorConfig = {
     readonly evaluators: readonly EvaluatorConfig[];
     readonly aggregator: CompositeAggregatorConfig;
     readonly weight?: number;
+    readonly required?: boolean | number;
+    /** When true, inverts the evaluator score (1 - score) and swaps pass/fail verdict */
+    readonly negate?: boolean;
 };
 /**
  * Match type for field accuracy evaluation.
@@ -354,6 +653,9 @@ type FieldAccuracyEvaluatorConfig = {
     /** Strategy for combining field scores (default: weighted_average) */
     readonly aggregation?: FieldAggregationType;
     readonly weight?: number;
+    readonly required?: boolean | number;
+    /** When true, inverts the evaluator score (1 - score) and swaps pass/fail verdict */
+    readonly negate?: boolean;
 };
 /**
  * Configuration for the latency evaluator.
@@ -365,6 +667,9 @@ type LatencyEvaluatorConfig = {
     /** Maximum allowed duration in milliseconds */
     readonly threshold: number;
     readonly weight?: number;
+    readonly required?: boolean | number;
+    /** When true, inverts the evaluator score (1 - score) and swaps pass/fail verdict */
+    readonly negate?: boolean;
 };
 /**
  * Configuration for the cost evaluator.
@@ -376,6 +681,9 @@ type CostEvaluatorConfig = {
     /** Maximum allowed cost in USD */
     readonly budget: number;
     readonly weight?: number;
+    readonly required?: boolean | number;
+    /** When true, inverts the evaluator score (1 - score) and swaps pass/fail verdict */
+    readonly negate?: boolean;
 };
 /**
  * Configuration for the token_usage evaluator.
@@ -391,48 +699,256 @@ type TokenUsageEvaluatorConfig = {
     /** Maximum allowed output tokens (completion) */
     readonly max_output?: number;
     readonly weight?: number;
+    readonly required?: boolean | number;
+    /** When true, inverts the evaluator score (1 - score) and swaps pass/fail verdict */
+    readonly negate?: boolean;
+};
+/**
+ * Configuration for the execution_metrics evaluator.
+ * Provides declarative threshold-based checks on execution metrics.
+ * Only specified thresholds are checked; omitted ones are ignored.
+ */
+type ExecutionMetricsEvaluatorConfig = {
+    readonly name: string;
+    readonly type: 'execution_metrics';
+    /** Maximum allowed number of tool calls */
+    readonly max_tool_calls?: number;
+    /** Maximum allowed number of LLM calls (assistant messages) */
+    readonly max_llm_calls?: number;
+    /** Maximum allowed total tokens (input + output) */
+    readonly max_tokens?: number;
+    /** Maximum allowed cost in USD */
+    readonly max_cost_usd?: number;
+    /** Maximum allowed duration in milliseconds */
+    readonly max_duration_ms?: number;
+    /** Target exploration ratio (0-1, proportion of read-only tool calls) */
+    readonly target_exploration_ratio?: number;
+    /** Tolerance for exploration ratio check (default: 0.2) */
+    readonly exploration_tolerance?: number;
+    readonly weight?: number;
+    readonly required?: boolean | number;
+    /** When true, inverts the evaluator score (1 - score) and swaps pass/fail verdict */
+    readonly negate?: boolean;
+};
+/**
+ * Configuration for the agent_judge evaluator.
+ * Runs an agentic investigation loop to audit workspaces and verify criteria.
+ * Two modes:
+ * - Built-in: Uses AI SDK generateText() with sandboxed filesystem tools
+ * - Judge target: Delegates to an external agent provider via Provider.invoke()
+ */
+type AgentJudgeEvaluatorConfig = {
+    readonly name: string;
+    readonly type: 'agent_judge';
+    /** Custom evaluation prompt (inline text or file path) */
+    readonly prompt?: string;
+    readonly promptPath?: string;
+    /** Resolved absolute path for prompt file */
+    readonly resolvedPromptPath?: string;
+    /** Rubric items for structured evaluation (reuses llm_judge rubric infra) */
+    readonly rubrics?: readonly RubricItem[];
+    /** Maximum agent steps for built-in mode (default 10, max 50) */
+    readonly max_steps?: number;
+    /** Temperature for built-in mode (default 0) */
+    readonly temperature?: number;
+    /** Target name — delegates agent loop to this provider instead of built-in mode */
+    readonly target?: string;
+    readonly weight?: number;
+    readonly required?: boolean | number;
+    /** When true, inverts the evaluator score (1 - score) and swaps pass/fail verdict */
+    readonly negate?: boolean;
+};
+/**
+ * Configuration for the contains assertion evaluator.
+ * Checks whether the candidate output contains a specified substring.
+ */
+type ContainsEvaluatorConfig = {
+    readonly name: string;
+    readonly type: 'contains';
+    readonly value: string;
+    readonly weight?: number;
+    readonly required?: boolean | number;
+    /** When true, inverts the evaluator score (1 - score) and swaps pass/fail verdict */
+    readonly negate?: boolean;
+};
+/**
+ * Configuration for the regex assertion evaluator.
+ * Checks whether the candidate output matches a regular expression pattern.
+ */
+type RegexEvaluatorConfig = {
+    readonly name: string;
+    readonly type: 'regex';
+    readonly value: string;
+    readonly weight?: number;
+    readonly required?: boolean | number;
+    /** When true, inverts the evaluator score (1 - score) and swaps pass/fail verdict */
+    readonly negate?: boolean;
+};
+/**
+ * Configuration for the is_json assertion evaluator.
+ * Checks whether the candidate output is valid JSON.
+ */
+type IsJsonEvaluatorConfig = {
+    readonly name: string;
+    readonly type: 'is_json';
+    readonly weight?: number;
+    readonly required?: boolean | number;
+    /** When true, inverts the evaluator score (1 - score) and swaps pass/fail verdict */
+    readonly negate?: boolean;
+};
+/**
+ * Configuration for the equals assertion evaluator.
+ * Checks whether the candidate output exactly equals a specified string.
+ */
+type EqualsEvaluatorConfig = {
+    readonly name: string;
+    readonly type: 'equals';
+    readonly value: string;
+    readonly weight?: number;
+    readonly required?: boolean | number;
+    /** When true, inverts the evaluator score (1 - score) and swaps pass/fail verdict */
+    readonly negate?: boolean;
+};
+/**
+ * Configuration for the rubrics evaluator.
+ * Evaluates candidate output against a list of rubric criteria.
+ */
+type RubricsEvaluatorConfig = {
+    readonly name: string;
+    readonly type: 'rubrics';
+    readonly criteria: readonly RubricItem[];
+    readonly weight?: number;
+    readonly required?: boolean | number;
+    /** When true, inverts the evaluator score (1 - score) and swaps pass/fail verdict */
+    readonly negate?: boolean;
 };
-type EvaluatorConfig = CodeEvaluatorConfig | LlmJudgeEvaluatorConfig | CompositeEvaluatorConfig | ToolTrajectoryEvaluatorConfig | FieldAccuracyEvaluatorConfig | LatencyEvaluatorConfig | CostEvaluatorConfig | TokenUsageEvaluatorConfig;
+type EvaluatorConfig = CodeEvaluatorConfig | LlmJudgeEvaluatorConfig | CompositeEvaluatorConfig | ToolTrajectoryEvaluatorConfig | FieldAccuracyEvaluatorConfig | LatencyEvaluatorConfig | CostEvaluatorConfig | TokenUsageEvaluatorConfig | ExecutionMetricsEvaluatorConfig | AgentJudgeEvaluatorConfig | ContainsEvaluatorConfig | RegexEvaluatorConfig | IsJsonEvaluatorConfig | EqualsEvaluatorConfig | RubricsEvaluatorConfig;
 /**
- * Eval case definition sourced from AgentV specs.
+ * Eval test definition sourced from AgentV specs.
  */
-interface EvalCase {
+interface EvalTest {
     readonly id: string;
     readonly dataset?: string;
     readonly conversation_id?: string;
     readonly question: string;
-    readonly input_messages: readonly TestMessage[];
+    readonly input: readonly TestMessage[];
     readonly input_segments: readonly JsonObject[];
-    readonly expected_messages: readonly JsonObject[];
+    readonly expected_output: readonly JsonObject[];
     readonly reference_answer?: string;
     readonly guideline_paths: readonly string[];
     readonly guideline_patterns?: readonly string[];
     readonly file_paths: readonly string[];
-    readonly expected_outcome: string;
+    readonly criteria: string;
     readonly evaluator?: EvaluatorKind;
     readonly evaluators?: readonly EvaluatorConfig[];
+    /** Workspace configuration (merged from suite-level and case-level) */
+    readonly workspace?: WorkspaceConfig;
+    /** Arbitrary metadata passed to workspace scripts via stdin */
+    readonly metadata?: Record<string, unknown>;
+    /** Per-test target override (matrix evaluation) */
+    readonly targets?: readonly string[];
+}
+/** @deprecated Use `EvalTest` instead */
+type EvalCase = EvalTest;
+/**
+ * Supported trial aggregation strategies.
+ */
+type TrialStrategy = 'pass_at_k' | 'mean' | 'confidence_interval';
+/**
+ * Configuration for running multiple trials per eval case.
+ */
+interface TrialsConfig {
+    readonly count: number;
+    readonly strategy: TrialStrategy;
+    readonly costLimitUsd?: number;
+}
+/**
+ * Result of a single trial attempt.
+ */
+interface TrialResult {
+    readonly attempt: number;
+    readonly score: number;
+    readonly verdict: EvaluationVerdict;
+    readonly scores?: readonly EvaluatorResult[];
+    readonly error?: string;
+    readonly costUsd?: number;
+}
+/**
+ * Aggregation metadata for pass_at_k strategy.
+ */
+interface PassAtKAggregation {
+    readonly strategy: 'pass_at_k';
+    readonly passedAttempts: number;
+    readonly totalAttempts: number;
 }
+/**
+ * Aggregation metadata for mean strategy.
+ */
+interface MeanAggregation {
+    readonly strategy: 'mean';
+    readonly mean: number;
+    readonly min: number;
+    readonly max: number;
+}
+/**
+ * Aggregation metadata for confidence_interval strategy.
+ */
+interface ConfidenceIntervalAggregation {
+    readonly strategy: 'confidence_interval';
+    readonly mean: number;
+    readonly ci95Lower: number;
+    readonly ci95Upper: number;
+    readonly stddev: number;
+}
+/**
+ * Discriminated union of trial aggregation results.
+ */
+type TrialAggregation = PassAtKAggregation | MeanAggregation | ConfidenceIntervalAggregation;
 /**
  * Evaluator scorecard for a single eval case run.
  */
 interface EvaluationResult {
     readonly timestamp: string;
-    readonly evalId: string;
+    readonly testId: string;
     readonly dataset?: string;
     readonly conversationId?: string;
     readonly score: number;
     readonly hits: readonly string[];
     readonly misses: readonly string[];
-    readonly candidateAnswer: string;
+    readonly answer: string;
     readonly target: string;
     readonly reasoning?: string;
-    readonly agentProviderRequest?: JsonObject;
-    readonly lmProviderRequest?: JsonObject;
-    readonly evaluatorProviderRequest?: JsonObject;
-    readonly evaluatorResults?: readonly EvaluatorResult[];
+    readonly requests?: {
+        readonly agent?: JsonObject;
+        readonly lm?: JsonObject;
+        readonly evaluator?: JsonObject;
+    };
+    readonly scores?: readonly EvaluatorResult[];
     readonly error?: string;
     /** Lightweight summary of the execution trace (always included when available) */
-    readonly traceSummary?: TraceSummary;
+    readonly trace?: TraceSummary;
+    /** Path to the temporary workspace directory (included on failure for debugging) */
+    readonly workspacePath?: string;
+    /** Input messages or prompt string sent to the agent */
+    readonly input?: readonly Message[] | string;
+    /** Full output messages from agent execution (only included when --trace flag is set) */
+    readonly output?: readonly Message[];
+    /** Captured output from workspace before_all script */
+    readonly beforeAllOutput?: string;
+    /** Captured output from workspace before_each script */
+    readonly beforeEachOutput?: string;
+    /** Captured output from workspace after_all script */
+    readonly afterAllOutput?: string;
+    /** Captured output from workspace after_each script */
+    readonly afterEachOutput?: string;
+    /** Unified diff of workspace file changes (when workspace_template is configured) */
+    readonly fileChanges?: string;
+    /** Individual trial results (only present when trials.count > 1) */
+    readonly trials?: readonly TrialResult[];
+    /** Aggregation metadata describing how the final score was computed from trials */
+    readonly aggregation?: TrialAggregation;
+    /** Whether the trial loop was terminated early due to cost limit */
+    readonly costLimited?: boolean;
 }
 type EvaluationVerdict = 'pass' | 'fail' | 'borderline';
 interface EvaluatorResult {
@@ -446,7 +962,7 @@ interface EvaluatorResult {
     readonly reasoning?: string;
     readonly rawRequest?: JsonObject;
     readonly evaluatorProviderRequest?: JsonObject;
-    readonly evaluatorResults?: readonly EvaluatorResult[];
+    readonly scores?: readonly EvaluatorResult[];
     /** Optional structured details from code judges (e.g., TP/TN/FP/FN counts). */
     readonly details?: JsonObject;
 }
@@ -455,182 +971,88 @@ interface EvaluatorResult {
  */
 declare function getHitCount(result: Pick<EvaluationResult, 'hits'>): number;
-type ChatMessageRole = 'system' | 'user' | 'assistant' | 'tool' | 'function';
-interface ChatMessage {
-    readonly role: ChatMessageRole;
-    readonly content: string;
-    readonly name?: string;
-}
-type ChatPrompt = readonly ChatMessage[];
-type ProviderKind = 'azure' | 'anthropic' | 'gemini' | 'codex' | 'copilot-cli' | 'pi-coding-agent' | 'pi-agent-sdk' | 'claude-code' | 'cli' | 'mock' | 'vscode' | 'vscode-insiders';
-interface ProviderRequest {
-    readonly question: string;
-    readonly systemPrompt?: string;
-    readonly guidelines?: string;
+declare const MetadataSchema: z.ZodObject<{
+    name: z.ZodString;
+    description: z.ZodOptional<z.ZodString>;
+    version: z.ZodOptional<z.ZodString>;
+    author: z.ZodOptional<z.ZodString>;
+    tags: z.ZodOptional<z.ZodArray<z.ZodString, "many">>;
+    license: z.ZodOptional<z.ZodString>;
+    requires: z.ZodOptional<z.ZodObject<{
+        agentv: z.ZodOptional<z.ZodString>;
+    }, "strip", z.ZodTypeAny, {
+        agentv?: string | undefined;
+    }, {
+        agentv?: string | undefined;
+    }>>;
+}, "strip", z.ZodTypeAny, {
+    name: string;
+    description?: string | undefined;
+    version?: string | undefined;
+    author?: string | undefined;
+    tags?: string[] | undefined;
+    license?: string | undefined;
+    requires?: {
+        agentv?: string | undefined;
+    } | undefined;
+}, {
+    name: string;
+    description?: string | undefined;
+    version?: string | undefined;
+    author?: string | undefined;
+    tags?: string[] | undefined;
+    license?: string | undefined;
+    requires?: {
+        agentv?: string | undefined;
+    } | undefined;
+}>;
+type EvalMetadata = z.infer<typeof MetadataSchema>;
+declare const DEFAULT_EVAL_PATTERNS: readonly string[];
+type AgentVConfig$1 = {
     readonly guideline_patterns?: readonly string[];
-    readonly chatPrompt?: ChatPrompt;
-    readonly inputFiles?: readonly string[];
-    readonly evalCaseId?: string;
-    readonly attempt?: number;
-    readonly maxOutputTokens?: number;
-    readonly temperature?: number;
-    readonly metadata?: JsonObject;
-    readonly signal?: AbortSignal;
-}
+    readonly eval_patterns?: readonly string[];
+};
 /**
- * A tool call within an output message.
- * Represents a single tool invocation with its input and optional output.
+ * Load optional .agentv/config.yaml configuration file.
+ * Searches from eval file directory up to repo root.
  */
-interface ToolCall {
-    /** Tool name */
-    readonly tool: string;
-    /** Tool input arguments */
-    readonly input?: unknown;
-    /** Tool output result */
-    readonly output?: unknown;
-    /** Stable identifier for pairing tool calls */
-    readonly id?: string;
-    /** ISO 8601 timestamp */
-    readonly timestamp?: string;
-    /** Duration of the tool call in milliseconds */
-    readonly durationMs?: number;
-}
+declare function loadConfig(evalFilePath: string, repoRoot: string): Promise<AgentVConfig$1 | null>;
 /**
- * An output message from agent execution.
- * Represents a single message in the conversation with optional tool calls.
+ * Determine whether a path references guideline content (instructions or prompts).
  */
-interface OutputMessage {
-    /** Message role (e.g., 'assistant', 'user', 'tool') */
-    readonly role: string;
-    /** Optional name for the message sender */
-    readonly name?: string;
-    /** Message content */
-    readonly content?: unknown;
-    /** Tool calls made in this message */
-    readonly toolCalls?: readonly ToolCall[];
-    /** ISO 8601 timestamp */
-    readonly timestamp?: string;
-    /** Duration of the message in milliseconds */
-    readonly durationMs?: number;
-    /** Provider-specific metadata */
-    readonly metadata?: Record<string, unknown>;
-}
+declare function isGuidelineFile(filePath: string, patterns?: readonly string[]): boolean;
 /**
- * Token usage metrics reported by provider.
+ * Extract target name from parsed eval suite (checks execution.target then falls back to root-level target).
  */
-interface ProviderTokenUsage {
-    /** Input/prompt tokens consumed */
-    readonly input: number;
-    /** Output/completion tokens generated */
-    readonly output: number;
-    /** Cached tokens (optional, provider-specific) */
-    readonly cached?: number;
-}
-interface ProviderResponse {
-    readonly raw?: unknown;
-    readonly usage?: JsonObject;
-    /** Output messages from agent execution (primary source for tool trajectory) */
-    readonly outputMessages?: readonly OutputMessage[];
-    /** Token usage metrics (optional) */
-    readonly tokenUsage?: ProviderTokenUsage;
-    /** Total cost in USD (optional) */
-    readonly costUsd?: number;
-    /** Execution duration in milliseconds (optional) */
-    readonly durationMs?: number;
-}
-interface Provider {
-    readonly id: string;
-    readonly kind: ProviderKind;
-    readonly targetName: string;
-    invoke(request: ProviderRequest): Promise<ProviderResponse>;
-    /**
-     * Optional capability marker for provider-managed batching (single session handling multiple requests).
-     */
-    readonly supportsBatch?: boolean;
-    /**
-     * Optional batch invocation hook. When defined alongside supportsBatch=true,
-     * the orchestrator may send multiple requests in a single provider session.
-     */
-    invokeBatch?(requests: readonly ProviderRequest[]): Promise<readonly ProviderResponse[]>;
-    /**
-     * Optional method to get a Vercel AI SDK LanguageModel instance for structured output generation.
-     * Used by evaluators that need generateObject/generateText from the AI SDK.
-     */
-    asLanguageModel?(): ai.LanguageModel;
-}
-type EnvLookup = Readonly<Record<string, string | undefined>>;
-interface TargetDefinition {
-    readonly name: string;
-    readonly provider: ProviderKind | string;
-    readonly judge_target?: string | undefined;
-    readonly workers?: number | undefined;
-    readonly provider_batching?: boolean | undefined;
-    readonly providerBatching?: boolean | undefined;
-    readonly endpoint?: string | unknown | undefined;
-    readonly resource?: string | unknown | undefined;
-    readonly resourceName?: string | unknown | undefined;
-    readonly api_key?: string | unknown | undefined;
-    readonly apiKey?: string | unknown | undefined;
-    readonly deployment?: string | unknown | undefined;
-    readonly deploymentName?: string | unknown | undefined;
-    readonly model?: string | unknown | undefined;
-    readonly version?: string | unknown | undefined;
-    readonly api_version?: string | unknown | undefined;
-    readonly variant?: string | unknown | undefined;
-    readonly thinking_budget?: number | unknown | undefined;
-    readonly thinkingBudget?: number | unknown | undefined;
-    readonly temperature?: number | unknown | undefined;
-    readonly max_output_tokens?: number | unknown | undefined;
-    readonly maxTokens?: number | unknown | undefined;
-    readonly executable?: string | unknown | undefined;
-    readonly command?: string | unknown | undefined;
-    readonly binary?: string | unknown | undefined;
-    readonly args?: unknown | undefined;
-    readonly arguments?: unknown | undefined;
-    readonly cwd?: string | unknown | undefined;
-    readonly timeout_seconds?: number | unknown | undefined;
-    readonly timeoutSeconds?: number | unknown | undefined;
-    readonly log_dir?: string | unknown | undefined;
-    readonly logDir?: string | unknown | undefined;
-    readonly log_directory?: string | unknown | undefined;
-    readonly logDirectory?: string | unknown | undefined;
-    readonly log_format?: string | unknown | undefined;
-    readonly logFormat?: string | unknown | undefined;
-    readonly log_output_format?: string | unknown | undefined;
-    readonly logOutputFormat?: string | unknown | undefined;
-    readonly system_prompt?: string | unknown | undefined;
-    readonly systemPrompt?: string | unknown | undefined;
-    readonly response?: string | unknown | undefined;
-    readonly delayMs?: number | unknown | undefined;
-    readonly delayMinMs?: number | unknown | undefined;
-    readonly delayMaxMs?: number | unknown | undefined;
-    readonly vscode_cmd?: string | unknown | undefined;
-    readonly wait?: boolean | unknown | undefined;
-    readonly dry_run?: boolean | unknown | undefined;
-    readonly dryRun?: boolean | unknown | undefined;
-    readonly subagent_root?: string | unknown | undefined;
-    readonly subagentRoot?: string | unknown | undefined;
-    readonly workspace_template?: string | unknown | undefined;
-    readonly workspaceTemplate?: string | unknown | undefined;
-    readonly command_template?: string | unknown | undefined;
-    readonly commandTemplate?: string | unknown | undefined;
-    readonly files_format?: string | unknown | undefined;
-    readonly filesFormat?: string | unknown | undefined;
-    readonly attachments_format?: string | unknown | undefined;
-    readonly attachmentsFormat?: string | unknown | undefined;
-    readonly env?: unknown | undefined;
-    readonly healthcheck?: unknown | undefined;
-    readonly max_retries?: number | unknown | undefined;
-    readonly maxRetries?: number | unknown | undefined;
-    readonly retry_initial_delay_ms?: number | unknown | undefined;
-    readonly retryInitialDelayMs?: number | unknown | undefined;
-    readonly retry_max_delay_ms?: number | unknown | undefined;
-    readonly retryMaxDelayMs?: number | unknown | undefined;
-    readonly retry_backoff_factor?: number | unknown | undefined;
-    readonly retryBackoffFactor?: number | unknown | undefined;
-    readonly retry_status_codes?: unknown | undefined;
-    readonly retryStatusCodes?: unknown | undefined;
+declare function extractTargetFromSuite(suite: JsonObject): string | undefined;
+/**
+ * Extract targets array from parsed eval suite.
+ * Precedence: execution.targets (array) > execution.target (singular).
+ * Returns undefined when no targets array is specified.
+ */
+declare function extractTargetsFromSuite(suite: JsonObject): readonly string[] | undefined;
+/**
+ * Extract per-test targets array from a raw test case object.
+ */
+declare function extractTargetsFromTestCase(testCase: JsonObject): readonly string[] | undefined;
+/**
+ * Extract trials configuration from parsed eval suite's execution block.
+ * Returns undefined when count is 1 or not specified (no-op).
+ */
+declare function extractTrialsConfig(suite: JsonObject): TrialsConfig | undefined;
+/**
+ * Cache configuration parsed from execution block.
+ */
+interface CacheConfig {
+    readonly enabled: boolean;
+    readonly cachePath?: string;
 }
+/**
+ * Extract cache configuration from parsed eval suite's execution block.
+ * Returns undefined when no cache config is specified.
+ */
+declare function extractCacheConfig(suite: JsonObject): CacheConfig | undefined;
 /**
  * Formatting mode for segment content.
@@ -654,12 +1076,7 @@ interface PromptInputs {
  * @param testCase - The evaluation test case
  * @param mode - Formatting mode: 'agent' for file references, 'lm' for embedded content (default: 'lm')
  */
-declare function buildPromptInputs(testCase: EvalCase, mode?: FormattingMode): Promise<PromptInputs>;
-/**
- * Determine whether a path references guideline content (instructions or prompts).
- */
-declare function isGuidelineFile(filePath: string, patterns?: readonly string[]): boolean;
+declare function buildPromptInputs(testCase: EvalTest, mode?: FormattingMode): Promise<PromptInputs>;
 /**
  * Detect file format by extension.
@@ -668,21 +1085,49 @@ declare function detectFormat(filePath: string): 'yaml' | 'jsonl';
 type LoadOptions = {
     readonly verbose?: boolean;
-    /** Filter eval cases by ID pattern (glob supported, e.g., "summary-*") */
+    /** Filter tests by ID pattern (glob supported, e.g., "summary-*") */
     readonly filter?: string;
 };
 /**
  * Read metadata from a test suite file (like target name).
- * This is a convenience function for CLI tools that need metadata without loading all eval cases.
+ * This is a convenience function for CLI tools that need metadata without loading all tests.
  */
 declare function readTestSuiteMetadata(testFilePath: string): Promise<{
     target?: string;
+    targets?: readonly string[];
+    trials?: TrialsConfig;
 }>;
 /**
- * Load eval cases from a AgentV specification file (YAML or JSONL).
+ * Load tests from an AgentV specification file (YAML or JSONL).
  * Format is detected by file extension: .yaml/.yml for YAML, .jsonl for JSONL.
  */
-declare function loadEvalCases(evalFilePath: string, repoRoot: URL | string, options?: LoadOptions): Promise<readonly EvalCase[]>;
+type EvalSuiteResult = {
+    readonly tests: readonly EvalTest[];
+    readonly trials?: TrialsConfig;
+    /** Suite-level targets from execution.targets (matrix evaluation) */
+    readonly targets?: readonly string[];
+    /** Suite-level cache config from execution.cache */
+    readonly cacheConfig?: CacheConfig;
+    /** Suite-level metadata (name, description, version, etc.) */
+    readonly metadata?: EvalMetadata;
+};
+/**
+ * Load tests and suite metadata from a single parse.
+ * Prefer this over calling loadTests + readTestSuiteMetadata separately.
+ */
+declare function loadTestSuite(evalFilePath: string, repoRoot: URL | string, options?: LoadOptions): Promise<EvalSuiteResult>;
+/** @deprecated Use `loadTestSuite` instead */
+declare const loadEvalSuite: typeof loadTestSuite;
+declare function loadTests(evalFilePath: string, repoRoot: URL | string, options?: LoadOptions): Promise<readonly EvalTest[]>;
+/** @deprecated Use `loadTests` instead */
+declare const loadEvalCases: typeof loadTests;
+/**
+ * Load a single test by exact ID match.
+ * Throws if the ID is not found.
+ */
+declare function loadTestById(evalFilePath: string, repoRoot: URL | string, evalId: string): Promise<EvalTest>;
+/** @deprecated Use `loadTestById` instead */
+declare const loadEvalCaseById: typeof loadTestById;
 declare function fileExists(filePath: string): Promise<boolean>;
 /**
@@ -744,6 +1189,7 @@ declare const CliTargetConfigSchema: z.ZodObject<{
     commandTemplate: z.ZodString;
     filesFormat: z.ZodOptional<z.ZodString>;
     cwd: z.ZodOptional<z.ZodString>;
+    workspaceTemplate: z.ZodOptional<z.ZodString>;
     timeoutMs: z.ZodOptional<z.ZodNumber>;
     healthcheck: z.ZodOptional<z.ZodDiscriminatedUnion<"type", [z.ZodObject<{
         type: z.ZodLiteral<"http">;
@@ -780,6 +1226,7 @@ declare const CliTargetConfigSchema: z.ZodObject<{
     cwd?: string | undefined;
     verbose?: boolean | undefined;
     filesFormat?: string | undefined;
+    workspaceTemplate?: string | undefined;
     healthcheck?: {
         type: "http";
         url: string;
@@ -797,6 +1244,7 @@ declare const CliTargetConfigSchema: z.ZodObject<{
     cwd?: string | undefined;
     verbose?: boolean | undefined;
     filesFormat?: string | undefined;
+    workspaceTemplate?: string | undefined;
     healthcheck?: {
         type: "http";
         url: string;
@@ -858,19 +1306,34 @@ interface GeminiResolvedConfig {
     readonly retry?: RetryConfig;
 }
 interface CodexResolvedConfig {
+    readonly model?: string;
     readonly executable: string;
     readonly args?: readonly string[];
     readonly cwd?: string;
+    readonly workspaceTemplate?: string;
     readonly timeoutMs?: number;
     readonly logDir?: string;
     readonly logFormat?: 'summary' | 'json';
     readonly systemPrompt?: string;
 }
-interface CopilotResolvedConfig {
+interface CopilotCliResolvedConfig {
     readonly executable: string;
     readonly model?: string;
     readonly args?: readonly string[];
     readonly cwd?: string;
+    readonly workspaceTemplate?: string;
+    readonly timeoutMs?: number;
+    readonly logDir?: string;
+    readonly logFormat?: 'summary' | 'json';
+    readonly systemPrompt?: string;
+}
+interface CopilotSdkResolvedConfig {
+    readonly cliUrl?: string;
+    readonly cliPath?: string;
+    readonly githubToken?: string;
+    readonly model?: string;
+    readonly cwd?: string;
+    readonly workspaceTemplate?: string;
     readonly timeoutMs?: number;
     readonly logDir?: string;
     readonly logFormat?: 'summary' | 'json';
@@ -885,6 +1348,7 @@ interface PiCodingAgentResolvedConfig {
     readonly thinking?: string;
     readonly args?: readonly string[];
     readonly cwd?: string;
+    readonly workspaceTemplate?: string;
     readonly timeoutMs?: number;
     readonly logDir?: string;
     readonly logFormat?: 'summary' | 'json';
@@ -897,13 +1361,14 @@ interface PiAgentSdkResolvedConfig {
     readonly timeoutMs?: number;
     readonly systemPrompt?: string;
 }
-interface ClaudeCodeResolvedConfig {
-    readonly executable: string;
+interface ClaudeResolvedConfig {
     readonly model?: string;
     readonly systemPrompt?: string;
-    readonly args?: readonly string[];
     readonly cwd?: string;
+    readonly workspaceTemplate?: string;
     readonly timeoutMs?: number;
+    readonly maxTurns?: number;
+    readonly maxBudgetUsd?: number;
     readonly logDir?: string;
     readonly logFormat?: 'summary' | 'json';
 }
@@ -914,11 +1379,12 @@ interface MockResolvedConfig {
     readonly delayMaxMs?: number;
 }
 interface VSCodeResolvedConfig {
-    readonly command: string;
+    readonly executable: string;
     readonly waitForResponse: boolean;
     readonly dryRun: boolean;
     readonly subagentRoot?: string;
     readonly workspaceTemplate?: string;
+    readonly timeoutMs?: number;
 }
 type ResolvedTarget = {
     readonly kind: 'azure';
@@ -948,13 +1414,20 @@ type ResolvedTarget = {
     readonly workers?: number;
     readonly providerBatching?: boolean;
     readonly config: CodexResolvedConfig;
+} | {
+    readonly kind: 'copilot';
+    readonly name: string;
+    readonly judgeTarget?: string;
+    readonly workers?: number;
+    readonly providerBatching?: boolean;
+    readonly config: CopilotSdkResolvedConfig;
 } | {
     readonly kind: 'copilot-cli';
     readonly name: string;
     readonly judgeTarget?: string;
     readonly workers?: number;
     readonly providerBatching?: boolean;
-    readonly config: CopilotResolvedConfig;
+    readonly config: CopilotCliResolvedConfig;
 } | {
     readonly kind: 'pi-coding-agent';
     readonly name: string;
@@ -970,12 +1443,12 @@ type ResolvedTarget = {
     readonly providerBatching?: boolean;
     readonly config: PiAgentSdkResolvedConfig;
 } | {
-    readonly kind: 'claude-code';
+    readonly kind: 'claude';
     readonly name: string;
     readonly judgeTarget?: string;
     readonly workers?: number;
     readonly providerBatching?: boolean;
-    readonly config: ClaudeCodeResolvedConfig;
+    readonly config: ClaudeResolvedConfig;
 } | {
     readonly kind: 'mock';
     readonly name: string;
@@ -1000,6 +1473,42 @@ type ResolvedTarget = {
 };
 declare function resolveTargetDefinition(definition: TargetDefinition, env?: EnvLookup, evalFilePath?: string): ResolvedTarget;
+/**
+ * Extensible provider registry.
+ *
+ * Replaces the hardcoded switch/case dispatch in createProvider() with
+ * a registry of named factory functions. Built-in providers are registered
+ * at startup; users can add custom providers via the registry API or by
+ * dropping files in `.agentv/providers/`.
+ */
+/**
+ * Factory function that creates a Provider instance from a resolved target.
+ */
+type ProviderFactoryFn = (target: ResolvedTarget) => Provider;
+/**
+ * Registry of provider factory functions keyed by provider kind.
+ *
+ * Built-in providers are registered at startup. Custom providers can be
+ * registered via the `register()` method.
+ */
+declare class ProviderRegistry {
+    private readonly factories;
+    /** Register a factory function for a provider kind. */
+    register(kind: string, factory: ProviderFactoryFn): this;
+    /** Get the factory function for a provider kind. */
+    get(kind: string): ProviderFactoryFn | undefined;
+    /** Check if a factory is registered for the given kind. */
+    has(kind: string): boolean;
+    /** List all registered provider kind names. */
+    list(): string[];
+    /**
+     * Create a provider instance from a resolved target.
+     * Falls back to CLI provider for unknown kinds (custom provider escape hatch).
+     */
+    create(target: ResolvedTarget): Provider;
+}
 declare function readTargetDefinitions(filePath: string): Promise<readonly TargetDefinition[]>;
 declare function listTargetNames(definitions: readonly TargetDefinition[]): readonly string[];
@@ -1007,6 +1516,7 @@ interface EnsureSubagentsOptions {
     readonly kind: 'vscode' | 'vscode-insiders';
     readonly count: number;
     readonly verbose?: boolean;
+    readonly vscodeCmd?: string;
 }
 interface EnsureSubagentsResult {
     readonly provisioned: boolean;
@@ -1041,15 +1551,25 @@ type PiLogListener = (entry: PiLogEntry) => void;
 declare function consumePiLogEntries(): PiLogEntry[];
 declare function subscribeToPiLogEntries(listener: PiLogListener): () => void;
-type ClaudeCodeLogEntry = {
+type ClaudeLogEntry = {
+    readonly filePath: string;
+    readonly evalCaseId?: string;
+    readonly targetName: string;
+    readonly attempt?: number;
+};
+type ClaudeLogListener = (entry: ClaudeLogEntry) => void;
+declare function consumeClaudeLogEntries(): ClaudeLogEntry[];
+declare function subscribeToClaudeLogEntries(listener: ClaudeLogListener): () => void;
+type CopilotSdkLogEntry = {
     readonly filePath: string;
     readonly evalCaseId?: string;
     readonly targetName: string;
     readonly attempt?: number;
 };
-type ClaudeCodeLogListener = (entry: ClaudeCodeLogEntry) => void;
-declare function consumeClaudeCodeLogEntries(): ClaudeCodeLogEntry[];
-declare function subscribeToClaudeCodeLogEntries(listener: ClaudeCodeLogListener): () => void;
+type CopilotSdkLogListener = (entry: CopilotSdkLogEntry) => void;
+declare function consumeCopilotSdkLogEntries(): CopilotSdkLogEntry[];
+declare function subscribeToCopilotSdkLogEntries(listener: CopilotSdkLogListener): () => void;
 type CopilotCliLogEntry = {
     readonly filePath: string;
@@ -1061,6 +1581,38 @@ type CopilotCliLogListener = (entry: CopilotCliLogEntry) => void;
 declare function consumeCopilotCliLogEntries(): CopilotCliLogEntry[];
 declare function subscribeToCopilotCliLogEntries(listener: CopilotCliLogListener): () => void;
+/**
+ * Convention-based discovery of custom provider scripts.
+ *
+ * Scans `.agentv/providers/` for TypeScript/JavaScript files and registers
+ * them as CLI-like providers in the registry. The file name (without
+ * extension) becomes the provider kind name.
+ *
+ * Example: `.agentv/providers/my-llm.ts` -> provider kind "my-llm" in targets.yaml
+ */
+/**
+ * Discover custom provider scripts from `.agentv/providers/` and register
+ * them as provider kinds in the registry.
+ *
+ * Each discovered script is registered as a CLI-like provider that runs
+ * via `bun run <filePath> {PROMPT}`. The script receives the prompt as
+ * a CLI argument and should print its response to stdout.
+ *
+ * @param registry - The provider registry to register discovered providers into
+ * @param baseDir - The base directory to search from (typically project root or eval file dir)
+ * @returns Names of discovered provider kinds
+ */
+declare function discoverProviders(registry: ProviderRegistry, baseDir: string): Promise<string[]>;
+/**
+ * Create and return the default provider registry with all built-in providers.
+ */
+declare function createBuiltinProviderRegistry(): ProviderRegistry;
+/**
+ * Create a provider from a resolved target using the default registry.
+ * Custom providers can be registered via `createBuiltinProviderRegistry().register()`.
+ */
 declare function createProvider(target: ResolvedTarget): Provider;
 declare function resolveAndCreateProvider(definition: TargetDefinition, env?: EnvLookup): Provider;
@@ -1070,7 +1622,7 @@ declare function resolveAndCreateProvider(definition: TargetDefinition, env?: En
  */
 type TargetResolver = (targetName: string) => Provider | undefined;
 interface EvaluationContext {
-    readonly evalCase: EvalCase;
+    readonly evalCase: EvalTest;
     readonly candidate: string;
     readonly target: ResolvedTarget;
     readonly provider: Provider;
@@ -1086,13 +1638,17 @@ interface EvaluationContext {
     readonly evaluatorTemplateOverride?: string;
     readonly evaluator?: EvaluatorConfig;
     /** Output messages from agent execution (primary source for tool trajectory) */
-    readonly outputMessages?: readonly OutputMessage[];
+    readonly output?: readonly Message[];
     /** Lightweight summary of trace events (if available) */
-    readonly traceSummary?: TraceSummary;
+    readonly trace?: TraceSummary;
     /** Resolver for target override in code judges */
     readonly targetResolver?: TargetResolver;
     /** List of available target names for code judges */
     readonly availableTargets?: readonly string[];
+    /** Unified diff of file changes from workspace (when workspace_template is configured) */
+    readonly fileChanges?: string;
+    /** Absolute path to the workspace directory (when workspace_template is configured) */
+    readonly workspacePath?: string;
 }
 interface EvaluationScore {
     readonly score: number;
@@ -1102,7 +1658,7 @@ interface EvaluationScore {
     readonly expectedAspectCount: number;
     readonly reasoning?: string;
     readonly evaluatorRawRequest?: JsonObject;
-    readonly evaluatorResults?: readonly ChildEvaluatorResult[];
+    readonly scores?: readonly ChildEvaluatorResult[];
     /** Optional structured details from code judges (e.g., TP/TN/FP/FN counts, alignments). */
     readonly details?: JsonObject;
 }
@@ -1116,7 +1672,7 @@ interface ChildEvaluatorResult {
     readonly misses: readonly string[];
     readonly reasoning?: string;
     readonly evaluatorRawRequest?: JsonObject;
-    readonly evaluatorResults?: readonly ChildEvaluatorResult[];
+    readonly scores?: readonly ChildEvaluatorResult[];
     /** Optional structured details from code judges (e.g., TP/TN/FP/FN counts, alignments). */
     readonly details?: JsonObject;
 }
@@ -1134,11 +1690,12 @@ declare function extractJsonBlob(text: string): string | undefined;
 declare function parseJsonFromText(text: string): unknown;
 declare function isNonEmptyString(value: unknown): value is string;
 declare function parseJsonSafe(payload: string): Record<string, unknown> | undefined;
+declare function deepEqual(a: unknown, b: unknown): boolean;
 /**
- * Deep equality check for two values.
- * Handles primitives, arrays, and plain objects.
+ * Negate an evaluation score: inverts score (1 - score), swaps pass/fail verdict,
+ * swaps hits/misses, and annotates reasoning.
  */
-declare function deepEqual(a: unknown, b: unknown): boolean;
+declare function negateScore(score: EvaluationScore): EvaluationScore;
 interface CodeEvaluatorOptions {
     readonly script: readonly string[];
@@ -1175,6 +1732,7 @@ declare class CompositeEvaluator implements Evaluator {
     evaluate(context: EvaluationContext): Promise<EvaluationScore>;
     private aggregate;
     private runWeightedAverage;
+    private runThreshold;
     private runCodeAggregator;
     private runLlmAggregator;
 }
@@ -1184,7 +1742,7 @@ interface CostEvaluatorOptions {
 }
 /**
  * Evaluator that checks execution cost against a budget.
- * Uses traceSummary.costUsd from the evaluation context.
+ * Uses trace.costUsd from the evaluation context.
  */
 declare class CostEvaluator implements Evaluator {
     readonly kind = "cost";
@@ -1193,6 +1751,25 @@ declare class CostEvaluator implements Evaluator {
     evaluate(context: EvaluationContext): EvaluationScore;
 }
+interface ExecutionMetricsEvaluatorOptions {
+    readonly config: ExecutionMetricsEvaluatorConfig;
+}
+/**
+ * Evaluator that checks execution metrics against configured thresholds.
+ * Supports multiple threshold types: tool calls, LLM calls, tokens, cost, duration,
+ * and exploration ratio. Only specified thresholds are checked.
+ *
+ * Score is proportional: hits.length / (hits.length + misses.length)
+ */
+declare class ExecutionMetricsEvaluator implements Evaluator {
+    readonly kind = "execution_metrics";
+    private readonly config;
+    constructor(options: ExecutionMetricsEvaluatorOptions);
+    evaluate(context: EvaluationContext): EvaluationScore;
+    private extractConfiguredThresholds;
+    private filterDefinedMetrics;
+}
 interface FieldAccuracyEvaluatorOptions {
     readonly config: FieldAccuracyEvaluatorConfig;
 }
@@ -1206,7 +1783,7 @@ declare class FieldAccuracyEvaluator implements Evaluator {
     constructor(options: FieldAccuracyEvaluatorOptions);
     evaluate(context: EvaluationContext): EvaluationScore;
     /**
-     * Extract expected data from expected_messages array.
+     * Extract expected data from expected_output array.
      * Looks for the last assistant message with content.
      */
     private extractExpectedData;
@@ -1237,7 +1814,7 @@ interface LatencyEvaluatorOptions {
 }
 /**
  * Evaluator that checks execution duration against a threshold.
- * Uses traceSummary.durationMs from the evaluation context.
+ * Uses trace.durationMs from the evaluation context.
  */
 declare class LatencyEvaluator implements Evaluator {
     readonly kind = "latency";
@@ -1246,6 +1823,11 @@ declare class LatencyEvaluator implements Evaluator {
     evaluate(context: EvaluationContext): EvaluationScore;
 }
+/**
+ * Default evaluator template for the user prompt (variables will be substituted).
+ * Custom evaluators can override this via evaluatorTemplate option.
+ */
+declare const DEFAULT_EVALUATOR_TEMPLATE: string;
 type JudgeProviderResolver = (context: EvaluationContext) => Promise<Provider | undefined>;
 interface LlmJudgeEvaluatorOptions {
     readonly resolveJudgeProvider: JudgeProviderResolver;
@@ -1269,6 +1851,36 @@ declare const freeformEvaluationSchema: z.ZodObject<{
     misses?: string[] | undefined;
     reasoning?: string | undefined;
 }>;
+declare const rubricEvaluationSchema: z.ZodObject<{
+    checks: z.ZodArray<z.ZodObject<{
+        id: z.ZodString;
+        satisfied: z.ZodBoolean;
+        reasoning: z.ZodString;
+    }, "strip", z.ZodTypeAny, {
+        reasoning: string;
+        id: string;
+        satisfied: boolean;
+    }, {
+        reasoning: string;
+        id: string;
+        satisfied: boolean;
+    }>, "many">;
+    overall_reasoning: z.ZodString;
+}, "strip", z.ZodTypeAny, {
+    checks: {
+        reasoning: string;
+        id: string;
+        satisfied: boolean;
+    }[];
+    overall_reasoning: string;
+}, {
+    checks: {
+        reasoning: string;
+        id: string;
+        satisfied: boolean;
+    }[];
+    overall_reasoning: string;
+}>;
 declare class LlmJudgeEvaluator implements Evaluator {
     readonly kind = "llm_judge";
@@ -1297,13 +1909,87 @@ declare class LlmJudgeEvaluator implements Evaluator {
  * This schema is always appended to the evaluator template.
  */
 declare function buildOutputSchema(): string;
+declare function buildRubricOutputSchema(): string;
+declare function substituteVariables(template: string, variables: Record<string, string>): string;
+declare function calculateRubricScore(result: z.infer<typeof rubricEvaluationSchema>, rubrics: readonly RubricItem[]): {
+    score: number;
+    verdict: 'pass' | 'fail' | 'borderline';
+    hits: string[];
+    misses: string[];
+};
+/**
+ * Build the output schema for score-range rubric evaluation.
+ */
+declare function buildScoreRangeOutputSchema(): string;
+interface AgentJudgeEvaluatorOptions {
+    readonly resolveJudgeProvider: (ctx: EvaluationContext) => Promise<Provider | undefined>;
+    readonly maxSteps?: number;
+    readonly temperature?: number;
+    readonly evaluatorTemplate?: string;
+    readonly judgeTargetProvider?: Provider;
+}
+declare class AgentJudgeEvaluator implements Evaluator {
+    readonly kind = "agent_judge";
+    private readonly resolveJudgeProvider;
+    private readonly maxSteps;
+    private readonly temperature;
+    private readonly evaluatorTemplate?;
+    private readonly judgeTargetProvider?;
+    constructor(options: AgentJudgeEvaluatorOptions);
+    evaluate(context: EvaluationContext): Promise<EvaluationScore>;
+    /**
+     * Built-in mode: Uses Vercel AI SDK generateText() with sandboxed filesystem tools.
+     */
+    private evaluateBuiltIn;
+    /**
+     * Judge target mode: Delegates to an external agent provider via Provider.invoke().
+     */
+    private evaluateWithJudgeTarget;
+    /**
+     * Parse the agent's response text into an EvaluationScore.
+     * Supports both freeform and rubric modes.
+     */
+    private parseResult;
+    /**
+     * Build system prompt for built-in mode.
+     * Includes output format instructions.
+     */
+    private buildSystemPrompt;
+    /**
+     * Build user prompt for built-in mode.
+     * Uses custom template if provided, otherwise builds default prompt.
+     */
+    private buildUserPrompt;
+    /**
+     * Build the full evaluation prompt for judge target mode (delegation).
+     * Combines task context, criteria, candidate info, and output format instructions.
+     */
+    private buildDelegatedPrompt;
+}
+interface LlmJudgePromptAssembly {
+    systemPrompt: string;
+    userPrompt: string;
+    responseSchema: string;
+    mode: 'freeform' | 'checklist' | 'score_range';
+}
+declare function assembleLlmJudgePrompt(input: {
+    evalCase: EvalTest;
+    candidate: string;
+    promptInputs: PromptInputs;
+    evaluatorConfig?: LlmJudgeEvaluatorConfig;
+    output?: readonly Message[];
+    fileChanges?: string;
+    evaluatorTemplateOverride?: string;
+}): LlmJudgePromptAssembly;
 interface TokenUsageEvaluatorOptions {
     readonly config: TokenUsageEvaluatorConfig;
 }
 /**
  * Evaluator that checks provider-reported token usage against configured limits.
- * Uses traceSummary.tokenUsage from the evaluation context.
+ * Uses trace.tokenUsage from the evaluation context.
  */
 declare class TokenUsageEvaluator implements Evaluator {
     readonly kind = "token_usage";
@@ -1331,6 +2017,109 @@ declare class ToolTrajectoryEvaluator implements Evaluator {
     private evaluateAnyOrder;
     private evaluateInOrder;
     private evaluateExact;
+    /**
+     * Superset mode: actual trajectory must contain all expected tool calls.
+     * Every expected item must be found in actual (greedy matching with consumption).
+     * Extra tool calls in actual are OK.
+     */
+    private evaluateSuperset;
+    /**
+     * Subset mode: every actual tool call must be in the allowed list.
+     * Expected items are reusable (not consumed) - they define the allowed set.
+     * If every actual call matches at least one expected item, score is 1.
+     */
+    private evaluateSubset;
+}
+/**
+ * Deterministic assertion evaluators.
+ *
+ * Pure functions that check agent output against simple conditions
+ * and return a binary score (0 or 1) with descriptive hits/misses.
+ */
+type AssertionResult = {
+    score: number;
+    hits: string[];
+    misses: string[];
+};
+/** Checks if `output` contains the given `value` substring. */
+declare function runContainsAssertion(output: string, value: string): AssertionResult;
+/** Checks if `output` matches the given regex `pattern`. */
+declare function runRegexAssertion(output: string, pattern: string): AssertionResult;
+/** Checks if `output` is valid JSON. */
+declare function runIsJsonAssertion(output: string): AssertionResult;
+/** Checks if `output` exactly equals `value` (both trimmed). */
+declare function runEqualsAssertion(output: string, value: string): AssertionResult;
+/**
+ * Extensible evaluator registry.
+ *
+ * Replaces the hardcoded switch/case dispatch in the orchestrator with
+ * a registry of named factory functions. Built-in evaluators are registered
+ * at startup; users can add custom evaluators via `defineAssertion()` in
+ * `@agentv/eval` or by dropping files in `.agentv/assertions/`.
+ */
+/**
+ * Context passed to evaluator factory functions during creation.
+ * Contains shared resources needed by evaluator instances.
+ */
+interface EvaluatorDispatchContext {
+    /** Shared LLM judge provider (resolved at suite level) */
+    readonly judgeProvider?: Provider;
+    /** Function to resolve target names to providers */
+    readonly targetResolver?: TargetResolver;
+    /** Available target names for code judges */
+    readonly availableTargets?: readonly string[];
+    /** Agent timeout in ms */
+    readonly agentTimeoutMs?: number;
+    /** Directory containing the eval file (for composite member resolution) */
+    readonly evalFileDir?: string;
+    /** Shared LLM judge evaluator instance */
+    readonly llmJudge: Evaluator;
+    /** Reference to the registry itself (for composite evaluators that need to create children) */
+    readonly registry: EvaluatorRegistry;
+}
+/**
+ * Factory function that creates an Evaluator instance from a config.
+ *
+ * Factory functions handle all type-specific initialization logic:
+ * - Reading prompt files for LLM judges
+ * - Resolving script paths for code judges
+ * - Creating adapter evaluators for deterministic assertions
+ */
+type EvaluatorFactoryFn = (config: EvaluatorConfig, context: EvaluatorDispatchContext) => Evaluator | Promise<Evaluator>;
+/**
+ * Registry of evaluator factory functions keyed by evaluator type name.
+ *
+ * Built-in evaluators are registered at startup. Custom evaluators can be
+ * registered via the `register()` method or discovered from `.agentv/assertions/`.
+ */
+declare class EvaluatorRegistry {
+    private readonly factories;
+    /** Register a factory function for an evaluator type. */
+    register(type: string, factory: EvaluatorFactoryFn): this;
+    /** Get the factory function for an evaluator type. */
+    get(type: string): EvaluatorFactoryFn | undefined;
+    /** Check if a factory is registered for the given type. */
+    has(type: string): boolean;
+    /** List all registered evaluator type names. */
+    list(): string[];
+    /**
+     * Create an evaluator instance from a config, using the registered factory.
+     * Throws if no factory is registered for the evaluator type.
+     */
+    create(config: EvaluatorConfig, context: EvaluatorDispatchContext): Promise<Evaluator>;
+}
+/**
+ * Adapter that wraps a synchronous assertion function as an Evaluator.
+ * Used for deterministic assertions (contains, regex, is_json, equals).
+ */
+declare class DeterministicAssertionEvaluator implements Evaluator {
+    private readonly assertFn;
+    readonly kind: string;
+    constructor(kind: string, assertFn: (context: EvaluationContext) => EvaluationScore);
+    evaluate(context: EvaluationContext): EvaluationScore;
 }
 type MaybePromise<T> = T | Promise<T>;
@@ -1339,7 +2128,7 @@ interface EvaluationCache {
     set(key: string, value: ProviderResponse): MaybePromise<void>;
 }
 interface RunEvalCaseOptions {
-    readonly evalCase: EvalCase;
+    readonly evalCase: EvalTest;
     readonly provider: Provider;
     readonly target: ResolvedTarget;
     readonly evaluators: Partial<Record<string, Evaluator>> & {
@@ -1356,10 +2145,26 @@ interface RunEvalCaseOptions {
     readonly targetResolver?: (name: string) => Provider | undefined;
     /** List of available target names for code judges */
     readonly availableTargets?: readonly string[];
+    /** Unique identifier for the evaluation run (used for workspace management) */
+    readonly evalRunId?: string;
+    /** Keep workspace on success (default: cleanup on success, keep on failure) */
+    readonly keepWorkspaces?: boolean;
+    /** Force cleanup of workspaces even on failure */
+    readonly cleanupWorkspaces?: boolean;
+    /** Pre-created shared workspace path (shared across tests in a suite) */
+    readonly sharedWorkspacePath?: string;
+    /** Pre-initialized baseline commit for shared workspace */
+    readonly sharedBaselineCommit?: string;
+    /** Suite-level .code-workspace file (resolved from workspace.template) */
+    readonly suiteWorkspaceFile?: string;
+    /** Real-time observability callbacks passed to the provider */
+    readonly streamCallbacks?: ProviderStreamCallbacks;
+    /** Evaluator type registry (with custom assertions discovered) */
+    readonly typeRegistry?: EvaluatorRegistry;
 }
 interface ProgressEvent {
     readonly workerId: number;
-    readonly evalId: string;
+    readonly testId: string;
     readonly status: 'pending' | 'running' | 'completed' | 'failed';
     readonly startedAt?: number;
     readonly completedAt?: number;
@@ -1378,19 +2183,367 @@ interface RunEvaluationOptions {
     readonly cache?: EvaluationCache;
     readonly useCache?: boolean;
     readonly now?: () => Date;
-    /** Filter eval cases by ID pattern (glob supported, e.g., "summary-*") */
+    /** Filter tests by ID pattern (glob supported, e.g., "summary-*") */
     readonly filter?: string;
     readonly verbose?: boolean;
     readonly maxConcurrency?: number;
-    readonly evalCases?: readonly EvalCase[];
+    readonly evalCases?: readonly EvalTest[];
     readonly onResult?: (result: EvaluationResult) => MaybePromise<void>;
     readonly onProgress?: (event: ProgressEvent) => MaybePromise<void>;
+    /** Keep workspace on success (default: cleanup on success, keep on failure) */
+    readonly keepWorkspaces?: boolean;
+    /** Force cleanup of workspaces even on failure */
+    readonly cleanupWorkspaces?: boolean;
+    /** Trial configuration for running eval cases multiple times */
+    readonly trials?: TrialsConfig;
+    /** Real-time observability callbacks passed to the provider */
+    readonly streamCallbacks?: ProviderStreamCallbacks;
 }
 declare function runEvaluation(options: RunEvaluationOptions): Promise<readonly EvaluationResult[]>;
 declare function runEvalCase(options: RunEvalCaseOptions): Promise<EvaluationResult>;
+/**
+ * Programmatic API for running evaluations.
+ *
+ * Provides `evaluate()` — a high-level function for using AgentV as a library
+ * instead of a CLI. The config shape mirrors the YAML structure for easy
+ * translation between file-based and programmatic usage.
+ *
+ * @example Inline tests
+ * ```typescript
+ * import { evaluate } from '@agentv/core';
+ *
+ * const results = await evaluate({
+ *   tests: [
+ *     {
+ *       id: 'capital',
+ *       input: 'What is the capital of France?',
+ *       expected_output: 'Paris',
+ *       assert: [{ type: 'contains', value: 'Paris' }],
+ *     },
+ *   ],
+ *   target: { provider: 'mock_agent' },
+ * });
+ *
+ * console.log(results.summary.passed, 'passed');
+ * ```
+ *
+ * @example File-based
+ * ```typescript
+ * const results = await evaluate({
+ *   specFile: './evals/EVAL.yaml',
+ *   target: { provider: 'claude_agent' },
+ * });
+ * ```
+ *
+ * @module
+ */
+/**
+ * Inline test definition for the programmatic API.
+ * Mirrors the YAML test structure.
+ */
+interface EvalTestInput {
+    /** Unique test identifier */
+    readonly id: string;
+    /** What the response should accomplish */
+    readonly criteria?: string;
+    /** Input to the agent (string or message array) */
+    readonly input: string | readonly {
+        role: string;
+        content: string;
+    }[];
+    /** Expected reference output */
+    readonly expected_output?: string;
+    /** Assertion evaluators */
+    readonly assert?: readonly EvalAssertionInput[];
+    /** Arbitrary metadata */
+    readonly metadata?: Record<string, unknown>;
+}
+/**
+ * Inline assertion definition for the programmatic API.
+ * Matches the YAML `assert` block structure.
+ */
+interface EvalAssertionInput {
+    /** Assertion type (e.g., 'contains', 'llm_judge', 'code_judge') */
+    readonly type: string;
+    /** Display name */
+    readonly name?: string;
+    /** Value for deterministic assertions (contains, equals, regex) */
+    readonly value?: string;
+    /** Weight for scoring */
+    readonly weight?: number;
+    /** Whether this assertion is required to pass */
+    readonly required?: boolean | number;
+    /** Prompt file for llm_judge */
+    readonly prompt?: string;
+    /** Script for code_judge */
+    readonly script?: string | readonly string[];
+    /** Additional config passed to the assertion */
+    readonly config?: Record<string, unknown>;
+    /** Nested assertions for composite type */
+    readonly assert?: readonly EvalAssertionInput[];
+    /** Rubric criteria for rubrics type */
+    readonly criteria?: readonly (string | {
+        id?: string;
+        outcome: string;
+        weight?: number;
+    })[];
+    /** Additional properties */
+    readonly [key: string]: unknown;
+}
+/**
+ * Configuration for `evaluate()`.
+ * Accepts either inline tests or a spec file path.
+ */
+interface EvalConfig {
+    /** Inline test definitions (mutually exclusive with specFile) */
+    readonly tests?: readonly EvalTestInput[];
+    /** Path to an EVAL.yaml spec file (mutually exclusive with tests) */
+    readonly specFile?: string;
+    /** Target provider configuration */
+    readonly target?: TargetDefinition;
+    /** Suite-level assertions applied to all tests */
+    readonly assert?: readonly EvalAssertionInput[];
+    /** Filter tests by ID pattern (glob supported) */
+    readonly filter?: string;
+    /** Maximum concurrent workers (default: 3) */
+    readonly workers?: number;
+    /** Maximum retries on failure (default: 2) */
+    readonly maxRetries?: number;
+    /** Agent timeout in milliseconds (default: 120000) */
+    readonly agentTimeoutMs?: number;
+    /** Enable response caching */
+    readonly cache?: boolean;
+    /** Verbose logging */
+    readonly verbose?: boolean;
+    /** Callback for each completed result */
+    readonly onResult?: (result: EvaluationResult) => void;
+}
+/**
+ * Summary statistics for an evaluation run.
+ */
+interface EvalSummary {
+    /** Total number of test cases */
+    readonly total: number;
+    /** Number of passing test cases (score >= 0.8) */
+    readonly passed: number;
+    /** Number of failing test cases (score < 0.5) */
+    readonly failed: number;
+    /** Number of borderline test cases (0.5 <= score < 0.8) */
+    readonly borderline: number;
+    /** Total duration in milliseconds */
+    readonly durationMs: number;
+    /** Mean score across all cases */
+    readonly meanScore: number;
+}
+/**
+ * Result of an `evaluate()` call.
+ */
+interface EvalRunResult {
+    /** Individual test case results */
+    readonly results: readonly EvaluationResult[];
+    /** Aggregate summary statistics */
+    readonly summary: EvalSummary;
+}
+/**
+ * Run an evaluation suite against a target provider.
+ *
+ * Accepts either inline test definitions or a path to an EVAL.yaml spec file.
+ * The config shape mirrors the YAML structure — users can translate between
+ * file-based and programmatic usage 1:1.
+ *
+ * @param config - Evaluation configuration
+ * @returns Typed evaluation results with summary statistics
+ *
+ * @example Inline tests with assertions
+ * ```typescript
+ * const { results, summary } = await evaluate({
+ *   tests: [
+ *     {
+ *       id: 'greeting',
+ *       input: 'Say hello',
+ *       assert: [{ type: 'contains', value: 'hello' }],
+ *     },
+ *   ],
+ *   target: { provider: 'mock_agent' },
+ * });
+ * console.log(`${summary.passed}/${summary.total} passed`);
+ * ```
+ *
+ * @example Load from YAML
+ * ```typescript
+ * const { summary } = await evaluate({
+ *   specFile: './evals/my-eval.yaml',
+ *   filter: 'greeting-*',
+ * });
+ * ```
+ */
+declare function evaluate(config: EvalConfig): Promise<EvalRunResult>;
+/**
+ * Typed configuration file support for AgentV.
+ *
+ * Provides `defineConfig()` for use in `agentv.config.ts` files. Supports
+ * auto-discovery, Zod validation, and IDE autocomplete.
+ *
+ * @example
+ * ```typescript
+ * // agentv.config.ts
+ * import { defineConfig } from '@agentv/core';
+ *
+ * export default defineConfig({
+ *   execution: {
+ *     workers: 5,
+ *     maxRetries: 2,
+ *     agentTimeoutMs: 120_000,
+ *   },
+ *   output: {
+ *     format: 'jsonl',
+ *     dir: './results',
+ *   },
+ * });
+ * ```
+ *
+ * @module
+ */
+/**
+ * Schema for AgentV project-level configuration.
+ */
+declare const AgentVConfigSchema: z.ZodObject<{
+    /** Default execution settings */
+    execution: z.ZodOptional<z.ZodObject<{
+        /** Number of parallel workers (default: 3) */
+        workers: z.ZodOptional<z.ZodNumber>;
+        /** Maximum retries on failure (default: 2) */
+        maxRetries: z.ZodOptional<z.ZodNumber>;
+        /** Agent timeout in milliseconds (default: 120000) */
+        agentTimeoutMs: z.ZodOptional<z.ZodNumber>;
+    }, "strip", z.ZodTypeAny, {
+        workers?: number | undefined;
+        maxRetries?: number | undefined;
+        agentTimeoutMs?: number | undefined;
+    }, {
+        workers?: number | undefined;
+        maxRetries?: number | undefined;
+        agentTimeoutMs?: number | undefined;
+    }>>;
+    /** Output settings */
+    output: z.ZodOptional<z.ZodObject<{
+        /** Output format */
+        format: z.ZodOptional<z.ZodEnum<["jsonl", "yaml", "json", "xml"]>>;
+        /** Output directory */
+        dir: z.ZodOptional<z.ZodString>;
+    }, "strip", z.ZodTypeAny, {
+        dir?: string | undefined;
+        format?: "yaml" | "jsonl" | "json" | "xml" | undefined;
+    }, {
+        dir?: string | undefined;
+        format?: "yaml" | "jsonl" | "json" | "xml" | undefined;
+    }>>;
+    /** Response caching */
+    cache: z.ZodOptional<z.ZodObject<{
+        /** Enable response caching */
+        enabled: z.ZodOptional<z.ZodBoolean>;
+        /** Cache file path */
+        path: z.ZodOptional<z.ZodString>;
+    }, "strip", z.ZodTypeAny, {
+        enabled?: boolean | undefined;
+        path?: string | undefined;
+    }, {
+        enabled?: boolean | undefined;
+        path?: string | undefined;
+    }>>;
+    /** Cost and duration limits */
+    limits: z.ZodOptional<z.ZodObject<{
+        /** Maximum cost per run in USD */
+        maxCostUsd: z.ZodOptional<z.ZodNumber>;
+        /** Maximum duration per run in milliseconds */
+        maxDurationMs: z.ZodOptional<z.ZodNumber>;
+    }, "strip", z.ZodTypeAny, {
+        maxDurationMs?: number | undefined;
+        maxCostUsd?: number | undefined;
+    }, {
+        maxDurationMs?: number | undefined;
+        maxCostUsd?: number | undefined;
+    }>>;
+}, "strip", z.ZodTypeAny, {
+    output?: {
+        dir?: string | undefined;
+        format?: "yaml" | "jsonl" | "json" | "xml" | undefined;
+    } | undefined;
+    execution?: {
+        workers?: number | undefined;
+        maxRetries?: number | undefined;
+        agentTimeoutMs?: number | undefined;
+    } | undefined;
+    cache?: {
+        enabled?: boolean | undefined;
+        path?: string | undefined;
+    } | undefined;
+    limits?: {
+        maxDurationMs?: number | undefined;
+        maxCostUsd?: number | undefined;
+    } | undefined;
+}, {
+    output?: {
+        dir?: string | undefined;
+        format?: "yaml" | "jsonl" | "json" | "xml" | undefined;
+    } | undefined;
+    execution?: {
+        workers?: number | undefined;
+        maxRetries?: number | undefined;
+        agentTimeoutMs?: number | undefined;
+    } | undefined;
+    cache?: {
+        enabled?: boolean | undefined;
+        path?: string | undefined;
+    } | undefined;
+    limits?: {
+        maxDurationMs?: number | undefined;
+        maxCostUsd?: number | undefined;
+    } | undefined;
+}>;
+/**
+ * AgentV project-level configuration type.
+ * Inferred from the Zod schema for full type safety.
+ */
+type AgentVConfig = z.infer<typeof AgentVConfigSchema>;
+/**
+ * Define a typed AgentV configuration.
+ *
+ * Use this in `agentv.config.ts` at your project root. The configuration
+ * is validated at load time and provides full IDE autocomplete.
+ *
+ * @param config - Configuration object
+ * @returns Validated configuration
+ *
+ * @example
+ * ```typescript
+ * import { defineConfig } from '@agentv/core';
+ *
+ * export default defineConfig({
+ *   execution: { workers: 5 },
+ *   output: { format: 'jsonl', dir: './results' },
+ *   limits: { maxCostUsd: 10.0 },
+ * });
+ * ```
+ */
+declare function defineConfig(config: AgentVConfig): AgentVConfig;
+/**
+ * Discover and load an AgentV config file from the project root.
+ *
+ * Searches for config files in discovery order. Returns null if
+ * no config file is found.
+ *
+ * @param projectRoot - Project root directory to search from
+ * @returns Loaded and validated config, or null if not found
+ */
+declare function loadTsConfig(projectRoot: string): Promise<AgentVConfig | null>;
 interface GenerateRubricsOptions {
-    readonly expectedOutcome: string;
+    readonly criteria: string;
     readonly question?: string;
     readonly referenceAnswer?: string;
     readonly provider: Provider;
@@ -1400,9 +2553,339 @@ interface GenerateRubricsOptions {
  */
 declare function generateRubrics(options: GenerateRubricsOptions): Promise<readonly RubricItem[]>;
+/**
+ * Error thrown when the template path does not exist.
+ */
+declare class TemplateNotFoundError extends Error {
+    constructor(templatePath: string);
+}
+/**
+ * Error thrown when the template path is a file instead of a directory.
+ */
+declare class TemplateNotDirectoryError extends Error {
+    constructor(templatePath: string);
+}
+/**
+ * Error thrown when workspace creation fails, e.g. due to insufficient disk space or another I/O error.
+ */
+declare class WorkspaceCreationError extends Error {
+    readonly cause?: Error | undefined;
+    constructor(message: string, cause?: Error | undefined);
+}
+/**
+ * Get the workspace path for a specific eval case.
+ *
+ * Workspace structure:
+ * {workspaceRoot}/{evalRunId}/{caseId}
+ *
+ * Example:
+ * ~/.agentv/workspaces/abc123/case-01
+ *
+ * @param evalRunId - The unique identifier for the evaluation run
+ * @param caseId - The unique identifier for the evaluation case
+ * @param workspaceRoot - Optional custom workspace root directory (defaults to ~/.agentv/workspaces)
+ * @returns Absolute path to the workspace directory
+ */
+declare function getWorkspacePath(evalRunId: string, caseId: string, workspaceRoot?: string): string;
+/**
+ * Create a temporary workspace by copying a template directory.
+ *
+ * The workspace is created at {workspaceRoot}/{evalRunId}/{caseId}/ (workspaceRoot defaults to ~/.agentv/workspaces).
+ * The .git directory from the template is skipped during copy.
+ *
+ * @param templatePath - Absolute path to the template directory
+ * @param evalRunId - The unique identifier for the evaluation run
+ * @param caseId - The unique identifier for the evaluation case
+ * @param workspaceRoot - Optional custom workspace root directory
+ * @returns Absolute path to the created workspace directory
+ * @throws TemplateNotFoundError if the template path does not exist
+ * @throws TemplateNotDirectoryError if the template path is not a directory
+ * @throws WorkspaceCreationError if there's an error creating the workspace
+ */
+declare function createTempWorkspace(templatePath: string, evalRunId: string, caseId: string, workspaceRoot?: string): Promise<string>;
+/**
+ * Remove a single workspace directory.
+ *
+ * @param workspacePath - Absolute path to the workspace directory to remove
+ * @throws Error if the cleanup fails
+ */
+declare function cleanupWorkspace(workspacePath: string): Promise<void>;
+/**
+ * Remove all workspaces for an evaluation run.
+ *
+ * This removes the entire {workspaceRoot}/{evalRunId} directory,
+ * cleaning up all case workspaces for that run.
+ *
+ * @param evalRunId - The unique identifier for the evaluation run
+ * @param workspaceRoot - Optional custom workspace root directory
+ * @throws Error if the cleanup fails
+ */
+declare function cleanupEvalWorkspaces(evalRunId: string, workspaceRoot?: string): Promise<void>;
+/**
+ * Context passed to workspace lifecycle scripts via stdin.
+ */
+interface ScriptExecutionContext {
+    readonly workspacePath: string;
+    readonly testId: string;
+    readonly evalRunId: string;
+    readonly caseInput?: string;
+    readonly caseMetadata?: Record<string, unknown>;
+}
+type ScriptFailureMode = 'fatal' | 'warn';
+/**
+ * Executes a workspace lifecycle script (before_all, after_all, before_each, after_each).
+ *
+ * @param config - Workspace script configuration (script, timeout_ms, cwd)
+ * @param context - Context passed to script via stdin (JSON)
+ * @param failureMode - 'fatal' throws on a non-zero exit; 'warn' logs a warning instead
+ * @returns Captured stdout from the script
+ * @throws Error if script exits with non-zero code (fatal mode) or times out
+ */
+declare function executeWorkspaceScript(config: WorkspaceScriptConfig, context: ScriptExecutionContext, failureMode?: ScriptFailureMode): Promise<string>;
+/**
+ * Initialize a git baseline for workspace file change tracking.
+ *
+ * Runs `git init` directly in the workspace, stages all files, and creates
+ * a baseline commit. Returns the commit hash for later diffing.
+ */
+declare function initializeBaseline(workspacePath: string): Promise<string>;
+/**
+ * Capture file changes from workspace relative to the baseline commit.
+ * Returns a unified diff string, or empty string if no changes.
+ *
+ * Supports nested git repos (e.g. cloned dependencies): stages files inside
+ * each child repo first, then uses `--submodule=diff` to expand submodule
+ * changes into individual file diffs rather than opaque gitlink hashes.
+ */
+declare function captureFileChanges(workspacePath: string, baselineCommit: string): Promise<string>;
+interface ResolvedWorkspaceTemplate {
+    /** Directory to copy as the working directory (for createTempWorkspace / request.cwd) */
+    readonly dir: string;
+    /** Optional .code-workspace file for VS Code providers */
+    readonly workspaceFile?: string;
+}
+/**
+ * Resolves a workspace.template value into a directory + optional .code-workspace file.
+ *
+ * Resolution rules:
+ * - Path to a .code-workspace file → dir = its parent directory, workspaceFile = the file
+ * - Directory with exactly 1 .code-workspace file → dir = directory, workspaceFile = that file
+ * - Directory with 2 or more .code-workspace files → dir = directory, workspaceFile = template.code-workspace (if present)
+ * - Directory with 0 .code-workspace files → dir = directory, workspaceFile = undefined
+ */
+declare function resolveWorkspaceTemplate(templatePath: string | undefined): Promise<ResolvedWorkspaceTemplate | undefined>;
+/**
+ * File-based LLM response cache.
+ * Stores provider responses as JSON files keyed by SHA-256 hash.
+ * Directory structure: <cache_path>/<first-2-chars>/<full-hash>.json
+ */
+declare class ResponseCache implements EvaluationCache {
+    private readonly cachePath;
+    constructor(cachePath?: string);
+    get(key: string): Promise<ProviderResponse | undefined>;
+    set(key: string, value: ProviderResponse): Promise<void>;
+    private keyToPath;
+}
+/**
+ * Determine whether caching should be active for a given run.
+ *
+ * Precedence:
+ *   1. --no-cache CLI flag → always disabled
+ *   2. --cache CLI flag OR execution.cache YAML → enabled
+ *   3. Default → disabled (safe for variability testing)
+ */
+declare function shouldEnableCache(params: {
+    cliCache: boolean;
+    cliNoCache: boolean;
+    yamlCache?: boolean;
+}): boolean;
+/**
+ * Check whether caching should be skipped for a target with temperature > 0.
+ * Non-deterministic responses should not be cached unless explicitly forced.
+ */
+declare function shouldSkipCacheForTemperature(targetConfig: Record<string, unknown>): boolean;
+/**
+ * Recursively converts all keys in an object from camelCase to snake_case.
+ * This is used to convert TypeScript internal representations to snake_case
+ * for Python ecosystem compatibility in JSON payloads.
+ *
+ * Conversion rules:
+ * - Object keys: camelCase -> snake_case
+ * - Array elements: recursively converted
+ * - Primitives: returned unchanged
+ * - null/undefined: returned unchanged
+ *
+ * @param obj - The object to convert (can be any JSON-serializable value)
+ * @returns A new object with all keys converted to snake_case
+ */
+declare function toSnakeCaseDeep(obj: unknown): unknown;
+/**
+ * Recursively converts all keys in an object from snake_case to camelCase.
+ * This is used by optional SDK helpers to map wire payloads into TypeScript-friendly
+ * shapes.
+ *
+ * @param obj - The object to convert (can be any JSON-serializable value)
+ * @returns A new object with all keys converted to camelCase
+ */
+declare function toCamelCaseDeep(obj: unknown): unknown;
+/**
+ * Trims an EvaluationResult for baseline storage.
+ * Strips large debug/audit fields (denylist approach) while preserving
+ * all fields needed for regression comparison (scores, hits, misses, etc.).
+ *
+ * Returns a new object — the input is not mutated.
+ */
+declare function trimBaselineResult(result: EvaluationResult): EvaluationResult;
+/** Options for configuring the OTel trace exporter. */
+interface OtelExportOptions {
+    /** OTLP endpoint URL */
+    readonly endpoint?: string;
+    /** Custom headers (e.g., auth) */
+    readonly headers?: Record<string, string>;
+    /** Whether to include message content in spans */
+    readonly captureContent?: boolean;
+    /** Service name for OTel resource */
+    readonly serviceName?: string;
+    /** When true, group messages into turn spans for multi-turn evals */
+    readonly groupTurns?: boolean;
+    /** Path to write OTLP JSON file (importable by OTel backends) */
+    readonly otlpFilePath?: string;
+    /** Path to write human-readable simple JSONL trace file */
+    readonly traceFilePath?: string;
+}
+/** Preset configuration for a known observability backend. */
+interface OtelBackendPreset {
+    readonly name: string;
+    readonly endpoint: string;
+    readonly headers: (env: Record<string, string | undefined>) => Record<string, string>;
+}
+declare const OTEL_BACKEND_PRESETS: Record<string, OtelBackendPreset>;
+type OtelApi = any;
+type Tracer = any;
+declare class OtelTraceExporter { // Exports evaluation results as OTel traces; configured through OtelExportOptions.
+    private readonly options;
+    private provider;
+    private tracer;
+    private api;
+    private W3CPropagator;
+    constructor(options: OtelExportOptions);
+    /**
+     * Initialize the OTel SDK.
+     *
+     * @returns A promise that resolves to `false` if OTel packages are not available.
+     */
+    init(): Promise<boolean>;
+    /**
+     * Export a single evaluation result as an OTel trace.
+     *
+     * @param result - The evaluation result to export.
+     * @returns A promise that settles once the export call has finished.
+     */
+    exportResult(result: EvaluationResult): Promise<void>;
+    /** Flush pending spans and shut down. */
+    shutdown(): Promise<void>;
+    /**
+     * Create a streaming observer for real-time span export.
+     *
+     * @returns An `OtelStreamingObserver`, or `null`.
+     *   NOTE(review): the condition that yields `null` is not visible in this
+     *   declaration (presumably when `init()` has not succeeded); confirm
+     *   against the implementation.
+     */
+    createStreamingObserver(): OtelStreamingObserver | null;
+    private exportMessage;
+}
+/**
+ * Streaming observer that creates OTel spans in real-time during eval execution.
+ * Spans are exported immediately via SimpleSpanProcessor as each tool call / LLM response completes.
+ *
+ * Lifecycle, as declared by the members below: `startEvalCase` creates the root
+ * span, `onToolCall` / `onLlmCall` create tool and LLM spans, and
+ * `finalizeEvalCase` finalizes the root span.
+ */
+declare class OtelStreamingObserver {
+    private readonly tracer;
+    private readonly api;
+    private readonly captureContent;
+    private readonly parentCtx?;
+    private rootSpan;
+    private rootCtx;
+    constructor(tracer: Tracer, api: OtelApi, captureContent: boolean, parentCtx?: any | undefined);
+    /**
+     * Create root eval span immediately (visible in backend right away).
+     *
+     * `dataset` is optional; `testId` and `target` are required strings.
+     */
+    startEvalCase(testId: string, target: string, dataset?: string): void;
+    /**
+     * Create and immediately export a tool span.
+     *
+     * `input` and `output` are typed `unknown`, so any value is accepted;
+     * `toolCallId` is optional.
+     * NOTE(review): the leading underscore on `_durationMs` suggests the value
+     * is accepted but not used; confirm against the implementation.
+     */
+    onToolCall(name: string, input: unknown, output: unknown, _durationMs: number, toolCallId?: string): void;
+    /**
+     * Create and immediately export an LLM span.
+     *
+     * `tokenUsage` is optional.
+     */
+    onLlmCall(model: string, tokenUsage?: ProviderTokenUsage): void;
+    /**
+     * Finalize root span with score/verdict after evaluation completes.
+     *
+     * `error` is an optional string.
+     */
+    finalizeEvalCase(score: number, error?: string): void;
+    /** Get ProviderStreamCallbacks for passing to providers */
+    getStreamCallbacks(): ProviderStreamCallbacks;
+}
+type ReadableSpan$1 = any; // Untyped alias for the span elements accepted by OtlpJsonFileExporter.export.
+/**
+ * SpanExporter that writes OTLP JSON (the standard OTel wire format) to a file.
+ * The file can be imported by any OTel-compatible backend.
+ *
+ * Public surface, as declared here:
+ * - The constructor takes the file path as a string.
+ * - `export(spans, resultCallback)` returns nothing; the outcome is reported
+ *   through `resultCallback`, which receives an object with a numeric `code`.
+ * - `shutdown()` and `forceFlush()` each return a promise with no value.
+ *
+ * NOTE(review): the class keeps a private `spans` member and a private `flush`
+ * method, which suggests spans are buffered before being written; confirm
+ * when the file is actually written against the implementation.
+ */
+declare class OtlpJsonFileExporter {
+    private spans;
+    private filePath;
+    constructor(filePath: string);
+    export(spans: ReadableSpan$1[], resultCallback: (result: {
+        code: number;
+    }) => void): void;
+    shutdown(): Promise<void>;
+    forceFlush(): Promise<void>;
+    private flush;
+}
+type ReadableSpan = any; // Untyped alias for the span elements accepted by SimpleTraceFileExporter.export.
+/**
+ * SpanExporter that writes human-readable JSONL (one line per root span).
+ * Designed for quick debugging and analysis without OTel tooling.
+ *
+ * Public surface, as declared here:
+ * - The constructor takes the file path as a string.
+ * - `export(spans, resultCallback)` returns nothing; the outcome is reported
+ *   through `resultCallback`, which receives an object with a numeric `code`.
+ * - `shutdown()` and `forceFlush()` each return a promise with no value.
+ *
+ * NOTE(review): the private `stream`, `streamReady`, `pendingWrites` and
+ * `ensureStream` members suggest the output stream is opened lazily and that
+ * in-flight writes are tracked; confirm against the implementation.
+ */
+declare class SimpleTraceFileExporter {
+    private stream;
+    private filePath;
+    private streamReady;
+    private pendingWrites;
+    private _shuttingDown;
+    constructor(filePath: string);
+    private ensureStream;
+    export(spans: ReadableSpan[], resultCallback: (result: {
+        code: number;
+    }) => void): void;
+    shutdown(): Promise<void>;
+    forceFlush(): Promise<void>;
+    private collectChildren;
+    private buildSimpleRecord;
+}
+/**
+ * Factory functions for all built-in evaluator types.
+ *
+ * Each factory creates an Evaluator instance from an EvaluatorConfig,
+ * handling type-specific initialization logic. These are registered into
+ * the EvaluatorRegistry at startup.
+ */
+/**
+ * Create a new EvaluatorRegistry with all built-in evaluator types registered.
+ *
+ * Takes no arguments.
+ *
+ * @returns The newly created registry.
+ */
+declare function createBuiltinRegistry(): EvaluatorRegistry;
+/**
+ * Convention-based discovery of custom assertion scripts.
+ *
+ * Scans `.agentv/assertions/` for TypeScript/JavaScript files and registers
+ * them as code_judge evaluators in the registry. The file name (without
+ * extension) becomes the evaluator type name.
+ *
+ * Example: `.agentv/assertions/sentiment.ts` → type "sentiment" in EVAL.yaml
+ */
+/**
+ * Discover custom assertion scripts from `.agentv/assertions/` and register
+ * them as evaluator types in the registry.
+ *
+ * The registry passed in is the one that receives the registrations. The
+ * result is delivered asynchronously.
+ *
+ * @param registry - The evaluator registry to register discovered assertions into
+ * @param baseDir - The base directory to search from (typically project root or eval file dir)
+ * @returns A promise resolving to the names of discovered assertion types
+ */
+declare function discoverAssertions(registry: EvaluatorRegistry, baseDir: string): Promise<string[]>;
 type AgentKernel = {
     status: string; // Only declared field; any string is accepted by this type.
 };
 declare function createAgentKernel(): AgentKernel; // Takes no arguments and returns an AgentKernel.
-export { type AgentKernel, type AnthropicResolvedConfig, type AssistantTestMessage, type AzureResolvedConfig, type ChildEvaluatorResult, type ClaudeCodeResolvedConfig, type CliResolvedConfig, CodeEvaluator, type CodeEvaluatorConfig, type CodeEvaluatorOptions, type CompositeAggregatorConfig, CompositeEvaluator, type CompositeEvaluatorConfig, type CompositeEvaluatorOptions, type CopilotResolvedConfig, CostEvaluator, type CostEvaluatorConfig, type CostEvaluatorOptions, DEFAULT_EXPLORATION_TOOLS, type EnsureSubagentsOptions, type EnsureSubagentsResult, type EnvLookup, type EvalCase, type EvaluationCache, type EvaluationContext, type EvaluationResult, type EvaluationScore, type EvaluationVerdict, type Evaluator, type EvaluatorConfig, type EvaluatorFactory, type EvaluatorKind, type EvaluatorResult, type ExecutionMetrics, FieldAccuracyEvaluator, type FieldAccuracyEvaluatorConfig, type FieldAccuracyEvaluatorOptions, type FieldAggregationType, type FieldConfig, type FieldMatchType, type GeminiResolvedConfig, type GenerateRubricsOptions, type JsonObject, type JsonPrimitive, type JsonValue, LatencyEvaluator, type LatencyEvaluatorConfig, type LatencyEvaluatorOptions, LlmJudgeEvaluator, type LlmJudgeEvaluatorConfig, type LlmJudgeEvaluatorOptions, type MockResolvedConfig, type PiAgentSdkResolvedConfig, type PiCodingAgentResolvedConfig, type ProgressEvent, type PromptInputs, type PromptScriptConfig, type Provider, type ProviderKind, type ProviderRequest, type ProviderResponse, type ProviderTokenUsage, type ResolvedTarget, type RubricItem, type RunEvalCaseOptions, type RunEvaluationOptions, type ScoreRange, type SystemTestMessage, TEST_MESSAGE_ROLES, type TargetAccessConfig, type TargetDefinition, type TestMessage, type TestMessageContent, type TestMessageRole, type TokenUsage, TokenUsageEvaluator, type TokenUsageEvaluatorConfig, type TokenUsageEvaluatorOptions, type ToolTestMessage, ToolTrajectoryEvaluator, type ToolTrajectoryEvaluatorConfig, type ToolTrajectoryEvaluatorOptions, 
type ToolTrajectoryExpectedItem, type TraceSummary, type UserTestMessage, type VSCodeResolvedConfig, avgToolDurationMs, buildDirectoryChain, buildOutputSchema, buildPromptInputs, buildSearchRoots, clampScore, computeTraceSummary, consumeClaudeCodeLogEntries, consumeCodexLogEntries, consumeCopilotCliLogEntries, consumePiLogEntries, createAgentKernel, createProvider, deepEqual, detectFormat, ensureVSCodeSubagents, executeScript, explorationRatio, extractJsonBlob, fileExists, findGitRoot, freeformEvaluationSchema, generateRubrics, getHitCount, isEvaluatorKind, isGuidelineFile, isJsonObject, isJsonValue, isNonEmptyString, isTestMessage, isTestMessageRole, listTargetNames, loadEvalCases, mergeExecutionMetrics, normalizeLineEndings, parseJsonFromText, parseJsonSafe, readJsonFile, readTargetDefinitions, readTestSuiteMetadata, readTextFile, resolveAndCreateProvider, resolveFileReference, resolveTargetDefinition, runEvalCase, runEvaluation, scoreToVerdict, subscribeToClaudeCodeLogEntries, subscribeToCodexLogEntries, subscribeToCopilotCliLogEntries, subscribeToPiLogEntries, tokensPerTool };
+export { AgentJudgeEvaluator, type AgentJudgeEvaluatorConfig, type AgentJudgeEvaluatorOptions, type AgentKernel, type AgentVConfig$1 as AgentVConfig, type AgentVConfig as AgentVTsConfig, type AnthropicResolvedConfig, type ArgsMatchMode, type AssertionResult, type AssistantTestMessage, type AzureResolvedConfig, type CacheConfig, type ChildEvaluatorResult, type ClaudeResolvedConfig, type CliResolvedConfig, CodeEvaluator, type CodeEvaluatorConfig, type CodeEvaluatorOptions, type CompositeAggregatorConfig, CompositeEvaluator, type CompositeEvaluatorConfig, type CompositeEvaluatorOptions, type ConfidenceIntervalAggregation, type ContainsEvaluatorConfig, type CopilotCliResolvedConfig, type CopilotSdkResolvedConfig, CostEvaluator, type CostEvaluatorConfig, type CostEvaluatorOptions, DEFAULT_EVALUATOR_TEMPLATE, DEFAULT_EVAL_PATTERNS, DEFAULT_EXPLORATION_TOOLS, DeterministicAssertionEvaluator, type EnsureSubagentsOptions, type EnsureSubagentsResult, type EnvLookup, type EqualsEvaluatorConfig, type EvalAssertionInput, type EvalCase, type EvalConfig, type EvalMetadata, type EvalRunResult, type EvalSuiteResult, type EvalSummary, type EvalTest, type EvalTestInput, type EvaluationCache, type EvaluationContext, type EvaluationResult, type EvaluationScore, type EvaluationVerdict, type Evaluator, type EvaluatorConfig, type EvaluatorDispatchContext, type EvaluatorFactory, type EvaluatorFactoryFn, type EvaluatorKind, EvaluatorRegistry, type EvaluatorResult, type ExecutionMetrics, ExecutionMetricsEvaluator, type ExecutionMetricsEvaluatorConfig, type ExecutionMetricsEvaluatorOptions, FieldAccuracyEvaluator, type FieldAccuracyEvaluatorConfig, type FieldAccuracyEvaluatorOptions, type FieldAggregationType, type FieldConfig, type FieldMatchType, type GeminiResolvedConfig, type GenerateRubricsOptions, type IsJsonEvaluatorConfig, type JsonObject, type JsonPrimitive, type JsonValue, LatencyEvaluator, type LatencyEvaluatorConfig, type LatencyEvaluatorOptions, LlmJudgeEvaluator, type 
LlmJudgeEvaluatorConfig, type LlmJudgeEvaluatorOptions, type LlmJudgePromptAssembly, type MeanAggregation, type Message, type MockResolvedConfig, OTEL_BACKEND_PRESETS, type OtelBackendPreset, type OtelExportOptions, OtelStreamingObserver, OtelTraceExporter, OtlpJsonFileExporter, type OutputMessage, type PassAtKAggregation, type PiAgentSdkResolvedConfig, type PiCodingAgentResolvedConfig, type ProgressEvent, type PromptInputs, type PromptScriptConfig, type Provider, type ProviderFactoryFn, type ProviderKind, ProviderRegistry, type ProviderRequest, type ProviderResponse, type ProviderStreamCallbacks, type ProviderTokenUsage, type RegexEvaluatorConfig, type ResolvedTarget, type ResolvedWorkspaceTemplate, ResponseCache, type RubricItem, type RubricsEvaluatorConfig, type RunEvalCaseOptions, type RunEvaluationOptions, type ScoreRange, type ScriptExecutionContext, SimpleTraceFileExporter, type SystemTestMessage, TEST_MESSAGE_ROLES, type TargetAccessConfig, type TargetDefinition, TemplateNotDirectoryError, TemplateNotFoundError, type TestMessage, type TestMessageContent, type TestMessageRole, type TokenUsage, TokenUsageEvaluator, type TokenUsageEvaluatorConfig, type TokenUsageEvaluatorOptions, type ToolCall, type ToolTestMessage, ToolTrajectoryEvaluator, type ToolTrajectoryEvaluatorConfig, type ToolTrajectoryEvaluatorOptions, type ToolTrajectoryExpectedItem, type TraceSummary, type TrialAggregation, type TrialResult, type TrialStrategy, type TrialsConfig, type UserTestMessage, type VSCodeResolvedConfig, type WorkspaceConfig, WorkspaceCreationError, type WorkspaceScriptConfig, assembleLlmJudgePrompt, avgToolDurationMs, buildDirectoryChain, buildOutputSchema, buildPromptInputs, buildRubricOutputSchema, buildScoreRangeOutputSchema, buildSearchRoots, calculateRubricScore, captureFileChanges, clampScore, cleanupEvalWorkspaces, cleanupWorkspace, computeTraceSummary, consumeClaudeLogEntries, consumeCodexLogEntries, consumeCopilotCliLogEntries, consumeCopilotSdkLogEntries, 
consumePiLogEntries, createAgentKernel, createBuiltinProviderRegistry, createBuiltinRegistry, createProvider, createTempWorkspace, deepEqual, defineConfig, detectFormat, discoverAssertions, discoverProviders, ensureVSCodeSubagents, evaluate, executeScript, executeWorkspaceScript, explorationRatio, extractCacheConfig, extractJsonBlob, extractTargetFromSuite, extractTargetsFromSuite, extractTargetsFromTestCase, extractTrialsConfig, fileExists, findGitRoot, freeformEvaluationSchema, generateRubrics, getHitCount, getWorkspacePath, initializeBaseline, isEvaluatorKind, isGuidelineFile, isJsonObject, isJsonValue, isNonEmptyString, isTestMessage, isTestMessageRole, listTargetNames, loadConfig, loadEvalCaseById, loadEvalCases, loadEvalSuite, loadTestById, loadTestSuite, loadTests, loadTsConfig, mergeExecutionMetrics, negateScore, normalizeLineEndings, parseJsonFromText, parseJsonSafe, readJsonFile, readTargetDefinitions, readTestSuiteMetadata, readTextFile, resolveAndCreateProvider, resolveFileReference, resolveTargetDefinition, resolveWorkspaceTemplate, rubricEvaluationSchema, runContainsAssertion, runEqualsAssertion, runEvalCase, runEvaluation, runIsJsonAssertion, runRegexAssertion, scoreToVerdict, shouldEnableCache, shouldSkipCacheForTemperature, subscribeToClaudeLogEntries, subscribeToCodexLogEntries, subscribeToCopilotCliLogEntries, subscribeToCopilotSdkLogEntries, subscribeToPiLogEntries, substituteVariables, toCamelCaseDeep, toSnakeCaseDeep, tokensPerTool, trimBaselineResult };