@agentv/core 2.5.8 → 2.7.1-next.1

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
package/dist/index.d.ts CHANGED
@@ -1,5 +1,217 @@
1
- import { z } from 'zod';
2
1
  import * as ai from 'ai';
2
+ import { z } from 'zod';
3
+
4
+ type ChatMessageRole = 'system' | 'user' | 'assistant' | 'tool' | 'function';
5
+ interface ChatMessage {
6
+ readonly role: ChatMessageRole;
7
+ readonly content: string;
8
+ readonly name?: string;
9
+ }
10
+ type ChatPrompt = readonly ChatMessage[];
11
+ type ProviderKind = 'azure' | 'anthropic' | 'gemini' | 'codex' | 'copilot' | 'copilot-cli' | 'pi-coding-agent' | 'pi-agent-sdk' | 'claude' | 'cli' | 'mock' | 'vscode' | 'vscode-insiders';
12
+ /** Callbacks for real-time observability during provider execution */
13
+ interface ProviderStreamCallbacks {
14
+ onToolCallStart?: (toolName: string, toolCallId?: string) => void;
15
+ onToolCallEnd?: (toolName: string, input: unknown, output: unknown, durationMs: number, toolCallId?: string) => void;
16
+ onLlmCallEnd?: (model: string, tokenUsage?: ProviderTokenUsage) => void;
17
+ }
18
+ interface ProviderRequest {
19
+ readonly question: string;
20
+ readonly systemPrompt?: string;
21
+ readonly guidelines?: string;
22
+ readonly guideline_patterns?: readonly string[];
23
+ readonly chatPrompt?: ChatPrompt;
24
+ readonly inputFiles?: readonly string[];
25
+ readonly evalCaseId?: string;
26
+ readonly attempt?: number;
27
+ readonly maxOutputTokens?: number;
28
+ readonly temperature?: number;
29
+ readonly metadata?: JsonObject;
30
+ readonly signal?: AbortSignal;
31
+ /** Working directory override (e.g., from workspace_template) */
32
+ readonly cwd?: string;
33
+ /** VS Code .code-workspace file (resolved from workspace.template) */
34
+ readonly workspaceFile?: string;
35
+ /** When true, AgentV captures file changes from workspace — provider should skip forced diff prompt */
36
+ readonly captureFileChanges?: boolean;
37
+ /** Real-time observability callbacks (optional) */
38
+ readonly streamCallbacks?: ProviderStreamCallbacks;
39
+ }
40
+ /**
41
+ * A tool call within an output message.
42
+ * Represents a single tool invocation with its input and optional output.
43
+ */
44
+ interface ToolCall {
45
+ /** Tool name */
46
+ readonly tool: string;
47
+ /** Tool input arguments */
48
+ readonly input?: unknown;
49
+ /** Tool output result */
50
+ readonly output?: unknown;
51
+ /** Stable identifier for pairing tool calls */
52
+ readonly id?: string;
53
+ /** ISO 8601 timestamp when the tool call started */
54
+ readonly startTime?: string;
55
+ /** ISO 8601 timestamp when the tool call ended */
56
+ readonly endTime?: string;
57
+ /** Duration of the tool call in milliseconds */
58
+ readonly durationMs?: number;
59
+ }
60
+ /**
61
+ * An output message from agent execution.
62
+ * Represents a single message in the conversation with optional tool calls.
63
+ */
64
+ interface Message {
65
+ /** Message role (e.g., 'assistant', 'user', 'tool') */
66
+ readonly role: string;
67
+ /** Optional name for the message sender */
68
+ readonly name?: string;
69
+ /** Message content */
70
+ readonly content?: unknown;
71
+ /** Tool calls made in this message */
72
+ readonly toolCalls?: readonly ToolCall[];
73
+ /** ISO 8601 timestamp when the message started */
74
+ readonly startTime?: string;
75
+ /** ISO 8601 timestamp when the message ended */
76
+ readonly endTime?: string;
77
+ /** Duration of the message in milliseconds */
78
+ readonly durationMs?: number;
79
+ /** Provider-specific metadata */
80
+ readonly metadata?: Record<string, unknown>;
81
+ /** Per-message token usage metrics (optional) */
82
+ readonly tokenUsage?: ProviderTokenUsage;
83
+ }
84
+ /** @deprecated Use Message instead */
85
+ type OutputMessage = Message;
86
+ /**
87
+ * Token usage metrics reported by provider.
88
+ */
89
+ interface ProviderTokenUsage {
90
+ /** Input/prompt tokens consumed */
91
+ readonly input: number;
92
+ /** Output/completion tokens generated */
93
+ readonly output: number;
94
+ /** Cached tokens (optional, provider-specific) */
95
+ readonly cached?: number;
96
+ }
97
+ interface ProviderResponse {
98
+ readonly raw?: unknown;
99
+ readonly usage?: JsonObject;
100
+ /** Output messages from agent execution (primary source for tool trajectory) */
101
+ readonly output?: readonly Message[];
102
+ /** Token usage metrics (optional) */
103
+ readonly tokenUsage?: ProviderTokenUsage;
104
+ /** Total cost in USD (optional) */
105
+ readonly costUsd?: number;
106
+ /** Execution duration in milliseconds (optional) */
107
+ readonly durationMs?: number;
108
+ /** ISO 8601 timestamp when execution started (optional) */
109
+ readonly startTime?: string;
110
+ /** ISO 8601 timestamp when execution ended (optional) */
111
+ readonly endTime?: string;
112
+ }
113
+ interface Provider {
114
+ readonly id: string;
115
+ readonly kind: ProviderKind;
116
+ readonly targetName: string;
117
+ invoke(request: ProviderRequest): Promise<ProviderResponse>;
118
+ /**
119
+ * Optional capability marker for provider-managed batching (single session handling multiple requests).
120
+ */
121
+ readonly supportsBatch?: boolean;
122
+ /**
123
+ * Optional batch invocation hook. When defined alongside supportsBatch=true,
124
+ * the orchestrator may send multiple requests in a single provider session.
125
+ */
126
+ invokeBatch?(requests: readonly ProviderRequest[]): Promise<readonly ProviderResponse[]>;
127
+ /**
128
+ * Optional method to get a Vercel AI SDK LanguageModel instance for structured output generation.
129
+ * Used by evaluators that need generateObject/generateText from the AI SDK.
130
+ */
131
+ asLanguageModel?(): ai.LanguageModel;
132
+ }
133
+ type EnvLookup = Readonly<Record<string, string | undefined>>;
134
+ interface TargetDefinition {
135
+ readonly name: string;
136
+ readonly provider: ProviderKind | string;
137
+ readonly judge_target?: string | undefined;
138
+ readonly workers?: number | undefined;
139
+ readonly provider_batching?: boolean | undefined;
140
+ readonly providerBatching?: boolean | undefined;
141
+ readonly endpoint?: string | unknown | undefined;
142
+ readonly resource?: string | unknown | undefined;
143
+ readonly resourceName?: string | unknown | undefined;
144
+ readonly api_key?: string | unknown | undefined;
145
+ readonly apiKey?: string | unknown | undefined;
146
+ readonly deployment?: string | unknown | undefined;
147
+ readonly deploymentName?: string | unknown | undefined;
148
+ readonly model?: string | unknown | undefined;
149
+ readonly version?: string | unknown | undefined;
150
+ readonly api_version?: string | unknown | undefined;
151
+ readonly variant?: string | unknown | undefined;
152
+ readonly thinking_budget?: number | unknown | undefined;
153
+ readonly thinkingBudget?: number | unknown | undefined;
154
+ readonly temperature?: number | unknown | undefined;
155
+ readonly max_output_tokens?: number | unknown | undefined;
156
+ readonly maxTokens?: number | unknown | undefined;
157
+ readonly executable?: string | unknown | undefined;
158
+ readonly command?: string | unknown | undefined;
159
+ readonly binary?: string | unknown | undefined;
160
+ readonly args?: unknown | undefined;
161
+ readonly arguments?: unknown | undefined;
162
+ readonly cwd?: string | unknown | undefined;
163
+ readonly timeout_seconds?: number | unknown | undefined;
164
+ readonly timeoutSeconds?: number | unknown | undefined;
165
+ readonly log_dir?: string | unknown | undefined;
166
+ readonly logDir?: string | unknown | undefined;
167
+ readonly log_directory?: string | unknown | undefined;
168
+ readonly logDirectory?: string | unknown | undefined;
169
+ readonly log_format?: string | unknown | undefined;
170
+ readonly logFormat?: string | unknown | undefined;
171
+ readonly log_output_format?: string | unknown | undefined;
172
+ readonly logOutputFormat?: string | unknown | undefined;
173
+ readonly system_prompt?: string | unknown | undefined;
174
+ readonly systemPrompt?: string | unknown | undefined;
175
+ readonly max_turns?: number | unknown | undefined;
176
+ readonly maxTurns?: number | unknown | undefined;
177
+ readonly max_budget_usd?: number | unknown | undefined;
178
+ readonly maxBudgetUsd?: number | unknown | undefined;
179
+ readonly response?: string | unknown | undefined;
180
+ readonly delayMs?: number | unknown | undefined;
181
+ readonly delayMinMs?: number | unknown | undefined;
182
+ readonly delayMaxMs?: number | unknown | undefined;
183
+ readonly wait?: boolean | unknown | undefined;
184
+ readonly dry_run?: boolean | unknown | undefined;
185
+ readonly dryRun?: boolean | unknown | undefined;
186
+ readonly subagent_root?: string | unknown | undefined;
187
+ readonly subagentRoot?: string | unknown | undefined;
188
+ readonly workspace_template?: string | unknown | undefined;
189
+ readonly workspaceTemplate?: string | unknown | undefined;
190
+ readonly command_template?: string | unknown | undefined;
191
+ readonly commandTemplate?: string | unknown | undefined;
192
+ readonly files_format?: string | unknown | undefined;
193
+ readonly filesFormat?: string | unknown | undefined;
194
+ readonly attachments_format?: string | unknown | undefined;
195
+ readonly attachmentsFormat?: string | unknown | undefined;
196
+ readonly env?: unknown | undefined;
197
+ readonly healthcheck?: unknown | undefined;
198
+ readonly cli_url?: string | unknown | undefined;
199
+ readonly cliUrl?: string | unknown | undefined;
200
+ readonly cli_path?: string | unknown | undefined;
201
+ readonly cliPath?: string | unknown | undefined;
202
+ readonly github_token?: string | unknown | undefined;
203
+ readonly githubToken?: string | unknown | undefined;
204
+ readonly max_retries?: number | unknown | undefined;
205
+ readonly maxRetries?: number | unknown | undefined;
206
+ readonly retry_initial_delay_ms?: number | unknown | undefined;
207
+ readonly retryInitialDelayMs?: number | unknown | undefined;
208
+ readonly retry_max_delay_ms?: number | unknown | undefined;
209
+ readonly retryMaxDelayMs?: number | unknown | undefined;
210
+ readonly retry_backoff_factor?: number | unknown | undefined;
211
+ readonly retryBackoffFactor?: number | unknown | undefined;
212
+ readonly retry_status_codes?: unknown | undefined;
213
+ readonly retryStatusCodes?: unknown | undefined;
214
+ }
3
215
 
4
216
  /**
5
217
  * Trace event types for capturing agent execution traces.
@@ -37,7 +249,21 @@ interface TraceSummary {
37
249
  readonly durationMs?: number;
38
250
  /** Per-tool duration arrays in milliseconds (optional) */
39
251
  readonly toolDurations?: Readonly<Record<string, readonly number[]>>;
252
+ /** ISO 8601 timestamp when execution started (derived from earliest span) */
253
+ readonly startTime?: string;
254
+ /** ISO 8601 timestamp when execution ended (derived from latest span) */
255
+ readonly endTime?: string;
256
+ /** Number of LLM calls (assistant messages) */
257
+ readonly llmCallCount?: number;
40
258
  }
259
+ /**
260
+ * Argument matching mode for tool_trajectory expected items.
261
+ * - 'exact': bidirectional deep equality, no extra keys allowed (default)
262
+ * - 'superset': actual args must contain all expected keys (extras OK)
263
+ * - 'subset': actual args must be a subset of expected keys (no unexpected keys)
264
+ * - 'ignore': skip argument checking entirely
265
+ */
266
+ type ArgsMatchMode = 'exact' | 'ignore' | 'subset' | 'superset';
41
267
  /**
42
268
  * Configuration for tool_trajectory evaluator.
43
269
  */
@@ -45,13 +271,18 @@ interface ToolTrajectoryEvaluatorConfig {
45
271
  readonly name: string;
46
272
  readonly type: 'tool_trajectory';
47
273
  /** Matching mode */
48
- readonly mode: 'any_order' | 'in_order' | 'exact';
274
+ readonly mode: 'any_order' | 'in_order' | 'exact' | 'subset' | 'superset';
49
275
  /** Minimum call counts per tool (for any_order mode) */
50
276
  readonly minimums?: Readonly<Record<string, number>>;
51
- /** Expected tool sequence (for in_order/exact modes) */
277
+ /** Expected tool sequence (for in_order/exact/subset/superset modes) */
52
278
  readonly expected?: readonly ToolTrajectoryExpectedItem[];
53
279
  /** Optional weight for top-level aggregation (defaults to 1.0) */
54
280
  readonly weight?: number;
281
+ readonly required?: boolean | number;
282
+ /** When true, inverts the evaluator score (1 - score) and swaps pass/fail verdict */
283
+ readonly negate?: boolean;
284
+ /** Default argument matching mode for all expected items (defaults to 'exact') */
285
+ readonly argsMatch?: ArgsMatchMode | readonly string[];
55
286
  }
56
287
  /**
57
288
  * Expected tool call item in a trajectory sequence.
@@ -62,21 +293,35 @@ interface ToolTrajectoryExpectedItem {
62
293
  readonly args?: 'any' | Record<string, unknown>;
63
294
  /** Optional maximum duration in milliseconds for latency assertions */
64
295
  readonly maxDurationMs?: number;
296
+ /** Per-item argument matching mode override (takes precedence over evaluator-level argsMatch) */
297
+ readonly argsMatch?: ArgsMatchMode | readonly string[];
65
298
  }
66
299
  /**
67
300
  * Simplified input type for computeTraceSummary.
68
- * Matches OutputMessage structure without requiring full provider/types import.
301
+ * Matches Message structure without requiring full provider/types import.
69
302
  */
70
- interface OutputMessageLike {
303
+ interface MessageLike {
304
+ readonly role?: string;
305
+ readonly startTime?: string;
306
+ readonly endTime?: string;
71
307
  readonly toolCalls?: readonly {
72
308
  readonly tool: string;
309
+ readonly startTime?: string;
310
+ readonly endTime?: string;
311
+ readonly durationMs?: number;
73
312
  }[];
74
313
  }
75
314
  /**
76
315
  * Compute a lightweight summary from output messages.
77
316
  * Used for default result persistence without payload bloat.
317
+ *
318
+ * Derives timing information from span boundaries:
319
+ * - startTime: earliest startTime across all messages and tool calls
320
+ * - endTime: latest endTime across all messages and tool calls
321
+ * - toolDurations: per-tool duration arrays (from durationMs or computed from start/end)
322
+ * - llmCallCount: count of assistant messages
78
323
  */
79
- declare function computeTraceSummary(messages: readonly OutputMessageLike[]): TraceSummary;
324
+ declare function computeTraceSummary(messages: readonly MessageLike[]): TraceSummary;
80
325
  /**
81
326
  * Default tool names considered as exploration/read-only operations.
82
327
  * Can be overridden per-evaluation via config.
@@ -114,10 +359,15 @@ interface ExecutionMetrics {
114
359
  readonly tokenUsage?: TokenUsage;
115
360
  readonly costUsd?: number;
116
361
  readonly durationMs?: number;
362
+ /** ISO 8601 timestamp when execution started */
363
+ readonly startTime?: string;
364
+ /** ISO 8601 timestamp when execution ended */
365
+ readonly endTime?: string;
117
366
  }
118
367
  /**
119
368
  * Merge execution metrics from provider response into a trace summary.
120
369
  * Returns a new TraceSummary with metrics fields populated.
370
+ * Provider-level timing takes precedence over span-derived timing.
121
371
  *
122
372
  * @param summary - Base trace summary from computeTraceSummary
123
373
  * @param metrics - Optional execution metrics from provider
@@ -203,7 +453,7 @@ declare function isJsonValue(value: unknown): value is JsonValue;
203
453
  * - Either content (string or array of objects) OR tool_calls (for assistant messages)
204
454
  */
205
455
  declare function isTestMessage(value: unknown): value is TestMessage;
206
- declare const EVALUATOR_KIND_VALUES: readonly ["code_judge", "llm_judge", "rubric", "composite", "tool_trajectory", "field_accuracy", "latency", "cost", "token_usage"];
456
+ declare const EVALUATOR_KIND_VALUES: readonly ["code_judge", "llm_judge", "rubric", "composite", "tool_trajectory", "field_accuracy", "latency", "cost", "token_usage", "execution_metrics", "agent_judge", "contains", "regex", "is_json", "equals", "rubrics"];
207
457
  type EvaluatorKind = (typeof EVALUATOR_KIND_VALUES)[number];
208
458
  declare function isEvaluatorKind(value: unknown): value is EvaluatorKind;
209
459
  /**
@@ -215,6 +465,43 @@ type TargetAccessConfig = {
215
465
  /** Maximum number of target invocations allowed per execution (default: 50) */
216
466
  readonly max_calls?: number;
217
467
  };
468
+ /**
469
+ * Configuration for workspace lifecycle scripts (before_all, after_all, before_each, after_each).
470
+ * Scripts are executed with workspace context passed via stdin.
471
+ */
472
+ type WorkspaceScriptConfig = {
473
+ /** Command array to execute (e.g., ["bun", "run", "setup.ts"]) */
474
+ readonly script: readonly string[];
475
+ /** Optional timeout in milliseconds (default: 60000 for setup, 30000 for teardown) */
476
+ readonly timeout_ms?: number;
477
+ readonly timeoutMs?: number;
478
+ /** Optional working directory for script execution */
479
+ readonly cwd?: string;
480
+ };
481
+ /**
482
+ * Workspace configuration for eval tests.
483
+ * Can be specified at suite level and overridden per-case.
484
+ * Merge strategy: template/scripts replaced, env deep-merged.
485
+ *
486
+ * Lifecycle hooks follow bun:test/Vitest naming:
487
+ * - before_all: runs ONCE before first test, creates shared workspace
488
+ * - after_all: runs ONCE after last test, final cleanup
489
+ * - before_each: runs before each test (optional)
490
+ * - after_each: runs after each test (e.g., reset git state)
491
+ */
492
+ type WorkspaceConfig = {
493
+ /** Template directory or .code-workspace file. Directories are copied to temp workspace.
494
+ * .code-workspace files are used by VS Code providers; CLI providers use the parent directory. */
495
+ readonly template?: string;
496
+ /** Script to run once before first test (after workspace creation, before git baseline) */
497
+ readonly before_all?: WorkspaceScriptConfig;
498
+ /** Script to run once after last test (before workspace cleanup) */
499
+ readonly after_all?: WorkspaceScriptConfig;
500
+ /** Script to run before each test */
501
+ readonly before_each?: WorkspaceScriptConfig;
502
+ /** Script to run after each test (e.g., git reset for workspace reuse) */
503
+ readonly after_each?: WorkspaceScriptConfig;
504
+ };
218
505
  type CodeEvaluatorConfig = {
219
506
  readonly name: string;
220
507
  readonly type: 'code';
@@ -223,6 +510,9 @@ type CodeEvaluatorConfig = {
223
510
  readonly cwd?: string;
224
511
  readonly resolvedCwd?: string;
225
512
  readonly weight?: number;
513
+ readonly required?: boolean | number;
514
+ /** When true, inverts the evaluator score (1 - score) and swaps pass/fail verdict */
515
+ readonly negate?: boolean;
226
516
  /** Pass-through configuration for the code_judge script (any unrecognized YAML properties) */
227
517
  readonly config?: JsonObject;
228
518
  /** When present, enables target access for the script via local proxy */
@@ -250,32 +540,35 @@ type LlmJudgeEvaluatorConfig = {
250
540
  readonly resolvedPromptScript?: readonly string[];
251
541
  readonly rubrics?: readonly RubricItem[];
252
542
  readonly weight?: number;
543
+ readonly required?: boolean | number;
544
+ /** When true, inverts the evaluator score (1 - score) and swaps pass/fail verdict */
545
+ readonly negate?: boolean;
253
546
  /** Pass-through configuration for custom evaluator prompts (legacy, prefer prompt.config) */
254
547
  readonly config?: Record<string, unknown>;
255
548
  };
256
549
  /**
257
550
  * Score range definition for analytic rubric scoring.
258
- * Each range maps an integer score band (0-10) to an expected outcome description.
551
+ * Each range maps an integer score band (0-10) to an outcome description.
259
552
  */
260
553
  type ScoreRange = {
261
554
  /** Inclusive integer range [min, max] within 0-10 */
262
555
  readonly score_range: readonly [number, number];
263
556
  /** Description of what this score range represents */
264
- readonly expected_outcome: string;
557
+ readonly outcome: string;
265
558
  };
266
559
  /**
267
560
  * Rubric item for LLM judge evaluation.
268
561
  * Supports two modes:
269
- * - Checklist mode: boolean satisfied/not-satisfied with `expected_outcome`
562
+ * - Checklist mode: boolean satisfied/not-satisfied with `outcome`
270
563
  * - Score-range mode: 0-10 integer scoring with `score_ranges`
271
564
  */
272
565
  type RubricItem = {
273
566
  readonly id: string;
274
567
  /**
275
- * For checklist rubrics: the expected outcome text (required).
568
+ * For checklist rubrics: the outcome text (required).
276
569
  * For score-range rubrics: optional overall criterion description.
277
570
  */
278
- readonly expected_outcome?: string;
571
+ readonly outcome?: string;
279
572
  readonly weight: number;
280
573
  /**
281
574
  * Legacy boolean gating (deprecated, treated as required_min_score: 10).
@@ -306,6 +599,9 @@ type CompositeAggregatorConfig = {
306
599
  readonly prompt?: string;
307
600
  readonly promptPath?: string;
308
601
  readonly model?: string;
602
+ } | {
603
+ readonly type: 'threshold';
604
+ readonly threshold: number;
309
605
  };
310
606
  type CompositeEvaluatorConfig = {
311
607
  readonly name: string;
@@ -313,6 +609,9 @@ type CompositeEvaluatorConfig = {
313
609
  readonly evaluators: readonly EvaluatorConfig[];
314
610
  readonly aggregator: CompositeAggregatorConfig;
315
611
  readonly weight?: number;
612
+ readonly required?: boolean | number;
613
+ /** When true, inverts the evaluator score (1 - score) and swaps pass/fail verdict */
614
+ readonly negate?: boolean;
316
615
  };
317
616
  /**
318
617
  * Match type for field accuracy evaluation.
@@ -354,6 +653,9 @@ type FieldAccuracyEvaluatorConfig = {
354
653
  /** Strategy for combining field scores (default: weighted_average) */
355
654
  readonly aggregation?: FieldAggregationType;
356
655
  readonly weight?: number;
656
+ readonly required?: boolean | number;
657
+ /** When true, inverts the evaluator score (1 - score) and swaps pass/fail verdict */
658
+ readonly negate?: boolean;
357
659
  };
358
660
  /**
359
661
  * Configuration for the latency evaluator.
@@ -365,6 +667,9 @@ type LatencyEvaluatorConfig = {
365
667
  /** Maximum allowed duration in milliseconds */
366
668
  readonly threshold: number;
367
669
  readonly weight?: number;
670
+ readonly required?: boolean | number;
671
+ /** When true, inverts the evaluator score (1 - score) and swaps pass/fail verdict */
672
+ readonly negate?: boolean;
368
673
  };
369
674
  /**
370
675
  * Configuration for the cost evaluator.
@@ -376,6 +681,9 @@ type CostEvaluatorConfig = {
376
681
  /** Maximum allowed cost in USD */
377
682
  readonly budget: number;
378
683
  readonly weight?: number;
684
+ readonly required?: boolean | number;
685
+ /** When true, inverts the evaluator score (1 - score) and swaps pass/fail verdict */
686
+ readonly negate?: boolean;
379
687
  };
380
688
  /**
381
689
  * Configuration for the token_usage evaluator.
@@ -391,48 +699,256 @@ type TokenUsageEvaluatorConfig = {
391
699
  /** Maximum allowed output tokens (completion) */
392
700
  readonly max_output?: number;
393
701
  readonly weight?: number;
702
+ readonly required?: boolean | number;
703
+ /** When true, inverts the evaluator score (1 - score) and swaps pass/fail verdict */
704
+ readonly negate?: boolean;
705
+ };
706
+ /**
707
+ * Configuration for the execution_metrics evaluator.
708
+ * Provides declarative threshold-based checks on execution metrics.
709
+ * Only specified thresholds are checked; omitted ones are ignored.
710
+ */
711
+ type ExecutionMetricsEvaluatorConfig = {
712
+ readonly name: string;
713
+ readonly type: 'execution_metrics';
714
+ /** Maximum allowed number of tool calls */
715
+ readonly max_tool_calls?: number;
716
+ /** Maximum allowed number of LLM calls (assistant messages) */
717
+ readonly max_llm_calls?: number;
718
+ /** Maximum allowed total tokens (input + output) */
719
+ readonly max_tokens?: number;
720
+ /** Maximum allowed cost in USD */
721
+ readonly max_cost_usd?: number;
722
+ /** Maximum allowed duration in milliseconds */
723
+ readonly max_duration_ms?: number;
724
+ /** Target exploration ratio (0-1, proportion of read-only tool calls) */
725
+ readonly target_exploration_ratio?: number;
726
+ /** Tolerance for exploration ratio check (default: 0.2) */
727
+ readonly exploration_tolerance?: number;
728
+ readonly weight?: number;
729
+ readonly required?: boolean | number;
730
+ /** When true, inverts the evaluator score (1 - score) and swaps pass/fail verdict */
731
+ readonly negate?: boolean;
732
+ };
733
+ /**
734
+ * Configuration for the agent_judge evaluator.
735
+ * Runs an agentic investigation loop to audit workspaces and verify criteria.
736
+ * Two modes:
737
+ * - Built-in: Uses AI SDK generateText() with sandboxed filesystem tools
738
+ * - Judge target: Delegates to an external agent provider via Provider.invoke()
739
+ */
740
+ type AgentJudgeEvaluatorConfig = {
741
+ readonly name: string;
742
+ readonly type: 'agent_judge';
743
+ /** Custom evaluation prompt (inline text or file path) */
744
+ readonly prompt?: string;
745
+ readonly promptPath?: string;
746
+ /** Resolved absolute path for prompt file */
747
+ readonly resolvedPromptPath?: string;
748
+ /** Rubric items for structured evaluation (reuses llm_judge rubric infra) */
749
+ readonly rubrics?: readonly RubricItem[];
750
+ /** Maximum agent steps for built-in mode (default 10, max 50) */
751
+ readonly max_steps?: number;
752
+ /** Temperature for built-in mode (default 0) */
753
+ readonly temperature?: number;
754
+ /** Target name — delegates agent loop to this provider instead of built-in mode */
755
+ readonly target?: string;
756
+ readonly weight?: number;
757
+ readonly required?: boolean | number;
758
+ /** When true, inverts the evaluator score (1 - score) and swaps pass/fail verdict */
759
+ readonly negate?: boolean;
760
+ };
761
+ /**
762
+ * Configuration for the contains assertion evaluator.
763
+ * Checks whether the candidate output contains a specified substring.
764
+ */
765
+ type ContainsEvaluatorConfig = {
766
+ readonly name: string;
767
+ readonly type: 'contains';
768
+ readonly value: string;
769
+ readonly weight?: number;
770
+ readonly required?: boolean | number;
771
+ /** When true, inverts the evaluator score (1 - score) and swaps pass/fail verdict */
772
+ readonly negate?: boolean;
773
+ };
774
+ /**
775
+ * Configuration for the regex assertion evaluator.
776
+ * Checks whether the candidate output matches a regular expression pattern.
777
+ */
778
+ type RegexEvaluatorConfig = {
779
+ readonly name: string;
780
+ readonly type: 'regex';
781
+ readonly value: string;
782
+ readonly weight?: number;
783
+ readonly required?: boolean | number;
784
+ /** When true, inverts the evaluator score (1 - score) and swaps pass/fail verdict */
785
+ readonly negate?: boolean;
786
+ };
787
+ /**
788
+ * Configuration for the is_json assertion evaluator.
789
+ * Checks whether the candidate output is valid JSON.
790
+ */
791
+ type IsJsonEvaluatorConfig = {
792
+ readonly name: string;
793
+ readonly type: 'is_json';
794
+ readonly weight?: number;
795
+ readonly required?: boolean | number;
796
+ /** When true, inverts the evaluator score (1 - score) and swaps pass/fail verdict */
797
+ readonly negate?: boolean;
798
+ };
799
+ /**
800
+ * Configuration for the equals assertion evaluator.
801
+ * Checks whether the candidate output exactly equals a specified string.
802
+ */
803
+ type EqualsEvaluatorConfig = {
804
+ readonly name: string;
805
+ readonly type: 'equals';
806
+ readonly value: string;
807
+ readonly weight?: number;
808
+ readonly required?: boolean | number;
809
+ /** When true, inverts the evaluator score (1 - score) and swaps pass/fail verdict */
810
+ readonly negate?: boolean;
811
+ };
812
+ /**
813
+ * Configuration for the rubrics evaluator.
814
+ * Evaluates candidate output against a list of rubric criteria.
815
+ */
816
+ type RubricsEvaluatorConfig = {
817
+ readonly name: string;
818
+ readonly type: 'rubrics';
819
+ readonly criteria: readonly RubricItem[];
820
+ readonly weight?: number;
821
+ readonly required?: boolean | number;
822
+ /** When true, inverts the evaluator score (1 - score) and swaps pass/fail verdict */
823
+ readonly negate?: boolean;
394
824
  };
395
- type EvaluatorConfig = CodeEvaluatorConfig | LlmJudgeEvaluatorConfig | CompositeEvaluatorConfig | ToolTrajectoryEvaluatorConfig | FieldAccuracyEvaluatorConfig | LatencyEvaluatorConfig | CostEvaluatorConfig | TokenUsageEvaluatorConfig;
825
+ type EvaluatorConfig = CodeEvaluatorConfig | LlmJudgeEvaluatorConfig | CompositeEvaluatorConfig | ToolTrajectoryEvaluatorConfig | FieldAccuracyEvaluatorConfig | LatencyEvaluatorConfig | CostEvaluatorConfig | TokenUsageEvaluatorConfig | ExecutionMetricsEvaluatorConfig | AgentJudgeEvaluatorConfig | ContainsEvaluatorConfig | RegexEvaluatorConfig | IsJsonEvaluatorConfig | EqualsEvaluatorConfig | RubricsEvaluatorConfig;
396
826
  /**
397
- * Eval case definition sourced from AgentV specs.
827
+ * Eval test definition sourced from AgentV specs.
398
828
  */
399
- interface EvalCase {
829
+ interface EvalTest {
400
830
  readonly id: string;
401
831
  readonly dataset?: string;
402
832
  readonly conversation_id?: string;
403
833
  readonly question: string;
404
- readonly input_messages: readonly TestMessage[];
834
+ readonly input: readonly TestMessage[];
405
835
  readonly input_segments: readonly JsonObject[];
406
- readonly expected_messages: readonly JsonObject[];
836
+ readonly expected_output: readonly JsonObject[];
407
837
  readonly reference_answer?: string;
408
838
  readonly guideline_paths: readonly string[];
409
839
  readonly guideline_patterns?: readonly string[];
410
840
  readonly file_paths: readonly string[];
411
- readonly expected_outcome: string;
841
+ readonly criteria: string;
412
842
  readonly evaluator?: EvaluatorKind;
413
843
  readonly evaluators?: readonly EvaluatorConfig[];
844
+ /** Workspace configuration (merged from suite-level and case-level) */
845
+ readonly workspace?: WorkspaceConfig;
846
+ /** Arbitrary metadata passed to workspace scripts via stdin */
847
+ readonly metadata?: Record<string, unknown>;
848
+ /** Per-test target override (matrix evaluation) */
849
+ readonly targets?: readonly string[];
850
+ }
851
+ /** @deprecated Use `EvalTest` instead */
852
+ type EvalCase = EvalTest;
853
+ /**
854
+ * Supported trial aggregation strategies.
855
+ */
856
+ type TrialStrategy = 'pass_at_k' | 'mean' | 'confidence_interval';
857
+ /**
858
+ * Configuration for running multiple trials per eval case.
859
+ */
860
+ interface TrialsConfig {
861
+ readonly count: number;
862
+ readonly strategy: TrialStrategy;
863
+ readonly costLimitUsd?: number;
864
+ }
865
+ /**
866
+ * Result of a single trial attempt.
867
+ */
868
+ interface TrialResult {
869
+ readonly attempt: number;
870
+ readonly score: number;
871
+ readonly verdict: EvaluationVerdict;
872
+ readonly scores?: readonly EvaluatorResult[];
873
+ readonly error?: string;
874
+ readonly costUsd?: number;
875
+ }
876
+ /**
877
+ * Aggregation metadata for pass_at_k strategy.
878
+ */
879
+ interface PassAtKAggregation {
880
+ readonly strategy: 'pass_at_k';
881
+ readonly passedAttempts: number;
882
+ readonly totalAttempts: number;
414
883
  }
884
+ /**
885
+ * Aggregation metadata for mean strategy.
886
+ */
887
+ interface MeanAggregation {
888
+ readonly strategy: 'mean';
889
+ readonly mean: number;
890
+ readonly min: number;
891
+ readonly max: number;
892
+ }
893
+ /**
894
+ * Aggregation metadata for confidence_interval strategy.
895
+ */
896
+ interface ConfidenceIntervalAggregation {
897
+ readonly strategy: 'confidence_interval';
898
+ readonly mean: number;
899
+ readonly ci95Lower: number;
900
+ readonly ci95Upper: number;
901
+ readonly stddev: number;
902
+ }
903
+ /**
904
+ * Discriminated union of trial aggregation results.
905
+ */
906
+ type TrialAggregation = PassAtKAggregation | MeanAggregation | ConfidenceIntervalAggregation;
415
907
  /**
416
908
  * Evaluator scorecard for a single eval case run.
417
909
  */
418
910
  interface EvaluationResult {
419
911
  readonly timestamp: string;
420
- readonly evalId: string;
912
+ readonly testId: string;
421
913
  readonly dataset?: string;
422
914
  readonly conversationId?: string;
423
915
  readonly score: number;
424
916
  readonly hits: readonly string[];
425
917
  readonly misses: readonly string[];
426
- readonly candidateAnswer: string;
918
+ readonly answer: string;
427
919
  readonly target: string;
428
920
  readonly reasoning?: string;
429
- readonly agentProviderRequest?: JsonObject;
430
- readonly lmProviderRequest?: JsonObject;
431
- readonly evaluatorProviderRequest?: JsonObject;
432
- readonly evaluatorResults?: readonly EvaluatorResult[];
921
+ readonly requests?: {
922
+ readonly agent?: JsonObject;
923
+ readonly lm?: JsonObject;
924
+ readonly evaluator?: JsonObject;
925
+ };
926
+ readonly scores?: readonly EvaluatorResult[];
433
927
  readonly error?: string;
434
928
  /** Lightweight summary of the execution trace (always included when available) */
435
- readonly traceSummary?: TraceSummary;
929
+ readonly trace?: TraceSummary;
930
+ /** Path to the temporary workspace directory (included on failure for debugging) */
931
+ readonly workspacePath?: string;
932
+ /** Input messages or prompt string sent to the agent */
933
+ readonly input?: readonly Message[] | string;
934
+ /** Full output messages from agent execution (only included when --trace flag is set) */
935
+ readonly output?: readonly Message[];
936
+ /** Captured output from workspace before_all script */
937
+ readonly beforeAllOutput?: string;
938
+ /** Captured output from workspace before_each script */
939
+ readonly beforeEachOutput?: string;
940
+ /** Captured output from workspace after_all script */
941
+ readonly afterAllOutput?: string;
942
+ /** Captured output from workspace after_each script */
943
+ readonly afterEachOutput?: string;
944
+ /** Unified diff of workspace file changes (when workspace_template is configured) */
945
+ readonly fileChanges?: string;
946
+ /** Individual trial results (only present when trials.count > 1) */
947
+ readonly trials?: readonly TrialResult[];
948
+ /** Aggregation metadata describing how the final score was computed from trials */
949
+ readonly aggregation?: TrialAggregation;
950
+ /** Whether the trial loop was terminated early due to cost limit */
951
+ readonly costLimited?: boolean;
436
952
  }
437
953
  type EvaluationVerdict = 'pass' | 'fail' | 'borderline';
438
954
  interface EvaluatorResult {
@@ -446,7 +962,7 @@ interface EvaluatorResult {
446
962
  readonly reasoning?: string;
447
963
  readonly rawRequest?: JsonObject;
448
964
  readonly evaluatorProviderRequest?: JsonObject;
449
- readonly evaluatorResults?: readonly EvaluatorResult[];
965
+ readonly scores?: readonly EvaluatorResult[];
450
966
  /** Optional structured details from code judges (e.g., TP/TN/FP/FN counts). */
451
967
  readonly details?: JsonObject;
452
968
  }
@@ -455,182 +971,88 @@ interface EvaluatorResult {
455
971
  */
456
972
  declare function getHitCount(result: Pick<EvaluationResult, 'hits'>): number;
457
973
 
458
- type ChatMessageRole = 'system' | 'user' | 'assistant' | 'tool' | 'function';
459
- interface ChatMessage {
460
- readonly role: ChatMessageRole;
461
- readonly content: string;
462
- readonly name?: string;
463
- }
464
- type ChatPrompt = readonly ChatMessage[];
465
- type ProviderKind = 'azure' | 'anthropic' | 'gemini' | 'codex' | 'copilot-cli' | 'pi-coding-agent' | 'pi-agent-sdk' | 'claude-code' | 'cli' | 'mock' | 'vscode' | 'vscode-insiders';
466
- interface ProviderRequest {
467
- readonly question: string;
468
- readonly systemPrompt?: string;
469
- readonly guidelines?: string;
974
+ declare const MetadataSchema: z.ZodObject<{
975
+ name: z.ZodString;
976
+ description: z.ZodOptional<z.ZodString>;
977
+ version: z.ZodOptional<z.ZodString>;
978
+ author: z.ZodOptional<z.ZodString>;
979
+ tags: z.ZodOptional<z.ZodArray<z.ZodString, "many">>;
980
+ license: z.ZodOptional<z.ZodString>;
981
+ requires: z.ZodOptional<z.ZodObject<{
982
+ agentv: z.ZodOptional<z.ZodString>;
983
+ }, "strip", z.ZodTypeAny, {
984
+ agentv?: string | undefined;
985
+ }, {
986
+ agentv?: string | undefined;
987
+ }>>;
988
+ }, "strip", z.ZodTypeAny, {
989
+ name: string;
990
+ description?: string | undefined;
991
+ version?: string | undefined;
992
+ author?: string | undefined;
993
+ tags?: string[] | undefined;
994
+ license?: string | undefined;
995
+ requires?: {
996
+ agentv?: string | undefined;
997
+ } | undefined;
998
+ }, {
999
+ name: string;
1000
+ description?: string | undefined;
1001
+ version?: string | undefined;
1002
+ author?: string | undefined;
1003
+ tags?: string[] | undefined;
1004
+ license?: string | undefined;
1005
+ requires?: {
1006
+ agentv?: string | undefined;
1007
+ } | undefined;
1008
+ }>;
1009
+ type EvalMetadata = z.infer<typeof MetadataSchema>;
1010
+
1011
+ declare const DEFAULT_EVAL_PATTERNS: readonly string[];
1012
+ type AgentVConfig$1 = {
470
1013
  readonly guideline_patterns?: readonly string[];
471
- readonly chatPrompt?: ChatPrompt;
472
- readonly inputFiles?: readonly string[];
473
- readonly evalCaseId?: string;
474
- readonly attempt?: number;
475
- readonly maxOutputTokens?: number;
476
- readonly temperature?: number;
477
- readonly metadata?: JsonObject;
478
- readonly signal?: AbortSignal;
479
- }
1014
+ readonly eval_patterns?: readonly string[];
1015
+ };
480
1016
  /**
481
- * A tool call within an output message.
482
- * Represents a single tool invocation with its input and optional output.
1017
+ * Load optional .agentv/config.yaml configuration file.
1018
+ * Searches from eval file directory up to repo root.
483
1019
  */
484
- interface ToolCall {
485
- /** Tool name */
486
- readonly tool: string;
487
- /** Tool input arguments */
488
- readonly input?: unknown;
489
- /** Tool output result */
490
- readonly output?: unknown;
491
- /** Stable identifier for pairing tool calls */
492
- readonly id?: string;
493
- /** ISO 8601 timestamp */
494
- readonly timestamp?: string;
495
- /** Duration of the tool call in milliseconds */
496
- readonly durationMs?: number;
497
- }
1020
+ declare function loadConfig(evalFilePath: string, repoRoot: string): Promise<AgentVConfig$1 | null>;
498
1021
  /**
499
- * An output message from agent execution.
500
- * Represents a single message in the conversation with optional tool calls.
1022
+ * Determine whether a path references guideline content (instructions or prompts).
501
1023
  */
502
- interface OutputMessage {
503
- /** Message role (e.g., 'assistant', 'user', 'tool') */
504
- readonly role: string;
505
- /** Optional name for the message sender */
506
- readonly name?: string;
507
- /** Message content */
508
- readonly content?: unknown;
509
- /** Tool calls made in this message */
510
- readonly toolCalls?: readonly ToolCall[];
511
- /** ISO 8601 timestamp */
512
- readonly timestamp?: string;
513
- /** Duration of the message in milliseconds */
514
- readonly durationMs?: number;
515
- /** Provider-specific metadata */
516
- readonly metadata?: Record<string, unknown>;
517
- }
1024
+ declare function isGuidelineFile(filePath: string, patterns?: readonly string[]): boolean;
518
1025
  /**
519
- * Token usage metrics reported by provider.
1026
+ * Extract target name from parsed eval suite (checks execution.target then falls back to root-level target).
520
1027
  */
521
- interface ProviderTokenUsage {
522
- /** Input/prompt tokens consumed */
523
- readonly input: number;
524
- /** Output/completion tokens generated */
525
- readonly output: number;
526
- /** Cached tokens (optional, provider-specific) */
527
- readonly cached?: number;
528
- }
529
- interface ProviderResponse {
530
- readonly raw?: unknown;
531
- readonly usage?: JsonObject;
532
- /** Output messages from agent execution (primary source for tool trajectory) */
533
- readonly outputMessages?: readonly OutputMessage[];
534
- /** Token usage metrics (optional) */
535
- readonly tokenUsage?: ProviderTokenUsage;
536
- /** Total cost in USD (optional) */
537
- readonly costUsd?: number;
538
- /** Execution duration in milliseconds (optional) */
539
- readonly durationMs?: number;
540
- }
541
- interface Provider {
542
- readonly id: string;
543
- readonly kind: ProviderKind;
544
- readonly targetName: string;
545
- invoke(request: ProviderRequest): Promise<ProviderResponse>;
546
- /**
547
- * Optional capability marker for provider-managed batching (single session handling multiple requests).
548
- */
549
- readonly supportsBatch?: boolean;
550
- /**
551
- * Optional batch invocation hook. When defined alongside supportsBatch=true,
552
- * the orchestrator may send multiple requests in a single provider session.
553
- */
554
- invokeBatch?(requests: readonly ProviderRequest[]): Promise<readonly ProviderResponse[]>;
555
- /**
556
- * Optional method to get a Vercel AI SDK LanguageModel instance for structured output generation.
557
- * Used by evaluators that need generateObject/generateText from the AI SDK.
558
- */
559
- asLanguageModel?(): ai.LanguageModel;
560
- }
561
- type EnvLookup = Readonly<Record<string, string | undefined>>;
562
- interface TargetDefinition {
563
- readonly name: string;
564
- readonly provider: ProviderKind | string;
565
- readonly judge_target?: string | undefined;
566
- readonly workers?: number | undefined;
567
- readonly provider_batching?: boolean | undefined;
568
- readonly providerBatching?: boolean | undefined;
569
- readonly endpoint?: string | unknown | undefined;
570
- readonly resource?: string | unknown | undefined;
571
- readonly resourceName?: string | unknown | undefined;
572
- readonly api_key?: string | unknown | undefined;
573
- readonly apiKey?: string | unknown | undefined;
574
- readonly deployment?: string | unknown | undefined;
575
- readonly deploymentName?: string | unknown | undefined;
576
- readonly model?: string | unknown | undefined;
577
- readonly version?: string | unknown | undefined;
578
- readonly api_version?: string | unknown | undefined;
579
- readonly variant?: string | unknown | undefined;
580
- readonly thinking_budget?: number | unknown | undefined;
581
- readonly thinkingBudget?: number | unknown | undefined;
582
- readonly temperature?: number | unknown | undefined;
583
- readonly max_output_tokens?: number | unknown | undefined;
584
- readonly maxTokens?: number | unknown | undefined;
585
- readonly executable?: string | unknown | undefined;
586
- readonly command?: string | unknown | undefined;
587
- readonly binary?: string | unknown | undefined;
588
- readonly args?: unknown | undefined;
589
- readonly arguments?: unknown | undefined;
590
- readonly cwd?: string | unknown | undefined;
591
- readonly timeout_seconds?: number | unknown | undefined;
592
- readonly timeoutSeconds?: number | unknown | undefined;
593
- readonly log_dir?: string | unknown | undefined;
594
- readonly logDir?: string | unknown | undefined;
595
- readonly log_directory?: string | unknown | undefined;
596
- readonly logDirectory?: string | unknown | undefined;
597
- readonly log_format?: string | unknown | undefined;
598
- readonly logFormat?: string | unknown | undefined;
599
- readonly log_output_format?: string | unknown | undefined;
600
- readonly logOutputFormat?: string | unknown | undefined;
601
- readonly system_prompt?: string | unknown | undefined;
602
- readonly systemPrompt?: string | unknown | undefined;
603
- readonly response?: string | unknown | undefined;
604
- readonly delayMs?: number | unknown | undefined;
605
- readonly delayMinMs?: number | unknown | undefined;
606
- readonly delayMaxMs?: number | unknown | undefined;
607
- readonly vscode_cmd?: string | unknown | undefined;
608
- readonly wait?: boolean | unknown | undefined;
609
- readonly dry_run?: boolean | unknown | undefined;
610
- readonly dryRun?: boolean | unknown | undefined;
611
- readonly subagent_root?: string | unknown | undefined;
612
- readonly subagentRoot?: string | unknown | undefined;
613
- readonly workspace_template?: string | unknown | undefined;
614
- readonly workspaceTemplate?: string | unknown | undefined;
615
- readonly command_template?: string | unknown | undefined;
616
- readonly commandTemplate?: string | unknown | undefined;
617
- readonly files_format?: string | unknown | undefined;
618
- readonly filesFormat?: string | unknown | undefined;
619
- readonly attachments_format?: string | unknown | undefined;
620
- readonly attachmentsFormat?: string | unknown | undefined;
621
- readonly env?: unknown | undefined;
622
- readonly healthcheck?: unknown | undefined;
623
- readonly max_retries?: number | unknown | undefined;
624
- readonly maxRetries?: number | unknown | undefined;
625
- readonly retry_initial_delay_ms?: number | unknown | undefined;
626
- readonly retryInitialDelayMs?: number | unknown | undefined;
627
- readonly retry_max_delay_ms?: number | unknown | undefined;
628
- readonly retryMaxDelayMs?: number | unknown | undefined;
629
- readonly retry_backoff_factor?: number | unknown | undefined;
630
- readonly retryBackoffFactor?: number | unknown | undefined;
631
- readonly retry_status_codes?: unknown | undefined;
632
- readonly retryStatusCodes?: unknown | undefined;
1028
+ declare function extractTargetFromSuite(suite: JsonObject): string | undefined;
1029
+ /**
1030
+ * Extract targets array from parsed eval suite.
1031
+ * Precedence: execution.targets (array) > execution.target (singular).
1032
+ * Returns undefined when no targets array is specified.
1033
+ */
1034
+ declare function extractTargetsFromSuite(suite: JsonObject): readonly string[] | undefined;
1035
+ /**
1036
+ * Extract per-test targets array from a raw test case object.
1037
+ */
1038
+ declare function extractTargetsFromTestCase(testCase: JsonObject): readonly string[] | undefined;
1039
+ /**
1040
+ * Extract trials configuration from parsed eval suite's execution block.
1041
+ * Returns undefined when count is 1 or not specified (no-op).
1042
+ */
1043
+ declare function extractTrialsConfig(suite: JsonObject): TrialsConfig | undefined;
1044
+ /**
1045
+ * Cache configuration parsed from execution block.
1046
+ */
1047
+ interface CacheConfig {
1048
+ readonly enabled: boolean;
1049
+ readonly cachePath?: string;
633
1050
  }
1051
+ /**
1052
+ * Extract cache configuration from parsed eval suite's execution block.
1053
+ * Returns undefined when no cache config is specified.
1054
+ */
1055
+ declare function extractCacheConfig(suite: JsonObject): CacheConfig | undefined;
634
1056
 
635
1057
  /**
636
1058
  * Formatting mode for segment content.
@@ -654,12 +1076,7 @@ interface PromptInputs {
654
1076
  * @param testCase - The evaluation test case
655
1077
  * @param mode - Formatting mode: 'agent' for file references, 'lm' for embedded content (default: 'lm')
656
1078
  */
657
- declare function buildPromptInputs(testCase: EvalCase, mode?: FormattingMode): Promise<PromptInputs>;
658
-
659
- /**
660
- * Determine whether a path references guideline content (instructions or prompts).
661
- */
662
- declare function isGuidelineFile(filePath: string, patterns?: readonly string[]): boolean;
1079
+ declare function buildPromptInputs(testCase: EvalTest, mode?: FormattingMode): Promise<PromptInputs>;
663
1080
 
664
1081
  /**
665
1082
  * Detect file format by extension.
@@ -668,21 +1085,49 @@ declare function detectFormat(filePath: string): 'yaml' | 'jsonl';
668
1085
 
669
1086
  type LoadOptions = {
670
1087
  readonly verbose?: boolean;
671
- /** Filter eval cases by ID pattern (glob supported, e.g., "summary-*") */
1088
+ /** Filter tests by ID pattern (glob supported, e.g., "summary-*") */
672
1089
  readonly filter?: string;
673
1090
  };
674
1091
  /**
675
1092
  * Read metadata from a test suite file (like target name).
676
- * This is a convenience function for CLI tools that need metadata without loading all eval cases.
1093
+ * This is a convenience function for CLI tools that need metadata without loading all tests.
677
1094
  */
678
1095
  declare function readTestSuiteMetadata(testFilePath: string): Promise<{
679
1096
  target?: string;
1097
+ targets?: readonly string[];
1098
+ trials?: TrialsConfig;
680
1099
  }>;
681
1100
  /**
682
- * Load eval cases from a AgentV specification file (YAML or JSONL).
1101
+ * Load tests from an AgentV specification file (YAML or JSONL).
683
1102
  * Format is detected by file extension: .yaml/.yml for YAML, .jsonl for JSONL.
684
1103
  */
685
- declare function loadEvalCases(evalFilePath: string, repoRoot: URL | string, options?: LoadOptions): Promise<readonly EvalCase[]>;
1104
+ type EvalSuiteResult = {
1105
+ readonly tests: readonly EvalTest[];
1106
+ readonly trials?: TrialsConfig;
1107
+ /** Suite-level targets from execution.targets (matrix evaluation) */
1108
+ readonly targets?: readonly string[];
1109
+ /** Suite-level cache config from execution.cache */
1110
+ readonly cacheConfig?: CacheConfig;
1111
+ /** Suite-level metadata (name, description, version, etc.) */
1112
+ readonly metadata?: EvalMetadata;
1113
+ };
1114
+ /**
1115
+ * Load tests and suite metadata from a single parse.
1116
+ * Prefer this over calling loadTests + readTestSuiteMetadata separately.
1117
+ */
1118
+ declare function loadTestSuite(evalFilePath: string, repoRoot: URL | string, options?: LoadOptions): Promise<EvalSuiteResult>;
1119
+ /** @deprecated Use `loadTestSuite` instead */
1120
+ declare const loadEvalSuite: typeof loadTestSuite;
1121
+ declare function loadTests(evalFilePath: string, repoRoot: URL | string, options?: LoadOptions): Promise<readonly EvalTest[]>;
1122
+ /** @deprecated Use `loadTests` instead */
1123
+ declare const loadEvalCases: typeof loadTests;
1124
+ /**
1125
+ * Load a single test by exact ID match.
1126
+ * Throws if the ID is not found.
1127
+ */
1128
+ declare function loadTestById(evalFilePath: string, repoRoot: URL | string, evalId: string): Promise<EvalTest>;
1129
+ /** @deprecated Use `loadTestById` instead */
1130
+ declare const loadEvalCaseById: typeof loadTestById;
686
1131
 
687
1132
  declare function fileExists(filePath: string): Promise<boolean>;
688
1133
  /**
@@ -744,6 +1189,7 @@ declare const CliTargetConfigSchema: z.ZodObject<{
744
1189
  commandTemplate: z.ZodString;
745
1190
  filesFormat: z.ZodOptional<z.ZodString>;
746
1191
  cwd: z.ZodOptional<z.ZodString>;
1192
+ workspaceTemplate: z.ZodOptional<z.ZodString>;
747
1193
  timeoutMs: z.ZodOptional<z.ZodNumber>;
748
1194
  healthcheck: z.ZodOptional<z.ZodDiscriminatedUnion<"type", [z.ZodObject<{
749
1195
  type: z.ZodLiteral<"http">;
@@ -780,6 +1226,7 @@ declare const CliTargetConfigSchema: z.ZodObject<{
780
1226
  cwd?: string | undefined;
781
1227
  verbose?: boolean | undefined;
782
1228
  filesFormat?: string | undefined;
1229
+ workspaceTemplate?: string | undefined;
783
1230
  healthcheck?: {
784
1231
  type: "http";
785
1232
  url: string;
@@ -797,6 +1244,7 @@ declare const CliTargetConfigSchema: z.ZodObject<{
797
1244
  cwd?: string | undefined;
798
1245
  verbose?: boolean | undefined;
799
1246
  filesFormat?: string | undefined;
1247
+ workspaceTemplate?: string | undefined;
800
1248
  healthcheck?: {
801
1249
  type: "http";
802
1250
  url: string;
@@ -858,19 +1306,34 @@ interface GeminiResolvedConfig {
858
1306
  readonly retry?: RetryConfig;
859
1307
  }
860
1308
  interface CodexResolvedConfig {
1309
+ readonly model?: string;
861
1310
  readonly executable: string;
862
1311
  readonly args?: readonly string[];
863
1312
  readonly cwd?: string;
1313
+ readonly workspaceTemplate?: string;
864
1314
  readonly timeoutMs?: number;
865
1315
  readonly logDir?: string;
866
1316
  readonly logFormat?: 'summary' | 'json';
867
1317
  readonly systemPrompt?: string;
868
1318
  }
869
- interface CopilotResolvedConfig {
1319
+ interface CopilotCliResolvedConfig {
870
1320
  readonly executable: string;
871
1321
  readonly model?: string;
872
1322
  readonly args?: readonly string[];
873
1323
  readonly cwd?: string;
1324
+ readonly workspaceTemplate?: string;
1325
+ readonly timeoutMs?: number;
1326
+ readonly logDir?: string;
1327
+ readonly logFormat?: 'summary' | 'json';
1328
+ readonly systemPrompt?: string;
1329
+ }
1330
+ interface CopilotSdkResolvedConfig {
1331
+ readonly cliUrl?: string;
1332
+ readonly cliPath?: string;
1333
+ readonly githubToken?: string;
1334
+ readonly model?: string;
1335
+ readonly cwd?: string;
1336
+ readonly workspaceTemplate?: string;
874
1337
  readonly timeoutMs?: number;
875
1338
  readonly logDir?: string;
876
1339
  readonly logFormat?: 'summary' | 'json';
@@ -885,6 +1348,7 @@ interface PiCodingAgentResolvedConfig {
885
1348
  readonly thinking?: string;
886
1349
  readonly args?: readonly string[];
887
1350
  readonly cwd?: string;
1351
+ readonly workspaceTemplate?: string;
888
1352
  readonly timeoutMs?: number;
889
1353
  readonly logDir?: string;
890
1354
  readonly logFormat?: 'summary' | 'json';
@@ -897,13 +1361,14 @@ interface PiAgentSdkResolvedConfig {
897
1361
  readonly timeoutMs?: number;
898
1362
  readonly systemPrompt?: string;
899
1363
  }
900
- interface ClaudeCodeResolvedConfig {
901
- readonly executable: string;
1364
+ interface ClaudeResolvedConfig {
902
1365
  readonly model?: string;
903
1366
  readonly systemPrompt?: string;
904
- readonly args?: readonly string[];
905
1367
  readonly cwd?: string;
1368
+ readonly workspaceTemplate?: string;
906
1369
  readonly timeoutMs?: number;
1370
+ readonly maxTurns?: number;
1371
+ readonly maxBudgetUsd?: number;
907
1372
  readonly logDir?: string;
908
1373
  readonly logFormat?: 'summary' | 'json';
909
1374
  }
@@ -914,11 +1379,12 @@ interface MockResolvedConfig {
914
1379
  readonly delayMaxMs?: number;
915
1380
  }
916
1381
  interface VSCodeResolvedConfig {
917
- readonly command: string;
1382
+ readonly executable: string;
918
1383
  readonly waitForResponse: boolean;
919
1384
  readonly dryRun: boolean;
920
1385
  readonly subagentRoot?: string;
921
1386
  readonly workspaceTemplate?: string;
1387
+ readonly timeoutMs?: number;
922
1388
  }
923
1389
  type ResolvedTarget = {
924
1390
  readonly kind: 'azure';
@@ -948,13 +1414,20 @@ type ResolvedTarget = {
948
1414
  readonly workers?: number;
949
1415
  readonly providerBatching?: boolean;
950
1416
  readonly config: CodexResolvedConfig;
1417
+ } | {
1418
+ readonly kind: 'copilot';
1419
+ readonly name: string;
1420
+ readonly judgeTarget?: string;
1421
+ readonly workers?: number;
1422
+ readonly providerBatching?: boolean;
1423
+ readonly config: CopilotSdkResolvedConfig;
951
1424
  } | {
952
1425
  readonly kind: 'copilot-cli';
953
1426
  readonly name: string;
954
1427
  readonly judgeTarget?: string;
955
1428
  readonly workers?: number;
956
1429
  readonly providerBatching?: boolean;
957
- readonly config: CopilotResolvedConfig;
1430
+ readonly config: CopilotCliResolvedConfig;
958
1431
  } | {
959
1432
  readonly kind: 'pi-coding-agent';
960
1433
  readonly name: string;
@@ -970,12 +1443,12 @@ type ResolvedTarget = {
970
1443
  readonly providerBatching?: boolean;
971
1444
  readonly config: PiAgentSdkResolvedConfig;
972
1445
  } | {
973
- readonly kind: 'claude-code';
1446
+ readonly kind: 'claude';
974
1447
  readonly name: string;
975
1448
  readonly judgeTarget?: string;
976
1449
  readonly workers?: number;
977
1450
  readonly providerBatching?: boolean;
978
- readonly config: ClaudeCodeResolvedConfig;
1451
+ readonly config: ClaudeResolvedConfig;
979
1452
  } | {
980
1453
  readonly kind: 'mock';
981
1454
  readonly name: string;
@@ -1000,6 +1473,42 @@ type ResolvedTarget = {
1000
1473
  };
1001
1474
  declare function resolveTargetDefinition(definition: TargetDefinition, env?: EnvLookup, evalFilePath?: string): ResolvedTarget;
1002
1475
 
1476
+ /**
1477
+ * Extensible provider registry.
1478
+ *
1479
+ * Replaces the hardcoded switch/case dispatch in createProvider() with
1480
+ * a registry of named factory functions. Built-in providers are registered
1481
+ * at startup; users can add custom providers via the registry API or by
1482
+ * dropping files in `.agentv/providers/`.
1483
+ */
1484
+
1485
+ /**
1486
+ * Factory function that creates a Provider instance from a resolved target.
1487
+ */
1488
+ type ProviderFactoryFn = (target: ResolvedTarget) => Provider;
1489
+ /**
1490
+ * Registry of provider factory functions keyed by provider kind.
1491
+ *
1492
+ * Built-in providers are registered at startup. Custom providers can be
1493
+ * registered via the `register()` method.
1494
+ */
1495
+ declare class ProviderRegistry {
1496
+ private readonly factories;
1497
+ /** Register a factory function for a provider kind. */
1498
+ register(kind: string, factory: ProviderFactoryFn): this;
1499
+ /** Get the factory function for a provider kind. */
1500
+ get(kind: string): ProviderFactoryFn | undefined;
1501
+ /** Check if a factory is registered for the given kind. */
1502
+ has(kind: string): boolean;
1503
+ /** List all registered provider kind names. */
1504
+ list(): string[];
1505
+ /**
1506
+ * Create a provider instance from a resolved target.
1507
+ * Falls back to CLI provider for unknown kinds (custom provider escape hatch).
1508
+ */
1509
+ create(target: ResolvedTarget): Provider;
1510
+ }
1511
+
1003
1512
  declare function readTargetDefinitions(filePath: string): Promise<readonly TargetDefinition[]>;
1004
1513
  declare function listTargetNames(definitions: readonly TargetDefinition[]): readonly string[];
1005
1514
 
@@ -1007,6 +1516,7 @@ interface EnsureSubagentsOptions {
1007
1516
  readonly kind: 'vscode' | 'vscode-insiders';
1008
1517
  readonly count: number;
1009
1518
  readonly verbose?: boolean;
1519
+ readonly vscodeCmd?: string;
1010
1520
  }
1011
1521
  interface EnsureSubagentsResult {
1012
1522
  readonly provisioned: boolean;
@@ -1041,15 +1551,25 @@ type PiLogListener = (entry: PiLogEntry) => void;
1041
1551
  declare function consumePiLogEntries(): PiLogEntry[];
1042
1552
  declare function subscribeToPiLogEntries(listener: PiLogListener): () => void;
1043
1553
 
1044
- type ClaudeCodeLogEntry = {
1554
+ type ClaudeLogEntry = {
1555
+ readonly filePath: string;
1556
+ readonly evalCaseId?: string;
1557
+ readonly targetName: string;
1558
+ readonly attempt?: number;
1559
+ };
1560
+ type ClaudeLogListener = (entry: ClaudeLogEntry) => void;
1561
+ declare function consumeClaudeLogEntries(): ClaudeLogEntry[];
1562
+ declare function subscribeToClaudeLogEntries(listener: ClaudeLogListener): () => void;
1563
+
1564
+ type CopilotSdkLogEntry = {
1045
1565
  readonly filePath: string;
1046
1566
  readonly evalCaseId?: string;
1047
1567
  readonly targetName: string;
1048
1568
  readonly attempt?: number;
1049
1569
  };
1050
- type ClaudeCodeLogListener = (entry: ClaudeCodeLogEntry) => void;
1051
- declare function consumeClaudeCodeLogEntries(): ClaudeCodeLogEntry[];
1052
- declare function subscribeToClaudeCodeLogEntries(listener: ClaudeCodeLogListener): () => void;
1570
+ type CopilotSdkLogListener = (entry: CopilotSdkLogEntry) => void;
1571
+ declare function consumeCopilotSdkLogEntries(): CopilotSdkLogEntry[];
1572
+ declare function subscribeToCopilotSdkLogEntries(listener: CopilotSdkLogListener): () => void;
1053
1573
 
1054
1574
  type CopilotCliLogEntry = {
1055
1575
  readonly filePath: string;
@@ -1061,6 +1581,38 @@ type CopilotCliLogListener = (entry: CopilotCliLogEntry) => void;
1061
1581
  declare function consumeCopilotCliLogEntries(): CopilotCliLogEntry[];
1062
1582
  declare function subscribeToCopilotCliLogEntries(listener: CopilotCliLogListener): () => void;
1063
1583
 
1584
+ /**
1585
+ * Convention-based discovery of custom provider scripts.
1586
+ *
1587
+ * Scans `.agentv/providers/` for TypeScript/JavaScript files and registers
1588
+ * them as CLI-like providers in the registry. The file name (without
1589
+ * extension) becomes the provider kind name.
1590
+ *
1591
+ * Example: `.agentv/providers/my-llm.ts` -> provider kind "my-llm" in targets.yaml
1592
+ */
1593
+
1594
+ /**
1595
+ * Discover custom provider scripts from `.agentv/providers/` and register
1596
+ * them as provider kinds in the registry.
1597
+ *
1598
+ * Each discovered script is registered as a CLI-like provider that runs
1599
+ * via `bun run <filePath> {PROMPT}`. The script receives the prompt as
1600
+ * a CLI argument and should print its response to stdout.
1601
+ *
1602
+ * @param registry - The provider registry to register discovered providers into
1603
+ * @param baseDir - The base directory to search from (typically project root or eval file dir)
1604
+ * @returns Names of discovered provider kinds
1605
+ */
1606
+ declare function discoverProviders(registry: ProviderRegistry, baseDir: string): Promise<string[]>;
1607
+
1608
+ /**
1609
+ * Create and return the default provider registry with all built-in providers.
1610
+ */
1611
+ declare function createBuiltinProviderRegistry(): ProviderRegistry;
1612
+ /**
1613
+ * Create a provider from a resolved target using the default registry.
1614
+ * Custom providers can be registered via `createBuiltinProviderRegistry().register()`.
1615
+ */
1064
1616
  declare function createProvider(target: ResolvedTarget): Provider;
1065
1617
  declare function resolveAndCreateProvider(definition: TargetDefinition, env?: EnvLookup): Provider;
1066
1618
 
@@ -1070,7 +1622,7 @@ declare function resolveAndCreateProvider(definition: TargetDefinition, env?: En
1070
1622
  */
1071
1623
  type TargetResolver = (targetName: string) => Provider | undefined;
1072
1624
  interface EvaluationContext {
1073
- readonly evalCase: EvalCase;
1625
+ readonly evalCase: EvalTest;
1074
1626
  readonly candidate: string;
1075
1627
  readonly target: ResolvedTarget;
1076
1628
  readonly provider: Provider;
@@ -1086,13 +1638,17 @@ interface EvaluationContext {
1086
1638
  readonly evaluatorTemplateOverride?: string;
1087
1639
  readonly evaluator?: EvaluatorConfig;
1088
1640
  /** Output messages from agent execution (primary source for tool trajectory) */
1089
- readonly outputMessages?: readonly OutputMessage[];
1641
+ readonly output?: readonly Message[];
1090
1642
  /** Lightweight summary of trace events (if available) */
1091
- readonly traceSummary?: TraceSummary;
1643
+ readonly trace?: TraceSummary;
1092
1644
  /** Resolver for target override in code judges */
1093
1645
  readonly targetResolver?: TargetResolver;
1094
1646
  /** List of available target names for code judges */
1095
1647
  readonly availableTargets?: readonly string[];
1648
+ /** Unified diff of file changes from workspace (when workspace_template is configured) */
1649
+ readonly fileChanges?: string;
1650
+ /** Absolute path to the workspace directory (when workspace_template is configured) */
1651
+ readonly workspacePath?: string;
1096
1652
  }
1097
1653
  interface EvaluationScore {
1098
1654
  readonly score: number;
@@ -1102,7 +1658,7 @@ interface EvaluationScore {
1102
1658
  readonly expectedAspectCount: number;
1103
1659
  readonly reasoning?: string;
1104
1660
  readonly evaluatorRawRequest?: JsonObject;
1105
- readonly evaluatorResults?: readonly ChildEvaluatorResult[];
1661
+ readonly scores?: readonly ChildEvaluatorResult[];
1106
1662
  /** Optional structured details from code judges (e.g., TP/TN/FP/FN counts, alignments). */
1107
1663
  readonly details?: JsonObject;
1108
1664
  }
@@ -1116,7 +1672,7 @@ interface ChildEvaluatorResult {
1116
1672
  readonly misses: readonly string[];
1117
1673
  readonly reasoning?: string;
1118
1674
  readonly evaluatorRawRequest?: JsonObject;
1119
- readonly evaluatorResults?: readonly ChildEvaluatorResult[];
1675
+ readonly scores?: readonly ChildEvaluatorResult[];
1120
1676
  /** Optional structured details from code judges (e.g., TP/TN/FP/FN counts, alignments). */
1121
1677
  readonly details?: JsonObject;
1122
1678
  }
@@ -1134,11 +1690,12 @@ declare function extractJsonBlob(text: string): string | undefined;
1134
1690
  declare function parseJsonFromText(text: string): unknown;
1135
1691
  declare function isNonEmptyString(value: unknown): value is string;
1136
1692
  declare function parseJsonSafe(payload: string): Record<string, unknown> | undefined;
1693
+ declare function deepEqual(a: unknown, b: unknown): boolean;
1137
1694
  /**
1138
- * Deep equality check for two values.
1139
- * Handles primitives, arrays, and plain objects.
1695
+ * Negate an evaluation score: inverts score (1 - score), swaps pass/fail verdict,
1696
+ * swaps hits/misses, and annotates reasoning.
1140
1697
  */
1141
- declare function deepEqual(a: unknown, b: unknown): boolean;
1698
+ declare function negateScore(score: EvaluationScore): EvaluationScore;
1142
1699
 
1143
1700
  interface CodeEvaluatorOptions {
1144
1701
  readonly script: readonly string[];
@@ -1175,6 +1732,7 @@ declare class CompositeEvaluator implements Evaluator {
1175
1732
  evaluate(context: EvaluationContext): Promise<EvaluationScore>;
1176
1733
  private aggregate;
1177
1734
  private runWeightedAverage;
1735
+ private runThreshold;
1178
1736
  private runCodeAggregator;
1179
1737
  private runLlmAggregator;
1180
1738
  }
@@ -1184,7 +1742,7 @@ interface CostEvaluatorOptions {
1184
1742
  }
1185
1743
  /**
1186
1744
  * Evaluator that checks execution cost against a budget.
1187
- * Uses traceSummary.costUsd from the evaluation context.
1745
+ * Uses trace.costUsd from the evaluation context.
1188
1746
  */
1189
1747
  declare class CostEvaluator implements Evaluator {
1190
1748
  readonly kind = "cost";
@@ -1193,6 +1751,25 @@ declare class CostEvaluator implements Evaluator {
1193
1751
  evaluate(context: EvaluationContext): EvaluationScore;
1194
1752
  }
1195
1753
 
1754
+ interface ExecutionMetricsEvaluatorOptions {
1755
+ readonly config: ExecutionMetricsEvaluatorConfig;
1756
+ }
1757
+ /**
1758
+ * Evaluator that checks execution metrics against configured thresholds.
1759
+ * Supports multiple threshold types: tool calls, LLM calls, tokens, cost, duration,
1760
+ * and exploration ratio. Only specified thresholds are checked.
1761
+ *
1762
+ * Score is proportional: hits.length / (hits.length + misses.length)
1763
+ */
1764
+ declare class ExecutionMetricsEvaluator implements Evaluator {
1765
+ readonly kind = "execution_metrics";
1766
+ private readonly config;
1767
+ constructor(options: ExecutionMetricsEvaluatorOptions);
1768
+ evaluate(context: EvaluationContext): EvaluationScore;
1769
+ private extractConfiguredThresholds;
1770
+ private filterDefinedMetrics;
1771
+ }
1772
+
1196
1773
  interface FieldAccuracyEvaluatorOptions {
1197
1774
  readonly config: FieldAccuracyEvaluatorConfig;
1198
1775
  }
@@ -1206,7 +1783,7 @@ declare class FieldAccuracyEvaluator implements Evaluator {
1206
1783
  constructor(options: FieldAccuracyEvaluatorOptions);
1207
1784
  evaluate(context: EvaluationContext): EvaluationScore;
1208
1785
  /**
1209
- * Extract expected data from expected_messages array.
1786
+ * Extract expected data from expected_output array.
1210
1787
  * Looks for the last assistant message with content.
1211
1788
  */
1212
1789
  private extractExpectedData;
@@ -1237,7 +1814,7 @@ interface LatencyEvaluatorOptions {
1237
1814
  }
1238
1815
  /**
1239
1816
  * Evaluator that checks execution duration against a threshold.
1240
- * Uses traceSummary.durationMs from the evaluation context.
1817
+ * Uses trace.durationMs from the evaluation context.
1241
1818
  */
1242
1819
  declare class LatencyEvaluator implements Evaluator {
1243
1820
  readonly kind = "latency";
@@ -1246,6 +1823,11 @@ declare class LatencyEvaluator implements Evaluator {
1246
1823
  evaluate(context: EvaluationContext): EvaluationScore;
1247
1824
  }
1248
1825
 
1826
+ /**
1827
+ * Default evaluator template for the user prompt (variables will be substituted).
1828
+ * Custom evaluators can override this via evaluatorTemplate option.
1829
+ */
1830
+ declare const DEFAULT_EVALUATOR_TEMPLATE: string;
1249
1831
  type JudgeProviderResolver = (context: EvaluationContext) => Promise<Provider | undefined>;
1250
1832
  interface LlmJudgeEvaluatorOptions {
1251
1833
  readonly resolveJudgeProvider: JudgeProviderResolver;
@@ -1269,6 +1851,36 @@ declare const freeformEvaluationSchema: z.ZodObject<{
1269
1851
  misses?: string[] | undefined;
1270
1852
  reasoning?: string | undefined;
1271
1853
  }>;
1854
+ declare const rubricEvaluationSchema: z.ZodObject<{
1855
+ checks: z.ZodArray<z.ZodObject<{
1856
+ id: z.ZodString;
1857
+ satisfied: z.ZodBoolean;
1858
+ reasoning: z.ZodString;
1859
+ }, "strip", z.ZodTypeAny, {
1860
+ reasoning: string;
1861
+ id: string;
1862
+ satisfied: boolean;
1863
+ }, {
1864
+ reasoning: string;
1865
+ id: string;
1866
+ satisfied: boolean;
1867
+ }>, "many">;
1868
+ overall_reasoning: z.ZodString;
1869
+ }, "strip", z.ZodTypeAny, {
1870
+ checks: {
1871
+ reasoning: string;
1872
+ id: string;
1873
+ satisfied: boolean;
1874
+ }[];
1875
+ overall_reasoning: string;
1876
+ }, {
1877
+ checks: {
1878
+ reasoning: string;
1879
+ id: string;
1880
+ satisfied: boolean;
1881
+ }[];
1882
+ overall_reasoning: string;
1883
+ }>;
1272
1884
 
1273
1885
  declare class LlmJudgeEvaluator implements Evaluator {
1274
1886
  readonly kind = "llm_judge";
@@ -1297,13 +1909,87 @@ declare class LlmJudgeEvaluator implements Evaluator {
1297
1909
  * This schema is always appended to the evaluator template.
1298
1910
  */
1299
1911
  declare function buildOutputSchema(): string;
1912
+ declare function buildRubricOutputSchema(): string;
1913
+ declare function substituteVariables(template: string, variables: Record<string, string>): string;
1914
+ declare function calculateRubricScore(result: z.infer<typeof rubricEvaluationSchema>, rubrics: readonly RubricItem[]): {
1915
+ score: number;
1916
+ verdict: 'pass' | 'fail' | 'borderline';
1917
+ hits: string[];
1918
+ misses: string[];
1919
+ };
1920
+ /**
1921
+ * Build the output schema for score-range rubric evaluation.
1922
+ */
1923
+ declare function buildScoreRangeOutputSchema(): string;
1924
+
1925
+ interface AgentJudgeEvaluatorOptions {
1926
+ readonly resolveJudgeProvider: (ctx: EvaluationContext) => Promise<Provider | undefined>;
1927
+ readonly maxSteps?: number;
1928
+ readonly temperature?: number;
1929
+ readonly evaluatorTemplate?: string;
1930
+ readonly judgeTargetProvider?: Provider;
1931
+ }
1932
+ declare class AgentJudgeEvaluator implements Evaluator {
1933
+ readonly kind = "agent_judge";
1934
+ private readonly resolveJudgeProvider;
1935
+ private readonly maxSteps;
1936
+ private readonly temperature;
1937
+ private readonly evaluatorTemplate?;
1938
+ private readonly judgeTargetProvider?;
1939
+ constructor(options: AgentJudgeEvaluatorOptions);
1940
+ evaluate(context: EvaluationContext): Promise<EvaluationScore>;
1941
+ /**
1942
+ * Built-in mode: Uses Vercel AI SDK generateText() with sandboxed filesystem tools.
1943
+ */
1944
+ private evaluateBuiltIn;
1945
+ /**
1946
+ * Judge target mode: Delegates to an external agent provider via Provider.invoke().
1947
+ */
1948
+ private evaluateWithJudgeTarget;
1949
+ /**
1950
+ * Parse the agent's response text into an EvaluationScore.
1951
+ * Supports both freeform and rubric modes.
1952
+ */
1953
+ private parseResult;
1954
+ /**
1955
+ * Build system prompt for built-in mode.
1956
+ * Includes output format instructions.
1957
+ */
1958
+ private buildSystemPrompt;
1959
+ /**
1960
+ * Build user prompt for built-in mode.
1961
+ * Uses custom template if provided, otherwise builds default prompt.
1962
+ */
1963
+ private buildUserPrompt;
1964
+ /**
1965
+ * Build the full evaluation prompt for judge target mode (delegation).
1966
+ * Combines task context, criteria, candidate info, and output format instructions.
1967
+ */
1968
+ private buildDelegatedPrompt;
1969
+ }
1970
+
1971
+ interface LlmJudgePromptAssembly {
1972
+ systemPrompt: string;
1973
+ userPrompt: string;
1974
+ responseSchema: string;
1975
+ mode: 'freeform' | 'checklist' | 'score_range';
1976
+ }
1977
+ declare function assembleLlmJudgePrompt(input: {
1978
+ evalCase: EvalTest;
1979
+ candidate: string;
1980
+ promptInputs: PromptInputs;
1981
+ evaluatorConfig?: LlmJudgeEvaluatorConfig;
1982
+ output?: readonly Message[];
1983
+ fileChanges?: string;
1984
+ evaluatorTemplateOverride?: string;
1985
+ }): LlmJudgePromptAssembly;
1300
1986
 
1301
1987
  interface TokenUsageEvaluatorOptions {
1302
1988
  readonly config: TokenUsageEvaluatorConfig;
1303
1989
  }
1304
1990
  /**
1305
1991
  * Evaluator that checks provider-reported token usage against configured limits.
1306
- * Uses traceSummary.tokenUsage from the evaluation context.
1992
+ * Uses trace.tokenUsage from the evaluation context.
1307
1993
  */
1308
1994
  declare class TokenUsageEvaluator implements Evaluator {
1309
1995
  readonly kind = "token_usage";
@@ -1331,6 +2017,109 @@ declare class ToolTrajectoryEvaluator implements Evaluator {
1331
2017
  private evaluateAnyOrder;
1332
2018
  private evaluateInOrder;
1333
2019
  private evaluateExact;
2020
+ /**
2021
+ * Superset mode: actual trajectory must contain all expected tool calls.
2022
+ * Every expected item must be found in actual (greedy matching with consumption).
2023
+ * Extra tool calls in actual are OK.
2024
+ */
2025
+ private evaluateSuperset;
2026
+ /**
2027
+ * Subset mode: every actual tool call must be in the allowed list.
2028
+ * Expected items are reusable (not consumed) - they define the allowed set.
2029
+ * If every actual call matches at least one expected item, score is 1.
2030
+ */
2031
+ private evaluateSubset;
2032
+ }
2033
+
2034
+ /**
2035
+ * Deterministic assertion evaluators.
2036
+ *
2037
+ * Pure functions that check agent output against simple conditions
2038
+ * and return a binary score (0 or 1) with descriptive hits/misses.
2039
+ */
2040
+ type AssertionResult = {
2041
+ score: number;
2042
+ hits: string[];
2043
+ misses: string[];
2044
+ };
2045
+ /** Checks if `output` contains the given `value` substring. */
2046
+ declare function runContainsAssertion(output: string, value: string): AssertionResult;
2047
+ /** Checks if `output` matches the given regex `pattern`. */
2048
+ declare function runRegexAssertion(output: string, pattern: string): AssertionResult;
2049
+ /** Checks if `output` is valid JSON. */
2050
+ declare function runIsJsonAssertion(output: string): AssertionResult;
2051
+ /** Checks if `output` exactly equals `value` (both trimmed). */
2052
+ declare function runEqualsAssertion(output: string, value: string): AssertionResult;
2053
+
2054
+ /**
2055
+ * Extensible evaluator registry.
2056
+ *
2057
+ * Replaces the hardcoded switch/case dispatch in the orchestrator with
2058
+ * a registry of named factory functions. Built-in evaluators are registered
2059
+ * at startup; users can add custom evaluators via `defineAssertion()` in
2060
+ * `@agentv/eval` or by dropping files in `.agentv/assertions/`.
2061
+ */
2062
+
2063
+ /**
2064
+ * Context passed to evaluator factory functions during creation.
2065
+ * Contains shared resources needed by evaluator instances.
2066
+ */
2067
+ interface EvaluatorDispatchContext {
2068
+ /** Shared LLM judge provider (resolved at suite level) */
2069
+ readonly judgeProvider?: Provider;
2070
+ /** Function to resolve target names to providers */
2071
+ readonly targetResolver?: TargetResolver;
2072
+ /** Available target names for code judges */
2073
+ readonly availableTargets?: readonly string[];
2074
+ /** Agent timeout in ms */
2075
+ readonly agentTimeoutMs?: number;
2076
+ /** Directory containing the eval file (for composite member resolution) */
2077
+ readonly evalFileDir?: string;
2078
+ /** Shared LLM judge evaluator instance */
2079
+ readonly llmJudge: Evaluator;
2080
+ /** Reference to the registry itself (for composite evaluators that need to create children) */
2081
+ readonly registry: EvaluatorRegistry;
2082
+ }
2083
+ /**
2084
+ * Factory function that creates an Evaluator instance from a config.
2085
+ *
2086
+ * Factory functions handle all type-specific initialization logic:
2087
+ * - Reading prompt files for LLM judges
2088
+ * - Resolving script paths for code judges
2089
+ * - Creating adapter evaluators for deterministic assertions
2090
+ */
2091
+ type EvaluatorFactoryFn = (config: EvaluatorConfig, context: EvaluatorDispatchContext) => Evaluator | Promise<Evaluator>;
2092
+ /**
2093
+ * Registry of evaluator factory functions keyed by evaluator type name.
2094
+ *
2095
+ * Built-in evaluators are registered at startup. Custom evaluators can be
2096
+ * registered via the `register()` method or discovered from `.agentv/assertions/`.
2097
+ */
2098
+ declare class EvaluatorRegistry {
2099
+ private readonly factories;
2100
+ /** Register a factory function for an evaluator type. */
2101
+ register(type: string, factory: EvaluatorFactoryFn): this;
2102
+ /** Get the factory function for an evaluator type. */
2103
+ get(type: string): EvaluatorFactoryFn | undefined;
2104
+ /** Check if a factory is registered for the given type. */
2105
+ has(type: string): boolean;
2106
+ /** List all registered evaluator type names. */
2107
+ list(): string[];
2108
+ /**
2109
+ * Create an evaluator instance from a config, using the registered factory.
2110
+ * Throws if no factory is registered for the evaluator type.
2111
+ */
2112
+ create(config: EvaluatorConfig, context: EvaluatorDispatchContext): Promise<Evaluator>;
2113
+ }
2114
+ /**
2115
+ * Adapter that wraps a synchronous assertion function as an Evaluator.
2116
+ * Used for deterministic assertions (contains, regex, is_json, equals).
2117
+ */
2118
+ declare class DeterministicAssertionEvaluator implements Evaluator {
2119
+ private readonly assertFn;
2120
+ readonly kind: string;
2121
+ constructor(kind: string, assertFn: (context: EvaluationContext) => EvaluationScore);
2122
+ evaluate(context: EvaluationContext): EvaluationScore;
1334
2123
  }
1335
2124
 
1336
2125
  type MaybePromise<T> = T | Promise<T>;
@@ -1339,7 +2128,7 @@ interface EvaluationCache {
1339
2128
  set(key: string, value: ProviderResponse): MaybePromise<void>;
1340
2129
  }
1341
2130
  interface RunEvalCaseOptions {
1342
- readonly evalCase: EvalCase;
2131
+ readonly evalCase: EvalTest;
1343
2132
  readonly provider: Provider;
1344
2133
  readonly target: ResolvedTarget;
1345
2134
  readonly evaluators: Partial<Record<string, Evaluator>> & {
@@ -1356,10 +2145,26 @@ interface RunEvalCaseOptions {
1356
2145
  readonly targetResolver?: (name: string) => Provider | undefined;
1357
2146
  /** List of available target names for code judges */
1358
2147
  readonly availableTargets?: readonly string[];
2148
+ /** Unique identifier for the evaluation run (used for workspace management) */
2149
+ readonly evalRunId?: string;
2150
+ /** Keep workspace on success (default: cleanup on success, keep on failure) */
2151
+ readonly keepWorkspaces?: boolean;
2152
+ /** Force cleanup of workspaces even on failure */
2153
+ readonly cleanupWorkspaces?: boolean;
2154
+ /** Pre-created shared workspace path (shared across tests in a suite) */
2155
+ readonly sharedWorkspacePath?: string;
2156
+ /** Pre-initialized baseline commit for shared workspace */
2157
+ readonly sharedBaselineCommit?: string;
2158
+ /** Suite-level .code-workspace file (resolved from workspace.template) */
2159
+ readonly suiteWorkspaceFile?: string;
2160
+ /** Real-time observability callbacks passed to the provider */
2161
+ readonly streamCallbacks?: ProviderStreamCallbacks;
2162
+ /** Evaluator type registry (with custom assertions discovered) */
2163
+ readonly typeRegistry?: EvaluatorRegistry;
1359
2164
  }
1360
2165
  interface ProgressEvent {
1361
2166
  readonly workerId: number;
1362
- readonly evalId: string;
2167
+ readonly testId: string;
1363
2168
  readonly status: 'pending' | 'running' | 'completed' | 'failed';
1364
2169
  readonly startedAt?: number;
1365
2170
  readonly completedAt?: number;
@@ -1378,19 +2183,367 @@ interface RunEvaluationOptions {
1378
2183
  readonly cache?: EvaluationCache;
1379
2184
  readonly useCache?: boolean;
1380
2185
  readonly now?: () => Date;
1381
- /** Filter eval cases by ID pattern (glob supported, e.g., "summary-*") */
2186
+ /** Filter tests by ID pattern (glob supported, e.g., "summary-*") */
1382
2187
  readonly filter?: string;
1383
2188
  readonly verbose?: boolean;
1384
2189
  readonly maxConcurrency?: number;
1385
- readonly evalCases?: readonly EvalCase[];
2190
+ readonly evalCases?: readonly EvalTest[];
1386
2191
  readonly onResult?: (result: EvaluationResult) => MaybePromise<void>;
1387
2192
  readonly onProgress?: (event: ProgressEvent) => MaybePromise<void>;
2193
+ /** Keep workspace on success (default: cleanup on success, keep on failure) */
2194
+ readonly keepWorkspaces?: boolean;
2195
+ /** Force cleanup of workspaces even on failure */
2196
+ readonly cleanupWorkspaces?: boolean;
2197
+ /** Trial configuration for running eval cases multiple times */
2198
+ readonly trials?: TrialsConfig;
2199
+ /** Real-time observability callbacks passed to the provider */
2200
+ readonly streamCallbacks?: ProviderStreamCallbacks;
1388
2201
  }
1389
2202
  declare function runEvaluation(options: RunEvaluationOptions): Promise<readonly EvaluationResult[]>;
1390
2203
  declare function runEvalCase(options: RunEvalCaseOptions): Promise<EvaluationResult>;
1391
2204
 
2205
+ /**
2206
+ * Programmatic API for running evaluations.
2207
+ *
2208
+ * Provides `evaluate()` — a high-level function for using AgentV as a library
2209
+ * instead of a CLI. The config shape mirrors the YAML structure for easy
2210
+ * translation between file-based and programmatic usage.
2211
+ *
2212
+ * @example Inline tests
2213
+ * ```typescript
2214
+ * import { evaluate } from '@agentv/core';
2215
+ *
2216
+ * const results = await evaluate({
2217
+ * tests: [
2218
+ * {
2219
+ * id: 'capital',
2220
+ * input: 'What is the capital of France?',
2221
+ * expected_output: 'Paris',
2222
+ * assert: [{ type: 'contains', value: 'Paris' }],
2223
+ * },
2224
+ * ],
2225
+ * target: { provider: 'mock_agent' },
2226
+ * });
2227
+ *
2228
+ * console.log(results.summary.passed, 'passed');
2229
+ * ```
2230
+ *
2231
+ * @example File-based
2232
+ * ```typescript
2233
+ * const results = await evaluate({
2234
+ * specFile: './evals/EVAL.yaml',
2235
+ * target: { provider: 'claude_agent' },
2236
+ * });
2237
+ * ```
2238
+ *
2239
+ * @module
2240
+ */
2241
+
2242
+ /**
2243
+ * Inline test definition for the programmatic API.
2244
+ * Mirrors the YAML test structure.
2245
+ */
2246
+ interface EvalTestInput {
2247
+ /** Unique test identifier */
2248
+ readonly id: string;
2249
+ /** What the response should accomplish */
2250
+ readonly criteria?: string;
2251
+ /** Input to the agent (string or message array) */
2252
+ readonly input: string | readonly {
2253
+ role: string;
2254
+ content: string;
2255
+ }[];
2256
+ /** Expected reference output */
2257
+ readonly expected_output?: string;
2258
+ /** Assertion evaluators */
2259
+ readonly assert?: readonly EvalAssertionInput[];
2260
+ /** Arbitrary metadata */
2261
+ readonly metadata?: Record<string, unknown>;
2262
+ }
2263
+ /**
2264
+ * Inline assertion definition for the programmatic API.
2265
+ * Matches the YAML `assert` block structure.
2266
+ */
2267
+ interface EvalAssertionInput {
2268
+ /** Assertion type (e.g., 'contains', 'llm_judge', 'code_judge') */
2269
+ readonly type: string;
2270
+ /** Display name */
2271
+ readonly name?: string;
2272
+ /** Value for deterministic assertions (contains, equals, regex) */
2273
+ readonly value?: string;
2274
+ /** Weight for scoring */
2275
+ readonly weight?: number;
2276
+ /** Whether this assertion is required to pass */
2277
+ readonly required?: boolean | number;
2278
+ /** Prompt file for llm_judge */
2279
+ readonly prompt?: string;
2280
+ /** Script for code_judge */
2281
+ readonly script?: string | readonly string[];
2282
+ /** Additional config passed to the assertion */
2283
+ readonly config?: Record<string, unknown>;
2284
+ /** Nested assertions for composite type */
2285
+ readonly assert?: readonly EvalAssertionInput[];
2286
+ /** Rubric criteria for rubrics type */
2287
+ readonly criteria?: readonly (string | {
2288
+ id?: string;
2289
+ outcome: string;
2290
+ weight?: number;
2291
+ })[];
2292
+ /** Additional properties */
2293
+ readonly [key: string]: unknown;
2294
+ }
2295
+ /**
2296
+ * Configuration for `evaluate()`.
2297
+ * Accepts either inline tests or a spec file path.
2298
+ */
2299
+ interface EvalConfig {
2300
+ /** Inline test definitions (mutually exclusive with specFile) */
2301
+ readonly tests?: readonly EvalTestInput[];
2302
+ /** Path to an EVAL.yaml spec file (mutually exclusive with tests) */
2303
+ readonly specFile?: string;
2304
+ /** Target provider configuration */
2305
+ readonly target?: TargetDefinition;
2306
+ /** Suite-level assertions applied to all tests */
2307
+ readonly assert?: readonly EvalAssertionInput[];
2308
+ /** Filter tests by ID pattern (glob supported) */
2309
+ readonly filter?: string;
2310
+ /** Maximum concurrent workers (default: 3) */
2311
+ readonly workers?: number;
2312
+ /** Maximum retries on failure (default: 2) */
2313
+ readonly maxRetries?: number;
2314
+ /** Agent timeout in milliseconds (default: 120000) */
2315
+ readonly agentTimeoutMs?: number;
2316
+ /** Enable response caching */
2317
+ readonly cache?: boolean;
2318
+ /** Verbose logging */
2319
+ readonly verbose?: boolean;
2320
+ /** Callback for each completed result */
2321
+ readonly onResult?: (result: EvaluationResult) => void;
2322
+ }
2323
+ /**
2324
+ * Summary statistics for an evaluation run.
2325
+ */
2326
+ interface EvalSummary {
2327
+ /** Total number of test cases */
2328
+ readonly total: number;
2329
+ /** Number of passing test cases (score >= 0.8) */
2330
+ readonly passed: number;
2331
+ /** Number of failing test cases (score < 0.5) */
2332
+ readonly failed: number;
2333
+ /** Number of borderline test cases (0.5 <= score < 0.8) */
2334
+ readonly borderline: number;
2335
+ /** Total duration in milliseconds */
2336
+ readonly durationMs: number;
2337
+ /** Mean score across all cases */
2338
+ readonly meanScore: number;
2339
+ }
2340
+ /**
2341
+ * Result of an `evaluate()` call.
2342
+ */
2343
+ interface EvalRunResult {
2344
+ /** Individual test case results */
2345
+ readonly results: readonly EvaluationResult[];
2346
+ /** Aggregate summary statistics */
2347
+ readonly summary: EvalSummary;
2348
+ }
2349
+ /**
2350
+ * Run an evaluation suite against a target provider.
2351
+ *
2352
+ * Accepts either inline test definitions or a path to an EVAL.yaml spec file.
2353
+ * The config shape mirrors the YAML structure — users can translate between
2354
+ * file-based and programmatic usage 1:1.
2355
+ *
2356
+ * @param config - Evaluation configuration
2357
+ * @returns Typed evaluation results with summary statistics
2358
+ *
2359
+ * @example Inline tests with assertions
2360
+ * ```typescript
2361
+ * const { results, summary } = await evaluate({
2362
+ * tests: [
2363
+ * {
2364
+ * id: 'greeting',
2365
+ * input: 'Say hello',
2366
+ * assert: [{ type: 'contains', value: 'hello' }],
2367
+ * },
2368
+ * ],
2369
+ * target: { provider: 'mock_agent' },
2370
+ * });
2371
+ * console.log(`${summary.passed}/${summary.total} passed`);
2372
+ * ```
2373
+ *
2374
+ * @example Load from YAML
2375
+ * ```typescript
2376
+ * const { summary } = await evaluate({
2377
+ * specFile: './evals/my-eval.yaml',
2378
+ * filter: 'greeting-*',
2379
+ * });
2380
+ * ```
2381
+ */
2382
+ declare function evaluate(config: EvalConfig): Promise<EvalRunResult>;
2383
+
2384
+ /**
2385
+ * Typed configuration file support for AgentV.
2386
+ *
2387
+ * Provides `defineConfig()` for use in `agentv.config.ts` files. Supports
2388
+ * auto-discovery, Zod validation, and IDE autocomplete.
2389
+ *
2390
+ * @example
2391
+ * ```typescript
2392
+ * // agentv.config.ts
2393
+ * import { defineConfig } from '@agentv/core';
2394
+ *
2395
+ * export default defineConfig({
2396
+ * execution: {
2397
+ * workers: 5,
2398
+ * maxRetries: 2,
2399
+ * agentTimeoutMs: 120_000,
2400
+ * },
2401
+ * output: {
2402
+ * format: 'jsonl',
2403
+ * dir: './results',
2404
+ * },
2405
+ * });
2406
+ * ```
2407
+ *
2408
+ * @module
2409
+ */
2410
+
2411
+ /**
2412
+ * Schema for AgentV project-level configuration.
2413
+ */
2414
+ declare const AgentVConfigSchema: z.ZodObject<{
2415
+ /** Default execution settings */
2416
+ execution: z.ZodOptional<z.ZodObject<{
2417
+ /** Number of parallel workers (default: 3) */
2418
+ workers: z.ZodOptional<z.ZodNumber>;
2419
+ /** Maximum retries on failure (default: 2) */
2420
+ maxRetries: z.ZodOptional<z.ZodNumber>;
2421
+ /** Agent timeout in milliseconds (default: 120000) */
2422
+ agentTimeoutMs: z.ZodOptional<z.ZodNumber>;
2423
+ }, "strip", z.ZodTypeAny, {
2424
+ workers?: number | undefined;
2425
+ maxRetries?: number | undefined;
2426
+ agentTimeoutMs?: number | undefined;
2427
+ }, {
2428
+ workers?: number | undefined;
2429
+ maxRetries?: number | undefined;
2430
+ agentTimeoutMs?: number | undefined;
2431
+ }>>;
2432
+ /** Output settings */
2433
+ output: z.ZodOptional<z.ZodObject<{
2434
+ /** Output format */
2435
+ format: z.ZodOptional<z.ZodEnum<["jsonl", "yaml", "json", "xml"]>>;
2436
+ /** Output directory */
2437
+ dir: z.ZodOptional<z.ZodString>;
2438
+ }, "strip", z.ZodTypeAny, {
2439
+ dir?: string | undefined;
2440
+ format?: "yaml" | "jsonl" | "json" | "xml" | undefined;
2441
+ }, {
2442
+ dir?: string | undefined;
2443
+ format?: "yaml" | "jsonl" | "json" | "xml" | undefined;
2444
+ }>>;
2445
+ /** Response caching */
2446
+ cache: z.ZodOptional<z.ZodObject<{
2447
+ /** Enable response caching */
2448
+ enabled: z.ZodOptional<z.ZodBoolean>;
2449
+ /** Cache file path */
2450
+ path: z.ZodOptional<z.ZodString>;
2451
+ }, "strip", z.ZodTypeAny, {
2452
+ enabled?: boolean | undefined;
2453
+ path?: string | undefined;
2454
+ }, {
2455
+ enabled?: boolean | undefined;
2456
+ path?: string | undefined;
2457
+ }>>;
2458
+ /** Cost and duration limits */
2459
+ limits: z.ZodOptional<z.ZodObject<{
2460
+ /** Maximum cost per run in USD */
2461
+ maxCostUsd: z.ZodOptional<z.ZodNumber>;
2462
+ /** Maximum duration per run in milliseconds */
2463
+ maxDurationMs: z.ZodOptional<z.ZodNumber>;
2464
+ }, "strip", z.ZodTypeAny, {
2465
+ maxDurationMs?: number | undefined;
2466
+ maxCostUsd?: number | undefined;
2467
+ }, {
2468
+ maxDurationMs?: number | undefined;
2469
+ maxCostUsd?: number | undefined;
2470
+ }>>;
2471
+ }, "strip", z.ZodTypeAny, {
2472
+ output?: {
2473
+ dir?: string | undefined;
2474
+ format?: "yaml" | "jsonl" | "json" | "xml" | undefined;
2475
+ } | undefined;
2476
+ execution?: {
2477
+ workers?: number | undefined;
2478
+ maxRetries?: number | undefined;
2479
+ agentTimeoutMs?: number | undefined;
2480
+ } | undefined;
2481
+ cache?: {
2482
+ enabled?: boolean | undefined;
2483
+ path?: string | undefined;
2484
+ } | undefined;
2485
+ limits?: {
2486
+ maxDurationMs?: number | undefined;
2487
+ maxCostUsd?: number | undefined;
2488
+ } | undefined;
2489
+ }, {
2490
+ output?: {
2491
+ dir?: string | undefined;
2492
+ format?: "yaml" | "jsonl" | "json" | "xml" | undefined;
2493
+ } | undefined;
2494
+ execution?: {
2495
+ workers?: number | undefined;
2496
+ maxRetries?: number | undefined;
2497
+ agentTimeoutMs?: number | undefined;
2498
+ } | undefined;
2499
+ cache?: {
2500
+ enabled?: boolean | undefined;
2501
+ path?: string | undefined;
2502
+ } | undefined;
2503
+ limits?: {
2504
+ maxDurationMs?: number | undefined;
2505
+ maxCostUsd?: number | undefined;
2506
+ } | undefined;
2507
+ }>;
2508
+ /**
2509
+ * AgentV project-level configuration type.
2510
+ * Inferred from the Zod schema for full type safety.
2511
+ */
2512
+ type AgentVConfig = z.infer<typeof AgentVConfigSchema>;
2513
+ /**
2514
+ * Define a typed AgentV configuration.
2515
+ *
2516
+ * Use this in `agentv.config.ts` at your project root. The configuration
2517
+ * is validated at load time and provides full IDE autocomplete.
2518
+ *
2519
+ * @param config - Configuration object
2520
+ * @returns Validated configuration
2521
+ *
2522
+ * @example
2523
+ * ```typescript
2524
+ * import { defineConfig } from '@agentv/core';
2525
+ *
2526
+ * export default defineConfig({
2527
+ * execution: { workers: 5 },
2528
+ * output: { format: 'jsonl', dir: './results' },
2529
+ * limits: { maxCostUsd: 10.0 },
2530
+ * });
2531
+ * ```
2532
+ */
2533
+ declare function defineConfig(config: AgentVConfig): AgentVConfig;
2534
+ /**
2535
+ * Discover and load an AgentV config file from the project root.
2536
+ *
2537
+ * Searches for config files in discovery order. Returns null if
2538
+ * no config file is found.
2539
+ *
2540
+ * @param projectRoot - Project root directory to search from
2541
+ * @returns Loaded and validated config, or null if not found
2542
+ */
2543
+ declare function loadTsConfig(projectRoot: string): Promise<AgentVConfig | null>;
2544
+
1392
2545
  interface GenerateRubricsOptions {
1393
- readonly expectedOutcome: string;
2546
+ readonly criteria: string;
1394
2547
  readonly question?: string;
1395
2548
  readonly referenceAnswer?: string;
1396
2549
  readonly provider: Provider;
@@ -1400,9 +2553,339 @@ interface GenerateRubricsOptions {
1400
2553
  */
1401
2554
  declare function generateRubrics(options: GenerateRubricsOptions): Promise<readonly RubricItem[]>;
1402
2555
 
2556
+ /**
2557
+ * Error thrown when the template path does not exist.
2558
+ */
2559
+ declare class TemplateNotFoundError extends Error {
2560
+ constructor(templatePath: string);
2561
+ }
2562
+ /**
2563
+ * Error thrown when the template path is a file instead of a directory.
2564
+ */
2565
+ declare class TemplateNotDirectoryError extends Error {
2566
+ constructor(templatePath: string);
2567
+ }
2568
+ /**
2569
+ * Error thrown when workspace creation fails, e.g. because of insufficient disk space or another I/O error.
2570
+ */
2571
+ declare class WorkspaceCreationError extends Error {
2572
+ readonly cause?: Error | undefined;
2573
+ constructor(message: string, cause?: Error | undefined);
2574
+ }
2575
+ /**
2576
+ * Get the workspace path for a specific eval case.
2577
+ *
2578
+ * Workspace structure:
2579
+ * {workspaceRoot}/{evalRunId}/{caseId}
2580
+ *
2581
+ * Example:
2582
+ * ~/.agentv/workspaces/abc123/case-01
2583
+ *
2584
+ * @param evalRunId - The unique identifier for the evaluation run
2585
+ * @param caseId - The unique identifier for the evaluation case
2586
+ * @param workspaceRoot - Optional custom workspace root directory (defaults to ~/.agentv/workspaces)
2587
+ * @returns Absolute path to the workspace directory
2588
+ */
2589
+ declare function getWorkspacePath(evalRunId: string, caseId: string, workspaceRoot?: string): string;
2590
+ /**
2591
+ * Create a temporary workspace by copying a template directory.
2592
+ *
2593
+ * The workspace is created at {workspaceRoot}/{evalRunId}/{caseId}/ (workspaceRoot defaults to ~/.agentv/workspaces).
2594
+ * The .git directory from the template is skipped during copy.
2595
+ *
2596
+ * @param templatePath - Absolute path to the template directory
2597
+ * @param evalRunId - The unique identifier for the evaluation run
2598
+ * @param caseId - The unique identifier for the evaluation case
2599
+ * @param workspaceRoot - Optional custom workspace root directory
2600
+ * @returns Absolute path to the created workspace directory
2601
+ * @throws TemplateNotFoundError if the template path does not exist
2602
+ * @throws TemplateNotDirectoryError if the template path is not a directory
2603
+ * @throws WorkspaceCreationError if there's an error creating the workspace
2604
+ */
2605
+ declare function createTempWorkspace(templatePath: string, evalRunId: string, caseId: string, workspaceRoot?: string): Promise<string>;
2606
+ /**
2607
+ * Remove a single workspace directory.
2608
+ *
2609
+ * @param workspacePath - Absolute path to the workspace directory to remove
2610
+ * @throws Error if the cleanup fails
2611
+ */
2612
+ declare function cleanupWorkspace(workspacePath: string): Promise<void>;
2613
+ /**
2614
+ * Remove all workspaces for an evaluation run.
2615
+ *
2616
+ * This removes the entire {workspaceRoot}/{evalRunId} directory,
2617
+ * cleaning up all case workspaces for that run.
2618
+ *
2619
+ * @param evalRunId - The unique identifier for the evaluation run
2620
+ * @param workspaceRoot - Optional custom workspace root directory
2621
+ * @throws Error if the cleanup fails
2622
+ */
2623
+ declare function cleanupEvalWorkspaces(evalRunId: string, workspaceRoot?: string): Promise<void>;
2624
+
2625
+ /**
2626
+ * Context passed to workspace lifecycle scripts via stdin.
2627
+ */
2628
+ interface ScriptExecutionContext {
2629
+ readonly workspacePath: string;
2630
+ readonly testId: string;
2631
+ readonly evalRunId: string;
2632
+ readonly caseInput?: string;
2633
+ readonly caseMetadata?: Record<string, unknown>;
2634
+ }
2635
+ type ScriptFailureMode = 'fatal' | 'warn';
2636
+ /**
2637
+ * Executes a workspace lifecycle script (before_all, after_all, before_each, after_each).
2638
+ *
2639
+ * @param config - Workspace script configuration (script, timeout_ms, cwd)
2640
+ * @param context - Context passed to script via stdin (JSON)
2641
+ * @param failureMode - 'fatal' throws on non-zero exit; 'warn' logs a warning
2642
+ * @returns Captured stdout from the script
2643
+ * @throws Error if the script exits with a non-zero code (fatal mode) or times out
2644
+ */
2645
+ declare function executeWorkspaceScript(config: WorkspaceScriptConfig, context: ScriptExecutionContext, failureMode?: ScriptFailureMode): Promise<string>;
2646
+
2647
+ /**
2648
+ * Initialize a git baseline for workspace file change tracking.
2649
+ *
2650
+ * Runs `git init` directly in the workspace, stages all files, and creates
2651
+ * a baseline commit. Returns the commit hash for later diffing.
2652
+ */
2653
+ declare function initializeBaseline(workspacePath: string): Promise<string>;
2654
+ /**
2655
+ * Capture file changes from workspace relative to the baseline commit.
2656
+ * Returns a unified diff string, or empty string if no changes.
2657
+ *
2658
+ * Supports nested git repos (e.g. cloned dependencies): stages files inside
2659
+ * each child repo first, then uses `--submodule=diff` to expand submodule
2660
+ * changes into individual file diffs rather than opaque gitlink hashes.
2661
+ */
2662
+ declare function captureFileChanges(workspacePath: string, baselineCommit: string): Promise<string>;
2663
+
2664
+ interface ResolvedWorkspaceTemplate {
2665
+ /** Directory to copy as the working directory (for createTempWorkspace / request.cwd) */
2666
+ readonly dir: string;
2667
+ /** Optional .code-workspace file for VS Code providers */
2668
+ readonly workspaceFile?: string;
2669
+ }
2670
+ /**
2671
+ * Resolves a workspace.template value into a directory + optional .code-workspace file.
2672
+ *
2673
+ * Resolution rules:
2674
+ * - .code-workspace file → dir = parent directory, workspaceFile = the file
2675
+ * - Directory with exactly 1 .code-workspace → dir = directory, workspaceFile = that file
2676
+ * - Directory with N > 1 .code-workspace → dir = directory, workspaceFile = template.code-workspace (if present)
2677
+ * - Directory with 0 .code-workspace → dir = directory, workspaceFile = undefined
2678
+ */
2679
+ declare function resolveWorkspaceTemplate(templatePath: string | undefined): Promise<ResolvedWorkspaceTemplate | undefined>;
2680
+
2681
+ /**
2682
+ * File-based LLM response cache.
2683
+ * Stores provider responses as JSON files keyed by SHA-256 hash.
2684
+ * Directory structure: <cache_path>/<first-2-chars>/<full-hash>.json
2685
+ */
2686
+ declare class ResponseCache implements EvaluationCache {
2687
+ private readonly cachePath;
2688
+ constructor(cachePath?: string);
2689
+ get(key: string): Promise<ProviderResponse | undefined>;
2690
+ set(key: string, value: ProviderResponse): Promise<void>;
2691
+ private keyToPath;
2692
+ }
2693
+ /**
2694
+ * Determine whether caching should be active for a given run.
2695
+ *
2696
+ * Precedence:
2697
+ * 1. --no-cache CLI flag → always disabled
2698
+ * 2. --cache CLI flag OR execution.cache YAML → enabled
2699
+ * 3. Default → disabled (safe for variability testing)
2700
+ */
2701
+ declare function shouldEnableCache(params: {
2702
+ cliCache: boolean;
2703
+ cliNoCache: boolean;
2704
+ yamlCache?: boolean;
2705
+ }): boolean;
2706
+ /**
2707
+ * Check whether caching should be skipped for a target with temperature > 0.
2708
+ * Non-deterministic responses should not be cached unless explicitly forced.
2709
+ */
2710
+ declare function shouldSkipCacheForTemperature(targetConfig: Record<string, unknown>): boolean;
2711
+
2712
+ /**
2713
+ * Recursively converts all keys in an object from camelCase to snake_case.
2714
+ * This is used to convert TypeScript internal representations to snake_case
2715
+ * for Python ecosystem compatibility in JSON payloads.
2716
+ *
2717
+ * Conversion rules:
2718
+ * - Object keys: camelCase -> snake_case
2719
+ * - Array elements: recursively converted
2720
+ * - Primitives: returned unchanged
2721
+ * - null/undefined: returned unchanged
2722
+ *
2723
+ * @param obj - The object to convert (can be any JSON-serializable value)
2724
+ * @returns A new object with all keys converted to snake_case
2725
+ */
2726
+ declare function toSnakeCaseDeep(obj: unknown): unknown;
2727
+ /**
2728
+ * Recursively converts all keys in an object from snake_case to camelCase.
2729
+ * This is used by optional SDK helpers to map wire payloads into TypeScript-friendly
2730
+ * shapes.
2731
+ *
2732
+ * @param obj - The object to convert (can be any JSON-serializable value)
2733
+ * @returns A new object with all keys converted to camelCase
2734
+ */
2735
+ declare function toCamelCaseDeep(obj: unknown): unknown;
2736
+
2737
+ /**
2738
+ * Trims an EvaluationResult for baseline storage.
2739
+ * Strips large debug/audit fields (denylist approach) while preserving
2740
+ * all fields needed for regression comparison (scores, hits, misses, etc.).
2741
+ *
2742
+ * Returns a new object — the input is not mutated.
2743
+ */
2744
+ declare function trimBaselineResult(result: EvaluationResult): EvaluationResult;
2745
+
2746
+ /** Options for configuring the OTel trace exporter. */
2747
+ interface OtelExportOptions {
2748
+ /** OTLP endpoint URL */
2749
+ readonly endpoint?: string;
2750
+ /** Custom headers (e.g., auth) */
2751
+ readonly headers?: Record<string, string>;
2752
+ /** Whether to include message content in spans */
2753
+ readonly captureContent?: boolean;
2754
+ /** Service name for OTel resource */
2755
+ readonly serviceName?: string;
2756
+ /** When true, group messages into turn spans for multi-turn evals */
2757
+ readonly groupTurns?: boolean;
2758
+ /** Path to write OTLP JSON file (importable by OTel backends) */
2759
+ readonly otlpFilePath?: string;
2760
+ /** Path to write human-readable simple JSONL trace file */
2761
+ readonly traceFilePath?: string;
2762
+ }
2763
+ /** Preset configuration for a known observability backend. */
2764
+ interface OtelBackendPreset {
2765
+ readonly name: string;
2766
+ readonly endpoint: string;
2767
+ readonly headers: (env: Record<string, string | undefined>) => Record<string, string>;
2768
+ }
2769
+
2770
+ declare const OTEL_BACKEND_PRESETS: Record<string, OtelBackendPreset>;
2771
+ type OtelApi = any;
2772
+ type Tracer = any;
2773
+ declare class OtelTraceExporter {
2774
+ private readonly options;
2775
+ private provider;
2776
+ private tracer;
2777
+ private api;
2778
+ private W3CPropagator;
2779
+ constructor(options: OtelExportOptions);
2780
+ /** Initialize the OTel SDK. Returns false if OTel packages are not available. */
2781
+ init(): Promise<boolean>;
2782
+ /** Export a single evaluation result as an OTel trace. */
2783
+ exportResult(result: EvaluationResult): Promise<void>;
2784
+ /** Flush pending spans and shut down. */
2785
+ shutdown(): Promise<void>;
2786
+ /** Create a streaming observer for real-time span export */
2787
+ createStreamingObserver(): OtelStreamingObserver | null;
2788
+ private exportMessage;
2789
+ }
2790
+ /**
2791
+ * Streaming observer that creates OTel spans in real-time during eval execution.
2792
+ * Spans are exported immediately via SimpleSpanProcessor as each tool call / LLM response completes.
2793
+ */
2794
+ declare class OtelStreamingObserver {
2795
+ private readonly tracer;
2796
+ private readonly api;
2797
+ private readonly captureContent;
2798
+ private readonly parentCtx?;
2799
+ private rootSpan;
2800
+ private rootCtx;
2801
+ constructor(tracer: Tracer, api: OtelApi, captureContent: boolean, parentCtx?: any | undefined);
2802
+ /** Create root eval span immediately (visible in backend right away) */
2803
+ startEvalCase(testId: string, target: string, dataset?: string): void;
2804
+ /** Create and immediately export a tool span */
2805
+ onToolCall(name: string, input: unknown, output: unknown, _durationMs: number, toolCallId?: string): void;
2806
+ /** Create and immediately export an LLM span */
2807
+ onLlmCall(model: string, tokenUsage?: ProviderTokenUsage): void;
2808
+ /** Finalize root span with score/verdict after evaluation completes */
2809
+ finalizeEvalCase(score: number, error?: string): void;
2810
+ /** Get ProviderStreamCallbacks for passing to providers */
2811
+ getStreamCallbacks(): ProviderStreamCallbacks;
2812
+ }
2813
+
2814
+ type ReadableSpan$1 = any;
2815
+ /**
2816
+ * SpanExporter that writes OTLP JSON (the standard OTel wire format) to a file.
2817
+ * The file can be imported by any OTel-compatible backend.
2818
+ */
2819
+ declare class OtlpJsonFileExporter {
2820
+ private spans;
2821
+ private filePath;
2822
+ constructor(filePath: string);
2823
+ export(spans: ReadableSpan$1[], resultCallback: (result: {
2824
+ code: number;
2825
+ }) => void): void;
2826
+ shutdown(): Promise<void>;
2827
+ forceFlush(): Promise<void>;
2828
+ private flush;
2829
+ }
2830
+
2831
+ type ReadableSpan = any;
2832
+ /**
2833
+ * SpanExporter that writes human-readable JSONL (one line per root span).
2834
+ * Designed for quick debugging and analysis without OTel tooling.
2835
+ */
2836
+ declare class SimpleTraceFileExporter {
2837
+ private stream;
2838
+ private filePath;
2839
+ private streamReady;
2840
+ private pendingWrites;
2841
+ private _shuttingDown;
2842
+ constructor(filePath: string);
2843
+ private ensureStream;
2844
+ export(spans: ReadableSpan[], resultCallback: (result: {
2845
+ code: number;
2846
+ }) => void): void;
2847
+ shutdown(): Promise<void>;
2848
+ forceFlush(): Promise<void>;
2849
+ private collectChildren;
2850
+ private buildSimpleRecord;
2851
+ }
2852
+
2853
+ /**
2854
+ * Factory functions for all built-in evaluator types.
2855
+ *
2856
+ * Each factory creates an Evaluator instance from an EvaluatorConfig,
2857
+ * handling type-specific initialization logic. These are registered into
2858
+ * the EvaluatorRegistry at startup.
2859
+ */
2860
+
2861
+ /**
2862
+ * Create a new EvaluatorRegistry with all built-in evaluator types registered.
2863
+ */
2864
+ declare function createBuiltinRegistry(): EvaluatorRegistry;
2865
+
2866
+ /**
2867
+ * Convention-based discovery of custom assertion scripts.
2868
+ *
2869
+ * Scans `.agentv/assertions/` for TypeScript/JavaScript files and registers
2870
+ * them as code_judge evaluators in the registry. The file name (without
2871
+ * extension) becomes the evaluator type name.
2872
+ *
2873
+ * Example: `.agentv/assertions/sentiment.ts` → type "sentiment" in EVAL.yaml
2874
+ */
2875
+
2876
+ /**
2877
+ * Discover custom assertion scripts from `.agentv/assertions/` and register
2878
+ * them as evaluator types in the registry.
2879
+ *
2880
+ * @param registry - The evaluator registry to register discovered assertions into
2881
+ * @param baseDir - The base directory to search from (typically project root or eval file dir)
2882
+ * @returns Names of discovered assertion types
2883
+ */
2884
+ declare function discoverAssertions(registry: EvaluatorRegistry, baseDir: string): Promise<string[]>;
2885
+
1403
2886
  type AgentKernel = {
1404
2887
  status: string;
1405
2888
  };
1406
2889
  declare function createAgentKernel(): AgentKernel;
1407
2890
 
1408
- export { type AgentKernel, type AnthropicResolvedConfig, type AssistantTestMessage, type AzureResolvedConfig, type ChildEvaluatorResult, type ClaudeCodeResolvedConfig, type CliResolvedConfig, CodeEvaluator, type CodeEvaluatorConfig, type CodeEvaluatorOptions, type CompositeAggregatorConfig, CompositeEvaluator, type CompositeEvaluatorConfig, type CompositeEvaluatorOptions, type CopilotResolvedConfig, CostEvaluator, type CostEvaluatorConfig, type CostEvaluatorOptions, DEFAULT_EXPLORATION_TOOLS, type EnsureSubagentsOptions, type EnsureSubagentsResult, type EnvLookup, type EvalCase, type EvaluationCache, type EvaluationContext, type EvaluationResult, type EvaluationScore, type EvaluationVerdict, type Evaluator, type EvaluatorConfig, type EvaluatorFactory, type EvaluatorKind, type EvaluatorResult, type ExecutionMetrics, FieldAccuracyEvaluator, type FieldAccuracyEvaluatorConfig, type FieldAccuracyEvaluatorOptions, type FieldAggregationType, type FieldConfig, type FieldMatchType, type GeminiResolvedConfig, type GenerateRubricsOptions, type JsonObject, type JsonPrimitive, type JsonValue, LatencyEvaluator, type LatencyEvaluatorConfig, type LatencyEvaluatorOptions, LlmJudgeEvaluator, type LlmJudgeEvaluatorConfig, type LlmJudgeEvaluatorOptions, type MockResolvedConfig, type PiAgentSdkResolvedConfig, type PiCodingAgentResolvedConfig, type ProgressEvent, type PromptInputs, type PromptScriptConfig, type Provider, type ProviderKind, type ProviderRequest, type ProviderResponse, type ProviderTokenUsage, type ResolvedTarget, type RubricItem, type RunEvalCaseOptions, type RunEvaluationOptions, type ScoreRange, type SystemTestMessage, TEST_MESSAGE_ROLES, type TargetAccessConfig, type TargetDefinition, type TestMessage, type TestMessageContent, type TestMessageRole, type TokenUsage, TokenUsageEvaluator, type TokenUsageEvaluatorConfig, type TokenUsageEvaluatorOptions, type ToolTestMessage, ToolTrajectoryEvaluator, type ToolTrajectoryEvaluatorConfig, type 
ToolTrajectoryEvaluatorOptions, type ToolTrajectoryExpectedItem, type TraceSummary, type UserTestMessage, type VSCodeResolvedConfig, avgToolDurationMs, buildDirectoryChain, buildOutputSchema, buildPromptInputs, buildSearchRoots, clampScore, computeTraceSummary, consumeClaudeCodeLogEntries, consumeCodexLogEntries, consumeCopilotCliLogEntries, consumePiLogEntries, createAgentKernel, createProvider, deepEqual, detectFormat, ensureVSCodeSubagents, executeScript, explorationRatio, extractJsonBlob, fileExists, findGitRoot, freeformEvaluationSchema, generateRubrics, getHitCount, isEvaluatorKind, isGuidelineFile, isJsonObject, isJsonValue, isNonEmptyString, isTestMessage, isTestMessageRole, listTargetNames, loadEvalCases, mergeExecutionMetrics, normalizeLineEndings, parseJsonFromText, parseJsonSafe, readJsonFile, readTargetDefinitions, readTestSuiteMetadata, readTextFile, resolveAndCreateProvider, resolveFileReference, resolveTargetDefinition, runEvalCase, runEvaluation, scoreToVerdict, subscribeToClaudeCodeLogEntries, subscribeToCodexLogEntries, subscribeToCopilotCliLogEntries, subscribeToPiLogEntries, tokensPerTool };
2891
+ export { AgentJudgeEvaluator, type AgentJudgeEvaluatorConfig, type AgentJudgeEvaluatorOptions, type AgentKernel, type AgentVConfig$1 as AgentVConfig, type AgentVConfig as AgentVTsConfig, type AnthropicResolvedConfig, type ArgsMatchMode, type AssertionResult, type AssistantTestMessage, type AzureResolvedConfig, type CacheConfig, type ChildEvaluatorResult, type ClaudeResolvedConfig, type CliResolvedConfig, CodeEvaluator, type CodeEvaluatorConfig, type CodeEvaluatorOptions, type CompositeAggregatorConfig, CompositeEvaluator, type CompositeEvaluatorConfig, type CompositeEvaluatorOptions, type ConfidenceIntervalAggregation, type ContainsEvaluatorConfig, type CopilotCliResolvedConfig, type CopilotSdkResolvedConfig, CostEvaluator, type CostEvaluatorConfig, type CostEvaluatorOptions, DEFAULT_EVALUATOR_TEMPLATE, DEFAULT_EVAL_PATTERNS, DEFAULT_EXPLORATION_TOOLS, DeterministicAssertionEvaluator, type EnsureSubagentsOptions, type EnsureSubagentsResult, type EnvLookup, type EqualsEvaluatorConfig, type EvalAssertionInput, type EvalCase, type EvalConfig, type EvalMetadata, type EvalRunResult, type EvalSuiteResult, type EvalSummary, type EvalTest, type EvalTestInput, type EvaluationCache, type EvaluationContext, type EvaluationResult, type EvaluationScore, type EvaluationVerdict, type Evaluator, type EvaluatorConfig, type EvaluatorDispatchContext, type EvaluatorFactory, type EvaluatorFactoryFn, type EvaluatorKind, EvaluatorRegistry, type EvaluatorResult, type ExecutionMetrics, ExecutionMetricsEvaluator, type ExecutionMetricsEvaluatorConfig, type ExecutionMetricsEvaluatorOptions, FieldAccuracyEvaluator, type FieldAccuracyEvaluatorConfig, type FieldAccuracyEvaluatorOptions, type FieldAggregationType, type FieldConfig, type FieldMatchType, type GeminiResolvedConfig, type GenerateRubricsOptions, type IsJsonEvaluatorConfig, type JsonObject, type JsonPrimitive, type JsonValue, LatencyEvaluator, type LatencyEvaluatorConfig, type LatencyEvaluatorOptions, LlmJudgeEvaluator, type 
LlmJudgeEvaluatorConfig, type LlmJudgeEvaluatorOptions, type LlmJudgePromptAssembly, type MeanAggregation, type Message, type MockResolvedConfig, OTEL_BACKEND_PRESETS, type OtelBackendPreset, type OtelExportOptions, OtelStreamingObserver, OtelTraceExporter, OtlpJsonFileExporter, type OutputMessage, type PassAtKAggregation, type PiAgentSdkResolvedConfig, type PiCodingAgentResolvedConfig, type ProgressEvent, type PromptInputs, type PromptScriptConfig, type Provider, type ProviderFactoryFn, type ProviderKind, ProviderRegistry, type ProviderRequest, type ProviderResponse, type ProviderStreamCallbacks, type ProviderTokenUsage, type RegexEvaluatorConfig, type ResolvedTarget, type ResolvedWorkspaceTemplate, ResponseCache, type RubricItem, type RubricsEvaluatorConfig, type RunEvalCaseOptions, type RunEvaluationOptions, type ScoreRange, type ScriptExecutionContext, SimpleTraceFileExporter, type SystemTestMessage, TEST_MESSAGE_ROLES, type TargetAccessConfig, type TargetDefinition, TemplateNotDirectoryError, TemplateNotFoundError, type TestMessage, type TestMessageContent, type TestMessageRole, type TokenUsage, TokenUsageEvaluator, type TokenUsageEvaluatorConfig, type TokenUsageEvaluatorOptions, type ToolCall, type ToolTestMessage, ToolTrajectoryEvaluator, type ToolTrajectoryEvaluatorConfig, type ToolTrajectoryEvaluatorOptions, type ToolTrajectoryExpectedItem, type TraceSummary, type TrialAggregation, type TrialResult, type TrialStrategy, type TrialsConfig, type UserTestMessage, type VSCodeResolvedConfig, type WorkspaceConfig, WorkspaceCreationError, type WorkspaceScriptConfig, assembleLlmJudgePrompt, avgToolDurationMs, buildDirectoryChain, buildOutputSchema, buildPromptInputs, buildRubricOutputSchema, buildScoreRangeOutputSchema, buildSearchRoots, calculateRubricScore, captureFileChanges, clampScore, cleanupEvalWorkspaces, cleanupWorkspace, computeTraceSummary, consumeClaudeLogEntries, consumeCodexLogEntries, consumeCopilotCliLogEntries, consumeCopilotSdkLogEntries, 
consumePiLogEntries, createAgentKernel, createBuiltinProviderRegistry, createBuiltinRegistry, createProvider, createTempWorkspace, deepEqual, defineConfig, detectFormat, discoverAssertions, discoverProviders, ensureVSCodeSubagents, evaluate, executeScript, executeWorkspaceScript, explorationRatio, extractCacheConfig, extractJsonBlob, extractTargetFromSuite, extractTargetsFromSuite, extractTargetsFromTestCase, extractTrialsConfig, fileExists, findGitRoot, freeformEvaluationSchema, generateRubrics, getHitCount, getWorkspacePath, initializeBaseline, isEvaluatorKind, isGuidelineFile, isJsonObject, isJsonValue, isNonEmptyString, isTestMessage, isTestMessageRole, listTargetNames, loadConfig, loadEvalCaseById, loadEvalCases, loadEvalSuite, loadTestById, loadTestSuite, loadTests, loadTsConfig, mergeExecutionMetrics, negateScore, normalizeLineEndings, parseJsonFromText, parseJsonSafe, readJsonFile, readTargetDefinitions, readTestSuiteMetadata, readTextFile, resolveAndCreateProvider, resolveFileReference, resolveTargetDefinition, resolveWorkspaceTemplate, rubricEvaluationSchema, runContainsAssertion, runEqualsAssertion, runEvalCase, runEvaluation, runIsJsonAssertion, runRegexAssertion, scoreToVerdict, shouldEnableCache, shouldSkipCacheForTemperature, subscribeToClaudeLogEntries, subscribeToCodexLogEntries, subscribeToCopilotCliLogEntries, subscribeToCopilotSdkLogEntries, subscribeToPiLogEntries, substituteVariables, toCamelCaseDeep, toSnakeCaseDeep, tokensPerTool, trimBaselineResult };