@agentv/core 2.6.0 → 2.7.1-next.1
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/dist/{chunk-SSPAANFZ.js → chunk-6W5E3VR6.js} +383 -54
- package/dist/chunk-6W5E3VR6.js.map +1 -0
- package/dist/chunk-HFSYZHGF.js +82 -0
- package/dist/chunk-HFSYZHGF.js.map +1 -0
- package/dist/chunk-HMXZ2AX4.js +112 -0
- package/dist/chunk-HMXZ2AX4.js.map +1 -0
- package/dist/esm-5Q4BZALM.js +968 -0
- package/dist/esm-5Q4BZALM.js.map +1 -0
- package/dist/evaluation/validation/index.cjs +337 -70
- package/dist/evaluation/validation/index.cjs.map +1 -1
- package/dist/evaluation/validation/index.js +294 -69
- package/dist/evaluation/validation/index.js.map +1 -1
- package/dist/index.cjs +9221 -4037
- package/dist/index.cjs.map +1 -1
- package/dist/index.d.cts +1717 -234
- package/dist/index.d.ts +1717 -234
- package/dist/index.js +6559 -3140
- package/dist/index.js.map +1 -1
- package/dist/otlp-json-file-exporter-77FDBRSY.js +7 -0
- package/dist/otlp-json-file-exporter-77FDBRSY.js.map +1 -0
- package/dist/simple-trace-file-exporter-S76DMABU.js +7 -0
- package/dist/simple-trace-file-exporter-S76DMABU.js.map +1 -0
- package/package.json +16 -3
- package/dist/chunk-SSPAANFZ.js.map +0 -1
package/dist/index.d.cts
CHANGED
|
@@ -1,5 +1,217 @@
|
|
|
1
|
-
import { z } from 'zod';
|
|
2
1
|
import * as ai from 'ai';
|
|
2
|
+
import { z } from 'zod';
|
|
3
|
+
|
|
4
|
+
type ChatMessageRole = 'system' | 'user' | 'assistant' | 'tool' | 'function';
|
|
5
|
+
interface ChatMessage {
|
|
6
|
+
readonly role: ChatMessageRole;
|
|
7
|
+
readonly content: string;
|
|
8
|
+
readonly name?: string;
|
|
9
|
+
}
|
|
10
|
+
type ChatPrompt = readonly ChatMessage[];
|
|
11
|
+
type ProviderKind = 'azure' | 'anthropic' | 'gemini' | 'codex' | 'copilot' | 'copilot-cli' | 'pi-coding-agent' | 'pi-agent-sdk' | 'claude' | 'cli' | 'mock' | 'vscode' | 'vscode-insiders';
|
|
12
|
+
/** Callbacks for real-time observability during provider execution */
|
|
13
|
+
interface ProviderStreamCallbacks {
|
|
14
|
+
onToolCallStart?: (toolName: string, toolCallId?: string) => void;
|
|
15
|
+
onToolCallEnd?: (toolName: string, input: unknown, output: unknown, durationMs: number, toolCallId?: string) => void;
|
|
16
|
+
onLlmCallEnd?: (model: string, tokenUsage?: ProviderTokenUsage) => void;
|
|
17
|
+
}
|
|
18
|
+
interface ProviderRequest {
|
|
19
|
+
readonly question: string;
|
|
20
|
+
readonly systemPrompt?: string;
|
|
21
|
+
readonly guidelines?: string;
|
|
22
|
+
readonly guideline_patterns?: readonly string[];
|
|
23
|
+
readonly chatPrompt?: ChatPrompt;
|
|
24
|
+
readonly inputFiles?: readonly string[];
|
|
25
|
+
readonly evalCaseId?: string;
|
|
26
|
+
readonly attempt?: number;
|
|
27
|
+
readonly maxOutputTokens?: number;
|
|
28
|
+
readonly temperature?: number;
|
|
29
|
+
readonly metadata?: JsonObject;
|
|
30
|
+
readonly signal?: AbortSignal;
|
|
31
|
+
/** Working directory override (e.g., from workspace_template) */
|
|
32
|
+
readonly cwd?: string;
|
|
33
|
+
/** VS Code .code-workspace file (resolved from workspace.template) */
|
|
34
|
+
readonly workspaceFile?: string;
|
|
35
|
+
/** When true, AgentV captures file changes from workspace — provider should skip forced diff prompt */
|
|
36
|
+
readonly captureFileChanges?: boolean;
|
|
37
|
+
/** Real-time observability callbacks (optional) */
|
|
38
|
+
readonly streamCallbacks?: ProviderStreamCallbacks;
|
|
39
|
+
}
|
|
40
|
+
/**
|
|
41
|
+
* A tool call within an output message.
|
|
42
|
+
* Represents a single tool invocation with its input and optional output.
|
|
43
|
+
*/
|
|
44
|
+
interface ToolCall {
|
|
45
|
+
/** Tool name */
|
|
46
|
+
readonly tool: string;
|
|
47
|
+
/** Tool input arguments */
|
|
48
|
+
readonly input?: unknown;
|
|
49
|
+
/** Tool output result */
|
|
50
|
+
readonly output?: unknown;
|
|
51
|
+
/** Stable identifier for pairing tool calls */
|
|
52
|
+
readonly id?: string;
|
|
53
|
+
/** ISO 8601 timestamp when the tool call started */
|
|
54
|
+
readonly startTime?: string;
|
|
55
|
+
/** ISO 8601 timestamp when the tool call ended */
|
|
56
|
+
readonly endTime?: string;
|
|
57
|
+
/** Duration of the tool call in milliseconds */
|
|
58
|
+
readonly durationMs?: number;
|
|
59
|
+
}
|
|
60
|
+
/**
|
|
61
|
+
* An output message from agent execution.
|
|
62
|
+
* Represents a single message in the conversation with optional tool calls.
|
|
63
|
+
*/
|
|
64
|
+
interface Message {
|
|
65
|
+
/** Message role (e.g., 'assistant', 'user', 'tool') */
|
|
66
|
+
readonly role: string;
|
|
67
|
+
/** Optional name for the message sender */
|
|
68
|
+
readonly name?: string;
|
|
69
|
+
/** Message content */
|
|
70
|
+
readonly content?: unknown;
|
|
71
|
+
/** Tool calls made in this message */
|
|
72
|
+
readonly toolCalls?: readonly ToolCall[];
|
|
73
|
+
/** ISO 8601 timestamp when the message started */
|
|
74
|
+
readonly startTime?: string;
|
|
75
|
+
/** ISO 8601 timestamp when the message ended */
|
|
76
|
+
readonly endTime?: string;
|
|
77
|
+
/** Duration of the message in milliseconds */
|
|
78
|
+
readonly durationMs?: number;
|
|
79
|
+
/** Provider-specific metadata */
|
|
80
|
+
readonly metadata?: Record<string, unknown>;
|
|
81
|
+
/** Per-message token usage metrics (optional) */
|
|
82
|
+
readonly tokenUsage?: ProviderTokenUsage;
|
|
83
|
+
}
|
|
84
|
+
/** @deprecated Use Message instead */
|
|
85
|
+
type OutputMessage = Message;
|
|
86
|
+
/**
|
|
87
|
+
* Token usage metrics reported by provider.
|
|
88
|
+
*/
|
|
89
|
+
interface ProviderTokenUsage {
|
|
90
|
+
/** Input/prompt tokens consumed */
|
|
91
|
+
readonly input: number;
|
|
92
|
+
/** Output/completion tokens generated */
|
|
93
|
+
readonly output: number;
|
|
94
|
+
/** Cached tokens (optional, provider-specific) */
|
|
95
|
+
readonly cached?: number;
|
|
96
|
+
}
|
|
97
|
+
interface ProviderResponse {
|
|
98
|
+
readonly raw?: unknown;
|
|
99
|
+
readonly usage?: JsonObject;
|
|
100
|
+
/** Output messages from agent execution (primary source for tool trajectory) */
|
|
101
|
+
readonly output?: readonly Message[];
|
|
102
|
+
/** Token usage metrics (optional) */
|
|
103
|
+
readonly tokenUsage?: ProviderTokenUsage;
|
|
104
|
+
/** Total cost in USD (optional) */
|
|
105
|
+
readonly costUsd?: number;
|
|
106
|
+
/** Execution duration in milliseconds (optional) */
|
|
107
|
+
readonly durationMs?: number;
|
|
108
|
+
/** ISO 8601 timestamp when execution started (optional) */
|
|
109
|
+
readonly startTime?: string;
|
|
110
|
+
/** ISO 8601 timestamp when execution ended (optional) */
|
|
111
|
+
readonly endTime?: string;
|
|
112
|
+
}
|
|
113
|
+
interface Provider {
|
|
114
|
+
readonly id: string;
|
|
115
|
+
readonly kind: ProviderKind;
|
|
116
|
+
readonly targetName: string;
|
|
117
|
+
invoke(request: ProviderRequest): Promise<ProviderResponse>;
|
|
118
|
+
/**
|
|
119
|
+
* Optional capability marker for provider-managed batching (single session handling multiple requests).
|
|
120
|
+
*/
|
|
121
|
+
readonly supportsBatch?: boolean;
|
|
122
|
+
/**
|
|
123
|
+
* Optional batch invocation hook. When defined alongside supportsBatch=true,
|
|
124
|
+
* the orchestrator may send multiple requests in a single provider session.
|
|
125
|
+
*/
|
|
126
|
+
invokeBatch?(requests: readonly ProviderRequest[]): Promise<readonly ProviderResponse[]>;
|
|
127
|
+
/**
|
|
128
|
+
* Optional method to get a Vercel AI SDK LanguageModel instance for structured output generation.
|
|
129
|
+
* Used by evaluators that need generateObject/generateText from the AI SDK.
|
|
130
|
+
*/
|
|
131
|
+
asLanguageModel?(): ai.LanguageModel;
|
|
132
|
+
}
|
|
133
|
+
type EnvLookup = Readonly<Record<string, string | undefined>>;
|
|
134
|
+
interface TargetDefinition {
|
|
135
|
+
readonly name: string;
|
|
136
|
+
readonly provider: ProviderKind | string;
|
|
137
|
+
readonly judge_target?: string | undefined;
|
|
138
|
+
readonly workers?: number | undefined;
|
|
139
|
+
readonly provider_batching?: boolean | undefined;
|
|
140
|
+
readonly providerBatching?: boolean | undefined;
|
|
141
|
+
readonly endpoint?: string | unknown | undefined;
|
|
142
|
+
readonly resource?: string | unknown | undefined;
|
|
143
|
+
readonly resourceName?: string | unknown | undefined;
|
|
144
|
+
readonly api_key?: string | unknown | undefined;
|
|
145
|
+
readonly apiKey?: string | unknown | undefined;
|
|
146
|
+
readonly deployment?: string | unknown | undefined;
|
|
147
|
+
readonly deploymentName?: string | unknown | undefined;
|
|
148
|
+
readonly model?: string | unknown | undefined;
|
|
149
|
+
readonly version?: string | unknown | undefined;
|
|
150
|
+
readonly api_version?: string | unknown | undefined;
|
|
151
|
+
readonly variant?: string | unknown | undefined;
|
|
152
|
+
readonly thinking_budget?: number | unknown | undefined;
|
|
153
|
+
readonly thinkingBudget?: number | unknown | undefined;
|
|
154
|
+
readonly temperature?: number | unknown | undefined;
|
|
155
|
+
readonly max_output_tokens?: number | unknown | undefined;
|
|
156
|
+
readonly maxTokens?: number | unknown | undefined;
|
|
157
|
+
readonly executable?: string | unknown | undefined;
|
|
158
|
+
readonly command?: string | unknown | undefined;
|
|
159
|
+
readonly binary?: string | unknown | undefined;
|
|
160
|
+
readonly args?: unknown | undefined;
|
|
161
|
+
readonly arguments?: unknown | undefined;
|
|
162
|
+
readonly cwd?: string | unknown | undefined;
|
|
163
|
+
readonly timeout_seconds?: number | unknown | undefined;
|
|
164
|
+
readonly timeoutSeconds?: number | unknown | undefined;
|
|
165
|
+
readonly log_dir?: string | unknown | undefined;
|
|
166
|
+
readonly logDir?: string | unknown | undefined;
|
|
167
|
+
readonly log_directory?: string | unknown | undefined;
|
|
168
|
+
readonly logDirectory?: string | unknown | undefined;
|
|
169
|
+
readonly log_format?: string | unknown | undefined;
|
|
170
|
+
readonly logFormat?: string | unknown | undefined;
|
|
171
|
+
readonly log_output_format?: string | unknown | undefined;
|
|
172
|
+
readonly logOutputFormat?: string | unknown | undefined;
|
|
173
|
+
readonly system_prompt?: string | unknown | undefined;
|
|
174
|
+
readonly systemPrompt?: string | unknown | undefined;
|
|
175
|
+
readonly max_turns?: number | unknown | undefined;
|
|
176
|
+
readonly maxTurns?: number | unknown | undefined;
|
|
177
|
+
readonly max_budget_usd?: number | unknown | undefined;
|
|
178
|
+
readonly maxBudgetUsd?: number | unknown | undefined;
|
|
179
|
+
readonly response?: string | unknown | undefined;
|
|
180
|
+
readonly delayMs?: number | unknown | undefined;
|
|
181
|
+
readonly delayMinMs?: number | unknown | undefined;
|
|
182
|
+
readonly delayMaxMs?: number | unknown | undefined;
|
|
183
|
+
readonly wait?: boolean | unknown | undefined;
|
|
184
|
+
readonly dry_run?: boolean | unknown | undefined;
|
|
185
|
+
readonly dryRun?: boolean | unknown | undefined;
|
|
186
|
+
readonly subagent_root?: string | unknown | undefined;
|
|
187
|
+
readonly subagentRoot?: string | unknown | undefined;
|
|
188
|
+
readonly workspace_template?: string | unknown | undefined;
|
|
189
|
+
readonly workspaceTemplate?: string | unknown | undefined;
|
|
190
|
+
readonly command_template?: string | unknown | undefined;
|
|
191
|
+
readonly commandTemplate?: string | unknown | undefined;
|
|
192
|
+
readonly files_format?: string | unknown | undefined;
|
|
193
|
+
readonly filesFormat?: string | unknown | undefined;
|
|
194
|
+
readonly attachments_format?: string | unknown | undefined;
|
|
195
|
+
readonly attachmentsFormat?: string | unknown | undefined;
|
|
196
|
+
readonly env?: unknown | undefined;
|
|
197
|
+
readonly healthcheck?: unknown | undefined;
|
|
198
|
+
readonly cli_url?: string | unknown | undefined;
|
|
199
|
+
readonly cliUrl?: string | unknown | undefined;
|
|
200
|
+
readonly cli_path?: string | unknown | undefined;
|
|
201
|
+
readonly cliPath?: string | unknown | undefined;
|
|
202
|
+
readonly github_token?: string | unknown | undefined;
|
|
203
|
+
readonly githubToken?: string | unknown | undefined;
|
|
204
|
+
readonly max_retries?: number | unknown | undefined;
|
|
205
|
+
readonly maxRetries?: number | unknown | undefined;
|
|
206
|
+
readonly retry_initial_delay_ms?: number | unknown | undefined;
|
|
207
|
+
readonly retryInitialDelayMs?: number | unknown | undefined;
|
|
208
|
+
readonly retry_max_delay_ms?: number | unknown | undefined;
|
|
209
|
+
readonly retryMaxDelayMs?: number | unknown | undefined;
|
|
210
|
+
readonly retry_backoff_factor?: number | unknown | undefined;
|
|
211
|
+
readonly retryBackoffFactor?: number | unknown | undefined;
|
|
212
|
+
readonly retry_status_codes?: unknown | undefined;
|
|
213
|
+
readonly retryStatusCodes?: unknown | undefined;
|
|
214
|
+
}
|
|
3
215
|
|
|
4
216
|
/**
|
|
5
217
|
* Trace event types for capturing agent execution traces.
|
|
@@ -37,7 +249,21 @@ interface TraceSummary {
|
|
|
37
249
|
readonly durationMs?: number;
|
|
38
250
|
/** Per-tool duration arrays in milliseconds (optional) */
|
|
39
251
|
readonly toolDurations?: Readonly<Record<string, readonly number[]>>;
|
|
252
|
+
/** ISO 8601 timestamp when execution started (derived from earliest span) */
|
|
253
|
+
readonly startTime?: string;
|
|
254
|
+
/** ISO 8601 timestamp when execution ended (derived from latest span) */
|
|
255
|
+
readonly endTime?: string;
|
|
256
|
+
/** Number of LLM calls (assistant messages) */
|
|
257
|
+
readonly llmCallCount?: number;
|
|
40
258
|
}
|
|
259
|
+
/**
|
|
260
|
+
* Argument matching mode for tool_trajectory expected items.
|
|
261
|
+
* - 'exact': bidirectional deep equality, no extra keys allowed (default)
|
|
262
|
+
* - 'superset': actual args must contain all expected keys (extras OK)
|
|
263
|
+
* - 'subset': actual args must be a subset of expected keys (no unexpected keys)
|
|
264
|
+
* - 'ignore': skip argument checking entirely
|
|
265
|
+
*/
|
|
266
|
+
type ArgsMatchMode = 'exact' | 'ignore' | 'subset' | 'superset';
|
|
41
267
|
/**
|
|
42
268
|
* Configuration for tool_trajectory evaluator.
|
|
43
269
|
*/
|
|
@@ -45,13 +271,18 @@ interface ToolTrajectoryEvaluatorConfig {
|
|
|
45
271
|
readonly name: string;
|
|
46
272
|
readonly type: 'tool_trajectory';
|
|
47
273
|
/** Matching mode */
|
|
48
|
-
readonly mode: 'any_order' | 'in_order' | 'exact';
|
|
274
|
+
readonly mode: 'any_order' | 'in_order' | 'exact' | 'subset' | 'superset';
|
|
49
275
|
/** Minimum call counts per tool (for any_order mode) */
|
|
50
276
|
readonly minimums?: Readonly<Record<string, number>>;
|
|
51
|
-
/** Expected tool sequence (for in_order/exact modes) */
|
|
277
|
+
/** Expected tool sequence (for in_order/exact/subset/superset modes) */
|
|
52
278
|
readonly expected?: readonly ToolTrajectoryExpectedItem[];
|
|
53
279
|
/** Optional weight for top-level aggregation (defaults to 1.0) */
|
|
54
280
|
readonly weight?: number;
|
|
281
|
+
readonly required?: boolean | number;
|
|
282
|
+
/** When true, inverts the evaluator score (1 - score) and swaps pass/fail verdict */
|
|
283
|
+
readonly negate?: boolean;
|
|
284
|
+
/** Default argument matching mode for all expected items (defaults to 'exact') */
|
|
285
|
+
readonly argsMatch?: ArgsMatchMode | readonly string[];
|
|
55
286
|
}
|
|
56
287
|
/**
|
|
57
288
|
* Expected tool call item in a trajectory sequence.
|
|
@@ -62,21 +293,35 @@ interface ToolTrajectoryExpectedItem {
|
|
|
62
293
|
readonly args?: 'any' | Record<string, unknown>;
|
|
63
294
|
/** Optional maximum duration in milliseconds for latency assertions */
|
|
64
295
|
readonly maxDurationMs?: number;
|
|
296
|
+
/** Per-item argument matching mode override (takes precedence over evaluator-level argsMatch) */
|
|
297
|
+
readonly argsMatch?: ArgsMatchMode | readonly string[];
|
|
65
298
|
}
|
|
66
299
|
/**
|
|
67
300
|
* Simplified input type for computeTraceSummary.
|
|
68
|
-
* Matches
|
|
301
|
+
* Matches Message structure without requiring full provider/types import.
|
|
69
302
|
*/
|
|
70
|
-
interface
|
|
303
|
+
interface MessageLike {
|
|
304
|
+
readonly role?: string;
|
|
305
|
+
readonly startTime?: string;
|
|
306
|
+
readonly endTime?: string;
|
|
71
307
|
readonly toolCalls?: readonly {
|
|
72
308
|
readonly tool: string;
|
|
309
|
+
readonly startTime?: string;
|
|
310
|
+
readonly endTime?: string;
|
|
311
|
+
readonly durationMs?: number;
|
|
73
312
|
}[];
|
|
74
313
|
}
|
|
75
314
|
/**
|
|
76
315
|
* Compute a lightweight summary from output messages.
|
|
77
316
|
* Used for default result persistence without payload bloat.
|
|
317
|
+
*
|
|
318
|
+
* Derives timing information from span boundaries:
|
|
319
|
+
* - startTime: earliest startTime across all messages and tool calls
|
|
320
|
+
* - endTime: latest endTime across all messages and tool calls
|
|
321
|
+
* - toolDurations: per-tool duration arrays (from durationMs or computed from start/end)
|
|
322
|
+
* - llmCallCount: count of assistant messages
|
|
78
323
|
*/
|
|
79
|
-
declare function computeTraceSummary(messages: readonly
|
|
324
|
+
declare function computeTraceSummary(messages: readonly MessageLike[]): TraceSummary;
|
|
80
325
|
/**
|
|
81
326
|
* Default tool names considered as exploration/read-only operations.
|
|
82
327
|
* Can be overridden per-evaluation via config.
|
|
@@ -114,10 +359,15 @@ interface ExecutionMetrics {
|
|
|
114
359
|
readonly tokenUsage?: TokenUsage;
|
|
115
360
|
readonly costUsd?: number;
|
|
116
361
|
readonly durationMs?: number;
|
|
362
|
+
/** ISO 8601 timestamp when execution started */
|
|
363
|
+
readonly startTime?: string;
|
|
364
|
+
/** ISO 8601 timestamp when execution ended */
|
|
365
|
+
readonly endTime?: string;
|
|
117
366
|
}
|
|
118
367
|
/**
|
|
119
368
|
* Merge execution metrics from provider response into a trace summary.
|
|
120
369
|
* Returns a new TraceSummary with metrics fields populated.
|
|
370
|
+
* Provider-level timing takes precedence over span-derived timing.
|
|
121
371
|
*
|
|
122
372
|
* @param summary - Base trace summary from computeTraceSummary
|
|
123
373
|
* @param metrics - Optional execution metrics from provider
|
|
@@ -203,7 +453,7 @@ declare function isJsonValue(value: unknown): value is JsonValue;
|
|
|
203
453
|
* - Either content (string or array of objects) OR tool_calls (for assistant messages)
|
|
204
454
|
*/
|
|
205
455
|
declare function isTestMessage(value: unknown): value is TestMessage;
|
|
206
|
-
declare const EVALUATOR_KIND_VALUES: readonly ["code_judge", "llm_judge", "rubric", "composite", "tool_trajectory", "field_accuracy", "latency", "cost", "token_usage"];
|
|
456
|
+
declare const EVALUATOR_KIND_VALUES: readonly ["code_judge", "llm_judge", "rubric", "composite", "tool_trajectory", "field_accuracy", "latency", "cost", "token_usage", "execution_metrics", "agent_judge", "contains", "regex", "is_json", "equals", "rubrics"];
|
|
207
457
|
type EvaluatorKind = (typeof EVALUATOR_KIND_VALUES)[number];
|
|
208
458
|
declare function isEvaluatorKind(value: unknown): value is EvaluatorKind;
|
|
209
459
|
/**
|
|
@@ -215,6 +465,43 @@ type TargetAccessConfig = {
|
|
|
215
465
|
/** Maximum number of target invocations allowed per execution (default: 50) */
|
|
216
466
|
readonly max_calls?: number;
|
|
217
467
|
};
|
|
468
|
+
/**
|
|
469
|
+
* Configuration for workspace lifecycle scripts (before_all, after_all, before_each, after_each).
|
|
470
|
+
* Scripts are executed with workspace context passed via stdin.
|
|
471
|
+
*/
|
|
472
|
+
type WorkspaceScriptConfig = {
|
|
473
|
+
/** Command array to execute (e.g., ["bun", "run", "setup.ts"]) */
|
|
474
|
+
readonly script: readonly string[];
|
|
475
|
+
/** Optional timeout in milliseconds (default: 60000 for setup, 30000 for teardown) */
|
|
476
|
+
readonly timeout_ms?: number;
|
|
477
|
+
readonly timeoutMs?: number;
|
|
478
|
+
/** Optional working directory for script execution */
|
|
479
|
+
readonly cwd?: string;
|
|
480
|
+
};
|
|
481
|
+
/**
|
|
482
|
+
* Workspace configuration for eval tests.
|
|
483
|
+
* Can be specified at suite level and overridden per-case.
|
|
484
|
+
* Merge strategy: template/scripts replaced, env deep-merged.
|
|
485
|
+
*
|
|
486
|
+
* Lifecycle hooks follow bun:test/Vitest naming:
|
|
487
|
+
* - before_all: runs ONCE before first test, creates shared workspace
|
|
488
|
+
* - after_all: runs ONCE after last test, final cleanup
|
|
489
|
+
* - before_each: runs before each test (optional)
|
|
490
|
+
* - after_each: runs after each test (e.g., reset git state)
|
|
491
|
+
*/
|
|
492
|
+
type WorkspaceConfig = {
|
|
493
|
+
/** Template directory or .code-workspace file. Directories are copied to temp workspace.
|
|
494
|
+
* .code-workspace files are used by VS Code providers; CLI providers use the parent directory. */
|
|
495
|
+
readonly template?: string;
|
|
496
|
+
/** Script to run once before first test (after workspace creation, before git baseline) */
|
|
497
|
+
readonly before_all?: WorkspaceScriptConfig;
|
|
498
|
+
/** Script to run once after last test (before workspace cleanup) */
|
|
499
|
+
readonly after_all?: WorkspaceScriptConfig;
|
|
500
|
+
/** Script to run before each test */
|
|
501
|
+
readonly before_each?: WorkspaceScriptConfig;
|
|
502
|
+
/** Script to run after each test (e.g., git reset for workspace reuse) */
|
|
503
|
+
readonly after_each?: WorkspaceScriptConfig;
|
|
504
|
+
};
|
|
218
505
|
type CodeEvaluatorConfig = {
|
|
219
506
|
readonly name: string;
|
|
220
507
|
readonly type: 'code';
|
|
@@ -223,6 +510,9 @@ type CodeEvaluatorConfig = {
|
|
|
223
510
|
readonly cwd?: string;
|
|
224
511
|
readonly resolvedCwd?: string;
|
|
225
512
|
readonly weight?: number;
|
|
513
|
+
readonly required?: boolean | number;
|
|
514
|
+
/** When true, inverts the evaluator score (1 - score) and swaps pass/fail verdict */
|
|
515
|
+
readonly negate?: boolean;
|
|
226
516
|
/** Pass-through configuration for the code_judge script (any unrecognized YAML properties) */
|
|
227
517
|
readonly config?: JsonObject;
|
|
228
518
|
/** When present, enables target access for the script via local proxy */
|
|
@@ -250,32 +540,35 @@ type LlmJudgeEvaluatorConfig = {
|
|
|
250
540
|
readonly resolvedPromptScript?: readonly string[];
|
|
251
541
|
readonly rubrics?: readonly RubricItem[];
|
|
252
542
|
readonly weight?: number;
|
|
543
|
+
readonly required?: boolean | number;
|
|
544
|
+
/** When true, inverts the evaluator score (1 - score) and swaps pass/fail verdict */
|
|
545
|
+
readonly negate?: boolean;
|
|
253
546
|
/** Pass-through configuration for custom evaluator prompts (legacy, prefer prompt.config) */
|
|
254
547
|
readonly config?: Record<string, unknown>;
|
|
255
548
|
};
|
|
256
549
|
/**
|
|
257
550
|
* Score range definition for analytic rubric scoring.
|
|
258
|
-
* Each range maps an integer score band (0-10) to an
|
|
551
|
+
* Each range maps an integer score band (0-10) to an outcome description.
|
|
259
552
|
*/
|
|
260
553
|
type ScoreRange = {
|
|
261
554
|
/** Inclusive integer range [min, max] within 0-10 */
|
|
262
555
|
readonly score_range: readonly [number, number];
|
|
263
556
|
/** Description of what this score range represents */
|
|
264
|
-
readonly
|
|
557
|
+
readonly outcome: string;
|
|
265
558
|
};
|
|
266
559
|
/**
|
|
267
560
|
* Rubric item for LLM judge evaluation.
|
|
268
561
|
* Supports two modes:
|
|
269
|
-
* - Checklist mode: boolean satisfied/not-satisfied with `
|
|
562
|
+
* - Checklist mode: boolean satisfied/not-satisfied with `outcome`
|
|
270
563
|
* - Score-range mode: 0-10 integer scoring with `score_ranges`
|
|
271
564
|
*/
|
|
272
565
|
type RubricItem = {
|
|
273
566
|
readonly id: string;
|
|
274
567
|
/**
|
|
275
|
-
* For checklist rubrics: the
|
|
568
|
+
* For checklist rubrics: the outcome text (required).
|
|
276
569
|
* For score-range rubrics: optional overall criterion description.
|
|
277
570
|
*/
|
|
278
|
-
readonly
|
|
571
|
+
readonly outcome?: string;
|
|
279
572
|
readonly weight: number;
|
|
280
573
|
/**
|
|
281
574
|
* Legacy boolean gating (deprecated, treated as required_min_score: 10).
|
|
@@ -306,6 +599,9 @@ type CompositeAggregatorConfig = {
|
|
|
306
599
|
readonly prompt?: string;
|
|
307
600
|
readonly promptPath?: string;
|
|
308
601
|
readonly model?: string;
|
|
602
|
+
} | {
|
|
603
|
+
readonly type: 'threshold';
|
|
604
|
+
readonly threshold: number;
|
|
309
605
|
};
|
|
310
606
|
type CompositeEvaluatorConfig = {
|
|
311
607
|
readonly name: string;
|
|
@@ -313,6 +609,9 @@ type CompositeEvaluatorConfig = {
|
|
|
313
609
|
readonly evaluators: readonly EvaluatorConfig[];
|
|
314
610
|
readonly aggregator: CompositeAggregatorConfig;
|
|
315
611
|
readonly weight?: number;
|
|
612
|
+
readonly required?: boolean | number;
|
|
613
|
+
/** When true, inverts the evaluator score (1 - score) and swaps pass/fail verdict */
|
|
614
|
+
readonly negate?: boolean;
|
|
316
615
|
};
|
|
317
616
|
/**
|
|
318
617
|
* Match type for field accuracy evaluation.
|
|
@@ -354,6 +653,9 @@ type FieldAccuracyEvaluatorConfig = {
|
|
|
354
653
|
/** Strategy for combining field scores (default: weighted_average) */
|
|
355
654
|
readonly aggregation?: FieldAggregationType;
|
|
356
655
|
readonly weight?: number;
|
|
656
|
+
readonly required?: boolean | number;
|
|
657
|
+
/** When true, inverts the evaluator score (1 - score) and swaps pass/fail verdict */
|
|
658
|
+
readonly negate?: boolean;
|
|
357
659
|
};
|
|
358
660
|
/**
|
|
359
661
|
* Configuration for the latency evaluator.
|
|
@@ -365,6 +667,9 @@ type LatencyEvaluatorConfig = {
|
|
|
365
667
|
/** Maximum allowed duration in milliseconds */
|
|
366
668
|
readonly threshold: number;
|
|
367
669
|
readonly weight?: number;
|
|
670
|
+
readonly required?: boolean | number;
|
|
671
|
+
/** When true, inverts the evaluator score (1 - score) and swaps pass/fail verdict */
|
|
672
|
+
readonly negate?: boolean;
|
|
368
673
|
};
|
|
369
674
|
/**
|
|
370
675
|
* Configuration for the cost evaluator.
|
|
@@ -376,6 +681,9 @@ type CostEvaluatorConfig = {
|
|
|
376
681
|
/** Maximum allowed cost in USD */
|
|
377
682
|
readonly budget: number;
|
|
378
683
|
readonly weight?: number;
|
|
684
|
+
readonly required?: boolean | number;
|
|
685
|
+
/** When true, inverts the evaluator score (1 - score) and swaps pass/fail verdict */
|
|
686
|
+
readonly negate?: boolean;
|
|
379
687
|
};
|
|
380
688
|
/**
|
|
381
689
|
* Configuration for the token_usage evaluator.
|
|
@@ -391,48 +699,256 @@ type TokenUsageEvaluatorConfig = {
|
|
|
391
699
|
/** Maximum allowed output tokens (completion) */
|
|
392
700
|
readonly max_output?: number;
|
|
393
701
|
readonly weight?: number;
|
|
702
|
+
readonly required?: boolean | number;
|
|
703
|
+
/** When true, inverts the evaluator score (1 - score) and swaps pass/fail verdict */
|
|
704
|
+
readonly negate?: boolean;
|
|
705
|
+
};
|
|
706
|
+
/**
|
|
707
|
+
* Configuration for the execution_metrics evaluator.
|
|
708
|
+
* Provides declarative threshold-based checks on execution metrics.
|
|
709
|
+
* Only specified thresholds are checked; omitted ones are ignored.
|
|
710
|
+
*/
|
|
711
|
+
type ExecutionMetricsEvaluatorConfig = {
|
|
712
|
+
readonly name: string;
|
|
713
|
+
readonly type: 'execution_metrics';
|
|
714
|
+
/** Maximum allowed number of tool calls */
|
|
715
|
+
readonly max_tool_calls?: number;
|
|
716
|
+
/** Maximum allowed number of LLM calls (assistant messages) */
|
|
717
|
+
readonly max_llm_calls?: number;
|
|
718
|
+
/** Maximum allowed total tokens (input + output) */
|
|
719
|
+
readonly max_tokens?: number;
|
|
720
|
+
/** Maximum allowed cost in USD */
|
|
721
|
+
readonly max_cost_usd?: number;
|
|
722
|
+
/** Maximum allowed duration in milliseconds */
|
|
723
|
+
readonly max_duration_ms?: number;
|
|
724
|
+
/** Target exploration ratio (0-1, proportion of read-only tool calls) */
|
|
725
|
+
readonly target_exploration_ratio?: number;
|
|
726
|
+
/** Tolerance for exploration ratio check (default: 0.2) */
|
|
727
|
+
readonly exploration_tolerance?: number;
|
|
728
|
+
readonly weight?: number;
|
|
729
|
+
readonly required?: boolean | number;
|
|
730
|
+
/** When true, inverts the evaluator score (1 - score) and swaps pass/fail verdict */
|
|
731
|
+
readonly negate?: boolean;
|
|
732
|
+
};
|
|
733
|
+
/**
|
|
734
|
+
* Configuration for the agent_judge evaluator.
|
|
735
|
+
* Runs an agentic investigation loop to audit workspaces and verify criteria.
|
|
736
|
+
* Two modes:
|
|
737
|
+
* - Built-in: Uses AI SDK generateText() with sandboxed filesystem tools
|
|
738
|
+
* - Judge target: Delegates to an external agent provider via Provider.invoke()
|
|
739
|
+
*/
|
|
740
|
+
type AgentJudgeEvaluatorConfig = {
|
|
741
|
+
readonly name: string;
|
|
742
|
+
readonly type: 'agent_judge';
|
|
743
|
+
/** Custom evaluation prompt (inline text or file path) */
|
|
744
|
+
readonly prompt?: string;
|
|
745
|
+
readonly promptPath?: string;
|
|
746
|
+
/** Resolved absolute path for prompt file */
|
|
747
|
+
readonly resolvedPromptPath?: string;
|
|
748
|
+
/** Rubric items for structured evaluation (reuses llm_judge rubric infra) */
|
|
749
|
+
readonly rubrics?: readonly RubricItem[];
|
|
750
|
+
/** Maximum agent steps for built-in mode (default 10, max 50) */
|
|
751
|
+
readonly max_steps?: number;
|
|
752
|
+
/** Temperature for built-in mode (default 0) */
|
|
753
|
+
readonly temperature?: number;
|
|
754
|
+
/** Target name — delegates agent loop to this provider instead of built-in mode */
|
|
755
|
+
readonly target?: string;
|
|
756
|
+
readonly weight?: number;
|
|
757
|
+
readonly required?: boolean | number;
|
|
758
|
+
/** When true, inverts the evaluator score (1 - score) and swaps pass/fail verdict */
|
|
759
|
+
readonly negate?: boolean;
|
|
760
|
+
};
|
|
761
|
+
/**
|
|
762
|
+
* Configuration for the contains assertion evaluator.
|
|
763
|
+
* Checks whether the candidate output contains a specified substring.
|
|
764
|
+
*/
|
|
765
|
+
type ContainsEvaluatorConfig = {
|
|
766
|
+
readonly name: string;
|
|
767
|
+
readonly type: 'contains';
|
|
768
|
+
readonly value: string;
|
|
769
|
+
readonly weight?: number;
|
|
770
|
+
readonly required?: boolean | number;
|
|
771
|
+
/** When true, inverts the evaluator score (1 - score) and swaps pass/fail verdict */
|
|
772
|
+
readonly negate?: boolean;
|
|
773
|
+
};
|
|
774
|
+
/**
|
|
775
|
+
* Configuration for the regex assertion evaluator.
|
|
776
|
+
* Checks whether the candidate output matches a regular expression pattern.
|
|
777
|
+
*/
|
|
778
|
+
type RegexEvaluatorConfig = {
|
|
779
|
+
readonly name: string;
|
|
780
|
+
readonly type: 'regex';
|
|
781
|
+
readonly value: string;
|
|
782
|
+
readonly weight?: number;
|
|
783
|
+
readonly required?: boolean | number;
|
|
784
|
+
/** When true, inverts the evaluator score (1 - score) and swaps pass/fail verdict */
|
|
785
|
+
readonly negate?: boolean;
|
|
786
|
+
};
|
|
787
|
+
/**
|
|
788
|
+
* Configuration for the is_json assertion evaluator.
|
|
789
|
+
* Checks whether the candidate output is valid JSON.
|
|
790
|
+
*/
|
|
791
|
+
type IsJsonEvaluatorConfig = {
|
|
792
|
+
readonly name: string;
|
|
793
|
+
readonly type: 'is_json';
|
|
794
|
+
readonly weight?: number;
|
|
795
|
+
readonly required?: boolean | number;
|
|
796
|
+
/** When true, inverts the evaluator score (1 - score) and swaps pass/fail verdict */
|
|
797
|
+
readonly negate?: boolean;
|
|
798
|
+
};
|
|
799
|
+
/**
|
|
800
|
+
* Configuration for the equals assertion evaluator.
|
|
801
|
+
* Checks whether the candidate output exactly equals a specified string.
|
|
802
|
+
*/
|
|
803
|
+
type EqualsEvaluatorConfig = {
|
|
804
|
+
readonly name: string;
|
|
805
|
+
readonly type: 'equals';
|
|
806
|
+
readonly value: string;
|
|
807
|
+
readonly weight?: number;
|
|
808
|
+
readonly required?: boolean | number;
|
|
809
|
+
/** When true, inverts the evaluator score (1 - score) and swaps pass/fail verdict */
|
|
810
|
+
readonly negate?: boolean;
|
|
811
|
+
};
|
|
812
|
+
/**
|
|
813
|
+
* Configuration for the rubrics evaluator.
|
|
814
|
+
* Evaluates candidate output against a list of rubric criteria.
|
|
815
|
+
*/
|
|
816
|
+
type RubricsEvaluatorConfig = {
|
|
817
|
+
readonly name: string;
|
|
818
|
+
readonly type: 'rubrics';
|
|
819
|
+
readonly criteria: readonly RubricItem[];
|
|
820
|
+
readonly weight?: number;
|
|
821
|
+
readonly required?: boolean | number;
|
|
822
|
+
/** When true, inverts the evaluator score (1 - score) and swaps pass/fail verdict */
|
|
823
|
+
readonly negate?: boolean;
|
|
394
824
|
};
|
|
395
|
-
type EvaluatorConfig = CodeEvaluatorConfig | LlmJudgeEvaluatorConfig | CompositeEvaluatorConfig | ToolTrajectoryEvaluatorConfig | FieldAccuracyEvaluatorConfig | LatencyEvaluatorConfig | CostEvaluatorConfig | TokenUsageEvaluatorConfig;
|
|
825
|
+
type EvaluatorConfig = CodeEvaluatorConfig | LlmJudgeEvaluatorConfig | CompositeEvaluatorConfig | ToolTrajectoryEvaluatorConfig | FieldAccuracyEvaluatorConfig | LatencyEvaluatorConfig | CostEvaluatorConfig | TokenUsageEvaluatorConfig | ExecutionMetricsEvaluatorConfig | AgentJudgeEvaluatorConfig | ContainsEvaluatorConfig | RegexEvaluatorConfig | IsJsonEvaluatorConfig | EqualsEvaluatorConfig | RubricsEvaluatorConfig;
|
|
396
826
|
/**
|
|
397
|
-
* Eval
|
|
827
|
+
* Eval test definition sourced from AgentV specs.
|
|
398
828
|
*/
|
|
399
|
-
interface
|
|
829
|
+
interface EvalTest {
|
|
400
830
|
readonly id: string;
|
|
401
831
|
readonly dataset?: string;
|
|
402
832
|
readonly conversation_id?: string;
|
|
403
833
|
readonly question: string;
|
|
404
|
-
readonly
|
|
834
|
+
readonly input: readonly TestMessage[];
|
|
405
835
|
readonly input_segments: readonly JsonObject[];
|
|
406
|
-
readonly
|
|
836
|
+
readonly expected_output: readonly JsonObject[];
|
|
407
837
|
readonly reference_answer?: string;
|
|
408
838
|
readonly guideline_paths: readonly string[];
|
|
409
839
|
readonly guideline_patterns?: readonly string[];
|
|
410
840
|
readonly file_paths: readonly string[];
|
|
411
|
-
readonly
|
|
841
|
+
readonly criteria: string;
|
|
412
842
|
readonly evaluator?: EvaluatorKind;
|
|
413
843
|
readonly evaluators?: readonly EvaluatorConfig[];
|
|
844
|
+
/** Workspace configuration (merged from suite-level and case-level) */
|
|
845
|
+
readonly workspace?: WorkspaceConfig;
|
|
846
|
+
/** Arbitrary metadata passed to workspace scripts via stdin */
|
|
847
|
+
readonly metadata?: Record<string, unknown>;
|
|
848
|
+
/** Per-test target override (matrix evaluation) */
|
|
849
|
+
readonly targets?: readonly string[];
|
|
850
|
+
}
|
|
851
|
+
/** @deprecated Use `EvalTest` instead */
|
|
852
|
+
type EvalCase = EvalTest;
|
|
853
|
+
/**
|
|
854
|
+
* Supported trial aggregation strategies.
|
|
855
|
+
*/
|
|
856
|
+
type TrialStrategy = 'pass_at_k' | 'mean' | 'confidence_interval';
|
|
857
|
+
/**
|
|
858
|
+
* Configuration for running multiple trials per eval case.
|
|
859
|
+
*/
|
|
860
|
+
interface TrialsConfig {
|
|
861
|
+
readonly count: number;
|
|
862
|
+
readonly strategy: TrialStrategy;
|
|
863
|
+
readonly costLimitUsd?: number;
|
|
864
|
+
}
|
|
865
|
+
/**
|
|
866
|
+
* Result of a single trial attempt.
|
|
867
|
+
*/
|
|
868
|
+
interface TrialResult {
|
|
869
|
+
readonly attempt: number;
|
|
870
|
+
readonly score: number;
|
|
871
|
+
readonly verdict: EvaluationVerdict;
|
|
872
|
+
readonly scores?: readonly EvaluatorResult[];
|
|
873
|
+
readonly error?: string;
|
|
874
|
+
readonly costUsd?: number;
|
|
875
|
+
}
|
|
876
|
+
/**
|
|
877
|
+
* Aggregation metadata for pass_at_k strategy.
|
|
878
|
+
*/
|
|
879
|
+
interface PassAtKAggregation {
|
|
880
|
+
readonly strategy: 'pass_at_k';
|
|
881
|
+
readonly passedAttempts: number;
|
|
882
|
+
readonly totalAttempts: number;
|
|
414
883
|
}
|
|
884
|
+
/**
|
|
885
|
+
* Aggregation metadata for mean strategy.
|
|
886
|
+
*/
|
|
887
|
+
interface MeanAggregation {
|
|
888
|
+
readonly strategy: 'mean';
|
|
889
|
+
readonly mean: number;
|
|
890
|
+
readonly min: number;
|
|
891
|
+
readonly max: number;
|
|
892
|
+
}
|
|
893
|
+
/**
|
|
894
|
+
* Aggregation metadata for confidence_interval strategy.
|
|
895
|
+
*/
|
|
896
|
+
interface ConfidenceIntervalAggregation {
|
|
897
|
+
readonly strategy: 'confidence_interval';
|
|
898
|
+
readonly mean: number;
|
|
899
|
+
readonly ci95Lower: number;
|
|
900
|
+
readonly ci95Upper: number;
|
|
901
|
+
readonly stddev: number;
|
|
902
|
+
}
|
|
903
|
+
/**
|
|
904
|
+
* Discriminated union of trial aggregation results.
|
|
905
|
+
*/
|
|
906
|
+
type TrialAggregation = PassAtKAggregation | MeanAggregation | ConfidenceIntervalAggregation;
|
|
415
907
|
/**
|
|
416
908
|
* Evaluator scorecard for a single eval case run.
|
|
417
909
|
*/
|
|
418
910
|
interface EvaluationResult {
|
|
419
911
|
readonly timestamp: string;
|
|
420
|
-
readonly
|
|
912
|
+
readonly testId: string;
|
|
421
913
|
readonly dataset?: string;
|
|
422
914
|
readonly conversationId?: string;
|
|
423
915
|
readonly score: number;
|
|
424
916
|
readonly hits: readonly string[];
|
|
425
917
|
readonly misses: readonly string[];
|
|
426
|
-
readonly
|
|
918
|
+
readonly answer: string;
|
|
427
919
|
readonly target: string;
|
|
428
920
|
readonly reasoning?: string;
|
|
429
|
-
readonly
|
|
430
|
-
|
|
431
|
-
|
|
432
|
-
|
|
921
|
+
readonly requests?: {
|
|
922
|
+
readonly agent?: JsonObject;
|
|
923
|
+
readonly lm?: JsonObject;
|
|
924
|
+
readonly evaluator?: JsonObject;
|
|
925
|
+
};
|
|
926
|
+
readonly scores?: readonly EvaluatorResult[];
|
|
433
927
|
readonly error?: string;
|
|
434
928
|
/** Lightweight summary of the execution trace (always included when available) */
|
|
435
|
-
readonly
|
|
929
|
+
readonly trace?: TraceSummary;
|
|
930
|
+
/** Path to the temporary workspace directory (included on failure for debugging) */
|
|
931
|
+
readonly workspacePath?: string;
|
|
932
|
+
/** Input messages or prompt string sent to the agent */
|
|
933
|
+
readonly input?: readonly Message[] | string;
|
|
934
|
+
/** Full output messages from agent execution (only included when --trace flag is set) */
|
|
935
|
+
readonly output?: readonly Message[];
|
|
936
|
+
/** Captured output from workspace before_all script */
|
|
937
|
+
readonly beforeAllOutput?: string;
|
|
938
|
+
/** Captured output from workspace before_each script */
|
|
939
|
+
readonly beforeEachOutput?: string;
|
|
940
|
+
/** Captured output from workspace after_all script */
|
|
941
|
+
readonly afterAllOutput?: string;
|
|
942
|
+
/** Captured output from workspace after_each script */
|
|
943
|
+
readonly afterEachOutput?: string;
|
|
944
|
+
/** Unified diff of workspace file changes (when workspace_template is configured) */
|
|
945
|
+
readonly fileChanges?: string;
|
|
946
|
+
/** Individual trial results (only present when trials.count > 1) */
|
|
947
|
+
readonly trials?: readonly TrialResult[];
|
|
948
|
+
/** Aggregation metadata describing how the final score was computed from trials */
|
|
949
|
+
readonly aggregation?: TrialAggregation;
|
|
950
|
+
/** Whether the trial loop was terminated early due to cost limit */
|
|
951
|
+
readonly costLimited?: boolean;
|
|
436
952
|
}
|
|
437
953
|
type EvaluationVerdict = 'pass' | 'fail' | 'borderline';
|
|
438
954
|
interface EvaluatorResult {
|
|
@@ -446,7 +962,7 @@ interface EvaluatorResult {
|
|
|
446
962
|
readonly reasoning?: string;
|
|
447
963
|
readonly rawRequest?: JsonObject;
|
|
448
964
|
readonly evaluatorProviderRequest?: JsonObject;
|
|
449
|
-
readonly
|
|
965
|
+
readonly scores?: readonly EvaluatorResult[];
|
|
450
966
|
/** Optional structured details from code judges (e.g., TP/TN/FP/FN counts). */
|
|
451
967
|
readonly details?: JsonObject;
|
|
452
968
|
}
|
|
@@ -455,182 +971,88 @@ interface EvaluatorResult {
|
|
|
455
971
|
*/
|
|
456
972
|
declare function getHitCount(result: Pick<EvaluationResult, 'hits'>): number;
|
|
457
973
|
|
|
458
|
-
|
|
459
|
-
|
|
460
|
-
|
|
461
|
-
|
|
462
|
-
|
|
463
|
-
|
|
464
|
-
|
|
465
|
-
|
|
466
|
-
|
|
467
|
-
|
|
468
|
-
|
|
469
|
-
|
|
974
|
+
declare const MetadataSchema: z.ZodObject<{
|
|
975
|
+
name: z.ZodString;
|
|
976
|
+
description: z.ZodOptional<z.ZodString>;
|
|
977
|
+
version: z.ZodOptional<z.ZodString>;
|
|
978
|
+
author: z.ZodOptional<z.ZodString>;
|
|
979
|
+
tags: z.ZodOptional<z.ZodArray<z.ZodString, "many">>;
|
|
980
|
+
license: z.ZodOptional<z.ZodString>;
|
|
981
|
+
requires: z.ZodOptional<z.ZodObject<{
|
|
982
|
+
agentv: z.ZodOptional<z.ZodString>;
|
|
983
|
+
}, "strip", z.ZodTypeAny, {
|
|
984
|
+
agentv?: string | undefined;
|
|
985
|
+
}, {
|
|
986
|
+
agentv?: string | undefined;
|
|
987
|
+
}>>;
|
|
988
|
+
}, "strip", z.ZodTypeAny, {
|
|
989
|
+
name: string;
|
|
990
|
+
description?: string | undefined;
|
|
991
|
+
version?: string | undefined;
|
|
992
|
+
author?: string | undefined;
|
|
993
|
+
tags?: string[] | undefined;
|
|
994
|
+
license?: string | undefined;
|
|
995
|
+
requires?: {
|
|
996
|
+
agentv?: string | undefined;
|
|
997
|
+
} | undefined;
|
|
998
|
+
}, {
|
|
999
|
+
name: string;
|
|
1000
|
+
description?: string | undefined;
|
|
1001
|
+
version?: string | undefined;
|
|
1002
|
+
author?: string | undefined;
|
|
1003
|
+
tags?: string[] | undefined;
|
|
1004
|
+
license?: string | undefined;
|
|
1005
|
+
requires?: {
|
|
1006
|
+
agentv?: string | undefined;
|
|
1007
|
+
} | undefined;
|
|
1008
|
+
}>;
|
|
1009
|
+
type EvalMetadata = z.infer<typeof MetadataSchema>;
|
|
1010
|
+
|
|
1011
|
+
declare const DEFAULT_EVAL_PATTERNS: readonly string[];
|
|
1012
|
+
type AgentVConfig$1 = {
|
|
470
1013
|
readonly guideline_patterns?: readonly string[];
|
|
471
|
-
readonly
|
|
472
|
-
|
|
473
|
-
readonly evalCaseId?: string;
|
|
474
|
-
readonly attempt?: number;
|
|
475
|
-
readonly maxOutputTokens?: number;
|
|
476
|
-
readonly temperature?: number;
|
|
477
|
-
readonly metadata?: JsonObject;
|
|
478
|
-
readonly signal?: AbortSignal;
|
|
479
|
-
}
|
|
1014
|
+
readonly eval_patterns?: readonly string[];
|
|
1015
|
+
};
|
|
480
1016
|
/**
|
|
481
|
-
*
|
|
482
|
-
*
|
|
1017
|
+
* Load optional .agentv/config.yaml configuration file.
|
|
1018
|
+
* Searches from eval file directory up to repo root.
|
|
483
1019
|
*/
|
|
484
|
-
|
|
485
|
-
/** Tool name */
|
|
486
|
-
readonly tool: string;
|
|
487
|
-
/** Tool input arguments */
|
|
488
|
-
readonly input?: unknown;
|
|
489
|
-
/** Tool output result */
|
|
490
|
-
readonly output?: unknown;
|
|
491
|
-
/** Stable identifier for pairing tool calls */
|
|
492
|
-
readonly id?: string;
|
|
493
|
-
/** ISO 8601 timestamp */
|
|
494
|
-
readonly timestamp?: string;
|
|
495
|
-
/** Duration of the tool call in milliseconds */
|
|
496
|
-
readonly durationMs?: number;
|
|
497
|
-
}
|
|
1020
|
+
declare function loadConfig(evalFilePath: string, repoRoot: string): Promise<AgentVConfig$1 | null>;
|
|
498
1021
|
/**
|
|
499
|
-
*
|
|
500
|
-
* Represents a single message in the conversation with optional tool calls.
|
|
1022
|
+
* Determine whether a path references guideline content (instructions or prompts).
|
|
501
1023
|
*/
|
|
502
|
-
|
|
503
|
-
/** Message role (e.g., 'assistant', 'user', 'tool') */
|
|
504
|
-
readonly role: string;
|
|
505
|
-
/** Optional name for the message sender */
|
|
506
|
-
readonly name?: string;
|
|
507
|
-
/** Message content */
|
|
508
|
-
readonly content?: unknown;
|
|
509
|
-
/** Tool calls made in this message */
|
|
510
|
-
readonly toolCalls?: readonly ToolCall[];
|
|
511
|
-
/** ISO 8601 timestamp */
|
|
512
|
-
readonly timestamp?: string;
|
|
513
|
-
/** Duration of the message in milliseconds */
|
|
514
|
-
readonly durationMs?: number;
|
|
515
|
-
/** Provider-specific metadata */
|
|
516
|
-
readonly metadata?: Record<string, unknown>;
|
|
517
|
-
}
|
|
1024
|
+
declare function isGuidelineFile(filePath: string, patterns?: readonly string[]): boolean;
|
|
518
1025
|
/**
|
|
519
|
-
*
|
|
1026
|
+
* Extract target name from parsed eval suite (checks execution.target then falls back to root-level target).
|
|
520
1027
|
*/
|
|
521
|
-
|
|
522
|
-
|
|
523
|
-
|
|
524
|
-
|
|
525
|
-
|
|
526
|
-
|
|
527
|
-
|
|
528
|
-
|
|
529
|
-
|
|
530
|
-
|
|
531
|
-
|
|
532
|
-
|
|
533
|
-
|
|
534
|
-
|
|
535
|
-
|
|
536
|
-
|
|
537
|
-
|
|
538
|
-
|
|
539
|
-
|
|
540
|
-
|
|
541
|
-
|
|
542
|
-
readonly
|
|
543
|
-
readonly kind: ProviderKind;
|
|
544
|
-
readonly targetName: string;
|
|
545
|
-
invoke(request: ProviderRequest): Promise<ProviderResponse>;
|
|
546
|
-
/**
|
|
547
|
-
* Optional capability marker for provider-managed batching (single session handling multiple requests).
|
|
548
|
-
*/
|
|
549
|
-
readonly supportsBatch?: boolean;
|
|
550
|
-
/**
|
|
551
|
-
* Optional batch invocation hook. When defined alongside supportsBatch=true,
|
|
552
|
-
* the orchestrator may send multiple requests in a single provider session.
|
|
553
|
-
*/
|
|
554
|
-
invokeBatch?(requests: readonly ProviderRequest[]): Promise<readonly ProviderResponse[]>;
|
|
555
|
-
/**
|
|
556
|
-
* Optional method to get a Vercel AI SDK LanguageModel instance for structured output generation.
|
|
557
|
-
* Used by evaluators that need generateObject/generateText from the AI SDK.
|
|
558
|
-
*/
|
|
559
|
-
asLanguageModel?(): ai.LanguageModel;
|
|
560
|
-
}
|
|
561
|
-
type EnvLookup = Readonly<Record<string, string | undefined>>;
|
|
562
|
-
interface TargetDefinition {
|
|
563
|
-
readonly name: string;
|
|
564
|
-
readonly provider: ProviderKind | string;
|
|
565
|
-
readonly judge_target?: string | undefined;
|
|
566
|
-
readonly workers?: number | undefined;
|
|
567
|
-
readonly provider_batching?: boolean | undefined;
|
|
568
|
-
readonly providerBatching?: boolean | undefined;
|
|
569
|
-
readonly endpoint?: string | unknown | undefined;
|
|
570
|
-
readonly resource?: string | unknown | undefined;
|
|
571
|
-
readonly resourceName?: string | unknown | undefined;
|
|
572
|
-
readonly api_key?: string | unknown | undefined;
|
|
573
|
-
readonly apiKey?: string | unknown | undefined;
|
|
574
|
-
readonly deployment?: string | unknown | undefined;
|
|
575
|
-
readonly deploymentName?: string | unknown | undefined;
|
|
576
|
-
readonly model?: string | unknown | undefined;
|
|
577
|
-
readonly version?: string | unknown | undefined;
|
|
578
|
-
readonly api_version?: string | unknown | undefined;
|
|
579
|
-
readonly variant?: string | unknown | undefined;
|
|
580
|
-
readonly thinking_budget?: number | unknown | undefined;
|
|
581
|
-
readonly thinkingBudget?: number | unknown | undefined;
|
|
582
|
-
readonly temperature?: number | unknown | undefined;
|
|
583
|
-
readonly max_output_tokens?: number | unknown | undefined;
|
|
584
|
-
readonly maxTokens?: number | unknown | undefined;
|
|
585
|
-
readonly executable?: string | unknown | undefined;
|
|
586
|
-
readonly command?: string | unknown | undefined;
|
|
587
|
-
readonly binary?: string | unknown | undefined;
|
|
588
|
-
readonly args?: unknown | undefined;
|
|
589
|
-
readonly arguments?: unknown | undefined;
|
|
590
|
-
readonly cwd?: string | unknown | undefined;
|
|
591
|
-
readonly timeout_seconds?: number | unknown | undefined;
|
|
592
|
-
readonly timeoutSeconds?: number | unknown | undefined;
|
|
593
|
-
readonly log_dir?: string | unknown | undefined;
|
|
594
|
-
readonly logDir?: string | unknown | undefined;
|
|
595
|
-
readonly log_directory?: string | unknown | undefined;
|
|
596
|
-
readonly logDirectory?: string | unknown | undefined;
|
|
597
|
-
readonly log_format?: string | unknown | undefined;
|
|
598
|
-
readonly logFormat?: string | unknown | undefined;
|
|
599
|
-
readonly log_output_format?: string | unknown | undefined;
|
|
600
|
-
readonly logOutputFormat?: string | unknown | undefined;
|
|
601
|
-
readonly system_prompt?: string | unknown | undefined;
|
|
602
|
-
readonly systemPrompt?: string | unknown | undefined;
|
|
603
|
-
readonly response?: string | unknown | undefined;
|
|
604
|
-
readonly delayMs?: number | unknown | undefined;
|
|
605
|
-
readonly delayMinMs?: number | unknown | undefined;
|
|
606
|
-
readonly delayMaxMs?: number | unknown | undefined;
|
|
607
|
-
readonly vscode_cmd?: string | unknown | undefined;
|
|
608
|
-
readonly wait?: boolean | unknown | undefined;
|
|
609
|
-
readonly dry_run?: boolean | unknown | undefined;
|
|
610
|
-
readonly dryRun?: boolean | unknown | undefined;
|
|
611
|
-
readonly subagent_root?: string | unknown | undefined;
|
|
612
|
-
readonly subagentRoot?: string | unknown | undefined;
|
|
613
|
-
readonly workspace_template?: string | unknown | undefined;
|
|
614
|
-
readonly workspaceTemplate?: string | unknown | undefined;
|
|
615
|
-
readonly command_template?: string | unknown | undefined;
|
|
616
|
-
readonly commandTemplate?: string | unknown | undefined;
|
|
617
|
-
readonly files_format?: string | unknown | undefined;
|
|
618
|
-
readonly filesFormat?: string | unknown | undefined;
|
|
619
|
-
readonly attachments_format?: string | unknown | undefined;
|
|
620
|
-
readonly attachmentsFormat?: string | unknown | undefined;
|
|
621
|
-
readonly env?: unknown | undefined;
|
|
622
|
-
readonly healthcheck?: unknown | undefined;
|
|
623
|
-
readonly max_retries?: number | unknown | undefined;
|
|
624
|
-
readonly maxRetries?: number | unknown | undefined;
|
|
625
|
-
readonly retry_initial_delay_ms?: number | unknown | undefined;
|
|
626
|
-
readonly retryInitialDelayMs?: number | unknown | undefined;
|
|
627
|
-
readonly retry_max_delay_ms?: number | unknown | undefined;
|
|
628
|
-
readonly retryMaxDelayMs?: number | unknown | undefined;
|
|
629
|
-
readonly retry_backoff_factor?: number | unknown | undefined;
|
|
630
|
-
readonly retryBackoffFactor?: number | unknown | undefined;
|
|
631
|
-
readonly retry_status_codes?: unknown | undefined;
|
|
632
|
-
readonly retryStatusCodes?: unknown | undefined;
|
|
1028
|
+
declare function extractTargetFromSuite(suite: JsonObject): string | undefined;
|
|
1029
|
+
/**
|
|
1030
|
+
* Extract targets array from parsed eval suite.
|
|
1031
|
+
* Precedence: execution.targets (array) > execution.target (singular).
|
|
1032
|
+
* Returns undefined when no targets array is specified.
|
|
1033
|
+
*/
|
|
1034
|
+
declare function extractTargetsFromSuite(suite: JsonObject): readonly string[] | undefined;
|
|
1035
|
+
/**
|
|
1036
|
+
* Extract per-test targets array from a raw test case object.
|
|
1037
|
+
*/
|
|
1038
|
+
declare function extractTargetsFromTestCase(testCase: JsonObject): readonly string[] | undefined;
|
|
1039
|
+
/**
|
|
1040
|
+
* Extract trials configuration from parsed eval suite's execution block.
|
|
1041
|
+
* Returns undefined when count is 1 or not specified (no-op).
|
|
1042
|
+
*/
|
|
1043
|
+
declare function extractTrialsConfig(suite: JsonObject): TrialsConfig | undefined;
|
|
1044
|
+
/**
|
|
1045
|
+
* Cache configuration parsed from execution block.
|
|
1046
|
+
*/
|
|
1047
|
+
interface CacheConfig {
|
|
1048
|
+
readonly enabled: boolean;
|
|
1049
|
+
readonly cachePath?: string;
|
|
633
1050
|
}
|
|
1051
|
+
/**
|
|
1052
|
+
* Extract cache configuration from parsed eval suite's execution block.
|
|
1053
|
+
* Returns undefined when no cache config is specified.
|
|
1054
|
+
*/
|
|
1055
|
+
declare function extractCacheConfig(suite: JsonObject): CacheConfig | undefined;
|
|
634
1056
|
|
|
635
1057
|
/**
|
|
636
1058
|
* Formatting mode for segment content.
|
|
@@ -654,12 +1076,7 @@ interface PromptInputs {
|
|
|
654
1076
|
* @param testCase - The evaluation test case
|
|
655
1077
|
* @param mode - Formatting mode: 'agent' for file references, 'lm' for embedded content (default: 'lm')
|
|
656
1078
|
*/
|
|
657
|
-
declare function buildPromptInputs(testCase:
|
|
658
|
-
|
|
659
|
-
/**
|
|
660
|
-
* Determine whether a path references guideline content (instructions or prompts).
|
|
661
|
-
*/
|
|
662
|
-
declare function isGuidelineFile(filePath: string, patterns?: readonly string[]): boolean;
|
|
1079
|
+
declare function buildPromptInputs(testCase: EvalTest, mode?: FormattingMode): Promise<PromptInputs>;
|
|
663
1080
|
|
|
664
1081
|
/**
|
|
665
1082
|
* Detect file format by extension.
|
|
@@ -668,21 +1085,49 @@ declare function detectFormat(filePath: string): 'yaml' | 'jsonl';
|
|
|
668
1085
|
|
|
669
1086
|
type LoadOptions = {
|
|
670
1087
|
readonly verbose?: boolean;
|
|
671
|
-
/** Filter
|
|
1088
|
+
/** Filter tests by ID pattern (glob supported, e.g., "summary-*") */
|
|
672
1089
|
readonly filter?: string;
|
|
673
1090
|
};
|
|
674
1091
|
/**
|
|
675
1092
|
* Read metadata from a test suite file (like target name).
|
|
676
|
-
* This is a convenience function for CLI tools that need metadata without loading all
|
|
1093
|
+
* This is a convenience function for CLI tools that need metadata without loading all tests.
|
|
677
1094
|
*/
|
|
678
1095
|
declare function readTestSuiteMetadata(testFilePath: string): Promise<{
|
|
679
1096
|
target?: string;
|
|
1097
|
+
targets?: readonly string[];
|
|
1098
|
+
trials?: TrialsConfig;
|
|
680
1099
|
}>;
|
|
681
1100
|
/**
|
|
682
|
-
* Load
|
|
1101
|
+
* Load tests from an AgentV specification file (YAML or JSONL).
|
|
683
1102
|
* Format is detected by file extension: .yaml/.yml for YAML, .jsonl for JSONL.
|
|
684
1103
|
*/
|
|
685
|
-
|
|
1104
|
+
type EvalSuiteResult = {
|
|
1105
|
+
readonly tests: readonly EvalTest[];
|
|
1106
|
+
readonly trials?: TrialsConfig;
|
|
1107
|
+
/** Suite-level targets from execution.targets (matrix evaluation) */
|
|
1108
|
+
readonly targets?: readonly string[];
|
|
1109
|
+
/** Suite-level cache config from execution.cache */
|
|
1110
|
+
readonly cacheConfig?: CacheConfig;
|
|
1111
|
+
/** Suite-level metadata (name, description, version, etc.) */
|
|
1112
|
+
readonly metadata?: EvalMetadata;
|
|
1113
|
+
};
|
|
1114
|
+
/**
|
|
1115
|
+
* Load tests and suite metadata from a single parse.
|
|
1116
|
+
* Prefer this over calling loadTests + readTestSuiteMetadata separately.
|
|
1117
|
+
*/
|
|
1118
|
+
declare function loadTestSuite(evalFilePath: string, repoRoot: URL | string, options?: LoadOptions): Promise<EvalSuiteResult>;
|
|
1119
|
+
/** @deprecated Use `loadTestSuite` instead */
|
|
1120
|
+
declare const loadEvalSuite: typeof loadTestSuite;
|
|
1121
|
+
declare function loadTests(evalFilePath: string, repoRoot: URL | string, options?: LoadOptions): Promise<readonly EvalTest[]>;
|
|
1122
|
+
/** @deprecated Use `loadTests` instead */
|
|
1123
|
+
declare const loadEvalCases: typeof loadTests;
|
|
1124
|
+
/**
|
|
1125
|
+
* Load a single test by exact ID match.
|
|
1126
|
+
* Throws if the ID is not found.
|
|
1127
|
+
*/
|
|
1128
|
+
declare function loadTestById(evalFilePath: string, repoRoot: URL | string, evalId: string): Promise<EvalTest>;
|
|
1129
|
+
/** @deprecated Use `loadTestById` instead */
|
|
1130
|
+
declare const loadEvalCaseById: typeof loadTestById;
|
|
686
1131
|
|
|
687
1132
|
declare function fileExists(filePath: string): Promise<boolean>;
|
|
688
1133
|
/**
|
|
@@ -744,6 +1189,7 @@ declare const CliTargetConfigSchema: z.ZodObject<{
|
|
|
744
1189
|
commandTemplate: z.ZodString;
|
|
745
1190
|
filesFormat: z.ZodOptional<z.ZodString>;
|
|
746
1191
|
cwd: z.ZodOptional<z.ZodString>;
|
|
1192
|
+
workspaceTemplate: z.ZodOptional<z.ZodString>;
|
|
747
1193
|
timeoutMs: z.ZodOptional<z.ZodNumber>;
|
|
748
1194
|
healthcheck: z.ZodOptional<z.ZodDiscriminatedUnion<"type", [z.ZodObject<{
|
|
749
1195
|
type: z.ZodLiteral<"http">;
|
|
@@ -780,6 +1226,7 @@ declare const CliTargetConfigSchema: z.ZodObject<{
|
|
|
780
1226
|
cwd?: string | undefined;
|
|
781
1227
|
verbose?: boolean | undefined;
|
|
782
1228
|
filesFormat?: string | undefined;
|
|
1229
|
+
workspaceTemplate?: string | undefined;
|
|
783
1230
|
healthcheck?: {
|
|
784
1231
|
type: "http";
|
|
785
1232
|
url: string;
|
|
@@ -797,6 +1244,7 @@ declare const CliTargetConfigSchema: z.ZodObject<{
|
|
|
797
1244
|
cwd?: string | undefined;
|
|
798
1245
|
verbose?: boolean | undefined;
|
|
799
1246
|
filesFormat?: string | undefined;
|
|
1247
|
+
workspaceTemplate?: string | undefined;
|
|
800
1248
|
healthcheck?: {
|
|
801
1249
|
type: "http";
|
|
802
1250
|
url: string;
|
|
@@ -858,19 +1306,34 @@ interface GeminiResolvedConfig {
|
|
|
858
1306
|
readonly retry?: RetryConfig;
|
|
859
1307
|
}
|
|
860
1308
|
interface CodexResolvedConfig {
|
|
1309
|
+
readonly model?: string;
|
|
861
1310
|
readonly executable: string;
|
|
862
1311
|
readonly args?: readonly string[];
|
|
863
1312
|
readonly cwd?: string;
|
|
1313
|
+
readonly workspaceTemplate?: string;
|
|
864
1314
|
readonly timeoutMs?: number;
|
|
865
1315
|
readonly logDir?: string;
|
|
866
1316
|
readonly logFormat?: 'summary' | 'json';
|
|
867
1317
|
readonly systemPrompt?: string;
|
|
868
1318
|
}
|
|
869
|
-
interface
|
|
1319
|
+
interface CopilotCliResolvedConfig {
|
|
870
1320
|
readonly executable: string;
|
|
871
1321
|
readonly model?: string;
|
|
872
1322
|
readonly args?: readonly string[];
|
|
873
1323
|
readonly cwd?: string;
|
|
1324
|
+
readonly workspaceTemplate?: string;
|
|
1325
|
+
readonly timeoutMs?: number;
|
|
1326
|
+
readonly logDir?: string;
|
|
1327
|
+
readonly logFormat?: 'summary' | 'json';
|
|
1328
|
+
readonly systemPrompt?: string;
|
|
1329
|
+
}
|
|
1330
|
+
interface CopilotSdkResolvedConfig {
|
|
1331
|
+
readonly cliUrl?: string;
|
|
1332
|
+
readonly cliPath?: string;
|
|
1333
|
+
readonly githubToken?: string;
|
|
1334
|
+
readonly model?: string;
|
|
1335
|
+
readonly cwd?: string;
|
|
1336
|
+
readonly workspaceTemplate?: string;
|
|
874
1337
|
readonly timeoutMs?: number;
|
|
875
1338
|
readonly logDir?: string;
|
|
876
1339
|
readonly logFormat?: 'summary' | 'json';
|
|
@@ -885,6 +1348,7 @@ interface PiCodingAgentResolvedConfig {
|
|
|
885
1348
|
readonly thinking?: string;
|
|
886
1349
|
readonly args?: readonly string[];
|
|
887
1350
|
readonly cwd?: string;
|
|
1351
|
+
readonly workspaceTemplate?: string;
|
|
888
1352
|
readonly timeoutMs?: number;
|
|
889
1353
|
readonly logDir?: string;
|
|
890
1354
|
readonly logFormat?: 'summary' | 'json';
|
|
@@ -897,13 +1361,14 @@ interface PiAgentSdkResolvedConfig {
|
|
|
897
1361
|
readonly timeoutMs?: number;
|
|
898
1362
|
readonly systemPrompt?: string;
|
|
899
1363
|
}
|
|
900
|
-
interface
|
|
901
|
-
readonly executable: string;
|
|
1364
|
+
interface ClaudeResolvedConfig {
|
|
902
1365
|
readonly model?: string;
|
|
903
1366
|
readonly systemPrompt?: string;
|
|
904
|
-
readonly args?: readonly string[];
|
|
905
1367
|
readonly cwd?: string;
|
|
1368
|
+
readonly workspaceTemplate?: string;
|
|
906
1369
|
readonly timeoutMs?: number;
|
|
1370
|
+
readonly maxTurns?: number;
|
|
1371
|
+
readonly maxBudgetUsd?: number;
|
|
907
1372
|
readonly logDir?: string;
|
|
908
1373
|
readonly logFormat?: 'summary' | 'json';
|
|
909
1374
|
}
|
|
@@ -914,11 +1379,12 @@ interface MockResolvedConfig {
|
|
|
914
1379
|
readonly delayMaxMs?: number;
|
|
915
1380
|
}
|
|
916
1381
|
interface VSCodeResolvedConfig {
|
|
917
|
-
readonly
|
|
1382
|
+
readonly executable: string;
|
|
918
1383
|
readonly waitForResponse: boolean;
|
|
919
1384
|
readonly dryRun: boolean;
|
|
920
1385
|
readonly subagentRoot?: string;
|
|
921
1386
|
readonly workspaceTemplate?: string;
|
|
1387
|
+
readonly timeoutMs?: number;
|
|
922
1388
|
}
|
|
923
1389
|
type ResolvedTarget = {
|
|
924
1390
|
readonly kind: 'azure';
|
|
@@ -948,13 +1414,20 @@ type ResolvedTarget = {
|
|
|
948
1414
|
readonly workers?: number;
|
|
949
1415
|
readonly providerBatching?: boolean;
|
|
950
1416
|
readonly config: CodexResolvedConfig;
|
|
1417
|
+
} | {
|
|
1418
|
+
readonly kind: 'copilot';
|
|
1419
|
+
readonly name: string;
|
|
1420
|
+
readonly judgeTarget?: string;
|
|
1421
|
+
readonly workers?: number;
|
|
1422
|
+
readonly providerBatching?: boolean;
|
|
1423
|
+
readonly config: CopilotSdkResolvedConfig;
|
|
951
1424
|
} | {
|
|
952
1425
|
readonly kind: 'copilot-cli';
|
|
953
1426
|
readonly name: string;
|
|
954
1427
|
readonly judgeTarget?: string;
|
|
955
1428
|
readonly workers?: number;
|
|
956
1429
|
readonly providerBatching?: boolean;
|
|
957
|
-
readonly config:
|
|
1430
|
+
readonly config: CopilotCliResolvedConfig;
|
|
958
1431
|
} | {
|
|
959
1432
|
readonly kind: 'pi-coding-agent';
|
|
960
1433
|
readonly name: string;
|
|
@@ -970,12 +1443,12 @@ type ResolvedTarget = {
|
|
|
970
1443
|
readonly providerBatching?: boolean;
|
|
971
1444
|
readonly config: PiAgentSdkResolvedConfig;
|
|
972
1445
|
} | {
|
|
973
|
-
readonly kind: 'claude
|
|
1446
|
+
readonly kind: 'claude';
|
|
974
1447
|
readonly name: string;
|
|
975
1448
|
readonly judgeTarget?: string;
|
|
976
1449
|
readonly workers?: number;
|
|
977
1450
|
readonly providerBatching?: boolean;
|
|
978
|
-
readonly config:
|
|
1451
|
+
readonly config: ClaudeResolvedConfig;
|
|
979
1452
|
} | {
|
|
980
1453
|
readonly kind: 'mock';
|
|
981
1454
|
readonly name: string;
|
|
@@ -1000,6 +1473,42 @@ type ResolvedTarget = {
|
|
|
1000
1473
|
};
|
|
1001
1474
|
declare function resolveTargetDefinition(definition: TargetDefinition, env?: EnvLookup, evalFilePath?: string): ResolvedTarget;
|
|
1002
1475
|
|
|
1476
|
+
/**
|
|
1477
|
+
* Extensible provider registry.
|
|
1478
|
+
*
|
|
1479
|
+
* Replaces the hardcoded switch/case dispatch in createProvider() with
|
|
1480
|
+
* a registry of named factory functions. Built-in providers are registered
|
|
1481
|
+
* at startup; users can add custom providers via the registry API or by
|
|
1482
|
+
* dropping files in `.agentv/providers/`.
|
|
1483
|
+
*/
|
|
1484
|
+
|
|
1485
|
+
/**
|
|
1486
|
+
* Factory function that creates a Provider instance from a resolved target.
|
|
1487
|
+
*/
|
|
1488
|
+
type ProviderFactoryFn = (target: ResolvedTarget) => Provider;
|
|
1489
|
+
/**
|
|
1490
|
+
* Registry of provider factory functions keyed by provider kind.
|
|
1491
|
+
*
|
|
1492
|
+
* Built-in providers are registered at startup. Custom providers can be
|
|
1493
|
+
* registered via the `register()` method.
|
|
1494
|
+
*/
|
|
1495
|
+
declare class ProviderRegistry {
|
|
1496
|
+
private readonly factories;
|
|
1497
|
+
/** Register a factory function for a provider kind. */
|
|
1498
|
+
register(kind: string, factory: ProviderFactoryFn): this;
|
|
1499
|
+
/** Get the factory function for a provider kind. */
|
|
1500
|
+
get(kind: string): ProviderFactoryFn | undefined;
|
|
1501
|
+
/** Check if a factory is registered for the given kind. */
|
|
1502
|
+
has(kind: string): boolean;
|
|
1503
|
+
/** List all registered provider kind names. */
|
|
1504
|
+
list(): string[];
|
|
1505
|
+
/**
|
|
1506
|
+
* Create a provider instance from a resolved target.
|
|
1507
|
+
* Falls back to CLI provider for unknown kinds (custom provider escape hatch).
|
|
1508
|
+
*/
|
|
1509
|
+
create(target: ResolvedTarget): Provider;
|
|
1510
|
+
}
|
|
1511
|
+
|
|
1003
1512
|
declare function readTargetDefinitions(filePath: string): Promise<readonly TargetDefinition[]>;
|
|
1004
1513
|
declare function listTargetNames(definitions: readonly TargetDefinition[]): readonly string[];
|
|
1005
1514
|
|
|
@@ -1007,6 +1516,7 @@ interface EnsureSubagentsOptions {
|
|
|
1007
1516
|
readonly kind: 'vscode' | 'vscode-insiders';
|
|
1008
1517
|
readonly count: number;
|
|
1009
1518
|
readonly verbose?: boolean;
|
|
1519
|
+
readonly vscodeCmd?: string;
|
|
1010
1520
|
}
|
|
1011
1521
|
interface EnsureSubagentsResult {
|
|
1012
1522
|
readonly provisioned: boolean;
|
|
@@ -1041,15 +1551,25 @@ type PiLogListener = (entry: PiLogEntry) => void;
|
|
|
1041
1551
|
declare function consumePiLogEntries(): PiLogEntry[];
|
|
1042
1552
|
declare function subscribeToPiLogEntries(listener: PiLogListener): () => void;
|
|
1043
1553
|
|
|
1044
|
-
type
|
|
1554
|
+
type ClaudeLogEntry = {
|
|
1555
|
+
readonly filePath: string;
|
|
1556
|
+
readonly evalCaseId?: string;
|
|
1557
|
+
readonly targetName: string;
|
|
1558
|
+
readonly attempt?: number;
|
|
1559
|
+
};
|
|
1560
|
+
type ClaudeLogListener = (entry: ClaudeLogEntry) => void;
|
|
1561
|
+
declare function consumeClaudeLogEntries(): ClaudeLogEntry[];
|
|
1562
|
+
declare function subscribeToClaudeLogEntries(listener: ClaudeLogListener): () => void;
|
|
1563
|
+
|
|
1564
|
+
type CopilotSdkLogEntry = {
|
|
1045
1565
|
readonly filePath: string;
|
|
1046
1566
|
readonly evalCaseId?: string;
|
|
1047
1567
|
readonly targetName: string;
|
|
1048
1568
|
readonly attempt?: number;
|
|
1049
1569
|
};
|
|
1050
|
-
type
|
|
1051
|
-
declare function
|
|
1052
|
-
declare function
|
|
1570
|
+
type CopilotSdkLogListener = (entry: CopilotSdkLogEntry) => void;
|
|
1571
|
+
declare function consumeCopilotSdkLogEntries(): CopilotSdkLogEntry[];
|
|
1572
|
+
declare function subscribeToCopilotSdkLogEntries(listener: CopilotSdkLogListener): () => void;
|
|
1053
1573
|
|
|
1054
1574
|
type CopilotCliLogEntry = {
|
|
1055
1575
|
readonly filePath: string;
|
|
@@ -1061,6 +1581,38 @@ type CopilotCliLogListener = (entry: CopilotCliLogEntry) => void;
|
|
|
1061
1581
|
declare function consumeCopilotCliLogEntries(): CopilotCliLogEntry[];
|
|
1062
1582
|
declare function subscribeToCopilotCliLogEntries(listener: CopilotCliLogListener): () => void;
|
|
1063
1583
|
|
|
1584
|
+
/**
|
|
1585
|
+
* Convention-based discovery of custom provider scripts.
|
|
1586
|
+
*
|
|
1587
|
+
* Scans `.agentv/providers/` for TypeScript/JavaScript files and registers
|
|
1588
|
+
* them as CLI-like providers in the registry. The file name (without
|
|
1589
|
+
* extension) becomes the provider kind name.
|
|
1590
|
+
*
|
|
1591
|
+
* Example: `.agentv/providers/my-llm.ts` -> provider kind "my-llm" in targets.yaml
|
|
1592
|
+
*/
|
|
1593
|
+
|
|
1594
|
+
/**
|
|
1595
|
+
* Discover custom provider scripts from `.agentv/providers/` and register
|
|
1596
|
+
* them as provider kinds in the registry.
|
|
1597
|
+
*
|
|
1598
|
+
* Each discovered script is registered as a CLI-like provider that runs
|
|
1599
|
+
* via `bun run <filePath> {PROMPT}`. The script receives the prompt as
|
|
1600
|
+
* a CLI argument and should print its response to stdout.
|
|
1601
|
+
*
|
|
1602
|
+
* @param registry - The provider registry to register discovered providers into
|
|
1603
|
+
* @param baseDir - The base directory to search from (typically project root or eval file dir)
|
|
1604
|
+
* @returns Names of discovered provider kinds
|
|
1605
|
+
*/
|
|
1606
|
+
declare function discoverProviders(registry: ProviderRegistry, baseDir: string): Promise<string[]>;
|
|
1607
|
+
|
|
1608
|
+
/**
|
|
1609
|
+
* Create and return the default provider registry with all built-in providers.
|
|
1610
|
+
*/
|
|
1611
|
+
declare function createBuiltinProviderRegistry(): ProviderRegistry;
|
|
1612
|
+
/**
|
|
1613
|
+
* Create a provider from a resolved target using the default registry.
|
|
1614
|
+
* Custom providers can be registered via `createBuiltinProviderRegistry().register()`.
|
|
1615
|
+
*/
|
|
1064
1616
|
declare function createProvider(target: ResolvedTarget): Provider;
|
|
1065
1617
|
declare function resolveAndCreateProvider(definition: TargetDefinition, env?: EnvLookup): Provider;
|
|
1066
1618
|
|
|
@@ -1070,7 +1622,7 @@ declare function resolveAndCreateProvider(definition: TargetDefinition, env?: En
|
|
|
1070
1622
|
*/
|
|
1071
1623
|
type TargetResolver = (targetName: string) => Provider | undefined;
|
|
1072
1624
|
interface EvaluationContext {
|
|
1073
|
-
readonly evalCase:
|
|
1625
|
+
readonly evalCase: EvalTest;
|
|
1074
1626
|
readonly candidate: string;
|
|
1075
1627
|
readonly target: ResolvedTarget;
|
|
1076
1628
|
readonly provider: Provider;
|
|
@@ -1086,13 +1638,17 @@ interface EvaluationContext {
|
|
|
1086
1638
|
readonly evaluatorTemplateOverride?: string;
|
|
1087
1639
|
readonly evaluator?: EvaluatorConfig;
|
|
1088
1640
|
/** Output messages from agent execution (primary source for tool trajectory) */
|
|
1089
|
-
readonly
|
|
1641
|
+
readonly output?: readonly Message[];
|
|
1090
1642
|
/** Lightweight summary of trace events (if available) */
|
|
1091
|
-
readonly
|
|
1643
|
+
readonly trace?: TraceSummary;
|
|
1092
1644
|
/** Resolver for target override in code judges */
|
|
1093
1645
|
readonly targetResolver?: TargetResolver;
|
|
1094
1646
|
/** List of available target names for code judges */
|
|
1095
1647
|
readonly availableTargets?: readonly string[];
|
|
1648
|
+
/** Unified diff of file changes from workspace (when workspace_template is configured) */
|
|
1649
|
+
readonly fileChanges?: string;
|
|
1650
|
+
/** Absolute path to the workspace directory (when workspace_template is configured) */
|
|
1651
|
+
readonly workspacePath?: string;
|
|
1096
1652
|
}
|
|
1097
1653
|
interface EvaluationScore {
|
|
1098
1654
|
readonly score: number;
|
|
@@ -1102,7 +1658,7 @@ interface EvaluationScore {
|
|
|
1102
1658
|
readonly expectedAspectCount: number;
|
|
1103
1659
|
readonly reasoning?: string;
|
|
1104
1660
|
readonly evaluatorRawRequest?: JsonObject;
|
|
1105
|
-
readonly
|
|
1661
|
+
readonly scores?: readonly ChildEvaluatorResult[];
|
|
1106
1662
|
/** Optional structured details from code judges (e.g., TP/TN/FP/FN counts, alignments). */
|
|
1107
1663
|
readonly details?: JsonObject;
|
|
1108
1664
|
}
|
|
@@ -1116,7 +1672,7 @@ interface ChildEvaluatorResult {
|
|
|
1116
1672
|
readonly misses: readonly string[];
|
|
1117
1673
|
readonly reasoning?: string;
|
|
1118
1674
|
readonly evaluatorRawRequest?: JsonObject;
|
|
1119
|
-
readonly
|
|
1675
|
+
readonly scores?: readonly ChildEvaluatorResult[];
|
|
1120
1676
|
/** Optional structured details from code judges (e.g., TP/TN/FP/FN counts, alignments). */
|
|
1121
1677
|
readonly details?: JsonObject;
|
|
1122
1678
|
}
|
|
@@ -1134,11 +1690,12 @@ declare function extractJsonBlob(text: string): string | undefined;
|
|
|
1134
1690
|
declare function parseJsonFromText(text: string): unknown;
|
|
1135
1691
|
declare function isNonEmptyString(value: unknown): value is string;
|
|
1136
1692
|
declare function parseJsonSafe(payload: string): Record<string, unknown> | undefined;
|
|
1693
|
+
declare function deepEqual(a: unknown, b: unknown): boolean;
|
|
1137
1694
|
/**
|
|
1138
|
-
*
|
|
1139
|
-
*
|
|
1695
|
+
* Negate an evaluation score: inverts score (1 - score), swaps pass/fail verdict,
|
|
1696
|
+
* swaps hits/misses, and annotates reasoning.
|
|
1140
1697
|
*/
|
|
1141
|
-
declare function
|
|
1698
|
+
declare function negateScore(score: EvaluationScore): EvaluationScore;
|
|
1142
1699
|
|
|
1143
1700
|
interface CodeEvaluatorOptions {
|
|
1144
1701
|
readonly script: readonly string[];
|
|
@@ -1175,6 +1732,7 @@ declare class CompositeEvaluator implements Evaluator {
|
|
|
1175
1732
|
evaluate(context: EvaluationContext): Promise<EvaluationScore>;
|
|
1176
1733
|
private aggregate;
|
|
1177
1734
|
private runWeightedAverage;
|
|
1735
|
+
private runThreshold;
|
|
1178
1736
|
private runCodeAggregator;
|
|
1179
1737
|
private runLlmAggregator;
|
|
1180
1738
|
}
|
|
@@ -1184,7 +1742,7 @@ interface CostEvaluatorOptions {
|
|
|
1184
1742
|
}
|
|
1185
1743
|
/**
|
|
1186
1744
|
* Evaluator that checks execution cost against a budget.
|
|
1187
|
-
* Uses
|
|
1745
|
+
* Uses trace.costUsd from the evaluation context.
|
|
1188
1746
|
*/
|
|
1189
1747
|
declare class CostEvaluator implements Evaluator {
|
|
1190
1748
|
readonly kind = "cost";
|
|
@@ -1193,6 +1751,25 @@ declare class CostEvaluator implements Evaluator {
|
|
|
1193
1751
|
evaluate(context: EvaluationContext): EvaluationScore;
|
|
1194
1752
|
}
|
|
1195
1753
|
|
|
1754
|
+
interface ExecutionMetricsEvaluatorOptions {
|
|
1755
|
+
readonly config: ExecutionMetricsEvaluatorConfig;
|
|
1756
|
+
}
|
|
1757
|
+
/**
|
|
1758
|
+
* Evaluator that checks execution metrics against configured thresholds.
|
|
1759
|
+
* Supports multiple threshold types: tool calls, LLM calls, tokens, cost, duration,
|
|
1760
|
+
* and exploration ratio. Only specified thresholds are checked.
|
|
1761
|
+
*
|
|
1762
|
+
* Score is proportional: hits.length / (hits.length + misses.length)
|
|
1763
|
+
*/
|
|
1764
|
+
declare class ExecutionMetricsEvaluator implements Evaluator {
|
|
1765
|
+
readonly kind = "execution_metrics";
|
|
1766
|
+
private readonly config;
|
|
1767
|
+
constructor(options: ExecutionMetricsEvaluatorOptions);
|
|
1768
|
+
evaluate(context: EvaluationContext): EvaluationScore;
|
|
1769
|
+
private extractConfiguredThresholds;
|
|
1770
|
+
private filterDefinedMetrics;
|
|
1771
|
+
}
|
|
1772
|
+
|
|
1196
1773
|
interface FieldAccuracyEvaluatorOptions {
|
|
1197
1774
|
readonly config: FieldAccuracyEvaluatorConfig;
|
|
1198
1775
|
}
|
|
@@ -1206,7 +1783,7 @@ declare class FieldAccuracyEvaluator implements Evaluator {
|
|
|
1206
1783
|
constructor(options: FieldAccuracyEvaluatorOptions);
|
|
1207
1784
|
evaluate(context: EvaluationContext): EvaluationScore;
|
|
1208
1785
|
/**
|
|
1209
|
-
* Extract expected data from
|
|
1786
|
+
* Extract expected data from expected_output array.
|
|
1210
1787
|
* Looks for the last assistant message with content.
|
|
1211
1788
|
*/
|
|
1212
1789
|
private extractExpectedData;
|
|
@@ -1237,7 +1814,7 @@ interface LatencyEvaluatorOptions {
|
|
|
1237
1814
|
}
|
|
1238
1815
|
/**
|
|
1239
1816
|
* Evaluator that checks execution duration against a threshold.
|
|
1240
|
-
* Uses
|
|
1817
|
+
* Uses trace.durationMs from the evaluation context.
|
|
1241
1818
|
*/
|
|
1242
1819
|
declare class LatencyEvaluator implements Evaluator {
|
|
1243
1820
|
readonly kind = "latency";
|
|
@@ -1246,6 +1823,11 @@ declare class LatencyEvaluator implements Evaluator {
|
|
|
1246
1823
|
evaluate(context: EvaluationContext): EvaluationScore;
|
|
1247
1824
|
}
|
|
1248
1825
|
|
|
1826
|
+
/**
|
|
1827
|
+
* Default evaluator template for the user prompt (variables will be substituted).
|
|
1828
|
+
* Custom evaluators can override this via evaluatorTemplate option.
|
|
1829
|
+
*/
|
|
1830
|
+
declare const DEFAULT_EVALUATOR_TEMPLATE: string;
|
|
1249
1831
|
type JudgeProviderResolver = (context: EvaluationContext) => Promise<Provider | undefined>;
|
|
1250
1832
|
interface LlmJudgeEvaluatorOptions {
|
|
1251
1833
|
readonly resolveJudgeProvider: JudgeProviderResolver;
|
|
@@ -1269,6 +1851,36 @@ declare const freeformEvaluationSchema: z.ZodObject<{
|
|
|
1269
1851
|
misses?: string[] | undefined;
|
|
1270
1852
|
reasoning?: string | undefined;
|
|
1271
1853
|
}>;
|
|
1854
|
+
declare const rubricEvaluationSchema: z.ZodObject<{
|
|
1855
|
+
checks: z.ZodArray<z.ZodObject<{
|
|
1856
|
+
id: z.ZodString;
|
|
1857
|
+
satisfied: z.ZodBoolean;
|
|
1858
|
+
reasoning: z.ZodString;
|
|
1859
|
+
}, "strip", z.ZodTypeAny, {
|
|
1860
|
+
reasoning: string;
|
|
1861
|
+
id: string;
|
|
1862
|
+
satisfied: boolean;
|
|
1863
|
+
}, {
|
|
1864
|
+
reasoning: string;
|
|
1865
|
+
id: string;
|
|
1866
|
+
satisfied: boolean;
|
|
1867
|
+
}>, "many">;
|
|
1868
|
+
overall_reasoning: z.ZodString;
|
|
1869
|
+
}, "strip", z.ZodTypeAny, {
|
|
1870
|
+
checks: {
|
|
1871
|
+
reasoning: string;
|
|
1872
|
+
id: string;
|
|
1873
|
+
satisfied: boolean;
|
|
1874
|
+
}[];
|
|
1875
|
+
overall_reasoning: string;
|
|
1876
|
+
}, {
|
|
1877
|
+
checks: {
|
|
1878
|
+
reasoning: string;
|
|
1879
|
+
id: string;
|
|
1880
|
+
satisfied: boolean;
|
|
1881
|
+
}[];
|
|
1882
|
+
overall_reasoning: string;
|
|
1883
|
+
}>;
|
|
1272
1884
|
|
|
1273
1885
|
declare class LlmJudgeEvaluator implements Evaluator {
|
|
1274
1886
|
readonly kind = "llm_judge";
|
|
@@ -1297,13 +1909,87 @@ declare class LlmJudgeEvaluator implements Evaluator {
|
|
|
1297
1909
|
* This schema is always appended to the evaluator template.
|
|
1298
1910
|
*/
|
|
1299
1911
|
declare function buildOutputSchema(): string;
|
|
1912
|
+
declare function buildRubricOutputSchema(): string;
|
|
1913
|
+
declare function substituteVariables(template: string, variables: Record<string, string>): string;
|
|
1914
|
+
declare function calculateRubricScore(result: z.infer<typeof rubricEvaluationSchema>, rubrics: readonly RubricItem[]): {
|
|
1915
|
+
score: number;
|
|
1916
|
+
verdict: 'pass' | 'fail' | 'borderline';
|
|
1917
|
+
hits: string[];
|
|
1918
|
+
misses: string[];
|
|
1919
|
+
};
|
|
1920
|
+
/**
|
|
1921
|
+
* Build the output schema for score-range rubric evaluation.
|
|
1922
|
+
*/
|
|
1923
|
+
declare function buildScoreRangeOutputSchema(): string;
|
|
1924
|
+
|
|
1925
|
+
interface AgentJudgeEvaluatorOptions {
|
|
1926
|
+
readonly resolveJudgeProvider: (ctx: EvaluationContext) => Promise<Provider | undefined>;
|
|
1927
|
+
readonly maxSteps?: number;
|
|
1928
|
+
readonly temperature?: number;
|
|
1929
|
+
readonly evaluatorTemplate?: string;
|
|
1930
|
+
readonly judgeTargetProvider?: Provider;
|
|
1931
|
+
}
|
|
1932
|
+
declare class AgentJudgeEvaluator implements Evaluator {
|
|
1933
|
+
readonly kind = "agent_judge";
|
|
1934
|
+
private readonly resolveJudgeProvider;
|
|
1935
|
+
private readonly maxSteps;
|
|
1936
|
+
private readonly temperature;
|
|
1937
|
+
private readonly evaluatorTemplate?;
|
|
1938
|
+
private readonly judgeTargetProvider?;
|
|
1939
|
+
constructor(options: AgentJudgeEvaluatorOptions);
|
|
1940
|
+
evaluate(context: EvaluationContext): Promise<EvaluationScore>;
|
|
1941
|
+
/**
|
|
1942
|
+
* Built-in mode: Uses Vercel AI SDK generateText() with sandboxed filesystem tools.
|
|
1943
|
+
*/
|
|
1944
|
+
private evaluateBuiltIn;
|
|
1945
|
+
/**
|
|
1946
|
+
* Judge target mode: Delegates to an external agent provider via Provider.invoke().
|
|
1947
|
+
*/
|
|
1948
|
+
private evaluateWithJudgeTarget;
|
|
1949
|
+
/**
|
|
1950
|
+
* Parse the agent's response text into an EvaluationScore.
|
|
1951
|
+
* Supports both freeform and rubric modes.
|
|
1952
|
+
*/
|
|
1953
|
+
private parseResult;
|
|
1954
|
+
/**
|
|
1955
|
+
* Build system prompt for built-in mode.
|
|
1956
|
+
* Includes output format instructions.
|
|
1957
|
+
*/
|
|
1958
|
+
private buildSystemPrompt;
|
|
1959
|
+
/**
|
|
1960
|
+
* Build user prompt for built-in mode.
|
|
1961
|
+
* Uses custom template if provided, otherwise builds default prompt.
|
|
1962
|
+
*/
|
|
1963
|
+
private buildUserPrompt;
|
|
1964
|
+
/**
|
|
1965
|
+
* Build the full evaluation prompt for judge target mode (delegation).
|
|
1966
|
+
* Combines task context, criteria, candidate info, and output format instructions.
|
|
1967
|
+
*/
|
|
1968
|
+
private buildDelegatedPrompt;
|
|
1969
|
+
}
|
|
1970
|
+
|
|
1971
|
+
interface LlmJudgePromptAssembly {
|
|
1972
|
+
systemPrompt: string;
|
|
1973
|
+
userPrompt: string;
|
|
1974
|
+
responseSchema: string;
|
|
1975
|
+
mode: 'freeform' | 'checklist' | 'score_range';
|
|
1976
|
+
}
|
|
1977
|
+
declare function assembleLlmJudgePrompt(input: {
|
|
1978
|
+
evalCase: EvalTest;
|
|
1979
|
+
candidate: string;
|
|
1980
|
+
promptInputs: PromptInputs;
|
|
1981
|
+
evaluatorConfig?: LlmJudgeEvaluatorConfig;
|
|
1982
|
+
output?: readonly Message[];
|
|
1983
|
+
fileChanges?: string;
|
|
1984
|
+
evaluatorTemplateOverride?: string;
|
|
1985
|
+
}): LlmJudgePromptAssembly;
|
|
1300
1986
|
|
|
1301
1987
|
interface TokenUsageEvaluatorOptions {
|
|
1302
1988
|
readonly config: TokenUsageEvaluatorConfig;
|
|
1303
1989
|
}
|
|
1304
1990
|
/**
|
|
1305
1991
|
* Evaluator that checks provider-reported token usage against configured limits.
|
|
1306
|
-
* Uses
|
|
1992
|
+
* Uses trace.tokenUsage from the evaluation context.
|
|
1307
1993
|
*/
|
|
1308
1994
|
declare class TokenUsageEvaluator implements Evaluator {
|
|
1309
1995
|
readonly kind = "token_usage";
|
|
@@ -1331,6 +2017,109 @@ declare class ToolTrajectoryEvaluator implements Evaluator {
|
|
|
1331
2017
|
private evaluateAnyOrder;
|
|
1332
2018
|
private evaluateInOrder;
|
|
1333
2019
|
private evaluateExact;
|
|
2020
|
+
/**
|
|
2021
|
+
* Superset mode: actual trajectory must contain all expected tool calls.
|
|
2022
|
+
* Every expected item must be found in actual (greedy matching with consumption).
|
|
2023
|
+
* Extra tool calls in actual are OK.
|
|
2024
|
+
*/
|
|
2025
|
+
private evaluateSuperset;
|
|
2026
|
+
/**
|
|
2027
|
+
* Subset mode: every actual tool call must be in the allowed list.
|
|
2028
|
+
* Expected items are reusable (not consumed) - they define the allowed set.
|
|
2029
|
+
* If every actual call matches at least one expected item, score is 1.
|
|
2030
|
+
*/
|
|
2031
|
+
private evaluateSubset;
|
|
2032
|
+
}
|
|
2033
|
+
|
|
2034
|
+
/**
|
|
2035
|
+
* Deterministic assertion evaluators.
|
|
2036
|
+
*
|
|
2037
|
+
* Pure functions that check agent output against simple conditions
|
|
2038
|
+
* and return a binary score (0 or 1) with descriptive hits/misses.
|
|
2039
|
+
*/
|
|
2040
|
+
type AssertionResult = {
|
|
2041
|
+
score: number;
|
|
2042
|
+
hits: string[];
|
|
2043
|
+
misses: string[];
|
|
2044
|
+
};
|
|
2045
|
+
/** Checks if `output` contains the given `value` substring. */
|
|
2046
|
+
declare function runContainsAssertion(output: string, value: string): AssertionResult;
|
|
2047
|
+
/** Checks if `output` matches the given regex `pattern`. */
|
|
2048
|
+
declare function runRegexAssertion(output: string, pattern: string): AssertionResult;
|
|
2049
|
+
/** Checks if `output` is valid JSON. */
|
|
2050
|
+
declare function runIsJsonAssertion(output: string): AssertionResult;
|
|
2051
|
+
/** Checks if `output` exactly equals `value` (both trimmed). */
|
|
2052
|
+
declare function runEqualsAssertion(output: string, value: string): AssertionResult;
|
|
2053
|
+
|
|
2054
|
+
/**
|
|
2055
|
+
* Extensible evaluator registry.
|
|
2056
|
+
*
|
|
2057
|
+
* Replaces the hardcoded switch/case dispatch in the orchestrator with
|
|
2058
|
+
* a registry of named factory functions. Built-in evaluators are registered
|
|
2059
|
+
* at startup; users can add custom evaluators via `defineAssertion()` in
|
|
2060
|
+
* `@agentv/eval` or by dropping files in `.agentv/assertions/`.
|
|
2061
|
+
*/
|
|
2062
|
+
|
|
2063
|
+
/**
|
|
2064
|
+
* Context passed to evaluator factory functions during creation.
|
|
2065
|
+
* Contains shared resources needed by evaluator instances.
|
|
2066
|
+
*/
|
|
2067
|
+
interface EvaluatorDispatchContext {
|
|
2068
|
+
/** Shared LLM judge provider (resolved at suite level) */
|
|
2069
|
+
readonly judgeProvider?: Provider;
|
|
2070
|
+
/** Function to resolve target names to providers */
|
|
2071
|
+
readonly targetResolver?: TargetResolver;
|
|
2072
|
+
/** Available target names for code judges */
|
|
2073
|
+
readonly availableTargets?: readonly string[];
|
|
2074
|
+
/** Agent timeout in ms */
|
|
2075
|
+
readonly agentTimeoutMs?: number;
|
|
2076
|
+
/** Directory containing the eval file (for composite member resolution) */
|
|
2077
|
+
readonly evalFileDir?: string;
|
|
2078
|
+
/** Shared LLM judge evaluator instance */
|
|
2079
|
+
readonly llmJudge: Evaluator;
|
|
2080
|
+
/** Reference to the registry itself (for composite evaluators that need to create children) */
|
|
2081
|
+
readonly registry: EvaluatorRegistry;
|
|
2082
|
+
}
|
|
2083
|
+
/**
|
|
2084
|
+
* Factory function that creates an Evaluator instance from a config.
|
|
2085
|
+
*
|
|
2086
|
+
* Factory functions handle all type-specific initialization logic:
|
|
2087
|
+
* - Reading prompt files for LLM judges
|
|
2088
|
+
* - Resolving script paths for code judges
|
|
2089
|
+
* - Creating adapter evaluators for deterministic assertions
|
|
2090
|
+
*/
|
|
2091
|
+
type EvaluatorFactoryFn = (config: EvaluatorConfig, context: EvaluatorDispatchContext) => Evaluator | Promise<Evaluator>;
|
|
2092
|
+
/**
|
|
2093
|
+
* Registry of evaluator factory functions keyed by evaluator type name.
|
|
2094
|
+
*
|
|
2095
|
+
* Built-in evaluators are registered at startup. Custom evaluators can be
|
|
2096
|
+
* registered via the `register()` method or discovered from `.agentv/assertions/`.
|
|
2097
|
+
*/
|
|
2098
|
+
declare class EvaluatorRegistry {
|
|
2099
|
+
private readonly factories;
|
|
2100
|
+
/** Register a factory function for an evaluator type. */
|
|
2101
|
+
register(type: string, factory: EvaluatorFactoryFn): this;
|
|
2102
|
+
/** Get the factory function for an evaluator type. */
|
|
2103
|
+
get(type: string): EvaluatorFactoryFn | undefined;
|
|
2104
|
+
/** Check if a factory is registered for the given type. */
|
|
2105
|
+
has(type: string): boolean;
|
|
2106
|
+
/** List all registered evaluator type names. */
|
|
2107
|
+
list(): string[];
|
|
2108
|
+
/**
|
|
2109
|
+
* Create an evaluator instance from a config, using the registered factory.
|
|
2110
|
+
* Throws if no factory is registered for the evaluator type.
|
|
2111
|
+
*/
|
|
2112
|
+
create(config: EvaluatorConfig, context: EvaluatorDispatchContext): Promise<Evaluator>;
|
|
2113
|
+
}
|
|
2114
|
+
/**
|
|
2115
|
+
* Adapter that wraps a synchronous assertion function as an Evaluator.
|
|
2116
|
+
* Used for deterministic assertions (contains, regex, is_json, equals).
|
|
2117
|
+
*/
|
|
2118
|
+
declare class DeterministicAssertionEvaluator implements Evaluator {
|
|
2119
|
+
private readonly assertFn;
|
|
2120
|
+
readonly kind: string;
|
|
2121
|
+
constructor(kind: string, assertFn: (context: EvaluationContext) => EvaluationScore);
|
|
2122
|
+
evaluate(context: EvaluationContext): EvaluationScore;
|
|
1334
2123
|
}
|
|
1335
2124
|
|
|
1336
2125
|
type MaybePromise<T> = T | Promise<T>;
|
|
@@ -1339,7 +2128,7 @@ interface EvaluationCache {
|
|
|
1339
2128
|
set(key: string, value: ProviderResponse): MaybePromise<void>;
|
|
1340
2129
|
}
|
|
1341
2130
|
interface RunEvalCaseOptions {
|
|
1342
|
-
readonly evalCase:
|
|
2131
|
+
readonly evalCase: EvalTest;
|
|
1343
2132
|
readonly provider: Provider;
|
|
1344
2133
|
readonly target: ResolvedTarget;
|
|
1345
2134
|
readonly evaluators: Partial<Record<string, Evaluator>> & {
|
|
@@ -1356,10 +2145,26 @@ interface RunEvalCaseOptions {
|
|
|
1356
2145
|
readonly targetResolver?: (name: string) => Provider | undefined;
|
|
1357
2146
|
/** List of available target names for code judges */
|
|
1358
2147
|
readonly availableTargets?: readonly string[];
|
|
2148
|
+
/** Unique identifier for the evaluation run (used for workspace management) */
|
|
2149
|
+
readonly evalRunId?: string;
|
|
2150
|
+
/** Keep workspace on success (default: cleanup on success, keep on failure) */
|
|
2151
|
+
readonly keepWorkspaces?: boolean;
|
|
2152
|
+
/** Force cleanup of workspaces even on failure */
|
|
2153
|
+
readonly cleanupWorkspaces?: boolean;
|
|
2154
|
+
/** Pre-created shared workspace path (shared across tests in a suite) */
|
|
2155
|
+
readonly sharedWorkspacePath?: string;
|
|
2156
|
+
/** Pre-initialized baseline commit for shared workspace */
|
|
2157
|
+
readonly sharedBaselineCommit?: string;
|
|
2158
|
+
/** Suite-level .code-workspace file (resolved from workspace.template) */
|
|
2159
|
+
readonly suiteWorkspaceFile?: string;
|
|
2160
|
+
/** Real-time observability callbacks passed to the provider */
|
|
2161
|
+
readonly streamCallbacks?: ProviderStreamCallbacks;
|
|
2162
|
+
/** Evaluator type registry (with custom assertions discovered) */
|
|
2163
|
+
readonly typeRegistry?: EvaluatorRegistry;
|
|
1359
2164
|
}
|
|
1360
2165
|
interface ProgressEvent {
|
|
1361
2166
|
readonly workerId: number;
|
|
1362
|
-
readonly
|
|
2167
|
+
readonly testId: string;
|
|
1363
2168
|
readonly status: 'pending' | 'running' | 'completed' | 'failed';
|
|
1364
2169
|
readonly startedAt?: number;
|
|
1365
2170
|
readonly completedAt?: number;
|
|
@@ -1378,19 +2183,367 @@ interface RunEvaluationOptions {
|
|
|
1378
2183
|
readonly cache?: EvaluationCache;
|
|
1379
2184
|
readonly useCache?: boolean;
|
|
1380
2185
|
readonly now?: () => Date;
|
|
1381
|
-
/** Filter
|
|
2186
|
+
/** Filter tests by ID pattern (glob supported, e.g., "summary-*") */
|
|
1382
2187
|
readonly filter?: string;
|
|
1383
2188
|
readonly verbose?: boolean;
|
|
1384
2189
|
readonly maxConcurrency?: number;
|
|
1385
|
-
readonly evalCases?: readonly
|
|
2190
|
+
readonly evalCases?: readonly EvalTest[];
|
|
1386
2191
|
readonly onResult?: (result: EvaluationResult) => MaybePromise<void>;
|
|
1387
2192
|
readonly onProgress?: (event: ProgressEvent) => MaybePromise<void>;
|
|
2193
|
+
/** Keep workspace on success (default: cleanup on success, keep on failure) */
|
|
2194
|
+
readonly keepWorkspaces?: boolean;
|
|
2195
|
+
/** Force cleanup of workspaces even on failure */
|
|
2196
|
+
readonly cleanupWorkspaces?: boolean;
|
|
2197
|
+
/** Trial configuration for running eval cases multiple times */
|
|
2198
|
+
readonly trials?: TrialsConfig;
|
|
2199
|
+
/** Real-time observability callbacks passed to the provider */
|
|
2200
|
+
readonly streamCallbacks?: ProviderStreamCallbacks;
|
|
1388
2201
|
}
|
|
1389
2202
|
declare function runEvaluation(options: RunEvaluationOptions): Promise<readonly EvaluationResult[]>;
|
|
1390
2203
|
declare function runEvalCase(options: RunEvalCaseOptions): Promise<EvaluationResult>;
|
|
1391
2204
|
|
|
2205
|
+
/**
|
|
2206
|
+
* Programmatic API for running evaluations.
|
|
2207
|
+
*
|
|
2208
|
+
* Provides `evaluate()` — a high-level function for using AgentV as a library
|
|
2209
|
+
* instead of a CLI. The config shape mirrors the YAML structure for easy
|
|
2210
|
+
* translation between file-based and programmatic usage.
|
|
2211
|
+
*
|
|
2212
|
+
* @example Inline tests
|
|
2213
|
+
* ```typescript
|
|
2214
|
+
* import { evaluate } from '@agentv/core';
|
|
2215
|
+
*
|
|
2216
|
+
* const results = await evaluate({
|
|
2217
|
+
* tests: [
|
|
2218
|
+
* {
|
|
2219
|
+
* id: 'capital',
|
|
2220
|
+
* input: 'What is the capital of France?',
|
|
2221
|
+
* expected_output: 'Paris',
|
|
2222
|
+
* assert: [{ type: 'contains', value: 'Paris' }],
|
|
2223
|
+
* },
|
|
2224
|
+
* ],
|
|
2225
|
+
* target: { provider: 'mock_agent' },
|
|
2226
|
+
* });
|
|
2227
|
+
*
|
|
2228
|
+
* console.log(results.summary.passed, 'passed');
|
|
2229
|
+
* ```
|
|
2230
|
+
*
|
|
2231
|
+
* @example File-based
|
|
2232
|
+
* ```typescript
|
|
2233
|
+
* const results = await evaluate({
|
|
2234
|
+
* specFile: './evals/EVAL.yaml',
|
|
2235
|
+
* target: { provider: 'claude_agent' },
|
|
2236
|
+
* });
|
|
2237
|
+
* ```
|
|
2238
|
+
*
|
|
2239
|
+
* @module
|
|
2240
|
+
*/
|
|
2241
|
+
|
|
2242
|
+
/**
|
|
2243
|
+
* Inline test definition for the programmatic API.
|
|
2244
|
+
* Mirrors the YAML test structure.
|
|
2245
|
+
*/
|
|
2246
|
+
interface EvalTestInput {
|
|
2247
|
+
/** Unique test identifier */
|
|
2248
|
+
readonly id: string;
|
|
2249
|
+
/** What the response should accomplish */
|
|
2250
|
+
readonly criteria?: string;
|
|
2251
|
+
/** Input to the agent (string or message array) */
|
|
2252
|
+
readonly input: string | readonly {
|
|
2253
|
+
role: string;
|
|
2254
|
+
content: string;
|
|
2255
|
+
}[];
|
|
2256
|
+
/** Expected reference output */
|
|
2257
|
+
readonly expected_output?: string;
|
|
2258
|
+
/** Assertion evaluators */
|
|
2259
|
+
readonly assert?: readonly EvalAssertionInput[];
|
|
2260
|
+
/** Arbitrary metadata */
|
|
2261
|
+
readonly metadata?: Record<string, unknown>;
|
|
2262
|
+
}
|
|
2263
|
+
/**
|
|
2264
|
+
* Inline assertion definition for the programmatic API.
|
|
2265
|
+
* Matches the YAML `assert` block structure.
|
|
2266
|
+
*/
|
|
2267
|
+
interface EvalAssertionInput {
|
|
2268
|
+
/** Assertion type (e.g., 'contains', 'llm_judge', 'code_judge') */
|
|
2269
|
+
readonly type: string;
|
|
2270
|
+
/** Display name */
|
|
2271
|
+
readonly name?: string;
|
|
2272
|
+
/** Value for deterministic assertions (contains, equals, regex) */
|
|
2273
|
+
readonly value?: string;
|
|
2274
|
+
/** Weight for scoring */
|
|
2275
|
+
readonly weight?: number;
|
|
2276
|
+
/** Whether this assertion is required to pass */
|
|
2277
|
+
readonly required?: boolean | number;
|
|
2278
|
+
/** Prompt file for llm_judge */
|
|
2279
|
+
readonly prompt?: string;
|
|
2280
|
+
/** Script for code_judge */
|
|
2281
|
+
readonly script?: string | readonly string[];
|
|
2282
|
+
/** Additional config passed to the assertion */
|
|
2283
|
+
readonly config?: Record<string, unknown>;
|
|
2284
|
+
/** Nested assertions for composite type */
|
|
2285
|
+
readonly assert?: readonly EvalAssertionInput[];
|
|
2286
|
+
/** Rubric criteria for rubrics type */
|
|
2287
|
+
readonly criteria?: readonly (string | {
|
|
2288
|
+
id?: string;
|
|
2289
|
+
outcome: string;
|
|
2290
|
+
weight?: number;
|
|
2291
|
+
})[];
|
|
2292
|
+
/** Additional properties */
|
|
2293
|
+
readonly [key: string]: unknown;
|
|
2294
|
+
}
|
|
2295
|
+
/**
|
|
2296
|
+
* Configuration for `evaluate()`.
|
|
2297
|
+
* Accepts either inline tests or a spec file path.
|
|
2298
|
+
*/
|
|
2299
|
+
interface EvalConfig {
|
|
2300
|
+
/** Inline test definitions (mutually exclusive with specFile) */
|
|
2301
|
+
readonly tests?: readonly EvalTestInput[];
|
|
2302
|
+
/** Path to an EVAL.yaml spec file (mutually exclusive with tests) */
|
|
2303
|
+
readonly specFile?: string;
|
|
2304
|
+
/** Target provider configuration */
|
|
2305
|
+
readonly target?: TargetDefinition;
|
|
2306
|
+
/** Suite-level assertions applied to all tests */
|
|
2307
|
+
readonly assert?: readonly EvalAssertionInput[];
|
|
2308
|
+
/** Filter tests by ID pattern (glob supported) */
|
|
2309
|
+
readonly filter?: string;
|
|
2310
|
+
/** Maximum concurrent workers (default: 3) */
|
|
2311
|
+
readonly workers?: number;
|
|
2312
|
+
/** Maximum retries on failure (default: 2) */
|
|
2313
|
+
readonly maxRetries?: number;
|
|
2314
|
+
/** Agent timeout in milliseconds (default: 120000) */
|
|
2315
|
+
readonly agentTimeoutMs?: number;
|
|
2316
|
+
/** Enable response caching */
|
|
2317
|
+
readonly cache?: boolean;
|
|
2318
|
+
/** Verbose logging */
|
|
2319
|
+
readonly verbose?: boolean;
|
|
2320
|
+
/** Callback for each completed result */
|
|
2321
|
+
readonly onResult?: (result: EvaluationResult) => void;
|
|
2322
|
+
}
|
|
2323
|
+
/**
|
|
2324
|
+
* Summary statistics for an evaluation run.
|
|
2325
|
+
*/
|
|
2326
|
+
interface EvalSummary {
|
|
2327
|
+
/** Total number of test cases */
|
|
2328
|
+
readonly total: number;
|
|
2329
|
+
/** Number of passing test cases (score >= 0.8) */
|
|
2330
|
+
readonly passed: number;
|
|
2331
|
+
/** Number of failing test cases (score < 0.5) */
|
|
2332
|
+
readonly failed: number;
|
|
2333
|
+
/** Number of borderline test cases (0.5 <= score < 0.8) */
|
|
2334
|
+
readonly borderline: number;
|
|
2335
|
+
/** Total duration in milliseconds */
|
|
2336
|
+
readonly durationMs: number;
|
|
2337
|
+
/** Mean score across all cases */
|
|
2338
|
+
readonly meanScore: number;
|
|
2339
|
+
}
|
|
2340
|
+
/**
|
|
2341
|
+
* Result of an `evaluate()` call.
|
|
2342
|
+
*/
|
|
2343
|
+
interface EvalRunResult {
|
|
2344
|
+
/** Individual test case results */
|
|
2345
|
+
readonly results: readonly EvaluationResult[];
|
|
2346
|
+
/** Aggregate summary statistics */
|
|
2347
|
+
readonly summary: EvalSummary;
|
|
2348
|
+
}
|
|
2349
|
+
/**
|
|
2350
|
+
* Run an evaluation suite against a target provider.
|
|
2351
|
+
*
|
|
2352
|
+
* Accepts either inline test definitions or a path to an EVAL.yaml spec file.
|
|
2353
|
+
* The config shape mirrors the YAML structure — users can translate between
|
|
2354
|
+
* file-based and programmatic usage 1:1.
|
|
2355
|
+
*
|
|
2356
|
+
* @param config - Evaluation configuration
|
|
2357
|
+
* @returns Typed evaluation results with summary statistics
|
|
2358
|
+
*
|
|
2359
|
+
* @example Inline tests with assertions
|
|
2360
|
+
* ```typescript
|
|
2361
|
+
* const { results, summary } = await evaluate({
|
|
2362
|
+
* tests: [
|
|
2363
|
+
* {
|
|
2364
|
+
* id: 'greeting',
|
|
2365
|
+
* input: 'Say hello',
|
|
2366
|
+
* assert: [{ type: 'contains', value: 'hello' }],
|
|
2367
|
+
* },
|
|
2368
|
+
* ],
|
|
2369
|
+
* target: { provider: 'mock_agent' },
|
|
2370
|
+
* });
|
|
2371
|
+
* console.log(`${summary.passed}/${summary.total} passed`);
|
|
2372
|
+
* ```
|
|
2373
|
+
*
|
|
2374
|
+
* @example Load from YAML
|
|
2375
|
+
* ```typescript
|
|
2376
|
+
* const { summary } = await evaluate({
|
|
2377
|
+
* specFile: './evals/my-eval.yaml',
|
|
2378
|
+
* filter: 'greeting-*',
|
|
2379
|
+
* });
|
|
2380
|
+
* ```
|
|
2381
|
+
*/
|
|
2382
|
+
declare function evaluate(config: EvalConfig): Promise<EvalRunResult>;
|
|
2383
|
+
|
|
2384
|
+
/**
|
|
2385
|
+
* Typed configuration file support for AgentV.
|
|
2386
|
+
*
|
|
2387
|
+
* Provides `defineConfig()` for use in `agentv.config.ts` files. Supports
|
|
2388
|
+
* auto-discovery, Zod validation, and IDE autocomplete.
|
|
2389
|
+
*
|
|
2390
|
+
* @example
|
|
2391
|
+
* ```typescript
|
|
2392
|
+
* // agentv.config.ts
|
|
2393
|
+
* import { defineConfig } from '@agentv/core';
|
|
2394
|
+
*
|
|
2395
|
+
* export default defineConfig({
|
|
2396
|
+
* execution: {
|
|
2397
|
+
* workers: 5,
|
|
2398
|
+
* maxRetries: 2,
|
|
2399
|
+
* agentTimeoutMs: 120_000,
|
|
2400
|
+
* },
|
|
2401
|
+
* output: {
|
|
2402
|
+
* format: 'jsonl',
|
|
2403
|
+
* dir: './results',
|
|
2404
|
+
* },
|
|
2405
|
+
* });
|
|
2406
|
+
* ```
|
|
2407
|
+
*
|
|
2408
|
+
* @module
|
|
2409
|
+
*/
|
|
2410
|
+
|
|
2411
|
+
/**
|
|
2412
|
+
* Schema for AgentV project-level configuration.
|
|
2413
|
+
*/
|
|
2414
|
+
declare const AgentVConfigSchema: z.ZodObject<{
|
|
2415
|
+
/** Default execution settings */
|
|
2416
|
+
execution: z.ZodOptional<z.ZodObject<{
|
|
2417
|
+
/** Number of parallel workers (default: 3) */
|
|
2418
|
+
workers: z.ZodOptional<z.ZodNumber>;
|
|
2419
|
+
/** Maximum retries on failure (default: 2) */
|
|
2420
|
+
maxRetries: z.ZodOptional<z.ZodNumber>;
|
|
2421
|
+
/** Agent timeout in milliseconds (default: 120000) */
|
|
2422
|
+
agentTimeoutMs: z.ZodOptional<z.ZodNumber>;
|
|
2423
|
+
}, "strip", z.ZodTypeAny, {
|
|
2424
|
+
workers?: number | undefined;
|
|
2425
|
+
maxRetries?: number | undefined;
|
|
2426
|
+
agentTimeoutMs?: number | undefined;
|
|
2427
|
+
}, {
|
|
2428
|
+
workers?: number | undefined;
|
|
2429
|
+
maxRetries?: number | undefined;
|
|
2430
|
+
agentTimeoutMs?: number | undefined;
|
|
2431
|
+
}>>;
|
|
2432
|
+
/** Output settings */
|
|
2433
|
+
output: z.ZodOptional<z.ZodObject<{
|
|
2434
|
+
/** Output format */
|
|
2435
|
+
format: z.ZodOptional<z.ZodEnum<["jsonl", "yaml", "json", "xml"]>>;
|
|
2436
|
+
/** Output directory */
|
|
2437
|
+
dir: z.ZodOptional<z.ZodString>;
|
|
2438
|
+
}, "strip", z.ZodTypeAny, {
|
|
2439
|
+
dir?: string | undefined;
|
|
2440
|
+
format?: "yaml" | "jsonl" | "json" | "xml" | undefined;
|
|
2441
|
+
}, {
|
|
2442
|
+
dir?: string | undefined;
|
|
2443
|
+
format?: "yaml" | "jsonl" | "json" | "xml" | undefined;
|
|
2444
|
+
}>>;
|
|
2445
|
+
/** Response caching */
|
|
2446
|
+
cache: z.ZodOptional<z.ZodObject<{
|
|
2447
|
+
/** Enable response caching */
|
|
2448
|
+
enabled: z.ZodOptional<z.ZodBoolean>;
|
|
2449
|
+
/** Cache file path */
|
|
2450
|
+
path: z.ZodOptional<z.ZodString>;
|
|
2451
|
+
}, "strip", z.ZodTypeAny, {
|
|
2452
|
+
enabled?: boolean | undefined;
|
|
2453
|
+
path?: string | undefined;
|
|
2454
|
+
}, {
|
|
2455
|
+
enabled?: boolean | undefined;
|
|
2456
|
+
path?: string | undefined;
|
|
2457
|
+
}>>;
|
|
2458
|
+
/** Cost and duration limits */
|
|
2459
|
+
limits: z.ZodOptional<z.ZodObject<{
|
|
2460
|
+
/** Maximum cost per run in USD */
|
|
2461
|
+
maxCostUsd: z.ZodOptional<z.ZodNumber>;
|
|
2462
|
+
/** Maximum duration per run in milliseconds */
|
|
2463
|
+
maxDurationMs: z.ZodOptional<z.ZodNumber>;
|
|
2464
|
+
}, "strip", z.ZodTypeAny, {
|
|
2465
|
+
maxDurationMs?: number | undefined;
|
|
2466
|
+
maxCostUsd?: number | undefined;
|
|
2467
|
+
}, {
|
|
2468
|
+
maxDurationMs?: number | undefined;
|
|
2469
|
+
maxCostUsd?: number | undefined;
|
|
2470
|
+
}>>;
|
|
2471
|
+
}, "strip", z.ZodTypeAny, {
|
|
2472
|
+
output?: {
|
|
2473
|
+
dir?: string | undefined;
|
|
2474
|
+
format?: "yaml" | "jsonl" | "json" | "xml" | undefined;
|
|
2475
|
+
} | undefined;
|
|
2476
|
+
execution?: {
|
|
2477
|
+
workers?: number | undefined;
|
|
2478
|
+
maxRetries?: number | undefined;
|
|
2479
|
+
agentTimeoutMs?: number | undefined;
|
|
2480
|
+
} | undefined;
|
|
2481
|
+
cache?: {
|
|
2482
|
+
enabled?: boolean | undefined;
|
|
2483
|
+
path?: string | undefined;
|
|
2484
|
+
} | undefined;
|
|
2485
|
+
limits?: {
|
|
2486
|
+
maxDurationMs?: number | undefined;
|
|
2487
|
+
maxCostUsd?: number | undefined;
|
|
2488
|
+
} | undefined;
|
|
2489
|
+
}, {
|
|
2490
|
+
output?: {
|
|
2491
|
+
dir?: string | undefined;
|
|
2492
|
+
format?: "yaml" | "jsonl" | "json" | "xml" | undefined;
|
|
2493
|
+
} | undefined;
|
|
2494
|
+
execution?: {
|
|
2495
|
+
workers?: number | undefined;
|
|
2496
|
+
maxRetries?: number | undefined;
|
|
2497
|
+
agentTimeoutMs?: number | undefined;
|
|
2498
|
+
} | undefined;
|
|
2499
|
+
cache?: {
|
|
2500
|
+
enabled?: boolean | undefined;
|
|
2501
|
+
path?: string | undefined;
|
|
2502
|
+
} | undefined;
|
|
2503
|
+
limits?: {
|
|
2504
|
+
maxDurationMs?: number | undefined;
|
|
2505
|
+
maxCostUsd?: number | undefined;
|
|
2506
|
+
} | undefined;
|
|
2507
|
+
}>;
|
|
2508
|
+
/**
|
|
2509
|
+
* AgentV project-level configuration type.
|
|
2510
|
+
* Inferred from the Zod schema for full type safety.
|
|
2511
|
+
*/
|
|
2512
|
+
type AgentVConfig = z.infer<typeof AgentVConfigSchema>;
|
|
2513
|
+
/**
|
|
2514
|
+
* Define a typed AgentV configuration.
|
|
2515
|
+
*
|
|
2516
|
+
* Use this in `agentv.config.ts` at your project root. The configuration
|
|
2517
|
+
* is validated at load time and provides full IDE autocomplete.
|
|
2518
|
+
*
|
|
2519
|
+
* @param config - Configuration object
|
|
2520
|
+
* @returns Validated configuration
|
|
2521
|
+
*
|
|
2522
|
+
* @example
|
|
2523
|
+
* ```typescript
|
|
2524
|
+
* import { defineConfig } from '@agentv/core';
|
|
2525
|
+
*
|
|
2526
|
+
* export default defineConfig({
|
|
2527
|
+
* execution: { workers: 5 },
|
|
2528
|
+
* output: { format: 'jsonl', dir: './results' },
|
|
2529
|
+
* limits: { maxCostUsd: 10.0 },
|
|
2530
|
+
* });
|
|
2531
|
+
* ```
|
|
2532
|
+
*/
|
|
2533
|
+
declare function defineConfig(config: AgentVConfig): AgentVConfig;
|
|
2534
|
+
/**
|
|
2535
|
+
* Discover and load an AgentV config file from the project root.
|
|
2536
|
+
*
|
|
2537
|
+
* Searches for config files in discovery order. Returns null if
|
|
2538
|
+
* no config file is found.
|
|
2539
|
+
*
|
|
2540
|
+
* @param projectRoot - Project root directory to search from
|
|
2541
|
+
* @returns Loaded and validated config, or null if not found
|
|
2542
|
+
*/
|
|
2543
|
+
declare function loadTsConfig(projectRoot: string): Promise<AgentVConfig | null>;
|
|
2544
|
+
|
|
1392
2545
|
interface GenerateRubricsOptions {
|
|
1393
|
-
readonly
|
|
2546
|
+
readonly criteria: string;
|
|
1394
2547
|
readonly question?: string;
|
|
1395
2548
|
readonly referenceAnswer?: string;
|
|
1396
2549
|
readonly provider: Provider;
|
|
@@ -1400,9 +2553,339 @@ interface GenerateRubricsOptions {
|
|
|
1400
2553
|
*/
|
|
1401
2554
|
declare function generateRubrics(options: GenerateRubricsOptions): Promise<readonly RubricItem[]>;
|
|
1402
2555
|
|
|
2556
|
+
/**
|
|
2557
|
+
* Error thrown when the template path does not exist.
|
|
2558
|
+
*/
|
|
2559
|
+
declare class TemplateNotFoundError extends Error {
|
|
2560
|
+
constructor(templatePath: string);
|
|
2561
|
+
}
|
|
2562
|
+
/**
|
|
2563
|
+
* Error thrown when the template path is a file instead of a directory.
|
|
2564
|
+
*/
|
|
2565
|
+
declare class TemplateNotDirectoryError extends Error {
|
|
2566
|
+
constructor(templatePath: string);
|
|
2567
|
+
}
|
|
2568
|
+
/**
|
|
2569
|
+
* Error thrown when workspace creation fails, e.g. because of insufficient disk space or another I/O error.
|
|
2570
|
+
*/
|
|
2571
|
+
declare class WorkspaceCreationError extends Error {
|
|
2572
|
+
readonly cause?: Error | undefined;
|
|
2573
|
+
constructor(message: string, cause?: Error | undefined);
|
|
2574
|
+
}
|
|
2575
|
+
/**
|
|
2576
|
+
* Get the workspace path for a specific eval case.
|
|
2577
|
+
*
|
|
2578
|
+
* Workspace structure:
|
|
2579
|
+
* {workspaceRoot}/{evalRunId}/{caseId}
|
|
2580
|
+
*
|
|
2581
|
+
* Example:
|
|
2582
|
+
* ~/.agentv/workspaces/abc123/case-01
|
|
2583
|
+
*
|
|
2584
|
+
* @param evalRunId - The unique identifier for the evaluation run
|
|
2585
|
+
* @param caseId - The unique identifier for the evaluation case
|
|
2586
|
+
* @param workspaceRoot - Optional custom workspace root directory (defaults to ~/.agentv/workspaces)
|
|
2587
|
+
* @returns Absolute path to the workspace directory
|
|
2588
|
+
*/
|
|
2589
|
+
declare function getWorkspacePath(evalRunId: string, caseId: string, workspaceRoot?: string): string;
|
|
2590
|
+
/**
|
|
2591
|
+
* Create a temporary workspace by copying a template directory.
|
|
2592
|
+
*
|
|
2593
|
+
* The workspace is created at {workspaceRoot}/{evalRunId}/{caseId}/, where workspaceRoot defaults to ~/.agentv/workspaces.
|
|
2594
|
+
* The .git directory from the template is skipped during copy.
|
|
2595
|
+
*
|
|
2596
|
+
* @param templatePath - Absolute path to the template directory
|
|
2597
|
+
* @param evalRunId - The unique identifier for the evaluation run
|
|
2598
|
+
* @param caseId - The unique identifier for the evaluation case
|
|
2599
|
+
* @param workspaceRoot - Optional custom workspace root directory
|
|
2600
|
+
* @returns Absolute path to the created workspace directory
|
|
2601
|
+
* @throws TemplateNotFoundError if the template path does not exist
|
|
2602
|
+
* @throws TemplateNotDirectoryError if the template path is not a directory
|
|
2603
|
+
* @throws WorkspaceCreationError if there's an error creating the workspace
|
|
2604
|
+
*/
|
|
2605
|
+
declare function createTempWorkspace(templatePath: string, evalRunId: string, caseId: string, workspaceRoot?: string): Promise<string>;
|
|
2606
|
+
/**
|
|
2607
|
+
* Remove a single workspace directory.
|
|
2608
|
+
*
|
|
2609
|
+
* @param workspacePath - Absolute path to the workspace directory to remove
|
|
2610
|
+
* @throws Error if the cleanup fails
|
|
2611
|
+
*/
|
|
2612
|
+
declare function cleanupWorkspace(workspacePath: string): Promise<void>;
|
|
2613
|
+
/**
|
|
2614
|
+
* Remove all workspaces for an evaluation run.
|
|
2615
|
+
*
|
|
2616
|
+
* This removes the entire {workspaceRoot}/{evalRunId} directory,
|
|
2617
|
+
* cleaning up all case workspaces for that run.
|
|
2618
|
+
*
|
|
2619
|
+
* @param evalRunId - The unique identifier for the evaluation run
|
|
2620
|
+
* @param workspaceRoot - Optional custom workspace root directory
|
|
2621
|
+
* @throws Error if the cleanup fails
|
|
2622
|
+
*/
|
|
2623
|
+
declare function cleanupEvalWorkspaces(evalRunId: string, workspaceRoot?: string): Promise<void>;
|
|
2624
|
+
|
|
2625
|
+
/**
|
|
2626
|
+
* Context passed to workspace lifecycle scripts via stdin.
|
|
2627
|
+
*/
|
|
2628
|
+
interface ScriptExecutionContext {
|
|
2629
|
+
readonly workspacePath: string;
|
|
2630
|
+
readonly testId: string;
|
|
2631
|
+
readonly evalRunId: string;
|
|
2632
|
+
readonly caseInput?: string;
|
|
2633
|
+
readonly caseMetadata?: Record<string, unknown>;
|
|
2634
|
+
}
|
|
2635
|
+
type ScriptFailureMode = 'fatal' | 'warn';
|
|
2636
|
+
/**
|
|
2637
|
+
* Executes a workspace lifecycle script (before_all, after_all, before_each, after_each).
|
|
2638
|
+
*
|
|
2639
|
+
* @param config - Workspace script configuration (script, timeout_ms, cwd)
|
|
2640
|
+
* @param context - Context passed to script via stdin (JSON)
|
|
2641
|
+
* @param failureMode - 'fatal' throws on non-zero exit; 'warn' logs warning
|
|
2642
|
+
* @returns Captured stdout from the script
|
|
2643
|
+
* @throws Error if script exits with non-zero code (fatal mode) or times out
|
|
2644
|
+
*/
|
|
2645
|
+
declare function executeWorkspaceScript(config: WorkspaceScriptConfig, context: ScriptExecutionContext, failureMode?: ScriptFailureMode): Promise<string>;
|
|
2646
|
+
|
|
2647
|
+
/**
|
|
2648
|
+
* Initialize a git baseline for workspace file change tracking.
|
|
2649
|
+
*
|
|
2650
|
+
* Runs `git init` directly in the workspace, stages all files, and creates
|
|
2651
|
+
* a baseline commit. Returns the commit hash for later diffing.
|
|
2652
|
+
*/
|
|
2653
|
+
declare function initializeBaseline(workspacePath: string): Promise<string>;
|
|
2654
|
+
/**
|
|
2655
|
+
* Capture file changes from workspace relative to the baseline commit.
|
|
2656
|
+
* Returns a unified diff string, or empty string if no changes.
|
|
2657
|
+
*
|
|
2658
|
+
* Supports nested git repos (e.g. cloned dependencies): stages files inside
|
|
2659
|
+
* each child repo first, then uses `--submodule=diff` to expand submodule
|
|
2660
|
+
* changes into individual file diffs rather than opaque gitlink hashes.
|
|
2661
|
+
*/
|
|
2662
|
+
declare function captureFileChanges(workspacePath: string, baselineCommit: string): Promise<string>;
|
|
2663
|
+
|
|
2664
|
+
interface ResolvedWorkspaceTemplate {
|
|
2665
|
+
/** Directory to copy as the working directory (for createTempWorkspace / request.cwd) */
|
|
2666
|
+
readonly dir: string;
|
|
2667
|
+
/** Optional .code-workspace file for VS Code providers */
|
|
2668
|
+
readonly workspaceFile?: string;
|
|
2669
|
+
}
|
|
2670
|
+
/**
|
|
2671
|
+
* Resolves a workspace.template value into a directory + optional .code-workspace file.
|
|
2672
|
+
*
|
|
2673
|
+
* Resolution rules:
|
|
2674
|
+
* - .code-workspace file → dir = parent directory, workspaceFile = the file
|
|
2675
|
+
* - Directory with exactly 1 .code-workspace → dir = directory, workspaceFile = that file
|
|
2676
|
+
* - Directory with multiple .code-workspace files → dir = directory, workspaceFile = template.code-workspace (if present)
|
|
2677
|
+
* - Directory with 0 .code-workspace → dir = directory, workspaceFile = undefined
|
|
2678
|
+
*/
|
|
2679
|
+
declare function resolveWorkspaceTemplate(templatePath: string | undefined): Promise<ResolvedWorkspaceTemplate | undefined>;
|
|
2680
|
+
|
|
2681
|
+
/**
|
|
2682
|
+
* File-based LLM response cache.
|
|
2683
|
+
* Stores provider responses as JSON files keyed by SHA-256 hash.
|
|
2684
|
+
* Directory structure: <cache_path>/<first-2-chars>/<full-hash>.json
|
|
2685
|
+
*/
|
|
2686
|
+
declare class ResponseCache implements EvaluationCache {
|
|
2687
|
+
private readonly cachePath;
|
|
2688
|
+
constructor(cachePath?: string);
|
|
2689
|
+
get(key: string): Promise<ProviderResponse | undefined>;
|
|
2690
|
+
set(key: string, value: ProviderResponse): Promise<void>;
|
|
2691
|
+
private keyToPath;
|
|
2692
|
+
}
|
|
2693
|
+
/**
|
|
2694
|
+
* Determine whether caching should be active for a given run.
|
|
2695
|
+
*
|
|
2696
|
+
* Precedence:
|
|
2697
|
+
* 1. --no-cache CLI flag → always disabled
|
|
2698
|
+
* 2. --cache CLI flag OR execution.cache YAML → enabled
|
|
2699
|
+
* 3. Default → disabled (safe for variability testing)
|
|
2700
|
+
*/
|
|
2701
|
+
declare function shouldEnableCache(params: {
|
|
2702
|
+
cliCache: boolean;
|
|
2703
|
+
cliNoCache: boolean;
|
|
2704
|
+
yamlCache?: boolean;
|
|
2705
|
+
}): boolean;
|
|
2706
|
+
/**
|
|
2707
|
+
* Check whether caching should be skipped for a target with temperature > 0.
|
|
2708
|
+
* Non-deterministic responses should not be cached unless explicitly forced.
|
|
2709
|
+
*/
|
|
2710
|
+
declare function shouldSkipCacheForTemperature(targetConfig: Record<string, unknown>): boolean;
|
|
2711
|
+
|
|
2712
|
+
/**
|
|
2713
|
+
* Recursively converts all keys in an object from camelCase to snake_case.
|
|
2714
|
+
* This is used to convert TypeScript internal representations to snake_case
|
|
2715
|
+
* for Python ecosystem compatibility in JSON payloads.
|
|
2716
|
+
*
|
|
2717
|
+
* Conversion rules:
|
|
2718
|
+
* - Object keys: camelCase -> snake_case
|
|
2719
|
+
* - Array elements: recursively converted
|
|
2720
|
+
* - Primitives: returned unchanged
|
|
2721
|
+
* - null/undefined: returned unchanged
|
|
2722
|
+
*
|
|
2723
|
+
* @param obj - The object to convert (can be any JSON-serializable value)
|
|
2724
|
+
* @returns A new object with all keys converted to snake_case
|
|
2725
|
+
*/
|
|
2726
|
+
declare function toSnakeCaseDeep(obj: unknown): unknown;
|
|
2727
|
+
/**
|
|
2728
|
+
* Recursively converts all keys in an object from snake_case to camelCase.
|
|
2729
|
+
* This is used by optional SDK helpers to map wire payloads into TypeScript-friendly
|
|
2730
|
+
* shapes.
|
|
2731
|
+
*
|
|
2732
|
+
* @param obj - The object to convert (can be any JSON-serializable value)
|
|
2733
|
+
* @returns A new object with all keys converted to camelCase
|
|
2734
|
+
*/
|
|
2735
|
+
declare function toCamelCaseDeep(obj: unknown): unknown;
|
|
2736
|
+
|
|
2737
|
+
/**
|
|
2738
|
+
* Trims an EvaluationResult for baseline storage.
|
|
2739
|
+
* Strips large debug/audit fields (denylist approach) while preserving
|
|
2740
|
+
* all fields needed for regression comparison (scores, hits, misses, etc.).
|
|
2741
|
+
*
|
|
2742
|
+
* Returns a new object — the input is not mutated.
|
|
2743
|
+
*/
|
|
2744
|
+
declare function trimBaselineResult(result: EvaluationResult): EvaluationResult;
|
|
2745
|
+
|
|
2746
|
+
/** Options for configuring the OTel trace exporter. */
|
|
2747
|
+
interface OtelExportOptions {
|
|
2748
|
+
/** OTLP endpoint URL */
|
|
2749
|
+
readonly endpoint?: string;
|
|
2750
|
+
/** Custom headers (e.g., auth) */
|
|
2751
|
+
readonly headers?: Record<string, string>;
|
|
2752
|
+
/** Whether to include message content in spans */
|
|
2753
|
+
readonly captureContent?: boolean;
|
|
2754
|
+
/** Service name for OTel resource */
|
|
2755
|
+
readonly serviceName?: string;
|
|
2756
|
+
/** When true, group messages into turn spans for multi-turn evals */
|
|
2757
|
+
readonly groupTurns?: boolean;
|
|
2758
|
+
/** Path to write OTLP JSON file (importable by OTel backends) */
|
|
2759
|
+
readonly otlpFilePath?: string;
|
|
2760
|
+
/** Path to write human-readable simple JSONL trace file */
|
|
2761
|
+
readonly traceFilePath?: string;
|
|
2762
|
+
}
|
|
2763
|
+
/** Preset configuration for a known observability backend. */
|
|
2764
|
+
interface OtelBackendPreset {
|
|
2765
|
+
readonly name: string;
|
|
2766
|
+
readonly endpoint: string;
|
|
2767
|
+
readonly headers: (env: Record<string, string | undefined>) => Record<string, string>;
|
|
2768
|
+
}
|
|
2769
|
+
|
|
2770
|
+
declare const OTEL_BACKEND_PRESETS: Record<string, OtelBackendPreset>;
|
|
2771
|
+
type OtelApi = any;
|
|
2772
|
+
type Tracer = any;
|
|
2773
|
+
declare class OtelTraceExporter {
|
|
2774
|
+
private readonly options;
|
|
2775
|
+
private provider;
|
|
2776
|
+
private tracer;
|
|
2777
|
+
private api;
|
|
2778
|
+
private W3CPropagator;
|
|
2779
|
+
constructor(options: OtelExportOptions);
|
|
2780
|
+
/** Initialize the OTel SDK. Returns false if OTel packages are not available. */
|
|
2781
|
+
init(): Promise<boolean>;
|
|
2782
|
+
/** Export a single evaluation result as an OTel trace. */
|
|
2783
|
+
exportResult(result: EvaluationResult): Promise<void>;
|
|
2784
|
+
/** Flush pending spans and shut down. */
|
|
2785
|
+
shutdown(): Promise<void>;
|
|
2786
|
+
/** Create a streaming observer for real-time span export */
|
|
2787
|
+
createStreamingObserver(): OtelStreamingObserver | null;
|
|
2788
|
+
private exportMessage;
|
|
2789
|
+
}
|
|
2790
|
+
/**
|
|
2791
|
+
* Streaming observer that creates OTel spans in real-time during eval execution.
|
|
2792
|
+
* Spans are exported immediately via SimpleSpanProcessor as each tool call / LLM response completes.
|
|
2793
|
+
*/
|
|
2794
|
+
declare class OtelStreamingObserver {
|
|
2795
|
+
private readonly tracer;
|
|
2796
|
+
private readonly api;
|
|
2797
|
+
private readonly captureContent;
|
|
2798
|
+
private readonly parentCtx?;
|
|
2799
|
+
private rootSpan;
|
|
2800
|
+
private rootCtx;
|
|
2801
|
+
constructor(tracer: Tracer, api: OtelApi, captureContent: boolean, parentCtx?: any | undefined);
|
|
2802
|
+
/** Create root eval span immediately (visible in backend right away) */
|
|
2803
|
+
startEvalCase(testId: string, target: string, dataset?: string): void;
|
|
2804
|
+
/** Create and immediately export a tool span */
|
|
2805
|
+
onToolCall(name: string, input: unknown, output: unknown, _durationMs: number, toolCallId?: string): void;
|
|
2806
|
+
/** Create and immediately export an LLM span */
|
|
2807
|
+
onLlmCall(model: string, tokenUsage?: ProviderTokenUsage): void;
|
|
2808
|
+
/** Finalize root span with score/verdict after evaluation completes */
|
|
2809
|
+
finalizeEvalCase(score: number, error?: string): void;
|
|
2810
|
+
/** Get ProviderStreamCallbacks for passing to providers */
|
|
2811
|
+
getStreamCallbacks(): ProviderStreamCallbacks;
|
|
2812
|
+
}
|
|
2813
|
+
|
|
2814
|
+
type ReadableSpan$1 = any;
|
|
2815
|
+
/**
|
|
2816
|
+
* SpanExporter that writes OTLP JSON (the standard OTel wire format) to a file.
|
|
2817
|
+
* The file can be imported by any OTel-compatible backend.
|
|
2818
|
+
*/
|
|
2819
|
+
declare class OtlpJsonFileExporter {
|
|
2820
|
+
private spans;
|
|
2821
|
+
private filePath;
|
|
2822
|
+
constructor(filePath: string);
|
|
2823
|
+
export(spans: ReadableSpan$1[], resultCallback: (result: {
|
|
2824
|
+
code: number;
|
|
2825
|
+
}) => void): void;
|
|
2826
|
+
shutdown(): Promise<void>;
|
|
2827
|
+
forceFlush(): Promise<void>;
|
|
2828
|
+
private flush;
|
|
2829
|
+
}
|
|
2830
|
+
|
|
2831
|
+
type ReadableSpan = any;
|
|
2832
|
+
/**
|
|
2833
|
+
* SpanExporter that writes human-readable JSONL (one line per root span).
|
|
2834
|
+
* Designed for quick debugging and analysis without OTel tooling.
|
|
2835
|
+
*/
|
|
2836
|
+
declare class SimpleTraceFileExporter {
|
|
2837
|
+
private stream;
|
|
2838
|
+
private filePath;
|
|
2839
|
+
private streamReady;
|
|
2840
|
+
private pendingWrites;
|
|
2841
|
+
private _shuttingDown;
|
|
2842
|
+
constructor(filePath: string);
|
|
2843
|
+
private ensureStream;
|
|
2844
|
+
export(spans: ReadableSpan[], resultCallback: (result: {
|
|
2845
|
+
code: number;
|
|
2846
|
+
}) => void): void;
|
|
2847
|
+
shutdown(): Promise<void>;
|
|
2848
|
+
forceFlush(): Promise<void>;
|
|
2849
|
+
private collectChildren;
|
|
2850
|
+
private buildSimpleRecord;
|
|
2851
|
+
}
|
|
2852
|
+
|
|
2853
|
+
/**
|
|
2854
|
+
* Factory functions for all built-in evaluator types.
|
|
2855
|
+
*
|
|
2856
|
+
* Each factory creates an Evaluator instance from an EvaluatorConfig,
|
|
2857
|
+
* handling type-specific initialization logic. These are registered into
|
|
2858
|
+
* the EvaluatorRegistry at startup.
|
|
2859
|
+
*/
|
|
2860
|
+
|
|
2861
|
+
/**
|
|
2862
|
+
* Create a new EvaluatorRegistry with all built-in evaluator types registered.
|
|
2863
|
+
*/
|
|
2864
|
+
declare function createBuiltinRegistry(): EvaluatorRegistry;
|
|
2865
|
+
|
|
2866
|
+
/**
|
|
2867
|
+
* Convention-based discovery of custom assertion scripts.
|
|
2868
|
+
*
|
|
2869
|
+
* Scans `.agentv/assertions/` for TypeScript/JavaScript files and registers
|
|
2870
|
+
* them as code_judge evaluators in the registry. The file name (without
|
|
2871
|
+
* extension) becomes the evaluator type name.
|
|
2872
|
+
*
|
|
2873
|
+
* Example: `.agentv/assertions/sentiment.ts` → type "sentiment" in EVAL.yaml
|
|
2874
|
+
*/
|
|
2875
|
+
|
|
2876
|
+
/**
|
|
2877
|
+
* Discover custom assertion scripts from `.agentv/assertions/` and register
|
|
2878
|
+
* them as evaluator types in the registry.
|
|
2879
|
+
*
|
|
2880
|
+
* @param registry - The evaluator registry to register discovered assertions into
|
|
2881
|
+
* @param baseDir - The base directory to search from (typically project root or eval file dir)
|
|
2882
|
+
* @returns Names of discovered assertion types
|
|
2883
|
+
*/
|
|
2884
|
+
declare function discoverAssertions(registry: EvaluatorRegistry, baseDir: string): Promise<string[]>;
|
|
2885
|
+
|
|
1403
2886
|
type AgentKernel = {
|
|
1404
2887
|
status: string;
|
|
1405
2888
|
};
|
|
1406
2889
|
declare function createAgentKernel(): AgentKernel;
|
|
1407
2890
|
|
|
1408
|
-
export { type AgentKernel, type AnthropicResolvedConfig, type AssistantTestMessage, type AzureResolvedConfig, type ChildEvaluatorResult, type
|
|
2891
|
+
export { AgentJudgeEvaluator, type AgentJudgeEvaluatorConfig, type AgentJudgeEvaluatorOptions, type AgentKernel, type AgentVConfig$1 as AgentVConfig, type AgentVConfig as AgentVTsConfig, type AnthropicResolvedConfig, type ArgsMatchMode, type AssertionResult, type AssistantTestMessage, type AzureResolvedConfig, type CacheConfig, type ChildEvaluatorResult, type ClaudeResolvedConfig, type CliResolvedConfig, CodeEvaluator, type CodeEvaluatorConfig, type CodeEvaluatorOptions, type CompositeAggregatorConfig, CompositeEvaluator, type CompositeEvaluatorConfig, type CompositeEvaluatorOptions, type ConfidenceIntervalAggregation, type ContainsEvaluatorConfig, type CopilotCliResolvedConfig, type CopilotSdkResolvedConfig, CostEvaluator, type CostEvaluatorConfig, type CostEvaluatorOptions, DEFAULT_EVALUATOR_TEMPLATE, DEFAULT_EVAL_PATTERNS, DEFAULT_EXPLORATION_TOOLS, DeterministicAssertionEvaluator, type EnsureSubagentsOptions, type EnsureSubagentsResult, type EnvLookup, type EqualsEvaluatorConfig, type EvalAssertionInput, type EvalCase, type EvalConfig, type EvalMetadata, type EvalRunResult, type EvalSuiteResult, type EvalSummary, type EvalTest, type EvalTestInput, type EvaluationCache, type EvaluationContext, type EvaluationResult, type EvaluationScore, type EvaluationVerdict, type Evaluator, type EvaluatorConfig, type EvaluatorDispatchContext, type EvaluatorFactory, type EvaluatorFactoryFn, type EvaluatorKind, EvaluatorRegistry, type EvaluatorResult, type ExecutionMetrics, ExecutionMetricsEvaluator, type ExecutionMetricsEvaluatorConfig, type ExecutionMetricsEvaluatorOptions, FieldAccuracyEvaluator, type FieldAccuracyEvaluatorConfig, type FieldAccuracyEvaluatorOptions, type FieldAggregationType, type FieldConfig, type FieldMatchType, type GeminiResolvedConfig, type GenerateRubricsOptions, type IsJsonEvaluatorConfig, type JsonObject, type JsonPrimitive, type JsonValue, LatencyEvaluator, type LatencyEvaluatorConfig, type LatencyEvaluatorOptions, LlmJudgeEvaluator, type 
LlmJudgeEvaluatorConfig, type LlmJudgeEvaluatorOptions, type LlmJudgePromptAssembly, type MeanAggregation, type Message, type MockResolvedConfig, OTEL_BACKEND_PRESETS, type OtelBackendPreset, type OtelExportOptions, OtelStreamingObserver, OtelTraceExporter, OtlpJsonFileExporter, type OutputMessage, type PassAtKAggregation, type PiAgentSdkResolvedConfig, type PiCodingAgentResolvedConfig, type ProgressEvent, type PromptInputs, type PromptScriptConfig, type Provider, type ProviderFactoryFn, type ProviderKind, ProviderRegistry, type ProviderRequest, type ProviderResponse, type ProviderStreamCallbacks, type ProviderTokenUsage, type RegexEvaluatorConfig, type ResolvedTarget, type ResolvedWorkspaceTemplate, ResponseCache, type RubricItem, type RubricsEvaluatorConfig, type RunEvalCaseOptions, type RunEvaluationOptions, type ScoreRange, type ScriptExecutionContext, SimpleTraceFileExporter, type SystemTestMessage, TEST_MESSAGE_ROLES, type TargetAccessConfig, type TargetDefinition, TemplateNotDirectoryError, TemplateNotFoundError, type TestMessage, type TestMessageContent, type TestMessageRole, type TokenUsage, TokenUsageEvaluator, type TokenUsageEvaluatorConfig, type TokenUsageEvaluatorOptions, type ToolCall, type ToolTestMessage, ToolTrajectoryEvaluator, type ToolTrajectoryEvaluatorConfig, type ToolTrajectoryEvaluatorOptions, type ToolTrajectoryExpectedItem, type TraceSummary, type TrialAggregation, type TrialResult, type TrialStrategy, type TrialsConfig, type UserTestMessage, type VSCodeResolvedConfig, type WorkspaceConfig, WorkspaceCreationError, type WorkspaceScriptConfig, assembleLlmJudgePrompt, avgToolDurationMs, buildDirectoryChain, buildOutputSchema, buildPromptInputs, buildRubricOutputSchema, buildScoreRangeOutputSchema, buildSearchRoots, calculateRubricScore, captureFileChanges, clampScore, cleanupEvalWorkspaces, cleanupWorkspace, computeTraceSummary, consumeClaudeLogEntries, consumeCodexLogEntries, consumeCopilotCliLogEntries, consumeCopilotSdkLogEntries, 
consumePiLogEntries, createAgentKernel, createBuiltinProviderRegistry, createBuiltinRegistry, createProvider, createTempWorkspace, deepEqual, defineConfig, detectFormat, discoverAssertions, discoverProviders, ensureVSCodeSubagents, evaluate, executeScript, executeWorkspaceScript, explorationRatio, extractCacheConfig, extractJsonBlob, extractTargetFromSuite, extractTargetsFromSuite, extractTargetsFromTestCase, extractTrialsConfig, fileExists, findGitRoot, freeformEvaluationSchema, generateRubrics, getHitCount, getWorkspacePath, initializeBaseline, isEvaluatorKind, isGuidelineFile, isJsonObject, isJsonValue, isNonEmptyString, isTestMessage, isTestMessageRole, listTargetNames, loadConfig, loadEvalCaseById, loadEvalCases, loadEvalSuite, loadTestById, loadTestSuite, loadTests, loadTsConfig, mergeExecutionMetrics, negateScore, normalizeLineEndings, parseJsonFromText, parseJsonSafe, readJsonFile, readTargetDefinitions, readTestSuiteMetadata, readTextFile, resolveAndCreateProvider, resolveFileReference, resolveTargetDefinition, resolveWorkspaceTemplate, rubricEvaluationSchema, runContainsAssertion, runEqualsAssertion, runEvalCase, runEvaluation, runIsJsonAssertion, runRegexAssertion, scoreToVerdict, shouldEnableCache, shouldSkipCacheForTemperature, subscribeToClaudeLogEntries, subscribeToCodexLogEntries, subscribeToCopilotCliLogEntries, subscribeToCopilotSdkLogEntries, subscribeToPiLogEntries, substituteVariables, toCamelCaseDeep, toSnakeCaseDeep, tokensPerTool, trimBaselineResult };
|