@agentv/core 1.4.0 → 2.0.1
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/README.md +77 -77
- package/dist/{chunk-KPHTMTZ3.js → chunk-IBTKEEOT.js} +337 -83
- package/dist/chunk-IBTKEEOT.js.map +1 -0
- package/dist/evaluation/validation/index.cjs +83 -71
- package/dist/evaluation/validation/index.cjs.map +1 -1
- package/dist/evaluation/validation/index.js +3 -72
- package/dist/evaluation/validation/index.js.map +1 -1
- package/dist/index.cjs +4137 -1182
- package/dist/index.cjs.map +1 -1
- package/dist/index.d.cts +445 -40
- package/dist/index.d.ts +445 -40
- package/dist/index.js +3822 -1130
- package/dist/index.js.map +1 -1
- package/package.json +5 -2
- package/dist/chunk-KPHTMTZ3.js.map +0 -1
package/dist/index.d.cts
CHANGED
|
@@ -1,9 +1,21 @@
|
|
|
1
|
+
import { z } from 'zod';
|
|
1
2
|
import * as ai from 'ai';
|
|
2
3
|
|
|
3
4
|
/**
|
|
4
5
|
* Trace event types for capturing agent execution traces.
|
|
5
6
|
* Provides a normalized, provider-agnostic model for tool-call trajectories.
|
|
6
7
|
*/
|
|
8
|
+
/**
|
|
9
|
+
* Token usage metrics from provider execution.
|
|
10
|
+
*/
|
|
11
|
+
interface TokenUsage {
|
|
12
|
+
/** Input/prompt tokens consumed */
|
|
13
|
+
readonly input: number;
|
|
14
|
+
/** Output/completion tokens generated */
|
|
15
|
+
readonly output: number;
|
|
16
|
+
/** Cached tokens (optional, provider-specific) */
|
|
17
|
+
readonly cached?: number;
|
|
18
|
+
}
|
|
7
19
|
/**
|
|
8
20
|
* Compact summary of a trace for lightweight persistence.
|
|
9
21
|
* Included in results by default to avoid payload bloat.
|
|
@@ -17,6 +29,14 @@ interface TraceSummary {
|
|
|
17
29
|
readonly toolCallsByName: Readonly<Record<string, number>>;
|
|
18
30
|
/** Number of error events */
|
|
19
31
|
readonly errorCount: number;
|
|
32
|
+
/** Token usage metrics (optional, from provider) */
|
|
33
|
+
readonly tokenUsage?: TokenUsage;
|
|
34
|
+
/** Total cost in USD (optional, from provider) */
|
|
35
|
+
readonly costUsd?: number;
|
|
36
|
+
/** Total execution duration in milliseconds (optional) */
|
|
37
|
+
readonly durationMs?: number;
|
|
38
|
+
/** Per-tool duration arrays in milliseconds (optional) */
|
|
39
|
+
readonly toolDurations?: Readonly<Record<string, readonly number[]>>;
|
|
20
40
|
}
|
|
21
41
|
/**
|
|
22
42
|
* Configuration for tool_trajectory evaluator.
|
|
@@ -38,6 +58,8 @@ interface ToolTrajectoryEvaluatorConfig {
|
|
|
38
58
|
*/
|
|
39
59
|
interface ToolTrajectoryExpectedItem {
|
|
40
60
|
readonly tool: string;
|
|
61
|
+
/** Optional argument matching: 'any' skips validation, object performs partial deep equality */
|
|
62
|
+
readonly args?: 'any' | Record<string, unknown>;
|
|
41
63
|
}
|
|
42
64
|
/**
|
|
43
65
|
* Simplified input type for computeTraceSummary.
|
|
@@ -53,6 +75,53 @@ interface OutputMessageLike {
|
|
|
53
75
|
* Used for default result persistence without payload bloat.
|
|
54
76
|
*/
|
|
55
77
|
declare function computeTraceSummary(messages: readonly OutputMessageLike[]): TraceSummary;
|
|
78
|
+
/**
|
|
79
|
+
* Default tool names considered as exploration/read-only operations.
|
|
80
|
+
* Can be overridden per-evaluation via config.
|
|
81
|
+
*/
|
|
82
|
+
declare const DEFAULT_EXPLORATION_TOOLS: readonly ["read", "grep", "glob", "search", "list", "Read", "Grep", "Glob", "WebSearch", "WebFetch"];
|
|
83
|
+
/**
|
|
84
|
+
* Ratio of exploration tool calls to total tool calls.
|
|
85
|
+
* Returns undefined if there are no tool calls.
|
|
86
|
+
*
|
|
87
|
+
* @param summary - Trace summary with tool call counts
|
|
88
|
+
* @param explorationTools - Tool names considered exploration (defaults to DEFAULT_EXPLORATION_TOOLS)
|
|
89
|
+
* @returns Ratio between 0 and 1, or undefined if no tool calls
|
|
90
|
+
*/
|
|
91
|
+
declare function explorationRatio(summary: TraceSummary, explorationTools?: readonly string[]): number | undefined;
|
|
92
|
+
/**
|
|
93
|
+
* Average tokens consumed per tool call.
|
|
94
|
+
* Returns undefined if tokenUsage is not available or no tool calls.
|
|
95
|
+
*
|
|
96
|
+
* @param summary - Trace summary with optional token usage
|
|
97
|
+
* @returns Average tokens per tool call, or undefined
|
|
98
|
+
*/
|
|
99
|
+
declare function tokensPerTool(summary: TraceSummary): number | undefined;
|
|
100
|
+
/**
|
|
101
|
+
* Average tool duration across all tool calls.
|
|
102
|
+
* Returns undefined if toolDurations is not available or empty.
|
|
103
|
+
*
|
|
104
|
+
* @param summary - Trace summary with optional tool durations
|
|
105
|
+
* @returns Average duration in milliseconds, or undefined
|
|
106
|
+
*/
|
|
107
|
+
declare function avgToolDurationMs(summary: TraceSummary): number | undefined;
|
|
108
|
+
/**
|
|
109
|
+
* Execution metrics from provider response.
|
|
110
|
+
*/
|
|
111
|
+
interface ExecutionMetrics {
|
|
112
|
+
readonly tokenUsage?: TokenUsage;
|
|
113
|
+
readonly costUsd?: number;
|
|
114
|
+
readonly durationMs?: number;
|
|
115
|
+
}
|
|
116
|
+
/**
|
|
117
|
+
* Merge execution metrics from provider response into a trace summary.
|
|
118
|
+
* Returns a new TraceSummary with metrics fields populated.
|
|
119
|
+
*
|
|
120
|
+
* @param summary - Base trace summary from computeTraceSummary
|
|
121
|
+
* @param metrics - Optional execution metrics from provider
|
|
122
|
+
* @returns TraceSummary with merged metrics
|
|
123
|
+
*/
|
|
124
|
+
declare function mergeExecutionMetrics(summary: TraceSummary, metrics?: ExecutionMetrics): TraceSummary;
|
|
56
125
|
|
|
57
126
|
/**
|
|
58
127
|
* JSON primitive values appearing in AgentV payloads.
|
|
@@ -132,17 +201,19 @@ declare function isJsonValue(value: unknown): value is JsonValue;
|
|
|
132
201
|
* - Either content (string or array of objects) OR tool_calls (for assistant messages)
|
|
133
202
|
*/
|
|
134
203
|
declare function isTestMessage(value: unknown): value is TestMessage;
|
|
135
|
-
declare const EVALUATOR_KIND_VALUES: readonly ["code_judge", "llm_judge", "rubric", "composite", "tool_trajectory"];
|
|
204
|
+
declare const EVALUATOR_KIND_VALUES: readonly ["code_judge", "llm_judge", "rubric", "composite", "tool_trajectory", "field_accuracy", "latency", "cost", "token_usage"];
|
|
136
205
|
type EvaluatorKind = (typeof EVALUATOR_KIND_VALUES)[number];
|
|
137
206
|
declare function isEvaluatorKind(value: unknown): value is EvaluatorKind;
|
|
138
207
|
type CodeEvaluatorConfig = {
|
|
139
208
|
readonly name: string;
|
|
140
209
|
readonly type: 'code';
|
|
141
|
-
readonly script: string;
|
|
210
|
+
readonly script: readonly string[];
|
|
142
211
|
readonly resolvedScriptPath?: string;
|
|
143
212
|
readonly cwd?: string;
|
|
144
213
|
readonly resolvedCwd?: string;
|
|
145
214
|
readonly weight?: number;
|
|
215
|
+
/** Pass-through configuration for the code_judge script (any unrecognized YAML properties) */
|
|
216
|
+
readonly config?: JsonObject;
|
|
146
217
|
};
|
|
147
218
|
type LlmJudgeEvaluatorConfig = {
|
|
148
219
|
readonly name: string;
|
|
@@ -178,7 +249,85 @@ type CompositeEvaluatorConfig = {
|
|
|
178
249
|
readonly aggregator: CompositeAggregatorConfig;
|
|
179
250
|
readonly weight?: number;
|
|
180
251
|
};
|
|
181
|
-
|
|
252
|
+
/**
|
|
253
|
+
* Match type for field accuracy evaluation.
|
|
254
|
+
* Note: For fuzzy string matching (Levenshtein, Jaro-Winkler, etc.), use a code_judge evaluator.
|
|
255
|
+
* See examples/features/document-extraction/fuzzy_match.ts for an example.
|
|
256
|
+
*/
|
|
257
|
+
type FieldMatchType = 'exact' | 'numeric_tolerance' | 'date';
|
|
258
|
+
/**
|
|
259
|
+
* Aggregation strategy for combining field scores.
|
|
260
|
+
*/
|
|
261
|
+
type FieldAggregationType = 'weighted_average' | 'all_or_nothing';
|
|
262
|
+
/**
|
|
263
|
+
* Configuration for a single field to evaluate.
|
|
264
|
+
*/
|
|
265
|
+
type FieldConfig = {
|
|
266
|
+
/** Dot-notation path to the field (e.g., "invoice.vendor.name" or "items[0].amount") */
|
|
267
|
+
readonly path: string;
|
|
268
|
+
/** Match strategy for this field */
|
|
269
|
+
readonly match: FieldMatchType;
|
|
270
|
+
/** Whether this field is required (missing required fields count as failures) */
|
|
271
|
+
readonly required?: boolean;
|
|
272
|
+
/** Weight for aggregation (default: 1.0) */
|
|
273
|
+
readonly weight?: number;
|
|
274
|
+
/** Tolerance for numeric matching (absolute value unless relative is true) */
|
|
275
|
+
readonly tolerance?: number;
|
|
276
|
+
/** Whether tolerance is relative (percentage) vs absolute */
|
|
277
|
+
readonly relative?: boolean;
|
|
278
|
+
/** Date formats to try when parsing (default: common formats) */
|
|
279
|
+
readonly formats?: readonly string[];
|
|
280
|
+
};
|
|
281
|
+
/**
|
|
282
|
+
* Configuration for the field_accuracy evaluator.
|
|
283
|
+
*/
|
|
284
|
+
type FieldAccuracyEvaluatorConfig = {
|
|
285
|
+
readonly name: string;
|
|
286
|
+
readonly type: 'field_accuracy';
|
|
287
|
+
/** Fields to compare between candidate and expected */
|
|
288
|
+
readonly fields: readonly FieldConfig[];
|
|
289
|
+
/** Strategy for combining field scores (default: weighted_average) */
|
|
290
|
+
readonly aggregation?: FieldAggregationType;
|
|
291
|
+
readonly weight?: number;
|
|
292
|
+
};
|
|
293
|
+
/**
|
|
294
|
+
* Configuration for the latency evaluator.
|
|
295
|
+
* Checks execution duration against a threshold.
|
|
296
|
+
*/
|
|
297
|
+
type LatencyEvaluatorConfig = {
|
|
298
|
+
readonly name: string;
|
|
299
|
+
readonly type: 'latency';
|
|
300
|
+
/** Maximum allowed duration in milliseconds */
|
|
301
|
+
readonly threshold: number;
|
|
302
|
+
readonly weight?: number;
|
|
303
|
+
};
|
|
304
|
+
/**
|
|
305
|
+
* Configuration for the cost evaluator.
|
|
306
|
+
* Checks execution cost against a budget.
|
|
307
|
+
*/
|
|
308
|
+
type CostEvaluatorConfig = {
|
|
309
|
+
readonly name: string;
|
|
310
|
+
readonly type: 'cost';
|
|
311
|
+
/** Maximum allowed cost in USD */
|
|
312
|
+
readonly budget: number;
|
|
313
|
+
readonly weight?: number;
|
|
314
|
+
};
|
|
315
|
+
/**
|
|
316
|
+
* Configuration for the token_usage evaluator.
|
|
317
|
+
* Checks provider-reported token usage against configured limits.
|
|
318
|
+
*/
|
|
319
|
+
type TokenUsageEvaluatorConfig = {
|
|
320
|
+
readonly name: string;
|
|
321
|
+
readonly type: 'token_usage';
|
|
322
|
+
/** Maximum allowed total tokens (input + output + cached, when present) */
|
|
323
|
+
readonly max_total?: number;
|
|
324
|
+
/** Maximum allowed input tokens (prompt) */
|
|
325
|
+
readonly max_input?: number;
|
|
326
|
+
/** Maximum allowed output tokens (completion) */
|
|
327
|
+
readonly max_output?: number;
|
|
328
|
+
readonly weight?: number;
|
|
329
|
+
};
|
|
330
|
+
type EvaluatorConfig = CodeEvaluatorConfig | LlmJudgeEvaluatorConfig | CompositeEvaluatorConfig | ToolTrajectoryEvaluatorConfig | FieldAccuracyEvaluatorConfig | LatencyEvaluatorConfig | CostEvaluatorConfig | TokenUsageEvaluatorConfig;
|
|
182
331
|
/**
|
|
183
332
|
* Eval case definition sourced from AgentV specs.
|
|
184
333
|
*/
|
|
@@ -204,23 +353,22 @@ interface EvalCase {
|
|
|
204
353
|
*/
|
|
205
354
|
interface EvaluationResult {
|
|
206
355
|
readonly timestamp: string;
|
|
207
|
-
readonly
|
|
356
|
+
readonly evalId: string;
|
|
208
357
|
readonly dataset?: string;
|
|
209
|
-
readonly
|
|
358
|
+
readonly conversationId?: string;
|
|
210
359
|
readonly score: number;
|
|
211
360
|
readonly hits: readonly string[];
|
|
212
361
|
readonly misses: readonly string[];
|
|
213
|
-
readonly
|
|
362
|
+
readonly candidateAnswer: string;
|
|
214
363
|
readonly target: string;
|
|
215
364
|
readonly reasoning?: string;
|
|
216
|
-
readonly
|
|
217
|
-
readonly
|
|
218
|
-
readonly
|
|
219
|
-
readonly
|
|
220
|
-
readonly evaluator_results?: readonly EvaluatorResult[];
|
|
365
|
+
readonly agentProviderRequest?: JsonObject;
|
|
366
|
+
readonly lmProviderRequest?: JsonObject;
|
|
367
|
+
readonly evaluatorProviderRequest?: JsonObject;
|
|
368
|
+
readonly evaluatorResults?: readonly EvaluatorResult[];
|
|
221
369
|
readonly error?: string;
|
|
222
370
|
/** Lightweight summary of the execution trace (always included when available) */
|
|
223
|
-
readonly
|
|
371
|
+
readonly traceSummary?: TraceSummary;
|
|
224
372
|
}
|
|
225
373
|
type EvaluationVerdict = 'pass' | 'fail' | 'borderline';
|
|
226
374
|
interface EvaluatorResult {
|
|
@@ -232,9 +380,9 @@ interface EvaluatorResult {
|
|
|
232
380
|
readonly hits: readonly string[];
|
|
233
381
|
readonly misses: readonly string[];
|
|
234
382
|
readonly reasoning?: string;
|
|
235
|
-
readonly
|
|
236
|
-
readonly
|
|
237
|
-
readonly
|
|
383
|
+
readonly rawRequest?: JsonObject;
|
|
384
|
+
readonly evaluatorProviderRequest?: JsonObject;
|
|
385
|
+
readonly evaluatorResults?: readonly EvaluatorResult[];
|
|
238
386
|
}
|
|
239
387
|
/**
|
|
240
388
|
* Convenience accessor matching the Python hit_count property.
|
|
@@ -248,7 +396,7 @@ interface ChatMessage {
|
|
|
248
396
|
readonly name?: string;
|
|
249
397
|
}
|
|
250
398
|
type ChatPrompt = readonly ChatMessage[];
|
|
251
|
-
type ProviderKind = 'azure' | 'anthropic' | 'gemini' | 'codex' | 'cli' | 'mock' | 'vscode' | 'vscode-insiders';
|
|
399
|
+
type ProviderKind = 'azure' | 'anthropic' | 'gemini' | 'codex' | 'pi-coding-agent' | 'claude-code' | 'cli' | 'mock' | 'vscode' | 'vscode-insiders';
|
|
252
400
|
interface ProviderRequest {
|
|
253
401
|
readonly question: string;
|
|
254
402
|
readonly systemPrompt?: string;
|
|
@@ -297,11 +445,28 @@ interface OutputMessage {
|
|
|
297
445
|
/** Provider-specific metadata */
|
|
298
446
|
readonly metadata?: Record<string, unknown>;
|
|
299
447
|
}
|
|
448
|
+
/**
|
|
449
|
+
* Token usage metrics reported by provider.
|
|
450
|
+
*/
|
|
451
|
+
interface ProviderTokenUsage {
|
|
452
|
+
/** Input/prompt tokens consumed */
|
|
453
|
+
readonly input: number;
|
|
454
|
+
/** Output/completion tokens generated */
|
|
455
|
+
readonly output: number;
|
|
456
|
+
/** Cached tokens (optional, provider-specific) */
|
|
457
|
+
readonly cached?: number;
|
|
458
|
+
}
|
|
300
459
|
interface ProviderResponse {
|
|
301
460
|
readonly raw?: unknown;
|
|
302
461
|
readonly usage?: JsonObject;
|
|
303
462
|
/** Output messages from agent execution (primary source for tool trajectory) */
|
|
304
463
|
readonly outputMessages?: readonly OutputMessage[];
|
|
464
|
+
/** Token usage metrics (optional) */
|
|
465
|
+
readonly tokenUsage?: ProviderTokenUsage;
|
|
466
|
+
/** Total cost in USD (optional) */
|
|
467
|
+
readonly costUsd?: number;
|
|
468
|
+
/** Execution duration in milliseconds (optional) */
|
|
469
|
+
readonly durationMs?: number;
|
|
305
470
|
}
|
|
306
471
|
interface Provider {
|
|
307
472
|
readonly id: string;
|
|
@@ -482,6 +647,101 @@ declare function resolveFileReference(rawValue: string, searchRoots: readonly st
|
|
|
482
647
|
readonly attempted: readonly string[];
|
|
483
648
|
}>;
|
|
484
649
|
|
|
650
|
+
/**
|
|
651
|
+
* Strict normalized schema for CLI target configuration.
|
|
652
|
+
* This is the final validated shape after environment variable resolution
|
|
653
|
+
* and snake_case to camelCase normalization.
|
|
654
|
+
*
|
|
655
|
+
* Uses .strict() to reject unknown properties, ensuring configuration
|
|
656
|
+
* errors are caught early rather than silently ignored.
|
|
657
|
+
*
|
|
658
|
+
* @example
|
|
659
|
+
* ```typescript
|
|
660
|
+
* const config: CliNormalizedConfig = {
|
|
661
|
+
* commandTemplate: 'agent run {PROMPT}',
|
|
662
|
+
* timeoutMs: 120000,
|
|
663
|
+
* verbose: true,
|
|
664
|
+
* };
|
|
665
|
+
* CliTargetConfigSchema.parse(config); // Validates the normalized config
|
|
666
|
+
* ```
|
|
667
|
+
*/
|
|
668
|
+
declare const CliTargetConfigSchema: z.ZodObject<{
|
|
669
|
+
commandTemplate: z.ZodString;
|
|
670
|
+
filesFormat: z.ZodOptional<z.ZodString>;
|
|
671
|
+
cwd: z.ZodOptional<z.ZodString>;
|
|
672
|
+
timeoutMs: z.ZodOptional<z.ZodNumber>;
|
|
673
|
+
healthcheck: z.ZodOptional<z.ZodDiscriminatedUnion<"type", [z.ZodObject<{
|
|
674
|
+
type: z.ZodLiteral<"http">;
|
|
675
|
+
url: z.ZodString;
|
|
676
|
+
timeoutMs: z.ZodOptional<z.ZodNumber>;
|
|
677
|
+
}, "strict", z.ZodTypeAny, {
|
|
678
|
+
type: "http";
|
|
679
|
+
url: string;
|
|
680
|
+
timeoutMs?: number | undefined;
|
|
681
|
+
}, {
|
|
682
|
+
type: "http";
|
|
683
|
+
url: string;
|
|
684
|
+
timeoutMs?: number | undefined;
|
|
685
|
+
}>, z.ZodObject<{
|
|
686
|
+
type: z.ZodLiteral<"command">;
|
|
687
|
+
commandTemplate: z.ZodString;
|
|
688
|
+
cwd: z.ZodOptional<z.ZodString>;
|
|
689
|
+
timeoutMs: z.ZodOptional<z.ZodNumber>;
|
|
690
|
+
}, "strict", z.ZodTypeAny, {
|
|
691
|
+
type: "command";
|
|
692
|
+
commandTemplate: string;
|
|
693
|
+
cwd?: string | undefined;
|
|
694
|
+
timeoutMs?: number | undefined;
|
|
695
|
+
}, {
|
|
696
|
+
type: "command";
|
|
697
|
+
commandTemplate: string;
|
|
698
|
+
cwd?: string | undefined;
|
|
699
|
+
timeoutMs?: number | undefined;
|
|
700
|
+
}>]>>;
|
|
701
|
+
verbose: z.ZodOptional<z.ZodBoolean>;
|
|
702
|
+
keepTempFiles: z.ZodOptional<z.ZodBoolean>;
|
|
703
|
+
}, "strict", z.ZodTypeAny, {
|
|
704
|
+
commandTemplate: string;
|
|
705
|
+
cwd?: string | undefined;
|
|
706
|
+
verbose?: boolean | undefined;
|
|
707
|
+
filesFormat?: string | undefined;
|
|
708
|
+
healthcheck?: {
|
|
709
|
+
type: "http";
|
|
710
|
+
url: string;
|
|
711
|
+
timeoutMs?: number | undefined;
|
|
712
|
+
} | {
|
|
713
|
+
type: "command";
|
|
714
|
+
commandTemplate: string;
|
|
715
|
+
cwd?: string | undefined;
|
|
716
|
+
timeoutMs?: number | undefined;
|
|
717
|
+
} | undefined;
|
|
718
|
+
keepTempFiles?: boolean | undefined;
|
|
719
|
+
timeoutMs?: number | undefined;
|
|
720
|
+
}, {
|
|
721
|
+
commandTemplate: string;
|
|
722
|
+
cwd?: string | undefined;
|
|
723
|
+
verbose?: boolean | undefined;
|
|
724
|
+
filesFormat?: string | undefined;
|
|
725
|
+
healthcheck?: {
|
|
726
|
+
type: "http";
|
|
727
|
+
url: string;
|
|
728
|
+
timeoutMs?: number | undefined;
|
|
729
|
+
} | {
|
|
730
|
+
type: "command";
|
|
731
|
+
commandTemplate: string;
|
|
732
|
+
cwd?: string | undefined;
|
|
733
|
+
timeoutMs?: number | undefined;
|
|
734
|
+
} | undefined;
|
|
735
|
+
keepTempFiles?: boolean | undefined;
|
|
736
|
+
timeoutMs?: number | undefined;
|
|
737
|
+
}>;
|
|
738
|
+
type CliNormalizedConfig = z.infer<typeof CliTargetConfigSchema>;
|
|
739
|
+
/**
|
|
740
|
+
* Resolved CLI configuration type derived from CliTargetConfigSchema.
|
|
741
|
+
* This is the final validated shape used by the CLI provider at runtime.
|
|
742
|
+
* Wrapped in Readonly so top-level properties cannot be reassigned (compile-time check only; nested objects are not frozen).
|
|
743
|
+
*/
|
|
744
|
+
type CliResolvedConfig = Readonly<CliNormalizedConfig>;
|
|
485
745
|
interface RetryConfig {
|
|
486
746
|
readonly maxRetries?: number;
|
|
487
747
|
readonly initialDelayMs?: number;
|
|
@@ -529,6 +789,31 @@ interface CodexResolvedConfig {
|
|
|
529
789
|
readonly timeoutMs?: number;
|
|
530
790
|
readonly logDir?: string;
|
|
531
791
|
readonly logFormat?: 'summary' | 'json';
|
|
792
|
+
readonly systemPrompt?: string;
|
|
793
|
+
}
|
|
794
|
+
interface PiCodingAgentResolvedConfig {
|
|
795
|
+
readonly executable: string;
|
|
796
|
+
readonly provider?: string;
|
|
797
|
+
readonly model?: string;
|
|
798
|
+
readonly apiKey?: string;
|
|
799
|
+
readonly tools?: string;
|
|
800
|
+
readonly thinking?: string;
|
|
801
|
+
readonly args?: readonly string[];
|
|
802
|
+
readonly cwd?: string;
|
|
803
|
+
readonly timeoutMs?: number;
|
|
804
|
+
readonly logDir?: string;
|
|
805
|
+
readonly logFormat?: 'summary' | 'json';
|
|
806
|
+
readonly systemPrompt?: string;
|
|
807
|
+
}
|
|
808
|
+
interface ClaudeCodeResolvedConfig {
|
|
809
|
+
readonly executable: string;
|
|
810
|
+
readonly model?: string;
|
|
811
|
+
readonly systemPrompt?: string;
|
|
812
|
+
readonly args?: readonly string[];
|
|
813
|
+
readonly cwd?: string;
|
|
814
|
+
readonly timeoutMs?: number;
|
|
815
|
+
readonly logDir?: string;
|
|
816
|
+
readonly logFormat?: 'summary' | 'json';
|
|
532
817
|
}
|
|
533
818
|
interface MockResolvedConfig {
|
|
534
819
|
readonly response?: string;
|
|
@@ -543,25 +828,6 @@ interface VSCodeResolvedConfig {
|
|
|
543
828
|
readonly subagentRoot?: string;
|
|
544
829
|
readonly workspaceTemplate?: string;
|
|
545
830
|
}
|
|
546
|
-
type CliHealthcheck = {
|
|
547
|
-
readonly type: 'http';
|
|
548
|
-
readonly url: string;
|
|
549
|
-
readonly timeoutMs?: number;
|
|
550
|
-
} | {
|
|
551
|
-
readonly type: 'command';
|
|
552
|
-
readonly commandTemplate: string;
|
|
553
|
-
readonly timeoutMs?: number;
|
|
554
|
-
readonly cwd?: string;
|
|
555
|
-
};
|
|
556
|
-
interface CliResolvedConfig {
|
|
557
|
-
readonly commandTemplate: string;
|
|
558
|
-
readonly filesFormat?: string;
|
|
559
|
-
readonly cwd?: string;
|
|
560
|
-
readonly timeoutMs?: number;
|
|
561
|
-
readonly healthcheck?: CliHealthcheck;
|
|
562
|
-
readonly verbose?: boolean;
|
|
563
|
-
readonly keepTempFiles?: boolean;
|
|
564
|
-
}
|
|
565
831
|
type ResolvedTarget = {
|
|
566
832
|
readonly kind: 'azure';
|
|
567
833
|
readonly name: string;
|
|
@@ -590,6 +856,20 @@ type ResolvedTarget = {
|
|
|
590
856
|
readonly workers?: number;
|
|
591
857
|
readonly providerBatching?: boolean;
|
|
592
858
|
readonly config: CodexResolvedConfig;
|
|
859
|
+
} | {
|
|
860
|
+
readonly kind: 'pi-coding-agent';
|
|
861
|
+
readonly name: string;
|
|
862
|
+
readonly judgeTarget?: string;
|
|
863
|
+
readonly workers?: number;
|
|
864
|
+
readonly providerBatching?: boolean;
|
|
865
|
+
readonly config: PiCodingAgentResolvedConfig;
|
|
866
|
+
} | {
|
|
867
|
+
readonly kind: 'claude-code';
|
|
868
|
+
readonly name: string;
|
|
869
|
+
readonly judgeTarget?: string;
|
|
870
|
+
readonly workers?: number;
|
|
871
|
+
readonly providerBatching?: boolean;
|
|
872
|
+
readonly config: ClaudeCodeResolvedConfig;
|
|
593
873
|
} | {
|
|
594
874
|
readonly kind: 'mock';
|
|
595
875
|
readonly name: string;
|
|
@@ -645,6 +925,26 @@ type CodexLogListener = (entry: CodexLogEntry) => void;
|
|
|
645
925
|
declare function consumeCodexLogEntries(): CodexLogEntry[];
|
|
646
926
|
declare function subscribeToCodexLogEntries(listener: CodexLogListener): () => void;
|
|
647
927
|
|
|
928
|
+
type PiLogEntry = {
|
|
929
|
+
readonly filePath: string;
|
|
930
|
+
readonly evalCaseId?: string;
|
|
931
|
+
readonly targetName: string;
|
|
932
|
+
readonly attempt?: number;
|
|
933
|
+
};
|
|
934
|
+
type PiLogListener = (entry: PiLogEntry) => void;
|
|
935
|
+
declare function consumePiLogEntries(): PiLogEntry[];
|
|
936
|
+
declare function subscribeToPiLogEntries(listener: PiLogListener): () => void;
|
|
937
|
+
|
|
938
|
+
type ClaudeCodeLogEntry = {
|
|
939
|
+
readonly filePath: string;
|
|
940
|
+
readonly evalCaseId?: string;
|
|
941
|
+
readonly targetName: string;
|
|
942
|
+
readonly attempt?: number;
|
|
943
|
+
};
|
|
944
|
+
type ClaudeCodeLogListener = (entry: ClaudeCodeLogEntry) => void;
|
|
945
|
+
declare function consumeClaudeCodeLogEntries(): ClaudeCodeLogEntry[];
|
|
946
|
+
declare function subscribeToClaudeCodeLogEntries(listener: ClaudeCodeLogListener): () => void;
|
|
947
|
+
|
|
648
948
|
declare function createProvider(target: ResolvedTarget): Provider;
|
|
649
949
|
declare function resolveAndCreateProvider(definition: TargetDefinition, env?: EnvLookup): Provider;
|
|
650
950
|
|
|
@@ -676,7 +976,6 @@ interface EvaluationScore {
|
|
|
676
976
|
readonly misses: readonly string[];
|
|
677
977
|
readonly expectedAspectCount: number;
|
|
678
978
|
readonly reasoning?: string;
|
|
679
|
-
readonly rawAspects?: readonly string[];
|
|
680
979
|
readonly evaluatorRawRequest?: JsonObject;
|
|
681
980
|
readonly evaluatorResults?: readonly ChildEvaluatorResult[];
|
|
682
981
|
}
|
|
@@ -717,15 +1016,18 @@ declare class LlmJudgeEvaluator implements Evaluator {
|
|
|
717
1016
|
private runWithRetry;
|
|
718
1017
|
}
|
|
719
1018
|
interface CodeEvaluatorOptions {
|
|
720
|
-
readonly script: string;
|
|
1019
|
+
readonly script: readonly string[];
|
|
721
1020
|
readonly cwd?: string;
|
|
722
1021
|
readonly agentTimeoutMs?: number;
|
|
1022
|
+
/** Pass-through configuration from YAML (any unrecognized properties) */
|
|
1023
|
+
readonly config?: Record<string, unknown>;
|
|
723
1024
|
}
|
|
724
1025
|
declare class CodeEvaluator implements Evaluator {
|
|
725
1026
|
readonly kind = "code";
|
|
726
1027
|
private readonly script;
|
|
727
1028
|
private readonly cwd?;
|
|
728
1029
|
private readonly agentTimeoutMs?;
|
|
1030
|
+
private readonly config?;
|
|
729
1031
|
constructor(options: CodeEvaluatorOptions);
|
|
730
1032
|
evaluate(context: EvaluationContext): Promise<EvaluationScore>;
|
|
731
1033
|
}
|
|
@@ -749,6 +1051,44 @@ declare class ToolTrajectoryEvaluator implements Evaluator {
|
|
|
749
1051
|
private evaluateInOrder;
|
|
750
1052
|
private evaluateExact;
|
|
751
1053
|
}
|
|
1054
|
+
interface FieldAccuracyEvaluatorOptions {
|
|
1055
|
+
readonly config: FieldAccuracyEvaluatorConfig;
|
|
1056
|
+
}
|
|
1057
|
+
/**
|
|
1058
|
+
* FieldAccuracyEvaluator compares extracted structured data against expected values
|
|
1059
|
+
* with configurable matching strategies (exact, fuzzy, numeric_tolerance, date).
|
|
1060
|
+
*/
|
|
1061
|
+
declare class FieldAccuracyEvaluator implements Evaluator {
|
|
1062
|
+
readonly kind = "field_accuracy";
|
|
1063
|
+
private readonly config;
|
|
1064
|
+
constructor(options: FieldAccuracyEvaluatorOptions);
|
|
1065
|
+
evaluate(context: EvaluationContext): EvaluationScore;
|
|
1066
|
+
/**
|
|
1067
|
+
* Extract expected data from expected_messages array.
|
|
1068
|
+
* Looks for the last assistant message with content.
|
|
1069
|
+
*/
|
|
1070
|
+
private extractExpectedData;
|
|
1071
|
+
/**
|
|
1072
|
+
* Evaluate a single field against the expected value.
|
|
1073
|
+
*/
|
|
1074
|
+
private evaluateField;
|
|
1075
|
+
/**
|
|
1076
|
+
* Exact equality comparison.
|
|
1077
|
+
*/
|
|
1078
|
+
private compareExact;
|
|
1079
|
+
/**
|
|
1080
|
+
* Numeric comparison with absolute or relative tolerance.
|
|
1081
|
+
*/
|
|
1082
|
+
private compareNumericTolerance;
|
|
1083
|
+
/**
|
|
1084
|
+
* Date comparison with format normalization.
|
|
1085
|
+
*/
|
|
1086
|
+
private compareDate;
|
|
1087
|
+
/**
|
|
1088
|
+
* Aggregate field results using configured strategy.
|
|
1089
|
+
*/
|
|
1090
|
+
private aggregateResults;
|
|
1091
|
+
}
|
|
752
1092
|
interface EvaluatorFactory {
|
|
753
1093
|
create(config: EvaluatorConfig, context: EvaluationContext): Evaluator;
|
|
754
1094
|
}
|
|
@@ -769,6 +1109,45 @@ declare class CompositeEvaluator implements Evaluator {
|
|
|
769
1109
|
private runCodeAggregator;
|
|
770
1110
|
private runLlmAggregator;
|
|
771
1111
|
}
|
|
1112
|
+
interface LatencyEvaluatorOptions {
|
|
1113
|
+
readonly config: LatencyEvaluatorConfig;
|
|
1114
|
+
}
|
|
1115
|
+
/**
|
|
1116
|
+
* Evaluator that checks execution duration against a threshold.
|
|
1117
|
+
* Uses traceSummary.durationMs from the evaluation context.
|
|
1118
|
+
*/
|
|
1119
|
+
declare class LatencyEvaluator implements Evaluator {
|
|
1120
|
+
readonly kind = "latency";
|
|
1121
|
+
private readonly config;
|
|
1122
|
+
constructor(options: LatencyEvaluatorOptions);
|
|
1123
|
+
evaluate(context: EvaluationContext): EvaluationScore;
|
|
1124
|
+
}
|
|
1125
|
+
interface CostEvaluatorOptions {
|
|
1126
|
+
readonly config: CostEvaluatorConfig;
|
|
1127
|
+
}
|
|
1128
|
+
/**
|
|
1129
|
+
* Evaluator that checks execution cost against a budget.
|
|
1130
|
+
* Uses traceSummary.costUsd from the evaluation context.
|
|
1131
|
+
*/
|
|
1132
|
+
declare class CostEvaluator implements Evaluator {
|
|
1133
|
+
readonly kind = "cost";
|
|
1134
|
+
private readonly config;
|
|
1135
|
+
constructor(options: CostEvaluatorOptions);
|
|
1136
|
+
evaluate(context: EvaluationContext): EvaluationScore;
|
|
1137
|
+
}
|
|
1138
|
+
interface TokenUsageEvaluatorOptions {
|
|
1139
|
+
readonly config: TokenUsageEvaluatorConfig;
|
|
1140
|
+
}
|
|
1141
|
+
/**
|
|
1142
|
+
* Evaluator that checks provider-reported token usage against configured limits.
|
|
1143
|
+
* Uses traceSummary.tokenUsage from the evaluation context.
|
|
1144
|
+
*/
|
|
1145
|
+
declare class TokenUsageEvaluator implements Evaluator {
|
|
1146
|
+
readonly kind = "token_usage";
|
|
1147
|
+
private readonly config;
|
|
1148
|
+
constructor(options: TokenUsageEvaluatorOptions);
|
|
1149
|
+
evaluate(context: EvaluationContext): EvaluationScore;
|
|
1150
|
+
}
|
|
772
1151
|
|
|
773
1152
|
type MaybePromise<T> = T | Promise<T>;
|
|
774
1153
|
interface EvaluationCache {
|
|
@@ -785,7 +1164,6 @@ interface RunEvalCaseOptions {
|
|
|
785
1164
|
readonly now?: () => Date;
|
|
786
1165
|
readonly maxRetries?: number;
|
|
787
1166
|
readonly agentTimeoutMs?: number;
|
|
788
|
-
readonly promptDumpDir?: string;
|
|
789
1167
|
readonly cache?: EvaluationCache;
|
|
790
1168
|
readonly useCache?: boolean;
|
|
791
1169
|
readonly signal?: AbortSignal;
|
|
@@ -809,7 +1187,6 @@ interface RunEvaluationOptions {
|
|
|
809
1187
|
readonly evaluators?: Partial<Record<string, Evaluator>>;
|
|
810
1188
|
readonly maxRetries?: number;
|
|
811
1189
|
readonly agentTimeoutMs?: number;
|
|
812
|
-
readonly promptDumpDir?: string;
|
|
813
1190
|
readonly cache?: EvaluationCache;
|
|
814
1191
|
readonly useCache?: boolean;
|
|
815
1192
|
readonly now?: () => Date;
|
|
@@ -834,9 +1211,37 @@ interface GenerateRubricsOptions {
|
|
|
834
1211
|
*/
|
|
835
1212
|
declare function generateRubrics(options: GenerateRubricsOptions): Promise<readonly RubricItem[]>;
|
|
836
1213
|
|
|
1214
|
+
/**
|
|
1215
|
+
* Payload received by code judges via stdin.
|
|
1216
|
+
* All properties use camelCase for TypeScript ergonomics.
|
|
1217
|
+
*/
|
|
1218
|
+
interface CodeJudgePayload {
|
|
1219
|
+
readonly question: string;
|
|
1220
|
+
readonly expectedOutcome: string;
|
|
1221
|
+
readonly expectedMessages: readonly JsonObject[];
|
|
1222
|
+
readonly referenceAnswer?: string;
|
|
1223
|
+
readonly candidateAnswer: string;
|
|
1224
|
+
readonly outputMessages?: readonly OutputMessage[] | null;
|
|
1225
|
+
readonly guidelineFiles: readonly string[];
|
|
1226
|
+
readonly inputFiles: readonly string[];
|
|
1227
|
+
readonly inputMessages: readonly TestMessage[];
|
|
1228
|
+
readonly traceSummary?: TraceSummary | null;
|
|
1229
|
+
readonly config?: JsonObject | null;
|
|
1230
|
+
}
|
|
1231
|
+
/**
|
|
1232
|
+
* Parse stdin JSON (snake_case) into typed camelCase object.
|
|
1233
|
+
* Use this in TypeScript code judges to get type-safe, idiomatic input.
|
|
1234
|
+
*/
|
|
1235
|
+
declare function parseCodeJudgePayload(payload: string): CodeJudgePayload;
|
|
1236
|
+
/**
|
|
1237
|
+
* Convenience helper that reads stdin and parses it.
|
|
1238
|
+
* Equivalent to: parseCodeJudgePayload(readFileSync(0, 'utf8'))
|
|
1239
|
+
*/
|
|
1240
|
+
declare function readCodeJudgePayload(): CodeJudgePayload;
|
|
1241
|
+
|
|
837
1242
|
type AgentKernel = {
|
|
838
1243
|
status: string;
|
|
839
1244
|
};
|
|
840
1245
|
declare function createAgentKernel(): AgentKernel;
|
|
841
1246
|
|
|
842
|
-
export { type AgentKernel, type AnthropicResolvedConfig, type AssistantTestMessage, type AzureResolvedConfig, type ChildEvaluatorResult, type CliResolvedConfig, CodeEvaluator, type CodeEvaluatorConfig, type CodeEvaluatorOptions, type CompositeAggregatorConfig, CompositeEvaluator, type CompositeEvaluatorConfig, type CompositeEvaluatorOptions, type EnsureSubagentsOptions, type EnsureSubagentsResult, type EnvLookup, type EvalCase, type EvaluationCache, type EvaluationContext, type EvaluationResult, type EvaluationScore, type EvaluationVerdict, type Evaluator, type EvaluatorConfig, type EvaluatorFactory, type EvaluatorKind, type EvaluatorResult, type GeminiResolvedConfig, type GenerateRubricsOptions, type JsonObject, type JsonPrimitive, type JsonValue, LlmJudgeEvaluator, type LlmJudgeEvaluatorConfig, type LlmJudgeEvaluatorOptions, type MockResolvedConfig, type ProgressEvent, type PromptInputs, type Provider, type ProviderKind, type ProviderRequest, type ProviderResponse, type ResolvedTarget, type RubricItem, type RunEvalCaseOptions, type RunEvaluationOptions, type SystemTestMessage, TEST_MESSAGE_ROLES, type TargetDefinition, type TestMessage, type TestMessageContent, type TestMessageRole, type ToolTestMessage, ToolTrajectoryEvaluator, type ToolTrajectoryEvaluatorConfig, type ToolTrajectoryEvaluatorOptions, type ToolTrajectoryExpectedItem, type TraceSummary, type UserTestMessage, type VSCodeResolvedConfig, buildDirectoryChain, buildPromptInputs, buildSearchRoots, computeTraceSummary, consumeCodexLogEntries, createAgentKernel, createProvider, ensureVSCodeSubagents, extractCodeBlocks, fileExists, findGitRoot, generateRubrics, getHitCount, isEvaluatorKind, isGuidelineFile, isJsonObject, isJsonValue, isTestMessage, isTestMessageRole, listTargetNames, loadEvalCases, normalizeLineEndings, readJsonFile, readTargetDefinitions, readTestSuiteMetadata, readTextFile, resolveAndCreateProvider, resolveFileReference, resolveTargetDefinition, runEvalCase, runEvaluation, 
subscribeToCodexLogEntries };
|
|
1247
|
+
export { type AgentKernel, type AnthropicResolvedConfig, type AssistantTestMessage, type AzureResolvedConfig, type ChildEvaluatorResult, type ClaudeCodeResolvedConfig, type CliResolvedConfig, CodeEvaluator, type CodeEvaluatorConfig, type CodeEvaluatorOptions, type CodeJudgePayload, type CompositeAggregatorConfig, CompositeEvaluator, type CompositeEvaluatorConfig, type CompositeEvaluatorOptions, CostEvaluator, type CostEvaluatorConfig, type CostEvaluatorOptions, DEFAULT_EXPLORATION_TOOLS, type EnsureSubagentsOptions, type EnsureSubagentsResult, type EnvLookup, type EvalCase, type EvaluationCache, type EvaluationContext, type EvaluationResult, type EvaluationScore, type EvaluationVerdict, type Evaluator, type EvaluatorConfig, type EvaluatorFactory, type EvaluatorKind, type EvaluatorResult, type ExecutionMetrics, FieldAccuracyEvaluator, type FieldAccuracyEvaluatorConfig, type FieldAccuracyEvaluatorOptions, type FieldAggregationType, type FieldConfig, type FieldMatchType, type GeminiResolvedConfig, type GenerateRubricsOptions, type JsonObject, type JsonPrimitive, type JsonValue, LatencyEvaluator, type LatencyEvaluatorConfig, type LatencyEvaluatorOptions, LlmJudgeEvaluator, type LlmJudgeEvaluatorConfig, type LlmJudgeEvaluatorOptions, type MockResolvedConfig, type PiCodingAgentResolvedConfig, type ProgressEvent, type PromptInputs, type Provider, type ProviderKind, type ProviderRequest, type ProviderResponse, type ProviderTokenUsage, type ResolvedTarget, type RubricItem, type RunEvalCaseOptions, type RunEvaluationOptions, type SystemTestMessage, TEST_MESSAGE_ROLES, type TargetDefinition, type TestMessage, type TestMessageContent, type TestMessageRole, type TokenUsage, TokenUsageEvaluator, type TokenUsageEvaluatorConfig, type TokenUsageEvaluatorOptions, type ToolTestMessage, ToolTrajectoryEvaluator, type ToolTrajectoryEvaluatorConfig, type ToolTrajectoryEvaluatorOptions, type ToolTrajectoryExpectedItem, type TraceSummary, type UserTestMessage, type VSCodeResolvedConfig, 
avgToolDurationMs, buildDirectoryChain, buildPromptInputs, buildSearchRoots, computeTraceSummary, consumeClaudeCodeLogEntries, consumeCodexLogEntries, consumePiLogEntries, createAgentKernel, createProvider, ensureVSCodeSubagents, explorationRatio, extractCodeBlocks, fileExists, findGitRoot, generateRubrics, getHitCount, isEvaluatorKind, isGuidelineFile, isJsonObject, isJsonValue, isTestMessage, isTestMessageRole, listTargetNames, loadEvalCases, mergeExecutionMetrics, normalizeLineEndings, parseCodeJudgePayload, readCodeJudgePayload, readJsonFile, readTargetDefinitions, readTestSuiteMetadata, readTextFile, resolveAndCreateProvider, resolveFileReference, resolveTargetDefinition, runEvalCase, runEvaluation, subscribeToClaudeCodeLogEntries, subscribeToCodexLogEntries, subscribeToPiLogEntries, tokensPerTool };
|