@agentv/core 0.22.2 → 0.25.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/dist/{chunk-B2J23S7D.js → chunk-OYTL3LNN.js} +24 -16
- package/dist/chunk-OYTL3LNN.js.map +1 -0
- package/dist/evaluation/validation/index.cjs +64 -17
- package/dist/evaluation/validation/index.cjs.map +1 -1
- package/dist/evaluation/validation/index.js +48 -2
- package/dist/evaluation/validation/index.js.map +1 -1
- package/dist/index.cjs +994 -50
- package/dist/index.cjs.map +1 -1
- package/dist/index.d.cts +205 -4
- package/dist/index.d.ts +205 -4
- package/dist/index.js +953 -23
- package/dist/index.js.map +1 -1
- package/package.json +3 -4
- package/dist/chunk-B2J23S7D.js.map +0 -1
package/dist/index.d.cts
CHANGED
|
@@ -1,5 +1,97 @@
|
|
|
1
1
|
import * as ai from 'ai';
|
|
2
2
|
|
|
3
|
+
/**
|
|
4
|
+
* Trace event types for capturing agent execution traces.
|
|
5
|
+
* Provides a normalized, provider-agnostic model for tool-call trajectories.
|
|
6
|
+
*/
|
|
7
|
+
/**
|
|
8
|
+
* Supported trace event types.
|
|
9
|
+
*/
|
|
10
|
+
type TraceEventType = 'model_step' | 'tool_call' | 'tool_result' | 'message' | 'error';
|
|
11
|
+
/**
|
|
12
|
+
* Normalized trace event representing a single step in agent execution.
|
|
13
|
+
* Provider-agnostic format for tool-call trajectory evaluation.
|
|
14
|
+
*/
|
|
15
|
+
interface TraceEvent {
|
|
16
|
+
/** Event type */
|
|
17
|
+
readonly type: TraceEventType;
|
|
18
|
+
/** ISO 8601 timestamp */
|
|
19
|
+
readonly timestamp: string;
|
|
20
|
+
/** Stable identifier for pairing tool_call/tool_result */
|
|
21
|
+
readonly id?: string;
|
|
22
|
+
/** Tool name (for tool_call/tool_result) */
|
|
23
|
+
readonly name?: string;
|
|
24
|
+
/** Tool input - any JSON value */
|
|
25
|
+
readonly input?: unknown;
|
|
26
|
+
/** Tool output - any JSON value */
|
|
27
|
+
readonly output?: unknown;
|
|
28
|
+
/** Message content (for message/model_step) */
|
|
29
|
+
readonly text?: string;
|
|
30
|
+
/** Provider-specific metadata */
|
|
31
|
+
readonly metadata?: Record<string, unknown>;
|
|
32
|
+
}
|
|
33
|
+
/**
|
|
34
|
+
* Compact summary of a trace for lightweight persistence.
|
|
35
|
+
* Included in results by default to avoid payload bloat.
|
|
36
|
+
*/
|
|
37
|
+
interface TraceSummary {
|
|
38
|
+
/** Total number of events in trace */
|
|
39
|
+
readonly eventCount: number;
|
|
40
|
+
/** Unique tool names, sorted alphabetically */
|
|
41
|
+
readonly toolNames: readonly string[];
|
|
42
|
+
/** Map of tool name to call count */
|
|
43
|
+
readonly toolCallsByName: Readonly<Record<string, number>>;
|
|
44
|
+
/** Number of error events */
|
|
45
|
+
readonly errorCount: number;
|
|
46
|
+
}
|
|
47
|
+
/**
|
|
48
|
+
* Configuration for tool_trajectory evaluator.
|
|
49
|
+
*/
|
|
50
|
+
interface ToolTrajectoryEvaluatorConfig {
|
|
51
|
+
readonly name: string;
|
|
52
|
+
readonly type: 'tool_trajectory';
|
|
53
|
+
/** Matching mode */
|
|
54
|
+
readonly mode: 'any_order' | 'in_order' | 'exact';
|
|
55
|
+
/** Minimum call counts per tool (for any_order mode) */
|
|
56
|
+
readonly minimums?: Readonly<Record<string, number>>;
|
|
57
|
+
/** Expected tool sequence (for in_order/exact modes) */
|
|
58
|
+
readonly expected?: readonly ToolTrajectoryExpectedItem[];
|
|
59
|
+
}
|
|
60
|
+
/**
|
|
61
|
+
* Expected tool call item in a trajectory sequence.
|
|
62
|
+
*/
|
|
63
|
+
interface ToolTrajectoryExpectedItem {
|
|
64
|
+
readonly tool: string;
|
|
65
|
+
}
|
|
66
|
+
/**
|
|
67
|
+
* Expected tool call specification for expected_messages validation.
|
|
68
|
+
*/
|
|
69
|
+
interface ExpectedToolCall {
|
|
70
|
+
/** Tool name (required) */
|
|
71
|
+
readonly tool: string;
|
|
72
|
+
/** Tool input - if specified, must match exactly */
|
|
73
|
+
readonly input?: unknown;
|
|
74
|
+
/** Tool output - if specified, must match exactly */
|
|
75
|
+
readonly output?: unknown;
|
|
76
|
+
}
|
|
77
|
+
/**
|
|
78
|
+
* Type guard for TraceEventType values.
|
|
79
|
+
*/
|
|
80
|
+
declare function isTraceEventType(value: unknown): value is TraceEventType;
|
|
81
|
+
/**
|
|
82
|
+
* Type guard for TraceEvent objects.
|
|
83
|
+
*/
|
|
84
|
+
declare function isTraceEvent(value: unknown): value is TraceEvent;
|
|
85
|
+
/**
|
|
86
|
+
* Type guard for ExpectedToolCall objects.
|
|
87
|
+
*/
|
|
88
|
+
declare function isExpectedToolCall(value: unknown): value is ExpectedToolCall;
|
|
89
|
+
/**
|
|
90
|
+
* Compute a lightweight summary from a full trace.
|
|
91
|
+
* Used for default result persistence without payload bloat.
|
|
92
|
+
*/
|
|
93
|
+
declare function computeTraceSummary(trace: readonly TraceEvent[]): TraceSummary;
|
|
94
|
+
|
|
3
95
|
/**
|
|
4
96
|
* JSON primitive values appearing in AgentV payloads.
|
|
5
97
|
*/
|
|
@@ -41,12 +133,21 @@ type UserTestMessage = {
|
|
|
41
133
|
readonly role: 'user';
|
|
42
134
|
readonly content: TestMessageContent;
|
|
43
135
|
};
|
|
136
|
+
/**
|
|
137
|
+
* Tool call specification for expected_messages validation.
|
|
138
|
+
*/
|
|
139
|
+
type TestMessageToolCall = {
|
|
140
|
+
readonly tool: string;
|
|
141
|
+
readonly input?: unknown;
|
|
142
|
+
};
|
|
44
143
|
/**
|
|
45
144
|
* Assistant response message.
|
|
46
145
|
*/
|
|
47
146
|
type AssistantTestMessage = {
|
|
48
147
|
readonly role: 'assistant';
|
|
49
148
|
readonly content: TestMessageContent;
|
|
149
|
+
/** Optional tool_calls for expected_messages validation against traces */
|
|
150
|
+
readonly tool_calls?: readonly TestMessageToolCall[];
|
|
50
151
|
};
|
|
51
152
|
/**
|
|
52
153
|
* Tool invocation message.
|
|
@@ -75,7 +176,7 @@ declare function isJsonValue(value: unknown): value is JsonValue;
|
|
|
75
176
|
* Guard validating raw test messages.
|
|
76
177
|
*/
|
|
77
178
|
declare function isTestMessage(value: unknown): value is TestMessage;
|
|
78
|
-
declare const EVALUATOR_KIND_VALUES: readonly ["
|
|
179
|
+
declare const EVALUATOR_KIND_VALUES: readonly ["code_judge", "llm_judge", "rubric", "composite", "tool_trajectory", "expected_messages"];
|
|
79
180
|
type EvaluatorKind = (typeof EVALUATOR_KIND_VALUES)[number];
|
|
80
181
|
declare function isEvaluatorKind(value: unknown): value is EvaluatorKind;
|
|
81
182
|
type CodeEvaluatorConfig = {
|
|
@@ -99,7 +200,30 @@ type RubricItem = {
|
|
|
99
200
|
readonly weight: number;
|
|
100
201
|
readonly required: boolean;
|
|
101
202
|
};
|
|
102
|
-
type
|
|
203
|
+
type CompositeAggregatorConfig = {
|
|
204
|
+
readonly type: 'weighted_average';
|
|
205
|
+
readonly weights?: Record<string, number>;
|
|
206
|
+
} | {
|
|
207
|
+
readonly type: 'code_judge';
|
|
208
|
+
readonly path: string;
|
|
209
|
+
readonly cwd?: string;
|
|
210
|
+
} | {
|
|
211
|
+
readonly type: 'llm_judge';
|
|
212
|
+
readonly prompt?: string;
|
|
213
|
+
readonly promptPath?: string;
|
|
214
|
+
readonly model?: string;
|
|
215
|
+
};
|
|
216
|
+
type CompositeEvaluatorConfig = {
|
|
217
|
+
readonly name: string;
|
|
218
|
+
readonly type: 'composite';
|
|
219
|
+
readonly evaluators: readonly EvaluatorConfig[];
|
|
220
|
+
readonly aggregator: CompositeAggregatorConfig;
|
|
221
|
+
};
|
|
222
|
+
type ExpectedMessagesEvaluatorConfig = {
|
|
223
|
+
readonly name: string;
|
|
224
|
+
readonly type: 'expected_messages';
|
|
225
|
+
};
|
|
226
|
+
type EvaluatorConfig = CodeEvaluatorConfig | LlmJudgeEvaluatorConfig | CompositeEvaluatorConfig | ToolTrajectoryEvaluatorConfig | ExpectedMessagesEvaluatorConfig;
|
|
103
227
|
/**
|
|
104
228
|
* Eval case definition sourced from AgentV specs.
|
|
105
229
|
*/
|
|
@@ -140,18 +264,24 @@ interface EvaluationResult {
|
|
|
140
264
|
readonly evaluator_provider_request?: JsonObject;
|
|
141
265
|
readonly evaluator_results?: readonly EvaluatorResult[];
|
|
142
266
|
readonly error?: string;
|
|
267
|
+
/** Lightweight summary of the execution trace (always included when available) */
|
|
268
|
+
readonly trace_summary?: TraceSummary;
|
|
269
|
+
/** Full trace events (only included when --include-trace flag is set) */
|
|
270
|
+
readonly trace?: readonly TraceEvent[];
|
|
143
271
|
}
|
|
144
272
|
type EvaluationVerdict = 'pass' | 'fail' | 'borderline';
|
|
145
273
|
interface EvaluatorResult {
|
|
146
274
|
readonly name: string;
|
|
147
275
|
readonly type: EvaluatorKind;
|
|
148
276
|
readonly score: number;
|
|
277
|
+
readonly weight?: number;
|
|
149
278
|
readonly verdict?: EvaluationVerdict;
|
|
150
279
|
readonly hits: readonly string[];
|
|
151
280
|
readonly misses: readonly string[];
|
|
152
281
|
readonly reasoning?: string;
|
|
153
282
|
readonly raw_request?: JsonObject;
|
|
154
283
|
readonly evaluator_provider_request?: JsonObject;
|
|
284
|
+
readonly evaluator_results?: readonly EvaluatorResult[];
|
|
155
285
|
}
|
|
156
286
|
/**
|
|
157
287
|
* Convenience accessor matching the Python hit_count property.
|
|
@@ -185,6 +315,10 @@ interface ProviderResponse {
|
|
|
185
315
|
readonly reasoning?: string;
|
|
186
316
|
readonly raw?: unknown;
|
|
187
317
|
readonly usage?: JsonObject;
|
|
318
|
+
/** Normalized trace events from agent execution */
|
|
319
|
+
readonly trace?: readonly TraceEvent[];
|
|
320
|
+
/** Reference to external trace file (alternative to inline trace) */
|
|
321
|
+
readonly traceRef?: string;
|
|
188
322
|
}
|
|
189
323
|
interface Provider {
|
|
190
324
|
readonly id: string;
|
|
@@ -338,6 +472,10 @@ declare function normalizeLineEndings(content: string): string;
|
|
|
338
472
|
* This ensures consistent behavior across Windows (CRLF) and Unix (LF) systems.
|
|
339
473
|
*/
|
|
340
474
|
declare function readTextFile(filePath: string): Promise<string>;
|
|
475
|
+
/**
|
|
476
|
+
* Read a JSON file and parse it.
|
|
477
|
+
*/
|
|
478
|
+
declare function readJsonFile<T = unknown>(filePath: string): Promise<T>;
|
|
341
479
|
/**
|
|
342
480
|
* Find git repository root by walking up the directory tree.
|
|
343
481
|
*/
|
|
@@ -414,6 +552,8 @@ interface MockResolvedConfig {
|
|
|
414
552
|
readonly delayMs?: number;
|
|
415
553
|
readonly delayMinMs?: number;
|
|
416
554
|
readonly delayMaxMs?: number;
|
|
555
|
+
/** Mock trace events for testing tool_trajectory evaluator */
|
|
556
|
+
readonly trace?: readonly TraceEvent[];
|
|
417
557
|
}
|
|
418
558
|
interface VSCodeResolvedConfig {
|
|
419
559
|
readonly command: string;
|
|
@@ -490,7 +630,7 @@ type ResolvedTarget = {
|
|
|
490
630
|
readonly providerBatching?: boolean;
|
|
491
631
|
readonly config: CliResolvedConfig;
|
|
492
632
|
};
|
|
493
|
-
declare function resolveTargetDefinition(definition: TargetDefinition, env?: EnvLookup): ResolvedTarget;
|
|
633
|
+
declare function resolveTargetDefinition(definition: TargetDefinition, env?: EnvLookup, evalFilePath?: string): ResolvedTarget;
|
|
494
634
|
|
|
495
635
|
declare function readTargetDefinitions(filePath: string): Promise<readonly TargetDefinition[]>;
|
|
496
636
|
declare function listTargetNames(definitions: readonly TargetDefinition[]): readonly string[];
|
|
@@ -542,6 +682,10 @@ interface EvaluationContext {
|
|
|
542
682
|
readonly judgeProvider?: Provider;
|
|
543
683
|
readonly evaluatorTemplateOverride?: string;
|
|
544
684
|
readonly evaluator?: EvaluatorConfig;
|
|
685
|
+
/** Normalized trace events from provider execution (if available) */
|
|
686
|
+
readonly candidateTrace?: readonly TraceEvent[];
|
|
687
|
+
/** Lightweight summary of trace events (if available) */
|
|
688
|
+
readonly candidateTraceSummary?: TraceSummary;
|
|
545
689
|
}
|
|
546
690
|
interface EvaluationScore {
|
|
547
691
|
readonly score: number;
|
|
@@ -552,6 +696,19 @@ interface EvaluationScore {
|
|
|
552
696
|
readonly reasoning?: string;
|
|
553
697
|
readonly rawAspects?: readonly string[];
|
|
554
698
|
readonly evaluatorRawRequest?: JsonObject;
|
|
699
|
+
readonly evaluatorResults?: readonly ChildEvaluatorResult[];
|
|
700
|
+
}
|
|
701
|
+
interface ChildEvaluatorResult {
|
|
702
|
+
readonly name: string;
|
|
703
|
+
readonly type: string;
|
|
704
|
+
readonly score: number;
|
|
705
|
+
readonly weight?: number;
|
|
706
|
+
readonly verdict: EvaluationVerdict;
|
|
707
|
+
readonly hits: readonly string[];
|
|
708
|
+
readonly misses: readonly string[];
|
|
709
|
+
readonly reasoning?: string;
|
|
710
|
+
readonly evaluatorRawRequest?: JsonObject;
|
|
711
|
+
readonly evaluatorResults?: readonly ChildEvaluatorResult[];
|
|
555
712
|
}
|
|
556
713
|
interface Evaluator {
|
|
557
714
|
readonly kind: string;
|
|
@@ -590,6 +747,50 @@ declare class CodeEvaluator implements Evaluator {
|
|
|
590
747
|
constructor(options: CodeEvaluatorOptions);
|
|
591
748
|
evaluate(context: EvaluationContext): Promise<EvaluationScore>;
|
|
592
749
|
}
|
|
750
|
+
interface ToolTrajectoryEvaluatorOptions {
|
|
751
|
+
readonly config: ToolTrajectoryEvaluatorConfig;
|
|
752
|
+
}
|
|
753
|
+
declare class ToolTrajectoryEvaluator implements Evaluator {
|
|
754
|
+
readonly kind = "tool_trajectory";
|
|
755
|
+
private readonly config;
|
|
756
|
+
constructor(options: ToolTrajectoryEvaluatorOptions);
|
|
757
|
+
evaluate(context: EvaluationContext): EvaluationScore;
|
|
758
|
+
private evaluateAnyOrder;
|
|
759
|
+
private evaluateInOrder;
|
|
760
|
+
private evaluateExact;
|
|
761
|
+
}
|
|
762
|
+
/**
|
|
763
|
+
* Evaluator that validates tool_calls in expected_messages against the actual trace.
|
|
764
|
+
* Extracts tool_calls from assistant messages in expected_messages and compares them
|
|
765
|
+
* sequentially against tool_call events in the trace.
|
|
766
|
+
*/
|
|
767
|
+
declare class ExpectedMessagesEvaluator implements Evaluator {
|
|
768
|
+
readonly kind = "expected_messages";
|
|
769
|
+
evaluate(context: EvaluationContext): EvaluationScore;
|
|
770
|
+
private extractExpectedToolCalls;
|
|
771
|
+
private validateToolCalls;
|
|
772
|
+
private deepEquals;
|
|
773
|
+
}
|
|
774
|
+
interface EvaluatorFactory {
|
|
775
|
+
create(config: EvaluatorConfig, context: EvaluationContext): Evaluator;
|
|
776
|
+
}
|
|
777
|
+
interface CompositeEvaluatorOptions {
|
|
778
|
+
readonly config: CompositeEvaluatorConfig;
|
|
779
|
+
readonly evaluatorFactory: EvaluatorFactory;
|
|
780
|
+
readonly cwd?: string;
|
|
781
|
+
}
|
|
782
|
+
declare class CompositeEvaluator implements Evaluator {
|
|
783
|
+
readonly kind = "composite";
|
|
784
|
+
private readonly config;
|
|
785
|
+
private readonly evaluatorFactory;
|
|
786
|
+
private readonly cwd?;
|
|
787
|
+
constructor(options: CompositeEvaluatorOptions);
|
|
788
|
+
evaluate(context: EvaluationContext): Promise<EvaluationScore>;
|
|
789
|
+
private aggregate;
|
|
790
|
+
private runWeightedAverage;
|
|
791
|
+
private runCodeAggregator;
|
|
792
|
+
private runLlmAggregator;
|
|
793
|
+
}
|
|
593
794
|
|
|
594
795
|
type MaybePromise<T> = T | Promise<T>;
|
|
595
796
|
interface EvaluationCache {
|
|
@@ -660,4 +861,4 @@ type AgentKernel = {
|
|
|
660
861
|
};
|
|
661
862
|
declare function createAgentKernel(): AgentKernel;
|
|
662
863
|
|
|
663
|
-
export { type AgentKernel, type AnthropicResolvedConfig, type AssistantTestMessage, type AzureResolvedConfig, type CliResolvedConfig, CodeEvaluator, type CodeEvaluatorConfig, type CodeEvaluatorOptions, type EnsureSubagentsOptions, type EnsureSubagentsResult, type EnvLookup, type EvalCase, type EvaluationCache, type EvaluationContext, type EvaluationResult, type EvaluationScore, type EvaluationVerdict, type Evaluator, type EvaluatorConfig, type EvaluatorKind, type EvaluatorResult, type GeminiResolvedConfig, type GenerateRubricsOptions, type JsonObject, type JsonPrimitive, type JsonValue, LlmJudgeEvaluator, type LlmJudgeEvaluatorConfig, type LlmJudgeEvaluatorOptions, type MockResolvedConfig, type ProgressEvent, type PromptInputs, type Provider, type ProviderKind, type ProviderRequest, type ProviderResponse, type ResolvedTarget, type RubricItem, type RunEvalCaseOptions, type RunEvaluationOptions, type SystemTestMessage, TEST_MESSAGE_ROLES, type TargetDefinition, type TestMessage, type TestMessageContent, type TestMessageRole, type ToolTestMessage, type UserTestMessage, type VSCodeResolvedConfig, buildDirectoryChain, buildPromptInputs, buildSearchRoots, consumeCodexLogEntries, createAgentKernel, createProvider, ensureVSCodeSubagents, extractCodeBlocks, fileExists, findGitRoot, generateRubrics, getHitCount, isEvaluatorKind, isGuidelineFile, isJsonObject, isJsonValue, isTestMessage, isTestMessageRole, listTargetNames, loadEvalCases, normalizeLineEndings, readTargetDefinitions, readTestSuiteMetadata, readTextFile, resolveAndCreateProvider, resolveFileReference, resolveTargetDefinition, runEvalCase, runEvaluation, subscribeToCodexLogEntries };
|
|
864
|
+
export { type AgentKernel, type AnthropicResolvedConfig, type AssistantTestMessage, type AzureResolvedConfig, type ChildEvaluatorResult, type CliResolvedConfig, CodeEvaluator, type CodeEvaluatorConfig, type CodeEvaluatorOptions, type CompositeAggregatorConfig, CompositeEvaluator, type CompositeEvaluatorConfig, type CompositeEvaluatorOptions, type EnsureSubagentsOptions, type EnsureSubagentsResult, type EnvLookup, type EvalCase, type EvaluationCache, type EvaluationContext, type EvaluationResult, type EvaluationScore, type EvaluationVerdict, type Evaluator, type EvaluatorConfig, type EvaluatorFactory, type EvaluatorKind, type EvaluatorResult, ExpectedMessagesEvaluator, type ExpectedMessagesEvaluatorConfig, type ExpectedToolCall, type GeminiResolvedConfig, type GenerateRubricsOptions, type JsonObject, type JsonPrimitive, type JsonValue, LlmJudgeEvaluator, type LlmJudgeEvaluatorConfig, type LlmJudgeEvaluatorOptions, type MockResolvedConfig, type ProgressEvent, type PromptInputs, type Provider, type ProviderKind, type ProviderRequest, type ProviderResponse, type ResolvedTarget, type RubricItem, type RunEvalCaseOptions, type RunEvaluationOptions, type SystemTestMessage, TEST_MESSAGE_ROLES, type TargetDefinition, type TestMessage, type TestMessageContent, type TestMessageRole, type TestMessageToolCall, type ToolTestMessage, ToolTrajectoryEvaluator, type ToolTrajectoryEvaluatorConfig, type ToolTrajectoryEvaluatorOptions, type ToolTrajectoryExpectedItem, type TraceEvent, type TraceEventType, type TraceSummary, type UserTestMessage, type VSCodeResolvedConfig, buildDirectoryChain, buildPromptInputs, buildSearchRoots, computeTraceSummary, consumeCodexLogEntries, createAgentKernel, createProvider, ensureVSCodeSubagents, extractCodeBlocks, fileExists, findGitRoot, generateRubrics, getHitCount, isEvaluatorKind, isExpectedToolCall, isGuidelineFile, isJsonObject, isJsonValue, isTestMessage, isTestMessageRole, isTraceEvent, isTraceEventType, listTargetNames, loadEvalCases, normalizeLineEndings, readJsonFile, readTargetDefinitions, readTestSuiteMetadata, readTextFile, resolveAndCreateProvider, resolveFileReference, resolveTargetDefinition, runEvalCase, runEvaluation, subscribeToCodexLogEntries };
|
package/dist/index.d.ts
CHANGED
|
@@ -1,5 +1,97 @@
|
|
|
1
1
|
import * as ai from 'ai';
|
|
2
2
|
|
|
3
|
+
/**
|
|
4
|
+
* Trace event types for capturing agent execution traces.
|
|
5
|
+
* Provides a normalized, provider-agnostic model for tool-call trajectories.
|
|
6
|
+
*/
|
|
7
|
+
/**
|
|
8
|
+
* Supported trace event types.
|
|
9
|
+
*/
|
|
10
|
+
type TraceEventType = 'model_step' | 'tool_call' | 'tool_result' | 'message' | 'error';
|
|
11
|
+
/**
|
|
12
|
+
* Normalized trace event representing a single step in agent execution.
|
|
13
|
+
* Provider-agnostic format for tool-call trajectory evaluation.
|
|
14
|
+
*/
|
|
15
|
+
interface TraceEvent {
|
|
16
|
+
/** Event type */
|
|
17
|
+
readonly type: TraceEventType;
|
|
18
|
+
/** ISO 8601 timestamp */
|
|
19
|
+
readonly timestamp: string;
|
|
20
|
+
/** Stable identifier for pairing tool_call/tool_result */
|
|
21
|
+
readonly id?: string;
|
|
22
|
+
/** Tool name (for tool_call/tool_result) */
|
|
23
|
+
readonly name?: string;
|
|
24
|
+
/** Tool input - any JSON value */
|
|
25
|
+
readonly input?: unknown;
|
|
26
|
+
/** Tool output - any JSON value */
|
|
27
|
+
readonly output?: unknown;
|
|
28
|
+
/** Message content (for message/model_step) */
|
|
29
|
+
readonly text?: string;
|
|
30
|
+
/** Provider-specific metadata */
|
|
31
|
+
readonly metadata?: Record<string, unknown>;
|
|
32
|
+
}
|
|
33
|
+
/**
|
|
34
|
+
* Compact summary of a trace for lightweight persistence.
|
|
35
|
+
* Included in results by default to avoid payload bloat.
|
|
36
|
+
*/
|
|
37
|
+
interface TraceSummary {
|
|
38
|
+
/** Total number of events in trace */
|
|
39
|
+
readonly eventCount: number;
|
|
40
|
+
/** Unique tool names, sorted alphabetically */
|
|
41
|
+
readonly toolNames: readonly string[];
|
|
42
|
+
/** Map of tool name to call count */
|
|
43
|
+
readonly toolCallsByName: Readonly<Record<string, number>>;
|
|
44
|
+
/** Number of error events */
|
|
45
|
+
readonly errorCount: number;
|
|
46
|
+
}
|
|
47
|
+
/**
|
|
48
|
+
* Configuration for tool_trajectory evaluator.
|
|
49
|
+
*/
|
|
50
|
+
interface ToolTrajectoryEvaluatorConfig {
|
|
51
|
+
readonly name: string;
|
|
52
|
+
readonly type: 'tool_trajectory';
|
|
53
|
+
/** Matching mode */
|
|
54
|
+
readonly mode: 'any_order' | 'in_order' | 'exact';
|
|
55
|
+
/** Minimum call counts per tool (for any_order mode) */
|
|
56
|
+
readonly minimums?: Readonly<Record<string, number>>;
|
|
57
|
+
/** Expected tool sequence (for in_order/exact modes) */
|
|
58
|
+
readonly expected?: readonly ToolTrajectoryExpectedItem[];
|
|
59
|
+
}
|
|
60
|
+
/**
|
|
61
|
+
* Expected tool call item in a trajectory sequence.
|
|
62
|
+
*/
|
|
63
|
+
interface ToolTrajectoryExpectedItem {
|
|
64
|
+
readonly tool: string;
|
|
65
|
+
}
|
|
66
|
+
/**
|
|
67
|
+
* Expected tool call specification for expected_messages validation.
|
|
68
|
+
*/
|
|
69
|
+
interface ExpectedToolCall {
|
|
70
|
+
/** Tool name (required) */
|
|
71
|
+
readonly tool: string;
|
|
72
|
+
/** Tool input - if specified, must match exactly */
|
|
73
|
+
readonly input?: unknown;
|
|
74
|
+
/** Tool output - if specified, must match exactly */
|
|
75
|
+
readonly output?: unknown;
|
|
76
|
+
}
|
|
77
|
+
/**
|
|
78
|
+
* Type guard for TraceEventType values.
|
|
79
|
+
*/
|
|
80
|
+
declare function isTraceEventType(value: unknown): value is TraceEventType;
|
|
81
|
+
/**
|
|
82
|
+
* Type guard for TraceEvent objects.
|
|
83
|
+
*/
|
|
84
|
+
declare function isTraceEvent(value: unknown): value is TraceEvent;
|
|
85
|
+
/**
|
|
86
|
+
* Type guard for ExpectedToolCall objects.
|
|
87
|
+
*/
|
|
88
|
+
declare function isExpectedToolCall(value: unknown): value is ExpectedToolCall;
|
|
89
|
+
/**
|
|
90
|
+
* Compute a lightweight summary from a full trace.
|
|
91
|
+
* Used for default result persistence without payload bloat.
|
|
92
|
+
*/
|
|
93
|
+
declare function computeTraceSummary(trace: readonly TraceEvent[]): TraceSummary;
|
|
94
|
+
|
|
3
95
|
/**
|
|
4
96
|
* JSON primitive values appearing in AgentV payloads.
|
|
5
97
|
*/
|
|
@@ -41,12 +133,21 @@ type UserTestMessage = {
|
|
|
41
133
|
readonly role: 'user';
|
|
42
134
|
readonly content: TestMessageContent;
|
|
43
135
|
};
|
|
136
|
+
/**
|
|
137
|
+
* Tool call specification for expected_messages validation.
|
|
138
|
+
*/
|
|
139
|
+
type TestMessageToolCall = {
|
|
140
|
+
readonly tool: string;
|
|
141
|
+
readonly input?: unknown;
|
|
142
|
+
};
|
|
44
143
|
/**
|
|
45
144
|
* Assistant response message.
|
|
46
145
|
*/
|
|
47
146
|
type AssistantTestMessage = {
|
|
48
147
|
readonly role: 'assistant';
|
|
49
148
|
readonly content: TestMessageContent;
|
|
149
|
+
/** Optional tool_calls for expected_messages validation against traces */
|
|
150
|
+
readonly tool_calls?: readonly TestMessageToolCall[];
|
|
50
151
|
};
|
|
51
152
|
/**
|
|
52
153
|
* Tool invocation message.
|
|
@@ -75,7 +176,7 @@ declare function isJsonValue(value: unknown): value is JsonValue;
|
|
|
75
176
|
* Guard validating raw test messages.
|
|
76
177
|
*/
|
|
77
178
|
declare function isTestMessage(value: unknown): value is TestMessage;
|
|
78
|
-
declare const EVALUATOR_KIND_VALUES: readonly ["
|
|
179
|
+
declare const EVALUATOR_KIND_VALUES: readonly ["code_judge", "llm_judge", "rubric", "composite", "tool_trajectory", "expected_messages"];
|
|
79
180
|
type EvaluatorKind = (typeof EVALUATOR_KIND_VALUES)[number];
|
|
80
181
|
declare function isEvaluatorKind(value: unknown): value is EvaluatorKind;
|
|
81
182
|
type CodeEvaluatorConfig = {
|
|
@@ -99,7 +200,30 @@ type RubricItem = {
|
|
|
99
200
|
readonly weight: number;
|
|
100
201
|
readonly required: boolean;
|
|
101
202
|
};
|
|
102
|
-
type
|
|
203
|
+
type CompositeAggregatorConfig = {
|
|
204
|
+
readonly type: 'weighted_average';
|
|
205
|
+
readonly weights?: Record<string, number>;
|
|
206
|
+
} | {
|
|
207
|
+
readonly type: 'code_judge';
|
|
208
|
+
readonly path: string;
|
|
209
|
+
readonly cwd?: string;
|
|
210
|
+
} | {
|
|
211
|
+
readonly type: 'llm_judge';
|
|
212
|
+
readonly prompt?: string;
|
|
213
|
+
readonly promptPath?: string;
|
|
214
|
+
readonly model?: string;
|
|
215
|
+
};
|
|
216
|
+
type CompositeEvaluatorConfig = {
|
|
217
|
+
readonly name: string;
|
|
218
|
+
readonly type: 'composite';
|
|
219
|
+
readonly evaluators: readonly EvaluatorConfig[];
|
|
220
|
+
readonly aggregator: CompositeAggregatorConfig;
|
|
221
|
+
};
|
|
222
|
+
type ExpectedMessagesEvaluatorConfig = {
|
|
223
|
+
readonly name: string;
|
|
224
|
+
readonly type: 'expected_messages';
|
|
225
|
+
};
|
|
226
|
+
type EvaluatorConfig = CodeEvaluatorConfig | LlmJudgeEvaluatorConfig | CompositeEvaluatorConfig | ToolTrajectoryEvaluatorConfig | ExpectedMessagesEvaluatorConfig;
|
|
103
227
|
/**
|
|
104
228
|
* Eval case definition sourced from AgentV specs.
|
|
105
229
|
*/
|
|
@@ -140,18 +264,24 @@ interface EvaluationResult {
|
|
|
140
264
|
readonly evaluator_provider_request?: JsonObject;
|
|
141
265
|
readonly evaluator_results?: readonly EvaluatorResult[];
|
|
142
266
|
readonly error?: string;
|
|
267
|
+
/** Lightweight summary of the execution trace (always included when available) */
|
|
268
|
+
readonly trace_summary?: TraceSummary;
|
|
269
|
+
/** Full trace events (only included when --include-trace flag is set) */
|
|
270
|
+
readonly trace?: readonly TraceEvent[];
|
|
143
271
|
}
|
|
144
272
|
type EvaluationVerdict = 'pass' | 'fail' | 'borderline';
|
|
145
273
|
interface EvaluatorResult {
|
|
146
274
|
readonly name: string;
|
|
147
275
|
readonly type: EvaluatorKind;
|
|
148
276
|
readonly score: number;
|
|
277
|
+
readonly weight?: number;
|
|
149
278
|
readonly verdict?: EvaluationVerdict;
|
|
150
279
|
readonly hits: readonly string[];
|
|
151
280
|
readonly misses: readonly string[];
|
|
152
281
|
readonly reasoning?: string;
|
|
153
282
|
readonly raw_request?: JsonObject;
|
|
154
283
|
readonly evaluator_provider_request?: JsonObject;
|
|
284
|
+
readonly evaluator_results?: readonly EvaluatorResult[];
|
|
155
285
|
}
|
|
156
286
|
/**
|
|
157
287
|
* Convenience accessor matching the Python hit_count property.
|
|
@@ -185,6 +315,10 @@ interface ProviderResponse {
|
|
|
185
315
|
readonly reasoning?: string;
|
|
186
316
|
readonly raw?: unknown;
|
|
187
317
|
readonly usage?: JsonObject;
|
|
318
|
+
/** Normalized trace events from agent execution */
|
|
319
|
+
readonly trace?: readonly TraceEvent[];
|
|
320
|
+
/** Reference to external trace file (alternative to inline trace) */
|
|
321
|
+
readonly traceRef?: string;
|
|
188
322
|
}
|
|
189
323
|
interface Provider {
|
|
190
324
|
readonly id: string;
|
|
@@ -338,6 +472,10 @@ declare function normalizeLineEndings(content: string): string;
|
|
|
338
472
|
* This ensures consistent behavior across Windows (CRLF) and Unix (LF) systems.
|
|
339
473
|
*/
|
|
340
474
|
declare function readTextFile(filePath: string): Promise<string>;
|
|
475
|
+
/**
|
|
476
|
+
* Read a JSON file and parse it.
|
|
477
|
+
*/
|
|
478
|
+
declare function readJsonFile<T = unknown>(filePath: string): Promise<T>;
|
|
341
479
|
/**
|
|
342
480
|
* Find git repository root by walking up the directory tree.
|
|
343
481
|
*/
|
|
@@ -414,6 +552,8 @@ interface MockResolvedConfig {
|
|
|
414
552
|
readonly delayMs?: number;
|
|
415
553
|
readonly delayMinMs?: number;
|
|
416
554
|
readonly delayMaxMs?: number;
|
|
555
|
+
/** Mock trace events for testing tool_trajectory evaluator */
|
|
556
|
+
readonly trace?: readonly TraceEvent[];
|
|
417
557
|
}
|
|
418
558
|
interface VSCodeResolvedConfig {
|
|
419
559
|
readonly command: string;
|
|
@@ -490,7 +630,7 @@ type ResolvedTarget = {
|
|
|
490
630
|
readonly providerBatching?: boolean;
|
|
491
631
|
readonly config: CliResolvedConfig;
|
|
492
632
|
};
|
|
493
|
-
declare function resolveTargetDefinition(definition: TargetDefinition, env?: EnvLookup): ResolvedTarget;
|
|
633
|
+
declare function resolveTargetDefinition(definition: TargetDefinition, env?: EnvLookup, evalFilePath?: string): ResolvedTarget;
|
|
494
634
|
|
|
495
635
|
declare function readTargetDefinitions(filePath: string): Promise<readonly TargetDefinition[]>;
|
|
496
636
|
declare function listTargetNames(definitions: readonly TargetDefinition[]): readonly string[];
|
|
@@ -542,6 +682,10 @@ interface EvaluationContext {
|
|
|
542
682
|
readonly judgeProvider?: Provider;
|
|
543
683
|
readonly evaluatorTemplateOverride?: string;
|
|
544
684
|
readonly evaluator?: EvaluatorConfig;
|
|
685
|
+
/** Normalized trace events from provider execution (if available) */
|
|
686
|
+
readonly candidateTrace?: readonly TraceEvent[];
|
|
687
|
+
/** Lightweight summary of trace events (if available) */
|
|
688
|
+
readonly candidateTraceSummary?: TraceSummary;
|
|
545
689
|
}
|
|
546
690
|
interface EvaluationScore {
|
|
547
691
|
readonly score: number;
|
|
@@ -552,6 +696,19 @@ interface EvaluationScore {
|
|
|
552
696
|
readonly reasoning?: string;
|
|
553
697
|
readonly rawAspects?: readonly string[];
|
|
554
698
|
readonly evaluatorRawRequest?: JsonObject;
|
|
699
|
+
readonly evaluatorResults?: readonly ChildEvaluatorResult[];
|
|
700
|
+
}
|
|
701
|
+
interface ChildEvaluatorResult {
|
|
702
|
+
readonly name: string;
|
|
703
|
+
readonly type: string;
|
|
704
|
+
readonly score: number;
|
|
705
|
+
readonly weight?: number;
|
|
706
|
+
readonly verdict: EvaluationVerdict;
|
|
707
|
+
readonly hits: readonly string[];
|
|
708
|
+
readonly misses: readonly string[];
|
|
709
|
+
readonly reasoning?: string;
|
|
710
|
+
readonly evaluatorRawRequest?: JsonObject;
|
|
711
|
+
readonly evaluatorResults?: readonly ChildEvaluatorResult[];
|
|
555
712
|
}
|
|
556
713
|
interface Evaluator {
|
|
557
714
|
readonly kind: string;
|
|
@@ -590,6 +747,50 @@ declare class CodeEvaluator implements Evaluator {
|
|
|
590
747
|
constructor(options: CodeEvaluatorOptions);
|
|
591
748
|
evaluate(context: EvaluationContext): Promise<EvaluationScore>;
|
|
592
749
|
}
|
|
750
|
+
interface ToolTrajectoryEvaluatorOptions {
|
|
751
|
+
readonly config: ToolTrajectoryEvaluatorConfig;
|
|
752
|
+
}
|
|
753
|
+
declare class ToolTrajectoryEvaluator implements Evaluator {
|
|
754
|
+
readonly kind = "tool_trajectory";
|
|
755
|
+
private readonly config;
|
|
756
|
+
constructor(options: ToolTrajectoryEvaluatorOptions);
|
|
757
|
+
evaluate(context: EvaluationContext): EvaluationScore;
|
|
758
|
+
private evaluateAnyOrder;
|
|
759
|
+
private evaluateInOrder;
|
|
760
|
+
private evaluateExact;
|
|
761
|
+
}
|
|
762
|
+
/**
|
|
763
|
+
* Evaluator that validates tool_calls in expected_messages against the actual trace.
|
|
764
|
+
* Extracts tool_calls from assistant messages in expected_messages and compares them
|
|
765
|
+
* sequentially against tool_call events in the trace.
|
|
766
|
+
*/
|
|
767
|
+
declare class ExpectedMessagesEvaluator implements Evaluator {
|
|
768
|
+
readonly kind = "expected_messages";
|
|
769
|
+
evaluate(context: EvaluationContext): EvaluationScore;
|
|
770
|
+
private extractExpectedToolCalls;
|
|
771
|
+
private validateToolCalls;
|
|
772
|
+
private deepEquals;
|
|
773
|
+
}
|
|
774
|
+
interface EvaluatorFactory {
|
|
775
|
+
create(config: EvaluatorConfig, context: EvaluationContext): Evaluator;
|
|
776
|
+
}
|
|
777
|
+
interface CompositeEvaluatorOptions {
|
|
778
|
+
readonly config: CompositeEvaluatorConfig;
|
|
779
|
+
readonly evaluatorFactory: EvaluatorFactory;
|
|
780
|
+
readonly cwd?: string;
|
|
781
|
+
}
|
|
782
|
+
declare class CompositeEvaluator implements Evaluator {
|
|
783
|
+
readonly kind = "composite";
|
|
784
|
+
private readonly config;
|
|
785
|
+
private readonly evaluatorFactory;
|
|
786
|
+
private readonly cwd?;
|
|
787
|
+
constructor(options: CompositeEvaluatorOptions);
|
|
788
|
+
evaluate(context: EvaluationContext): Promise<EvaluationScore>;
|
|
789
|
+
private aggregate;
|
|
790
|
+
private runWeightedAverage;
|
|
791
|
+
private runCodeAggregator;
|
|
792
|
+
private runLlmAggregator;
|
|
793
|
+
}
|
|
593
794
|
|
|
594
795
|
type MaybePromise<T> = T | Promise<T>;
|
|
595
796
|
interface EvaluationCache {
|
|
@@ -660,4 +861,4 @@ type AgentKernel = {
|
|
|
660
861
|
};
|
|
661
862
|
declare function createAgentKernel(): AgentKernel;
|
|
662
863
|
|
|
663
|
-
export { type AgentKernel, type AnthropicResolvedConfig, type AssistantTestMessage, type AzureResolvedConfig, type CliResolvedConfig, CodeEvaluator, type CodeEvaluatorConfig, type CodeEvaluatorOptions, type EnsureSubagentsOptions, type EnsureSubagentsResult, type EnvLookup, type EvalCase, type EvaluationCache, type EvaluationContext, type EvaluationResult, type EvaluationScore, type EvaluationVerdict, type Evaluator, type EvaluatorConfig, type EvaluatorKind, type EvaluatorResult, type GeminiResolvedConfig, type GenerateRubricsOptions, type JsonObject, type JsonPrimitive, type JsonValue, LlmJudgeEvaluator, type LlmJudgeEvaluatorConfig, type LlmJudgeEvaluatorOptions, type MockResolvedConfig, type ProgressEvent, type PromptInputs, type Provider, type ProviderKind, type ProviderRequest, type ProviderResponse, type ResolvedTarget, type RubricItem, type RunEvalCaseOptions, type RunEvaluationOptions, type SystemTestMessage, TEST_MESSAGE_ROLES, type TargetDefinition, type TestMessage, type TestMessageContent, type TestMessageRole, type ToolTestMessage, type UserTestMessage, type VSCodeResolvedConfig, buildDirectoryChain, buildPromptInputs, buildSearchRoots, consumeCodexLogEntries, createAgentKernel, createProvider, ensureVSCodeSubagents, extractCodeBlocks, fileExists, findGitRoot, generateRubrics, getHitCount, isEvaluatorKind, isGuidelineFile, isJsonObject, isJsonValue, isTestMessage, isTestMessageRole, listTargetNames, loadEvalCases, normalizeLineEndings, readTargetDefinitions, readTestSuiteMetadata, readTextFile, resolveAndCreateProvider, resolveFileReference, resolveTargetDefinition, runEvalCase, runEvaluation, subscribeToCodexLogEntries };
|
|
864
|
+
export { type AgentKernel, type AnthropicResolvedConfig, type AssistantTestMessage, type AzureResolvedConfig, type ChildEvaluatorResult, type CliResolvedConfig, CodeEvaluator, type CodeEvaluatorConfig, type CodeEvaluatorOptions, type CompositeAggregatorConfig, CompositeEvaluator, type CompositeEvaluatorConfig, type CompositeEvaluatorOptions, type EnsureSubagentsOptions, type EnsureSubagentsResult, type EnvLookup, type EvalCase, type EvaluationCache, type EvaluationContext, type EvaluationResult, type EvaluationScore, type EvaluationVerdict, type Evaluator, type EvaluatorConfig, type EvaluatorFactory, type EvaluatorKind, type EvaluatorResult, ExpectedMessagesEvaluator, type ExpectedMessagesEvaluatorConfig, type ExpectedToolCall, type GeminiResolvedConfig, type GenerateRubricsOptions, type JsonObject, type JsonPrimitive, type JsonValue, LlmJudgeEvaluator, type LlmJudgeEvaluatorConfig, type LlmJudgeEvaluatorOptions, type MockResolvedConfig, type ProgressEvent, type PromptInputs, type Provider, type ProviderKind, type ProviderRequest, type ProviderResponse, type ResolvedTarget, type RubricItem, type RunEvalCaseOptions, type RunEvaluationOptions, type SystemTestMessage, TEST_MESSAGE_ROLES, type TargetDefinition, type TestMessage, type TestMessageContent, type TestMessageRole, type TestMessageToolCall, type ToolTestMessage, ToolTrajectoryEvaluator, type ToolTrajectoryEvaluatorConfig, type ToolTrajectoryEvaluatorOptions, type ToolTrajectoryExpectedItem, type TraceEvent, type TraceEventType, type TraceSummary, type UserTestMessage, type VSCodeResolvedConfig, buildDirectoryChain, buildPromptInputs, buildSearchRoots, computeTraceSummary, consumeCodexLogEntries, createAgentKernel, createProvider, ensureVSCodeSubagents, extractCodeBlocks, fileExists, findGitRoot, generateRubrics, getHitCount, isEvaluatorKind, isExpectedToolCall, isGuidelineFile, isJsonObject, isJsonValue, isTestMessage, isTestMessageRole, isTraceEvent, isTraceEventType, listTargetNames, loadEvalCases, normalizeLineEndings, readJsonFile, readTargetDefinitions, readTestSuiteMetadata, readTextFile, resolveAndCreateProvider, resolveFileReference, resolveTargetDefinition, runEvalCase, runEvaluation, subscribeToCodexLogEntries };
|