@agentv/core 0.23.0 → 0.26.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
package/dist/index.d.cts CHANGED
@@ -1,5 +1,99 @@
1
1
  import * as ai from 'ai';
2
2
 
3
+ /**
4
+ * Trace event types for capturing agent execution traces.
5
+ * Provides a normalized, provider-agnostic model for tool-call trajectories.
6
+ */
7
+ /**
8
+ * Supported trace event types.
9
+ */
10
+ type TraceEventType = 'model_step' | 'tool_call' | 'tool_result' | 'message' | 'error';
11
+ /**
12
+ * Normalized trace event representing a single step in agent execution.
13
+ * Provider-agnostic format for tool-call trajectory evaluation.
14
+ */
15
+ interface TraceEvent {
16
+ /** Event type */
17
+ readonly type: TraceEventType;
18
+ /** ISO 8601 timestamp */
19
+ readonly timestamp: string;
20
+ /** Stable identifier for pairing tool_call/tool_result */
21
+ readonly id?: string;
22
+ /** Tool name (for tool_call/tool_result) */
23
+ readonly name?: string;
24
+ /** Tool input - any JSON value */
25
+ readonly input?: unknown;
26
+ /** Tool output - any JSON value */
27
+ readonly output?: unknown;
28
+ /** Message content (for message/model_step) */
29
+ readonly text?: string;
30
+ /** Provider-specific metadata */
31
+ readonly metadata?: Record<string, unknown>;
32
+ }
33
+ /**
34
+ * Compact summary of a trace for lightweight persistence.
35
+ * Included in results by default to avoid payload bloat.
36
+ */
37
+ interface TraceSummary {
38
+ /** Total number of events in trace */
39
+ readonly eventCount: number;
40
+ /** Unique tool names, sorted alphabetically */
41
+ readonly toolNames: readonly string[];
42
+ /** Map of tool name to call count */
43
+ readonly toolCallsByName: Readonly<Record<string, number>>;
44
+ /** Number of error events */
45
+ readonly errorCount: number;
46
+ }
47
+ /**
48
+ * Configuration for tool_trajectory evaluator.
49
+ */
50
+ interface ToolTrajectoryEvaluatorConfig {
51
+ readonly name: string;
52
+ readonly type: 'tool_trajectory';
53
+ /** Matching mode */
54
+ readonly mode: 'any_order' | 'in_order' | 'exact';
55
+ /** Minimum call counts per tool (for any_order mode) */
56
+ readonly minimums?: Readonly<Record<string, number>>;
57
+ /** Expected tool sequence (for in_order/exact modes) */
58
+ readonly expected?: readonly ToolTrajectoryExpectedItem[];
59
+ /** Optional weight for top-level aggregation (defaults to 1.0) */
60
+ readonly weight?: number;
61
+ }
62
+ /**
63
+ * Expected tool call item in a trajectory sequence.
64
+ */
65
+ interface ToolTrajectoryExpectedItem {
66
+ readonly tool: string;
67
+ }
68
+ /**
69
+ * Expected tool call specification for expected_messages validation.
70
+ */
71
+ interface ExpectedToolCall {
72
+ /** Tool name (required) */
73
+ readonly tool: string;
74
+ /** Tool input - if specified, must match exactly */
75
+ readonly input?: unknown;
76
+ /** Tool output - if specified, must match exactly */
77
+ readonly output?: unknown;
78
+ }
79
+ /**
80
+ * Type guard for TraceEventType values.
81
+ */
82
+ declare function isTraceEventType(value: unknown): value is TraceEventType;
83
+ /**
84
+ * Type guard for TraceEvent objects.
85
+ */
86
+ declare function isTraceEvent(value: unknown): value is TraceEvent;
87
+ /**
88
+ * Type guard for ExpectedToolCall objects.
89
+ */
90
+ declare function isExpectedToolCall(value: unknown): value is ExpectedToolCall;
91
+ /**
92
+ * Compute a lightweight summary from a full trace.
93
+ * Used for default result persistence without payload bloat.
94
+ */
95
+ declare function computeTraceSummary(trace: readonly TraceEvent[]): TraceSummary;
96
+
3
97
  /**
4
98
  * JSON primitive values appearing in AgentV payloads.
5
99
  */
@@ -41,12 +135,21 @@ type UserTestMessage = {
41
135
  readonly role: 'user';
42
136
  readonly content: TestMessageContent;
43
137
  };
138
+ /**
139
+ * Tool call specification for expected_messages validation.
140
+ */
141
+ type TestMessageToolCall = {
142
+ readonly tool: string;
143
+ readonly input?: unknown;
144
+ };
44
145
  /**
45
146
  * Assistant response message.
46
147
  */
47
148
  type AssistantTestMessage = {
48
149
  readonly role: 'assistant';
49
150
  readonly content: TestMessageContent;
151
+ /** Optional tool_calls for expected_messages validation against traces */
152
+ readonly tool_calls?: readonly TestMessageToolCall[];
50
153
  };
51
154
  /**
52
155
  * Tool invocation message.
@@ -75,7 +178,7 @@ declare function isJsonValue(value: unknown): value is JsonValue;
75
178
  * Guard validating raw test messages.
76
179
  */
77
180
  declare function isTestMessage(value: unknown): value is TestMessage;
78
- declare const EVALUATOR_KIND_VALUES: readonly ["code_judge", "llm_judge", "rubric", "composite"];
181
+ declare const EVALUATOR_KIND_VALUES: readonly ["code_judge", "llm_judge", "rubric", "composite", "tool_trajectory", "expected_messages"];
79
182
  type EvaluatorKind = (typeof EVALUATOR_KIND_VALUES)[number];
80
183
  declare function isEvaluatorKind(value: unknown): value is EvaluatorKind;
81
184
  type CodeEvaluatorConfig = {
@@ -85,6 +188,7 @@ type CodeEvaluatorConfig = {
85
188
  readonly resolvedScriptPath?: string;
86
189
  readonly cwd?: string;
87
190
  readonly resolvedCwd?: string;
191
+ readonly weight?: number;
88
192
  };
89
193
  type LlmJudgeEvaluatorConfig = {
90
194
  readonly name: string;
@@ -92,6 +196,7 @@ type LlmJudgeEvaluatorConfig = {
92
196
  readonly prompt?: string;
93
197
  readonly promptPath?: string;
94
198
  readonly rubrics?: readonly RubricItem[];
199
+ readonly weight?: number;
95
200
  };
96
201
  type RubricItem = {
97
202
  readonly id: string;
@@ -117,8 +222,14 @@ type CompositeEvaluatorConfig = {
117
222
  readonly type: 'composite';
118
223
  readonly evaluators: readonly EvaluatorConfig[];
119
224
  readonly aggregator: CompositeAggregatorConfig;
225
+ readonly weight?: number;
226
+ };
227
+ type ExpectedMessagesEvaluatorConfig = {
228
+ readonly name: string;
229
+ readonly type: 'expected_messages';
230
+ readonly weight?: number;
120
231
  };
121
- type EvaluatorConfig = CodeEvaluatorConfig | LlmJudgeEvaluatorConfig | CompositeEvaluatorConfig;
232
+ type EvaluatorConfig = CodeEvaluatorConfig | LlmJudgeEvaluatorConfig | CompositeEvaluatorConfig | ToolTrajectoryEvaluatorConfig | ExpectedMessagesEvaluatorConfig;
122
233
  /**
123
234
  * Eval case definition sourced from AgentV specs.
124
235
  */
@@ -159,6 +270,10 @@ interface EvaluationResult {
159
270
  readonly evaluator_provider_request?: JsonObject;
160
271
  readonly evaluator_results?: readonly EvaluatorResult[];
161
272
  readonly error?: string;
273
+ /** Lightweight summary of the execution trace (always included when available) */
274
+ readonly trace_summary?: TraceSummary;
275
+ /** Full trace events (only included when --include-trace flag is set) */
276
+ readonly trace?: readonly TraceEvent[];
162
277
  }
163
278
  type EvaluationVerdict = 'pass' | 'fail' | 'borderline';
164
279
  interface EvaluatorResult {
@@ -206,6 +321,10 @@ interface ProviderResponse {
206
321
  readonly reasoning?: string;
207
322
  readonly raw?: unknown;
208
323
  readonly usage?: JsonObject;
324
+ /** Normalized trace events from agent execution */
325
+ readonly trace?: readonly TraceEvent[];
326
+ /** Reference to external trace file (alternative to inline trace) */
327
+ readonly traceRef?: string;
209
328
  }
210
329
  interface Provider {
211
330
  readonly id: string;
@@ -359,6 +478,10 @@ declare function normalizeLineEndings(content: string): string;
359
478
  * This ensures consistent behavior across Windows (CRLF) and Unix (LF) systems.
360
479
  */
361
480
  declare function readTextFile(filePath: string): Promise<string>;
481
+ /**
482
+ * Read a JSON file and parse it.
483
+ */
484
+ declare function readJsonFile<T = unknown>(filePath: string): Promise<T>;
362
485
  /**
363
486
  * Find git repository root by walking up the directory tree.
364
487
  */
@@ -435,6 +558,8 @@ interface MockResolvedConfig {
435
558
  readonly delayMs?: number;
436
559
  readonly delayMinMs?: number;
437
560
  readonly delayMaxMs?: number;
561
+ /** Mock trace events for testing tool_trajectory evaluator */
562
+ readonly trace?: readonly TraceEvent[];
438
563
  }
439
564
  interface VSCodeResolvedConfig {
440
565
  readonly command: string;
@@ -511,7 +636,7 @@ type ResolvedTarget = {
511
636
  readonly providerBatching?: boolean;
512
637
  readonly config: CliResolvedConfig;
513
638
  };
514
- declare function resolveTargetDefinition(definition: TargetDefinition, env?: EnvLookup): ResolvedTarget;
639
+ declare function resolveTargetDefinition(definition: TargetDefinition, env?: EnvLookup, evalFilePath?: string): ResolvedTarget;
515
640
 
516
641
  declare function readTargetDefinitions(filePath: string): Promise<readonly TargetDefinition[]>;
517
642
  declare function listTargetNames(definitions: readonly TargetDefinition[]): readonly string[];
@@ -563,6 +688,10 @@ interface EvaluationContext {
563
688
  readonly judgeProvider?: Provider;
564
689
  readonly evaluatorTemplateOverride?: string;
565
690
  readonly evaluator?: EvaluatorConfig;
691
+ /** Normalized trace events from provider execution (if available) */
692
+ readonly candidateTrace?: readonly TraceEvent[];
693
+ /** Lightweight summary of trace events (if available) */
694
+ readonly candidateTraceSummary?: TraceSummary;
566
695
  }
567
696
  interface EvaluationScore {
568
697
  readonly score: number;
@@ -624,6 +753,30 @@ declare class CodeEvaluator implements Evaluator {
624
753
  constructor(options: CodeEvaluatorOptions);
625
754
  evaluate(context: EvaluationContext): Promise<EvaluationScore>;
626
755
  }
756
+ interface ToolTrajectoryEvaluatorOptions {
757
+ readonly config: ToolTrajectoryEvaluatorConfig;
758
+ }
759
+ declare class ToolTrajectoryEvaluator implements Evaluator {
760
+ readonly kind = "tool_trajectory";
761
+ private readonly config;
762
+ constructor(options: ToolTrajectoryEvaluatorOptions);
763
+ evaluate(context: EvaluationContext): EvaluationScore;
764
+ private evaluateAnyOrder;
765
+ private evaluateInOrder;
766
+ private evaluateExact;
767
+ }
768
+ /**
769
+ * Evaluator that validates tool_calls in expected_messages against the actual trace.
770
+ * Extracts tool_calls from assistant messages in expected_messages and compares them
771
+ * sequentially against tool_call events in the trace.
772
+ */
773
+ declare class ExpectedMessagesEvaluator implements Evaluator {
774
+ readonly kind = "expected_messages";
775
+ evaluate(context: EvaluationContext): EvaluationScore;
776
+ private extractExpectedToolCalls;
777
+ private validateToolCalls;
778
+ private deepEquals;
779
+ }
627
780
  interface EvaluatorFactory {
628
781
  create(config: EvaluatorConfig, context: EvaluationContext): Evaluator;
629
782
  }
@@ -714,4 +867,4 @@ type AgentKernel = {
714
867
  };
715
868
  declare function createAgentKernel(): AgentKernel;
716
869
 
717
- export { type AgentKernel, type AnthropicResolvedConfig, type AssistantTestMessage, type AzureResolvedConfig, type ChildEvaluatorResult, type CliResolvedConfig, CodeEvaluator, type CodeEvaluatorConfig, type CodeEvaluatorOptions, type CompositeAggregatorConfig, CompositeEvaluator, type CompositeEvaluatorConfig, type CompositeEvaluatorOptions, type EnsureSubagentsOptions, type EnsureSubagentsResult, type EnvLookup, type EvalCase, type EvaluationCache, type EvaluationContext, type EvaluationResult, type EvaluationScore, type EvaluationVerdict, type Evaluator, type EvaluatorConfig, type EvaluatorFactory, type EvaluatorKind, type EvaluatorResult, type GeminiResolvedConfig, type GenerateRubricsOptions, type JsonObject, type JsonPrimitive, type JsonValue, LlmJudgeEvaluator, type LlmJudgeEvaluatorConfig, type LlmJudgeEvaluatorOptions, type MockResolvedConfig, type ProgressEvent, type PromptInputs, type Provider, type ProviderKind, type ProviderRequest, type ProviderResponse, type ResolvedTarget, type RubricItem, type RunEvalCaseOptions, type RunEvaluationOptions, type SystemTestMessage, TEST_MESSAGE_ROLES, type TargetDefinition, type TestMessage, type TestMessageContent, type TestMessageRole, type ToolTestMessage, type UserTestMessage, type VSCodeResolvedConfig, buildDirectoryChain, buildPromptInputs, buildSearchRoots, consumeCodexLogEntries, createAgentKernel, createProvider, ensureVSCodeSubagents, extractCodeBlocks, fileExists, findGitRoot, generateRubrics, getHitCount, isEvaluatorKind, isGuidelineFile, isJsonObject, isJsonValue, isTestMessage, isTestMessageRole, listTargetNames, loadEvalCases, normalizeLineEndings, readTargetDefinitions, readTestSuiteMetadata, readTextFile, resolveAndCreateProvider, resolveFileReference, resolveTargetDefinition, runEvalCase, runEvaluation, subscribeToCodexLogEntries };
870
+ export { type AgentKernel, type AnthropicResolvedConfig, type AssistantTestMessage, type AzureResolvedConfig, type ChildEvaluatorResult, type CliResolvedConfig, CodeEvaluator, type CodeEvaluatorConfig, type CodeEvaluatorOptions, type CompositeAggregatorConfig, CompositeEvaluator, type CompositeEvaluatorConfig, type CompositeEvaluatorOptions, type EnsureSubagentsOptions, type EnsureSubagentsResult, type EnvLookup, type EvalCase, type EvaluationCache, type EvaluationContext, type EvaluationResult, type EvaluationScore, type EvaluationVerdict, type Evaluator, type EvaluatorConfig, type EvaluatorFactory, type EvaluatorKind, type EvaluatorResult, ExpectedMessagesEvaluator, type ExpectedMessagesEvaluatorConfig, type ExpectedToolCall, type GeminiResolvedConfig, type GenerateRubricsOptions, type JsonObject, type JsonPrimitive, type JsonValue, LlmJudgeEvaluator, type LlmJudgeEvaluatorConfig, type LlmJudgeEvaluatorOptions, type MockResolvedConfig, type ProgressEvent, type PromptInputs, type Provider, type ProviderKind, type ProviderRequest, type ProviderResponse, type ResolvedTarget, type RubricItem, type RunEvalCaseOptions, type RunEvaluationOptions, type SystemTestMessage, TEST_MESSAGE_ROLES, type TargetDefinition, type TestMessage, type TestMessageContent, type TestMessageRole, type TestMessageToolCall, type ToolTestMessage, ToolTrajectoryEvaluator, type ToolTrajectoryEvaluatorConfig, type ToolTrajectoryEvaluatorOptions, type ToolTrajectoryExpectedItem, type TraceEvent, type TraceEventType, type TraceSummary, type UserTestMessage, type VSCodeResolvedConfig, buildDirectoryChain, buildPromptInputs, buildSearchRoots, computeTraceSummary, consumeCodexLogEntries, createAgentKernel, createProvider, ensureVSCodeSubagents, extractCodeBlocks, fileExists, findGitRoot, generateRubrics, getHitCount, isEvaluatorKind, isExpectedToolCall, isGuidelineFile, isJsonObject, isJsonValue, isTestMessage, isTestMessageRole, isTraceEvent, isTraceEventType, listTargetNames, loadEvalCases, normalizeLineEndings, readJsonFile, readTargetDefinitions, readTestSuiteMetadata, readTextFile, resolveAndCreateProvider, resolveFileReference, resolveTargetDefinition, runEvalCase, runEvaluation, subscribeToCodexLogEntries };
package/dist/index.d.ts CHANGED
@@ -1,5 +1,99 @@
1
1
  import * as ai from 'ai';
2
2
 
3
+ /**
4
+ * Trace event types for capturing agent execution traces.
5
+ * Provides a normalized, provider-agnostic model for tool-call trajectories.
6
+ */
7
+ /**
8
+ * Supported trace event types.
9
+ */
10
+ type TraceEventType = 'model_step' | 'tool_call' | 'tool_result' | 'message' | 'error';
11
+ /**
12
+ * Normalized trace event representing a single step in agent execution.
13
+ * Provider-agnostic format for tool-call trajectory evaluation.
14
+ */
15
+ interface TraceEvent {
16
+ /** Event type */
17
+ readonly type: TraceEventType;
18
+ /** ISO 8601 timestamp */
19
+ readonly timestamp: string;
20
+ /** Stable identifier for pairing tool_call/tool_result */
21
+ readonly id?: string;
22
+ /** Tool name (for tool_call/tool_result) */
23
+ readonly name?: string;
24
+ /** Tool input - any JSON value */
25
+ readonly input?: unknown;
26
+ /** Tool output - any JSON value */
27
+ readonly output?: unknown;
28
+ /** Message content (for message/model_step) */
29
+ readonly text?: string;
30
+ /** Provider-specific metadata */
31
+ readonly metadata?: Record<string, unknown>;
32
+ }
33
+ /**
34
+ * Compact summary of a trace for lightweight persistence.
35
+ * Included in results by default to avoid payload bloat.
36
+ */
37
+ interface TraceSummary {
38
+ /** Total number of events in trace */
39
+ readonly eventCount: number;
40
+ /** Unique tool names, sorted alphabetically */
41
+ readonly toolNames: readonly string[];
42
+ /** Map of tool name to call count */
43
+ readonly toolCallsByName: Readonly<Record<string, number>>;
44
+ /** Number of error events */
45
+ readonly errorCount: number;
46
+ }
47
+ /**
48
+ * Configuration for tool_trajectory evaluator.
49
+ */
50
+ interface ToolTrajectoryEvaluatorConfig {
51
+ readonly name: string;
52
+ readonly type: 'tool_trajectory';
53
+ /** Matching mode */
54
+ readonly mode: 'any_order' | 'in_order' | 'exact';
55
+ /** Minimum call counts per tool (for any_order mode) */
56
+ readonly minimums?: Readonly<Record<string, number>>;
57
+ /** Expected tool sequence (for in_order/exact modes) */
58
+ readonly expected?: readonly ToolTrajectoryExpectedItem[];
59
+ /** Optional weight for top-level aggregation (defaults to 1.0) */
60
+ readonly weight?: number;
61
+ }
62
+ /**
63
+ * Expected tool call item in a trajectory sequence.
64
+ */
65
+ interface ToolTrajectoryExpectedItem {
66
+ readonly tool: string;
67
+ }
68
+ /**
69
+ * Expected tool call specification for expected_messages validation.
70
+ */
71
+ interface ExpectedToolCall {
72
+ /** Tool name (required) */
73
+ readonly tool: string;
74
+ /** Tool input - if specified, must match exactly */
75
+ readonly input?: unknown;
76
+ /** Tool output - if specified, must match exactly */
77
+ readonly output?: unknown;
78
+ }
79
+ /**
80
+ * Type guard for TraceEventType values.
81
+ */
82
+ declare function isTraceEventType(value: unknown): value is TraceEventType;
83
+ /**
84
+ * Type guard for TraceEvent objects.
85
+ */
86
+ declare function isTraceEvent(value: unknown): value is TraceEvent;
87
+ /**
88
+ * Type guard for ExpectedToolCall objects.
89
+ */
90
+ declare function isExpectedToolCall(value: unknown): value is ExpectedToolCall;
91
+ /**
92
+ * Compute a lightweight summary from a full trace.
93
+ * Used for default result persistence without payload bloat.
94
+ */
95
+ declare function computeTraceSummary(trace: readonly TraceEvent[]): TraceSummary;
96
+
3
97
  /**
4
98
  * JSON primitive values appearing in AgentV payloads.
5
99
  */
@@ -41,12 +135,21 @@ type UserTestMessage = {
41
135
  readonly role: 'user';
42
136
  readonly content: TestMessageContent;
43
137
  };
138
+ /**
139
+ * Tool call specification for expected_messages validation.
140
+ */
141
+ type TestMessageToolCall = {
142
+ readonly tool: string;
143
+ readonly input?: unknown;
144
+ };
44
145
  /**
45
146
  * Assistant response message.
46
147
  */
47
148
  type AssistantTestMessage = {
48
149
  readonly role: 'assistant';
49
150
  readonly content: TestMessageContent;
151
+ /** Optional tool_calls for expected_messages validation against traces */
152
+ readonly tool_calls?: readonly TestMessageToolCall[];
50
153
  };
51
154
  /**
52
155
  * Tool invocation message.
@@ -75,7 +178,7 @@ declare function isJsonValue(value: unknown): value is JsonValue;
75
178
  * Guard validating raw test messages.
76
179
  */
77
180
  declare function isTestMessage(value: unknown): value is TestMessage;
78
- declare const EVALUATOR_KIND_VALUES: readonly ["code_judge", "llm_judge", "rubric", "composite"];
181
+ declare const EVALUATOR_KIND_VALUES: readonly ["code_judge", "llm_judge", "rubric", "composite", "tool_trajectory", "expected_messages"];
79
182
  type EvaluatorKind = (typeof EVALUATOR_KIND_VALUES)[number];
80
183
  declare function isEvaluatorKind(value: unknown): value is EvaluatorKind;
81
184
  type CodeEvaluatorConfig = {
@@ -85,6 +188,7 @@ type CodeEvaluatorConfig = {
85
188
  readonly resolvedScriptPath?: string;
86
189
  readonly cwd?: string;
87
190
  readonly resolvedCwd?: string;
191
+ readonly weight?: number;
88
192
  };
89
193
  type LlmJudgeEvaluatorConfig = {
90
194
  readonly name: string;
@@ -92,6 +196,7 @@ type LlmJudgeEvaluatorConfig = {
92
196
  readonly prompt?: string;
93
197
  readonly promptPath?: string;
94
198
  readonly rubrics?: readonly RubricItem[];
199
+ readonly weight?: number;
95
200
  };
96
201
  type RubricItem = {
97
202
  readonly id: string;
@@ -117,8 +222,14 @@ type CompositeEvaluatorConfig = {
117
222
  readonly type: 'composite';
118
223
  readonly evaluators: readonly EvaluatorConfig[];
119
224
  readonly aggregator: CompositeAggregatorConfig;
225
+ readonly weight?: number;
226
+ };
227
+ type ExpectedMessagesEvaluatorConfig = {
228
+ readonly name: string;
229
+ readonly type: 'expected_messages';
230
+ readonly weight?: number;
120
231
  };
121
- type EvaluatorConfig = CodeEvaluatorConfig | LlmJudgeEvaluatorConfig | CompositeEvaluatorConfig;
232
+ type EvaluatorConfig = CodeEvaluatorConfig | LlmJudgeEvaluatorConfig | CompositeEvaluatorConfig | ToolTrajectoryEvaluatorConfig | ExpectedMessagesEvaluatorConfig;
122
233
  /**
123
234
  * Eval case definition sourced from AgentV specs.
124
235
  */
@@ -159,6 +270,10 @@ interface EvaluationResult {
159
270
  readonly evaluator_provider_request?: JsonObject;
160
271
  readonly evaluator_results?: readonly EvaluatorResult[];
161
272
  readonly error?: string;
273
+ /** Lightweight summary of the execution trace (always included when available) */
274
+ readonly trace_summary?: TraceSummary;
275
+ /** Full trace events (only included when --include-trace flag is set) */
276
+ readonly trace?: readonly TraceEvent[];
162
277
  }
163
278
  type EvaluationVerdict = 'pass' | 'fail' | 'borderline';
164
279
  interface EvaluatorResult {
@@ -206,6 +321,10 @@ interface ProviderResponse {
206
321
  readonly reasoning?: string;
207
322
  readonly raw?: unknown;
208
323
  readonly usage?: JsonObject;
324
+ /** Normalized trace events from agent execution */
325
+ readonly trace?: readonly TraceEvent[];
326
+ /** Reference to external trace file (alternative to inline trace) */
327
+ readonly traceRef?: string;
209
328
  }
210
329
  interface Provider {
211
330
  readonly id: string;
@@ -359,6 +478,10 @@ declare function normalizeLineEndings(content: string): string;
359
478
  * This ensures consistent behavior across Windows (CRLF) and Unix (LF) systems.
360
479
  */
361
480
  declare function readTextFile(filePath: string): Promise<string>;
481
+ /**
482
+ * Read a JSON file and parse it.
483
+ */
484
+ declare function readJsonFile<T = unknown>(filePath: string): Promise<T>;
362
485
  /**
363
486
  * Find git repository root by walking up the directory tree.
364
487
  */
@@ -435,6 +558,8 @@ interface MockResolvedConfig {
435
558
  readonly delayMs?: number;
436
559
  readonly delayMinMs?: number;
437
560
  readonly delayMaxMs?: number;
561
+ /** Mock trace events for testing tool_trajectory evaluator */
562
+ readonly trace?: readonly TraceEvent[];
438
563
  }
439
564
  interface VSCodeResolvedConfig {
440
565
  readonly command: string;
@@ -511,7 +636,7 @@ type ResolvedTarget = {
511
636
  readonly providerBatching?: boolean;
512
637
  readonly config: CliResolvedConfig;
513
638
  };
514
- declare function resolveTargetDefinition(definition: TargetDefinition, env?: EnvLookup): ResolvedTarget;
639
+ declare function resolveTargetDefinition(definition: TargetDefinition, env?: EnvLookup, evalFilePath?: string): ResolvedTarget;
515
640
 
516
641
  declare function readTargetDefinitions(filePath: string): Promise<readonly TargetDefinition[]>;
517
642
  declare function listTargetNames(definitions: readonly TargetDefinition[]): readonly string[];
@@ -563,6 +688,10 @@ interface EvaluationContext {
563
688
  readonly judgeProvider?: Provider;
564
689
  readonly evaluatorTemplateOverride?: string;
565
690
  readonly evaluator?: EvaluatorConfig;
691
+ /** Normalized trace events from provider execution (if available) */
692
+ readonly candidateTrace?: readonly TraceEvent[];
693
+ /** Lightweight summary of trace events (if available) */
694
+ readonly candidateTraceSummary?: TraceSummary;
566
695
  }
567
696
  interface EvaluationScore {
568
697
  readonly score: number;
@@ -624,6 +753,30 @@ declare class CodeEvaluator implements Evaluator {
624
753
  constructor(options: CodeEvaluatorOptions);
625
754
  evaluate(context: EvaluationContext): Promise<EvaluationScore>;
626
755
  }
756
+ interface ToolTrajectoryEvaluatorOptions {
757
+ readonly config: ToolTrajectoryEvaluatorConfig;
758
+ }
759
+ declare class ToolTrajectoryEvaluator implements Evaluator {
760
+ readonly kind = "tool_trajectory";
761
+ private readonly config;
762
+ constructor(options: ToolTrajectoryEvaluatorOptions);
763
+ evaluate(context: EvaluationContext): EvaluationScore;
764
+ private evaluateAnyOrder;
765
+ private evaluateInOrder;
766
+ private evaluateExact;
767
+ }
768
+ /**
769
+ * Evaluator that validates tool_calls in expected_messages against the actual trace.
770
+ * Extracts tool_calls from assistant messages in expected_messages and compares them
771
+ * sequentially against tool_call events in the trace.
772
+ */
773
+ declare class ExpectedMessagesEvaluator implements Evaluator {
774
+ readonly kind = "expected_messages";
775
+ evaluate(context: EvaluationContext): EvaluationScore;
776
+ private extractExpectedToolCalls;
777
+ private validateToolCalls;
778
+ private deepEquals;
779
+ }
627
780
  interface EvaluatorFactory {
628
781
  create(config: EvaluatorConfig, context: EvaluationContext): Evaluator;
629
782
  }
@@ -714,4 +867,4 @@ type AgentKernel = {
714
867
  };
715
868
  declare function createAgentKernel(): AgentKernel;
716
869
 
717
- export { type AgentKernel, type AnthropicResolvedConfig, type AssistantTestMessage, type AzureResolvedConfig, type ChildEvaluatorResult, type CliResolvedConfig, CodeEvaluator, type CodeEvaluatorConfig, type CodeEvaluatorOptions, type CompositeAggregatorConfig, CompositeEvaluator, type CompositeEvaluatorConfig, type CompositeEvaluatorOptions, type EnsureSubagentsOptions, type EnsureSubagentsResult, type EnvLookup, type EvalCase, type EvaluationCache, type EvaluationContext, type EvaluationResult, type EvaluationScore, type EvaluationVerdict, type Evaluator, type EvaluatorConfig, type EvaluatorFactory, type EvaluatorKind, type EvaluatorResult, type GeminiResolvedConfig, type GenerateRubricsOptions, type JsonObject, type JsonPrimitive, type JsonValue, LlmJudgeEvaluator, type LlmJudgeEvaluatorConfig, type LlmJudgeEvaluatorOptions, type MockResolvedConfig, type ProgressEvent, type PromptInputs, type Provider, type ProviderKind, type ProviderRequest, type ProviderResponse, type ResolvedTarget, type RubricItem, type RunEvalCaseOptions, type RunEvaluationOptions, type SystemTestMessage, TEST_MESSAGE_ROLES, type TargetDefinition, type TestMessage, type TestMessageContent, type TestMessageRole, type ToolTestMessage, type UserTestMessage, type VSCodeResolvedConfig, buildDirectoryChain, buildPromptInputs, buildSearchRoots, consumeCodexLogEntries, createAgentKernel, createProvider, ensureVSCodeSubagents, extractCodeBlocks, fileExists, findGitRoot, generateRubrics, getHitCount, isEvaluatorKind, isGuidelineFile, isJsonObject, isJsonValue, isTestMessage, isTestMessageRole, listTargetNames, loadEvalCases, normalizeLineEndings, readTargetDefinitions, readTestSuiteMetadata, readTextFile, resolveAndCreateProvider, resolveFileReference, resolveTargetDefinition, runEvalCase, runEvaluation, subscribeToCodexLogEntries };
870
+ export { type AgentKernel, type AnthropicResolvedConfig, type AssistantTestMessage, type AzureResolvedConfig, type ChildEvaluatorResult, type CliResolvedConfig, CodeEvaluator, type CodeEvaluatorConfig, type CodeEvaluatorOptions, type CompositeAggregatorConfig, CompositeEvaluator, type CompositeEvaluatorConfig, type CompositeEvaluatorOptions, type EnsureSubagentsOptions, type EnsureSubagentsResult, type EnvLookup, type EvalCase, type EvaluationCache, type EvaluationContext, type EvaluationResult, type EvaluationScore, type EvaluationVerdict, type Evaluator, type EvaluatorConfig, type EvaluatorFactory, type EvaluatorKind, type EvaluatorResult, ExpectedMessagesEvaluator, type ExpectedMessagesEvaluatorConfig, type ExpectedToolCall, type GeminiResolvedConfig, type GenerateRubricsOptions, type JsonObject, type JsonPrimitive, type JsonValue, LlmJudgeEvaluator, type LlmJudgeEvaluatorConfig, type LlmJudgeEvaluatorOptions, type MockResolvedConfig, type ProgressEvent, type PromptInputs, type Provider, type ProviderKind, type ProviderRequest, type ProviderResponse, type ResolvedTarget, type RubricItem, type RunEvalCaseOptions, type RunEvaluationOptions, type SystemTestMessage, TEST_MESSAGE_ROLES, type TargetDefinition, type TestMessage, type TestMessageContent, type TestMessageRole, type TestMessageToolCall, type ToolTestMessage, ToolTrajectoryEvaluator, type ToolTrajectoryEvaluatorConfig, type ToolTrajectoryEvaluatorOptions, type ToolTrajectoryExpectedItem, type TraceEvent, type TraceEventType, type TraceSummary, type UserTestMessage, type VSCodeResolvedConfig, buildDirectoryChain, buildPromptInputs, buildSearchRoots, computeTraceSummary, consumeCodexLogEntries, createAgentKernel, createProvider, ensureVSCodeSubagents, extractCodeBlocks, fileExists, findGitRoot, generateRubrics, getHitCount, isEvaluatorKind, isExpectedToolCall, isGuidelineFile, isJsonObject, isJsonValue, isTestMessage, isTestMessageRole, isTraceEvent, isTraceEventType, listTargetNames, loadEvalCases, normalizeLineEndings, readJsonFile, readTargetDefinitions, readTestSuiteMetadata, readTextFile, resolveAndCreateProvider, resolveFileReference, resolveTargetDefinition, runEvalCase, runEvaluation, subscribeToCodexLogEntries };