@agentv/core 0.26.0 → 1.2.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
package/dist/index.d.cts CHANGED
@@ -65,17 +65,6 @@ interface ToolTrajectoryEvaluatorConfig {
65
65
  interface ToolTrajectoryExpectedItem {
66
66
  readonly tool: string;
67
67
  }
68
- /**
69
- * Expected tool call specification for expected_messages validation.
70
- */
71
- interface ExpectedToolCall {
72
- /** Tool name (required) */
73
- readonly tool: string;
74
- /** Tool input - if specified, must match exactly */
75
- readonly input?: unknown;
76
- /** Tool output - if specified, must match exactly */
77
- readonly output?: unknown;
78
- }
79
68
  /**
80
69
  * Type guard for TraceEventType values.
81
70
  */
@@ -84,10 +73,6 @@ declare function isTraceEventType(value: unknown): value is TraceEventType;
84
73
  * Type guard for TraceEvent objects.
85
74
  */
86
75
  declare function isTraceEvent(value: unknown): value is TraceEvent;
87
- /**
88
- * Type guard for ExpectedToolCall objects.
89
- */
90
- declare function isExpectedToolCall(value: unknown): value is ExpectedToolCall;
91
76
  /**
92
77
  * Compute a lightweight summary from a full trace.
93
78
  * Used for default result persistence without payload bloat.
@@ -135,21 +120,12 @@ type UserTestMessage = {
135
120
  readonly role: 'user';
136
121
  readonly content: TestMessageContent;
137
122
  };
138
- /**
139
- * Tool call specification for expected_messages validation.
140
- */
141
- type TestMessageToolCall = {
142
- readonly tool: string;
143
- readonly input?: unknown;
144
- };
145
123
  /**
146
124
  * Assistant response message.
147
125
  */
148
126
  type AssistantTestMessage = {
149
127
  readonly role: 'assistant';
150
128
  readonly content: TestMessageContent;
151
- /** Optional tool_calls for expected_messages validation against traces */
152
- readonly tool_calls?: readonly TestMessageToolCall[];
153
129
  };
154
130
  /**
155
131
  * Tool invocation message.
@@ -176,9 +152,12 @@ declare function isJsonObject(value: unknown): value is JsonObject;
176
152
  declare function isJsonValue(value: unknown): value is JsonValue;
177
153
  /**
178
154
  * Guard validating raw test messages.
155
+ * A valid test message has:
156
+ * - A valid role (system, user, assistant, tool)
157
+ * - Either content (string or array of objects) OR tool_calls (for assistant messages)
179
158
  */
180
159
  declare function isTestMessage(value: unknown): value is TestMessage;
181
- declare const EVALUATOR_KIND_VALUES: readonly ["code_judge", "llm_judge", "rubric", "composite", "tool_trajectory", "expected_messages"];
160
+ declare const EVALUATOR_KIND_VALUES: readonly ["code_judge", "llm_judge", "rubric", "composite", "tool_trajectory"];
182
161
  type EvaluatorKind = (typeof EVALUATOR_KIND_VALUES)[number];
183
162
  declare function isEvaluatorKind(value: unknown): value is EvaluatorKind;
184
163
  type CodeEvaluatorConfig = {
@@ -224,12 +203,7 @@ type CompositeEvaluatorConfig = {
224
203
  readonly aggregator: CompositeAggregatorConfig;
225
204
  readonly weight?: number;
226
205
  };
227
- type ExpectedMessagesEvaluatorConfig = {
228
- readonly name: string;
229
- readonly type: 'expected_messages';
230
- readonly weight?: number;
231
- };
232
- type EvaluatorConfig = CodeEvaluatorConfig | LlmJudgeEvaluatorConfig | CompositeEvaluatorConfig | ToolTrajectoryEvaluatorConfig | ExpectedMessagesEvaluatorConfig;
206
+ type EvaluatorConfig = CodeEvaluatorConfig | LlmJudgeEvaluatorConfig | CompositeEvaluatorConfig | ToolTrajectoryEvaluatorConfig;
233
207
  /**
234
208
  * Eval case definition sourced from AgentV specs.
235
209
  */
@@ -240,7 +214,7 @@ interface EvalCase {
240
214
  readonly question: string;
241
215
  readonly input_messages: readonly TestMessage[];
242
216
  readonly input_segments: readonly JsonObject[];
243
- readonly expected_segments: readonly JsonObject[];
217
+ readonly expected_messages: readonly JsonObject[];
244
218
  readonly reference_answer?: string;
245
219
  readonly guideline_paths: readonly string[];
246
220
  readonly guideline_patterns?: readonly string[];
@@ -690,6 +664,8 @@ interface EvaluationContext {
690
664
  readonly evaluator?: EvaluatorConfig;
691
665
  /** Normalized trace events from provider execution (if available) */
692
666
  readonly candidateTrace?: readonly TraceEvent[];
667
+ /** File path to trace data (alternative to inline candidateTrace) */
668
+ readonly candidateTraceRef?: string;
693
669
  /** Lightweight summary of trace events (if available) */
694
670
  readonly candidateTraceSummary?: TraceSummary;
695
671
  }
@@ -765,18 +741,6 @@ declare class ToolTrajectoryEvaluator implements Evaluator {
765
741
  private evaluateInOrder;
766
742
  private evaluateExact;
767
743
  }
768
- /**
769
- * Evaluator that validates tool_calls in expected_messages against the actual trace.
770
- * Extracts tool_calls from assistant messages in expected_messages and compares them
771
- * sequentially against tool_call events in the trace.
772
- */
773
- declare class ExpectedMessagesEvaluator implements Evaluator {
774
- readonly kind = "expected_messages";
775
- evaluate(context: EvaluationContext): EvaluationScore;
776
- private extractExpectedToolCalls;
777
- private validateToolCalls;
778
- private deepEquals;
779
- }
780
744
  interface EvaluatorFactory {
781
745
  create(config: EvaluatorConfig, context: EvaluationContext): Evaluator;
782
746
  }
@@ -867,4 +831,4 @@ type AgentKernel = {
867
831
  };
868
832
  declare function createAgentKernel(): AgentKernel;
869
833
 
870
- export { type AgentKernel, type AnthropicResolvedConfig, type AssistantTestMessage, type AzureResolvedConfig, type ChildEvaluatorResult, type CliResolvedConfig, CodeEvaluator, type CodeEvaluatorConfig, type CodeEvaluatorOptions, type CompositeAggregatorConfig, CompositeEvaluator, type CompositeEvaluatorConfig, type CompositeEvaluatorOptions, type EnsureSubagentsOptions, type EnsureSubagentsResult, type EnvLookup, type EvalCase, type EvaluationCache, type EvaluationContext, type EvaluationResult, type EvaluationScore, type EvaluationVerdict, type Evaluator, type EvaluatorConfig, type EvaluatorFactory, type EvaluatorKind, type EvaluatorResult, ExpectedMessagesEvaluator, type ExpectedMessagesEvaluatorConfig, type ExpectedToolCall, type GeminiResolvedConfig, type GenerateRubricsOptions, type JsonObject, type JsonPrimitive, type JsonValue, LlmJudgeEvaluator, type LlmJudgeEvaluatorConfig, type LlmJudgeEvaluatorOptions, type MockResolvedConfig, type ProgressEvent, type PromptInputs, type Provider, type ProviderKind, type ProviderRequest, type ProviderResponse, type ResolvedTarget, type RubricItem, type RunEvalCaseOptions, type RunEvaluationOptions, type SystemTestMessage, TEST_MESSAGE_ROLES, type TargetDefinition, type TestMessage, type TestMessageContent, type TestMessageRole, type TestMessageToolCall, type ToolTestMessage, ToolTrajectoryEvaluator, type ToolTrajectoryEvaluatorConfig, type ToolTrajectoryEvaluatorOptions, type ToolTrajectoryExpectedItem, type TraceEvent, type TraceEventType, type TraceSummary, type UserTestMessage, type VSCodeResolvedConfig, buildDirectoryChain, buildPromptInputs, buildSearchRoots, computeTraceSummary, consumeCodexLogEntries, createAgentKernel, createProvider, ensureVSCodeSubagents, extractCodeBlocks, fileExists, findGitRoot, generateRubrics, getHitCount, isEvaluatorKind, isExpectedToolCall, isGuidelineFile, isJsonObject, isJsonValue, isTestMessage, isTestMessageRole, isTraceEvent, isTraceEventType, listTargetNames, loadEvalCases, normalizeLineEndings, readJsonFile, readTargetDefinitions, readTestSuiteMetadata, readTextFile, resolveAndCreateProvider, resolveFileReference, resolveTargetDefinition, runEvalCase, runEvaluation, subscribeToCodexLogEntries };
834
+ export { type AgentKernel, type AnthropicResolvedConfig, type AssistantTestMessage, type AzureResolvedConfig, type ChildEvaluatorResult, type CliResolvedConfig, CodeEvaluator, type CodeEvaluatorConfig, type CodeEvaluatorOptions, type CompositeAggregatorConfig, CompositeEvaluator, type CompositeEvaluatorConfig, type CompositeEvaluatorOptions, type EnsureSubagentsOptions, type EnsureSubagentsResult, type EnvLookup, type EvalCase, type EvaluationCache, type EvaluationContext, type EvaluationResult, type EvaluationScore, type EvaluationVerdict, type Evaluator, type EvaluatorConfig, type EvaluatorFactory, type EvaluatorKind, type EvaluatorResult, type GeminiResolvedConfig, type GenerateRubricsOptions, type JsonObject, type JsonPrimitive, type JsonValue, LlmJudgeEvaluator, type LlmJudgeEvaluatorConfig, type LlmJudgeEvaluatorOptions, type MockResolvedConfig, type ProgressEvent, type PromptInputs, type Provider, type ProviderKind, type ProviderRequest, type ProviderResponse, type ResolvedTarget, type RubricItem, type RunEvalCaseOptions, type RunEvaluationOptions, type SystemTestMessage, TEST_MESSAGE_ROLES, type TargetDefinition, type TestMessage, type TestMessageContent, type TestMessageRole, type ToolTestMessage, ToolTrajectoryEvaluator, type ToolTrajectoryEvaluatorConfig, type ToolTrajectoryEvaluatorOptions, type ToolTrajectoryExpectedItem, type TraceEvent, type TraceEventType, type TraceSummary, type UserTestMessage, type VSCodeResolvedConfig, buildDirectoryChain, buildPromptInputs, buildSearchRoots, computeTraceSummary, consumeCodexLogEntries, createAgentKernel, createProvider, ensureVSCodeSubagents, extractCodeBlocks, fileExists, findGitRoot, generateRubrics, getHitCount, isEvaluatorKind, isGuidelineFile, isJsonObject, isJsonValue, isTestMessage, isTestMessageRole, isTraceEvent, isTraceEventType, listTargetNames, loadEvalCases, normalizeLineEndings, readJsonFile, readTargetDefinitions, readTestSuiteMetadata, readTextFile, resolveAndCreateProvider, resolveFileReference, resolveTargetDefinition, runEvalCase, runEvaluation, subscribeToCodexLogEntries };
package/dist/index.d.ts CHANGED
@@ -65,17 +65,6 @@ interface ToolTrajectoryEvaluatorConfig {
65
65
  interface ToolTrajectoryExpectedItem {
66
66
  readonly tool: string;
67
67
  }
68
- /**
69
- * Expected tool call specification for expected_messages validation.
70
- */
71
- interface ExpectedToolCall {
72
- /** Tool name (required) */
73
- readonly tool: string;
74
- /** Tool input - if specified, must match exactly */
75
- readonly input?: unknown;
76
- /** Tool output - if specified, must match exactly */
77
- readonly output?: unknown;
78
- }
79
68
  /**
80
69
  * Type guard for TraceEventType values.
81
70
  */
@@ -84,10 +73,6 @@ declare function isTraceEventType(value: unknown): value is TraceEventType;
84
73
  * Type guard for TraceEvent objects.
85
74
  */
86
75
  declare function isTraceEvent(value: unknown): value is TraceEvent;
87
- /**
88
- * Type guard for ExpectedToolCall objects.
89
- */
90
- declare function isExpectedToolCall(value: unknown): value is ExpectedToolCall;
91
76
  /**
92
77
  * Compute a lightweight summary from a full trace.
93
78
  * Used for default result persistence without payload bloat.
@@ -135,21 +120,12 @@ type UserTestMessage = {
135
120
  readonly role: 'user';
136
121
  readonly content: TestMessageContent;
137
122
  };
138
- /**
139
- * Tool call specification for expected_messages validation.
140
- */
141
- type TestMessageToolCall = {
142
- readonly tool: string;
143
- readonly input?: unknown;
144
- };
145
123
  /**
146
124
  * Assistant response message.
147
125
  */
148
126
  type AssistantTestMessage = {
149
127
  readonly role: 'assistant';
150
128
  readonly content: TestMessageContent;
151
- /** Optional tool_calls for expected_messages validation against traces */
152
- readonly tool_calls?: readonly TestMessageToolCall[];
153
129
  };
154
130
  /**
155
131
  * Tool invocation message.
@@ -176,9 +152,12 @@ declare function isJsonObject(value: unknown): value is JsonObject;
176
152
  declare function isJsonValue(value: unknown): value is JsonValue;
177
153
  /**
178
154
  * Guard validating raw test messages.
155
+ * A valid test message has:
156
+ * - A valid role (system, user, assistant, tool)
157
+ * - Either content (string or array of objects) OR tool_calls (for assistant messages)
179
158
  */
180
159
  declare function isTestMessage(value: unknown): value is TestMessage;
181
- declare const EVALUATOR_KIND_VALUES: readonly ["code_judge", "llm_judge", "rubric", "composite", "tool_trajectory", "expected_messages"];
160
+ declare const EVALUATOR_KIND_VALUES: readonly ["code_judge", "llm_judge", "rubric", "composite", "tool_trajectory"];
182
161
  type EvaluatorKind = (typeof EVALUATOR_KIND_VALUES)[number];
183
162
  declare function isEvaluatorKind(value: unknown): value is EvaluatorKind;
184
163
  type CodeEvaluatorConfig = {
@@ -224,12 +203,7 @@ type CompositeEvaluatorConfig = {
224
203
  readonly aggregator: CompositeAggregatorConfig;
225
204
  readonly weight?: number;
226
205
  };
227
- type ExpectedMessagesEvaluatorConfig = {
228
- readonly name: string;
229
- readonly type: 'expected_messages';
230
- readonly weight?: number;
231
- };
232
- type EvaluatorConfig = CodeEvaluatorConfig | LlmJudgeEvaluatorConfig | CompositeEvaluatorConfig | ToolTrajectoryEvaluatorConfig | ExpectedMessagesEvaluatorConfig;
206
+ type EvaluatorConfig = CodeEvaluatorConfig | LlmJudgeEvaluatorConfig | CompositeEvaluatorConfig | ToolTrajectoryEvaluatorConfig;
233
207
  /**
234
208
  * Eval case definition sourced from AgentV specs.
235
209
  */
@@ -240,7 +214,7 @@ interface EvalCase {
240
214
  readonly question: string;
241
215
  readonly input_messages: readonly TestMessage[];
242
216
  readonly input_segments: readonly JsonObject[];
243
- readonly expected_segments: readonly JsonObject[];
217
+ readonly expected_messages: readonly JsonObject[];
244
218
  readonly reference_answer?: string;
245
219
  readonly guideline_paths: readonly string[];
246
220
  readonly guideline_patterns?: readonly string[];
@@ -690,6 +664,8 @@ interface EvaluationContext {
690
664
  readonly evaluator?: EvaluatorConfig;
691
665
  /** Normalized trace events from provider execution (if available) */
692
666
  readonly candidateTrace?: readonly TraceEvent[];
667
+ /** File path to trace data (alternative to inline candidateTrace) */
668
+ readonly candidateTraceRef?: string;
693
669
  /** Lightweight summary of trace events (if available) */
694
670
  readonly candidateTraceSummary?: TraceSummary;
695
671
  }
@@ -765,18 +741,6 @@ declare class ToolTrajectoryEvaluator implements Evaluator {
765
741
  private evaluateInOrder;
766
742
  private evaluateExact;
767
743
  }
768
- /**
769
- * Evaluator that validates tool_calls in expected_messages against the actual trace.
770
- * Extracts tool_calls from assistant messages in expected_messages and compares them
771
- * sequentially against tool_call events in the trace.
772
- */
773
- declare class ExpectedMessagesEvaluator implements Evaluator {
774
- readonly kind = "expected_messages";
775
- evaluate(context: EvaluationContext): EvaluationScore;
776
- private extractExpectedToolCalls;
777
- private validateToolCalls;
778
- private deepEquals;
779
- }
780
744
  interface EvaluatorFactory {
781
745
  create(config: EvaluatorConfig, context: EvaluationContext): Evaluator;
782
746
  }
@@ -867,4 +831,4 @@ type AgentKernel = {
867
831
  };
868
832
  declare function createAgentKernel(): AgentKernel;
869
833
 
870
- export { type AgentKernel, type AnthropicResolvedConfig, type AssistantTestMessage, type AzureResolvedConfig, type ChildEvaluatorResult, type CliResolvedConfig, CodeEvaluator, type CodeEvaluatorConfig, type CodeEvaluatorOptions, type CompositeAggregatorConfig, CompositeEvaluator, type CompositeEvaluatorConfig, type CompositeEvaluatorOptions, type EnsureSubagentsOptions, type EnsureSubagentsResult, type EnvLookup, type EvalCase, type EvaluationCache, type EvaluationContext, type EvaluationResult, type EvaluationScore, type EvaluationVerdict, type Evaluator, type EvaluatorConfig, type EvaluatorFactory, type EvaluatorKind, type EvaluatorResult, ExpectedMessagesEvaluator, type ExpectedMessagesEvaluatorConfig, type ExpectedToolCall, type GeminiResolvedConfig, type GenerateRubricsOptions, type JsonObject, type JsonPrimitive, type JsonValue, LlmJudgeEvaluator, type LlmJudgeEvaluatorConfig, type LlmJudgeEvaluatorOptions, type MockResolvedConfig, type ProgressEvent, type PromptInputs, type Provider, type ProviderKind, type ProviderRequest, type ProviderResponse, type ResolvedTarget, type RubricItem, type RunEvalCaseOptions, type RunEvaluationOptions, type SystemTestMessage, TEST_MESSAGE_ROLES, type TargetDefinition, type TestMessage, type TestMessageContent, type TestMessageRole, type TestMessageToolCall, type ToolTestMessage, ToolTrajectoryEvaluator, type ToolTrajectoryEvaluatorConfig, type ToolTrajectoryEvaluatorOptions, type ToolTrajectoryExpectedItem, type TraceEvent, type TraceEventType, type TraceSummary, type UserTestMessage, type VSCodeResolvedConfig, buildDirectoryChain, buildPromptInputs, buildSearchRoots, computeTraceSummary, consumeCodexLogEntries, createAgentKernel, createProvider, ensureVSCodeSubagents, extractCodeBlocks, fileExists, findGitRoot, generateRubrics, getHitCount, isEvaluatorKind, isExpectedToolCall, isGuidelineFile, isJsonObject, isJsonValue, isTestMessage, isTestMessageRole, isTraceEvent, isTraceEventType, listTargetNames, loadEvalCases, normalizeLineEndings, readJsonFile, readTargetDefinitions, readTestSuiteMetadata, readTextFile, resolveAndCreateProvider, resolveFileReference, resolveTargetDefinition, runEvalCase, runEvaluation, subscribeToCodexLogEntries };
834
+ export { type AgentKernel, type AnthropicResolvedConfig, type AssistantTestMessage, type AzureResolvedConfig, type ChildEvaluatorResult, type CliResolvedConfig, CodeEvaluator, type CodeEvaluatorConfig, type CodeEvaluatorOptions, type CompositeAggregatorConfig, CompositeEvaluator, type CompositeEvaluatorConfig, type CompositeEvaluatorOptions, type EnsureSubagentsOptions, type EnsureSubagentsResult, type EnvLookup, type EvalCase, type EvaluationCache, type EvaluationContext, type EvaluationResult, type EvaluationScore, type EvaluationVerdict, type Evaluator, type EvaluatorConfig, type EvaluatorFactory, type EvaluatorKind, type EvaluatorResult, type GeminiResolvedConfig, type GenerateRubricsOptions, type JsonObject, type JsonPrimitive, type JsonValue, LlmJudgeEvaluator, type LlmJudgeEvaluatorConfig, type LlmJudgeEvaluatorOptions, type MockResolvedConfig, type ProgressEvent, type PromptInputs, type Provider, type ProviderKind, type ProviderRequest, type ProviderResponse, type ResolvedTarget, type RubricItem, type RunEvalCaseOptions, type RunEvaluationOptions, type SystemTestMessage, TEST_MESSAGE_ROLES, type TargetDefinition, type TestMessage, type TestMessageContent, type TestMessageRole, type ToolTestMessage, ToolTrajectoryEvaluator, type ToolTrajectoryEvaluatorConfig, type ToolTrajectoryEvaluatorOptions, type ToolTrajectoryExpectedItem, type TraceEvent, type TraceEventType, type TraceSummary, type UserTestMessage, type VSCodeResolvedConfig, buildDirectoryChain, buildPromptInputs, buildSearchRoots, computeTraceSummary, consumeCodexLogEntries, createAgentKernel, createProvider, ensureVSCodeSubagents, extractCodeBlocks, fileExists, findGitRoot, generateRubrics, getHitCount, isEvaluatorKind, isGuidelineFile, isJsonObject, isJsonValue, isTestMessage, isTestMessageRole, isTraceEvent, isTraceEventType, listTargetNames, loadEvalCases, normalizeLineEndings, readJsonFile, readTargetDefinitions, readTestSuiteMetadata, readTextFile, resolveAndCreateProvider, resolveFileReference, resolveTargetDefinition, runEvalCase, runEvaluation, subscribeToCodexLogEntries };