@agentv/core 0.26.0 → 1.0.0
This diff compares publicly available package versions that have been released to one of the supported registries. It is provided for informational purposes only and reflects the changes between those package versions as they appear in their respective public registries.
- package/dist/{chunk-NDEN3H2B.js → chunk-V3JCB3HI.js} +1 -1
- package/dist/chunk-V3JCB3HI.js.map +1 -0
- package/dist/evaluation/validation/index.cjs.map +1 -1
- package/dist/evaluation/validation/index.js +1 -1
- package/dist/evaluation/validation/index.js.map +1 -1
- package/dist/index.cjs +12 -12
- package/dist/index.cjs.map +1 -1
- package/dist/index.d.cts +7 -7
- package/dist/index.d.ts +7 -7
- package/dist/index.js +12 -12
- package/dist/index.js.map +1 -1
- package/package.json +1 -1
- package/dist/chunk-NDEN3H2B.js.map +0 -1
package/dist/index.d.cts
CHANGED
|
@@ -178,7 +178,7 @@ declare function isJsonValue(value: unknown): value is JsonValue;
|
|
|
178
178
|
* Guard validating raw test messages.
|
|
179
179
|
*/
|
|
180
180
|
declare function isTestMessage(value: unknown): value is TestMessage;
|
|
181
|
-
declare const EVALUATOR_KIND_VALUES: readonly ["code_judge", "llm_judge", "rubric", "composite", "tool_trajectory", "
|
|
181
|
+
declare const EVALUATOR_KIND_VALUES: readonly ["code_judge", "llm_judge", "rubric", "composite", "tool_trajectory", "expected_tool_calls"];
|
|
182
182
|
type EvaluatorKind = (typeof EVALUATOR_KIND_VALUES)[number];
|
|
183
183
|
declare function isEvaluatorKind(value: unknown): value is EvaluatorKind;
|
|
184
184
|
type CodeEvaluatorConfig = {
|
|
@@ -224,12 +224,12 @@ type CompositeEvaluatorConfig = {
|
|
|
224
224
|
readonly aggregator: CompositeAggregatorConfig;
|
|
225
225
|
readonly weight?: number;
|
|
226
226
|
};
|
|
227
|
-
type
|
|
227
|
+
type ExpectedToolCallsEvaluatorConfig = {
|
|
228
228
|
readonly name: string;
|
|
229
|
-
readonly type: '
|
|
229
|
+
readonly type: 'expected_tool_calls';
|
|
230
230
|
readonly weight?: number;
|
|
231
231
|
};
|
|
232
|
-
type EvaluatorConfig = CodeEvaluatorConfig | LlmJudgeEvaluatorConfig | CompositeEvaluatorConfig | ToolTrajectoryEvaluatorConfig |
|
|
232
|
+
type EvaluatorConfig = CodeEvaluatorConfig | LlmJudgeEvaluatorConfig | CompositeEvaluatorConfig | ToolTrajectoryEvaluatorConfig | ExpectedToolCallsEvaluatorConfig;
|
|
233
233
|
/**
|
|
234
234
|
* Eval case definition sourced from AgentV specs.
|
|
235
235
|
*/
|
|
@@ -770,8 +770,8 @@ declare class ToolTrajectoryEvaluator implements Evaluator {
|
|
|
770
770
|
* Extracts tool_calls from assistant messages in expected_messages and compares them
|
|
771
771
|
* sequentially against tool_call events in the trace.
|
|
772
772
|
*/
|
|
773
|
-
declare class
|
|
774
|
-
readonly kind = "
|
|
773
|
+
declare class ExpectedToolCallsEvaluator implements Evaluator {
|
|
774
|
+
readonly kind = "expected_tool_calls";
|
|
775
775
|
evaluate(context: EvaluationContext): EvaluationScore;
|
|
776
776
|
private extractExpectedToolCalls;
|
|
777
777
|
private validateToolCalls;
|
|
@@ -867,4 +867,4 @@ type AgentKernel = {
|
|
|
867
867
|
};
|
|
868
868
|
declare function createAgentKernel(): AgentKernel;
|
|
869
869
|
|
|
870
|
-
export { type AgentKernel, type AnthropicResolvedConfig, type AssistantTestMessage, type AzureResolvedConfig, type ChildEvaluatorResult, type CliResolvedConfig, CodeEvaluator, type CodeEvaluatorConfig, type CodeEvaluatorOptions, type CompositeAggregatorConfig, CompositeEvaluator, type CompositeEvaluatorConfig, type CompositeEvaluatorOptions, type EnsureSubagentsOptions, type EnsureSubagentsResult, type EnvLookup, type EvalCase, type EvaluationCache, type EvaluationContext, type EvaluationResult, type EvaluationScore, type EvaluationVerdict, type Evaluator, type EvaluatorConfig, type EvaluatorFactory, type EvaluatorKind, type EvaluatorResult,
|
|
870
|
+
export { type AgentKernel, type AnthropicResolvedConfig, type AssistantTestMessage, type AzureResolvedConfig, type ChildEvaluatorResult, type CliResolvedConfig, CodeEvaluator, type CodeEvaluatorConfig, type CodeEvaluatorOptions, type CompositeAggregatorConfig, CompositeEvaluator, type CompositeEvaluatorConfig, type CompositeEvaluatorOptions, type EnsureSubagentsOptions, type EnsureSubagentsResult, type EnvLookup, type EvalCase, type EvaluationCache, type EvaluationContext, type EvaluationResult, type EvaluationScore, type EvaluationVerdict, type Evaluator, type EvaluatorConfig, type EvaluatorFactory, type EvaluatorKind, type EvaluatorResult, type ExpectedToolCall, ExpectedToolCallsEvaluator, type ExpectedToolCallsEvaluatorConfig, type GeminiResolvedConfig, type GenerateRubricsOptions, type JsonObject, type JsonPrimitive, type JsonValue, LlmJudgeEvaluator, type LlmJudgeEvaluatorConfig, type LlmJudgeEvaluatorOptions, type MockResolvedConfig, type ProgressEvent, type PromptInputs, type Provider, type ProviderKind, type ProviderRequest, type ProviderResponse, type ResolvedTarget, type RubricItem, type RunEvalCaseOptions, type RunEvaluationOptions, type SystemTestMessage, TEST_MESSAGE_ROLES, type TargetDefinition, type TestMessage, type TestMessageContent, type TestMessageRole, type TestMessageToolCall, type ToolTestMessage, ToolTrajectoryEvaluator, type ToolTrajectoryEvaluatorConfig, type ToolTrajectoryEvaluatorOptions, type ToolTrajectoryExpectedItem, type TraceEvent, type TraceEventType, type TraceSummary, type UserTestMessage, type VSCodeResolvedConfig, buildDirectoryChain, buildPromptInputs, buildSearchRoots, computeTraceSummary, consumeCodexLogEntries, createAgentKernel, createProvider, ensureVSCodeSubagents, extractCodeBlocks, fileExists, findGitRoot, generateRubrics, getHitCount, isEvaluatorKind, isExpectedToolCall, isGuidelineFile, isJsonObject, isJsonValue, isTestMessage, isTestMessageRole, isTraceEvent, isTraceEventType, listTargetNames, loadEvalCases, 
normalizeLineEndings, readJsonFile, readTargetDefinitions, readTestSuiteMetadata, readTextFile, resolveAndCreateProvider, resolveFileReference, resolveTargetDefinition, runEvalCase, runEvaluation, subscribeToCodexLogEntries };
|
package/dist/index.d.ts
CHANGED
|
@@ -178,7 +178,7 @@ declare function isJsonValue(value: unknown): value is JsonValue;
|
|
|
178
178
|
* Guard validating raw test messages.
|
|
179
179
|
*/
|
|
180
180
|
declare function isTestMessage(value: unknown): value is TestMessage;
|
|
181
|
-
declare const EVALUATOR_KIND_VALUES: readonly ["code_judge", "llm_judge", "rubric", "composite", "tool_trajectory", "
|
|
181
|
+
declare const EVALUATOR_KIND_VALUES: readonly ["code_judge", "llm_judge", "rubric", "composite", "tool_trajectory", "expected_tool_calls"];
|
|
182
182
|
type EvaluatorKind = (typeof EVALUATOR_KIND_VALUES)[number];
|
|
183
183
|
declare function isEvaluatorKind(value: unknown): value is EvaluatorKind;
|
|
184
184
|
type CodeEvaluatorConfig = {
|
|
@@ -224,12 +224,12 @@ type CompositeEvaluatorConfig = {
|
|
|
224
224
|
readonly aggregator: CompositeAggregatorConfig;
|
|
225
225
|
readonly weight?: number;
|
|
226
226
|
};
|
|
227
|
-
type
|
|
227
|
+
type ExpectedToolCallsEvaluatorConfig = {
|
|
228
228
|
readonly name: string;
|
|
229
|
-
readonly type: '
|
|
229
|
+
readonly type: 'expected_tool_calls';
|
|
230
230
|
readonly weight?: number;
|
|
231
231
|
};
|
|
232
|
-
type EvaluatorConfig = CodeEvaluatorConfig | LlmJudgeEvaluatorConfig | CompositeEvaluatorConfig | ToolTrajectoryEvaluatorConfig |
|
|
232
|
+
type EvaluatorConfig = CodeEvaluatorConfig | LlmJudgeEvaluatorConfig | CompositeEvaluatorConfig | ToolTrajectoryEvaluatorConfig | ExpectedToolCallsEvaluatorConfig;
|
|
233
233
|
/**
|
|
234
234
|
* Eval case definition sourced from AgentV specs.
|
|
235
235
|
*/
|
|
@@ -770,8 +770,8 @@ declare class ToolTrajectoryEvaluator implements Evaluator {
|
|
|
770
770
|
* Extracts tool_calls from assistant messages in expected_messages and compares them
|
|
771
771
|
* sequentially against tool_call events in the trace.
|
|
772
772
|
*/
|
|
773
|
-
declare class
|
|
774
|
-
readonly kind = "
|
|
773
|
+
declare class ExpectedToolCallsEvaluator implements Evaluator {
|
|
774
|
+
readonly kind = "expected_tool_calls";
|
|
775
775
|
evaluate(context: EvaluationContext): EvaluationScore;
|
|
776
776
|
private extractExpectedToolCalls;
|
|
777
777
|
private validateToolCalls;
|
|
@@ -867,4 +867,4 @@ type AgentKernel = {
|
|
|
867
867
|
};
|
|
868
868
|
declare function createAgentKernel(): AgentKernel;
|
|
869
869
|
|
|
870
|
-
export { type AgentKernel, type AnthropicResolvedConfig, type AssistantTestMessage, type AzureResolvedConfig, type ChildEvaluatorResult, type CliResolvedConfig, CodeEvaluator, type CodeEvaluatorConfig, type CodeEvaluatorOptions, type CompositeAggregatorConfig, CompositeEvaluator, type CompositeEvaluatorConfig, type CompositeEvaluatorOptions, type EnsureSubagentsOptions, type EnsureSubagentsResult, type EnvLookup, type EvalCase, type EvaluationCache, type EvaluationContext, type EvaluationResult, type EvaluationScore, type EvaluationVerdict, type Evaluator, type EvaluatorConfig, type EvaluatorFactory, type EvaluatorKind, type EvaluatorResult,
|
|
870
|
+
export { type AgentKernel, type AnthropicResolvedConfig, type AssistantTestMessage, type AzureResolvedConfig, type ChildEvaluatorResult, type CliResolvedConfig, CodeEvaluator, type CodeEvaluatorConfig, type CodeEvaluatorOptions, type CompositeAggregatorConfig, CompositeEvaluator, type CompositeEvaluatorConfig, type CompositeEvaluatorOptions, type EnsureSubagentsOptions, type EnsureSubagentsResult, type EnvLookup, type EvalCase, type EvaluationCache, type EvaluationContext, type EvaluationResult, type EvaluationScore, type EvaluationVerdict, type Evaluator, type EvaluatorConfig, type EvaluatorFactory, type EvaluatorKind, type EvaluatorResult, type ExpectedToolCall, ExpectedToolCallsEvaluator, type ExpectedToolCallsEvaluatorConfig, type GeminiResolvedConfig, type GenerateRubricsOptions, type JsonObject, type JsonPrimitive, type JsonValue, LlmJudgeEvaluator, type LlmJudgeEvaluatorConfig, type LlmJudgeEvaluatorOptions, type MockResolvedConfig, type ProgressEvent, type PromptInputs, type Provider, type ProviderKind, type ProviderRequest, type ProviderResponse, type ResolvedTarget, type RubricItem, type RunEvalCaseOptions, type RunEvaluationOptions, type SystemTestMessage, TEST_MESSAGE_ROLES, type TargetDefinition, type TestMessage, type TestMessageContent, type TestMessageRole, type TestMessageToolCall, type ToolTestMessage, ToolTrajectoryEvaluator, type ToolTrajectoryEvaluatorConfig, type ToolTrajectoryEvaluatorOptions, type ToolTrajectoryExpectedItem, type TraceEvent, type TraceEventType, type TraceSummary, type UserTestMessage, type VSCodeResolvedConfig, buildDirectoryChain, buildPromptInputs, buildSearchRoots, computeTraceSummary, consumeCodexLogEntries, createAgentKernel, createProvider, ensureVSCodeSubagents, extractCodeBlocks, fileExists, findGitRoot, generateRubrics, getHitCount, isEvaluatorKind, isExpectedToolCall, isGuidelineFile, isJsonObject, isJsonValue, isTestMessage, isTestMessageRole, isTraceEvent, isTraceEventType, listTargetNames, loadEvalCases, 
normalizeLineEndings, readJsonFile, readTargetDefinitions, readTestSuiteMetadata, readTextFile, resolveAndCreateProvider, resolveFileReference, resolveTargetDefinition, runEvalCase, runEvaluation, subscribeToCodexLogEntries };
|
package/dist/index.js
CHANGED
|
@@ -9,7 +9,7 @@ import {
|
|
|
9
9
|
readTextFile,
|
|
10
10
|
resolveFileReference,
|
|
11
11
|
resolveTargetDefinition
|
|
12
|
-
} from "./chunk-
|
|
12
|
+
} from "./chunk-V3JCB3HI.js";
|
|
13
13
|
|
|
14
14
|
// src/evaluation/types.ts
|
|
15
15
|
var TEST_MESSAGE_ROLE_VALUES = ["system", "user", "assistant", "tool"];
|
|
@@ -58,7 +58,7 @@ var EVALUATOR_KIND_VALUES = [
|
|
|
58
58
|
"rubric",
|
|
59
59
|
"composite",
|
|
60
60
|
"tool_trajectory",
|
|
61
|
-
"
|
|
61
|
+
"expected_tool_calls"
|
|
62
62
|
];
|
|
63
63
|
var EVALUATOR_KIND_SET = new Set(EVALUATOR_KIND_VALUES);
|
|
64
64
|
function isEvaluatorKind(value) {
|
|
@@ -582,11 +582,11 @@ async function parseEvaluators(rawEvalCase, globalExecution, searchRoots, evalId
|
|
|
582
582
|
});
|
|
583
583
|
continue;
|
|
584
584
|
}
|
|
585
|
-
if (typeValue === "
|
|
585
|
+
if (typeValue === "expected_tool_calls") {
|
|
586
586
|
const weight2 = validateWeight(rawEvaluator.weight, name, evalId);
|
|
587
587
|
evaluators.push({
|
|
588
588
|
name,
|
|
589
|
-
type: "
|
|
589
|
+
type: "expected_tool_calls",
|
|
590
590
|
...weight2 !== void 0 ? { weight: weight2 } : {}
|
|
591
591
|
});
|
|
592
592
|
continue;
|
|
@@ -3813,8 +3813,8 @@ var ToolTrajectoryEvaluator = class {
|
|
|
3813
3813
|
};
|
|
3814
3814
|
}
|
|
3815
3815
|
};
|
|
3816
|
-
var
|
|
3817
|
-
kind = "
|
|
3816
|
+
var ExpectedToolCallsEvaluator = class {
|
|
3817
|
+
kind = "expected_tool_calls";
|
|
3818
3818
|
evaluate(context) {
|
|
3819
3819
|
const { candidateTrace, evalCase } = context;
|
|
3820
3820
|
const expectedSegments = evalCase.expected_segments;
|
|
@@ -4897,8 +4897,8 @@ async function runEvaluatorList(options) {
|
|
|
4897
4897
|
return new ToolTrajectoryEvaluator({
|
|
4898
4898
|
config: memberConfig
|
|
4899
4899
|
});
|
|
4900
|
-
case "
|
|
4901
|
-
return new
|
|
4900
|
+
case "expected_tool_calls":
|
|
4901
|
+
return new ExpectedToolCallsEvaluator();
|
|
4902
4902
|
default: {
|
|
4903
4903
|
const unknownConfig = memberConfig;
|
|
4904
4904
|
throw new Error(`Unsupported evaluator type in composite: ${unknownConfig.type}`);
|
|
@@ -4963,9 +4963,9 @@ async function runEvaluatorList(options) {
|
|
|
4963
4963
|
reasoning: score2.reasoning
|
|
4964
4964
|
});
|
|
4965
4965
|
}
|
|
4966
|
-
if (evaluator.type === "
|
|
4967
|
-
const
|
|
4968
|
-
const score2 =
|
|
4966
|
+
if (evaluator.type === "expected_tool_calls") {
|
|
4967
|
+
const expectedToolCallsEvaluator = new ExpectedToolCallsEvaluator();
|
|
4968
|
+
const score2 = expectedToolCallsEvaluator.evaluate({
|
|
4969
4969
|
evalCase,
|
|
4970
4970
|
candidate,
|
|
4971
4971
|
target,
|
|
@@ -5345,7 +5345,7 @@ function createAgentKernel() {
|
|
|
5345
5345
|
export {
|
|
5346
5346
|
CodeEvaluator,
|
|
5347
5347
|
CompositeEvaluator,
|
|
5348
|
-
|
|
5348
|
+
ExpectedToolCallsEvaluator,
|
|
5349
5349
|
LlmJudgeEvaluator,
|
|
5350
5350
|
TEST_MESSAGE_ROLES,
|
|
5351
5351
|
ToolTrajectoryEvaluator,
|