@agentv/core 1.0.0 → 1.3.1
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/dist/{chunk-V3JCB3HI.js → chunk-4A6L2F6L.js} +11 -5
- package/dist/chunk-4A6L2F6L.js.map +1 -0
- package/dist/evaluation/validation/index.cjs +12 -44
- package/dist/evaluation/validation/index.cjs.map +1 -1
- package/dist/evaluation/validation/index.js +13 -45
- package/dist/evaluation/validation/index.js.map +1 -1
- package/dist/index.cjs +227 -230
- package/dist/index.cjs.map +1 -1
- package/dist/index.d.cts +10 -46
- package/dist/index.d.ts +10 -46
- package/dist/index.js +218 -225
- package/dist/index.js.map +1 -1
- package/package.json +1 -1
- package/dist/chunk-V3JCB3HI.js.map +0 -1
package/dist/index.cjs
CHANGED
|
@@ -32,7 +32,6 @@ var index_exports = {};
|
|
|
32
32
|
__export(index_exports, {
|
|
33
33
|
CodeEvaluator: () => CodeEvaluator,
|
|
34
34
|
CompositeEvaluator: () => CompositeEvaluator,
|
|
35
|
-
ExpectedToolCallsEvaluator: () => ExpectedToolCallsEvaluator,
|
|
36
35
|
LlmJudgeEvaluator: () => LlmJudgeEvaluator,
|
|
37
36
|
TEST_MESSAGE_ROLES: () => TEST_MESSAGE_ROLES,
|
|
38
37
|
ToolTrajectoryEvaluator: () => ToolTrajectoryEvaluator,
|
|
@@ -50,7 +49,6 @@ __export(index_exports, {
|
|
|
50
49
|
generateRubrics: () => generateRubrics,
|
|
51
50
|
getHitCount: () => getHitCount,
|
|
52
51
|
isEvaluatorKind: () => isEvaluatorKind,
|
|
53
|
-
isExpectedToolCall: () => isExpectedToolCall,
|
|
54
52
|
isGuidelineFile: () => isGuidelineFile,
|
|
55
53
|
isJsonObject: () => isJsonObject,
|
|
56
54
|
isJsonValue: () => isJsonValue,
|
|
@@ -110,18 +108,23 @@ function isTestMessage(value) {
|
|
|
110
108
|
if (typeof candidate.content === "string") {
|
|
111
109
|
return true;
|
|
112
110
|
}
|
|
113
|
-
if (
|
|
114
|
-
return
|
|
111
|
+
if (Array.isArray(candidate.content) && candidate.content.every(isJsonObject)) {
|
|
112
|
+
return true;
|
|
113
|
+
}
|
|
114
|
+
if (Array.isArray(candidate.tool_calls) && candidate.tool_calls.length > 0) {
|
|
115
|
+
return true;
|
|
115
116
|
}
|
|
116
|
-
|
|
117
|
+
if (isJsonObject(candidate.content)) {
|
|
118
|
+
return true;
|
|
119
|
+
}
|
|
120
|
+
return false;
|
|
117
121
|
}
|
|
118
122
|
var EVALUATOR_KIND_VALUES = [
|
|
119
123
|
"code_judge",
|
|
120
124
|
"llm_judge",
|
|
121
125
|
"rubric",
|
|
122
126
|
"composite",
|
|
123
|
-
"tool_trajectory"
|
|
124
|
-
"expected_tool_calls"
|
|
127
|
+
"tool_trajectory"
|
|
125
128
|
];
|
|
126
129
|
var EVALUATOR_KIND_SET = new Set(EVALUATOR_KIND_VALUES);
|
|
127
130
|
function isEvaluatorKind(value) {
|
|
@@ -142,13 +145,6 @@ function isTraceEvent(value) {
|
|
|
142
145
|
const candidate = value;
|
|
143
146
|
return isTraceEventType(candidate.type) && typeof candidate.timestamp === "string";
|
|
144
147
|
}
|
|
145
|
-
function isExpectedToolCall(value) {
|
|
146
|
-
if (typeof value !== "object" || value === null) {
|
|
147
|
-
return false;
|
|
148
|
-
}
|
|
149
|
-
const candidate = value;
|
|
150
|
-
return typeof candidate.tool === "string";
|
|
151
|
-
}
|
|
152
148
|
function computeTraceSummary(trace) {
|
|
153
149
|
const toolCallCounts = {};
|
|
154
150
|
let errorCount = 0;
|
|
@@ -645,15 +641,6 @@ async function parseEvaluators(rawEvalCase, globalExecution, searchRoots, evalId
|
|
|
645
641
|
});
|
|
646
642
|
continue;
|
|
647
643
|
}
|
|
648
|
-
if (typeValue === "expected_tool_calls") {
|
|
649
|
-
const weight2 = validateWeight(rawEvaluator.weight, name, evalId);
|
|
650
|
-
evaluators.push({
|
|
651
|
-
name,
|
|
652
|
-
type: "expected_tool_calls",
|
|
653
|
-
...weight2 !== void 0 ? { weight: weight2 } : {}
|
|
654
|
-
});
|
|
655
|
-
continue;
|
|
656
|
-
}
|
|
657
644
|
if (typeValue === "tool_trajectory") {
|
|
658
645
|
const mode = asString2(rawEvaluator.mode);
|
|
659
646
|
if (mode !== "any_order" && mode !== "in_order" && mode !== "exact") {
|
|
@@ -848,6 +835,17 @@ async function processMessages(options) {
|
|
|
848
835
|
}
|
|
849
836
|
continue;
|
|
850
837
|
}
|
|
838
|
+
if (isJsonObject(content)) {
|
|
839
|
+
const rendered = JSON.stringify(content, null, 2);
|
|
840
|
+
segments.push({ type: "text", value: rendered });
|
|
841
|
+
if (textParts) {
|
|
842
|
+
textParts.push(rendered);
|
|
843
|
+
}
|
|
844
|
+
continue;
|
|
845
|
+
}
|
|
846
|
+
if (!Array.isArray(content)) {
|
|
847
|
+
continue;
|
|
848
|
+
}
|
|
851
849
|
for (const rawSegment of content) {
|
|
852
850
|
if (!isJsonObject(rawSegment)) {
|
|
853
851
|
continue;
|
|
@@ -908,63 +906,6 @@ async function processMessages(options) {
|
|
|
908
906
|
}
|
|
909
907
|
return segments;
|
|
910
908
|
}
|
|
911
|
-
async function resolveAssistantContent(content, searchRoots, verbose) {
|
|
912
|
-
if (typeof content === "string") {
|
|
913
|
-
return content;
|
|
914
|
-
}
|
|
915
|
-
if (!content) {
|
|
916
|
-
return "";
|
|
917
|
-
}
|
|
918
|
-
const parts = [];
|
|
919
|
-
for (const entry of content) {
|
|
920
|
-
if (typeof entry === "string") {
|
|
921
|
-
parts.push({ content: entry, isFile: false });
|
|
922
|
-
continue;
|
|
923
|
-
}
|
|
924
|
-
if (!isJsonObject(entry)) {
|
|
925
|
-
continue;
|
|
926
|
-
}
|
|
927
|
-
const segmentType = asString3(entry.type);
|
|
928
|
-
if (segmentType === "file") {
|
|
929
|
-
const rawValue = asString3(entry.value);
|
|
930
|
-
if (!rawValue) {
|
|
931
|
-
continue;
|
|
932
|
-
}
|
|
933
|
-
const { displayPath, resolvedPath, attempted } = await resolveFileReference(
|
|
934
|
-
rawValue,
|
|
935
|
-
searchRoots
|
|
936
|
-
);
|
|
937
|
-
if (!resolvedPath) {
|
|
938
|
-
const attempts = attempted.length ? [" Tried:", ...attempted.map((candidate) => ` ${candidate}`)] : void 0;
|
|
939
|
-
logWarning3(`File not found in expected_messages: ${displayPath}`, attempts);
|
|
940
|
-
continue;
|
|
941
|
-
}
|
|
942
|
-
try {
|
|
943
|
-
const fileContent = (await (0, import_promises4.readFile)(resolvedPath, "utf8")).replace(/\r\n/g, "\n").trim();
|
|
944
|
-
parts.push({ content: fileContent, isFile: true, displayPath });
|
|
945
|
-
if (verbose) {
|
|
946
|
-
console.log(` [Expected Assistant File] Found: ${displayPath}`);
|
|
947
|
-
console.log(` Resolved to: ${resolvedPath}`);
|
|
948
|
-
}
|
|
949
|
-
} catch (error) {
|
|
950
|
-
logWarning3(`Could not read file ${resolvedPath}: ${error.message}`);
|
|
951
|
-
}
|
|
952
|
-
continue;
|
|
953
|
-
}
|
|
954
|
-
const textValue = asString3(entry.text);
|
|
955
|
-
if (typeof textValue === "string") {
|
|
956
|
-
parts.push({ content: textValue, isFile: false });
|
|
957
|
-
continue;
|
|
958
|
-
}
|
|
959
|
-
const valueValue = asString3(entry.value);
|
|
960
|
-
if (typeof valueValue === "string") {
|
|
961
|
-
parts.push({ content: valueValue, isFile: false });
|
|
962
|
-
continue;
|
|
963
|
-
}
|
|
964
|
-
parts.push({ content: JSON.stringify(entry), isFile: false });
|
|
965
|
-
}
|
|
966
|
-
return formatFileContents(parts);
|
|
967
|
-
}
|
|
968
909
|
function asString3(value) {
|
|
969
910
|
return typeof value === "string" ? value : void 0;
|
|
970
911
|
}
|
|
@@ -997,14 +938,15 @@ ${detailBlock}${ANSI_RESET4}`);
|
|
|
997
938
|
}
|
|
998
939
|
}
|
|
999
940
|
async function processExpectedMessages(options) {
|
|
1000
|
-
const { messages, searchRoots,
|
|
941
|
+
const { messages, searchRoots, verbose } = options;
|
|
1001
942
|
const segments = [];
|
|
1002
943
|
for (const message of messages) {
|
|
944
|
+
const extendedMessage = message;
|
|
1003
945
|
const segment = {
|
|
1004
946
|
role: message.role
|
|
1005
947
|
};
|
|
1006
|
-
if (
|
|
1007
|
-
segment.
|
|
948
|
+
if (extendedMessage.name) {
|
|
949
|
+
segment.name = extendedMessage.name;
|
|
1008
950
|
}
|
|
1009
951
|
const content = message.content;
|
|
1010
952
|
if (typeof content === "string") {
|
|
@@ -1052,6 +994,13 @@ async function processExpectedMessages(options) {
|
|
|
1052
994
|
processedContent.push(cloneJsonObject(rawSegment));
|
|
1053
995
|
}
|
|
1054
996
|
segment.content = processedContent;
|
|
997
|
+
} else if (isJsonObject(content)) {
|
|
998
|
+
segment.content = cloneJsonObject(content);
|
|
999
|
+
}
|
|
1000
|
+
if (extendedMessage.tool_calls && Array.isArray(extendedMessage.tool_calls)) {
|
|
1001
|
+
segment.tool_calls = extendedMessage.tool_calls.map(
|
|
1002
|
+
(tc) => isJsonObject(tc) ? cloneJsonObject(tc) : tc
|
|
1003
|
+
);
|
|
1055
1004
|
}
|
|
1056
1005
|
segments.push(segment);
|
|
1057
1006
|
}
|
|
@@ -1123,6 +1072,11 @@ async function buildPromptInputs(testCase, mode = "lm") {
|
|
|
1123
1072
|
}
|
|
1124
1073
|
}
|
|
1125
1074
|
}
|
|
1075
|
+
} else if (isJsonObject(message.content)) {
|
|
1076
|
+
const rendered = JSON.stringify(message.content, null, 2);
|
|
1077
|
+
if (rendered.trim().length > 0) {
|
|
1078
|
+
messageSegments.push({ type: "text", value: rendered });
|
|
1079
|
+
}
|
|
1126
1080
|
}
|
|
1127
1081
|
segmentsByMessage.push(messageSegments);
|
|
1128
1082
|
}
|
|
@@ -1346,9 +1300,6 @@ async function loadEvalCases(evalFilePath, repoRoot, options) {
|
|
|
1346
1300
|
logError(`No valid expected message found for eval case: ${id}`);
|
|
1347
1301
|
continue;
|
|
1348
1302
|
}
|
|
1349
|
-
if (expectedMessages.length > 1) {
|
|
1350
|
-
logWarning5(`Multiple expected messages found for eval case: ${id}, using first`);
|
|
1351
|
-
}
|
|
1352
1303
|
const guidelinePaths = [];
|
|
1353
1304
|
const inputTextParts = [];
|
|
1354
1305
|
const inputSegments = await processMessages({
|
|
@@ -1368,8 +1319,19 @@ async function loadEvalCases(evalFilePath, repoRoot, options) {
|
|
|
1368
1319
|
verbose
|
|
1369
1320
|
}) : [];
|
|
1370
1321
|
const codeSnippets = extractCodeBlocks(inputSegments);
|
|
1371
|
-
|
|
1372
|
-
|
|
1322
|
+
let referenceAnswer = "";
|
|
1323
|
+
if (outputSegments.length > 1) {
|
|
1324
|
+
referenceAnswer = JSON.stringify(outputSegments, null, 2);
|
|
1325
|
+
} else if (outputSegments.length === 1) {
|
|
1326
|
+
const singleMessage = outputSegments[0];
|
|
1327
|
+
if (typeof singleMessage.content === "string") {
|
|
1328
|
+
referenceAnswer = singleMessage.content;
|
|
1329
|
+
} else if (singleMessage.content) {
|
|
1330
|
+
referenceAnswer = JSON.stringify(singleMessage, null, 2);
|
|
1331
|
+
} else if (singleMessage.tool_calls) {
|
|
1332
|
+
referenceAnswer = JSON.stringify(singleMessage, null, 2);
|
|
1333
|
+
}
|
|
1334
|
+
}
|
|
1373
1335
|
const question = inputTextParts.map((part) => part.trim()).filter((part) => part.length > 0).join(" ");
|
|
1374
1336
|
const evalCaseEvaluatorKind = coerceEvaluator(evalcase.evaluator, id) ?? globalEvaluator;
|
|
1375
1337
|
let evaluators;
|
|
@@ -1424,7 +1386,7 @@ async function loadEvalCases(evalFilePath, repoRoot, options) {
|
|
|
1424
1386
|
question,
|
|
1425
1387
|
input_messages: inputMessages,
|
|
1426
1388
|
input_segments: inputSegments,
|
|
1427
|
-
|
|
1389
|
+
expected_messages: outputSegments,
|
|
1428
1390
|
reference_answer: referenceAnswer,
|
|
1429
1391
|
guideline_paths: guidelinePaths.map((guidelinePath) => import_node_path6.default.resolve(guidelinePath)),
|
|
1430
1392
|
guideline_patterns: guidelinePatterns,
|
|
@@ -1963,7 +1925,7 @@ var CliProvider = class {
|
|
|
1963
1925
|
id;
|
|
1964
1926
|
kind = "cli";
|
|
1965
1927
|
targetName;
|
|
1966
|
-
supportsBatch =
|
|
1928
|
+
supportsBatch = true;
|
|
1967
1929
|
config;
|
|
1968
1930
|
runCommand;
|
|
1969
1931
|
verbose;
|
|
@@ -1983,6 +1945,11 @@ var CliProvider = class {
|
|
|
1983
1945
|
const outputFilePath = generateOutputFilePath(request.evalCaseId);
|
|
1984
1946
|
const templateValues = buildTemplateValues(request, this.config, outputFilePath);
|
|
1985
1947
|
const renderedCommand = renderTemplate(this.config.commandTemplate, templateValues);
|
|
1948
|
+
if (this.verbose) {
|
|
1949
|
+
console.log(
|
|
1950
|
+
`[cli-provider:${this.targetName}] cwd=${this.config.cwd ?? ""} command=${renderedCommand}`
|
|
1951
|
+
);
|
|
1952
|
+
}
|
|
1986
1953
|
const result = await this.runCommand(renderedCommand, {
|
|
1987
1954
|
cwd: this.config.cwd,
|
|
1988
1955
|
env: process.env,
|
|
@@ -2017,6 +1984,114 @@ var CliProvider = class {
|
|
|
2017
1984
|
}
|
|
2018
1985
|
};
|
|
2019
1986
|
}
|
|
1987
|
+
async invokeBatch(requests) {
|
|
1988
|
+
if (requests.length === 0) {
|
|
1989
|
+
return [];
|
|
1990
|
+
}
|
|
1991
|
+
for (const request of requests) {
|
|
1992
|
+
if (request.signal?.aborted) {
|
|
1993
|
+
throw new Error("CLI provider batch request was aborted before execution");
|
|
1994
|
+
}
|
|
1995
|
+
}
|
|
1996
|
+
const controller = new AbortController();
|
|
1997
|
+
for (const request of requests) {
|
|
1998
|
+
request.signal?.addEventListener("abort", () => controller.abort(), { once: true });
|
|
1999
|
+
}
|
|
2000
|
+
await this.ensureHealthy(controller.signal);
|
|
2001
|
+
const outputFilePath = generateOutputFilePath("batch", ".jsonl");
|
|
2002
|
+
const batchInputFiles = [];
|
|
2003
|
+
for (const request of requests) {
|
|
2004
|
+
if (request.inputFiles && request.inputFiles.length > 0) {
|
|
2005
|
+
batchInputFiles.push(...request.inputFiles);
|
|
2006
|
+
}
|
|
2007
|
+
}
|
|
2008
|
+
const templateValues = buildTemplateValues(
|
|
2009
|
+
{
|
|
2010
|
+
question: "",
|
|
2011
|
+
guidelines: "",
|
|
2012
|
+
inputFiles: batchInputFiles,
|
|
2013
|
+
evalCaseId: "batch",
|
|
2014
|
+
attempt: 0
|
|
2015
|
+
},
|
|
2016
|
+
this.config,
|
|
2017
|
+
outputFilePath
|
|
2018
|
+
);
|
|
2019
|
+
const renderedCommand = renderTemplate(this.config.commandTemplate, templateValues);
|
|
2020
|
+
if (this.verbose) {
|
|
2021
|
+
console.log(
|
|
2022
|
+
`[cli-provider:${this.targetName}] (batch size=${requests.length}) cwd=${this.config.cwd ?? ""} command=${renderedCommand}`
|
|
2023
|
+
);
|
|
2024
|
+
}
|
|
2025
|
+
const result = await this.runCommand(renderedCommand, {
|
|
2026
|
+
cwd: this.config.cwd,
|
|
2027
|
+
env: process.env,
|
|
2028
|
+
timeoutMs: this.config.timeoutMs,
|
|
2029
|
+
signal: controller.signal
|
|
2030
|
+
});
|
|
2031
|
+
if (result.failed || (result.exitCode ?? 0) !== 0) {
|
|
2032
|
+
if (controller.signal.aborted) {
|
|
2033
|
+
throw new Error("CLI provider request was aborted");
|
|
2034
|
+
}
|
|
2035
|
+
if (result.timedOut) {
|
|
2036
|
+
throw new Error(
|
|
2037
|
+
`CLI provider timed out${formatTimeoutSuffix(this.config.timeoutMs ?? void 0)}`
|
|
2038
|
+
);
|
|
2039
|
+
}
|
|
2040
|
+
const codeText = result.exitCode !== null ? result.exitCode : "unknown";
|
|
2041
|
+
const detail = result.stderr.trim() || result.stdout.trim();
|
|
2042
|
+
const message = detail ? `${detail} (exit code ${codeText})` : `CLI exited with code ${codeText}`;
|
|
2043
|
+
throw new Error(message);
|
|
2044
|
+
}
|
|
2045
|
+
const responseContent = await this.readAndCleanupOutputFile(outputFilePath);
|
|
2046
|
+
const recordsById = this.parseJsonlBatchOutput(responseContent);
|
|
2047
|
+
const requestedIds = requests.map((request) => request.evalCaseId).filter((id) => typeof id === "string" && id.trim().length > 0);
|
|
2048
|
+
const missingIds = requestedIds.filter((id) => !recordsById.has(id));
|
|
2049
|
+
if (missingIds.length > 0) {
|
|
2050
|
+
throw new Error(`CLI batch output missing ids: ${missingIds.join(", ")}`);
|
|
2051
|
+
}
|
|
2052
|
+
const responses = requests.map((request) => {
|
|
2053
|
+
const evalCaseId = request.evalCaseId;
|
|
2054
|
+
if (!evalCaseId) {
|
|
2055
|
+
return {
|
|
2056
|
+
text: "",
|
|
2057
|
+
raw: {
|
|
2058
|
+
command: renderedCommand,
|
|
2059
|
+
stderr: result.stderr,
|
|
2060
|
+
exitCode: result.exitCode ?? 0,
|
|
2061
|
+
cwd: this.config.cwd,
|
|
2062
|
+
outputFile: outputFilePath
|
|
2063
|
+
}
|
|
2064
|
+
};
|
|
2065
|
+
}
|
|
2066
|
+
const parsed = recordsById.get(evalCaseId);
|
|
2067
|
+
if (!parsed) {
|
|
2068
|
+
return {
|
|
2069
|
+
text: "",
|
|
2070
|
+
raw: {
|
|
2071
|
+
command: renderedCommand,
|
|
2072
|
+
stderr: result.stderr,
|
|
2073
|
+
exitCode: result.exitCode ?? 0,
|
|
2074
|
+
cwd: this.config.cwd,
|
|
2075
|
+
outputFile: outputFilePath
|
|
2076
|
+
}
|
|
2077
|
+
};
|
|
2078
|
+
}
|
|
2079
|
+
return {
|
|
2080
|
+
text: parsed.text,
|
|
2081
|
+
trace: parsed.trace,
|
|
2082
|
+
traceRef: parsed.traceRef,
|
|
2083
|
+
raw: {
|
|
2084
|
+
command: renderedCommand,
|
|
2085
|
+
stderr: result.stderr,
|
|
2086
|
+
exitCode: result.exitCode ?? 0,
|
|
2087
|
+
cwd: this.config.cwd,
|
|
2088
|
+
outputFile: outputFilePath,
|
|
2089
|
+
recordId: evalCaseId
|
|
2090
|
+
}
|
|
2091
|
+
};
|
|
2092
|
+
});
|
|
2093
|
+
return responses;
|
|
2094
|
+
}
|
|
2020
2095
|
/**
|
|
2021
2096
|
* Parse output content from CLI.
|
|
2022
2097
|
* If the content is valid JSON with a 'text' field, extract text and optional trace.
|
|
@@ -2042,6 +2117,38 @@ var CliProvider = class {
|
|
|
2042
2117
|
const validEvents = trace.filter(isTraceEvent);
|
|
2043
2118
|
return validEvents.length > 0 ? validEvents : void 0;
|
|
2044
2119
|
}
|
|
2120
|
+
parseJsonlBatchOutput(content) {
|
|
2121
|
+
const records = /* @__PURE__ */ new Map();
|
|
2122
|
+
const lines = content.split(/\r?\n/).map((line) => line.trim()).filter((line) => line.length > 0);
|
|
2123
|
+
for (const line of lines) {
|
|
2124
|
+
let parsed;
|
|
2125
|
+
try {
|
|
2126
|
+
parsed = JSON.parse(line);
|
|
2127
|
+
} catch (error) {
|
|
2128
|
+
const reason = error instanceof Error ? error.message : String(error);
|
|
2129
|
+
throw new Error(`CLI batch output contains invalid JSONL line: ${reason}`);
|
|
2130
|
+
}
|
|
2131
|
+
if (typeof parsed !== "object" || parsed === null) {
|
|
2132
|
+
throw new Error("CLI batch output JSONL line must be an object");
|
|
2133
|
+
}
|
|
2134
|
+
const obj = parsed;
|
|
2135
|
+
const id = typeof obj.id === "string" ? obj.id : void 0;
|
|
2136
|
+
if (!id || id.trim().length === 0) {
|
|
2137
|
+
throw new Error("CLI batch output JSONL line missing required string field: id");
|
|
2138
|
+
}
|
|
2139
|
+
if (records.has(id)) {
|
|
2140
|
+
throw new Error(`CLI batch output contains duplicate id: ${id}`);
|
|
2141
|
+
}
|
|
2142
|
+
const text = typeof obj.text === "string" ? obj.text : obj.text === void 0 ? "" : JSON.stringify(obj.text);
|
|
2143
|
+
const traceRef = typeof obj.traceRef === "string" ? obj.traceRef : typeof obj.trace_ref === "string" ? obj.trace_ref : void 0;
|
|
2144
|
+
records.set(id, {
|
|
2145
|
+
text,
|
|
2146
|
+
trace: this.parseTrace(obj.trace),
|
|
2147
|
+
traceRef
|
|
2148
|
+
});
|
|
2149
|
+
}
|
|
2150
|
+
return records;
|
|
2151
|
+
}
|
|
2045
2152
|
async readAndCleanupOutputFile(filePath) {
|
|
2046
2153
|
try {
|
|
2047
2154
|
const content = await readTextFile(filePath);
|
|
@@ -2103,7 +2210,7 @@ var CliProvider = class {
|
|
|
2103
2210
|
);
|
|
2104
2211
|
if (this.verbose) {
|
|
2105
2212
|
console.log(
|
|
2106
|
-
`[cli-provider:${this.targetName}] (healthcheck)
|
|
2213
|
+
`[cli-provider:${this.targetName}] (healthcheck) cwd=${healthcheck.cwd ?? this.config.cwd ?? ""} command=${renderedCommand}`
|
|
2107
2214
|
);
|
|
2108
2215
|
}
|
|
2109
2216
|
const result = await this.runCommand(renderedCommand, {
|
|
@@ -2171,11 +2278,11 @@ function shellEscape(value) {
|
|
|
2171
2278
|
}
|
|
2172
2279
|
return `'${value.replace(/'/g, `'"'"'`)}'`;
|
|
2173
2280
|
}
|
|
2174
|
-
function generateOutputFilePath(evalCaseId) {
|
|
2281
|
+
function generateOutputFilePath(evalCaseId, extension = ".json") {
|
|
2175
2282
|
const safeEvalId = evalCaseId || "unknown";
|
|
2176
2283
|
const timestamp = Date.now();
|
|
2177
2284
|
const random = Math.random().toString(36).substring(2, 9);
|
|
2178
|
-
return import_node_path8.default.join(import_node_os.default.tmpdir(), `agentv-${safeEvalId}-${timestamp}-${random}
|
|
2285
|
+
return import_node_path8.default.join(import_node_os.default.tmpdir(), `agentv-${safeEvalId}-${timestamp}-${random}${extension}`);
|
|
2179
2286
|
}
|
|
2180
2287
|
function formatTimeoutSuffix(timeoutMs) {
|
|
2181
2288
|
if (!timeoutMs || timeoutMs <= 0) {
|
|
@@ -3355,10 +3462,14 @@ function resolveCliConfig(target, env, evalFilePath) {
|
|
|
3355
3462
|
const filesFormat = resolveOptionalLiteralString(
|
|
3356
3463
|
target.files_format ?? target.filesFormat ?? target.attachments_format ?? target.attachmentsFormat
|
|
3357
3464
|
);
|
|
3465
|
+
const verbose = resolveOptionalBoolean(target.verbose ?? target.cli_verbose ?? target.cliVerbose);
|
|
3358
3466
|
let cwd = resolveOptionalString(target.cwd, env, `${target.name} working directory`, {
|
|
3359
3467
|
allowLiteral: true,
|
|
3360
3468
|
optionalEnv: true
|
|
3361
3469
|
});
|
|
3470
|
+
if (cwd && evalFilePath && !import_node_path11.default.isAbsolute(cwd)) {
|
|
3471
|
+
cwd = import_node_path11.default.resolve(import_node_path11.default.dirname(import_node_path11.default.resolve(evalFilePath)), cwd);
|
|
3472
|
+
}
|
|
3362
3473
|
if (!cwd && evalFilePath) {
|
|
3363
3474
|
cwd = import_node_path11.default.dirname(import_node_path11.default.resolve(evalFilePath));
|
|
3364
3475
|
}
|
|
@@ -3366,7 +3477,7 @@ function resolveCliConfig(target, env, evalFilePath) {
|
|
|
3366
3477
|
target.timeout_seconds ?? target.timeoutSeconds,
|
|
3367
3478
|
`${target.name} timeout`
|
|
3368
3479
|
);
|
|
3369
|
-
const healthcheck = resolveCliHealthcheck(target.healthcheck, env, target.name);
|
|
3480
|
+
const healthcheck = resolveCliHealthcheck(target.healthcheck, env, target.name, evalFilePath);
|
|
3370
3481
|
const commandTemplate = resolveString(
|
|
3371
3482
|
commandTemplateSource,
|
|
3372
3483
|
env,
|
|
@@ -3379,7 +3490,8 @@ function resolveCliConfig(target, env, evalFilePath) {
|
|
|
3379
3490
|
filesFormat,
|
|
3380
3491
|
cwd,
|
|
3381
3492
|
timeoutMs,
|
|
3382
|
-
healthcheck
|
|
3493
|
+
healthcheck,
|
|
3494
|
+
verbose
|
|
3383
3495
|
};
|
|
3384
3496
|
}
|
|
3385
3497
|
function resolveTimeoutMs(source, description) {
|
|
@@ -3392,7 +3504,7 @@ function resolveTimeoutMs(source, description) {
|
|
|
3392
3504
|
}
|
|
3393
3505
|
return Math.floor(seconds * 1e3);
|
|
3394
3506
|
}
|
|
3395
|
-
function resolveCliHealthcheck(source, env, targetName) {
|
|
3507
|
+
function resolveCliHealthcheck(source, env, targetName, evalFilePath) {
|
|
3396
3508
|
if (source === void 0 || source === null) {
|
|
3397
3509
|
return void 0;
|
|
3398
3510
|
}
|
|
@@ -3425,11 +3537,12 @@ function resolveCliHealthcheck(source, env, targetName) {
|
|
|
3425
3537
|
allowLiteral: true,
|
|
3426
3538
|
optionalEnv: true
|
|
3427
3539
|
});
|
|
3540
|
+
const resolvedCwd = cwd && evalFilePath && !import_node_path11.default.isAbsolute(cwd) ? import_node_path11.default.resolve(import_node_path11.default.dirname(import_node_path11.default.resolve(evalFilePath)), cwd) : cwd;
|
|
3428
3541
|
return {
|
|
3429
3542
|
type: "command",
|
|
3430
3543
|
commandTemplate,
|
|
3431
3544
|
timeoutMs,
|
|
3432
|
-
cwd
|
|
3545
|
+
cwd: resolvedCwd
|
|
3433
3546
|
};
|
|
3434
3547
|
}
|
|
3435
3548
|
throw new Error(`${targetName} healthcheck type must be 'http' or 'command'`);
|
|
@@ -3979,7 +4092,7 @@ var import_ai2 = require("ai");
|
|
|
3979
4092
|
var import_zod2 = require("zod");
|
|
3980
4093
|
var DEFAULT_EVALUATOR_TEMPLATE = `You are an expert evaluator. Your goal is to grade the candidate_answer based on how well it achieves the expected_outcome for the original task.
|
|
3981
4094
|
|
|
3982
|
-
Use the reference_answer as a gold standard for a high-quality response (if provided). The candidate_answer does not need to match it verbatim, but should capture the key points and follow the same spirit.
|
|
4095
|
+
Use the reference_answer as a gold standard for a high-quality response (if provided). The reference_answer may be a simple text response, or it may contain a sequence of expected agent messages including tool calls. When it contains multiple messages, the last message represents the final expected answer. The candidate_answer does not need to match it verbatim, but should capture the key points and follow the same spirit.
|
|
3983
4096
|
|
|
3984
4097
|
Be concise and focused in your evaluation. Provide succinct, specific feedback rather than verbose explanations.
|
|
3985
4098
|
|
|
@@ -4037,7 +4150,7 @@ var LlmJudgeEvaluator = class {
|
|
|
4037
4150
|
const variables = {
|
|
4038
4151
|
[TEMPLATE_VARIABLES.INPUT_MESSAGES]: JSON.stringify(context.evalCase.input_segments, null, 2),
|
|
4039
4152
|
[TEMPLATE_VARIABLES.EXPECTED_MESSAGES]: JSON.stringify(
|
|
4040
|
-
context.evalCase.
|
|
4153
|
+
context.evalCase.expected_messages,
|
|
4041
4154
|
null,
|
|
4042
4155
|
2
|
|
4043
4156
|
),
|
|
@@ -4250,13 +4363,16 @@ var CodeEvaluator = class {
|
|
|
4250
4363
|
{
|
|
4251
4364
|
question: context.evalCase.question,
|
|
4252
4365
|
expected_outcome: context.evalCase.expected_outcome,
|
|
4366
|
+
expected_messages: context.evalCase.expected_messages,
|
|
4253
4367
|
reference_answer: context.evalCase.reference_answer,
|
|
4254
4368
|
candidate_answer: context.candidate,
|
|
4255
4369
|
guideline_files: context.evalCase.guideline_paths,
|
|
4256
4370
|
input_files: context.evalCase.file_paths.filter(
|
|
4257
4371
|
(path15) => !context.evalCase.guideline_paths.includes(path15)
|
|
4258
4372
|
),
|
|
4259
|
-
input_messages: context.evalCase.input_messages
|
|
4373
|
+
input_messages: context.evalCase.input_messages,
|
|
4374
|
+
candidate_trace_file: context.candidateTraceRef ?? null,
|
|
4375
|
+
candidate_trace_summary: context.candidateTraceSummary ?? null
|
|
4260
4376
|
},
|
|
4261
4377
|
null,
|
|
4262
4378
|
2
|
|
@@ -4522,105 +4638,6 @@ var ToolTrajectoryEvaluator = class {
|
|
|
4522
4638
|
};
|
|
4523
4639
|
}
|
|
4524
4640
|
};
|
|
4525
|
-
var ExpectedToolCallsEvaluator = class {
|
|
4526
|
-
kind = "expected_tool_calls";
|
|
4527
|
-
evaluate(context) {
|
|
4528
|
-
const { candidateTrace, evalCase } = context;
|
|
4529
|
-
const expectedSegments = evalCase.expected_segments;
|
|
4530
|
-
const expectedToolCalls = this.extractExpectedToolCalls(expectedSegments);
|
|
4531
|
-
if (expectedToolCalls.length === 0) {
|
|
4532
|
-
return {
|
|
4533
|
-
score: 1,
|
|
4534
|
-
verdict: "pass",
|
|
4535
|
-
hits: ["No tool_calls specified in expected_messages"],
|
|
4536
|
-
misses: [],
|
|
4537
|
-
expectedAspectCount: 1
|
|
4538
|
-
};
|
|
4539
|
-
}
|
|
4540
|
-
if (!candidateTrace || candidateTrace.length === 0) {
|
|
4541
|
-
return {
|
|
4542
|
-
score: 0,
|
|
4543
|
-
verdict: "fail",
|
|
4544
|
-
hits: [],
|
|
4545
|
-
misses: ["No trace available to validate tool_calls"],
|
|
4546
|
-
expectedAspectCount: expectedToolCalls.length
|
|
4547
|
-
};
|
|
4548
|
-
}
|
|
4549
|
-
const actualToolCalls = candidateTrace.filter((e) => e.type === "tool_call");
|
|
4550
|
-
return this.validateToolCalls(expectedToolCalls, actualToolCalls);
|
|
4551
|
-
}
|
|
4552
|
-
extractExpectedToolCalls(segments) {
|
|
4553
|
-
if (!segments) {
|
|
4554
|
-
return [];
|
|
4555
|
-
}
|
|
4556
|
-
const toolCalls = [];
|
|
4557
|
-
for (const segment of segments) {
|
|
4558
|
-
const role = segment.role;
|
|
4559
|
-
const segmentToolCalls = segment.tool_calls;
|
|
4560
|
-
if (role === "assistant" && Array.isArray(segmentToolCalls)) {
|
|
4561
|
-
for (const tc of segmentToolCalls) {
|
|
4562
|
-
if (typeof tc === "object" && tc !== null && typeof tc.tool === "string") {
|
|
4563
|
-
const toolCall = tc;
|
|
4564
|
-
toolCalls.push({ tool: toolCall.tool, input: toolCall.input });
|
|
4565
|
-
}
|
|
4566
|
-
}
|
|
4567
|
-
}
|
|
4568
|
-
}
|
|
4569
|
-
return toolCalls;
|
|
4570
|
-
}
|
|
4571
|
-
validateToolCalls(expected, actual) {
|
|
4572
|
-
const hits = [];
|
|
4573
|
-
const misses = [];
|
|
4574
|
-
for (let i = 0; i < expected.length; i++) {
|
|
4575
|
-
const expectedCall = expected[i];
|
|
4576
|
-
const actualCall = actual[i];
|
|
4577
|
-
if (!actualCall) {
|
|
4578
|
-
misses.push(
|
|
4579
|
-
`tool_calls[${i}]: expected ${expectedCall.tool}, but no more tool calls in trace`
|
|
4580
|
-
);
|
|
4581
|
-
continue;
|
|
4582
|
-
}
|
|
4583
|
-
if (actualCall.name !== expectedCall.tool) {
|
|
4584
|
-
misses.push(
|
|
4585
|
-
`tool_calls[${i}]: expected ${expectedCall.tool}, got ${actualCall.name ?? "unknown"}`
|
|
4586
|
-
);
|
|
4587
|
-
continue;
|
|
4588
|
-
}
|
|
4589
|
-
if (expectedCall.input !== void 0) {
|
|
4590
|
-
if (!this.deepEquals(expectedCall.input, actualCall.input)) {
|
|
4591
|
-
misses.push(`tool_calls[${i}]: ${expectedCall.tool} input mismatch`);
|
|
4592
|
-
continue;
|
|
4593
|
-
}
|
|
4594
|
-
}
|
|
4595
|
-
hits.push(`tool_calls[${i}]: ${expectedCall.tool} matched`);
|
|
4596
|
-
}
|
|
4597
|
-
const totalChecks = expected.length || 1;
|
|
4598
|
-
const score = hits.length / totalChecks;
|
|
4599
|
-
return {
|
|
4600
|
-
score,
|
|
4601
|
-
verdict: score >= 0.8 ? "pass" : score >= 0.6 ? "borderline" : "fail",
|
|
4602
|
-
hits,
|
|
4603
|
-
misses,
|
|
4604
|
-
expectedAspectCount: totalChecks
|
|
4605
|
-
};
|
|
4606
|
-
}
|
|
4607
|
-
deepEquals(a, b) {
|
|
4608
|
-
if (a === b) return true;
|
|
4609
|
-
if (typeof a !== typeof b) return false;
|
|
4610
|
-
if (typeof a !== "object" || a === null || b === null) return false;
|
|
4611
|
-
if (Array.isArray(a) && Array.isArray(b)) {
|
|
4612
|
-
if (a.length !== b.length) return false;
|
|
4613
|
-
return a.every((val, i) => this.deepEquals(val, b[i]));
|
|
4614
|
-
}
|
|
4615
|
-
if (Array.isArray(a) || Array.isArray(b)) return false;
|
|
4616
|
-
const aObj = a;
|
|
4617
|
-
const bObj = b;
|
|
4618
|
-
const aKeys = Object.keys(aObj);
|
|
4619
|
-
const bKeys = Object.keys(bObj);
|
|
4620
|
-
if (aKeys.length !== bKeys.length) return false;
|
|
4621
|
-
return aKeys.every((key) => this.deepEquals(aObj[key], bObj[key]));
|
|
4622
|
-
}
|
|
4623
|
-
};
|
|
4624
4641
|
var DEFAULT_COMPOSITE_AGGREGATOR_PROMPT = `Review the following evaluation results:
|
|
4625
4642
|
{{EVALUATOR_RESULTS_JSON}}
|
|
4626
4643
|
|
|
@@ -5392,6 +5409,7 @@ async function runEvalCase(options) {
|
|
|
5392
5409
|
judgeProvider,
|
|
5393
5410
|
agentTimeoutMs,
|
|
5394
5411
|
candidateTrace,
|
|
5412
|
+
candidateTraceRef: providerResponse.traceRef,
|
|
5395
5413
|
candidateTraceSummary
|
|
5396
5414
|
});
|
|
5397
5415
|
} catch (error) {
|
|
@@ -5411,6 +5429,7 @@ async function evaluateCandidate(options) {
|
|
|
5411
5429
|
judgeProvider,
|
|
5412
5430
|
agentTimeoutMs,
|
|
5413
5431
|
candidateTrace,
|
|
5432
|
+
candidateTraceRef,
|
|
5414
5433
|
candidateTraceSummary
|
|
5415
5434
|
} = options;
|
|
5416
5435
|
const gradeTimestamp = nowFn();
|
|
@@ -5426,6 +5445,7 @@ async function evaluateCandidate(options) {
|
|
|
5426
5445
|
judgeProvider,
|
|
5427
5446
|
agentTimeoutMs,
|
|
5428
5447
|
candidateTrace,
|
|
5448
|
+
candidateTraceRef,
|
|
5429
5449
|
candidateTraceSummary
|
|
5430
5450
|
});
|
|
5431
5451
|
const completedAt = nowFn();
|
|
@@ -5480,6 +5500,7 @@ async function runEvaluatorsForCase(options) {
|
|
|
5480
5500
|
judgeProvider,
|
|
5481
5501
|
agentTimeoutMs,
|
|
5482
5502
|
candidateTrace,
|
|
5503
|
+
candidateTraceRef,
|
|
5483
5504
|
candidateTraceSummary
|
|
5484
5505
|
} = options;
|
|
5485
5506
|
if (evalCase.evaluators && evalCase.evaluators.length > 0) {
|
|
@@ -5496,6 +5517,7 @@ async function runEvaluatorsForCase(options) {
|
|
|
5496
5517
|
judgeProvider,
|
|
5497
5518
|
agentTimeoutMs,
|
|
5498
5519
|
candidateTrace,
|
|
5520
|
+
candidateTraceRef,
|
|
5499
5521
|
candidateTraceSummary
|
|
5500
5522
|
});
|
|
5501
5523
|
}
|
|
@@ -5514,6 +5536,7 @@ async function runEvaluatorsForCase(options) {
|
|
|
5514
5536
|
now,
|
|
5515
5537
|
judgeProvider,
|
|
5516
5538
|
candidateTrace,
|
|
5539
|
+
candidateTraceRef,
|
|
5517
5540
|
candidateTraceSummary
|
|
5518
5541
|
});
|
|
5519
5542
|
return { score };
|
|
@@ -5532,6 +5555,7 @@ async function runEvaluatorList(options) {
|
|
|
5532
5555
|
judgeProvider,
|
|
5533
5556
|
agentTimeoutMs,
|
|
5534
5557
|
candidateTrace,
|
|
5558
|
+
candidateTraceRef,
|
|
5535
5559
|
candidateTraceSummary
|
|
5536
5560
|
} = options;
|
|
5537
5561
|
const scored = [];
|
|
@@ -5578,7 +5602,9 @@ async function runEvaluatorList(options) {
|
|
|
5578
5602
|
provider,
|
|
5579
5603
|
attempt,
|
|
5580
5604
|
promptInputs,
|
|
5581
|
-
now
|
|
5605
|
+
now,
|
|
5606
|
+
candidateTraceRef,
|
|
5607
|
+
candidateTraceSummary
|
|
5582
5608
|
});
|
|
5583
5609
|
const weight = evaluator.weight ?? 1;
|
|
5584
5610
|
scored.push({ score: score2, name: evaluator.name, type: "code_judge", weight });
|
|
@@ -5616,8 +5642,6 @@ async function runEvaluatorList(options) {
|
|
|
5616
5642
|
return new ToolTrajectoryEvaluator({
|
|
5617
5643
|
config: memberConfig
|
|
5618
5644
|
});
|
|
5619
|
-
case "expected_tool_calls":
|
|
5620
|
-
return new ExpectedToolCallsEvaluator();
|
|
5621
5645
|
default: {
|
|
5622
5646
|
const unknownConfig = memberConfig;
|
|
5623
5647
|
throw new Error(`Unsupported evaluator type in composite: ${unknownConfig.type}`);
|
|
@@ -5667,32 +5691,7 @@ async function runEvaluatorList(options) {
|
|
|
5667
5691
|
promptInputs,
|
|
5668
5692
|
now,
|
|
5669
5693
|
candidateTrace,
|
|
5670
|
-
|
|
5671
|
-
});
|
|
5672
|
-
const weight = evaluator.weight ?? 1;
|
|
5673
|
-
scored.push({ score: score2, name: evaluator.name, type: evaluator.type, weight });
|
|
5674
|
-
evaluatorResults.push({
|
|
5675
|
-
name: evaluator.name,
|
|
5676
|
-
type: evaluator.type,
|
|
5677
|
-
score: score2.score,
|
|
5678
|
-
weight,
|
|
5679
|
-
verdict: score2.verdict,
|
|
5680
|
-
hits: score2.hits,
|
|
5681
|
-
misses: score2.misses,
|
|
5682
|
-
reasoning: score2.reasoning
|
|
5683
|
-
});
|
|
5684
|
-
}
|
|
5685
|
-
if (evaluator.type === "expected_tool_calls") {
|
|
5686
|
-
const expectedToolCallsEvaluator = new ExpectedToolCallsEvaluator();
|
|
5687
|
-
const score2 = expectedToolCallsEvaluator.evaluate({
|
|
5688
|
-
evalCase,
|
|
5689
|
-
candidate,
|
|
5690
|
-
target,
|
|
5691
|
-
provider,
|
|
5692
|
-
attempt,
|
|
5693
|
-
promptInputs,
|
|
5694
|
-
now,
|
|
5695
|
-
candidateTrace,
|
|
5694
|
+
candidateTraceRef,
|
|
5696
5695
|
candidateTraceSummary
|
|
5697
5696
|
});
|
|
5698
5697
|
const weight = evaluator.weight ?? 1;
|
|
@@ -6065,7 +6064,6 @@ function createAgentKernel() {
|
|
|
6065
6064
|
0 && (module.exports = {
|
|
6066
6065
|
CodeEvaluator,
|
|
6067
6066
|
CompositeEvaluator,
|
|
6068
|
-
ExpectedToolCallsEvaluator,
|
|
6069
6067
|
LlmJudgeEvaluator,
|
|
6070
6068
|
TEST_MESSAGE_ROLES,
|
|
6071
6069
|
ToolTrajectoryEvaluator,
|
|
@@ -6083,7 +6081,6 @@ function createAgentKernel() {
|
|
|
6083
6081
|
generateRubrics,
|
|
6084
6082
|
getHitCount,
|
|
6085
6083
|
isEvaluatorKind,
|
|
6086
|
-
isExpectedToolCall,
|
|
6087
6084
|
isGuidelineFile,
|
|
6088
6085
|
isJsonObject,
|
|
6089
6086
|
isJsonValue,
|