@agentv/core 1.0.0 → 1.3.1
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/dist/{chunk-V3JCB3HI.js → chunk-4A6L2F6L.js} +11 -5
- package/dist/chunk-4A6L2F6L.js.map +1 -0
- package/dist/evaluation/validation/index.cjs +12 -44
- package/dist/evaluation/validation/index.cjs.map +1 -1
- package/dist/evaluation/validation/index.js +13 -45
- package/dist/evaluation/validation/index.js.map +1 -1
- package/dist/index.cjs +227 -230
- package/dist/index.cjs.map +1 -1
- package/dist/index.d.cts +10 -46
- package/dist/index.d.ts +10 -46
- package/dist/index.js +218 -225
- package/dist/index.js.map +1 -1
- package/package.json +1 -1
- package/dist/chunk-V3JCB3HI.js.map +0 -1
package/dist/index.js
CHANGED
|
@@ -9,7 +9,7 @@ import {
|
|
|
9
9
|
readTextFile,
|
|
10
10
|
resolveFileReference,
|
|
11
11
|
resolveTargetDefinition
|
|
12
|
-
} from "./chunk-V3JCB3HI.js";
|
|
12
|
+
} from "./chunk-4A6L2F6L.js";
|
|
13
13
|
|
|
14
14
|
// src/evaluation/types.ts
|
|
15
15
|
var TEST_MESSAGE_ROLE_VALUES = ["system", "user", "assistant", "tool"];
|
|
@@ -47,18 +47,23 @@ function isTestMessage(value) {
|
|
|
47
47
|
if (typeof candidate.content === "string") {
|
|
48
48
|
return true;
|
|
49
49
|
}
|
|
50
|
-
if (
|
|
51
|
-
return
|
|
50
|
+
if (Array.isArray(candidate.content) && candidate.content.every(isJsonObject)) {
|
|
51
|
+
return true;
|
|
52
|
+
}
|
|
53
|
+
if (Array.isArray(candidate.tool_calls) && candidate.tool_calls.length > 0) {
|
|
54
|
+
return true;
|
|
52
55
|
}
|
|
53
|
-
|
|
56
|
+
if (isJsonObject(candidate.content)) {
|
|
57
|
+
return true;
|
|
58
|
+
}
|
|
59
|
+
return false;
|
|
54
60
|
}
|
|
55
61
|
var EVALUATOR_KIND_VALUES = [
|
|
56
62
|
"code_judge",
|
|
57
63
|
"llm_judge",
|
|
58
64
|
"rubric",
|
|
59
65
|
"composite",
|
|
60
|
-
"tool_trajectory",
|
|
61
|
-
"expected_tool_calls"
|
|
66
|
+
"tool_trajectory"
|
|
62
67
|
];
|
|
63
68
|
var EVALUATOR_KIND_SET = new Set(EVALUATOR_KIND_VALUES);
|
|
64
69
|
function isEvaluatorKind(value) {
|
|
@@ -79,13 +84,6 @@ function isTraceEvent(value) {
|
|
|
79
84
|
const candidate = value;
|
|
80
85
|
return isTraceEventType(candidate.type) && typeof candidate.timestamp === "string";
|
|
81
86
|
}
|
|
82
|
-
function isExpectedToolCall(value) {
|
|
83
|
-
if (typeof value !== "object" || value === null) {
|
|
84
|
-
return false;
|
|
85
|
-
}
|
|
86
|
-
const candidate = value;
|
|
87
|
-
return typeof candidate.tool === "string";
|
|
88
|
-
}
|
|
89
87
|
function computeTraceSummary(trace) {
|
|
90
88
|
const toolCallCounts = {};
|
|
91
89
|
let errorCount = 0;
|
|
@@ -582,15 +580,6 @@ async function parseEvaluators(rawEvalCase, globalExecution, searchRoots, evalId
|
|
|
582
580
|
});
|
|
583
581
|
continue;
|
|
584
582
|
}
|
|
585
|
-
if (typeValue === "expected_tool_calls") {
|
|
586
|
-
const weight2 = validateWeight(rawEvaluator.weight, name, evalId);
|
|
587
|
-
evaluators.push({
|
|
588
|
-
name,
|
|
589
|
-
type: "expected_tool_calls",
|
|
590
|
-
...weight2 !== void 0 ? { weight: weight2 } : {}
|
|
591
|
-
});
|
|
592
|
-
continue;
|
|
593
|
-
}
|
|
594
583
|
if (typeValue === "tool_trajectory") {
|
|
595
584
|
const mode = asString2(rawEvaluator.mode);
|
|
596
585
|
if (mode !== "any_order" && mode !== "in_order" && mode !== "exact") {
|
|
@@ -785,6 +774,17 @@ async function processMessages(options) {
|
|
|
785
774
|
}
|
|
786
775
|
continue;
|
|
787
776
|
}
|
|
777
|
+
if (isJsonObject(content)) {
|
|
778
|
+
const rendered = JSON.stringify(content, null, 2);
|
|
779
|
+
segments.push({ type: "text", value: rendered });
|
|
780
|
+
if (textParts) {
|
|
781
|
+
textParts.push(rendered);
|
|
782
|
+
}
|
|
783
|
+
continue;
|
|
784
|
+
}
|
|
785
|
+
if (!Array.isArray(content)) {
|
|
786
|
+
continue;
|
|
787
|
+
}
|
|
788
788
|
for (const rawSegment of content) {
|
|
789
789
|
if (!isJsonObject(rawSegment)) {
|
|
790
790
|
continue;
|
|
@@ -845,63 +845,6 @@ async function processMessages(options) {
|
|
|
845
845
|
}
|
|
846
846
|
return segments;
|
|
847
847
|
}
|
|
848
|
-
async function resolveAssistantContent(content, searchRoots, verbose) {
|
|
849
|
-
if (typeof content === "string") {
|
|
850
|
-
return content;
|
|
851
|
-
}
|
|
852
|
-
if (!content) {
|
|
853
|
-
return "";
|
|
854
|
-
}
|
|
855
|
-
const parts = [];
|
|
856
|
-
for (const entry of content) {
|
|
857
|
-
if (typeof entry === "string") {
|
|
858
|
-
parts.push({ content: entry, isFile: false });
|
|
859
|
-
continue;
|
|
860
|
-
}
|
|
861
|
-
if (!isJsonObject(entry)) {
|
|
862
|
-
continue;
|
|
863
|
-
}
|
|
864
|
-
const segmentType = asString3(entry.type);
|
|
865
|
-
if (segmentType === "file") {
|
|
866
|
-
const rawValue = asString3(entry.value);
|
|
867
|
-
if (!rawValue) {
|
|
868
|
-
continue;
|
|
869
|
-
}
|
|
870
|
-
const { displayPath, resolvedPath, attempted } = await resolveFileReference2(
|
|
871
|
-
rawValue,
|
|
872
|
-
searchRoots
|
|
873
|
-
);
|
|
874
|
-
if (!resolvedPath) {
|
|
875
|
-
const attempts = attempted.length ? [" Tried:", ...attempted.map((candidate) => ` ${candidate}`)] : void 0;
|
|
876
|
-
logWarning3(`File not found in expected_messages: ${displayPath}`, attempts);
|
|
877
|
-
continue;
|
|
878
|
-
}
|
|
879
|
-
try {
|
|
880
|
-
const fileContent = (await readFile3(resolvedPath, "utf8")).replace(/\r\n/g, "\n").trim();
|
|
881
|
-
parts.push({ content: fileContent, isFile: true, displayPath });
|
|
882
|
-
if (verbose) {
|
|
883
|
-
console.log(` [Expected Assistant File] Found: ${displayPath}`);
|
|
884
|
-
console.log(` Resolved to: ${resolvedPath}`);
|
|
885
|
-
}
|
|
886
|
-
} catch (error) {
|
|
887
|
-
logWarning3(`Could not read file ${resolvedPath}: ${error.message}`);
|
|
888
|
-
}
|
|
889
|
-
continue;
|
|
890
|
-
}
|
|
891
|
-
const textValue = asString3(entry.text);
|
|
892
|
-
if (typeof textValue === "string") {
|
|
893
|
-
parts.push({ content: textValue, isFile: false });
|
|
894
|
-
continue;
|
|
895
|
-
}
|
|
896
|
-
const valueValue = asString3(entry.value);
|
|
897
|
-
if (typeof valueValue === "string") {
|
|
898
|
-
parts.push({ content: valueValue, isFile: false });
|
|
899
|
-
continue;
|
|
900
|
-
}
|
|
901
|
-
parts.push({ content: JSON.stringify(entry), isFile: false });
|
|
902
|
-
}
|
|
903
|
-
return formatFileContents(parts);
|
|
904
|
-
}
|
|
905
848
|
function asString3(value) {
|
|
906
849
|
return typeof value === "string" ? value : void 0;
|
|
907
850
|
}
|
|
@@ -934,14 +877,15 @@ ${detailBlock}${ANSI_RESET4}`);
|
|
|
934
877
|
}
|
|
935
878
|
}
|
|
936
879
|
async function processExpectedMessages(options) {
|
|
937
|
-
const { messages, searchRoots,
|
|
880
|
+
const { messages, searchRoots, verbose } = options;
|
|
938
881
|
const segments = [];
|
|
939
882
|
for (const message of messages) {
|
|
883
|
+
const extendedMessage = message;
|
|
940
884
|
const segment = {
|
|
941
885
|
role: message.role
|
|
942
886
|
};
|
|
943
|
-
if (
|
|
944
|
-
segment.
|
|
887
|
+
if (extendedMessage.name) {
|
|
888
|
+
segment.name = extendedMessage.name;
|
|
945
889
|
}
|
|
946
890
|
const content = message.content;
|
|
947
891
|
if (typeof content === "string") {
|
|
@@ -989,6 +933,13 @@ async function processExpectedMessages(options) {
|
|
|
989
933
|
processedContent.push(cloneJsonObject(rawSegment));
|
|
990
934
|
}
|
|
991
935
|
segment.content = processedContent;
|
|
936
|
+
} else if (isJsonObject(content)) {
|
|
937
|
+
segment.content = cloneJsonObject(content);
|
|
938
|
+
}
|
|
939
|
+
if (extendedMessage.tool_calls && Array.isArray(extendedMessage.tool_calls)) {
|
|
940
|
+
segment.tool_calls = extendedMessage.tool_calls.map(
|
|
941
|
+
(tc) => isJsonObject(tc) ? cloneJsonObject(tc) : tc
|
|
942
|
+
);
|
|
992
943
|
}
|
|
993
944
|
segments.push(segment);
|
|
994
945
|
}
|
|
@@ -1060,6 +1011,11 @@ async function buildPromptInputs(testCase, mode = "lm") {
|
|
|
1060
1011
|
}
|
|
1061
1012
|
}
|
|
1062
1013
|
}
|
|
1014
|
+
} else if (isJsonObject(message.content)) {
|
|
1015
|
+
const rendered = JSON.stringify(message.content, null, 2);
|
|
1016
|
+
if (rendered.trim().length > 0) {
|
|
1017
|
+
messageSegments.push({ type: "text", value: rendered });
|
|
1018
|
+
}
|
|
1063
1019
|
}
|
|
1064
1020
|
segmentsByMessage.push(messageSegments);
|
|
1065
1021
|
}
|
|
@@ -1283,9 +1239,6 @@ async function loadEvalCases(evalFilePath, repoRoot, options) {
|
|
|
1283
1239
|
logError(`No valid expected message found for eval case: ${id}`);
|
|
1284
1240
|
continue;
|
|
1285
1241
|
}
|
|
1286
|
-
if (expectedMessages.length > 1) {
|
|
1287
|
-
logWarning5(`Multiple expected messages found for eval case: ${id}, using first`);
|
|
1288
|
-
}
|
|
1289
1242
|
const guidelinePaths = [];
|
|
1290
1243
|
const inputTextParts = [];
|
|
1291
1244
|
const inputSegments = await processMessages({
|
|
@@ -1305,8 +1258,19 @@ async function loadEvalCases(evalFilePath, repoRoot, options) {
|
|
|
1305
1258
|
verbose
|
|
1306
1259
|
}) : [];
|
|
1307
1260
|
const codeSnippets = extractCodeBlocks(inputSegments);
|
|
1308
|
-
|
|
1309
|
-
|
|
1261
|
+
let referenceAnswer = "";
|
|
1262
|
+
if (outputSegments.length > 1) {
|
|
1263
|
+
referenceAnswer = JSON.stringify(outputSegments, null, 2);
|
|
1264
|
+
} else if (outputSegments.length === 1) {
|
|
1265
|
+
const singleMessage = outputSegments[0];
|
|
1266
|
+
if (typeof singleMessage.content === "string") {
|
|
1267
|
+
referenceAnswer = singleMessage.content;
|
|
1268
|
+
} else if (singleMessage.content) {
|
|
1269
|
+
referenceAnswer = JSON.stringify(singleMessage, null, 2);
|
|
1270
|
+
} else if (singleMessage.tool_calls) {
|
|
1271
|
+
referenceAnswer = JSON.stringify(singleMessage, null, 2);
|
|
1272
|
+
}
|
|
1273
|
+
}
|
|
1310
1274
|
const question = inputTextParts.map((part) => part.trim()).filter((part) => part.length > 0).join(" ");
|
|
1311
1275
|
const evalCaseEvaluatorKind = coerceEvaluator(evalcase.evaluator, id) ?? globalEvaluator;
|
|
1312
1276
|
let evaluators;
|
|
@@ -1361,7 +1325,7 @@ async function loadEvalCases(evalFilePath, repoRoot, options) {
|
|
|
1361
1325
|
question,
|
|
1362
1326
|
input_messages: inputMessages,
|
|
1363
1327
|
input_segments: inputSegments,
|
|
1364
|
-
|
|
1328
|
+
expected_messages: outputSegments,
|
|
1365
1329
|
reference_answer: referenceAnswer,
|
|
1366
1330
|
guideline_paths: guidelinePaths.map((guidelinePath) => path6.resolve(guidelinePath)),
|
|
1367
1331
|
guideline_patterns: guidelinePatterns,
|
|
@@ -1785,7 +1749,7 @@ var CliProvider = class {
|
|
|
1785
1749
|
id;
|
|
1786
1750
|
kind = "cli";
|
|
1787
1751
|
targetName;
|
|
1788
|
-
supportsBatch = false;
|
|
1752
|
+
supportsBatch = true;
|
|
1789
1753
|
config;
|
|
1790
1754
|
runCommand;
|
|
1791
1755
|
verbose;
|
|
@@ -1805,6 +1769,11 @@ var CliProvider = class {
|
|
|
1805
1769
|
const outputFilePath = generateOutputFilePath(request.evalCaseId);
|
|
1806
1770
|
const templateValues = buildTemplateValues(request, this.config, outputFilePath);
|
|
1807
1771
|
const renderedCommand = renderTemplate(this.config.commandTemplate, templateValues);
|
|
1772
|
+
if (this.verbose) {
|
|
1773
|
+
console.log(
|
|
1774
|
+
`[cli-provider:${this.targetName}] cwd=${this.config.cwd ?? ""} command=${renderedCommand}`
|
|
1775
|
+
);
|
|
1776
|
+
}
|
|
1808
1777
|
const result = await this.runCommand(renderedCommand, {
|
|
1809
1778
|
cwd: this.config.cwd,
|
|
1810
1779
|
env: process.env,
|
|
@@ -1839,6 +1808,114 @@ var CliProvider = class {
|
|
|
1839
1808
|
}
|
|
1840
1809
|
};
|
|
1841
1810
|
}
|
|
1811
|
+
async invokeBatch(requests) {
|
|
1812
|
+
if (requests.length === 0) {
|
|
1813
|
+
return [];
|
|
1814
|
+
}
|
|
1815
|
+
for (const request of requests) {
|
|
1816
|
+
if (request.signal?.aborted) {
|
|
1817
|
+
throw new Error("CLI provider batch request was aborted before execution");
|
|
1818
|
+
}
|
|
1819
|
+
}
|
|
1820
|
+
const controller = new AbortController();
|
|
1821
|
+
for (const request of requests) {
|
|
1822
|
+
request.signal?.addEventListener("abort", () => controller.abort(), { once: true });
|
|
1823
|
+
}
|
|
1824
|
+
await this.ensureHealthy(controller.signal);
|
|
1825
|
+
const outputFilePath = generateOutputFilePath("batch", ".jsonl");
|
|
1826
|
+
const batchInputFiles = [];
|
|
1827
|
+
for (const request of requests) {
|
|
1828
|
+
if (request.inputFiles && request.inputFiles.length > 0) {
|
|
1829
|
+
batchInputFiles.push(...request.inputFiles);
|
|
1830
|
+
}
|
|
1831
|
+
}
|
|
1832
|
+
const templateValues = buildTemplateValues(
|
|
1833
|
+
{
|
|
1834
|
+
question: "",
|
|
1835
|
+
guidelines: "",
|
|
1836
|
+
inputFiles: batchInputFiles,
|
|
1837
|
+
evalCaseId: "batch",
|
|
1838
|
+
attempt: 0
|
|
1839
|
+
},
|
|
1840
|
+
this.config,
|
|
1841
|
+
outputFilePath
|
|
1842
|
+
);
|
|
1843
|
+
const renderedCommand = renderTemplate(this.config.commandTemplate, templateValues);
|
|
1844
|
+
if (this.verbose) {
|
|
1845
|
+
console.log(
|
|
1846
|
+
`[cli-provider:${this.targetName}] (batch size=${requests.length}) cwd=${this.config.cwd ?? ""} command=${renderedCommand}`
|
|
1847
|
+
);
|
|
1848
|
+
}
|
|
1849
|
+
const result = await this.runCommand(renderedCommand, {
|
|
1850
|
+
cwd: this.config.cwd,
|
|
1851
|
+
env: process.env,
|
|
1852
|
+
timeoutMs: this.config.timeoutMs,
|
|
1853
|
+
signal: controller.signal
|
|
1854
|
+
});
|
|
1855
|
+
if (result.failed || (result.exitCode ?? 0) !== 0) {
|
|
1856
|
+
if (controller.signal.aborted) {
|
|
1857
|
+
throw new Error("CLI provider request was aborted");
|
|
1858
|
+
}
|
|
1859
|
+
if (result.timedOut) {
|
|
1860
|
+
throw new Error(
|
|
1861
|
+
`CLI provider timed out${formatTimeoutSuffix(this.config.timeoutMs ?? void 0)}`
|
|
1862
|
+
);
|
|
1863
|
+
}
|
|
1864
|
+
const codeText = result.exitCode !== null ? result.exitCode : "unknown";
|
|
1865
|
+
const detail = result.stderr.trim() || result.stdout.trim();
|
|
1866
|
+
const message = detail ? `${detail} (exit code ${codeText})` : `CLI exited with code ${codeText}`;
|
|
1867
|
+
throw new Error(message);
|
|
1868
|
+
}
|
|
1869
|
+
const responseContent = await this.readAndCleanupOutputFile(outputFilePath);
|
|
1870
|
+
const recordsById = this.parseJsonlBatchOutput(responseContent);
|
|
1871
|
+
const requestedIds = requests.map((request) => request.evalCaseId).filter((id) => typeof id === "string" && id.trim().length > 0);
|
|
1872
|
+
const missingIds = requestedIds.filter((id) => !recordsById.has(id));
|
|
1873
|
+
if (missingIds.length > 0) {
|
|
1874
|
+
throw new Error(`CLI batch output missing ids: ${missingIds.join(", ")}`);
|
|
1875
|
+
}
|
|
1876
|
+
const responses = requests.map((request) => {
|
|
1877
|
+
const evalCaseId = request.evalCaseId;
|
|
1878
|
+
if (!evalCaseId) {
|
|
1879
|
+
return {
|
|
1880
|
+
text: "",
|
|
1881
|
+
raw: {
|
|
1882
|
+
command: renderedCommand,
|
|
1883
|
+
stderr: result.stderr,
|
|
1884
|
+
exitCode: result.exitCode ?? 0,
|
|
1885
|
+
cwd: this.config.cwd,
|
|
1886
|
+
outputFile: outputFilePath
|
|
1887
|
+
}
|
|
1888
|
+
};
|
|
1889
|
+
}
|
|
1890
|
+
const parsed = recordsById.get(evalCaseId);
|
|
1891
|
+
if (!parsed) {
|
|
1892
|
+
return {
|
|
1893
|
+
text: "",
|
|
1894
|
+
raw: {
|
|
1895
|
+
command: renderedCommand,
|
|
1896
|
+
stderr: result.stderr,
|
|
1897
|
+
exitCode: result.exitCode ?? 0,
|
|
1898
|
+
cwd: this.config.cwd,
|
|
1899
|
+
outputFile: outputFilePath
|
|
1900
|
+
}
|
|
1901
|
+
};
|
|
1902
|
+
}
|
|
1903
|
+
return {
|
|
1904
|
+
text: parsed.text,
|
|
1905
|
+
trace: parsed.trace,
|
|
1906
|
+
traceRef: parsed.traceRef,
|
|
1907
|
+
raw: {
|
|
1908
|
+
command: renderedCommand,
|
|
1909
|
+
stderr: result.stderr,
|
|
1910
|
+
exitCode: result.exitCode ?? 0,
|
|
1911
|
+
cwd: this.config.cwd,
|
|
1912
|
+
outputFile: outputFilePath,
|
|
1913
|
+
recordId: evalCaseId
|
|
1914
|
+
}
|
|
1915
|
+
};
|
|
1916
|
+
});
|
|
1917
|
+
return responses;
|
|
1918
|
+
}
|
|
1842
1919
|
/**
|
|
1843
1920
|
* Parse output content from CLI.
|
|
1844
1921
|
* If the content is valid JSON with a 'text' field, extract text and optional trace.
|
|
@@ -1864,6 +1941,38 @@ var CliProvider = class {
|
|
|
1864
1941
|
const validEvents = trace.filter(isTraceEvent);
|
|
1865
1942
|
return validEvents.length > 0 ? validEvents : void 0;
|
|
1866
1943
|
}
|
|
1944
|
+
parseJsonlBatchOutput(content) {
|
|
1945
|
+
const records = /* @__PURE__ */ new Map();
|
|
1946
|
+
const lines = content.split(/\r?\n/).map((line) => line.trim()).filter((line) => line.length > 0);
|
|
1947
|
+
for (const line of lines) {
|
|
1948
|
+
let parsed;
|
|
1949
|
+
try {
|
|
1950
|
+
parsed = JSON.parse(line);
|
|
1951
|
+
} catch (error) {
|
|
1952
|
+
const reason = error instanceof Error ? error.message : String(error);
|
|
1953
|
+
throw new Error(`CLI batch output contains invalid JSONL line: ${reason}`);
|
|
1954
|
+
}
|
|
1955
|
+
if (typeof parsed !== "object" || parsed === null) {
|
|
1956
|
+
throw new Error("CLI batch output JSONL line must be an object");
|
|
1957
|
+
}
|
|
1958
|
+
const obj = parsed;
|
|
1959
|
+
const id = typeof obj.id === "string" ? obj.id : void 0;
|
|
1960
|
+
if (!id || id.trim().length === 0) {
|
|
1961
|
+
throw new Error("CLI batch output JSONL line missing required string field: id");
|
|
1962
|
+
}
|
|
1963
|
+
if (records.has(id)) {
|
|
1964
|
+
throw new Error(`CLI batch output contains duplicate id: ${id}`);
|
|
1965
|
+
}
|
|
1966
|
+
const text = typeof obj.text === "string" ? obj.text : obj.text === void 0 ? "" : JSON.stringify(obj.text);
|
|
1967
|
+
const traceRef = typeof obj.traceRef === "string" ? obj.traceRef : typeof obj.trace_ref === "string" ? obj.trace_ref : void 0;
|
|
1968
|
+
records.set(id, {
|
|
1969
|
+
text,
|
|
1970
|
+
trace: this.parseTrace(obj.trace),
|
|
1971
|
+
traceRef
|
|
1972
|
+
});
|
|
1973
|
+
}
|
|
1974
|
+
return records;
|
|
1975
|
+
}
|
|
1867
1976
|
async readAndCleanupOutputFile(filePath) {
|
|
1868
1977
|
try {
|
|
1869
1978
|
const content = await readTextFile(filePath);
|
|
@@ -1925,7 +2034,7 @@ var CliProvider = class {
|
|
|
1925
2034
|
);
|
|
1926
2035
|
if (this.verbose) {
|
|
1927
2036
|
console.log(
|
|
1928
|
-
`[cli-provider:${this.targetName}] (healthcheck)
|
|
2037
|
+
`[cli-provider:${this.targetName}] (healthcheck) cwd=${healthcheck.cwd ?? this.config.cwd ?? ""} command=${renderedCommand}`
|
|
1929
2038
|
);
|
|
1930
2039
|
}
|
|
1931
2040
|
const result = await this.runCommand(renderedCommand, {
|
|
@@ -1993,11 +2102,11 @@ function shellEscape(value) {
|
|
|
1993
2102
|
}
|
|
1994
2103
|
return `'${value.replace(/'/g, `'"'"'`)}'`;
|
|
1995
2104
|
}
|
|
1996
|
-
function generateOutputFilePath(evalCaseId) {
|
|
2105
|
+
function generateOutputFilePath(evalCaseId, extension = ".json") {
|
|
1997
2106
|
const safeEvalId = evalCaseId || "unknown";
|
|
1998
2107
|
const timestamp = Date.now();
|
|
1999
2108
|
const random = Math.random().toString(36).substring(2, 9);
|
|
2000
|
-
return path7.join(os.tmpdir(), `agentv-${safeEvalId}-${timestamp}-${random}.json`);
|
|
2109
|
+
return path7.join(os.tmpdir(), `agentv-${safeEvalId}-${timestamp}-${random}${extension}`);
|
|
2001
2110
|
}
|
|
2002
2111
|
function formatTimeoutSuffix(timeoutMs) {
|
|
2003
2112
|
if (!timeoutMs || timeoutMs <= 0) {
|
|
@@ -3270,7 +3379,7 @@ import { generateText as generateText2 } from "ai";
|
|
|
3270
3379
|
import { z } from "zod";
|
|
3271
3380
|
var DEFAULT_EVALUATOR_TEMPLATE = `You are an expert evaluator. Your goal is to grade the candidate_answer based on how well it achieves the expected_outcome for the original task.
|
|
3272
3381
|
|
|
3273
|
-
Use the reference_answer as a gold standard for a high-quality response (if provided). The candidate_answer does not need to match it verbatim, but should capture the key points and follow the same spirit.
|
|
3382
|
+
Use the reference_answer as a gold standard for a high-quality response (if provided). The reference_answer may be a simple text response, or it may contain a sequence of expected agent messages including tool calls. When it contains multiple messages, the last message represents the final expected answer. The candidate_answer does not need to match it verbatim, but should capture the key points and follow the same spirit.
|
|
3274
3383
|
|
|
3275
3384
|
Be concise and focused in your evaluation. Provide succinct, specific feedback rather than verbose explanations.
|
|
3276
3385
|
|
|
@@ -3328,7 +3437,7 @@ var LlmJudgeEvaluator = class {
|
|
|
3328
3437
|
const variables = {
|
|
3329
3438
|
[TEMPLATE_VARIABLES.INPUT_MESSAGES]: JSON.stringify(context.evalCase.input_segments, null, 2),
|
|
3330
3439
|
[TEMPLATE_VARIABLES.EXPECTED_MESSAGES]: JSON.stringify(
|
|
3331
|
-
context.evalCase.expected_segments,
|
|
3440
|
+
context.evalCase.expected_messages,
|
|
3332
3441
|
null,
|
|
3333
3442
|
2
|
|
3334
3443
|
),
|
|
@@ -3541,13 +3650,16 @@ var CodeEvaluator = class {
|
|
|
3541
3650
|
{
|
|
3542
3651
|
question: context.evalCase.question,
|
|
3543
3652
|
expected_outcome: context.evalCase.expected_outcome,
|
|
3653
|
+
expected_messages: context.evalCase.expected_messages,
|
|
3544
3654
|
reference_answer: context.evalCase.reference_answer,
|
|
3545
3655
|
candidate_answer: context.candidate,
|
|
3546
3656
|
guideline_files: context.evalCase.guideline_paths,
|
|
3547
3657
|
input_files: context.evalCase.file_paths.filter(
|
|
3548
3658
|
(path13) => !context.evalCase.guideline_paths.includes(path13)
|
|
3549
3659
|
),
|
|
3550
|
-
input_messages: context.evalCase.input_messages
|
|
3660
|
+
input_messages: context.evalCase.input_messages,
|
|
3661
|
+
candidate_trace_file: context.candidateTraceRef ?? null,
|
|
3662
|
+
candidate_trace_summary: context.candidateTraceSummary ?? null
|
|
3551
3663
|
},
|
|
3552
3664
|
null,
|
|
3553
3665
|
2
|
|
@@ -3813,105 +3925,6 @@ var ToolTrajectoryEvaluator = class {
|
|
|
3813
3925
|
};
|
|
3814
3926
|
}
|
|
3815
3927
|
};
|
|
3816
|
-
var ExpectedToolCallsEvaluator = class {
|
|
3817
|
-
kind = "expected_tool_calls";
|
|
3818
|
-
evaluate(context) {
|
|
3819
|
-
const { candidateTrace, evalCase } = context;
|
|
3820
|
-
const expectedSegments = evalCase.expected_segments;
|
|
3821
|
-
const expectedToolCalls = this.extractExpectedToolCalls(expectedSegments);
|
|
3822
|
-
if (expectedToolCalls.length === 0) {
|
|
3823
|
-
return {
|
|
3824
|
-
score: 1,
|
|
3825
|
-
verdict: "pass",
|
|
3826
|
-
hits: ["No tool_calls specified in expected_messages"],
|
|
3827
|
-
misses: [],
|
|
3828
|
-
expectedAspectCount: 1
|
|
3829
|
-
};
|
|
3830
|
-
}
|
|
3831
|
-
if (!candidateTrace || candidateTrace.length === 0) {
|
|
3832
|
-
return {
|
|
3833
|
-
score: 0,
|
|
3834
|
-
verdict: "fail",
|
|
3835
|
-
hits: [],
|
|
3836
|
-
misses: ["No trace available to validate tool_calls"],
|
|
3837
|
-
expectedAspectCount: expectedToolCalls.length
|
|
3838
|
-
};
|
|
3839
|
-
}
|
|
3840
|
-
const actualToolCalls = candidateTrace.filter((e) => e.type === "tool_call");
|
|
3841
|
-
return this.validateToolCalls(expectedToolCalls, actualToolCalls);
|
|
3842
|
-
}
|
|
3843
|
-
extractExpectedToolCalls(segments) {
|
|
3844
|
-
if (!segments) {
|
|
3845
|
-
return [];
|
|
3846
|
-
}
|
|
3847
|
-
const toolCalls = [];
|
|
3848
|
-
for (const segment of segments) {
|
|
3849
|
-
const role = segment.role;
|
|
3850
|
-
const segmentToolCalls = segment.tool_calls;
|
|
3851
|
-
if (role === "assistant" && Array.isArray(segmentToolCalls)) {
|
|
3852
|
-
for (const tc of segmentToolCalls) {
|
|
3853
|
-
if (typeof tc === "object" && tc !== null && typeof tc.tool === "string") {
|
|
3854
|
-
const toolCall = tc;
|
|
3855
|
-
toolCalls.push({ tool: toolCall.tool, input: toolCall.input });
|
|
3856
|
-
}
|
|
3857
|
-
}
|
|
3858
|
-
}
|
|
3859
|
-
}
|
|
3860
|
-
return toolCalls;
|
|
3861
|
-
}
|
|
3862
|
-
validateToolCalls(expected, actual) {
|
|
3863
|
-
const hits = [];
|
|
3864
|
-
const misses = [];
|
|
3865
|
-
for (let i = 0; i < expected.length; i++) {
|
|
3866
|
-
const expectedCall = expected[i];
|
|
3867
|
-
const actualCall = actual[i];
|
|
3868
|
-
if (!actualCall) {
|
|
3869
|
-
misses.push(
|
|
3870
|
-
`tool_calls[${i}]: expected ${expectedCall.tool}, but no more tool calls in trace`
|
|
3871
|
-
);
|
|
3872
|
-
continue;
|
|
3873
|
-
}
|
|
3874
|
-
if (actualCall.name !== expectedCall.tool) {
|
|
3875
|
-
misses.push(
|
|
3876
|
-
`tool_calls[${i}]: expected ${expectedCall.tool}, got ${actualCall.name ?? "unknown"}`
|
|
3877
|
-
);
|
|
3878
|
-
continue;
|
|
3879
|
-
}
|
|
3880
|
-
if (expectedCall.input !== void 0) {
|
|
3881
|
-
if (!this.deepEquals(expectedCall.input, actualCall.input)) {
|
|
3882
|
-
misses.push(`tool_calls[${i}]: ${expectedCall.tool} input mismatch`);
|
|
3883
|
-
continue;
|
|
3884
|
-
}
|
|
3885
|
-
}
|
|
3886
|
-
hits.push(`tool_calls[${i}]: ${expectedCall.tool} matched`);
|
|
3887
|
-
}
|
|
3888
|
-
const totalChecks = expected.length || 1;
|
|
3889
|
-
const score = hits.length / totalChecks;
|
|
3890
|
-
return {
|
|
3891
|
-
score,
|
|
3892
|
-
verdict: score >= 0.8 ? "pass" : score >= 0.6 ? "borderline" : "fail",
|
|
3893
|
-
hits,
|
|
3894
|
-
misses,
|
|
3895
|
-
expectedAspectCount: totalChecks
|
|
3896
|
-
};
|
|
3897
|
-
}
|
|
3898
|
-
deepEquals(a, b) {
|
|
3899
|
-
if (a === b) return true;
|
|
3900
|
-
if (typeof a !== typeof b) return false;
|
|
3901
|
-
if (typeof a !== "object" || a === null || b === null) return false;
|
|
3902
|
-
if (Array.isArray(a) && Array.isArray(b)) {
|
|
3903
|
-
if (a.length !== b.length) return false;
|
|
3904
|
-
return a.every((val, i) => this.deepEquals(val, b[i]));
|
|
3905
|
-
}
|
|
3906
|
-
if (Array.isArray(a) || Array.isArray(b)) return false;
|
|
3907
|
-
const aObj = a;
|
|
3908
|
-
const bObj = b;
|
|
3909
|
-
const aKeys = Object.keys(aObj);
|
|
3910
|
-
const bKeys = Object.keys(bObj);
|
|
3911
|
-
if (aKeys.length !== bKeys.length) return false;
|
|
3912
|
-
return aKeys.every((key) => this.deepEquals(aObj[key], bObj[key]));
|
|
3913
|
-
}
|
|
3914
|
-
};
|
|
3915
3928
|
var DEFAULT_COMPOSITE_AGGREGATOR_PROMPT = `Review the following evaluation results:
|
|
3916
3929
|
{{EVALUATOR_RESULTS_JSON}}
|
|
3917
3930
|
|
|
@@ -4673,6 +4686,7 @@ async function runEvalCase(options) {
|
|
|
4673
4686
|
judgeProvider,
|
|
4674
4687
|
agentTimeoutMs,
|
|
4675
4688
|
candidateTrace,
|
|
4689
|
+
candidateTraceRef: providerResponse.traceRef,
|
|
4676
4690
|
candidateTraceSummary
|
|
4677
4691
|
});
|
|
4678
4692
|
} catch (error) {
|
|
@@ -4692,6 +4706,7 @@ async function evaluateCandidate(options) {
|
|
|
4692
4706
|
judgeProvider,
|
|
4693
4707
|
agentTimeoutMs,
|
|
4694
4708
|
candidateTrace,
|
|
4709
|
+
candidateTraceRef,
|
|
4695
4710
|
candidateTraceSummary
|
|
4696
4711
|
} = options;
|
|
4697
4712
|
const gradeTimestamp = nowFn();
|
|
@@ -4707,6 +4722,7 @@ async function evaluateCandidate(options) {
|
|
|
4707
4722
|
judgeProvider,
|
|
4708
4723
|
agentTimeoutMs,
|
|
4709
4724
|
candidateTrace,
|
|
4725
|
+
candidateTraceRef,
|
|
4710
4726
|
candidateTraceSummary
|
|
4711
4727
|
});
|
|
4712
4728
|
const completedAt = nowFn();
|
|
@@ -4761,6 +4777,7 @@ async function runEvaluatorsForCase(options) {
|
|
|
4761
4777
|
judgeProvider,
|
|
4762
4778
|
agentTimeoutMs,
|
|
4763
4779
|
candidateTrace,
|
|
4780
|
+
candidateTraceRef,
|
|
4764
4781
|
candidateTraceSummary
|
|
4765
4782
|
} = options;
|
|
4766
4783
|
if (evalCase.evaluators && evalCase.evaluators.length > 0) {
|
|
@@ -4777,6 +4794,7 @@ async function runEvaluatorsForCase(options) {
|
|
|
4777
4794
|
judgeProvider,
|
|
4778
4795
|
agentTimeoutMs,
|
|
4779
4796
|
candidateTrace,
|
|
4797
|
+
candidateTraceRef,
|
|
4780
4798
|
candidateTraceSummary
|
|
4781
4799
|
});
|
|
4782
4800
|
}
|
|
@@ -4795,6 +4813,7 @@ async function runEvaluatorsForCase(options) {
|
|
|
4795
4813
|
now,
|
|
4796
4814
|
judgeProvider,
|
|
4797
4815
|
candidateTrace,
|
|
4816
|
+
candidateTraceRef,
|
|
4798
4817
|
candidateTraceSummary
|
|
4799
4818
|
});
|
|
4800
4819
|
return { score };
|
|
@@ -4813,6 +4832,7 @@ async function runEvaluatorList(options) {
|
|
|
4813
4832
|
judgeProvider,
|
|
4814
4833
|
agentTimeoutMs,
|
|
4815
4834
|
candidateTrace,
|
|
4835
|
+
candidateTraceRef,
|
|
4816
4836
|
candidateTraceSummary
|
|
4817
4837
|
} = options;
|
|
4818
4838
|
const scored = [];
|
|
@@ -4859,7 +4879,9 @@ async function runEvaluatorList(options) {
|
|
|
4859
4879
|
provider,
|
|
4860
4880
|
attempt,
|
|
4861
4881
|
promptInputs,
|
|
4862
|
-
now
|
|
4882
|
+
now,
|
|
4883
|
+
candidateTraceRef,
|
|
4884
|
+
candidateTraceSummary
|
|
4863
4885
|
});
|
|
4864
4886
|
const weight = evaluator.weight ?? 1;
|
|
4865
4887
|
scored.push({ score: score2, name: evaluator.name, type: "code_judge", weight });
|
|
@@ -4897,8 +4919,6 @@ async function runEvaluatorList(options) {
|
|
|
4897
4919
|
return new ToolTrajectoryEvaluator({
|
|
4898
4920
|
config: memberConfig
|
|
4899
4921
|
});
|
|
4900
|
-
case "expected_tool_calls":
|
|
4901
|
-
return new ExpectedToolCallsEvaluator();
|
|
4902
4922
|
default: {
|
|
4903
4923
|
const unknownConfig = memberConfig;
|
|
4904
4924
|
throw new Error(`Unsupported evaluator type in composite: ${unknownConfig.type}`);
|
|
@@ -4948,32 +4968,7 @@ async function runEvaluatorList(options) {
|
|
|
4948
4968
|
promptInputs,
|
|
4949
4969
|
now,
|
|
4950
4970
|
candidateTrace,
|
|
4951
|
-
|
|
4952
|
-
});
|
|
4953
|
-
const weight = evaluator.weight ?? 1;
|
|
4954
|
-
scored.push({ score: score2, name: evaluator.name, type: evaluator.type, weight });
|
|
4955
|
-
evaluatorResults.push({
|
|
4956
|
-
name: evaluator.name,
|
|
4957
|
-
type: evaluator.type,
|
|
4958
|
-
score: score2.score,
|
|
4959
|
-
weight,
|
|
4960
|
-
verdict: score2.verdict,
|
|
4961
|
-
hits: score2.hits,
|
|
4962
|
-
misses: score2.misses,
|
|
4963
|
-
reasoning: score2.reasoning
|
|
4964
|
-
});
|
|
4965
|
-
}
|
|
4966
|
-
if (evaluator.type === "expected_tool_calls") {
|
|
4967
|
-
const expectedToolCallsEvaluator = new ExpectedToolCallsEvaluator();
|
|
4968
|
-
const score2 = expectedToolCallsEvaluator.evaluate({
|
|
4969
|
-
evalCase,
|
|
4970
|
-
candidate,
|
|
4971
|
-
target,
|
|
4972
|
-
provider,
|
|
4973
|
-
attempt,
|
|
4974
|
-
promptInputs,
|
|
4975
|
-
now,
|
|
4976
|
-
candidateTrace,
|
|
4971
|
+
candidateTraceRef,
|
|
4977
4972
|
candidateTraceSummary
|
|
4978
4973
|
});
|
|
4979
4974
|
const weight = evaluator.weight ?? 1;
|
|
@@ -5345,7 +5340,6 @@ function createAgentKernel() {
|
|
|
5345
5340
|
export {
|
|
5346
5341
|
CodeEvaluator,
|
|
5347
5342
|
CompositeEvaluator,
|
|
5348
|
-
ExpectedToolCallsEvaluator,
|
|
5349
5343
|
LlmJudgeEvaluator,
|
|
5350
5344
|
TEST_MESSAGE_ROLES,
|
|
5351
5345
|
ToolTrajectoryEvaluator,
|
|
@@ -5363,7 +5357,6 @@ export {
|
|
|
5363
5357
|
generateRubrics,
|
|
5364
5358
|
getHitCount,
|
|
5365
5359
|
isEvaluatorKind,
|
|
5366
|
-
isExpectedToolCall,
|
|
5367
5360
|
isGuidelineFile,
|
|
5368
5361
|
isJsonObject,
|
|
5369
5362
|
isJsonValue,
|