@agentv/core 1.2.0 → 1.4.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/README.md +77 -77
- package/dist/{chunk-V3JCB3HI.js → chunk-KPHTMTZ3.js} +32 -7
- package/dist/chunk-KPHTMTZ3.js.map +1 -0
- package/dist/evaluation/validation/index.cjs +17 -1
- package/dist/evaluation/validation/index.cjs.map +1 -1
- package/dist/evaluation/validation/index.js +18 -2
- package/dist/evaluation/validation/index.js.map +1 -1
- package/dist/index.cjs +411 -146
- package/dist/index.cjs.map +1 -1
- package/dist/index.d.cts +59 -51
- package/dist/index.d.ts +59 -51
- package/dist/index.js +371 -129
- package/dist/index.js.map +1 -1
- package/package.json +2 -5
- package/dist/chunk-V3JCB3HI.js.map +0 -1
package/dist/index.js
CHANGED
|
@@ -1,6 +1,7 @@
|
|
|
1
1
|
import {
|
|
2
2
|
buildDirectoryChain,
|
|
3
3
|
buildSearchRoots,
|
|
4
|
+
extractLastAssistantContent,
|
|
4
5
|
fileExists,
|
|
5
6
|
findGitRoot,
|
|
6
7
|
isAgentProvider,
|
|
@@ -9,7 +10,7 @@ import {
|
|
|
9
10
|
readTextFile,
|
|
10
11
|
resolveFileReference,
|
|
11
12
|
resolveTargetDefinition
|
|
12
|
-
} from "./chunk-V3JCB3HI.js";
|
|
13
|
+
} from "./chunk-KPHTMTZ3.js";
|
|
13
14
|
|
|
14
15
|
// src/evaluation/types.ts
|
|
15
16
|
var TEST_MESSAGE_ROLE_VALUES = ["system", "user", "assistant", "tool"];
|
|
@@ -74,33 +75,22 @@ function getHitCount(result) {
|
|
|
74
75
|
}
|
|
75
76
|
|
|
76
77
|
// src/evaluation/trace.ts
|
|
77
|
-
function isTraceEventType(value) {
|
|
78
|
-
return typeof value === "string" && ["model_step", "tool_call", "tool_result", "message", "error"].includes(value);
|
|
79
|
-
}
|
|
80
|
-
function isTraceEvent(value) {
|
|
81
|
-
if (typeof value !== "object" || value === null) {
|
|
82
|
-
return false;
|
|
83
|
-
}
|
|
84
|
-
const candidate = value;
|
|
85
|
-
return isTraceEventType(candidate.type) && typeof candidate.timestamp === "string";
|
|
86
|
-
}
|
|
87
|
-
function computeTraceSummary(trace) {
|
|
78
|
+
function computeTraceSummary(messages) {
|
|
88
79
|
const toolCallCounts = {};
|
|
89
|
-
let errorCount = 0;
|
|
90
|
-
for (const
|
|
91
|
-
if (
|
|
92
|
-
|
|
93
|
-
|
|
94
|
-
|
|
95
|
-
errorCount++;
|
|
80
|
+
let totalToolCalls = 0;
|
|
81
|
+
for (const message of messages) {
|
|
82
|
+
if (!message.toolCalls) continue;
|
|
83
|
+
for (const toolCall of message.toolCalls) {
|
|
84
|
+
toolCallCounts[toolCall.tool] = (toolCallCounts[toolCall.tool] ?? 0) + 1;
|
|
85
|
+
totalToolCalls++;
|
|
96
86
|
}
|
|
97
87
|
}
|
|
98
88
|
const toolNames = Object.keys(toolCallCounts).sort();
|
|
99
89
|
return {
|
|
100
|
-
eventCount:
|
|
90
|
+
eventCount: totalToolCalls,
|
|
101
91
|
toolNames,
|
|
102
92
|
toolCallsByName: toolCallCounts,
|
|
103
|
-
errorCount
|
|
93
|
+
errorCount: 0
|
|
104
94
|
};
|
|
105
95
|
}
|
|
106
96
|
|
|
@@ -376,7 +366,8 @@ var TEMPLATE_VARIABLES = {
|
|
|
376
366
|
QUESTION: "question",
|
|
377
367
|
EXPECTED_OUTCOME: "expected_outcome",
|
|
378
368
|
REFERENCE_ANSWER: "reference_answer",
|
|
379
|
-
INPUT_MESSAGES: "input_messages"
|
|
369
|
+
INPUT_MESSAGES: "input_messages",
|
|
370
|
+
OUTPUT_MESSAGES: "output_messages"
|
|
380
371
|
};
|
|
381
372
|
var VALID_TEMPLATE_VARIABLES = new Set(Object.values(TEMPLATE_VARIABLES));
|
|
382
373
|
var REQUIRED_TEMPLATE_VARIABLES = /* @__PURE__ */ new Set([
|
|
@@ -774,6 +765,17 @@ async function processMessages(options) {
|
|
|
774
765
|
}
|
|
775
766
|
continue;
|
|
776
767
|
}
|
|
768
|
+
if (isJsonObject(content)) {
|
|
769
|
+
const rendered = JSON.stringify(content, null, 2);
|
|
770
|
+
segments.push({ type: "text", value: rendered });
|
|
771
|
+
if (textParts) {
|
|
772
|
+
textParts.push(rendered);
|
|
773
|
+
}
|
|
774
|
+
continue;
|
|
775
|
+
}
|
|
776
|
+
if (!Array.isArray(content)) {
|
|
777
|
+
continue;
|
|
778
|
+
}
|
|
777
779
|
for (const rawSegment of content) {
|
|
778
780
|
if (!isJsonObject(rawSegment)) {
|
|
779
781
|
continue;
|
|
@@ -1000,6 +1002,11 @@ async function buildPromptInputs(testCase, mode = "lm") {
|
|
|
1000
1002
|
}
|
|
1001
1003
|
}
|
|
1002
1004
|
}
|
|
1005
|
+
} else if (isJsonObject(message.content)) {
|
|
1006
|
+
const rendered = JSON.stringify(message.content, null, 2);
|
|
1007
|
+
if (rendered.trim().length > 0) {
|
|
1008
|
+
messageSegments.push({ type: "text", value: rendered });
|
|
1009
|
+
}
|
|
1003
1010
|
}
|
|
1004
1011
|
segmentsByMessage.push(messageSegments);
|
|
1005
1012
|
}
|
|
@@ -1243,16 +1250,16 @@ async function loadEvalCases(evalFilePath, repoRoot, options) {
|
|
|
1243
1250
|
}) : [];
|
|
1244
1251
|
const codeSnippets = extractCodeBlocks(inputSegments);
|
|
1245
1252
|
let referenceAnswer = "";
|
|
1246
|
-
if (outputSegments.length >
|
|
1247
|
-
|
|
1248
|
-
|
|
1249
|
-
const
|
|
1250
|
-
if (typeof
|
|
1251
|
-
referenceAnswer =
|
|
1252
|
-
} else if (
|
|
1253
|
-
referenceAnswer = JSON.stringify(
|
|
1254
|
-
} else if (
|
|
1255
|
-
referenceAnswer = JSON.stringify(
|
|
1253
|
+
if (outputSegments.length > 0) {
|
|
1254
|
+
const lastMessage = outputSegments[outputSegments.length - 1];
|
|
1255
|
+
const content = lastMessage.content;
|
|
1256
|
+
const toolCalls = lastMessage.tool_calls;
|
|
1257
|
+
if (typeof content === "string") {
|
|
1258
|
+
referenceAnswer = content;
|
|
1259
|
+
} else if (content !== void 0 && content !== null) {
|
|
1260
|
+
referenceAnswer = JSON.stringify(content, null, 2);
|
|
1261
|
+
} else if (toolCalls !== void 0 && toolCalls !== null) {
|
|
1262
|
+
referenceAnswer = JSON.stringify(toolCalls, null, 2);
|
|
1256
1263
|
}
|
|
1257
1264
|
}
|
|
1258
1265
|
const question = inputTextParts.map((part) => part.trim()).filter((part) => part.length > 0).join(" ");
|
|
@@ -1580,11 +1587,11 @@ async function invokeModel(options) {
|
|
|
1580
1587
|
return mapResponse(result);
|
|
1581
1588
|
}
|
|
1582
1589
|
function mapResponse(result) {
|
|
1590
|
+
const content = result.text ?? "";
|
|
1583
1591
|
return {
|
|
1584
|
-
text: result.text ?? "",
|
|
1585
|
-
reasoning: result.reasoningText ?? void 0,
|
|
1586
1592
|
raw: result,
|
|
1587
|
-
usage: toJsonObject(result.totalUsage ?? result.usage)
|
|
1593
|
+
usage: toJsonObject(result.totalUsage ?? result.usage),
|
|
1594
|
+
outputMessages: [{ role: "assistant", content }]
|
|
1588
1595
|
};
|
|
1589
1596
|
}
|
|
1590
1597
|
function toJsonObject(value) {
|
|
@@ -1733,10 +1740,11 @@ var CliProvider = class {
|
|
|
1733
1740
|
id;
|
|
1734
1741
|
kind = "cli";
|
|
1735
1742
|
targetName;
|
|
1736
|
-
supportsBatch =
|
|
1743
|
+
supportsBatch = true;
|
|
1737
1744
|
config;
|
|
1738
1745
|
runCommand;
|
|
1739
1746
|
verbose;
|
|
1747
|
+
keepTempFiles;
|
|
1740
1748
|
healthcheckPromise;
|
|
1741
1749
|
constructor(targetName, config, runner = defaultCommandRunner) {
|
|
1742
1750
|
this.targetName = targetName;
|
|
@@ -1744,6 +1752,7 @@ var CliProvider = class {
|
|
|
1744
1752
|
this.config = config;
|
|
1745
1753
|
this.runCommand = runner;
|
|
1746
1754
|
this.verbose = config.verbose ?? false;
|
|
1755
|
+
this.keepTempFiles = config.keepTempFiles ?? false;
|
|
1747
1756
|
}
|
|
1748
1757
|
async invoke(request) {
|
|
1749
1758
|
if (request.signal?.aborted) {
|
|
@@ -1753,6 +1762,11 @@ var CliProvider = class {
|
|
|
1753
1762
|
const outputFilePath = generateOutputFilePath(request.evalCaseId);
|
|
1754
1763
|
const templateValues = buildTemplateValues(request, this.config, outputFilePath);
|
|
1755
1764
|
const renderedCommand = renderTemplate(this.config.commandTemplate, templateValues);
|
|
1765
|
+
if (this.verbose) {
|
|
1766
|
+
console.log(
|
|
1767
|
+
`[cli-provider:${this.targetName}] cwd=${this.config.cwd ?? ""} command=${renderedCommand}`
|
|
1768
|
+
);
|
|
1769
|
+
}
|
|
1756
1770
|
const result = await this.runCommand(renderedCommand, {
|
|
1757
1771
|
cwd: this.config.cwd,
|
|
1758
1772
|
env: process.env,
|
|
@@ -1776,8 +1790,7 @@ var CliProvider = class {
|
|
|
1776
1790
|
const responseContent = await this.readAndCleanupOutputFile(outputFilePath);
|
|
1777
1791
|
const parsed = this.parseOutputContent(responseContent);
|
|
1778
1792
|
return {
|
|
1779
|
-
|
|
1780
|
-
trace: parsed.trace,
|
|
1793
|
+
outputMessages: parsed.outputMessages,
|
|
1781
1794
|
raw: {
|
|
1782
1795
|
command: renderedCommand,
|
|
1783
1796
|
stderr: result.stderr,
|
|
@@ -1787,30 +1800,225 @@ var CliProvider = class {
|
|
|
1787
1800
|
}
|
|
1788
1801
|
};
|
|
1789
1802
|
}
|
|
1803
|
+
async invokeBatch(requests) {
|
|
1804
|
+
if (requests.length === 0) {
|
|
1805
|
+
return [];
|
|
1806
|
+
}
|
|
1807
|
+
for (const request of requests) {
|
|
1808
|
+
if (request.signal?.aborted) {
|
|
1809
|
+
throw new Error("CLI provider batch request was aborted before execution");
|
|
1810
|
+
}
|
|
1811
|
+
}
|
|
1812
|
+
const controller = new AbortController();
|
|
1813
|
+
for (const request of requests) {
|
|
1814
|
+
request.signal?.addEventListener("abort", () => controller.abort(), { once: true });
|
|
1815
|
+
}
|
|
1816
|
+
await this.ensureHealthy(controller.signal);
|
|
1817
|
+
const outputFilePath = generateOutputFilePath("batch", ".jsonl");
|
|
1818
|
+
const batchInputFiles = [];
|
|
1819
|
+
for (const request of requests) {
|
|
1820
|
+
if (request.inputFiles && request.inputFiles.length > 0) {
|
|
1821
|
+
batchInputFiles.push(...request.inputFiles);
|
|
1822
|
+
}
|
|
1823
|
+
}
|
|
1824
|
+
const templateValues = buildTemplateValues(
|
|
1825
|
+
{
|
|
1826
|
+
question: "",
|
|
1827
|
+
guidelines: "",
|
|
1828
|
+
inputFiles: batchInputFiles,
|
|
1829
|
+
evalCaseId: "batch",
|
|
1830
|
+
attempt: 0
|
|
1831
|
+
},
|
|
1832
|
+
this.config,
|
|
1833
|
+
outputFilePath
|
|
1834
|
+
);
|
|
1835
|
+
const renderedCommand = renderTemplate(this.config.commandTemplate, templateValues);
|
|
1836
|
+
if (this.verbose) {
|
|
1837
|
+
console.log(
|
|
1838
|
+
`[cli-provider:${this.targetName}] (batch size=${requests.length}) cwd=${this.config.cwd ?? ""} command=${renderedCommand}`
|
|
1839
|
+
);
|
|
1840
|
+
}
|
|
1841
|
+
const result = await this.runCommand(renderedCommand, {
|
|
1842
|
+
cwd: this.config.cwd,
|
|
1843
|
+
env: process.env,
|
|
1844
|
+
timeoutMs: this.config.timeoutMs,
|
|
1845
|
+
signal: controller.signal
|
|
1846
|
+
});
|
|
1847
|
+
if (result.failed || (result.exitCode ?? 0) !== 0) {
|
|
1848
|
+
if (controller.signal.aborted) {
|
|
1849
|
+
throw new Error("CLI provider request was aborted");
|
|
1850
|
+
}
|
|
1851
|
+
if (result.timedOut) {
|
|
1852
|
+
throw new Error(
|
|
1853
|
+
`CLI provider timed out${formatTimeoutSuffix(this.config.timeoutMs ?? void 0)}`
|
|
1854
|
+
);
|
|
1855
|
+
}
|
|
1856
|
+
const codeText = result.exitCode !== null ? result.exitCode : "unknown";
|
|
1857
|
+
const detail = result.stderr.trim() || result.stdout.trim();
|
|
1858
|
+
const message = detail ? `${detail} (exit code ${codeText})` : `CLI exited with code ${codeText}`;
|
|
1859
|
+
throw new Error(message);
|
|
1860
|
+
}
|
|
1861
|
+
const responseContent = await this.readAndCleanupOutputFile(outputFilePath);
|
|
1862
|
+
const recordsById = this.parseJsonlBatchOutput(responseContent);
|
|
1863
|
+
const requestedIds = requests.map((request) => request.evalCaseId).filter((id) => typeof id === "string" && id.trim().length > 0);
|
|
1864
|
+
const missingIds = requestedIds.filter((id) => !recordsById.has(id));
|
|
1865
|
+
if (missingIds.length > 0) {
|
|
1866
|
+
throw new Error(`CLI batch output missing ids: ${missingIds.join(", ")}`);
|
|
1867
|
+
}
|
|
1868
|
+
const responses = requests.map((request) => {
|
|
1869
|
+
const evalCaseId = request.evalCaseId;
|
|
1870
|
+
if (!evalCaseId) {
|
|
1871
|
+
return {
|
|
1872
|
+
outputMessages: [],
|
|
1873
|
+
raw: {
|
|
1874
|
+
command: renderedCommand,
|
|
1875
|
+
stderr: result.stderr,
|
|
1876
|
+
exitCode: result.exitCode ?? 0,
|
|
1877
|
+
cwd: this.config.cwd,
|
|
1878
|
+
outputFile: outputFilePath
|
|
1879
|
+
}
|
|
1880
|
+
};
|
|
1881
|
+
}
|
|
1882
|
+
const parsed = recordsById.get(evalCaseId);
|
|
1883
|
+
if (!parsed) {
|
|
1884
|
+
return {
|
|
1885
|
+
outputMessages: [],
|
|
1886
|
+
raw: {
|
|
1887
|
+
command: renderedCommand,
|
|
1888
|
+
stderr: result.stderr,
|
|
1889
|
+
exitCode: result.exitCode ?? 0,
|
|
1890
|
+
cwd: this.config.cwd,
|
|
1891
|
+
outputFile: outputFilePath
|
|
1892
|
+
}
|
|
1893
|
+
};
|
|
1894
|
+
}
|
|
1895
|
+
return {
|
|
1896
|
+
outputMessages: parsed.outputMessages,
|
|
1897
|
+
raw: {
|
|
1898
|
+
command: renderedCommand,
|
|
1899
|
+
stderr: result.stderr,
|
|
1900
|
+
exitCode: result.exitCode ?? 0,
|
|
1901
|
+
cwd: this.config.cwd,
|
|
1902
|
+
outputFile: outputFilePath,
|
|
1903
|
+
recordId: evalCaseId
|
|
1904
|
+
}
|
|
1905
|
+
};
|
|
1906
|
+
});
|
|
1907
|
+
return responses;
|
|
1908
|
+
}
|
|
1790
1909
|
/**
|
|
1791
1910
|
* Parse output content from CLI.
|
|
1792
|
-
* If the content is valid JSON with
|
|
1793
|
-
*
|
|
1911
|
+
* If the content is valid JSON with 'output_messages' or 'text' field, extract them.
|
|
1912
|
+
* If only 'text' is provided, wrap it in outputMessages.
|
|
1913
|
+
* Otherwise, treat the entire content as plain text wrapped in outputMessages.
|
|
1794
1914
|
*/
|
|
1795
1915
|
parseOutputContent(content) {
|
|
1796
1916
|
try {
|
|
1797
1917
|
const parsed = JSON.parse(content);
|
|
1798
|
-
if (typeof parsed === "object" && parsed !== null
|
|
1918
|
+
if (typeof parsed === "object" && parsed !== null) {
|
|
1799
1919
|
const obj = parsed;
|
|
1800
|
-
const
|
|
1801
|
-
|
|
1802
|
-
|
|
1920
|
+
const outputMessages = this.parseOutputMessages(obj.output_messages);
|
|
1921
|
+
if (outputMessages && outputMessages.length > 0) {
|
|
1922
|
+
return { outputMessages };
|
|
1923
|
+
}
|
|
1924
|
+
if ("text" in obj) {
|
|
1925
|
+
const text = typeof obj.text === "string" ? obj.text : String(obj.text);
|
|
1926
|
+
return { outputMessages: [{ role: "assistant", content: text }] };
|
|
1927
|
+
}
|
|
1803
1928
|
}
|
|
1804
1929
|
} catch {
|
|
1805
1930
|
}
|
|
1806
|
-
return {
|
|
1931
|
+
return { outputMessages: [{ role: "assistant", content }] };
|
|
1807
1932
|
}
|
|
1808
|
-
|
|
1809
|
-
|
|
1933
|
+
/**
|
|
1934
|
+
* Parse output_messages from JSONL (snake_case) and convert to OutputMessage[] (camelCase).
|
|
1935
|
+
*/
|
|
1936
|
+
parseOutputMessages(outputMessages) {
|
|
1937
|
+
if (!Array.isArray(outputMessages)) {
|
|
1810
1938
|
return void 0;
|
|
1811
1939
|
}
|
|
1812
|
-
const
|
|
1813
|
-
|
|
1940
|
+
const messages = [];
|
|
1941
|
+
for (const msg of outputMessages) {
|
|
1942
|
+
if (typeof msg !== "object" || msg === null) {
|
|
1943
|
+
continue;
|
|
1944
|
+
}
|
|
1945
|
+
const rawMsg = msg;
|
|
1946
|
+
if (typeof rawMsg.role !== "string") {
|
|
1947
|
+
continue;
|
|
1948
|
+
}
|
|
1949
|
+
const message = {
|
|
1950
|
+
role: rawMsg.role,
|
|
1951
|
+
name: typeof rawMsg.name === "string" ? rawMsg.name : void 0,
|
|
1952
|
+
content: rawMsg.content,
|
|
1953
|
+
toolCalls: this.parseToolCalls(rawMsg.tool_calls),
|
|
1954
|
+
timestamp: typeof rawMsg.timestamp === "string" ? rawMsg.timestamp : void 0,
|
|
1955
|
+
metadata: typeof rawMsg.metadata === "object" && rawMsg.metadata !== null ? rawMsg.metadata : void 0
|
|
1956
|
+
};
|
|
1957
|
+
messages.push(message);
|
|
1958
|
+
}
|
|
1959
|
+
return messages.length > 0 ? messages : void 0;
|
|
1960
|
+
}
|
|
1961
|
+
/**
|
|
1962
|
+
* Parse tool_calls from JSONL (snake_case) and convert to ToolCall[] format.
|
|
1963
|
+
*/
|
|
1964
|
+
parseToolCalls(toolCalls) {
|
|
1965
|
+
if (!Array.isArray(toolCalls)) {
|
|
1966
|
+
return void 0;
|
|
1967
|
+
}
|
|
1968
|
+
const calls = [];
|
|
1969
|
+
for (const call of toolCalls) {
|
|
1970
|
+
if (typeof call !== "object" || call === null) {
|
|
1971
|
+
continue;
|
|
1972
|
+
}
|
|
1973
|
+
const rawCall = call;
|
|
1974
|
+
if (typeof rawCall.tool !== "string") {
|
|
1975
|
+
continue;
|
|
1976
|
+
}
|
|
1977
|
+
calls.push({
|
|
1978
|
+
tool: rawCall.tool,
|
|
1979
|
+
input: rawCall.input,
|
|
1980
|
+
output: rawCall.output,
|
|
1981
|
+
id: typeof rawCall.id === "string" ? rawCall.id : void 0,
|
|
1982
|
+
timestamp: typeof rawCall.timestamp === "string" ? rawCall.timestamp : void 0
|
|
1983
|
+
});
|
|
1984
|
+
}
|
|
1985
|
+
return calls.length > 0 ? calls : void 0;
|
|
1986
|
+
}
|
|
1987
|
+
parseJsonlBatchOutput(content) {
|
|
1988
|
+
const records = /* @__PURE__ */ new Map();
|
|
1989
|
+
const lines = content.split(/\r?\n/).map((line) => line.trim()).filter((line) => line.length > 0);
|
|
1990
|
+
for (const line of lines) {
|
|
1991
|
+
let parsed;
|
|
1992
|
+
try {
|
|
1993
|
+
parsed = JSON.parse(line);
|
|
1994
|
+
} catch (error) {
|
|
1995
|
+
const reason = error instanceof Error ? error.message : String(error);
|
|
1996
|
+
throw new Error(`CLI batch output contains invalid JSONL line: ${reason}`);
|
|
1997
|
+
}
|
|
1998
|
+
if (typeof parsed !== "object" || parsed === null) {
|
|
1999
|
+
throw new Error("CLI batch output JSONL line must be an object");
|
|
2000
|
+
}
|
|
2001
|
+
const obj = parsed;
|
|
2002
|
+
const id = typeof obj.id === "string" ? obj.id : void 0;
|
|
2003
|
+
if (!id || id.trim().length === 0) {
|
|
2004
|
+
throw new Error("CLI batch output JSONL line missing required string field: id");
|
|
2005
|
+
}
|
|
2006
|
+
if (records.has(id)) {
|
|
2007
|
+
throw new Error(`CLI batch output contains duplicate id: ${id}`);
|
|
2008
|
+
}
|
|
2009
|
+
const parsedOutputMessages = this.parseOutputMessages(obj.output_messages);
|
|
2010
|
+
let outputMessages;
|
|
2011
|
+
if (parsedOutputMessages && parsedOutputMessages.length > 0) {
|
|
2012
|
+
outputMessages = parsedOutputMessages;
|
|
2013
|
+
} else {
|
|
2014
|
+
const text = typeof obj.text === "string" ? obj.text : obj.text === void 0 ? "" : JSON.stringify(obj.text);
|
|
2015
|
+
outputMessages = text ? [{ role: "assistant", content: text }] : [];
|
|
2016
|
+
}
|
|
2017
|
+
records.set(id, {
|
|
2018
|
+
outputMessages
|
|
2019
|
+
});
|
|
2020
|
+
}
|
|
2021
|
+
return records;
|
|
1814
2022
|
}
|
|
1815
2023
|
async readAndCleanupOutputFile(filePath) {
|
|
1816
2024
|
try {
|
|
@@ -1820,8 +2028,10 @@ var CliProvider = class {
|
|
|
1820
2028
|
const errorMsg = error instanceof Error ? error.message : String(error);
|
|
1821
2029
|
throw new Error(`Failed to read output file '${filePath}': ${errorMsg}`);
|
|
1822
2030
|
} finally {
|
|
1823
|
-
|
|
1824
|
-
|
|
2031
|
+
if (!this.keepTempFiles) {
|
|
2032
|
+
await fs.unlink(filePath).catch(() => {
|
|
2033
|
+
});
|
|
2034
|
+
}
|
|
1825
2035
|
}
|
|
1826
2036
|
}
|
|
1827
2037
|
async ensureHealthy(signal) {
|
|
@@ -1873,7 +2083,7 @@ var CliProvider = class {
|
|
|
1873
2083
|
);
|
|
1874
2084
|
if (this.verbose) {
|
|
1875
2085
|
console.log(
|
|
1876
|
-
`[cli-provider:${this.targetName}] (healthcheck)
|
|
2086
|
+
`[cli-provider:${this.targetName}] (healthcheck) cwd=${healthcheck.cwd ?? this.config.cwd ?? ""} command=${renderedCommand}`
|
|
1877
2087
|
);
|
|
1878
2088
|
}
|
|
1879
2089
|
const result = await this.runCommand(renderedCommand, {
|
|
@@ -1941,11 +2151,11 @@ function shellEscape(value) {
|
|
|
1941
2151
|
}
|
|
1942
2152
|
return `'${value.replace(/'/g, `'"'"'`)}'`;
|
|
1943
2153
|
}
|
|
1944
|
-
function generateOutputFilePath(evalCaseId) {
|
|
2154
|
+
function generateOutputFilePath(evalCaseId, extension = ".json") {
|
|
1945
2155
|
const safeEvalId = evalCaseId || "unknown";
|
|
1946
2156
|
const timestamp = Date.now();
|
|
1947
2157
|
const random = Math.random().toString(36).substring(2, 9);
|
|
1948
|
-
return path7.join(os.tmpdir(), `agentv-${safeEvalId}-${timestamp}-${random}.json`);
|
|
2158
|
+
return path7.join(os.tmpdir(), `agentv-${safeEvalId}-${timestamp}-${random}${extension}`);
|
|
1949
2159
|
}
|
|
1950
2160
|
function formatTimeoutSuffix(timeoutMs) {
|
|
1951
2161
|
if (!timeoutMs || timeoutMs <= 0) {
|
|
@@ -2164,7 +2374,6 @@ var CodexProvider = class {
|
|
|
2164
2374
|
const parsed = parseCodexJson(result.stdout);
|
|
2165
2375
|
const assistantText = extractAssistantText(parsed);
|
|
2166
2376
|
return {
|
|
2167
|
-
text: assistantText,
|
|
2168
2377
|
raw: {
|
|
2169
2378
|
response: parsed,
|
|
2170
2379
|
stdout: result.stdout,
|
|
@@ -2176,7 +2385,8 @@ var CodexProvider = class {
|
|
|
2176
2385
|
workspace: workspaceRoot,
|
|
2177
2386
|
inputFiles,
|
|
2178
2387
|
logFile: logger?.filePath
|
|
2179
|
-
}
|
|
2388
|
+
},
|
|
2389
|
+
outputMessages: [{ role: "assistant", content: assistantText }]
|
|
2180
2390
|
};
|
|
2181
2391
|
} finally {
|
|
2182
2392
|
await logger?.close();
|
|
@@ -2798,7 +3008,6 @@ var MockProvider = class {
|
|
|
2798
3008
|
delayMs;
|
|
2799
3009
|
delayMinMs;
|
|
2800
3010
|
delayMaxMs;
|
|
2801
|
-
trace;
|
|
2802
3011
|
constructor(targetName, config) {
|
|
2803
3012
|
this.id = `mock:${targetName}`;
|
|
2804
3013
|
this.targetName = targetName;
|
|
@@ -2806,7 +3015,6 @@ var MockProvider = class {
|
|
|
2806
3015
|
this.delayMs = config.delayMs ?? 0;
|
|
2807
3016
|
this.delayMinMs = config.delayMinMs ?? 0;
|
|
2808
3017
|
this.delayMaxMs = config.delayMaxMs ?? 0;
|
|
2809
|
-
this.trace = config.trace;
|
|
2810
3018
|
}
|
|
2811
3019
|
async invoke(request) {
|
|
2812
3020
|
const delay = this.calculateDelay();
|
|
@@ -2814,12 +3022,11 @@ var MockProvider = class {
|
|
|
2814
3022
|
await new Promise((resolve) => setTimeout(resolve, delay));
|
|
2815
3023
|
}
|
|
2816
3024
|
return {
|
|
2817
|
-
|
|
3025
|
+
outputMessages: [{ role: "assistant", content: this.cannedResponse }],
|
|
2818
3026
|
raw: {
|
|
2819
3027
|
question: request.question,
|
|
2820
3028
|
guidelines: request.guidelines
|
|
2821
|
-
}
|
|
2822
|
-
trace: this.trace
|
|
3029
|
+
}
|
|
2823
3030
|
};
|
|
2824
3031
|
}
|
|
2825
3032
|
calculateDelay() {
|
|
@@ -2912,7 +3119,7 @@ var VSCodeProvider = class {
|
|
|
2912
3119
|
}
|
|
2913
3120
|
if (this.config.dryRun) {
|
|
2914
3121
|
return {
|
|
2915
|
-
|
|
3122
|
+
outputMessages: [],
|
|
2916
3123
|
raw: {
|
|
2917
3124
|
session,
|
|
2918
3125
|
inputFiles
|
|
@@ -2921,7 +3128,7 @@ var VSCodeProvider = class {
|
|
|
2921
3128
|
}
|
|
2922
3129
|
const responseText = await readTextFile(session.responseFile);
|
|
2923
3130
|
return {
|
|
2924
|
-
|
|
3131
|
+
outputMessages: [{ role: "assistant", content: responseText }],
|
|
2925
3132
|
raw: {
|
|
2926
3133
|
session,
|
|
2927
3134
|
inputFiles
|
|
@@ -2959,7 +3166,7 @@ var VSCodeProvider = class {
|
|
|
2959
3166
|
}
|
|
2960
3167
|
if (this.config.dryRun) {
|
|
2961
3168
|
return normalizedRequests.map(({ inputFiles }) => ({
|
|
2962
|
-
|
|
3169
|
+
outputMessages: [],
|
|
2963
3170
|
raw: {
|
|
2964
3171
|
session,
|
|
2965
3172
|
inputFiles,
|
|
@@ -2976,7 +3183,7 @@ var VSCodeProvider = class {
|
|
|
2976
3183
|
for (const [index, responseFile] of session.responseFiles.entries()) {
|
|
2977
3184
|
const responseText = await readTextFile(responseFile);
|
|
2978
3185
|
responses.push({
|
|
2979
|
-
|
|
3186
|
+
outputMessages: [{ role: "assistant", content: responseText }],
|
|
2980
3187
|
raw: {
|
|
2981
3188
|
session,
|
|
2982
3189
|
inputFiles: normalizedRequests[index]?.inputFiles,
|
|
@@ -3280,6 +3487,7 @@ var LlmJudgeEvaluator = class {
|
|
|
3280
3487
|
null,
|
|
3281
3488
|
2
|
|
3282
3489
|
),
|
|
3490
|
+
[TEMPLATE_VARIABLES.OUTPUT_MESSAGES]: JSON.stringify(context.outputMessages ?? [], null, 2),
|
|
3283
3491
|
[TEMPLATE_VARIABLES.CANDIDATE_ANSWER]: context.candidate.trim(),
|
|
3284
3492
|
[TEMPLATE_VARIABLES.REFERENCE_ANSWER]: (context.evalCase.reference_answer ?? "").trim(),
|
|
3285
3493
|
[TEMPLATE_VARIABLES.EXPECTED_OUTCOME]: context.evalCase.expected_outcome.trim(),
|
|
@@ -3304,7 +3512,7 @@ var LlmJudgeEvaluator = class {
|
|
|
3304
3512
|
const score = clampScore(data.score);
|
|
3305
3513
|
const hits = Array.isArray(data.hits) ? data.hits.filter(isNonEmptyString).slice(0, 4) : [];
|
|
3306
3514
|
const misses = Array.isArray(data.misses) ? data.misses.filter(isNonEmptyString).slice(0, 4) : [];
|
|
3307
|
-
const reasoning = data.reasoning
|
|
3515
|
+
const reasoning = data.reasoning;
|
|
3308
3516
|
const expectedAspectCount = Math.max(hits.length + misses.length, 1);
|
|
3309
3517
|
return {
|
|
3310
3518
|
score,
|
|
@@ -3406,7 +3614,9 @@ var LlmJudgeEvaluator = class {
|
|
|
3406
3614
|
maxOutputTokens: this.maxOutputTokens,
|
|
3407
3615
|
temperature: this.temperature
|
|
3408
3616
|
});
|
|
3409
|
-
const data = schema.parse(
|
|
3617
|
+
const data = schema.parse(
|
|
3618
|
+
parseJsonFromText(extractLastAssistantContent(response.outputMessages))
|
|
3619
|
+
);
|
|
3410
3620
|
return { data, providerResponse: response };
|
|
3411
3621
|
} catch (e) {
|
|
3412
3622
|
lastError = e instanceof Error ? e : new Error(String(e));
|
|
@@ -3489,15 +3699,16 @@ var CodeEvaluator = class {
|
|
|
3489
3699
|
{
|
|
3490
3700
|
question: context.evalCase.question,
|
|
3491
3701
|
expected_outcome: context.evalCase.expected_outcome,
|
|
3702
|
+
expected_messages: context.evalCase.expected_messages,
|
|
3492
3703
|
reference_answer: context.evalCase.reference_answer,
|
|
3493
3704
|
candidate_answer: context.candidate,
|
|
3705
|
+
output_messages: context.outputMessages ?? null,
|
|
3494
3706
|
guideline_files: context.evalCase.guideline_paths,
|
|
3495
3707
|
input_files: context.evalCase.file_paths.filter(
|
|
3496
3708
|
(path13) => !context.evalCase.guideline_paths.includes(path13)
|
|
3497
3709
|
),
|
|
3498
3710
|
input_messages: context.evalCase.input_messages,
|
|
3499
|
-
|
|
3500
|
-
candidate_trace_summary: context.candidateTraceSummary ?? null
|
|
3711
|
+
candidate_trace_summary: context.traceSummary ?? null
|
|
3501
3712
|
},
|
|
3502
3713
|
null,
|
|
3503
3714
|
2
|
|
@@ -3624,8 +3835,19 @@ var ToolTrajectoryEvaluator = class {
|
|
|
3624
3835
|
this.config = options.config;
|
|
3625
3836
|
}
|
|
3626
3837
|
evaluate(context) {
|
|
3627
|
-
const {
|
|
3628
|
-
|
|
3838
|
+
const { outputMessages, traceSummary } = context;
|
|
3839
|
+
const toolCalls = this.extractToolCallsFromMessages(outputMessages);
|
|
3840
|
+
if (toolCalls.length === 0 && !traceSummary) {
|
|
3841
|
+
return {
|
|
3842
|
+
score: 0,
|
|
3843
|
+
verdict: "fail",
|
|
3844
|
+
hits: [],
|
|
3845
|
+
misses: ["No trace available for evaluation"],
|
|
3846
|
+
expectedAspectCount: 1
|
|
3847
|
+
};
|
|
3848
|
+
}
|
|
3849
|
+
const summary = toolCalls.length > 0 ? this.buildSummary(toolCalls) : traceSummary;
|
|
3850
|
+
if (!summary) {
|
|
3629
3851
|
return {
|
|
3630
3852
|
score: 0,
|
|
3631
3853
|
verdict: "fail",
|
|
@@ -3636,11 +3858,11 @@ var ToolTrajectoryEvaluator = class {
|
|
|
3636
3858
|
}
|
|
3637
3859
|
switch (this.config.mode) {
|
|
3638
3860
|
case "any_order":
|
|
3639
|
-
return this.evaluateAnyOrder(
|
|
3861
|
+
return this.evaluateAnyOrder(summary);
|
|
3640
3862
|
case "in_order":
|
|
3641
|
-
return this.evaluateInOrder(
|
|
3863
|
+
return this.evaluateInOrder(toolCalls);
|
|
3642
3864
|
case "exact":
|
|
3643
|
-
return this.evaluateExact(
|
|
3865
|
+
return this.evaluateExact(toolCalls);
|
|
3644
3866
|
default:
|
|
3645
3867
|
return {
|
|
3646
3868
|
score: 0,
|
|
@@ -3651,6 +3873,39 @@ var ToolTrajectoryEvaluator = class {
|
|
|
3651
3873
|
};
|
|
3652
3874
|
}
|
|
3653
3875
|
}
|
|
3876
|
+
/**
|
|
3877
|
+
* Extract tool calls from output messages.
|
|
3878
|
+
*/
|
|
3879
|
+
extractToolCallsFromMessages(messages) {
|
|
3880
|
+
if (!messages) {
|
|
3881
|
+
return [];
|
|
3882
|
+
}
|
|
3883
|
+
const toolCalls = [];
|
|
3884
|
+
for (const message of messages) {
|
|
3885
|
+
if (message.toolCalls) {
|
|
3886
|
+
for (const call of message.toolCalls) {
|
|
3887
|
+
toolCalls.push({ name: call.tool });
|
|
3888
|
+
}
|
|
3889
|
+
}
|
|
3890
|
+
}
|
|
3891
|
+
return toolCalls;
|
|
3892
|
+
}
|
|
3893
|
+
/**
|
|
3894
|
+
* Build a summary from extracted tool calls.
|
|
3895
|
+
*/
|
|
3896
|
+
buildSummary(toolCalls) {
|
|
3897
|
+
const toolCallsByName = {};
|
|
3898
|
+
for (const call of toolCalls) {
|
|
3899
|
+
toolCallsByName[call.name] = (toolCallsByName[call.name] ?? 0) + 1;
|
|
3900
|
+
}
|
|
3901
|
+
const toolNames = Object.keys(toolCallsByName).sort();
|
|
3902
|
+
return {
|
|
3903
|
+
eventCount: toolCalls.length,
|
|
3904
|
+
toolNames,
|
|
3905
|
+
toolCallsByName,
|
|
3906
|
+
errorCount: 0
|
|
3907
|
+
};
|
|
3908
|
+
}
|
|
3654
3909
|
evaluateAnyOrder(summary) {
|
|
3655
3910
|
const minimums = this.config.minimums ?? {};
|
|
3656
3911
|
const toolNames = Object.keys(minimums);
|
|
@@ -3683,7 +3938,7 @@ var ToolTrajectoryEvaluator = class {
|
|
|
3683
3938
|
expectedAspectCount: toolNames.length
|
|
3684
3939
|
};
|
|
3685
3940
|
}
|
|
3686
|
-
evaluateInOrder(
|
|
3941
|
+
evaluateInOrder(toolCalls) {
|
|
3687
3942
|
const expected = this.config.expected ?? [];
|
|
3688
3943
|
if (expected.length === 0) {
|
|
3689
3944
|
return {
|
|
@@ -3694,15 +3949,14 @@ var ToolTrajectoryEvaluator = class {
|
|
|
3694
3949
|
expectedAspectCount: 0
|
|
3695
3950
|
};
|
|
3696
3951
|
}
|
|
3697
|
-
const actualToolCalls = trace.filter((e) => e.type === "tool_call" && e.name);
|
|
3698
3952
|
const hits = [];
|
|
3699
3953
|
const misses = [];
|
|
3700
3954
|
let actualIndex = 0;
|
|
3701
3955
|
for (let i = 0; i < expected.length; i++) {
|
|
3702
3956
|
const expectedTool = expected[i].tool;
|
|
3703
3957
|
let found = false;
|
|
3704
|
-
while (actualIndex <
|
|
3705
|
-
if (
|
|
3958
|
+
while (actualIndex < toolCalls.length) {
|
|
3959
|
+
if (toolCalls[actualIndex].name === expectedTool) {
|
|
3706
3960
|
hits.push(`Found ${expectedTool} at position ${actualIndex}`);
|
|
3707
3961
|
actualIndex++;
|
|
3708
3962
|
found = true;
|
|
@@ -3723,7 +3977,7 @@ var ToolTrajectoryEvaluator = class {
|
|
|
3723
3977
|
expectedAspectCount: expected.length
|
|
3724
3978
|
};
|
|
3725
3979
|
}
|
|
3726
|
-
evaluateExact(
|
|
3980
|
+
evaluateExact(toolCalls) {
|
|
3727
3981
|
const expected = this.config.expected ?? [];
|
|
3728
3982
|
if (expected.length === 0) {
|
|
3729
3983
|
return {
|
|
@@ -3734,16 +3988,15 @@ var ToolTrajectoryEvaluator = class {
|
|
|
3734
3988
|
expectedAspectCount: 0
|
|
3735
3989
|
};
|
|
3736
3990
|
}
|
|
3737
|
-
const actualToolCalls = trace.filter((e) => e.type === "tool_call" && e.name);
|
|
3738
3991
|
const hits = [];
|
|
3739
3992
|
const misses = [];
|
|
3740
|
-
if (
|
|
3741
|
-
misses.push(`Expected ${expected.length} tool calls, got ${
|
|
3993
|
+
if (toolCalls.length !== expected.length) {
|
|
3994
|
+
misses.push(`Expected ${expected.length} tool calls, got ${toolCalls.length}`);
|
|
3742
3995
|
}
|
|
3743
|
-
const checkLength = Math.min(expected.length,
|
|
3996
|
+
const checkLength = Math.min(expected.length, toolCalls.length);
|
|
3744
3997
|
for (let i = 0; i < checkLength; i++) {
|
|
3745
3998
|
const expectedTool = expected[i].tool;
|
|
3746
|
-
const actualTool =
|
|
3999
|
+
const actualTool = toolCalls[i].name;
|
|
3747
4000
|
if (actualTool === expectedTool) {
|
|
3748
4001
|
hits.push(`Position ${i}: ${expectedTool} \u2713`);
|
|
3749
4002
|
} else {
|
|
@@ -3957,11 +4210,13 @@ var CompositeEvaluator = class {
|
|
|
3957
4210
|
evalCaseId: context.evalCase.id,
|
|
3958
4211
|
attempt: context.attempt
|
|
3959
4212
|
});
|
|
3960
|
-
const data = freeformEvaluationSchema.parse(
|
|
4213
|
+
const data = freeformEvaluationSchema.parse(
|
|
4214
|
+
parseJsonFromText(extractLastAssistantContent(response.outputMessages))
|
|
4215
|
+
);
|
|
3961
4216
|
const score = clampScore(data.score);
|
|
3962
4217
|
const hits = Array.isArray(data.hits) ? data.hits.filter(isNonEmptyString).slice(0, 4) : [];
|
|
3963
4218
|
const misses = Array.isArray(data.misses) ? data.misses.filter(isNonEmptyString).slice(0, 4) : [];
|
|
3964
|
-
const reasoning = data.reasoning
|
|
4219
|
+
const reasoning = data.reasoning;
|
|
3965
4220
|
return {
|
|
3966
4221
|
score,
|
|
3967
4222
|
verdict: scoreToVerdict(score),
|
|
@@ -4384,11 +4639,14 @@ async function runBatchEvaluation(options) {
|
|
|
4384
4639
|
const evalCase = evalCases[i];
|
|
4385
4640
|
const promptInputs = promptInputsList[i];
|
|
4386
4641
|
const providerResponse = batchResponse[i];
|
|
4642
|
+
const outputMessages = providerResponse.outputMessages;
|
|
4643
|
+
const traceSummary = outputMessages ? computeTraceSummary(outputMessages) : void 0;
|
|
4644
|
+
const candidate = extractLastAssistantContent(outputMessages);
|
|
4387
4645
|
let result;
|
|
4388
4646
|
try {
|
|
4389
4647
|
result = await evaluateCandidate({
|
|
4390
4648
|
evalCase,
|
|
4391
|
-
candidate
|
|
4649
|
+
candidate,
|
|
4392
4650
|
target,
|
|
4393
4651
|
provider,
|
|
4394
4652
|
evaluators: evaluatorRegistry,
|
|
@@ -4396,7 +4654,9 @@ async function runBatchEvaluation(options) {
|
|
|
4396
4654
|
nowFn,
|
|
4397
4655
|
attempt: 0,
|
|
4398
4656
|
judgeProvider: await resolveJudgeProvider(target),
|
|
4399
|
-
agentTimeoutMs
|
|
4657
|
+
agentTimeoutMs,
|
|
4658
|
+
outputMessages,
|
|
4659
|
+
traceSummary
|
|
4400
4660
|
});
|
|
4401
4661
|
} catch (error) {
|
|
4402
4662
|
const errorResult = buildErrorResult(
|
|
@@ -4500,21 +4760,13 @@ async function runEvalCase(options) {
|
|
|
4500
4760
|
if (cacheKey && cache && !cachedResponse) {
|
|
4501
4761
|
await cache.set(cacheKey, providerResponse);
|
|
4502
4762
|
}
|
|
4503
|
-
|
|
4504
|
-
|
|
4505
|
-
|
|
4506
|
-
const rawTrace = await readJsonFile(providerResponse.traceRef);
|
|
4507
|
-
if (Array.isArray(rawTrace) && rawTrace.every(isTraceEvent)) {
|
|
4508
|
-
candidateTrace = rawTrace;
|
|
4509
|
-
}
|
|
4510
|
-
} catch {
|
|
4511
|
-
}
|
|
4512
|
-
}
|
|
4513
|
-
const candidateTraceSummary = candidateTrace ? computeTraceSummary(candidateTrace) : void 0;
|
|
4763
|
+
const outputMessages = providerResponse.outputMessages;
|
|
4764
|
+
const traceSummary = outputMessages ? computeTraceSummary(outputMessages) : void 0;
|
|
4765
|
+
const candidate = extractLastAssistantContent(outputMessages);
|
|
4514
4766
|
try {
|
|
4515
4767
|
return await evaluateCandidate({
|
|
4516
4768
|
evalCase,
|
|
4517
|
-
candidate
|
|
4769
|
+
candidate,
|
|
4518
4770
|
target,
|
|
4519
4771
|
provider,
|
|
4520
4772
|
evaluators,
|
|
@@ -4523,9 +4775,8 @@ async function runEvalCase(options) {
|
|
|
4523
4775
|
attempt,
|
|
4524
4776
|
judgeProvider,
|
|
4525
4777
|
agentTimeoutMs,
|
|
4526
|
-
|
|
4527
|
-
|
|
4528
|
-
candidateTraceSummary
|
|
4778
|
+
outputMessages,
|
|
4779
|
+
traceSummary
|
|
4529
4780
|
});
|
|
4530
4781
|
} catch (error) {
|
|
4531
4782
|
return buildErrorResult(evalCase, target.name, nowFn(), error, promptInputs, provider);
|
|
@@ -4543,9 +4794,8 @@ async function evaluateCandidate(options) {
|
|
|
4543
4794
|
attempt,
|
|
4544
4795
|
judgeProvider,
|
|
4545
4796
|
agentTimeoutMs,
|
|
4546
|
-
|
|
4547
|
-
|
|
4548
|
-
candidateTraceSummary
|
|
4797
|
+
outputMessages,
|
|
4798
|
+
traceSummary
|
|
4549
4799
|
} = options;
|
|
4550
4800
|
const gradeTimestamp = nowFn();
|
|
4551
4801
|
const { score, evaluatorResults } = await runEvaluatorsForCase({
|
|
@@ -4559,9 +4809,8 @@ async function evaluateCandidate(options) {
|
|
|
4559
4809
|
now: gradeTimestamp,
|
|
4560
4810
|
judgeProvider,
|
|
4561
4811
|
agentTimeoutMs,
|
|
4562
|
-
|
|
4563
|
-
|
|
4564
|
-
candidateTraceSummary
|
|
4812
|
+
outputMessages,
|
|
4813
|
+
traceSummary
|
|
4565
4814
|
});
|
|
4566
4815
|
const completedAt = nowFn();
|
|
4567
4816
|
let agentProviderRequest;
|
|
@@ -4599,7 +4848,7 @@ async function evaluateCandidate(options) {
|
|
|
4599
4848
|
lm_provider_request: lmProviderRequest,
|
|
4600
4849
|
evaluator_provider_request: evaluatorResults ? void 0 : score.evaluatorRawRequest,
|
|
4601
4850
|
evaluator_results: evaluatorResults,
|
|
4602
|
-
trace_summary:
|
|
4851
|
+
trace_summary: traceSummary
|
|
4603
4852
|
};
|
|
4604
4853
|
}
|
|
4605
4854
|
async function runEvaluatorsForCase(options) {
|
|
@@ -4614,9 +4863,8 @@ async function runEvaluatorsForCase(options) {
|
|
|
4614
4863
|
now,
|
|
4615
4864
|
judgeProvider,
|
|
4616
4865
|
agentTimeoutMs,
|
|
4617
|
-
|
|
4618
|
-
|
|
4619
|
-
candidateTraceSummary
|
|
4866
|
+
outputMessages,
|
|
4867
|
+
traceSummary
|
|
4620
4868
|
} = options;
|
|
4621
4869
|
if (evalCase.evaluators && evalCase.evaluators.length > 0) {
|
|
4622
4870
|
return runEvaluatorList({
|
|
@@ -4631,9 +4879,8 @@ async function runEvaluatorsForCase(options) {
|
|
|
4631
4879
|
now,
|
|
4632
4880
|
judgeProvider,
|
|
4633
4881
|
agentTimeoutMs,
|
|
4634
|
-
|
|
4635
|
-
|
|
4636
|
-
candidateTraceSummary
|
|
4882
|
+
outputMessages,
|
|
4883
|
+
traceSummary
|
|
4637
4884
|
});
|
|
4638
4885
|
}
|
|
4639
4886
|
const evaluatorKind = evalCase.evaluator ?? "llm_judge";
|
|
@@ -4650,9 +4897,8 @@ async function runEvaluatorsForCase(options) {
|
|
|
4650
4897
|
promptInputs,
|
|
4651
4898
|
now,
|
|
4652
4899
|
judgeProvider,
|
|
4653
|
-
|
|
4654
|
-
|
|
4655
|
-
candidateTraceSummary
|
|
4900
|
+
outputMessages,
|
|
4901
|
+
traceSummary
|
|
4656
4902
|
});
|
|
4657
4903
|
return { score };
|
|
4658
4904
|
}
|
|
@@ -4669,9 +4915,8 @@ async function runEvaluatorList(options) {
|
|
|
4669
4915
|
now,
|
|
4670
4916
|
judgeProvider,
|
|
4671
4917
|
agentTimeoutMs,
|
|
4672
|
-
|
|
4673
|
-
|
|
4674
|
-
candidateTraceSummary
|
|
4918
|
+
outputMessages,
|
|
4919
|
+
traceSummary
|
|
4675
4920
|
} = options;
|
|
4676
4921
|
const scored = [];
|
|
4677
4922
|
const evaluatorResults = [];
|
|
@@ -4718,8 +4963,8 @@ async function runEvaluatorList(options) {
|
|
|
4718
4963
|
attempt,
|
|
4719
4964
|
promptInputs,
|
|
4720
4965
|
now,
|
|
4721
|
-
|
|
4722
|
-
|
|
4966
|
+
outputMessages,
|
|
4967
|
+
traceSummary
|
|
4723
4968
|
});
|
|
4724
4969
|
const weight = evaluator.weight ?? 1;
|
|
4725
4970
|
scored.push({ score: score2, name: evaluator.name, type: "code_judge", weight });
|
|
@@ -4805,9 +5050,8 @@ async function runEvaluatorList(options) {
|
|
|
4805
5050
|
attempt,
|
|
4806
5051
|
promptInputs,
|
|
4807
5052
|
now,
|
|
4808
|
-
|
|
4809
|
-
|
|
4810
|
-
candidateTraceSummary
|
|
5053
|
+
outputMessages,
|
|
5054
|
+
traceSummary
|
|
4811
5055
|
});
|
|
4812
5056
|
const weight = evaluator.weight ?? 1;
|
|
4813
5057
|
scored.push({ score: score2, name: evaluator.name, type: evaluator.type, weight });
|
|
@@ -5200,8 +5444,6 @@ export {
|
|
|
5200
5444
|
isJsonValue,
|
|
5201
5445
|
isTestMessage,
|
|
5202
5446
|
isTestMessageRole,
|
|
5203
|
-
isTraceEvent,
|
|
5204
|
-
isTraceEventType,
|
|
5205
5447
|
listTargetNames,
|
|
5206
5448
|
loadEvalCases,
|
|
5207
5449
|
normalizeLineEndings,
|