@agentv/core 1.2.0 → 1.4.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/README.md +77 -77
- package/dist/{chunk-V3JCB3HI.js → chunk-KPHTMTZ3.js} +32 -7
- package/dist/chunk-KPHTMTZ3.js.map +1 -0
- package/dist/evaluation/validation/index.cjs +17 -1
- package/dist/evaluation/validation/index.cjs.map +1 -1
- package/dist/evaluation/validation/index.js +18 -2
- package/dist/evaluation/validation/index.js.map +1 -1
- package/dist/index.cjs +411 -146
- package/dist/index.cjs.map +1 -1
- package/dist/index.d.cts +59 -51
- package/dist/index.d.ts +59 -51
- package/dist/index.js +371 -129
- package/dist/index.js.map +1 -1
- package/package.json +2 -5
- package/dist/chunk-V3JCB3HI.js.map +0 -1
package/dist/index.cjs
CHANGED
|
@@ -54,8 +54,6 @@ __export(index_exports, {
|
|
|
54
54
|
isJsonValue: () => isJsonValue,
|
|
55
55
|
isTestMessage: () => isTestMessage,
|
|
56
56
|
isTestMessageRole: () => isTestMessageRole,
|
|
57
|
-
isTraceEvent: () => isTraceEvent,
|
|
58
|
-
isTraceEventType: () => isTraceEventType,
|
|
59
57
|
listTargetNames: () => listTargetNames,
|
|
60
58
|
loadEvalCases: () => loadEvalCases,
|
|
61
59
|
normalizeLineEndings: () => normalizeLineEndings,
|
|
@@ -135,33 +133,22 @@ function getHitCount(result) {
|
|
|
135
133
|
}
|
|
136
134
|
|
|
137
135
|
// src/evaluation/trace.ts
|
|
138
|
-
function
|
|
139
|
-
return typeof value === "string" && ["model_step", "tool_call", "tool_result", "message", "error"].includes(value);
|
|
140
|
-
}
|
|
141
|
-
function isTraceEvent(value) {
|
|
142
|
-
if (typeof value !== "object" || value === null) {
|
|
143
|
-
return false;
|
|
144
|
-
}
|
|
145
|
-
const candidate = value;
|
|
146
|
-
return isTraceEventType(candidate.type) && typeof candidate.timestamp === "string";
|
|
147
|
-
}
|
|
148
|
-
function computeTraceSummary(trace) {
|
|
136
|
+
function computeTraceSummary(messages) {
|
|
149
137
|
const toolCallCounts = {};
|
|
150
|
-
let
|
|
151
|
-
for (const
|
|
152
|
-
if (
|
|
153
|
-
|
|
154
|
-
|
|
155
|
-
|
|
156
|
-
errorCount++;
|
|
138
|
+
let totalToolCalls = 0;
|
|
139
|
+
for (const message of messages) {
|
|
140
|
+
if (!message.toolCalls) continue;
|
|
141
|
+
for (const toolCall of message.toolCalls) {
|
|
142
|
+
toolCallCounts[toolCall.tool] = (toolCallCounts[toolCall.tool] ?? 0) + 1;
|
|
143
|
+
totalToolCalls++;
|
|
157
144
|
}
|
|
158
145
|
}
|
|
159
146
|
const toolNames = Object.keys(toolCallCounts).sort();
|
|
160
147
|
return {
|
|
161
|
-
eventCount:
|
|
148
|
+
eventCount: totalToolCalls,
|
|
162
149
|
toolNames,
|
|
163
150
|
toolCallsByName: toolCallCounts,
|
|
164
|
-
errorCount
|
|
151
|
+
errorCount: 0
|
|
165
152
|
};
|
|
166
153
|
}
|
|
167
154
|
|
|
@@ -437,7 +424,8 @@ var TEMPLATE_VARIABLES = {
|
|
|
437
424
|
QUESTION: "question",
|
|
438
425
|
EXPECTED_OUTCOME: "expected_outcome",
|
|
439
426
|
REFERENCE_ANSWER: "reference_answer",
|
|
440
|
-
INPUT_MESSAGES: "input_messages"
|
|
427
|
+
INPUT_MESSAGES: "input_messages",
|
|
428
|
+
OUTPUT_MESSAGES: "output_messages"
|
|
441
429
|
};
|
|
442
430
|
var VALID_TEMPLATE_VARIABLES = new Set(Object.values(TEMPLATE_VARIABLES));
|
|
443
431
|
var REQUIRED_TEMPLATE_VARIABLES = /* @__PURE__ */ new Set([
|
|
@@ -835,6 +823,17 @@ async function processMessages(options) {
|
|
|
835
823
|
}
|
|
836
824
|
continue;
|
|
837
825
|
}
|
|
826
|
+
if (isJsonObject(content)) {
|
|
827
|
+
const rendered = JSON.stringify(content, null, 2);
|
|
828
|
+
segments.push({ type: "text", value: rendered });
|
|
829
|
+
if (textParts) {
|
|
830
|
+
textParts.push(rendered);
|
|
831
|
+
}
|
|
832
|
+
continue;
|
|
833
|
+
}
|
|
834
|
+
if (!Array.isArray(content)) {
|
|
835
|
+
continue;
|
|
836
|
+
}
|
|
838
837
|
for (const rawSegment of content) {
|
|
839
838
|
if (!isJsonObject(rawSegment)) {
|
|
840
839
|
continue;
|
|
@@ -1061,6 +1060,11 @@ async function buildPromptInputs(testCase, mode = "lm") {
|
|
|
1061
1060
|
}
|
|
1062
1061
|
}
|
|
1063
1062
|
}
|
|
1063
|
+
} else if (isJsonObject(message.content)) {
|
|
1064
|
+
const rendered = JSON.stringify(message.content, null, 2);
|
|
1065
|
+
if (rendered.trim().length > 0) {
|
|
1066
|
+
messageSegments.push({ type: "text", value: rendered });
|
|
1067
|
+
}
|
|
1064
1068
|
}
|
|
1065
1069
|
segmentsByMessage.push(messageSegments);
|
|
1066
1070
|
}
|
|
@@ -1304,16 +1308,16 @@ async function loadEvalCases(evalFilePath, repoRoot, options) {
|
|
|
1304
1308
|
}) : [];
|
|
1305
1309
|
const codeSnippets = extractCodeBlocks(inputSegments);
|
|
1306
1310
|
let referenceAnswer = "";
|
|
1307
|
-
if (outputSegments.length >
|
|
1308
|
-
|
|
1309
|
-
|
|
1310
|
-
const
|
|
1311
|
-
if (typeof
|
|
1312
|
-
referenceAnswer =
|
|
1313
|
-
} else if (
|
|
1314
|
-
referenceAnswer = JSON.stringify(
|
|
1315
|
-
} else if (
|
|
1316
|
-
referenceAnswer = JSON.stringify(
|
|
1311
|
+
if (outputSegments.length > 0) {
|
|
1312
|
+
const lastMessage = outputSegments[outputSegments.length - 1];
|
|
1313
|
+
const content = lastMessage.content;
|
|
1314
|
+
const toolCalls = lastMessage.tool_calls;
|
|
1315
|
+
if (typeof content === "string") {
|
|
1316
|
+
referenceAnswer = content;
|
|
1317
|
+
} else if (content !== void 0 && content !== null) {
|
|
1318
|
+
referenceAnswer = JSON.stringify(content, null, 2);
|
|
1319
|
+
} else if (toolCalls !== void 0 && toolCalls !== null) {
|
|
1320
|
+
referenceAnswer = JSON.stringify(toolCalls, null, 2);
|
|
1317
1321
|
}
|
|
1318
1322
|
}
|
|
1319
1323
|
const question = inputTextParts.map((part) => part.trim()).filter((part) => part.length > 0).join(" ");
|
|
@@ -1756,11 +1760,11 @@ async function invokeModel(options) {
|
|
|
1756
1760
|
return mapResponse(result);
|
|
1757
1761
|
}
|
|
1758
1762
|
function mapResponse(result) {
|
|
1763
|
+
const content = result.text ?? "";
|
|
1759
1764
|
return {
|
|
1760
|
-
text: result.text ?? "",
|
|
1761
|
-
reasoning: result.reasoningText ?? void 0,
|
|
1762
1765
|
raw: result,
|
|
1763
|
-
usage: toJsonObject(result.totalUsage ?? result.usage)
|
|
1766
|
+
usage: toJsonObject(result.totalUsage ?? result.usage),
|
|
1767
|
+
outputMessages: [{ role: "assistant", content }]
|
|
1764
1768
|
};
|
|
1765
1769
|
}
|
|
1766
1770
|
function toJsonObject(value) {
|
|
@@ -1909,10 +1913,11 @@ var CliProvider = class {
|
|
|
1909
1913
|
id;
|
|
1910
1914
|
kind = "cli";
|
|
1911
1915
|
targetName;
|
|
1912
|
-
supportsBatch =
|
|
1916
|
+
supportsBatch = true;
|
|
1913
1917
|
config;
|
|
1914
1918
|
runCommand;
|
|
1915
1919
|
verbose;
|
|
1920
|
+
keepTempFiles;
|
|
1916
1921
|
healthcheckPromise;
|
|
1917
1922
|
constructor(targetName, config, runner = defaultCommandRunner) {
|
|
1918
1923
|
this.targetName = targetName;
|
|
@@ -1920,6 +1925,7 @@ var CliProvider = class {
|
|
|
1920
1925
|
this.config = config;
|
|
1921
1926
|
this.runCommand = runner;
|
|
1922
1927
|
this.verbose = config.verbose ?? false;
|
|
1928
|
+
this.keepTempFiles = config.keepTempFiles ?? false;
|
|
1923
1929
|
}
|
|
1924
1930
|
async invoke(request) {
|
|
1925
1931
|
if (request.signal?.aborted) {
|
|
@@ -1929,6 +1935,11 @@ var CliProvider = class {
|
|
|
1929
1935
|
const outputFilePath = generateOutputFilePath(request.evalCaseId);
|
|
1930
1936
|
const templateValues = buildTemplateValues(request, this.config, outputFilePath);
|
|
1931
1937
|
const renderedCommand = renderTemplate(this.config.commandTemplate, templateValues);
|
|
1938
|
+
if (this.verbose) {
|
|
1939
|
+
console.log(
|
|
1940
|
+
`[cli-provider:${this.targetName}] cwd=${this.config.cwd ?? ""} command=${renderedCommand}`
|
|
1941
|
+
);
|
|
1942
|
+
}
|
|
1932
1943
|
const result = await this.runCommand(renderedCommand, {
|
|
1933
1944
|
cwd: this.config.cwd,
|
|
1934
1945
|
env: process.env,
|
|
@@ -1952,8 +1963,7 @@ var CliProvider = class {
|
|
|
1952
1963
|
const responseContent = await this.readAndCleanupOutputFile(outputFilePath);
|
|
1953
1964
|
const parsed = this.parseOutputContent(responseContent);
|
|
1954
1965
|
return {
|
|
1955
|
-
|
|
1956
|
-
trace: parsed.trace,
|
|
1966
|
+
outputMessages: parsed.outputMessages,
|
|
1957
1967
|
raw: {
|
|
1958
1968
|
command: renderedCommand,
|
|
1959
1969
|
stderr: result.stderr,
|
|
@@ -1963,30 +1973,225 @@ var CliProvider = class {
|
|
|
1963
1973
|
}
|
|
1964
1974
|
};
|
|
1965
1975
|
}
|
|
1976
|
+
async invokeBatch(requests) {
|
|
1977
|
+
if (requests.length === 0) {
|
|
1978
|
+
return [];
|
|
1979
|
+
}
|
|
1980
|
+
for (const request of requests) {
|
|
1981
|
+
if (request.signal?.aborted) {
|
|
1982
|
+
throw new Error("CLI provider batch request was aborted before execution");
|
|
1983
|
+
}
|
|
1984
|
+
}
|
|
1985
|
+
const controller = new AbortController();
|
|
1986
|
+
for (const request of requests) {
|
|
1987
|
+
request.signal?.addEventListener("abort", () => controller.abort(), { once: true });
|
|
1988
|
+
}
|
|
1989
|
+
await this.ensureHealthy(controller.signal);
|
|
1990
|
+
const outputFilePath = generateOutputFilePath("batch", ".jsonl");
|
|
1991
|
+
const batchInputFiles = [];
|
|
1992
|
+
for (const request of requests) {
|
|
1993
|
+
if (request.inputFiles && request.inputFiles.length > 0) {
|
|
1994
|
+
batchInputFiles.push(...request.inputFiles);
|
|
1995
|
+
}
|
|
1996
|
+
}
|
|
1997
|
+
const templateValues = buildTemplateValues(
|
|
1998
|
+
{
|
|
1999
|
+
question: "",
|
|
2000
|
+
guidelines: "",
|
|
2001
|
+
inputFiles: batchInputFiles,
|
|
2002
|
+
evalCaseId: "batch",
|
|
2003
|
+
attempt: 0
|
|
2004
|
+
},
|
|
2005
|
+
this.config,
|
|
2006
|
+
outputFilePath
|
|
2007
|
+
);
|
|
2008
|
+
const renderedCommand = renderTemplate(this.config.commandTemplate, templateValues);
|
|
2009
|
+
if (this.verbose) {
|
|
2010
|
+
console.log(
|
|
2011
|
+
`[cli-provider:${this.targetName}] (batch size=${requests.length}) cwd=${this.config.cwd ?? ""} command=${renderedCommand}`
|
|
2012
|
+
);
|
|
2013
|
+
}
|
|
2014
|
+
const result = await this.runCommand(renderedCommand, {
|
|
2015
|
+
cwd: this.config.cwd,
|
|
2016
|
+
env: process.env,
|
|
2017
|
+
timeoutMs: this.config.timeoutMs,
|
|
2018
|
+
signal: controller.signal
|
|
2019
|
+
});
|
|
2020
|
+
if (result.failed || (result.exitCode ?? 0) !== 0) {
|
|
2021
|
+
if (controller.signal.aborted) {
|
|
2022
|
+
throw new Error("CLI provider request was aborted");
|
|
2023
|
+
}
|
|
2024
|
+
if (result.timedOut) {
|
|
2025
|
+
throw new Error(
|
|
2026
|
+
`CLI provider timed out${formatTimeoutSuffix(this.config.timeoutMs ?? void 0)}`
|
|
2027
|
+
);
|
|
2028
|
+
}
|
|
2029
|
+
const codeText = result.exitCode !== null ? result.exitCode : "unknown";
|
|
2030
|
+
const detail = result.stderr.trim() || result.stdout.trim();
|
|
2031
|
+
const message = detail ? `${detail} (exit code ${codeText})` : `CLI exited with code ${codeText}`;
|
|
2032
|
+
throw new Error(message);
|
|
2033
|
+
}
|
|
2034
|
+
const responseContent = await this.readAndCleanupOutputFile(outputFilePath);
|
|
2035
|
+
const recordsById = this.parseJsonlBatchOutput(responseContent);
|
|
2036
|
+
const requestedIds = requests.map((request) => request.evalCaseId).filter((id) => typeof id === "string" && id.trim().length > 0);
|
|
2037
|
+
const missingIds = requestedIds.filter((id) => !recordsById.has(id));
|
|
2038
|
+
if (missingIds.length > 0) {
|
|
2039
|
+
throw new Error(`CLI batch output missing ids: ${missingIds.join(", ")}`);
|
|
2040
|
+
}
|
|
2041
|
+
const responses = requests.map((request) => {
|
|
2042
|
+
const evalCaseId = request.evalCaseId;
|
|
2043
|
+
if (!evalCaseId) {
|
|
2044
|
+
return {
|
|
2045
|
+
outputMessages: [],
|
|
2046
|
+
raw: {
|
|
2047
|
+
command: renderedCommand,
|
|
2048
|
+
stderr: result.stderr,
|
|
2049
|
+
exitCode: result.exitCode ?? 0,
|
|
2050
|
+
cwd: this.config.cwd,
|
|
2051
|
+
outputFile: outputFilePath
|
|
2052
|
+
}
|
|
2053
|
+
};
|
|
2054
|
+
}
|
|
2055
|
+
const parsed = recordsById.get(evalCaseId);
|
|
2056
|
+
if (!parsed) {
|
|
2057
|
+
return {
|
|
2058
|
+
outputMessages: [],
|
|
2059
|
+
raw: {
|
|
2060
|
+
command: renderedCommand,
|
|
2061
|
+
stderr: result.stderr,
|
|
2062
|
+
exitCode: result.exitCode ?? 0,
|
|
2063
|
+
cwd: this.config.cwd,
|
|
2064
|
+
outputFile: outputFilePath
|
|
2065
|
+
}
|
|
2066
|
+
};
|
|
2067
|
+
}
|
|
2068
|
+
return {
|
|
2069
|
+
outputMessages: parsed.outputMessages,
|
|
2070
|
+
raw: {
|
|
2071
|
+
command: renderedCommand,
|
|
2072
|
+
stderr: result.stderr,
|
|
2073
|
+
exitCode: result.exitCode ?? 0,
|
|
2074
|
+
cwd: this.config.cwd,
|
|
2075
|
+
outputFile: outputFilePath,
|
|
2076
|
+
recordId: evalCaseId
|
|
2077
|
+
}
|
|
2078
|
+
};
|
|
2079
|
+
});
|
|
2080
|
+
return responses;
|
|
2081
|
+
}
|
|
1966
2082
|
/**
|
|
1967
2083
|
* Parse output content from CLI.
|
|
1968
|
-
* If the content is valid JSON with
|
|
1969
|
-
*
|
|
2084
|
+
* If the content is valid JSON with 'output_messages' or 'text' field, extract them.
|
|
2085
|
+
* If only 'text' is provided, wrap it in outputMessages.
|
|
2086
|
+
* Otherwise, treat the entire content as plain text wrapped in outputMessages.
|
|
1970
2087
|
*/
|
|
1971
2088
|
parseOutputContent(content) {
|
|
1972
2089
|
try {
|
|
1973
2090
|
const parsed = JSON.parse(content);
|
|
1974
|
-
if (typeof parsed === "object" && parsed !== null
|
|
2091
|
+
if (typeof parsed === "object" && parsed !== null) {
|
|
1975
2092
|
const obj = parsed;
|
|
1976
|
-
const
|
|
1977
|
-
|
|
1978
|
-
|
|
2093
|
+
const outputMessages = this.parseOutputMessages(obj.output_messages);
|
|
2094
|
+
if (outputMessages && outputMessages.length > 0) {
|
|
2095
|
+
return { outputMessages };
|
|
2096
|
+
}
|
|
2097
|
+
if ("text" in obj) {
|
|
2098
|
+
const text = typeof obj.text === "string" ? obj.text : String(obj.text);
|
|
2099
|
+
return { outputMessages: [{ role: "assistant", content: text }] };
|
|
2100
|
+
}
|
|
1979
2101
|
}
|
|
1980
2102
|
} catch {
|
|
1981
2103
|
}
|
|
1982
|
-
return {
|
|
2104
|
+
return { outputMessages: [{ role: "assistant", content }] };
|
|
2105
|
+
}
|
|
2106
|
+
/**
|
|
2107
|
+
* Parse output_messages from JSONL (snake_case) and convert to OutputMessage[] (camelCase).
|
|
2108
|
+
*/
|
|
2109
|
+
parseOutputMessages(outputMessages) {
|
|
2110
|
+
if (!Array.isArray(outputMessages)) {
|
|
2111
|
+
return void 0;
|
|
2112
|
+
}
|
|
2113
|
+
const messages = [];
|
|
2114
|
+
for (const msg of outputMessages) {
|
|
2115
|
+
if (typeof msg !== "object" || msg === null) {
|
|
2116
|
+
continue;
|
|
2117
|
+
}
|
|
2118
|
+
const rawMsg = msg;
|
|
2119
|
+
if (typeof rawMsg.role !== "string") {
|
|
2120
|
+
continue;
|
|
2121
|
+
}
|
|
2122
|
+
const message = {
|
|
2123
|
+
role: rawMsg.role,
|
|
2124
|
+
name: typeof rawMsg.name === "string" ? rawMsg.name : void 0,
|
|
2125
|
+
content: rawMsg.content,
|
|
2126
|
+
toolCalls: this.parseToolCalls(rawMsg.tool_calls),
|
|
2127
|
+
timestamp: typeof rawMsg.timestamp === "string" ? rawMsg.timestamp : void 0,
|
|
2128
|
+
metadata: typeof rawMsg.metadata === "object" && rawMsg.metadata !== null ? rawMsg.metadata : void 0
|
|
2129
|
+
};
|
|
2130
|
+
messages.push(message);
|
|
2131
|
+
}
|
|
2132
|
+
return messages.length > 0 ? messages : void 0;
|
|
1983
2133
|
}
|
|
1984
|
-
|
|
1985
|
-
|
|
2134
|
+
/**
|
|
2135
|
+
* Parse tool_calls from JSONL (snake_case) and convert to ToolCall[] format.
|
|
2136
|
+
*/
|
|
2137
|
+
parseToolCalls(toolCalls) {
|
|
2138
|
+
if (!Array.isArray(toolCalls)) {
|
|
1986
2139
|
return void 0;
|
|
1987
2140
|
}
|
|
1988
|
-
const
|
|
1989
|
-
|
|
2141
|
+
const calls = [];
|
|
2142
|
+
for (const call of toolCalls) {
|
|
2143
|
+
if (typeof call !== "object" || call === null) {
|
|
2144
|
+
continue;
|
|
2145
|
+
}
|
|
2146
|
+
const rawCall = call;
|
|
2147
|
+
if (typeof rawCall.tool !== "string") {
|
|
2148
|
+
continue;
|
|
2149
|
+
}
|
|
2150
|
+
calls.push({
|
|
2151
|
+
tool: rawCall.tool,
|
|
2152
|
+
input: rawCall.input,
|
|
2153
|
+
output: rawCall.output,
|
|
2154
|
+
id: typeof rawCall.id === "string" ? rawCall.id : void 0,
|
|
2155
|
+
timestamp: typeof rawCall.timestamp === "string" ? rawCall.timestamp : void 0
|
|
2156
|
+
});
|
|
2157
|
+
}
|
|
2158
|
+
return calls.length > 0 ? calls : void 0;
|
|
2159
|
+
}
|
|
2160
|
+
parseJsonlBatchOutput(content) {
|
|
2161
|
+
const records = /* @__PURE__ */ new Map();
|
|
2162
|
+
const lines = content.split(/\r?\n/).map((line) => line.trim()).filter((line) => line.length > 0);
|
|
2163
|
+
for (const line of lines) {
|
|
2164
|
+
let parsed;
|
|
2165
|
+
try {
|
|
2166
|
+
parsed = JSON.parse(line);
|
|
2167
|
+
} catch (error) {
|
|
2168
|
+
const reason = error instanceof Error ? error.message : String(error);
|
|
2169
|
+
throw new Error(`CLI batch output contains invalid JSONL line: ${reason}`);
|
|
2170
|
+
}
|
|
2171
|
+
if (typeof parsed !== "object" || parsed === null) {
|
|
2172
|
+
throw new Error("CLI batch output JSONL line must be an object");
|
|
2173
|
+
}
|
|
2174
|
+
const obj = parsed;
|
|
2175
|
+
const id = typeof obj.id === "string" ? obj.id : void 0;
|
|
2176
|
+
if (!id || id.trim().length === 0) {
|
|
2177
|
+
throw new Error("CLI batch output JSONL line missing required string field: id");
|
|
2178
|
+
}
|
|
2179
|
+
if (records.has(id)) {
|
|
2180
|
+
throw new Error(`CLI batch output contains duplicate id: ${id}`);
|
|
2181
|
+
}
|
|
2182
|
+
const parsedOutputMessages = this.parseOutputMessages(obj.output_messages);
|
|
2183
|
+
let outputMessages;
|
|
2184
|
+
if (parsedOutputMessages && parsedOutputMessages.length > 0) {
|
|
2185
|
+
outputMessages = parsedOutputMessages;
|
|
2186
|
+
} else {
|
|
2187
|
+
const text = typeof obj.text === "string" ? obj.text : obj.text === void 0 ? "" : JSON.stringify(obj.text);
|
|
2188
|
+
outputMessages = text ? [{ role: "assistant", content: text }] : [];
|
|
2189
|
+
}
|
|
2190
|
+
records.set(id, {
|
|
2191
|
+
outputMessages
|
|
2192
|
+
});
|
|
2193
|
+
}
|
|
2194
|
+
return records;
|
|
1990
2195
|
}
|
|
1991
2196
|
async readAndCleanupOutputFile(filePath) {
|
|
1992
2197
|
try {
|
|
@@ -1996,8 +2201,10 @@ var CliProvider = class {
|
|
|
1996
2201
|
const errorMsg = error instanceof Error ? error.message : String(error);
|
|
1997
2202
|
throw new Error(`Failed to read output file '${filePath}': ${errorMsg}`);
|
|
1998
2203
|
} finally {
|
|
1999
|
-
|
|
2000
|
-
|
|
2204
|
+
if (!this.keepTempFiles) {
|
|
2205
|
+
await import_promises8.default.unlink(filePath).catch(() => {
|
|
2206
|
+
});
|
|
2207
|
+
}
|
|
2001
2208
|
}
|
|
2002
2209
|
}
|
|
2003
2210
|
async ensureHealthy(signal) {
|
|
@@ -2049,7 +2256,7 @@ var CliProvider = class {
|
|
|
2049
2256
|
);
|
|
2050
2257
|
if (this.verbose) {
|
|
2051
2258
|
console.log(
|
|
2052
|
-
`[cli-provider:${this.targetName}] (healthcheck)
|
|
2259
|
+
`[cli-provider:${this.targetName}] (healthcheck) cwd=${healthcheck.cwd ?? this.config.cwd ?? ""} command=${renderedCommand}`
|
|
2053
2260
|
);
|
|
2054
2261
|
}
|
|
2055
2262
|
const result = await this.runCommand(renderedCommand, {
|
|
@@ -2117,11 +2324,11 @@ function shellEscape(value) {
|
|
|
2117
2324
|
}
|
|
2118
2325
|
return `'${value.replace(/'/g, `'"'"'`)}'`;
|
|
2119
2326
|
}
|
|
2120
|
-
function generateOutputFilePath(evalCaseId) {
|
|
2327
|
+
function generateOutputFilePath(evalCaseId, extension = ".json") {
|
|
2121
2328
|
const safeEvalId = evalCaseId || "unknown";
|
|
2122
2329
|
const timestamp = Date.now();
|
|
2123
2330
|
const random = Math.random().toString(36).substring(2, 9);
|
|
2124
|
-
return import_node_path8.default.join(import_node_os.default.tmpdir(), `agentv-${safeEvalId}-${timestamp}-${random}
|
|
2331
|
+
return import_node_path8.default.join(import_node_os.default.tmpdir(), `agentv-${safeEvalId}-${timestamp}-${random}${extension}`);
|
|
2125
2332
|
}
|
|
2126
2333
|
function formatTimeoutSuffix(timeoutMs) {
|
|
2127
2334
|
if (!timeoutMs || timeoutMs <= 0) {
|
|
@@ -2340,7 +2547,6 @@ var CodexProvider = class {
|
|
|
2340
2547
|
const parsed = parseCodexJson(result.stdout);
|
|
2341
2548
|
const assistantText = extractAssistantText(parsed);
|
|
2342
2549
|
return {
|
|
2343
|
-
text: assistantText,
|
|
2344
2550
|
raw: {
|
|
2345
2551
|
response: parsed,
|
|
2346
2552
|
stdout: result.stdout,
|
|
@@ -2352,7 +2558,8 @@ var CodexProvider = class {
|
|
|
2352
2558
|
workspace: workspaceRoot,
|
|
2353
2559
|
inputFiles,
|
|
2354
2560
|
logFile: logger?.filePath
|
|
2355
|
-
}
|
|
2561
|
+
},
|
|
2562
|
+
outputMessages: [{ role: "assistant", content: assistantText }]
|
|
2356
2563
|
};
|
|
2357
2564
|
} finally {
|
|
2358
2565
|
await logger?.close();
|
|
@@ -2974,7 +3181,6 @@ var MockProvider = class {
|
|
|
2974
3181
|
delayMs;
|
|
2975
3182
|
delayMinMs;
|
|
2976
3183
|
delayMaxMs;
|
|
2977
|
-
trace;
|
|
2978
3184
|
constructor(targetName, config) {
|
|
2979
3185
|
this.id = `mock:${targetName}`;
|
|
2980
3186
|
this.targetName = targetName;
|
|
@@ -2982,7 +3188,6 @@ var MockProvider = class {
|
|
|
2982
3188
|
this.delayMs = config.delayMs ?? 0;
|
|
2983
3189
|
this.delayMinMs = config.delayMinMs ?? 0;
|
|
2984
3190
|
this.delayMaxMs = config.delayMaxMs ?? 0;
|
|
2985
|
-
this.trace = config.trace;
|
|
2986
3191
|
}
|
|
2987
3192
|
async invoke(request) {
|
|
2988
3193
|
const delay = this.calculateDelay();
|
|
@@ -2990,12 +3195,11 @@ var MockProvider = class {
|
|
|
2990
3195
|
await new Promise((resolve) => setTimeout(resolve, delay));
|
|
2991
3196
|
}
|
|
2992
3197
|
return {
|
|
2993
|
-
|
|
3198
|
+
outputMessages: [{ role: "assistant", content: this.cannedResponse }],
|
|
2994
3199
|
raw: {
|
|
2995
3200
|
question: request.question,
|
|
2996
3201
|
guidelines: request.guidelines
|
|
2997
|
-
}
|
|
2998
|
-
trace: this.trace
|
|
3202
|
+
}
|
|
2999
3203
|
};
|
|
3000
3204
|
}
|
|
3001
3205
|
calculateDelay() {
|
|
@@ -3263,8 +3467,7 @@ function normalizeCodexLogFormat(value) {
|
|
|
3263
3467
|
}
|
|
3264
3468
|
function resolveMockConfig(target) {
|
|
3265
3469
|
const response = typeof target.response === "string" ? target.response : void 0;
|
|
3266
|
-
|
|
3267
|
-
return { response, trace };
|
|
3470
|
+
return { response };
|
|
3268
3471
|
}
|
|
3269
3472
|
function resolveVSCodeConfig(target, env, insiders) {
|
|
3270
3473
|
const workspaceTemplateEnvVar = resolveOptionalLiteralString(
|
|
@@ -3301,10 +3504,17 @@ function resolveCliConfig(target, env, evalFilePath) {
|
|
|
3301
3504
|
const filesFormat = resolveOptionalLiteralString(
|
|
3302
3505
|
target.files_format ?? target.filesFormat ?? target.attachments_format ?? target.attachmentsFormat
|
|
3303
3506
|
);
|
|
3507
|
+
const verbose = resolveOptionalBoolean(target.verbose ?? target.cli_verbose ?? target.cliVerbose);
|
|
3508
|
+
const keepTempFiles = resolveOptionalBoolean(
|
|
3509
|
+
target.keep_temp_files ?? target.keepTempFiles ?? target.keep_output_files ?? target.keepOutputFiles
|
|
3510
|
+
);
|
|
3304
3511
|
let cwd = resolveOptionalString(target.cwd, env, `${target.name} working directory`, {
|
|
3305
3512
|
allowLiteral: true,
|
|
3306
3513
|
optionalEnv: true
|
|
3307
3514
|
});
|
|
3515
|
+
if (cwd && evalFilePath && !import_node_path11.default.isAbsolute(cwd)) {
|
|
3516
|
+
cwd = import_node_path11.default.resolve(import_node_path11.default.dirname(import_node_path11.default.resolve(evalFilePath)), cwd);
|
|
3517
|
+
}
|
|
3308
3518
|
if (!cwd && evalFilePath) {
|
|
3309
3519
|
cwd = import_node_path11.default.dirname(import_node_path11.default.resolve(evalFilePath));
|
|
3310
3520
|
}
|
|
@@ -3312,7 +3522,7 @@ function resolveCliConfig(target, env, evalFilePath) {
|
|
|
3312
3522
|
target.timeout_seconds ?? target.timeoutSeconds,
|
|
3313
3523
|
`${target.name} timeout`
|
|
3314
3524
|
);
|
|
3315
|
-
const healthcheck = resolveCliHealthcheck(target.healthcheck, env, target.name);
|
|
3525
|
+
const healthcheck = resolveCliHealthcheck(target.healthcheck, env, target.name, evalFilePath);
|
|
3316
3526
|
const commandTemplate = resolveString(
|
|
3317
3527
|
commandTemplateSource,
|
|
3318
3528
|
env,
|
|
@@ -3325,7 +3535,9 @@ function resolveCliConfig(target, env, evalFilePath) {
|
|
|
3325
3535
|
filesFormat,
|
|
3326
3536
|
cwd,
|
|
3327
3537
|
timeoutMs,
|
|
3328
|
-
healthcheck
|
|
3538
|
+
healthcheck,
|
|
3539
|
+
verbose,
|
|
3540
|
+
keepTempFiles
|
|
3329
3541
|
};
|
|
3330
3542
|
}
|
|
3331
3543
|
function resolveTimeoutMs(source, description) {
|
|
@@ -3338,7 +3550,7 @@ function resolveTimeoutMs(source, description) {
|
|
|
3338
3550
|
}
|
|
3339
3551
|
return Math.floor(seconds * 1e3);
|
|
3340
3552
|
}
|
|
3341
|
-
function resolveCliHealthcheck(source, env, targetName) {
|
|
3553
|
+
function resolveCliHealthcheck(source, env, targetName, evalFilePath) {
|
|
3342
3554
|
if (source === void 0 || source === null) {
|
|
3343
3555
|
return void 0;
|
|
3344
3556
|
}
|
|
@@ -3371,11 +3583,12 @@ function resolveCliHealthcheck(source, env, targetName) {
|
|
|
3371
3583
|
allowLiteral: true,
|
|
3372
3584
|
optionalEnv: true
|
|
3373
3585
|
});
|
|
3586
|
+
const resolvedCwd = cwd && evalFilePath && !import_node_path11.default.isAbsolute(cwd) ? import_node_path11.default.resolve(import_node_path11.default.dirname(import_node_path11.default.resolve(evalFilePath)), cwd) : cwd;
|
|
3374
3587
|
return {
|
|
3375
3588
|
type: "command",
|
|
3376
3589
|
commandTemplate,
|
|
3377
3590
|
timeoutMs,
|
|
3378
|
-
cwd
|
|
3591
|
+
cwd: resolvedCwd
|
|
3379
3592
|
};
|
|
3380
3593
|
}
|
|
3381
3594
|
throw new Error(`${targetName} healthcheck type must be 'http' or 'command'`);
|
|
@@ -3619,7 +3832,7 @@ var VSCodeProvider = class {
|
|
|
3619
3832
|
}
|
|
3620
3833
|
if (this.config.dryRun) {
|
|
3621
3834
|
return {
|
|
3622
|
-
|
|
3835
|
+
outputMessages: [],
|
|
3623
3836
|
raw: {
|
|
3624
3837
|
session,
|
|
3625
3838
|
inputFiles
|
|
@@ -3628,7 +3841,7 @@ var VSCodeProvider = class {
|
|
|
3628
3841
|
}
|
|
3629
3842
|
const responseText = await readTextFile(session.responseFile);
|
|
3630
3843
|
return {
|
|
3631
|
-
|
|
3844
|
+
outputMessages: [{ role: "assistant", content: responseText }],
|
|
3632
3845
|
raw: {
|
|
3633
3846
|
session,
|
|
3634
3847
|
inputFiles
|
|
@@ -3666,7 +3879,7 @@ var VSCodeProvider = class {
|
|
|
3666
3879
|
}
|
|
3667
3880
|
if (this.config.dryRun) {
|
|
3668
3881
|
return normalizedRequests.map(({ inputFiles }) => ({
|
|
3669
|
-
|
|
3882
|
+
outputMessages: [],
|
|
3670
3883
|
raw: {
|
|
3671
3884
|
session,
|
|
3672
3885
|
inputFiles,
|
|
@@ -3683,7 +3896,7 @@ var VSCodeProvider = class {
|
|
|
3683
3896
|
for (const [index, responseFile] of session.responseFiles.entries()) {
|
|
3684
3897
|
const responseText = await readTextFile(responseFile);
|
|
3685
3898
|
responses.push({
|
|
3686
|
-
|
|
3899
|
+
outputMessages: [{ role: "assistant", content: responseText }],
|
|
3687
3900
|
raw: {
|
|
3688
3901
|
session,
|
|
3689
3902
|
inputFiles: normalizedRequests[index]?.inputFiles,
|
|
@@ -3923,6 +4136,33 @@ function resolveAndCreateProvider(definition, env = process.env) {
|
|
|
3923
4136
|
// src/evaluation/evaluators.ts
|
|
3924
4137
|
var import_ai2 = require("ai");
|
|
3925
4138
|
var import_zod2 = require("zod");
|
|
4139
|
+
|
|
4140
|
+
// src/evaluation/providers/types.ts
|
|
4141
|
+
var AGENT_PROVIDER_KINDS = [
|
|
4142
|
+
"codex",
|
|
4143
|
+
"vscode",
|
|
4144
|
+
"vscode-insiders"
|
|
4145
|
+
];
|
|
4146
|
+
function extractLastAssistantContent(messages) {
|
|
4147
|
+
if (!messages || messages.length === 0) {
|
|
4148
|
+
return "";
|
|
4149
|
+
}
|
|
4150
|
+
for (let i = messages.length - 1; i >= 0; i--) {
|
|
4151
|
+
const msg = messages[i];
|
|
4152
|
+
if (msg.role === "assistant" && msg.content !== void 0) {
|
|
4153
|
+
if (typeof msg.content === "string") {
|
|
4154
|
+
return msg.content;
|
|
4155
|
+
}
|
|
4156
|
+
return JSON.stringify(msg.content);
|
|
4157
|
+
}
|
|
4158
|
+
}
|
|
4159
|
+
return "";
|
|
4160
|
+
}
|
|
4161
|
+
function isAgentProvider(provider) {
|
|
4162
|
+
return provider ? AGENT_PROVIDER_KINDS.includes(provider.kind) : false;
|
|
4163
|
+
}
|
|
4164
|
+
|
|
4165
|
+
// src/evaluation/evaluators.ts
|
|
3926
4166
|
var DEFAULT_EVALUATOR_TEMPLATE = `You are an expert evaluator. Your goal is to grade the candidate_answer based on how well it achieves the expected_outcome for the original task.
|
|
3927
4167
|
|
|
3928
4168
|
Use the reference_answer as a gold standard for a high-quality response (if provided). The reference_answer may be a simple text response, or it may contain a sequence of expected agent messages including tool calls. When it contains multiple messages, the last message represents the final expected answer. The candidate_answer does not need to match it verbatim, but should capture the key points and follow the same spirit.
|
|
@@ -3987,6 +4227,7 @@ var LlmJudgeEvaluator = class {
|
|
|
3987
4227
|
null,
|
|
3988
4228
|
2
|
|
3989
4229
|
),
|
|
4230
|
+
[TEMPLATE_VARIABLES.OUTPUT_MESSAGES]: JSON.stringify(context.outputMessages ?? [], null, 2),
|
|
3990
4231
|
[TEMPLATE_VARIABLES.CANDIDATE_ANSWER]: context.candidate.trim(),
|
|
3991
4232
|
[TEMPLATE_VARIABLES.REFERENCE_ANSWER]: (context.evalCase.reference_answer ?? "").trim(),
|
|
3992
4233
|
[TEMPLATE_VARIABLES.EXPECTED_OUTCOME]: context.evalCase.expected_outcome.trim(),
|
|
@@ -4011,7 +4252,7 @@ var LlmJudgeEvaluator = class {
|
|
|
4011
4252
|
const score = clampScore(data.score);
|
|
4012
4253
|
const hits = Array.isArray(data.hits) ? data.hits.filter(isNonEmptyString).slice(0, 4) : [];
|
|
4013
4254
|
const misses = Array.isArray(data.misses) ? data.misses.filter(isNonEmptyString).slice(0, 4) : [];
|
|
4014
|
-
const reasoning = data.reasoning
|
|
4255
|
+
const reasoning = data.reasoning;
|
|
4015
4256
|
const expectedAspectCount = Math.max(hits.length + misses.length, 1);
|
|
4016
4257
|
return {
|
|
4017
4258
|
score,
|
|
@@ -4113,7 +4354,9 @@ var LlmJudgeEvaluator = class {
|
|
|
4113
4354
|
maxOutputTokens: this.maxOutputTokens,
|
|
4114
4355
|
temperature: this.temperature
|
|
4115
4356
|
});
|
|
4116
|
-
const data = schema.parse(
|
|
4357
|
+
const data = schema.parse(
|
|
4358
|
+
parseJsonFromText(extractLastAssistantContent(response.outputMessages))
|
|
4359
|
+
);
|
|
4117
4360
|
return { data, providerResponse: response };
|
|
4118
4361
|
} catch (e) {
|
|
4119
4362
|
lastError = e instanceof Error ? e : new Error(String(e));
|
|
@@ -4196,15 +4439,16 @@ var CodeEvaluator = class {
|
|
|
4196
4439
|
{
|
|
4197
4440
|
question: context.evalCase.question,
|
|
4198
4441
|
expected_outcome: context.evalCase.expected_outcome,
|
|
4442
|
+
expected_messages: context.evalCase.expected_messages,
|
|
4199
4443
|
reference_answer: context.evalCase.reference_answer,
|
|
4200
4444
|
candidate_answer: context.candidate,
|
|
4445
|
+
output_messages: context.outputMessages ?? null,
|
|
4201
4446
|
guideline_files: context.evalCase.guideline_paths,
|
|
4202
4447
|
input_files: context.evalCase.file_paths.filter(
|
|
4203
4448
|
(path15) => !context.evalCase.guideline_paths.includes(path15)
|
|
4204
4449
|
),
|
|
4205
4450
|
input_messages: context.evalCase.input_messages,
|
|
4206
|
-
|
|
4207
|
-
candidate_trace_summary: context.candidateTraceSummary ?? null
|
|
4451
|
+
candidate_trace_summary: context.traceSummary ?? null
|
|
4208
4452
|
},
|
|
4209
4453
|
null,
|
|
4210
4454
|
2
|
|
@@ -4331,8 +4575,19 @@ var ToolTrajectoryEvaluator = class {
|
|
|
4331
4575
|
this.config = options.config;
|
|
4332
4576
|
}
|
|
4333
4577
|
evaluate(context) {
|
|
4334
|
-
const {
|
|
4335
|
-
|
|
4578
|
+
const { outputMessages, traceSummary } = context;
|
|
4579
|
+
const toolCalls = this.extractToolCallsFromMessages(outputMessages);
|
|
4580
|
+
if (toolCalls.length === 0 && !traceSummary) {
|
|
4581
|
+
return {
|
|
4582
|
+
score: 0,
|
|
4583
|
+
verdict: "fail",
|
|
4584
|
+
hits: [],
|
|
4585
|
+
misses: ["No trace available for evaluation"],
|
|
4586
|
+
expectedAspectCount: 1
|
|
4587
|
+
};
|
|
4588
|
+
}
|
|
4589
|
+
const summary = toolCalls.length > 0 ? this.buildSummary(toolCalls) : traceSummary;
|
|
4590
|
+
if (!summary) {
|
|
4336
4591
|
return {
|
|
4337
4592
|
score: 0,
|
|
4338
4593
|
verdict: "fail",
|
|
@@ -4343,11 +4598,11 @@ var ToolTrajectoryEvaluator = class {
|
|
|
4343
4598
|
}
|
|
4344
4599
|
switch (this.config.mode) {
|
|
4345
4600
|
case "any_order":
|
|
4346
|
-
return this.evaluateAnyOrder(
|
|
4601
|
+
return this.evaluateAnyOrder(summary);
|
|
4347
4602
|
case "in_order":
|
|
4348
|
-
return this.evaluateInOrder(
|
|
4603
|
+
return this.evaluateInOrder(toolCalls);
|
|
4349
4604
|
case "exact":
|
|
4350
|
-
return this.evaluateExact(
|
|
4605
|
+
return this.evaluateExact(toolCalls);
|
|
4351
4606
|
default:
|
|
4352
4607
|
return {
|
|
4353
4608
|
score: 0,
|
|
@@ -4358,6 +4613,39 @@ var ToolTrajectoryEvaluator = class {
|
|
|
4358
4613
|
};
|
|
4359
4614
|
}
|
|
4360
4615
|
}
|
|
4616
|
+
/**
|
|
4617
|
+
* Extract tool calls from output messages.
|
|
4618
|
+
*/
|
|
4619
|
+
extractToolCallsFromMessages(messages) {
|
|
4620
|
+
if (!messages) {
|
|
4621
|
+
return [];
|
|
4622
|
+
}
|
|
4623
|
+
const toolCalls = [];
|
|
4624
|
+
for (const message of messages) {
|
|
4625
|
+
if (message.toolCalls) {
|
|
4626
|
+
for (const call of message.toolCalls) {
|
|
4627
|
+
toolCalls.push({ name: call.tool });
|
|
4628
|
+
}
|
|
4629
|
+
}
|
|
4630
|
+
}
|
|
4631
|
+
return toolCalls;
|
|
4632
|
+
}
|
|
4633
|
+
/**
|
|
4634
|
+
* Build a summary from extracted tool calls.
|
|
4635
|
+
*/
|
|
4636
|
+
buildSummary(toolCalls) {
|
|
4637
|
+
const toolCallsByName = {};
|
|
4638
|
+
for (const call of toolCalls) {
|
|
4639
|
+
toolCallsByName[call.name] = (toolCallsByName[call.name] ?? 0) + 1;
|
|
4640
|
+
}
|
|
4641
|
+
const toolNames = Object.keys(toolCallsByName).sort();
|
|
4642
|
+
return {
|
|
4643
|
+
eventCount: toolCalls.length,
|
|
4644
|
+
toolNames,
|
|
4645
|
+
toolCallsByName,
|
|
4646
|
+
errorCount: 0
|
|
4647
|
+
};
|
|
4648
|
+
}
|
|
4361
4649
|
evaluateAnyOrder(summary) {
|
|
4362
4650
|
const minimums = this.config.minimums ?? {};
|
|
4363
4651
|
const toolNames = Object.keys(minimums);
|
|
@@ -4390,7 +4678,7 @@ var ToolTrajectoryEvaluator = class {
|
|
|
4390
4678
|
expectedAspectCount: toolNames.length
|
|
4391
4679
|
};
|
|
4392
4680
|
}
|
|
4393
|
-
evaluateInOrder(
|
|
4681
|
+
evaluateInOrder(toolCalls) {
|
|
4394
4682
|
const expected = this.config.expected ?? [];
|
|
4395
4683
|
if (expected.length === 0) {
|
|
4396
4684
|
return {
|
|
@@ -4401,15 +4689,14 @@ var ToolTrajectoryEvaluator = class {
|
|
|
4401
4689
|
expectedAspectCount: 0
|
|
4402
4690
|
};
|
|
4403
4691
|
}
|
|
4404
|
-
const actualToolCalls = trace.filter((e) => e.type === "tool_call" && e.name);
|
|
4405
4692
|
const hits = [];
|
|
4406
4693
|
const misses = [];
|
|
4407
4694
|
let actualIndex = 0;
|
|
4408
4695
|
for (let i = 0; i < expected.length; i++) {
|
|
4409
4696
|
const expectedTool = expected[i].tool;
|
|
4410
4697
|
let found = false;
|
|
4411
|
-
while (actualIndex <
|
|
4412
|
-
if (
|
|
4698
|
+
while (actualIndex < toolCalls.length) {
|
|
4699
|
+
if (toolCalls[actualIndex].name === expectedTool) {
|
|
4413
4700
|
hits.push(`Found ${expectedTool} at position ${actualIndex}`);
|
|
4414
4701
|
actualIndex++;
|
|
4415
4702
|
found = true;
|
|
@@ -4430,7 +4717,7 @@ var ToolTrajectoryEvaluator = class {
|
|
|
4430
4717
|
expectedAspectCount: expected.length
|
|
4431
4718
|
};
|
|
4432
4719
|
}
|
|
4433
|
-
evaluateExact(
|
|
4720
|
+
evaluateExact(toolCalls) {
|
|
4434
4721
|
const expected = this.config.expected ?? [];
|
|
4435
4722
|
if (expected.length === 0) {
|
|
4436
4723
|
return {
|
|
@@ -4441,16 +4728,15 @@ var ToolTrajectoryEvaluator = class {
|
|
|
4441
4728
|
expectedAspectCount: 0
|
|
4442
4729
|
};
|
|
4443
4730
|
}
|
|
4444
|
-
const actualToolCalls = trace.filter((e) => e.type === "tool_call" && e.name);
|
|
4445
4731
|
const hits = [];
|
|
4446
4732
|
const misses = [];
|
|
4447
|
-
if (
|
|
4448
|
-
misses.push(`Expected ${expected.length} tool calls, got ${
|
|
4733
|
+
if (toolCalls.length !== expected.length) {
|
|
4734
|
+
misses.push(`Expected ${expected.length} tool calls, got ${toolCalls.length}`);
|
|
4449
4735
|
}
|
|
4450
|
-
const checkLength = Math.min(expected.length,
|
|
4736
|
+
const checkLength = Math.min(expected.length, toolCalls.length);
|
|
4451
4737
|
for (let i = 0; i < checkLength; i++) {
|
|
4452
4738
|
const expectedTool = expected[i].tool;
|
|
4453
|
-
const actualTool =
|
|
4739
|
+
const actualTool = toolCalls[i].name;
|
|
4454
4740
|
if (actualTool === expectedTool) {
|
|
4455
4741
|
hits.push(`Position ${i}: ${expectedTool} \u2713`);
|
|
4456
4742
|
} else {
|
|
@@ -4664,11 +4950,13 @@ var CompositeEvaluator = class {
|
|
|
4664
4950
|
evalCaseId: context.evalCase.id,
|
|
4665
4951
|
attempt: context.attempt
|
|
4666
4952
|
});
|
|
4667
|
-
const data = freeformEvaluationSchema.parse(
|
|
4953
|
+
const data = freeformEvaluationSchema.parse(
|
|
4954
|
+
parseJsonFromText(extractLastAssistantContent(response.outputMessages))
|
|
4955
|
+
);
|
|
4668
4956
|
const score = clampScore(data.score);
|
|
4669
4957
|
const hits = Array.isArray(data.hits) ? data.hits.filter(isNonEmptyString).slice(0, 4) : [];
|
|
4670
4958
|
const misses = Array.isArray(data.misses) ? data.misses.filter(isNonEmptyString).slice(0, 4) : [];
|
|
4671
|
-
const reasoning = data.reasoning
|
|
4959
|
+
const reasoning = data.reasoning;
|
|
4672
4960
|
return {
|
|
4673
4961
|
score,
|
|
4674
4962
|
verdict: scoreToVerdict(score),
|
|
@@ -4837,16 +5125,6 @@ function validateConcurrency(concurrency) {
|
|
|
4837
5125
|
}
|
|
4838
5126
|
}
|
|
4839
5127
|
|
|
4840
|
-
// src/evaluation/providers/types.ts
|
|
4841
|
-
var AGENT_PROVIDER_KINDS = [
|
|
4842
|
-
"codex",
|
|
4843
|
-
"vscode",
|
|
4844
|
-
"vscode-insiders"
|
|
4845
|
-
];
|
|
4846
|
-
function isAgentProvider(provider) {
|
|
4847
|
-
return provider ? AGENT_PROVIDER_KINDS.includes(provider.kind) : false;
|
|
4848
|
-
}
|
|
4849
|
-
|
|
4850
5128
|
// src/evaluation/orchestrator.ts
|
|
4851
5129
|
async function runEvaluation(options) {
|
|
4852
5130
|
const {
|
|
@@ -5101,11 +5379,14 @@ async function runBatchEvaluation(options) {
|
|
|
5101
5379
|
const evalCase = evalCases[i];
|
|
5102
5380
|
const promptInputs = promptInputsList[i];
|
|
5103
5381
|
const providerResponse = batchResponse[i];
|
|
5382
|
+
const outputMessages = providerResponse.outputMessages;
|
|
5383
|
+
const traceSummary = outputMessages ? computeTraceSummary(outputMessages) : void 0;
|
|
5384
|
+
const candidate = extractLastAssistantContent(outputMessages);
|
|
5104
5385
|
let result;
|
|
5105
5386
|
try {
|
|
5106
5387
|
result = await evaluateCandidate({
|
|
5107
5388
|
evalCase,
|
|
5108
|
-
candidate
|
|
5389
|
+
candidate,
|
|
5109
5390
|
target,
|
|
5110
5391
|
provider,
|
|
5111
5392
|
evaluators: evaluatorRegistry,
|
|
@@ -5113,7 +5394,9 @@ async function runBatchEvaluation(options) {
|
|
|
5113
5394
|
nowFn,
|
|
5114
5395
|
attempt: 0,
|
|
5115
5396
|
judgeProvider: await resolveJudgeProvider(target),
|
|
5116
|
-
agentTimeoutMs
|
|
5397
|
+
agentTimeoutMs,
|
|
5398
|
+
outputMessages,
|
|
5399
|
+
traceSummary
|
|
5117
5400
|
});
|
|
5118
5401
|
} catch (error) {
|
|
5119
5402
|
const errorResult = buildErrorResult(
|
|
@@ -5217,21 +5500,13 @@ async function runEvalCase(options) {
|
|
|
5217
5500
|
if (cacheKey && cache && !cachedResponse) {
|
|
5218
5501
|
await cache.set(cacheKey, providerResponse);
|
|
5219
5502
|
}
|
|
5220
|
-
|
|
5221
|
-
|
|
5222
|
-
|
|
5223
|
-
const rawTrace = await readJsonFile(providerResponse.traceRef);
|
|
5224
|
-
if (Array.isArray(rawTrace) && rawTrace.every(isTraceEvent)) {
|
|
5225
|
-
candidateTrace = rawTrace;
|
|
5226
|
-
}
|
|
5227
|
-
} catch {
|
|
5228
|
-
}
|
|
5229
|
-
}
|
|
5230
|
-
const candidateTraceSummary = candidateTrace ? computeTraceSummary(candidateTrace) : void 0;
|
|
5503
|
+
const outputMessages = providerResponse.outputMessages;
|
|
5504
|
+
const traceSummary = outputMessages ? computeTraceSummary(outputMessages) : void 0;
|
|
5505
|
+
const candidate = extractLastAssistantContent(outputMessages);
|
|
5231
5506
|
try {
|
|
5232
5507
|
return await evaluateCandidate({
|
|
5233
5508
|
evalCase,
|
|
5234
|
-
candidate
|
|
5509
|
+
candidate,
|
|
5235
5510
|
target,
|
|
5236
5511
|
provider,
|
|
5237
5512
|
evaluators,
|
|
@@ -5240,9 +5515,8 @@ async function runEvalCase(options) {
|
|
|
5240
5515
|
attempt,
|
|
5241
5516
|
judgeProvider,
|
|
5242
5517
|
agentTimeoutMs,
|
|
5243
|
-
|
|
5244
|
-
|
|
5245
|
-
candidateTraceSummary
|
|
5518
|
+
outputMessages,
|
|
5519
|
+
traceSummary
|
|
5246
5520
|
});
|
|
5247
5521
|
} catch (error) {
|
|
5248
5522
|
return buildErrorResult(evalCase, target.name, nowFn(), error, promptInputs, provider);
|
|
@@ -5260,9 +5534,8 @@ async function evaluateCandidate(options) {
|
|
|
5260
5534
|
attempt,
|
|
5261
5535
|
judgeProvider,
|
|
5262
5536
|
agentTimeoutMs,
|
|
5263
|
-
|
|
5264
|
-
|
|
5265
|
-
candidateTraceSummary
|
|
5537
|
+
outputMessages,
|
|
5538
|
+
traceSummary
|
|
5266
5539
|
} = options;
|
|
5267
5540
|
const gradeTimestamp = nowFn();
|
|
5268
5541
|
const { score, evaluatorResults } = await runEvaluatorsForCase({
|
|
@@ -5276,9 +5549,8 @@ async function evaluateCandidate(options) {
|
|
|
5276
5549
|
now: gradeTimestamp,
|
|
5277
5550
|
judgeProvider,
|
|
5278
5551
|
agentTimeoutMs,
|
|
5279
|
-
|
|
5280
|
-
|
|
5281
|
-
candidateTraceSummary
|
|
5552
|
+
outputMessages,
|
|
5553
|
+
traceSummary
|
|
5282
5554
|
});
|
|
5283
5555
|
const completedAt = nowFn();
|
|
5284
5556
|
let agentProviderRequest;
|
|
@@ -5316,7 +5588,7 @@ async function evaluateCandidate(options) {
|
|
|
5316
5588
|
lm_provider_request: lmProviderRequest,
|
|
5317
5589
|
evaluator_provider_request: evaluatorResults ? void 0 : score.evaluatorRawRequest,
|
|
5318
5590
|
evaluator_results: evaluatorResults,
|
|
5319
|
-
trace_summary:
|
|
5591
|
+
trace_summary: traceSummary
|
|
5320
5592
|
};
|
|
5321
5593
|
}
|
|
5322
5594
|
async function runEvaluatorsForCase(options) {
|
|
@@ -5331,9 +5603,8 @@ async function runEvaluatorsForCase(options) {
|
|
|
5331
5603
|
now,
|
|
5332
5604
|
judgeProvider,
|
|
5333
5605
|
agentTimeoutMs,
|
|
5334
|
-
|
|
5335
|
-
|
|
5336
|
-
candidateTraceSummary
|
|
5606
|
+
outputMessages,
|
|
5607
|
+
traceSummary
|
|
5337
5608
|
} = options;
|
|
5338
5609
|
if (evalCase.evaluators && evalCase.evaluators.length > 0) {
|
|
5339
5610
|
return runEvaluatorList({
|
|
@@ -5348,9 +5619,8 @@ async function runEvaluatorsForCase(options) {
|
|
|
5348
5619
|
now,
|
|
5349
5620
|
judgeProvider,
|
|
5350
5621
|
agentTimeoutMs,
|
|
5351
|
-
|
|
5352
|
-
|
|
5353
|
-
candidateTraceSummary
|
|
5622
|
+
outputMessages,
|
|
5623
|
+
traceSummary
|
|
5354
5624
|
});
|
|
5355
5625
|
}
|
|
5356
5626
|
const evaluatorKind = evalCase.evaluator ?? "llm_judge";
|
|
@@ -5367,9 +5637,8 @@ async function runEvaluatorsForCase(options) {
|
|
|
5367
5637
|
promptInputs,
|
|
5368
5638
|
now,
|
|
5369
5639
|
judgeProvider,
|
|
5370
|
-
|
|
5371
|
-
|
|
5372
|
-
candidateTraceSummary
|
|
5640
|
+
outputMessages,
|
|
5641
|
+
traceSummary
|
|
5373
5642
|
});
|
|
5374
5643
|
return { score };
|
|
5375
5644
|
}
|
|
@@ -5386,9 +5655,8 @@ async function runEvaluatorList(options) {
|
|
|
5386
5655
|
now,
|
|
5387
5656
|
judgeProvider,
|
|
5388
5657
|
agentTimeoutMs,
|
|
5389
|
-
|
|
5390
|
-
|
|
5391
|
-
candidateTraceSummary
|
|
5658
|
+
outputMessages,
|
|
5659
|
+
traceSummary
|
|
5392
5660
|
} = options;
|
|
5393
5661
|
const scored = [];
|
|
5394
5662
|
const evaluatorResults = [];
|
|
@@ -5435,8 +5703,8 @@ async function runEvaluatorList(options) {
|
|
|
5435
5703
|
attempt,
|
|
5436
5704
|
promptInputs,
|
|
5437
5705
|
now,
|
|
5438
|
-
|
|
5439
|
-
|
|
5706
|
+
outputMessages,
|
|
5707
|
+
traceSummary
|
|
5440
5708
|
});
|
|
5441
5709
|
const weight = evaluator.weight ?? 1;
|
|
5442
5710
|
scored.push({ score: score2, name: evaluator.name, type: "code_judge", weight });
|
|
@@ -5522,9 +5790,8 @@ async function runEvaluatorList(options) {
|
|
|
5522
5790
|
attempt,
|
|
5523
5791
|
promptInputs,
|
|
5524
5792
|
now,
|
|
5525
|
-
|
|
5526
|
-
|
|
5527
|
-
candidateTraceSummary
|
|
5793
|
+
outputMessages,
|
|
5794
|
+
traceSummary
|
|
5528
5795
|
});
|
|
5529
5796
|
const weight = evaluator.weight ?? 1;
|
|
5530
5797
|
scored.push({ score: score2, name: evaluator.name, type: evaluator.type, weight });
|
|
@@ -5918,8 +6185,6 @@ function createAgentKernel() {
|
|
|
5918
6185
|
isJsonValue,
|
|
5919
6186
|
isTestMessage,
|
|
5920
6187
|
isTestMessageRole,
|
|
5921
|
-
isTraceEvent,
|
|
5922
|
-
isTraceEventType,
|
|
5923
6188
|
listTargetNames,
|
|
5924
6189
|
loadEvalCases,
|
|
5925
6190
|
normalizeLineEndings,
|