@agentv/core 1.3.1 → 1.4.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/README.md +77 -77
- package/dist/{chunk-4A6L2F6L.js → chunk-KPHTMTZ3.js} +23 -4
- package/dist/chunk-KPHTMTZ3.js.map +1 -0
- package/dist/evaluation/validation/index.cjs +5 -1
- package/dist/evaluation/validation/index.cjs.map +1 -1
- package/dist/evaluation/validation/index.js +6 -2
- package/dist/evaluation/validation/index.js.map +1 -1
- package/dist/index.cjs +246 -149
- package/dist/index.cjs.map +1 -1
- package/dist/index.d.cts +58 -50
- package/dist/index.d.ts +58 -50
- package/dist/index.js +215 -135
- package/dist/index.js.map +1 -1
- package/package.json +2 -5
- package/dist/chunk-4A6L2F6L.js.map +0 -1
package/dist/index.cjs
CHANGED
|
@@ -54,8 +54,6 @@ __export(index_exports, {
|
|
|
54
54
|
isJsonValue: () => isJsonValue,
|
|
55
55
|
isTestMessage: () => isTestMessage,
|
|
56
56
|
isTestMessageRole: () => isTestMessageRole,
|
|
57
|
-
isTraceEvent: () => isTraceEvent,
|
|
58
|
-
isTraceEventType: () => isTraceEventType,
|
|
59
57
|
listTargetNames: () => listTargetNames,
|
|
60
58
|
loadEvalCases: () => loadEvalCases,
|
|
61
59
|
normalizeLineEndings: () => normalizeLineEndings,
|
|
@@ -135,33 +133,22 @@ function getHitCount(result) {
|
|
|
135
133
|
}
|
|
136
134
|
|
|
137
135
|
// src/evaluation/trace.ts
|
|
138
|
-
function
|
|
139
|
-
return typeof value === "string" && ["model_step", "tool_call", "tool_result", "message", "error"].includes(value);
|
|
140
|
-
}
|
|
141
|
-
function isTraceEvent(value) {
|
|
142
|
-
if (typeof value !== "object" || value === null) {
|
|
143
|
-
return false;
|
|
144
|
-
}
|
|
145
|
-
const candidate = value;
|
|
146
|
-
return isTraceEventType(candidate.type) && typeof candidate.timestamp === "string";
|
|
147
|
-
}
|
|
148
|
-
function computeTraceSummary(trace) {
|
|
136
|
+
function computeTraceSummary(messages) {
|
|
149
137
|
const toolCallCounts = {};
|
|
150
|
-
let
|
|
151
|
-
for (const
|
|
152
|
-
if (
|
|
153
|
-
|
|
154
|
-
|
|
155
|
-
|
|
156
|
-
errorCount++;
|
|
138
|
+
let totalToolCalls = 0;
|
|
139
|
+
for (const message of messages) {
|
|
140
|
+
if (!message.toolCalls) continue;
|
|
141
|
+
for (const toolCall of message.toolCalls) {
|
|
142
|
+
toolCallCounts[toolCall.tool] = (toolCallCounts[toolCall.tool] ?? 0) + 1;
|
|
143
|
+
totalToolCalls++;
|
|
157
144
|
}
|
|
158
145
|
}
|
|
159
146
|
const toolNames = Object.keys(toolCallCounts).sort();
|
|
160
147
|
return {
|
|
161
|
-
eventCount:
|
|
148
|
+
eventCount: totalToolCalls,
|
|
162
149
|
toolNames,
|
|
163
150
|
toolCallsByName: toolCallCounts,
|
|
164
|
-
errorCount
|
|
151
|
+
errorCount: 0
|
|
165
152
|
};
|
|
166
153
|
}
|
|
167
154
|
|
|
@@ -437,7 +424,8 @@ var TEMPLATE_VARIABLES = {
|
|
|
437
424
|
QUESTION: "question",
|
|
438
425
|
EXPECTED_OUTCOME: "expected_outcome",
|
|
439
426
|
REFERENCE_ANSWER: "reference_answer",
|
|
440
|
-
INPUT_MESSAGES: "input_messages"
|
|
427
|
+
INPUT_MESSAGES: "input_messages",
|
|
428
|
+
OUTPUT_MESSAGES: "output_messages"
|
|
441
429
|
};
|
|
442
430
|
var VALID_TEMPLATE_VARIABLES = new Set(Object.values(TEMPLATE_VARIABLES));
|
|
443
431
|
var REQUIRED_TEMPLATE_VARIABLES = /* @__PURE__ */ new Set([
|
|
@@ -1320,16 +1308,16 @@ async function loadEvalCases(evalFilePath, repoRoot, options) {
|
|
|
1320
1308
|
}) : [];
|
|
1321
1309
|
const codeSnippets = extractCodeBlocks(inputSegments);
|
|
1322
1310
|
let referenceAnswer = "";
|
|
1323
|
-
if (outputSegments.length >
|
|
1324
|
-
|
|
1325
|
-
|
|
1326
|
-
const
|
|
1327
|
-
if (typeof
|
|
1328
|
-
referenceAnswer =
|
|
1329
|
-
} else if (
|
|
1330
|
-
referenceAnswer = JSON.stringify(
|
|
1331
|
-
} else if (
|
|
1332
|
-
referenceAnswer = JSON.stringify(
|
|
1311
|
+
if (outputSegments.length > 0) {
|
|
1312
|
+
const lastMessage = outputSegments[outputSegments.length - 1];
|
|
1313
|
+
const content = lastMessage.content;
|
|
1314
|
+
const toolCalls = lastMessage.tool_calls;
|
|
1315
|
+
if (typeof content === "string") {
|
|
1316
|
+
referenceAnswer = content;
|
|
1317
|
+
} else if (content !== void 0 && content !== null) {
|
|
1318
|
+
referenceAnswer = JSON.stringify(content, null, 2);
|
|
1319
|
+
} else if (toolCalls !== void 0 && toolCalls !== null) {
|
|
1320
|
+
referenceAnswer = JSON.stringify(toolCalls, null, 2);
|
|
1333
1321
|
}
|
|
1334
1322
|
}
|
|
1335
1323
|
const question = inputTextParts.map((part) => part.trim()).filter((part) => part.length > 0).join(" ");
|
|
@@ -1772,11 +1760,11 @@ async function invokeModel(options) {
|
|
|
1772
1760
|
return mapResponse(result);
|
|
1773
1761
|
}
|
|
1774
1762
|
function mapResponse(result) {
|
|
1763
|
+
const content = result.text ?? "";
|
|
1775
1764
|
return {
|
|
1776
|
-
text: result.text ?? "",
|
|
1777
|
-
reasoning: result.reasoningText ?? void 0,
|
|
1778
1765
|
raw: result,
|
|
1779
|
-
usage: toJsonObject(result.totalUsage ?? result.usage)
|
|
1766
|
+
usage: toJsonObject(result.totalUsage ?? result.usage),
|
|
1767
|
+
outputMessages: [{ role: "assistant", content }]
|
|
1780
1768
|
};
|
|
1781
1769
|
}
|
|
1782
1770
|
function toJsonObject(value) {
|
|
@@ -1929,6 +1917,7 @@ var CliProvider = class {
|
|
|
1929
1917
|
config;
|
|
1930
1918
|
runCommand;
|
|
1931
1919
|
verbose;
|
|
1920
|
+
keepTempFiles;
|
|
1932
1921
|
healthcheckPromise;
|
|
1933
1922
|
constructor(targetName, config, runner = defaultCommandRunner) {
|
|
1934
1923
|
this.targetName = targetName;
|
|
@@ -1936,6 +1925,7 @@ var CliProvider = class {
|
|
|
1936
1925
|
this.config = config;
|
|
1937
1926
|
this.runCommand = runner;
|
|
1938
1927
|
this.verbose = config.verbose ?? false;
|
|
1928
|
+
this.keepTempFiles = config.keepTempFiles ?? false;
|
|
1939
1929
|
}
|
|
1940
1930
|
async invoke(request) {
|
|
1941
1931
|
if (request.signal?.aborted) {
|
|
@@ -1973,8 +1963,7 @@ var CliProvider = class {
|
|
|
1973
1963
|
const responseContent = await this.readAndCleanupOutputFile(outputFilePath);
|
|
1974
1964
|
const parsed = this.parseOutputContent(responseContent);
|
|
1975
1965
|
return {
|
|
1976
|
-
|
|
1977
|
-
trace: parsed.trace,
|
|
1966
|
+
outputMessages: parsed.outputMessages,
|
|
1978
1967
|
raw: {
|
|
1979
1968
|
command: renderedCommand,
|
|
1980
1969
|
stderr: result.stderr,
|
|
@@ -2053,7 +2042,7 @@ var CliProvider = class {
|
|
|
2053
2042
|
const evalCaseId = request.evalCaseId;
|
|
2054
2043
|
if (!evalCaseId) {
|
|
2055
2044
|
return {
|
|
2056
|
-
|
|
2045
|
+
outputMessages: [],
|
|
2057
2046
|
raw: {
|
|
2058
2047
|
command: renderedCommand,
|
|
2059
2048
|
stderr: result.stderr,
|
|
@@ -2066,7 +2055,7 @@ var CliProvider = class {
|
|
|
2066
2055
|
const parsed = recordsById.get(evalCaseId);
|
|
2067
2056
|
if (!parsed) {
|
|
2068
2057
|
return {
|
|
2069
|
-
|
|
2058
|
+
outputMessages: [],
|
|
2070
2059
|
raw: {
|
|
2071
2060
|
command: renderedCommand,
|
|
2072
2061
|
stderr: result.stderr,
|
|
@@ -2077,9 +2066,7 @@ var CliProvider = class {
|
|
|
2077
2066
|
};
|
|
2078
2067
|
}
|
|
2079
2068
|
return {
|
|
2080
|
-
|
|
2081
|
-
trace: parsed.trace,
|
|
2082
|
-
traceRef: parsed.traceRef,
|
|
2069
|
+
outputMessages: parsed.outputMessages,
|
|
2083
2070
|
raw: {
|
|
2084
2071
|
command: renderedCommand,
|
|
2085
2072
|
stderr: result.stderr,
|
|
@@ -2094,28 +2081,81 @@ var CliProvider = class {
|
|
|
2094
2081
|
}
|
|
2095
2082
|
/**
|
|
2096
2083
|
* Parse output content from CLI.
|
|
2097
|
-
* If the content is valid JSON with
|
|
2098
|
-
*
|
|
2084
|
+
* If the content is valid JSON with 'output_messages' or 'text' field, extract them.
|
|
2085
|
+
* If only 'text' is provided, wrap it in outputMessages.
|
|
2086
|
+
* Otherwise, treat the entire content as plain text wrapped in outputMessages.
|
|
2099
2087
|
*/
|
|
2100
2088
|
parseOutputContent(content) {
|
|
2101
2089
|
try {
|
|
2102
2090
|
const parsed = JSON.parse(content);
|
|
2103
|
-
if (typeof parsed === "object" && parsed !== null
|
|
2091
|
+
if (typeof parsed === "object" && parsed !== null) {
|
|
2104
2092
|
const obj = parsed;
|
|
2105
|
-
const
|
|
2106
|
-
|
|
2107
|
-
|
|
2093
|
+
const outputMessages = this.parseOutputMessages(obj.output_messages);
|
|
2094
|
+
if (outputMessages && outputMessages.length > 0) {
|
|
2095
|
+
return { outputMessages };
|
|
2096
|
+
}
|
|
2097
|
+
if ("text" in obj) {
|
|
2098
|
+
const text = typeof obj.text === "string" ? obj.text : String(obj.text);
|
|
2099
|
+
return { outputMessages: [{ role: "assistant", content: text }] };
|
|
2100
|
+
}
|
|
2108
2101
|
}
|
|
2109
2102
|
} catch {
|
|
2110
2103
|
}
|
|
2111
|
-
return {
|
|
2104
|
+
return { outputMessages: [{ role: "assistant", content }] };
|
|
2112
2105
|
}
|
|
2113
|
-
|
|
2114
|
-
|
|
2106
|
+
/**
|
|
2107
|
+
* Parse output_messages from JSONL (snake_case) and convert to OutputMessage[] (camelCase).
|
|
2108
|
+
*/
|
|
2109
|
+
parseOutputMessages(outputMessages) {
|
|
2110
|
+
if (!Array.isArray(outputMessages)) {
|
|
2115
2111
|
return void 0;
|
|
2116
2112
|
}
|
|
2117
|
-
const
|
|
2118
|
-
|
|
2113
|
+
const messages = [];
|
|
2114
|
+
for (const msg of outputMessages) {
|
|
2115
|
+
if (typeof msg !== "object" || msg === null) {
|
|
2116
|
+
continue;
|
|
2117
|
+
}
|
|
2118
|
+
const rawMsg = msg;
|
|
2119
|
+
if (typeof rawMsg.role !== "string") {
|
|
2120
|
+
continue;
|
|
2121
|
+
}
|
|
2122
|
+
const message = {
|
|
2123
|
+
role: rawMsg.role,
|
|
2124
|
+
name: typeof rawMsg.name === "string" ? rawMsg.name : void 0,
|
|
2125
|
+
content: rawMsg.content,
|
|
2126
|
+
toolCalls: this.parseToolCalls(rawMsg.tool_calls),
|
|
2127
|
+
timestamp: typeof rawMsg.timestamp === "string" ? rawMsg.timestamp : void 0,
|
|
2128
|
+
metadata: typeof rawMsg.metadata === "object" && rawMsg.metadata !== null ? rawMsg.metadata : void 0
|
|
2129
|
+
};
|
|
2130
|
+
messages.push(message);
|
|
2131
|
+
}
|
|
2132
|
+
return messages.length > 0 ? messages : void 0;
|
|
2133
|
+
}
|
|
2134
|
+
/**
|
|
2135
|
+
* Parse tool_calls from JSONL (snake_case) and convert to ToolCall[] format.
|
|
2136
|
+
*/
|
|
2137
|
+
parseToolCalls(toolCalls) {
|
|
2138
|
+
if (!Array.isArray(toolCalls)) {
|
|
2139
|
+
return void 0;
|
|
2140
|
+
}
|
|
2141
|
+
const calls = [];
|
|
2142
|
+
for (const call of toolCalls) {
|
|
2143
|
+
if (typeof call !== "object" || call === null) {
|
|
2144
|
+
continue;
|
|
2145
|
+
}
|
|
2146
|
+
const rawCall = call;
|
|
2147
|
+
if (typeof rawCall.tool !== "string") {
|
|
2148
|
+
continue;
|
|
2149
|
+
}
|
|
2150
|
+
calls.push({
|
|
2151
|
+
tool: rawCall.tool,
|
|
2152
|
+
input: rawCall.input,
|
|
2153
|
+
output: rawCall.output,
|
|
2154
|
+
id: typeof rawCall.id === "string" ? rawCall.id : void 0,
|
|
2155
|
+
timestamp: typeof rawCall.timestamp === "string" ? rawCall.timestamp : void 0
|
|
2156
|
+
});
|
|
2157
|
+
}
|
|
2158
|
+
return calls.length > 0 ? calls : void 0;
|
|
2119
2159
|
}
|
|
2120
2160
|
parseJsonlBatchOutput(content) {
|
|
2121
2161
|
const records = /* @__PURE__ */ new Map();
|
|
@@ -2139,12 +2179,16 @@ var CliProvider = class {
|
|
|
2139
2179
|
if (records.has(id)) {
|
|
2140
2180
|
throw new Error(`CLI batch output contains duplicate id: ${id}`);
|
|
2141
2181
|
}
|
|
2142
|
-
const
|
|
2143
|
-
|
|
2182
|
+
const parsedOutputMessages = this.parseOutputMessages(obj.output_messages);
|
|
2183
|
+
let outputMessages;
|
|
2184
|
+
if (parsedOutputMessages && parsedOutputMessages.length > 0) {
|
|
2185
|
+
outputMessages = parsedOutputMessages;
|
|
2186
|
+
} else {
|
|
2187
|
+
const text = typeof obj.text === "string" ? obj.text : obj.text === void 0 ? "" : JSON.stringify(obj.text);
|
|
2188
|
+
outputMessages = text ? [{ role: "assistant", content: text }] : [];
|
|
2189
|
+
}
|
|
2144
2190
|
records.set(id, {
|
|
2145
|
-
|
|
2146
|
-
trace: this.parseTrace(obj.trace),
|
|
2147
|
-
traceRef
|
|
2191
|
+
outputMessages
|
|
2148
2192
|
});
|
|
2149
2193
|
}
|
|
2150
2194
|
return records;
|
|
@@ -2157,8 +2201,10 @@ var CliProvider = class {
|
|
|
2157
2201
|
const errorMsg = error instanceof Error ? error.message : String(error);
|
|
2158
2202
|
throw new Error(`Failed to read output file '${filePath}': ${errorMsg}`);
|
|
2159
2203
|
} finally {
|
|
2160
|
-
|
|
2161
|
-
|
|
2204
|
+
if (!this.keepTempFiles) {
|
|
2205
|
+
await import_promises8.default.unlink(filePath).catch(() => {
|
|
2206
|
+
});
|
|
2207
|
+
}
|
|
2162
2208
|
}
|
|
2163
2209
|
}
|
|
2164
2210
|
async ensureHealthy(signal) {
|
|
@@ -2501,7 +2547,6 @@ var CodexProvider = class {
|
|
|
2501
2547
|
const parsed = parseCodexJson(result.stdout);
|
|
2502
2548
|
const assistantText = extractAssistantText(parsed);
|
|
2503
2549
|
return {
|
|
2504
|
-
text: assistantText,
|
|
2505
2550
|
raw: {
|
|
2506
2551
|
response: parsed,
|
|
2507
2552
|
stdout: result.stdout,
|
|
@@ -2513,7 +2558,8 @@ var CodexProvider = class {
|
|
|
2513
2558
|
workspace: workspaceRoot,
|
|
2514
2559
|
inputFiles,
|
|
2515
2560
|
logFile: logger?.filePath
|
|
2516
|
-
}
|
|
2561
|
+
},
|
|
2562
|
+
outputMessages: [{ role: "assistant", content: assistantText }]
|
|
2517
2563
|
};
|
|
2518
2564
|
} finally {
|
|
2519
2565
|
await logger?.close();
|
|
@@ -3135,7 +3181,6 @@ var MockProvider = class {
|
|
|
3135
3181
|
delayMs;
|
|
3136
3182
|
delayMinMs;
|
|
3137
3183
|
delayMaxMs;
|
|
3138
|
-
trace;
|
|
3139
3184
|
constructor(targetName, config) {
|
|
3140
3185
|
this.id = `mock:${targetName}`;
|
|
3141
3186
|
this.targetName = targetName;
|
|
@@ -3143,7 +3188,6 @@ var MockProvider = class {
|
|
|
3143
3188
|
this.delayMs = config.delayMs ?? 0;
|
|
3144
3189
|
this.delayMinMs = config.delayMinMs ?? 0;
|
|
3145
3190
|
this.delayMaxMs = config.delayMaxMs ?? 0;
|
|
3146
|
-
this.trace = config.trace;
|
|
3147
3191
|
}
|
|
3148
3192
|
async invoke(request) {
|
|
3149
3193
|
const delay = this.calculateDelay();
|
|
@@ -3151,12 +3195,11 @@ var MockProvider = class {
|
|
|
3151
3195
|
await new Promise((resolve) => setTimeout(resolve, delay));
|
|
3152
3196
|
}
|
|
3153
3197
|
return {
|
|
3154
|
-
|
|
3198
|
+
outputMessages: [{ role: "assistant", content: this.cannedResponse }],
|
|
3155
3199
|
raw: {
|
|
3156
3200
|
question: request.question,
|
|
3157
3201
|
guidelines: request.guidelines
|
|
3158
|
-
}
|
|
3159
|
-
trace: this.trace
|
|
3202
|
+
}
|
|
3160
3203
|
};
|
|
3161
3204
|
}
|
|
3162
3205
|
calculateDelay() {
|
|
@@ -3424,8 +3467,7 @@ function normalizeCodexLogFormat(value) {
|
|
|
3424
3467
|
}
|
|
3425
3468
|
function resolveMockConfig(target) {
|
|
3426
3469
|
const response = typeof target.response === "string" ? target.response : void 0;
|
|
3427
|
-
|
|
3428
|
-
return { response, trace };
|
|
3470
|
+
return { response };
|
|
3429
3471
|
}
|
|
3430
3472
|
function resolveVSCodeConfig(target, env, insiders) {
|
|
3431
3473
|
const workspaceTemplateEnvVar = resolveOptionalLiteralString(
|
|
@@ -3463,6 +3505,9 @@ function resolveCliConfig(target, env, evalFilePath) {
|
|
|
3463
3505
|
target.files_format ?? target.filesFormat ?? target.attachments_format ?? target.attachmentsFormat
|
|
3464
3506
|
);
|
|
3465
3507
|
const verbose = resolveOptionalBoolean(target.verbose ?? target.cli_verbose ?? target.cliVerbose);
|
|
3508
|
+
const keepTempFiles = resolveOptionalBoolean(
|
|
3509
|
+
target.keep_temp_files ?? target.keepTempFiles ?? target.keep_output_files ?? target.keepOutputFiles
|
|
3510
|
+
);
|
|
3466
3511
|
let cwd = resolveOptionalString(target.cwd, env, `${target.name} working directory`, {
|
|
3467
3512
|
allowLiteral: true,
|
|
3468
3513
|
optionalEnv: true
|
|
@@ -3491,7 +3536,8 @@ function resolveCliConfig(target, env, evalFilePath) {
|
|
|
3491
3536
|
cwd,
|
|
3492
3537
|
timeoutMs,
|
|
3493
3538
|
healthcheck,
|
|
3494
|
-
verbose
|
|
3539
|
+
verbose,
|
|
3540
|
+
keepTempFiles
|
|
3495
3541
|
};
|
|
3496
3542
|
}
|
|
3497
3543
|
function resolveTimeoutMs(source, description) {
|
|
@@ -3786,7 +3832,7 @@ var VSCodeProvider = class {
|
|
|
3786
3832
|
}
|
|
3787
3833
|
if (this.config.dryRun) {
|
|
3788
3834
|
return {
|
|
3789
|
-
|
|
3835
|
+
outputMessages: [],
|
|
3790
3836
|
raw: {
|
|
3791
3837
|
session,
|
|
3792
3838
|
inputFiles
|
|
@@ -3795,7 +3841,7 @@ var VSCodeProvider = class {
|
|
|
3795
3841
|
}
|
|
3796
3842
|
const responseText = await readTextFile(session.responseFile);
|
|
3797
3843
|
return {
|
|
3798
|
-
|
|
3844
|
+
outputMessages: [{ role: "assistant", content: responseText }],
|
|
3799
3845
|
raw: {
|
|
3800
3846
|
session,
|
|
3801
3847
|
inputFiles
|
|
@@ -3833,7 +3879,7 @@ var VSCodeProvider = class {
|
|
|
3833
3879
|
}
|
|
3834
3880
|
if (this.config.dryRun) {
|
|
3835
3881
|
return normalizedRequests.map(({ inputFiles }) => ({
|
|
3836
|
-
|
|
3882
|
+
outputMessages: [],
|
|
3837
3883
|
raw: {
|
|
3838
3884
|
session,
|
|
3839
3885
|
inputFiles,
|
|
@@ -3850,7 +3896,7 @@ var VSCodeProvider = class {
|
|
|
3850
3896
|
for (const [index, responseFile] of session.responseFiles.entries()) {
|
|
3851
3897
|
const responseText = await readTextFile(responseFile);
|
|
3852
3898
|
responses.push({
|
|
3853
|
-
|
|
3899
|
+
outputMessages: [{ role: "assistant", content: responseText }],
|
|
3854
3900
|
raw: {
|
|
3855
3901
|
session,
|
|
3856
3902
|
inputFiles: normalizedRequests[index]?.inputFiles,
|
|
@@ -4090,6 +4136,33 @@ function resolveAndCreateProvider(definition, env = process.env) {
|
|
|
4090
4136
|
// src/evaluation/evaluators.ts
|
|
4091
4137
|
var import_ai2 = require("ai");
|
|
4092
4138
|
var import_zod2 = require("zod");
|
|
4139
|
+
|
|
4140
|
+
// src/evaluation/providers/types.ts
|
|
4141
|
+
var AGENT_PROVIDER_KINDS = [
|
|
4142
|
+
"codex",
|
|
4143
|
+
"vscode",
|
|
4144
|
+
"vscode-insiders"
|
|
4145
|
+
];
|
|
4146
|
+
function extractLastAssistantContent(messages) {
|
|
4147
|
+
if (!messages || messages.length === 0) {
|
|
4148
|
+
return "";
|
|
4149
|
+
}
|
|
4150
|
+
for (let i = messages.length - 1; i >= 0; i--) {
|
|
4151
|
+
const msg = messages[i];
|
|
4152
|
+
if (msg.role === "assistant" && msg.content !== void 0) {
|
|
4153
|
+
if (typeof msg.content === "string") {
|
|
4154
|
+
return msg.content;
|
|
4155
|
+
}
|
|
4156
|
+
return JSON.stringify(msg.content);
|
|
4157
|
+
}
|
|
4158
|
+
}
|
|
4159
|
+
return "";
|
|
4160
|
+
}
|
|
4161
|
+
function isAgentProvider(provider) {
|
|
4162
|
+
return provider ? AGENT_PROVIDER_KINDS.includes(provider.kind) : false;
|
|
4163
|
+
}
|
|
4164
|
+
|
|
4165
|
+
// src/evaluation/evaluators.ts
|
|
4093
4166
|
var DEFAULT_EVALUATOR_TEMPLATE = `You are an expert evaluator. Your goal is to grade the candidate_answer based on how well it achieves the expected_outcome for the original task.
|
|
4094
4167
|
|
|
4095
4168
|
Use the reference_answer as a gold standard for a high-quality response (if provided). The reference_answer may be a simple text response, or it may contain a sequence of expected agent messages including tool calls. When it contains multiple messages, the last message represents the final expected answer. The candidate_answer does not need to match it verbatim, but should capture the key points and follow the same spirit.
|
|
@@ -4154,6 +4227,7 @@ var LlmJudgeEvaluator = class {
|
|
|
4154
4227
|
null,
|
|
4155
4228
|
2
|
|
4156
4229
|
),
|
|
4230
|
+
[TEMPLATE_VARIABLES.OUTPUT_MESSAGES]: JSON.stringify(context.outputMessages ?? [], null, 2),
|
|
4157
4231
|
[TEMPLATE_VARIABLES.CANDIDATE_ANSWER]: context.candidate.trim(),
|
|
4158
4232
|
[TEMPLATE_VARIABLES.REFERENCE_ANSWER]: (context.evalCase.reference_answer ?? "").trim(),
|
|
4159
4233
|
[TEMPLATE_VARIABLES.EXPECTED_OUTCOME]: context.evalCase.expected_outcome.trim(),
|
|
@@ -4178,7 +4252,7 @@ var LlmJudgeEvaluator = class {
|
|
|
4178
4252
|
const score = clampScore(data.score);
|
|
4179
4253
|
const hits = Array.isArray(data.hits) ? data.hits.filter(isNonEmptyString).slice(0, 4) : [];
|
|
4180
4254
|
const misses = Array.isArray(data.misses) ? data.misses.filter(isNonEmptyString).slice(0, 4) : [];
|
|
4181
|
-
const reasoning = data.reasoning
|
|
4255
|
+
const reasoning = data.reasoning;
|
|
4182
4256
|
const expectedAspectCount = Math.max(hits.length + misses.length, 1);
|
|
4183
4257
|
return {
|
|
4184
4258
|
score,
|
|
@@ -4280,7 +4354,9 @@ var LlmJudgeEvaluator = class {
|
|
|
4280
4354
|
maxOutputTokens: this.maxOutputTokens,
|
|
4281
4355
|
temperature: this.temperature
|
|
4282
4356
|
});
|
|
4283
|
-
const data = schema.parse(
|
|
4357
|
+
const data = schema.parse(
|
|
4358
|
+
parseJsonFromText(extractLastAssistantContent(response.outputMessages))
|
|
4359
|
+
);
|
|
4284
4360
|
return { data, providerResponse: response };
|
|
4285
4361
|
} catch (e) {
|
|
4286
4362
|
lastError = e instanceof Error ? e : new Error(String(e));
|
|
@@ -4366,13 +4442,13 @@ var CodeEvaluator = class {
|
|
|
4366
4442
|
expected_messages: context.evalCase.expected_messages,
|
|
4367
4443
|
reference_answer: context.evalCase.reference_answer,
|
|
4368
4444
|
candidate_answer: context.candidate,
|
|
4445
|
+
output_messages: context.outputMessages ?? null,
|
|
4369
4446
|
guideline_files: context.evalCase.guideline_paths,
|
|
4370
4447
|
input_files: context.evalCase.file_paths.filter(
|
|
4371
4448
|
(path15) => !context.evalCase.guideline_paths.includes(path15)
|
|
4372
4449
|
),
|
|
4373
4450
|
input_messages: context.evalCase.input_messages,
|
|
4374
|
-
|
|
4375
|
-
candidate_trace_summary: context.candidateTraceSummary ?? null
|
|
4451
|
+
candidate_trace_summary: context.traceSummary ?? null
|
|
4376
4452
|
},
|
|
4377
4453
|
null,
|
|
4378
4454
|
2
|
|
@@ -4499,8 +4575,19 @@ var ToolTrajectoryEvaluator = class {
|
|
|
4499
4575
|
this.config = options.config;
|
|
4500
4576
|
}
|
|
4501
4577
|
evaluate(context) {
|
|
4502
|
-
const {
|
|
4503
|
-
|
|
4578
|
+
const { outputMessages, traceSummary } = context;
|
|
4579
|
+
const toolCalls = this.extractToolCallsFromMessages(outputMessages);
|
|
4580
|
+
if (toolCalls.length === 0 && !traceSummary) {
|
|
4581
|
+
return {
|
|
4582
|
+
score: 0,
|
|
4583
|
+
verdict: "fail",
|
|
4584
|
+
hits: [],
|
|
4585
|
+
misses: ["No trace available for evaluation"],
|
|
4586
|
+
expectedAspectCount: 1
|
|
4587
|
+
};
|
|
4588
|
+
}
|
|
4589
|
+
const summary = toolCalls.length > 0 ? this.buildSummary(toolCalls) : traceSummary;
|
|
4590
|
+
if (!summary) {
|
|
4504
4591
|
return {
|
|
4505
4592
|
score: 0,
|
|
4506
4593
|
verdict: "fail",
|
|
@@ -4511,11 +4598,11 @@ var ToolTrajectoryEvaluator = class {
|
|
|
4511
4598
|
}
|
|
4512
4599
|
switch (this.config.mode) {
|
|
4513
4600
|
case "any_order":
|
|
4514
|
-
return this.evaluateAnyOrder(
|
|
4601
|
+
return this.evaluateAnyOrder(summary);
|
|
4515
4602
|
case "in_order":
|
|
4516
|
-
return this.evaluateInOrder(
|
|
4603
|
+
return this.evaluateInOrder(toolCalls);
|
|
4517
4604
|
case "exact":
|
|
4518
|
-
return this.evaluateExact(
|
|
4605
|
+
return this.evaluateExact(toolCalls);
|
|
4519
4606
|
default:
|
|
4520
4607
|
return {
|
|
4521
4608
|
score: 0,
|
|
@@ -4526,6 +4613,39 @@ var ToolTrajectoryEvaluator = class {
|
|
|
4526
4613
|
};
|
|
4527
4614
|
}
|
|
4528
4615
|
}
|
|
4616
|
+
/**
|
|
4617
|
+
* Extract tool calls from output messages.
|
|
4618
|
+
*/
|
|
4619
|
+
extractToolCallsFromMessages(messages) {
|
|
4620
|
+
if (!messages) {
|
|
4621
|
+
return [];
|
|
4622
|
+
}
|
|
4623
|
+
const toolCalls = [];
|
|
4624
|
+
for (const message of messages) {
|
|
4625
|
+
if (message.toolCalls) {
|
|
4626
|
+
for (const call of message.toolCalls) {
|
|
4627
|
+
toolCalls.push({ name: call.tool });
|
|
4628
|
+
}
|
|
4629
|
+
}
|
|
4630
|
+
}
|
|
4631
|
+
return toolCalls;
|
|
4632
|
+
}
|
|
4633
|
+
/**
|
|
4634
|
+
* Build a summary from extracted tool calls.
|
|
4635
|
+
*/
|
|
4636
|
+
buildSummary(toolCalls) {
|
|
4637
|
+
const toolCallsByName = {};
|
|
4638
|
+
for (const call of toolCalls) {
|
|
4639
|
+
toolCallsByName[call.name] = (toolCallsByName[call.name] ?? 0) + 1;
|
|
4640
|
+
}
|
|
4641
|
+
const toolNames = Object.keys(toolCallsByName).sort();
|
|
4642
|
+
return {
|
|
4643
|
+
eventCount: toolCalls.length,
|
|
4644
|
+
toolNames,
|
|
4645
|
+
toolCallsByName,
|
|
4646
|
+
errorCount: 0
|
|
4647
|
+
};
|
|
4648
|
+
}
|
|
4529
4649
|
evaluateAnyOrder(summary) {
|
|
4530
4650
|
const minimums = this.config.minimums ?? {};
|
|
4531
4651
|
const toolNames = Object.keys(minimums);
|
|
@@ -4558,7 +4678,7 @@ var ToolTrajectoryEvaluator = class {
|
|
|
4558
4678
|
expectedAspectCount: toolNames.length
|
|
4559
4679
|
};
|
|
4560
4680
|
}
|
|
4561
|
-
evaluateInOrder(
|
|
4681
|
+
evaluateInOrder(toolCalls) {
|
|
4562
4682
|
const expected = this.config.expected ?? [];
|
|
4563
4683
|
if (expected.length === 0) {
|
|
4564
4684
|
return {
|
|
@@ -4569,15 +4689,14 @@ var ToolTrajectoryEvaluator = class {
|
|
|
4569
4689
|
expectedAspectCount: 0
|
|
4570
4690
|
};
|
|
4571
4691
|
}
|
|
4572
|
-
const actualToolCalls = trace.filter((e) => e.type === "tool_call" && e.name);
|
|
4573
4692
|
const hits = [];
|
|
4574
4693
|
const misses = [];
|
|
4575
4694
|
let actualIndex = 0;
|
|
4576
4695
|
for (let i = 0; i < expected.length; i++) {
|
|
4577
4696
|
const expectedTool = expected[i].tool;
|
|
4578
4697
|
let found = false;
|
|
4579
|
-
while (actualIndex <
|
|
4580
|
-
if (
|
|
4698
|
+
while (actualIndex < toolCalls.length) {
|
|
4699
|
+
if (toolCalls[actualIndex].name === expectedTool) {
|
|
4581
4700
|
hits.push(`Found ${expectedTool} at position ${actualIndex}`);
|
|
4582
4701
|
actualIndex++;
|
|
4583
4702
|
found = true;
|
|
@@ -4598,7 +4717,7 @@ var ToolTrajectoryEvaluator = class {
|
|
|
4598
4717
|
expectedAspectCount: expected.length
|
|
4599
4718
|
};
|
|
4600
4719
|
}
|
|
4601
|
-
evaluateExact(
|
|
4720
|
+
evaluateExact(toolCalls) {
|
|
4602
4721
|
const expected = this.config.expected ?? [];
|
|
4603
4722
|
if (expected.length === 0) {
|
|
4604
4723
|
return {
|
|
@@ -4609,16 +4728,15 @@ var ToolTrajectoryEvaluator = class {
|
|
|
4609
4728
|
expectedAspectCount: 0
|
|
4610
4729
|
};
|
|
4611
4730
|
}
|
|
4612
|
-
const actualToolCalls = trace.filter((e) => e.type === "tool_call" && e.name);
|
|
4613
4731
|
const hits = [];
|
|
4614
4732
|
const misses = [];
|
|
4615
|
-
if (
|
|
4616
|
-
misses.push(`Expected ${expected.length} tool calls, got ${
|
|
4733
|
+
if (toolCalls.length !== expected.length) {
|
|
4734
|
+
misses.push(`Expected ${expected.length} tool calls, got ${toolCalls.length}`);
|
|
4617
4735
|
}
|
|
4618
|
-
const checkLength = Math.min(expected.length,
|
|
4736
|
+
const checkLength = Math.min(expected.length, toolCalls.length);
|
|
4619
4737
|
for (let i = 0; i < checkLength; i++) {
|
|
4620
4738
|
const expectedTool = expected[i].tool;
|
|
4621
|
-
const actualTool =
|
|
4739
|
+
const actualTool = toolCalls[i].name;
|
|
4622
4740
|
if (actualTool === expectedTool) {
|
|
4623
4741
|
hits.push(`Position ${i}: ${expectedTool} \u2713`);
|
|
4624
4742
|
} else {
|
|
@@ -4832,11 +4950,13 @@ var CompositeEvaluator = class {
|
|
|
4832
4950
|
evalCaseId: context.evalCase.id,
|
|
4833
4951
|
attempt: context.attempt
|
|
4834
4952
|
});
|
|
4835
|
-
const data = freeformEvaluationSchema.parse(
|
|
4953
|
+
const data = freeformEvaluationSchema.parse(
|
|
4954
|
+
parseJsonFromText(extractLastAssistantContent(response.outputMessages))
|
|
4955
|
+
);
|
|
4836
4956
|
const score = clampScore(data.score);
|
|
4837
4957
|
const hits = Array.isArray(data.hits) ? data.hits.filter(isNonEmptyString).slice(0, 4) : [];
|
|
4838
4958
|
const misses = Array.isArray(data.misses) ? data.misses.filter(isNonEmptyString).slice(0, 4) : [];
|
|
4839
|
-
const reasoning = data.reasoning
|
|
4959
|
+
const reasoning = data.reasoning;
|
|
4840
4960
|
return {
|
|
4841
4961
|
score,
|
|
4842
4962
|
verdict: scoreToVerdict(score),
|
|
@@ -5005,16 +5125,6 @@ function validateConcurrency(concurrency) {
|
|
|
5005
5125
|
}
|
|
5006
5126
|
}
|
|
5007
5127
|
|
|
5008
|
-
// src/evaluation/providers/types.ts
|
|
5009
|
-
var AGENT_PROVIDER_KINDS = [
|
|
5010
|
-
"codex",
|
|
5011
|
-
"vscode",
|
|
5012
|
-
"vscode-insiders"
|
|
5013
|
-
];
|
|
5014
|
-
function isAgentProvider(provider) {
|
|
5015
|
-
return provider ? AGENT_PROVIDER_KINDS.includes(provider.kind) : false;
|
|
5016
|
-
}
|
|
5017
|
-
|
|
5018
5128
|
// src/evaluation/orchestrator.ts
|
|
5019
5129
|
async function runEvaluation(options) {
|
|
5020
5130
|
const {
|
|
@@ -5269,11 +5379,14 @@ async function runBatchEvaluation(options) {
|
|
|
5269
5379
|
const evalCase = evalCases[i];
|
|
5270
5380
|
const promptInputs = promptInputsList[i];
|
|
5271
5381
|
const providerResponse = batchResponse[i];
|
|
5382
|
+
const outputMessages = providerResponse.outputMessages;
|
|
5383
|
+
const traceSummary = outputMessages ? computeTraceSummary(outputMessages) : void 0;
|
|
5384
|
+
const candidate = extractLastAssistantContent(outputMessages);
|
|
5272
5385
|
let result;
|
|
5273
5386
|
try {
|
|
5274
5387
|
result = await evaluateCandidate({
|
|
5275
5388
|
evalCase,
|
|
5276
|
-
candidate
|
|
5389
|
+
candidate,
|
|
5277
5390
|
target,
|
|
5278
5391
|
provider,
|
|
5279
5392
|
evaluators: evaluatorRegistry,
|
|
@@ -5281,7 +5394,9 @@ async function runBatchEvaluation(options) {
|
|
|
5281
5394
|
nowFn,
|
|
5282
5395
|
attempt: 0,
|
|
5283
5396
|
judgeProvider: await resolveJudgeProvider(target),
|
|
5284
|
-
agentTimeoutMs
|
|
5397
|
+
agentTimeoutMs,
|
|
5398
|
+
outputMessages,
|
|
5399
|
+
traceSummary
|
|
5285
5400
|
});
|
|
5286
5401
|
} catch (error) {
|
|
5287
5402
|
const errorResult = buildErrorResult(
|
|
@@ -5385,21 +5500,13 @@ async function runEvalCase(options) {
|
|
|
5385
5500
|
if (cacheKey && cache && !cachedResponse) {
|
|
5386
5501
|
await cache.set(cacheKey, providerResponse);
|
|
5387
5502
|
}
|
|
5388
|
-
|
|
5389
|
-
|
|
5390
|
-
|
|
5391
|
-
const rawTrace = await readJsonFile(providerResponse.traceRef);
|
|
5392
|
-
if (Array.isArray(rawTrace) && rawTrace.every(isTraceEvent)) {
|
|
5393
|
-
candidateTrace = rawTrace;
|
|
5394
|
-
}
|
|
5395
|
-
} catch {
|
|
5396
|
-
}
|
|
5397
|
-
}
|
|
5398
|
-
const candidateTraceSummary = candidateTrace ? computeTraceSummary(candidateTrace) : void 0;
|
|
5503
|
+
const outputMessages = providerResponse.outputMessages;
|
|
5504
|
+
const traceSummary = outputMessages ? computeTraceSummary(outputMessages) : void 0;
|
|
5505
|
+
const candidate = extractLastAssistantContent(outputMessages);
|
|
5399
5506
|
try {
|
|
5400
5507
|
return await evaluateCandidate({
|
|
5401
5508
|
evalCase,
|
|
5402
|
-
candidate
|
|
5509
|
+
candidate,
|
|
5403
5510
|
target,
|
|
5404
5511
|
provider,
|
|
5405
5512
|
evaluators,
|
|
@@ -5408,9 +5515,8 @@ async function runEvalCase(options) {
|
|
|
5408
5515
|
attempt,
|
|
5409
5516
|
judgeProvider,
|
|
5410
5517
|
agentTimeoutMs,
|
|
5411
|
-
|
|
5412
|
-
|
|
5413
|
-
candidateTraceSummary
|
|
5518
|
+
outputMessages,
|
|
5519
|
+
traceSummary
|
|
5414
5520
|
});
|
|
5415
5521
|
} catch (error) {
|
|
5416
5522
|
return buildErrorResult(evalCase, target.name, nowFn(), error, promptInputs, provider);
|
|
@@ -5428,9 +5534,8 @@ async function evaluateCandidate(options) {
|
|
|
5428
5534
|
attempt,
|
|
5429
5535
|
judgeProvider,
|
|
5430
5536
|
agentTimeoutMs,
|
|
5431
|
-
|
|
5432
|
-
|
|
5433
|
-
candidateTraceSummary
|
|
5537
|
+
outputMessages,
|
|
5538
|
+
traceSummary
|
|
5434
5539
|
} = options;
|
|
5435
5540
|
const gradeTimestamp = nowFn();
|
|
5436
5541
|
const { score, evaluatorResults } = await runEvaluatorsForCase({
|
|
@@ -5444,9 +5549,8 @@ async function evaluateCandidate(options) {
|
|
|
5444
5549
|
now: gradeTimestamp,
|
|
5445
5550
|
judgeProvider,
|
|
5446
5551
|
agentTimeoutMs,
|
|
5447
|
-
|
|
5448
|
-
|
|
5449
|
-
candidateTraceSummary
|
|
5552
|
+
outputMessages,
|
|
5553
|
+
traceSummary
|
|
5450
5554
|
});
|
|
5451
5555
|
const completedAt = nowFn();
|
|
5452
5556
|
let agentProviderRequest;
|
|
@@ -5484,7 +5588,7 @@ async function evaluateCandidate(options) {
|
|
|
5484
5588
|
lm_provider_request: lmProviderRequest,
|
|
5485
5589
|
evaluator_provider_request: evaluatorResults ? void 0 : score.evaluatorRawRequest,
|
|
5486
5590
|
evaluator_results: evaluatorResults,
|
|
5487
|
-
trace_summary:
|
|
5591
|
+
trace_summary: traceSummary
|
|
5488
5592
|
};
|
|
5489
5593
|
}
|
|
5490
5594
|
async function runEvaluatorsForCase(options) {
|
|
@@ -5499,9 +5603,8 @@ async function runEvaluatorsForCase(options) {
|
|
|
5499
5603
|
now,
|
|
5500
5604
|
judgeProvider,
|
|
5501
5605
|
agentTimeoutMs,
|
|
5502
|
-
|
|
5503
|
-
|
|
5504
|
-
candidateTraceSummary
|
|
5606
|
+
outputMessages,
|
|
5607
|
+
traceSummary
|
|
5505
5608
|
} = options;
|
|
5506
5609
|
if (evalCase.evaluators && evalCase.evaluators.length > 0) {
|
|
5507
5610
|
return runEvaluatorList({
|
|
@@ -5516,9 +5619,8 @@ async function runEvaluatorsForCase(options) {
|
|
|
5516
5619
|
now,
|
|
5517
5620
|
judgeProvider,
|
|
5518
5621
|
agentTimeoutMs,
|
|
5519
|
-
|
|
5520
|
-
|
|
5521
|
-
candidateTraceSummary
|
|
5622
|
+
outputMessages,
|
|
5623
|
+
traceSummary
|
|
5522
5624
|
});
|
|
5523
5625
|
}
|
|
5524
5626
|
const evaluatorKind = evalCase.evaluator ?? "llm_judge";
|
|
@@ -5535,9 +5637,8 @@ async function runEvaluatorsForCase(options) {
|
|
|
5535
5637
|
promptInputs,
|
|
5536
5638
|
now,
|
|
5537
5639
|
judgeProvider,
|
|
5538
|
-
|
|
5539
|
-
|
|
5540
|
-
candidateTraceSummary
|
|
5640
|
+
outputMessages,
|
|
5641
|
+
traceSummary
|
|
5541
5642
|
});
|
|
5542
5643
|
return { score };
|
|
5543
5644
|
}
|
|
@@ -5554,9 +5655,8 @@ async function runEvaluatorList(options) {
|
|
|
5554
5655
|
now,
|
|
5555
5656
|
judgeProvider,
|
|
5556
5657
|
agentTimeoutMs,
|
|
5557
|
-
|
|
5558
|
-
|
|
5559
|
-
candidateTraceSummary
|
|
5658
|
+
outputMessages,
|
|
5659
|
+
traceSummary
|
|
5560
5660
|
} = options;
|
|
5561
5661
|
const scored = [];
|
|
5562
5662
|
const evaluatorResults = [];
|
|
@@ -5603,8 +5703,8 @@ async function runEvaluatorList(options) {
|
|
|
5603
5703
|
attempt,
|
|
5604
5704
|
promptInputs,
|
|
5605
5705
|
now,
|
|
5606
|
-
|
|
5607
|
-
|
|
5706
|
+
outputMessages,
|
|
5707
|
+
traceSummary
|
|
5608
5708
|
});
|
|
5609
5709
|
const weight = evaluator.weight ?? 1;
|
|
5610
5710
|
scored.push({ score: score2, name: evaluator.name, type: "code_judge", weight });
|
|
@@ -5690,9 +5790,8 @@ async function runEvaluatorList(options) {
|
|
|
5690
5790
|
attempt,
|
|
5691
5791
|
promptInputs,
|
|
5692
5792
|
now,
|
|
5693
|
-
|
|
5694
|
-
|
|
5695
|
-
candidateTraceSummary
|
|
5793
|
+
outputMessages,
|
|
5794
|
+
traceSummary
|
|
5696
5795
|
});
|
|
5697
5796
|
const weight = evaluator.weight ?? 1;
|
|
5698
5797
|
scored.push({ score: score2, name: evaluator.name, type: evaluator.type, weight });
|
|
@@ -6086,8 +6185,6 @@ function createAgentKernel() {
|
|
|
6086
6185
|
isJsonValue,
|
|
6087
6186
|
isTestMessage,
|
|
6088
6187
|
isTestMessageRole,
|
|
6089
|
-
isTraceEvent,
|
|
6090
|
-
isTraceEventType,
|
|
6091
6188
|
listTargetNames,
|
|
6092
6189
|
loadEvalCases,
|
|
6093
6190
|
normalizeLineEndings,
|