@agentv/core 1.3.1 → 1.4.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/README.md +77 -77
- package/dist/{chunk-4A6L2F6L.js → chunk-KPHTMTZ3.js} +23 -4
- package/dist/chunk-KPHTMTZ3.js.map +1 -0
- package/dist/evaluation/validation/index.cjs +5 -1
- package/dist/evaluation/validation/index.cjs.map +1 -1
- package/dist/evaluation/validation/index.js +6 -2
- package/dist/evaluation/validation/index.js.map +1 -1
- package/dist/index.cjs +246 -149
- package/dist/index.cjs.map +1 -1
- package/dist/index.d.cts +58 -50
- package/dist/index.d.ts +58 -50
- package/dist/index.js +215 -135
- package/dist/index.js.map +1 -1
- package/package.json +2 -5
- package/dist/chunk-4A6L2F6L.js.map +0 -1
package/dist/index.js
CHANGED
|
@@ -1,6 +1,7 @@
|
|
|
1
1
|
import {
|
|
2
2
|
buildDirectoryChain,
|
|
3
3
|
buildSearchRoots,
|
|
4
|
+
extractLastAssistantContent,
|
|
4
5
|
fileExists,
|
|
5
6
|
findGitRoot,
|
|
6
7
|
isAgentProvider,
|
|
@@ -9,7 +10,7 @@ import {
|
|
|
9
10
|
readTextFile,
|
|
10
11
|
resolveFileReference,
|
|
11
12
|
resolveTargetDefinition
|
|
12
|
-
} from "./chunk-4A6L2F6L.js";
|
|
13
|
+
} from "./chunk-KPHTMTZ3.js";
|
|
13
14
|
|
|
14
15
|
// src/evaluation/types.ts
|
|
15
16
|
var TEST_MESSAGE_ROLE_VALUES = ["system", "user", "assistant", "tool"];
|
|
@@ -74,33 +75,22 @@ function getHitCount(result) {
|
|
|
74
75
|
}
|
|
75
76
|
|
|
76
77
|
// src/evaluation/trace.ts
|
|
77
|
-
function isTraceEventType(value) {
|
|
78
|
-
return typeof value === "string" && ["model_step", "tool_call", "tool_result", "message", "error"].includes(value);
|
|
79
|
-
}
|
|
80
|
-
function isTraceEvent(value) {
|
|
81
|
-
if (typeof value !== "object" || value === null) {
|
|
82
|
-
return false;
|
|
83
|
-
}
|
|
84
|
-
const candidate = value;
|
|
85
|
-
return isTraceEventType(candidate.type) && typeof candidate.timestamp === "string";
|
|
86
|
-
}
|
|
87
|
-
function computeTraceSummary(trace) {
|
|
78
|
+
function computeTraceSummary(messages) {
|
|
88
79
|
const toolCallCounts = {};
|
|
89
|
-
let
|
|
90
|
-
for (const
|
|
91
|
-
if (
|
|
92
|
-
|
|
93
|
-
|
|
94
|
-
|
|
95
|
-
errorCount++;
|
|
80
|
+
let totalToolCalls = 0;
|
|
81
|
+
for (const message of messages) {
|
|
82
|
+
if (!message.toolCalls) continue;
|
|
83
|
+
for (const toolCall of message.toolCalls) {
|
|
84
|
+
toolCallCounts[toolCall.tool] = (toolCallCounts[toolCall.tool] ?? 0) + 1;
|
|
85
|
+
totalToolCalls++;
|
|
96
86
|
}
|
|
97
87
|
}
|
|
98
88
|
const toolNames = Object.keys(toolCallCounts).sort();
|
|
99
89
|
return {
|
|
100
|
-
eventCount:
|
|
90
|
+
eventCount: totalToolCalls,
|
|
101
91
|
toolNames,
|
|
102
92
|
toolCallsByName: toolCallCounts,
|
|
103
|
-
errorCount
|
|
93
|
+
errorCount: 0
|
|
104
94
|
};
|
|
105
95
|
}
|
|
106
96
|
|
|
@@ -376,7 +366,8 @@ var TEMPLATE_VARIABLES = {
|
|
|
376
366
|
QUESTION: "question",
|
|
377
367
|
EXPECTED_OUTCOME: "expected_outcome",
|
|
378
368
|
REFERENCE_ANSWER: "reference_answer",
|
|
379
|
-
INPUT_MESSAGES: "input_messages"
|
|
369
|
+
INPUT_MESSAGES: "input_messages",
|
|
370
|
+
OUTPUT_MESSAGES: "output_messages"
|
|
380
371
|
};
|
|
381
372
|
var VALID_TEMPLATE_VARIABLES = new Set(Object.values(TEMPLATE_VARIABLES));
|
|
382
373
|
var REQUIRED_TEMPLATE_VARIABLES = /* @__PURE__ */ new Set([
|
|
@@ -1259,16 +1250,16 @@ async function loadEvalCases(evalFilePath, repoRoot, options) {
|
|
|
1259
1250
|
}) : [];
|
|
1260
1251
|
const codeSnippets = extractCodeBlocks(inputSegments);
|
|
1261
1252
|
let referenceAnswer = "";
|
|
1262
|
-
if (outputSegments.length >
|
|
1263
|
-
|
|
1264
|
-
|
|
1265
|
-
const
|
|
1266
|
-
if (typeof
|
|
1267
|
-
referenceAnswer =
|
|
1268
|
-
} else if (
|
|
1269
|
-
referenceAnswer = JSON.stringify(
|
|
1270
|
-
} else if (
|
|
1271
|
-
referenceAnswer = JSON.stringify(
|
|
1253
|
+
if (outputSegments.length > 0) {
|
|
1254
|
+
const lastMessage = outputSegments[outputSegments.length - 1];
|
|
1255
|
+
const content = lastMessage.content;
|
|
1256
|
+
const toolCalls = lastMessage.tool_calls;
|
|
1257
|
+
if (typeof content === "string") {
|
|
1258
|
+
referenceAnswer = content;
|
|
1259
|
+
} else if (content !== void 0 && content !== null) {
|
|
1260
|
+
referenceAnswer = JSON.stringify(content, null, 2);
|
|
1261
|
+
} else if (toolCalls !== void 0 && toolCalls !== null) {
|
|
1262
|
+
referenceAnswer = JSON.stringify(toolCalls, null, 2);
|
|
1272
1263
|
}
|
|
1273
1264
|
}
|
|
1274
1265
|
const question = inputTextParts.map((part) => part.trim()).filter((part) => part.length > 0).join(" ");
|
|
@@ -1596,11 +1587,11 @@ async function invokeModel(options) {
|
|
|
1596
1587
|
return mapResponse(result);
|
|
1597
1588
|
}
|
|
1598
1589
|
function mapResponse(result) {
|
|
1590
|
+
const content = result.text ?? "";
|
|
1599
1591
|
return {
|
|
1600
|
-
text: result.text ?? "",
|
|
1601
|
-
reasoning: result.reasoningText ?? void 0,
|
|
1602
1592
|
raw: result,
|
|
1603
|
-
usage: toJsonObject(result.totalUsage ?? result.usage)
|
|
1593
|
+
usage: toJsonObject(result.totalUsage ?? result.usage),
|
|
1594
|
+
outputMessages: [{ role: "assistant", content }]
|
|
1604
1595
|
};
|
|
1605
1596
|
}
|
|
1606
1597
|
function toJsonObject(value) {
|
|
@@ -1753,6 +1744,7 @@ var CliProvider = class {
|
|
|
1753
1744
|
config;
|
|
1754
1745
|
runCommand;
|
|
1755
1746
|
verbose;
|
|
1747
|
+
keepTempFiles;
|
|
1756
1748
|
healthcheckPromise;
|
|
1757
1749
|
constructor(targetName, config, runner = defaultCommandRunner) {
|
|
1758
1750
|
this.targetName = targetName;
|
|
@@ -1760,6 +1752,7 @@ var CliProvider = class {
|
|
|
1760
1752
|
this.config = config;
|
|
1761
1753
|
this.runCommand = runner;
|
|
1762
1754
|
this.verbose = config.verbose ?? false;
|
|
1755
|
+
this.keepTempFiles = config.keepTempFiles ?? false;
|
|
1763
1756
|
}
|
|
1764
1757
|
async invoke(request) {
|
|
1765
1758
|
if (request.signal?.aborted) {
|
|
@@ -1797,8 +1790,7 @@ var CliProvider = class {
|
|
|
1797
1790
|
const responseContent = await this.readAndCleanupOutputFile(outputFilePath);
|
|
1798
1791
|
const parsed = this.parseOutputContent(responseContent);
|
|
1799
1792
|
return {
|
|
1800
|
-
|
|
1801
|
-
trace: parsed.trace,
|
|
1793
|
+
outputMessages: parsed.outputMessages,
|
|
1802
1794
|
raw: {
|
|
1803
1795
|
command: renderedCommand,
|
|
1804
1796
|
stderr: result.stderr,
|
|
@@ -1877,7 +1869,7 @@ var CliProvider = class {
|
|
|
1877
1869
|
const evalCaseId = request.evalCaseId;
|
|
1878
1870
|
if (!evalCaseId) {
|
|
1879
1871
|
return {
|
|
1880
|
-
|
|
1872
|
+
outputMessages: [],
|
|
1881
1873
|
raw: {
|
|
1882
1874
|
command: renderedCommand,
|
|
1883
1875
|
stderr: result.stderr,
|
|
@@ -1890,7 +1882,7 @@ var CliProvider = class {
|
|
|
1890
1882
|
const parsed = recordsById.get(evalCaseId);
|
|
1891
1883
|
if (!parsed) {
|
|
1892
1884
|
return {
|
|
1893
|
-
|
|
1885
|
+
outputMessages: [],
|
|
1894
1886
|
raw: {
|
|
1895
1887
|
command: renderedCommand,
|
|
1896
1888
|
stderr: result.stderr,
|
|
@@ -1901,9 +1893,7 @@ var CliProvider = class {
|
|
|
1901
1893
|
};
|
|
1902
1894
|
}
|
|
1903
1895
|
return {
|
|
1904
|
-
|
|
1905
|
-
trace: parsed.trace,
|
|
1906
|
-
traceRef: parsed.traceRef,
|
|
1896
|
+
outputMessages: parsed.outputMessages,
|
|
1907
1897
|
raw: {
|
|
1908
1898
|
command: renderedCommand,
|
|
1909
1899
|
stderr: result.stderr,
|
|
@@ -1918,28 +1908,81 @@ var CliProvider = class {
|
|
|
1918
1908
|
}
|
|
1919
1909
|
/**
|
|
1920
1910
|
* Parse output content from CLI.
|
|
1921
|
-
* If the content is valid JSON with
|
|
1922
|
-
*
|
|
1911
|
+
* If the content is valid JSON with 'output_messages' or 'text' field, extract them.
|
|
1912
|
+
* If only 'text' is provided, wrap it in outputMessages.
|
|
1913
|
+
* Otherwise, treat the entire content as plain text wrapped in outputMessages.
|
|
1923
1914
|
*/
|
|
1924
1915
|
parseOutputContent(content) {
|
|
1925
1916
|
try {
|
|
1926
1917
|
const parsed = JSON.parse(content);
|
|
1927
|
-
if (typeof parsed === "object" && parsed !== null
|
|
1918
|
+
if (typeof parsed === "object" && parsed !== null) {
|
|
1928
1919
|
const obj = parsed;
|
|
1929
|
-
const
|
|
1930
|
-
|
|
1931
|
-
|
|
1920
|
+
const outputMessages = this.parseOutputMessages(obj.output_messages);
|
|
1921
|
+
if (outputMessages && outputMessages.length > 0) {
|
|
1922
|
+
return { outputMessages };
|
|
1923
|
+
}
|
|
1924
|
+
if ("text" in obj) {
|
|
1925
|
+
const text = typeof obj.text === "string" ? obj.text : String(obj.text);
|
|
1926
|
+
return { outputMessages: [{ role: "assistant", content: text }] };
|
|
1927
|
+
}
|
|
1932
1928
|
}
|
|
1933
1929
|
} catch {
|
|
1934
1930
|
}
|
|
1935
|
-
return {
|
|
1931
|
+
return { outputMessages: [{ role: "assistant", content }] };
|
|
1936
1932
|
}
|
|
1937
|
-
|
|
1938
|
-
|
|
1933
|
+
/**
|
|
1934
|
+
* Parse output_messages from JSONL (snake_case) and convert to OutputMessage[] (camelCase).
|
|
1935
|
+
*/
|
|
1936
|
+
parseOutputMessages(outputMessages) {
|
|
1937
|
+
if (!Array.isArray(outputMessages)) {
|
|
1939
1938
|
return void 0;
|
|
1940
1939
|
}
|
|
1941
|
-
const
|
|
1942
|
-
|
|
1940
|
+
const messages = [];
|
|
1941
|
+
for (const msg of outputMessages) {
|
|
1942
|
+
if (typeof msg !== "object" || msg === null) {
|
|
1943
|
+
continue;
|
|
1944
|
+
}
|
|
1945
|
+
const rawMsg = msg;
|
|
1946
|
+
if (typeof rawMsg.role !== "string") {
|
|
1947
|
+
continue;
|
|
1948
|
+
}
|
|
1949
|
+
const message = {
|
|
1950
|
+
role: rawMsg.role,
|
|
1951
|
+
name: typeof rawMsg.name === "string" ? rawMsg.name : void 0,
|
|
1952
|
+
content: rawMsg.content,
|
|
1953
|
+
toolCalls: this.parseToolCalls(rawMsg.tool_calls),
|
|
1954
|
+
timestamp: typeof rawMsg.timestamp === "string" ? rawMsg.timestamp : void 0,
|
|
1955
|
+
metadata: typeof rawMsg.metadata === "object" && rawMsg.metadata !== null ? rawMsg.metadata : void 0
|
|
1956
|
+
};
|
|
1957
|
+
messages.push(message);
|
|
1958
|
+
}
|
|
1959
|
+
return messages.length > 0 ? messages : void 0;
|
|
1960
|
+
}
|
|
1961
|
+
/**
|
|
1962
|
+
* Parse tool_calls from JSONL (snake_case) and convert to ToolCall[] format.
|
|
1963
|
+
*/
|
|
1964
|
+
parseToolCalls(toolCalls) {
|
|
1965
|
+
if (!Array.isArray(toolCalls)) {
|
|
1966
|
+
return void 0;
|
|
1967
|
+
}
|
|
1968
|
+
const calls = [];
|
|
1969
|
+
for (const call of toolCalls) {
|
|
1970
|
+
if (typeof call !== "object" || call === null) {
|
|
1971
|
+
continue;
|
|
1972
|
+
}
|
|
1973
|
+
const rawCall = call;
|
|
1974
|
+
if (typeof rawCall.tool !== "string") {
|
|
1975
|
+
continue;
|
|
1976
|
+
}
|
|
1977
|
+
calls.push({
|
|
1978
|
+
tool: rawCall.tool,
|
|
1979
|
+
input: rawCall.input,
|
|
1980
|
+
output: rawCall.output,
|
|
1981
|
+
id: typeof rawCall.id === "string" ? rawCall.id : void 0,
|
|
1982
|
+
timestamp: typeof rawCall.timestamp === "string" ? rawCall.timestamp : void 0
|
|
1983
|
+
});
|
|
1984
|
+
}
|
|
1985
|
+
return calls.length > 0 ? calls : void 0;
|
|
1943
1986
|
}
|
|
1944
1987
|
parseJsonlBatchOutput(content) {
|
|
1945
1988
|
const records = /* @__PURE__ */ new Map();
|
|
@@ -1963,12 +2006,16 @@ var CliProvider = class {
|
|
|
1963
2006
|
if (records.has(id)) {
|
|
1964
2007
|
throw new Error(`CLI batch output contains duplicate id: ${id}`);
|
|
1965
2008
|
}
|
|
1966
|
-
const
|
|
1967
|
-
|
|
2009
|
+
const parsedOutputMessages = this.parseOutputMessages(obj.output_messages);
|
|
2010
|
+
let outputMessages;
|
|
2011
|
+
if (parsedOutputMessages && parsedOutputMessages.length > 0) {
|
|
2012
|
+
outputMessages = parsedOutputMessages;
|
|
2013
|
+
} else {
|
|
2014
|
+
const text = typeof obj.text === "string" ? obj.text : obj.text === void 0 ? "" : JSON.stringify(obj.text);
|
|
2015
|
+
outputMessages = text ? [{ role: "assistant", content: text }] : [];
|
|
2016
|
+
}
|
|
1968
2017
|
records.set(id, {
|
|
1969
|
-
|
|
1970
|
-
trace: this.parseTrace(obj.trace),
|
|
1971
|
-
traceRef
|
|
2018
|
+
outputMessages
|
|
1972
2019
|
});
|
|
1973
2020
|
}
|
|
1974
2021
|
return records;
|
|
@@ -1981,8 +2028,10 @@ var CliProvider = class {
|
|
|
1981
2028
|
const errorMsg = error instanceof Error ? error.message : String(error);
|
|
1982
2029
|
throw new Error(`Failed to read output file '${filePath}': ${errorMsg}`);
|
|
1983
2030
|
} finally {
|
|
1984
|
-
|
|
1985
|
-
|
|
2031
|
+
if (!this.keepTempFiles) {
|
|
2032
|
+
await fs.unlink(filePath).catch(() => {
|
|
2033
|
+
});
|
|
2034
|
+
}
|
|
1986
2035
|
}
|
|
1987
2036
|
}
|
|
1988
2037
|
async ensureHealthy(signal) {
|
|
@@ -2325,7 +2374,6 @@ var CodexProvider = class {
|
|
|
2325
2374
|
const parsed = parseCodexJson(result.stdout);
|
|
2326
2375
|
const assistantText = extractAssistantText(parsed);
|
|
2327
2376
|
return {
|
|
2328
|
-
text: assistantText,
|
|
2329
2377
|
raw: {
|
|
2330
2378
|
response: parsed,
|
|
2331
2379
|
stdout: result.stdout,
|
|
@@ -2337,7 +2385,8 @@ var CodexProvider = class {
|
|
|
2337
2385
|
workspace: workspaceRoot,
|
|
2338
2386
|
inputFiles,
|
|
2339
2387
|
logFile: logger?.filePath
|
|
2340
|
-
}
|
|
2388
|
+
},
|
|
2389
|
+
outputMessages: [{ role: "assistant", content: assistantText }]
|
|
2341
2390
|
};
|
|
2342
2391
|
} finally {
|
|
2343
2392
|
await logger?.close();
|
|
@@ -2959,7 +3008,6 @@ var MockProvider = class {
|
|
|
2959
3008
|
delayMs;
|
|
2960
3009
|
delayMinMs;
|
|
2961
3010
|
delayMaxMs;
|
|
2962
|
-
trace;
|
|
2963
3011
|
constructor(targetName, config) {
|
|
2964
3012
|
this.id = `mock:${targetName}`;
|
|
2965
3013
|
this.targetName = targetName;
|
|
@@ -2967,7 +3015,6 @@ var MockProvider = class {
|
|
|
2967
3015
|
this.delayMs = config.delayMs ?? 0;
|
|
2968
3016
|
this.delayMinMs = config.delayMinMs ?? 0;
|
|
2969
3017
|
this.delayMaxMs = config.delayMaxMs ?? 0;
|
|
2970
|
-
this.trace = config.trace;
|
|
2971
3018
|
}
|
|
2972
3019
|
async invoke(request) {
|
|
2973
3020
|
const delay = this.calculateDelay();
|
|
@@ -2975,12 +3022,11 @@ var MockProvider = class {
|
|
|
2975
3022
|
await new Promise((resolve) => setTimeout(resolve, delay));
|
|
2976
3023
|
}
|
|
2977
3024
|
return {
|
|
2978
|
-
|
|
3025
|
+
outputMessages: [{ role: "assistant", content: this.cannedResponse }],
|
|
2979
3026
|
raw: {
|
|
2980
3027
|
question: request.question,
|
|
2981
3028
|
guidelines: request.guidelines
|
|
2982
|
-
}
|
|
2983
|
-
trace: this.trace
|
|
3029
|
+
}
|
|
2984
3030
|
};
|
|
2985
3031
|
}
|
|
2986
3032
|
calculateDelay() {
|
|
@@ -3073,7 +3119,7 @@ var VSCodeProvider = class {
|
|
|
3073
3119
|
}
|
|
3074
3120
|
if (this.config.dryRun) {
|
|
3075
3121
|
return {
|
|
3076
|
-
|
|
3122
|
+
outputMessages: [],
|
|
3077
3123
|
raw: {
|
|
3078
3124
|
session,
|
|
3079
3125
|
inputFiles
|
|
@@ -3082,7 +3128,7 @@ var VSCodeProvider = class {
|
|
|
3082
3128
|
}
|
|
3083
3129
|
const responseText = await readTextFile(session.responseFile);
|
|
3084
3130
|
return {
|
|
3085
|
-
|
|
3131
|
+
outputMessages: [{ role: "assistant", content: responseText }],
|
|
3086
3132
|
raw: {
|
|
3087
3133
|
session,
|
|
3088
3134
|
inputFiles
|
|
@@ -3120,7 +3166,7 @@ var VSCodeProvider = class {
|
|
|
3120
3166
|
}
|
|
3121
3167
|
if (this.config.dryRun) {
|
|
3122
3168
|
return normalizedRequests.map(({ inputFiles }) => ({
|
|
3123
|
-
|
|
3169
|
+
outputMessages: [],
|
|
3124
3170
|
raw: {
|
|
3125
3171
|
session,
|
|
3126
3172
|
inputFiles,
|
|
@@ -3137,7 +3183,7 @@ var VSCodeProvider = class {
|
|
|
3137
3183
|
for (const [index, responseFile] of session.responseFiles.entries()) {
|
|
3138
3184
|
const responseText = await readTextFile(responseFile);
|
|
3139
3185
|
responses.push({
|
|
3140
|
-
|
|
3186
|
+
outputMessages: [{ role: "assistant", content: responseText }],
|
|
3141
3187
|
raw: {
|
|
3142
3188
|
session,
|
|
3143
3189
|
inputFiles: normalizedRequests[index]?.inputFiles,
|
|
@@ -3441,6 +3487,7 @@ var LlmJudgeEvaluator = class {
|
|
|
3441
3487
|
null,
|
|
3442
3488
|
2
|
|
3443
3489
|
),
|
|
3490
|
+
[TEMPLATE_VARIABLES.OUTPUT_MESSAGES]: JSON.stringify(context.outputMessages ?? [], null, 2),
|
|
3444
3491
|
[TEMPLATE_VARIABLES.CANDIDATE_ANSWER]: context.candidate.trim(),
|
|
3445
3492
|
[TEMPLATE_VARIABLES.REFERENCE_ANSWER]: (context.evalCase.reference_answer ?? "").trim(),
|
|
3446
3493
|
[TEMPLATE_VARIABLES.EXPECTED_OUTCOME]: context.evalCase.expected_outcome.trim(),
|
|
@@ -3465,7 +3512,7 @@ var LlmJudgeEvaluator = class {
|
|
|
3465
3512
|
const score = clampScore(data.score);
|
|
3466
3513
|
const hits = Array.isArray(data.hits) ? data.hits.filter(isNonEmptyString).slice(0, 4) : [];
|
|
3467
3514
|
const misses = Array.isArray(data.misses) ? data.misses.filter(isNonEmptyString).slice(0, 4) : [];
|
|
3468
|
-
const reasoning = data.reasoning
|
|
3515
|
+
const reasoning = data.reasoning;
|
|
3469
3516
|
const expectedAspectCount = Math.max(hits.length + misses.length, 1);
|
|
3470
3517
|
return {
|
|
3471
3518
|
score,
|
|
@@ -3567,7 +3614,9 @@ var LlmJudgeEvaluator = class {
|
|
|
3567
3614
|
maxOutputTokens: this.maxOutputTokens,
|
|
3568
3615
|
temperature: this.temperature
|
|
3569
3616
|
});
|
|
3570
|
-
const data = schema.parse(
|
|
3617
|
+
const data = schema.parse(
|
|
3618
|
+
parseJsonFromText(extractLastAssistantContent(response.outputMessages))
|
|
3619
|
+
);
|
|
3571
3620
|
return { data, providerResponse: response };
|
|
3572
3621
|
} catch (e) {
|
|
3573
3622
|
lastError = e instanceof Error ? e : new Error(String(e));
|
|
@@ -3653,13 +3702,13 @@ var CodeEvaluator = class {
|
|
|
3653
3702
|
expected_messages: context.evalCase.expected_messages,
|
|
3654
3703
|
reference_answer: context.evalCase.reference_answer,
|
|
3655
3704
|
candidate_answer: context.candidate,
|
|
3705
|
+
output_messages: context.outputMessages ?? null,
|
|
3656
3706
|
guideline_files: context.evalCase.guideline_paths,
|
|
3657
3707
|
input_files: context.evalCase.file_paths.filter(
|
|
3658
3708
|
(path13) => !context.evalCase.guideline_paths.includes(path13)
|
|
3659
3709
|
),
|
|
3660
3710
|
input_messages: context.evalCase.input_messages,
|
|
3661
|
-
|
|
3662
|
-
candidate_trace_summary: context.candidateTraceSummary ?? null
|
|
3711
|
+
candidate_trace_summary: context.traceSummary ?? null
|
|
3663
3712
|
},
|
|
3664
3713
|
null,
|
|
3665
3714
|
2
|
|
@@ -3786,8 +3835,19 @@ var ToolTrajectoryEvaluator = class {
|
|
|
3786
3835
|
this.config = options.config;
|
|
3787
3836
|
}
|
|
3788
3837
|
evaluate(context) {
|
|
3789
|
-
const {
|
|
3790
|
-
|
|
3838
|
+
const { outputMessages, traceSummary } = context;
|
|
3839
|
+
const toolCalls = this.extractToolCallsFromMessages(outputMessages);
|
|
3840
|
+
if (toolCalls.length === 0 && !traceSummary) {
|
|
3841
|
+
return {
|
|
3842
|
+
score: 0,
|
|
3843
|
+
verdict: "fail",
|
|
3844
|
+
hits: [],
|
|
3845
|
+
misses: ["No trace available for evaluation"],
|
|
3846
|
+
expectedAspectCount: 1
|
|
3847
|
+
};
|
|
3848
|
+
}
|
|
3849
|
+
const summary = toolCalls.length > 0 ? this.buildSummary(toolCalls) : traceSummary;
|
|
3850
|
+
if (!summary) {
|
|
3791
3851
|
return {
|
|
3792
3852
|
score: 0,
|
|
3793
3853
|
verdict: "fail",
|
|
@@ -3798,11 +3858,11 @@ var ToolTrajectoryEvaluator = class {
|
|
|
3798
3858
|
}
|
|
3799
3859
|
switch (this.config.mode) {
|
|
3800
3860
|
case "any_order":
|
|
3801
|
-
return this.evaluateAnyOrder(
|
|
3861
|
+
return this.evaluateAnyOrder(summary);
|
|
3802
3862
|
case "in_order":
|
|
3803
|
-
return this.evaluateInOrder(
|
|
3863
|
+
return this.evaluateInOrder(toolCalls);
|
|
3804
3864
|
case "exact":
|
|
3805
|
-
return this.evaluateExact(
|
|
3865
|
+
return this.evaluateExact(toolCalls);
|
|
3806
3866
|
default:
|
|
3807
3867
|
return {
|
|
3808
3868
|
score: 0,
|
|
@@ -3813,6 +3873,39 @@ var ToolTrajectoryEvaluator = class {
|
|
|
3813
3873
|
};
|
|
3814
3874
|
}
|
|
3815
3875
|
}
|
|
3876
|
+
/**
|
|
3877
|
+
* Extract tool calls from output messages.
|
|
3878
|
+
*/
|
|
3879
|
+
extractToolCallsFromMessages(messages) {
|
|
3880
|
+
if (!messages) {
|
|
3881
|
+
return [];
|
|
3882
|
+
}
|
|
3883
|
+
const toolCalls = [];
|
|
3884
|
+
for (const message of messages) {
|
|
3885
|
+
if (message.toolCalls) {
|
|
3886
|
+
for (const call of message.toolCalls) {
|
|
3887
|
+
toolCalls.push({ name: call.tool });
|
|
3888
|
+
}
|
|
3889
|
+
}
|
|
3890
|
+
}
|
|
3891
|
+
return toolCalls;
|
|
3892
|
+
}
|
|
3893
|
+
/**
|
|
3894
|
+
* Build a summary from extracted tool calls.
|
|
3895
|
+
*/
|
|
3896
|
+
buildSummary(toolCalls) {
|
|
3897
|
+
const toolCallsByName = {};
|
|
3898
|
+
for (const call of toolCalls) {
|
|
3899
|
+
toolCallsByName[call.name] = (toolCallsByName[call.name] ?? 0) + 1;
|
|
3900
|
+
}
|
|
3901
|
+
const toolNames = Object.keys(toolCallsByName).sort();
|
|
3902
|
+
return {
|
|
3903
|
+
eventCount: toolCalls.length,
|
|
3904
|
+
toolNames,
|
|
3905
|
+
toolCallsByName,
|
|
3906
|
+
errorCount: 0
|
|
3907
|
+
};
|
|
3908
|
+
}
|
|
3816
3909
|
evaluateAnyOrder(summary) {
|
|
3817
3910
|
const minimums = this.config.minimums ?? {};
|
|
3818
3911
|
const toolNames = Object.keys(minimums);
|
|
@@ -3845,7 +3938,7 @@ var ToolTrajectoryEvaluator = class {
|
|
|
3845
3938
|
expectedAspectCount: toolNames.length
|
|
3846
3939
|
};
|
|
3847
3940
|
}
|
|
3848
|
-
evaluateInOrder(trace) {
|
|
3941
|
+
evaluateInOrder(toolCalls) {
|
|
3849
3942
|
const expected = this.config.expected ?? [];
|
|
3850
3943
|
if (expected.length === 0) {
|
|
3851
3944
|
return {
|
|
@@ -3856,15 +3949,14 @@ var ToolTrajectoryEvaluator = class {
|
|
|
3856
3949
|
expectedAspectCount: 0
|
|
3857
3950
|
};
|
|
3858
3951
|
}
|
|
3859
|
-
const actualToolCalls = trace.filter((e) => e.type === "tool_call" && e.name);
|
|
3860
3952
|
const hits = [];
|
|
3861
3953
|
const misses = [];
|
|
3862
3954
|
let actualIndex = 0;
|
|
3863
3955
|
for (let i = 0; i < expected.length; i++) {
|
|
3864
3956
|
const expectedTool = expected[i].tool;
|
|
3865
3957
|
let found = false;
|
|
3866
|
-
while (actualIndex < actualToolCalls.length) {
|
|
3867
|
-
if (actualToolCalls[actualIndex].name === expectedTool) {
|
|
3958
|
+
while (actualIndex < toolCalls.length) {
|
|
3959
|
+
if (toolCalls[actualIndex].name === expectedTool) {
|
|
3868
3960
|
hits.push(`Found ${expectedTool} at position ${actualIndex}`);
|
|
3869
3961
|
actualIndex++;
|
|
3870
3962
|
found = true;
|
|
@@ -3885,7 +3977,7 @@ var ToolTrajectoryEvaluator = class {
|
|
|
3885
3977
|
expectedAspectCount: expected.length
|
|
3886
3978
|
};
|
|
3887
3979
|
}
|
|
3888
|
-
evaluateExact(trace) {
|
|
3980
|
+
evaluateExact(toolCalls) {
|
|
3889
3981
|
const expected = this.config.expected ?? [];
|
|
3890
3982
|
if (expected.length === 0) {
|
|
3891
3983
|
return {
|
|
@@ -3896,16 +3988,15 @@ var ToolTrajectoryEvaluator = class {
|
|
|
3896
3988
|
expectedAspectCount: 0
|
|
3897
3989
|
};
|
|
3898
3990
|
}
|
|
3899
|
-
const actualToolCalls = trace.filter((e) => e.type === "tool_call" && e.name);
|
|
3900
3991
|
const hits = [];
|
|
3901
3992
|
const misses = [];
|
|
3902
|
-
if (actualToolCalls.length !== expected.length) {
|
|
3903
|
-
misses.push(`Expected ${expected.length} tool calls, got ${actualToolCalls.length}`);
|
|
3993
|
+
if (toolCalls.length !== expected.length) {
|
|
3994
|
+
misses.push(`Expected ${expected.length} tool calls, got ${toolCalls.length}`);
|
|
3904
3995
|
}
|
|
3905
|
-
const checkLength = Math.min(expected.length, actualToolCalls.length);
|
|
3996
|
+
const checkLength = Math.min(expected.length, toolCalls.length);
|
|
3906
3997
|
for (let i = 0; i < checkLength; i++) {
|
|
3907
3998
|
const expectedTool = expected[i].tool;
|
|
3908
|
-
const actualTool = actualToolCalls[i].name;
|
|
3999
|
+
const actualTool = toolCalls[i].name;
|
|
3909
4000
|
if (actualTool === expectedTool) {
|
|
3910
4001
|
hits.push(`Position ${i}: ${expectedTool} \u2713`);
|
|
3911
4002
|
} else {
|
|
@@ -4119,11 +4210,13 @@ var CompositeEvaluator = class {
|
|
|
4119
4210
|
evalCaseId: context.evalCase.id,
|
|
4120
4211
|
attempt: context.attempt
|
|
4121
4212
|
});
|
|
4122
|
-
const data = freeformEvaluationSchema.parse(
|
|
4213
|
+
const data = freeformEvaluationSchema.parse(
|
|
4214
|
+
parseJsonFromText(extractLastAssistantContent(response.outputMessages))
|
|
4215
|
+
);
|
|
4123
4216
|
const score = clampScore(data.score);
|
|
4124
4217
|
const hits = Array.isArray(data.hits) ? data.hits.filter(isNonEmptyString).slice(0, 4) : [];
|
|
4125
4218
|
const misses = Array.isArray(data.misses) ? data.misses.filter(isNonEmptyString).slice(0, 4) : [];
|
|
4126
|
-
const reasoning = data.reasoning
|
|
4219
|
+
const reasoning = data.reasoning;
|
|
4127
4220
|
return {
|
|
4128
4221
|
score,
|
|
4129
4222
|
verdict: scoreToVerdict(score),
|
|
@@ -4546,11 +4639,14 @@ async function runBatchEvaluation(options) {
|
|
|
4546
4639
|
const evalCase = evalCases[i];
|
|
4547
4640
|
const promptInputs = promptInputsList[i];
|
|
4548
4641
|
const providerResponse = batchResponse[i];
|
|
4642
|
+
const outputMessages = providerResponse.outputMessages;
|
|
4643
|
+
const traceSummary = outputMessages ? computeTraceSummary(outputMessages) : void 0;
|
|
4644
|
+
const candidate = extractLastAssistantContent(outputMessages);
|
|
4549
4645
|
let result;
|
|
4550
4646
|
try {
|
|
4551
4647
|
result = await evaluateCandidate({
|
|
4552
4648
|
evalCase,
|
|
4553
|
-
candidate
|
|
4649
|
+
candidate,
|
|
4554
4650
|
target,
|
|
4555
4651
|
provider,
|
|
4556
4652
|
evaluators: evaluatorRegistry,
|
|
@@ -4558,7 +4654,9 @@ async function runBatchEvaluation(options) {
|
|
|
4558
4654
|
nowFn,
|
|
4559
4655
|
attempt: 0,
|
|
4560
4656
|
judgeProvider: await resolveJudgeProvider(target),
|
|
4561
|
-
agentTimeoutMs
|
|
4657
|
+
agentTimeoutMs,
|
|
4658
|
+
outputMessages,
|
|
4659
|
+
traceSummary
|
|
4562
4660
|
});
|
|
4563
4661
|
} catch (error) {
|
|
4564
4662
|
const errorResult = buildErrorResult(
|
|
@@ -4662,21 +4760,13 @@ async function runEvalCase(options) {
|
|
|
4662
4760
|
if (cacheKey && cache && !cachedResponse) {
|
|
4663
4761
|
await cache.set(cacheKey, providerResponse);
|
|
4664
4762
|
}
|
|
4665
|
-
|
|
4666
|
-
|
|
4667
|
-
|
|
4668
|
-
const rawTrace = await readJsonFile(providerResponse.traceRef);
|
|
4669
|
-
if (Array.isArray(rawTrace) && rawTrace.every(isTraceEvent)) {
|
|
4670
|
-
candidateTrace = rawTrace;
|
|
4671
|
-
}
|
|
4672
|
-
} catch {
|
|
4673
|
-
}
|
|
4674
|
-
}
|
|
4675
|
-
const candidateTraceSummary = candidateTrace ? computeTraceSummary(candidateTrace) : void 0;
|
|
4763
|
+
const outputMessages = providerResponse.outputMessages;
|
|
4764
|
+
const traceSummary = outputMessages ? computeTraceSummary(outputMessages) : void 0;
|
|
4765
|
+
const candidate = extractLastAssistantContent(outputMessages);
|
|
4676
4766
|
try {
|
|
4677
4767
|
return await evaluateCandidate({
|
|
4678
4768
|
evalCase,
|
|
4679
|
-
candidate
|
|
4769
|
+
candidate,
|
|
4680
4770
|
target,
|
|
4681
4771
|
provider,
|
|
4682
4772
|
evaluators,
|
|
@@ -4685,9 +4775,8 @@ async function runEvalCase(options) {
|
|
|
4685
4775
|
attempt,
|
|
4686
4776
|
judgeProvider,
|
|
4687
4777
|
agentTimeoutMs,
|
|
4688
|
-
|
|
4689
|
-
|
|
4690
|
-
candidateTraceSummary
|
|
4778
|
+
outputMessages,
|
|
4779
|
+
traceSummary
|
|
4691
4780
|
});
|
|
4692
4781
|
} catch (error) {
|
|
4693
4782
|
return buildErrorResult(evalCase, target.name, nowFn(), error, promptInputs, provider);
|
|
@@ -4705,9 +4794,8 @@ async function evaluateCandidate(options) {
|
|
|
4705
4794
|
attempt,
|
|
4706
4795
|
judgeProvider,
|
|
4707
4796
|
agentTimeoutMs,
|
|
4708
|
-
|
|
4709
|
-
|
|
4710
|
-
candidateTraceSummary
|
|
4797
|
+
outputMessages,
|
|
4798
|
+
traceSummary
|
|
4711
4799
|
} = options;
|
|
4712
4800
|
const gradeTimestamp = nowFn();
|
|
4713
4801
|
const { score, evaluatorResults } = await runEvaluatorsForCase({
|
|
@@ -4721,9 +4809,8 @@ async function evaluateCandidate(options) {
|
|
|
4721
4809
|
now: gradeTimestamp,
|
|
4722
4810
|
judgeProvider,
|
|
4723
4811
|
agentTimeoutMs,
|
|
4724
|
-
|
|
4725
|
-
|
|
4726
|
-
candidateTraceSummary
|
|
4812
|
+
outputMessages,
|
|
4813
|
+
traceSummary
|
|
4727
4814
|
});
|
|
4728
4815
|
const completedAt = nowFn();
|
|
4729
4816
|
let agentProviderRequest;
|
|
@@ -4761,7 +4848,7 @@ async function evaluateCandidate(options) {
|
|
|
4761
4848
|
lm_provider_request: lmProviderRequest,
|
|
4762
4849
|
evaluator_provider_request: evaluatorResults ? void 0 : score.evaluatorRawRequest,
|
|
4763
4850
|
evaluator_results: evaluatorResults,
|
|
4764
|
-
trace_summary: candidateTraceSummary
|
|
4851
|
+
trace_summary: traceSummary
|
|
4765
4852
|
};
|
|
4766
4853
|
}
|
|
4767
4854
|
async function runEvaluatorsForCase(options) {
|
|
@@ -4776,9 +4863,8 @@ async function runEvaluatorsForCase(options) {
|
|
|
4776
4863
|
now,
|
|
4777
4864
|
judgeProvider,
|
|
4778
4865
|
agentTimeoutMs,
|
|
4779
|
-
|
|
4780
|
-
|
|
4781
|
-
candidateTraceSummary
|
|
4866
|
+
outputMessages,
|
|
4867
|
+
traceSummary
|
|
4782
4868
|
} = options;
|
|
4783
4869
|
if (evalCase.evaluators && evalCase.evaluators.length > 0) {
|
|
4784
4870
|
return runEvaluatorList({
|
|
@@ -4793,9 +4879,8 @@ async function runEvaluatorsForCase(options) {
|
|
|
4793
4879
|
now,
|
|
4794
4880
|
judgeProvider,
|
|
4795
4881
|
agentTimeoutMs,
|
|
4796
|
-
|
|
4797
|
-
|
|
4798
|
-
candidateTraceSummary
|
|
4882
|
+
outputMessages,
|
|
4883
|
+
traceSummary
|
|
4799
4884
|
});
|
|
4800
4885
|
}
|
|
4801
4886
|
const evaluatorKind = evalCase.evaluator ?? "llm_judge";
|
|
@@ -4812,9 +4897,8 @@ async function runEvaluatorsForCase(options) {
|
|
|
4812
4897
|
promptInputs,
|
|
4813
4898
|
now,
|
|
4814
4899
|
judgeProvider,
|
|
4815
|
-
|
|
4816
|
-
|
|
4817
|
-
candidateTraceSummary
|
|
4900
|
+
outputMessages,
|
|
4901
|
+
traceSummary
|
|
4818
4902
|
});
|
|
4819
4903
|
return { score };
|
|
4820
4904
|
}
|
|
@@ -4831,9 +4915,8 @@ async function runEvaluatorList(options) {
|
|
|
4831
4915
|
now,
|
|
4832
4916
|
judgeProvider,
|
|
4833
4917
|
agentTimeoutMs,
|
|
4834
|
-
|
|
4835
|
-
|
|
4836
|
-
candidateTraceSummary
|
|
4918
|
+
outputMessages,
|
|
4919
|
+
traceSummary
|
|
4837
4920
|
} = options;
|
|
4838
4921
|
const scored = [];
|
|
4839
4922
|
const evaluatorResults = [];
|
|
@@ -4880,8 +4963,8 @@ async function runEvaluatorList(options) {
|
|
|
4880
4963
|
attempt,
|
|
4881
4964
|
promptInputs,
|
|
4882
4965
|
now,
|
|
4883
|
-
|
|
4884
|
-
|
|
4966
|
+
outputMessages,
|
|
4967
|
+
traceSummary
|
|
4885
4968
|
});
|
|
4886
4969
|
const weight = evaluator.weight ?? 1;
|
|
4887
4970
|
scored.push({ score: score2, name: evaluator.name, type: "code_judge", weight });
|
|
@@ -4967,9 +5050,8 @@ async function runEvaluatorList(options) {
|
|
|
4967
5050
|
attempt,
|
|
4968
5051
|
promptInputs,
|
|
4969
5052
|
now,
|
|
4970
|
-
|
|
4971
|
-
|
|
4972
|
-
candidateTraceSummary
|
|
5053
|
+
outputMessages,
|
|
5054
|
+
traceSummary
|
|
4973
5055
|
});
|
|
4974
5056
|
const weight = evaluator.weight ?? 1;
|
|
4975
5057
|
scored.push({ score: score2, name: evaluator.name, type: evaluator.type, weight });
|
|
@@ -5362,8 +5444,6 @@ export {
|
|
|
5362
5444
|
isJsonValue,
|
|
5363
5445
|
isTestMessage,
|
|
5364
5446
|
isTestMessageRole,
|
|
5365
|
-
isTraceEvent,
|
|
5366
|
-
isTraceEventType,
|
|
5367
5447
|
listTargetNames,
|
|
5368
5448
|
loadEvalCases,
|
|
5369
5449
|
normalizeLineEndings,
|