@agentv/core 0.23.0 → 0.25.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/dist/{chunk-B2J23S7D.js → chunk-OYTL3LNN.js} +24 -16
- package/dist/chunk-OYTL3LNN.js.map +1 -0
- package/dist/evaluation/validation/index.cjs +64 -17
- package/dist/evaluation/validation/index.cjs.map +1 -1
- package/dist/evaluation/validation/index.js +48 -2
- package/dist/evaluation/validation/index.js.map +1 -1
- package/dist/index.cjs +596 -45
- package/dist/index.cjs.map +1 -1
- package/dist/index.d.cts +151 -4
- package/dist/index.d.ts +151 -4
- package/dist/index.js +555 -17
- package/dist/index.js.map +1 -1
- package/package.json +5 -2
- package/dist/chunk-B2J23S7D.js.map +0 -1
package/dist/index.cjs
CHANGED
|
@@ -32,11 +32,14 @@ var index_exports = {};
|
|
|
32
32
|
__export(index_exports, {
|
|
33
33
|
CodeEvaluator: () => CodeEvaluator,
|
|
34
34
|
CompositeEvaluator: () => CompositeEvaluator,
|
|
35
|
+
ExpectedMessagesEvaluator: () => ExpectedMessagesEvaluator,
|
|
35
36
|
LlmJudgeEvaluator: () => LlmJudgeEvaluator,
|
|
36
37
|
TEST_MESSAGE_ROLES: () => TEST_MESSAGE_ROLES,
|
|
38
|
+
ToolTrajectoryEvaluator: () => ToolTrajectoryEvaluator,
|
|
37
39
|
buildDirectoryChain: () => buildDirectoryChain2,
|
|
38
40
|
buildPromptInputs: () => buildPromptInputs,
|
|
39
41
|
buildSearchRoots: () => buildSearchRoots2,
|
|
42
|
+
computeTraceSummary: () => computeTraceSummary,
|
|
40
43
|
consumeCodexLogEntries: () => consumeCodexLogEntries,
|
|
41
44
|
createAgentKernel: () => createAgentKernel,
|
|
42
45
|
createProvider: () => createProvider,
|
|
@@ -47,14 +50,18 @@ __export(index_exports, {
|
|
|
47
50
|
generateRubrics: () => generateRubrics,
|
|
48
51
|
getHitCount: () => getHitCount,
|
|
49
52
|
isEvaluatorKind: () => isEvaluatorKind,
|
|
53
|
+
isExpectedToolCall: () => isExpectedToolCall,
|
|
50
54
|
isGuidelineFile: () => isGuidelineFile,
|
|
51
55
|
isJsonObject: () => isJsonObject,
|
|
52
56
|
isJsonValue: () => isJsonValue,
|
|
53
57
|
isTestMessage: () => isTestMessage,
|
|
54
58
|
isTestMessageRole: () => isTestMessageRole,
|
|
59
|
+
isTraceEvent: () => isTraceEvent,
|
|
60
|
+
isTraceEventType: () => isTraceEventType,
|
|
55
61
|
listTargetNames: () => listTargetNames,
|
|
56
62
|
loadEvalCases: () => loadEvalCases,
|
|
57
63
|
normalizeLineEndings: () => normalizeLineEndings,
|
|
64
|
+
readJsonFile: () => readJsonFile,
|
|
58
65
|
readTargetDefinitions: () => readTargetDefinitions,
|
|
59
66
|
readTestSuiteMetadata: () => readTestSuiteMetadata,
|
|
60
67
|
readTextFile: () => readTextFile,
|
|
@@ -108,7 +115,14 @@ function isTestMessage(value) {
|
|
|
108
115
|
}
|
|
109
116
|
return candidate.content.every(isJsonObject);
|
|
110
117
|
}
|
|
111
|
-
var EVALUATOR_KIND_VALUES = [
|
|
118
|
+
var EVALUATOR_KIND_VALUES = [
|
|
119
|
+
"code_judge",
|
|
120
|
+
"llm_judge",
|
|
121
|
+
"rubric",
|
|
122
|
+
"composite",
|
|
123
|
+
"tool_trajectory",
|
|
124
|
+
"expected_messages"
|
|
125
|
+
];
|
|
112
126
|
var EVALUATOR_KIND_SET = new Set(EVALUATOR_KIND_VALUES);
|
|
113
127
|
function isEvaluatorKind(value) {
|
|
114
128
|
return typeof value === "string" && EVALUATOR_KIND_SET.has(value);
|
|
@@ -117,6 +131,44 @@ function getHitCount(result) {
|
|
|
117
131
|
return result.hits.length;
|
|
118
132
|
}
|
|
119
133
|
|
|
134
|
+
// src/evaluation/trace.ts
|
|
135
|
+
// Recognized trace event type names. Held in a module-level Set (the same
// pattern as EVALUATOR_KIND_SET) so membership checks do not allocate a fresh
// array on every call.
var TRACE_EVENT_TYPE_SET = /* @__PURE__ */ new Set([
  "model_step",
  "tool_call",
  "tool_result",
  "message",
  "error"
]);
/**
 * Type guard for trace event type names.
 *
 * @param {unknown} value - Any value.
 * @returns {boolean} True only when `value` is a string equal to one of
 *   "model_step", "tool_call", "tool_result", "message" or "error".
 */
function isTraceEventType(value) {
  return typeof value === "string" && TRACE_EVENT_TYPE_SET.has(value);
}
|
|
138
|
+
/**
 * Type guard for a single trace event.
 *
 * @param {unknown} value - Any value.
 * @returns {boolean} True when `value` is a non-null object whose `type`
 *   passes isTraceEventType and whose `timestamp` is a string.
 */
function isTraceEvent(value) {
  const isObjectLike = value !== null && typeof value === "object";
  if (!isObjectLike) {
    return false;
  }
  // `type` is checked first; `timestamp` is only read when the type is valid.
  return isTraceEventType(value.type) && typeof value.timestamp === "string";
}
|
|
145
|
+
/**
 * Type guard for an expected tool call entry.
 *
 * @param {unknown} value - Any value.
 * @returns {boolean} True when `value` is a non-null object with a string
 *   `tool` property.
 */
function isExpectedToolCall(value) {
  return value !== null && typeof value === "object" && typeof value.tool === "string";
}
|
|
152
|
+
/**
 * Summarize a trace: total event count, per-tool call counts and error count.
 *
 * Counts are accumulated in a Map rather than on a plain object. With a plain
 * `{}` a tool named like an inherited member (e.g. "constructor", "toString")
 * would read the prototype value instead of `undefined`, turning the count
 * into a garbage string.
 *
 * @param {Array<{ type: string, name?: string }>} trace - Trace events.
 * @returns {{ eventCount: number, toolNames: string[], toolCallsByName: Record<string, number>, errorCount: number }}
 *   `toolNames` is sorted; `toolCallsByName` maps each tool name to the number
 *   of "tool_call" events that carried that name.
 */
function computeTraceSummary(trace) {
  const counts = /* @__PURE__ */ new Map();
  let errorCount = 0;
  for (const event of trace) {
    // Only tool_call events with a truthy name are counted.
    if (event.type === "tool_call" && event.name) {
      const toolName = String(event.name);
      counts.set(toolName, (counts.get(toolName) ?? 0) + 1);
    }
    if (event.type === "error") {
      errorCount++;
    }
  }
  return {
    eventCount: trace.length,
    toolNames: [...counts.keys()].sort(),
    // fromEntries defines own data properties, so every name stays a real key.
    toolCallsByName: Object.fromEntries(counts),
    errorCount
  };
}
|
|
171
|
+
|
|
120
172
|
// src/evaluation/yaml-parser.ts
|
|
121
173
|
var import_promises6 = require("fs/promises");
|
|
122
174
|
var import_node_path6 = __toESM(require("path"), 1);
|
|
@@ -589,6 +641,75 @@ async function parseEvaluators(rawEvalCase, globalExecution, searchRoots, evalId
|
|
|
589
641
|
});
|
|
590
642
|
continue;
|
|
591
643
|
}
|
|
644
|
+
if (typeValue === "expected_messages") {
|
|
645
|
+
evaluators.push({
|
|
646
|
+
name,
|
|
647
|
+
type: "expected_messages"
|
|
648
|
+
});
|
|
649
|
+
continue;
|
|
650
|
+
}
|
|
651
|
+
if (typeValue === "tool_trajectory") {
|
|
652
|
+
const mode = asString2(rawEvaluator.mode);
|
|
653
|
+
if (mode !== "any_order" && mode !== "in_order" && mode !== "exact") {
|
|
654
|
+
logWarning2(
|
|
655
|
+
`Skipping tool_trajectory evaluator '${name}' in '${evalId}': invalid mode '${mode}' (must be any_order, in_order, or exact)`
|
|
656
|
+
);
|
|
657
|
+
continue;
|
|
658
|
+
}
|
|
659
|
+
const rawMinimums = rawEvaluator.minimums;
|
|
660
|
+
let minimums;
|
|
661
|
+
if (rawMinimums !== void 0) {
|
|
662
|
+
if (!isJsonObject2(rawMinimums)) {
|
|
663
|
+
logWarning2(
|
|
664
|
+
`Skipping tool_trajectory evaluator '${name}' in '${evalId}': minimums must be an object`
|
|
665
|
+
);
|
|
666
|
+
continue;
|
|
667
|
+
}
|
|
668
|
+
minimums = {};
|
|
669
|
+
for (const [toolName, count] of Object.entries(rawMinimums)) {
|
|
670
|
+
if (typeof count === "number" && count >= 0) {
|
|
671
|
+
minimums[toolName] = count;
|
|
672
|
+
}
|
|
673
|
+
}
|
|
674
|
+
}
|
|
675
|
+
const rawExpected = rawEvaluator.expected;
|
|
676
|
+
let expected;
|
|
677
|
+
if (rawExpected !== void 0) {
|
|
678
|
+
if (!Array.isArray(rawExpected)) {
|
|
679
|
+
logWarning2(
|
|
680
|
+
`Skipping tool_trajectory evaluator '${name}' in '${evalId}': expected must be an array`
|
|
681
|
+
);
|
|
682
|
+
continue;
|
|
683
|
+
}
|
|
684
|
+
expected = [];
|
|
685
|
+
for (const item of rawExpected) {
|
|
686
|
+
if (isJsonObject2(item) && typeof item.tool === "string") {
|
|
687
|
+
expected.push({ tool: item.tool });
|
|
688
|
+
}
|
|
689
|
+
}
|
|
690
|
+
}
|
|
691
|
+
if (mode === "any_order" && !minimums) {
|
|
692
|
+
logWarning2(
|
|
693
|
+
`Skipping tool_trajectory evaluator '${name}' in '${evalId}': any_order mode requires minimums`
|
|
694
|
+
);
|
|
695
|
+
continue;
|
|
696
|
+
}
|
|
697
|
+
if ((mode === "in_order" || mode === "exact") && !expected) {
|
|
698
|
+
logWarning2(
|
|
699
|
+
`Skipping tool_trajectory evaluator '${name}' in '${evalId}': ${mode} mode requires expected`
|
|
700
|
+
);
|
|
701
|
+
continue;
|
|
702
|
+
}
|
|
703
|
+
const config = {
|
|
704
|
+
name,
|
|
705
|
+
type: "tool_trajectory",
|
|
706
|
+
mode,
|
|
707
|
+
...minimums ? { minimums } : {},
|
|
708
|
+
...expected ? { expected } : {}
|
|
709
|
+
};
|
|
710
|
+
evaluators.push(config);
|
|
711
|
+
continue;
|
|
712
|
+
}
|
|
592
713
|
const prompt = asString2(rawEvaluator.prompt);
|
|
593
714
|
let promptPath;
|
|
594
715
|
if (prompt) {
|
|
@@ -842,6 +963,67 @@ ${detailBlock}${ANSI_RESET4}`);
|
|
|
842
963
|
console.warn(`${ANSI_YELLOW4}Warning: ${message}${ANSI_RESET4}`);
|
|
843
964
|
}
|
|
844
965
|
}
|
|
966
|
+
/**
 * Convert expected messages into output segments.
 *
 * Each message yields one segment carrying its `role`. For assistant messages
 * a defined `tool_calls` value is copied over. String content is kept as-is.
 * Array content is processed entry by entry: non-object entries are dropped,
 * `file` entries are resolved against `searchRoots` and replaced by the file's
 * text (CRLF normalized to LF), and every other entry is cloned. File entries
 * that cannot be resolved or read are skipped after a warning. Content that is
 * neither a string nor an array leaves the segment without a `content` field.
 *
 * @param {{ messages: Array<object>, searchRoots: unknown, verbose?: boolean }} options
 *   `repoRootPath` may also be passed by callers; it is not used here.
 * @returns {Promise<Array<object>>} One segment per input message, in order.
 */
async function processExpectedMessages(options) {
  const { messages, searchRoots, verbose } = options;
  const segments = [];
  for (const message of messages) {
    const segment = {
      role: message.role
    };
    if (message.role === "assistant" && message.tool_calls !== void 0) {
      segment.tool_calls = message.tool_calls;
    }
    const content = message.content;
    if (typeof content === "string") {
      segment.content = content;
    } else if (Array.isArray(content)) {
      const processedContent = [];
      for (const rawSegment of content) {
        if (!isJsonObject(rawSegment)) {
          continue;
        }
        if (asString3(rawSegment.type) !== "file") {
          // Non-file entries are passed through as a copy.
          processedContent.push(cloneJsonObject(rawSegment));
          continue;
        }
        const rawValue = asString3(rawSegment.value);
        if (!rawValue) {
          continue;
        }
        const { displayPath, resolvedPath, attempted } = await resolveFileReference(
          rawValue,
          searchRoots
        );
        if (!resolvedPath) {
          const attempts = attempted.length ? [" Tried:", ...attempted.map((candidate) => ` ${candidate}`)] : void 0;
          logWarning3(`File not found in expected_messages: ${displayPath}`, attempts);
          continue;
        }
        try {
          const fileContent = (await (0, import_promises4.readFile)(resolvedPath, "utf8")).replace(/\r\n/g, "\n");
          processedContent.push({
            type: "file",
            path: displayPath,
            text: fileContent,
            resolvedPath: import_node_path4.default.resolve(resolvedPath)
          });
          if (verbose) {
            console.log(` [Expected Output File] Found: ${displayPath}`);
            console.log(` Resolved to: ${resolvedPath}`);
          }
        } catch (error) {
          // A thrown value is not guaranteed to be an Error; avoid printing
          // "undefined" as the reason.
          const reason = error instanceof Error ? error.message : String(error);
          logWarning3(
            `Could not read expected output file ${resolvedPath}: ${reason}`
          );
        }
      }
      segment.content = processedContent;
    }
    segments.push(segment);
  }
  return segments;
}
|
|
845
1027
|
|
|
846
1028
|
// src/evaluation/formatting/prompt-builder.ts
|
|
847
1029
|
var import_promises5 = require("fs/promises");
|
|
@@ -1146,12 +1328,10 @@ async function loadEvalCases(evalFilePath, repoRoot, options) {
|
|
|
1146
1328
|
messageType: "input",
|
|
1147
1329
|
verbose
|
|
1148
1330
|
});
|
|
1149
|
-
const outputSegments = hasExpectedMessages ? await
|
|
1331
|
+
const outputSegments = hasExpectedMessages ? await processExpectedMessages({
|
|
1150
1332
|
messages: expectedMessages,
|
|
1151
1333
|
searchRoots,
|
|
1152
1334
|
repoRootPath,
|
|
1153
|
-
guidelinePatterns,
|
|
1154
|
-
messageType: "output",
|
|
1155
1335
|
verbose
|
|
1156
1336
|
}) : [];
|
|
1157
1337
|
const codeSnippets = extractCodeBlocks(inputSegments);
|
|
@@ -1278,6 +1458,10 @@ async function readTextFile(filePath) {
|
|
|
1278
1458
|
const content = await (0, import_promises7.readFile)(filePath, "utf8");
|
|
1279
1459
|
return normalizeLineEndings(content);
|
|
1280
1460
|
}
|
|
1461
|
+
/**
 * Read a file as UTF-8 and parse it as JSON.
 *
 * Reading with the "utf8" encoding does not strip a leading byte order mark,
 * and JSON.parse rejects it, so a BOM (U+FEFF) at the start is removed first.
 *
 * @param {string} filePath - Path of the JSON file to read.
 * @returns {Promise<unknown>} The parsed JSON value.
 * @throws {SyntaxError} When the file content is not valid JSON.
 */
async function readJsonFile(filePath) {
  const content = await (0, import_promises7.readFile)(filePath, "utf8");
  const text = content.charCodeAt(0) === 0xfeff ? content.slice(1) : content;
  return JSON.parse(text);
}
|
|
1281
1465
|
async function findGitRoot(startPath) {
|
|
1282
1466
|
let currentDir = import_node_path7.default.dirname(import_node_path7.default.resolve(startPath));
|
|
1283
1467
|
const root = import_node_path7.default.parse(currentDir).root;
|
|
@@ -1786,9 +1970,11 @@ var CliProvider = class {
|
|
|
1786
1970
|
const message = detail ? `${detail} (exit code ${codeText})` : `CLI exited with code ${codeText}`;
|
|
1787
1971
|
throw new Error(message);
|
|
1788
1972
|
}
|
|
1789
|
-
const
|
|
1973
|
+
const responseContent = await this.readAndCleanupOutputFile(outputFilePath);
|
|
1974
|
+
const parsed = this.parseOutputContent(responseContent);
|
|
1790
1975
|
return {
|
|
1791
|
-
text:
|
|
1976
|
+
text: parsed.text,
|
|
1977
|
+
trace: parsed.trace,
|
|
1792
1978
|
raw: {
|
|
1793
1979
|
command: renderedCommand,
|
|
1794
1980
|
stderr: result.stderr,
|
|
@@ -1798,6 +1984,31 @@ var CliProvider = class {
|
|
|
1798
1984
|
}
|
|
1799
1985
|
};
|
|
1800
1986
|
}
|
|
1987
|
+
/**
|
|
1988
|
+
* Parse output content from CLI.
|
|
1989
|
+
* If the content is valid JSON with a 'text' field, extract text and optional trace.
|
|
1990
|
+
* Otherwise, treat the entire content as plain text.
|
|
1991
|
+
*/
|
|
1992
|
+
parseOutputContent(content) {
|
|
1993
|
+
try {
|
|
1994
|
+
const parsed = JSON.parse(content);
|
|
1995
|
+
if (typeof parsed === "object" && parsed !== null && "text" in parsed) {
|
|
1996
|
+
const obj = parsed;
|
|
1997
|
+
const text = typeof obj.text === "string" ? obj.text : String(obj.text);
|
|
1998
|
+
const trace = this.parseTrace(obj.trace);
|
|
1999
|
+
return { text, trace };
|
|
2000
|
+
}
|
|
2001
|
+
} catch {
|
|
2002
|
+
}
|
|
2003
|
+
return { text: content };
|
|
2004
|
+
}
|
|
2005
|
+
parseTrace(trace) {
|
|
2006
|
+
if (!Array.isArray(trace)) {
|
|
2007
|
+
return void 0;
|
|
2008
|
+
}
|
|
2009
|
+
const validEvents = trace.filter(isTraceEvent);
|
|
2010
|
+
return validEvents.length > 0 ? validEvents : void 0;
|
|
2011
|
+
}
|
|
1801
2012
|
async readAndCleanupOutputFile(filePath) {
|
|
1802
2013
|
try {
|
|
1803
2014
|
const content = await readTextFile(filePath);
|
|
@@ -2784,6 +2995,7 @@ var MockProvider = class {
|
|
|
2784
2995
|
delayMs;
|
|
2785
2996
|
delayMinMs;
|
|
2786
2997
|
delayMaxMs;
|
|
2998
|
+
trace;
|
|
2787
2999
|
constructor(targetName, config) {
|
|
2788
3000
|
this.id = `mock:${targetName}`;
|
|
2789
3001
|
this.targetName = targetName;
|
|
@@ -2791,6 +3003,7 @@ var MockProvider = class {
|
|
|
2791
3003
|
this.delayMs = config.delayMs ?? 0;
|
|
2792
3004
|
this.delayMinMs = config.delayMinMs ?? 0;
|
|
2793
3005
|
this.delayMaxMs = config.delayMaxMs ?? 0;
|
|
3006
|
+
this.trace = config.trace;
|
|
2794
3007
|
}
|
|
2795
3008
|
async invoke(request) {
|
|
2796
3009
|
const delay = this.calculateDelay();
|
|
@@ -2802,7 +3015,8 @@ var MockProvider = class {
|
|
|
2802
3015
|
raw: {
|
|
2803
3016
|
question: request.question,
|
|
2804
3017
|
guidelines: request.guidelines
|
|
2805
|
-
}
|
|
3018
|
+
},
|
|
3019
|
+
trace: this.trace
|
|
2806
3020
|
};
|
|
2807
3021
|
}
|
|
2808
3022
|
calculateDelay() {
|
|
@@ -2816,6 +3030,7 @@ var MockProvider = class {
|
|
|
2816
3030
|
};
|
|
2817
3031
|
|
|
2818
3032
|
// src/evaluation/providers/targets.ts
|
|
3033
|
+
var import_node_path11 = __toESM(require("path"), 1);
|
|
2819
3034
|
var import_zod = require("zod");
|
|
2820
3035
|
var CLI_PLACEHOLDERS = /* @__PURE__ */ new Set([
|
|
2821
3036
|
"PROMPT",
|
|
@@ -2831,7 +3046,7 @@ var BASE_TARGET_SCHEMA = import_zod.z.object({
|
|
|
2831
3046
|
judge_target: import_zod.z.string().optional(),
|
|
2832
3047
|
workers: import_zod.z.number().int().min(1).optional()
|
|
2833
3048
|
}).passthrough();
|
|
2834
|
-
var DEFAULT_AZURE_API_VERSION = "2024-
|
|
3049
|
+
var DEFAULT_AZURE_API_VERSION = "2024-12-01-preview";
|
|
2835
3050
|
function normalizeAzureApiVersion(value) {
|
|
2836
3051
|
if (!value) {
|
|
2837
3052
|
return DEFAULT_AZURE_API_VERSION;
|
|
@@ -2875,7 +3090,7 @@ function resolveRetryConfig(target) {
|
|
|
2875
3090
|
retryableStatusCodes
|
|
2876
3091
|
};
|
|
2877
3092
|
}
|
|
2878
|
-
function resolveTargetDefinition(definition, env = process.env) {
|
|
3093
|
+
function resolveTargetDefinition(definition, env = process.env, evalFilePath) {
|
|
2879
3094
|
const parsed = BASE_TARGET_SCHEMA.parse(definition);
|
|
2880
3095
|
const provider = parsed.provider.toLowerCase();
|
|
2881
3096
|
const providerBatching = resolveOptionalBoolean(
|
|
@@ -2948,7 +3163,7 @@ function resolveTargetDefinition(definition, env = process.env) {
|
|
|
2948
3163
|
judgeTarget: parsed.judge_target,
|
|
2949
3164
|
workers: parsed.workers,
|
|
2950
3165
|
providerBatching,
|
|
2951
|
-
config: resolveCliConfig(parsed, env)
|
|
3166
|
+
config: resolveCliConfig(parsed, env, evalFilePath)
|
|
2952
3167
|
};
|
|
2953
3168
|
default:
|
|
2954
3169
|
throw new Error(`Unsupported provider '${parsed.provider}' in target '${parsed.name}'`);
|
|
@@ -3066,7 +3281,8 @@ function normalizeCodexLogFormat(value) {
|
|
|
3066
3281
|
}
|
|
3067
3282
|
function resolveMockConfig(target) {
|
|
3068
3283
|
const response = typeof target.response === "string" ? target.response : void 0;
|
|
3069
|
-
|
|
3284
|
+
const trace = Array.isArray(target.trace) ? target.trace : void 0;
|
|
3285
|
+
return { response, trace };
|
|
3070
3286
|
}
|
|
3071
3287
|
function resolveVSCodeConfig(target, env, insiders) {
|
|
3072
3288
|
const workspaceTemplateEnvVar = resolveOptionalLiteralString(
|
|
@@ -3098,15 +3314,18 @@ function resolveVSCodeConfig(target, env, insiders) {
|
|
|
3098
3314
|
workspaceTemplate
|
|
3099
3315
|
};
|
|
3100
3316
|
}
|
|
3101
|
-
function resolveCliConfig(target, env) {
|
|
3317
|
+
function resolveCliConfig(target, env, evalFilePath) {
|
|
3102
3318
|
const commandTemplateSource = target.command_template ?? target.commandTemplate;
|
|
3103
3319
|
const filesFormat = resolveOptionalLiteralString(
|
|
3104
3320
|
target.files_format ?? target.filesFormat ?? target.attachments_format ?? target.attachmentsFormat
|
|
3105
3321
|
);
|
|
3106
|
-
|
|
3322
|
+
let cwd = resolveOptionalString(target.cwd, env, `${target.name} working directory`, {
|
|
3107
3323
|
allowLiteral: true,
|
|
3108
3324
|
optionalEnv: true
|
|
3109
3325
|
});
|
|
3326
|
+
if (!cwd && evalFilePath) {
|
|
3327
|
+
cwd = import_node_path11.default.dirname(import_node_path11.default.resolve(evalFilePath));
|
|
3328
|
+
}
|
|
3110
3329
|
const timeoutMs = resolveTimeoutMs(
|
|
3111
3330
|
target.timeout_seconds ?? target.timeoutSeconds,
|
|
3112
3331
|
`${target.name} timeout`
|
|
@@ -3224,17 +3443,15 @@ function resolveOptionalString(source, env, description, options) {
|
|
|
3224
3443
|
if (envVarMatch) {
|
|
3225
3444
|
const varName = envVarMatch[1];
|
|
3226
3445
|
const envValue = env[varName];
|
|
3227
|
-
if (envValue !== void 0) {
|
|
3228
|
-
if (envValue.trim().length === 0) {
|
|
3229
|
-
throw new Error(`Environment variable '${varName}' for ${description} is empty`);
|
|
3230
|
-
}
|
|
3231
|
-
return envValue;
|
|
3232
|
-
}
|
|
3233
3446
|
const optionalEnv = options?.optionalEnv ?? false;
|
|
3234
|
-
if (
|
|
3235
|
-
|
|
3447
|
+
if (envValue === void 0 || envValue.trim().length === 0) {
|
|
3448
|
+
if (optionalEnv) {
|
|
3449
|
+
return void 0;
|
|
3450
|
+
}
|
|
3451
|
+
const status = envValue === void 0 ? "is not set" : "is empty";
|
|
3452
|
+
throw new Error(`Environment variable '${varName}' required for ${description} ${status}`);
|
|
3236
3453
|
}
|
|
3237
|
-
|
|
3454
|
+
return envValue;
|
|
3238
3455
|
}
|
|
3239
3456
|
const allowLiteral = options?.allowLiteral ?? false;
|
|
3240
3457
|
if (!allowLiteral) {
|
|
@@ -3346,7 +3563,7 @@ function resolveOptionalNumberArray(source, description) {
|
|
|
3346
3563
|
}
|
|
3347
3564
|
|
|
3348
3565
|
// src/evaluation/providers/vscode.ts
|
|
3349
|
-
var
|
|
3566
|
+
var import_node_path12 = __toESM(require("path"), 1);
|
|
3350
3567
|
var import_subagent = require("subagent");
|
|
3351
3568
|
|
|
3352
3569
|
// src/evaluation/providers/vscode-templates.ts
|
|
@@ -3516,7 +3733,7 @@ function buildMandatoryPrereadBlock2(guidelineFiles, attachmentFiles) {
|
|
|
3516
3733
|
return "";
|
|
3517
3734
|
}
|
|
3518
3735
|
const buildList = (files) => files.map((absolutePath) => {
|
|
3519
|
-
const fileName =
|
|
3736
|
+
const fileName = import_node_path12.default.basename(absolutePath);
|
|
3520
3737
|
const fileUri = pathToFileUri2(absolutePath);
|
|
3521
3738
|
return `* [${fileName}](${fileUri})`;
|
|
3522
3739
|
});
|
|
@@ -3541,8 +3758,8 @@ function collectGuidelineFiles2(attachments, guidelinePatterns) {
|
|
|
3541
3758
|
}
|
|
3542
3759
|
const unique = /* @__PURE__ */ new Map();
|
|
3543
3760
|
for (const attachment of attachments) {
|
|
3544
|
-
const absolutePath =
|
|
3545
|
-
const normalized = absolutePath.split(
|
|
3761
|
+
const absolutePath = import_node_path12.default.resolve(attachment);
|
|
3762
|
+
const normalized = absolutePath.split(import_node_path12.default.sep).join("/");
|
|
3546
3763
|
if (isGuidelineFile(normalized, guidelinePatterns)) {
|
|
3547
3764
|
if (!unique.has(absolutePath)) {
|
|
3548
3765
|
unique.set(absolutePath, absolutePath);
|
|
@@ -3557,7 +3774,7 @@ function collectAttachmentFiles(attachments) {
|
|
|
3557
3774
|
}
|
|
3558
3775
|
const unique = /* @__PURE__ */ new Map();
|
|
3559
3776
|
for (const attachment of attachments) {
|
|
3560
|
-
const absolutePath =
|
|
3777
|
+
const absolutePath = import_node_path12.default.resolve(attachment);
|
|
3561
3778
|
if (!unique.has(absolutePath)) {
|
|
3562
3779
|
unique.set(absolutePath, absolutePath);
|
|
3563
3780
|
}
|
|
@@ -3565,7 +3782,7 @@ function collectAttachmentFiles(attachments) {
|
|
|
3565
3782
|
return Array.from(unique.values());
|
|
3566
3783
|
}
|
|
3567
3784
|
function pathToFileUri2(filePath) {
|
|
3568
|
-
const absolutePath =
|
|
3785
|
+
const absolutePath = import_node_path12.default.isAbsolute(filePath) ? filePath : import_node_path12.default.resolve(filePath);
|
|
3569
3786
|
const normalizedPath = absolutePath.replace(/\\/g, "/");
|
|
3570
3787
|
if (/^[a-zA-Z]:\//.test(normalizedPath)) {
|
|
3571
3788
|
return `file:///${normalizedPath}`;
|
|
@@ -3578,7 +3795,7 @@ function normalizeAttachments(attachments) {
|
|
|
3578
3795
|
}
|
|
3579
3796
|
const deduped = /* @__PURE__ */ new Set();
|
|
3580
3797
|
for (const attachment of attachments) {
|
|
3581
|
-
deduped.add(
|
|
3798
|
+
deduped.add(import_node_path12.default.resolve(attachment));
|
|
3582
3799
|
}
|
|
3583
3800
|
return Array.from(deduped);
|
|
3584
3801
|
}
|
|
@@ -3587,7 +3804,7 @@ function mergeAttachments(all) {
|
|
|
3587
3804
|
for (const list of all) {
|
|
3588
3805
|
if (!list) continue;
|
|
3589
3806
|
for (const inputFile of list) {
|
|
3590
|
-
deduped.add(
|
|
3807
|
+
deduped.add(import_node_path12.default.resolve(inputFile));
|
|
3591
3808
|
}
|
|
3592
3809
|
}
|
|
3593
3810
|
return deduped.size > 0 ? Array.from(deduped) : void 0;
|
|
@@ -3636,7 +3853,7 @@ total unlocked subagents available: ${result.created.length + result.skippedExis
|
|
|
3636
3853
|
// src/evaluation/providers/targets-file.ts
|
|
3637
3854
|
var import_node_fs4 = require("fs");
|
|
3638
3855
|
var import_promises10 = require("fs/promises");
|
|
3639
|
-
var
|
|
3856
|
+
var import_node_path13 = __toESM(require("path"), 1);
|
|
3640
3857
|
var import_yaml3 = require("yaml");
|
|
3641
3858
|
function isRecord(value) {
|
|
3642
3859
|
return typeof value === "object" && value !== null && !Array.isArray(value);
|
|
@@ -3673,7 +3890,7 @@ async function fileExists3(filePath) {
|
|
|
3673
3890
|
}
|
|
3674
3891
|
}
|
|
3675
3892
|
async function readTargetDefinitions(filePath) {
|
|
3676
|
-
const absolutePath =
|
|
3893
|
+
const absolutePath = import_node_path13.default.resolve(filePath);
|
|
3677
3894
|
if (!await fileExists3(absolutePath)) {
|
|
3678
3895
|
throw new Error(`targets.yaml not found at ${absolutePath}`);
|
|
3679
3896
|
}
|
|
@@ -4121,6 +4338,251 @@ function substituteVariables(template, variables) {
|
|
|
4121
4338
|
return variables[varName] ?? match;
|
|
4122
4339
|
});
|
|
4123
4340
|
}
|
|
4341
|
+
/**
 * Evaluator that scores the tool calls found in a candidate trace against the
 * configured expectation. Three modes are supported:
 *  - "any_order": each tool in `config.minimums` must be called at least the
 *    given number of times (read from the trace summary).
 *  - "in_order": the tools in `config.expected` must appear in the trace in
 *    that order; other calls may be interleaved.
 *  - "exact": the trace's tool calls must match `config.expected` position by
 *    position, with no missing or extra calls.
 */
var ToolTrajectoryEvaluator = class {
  kind = "tool_trajectory";
  config;
  /**
   * @param {{ config: object }} options - `options.config` is stored as-is;
   *   its `mode`, `minimums` and `expected` fields are read during evaluation.
   */
  constructor(options) {
    this.config = options.config;
  }
  /**
   * Score the trace according to `config.mode`.
   *
   * @param {{ candidateTrace?: Array<object>, candidateTraceSummary?: object }} context
   * @returns {{ score: number, verdict: string, hits: string[], misses: string[], expectedAspectCount: number }}
   *   A failing result when the trace or its summary is missing, or when the
   *   mode is not recognized.
   */
  evaluate(context) {
    const { candidateTrace, candidateTraceSummary } = context;
    if (!candidateTrace || !candidateTraceSummary) {
      return {
        score: 0,
        verdict: "fail",
        hits: [],
        misses: ["No trace available for evaluation"],
        expectedAspectCount: 1
      };
    }
    switch (this.config.mode) {
      case "any_order":
        return this.evaluateAnyOrder(candidateTraceSummary);
      case "in_order":
        return this.evaluateInOrder(candidateTrace);
      case "exact":
        return this.evaluateExact(candidateTrace);
      default:
        return {
          score: 0,
          verdict: "fail",
          hits: [],
          misses: [`Unknown mode: ${this.config.mode}`],
          expectedAspectCount: 1
        };
    }
  }
  /**
   * Check per-tool minimum call counts against `summary.toolCallsByName`.
   * The score is the fraction of listed tools that meet their minimum.
   */
  evaluateAnyOrder(summary) {
    const minimums = this.config.minimums ?? {};
    const toolNames = Object.keys(minimums);
    if (toolNames.length === 0) {
      return {
        score: 1,
        verdict: "pass",
        hits: ["No tool requirements specified"],
        misses: [],
        expectedAspectCount: 0
      };
    }
    const hits = [];
    const misses = [];
    for (const toolName of toolNames) {
      const required = minimums[toolName];
      const actual = summary.toolCallsByName[toolName] ?? 0;
      if (actual >= required) {
        hits.push(`${toolName}: called ${actual} times (required \u2265${required})`);
      } else {
        misses.push(`${toolName}: called ${actual} times (required \u2265${required})`);
      }
    }
    const score = hits.length / toolNames.length;
    return {
      score,
      verdict: scoreToVerdict(score),
      hits,
      misses,
      expectedAspectCount: toolNames.length
    };
  }
  /**
   * Check that the expected tools occur in the trace in order. The trace is
   * scanned once with a single forward cursor, so each actual call can satisfy
   * at most one expectation. The score is matched / expected.
   */
  evaluateInOrder(trace) {
    const expected = this.config.expected ?? [];
    if (expected.length === 0) {
      return {
        score: 1,
        verdict: "pass",
        hits: ["No tool sequence specified"],
        misses: [],
        expectedAspectCount: 0
      };
    }
    const actualToolCalls = trace.filter((e) => e.type === "tool_call" && e.name);
    const hits = [];
    const misses = [];
    let actualIndex = 0;
    for (let i = 0; i < expected.length; i++) {
      const expectedTool = expected[i].tool;
      let found = false;
      while (actualIndex < actualToolCalls.length) {
        if (actualToolCalls[actualIndex].name === expectedTool) {
          hits.push(`Found ${expectedTool} at position ${actualIndex}`);
          actualIndex++;
          found = true;
          break;
        }
        actualIndex++;
      }
      if (!found) {
        misses.push(`Expected ${expectedTool} at position ${i}, not found in remaining trace`);
      }
    }
    const score = hits.length / expected.length;
    return {
      score,
      verdict: scoreToVerdict(score),
      hits,
      misses,
      expectedAspectCount: expected.length
    };
  }
  /**
   * Check that the trace's tool calls equal the expected sequence position by
   * position. The score is matched / max(expected, actual), so extra calls in
   * the trace lower the score instead of being ignored.
   */
  evaluateExact(trace) {
    const expected = this.config.expected ?? [];
    if (expected.length === 0) {
      return {
        score: 1,
        verdict: "pass",
        hits: ["No tool sequence specified"],
        misses: [],
        expectedAspectCount: 0
      };
    }
    const actualToolCalls = trace.filter((e) => e.type === "tool_call" && e.name);
    const hits = [];
    const misses = [];
    if (actualToolCalls.length !== expected.length) {
      misses.push(`Expected ${expected.length} tool calls, got ${actualToolCalls.length}`);
    }
    const checkLength = Math.min(expected.length, actualToolCalls.length);
    for (let i = 0; i < checkLength; i++) {
      const expectedTool = expected[i].tool;
      const actualTool = actualToolCalls[i].name;
      if (actualTool === expectedTool) {
        hits.push(`Position ${i}: ${expectedTool} \u2713`);
      } else {
        misses.push(`Position ${i}: expected ${expectedTool}, got ${actualTool}`);
      }
    }
    for (let i = checkLength; i < expected.length; i++) {
      misses.push(`Position ${i}: expected ${expected[i].tool}, got nothing`);
    }
    // Dividing by the longer of the two sequences keeps a trace with surplus
    // calls from reaching a perfect score while a length mismatch is reported.
    const score = hits.length / Math.max(expected.length, actualToolCalls.length);
    return {
      score,
      verdict: scoreToVerdict(score),
      hits,
      misses,
      expectedAspectCount: expected.length
    };
  }
};
|
|
4487
|
+
/**
 * Evaluator that compares the tool calls declared on assistant messages in an
 * eval case's `expected_segments` with the "tool_call" events of the candidate
 * trace, position by position.
 */
var ExpectedMessagesEvaluator = class {
  kind = "expected_messages";
  /**
   * @param {{ candidateTrace?: Array<object>, evalCase: { expected_segments?: Array<object> } }} context
   * @returns {{ score: number, verdict: string, hits: string[], misses: string[], expectedAspectCount: number }}
   *   Passes outright when no tool calls are expected; fails outright when
   *   tool calls are expected but the trace is missing or empty.
   */
  evaluate(context) {
    const { candidateTrace, evalCase } = context;
    const expectedSegments = evalCase.expected_segments;
    const expectedToolCalls = this.extractExpectedToolCalls(expectedSegments);
    if (expectedToolCalls.length === 0) {
      return {
        score: 1,
        verdict: "pass",
        hits: ["No tool_calls specified in expected_messages"],
        misses: [],
        expectedAspectCount: 1
      };
    }
    if (!candidateTrace || candidateTrace.length === 0) {
      return {
        score: 0,
        verdict: "fail",
        hits: [],
        misses: ["No trace available to validate tool_calls"],
        expectedAspectCount: expectedToolCalls.length
      };
    }
    const actualToolCalls = candidateTrace.filter((e) => e.type === "tool_call");
    return this.validateToolCalls(expectedToolCalls, actualToolCalls);
  }
  /**
   * Collect `{ tool, input }` pairs from the `tool_calls` arrays of assistant
   * segments. Entries without a string `tool` are ignored.
   *
   * @param {Array<object> | undefined} segments
   * @returns {Array<{ tool: string, input: unknown }>}
   */
  extractExpectedToolCalls(segments) {
    if (!segments) {
      return [];
    }
    const toolCalls = [];
    for (const segment of segments) {
      const role = segment.role;
      const segmentToolCalls = segment.tool_calls;
      if (role === "assistant" && Array.isArray(segmentToolCalls)) {
        for (const tc of segmentToolCalls) {
          if (typeof tc === "object" && tc !== null && typeof tc.tool === "string") {
            toolCalls.push({ tool: tc.tool, input: tc.input });
          }
        }
      }
    }
    return toolCalls;
  }
  /**
   * Compare expected and actual tool calls index by index. A position is a hit
   * when the names match and, if an expected input was given, the inputs are
   * deeply equal. The score is hits / expected count.
   */
  validateToolCalls(expected, actual) {
    const hits = [];
    const misses = [];
    for (let i = 0; i < expected.length; i++) {
      const expectedCall = expected[i];
      const actualCall = actual[i];
      if (!actualCall) {
        misses.push(
          `tool_calls[${i}]: expected ${expectedCall.tool}, but no more tool calls in trace`
        );
        continue;
      }
      if (actualCall.name !== expectedCall.tool) {
        misses.push(
          `tool_calls[${i}]: expected ${expectedCall.tool}, got ${actualCall.name ?? "unknown"}`
        );
        continue;
      }
      // Inputs are only compared when the expectation specifies one.
      if (expectedCall.input !== void 0) {
        if (!this.deepEquals(expectedCall.input, actualCall.input)) {
          misses.push(`tool_calls[${i}]: ${expectedCall.tool} input mismatch`);
          continue;
        }
      }
      hits.push(`tool_calls[${i}]: ${expectedCall.tool} matched`);
    }
    // Guard against division by zero if called with an empty expectation.
    const totalChecks = expected.length || 1;
    const score = hits.length / totalChecks;
    return {
      score,
      verdict: score >= 0.8 ? "pass" : score >= 0.6 ? "borderline" : "fail",
      hits,
      misses,
      expectedAspectCount: totalChecks
    };
  }
  /**
   * Structural equality for JSON-like values: primitives by `===`, arrays by
   * length and element, objects by own enumerable keys and their values.
   */
  deepEquals(a, b) {
    if (a === b) return true;
    if (typeof a !== typeof b) return false;
    if (typeof a !== "object" || a === null || b === null) return false;
    if (Array.isArray(a) && Array.isArray(b)) {
      if (a.length !== b.length) return false;
      return a.every((val, i) => this.deepEquals(val, b[i]));
    }
    if (Array.isArray(a) || Array.isArray(b)) return false;
    const aKeys = Object.keys(a);
    const bKeys = Object.keys(b);
    if (aKeys.length !== bKeys.length) return false;
    // Require the key itself to exist on `b`; comparing values alone would
    // treat a missing key as equal to a key whose value is undefined.
    return aKeys.every(
      (key) => Object.prototype.hasOwnProperty.call(b, key) && this.deepEquals(a[key], b[key])
    );
  }
};
|
|
4124
4586
|
var DEFAULT_COMPOSITE_AGGREGATOR_PROMPT = `Review the following evaluation results:
|
|
4125
4587
|
{{EVALUATOR_RESULTS_JSON}}
|
|
4126
4588
|
|
|
@@ -4347,7 +4809,7 @@ var CompositeEvaluator = class {
|
|
|
4347
4809
|
// src/evaluation/orchestrator.ts
|
|
4348
4810
|
var import_node_crypto2 = require("crypto");
|
|
4349
4811
|
var import_promises11 = require("fs/promises");
|
|
4350
|
-
var
|
|
4812
|
+
var import_node_path14 = __toESM(require("path"), 1);
|
|
4351
4813
|
|
|
4352
4814
|
// ../../node_modules/.bun/yocto-queue@1.2.2/node_modules/yocto-queue/index.js
|
|
4353
4815
|
var Node = class {
|
|
@@ -4554,7 +5016,7 @@ async function runEvaluation(options) {
|
|
|
4554
5016
|
if (!definition) {
|
|
4555
5017
|
return void 0;
|
|
4556
5018
|
}
|
|
4557
|
-
const resolved = resolveTargetDefinition(definition, envLookup);
|
|
5019
|
+
const resolved = resolveTargetDefinition(definition, envLookup, evalFilePath);
|
|
4558
5020
|
resolvedTargetsByName.set(name, resolved);
|
|
4559
5021
|
return resolved;
|
|
4560
5022
|
};
|
|
@@ -4868,6 +5330,17 @@ async function runEvalCase(options) {
|
|
|
4868
5330
|
if (cacheKey && cache && !cachedResponse) {
|
|
4869
5331
|
await cache.set(cacheKey, providerResponse);
|
|
4870
5332
|
}
|
|
5333
|
+
let candidateTrace = providerResponse.trace;
|
|
5334
|
+
if (!candidateTrace && providerResponse.traceRef) {
|
|
5335
|
+
try {
|
|
5336
|
+
const rawTrace = await readJsonFile(providerResponse.traceRef);
|
|
5337
|
+
if (Array.isArray(rawTrace) && rawTrace.every(isTraceEvent)) {
|
|
5338
|
+
candidateTrace = rawTrace;
|
|
5339
|
+
}
|
|
5340
|
+
} catch {
|
|
5341
|
+
}
|
|
5342
|
+
}
|
|
5343
|
+
const candidateTraceSummary = candidateTrace ? computeTraceSummary(candidateTrace) : void 0;
|
|
4871
5344
|
try {
|
|
4872
5345
|
return await evaluateCandidate({
|
|
4873
5346
|
evalCase,
|
|
@@ -4879,7 +5352,9 @@ async function runEvalCase(options) {
|
|
|
4879
5352
|
nowFn,
|
|
4880
5353
|
attempt,
|
|
4881
5354
|
judgeProvider,
|
|
4882
|
-
agentTimeoutMs
|
|
5355
|
+
agentTimeoutMs,
|
|
5356
|
+
candidateTrace,
|
|
5357
|
+
candidateTraceSummary
|
|
4883
5358
|
});
|
|
4884
5359
|
} catch (error) {
|
|
4885
5360
|
return buildErrorResult(evalCase, target.name, nowFn(), error, promptInputs, provider);
|
|
@@ -4896,7 +5371,9 @@ async function evaluateCandidate(options) {
|
|
|
4896
5371
|
nowFn,
|
|
4897
5372
|
attempt,
|
|
4898
5373
|
judgeProvider,
|
|
4899
|
-
agentTimeoutMs
|
|
5374
|
+
agentTimeoutMs,
|
|
5375
|
+
candidateTrace,
|
|
5376
|
+
candidateTraceSummary
|
|
4900
5377
|
} = options;
|
|
4901
5378
|
const gradeTimestamp = nowFn();
|
|
4902
5379
|
const { score, evaluatorResults } = await runEvaluatorsForCase({
|
|
@@ -4909,7 +5386,9 @@ async function evaluateCandidate(options) {
|
|
|
4909
5386
|
promptInputs,
|
|
4910
5387
|
now: gradeTimestamp,
|
|
4911
5388
|
judgeProvider,
|
|
4912
|
-
agentTimeoutMs
|
|
5389
|
+
agentTimeoutMs,
|
|
5390
|
+
candidateTrace,
|
|
5391
|
+
candidateTraceSummary
|
|
4913
5392
|
});
|
|
4914
5393
|
const completedAt = nowFn();
|
|
4915
5394
|
let agentProviderRequest;
|
|
@@ -4948,7 +5427,8 @@ async function evaluateCandidate(options) {
|
|
|
4948
5427
|
agent_provider_request: agentProviderRequest,
|
|
4949
5428
|
lm_provider_request: lmProviderRequest,
|
|
4950
5429
|
evaluator_provider_request: evaluatorResults ? void 0 : score.evaluatorRawRequest,
|
|
4951
|
-
evaluator_results: evaluatorResults
|
|
5430
|
+
evaluator_results: evaluatorResults,
|
|
5431
|
+
trace_summary: candidateTraceSummary
|
|
4952
5432
|
};
|
|
4953
5433
|
}
|
|
4954
5434
|
async function runEvaluatorsForCase(options) {
|
|
@@ -4962,7 +5442,9 @@ async function runEvaluatorsForCase(options) {
|
|
|
4962
5442
|
promptInputs,
|
|
4963
5443
|
now,
|
|
4964
5444
|
judgeProvider,
|
|
4965
|
-
agentTimeoutMs
|
|
5445
|
+
agentTimeoutMs,
|
|
5446
|
+
candidateTrace,
|
|
5447
|
+
candidateTraceSummary
|
|
4966
5448
|
} = options;
|
|
4967
5449
|
if (evalCase.evaluators && evalCase.evaluators.length > 0) {
|
|
4968
5450
|
return runEvaluatorList({
|
|
@@ -4976,7 +5458,9 @@ async function runEvaluatorsForCase(options) {
|
|
|
4976
5458
|
promptInputs,
|
|
4977
5459
|
now,
|
|
4978
5460
|
judgeProvider,
|
|
4979
|
-
agentTimeoutMs
|
|
5461
|
+
agentTimeoutMs,
|
|
5462
|
+
candidateTrace,
|
|
5463
|
+
candidateTraceSummary
|
|
4980
5464
|
});
|
|
4981
5465
|
}
|
|
4982
5466
|
const evaluatorKind = evalCase.evaluator ?? "llm_judge";
|
|
@@ -4992,7 +5476,9 @@ async function runEvaluatorsForCase(options) {
|
|
|
4992
5476
|
attempt,
|
|
4993
5477
|
promptInputs,
|
|
4994
5478
|
now,
|
|
4995
|
-
judgeProvider
|
|
5479
|
+
judgeProvider,
|
|
5480
|
+
candidateTrace,
|
|
5481
|
+
candidateTraceSummary
|
|
4996
5482
|
});
|
|
4997
5483
|
return { score };
|
|
4998
5484
|
}
|
|
@@ -5008,7 +5494,9 @@ async function runEvaluatorList(options) {
|
|
|
5008
5494
|
promptInputs,
|
|
5009
5495
|
now,
|
|
5010
5496
|
judgeProvider,
|
|
5011
|
-
agentTimeoutMs
|
|
5497
|
+
agentTimeoutMs,
|
|
5498
|
+
candidateTrace,
|
|
5499
|
+
candidateTraceSummary
|
|
5012
5500
|
} = options;
|
|
5013
5501
|
const scored = [];
|
|
5014
5502
|
const evaluatorResults = [];
|
|
@@ -5067,7 +5555,7 @@ async function runEvaluatorList(options) {
|
|
|
5067
5555
|
});
|
|
5068
5556
|
}
|
|
5069
5557
|
if (evaluator.type === "composite") {
|
|
5070
|
-
const evalFileDir = evalCase.guideline_paths[0] ?
|
|
5558
|
+
const evalFileDir = evalCase.guideline_paths[0] ? import_node_path14.default.dirname(evalCase.guideline_paths[0]) : process.cwd();
|
|
5071
5559
|
const createEvaluator = (memberConfig) => {
|
|
5072
5560
|
switch (memberConfig.type) {
|
|
5073
5561
|
case "llm_judge":
|
|
@@ -5084,6 +5572,12 @@ async function runEvaluatorList(options) {
|
|
|
5084
5572
|
cwd: evalFileDir,
|
|
5085
5573
|
evaluatorFactory: { create: createEvaluator }
|
|
5086
5574
|
});
|
|
5575
|
+
case "tool_trajectory":
|
|
5576
|
+
return new ToolTrajectoryEvaluator({
|
|
5577
|
+
config: memberConfig
|
|
5578
|
+
});
|
|
5579
|
+
case "expected_messages":
|
|
5580
|
+
return new ExpectedMessagesEvaluator();
|
|
5087
5581
|
default: {
|
|
5088
5582
|
const unknownConfig = memberConfig;
|
|
5089
5583
|
throw new Error(`Unsupported evaluator type in composite: ${unknownConfig.type}`);
|
|
@@ -5118,6 +5612,56 @@ async function runEvaluatorList(options) {
|
|
|
5118
5612
|
evaluator_results: mapChildResults(score2.evaluatorResults)
|
|
5119
5613
|
});
|
|
5120
5614
|
}
|
|
5615
|
+
if (evaluator.type === "tool_trajectory") {
|
|
5616
|
+
const trajectoryEvaluator = new ToolTrajectoryEvaluator({
|
|
5617
|
+
config: evaluator
|
|
5618
|
+
});
|
|
5619
|
+
const score2 = trajectoryEvaluator.evaluate({
|
|
5620
|
+
evalCase,
|
|
5621
|
+
candidate,
|
|
5622
|
+
target,
|
|
5623
|
+
provider,
|
|
5624
|
+
attempt,
|
|
5625
|
+
promptInputs,
|
|
5626
|
+
now,
|
|
5627
|
+
candidateTrace,
|
|
5628
|
+
candidateTraceSummary
|
|
5629
|
+
});
|
|
5630
|
+
scored.push({ score: score2, name: evaluator.name, type: evaluator.type });
|
|
5631
|
+
evaluatorResults.push({
|
|
5632
|
+
name: evaluator.name,
|
|
5633
|
+
type: evaluator.type,
|
|
5634
|
+
score: score2.score,
|
|
5635
|
+
verdict: score2.verdict,
|
|
5636
|
+
hits: score2.hits,
|
|
5637
|
+
misses: score2.misses,
|
|
5638
|
+
reasoning: score2.reasoning
|
|
5639
|
+
});
|
|
5640
|
+
}
|
|
5641
|
+
if (evaluator.type === "expected_messages") {
|
|
5642
|
+
const expectedMessagesEvaluator = new ExpectedMessagesEvaluator();
|
|
5643
|
+
const score2 = expectedMessagesEvaluator.evaluate({
|
|
5644
|
+
evalCase,
|
|
5645
|
+
candidate,
|
|
5646
|
+
target,
|
|
5647
|
+
provider,
|
|
5648
|
+
attempt,
|
|
5649
|
+
promptInputs,
|
|
5650
|
+
now,
|
|
5651
|
+
candidateTrace,
|
|
5652
|
+
candidateTraceSummary
|
|
5653
|
+
});
|
|
5654
|
+
scored.push({ score: score2, name: evaluator.name, type: evaluator.type });
|
|
5655
|
+
evaluatorResults.push({
|
|
5656
|
+
name: evaluator.name,
|
|
5657
|
+
type: evaluator.type,
|
|
5658
|
+
score: score2.score,
|
|
5659
|
+
verdict: score2.verdict,
|
|
5660
|
+
hits: score2.hits,
|
|
5661
|
+
misses: score2.misses,
|
|
5662
|
+
reasoning: score2.reasoning
|
|
5663
|
+
});
|
|
5664
|
+
}
|
|
5121
5665
|
} catch (error) {
|
|
5122
5666
|
const message = error instanceof Error ? error.message : String(error);
|
|
5123
5667
|
const fallbackScore = {
|
|
@@ -5240,8 +5784,8 @@ function buildEvaluatorRegistry(overrides, resolveJudgeProvider) {
|
|
|
5240
5784
|
async function dumpPrompt(directory, evalCase, promptInputs) {
|
|
5241
5785
|
const timestamp = (/* @__PURE__ */ new Date()).toISOString().replace(/[:.]/g, "-");
|
|
5242
5786
|
const filename = `${timestamp}_${sanitizeFilename(evalCase.id)}.json`;
|
|
5243
|
-
const filePath =
|
|
5244
|
-
await (0, import_promises11.mkdir)(
|
|
5787
|
+
const filePath = import_node_path14.default.resolve(directory, filename);
|
|
5788
|
+
await (0, import_promises11.mkdir)(import_node_path14.default.dirname(filePath), { recursive: true });
|
|
5245
5789
|
const payload = {
|
|
5246
5790
|
eval_id: evalCase.id,
|
|
5247
5791
|
question: promptInputs.question,
|
|
@@ -5460,11 +6004,14 @@ function createAgentKernel() {
|
|
|
5460
6004
|
0 && (module.exports = {
|
|
5461
6005
|
CodeEvaluator,
|
|
5462
6006
|
CompositeEvaluator,
|
|
6007
|
+
ExpectedMessagesEvaluator,
|
|
5463
6008
|
LlmJudgeEvaluator,
|
|
5464
6009
|
TEST_MESSAGE_ROLES,
|
|
6010
|
+
ToolTrajectoryEvaluator,
|
|
5465
6011
|
buildDirectoryChain,
|
|
5466
6012
|
buildPromptInputs,
|
|
5467
6013
|
buildSearchRoots,
|
|
6014
|
+
computeTraceSummary,
|
|
5468
6015
|
consumeCodexLogEntries,
|
|
5469
6016
|
createAgentKernel,
|
|
5470
6017
|
createProvider,
|
|
@@ -5475,14 +6022,18 @@ function createAgentKernel() {
|
|
|
5475
6022
|
generateRubrics,
|
|
5476
6023
|
getHitCount,
|
|
5477
6024
|
isEvaluatorKind,
|
|
6025
|
+
isExpectedToolCall,
|
|
5478
6026
|
isGuidelineFile,
|
|
5479
6027
|
isJsonObject,
|
|
5480
6028
|
isJsonValue,
|
|
5481
6029
|
isTestMessage,
|
|
5482
6030
|
isTestMessageRole,
|
|
6031
|
+
isTraceEvent,
|
|
6032
|
+
isTraceEventType,
|
|
5483
6033
|
listTargetNames,
|
|
5484
6034
|
loadEvalCases,
|
|
5485
6035
|
normalizeLineEndings,
|
|
6036
|
+
readJsonFile,
|
|
5486
6037
|
readTargetDefinitions,
|
|
5487
6038
|
readTestSuiteMetadata,
|
|
5488
6039
|
readTextFile,
|