@agentv/core 0.23.0 → 0.26.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/dist/{chunk-B2J23S7D.js → chunk-NDEN3H2B.js} +28 -17
- package/dist/chunk-NDEN3H2B.js.map +1 -0
- package/dist/evaluation/validation/index.cjs +64 -17
- package/dist/evaluation/validation/index.cjs.map +1 -1
- package/dist/evaluation/validation/index.js +48 -2
- package/dist/evaluation/validation/index.js.map +1 -1
- package/dist/index.cjs +674 -62
- package/dist/index.cjs.map +1 -1
- package/dist/index.d.cts +157 -4
- package/dist/index.d.ts +157 -4
- package/dist/index.js +629 -33
- package/dist/index.js.map +1 -1
- package/package.json +5 -2
- package/dist/chunk-B2J23S7D.js.map +0 -1
package/dist/index.cjs
CHANGED
|
@@ -32,11 +32,14 @@ var index_exports = {};
|
|
|
32
32
|
__export(index_exports, {
|
|
33
33
|
CodeEvaluator: () => CodeEvaluator,
|
|
34
34
|
CompositeEvaluator: () => CompositeEvaluator,
|
|
35
|
+
ExpectedMessagesEvaluator: () => ExpectedMessagesEvaluator,
|
|
35
36
|
LlmJudgeEvaluator: () => LlmJudgeEvaluator,
|
|
36
37
|
TEST_MESSAGE_ROLES: () => TEST_MESSAGE_ROLES,
|
|
38
|
+
ToolTrajectoryEvaluator: () => ToolTrajectoryEvaluator,
|
|
37
39
|
buildDirectoryChain: () => buildDirectoryChain2,
|
|
38
40
|
buildPromptInputs: () => buildPromptInputs,
|
|
39
41
|
buildSearchRoots: () => buildSearchRoots2,
|
|
42
|
+
computeTraceSummary: () => computeTraceSummary,
|
|
40
43
|
consumeCodexLogEntries: () => consumeCodexLogEntries,
|
|
41
44
|
createAgentKernel: () => createAgentKernel,
|
|
42
45
|
createProvider: () => createProvider,
|
|
@@ -47,14 +50,18 @@ __export(index_exports, {
|
|
|
47
50
|
generateRubrics: () => generateRubrics,
|
|
48
51
|
getHitCount: () => getHitCount,
|
|
49
52
|
isEvaluatorKind: () => isEvaluatorKind,
|
|
53
|
+
isExpectedToolCall: () => isExpectedToolCall,
|
|
50
54
|
isGuidelineFile: () => isGuidelineFile,
|
|
51
55
|
isJsonObject: () => isJsonObject,
|
|
52
56
|
isJsonValue: () => isJsonValue,
|
|
53
57
|
isTestMessage: () => isTestMessage,
|
|
54
58
|
isTestMessageRole: () => isTestMessageRole,
|
|
59
|
+
isTraceEvent: () => isTraceEvent,
|
|
60
|
+
isTraceEventType: () => isTraceEventType,
|
|
55
61
|
listTargetNames: () => listTargetNames,
|
|
56
62
|
loadEvalCases: () => loadEvalCases,
|
|
57
63
|
normalizeLineEndings: () => normalizeLineEndings,
|
|
64
|
+
readJsonFile: () => readJsonFile,
|
|
58
65
|
readTargetDefinitions: () => readTargetDefinitions,
|
|
59
66
|
readTestSuiteMetadata: () => readTestSuiteMetadata,
|
|
60
67
|
readTextFile: () => readTextFile,
|
|
@@ -108,7 +115,14 @@ function isTestMessage(value) {
|
|
|
108
115
|
}
|
|
109
116
|
return candidate.content.every(isJsonObject);
|
|
110
117
|
}
|
|
111
|
-
var EVALUATOR_KIND_VALUES = [
|
|
118
|
+
var EVALUATOR_KIND_VALUES = [
|
|
119
|
+
"code_judge",
|
|
120
|
+
"llm_judge",
|
|
121
|
+
"rubric",
|
|
122
|
+
"composite",
|
|
123
|
+
"tool_trajectory",
|
|
124
|
+
"expected_messages"
|
|
125
|
+
];
|
|
112
126
|
var EVALUATOR_KIND_SET = new Set(EVALUATOR_KIND_VALUES);
|
|
113
127
|
function isEvaluatorKind(value) {
|
|
114
128
|
return typeof value === "string" && EVALUATOR_KIND_SET.has(value);
|
|
@@ -117,6 +131,44 @@ function getHitCount(result) {
|
|
|
117
131
|
return result.hits.length;
|
|
118
132
|
}
|
|
119
133
|
|
|
134
|
+
// src/evaluation/trace.ts
|
|
135
|
+
function isTraceEventType(value) {
|
|
136
|
+
return typeof value === "string" && ["model_step", "tool_call", "tool_result", "message", "error"].includes(value);
|
|
137
|
+
}
|
|
138
|
+
function isTraceEvent(value) {
|
|
139
|
+
if (typeof value !== "object" || value === null) {
|
|
140
|
+
return false;
|
|
141
|
+
}
|
|
142
|
+
const candidate = value;
|
|
143
|
+
return isTraceEventType(candidate.type) && typeof candidate.timestamp === "string";
|
|
144
|
+
}
|
|
145
|
+
function isExpectedToolCall(value) {
|
|
146
|
+
if (typeof value !== "object" || value === null) {
|
|
147
|
+
return false;
|
|
148
|
+
}
|
|
149
|
+
const candidate = value;
|
|
150
|
+
return typeof candidate.tool === "string";
|
|
151
|
+
}
|
|
152
|
+
function computeTraceSummary(trace) {
|
|
153
|
+
const toolCallCounts = {};
|
|
154
|
+
let errorCount = 0;
|
|
155
|
+
for (const event of trace) {
|
|
156
|
+
if (event.type === "tool_call" && event.name) {
|
|
157
|
+
toolCallCounts[event.name] = (toolCallCounts[event.name] ?? 0) + 1;
|
|
158
|
+
}
|
|
159
|
+
if (event.type === "error") {
|
|
160
|
+
errorCount++;
|
|
161
|
+
}
|
|
162
|
+
}
|
|
163
|
+
const toolNames = Object.keys(toolCallCounts).sort();
|
|
164
|
+
return {
|
|
165
|
+
eventCount: trace.length,
|
|
166
|
+
toolNames,
|
|
167
|
+
toolCallsByName: toolCallCounts,
|
|
168
|
+
errorCount
|
|
169
|
+
};
|
|
170
|
+
}
|
|
171
|
+
|
|
120
172
|
// src/evaluation/yaml-parser.ts
|
|
121
173
|
var import_promises6 = require("fs/promises");
|
|
122
174
|
var import_node_path6 = __toESM(require("path"), 1);
|
|
@@ -466,6 +518,7 @@ async function parseEvaluators(rawEvalCase, globalExecution, searchRoots, evalId
|
|
|
466
518
|
logWarning2(`Skipping code_judge evaluator '${name}' in '${evalId}': missing script`);
|
|
467
519
|
continue;
|
|
468
520
|
}
|
|
521
|
+
const weight2 = validateWeight(rawEvaluator.weight, name, evalId);
|
|
469
522
|
const cwd = asString2(rawEvaluator.cwd);
|
|
470
523
|
let resolvedCwd;
|
|
471
524
|
if (cwd) {
|
|
@@ -486,7 +539,8 @@ async function parseEvaluators(rawEvalCase, globalExecution, searchRoots, evalId
|
|
|
486
539
|
type: "code",
|
|
487
540
|
script,
|
|
488
541
|
cwd,
|
|
489
|
-
resolvedCwd
|
|
542
|
+
resolvedCwd,
|
|
543
|
+
...weight2 !== void 0 ? { weight: weight2 } : {}
|
|
490
544
|
});
|
|
491
545
|
continue;
|
|
492
546
|
}
|
|
@@ -581,14 +635,89 @@ async function parseEvaluators(rawEvalCase, globalExecution, searchRoots, evalId
|
|
|
581
635
|
...promptPath2 ? { promptPath: promptPath2 } : {}
|
|
582
636
|
};
|
|
583
637
|
}
|
|
638
|
+
const weight2 = validateWeight(rawEvaluator.weight, name, evalId);
|
|
584
639
|
evaluators.push({
|
|
585
640
|
name,
|
|
586
641
|
type: "composite",
|
|
587
642
|
evaluators: memberEvaluators,
|
|
588
|
-
aggregator
|
|
643
|
+
aggregator,
|
|
644
|
+
...weight2 !== void 0 ? { weight: weight2 } : {}
|
|
589
645
|
});
|
|
590
646
|
continue;
|
|
591
647
|
}
|
|
648
|
+
if (typeValue === "expected_messages") {
|
|
649
|
+
const weight2 = validateWeight(rawEvaluator.weight, name, evalId);
|
|
650
|
+
evaluators.push({
|
|
651
|
+
name,
|
|
652
|
+
type: "expected_messages",
|
|
653
|
+
...weight2 !== void 0 ? { weight: weight2 } : {}
|
|
654
|
+
});
|
|
655
|
+
continue;
|
|
656
|
+
}
|
|
657
|
+
if (typeValue === "tool_trajectory") {
|
|
658
|
+
const mode = asString2(rawEvaluator.mode);
|
|
659
|
+
if (mode !== "any_order" && mode !== "in_order" && mode !== "exact") {
|
|
660
|
+
logWarning2(
|
|
661
|
+
`Skipping tool_trajectory evaluator '${name}' in '${evalId}': invalid mode '${mode}' (must be any_order, in_order, or exact)`
|
|
662
|
+
);
|
|
663
|
+
continue;
|
|
664
|
+
}
|
|
665
|
+
const rawMinimums = rawEvaluator.minimums;
|
|
666
|
+
let minimums;
|
|
667
|
+
if (rawMinimums !== void 0) {
|
|
668
|
+
if (!isJsonObject2(rawMinimums)) {
|
|
669
|
+
logWarning2(
|
|
670
|
+
`Skipping tool_trajectory evaluator '${name}' in '${evalId}': minimums must be an object`
|
|
671
|
+
);
|
|
672
|
+
continue;
|
|
673
|
+
}
|
|
674
|
+
minimums = {};
|
|
675
|
+
for (const [toolName, count] of Object.entries(rawMinimums)) {
|
|
676
|
+
if (typeof count === "number" && count >= 0) {
|
|
677
|
+
minimums[toolName] = count;
|
|
678
|
+
}
|
|
679
|
+
}
|
|
680
|
+
}
|
|
681
|
+
const rawExpected = rawEvaluator.expected;
|
|
682
|
+
let expected;
|
|
683
|
+
if (rawExpected !== void 0) {
|
|
684
|
+
if (!Array.isArray(rawExpected)) {
|
|
685
|
+
logWarning2(
|
|
686
|
+
`Skipping tool_trajectory evaluator '${name}' in '${evalId}': expected must be an array`
|
|
687
|
+
);
|
|
688
|
+
continue;
|
|
689
|
+
}
|
|
690
|
+
expected = [];
|
|
691
|
+
for (const item of rawExpected) {
|
|
692
|
+
if (isJsonObject2(item) && typeof item.tool === "string") {
|
|
693
|
+
expected.push({ tool: item.tool });
|
|
694
|
+
}
|
|
695
|
+
}
|
|
696
|
+
}
|
|
697
|
+
if (mode === "any_order" && !minimums) {
|
|
698
|
+
logWarning2(
|
|
699
|
+
`Skipping tool_trajectory evaluator '${name}' in '${evalId}': any_order mode requires minimums`
|
|
700
|
+
);
|
|
701
|
+
continue;
|
|
702
|
+
}
|
|
703
|
+
if ((mode === "in_order" || mode === "exact") && !expected) {
|
|
704
|
+
logWarning2(
|
|
705
|
+
`Skipping tool_trajectory evaluator '${name}' in '${evalId}': ${mode} mode requires expected`
|
|
706
|
+
);
|
|
707
|
+
continue;
|
|
708
|
+
}
|
|
709
|
+
const weight2 = validateWeight(rawEvaluator.weight, name, evalId);
|
|
710
|
+
const config = {
|
|
711
|
+
name,
|
|
712
|
+
type: "tool_trajectory",
|
|
713
|
+
mode,
|
|
714
|
+
...minimums ? { minimums } : {},
|
|
715
|
+
...expected ? { expected } : {},
|
|
716
|
+
...weight2 !== void 0 ? { weight: weight2 } : {}
|
|
717
|
+
};
|
|
718
|
+
evaluators.push(config);
|
|
719
|
+
continue;
|
|
720
|
+
}
|
|
592
721
|
const prompt = asString2(rawEvaluator.prompt);
|
|
593
722
|
let promptPath;
|
|
594
723
|
if (prompt) {
|
|
@@ -625,19 +754,23 @@ async function parseEvaluators(rawEvalCase, globalExecution, searchRoots, evalId
|
|
|
625
754
|
logWarning2(`Skipping rubric evaluator '${name}' in '${evalId}': no valid rubrics found`);
|
|
626
755
|
continue;
|
|
627
756
|
}
|
|
757
|
+
const weight2 = validateWeight(rawEvaluator.weight, name, evalId);
|
|
628
758
|
evaluators.push({
|
|
629
759
|
name,
|
|
630
760
|
type: "llm_judge",
|
|
631
|
-
rubrics: parsedRubrics
|
|
761
|
+
rubrics: parsedRubrics,
|
|
762
|
+
...weight2 !== void 0 ? { weight: weight2 } : {}
|
|
632
763
|
});
|
|
633
764
|
continue;
|
|
634
765
|
}
|
|
766
|
+
const weight = validateWeight(rawEvaluator.weight, name, evalId);
|
|
635
767
|
evaluators.push({
|
|
636
768
|
name,
|
|
637
769
|
type: "llm_judge",
|
|
638
770
|
prompt,
|
|
639
771
|
promptPath,
|
|
640
|
-
...parsedRubrics && parsedRubrics.length > 0 ? { rubrics: parsedRubrics } : {}
|
|
772
|
+
...parsedRubrics && parsedRubrics.length > 0 ? { rubrics: parsedRubrics } : {},
|
|
773
|
+
...weight !== void 0 ? { weight } : {}
|
|
641
774
|
});
|
|
642
775
|
}
|
|
643
776
|
return evaluators.length > 0 ? evaluators : void 0;
|
|
@@ -667,6 +800,27 @@ ${detailBlock}${ANSI_RESET3}`);
|
|
|
667
800
|
console.warn(`${ANSI_YELLOW3}Warning: ${message}${ANSI_RESET3}`);
|
|
668
801
|
}
|
|
669
802
|
}
|
|
803
|
+
function validateWeight(rawWeight, evaluatorName, evalId) {
|
|
804
|
+
if (rawWeight === void 0) {
|
|
805
|
+
return void 0;
|
|
806
|
+
}
|
|
807
|
+
if (typeof rawWeight !== "number") {
|
|
808
|
+
throw new Error(
|
|
809
|
+
`Invalid weight for evaluator '${evaluatorName}' in '${evalId}': must be a number`
|
|
810
|
+
);
|
|
811
|
+
}
|
|
812
|
+
if (!Number.isFinite(rawWeight)) {
|
|
813
|
+
throw new Error(
|
|
814
|
+
`Invalid weight for evaluator '${evaluatorName}' in '${evalId}': must be finite (got ${rawWeight})`
|
|
815
|
+
);
|
|
816
|
+
}
|
|
817
|
+
if (rawWeight < 0) {
|
|
818
|
+
throw new Error(
|
|
819
|
+
`Invalid weight for evaluator '${evaluatorName}' in '${evalId}': must be non-negative (got ${rawWeight})`
|
|
820
|
+
);
|
|
821
|
+
}
|
|
822
|
+
return rawWeight;
|
|
823
|
+
}
|
|
670
824
|
|
|
671
825
|
// src/evaluation/loaders/message-processor.ts
|
|
672
826
|
var import_promises4 = require("fs/promises");
|
|
@@ -842,6 +996,67 @@ ${detailBlock}${ANSI_RESET4}`);
|
|
|
842
996
|
console.warn(`${ANSI_YELLOW4}Warning: ${message}${ANSI_RESET4}`);
|
|
843
997
|
}
|
|
844
998
|
}
|
|
999
|
+
async function processExpectedMessages(options) {
|
|
1000
|
+
const { messages, searchRoots, repoRootPath, verbose } = options;
|
|
1001
|
+
const segments = [];
|
|
1002
|
+
for (const message of messages) {
|
|
1003
|
+
const segment = {
|
|
1004
|
+
role: message.role
|
|
1005
|
+
};
|
|
1006
|
+
if (message.role === "assistant" && message.tool_calls !== void 0) {
|
|
1007
|
+
segment.tool_calls = message.tool_calls;
|
|
1008
|
+
}
|
|
1009
|
+
const content = message.content;
|
|
1010
|
+
if (typeof content === "string") {
|
|
1011
|
+
segment.content = content;
|
|
1012
|
+
} else if (Array.isArray(content)) {
|
|
1013
|
+
const processedContent = [];
|
|
1014
|
+
for (const rawSegment of content) {
|
|
1015
|
+
if (!isJsonObject(rawSegment)) {
|
|
1016
|
+
continue;
|
|
1017
|
+
}
|
|
1018
|
+
const segmentType = asString3(rawSegment.type);
|
|
1019
|
+
if (segmentType === "file") {
|
|
1020
|
+
const rawValue = asString3(rawSegment.value);
|
|
1021
|
+
if (!rawValue) {
|
|
1022
|
+
continue;
|
|
1023
|
+
}
|
|
1024
|
+
const { displayPath, resolvedPath, attempted } = await resolveFileReference(
|
|
1025
|
+
rawValue,
|
|
1026
|
+
searchRoots
|
|
1027
|
+
);
|
|
1028
|
+
if (!resolvedPath) {
|
|
1029
|
+
const attempts = attempted.length ? [" Tried:", ...attempted.map((candidate) => ` ${candidate}`)] : void 0;
|
|
1030
|
+
logWarning3(`File not found in expected_messages: ${displayPath}`, attempts);
|
|
1031
|
+
continue;
|
|
1032
|
+
}
|
|
1033
|
+
try {
|
|
1034
|
+
const fileContent = (await (0, import_promises4.readFile)(resolvedPath, "utf8")).replace(/\r\n/g, "\n");
|
|
1035
|
+
processedContent.push({
|
|
1036
|
+
type: "file",
|
|
1037
|
+
path: displayPath,
|
|
1038
|
+
text: fileContent,
|
|
1039
|
+
resolvedPath: import_node_path4.default.resolve(resolvedPath)
|
|
1040
|
+
});
|
|
1041
|
+
if (verbose) {
|
|
1042
|
+
console.log(` [Expected Output File] Found: ${displayPath}`);
|
|
1043
|
+
console.log(` Resolved to: ${resolvedPath}`);
|
|
1044
|
+
}
|
|
1045
|
+
} catch (error) {
|
|
1046
|
+
logWarning3(
|
|
1047
|
+
`Could not read expected output file ${resolvedPath}: ${error.message}`
|
|
1048
|
+
);
|
|
1049
|
+
}
|
|
1050
|
+
continue;
|
|
1051
|
+
}
|
|
1052
|
+
processedContent.push(cloneJsonObject(rawSegment));
|
|
1053
|
+
}
|
|
1054
|
+
segment.content = processedContent;
|
|
1055
|
+
}
|
|
1056
|
+
segments.push(segment);
|
|
1057
|
+
}
|
|
1058
|
+
return segments;
|
|
1059
|
+
}
|
|
845
1060
|
|
|
846
1061
|
// src/evaluation/formatting/prompt-builder.ts
|
|
847
1062
|
var import_promises5 = require("fs/promises");
|
|
@@ -1146,12 +1361,10 @@ async function loadEvalCases(evalFilePath, repoRoot, options) {
|
|
|
1146
1361
|
messageType: "input",
|
|
1147
1362
|
verbose
|
|
1148
1363
|
});
|
|
1149
|
-
const outputSegments = hasExpectedMessages ? await
|
|
1364
|
+
const outputSegments = hasExpectedMessages ? await processExpectedMessages({
|
|
1150
1365
|
messages: expectedMessages,
|
|
1151
1366
|
searchRoots,
|
|
1152
1367
|
repoRootPath,
|
|
1153
|
-
guidelinePatterns,
|
|
1154
|
-
messageType: "output",
|
|
1155
1368
|
verbose
|
|
1156
1369
|
}) : [];
|
|
1157
1370
|
const codeSnippets = extractCodeBlocks(inputSegments);
|
|
@@ -1278,6 +1491,10 @@ async function readTextFile(filePath) {
|
|
|
1278
1491
|
const content = await (0, import_promises7.readFile)(filePath, "utf8");
|
|
1279
1492
|
return normalizeLineEndings(content);
|
|
1280
1493
|
}
|
|
1494
|
+
async function readJsonFile(filePath) {
|
|
1495
|
+
const content = await (0, import_promises7.readFile)(filePath, "utf8");
|
|
1496
|
+
return JSON.parse(content);
|
|
1497
|
+
}
|
|
1281
1498
|
async function findGitRoot(startPath) {
|
|
1282
1499
|
let currentDir = import_node_path7.default.dirname(import_node_path7.default.resolve(startPath));
|
|
1283
1500
|
const root = import_node_path7.default.parse(currentDir).root;
|
|
@@ -1786,9 +2003,11 @@ var CliProvider = class {
|
|
|
1786
2003
|
const message = detail ? `${detail} (exit code ${codeText})` : `CLI exited with code ${codeText}`;
|
|
1787
2004
|
throw new Error(message);
|
|
1788
2005
|
}
|
|
1789
|
-
const
|
|
2006
|
+
const responseContent = await this.readAndCleanupOutputFile(outputFilePath);
|
|
2007
|
+
const parsed = this.parseOutputContent(responseContent);
|
|
1790
2008
|
return {
|
|
1791
|
-
text:
|
|
2009
|
+
text: parsed.text,
|
|
2010
|
+
trace: parsed.trace,
|
|
1792
2011
|
raw: {
|
|
1793
2012
|
command: renderedCommand,
|
|
1794
2013
|
stderr: result.stderr,
|
|
@@ -1798,6 +2017,31 @@ var CliProvider = class {
|
|
|
1798
2017
|
}
|
|
1799
2018
|
};
|
|
1800
2019
|
}
|
|
2020
|
+
/**
|
|
2021
|
+
* Parse output content from CLI.
|
|
2022
|
+
* If the content is valid JSON with a 'text' field, extract text and optional trace.
|
|
2023
|
+
* Otherwise, treat the entire content as plain text.
|
|
2024
|
+
*/
|
|
2025
|
+
parseOutputContent(content) {
|
|
2026
|
+
try {
|
|
2027
|
+
const parsed = JSON.parse(content);
|
|
2028
|
+
if (typeof parsed === "object" && parsed !== null && "text" in parsed) {
|
|
2029
|
+
const obj = parsed;
|
|
2030
|
+
const text = typeof obj.text === "string" ? obj.text : String(obj.text);
|
|
2031
|
+
const trace = this.parseTrace(obj.trace);
|
|
2032
|
+
return { text, trace };
|
|
2033
|
+
}
|
|
2034
|
+
} catch {
|
|
2035
|
+
}
|
|
2036
|
+
return { text: content };
|
|
2037
|
+
}
|
|
2038
|
+
parseTrace(trace) {
|
|
2039
|
+
if (!Array.isArray(trace)) {
|
|
2040
|
+
return void 0;
|
|
2041
|
+
}
|
|
2042
|
+
const validEvents = trace.filter(isTraceEvent);
|
|
2043
|
+
return validEvents.length > 0 ? validEvents : void 0;
|
|
2044
|
+
}
|
|
1801
2045
|
async readAndCleanupOutputFile(filePath) {
|
|
1802
2046
|
try {
|
|
1803
2047
|
const content = await readTextFile(filePath);
|
|
@@ -2784,6 +3028,7 @@ var MockProvider = class {
|
|
|
2784
3028
|
delayMs;
|
|
2785
3029
|
delayMinMs;
|
|
2786
3030
|
delayMaxMs;
|
|
3031
|
+
trace;
|
|
2787
3032
|
constructor(targetName, config) {
|
|
2788
3033
|
this.id = `mock:${targetName}`;
|
|
2789
3034
|
this.targetName = targetName;
|
|
@@ -2791,6 +3036,7 @@ var MockProvider = class {
|
|
|
2791
3036
|
this.delayMs = config.delayMs ?? 0;
|
|
2792
3037
|
this.delayMinMs = config.delayMinMs ?? 0;
|
|
2793
3038
|
this.delayMaxMs = config.delayMaxMs ?? 0;
|
|
3039
|
+
this.trace = config.trace;
|
|
2794
3040
|
}
|
|
2795
3041
|
async invoke(request) {
|
|
2796
3042
|
const delay = this.calculateDelay();
|
|
@@ -2802,7 +3048,8 @@ var MockProvider = class {
|
|
|
2802
3048
|
raw: {
|
|
2803
3049
|
question: request.question,
|
|
2804
3050
|
guidelines: request.guidelines
|
|
2805
|
-
}
|
|
3051
|
+
},
|
|
3052
|
+
trace: this.trace
|
|
2806
3053
|
};
|
|
2807
3054
|
}
|
|
2808
3055
|
calculateDelay() {
|
|
@@ -2816,6 +3063,7 @@ var MockProvider = class {
|
|
|
2816
3063
|
};
|
|
2817
3064
|
|
|
2818
3065
|
// src/evaluation/providers/targets.ts
|
|
3066
|
+
var import_node_path11 = __toESM(require("path"), 1);
|
|
2819
3067
|
var import_zod = require("zod");
|
|
2820
3068
|
var CLI_PLACEHOLDERS = /* @__PURE__ */ new Set([
|
|
2821
3069
|
"PROMPT",
|
|
@@ -2831,7 +3079,7 @@ var BASE_TARGET_SCHEMA = import_zod.z.object({
|
|
|
2831
3079
|
judge_target: import_zod.z.string().optional(),
|
|
2832
3080
|
workers: import_zod.z.number().int().min(1).optional()
|
|
2833
3081
|
}).passthrough();
|
|
2834
|
-
var DEFAULT_AZURE_API_VERSION = "2024-
|
|
3082
|
+
var DEFAULT_AZURE_API_VERSION = "2024-12-01-preview";
|
|
2835
3083
|
function normalizeAzureApiVersion(value) {
|
|
2836
3084
|
if (!value) {
|
|
2837
3085
|
return DEFAULT_AZURE_API_VERSION;
|
|
@@ -2875,7 +3123,7 @@ function resolveRetryConfig(target) {
|
|
|
2875
3123
|
retryableStatusCodes
|
|
2876
3124
|
};
|
|
2877
3125
|
}
|
|
2878
|
-
function resolveTargetDefinition(definition, env = process.env) {
|
|
3126
|
+
function resolveTargetDefinition(definition, env = process.env, evalFilePath) {
|
|
2879
3127
|
const parsed = BASE_TARGET_SCHEMA.parse(definition);
|
|
2880
3128
|
const provider = parsed.provider.toLowerCase();
|
|
2881
3129
|
const providerBatching = resolveOptionalBoolean(
|
|
@@ -2948,7 +3196,7 @@ function resolveTargetDefinition(definition, env = process.env) {
|
|
|
2948
3196
|
judgeTarget: parsed.judge_target,
|
|
2949
3197
|
workers: parsed.workers,
|
|
2950
3198
|
providerBatching,
|
|
2951
|
-
config: resolveCliConfig(parsed, env)
|
|
3199
|
+
config: resolveCliConfig(parsed, env, evalFilePath)
|
|
2952
3200
|
};
|
|
2953
3201
|
default:
|
|
2954
3202
|
throw new Error(`Unsupported provider '${parsed.provider}' in target '${parsed.name}'`);
|
|
@@ -2965,7 +3213,10 @@ function resolveAzureConfig(target, env) {
|
|
|
2965
3213
|
const apiKey = resolveString(apiKeySource, env, `${target.name} api key`);
|
|
2966
3214
|
const deploymentName = resolveString(deploymentSource, env, `${target.name} deployment`);
|
|
2967
3215
|
const version = normalizeAzureApiVersion(
|
|
2968
|
-
resolveOptionalString(versionSource, env, `${target.name} api version
|
|
3216
|
+
resolveOptionalString(versionSource, env, `${target.name} api version`, {
|
|
3217
|
+
allowLiteral: true,
|
|
3218
|
+
optionalEnv: true
|
|
3219
|
+
})
|
|
2969
3220
|
);
|
|
2970
3221
|
const temperature = resolveOptionalNumber(temperatureSource, `${target.name} temperature`);
|
|
2971
3222
|
const maxOutputTokens = resolveOptionalNumber(
|
|
@@ -3066,7 +3317,8 @@ function normalizeCodexLogFormat(value) {
|
|
|
3066
3317
|
}
|
|
3067
3318
|
function resolveMockConfig(target) {
|
|
3068
3319
|
const response = typeof target.response === "string" ? target.response : void 0;
|
|
3069
|
-
|
|
3320
|
+
const trace = Array.isArray(target.trace) ? target.trace : void 0;
|
|
3321
|
+
return { response, trace };
|
|
3070
3322
|
}
|
|
3071
3323
|
function resolveVSCodeConfig(target, env, insiders) {
|
|
3072
3324
|
const workspaceTemplateEnvVar = resolveOptionalLiteralString(
|
|
@@ -3098,15 +3350,18 @@ function resolveVSCodeConfig(target, env, insiders) {
|
|
|
3098
3350
|
workspaceTemplate
|
|
3099
3351
|
};
|
|
3100
3352
|
}
|
|
3101
|
-
function resolveCliConfig(target, env) {
|
|
3353
|
+
function resolveCliConfig(target, env, evalFilePath) {
|
|
3102
3354
|
const commandTemplateSource = target.command_template ?? target.commandTemplate;
|
|
3103
3355
|
const filesFormat = resolveOptionalLiteralString(
|
|
3104
3356
|
target.files_format ?? target.filesFormat ?? target.attachments_format ?? target.attachmentsFormat
|
|
3105
3357
|
);
|
|
3106
|
-
|
|
3358
|
+
let cwd = resolveOptionalString(target.cwd, env, `${target.name} working directory`, {
|
|
3107
3359
|
allowLiteral: true,
|
|
3108
3360
|
optionalEnv: true
|
|
3109
3361
|
});
|
|
3362
|
+
if (!cwd && evalFilePath) {
|
|
3363
|
+
cwd = import_node_path11.default.dirname(import_node_path11.default.resolve(evalFilePath));
|
|
3364
|
+
}
|
|
3110
3365
|
const timeoutMs = resolveTimeoutMs(
|
|
3111
3366
|
target.timeout_seconds ?? target.timeoutSeconds,
|
|
3112
3367
|
`${target.name} timeout`
|
|
@@ -3224,17 +3479,15 @@ function resolveOptionalString(source, env, description, options) {
|
|
|
3224
3479
|
if (envVarMatch) {
|
|
3225
3480
|
const varName = envVarMatch[1];
|
|
3226
3481
|
const envValue = env[varName];
|
|
3227
|
-
if (envValue !== void 0) {
|
|
3228
|
-
if (envValue.trim().length === 0) {
|
|
3229
|
-
throw new Error(`Environment variable '${varName}' for ${description} is empty`);
|
|
3230
|
-
}
|
|
3231
|
-
return envValue;
|
|
3232
|
-
}
|
|
3233
3482
|
const optionalEnv = options?.optionalEnv ?? false;
|
|
3234
|
-
if (
|
|
3235
|
-
|
|
3483
|
+
if (envValue === void 0 || envValue.trim().length === 0) {
|
|
3484
|
+
if (optionalEnv) {
|
|
3485
|
+
return void 0;
|
|
3486
|
+
}
|
|
3487
|
+
const status = envValue === void 0 ? "is not set" : "is empty";
|
|
3488
|
+
throw new Error(`Environment variable '${varName}' required for ${description} ${status}`);
|
|
3236
3489
|
}
|
|
3237
|
-
|
|
3490
|
+
return envValue;
|
|
3238
3491
|
}
|
|
3239
3492
|
const allowLiteral = options?.allowLiteral ?? false;
|
|
3240
3493
|
if (!allowLiteral) {
|
|
@@ -3346,7 +3599,7 @@ function resolveOptionalNumberArray(source, description) {
|
|
|
3346
3599
|
}
|
|
3347
3600
|
|
|
3348
3601
|
// src/evaluation/providers/vscode.ts
|
|
3349
|
-
var
|
|
3602
|
+
var import_node_path12 = __toESM(require("path"), 1);
|
|
3350
3603
|
var import_subagent = require("subagent");
|
|
3351
3604
|
|
|
3352
3605
|
// src/evaluation/providers/vscode-templates.ts
|
|
@@ -3516,7 +3769,7 @@ function buildMandatoryPrereadBlock2(guidelineFiles, attachmentFiles) {
|
|
|
3516
3769
|
return "";
|
|
3517
3770
|
}
|
|
3518
3771
|
const buildList = (files) => files.map((absolutePath) => {
|
|
3519
|
-
const fileName =
|
|
3772
|
+
const fileName = import_node_path12.default.basename(absolutePath);
|
|
3520
3773
|
const fileUri = pathToFileUri2(absolutePath);
|
|
3521
3774
|
return `* [${fileName}](${fileUri})`;
|
|
3522
3775
|
});
|
|
@@ -3541,8 +3794,8 @@ function collectGuidelineFiles2(attachments, guidelinePatterns) {
|
|
|
3541
3794
|
}
|
|
3542
3795
|
const unique = /* @__PURE__ */ new Map();
|
|
3543
3796
|
for (const attachment of attachments) {
|
|
3544
|
-
const absolutePath =
|
|
3545
|
-
const normalized = absolutePath.split(
|
|
3797
|
+
const absolutePath = import_node_path12.default.resolve(attachment);
|
|
3798
|
+
const normalized = absolutePath.split(import_node_path12.default.sep).join("/");
|
|
3546
3799
|
if (isGuidelineFile(normalized, guidelinePatterns)) {
|
|
3547
3800
|
if (!unique.has(absolutePath)) {
|
|
3548
3801
|
unique.set(absolutePath, absolutePath);
|
|
@@ -3557,7 +3810,7 @@ function collectAttachmentFiles(attachments) {
|
|
|
3557
3810
|
}
|
|
3558
3811
|
const unique = /* @__PURE__ */ new Map();
|
|
3559
3812
|
for (const attachment of attachments) {
|
|
3560
|
-
const absolutePath =
|
|
3813
|
+
const absolutePath = import_node_path12.default.resolve(attachment);
|
|
3561
3814
|
if (!unique.has(absolutePath)) {
|
|
3562
3815
|
unique.set(absolutePath, absolutePath);
|
|
3563
3816
|
}
|
|
@@ -3565,7 +3818,7 @@ function collectAttachmentFiles(attachments) {
|
|
|
3565
3818
|
return Array.from(unique.values());
|
|
3566
3819
|
}
|
|
3567
3820
|
function pathToFileUri2(filePath) {
|
|
3568
|
-
const absolutePath =
|
|
3821
|
+
const absolutePath = import_node_path12.default.isAbsolute(filePath) ? filePath : import_node_path12.default.resolve(filePath);
|
|
3569
3822
|
const normalizedPath = absolutePath.replace(/\\/g, "/");
|
|
3570
3823
|
if (/^[a-zA-Z]:\//.test(normalizedPath)) {
|
|
3571
3824
|
return `file:///${normalizedPath}`;
|
|
@@ -3578,7 +3831,7 @@ function normalizeAttachments(attachments) {
|
|
|
3578
3831
|
}
|
|
3579
3832
|
const deduped = /* @__PURE__ */ new Set();
|
|
3580
3833
|
for (const attachment of attachments) {
|
|
3581
|
-
deduped.add(
|
|
3834
|
+
deduped.add(import_node_path12.default.resolve(attachment));
|
|
3582
3835
|
}
|
|
3583
3836
|
return Array.from(deduped);
|
|
3584
3837
|
}
|
|
@@ -3587,7 +3840,7 @@ function mergeAttachments(all) {
|
|
|
3587
3840
|
for (const list of all) {
|
|
3588
3841
|
if (!list) continue;
|
|
3589
3842
|
for (const inputFile of list) {
|
|
3590
|
-
deduped.add(
|
|
3843
|
+
deduped.add(import_node_path12.default.resolve(inputFile));
|
|
3591
3844
|
}
|
|
3592
3845
|
}
|
|
3593
3846
|
return deduped.size > 0 ? Array.from(deduped) : void 0;
|
|
@@ -3636,7 +3889,7 @@ total unlocked subagents available: ${result.created.length + result.skippedExis
|
|
|
3636
3889
|
// src/evaluation/providers/targets-file.ts
|
|
3637
3890
|
var import_node_fs4 = require("fs");
|
|
3638
3891
|
var import_promises10 = require("fs/promises");
|
|
3639
|
-
var
|
|
3892
|
+
var import_node_path13 = __toESM(require("path"), 1);
|
|
3640
3893
|
var import_yaml3 = require("yaml");
|
|
3641
3894
|
function isRecord(value) {
|
|
3642
3895
|
return typeof value === "object" && value !== null && !Array.isArray(value);
|
|
@@ -3673,7 +3926,7 @@ async function fileExists3(filePath) {
|
|
|
3673
3926
|
}
|
|
3674
3927
|
}
|
|
3675
3928
|
async function readTargetDefinitions(filePath) {
|
|
3676
|
-
const absolutePath =
|
|
3929
|
+
const absolutePath = import_node_path13.default.resolve(filePath);
|
|
3677
3930
|
if (!await fileExists3(absolutePath)) {
|
|
3678
3931
|
throw new Error(`targets.yaml not found at ${absolutePath}`);
|
|
3679
3932
|
}
|
|
@@ -3999,9 +4252,11 @@ var CodeEvaluator = class {
|
|
|
3999
4252
|
expected_outcome: context.evalCase.expected_outcome,
|
|
4000
4253
|
reference_answer: context.evalCase.reference_answer,
|
|
4001
4254
|
candidate_answer: context.candidate,
|
|
4002
|
-
|
|
4003
|
-
input_files: context.evalCase.file_paths
|
|
4004
|
-
|
|
4255
|
+
guideline_files: context.evalCase.guideline_paths,
|
|
4256
|
+
input_files: context.evalCase.file_paths.filter(
|
|
4257
|
+
(path15) => !context.evalCase.guideline_paths.includes(path15)
|
|
4258
|
+
),
|
|
4259
|
+
input_messages: context.evalCase.input_messages
|
|
4005
4260
|
},
|
|
4006
4261
|
null,
|
|
4007
4262
|
2
|
|
@@ -4121,6 +4376,251 @@ function substituteVariables(template, variables) {
|
|
|
4121
4376
|
return variables[varName] ?? match;
|
|
4122
4377
|
});
|
|
4123
4378
|
}
|
|
4379
|
+
var ToolTrajectoryEvaluator = class {
|
|
4380
|
+
kind = "tool_trajectory";
|
|
4381
|
+
config;
|
|
4382
|
+
constructor(options) {
|
|
4383
|
+
this.config = options.config;
|
|
4384
|
+
}
|
|
4385
|
+
evaluate(context) {
|
|
4386
|
+
const { candidateTrace, candidateTraceSummary } = context;
|
|
4387
|
+
if (!candidateTrace || !candidateTraceSummary) {
|
|
4388
|
+
return {
|
|
4389
|
+
score: 0,
|
|
4390
|
+
verdict: "fail",
|
|
4391
|
+
hits: [],
|
|
4392
|
+
misses: ["No trace available for evaluation"],
|
|
4393
|
+
expectedAspectCount: 1
|
|
4394
|
+
};
|
|
4395
|
+
}
|
|
4396
|
+
switch (this.config.mode) {
|
|
4397
|
+
case "any_order":
|
|
4398
|
+
return this.evaluateAnyOrder(candidateTraceSummary);
|
|
4399
|
+
case "in_order":
|
|
4400
|
+
return this.evaluateInOrder(candidateTrace);
|
|
4401
|
+
case "exact":
|
|
4402
|
+
return this.evaluateExact(candidateTrace);
|
|
4403
|
+
default:
|
|
4404
|
+
return {
|
|
4405
|
+
score: 0,
|
|
4406
|
+
verdict: "fail",
|
|
4407
|
+
hits: [],
|
|
4408
|
+
misses: [`Unknown mode: ${this.config.mode}`],
|
|
4409
|
+
expectedAspectCount: 1
|
|
4410
|
+
};
|
|
4411
|
+
}
|
|
4412
|
+
}
|
|
4413
|
+
evaluateAnyOrder(summary) {
|
|
4414
|
+
const minimums = this.config.minimums ?? {};
|
|
4415
|
+
const toolNames = Object.keys(minimums);
|
|
4416
|
+
if (toolNames.length === 0) {
|
|
4417
|
+
return {
|
|
4418
|
+
score: 1,
|
|
4419
|
+
verdict: "pass",
|
|
4420
|
+
hits: ["No tool requirements specified"],
|
|
4421
|
+
misses: [],
|
|
4422
|
+
expectedAspectCount: 0
|
|
4423
|
+
};
|
|
4424
|
+
}
|
|
4425
|
+
const hits = [];
|
|
4426
|
+
const misses = [];
|
|
4427
|
+
for (const toolName of toolNames) {
|
|
4428
|
+
const required = minimums[toolName];
|
|
4429
|
+
const actual = summary.toolCallsByName[toolName] ?? 0;
|
|
4430
|
+
if (actual >= required) {
|
|
4431
|
+
hits.push(`${toolName}: called ${actual} times (required \u2265${required})`);
|
|
4432
|
+
} else {
|
|
4433
|
+
misses.push(`${toolName}: called ${actual} times (required \u2265${required})`);
|
|
4434
|
+
}
|
|
4435
|
+
}
|
|
4436
|
+
const score = hits.length / toolNames.length;
|
|
4437
|
+
return {
|
|
4438
|
+
score,
|
|
4439
|
+
verdict: scoreToVerdict(score),
|
|
4440
|
+
hits,
|
|
4441
|
+
misses,
|
|
4442
|
+
expectedAspectCount: toolNames.length
|
|
4443
|
+
};
|
|
4444
|
+
}
|
|
4445
|
+
evaluateInOrder(trace) {
|
|
4446
|
+
const expected = this.config.expected ?? [];
|
|
4447
|
+
if (expected.length === 0) {
|
|
4448
|
+
return {
|
|
4449
|
+
score: 1,
|
|
4450
|
+
verdict: "pass",
|
|
4451
|
+
hits: ["No tool sequence specified"],
|
|
4452
|
+
misses: [],
|
|
4453
|
+
expectedAspectCount: 0
|
|
4454
|
+
};
|
|
4455
|
+
}
|
|
4456
|
+
const actualToolCalls = trace.filter((e) => e.type === "tool_call" && e.name);
|
|
4457
|
+
const hits = [];
|
|
4458
|
+
const misses = [];
|
|
4459
|
+
let actualIndex = 0;
|
|
4460
|
+
for (let i = 0; i < expected.length; i++) {
|
|
4461
|
+
const expectedTool = expected[i].tool;
|
|
4462
|
+
let found = false;
|
|
4463
|
+
while (actualIndex < actualToolCalls.length) {
|
|
4464
|
+
if (actualToolCalls[actualIndex].name === expectedTool) {
|
|
4465
|
+
hits.push(`Found ${expectedTool} at position ${actualIndex}`);
|
|
4466
|
+
actualIndex++;
|
|
4467
|
+
found = true;
|
|
4468
|
+
break;
|
|
4469
|
+
}
|
|
4470
|
+
actualIndex++;
|
|
4471
|
+
}
|
|
4472
|
+
if (!found) {
|
|
4473
|
+
misses.push(`Expected ${expectedTool} at position ${i}, not found in remaining trace`);
|
|
4474
|
+
}
|
|
4475
|
+
}
|
|
4476
|
+
const score = hits.length / expected.length;
|
|
4477
|
+
return {
|
|
4478
|
+
score,
|
|
4479
|
+
verdict: scoreToVerdict(score),
|
|
4480
|
+
hits,
|
|
4481
|
+
misses,
|
|
4482
|
+
expectedAspectCount: expected.length
|
|
4483
|
+
};
|
|
4484
|
+
}
|
|
4485
|
+
evaluateExact(trace) {
|
|
4486
|
+
const expected = this.config.expected ?? [];
|
|
4487
|
+
if (expected.length === 0) {
|
|
4488
|
+
return {
|
|
4489
|
+
score: 1,
|
|
4490
|
+
verdict: "pass",
|
|
4491
|
+
hits: ["No tool sequence specified"],
|
|
4492
|
+
misses: [],
|
|
4493
|
+
expectedAspectCount: 0
|
|
4494
|
+
};
|
|
4495
|
+
}
|
|
4496
|
+
const actualToolCalls = trace.filter((e) => e.type === "tool_call" && e.name);
|
|
4497
|
+
const hits = [];
|
|
4498
|
+
const misses = [];
|
|
4499
|
+
if (actualToolCalls.length !== expected.length) {
|
|
4500
|
+
misses.push(`Expected ${expected.length} tool calls, got ${actualToolCalls.length}`);
|
|
4501
|
+
}
|
|
4502
|
+
const checkLength = Math.min(expected.length, actualToolCalls.length);
|
|
4503
|
+
for (let i = 0; i < checkLength; i++) {
|
|
4504
|
+
const expectedTool = expected[i].tool;
|
|
4505
|
+
const actualTool = actualToolCalls[i].name;
|
|
4506
|
+
if (actualTool === expectedTool) {
|
|
4507
|
+
hits.push(`Position ${i}: ${expectedTool} \u2713`);
|
|
4508
|
+
} else {
|
|
4509
|
+
misses.push(`Position ${i}: expected ${expectedTool}, got ${actualTool}`);
|
|
4510
|
+
}
|
|
4511
|
+
}
|
|
4512
|
+
for (let i = checkLength; i < expected.length; i++) {
|
|
4513
|
+
misses.push(`Position ${i}: expected ${expected[i].tool}, got nothing`);
|
|
4514
|
+
}
|
|
4515
|
+
const score = hits.length / expected.length;
|
|
4516
|
+
return {
|
|
4517
|
+
score,
|
|
4518
|
+
verdict: scoreToVerdict(score),
|
|
4519
|
+
hits,
|
|
4520
|
+
misses,
|
|
4521
|
+
expectedAspectCount: expected.length
|
|
4522
|
+
};
|
|
4523
|
+
}
|
|
4524
|
+
};
|
|
4525
|
+
/**
 * Evaluator that compares the tool calls recorded in a candidate trace with
 * the `tool_calls` declared on assistant messages in the eval case's
 * `expected_segments`.
 */
var ExpectedMessagesEvaluator = class {
  kind = "expected_messages";

  /**
   * Runs the evaluation.
   *
   * @param context Object providing `evalCase` (read for `expected_segments`)
   *   and `candidateTrace` (array of trace events, may be missing).
   * @returns Score object with `score`, `verdict`, `hits`, `misses` and
   *   `expectedAspectCount`.
   */
  evaluate(context) {
    const { candidateTrace, evalCase } = context;
    const expectedSegments = evalCase.expected_segments;
    const expectedToolCalls = this.extractExpectedToolCalls(expectedSegments);

    // No expectations declared: pass without inspecting the trace.
    if (expectedToolCalls.length === 0) {
      return {
        score: 1,
        verdict: "pass",
        hits: ["No tool_calls specified in expected_messages"],
        misses: [],
        expectedAspectCount: 1
      };
    }

    // Expectations exist but there is nothing to compare them with.
    if (!candidateTrace || candidateTrace.length === 0) {
      return {
        score: 0,
        verdict: "fail",
        hits: [],
        misses: ["No trace available to validate tool_calls"],
        expectedAspectCount: expectedToolCalls.length
      };
    }

    const actualToolCalls = candidateTrace.filter((e) => e.type === "tool_call");
    return this.validateToolCalls(expectedToolCalls, actualToolCalls);
  }

  /**
   * Collects `{ tool, input }` pairs from every assistant segment that has a
   * `tool_calls` array. Entries that are not objects with a string `tool`
   * property are skipped.
   *
   * @param segments Array of message segments, or a falsy value.
   * @returns Flat array of expected tool calls in declaration order.
   */
  extractExpectedToolCalls(segments) {
    if (!segments) {
      return [];
    }
    const toolCalls = [];
    for (const segment of segments) {
      const role = segment.role;
      const segmentToolCalls = segment.tool_calls;
      if (role === "assistant" && Array.isArray(segmentToolCalls)) {
        for (const tc of segmentToolCalls) {
          if (typeof tc === "object" && tc !== null && typeof tc.tool === "string") {
            toolCalls.push({ tool: tc.tool, input: tc.input });
          }
        }
      }
    }
    return toolCalls;
  }

  /**
   * Compares expected and actual tool calls index by index. A position is a
   * hit when the names match and, if the expectation carries an `input`, the
   * inputs are deeply equal. Actual calls beyond `expected.length` are not
   * examined.
   *
   * @param expected Array of `{ tool, input }` expectations.
   * @param actual Array of trace events (read for `name` and `input`).
   * @returns Score object; `score` is hits divided by the number of
   *   expectations (treated as 1 when there are none).
   */
  validateToolCalls(expected, actual) {
    const hits = [];
    const misses = [];
    for (let i = 0; i < expected.length; i++) {
      const expectedCall = expected[i];
      const actualCall = actual[i];
      if (!actualCall) {
        misses.push(
          `tool_calls[${i}]: expected ${expectedCall.tool}, but no more tool calls in trace`
        );
        continue;
      }
      if (actualCall.name !== expectedCall.tool) {
        misses.push(
          `tool_calls[${i}]: expected ${expectedCall.tool}, got ${actualCall.name ?? "unknown"}`
        );
        continue;
      }
      // Input is only compared when the expectation specifies one.
      if (expectedCall.input !== void 0) {
        if (!this.deepEquals(expectedCall.input, actualCall.input)) {
          misses.push(`tool_calls[${i}]: ${expectedCall.tool} input mismatch`);
          continue;
        }
      }
      hits.push(`tool_calls[${i}]: ${expectedCall.tool} matched`);
    }
    // Guard against division by zero when called with no expectations.
    const totalChecks = expected.length || 1;
    const score = hits.length / totalChecks;
    return {
      score,
      verdict: score >= 0.8 ? "pass" : score >= 0.6 ? "borderline" : "fail",
      hits,
      misses,
      expectedAspectCount: totalChecks
    };
  }

  /**
   * Structural equality for plain values, arrays and plain objects.
   * Primitives are compared with `===`; arrays element by element; objects by
   * their own enumerable string keys, independent of key order.
   *
   * @param a First value.
   * @param b Second value.
   * @returns `true` when both values have the same structure and contents.
   */
  deepEquals(a, b) {
    if (a === b) return true;
    if (typeof a !== typeof b) return false;
    if (typeof a !== "object" || a === null || b === null) return false;
    if (Array.isArray(a) && Array.isArray(b)) {
      if (a.length !== b.length) return false;
      return a.every((val, i) => this.deepEquals(val, b[i]));
    }
    if (Array.isArray(a) || Array.isArray(b)) return false;
    const aKeys = Object.keys(a);
    const bKeys = Object.keys(b);
    if (aKeys.length !== bKeys.length) return false;
    // Equal key counts do not imply equal key sets: without the ownership
    // check, {a: undefined} and {b: undefined} would compare as equal because
    // reading a missing key also yields undefined.
    return aKeys.every(
      (key) => Object.prototype.hasOwnProperty.call(b, key) && this.deepEquals(a[key], b[key])
    );
  }
};
|
|
4124
4624
|
var DEFAULT_COMPOSITE_AGGREGATOR_PROMPT = `Review the following evaluation results:
|
|
4125
4625
|
{{EVALUATOR_RESULTS_JSON}}
|
|
4126
4626
|
|
|
@@ -4347,7 +4847,7 @@ var CompositeEvaluator = class {
|
|
|
4347
4847
|
// src/evaluation/orchestrator.ts
|
|
4348
4848
|
var import_node_crypto2 = require("crypto");
|
|
4349
4849
|
var import_promises11 = require("fs/promises");
|
|
4350
|
-
var
|
|
4850
|
+
var import_node_path14 = __toESM(require("path"), 1);
|
|
4351
4851
|
|
|
4352
4852
|
// ../../node_modules/.bun/yocto-queue@1.2.2/node_modules/yocto-queue/index.js
|
|
4353
4853
|
var Node = class {
|
|
@@ -4554,7 +5054,7 @@ async function runEvaluation(options) {
|
|
|
4554
5054
|
if (!definition) {
|
|
4555
5055
|
return void 0;
|
|
4556
5056
|
}
|
|
4557
|
-
const resolved = resolveTargetDefinition(definition, envLookup);
|
|
5057
|
+
const resolved = resolveTargetDefinition(definition, envLookup, evalFilePath);
|
|
4558
5058
|
resolvedTargetsByName.set(name, resolved);
|
|
4559
5059
|
return resolved;
|
|
4560
5060
|
};
|
|
@@ -4868,6 +5368,17 @@ async function runEvalCase(options) {
|
|
|
4868
5368
|
if (cacheKey && cache && !cachedResponse) {
|
|
4869
5369
|
await cache.set(cacheKey, providerResponse);
|
|
4870
5370
|
}
|
|
5371
|
+
let candidateTrace = providerResponse.trace;
|
|
5372
|
+
if (!candidateTrace && providerResponse.traceRef) {
|
|
5373
|
+
try {
|
|
5374
|
+
const rawTrace = await readJsonFile(providerResponse.traceRef);
|
|
5375
|
+
if (Array.isArray(rawTrace) && rawTrace.every(isTraceEvent)) {
|
|
5376
|
+
candidateTrace = rawTrace;
|
|
5377
|
+
}
|
|
5378
|
+
} catch {
|
|
5379
|
+
}
|
|
5380
|
+
}
|
|
5381
|
+
const candidateTraceSummary = candidateTrace ? computeTraceSummary(candidateTrace) : void 0;
|
|
4871
5382
|
try {
|
|
4872
5383
|
return await evaluateCandidate({
|
|
4873
5384
|
evalCase,
|
|
@@ -4879,7 +5390,9 @@ async function runEvalCase(options) {
|
|
|
4879
5390
|
nowFn,
|
|
4880
5391
|
attempt,
|
|
4881
5392
|
judgeProvider,
|
|
4882
|
-
agentTimeoutMs
|
|
5393
|
+
agentTimeoutMs,
|
|
5394
|
+
candidateTrace,
|
|
5395
|
+
candidateTraceSummary
|
|
4883
5396
|
});
|
|
4884
5397
|
} catch (error) {
|
|
4885
5398
|
return buildErrorResult(evalCase, target.name, nowFn(), error, promptInputs, provider);
|
|
@@ -4896,7 +5409,9 @@ async function evaluateCandidate(options) {
|
|
|
4896
5409
|
nowFn,
|
|
4897
5410
|
attempt,
|
|
4898
5411
|
judgeProvider,
|
|
4899
|
-
agentTimeoutMs
|
|
5412
|
+
agentTimeoutMs,
|
|
5413
|
+
candidateTrace,
|
|
5414
|
+
candidateTraceSummary
|
|
4900
5415
|
} = options;
|
|
4901
5416
|
const gradeTimestamp = nowFn();
|
|
4902
5417
|
const { score, evaluatorResults } = await runEvaluatorsForCase({
|
|
@@ -4909,7 +5424,9 @@ async function evaluateCandidate(options) {
|
|
|
4909
5424
|
promptInputs,
|
|
4910
5425
|
now: gradeTimestamp,
|
|
4911
5426
|
judgeProvider,
|
|
4912
|
-
agentTimeoutMs
|
|
5427
|
+
agentTimeoutMs,
|
|
5428
|
+
candidateTrace,
|
|
5429
|
+
candidateTraceSummary
|
|
4913
5430
|
});
|
|
4914
5431
|
const completedAt = nowFn();
|
|
4915
5432
|
let agentProviderRequest;
|
|
@@ -4922,14 +5439,12 @@ async function evaluateCandidate(options) {
|
|
|
4922
5439
|
} else {
|
|
4923
5440
|
if (promptInputs.chatPrompt) {
|
|
4924
5441
|
lmProviderRequest = {
|
|
4925
|
-
chat_prompt: promptInputs.chatPrompt
|
|
4926
|
-
guideline_paths: evalCase.guideline_paths
|
|
5442
|
+
chat_prompt: promptInputs.chatPrompt
|
|
4927
5443
|
};
|
|
4928
5444
|
} else {
|
|
4929
5445
|
lmProviderRequest = {
|
|
4930
5446
|
question: promptInputs.question,
|
|
4931
|
-
guidelines: promptInputs.guidelines
|
|
4932
|
-
guideline_paths: evalCase.guideline_paths
|
|
5447
|
+
guidelines: promptInputs.guidelines
|
|
4933
5448
|
};
|
|
4934
5449
|
}
|
|
4935
5450
|
}
|
|
@@ -4948,7 +5463,8 @@ async function evaluateCandidate(options) {
|
|
|
4948
5463
|
agent_provider_request: agentProviderRequest,
|
|
4949
5464
|
lm_provider_request: lmProviderRequest,
|
|
4950
5465
|
evaluator_provider_request: evaluatorResults ? void 0 : score.evaluatorRawRequest,
|
|
4951
|
-
evaluator_results: evaluatorResults
|
|
5466
|
+
evaluator_results: evaluatorResults,
|
|
5467
|
+
trace_summary: candidateTraceSummary
|
|
4952
5468
|
};
|
|
4953
5469
|
}
|
|
4954
5470
|
async function runEvaluatorsForCase(options) {
|
|
@@ -4962,7 +5478,9 @@ async function runEvaluatorsForCase(options) {
|
|
|
4962
5478
|
promptInputs,
|
|
4963
5479
|
now,
|
|
4964
5480
|
judgeProvider,
|
|
4965
|
-
agentTimeoutMs
|
|
5481
|
+
agentTimeoutMs,
|
|
5482
|
+
candidateTrace,
|
|
5483
|
+
candidateTraceSummary
|
|
4966
5484
|
} = options;
|
|
4967
5485
|
if (evalCase.evaluators && evalCase.evaluators.length > 0) {
|
|
4968
5486
|
return runEvaluatorList({
|
|
@@ -4976,7 +5494,9 @@ async function runEvaluatorsForCase(options) {
|
|
|
4976
5494
|
promptInputs,
|
|
4977
5495
|
now,
|
|
4978
5496
|
judgeProvider,
|
|
4979
|
-
agentTimeoutMs
|
|
5497
|
+
agentTimeoutMs,
|
|
5498
|
+
candidateTrace,
|
|
5499
|
+
candidateTraceSummary
|
|
4980
5500
|
});
|
|
4981
5501
|
}
|
|
4982
5502
|
const evaluatorKind = evalCase.evaluator ?? "llm_judge";
|
|
@@ -4992,7 +5512,9 @@ async function runEvaluatorsForCase(options) {
|
|
|
4992
5512
|
attempt,
|
|
4993
5513
|
promptInputs,
|
|
4994
5514
|
now,
|
|
4995
|
-
judgeProvider
|
|
5515
|
+
judgeProvider,
|
|
5516
|
+
candidateTrace,
|
|
5517
|
+
candidateTraceSummary
|
|
4996
5518
|
});
|
|
4997
5519
|
return { score };
|
|
4998
5520
|
}
|
|
@@ -5008,7 +5530,9 @@ async function runEvaluatorList(options) {
|
|
|
5008
5530
|
promptInputs,
|
|
5009
5531
|
now,
|
|
5010
5532
|
judgeProvider,
|
|
5011
|
-
agentTimeoutMs
|
|
5533
|
+
agentTimeoutMs,
|
|
5534
|
+
candidateTrace,
|
|
5535
|
+
candidateTraceSummary
|
|
5012
5536
|
} = options;
|
|
5013
5537
|
const scored = [];
|
|
5014
5538
|
const evaluatorResults = [];
|
|
@@ -5027,11 +5551,13 @@ async function runEvaluatorList(options) {
|
|
|
5027
5551
|
now,
|
|
5028
5552
|
judgeProvider
|
|
5029
5553
|
});
|
|
5030
|
-
|
|
5554
|
+
const weight = evaluator.weight ?? 1;
|
|
5555
|
+
scored.push({ score: score2, name: evaluator.name, type: evaluator.type, weight });
|
|
5031
5556
|
evaluatorResults.push({
|
|
5032
5557
|
name: evaluator.name,
|
|
5033
5558
|
type: evaluator.type,
|
|
5034
5559
|
score: score2.score,
|
|
5560
|
+
weight,
|
|
5035
5561
|
verdict: score2.verdict,
|
|
5036
5562
|
hits: score2.hits,
|
|
5037
5563
|
misses: score2.misses,
|
|
@@ -5054,11 +5580,13 @@ async function runEvaluatorList(options) {
|
|
|
5054
5580
|
promptInputs,
|
|
5055
5581
|
now
|
|
5056
5582
|
});
|
|
5057
|
-
|
|
5583
|
+
const weight = evaluator.weight ?? 1;
|
|
5584
|
+
scored.push({ score: score2, name: evaluator.name, type: "code_judge", weight });
|
|
5058
5585
|
evaluatorResults.push({
|
|
5059
5586
|
name: evaluator.name,
|
|
5060
5587
|
type: "code_judge",
|
|
5061
5588
|
score: score2.score,
|
|
5589
|
+
weight,
|
|
5062
5590
|
verdict: score2.verdict,
|
|
5063
5591
|
hits: score2.hits,
|
|
5064
5592
|
misses: score2.misses,
|
|
@@ -5067,7 +5595,7 @@ async function runEvaluatorList(options) {
|
|
|
5067
5595
|
});
|
|
5068
5596
|
}
|
|
5069
5597
|
if (evaluator.type === "composite") {
|
|
5070
|
-
const evalFileDir = evalCase.guideline_paths[0] ?
|
|
5598
|
+
const evalFileDir = evalCase.guideline_paths[0] ? import_node_path14.default.dirname(evalCase.guideline_paths[0]) : process.cwd();
|
|
5071
5599
|
const createEvaluator = (memberConfig) => {
|
|
5072
5600
|
switch (memberConfig.type) {
|
|
5073
5601
|
case "llm_judge":
|
|
@@ -5084,6 +5612,12 @@ async function runEvaluatorList(options) {
|
|
|
5084
5612
|
cwd: evalFileDir,
|
|
5085
5613
|
evaluatorFactory: { create: createEvaluator }
|
|
5086
5614
|
});
|
|
5615
|
+
case "tool_trajectory":
|
|
5616
|
+
return new ToolTrajectoryEvaluator({
|
|
5617
|
+
config: memberConfig
|
|
5618
|
+
});
|
|
5619
|
+
case "expected_messages":
|
|
5620
|
+
return new ExpectedMessagesEvaluator();
|
|
5087
5621
|
default: {
|
|
5088
5622
|
const unknownConfig = memberConfig;
|
|
5089
5623
|
throw new Error(`Unsupported evaluator type in composite: ${unknownConfig.type}`);
|
|
@@ -5105,11 +5639,13 @@ async function runEvaluatorList(options) {
|
|
|
5105
5639
|
now,
|
|
5106
5640
|
judgeProvider
|
|
5107
5641
|
});
|
|
5108
|
-
|
|
5642
|
+
const weight = evaluator.weight ?? 1;
|
|
5643
|
+
scored.push({ score: score2, name: evaluator.name, type: evaluator.type, weight });
|
|
5109
5644
|
evaluatorResults.push({
|
|
5110
5645
|
name: evaluator.name,
|
|
5111
5646
|
type: evaluator.type,
|
|
5112
5647
|
score: score2.score,
|
|
5648
|
+
weight,
|
|
5113
5649
|
verdict: score2.verdict,
|
|
5114
5650
|
hits: score2.hits,
|
|
5115
5651
|
misses: score2.misses,
|
|
@@ -5118,6 +5654,60 @@ async function runEvaluatorList(options) {
|
|
|
5118
5654
|
evaluator_results: mapChildResults(score2.evaluatorResults)
|
|
5119
5655
|
});
|
|
5120
5656
|
}
|
|
5657
|
+
if (evaluator.type === "tool_trajectory") {
|
|
5658
|
+
const trajectoryEvaluator = new ToolTrajectoryEvaluator({
|
|
5659
|
+
config: evaluator
|
|
5660
|
+
});
|
|
5661
|
+
const score2 = trajectoryEvaluator.evaluate({
|
|
5662
|
+
evalCase,
|
|
5663
|
+
candidate,
|
|
5664
|
+
target,
|
|
5665
|
+
provider,
|
|
5666
|
+
attempt,
|
|
5667
|
+
promptInputs,
|
|
5668
|
+
now,
|
|
5669
|
+
candidateTrace,
|
|
5670
|
+
candidateTraceSummary
|
|
5671
|
+
});
|
|
5672
|
+
const weight = evaluator.weight ?? 1;
|
|
5673
|
+
scored.push({ score: score2, name: evaluator.name, type: evaluator.type, weight });
|
|
5674
|
+
evaluatorResults.push({
|
|
5675
|
+
name: evaluator.name,
|
|
5676
|
+
type: evaluator.type,
|
|
5677
|
+
score: score2.score,
|
|
5678
|
+
weight,
|
|
5679
|
+
verdict: score2.verdict,
|
|
5680
|
+
hits: score2.hits,
|
|
5681
|
+
misses: score2.misses,
|
|
5682
|
+
reasoning: score2.reasoning
|
|
5683
|
+
});
|
|
5684
|
+
}
|
|
5685
|
+
if (evaluator.type === "expected_messages") {
|
|
5686
|
+
const expectedMessagesEvaluator = new ExpectedMessagesEvaluator();
|
|
5687
|
+
const score2 = expectedMessagesEvaluator.evaluate({
|
|
5688
|
+
evalCase,
|
|
5689
|
+
candidate,
|
|
5690
|
+
target,
|
|
5691
|
+
provider,
|
|
5692
|
+
attempt,
|
|
5693
|
+
promptInputs,
|
|
5694
|
+
now,
|
|
5695
|
+
candidateTrace,
|
|
5696
|
+
candidateTraceSummary
|
|
5697
|
+
});
|
|
5698
|
+
const weight = evaluator.weight ?? 1;
|
|
5699
|
+
scored.push({ score: score2, name: evaluator.name, type: evaluator.type, weight });
|
|
5700
|
+
evaluatorResults.push({
|
|
5701
|
+
name: evaluator.name,
|
|
5702
|
+
type: evaluator.type,
|
|
5703
|
+
score: score2.score,
|
|
5704
|
+
weight,
|
|
5705
|
+
verdict: score2.verdict,
|
|
5706
|
+
hits: score2.hits,
|
|
5707
|
+
misses: score2.misses,
|
|
5708
|
+
reasoning: score2.reasoning
|
|
5709
|
+
});
|
|
5710
|
+
}
|
|
5121
5711
|
} catch (error) {
|
|
5122
5712
|
const message = error instanceof Error ? error.message : String(error);
|
|
5123
5713
|
const fallbackScore = {
|
|
@@ -5129,15 +5719,18 @@ async function runEvaluatorList(options) {
|
|
|
5129
5719
|
reasoning: message
|
|
5130
5720
|
};
|
|
5131
5721
|
const resultType = evaluator.type === "code" ? "code_judge" : evaluator.type;
|
|
5722
|
+
const weight = evaluator.weight ?? 1;
|
|
5132
5723
|
scored.push({
|
|
5133
5724
|
score: fallbackScore,
|
|
5134
5725
|
name: evaluator.name ?? "unknown",
|
|
5135
|
-
type: resultType ?? "llm_judge"
|
|
5726
|
+
type: resultType ?? "llm_judge",
|
|
5727
|
+
weight
|
|
5136
5728
|
});
|
|
5137
5729
|
evaluatorResults.push({
|
|
5138
5730
|
name: evaluator.name ?? "unknown",
|
|
5139
5731
|
type: resultType ?? "llm_judge",
|
|
5140
5732
|
score: 0,
|
|
5733
|
+
weight,
|
|
5141
5734
|
verdict: "fail",
|
|
5142
5735
|
hits: [],
|
|
5143
5736
|
misses: [`Evaluator '${evaluator.name ?? "unknown"}' failed: ${message}`],
|
|
@@ -5145,7 +5738,9 @@ async function runEvaluatorList(options) {
|
|
|
5145
5738
|
});
|
|
5146
5739
|
}
|
|
5147
5740
|
}
|
|
5148
|
-
const aggregateScore = scored.length > 0 ?
|
|
5741
|
+
const aggregateScore = scored.length > 0 ? computeWeightedMean(
|
|
5742
|
+
scored.map((entry) => ({ score: entry.score.score, weight: entry.weight }))
|
|
5743
|
+
) : 0;
|
|
5149
5744
|
const hits = scored.flatMap((entry) => entry.score.hits);
|
|
5150
5745
|
const misses = scored.flatMap((entry) => entry.score.misses);
|
|
5151
5746
|
const expectedAspectCount = scored.reduce(
|
|
@@ -5240,8 +5835,8 @@ function buildEvaluatorRegistry(overrides, resolveJudgeProvider) {
|
|
|
5240
5835
|
async function dumpPrompt(directory, evalCase, promptInputs) {
|
|
5241
5836
|
const timestamp = (/* @__PURE__ */ new Date()).toISOString().replace(/[:.]/g, "-");
|
|
5242
5837
|
const filename = `${timestamp}_${sanitizeFilename(evalCase.id)}.json`;
|
|
5243
|
-
const filePath =
|
|
5244
|
-
await (0, import_promises11.mkdir)(
|
|
5838
|
+
const filePath = import_node_path14.default.resolve(directory, filename);
|
|
5839
|
+
await (0, import_promises11.mkdir)(import_node_path14.default.dirname(filePath), { recursive: true });
|
|
5245
5840
|
const payload = {
|
|
5246
5841
|
eval_id: evalCase.id,
|
|
5247
5842
|
question: promptInputs.question,
|
|
@@ -5371,6 +5966,16 @@ function mapChildResults(children) {
|
|
|
5371
5966
|
evaluator_results: mapChildResults(child.evaluatorResults)
|
|
5372
5967
|
}));
|
|
5373
5968
|
}
|
|
5969
|
+
/**
 * Computes the weighted arithmetic mean of a list of scores.
 *
 * @param entries Iterable of `{ score, weight }` objects; a `null` or
 *   `undefined` weight is treated as 1.
 * @returns Sum of `score * weight` divided by the sum of weights, or 0 when
 *   the weight total is not greater than zero (including an empty list).
 */
function computeWeightedMean(entries) {
  // Resolve each entry to [weight, weighted score] once, then total both columns.
  const pairs = Array.from(entries, (item) => {
    const resolvedWeight = item.weight ?? 1;
    return [resolvedWeight, item.score * resolvedWeight];
  });
  const weightTotal = pairs.reduce((sum, [w]) => sum + w, 0);
  const scoreTotal = pairs.reduce((sum, [, contribution]) => sum + contribution, 0);
  if (weightTotal > 0) {
    return scoreTotal / weightTotal;
  }
  return 0;
}
|
|
5374
5979
|
|
|
5375
5980
|
// src/evaluation/generators/rubric-generator.ts
|
|
5376
5981
|
var import_ai3 = require("ai");
|
|
@@ -5460,11 +6065,14 @@ function createAgentKernel() {
|
|
|
5460
6065
|
0 && (module.exports = {
|
|
5461
6066
|
CodeEvaluator,
|
|
5462
6067
|
CompositeEvaluator,
|
|
6068
|
+
ExpectedMessagesEvaluator,
|
|
5463
6069
|
LlmJudgeEvaluator,
|
|
5464
6070
|
TEST_MESSAGE_ROLES,
|
|
6071
|
+
ToolTrajectoryEvaluator,
|
|
5465
6072
|
buildDirectoryChain,
|
|
5466
6073
|
buildPromptInputs,
|
|
5467
6074
|
buildSearchRoots,
|
|
6075
|
+
computeTraceSummary,
|
|
5468
6076
|
consumeCodexLogEntries,
|
|
5469
6077
|
createAgentKernel,
|
|
5470
6078
|
createProvider,
|
|
@@ -5475,14 +6083,18 @@ function createAgentKernel() {
|
|
|
5475
6083
|
generateRubrics,
|
|
5476
6084
|
getHitCount,
|
|
5477
6085
|
isEvaluatorKind,
|
|
6086
|
+
isExpectedToolCall,
|
|
5478
6087
|
isGuidelineFile,
|
|
5479
6088
|
isJsonObject,
|
|
5480
6089
|
isJsonValue,
|
|
5481
6090
|
isTestMessage,
|
|
5482
6091
|
isTestMessageRole,
|
|
6092
|
+
isTraceEvent,
|
|
6093
|
+
isTraceEventType,
|
|
5483
6094
|
listTargetNames,
|
|
5484
6095
|
loadEvalCases,
|
|
5485
6096
|
normalizeLineEndings,
|
|
6097
|
+
readJsonFile,
|
|
5486
6098
|
readTargetDefinitions,
|
|
5487
6099
|
readTestSuiteMetadata,
|
|
5488
6100
|
readTextFile,
|