@agentv/core 0.22.2 → 0.25.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/dist/{chunk-B2J23S7D.js → chunk-OYTL3LNN.js} +24 -16
- package/dist/chunk-OYTL3LNN.js.map +1 -0
- package/dist/evaluation/validation/index.cjs +64 -17
- package/dist/evaluation/validation/index.cjs.map +1 -1
- package/dist/evaluation/validation/index.js +48 -2
- package/dist/evaluation/validation/index.js.map +1 -1
- package/dist/index.cjs +994 -50
- package/dist/index.cjs.map +1 -1
- package/dist/index.d.cts +205 -4
- package/dist/index.d.ts +205 -4
- package/dist/index.js +953 -23
- package/dist/index.js.map +1 -1
- package/package.json +3 -4
- package/dist/chunk-B2J23S7D.js.map +0 -1
package/dist/index.cjs
CHANGED
|
@@ -31,11 +31,15 @@ var __toCommonJS = (mod) => __copyProps(__defProp({}, "__esModule", { value: tru
|
|
|
31
31
|
var index_exports = {};
|
|
32
32
|
__export(index_exports, {
|
|
33
33
|
CodeEvaluator: () => CodeEvaluator,
|
|
34
|
+
CompositeEvaluator: () => CompositeEvaluator,
|
|
35
|
+
ExpectedMessagesEvaluator: () => ExpectedMessagesEvaluator,
|
|
34
36
|
LlmJudgeEvaluator: () => LlmJudgeEvaluator,
|
|
35
37
|
TEST_MESSAGE_ROLES: () => TEST_MESSAGE_ROLES,
|
|
38
|
+
ToolTrajectoryEvaluator: () => ToolTrajectoryEvaluator,
|
|
36
39
|
buildDirectoryChain: () => buildDirectoryChain2,
|
|
37
40
|
buildPromptInputs: () => buildPromptInputs,
|
|
38
41
|
buildSearchRoots: () => buildSearchRoots2,
|
|
42
|
+
computeTraceSummary: () => computeTraceSummary,
|
|
39
43
|
consumeCodexLogEntries: () => consumeCodexLogEntries,
|
|
40
44
|
createAgentKernel: () => createAgentKernel,
|
|
41
45
|
createProvider: () => createProvider,
|
|
@@ -46,14 +50,18 @@ __export(index_exports, {
|
|
|
46
50
|
generateRubrics: () => generateRubrics,
|
|
47
51
|
getHitCount: () => getHitCount,
|
|
48
52
|
isEvaluatorKind: () => isEvaluatorKind,
|
|
53
|
+
isExpectedToolCall: () => isExpectedToolCall,
|
|
49
54
|
isGuidelineFile: () => isGuidelineFile,
|
|
50
55
|
isJsonObject: () => isJsonObject,
|
|
51
56
|
isJsonValue: () => isJsonValue,
|
|
52
57
|
isTestMessage: () => isTestMessage,
|
|
53
58
|
isTestMessageRole: () => isTestMessageRole,
|
|
59
|
+
isTraceEvent: () => isTraceEvent,
|
|
60
|
+
isTraceEventType: () => isTraceEventType,
|
|
54
61
|
listTargetNames: () => listTargetNames,
|
|
55
62
|
loadEvalCases: () => loadEvalCases,
|
|
56
63
|
normalizeLineEndings: () => normalizeLineEndings,
|
|
64
|
+
readJsonFile: () => readJsonFile,
|
|
57
65
|
readTargetDefinitions: () => readTargetDefinitions,
|
|
58
66
|
readTestSuiteMetadata: () => readTestSuiteMetadata,
|
|
59
67
|
readTextFile: () => readTextFile,
|
|
@@ -107,7 +115,14 @@ function isTestMessage(value) {
|
|
|
107
115
|
}
|
|
108
116
|
return candidate.content.every(isJsonObject);
|
|
109
117
|
}
|
|
110
|
-
var EVALUATOR_KIND_VALUES = [
|
|
118
|
+
var EVALUATOR_KIND_VALUES = [
|
|
119
|
+
"code_judge",
|
|
120
|
+
"llm_judge",
|
|
121
|
+
"rubric",
|
|
122
|
+
"composite",
|
|
123
|
+
"tool_trajectory",
|
|
124
|
+
"expected_messages"
|
|
125
|
+
];
|
|
111
126
|
var EVALUATOR_KIND_SET = new Set(EVALUATOR_KIND_VALUES);
|
|
112
127
|
function isEvaluatorKind(value) {
|
|
113
128
|
return typeof value === "string" && EVALUATOR_KIND_SET.has(value);
|
|
@@ -116,6 +131,44 @@ function getHitCount(result) {
|
|
|
116
131
|
return result.hits.length;
|
|
117
132
|
}
|
|
118
133
|
|
|
134
|
+
// src/evaluation/trace.ts
|
|
135
|
+
// Known trace event discriminators. Built once at module load so the guard
// below is a constant-time Set lookup instead of allocating and scanning a
// fresh array on every call (mirrors EVALUATOR_KIND_VALUES / EVALUATOR_KIND_SET).
var TRACE_EVENT_TYPE_VALUES = ["model_step", "tool_call", "tool_result", "message", "error"];
var TRACE_EVENT_TYPE_SET = new Set(TRACE_EVENT_TYPE_VALUES);
/**
 * Type guard for the `type` discriminator of a trace event.
 *
 * @param {unknown} value - Candidate value to test.
 * @returns {boolean} True only when `value` is a string equal to one of
 *   "model_step", "tool_call", "tool_result", "message" or "error".
 */
function isTraceEventType(value) {
  return typeof value === "string" && TRACE_EVENT_TYPE_SET.has(value);
}
|
|
138
|
+
/**
 * Type guard for a single trace event.
 *
 * Only two fields are inspected: `type` must pass isTraceEventType and
 * `timestamp` must be a string. Any other properties are ignored.
 *
 * @param {unknown} value - Candidate value to test.
 * @returns {boolean} True when `value` is a non-null object with a valid
 *   `type` and a string `timestamp`.
 */
function isTraceEvent(value) {
  if (value === null || typeof value !== "object") {
    return false;
  }
  // Check the discriminator first; the timestamp is only looked at for known types.
  if (!isTraceEventType(value.type)) {
    return false;
  }
  return typeof value.timestamp === "string";
}
|
|
145
|
+
/**
 * Type guard for an expected tool call entry.
 *
 * @param {unknown} value - Candidate value to test.
 * @returns {boolean} True when `value` is a non-null object whose `tool`
 *   property is a string; no other properties are checked.
 */
function isExpectedToolCall(value) {
  const isObjectLike = value !== null && typeof value === "object";
  return isObjectLike && typeof value.tool === "string";
}
|
|
152
|
+
/**
 * Build an aggregate summary of a trace.
 *
 * @param {Array<{type: string, name?: string}>} trace - Trace events to summarise.
 * @returns {{eventCount: number, toolNames: string[], toolCallsByName: Object<string, number>, errorCount: number}}
 *   `eventCount` is the total number of events; `toolCallsByName` maps each
 *   tool name to the number of "tool_call" events carrying that name;
 *   `toolNames` is the sorted list of those names; `errorCount` is the number
 *   of "error" events.
 */
function computeTraceSummary(trace) {
  // Count in a Map rather than a plain object: with `{}` a tool named
  // "constructor", "toString" or "__proto__" would read the inherited
  // Object.prototype member through `?? 0` and produce a garbage count
  // (or, for "__proto__", silently drop the tool).
  const counts = new Map();
  let errorCount = 0;
  for (const event of trace) {
    if (event.type === "tool_call" && event.name) {
      // Keys are stringified to match plain-object key semantics.
      const key = String(event.name);
      counts.set(key, (counts.get(key) ?? 0) + 1);
    } else if (event.type === "error") {
      errorCount++;
    }
  }
  // Object.fromEntries defines own data properties, so even "__proto__"
  // becomes a regular key on the resulting plain object.
  const toolCallsByName = Object.fromEntries(counts);
  const toolNames = Object.keys(toolCallsByName).sort();
  return {
    eventCount: trace.length,
    toolNames,
    toolCallsByName,
    errorCount
  };
}
|
|
171
|
+
|
|
119
172
|
// src/evaluation/yaml-parser.ts
|
|
120
173
|
var import_promises6 = require("fs/promises");
|
|
121
174
|
var import_node_path6 = __toESM(require("path"), 1);
|
|
@@ -459,10 +512,10 @@ async function parseEvaluators(rawEvalCase, globalExecution, searchRoots, evalId
|
|
|
459
512
|
logWarning2(`Skipping evaluator with invalid name/type in '${evalId}'`);
|
|
460
513
|
continue;
|
|
461
514
|
}
|
|
462
|
-
if (typeValue === "
|
|
515
|
+
if (typeValue === "code_judge") {
|
|
463
516
|
const script = asString2(rawEvaluator.script);
|
|
464
517
|
if (!script) {
|
|
465
|
-
logWarning2(`Skipping
|
|
518
|
+
logWarning2(`Skipping code_judge evaluator '${name}' in '${evalId}': missing script`);
|
|
466
519
|
continue;
|
|
467
520
|
}
|
|
468
521
|
const cwd = asString2(rawEvaluator.cwd);
|
|
@@ -473,7 +526,7 @@ async function parseEvaluators(rawEvalCase, globalExecution, searchRoots, evalId
|
|
|
473
526
|
resolvedCwd = import_node_path3.default.resolve(resolved.resolvedPath);
|
|
474
527
|
} else {
|
|
475
528
|
logWarning2(
|
|
476
|
-
`
|
|
529
|
+
`Code_judge evaluator '${name}' in '${evalId}': cwd not found (${resolved.displayPath})`,
|
|
477
530
|
resolved.attempted.length > 0 ? resolved.attempted.map((attempt) => ` Tried: ${attempt}`) : void 0
|
|
478
531
|
);
|
|
479
532
|
}
|
|
@@ -489,6 +542,174 @@ async function parseEvaluators(rawEvalCase, globalExecution, searchRoots, evalId
|
|
|
489
542
|
});
|
|
490
543
|
continue;
|
|
491
544
|
}
|
|
545
|
+
if (typeValue === "composite") {
|
|
546
|
+
const rawMembers = rawEvaluator.evaluators;
|
|
547
|
+
if (!Array.isArray(rawMembers)) {
|
|
548
|
+
logWarning2(
|
|
549
|
+
`Skipping composite evaluator '${name}' in '${evalId}': missing evaluators array`
|
|
550
|
+
);
|
|
551
|
+
continue;
|
|
552
|
+
}
|
|
553
|
+
const rawAggregator = rawEvaluator.aggregator;
|
|
554
|
+
if (!isJsonObject2(rawAggregator)) {
|
|
555
|
+
logWarning2(`Skipping composite evaluator '${name}' in '${evalId}': missing aggregator`);
|
|
556
|
+
continue;
|
|
557
|
+
}
|
|
558
|
+
const aggregatorType = asString2(rawAggregator.type);
|
|
559
|
+
if (aggregatorType !== "weighted_average" && aggregatorType !== "code_judge" && aggregatorType !== "llm_judge") {
|
|
560
|
+
logWarning2(
|
|
561
|
+
`Skipping composite evaluator '${name}' in '${evalId}': invalid aggregator type '${aggregatorType}'`
|
|
562
|
+
);
|
|
563
|
+
continue;
|
|
564
|
+
}
|
|
565
|
+
const memberEvaluators = [];
|
|
566
|
+
for (const rawMember of rawMembers) {
|
|
567
|
+
if (!isJsonObject2(rawMember)) {
|
|
568
|
+
logWarning2(`Skipping invalid member evaluator in composite '${name}' (expected object)`);
|
|
569
|
+
continue;
|
|
570
|
+
}
|
|
571
|
+
const memberName = asString2(rawMember.name);
|
|
572
|
+
const memberType = rawMember.type;
|
|
573
|
+
if (!memberName || !isEvaluatorKind(memberType)) {
|
|
574
|
+
logWarning2(`Skipping member evaluator with invalid name/type in composite '${name}'`);
|
|
575
|
+
continue;
|
|
576
|
+
}
|
|
577
|
+
const memberConfigs = await parseEvaluators(
|
|
578
|
+
{ evaluators: [rawMember] },
|
|
579
|
+
void 0,
|
|
580
|
+
searchRoots,
|
|
581
|
+
`${evalId}:${name}:${memberName}`
|
|
582
|
+
);
|
|
583
|
+
if (memberConfigs && memberConfigs.length > 0) {
|
|
584
|
+
memberEvaluators.push(memberConfigs[0]);
|
|
585
|
+
}
|
|
586
|
+
}
|
|
587
|
+
if (memberEvaluators.length === 0) {
|
|
588
|
+
logWarning2(
|
|
589
|
+
`Skipping composite evaluator '${name}' in '${evalId}': no valid member evaluators`
|
|
590
|
+
);
|
|
591
|
+
continue;
|
|
592
|
+
}
|
|
593
|
+
let aggregator;
|
|
594
|
+
if (aggregatorType === "weighted_average") {
|
|
595
|
+
const weights = isJsonObject2(rawAggregator.weights) ? rawAggregator.weights : void 0;
|
|
596
|
+
const parsedWeights = {};
|
|
597
|
+
if (weights) {
|
|
598
|
+
for (const [key, value] of Object.entries(weights)) {
|
|
599
|
+
if (typeof value === "number") {
|
|
600
|
+
parsedWeights[key] = value;
|
|
601
|
+
}
|
|
602
|
+
}
|
|
603
|
+
}
|
|
604
|
+
aggregator = {
|
|
605
|
+
type: "weighted_average",
|
|
606
|
+
...Object.keys(parsedWeights).length > 0 ? { weights: parsedWeights } : {}
|
|
607
|
+
};
|
|
608
|
+
} else if (aggregatorType === "code_judge") {
|
|
609
|
+
const aggregatorPath = asString2(rawAggregator.path);
|
|
610
|
+
if (!aggregatorPath) {
|
|
611
|
+
logWarning2(
|
|
612
|
+
`Skipping composite evaluator '${name}' in '${evalId}': code_judge aggregator missing path`
|
|
613
|
+
);
|
|
614
|
+
continue;
|
|
615
|
+
}
|
|
616
|
+
aggregator = {
|
|
617
|
+
type: "code_judge",
|
|
618
|
+
path: aggregatorPath,
|
|
619
|
+
cwd: searchRoots[0]
|
|
620
|
+
};
|
|
621
|
+
} else {
|
|
622
|
+
const aggregatorPrompt = asString2(rawAggregator.prompt);
|
|
623
|
+
let promptPath2;
|
|
624
|
+
if (aggregatorPrompt) {
|
|
625
|
+
const resolved = await resolveFileReference(aggregatorPrompt, searchRoots);
|
|
626
|
+
if (resolved.resolvedPath) {
|
|
627
|
+
promptPath2 = import_node_path3.default.resolve(resolved.resolvedPath);
|
|
628
|
+
}
|
|
629
|
+
}
|
|
630
|
+
aggregator = {
|
|
631
|
+
type: "llm_judge",
|
|
632
|
+
...aggregatorPrompt ? { prompt: aggregatorPrompt } : {},
|
|
633
|
+
...promptPath2 ? { promptPath: promptPath2 } : {}
|
|
634
|
+
};
|
|
635
|
+
}
|
|
636
|
+
evaluators.push({
|
|
637
|
+
name,
|
|
638
|
+
type: "composite",
|
|
639
|
+
evaluators: memberEvaluators,
|
|
640
|
+
aggregator
|
|
641
|
+
});
|
|
642
|
+
continue;
|
|
643
|
+
}
|
|
644
|
+
if (typeValue === "expected_messages") {
|
|
645
|
+
evaluators.push({
|
|
646
|
+
name,
|
|
647
|
+
type: "expected_messages"
|
|
648
|
+
});
|
|
649
|
+
continue;
|
|
650
|
+
}
|
|
651
|
+
if (typeValue === "tool_trajectory") {
|
|
652
|
+
const mode = asString2(rawEvaluator.mode);
|
|
653
|
+
if (mode !== "any_order" && mode !== "in_order" && mode !== "exact") {
|
|
654
|
+
logWarning2(
|
|
655
|
+
`Skipping tool_trajectory evaluator '${name}' in '${evalId}': invalid mode '${mode}' (must be any_order, in_order, or exact)`
|
|
656
|
+
);
|
|
657
|
+
continue;
|
|
658
|
+
}
|
|
659
|
+
const rawMinimums = rawEvaluator.minimums;
|
|
660
|
+
let minimums;
|
|
661
|
+
if (rawMinimums !== void 0) {
|
|
662
|
+
if (!isJsonObject2(rawMinimums)) {
|
|
663
|
+
logWarning2(
|
|
664
|
+
`Skipping tool_trajectory evaluator '${name}' in '${evalId}': minimums must be an object`
|
|
665
|
+
);
|
|
666
|
+
continue;
|
|
667
|
+
}
|
|
668
|
+
minimums = {};
|
|
669
|
+
for (const [toolName, count] of Object.entries(rawMinimums)) {
|
|
670
|
+
if (typeof count === "number" && count >= 0) {
|
|
671
|
+
minimums[toolName] = count;
|
|
672
|
+
}
|
|
673
|
+
}
|
|
674
|
+
}
|
|
675
|
+
const rawExpected = rawEvaluator.expected;
|
|
676
|
+
let expected;
|
|
677
|
+
if (rawExpected !== void 0) {
|
|
678
|
+
if (!Array.isArray(rawExpected)) {
|
|
679
|
+
logWarning2(
|
|
680
|
+
`Skipping tool_trajectory evaluator '${name}' in '${evalId}': expected must be an array`
|
|
681
|
+
);
|
|
682
|
+
continue;
|
|
683
|
+
}
|
|
684
|
+
expected = [];
|
|
685
|
+
for (const item of rawExpected) {
|
|
686
|
+
if (isJsonObject2(item) && typeof item.tool === "string") {
|
|
687
|
+
expected.push({ tool: item.tool });
|
|
688
|
+
}
|
|
689
|
+
}
|
|
690
|
+
}
|
|
691
|
+
if (mode === "any_order" && !minimums) {
|
|
692
|
+
logWarning2(
|
|
693
|
+
`Skipping tool_trajectory evaluator '${name}' in '${evalId}': any_order mode requires minimums`
|
|
694
|
+
);
|
|
695
|
+
continue;
|
|
696
|
+
}
|
|
697
|
+
if ((mode === "in_order" || mode === "exact") && !expected) {
|
|
698
|
+
logWarning2(
|
|
699
|
+
`Skipping tool_trajectory evaluator '${name}' in '${evalId}': ${mode} mode requires expected`
|
|
700
|
+
);
|
|
701
|
+
continue;
|
|
702
|
+
}
|
|
703
|
+
const config = {
|
|
704
|
+
name,
|
|
705
|
+
type: "tool_trajectory",
|
|
706
|
+
mode,
|
|
707
|
+
...minimums ? { minimums } : {},
|
|
708
|
+
...expected ? { expected } : {}
|
|
709
|
+
};
|
|
710
|
+
evaluators.push(config);
|
|
711
|
+
continue;
|
|
712
|
+
}
|
|
492
713
|
const prompt = asString2(rawEvaluator.prompt);
|
|
493
714
|
let promptPath;
|
|
494
715
|
if (prompt) {
|
|
@@ -742,6 +963,67 @@ ${detailBlock}${ANSI_RESET4}`);
|
|
|
742
963
|
console.warn(`${ANSI_YELLOW4}Warning: ${message}${ANSI_RESET4}`);
|
|
743
964
|
}
|
|
744
965
|
}
|
|
966
|
+
/**
 * Convert raw `expected_messages` entries into output segments.
 *
 * For every message a segment with the message `role` is produced. `tool_calls`
 * is copied only for assistant messages that define it. String content is kept
 * as-is. Array content is processed item by item: non-object items are dropped,
 * `type: "file"` items are resolved against `searchRoots` and replaced by the
 * file text (CRLF normalised to LF), and every other object item is cloned.
 * File items that cannot be resolved or read are skipped with a warning.
 *
 * @param {{messages: Array<object>, searchRoots: string[], verbose?: boolean}} options
 *   Messages to process, directories to search for file references, and
 *   whether to log each resolved file.
 * @returns {Promise<Array<object>>} One segment per input message, in order.
 */
async function processExpectedMessages(options) {
  const { messages, searchRoots, verbose } = options;
  const segments = [];
  for (const message of messages) {
    const segment = {
      role: message.role
    };
    if (message.role === "assistant" && message.tool_calls !== void 0) {
      segment.tool_calls = message.tool_calls;
    }
    const content = message.content;
    if (typeof content === "string") {
      segment.content = content;
    } else if (Array.isArray(content)) {
      const processedContent = [];
      for (const rawSegment of content) {
        if (!isJsonObject(rawSegment)) {
          continue;
        }
        // Anything that is not a file reference is passed through as a copy.
        if (asString3(rawSegment.type) !== "file") {
          processedContent.push(cloneJsonObject(rawSegment));
          continue;
        }
        const rawValue = asString3(rawSegment.value);
        if (!rawValue) {
          continue;
        }
        const { displayPath, resolvedPath, attempted } = await resolveFileReference(
          rawValue,
          searchRoots
        );
        if (!resolvedPath) {
          const attempts = attempted.length ? [" Tried:", ...attempted.map((candidate) => ` ${candidate}`)] : void 0;
          logWarning3(`File not found in expected_messages: ${displayPath}`, attempts);
          continue;
        }
        try {
          const fileContent = (await (0, import_promises4.readFile)(resolvedPath, "utf8")).replace(/\r\n/g, "\n");
          processedContent.push({
            type: "file",
            path: displayPath,
            text: fileContent,
            resolvedPath: import_node_path4.default.resolve(resolvedPath)
          });
          if (verbose) {
            console.log(` [Expected Output File] Found: ${displayPath}`);
            console.log(` Resolved to: ${resolvedPath}`);
          }
        } catch (error) {
          // Best effort: an unreadable file drops the segment but never aborts the run.
          // Non-Error throwables are stringified so the warning never says "undefined".
          const reason = error instanceof Error ? error.message : String(error);
          logWarning3(
            `Could not read expected output file ${resolvedPath}: ${reason}`
          );
        }
      }
      segment.content = processedContent;
    }
    segments.push(segment);
  }
  return segments;
}
|
|
745
1027
|
|
|
746
1028
|
// src/evaluation/formatting/prompt-builder.ts
|
|
747
1029
|
var import_promises5 = require("fs/promises");
|
|
@@ -1046,12 +1328,10 @@ async function loadEvalCases(evalFilePath, repoRoot, options) {
|
|
|
1046
1328
|
messageType: "input",
|
|
1047
1329
|
verbose
|
|
1048
1330
|
});
|
|
1049
|
-
const outputSegments = hasExpectedMessages ? await
|
|
1331
|
+
const outputSegments = hasExpectedMessages ? await processExpectedMessages({
|
|
1050
1332
|
messages: expectedMessages,
|
|
1051
1333
|
searchRoots,
|
|
1052
1334
|
repoRootPath,
|
|
1053
|
-
guidelinePatterns,
|
|
1054
|
-
messageType: "output",
|
|
1055
1335
|
verbose
|
|
1056
1336
|
}) : [];
|
|
1057
1337
|
const codeSnippets = extractCodeBlocks(inputSegments);
|
|
@@ -1178,6 +1458,10 @@ async function readTextFile(filePath) {
|
|
|
1178
1458
|
const content = await (0, import_promises7.readFile)(filePath, "utf8");
|
|
1179
1459
|
return normalizeLineEndings(content);
|
|
1180
1460
|
}
|
|
1461
|
+
/**
 * Read a UTF-8 encoded file and parse it as JSON.
 *
 * @param {string} filePath - Path of the JSON file to read.
 * @returns {Promise<unknown>} The parsed JSON value.
 * @throws {SyntaxError} When the file content is not valid JSON; the message
 *   names the offending file and the original error is attached as `cause`.
 * @throws {Error} Any file-system error raised by `readFile` (e.g. ENOENT).
 */
async function readJsonFile(filePath) {
  const content = await (0, import_promises7.readFile)(filePath, "utf8");
  // Decoding as "utf8" keeps a leading byte-order mark, which JSON.parse
  // rejects; drop it so files saved by BOM-writing editors still load.
  const text = content.charCodeAt(0) === 65279 ? content.slice(1) : content;
  try {
    return JSON.parse(text);
  } catch (error) {
    const detail = error instanceof Error ? error.message : String(error);
    // Stay a SyntaxError so existing callers that catch it keep working.
    throw new SyntaxError(`Failed to parse JSON file ${filePath}: ${detail}`, { cause: error });
  }
}
|
|
1181
1465
|
async function findGitRoot(startPath) {
|
|
1182
1466
|
let currentDir = import_node_path7.default.dirname(import_node_path7.default.resolve(startPath));
|
|
1183
1467
|
const root = import_node_path7.default.parse(currentDir).root;
|
|
@@ -1686,9 +1970,11 @@ var CliProvider = class {
|
|
|
1686
1970
|
const message = detail ? `${detail} (exit code ${codeText})` : `CLI exited with code ${codeText}`;
|
|
1687
1971
|
throw new Error(message);
|
|
1688
1972
|
}
|
|
1689
|
-
const
|
|
1973
|
+
const responseContent = await this.readAndCleanupOutputFile(outputFilePath);
|
|
1974
|
+
const parsed = this.parseOutputContent(responseContent);
|
|
1690
1975
|
return {
|
|
1691
|
-
text:
|
|
1976
|
+
text: parsed.text,
|
|
1977
|
+
trace: parsed.trace,
|
|
1692
1978
|
raw: {
|
|
1693
1979
|
command: renderedCommand,
|
|
1694
1980
|
stderr: result.stderr,
|
|
@@ -1698,6 +1984,31 @@ var CliProvider = class {
|
|
|
1698
1984
|
}
|
|
1699
1985
|
};
|
|
1700
1986
|
}
|
|
1987
|
+
/**
|
|
1988
|
+
* Parse output content from CLI.
|
|
1989
|
+
* If the content is valid JSON with a 'text' field, extract text and optional trace.
|
|
1990
|
+
* Otherwise, treat the entire content as plain text.
|
|
1991
|
+
*/
|
|
1992
|
+
parseOutputContent(content) {
|
|
1993
|
+
try {
|
|
1994
|
+
const parsed = JSON.parse(content);
|
|
1995
|
+
if (typeof parsed === "object" && parsed !== null && "text" in parsed) {
|
|
1996
|
+
const obj = parsed;
|
|
1997
|
+
const text = typeof obj.text === "string" ? obj.text : String(obj.text);
|
|
1998
|
+
const trace = this.parseTrace(obj.trace);
|
|
1999
|
+
return { text, trace };
|
|
2000
|
+
}
|
|
2001
|
+
} catch {
|
|
2002
|
+
}
|
|
2003
|
+
return { text: content };
|
|
2004
|
+
}
|
|
2005
|
+
parseTrace(trace) {
|
|
2006
|
+
if (!Array.isArray(trace)) {
|
|
2007
|
+
return void 0;
|
|
2008
|
+
}
|
|
2009
|
+
const validEvents = trace.filter(isTraceEvent);
|
|
2010
|
+
return validEvents.length > 0 ? validEvents : void 0;
|
|
2011
|
+
}
|
|
1701
2012
|
async readAndCleanupOutputFile(filePath) {
|
|
1702
2013
|
try {
|
|
1703
2014
|
const content = await readTextFile(filePath);
|
|
@@ -2684,6 +2995,7 @@ var MockProvider = class {
|
|
|
2684
2995
|
delayMs;
|
|
2685
2996
|
delayMinMs;
|
|
2686
2997
|
delayMaxMs;
|
|
2998
|
+
trace;
|
|
2687
2999
|
constructor(targetName, config) {
|
|
2688
3000
|
this.id = `mock:${targetName}`;
|
|
2689
3001
|
this.targetName = targetName;
|
|
@@ -2691,6 +3003,7 @@ var MockProvider = class {
|
|
|
2691
3003
|
this.delayMs = config.delayMs ?? 0;
|
|
2692
3004
|
this.delayMinMs = config.delayMinMs ?? 0;
|
|
2693
3005
|
this.delayMaxMs = config.delayMaxMs ?? 0;
|
|
3006
|
+
this.trace = config.trace;
|
|
2694
3007
|
}
|
|
2695
3008
|
async invoke(request) {
|
|
2696
3009
|
const delay = this.calculateDelay();
|
|
@@ -2702,7 +3015,8 @@ var MockProvider = class {
|
|
|
2702
3015
|
raw: {
|
|
2703
3016
|
question: request.question,
|
|
2704
3017
|
guidelines: request.guidelines
|
|
2705
|
-
}
|
|
3018
|
+
},
|
|
3019
|
+
trace: this.trace
|
|
2706
3020
|
};
|
|
2707
3021
|
}
|
|
2708
3022
|
calculateDelay() {
|
|
@@ -2716,6 +3030,7 @@ var MockProvider = class {
|
|
|
2716
3030
|
};
|
|
2717
3031
|
|
|
2718
3032
|
// src/evaluation/providers/targets.ts
|
|
3033
|
+
var import_node_path11 = __toESM(require("path"), 1);
|
|
2719
3034
|
var import_zod = require("zod");
|
|
2720
3035
|
var CLI_PLACEHOLDERS = /* @__PURE__ */ new Set([
|
|
2721
3036
|
"PROMPT",
|
|
@@ -2731,7 +3046,7 @@ var BASE_TARGET_SCHEMA = import_zod.z.object({
|
|
|
2731
3046
|
judge_target: import_zod.z.string().optional(),
|
|
2732
3047
|
workers: import_zod.z.number().int().min(1).optional()
|
|
2733
3048
|
}).passthrough();
|
|
2734
|
-
var DEFAULT_AZURE_API_VERSION = "2024-
|
|
3049
|
+
var DEFAULT_AZURE_API_VERSION = "2024-12-01-preview";
|
|
2735
3050
|
function normalizeAzureApiVersion(value) {
|
|
2736
3051
|
if (!value) {
|
|
2737
3052
|
return DEFAULT_AZURE_API_VERSION;
|
|
@@ -2775,7 +3090,7 @@ function resolveRetryConfig(target) {
|
|
|
2775
3090
|
retryableStatusCodes
|
|
2776
3091
|
};
|
|
2777
3092
|
}
|
|
2778
|
-
function resolveTargetDefinition(definition, env = process.env) {
|
|
3093
|
+
function resolveTargetDefinition(definition, env = process.env, evalFilePath) {
|
|
2779
3094
|
const parsed = BASE_TARGET_SCHEMA.parse(definition);
|
|
2780
3095
|
const provider = parsed.provider.toLowerCase();
|
|
2781
3096
|
const providerBatching = resolveOptionalBoolean(
|
|
@@ -2848,7 +3163,7 @@ function resolveTargetDefinition(definition, env = process.env) {
|
|
|
2848
3163
|
judgeTarget: parsed.judge_target,
|
|
2849
3164
|
workers: parsed.workers,
|
|
2850
3165
|
providerBatching,
|
|
2851
|
-
config: resolveCliConfig(parsed, env)
|
|
3166
|
+
config: resolveCliConfig(parsed, env, evalFilePath)
|
|
2852
3167
|
};
|
|
2853
3168
|
default:
|
|
2854
3169
|
throw new Error(`Unsupported provider '${parsed.provider}' in target '${parsed.name}'`);
|
|
@@ -2966,7 +3281,8 @@ function normalizeCodexLogFormat(value) {
|
|
|
2966
3281
|
}
|
|
2967
3282
|
/**
 * Build the mock provider configuration from a target definition.
 *
 * @param {object} target - Parsed target definition.
 * @returns {{response: (string|undefined), trace: (Array|undefined)}}
 *   `response` is kept only when it is a string and `trace` only when it is
 *   an array; otherwise the respective field is undefined.
 */
function resolveMockConfig(target) {
  const { response, trace } = target;
  return {
    response: typeof response === "string" ? response : void 0,
    trace: Array.isArray(trace) ? trace : void 0
  };
}
|
|
2971
3287
|
function resolveVSCodeConfig(target, env, insiders) {
|
|
2972
3288
|
const workspaceTemplateEnvVar = resolveOptionalLiteralString(
|
|
@@ -2998,15 +3314,18 @@ function resolveVSCodeConfig(target, env, insiders) {
|
|
|
2998
3314
|
workspaceTemplate
|
|
2999
3315
|
};
|
|
3000
3316
|
}
|
|
3001
|
-
function resolveCliConfig(target, env) {
|
|
3317
|
+
function resolveCliConfig(target, env, evalFilePath) {
|
|
3002
3318
|
const commandTemplateSource = target.command_template ?? target.commandTemplate;
|
|
3003
3319
|
const filesFormat = resolveOptionalLiteralString(
|
|
3004
3320
|
target.files_format ?? target.filesFormat ?? target.attachments_format ?? target.attachmentsFormat
|
|
3005
3321
|
);
|
|
3006
|
-
|
|
3322
|
+
let cwd = resolveOptionalString(target.cwd, env, `${target.name} working directory`, {
|
|
3007
3323
|
allowLiteral: true,
|
|
3008
3324
|
optionalEnv: true
|
|
3009
3325
|
});
|
|
3326
|
+
if (!cwd && evalFilePath) {
|
|
3327
|
+
cwd = import_node_path11.default.dirname(import_node_path11.default.resolve(evalFilePath));
|
|
3328
|
+
}
|
|
3010
3329
|
const timeoutMs = resolveTimeoutMs(
|
|
3011
3330
|
target.timeout_seconds ?? target.timeoutSeconds,
|
|
3012
3331
|
`${target.name} timeout`
|
|
@@ -3124,17 +3443,15 @@ function resolveOptionalString(source, env, description, options) {
|
|
|
3124
3443
|
if (envVarMatch) {
|
|
3125
3444
|
const varName = envVarMatch[1];
|
|
3126
3445
|
const envValue = env[varName];
|
|
3127
|
-
if (envValue !== void 0) {
|
|
3128
|
-
if (envValue.trim().length === 0) {
|
|
3129
|
-
throw new Error(`Environment variable '${varName}' for ${description} is empty`);
|
|
3130
|
-
}
|
|
3131
|
-
return envValue;
|
|
3132
|
-
}
|
|
3133
3446
|
const optionalEnv = options?.optionalEnv ?? false;
|
|
3134
|
-
if (
|
|
3135
|
-
|
|
3447
|
+
if (envValue === void 0 || envValue.trim().length === 0) {
|
|
3448
|
+
if (optionalEnv) {
|
|
3449
|
+
return void 0;
|
|
3450
|
+
}
|
|
3451
|
+
const status = envValue === void 0 ? "is not set" : "is empty";
|
|
3452
|
+
throw new Error(`Environment variable '${varName}' required for ${description} ${status}`);
|
|
3136
3453
|
}
|
|
3137
|
-
|
|
3454
|
+
return envValue;
|
|
3138
3455
|
}
|
|
3139
3456
|
const allowLiteral = options?.allowLiteral ?? false;
|
|
3140
3457
|
if (!allowLiteral) {
|
|
@@ -3246,7 +3563,7 @@ function resolveOptionalNumberArray(source, description) {
|
|
|
3246
3563
|
}
|
|
3247
3564
|
|
|
3248
3565
|
// src/evaluation/providers/vscode.ts
|
|
3249
|
-
var
|
|
3566
|
+
var import_node_path12 = __toESM(require("path"), 1);
|
|
3250
3567
|
var import_subagent = require("subagent");
|
|
3251
3568
|
|
|
3252
3569
|
// src/evaluation/providers/vscode-templates.ts
|
|
@@ -3416,7 +3733,7 @@ function buildMandatoryPrereadBlock2(guidelineFiles, attachmentFiles) {
|
|
|
3416
3733
|
return "";
|
|
3417
3734
|
}
|
|
3418
3735
|
const buildList = (files) => files.map((absolutePath) => {
|
|
3419
|
-
const fileName =
|
|
3736
|
+
const fileName = import_node_path12.default.basename(absolutePath);
|
|
3420
3737
|
const fileUri = pathToFileUri2(absolutePath);
|
|
3421
3738
|
return `* [${fileName}](${fileUri})`;
|
|
3422
3739
|
});
|
|
@@ -3441,8 +3758,8 @@ function collectGuidelineFiles2(attachments, guidelinePatterns) {
|
|
|
3441
3758
|
}
|
|
3442
3759
|
const unique = /* @__PURE__ */ new Map();
|
|
3443
3760
|
for (const attachment of attachments) {
|
|
3444
|
-
const absolutePath =
|
|
3445
|
-
const normalized = absolutePath.split(
|
|
3761
|
+
const absolutePath = import_node_path12.default.resolve(attachment);
|
|
3762
|
+
const normalized = absolutePath.split(import_node_path12.default.sep).join("/");
|
|
3446
3763
|
if (isGuidelineFile(normalized, guidelinePatterns)) {
|
|
3447
3764
|
if (!unique.has(absolutePath)) {
|
|
3448
3765
|
unique.set(absolutePath, absolutePath);
|
|
@@ -3457,7 +3774,7 @@ function collectAttachmentFiles(attachments) {
|
|
|
3457
3774
|
}
|
|
3458
3775
|
const unique = /* @__PURE__ */ new Map();
|
|
3459
3776
|
for (const attachment of attachments) {
|
|
3460
|
-
const absolutePath =
|
|
3777
|
+
const absolutePath = import_node_path12.default.resolve(attachment);
|
|
3461
3778
|
if (!unique.has(absolutePath)) {
|
|
3462
3779
|
unique.set(absolutePath, absolutePath);
|
|
3463
3780
|
}
|
|
@@ -3465,7 +3782,7 @@ function collectAttachmentFiles(attachments) {
|
|
|
3465
3782
|
return Array.from(unique.values());
|
|
3466
3783
|
}
|
|
3467
3784
|
function pathToFileUri2(filePath) {
|
|
3468
|
-
const absolutePath =
|
|
3785
|
+
const absolutePath = import_node_path12.default.isAbsolute(filePath) ? filePath : import_node_path12.default.resolve(filePath);
|
|
3469
3786
|
const normalizedPath = absolutePath.replace(/\\/g, "/");
|
|
3470
3787
|
if (/^[a-zA-Z]:\//.test(normalizedPath)) {
|
|
3471
3788
|
return `file:///${normalizedPath}`;
|
|
@@ -3478,7 +3795,7 @@ function normalizeAttachments(attachments) {
|
|
|
3478
3795
|
}
|
|
3479
3796
|
const deduped = /* @__PURE__ */ new Set();
|
|
3480
3797
|
for (const attachment of attachments) {
|
|
3481
|
-
deduped.add(
|
|
3798
|
+
deduped.add(import_node_path12.default.resolve(attachment));
|
|
3482
3799
|
}
|
|
3483
3800
|
return Array.from(deduped);
|
|
3484
3801
|
}
|
|
@@ -3487,7 +3804,7 @@ function mergeAttachments(all) {
|
|
|
3487
3804
|
for (const list of all) {
|
|
3488
3805
|
if (!list) continue;
|
|
3489
3806
|
for (const inputFile of list) {
|
|
3490
|
-
deduped.add(
|
|
3807
|
+
deduped.add(import_node_path12.default.resolve(inputFile));
|
|
3491
3808
|
}
|
|
3492
3809
|
}
|
|
3493
3810
|
return deduped.size > 0 ? Array.from(deduped) : void 0;
|
|
@@ -3536,7 +3853,7 @@ total unlocked subagents available: ${result.created.length + result.skippedExis
|
|
|
3536
3853
|
// src/evaluation/providers/targets-file.ts
|
|
3537
3854
|
var import_node_fs4 = require("fs");
|
|
3538
3855
|
var import_promises10 = require("fs/promises");
|
|
3539
|
-
var
|
|
3856
|
+
var import_node_path13 = __toESM(require("path"), 1);
|
|
3540
3857
|
var import_yaml3 = require("yaml");
|
|
3541
3858
|
function isRecord(value) {
|
|
3542
3859
|
return typeof value === "object" && value !== null && !Array.isArray(value);
|
|
@@ -3573,7 +3890,7 @@ async function fileExists3(filePath) {
|
|
|
3573
3890
|
}
|
|
3574
3891
|
}
|
|
3575
3892
|
async function readTargetDefinitions(filePath) {
|
|
3576
|
-
const absolutePath =
|
|
3893
|
+
const absolutePath = import_node_path13.default.resolve(filePath);
|
|
3577
3894
|
if (!await fileExists3(absolutePath)) {
|
|
3578
3895
|
throw new Error(`targets.yaml not found at ${absolutePath}`);
|
|
3579
3896
|
}
|
|
@@ -4021,11 +4338,478 @@ function substituteVariables(template, variables) {
|
|
|
4021
4338
|
return variables[varName] ?? match;
|
|
4022
4339
|
});
|
|
4023
4340
|
}
|
|
4341
|
+
var ToolTrajectoryEvaluator = class {
|
|
4342
|
+
kind = "tool_trajectory";
|
|
4343
|
+
config;
|
|
4344
|
+
constructor(options) {
|
|
4345
|
+
this.config = options.config;
|
|
4346
|
+
}
|
|
4347
|
+
evaluate(context) {
|
|
4348
|
+
const { candidateTrace, candidateTraceSummary } = context;
|
|
4349
|
+
if (!candidateTrace || !candidateTraceSummary) {
|
|
4350
|
+
return {
|
|
4351
|
+
score: 0,
|
|
4352
|
+
verdict: "fail",
|
|
4353
|
+
hits: [],
|
|
4354
|
+
misses: ["No trace available for evaluation"],
|
|
4355
|
+
expectedAspectCount: 1
|
|
4356
|
+
};
|
|
4357
|
+
}
|
|
4358
|
+
switch (this.config.mode) {
|
|
4359
|
+
case "any_order":
|
|
4360
|
+
return this.evaluateAnyOrder(candidateTraceSummary);
|
|
4361
|
+
case "in_order":
|
|
4362
|
+
return this.evaluateInOrder(candidateTrace);
|
|
4363
|
+
case "exact":
|
|
4364
|
+
return this.evaluateExact(candidateTrace);
|
|
4365
|
+
default:
|
|
4366
|
+
return {
|
|
4367
|
+
score: 0,
|
|
4368
|
+
verdict: "fail",
|
|
4369
|
+
hits: [],
|
|
4370
|
+
misses: [`Unknown mode: ${this.config.mode}`],
|
|
4371
|
+
expectedAspectCount: 1
|
|
4372
|
+
};
|
|
4373
|
+
}
|
|
4374
|
+
}
|
|
4375
|
+
evaluateAnyOrder(summary) {
|
|
4376
|
+
const minimums = this.config.minimums ?? {};
|
|
4377
|
+
const toolNames = Object.keys(minimums);
|
|
4378
|
+
if (toolNames.length === 0) {
|
|
4379
|
+
return {
|
|
4380
|
+
score: 1,
|
|
4381
|
+
verdict: "pass",
|
|
4382
|
+
hits: ["No tool requirements specified"],
|
|
4383
|
+
misses: [],
|
|
4384
|
+
expectedAspectCount: 0
|
|
4385
|
+
};
|
|
4386
|
+
}
|
|
4387
|
+
const hits = [];
|
|
4388
|
+
const misses = [];
|
|
4389
|
+
for (const toolName of toolNames) {
|
|
4390
|
+
const required = minimums[toolName];
|
|
4391
|
+
const actual = summary.toolCallsByName[toolName] ?? 0;
|
|
4392
|
+
if (actual >= required) {
|
|
4393
|
+
hits.push(`${toolName}: called ${actual} times (required \u2265${required})`);
|
|
4394
|
+
} else {
|
|
4395
|
+
misses.push(`${toolName}: called ${actual} times (required \u2265${required})`);
|
|
4396
|
+
}
|
|
4397
|
+
}
|
|
4398
|
+
const score = hits.length / toolNames.length;
|
|
4399
|
+
return {
|
|
4400
|
+
score,
|
|
4401
|
+
verdict: scoreToVerdict(score),
|
|
4402
|
+
hits,
|
|
4403
|
+
misses,
|
|
4404
|
+
expectedAspectCount: toolNames.length
|
|
4405
|
+
};
|
|
4406
|
+
}
|
|
4407
|
+
evaluateInOrder(trace) {
|
|
4408
|
+
const expected = this.config.expected ?? [];
|
|
4409
|
+
if (expected.length === 0) {
|
|
4410
|
+
return {
|
|
4411
|
+
score: 1,
|
|
4412
|
+
verdict: "pass",
|
|
4413
|
+
hits: ["No tool sequence specified"],
|
|
4414
|
+
misses: [],
|
|
4415
|
+
expectedAspectCount: 0
|
|
4416
|
+
};
|
|
4417
|
+
}
|
|
4418
|
+
const actualToolCalls = trace.filter((e) => e.type === "tool_call" && e.name);
|
|
4419
|
+
const hits = [];
|
|
4420
|
+
const misses = [];
|
|
4421
|
+
let actualIndex = 0;
|
|
4422
|
+
for (let i = 0; i < expected.length; i++) {
|
|
4423
|
+
const expectedTool = expected[i].tool;
|
|
4424
|
+
let found = false;
|
|
4425
|
+
while (actualIndex < actualToolCalls.length) {
|
|
4426
|
+
if (actualToolCalls[actualIndex].name === expectedTool) {
|
|
4427
|
+
hits.push(`Found ${expectedTool} at position ${actualIndex}`);
|
|
4428
|
+
actualIndex++;
|
|
4429
|
+
found = true;
|
|
4430
|
+
break;
|
|
4431
|
+
}
|
|
4432
|
+
actualIndex++;
|
|
4433
|
+
}
|
|
4434
|
+
if (!found) {
|
|
4435
|
+
misses.push(`Expected ${expectedTool} at position ${i}, not found in remaining trace`);
|
|
4436
|
+
}
|
|
4437
|
+
}
|
|
4438
|
+
const score = hits.length / expected.length;
|
|
4439
|
+
return {
|
|
4440
|
+
score,
|
|
4441
|
+
verdict: scoreToVerdict(score),
|
|
4442
|
+
hits,
|
|
4443
|
+
misses,
|
|
4444
|
+
expectedAspectCount: expected.length
|
|
4445
|
+
};
|
|
4446
|
+
}
|
|
4447
|
+
evaluateExact(trace) {
|
|
4448
|
+
const expected = this.config.expected ?? [];
|
|
4449
|
+
if (expected.length === 0) {
|
|
4450
|
+
return {
|
|
4451
|
+
score: 1,
|
|
4452
|
+
verdict: "pass",
|
|
4453
|
+
hits: ["No tool sequence specified"],
|
|
4454
|
+
misses: [],
|
|
4455
|
+
expectedAspectCount: 0
|
|
4456
|
+
};
|
|
4457
|
+
}
|
|
4458
|
+
const actualToolCalls = trace.filter((e) => e.type === "tool_call" && e.name);
|
|
4459
|
+
const hits = [];
|
|
4460
|
+
const misses = [];
|
|
4461
|
+
if (actualToolCalls.length !== expected.length) {
|
|
4462
|
+
misses.push(`Expected ${expected.length} tool calls, got ${actualToolCalls.length}`);
|
|
4463
|
+
}
|
|
4464
|
+
const checkLength = Math.min(expected.length, actualToolCalls.length);
|
|
4465
|
+
for (let i = 0; i < checkLength; i++) {
|
|
4466
|
+
const expectedTool = expected[i].tool;
|
|
4467
|
+
const actualTool = actualToolCalls[i].name;
|
|
4468
|
+
if (actualTool === expectedTool) {
|
|
4469
|
+
hits.push(`Position ${i}: ${expectedTool} \u2713`);
|
|
4470
|
+
} else {
|
|
4471
|
+
misses.push(`Position ${i}: expected ${expectedTool}, got ${actualTool}`);
|
|
4472
|
+
}
|
|
4473
|
+
}
|
|
4474
|
+
for (let i = checkLength; i < expected.length; i++) {
|
|
4475
|
+
misses.push(`Position ${i}: expected ${expected[i].tool}, got nothing`);
|
|
4476
|
+
}
|
|
4477
|
+
const score = hits.length / expected.length;
|
|
4478
|
+
return {
|
|
4479
|
+
score,
|
|
4480
|
+
verdict: scoreToVerdict(score),
|
|
4481
|
+
hits,
|
|
4482
|
+
misses,
|
|
4483
|
+
expectedAspectCount: expected.length
|
|
4484
|
+
};
|
|
4485
|
+
}
|
|
4486
|
+
};
|
|
4487
|
+
/**
 * Evaluator that checks the tool calls recorded in a candidate trace against
 * the `tool_calls` declared on assistant messages of the eval case's
 * `expected_segments`.
 *
 * Comparison is positional: the i-th expected call is compared with the i-th
 * `tool_call` event of the trace. Tool names must match; inputs are compared
 * deeply only when the expected call declares an `input`.
 */
var ExpectedMessagesEvaluator = class {
  kind = "expected_messages";

  /**
   * Runs the evaluation.
   *
   * @param context Evaluation context; reads `context.evalCase.expected_segments`
   *   and `context.candidateTrace`.
   * @returns Result object with score, verdict, hits, misses and expectedAspectCount.
   */
  evaluate(context) {
    const { candidateTrace, evalCase } = context;
    const expectedSegments = evalCase.expected_segments;
    const expectedToolCalls = this.extractExpectedToolCalls(expectedSegments);
    if (expectedToolCalls.length === 0) {
      // Nothing to validate: pass trivially.
      return {
        score: 1,
        verdict: "pass",
        hits: ["No tool_calls specified in expected_messages"],
        misses: [],
        expectedAspectCount: 1
      };
    }
    if (!candidateTrace || candidateTrace.length === 0) {
      return {
        score: 0,
        verdict: "fail",
        hits: [],
        misses: ["No trace available to validate tool_calls"],
        expectedAspectCount: expectedToolCalls.length
      };
    }
    const actualToolCalls = candidateTrace.filter((e) => e.type === "tool_call");
    return this.validateToolCalls(expectedToolCalls, actualToolCalls);
  }

  /**
   * Collects `{ tool, input }` pairs from the `tool_calls` arrays of assistant
   * segments, in document order. Entries that are not objects with a string
   * `tool` are ignored, as are segments with any other role.
   *
   * @param segments Expected message segments, or a falsy value.
   * @returns Flat array of expected tool calls (empty when there are none).
   */
  extractExpectedToolCalls(segments) {
    if (!segments) {
      return [];
    }
    const toolCalls = [];
    for (const segment of segments) {
      const role = segment.role;
      const segmentToolCalls = segment.tool_calls;
      if (role === "assistant" && Array.isArray(segmentToolCalls)) {
        for (const tc of segmentToolCalls) {
          if (typeof tc === "object" && tc !== null && typeof tc.tool === "string") {
            toolCalls.push({ tool: tc.tool, input: tc.input });
          }
        }
      }
    }
    return toolCalls;
  }

  /**
   * Compares expected and actual tool calls position by position.
   *
   * @param expected Array of `{ tool, input }` expectations.
   * @param actual Array of trace events; `name` and `input` are read.
   * @returns Result whose score is matched / expected (denominator at least 1).
   */
  validateToolCalls(expected, actual) {
    const hits = [];
    const misses = [];
    for (let i = 0; i < expected.length; i++) {
      const expectedCall = expected[i];
      const actualCall = actual[i];
      if (!actualCall) {
        misses.push(
          `tool_calls[${i}]: expected ${expectedCall.tool}, but no more tool calls in trace`
        );
        continue;
      }
      if (actualCall.name !== expectedCall.tool) {
        misses.push(
          `tool_calls[${i}]: expected ${expectedCall.tool}, got ${actualCall.name ?? "unknown"}`
        );
        continue;
      }
      // An expectation without `input` only constrains the tool name.
      if (expectedCall.input !== void 0) {
        if (!this.deepEquals(expectedCall.input, actualCall.input)) {
          misses.push(`tool_calls[${i}]: ${expectedCall.tool} input mismatch`);
          continue;
        }
      }
      hits.push(`tool_calls[${i}]: ${expectedCall.tool} matched`);
    }
    // Guard against division by zero when called directly with no expectations.
    const totalChecks = expected.length || 1;
    const score = hits.length / totalChecks;
    return {
      score,
      verdict: score >= 0.8 ? "pass" : score >= 0.6 ? "borderline" : "fail",
      hits,
      misses,
      expectedAspectCount: totalChecks
    };
  }

  /**
   * Structural equality for JSON-like values (primitives, arrays, plain objects).
   * Object key order is irrelevant; arrays are compared element by element.
   *
   * @param a First value.
   * @param b Second value.
   * @returns True when both values are structurally equal.
   */
  deepEquals(a, b) {
    if (a === b) return true;
    if (typeof a !== typeof b) return false;
    if (typeof a !== "object" || a === null || b === null) return false;
    if (Array.isArray(a) && Array.isArray(b)) {
      if (a.length !== b.length) return false;
      return a.every((val, i) => this.deepEquals(val, b[i]));
    }
    if (Array.isArray(a) || Array.isArray(b)) return false;
    const aKeys = Object.keys(a);
    const bKeys = Object.keys(b);
    if (aKeys.length !== bKeys.length) return false;
    // Require that `b` actually owns each key: without this check a key holding
    // `undefined` in `a` would "match" a completely different key in `b`,
    // because reading the absent key from `b` also yields `undefined`.
    return aKeys.every(
      (key) => Object.prototype.hasOwnProperty.call(b, key) && this.deepEquals(a[key], b[key])
    );
  }
};
|
|
4586
|
+
// Prompt used by the LLM aggregator when the aggregator config supplies none.
// The {{EVALUATOR_RESULTS_JSON}} placeholder is replaced with the member results.
var DEFAULT_COMPOSITE_AGGREGATOR_PROMPT = `Review the following evaluation results:
{{EVALUATOR_RESULTS_JSON}}

Decide the final score and verdict based on all evaluator results.
Return a JSON object with: score (0.0-1.0), verdict (pass/fail/borderline), and reasoning.`;

/**
 * Evaluator that runs several member evaluators and combines their results
 * with one of three aggregators: a weighted average (default), an external
 * script ("code_judge"), or an LLM ("llm_judge").
 */
var CompositeEvaluator = class {
  kind = "composite";
  config;
  evaluatorFactory;
  cwd;

  /**
   * @param options `config` (composite evaluator config with `evaluators` and
   *   `aggregator`), `evaluatorFactory` (object with `create(memberConfig, context)`)
   *   and `cwd` (fallback working directory for the code aggregator).
   */
  constructor(options) {
    this.config = options.config;
    this.evaluatorFactory = options.evaluatorFactory;
    this.cwd = options.cwd;
  }

  /**
   * Runs all member evaluators concurrently and aggregates their results.
   * A rejection from any member rejects the whole evaluation (Promise.all).
   *
   * @param context Evaluation context forwarded unchanged to every member.
   * @returns The aggregated evaluation result.
   */
  async evaluate(context) {
    const memberResults = await Promise.all(
      this.config.evaluators.map(async (memberConfig) => {
        const evaluator = this.evaluatorFactory.create(memberConfig, context);
        return {
          id: memberConfig.name,
          type: memberConfig.type,
          result: await evaluator.evaluate(context)
        };
      })
    );
    return this.aggregate(memberResults, context);
  }

  /**
   * Dispatches to the aggregator selected by `this.config.aggregator.type`.
   * Any type other than "code_judge" or "llm_judge" uses the weighted average.
   */
  async aggregate(results, context) {
    const aggregator = this.config.aggregator;
    switch (aggregator.type) {
      case "code_judge":
        return this.runCodeAggregator(results, aggregator.path, aggregator.cwd ?? this.cwd);
      case "llm_judge":
        return this.runLlmAggregator(results, context, aggregator);
      default:
        return this.runWeightedAverage(results, aggregator.weights);
    }
  }

  /**
   * Combines member scores as a weighted mean.
   *
   * @param results Member results (`{ id, type, result }`).
   * @param weights Optional map of member id -> weight; missing ids weigh 1.
   * @returns Aggregated result; hits and misses are prefixed with `[memberId]`.
   */
  runWeightedAverage(results, weights) {
    let totalWeight = 0;
    let weightedSum = 0;
    const allHits = [];
    const allMisses = [];
    const reasoningParts = [];
    const evaluatorResults = [];
    for (const member of results) {
      const weight = weights?.[member.id] ?? 1;
      totalWeight += weight;
      weightedSum += member.result.score * weight;
      allHits.push(...member.result.hits.map((h) => `[${member.id}] ${h}`));
      allMisses.push(...member.result.misses.map((m) => `[${member.id}] ${m}`));
      if (member.result.reasoning) {
        reasoningParts.push(`${member.id}: ${member.result.reasoning}`);
      }
      evaluatorResults.push({
        name: member.id,
        type: member.type,
        score: member.result.score,
        weight,
        verdict: member.result.verdict,
        hits: [...member.result.hits],
        misses: [...member.result.misses],
        reasoning: member.result.reasoning,
        evaluatorRawRequest: member.result.evaluatorRawRequest,
        evaluatorResults: member.result.evaluatorResults
      });
    }
    // No members (or weights summing to zero) yields a score of 0.
    const finalScore = totalWeight > 0 ? weightedSum / totalWeight : 0;
    return {
      score: clampScore(finalScore),
      verdict: scoreToVerdict(finalScore),
      hits: allHits,
      misses: allMisses,
      expectedAspectCount: Math.max(allHits.length + allMisses.length, 1),
      reasoning: reasoningParts.length > 0 ? reasoningParts.join("; ") : void 0,
      evaluatorRawRequest: {
        aggregator: "weighted_average",
        ...weights ? { weights } : {}
      },
      evaluatorResults
    };
  }

  /**
   * Delegates aggregation to an external script via `executeScript`, handing
   * it `{ results: { [memberId]: result } }` as JSON and parsing the script
   * output as JSON. Missing or malformed fields fall back to defaults
   * (score 0, empty hits/misses, verdict derived from the score). A failing
   * script produces a "fail" result that records the error message.
   *
   * @param results Member results.
   * @param scriptPath Script passed to `executeScript`.
   * @param cwd Working directory passed to `executeScript`.
   * @param weights Optional member weights, only echoed into `evaluatorResults`.
   */
  async runCodeAggregator(results, scriptPath, cwd, weights) {
    const resultsObject = Object.fromEntries(results.map((r) => [r.id, r.result]));
    const inputPayload = JSON.stringify({ results: resultsObject }, null, 2);
    const evaluatorResults = results.map((member) => ({
      name: member.id,
      type: member.type,
      score: member.result.score,
      weight: weights?.[member.id] ?? 1,
      verdict: member.result.verdict,
      hits: [...member.result.hits],
      misses: [...member.result.misses],
      reasoning: member.result.reasoning,
      evaluatorRawRequest: member.result.evaluatorRawRequest,
      evaluatorResults: member.result.evaluatorResults
    }));
    try {
      const stdout = await executeScript(scriptPath, inputPayload, void 0, cwd);
      const parsed = parseJsonSafe(stdout);
      const score = clampScore(typeof parsed?.score === "number" ? parsed.score : 0);
      const hits = Array.isArray(parsed?.hits) ? parsed.hits.filter(isNonEmptyString) : [];
      const misses = Array.isArray(parsed?.misses) ? parsed.misses.filter(isNonEmptyString) : [];
      const reasoning = typeof parsed?.reasoning === "string" ? parsed.reasoning : void 0;
      // Only the three known verdicts are accepted from the script.
      const verdict = typeof parsed?.verdict === "string" && (parsed.verdict === "pass" || parsed.verdict === "fail" || parsed.verdict === "borderline") ? parsed.verdict : scoreToVerdict(score);
      return {
        score,
        verdict,
        hits,
        misses,
        expectedAspectCount: hits.length + misses.length || 1,
        reasoning,
        evaluatorRawRequest: {
          aggregator: "code_judge",
          script: scriptPath
        },
        evaluatorResults
      };
    } catch (error) {
      const message = error instanceof Error ? error.message : String(error);
      return {
        score: 0,
        verdict: "fail",
        hits: [],
        misses: [`Code aggregator failed: ${message}`],
        expectedAspectCount: 1,
        reasoning: message,
        evaluatorRawRequest: {
          aggregator: "code_judge",
          script: scriptPath,
          error: message
        },
        evaluatorResults
      };
    }
  }

  /**
   * Asks the judge provider to decide the final score from the member results.
   *
   * Uses `judgeProvider.asLanguageModel()` with `generateText` when available,
   * otherwise falls back to `judgeProvider.invoke`. Any failure while calling
   * the judge or parsing its answer yields a "fail" result that records the
   * error message.
   *
   * @param results Member results.
   * @param context Evaluation context; `judgeProvider`, `evalCase.id` and `attempt` are read.
   * @param config Aggregator config; `config.prompt` overrides the default prompt.
   * @throws Error when the context carries no judge provider.
   */
  async runLlmAggregator(results, context, config) {
    const judgeProvider = context.judgeProvider;
    if (!judgeProvider) {
      throw new Error("No judge provider available for LLM aggregation");
    }
    const resultsObject = Object.fromEntries(results.map((r) => [r.id, r.result]));
    const resultsJson = JSON.stringify(resultsObject, null, 2);
    const evaluatorResults = results.map((member) => ({
      name: member.id,
      type: member.type,
      score: member.result.score,
      verdict: member.result.verdict,
      hits: [...member.result.hits],
      misses: [...member.result.misses],
      reasoning: member.result.reasoning,
      evaluatorRawRequest: member.result.evaluatorRawRequest,
      evaluatorResults: member.result.evaluatorResults
    }));
    const promptTemplate = config.prompt ?? DEFAULT_COMPOSITE_AGGREGATOR_PROMPT;
    // A replacer function is required here: with a plain replacement string,
    // sequences such as "$&" or "$1" inside the results JSON would be
    // interpreted as replacement patterns and corrupt the prompt.
    const userPrompt = promptTemplate.replace(/\{\{EVALUATOR_RESULTS_JSON\}\}/g, () => resultsJson);
    const systemPrompt = buildOutputSchema();
    const evaluatorRawRequest = {
      aggregator: "llm_judge",
      userPrompt,
      systemPrompt,
      target: judgeProvider.targetName
    };
    // Shared result construction for both judge invocation paths.
    const toResult = (data, reasoning) => {
      const score = clampScore(data.score);
      // At most four hits and four misses are kept from the judge's answer.
      const hits = Array.isArray(data.hits) ? data.hits.filter(isNonEmptyString).slice(0, 4) : [];
      const misses = Array.isArray(data.misses) ? data.misses.filter(isNonEmptyString).slice(0, 4) : [];
      return {
        score,
        verdict: scoreToVerdict(score),
        hits,
        misses,
        expectedAspectCount: Math.max(hits.length + misses.length, 1),
        reasoning,
        evaluatorRawRequest,
        evaluatorResults
      };
    };
    try {
      const model = judgeProvider.asLanguageModel?.();
      if (model) {
        const { text } = await (0, import_ai2.generateText)({
          model,
          system: systemPrompt,
          prompt: userPrompt
        });
        const data2 = freeformEvaluationSchema.parse(parseJsonFromText(text));
        return toResult(data2, data2.reasoning);
      }
      const response = await judgeProvider.invoke({
        question: userPrompt,
        systemPrompt,
        evalCaseId: context.evalCase.id,
        attempt: context.attempt
      });
      const data = freeformEvaluationSchema.parse(parseJsonFromText(response.text ?? ""));
      return toResult(data, data.reasoning ?? response.reasoning);
    } catch (error) {
      // Surface the failure instead of returning an unexplained zero score.
      const message = error instanceof Error ? error.message : String(error);
      return {
        score: 0,
        verdict: "fail",
        hits: [],
        misses: [`LLM aggregator failed: ${message}`],
        expectedAspectCount: 1,
        reasoning: message,
        evaluatorRawRequest,
        evaluatorResults
      };
    }
  }
};
|
|
4024
4808
|
|
|
4025
4809
|
// src/evaluation/orchestrator.ts
|
|
4026
4810
|
var import_node_crypto2 = require("crypto");
|
|
4027
4811
|
var import_promises11 = require("fs/promises");
|
|
4028
|
-
var
|
|
4812
|
+
var import_node_path14 = __toESM(require("path"), 1);
|
|
4029
4813
|
|
|
4030
4814
|
// ../../node_modules/.bun/yocto-queue@1.2.2/node_modules/yocto-queue/index.js
|
|
4031
4815
|
var Node = class {
|
|
@@ -4232,7 +5016,7 @@ async function runEvaluation(options) {
|
|
|
4232
5016
|
if (!definition) {
|
|
4233
5017
|
return void 0;
|
|
4234
5018
|
}
|
|
4235
|
-
const resolved = resolveTargetDefinition(definition, envLookup);
|
|
5019
|
+
const resolved = resolveTargetDefinition(definition, envLookup, evalFilePath);
|
|
4236
5020
|
resolvedTargetsByName.set(name, resolved);
|
|
4237
5021
|
return resolved;
|
|
4238
5022
|
};
|
|
@@ -4546,6 +5330,17 @@ async function runEvalCase(options) {
|
|
|
4546
5330
|
if (cacheKey && cache && !cachedResponse) {
|
|
4547
5331
|
await cache.set(cacheKey, providerResponse);
|
|
4548
5332
|
}
|
|
5333
|
+
let candidateTrace = providerResponse.trace;
|
|
5334
|
+
if (!candidateTrace && providerResponse.traceRef) {
|
|
5335
|
+
try {
|
|
5336
|
+
const rawTrace = await readJsonFile(providerResponse.traceRef);
|
|
5337
|
+
if (Array.isArray(rawTrace) && rawTrace.every(isTraceEvent)) {
|
|
5338
|
+
candidateTrace = rawTrace;
|
|
5339
|
+
}
|
|
5340
|
+
} catch {
|
|
5341
|
+
}
|
|
5342
|
+
}
|
|
5343
|
+
const candidateTraceSummary = candidateTrace ? computeTraceSummary(candidateTrace) : void 0;
|
|
4549
5344
|
try {
|
|
4550
5345
|
return await evaluateCandidate({
|
|
4551
5346
|
evalCase,
|
|
@@ -4557,7 +5352,9 @@ async function runEvalCase(options) {
|
|
|
4557
5352
|
nowFn,
|
|
4558
5353
|
attempt,
|
|
4559
5354
|
judgeProvider,
|
|
4560
|
-
agentTimeoutMs
|
|
5355
|
+
agentTimeoutMs,
|
|
5356
|
+
candidateTrace,
|
|
5357
|
+
candidateTraceSummary
|
|
4561
5358
|
});
|
|
4562
5359
|
} catch (error) {
|
|
4563
5360
|
return buildErrorResult(evalCase, target.name, nowFn(), error, promptInputs, provider);
|
|
@@ -4574,7 +5371,9 @@ async function evaluateCandidate(options) {
|
|
|
4574
5371
|
nowFn,
|
|
4575
5372
|
attempt,
|
|
4576
5373
|
judgeProvider,
|
|
4577
|
-
agentTimeoutMs
|
|
5374
|
+
agentTimeoutMs,
|
|
5375
|
+
candidateTrace,
|
|
5376
|
+
candidateTraceSummary
|
|
4578
5377
|
} = options;
|
|
4579
5378
|
const gradeTimestamp = nowFn();
|
|
4580
5379
|
const { score, evaluatorResults } = await runEvaluatorsForCase({
|
|
@@ -4587,7 +5386,9 @@ async function evaluateCandidate(options) {
|
|
|
4587
5386
|
promptInputs,
|
|
4588
5387
|
now: gradeTimestamp,
|
|
4589
5388
|
judgeProvider,
|
|
4590
|
-
agentTimeoutMs
|
|
5389
|
+
agentTimeoutMs,
|
|
5390
|
+
candidateTrace,
|
|
5391
|
+
candidateTraceSummary
|
|
4591
5392
|
});
|
|
4592
5393
|
const completedAt = nowFn();
|
|
4593
5394
|
let agentProviderRequest;
|
|
@@ -4626,7 +5427,8 @@ async function evaluateCandidate(options) {
|
|
|
4626
5427
|
agent_provider_request: agentProviderRequest,
|
|
4627
5428
|
lm_provider_request: lmProviderRequest,
|
|
4628
5429
|
evaluator_provider_request: evaluatorResults ? void 0 : score.evaluatorRawRequest,
|
|
4629
|
-
evaluator_results: evaluatorResults
|
|
5430
|
+
evaluator_results: evaluatorResults,
|
|
5431
|
+
trace_summary: candidateTraceSummary
|
|
4630
5432
|
};
|
|
4631
5433
|
}
|
|
4632
5434
|
async function runEvaluatorsForCase(options) {
|
|
@@ -4640,7 +5442,9 @@ async function runEvaluatorsForCase(options) {
|
|
|
4640
5442
|
promptInputs,
|
|
4641
5443
|
now,
|
|
4642
5444
|
judgeProvider,
|
|
4643
|
-
agentTimeoutMs
|
|
5445
|
+
agentTimeoutMs,
|
|
5446
|
+
candidateTrace,
|
|
5447
|
+
candidateTraceSummary
|
|
4644
5448
|
} = options;
|
|
4645
5449
|
if (evalCase.evaluators && evalCase.evaluators.length > 0) {
|
|
4646
5450
|
return runEvaluatorList({
|
|
@@ -4654,7 +5458,9 @@ async function runEvaluatorsForCase(options) {
|
|
|
4654
5458
|
promptInputs,
|
|
4655
5459
|
now,
|
|
4656
5460
|
judgeProvider,
|
|
4657
|
-
agentTimeoutMs
|
|
5461
|
+
agentTimeoutMs,
|
|
5462
|
+
candidateTrace,
|
|
5463
|
+
candidateTraceSummary
|
|
4658
5464
|
});
|
|
4659
5465
|
}
|
|
4660
5466
|
const evaluatorKind = evalCase.evaluator ?? "llm_judge";
|
|
@@ -4670,7 +5476,9 @@ async function runEvaluatorsForCase(options) {
|
|
|
4670
5476
|
attempt,
|
|
4671
5477
|
promptInputs,
|
|
4672
5478
|
now,
|
|
4673
|
-
judgeProvider
|
|
5479
|
+
judgeProvider,
|
|
5480
|
+
candidateTrace,
|
|
5481
|
+
candidateTraceSummary
|
|
4674
5482
|
});
|
|
4675
5483
|
return { score };
|
|
4676
5484
|
}
|
|
@@ -4686,7 +5494,9 @@ async function runEvaluatorList(options) {
|
|
|
4686
5494
|
promptInputs,
|
|
4687
5495
|
now,
|
|
4688
5496
|
judgeProvider,
|
|
4689
|
-
agentTimeoutMs
|
|
5497
|
+
agentTimeoutMs,
|
|
5498
|
+
candidateTrace,
|
|
5499
|
+
candidateTraceSummary
|
|
4690
5500
|
} = options;
|
|
4691
5501
|
const scored = [];
|
|
4692
5502
|
const evaluatorResults = [];
|
|
@@ -4732,6 +5542,63 @@ async function runEvaluatorList(options) {
|
|
|
4732
5542
|
promptInputs,
|
|
4733
5543
|
now
|
|
4734
5544
|
});
|
|
5545
|
+
scored.push({ score: score2, name: evaluator.name, type: "code_judge" });
|
|
5546
|
+
evaluatorResults.push({
|
|
5547
|
+
name: evaluator.name,
|
|
5548
|
+
type: "code_judge",
|
|
5549
|
+
score: score2.score,
|
|
5550
|
+
verdict: score2.verdict,
|
|
5551
|
+
hits: score2.hits,
|
|
5552
|
+
misses: score2.misses,
|
|
5553
|
+
reasoning: score2.reasoning,
|
|
5554
|
+
evaluator_provider_request: score2.evaluatorRawRequest
|
|
5555
|
+
});
|
|
5556
|
+
}
|
|
5557
|
+
if (evaluator.type === "composite") {
|
|
5558
|
+
const evalFileDir = evalCase.guideline_paths[0] ? import_node_path14.default.dirname(evalCase.guideline_paths[0]) : process.cwd();
|
|
5559
|
+
const createEvaluator = (memberConfig) => {
|
|
5560
|
+
switch (memberConfig.type) {
|
|
5561
|
+
case "llm_judge":
|
|
5562
|
+
return evaluatorRegistry.llm_judge;
|
|
5563
|
+
case "code":
|
|
5564
|
+
return new CodeEvaluator({
|
|
5565
|
+
script: memberConfig.script,
|
|
5566
|
+
cwd: memberConfig.resolvedCwd ?? memberConfig.cwd,
|
|
5567
|
+
agentTimeoutMs
|
|
5568
|
+
});
|
|
5569
|
+
case "composite":
|
|
5570
|
+
return new CompositeEvaluator({
|
|
5571
|
+
config: memberConfig,
|
|
5572
|
+
cwd: evalFileDir,
|
|
5573
|
+
evaluatorFactory: { create: createEvaluator }
|
|
5574
|
+
});
|
|
5575
|
+
case "tool_trajectory":
|
|
5576
|
+
return new ToolTrajectoryEvaluator({
|
|
5577
|
+
config: memberConfig
|
|
5578
|
+
});
|
|
5579
|
+
case "expected_messages":
|
|
5580
|
+
return new ExpectedMessagesEvaluator();
|
|
5581
|
+
default: {
|
|
5582
|
+
const unknownConfig = memberConfig;
|
|
5583
|
+
throw new Error(`Unsupported evaluator type in composite: ${unknownConfig.type}`);
|
|
5584
|
+
}
|
|
5585
|
+
}
|
|
5586
|
+
};
|
|
5587
|
+
const compositeEvaluator = new CompositeEvaluator({
|
|
5588
|
+
config: evaluator,
|
|
5589
|
+
cwd: evalFileDir,
|
|
5590
|
+
evaluatorFactory: { create: createEvaluator }
|
|
5591
|
+
});
|
|
5592
|
+
const score2 = await compositeEvaluator.evaluate({
|
|
5593
|
+
evalCase,
|
|
5594
|
+
candidate,
|
|
5595
|
+
target,
|
|
5596
|
+
provider,
|
|
5597
|
+
attempt,
|
|
5598
|
+
promptInputs,
|
|
5599
|
+
now,
|
|
5600
|
+
judgeProvider
|
|
5601
|
+
});
|
|
4735
5602
|
scored.push({ score: score2, name: evaluator.name, type: evaluator.type });
|
|
4736
5603
|
evaluatorResults.push({
|
|
4737
5604
|
name: evaluator.name,
|
|
@@ -4741,7 +5608,58 @@ async function runEvaluatorList(options) {
|
|
|
4741
5608
|
hits: score2.hits,
|
|
4742
5609
|
misses: score2.misses,
|
|
4743
5610
|
reasoning: score2.reasoning,
|
|
4744
|
-
evaluator_provider_request: score2.evaluatorRawRequest
|
|
5611
|
+
evaluator_provider_request: score2.evaluatorRawRequest,
|
|
5612
|
+
evaluator_results: mapChildResults(score2.evaluatorResults)
|
|
5613
|
+
});
|
|
5614
|
+
}
|
|
5615
|
+
if (evaluator.type === "tool_trajectory") {
|
|
5616
|
+
const trajectoryEvaluator = new ToolTrajectoryEvaluator({
|
|
5617
|
+
config: evaluator
|
|
5618
|
+
});
|
|
5619
|
+
const score2 = trajectoryEvaluator.evaluate({
|
|
5620
|
+
evalCase,
|
|
5621
|
+
candidate,
|
|
5622
|
+
target,
|
|
5623
|
+
provider,
|
|
5624
|
+
attempt,
|
|
5625
|
+
promptInputs,
|
|
5626
|
+
now,
|
|
5627
|
+
candidateTrace,
|
|
5628
|
+
candidateTraceSummary
|
|
5629
|
+
});
|
|
5630
|
+
scored.push({ score: score2, name: evaluator.name, type: evaluator.type });
|
|
5631
|
+
evaluatorResults.push({
|
|
5632
|
+
name: evaluator.name,
|
|
5633
|
+
type: evaluator.type,
|
|
5634
|
+
score: score2.score,
|
|
5635
|
+
verdict: score2.verdict,
|
|
5636
|
+
hits: score2.hits,
|
|
5637
|
+
misses: score2.misses,
|
|
5638
|
+
reasoning: score2.reasoning
|
|
5639
|
+
});
|
|
5640
|
+
}
|
|
5641
|
+
if (evaluator.type === "expected_messages") {
|
|
5642
|
+
const expectedMessagesEvaluator = new ExpectedMessagesEvaluator();
|
|
5643
|
+
const score2 = expectedMessagesEvaluator.evaluate({
|
|
5644
|
+
evalCase,
|
|
5645
|
+
candidate,
|
|
5646
|
+
target,
|
|
5647
|
+
provider,
|
|
5648
|
+
attempt,
|
|
5649
|
+
promptInputs,
|
|
5650
|
+
now,
|
|
5651
|
+
candidateTrace,
|
|
5652
|
+
candidateTraceSummary
|
|
5653
|
+
});
|
|
5654
|
+
scored.push({ score: score2, name: evaluator.name, type: evaluator.type });
|
|
5655
|
+
evaluatorResults.push({
|
|
5656
|
+
name: evaluator.name,
|
|
5657
|
+
type: evaluator.type,
|
|
5658
|
+
score: score2.score,
|
|
5659
|
+
verdict: score2.verdict,
|
|
5660
|
+
hits: score2.hits,
|
|
5661
|
+
misses: score2.misses,
|
|
5662
|
+
reasoning: score2.reasoning
|
|
4745
5663
|
});
|
|
4746
5664
|
}
|
|
4747
5665
|
} catch (error) {
|
|
@@ -4754,14 +5672,15 @@ async function runEvaluatorList(options) {
|
|
|
4754
5672
|
expectedAspectCount: 1,
|
|
4755
5673
|
reasoning: message
|
|
4756
5674
|
};
|
|
5675
|
+
const resultType = evaluator.type === "code" ? "code_judge" : evaluator.type;
|
|
4757
5676
|
scored.push({
|
|
4758
5677
|
score: fallbackScore,
|
|
4759
5678
|
name: evaluator.name ?? "unknown",
|
|
4760
|
-
type:
|
|
5679
|
+
type: resultType ?? "llm_judge"
|
|
4761
5680
|
});
|
|
4762
5681
|
evaluatorResults.push({
|
|
4763
5682
|
name: evaluator.name ?? "unknown",
|
|
4764
|
-
type:
|
|
5683
|
+
type: resultType ?? "llm_judge",
|
|
4765
5684
|
score: 0,
|
|
4766
5685
|
verdict: "fail",
|
|
4767
5686
|
hits: [],
|
|
@@ -4865,8 +5784,8 @@ function buildEvaluatorRegistry(overrides, resolveJudgeProvider) {
|
|
|
4865
5784
|
async function dumpPrompt(directory, evalCase, promptInputs) {
|
|
4866
5785
|
const timestamp = (/* @__PURE__ */ new Date()).toISOString().replace(/[:.]/g, "-");
|
|
4867
5786
|
const filename = `${timestamp}_${sanitizeFilename(evalCase.id)}.json`;
|
|
4868
|
-
const filePath =
|
|
4869
|
-
await (0, import_promises11.mkdir)(
|
|
5787
|
+
const filePath = import_node_path14.default.resolve(directory, filename);
|
|
5788
|
+
await (0, import_promises11.mkdir)(import_node_path14.default.dirname(filePath), { recursive: true });
|
|
4870
5789
|
const payload = {
|
|
4871
5790
|
eval_id: evalCase.id,
|
|
4872
5791
|
question: promptInputs.question,
|
|
@@ -4979,6 +5898,23 @@ function isTimeoutLike(error) {
|
|
|
4979
5898
|
const value = String(error).toLowerCase();
|
|
4980
5899
|
return value.includes("timeout");
|
|
4981
5900
|
}
|
|
5901
|
+
/**
 * Converts nested evaluator results from the internal camelCase shape to the
 * snake_case shape used in result records, recursing into child results.
 *
 * @param children Array of child evaluator results, or a falsy value.
 * @returns The converted array, or undefined when there are no children.
 */
function mapChildResults(children) {
  const hasChildren = Boolean(children) && children.length !== 0;
  if (!hasChildren) {
    return undefined;
  }
  const convert = (child) => {
    const { name, type, score, weight, verdict, hits, misses, reasoning } = child;
    return {
      name,
      type,
      score,
      weight,
      verdict,
      hits,
      misses,
      reasoning,
      evaluator_provider_request: child.evaluatorRawRequest,
      evaluator_results: mapChildResults(child.evaluatorResults)
    };
  };
  return children.map((child) => convert(child));
}
|
|
4982
5918
|
|
|
4983
5919
|
// src/evaluation/generators/rubric-generator.ts
|
|
4984
5920
|
var import_ai3 = require("ai");
|
|
@@ -5067,11 +6003,15 @@ function createAgentKernel() {
|
|
|
5067
6003
|
// Annotate the CommonJS export names for ESM import in node:
|
|
5068
6004
|
0 && (module.exports = {
|
|
5069
6005
|
CodeEvaluator,
|
|
6006
|
+
CompositeEvaluator,
|
|
6007
|
+
ExpectedMessagesEvaluator,
|
|
5070
6008
|
LlmJudgeEvaluator,
|
|
5071
6009
|
TEST_MESSAGE_ROLES,
|
|
6010
|
+
ToolTrajectoryEvaluator,
|
|
5072
6011
|
buildDirectoryChain,
|
|
5073
6012
|
buildPromptInputs,
|
|
5074
6013
|
buildSearchRoots,
|
|
6014
|
+
computeTraceSummary,
|
|
5075
6015
|
consumeCodexLogEntries,
|
|
5076
6016
|
createAgentKernel,
|
|
5077
6017
|
createProvider,
|
|
@@ -5082,14 +6022,18 @@ function createAgentKernel() {
|
|
|
5082
6022
|
generateRubrics,
|
|
5083
6023
|
getHitCount,
|
|
5084
6024
|
isEvaluatorKind,
|
|
6025
|
+
isExpectedToolCall,
|
|
5085
6026
|
isGuidelineFile,
|
|
5086
6027
|
isJsonObject,
|
|
5087
6028
|
isJsonValue,
|
|
5088
6029
|
isTestMessage,
|
|
5089
6030
|
isTestMessageRole,
|
|
6031
|
+
isTraceEvent,
|
|
6032
|
+
isTraceEventType,
|
|
5090
6033
|
listTargetNames,
|
|
5091
6034
|
loadEvalCases,
|
|
5092
6035
|
normalizeLineEndings,
|
|
6036
|
+
readJsonFile,
|
|
5093
6037
|
readTargetDefinitions,
|
|
5094
6038
|
readTestSuiteMetadata,
|
|
5095
6039
|
readTextFile,
|