@agentv/core 0.23.0 → 0.26.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/dist/{chunk-B2J23S7D.js → chunk-NDEN3H2B.js} +28 -17
- package/dist/chunk-NDEN3H2B.js.map +1 -0
- package/dist/evaluation/validation/index.cjs +64 -17
- package/dist/evaluation/validation/index.cjs.map +1 -1
- package/dist/evaluation/validation/index.js +48 -2
- package/dist/evaluation/validation/index.js.map +1 -1
- package/dist/index.cjs +674 -62
- package/dist/index.cjs.map +1 -1
- package/dist/index.d.cts +157 -4
- package/dist/index.d.ts +157 -4
- package/dist/index.js +629 -33
- package/dist/index.js.map +1 -1
- package/package.json +5 -2
- package/dist/chunk-B2J23S7D.js.map +0 -1
package/dist/index.js
CHANGED
|
@@ -5,10 +5,11 @@ import {
|
|
|
5
5
|
findGitRoot,
|
|
6
6
|
isAgentProvider,
|
|
7
7
|
normalizeLineEndings,
|
|
8
|
+
readJsonFile,
|
|
8
9
|
readTextFile,
|
|
9
10
|
resolveFileReference,
|
|
10
11
|
resolveTargetDefinition
|
|
11
|
-
} from "./chunk-
|
|
12
|
+
} from "./chunk-NDEN3H2B.js";
|
|
12
13
|
|
|
13
14
|
// src/evaluation/types.ts
|
|
14
15
|
var TEST_MESSAGE_ROLE_VALUES = ["system", "user", "assistant", "tool"];
|
|
@@ -51,7 +52,14 @@ function isTestMessage(value) {
|
|
|
51
52
|
}
|
|
52
53
|
return candidate.content.every(isJsonObject);
|
|
53
54
|
}
|
|
54
|
-
var EVALUATOR_KIND_VALUES = [
|
|
55
|
+
// Evaluator kinds recognized by this module; EVALUATOR_KIND_SET is built from this list.
var EVALUATOR_KIND_VALUES = [
  "code_judge",
  "llm_judge",
  "rubric",
  "composite",
  "tool_trajectory",
  "expected_messages",
];
|
|
55
63
|
var EVALUATOR_KIND_SET = new Set(EVALUATOR_KIND_VALUES);
|
|
56
64
|
function isEvaluatorKind(value) {
|
|
57
65
|
return typeof value === "string" && EVALUATOR_KIND_SET.has(value);
|
|
@@ -60,6 +68,44 @@ function getHitCount(result) {
|
|
|
60
68
|
return result.hits.length;
|
|
61
69
|
}
|
|
62
70
|
|
|
71
|
+
// src/evaluation/trace.ts
|
|
72
|
+
/**
 * Type guard for trace event type names.
 *
 * @param {unknown} value - Candidate value.
 * @returns {boolean} True when `value` is one of the strings "model_step",
 *   "tool_call", "tool_result", "message" or "error".
 */
function isTraceEventType(value) {
  if (typeof value !== "string") {
    return false;
  }
  switch (value) {
    case "model_step":
    case "tool_call":
    case "tool_result":
    case "message":
    case "error":
      return true;
    default:
      return false;
  }
}
|
|
75
|
+
/**
 * Type guard for a single trace event.
 *
 * @param {unknown} value - Candidate value.
 * @returns {boolean} True when `value` is a non-null object whose `type`
 *   passes `isTraceEventType` and whose `timestamp` is a string.
 */
function isTraceEvent(value) {
  // typeof null is "object", so null is ruled out explicitly.
  if (value === null || typeof value !== "object") {
    return false;
  }
  if (!isTraceEventType(value.type)) {
    return false;
  }
  return typeof value.timestamp === "string";
}
|
|
82
|
+
/**
 * Type guard for an expected tool call entry.
 *
 * @param {unknown} value - Candidate value.
 * @returns {boolean} True when `value` is a non-null object with a string `tool` property.
 */
function isExpectedToolCall(value) {
  return value !== null && typeof value === "object" && typeof value.tool === "string";
}
|
|
89
|
+
/**
 * Builds aggregate statistics for a trace.
 *
 * @param {Array<object>} trace - Trace events; each is read for `type` and `name`.
 * @returns {{eventCount: number, toolNames: string[], toolCallsByName: Object<string, number>, errorCount: number}}
 *   `eventCount` is the trace length, `toolCallsByName` counts "tool_call"
 *   events per truthy `name`, `toolNames` is the sorted list of those names,
 *   and `errorCount` counts events of type "error".
 */
function computeTraceSummary(trace) {
  // Count in a Map rather than a plain object: tool names come from the
  // provider, and names such as "constructor" or "toString" would otherwise
  // read inherited Object.prototype members and corrupt the count.
  const counts = new Map();
  let errorCount = 0;
  for (const event of trace) {
    if (event.type === "tool_call" && event.name) {
      // Normalize to a string key, matching plain-object property semantics.
      const key = String(event.name);
      counts.set(key, (counts.get(key) ?? 0) + 1);
    }
    if (event.type === "error") {
      errorCount++;
    }
  }
  return {
    eventCount: trace.length,
    toolNames: [...counts.keys()].sort(),
    // Object.fromEntries defines own data properties, so even "__proto__"
    // ends up as an ordinary key.
    toolCallsByName: Object.fromEntries(counts),
    errorCount
  };
}
|
|
108
|
+
|
|
63
109
|
// src/evaluation/yaml-parser.ts
|
|
64
110
|
import { readFile as readFile5 } from "node:fs/promises";
|
|
65
111
|
import path6 from "node:path";
|
|
@@ -409,6 +455,7 @@ async function parseEvaluators(rawEvalCase, globalExecution, searchRoots, evalId
|
|
|
409
455
|
logWarning2(`Skipping code_judge evaluator '${name}' in '${evalId}': missing script`);
|
|
410
456
|
continue;
|
|
411
457
|
}
|
|
458
|
+
const weight2 = validateWeight(rawEvaluator.weight, name, evalId);
|
|
412
459
|
const cwd = asString2(rawEvaluator.cwd);
|
|
413
460
|
let resolvedCwd;
|
|
414
461
|
if (cwd) {
|
|
@@ -429,7 +476,8 @@ async function parseEvaluators(rawEvalCase, globalExecution, searchRoots, evalId
|
|
|
429
476
|
type: "code",
|
|
430
477
|
script,
|
|
431
478
|
cwd,
|
|
432
|
-
resolvedCwd
|
|
479
|
+
resolvedCwd,
|
|
480
|
+
...weight2 !== void 0 ? { weight: weight2 } : {}
|
|
433
481
|
});
|
|
434
482
|
continue;
|
|
435
483
|
}
|
|
@@ -524,14 +572,89 @@ async function parseEvaluators(rawEvalCase, globalExecution, searchRoots, evalId
|
|
|
524
572
|
...promptPath2 ? { promptPath: promptPath2 } : {}
|
|
525
573
|
};
|
|
526
574
|
}
|
|
575
|
+
const weight2 = validateWeight(rawEvaluator.weight, name, evalId);
|
|
527
576
|
evaluators.push({
|
|
528
577
|
name,
|
|
529
578
|
type: "composite",
|
|
530
579
|
evaluators: memberEvaluators,
|
|
531
|
-
aggregator
|
|
580
|
+
aggregator,
|
|
581
|
+
...weight2 !== void 0 ? { weight: weight2 } : {}
|
|
582
|
+
});
|
|
583
|
+
continue;
|
|
584
|
+
}
|
|
585
|
+
if (typeValue === "expected_messages") {
|
|
586
|
+
const weight2 = validateWeight(rawEvaluator.weight, name, evalId);
|
|
587
|
+
evaluators.push({
|
|
588
|
+
name,
|
|
589
|
+
type: "expected_messages",
|
|
590
|
+
...weight2 !== void 0 ? { weight: weight2 } : {}
|
|
532
591
|
});
|
|
533
592
|
continue;
|
|
534
593
|
}
|
|
594
|
+
if (typeValue === "tool_trajectory") {
|
|
595
|
+
const mode = asString2(rawEvaluator.mode);
|
|
596
|
+
if (mode !== "any_order" && mode !== "in_order" && mode !== "exact") {
|
|
597
|
+
logWarning2(
|
|
598
|
+
`Skipping tool_trajectory evaluator '${name}' in '${evalId}': invalid mode '${mode}' (must be any_order, in_order, or exact)`
|
|
599
|
+
);
|
|
600
|
+
continue;
|
|
601
|
+
}
|
|
602
|
+
const rawMinimums = rawEvaluator.minimums;
|
|
603
|
+
let minimums;
|
|
604
|
+
if (rawMinimums !== void 0) {
|
|
605
|
+
if (!isJsonObject2(rawMinimums)) {
|
|
606
|
+
logWarning2(
|
|
607
|
+
`Skipping tool_trajectory evaluator '${name}' in '${evalId}': minimums must be an object`
|
|
608
|
+
);
|
|
609
|
+
continue;
|
|
610
|
+
}
|
|
611
|
+
minimums = {};
|
|
612
|
+
for (const [toolName, count] of Object.entries(rawMinimums)) {
|
|
613
|
+
if (typeof count === "number" && count >= 0) {
|
|
614
|
+
minimums[toolName] = count;
|
|
615
|
+
}
|
|
616
|
+
}
|
|
617
|
+
}
|
|
618
|
+
const rawExpected = rawEvaluator.expected;
|
|
619
|
+
let expected;
|
|
620
|
+
if (rawExpected !== void 0) {
|
|
621
|
+
if (!Array.isArray(rawExpected)) {
|
|
622
|
+
logWarning2(
|
|
623
|
+
`Skipping tool_trajectory evaluator '${name}' in '${evalId}': expected must be an array`
|
|
624
|
+
);
|
|
625
|
+
continue;
|
|
626
|
+
}
|
|
627
|
+
expected = [];
|
|
628
|
+
for (const item of rawExpected) {
|
|
629
|
+
if (isJsonObject2(item) && typeof item.tool === "string") {
|
|
630
|
+
expected.push({ tool: item.tool });
|
|
631
|
+
}
|
|
632
|
+
}
|
|
633
|
+
}
|
|
634
|
+
if (mode === "any_order" && !minimums) {
|
|
635
|
+
logWarning2(
|
|
636
|
+
`Skipping tool_trajectory evaluator '${name}' in '${evalId}': any_order mode requires minimums`
|
|
637
|
+
);
|
|
638
|
+
continue;
|
|
639
|
+
}
|
|
640
|
+
if ((mode === "in_order" || mode === "exact") && !expected) {
|
|
641
|
+
logWarning2(
|
|
642
|
+
`Skipping tool_trajectory evaluator '${name}' in '${evalId}': ${mode} mode requires expected`
|
|
643
|
+
);
|
|
644
|
+
continue;
|
|
645
|
+
}
|
|
646
|
+
const weight2 = validateWeight(rawEvaluator.weight, name, evalId);
|
|
647
|
+
const config = {
|
|
648
|
+
name,
|
|
649
|
+
type: "tool_trajectory",
|
|
650
|
+
mode,
|
|
651
|
+
...minimums ? { minimums } : {},
|
|
652
|
+
...expected ? { expected } : {},
|
|
653
|
+
...weight2 !== void 0 ? { weight: weight2 } : {}
|
|
654
|
+
};
|
|
655
|
+
evaluators.push(config);
|
|
656
|
+
continue;
|
|
657
|
+
}
|
|
535
658
|
const prompt = asString2(rawEvaluator.prompt);
|
|
536
659
|
let promptPath;
|
|
537
660
|
if (prompt) {
|
|
@@ -568,19 +691,23 @@ async function parseEvaluators(rawEvalCase, globalExecution, searchRoots, evalId
|
|
|
568
691
|
logWarning2(`Skipping rubric evaluator '${name}' in '${evalId}': no valid rubrics found`);
|
|
569
692
|
continue;
|
|
570
693
|
}
|
|
694
|
+
const weight2 = validateWeight(rawEvaluator.weight, name, evalId);
|
|
571
695
|
evaluators.push({
|
|
572
696
|
name,
|
|
573
697
|
type: "llm_judge",
|
|
574
|
-
rubrics: parsedRubrics
|
|
698
|
+
rubrics: parsedRubrics,
|
|
699
|
+
...weight2 !== void 0 ? { weight: weight2 } : {}
|
|
575
700
|
});
|
|
576
701
|
continue;
|
|
577
702
|
}
|
|
703
|
+
const weight = validateWeight(rawEvaluator.weight, name, evalId);
|
|
578
704
|
evaluators.push({
|
|
579
705
|
name,
|
|
580
706
|
type: "llm_judge",
|
|
581
707
|
prompt,
|
|
582
708
|
promptPath,
|
|
583
|
-
...parsedRubrics && parsedRubrics.length > 0 ? { rubrics: parsedRubrics } : {}
|
|
709
|
+
...parsedRubrics && parsedRubrics.length > 0 ? { rubrics: parsedRubrics } : {},
|
|
710
|
+
...weight !== void 0 ? { weight } : {}
|
|
584
711
|
});
|
|
585
712
|
}
|
|
586
713
|
return evaluators.length > 0 ? evaluators : void 0;
|
|
@@ -610,6 +737,27 @@ ${detailBlock}${ANSI_RESET3}`);
|
|
|
610
737
|
console.warn(`${ANSI_YELLOW3}Warning: ${message}${ANSI_RESET3}`);
|
|
611
738
|
}
|
|
612
739
|
}
|
|
740
|
+
/**
 * Validates an evaluator's optional `weight` setting.
 *
 * @param {unknown} rawWeight - Raw weight value from the evaluator definition.
 * @param {string} evaluatorName - Evaluator name, used in error messages.
 * @param {string} evalId - Eval case id, used in error messages.
 * @returns {number|undefined} The weight, or undefined when none was given.
 * @throws {Error} When the weight is not a number, not finite, or negative.
 */
function validateWeight(rawWeight, evaluatorName, evalId) {
  // Build the message lazily so nothing is formatted for valid weights.
  const reject = (reason) => {
    throw new Error(`Invalid weight for evaluator '${evaluatorName}' in '${evalId}': ${reason}`);
  };
  if (rawWeight === void 0) {
    return void 0;
  }
  if (typeof rawWeight !== "number") {
    reject("must be a number");
  }
  if (!Number.isFinite(rawWeight)) {
    reject(`must be finite (got ${rawWeight})`);
  }
  if (rawWeight < 0) {
    reject(`must be non-negative (got ${rawWeight})`);
  }
  return rawWeight;
}
|
|
613
761
|
|
|
614
762
|
// src/evaluation/loaders/message-processor.ts
|
|
615
763
|
import { readFile as readFile3 } from "node:fs/promises";
|
|
@@ -785,6 +933,67 @@ ${detailBlock}${ANSI_RESET4}`);
|
|
|
785
933
|
console.warn(`${ANSI_YELLOW4}Warning: ${message}${ANSI_RESET4}`);
|
|
786
934
|
}
|
|
787
935
|
}
|
|
936
|
+
/**
 * Converts expected messages into output segments.
 *
 * Each message yields one segment carrying its `role`. `tool_calls` is copied
 * only for assistant messages. String content is copied as-is. Array content
 * is processed item by item: non-object items are dropped, `type: "file"`
 * items are resolved against `searchRoots` and replaced by the file's text,
 * and every other object item is cloned unchanged.
 *
 * @param {object} options
 * @param {Array<object>} options.messages - Expected messages to process.
 * @param {Array<string>} options.searchRoots - Roots passed to the file reference resolver.
 * @param {boolean} [options.verbose] - When truthy, logs each resolved file.
 * @returns {Promise<Array<object>>} One segment per input message, in order.
 */
async function processExpectedMessages(options) {
  const { messages, searchRoots, verbose } = options;
  const segments = [];
  for (const message of messages) {
    const segment = {
      role: message.role
    };
    if (message.role === "assistant" && message.tool_calls !== void 0) {
      segment.tool_calls = message.tool_calls;
    }
    const content = message.content;
    if (typeof content === "string") {
      segment.content = content;
    } else if (Array.isArray(content)) {
      const processedContent = [];
      for (const rawSegment of content) {
        if (!isJsonObject(rawSegment)) {
          continue;
        }
        const segmentType = asString3(rawSegment.type);
        if (segmentType === "file") {
          const rawValue = asString3(rawSegment.value);
          if (!rawValue) {
            continue;
          }
          const { displayPath, resolvedPath, attempted } = await resolveFileReference2(
            rawValue,
            searchRoots
          );
          if (!resolvedPath) {
            const attempts = attempted.length ? ["  Tried:", ...attempted.map((candidate) => `    ${candidate}`)] : void 0;
            logWarning3(`File not found in expected_messages: ${displayPath}`, attempts);
            continue;
          }
          try {
            // Normalize CRLF so file text is the same regardless of checkout settings.
            const fileContent = (await readFile3(resolvedPath, "utf8")).replace(/\r\n/g, "\n");
            processedContent.push({
              type: "file",
              path: displayPath,
              text: fileContent,
              resolvedPath: path4.resolve(resolvedPath)
            });
            if (verbose) {
              console.log(`  [Expected Output File] Found: ${displayPath}`);
              console.log(`    Resolved to: ${resolvedPath}`);
            }
          } catch (error) {
            // Best effort: an unreadable file is reported and skipped. Non-Error
            // throwables have no `.message`, so fall back to their string form.
            const reason = error instanceof Error ? error.message : String(error);
            logWarning3(
              `Could not read expected output file ${resolvedPath}: ${reason}`
            );
          }
          continue;
        }
        processedContent.push(cloneJsonObject(rawSegment));
      }
      segment.content = processedContent;
    }
    segments.push(segment);
  }
  return segments;
}
|
|
788
997
|
|
|
789
998
|
// src/evaluation/formatting/prompt-builder.ts
|
|
790
999
|
import { readFile as readFile4 } from "node:fs/promises";
|
|
@@ -1089,12 +1298,10 @@ async function loadEvalCases(evalFilePath, repoRoot, options) {
|
|
|
1089
1298
|
messageType: "input",
|
|
1090
1299
|
verbose
|
|
1091
1300
|
});
|
|
1092
|
-
const outputSegments = hasExpectedMessages ? await
|
|
1301
|
+
const outputSegments = hasExpectedMessages ? await processExpectedMessages({
|
|
1093
1302
|
messages: expectedMessages,
|
|
1094
1303
|
searchRoots,
|
|
1095
1304
|
repoRootPath,
|
|
1096
|
-
guidelinePatterns,
|
|
1097
|
-
messageType: "output",
|
|
1098
1305
|
verbose
|
|
1099
1306
|
}) : [];
|
|
1100
1307
|
const codeSnippets = extractCodeBlocks(inputSegments);
|
|
@@ -1618,9 +1825,11 @@ var CliProvider = class {
|
|
|
1618
1825
|
const message = detail ? `${detail} (exit code ${codeText})` : `CLI exited with code ${codeText}`;
|
|
1619
1826
|
throw new Error(message);
|
|
1620
1827
|
}
|
|
1621
|
-
const
|
|
1828
|
+
const responseContent = await this.readAndCleanupOutputFile(outputFilePath);
|
|
1829
|
+
const parsed = this.parseOutputContent(responseContent);
|
|
1622
1830
|
return {
|
|
1623
|
-
text:
|
|
1831
|
+
text: parsed.text,
|
|
1832
|
+
trace: parsed.trace,
|
|
1624
1833
|
raw: {
|
|
1625
1834
|
command: renderedCommand,
|
|
1626
1835
|
stderr: result.stderr,
|
|
@@ -1630,6 +1839,31 @@ var CliProvider = class {
|
|
|
1630
1839
|
}
|
|
1631
1840
|
};
|
|
1632
1841
|
}
|
|
1842
|
+
/**
|
|
1843
|
+
* Parse output content from CLI.
|
|
1844
|
+
* If the content is valid JSON with a 'text' field, extract text and optional trace.
|
|
1845
|
+
* Otherwise, treat the entire content as plain text.
|
|
1846
|
+
*/
|
|
1847
|
+
parseOutputContent(content) {
|
|
1848
|
+
try {
|
|
1849
|
+
const parsed = JSON.parse(content);
|
|
1850
|
+
if (typeof parsed === "object" && parsed !== null && "text" in parsed) {
|
|
1851
|
+
const obj = parsed;
|
|
1852
|
+
const text = typeof obj.text === "string" ? obj.text : String(obj.text);
|
|
1853
|
+
const trace = this.parseTrace(obj.trace);
|
|
1854
|
+
return { text, trace };
|
|
1855
|
+
}
|
|
1856
|
+
} catch {
|
|
1857
|
+
}
|
|
1858
|
+
return { text: content };
|
|
1859
|
+
}
|
|
1860
|
+
parseTrace(trace) {
|
|
1861
|
+
if (!Array.isArray(trace)) {
|
|
1862
|
+
return void 0;
|
|
1863
|
+
}
|
|
1864
|
+
const validEvents = trace.filter(isTraceEvent);
|
|
1865
|
+
return validEvents.length > 0 ? validEvents : void 0;
|
|
1866
|
+
}
|
|
1633
1867
|
async readAndCleanupOutputFile(filePath) {
|
|
1634
1868
|
try {
|
|
1635
1869
|
const content = await readTextFile(filePath);
|
|
@@ -2616,6 +2850,7 @@ var MockProvider = class {
|
|
|
2616
2850
|
delayMs;
|
|
2617
2851
|
delayMinMs;
|
|
2618
2852
|
delayMaxMs;
|
|
2853
|
+
trace;
|
|
2619
2854
|
constructor(targetName, config) {
|
|
2620
2855
|
this.id = `mock:${targetName}`;
|
|
2621
2856
|
this.targetName = targetName;
|
|
@@ -2623,6 +2858,7 @@ var MockProvider = class {
|
|
|
2623
2858
|
this.delayMs = config.delayMs ?? 0;
|
|
2624
2859
|
this.delayMinMs = config.delayMinMs ?? 0;
|
|
2625
2860
|
this.delayMaxMs = config.delayMaxMs ?? 0;
|
|
2861
|
+
this.trace = config.trace;
|
|
2626
2862
|
}
|
|
2627
2863
|
async invoke(request) {
|
|
2628
2864
|
const delay = this.calculateDelay();
|
|
@@ -2634,7 +2870,8 @@ var MockProvider = class {
|
|
|
2634
2870
|
raw: {
|
|
2635
2871
|
question: request.question,
|
|
2636
2872
|
guidelines: request.guidelines
|
|
2637
|
-
}
|
|
2873
|
+
},
|
|
2874
|
+
trace: this.trace
|
|
2638
2875
|
};
|
|
2639
2876
|
}
|
|
2640
2877
|
calculateDelay() {
|
|
@@ -3306,9 +3543,11 @@ var CodeEvaluator = class {
|
|
|
3306
3543
|
expected_outcome: context.evalCase.expected_outcome,
|
|
3307
3544
|
reference_answer: context.evalCase.reference_answer,
|
|
3308
3545
|
candidate_answer: context.candidate,
|
|
3309
|
-
|
|
3310
|
-
input_files: context.evalCase.file_paths
|
|
3311
|
-
|
|
3546
|
+
guideline_files: context.evalCase.guideline_paths,
|
|
3547
|
+
input_files: context.evalCase.file_paths.filter(
|
|
3548
|
+
(path13) => !context.evalCase.guideline_paths.includes(path13)
|
|
3549
|
+
),
|
|
3550
|
+
input_messages: context.evalCase.input_messages
|
|
3312
3551
|
},
|
|
3313
3552
|
null,
|
|
3314
3553
|
2
|
|
@@ -3428,6 +3667,251 @@ function substituteVariables(template, variables) {
|
|
|
3428
3667
|
return variables[varName] ?? match;
|
|
3429
3668
|
});
|
|
3430
3669
|
}
|
|
3670
|
+
/**
 * Scores a candidate's tool-call trace against configured expectations.
 *
 * Modes (from `config.mode`):
 *  - "any_order": each tool in `config.minimums` must be called at least the given number of times.
 *  - "in_order":  the tools in `config.expected` must appear in that order; other calls may be interleaved.
 *  - "exact":     the tool calls must match `config.expected` position by position, with no extras.
 */
var ToolTrajectoryEvaluator = class {
  kind = "tool_trajectory";
  config;
  /**
   * @param {{config: object}} options - `config` carries `mode` plus `minimums` and/or `expected`.
   */
  constructor(options) {
    this.config = options.config;
  }
  /**
   * Runs the evaluation for the configured mode.
   *
   * @param {object} context - Read for `candidateTrace` and `candidateTraceSummary`.
   * @returns {{score: number, verdict: string, hits: string[], misses: string[], expectedAspectCount: number}}
   *   A failing result when either trace input is missing or the mode is unknown.
   */
  evaluate(context) {
    const { candidateTrace, candidateTraceSummary } = context;
    if (!candidateTrace || !candidateTraceSummary) {
      return {
        score: 0,
        verdict: "fail",
        hits: [],
        misses: ["No trace available for evaluation"],
        expectedAspectCount: 1
      };
    }
    switch (this.config.mode) {
      case "any_order":
        return this.evaluateAnyOrder(candidateTraceSummary);
      case "in_order":
        return this.evaluateInOrder(candidateTrace);
      case "exact":
        return this.evaluateExact(candidateTrace);
      default:
        return {
          score: 0,
          verdict: "fail",
          hits: [],
          misses: [`Unknown mode: ${this.config.mode}`],
          expectedAspectCount: 1
        };
    }
  }
  /**
   * Checks per-tool minimum call counts; the score is the fraction of tools meeting their minimum.
   *
   * @param {{toolCallsByName: Object<string, number>}} summary - Trace summary with per-tool counts.
   */
  evaluateAnyOrder(summary) {
    const minimums = this.config.minimums ?? {};
    const toolNames = Object.keys(minimums);
    if (toolNames.length === 0) {
      return {
        score: 1,
        verdict: "pass",
        hits: ["No tool requirements specified"],
        misses: [],
        expectedAspectCount: 0
      };
    }
    const hits = [];
    const misses = [];
    for (const toolName of toolNames) {
      const required = minimums[toolName];
      // Own-property lookup only: a tool named e.g. "constructor" must not
      // resolve to an inherited Object.prototype member.
      const actual = Object.hasOwn(summary.toolCallsByName, toolName) ? summary.toolCallsByName[toolName] ?? 0 : 0;
      if (actual >= required) {
        hits.push(`${toolName}: called ${actual} times (required \u2265${required})`);
      } else {
        misses.push(`${toolName}: called ${actual} times (required \u2265${required})`);
      }
    }
    const score = hits.length / toolNames.length;
    return {
      score,
      verdict: scoreToVerdict(score),
      hits,
      misses,
      expectedAspectCount: toolNames.length
    };
  }
  /**
   * Checks that the expected tools occur in order as a subsequence of the
   * named tool calls; the score is the fraction of expected tools found.
   *
   * @param {Array<object>} trace - Trace events.
   */
  evaluateInOrder(trace) {
    const expected = this.config.expected ?? [];
    if (expected.length === 0) {
      return {
        score: 1,
        verdict: "pass",
        hits: ["No tool sequence specified"],
        misses: [],
        expectedAspectCount: 0
      };
    }
    const actualToolCalls = trace.filter((e) => e.type === "tool_call" && e.name);
    const hits = [];
    const misses = [];
    // Single forward cursor: once a call is consumed or skipped it is never revisited.
    let actualIndex = 0;
    for (let i = 0; i < expected.length; i++) {
      const expectedTool = expected[i].tool;
      let found = false;
      while (actualIndex < actualToolCalls.length) {
        if (actualToolCalls[actualIndex].name === expectedTool) {
          hits.push(`Found ${expectedTool} at position ${actualIndex}`);
          actualIndex++;
          found = true;
          break;
        }
        actualIndex++;
      }
      if (!found) {
        misses.push(`Expected ${expectedTool} at position ${i}, not found in remaining trace`);
      }
    }
    const score = hits.length / expected.length;
    return {
      score,
      verdict: scoreToVerdict(score),
      hits,
      misses,
      expectedAspectCount: expected.length
    };
  }
  /**
   * Checks that the named tool calls match the expected sequence position by
   * position. Missing and surplus calls both lower the score.
   *
   * @param {Array<object>} trace - Trace events.
   */
  evaluateExact(trace) {
    const expected = this.config.expected ?? [];
    if (expected.length === 0) {
      return {
        score: 1,
        verdict: "pass",
        hits: ["No tool sequence specified"],
        misses: [],
        expectedAspectCount: 0
      };
    }
    const actualToolCalls = trace.filter((e) => e.type === "tool_call" && e.name);
    const hits = [];
    const misses = [];
    if (actualToolCalls.length !== expected.length) {
      misses.push(`Expected ${expected.length} tool calls, got ${actualToolCalls.length}`);
    }
    const checkLength = Math.min(expected.length, actualToolCalls.length);
    for (let i = 0; i < checkLength; i++) {
      const expectedTool = expected[i].tool;
      const actualTool = actualToolCalls[i].name;
      if (actualTool === expectedTool) {
        hits.push(`Position ${i}: ${expectedTool} \u2713`);
      } else {
        misses.push(`Position ${i}: expected ${expectedTool}, got ${actualTool}`);
      }
    }
    for (let i = checkLength; i < expected.length; i++) {
      misses.push(`Position ${i}: expected ${expected[i].tool}, got nothing`);
    }
    // Divide by the longer sequence so surplus tool calls cannot yield a
    // perfect score in exact mode while a length-mismatch miss is reported.
    const score = hits.length / Math.max(expected.length, actualToolCalls.length);
    return {
      score,
      verdict: scoreToVerdict(score),
      hits,
      misses,
      expectedAspectCount: expected.length
    };
  }
};
|
|
3816
|
+
/**
 * Validates the candidate's tool calls against the `tool_calls` declared on
 * assistant messages in the eval case's expected segments.
 */
var ExpectedMessagesEvaluator = class {
  kind = "expected_messages";
  /**
   * @param {object} context - Read for `candidateTrace` and `evalCase.expected_segments`.
   * @returns {{score: number, verdict: string, hits: string[], misses: string[], expectedAspectCount: number}}
   *   Passes when no tool calls are expected; fails when tool calls are
   *   expected but the trace is missing or empty.
   */
  evaluate(context) {
    const { candidateTrace, evalCase } = context;
    const expectedSegments = evalCase.expected_segments;
    const expectedToolCalls = this.extractExpectedToolCalls(expectedSegments);
    if (expectedToolCalls.length === 0) {
      return {
        score: 1,
        verdict: "pass",
        hits: ["No tool_calls specified in expected_messages"],
        misses: [],
        expectedAspectCount: 1
      };
    }
    if (!candidateTrace || candidateTrace.length === 0) {
      return {
        score: 0,
        verdict: "fail",
        hits: [],
        misses: ["No trace available to validate tool_calls"],
        expectedAspectCount: expectedToolCalls.length
      };
    }
    const actualToolCalls = candidateTrace.filter((e) => e.type === "tool_call");
    return this.validateToolCalls(expectedToolCalls, actualToolCalls);
  }
  /**
   * Collects `{ tool, input }` pairs from the `tool_calls` arrays of assistant
   * segments, in order. Entries without a string `tool` are ignored.
   *
   * @param {Array<object>|undefined} segments - Expected segments.
   * @returns {Array<{tool: string, input: unknown}>}
   */
  extractExpectedToolCalls(segments) {
    if (!segments) {
      return [];
    }
    const toolCalls = [];
    for (const segment of segments) {
      const role = segment.role;
      const segmentToolCalls = segment.tool_calls;
      if (role === "assistant" && Array.isArray(segmentToolCalls)) {
        for (const tc of segmentToolCalls) {
          if (typeof tc === "object" && tc !== null && typeof tc.tool === "string") {
            const toolCall = tc;
            toolCalls.push({ tool: toolCall.tool, input: toolCall.input });
          }
        }
      }
    }
    return toolCalls;
  }
  /**
   * Compares expected and actual tool calls position by position. A position
   * is a hit when the tool name matches and, if the expectation declares an
   * `input`, that input is deeply equal to the actual one.
   *
   * @param {Array<{tool: string, input: unknown}>} expected
   * @param {Array<object>} actual - "tool_call" trace events.
   */
  validateToolCalls(expected, actual) {
    const hits = [];
    const misses = [];
    for (let i = 0; i < expected.length; i++) {
      const expectedCall = expected[i];
      const actualCall = actual[i];
      if (!actualCall) {
        misses.push(
          `tool_calls[${i}]: expected ${expectedCall.tool}, but no more tool calls in trace`
        );
        continue;
      }
      if (actualCall.name !== expectedCall.tool) {
        misses.push(
          `tool_calls[${i}]: expected ${expectedCall.tool}, got ${actualCall.name ?? "unknown"}`
        );
        continue;
      }
      if (expectedCall.input !== void 0) {
        if (!this.deepEquals(expectedCall.input, actualCall.input)) {
          misses.push(`tool_calls[${i}]: ${expectedCall.tool} input mismatch`);
          continue;
        }
      }
      hits.push(`tool_calls[${i}]: ${expectedCall.tool} matched`);
    }
    // Guard against division by zero when called with no expectations.
    const totalChecks = expected.length || 1;
    const score = hits.length / totalChecks;
    return {
      score,
      verdict: score >= 0.8 ? "pass" : score >= 0.6 ? "borderline" : "fail",
      hits,
      misses,
      expectedAspectCount: totalChecks
    };
  }
  /**
   * Structural equality for JSON-like values (primitives, arrays, plain objects).
   *
   * @param {unknown} a
   * @param {unknown} b
   * @returns {boolean}
   */
  deepEquals(a, b) {
    if (a === b) return true;
    if (typeof a !== typeof b) return false;
    if (typeof a !== "object" || a === null || b === null) return false;
    if (Array.isArray(a) && Array.isArray(b)) {
      if (a.length !== b.length) return false;
      return a.every((val, i) => this.deepEquals(val, b[i]));
    }
    if (Array.isArray(a) || Array.isArray(b)) return false;
    const aObj = a;
    const bObj = b;
    const aKeys = Object.keys(aObj);
    const bKeys = Object.keys(bObj);
    if (aKeys.length !== bKeys.length) return false;
    // Require the key to exist on b as well: comparing values alone would
    // treat { a: undefined } and { b: undefined } as equal.
    return aKeys.every((key) => Object.hasOwn(bObj, key) && this.deepEquals(aObj[key], bObj[key]));
  }
};
|
|
3431
3915
|
var DEFAULT_COMPOSITE_AGGREGATOR_PROMPT = `Review the following evaluation results:
|
|
3432
3916
|
{{EVALUATOR_RESULTS_JSON}}
|
|
3433
3917
|
|
|
@@ -3851,7 +4335,7 @@ async function runEvaluation(options) {
|
|
|
3851
4335
|
if (!definition) {
|
|
3852
4336
|
return void 0;
|
|
3853
4337
|
}
|
|
3854
|
-
const resolved = resolveTargetDefinition(definition, envLookup);
|
|
4338
|
+
const resolved = resolveTargetDefinition(definition, envLookup, evalFilePath);
|
|
3855
4339
|
resolvedTargetsByName.set(name, resolved);
|
|
3856
4340
|
return resolved;
|
|
3857
4341
|
};
|
|
@@ -4165,6 +4649,17 @@ async function runEvalCase(options) {
|
|
|
4165
4649
|
if (cacheKey && cache && !cachedResponse) {
|
|
4166
4650
|
await cache.set(cacheKey, providerResponse);
|
|
4167
4651
|
}
|
|
4652
|
+
let candidateTrace = providerResponse.trace;
|
|
4653
|
+
if (!candidateTrace && providerResponse.traceRef) {
|
|
4654
|
+
try {
|
|
4655
|
+
const rawTrace = await readJsonFile(providerResponse.traceRef);
|
|
4656
|
+
if (Array.isArray(rawTrace) && rawTrace.every(isTraceEvent)) {
|
|
4657
|
+
candidateTrace = rawTrace;
|
|
4658
|
+
}
|
|
4659
|
+
} catch {
|
|
4660
|
+
}
|
|
4661
|
+
}
|
|
4662
|
+
const candidateTraceSummary = candidateTrace ? computeTraceSummary(candidateTrace) : void 0;
|
|
4168
4663
|
try {
|
|
4169
4664
|
return await evaluateCandidate({
|
|
4170
4665
|
evalCase,
|
|
@@ -4176,7 +4671,9 @@ async function runEvalCase(options) {
|
|
|
4176
4671
|
nowFn,
|
|
4177
4672
|
attempt,
|
|
4178
4673
|
judgeProvider,
|
|
4179
|
-
agentTimeoutMs
|
|
4674
|
+
agentTimeoutMs,
|
|
4675
|
+
candidateTrace,
|
|
4676
|
+
candidateTraceSummary
|
|
4180
4677
|
});
|
|
4181
4678
|
} catch (error) {
|
|
4182
4679
|
return buildErrorResult(evalCase, target.name, nowFn(), error, promptInputs, provider);
|
|
@@ -4193,7 +4690,9 @@ async function evaluateCandidate(options) {
|
|
|
4193
4690
|
nowFn,
|
|
4194
4691
|
attempt,
|
|
4195
4692
|
judgeProvider,
|
|
4196
|
-
agentTimeoutMs
|
|
4693
|
+
agentTimeoutMs,
|
|
4694
|
+
candidateTrace,
|
|
4695
|
+
candidateTraceSummary
|
|
4197
4696
|
} = options;
|
|
4198
4697
|
const gradeTimestamp = nowFn();
|
|
4199
4698
|
const { score, evaluatorResults } = await runEvaluatorsForCase({
|
|
@@ -4206,7 +4705,9 @@ async function evaluateCandidate(options) {
|
|
|
4206
4705
|
promptInputs,
|
|
4207
4706
|
now: gradeTimestamp,
|
|
4208
4707
|
judgeProvider,
|
|
4209
|
-
agentTimeoutMs
|
|
4708
|
+
agentTimeoutMs,
|
|
4709
|
+
candidateTrace,
|
|
4710
|
+
candidateTraceSummary
|
|
4210
4711
|
});
|
|
4211
4712
|
const completedAt = nowFn();
|
|
4212
4713
|
let agentProviderRequest;
|
|
@@ -4219,14 +4720,12 @@ async function evaluateCandidate(options) {
|
|
|
4219
4720
|
} else {
|
|
4220
4721
|
if (promptInputs.chatPrompt) {
|
|
4221
4722
|
lmProviderRequest = {
|
|
4222
|
-
chat_prompt: promptInputs.chatPrompt
|
|
4223
|
-
guideline_paths: evalCase.guideline_paths
|
|
4723
|
+
chat_prompt: promptInputs.chatPrompt
|
|
4224
4724
|
};
|
|
4225
4725
|
} else {
|
|
4226
4726
|
lmProviderRequest = {
|
|
4227
4727
|
question: promptInputs.question,
|
|
4228
|
-
guidelines: promptInputs.guidelines
|
|
4229
|
-
guideline_paths: evalCase.guideline_paths
|
|
4728
|
+
guidelines: promptInputs.guidelines
|
|
4230
4729
|
};
|
|
4231
4730
|
}
|
|
4232
4731
|
}
|
|
@@ -4245,7 +4744,8 @@ async function evaluateCandidate(options) {
|
|
|
4245
4744
|
agent_provider_request: agentProviderRequest,
|
|
4246
4745
|
lm_provider_request: lmProviderRequest,
|
|
4247
4746
|
evaluator_provider_request: evaluatorResults ? void 0 : score.evaluatorRawRequest,
|
|
4248
|
-
evaluator_results: evaluatorResults
|
|
4747
|
+
evaluator_results: evaluatorResults,
|
|
4748
|
+
trace_summary: candidateTraceSummary
|
|
4249
4749
|
};
|
|
4250
4750
|
}
|
|
4251
4751
|
async function runEvaluatorsForCase(options) {
|
|
@@ -4259,7 +4759,9 @@ async function runEvaluatorsForCase(options) {
|
|
|
4259
4759
|
promptInputs,
|
|
4260
4760
|
now,
|
|
4261
4761
|
judgeProvider,
|
|
4262
|
-
agentTimeoutMs
|
|
4762
|
+
agentTimeoutMs,
|
|
4763
|
+
candidateTrace,
|
|
4764
|
+
candidateTraceSummary
|
|
4263
4765
|
} = options;
|
|
4264
4766
|
if (evalCase.evaluators && evalCase.evaluators.length > 0) {
|
|
4265
4767
|
return runEvaluatorList({
|
|
@@ -4273,7 +4775,9 @@ async function runEvaluatorsForCase(options) {
|
|
|
4273
4775
|
promptInputs,
|
|
4274
4776
|
now,
|
|
4275
4777
|
judgeProvider,
|
|
4276
|
-
agentTimeoutMs
|
|
4778
|
+
agentTimeoutMs,
|
|
4779
|
+
candidateTrace,
|
|
4780
|
+
candidateTraceSummary
|
|
4277
4781
|
});
|
|
4278
4782
|
}
|
|
4279
4783
|
const evaluatorKind = evalCase.evaluator ?? "llm_judge";
|
|
@@ -4289,7 +4793,9 @@ async function runEvaluatorsForCase(options) {
|
|
|
4289
4793
|
attempt,
|
|
4290
4794
|
promptInputs,
|
|
4291
4795
|
now,
|
|
4292
|
-
judgeProvider
|
|
4796
|
+
judgeProvider,
|
|
4797
|
+
candidateTrace,
|
|
4798
|
+
candidateTraceSummary
|
|
4293
4799
|
});
|
|
4294
4800
|
return { score };
|
|
4295
4801
|
}
|
|
@@ -4305,7 +4811,9 @@ async function runEvaluatorList(options) {
|
|
|
4305
4811
|
promptInputs,
|
|
4306
4812
|
now,
|
|
4307
4813
|
judgeProvider,
|
|
4308
|
-
agentTimeoutMs
|
|
4814
|
+
agentTimeoutMs,
|
|
4815
|
+
candidateTrace,
|
|
4816
|
+
candidateTraceSummary
|
|
4309
4817
|
} = options;
|
|
4310
4818
|
const scored = [];
|
|
4311
4819
|
const evaluatorResults = [];
|
|
@@ -4324,11 +4832,13 @@ async function runEvaluatorList(options) {
|
|
|
4324
4832
|
now,
|
|
4325
4833
|
judgeProvider
|
|
4326
4834
|
});
|
|
4327
|
-
|
|
4835
|
+
const weight = evaluator.weight ?? 1;
|
|
4836
|
+
scored.push({ score: score2, name: evaluator.name, type: evaluator.type, weight });
|
|
4328
4837
|
evaluatorResults.push({
|
|
4329
4838
|
name: evaluator.name,
|
|
4330
4839
|
type: evaluator.type,
|
|
4331
4840
|
score: score2.score,
|
|
4841
|
+
weight,
|
|
4332
4842
|
verdict: score2.verdict,
|
|
4333
4843
|
hits: score2.hits,
|
|
4334
4844
|
misses: score2.misses,
|
|
@@ -4351,11 +4861,13 @@ async function runEvaluatorList(options) {
|
|
|
4351
4861
|
promptInputs,
|
|
4352
4862
|
now
|
|
4353
4863
|
});
|
|
4354
|
-
|
|
4864
|
+
const weight = evaluator.weight ?? 1;
|
|
4865
|
+
scored.push({ score: score2, name: evaluator.name, type: "code_judge", weight });
|
|
4355
4866
|
evaluatorResults.push({
|
|
4356
4867
|
name: evaluator.name,
|
|
4357
4868
|
type: "code_judge",
|
|
4358
4869
|
score: score2.score,
|
|
4870
|
+
weight,
|
|
4359
4871
|
verdict: score2.verdict,
|
|
4360
4872
|
hits: score2.hits,
|
|
4361
4873
|
misses: score2.misses,
|
|
@@ -4381,6 +4893,12 @@ async function runEvaluatorList(options) {
|
|
|
4381
4893
|
cwd: evalFileDir,
|
|
4382
4894
|
evaluatorFactory: { create: createEvaluator }
|
|
4383
4895
|
});
|
|
4896
|
+
case "tool_trajectory":
|
|
4897
|
+
return new ToolTrajectoryEvaluator({
|
|
4898
|
+
config: memberConfig
|
|
4899
|
+
});
|
|
4900
|
+
case "expected_messages":
|
|
4901
|
+
return new ExpectedMessagesEvaluator();
|
|
4384
4902
|
default: {
|
|
4385
4903
|
const unknownConfig = memberConfig;
|
|
4386
4904
|
throw new Error(`Unsupported evaluator type in composite: ${unknownConfig.type}`);
|
|
@@ -4402,11 +4920,13 @@ async function runEvaluatorList(options) {
|
|
|
4402
4920
|
now,
|
|
4403
4921
|
judgeProvider
|
|
4404
4922
|
});
|
|
4405
|
-
|
|
4923
|
+
const weight = evaluator.weight ?? 1;
|
|
4924
|
+
scored.push({ score: score2, name: evaluator.name, type: evaluator.type, weight });
|
|
4406
4925
|
evaluatorResults.push({
|
|
4407
4926
|
name: evaluator.name,
|
|
4408
4927
|
type: evaluator.type,
|
|
4409
4928
|
score: score2.score,
|
|
4929
|
+
weight,
|
|
4410
4930
|
verdict: score2.verdict,
|
|
4411
4931
|
hits: score2.hits,
|
|
4412
4932
|
misses: score2.misses,
|
|
@@ -4415,6 +4935,60 @@ async function runEvaluatorList(options) {
|
|
|
4415
4935
|
evaluator_results: mapChildResults(score2.evaluatorResults)
|
|
4416
4936
|
});
|
|
4417
4937
|
}
|
|
4938
|
+
if (evaluator.type === "tool_trajectory") {
|
|
4939
|
+
const trajectoryEvaluator = new ToolTrajectoryEvaluator({
|
|
4940
|
+
config: evaluator
|
|
4941
|
+
});
|
|
4942
|
+
const score2 = trajectoryEvaluator.evaluate({
|
|
4943
|
+
evalCase,
|
|
4944
|
+
candidate,
|
|
4945
|
+
target,
|
|
4946
|
+
provider,
|
|
4947
|
+
attempt,
|
|
4948
|
+
promptInputs,
|
|
4949
|
+
now,
|
|
4950
|
+
candidateTrace,
|
|
4951
|
+
candidateTraceSummary
|
|
4952
|
+
});
|
|
4953
|
+
const weight = evaluator.weight ?? 1;
|
|
4954
|
+
scored.push({ score: score2, name: evaluator.name, type: evaluator.type, weight });
|
|
4955
|
+
evaluatorResults.push({
|
|
4956
|
+
name: evaluator.name,
|
|
4957
|
+
type: evaluator.type,
|
|
4958
|
+
score: score2.score,
|
|
4959
|
+
weight,
|
|
4960
|
+
verdict: score2.verdict,
|
|
4961
|
+
hits: score2.hits,
|
|
4962
|
+
misses: score2.misses,
|
|
4963
|
+
reasoning: score2.reasoning
|
|
4964
|
+
});
|
|
4965
|
+
}
|
|
4966
|
+
if (evaluator.type === "expected_messages") {
|
|
4967
|
+
const expectedMessagesEvaluator = new ExpectedMessagesEvaluator();
|
|
4968
|
+
const score2 = expectedMessagesEvaluator.evaluate({
|
|
4969
|
+
evalCase,
|
|
4970
|
+
candidate,
|
|
4971
|
+
target,
|
|
4972
|
+
provider,
|
|
4973
|
+
attempt,
|
|
4974
|
+
promptInputs,
|
|
4975
|
+
now,
|
|
4976
|
+
candidateTrace,
|
|
4977
|
+
candidateTraceSummary
|
|
4978
|
+
});
|
|
4979
|
+
const weight = evaluator.weight ?? 1;
|
|
4980
|
+
scored.push({ score: score2, name: evaluator.name, type: evaluator.type, weight });
|
|
4981
|
+
evaluatorResults.push({
|
|
4982
|
+
name: evaluator.name,
|
|
4983
|
+
type: evaluator.type,
|
|
4984
|
+
score: score2.score,
|
|
4985
|
+
weight,
|
|
4986
|
+
verdict: score2.verdict,
|
|
4987
|
+
hits: score2.hits,
|
|
4988
|
+
misses: score2.misses,
|
|
4989
|
+
reasoning: score2.reasoning
|
|
4990
|
+
});
|
|
4991
|
+
}
|
|
4418
4992
|
} catch (error) {
|
|
4419
4993
|
const message = error instanceof Error ? error.message : String(error);
|
|
4420
4994
|
const fallbackScore = {
|
|
@@ -4426,15 +5000,18 @@ async function runEvaluatorList(options) {
|
|
|
4426
5000
|
reasoning: message
|
|
4427
5001
|
};
|
|
4428
5002
|
const resultType = evaluator.type === "code" ? "code_judge" : evaluator.type;
|
|
5003
|
+
const weight = evaluator.weight ?? 1;
|
|
4429
5004
|
scored.push({
|
|
4430
5005
|
score: fallbackScore,
|
|
4431
5006
|
name: evaluator.name ?? "unknown",
|
|
4432
|
-
type: resultType ?? "llm_judge"
|
|
5007
|
+
type: resultType ?? "llm_judge",
|
|
5008
|
+
weight
|
|
4433
5009
|
});
|
|
4434
5010
|
evaluatorResults.push({
|
|
4435
5011
|
name: evaluator.name ?? "unknown",
|
|
4436
5012
|
type: resultType ?? "llm_judge",
|
|
4437
5013
|
score: 0,
|
|
5014
|
+
weight,
|
|
4438
5015
|
verdict: "fail",
|
|
4439
5016
|
hits: [],
|
|
4440
5017
|
misses: [`Evaluator '${evaluator.name ?? "unknown"}' failed: ${message}`],
|
|
@@ -4442,7 +5019,9 @@ async function runEvaluatorList(options) {
|
|
|
4442
5019
|
});
|
|
4443
5020
|
}
|
|
4444
5021
|
}
|
|
4445
|
-
const aggregateScore = scored.length > 0 ?
|
|
5022
|
+
const aggregateScore = scored.length > 0 ? computeWeightedMean(
|
|
5023
|
+
scored.map((entry) => ({ score: entry.score.score, weight: entry.weight }))
|
|
5024
|
+
) : 0;
|
|
4446
5025
|
const hits = scored.flatMap((entry) => entry.score.hits);
|
|
4447
5026
|
const misses = scored.flatMap((entry) => entry.score.misses);
|
|
4448
5027
|
const expectedAspectCount = scored.reduce(
|
|
@@ -4668,6 +5247,16 @@ function mapChildResults(children) {
|
|
|
4668
5247
|
evaluator_results: mapChildResults(child.evaluatorResults)
|
|
4669
5248
|
}));
|
|
4670
5249
|
}
|
|
5250
|
+
/**
 * Computes the weighted arithmetic mean of a list of scored entries.
 *
 * @param {Array<{ score: number, weight?: number | null }>} entries
 *   Entries to aggregate. A missing (`null`/`undefined`) weight defaults to 1.
 * @returns {number} `sum(score * weight) / sum(weight)`, or 0 when the list
 *   is empty or the usable weights sum to 0.
 */
function computeWeightedMean(entries) {
  let totalWeight = 0;
  let weightedSum = 0;
  for (const entry of entries) {
    const weight = entry.weight ?? 1;
    // Skip weights that are not finite, non-negative numbers. Without this
    // guard a string weight turns `+=` into string concatenation, an
    // Infinity weight yields NaN, and a negative weight can push the mean
    // outside the range of the input scores.
    if (!Number.isFinite(weight) || weight < 0) {
      continue;
    }
    totalWeight += weight;
    weightedSum += entry.score * weight;
  }
  // Guard against division by zero (empty input or all-zero weights).
  return totalWeight > 0 ? weightedSum / totalWeight : 0;
}
|
|
4671
5260
|
|
|
4672
5261
|
// src/evaluation/generators/rubric-generator.ts
|
|
4673
5262
|
import { generateText as generateText3 } from "ai";
|
|
@@ -4756,11 +5345,14 @@ function createAgentKernel() {
|
|
|
4756
5345
|
export {
|
|
4757
5346
|
CodeEvaluator,
|
|
4758
5347
|
CompositeEvaluator,
|
|
5348
|
+
ExpectedMessagesEvaluator,
|
|
4759
5349
|
LlmJudgeEvaluator,
|
|
4760
5350
|
TEST_MESSAGE_ROLES,
|
|
5351
|
+
ToolTrajectoryEvaluator,
|
|
4761
5352
|
buildDirectoryChain,
|
|
4762
5353
|
buildPromptInputs,
|
|
4763
5354
|
buildSearchRoots,
|
|
5355
|
+
computeTraceSummary,
|
|
4764
5356
|
consumeCodexLogEntries,
|
|
4765
5357
|
createAgentKernel,
|
|
4766
5358
|
createProvider,
|
|
@@ -4771,14 +5363,18 @@ export {
|
|
|
4771
5363
|
generateRubrics,
|
|
4772
5364
|
getHitCount,
|
|
4773
5365
|
isEvaluatorKind,
|
|
5366
|
+
isExpectedToolCall,
|
|
4774
5367
|
isGuidelineFile,
|
|
4775
5368
|
isJsonObject,
|
|
4776
5369
|
isJsonValue,
|
|
4777
5370
|
isTestMessage,
|
|
4778
5371
|
isTestMessageRole,
|
|
5372
|
+
isTraceEvent,
|
|
5373
|
+
isTraceEventType,
|
|
4779
5374
|
listTargetNames,
|
|
4780
5375
|
loadEvalCases,
|
|
4781
5376
|
normalizeLineEndings,
|
|
5377
|
+
readJsonFile,
|
|
4782
5378
|
readTargetDefinitions,
|
|
4783
5379
|
readTestSuiteMetadata,
|
|
4784
5380
|
readTextFile,
|