@agentv/core 0.23.0 → 0.25.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/dist/{chunk-B2J23S7D.js → chunk-OYTL3LNN.js} +24 -16
- package/dist/chunk-OYTL3LNN.js.map +1 -0
- package/dist/evaluation/validation/index.cjs +64 -17
- package/dist/evaluation/validation/index.cjs.map +1 -1
- package/dist/evaluation/validation/index.js +48 -2
- package/dist/evaluation/validation/index.js.map +1 -1
- package/dist/index.cjs +596 -45
- package/dist/index.cjs.map +1 -1
- package/dist/index.d.cts +151 -4
- package/dist/index.d.ts +151 -4
- package/dist/index.js +555 -17
- package/dist/index.js.map +1 -1
- package/package.json +5 -2
- package/dist/chunk-B2J23S7D.js.map +0 -1
package/dist/index.js
CHANGED
|
@@ -5,10 +5,11 @@ import {
|
|
|
5
5
|
findGitRoot,
|
|
6
6
|
isAgentProvider,
|
|
7
7
|
normalizeLineEndings,
|
|
8
|
+
readJsonFile,
|
|
8
9
|
readTextFile,
|
|
9
10
|
resolveFileReference,
|
|
10
11
|
resolveTargetDefinition
|
|
11
|
-
} from "./chunk-
|
|
12
|
+
} from "./chunk-OYTL3LNN.js";
|
|
12
13
|
|
|
13
14
|
// src/evaluation/types.ts
|
|
14
15
|
var TEST_MESSAGE_ROLE_VALUES = ["system", "user", "assistant", "tool"];
|
|
@@ -51,7 +52,14 @@ function isTestMessage(value) {
|
|
|
51
52
|
}
|
|
52
53
|
return candidate.content.every(isJsonObject);
|
|
53
54
|
}
|
|
54
|
-
var EVALUATOR_KIND_VALUES = [
|
|
55
|
+
var EVALUATOR_KIND_VALUES = [
|
|
56
|
+
"code_judge",
|
|
57
|
+
"llm_judge",
|
|
58
|
+
"rubric",
|
|
59
|
+
"composite",
|
|
60
|
+
"tool_trajectory",
|
|
61
|
+
"expected_messages"
|
|
62
|
+
];
|
|
55
63
|
var EVALUATOR_KIND_SET = new Set(EVALUATOR_KIND_VALUES);
|
|
56
64
|
function isEvaluatorKind(value) {
|
|
57
65
|
return typeof value === "string" && EVALUATOR_KIND_SET.has(value);
|
|
@@ -60,6 +68,44 @@ function getHitCount(result) {
|
|
|
60
68
|
return result.hits.length;
|
|
61
69
|
}
|
|
62
70
|
|
|
71
|
+
// src/evaluation/trace.ts
|
|
72
|
+
/**
 * Reports whether `value` is one of the recognised trace event type names.
 *
 * @param {unknown} value - Candidate value to check.
 * @returns {boolean} True only for the strings "model_step", "tool_call",
 *   "tool_result", "message" and "error"; false for anything else,
 *   including non-strings.
 */
function isTraceEventType(value) {
  if (typeof value !== "string") {
    return false;
  }
  switch (value) {
    case "model_step":
    case "tool_call":
    case "tool_result":
    case "message":
    case "error":
      return true;
    default:
      return false;
  }
}
|
|
75
|
+
/**
 * Structural check for a trace event.
 *
 * A value qualifies when it is a non-null object whose `type` is accepted by
 * `isTraceEventType` and whose `timestamp` is a string. No other fields are
 * inspected.
 *
 * @param {unknown} value - Candidate value to check.
 * @returns {boolean} True when `value` has a valid `type` and a string `timestamp`.
 */
function isTraceEvent(value) {
  const isObjectLike = value !== null && typeof value === "object";
  if (!isObjectLike) {
    return false;
  }
  // `type` is validated first; `timestamp` is only looked at for a known type.
  if (!isTraceEventType(value.type)) {
    return false;
  }
  return typeof value.timestamp === "string";
}
|
|
82
|
+
/**
 * Structural check for an expected tool call entry.
 *
 * @param {unknown} value - Candidate value to check.
 * @returns {boolean} True when `value` is a non-null object whose `tool`
 *   property is a string; no other properties are inspected.
 */
function isExpectedToolCall(value) {
  return value !== null && typeof value === "object" && typeof value.tool === "string";
}
|
|
89
|
+
/**
 * Build an aggregate summary of a trace.
 *
 * Counts `tool_call` events per tool name (events with a falsy `name` are not
 * counted) and counts `error` events.
 *
 * @param {Array<{type: string, name?: string}>} trace - Trace events to summarise.
 * @returns {{eventCount: number, toolNames: string[], toolCallsByName: Record<string, number>, errorCount: number}}
 *   `eventCount` is `trace.length`; `toolNames` is the sorted list of distinct
 *   tool names; `toolCallsByName` maps each tool name to its call count;
 *   `errorCount` is the number of events whose type is "error".
 */
function computeTraceSummary(trace) {
  // Count in a Map rather than a plain object: with `{}` a tool named like an
  // Object.prototype member ("constructor", "toString", "__proto__") would read
  // the inherited value through `??` and yield a corrupted or lost count.
  const callCounts = new Map();
  let errorCount = 0;
  for (const event of trace) {
    if (event.type === "tool_call" && event.name) {
      // Normalise to a string so keys match what property-key coercion produced before.
      const toolName = String(event.name);
      callCounts.set(toolName, (callCounts.get(toolName) ?? 0) + 1);
    } else if (event.type === "error") {
      errorCount++;
    }
  }
  // Object.fromEntries defines own data properties, so even "__proto__" is kept as a key.
  const toolCallsByName = Object.fromEntries(callCounts);
  const toolNames = Object.keys(toolCallsByName).sort();
  return {
    eventCount: trace.length,
    toolNames,
    toolCallsByName,
    errorCount
  };
}
|
|
108
|
+
|
|
63
109
|
// src/evaluation/yaml-parser.ts
|
|
64
110
|
import { readFile as readFile5 } from "node:fs/promises";
|
|
65
111
|
import path6 from "node:path";
|
|
@@ -532,6 +578,75 @@ async function parseEvaluators(rawEvalCase, globalExecution, searchRoots, evalId
|
|
|
532
578
|
});
|
|
533
579
|
continue;
|
|
534
580
|
}
|
|
581
|
+
if (typeValue === "expected_messages") {
|
|
582
|
+
evaluators.push({
|
|
583
|
+
name,
|
|
584
|
+
type: "expected_messages"
|
|
585
|
+
});
|
|
586
|
+
continue;
|
|
587
|
+
}
|
|
588
|
+
if (typeValue === "tool_trajectory") {
|
|
589
|
+
const mode = asString2(rawEvaluator.mode);
|
|
590
|
+
if (mode !== "any_order" && mode !== "in_order" && mode !== "exact") {
|
|
591
|
+
logWarning2(
|
|
592
|
+
`Skipping tool_trajectory evaluator '${name}' in '${evalId}': invalid mode '${mode}' (must be any_order, in_order, or exact)`
|
|
593
|
+
);
|
|
594
|
+
continue;
|
|
595
|
+
}
|
|
596
|
+
const rawMinimums = rawEvaluator.minimums;
|
|
597
|
+
let minimums;
|
|
598
|
+
if (rawMinimums !== void 0) {
|
|
599
|
+
if (!isJsonObject2(rawMinimums)) {
|
|
600
|
+
logWarning2(
|
|
601
|
+
`Skipping tool_trajectory evaluator '${name}' in '${evalId}': minimums must be an object`
|
|
602
|
+
);
|
|
603
|
+
continue;
|
|
604
|
+
}
|
|
605
|
+
minimums = {};
|
|
606
|
+
for (const [toolName, count] of Object.entries(rawMinimums)) {
|
|
607
|
+
if (typeof count === "number" && count >= 0) {
|
|
608
|
+
minimums[toolName] = count;
|
|
609
|
+
}
|
|
610
|
+
}
|
|
611
|
+
}
|
|
612
|
+
const rawExpected = rawEvaluator.expected;
|
|
613
|
+
let expected;
|
|
614
|
+
if (rawExpected !== void 0) {
|
|
615
|
+
if (!Array.isArray(rawExpected)) {
|
|
616
|
+
logWarning2(
|
|
617
|
+
`Skipping tool_trajectory evaluator '${name}' in '${evalId}': expected must be an array`
|
|
618
|
+
);
|
|
619
|
+
continue;
|
|
620
|
+
}
|
|
621
|
+
expected = [];
|
|
622
|
+
for (const item of rawExpected) {
|
|
623
|
+
if (isJsonObject2(item) && typeof item.tool === "string") {
|
|
624
|
+
expected.push({ tool: item.tool });
|
|
625
|
+
}
|
|
626
|
+
}
|
|
627
|
+
}
|
|
628
|
+
if (mode === "any_order" && !minimums) {
|
|
629
|
+
logWarning2(
|
|
630
|
+
`Skipping tool_trajectory evaluator '${name}' in '${evalId}': any_order mode requires minimums`
|
|
631
|
+
);
|
|
632
|
+
continue;
|
|
633
|
+
}
|
|
634
|
+
if ((mode === "in_order" || mode === "exact") && !expected) {
|
|
635
|
+
logWarning2(
|
|
636
|
+
`Skipping tool_trajectory evaluator '${name}' in '${evalId}': ${mode} mode requires expected`
|
|
637
|
+
);
|
|
638
|
+
continue;
|
|
639
|
+
}
|
|
640
|
+
const config = {
|
|
641
|
+
name,
|
|
642
|
+
type: "tool_trajectory",
|
|
643
|
+
mode,
|
|
644
|
+
...minimums ? { minimums } : {},
|
|
645
|
+
...expected ? { expected } : {}
|
|
646
|
+
};
|
|
647
|
+
evaluators.push(config);
|
|
648
|
+
continue;
|
|
649
|
+
}
|
|
535
650
|
const prompt = asString2(rawEvaluator.prompt);
|
|
536
651
|
let promptPath;
|
|
537
652
|
if (prompt) {
|
|
@@ -785,6 +900,67 @@ ${detailBlock}${ANSI_RESET4}`);
|
|
|
785
900
|
console.warn(`${ANSI_YELLOW4}Warning: ${message}${ANSI_RESET4}`);
|
|
786
901
|
}
|
|
787
902
|
}
|
|
903
|
+
/**
 * Convert expected messages into output segments, inlining referenced files.
 *
 * For each message a segment carrying the message `role` is produced.
 * `tool_calls` are copied only for assistant messages that define them.
 * String content is kept verbatim. Array content is processed entry by entry:
 *   - entries that are not JSON objects are dropped;
 *   - `type: "file"` entries are resolved against `searchRoots`, read as UTF-8
 *     (CRLF normalised to LF) and emitted as `{ type, path, text, resolvedPath }`;
 *     entries without a value, that cannot be resolved, or that cannot be read
 *     are dropped (the latter two with a warning);
 *   - every other entry is cloned as-is.
 * Messages whose content is neither a string nor an array yield a segment
 * without a `content` field.
 *
 * @param {object} options
 * @param {Array<object>} options.messages - Expected messages to process.
 * @param {unknown} options.searchRoots - Passed through to the file reference resolver.
 * @param {boolean} [options.verbose] - When truthy, logs each resolved file.
 * @returns {Promise<Array<object>>} One segment per input message, in order.
 */
async function processExpectedMessages(options) {
  const { messages, searchRoots, verbose } = options;
  const segments = [];
  for (const message of messages) {
    const segment = {
      role: message.role
    };
    if (message.role === "assistant" && message.tool_calls !== void 0) {
      segment.tool_calls = message.tool_calls;
    }
    const content = message.content;
    if (typeof content === "string") {
      segment.content = content;
    } else if (Array.isArray(content)) {
      const processedContent = [];
      for (const rawSegment of content) {
        if (!isJsonObject(rawSegment)) {
          continue;
        }
        const segmentType = asString3(rawSegment.type);
        if (segmentType !== "file") {
          processedContent.push(cloneJsonObject(rawSegment));
          continue;
        }
        const rawValue = asString3(rawSegment.value);
        if (!rawValue) {
          continue;
        }
        const { displayPath, resolvedPath, attempted } = await resolveFileReference2(
          rawValue,
          searchRoots
        );
        if (!resolvedPath) {
          const attempts = attempted.length ? [" Tried:", ...attempted.map((candidate) => ` ${candidate}`)] : void 0;
          logWarning3(`File not found in expected_messages: ${displayPath}`, attempts);
          continue;
        }
        try {
          const fileContent = (await readFile3(resolvedPath, "utf8")).replace(/\r\n/g, "\n");
          processedContent.push({
            type: "file",
            path: displayPath,
            text: fileContent,
            resolvedPath: path4.resolve(resolvedPath)
          });
          if (verbose) {
            console.log(` [Expected Output File] Found: ${displayPath}`);
            console.log(` Resolved to: ${resolvedPath}`);
          }
        } catch (error) {
          // A thrown value is not guaranteed to be an Error; avoid printing "undefined".
          const reason = error instanceof Error ? error.message : String(error);
          logWarning3(
            `Could not read expected output file ${resolvedPath}: ${reason}`
          );
        }
      }
      segment.content = processedContent;
    }
    segments.push(segment);
  }
  return segments;
}
|
|
788
964
|
|
|
789
965
|
// src/evaluation/formatting/prompt-builder.ts
|
|
790
966
|
import { readFile as readFile4 } from "node:fs/promises";
|
|
@@ -1089,12 +1265,10 @@ async function loadEvalCases(evalFilePath, repoRoot, options) {
|
|
|
1089
1265
|
messageType: "input",
|
|
1090
1266
|
verbose
|
|
1091
1267
|
});
|
|
1092
|
-
const outputSegments = hasExpectedMessages ? await
|
|
1268
|
+
const outputSegments = hasExpectedMessages ? await processExpectedMessages({
|
|
1093
1269
|
messages: expectedMessages,
|
|
1094
1270
|
searchRoots,
|
|
1095
1271
|
repoRootPath,
|
|
1096
|
-
guidelinePatterns,
|
|
1097
|
-
messageType: "output",
|
|
1098
1272
|
verbose
|
|
1099
1273
|
}) : [];
|
|
1100
1274
|
const codeSnippets = extractCodeBlocks(inputSegments);
|
|
@@ -1618,9 +1792,11 @@ var CliProvider = class {
|
|
|
1618
1792
|
const message = detail ? `${detail} (exit code ${codeText})` : `CLI exited with code ${codeText}`;
|
|
1619
1793
|
throw new Error(message);
|
|
1620
1794
|
}
|
|
1621
|
-
const
|
|
1795
|
+
const responseContent = await this.readAndCleanupOutputFile(outputFilePath);
|
|
1796
|
+
const parsed = this.parseOutputContent(responseContent);
|
|
1622
1797
|
return {
|
|
1623
|
-
text:
|
|
1798
|
+
text: parsed.text,
|
|
1799
|
+
trace: parsed.trace,
|
|
1624
1800
|
raw: {
|
|
1625
1801
|
command: renderedCommand,
|
|
1626
1802
|
stderr: result.stderr,
|
|
@@ -1630,6 +1806,31 @@ var CliProvider = class {
|
|
|
1630
1806
|
}
|
|
1631
1807
|
};
|
|
1632
1808
|
}
|
|
1809
|
+
/**
|
|
1810
|
+
* Parse output content from CLI.
|
|
1811
|
+
* If the content is valid JSON with a 'text' field, extract text and optional trace.
|
|
1812
|
+
* Otherwise, treat the entire content as plain text.
|
|
1813
|
+
*/
|
|
1814
|
+
parseOutputContent(content) {
|
|
1815
|
+
try {
|
|
1816
|
+
const parsed = JSON.parse(content);
|
|
1817
|
+
if (typeof parsed === "object" && parsed !== null && "text" in parsed) {
|
|
1818
|
+
const obj = parsed;
|
|
1819
|
+
const text = typeof obj.text === "string" ? obj.text : String(obj.text);
|
|
1820
|
+
const trace = this.parseTrace(obj.trace);
|
|
1821
|
+
return { text, trace };
|
|
1822
|
+
}
|
|
1823
|
+
} catch {
|
|
1824
|
+
}
|
|
1825
|
+
return { text: content };
|
|
1826
|
+
}
|
|
1827
|
+
parseTrace(trace) {
|
|
1828
|
+
if (!Array.isArray(trace)) {
|
|
1829
|
+
return void 0;
|
|
1830
|
+
}
|
|
1831
|
+
const validEvents = trace.filter(isTraceEvent);
|
|
1832
|
+
return validEvents.length > 0 ? validEvents : void 0;
|
|
1833
|
+
}
|
|
1633
1834
|
async readAndCleanupOutputFile(filePath) {
|
|
1634
1835
|
try {
|
|
1635
1836
|
const content = await readTextFile(filePath);
|
|
@@ -2616,6 +2817,7 @@ var MockProvider = class {
|
|
|
2616
2817
|
delayMs;
|
|
2617
2818
|
delayMinMs;
|
|
2618
2819
|
delayMaxMs;
|
|
2820
|
+
trace;
|
|
2619
2821
|
constructor(targetName, config) {
|
|
2620
2822
|
this.id = `mock:${targetName}`;
|
|
2621
2823
|
this.targetName = targetName;
|
|
@@ -2623,6 +2825,7 @@ var MockProvider = class {
|
|
|
2623
2825
|
this.delayMs = config.delayMs ?? 0;
|
|
2624
2826
|
this.delayMinMs = config.delayMinMs ?? 0;
|
|
2625
2827
|
this.delayMaxMs = config.delayMaxMs ?? 0;
|
|
2828
|
+
this.trace = config.trace;
|
|
2626
2829
|
}
|
|
2627
2830
|
async invoke(request) {
|
|
2628
2831
|
const delay = this.calculateDelay();
|
|
@@ -2634,7 +2837,8 @@ var MockProvider = class {
|
|
|
2634
2837
|
raw: {
|
|
2635
2838
|
question: request.question,
|
|
2636
2839
|
guidelines: request.guidelines
|
|
2637
|
-
}
|
|
2840
|
+
},
|
|
2841
|
+
trace: this.trace
|
|
2638
2842
|
};
|
|
2639
2843
|
}
|
|
2640
2844
|
calculateDelay() {
|
|
@@ -3428,6 +3632,251 @@ function substituteVariables(template, variables) {
|
|
|
3428
3632
|
return variables[varName] ?? match;
|
|
3429
3633
|
});
|
|
3430
3634
|
}
|
|
3635
|
+
/**
 * Evaluator that scores a candidate's tool-call trace against a configured
 * expectation. Three modes are supported through `config.mode`:
 *   - "any_order": each tool in `config.minimums` must be called at least the
 *     given number of times;
 *   - "in_order": the tools in `config.expected` must appear in the trace in
 *     that relative order (other calls may be interleaved);
 *   - "exact": the trace's named tool calls must match `config.expected`
 *     position by position, with no missing or surplus calls.
 */
var ToolTrajectoryEvaluator = class {
  kind = "tool_trajectory";
  config;
  /**
   * @param {{config: object}} options - `options.config` is stored as-is.
   */
  constructor(options) {
    this.config = options.config;
  }
  /**
   * Score the candidate trace according to `config.mode`.
   *
   * @param {object} context - Must provide `candidateTrace` and
   *   `candidateTraceSummary`; if either is missing the result is a failing
   *   score of 0.
   * @returns {{score: number, verdict: string, hits: string[], misses: string[], expectedAspectCount: number}}
   */
  evaluate(context) {
    const { candidateTrace, candidateTraceSummary } = context;
    if (!candidateTrace || !candidateTraceSummary) {
      return {
        score: 0,
        verdict: "fail",
        hits: [],
        misses: ["No trace available for evaluation"],
        expectedAspectCount: 1
      };
    }
    switch (this.config.mode) {
      case "any_order":
        return this.evaluateAnyOrder(candidateTraceSummary);
      case "in_order":
        return this.evaluateInOrder(candidateTrace);
      case "exact":
        return this.evaluateExact(candidateTrace);
      default:
        return {
          score: 0,
          verdict: "fail",
          hits: [],
          misses: [`Unknown mode: ${this.config.mode}`],
          expectedAspectCount: 1
        };
    }
  }
  /**
   * Check per-tool minimum call counts against `summary.toolCallsByName`.
   * The score is the fraction of configured tools that meet their minimum.
   * With no minimums configured the result is a pass.
   */
  evaluateAnyOrder(summary) {
    const minimums = this.config.minimums ?? {};
    const toolNames = Object.keys(minimums);
    if (toolNames.length === 0) {
      return {
        score: 1,
        verdict: "pass",
        hits: ["No tool requirements specified"],
        misses: [],
        expectedAspectCount: 0
      };
    }
    const hits = [];
    const misses = [];
    for (const toolName of toolNames) {
      const required = minimums[toolName];
      const actual = summary.toolCallsByName[toolName] ?? 0;
      if (actual >= required) {
        hits.push(`${toolName}: called ${actual} times (required \u2265${required})`);
      } else {
        misses.push(`${toolName}: called ${actual} times (required \u2265${required})`);
      }
    }
    const score = hits.length / toolNames.length;
    return {
      score,
      verdict: scoreToVerdict(score),
      hits,
      misses,
      expectedAspectCount: toolNames.length
    };
  }
  /**
   * Check that the expected tools occur in order within the trace's named
   * tool calls. The trace is scanned once, left to right; each expected tool
   * consumes calls up to and including its first match, so unrelated calls
   * in between are tolerated. The score is matched / expected.
   */
  evaluateInOrder(trace) {
    const expected = this.config.expected ?? [];
    if (expected.length === 0) {
      return {
        score: 1,
        verdict: "pass",
        hits: ["No tool sequence specified"],
        misses: [],
        expectedAspectCount: 0
      };
    }
    const actualToolCalls = trace.filter((e) => e.type === "tool_call" && e.name);
    const hits = [];
    const misses = [];
    let actualIndex = 0;
    for (let i = 0; i < expected.length; i++) {
      const expectedTool = expected[i].tool;
      let found = false;
      while (actualIndex < actualToolCalls.length) {
        if (actualToolCalls[actualIndex].name === expectedTool) {
          hits.push(`Found ${expectedTool} at position ${actualIndex}`);
          actualIndex++;
          found = true;
          break;
        }
        actualIndex++;
      }
      if (!found) {
        misses.push(`Expected ${expectedTool} at position ${i}, not found in remaining trace`);
      }
    }
    const score = hits.length / expected.length;
    return {
      score,
      verdict: scoreToVerdict(score),
      hits,
      misses,
      expectedAspectCount: expected.length
    };
  }
  /**
   * Check that the trace's named tool calls match the expected sequence
   * position by position. A length mismatch is reported as a miss, and the
   * score is matched positions divided by the longer of the two sequences,
   * so surplus tool calls lower the score just as missing ones do.
   */
  evaluateExact(trace) {
    const expected = this.config.expected ?? [];
    if (expected.length === 0) {
      return {
        score: 1,
        verdict: "pass",
        hits: ["No tool sequence specified"],
        misses: [],
        expectedAspectCount: 0
      };
    }
    const actualToolCalls = trace.filter((e) => e.type === "tool_call" && e.name);
    const hits = [];
    const misses = [];
    if (actualToolCalls.length !== expected.length) {
      misses.push(`Expected ${expected.length} tool calls, got ${actualToolCalls.length}`);
    }
    const checkLength = Math.min(expected.length, actualToolCalls.length);
    for (let i = 0; i < checkLength; i++) {
      const expectedTool = expected[i].tool;
      const actualTool = actualToolCalls[i].name;
      if (actualTool === expectedTool) {
        hits.push(`Position ${i}: ${expectedTool} \u2713`);
      } else {
        misses.push(`Position ${i}: expected ${expectedTool}, got ${actualTool}`);
      }
    }
    for (let i = checkLength; i < expected.length; i++) {
      misses.push(`Position ${i}: expected ${expected[i].tool}, got nothing`);
    }
    // Divide by the longer sequence: dividing by expected.length alone let a
    // trace with extra calls score 1.0 even though a miss had been recorded.
    const comparedLength = Math.max(expected.length, actualToolCalls.length);
    const score = hits.length / comparedLength;
    return {
      score,
      verdict: scoreToVerdict(score),
      hits,
      misses,
      expectedAspectCount: expected.length
    };
  }
};
|
|
3781
|
+
/**
 * Evaluator that validates the candidate's tool-call trace against the
 * `tool_calls` declared on assistant entries of the eval case's
 * `expected_segments`. Calls are compared position by position: the tool
 * name must match and, when the expectation specifies `input`, the input
 * must be deeply equal.
 */
var ExpectedMessagesEvaluator = class {
  kind = "expected_messages";
  /**
   * @param {object} context - Reads `candidateTrace` and `evalCase.expected_segments`.
   * @returns {{score: number, verdict: string, hits: string[], misses: string[], expectedAspectCount: number}}
   *   A pass when no tool calls are expected; a fail when tool calls are
   *   expected but the trace is missing or empty; otherwise the positional
   *   comparison result.
   */
  evaluate(context) {
    const { candidateTrace, evalCase } = context;
    const expectedSegments = evalCase.expected_segments;
    const expectedToolCalls = this.extractExpectedToolCalls(expectedSegments);
    if (expectedToolCalls.length === 0) {
      return {
        score: 1,
        verdict: "pass",
        hits: ["No tool_calls specified in expected_messages"],
        misses: [],
        expectedAspectCount: 1
      };
    }
    if (!candidateTrace || candidateTrace.length === 0) {
      return {
        score: 0,
        verdict: "fail",
        hits: [],
        misses: ["No trace available to validate tool_calls"],
        expectedAspectCount: expectedToolCalls.length
      };
    }
    const actualToolCalls = candidateTrace.filter((e) => e.type === "tool_call");
    return this.validateToolCalls(expectedToolCalls, actualToolCalls);
  }
  /**
   * Collect `{ tool, input }` pairs from the `tool_calls` arrays of assistant
   * segments, in order. Entries without a string `tool` are ignored.
   *
   * @param {Array<object>|undefined} segments
   * @returns {Array<{tool: string, input: unknown}>}
   */
  extractExpectedToolCalls(segments) {
    if (!segments) {
      return [];
    }
    const toolCalls = [];
    for (const segment of segments) {
      const role = segment.role;
      const segmentToolCalls = segment.tool_calls;
      if (role === "assistant" && Array.isArray(segmentToolCalls)) {
        for (const tc of segmentToolCalls) {
          if (typeof tc === "object" && tc !== null && typeof tc.tool === "string") {
            const toolCall = tc;
            toolCalls.push({ tool: toolCall.tool, input: toolCall.input });
          }
        }
      }
    }
    return toolCalls;
  }
  /**
   * Compare expected and actual tool calls index by index.
   * The score is matched / expected; the verdict is "pass" at >= 0.8,
   * "borderline" at >= 0.6, otherwise "fail".
   */
  validateToolCalls(expected, actual) {
    const hits = [];
    const misses = [];
    for (let i = 0; i < expected.length; i++) {
      const expectedCall = expected[i];
      const actualCall = actual[i];
      if (!actualCall) {
        misses.push(
          `tool_calls[${i}]: expected ${expectedCall.tool}, but no more tool calls in trace`
        );
        continue;
      }
      if (actualCall.name !== expectedCall.tool) {
        misses.push(
          `tool_calls[${i}]: expected ${expectedCall.tool}, got ${actualCall.name ?? "unknown"}`
        );
        continue;
      }
      // Input is only compared when the expectation declares one.
      if (expectedCall.input !== void 0) {
        if (!this.deepEquals(expectedCall.input, actualCall.input)) {
          misses.push(`tool_calls[${i}]: ${expectedCall.tool} input mismatch`);
          continue;
        }
      }
      hits.push(`tool_calls[${i}]: ${expectedCall.tool} matched`);
    }
    // Guard against division by zero if called with an empty expectation.
    const totalChecks = expected.length || 1;
    const score = hits.length / totalChecks;
    return {
      score,
      verdict: score >= 0.8 ? "pass" : score >= 0.6 ? "borderline" : "fail",
      hits,
      misses,
      expectedAspectCount: totalChecks
    };
  }
  /**
   * Structural equality for JSON-like values (primitives, arrays, plain objects).
   *
   * @param {unknown} a
   * @param {unknown} b
   * @returns {boolean} True when both values have the same shape and contents.
   */
  deepEquals(a, b) {
    if (a === b) return true;
    if (typeof a !== typeof b) return false;
    if (typeof a !== "object" || a === null || b === null) return false;
    if (Array.isArray(a) && Array.isArray(b)) {
      if (a.length !== b.length) return false;
      return a.every((val, i) => this.deepEquals(val, b[i]));
    }
    if (Array.isArray(a) || Array.isArray(b)) return false;
    const aObj = a;
    const bObj = b;
    const aKeys = Object.keys(aObj);
    const bKeys = Object.keys(bObj);
    if (aKeys.length !== bKeys.length) return false;
    // Require the key to exist on b: without this, {a: undefined} and
    // {b: undefined} compared equal because the missing key reads as undefined.
    return aKeys.every(
      (key) => Object.prototype.hasOwnProperty.call(bObj, key) && this.deepEquals(aObj[key], bObj[key])
    );
  }
};
|
|
3431
3880
|
var DEFAULT_COMPOSITE_AGGREGATOR_PROMPT = `Review the following evaluation results:
|
|
3432
3881
|
{{EVALUATOR_RESULTS_JSON}}
|
|
3433
3882
|
|
|
@@ -3851,7 +4300,7 @@ async function runEvaluation(options) {
|
|
|
3851
4300
|
if (!definition) {
|
|
3852
4301
|
return void 0;
|
|
3853
4302
|
}
|
|
3854
|
-
const resolved = resolveTargetDefinition(definition, envLookup);
|
|
4303
|
+
const resolved = resolveTargetDefinition(definition, envLookup, evalFilePath);
|
|
3855
4304
|
resolvedTargetsByName.set(name, resolved);
|
|
3856
4305
|
return resolved;
|
|
3857
4306
|
};
|
|
@@ -4165,6 +4614,17 @@ async function runEvalCase(options) {
|
|
|
4165
4614
|
if (cacheKey && cache && !cachedResponse) {
|
|
4166
4615
|
await cache.set(cacheKey, providerResponse);
|
|
4167
4616
|
}
|
|
4617
|
+
let candidateTrace = providerResponse.trace;
|
|
4618
|
+
if (!candidateTrace && providerResponse.traceRef) {
|
|
4619
|
+
try {
|
|
4620
|
+
const rawTrace = await readJsonFile(providerResponse.traceRef);
|
|
4621
|
+
if (Array.isArray(rawTrace) && rawTrace.every(isTraceEvent)) {
|
|
4622
|
+
candidateTrace = rawTrace;
|
|
4623
|
+
}
|
|
4624
|
+
} catch {
|
|
4625
|
+
}
|
|
4626
|
+
}
|
|
4627
|
+
const candidateTraceSummary = candidateTrace ? computeTraceSummary(candidateTrace) : void 0;
|
|
4168
4628
|
try {
|
|
4169
4629
|
return await evaluateCandidate({
|
|
4170
4630
|
evalCase,
|
|
@@ -4176,7 +4636,9 @@ async function runEvalCase(options) {
|
|
|
4176
4636
|
nowFn,
|
|
4177
4637
|
attempt,
|
|
4178
4638
|
judgeProvider,
|
|
4179
|
-
agentTimeoutMs
|
|
4639
|
+
agentTimeoutMs,
|
|
4640
|
+
candidateTrace,
|
|
4641
|
+
candidateTraceSummary
|
|
4180
4642
|
});
|
|
4181
4643
|
} catch (error) {
|
|
4182
4644
|
return buildErrorResult(evalCase, target.name, nowFn(), error, promptInputs, provider);
|
|
@@ -4193,7 +4655,9 @@ async function evaluateCandidate(options) {
|
|
|
4193
4655
|
nowFn,
|
|
4194
4656
|
attempt,
|
|
4195
4657
|
judgeProvider,
|
|
4196
|
-
agentTimeoutMs
|
|
4658
|
+
agentTimeoutMs,
|
|
4659
|
+
candidateTrace,
|
|
4660
|
+
candidateTraceSummary
|
|
4197
4661
|
} = options;
|
|
4198
4662
|
const gradeTimestamp = nowFn();
|
|
4199
4663
|
const { score, evaluatorResults } = await runEvaluatorsForCase({
|
|
@@ -4206,7 +4670,9 @@ async function evaluateCandidate(options) {
|
|
|
4206
4670
|
promptInputs,
|
|
4207
4671
|
now: gradeTimestamp,
|
|
4208
4672
|
judgeProvider,
|
|
4209
|
-
agentTimeoutMs
|
|
4673
|
+
agentTimeoutMs,
|
|
4674
|
+
candidateTrace,
|
|
4675
|
+
candidateTraceSummary
|
|
4210
4676
|
});
|
|
4211
4677
|
const completedAt = nowFn();
|
|
4212
4678
|
let agentProviderRequest;
|
|
@@ -4245,7 +4711,8 @@ async function evaluateCandidate(options) {
|
|
|
4245
4711
|
agent_provider_request: agentProviderRequest,
|
|
4246
4712
|
lm_provider_request: lmProviderRequest,
|
|
4247
4713
|
evaluator_provider_request: evaluatorResults ? void 0 : score.evaluatorRawRequest,
|
|
4248
|
-
evaluator_results: evaluatorResults
|
|
4714
|
+
evaluator_results: evaluatorResults,
|
|
4715
|
+
trace_summary: candidateTraceSummary
|
|
4249
4716
|
};
|
|
4250
4717
|
}
|
|
4251
4718
|
async function runEvaluatorsForCase(options) {
|
|
@@ -4259,7 +4726,9 @@ async function runEvaluatorsForCase(options) {
|
|
|
4259
4726
|
promptInputs,
|
|
4260
4727
|
now,
|
|
4261
4728
|
judgeProvider,
|
|
4262
|
-
agentTimeoutMs
|
|
4729
|
+
agentTimeoutMs,
|
|
4730
|
+
candidateTrace,
|
|
4731
|
+
candidateTraceSummary
|
|
4263
4732
|
} = options;
|
|
4264
4733
|
if (evalCase.evaluators && evalCase.evaluators.length > 0) {
|
|
4265
4734
|
return runEvaluatorList({
|
|
@@ -4273,7 +4742,9 @@ async function runEvaluatorsForCase(options) {
|
|
|
4273
4742
|
promptInputs,
|
|
4274
4743
|
now,
|
|
4275
4744
|
judgeProvider,
|
|
4276
|
-
agentTimeoutMs
|
|
4745
|
+
agentTimeoutMs,
|
|
4746
|
+
candidateTrace,
|
|
4747
|
+
candidateTraceSummary
|
|
4277
4748
|
});
|
|
4278
4749
|
}
|
|
4279
4750
|
const evaluatorKind = evalCase.evaluator ?? "llm_judge";
|
|
@@ -4289,7 +4760,9 @@ async function runEvaluatorsForCase(options) {
|
|
|
4289
4760
|
attempt,
|
|
4290
4761
|
promptInputs,
|
|
4291
4762
|
now,
|
|
4292
|
-
judgeProvider
|
|
4763
|
+
judgeProvider,
|
|
4764
|
+
candidateTrace,
|
|
4765
|
+
candidateTraceSummary
|
|
4293
4766
|
});
|
|
4294
4767
|
return { score };
|
|
4295
4768
|
}
|
|
@@ -4305,7 +4778,9 @@ async function runEvaluatorList(options) {
|
|
|
4305
4778
|
promptInputs,
|
|
4306
4779
|
now,
|
|
4307
4780
|
judgeProvider,
|
|
4308
|
-
agentTimeoutMs
|
|
4781
|
+
agentTimeoutMs,
|
|
4782
|
+
candidateTrace,
|
|
4783
|
+
candidateTraceSummary
|
|
4309
4784
|
} = options;
|
|
4310
4785
|
const scored = [];
|
|
4311
4786
|
const evaluatorResults = [];
|
|
@@ -4381,6 +4856,12 @@ async function runEvaluatorList(options) {
|
|
|
4381
4856
|
cwd: evalFileDir,
|
|
4382
4857
|
evaluatorFactory: { create: createEvaluator }
|
|
4383
4858
|
});
|
|
4859
|
+
case "tool_trajectory":
|
|
4860
|
+
return new ToolTrajectoryEvaluator({
|
|
4861
|
+
config: memberConfig
|
|
4862
|
+
});
|
|
4863
|
+
case "expected_messages":
|
|
4864
|
+
return new ExpectedMessagesEvaluator();
|
|
4384
4865
|
default: {
|
|
4385
4866
|
const unknownConfig = memberConfig;
|
|
4386
4867
|
throw new Error(`Unsupported evaluator type in composite: ${unknownConfig.type}`);
|
|
@@ -4415,6 +4896,56 @@ async function runEvaluatorList(options) {
|
|
|
4415
4896
|
evaluator_results: mapChildResults(score2.evaluatorResults)
|
|
4416
4897
|
});
|
|
4417
4898
|
}
|
|
4899
|
+
if (evaluator.type === "tool_trajectory") {
|
|
4900
|
+
const trajectoryEvaluator = new ToolTrajectoryEvaluator({
|
|
4901
|
+
config: evaluator
|
|
4902
|
+
});
|
|
4903
|
+
const score2 = trajectoryEvaluator.evaluate({
|
|
4904
|
+
evalCase,
|
|
4905
|
+
candidate,
|
|
4906
|
+
target,
|
|
4907
|
+
provider,
|
|
4908
|
+
attempt,
|
|
4909
|
+
promptInputs,
|
|
4910
|
+
now,
|
|
4911
|
+
candidateTrace,
|
|
4912
|
+
candidateTraceSummary
|
|
4913
|
+
});
|
|
4914
|
+
scored.push({ score: score2, name: evaluator.name, type: evaluator.type });
|
|
4915
|
+
evaluatorResults.push({
|
|
4916
|
+
name: evaluator.name,
|
|
4917
|
+
type: evaluator.type,
|
|
4918
|
+
score: score2.score,
|
|
4919
|
+
verdict: score2.verdict,
|
|
4920
|
+
hits: score2.hits,
|
|
4921
|
+
misses: score2.misses,
|
|
4922
|
+
reasoning: score2.reasoning
|
|
4923
|
+
});
|
|
4924
|
+
}
|
|
4925
|
+
if (evaluator.type === "expected_messages") {
|
|
4926
|
+
const expectedMessagesEvaluator = new ExpectedMessagesEvaluator();
|
|
4927
|
+
const score2 = expectedMessagesEvaluator.evaluate({
|
|
4928
|
+
evalCase,
|
|
4929
|
+
candidate,
|
|
4930
|
+
target,
|
|
4931
|
+
provider,
|
|
4932
|
+
attempt,
|
|
4933
|
+
promptInputs,
|
|
4934
|
+
now,
|
|
4935
|
+
candidateTrace,
|
|
4936
|
+
candidateTraceSummary
|
|
4937
|
+
});
|
|
4938
|
+
scored.push({ score: score2, name: evaluator.name, type: evaluator.type });
|
|
4939
|
+
evaluatorResults.push({
|
|
4940
|
+
name: evaluator.name,
|
|
4941
|
+
type: evaluator.type,
|
|
4942
|
+
score: score2.score,
|
|
4943
|
+
verdict: score2.verdict,
|
|
4944
|
+
hits: score2.hits,
|
|
4945
|
+
misses: score2.misses,
|
|
4946
|
+
reasoning: score2.reasoning
|
|
4947
|
+
});
|
|
4948
|
+
}
|
|
4418
4949
|
} catch (error) {
|
|
4419
4950
|
const message = error instanceof Error ? error.message : String(error);
|
|
4420
4951
|
const fallbackScore = {
|
|
@@ -4756,11 +5287,14 @@ function createAgentKernel() {
|
|
|
4756
5287
|
export {
|
|
4757
5288
|
CodeEvaluator,
|
|
4758
5289
|
CompositeEvaluator,
|
|
5290
|
+
ExpectedMessagesEvaluator,
|
|
4759
5291
|
LlmJudgeEvaluator,
|
|
4760
5292
|
TEST_MESSAGE_ROLES,
|
|
5293
|
+
ToolTrajectoryEvaluator,
|
|
4761
5294
|
buildDirectoryChain,
|
|
4762
5295
|
buildPromptInputs,
|
|
4763
5296
|
buildSearchRoots,
|
|
5297
|
+
computeTraceSummary,
|
|
4764
5298
|
consumeCodexLogEntries,
|
|
4765
5299
|
createAgentKernel,
|
|
4766
5300
|
createProvider,
|
|
@@ -4771,14 +5305,18 @@ export {
|
|
|
4771
5305
|
generateRubrics,
|
|
4772
5306
|
getHitCount,
|
|
4773
5307
|
isEvaluatorKind,
|
|
5308
|
+
isExpectedToolCall,
|
|
4774
5309
|
isGuidelineFile,
|
|
4775
5310
|
isJsonObject,
|
|
4776
5311
|
isJsonValue,
|
|
4777
5312
|
isTestMessage,
|
|
4778
5313
|
isTestMessageRole,
|
|
5314
|
+
isTraceEvent,
|
|
5315
|
+
isTraceEventType,
|
|
4779
5316
|
listTargetNames,
|
|
4780
5317
|
loadEvalCases,
|
|
4781
5318
|
normalizeLineEndings,
|
|
5319
|
+
readJsonFile,
|
|
4782
5320
|
readTargetDefinitions,
|
|
4783
5321
|
readTestSuiteMetadata,
|
|
4784
5322
|
readTextFile,
|