@agentv/core 0.22.2 → 0.25.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/dist/{chunk-B2J23S7D.js → chunk-OYTL3LNN.js} +24 -16
- package/dist/chunk-OYTL3LNN.js.map +1 -0
- package/dist/evaluation/validation/index.cjs +64 -17
- package/dist/evaluation/validation/index.cjs.map +1 -1
- package/dist/evaluation/validation/index.js +48 -2
- package/dist/evaluation/validation/index.js.map +1 -1
- package/dist/index.cjs +994 -50
- package/dist/index.cjs.map +1 -1
- package/dist/index.d.cts +205 -4
- package/dist/index.d.ts +205 -4
- package/dist/index.js +953 -23
- package/dist/index.js.map +1 -1
- package/package.json +3 -4
- package/dist/chunk-B2J23S7D.js.map +0 -1
package/dist/index.js
CHANGED
|
@@ -5,10 +5,11 @@ import {
|
|
|
5
5
|
findGitRoot,
|
|
6
6
|
isAgentProvider,
|
|
7
7
|
normalizeLineEndings,
|
|
8
|
+
readJsonFile,
|
|
8
9
|
readTextFile,
|
|
9
10
|
resolveFileReference,
|
|
10
11
|
resolveTargetDefinition
|
|
11
|
-
} from "./chunk-
|
|
12
|
+
} from "./chunk-OYTL3LNN.js";
|
|
12
13
|
|
|
13
14
|
// src/evaluation/types.ts
|
|
14
15
|
var TEST_MESSAGE_ROLE_VALUES = ["system", "user", "assistant", "tool"];
|
|
@@ -51,7 +52,14 @@ function isTestMessage(value) {
|
|
|
51
52
|
}
|
|
52
53
|
return candidate.content.every(isJsonObject);
|
|
53
54
|
}
|
|
54
|
-
var EVALUATOR_KIND_VALUES = [
|
|
55
|
+
var EVALUATOR_KIND_VALUES = [
|
|
56
|
+
"code_judge",
|
|
57
|
+
"llm_judge",
|
|
58
|
+
"rubric",
|
|
59
|
+
"composite",
|
|
60
|
+
"tool_trajectory",
|
|
61
|
+
"expected_messages"
|
|
62
|
+
];
|
|
55
63
|
var EVALUATOR_KIND_SET = new Set(EVALUATOR_KIND_VALUES);
|
|
56
64
|
function isEvaluatorKind(value) {
|
|
57
65
|
return typeof value === "string" && EVALUATOR_KIND_SET.has(value);
|
|
@@ -60,6 +68,44 @@ function getHitCount(result) {
|
|
|
60
68
|
return result.hits.length;
|
|
61
69
|
}
|
|
62
70
|
|
|
71
|
+
// src/evaluation/trace.ts
|
|
72
|
+
/**
 * Type guard for trace event type names.
 *
 * @param {unknown} value - Candidate value to test.
 * @returns {boolean} `true` only when `value` is one of the primitive strings
 *   "model_step", "tool_call", "tool_result", "message" or "error".
 */
function isTraceEventType(value) {
  // `switch` compares with strict equality, so only primitive strings match.
  switch (value) {
    case "model_step":
    case "tool_call":
    case "tool_result":
    case "message":
    case "error":
      return true;
    default:
      return false;
  }
}
|
|
75
|
+
/**
 * Type guard for a single trace event.
 *
 * @param {unknown} value - Candidate value to test.
 * @returns {boolean} `true` when `value` is a non-null object whose `type`
 *   passes `isTraceEventType` and whose `timestamp` is a string.
 */
function isTraceEvent(value) {
  if (value === null || typeof value !== "object") {
    return false;
  }
  // The timestamp is only inspected once the type has been accepted.
  if (!isTraceEventType(value.type)) {
    return false;
  }
  return typeof value.timestamp === "string";
}
|
|
82
|
+
/**
 * Type guard for an expected tool call entry.
 *
 * @param {unknown} value - Candidate value to test.
 * @returns {boolean} `true` when `value` is a non-null object with a string `tool` property.
 */
function isExpectedToolCall(value) {
  return value !== null && typeof value === "object" && typeof value.tool === "string";
}
|
|
89
|
+
/**
 * Build an aggregate summary of a trace.
 *
 * @param {Array<{type: string, name?: string}>} trace - Trace events to summarize.
 * @returns {{eventCount: number, toolNames: string[], toolCallsByName: Object<string, number>, errorCount: number}}
 *   `eventCount` is the number of events, `toolCallsByName` maps each tool name
 *   to the number of `tool_call` events carrying that name, `toolNames` is the
 *   sorted list of those names, and `errorCount` counts `error` events.
 */
function computeTraceSummary(trace) {
  // Count in a Map rather than a plain object: with `{}` a tool called
  // "constructor" or "toString" would start from the inherited
  // Object.prototype member instead of 0, and "__proto__" would be dropped.
  const counts = new Map();
  let errorCount = 0;
  for (const event of trace) {
    if (event.type === "tool_call" && event.name) {
      const toolName = String(event.name);
      counts.set(toolName, (counts.get(toolName) ?? 0) + 1);
    }
    if (event.type === "error") {
      errorCount++;
    }
  }
  return {
    eventCount: trace.length,
    toolNames: [...counts.keys()].sort(),
    // Object.fromEntries defines own data properties, so every name
    // (including "__proto__") ends up as a regular key.
    toolCallsByName: Object.fromEntries(counts),
    errorCount
  };
}
|
|
108
|
+
|
|
63
109
|
// src/evaluation/yaml-parser.ts
|
|
64
110
|
import { readFile as readFile5 } from "node:fs/promises";
|
|
65
111
|
import path6 from "node:path";
|
|
@@ -403,10 +449,10 @@ async function parseEvaluators(rawEvalCase, globalExecution, searchRoots, evalId
|
|
|
403
449
|
logWarning2(`Skipping evaluator with invalid name/type in '${evalId}'`);
|
|
404
450
|
continue;
|
|
405
451
|
}
|
|
406
|
-
if (typeValue === "
|
|
452
|
+
if (typeValue === "code_judge") {
|
|
407
453
|
const script = asString2(rawEvaluator.script);
|
|
408
454
|
if (!script) {
|
|
409
|
-
logWarning2(`Skipping
|
|
455
|
+
logWarning2(`Skipping code_judge evaluator '${name}' in '${evalId}': missing script`);
|
|
410
456
|
continue;
|
|
411
457
|
}
|
|
412
458
|
const cwd = asString2(rawEvaluator.cwd);
|
|
@@ -417,7 +463,7 @@ async function parseEvaluators(rawEvalCase, globalExecution, searchRoots, evalId
|
|
|
417
463
|
resolvedCwd = path3.resolve(resolved.resolvedPath);
|
|
418
464
|
} else {
|
|
419
465
|
logWarning2(
|
|
420
|
-
`
|
|
466
|
+
`Code_judge evaluator '${name}' in '${evalId}': cwd not found (${resolved.displayPath})`,
|
|
421
467
|
resolved.attempted.length > 0 ? resolved.attempted.map((attempt) => ` Tried: ${attempt}`) : void 0
|
|
422
468
|
);
|
|
423
469
|
}
|
|
@@ -433,6 +479,174 @@ async function parseEvaluators(rawEvalCase, globalExecution, searchRoots, evalId
|
|
|
433
479
|
});
|
|
434
480
|
continue;
|
|
435
481
|
}
|
|
482
|
+
if (typeValue === "composite") {
|
|
483
|
+
const rawMembers = rawEvaluator.evaluators;
|
|
484
|
+
if (!Array.isArray(rawMembers)) {
|
|
485
|
+
logWarning2(
|
|
486
|
+
`Skipping composite evaluator '${name}' in '${evalId}': missing evaluators array`
|
|
487
|
+
);
|
|
488
|
+
continue;
|
|
489
|
+
}
|
|
490
|
+
const rawAggregator = rawEvaluator.aggregator;
|
|
491
|
+
if (!isJsonObject2(rawAggregator)) {
|
|
492
|
+
logWarning2(`Skipping composite evaluator '${name}' in '${evalId}': missing aggregator`);
|
|
493
|
+
continue;
|
|
494
|
+
}
|
|
495
|
+
const aggregatorType = asString2(rawAggregator.type);
|
|
496
|
+
if (aggregatorType !== "weighted_average" && aggregatorType !== "code_judge" && aggregatorType !== "llm_judge") {
|
|
497
|
+
logWarning2(
|
|
498
|
+
`Skipping composite evaluator '${name}' in '${evalId}': invalid aggregator type '${aggregatorType}'`
|
|
499
|
+
);
|
|
500
|
+
continue;
|
|
501
|
+
}
|
|
502
|
+
const memberEvaluators = [];
|
|
503
|
+
for (const rawMember of rawMembers) {
|
|
504
|
+
if (!isJsonObject2(rawMember)) {
|
|
505
|
+
logWarning2(`Skipping invalid member evaluator in composite '${name}' (expected object)`);
|
|
506
|
+
continue;
|
|
507
|
+
}
|
|
508
|
+
const memberName = asString2(rawMember.name);
|
|
509
|
+
const memberType = rawMember.type;
|
|
510
|
+
if (!memberName || !isEvaluatorKind(memberType)) {
|
|
511
|
+
logWarning2(`Skipping member evaluator with invalid name/type in composite '${name}'`);
|
|
512
|
+
continue;
|
|
513
|
+
}
|
|
514
|
+
const memberConfigs = await parseEvaluators(
|
|
515
|
+
{ evaluators: [rawMember] },
|
|
516
|
+
void 0,
|
|
517
|
+
searchRoots,
|
|
518
|
+
`${evalId}:${name}:${memberName}`
|
|
519
|
+
);
|
|
520
|
+
if (memberConfigs && memberConfigs.length > 0) {
|
|
521
|
+
memberEvaluators.push(memberConfigs[0]);
|
|
522
|
+
}
|
|
523
|
+
}
|
|
524
|
+
if (memberEvaluators.length === 0) {
|
|
525
|
+
logWarning2(
|
|
526
|
+
`Skipping composite evaluator '${name}' in '${evalId}': no valid member evaluators`
|
|
527
|
+
);
|
|
528
|
+
continue;
|
|
529
|
+
}
|
|
530
|
+
let aggregator;
|
|
531
|
+
if (aggregatorType === "weighted_average") {
|
|
532
|
+
const weights = isJsonObject2(rawAggregator.weights) ? rawAggregator.weights : void 0;
|
|
533
|
+
const parsedWeights = {};
|
|
534
|
+
if (weights) {
|
|
535
|
+
for (const [key, value] of Object.entries(weights)) {
|
|
536
|
+
if (typeof value === "number") {
|
|
537
|
+
parsedWeights[key] = value;
|
|
538
|
+
}
|
|
539
|
+
}
|
|
540
|
+
}
|
|
541
|
+
aggregator = {
|
|
542
|
+
type: "weighted_average",
|
|
543
|
+
...Object.keys(parsedWeights).length > 0 ? { weights: parsedWeights } : {}
|
|
544
|
+
};
|
|
545
|
+
} else if (aggregatorType === "code_judge") {
|
|
546
|
+
const aggregatorPath = asString2(rawAggregator.path);
|
|
547
|
+
if (!aggregatorPath) {
|
|
548
|
+
logWarning2(
|
|
549
|
+
`Skipping composite evaluator '${name}' in '${evalId}': code_judge aggregator missing path`
|
|
550
|
+
);
|
|
551
|
+
continue;
|
|
552
|
+
}
|
|
553
|
+
aggregator = {
|
|
554
|
+
type: "code_judge",
|
|
555
|
+
path: aggregatorPath,
|
|
556
|
+
cwd: searchRoots[0]
|
|
557
|
+
};
|
|
558
|
+
} else {
|
|
559
|
+
const aggregatorPrompt = asString2(rawAggregator.prompt);
|
|
560
|
+
let promptPath2;
|
|
561
|
+
if (aggregatorPrompt) {
|
|
562
|
+
const resolved = await resolveFileReference2(aggregatorPrompt, searchRoots);
|
|
563
|
+
if (resolved.resolvedPath) {
|
|
564
|
+
promptPath2 = path3.resolve(resolved.resolvedPath);
|
|
565
|
+
}
|
|
566
|
+
}
|
|
567
|
+
aggregator = {
|
|
568
|
+
type: "llm_judge",
|
|
569
|
+
...aggregatorPrompt ? { prompt: aggregatorPrompt } : {},
|
|
570
|
+
...promptPath2 ? { promptPath: promptPath2 } : {}
|
|
571
|
+
};
|
|
572
|
+
}
|
|
573
|
+
evaluators.push({
|
|
574
|
+
name,
|
|
575
|
+
type: "composite",
|
|
576
|
+
evaluators: memberEvaluators,
|
|
577
|
+
aggregator
|
|
578
|
+
});
|
|
579
|
+
continue;
|
|
580
|
+
}
|
|
581
|
+
if (typeValue === "expected_messages") {
|
|
582
|
+
evaluators.push({
|
|
583
|
+
name,
|
|
584
|
+
type: "expected_messages"
|
|
585
|
+
});
|
|
586
|
+
continue;
|
|
587
|
+
}
|
|
588
|
+
if (typeValue === "tool_trajectory") {
|
|
589
|
+
const mode = asString2(rawEvaluator.mode);
|
|
590
|
+
if (mode !== "any_order" && mode !== "in_order" && mode !== "exact") {
|
|
591
|
+
logWarning2(
|
|
592
|
+
`Skipping tool_trajectory evaluator '${name}' in '${evalId}': invalid mode '${mode}' (must be any_order, in_order, or exact)`
|
|
593
|
+
);
|
|
594
|
+
continue;
|
|
595
|
+
}
|
|
596
|
+
const rawMinimums = rawEvaluator.minimums;
|
|
597
|
+
let minimums;
|
|
598
|
+
if (rawMinimums !== void 0) {
|
|
599
|
+
if (!isJsonObject2(rawMinimums)) {
|
|
600
|
+
logWarning2(
|
|
601
|
+
`Skipping tool_trajectory evaluator '${name}' in '${evalId}': minimums must be an object`
|
|
602
|
+
);
|
|
603
|
+
continue;
|
|
604
|
+
}
|
|
605
|
+
minimums = {};
|
|
606
|
+
for (const [toolName, count] of Object.entries(rawMinimums)) {
|
|
607
|
+
if (typeof count === "number" && count >= 0) {
|
|
608
|
+
minimums[toolName] = count;
|
|
609
|
+
}
|
|
610
|
+
}
|
|
611
|
+
}
|
|
612
|
+
const rawExpected = rawEvaluator.expected;
|
|
613
|
+
let expected;
|
|
614
|
+
if (rawExpected !== void 0) {
|
|
615
|
+
if (!Array.isArray(rawExpected)) {
|
|
616
|
+
logWarning2(
|
|
617
|
+
`Skipping tool_trajectory evaluator '${name}' in '${evalId}': expected must be an array`
|
|
618
|
+
);
|
|
619
|
+
continue;
|
|
620
|
+
}
|
|
621
|
+
expected = [];
|
|
622
|
+
for (const item of rawExpected) {
|
|
623
|
+
if (isJsonObject2(item) && typeof item.tool === "string") {
|
|
624
|
+
expected.push({ tool: item.tool });
|
|
625
|
+
}
|
|
626
|
+
}
|
|
627
|
+
}
|
|
628
|
+
if (mode === "any_order" && !minimums) {
|
|
629
|
+
logWarning2(
|
|
630
|
+
`Skipping tool_trajectory evaluator '${name}' in '${evalId}': any_order mode requires minimums`
|
|
631
|
+
);
|
|
632
|
+
continue;
|
|
633
|
+
}
|
|
634
|
+
if ((mode === "in_order" || mode === "exact") && !expected) {
|
|
635
|
+
logWarning2(
|
|
636
|
+
`Skipping tool_trajectory evaluator '${name}' in '${evalId}': ${mode} mode requires expected`
|
|
637
|
+
);
|
|
638
|
+
continue;
|
|
639
|
+
}
|
|
640
|
+
const config = {
|
|
641
|
+
name,
|
|
642
|
+
type: "tool_trajectory",
|
|
643
|
+
mode,
|
|
644
|
+
...minimums ? { minimums } : {},
|
|
645
|
+
...expected ? { expected } : {}
|
|
646
|
+
};
|
|
647
|
+
evaluators.push(config);
|
|
648
|
+
continue;
|
|
649
|
+
}
|
|
436
650
|
const prompt = asString2(rawEvaluator.prompt);
|
|
437
651
|
let promptPath;
|
|
438
652
|
if (prompt) {
|
|
@@ -686,6 +900,67 @@ ${detailBlock}${ANSI_RESET4}`);
|
|
|
686
900
|
console.warn(`${ANSI_YELLOW4}Warning: ${message}${ANSI_RESET4}`);
|
|
687
901
|
}
|
|
688
902
|
}
|
|
903
|
+
/**
 * Convert expected messages into output segments.
 *
 * One segment is produced per message and always carries the message `role`.
 * Assistant messages keep their `tool_calls` when that field is defined.
 * String content is copied as-is. Array content is processed entry by entry:
 * non-object entries are dropped, `type: "file"` entries are resolved against
 * `searchRoots` and replaced by the file text (CRLF converted to LF), and any
 * other object entry is cloned unchanged. Files that cannot be resolved or
 * read are reported through a warning and skipped.
 *
 * @param {object} options
 * @param {Array<object>} options.messages - Messages to convert.
 * @param {Array<string>} options.searchRoots - Roots passed to the file reference resolver.
 * @param {boolean} [options.verbose] - When truthy, logs each resolved file.
 * @returns {Promise<Array<object>>} The segments, in message order.
 */
async function processExpectedMessages(options) {
  // `options.repoRootPath` may be supplied by callers but is not needed here.
  const { messages, searchRoots, verbose } = options;
  const segments = [];
  for (const message of messages) {
    const segment = {
      role: message.role
    };
    if (message.role === "assistant" && message.tool_calls !== void 0) {
      segment.tool_calls = message.tool_calls;
    }
    const content = message.content;
    if (typeof content === "string") {
      segment.content = content;
    } else if (Array.isArray(content)) {
      const processedContent = [];
      for (const rawSegment of content) {
        if (!isJsonObject(rawSegment)) {
          continue;
        }
        const segmentType = asString3(rawSegment.type);
        if (segmentType === "file") {
          const rawValue = asString3(rawSegment.value);
          if (!rawValue) {
            continue;
          }
          const { displayPath, resolvedPath, attempted } = await resolveFileReference2(
            rawValue,
            searchRoots
          );
          if (!resolvedPath) {
            const attempts = attempted.length ? [" Tried:", ...attempted.map((candidate) => ` ${candidate}`)] : void 0;
            logWarning3(`File not found in expected_messages: ${displayPath}`, attempts);
            continue;
          }
          try {
            const fileContent = (await readFile3(resolvedPath, "utf8")).replace(/\r\n/g, "\n");
            processedContent.push({
              type: "file",
              path: displayPath,
              text: fileContent,
              resolvedPath: path4.resolve(resolvedPath)
            });
            if (verbose) {
              console.log(` [Expected Output File] Found: ${displayPath}`);
              console.log(` Resolved to: ${resolvedPath}`);
            }
          } catch (error) {
            // A thrown value is not guaranteed to be an Error; avoid logging "undefined".
            const reason = error instanceof Error ? error.message : String(error);
            logWarning3(
              `Could not read expected output file ${resolvedPath}: ${reason}`
            );
          }
          continue;
        }
        processedContent.push(cloneJsonObject(rawSegment));
      }
      segment.content = processedContent;
    }
    segments.push(segment);
  }
  return segments;
}
|
|
689
964
|
|
|
690
965
|
// src/evaluation/formatting/prompt-builder.ts
|
|
691
966
|
import { readFile as readFile4 } from "node:fs/promises";
|
|
@@ -990,12 +1265,10 @@ async function loadEvalCases(evalFilePath, repoRoot, options) {
|
|
|
990
1265
|
messageType: "input",
|
|
991
1266
|
verbose
|
|
992
1267
|
});
|
|
993
|
-
const outputSegments = hasExpectedMessages ? await
|
|
1268
|
+
const outputSegments = hasExpectedMessages ? await processExpectedMessages({
|
|
994
1269
|
messages: expectedMessages,
|
|
995
1270
|
searchRoots,
|
|
996
1271
|
repoRootPath,
|
|
997
|
-
guidelinePatterns,
|
|
998
|
-
messageType: "output",
|
|
999
1272
|
verbose
|
|
1000
1273
|
}) : [];
|
|
1001
1274
|
const codeSnippets = extractCodeBlocks(inputSegments);
|
|
@@ -1519,9 +1792,11 @@ var CliProvider = class {
|
|
|
1519
1792
|
const message = detail ? `${detail} (exit code ${codeText})` : `CLI exited with code ${codeText}`;
|
|
1520
1793
|
throw new Error(message);
|
|
1521
1794
|
}
|
|
1522
|
-
const
|
|
1795
|
+
const responseContent = await this.readAndCleanupOutputFile(outputFilePath);
|
|
1796
|
+
const parsed = this.parseOutputContent(responseContent);
|
|
1523
1797
|
return {
|
|
1524
|
-
text:
|
|
1798
|
+
text: parsed.text,
|
|
1799
|
+
trace: parsed.trace,
|
|
1525
1800
|
raw: {
|
|
1526
1801
|
command: renderedCommand,
|
|
1527
1802
|
stderr: result.stderr,
|
|
@@ -1531,6 +1806,31 @@ var CliProvider = class {
|
|
|
1531
1806
|
}
|
|
1532
1807
|
};
|
|
1533
1808
|
}
|
|
1809
|
+
/**
|
|
1810
|
+
* Parse output content from CLI.
|
|
1811
|
+
* If the content is valid JSON with a 'text' field, extract text and optional trace.
|
|
1812
|
+
* Otherwise, treat the entire content as plain text.
|
|
1813
|
+
*/
|
|
1814
|
+
parseOutputContent(content) {
|
|
1815
|
+
try {
|
|
1816
|
+
const parsed = JSON.parse(content);
|
|
1817
|
+
if (typeof parsed === "object" && parsed !== null && "text" in parsed) {
|
|
1818
|
+
const obj = parsed;
|
|
1819
|
+
const text = typeof obj.text === "string" ? obj.text : String(obj.text);
|
|
1820
|
+
const trace = this.parseTrace(obj.trace);
|
|
1821
|
+
return { text, trace };
|
|
1822
|
+
}
|
|
1823
|
+
} catch {
|
|
1824
|
+
}
|
|
1825
|
+
return { text: content };
|
|
1826
|
+
}
|
|
1827
|
+
parseTrace(trace) {
|
|
1828
|
+
if (!Array.isArray(trace)) {
|
|
1829
|
+
return void 0;
|
|
1830
|
+
}
|
|
1831
|
+
const validEvents = trace.filter(isTraceEvent);
|
|
1832
|
+
return validEvents.length > 0 ? validEvents : void 0;
|
|
1833
|
+
}
|
|
1534
1834
|
async readAndCleanupOutputFile(filePath) {
|
|
1535
1835
|
try {
|
|
1536
1836
|
const content = await readTextFile(filePath);
|
|
@@ -2517,6 +2817,7 @@ var MockProvider = class {
|
|
|
2517
2817
|
delayMs;
|
|
2518
2818
|
delayMinMs;
|
|
2519
2819
|
delayMaxMs;
|
|
2820
|
+
trace;
|
|
2520
2821
|
constructor(targetName, config) {
|
|
2521
2822
|
this.id = `mock:${targetName}`;
|
|
2522
2823
|
this.targetName = targetName;
|
|
@@ -2524,6 +2825,7 @@ var MockProvider = class {
|
|
|
2524
2825
|
this.delayMs = config.delayMs ?? 0;
|
|
2525
2826
|
this.delayMinMs = config.delayMinMs ?? 0;
|
|
2526
2827
|
this.delayMaxMs = config.delayMaxMs ?? 0;
|
|
2828
|
+
this.trace = config.trace;
|
|
2527
2829
|
}
|
|
2528
2830
|
async invoke(request) {
|
|
2529
2831
|
const delay = this.calculateDelay();
|
|
@@ -2535,7 +2837,8 @@ var MockProvider = class {
|
|
|
2535
2837
|
raw: {
|
|
2536
2838
|
question: request.question,
|
|
2537
2839
|
guidelines: request.guidelines
|
|
2538
|
-
}
|
|
2840
|
+
},
|
|
2841
|
+
trace: this.trace
|
|
2539
2842
|
};
|
|
2540
2843
|
}
|
|
2541
2844
|
calculateDelay() {
|
|
@@ -3329,6 +3632,473 @@ function substituteVariables(template, variables) {
|
|
|
3329
3632
|
return variables[varName] ?? match;
|
|
3330
3633
|
});
|
|
3331
3634
|
}
|
|
3635
|
+
/**
 * Evaluator that scores the tool calls found in a candidate trace against the
 * configured expectations.
 *
 * The behaviour depends on `config.mode`:
 * - "any_order": every tool listed in `config.minimums` must have been called
 *   at least the given number of times (counts come from the trace summary).
 * - "in_order": the tools in `config.expected` must appear in that order among
 *   the trace's tool calls; other calls may occur in between.
 * - "exact": the trace's tool calls must match `config.expected` position by
 *   position, with no missing and no surplus calls.
 *
 * Every result has the shape `{ score, verdict, hits, misses, expectedAspectCount }`.
 */
var ToolTrajectoryEvaluator = class {
  kind = "tool_trajectory";
  config;
  /**
   * @param {{config: object}} options - `options.config` holds `mode` plus
   *   the optional `minimums` and `expected` settings.
   */
  constructor(options) {
    this.config = options.config;
  }
  /**
   * Run the evaluation for the configured mode.
   *
   * @param {object} context - Must provide `candidateTrace` and
   *   `candidateTraceSummary`; when either is missing the result is a failure.
   * @returns {object} Evaluation result.
   */
  evaluate(context) {
    const { candidateTrace, candidateTraceSummary } = context;
    if (!candidateTrace || !candidateTraceSummary) {
      return {
        score: 0,
        verdict: "fail",
        hits: [],
        misses: ["No trace available for evaluation"],
        expectedAspectCount: 1
      };
    }
    switch (this.config.mode) {
      case "any_order":
        return this.evaluateAnyOrder(candidateTraceSummary);
      case "in_order":
        return this.evaluateInOrder(candidateTrace);
      case "exact":
        return this.evaluateExact(candidateTrace);
      default:
        return {
          score: 0,
          verdict: "fail",
          hits: [],
          misses: [`Unknown mode: ${this.config.mode}`],
          expectedAspectCount: 1
        };
    }
  }
  /**
   * Check per-tool minimum call counts, ignoring order.
   *
   * @param {{toolCallsByName: Object<string, number>}} summary - Trace summary.
   * @returns {object} Result whose score is the fraction of tools meeting their minimum.
   */
  evaluateAnyOrder(summary) {
    const minimums = this.config.minimums ?? {};
    const toolNames = Object.keys(minimums);
    if (toolNames.length === 0) {
      return {
        score: 1,
        verdict: "pass",
        hits: ["No tool requirements specified"],
        misses: [],
        expectedAspectCount: 0
      };
    }
    const callCounts = summary.toolCallsByName;
    const hits = [];
    const misses = [];
    for (const toolName of toolNames) {
      const required = minimums[toolName];
      // Own-property lookup: a tool named like an Object.prototype member
      // (e.g. "constructor") must not pick up the inherited value as its count.
      const actual = (Object.hasOwn(callCounts, toolName) ? callCounts[toolName] : void 0) ?? 0;
      if (actual >= required) {
        hits.push(`${toolName}: called ${actual} times (required \u2265${required})`);
      } else {
        misses.push(`${toolName}: called ${actual} times (required \u2265${required})`);
      }
    }
    const score = hits.length / toolNames.length;
    return {
      score,
      verdict: scoreToVerdict(score),
      hits,
      misses,
      expectedAspectCount: toolNames.length
    };
  }
  /**
   * Check that the expected tools occur in order, allowing other calls between them.
   *
   * @param {Array<object>} trace - Candidate trace events.
   * @returns {object} Result whose score is the fraction of expected tools found.
   */
  evaluateInOrder(trace) {
    const expected = this.config.expected ?? [];
    if (expected.length === 0) {
      return {
        score: 1,
        verdict: "pass",
        hits: ["No tool sequence specified"],
        misses: [],
        expectedAspectCount: 0
      };
    }
    const actualToolCalls = trace.filter((e) => e.type === "tool_call" && e.name);
    const hits = [];
    const misses = [];
    // Cursor into actualToolCalls; it only moves forward, which enforces the ordering.
    let actualIndex = 0;
    for (let i = 0; i < expected.length; i++) {
      const expectedTool = expected[i].tool;
      let found = false;
      while (actualIndex < actualToolCalls.length) {
        if (actualToolCalls[actualIndex].name === expectedTool) {
          hits.push(`Found ${expectedTool} at position ${actualIndex}`);
          actualIndex++;
          found = true;
          break;
        }
        actualIndex++;
      }
      if (!found) {
        misses.push(`Expected ${expectedTool} at position ${i}, not found in remaining trace`);
      }
    }
    const score = hits.length / expected.length;
    return {
      score,
      verdict: scoreToVerdict(score),
      hits,
      misses,
      expectedAspectCount: expected.length
    };
  }
  /**
   * Check that the trace's tool calls match the expected sequence exactly.
   *
   * @param {Array<object>} trace - Candidate trace events.
   * @returns {object} Result whose score is the number of matching positions
   *   divided by the longer of the expected and actual sequences.
   */
  evaluateExact(trace) {
    const expected = this.config.expected ?? [];
    if (expected.length === 0) {
      return {
        score: 1,
        verdict: "pass",
        hits: ["No tool sequence specified"],
        misses: [],
        expectedAspectCount: 0
      };
    }
    const actualToolCalls = trace.filter((e) => e.type === "tool_call" && e.name);
    const hits = [];
    const misses = [];
    if (actualToolCalls.length !== expected.length) {
      misses.push(`Expected ${expected.length} tool calls, got ${actualToolCalls.length}`);
    }
    const checkLength = Math.min(expected.length, actualToolCalls.length);
    for (let i = 0; i < checkLength; i++) {
      const expectedTool = expected[i].tool;
      const actualTool = actualToolCalls[i].name;
      if (actualTool === expectedTool) {
        hits.push(`Position ${i}: ${expectedTool} \u2713`);
      } else {
        misses.push(`Position ${i}: expected ${expectedTool}, got ${actualTool}`);
      }
    }
    for (let i = checkLength; i < expected.length; i++) {
      misses.push(`Position ${i}: expected ${expected[i].tool}, got nothing`);
    }
    // Divide by the longer sequence so surplus tool calls lower the score.
    // Dividing by expected.length alone let a trace with extra calls reach
    // 1.0 even though the sequence was not an exact match.
    const score = hits.length / Math.max(expected.length, actualToolCalls.length);
    return {
      score,
      verdict: scoreToVerdict(score),
      hits,
      misses,
      expectedAspectCount: expected.length
    };
  }
};
|
|
3781
|
+
/**
 * Evaluator that validates the tool calls declared on assistant messages of
 * an eval case's expected segments against the `tool_call` events of the
 * candidate trace, position by position.
 *
 * Every result has the shape `{ score, verdict, hits, misses, expectedAspectCount }`.
 */
var ExpectedMessagesEvaluator = class {
  kind = "expected_messages";
  /**
   * @param {object} context - Provides `candidateTrace` and
   *   `evalCase.expected_segments`.
   * @returns {object} A pass when no tool calls are expected, a failure when
   *   tool calls are expected but the trace is missing or empty, otherwise the
   *   outcome of `validateToolCalls`.
   */
  evaluate(context) {
    const { candidateTrace, evalCase } = context;
    const expectedToolCalls = this.extractExpectedToolCalls(evalCase.expected_segments);
    if (expectedToolCalls.length === 0) {
      return {
        score: 1,
        verdict: "pass",
        hits: ["No tool_calls specified in expected_messages"],
        misses: [],
        expectedAspectCount: 1
      };
    }
    if (!candidateTrace || candidateTrace.length === 0) {
      return {
        score: 0,
        verdict: "fail",
        hits: [],
        misses: ["No trace available to validate tool_calls"],
        expectedAspectCount: expectedToolCalls.length
      };
    }
    const actualToolCalls = candidateTrace.filter((e) => e.type === "tool_call");
    return this.validateToolCalls(expectedToolCalls, actualToolCalls);
  }
  /**
   * Collect `{ tool, input }` pairs from the `tool_calls` of assistant segments.
   *
   * @param {Array<object>|undefined} segments - Expected segments, if any.
   * @returns {Array<{tool: string, input: unknown}>} Entries in declaration
   *   order; entries without a string `tool` are ignored.
   */
  extractExpectedToolCalls(segments) {
    if (!segments) {
      return [];
    }
    const toolCalls = [];
    for (const segment of segments) {
      if (segment.role !== "assistant" || !Array.isArray(segment.tool_calls)) {
        continue;
      }
      for (const tc of segment.tool_calls) {
        if (typeof tc === "object" && tc !== null && typeof tc.tool === "string") {
          toolCalls.push({ tool: tc.tool, input: tc.input });
        }
      }
    }
    return toolCalls;
  }
  /**
   * Compare expected and actual tool calls index by index.
   *
   * @param {Array<{tool: string, input: unknown}>} expected - Expected calls.
   * @param {Array<object>} actual - `tool_call` events from the trace.
   * @returns {object} Result whose score is matched calls divided by expected
   *   calls. An expected call matches when the names agree and, if an expected
   *   `input` is given, the inputs are deeply equal.
   */
  validateToolCalls(expected, actual) {
    const hits = [];
    const misses = [];
    for (let i = 0; i < expected.length; i++) {
      const expectedCall = expected[i];
      const actualCall = actual[i];
      if (!actualCall) {
        misses.push(
          `tool_calls[${i}]: expected ${expectedCall.tool}, but no more tool calls in trace`
        );
        continue;
      }
      if (actualCall.name !== expectedCall.tool) {
        misses.push(
          `tool_calls[${i}]: expected ${expectedCall.tool}, got ${actualCall.name ?? "unknown"}`
        );
        continue;
      }
      if (expectedCall.input !== void 0 && !this.deepEquals(expectedCall.input, actualCall.input)) {
        misses.push(`tool_calls[${i}]: ${expectedCall.tool} input mismatch`);
        continue;
      }
      hits.push(`tool_calls[${i}]: ${expectedCall.tool} matched`);
    }
    // Guard against division by zero when nothing is expected.
    const totalChecks = expected.length || 1;
    const score = hits.length / totalChecks;
    return {
      score,
      verdict: score >= 0.8 ? "pass" : score >= 0.6 ? "borderline" : "fail",
      hits,
      misses,
      expectedAspectCount: totalChecks
    };
  }
  /**
   * Structural equality for primitives, arrays and plain objects.
   *
   * @param {unknown} a
   * @param {unknown} b
   * @returns {boolean} `true` when both values are strictly equal, or are
   *   arrays / objects with the same own keys and deeply equal members.
   */
  deepEquals(a, b) {
    if (a === b) return true;
    if (typeof a !== "object" || typeof b !== "object" || a === null || b === null) {
      return false;
    }
    const aIsArray = Array.isArray(a);
    if (aIsArray !== Array.isArray(b)) return false;
    if (aIsArray) {
      return a.length === b.length && a.every((val, i) => this.deepEquals(val, b[i]));
    }
    const aKeys = Object.keys(a);
    if (aKeys.length !== Object.keys(b).length) return false;
    // Require each key on both sides: equal key counts alone would treat
    // { x: undefined } and { y: undefined } as equal.
    return aKeys.every((key) => Object.hasOwn(b, key) && this.deepEquals(a[key], b[key]));
  }
};
|
|
3880
|
+
// Default prompt template for aggregating composite evaluator results; it
// contains the {{EVALUATOR_RESULTS_JSON}} placeholder.
var DEFAULT_COMPOSITE_AGGREGATOR_PROMPT = [
  "Review the following evaluation results:",
  "{{EVALUATOR_RESULTS_JSON}}",
  "",
  "Decide the final score and verdict based on all evaluator results.",
  "Return a JSON object with: score (0.0-1.0), verdict (pass/fail/borderline), and reasoning."
].join("\n");
|
|
3885
|
+
var CompositeEvaluator = class {
|
|
3886
|
+
kind = "composite";
|
|
3887
|
+
config;
|
|
3888
|
+
evaluatorFactory;
|
|
3889
|
+
cwd;
|
|
3890
|
+
constructor(options) {
|
|
3891
|
+
this.config = options.config;
|
|
3892
|
+
this.evaluatorFactory = options.evaluatorFactory;
|
|
3893
|
+
this.cwd = options.cwd;
|
|
3894
|
+
}
|
|
3895
|
+
async evaluate(context) {
|
|
3896
|
+
const memberResults = await Promise.all(
|
|
3897
|
+
this.config.evaluators.map(async (memberConfig) => {
|
|
3898
|
+
const evaluator = this.evaluatorFactory.create(memberConfig, context);
|
|
3899
|
+
return {
|
|
3900
|
+
id: memberConfig.name,
|
|
3901
|
+
type: memberConfig.type,
|
|
3902
|
+
result: await evaluator.evaluate(context)
|
|
3903
|
+
};
|
|
3904
|
+
})
|
|
3905
|
+
);
|
|
3906
|
+
return this.aggregate(memberResults, context);
|
|
3907
|
+
}
|
|
3908
|
+
async aggregate(results, context) {
|
|
3909
|
+
const aggregator = this.config.aggregator;
|
|
3910
|
+
switch (aggregator.type) {
|
|
3911
|
+
case "code_judge":
|
|
3912
|
+
return this.runCodeAggregator(results, aggregator.path, aggregator.cwd ?? this.cwd);
|
|
3913
|
+
case "llm_judge":
|
|
3914
|
+
return this.runLlmAggregator(results, context, aggregator);
|
|
3915
|
+
default:
|
|
3916
|
+
return this.runWeightedAverage(results, aggregator.weights);
|
|
3917
|
+
}
|
|
3918
|
+
}
|
|
3919
|
+
runWeightedAverage(results, weights) {
|
|
3920
|
+
let totalWeight = 0;
|
|
3921
|
+
let weightedSum = 0;
|
|
3922
|
+
const allHits = [];
|
|
3923
|
+
const allMisses = [];
|
|
3924
|
+
const reasoningParts = [];
|
|
3925
|
+
const evaluatorResults = [];
|
|
3926
|
+
for (const member of results) {
|
|
3927
|
+
const weight = weights?.[member.id] ?? 1;
|
|
3928
|
+
totalWeight += weight;
|
|
3929
|
+
weightedSum += member.result.score * weight;
|
|
3930
|
+
allHits.push(...member.result.hits.map((h) => `[${member.id}] ${h}`));
|
|
3931
|
+
allMisses.push(...member.result.misses.map((m) => `[${member.id}] ${m}`));
|
|
3932
|
+
if (member.result.reasoning) {
|
|
3933
|
+
reasoningParts.push(`${member.id}: ${member.result.reasoning}`);
|
|
3934
|
+
}
|
|
3935
|
+
evaluatorResults.push({
|
|
3936
|
+
name: member.id,
|
|
3937
|
+
type: member.type,
|
|
3938
|
+
score: member.result.score,
|
|
3939
|
+
weight,
|
|
3940
|
+
verdict: member.result.verdict,
|
|
3941
|
+
hits: [...member.result.hits],
|
|
3942
|
+
misses: [...member.result.misses],
|
|
3943
|
+
reasoning: member.result.reasoning,
|
|
3944
|
+
evaluatorRawRequest: member.result.evaluatorRawRequest,
|
|
3945
|
+
evaluatorResults: member.result.evaluatorResults
|
|
3946
|
+
});
|
|
3947
|
+
}
|
|
3948
|
+
const finalScore = totalWeight > 0 ? weightedSum / totalWeight : 0;
|
|
3949
|
+
return {
|
|
3950
|
+
score: clampScore(finalScore),
|
|
3951
|
+
verdict: scoreToVerdict(finalScore),
|
|
3952
|
+
hits: allHits,
|
|
3953
|
+
misses: allMisses,
|
|
3954
|
+
expectedAspectCount: Math.max(allHits.length + allMisses.length, 1),
|
|
3955
|
+
reasoning: reasoningParts.length > 0 ? reasoningParts.join("; ") : void 0,
|
|
3956
|
+
evaluatorRawRequest: {
|
|
3957
|
+
aggregator: "weighted_average",
|
|
3958
|
+
...weights ? { weights } : {}
|
|
3959
|
+
},
|
|
3960
|
+
evaluatorResults
|
|
3961
|
+
};
|
|
3962
|
+
}
|
|
3963
|
+
async runCodeAggregator(results, scriptPath, cwd, weights) {
|
|
3964
|
+
const resultsObject = Object.fromEntries(results.map((r) => [r.id, r.result]));
|
|
3965
|
+
const inputPayload = JSON.stringify({ results: resultsObject }, null, 2);
|
|
3966
|
+
const evaluatorResults = results.map((member) => ({
|
|
3967
|
+
name: member.id,
|
|
3968
|
+
type: member.type,
|
|
3969
|
+
score: member.result.score,
|
|
3970
|
+
weight: weights?.[member.id] ?? 1,
|
|
3971
|
+
verdict: member.result.verdict,
|
|
3972
|
+
hits: [...member.result.hits],
|
|
3973
|
+
misses: [...member.result.misses],
|
|
3974
|
+
reasoning: member.result.reasoning,
|
|
3975
|
+
evaluatorRawRequest: member.result.evaluatorRawRequest,
|
|
3976
|
+
evaluatorResults: member.result.evaluatorResults
|
|
3977
|
+
}));
|
|
3978
|
+
try {
|
|
3979
|
+
const stdout = await executeScript(scriptPath, inputPayload, void 0, cwd);
|
|
3980
|
+
const parsed = parseJsonSafe(stdout);
|
|
3981
|
+
const score = clampScore(typeof parsed?.score === "number" ? parsed.score : 0);
|
|
3982
|
+
const hits = Array.isArray(parsed?.hits) ? parsed.hits.filter(isNonEmptyString) : [];
|
|
3983
|
+
const misses = Array.isArray(parsed?.misses) ? parsed.misses.filter(isNonEmptyString) : [];
|
|
3984
|
+
const reasoning = typeof parsed?.reasoning === "string" ? parsed.reasoning : void 0;
|
|
3985
|
+
const verdict = typeof parsed?.verdict === "string" && (parsed.verdict === "pass" || parsed.verdict === "fail" || parsed.verdict === "borderline") ? parsed.verdict : scoreToVerdict(score);
|
|
3986
|
+
return {
|
|
3987
|
+
score,
|
|
3988
|
+
verdict,
|
|
3989
|
+
hits,
|
|
3990
|
+
misses,
|
|
3991
|
+
expectedAspectCount: hits.length + misses.length || 1,
|
|
3992
|
+
reasoning,
|
|
3993
|
+
evaluatorRawRequest: {
|
|
3994
|
+
aggregator: "code_judge",
|
|
3995
|
+
script: scriptPath
|
|
3996
|
+
},
|
|
3997
|
+
evaluatorResults
|
|
3998
|
+
};
|
|
3999
|
+
} catch (error) {
|
|
4000
|
+
const message = error instanceof Error ? error.message : String(error);
|
|
4001
|
+
return {
|
|
4002
|
+
score: 0,
|
|
4003
|
+
verdict: "fail",
|
|
4004
|
+
hits: [],
|
|
4005
|
+
misses: [`Code aggregator failed: ${message}`],
|
|
4006
|
+
expectedAspectCount: 1,
|
|
4007
|
+
reasoning: message,
|
|
4008
|
+
evaluatorRawRequest: {
|
|
4009
|
+
aggregator: "code_judge",
|
|
4010
|
+
script: scriptPath,
|
|
4011
|
+
error: message
|
|
4012
|
+
},
|
|
4013
|
+
evaluatorResults
|
|
4014
|
+
};
|
|
4015
|
+
}
|
|
4016
|
+
}
|
|
4017
|
+
async runLlmAggregator(results, context, config) {
|
|
4018
|
+
const judgeProvider = context.judgeProvider;
|
|
4019
|
+
if (!judgeProvider) {
|
|
4020
|
+
throw new Error("No judge provider available for LLM aggregation");
|
|
4021
|
+
}
|
|
4022
|
+
const resultsObject = Object.fromEntries(results.map((r) => [r.id, r.result]));
|
|
4023
|
+
const resultsJson = JSON.stringify(resultsObject, null, 2);
|
|
4024
|
+
const evaluatorResults = results.map((member) => ({
|
|
4025
|
+
name: member.id,
|
|
4026
|
+
type: member.type,
|
|
4027
|
+
score: member.result.score,
|
|
4028
|
+
verdict: member.result.verdict,
|
|
4029
|
+
hits: [...member.result.hits],
|
|
4030
|
+
misses: [...member.result.misses],
|
|
4031
|
+
reasoning: member.result.reasoning,
|
|
4032
|
+
evaluatorRawRequest: member.result.evaluatorRawRequest,
|
|
4033
|
+
evaluatorResults: member.result.evaluatorResults
|
|
4034
|
+
}));
|
|
4035
|
+
const promptTemplate = config.prompt ?? DEFAULT_COMPOSITE_AGGREGATOR_PROMPT;
|
|
4036
|
+
const userPrompt = promptTemplate.replace(/\{\{EVALUATOR_RESULTS_JSON\}\}/g, resultsJson);
|
|
4037
|
+
const systemPrompt = buildOutputSchema();
|
|
4038
|
+
const evaluatorRawRequest = {
|
|
4039
|
+
aggregator: "llm_judge",
|
|
4040
|
+
userPrompt,
|
|
4041
|
+
systemPrompt,
|
|
4042
|
+
target: judgeProvider.targetName
|
|
4043
|
+
};
|
|
4044
|
+
try {
|
|
4045
|
+
const model = judgeProvider.asLanguageModel?.();
|
|
4046
|
+
if (model) {
|
|
4047
|
+
const { text } = await generateText2({
|
|
4048
|
+
model,
|
|
4049
|
+
system: systemPrompt,
|
|
4050
|
+
prompt: userPrompt
|
|
4051
|
+
});
|
|
4052
|
+
const data2 = freeformEvaluationSchema.parse(parseJsonFromText(text));
|
|
4053
|
+
const score2 = clampScore(data2.score);
|
|
4054
|
+
const hits2 = Array.isArray(data2.hits) ? data2.hits.filter(isNonEmptyString).slice(0, 4) : [];
|
|
4055
|
+
const misses2 = Array.isArray(data2.misses) ? data2.misses.filter(isNonEmptyString).slice(0, 4) : [];
|
|
4056
|
+
const reasoning2 = data2.reasoning;
|
|
4057
|
+
return {
|
|
4058
|
+
score: score2,
|
|
4059
|
+
verdict: scoreToVerdict(score2),
|
|
4060
|
+
hits: hits2,
|
|
4061
|
+
misses: misses2,
|
|
4062
|
+
expectedAspectCount: Math.max(hits2.length + misses2.length, 1),
|
|
4063
|
+
reasoning: reasoning2,
|
|
4064
|
+
evaluatorRawRequest,
|
|
4065
|
+
evaluatorResults
|
|
4066
|
+
};
|
|
4067
|
+
}
|
|
4068
|
+
const response = await judgeProvider.invoke({
|
|
4069
|
+
question: userPrompt,
|
|
4070
|
+
systemPrompt,
|
|
4071
|
+
evalCaseId: context.evalCase.id,
|
|
4072
|
+
attempt: context.attempt
|
|
4073
|
+
});
|
|
4074
|
+
const data = freeformEvaluationSchema.parse(parseJsonFromText(response.text ?? ""));
|
|
4075
|
+
const score = clampScore(data.score);
|
|
4076
|
+
const hits = Array.isArray(data.hits) ? data.hits.filter(isNonEmptyString).slice(0, 4) : [];
|
|
4077
|
+
const misses = Array.isArray(data.misses) ? data.misses.filter(isNonEmptyString).slice(0, 4) : [];
|
|
4078
|
+
const reasoning = data.reasoning ?? response.reasoning;
|
|
4079
|
+
return {
|
|
4080
|
+
score,
|
|
4081
|
+
verdict: scoreToVerdict(score),
|
|
4082
|
+
hits,
|
|
4083
|
+
misses,
|
|
4084
|
+
expectedAspectCount: Math.max(hits.length + misses.length, 1),
|
|
4085
|
+
reasoning,
|
|
4086
|
+
evaluatorRawRequest,
|
|
4087
|
+
evaluatorResults
|
|
4088
|
+
};
|
|
4089
|
+
} catch {
|
|
4090
|
+
return {
|
|
4091
|
+
score: 0,
|
|
4092
|
+
verdict: "fail",
|
|
4093
|
+
hits: [],
|
|
4094
|
+
misses: [],
|
|
4095
|
+
expectedAspectCount: 1,
|
|
4096
|
+
evaluatorRawRequest,
|
|
4097
|
+
evaluatorResults
|
|
4098
|
+
};
|
|
4099
|
+
}
|
|
4100
|
+
}
|
|
4101
|
+
};
|
|
3332
4102
|
|
|
3333
4103
|
// src/evaluation/orchestrator.ts
|
|
3334
4104
|
import { createHash, randomUUID as randomUUID2 } from "node:crypto";
|
|
@@ -3530,7 +4300,7 @@ async function runEvaluation(options) {
|
|
|
3530
4300
|
if (!definition) {
|
|
3531
4301
|
return void 0;
|
|
3532
4302
|
}
|
|
3533
|
-
const resolved = resolveTargetDefinition(definition, envLookup);
|
|
4303
|
+
const resolved = resolveTargetDefinition(definition, envLookup, evalFilePath);
|
|
3534
4304
|
resolvedTargetsByName.set(name, resolved);
|
|
3535
4305
|
return resolved;
|
|
3536
4306
|
};
|
|
@@ -3844,6 +4614,17 @@ async function runEvalCase(options) {
|
|
|
3844
4614
|
if (cacheKey && cache && !cachedResponse) {
|
|
3845
4615
|
await cache.set(cacheKey, providerResponse);
|
|
3846
4616
|
}
|
|
4617
|
+
let candidateTrace = providerResponse.trace;
|
|
4618
|
+
if (!candidateTrace && providerResponse.traceRef) {
|
|
4619
|
+
try {
|
|
4620
|
+
const rawTrace = await readJsonFile(providerResponse.traceRef);
|
|
4621
|
+
if (Array.isArray(rawTrace) && rawTrace.every(isTraceEvent)) {
|
|
4622
|
+
candidateTrace = rawTrace;
|
|
4623
|
+
}
|
|
4624
|
+
} catch {
|
|
4625
|
+
}
|
|
4626
|
+
}
|
|
4627
|
+
const candidateTraceSummary = candidateTrace ? computeTraceSummary(candidateTrace) : void 0;
|
|
3847
4628
|
try {
|
|
3848
4629
|
return await evaluateCandidate({
|
|
3849
4630
|
evalCase,
|
|
@@ -3855,7 +4636,9 @@ async function runEvalCase(options) {
|
|
|
3855
4636
|
nowFn,
|
|
3856
4637
|
attempt,
|
|
3857
4638
|
judgeProvider,
|
|
3858
|
-
agentTimeoutMs
|
|
4639
|
+
agentTimeoutMs,
|
|
4640
|
+
candidateTrace,
|
|
4641
|
+
candidateTraceSummary
|
|
3859
4642
|
});
|
|
3860
4643
|
} catch (error) {
|
|
3861
4644
|
return buildErrorResult(evalCase, target.name, nowFn(), error, promptInputs, provider);
|
|
@@ -3872,7 +4655,9 @@ async function evaluateCandidate(options) {
|
|
|
3872
4655
|
nowFn,
|
|
3873
4656
|
attempt,
|
|
3874
4657
|
judgeProvider,
|
|
3875
|
-
agentTimeoutMs
|
|
4658
|
+
agentTimeoutMs,
|
|
4659
|
+
candidateTrace,
|
|
4660
|
+
candidateTraceSummary
|
|
3876
4661
|
} = options;
|
|
3877
4662
|
const gradeTimestamp = nowFn();
|
|
3878
4663
|
const { score, evaluatorResults } = await runEvaluatorsForCase({
|
|
@@ -3885,7 +4670,9 @@ async function evaluateCandidate(options) {
|
|
|
3885
4670
|
promptInputs,
|
|
3886
4671
|
now: gradeTimestamp,
|
|
3887
4672
|
judgeProvider,
|
|
3888
|
-
agentTimeoutMs
|
|
4673
|
+
agentTimeoutMs,
|
|
4674
|
+
candidateTrace,
|
|
4675
|
+
candidateTraceSummary
|
|
3889
4676
|
});
|
|
3890
4677
|
const completedAt = nowFn();
|
|
3891
4678
|
let agentProviderRequest;
|
|
@@ -3924,7 +4711,8 @@ async function evaluateCandidate(options) {
|
|
|
3924
4711
|
agent_provider_request: agentProviderRequest,
|
|
3925
4712
|
lm_provider_request: lmProviderRequest,
|
|
3926
4713
|
evaluator_provider_request: evaluatorResults ? void 0 : score.evaluatorRawRequest,
|
|
3927
|
-
evaluator_results: evaluatorResults
|
|
4714
|
+
evaluator_results: evaluatorResults,
|
|
4715
|
+
trace_summary: candidateTraceSummary
|
|
3928
4716
|
};
|
|
3929
4717
|
}
|
|
3930
4718
|
async function runEvaluatorsForCase(options) {
|
|
@@ -3938,7 +4726,9 @@ async function runEvaluatorsForCase(options) {
|
|
|
3938
4726
|
promptInputs,
|
|
3939
4727
|
now,
|
|
3940
4728
|
judgeProvider,
|
|
3941
|
-
agentTimeoutMs
|
|
4729
|
+
agentTimeoutMs,
|
|
4730
|
+
candidateTrace,
|
|
4731
|
+
candidateTraceSummary
|
|
3942
4732
|
} = options;
|
|
3943
4733
|
if (evalCase.evaluators && evalCase.evaluators.length > 0) {
|
|
3944
4734
|
return runEvaluatorList({
|
|
@@ -3952,7 +4742,9 @@ async function runEvaluatorsForCase(options) {
|
|
|
3952
4742
|
promptInputs,
|
|
3953
4743
|
now,
|
|
3954
4744
|
judgeProvider,
|
|
3955
|
-
agentTimeoutMs
|
|
4745
|
+
agentTimeoutMs,
|
|
4746
|
+
candidateTrace,
|
|
4747
|
+
candidateTraceSummary
|
|
3956
4748
|
});
|
|
3957
4749
|
}
|
|
3958
4750
|
const evaluatorKind = evalCase.evaluator ?? "llm_judge";
|
|
@@ -3968,7 +4760,9 @@ async function runEvaluatorsForCase(options) {
|
|
|
3968
4760
|
attempt,
|
|
3969
4761
|
promptInputs,
|
|
3970
4762
|
now,
|
|
3971
|
-
judgeProvider
|
|
4763
|
+
judgeProvider,
|
|
4764
|
+
candidateTrace,
|
|
4765
|
+
candidateTraceSummary
|
|
3972
4766
|
});
|
|
3973
4767
|
return { score };
|
|
3974
4768
|
}
|
|
@@ -3984,7 +4778,9 @@ async function runEvaluatorList(options) {
|
|
|
3984
4778
|
promptInputs,
|
|
3985
4779
|
now,
|
|
3986
4780
|
judgeProvider,
|
|
3987
|
-
agentTimeoutMs
|
|
4781
|
+
agentTimeoutMs,
|
|
4782
|
+
candidateTrace,
|
|
4783
|
+
candidateTraceSummary
|
|
3988
4784
|
} = options;
|
|
3989
4785
|
const scored = [];
|
|
3990
4786
|
const evaluatorResults = [];
|
|
@@ -4030,6 +4826,63 @@ async function runEvaluatorList(options) {
|
|
|
4030
4826
|
promptInputs,
|
|
4031
4827
|
now
|
|
4032
4828
|
});
|
|
4829
|
+
scored.push({ score: score2, name: evaluator.name, type: "code_judge" });
|
|
4830
|
+
evaluatorResults.push({
|
|
4831
|
+
name: evaluator.name,
|
|
4832
|
+
type: "code_judge",
|
|
4833
|
+
score: score2.score,
|
|
4834
|
+
verdict: score2.verdict,
|
|
4835
|
+
hits: score2.hits,
|
|
4836
|
+
misses: score2.misses,
|
|
4837
|
+
reasoning: score2.reasoning,
|
|
4838
|
+
evaluator_provider_request: score2.evaluatorRawRequest
|
|
4839
|
+
});
|
|
4840
|
+
}
|
|
4841
|
+
if (evaluator.type === "composite") {
|
|
4842
|
+
const evalFileDir = evalCase.guideline_paths[0] ? path12.dirname(evalCase.guideline_paths[0]) : process.cwd();
|
|
4843
|
+
const createEvaluator = (memberConfig) => {
|
|
4844
|
+
switch (memberConfig.type) {
|
|
4845
|
+
case "llm_judge":
|
|
4846
|
+
return evaluatorRegistry.llm_judge;
|
|
4847
|
+
case "code":
|
|
4848
|
+
return new CodeEvaluator({
|
|
4849
|
+
script: memberConfig.script,
|
|
4850
|
+
cwd: memberConfig.resolvedCwd ?? memberConfig.cwd,
|
|
4851
|
+
agentTimeoutMs
|
|
4852
|
+
});
|
|
4853
|
+
case "composite":
|
|
4854
|
+
return new CompositeEvaluator({
|
|
4855
|
+
config: memberConfig,
|
|
4856
|
+
cwd: evalFileDir,
|
|
4857
|
+
evaluatorFactory: { create: createEvaluator }
|
|
4858
|
+
});
|
|
4859
|
+
case "tool_trajectory":
|
|
4860
|
+
return new ToolTrajectoryEvaluator({
|
|
4861
|
+
config: memberConfig
|
|
4862
|
+
});
|
|
4863
|
+
case "expected_messages":
|
|
4864
|
+
return new ExpectedMessagesEvaluator();
|
|
4865
|
+
default: {
|
|
4866
|
+
const unknownConfig = memberConfig;
|
|
4867
|
+
throw new Error(`Unsupported evaluator type in composite: ${unknownConfig.type}`);
|
|
4868
|
+
}
|
|
4869
|
+
}
|
|
4870
|
+
};
|
|
4871
|
+
const compositeEvaluator = new CompositeEvaluator({
|
|
4872
|
+
config: evaluator,
|
|
4873
|
+
cwd: evalFileDir,
|
|
4874
|
+
evaluatorFactory: { create: createEvaluator }
|
|
4875
|
+
});
|
|
4876
|
+
const score2 = await compositeEvaluator.evaluate({
|
|
4877
|
+
evalCase,
|
|
4878
|
+
candidate,
|
|
4879
|
+
target,
|
|
4880
|
+
provider,
|
|
4881
|
+
attempt,
|
|
4882
|
+
promptInputs,
|
|
4883
|
+
now,
|
|
4884
|
+
judgeProvider
|
|
4885
|
+
});
|
|
4033
4886
|
scored.push({ score: score2, name: evaluator.name, type: evaluator.type });
|
|
4034
4887
|
evaluatorResults.push({
|
|
4035
4888
|
name: evaluator.name,
|
|
@@ -4039,7 +4892,58 @@ async function runEvaluatorList(options) {
|
|
|
4039
4892
|
hits: score2.hits,
|
|
4040
4893
|
misses: score2.misses,
|
|
4041
4894
|
reasoning: score2.reasoning,
|
|
4042
|
-
evaluator_provider_request: score2.evaluatorRawRequest
|
|
4895
|
+
evaluator_provider_request: score2.evaluatorRawRequest,
|
|
4896
|
+
evaluator_results: mapChildResults(score2.evaluatorResults)
|
|
4897
|
+
});
|
|
4898
|
+
}
|
|
4899
|
+
if (evaluator.type === "tool_trajectory") {
|
|
4900
|
+
const trajectoryEvaluator = new ToolTrajectoryEvaluator({
|
|
4901
|
+
config: evaluator
|
|
4902
|
+
});
|
|
4903
|
+
const score2 = trajectoryEvaluator.evaluate({
|
|
4904
|
+
evalCase,
|
|
4905
|
+
candidate,
|
|
4906
|
+
target,
|
|
4907
|
+
provider,
|
|
4908
|
+
attempt,
|
|
4909
|
+
promptInputs,
|
|
4910
|
+
now,
|
|
4911
|
+
candidateTrace,
|
|
4912
|
+
candidateTraceSummary
|
|
4913
|
+
});
|
|
4914
|
+
scored.push({ score: score2, name: evaluator.name, type: evaluator.type });
|
|
4915
|
+
evaluatorResults.push({
|
|
4916
|
+
name: evaluator.name,
|
|
4917
|
+
type: evaluator.type,
|
|
4918
|
+
score: score2.score,
|
|
4919
|
+
verdict: score2.verdict,
|
|
4920
|
+
hits: score2.hits,
|
|
4921
|
+
misses: score2.misses,
|
|
4922
|
+
reasoning: score2.reasoning
|
|
4923
|
+
});
|
|
4924
|
+
}
|
|
4925
|
+
if (evaluator.type === "expected_messages") {
|
|
4926
|
+
const expectedMessagesEvaluator = new ExpectedMessagesEvaluator();
|
|
4927
|
+
const score2 = expectedMessagesEvaluator.evaluate({
|
|
4928
|
+
evalCase,
|
|
4929
|
+
candidate,
|
|
4930
|
+
target,
|
|
4931
|
+
provider,
|
|
4932
|
+
attempt,
|
|
4933
|
+
promptInputs,
|
|
4934
|
+
now,
|
|
4935
|
+
candidateTrace,
|
|
4936
|
+
candidateTraceSummary
|
|
4937
|
+
});
|
|
4938
|
+
scored.push({ score: score2, name: evaluator.name, type: evaluator.type });
|
|
4939
|
+
evaluatorResults.push({
|
|
4940
|
+
name: evaluator.name,
|
|
4941
|
+
type: evaluator.type,
|
|
4942
|
+
score: score2.score,
|
|
4943
|
+
verdict: score2.verdict,
|
|
4944
|
+
hits: score2.hits,
|
|
4945
|
+
misses: score2.misses,
|
|
4946
|
+
reasoning: score2.reasoning
|
|
4043
4947
|
});
|
|
4044
4948
|
}
|
|
4045
4949
|
} catch (error) {
|
|
@@ -4052,14 +4956,15 @@ async function runEvaluatorList(options) {
|
|
|
4052
4956
|
expectedAspectCount: 1,
|
|
4053
4957
|
reasoning: message
|
|
4054
4958
|
};
|
|
4959
|
+
const resultType = evaluator.type === "code" ? "code_judge" : evaluator.type;
|
|
4055
4960
|
scored.push({
|
|
4056
4961
|
score: fallbackScore,
|
|
4057
4962
|
name: evaluator.name ?? "unknown",
|
|
4058
|
-
type:
|
|
4963
|
+
type: resultType ?? "llm_judge"
|
|
4059
4964
|
});
|
|
4060
4965
|
evaluatorResults.push({
|
|
4061
4966
|
name: evaluator.name ?? "unknown",
|
|
4062
|
-
type:
|
|
4967
|
+
type: resultType ?? "llm_judge",
|
|
4063
4968
|
score: 0,
|
|
4064
4969
|
verdict: "fail",
|
|
4065
4970
|
hits: [],
|
|
@@ -4277,6 +5182,23 @@ function isTimeoutLike(error) {
|
|
|
4277
5182
|
const value = String(error).toLowerCase();
|
|
4278
5183
|
return value.includes("timeout");
|
|
4279
5184
|
}
|
|
5185
|
+
/**
 * Converts a composite evaluator's nested child results into the
 * snake_case shape used in the emitted evaluator results.
 *
 * @param children Child results, each carrying `evaluatorRawRequest` and
 *   optionally its own nested `evaluatorResults`.
 * @returns The converted list, or `undefined` when `children` is missing
 *   or empty. Nested children are converted recursively.
 */
function mapChildResults(children) {
  const nothingToMap = !children || children.length === 0;
  if (nothingToMap) {
    return void 0;
  }
  // Renames the two camelCase fields and recurses into nested results.
  const toOutputShape = (child) => {
    const { name, type, score, weight, verdict, hits, misses, reasoning } = child;
    return {
      name,
      type,
      score,
      weight,
      verdict,
      hits,
      misses,
      reasoning,
      evaluator_provider_request: child.evaluatorRawRequest,
      evaluator_results: mapChildResults(child.evaluatorResults)
    };
  };
  return children.map((child) => toOutputShape(child));
}
|
|
4280
5202
|
|
|
4281
5203
|
// src/evaluation/generators/rubric-generator.ts
|
|
4282
5204
|
import { generateText as generateText3 } from "ai";
|
|
@@ -4364,11 +5286,15 @@ function createAgentKernel() {
|
|
|
4364
5286
|
}
|
|
4365
5287
|
export {
|
|
4366
5288
|
CodeEvaluator,
|
|
5289
|
+
CompositeEvaluator,
|
|
5290
|
+
ExpectedMessagesEvaluator,
|
|
4367
5291
|
LlmJudgeEvaluator,
|
|
4368
5292
|
TEST_MESSAGE_ROLES,
|
|
5293
|
+
ToolTrajectoryEvaluator,
|
|
4369
5294
|
buildDirectoryChain,
|
|
4370
5295
|
buildPromptInputs,
|
|
4371
5296
|
buildSearchRoots,
|
|
5297
|
+
computeTraceSummary,
|
|
4372
5298
|
consumeCodexLogEntries,
|
|
4373
5299
|
createAgentKernel,
|
|
4374
5300
|
createProvider,
|
|
@@ -4379,14 +5305,18 @@ export {
|
|
|
4379
5305
|
generateRubrics,
|
|
4380
5306
|
getHitCount,
|
|
4381
5307
|
isEvaluatorKind,
|
|
5308
|
+
isExpectedToolCall,
|
|
4382
5309
|
isGuidelineFile,
|
|
4383
5310
|
isJsonObject,
|
|
4384
5311
|
isJsonValue,
|
|
4385
5312
|
isTestMessage,
|
|
4386
5313
|
isTestMessageRole,
|
|
5314
|
+
isTraceEvent,
|
|
5315
|
+
isTraceEventType,
|
|
4387
5316
|
listTargetNames,
|
|
4388
5317
|
loadEvalCases,
|
|
4389
5318
|
normalizeLineEndings,
|
|
5319
|
+
readJsonFile,
|
|
4390
5320
|
readTargetDefinitions,
|
|
4391
5321
|
readTestSuiteMetadata,
|
|
4392
5322
|
readTextFile,
|