@agentv/core 2.0.2 → 2.1.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/dist/index.cjs +1457 -1121
- package/dist/index.cjs.map +1 -1
- package/dist/index.d.cts +142 -71
- package/dist/index.d.ts +142 -71
- package/dist/index.js +1295 -968
- package/dist/index.js.map +1 -1
- package/package.json +2 -2
package/dist/index.cjs
CHANGED
|
@@ -42,31 +42,39 @@ __export(index_exports, {
|
|
|
42
42
|
ToolTrajectoryEvaluator: () => ToolTrajectoryEvaluator,
|
|
43
43
|
avgToolDurationMs: () => avgToolDurationMs,
|
|
44
44
|
buildDirectoryChain: () => buildDirectoryChain2,
|
|
45
|
+
buildOutputSchema: () => buildOutputSchema,
|
|
45
46
|
buildPromptInputs: () => buildPromptInputs,
|
|
46
47
|
buildSearchRoots: () => buildSearchRoots2,
|
|
48
|
+
clampScore: () => clampScore,
|
|
47
49
|
computeTraceSummary: () => computeTraceSummary,
|
|
48
50
|
consumeClaudeCodeLogEntries: () => consumeClaudeCodeLogEntries,
|
|
49
51
|
consumeCodexLogEntries: () => consumeCodexLogEntries,
|
|
50
52
|
consumePiLogEntries: () => consumePiLogEntries,
|
|
51
53
|
createAgentKernel: () => createAgentKernel,
|
|
52
54
|
createProvider: () => createProvider,
|
|
55
|
+
deepEqual: () => deepEqual,
|
|
53
56
|
ensureVSCodeSubagents: () => ensureVSCodeSubagents,
|
|
57
|
+
executeScript: () => executeScript,
|
|
54
58
|
explorationRatio: () => explorationRatio,
|
|
55
|
-
|
|
59
|
+
extractJsonBlob: () => extractJsonBlob,
|
|
56
60
|
fileExists: () => fileExists2,
|
|
57
61
|
findGitRoot: () => findGitRoot,
|
|
62
|
+
freeformEvaluationSchema: () => freeformEvaluationSchema,
|
|
58
63
|
generateRubrics: () => generateRubrics,
|
|
59
64
|
getHitCount: () => getHitCount,
|
|
60
65
|
isEvaluatorKind: () => isEvaluatorKind,
|
|
61
66
|
isGuidelineFile: () => isGuidelineFile,
|
|
62
67
|
isJsonObject: () => isJsonObject,
|
|
63
68
|
isJsonValue: () => isJsonValue,
|
|
69
|
+
isNonEmptyString: () => isNonEmptyString,
|
|
64
70
|
isTestMessage: () => isTestMessage,
|
|
65
71
|
isTestMessageRole: () => isTestMessageRole,
|
|
66
72
|
listTargetNames: () => listTargetNames,
|
|
67
73
|
loadEvalCases: () => loadEvalCases,
|
|
68
74
|
mergeExecutionMetrics: () => mergeExecutionMetrics,
|
|
69
75
|
normalizeLineEndings: () => normalizeLineEndings,
|
|
76
|
+
parseJsonFromText: () => parseJsonFromText,
|
|
77
|
+
parseJsonSafe: () => parseJsonSafe,
|
|
70
78
|
readJsonFile: () => readJsonFile,
|
|
71
79
|
readTargetDefinitions: () => readTargetDefinitions,
|
|
72
80
|
readTestSuiteMetadata: () => readTestSuiteMetadata,
|
|
@@ -76,6 +84,7 @@ __export(index_exports, {
|
|
|
76
84
|
resolveTargetDefinition: () => resolveTargetDefinition,
|
|
77
85
|
runEvalCase: () => runEvalCase,
|
|
78
86
|
runEvaluation: () => runEvaluation,
|
|
87
|
+
scoreToVerdict: () => scoreToVerdict,
|
|
79
88
|
subscribeToClaudeCodeLogEntries: () => subscribeToClaudeCodeLogEntries,
|
|
80
89
|
subscribeToCodexLogEntries: () => subscribeToCodexLogEntries,
|
|
81
90
|
subscribeToPiLogEntries: () => subscribeToPiLogEntries,
|
|
@@ -221,85 +230,6 @@ var import_promises6 = require("fs/promises");
|
|
|
221
230
|
var import_node_path6 = __toESM(require("path"), 1);
|
|
222
231
|
var import_yaml2 = require("yaml");
|
|
223
232
|
|
|
224
|
-
// src/evaluation/formatting/segment-formatter.ts
|
|
225
|
-
function extractCodeBlocks(segments) {
|
|
226
|
-
const CODE_BLOCK_PATTERN = /```[\s\S]*?```/g;
|
|
227
|
-
const codeBlocks = [];
|
|
228
|
-
for (const segment of segments) {
|
|
229
|
-
const typeValue = segment.type;
|
|
230
|
-
if (typeof typeValue !== "string" || typeValue !== "text") {
|
|
231
|
-
continue;
|
|
232
|
-
}
|
|
233
|
-
const textValue = segment.value;
|
|
234
|
-
if (typeof textValue !== "string") {
|
|
235
|
-
continue;
|
|
236
|
-
}
|
|
237
|
-
const matches = textValue.match(CODE_BLOCK_PATTERN);
|
|
238
|
-
if (matches) {
|
|
239
|
-
codeBlocks.push(...matches);
|
|
240
|
-
}
|
|
241
|
-
}
|
|
242
|
-
return codeBlocks;
|
|
243
|
-
}
|
|
244
|
-
function formatFileContents(parts) {
|
|
245
|
-
const fileCount = parts.filter((p) => p.isFile).length;
|
|
246
|
-
if (fileCount > 0) {
|
|
247
|
-
return parts.map((part) => {
|
|
248
|
-
if (part.isFile && part.displayPath) {
|
|
249
|
-
return `<file path="${part.displayPath}">
|
|
250
|
-
${part.content}
|
|
251
|
-
</file>`;
|
|
252
|
-
}
|
|
253
|
-
return part.content;
|
|
254
|
-
}).join("\n\n");
|
|
255
|
-
}
|
|
256
|
-
return parts.map((p) => p.content).join(" ");
|
|
257
|
-
}
|
|
258
|
-
function formatSegment(segment, mode = "lm") {
|
|
259
|
-
const type = asString(segment.type);
|
|
260
|
-
if (type === "text") {
|
|
261
|
-
return asString(segment.value);
|
|
262
|
-
}
|
|
263
|
-
if (type === "guideline_ref") {
|
|
264
|
-
const refPath = asString(segment.path);
|
|
265
|
-
return refPath ? `<Attached: ${refPath}>` : void 0;
|
|
266
|
-
}
|
|
267
|
-
if (type === "file") {
|
|
268
|
-
const filePath = asString(segment.path);
|
|
269
|
-
if (!filePath) {
|
|
270
|
-
return void 0;
|
|
271
|
-
}
|
|
272
|
-
if (mode === "agent") {
|
|
273
|
-
return `<file: path="${filePath}">`;
|
|
274
|
-
}
|
|
275
|
-
const text = asString(segment.text);
|
|
276
|
-
if (text && filePath) {
|
|
277
|
-
return formatFileContents([{ content: text.trim(), isFile: true, displayPath: filePath }]);
|
|
278
|
-
}
|
|
279
|
-
}
|
|
280
|
-
return void 0;
|
|
281
|
-
}
|
|
282
|
-
function hasVisibleContent(segments) {
|
|
283
|
-
return segments.some((segment) => {
|
|
284
|
-
const type = asString(segment.type);
|
|
285
|
-
if (type === "text") {
|
|
286
|
-
const value = asString(segment.value);
|
|
287
|
-
return value !== void 0 && value.trim().length > 0;
|
|
288
|
-
}
|
|
289
|
-
if (type === "guideline_ref") {
|
|
290
|
-
return false;
|
|
291
|
-
}
|
|
292
|
-
if (type === "file") {
|
|
293
|
-
const text = asString(segment.text);
|
|
294
|
-
return text !== void 0 && text.trim().length > 0;
|
|
295
|
-
}
|
|
296
|
-
return false;
|
|
297
|
-
});
|
|
298
|
-
}
|
|
299
|
-
function asString(value) {
|
|
300
|
-
return typeof value === "string" ? value : void 0;
|
|
301
|
-
}
|
|
302
|
-
|
|
303
233
|
// src/evaluation/loaders/config-loader.ts
|
|
304
234
|
var import_promises2 = require("fs/promises");
|
|
305
235
|
var import_node_path2 = __toESM(require("path"), 1);
|
|
@@ -554,7 +484,7 @@ async function parseEvaluators(rawEvalCase, globalExecution, searchRoots, evalId
|
|
|
554
484
|
logWarning2(`Skipping invalid evaluator entry for '${evalId}' (expected object)`);
|
|
555
485
|
continue;
|
|
556
486
|
}
|
|
557
|
-
const name =
|
|
487
|
+
const name = asString(rawEvaluator.name);
|
|
558
488
|
const typeValue = rawEvaluator.type;
|
|
559
489
|
if (!name || !isEvaluatorKind(typeValue)) {
|
|
560
490
|
logWarning2(`Skipping evaluator with invalid name/type in '${evalId}'`);
|
|
@@ -582,7 +512,7 @@ async function parseEvaluators(rawEvalCase, globalExecution, searchRoots, evalId
|
|
|
582
512
|
continue;
|
|
583
513
|
}
|
|
584
514
|
const weight2 = validateWeight(rawEvaluator.weight, name, evalId);
|
|
585
|
-
const cwd =
|
|
515
|
+
const cwd = asString(rawEvaluator.cwd);
|
|
586
516
|
let resolvedCwd;
|
|
587
517
|
if (cwd) {
|
|
588
518
|
const resolved = await resolveFileReference(cwd, searchRoots);
|
|
@@ -597,7 +527,29 @@ async function parseEvaluators(rawEvalCase, globalExecution, searchRoots, evalId
|
|
|
597
527
|
} else {
|
|
598
528
|
resolvedCwd = searchRoots[0];
|
|
599
529
|
}
|
|
600
|
-
const
|
|
530
|
+
const rawTarget = rawEvaluator.target;
|
|
531
|
+
let targetConfig;
|
|
532
|
+
if (rawTarget !== void 0) {
|
|
533
|
+
if (isJsonObject2(rawTarget)) {
|
|
534
|
+
const maxCalls = rawTarget.max_calls;
|
|
535
|
+
if (maxCalls !== void 0 && (typeof maxCalls !== "number" || maxCalls < 0)) {
|
|
536
|
+
logWarning2(
|
|
537
|
+
`Invalid target.max_calls for evaluator '${name}' in '${evalId}': must be a non-negative number`
|
|
538
|
+
);
|
|
539
|
+
} else {
|
|
540
|
+
targetConfig = {
|
|
541
|
+
...typeof maxCalls === "number" ? { max_calls: maxCalls } : {}
|
|
542
|
+
};
|
|
543
|
+
}
|
|
544
|
+
} else if (rawTarget === true) {
|
|
545
|
+
targetConfig = {};
|
|
546
|
+
} else {
|
|
547
|
+
logWarning2(
|
|
548
|
+
`Invalid target config for evaluator '${name}' in '${evalId}': expected object or true`
|
|
549
|
+
);
|
|
550
|
+
}
|
|
551
|
+
}
|
|
552
|
+
const knownProps = /* @__PURE__ */ new Set(["name", "type", "script", "cwd", "weight", "target"]);
|
|
601
553
|
const config = {};
|
|
602
554
|
for (const [key, value] of Object.entries(rawEvaluator)) {
|
|
603
555
|
if (!knownProps.has(key) && value !== void 0) {
|
|
@@ -611,7 +563,8 @@ async function parseEvaluators(rawEvalCase, globalExecution, searchRoots, evalId
|
|
|
611
563
|
cwd,
|
|
612
564
|
resolvedCwd,
|
|
613
565
|
...weight2 !== void 0 ? { weight: weight2 } : {},
|
|
614
|
-
...Object.keys(config).length > 0 ? { config } : {}
|
|
566
|
+
...Object.keys(config).length > 0 ? { config } : {},
|
|
567
|
+
...targetConfig !== void 0 ? { target: targetConfig } : {}
|
|
615
568
|
});
|
|
616
569
|
continue;
|
|
617
570
|
}
|
|
@@ -628,7 +581,7 @@ async function parseEvaluators(rawEvalCase, globalExecution, searchRoots, evalId
|
|
|
628
581
|
logWarning2(`Skipping composite evaluator '${name}' in '${evalId}': missing aggregator`);
|
|
629
582
|
continue;
|
|
630
583
|
}
|
|
631
|
-
const aggregatorType =
|
|
584
|
+
const aggregatorType = asString(rawAggregator.type);
|
|
632
585
|
if (aggregatorType !== "weighted_average" && aggregatorType !== "code_judge" && aggregatorType !== "llm_judge") {
|
|
633
586
|
logWarning2(
|
|
634
587
|
`Skipping composite evaluator '${name}' in '${evalId}': invalid aggregator type '${aggregatorType}'`
|
|
@@ -641,7 +594,7 @@ async function parseEvaluators(rawEvalCase, globalExecution, searchRoots, evalId
|
|
|
641
594
|
logWarning2(`Skipping invalid member evaluator in composite '${name}' (expected object)`);
|
|
642
595
|
continue;
|
|
643
596
|
}
|
|
644
|
-
const memberName =
|
|
597
|
+
const memberName = asString(rawMember.name);
|
|
645
598
|
const memberType = rawMember.type;
|
|
646
599
|
if (!memberName || !isEvaluatorKind(memberType)) {
|
|
647
600
|
logWarning2(`Skipping member evaluator with invalid name/type in composite '${name}'`);
|
|
@@ -679,7 +632,7 @@ async function parseEvaluators(rawEvalCase, globalExecution, searchRoots, evalId
|
|
|
679
632
|
...Object.keys(parsedWeights).length > 0 ? { weights: parsedWeights } : {}
|
|
680
633
|
};
|
|
681
634
|
} else if (aggregatorType === "code_judge") {
|
|
682
|
-
const aggregatorPath =
|
|
635
|
+
const aggregatorPath = asString(rawAggregator.path);
|
|
683
636
|
if (!aggregatorPath) {
|
|
684
637
|
logWarning2(
|
|
685
638
|
`Skipping composite evaluator '${name}' in '${evalId}': code_judge aggregator missing path`
|
|
@@ -692,7 +645,7 @@ async function parseEvaluators(rawEvalCase, globalExecution, searchRoots, evalId
|
|
|
692
645
|
cwd: searchRoots[0]
|
|
693
646
|
};
|
|
694
647
|
} else {
|
|
695
|
-
const aggregatorPrompt =
|
|
648
|
+
const aggregatorPrompt = asString(rawAggregator.prompt);
|
|
696
649
|
let promptPath2;
|
|
697
650
|
if (aggregatorPrompt) {
|
|
698
651
|
const resolved = await resolveFileReference(aggregatorPrompt, searchRoots);
|
|
@@ -717,7 +670,7 @@ async function parseEvaluators(rawEvalCase, globalExecution, searchRoots, evalId
|
|
|
717
670
|
continue;
|
|
718
671
|
}
|
|
719
672
|
if (typeValue === "tool_trajectory") {
|
|
720
|
-
const mode =
|
|
673
|
+
const mode = asString(rawEvaluator.mode);
|
|
721
674
|
if (mode !== "any_order" && mode !== "in_order" && mode !== "exact") {
|
|
722
675
|
logWarning2(
|
|
723
676
|
`Skipping tool_trajectory evaluator '${name}' in '${evalId}': invalid mode '${mode}' (must be any_order, in_order, or exact)`
|
|
@@ -808,8 +761,8 @@ async function parseEvaluators(rawEvalCase, globalExecution, searchRoots, evalId
|
|
|
808
761
|
);
|
|
809
762
|
continue;
|
|
810
763
|
}
|
|
811
|
-
const fieldPath =
|
|
812
|
-
const match =
|
|
764
|
+
const fieldPath = asString(rawField.path);
|
|
765
|
+
const match = asString(rawField.match);
|
|
813
766
|
if (!fieldPath) {
|
|
814
767
|
logWarning2(
|
|
815
768
|
`Skipping field without path in field_accuracy evaluator '${name}' in '${evalId}'`
|
|
@@ -839,7 +792,7 @@ async function parseEvaluators(rawEvalCase, globalExecution, searchRoots, evalId
|
|
|
839
792
|
);
|
|
840
793
|
continue;
|
|
841
794
|
}
|
|
842
|
-
const aggregation =
|
|
795
|
+
const aggregation = asString(rawEvaluator.aggregation);
|
|
843
796
|
const validAggregation = isValidFieldAggregationType(aggregation) ? aggregation : void 0;
|
|
844
797
|
const weight2 = validateWeight(rawEvaluator.weight, name, evalId);
|
|
845
798
|
evaluators.push({
|
|
@@ -920,7 +873,7 @@ async function parseEvaluators(rawEvalCase, globalExecution, searchRoots, evalId
|
|
|
920
873
|
});
|
|
921
874
|
continue;
|
|
922
875
|
}
|
|
923
|
-
const prompt =
|
|
876
|
+
const prompt = asString(rawEvaluator.prompt);
|
|
924
877
|
let promptPath;
|
|
925
878
|
if (prompt) {
|
|
926
879
|
const resolved = await resolveFileReference(prompt, searchRoots);
|
|
@@ -939,11 +892,11 @@ async function parseEvaluators(rawEvalCase, globalExecution, searchRoots, evalId
|
|
|
939
892
|
);
|
|
940
893
|
}
|
|
941
894
|
}
|
|
942
|
-
const _model =
|
|
895
|
+
const _model = asString(rawEvaluator.model);
|
|
943
896
|
const rawRubrics = rawEvaluator.rubrics;
|
|
944
897
|
const parsedRubrics = Array.isArray(rawRubrics) ? rawRubrics.filter((r) => isJsonObject2(r)).map((rubric, index) => ({
|
|
945
|
-
id:
|
|
946
|
-
description:
|
|
898
|
+
id: asString(rubric.id) ?? `rubric-${index + 1}`,
|
|
899
|
+
description: asString(rubric.description) ?? "",
|
|
947
900
|
weight: typeof rubric.weight === "number" ? rubric.weight : 1,
|
|
948
901
|
required: typeof rubric.required === "boolean" ? rubric.required : true
|
|
949
902
|
})).filter((r) => r.description.length > 0) : void 0;
|
|
@@ -987,7 +940,7 @@ function coerceEvaluator(candidate, contextId) {
|
|
|
987
940
|
logWarning2(`Unknown evaluator '${candidate}' in ${contextId}, falling back to default`);
|
|
988
941
|
return void 0;
|
|
989
942
|
}
|
|
990
|
-
function
|
|
943
|
+
/**
 * Narrows an arbitrary value to a string.
 *
 * @param {unknown} value - Any value.
 * @returns {string | undefined} The value itself when it is a string primitive, otherwise `undefined`.
 */
function asString(value) {
  if (typeof value === "string") {
    return value;
  }
  return void 0;
}
|
|
993
946
|
function asStringArray(value, description) {
|
|
@@ -1063,6 +1016,68 @@ function isValidFieldAggregationType(value) {
|
|
|
1063
1016
|
// src/evaluation/loaders/message-processor.ts
|
|
1064
1017
|
var import_promises4 = require("fs/promises");
|
|
1065
1018
|
var import_node_path4 = __toESM(require("path"), 1);
|
|
1019
|
+
|
|
1020
|
+
// src/evaluation/formatting/segment-formatter.ts
|
|
1021
|
+
/**
 * Joins content parts into a single string.
 *
 * When at least one part is flagged `isFile`, every part is rendered on its own
 * paragraph (joined by a blank line) and file parts that also carry a
 * `displayPath` are wrapped in a `<file path="...">` element. When no part is a
 * file, the raw contents are joined with a single space.
 *
 * @param {Array<{content: string, isFile?: boolean, displayPath?: string}>} parts
 * @returns {string} The combined text.
 */
function formatFileContents(parts) {
  const containsFile = parts.some((entry) => entry.isFile);
  if (!containsFile) {
    return parts.map((entry) => entry.content).join(" ");
  }
  const renderEntry = (entry) => {
    const shouldWrap = entry.isFile && entry.displayPath;
    if (!shouldWrap) {
      return entry.content;
    }
    return `<file path="${entry.displayPath}">
${entry.content}
</file>`;
  };
  return parts.map(renderEntry).join("\n\n");
}
|
|
1035
|
+
/**
 * Renders one message segment as text.
 *
 * - `text` segments yield their string `value`.
 * - `guideline_ref` segments yield an `<Attached: path>` marker when a path is present.
 * - `file` segments yield a bare `<file: path="...">` reference in `"agent"` mode,
 *   otherwise the trimmed file text wrapped by `formatFileContents`.
 *
 * @param {object} segment - Segment with a `type` and type-specific fields.
 * @param {string} [mode="lm"] - `"agent"` emits file references instead of file contents.
 * @returns {string | undefined} Rendered text, or `undefined` when the segment
 *   has an unknown type or lacks the fields needed to render it.
 */
function formatSegment(segment, mode = "lm") {
  switch (asString2(segment.type)) {
    case "text":
      return asString2(segment.value);
    case "guideline_ref": {
      const refPath = asString2(segment.path);
      if (refPath) {
        return `<Attached: ${refPath}>`;
      }
      return void 0;
    }
    case "file": {
      const filePath = asString2(segment.path);
      if (!filePath) {
        return void 0;
      }
      if (mode === "agent") {
        return `<file: path="${filePath}">`;
      }
      const fileText = asString2(segment.text);
      if (!fileText) {
        return void 0;
      }
      return formatFileContents([
        { content: fileText.trim(), isFile: true, displayPath: filePath }
      ]);
    }
    default:
      return void 0;
  }
}
|
|
1059
|
+
/**
 * Reports whether any segment carries non-blank text.
 *
 * `text` segments are judged by their `value`, `file` segments by their `text`;
 * every other segment type (including `guideline_ref`) never counts as visible.
 *
 * @param {object[]} segments - Message segments to inspect.
 * @returns {boolean} `true` as soon as one segment has non-whitespace content.
 */
function hasVisibleContent(segments) {
  return segments.some((segment) => {
    let candidate;
    switch (asString2(segment.type)) {
      case "text":
        candidate = asString2(segment.value);
        break;
      case "file":
        candidate = asString2(segment.text);
        break;
      default:
        return false;
    }
    return candidate !== void 0 && candidate.trim().length > 0;
  });
}
|
|
1076
|
+
/**
 * Narrows an arbitrary value to a string.
 *
 * @param {unknown} value - Any value.
 * @returns {string | undefined} The value itself when it is a string primitive, otherwise `undefined`.
 */
function asString2(value) {
  if (typeof value === "string") {
    return value;
  }
  return void 0;
}
|
|
1079
|
+
|
|
1080
|
+
// src/evaluation/loaders/message-processor.ts
|
|
1066
1081
|
var ANSI_YELLOW4 = "\x1B[33m";
|
|
1067
1082
|
var ANSI_RESET4 = "\x1B[0m";
|
|
1068
1083
|
async function processMessages(options) {
|
|
@@ -1368,9 +1383,6 @@ ${messageContent}`);
|
|
|
1368
1383
|
questionParts.push(formattedContent);
|
|
1369
1384
|
}
|
|
1370
1385
|
}
|
|
1371
|
-
if (testCase.code_snippets.length > 0) {
|
|
1372
|
-
questionParts.push(testCase.code_snippets.join("\n"));
|
|
1373
|
-
}
|
|
1374
1386
|
question = questionParts.map((part) => part.trim()).filter((part) => part.length > 0).join("\n\n");
|
|
1375
1387
|
}
|
|
1376
1388
|
const chatPrompt = useRoleMarkers ? buildChatPromptFromSegments({
|
|
@@ -1569,7 +1581,6 @@ async function loadEvalCases(evalFilePath, repoRoot, options) {
|
|
|
1569
1581
|
repoRootPath,
|
|
1570
1582
|
verbose
|
|
1571
1583
|
}) : [];
|
|
1572
|
-
const codeSnippets = extractCodeBlocks(inputSegments);
|
|
1573
1584
|
let referenceAnswer = "";
|
|
1574
1585
|
if (outputSegments.length > 0) {
|
|
1575
1586
|
const lastMessage = outputSegments[outputSegments.length - 1];
|
|
@@ -1642,7 +1653,6 @@ async function loadEvalCases(evalFilePath, repoRoot, options) {
|
|
|
1642
1653
|
guideline_paths: guidelinePaths.map((guidelinePath) => import_node_path6.default.resolve(guidelinePath)),
|
|
1643
1654
|
guideline_patterns: guidelinePatterns,
|
|
1644
1655
|
file_paths: allFilePaths,
|
|
1645
|
-
code_snippets: codeSnippets,
|
|
1646
1656
|
expected_outcome: outcome,
|
|
1647
1657
|
evaluator: evalCaseEvaluatorKind,
|
|
1648
1658
|
evaluators
|
|
@@ -6327,9 +6337,64 @@ function resolveAndCreateProvider(definition, env = process.env) {
|
|
|
6327
6337
|
return createProvider(resolved);
|
|
6328
6338
|
}
|
|
6329
6339
|
|
|
6330
|
-
// src/evaluation/evaluators.ts
|
|
6331
|
-
|
|
6332
|
-
|
|
6340
|
+
// src/evaluation/evaluators/scoring.ts
|
|
6341
|
+
/**
 * Maps a numeric score to a verdict label.
 *
 * @param {number} score - Score to classify.
 * @returns {"pass" | "borderline" | "fail"} `"pass"` at 0.8 and above,
 *   `"borderline"` from 0.6 up to (not including) 0.8, `"fail"` otherwise
 *   (which includes `NaN`, since both comparisons are false for it).
 */
function scoreToVerdict(score) {
  if (score >= 0.8) {
    return "pass";
  }
  return score >= 0.6 ? "borderline" : "fail";
}
|
|
6350
|
+
/**
 * Clamps a score into the closed interval [0, 1].
 *
 * @param {number} value - Raw score.
 * @returns {number} 0 for anything that is not a finite number (NaN, ±Infinity,
 *   non-number inputs) or is negative; 1 for values above 1; the value itself otherwise.
 */
function clampScore(value) {
  // Number.isFinite is false for NaN as well, so one test covers both cases.
  if (!Number.isFinite(value) || value < 0) {
    return 0;
  }
  return value > 1 ? 1 : value;
}
|
|
6362
|
+
/**
 * Pulls the widest brace-delimited span out of a string.
 *
 * The pattern is greedy: the result runs from the first `{` to the last `}`
 * in the text. No JSON validation is performed here.
 *
 * @param {string} text - Text that may contain a JSON object.
 * @returns {string | undefined} The matched span, or `undefined` when there is none.
 */
function extractJsonBlob(text) {
  const found = text.match(/\{[\s\S]*\}/);
  if (found === null) {
    return void 0;
  }
  return found[0];
}
|
|
6366
|
+
/**
 * Parses JSON embedded in free-form text.
 *
 * Markdown code-fence markers are stripped, the widest `{...}` span is
 * extracted via `extractJsonBlob`, and the result is parsed. When no braces
 * are present the whole cleaned text is parsed instead. Non-string input is
 * treated as an empty string.
 *
 * @param {unknown} text - Text expected to contain JSON.
 * @returns {unknown} The parsed value.
 * @throws {SyntaxError} When the selected text is not valid JSON.
 */
function parseJsonFromText(text) {
  let cleaned = "";
  if (typeof text === "string") {
    cleaned = text.replace(/```json\n?|```/g, "").trim();
  }
  return JSON.parse(extractJsonBlob(cleaned) ?? cleaned);
}
|
|
6371
|
+
/**
 * Checks for a string that contains something other than whitespace.
 *
 * @param {unknown} value - Value to test.
 * @returns {boolean} `true` only for string primitives whose trimmed length is positive.
 */
function isNonEmptyString(value) {
  if (typeof value !== "string") {
    return false;
  }
  return value.trim().length > 0;
}
|
|
6374
|
+
/**
 * Parses JSON without throwing.
 *
 * @param {string} payload - JSON text.
 * @returns {unknown} The parsed value, or `undefined` when `JSON.parse` throws.
 */
function parseJsonSafe(payload) {
  let parsed;
  try {
    parsed = JSON.parse(payload);
  } catch {
    // Any parse failure is deliberately reported as "no value".
    parsed = void 0;
  }
  return parsed;
}
|
|
6381
|
+
/**
 * Structural equality for plain values, arrays and objects.
 *
 * Primitives are compared with `===` (so `NaN` never equals `NaN`). Arrays must
 * have the same length and pairwise-equal elements. Objects must have the same
 * number of own enumerable string keys, and every key of `a` must be an own
 * key of `b` with a deeply equal value. An array never equals a non-array.
 *
 * @param {unknown} a - First value.
 * @param {unknown} b - Second value.
 * @returns {boolean} Whether the two values are structurally equal.
 */
function deepEqual(a, b) {
  if (a === b) {
    return true;
  }
  // From here on the two values are known not to be identical.
  if (a === null || b === null) {
    return false;
  }
  if (typeof a !== typeof b || typeof a !== "object") {
    return false;
  }
  const firstIsArray = Array.isArray(a);
  if (firstIsArray !== Array.isArray(b)) {
    return false;
  }
  if (firstIsArray) {
    if (a.length !== b.length) {
      return false;
    }
    return !a.some((item, index) => !deepEqual(item, b[index]));
  }
  const keysOfA = Object.keys(a);
  const keysOfB = Object.keys(b);
  if (keysOfA.length !== keysOfB.length) {
    return false;
  }
  for (const key of keysOfA) {
    if (!Object.hasOwn(b, key) || !deepEqual(a[key], b[key])) {
      return false;
    }
  }
  return true;
}
|
|
6333
6398
|
|
|
6334
6399
|
// src/runtime/exec.ts
|
|
6335
6400
|
function shellEscapePath(value) {
|
|
@@ -6354,7 +6419,9 @@ async function execFileWithStdinBun(argv, stdinPayload, options) {
|
|
|
6354
6419
|
cwd: options.cwd,
|
|
6355
6420
|
stdin: encoder.encode(stdinPayload),
|
|
6356
6421
|
stdout: "pipe",
|
|
6357
|
-
stderr: "pipe"
|
|
6422
|
+
stderr: "pipe",
|
|
6423
|
+
// Merge additional env vars with process.env
|
|
6424
|
+
env: options.env ? { ...process.env, ...options.env } : process.env
|
|
6358
6425
|
});
|
|
6359
6426
|
let timedOut = false;
|
|
6360
6427
|
const timeout = options.timeoutMs !== void 0 ? setTimeout(() => {
|
|
@@ -6389,7 +6456,9 @@ async function execFileWithStdinNode(argv, stdinPayload, options) {
|
|
|
6389
6456
|
const [cmd, ...args] = argv;
|
|
6390
6457
|
const child = spawn4(cmd, args, {
|
|
6391
6458
|
cwd: options.cwd,
|
|
6392
|
-
stdio: ["pipe", "pipe", "pipe"]
|
|
6459
|
+
stdio: ["pipe", "pipe", "pipe"],
|
|
6460
|
+
// Merge additional env vars with process.env
|
|
6461
|
+
env: options.env ? { ...process.env, ...options.env } : process.env
|
|
6393
6462
|
});
|
|
6394
6463
|
const stdoutChunks = [];
|
|
6395
6464
|
const stderrChunks = [];
|
|
@@ -6442,7 +6511,9 @@ async function execShellWithStdin(command, stdinPayload, options = {}) {
|
|
|
6442
6511
|
const child = spawn4(wrappedCommand, {
|
|
6443
6512
|
shell: true,
|
|
6444
6513
|
cwd: options.cwd,
|
|
6445
|
-
stdio: ["ignore", "ignore", "ignore"]
|
|
6514
|
+
stdio: ["ignore", "ignore", "ignore"],
|
|
6515
|
+
// Merge additional env vars with process.env
|
|
6516
|
+
env: options.env ? { ...process.env, ...options.env } : process.env
|
|
6446
6517
|
});
|
|
6447
6518
|
const timeout = options.timeoutMs ? setTimeout(() => {
|
|
6448
6519
|
child.kill();
|
|
@@ -6469,59 +6540,414 @@ async function execShellWithStdin(command, stdinPayload, options = {}) {
|
|
|
6469
6540
|
}
|
|
6470
6541
|
}
|
|
6471
6542
|
|
|
6472
|
-
// src/
|
|
6473
|
-
|
|
6474
|
-
|
|
6475
|
-
|
|
6476
|
-
|
|
6477
|
-
|
|
6478
|
-
|
|
6479
|
-
|
|
6480
|
-
|
|
6481
|
-
|
|
6482
|
-
|
|
6483
|
-
|
|
6484
|
-
|
|
6485
|
-
|
|
6486
|
-
|
|
6487
|
-
|
|
6488
|
-
for (const [key, value] of Object.entries(obj)) {
|
|
6489
|
-
const snakeKey = toSnakeCase(key);
|
|
6490
|
-
result[snakeKey] = toSnakeCaseDeep(value);
|
|
6543
|
+
// src/runtime/target-proxy.ts
|
|
6544
|
+
var import_node_crypto4 = require("crypto");
|
|
6545
|
+
var import_node_http = require("http");
|
|
6546
|
+
var DEFAULT_MAX_CALLS = 50;
|
|
6547
|
+
/**
 * Starts a token-protected HTTP proxy on 127.0.0.1 (ephemeral port) that
 * forwards invocation requests to target providers.
 *
 * Every request except an `OPTIONS` preflight must carry
 * `Authorization: Bearer <token>`. Routes:
 *   - `GET  /info`        reports the default target name, the call limit, the
 *                         calls used so far and the available target names.
 *   - `POST /invoke`      forwards a single `{ question, systemPrompt?, target?,
 *                         evalCaseId?, attempt? }` request to a provider.
 *   - `POST /invokeBatch` forwards each entry of `{ requests: [...] }` in order;
 *                         per-entry failures are reported as `rawText: "Error: ..."`.
 *
 * Call slots are reserved synchronously (check and increment with no `await`
 * in between), so concurrent requests cannot push the total past `maxCalls`.
 *
 * @param {object} options
 * @param {object} options.defaultProvider - Provider used when a request names no
 *   target or names `defaultProvider.targetName`; must expose `targetName` and `invoke()`.
 * @param {(name: string) => object | undefined} [options.targetResolver] - Looks up
 *   providers for other target names.
 * @param {string[]} [options.availableTargets] - Names advertised by `/info` and in
 *   "unknown target" errors; defaults to the default provider's name.
 * @param {number} [options.maxCalls] - Upper bound on provider invocations; falls
 *   back to `DEFAULT_MAX_CALLS` when omitted.
 * @returns {Promise<{url: string, token: string, shutdown: () => Promise<void>,
 *   getUsageMetadata: () => {callCount: number, maxCalls: number}}>}
 */
async function createTargetProxy(options) {
  const { defaultProvider, targetResolver, availableTargets } = options;
  // Without a fallback an omitted limit would make `callCount >= undefined`
  // permanently false and silently disable the cap.
  const maxCalls = options.maxCalls ?? DEFAULT_MAX_CALLS;
  const token = (0, import_node_crypto4.randomBytes)(32).toString("hex");
  const expectedAuth = Buffer.from(`Bearer ${token}`, "utf8");
  let callCount = 0;
  let isShutdown = false;
  const targetsList = availableTargets ?? [defaultProvider.targetName];

  // Constant-time comparison of the Authorization header against the expected value.
  function isAuthorized(header) {
    if (typeof header !== "string") {
      return false;
    }
    const provided = Buffer.from(header, "utf8");
    // timingSafeEqual requires equal-length inputs; a length mismatch is a definite failure.
    return provided.length === expectedAuth.length && (0, import_node_crypto4.timingSafeEqual)(provided, expectedAuth);
  }

  // Atomically (no await between check and increment) claims one call slot.
  function tryReserveCall() {
    if (callCount >= maxCalls) {
      return false;
    }
    callCount++;
    return true;
  }

  function resolveProvider(targetName) {
    if (targetName === void 0 || targetName === defaultProvider.targetName) {
      return defaultProvider;
    }
    if (targetResolver) {
      return targetResolver(targetName);
    }
    return void 0;
  }

  // Builds the provider request from the wire request, applying the proxy defaults.
  function toProviderRequest(request) {
    return {
      question: request.question,
      systemPrompt: request.systemPrompt,
      evalCaseId: request.evalCaseId ?? "proxy",
      attempt: request.attempt ?? 1
    };
  }

  const server = (0, import_node_http.createServer)(async (req, res) => {
    res.setHeader("Access-Control-Allow-Origin", "*");
    res.setHeader("Access-Control-Allow-Methods", "GET, POST, OPTIONS");
    res.setHeader("Access-Control-Allow-Headers", "Content-Type, Authorization");
    if (req.method === "OPTIONS") {
      // Preflight requests are answered before authentication.
      res.writeHead(204);
      res.end();
      return;
    }
    if (!isAuthorized(req.headers.authorization)) {
      sendJson(res, 401, { error: "Unauthorized" });
      return;
    }
    if (isShutdown) {
      sendJson(res, 503, { error: "Proxy is shutting down" });
      return;
    }
    const url2 = req.url ?? "";
    if (req.method === "GET" && url2 === "/info") {
      handleInfo(res);
      return;
    }
    if (req.method === "POST" && url2 === "/invoke") {
      await handleInvoke(req, res);
      return;
    }
    if (req.method === "POST" && url2 === "/invokeBatch") {
      await handleInvokeBatch(req, res);
      return;
    }
    sendJson(res, 404, { error: "Not found" });
  });

  function handleInfo(res) {
    const response = {
      targetName: defaultProvider.targetName,
      maxCalls,
      callCount,
      availableTargets: targetsList
    };
    sendJson(res, 200, response);
  }

  async function handleInvoke(req, res) {
    // Cheap early rejection so an exhausted proxy does not bother reading the body.
    if (callCount >= maxCalls) {
      sendJson(res, 429, { error: `Max calls exceeded (limit: ${maxCalls})` });
      return;
    }
    try {
      const body = await readBody(req);
      const request = JSON.parse(body);
      if (!request.question || typeof request.question !== "string") {
        sendJson(res, 400, { error: "Missing required field: question" });
        return;
      }
      const provider = resolveProvider(request.target);
      if (!provider) {
        sendJson(res, 400, {
          error: `Unknown target '${request.target}'. Available: ${targetsList.join(", ")}`
        });
        return;
      }
      // Other requests may have consumed the remaining slots while the body was
      // being read, so the limit is re-checked at the moment of reservation.
      if (!tryReserveCall()) {
        sendJson(res, 429, { error: `Max calls exceeded (limit: ${maxCalls})` });
        return;
      }
      const response = await provider.invoke(toProviderRequest(request));
      const outputMessages = response.outputMessages ?? [];
      const rawText = extractLastAssistantContent(outputMessages);
      const result = {
        outputMessages,
        rawText
      };
      sendJson(res, 200, result);
    } catch (error) {
      const message = error instanceof Error ? error.message : String(error);
      sendJson(res, 500, { error: message });
    }
  }

  async function handleInvokeBatch(req, res) {
    try {
      const body = await readBody(req);
      const { requests } = JSON.parse(body);
      if (!Array.isArray(requests)) {
        sendJson(res, 400, { error: "Missing required field: requests (array)" });
        return;
      }
      if (callCount + requests.length > maxCalls) {
        sendJson(res, 429, {
          error: `Batch would exceed max calls (current: ${callCount}, batch: ${requests.length}, limit: ${maxCalls})`
        });
        return;
      }
      const responses = [];
      for (const request of requests) {
        if (!request.question || typeof request.question !== "string") {
          responses.push({
            outputMessages: [],
            rawText: "Error: Missing required field: question"
          });
          continue;
        }
        const provider = resolveProvider(request.target);
        if (!provider) {
          responses.push({
            outputMessages: [],
            rawText: `Error: Unknown target '${request.target}'. Available: ${targetsList.join(", ")}`
          });
          continue;
        }
        // The up-front budget check can be invalidated by concurrent requests
        // while earlier entries are awaited, so each entry reserves its own slot.
        if (!tryReserveCall()) {
          responses.push({
            outputMessages: [],
            rawText: `Error: Max calls exceeded (limit: ${maxCalls})`
          });
          continue;
        }
        try {
          const response = await provider.invoke(toProviderRequest(request));
          const outputMessages = response.outputMessages ?? [];
          responses.push({
            outputMessages,
            rawText: extractLastAssistantContent(outputMessages)
          });
        } catch (error) {
          const message = error instanceof Error ? error.message : String(error);
          responses.push({
            outputMessages: [],
            rawText: `Error: ${message}`
          });
        }
      }
      sendJson(res, 200, { responses });
    } catch (error) {
      const message = error instanceof Error ? error.message : String(error);
      sendJson(res, 500, { error: message });
    }
  }

  // Port 0 asks the OS for a free port; listen errors reject the startup promise.
  await new Promise((resolve, reject) => {
    server.once("error", reject);
    server.listen(0, "127.0.0.1", () => {
      server.removeListener("error", reject);
      resolve();
    });
  });
  const address = server.address();
  const url = `http://127.0.0.1:${address.port}`;
  return {
    url,
    token,
    shutdown: async () => {
      // New requests are refused with 503 while the server drains.
      isShutdown = true;
      return new Promise((resolve, reject) => {
        server.close((err) => {
          if (err) reject(err);
          else resolve();
        });
      });
    },
    getUsageMetadata: () => ({
      callCount,
      maxCalls
    })
  };
}
|
|
6727
|
+
/**
 * Writes a JSON response and ends it.
 *
 * @param {object} res - Response object exposing `writeHead` and `end`.
 * @param {number} statusCode - HTTP status code to send.
 * @param {unknown} body - Value serialised with `JSON.stringify` as the response body.
 */
function sendJson(res, statusCode, body) {
  const headers = { "Content-Type": "application/json" };
  res.writeHead(statusCode, headers);
  const serialized = JSON.stringify(body);
  res.end(serialized);
}
|
|
6731
|
+
/**
 * Collects a request body into a UTF-8 string.
 *
 * @param {object} req - Event emitter producing `data` (Buffer chunks), `end` and `error` events.
 * @returns {Promise<string>} Resolves with the concatenated chunks decoded as
 *   UTF-8 on `end`; rejects with the emitted error on `error`.
 */
function readBody(req) {
  const received = [];
  return new Promise((resolve, reject) => {
    req.on("data", (piece) => {
      received.push(piece);
    });
    req.on("end", () => {
      resolve(Buffer.concat(received).toString("utf8"));
    });
    req.on("error", reject);
  });
}
|
|
6739
|
+
/**
 * Finds the text of the most recent assistant message.
 *
 * Messages are scanned from last to first. For an assistant message with
 * string content, that string is returned. For array content, the `text`
 * property of the first object part that has one is returned (converted with
 * `String`). Assistant messages that yield no text are skipped and the scan
 * continues with earlier messages.
 *
 * @param {Array<{role: string, content?: unknown}>} messages - Conversation messages.
 * @returns {string | undefined} The extracted text, or `undefined` when none is found.
 */
function extractLastAssistantContent(messages) {
  for (let index = messages.length - 1; index >= 0; index -= 1) {
    const message = messages[index];
    if (message.role !== "assistant" || message.content === void 0) {
      continue;
    }
    const { content } = message;
    if (typeof content === "string") {
      return content;
    }
    if (!Array.isArray(content)) {
      continue;
    }
    const textPart = content.find(
      (part) => typeof part === "object" && part !== null && "text" in part
    );
    if (textPart !== void 0) {
      return String(textPart.text);
    }
  }
  return void 0;
}
|
|
6757
|
+
|
|
6758
|
+
// src/evaluation/case-conversion.ts
|
|
6759
|
+
/**
 * Convert a camelCase identifier to snake_case.
 *
 * Strings that begin with an ASCII uppercase letter are returned untouched;
 * otherwise every ASCII uppercase letter becomes `_` plus its lowercase form.
 *
 * @param {string} str - Key to convert.
 * @returns {string} The converted (or unchanged) key.
 */
function toSnakeCase(str) {
  const startsUppercase = /^[A-Z]/.test(str);
  return startsUppercase
    ? str
    : str.replace(/[A-Z]/g, (upper) => `_${upper.toLowerCase()}`);
}
|
|
6765
|
+
/**
 * Recursively rewrite object keys with `toSnakeCase`.
 *
 * `null`, `undefined` and non-object values are returned unchanged; arrays are
 * mapped element by element; any other object is rebuilt as a fresh object
 * from its own enumerable entries with converted keys.
 *
 * @param {*} obj - Value to convert.
 * @returns {*} Converted copy (or the original primitive).
 */
function toSnakeCaseDeep(obj) {
  if (obj == null) {
    return obj;
  }
  if (Array.isArray(obj)) {
    return obj.map((element) => toSnakeCaseDeep(element));
  }
  if (typeof obj !== "object") {
    return obj;
  }
  const converted = {};
  Object.entries(obj).forEach(([key, value]) => {
    converted[toSnakeCase(key)] = toSnakeCaseDeep(value);
  });
  return converted;
}
|
|
6782
|
+
|
|
6783
|
+
// src/evaluation/evaluators/code-evaluator.ts
|
|
6784
|
+
/**
 * Evaluator that delegates scoring to an external script.
 *
 * The evaluation payload is converted with `toSnakeCaseDeep`, serialised as
 * JSON and handed to `executeScript`. The script's output is parsed with
 * `parseJsonSafe` and read for `score`, `hits`, `misses`, `reasoning` and
 * `details`. When a `target` is configured and the context has a judge
 * provider, a target proxy is created for the run and its URL and token are
 * passed to the script through environment variables.
 */
var CodeEvaluator = class {
  kind = "code";
  script;
  cwd;
  agentTimeoutMs;
  config;
  target;

  constructor(options) {
    const { script, cwd, agentTimeoutMs, config, target } = options;
    this.script = script;
    this.cwd = cwd;
    this.agentTimeoutMs = agentTimeoutMs;
    this.config = config;
    this.target = target;
  }

  /**
   * Run the script against one evaluation context.
   *
   * Script failures never reject: they are reported as a failing result with
   * score 0. The proxy, if one was started, is shut down in all cases.
   */
  async evaluate(context) {
    const { evalCase } = context;
    const payload = {
      question: evalCase.question,
      expectedOutcome: evalCase.expected_outcome,
      expectedMessages: evalCase.expected_messages,
      referenceAnswer: evalCase.reference_answer,
      candidateAnswer: context.candidate,
      outputMessages: context.outputMessages ?? null,
      guidelineFiles: evalCase.guideline_paths,
      // Input files are all referenced files that are not guideline files.
      inputFiles: evalCase.file_paths.filter(
        (filePath) => !evalCase.guideline_paths.includes(filePath)
      ),
      inputMessages: evalCase.input_messages,
      traceSummary: context.traceSummary ?? null,
      config: this.config ?? null
    };
    const inputPayload = JSON.stringify(toSnakeCaseDeep(payload), null, 2);

    let scriptEnv;
    let stopProxy;
    let reportUsage;
    if (this.target !== undefined && context.judgeProvider) {
      const maxCalls = this.target.max_calls ?? DEFAULT_MAX_CALLS;
      const proxy = await createTargetProxy({
        defaultProvider: context.judgeProvider,
        targetResolver: context.targetResolver,
        availableTargets: context.availableTargets,
        maxCalls
      });
      scriptEnv = {
        AGENTV_TARGET_PROXY_URL: proxy.url,
        AGENTV_TARGET_PROXY_TOKEN: proxy.token
      };
      stopProxy = proxy.shutdown;
      reportUsage = proxy.getUsageMetadata;
    }

    // Shared description of the request; proxy usage is sampled when it is built.
    const describeRequest = (extra = {}) => {
      const usage = reportUsage?.();
      return {
        script: this.script,
        ...(this.cwd ? { cwd: this.cwd } : {}),
        ...(usage
          ? { target_proxy: { call_count: usage.callCount, max_calls: usage.maxCalls } }
          : {}),
        ...extra
      };
    };
    const stringsOf = (value) => (Array.isArray(value) ? value.filter(isNonEmptyString) : []);

    try {
      const stdout = await executeScript(
        this.script,
        inputPayload,
        this.agentTimeoutMs,
        this.cwd,
        scriptEnv
      );
      const parsed = parseJsonSafe(stdout);
      // A missing or non-numeric score counts as 0.
      const score = clampScore(typeof parsed?.score === "number" ? parsed.score : 0);
      const hits = stringsOf(parsed?.hits);
      const misses = stringsOf(parsed?.misses);
      const reasoning = typeof parsed?.reasoning === "string" ? parsed.reasoning : undefined;
      const rawDetails = parsed?.details;
      const details =
        rawDetails && typeof rawDetails === "object" && !Array.isArray(rawDetails)
          ? rawDetails
          : undefined;
      const evaluatorRawRequest = describeRequest();
      return {
        score,
        verdict: scoreToVerdict(score),
        hits,
        misses,
        expectedAspectCount: Math.max(hits.length + misses.length, 1),
        reasoning,
        evaluatorRawRequest,
        ...(details ? { details } : {})
      };
    } catch (error) {
      const message = error instanceof Error ? error.message : String(error);
      return {
        score: 0,
        verdict: "fail",
        hits: [],
        misses: [`Code evaluator failed: ${message}`],
        expectedAspectCount: 1,
        reasoning: message,
        evaluatorRawRequest: describeRequest({ error: message })
      };
    } finally {
      if (stopProxy) {
        await stopProxy();
      }
    }
  }
};
|
|
6897
|
+
/**
 * Run an evaluator script with `input` and return its trimmed stdout.
 *
 * A string `scriptPath` is run through `execShellWithStdin`; any other value
 * is passed to `execFileWithStdin`.
 *
 * @param {string|*} scriptPath - Script to run.
 * @param {string} input - Payload forwarded to the runner.
 * @param {number} [agentTimeoutMs] - Forwarded as `timeoutMs`.
 * @param {string} [cwd] - Working directory forwarded to the runner.
 * @param {object} [env] - Environment forwarded to the runner.
 * @returns {Promise<string>} Trimmed stdout of the script.
 * @throws {Error} When the exit code is not 0; the message includes the
 *   formatted stderr when it is non-empty.
 */
async function executeScript(scriptPath, input, agentTimeoutMs, cwd, env) {
  const run = typeof scriptPath === "string" ? execShellWithStdin : execFileWithStdin;
  const outcome = await run(scriptPath, input, { cwd, timeoutMs: agentTimeoutMs, env });
  const { stdout, stderr, exitCode } = outcome;
  if (exitCode === 0) {
    return stdout.trim();
  }
  const detail = formatStderr(stderr);
  const summary = `Code evaluator exited with code ${exitCode}`;
  throw new Error(detail.length > 0 ? `${summary}: ${detail}` : summary);
}
|
|
6907
|
+
/**
 * Trim stderr text and cap it at the last 2000 characters.
 *
 * Longer output keeps only its tail, prefixed with a one-line truncation notice.
 *
 * @param {string} stderr - Raw stderr text.
 * @returns {string} Trimmed, possibly truncated text.
 */
function formatStderr(stderr) {
  const limit = 2e3;
  const text = stderr.trim();
  return text.length > limit
    ? `...(truncated, last ${limit} chars)\n${text.slice(-limit)}`
    : text;
}
|
|
6917
|
+
|
|
6918
|
+
// src/evaluation/evaluators/composite.ts
|
|
6919
|
+
var import_ai3 = require("ai");
|
|
6920
|
+
|
|
6921
|
+
// src/evaluation/providers/types.ts
|
|
6922
|
+
// Provider `kind` values that `isAgentProvider` recognises as agent providers.
var AGENT_PROVIDER_KINDS = [
  "codex",
  "pi-coding-agent",
  "claude-code",
  "vscode",
  "vscode-insiders"
];
|
|
6929
|
+
/**
 * Return the content of the most recent assistant message as a string.
 *
 * String content is returned unchanged; any other defined content is
 * serialised with `JSON.stringify`. An empty string is returned when the list
 * is missing, empty, or holds no assistant message with defined content.
 *
 * @param {Array<object>|undefined|null} messages - Messages to search, newest last.
 * @returns {string} Extracted content, or `""`.
 */
function extractLastAssistantContent2(messages) {
  if (!messages || messages.length === 0) {
    return "";
  }
  let cursor = messages.length;
  while (cursor > 0) {
    cursor -= 1;
    const entry = messages[cursor];
    const isAssistantWithContent = entry.role === "assistant" && entry.content !== undefined;
    if (!isAssistantWithContent) {
      continue;
    }
    return typeof entry.content === "string" ? entry.content : JSON.stringify(entry.content);
  }
  return "";
}
|
|
6944
|
+
/**
 * Tell whether a provider's `kind` is listed in `AGENT_PROVIDER_KINDS`.
 *
 * @param {object|undefined|null} provider - Provider to inspect.
 * @returns {boolean} `false` for a missing provider or an unlisted kind.
 */
function isAgentProvider(provider) {
  if (!provider) {
    return false;
  }
  return AGENT_PROVIDER_KINDS.includes(provider.kind);
}
|
|
6523
6947
|
|
|
6524
|
-
// src/evaluation/evaluators.ts
|
|
6948
|
+
// src/evaluation/evaluators/llm-judge.ts
|
|
6949
|
+
var import_ai2 = require("ai");
|
|
6950
|
+
var import_zod3 = require("zod");
|
|
6525
6951
|
var DEFAULT_EVALUATOR_TEMPLATE = `You are an expert evaluator. Your goal is to grade the candidate_answer based on how well it achieves the expected_outcome for the original task.
|
|
6526
6952
|
|
|
6527
6953
|
Use the reference_answer as a gold standard for a high-quality response (if provided). The reference_answer may be a simple text response, or it may contain a sequence of expected agent messages including tool calls. When it contains multiple messages, the last message represents the final expected answer. The candidate_answer does not need to match it verbatim, but should capture the key points and follow the same spirit.
|
|
@@ -6601,7 +7027,7 @@ var LlmJudgeEvaluator = class {
|
|
|
6601
7027
|
target: judgeProvider.targetName
|
|
6602
7028
|
};
|
|
6603
7029
|
try {
|
|
6604
|
-
const { data
|
|
7030
|
+
const { data } = await this.runWithRetry({
|
|
6605
7031
|
context,
|
|
6606
7032
|
judgeProvider,
|
|
6607
7033
|
systemPrompt,
|
|
@@ -6714,7 +7140,7 @@ var LlmJudgeEvaluator = class {
|
|
|
6714
7140
|
temperature: this.temperature
|
|
6715
7141
|
});
|
|
6716
7142
|
const data = schema.parse(
|
|
6717
|
-
parseJsonFromText(
|
|
7143
|
+
parseJsonFromText(extractLastAssistantContent2(response.outputMessages))
|
|
6718
7144
|
);
|
|
6719
7145
|
return { data, providerResponse: response };
|
|
6720
7146
|
} catch (e) {
|
|
@@ -6750,86 +7176,160 @@ You must return a valid JSON object matching this schema:
|
|
|
6750
7176
|
"overall_reasoning": "string (summary)"
|
|
6751
7177
|
}`;
|
|
6752
7178
|
}
|
|
6753
|
-
function
|
|
6754
|
-
|
|
6755
|
-
return
|
|
6756
|
-
}
|
|
6757
|
-
if (score >= 0.6) {
|
|
6758
|
-
return "borderline";
|
|
6759
|
-
}
|
|
6760
|
-
return "fail";
|
|
7179
|
+
/**
 * Replace `{{ name }}` placeholders in a template with values from `variables`.
 *
 * Placeholder names may contain letters, digits and underscores, with optional
 * whitespace inside the braces. A placeholder is left untouched when the
 * variable is absent or its value is `null`/`undefined`.
 *
 * Only own properties of `variables` are consulted, so placeholders such as
 * `{{constructor}}` or `{{toString}}` are not resolved through the prototype
 * chain.
 *
 * @param {string} template - Text containing `{{name}}` placeholders.
 * @param {object} variables - Map of placeholder name to replacement value.
 * @returns {string} Template with known placeholders substituted.
 */
function substituteVariables(template, variables) {
  return template.replace(/\{\{\s*([a-zA-Z0-9_]+)\s*\}\}/g, (match, varName) => {
    if (!Object.hasOwn(variables, varName)) {
      return match;
    }
    return variables[varName] ?? match;
  });
}
|
|
6762
|
-
function
|
|
6763
|
-
|
|
6764
|
-
|
|
6765
|
-
|
|
6766
|
-
|
|
6767
|
-
|
|
6768
|
-
|
|
6769
|
-
|
|
6770
|
-
|
|
7184
|
+
/**
 * Score a set of rubric checks as a weighted fraction.
 *
 * Checks whose id matches no rubric are ignored. The score is earned weight
 * divided by total weight of the matched rubrics, limited to [0, 1], or 0
 * when the total weight is not positive. Failing any rubric marked `required`
 * forces the verdict to "fail" regardless of the score.
 *
 * @param {{checks: Array<{id: *, satisfied: boolean, reasoning: string}>}} result
 * @param {Array<{id: *, description: string, weight: number, required?: boolean}>} rubrics
 * @returns {{score: number, verdict: string, hits: string[], misses: string[]}}
 */
function calculateRubricScore(result, rubrics) {
  const rubricById = new Map(rubrics.map((rubric) => [rubric.id, rubric]));
  const hits = [];
  const misses = [];
  let possible = 0;
  let earned = 0;
  let requiredFailed = false;

  for (const check of result.checks) {
    const rubric = rubricById.get(check.id);
    if (!rubric) {
      continue;
    }
    possible += rubric.weight;
    const line = `[${rubric.id}] ${rubric.description}: ${check.reasoning}`;
    if (check.satisfied) {
      earned += rubric.weight;
      hits.push(line);
      continue;
    }
    misses.push(line);
    if (rubric.required) {
      requiredFailed = true;
    }
  }

  const score = possible > 0 ? Math.min(1, Math.max(0, earned / possible)) : 0;
  const verdict = requiredFailed ? "fail" : scoreToVerdict(score);
  return { score, verdict, hits, misses };
}
|
|
6786
|
-
|
|
6787
|
-
|
|
6788
|
-
|
|
6789
|
-
|
|
6790
|
-
|
|
7211
|
+
|
|
7212
|
+
// src/evaluation/evaluators/composite.ts
|
|
7213
|
+
var DEFAULT_COMPOSITE_AGGREGATOR_PROMPT = `Review the following evaluation results:
|
|
7214
|
+
{{EVALUATOR_RESULTS_JSON}}
|
|
7215
|
+
|
|
7216
|
+
Decide the final score and verdict based on all evaluator results.
|
|
7217
|
+
Return a JSON object with: score (0.0-1.0), verdict (pass/fail/borderline), and reasoning.`;
|
|
7218
|
+
var CompositeEvaluator = class {
|
|
7219
|
+
kind = "composite";
|
|
6791
7220
|
config;
|
|
7221
|
+
evaluatorFactory;
|
|
7222
|
+
cwd;
|
|
6792
7223
|
constructor(options) {
|
|
6793
|
-
this.script = options.script;
|
|
6794
|
-
this.cwd = options.cwd;
|
|
6795
|
-
this.agentTimeoutMs = options.agentTimeoutMs;
|
|
6796
7224
|
this.config = options.config;
|
|
7225
|
+
this.evaluatorFactory = options.evaluatorFactory;
|
|
7226
|
+
this.cwd = options.cwd;
|
|
6797
7227
|
}
|
|
6798
7228
|
async evaluate(context) {
|
|
6799
|
-
const
|
|
6800
|
-
|
|
6801
|
-
|
|
6802
|
-
|
|
6803
|
-
|
|
6804
|
-
|
|
6805
|
-
|
|
6806
|
-
|
|
6807
|
-
|
|
6808
|
-
|
|
6809
|
-
|
|
6810
|
-
|
|
6811
|
-
|
|
6812
|
-
|
|
7229
|
+
const memberResults = await Promise.all(
|
|
7230
|
+
this.config.evaluators.map(async (memberConfig) => {
|
|
7231
|
+
const evaluator = this.evaluatorFactory.create(memberConfig, context);
|
|
7232
|
+
return {
|
|
7233
|
+
id: memberConfig.name,
|
|
7234
|
+
type: memberConfig.type,
|
|
7235
|
+
result: await evaluator.evaluate(context)
|
|
7236
|
+
};
|
|
7237
|
+
})
|
|
7238
|
+
);
|
|
7239
|
+
return this.aggregate(memberResults, context);
|
|
7240
|
+
}
|
|
7241
|
+
async aggregate(results, context) {
|
|
7242
|
+
const aggregator = this.config.aggregator;
|
|
7243
|
+
switch (aggregator.type) {
|
|
7244
|
+
case "code_judge":
|
|
7245
|
+
return this.runCodeAggregator(results, aggregator.path, aggregator.cwd ?? this.cwd);
|
|
7246
|
+
case "llm_judge":
|
|
7247
|
+
return this.runLlmAggregator(results, context, aggregator);
|
|
7248
|
+
default:
|
|
7249
|
+
return this.runWeightedAverage(results, aggregator.weights);
|
|
7250
|
+
}
|
|
7251
|
+
}
|
|
7252
|
+
runWeightedAverage(results, weights) {
|
|
7253
|
+
let totalWeight = 0;
|
|
7254
|
+
let weightedSum = 0;
|
|
7255
|
+
const allHits = [];
|
|
7256
|
+
const allMisses = [];
|
|
7257
|
+
const reasoningParts = [];
|
|
7258
|
+
const evaluatorResults = [];
|
|
7259
|
+
for (const member of results) {
|
|
7260
|
+
const weight = weights?.[member.id] ?? 1;
|
|
7261
|
+
totalWeight += weight;
|
|
7262
|
+
weightedSum += member.result.score * weight;
|
|
7263
|
+
allHits.push(...member.result.hits.map((h) => `[${member.id}] ${h}`));
|
|
7264
|
+
allMisses.push(...member.result.misses.map((m) => `[${member.id}] ${m}`));
|
|
7265
|
+
if (member.result.reasoning) {
|
|
7266
|
+
reasoningParts.push(`${member.id}: ${member.result.reasoning}`);
|
|
7267
|
+
}
|
|
7268
|
+
evaluatorResults.push({
|
|
7269
|
+
name: member.id,
|
|
7270
|
+
type: member.type,
|
|
7271
|
+
score: member.result.score,
|
|
7272
|
+
weight,
|
|
7273
|
+
verdict: member.result.verdict,
|
|
7274
|
+
hits: [...member.result.hits],
|
|
7275
|
+
misses: [...member.result.misses],
|
|
7276
|
+
reasoning: member.result.reasoning,
|
|
7277
|
+
evaluatorRawRequest: member.result.evaluatorRawRequest,
|
|
7278
|
+
evaluatorResults: member.result.evaluatorResults,
|
|
7279
|
+
details: member.result.details
|
|
7280
|
+
});
|
|
7281
|
+
}
|
|
7282
|
+
const finalScore = totalWeight > 0 ? weightedSum / totalWeight : 0;
|
|
7283
|
+
return {
|
|
7284
|
+
score: clampScore(finalScore),
|
|
7285
|
+
verdict: scoreToVerdict(finalScore),
|
|
7286
|
+
hits: allHits,
|
|
7287
|
+
misses: allMisses,
|
|
7288
|
+
expectedAspectCount: Math.max(allHits.length + allMisses.length, 1),
|
|
7289
|
+
reasoning: reasoningParts.length > 0 ? reasoningParts.join("; ") : void 0,
|
|
7290
|
+
evaluatorRawRequest: {
|
|
7291
|
+
aggregator: "weighted_average",
|
|
7292
|
+
...weights ? { weights } : {}
|
|
7293
|
+
},
|
|
7294
|
+
evaluatorResults
|
|
6813
7295
|
};
|
|
6814
|
-
|
|
7296
|
+
}
|
|
7297
|
+
async runCodeAggregator(results, scriptPath, cwd, weights) {
|
|
7298
|
+
const resultsObject = Object.fromEntries(results.map((r) => [r.id, r.result]));
|
|
7299
|
+
const inputPayload = JSON.stringify({ results: resultsObject }, null, 2);
|
|
7300
|
+
const evaluatorResults = results.map((member) => ({
|
|
7301
|
+
name: member.id,
|
|
7302
|
+
type: member.type,
|
|
7303
|
+
score: member.result.score,
|
|
7304
|
+
weight: weights?.[member.id] ?? 1,
|
|
7305
|
+
verdict: member.result.verdict,
|
|
7306
|
+
hits: [...member.result.hits],
|
|
7307
|
+
misses: [...member.result.misses],
|
|
7308
|
+
reasoning: member.result.reasoning,
|
|
7309
|
+
evaluatorRawRequest: member.result.evaluatorRawRequest,
|
|
7310
|
+
evaluatorResults: member.result.evaluatorResults,
|
|
7311
|
+
details: member.result.details
|
|
7312
|
+
}));
|
|
6815
7313
|
try {
|
|
6816
|
-
const stdout = await executeScript(
|
|
7314
|
+
const stdout = await executeScript(scriptPath, inputPayload, void 0, cwd);
|
|
6817
7315
|
const parsed = parseJsonSafe(stdout);
|
|
6818
7316
|
const score = clampScore(typeof parsed?.score === "number" ? parsed.score : 0);
|
|
6819
7317
|
const hits = Array.isArray(parsed?.hits) ? parsed.hits.filter(isNonEmptyString) : [];
|
|
6820
7318
|
const misses = Array.isArray(parsed?.misses) ? parsed.misses.filter(isNonEmptyString) : [];
|
|
6821
7319
|
const reasoning = typeof parsed?.reasoning === "string" ? parsed.reasoning : void 0;
|
|
7320
|
+
const verdict = typeof parsed?.verdict === "string" && (parsed.verdict === "pass" || parsed.verdict === "fail" || parsed.verdict === "borderline") ? parsed.verdict : scoreToVerdict(score);
|
|
6822
7321
|
return {
|
|
6823
7322
|
score,
|
|
6824
|
-
verdict
|
|
7323
|
+
verdict,
|
|
6825
7324
|
hits,
|
|
6826
7325
|
misses,
|
|
6827
7326
|
expectedAspectCount: hits.length + misses.length || 1,
|
|
6828
7327
|
reasoning,
|
|
6829
7328
|
evaluatorRawRequest: {
|
|
6830
|
-
|
|
6831
|
-
|
|
6832
|
-
}
|
|
7329
|
+
aggregator: "code_judge",
|
|
7330
|
+
script: scriptPath
|
|
7331
|
+
},
|
|
7332
|
+
evaluatorResults
|
|
6833
7333
|
};
|
|
6834
7334
|
} catch (error) {
|
|
6835
7335
|
const message = error instanceof Error ? error.message : String(error);
|
|
@@ -6837,452 +7337,292 @@ var CodeEvaluator = class {
|
|
|
6837
7337
|
score: 0,
|
|
6838
7338
|
verdict: "fail",
|
|
6839
7339
|
hits: [],
|
|
6840
|
-
misses: [`Code
|
|
7340
|
+
misses: [`Code aggregator failed: ${message}`],
|
|
6841
7341
|
expectedAspectCount: 1,
|
|
6842
7342
|
reasoning: message,
|
|
6843
7343
|
evaluatorRawRequest: {
|
|
6844
|
-
|
|
6845
|
-
|
|
7344
|
+
aggregator: "code_judge",
|
|
7345
|
+
script: scriptPath,
|
|
6846
7346
|
error: message
|
|
6847
|
-
}
|
|
7347
|
+
},
|
|
7348
|
+
evaluatorResults
|
|
6848
7349
|
};
|
|
6849
7350
|
}
|
|
6850
7351
|
}
|
|
6851
|
-
|
|
6852
|
-
|
|
6853
|
-
|
|
6854
|
-
|
|
6855
|
-
const misses = [];
|
|
6856
|
-
let totalWeight = 0;
|
|
6857
|
-
let earnedWeight = 0;
|
|
6858
|
-
let failedRequired = false;
|
|
6859
|
-
for (const check of result.checks) {
|
|
6860
|
-
const rubric = rubricMap.get(check.id);
|
|
6861
|
-
if (!rubric) {
|
|
6862
|
-
continue;
|
|
7352
|
+
async runLlmAggregator(results, context, config) {
|
|
7353
|
+
const judgeProvider = context.judgeProvider;
|
|
7354
|
+
if (!judgeProvider) {
|
|
7355
|
+
throw new Error("No judge provider available for LLM aggregation");
|
|
6863
7356
|
}
|
|
6864
|
-
|
|
6865
|
-
|
|
6866
|
-
|
|
6867
|
-
|
|
6868
|
-
|
|
6869
|
-
|
|
6870
|
-
|
|
6871
|
-
|
|
7357
|
+
const resultsObject = Object.fromEntries(results.map((r) => [r.id, r.result]));
|
|
7358
|
+
const resultsJson = JSON.stringify(resultsObject, null, 2);
|
|
7359
|
+
const evaluatorResults = results.map((member) => ({
|
|
7360
|
+
name: member.id,
|
|
7361
|
+
type: member.type,
|
|
7362
|
+
score: member.result.score,
|
|
7363
|
+
verdict: member.result.verdict,
|
|
7364
|
+
hits: [...member.result.hits],
|
|
7365
|
+
misses: [...member.result.misses],
|
|
7366
|
+
reasoning: member.result.reasoning,
|
|
7367
|
+
evaluatorRawRequest: member.result.evaluatorRawRequest,
|
|
7368
|
+
evaluatorResults: member.result.evaluatorResults,
|
|
7369
|
+
details: member.result.details
|
|
7370
|
+
}));
|
|
7371
|
+
const promptTemplate = config.prompt ?? DEFAULT_COMPOSITE_AGGREGATOR_PROMPT;
|
|
7372
|
+
const userPrompt = promptTemplate.replace(/\{\{EVALUATOR_RESULTS_JSON\}\}/g, resultsJson);
|
|
7373
|
+
const systemPrompt = buildOutputSchema();
|
|
7374
|
+
const evaluatorRawRequest = {
|
|
7375
|
+
aggregator: "llm_judge",
|
|
7376
|
+
userPrompt,
|
|
7377
|
+
systemPrompt,
|
|
7378
|
+
target: judgeProvider.targetName
|
|
7379
|
+
};
|
|
7380
|
+
try {
|
|
7381
|
+
const model = judgeProvider.asLanguageModel?.();
|
|
7382
|
+
if (model) {
|
|
7383
|
+
const { text } = await (0, import_ai3.generateText)({
|
|
7384
|
+
model,
|
|
7385
|
+
system: systemPrompt,
|
|
7386
|
+
prompt: userPrompt
|
|
7387
|
+
});
|
|
7388
|
+
const data2 = freeformEvaluationSchema.parse(parseJsonFromText(text));
|
|
7389
|
+
const score2 = clampScore(data2.score);
|
|
7390
|
+
const hits2 = Array.isArray(data2.hits) ? data2.hits.filter(isNonEmptyString).slice(0, 4) : [];
|
|
7391
|
+
const misses2 = Array.isArray(data2.misses) ? data2.misses.filter(isNonEmptyString).slice(0, 4) : [];
|
|
7392
|
+
const reasoning2 = data2.reasoning;
|
|
7393
|
+
return {
|
|
7394
|
+
score: score2,
|
|
7395
|
+
verdict: scoreToVerdict(score2),
|
|
7396
|
+
hits: hits2,
|
|
7397
|
+
misses: misses2,
|
|
7398
|
+
expectedAspectCount: Math.max(hits2.length + misses2.length, 1),
|
|
7399
|
+
reasoning: reasoning2,
|
|
7400
|
+
evaluatorRawRequest,
|
|
7401
|
+
evaluatorResults
|
|
7402
|
+
};
|
|
6872
7403
|
}
|
|
7404
|
+
const response = await judgeProvider.invoke({
|
|
7405
|
+
question: userPrompt,
|
|
7406
|
+
systemPrompt,
|
|
7407
|
+
evalCaseId: context.evalCase.id,
|
|
7408
|
+
attempt: context.attempt
|
|
7409
|
+
});
|
|
7410
|
+
const data = freeformEvaluationSchema.parse(
|
|
7411
|
+
parseJsonFromText(extractLastAssistantContent2(response.outputMessages))
|
|
7412
|
+
);
|
|
7413
|
+
const score = clampScore(data.score);
|
|
7414
|
+
const hits = Array.isArray(data.hits) ? data.hits.filter(isNonEmptyString).slice(0, 4) : [];
|
|
7415
|
+
const misses = Array.isArray(data.misses) ? data.misses.filter(isNonEmptyString).slice(0, 4) : [];
|
|
7416
|
+
const reasoning = data.reasoning;
|
|
7417
|
+
return {
|
|
7418
|
+
score,
|
|
7419
|
+
verdict: scoreToVerdict(score),
|
|
7420
|
+
hits,
|
|
7421
|
+
misses,
|
|
7422
|
+
expectedAspectCount: Math.max(hits.length + misses.length, 1),
|
|
7423
|
+
reasoning,
|
|
7424
|
+
evaluatorRawRequest,
|
|
7425
|
+
evaluatorResults
|
|
7426
|
+
};
|
|
7427
|
+
} catch {
|
|
7428
|
+
return {
|
|
7429
|
+
score: 0,
|
|
7430
|
+
verdict: "fail",
|
|
7431
|
+
hits: [],
|
|
7432
|
+
misses: [],
|
|
7433
|
+
expectedAspectCount: 1,
|
|
7434
|
+
evaluatorRawRequest,
|
|
7435
|
+
evaluatorResults
|
|
7436
|
+
};
|
|
6873
7437
|
}
|
|
6874
7438
|
}
|
|
6875
|
-
|
|
6876
|
-
|
|
6877
|
-
|
|
6878
|
-
|
|
6879
|
-
|
|
6880
|
-
|
|
6881
|
-
|
|
6882
|
-
|
|
6883
|
-
throw new Error(
|
|
6884
|
-
trimmedErr.length > 0 ? `Code evaluator exited with code ${exitCode}: ${trimmedErr}` : `Code evaluator exited with code ${exitCode}`
|
|
6885
|
-
);
|
|
6886
|
-
}
|
|
6887
|
-
return stdout.trim();
|
|
6888
|
-
}
|
|
6889
|
-
function formatStderr(stderr) {
|
|
6890
|
-
const trimmed = stderr.trim();
|
|
6891
|
-
const maxLength = 2e3;
|
|
6892
|
-
if (trimmed.length <= maxLength) {
|
|
6893
|
-
return trimmed;
|
|
7439
|
+
};
|
|
7440
|
+
|
|
7441
|
+
// src/evaluation/evaluators/cost.ts
|
|
7442
|
+
/**
 * Evaluator that checks the reported execution cost against a budget.
 *
 * Reads `traceSummary.costUsd` from the context and compares it with
 * `config.budget`. The result is binary: score 1 ("pass") when the cost is at
 * or under budget, score 0 ("fail") when it is over budget or when no cost
 * was reported.
 */
var CostEvaluator = class {
  kind = "cost";
  config;

  constructor(options) {
    this.config = options.config;
  }

  /**
   * @param {{traceSummary?: {costUsd?: number|null}}} context - Evaluation context.
   * @returns {object} Evaluation result with score, verdict, hits, misses,
   *   reasoning and the raw request (`type`, `budget`, `costUsd`).
   */
  evaluate(context) {
    const { budget } = this.config;
    const costUsd = context.traceSummary?.costUsd;
    // Treat null like undefined: `null <= budget` would otherwise count as
    // within budget and then throw when the cost is formatted.
    if (costUsd == null) {
      return {
        score: 0,
        verdict: "fail",
        hits: [],
        misses: ["No cost data available in trace"],
        expectedAspectCount: 1,
        reasoning: "Execution cost not reported by provider",
        evaluatorRawRequest: {
          type: "cost",
          budget,
          costUsd: null
        }
      };
    }
    const formatCost = (n) => `$${n.toFixed(4)}`;
    const passed = costUsd <= budget;
    const shownCost = formatCost(costUsd);
    const shownBudget = formatCost(budget);
    return {
      score: passed ? 1 : 0,
      verdict: passed ? "pass" : "fail",
      hits: passed ? [`Cost ${shownCost} <= ${shownBudget} budget`] : [],
      misses: passed ? [] : [`Cost ${shownCost} > ${shownBudget} budget`],
      expectedAspectCount: 1,
      reasoning: `Execution cost ${shownCost} (budget: ${shownBudget})`,
      evaluatorRawRequest: {
        type: "cost",
        budget,
        costUsd
      }
    };
  }
};
|
|
7484
|
+
|
|
7485
|
+
// src/evaluation/evaluators/field-accuracy.ts
|
|
7486
|
+
// Date format patterns, listed from ISO forms to regional day/month orderings.
var DEFAULT_DATE_FORMATS = [
  "YYYY-MM-DDTHH:mm:ssZ", // ISO with timezone
  "YYYY-MM-DDTHH:mm:ss", // ISO with time
  "YYYY-MM-DD", // ISO date
  "DD-MMM-YYYY", // Localized (e.g., "15-JAN-2025")
  "MM/DD/YYYY", // US format
  "DD/MM/YYYY", // EU format
  "MM-DD-YYYY", // US with dashes
  "DD-MM-YYYY" // EU with dashes
];
|
|
7504
|
+
// Lowercase English month names and abbreviations mapped to zero-based month indexes.
var MONTH_NAMES = {
  jan: 0, january: 0,
  feb: 1, february: 1,
  mar: 2, march: 2,
  apr: 3, april: 3,
  may: 4,
  jun: 5, june: 5,
  jul: 6, july: 6,
  aug: 7, august: 7,
  sep: 8, sept: 8, september: 8,
  oct: 9, october: 9,
  nov: 10, november: 10,
  dec: 11, december: 11
};
|
|
7530
|
+
var FieldAccuracyEvaluator = class {
|
|
7531
|
+
kind = "field_accuracy";
|
|
6940
7532
|
config;
|
|
6941
7533
|
constructor(options) {
|
|
6942
7534
|
this.config = options.config;
|
|
6943
7535
|
}
|
|
6944
7536
|
evaluate(context) {
|
|
6945
|
-
const {
|
|
6946
|
-
|
|
6947
|
-
|
|
7537
|
+
const { evalCase, candidate } = context;
|
|
7538
|
+
let candidateData;
|
|
7539
|
+
try {
|
|
7540
|
+
candidateData = parseJsonFromTextSafe(candidate);
|
|
7541
|
+
} catch {
|
|
6948
7542
|
return {
|
|
6949
7543
|
score: 0,
|
|
6950
7544
|
verdict: "fail",
|
|
6951
7545
|
hits: [],
|
|
6952
|
-
misses: ["
|
|
6953
|
-
expectedAspectCount:
|
|
7546
|
+
misses: ["Failed to parse candidate answer as JSON"],
|
|
7547
|
+
expectedAspectCount: this.config.fields.length,
|
|
7548
|
+
reasoning: "Candidate answer is not valid JSON"
|
|
6954
7549
|
};
|
|
6955
7550
|
}
|
|
6956
|
-
const
|
|
6957
|
-
if (!
|
|
7551
|
+
const expectedData = this.extractExpectedData(evalCase.expected_messages);
|
|
7552
|
+
if (!expectedData) {
|
|
6958
7553
|
return {
|
|
6959
7554
|
score: 0,
|
|
6960
7555
|
verdict: "fail",
|
|
6961
7556
|
hits: [],
|
|
6962
|
-
misses: ["No
|
|
6963
|
-
expectedAspectCount:
|
|
7557
|
+
misses: ["No expected data found in expected_messages"],
|
|
7558
|
+
expectedAspectCount: this.config.fields.length,
|
|
7559
|
+
reasoning: "Could not extract expected data from expected_messages"
|
|
6964
7560
|
};
|
|
6965
7561
|
}
|
|
6966
|
-
|
|
6967
|
-
|
|
6968
|
-
|
|
6969
|
-
|
|
6970
|
-
return this.evaluateInOrder(toolCalls);
|
|
6971
|
-
case "exact":
|
|
6972
|
-
return this.evaluateExact(toolCalls);
|
|
6973
|
-
default:
|
|
6974
|
-
return {
|
|
6975
|
-
score: 0,
|
|
6976
|
-
verdict: "fail",
|
|
6977
|
-
hits: [],
|
|
6978
|
-
misses: [`Unknown mode: ${this.config.mode}`],
|
|
6979
|
-
expectedAspectCount: 1
|
|
6980
|
-
};
|
|
7562
|
+
const fieldResults = [];
|
|
7563
|
+
for (const fieldConfig of this.config.fields) {
|
|
7564
|
+
const result = this.evaluateField(fieldConfig, candidateData, expectedData);
|
|
7565
|
+
fieldResults.push(result);
|
|
6981
7566
|
}
|
|
7567
|
+
return this.aggregateResults(fieldResults);
|
|
6982
7568
|
}
|
|
6983
7569
|
/**
|
|
6984
|
-
* Extract
|
|
7570
|
+
* Extract expected data from expected_messages array.
|
|
7571
|
+
* Looks for the last assistant message with content.
|
|
6985
7572
|
*/
|
|
6986
|
-
|
|
6987
|
-
|
|
6988
|
-
|
|
6989
|
-
|
|
6990
|
-
|
|
6991
|
-
|
|
6992
|
-
|
|
6993
|
-
|
|
6994
|
-
|
|
6995
|
-
|
|
6996
|
-
|
|
6997
|
-
}
|
|
7573
|
+
extractExpectedData(expectedMessages) {
|
|
7574
|
+
for (let i = expectedMessages.length - 1; i >= 0; i--) {
|
|
7575
|
+
const message = expectedMessages[i];
|
|
7576
|
+
if (message.role === "assistant" && message.content) {
|
|
7577
|
+
if (typeof message.content === "object" && message.content !== null) {
|
|
7578
|
+
return message.content;
|
|
7579
|
+
}
|
|
7580
|
+
if (typeof message.content === "string") {
|
|
7581
|
+
try {
|
|
7582
|
+
return parseJsonFromTextSafe(message.content);
|
|
7583
|
+
} catch {
|
|
7584
|
+
}
|
|
6998
7585
|
}
|
|
6999
7586
|
}
|
|
7000
7587
|
}
|
|
7001
|
-
return
|
|
7588
|
+
return void 0;
|
|
7002
7589
|
}
|
|
7003
7590
|
/**
|
|
7004
|
-
*
|
|
7591
|
+
* Evaluate a single field against the expected value.
|
|
7005
7592
|
*/
|
|
7006
|
-
|
|
7007
|
-
const
|
|
7008
|
-
|
|
7009
|
-
|
|
7010
|
-
|
|
7011
|
-
const toolNames = Object.keys(toolCallsByName).sort();
|
|
7012
|
-
return {
|
|
7013
|
-
eventCount: toolCalls.length,
|
|
7014
|
-
toolNames,
|
|
7015
|
-
toolCallsByName,
|
|
7016
|
-
errorCount: 0
|
|
7017
|
-
};
|
|
7018
|
-
}
|
|
7019
|
-
evaluateAnyOrder(summary) {
|
|
7020
|
-
const minimums = this.config.minimums ?? {};
|
|
7021
|
-
const toolNames = Object.keys(minimums);
|
|
7022
|
-
if (toolNames.length === 0) {
|
|
7593
|
+
evaluateField(fieldConfig, candidateData, expectedData) {
|
|
7594
|
+
const { path: path17, match, required = true, weight = 1 } = fieldConfig;
|
|
7595
|
+
const candidateValue = resolvePath(candidateData, path17);
|
|
7596
|
+
const expectedValue = resolvePath(expectedData, path17);
|
|
7597
|
+
if (expectedValue === void 0) {
|
|
7023
7598
|
return {
|
|
7599
|
+
path: path17,
|
|
7024
7600
|
score: 1,
|
|
7025
|
-
|
|
7026
|
-
|
|
7027
|
-
|
|
7028
|
-
|
|
7601
|
+
// No expected value means no comparison needed
|
|
7602
|
+
weight,
|
|
7603
|
+
hit: true,
|
|
7604
|
+
message: `${path17}: no expected value`
|
|
7029
7605
|
};
|
|
7030
7606
|
}
|
|
7031
|
-
|
|
7032
|
-
|
|
7033
|
-
|
|
7034
|
-
|
|
7035
|
-
|
|
7036
|
-
|
|
7037
|
-
|
|
7038
|
-
|
|
7039
|
-
|
|
7607
|
+
if (candidateValue === void 0) {
|
|
7608
|
+
if (required) {
|
|
7609
|
+
return {
|
|
7610
|
+
path: path17,
|
|
7611
|
+
score: 0,
|
|
7612
|
+
weight,
|
|
7613
|
+
hit: false,
|
|
7614
|
+
message: `${path17} (required, missing)`
|
|
7615
|
+
};
|
|
7040
7616
|
}
|
|
7041
|
-
|
|
7042
|
-
|
|
7043
|
-
|
|
7044
|
-
|
|
7045
|
-
|
|
7046
|
-
|
|
7047
|
-
|
|
7048
|
-
|
|
7049
|
-
|
|
7050
|
-
}
|
|
7051
|
-
evaluateInOrder(toolCalls) {
|
|
7052
|
-
const expected = this.config.expected ?? [];
|
|
7053
|
-
if (expected.length === 0) {
|
|
7054
|
-
return {
|
|
7055
|
-
score: 1,
|
|
7056
|
-
verdict: "pass",
|
|
7057
|
-
hits: ["No tool sequence specified"],
|
|
7058
|
-
misses: [],
|
|
7059
|
-
expectedAspectCount: 0
|
|
7060
|
-
};
|
|
7061
|
-
}
|
|
7062
|
-
const hits = [];
|
|
7063
|
-
const misses = [];
|
|
7064
|
-
let actualIndex = 0;
|
|
7065
|
-
for (let i = 0; i < expected.length; i++) {
|
|
7066
|
-
const expectedItem = expected[i];
|
|
7067
|
-
const expectedTool = expectedItem.tool;
|
|
7068
|
-
let found = false;
|
|
7069
|
-
let argsMismatch = false;
|
|
7070
|
-
while (actualIndex < toolCalls.length) {
|
|
7071
|
-
const actualCall = toolCalls[actualIndex];
|
|
7072
|
-
if (actualCall.name === expectedTool) {
|
|
7073
|
-
if (argsMatch(expectedItem.args, actualCall.args)) {
|
|
7074
|
-
hits.push(`Found ${expectedTool} at position ${actualIndex}`);
|
|
7075
|
-
actualIndex++;
|
|
7076
|
-
found = true;
|
|
7077
|
-
break;
|
|
7078
|
-
}
|
|
7079
|
-
misses.push(
|
|
7080
|
-
`Expected ${expectedTool} at position ${i}: tool found at ${actualIndex} but args mismatch`
|
|
7081
|
-
);
|
|
7082
|
-
actualIndex++;
|
|
7083
|
-
argsMismatch = true;
|
|
7084
|
-
break;
|
|
7085
|
-
}
|
|
7086
|
-
actualIndex++;
|
|
7087
|
-
}
|
|
7088
|
-
if (!found && !argsMismatch) {
|
|
7089
|
-
misses.push(`Expected ${expectedTool} at position ${i}, not found in remaining trace`);
|
|
7090
|
-
}
|
|
7091
|
-
}
|
|
7092
|
-
const score = hits.length / expected.length;
|
|
7093
|
-
return {
|
|
7094
|
-
score,
|
|
7095
|
-
verdict: scoreToVerdict(score),
|
|
7096
|
-
hits,
|
|
7097
|
-
misses,
|
|
7098
|
-
expectedAspectCount: expected.length
|
|
7099
|
-
};
|
|
7100
|
-
}
|
|
7101
|
-
evaluateExact(toolCalls) {
|
|
7102
|
-
const expected = this.config.expected ?? [];
|
|
7103
|
-
if (expected.length === 0) {
|
|
7104
|
-
return {
|
|
7105
|
-
score: 1,
|
|
7106
|
-
verdict: "pass",
|
|
7107
|
-
hits: ["No tool sequence specified"],
|
|
7108
|
-
misses: [],
|
|
7109
|
-
expectedAspectCount: 0
|
|
7110
|
-
};
|
|
7111
|
-
}
|
|
7112
|
-
const hits = [];
|
|
7113
|
-
const misses = [];
|
|
7114
|
-
if (toolCalls.length !== expected.length) {
|
|
7115
|
-
misses.push(`Expected ${expected.length} tool calls, got ${toolCalls.length}`);
|
|
7116
|
-
}
|
|
7117
|
-
const checkLength = Math.min(expected.length, toolCalls.length);
|
|
7118
|
-
for (let i = 0; i < checkLength; i++) {
|
|
7119
|
-
const expectedItem = expected[i];
|
|
7120
|
-
const expectedTool = expectedItem.tool;
|
|
7121
|
-
const actualCall = toolCalls[i];
|
|
7122
|
-
const actualTool = actualCall.name;
|
|
7123
|
-
if (actualTool === expectedTool) {
|
|
7124
|
-
if (argsMatch(expectedItem.args, actualCall.args)) {
|
|
7125
|
-
hits.push(`Position ${i}: ${expectedTool}`);
|
|
7126
|
-
} else {
|
|
7127
|
-
misses.push(`Position ${i}: ${expectedTool} args mismatch`);
|
|
7128
|
-
}
|
|
7129
|
-
} else {
|
|
7130
|
-
misses.push(`Position ${i}: expected ${expectedTool}, got ${actualTool}`);
|
|
7131
|
-
}
|
|
7132
|
-
}
|
|
7133
|
-
for (let i = checkLength; i < expected.length; i++) {
|
|
7134
|
-
misses.push(`Position ${i}: expected ${expected[i].tool}, got nothing`);
|
|
7135
|
-
}
|
|
7136
|
-
const score = hits.length / expected.length;
|
|
7137
|
-
return {
|
|
7138
|
-
score,
|
|
7139
|
-
verdict: scoreToVerdict(score),
|
|
7140
|
-
hits,
|
|
7141
|
-
misses,
|
|
7142
|
-
expectedAspectCount: expected.length
|
|
7143
|
-
};
|
|
7144
|
-
}
|
|
7145
|
-
};
|
|
7146
|
-
var DEFAULT_DATE_FORMATS = [
|
|
7147
|
-
"YYYY-MM-DDTHH:mm:ssZ",
|
|
7148
|
-
// ISO with timezone
|
|
7149
|
-
"YYYY-MM-DDTHH:mm:ss",
|
|
7150
|
-
// ISO with time
|
|
7151
|
-
"YYYY-MM-DD",
|
|
7152
|
-
// ISO date
|
|
7153
|
-
"DD-MMM-YYYY",
|
|
7154
|
-
// Localized (e.g., "15-JAN-2025")
|
|
7155
|
-
"MM/DD/YYYY",
|
|
7156
|
-
// US format
|
|
7157
|
-
"DD/MM/YYYY",
|
|
7158
|
-
// EU format
|
|
7159
|
-
"MM-DD-YYYY",
|
|
7160
|
-
// US with dashes
|
|
7161
|
-
"DD-MM-YYYY"
|
|
7162
|
-
// EU with dashes
|
|
7163
|
-
];
|
|
7164
|
-
var MONTH_NAMES = {
|
|
7165
|
-
jan: 0,
|
|
7166
|
-
january: 0,
|
|
7167
|
-
feb: 1,
|
|
7168
|
-
february: 1,
|
|
7169
|
-
mar: 2,
|
|
7170
|
-
march: 2,
|
|
7171
|
-
apr: 3,
|
|
7172
|
-
april: 3,
|
|
7173
|
-
may: 4,
|
|
7174
|
-
jun: 5,
|
|
7175
|
-
june: 5,
|
|
7176
|
-
jul: 6,
|
|
7177
|
-
july: 6,
|
|
7178
|
-
aug: 7,
|
|
7179
|
-
august: 7,
|
|
7180
|
-
sep: 8,
|
|
7181
|
-
sept: 8,
|
|
7182
|
-
september: 8,
|
|
7183
|
-
oct: 9,
|
|
7184
|
-
october: 9,
|
|
7185
|
-
nov: 10,
|
|
7186
|
-
november: 10,
|
|
7187
|
-
dec: 11,
|
|
7188
|
-
december: 11
|
|
7189
|
-
};
|
|
7190
|
-
var FieldAccuracyEvaluator = class {
|
|
7191
|
-
kind = "field_accuracy";
|
|
7192
|
-
config;
|
|
7193
|
-
constructor(options) {
|
|
7194
|
-
this.config = options.config;
|
|
7195
|
-
}
|
|
7196
|
-
evaluate(context) {
|
|
7197
|
-
const { evalCase, candidate } = context;
|
|
7198
|
-
let candidateData;
|
|
7199
|
-
try {
|
|
7200
|
-
candidateData = parseJsonFromTextSafe(candidate);
|
|
7201
|
-
} catch {
|
|
7202
|
-
return {
|
|
7203
|
-
score: 0,
|
|
7204
|
-
verdict: "fail",
|
|
7205
|
-
hits: [],
|
|
7206
|
-
misses: ["Failed to parse candidate answer as JSON"],
|
|
7207
|
-
expectedAspectCount: this.config.fields.length,
|
|
7208
|
-
reasoning: "Candidate answer is not valid JSON"
|
|
7209
|
-
};
|
|
7210
|
-
}
|
|
7211
|
-
const expectedData = this.extractExpectedData(evalCase.expected_messages);
|
|
7212
|
-
if (!expectedData) {
|
|
7213
|
-
return {
|
|
7214
|
-
score: 0,
|
|
7215
|
-
verdict: "fail",
|
|
7216
|
-
hits: [],
|
|
7217
|
-
misses: ["No expected data found in expected_messages"],
|
|
7218
|
-
expectedAspectCount: this.config.fields.length,
|
|
7219
|
-
reasoning: "Could not extract expected data from expected_messages"
|
|
7220
|
-
};
|
|
7221
|
-
}
|
|
7222
|
-
const fieldResults = [];
|
|
7223
|
-
for (const fieldConfig of this.config.fields) {
|
|
7224
|
-
const result = this.evaluateField(fieldConfig, candidateData, expectedData);
|
|
7225
|
-
fieldResults.push(result);
|
|
7226
|
-
}
|
|
7227
|
-
return this.aggregateResults(fieldResults);
|
|
7228
|
-
}
|
|
7229
|
-
/**
|
|
7230
|
-
* Extract expected data from expected_messages array.
|
|
7231
|
-
* Looks for the last assistant message with content.
|
|
7232
|
-
*/
|
|
7233
|
-
extractExpectedData(expectedMessages) {
|
|
7234
|
-
for (let i = expectedMessages.length - 1; i >= 0; i--) {
|
|
7235
|
-
const message = expectedMessages[i];
|
|
7236
|
-
if (message.role === "assistant" && message.content) {
|
|
7237
|
-
if (typeof message.content === "object" && message.content !== null) {
|
|
7238
|
-
return message.content;
|
|
7239
|
-
}
|
|
7240
|
-
if (typeof message.content === "string") {
|
|
7241
|
-
try {
|
|
7242
|
-
return parseJsonFromTextSafe(message.content);
|
|
7243
|
-
} catch {
|
|
7244
|
-
}
|
|
7245
|
-
}
|
|
7246
|
-
}
|
|
7247
|
-
}
|
|
7248
|
-
return void 0;
|
|
7249
|
-
}
|
|
7250
|
-
/**
|
|
7251
|
-
* Evaluate a single field against the expected value.
|
|
7252
|
-
*/
|
|
7253
|
-
evaluateField(fieldConfig, candidateData, expectedData) {
|
|
7254
|
-
const { path: path17, match, required = true, weight = 1 } = fieldConfig;
|
|
7255
|
-
const candidateValue = resolvePath(candidateData, path17);
|
|
7256
|
-
const expectedValue = resolvePath(expectedData, path17);
|
|
7257
|
-
if (expectedValue === void 0) {
|
|
7258
|
-
return {
|
|
7259
|
-
path: path17,
|
|
7260
|
-
score: 1,
|
|
7261
|
-
// No expected value means no comparison needed
|
|
7262
|
-
weight,
|
|
7263
|
-
hit: true,
|
|
7264
|
-
message: `${path17}: no expected value`
|
|
7265
|
-
};
|
|
7266
|
-
}
|
|
7267
|
-
if (candidateValue === void 0) {
|
|
7268
|
-
if (required) {
|
|
7269
|
-
return {
|
|
7270
|
-
path: path17,
|
|
7271
|
-
score: 0,
|
|
7272
|
-
weight,
|
|
7273
|
-
hit: false,
|
|
7274
|
-
message: `${path17} (required, missing)`
|
|
7275
|
-
};
|
|
7276
|
-
}
|
|
7277
|
-
return {
|
|
7278
|
-
path: path17,
|
|
7279
|
-
score: 1,
|
|
7280
|
-
// Don't penalize missing optional fields
|
|
7281
|
-
weight: 0,
|
|
7282
|
-
// Zero weight means it won't affect the score
|
|
7283
|
-
hit: true,
|
|
7284
|
-
message: `${path17}: optional field missing`
|
|
7285
|
-
};
|
|
7617
|
+
return {
|
|
7618
|
+
path: path17,
|
|
7619
|
+
score: 1,
|
|
7620
|
+
// Don't penalize missing optional fields
|
|
7621
|
+
weight: 0,
|
|
7622
|
+
// Zero weight means it won't affect the score
|
|
7623
|
+
hit: true,
|
|
7624
|
+
message: `${path17}: optional field missing`
|
|
7625
|
+
};
|
|
7286
7626
|
}
|
|
7287
7627
|
switch (match) {
|
|
7288
7628
|
case "exact":
|
|
@@ -7353,436 +7693,211 @@ var FieldAccuracyEvaluator = class {
|
|
|
7353
7693
|
message: `${path17} (non-numeric value)`
|
|
7354
7694
|
};
|
|
7355
7695
|
}
|
|
7356
|
-
if (!Number.isFinite(candidateNum) || !Number.isFinite(expectedNum)) {
|
|
7357
|
-
return {
|
|
7358
|
-
path: path17,
|
|
7359
|
-
score: 0,
|
|
7360
|
-
weight,
|
|
7361
|
-
hit: false,
|
|
7362
|
-
message: `${path17} (invalid numeric value)`
|
|
7363
|
-
};
|
|
7364
|
-
}
|
|
7365
|
-
const diff = Math.abs(candidateNum - expectedNum);
|
|
7366
|
-
let withinTolerance;
|
|
7367
|
-
if (relative) {
|
|
7368
|
-
const relativeDiff = expectedNum === 0 ? diff : diff / Math.abs(expectedNum);
|
|
7369
|
-
withinTolerance = relativeDiff <= tolerance;
|
|
7370
|
-
} else {
|
|
7371
|
-
withinTolerance = diff <= tolerance;
|
|
7372
|
-
}
|
|
7373
|
-
if (withinTolerance) {
|
|
7374
|
-
return {
|
|
7375
|
-
path: path17,
|
|
7376
|
-
score: 1,
|
|
7377
|
-
weight,
|
|
7378
|
-
hit: true,
|
|
7379
|
-
message: `${path17} (within tolerance: diff=${diff.toFixed(2)})`
|
|
7380
|
-
};
|
|
7381
|
-
}
|
|
7382
|
-
return {
|
|
7383
|
-
path: path17,
|
|
7384
|
-
score: 0,
|
|
7385
|
-
weight,
|
|
7386
|
-
hit: false,
|
|
7387
|
-
message: `${path17} (outside tolerance: diff=${diff.toFixed(2)}, tolerance=${tolerance})`
|
|
7388
|
-
};
|
|
7389
|
-
}
|
|
7390
|
-
/**
|
|
7391
|
-
* Date comparison with format normalization.
|
|
7392
|
-
*/
|
|
7393
|
-
compareDate(path17, candidateValue, expectedValue, fieldConfig, weight) {
|
|
7394
|
-
const formats = fieldConfig.formats ?? DEFAULT_DATE_FORMATS;
|
|
7395
|
-
const candidateDate = parseDate(String(candidateValue), formats);
|
|
7396
|
-
const expectedDate = parseDate(String(expectedValue), formats);
|
|
7397
|
-
if (candidateDate === null) {
|
|
7398
|
-
return {
|
|
7399
|
-
path: path17,
|
|
7400
|
-
score: 0,
|
|
7401
|
-
weight,
|
|
7402
|
-
hit: false,
|
|
7403
|
-
message: `${path17} (unparseable candidate date)`
|
|
7404
|
-
};
|
|
7405
|
-
}
|
|
7406
|
-
if (expectedDate === null) {
|
|
7407
|
-
return {
|
|
7408
|
-
path: path17,
|
|
7409
|
-
score: 0,
|
|
7410
|
-
weight,
|
|
7411
|
-
hit: false,
|
|
7412
|
-
message: `${path17} (unparseable expected date)`
|
|
7413
|
-
};
|
|
7414
|
-
}
|
|
7415
|
-
if (candidateDate.getFullYear() === expectedDate.getFullYear() && candidateDate.getMonth() === expectedDate.getMonth() && candidateDate.getDate() === expectedDate.getDate()) {
|
|
7416
|
-
return {
|
|
7417
|
-
path: path17,
|
|
7418
|
-
score: 1,
|
|
7419
|
-
weight,
|
|
7420
|
-
hit: true,
|
|
7421
|
-
message: path17
|
|
7422
|
-
};
|
|
7423
|
-
}
|
|
7424
|
-
return {
|
|
7425
|
-
path: path17,
|
|
7426
|
-
score: 0,
|
|
7427
|
-
weight,
|
|
7428
|
-
hit: false,
|
|
7429
|
-
message: `${path17} (date mismatch: got ${formatDateISO(candidateDate)}, expected ${formatDateISO(expectedDate)})`
|
|
7430
|
-
};
|
|
7431
|
-
}
|
|
7432
|
-
/**
|
|
7433
|
-
* Aggregate field results using configured strategy.
|
|
7434
|
-
*/
|
|
7435
|
-
aggregateResults(results) {
|
|
7436
|
-
const aggregation = this.config.aggregation ?? "weighted_average";
|
|
7437
|
-
const hits = [];
|
|
7438
|
-
const misses = [];
|
|
7439
|
-
for (const result of results) {
|
|
7440
|
-
if (result.hit) {
|
|
7441
|
-
hits.push(result.message);
|
|
7442
|
-
} else {
|
|
7443
|
-
misses.push(result.message);
|
|
7444
|
-
}
|
|
7445
|
-
}
|
|
7446
|
-
let score;
|
|
7447
|
-
if (aggregation === "all_or_nothing") {
|
|
7448
|
-
score = misses.length === 0 ? 1 : 0;
|
|
7449
|
-
} else {
|
|
7450
|
-
const totalWeight = results.reduce((sum, r) => sum + r.weight, 0);
|
|
7451
|
-
if (totalWeight === 0) {
|
|
7452
|
-
score = results.length === 0 ? 1 : 0;
|
|
7453
|
-
} else {
|
|
7454
|
-
const weightedSum = results.reduce((sum, r) => sum + r.score * r.weight, 0);
|
|
7455
|
-
score = weightedSum / totalWeight;
|
|
7456
|
-
}
|
|
7457
|
-
}
|
|
7458
|
-
const reasoning = `${hits.length}/${results.length} fields matched`;
|
|
7459
|
-
return {
|
|
7460
|
-
score: clampScore(score),
|
|
7461
|
-
verdict: scoreToVerdict(score),
|
|
7462
|
-
hits: hits.slice(0, 4),
|
|
7463
|
-
misses: misses.slice(0, 4),
|
|
7464
|
-
expectedAspectCount: results.length,
|
|
7465
|
-
reasoning
|
|
7466
|
-
};
|
|
7467
|
-
}
|
|
7468
|
-
};
|
|
7469
|
-
function resolvePath(obj, path17) {
|
|
7470
|
-
if (!path17 || !obj) {
|
|
7471
|
-
return void 0;
|
|
7472
|
-
}
|
|
7473
|
-
const parts = path17.split(/\.|\[|\]/).filter((p) => p.length > 0);
|
|
7474
|
-
let current = obj;
|
|
7475
|
-
for (const part of parts) {
|
|
7476
|
-
if (current === null || current === void 0) {
|
|
7477
|
-
return void 0;
|
|
7478
|
-
}
|
|
7479
|
-
if (typeof current !== "object") {
|
|
7480
|
-
return void 0;
|
|
7481
|
-
}
|
|
7482
|
-
const isIndex = /^\d+$/.test(part);
|
|
7483
|
-
if (isIndex && Array.isArray(current)) {
|
|
7484
|
-
current = current[Number.parseInt(part, 10)];
|
|
7485
|
-
} else {
|
|
7486
|
-
current = current[part];
|
|
7487
|
-
}
|
|
7488
|
-
}
|
|
7489
|
-
return current;
|
|
7490
|
-
}
|
|
7491
|
-
function toNumber(value) {
|
|
7492
|
-
if (typeof value === "number") {
|
|
7493
|
-
return value;
|
|
7494
|
-
}
|
|
7495
|
-
if (typeof value === "string") {
|
|
7496
|
-
const num = Number.parseFloat(value);
|
|
7497
|
-
return Number.isNaN(num) ? null : num;
|
|
7498
|
-
}
|
|
7499
|
-
return null;
|
|
7500
|
-
}
|
|
7501
|
-
function parseDate(dateStr, formats) {
|
|
7502
|
-
if (!dateStr) return null;
|
|
7503
|
-
const trimmed = dateStr.trim();
|
|
7504
|
-
const isoDate = new Date(trimmed);
|
|
7505
|
-
if (!Number.isNaN(isoDate.getTime())) {
|
|
7506
|
-
return isoDate;
|
|
7507
|
-
}
|
|
7508
|
-
const localizedMatch = trimmed.match(/^(\d{1,2})-([A-Za-z]{3,9})-(\d{4})$/);
|
|
7509
|
-
if (localizedMatch) {
|
|
7510
|
-
const day = Number.parseInt(localizedMatch[1], 10);
|
|
7511
|
-
const monthName = localizedMatch[2].toLowerCase();
|
|
7512
|
-
const year = Number.parseInt(localizedMatch[3], 10);
|
|
7513
|
-
const month = MONTH_NAMES[monthName];
|
|
7514
|
-
if (month !== void 0) {
|
|
7515
|
-
return new Date(year, month, day);
|
|
7516
|
-
}
|
|
7517
|
-
}
|
|
7518
|
-
const usMatch = trimmed.match(/^(\d{1,2})[\/\-](\d{1,2})[\/\-](\d{4})$/);
|
|
7519
|
-
if (usMatch) {
|
|
7520
|
-
const hasUSFormat = formats.some((f) => f.includes("MM/DD") || f.includes("MM-DD"));
|
|
7521
|
-
const hasEUFormat = formats.some((f) => f.includes("DD/MM") || f.includes("DD-MM"));
|
|
7522
|
-
if (hasUSFormat && !hasEUFormat) {
|
|
7523
|
-
const month = Number.parseInt(usMatch[1], 10) - 1;
|
|
7524
|
-
const day = Number.parseInt(usMatch[2], 10);
|
|
7525
|
-
const year = Number.parseInt(usMatch[3], 10);
|
|
7526
|
-
if (month >= 0 && month <= 11 && day >= 1 && day <= 31) {
|
|
7527
|
-
return new Date(year, month, day);
|
|
7528
|
-
}
|
|
7529
|
-
} else if (hasEUFormat && !hasUSFormat) {
|
|
7530
|
-
const day = Number.parseInt(usMatch[1], 10);
|
|
7531
|
-
const month = Number.parseInt(usMatch[2], 10) - 1;
|
|
7532
|
-
const year = Number.parseInt(usMatch[3], 10);
|
|
7533
|
-
if (month >= 0 && month <= 11 && day >= 1 && day <= 31) {
|
|
7534
|
-
return new Date(year, month, day);
|
|
7535
|
-
}
|
|
7536
|
-
} else {
|
|
7537
|
-
const num1 = Number.parseInt(usMatch[1], 10);
|
|
7538
|
-
const num2 = Number.parseInt(usMatch[2], 10);
|
|
7539
|
-
const year = Number.parseInt(usMatch[3], 10);
|
|
7540
|
-
if (num1 > 12 && num2 <= 12) {
|
|
7541
|
-
return new Date(year, num2 - 1, num1);
|
|
7542
|
-
}
|
|
7543
|
-
if (num2 > 12 && num1 <= 12) {
|
|
7544
|
-
return new Date(year, num1 - 1, num2);
|
|
7545
|
-
}
|
|
7546
|
-
if (num1 <= 12 && num2 <= 31) {
|
|
7547
|
-
return new Date(year, num1 - 1, num2);
|
|
7548
|
-
}
|
|
7549
|
-
}
|
|
7550
|
-
}
|
|
7551
|
-
return null;
|
|
7552
|
-
}
|
|
7553
|
-
function formatDateISO(date) {
|
|
7554
|
-
return date.toISOString().split("T")[0];
|
|
7555
|
-
}
|
|
7556
|
-
function parseJsonFromTextSafe(text) {
|
|
7557
|
-
const cleaned = typeof text === "string" ? text.replace(/```json\n?|```/g, "").trim() : "";
|
|
7558
|
-
const match = cleaned.match(/\{[\s\S]*\}/);
|
|
7559
|
-
const blob = match?.[0] ?? cleaned;
|
|
7560
|
-
return JSON.parse(blob);
|
|
7561
|
-
}
|
|
7562
|
-
var DEFAULT_COMPOSITE_AGGREGATOR_PROMPT = `Review the following evaluation results:
|
|
7563
|
-
{{EVALUATOR_RESULTS_JSON}}
|
|
7564
|
-
|
|
7565
|
-
Decide the final score and verdict based on all evaluator results.
|
|
7566
|
-
Return a JSON object with: score (0.0-1.0), verdict (pass/fail/borderline), and reasoning.`;
|
|
7567
|
-
var CompositeEvaluator = class {
|
|
7568
|
-
kind = "composite";
|
|
7569
|
-
config;
|
|
7570
|
-
evaluatorFactory;
|
|
7571
|
-
cwd;
|
|
7572
|
-
constructor(options) {
|
|
7573
|
-
this.config = options.config;
|
|
7574
|
-
this.evaluatorFactory = options.evaluatorFactory;
|
|
7575
|
-
this.cwd = options.cwd;
|
|
7576
|
-
}
|
|
7577
|
-
async evaluate(context) {
|
|
7578
|
-
const memberResults = await Promise.all(
|
|
7579
|
-
this.config.evaluators.map(async (memberConfig) => {
|
|
7580
|
-
const evaluator = this.evaluatorFactory.create(memberConfig, context);
|
|
7581
|
-
return {
|
|
7582
|
-
id: memberConfig.name,
|
|
7583
|
-
type: memberConfig.type,
|
|
7584
|
-
result: await evaluator.evaluate(context)
|
|
7585
|
-
};
|
|
7586
|
-
})
|
|
7587
|
-
);
|
|
7588
|
-
return this.aggregate(memberResults, context);
|
|
7589
|
-
}
|
|
7590
|
-
async aggregate(results, context) {
|
|
7591
|
-
const aggregator = this.config.aggregator;
|
|
7592
|
-
switch (aggregator.type) {
|
|
7593
|
-
case "code_judge":
|
|
7594
|
-
return this.runCodeAggregator(results, aggregator.path, aggregator.cwd ?? this.cwd);
|
|
7595
|
-
case "llm_judge":
|
|
7596
|
-
return this.runLlmAggregator(results, context, aggregator);
|
|
7597
|
-
default:
|
|
7598
|
-
return this.runWeightedAverage(results, aggregator.weights);
|
|
7599
|
-
}
|
|
7600
|
-
}
|
|
7601
|
-
runWeightedAverage(results, weights) {
|
|
7602
|
-
let totalWeight = 0;
|
|
7603
|
-
let weightedSum = 0;
|
|
7604
|
-
const allHits = [];
|
|
7605
|
-
const allMisses = [];
|
|
7606
|
-
const reasoningParts = [];
|
|
7607
|
-
const evaluatorResults = [];
|
|
7608
|
-
for (const member of results) {
|
|
7609
|
-
const weight = weights?.[member.id] ?? 1;
|
|
7610
|
-
totalWeight += weight;
|
|
7611
|
-
weightedSum += member.result.score * weight;
|
|
7612
|
-
allHits.push(...member.result.hits.map((h) => `[${member.id}] ${h}`));
|
|
7613
|
-
allMisses.push(...member.result.misses.map((m) => `[${member.id}] ${m}`));
|
|
7614
|
-
if (member.result.reasoning) {
|
|
7615
|
-
reasoningParts.push(`${member.id}: ${member.result.reasoning}`);
|
|
7616
|
-
}
|
|
7617
|
-
evaluatorResults.push({
|
|
7618
|
-
name: member.id,
|
|
7619
|
-
type: member.type,
|
|
7620
|
-
score: member.result.score,
|
|
7621
|
-
weight,
|
|
7622
|
-
verdict: member.result.verdict,
|
|
7623
|
-
hits: [...member.result.hits],
|
|
7624
|
-
misses: [...member.result.misses],
|
|
7625
|
-
reasoning: member.result.reasoning,
|
|
7626
|
-
evaluatorRawRequest: member.result.evaluatorRawRequest,
|
|
7627
|
-
evaluatorResults: member.result.evaluatorResults
|
|
7628
|
-
});
|
|
7629
|
-
}
|
|
7630
|
-
const finalScore = totalWeight > 0 ? weightedSum / totalWeight : 0;
|
|
7631
|
-
return {
|
|
7632
|
-
score: clampScore(finalScore),
|
|
7633
|
-
verdict: scoreToVerdict(finalScore),
|
|
7634
|
-
hits: allHits,
|
|
7635
|
-
misses: allMisses,
|
|
7636
|
-
expectedAspectCount: Math.max(allHits.length + allMisses.length, 1),
|
|
7637
|
-
reasoning: reasoningParts.length > 0 ? reasoningParts.join("; ") : void 0,
|
|
7638
|
-
evaluatorRawRequest: {
|
|
7639
|
-
aggregator: "weighted_average",
|
|
7640
|
-
...weights ? { weights } : {}
|
|
7641
|
-
},
|
|
7642
|
-
evaluatorResults
|
|
7643
|
-
};
|
|
7644
|
-
}
|
|
7645
|
-
async runCodeAggregator(results, scriptPath, cwd, weights) {
|
|
7646
|
-
const resultsObject = Object.fromEntries(results.map((r) => [r.id, r.result]));
|
|
7647
|
-
const inputPayload = JSON.stringify({ results: resultsObject }, null, 2);
|
|
7648
|
-
const evaluatorResults = results.map((member) => ({
|
|
7649
|
-
name: member.id,
|
|
7650
|
-
type: member.type,
|
|
7651
|
-
score: member.result.score,
|
|
7652
|
-
weight: weights?.[member.id] ?? 1,
|
|
7653
|
-
verdict: member.result.verdict,
|
|
7654
|
-
hits: [...member.result.hits],
|
|
7655
|
-
misses: [...member.result.misses],
|
|
7656
|
-
reasoning: member.result.reasoning,
|
|
7657
|
-
evaluatorRawRequest: member.result.evaluatorRawRequest,
|
|
7658
|
-
evaluatorResults: member.result.evaluatorResults
|
|
7659
|
-
}));
|
|
7660
|
-
try {
|
|
7661
|
-
const stdout = await executeScript(scriptPath, inputPayload, void 0, cwd);
|
|
7662
|
-
const parsed = parseJsonSafe(stdout);
|
|
7663
|
-
const score = clampScore(typeof parsed?.score === "number" ? parsed.score : 0);
|
|
7664
|
-
const hits = Array.isArray(parsed?.hits) ? parsed.hits.filter(isNonEmptyString) : [];
|
|
7665
|
-
const misses = Array.isArray(parsed?.misses) ? parsed.misses.filter(isNonEmptyString) : [];
|
|
7666
|
-
const reasoning = typeof parsed?.reasoning === "string" ? parsed.reasoning : void 0;
|
|
7667
|
-
const verdict = typeof parsed?.verdict === "string" && (parsed.verdict === "pass" || parsed.verdict === "fail" || parsed.verdict === "borderline") ? parsed.verdict : scoreToVerdict(score);
|
|
7668
|
-
return {
|
|
7669
|
-
score,
|
|
7670
|
-
verdict,
|
|
7671
|
-
hits,
|
|
7672
|
-
misses,
|
|
7673
|
-
expectedAspectCount: hits.length + misses.length || 1,
|
|
7674
|
-
reasoning,
|
|
7675
|
-
evaluatorRawRequest: {
|
|
7676
|
-
aggregator: "code_judge",
|
|
7677
|
-
script: scriptPath
|
|
7678
|
-
},
|
|
7679
|
-
evaluatorResults
|
|
7680
|
-
};
|
|
7681
|
-
} catch (error) {
|
|
7682
|
-
const message = error instanceof Error ? error.message : String(error);
|
|
7683
|
-
return {
|
|
7684
|
-
score: 0,
|
|
7685
|
-
verdict: "fail",
|
|
7686
|
-
hits: [],
|
|
7687
|
-
misses: [`Code aggregator failed: ${message}`],
|
|
7688
|
-
expectedAspectCount: 1,
|
|
7689
|
-
reasoning: message,
|
|
7690
|
-
evaluatorRawRequest: {
|
|
7691
|
-
aggregator: "code_judge",
|
|
7692
|
-
script: scriptPath,
|
|
7693
|
-
error: message
|
|
7694
|
-
},
|
|
7695
|
-
evaluatorResults
|
|
7696
|
-
};
|
|
7697
|
-
}
|
|
7698
|
-
}
|
|
7699
|
-
async runLlmAggregator(results, context, config) {
|
|
7700
|
-
const judgeProvider = context.judgeProvider;
|
|
7701
|
-
if (!judgeProvider) {
|
|
7702
|
-
throw new Error("No judge provider available for LLM aggregation");
|
|
7703
|
-
}
|
|
7704
|
-
const resultsObject = Object.fromEntries(results.map((r) => [r.id, r.result]));
|
|
7705
|
-
const resultsJson = JSON.stringify(resultsObject, null, 2);
|
|
7706
|
-
const evaluatorResults = results.map((member) => ({
|
|
7707
|
-
name: member.id,
|
|
7708
|
-
type: member.type,
|
|
7709
|
-
score: member.result.score,
|
|
7710
|
-
verdict: member.result.verdict,
|
|
7711
|
-
hits: [...member.result.hits],
|
|
7712
|
-
misses: [...member.result.misses],
|
|
7713
|
-
reasoning: member.result.reasoning,
|
|
7714
|
-
evaluatorRawRequest: member.result.evaluatorRawRequest,
|
|
7715
|
-
evaluatorResults: member.result.evaluatorResults
|
|
7716
|
-
}));
|
|
7717
|
-
const promptTemplate = config.prompt ?? DEFAULT_COMPOSITE_AGGREGATOR_PROMPT;
|
|
7718
|
-
const userPrompt = promptTemplate.replace(/\{\{EVALUATOR_RESULTS_JSON\}\}/g, resultsJson);
|
|
7719
|
-
const systemPrompt = buildOutputSchema();
|
|
7720
|
-
const evaluatorRawRequest = {
|
|
7721
|
-
aggregator: "llm_judge",
|
|
7722
|
-
userPrompt,
|
|
7723
|
-
systemPrompt,
|
|
7724
|
-
target: judgeProvider.targetName
|
|
7725
|
-
};
|
|
7726
|
-
try {
|
|
7727
|
-
const model = judgeProvider.asLanguageModel?.();
|
|
7728
|
-
if (model) {
|
|
7729
|
-
const { text } = await (0, import_ai2.generateText)({
|
|
7730
|
-
model,
|
|
7731
|
-
system: systemPrompt,
|
|
7732
|
-
prompt: userPrompt
|
|
7733
|
-
});
|
|
7734
|
-
const data2 = freeformEvaluationSchema.parse(parseJsonFromText(text));
|
|
7735
|
-
const score2 = clampScore(data2.score);
|
|
7736
|
-
const hits2 = Array.isArray(data2.hits) ? data2.hits.filter(isNonEmptyString).slice(0, 4) : [];
|
|
7737
|
-
const misses2 = Array.isArray(data2.misses) ? data2.misses.filter(isNonEmptyString).slice(0, 4) : [];
|
|
7738
|
-
const reasoning2 = data2.reasoning;
|
|
7739
|
-
return {
|
|
7740
|
-
score: score2,
|
|
7741
|
-
verdict: scoreToVerdict(score2),
|
|
7742
|
-
hits: hits2,
|
|
7743
|
-
misses: misses2,
|
|
7744
|
-
expectedAspectCount: Math.max(hits2.length + misses2.length, 1),
|
|
7745
|
-
reasoning: reasoning2,
|
|
7746
|
-
evaluatorRawRequest,
|
|
7747
|
-
evaluatorResults
|
|
7748
|
-
};
|
|
7749
|
-
}
|
|
7750
|
-
const response = await judgeProvider.invoke({
|
|
7751
|
-
question: userPrompt,
|
|
7752
|
-
systemPrompt,
|
|
7753
|
-
evalCaseId: context.evalCase.id,
|
|
7754
|
-
attempt: context.attempt
|
|
7755
|
-
});
|
|
7756
|
-
const data = freeformEvaluationSchema.parse(
|
|
7757
|
-
parseJsonFromText(extractLastAssistantContent(response.outputMessages))
|
|
7758
|
-
);
|
|
7759
|
-
const score = clampScore(data.score);
|
|
7760
|
-
const hits = Array.isArray(data.hits) ? data.hits.filter(isNonEmptyString).slice(0, 4) : [];
|
|
7761
|
-
const misses = Array.isArray(data.misses) ? data.misses.filter(isNonEmptyString).slice(0, 4) : [];
|
|
7762
|
-
const reasoning = data.reasoning;
|
|
7696
|
+
if (!Number.isFinite(candidateNum) || !Number.isFinite(expectedNum)) {
|
|
7763
7697
|
return {
|
|
7764
|
-
|
|
7765
|
-
|
|
7766
|
-
|
|
7767
|
-
|
|
7768
|
-
|
|
7769
|
-
reasoning,
|
|
7770
|
-
evaluatorRawRequest,
|
|
7771
|
-
evaluatorResults
|
|
7698
|
+
path: path17,
|
|
7699
|
+
score: 0,
|
|
7700
|
+
weight,
|
|
7701
|
+
hit: false,
|
|
7702
|
+
message: `${path17} (invalid numeric value)`
|
|
7772
7703
|
};
|
|
7773
|
-
}
|
|
7704
|
+
}
|
|
7705
|
+
const diff = Math.abs(candidateNum - expectedNum);
|
|
7706
|
+
let withinTolerance;
|
|
7707
|
+
if (relative) {
|
|
7708
|
+
const relativeDiff = expectedNum === 0 ? diff : diff / Math.abs(expectedNum);
|
|
7709
|
+
withinTolerance = relativeDiff <= tolerance;
|
|
7710
|
+
} else {
|
|
7711
|
+
withinTolerance = diff <= tolerance;
|
|
7712
|
+
}
|
|
7713
|
+
if (withinTolerance) {
|
|
7714
|
+
return {
|
|
7715
|
+
path: path17,
|
|
7716
|
+
score: 1,
|
|
7717
|
+
weight,
|
|
7718
|
+
hit: true,
|
|
7719
|
+
message: `${path17} (within tolerance: diff=${diff.toFixed(2)})`
|
|
7720
|
+
};
|
|
7721
|
+
}
|
|
7722
|
+
return {
|
|
7723
|
+
path: path17,
|
|
7724
|
+
score: 0,
|
|
7725
|
+
weight,
|
|
7726
|
+
hit: false,
|
|
7727
|
+
message: `${path17} (outside tolerance: diff=${diff.toFixed(2)}, tolerance=${tolerance})`
|
|
7728
|
+
};
|
|
7729
|
+
}
|
|
7730
|
+
/**
|
|
7731
|
+
* Date comparison with format normalization.
|
|
7732
|
+
*/
|
|
7733
|
+
compareDate(path17, candidateValue, expectedValue, fieldConfig, weight) {
|
|
7734
|
+
const formats = fieldConfig.formats ?? DEFAULT_DATE_FORMATS;
|
|
7735
|
+
const candidateDate = parseDate(String(candidateValue), formats);
|
|
7736
|
+
const expectedDate = parseDate(String(expectedValue), formats);
|
|
7737
|
+
if (candidateDate === null) {
|
|
7774
7738
|
return {
|
|
7739
|
+
path: path17,
|
|
7775
7740
|
score: 0,
|
|
7776
|
-
|
|
7777
|
-
|
|
7778
|
-
|
|
7779
|
-
|
|
7780
|
-
|
|
7781
|
-
|
|
7741
|
+
weight,
|
|
7742
|
+
hit: false,
|
|
7743
|
+
message: `${path17} (unparseable candidate date)`
|
|
7744
|
+
};
|
|
7745
|
+
}
|
|
7746
|
+
if (expectedDate === null) {
|
|
7747
|
+
return {
|
|
7748
|
+
path: path17,
|
|
7749
|
+
score: 0,
|
|
7750
|
+
weight,
|
|
7751
|
+
hit: false,
|
|
7752
|
+
message: `${path17} (unparseable expected date)`
|
|
7753
|
+
};
|
|
7754
|
+
}
|
|
7755
|
+
if (candidateDate.getFullYear() === expectedDate.getFullYear() && candidateDate.getMonth() === expectedDate.getMonth() && candidateDate.getDate() === expectedDate.getDate()) {
|
|
7756
|
+
return {
|
|
7757
|
+
path: path17,
|
|
7758
|
+
score: 1,
|
|
7759
|
+
weight,
|
|
7760
|
+
hit: true,
|
|
7761
|
+
message: path17
|
|
7782
7762
|
};
|
|
7783
7763
|
}
|
|
7764
|
+
return {
|
|
7765
|
+
path: path17,
|
|
7766
|
+
score: 0,
|
|
7767
|
+
weight,
|
|
7768
|
+
hit: false,
|
|
7769
|
+
message: `${path17} (date mismatch: got ${formatDateISO(candidateDate)}, expected ${formatDateISO(expectedDate)})`
|
|
7770
|
+
};
|
|
7771
|
+
}
|
|
7772
|
+
/**
|
|
7773
|
+
* Aggregate field results using configured strategy.
|
|
7774
|
+
*/
|
|
7775
|
+
aggregateResults(results) {
|
|
7776
|
+
const aggregation = this.config.aggregation ?? "weighted_average";
|
|
7777
|
+
const hits = [];
|
|
7778
|
+
const misses = [];
|
|
7779
|
+
for (const result of results) {
|
|
7780
|
+
if (result.hit) {
|
|
7781
|
+
hits.push(result.message);
|
|
7782
|
+
} else {
|
|
7783
|
+
misses.push(result.message);
|
|
7784
|
+
}
|
|
7785
|
+
}
|
|
7786
|
+
let score;
|
|
7787
|
+
if (aggregation === "all_or_nothing") {
|
|
7788
|
+
score = misses.length === 0 ? 1 : 0;
|
|
7789
|
+
} else {
|
|
7790
|
+
const totalWeight = results.reduce((sum, r) => sum + r.weight, 0);
|
|
7791
|
+
if (totalWeight === 0) {
|
|
7792
|
+
score = results.length === 0 ? 1 : 0;
|
|
7793
|
+
} else {
|
|
7794
|
+
const weightedSum = results.reduce((sum, r) => sum + r.score * r.weight, 0);
|
|
7795
|
+
score = weightedSum / totalWeight;
|
|
7796
|
+
}
|
|
7797
|
+
}
|
|
7798
|
+
const reasoning = `${hits.length}/${results.length} fields matched`;
|
|
7799
|
+
return {
|
|
7800
|
+
score: clampScore(score),
|
|
7801
|
+
verdict: scoreToVerdict(score),
|
|
7802
|
+
hits: hits.slice(0, 4),
|
|
7803
|
+
misses: misses.slice(0, 4),
|
|
7804
|
+
expectedAspectCount: results.length,
|
|
7805
|
+
reasoning
|
|
7806
|
+
};
|
|
7784
7807
|
}
|
|
7785
7808
|
};
|
|
7809
|
+
function resolvePath(obj, path17) {
|
|
7810
|
+
if (!path17 || !obj) {
|
|
7811
|
+
return void 0;
|
|
7812
|
+
}
|
|
7813
|
+
const parts = path17.split(/\.|\[|\]/).filter((p) => p.length > 0);
|
|
7814
|
+
let current = obj;
|
|
7815
|
+
for (const part of parts) {
|
|
7816
|
+
if (current === null || current === void 0) {
|
|
7817
|
+
return void 0;
|
|
7818
|
+
}
|
|
7819
|
+
if (typeof current !== "object") {
|
|
7820
|
+
return void 0;
|
|
7821
|
+
}
|
|
7822
|
+
const isIndex = /^\d+$/.test(part);
|
|
7823
|
+
if (isIndex && Array.isArray(current)) {
|
|
7824
|
+
current = current[Number.parseInt(part, 10)];
|
|
7825
|
+
} else {
|
|
7826
|
+
current = current[part];
|
|
7827
|
+
}
|
|
7828
|
+
}
|
|
7829
|
+
return current;
|
|
7830
|
+
}
|
|
7831
|
+
/**
 * Coerce a value to a number, or report that it is not numeric.
 *
 * Numbers are returned unchanged, except NaN, which is reported as null so
 * that both input kinds share one "not numeric" signal. Strings are parsed
 * with Number.parseFloat, so leading whitespace and trailing non-numeric
 * characters are tolerated ("12%" parses as 12).
 *
 * @param {*} value - Value to coerce.
 * @returns {number|null} The numeric value, or null when the value is NaN,
 *   an unparseable string, or neither a number nor a string.
 */
function toNumber(value) {
  if (typeof value === "number") {
    // A NaN input must not leak through: the string path already maps NaN to null.
    return Number.isNaN(value) ? null : value;
  }
  if (typeof value === "string") {
    const parsed = Number.parseFloat(value);
    return Number.isNaN(parsed) ? null : parsed;
  }
  return null;
}
|
|
7841
|
+
/**
 * Parse a date string into a Date, using format hints to settle ambiguity.
 *
 * Recognized shapes, tried in this order:
 *   1. "D-Mon-YYYY" / "D-Month-YYYY" (month looked up in MONTH_NAMES).
 *   2. "N/N/YYYY" or "N-N-YYYY". The field order comes from `formats`:
 *      only MM/DD-style hints -> month first; only DD/MM-style hints -> day
 *      first; otherwise day first only when the first number cannot be a
 *      month (> 12) and the second can, else month first.
 *   3. Anything else is handed to the Date constructor (ISO 8601 and
 *      whatever else the engine accepts).
 *
 * Shapes 1 and 2 are built at UTC midnight so that reading the calendar day
 * back through toISOString() does not shift with the host time zone.
 * Impossible calendar dates (e.g. 31/02) yield null rather than rolling over
 * into the following month. The explicit shapes are tried before the Date
 * constructor because engines may read "01/02/2024" as month-first, which
 * would bypass a DD/MM hint.
 *
 * @param {string} dateStr - Text to parse; surrounding whitespace is ignored.
 * @param {string[]} [formats=[]] - Format hints such as "DD/MM/YYYY".
 * @returns {Date|null} The parsed date, or null when the input is empty,
 *   not a string, unrecognized, or not a real calendar date.
 */
function parseDate(dateStr, formats = []) {
  if (typeof dateStr !== "string") return null;
  const trimmed = dateStr.trim();
  if (trimmed.length === 0) return null;
  const formatHints = Array.isArray(formats) ? formats.filter((f) => typeof f === "string") : [];

  // Build a UTC-midnight date and reject values that Date would normalize
  // (month 13, Feb 30, ...) by checking that the components round-trip.
  const utcDate = (year, monthIndex, day) => {
    const candidate = new Date(Date.UTC(year, monthIndex, day));
    const roundTrips = candidate.getUTCFullYear() === year && candidate.getUTCMonth() === monthIndex && candidate.getUTCDate() === day;
    return roundTrips ? candidate : null;
  };

  const localizedMatch = trimmed.match(/^(\d{1,2})-([A-Za-z]{3,9})-(\d{4})$/);
  if (localizedMatch) {
    const month = MONTH_NAMES[localizedMatch[2].toLowerCase()];
    if (month !== void 0) {
      const day = Number.parseInt(localizedMatch[1], 10);
      const year = Number.parseInt(localizedMatch[3], 10);
      return utcDate(year, month, day);
    }
  }

  const numericMatch = trimmed.match(/^(\d{1,2})[\/\-](\d{1,2})[\/\-](\d{4})$/);
  if (numericMatch) {
    const first = Number.parseInt(numericMatch[1], 10);
    const second = Number.parseInt(numericMatch[2], 10);
    const year = Number.parseInt(numericMatch[3], 10);
    const hasUSFormat = formatHints.some((f) => f.includes("MM/DD") || f.includes("MM-DD"));
    const hasEUFormat = formatHints.some((f) => f.includes("DD/MM") || f.includes("DD-MM"));
    // An unambiguous hint wins; otherwise infer from which field can be a month.
    const dayFirst = hasUSFormat !== hasEUFormat ? hasEUFormat : first > 12 && second <= 12;
    // No fallback to the Date constructor here: it would accept and roll
    // over values that were just rejected as impossible.
    return dayFirst ? utcDate(year, second - 1, first) : utcDate(year, first - 1, second);
  }

  const fallback = new Date(trimmed);
  return Number.isNaN(fallback.getTime()) ? null : fallback;
}
|
|
7893
|
+
/**
 * Render the calendar-date portion of a Date's ISO 8601 (UTC) representation.
 *
 * @param {Date} date - Date to format; toISOString() throws on an invalid Date.
 * @returns {string} Everything before the "T" separator, e.g. "2024-01-15".
 */
function formatDateISO(date) {
  const [datePortion] = date.toISOString().split("T");
  return datePortion;
}
|
|
7896
|
+
/**
 * Thin pass-through to parseJsonFromText.
 *
 * @param {string} text - Text handed to parseJsonFromText unchanged.
 * @returns {*} Whatever parseJsonFromText returns for the text.
 */
function parseJsonFromTextSafe(text) {
  const parsed = parseJsonFromText(text);
  return parsed;
}
|
|
7899
|
+
|
|
7900
|
+
// src/evaluation/evaluators/latency.ts
|
|
7786
7901
|
var LatencyEvaluator = class {
|
|
7787
7902
|
kind = "latency";
|
|
7788
7903
|
config;
|
|
@@ -7816,56 +7931,16 @@ var LatencyEvaluator = class {
|
|
|
7816
7931
|
misses: passed ? [] : [`Duration ${durationMs}ms > ${threshold}ms threshold`],
|
|
7817
7932
|
expectedAspectCount: 1,
|
|
7818
7933
|
reasoning: `Execution took ${durationMs}ms (threshold: ${threshold}ms)`,
|
|
7819
|
-
evaluatorRawRequest: {
|
|
7820
|
-
type: "latency",
|
|
7821
|
-
threshold,
|
|
7822
|
-
durationMs
|
|
7823
|
-
}
|
|
7824
|
-
};
|
|
7825
|
-
}
|
|
7826
|
-
};
|
|
7827
|
-
var CostEvaluator = class {
|
|
7828
|
-
kind = "cost";
|
|
7829
|
-
config;
|
|
7830
|
-
constructor(options) {
|
|
7831
|
-
this.config = options.config;
|
|
7832
|
-
}
|
|
7833
|
-
evaluate(context) {
|
|
7834
|
-
const { budget } = this.config;
|
|
7835
|
-
const costUsd = context.traceSummary?.costUsd;
|
|
7836
|
-
if (costUsd === void 0) {
|
|
7837
|
-
return {
|
|
7838
|
-
score: 0,
|
|
7839
|
-
verdict: "fail",
|
|
7840
|
-
hits: [],
|
|
7841
|
-
misses: ["No cost data available in trace"],
|
|
7842
|
-
expectedAspectCount: 1,
|
|
7843
|
-
reasoning: "Execution cost not reported by provider",
|
|
7844
|
-
evaluatorRawRequest: {
|
|
7845
|
-
type: "cost",
|
|
7846
|
-
budget,
|
|
7847
|
-
costUsd: null
|
|
7848
|
-
}
|
|
7849
|
-
};
|
|
7850
|
-
}
|
|
7851
|
-
const passed = costUsd <= budget;
|
|
7852
|
-
const score = passed ? 1 : 0;
|
|
7853
|
-
const formatCost = (n) => `$${n.toFixed(4)}`;
|
|
7854
|
-
return {
|
|
7855
|
-
score,
|
|
7856
|
-
verdict: passed ? "pass" : "fail",
|
|
7857
|
-
hits: passed ? [`Cost ${formatCost(costUsd)} <= ${formatCost(budget)} budget`] : [],
|
|
7858
|
-
misses: passed ? [] : [`Cost ${formatCost(costUsd)} > ${formatCost(budget)} budget`],
|
|
7859
|
-
expectedAspectCount: 1,
|
|
7860
|
-
reasoning: `Execution cost ${formatCost(costUsd)} (budget: ${formatCost(budget)})`,
|
|
7861
|
-
evaluatorRawRequest: {
|
|
7862
|
-
type: "cost",
|
|
7863
|
-
budget,
|
|
7864
|
-
costUsd
|
|
7934
|
+
evaluatorRawRequest: {
|
|
7935
|
+
type: "latency",
|
|
7936
|
+
threshold,
|
|
7937
|
+
durationMs
|
|
7865
7938
|
}
|
|
7866
7939
|
};
|
|
7867
7940
|
}
|
|
7868
7941
|
};
|
|
7942
|
+
|
|
7943
|
+
// src/evaluation/evaluators/token-usage.ts
|
|
7869
7944
|
var TokenUsageEvaluator = class {
|
|
7870
7945
|
kind = "token_usage";
|
|
7871
7946
|
config;
|
|
@@ -7949,8 +8024,228 @@ var TokenUsageEvaluator = class {
|
|
|
7949
8024
|
}
|
|
7950
8025
|
};
|
|
7951
8026
|
|
|
8027
|
+
// src/evaluation/evaluators/tool-trajectory.ts
|
|
8028
|
+
/**
 * Decide whether a tool call's actual arguments satisfy an expectation.
 *
 * The match is partial: every key listed in `expected` must be an own
 * property of `actual` with a deeply equal value; extra keys in `actual`
 * are ignored.
 *
 * @param {object|"any"|null|undefined} expected - Expected arguments. A
 *   missing (undefined or null) expectation or the literal "any" places no
 *   constraint.
 * @param {object|null|undefined} actual - Arguments the tool was called with.
 * @returns {boolean} True when the expectation is satisfied.
 */
function argsMatch(expected, actual) {
  // null is treated like undefined: Object.keys(null) would otherwise throw.
  if (expected === void 0 || expected === null || expected === "any") return true;
  // Object.hasOwn throws on null/undefined, so absent arguments are a mismatch.
  if (actual === void 0 || actual === null) return false;
  return Object.keys(expected).every(
    (key) => Object.hasOwn(actual, key) && deepEqual(expected[key], actual[key])
  );
}
|
|
8038
|
+
/**
 * Scores an agent run by the tools it called.
 *
 * Modes (config.mode):
 *   - "any_order": each tool in config.minimums must be called at least the
 *     given number of times; order is irrelevant.
 *   - "in_order": the tools in config.expected must appear in that relative
 *     order; unrelated calls in between are allowed.
 *   - "exact": the call list must match config.expected position by position.
 *
 * The score is the fraction of satisfied expectations.
 */
var ToolTrajectoryEvaluator = class {
  kind = "tool_trajectory";
  config;
  constructor(options) {
    this.config = options.config;
  }
  /**
   * Evaluate the tool usage recorded in the context.
   *
   * Tool calls found in context.outputMessages take precedence; when there
   * are none, context.traceSummary is used as the summary instead. With
   * neither available the result is a failing "no trace" score.
   */
  evaluate(context) {
    const { outputMessages, traceSummary } = context;
    const toolCalls = this.extractToolCallsFromMessages(outputMessages);
    const summary = toolCalls.length > 0 ? this.buildSummary(toolCalls) : traceSummary;
    if (!summary) {
      return {
        score: 0,
        verdict: "fail",
        hits: [],
        misses: ["No trace available for evaluation"],
        expectedAspectCount: 1
      };
    }
    const { mode } = this.config;
    if (mode === "any_order") {
      return this.evaluateAnyOrder(summary);
    }
    if (mode === "in_order") {
      return this.evaluateInOrder(toolCalls);
    }
    if (mode === "exact") {
      return this.evaluateExact(toolCalls);
    }
    return {
      score: 0,
      verdict: "fail",
      hits: [],
      misses: [`Unknown mode: ${mode}`],
      expectedAspectCount: 1
    };
  }
  /**
   * Flatten every message's toolCalls into a list of { name, args } records,
   * preserving message order and call order.
   */
  extractToolCallsFromMessages(messages) {
    if (!messages) {
      return [];
    }
    return [...messages].flatMap(
      (message) => message.toolCalls ? [...message.toolCalls].map((call) => ({ name: call.tool, args: call.input })) : []
    );
  }
  /**
   * Derive per-tool call counts and the sorted list of distinct tool names
   * from extracted tool calls. errorCount is always reported as 0.
   */
  buildSummary(toolCalls) {
    const toolCallsByName = toolCalls.reduce((counts, call) => {
      counts[call.name] = (counts[call.name] ?? 0) + 1;
      return counts;
    }, {});
    return {
      eventCount: toolCalls.length,
      toolNames: Object.keys(toolCallsByName).sort(),
      toolCallsByName,
      errorCount: 0
    };
  }
  /**
   * "any_order" mode: one aspect per tool in config.minimums, satisfied when
   * the summary's call count reaches the required minimum.
   */
  evaluateAnyOrder(summary) {
    const requirements = Object.entries(this.config.minimums ?? {});
    if (requirements.length === 0) {
      return {
        score: 1,
        verdict: "pass",
        hits: ["No tool requirements specified"],
        misses: [],
        expectedAspectCount: 0
      };
    }
    const hits = [];
    const misses = [];
    for (const [toolName, required] of requirements) {
      const actual = summary.toolCallsByName[toolName] ?? 0;
      const bucket = actual >= required ? hits : misses;
      bucket.push(`${toolName}: called ${actual} times (required \u2265${required})`);
    }
    const score = hits.length / requirements.length;
    return {
      score,
      verdict: scoreToVerdict(score),
      hits,
      misses,
      expectedAspectCount: requirements.length
    };
  }
  /**
   * "in_order" mode: scan forward through the calls for each expected tool.
   *
   * The first call with the expected name is consumed whether or not its
   * arguments match; a mismatch is recorded as a miss and the scan for the
   * next expected item resumes after it.
   */
  evaluateInOrder(toolCalls) {
    const expected = this.config.expected ?? [];
    if (expected.length === 0) {
      return {
        score: 1,
        verdict: "pass",
        hits: ["No tool sequence specified"],
        misses: [],
        expectedAspectCount: 0
      };
    }
    const hits = [];
    const misses = [];
    let cursor = 0;
    expected.forEach((expectedItem, position) => {
      const expectedTool = expectedItem.tool;
      // Skip calls to other tools.
      while (cursor < toolCalls.length && toolCalls[cursor].name !== expectedTool) {
        cursor += 1;
      }
      if (cursor >= toolCalls.length) {
        misses.push(`Expected ${expectedTool} at position ${position}, not found in remaining trace`);
        return;
      }
      const foundAt = cursor;
      cursor += 1;
      if (argsMatch(expectedItem.args, toolCalls[foundAt].args)) {
        hits.push(`Found ${expectedTool} at position ${foundAt}`);
      } else {
        misses.push(
          `Expected ${expectedTool} at position ${position}: tool found at ${foundAt} but args mismatch`
        );
      }
    });
    const score = hits.length / expected.length;
    return {
      score,
      verdict: scoreToVerdict(score),
      hits,
      misses,
      expectedAspectCount: expected.length
    };
  }
  /**
   * "exact" mode: compare expected and actual calls index by index. A length
   * difference is itself a miss, and expected items beyond the end of the
   * actual list are reported as "got nothing".
   */
  evaluateExact(toolCalls) {
    const expected = this.config.expected ?? [];
    if (expected.length === 0) {
      return {
        score: 1,
        verdict: "pass",
        hits: ["No tool sequence specified"],
        misses: [],
        expectedAspectCount: 0
      };
    }
    const hits = [];
    const misses = [];
    if (toolCalls.length !== expected.length) {
      misses.push(`Expected ${expected.length} tool calls, got ${toolCalls.length}`);
    }
    expected.forEach((expectedItem, position) => {
      const expectedTool = expectedItem.tool;
      if (position >= toolCalls.length) {
        misses.push(`Position ${position}: expected ${expectedTool}, got nothing`);
        return;
      }
      const actualCall = toolCalls[position];
      if (actualCall.name !== expectedTool) {
        misses.push(`Position ${position}: expected ${expectedTool}, got ${actualCall.name}`);
      } else if (argsMatch(expectedItem.args, actualCall.args)) {
        hits.push(`Position ${position}: ${expectedTool}`);
      } else {
        misses.push(`Position ${position}: ${expectedTool} args mismatch`);
      }
    });
    const score = hits.length / expected.length;
    return {
      score,
      verdict: scoreToVerdict(score),
      hits,
      misses,
      expectedAspectCount: expected.length
    };
  }
};
8079
|
+
expectedAspectCount: 1
|
|
8080
|
+
};
|
|
8081
|
+
}
|
|
8082
|
+
}
|
|
8083
|
+
/**
|
|
8084
|
+
* Extract tool calls from output messages.
|
|
8085
|
+
*/
|
|
8086
|
+
extractToolCallsFromMessages(messages) {
|
|
8087
|
+
if (!messages) {
|
|
8088
|
+
return [];
|
|
8089
|
+
}
|
|
8090
|
+
const toolCalls = [];
|
|
8091
|
+
for (const message of messages) {
|
|
8092
|
+
if (message.toolCalls) {
|
|
8093
|
+
for (const call of message.toolCalls) {
|
|
8094
|
+
toolCalls.push({
|
|
8095
|
+
name: call.tool,
|
|
8096
|
+
args: call.input
|
|
8097
|
+
});
|
|
8098
|
+
}
|
|
8099
|
+
}
|
|
8100
|
+
}
|
|
8101
|
+
return toolCalls;
|
|
8102
|
+
}
|
|
8103
|
+
/**
|
|
8104
|
+
* Build a summary from extracted tool calls.
|
|
8105
|
+
*/
|
|
8106
|
+
buildSummary(toolCalls) {
|
|
8107
|
+
const toolCallsByName = {};
|
|
8108
|
+
for (const call of toolCalls) {
|
|
8109
|
+
toolCallsByName[call.name] = (toolCallsByName[call.name] ?? 0) + 1;
|
|
8110
|
+
}
|
|
8111
|
+
const toolNames = Object.keys(toolCallsByName).sort();
|
|
8112
|
+
return {
|
|
8113
|
+
eventCount: toolCalls.length,
|
|
8114
|
+
toolNames,
|
|
8115
|
+
toolCallsByName,
|
|
8116
|
+
errorCount: 0
|
|
8117
|
+
};
|
|
8118
|
+
}
|
|
8119
|
+
evaluateAnyOrder(summary) {
|
|
8120
|
+
const minimums = this.config.minimums ?? {};
|
|
8121
|
+
const toolNames = Object.keys(minimums);
|
|
8122
|
+
if (toolNames.length === 0) {
|
|
8123
|
+
return {
|
|
8124
|
+
score: 1,
|
|
8125
|
+
verdict: "pass",
|
|
8126
|
+
hits: ["No tool requirements specified"],
|
|
8127
|
+
misses: [],
|
|
8128
|
+
expectedAspectCount: 0
|
|
8129
|
+
};
|
|
8130
|
+
}
|
|
8131
|
+
const hits = [];
|
|
8132
|
+
const misses = [];
|
|
8133
|
+
for (const toolName of toolNames) {
|
|
8134
|
+
const required = minimums[toolName];
|
|
8135
|
+
const actual = summary.toolCallsByName[toolName] ?? 0;
|
|
8136
|
+
if (actual >= required) {
|
|
8137
|
+
hits.push(`${toolName}: called ${actual} times (required \u2265${required})`);
|
|
8138
|
+
} else {
|
|
8139
|
+
misses.push(`${toolName}: called ${actual} times (required \u2265${required})`);
|
|
8140
|
+
}
|
|
8141
|
+
}
|
|
8142
|
+
const score = hits.length / toolNames.length;
|
|
8143
|
+
return {
|
|
8144
|
+
score,
|
|
8145
|
+
verdict: scoreToVerdict(score),
|
|
8146
|
+
hits,
|
|
8147
|
+
misses,
|
|
8148
|
+
expectedAspectCount: toolNames.length
|
|
8149
|
+
};
|
|
8150
|
+
}
|
|
8151
|
+
evaluateInOrder(toolCalls) {
|
|
8152
|
+
const expected = this.config.expected ?? [];
|
|
8153
|
+
if (expected.length === 0) {
|
|
8154
|
+
return {
|
|
8155
|
+
score: 1,
|
|
8156
|
+
verdict: "pass",
|
|
8157
|
+
hits: ["No tool sequence specified"],
|
|
8158
|
+
misses: [],
|
|
8159
|
+
expectedAspectCount: 0
|
|
8160
|
+
};
|
|
8161
|
+
}
|
|
8162
|
+
const hits = [];
|
|
8163
|
+
const misses = [];
|
|
8164
|
+
let actualIndex = 0;
|
|
8165
|
+
for (let i = 0; i < expected.length; i++) {
|
|
8166
|
+
const expectedItem = expected[i];
|
|
8167
|
+
const expectedTool = expectedItem.tool;
|
|
8168
|
+
let found = false;
|
|
8169
|
+
let argsMismatch = false;
|
|
8170
|
+
while (actualIndex < toolCalls.length) {
|
|
8171
|
+
const actualCall = toolCalls[actualIndex];
|
|
8172
|
+
if (actualCall.name === expectedTool) {
|
|
8173
|
+
if (argsMatch(expectedItem.args, actualCall.args)) {
|
|
8174
|
+
hits.push(`Found ${expectedTool} at position ${actualIndex}`);
|
|
8175
|
+
actualIndex++;
|
|
8176
|
+
found = true;
|
|
8177
|
+
break;
|
|
8178
|
+
}
|
|
8179
|
+
misses.push(
|
|
8180
|
+
`Expected ${expectedTool} at position ${i}: tool found at ${actualIndex} but args mismatch`
|
|
8181
|
+
);
|
|
8182
|
+
actualIndex++;
|
|
8183
|
+
argsMismatch = true;
|
|
8184
|
+
break;
|
|
8185
|
+
}
|
|
8186
|
+
actualIndex++;
|
|
8187
|
+
}
|
|
8188
|
+
if (!found && !argsMismatch) {
|
|
8189
|
+
misses.push(`Expected ${expectedTool} at position ${i}, not found in remaining trace`);
|
|
8190
|
+
}
|
|
8191
|
+
}
|
|
8192
|
+
const score = hits.length / expected.length;
|
|
8193
|
+
return {
|
|
8194
|
+
score,
|
|
8195
|
+
verdict: scoreToVerdict(score),
|
|
8196
|
+
hits,
|
|
8197
|
+
misses,
|
|
8198
|
+
expectedAspectCount: expected.length
|
|
8199
|
+
};
|
|
8200
|
+
}
|
|
8201
|
+
evaluateExact(toolCalls) {
|
|
8202
|
+
const expected = this.config.expected ?? [];
|
|
8203
|
+
if (expected.length === 0) {
|
|
8204
|
+
return {
|
|
8205
|
+
score: 1,
|
|
8206
|
+
verdict: "pass",
|
|
8207
|
+
hits: ["No tool sequence specified"],
|
|
8208
|
+
misses: [],
|
|
8209
|
+
expectedAspectCount: 0
|
|
8210
|
+
};
|
|
8211
|
+
}
|
|
8212
|
+
const hits = [];
|
|
8213
|
+
const misses = [];
|
|
8214
|
+
if (toolCalls.length !== expected.length) {
|
|
8215
|
+
misses.push(`Expected ${expected.length} tool calls, got ${toolCalls.length}`);
|
|
8216
|
+
}
|
|
8217
|
+
const checkLength = Math.min(expected.length, toolCalls.length);
|
|
8218
|
+
for (let i = 0; i < checkLength; i++) {
|
|
8219
|
+
const expectedItem = expected[i];
|
|
8220
|
+
const expectedTool = expectedItem.tool;
|
|
8221
|
+
const actualCall = toolCalls[i];
|
|
8222
|
+
const actualTool = actualCall.name;
|
|
8223
|
+
if (actualTool === expectedTool) {
|
|
8224
|
+
if (argsMatch(expectedItem.args, actualCall.args)) {
|
|
8225
|
+
hits.push(`Position ${i}: ${expectedTool}`);
|
|
8226
|
+
} else {
|
|
8227
|
+
misses.push(`Position ${i}: ${expectedTool} args mismatch`);
|
|
8228
|
+
}
|
|
8229
|
+
} else {
|
|
8230
|
+
misses.push(`Position ${i}: expected ${expectedTool}, got ${actualTool}`);
|
|
8231
|
+
}
|
|
8232
|
+
}
|
|
8233
|
+
for (let i = checkLength; i < expected.length; i++) {
|
|
8234
|
+
misses.push(`Position ${i}: expected ${expected[i].tool}, got nothing`);
|
|
8235
|
+
}
|
|
8236
|
+
const score = hits.length / expected.length;
|
|
8237
|
+
return {
|
|
8238
|
+
score,
|
|
8239
|
+
verdict: scoreToVerdict(score),
|
|
8240
|
+
hits,
|
|
8241
|
+
misses,
|
|
8242
|
+
expectedAspectCount: expected.length
|
|
8243
|
+
};
|
|
8244
|
+
}
|
|
8245
|
+
};
|
|
8246
|
+
|
|
7952
8247
|
// src/evaluation/orchestrator.ts
|
|
7953
|
-
var
|
|
8248
|
+
var import_node_crypto5 = require("crypto");
|
|
7954
8249
|
var import_node_path16 = __toESM(require("path"), 1);
|
|
7955
8250
|
|
|
7956
8251
|
// ../../node_modules/.bun/yocto-queue@1.2.2/node_modules/yocto-queue/index.js
|
|
@@ -8162,6 +8457,17 @@ async function runEvaluation(options) {
|
|
|
8162
8457
|
}
|
|
8163
8458
|
return getOrCreateProvider(resolvedJudge);
|
|
8164
8459
|
};
|
|
8460
|
+
const targetResolver = (name) => {
|
|
8461
|
+
const resolved = resolveTargetByName(name);
|
|
8462
|
+
if (!resolved) {
|
|
8463
|
+
return void 0;
|
|
8464
|
+
}
|
|
8465
|
+
return getOrCreateProvider(resolved);
|
|
8466
|
+
};
|
|
8467
|
+
const availableTargets = [
|
|
8468
|
+
target.name,
|
|
8469
|
+
...Array.from(targetDefinitions.keys())
|
|
8470
|
+
];
|
|
8165
8471
|
const evaluatorRegistry = buildEvaluatorRegistry(evaluators, resolveJudgeProvider);
|
|
8166
8472
|
const primaryProvider = getOrCreateProvider(target);
|
|
8167
8473
|
const providerSupportsBatch = target.providerBatching === true && primaryProvider.supportsBatch === true && typeof primaryProvider.invokeBatch === "function";
|
|
@@ -8191,7 +8497,9 @@ async function runEvaluation(options) {
|
|
|
8191
8497
|
onResult,
|
|
8192
8498
|
verbose,
|
|
8193
8499
|
resolveJudgeProvider,
|
|
8194
|
-
agentTimeoutMs
|
|
8500
|
+
agentTimeoutMs,
|
|
8501
|
+
targetResolver,
|
|
8502
|
+
availableTargets
|
|
8195
8503
|
});
|
|
8196
8504
|
} catch (error) {
|
|
8197
8505
|
if (verbose) {
|
|
@@ -8230,7 +8538,9 @@ async function runEvaluation(options) {
|
|
|
8230
8538
|
cache,
|
|
8231
8539
|
useCache,
|
|
8232
8540
|
now,
|
|
8233
|
-
judgeProvider
|
|
8541
|
+
judgeProvider,
|
|
8542
|
+
targetResolver,
|
|
8543
|
+
availableTargets
|
|
8234
8544
|
});
|
|
8235
8545
|
if (onProgress) {
|
|
8236
8546
|
await onProgress({
|
|
@@ -8297,7 +8607,9 @@ async function runBatchEvaluation(options) {
|
|
|
8297
8607
|
onProgress,
|
|
8298
8608
|
onResult,
|
|
8299
8609
|
resolveJudgeProvider,
|
|
8300
|
-
agentTimeoutMs
|
|
8610
|
+
agentTimeoutMs,
|
|
8611
|
+
targetResolver,
|
|
8612
|
+
availableTargets
|
|
8301
8613
|
} = options;
|
|
8302
8614
|
const promptInputsList = [];
|
|
8303
8615
|
const formattingMode = usesFileReferencePrompt(provider) ? "agent" : "lm";
|
|
@@ -8356,7 +8668,7 @@ async function runBatchEvaluation(options) {
|
|
|
8356
8668
|
costUsd: providerResponse.costUsd,
|
|
8357
8669
|
durationMs: providerResponse.durationMs
|
|
8358
8670
|
}) : void 0;
|
|
8359
|
-
const candidate =
|
|
8671
|
+
const candidate = extractLastAssistantContent2(outputMessages);
|
|
8360
8672
|
const providerError = extractProviderError(providerResponse);
|
|
8361
8673
|
let result;
|
|
8362
8674
|
try {
|
|
@@ -8372,7 +8684,9 @@ async function runBatchEvaluation(options) {
|
|
|
8372
8684
|
judgeProvider: await resolveJudgeProvider(target),
|
|
8373
8685
|
agentTimeoutMs,
|
|
8374
8686
|
outputMessages,
|
|
8375
|
-
traceSummary
|
|
8687
|
+
traceSummary,
|
|
8688
|
+
targetResolver,
|
|
8689
|
+
availableTargets
|
|
8376
8690
|
});
|
|
8377
8691
|
if (providerError) {
|
|
8378
8692
|
result = { ...result, error: providerError };
|
|
@@ -8430,7 +8744,9 @@ async function runEvalCase(options) {
|
|
|
8430
8744
|
cache,
|
|
8431
8745
|
useCache,
|
|
8432
8746
|
signal,
|
|
8433
|
-
judgeProvider
|
|
8747
|
+
judgeProvider,
|
|
8748
|
+
targetResolver,
|
|
8749
|
+
availableTargets
|
|
8434
8750
|
} = options;
|
|
8435
8751
|
const formattingMode = usesFileReferencePrompt(provider) ? "agent" : "lm";
|
|
8436
8752
|
const promptInputs = await buildPromptInputs(evalCase, formattingMode);
|
|
@@ -8489,7 +8805,7 @@ async function runEvalCase(options) {
|
|
|
8489
8805
|
costUsd: providerResponse.costUsd,
|
|
8490
8806
|
durationMs: providerResponse.durationMs
|
|
8491
8807
|
}) : void 0;
|
|
8492
|
-
const candidate =
|
|
8808
|
+
const candidate = extractLastAssistantContent2(outputMessages);
|
|
8493
8809
|
const providerError = extractProviderError(providerResponse);
|
|
8494
8810
|
try {
|
|
8495
8811
|
const result = await evaluateCandidate({
|
|
@@ -8504,7 +8820,9 @@ async function runEvalCase(options) {
|
|
|
8504
8820
|
judgeProvider,
|
|
8505
8821
|
agentTimeoutMs,
|
|
8506
8822
|
outputMessages,
|
|
8507
|
-
traceSummary
|
|
8823
|
+
traceSummary,
|
|
8824
|
+
targetResolver,
|
|
8825
|
+
availableTargets
|
|
8508
8826
|
});
|
|
8509
8827
|
return providerError ? { ...result, error: providerError } : result;
|
|
8510
8828
|
} catch (error) {
|
|
@@ -8524,7 +8842,9 @@ async function evaluateCandidate(options) {
|
|
|
8524
8842
|
judgeProvider,
|
|
8525
8843
|
agentTimeoutMs,
|
|
8526
8844
|
outputMessages,
|
|
8527
|
-
traceSummary
|
|
8845
|
+
traceSummary,
|
|
8846
|
+
targetResolver,
|
|
8847
|
+
availableTargets
|
|
8528
8848
|
} = options;
|
|
8529
8849
|
const gradeTimestamp = nowFn();
|
|
8530
8850
|
const { score, evaluatorResults } = await runEvaluatorsForCase({
|
|
@@ -8539,7 +8859,9 @@ async function evaluateCandidate(options) {
|
|
|
8539
8859
|
judgeProvider,
|
|
8540
8860
|
agentTimeoutMs,
|
|
8541
8861
|
outputMessages,
|
|
8542
|
-
traceSummary
|
|
8862
|
+
traceSummary,
|
|
8863
|
+
targetResolver,
|
|
8864
|
+
availableTargets
|
|
8543
8865
|
});
|
|
8544
8866
|
const completedAt = nowFn();
|
|
8545
8867
|
let agentProviderRequest;
|
|
@@ -8592,7 +8914,9 @@ async function runEvaluatorsForCase(options) {
|
|
|
8592
8914
|
judgeProvider,
|
|
8593
8915
|
agentTimeoutMs,
|
|
8594
8916
|
outputMessages,
|
|
8595
|
-
traceSummary
|
|
8917
|
+
traceSummary,
|
|
8918
|
+
targetResolver,
|
|
8919
|
+
availableTargets
|
|
8596
8920
|
} = options;
|
|
8597
8921
|
if (evalCase.evaluators && evalCase.evaluators.length > 0) {
|
|
8598
8922
|
return runEvaluatorList({
|
|
@@ -8608,7 +8932,9 @@ async function runEvaluatorsForCase(options) {
|
|
|
8608
8932
|
judgeProvider,
|
|
8609
8933
|
agentTimeoutMs,
|
|
8610
8934
|
outputMessages,
|
|
8611
|
-
traceSummary
|
|
8935
|
+
traceSummary,
|
|
8936
|
+
targetResolver,
|
|
8937
|
+
availableTargets
|
|
8612
8938
|
});
|
|
8613
8939
|
}
|
|
8614
8940
|
const evaluatorKind = evalCase.evaluator ?? "llm_judge";
|
|
@@ -8626,7 +8952,9 @@ async function runEvaluatorsForCase(options) {
|
|
|
8626
8952
|
now,
|
|
8627
8953
|
judgeProvider,
|
|
8628
8954
|
outputMessages,
|
|
8629
|
-
traceSummary
|
|
8955
|
+
traceSummary,
|
|
8956
|
+
targetResolver,
|
|
8957
|
+
availableTargets
|
|
8630
8958
|
});
|
|
8631
8959
|
return { score };
|
|
8632
8960
|
}
|
|
@@ -8644,7 +8972,9 @@ async function runEvaluatorList(options) {
|
|
|
8644
8972
|
judgeProvider,
|
|
8645
8973
|
agentTimeoutMs,
|
|
8646
8974
|
outputMessages,
|
|
8647
|
-
traceSummary
|
|
8975
|
+
traceSummary,
|
|
8976
|
+
targetResolver,
|
|
8977
|
+
availableTargets
|
|
8648
8978
|
} = options;
|
|
8649
8979
|
const scored = [];
|
|
8650
8980
|
const evaluatorResults = [];
|
|
@@ -8682,7 +9012,8 @@ async function runEvaluatorList(options) {
|
|
|
8682
9012
|
script: evaluator.script,
|
|
8683
9013
|
cwd: evaluator.resolvedCwd ?? evaluator.cwd,
|
|
8684
9014
|
agentTimeoutMs,
|
|
8685
|
-
config: evaluator.config
|
|
9015
|
+
config: evaluator.config,
|
|
9016
|
+
target: evaluator.target
|
|
8686
9017
|
});
|
|
8687
9018
|
const score2 = await codeEvaluator.evaluate({
|
|
8688
9019
|
evalCase,
|
|
@@ -8692,8 +9023,11 @@ async function runEvaluatorList(options) {
|
|
|
8692
9023
|
attempt,
|
|
8693
9024
|
promptInputs,
|
|
8694
9025
|
now,
|
|
9026
|
+
judgeProvider,
|
|
8695
9027
|
outputMessages,
|
|
8696
|
-
traceSummary
|
|
9028
|
+
traceSummary,
|
|
9029
|
+
targetResolver,
|
|
9030
|
+
availableTargets
|
|
8697
9031
|
});
|
|
8698
9032
|
const weight = evaluator.weight ?? 1;
|
|
8699
9033
|
scored.push({ score: score2, name: evaluator.name, type: "code_judge", weight });
|
|
@@ -8706,7 +9040,8 @@ async function runEvaluatorList(options) {
|
|
|
8706
9040
|
hits: score2.hits,
|
|
8707
9041
|
misses: score2.misses,
|
|
8708
9042
|
reasoning: score2.reasoning,
|
|
8709
|
-
evaluatorProviderRequest: score2.evaluatorRawRequest
|
|
9043
|
+
evaluatorProviderRequest: score2.evaluatorRawRequest,
|
|
9044
|
+
details: score2.details
|
|
8710
9045
|
});
|
|
8711
9046
|
}
|
|
8712
9047
|
if (evaluator.type === "composite") {
|
|
@@ -8720,7 +9055,8 @@ async function runEvaluatorList(options) {
|
|
|
8720
9055
|
script: memberConfig.script,
|
|
8721
9056
|
cwd: memberConfig.resolvedCwd ?? memberConfig.cwd,
|
|
8722
9057
|
agentTimeoutMs,
|
|
8723
|
-
config: memberConfig.config
|
|
9058
|
+
config: memberConfig.config,
|
|
9059
|
+
target: memberConfig.target
|
|
8724
9060
|
});
|
|
8725
9061
|
case "composite":
|
|
8726
9062
|
return new CompositeEvaluator({
|
|
@@ -8769,7 +9105,9 @@ async function runEvaluatorList(options) {
|
|
|
8769
9105
|
now,
|
|
8770
9106
|
judgeProvider,
|
|
8771
9107
|
outputMessages,
|
|
8772
|
-
traceSummary
|
|
9108
|
+
traceSummary,
|
|
9109
|
+
targetResolver,
|
|
9110
|
+
availableTargets
|
|
8773
9111
|
});
|
|
8774
9112
|
const weight = evaluator.weight ?? 1;
|
|
8775
9113
|
scored.push({ score: score2, name: evaluator.name, type: evaluator.type, weight });
|
|
@@ -8965,11 +9303,11 @@ async function runEvaluatorList(options) {
|
|
|
8965
9303
|
(total, entry) => total + (entry.score.expectedAspectCount ?? 0),
|
|
8966
9304
|
0
|
|
8967
9305
|
);
|
|
8968
|
-
const reasoningParts = scored.map((entry) => entry.score.reasoning ? `${entry.name}: ${entry.score.reasoning}` : void 0).filter(
|
|
9306
|
+
const reasoningParts = scored.map((entry) => entry.score.reasoning ? `${entry.name}: ${entry.score.reasoning}` : void 0).filter(isNonEmptyString);
|
|
8969
9307
|
const reasoning = reasoningParts.length > 0 ? reasoningParts.join(" | ") : void 0;
|
|
8970
9308
|
const score = {
|
|
8971
9309
|
score: aggregateScore,
|
|
8972
|
-
verdict:
|
|
9310
|
+
verdict: scoreToVerdict(aggregateScore),
|
|
8973
9311
|
hits,
|
|
8974
9312
|
misses,
|
|
8975
9313
|
expectedAspectCount,
|
|
@@ -9016,18 +9354,6 @@ async function resolveCustomPrompt(config) {
|
|
|
9016
9354
|
}
|
|
9017
9355
|
return config.prompt;
|
|
9018
9356
|
}
|
|
9019
|
-
function isNonEmptyString2(value) {
|
|
9020
|
-
return typeof value === "string" && value.trim().length > 0;
|
|
9021
|
-
}
|
|
9022
|
-
function scoreToVerdict2(score) {
|
|
9023
|
-
if (score >= 0.8) {
|
|
9024
|
-
return "pass";
|
|
9025
|
-
}
|
|
9026
|
-
if (score >= 0.6) {
|
|
9027
|
-
return "borderline";
|
|
9028
|
-
}
|
|
9029
|
-
return "fail";
|
|
9030
|
-
}
|
|
9031
9357
|
function filterEvalCases(evalCases, evalId) {
|
|
9032
9358
|
if (!evalId) {
|
|
9033
9359
|
return evalCases;
|
|
@@ -9129,7 +9455,7 @@ function extractProviderError(response) {
|
|
|
9129
9455
|
return trimmed.length > 0 ? trimmed : void 0;
|
|
9130
9456
|
}
|
|
9131
9457
|
function createCacheKey(provider, target, evalCase, promptInputs) {
|
|
9132
|
-
const hash = (0,
|
|
9458
|
+
const hash = (0, import_node_crypto5.createHash)("sha256");
|
|
9133
9459
|
hash.update(provider.id);
|
|
9134
9460
|
hash.update(target.name);
|
|
9135
9461
|
hash.update(evalCase.id);
|
|
@@ -9170,7 +9496,8 @@ function mapChildResults(children) {
|
|
|
9170
9496
|
misses: child.misses,
|
|
9171
9497
|
reasoning: child.reasoning,
|
|
9172
9498
|
evaluatorProviderRequest: child.evaluatorRawRequest,
|
|
9173
|
-
evaluatorResults: mapChildResults(child.evaluatorResults)
|
|
9499
|
+
evaluatorResults: mapChildResults(child.evaluatorResults),
|
|
9500
|
+
details: child.details
|
|
9174
9501
|
}));
|
|
9175
9502
|
}
|
|
9176
9503
|
function computeWeightedMean(entries) {
|
|
@@ -9185,7 +9512,7 @@ function computeWeightedMean(entries) {
|
|
|
9185
9512
|
}
|
|
9186
9513
|
|
|
9187
9514
|
// src/evaluation/generators/rubric-generator.ts
|
|
9188
|
-
var
|
|
9515
|
+
var import_ai4 = require("ai");
|
|
9189
9516
|
var import_zod4 = require("zod");
|
|
9190
9517
|
var rubricItemSchema = import_zod4.z.object({
|
|
9191
9518
|
id: import_zod4.z.string().describe("Short identifier for this rubric (e.g., clarity, completeness)"),
|
|
@@ -9219,7 +9546,7 @@ You must return a valid JSON object matching this schema:
|
|
|
9219
9546
|
let lastError;
|
|
9220
9547
|
for (let attempt = 1; attempt <= 3; attempt++) {
|
|
9221
9548
|
try {
|
|
9222
|
-
const { text } = await (0,
|
|
9549
|
+
const { text } = await (0, import_ai4.generateText)({
|
|
9223
9550
|
model,
|
|
9224
9551
|
system,
|
|
9225
9552
|
prompt
|
|
@@ -9282,31 +9609,39 @@ function createAgentKernel() {
|
|
|
9282
9609
|
ToolTrajectoryEvaluator,
|
|
9283
9610
|
avgToolDurationMs,
|
|
9284
9611
|
buildDirectoryChain,
|
|
9612
|
+
buildOutputSchema,
|
|
9285
9613
|
buildPromptInputs,
|
|
9286
9614
|
buildSearchRoots,
|
|
9615
|
+
clampScore,
|
|
9287
9616
|
computeTraceSummary,
|
|
9288
9617
|
consumeClaudeCodeLogEntries,
|
|
9289
9618
|
consumeCodexLogEntries,
|
|
9290
9619
|
consumePiLogEntries,
|
|
9291
9620
|
createAgentKernel,
|
|
9292
9621
|
createProvider,
|
|
9622
|
+
deepEqual,
|
|
9293
9623
|
ensureVSCodeSubagents,
|
|
9624
|
+
executeScript,
|
|
9294
9625
|
explorationRatio,
|
|
9295
|
-
|
|
9626
|
+
extractJsonBlob,
|
|
9296
9627
|
fileExists,
|
|
9297
9628
|
findGitRoot,
|
|
9629
|
+
freeformEvaluationSchema,
|
|
9298
9630
|
generateRubrics,
|
|
9299
9631
|
getHitCount,
|
|
9300
9632
|
isEvaluatorKind,
|
|
9301
9633
|
isGuidelineFile,
|
|
9302
9634
|
isJsonObject,
|
|
9303
9635
|
isJsonValue,
|
|
9636
|
+
isNonEmptyString,
|
|
9304
9637
|
isTestMessage,
|
|
9305
9638
|
isTestMessageRole,
|
|
9306
9639
|
listTargetNames,
|
|
9307
9640
|
loadEvalCases,
|
|
9308
9641
|
mergeExecutionMetrics,
|
|
9309
9642
|
normalizeLineEndings,
|
|
9643
|
+
parseJsonFromText,
|
|
9644
|
+
parseJsonSafe,
|
|
9310
9645
|
readJsonFile,
|
|
9311
9646
|
readTargetDefinitions,
|
|
9312
9647
|
readTestSuiteMetadata,
|
|
@@ -9316,6 +9651,7 @@ function createAgentKernel() {
|
|
|
9316
9651
|
resolveTargetDefinition,
|
|
9317
9652
|
runEvalCase,
|
|
9318
9653
|
runEvaluation,
|
|
9654
|
+
scoreToVerdict,
|
|
9319
9655
|
subscribeToClaudeCodeLogEntries,
|
|
9320
9656
|
subscribeToCodexLogEntries,
|
|
9321
9657
|
subscribeToPiLogEntries,
|