@agentv/core 2.0.1 → 2.1.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/dist/{chunk-IBTKEEOT.js → chunk-KDEP4I7G.js} +44 -1
- package/dist/chunk-KDEP4I7G.js.map +1 -0
- package/dist/evaluation/validation/index.cjs +1 -0
- package/dist/evaluation/validation/index.cjs.map +1 -1
- package/dist/evaluation/validation/index.js +1 -1
- package/dist/index.cjs +1641 -1138
- package/dist/index.cjs.map +1 -1
- package/dist/index.d.cts +157 -100
- package/dist/index.d.ts +157 -100
- package/dist/index.js +1451 -997
- package/dist/index.js.map +1 -1
- package/package.json +4 -1
- package/dist/chunk-IBTKEEOT.js.map +0 -1
package/dist/index.cjs
CHANGED
|
@@ -42,33 +42,39 @@ __export(index_exports, {
|
|
|
42
42
|
ToolTrajectoryEvaluator: () => ToolTrajectoryEvaluator,
|
|
43
43
|
avgToolDurationMs: () => avgToolDurationMs,
|
|
44
44
|
buildDirectoryChain: () => buildDirectoryChain2,
|
|
45
|
+
buildOutputSchema: () => buildOutputSchema,
|
|
45
46
|
buildPromptInputs: () => buildPromptInputs,
|
|
46
47
|
buildSearchRoots: () => buildSearchRoots2,
|
|
48
|
+
clampScore: () => clampScore,
|
|
47
49
|
computeTraceSummary: () => computeTraceSummary,
|
|
48
50
|
consumeClaudeCodeLogEntries: () => consumeClaudeCodeLogEntries,
|
|
49
51
|
consumeCodexLogEntries: () => consumeCodexLogEntries,
|
|
50
52
|
consumePiLogEntries: () => consumePiLogEntries,
|
|
51
53
|
createAgentKernel: () => createAgentKernel,
|
|
52
54
|
createProvider: () => createProvider,
|
|
55
|
+
deepEqual: () => deepEqual,
|
|
53
56
|
ensureVSCodeSubagents: () => ensureVSCodeSubagents,
|
|
57
|
+
executeScript: () => executeScript,
|
|
54
58
|
explorationRatio: () => explorationRatio,
|
|
55
|
-
|
|
59
|
+
extractJsonBlob: () => extractJsonBlob,
|
|
56
60
|
fileExists: () => fileExists2,
|
|
57
61
|
findGitRoot: () => findGitRoot,
|
|
62
|
+
freeformEvaluationSchema: () => freeformEvaluationSchema,
|
|
58
63
|
generateRubrics: () => generateRubrics,
|
|
59
64
|
getHitCount: () => getHitCount,
|
|
60
65
|
isEvaluatorKind: () => isEvaluatorKind,
|
|
61
66
|
isGuidelineFile: () => isGuidelineFile,
|
|
62
67
|
isJsonObject: () => isJsonObject,
|
|
63
68
|
isJsonValue: () => isJsonValue,
|
|
69
|
+
isNonEmptyString: () => isNonEmptyString,
|
|
64
70
|
isTestMessage: () => isTestMessage,
|
|
65
71
|
isTestMessageRole: () => isTestMessageRole,
|
|
66
72
|
listTargetNames: () => listTargetNames,
|
|
67
73
|
loadEvalCases: () => loadEvalCases,
|
|
68
74
|
mergeExecutionMetrics: () => mergeExecutionMetrics,
|
|
69
75
|
normalizeLineEndings: () => normalizeLineEndings,
|
|
70
|
-
|
|
71
|
-
|
|
76
|
+
parseJsonFromText: () => parseJsonFromText,
|
|
77
|
+
parseJsonSafe: () => parseJsonSafe,
|
|
72
78
|
readJsonFile: () => readJsonFile,
|
|
73
79
|
readTargetDefinitions: () => readTargetDefinitions,
|
|
74
80
|
readTestSuiteMetadata: () => readTestSuiteMetadata,
|
|
@@ -78,6 +84,7 @@ __export(index_exports, {
|
|
|
78
84
|
resolveTargetDefinition: () => resolveTargetDefinition,
|
|
79
85
|
runEvalCase: () => runEvalCase,
|
|
80
86
|
runEvaluation: () => runEvaluation,
|
|
87
|
+
scoreToVerdict: () => scoreToVerdict,
|
|
81
88
|
subscribeToClaudeCodeLogEntries: () => subscribeToClaudeCodeLogEntries,
|
|
82
89
|
subscribeToCodexLogEntries: () => subscribeToCodexLogEntries,
|
|
83
90
|
subscribeToPiLogEntries: () => subscribeToPiLogEntries,
|
|
@@ -223,85 +230,6 @@ var import_promises6 = require("fs/promises");
|
|
|
223
230
|
var import_node_path6 = __toESM(require("path"), 1);
|
|
224
231
|
var import_yaml2 = require("yaml");
|
|
225
232
|
|
|
226
|
-
// src/evaluation/formatting/segment-formatter.ts
|
|
227
|
-
function extractCodeBlocks(segments) {
|
|
228
|
-
const CODE_BLOCK_PATTERN = /```[\s\S]*?```/g;
|
|
229
|
-
const codeBlocks = [];
|
|
230
|
-
for (const segment of segments) {
|
|
231
|
-
const typeValue = segment.type;
|
|
232
|
-
if (typeof typeValue !== "string" || typeValue !== "text") {
|
|
233
|
-
continue;
|
|
234
|
-
}
|
|
235
|
-
const textValue = segment.value;
|
|
236
|
-
if (typeof textValue !== "string") {
|
|
237
|
-
continue;
|
|
238
|
-
}
|
|
239
|
-
const matches = textValue.match(CODE_BLOCK_PATTERN);
|
|
240
|
-
if (matches) {
|
|
241
|
-
codeBlocks.push(...matches);
|
|
242
|
-
}
|
|
243
|
-
}
|
|
244
|
-
return codeBlocks;
|
|
245
|
-
}
|
|
246
|
-
function formatFileContents(parts) {
|
|
247
|
-
const fileCount = parts.filter((p) => p.isFile).length;
|
|
248
|
-
if (fileCount > 0) {
|
|
249
|
-
return parts.map((part) => {
|
|
250
|
-
if (part.isFile && part.displayPath) {
|
|
251
|
-
return `<file path="${part.displayPath}">
|
|
252
|
-
${part.content}
|
|
253
|
-
</file>`;
|
|
254
|
-
}
|
|
255
|
-
return part.content;
|
|
256
|
-
}).join("\n\n");
|
|
257
|
-
}
|
|
258
|
-
return parts.map((p) => p.content).join(" ");
|
|
259
|
-
}
|
|
260
|
-
function formatSegment(segment, mode = "lm") {
|
|
261
|
-
const type = asString(segment.type);
|
|
262
|
-
if (type === "text") {
|
|
263
|
-
return asString(segment.value);
|
|
264
|
-
}
|
|
265
|
-
if (type === "guideline_ref") {
|
|
266
|
-
const refPath = asString(segment.path);
|
|
267
|
-
return refPath ? `<Attached: ${refPath}>` : void 0;
|
|
268
|
-
}
|
|
269
|
-
if (type === "file") {
|
|
270
|
-
const filePath = asString(segment.path);
|
|
271
|
-
if (!filePath) {
|
|
272
|
-
return void 0;
|
|
273
|
-
}
|
|
274
|
-
if (mode === "agent") {
|
|
275
|
-
return `<file: path="${filePath}">`;
|
|
276
|
-
}
|
|
277
|
-
const text = asString(segment.text);
|
|
278
|
-
if (text && filePath) {
|
|
279
|
-
return formatFileContents([{ content: text.trim(), isFile: true, displayPath: filePath }]);
|
|
280
|
-
}
|
|
281
|
-
}
|
|
282
|
-
return void 0;
|
|
283
|
-
}
|
|
284
|
-
function hasVisibleContent(segments) {
|
|
285
|
-
return segments.some((segment) => {
|
|
286
|
-
const type = asString(segment.type);
|
|
287
|
-
if (type === "text") {
|
|
288
|
-
const value = asString(segment.value);
|
|
289
|
-
return value !== void 0 && value.trim().length > 0;
|
|
290
|
-
}
|
|
291
|
-
if (type === "guideline_ref") {
|
|
292
|
-
return false;
|
|
293
|
-
}
|
|
294
|
-
if (type === "file") {
|
|
295
|
-
const text = asString(segment.text);
|
|
296
|
-
return text !== void 0 && text.trim().length > 0;
|
|
297
|
-
}
|
|
298
|
-
return false;
|
|
299
|
-
});
|
|
300
|
-
}
|
|
301
|
-
function asString(value) {
|
|
302
|
-
return typeof value === "string" ? value : void 0;
|
|
303
|
-
}
|
|
304
|
-
|
|
305
233
|
// src/evaluation/loaders/config-loader.ts
|
|
306
234
|
var import_promises2 = require("fs/promises");
|
|
307
235
|
var import_node_path2 = __toESM(require("path"), 1);
|
|
@@ -556,7 +484,7 @@ async function parseEvaluators(rawEvalCase, globalExecution, searchRoots, evalId
|
|
|
556
484
|
logWarning2(`Skipping invalid evaluator entry for '${evalId}' (expected object)`);
|
|
557
485
|
continue;
|
|
558
486
|
}
|
|
559
|
-
const name =
|
|
487
|
+
const name = asString(rawEvaluator.name);
|
|
560
488
|
const typeValue = rawEvaluator.type;
|
|
561
489
|
if (!name || !isEvaluatorKind(typeValue)) {
|
|
562
490
|
logWarning2(`Skipping evaluator with invalid name/type in '${evalId}'`);
|
|
@@ -584,7 +512,7 @@ async function parseEvaluators(rawEvalCase, globalExecution, searchRoots, evalId
|
|
|
584
512
|
continue;
|
|
585
513
|
}
|
|
586
514
|
const weight2 = validateWeight(rawEvaluator.weight, name, evalId);
|
|
587
|
-
const cwd =
|
|
515
|
+
const cwd = asString(rawEvaluator.cwd);
|
|
588
516
|
let resolvedCwd;
|
|
589
517
|
if (cwd) {
|
|
590
518
|
const resolved = await resolveFileReference(cwd, searchRoots);
|
|
@@ -599,7 +527,29 @@ async function parseEvaluators(rawEvalCase, globalExecution, searchRoots, evalId
|
|
|
599
527
|
} else {
|
|
600
528
|
resolvedCwd = searchRoots[0];
|
|
601
529
|
}
|
|
602
|
-
const
|
|
530
|
+
const rawTarget = rawEvaluator.target;
|
|
531
|
+
let targetConfig;
|
|
532
|
+
if (rawTarget !== void 0) {
|
|
533
|
+
if (isJsonObject2(rawTarget)) {
|
|
534
|
+
const maxCalls = rawTarget.max_calls;
|
|
535
|
+
if (maxCalls !== void 0 && (typeof maxCalls !== "number" || maxCalls < 0)) {
|
|
536
|
+
logWarning2(
|
|
537
|
+
`Invalid target.max_calls for evaluator '${name}' in '${evalId}': must be a non-negative number`
|
|
538
|
+
);
|
|
539
|
+
} else {
|
|
540
|
+
targetConfig = {
|
|
541
|
+
...typeof maxCalls === "number" ? { max_calls: maxCalls } : {}
|
|
542
|
+
};
|
|
543
|
+
}
|
|
544
|
+
} else if (rawTarget === true) {
|
|
545
|
+
targetConfig = {};
|
|
546
|
+
} else {
|
|
547
|
+
logWarning2(
|
|
548
|
+
`Invalid target config for evaluator '${name}' in '${evalId}': expected object or true`
|
|
549
|
+
);
|
|
550
|
+
}
|
|
551
|
+
}
|
|
552
|
+
const knownProps = /* @__PURE__ */ new Set(["name", "type", "script", "cwd", "weight", "target"]);
|
|
603
553
|
const config = {};
|
|
604
554
|
for (const [key, value] of Object.entries(rawEvaluator)) {
|
|
605
555
|
if (!knownProps.has(key) && value !== void 0) {
|
|
@@ -613,7 +563,8 @@ async function parseEvaluators(rawEvalCase, globalExecution, searchRoots, evalId
|
|
|
613
563
|
cwd,
|
|
614
564
|
resolvedCwd,
|
|
615
565
|
...weight2 !== void 0 ? { weight: weight2 } : {},
|
|
616
|
-
...Object.keys(config).length > 0 ? { config } : {}
|
|
566
|
+
...Object.keys(config).length > 0 ? { config } : {},
|
|
567
|
+
...targetConfig !== void 0 ? { target: targetConfig } : {}
|
|
617
568
|
});
|
|
618
569
|
continue;
|
|
619
570
|
}
|
|
@@ -630,7 +581,7 @@ async function parseEvaluators(rawEvalCase, globalExecution, searchRoots, evalId
|
|
|
630
581
|
logWarning2(`Skipping composite evaluator '${name}' in '${evalId}': missing aggregator`);
|
|
631
582
|
continue;
|
|
632
583
|
}
|
|
633
|
-
const aggregatorType =
|
|
584
|
+
const aggregatorType = asString(rawAggregator.type);
|
|
634
585
|
if (aggregatorType !== "weighted_average" && aggregatorType !== "code_judge" && aggregatorType !== "llm_judge") {
|
|
635
586
|
logWarning2(
|
|
636
587
|
`Skipping composite evaluator '${name}' in '${evalId}': invalid aggregator type '${aggregatorType}'`
|
|
@@ -643,7 +594,7 @@ async function parseEvaluators(rawEvalCase, globalExecution, searchRoots, evalId
|
|
|
643
594
|
logWarning2(`Skipping invalid member evaluator in composite '${name}' (expected object)`);
|
|
644
595
|
continue;
|
|
645
596
|
}
|
|
646
|
-
const memberName =
|
|
597
|
+
const memberName = asString(rawMember.name);
|
|
647
598
|
const memberType = rawMember.type;
|
|
648
599
|
if (!memberName || !isEvaluatorKind(memberType)) {
|
|
649
600
|
logWarning2(`Skipping member evaluator with invalid name/type in composite '${name}'`);
|
|
@@ -681,7 +632,7 @@ async function parseEvaluators(rawEvalCase, globalExecution, searchRoots, evalId
|
|
|
681
632
|
...Object.keys(parsedWeights).length > 0 ? { weights: parsedWeights } : {}
|
|
682
633
|
};
|
|
683
634
|
} else if (aggregatorType === "code_judge") {
|
|
684
|
-
const aggregatorPath =
|
|
635
|
+
const aggregatorPath = asString(rawAggregator.path);
|
|
685
636
|
if (!aggregatorPath) {
|
|
686
637
|
logWarning2(
|
|
687
638
|
`Skipping composite evaluator '${name}' in '${evalId}': code_judge aggregator missing path`
|
|
@@ -694,7 +645,7 @@ async function parseEvaluators(rawEvalCase, globalExecution, searchRoots, evalId
|
|
|
694
645
|
cwd: searchRoots[0]
|
|
695
646
|
};
|
|
696
647
|
} else {
|
|
697
|
-
const aggregatorPrompt =
|
|
648
|
+
const aggregatorPrompt = asString(rawAggregator.prompt);
|
|
698
649
|
let promptPath2;
|
|
699
650
|
if (aggregatorPrompt) {
|
|
700
651
|
const resolved = await resolveFileReference(aggregatorPrompt, searchRoots);
|
|
@@ -719,7 +670,7 @@ async function parseEvaluators(rawEvalCase, globalExecution, searchRoots, evalId
|
|
|
719
670
|
continue;
|
|
720
671
|
}
|
|
721
672
|
if (typeValue === "tool_trajectory") {
|
|
722
|
-
const mode =
|
|
673
|
+
const mode = asString(rawEvaluator.mode);
|
|
723
674
|
if (mode !== "any_order" && mode !== "in_order" && mode !== "exact") {
|
|
724
675
|
logWarning2(
|
|
725
676
|
`Skipping tool_trajectory evaluator '${name}' in '${evalId}': invalid mode '${mode}' (must be any_order, in_order, or exact)`
|
|
@@ -810,8 +761,8 @@ async function parseEvaluators(rawEvalCase, globalExecution, searchRoots, evalId
|
|
|
810
761
|
);
|
|
811
762
|
continue;
|
|
812
763
|
}
|
|
813
|
-
const fieldPath =
|
|
814
|
-
const match =
|
|
764
|
+
const fieldPath = asString(rawField.path);
|
|
765
|
+
const match = asString(rawField.match);
|
|
815
766
|
if (!fieldPath) {
|
|
816
767
|
logWarning2(
|
|
817
768
|
`Skipping field without path in field_accuracy evaluator '${name}' in '${evalId}'`
|
|
@@ -841,7 +792,7 @@ async function parseEvaluators(rawEvalCase, globalExecution, searchRoots, evalId
|
|
|
841
792
|
);
|
|
842
793
|
continue;
|
|
843
794
|
}
|
|
844
|
-
const aggregation =
|
|
795
|
+
const aggregation = asString(rawEvaluator.aggregation);
|
|
845
796
|
const validAggregation = isValidFieldAggregationType(aggregation) ? aggregation : void 0;
|
|
846
797
|
const weight2 = validateWeight(rawEvaluator.weight, name, evalId);
|
|
847
798
|
evaluators.push({
|
|
@@ -922,7 +873,7 @@ async function parseEvaluators(rawEvalCase, globalExecution, searchRoots, evalId
|
|
|
922
873
|
});
|
|
923
874
|
continue;
|
|
924
875
|
}
|
|
925
|
-
const prompt =
|
|
876
|
+
const prompt = asString(rawEvaluator.prompt);
|
|
926
877
|
let promptPath;
|
|
927
878
|
if (prompt) {
|
|
928
879
|
const resolved = await resolveFileReference(prompt, searchRoots);
|
|
@@ -941,11 +892,11 @@ async function parseEvaluators(rawEvalCase, globalExecution, searchRoots, evalId
|
|
|
941
892
|
);
|
|
942
893
|
}
|
|
943
894
|
}
|
|
944
|
-
const _model =
|
|
895
|
+
const _model = asString(rawEvaluator.model);
|
|
945
896
|
const rawRubrics = rawEvaluator.rubrics;
|
|
946
897
|
const parsedRubrics = Array.isArray(rawRubrics) ? rawRubrics.filter((r) => isJsonObject2(r)).map((rubric, index) => ({
|
|
947
|
-
id:
|
|
948
|
-
description:
|
|
898
|
+
id: asString(rubric.id) ?? `rubric-${index + 1}`,
|
|
899
|
+
description: asString(rubric.description) ?? "",
|
|
949
900
|
weight: typeof rubric.weight === "number" ? rubric.weight : 1,
|
|
950
901
|
required: typeof rubric.required === "boolean" ? rubric.required : true
|
|
951
902
|
})).filter((r) => r.description.length > 0) : void 0;
|
|
@@ -989,7 +940,7 @@ function coerceEvaluator(candidate, contextId) {
|
|
|
989
940
|
logWarning2(`Unknown evaluator '${candidate}' in ${contextId}, falling back to default`);
|
|
990
941
|
return void 0;
|
|
991
942
|
}
|
|
992
|
-
function
|
|
943
|
+
function asString(value) {
|
|
993
944
|
return typeof value === "string" ? value : void 0;
|
|
994
945
|
}
|
|
995
946
|
function asStringArray(value, description) {
|
|
@@ -1065,6 +1016,68 @@ function isValidFieldAggregationType(value) {
|
|
|
1065
1016
|
// src/evaluation/loaders/message-processor.ts
|
|
1066
1017
|
var import_promises4 = require("fs/promises");
|
|
1067
1018
|
var import_node_path4 = __toESM(require("path"), 1);
|
|
1019
|
+
|
|
1020
|
+
// src/evaluation/formatting/segment-formatter.ts
|
|
1021
|
+
function formatFileContents(parts) {
|
|
1022
|
+
const fileCount = parts.filter((p) => p.isFile).length;
|
|
1023
|
+
if (fileCount > 0) {
|
|
1024
|
+
return parts.map((part) => {
|
|
1025
|
+
if (part.isFile && part.displayPath) {
|
|
1026
|
+
return `<file path="${part.displayPath}">
|
|
1027
|
+
${part.content}
|
|
1028
|
+
</file>`;
|
|
1029
|
+
}
|
|
1030
|
+
return part.content;
|
|
1031
|
+
}).join("\n\n");
|
|
1032
|
+
}
|
|
1033
|
+
return parts.map((p) => p.content).join(" ");
|
|
1034
|
+
}
|
|
1035
|
+
function formatSegment(segment, mode = "lm") {
|
|
1036
|
+
const type = asString2(segment.type);
|
|
1037
|
+
if (type === "text") {
|
|
1038
|
+
return asString2(segment.value);
|
|
1039
|
+
}
|
|
1040
|
+
if (type === "guideline_ref") {
|
|
1041
|
+
const refPath = asString2(segment.path);
|
|
1042
|
+
return refPath ? `<Attached: ${refPath}>` : void 0;
|
|
1043
|
+
}
|
|
1044
|
+
if (type === "file") {
|
|
1045
|
+
const filePath = asString2(segment.path);
|
|
1046
|
+
if (!filePath) {
|
|
1047
|
+
return void 0;
|
|
1048
|
+
}
|
|
1049
|
+
if (mode === "agent") {
|
|
1050
|
+
return `<file: path="${filePath}">`;
|
|
1051
|
+
}
|
|
1052
|
+
const text = asString2(segment.text);
|
|
1053
|
+
if (text && filePath) {
|
|
1054
|
+
return formatFileContents([{ content: text.trim(), isFile: true, displayPath: filePath }]);
|
|
1055
|
+
}
|
|
1056
|
+
}
|
|
1057
|
+
return void 0;
|
|
1058
|
+
}
|
|
1059
|
+
function hasVisibleContent(segments) {
|
|
1060
|
+
return segments.some((segment) => {
|
|
1061
|
+
const type = asString2(segment.type);
|
|
1062
|
+
if (type === "text") {
|
|
1063
|
+
const value = asString2(segment.value);
|
|
1064
|
+
return value !== void 0 && value.trim().length > 0;
|
|
1065
|
+
}
|
|
1066
|
+
if (type === "guideline_ref") {
|
|
1067
|
+
return false;
|
|
1068
|
+
}
|
|
1069
|
+
if (type === "file") {
|
|
1070
|
+
const text = asString2(segment.text);
|
|
1071
|
+
return text !== void 0 && text.trim().length > 0;
|
|
1072
|
+
}
|
|
1073
|
+
return false;
|
|
1074
|
+
});
|
|
1075
|
+
}
|
|
1076
|
+
function asString2(value) {
|
|
1077
|
+
return typeof value === "string" ? value : void 0;
|
|
1078
|
+
}
|
|
1079
|
+
|
|
1080
|
+
// src/evaluation/loaders/message-processor.ts
|
|
1068
1081
|
var ANSI_YELLOW4 = "\x1B[33m";
|
|
1069
1082
|
var ANSI_RESET4 = "\x1B[0m";
|
|
1070
1083
|
async function processMessages(options) {
|
|
@@ -1370,9 +1383,6 @@ ${messageContent}`);
|
|
|
1370
1383
|
questionParts.push(formattedContent);
|
|
1371
1384
|
}
|
|
1372
1385
|
}
|
|
1373
|
-
if (testCase.code_snippets.length > 0) {
|
|
1374
|
-
questionParts.push(testCase.code_snippets.join("\n"));
|
|
1375
|
-
}
|
|
1376
1386
|
question = questionParts.map((part) => part.trim()).filter((part) => part.length > 0).join("\n\n");
|
|
1377
1387
|
}
|
|
1378
1388
|
const chatPrompt = useRoleMarkers ? buildChatPromptFromSegments({
|
|
@@ -1571,7 +1581,6 @@ async function loadEvalCases(evalFilePath, repoRoot, options) {
|
|
|
1571
1581
|
repoRootPath,
|
|
1572
1582
|
verbose
|
|
1573
1583
|
}) : [];
|
|
1574
|
-
const codeSnippets = extractCodeBlocks(inputSegments);
|
|
1575
1584
|
let referenceAnswer = "";
|
|
1576
1585
|
if (outputSegments.length > 0) {
|
|
1577
1586
|
const lastMessage = outputSegments[outputSegments.length - 1];
|
|
@@ -1644,7 +1653,6 @@ async function loadEvalCases(evalFilePath, repoRoot, options) {
|
|
|
1644
1653
|
guideline_paths: guidelinePaths.map((guidelinePath) => import_node_path6.default.resolve(guidelinePath)),
|
|
1645
1654
|
guideline_patterns: guidelinePatterns,
|
|
1646
1655
|
file_paths: allFilePaths,
|
|
1647
|
-
code_snippets: codeSnippets,
|
|
1648
1656
|
expected_outcome: outcome,
|
|
1649
1657
|
evaluator: evalCaseEvaluatorKind,
|
|
1650
1658
|
evaluators
|
|
@@ -4272,6 +4280,167 @@ var MockProvider = class {
|
|
|
4272
4280
|
}
|
|
4273
4281
|
};
|
|
4274
4282
|
|
|
4283
|
+
// src/evaluation/providers/pi-agent-sdk.ts
|
|
4284
|
+
var piAgentModule = null;
|
|
4285
|
+
var piAiModule = null;
|
|
4286
|
+
async function loadPiModules() {
|
|
4287
|
+
if (!piAgentModule || !piAiModule) {
|
|
4288
|
+
try {
|
|
4289
|
+
[piAgentModule, piAiModule] = await Promise.all([
|
|
4290
|
+
import("@mariozechner/pi-agent"),
|
|
4291
|
+
import("@mariozechner/pi-ai")
|
|
4292
|
+
]);
|
|
4293
|
+
} catch (error) {
|
|
4294
|
+
throw new Error(
|
|
4295
|
+
`Failed to load pi-agent-sdk dependencies. Please install them:
|
|
4296
|
+
npm install @mariozechner/pi-agent @mariozechner/pi-ai
|
|
4297
|
+
|
|
4298
|
+
Original error: ${error instanceof Error ? error.message : String(error)}`
|
|
4299
|
+
);
|
|
4300
|
+
}
|
|
4301
|
+
}
|
|
4302
|
+
return {
|
|
4303
|
+
Agent: piAgentModule.Agent,
|
|
4304
|
+
ProviderTransport: piAgentModule.ProviderTransport,
|
|
4305
|
+
getModel: piAiModule.getModel,
|
|
4306
|
+
getEnvApiKey: piAiModule.getEnvApiKey
|
|
4307
|
+
};
|
|
4308
|
+
}
|
|
4309
|
+
var PiAgentSdkProvider = class {
|
|
4310
|
+
id;
|
|
4311
|
+
kind = "pi-agent-sdk";
|
|
4312
|
+
targetName;
|
|
4313
|
+
supportsBatch = false;
|
|
4314
|
+
config;
|
|
4315
|
+
constructor(targetName, config) {
|
|
4316
|
+
this.id = `pi-agent-sdk:${targetName}`;
|
|
4317
|
+
this.targetName = targetName;
|
|
4318
|
+
this.config = config;
|
|
4319
|
+
}
|
|
4320
|
+
async invoke(request) {
|
|
4321
|
+
if (request.signal?.aborted) {
|
|
4322
|
+
throw new Error("Pi agent SDK request was aborted before execution");
|
|
4323
|
+
}
|
|
4324
|
+
const { Agent, ProviderTransport, getModel, getEnvApiKey } = await loadPiModules();
|
|
4325
|
+
const startTime = Date.now();
|
|
4326
|
+
const providerName = this.config.provider ?? "anthropic";
|
|
4327
|
+
const modelId = this.config.model ?? "claude-sonnet-4-20250514";
|
|
4328
|
+
const model = getModel(providerName, modelId);
|
|
4329
|
+
const systemPrompt = this.config.systemPrompt ?? "Answer directly and concisely.";
|
|
4330
|
+
const transport = new ProviderTransport({
|
|
4331
|
+
getApiKey: async (provider) => {
|
|
4332
|
+
return this.config.apiKey ?? getEnvApiKey(provider) ?? void 0;
|
|
4333
|
+
}
|
|
4334
|
+
});
|
|
4335
|
+
const agent = new Agent({
|
|
4336
|
+
initialState: {
|
|
4337
|
+
systemPrompt,
|
|
4338
|
+
model,
|
|
4339
|
+
tools: [],
|
|
4340
|
+
// No tools for simple Q&A
|
|
4341
|
+
messages: []
|
|
4342
|
+
},
|
|
4343
|
+
transport
|
|
4344
|
+
});
|
|
4345
|
+
const outputMessages = [];
|
|
4346
|
+
let finalAssistantContent = "";
|
|
4347
|
+
const unsubscribe = agent.subscribe((event) => {
|
|
4348
|
+
if (event.type === "message_end") {
|
|
4349
|
+
const msg = event.message;
|
|
4350
|
+
if (msg.role === "assistant") {
|
|
4351
|
+
const content = extractTextContent2(msg.content);
|
|
4352
|
+
if (content) {
|
|
4353
|
+
finalAssistantContent = content;
|
|
4354
|
+
}
|
|
4355
|
+
}
|
|
4356
|
+
}
|
|
4357
|
+
});
|
|
4358
|
+
try {
|
|
4359
|
+
const timeoutMs = this.config.timeoutMs ?? 12e4;
|
|
4360
|
+
const timeoutPromise = new Promise((_, reject) => {
|
|
4361
|
+
setTimeout(
|
|
4362
|
+
() => reject(new Error(`Pi agent SDK timed out after ${timeoutMs}ms`)),
|
|
4363
|
+
timeoutMs
|
|
4364
|
+
);
|
|
4365
|
+
});
|
|
4366
|
+
await Promise.race([agent.prompt(request.question), timeoutPromise]);
|
|
4367
|
+
await agent.waitForIdle();
|
|
4368
|
+
const agentMessages = agent.state.messages;
|
|
4369
|
+
for (const msg of agentMessages) {
|
|
4370
|
+
outputMessages.push(convertAgentMessage(msg));
|
|
4371
|
+
}
|
|
4372
|
+
const durationMs = Date.now() - startTime;
|
|
4373
|
+
return {
|
|
4374
|
+
raw: {
|
|
4375
|
+
messages: agentMessages,
|
|
4376
|
+
systemPrompt,
|
|
4377
|
+
model: this.config.model,
|
|
4378
|
+
provider: this.config.provider
|
|
4379
|
+
},
|
|
4380
|
+
outputMessages,
|
|
4381
|
+
durationMs
|
|
4382
|
+
};
|
|
4383
|
+
} finally {
|
|
4384
|
+
unsubscribe();
|
|
4385
|
+
}
|
|
4386
|
+
}
|
|
4387
|
+
};
|
|
4388
|
+
function extractTextContent2(content) {
|
|
4389
|
+
if (typeof content === "string") {
|
|
4390
|
+
return content;
|
|
4391
|
+
}
|
|
4392
|
+
if (!Array.isArray(content)) {
|
|
4393
|
+
return void 0;
|
|
4394
|
+
}
|
|
4395
|
+
const textParts = [];
|
|
4396
|
+
for (const part of content) {
|
|
4397
|
+
if (!part || typeof part !== "object") {
|
|
4398
|
+
continue;
|
|
4399
|
+
}
|
|
4400
|
+
const p = part;
|
|
4401
|
+
if (p.type === "text" && typeof p.text === "string") {
|
|
4402
|
+
textParts.push(p.text);
|
|
4403
|
+
}
|
|
4404
|
+
}
|
|
4405
|
+
return textParts.length > 0 ? textParts.join("\n") : void 0;
|
|
4406
|
+
}
|
|
4407
|
+
function convertAgentMessage(message) {
|
|
4408
|
+
if (!message || typeof message !== "object") {
|
|
4409
|
+
return { role: "unknown", content: String(message) };
|
|
4410
|
+
}
|
|
4411
|
+
const msg = message;
|
|
4412
|
+
const role = typeof msg.role === "string" ? msg.role : "unknown";
|
|
4413
|
+
const content = extractTextContent2(msg.content);
|
|
4414
|
+
const toolCalls = extractToolCalls2(msg.content);
|
|
4415
|
+
const timestamp = typeof msg.timestamp === "number" ? new Date(msg.timestamp).toISOString() : typeof msg.timestamp === "string" ? msg.timestamp : void 0;
|
|
4416
|
+
return {
|
|
4417
|
+
role,
|
|
4418
|
+
content,
|
|
4419
|
+
toolCalls: toolCalls.length > 0 ? toolCalls : void 0,
|
|
4420
|
+
timestamp
|
|
4421
|
+
};
|
|
4422
|
+
}
|
|
4423
|
+
function extractToolCalls2(content) {
|
|
4424
|
+
if (!Array.isArray(content)) {
|
|
4425
|
+
return [];
|
|
4426
|
+
}
|
|
4427
|
+
const toolCalls = [];
|
|
4428
|
+
for (const part of content) {
|
|
4429
|
+
if (!part || typeof part !== "object") {
|
|
4430
|
+
continue;
|
|
4431
|
+
}
|
|
4432
|
+
const p = part;
|
|
4433
|
+
if (p.type === "tool_use" && typeof p.name === "string") {
|
|
4434
|
+
toolCalls.push({
|
|
4435
|
+
tool: p.name,
|
|
4436
|
+
input: p.input,
|
|
4437
|
+
id: typeof p.id === "string" ? p.id : void 0
|
|
4438
|
+
});
|
|
4439
|
+
}
|
|
4440
|
+
}
|
|
4441
|
+
return toolCalls;
|
|
4442
|
+
}
|
|
4443
|
+
|
|
4275
4444
|
// src/evaluation/providers/pi-coding-agent.ts
|
|
4276
4445
|
var import_node_child_process4 = require("child_process");
|
|
4277
4446
|
var import_node_crypto3 = require("crypto");
|
|
@@ -4787,8 +4956,8 @@ function convertPiMessage(message) {
|
|
|
4787
4956
|
if (typeof role !== "string") {
|
|
4788
4957
|
return void 0;
|
|
4789
4958
|
}
|
|
4790
|
-
const content =
|
|
4791
|
-
const toolCalls =
|
|
4959
|
+
const content = extractTextContent3(msg.content);
|
|
4960
|
+
const toolCalls = extractToolCalls3(msg.content);
|
|
4792
4961
|
const timestamp = typeof msg.timestamp === "number" ? new Date(msg.timestamp).toISOString() : typeof msg.timestamp === "string" ? msg.timestamp : void 0;
|
|
4793
4962
|
const metadata = {};
|
|
4794
4963
|
if (msg.api) metadata.api = msg.api;
|
|
@@ -4804,7 +4973,7 @@ function convertPiMessage(message) {
|
|
|
4804
4973
|
metadata: Object.keys(metadata).length > 0 ? metadata : void 0
|
|
4805
4974
|
};
|
|
4806
4975
|
}
|
|
4807
|
-
function
|
|
4976
|
+
function extractTextContent3(content) {
|
|
4808
4977
|
if (typeof content === "string") {
|
|
4809
4978
|
return content;
|
|
4810
4979
|
}
|
|
@@ -4823,7 +4992,7 @@ function extractTextContent2(content) {
|
|
|
4823
4992
|
}
|
|
4824
4993
|
return textParts.length > 0 ? textParts.join("\n") : void 0;
|
|
4825
4994
|
}
|
|
4826
|
-
function
|
|
4995
|
+
function extractToolCalls3(content) {
|
|
4827
4996
|
if (!Array.isArray(content)) {
|
|
4828
4997
|
return [];
|
|
4829
4998
|
}
|
|
@@ -5227,6 +5396,15 @@ function resolveTargetDefinition(definition, env = process.env, evalFilePath) {
|
|
|
5227
5396
|
providerBatching,
|
|
5228
5397
|
config: resolvePiCodingAgentConfig(parsed, env)
|
|
5229
5398
|
};
|
|
5399
|
+
case "pi-agent-sdk":
|
|
5400
|
+
return {
|
|
5401
|
+
kind: "pi-agent-sdk",
|
|
5402
|
+
name: parsed.name,
|
|
5403
|
+
judgeTarget: parsed.judge_target,
|
|
5404
|
+
workers: parsed.workers,
|
|
5405
|
+
providerBatching,
|
|
5406
|
+
config: resolvePiAgentSdkConfig(parsed, env)
|
|
5407
|
+
};
|
|
5230
5408
|
case "claude-code":
|
|
5231
5409
|
return {
|
|
5232
5410
|
kind: "claude-code",
|
|
@@ -5448,25 +5626,58 @@ function resolvePiCodingAgentConfig(target, env) {
|
|
|
5448
5626
|
systemPrompt
|
|
5449
5627
|
};
|
|
5450
5628
|
}
|
|
5451
|
-
function
|
|
5452
|
-
const
|
|
5453
|
-
const modelSource = target.model;
|
|
5454
|
-
const
|
|
5455
|
-
const cwdSource = target.cwd;
|
|
5629
|
+
function resolvePiAgentSdkConfig(target, env) {
|
|
5630
|
+
const providerSource = target.pi_provider ?? target.piProvider ?? target.llm_provider;
|
|
5631
|
+
const modelSource = target.model ?? target.pi_model ?? target.piModel;
|
|
5632
|
+
const apiKeySource = target.api_key ?? target.apiKey;
|
|
5456
5633
|
const timeoutSource = target.timeout_seconds ?? target.timeoutSeconds;
|
|
5457
|
-
const logDirSource = target.log_dir ?? target.logDir ?? target.log_directory ?? target.logDirectory;
|
|
5458
|
-
const logFormatSource = target.log_format ?? target.logFormat ?? target.log_output_format ?? target.logOutputFormat ?? env.AGENTV_CLAUDE_CODE_LOG_FORMAT;
|
|
5459
5634
|
const systemPromptSource = target.system_prompt ?? target.systemPrompt;
|
|
5460
|
-
const
|
|
5635
|
+
const provider = resolveOptionalString(
|
|
5636
|
+
providerSource,
|
|
5637
|
+
env,
|
|
5638
|
+
`${target.name} pi-agent-sdk provider`,
|
|
5639
|
+
{
|
|
5640
|
+
allowLiteral: true,
|
|
5641
|
+
optionalEnv: true
|
|
5642
|
+
}
|
|
5643
|
+
);
|
|
5644
|
+
const model = resolveOptionalString(modelSource, env, `${target.name} pi-agent-sdk model`, {
|
|
5461
5645
|
allowLiteral: true,
|
|
5462
5646
|
optionalEnv: true
|
|
5463
|
-
})
|
|
5464
|
-
const
|
|
5465
|
-
allowLiteral:
|
|
5647
|
+
});
|
|
5648
|
+
const apiKey = resolveOptionalString(apiKeySource, env, `${target.name} pi-agent-sdk api key`, {
|
|
5649
|
+
allowLiteral: false,
|
|
5466
5650
|
optionalEnv: true
|
|
5467
5651
|
});
|
|
5468
|
-
const
|
|
5469
|
-
const
|
|
5652
|
+
const timeoutMs = resolveTimeoutMs(timeoutSource, `${target.name} pi-agent-sdk timeout`);
|
|
5653
|
+
const systemPrompt = typeof systemPromptSource === "string" && systemPromptSource.trim().length > 0 ? systemPromptSource.trim() : void 0;
|
|
5654
|
+
return {
|
|
5655
|
+
provider,
|
|
5656
|
+
model,
|
|
5657
|
+
apiKey,
|
|
5658
|
+
timeoutMs,
|
|
5659
|
+
systemPrompt
|
|
5660
|
+
};
|
|
5661
|
+
}
|
|
5662
|
+
function resolveClaudeCodeConfig(target, env) {
|
|
5663
|
+
const executableSource = target.executable ?? target.command ?? target.binary;
|
|
5664
|
+
const modelSource = target.model;
|
|
5665
|
+
const argsSource = target.args ?? target.arguments;
|
|
5666
|
+
const cwdSource = target.cwd;
|
|
5667
|
+
const timeoutSource = target.timeout_seconds ?? target.timeoutSeconds;
|
|
5668
|
+
const logDirSource = target.log_dir ?? target.logDir ?? target.log_directory ?? target.logDirectory;
|
|
5669
|
+
const logFormatSource = target.log_format ?? target.logFormat ?? target.log_output_format ?? target.logOutputFormat ?? env.AGENTV_CLAUDE_CODE_LOG_FORMAT;
|
|
5670
|
+
const systemPromptSource = target.system_prompt ?? target.systemPrompt;
|
|
5671
|
+
const executable = resolveOptionalString(executableSource, env, `${target.name} claude-code executable`, {
|
|
5672
|
+
allowLiteral: true,
|
|
5673
|
+
optionalEnv: true
|
|
5674
|
+
}) ?? "claude";
|
|
5675
|
+
const model = resolveOptionalString(modelSource, env, `${target.name} claude-code model`, {
|
|
5676
|
+
allowLiteral: true,
|
|
5677
|
+
optionalEnv: true
|
|
5678
|
+
});
|
|
5679
|
+
const args = resolveOptionalStringArray(argsSource, env, `${target.name} claude-code args`);
|
|
5680
|
+
const cwd = resolveOptionalString(cwdSource, env, `${target.name} claude-code cwd`, {
|
|
5470
5681
|
allowLiteral: true,
|
|
5471
5682
|
optionalEnv: true
|
|
5472
5683
|
});
|
|
@@ -6106,6 +6317,8 @@ function createProvider(target) {
|
|
|
6106
6317
|
return new CodexProvider(target.name, target.config);
|
|
6107
6318
|
case "pi-coding-agent":
|
|
6108
6319
|
return new PiCodingAgentProvider(target.name, target.config);
|
|
6320
|
+
case "pi-agent-sdk":
|
|
6321
|
+
return new PiAgentSdkProvider(target.name, target.config);
|
|
6109
6322
|
case "claude-code":
|
|
6110
6323
|
return new ClaudeCodeProvider(target.name, target.config);
|
|
6111
6324
|
case "mock":
|
|
@@ -6124,9 +6337,64 @@ function resolveAndCreateProvider(definition, env = process.env) {
|
|
|
6124
6337
|
return createProvider(resolved);
|
|
6125
6338
|
}
|
|
6126
6339
|
|
|
6127
|
-
// src/evaluation/evaluators.ts
|
|
6128
|
-
|
|
6129
|
-
|
|
6340
|
+
// src/evaluation/evaluators/scoring.ts
|
|
6341
|
+
/**
 * Translates a numeric score into a verdict label.
 *
 * @param {number} score - Score to classify.
 * @returns {"pass"|"borderline"|"fail"} "pass" at 0.8 and above, "borderline"
 *   at 0.6 and above, otherwise "fail" (which also covers NaN, because every
 *   comparison against NaN is false).
 */
function scoreToVerdict(score) {
  const passThreshold = 0.8;
  const borderlineThreshold = 0.6;
  return score >= passThreshold
    ? "pass"
    : score >= borderlineThreshold
      ? "borderline"
      : "fail";
}
|
|
6350
|
+
/**
 * Restricts a score to the closed interval [0, 1].
 *
 * @param {number} value - Raw score.
 * @returns {number} 0 for NaN, +/-Infinity, non-numbers and negative values;
 *   1 for values above 1; otherwise the value itself.
 */
function clampScore(value) {
  // Number.isFinite is false for NaN, for both infinities and for non-numbers.
  if (!Number.isFinite(value) || value < 0) {
    return 0;
  }
  return value > 1 ? 1 : value;
}
|
|
6362
|
+
/**
 * Pulls the widest brace-delimited span out of a piece of text.
 *
 * The pattern is greedy, so the result runs from the first "{" to the last
 * "}" in the text.
 *
 * @param {string} text - Text that may embed a JSON object.
 * @returns {string|undefined} The matched span, or undefined when there is none.
 */
function extractJsonBlob(text) {
  const [blob] = text.match(/\{[\s\S]*\}/) ?? [];
  return blob;
}
|
|
6366
|
+
/**
 * Parses JSON out of free-form text such as a model reply.
 *
 * Markdown code-fence markers are stripped first. If the remaining text
 * contains a brace-delimited span (first "{" to last "}"), that span is
 * parsed; otherwise the whole cleaned text is parsed.
 *
 * @param {unknown} text - Text to parse; non-strings are treated as "".
 * @returns {unknown} The parsed JSON value.
 * @throws {SyntaxError} When the selected text is not valid JSON.
 */
function parseJsonFromText(text) {
  let cleaned = "";
  if (typeof text === "string") {
    cleaned = text.replace(/```json\n?|```/g, "").trim();
  }
  // Fall back to the whole cleaned text when no brace-delimited span exists.
  const [blob = cleaned] = cleaned.match(/\{[\s\S]*\}/) ?? [];
  return JSON.parse(blob);
}
|
|
6371
|
+
/**
 * Tells whether a value is a string with at least one non-whitespace character.
 *
 * @param {unknown} value - Value to inspect.
 * @returns {boolean} True only for strings that are non-empty after trimming.
 */
function isNonEmptyString(value) {
  if (typeof value !== "string") {
    return false;
  }
  return value.trim().length > 0;
}
|
|
6374
|
+
/**
 * JSON.parse that reports failure by returning undefined instead of throwing.
 *
 * @param {string} payload - Text expected to contain JSON.
 * @returns {unknown} The parsed value, or undefined when parsing fails.
 */
function parseJsonSafe(payload) {
  let parsed;
  try {
    parsed = JSON.parse(payload);
  } catch {
    // Invalid JSON is an expected outcome here; callers check for undefined.
    parsed = void 0;
  }
  return parsed;
}
|
|
6381
|
+
/**
 * Structural equality for plain, JSON-like values.
 *
 * Primitives are compared with ===. Arrays must have the same length and
 * pairwise-equal elements. Other objects must have the same number of own
 * enumerable keys, with every key of `a` present on `b` and holding a deeply
 * equal value.
 *
 * @param {unknown} a - First value.
 * @param {unknown} b - Second value.
 * @returns {boolean} True when the two values are structurally equal.
 */
function deepEqual(a, b) {
  if (a === b) {
    return true;
  }
  // Past the identity check, only two non-null objects can still be equal.
  if (a === null || b === null) {
    return false;
  }
  if (typeof a !== "object" || typeof b !== "object") {
    return false;
  }
  const leftIsArray = Array.isArray(a);
  if (leftIsArray !== Array.isArray(b)) {
    return false;
  }
  if (leftIsArray) {
    return a.length === b.length && a.every((item, index) => deepEqual(item, b[index]));
  }
  const leftKeys = Object.keys(a);
  if (leftKeys.length !== Object.keys(b).length) {
    return false;
  }
  for (const key of leftKeys) {
    if (!Object.hasOwn(b, key) || !deepEqual(a[key], b[key])) {
      return false;
    }
  }
  return true;
}
|
|
6130
6398
|
|
|
6131
6399
|
// src/runtime/exec.ts
|
|
6132
6400
|
function shellEscapePath(value) {
|
|
@@ -6151,7 +6419,9 @@ async function execFileWithStdinBun(argv, stdinPayload, options) {
|
|
|
6151
6419
|
cwd: options.cwd,
|
|
6152
6420
|
stdin: encoder.encode(stdinPayload),
|
|
6153
6421
|
stdout: "pipe",
|
|
6154
|
-
stderr: "pipe"
|
|
6422
|
+
stderr: "pipe",
|
|
6423
|
+
// Merge additional env vars with process.env
|
|
6424
|
+
env: options.env ? { ...process.env, ...options.env } : process.env
|
|
6155
6425
|
});
|
|
6156
6426
|
let timedOut = false;
|
|
6157
6427
|
const timeout = options.timeoutMs !== void 0 ? setTimeout(() => {
|
|
@@ -6186,7 +6456,9 @@ async function execFileWithStdinNode(argv, stdinPayload, options) {
|
|
|
6186
6456
|
const [cmd, ...args] = argv;
|
|
6187
6457
|
const child = spawn4(cmd, args, {
|
|
6188
6458
|
cwd: options.cwd,
|
|
6189
|
-
stdio: ["pipe", "pipe", "pipe"]
|
|
6459
|
+
stdio: ["pipe", "pipe", "pipe"],
|
|
6460
|
+
// Merge additional env vars with process.env
|
|
6461
|
+
env: options.env ? { ...process.env, ...options.env } : process.env
|
|
6190
6462
|
});
|
|
6191
6463
|
const stdoutChunks = [];
|
|
6192
6464
|
const stderrChunks = [];
|
|
@@ -6239,7 +6511,9 @@ async function execShellWithStdin(command, stdinPayload, options = {}) {
|
|
|
6239
6511
|
const child = spawn4(wrappedCommand, {
|
|
6240
6512
|
shell: true,
|
|
6241
6513
|
cwd: options.cwd,
|
|
6242
|
-
stdio: ["ignore", "ignore", "ignore"]
|
|
6514
|
+
stdio: ["ignore", "ignore", "ignore"],
|
|
6515
|
+
// Merge additional env vars with process.env
|
|
6516
|
+
env: options.env ? { ...process.env, ...options.env } : process.env
|
|
6243
6517
|
});
|
|
6244
6518
|
const timeout = options.timeoutMs ? setTimeout(() => {
|
|
6245
6519
|
child.kill();
|
|
@@ -6266,6 +6540,221 @@ async function execShellWithStdin(command, stdinPayload, options = {}) {
|
|
|
6266
6540
|
}
|
|
6267
6541
|
}
|
|
6268
6542
|
|
|
6543
|
+
// src/runtime/target-proxy.ts
|
|
6544
|
+
var import_node_crypto4 = require("crypto");
|
|
6545
|
+
var import_node_http = require("http");
|
|
6546
|
+
var DEFAULT_MAX_CALLS = 50;
|
|
6547
|
+
/**
 * Starts a loopback-only HTTP server through which a child script can invoke
 * evaluation targets.
 *
 * Routes (all require `Authorization: Bearer <token>`):
 *   GET  /info         - target name, call budget, current count, target list
 *   POST /invoke       - invoke one target with `{ question, systemPrompt?, target?, evalCaseId?, attempt? }`
 *   POST /invokeBatch  - `{ requests: [...] }`, invoked one after another
 *
 * @param {object} options
 * @param {object} options.defaultProvider - Provider used when a request names no target
 *   (or names `defaultProvider.targetName`).
 * @param {Function} [options.targetResolver] - Maps a target name to a provider, or undefined.
 * @param {string[]} [options.availableTargets] - Names reported by /info and in error messages.
 * @param {number} options.maxCalls - Upper bound on provider invocations through this proxy.
 * @returns {Promise<{url: string, token: string, shutdown: Function, getUsageMetadata: Function}>}
 *   Resolves once the server is listening on an ephemeral 127.0.0.1 port.
 */
async function createTargetProxy(options) {
  const { defaultProvider, targetResolver, availableTargets, maxCalls } = options;
  const token = (0, import_node_crypto4.randomBytes)(32).toString("hex");
  const expectedAuth = Buffer.from(`Bearer ${token}`, "utf8");
  let callCount = 0;
  let isShutdown = false;
  const targetsList = availableTargets ?? [defaultProvider.targetName];

  // Constant-time comparison so the token cannot be probed byte by byte.
  // timingSafeEqual throws on length mismatch, hence the explicit length check.
  function isAuthorized(header) {
    if (typeof header !== "string") {
      return false;
    }
    const provided = Buffer.from(header, "utf8");
    return provided.length === expectedAuth.length && (0, import_node_crypto4.timingSafeEqual)(provided, expectedAuth);
  }

  function resolveProvider(targetName) {
    if (targetName === void 0 || targetName === defaultProvider.targetName) {
      return defaultProvider;
    }
    if (targetResolver) {
      return targetResolver(targetName);
    }
    return void 0;
  }

  // Checks the budget and claims one call in a single synchronous step, so
  // concurrent requests cannot all pass a stale check made before an await.
  function tryReserveCall() {
    if (callCount >= maxCalls) {
      return false;
    }
    callCount++;
    return true;
  }

  const isValidQuestion = (request) => typeof request?.question === "string" && request.question !== "";

  const server = (0, import_node_http.createServer)(async (req, res) => {
    res.setHeader("Access-Control-Allow-Origin", "*");
    res.setHeader("Access-Control-Allow-Methods", "GET, POST, OPTIONS");
    res.setHeader("Access-Control-Allow-Headers", "Content-Type, Authorization");
    if (req.method === "OPTIONS") {
      res.writeHead(204);
      res.end();
      return;
    }
    if (!isAuthorized(req.headers.authorization)) {
      sendJson(res, 401, { error: "Unauthorized" });
      return;
    }
    if (isShutdown) {
      sendJson(res, 503, { error: "Proxy is shutting down" });
      return;
    }
    const url2 = req.url ?? "";
    if (req.method === "GET" && url2 === "/info") {
      handleInfo(res);
      return;
    }
    if (req.method === "POST" && url2 === "/invoke") {
      await handleInvoke(req, res);
      return;
    }
    if (req.method === "POST" && url2 === "/invokeBatch") {
      await handleInvokeBatch(req, res);
      return;
    }
    sendJson(res, 404, { error: "Not found" });
  });

  function handleInfo(res) {
    const response = {
      targetName: defaultProvider.targetName,
      maxCalls,
      callCount,
      availableTargets: targetsList
    };
    sendJson(res, 200, response);
  }

  async function handleInvoke(req, res) {
    // Cheap early rejection; the authoritative check is tryReserveCall() below.
    if (callCount >= maxCalls) {
      sendJson(res, 429, { error: `Max calls exceeded (limit: ${maxCalls})` });
      return;
    }
    try {
      const body = await readBody(req);
      const request = JSON.parse(body);
      if (!isValidQuestion(request)) {
        sendJson(res, 400, { error: "Missing required field: question" });
        return;
      }
      const provider = resolveProvider(request.target);
      if (!provider) {
        sendJson(res, 400, {
          error: `Unknown target '${request.target}'. Available: ${targetsList.join(", ")}`
        });
        return;
      }
      if (!tryReserveCall()) {
        sendJson(res, 429, { error: `Max calls exceeded (limit: ${maxCalls})` });
        return;
      }
      const response = await provider.invoke({
        question: request.question,
        systemPrompt: request.systemPrompt,
        evalCaseId: request.evalCaseId ?? "proxy",
        attempt: request.attempt ?? 1
      });
      const outputMessages = response.outputMessages ?? [];
      const rawText = extractLastAssistantContent(outputMessages);
      const result = {
        outputMessages,
        rawText
      };
      sendJson(res, 200, result);
    } catch (error) {
      const message = error instanceof Error ? error.message : String(error);
      sendJson(res, 500, { error: message });
    }
  }

  async function handleInvokeBatch(req, res) {
    try {
      const body = await readBody(req);
      const requests = JSON.parse(body)?.requests;
      if (!Array.isArray(requests)) {
        sendJson(res, 400, { error: "Missing required field: requests (array)" });
        return;
      }
      if (callCount + requests.length > maxCalls) {
        sendJson(res, 429, {
          error: `Batch would exceed max calls (current: ${callCount}, batch: ${requests.length}, limit: ${maxCalls})`
        });
        return;
      }
      const responses = [];
      for (const request of requests) {
        // A malformed entry yields a per-item error rather than failing the batch.
        if (!isValidQuestion(request)) {
          responses.push({
            outputMessages: [],
            rawText: "Error: Missing required field: question"
          });
          continue;
        }
        const provider = resolveProvider(request.target);
        if (!provider) {
          responses.push({
            outputMessages: [],
            rawText: `Error: Unknown target '${request.target}'. Available: ${targetsList.join(", ")}`
          });
          continue;
        }
        // Other requests may have consumed budget while earlier items were awaited.
        if (!tryReserveCall()) {
          responses.push({
            outputMessages: [],
            rawText: `Error: Max calls exceeded (limit: ${maxCalls})`
          });
          continue;
        }
        try {
          const response = await provider.invoke({
            question: request.question,
            systemPrompt: request.systemPrompt,
            evalCaseId: request.evalCaseId ?? "proxy",
            attempt: request.attempt ?? 1
          });
          const outputMessages = response.outputMessages ?? [];
          responses.push({
            outputMessages,
            rawText: extractLastAssistantContent(outputMessages)
          });
        } catch (error) {
          const message = error instanceof Error ? error.message : String(error);
          responses.push({
            outputMessages: [],
            rawText: `Error: ${message}`
          });
        }
      }
      sendJson(res, 200, { responses });
    } catch (error) {
      const message = error instanceof Error ? error.message : String(error);
      sendJson(res, 500, { error: message });
    }
  }

  await new Promise((resolve, reject) => {
    server.once("error", reject);
    // Port 0 asks the OS for a free ephemeral port; bind to loopback only.
    server.listen(0, "127.0.0.1", () => {
      server.removeListener("error", reject);
      resolve();
    });
  });
  const address = server.address();
  const url = `http://127.0.0.1:${address.port}`;
  return {
    url,
    token,
    shutdown: async () => {
      isShutdown = true;
      return new Promise((resolve, reject) => {
        server.close((err) => {
          if (err) reject(err);
          else resolve();
        });
        // close() only stops accepting connections; on runtimes where it does
        // not also drop idle keep-alive sockets, the callback would wait for
        // them to time out. The method is absent on older Node versions.
        server.closeIdleConnections?.();
      });
    },
    getUsageMetadata: () => ({
      callCount,
      maxCalls
    })
  };
}
|
|
6727
|
+
/**
 * Writes a JSON response and ends it.
 *
 * @param {object} res - Response object exposing writeHead() and end().
 * @param {number} statusCode - HTTP status code to send.
 * @param {unknown} body - Value serialised with JSON.stringify as the payload.
 */
function sendJson(res, statusCode, body) {
  const headers = { "Content-Type": "application/json" };
  res.writeHead(statusCode, headers);
  const payload = JSON.stringify(body);
  res.end(payload);
}
|
|
6731
|
+
/**
 * Collects a request stream into a single UTF-8 string.
 *
 * @param {object} req - Event emitter producing "data", "end" and "error" events.
 * @returns {Promise<string>} Resolves with the concatenated body on "end";
 *   rejects with the emitted error on "error".
 */
function readBody(req) {
  const received = [];
  return new Promise((resolve, reject) => {
    const onData = (chunk) => {
      received.push(chunk);
    };
    const onEnd = () => {
      const whole = Buffer.concat(received);
      resolve(whole.toString("utf8"));
    };
    req.on("data", onData);
    req.on("end", onEnd);
    req.on("error", reject);
  });
}
|
|
6739
|
+
/**
 * Finds the text of the most recent assistant message.
 *
 * Walks the messages from last to first. For an assistant message whose
 * content is a string, that string is returned. For array content, the `text`
 * of the first object part that has a `text` property is returned as a string.
 * An assistant message yielding neither is skipped and the search continues
 * with earlier messages.
 *
 * @param {Array<object>} messages - Conversation messages, oldest first.
 * @returns {string|undefined} The extracted text, or undefined when none is found.
 */
function extractLastAssistantContent(messages) {
  for (let index = messages.length - 1; index >= 0; index -= 1) {
    const message = messages[index];
    if (message.role !== "assistant" || message.content === void 0) {
      continue;
    }
    const { content } = message;
    if (typeof content === "string") {
      return content;
    }
    if (!Array.isArray(content)) {
      continue;
    }
    const textPart = content.find(
      (part) => typeof part === "object" && part !== null && "text" in part
    );
    if (textPart !== void 0) {
      return String(textPart.text);
    }
  }
  return void 0;
}
|
|
6757
|
+
|
|
6269
6758
|
// src/evaluation/case-conversion.ts
|
|
6270
6759
|
function toSnakeCase(str) {
|
|
6271
6760
|
if (/^[A-Z]/.test(str)) {
|
|
@@ -6273,12 +6762,6 @@ function toSnakeCase(str) {
|
|
|
6273
6762
|
}
|
|
6274
6763
|
return str.replace(/[A-Z]/g, (letter) => `_${letter.toLowerCase()}`);
|
|
6275
6764
|
}
|
|
6276
|
-
function toCamelCase(str) {
|
|
6277
|
-
if (/^[A-Z]/.test(str)) {
|
|
6278
|
-
return str;
|
|
6279
|
-
}
|
|
6280
|
-
return str.replace(/_([a-z0-9])/g, (_, letter) => letter.toUpperCase());
|
|
6281
|
-
}
|
|
6282
6765
|
function toSnakeCaseDeep(obj) {
|
|
6283
6766
|
if (obj === null || obj === void 0) {
|
|
6284
6767
|
return obj;
|
|
@@ -6296,61 +6779,184 @@ function toSnakeCaseDeep(obj) {
|
|
|
6296
6779
|
}
|
|
6297
6780
|
return obj;
|
|
6298
6781
|
}
|
|
6299
|
-
function toCamelCaseDeep(obj) {
|
|
6300
|
-
if (obj === null || obj === void 0) {
|
|
6301
|
-
return obj;
|
|
6302
|
-
}
|
|
6303
|
-
if (Array.isArray(obj)) {
|
|
6304
|
-
return obj.map((item) => toCamelCaseDeep(item));
|
|
6305
|
-
}
|
|
6306
|
-
if (typeof obj === "object") {
|
|
6307
|
-
const result = {};
|
|
6308
|
-
for (const [key, value] of Object.entries(obj)) {
|
|
6309
|
-
const camelKey = toCamelCase(key);
|
|
6310
|
-
result[camelKey] = toCamelCaseDeep(value);
|
|
6311
|
-
}
|
|
6312
|
-
return result;
|
|
6313
|
-
}
|
|
6314
|
-
return obj;
|
|
6315
|
-
}
|
|
6316
6782
|
|
|
6317
|
-
// src/evaluation/
|
|
6318
|
-
var
|
|
6319
|
-
"
|
|
6320
|
-
|
|
6321
|
-
|
|
6322
|
-
|
|
6323
|
-
|
|
6324
|
-
|
|
6325
|
-
|
|
6326
|
-
|
|
6327
|
-
|
|
6783
|
+
// src/evaluation/evaluators/code-evaluator.ts
|
|
6784
|
+
/**
 * Evaluator that delegates scoring to an external script.
 *
 * The script receives a snake_cased JSON payload describing the eval case on
 * stdin and is expected to print a JSON object with `score` and optionally
 * `hits`, `misses`, `reasoning` and `details`. When a `target` is configured
 * and the context carries a judge provider, a target proxy is started for the
 * duration of the run and its URL/token are passed to the script through the
 * AGENTV_TARGET_PROXY_URL / AGENTV_TARGET_PROXY_TOKEN environment variables.
 */
var CodeEvaluator = class {
  kind = "code";
  script;
  cwd;
  agentTimeoutMs;
  config;
  target;
  /**
   * @param {object} options - Supplies `script`, `cwd`, `agentTimeoutMs`,
   *   `config` and `target`, each stored on the instance as-is.
   */
  constructor(options) {
    const { script, cwd, agentTimeoutMs, config, target } = options;
    this.script = script;
    this.cwd = cwd;
    this.agentTimeoutMs = agentTimeoutMs;
    this.config = config;
    this.target = target;
  }
  /**
   * Runs the script against one eval case.
   *
   * @param {object} context - Evaluation context (eval case, candidate answer,
   *   optional output messages, trace summary and judge provider).
   * @returns {Promise<object>} The evaluation result. A failing script (or any
   *   error while running it) produces a score of 0 with verdict "fail".
   */
  async evaluate(context) {
    const { evalCase } = context;
    const payload = {
      question: evalCase.question,
      expectedOutcome: evalCase.expected_outcome,
      expectedMessages: evalCase.expected_messages,
      referenceAnswer: evalCase.reference_answer,
      candidateAnswer: context.candidate,
      outputMessages: context.outputMessages ?? null,
      guidelineFiles: evalCase.guideline_paths,
      // Input files are all referenced files that are not guideline files.
      inputFiles: evalCase.file_paths.filter(
        (filePath) => !evalCase.guideline_paths.includes(filePath)
      ),
      inputMessages: evalCase.input_messages,
      traceSummary: context.traceSummary ?? null,
      config: this.config ?? null
    };
    const inputPayload = JSON.stringify(toSnakeCaseDeep(payload), null, 2);

    let proxyEnv;
    let proxyShutdown;
    let getProxyUsage;
    if (this.target !== void 0 && context.judgeProvider) {
      const proxy = await createTargetProxy({
        defaultProvider: context.judgeProvider,
        targetResolver: context.targetResolver,
        availableTargets: context.availableTargets,
        maxCalls: this.target.max_calls ?? DEFAULT_MAX_CALLS
      });
      proxyEnv = {
        AGENTV_TARGET_PROXY_URL: proxy.url,
        AGENTV_TARGET_PROXY_TOKEN: proxy.token
      };
      proxyShutdown = proxy.shutdown;
      getProxyUsage = proxy.getUsageMetadata;
    }

    // Fields shared by the success and failure forms of evaluatorRawRequest.
    const describeRequest = () => {
      const usage = getProxyUsage?.();
      return {
        script: this.script,
        ...(this.cwd ? { cwd: this.cwd } : {}),
        ...(usage ? {
          target_proxy: {
            call_count: usage.callCount,
            max_calls: usage.maxCalls
          }
        } : {})
      };
    };
    const pickStrings = (value) => Array.isArray(value) ? value.filter(isNonEmptyString) : [];

    try {
      const stdout = await executeScript(
        this.script,
        inputPayload,
        this.agentTimeoutMs,
        this.cwd,
        proxyEnv
      );
      const parsed = parseJsonSafe(stdout);
      // Anything that is not a number counts as 0.
      const score = clampScore(typeof parsed?.score === "number" ? parsed.score : 0);
      const hits = pickStrings(parsed?.hits);
      const misses = pickStrings(parsed?.misses);
      const reasoning = typeof parsed?.reasoning === "string" ? parsed.reasoning : void 0;
      // Only plain (non-array) objects are passed through as details.
      const details = parsed?.details && typeof parsed.details === "object" && !Array.isArray(parsed.details) ? parsed.details : void 0;
      const evaluatorRawRequest = describeRequest();
      return {
        score,
        verdict: scoreToVerdict(score),
        hits,
        misses,
        expectedAspectCount: hits.length + misses.length || 1,
        reasoning,
        evaluatorRawRequest,
        ...(details ? { details } : {})
      };
    } catch (error) {
      const message = error instanceof Error ? error.message : String(error);
      return {
        score: 0,
        verdict: "fail",
        hits: [],
        misses: [`Code evaluator failed: ${message}`],
        expectedAspectCount: 1,
        reasoning: message,
        evaluatorRawRequest: {
          ...describeRequest(),
          error: message
        }
      };
    } finally {
      // The proxy only lives for the duration of a single script run.
      if (proxyShutdown) {
        await proxyShutdown();
      }
    }
  }
};
|
|
6897
|
+
/**
 * Runs an evaluator script, feeding it `input` on stdin.
 *
 * A string `scriptPath` is run through execShellWithStdin; any other value is
 * passed to execFileWithStdin.
 *
 * @param {string|string[]} scriptPath - Shell command, or a value accepted by execFileWithStdin.
 * @param {string} input - Text written to the script's stdin.
 * @param {number} [agentTimeoutMs] - Forwarded as the `timeoutMs` exec option.
 * @param {string} [cwd] - Working directory for the script.
 * @param {object} [env] - Forwarded as the `env` exec option.
 * @returns {Promise<string>} The script's trimmed stdout.
 * @throws {Error} When the script exits with a non-zero code; the message
 *   includes the (possibly truncated) stderr when there is any.
 */
async function executeScript(scriptPath, input, agentTimeoutMs, cwd, env) {
  const execOptions = { cwd, timeoutMs: agentTimeoutMs, env };
  const outcome = typeof scriptPath === "string"
    ? await execShellWithStdin(scriptPath, input, execOptions)
    : await execFileWithStdin(scriptPath, input, execOptions);
  if (outcome.exitCode === 0) {
    return outcome.stdout.trim();
  }
  const errorText = formatStderr(outcome.stderr);
  const summary = `Code evaluator exited with code ${outcome.exitCode}`;
  throw new Error(errorText.length > 0 ? `${summary}: ${errorText}` : summary);
}
|
|
6907
|
+
/**
 * Trims stderr output and caps it at 2000 characters.
 *
 * @param {string} stderr - Raw stderr text.
 * @returns {string} The trimmed text, or, when it is longer than 2000
 *   characters, a truncation notice followed by its last 2000 characters.
 */
function formatStderr(stderr) {
  const limit = 2e3;
  const text = stderr.trim();
  // Keep the tail: the end of the output is kept, the beginning is dropped.
  return text.length > limit
    ? `...(truncated, last ${limit} chars)\n${text.slice(-limit)}`
    : text;
}
|
|
6917
|
+
|
|
6918
|
+
// src/evaluation/evaluators/composite.ts
|
|
6919
|
+
var import_ai3 = require("ai");
|
|
6920
|
+
|
|
6921
|
+
// src/evaluation/providers/types.ts
|
|
6922
|
+
var AGENT_PROVIDER_KINDS = [
|
|
6923
|
+
"codex",
|
|
6924
|
+
"pi-coding-agent",
|
|
6925
|
+
"claude-code",
|
|
6926
|
+
"vscode",
|
|
6927
|
+
"vscode-insiders"
|
|
6928
|
+
];
|
|
6929
|
+
/**
 * Returns the content of the most recent assistant message as a string.
 *
 * String content is returned unchanged; any other defined content is
 * serialised with JSON.stringify.
 *
 * @param {Array<object>|undefined|null} messages - Conversation messages, oldest first.
 * @returns {string} The content, or "" when there are no messages or no
 *   assistant message with defined content.
 */
function extractLastAssistantContent2(messages) {
  if (!messages) {
    return "";
  }
  for (let index = messages.length - 1; index >= 0; index -= 1) {
    const message = messages[index];
    if (message.role !== "assistant" || message.content === void 0) {
      continue;
    }
    const { content } = message;
    return typeof content === "string" ? content : JSON.stringify(content);
  }
  return "";
}
|
|
6944
|
+
/**
 * Tells whether a provider's kind is listed in AGENT_PROVIDER_KINDS.
 *
 * @param {object|undefined|null} provider - Provider to inspect.
 * @returns {boolean} False when no provider is given; otherwise whether
 *   `provider.kind` is one of the listed kinds.
 */
function isAgentProvider(provider) {
  if (!provider) {
    return false;
  }
  return AGENT_PROVIDER_KINDS.includes(provider.kind);
}
|
|
6947
|
+
|
|
6948
|
+
// src/evaluation/evaluators/llm-judge.ts
|
|
6949
|
+
var import_ai2 = require("ai");
|
|
6950
|
+
var import_zod3 = require("zod");
|
|
6951
|
+
var DEFAULT_EVALUATOR_TEMPLATE = `You are an expert evaluator. Your goal is to grade the candidate_answer based on how well it achieves the expected_outcome for the original task.
|
|
6952
|
+
|
|
6953
|
+
Use the reference_answer as a gold standard for a high-quality response (if provided). The reference_answer may be a simple text response, or it may contain a sequence of expected agent messages including tool calls. When it contains multiple messages, the last message represents the final expected answer. The candidate_answer does not need to match it verbatim, but should capture the key points and follow the same spirit.
|
|
6954
|
+
|
|
6955
|
+
Be concise and focused in your evaluation. Provide succinct, specific feedback rather than verbose explanations.
|
|
6956
|
+
|
|
6957
|
+
[[ ## expected_outcome ## ]]
|
|
6958
|
+
{{${TEMPLATE_VARIABLES.EXPECTED_OUTCOME}}}
|
|
6959
|
+
|
|
6354
6960
|
[[ ## question ## ]]
|
|
6355
6961
|
{{${TEMPLATE_VARIABLES.QUESTION}}}
|
|
6356
6962
|
|
|
@@ -6421,7 +7027,7 @@ var LlmJudgeEvaluator = class {
|
|
|
6421
7027
|
target: judgeProvider.targetName
|
|
6422
7028
|
};
|
|
6423
7029
|
try {
|
|
6424
|
-
const { data
|
|
7030
|
+
const { data } = await this.runWithRetry({
|
|
6425
7031
|
context,
|
|
6426
7032
|
judgeProvider,
|
|
6427
7033
|
systemPrompt,
|
|
@@ -6534,7 +7140,7 @@ var LlmJudgeEvaluator = class {
|
|
|
6534
7140
|
temperature: this.temperature
|
|
6535
7141
|
});
|
|
6536
7142
|
const data = schema.parse(
|
|
6537
|
-
parseJsonFromText(
|
|
7143
|
+
parseJsonFromText(extractLastAssistantContent2(response.outputMessages))
|
|
6538
7144
|
);
|
|
6539
7145
|
return { data, providerResponse: response };
|
|
6540
7146
|
} catch (e) {
|
|
@@ -6570,105 +7176,11 @@ You must return a valid JSON object matching this schema:
|
|
|
6570
7176
|
"overall_reasoning": "string (summary)"
|
|
6571
7177
|
}`;
|
|
6572
7178
|
}
|
|
6573
|
-
function
|
|
6574
|
-
|
|
6575
|
-
return
|
|
6576
|
-
}
|
|
6577
|
-
if (score >= 0.6) {
|
|
6578
|
-
return "borderline";
|
|
6579
|
-
}
|
|
6580
|
-
return "fail";
|
|
6581
|
-
}
|
|
6582
|
-
function clampScore(value) {
|
|
6583
|
-
if (Number.isNaN(value) || !Number.isFinite(value)) {
|
|
6584
|
-
return 0;
|
|
6585
|
-
}
|
|
6586
|
-
if (value < 0) {
|
|
6587
|
-
return 0;
|
|
6588
|
-
}
|
|
6589
|
-
if (value > 1) {
|
|
6590
|
-
return 1;
|
|
6591
|
-
}
|
|
6592
|
-
return value;
|
|
6593
|
-
}
|
|
6594
|
-
function extractJsonBlob(text) {
|
|
6595
|
-
const match = text.match(/\{[\s\S]*\}/);
|
|
6596
|
-
return match?.[0];
|
|
6597
|
-
}
|
|
6598
|
-
function parseJsonFromText(text) {
|
|
6599
|
-
const cleaned = typeof text === "string" ? text.replace(/```json\n?|```/g, "").trim() : "";
|
|
6600
|
-
const blob = extractJsonBlob(cleaned) ?? cleaned;
|
|
6601
|
-
return JSON.parse(blob);
|
|
6602
|
-
}
|
|
6603
|
-
function isNonEmptyString(value) {
|
|
6604
|
-
return typeof value === "string" && value.trim().length > 0;
|
|
7179
|
+
/**
 * Replaces `{{name}}` placeholders (optional whitespace inside the braces)
 * with values from `variables`.
 *
 * A placeholder is left untouched when `variables` has no own property of
 * that name, or when the value is null or undefined. Only own properties are
 * consulted, so names such as `toString` or `constructor` are not resolved
 * through the prototype chain.
 *
 * @param {string} template - Text containing placeholders.
 * @param {Record<string, unknown>} variables - Replacement values keyed by name.
 * @returns {string} The template with known placeholders substituted.
 */
function substituteVariables(template, variables) {
  return template.replace(/\{\{\s*([a-zA-Z0-9_]+)\s*\}\}/g, (placeholder, varName) => {
    if (!Object.hasOwn(variables, varName)) {
      return placeholder;
    }
    return variables[varName] ?? placeholder;
  });
}
|
|
6606
|
-
var CodeEvaluator = class {
|
|
6607
|
-
kind = "code";
|
|
6608
|
-
script;
|
|
6609
|
-
cwd;
|
|
6610
|
-
agentTimeoutMs;
|
|
6611
|
-
config;
|
|
6612
|
-
constructor(options) {
|
|
6613
|
-
this.script = options.script;
|
|
6614
|
-
this.cwd = options.cwd;
|
|
6615
|
-
this.agentTimeoutMs = options.agentTimeoutMs;
|
|
6616
|
-
this.config = options.config;
|
|
6617
|
-
}
|
|
6618
|
-
async evaluate(context) {
|
|
6619
|
-
const payload = {
|
|
6620
|
-
question: context.evalCase.question,
|
|
6621
|
-
expectedOutcome: context.evalCase.expected_outcome,
|
|
6622
|
-
expectedMessages: context.evalCase.expected_messages,
|
|
6623
|
-
referenceAnswer: context.evalCase.reference_answer,
|
|
6624
|
-
candidateAnswer: context.candidate,
|
|
6625
|
-
outputMessages: context.outputMessages ?? null,
|
|
6626
|
-
guidelineFiles: context.evalCase.guideline_paths,
|
|
6627
|
-
inputFiles: context.evalCase.file_paths.filter(
|
|
6628
|
-
(path17) => !context.evalCase.guideline_paths.includes(path17)
|
|
6629
|
-
),
|
|
6630
|
-
inputMessages: context.evalCase.input_messages,
|
|
6631
|
-
traceSummary: context.traceSummary ?? null,
|
|
6632
|
-
config: this.config ?? null
|
|
6633
|
-
};
|
|
6634
|
-
const inputPayload = JSON.stringify(toSnakeCaseDeep(payload), null, 2);
|
|
6635
|
-
try {
|
|
6636
|
-
const stdout = await executeScript(this.script, inputPayload, this.agentTimeoutMs, this.cwd);
|
|
6637
|
-
const parsed = parseJsonSafe(stdout);
|
|
6638
|
-
const score = clampScore(typeof parsed?.score === "number" ? parsed.score : 0);
|
|
6639
|
-
const hits = Array.isArray(parsed?.hits) ? parsed.hits.filter(isNonEmptyString) : [];
|
|
6640
|
-
const misses = Array.isArray(parsed?.misses) ? parsed.misses.filter(isNonEmptyString) : [];
|
|
6641
|
-
const reasoning = typeof parsed?.reasoning === "string" ? parsed.reasoning : void 0;
|
|
6642
|
-
return {
|
|
6643
|
-
score,
|
|
6644
|
-
verdict: scoreToVerdict(score),
|
|
6645
|
-
hits,
|
|
6646
|
-
misses,
|
|
6647
|
-
expectedAspectCount: hits.length + misses.length || 1,
|
|
6648
|
-
reasoning,
|
|
6649
|
-
evaluatorRawRequest: {
|
|
6650
|
-
script: this.script,
|
|
6651
|
-
...this.cwd ? { cwd: this.cwd } : {}
|
|
6652
|
-
}
|
|
6653
|
-
};
|
|
6654
|
-
} catch (error) {
|
|
6655
|
-
const message = error instanceof Error ? error.message : String(error);
|
|
6656
|
-
return {
|
|
6657
|
-
score: 0,
|
|
6658
|
-
verdict: "fail",
|
|
6659
|
-
hits: [],
|
|
6660
|
-
misses: [`Code evaluator failed: ${message}`],
|
|
6661
|
-
expectedAspectCount: 1,
|
|
6662
|
-
reasoning: message,
|
|
6663
|
-
evaluatorRawRequest: {
|
|
6664
|
-
script: this.script,
|
|
6665
|
-
...this.cwd ? { cwd: this.cwd } : {},
|
|
6666
|
-
error: message
|
|
6667
|
-
}
|
|
6668
|
-
};
|
|
6669
|
-
}
|
|
6670
|
-
}
|
|
6671
|
-
};
|
|
6672
7184
|
function calculateRubricScore(result, rubrics) {
|
|
6673
7185
|
const rubricMap = new Map(rubrics.map((rubric) => [rubric.id, rubric]));
|
|
6674
7186
|
const hits = [];
|
|
@@ -6696,273 +7208,281 @@ function calculateRubricScore(result, rubrics) {
|
|
|
6696
7208
|
const verdict = failedRequired ? "fail" : scoreToVerdict(score);
|
|
6697
7209
|
return { score, verdict, hits, misses };
|
|
6698
7210
|
}
|
|
6699
|
-
|
|
6700
|
-
|
|
6701
|
-
|
|
6702
|
-
|
|
6703
|
-
|
|
6704
|
-
|
|
6705
|
-
|
|
7211
|
+
|
|
7212
|
+
// src/evaluation/evaluators/composite.ts
|
|
7213
|
+
var DEFAULT_COMPOSITE_AGGREGATOR_PROMPT = `Review the following evaluation results:
|
|
7214
|
+
{{EVALUATOR_RESULTS_JSON}}
|
|
7215
|
+
|
|
7216
|
+
Decide the final score and verdict based on all evaluator results.
|
|
7217
|
+
Return a JSON object with: score (0.0-1.0), verdict (pass/fail/borderline), and reasoning.`;
|
|
7218
|
+
var CompositeEvaluator = class {
|
|
7219
|
+
kind = "composite";
|
|
7220
|
+
config;
|
|
7221
|
+
evaluatorFactory;
|
|
7222
|
+
cwd;
|
|
7223
|
+
constructor(options) {
|
|
7224
|
+
this.config = options.config;
|
|
7225
|
+
this.evaluatorFactory = options.evaluatorFactory;
|
|
7226
|
+
this.cwd = options.cwd;
|
|
6706
7227
|
}
|
|
6707
|
-
|
|
6708
|
-
|
|
6709
|
-
|
|
6710
|
-
|
|
6711
|
-
|
|
6712
|
-
|
|
6713
|
-
|
|
7228
|
+
async evaluate(context) {
|
|
7229
|
+
const memberResults = await Promise.all(
|
|
7230
|
+
this.config.evaluators.map(async (memberConfig) => {
|
|
7231
|
+
const evaluator = this.evaluatorFactory.create(memberConfig, context);
|
|
7232
|
+
return {
|
|
7233
|
+
id: memberConfig.name,
|
|
7234
|
+
type: memberConfig.type,
|
|
7235
|
+
result: await evaluator.evaluate(context)
|
|
7236
|
+
};
|
|
7237
|
+
})
|
|
7238
|
+
);
|
|
7239
|
+
return this.aggregate(memberResults, context);
|
|
6714
7240
|
}
|
|
6715
|
-
|
|
6716
|
-
|
|
6717
|
-
|
|
6718
|
-
|
|
6719
|
-
|
|
6720
|
-
|
|
6721
|
-
|
|
6722
|
-
|
|
6723
|
-
|
|
7241
|
+
async aggregate(results, context) {
|
|
7242
|
+
const aggregator = this.config.aggregator;
|
|
7243
|
+
switch (aggregator.type) {
|
|
7244
|
+
case "code_judge":
|
|
7245
|
+
return this.runCodeAggregator(results, aggregator.path, aggregator.cwd ?? this.cwd);
|
|
7246
|
+
case "llm_judge":
|
|
7247
|
+
return this.runLlmAggregator(results, context, aggregator);
|
|
7248
|
+
default:
|
|
7249
|
+
return this.runWeightedAverage(results, aggregator.weights);
|
|
7250
|
+
}
|
|
6724
7251
|
}
|
|
6725
|
-
|
|
6726
|
-
|
|
6727
|
-
|
|
6728
|
-
|
|
6729
|
-
|
|
6730
|
-
|
|
6731
|
-
|
|
6732
|
-
|
|
6733
|
-
|
|
6734
|
-
|
|
6735
|
-
|
|
6736
|
-
|
|
6737
|
-
|
|
6738
|
-
|
|
6739
|
-
|
|
6740
|
-
|
|
6741
|
-
|
|
6742
|
-
|
|
6743
|
-
|
|
6744
|
-
|
|
6745
|
-
|
|
6746
|
-
|
|
6747
|
-
|
|
6748
|
-
|
|
6749
|
-
|
|
6750
|
-
|
|
6751
|
-
|
|
6752
|
-
|
|
6753
|
-
|
|
6754
|
-
|
|
6755
|
-
|
|
6756
|
-
|
|
6757
|
-
|
|
6758
|
-
|
|
6759
|
-
|
|
6760
|
-
|
|
6761
|
-
|
|
6762
|
-
|
|
7252
|
+
runWeightedAverage(results, weights) {
|
|
7253
|
+
let totalWeight = 0;
|
|
7254
|
+
let weightedSum = 0;
|
|
7255
|
+
const allHits = [];
|
|
7256
|
+
const allMisses = [];
|
|
7257
|
+
const reasoningParts = [];
|
|
7258
|
+
const evaluatorResults = [];
|
|
7259
|
+
for (const member of results) {
|
|
7260
|
+
const weight = weights?.[member.id] ?? 1;
|
|
7261
|
+
totalWeight += weight;
|
|
7262
|
+
weightedSum += member.result.score * weight;
|
|
7263
|
+
allHits.push(...member.result.hits.map((h) => `[${member.id}] ${h}`));
|
|
7264
|
+
allMisses.push(...member.result.misses.map((m) => `[${member.id}] ${m}`));
|
|
7265
|
+
if (member.result.reasoning) {
|
|
7266
|
+
reasoningParts.push(`${member.id}: ${member.result.reasoning}`);
|
|
7267
|
+
}
|
|
7268
|
+
evaluatorResults.push({
|
|
7269
|
+
name: member.id,
|
|
7270
|
+
type: member.type,
|
|
7271
|
+
score: member.result.score,
|
|
7272
|
+
weight,
|
|
7273
|
+
verdict: member.result.verdict,
|
|
7274
|
+
hits: [...member.result.hits],
|
|
7275
|
+
misses: [...member.result.misses],
|
|
7276
|
+
reasoning: member.result.reasoning,
|
|
7277
|
+
evaluatorRawRequest: member.result.evaluatorRawRequest,
|
|
7278
|
+
evaluatorResults: member.result.evaluatorResults,
|
|
7279
|
+
details: member.result.details
|
|
7280
|
+
});
|
|
7281
|
+
}
|
|
7282
|
+
const finalScore = totalWeight > 0 ? weightedSum / totalWeight : 0;
|
|
7283
|
+
return {
|
|
7284
|
+
score: clampScore(finalScore),
|
|
7285
|
+
verdict: scoreToVerdict(finalScore),
|
|
7286
|
+
hits: allHits,
|
|
7287
|
+
misses: allMisses,
|
|
7288
|
+
expectedAspectCount: Math.max(allHits.length + allMisses.length, 1),
|
|
7289
|
+
reasoning: reasoningParts.length > 0 ? reasoningParts.join("; ") : void 0,
|
|
7290
|
+
evaluatorRawRequest: {
|
|
7291
|
+
aggregator: "weighted_average",
|
|
7292
|
+
...weights ? { weights } : {}
|
|
7293
|
+
},
|
|
7294
|
+
evaluatorResults
|
|
7295
|
+
};
|
|
6763
7296
|
}
|
|
6764
|
-
|
|
6765
|
-
const
|
|
6766
|
-
const
|
|
6767
|
-
|
|
7297
|
+
async runCodeAggregator(results, scriptPath, cwd, weights) {
|
|
7298
|
+
const resultsObject = Object.fromEntries(results.map((r) => [r.id, r.result]));
|
|
7299
|
+
const inputPayload = JSON.stringify({ results: resultsObject }, null, 2);
|
|
7300
|
+
const evaluatorResults = results.map((member) => ({
|
|
7301
|
+
name: member.id,
|
|
7302
|
+
type: member.type,
|
|
7303
|
+
score: member.result.score,
|
|
7304
|
+
weight: weights?.[member.id] ?? 1,
|
|
7305
|
+
verdict: member.result.verdict,
|
|
7306
|
+
hits: [...member.result.hits],
|
|
7307
|
+
misses: [...member.result.misses],
|
|
7308
|
+
reasoning: member.result.reasoning,
|
|
7309
|
+
evaluatorRawRequest: member.result.evaluatorRawRequest,
|
|
7310
|
+
evaluatorResults: member.result.evaluatorResults,
|
|
7311
|
+
details: member.result.details
|
|
7312
|
+
}));
|
|
7313
|
+
try {
|
|
7314
|
+
const stdout = await executeScript(scriptPath, inputPayload, void 0, cwd);
|
|
7315
|
+
const parsed = parseJsonSafe(stdout);
|
|
7316
|
+
const score = clampScore(typeof parsed?.score === "number" ? parsed.score : 0);
|
|
7317
|
+
const hits = Array.isArray(parsed?.hits) ? parsed.hits.filter(isNonEmptyString) : [];
|
|
7318
|
+
const misses = Array.isArray(parsed?.misses) ? parsed.misses.filter(isNonEmptyString) : [];
|
|
7319
|
+
const reasoning = typeof parsed?.reasoning === "string" ? parsed.reasoning : void 0;
|
|
7320
|
+
const verdict = typeof parsed?.verdict === "string" && (parsed.verdict === "pass" || parsed.verdict === "fail" || parsed.verdict === "borderline") ? parsed.verdict : scoreToVerdict(score);
|
|
7321
|
+
return {
|
|
7322
|
+
score,
|
|
7323
|
+
verdict,
|
|
7324
|
+
hits,
|
|
7325
|
+
misses,
|
|
7326
|
+
expectedAspectCount: hits.length + misses.length || 1,
|
|
7327
|
+
reasoning,
|
|
7328
|
+
evaluatorRawRequest: {
|
|
7329
|
+
aggregator: "code_judge",
|
|
7330
|
+
script: scriptPath
|
|
7331
|
+
},
|
|
7332
|
+
evaluatorResults
|
|
7333
|
+
};
|
|
7334
|
+
} catch (error) {
|
|
7335
|
+
const message = error instanceof Error ? error.message : String(error);
|
|
6768
7336
|
return {
|
|
6769
7337
|
score: 0,
|
|
6770
7338
|
verdict: "fail",
|
|
6771
7339
|
hits: [],
|
|
6772
|
-
misses: [
|
|
6773
|
-
expectedAspectCount: 1
|
|
7340
|
+
misses: [`Code aggregator failed: ${message}`],
|
|
7341
|
+
expectedAspectCount: 1,
|
|
7342
|
+
reasoning: message,
|
|
7343
|
+
evaluatorRawRequest: {
|
|
7344
|
+
aggregator: "code_judge",
|
|
7345
|
+
script: scriptPath,
|
|
7346
|
+
error: message
|
|
7347
|
+
},
|
|
7348
|
+
evaluatorResults
|
|
6774
7349
|
};
|
|
6775
7350
|
}
|
|
6776
|
-
|
|
6777
|
-
|
|
7351
|
+
}
|
|
7352
|
+
async runLlmAggregator(results, context, config) {
|
|
7353
|
+
const judgeProvider = context.judgeProvider;
|
|
7354
|
+
if (!judgeProvider) {
|
|
7355
|
+
throw new Error("No judge provider available for LLM aggregation");
|
|
7356
|
+
}
|
|
7357
|
+
const resultsObject = Object.fromEntries(results.map((r) => [r.id, r.result]));
|
|
7358
|
+
const resultsJson = JSON.stringify(resultsObject, null, 2);
|
|
7359
|
+
const evaluatorResults = results.map((member) => ({
|
|
7360
|
+
name: member.id,
|
|
7361
|
+
type: member.type,
|
|
7362
|
+
score: member.result.score,
|
|
7363
|
+
verdict: member.result.verdict,
|
|
7364
|
+
hits: [...member.result.hits],
|
|
7365
|
+
misses: [...member.result.misses],
|
|
7366
|
+
reasoning: member.result.reasoning,
|
|
7367
|
+
evaluatorRawRequest: member.result.evaluatorRawRequest,
|
|
7368
|
+
evaluatorResults: member.result.evaluatorResults,
|
|
7369
|
+
details: member.result.details
|
|
7370
|
+
}));
|
|
7371
|
+
const promptTemplate = config.prompt ?? DEFAULT_COMPOSITE_AGGREGATOR_PROMPT;
|
|
7372
|
+
const userPrompt = promptTemplate.replace(/\{\{EVALUATOR_RESULTS_JSON\}\}/g, resultsJson);
|
|
7373
|
+
const systemPrompt = buildOutputSchema();
|
|
7374
|
+
const evaluatorRawRequest = {
|
|
7375
|
+
aggregator: "llm_judge",
|
|
7376
|
+
userPrompt,
|
|
7377
|
+
systemPrompt,
|
|
7378
|
+
target: judgeProvider.targetName
|
|
7379
|
+
};
|
|
7380
|
+
try {
|
|
7381
|
+
const model = judgeProvider.asLanguageModel?.();
|
|
7382
|
+
if (model) {
|
|
7383
|
+
const { text } = await (0, import_ai3.generateText)({
|
|
7384
|
+
model,
|
|
7385
|
+
system: systemPrompt,
|
|
7386
|
+
prompt: userPrompt
|
|
7387
|
+
});
|
|
7388
|
+
const data2 = freeformEvaluationSchema.parse(parseJsonFromText(text));
|
|
7389
|
+
const score2 = clampScore(data2.score);
|
|
7390
|
+
const hits2 = Array.isArray(data2.hits) ? data2.hits.filter(isNonEmptyString).slice(0, 4) : [];
|
|
7391
|
+
const misses2 = Array.isArray(data2.misses) ? data2.misses.filter(isNonEmptyString).slice(0, 4) : [];
|
|
7392
|
+
const reasoning2 = data2.reasoning;
|
|
7393
|
+
return {
|
|
7394
|
+
score: score2,
|
|
7395
|
+
verdict: scoreToVerdict(score2),
|
|
7396
|
+
hits: hits2,
|
|
7397
|
+
misses: misses2,
|
|
7398
|
+
expectedAspectCount: Math.max(hits2.length + misses2.length, 1),
|
|
7399
|
+
reasoning: reasoning2,
|
|
7400
|
+
evaluatorRawRequest,
|
|
7401
|
+
evaluatorResults
|
|
7402
|
+
};
|
|
7403
|
+
}
|
|
7404
|
+
const response = await judgeProvider.invoke({
|
|
7405
|
+
question: userPrompt,
|
|
7406
|
+
systemPrompt,
|
|
7407
|
+
evalCaseId: context.evalCase.id,
|
|
7408
|
+
attempt: context.attempt
|
|
7409
|
+
});
|
|
7410
|
+
const data = freeformEvaluationSchema.parse(
|
|
7411
|
+
parseJsonFromText(extractLastAssistantContent2(response.outputMessages))
|
|
7412
|
+
);
|
|
7413
|
+
const score = clampScore(data.score);
|
|
7414
|
+
const hits = Array.isArray(data.hits) ? data.hits.filter(isNonEmptyString).slice(0, 4) : [];
|
|
7415
|
+
const misses = Array.isArray(data.misses) ? data.misses.filter(isNonEmptyString).slice(0, 4) : [];
|
|
7416
|
+
const reasoning = data.reasoning;
|
|
7417
|
+
return {
|
|
7418
|
+
score,
|
|
7419
|
+
verdict: scoreToVerdict(score),
|
|
7420
|
+
hits,
|
|
7421
|
+
misses,
|
|
7422
|
+
expectedAspectCount: Math.max(hits.length + misses.length, 1),
|
|
7423
|
+
reasoning,
|
|
7424
|
+
evaluatorRawRequest,
|
|
7425
|
+
evaluatorResults
|
|
7426
|
+
};
|
|
7427
|
+
} catch {
|
|
6778
7428
|
return {
|
|
6779
7429
|
score: 0,
|
|
6780
7430
|
verdict: "fail",
|
|
6781
7431
|
hits: [],
|
|
6782
|
-
misses: [
|
|
6783
|
-
expectedAspectCount: 1
|
|
7432
|
+
misses: [],
|
|
7433
|
+
expectedAspectCount: 1,
|
|
7434
|
+
evaluatorRawRequest,
|
|
7435
|
+
evaluatorResults
|
|
6784
7436
|
};
|
|
6785
7437
|
}
|
|
6786
|
-
switch (this.config.mode) {
|
|
6787
|
-
case "any_order":
|
|
6788
|
-
return this.evaluateAnyOrder(summary);
|
|
6789
|
-
case "in_order":
|
|
6790
|
-
return this.evaluateInOrder(toolCalls);
|
|
6791
|
-
case "exact":
|
|
6792
|
-
return this.evaluateExact(toolCalls);
|
|
6793
|
-
default:
|
|
6794
|
-
return {
|
|
6795
|
-
score: 0,
|
|
6796
|
-
verdict: "fail",
|
|
6797
|
-
hits: [],
|
|
6798
|
-
misses: [`Unknown mode: ${this.config.mode}`],
|
|
6799
|
-
expectedAspectCount: 1
|
|
6800
|
-
};
|
|
6801
|
-
}
|
|
6802
7438
|
}
|
|
6803
|
-
|
|
6804
|
-
|
|
6805
|
-
|
|
6806
|
-
|
|
6807
|
-
|
|
6808
|
-
|
|
6809
|
-
|
|
6810
|
-
|
|
6811
|
-
for (const message of messages) {
|
|
6812
|
-
if (message.toolCalls) {
|
|
6813
|
-
for (const call of message.toolCalls) {
|
|
6814
|
-
toolCalls.push({
|
|
6815
|
-
name: call.tool,
|
|
6816
|
-
args: call.input
|
|
6817
|
-
});
|
|
6818
|
-
}
|
|
6819
|
-
}
|
|
6820
|
-
}
|
|
6821
|
-
return toolCalls;
|
|
7439
|
+
};
|
|
7440
|
+
|
|
7441
|
+
// src/evaluation/evaluators/cost.ts
|
|
7442
|
+
var CostEvaluator = class {
|
|
7443
|
+
kind = "cost";
|
|
7444
|
+
config;
|
|
7445
|
+
constructor(options) {
|
|
7446
|
+
this.config = options.config;
|
|
6822
7447
|
}
|
|
6823
|
-
|
|
6824
|
-
|
|
6825
|
-
|
|
6826
|
-
|
|
6827
|
-
const toolCallsByName = {};
|
|
6828
|
-
for (const call of toolCalls) {
|
|
6829
|
-
toolCallsByName[call.name] = (toolCallsByName[call.name] ?? 0) + 1;
|
|
6830
|
-
}
|
|
6831
|
-
const toolNames = Object.keys(toolCallsByName).sort();
|
|
6832
|
-
return {
|
|
6833
|
-
eventCount: toolCalls.length,
|
|
6834
|
-
toolNames,
|
|
6835
|
-
toolCallsByName,
|
|
6836
|
-
errorCount: 0
|
|
6837
|
-
};
|
|
6838
|
-
}
|
|
6839
|
-
evaluateAnyOrder(summary) {
|
|
6840
|
-
const minimums = this.config.minimums ?? {};
|
|
6841
|
-
const toolNames = Object.keys(minimums);
|
|
6842
|
-
if (toolNames.length === 0) {
|
|
6843
|
-
return {
|
|
6844
|
-
score: 1,
|
|
6845
|
-
verdict: "pass",
|
|
6846
|
-
hits: ["No tool requirements specified"],
|
|
6847
|
-
misses: [],
|
|
6848
|
-
expectedAspectCount: 0
|
|
6849
|
-
};
|
|
6850
|
-
}
|
|
6851
|
-
const hits = [];
|
|
6852
|
-
const misses = [];
|
|
6853
|
-
for (const toolName of toolNames) {
|
|
6854
|
-
const required = minimums[toolName];
|
|
6855
|
-
const actual = summary.toolCallsByName[toolName] ?? 0;
|
|
6856
|
-
if (actual >= required) {
|
|
6857
|
-
hits.push(`${toolName}: called ${actual} times (required \u2265${required})`);
|
|
6858
|
-
} else {
|
|
6859
|
-
misses.push(`${toolName}: called ${actual} times (required \u2265${required})`);
|
|
6860
|
-
}
|
|
6861
|
-
}
|
|
6862
|
-
const score = hits.length / toolNames.length;
|
|
6863
|
-
return {
|
|
6864
|
-
score,
|
|
6865
|
-
verdict: scoreToVerdict(score),
|
|
6866
|
-
hits,
|
|
6867
|
-
misses,
|
|
6868
|
-
expectedAspectCount: toolNames.length
|
|
6869
|
-
};
|
|
6870
|
-
}
|
|
6871
|
-
evaluateInOrder(toolCalls) {
|
|
6872
|
-
const expected = this.config.expected ?? [];
|
|
6873
|
-
if (expected.length === 0) {
|
|
7448
|
+
evaluate(context) {
|
|
7449
|
+
const { budget } = this.config;
|
|
7450
|
+
const costUsd = context.traceSummary?.costUsd;
|
|
7451
|
+
if (costUsd === void 0) {
|
|
6874
7452
|
return {
|
|
6875
|
-
score:
|
|
6876
|
-
verdict: "
|
|
6877
|
-
hits: [
|
|
6878
|
-
misses: [],
|
|
6879
|
-
expectedAspectCount:
|
|
6880
|
-
|
|
6881
|
-
|
|
6882
|
-
|
|
6883
|
-
|
|
6884
|
-
|
|
6885
|
-
for (let i = 0; i < expected.length; i++) {
|
|
6886
|
-
const expectedItem = expected[i];
|
|
6887
|
-
const expectedTool = expectedItem.tool;
|
|
6888
|
-
let found = false;
|
|
6889
|
-
let argsMismatch = false;
|
|
6890
|
-
while (actualIndex < toolCalls.length) {
|
|
6891
|
-
const actualCall = toolCalls[actualIndex];
|
|
6892
|
-
if (actualCall.name === expectedTool) {
|
|
6893
|
-
if (argsMatch(expectedItem.args, actualCall.args)) {
|
|
6894
|
-
hits.push(`Found ${expectedTool} at position ${actualIndex}`);
|
|
6895
|
-
actualIndex++;
|
|
6896
|
-
found = true;
|
|
6897
|
-
break;
|
|
6898
|
-
}
|
|
6899
|
-
misses.push(
|
|
6900
|
-
`Expected ${expectedTool} at position ${i}: tool found at ${actualIndex} but args mismatch`
|
|
6901
|
-
);
|
|
6902
|
-
actualIndex++;
|
|
6903
|
-
argsMismatch = true;
|
|
6904
|
-
break;
|
|
7453
|
+
score: 0,
|
|
7454
|
+
verdict: "fail",
|
|
7455
|
+
hits: [],
|
|
7456
|
+
misses: ["No cost data available in trace"],
|
|
7457
|
+
expectedAspectCount: 1,
|
|
7458
|
+
reasoning: "Execution cost not reported by provider",
|
|
7459
|
+
evaluatorRawRequest: {
|
|
7460
|
+
type: "cost",
|
|
7461
|
+
budget,
|
|
7462
|
+
costUsd: null
|
|
6905
7463
|
}
|
|
6906
|
-
actualIndex++;
|
|
6907
|
-
}
|
|
6908
|
-
if (!found && !argsMismatch) {
|
|
6909
|
-
misses.push(`Expected ${expectedTool} at position ${i}, not found in remaining trace`);
|
|
6910
|
-
}
|
|
6911
|
-
}
|
|
6912
|
-
const score = hits.length / expected.length;
|
|
6913
|
-
return {
|
|
6914
|
-
score,
|
|
6915
|
-
verdict: scoreToVerdict(score),
|
|
6916
|
-
hits,
|
|
6917
|
-
misses,
|
|
6918
|
-
expectedAspectCount: expected.length
|
|
6919
|
-
};
|
|
6920
|
-
}
|
|
6921
|
-
evaluateExact(toolCalls) {
|
|
6922
|
-
const expected = this.config.expected ?? [];
|
|
6923
|
-
if (expected.length === 0) {
|
|
6924
|
-
return {
|
|
6925
|
-
score: 1,
|
|
6926
|
-
verdict: "pass",
|
|
6927
|
-
hits: ["No tool sequence specified"],
|
|
6928
|
-
misses: [],
|
|
6929
|
-
expectedAspectCount: 0
|
|
6930
7464
|
};
|
|
6931
7465
|
}
|
|
6932
|
-
const
|
|
6933
|
-
const
|
|
6934
|
-
|
|
6935
|
-
misses.push(`Expected ${expected.length} tool calls, got ${toolCalls.length}`);
|
|
6936
|
-
}
|
|
6937
|
-
const checkLength = Math.min(expected.length, toolCalls.length);
|
|
6938
|
-
for (let i = 0; i < checkLength; i++) {
|
|
6939
|
-
const expectedItem = expected[i];
|
|
6940
|
-
const expectedTool = expectedItem.tool;
|
|
6941
|
-
const actualCall = toolCalls[i];
|
|
6942
|
-
const actualTool = actualCall.name;
|
|
6943
|
-
if (actualTool === expectedTool) {
|
|
6944
|
-
if (argsMatch(expectedItem.args, actualCall.args)) {
|
|
6945
|
-
hits.push(`Position ${i}: ${expectedTool}`);
|
|
6946
|
-
} else {
|
|
6947
|
-
misses.push(`Position ${i}: ${expectedTool} args mismatch`);
|
|
6948
|
-
}
|
|
6949
|
-
} else {
|
|
6950
|
-
misses.push(`Position ${i}: expected ${expectedTool}, got ${actualTool}`);
|
|
6951
|
-
}
|
|
6952
|
-
}
|
|
6953
|
-
for (let i = checkLength; i < expected.length; i++) {
|
|
6954
|
-
misses.push(`Position ${i}: expected ${expected[i].tool}, got nothing`);
|
|
6955
|
-
}
|
|
6956
|
-
const score = hits.length / expected.length;
|
|
7466
|
+
const passed = costUsd <= budget;
|
|
7467
|
+
const score = passed ? 1 : 0;
|
|
7468
|
+
const formatCost = (n) => `$${n.toFixed(4)}`;
|
|
6957
7469
|
return {
|
|
6958
7470
|
score,
|
|
6959
|
-
verdict:
|
|
6960
|
-
hits,
|
|
6961
|
-
misses,
|
|
6962
|
-
expectedAspectCount:
|
|
7471
|
+
verdict: passed ? "pass" : "fail",
|
|
7472
|
+
hits: passed ? [`Cost ${formatCost(costUsd)} <= ${formatCost(budget)} budget`] : [],
|
|
7473
|
+
misses: passed ? [] : [`Cost ${formatCost(costUsd)} > ${formatCost(budget)} budget`],
|
|
7474
|
+
expectedAspectCount: 1,
|
|
7475
|
+
reasoning: `Execution cost ${formatCost(costUsd)} (budget: ${formatCost(budget)})`,
|
|
7476
|
+
evaluatorRawRequest: {
|
|
7477
|
+
type: "cost",
|
|
7478
|
+
budget,
|
|
7479
|
+
costUsd
|
|
7480
|
+
}
|
|
6963
7481
|
};
|
|
6964
7482
|
}
|
|
6965
7483
|
};
|
|
7484
|
+
|
|
7485
|
+
// src/evaluation/evaluators/field-accuracy.ts
|
|
6966
7486
|
var DEFAULT_DATE_FORMATS = [
|
|
6967
7487
|
"YYYY-MM-DDTHH:mm:ssZ",
|
|
6968
7488
|
// ISO with timezone
|
|
@@ -7058,551 +7578,326 @@ var FieldAccuracyEvaluator = class {
|
|
|
7058
7578
|
return message.content;
|
|
7059
7579
|
}
|
|
7060
7580
|
if (typeof message.content === "string") {
|
|
7061
|
-
try {
|
|
7062
|
-
return parseJsonFromTextSafe(message.content);
|
|
7063
|
-
} catch {
|
|
7064
|
-
}
|
|
7065
|
-
}
|
|
7066
|
-
}
|
|
7067
|
-
}
|
|
7068
|
-
return void 0;
|
|
7069
|
-
}
|
|
7070
|
-
/**
|
|
7071
|
-
* Evaluate a single field against the expected value.
|
|
7072
|
-
*/
|
|
7073
|
-
evaluateField(fieldConfig, candidateData, expectedData) {
|
|
7074
|
-
const { path: path17, match, required = true, weight = 1 } = fieldConfig;
|
|
7075
|
-
const candidateValue = resolvePath(candidateData, path17);
|
|
7076
|
-
const expectedValue = resolvePath(expectedData, path17);
|
|
7077
|
-
if (expectedValue === void 0) {
|
|
7078
|
-
return {
|
|
7079
|
-
path: path17,
|
|
7080
|
-
score: 1,
|
|
7081
|
-
// No expected value means no comparison needed
|
|
7082
|
-
weight,
|
|
7083
|
-
hit: true,
|
|
7084
|
-
message: `${path17}: no expected value`
|
|
7085
|
-
};
|
|
7086
|
-
}
|
|
7087
|
-
if (candidateValue === void 0) {
|
|
7088
|
-
if (required) {
|
|
7089
|
-
return {
|
|
7090
|
-
path: path17,
|
|
7091
|
-
score: 0,
|
|
7092
|
-
weight,
|
|
7093
|
-
hit: false,
|
|
7094
|
-
message: `${path17} (required, missing)`
|
|
7095
|
-
};
|
|
7096
|
-
}
|
|
7097
|
-
return {
|
|
7098
|
-
path: path17,
|
|
7099
|
-
score: 1,
|
|
7100
|
-
// Don't penalize missing optional fields
|
|
7101
|
-
weight: 0,
|
|
7102
|
-
// Zero weight means it won't affect the score
|
|
7103
|
-
hit: true,
|
|
7104
|
-
message: `${path17}: optional field missing`
|
|
7105
|
-
};
|
|
7106
|
-
}
|
|
7107
|
-
switch (match) {
|
|
7108
|
-
case "exact":
|
|
7109
|
-
return this.compareExact(path17, candidateValue, expectedValue, weight);
|
|
7110
|
-
case "numeric_tolerance":
|
|
7111
|
-
return this.compareNumericTolerance(
|
|
7112
|
-
path17,
|
|
7113
|
-
candidateValue,
|
|
7114
|
-
expectedValue,
|
|
7115
|
-
fieldConfig,
|
|
7116
|
-
weight
|
|
7117
|
-
);
|
|
7118
|
-
case "date":
|
|
7119
|
-
return this.compareDate(path17, candidateValue, expectedValue, fieldConfig, weight);
|
|
7120
|
-
default:
|
|
7121
|
-
return {
|
|
7122
|
-
path: path17,
|
|
7123
|
-
score: 0,
|
|
7124
|
-
weight,
|
|
7125
|
-
hit: false,
|
|
7126
|
-
message: `${path17}: unknown match type "${match}"`
|
|
7127
|
-
};
|
|
7128
|
-
}
|
|
7129
|
-
}
|
|
7130
|
-
/**
|
|
7131
|
-
* Exact equality comparison.
|
|
7132
|
-
*/
|
|
7133
|
-
compareExact(path17, candidateValue, expectedValue, weight) {
|
|
7134
|
-
if (deepEqual(candidateValue, expectedValue)) {
|
|
7135
|
-
return {
|
|
7136
|
-
path: path17,
|
|
7137
|
-
score: 1,
|
|
7138
|
-
weight,
|
|
7139
|
-
hit: true,
|
|
7140
|
-
message: path17
|
|
7141
|
-
};
|
|
7142
|
-
}
|
|
7143
|
-
if (typeof candidateValue !== typeof expectedValue) {
|
|
7144
|
-
return {
|
|
7145
|
-
path: path17,
|
|
7146
|
-
score: 0,
|
|
7147
|
-
weight,
|
|
7148
|
-
hit: false,
|
|
7149
|
-
message: `${path17} (type mismatch: got ${typeof candidateValue}, expected ${typeof expectedValue})`
|
|
7150
|
-
};
|
|
7151
|
-
}
|
|
7152
|
-
return {
|
|
7153
|
-
path: path17,
|
|
7154
|
-
score: 0,
|
|
7155
|
-
weight,
|
|
7156
|
-
hit: false,
|
|
7157
|
-
message: `${path17} (value mismatch)`
|
|
7158
|
-
};
|
|
7159
|
-
}
|
|
7160
|
-
/**
|
|
7161
|
-
* Numeric comparison with absolute or relative tolerance.
|
|
7162
|
-
*/
|
|
7163
|
-
compareNumericTolerance(path17, candidateValue, expectedValue, fieldConfig, weight) {
|
|
7164
|
-
const { tolerance = 0, relative = false } = fieldConfig;
|
|
7165
|
-
const candidateNum = toNumber(candidateValue);
|
|
7166
|
-
const expectedNum = toNumber(expectedValue);
|
|
7167
|
-
if (candidateNum === null || expectedNum === null) {
|
|
7168
|
-
return {
|
|
7169
|
-
path: path17,
|
|
7170
|
-
score: 0,
|
|
7171
|
-
weight,
|
|
7172
|
-
hit: false,
|
|
7173
|
-
message: `${path17} (non-numeric value)`
|
|
7174
|
-
};
|
|
7175
|
-
}
|
|
7176
|
-
if (!Number.isFinite(candidateNum) || !Number.isFinite(expectedNum)) {
|
|
7177
|
-
return {
|
|
7178
|
-
path: path17,
|
|
7179
|
-
score: 0,
|
|
7180
|
-
weight,
|
|
7181
|
-
hit: false,
|
|
7182
|
-
message: `${path17} (invalid numeric value)`
|
|
7183
|
-
};
|
|
7184
|
-
}
|
|
7185
|
-
const diff = Math.abs(candidateNum - expectedNum);
|
|
7186
|
-
let withinTolerance;
|
|
7187
|
-
if (relative) {
|
|
7188
|
-
const relativeDiff = expectedNum === 0 ? diff : diff / Math.abs(expectedNum);
|
|
7189
|
-
withinTolerance = relativeDiff <= tolerance;
|
|
7190
|
-
} else {
|
|
7191
|
-
withinTolerance = diff <= tolerance;
|
|
7192
|
-
}
|
|
7193
|
-
if (withinTolerance) {
|
|
7194
|
-
return {
|
|
7195
|
-
path: path17,
|
|
7196
|
-
score: 1,
|
|
7197
|
-
weight,
|
|
7198
|
-
hit: true,
|
|
7199
|
-
message: `${path17} (within tolerance: diff=${diff.toFixed(2)})`
|
|
7200
|
-
};
|
|
7201
|
-
}
|
|
7202
|
-
return {
|
|
7203
|
-
path: path17,
|
|
7204
|
-
score: 0,
|
|
7205
|
-
weight,
|
|
7206
|
-
hit: false,
|
|
7207
|
-
message: `${path17} (outside tolerance: diff=${diff.toFixed(2)}, tolerance=${tolerance})`
|
|
7208
|
-
};
|
|
7209
|
-
}
|
|
7210
|
-
/**
|
|
7211
|
-
* Date comparison with format normalization.
|
|
7212
|
-
*/
|
|
7213
|
-
compareDate(path17, candidateValue, expectedValue, fieldConfig, weight) {
|
|
7214
|
-
const formats = fieldConfig.formats ?? DEFAULT_DATE_FORMATS;
|
|
7215
|
-
const candidateDate = parseDate(String(candidateValue), formats);
|
|
7216
|
-
const expectedDate = parseDate(String(expectedValue), formats);
|
|
7217
|
-
if (candidateDate === null) {
|
|
7218
|
-
return {
|
|
7219
|
-
path: path17,
|
|
7220
|
-
score: 0,
|
|
7221
|
-
weight,
|
|
7222
|
-
hit: false,
|
|
7223
|
-
message: `${path17} (unparseable candidate date)`
|
|
7224
|
-
};
|
|
7225
|
-
}
|
|
7226
|
-
if (expectedDate === null) {
|
|
7227
|
-
return {
|
|
7228
|
-
path: path17,
|
|
7229
|
-
score: 0,
|
|
7230
|
-
weight,
|
|
7231
|
-
hit: false,
|
|
7232
|
-
message: `${path17} (unparseable expected date)`
|
|
7233
|
-
};
|
|
7234
|
-
}
|
|
7235
|
-
if (candidateDate.getFullYear() === expectedDate.getFullYear() && candidateDate.getMonth() === expectedDate.getMonth() && candidateDate.getDate() === expectedDate.getDate()) {
|
|
7236
|
-
return {
|
|
7237
|
-
path: path17,
|
|
7238
|
-
score: 1,
|
|
7239
|
-
weight,
|
|
7240
|
-
hit: true,
|
|
7241
|
-
message: path17
|
|
7242
|
-
};
|
|
7243
|
-
}
|
|
7244
|
-
return {
|
|
7245
|
-
path: path17,
|
|
7246
|
-
score: 0,
|
|
7247
|
-
weight,
|
|
7248
|
-
hit: false,
|
|
7249
|
-
message: `${path17} (date mismatch: got ${formatDateISO(candidateDate)}, expected ${formatDateISO(expectedDate)})`
|
|
7250
|
-
};
|
|
7251
|
-
}
|
|
7252
|
-
/**
|
|
7253
|
-
* Aggregate field results using configured strategy.
|
|
7254
|
-
*/
|
|
7255
|
-
aggregateResults(results) {
|
|
7256
|
-
const aggregation = this.config.aggregation ?? "weighted_average";
|
|
7257
|
-
const hits = [];
|
|
7258
|
-
const misses = [];
|
|
7259
|
-
for (const result of results) {
|
|
7260
|
-
if (result.hit) {
|
|
7261
|
-
hits.push(result.message);
|
|
7262
|
-
} else {
|
|
7263
|
-
misses.push(result.message);
|
|
7264
|
-
}
|
|
7265
|
-
}
|
|
7266
|
-
let score;
|
|
7267
|
-
if (aggregation === "all_or_nothing") {
|
|
7268
|
-
score = misses.length === 0 ? 1 : 0;
|
|
7269
|
-
} else {
|
|
7270
|
-
const totalWeight = results.reduce((sum, r) => sum + r.weight, 0);
|
|
7271
|
-
if (totalWeight === 0) {
|
|
7272
|
-
score = results.length === 0 ? 1 : 0;
|
|
7273
|
-
} else {
|
|
7274
|
-
const weightedSum = results.reduce((sum, r) => sum + r.score * r.weight, 0);
|
|
7275
|
-
score = weightedSum / totalWeight;
|
|
7276
|
-
}
|
|
7277
|
-
}
|
|
7278
|
-
const reasoning = `${hits.length}/${results.length} fields matched`;
|
|
7279
|
-
return {
|
|
7280
|
-
score: clampScore(score),
|
|
7281
|
-
verdict: scoreToVerdict(score),
|
|
7282
|
-
hits: hits.slice(0, 4),
|
|
7283
|
-
misses: misses.slice(0, 4),
|
|
7284
|
-
expectedAspectCount: results.length,
|
|
7285
|
-
reasoning
|
|
7286
|
-
};
|
|
7287
|
-
}
|
|
7288
|
-
};
|
|
7289
|
-
function resolvePath(obj, path17) {
|
|
7290
|
-
if (!path17 || !obj) {
|
|
7291
|
-
return void 0;
|
|
7292
|
-
}
|
|
7293
|
-
const parts = path17.split(/\.|\[|\]/).filter((p) => p.length > 0);
|
|
7294
|
-
let current = obj;
|
|
7295
|
-
for (const part of parts) {
|
|
7296
|
-
if (current === null || current === void 0) {
|
|
7297
|
-
return void 0;
|
|
7298
|
-
}
|
|
7299
|
-
if (typeof current !== "object") {
|
|
7300
|
-
return void 0;
|
|
7301
|
-
}
|
|
7302
|
-
const isIndex = /^\d+$/.test(part);
|
|
7303
|
-
if (isIndex && Array.isArray(current)) {
|
|
7304
|
-
current = current[Number.parseInt(part, 10)];
|
|
7305
|
-
} else {
|
|
7306
|
-
current = current[part];
|
|
7307
|
-
}
|
|
7308
|
-
}
|
|
7309
|
-
return current;
|
|
7310
|
-
}
|
|
7311
|
-
function toNumber(value) {
|
|
7312
|
-
if (typeof value === "number") {
|
|
7313
|
-
return value;
|
|
7314
|
-
}
|
|
7315
|
-
if (typeof value === "string") {
|
|
7316
|
-
const num = Number.parseFloat(value);
|
|
7317
|
-
return Number.isNaN(num) ? null : num;
|
|
7318
|
-
}
|
|
7319
|
-
return null;
|
|
7320
|
-
}
|
|
7321
|
-
function parseDate(dateStr, formats) {
|
|
7322
|
-
if (!dateStr) return null;
|
|
7323
|
-
const trimmed = dateStr.trim();
|
|
7324
|
-
const isoDate = new Date(trimmed);
|
|
7325
|
-
if (!Number.isNaN(isoDate.getTime())) {
|
|
7326
|
-
return isoDate;
|
|
7327
|
-
}
|
|
7328
|
-
const localizedMatch = trimmed.match(/^(\d{1,2})-([A-Za-z]{3,9})-(\d{4})$/);
|
|
7329
|
-
if (localizedMatch) {
|
|
7330
|
-
const day = Number.parseInt(localizedMatch[1], 10);
|
|
7331
|
-
const monthName = localizedMatch[2].toLowerCase();
|
|
7332
|
-
const year = Number.parseInt(localizedMatch[3], 10);
|
|
7333
|
-
const month = MONTH_NAMES[monthName];
|
|
7334
|
-
if (month !== void 0) {
|
|
7335
|
-
return new Date(year, month, day);
|
|
7336
|
-
}
|
|
7337
|
-
}
|
|
7338
|
-
const usMatch = trimmed.match(/^(\d{1,2})[\/\-](\d{1,2})[\/\-](\d{4})$/);
|
|
7339
|
-
if (usMatch) {
|
|
7340
|
-
const hasUSFormat = formats.some((f) => f.includes("MM/DD") || f.includes("MM-DD"));
|
|
7341
|
-
const hasEUFormat = formats.some((f) => f.includes("DD/MM") || f.includes("DD-MM"));
|
|
7342
|
-
if (hasUSFormat && !hasEUFormat) {
|
|
7343
|
-
const month = Number.parseInt(usMatch[1], 10) - 1;
|
|
7344
|
-
const day = Number.parseInt(usMatch[2], 10);
|
|
7345
|
-
const year = Number.parseInt(usMatch[3], 10);
|
|
7346
|
-
if (month >= 0 && month <= 11 && day >= 1 && day <= 31) {
|
|
7347
|
-
return new Date(year, month, day);
|
|
7348
|
-
}
|
|
7349
|
-
} else if (hasEUFormat && !hasUSFormat) {
|
|
7350
|
-
const day = Number.parseInt(usMatch[1], 10);
|
|
7351
|
-
const month = Number.parseInt(usMatch[2], 10) - 1;
|
|
7352
|
-
const year = Number.parseInt(usMatch[3], 10);
|
|
7353
|
-
if (month >= 0 && month <= 11 && day >= 1 && day <= 31) {
|
|
7354
|
-
return new Date(year, month, day);
|
|
7355
|
-
}
|
|
7356
|
-
} else {
|
|
7357
|
-
const num1 = Number.parseInt(usMatch[1], 10);
|
|
7358
|
-
const num2 = Number.parseInt(usMatch[2], 10);
|
|
7359
|
-
const year = Number.parseInt(usMatch[3], 10);
|
|
7360
|
-
if (num1 > 12 && num2 <= 12) {
|
|
7361
|
-
return new Date(year, num2 - 1, num1);
|
|
7362
|
-
}
|
|
7363
|
-
if (num2 > 12 && num1 <= 12) {
|
|
7364
|
-
return new Date(year, num1 - 1, num2);
|
|
7365
|
-
}
|
|
7366
|
-
if (num1 <= 12 && num2 <= 31) {
|
|
7367
|
-
return new Date(year, num1 - 1, num2);
|
|
7368
|
-
}
|
|
7369
|
-
}
|
|
7370
|
-
}
|
|
7371
|
-
return null;
|
|
7372
|
-
}
|
|
7373
|
-
function formatDateISO(date) {
|
|
7374
|
-
return date.toISOString().split("T")[0];
|
|
7375
|
-
}
|
|
7376
|
-
function parseJsonFromTextSafe(text) {
|
|
7377
|
-
const cleaned = typeof text === "string" ? text.replace(/```json\n?|```/g, "").trim() : "";
|
|
7378
|
-
const match = cleaned.match(/\{[\s\S]*\}/);
|
|
7379
|
-
const blob = match?.[0] ?? cleaned;
|
|
7380
|
-
return JSON.parse(blob);
|
|
7381
|
-
}
|
|
7382
|
-
var DEFAULT_COMPOSITE_AGGREGATOR_PROMPT = `Review the following evaluation results:
|
|
7383
|
-
{{EVALUATOR_RESULTS_JSON}}
|
|
7384
|
-
|
|
7385
|
-
Decide the final score and verdict based on all evaluator results.
|
|
7386
|
-
Return a JSON object with: score (0.0-1.0), verdict (pass/fail/borderline), and reasoning.`;
|
|
7387
|
-
var CompositeEvaluator = class {
|
|
7388
|
-
kind = "composite";
|
|
7389
|
-
config;
|
|
7390
|
-
evaluatorFactory;
|
|
7391
|
-
cwd;
|
|
7392
|
-
constructor(options) {
|
|
7393
|
-
this.config = options.config;
|
|
7394
|
-
this.evaluatorFactory = options.evaluatorFactory;
|
|
7395
|
-
this.cwd = options.cwd;
|
|
7581
|
+
try {
|
|
7582
|
+
return parseJsonFromTextSafe(message.content);
|
|
7583
|
+
} catch {
|
|
7584
|
+
}
|
|
7585
|
+
}
|
|
7586
|
+
}
|
|
7587
|
+
}
|
|
7588
|
+
return void 0;
|
|
7396
7589
|
}
|
|
7397
|
-
|
|
7398
|
-
|
|
7399
|
-
|
|
7400
|
-
|
|
7590
|
+
/**
|
|
7591
|
+
* Evaluate a single field against the expected value.
|
|
7592
|
+
*/
|
|
7593
|
+
evaluateField(fieldConfig, candidateData, expectedData) {
|
|
7594
|
+
const { path: path17, match, required = true, weight = 1 } = fieldConfig;
|
|
7595
|
+
const candidateValue = resolvePath(candidateData, path17);
|
|
7596
|
+
const expectedValue = resolvePath(expectedData, path17);
|
|
7597
|
+
if (expectedValue === void 0) {
|
|
7598
|
+
return {
|
|
7599
|
+
path: path17,
|
|
7600
|
+
score: 1,
|
|
7601
|
+
// No expected value means no comparison needed
|
|
7602
|
+
weight,
|
|
7603
|
+
hit: true,
|
|
7604
|
+
message: `${path17}: no expected value`
|
|
7605
|
+
};
|
|
7606
|
+
}
|
|
7607
|
+
if (candidateValue === void 0) {
|
|
7608
|
+
if (required) {
|
|
7401
7609
|
return {
|
|
7402
|
-
|
|
7403
|
-
|
|
7404
|
-
|
|
7610
|
+
path: path17,
|
|
7611
|
+
score: 0,
|
|
7612
|
+
weight,
|
|
7613
|
+
hit: false,
|
|
7614
|
+
message: `${path17} (required, missing)`
|
|
7405
7615
|
};
|
|
7406
|
-
}
|
|
7407
|
-
|
|
7408
|
-
|
|
7409
|
-
|
|
7410
|
-
|
|
7411
|
-
|
|
7412
|
-
|
|
7413
|
-
|
|
7414
|
-
|
|
7415
|
-
|
|
7416
|
-
|
|
7616
|
+
}
|
|
7617
|
+
return {
|
|
7618
|
+
path: path17,
|
|
7619
|
+
score: 1,
|
|
7620
|
+
// Don't penalize missing optional fields
|
|
7621
|
+
weight: 0,
|
|
7622
|
+
// Zero weight means it won't affect the score
|
|
7623
|
+
hit: true,
|
|
7624
|
+
message: `${path17}: optional field missing`
|
|
7625
|
+
};
|
|
7626
|
+
}
|
|
7627
|
+
switch (match) {
|
|
7628
|
+
case "exact":
|
|
7629
|
+
return this.compareExact(path17, candidateValue, expectedValue, weight);
|
|
7630
|
+
case "numeric_tolerance":
|
|
7631
|
+
return this.compareNumericTolerance(
|
|
7632
|
+
path17,
|
|
7633
|
+
candidateValue,
|
|
7634
|
+
expectedValue,
|
|
7635
|
+
fieldConfig,
|
|
7636
|
+
weight
|
|
7637
|
+
);
|
|
7638
|
+
case "date":
|
|
7639
|
+
return this.compareDate(path17, candidateValue, expectedValue, fieldConfig, weight);
|
|
7417
7640
|
default:
|
|
7418
|
-
return
|
|
7641
|
+
return {
|
|
7642
|
+
path: path17,
|
|
7643
|
+
score: 0,
|
|
7644
|
+
weight,
|
|
7645
|
+
hit: false,
|
|
7646
|
+
message: `${path17}: unknown match type "${match}"`
|
|
7647
|
+
};
|
|
7419
7648
|
}
|
|
7420
7649
|
}
|
|
7421
|
-
|
|
7422
|
-
|
|
7423
|
-
|
|
7424
|
-
|
|
7425
|
-
|
|
7426
|
-
|
|
7427
|
-
|
|
7428
|
-
|
|
7429
|
-
const weight = weights?.[member.id] ?? 1;
|
|
7430
|
-
totalWeight += weight;
|
|
7431
|
-
weightedSum += member.result.score * weight;
|
|
7432
|
-
allHits.push(...member.result.hits.map((h) => `[${member.id}] ${h}`));
|
|
7433
|
-
allMisses.push(...member.result.misses.map((m) => `[${member.id}] ${m}`));
|
|
7434
|
-
if (member.result.reasoning) {
|
|
7435
|
-
reasoningParts.push(`${member.id}: ${member.result.reasoning}`);
|
|
7436
|
-
}
|
|
7437
|
-
evaluatorResults.push({
|
|
7438
|
-
name: member.id,
|
|
7439
|
-
type: member.type,
|
|
7440
|
-
score: member.result.score,
|
|
7650
|
+
/**
|
|
7651
|
+
* Exact equality comparison.
|
|
7652
|
+
*/
|
|
7653
|
+
compareExact(path17, candidateValue, expectedValue, weight) {
|
|
7654
|
+
if (deepEqual(candidateValue, expectedValue)) {
|
|
7655
|
+
return {
|
|
7656
|
+
path: path17,
|
|
7657
|
+
score: 1,
|
|
7441
7658
|
weight,
|
|
7442
|
-
|
|
7443
|
-
|
|
7444
|
-
|
|
7445
|
-
|
|
7446
|
-
|
|
7447
|
-
|
|
7448
|
-
|
|
7659
|
+
hit: true,
|
|
7660
|
+
message: path17
|
|
7661
|
+
};
|
|
7662
|
+
}
|
|
7663
|
+
if (typeof candidateValue !== typeof expectedValue) {
|
|
7664
|
+
return {
|
|
7665
|
+
path: path17,
|
|
7666
|
+
score: 0,
|
|
7667
|
+
weight,
|
|
7668
|
+
hit: false,
|
|
7669
|
+
message: `${path17} (type mismatch: got ${typeof candidateValue}, expected ${typeof expectedValue})`
|
|
7670
|
+
};
|
|
7449
7671
|
}
|
|
7450
|
-
const finalScore = totalWeight > 0 ? weightedSum / totalWeight : 0;
|
|
7451
7672
|
return {
|
|
7452
|
-
|
|
7453
|
-
|
|
7454
|
-
|
|
7455
|
-
|
|
7456
|
-
|
|
7457
|
-
reasoning: reasoningParts.length > 0 ? reasoningParts.join("; ") : void 0,
|
|
7458
|
-
evaluatorRawRequest: {
|
|
7459
|
-
aggregator: "weighted_average",
|
|
7460
|
-
...weights ? { weights } : {}
|
|
7461
|
-
},
|
|
7462
|
-
evaluatorResults
|
|
7673
|
+
path: path17,
|
|
7674
|
+
score: 0,
|
|
7675
|
+
weight,
|
|
7676
|
+
hit: false,
|
|
7677
|
+
message: `${path17} (value mismatch)`
|
|
7463
7678
|
};
|
|
7464
7679
|
}
|
|
7465
|
-
|
|
7466
|
-
|
|
7467
|
-
|
|
7468
|
-
|
|
7469
|
-
|
|
7470
|
-
|
|
7471
|
-
|
|
7472
|
-
|
|
7473
|
-
verdict: member.result.verdict,
|
|
7474
|
-
hits: [...member.result.hits],
|
|
7475
|
-
misses: [...member.result.misses],
|
|
7476
|
-
reasoning: member.result.reasoning,
|
|
7477
|
-
evaluatorRawRequest: member.result.evaluatorRawRequest,
|
|
7478
|
-
evaluatorResults: member.result.evaluatorResults
|
|
7479
|
-
}));
|
|
7480
|
-
try {
|
|
7481
|
-
const stdout = await executeScript(scriptPath, inputPayload, void 0, cwd);
|
|
7482
|
-
const parsed = parseJsonSafe(stdout);
|
|
7483
|
-
const score = clampScore(typeof parsed?.score === "number" ? parsed.score : 0);
|
|
7484
|
-
const hits = Array.isArray(parsed?.hits) ? parsed.hits.filter(isNonEmptyString) : [];
|
|
7485
|
-
const misses = Array.isArray(parsed?.misses) ? parsed.misses.filter(isNonEmptyString) : [];
|
|
7486
|
-
const reasoning = typeof parsed?.reasoning === "string" ? parsed.reasoning : void 0;
|
|
7487
|
-
const verdict = typeof parsed?.verdict === "string" && (parsed.verdict === "pass" || parsed.verdict === "fail" || parsed.verdict === "borderline") ? parsed.verdict : scoreToVerdict(score);
|
|
7680
|
+
/**
|
|
7681
|
+
* Numeric comparison with absolute or relative tolerance.
|
|
7682
|
+
*/
|
|
7683
|
+
compareNumericTolerance(path17, candidateValue, expectedValue, fieldConfig, weight) {
|
|
7684
|
+
const { tolerance = 0, relative = false } = fieldConfig;
|
|
7685
|
+
const candidateNum = toNumber(candidateValue);
|
|
7686
|
+
const expectedNum = toNumber(expectedValue);
|
|
7687
|
+
if (candidateNum === null || expectedNum === null) {
|
|
7488
7688
|
return {
|
|
7489
|
-
|
|
7490
|
-
|
|
7491
|
-
|
|
7492
|
-
|
|
7493
|
-
|
|
7494
|
-
reasoning,
|
|
7495
|
-
evaluatorRawRequest: {
|
|
7496
|
-
aggregator: "code_judge",
|
|
7497
|
-
script: scriptPath
|
|
7498
|
-
},
|
|
7499
|
-
evaluatorResults
|
|
7689
|
+
path: path17,
|
|
7690
|
+
score: 0,
|
|
7691
|
+
weight,
|
|
7692
|
+
hit: false,
|
|
7693
|
+
message: `${path17} (non-numeric value)`
|
|
7500
7694
|
};
|
|
7501
|
-
}
|
|
7502
|
-
|
|
7695
|
+
}
|
|
7696
|
+
if (!Number.isFinite(candidateNum) || !Number.isFinite(expectedNum)) {
|
|
7503
7697
|
return {
|
|
7698
|
+
path: path17,
|
|
7504
7699
|
score: 0,
|
|
7505
|
-
|
|
7506
|
-
|
|
7507
|
-
|
|
7508
|
-
expectedAspectCount: 1,
|
|
7509
|
-
reasoning: message,
|
|
7510
|
-
evaluatorRawRequest: {
|
|
7511
|
-
aggregator: "code_judge",
|
|
7512
|
-
script: scriptPath,
|
|
7513
|
-
error: message
|
|
7514
|
-
},
|
|
7515
|
-
evaluatorResults
|
|
7700
|
+
weight,
|
|
7701
|
+
hit: false,
|
|
7702
|
+
message: `${path17} (invalid numeric value)`
|
|
7516
7703
|
};
|
|
7517
7704
|
}
|
|
7518
|
-
|
|
7519
|
-
|
|
7520
|
-
|
|
7521
|
-
|
|
7522
|
-
|
|
7705
|
+
const diff = Math.abs(candidateNum - expectedNum);
|
|
7706
|
+
let withinTolerance;
|
|
7707
|
+
if (relative) {
|
|
7708
|
+
const relativeDiff = expectedNum === 0 ? diff : diff / Math.abs(expectedNum);
|
|
7709
|
+
withinTolerance = relativeDiff <= tolerance;
|
|
7710
|
+
} else {
|
|
7711
|
+
withinTolerance = diff <= tolerance;
|
|
7523
7712
|
}
|
|
7524
|
-
|
|
7525
|
-
|
|
7526
|
-
|
|
7527
|
-
|
|
7528
|
-
|
|
7529
|
-
|
|
7530
|
-
|
|
7531
|
-
|
|
7532
|
-
|
|
7533
|
-
|
|
7534
|
-
|
|
7535
|
-
|
|
7536
|
-
|
|
7537
|
-
|
|
7538
|
-
|
|
7539
|
-
const systemPrompt = buildOutputSchema();
|
|
7540
|
-
const evaluatorRawRequest = {
|
|
7541
|
-
aggregator: "llm_judge",
|
|
7542
|
-
userPrompt,
|
|
7543
|
-
systemPrompt,
|
|
7544
|
-
target: judgeProvider.targetName
|
|
7713
|
+
if (withinTolerance) {
|
|
7714
|
+
return {
|
|
7715
|
+
path: path17,
|
|
7716
|
+
score: 1,
|
|
7717
|
+
weight,
|
|
7718
|
+
hit: true,
|
|
7719
|
+
message: `${path17} (within tolerance: diff=${diff.toFixed(2)})`
|
|
7720
|
+
};
|
|
7721
|
+
}
|
|
7722
|
+
return {
|
|
7723
|
+
path: path17,
|
|
7724
|
+
score: 0,
|
|
7725
|
+
weight,
|
|
7726
|
+
hit: false,
|
|
7727
|
+
message: `${path17} (outside tolerance: diff=${diff.toFixed(2)}, tolerance=${tolerance})`
|
|
7545
7728
|
};
|
|
7546
|
-
|
|
7547
|
-
|
|
7548
|
-
|
|
7549
|
-
|
|
7550
|
-
|
|
7551
|
-
|
|
7552
|
-
|
|
7553
|
-
|
|
7554
|
-
|
|
7555
|
-
const score2 = clampScore(data2.score);
|
|
7556
|
-
const hits2 = Array.isArray(data2.hits) ? data2.hits.filter(isNonEmptyString).slice(0, 4) : [];
|
|
7557
|
-
const misses2 = Array.isArray(data2.misses) ? data2.misses.filter(isNonEmptyString).slice(0, 4) : [];
|
|
7558
|
-
const reasoning2 = data2.reasoning;
|
|
7559
|
-
return {
|
|
7560
|
-
score: score2,
|
|
7561
|
-
verdict: scoreToVerdict(score2),
|
|
7562
|
-
hits: hits2,
|
|
7563
|
-
misses: misses2,
|
|
7564
|
-
expectedAspectCount: Math.max(hits2.length + misses2.length, 1),
|
|
7565
|
-
reasoning: reasoning2,
|
|
7566
|
-
evaluatorRawRequest,
|
|
7567
|
-
evaluatorResults
|
|
7568
|
-
};
|
|
7569
|
-
}
|
|
7570
|
-
const response = await judgeProvider.invoke({
|
|
7571
|
-
question: userPrompt,
|
|
7572
|
-
systemPrompt,
|
|
7573
|
-
evalCaseId: context.evalCase.id,
|
|
7574
|
-
attempt: context.attempt
|
|
7575
|
-
});
|
|
7576
|
-
const data = freeformEvaluationSchema.parse(
|
|
7577
|
-
parseJsonFromText(extractLastAssistantContent(response.outputMessages))
|
|
7578
|
-
);
|
|
7579
|
-
const score = clampScore(data.score);
|
|
7580
|
-
const hits = Array.isArray(data.hits) ? data.hits.filter(isNonEmptyString).slice(0, 4) : [];
|
|
7581
|
-
const misses = Array.isArray(data.misses) ? data.misses.filter(isNonEmptyString).slice(0, 4) : [];
|
|
7582
|
-
const reasoning = data.reasoning;
|
|
7729
|
+
}
|
|
7730
|
+
/**
|
|
7731
|
+
* Date comparison with format normalization.
|
|
7732
|
+
*/
|
|
7733
|
+
compareDate(path17, candidateValue, expectedValue, fieldConfig, weight) {
|
|
7734
|
+
const formats = fieldConfig.formats ?? DEFAULT_DATE_FORMATS;
|
|
7735
|
+
const candidateDate = parseDate(String(candidateValue), formats);
|
|
7736
|
+
const expectedDate = parseDate(String(expectedValue), formats);
|
|
7737
|
+
if (candidateDate === null) {
|
|
7583
7738
|
return {
|
|
7584
|
-
|
|
7585
|
-
|
|
7586
|
-
|
|
7587
|
-
|
|
7588
|
-
|
|
7589
|
-
reasoning,
|
|
7590
|
-
evaluatorRawRequest,
|
|
7591
|
-
evaluatorResults
|
|
7739
|
+
path: path17,
|
|
7740
|
+
score: 0,
|
|
7741
|
+
weight,
|
|
7742
|
+
hit: false,
|
|
7743
|
+
message: `${path17} (unparseable candidate date)`
|
|
7592
7744
|
};
|
|
7593
|
-
}
|
|
7745
|
+
}
|
|
7746
|
+
if (expectedDate === null) {
|
|
7594
7747
|
return {
|
|
7748
|
+
path: path17,
|
|
7595
7749
|
score: 0,
|
|
7596
|
-
|
|
7597
|
-
|
|
7598
|
-
|
|
7599
|
-
|
|
7600
|
-
|
|
7601
|
-
|
|
7750
|
+
weight,
|
|
7751
|
+
hit: false,
|
|
7752
|
+
message: `${path17} (unparseable expected date)`
|
|
7753
|
+
};
|
|
7754
|
+
}
|
|
7755
|
+
if (candidateDate.getFullYear() === expectedDate.getFullYear() && candidateDate.getMonth() === expectedDate.getMonth() && candidateDate.getDate() === expectedDate.getDate()) {
|
|
7756
|
+
return {
|
|
7757
|
+
path: path17,
|
|
7758
|
+
score: 1,
|
|
7759
|
+
weight,
|
|
7760
|
+
hit: true,
|
|
7761
|
+
message: path17
|
|
7602
7762
|
};
|
|
7603
7763
|
}
|
|
7764
|
+
return {
|
|
7765
|
+
path: path17,
|
|
7766
|
+
score: 0,
|
|
7767
|
+
weight,
|
|
7768
|
+
hit: false,
|
|
7769
|
+
message: `${path17} (date mismatch: got ${formatDateISO(candidateDate)}, expected ${formatDateISO(expectedDate)})`
|
|
7770
|
+
};
|
|
7771
|
+
}
|
|
7772
|
+
/**
|
|
7773
|
+
* Aggregate field results using configured strategy.
|
|
7774
|
+
*/
|
|
7775
|
+
aggregateResults(results) {
|
|
7776
|
+
const aggregation = this.config.aggregation ?? "weighted_average";
|
|
7777
|
+
const hits = [];
|
|
7778
|
+
const misses = [];
|
|
7779
|
+
for (const result of results) {
|
|
7780
|
+
if (result.hit) {
|
|
7781
|
+
hits.push(result.message);
|
|
7782
|
+
} else {
|
|
7783
|
+
misses.push(result.message);
|
|
7784
|
+
}
|
|
7785
|
+
}
|
|
7786
|
+
let score;
|
|
7787
|
+
if (aggregation === "all_or_nothing") {
|
|
7788
|
+
score = misses.length === 0 ? 1 : 0;
|
|
7789
|
+
} else {
|
|
7790
|
+
const totalWeight = results.reduce((sum, r) => sum + r.weight, 0);
|
|
7791
|
+
if (totalWeight === 0) {
|
|
7792
|
+
score = results.length === 0 ? 1 : 0;
|
|
7793
|
+
} else {
|
|
7794
|
+
const weightedSum = results.reduce((sum, r) => sum + r.score * r.weight, 0);
|
|
7795
|
+
score = weightedSum / totalWeight;
|
|
7796
|
+
}
|
|
7797
|
+
}
|
|
7798
|
+
const reasoning = `${hits.length}/${results.length} fields matched`;
|
|
7799
|
+
return {
|
|
7800
|
+
score: clampScore(score),
|
|
7801
|
+
verdict: scoreToVerdict(score),
|
|
7802
|
+
hits: hits.slice(0, 4),
|
|
7803
|
+
misses: misses.slice(0, 4),
|
|
7804
|
+
expectedAspectCount: results.length,
|
|
7805
|
+
reasoning
|
|
7806
|
+
};
|
|
7604
7807
|
}
|
|
7605
7808
|
};
|
|
7809
|
+
/**
 * Reads a nested value using a dotted / bracketed path such as
 * "invoice.items[1].sku".
 *
 * The path is split on ".", "[" and "]"; empty segments are dropped. Walking
 * stops with `undefined` as soon as the current value is null, undefined or
 * not an object. All-digit segments are used as numeric indexes when the
 * current value is an array.
 *
 * @param obj Root value to read from; a falsy root yields `undefined`.
 * @param path17 Path expression; an empty path yields `undefined`.
 * @returns The value found at the path, or `undefined`.
 */
function resolvePath(obj, path17) {
  if (!path17 || !obj) {
    return void 0;
  }
  const segments = path17.split(/\.|\[|\]/).filter(Boolean);
  let cursor = obj;
  for (const segment of segments) {
    // `typeof null` is "object", so null needs its own check.
    if (cursor == null || typeof cursor !== "object") {
      return void 0;
    }
    const key = Array.isArray(cursor) && /^\d+$/.test(segment)
      ? Number.parseInt(segment, 10)
      : segment;
    cursor = cursor[key];
  }
  return cursor;
}
|
|
7831
|
+
/**
 * Coerces a value to a number for numeric comparison.
 *
 * Numbers are returned untouched (including NaN and Infinity, which callers
 * must screen themselves). Strings go through `Number.parseFloat`, so a
 * leading numeric prefix is accepted. Every other type yields `null`.
 *
 * @param value Value to convert.
 * @returns The numeric value, or `null` when no number can be read.
 */
function toNumber(value) {
  switch (typeof value) {
    case "number":
      return value;
    case "string": {
      const parsed = Number.parseFloat(value);
      return Number.isNaN(parsed) ? null : parsed;
    }
    default:
      return null;
  }
}
|
|
7841
|
+
/**
 * Parses a date string into a `Date`.
 *
 * Recognised shapes are tried in a fixed order so the result does not depend
 * on the JavaScript engine's lenient `Date` string parser:
 *   1. ISO date-only, `YYYY-MM-DD`: built as a LOCAL calendar day. Callers in
 *      this file compare with local getters, whereas `new Date("YYYY-MM-DD")`
 *      would yield UTC midnight and shift the day in negative-offset zones.
 *   2. `D-Mon-YYYY` / `D-Month-YYYY`: the month is looked up in
 *      `MONTH_NAMES` (defined elsewhere in this file; the looked-up value is
 *      used directly as the zero-based month index).
 *   3. Numeric `A/B/YYYY` or `A-B-YYYY`: read month-first when `formats`
 *      only lists MM/DD-style patterns, day-first when it only lists
 *      DD/MM-style patterns. Otherwise the order is inferred (a first number
 *      above 12 must be the day), defaulting to month-first when ambiguous.
 *   4. Anything else (e.g. full ISO timestamps) falls back to `new Date()`.
 *
 * Impossible calendar days such as 31 February are rejected rather than
 * rolled over into the next month.
 *
 * @param dateStr Text to parse; surrounding whitespace is ignored.
 * @param formats Format hints such as "DD/MM/YYYY"; only used to settle
 *   numeric day/month order. Defaults to no hints.
 * @returns The parsed date, or `null` when the text is not a valid date.
 */
function parseDate(dateStr, formats = []) {
  if (!dateStr) return null;
  const trimmed = dateStr.trim();

  // Builds a local-midnight date and rejects values the Date constructor
  // would silently normalise (month 13, day 31 in a 30-day month, ...).
  const toLocalDate = (year, monthIndex, day) => {
    const date = new Date(year, monthIndex, day);
    const unchanged = date.getFullYear() === year
      && date.getMonth() === monthIndex
      && date.getDate() === day;
    return unchanged ? date : null;
  };

  const isoDateOnly = trimmed.match(/^(\d{4})-(\d{2})-(\d{2})$/);
  if (isoDateOnly) {
    return toLocalDate(
      Number.parseInt(isoDateOnly[1], 10),
      Number.parseInt(isoDateOnly[2], 10) - 1,
      Number.parseInt(isoDateOnly[3], 10)
    );
  }

  const localizedMatch = trimmed.match(/^(\d{1,2})-([A-Za-z]{3,9})-(\d{4})$/);
  if (localizedMatch) {
    const monthIndex = MONTH_NAMES[localizedMatch[2].toLowerCase()];
    // An unknown month name is not fatal: let the generic fallback try.
    if (monthIndex !== void 0) {
      return toLocalDate(
        Number.parseInt(localizedMatch[3], 10),
        monthIndex,
        Number.parseInt(localizedMatch[1], 10)
      );
    }
  }

  const numericMatch = trimmed.match(/^(\d{1,2})[\/\-](\d{1,2})[\/\-](\d{4})$/);
  if (numericMatch) {
    const first = Number.parseInt(numericMatch[1], 10);
    const second = Number.parseInt(numericMatch[2], 10);
    const year = Number.parseInt(numericMatch[3], 10);
    const hasUSFormat = formats.some((f) => f.includes("MM/DD") || f.includes("MM-DD"));
    const hasEUFormat = formats.some((f) => f.includes("DD/MM") || f.includes("DD-MM"));

    let monthFirst;
    if (hasUSFormat && !hasEUFormat) {
      monthFirst = true;
    } else if (hasEUFormat && !hasUSFormat) {
      monthFirst = false;
    } else {
      // No usable hint: only switch to day-first when the first number
      // cannot be a month and the second one can.
      monthFirst = !(first > 12 && second <= 12);
    }
    const month = monthFirst ? first : second;
    const day = monthFirst ? second : first;
    // The shape matched, so the verdict is final here. Falling through to
    // `new Date()` could reinterpret the text against the configured order.
    return toLocalDate(year, month - 1, day);
  }

  const fallback = new Date(trimmed);
  return Number.isNaN(fallback.getTime()) ? null : fallback;
}
|
|
7893
|
+
/**
 * Formats a date as `YYYY-MM-DD` using its LOCAL calendar components.
 *
 * Local getters are used instead of `toISOString()`, which converts to UTC
 * first. A date built at local midnight would otherwise print as the
 * previous day in time zones ahead of UTC, or as the next day for
 * late-evening times behind UTC.
 *
 * @param date A valid `Date`.
 * @returns The local calendar day, zero-padded, e.g. "2024-01-05".
 */
function formatDateISO(date) {
  const year = String(date.getFullYear()).padStart(4, "0");
  const month = String(date.getMonth() + 1).padStart(2, "0");
  const day = String(date.getDate()).padStart(2, "0");
  return `${year}-${month}-${day}`;
}
|
|
7896
|
+
/**
 * Thin pass-through to `parseJsonFromText`.
 *
 * No error handling is added here: whatever `parseJsonFromText` returns or
 * throws for the given text reaches the caller unchanged.
 *
 * @param text Text expected to contain JSON.
 * @returns The result of `parseJsonFromText(text)`.
 */
function parseJsonFromTextSafe(text) {
  const parsed = parseJsonFromText(text);
  return parsed;
}
|
|
7899
|
+
|
|
7900
|
+
// src/evaluation/evaluators/latency.ts
|
|
7606
7901
|
var LatencyEvaluator = class {
|
|
7607
7902
|
kind = "latency";
|
|
7608
7903
|
config;
|
|
@@ -7639,53 +7934,13 @@ var LatencyEvaluator = class {
|
|
|
7639
7934
|
evaluatorRawRequest: {
|
|
7640
7935
|
type: "latency",
|
|
7641
7936
|
threshold,
|
|
7642
|
-
durationMs
|
|
7643
|
-
}
|
|
7644
|
-
};
|
|
7645
|
-
}
|
|
7646
|
-
};
|
|
7647
|
-
var CostEvaluator = class {
|
|
7648
|
-
kind = "cost";
|
|
7649
|
-
config;
|
|
7650
|
-
constructor(options) {
|
|
7651
|
-
this.config = options.config;
|
|
7652
|
-
}
|
|
7653
|
-
evaluate(context) {
|
|
7654
|
-
const { budget } = this.config;
|
|
7655
|
-
const costUsd = context.traceSummary?.costUsd;
|
|
7656
|
-
if (costUsd === void 0) {
|
|
7657
|
-
return {
|
|
7658
|
-
score: 0,
|
|
7659
|
-
verdict: "fail",
|
|
7660
|
-
hits: [],
|
|
7661
|
-
misses: ["No cost data available in trace"],
|
|
7662
|
-
expectedAspectCount: 1,
|
|
7663
|
-
reasoning: "Execution cost not reported by provider",
|
|
7664
|
-
evaluatorRawRequest: {
|
|
7665
|
-
type: "cost",
|
|
7666
|
-
budget,
|
|
7667
|
-
costUsd: null
|
|
7668
|
-
}
|
|
7669
|
-
};
|
|
7670
|
-
}
|
|
7671
|
-
const passed = costUsd <= budget;
|
|
7672
|
-
const score = passed ? 1 : 0;
|
|
7673
|
-
const formatCost = (n) => `$${n.toFixed(4)}`;
|
|
7674
|
-
return {
|
|
7675
|
-
score,
|
|
7676
|
-
verdict: passed ? "pass" : "fail",
|
|
7677
|
-
hits: passed ? [`Cost ${formatCost(costUsd)} <= ${formatCost(budget)} budget`] : [],
|
|
7678
|
-
misses: passed ? [] : [`Cost ${formatCost(costUsd)} > ${formatCost(budget)} budget`],
|
|
7679
|
-
expectedAspectCount: 1,
|
|
7680
|
-
reasoning: `Execution cost ${formatCost(costUsd)} (budget: ${formatCost(budget)})`,
|
|
7681
|
-
evaluatorRawRequest: {
|
|
7682
|
-
type: "cost",
|
|
7683
|
-
budget,
|
|
7684
|
-
costUsd
|
|
7937
|
+
durationMs
|
|
7685
7938
|
}
|
|
7686
7939
|
};
|
|
7687
7940
|
}
|
|
7688
7941
|
};
|
|
7942
|
+
|
|
7943
|
+
// src/evaluation/evaluators/token-usage.ts
|
|
7689
7944
|
var TokenUsageEvaluator = class {
|
|
7690
7945
|
kind = "token_usage";
|
|
7691
7946
|
config;
|
|
@@ -7769,8 +8024,228 @@ var TokenUsageEvaluator = class {
|
|
|
7769
8024
|
}
|
|
7770
8025
|
};
|
|
7771
8026
|
|
|
8027
|
+
// src/evaluation/evaluators/tool-trajectory.ts
|
|
8028
|
+
/**
 * Checks whether a tool call's arguments satisfy an expectation.
 *
 * The match is partial: every key listed in `expected` must be an own
 * property of `actual` with a deeply equal value (per `deepEqual`); extra
 * keys in `actual` are ignored.
 *
 * @param expected Expected arguments. `undefined`, `null` or the string
 *   "any" mean "no constraint" and always match.
 * @param actual Arguments the tool was actually called with. A missing
 *   (`undefined` or `null`) value never satisfies a constraint.
 * @returns `true` when the actual arguments satisfy the expectation.
 */
function argsMatch(expected, actual) {
  // `null` is treated like `undefined`: Object.keys(null) would throw.
  if (expected === void 0 || expected === null || expected === "any") {
    return true;
  }
  // Object.hasOwn(null, key) throws, so reject a null input explicitly.
  if (actual === void 0 || actual === null) {
    return false;
  }
  return Object.keys(expected).every(
    (key) => Object.hasOwn(actual, key) && deepEqual(expected[key], actual[key])
  );
}
|
|
8038
|
+
/**
 * Evaluator that scores a run by the tools it called.
 *
 * `config.mode` selects the strategy:
 * - "any_order": each tool in `config.minimums` must be called at least the
 *   given number of times; order is irrelevant.
 * - "in_order": the tools in `config.expected` must appear in that relative
 *   order; other calls may sit in between.
 * - "exact": the call at each position must be the expected tool.
 *
 * Tool calls are read from `context.outputMessages`. When the messages hold
 * no tool calls, `context.traceSummary` is used as the summary instead (the
 * sequence-based modes then see an empty call list).
 */
var ToolTrajectoryEvaluator = class {
  kind = "tool_trajectory";
  config;

  /**
   * @param options Construction options; `options.config` is stored as-is.
   */
  constructor(options) {
    this.config = options.config;
  }

  /**
   * Runs the configured strategy against the calls found in the context.
   *
   * @param context Reads `outputMessages` and `traceSummary`.
   * @returns `{ score, verdict, hits, misses, expectedAspectCount }`.
   */
  evaluate(context) {
    const { outputMessages, traceSummary } = context;
    const toolCalls = this.extractToolCallsFromMessages(outputMessages);
    // Prefer calls recovered from messages; otherwise use the provided summary.
    const summary = toolCalls.length > 0 ? this.buildSummary(toolCalls) : traceSummary;
    if (!summary) {
      return {
        score: 0,
        verdict: "fail",
        hits: [],
        misses: ["No trace available for evaluation"],
        expectedAspectCount: 1
      };
    }
    switch (this.config.mode) {
      case "any_order":
        return this.evaluateAnyOrder(summary);
      case "in_order":
        return this.evaluateInOrder(toolCalls);
      case "exact":
        return this.evaluateExact(toolCalls);
      default:
        return {
          score: 0,
          verdict: "fail",
          hits: [],
          misses: [`Unknown mode: ${this.config.mode}`],
          expectedAspectCount: 1
        };
    }
  }

  /**
   * Extract tool calls from output messages.
   *
   * @param messages Messages whose optional `toolCalls` entries carry `tool`
   *   and `input`; a missing list yields an empty result.
   * @returns Flat list of `{ name, args }` in message order.
   */
  extractToolCallsFromMessages(messages) {
    if (!messages) {
      return [];
    }
    const toolCalls = [];
    for (const message of messages) {
      // Null entries and messages without tool calls contribute nothing.
      if (!message?.toolCalls) {
        continue;
      }
      for (const call of message.toolCalls) {
        toolCalls.push({
          name: call.tool,
          args: call.input
        });
      }
    }
    return toolCalls;
  }

  /**
   * Build a summary from extracted tool calls.
   *
   * @param toolCalls List of `{ name, args }`.
   * @returns `{ eventCount, toolNames, toolCallsByName, errorCount }` with
   *   `toolNames` sorted and `errorCount` fixed at 0.
   */
  buildSummary(toolCalls) {
    // Count in a Map so tool names that collide with Object.prototype
    // members ("constructor", "toString", ...) start from zero.
    const counts = new Map();
    for (const call of toolCalls) {
      const name = String(call.name);
      counts.set(name, (counts.get(name) ?? 0) + 1);
    }
    const toolCallsByName = Object.fromEntries(counts);
    const toolNames = Object.keys(toolCallsByName).sort();
    return {
      eventCount: toolCalls.length,
      toolNames,
      toolCallsByName,
      errorCount: 0
    };
  }

  /**
   * Checks minimum call counts regardless of order.
   *
   * @param summary Summary whose `toolCallsByName` maps tool name to count.
   * @returns Score equal to the fraction of required tools that met their
   *   minimum; a perfect score when no minimums are configured.
   */
  evaluateAnyOrder(summary) {
    const minimums = this.config.minimums ?? {};
    const toolNames = Object.keys(minimums);
    if (toolNames.length === 0) {
      return {
        score: 1,
        verdict: "pass",
        hits: ["No tool requirements specified"],
        misses: [],
        expectedAspectCount: 0
      };
    }
    const callCounts = summary.toolCallsByName ?? {};
    const hits = [];
    const misses = [];
    for (const toolName of toolNames) {
      const required = minimums[toolName];
      // Only own properties count; inherited members are not call counts.
      const recorded = Object.hasOwn(callCounts, toolName) ? callCounts[toolName] : void 0;
      const actual = recorded ?? 0;
      if (actual >= required) {
        hits.push(`${toolName}: called ${actual} times (required \u2265${required})`);
      } else {
        misses.push(`${toolName}: called ${actual} times (required \u2265${required})`);
      }
    }
    const score = hits.length / toolNames.length;
    return {
      score,
      verdict: scoreToVerdict(score),
      hits,
      misses,
      expectedAspectCount: toolNames.length
    };
  }

  /**
   * Checks that the expected tools occur in order, allowing other calls in
   * between.
   *
   * The actual calls are scanned once, left to right. For each expected
   * item the scan advances to the next call with the same tool name. That
   * call is consumed whether or not its arguments match, so an argument
   * mismatch is a miss and later calls are not retried for that item.
   *
   * @param toolCalls List of `{ name, args }` in call order.
   * @returns Score equal to matched items divided by expected items.
   */
  evaluateInOrder(toolCalls) {
    const expected = this.config.expected ?? [];
    if (expected.length === 0) {
      return {
        score: 1,
        verdict: "pass",
        hits: ["No tool sequence specified"],
        misses: [],
        expectedAspectCount: 0
      };
    }
    const hits = [];
    const misses = [];
    let actualIndex = 0;
    for (let i = 0; i < expected.length; i++) {
      const expectedItem = expected[i];
      const expectedTool = expectedItem.tool;
      let found = false;
      let argsMismatch = false;
      while (actualIndex < toolCalls.length) {
        const actualCall = toolCalls[actualIndex];
        if (actualCall.name === expectedTool) {
          if (argsMatch(expectedItem.args, actualCall.args)) {
            hits.push(`Found ${expectedTool} at position ${actualIndex}`);
            actualIndex++;
            found = true;
            break;
          }
          misses.push(
            `Expected ${expectedTool} at position ${i}: tool found at ${actualIndex} but args mismatch`
          );
          actualIndex++;
          argsMismatch = true;
          break;
        }
        actualIndex++;
      }
      if (!found && !argsMismatch) {
        misses.push(`Expected ${expectedTool} at position ${i}, not found in remaining trace`);
      }
    }
    const score = hits.length / expected.length;
    return {
      score,
      verdict: scoreToVerdict(score),
      hits,
      misses,
      expectedAspectCount: expected.length
    };
  }

  /**
   * Checks the calls position by position against the expected sequence.
   *
   * A length difference is recorded as a miss. The score is normalised by
   * `expected.length` only, so surplus actual calls are reported but do not
   * lower the score.
   *
   * @param toolCalls List of `{ name, args }` in call order.
   * @returns Score equal to matching positions divided by expected items.
   */
  evaluateExact(toolCalls) {
    const expected = this.config.expected ?? [];
    if (expected.length === 0) {
      return {
        score: 1,
        verdict: "pass",
        hits: ["No tool sequence specified"],
        misses: [],
        expectedAspectCount: 0
      };
    }
    const hits = [];
    const misses = [];
    if (toolCalls.length !== expected.length) {
      misses.push(`Expected ${expected.length} tool calls, got ${toolCalls.length}`);
    }
    const checkLength = Math.min(expected.length, toolCalls.length);
    for (let i = 0; i < checkLength; i++) {
      const expectedItem = expected[i];
      const expectedTool = expectedItem.tool;
      const actualCall = toolCalls[i];
      const actualTool = actualCall.name;
      if (actualTool === expectedTool) {
        if (argsMatch(expectedItem.args, actualCall.args)) {
          hits.push(`Position ${i}: ${expectedTool}`);
        } else {
          misses.push(`Position ${i}: ${expectedTool} args mismatch`);
        }
      } else {
        misses.push(`Position ${i}: expected ${expectedTool}, got ${actualTool}`);
      }
    }
    // Expected items beyond the end of the actual call list.
    for (let i = checkLength; i < expected.length; i++) {
      misses.push(`Position ${i}: expected ${expected[i].tool}, got nothing`);
    }
    const score = hits.length / expected.length;
    return {
      score,
      verdict: scoreToVerdict(score),
      hits,
      misses,
      expectedAspectCount: expected.length
    };
  }
};
|
|
8246
|
+
|
|
7772
8247
|
// src/evaluation/orchestrator.ts
|
|
7773
|
-
var
|
|
8248
|
+
var import_node_crypto5 = require("crypto");
|
|
7774
8249
|
var import_node_path16 = __toESM(require("path"), 1);
|
|
7775
8250
|
|
|
7776
8251
|
// ../../node_modules/.bun/yocto-queue@1.2.2/node_modules/yocto-queue/index.js
|
|
@@ -7982,6 +8457,17 @@ async function runEvaluation(options) {
|
|
|
7982
8457
|
}
|
|
7983
8458
|
return getOrCreateProvider(resolvedJudge);
|
|
7984
8459
|
};
|
|
8460
|
+
const targetResolver = (name) => {
|
|
8461
|
+
const resolved = resolveTargetByName(name);
|
|
8462
|
+
if (!resolved) {
|
|
8463
|
+
return void 0;
|
|
8464
|
+
}
|
|
8465
|
+
return getOrCreateProvider(resolved);
|
|
8466
|
+
};
|
|
8467
|
+
const availableTargets = [
|
|
8468
|
+
target.name,
|
|
8469
|
+
...Array.from(targetDefinitions.keys())
|
|
8470
|
+
];
|
|
7985
8471
|
const evaluatorRegistry = buildEvaluatorRegistry(evaluators, resolveJudgeProvider);
|
|
7986
8472
|
const primaryProvider = getOrCreateProvider(target);
|
|
7987
8473
|
const providerSupportsBatch = target.providerBatching === true && primaryProvider.supportsBatch === true && typeof primaryProvider.invokeBatch === "function";
|
|
@@ -8011,7 +8497,9 @@ async function runEvaluation(options) {
|
|
|
8011
8497
|
onResult,
|
|
8012
8498
|
verbose,
|
|
8013
8499
|
resolveJudgeProvider,
|
|
8014
|
-
agentTimeoutMs
|
|
8500
|
+
agentTimeoutMs,
|
|
8501
|
+
targetResolver,
|
|
8502
|
+
availableTargets
|
|
8015
8503
|
});
|
|
8016
8504
|
} catch (error) {
|
|
8017
8505
|
if (verbose) {
|
|
@@ -8050,7 +8538,9 @@ async function runEvaluation(options) {
|
|
|
8050
8538
|
cache,
|
|
8051
8539
|
useCache,
|
|
8052
8540
|
now,
|
|
8053
|
-
judgeProvider
|
|
8541
|
+
judgeProvider,
|
|
8542
|
+
targetResolver,
|
|
8543
|
+
availableTargets
|
|
8054
8544
|
});
|
|
8055
8545
|
if (onProgress) {
|
|
8056
8546
|
await onProgress({
|
|
@@ -8117,7 +8607,9 @@ async function runBatchEvaluation(options) {
|
|
|
8117
8607
|
onProgress,
|
|
8118
8608
|
onResult,
|
|
8119
8609
|
resolveJudgeProvider,
|
|
8120
|
-
agentTimeoutMs
|
|
8610
|
+
agentTimeoutMs,
|
|
8611
|
+
targetResolver,
|
|
8612
|
+
availableTargets
|
|
8121
8613
|
} = options;
|
|
8122
8614
|
const promptInputsList = [];
|
|
8123
8615
|
const formattingMode = usesFileReferencePrompt(provider) ? "agent" : "lm";
|
|
@@ -8176,7 +8668,7 @@ async function runBatchEvaluation(options) {
|
|
|
8176
8668
|
costUsd: providerResponse.costUsd,
|
|
8177
8669
|
durationMs: providerResponse.durationMs
|
|
8178
8670
|
}) : void 0;
|
|
8179
|
-
const candidate =
|
|
8671
|
+
const candidate = extractLastAssistantContent2(outputMessages);
|
|
8180
8672
|
const providerError = extractProviderError(providerResponse);
|
|
8181
8673
|
let result;
|
|
8182
8674
|
try {
|
|
@@ -8192,7 +8684,9 @@ async function runBatchEvaluation(options) {
|
|
|
8192
8684
|
judgeProvider: await resolveJudgeProvider(target),
|
|
8193
8685
|
agentTimeoutMs,
|
|
8194
8686
|
outputMessages,
|
|
8195
|
-
traceSummary
|
|
8687
|
+
traceSummary,
|
|
8688
|
+
targetResolver,
|
|
8689
|
+
availableTargets
|
|
8196
8690
|
});
|
|
8197
8691
|
if (providerError) {
|
|
8198
8692
|
result = { ...result, error: providerError };
|
|
@@ -8250,7 +8744,9 @@ async function runEvalCase(options) {
|
|
|
8250
8744
|
cache,
|
|
8251
8745
|
useCache,
|
|
8252
8746
|
signal,
|
|
8253
|
-
judgeProvider
|
|
8747
|
+
judgeProvider,
|
|
8748
|
+
targetResolver,
|
|
8749
|
+
availableTargets
|
|
8254
8750
|
} = options;
|
|
8255
8751
|
const formattingMode = usesFileReferencePrompt(provider) ? "agent" : "lm";
|
|
8256
8752
|
const promptInputs = await buildPromptInputs(evalCase, formattingMode);
|
|
@@ -8309,7 +8805,7 @@ async function runEvalCase(options) {
|
|
|
8309
8805
|
costUsd: providerResponse.costUsd,
|
|
8310
8806
|
durationMs: providerResponse.durationMs
|
|
8311
8807
|
}) : void 0;
|
|
8312
|
-
const candidate =
|
|
8808
|
+
const candidate = extractLastAssistantContent2(outputMessages);
|
|
8313
8809
|
const providerError = extractProviderError(providerResponse);
|
|
8314
8810
|
try {
|
|
8315
8811
|
const result = await evaluateCandidate({
|
|
@@ -8324,7 +8820,9 @@ async function runEvalCase(options) {
|
|
|
8324
8820
|
judgeProvider,
|
|
8325
8821
|
agentTimeoutMs,
|
|
8326
8822
|
outputMessages,
|
|
8327
|
-
traceSummary
|
|
8823
|
+
traceSummary,
|
|
8824
|
+
targetResolver,
|
|
8825
|
+
availableTargets
|
|
8328
8826
|
});
|
|
8329
8827
|
return providerError ? { ...result, error: providerError } : result;
|
|
8330
8828
|
} catch (error) {
|
|
@@ -8344,7 +8842,9 @@ async function evaluateCandidate(options) {
|
|
|
8344
8842
|
judgeProvider,
|
|
8345
8843
|
agentTimeoutMs,
|
|
8346
8844
|
outputMessages,
|
|
8347
|
-
traceSummary
|
|
8845
|
+
traceSummary,
|
|
8846
|
+
targetResolver,
|
|
8847
|
+
availableTargets
|
|
8348
8848
|
} = options;
|
|
8349
8849
|
const gradeTimestamp = nowFn();
|
|
8350
8850
|
const { score, evaluatorResults } = await runEvaluatorsForCase({
|
|
@@ -8359,7 +8859,9 @@ async function evaluateCandidate(options) {
|
|
|
8359
8859
|
judgeProvider,
|
|
8360
8860
|
agentTimeoutMs,
|
|
8361
8861
|
outputMessages,
|
|
8362
|
-
traceSummary
|
|
8862
|
+
traceSummary,
|
|
8863
|
+
targetResolver,
|
|
8864
|
+
availableTargets
|
|
8363
8865
|
});
|
|
8364
8866
|
const completedAt = nowFn();
|
|
8365
8867
|
let agentProviderRequest;
|
|
@@ -8412,7 +8914,9 @@ async function runEvaluatorsForCase(options) {
|
|
|
8412
8914
|
judgeProvider,
|
|
8413
8915
|
agentTimeoutMs,
|
|
8414
8916
|
outputMessages,
|
|
8415
|
-
traceSummary
|
|
8917
|
+
traceSummary,
|
|
8918
|
+
targetResolver,
|
|
8919
|
+
availableTargets
|
|
8416
8920
|
} = options;
|
|
8417
8921
|
if (evalCase.evaluators && evalCase.evaluators.length > 0) {
|
|
8418
8922
|
return runEvaluatorList({
|
|
@@ -8428,7 +8932,9 @@ async function runEvaluatorsForCase(options) {
|
|
|
8428
8932
|
judgeProvider,
|
|
8429
8933
|
agentTimeoutMs,
|
|
8430
8934
|
outputMessages,
|
|
8431
|
-
traceSummary
|
|
8935
|
+
traceSummary,
|
|
8936
|
+
targetResolver,
|
|
8937
|
+
availableTargets
|
|
8432
8938
|
});
|
|
8433
8939
|
}
|
|
8434
8940
|
const evaluatorKind = evalCase.evaluator ?? "llm_judge";
|
|
@@ -8446,7 +8952,9 @@ async function runEvaluatorsForCase(options) {
|
|
|
8446
8952
|
now,
|
|
8447
8953
|
judgeProvider,
|
|
8448
8954
|
outputMessages,
|
|
8449
|
-
traceSummary
|
|
8955
|
+
traceSummary,
|
|
8956
|
+
targetResolver,
|
|
8957
|
+
availableTargets
|
|
8450
8958
|
});
|
|
8451
8959
|
return { score };
|
|
8452
8960
|
}
|
|
@@ -8464,7 +8972,9 @@ async function runEvaluatorList(options) {
|
|
|
8464
8972
|
judgeProvider,
|
|
8465
8973
|
agentTimeoutMs,
|
|
8466
8974
|
outputMessages,
|
|
8467
|
-
traceSummary
|
|
8975
|
+
traceSummary,
|
|
8976
|
+
targetResolver,
|
|
8977
|
+
availableTargets
|
|
8468
8978
|
} = options;
|
|
8469
8979
|
const scored = [];
|
|
8470
8980
|
const evaluatorResults = [];
|
|
@@ -8502,7 +9012,8 @@ async function runEvaluatorList(options) {
|
|
|
8502
9012
|
script: evaluator.script,
|
|
8503
9013
|
cwd: evaluator.resolvedCwd ?? evaluator.cwd,
|
|
8504
9014
|
agentTimeoutMs,
|
|
8505
|
-
config: evaluator.config
|
|
9015
|
+
config: evaluator.config,
|
|
9016
|
+
target: evaluator.target
|
|
8506
9017
|
});
|
|
8507
9018
|
const score2 = await codeEvaluator.evaluate({
|
|
8508
9019
|
evalCase,
|
|
@@ -8512,8 +9023,11 @@ async function runEvaluatorList(options) {
|
|
|
8512
9023
|
attempt,
|
|
8513
9024
|
promptInputs,
|
|
8514
9025
|
now,
|
|
9026
|
+
judgeProvider,
|
|
8515
9027
|
outputMessages,
|
|
8516
|
-
traceSummary
|
|
9028
|
+
traceSummary,
|
|
9029
|
+
targetResolver,
|
|
9030
|
+
availableTargets
|
|
8517
9031
|
});
|
|
8518
9032
|
const weight = evaluator.weight ?? 1;
|
|
8519
9033
|
scored.push({ score: score2, name: evaluator.name, type: "code_judge", weight });
|
|
@@ -8526,7 +9040,8 @@ async function runEvaluatorList(options) {
|
|
|
8526
9040
|
hits: score2.hits,
|
|
8527
9041
|
misses: score2.misses,
|
|
8528
9042
|
reasoning: score2.reasoning,
|
|
8529
|
-
evaluatorProviderRequest: score2.evaluatorRawRequest
|
|
9043
|
+
evaluatorProviderRequest: score2.evaluatorRawRequest,
|
|
9044
|
+
details: score2.details
|
|
8530
9045
|
});
|
|
8531
9046
|
}
|
|
8532
9047
|
if (evaluator.type === "composite") {
|
|
@@ -8540,7 +9055,8 @@ async function runEvaluatorList(options) {
|
|
|
8540
9055
|
script: memberConfig.script,
|
|
8541
9056
|
cwd: memberConfig.resolvedCwd ?? memberConfig.cwd,
|
|
8542
9057
|
agentTimeoutMs,
|
|
8543
|
-
config: memberConfig.config
|
|
9058
|
+
config: memberConfig.config,
|
|
9059
|
+
target: memberConfig.target
|
|
8544
9060
|
});
|
|
8545
9061
|
case "composite":
|
|
8546
9062
|
return new CompositeEvaluator({
|
|
@@ -8589,7 +9105,9 @@ async function runEvaluatorList(options) {
|
|
|
8589
9105
|
now,
|
|
8590
9106
|
judgeProvider,
|
|
8591
9107
|
outputMessages,
|
|
8592
|
-
traceSummary
|
|
9108
|
+
traceSummary,
|
|
9109
|
+
targetResolver,
|
|
9110
|
+
availableTargets
|
|
8593
9111
|
});
|
|
8594
9112
|
const weight = evaluator.weight ?? 1;
|
|
8595
9113
|
scored.push({ score: score2, name: evaluator.name, type: evaluator.type, weight });
|
|
@@ -8785,11 +9303,11 @@ async function runEvaluatorList(options) {
|
|
|
8785
9303
|
(total, entry) => total + (entry.score.expectedAspectCount ?? 0),
|
|
8786
9304
|
0
|
|
8787
9305
|
);
|
|
8788
|
-
const reasoningParts = scored.map((entry) => entry.score.reasoning ? `${entry.name}: ${entry.score.reasoning}` : void 0).filter(
|
|
9306
|
+
const reasoningParts = scored.map((entry) => entry.score.reasoning ? `${entry.name}: ${entry.score.reasoning}` : void 0).filter(isNonEmptyString);
|
|
8789
9307
|
const reasoning = reasoningParts.length > 0 ? reasoningParts.join(" | ") : void 0;
|
|
8790
9308
|
const score = {
|
|
8791
9309
|
score: aggregateScore,
|
|
8792
|
-
verdict:
|
|
9310
|
+
verdict: scoreToVerdict(aggregateScore),
|
|
8793
9311
|
hits,
|
|
8794
9312
|
misses,
|
|
8795
9313
|
expectedAspectCount,
|
|
@@ -8836,18 +9354,6 @@ async function resolveCustomPrompt(config) {
|
|
|
8836
9354
|
}
|
|
8837
9355
|
return config.prompt;
|
|
8838
9356
|
}
|
|
8839
|
-
function isNonEmptyString2(value) {
|
|
8840
|
-
return typeof value === "string" && value.trim().length > 0;
|
|
8841
|
-
}
|
|
8842
|
-
function scoreToVerdict2(score) {
|
|
8843
|
-
if (score >= 0.8) {
|
|
8844
|
-
return "pass";
|
|
8845
|
-
}
|
|
8846
|
-
if (score >= 0.6) {
|
|
8847
|
-
return "borderline";
|
|
8848
|
-
}
|
|
8849
|
-
return "fail";
|
|
8850
|
-
}
|
|
8851
9357
|
function filterEvalCases(evalCases, evalId) {
|
|
8852
9358
|
if (!evalId) {
|
|
8853
9359
|
return evalCases;
|
|
@@ -8949,7 +9455,7 @@ function extractProviderError(response) {
|
|
|
8949
9455
|
return trimmed.length > 0 ? trimmed : void 0;
|
|
8950
9456
|
}
|
|
8951
9457
|
function createCacheKey(provider, target, evalCase, promptInputs) {
|
|
8952
|
-
const hash = (0,
|
|
9458
|
+
const hash = (0, import_node_crypto5.createHash)("sha256");
|
|
8953
9459
|
hash.update(provider.id);
|
|
8954
9460
|
hash.update(target.name);
|
|
8955
9461
|
hash.update(evalCase.id);
|
|
@@ -8990,7 +9496,8 @@ function mapChildResults(children) {
|
|
|
8990
9496
|
misses: child.misses,
|
|
8991
9497
|
reasoning: child.reasoning,
|
|
8992
9498
|
evaluatorProviderRequest: child.evaluatorRawRequest,
|
|
8993
|
-
evaluatorResults: mapChildResults(child.evaluatorResults)
|
|
9499
|
+
evaluatorResults: mapChildResults(child.evaluatorResults),
|
|
9500
|
+
details: child.details
|
|
8994
9501
|
}));
|
|
8995
9502
|
}
|
|
8996
9503
|
function computeWeightedMean(entries) {
|
|
@@ -9005,7 +9512,7 @@ function computeWeightedMean(entries) {
|
|
|
9005
9512
|
}
|
|
9006
9513
|
|
|
9007
9514
|
// src/evaluation/generators/rubric-generator.ts
|
|
9008
|
-
var
|
|
9515
|
+
var import_ai4 = require("ai");
|
|
9009
9516
|
var import_zod4 = require("zod");
|
|
9010
9517
|
var rubricItemSchema = import_zod4.z.object({
|
|
9011
9518
|
id: import_zod4.z.string().describe("Short identifier for this rubric (e.g., clarity, completeness)"),
|
|
@@ -9039,7 +9546,7 @@ You must return a valid JSON object matching this schema:
|
|
|
9039
9546
|
let lastError;
|
|
9040
9547
|
for (let attempt = 1; attempt <= 3; attempt++) {
|
|
9041
9548
|
try {
|
|
9042
|
-
const { text } = await (0,
|
|
9549
|
+
const { text } = await (0, import_ai4.generateText)({
|
|
9043
9550
|
model,
|
|
9044
9551
|
system,
|
|
9045
9552
|
prompt
|
|
@@ -9084,17 +9591,6 @@ function buildPrompt(expectedOutcome, question, referenceAnswer) {
|
|
|
9084
9591
|
return parts.join("\n");
|
|
9085
9592
|
}
|
|
9086
9593
|
|
|
9087
|
-
// src/evaluation/code-judge-sdk.ts
|
|
9088
|
-
var import_node_fs7 = require("fs");
|
|
9089
|
-
function parseCodeJudgePayload(payload) {
|
|
9090
|
-
const parsed = JSON.parse(payload);
|
|
9091
|
-
return toCamelCaseDeep(parsed);
|
|
9092
|
-
}
|
|
9093
|
-
function readCodeJudgePayload() {
|
|
9094
|
-
const stdin = (0, import_node_fs7.readFileSync)(0, "utf8");
|
|
9095
|
-
return parseCodeJudgePayload(stdin);
|
|
9096
|
-
}
|
|
9097
|
-
|
|
9098
9594
|
// src/index.ts
|
|
9099
9595
|
function createAgentKernel() {
|
|
9100
9596
|
return { status: "stub" };
|
|
@@ -9113,33 +9609,39 @@ function createAgentKernel() {
|
|
|
9113
9609
|
ToolTrajectoryEvaluator,
|
|
9114
9610
|
avgToolDurationMs,
|
|
9115
9611
|
buildDirectoryChain,
|
|
9612
|
+
buildOutputSchema,
|
|
9116
9613
|
buildPromptInputs,
|
|
9117
9614
|
buildSearchRoots,
|
|
9615
|
+
clampScore,
|
|
9118
9616
|
computeTraceSummary,
|
|
9119
9617
|
consumeClaudeCodeLogEntries,
|
|
9120
9618
|
consumeCodexLogEntries,
|
|
9121
9619
|
consumePiLogEntries,
|
|
9122
9620
|
createAgentKernel,
|
|
9123
9621
|
createProvider,
|
|
9622
|
+
deepEqual,
|
|
9124
9623
|
ensureVSCodeSubagents,
|
|
9624
|
+
executeScript,
|
|
9125
9625
|
explorationRatio,
|
|
9126
|
-
|
|
9626
|
+
extractJsonBlob,
|
|
9127
9627
|
fileExists,
|
|
9128
9628
|
findGitRoot,
|
|
9629
|
+
freeformEvaluationSchema,
|
|
9129
9630
|
generateRubrics,
|
|
9130
9631
|
getHitCount,
|
|
9131
9632
|
isEvaluatorKind,
|
|
9132
9633
|
isGuidelineFile,
|
|
9133
9634
|
isJsonObject,
|
|
9134
9635
|
isJsonValue,
|
|
9636
|
+
isNonEmptyString,
|
|
9135
9637
|
isTestMessage,
|
|
9136
9638
|
isTestMessageRole,
|
|
9137
9639
|
listTargetNames,
|
|
9138
9640
|
loadEvalCases,
|
|
9139
9641
|
mergeExecutionMetrics,
|
|
9140
9642
|
normalizeLineEndings,
|
|
9141
|
-
|
|
9142
|
-
|
|
9643
|
+
parseJsonFromText,
|
|
9644
|
+
parseJsonSafe,
|
|
9143
9645
|
readJsonFile,
|
|
9144
9646
|
readTargetDefinitions,
|
|
9145
9647
|
readTestSuiteMetadata,
|
|
@@ -9149,6 +9651,7 @@ function createAgentKernel() {
|
|
|
9149
9651
|
resolveTargetDefinition,
|
|
9150
9652
|
runEvalCase,
|
|
9151
9653
|
runEvaluation,
|
|
9654
|
+
scoreToVerdict,
|
|
9152
9655
|
subscribeToClaudeCodeLogEntries,
|
|
9153
9656
|
subscribeToCodexLogEntries,
|
|
9154
9657
|
subscribeToPiLogEntries,
|