@agentv/core 2.0.2 → 2.1.1
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/dist/evaluation/validation/index.cjs +0 -11
- package/dist/evaluation/validation/index.cjs.map +1 -1
- package/dist/evaluation/validation/index.js +0 -11
- package/dist/evaluation/validation/index.js.map +1 -1
- package/dist/index.cjs +1336 -1007
- package/dist/index.cjs.map +1 -1
- package/dist/index.d.cts +142 -71
- package/dist/index.d.ts +142 -71
- package/dist/index.js +1293 -973
- package/dist/index.js.map +1 -1
- package/package.json +2 -2
package/dist/index.cjs
CHANGED
|
@@ -42,31 +42,39 @@ __export(index_exports, {
|
|
|
42
42
|
ToolTrajectoryEvaluator: () => ToolTrajectoryEvaluator,
|
|
43
43
|
avgToolDurationMs: () => avgToolDurationMs,
|
|
44
44
|
buildDirectoryChain: () => buildDirectoryChain2,
|
|
45
|
+
buildOutputSchema: () => buildOutputSchema,
|
|
45
46
|
buildPromptInputs: () => buildPromptInputs,
|
|
46
47
|
buildSearchRoots: () => buildSearchRoots2,
|
|
48
|
+
clampScore: () => clampScore,
|
|
47
49
|
computeTraceSummary: () => computeTraceSummary,
|
|
48
50
|
consumeClaudeCodeLogEntries: () => consumeClaudeCodeLogEntries,
|
|
49
51
|
consumeCodexLogEntries: () => consumeCodexLogEntries,
|
|
50
52
|
consumePiLogEntries: () => consumePiLogEntries,
|
|
51
53
|
createAgentKernel: () => createAgentKernel,
|
|
52
54
|
createProvider: () => createProvider,
|
|
55
|
+
deepEqual: () => deepEqual,
|
|
53
56
|
ensureVSCodeSubagents: () => ensureVSCodeSubagents,
|
|
57
|
+
executeScript: () => executeScript,
|
|
54
58
|
explorationRatio: () => explorationRatio,
|
|
55
|
-
|
|
59
|
+
extractJsonBlob: () => extractJsonBlob,
|
|
56
60
|
fileExists: () => fileExists2,
|
|
57
61
|
findGitRoot: () => findGitRoot,
|
|
62
|
+
freeformEvaluationSchema: () => freeformEvaluationSchema,
|
|
58
63
|
generateRubrics: () => generateRubrics,
|
|
59
64
|
getHitCount: () => getHitCount,
|
|
60
65
|
isEvaluatorKind: () => isEvaluatorKind,
|
|
61
66
|
isGuidelineFile: () => isGuidelineFile,
|
|
62
67
|
isJsonObject: () => isJsonObject,
|
|
63
68
|
isJsonValue: () => isJsonValue,
|
|
69
|
+
isNonEmptyString: () => isNonEmptyString,
|
|
64
70
|
isTestMessage: () => isTestMessage,
|
|
65
71
|
isTestMessageRole: () => isTestMessageRole,
|
|
66
72
|
listTargetNames: () => listTargetNames,
|
|
67
73
|
loadEvalCases: () => loadEvalCases,
|
|
68
74
|
mergeExecutionMetrics: () => mergeExecutionMetrics,
|
|
69
75
|
normalizeLineEndings: () => normalizeLineEndings,
|
|
76
|
+
parseJsonFromText: () => parseJsonFromText,
|
|
77
|
+
parseJsonSafe: () => parseJsonSafe,
|
|
70
78
|
readJsonFile: () => readJsonFile,
|
|
71
79
|
readTargetDefinitions: () => readTargetDefinitions,
|
|
72
80
|
readTestSuiteMetadata: () => readTestSuiteMetadata,
|
|
@@ -76,6 +84,7 @@ __export(index_exports, {
|
|
|
76
84
|
resolveTargetDefinition: () => resolveTargetDefinition,
|
|
77
85
|
runEvalCase: () => runEvalCase,
|
|
78
86
|
runEvaluation: () => runEvaluation,
|
|
87
|
+
scoreToVerdict: () => scoreToVerdict,
|
|
79
88
|
subscribeToClaudeCodeLogEntries: () => subscribeToClaudeCodeLogEntries,
|
|
80
89
|
subscribeToCodexLogEntries: () => subscribeToCodexLogEntries,
|
|
81
90
|
subscribeToPiLogEntries: () => subscribeToPiLogEntries,
|
|
@@ -221,85 +230,6 @@ var import_promises6 = require("fs/promises");
|
|
|
221
230
|
var import_node_path6 = __toESM(require("path"), 1);
|
|
222
231
|
var import_yaml2 = require("yaml");
|
|
223
232
|
|
|
224
|
-
// src/evaluation/formatting/segment-formatter.ts
|
|
225
|
-
function extractCodeBlocks(segments) {
|
|
226
|
-
const CODE_BLOCK_PATTERN = /```[\s\S]*?```/g;
|
|
227
|
-
const codeBlocks = [];
|
|
228
|
-
for (const segment of segments) {
|
|
229
|
-
const typeValue = segment.type;
|
|
230
|
-
if (typeof typeValue !== "string" || typeValue !== "text") {
|
|
231
|
-
continue;
|
|
232
|
-
}
|
|
233
|
-
const textValue = segment.value;
|
|
234
|
-
if (typeof textValue !== "string") {
|
|
235
|
-
continue;
|
|
236
|
-
}
|
|
237
|
-
const matches = textValue.match(CODE_BLOCK_PATTERN);
|
|
238
|
-
if (matches) {
|
|
239
|
-
codeBlocks.push(...matches);
|
|
240
|
-
}
|
|
241
|
-
}
|
|
242
|
-
return codeBlocks;
|
|
243
|
-
}
|
|
244
|
-
function formatFileContents(parts) {
|
|
245
|
-
const fileCount = parts.filter((p) => p.isFile).length;
|
|
246
|
-
if (fileCount > 0) {
|
|
247
|
-
return parts.map((part) => {
|
|
248
|
-
if (part.isFile && part.displayPath) {
|
|
249
|
-
return `<file path="${part.displayPath}">
|
|
250
|
-
${part.content}
|
|
251
|
-
</file>`;
|
|
252
|
-
}
|
|
253
|
-
return part.content;
|
|
254
|
-
}).join("\n\n");
|
|
255
|
-
}
|
|
256
|
-
return parts.map((p) => p.content).join(" ");
|
|
257
|
-
}
|
|
258
|
-
function formatSegment(segment, mode = "lm") {
|
|
259
|
-
const type = asString(segment.type);
|
|
260
|
-
if (type === "text") {
|
|
261
|
-
return asString(segment.value);
|
|
262
|
-
}
|
|
263
|
-
if (type === "guideline_ref") {
|
|
264
|
-
const refPath = asString(segment.path);
|
|
265
|
-
return refPath ? `<Attached: ${refPath}>` : void 0;
|
|
266
|
-
}
|
|
267
|
-
if (type === "file") {
|
|
268
|
-
const filePath = asString(segment.path);
|
|
269
|
-
if (!filePath) {
|
|
270
|
-
return void 0;
|
|
271
|
-
}
|
|
272
|
-
if (mode === "agent") {
|
|
273
|
-
return `<file: path="${filePath}">`;
|
|
274
|
-
}
|
|
275
|
-
const text = asString(segment.text);
|
|
276
|
-
if (text && filePath) {
|
|
277
|
-
return formatFileContents([{ content: text.trim(), isFile: true, displayPath: filePath }]);
|
|
278
|
-
}
|
|
279
|
-
}
|
|
280
|
-
return void 0;
|
|
281
|
-
}
|
|
282
|
-
function hasVisibleContent(segments) {
|
|
283
|
-
return segments.some((segment) => {
|
|
284
|
-
const type = asString(segment.type);
|
|
285
|
-
if (type === "text") {
|
|
286
|
-
const value = asString(segment.value);
|
|
287
|
-
return value !== void 0 && value.trim().length > 0;
|
|
288
|
-
}
|
|
289
|
-
if (type === "guideline_ref") {
|
|
290
|
-
return false;
|
|
291
|
-
}
|
|
292
|
-
if (type === "file") {
|
|
293
|
-
const text = asString(segment.text);
|
|
294
|
-
return text !== void 0 && text.trim().length > 0;
|
|
295
|
-
}
|
|
296
|
-
return false;
|
|
297
|
-
});
|
|
298
|
-
}
|
|
299
|
-
function asString(value) {
|
|
300
|
-
return typeof value === "string" ? value : void 0;
|
|
301
|
-
}
|
|
302
|
-
|
|
303
233
|
// src/evaluation/loaders/config-loader.ts
|
|
304
234
|
var import_promises2 = require("fs/promises");
|
|
305
235
|
var import_node_path2 = __toESM(require("path"), 1);
|
|
@@ -407,7 +337,6 @@ async function resolveFileReference(rawValue, searchRoots) {
|
|
|
407
337
|
}
|
|
408
338
|
|
|
409
339
|
// src/evaluation/loaders/config-loader.ts
|
|
410
|
-
var SCHEMA_CONFIG_V2 = "agentv-config-v2";
|
|
411
340
|
var ANSI_YELLOW = "\x1B[33m";
|
|
412
341
|
var ANSI_RESET = "\x1B[0m";
|
|
413
342
|
async function loadConfig(evalFilePath, repoRoot) {
|
|
@@ -425,13 +354,6 @@ async function loadConfig(evalFilePath, repoRoot) {
|
|
|
425
354
|
continue;
|
|
426
355
|
}
|
|
427
356
|
const config = parsed;
|
|
428
|
-
const schema = config.$schema;
|
|
429
|
-
if (schema !== SCHEMA_CONFIG_V2) {
|
|
430
|
-
const message = typeof schema === "string" ? `Invalid $schema value '${schema}' in ${configPath}. Expected '${SCHEMA_CONFIG_V2}'` : `Missing required field '$schema' in ${configPath}.
|
|
431
|
-
Please add '$schema: ${SCHEMA_CONFIG_V2}' at the top of the file.`;
|
|
432
|
-
logWarning(message);
|
|
433
|
-
continue;
|
|
434
|
-
}
|
|
435
357
|
const guidelinePatterns = config.guideline_patterns;
|
|
436
358
|
if (guidelinePatterns !== void 0 && !Array.isArray(guidelinePatterns)) {
|
|
437
359
|
logWarning(`Invalid guideline_patterns in ${configPath}, expected array`);
|
|
@@ -540,7 +462,8 @@ var ANSI_YELLOW3 = "\x1B[33m";
|
|
|
540
462
|
var ANSI_RESET3 = "\x1B[0m";
|
|
541
463
|
async function parseEvaluators(rawEvalCase, globalExecution, searchRoots, evalId) {
|
|
542
464
|
const execution = rawEvalCase.execution;
|
|
543
|
-
const
|
|
465
|
+
const executionObject = isJsonObject2(execution) ? execution : void 0;
|
|
466
|
+
const candidateEvaluators = (executionObject ? executionObject.evaluators : void 0) ?? rawEvalCase.evaluators ?? globalExecution?.evaluators;
|
|
544
467
|
if (candidateEvaluators === void 0) {
|
|
545
468
|
return void 0;
|
|
546
469
|
}
|
|
@@ -554,7 +477,7 @@ async function parseEvaluators(rawEvalCase, globalExecution, searchRoots, evalId
|
|
|
554
477
|
logWarning2(`Skipping invalid evaluator entry for '${evalId}' (expected object)`);
|
|
555
478
|
continue;
|
|
556
479
|
}
|
|
557
|
-
const name =
|
|
480
|
+
const name = asString(rawEvaluator.name);
|
|
558
481
|
const typeValue = rawEvaluator.type;
|
|
559
482
|
if (!name || !isEvaluatorKind(typeValue)) {
|
|
560
483
|
logWarning2(`Skipping evaluator with invalid name/type in '${evalId}'`);
|
|
@@ -582,7 +505,7 @@ async function parseEvaluators(rawEvalCase, globalExecution, searchRoots, evalId
|
|
|
582
505
|
continue;
|
|
583
506
|
}
|
|
584
507
|
const weight2 = validateWeight(rawEvaluator.weight, name, evalId);
|
|
585
|
-
const cwd =
|
|
508
|
+
const cwd = asString(rawEvaluator.cwd);
|
|
586
509
|
let resolvedCwd;
|
|
587
510
|
if (cwd) {
|
|
588
511
|
const resolved = await resolveFileReference(cwd, searchRoots);
|
|
@@ -597,7 +520,29 @@ async function parseEvaluators(rawEvalCase, globalExecution, searchRoots, evalId
|
|
|
597
520
|
} else {
|
|
598
521
|
resolvedCwd = searchRoots[0];
|
|
599
522
|
}
|
|
600
|
-
const
|
|
523
|
+
const rawTarget = rawEvaluator.target;
|
|
524
|
+
let targetConfig;
|
|
525
|
+
if (rawTarget !== void 0) {
|
|
526
|
+
if (isJsonObject2(rawTarget)) {
|
|
527
|
+
const maxCalls = rawTarget.max_calls;
|
|
528
|
+
if (maxCalls !== void 0 && (typeof maxCalls !== "number" || maxCalls < 0)) {
|
|
529
|
+
logWarning2(
|
|
530
|
+
`Invalid target.max_calls for evaluator '${name}' in '${evalId}': must be a non-negative number`
|
|
531
|
+
);
|
|
532
|
+
} else {
|
|
533
|
+
targetConfig = {
|
|
534
|
+
...typeof maxCalls === "number" ? { max_calls: maxCalls } : {}
|
|
535
|
+
};
|
|
536
|
+
}
|
|
537
|
+
} else if (rawTarget === true) {
|
|
538
|
+
targetConfig = {};
|
|
539
|
+
} else {
|
|
540
|
+
logWarning2(
|
|
541
|
+
`Invalid target config for evaluator '${name}' in '${evalId}': expected object or true`
|
|
542
|
+
);
|
|
543
|
+
}
|
|
544
|
+
}
|
|
545
|
+
const knownProps = /* @__PURE__ */ new Set(["name", "type", "script", "cwd", "weight", "target"]);
|
|
601
546
|
const config = {};
|
|
602
547
|
for (const [key, value] of Object.entries(rawEvaluator)) {
|
|
603
548
|
if (!knownProps.has(key) && value !== void 0) {
|
|
@@ -611,7 +556,8 @@ async function parseEvaluators(rawEvalCase, globalExecution, searchRoots, evalId
|
|
|
611
556
|
cwd,
|
|
612
557
|
resolvedCwd,
|
|
613
558
|
...weight2 !== void 0 ? { weight: weight2 } : {},
|
|
614
|
-
...Object.keys(config).length > 0 ? { config } : {}
|
|
559
|
+
...Object.keys(config).length > 0 ? { config } : {},
|
|
560
|
+
...targetConfig !== void 0 ? { target: targetConfig } : {}
|
|
615
561
|
});
|
|
616
562
|
continue;
|
|
617
563
|
}
|
|
@@ -628,7 +574,7 @@ async function parseEvaluators(rawEvalCase, globalExecution, searchRoots, evalId
|
|
|
628
574
|
logWarning2(`Skipping composite evaluator '${name}' in '${evalId}': missing aggregator`);
|
|
629
575
|
continue;
|
|
630
576
|
}
|
|
631
|
-
const aggregatorType =
|
|
577
|
+
const aggregatorType = asString(rawAggregator.type);
|
|
632
578
|
if (aggregatorType !== "weighted_average" && aggregatorType !== "code_judge" && aggregatorType !== "llm_judge") {
|
|
633
579
|
logWarning2(
|
|
634
580
|
`Skipping composite evaluator '${name}' in '${evalId}': invalid aggregator type '${aggregatorType}'`
|
|
@@ -641,7 +587,7 @@ async function parseEvaluators(rawEvalCase, globalExecution, searchRoots, evalId
|
|
|
641
587
|
logWarning2(`Skipping invalid member evaluator in composite '${name}' (expected object)`);
|
|
642
588
|
continue;
|
|
643
589
|
}
|
|
644
|
-
const memberName =
|
|
590
|
+
const memberName = asString(rawMember.name);
|
|
645
591
|
const memberType = rawMember.type;
|
|
646
592
|
if (!memberName || !isEvaluatorKind(memberType)) {
|
|
647
593
|
logWarning2(`Skipping member evaluator with invalid name/type in composite '${name}'`);
|
|
@@ -679,7 +625,7 @@ async function parseEvaluators(rawEvalCase, globalExecution, searchRoots, evalId
|
|
|
679
625
|
...Object.keys(parsedWeights).length > 0 ? { weights: parsedWeights } : {}
|
|
680
626
|
};
|
|
681
627
|
} else if (aggregatorType === "code_judge") {
|
|
682
|
-
const aggregatorPath =
|
|
628
|
+
const aggregatorPath = asString(rawAggregator.path);
|
|
683
629
|
if (!aggregatorPath) {
|
|
684
630
|
logWarning2(
|
|
685
631
|
`Skipping composite evaluator '${name}' in '${evalId}': code_judge aggregator missing path`
|
|
@@ -692,7 +638,7 @@ async function parseEvaluators(rawEvalCase, globalExecution, searchRoots, evalId
|
|
|
692
638
|
cwd: searchRoots[0]
|
|
693
639
|
};
|
|
694
640
|
} else {
|
|
695
|
-
const aggregatorPrompt =
|
|
641
|
+
const aggregatorPrompt = asString(rawAggregator.prompt);
|
|
696
642
|
let promptPath2;
|
|
697
643
|
if (aggregatorPrompt) {
|
|
698
644
|
const resolved = await resolveFileReference(aggregatorPrompt, searchRoots);
|
|
@@ -717,7 +663,7 @@ async function parseEvaluators(rawEvalCase, globalExecution, searchRoots, evalId
|
|
|
717
663
|
continue;
|
|
718
664
|
}
|
|
719
665
|
if (typeValue === "tool_trajectory") {
|
|
720
|
-
const mode =
|
|
666
|
+
const mode = asString(rawEvaluator.mode);
|
|
721
667
|
if (mode !== "any_order" && mode !== "in_order" && mode !== "exact") {
|
|
722
668
|
logWarning2(
|
|
723
669
|
`Skipping tool_trajectory evaluator '${name}' in '${evalId}': invalid mode '${mode}' (must be any_order, in_order, or exact)`
|
|
@@ -808,8 +754,8 @@ async function parseEvaluators(rawEvalCase, globalExecution, searchRoots, evalId
|
|
|
808
754
|
);
|
|
809
755
|
continue;
|
|
810
756
|
}
|
|
811
|
-
const fieldPath =
|
|
812
|
-
const match =
|
|
757
|
+
const fieldPath = asString(rawField.path);
|
|
758
|
+
const match = asString(rawField.match);
|
|
813
759
|
if (!fieldPath) {
|
|
814
760
|
logWarning2(
|
|
815
761
|
`Skipping field without path in field_accuracy evaluator '${name}' in '${evalId}'`
|
|
@@ -839,7 +785,7 @@ async function parseEvaluators(rawEvalCase, globalExecution, searchRoots, evalId
|
|
|
839
785
|
);
|
|
840
786
|
continue;
|
|
841
787
|
}
|
|
842
|
-
const aggregation =
|
|
788
|
+
const aggregation = asString(rawEvaluator.aggregation);
|
|
843
789
|
const validAggregation = isValidFieldAggregationType(aggregation) ? aggregation : void 0;
|
|
844
790
|
const weight2 = validateWeight(rawEvaluator.weight, name, evalId);
|
|
845
791
|
evaluators.push({
|
|
@@ -920,7 +866,7 @@ async function parseEvaluators(rawEvalCase, globalExecution, searchRoots, evalId
|
|
|
920
866
|
});
|
|
921
867
|
continue;
|
|
922
868
|
}
|
|
923
|
-
const prompt =
|
|
869
|
+
const prompt = asString(rawEvaluator.prompt);
|
|
924
870
|
let promptPath;
|
|
925
871
|
if (prompt) {
|
|
926
872
|
const resolved = await resolveFileReference(prompt, searchRoots);
|
|
@@ -939,11 +885,11 @@ async function parseEvaluators(rawEvalCase, globalExecution, searchRoots, evalId
|
|
|
939
885
|
);
|
|
940
886
|
}
|
|
941
887
|
}
|
|
942
|
-
const _model =
|
|
888
|
+
const _model = asString(rawEvaluator.model);
|
|
943
889
|
const rawRubrics = rawEvaluator.rubrics;
|
|
944
890
|
const parsedRubrics = Array.isArray(rawRubrics) ? rawRubrics.filter((r) => isJsonObject2(r)).map((rubric, index) => ({
|
|
945
|
-
id:
|
|
946
|
-
description:
|
|
891
|
+
id: asString(rubric.id) ?? `rubric-${index + 1}`,
|
|
892
|
+
description: asString(rubric.description) ?? "",
|
|
947
893
|
weight: typeof rubric.weight === "number" ? rubric.weight : 1,
|
|
948
894
|
required: typeof rubric.required === "boolean" ? rubric.required : true
|
|
949
895
|
})).filter((r) => r.description.length > 0) : void 0;
|
|
@@ -987,7 +933,7 @@ function coerceEvaluator(candidate, contextId) {
|
|
|
987
933
|
logWarning2(`Unknown evaluator '${candidate}' in ${contextId}, falling back to default`);
|
|
988
934
|
return void 0;
|
|
989
935
|
}
|
|
990
|
-
function
|
|
936
|
+
function asString(value) {
|
|
991
937
|
return typeof value === "string" ? value : void 0;
|
|
992
938
|
}
|
|
993
939
|
function asStringArray(value, description) {
|
|
@@ -1063,6 +1009,68 @@ function isValidFieldAggregationType(value) {
|
|
|
1063
1009
|
// src/evaluation/loaders/message-processor.ts
|
|
1064
1010
|
var import_promises4 = require("fs/promises");
|
|
1065
1011
|
var import_node_path4 = __toESM(require("path"), 1);
|
|
1012
|
+
|
|
1013
|
+
// src/evaluation/formatting/segment-formatter.ts
|
|
1014
|
+
function formatFileContents(parts) {
|
|
1015
|
+
const fileCount = parts.filter((p) => p.isFile).length;
|
|
1016
|
+
if (fileCount > 0) {
|
|
1017
|
+
return parts.map((part) => {
|
|
1018
|
+
if (part.isFile && part.displayPath) {
|
|
1019
|
+
return `<file path="${part.displayPath}">
|
|
1020
|
+
${part.content}
|
|
1021
|
+
</file>`;
|
|
1022
|
+
}
|
|
1023
|
+
return part.content;
|
|
1024
|
+
}).join("\n\n");
|
|
1025
|
+
}
|
|
1026
|
+
return parts.map((p) => p.content).join(" ");
|
|
1027
|
+
}
|
|
1028
|
+
function formatSegment(segment, mode = "lm") {
|
|
1029
|
+
const type = asString2(segment.type);
|
|
1030
|
+
if (type === "text") {
|
|
1031
|
+
return asString2(segment.value);
|
|
1032
|
+
}
|
|
1033
|
+
if (type === "guideline_ref") {
|
|
1034
|
+
const refPath = asString2(segment.path);
|
|
1035
|
+
return refPath ? `<Attached: ${refPath}>` : void 0;
|
|
1036
|
+
}
|
|
1037
|
+
if (type === "file") {
|
|
1038
|
+
const filePath = asString2(segment.path);
|
|
1039
|
+
if (!filePath) {
|
|
1040
|
+
return void 0;
|
|
1041
|
+
}
|
|
1042
|
+
if (mode === "agent") {
|
|
1043
|
+
return `<file: path="${filePath}">`;
|
|
1044
|
+
}
|
|
1045
|
+
const text = asString2(segment.text);
|
|
1046
|
+
if (text && filePath) {
|
|
1047
|
+
return formatFileContents([{ content: text.trim(), isFile: true, displayPath: filePath }]);
|
|
1048
|
+
}
|
|
1049
|
+
}
|
|
1050
|
+
return void 0;
|
|
1051
|
+
}
|
|
1052
|
+
function hasVisibleContent(segments) {
|
|
1053
|
+
return segments.some((segment) => {
|
|
1054
|
+
const type = asString2(segment.type);
|
|
1055
|
+
if (type === "text") {
|
|
1056
|
+
const value = asString2(segment.value);
|
|
1057
|
+
return value !== void 0 && value.trim().length > 0;
|
|
1058
|
+
}
|
|
1059
|
+
if (type === "guideline_ref") {
|
|
1060
|
+
return false;
|
|
1061
|
+
}
|
|
1062
|
+
if (type === "file") {
|
|
1063
|
+
const text = asString2(segment.text);
|
|
1064
|
+
return text !== void 0 && text.trim().length > 0;
|
|
1065
|
+
}
|
|
1066
|
+
return false;
|
|
1067
|
+
});
|
|
1068
|
+
}
|
|
1069
|
+
function asString2(value) {
|
|
1070
|
+
return typeof value === "string" ? value : void 0;
|
|
1071
|
+
}
|
|
1072
|
+
|
|
1073
|
+
// src/evaluation/loaders/message-processor.ts
|
|
1066
1074
|
var ANSI_YELLOW4 = "\x1B[33m";
|
|
1067
1075
|
var ANSI_RESET4 = "\x1B[0m";
|
|
1068
1076
|
async function processMessages(options) {
|
|
@@ -1368,9 +1376,6 @@ ${messageContent}`);
|
|
|
1368
1376
|
questionParts.push(formattedContent);
|
|
1369
1377
|
}
|
|
1370
1378
|
}
|
|
1371
|
-
if (testCase.code_snippets.length > 0) {
|
|
1372
|
-
questionParts.push(testCase.code_snippets.join("\n"));
|
|
1373
|
-
}
|
|
1374
1379
|
question = questionParts.map((part) => part.trim()).filter((part) => part.length > 0).join("\n\n");
|
|
1375
1380
|
}
|
|
1376
1381
|
const chatPrompt = useRoleMarkers ? buildChatPromptFromSegments({
|
|
@@ -1569,7 +1574,6 @@ async function loadEvalCases(evalFilePath, repoRoot, options) {
|
|
|
1569
1574
|
repoRootPath,
|
|
1570
1575
|
verbose
|
|
1571
1576
|
}) : [];
|
|
1572
|
-
const codeSnippets = extractCodeBlocks(inputSegments);
|
|
1573
1577
|
let referenceAnswer = "";
|
|
1574
1578
|
if (outputSegments.length > 0) {
|
|
1575
1579
|
const lastMessage = outputSegments[outputSegments.length - 1];
|
|
@@ -1642,7 +1646,6 @@ async function loadEvalCases(evalFilePath, repoRoot, options) {
|
|
|
1642
1646
|
guideline_paths: guidelinePaths.map((guidelinePath) => import_node_path6.default.resolve(guidelinePath)),
|
|
1643
1647
|
guideline_patterns: guidelinePatterns,
|
|
1644
1648
|
file_paths: allFilePaths,
|
|
1645
|
-
code_snippets: codeSnippets,
|
|
1646
1649
|
expected_outcome: outcome,
|
|
1647
1650
|
evaluator: evalCaseEvaluatorKind,
|
|
1648
1651
|
evaluators
|
|
@@ -6327,9 +6330,64 @@ function resolveAndCreateProvider(definition, env = process.env) {
|
|
|
6327
6330
|
return createProvider(resolved);
|
|
6328
6331
|
}
|
|
6329
6332
|
|
|
6330
|
-
// src/evaluation/evaluators.ts
|
|
6331
|
-
|
|
6332
|
-
|
|
6333
|
+
// src/evaluation/evaluators/scoring.ts
|
|
6334
|
+
function scoreToVerdict(score) {
|
|
6335
|
+
if (score >= 0.8) {
|
|
6336
|
+
return "pass";
|
|
6337
|
+
}
|
|
6338
|
+
if (score >= 0.6) {
|
|
6339
|
+
return "borderline";
|
|
6340
|
+
}
|
|
6341
|
+
return "fail";
|
|
6342
|
+
}
|
|
6343
|
+
function clampScore(value) {
|
|
6344
|
+
if (Number.isNaN(value) || !Number.isFinite(value)) {
|
|
6345
|
+
return 0;
|
|
6346
|
+
}
|
|
6347
|
+
if (value < 0) {
|
|
6348
|
+
return 0;
|
|
6349
|
+
}
|
|
6350
|
+
if (value > 1) {
|
|
6351
|
+
return 1;
|
|
6352
|
+
}
|
|
6353
|
+
return value;
|
|
6354
|
+
}
|
|
6355
|
+
function extractJsonBlob(text) {
|
|
6356
|
+
const match = text.match(/\{[\s\S]*\}/);
|
|
6357
|
+
return match?.[0];
|
|
6358
|
+
}
|
|
6359
|
+
function parseJsonFromText(text) {
|
|
6360
|
+
const cleaned = typeof text === "string" ? text.replace(/```json\n?|```/g, "").trim() : "";
|
|
6361
|
+
const blob = extractJsonBlob(cleaned) ?? cleaned;
|
|
6362
|
+
return JSON.parse(blob);
|
|
6363
|
+
}
|
|
6364
|
+
function isNonEmptyString(value) {
|
|
6365
|
+
return typeof value === "string" && value.trim().length > 0;
|
|
6366
|
+
}
|
|
6367
|
+
function parseJsonSafe(payload) {
|
|
6368
|
+
try {
|
|
6369
|
+
return JSON.parse(payload);
|
|
6370
|
+
} catch {
|
|
6371
|
+
return void 0;
|
|
6372
|
+
}
|
|
6373
|
+
}
|
|
6374
|
+
function deepEqual(a, b) {
|
|
6375
|
+
if (a === b) return true;
|
|
6376
|
+
if (a === null || b === null) return a === b;
|
|
6377
|
+
if (typeof a !== typeof b) return false;
|
|
6378
|
+
if (typeof a !== "object") return a === b;
|
|
6379
|
+
if (Array.isArray(a) !== Array.isArray(b)) return false;
|
|
6380
|
+
if (Array.isArray(a) && Array.isArray(b)) {
|
|
6381
|
+
if (a.length !== b.length) return false;
|
|
6382
|
+
return a.every((val, i) => deepEqual(val, b[i]));
|
|
6383
|
+
}
|
|
6384
|
+
const aObj = a;
|
|
6385
|
+
const bObj = b;
|
|
6386
|
+
const aKeys = Object.keys(aObj);
|
|
6387
|
+
const bKeys = Object.keys(bObj);
|
|
6388
|
+
if (aKeys.length !== bKeys.length) return false;
|
|
6389
|
+
return aKeys.every((key) => Object.hasOwn(bObj, key) && deepEqual(aObj[key], bObj[key]));
|
|
6390
|
+
}
|
|
6333
6391
|
|
|
6334
6392
|
// src/runtime/exec.ts
|
|
6335
6393
|
function shellEscapePath(value) {
|
|
@@ -6354,7 +6412,9 @@ async function execFileWithStdinBun(argv, stdinPayload, options) {
|
|
|
6354
6412
|
cwd: options.cwd,
|
|
6355
6413
|
stdin: encoder.encode(stdinPayload),
|
|
6356
6414
|
stdout: "pipe",
|
|
6357
|
-
stderr: "pipe"
|
|
6415
|
+
stderr: "pipe",
|
|
6416
|
+
// Merge additional env vars with process.env
|
|
6417
|
+
env: options.env ? { ...process.env, ...options.env } : process.env
|
|
6358
6418
|
});
|
|
6359
6419
|
let timedOut = false;
|
|
6360
6420
|
const timeout = options.timeoutMs !== void 0 ? setTimeout(() => {
|
|
@@ -6389,7 +6449,9 @@ async function execFileWithStdinNode(argv, stdinPayload, options) {
|
|
|
6389
6449
|
const [cmd, ...args] = argv;
|
|
6390
6450
|
const child = spawn4(cmd, args, {
|
|
6391
6451
|
cwd: options.cwd,
|
|
6392
|
-
stdio: ["pipe", "pipe", "pipe"]
|
|
6452
|
+
stdio: ["pipe", "pipe", "pipe"],
|
|
6453
|
+
// Merge additional env vars with process.env
|
|
6454
|
+
env: options.env ? { ...process.env, ...options.env } : process.env
|
|
6393
6455
|
});
|
|
6394
6456
|
const stdoutChunks = [];
|
|
6395
6457
|
const stderrChunks = [];
|
|
@@ -6442,7 +6504,9 @@ async function execShellWithStdin(command, stdinPayload, options = {}) {
|
|
|
6442
6504
|
const child = spawn4(wrappedCommand, {
|
|
6443
6505
|
shell: true,
|
|
6444
6506
|
cwd: options.cwd,
|
|
6445
|
-
stdio: ["ignore", "ignore", "ignore"]
|
|
6507
|
+
stdio: ["ignore", "ignore", "ignore"],
|
|
6508
|
+
// Merge additional env vars with process.env
|
|
6509
|
+
env: options.env ? { ...process.env, ...options.env } : process.env
|
|
6446
6510
|
});
|
|
6447
6511
|
const timeout = options.timeoutMs ? setTimeout(() => {
|
|
6448
6512
|
child.kill();
|
|
@@ -6469,59 +6533,414 @@ async function execShellWithStdin(command, stdinPayload, options = {}) {
|
|
|
6469
6533
|
}
|
|
6470
6534
|
}
|
|
6471
6535
|
|
|
6472
|
-
// src/
|
|
6473
|
-
|
|
6474
|
-
|
|
6475
|
-
|
|
6476
|
-
|
|
6477
|
-
|
|
6478
|
-
|
|
6479
|
-
|
|
6480
|
-
|
|
6481
|
-
|
|
6482
|
-
|
|
6483
|
-
|
|
6484
|
-
|
|
6485
|
-
|
|
6486
|
-
|
|
6487
|
-
|
|
6488
|
-
for (const [key, value] of Object.entries(obj)) {
|
|
6489
|
-
const snakeKey = toSnakeCase(key);
|
|
6490
|
-
result[snakeKey] = toSnakeCaseDeep(value);
|
|
6536
|
+
// src/runtime/target-proxy.ts
|
|
6537
|
+
var import_node_crypto4 = require("crypto");
|
|
6538
|
+
var import_node_http = require("http");
|
|
6539
|
+
var DEFAULT_MAX_CALLS = 50;
|
|
6540
|
+
async function createTargetProxy(options) {
|
|
6541
|
+
const { defaultProvider, targetResolver, availableTargets, maxCalls } = options;
|
|
6542
|
+
const token = (0, import_node_crypto4.randomBytes)(32).toString("hex");
|
|
6543
|
+
let callCount = 0;
|
|
6544
|
+
let isShutdown = false;
|
|
6545
|
+
const targetsList = availableTargets ?? [defaultProvider.targetName];
|
|
6546
|
+
function resolveProvider(targetName) {
|
|
6547
|
+
if (targetName === void 0 || targetName === defaultProvider.targetName) {
|
|
6548
|
+
return defaultProvider;
|
|
6549
|
+
}
|
|
6550
|
+
if (targetResolver) {
|
|
6551
|
+
return targetResolver(targetName);
|
|
6491
6552
|
}
|
|
6492
|
-
return
|
|
6553
|
+
return void 0;
|
|
6493
6554
|
}
|
|
6494
|
-
|
|
6495
|
-
|
|
6496
|
-
|
|
6497
|
-
|
|
6498
|
-
|
|
6499
|
-
|
|
6500
|
-
|
|
6501
|
-
|
|
6502
|
-
|
|
6503
|
-
|
|
6504
|
-
|
|
6505
|
-
|
|
6506
|
-
|
|
6507
|
-
|
|
6555
|
+
const server = (0, import_node_http.createServer)(async (req, res) => {
|
|
6556
|
+
res.setHeader("Access-Control-Allow-Origin", "*");
|
|
6557
|
+
res.setHeader("Access-Control-Allow-Methods", "GET, POST, OPTIONS");
|
|
6558
|
+
res.setHeader("Access-Control-Allow-Headers", "Content-Type, Authorization");
|
|
6559
|
+
if (req.method === "OPTIONS") {
|
|
6560
|
+
res.writeHead(204);
|
|
6561
|
+
res.end();
|
|
6562
|
+
return;
|
|
6563
|
+
}
|
|
6564
|
+
const authHeader = req.headers.authorization;
|
|
6565
|
+
if (!authHeader || authHeader !== `Bearer ${token}`) {
|
|
6566
|
+
sendJson(res, 401, { error: "Unauthorized" });
|
|
6567
|
+
return;
|
|
6568
|
+
}
|
|
6569
|
+
if (isShutdown) {
|
|
6570
|
+
sendJson(res, 503, { error: "Proxy is shutting down" });
|
|
6571
|
+
return;
|
|
6572
|
+
}
|
|
6573
|
+
const url2 = req.url ?? "";
|
|
6574
|
+
if (req.method === "GET" && url2 === "/info") {
|
|
6575
|
+
handleInfo(res);
|
|
6576
|
+
return;
|
|
6577
|
+
}
|
|
6578
|
+
if (req.method === "POST" && url2 === "/invoke") {
|
|
6579
|
+
await handleInvoke(req, res);
|
|
6580
|
+
return;
|
|
6581
|
+
}
|
|
6582
|
+
if (req.method === "POST" && url2 === "/invokeBatch") {
|
|
6583
|
+
await handleInvokeBatch(req, res);
|
|
6584
|
+
return;
|
|
6585
|
+
}
|
|
6586
|
+
sendJson(res, 404, { error: "Not found" });
|
|
6587
|
+
});
|
|
6588
|
+
function handleInfo(res) {
|
|
6589
|
+
const response = {
|
|
6590
|
+
targetName: defaultProvider.targetName,
|
|
6591
|
+
maxCalls,
|
|
6592
|
+
callCount,
|
|
6593
|
+
availableTargets: targetsList
|
|
6594
|
+
};
|
|
6595
|
+
sendJson(res, 200, response);
|
|
6508
6596
|
}
|
|
6509
|
-
|
|
6510
|
-
|
|
6511
|
-
|
|
6512
|
-
|
|
6513
|
-
|
|
6597
|
+
async function handleInvoke(req, res) {
|
|
6598
|
+
if (callCount >= maxCalls) {
|
|
6599
|
+
sendJson(res, 429, { error: `Max calls exceeded (limit: ${maxCalls})` });
|
|
6600
|
+
return;
|
|
6601
|
+
}
|
|
6602
|
+
try {
|
|
6603
|
+
const body = await readBody(req);
|
|
6604
|
+
const request = JSON.parse(body);
|
|
6605
|
+
if (!request.question || typeof request.question !== "string") {
|
|
6606
|
+
sendJson(res, 400, { error: "Missing required field: question" });
|
|
6607
|
+
return;
|
|
6514
6608
|
}
|
|
6515
|
-
|
|
6609
|
+
const provider = resolveProvider(request.target);
|
|
6610
|
+
if (!provider) {
|
|
6611
|
+
sendJson(res, 400, {
|
|
6612
|
+
error: `Unknown target '${request.target}'. Available: ${targetsList.join(", ")}`
|
|
6613
|
+
});
|
|
6614
|
+
return;
|
|
6615
|
+
}
|
|
6616
|
+
callCount++;
|
|
6617
|
+
const response = await provider.invoke({
|
|
6618
|
+
question: request.question,
|
|
6619
|
+
systemPrompt: request.systemPrompt,
|
|
6620
|
+
evalCaseId: request.evalCaseId ?? "proxy",
|
|
6621
|
+
attempt: request.attempt ?? 1
|
|
6622
|
+
});
|
|
6623
|
+
const outputMessages = response.outputMessages ?? [];
|
|
6624
|
+
const rawText = extractLastAssistantContent(outputMessages);
|
|
6625
|
+
const result = {
|
|
6626
|
+
outputMessages,
|
|
6627
|
+
rawText
|
|
6628
|
+
};
|
|
6629
|
+
sendJson(res, 200, result);
|
|
6630
|
+
} catch (error) {
|
|
6631
|
+
const message = error instanceof Error ? error.message : String(error);
|
|
6632
|
+
sendJson(res, 500, { error: message });
|
|
6516
6633
|
}
|
|
6517
6634
|
}
|
|
6518
|
-
|
|
6519
|
-
|
|
6635
|
+
async function handleInvokeBatch(req, res) {
|
|
6636
|
+
try {
|
|
6637
|
+
const body = await readBody(req);
|
|
6638
|
+
const { requests } = JSON.parse(body);
|
|
6639
|
+
if (!Array.isArray(requests)) {
|
|
6640
|
+
sendJson(res, 400, { error: "Missing required field: requests (array)" });
|
|
6641
|
+
return;
|
|
6642
|
+
}
|
|
6643
|
+
if (callCount + requests.length > maxCalls) {
|
|
6644
|
+
sendJson(res, 429, {
|
|
6645
|
+
error: `Batch would exceed max calls (current: ${callCount}, batch: ${requests.length}, limit: ${maxCalls})`
|
|
6646
|
+
});
|
|
6647
|
+
return;
|
|
6648
|
+
}
|
|
6649
|
+
const responses = [];
|
|
6650
|
+
for (const request of requests) {
|
|
6651
|
+
if (!request.question || typeof request.question !== "string") {
|
|
6652
|
+
responses.push({
|
|
6653
|
+
outputMessages: [],
|
|
6654
|
+
rawText: "Error: Missing required field: question"
|
|
6655
|
+
});
|
|
6656
|
+
continue;
|
|
6657
|
+
}
|
|
6658
|
+
const provider = resolveProvider(request.target);
|
|
6659
|
+
if (!provider) {
|
|
6660
|
+
responses.push({
|
|
6661
|
+
outputMessages: [],
|
|
6662
|
+
rawText: `Error: Unknown target '${request.target}'. Available: ${targetsList.join(", ")}`
|
|
6663
|
+
});
|
|
6664
|
+
continue;
|
|
6665
|
+
}
|
|
6666
|
+
callCount++;
|
|
6667
|
+
try {
|
|
6668
|
+
const response = await provider.invoke({
|
|
6669
|
+
question: request.question,
|
|
6670
|
+
systemPrompt: request.systemPrompt,
|
|
6671
|
+
evalCaseId: request.evalCaseId ?? "proxy",
|
|
6672
|
+
attempt: request.attempt ?? 1
|
|
6673
|
+
});
|
|
6674
|
+
const outputMessages = response.outputMessages ?? [];
|
|
6675
|
+
responses.push({
|
|
6676
|
+
outputMessages,
|
|
6677
|
+
rawText: extractLastAssistantContent(outputMessages)
|
|
6678
|
+
});
|
|
6679
|
+
} catch (error) {
|
|
6680
|
+
const message = error instanceof Error ? error.message : String(error);
|
|
6681
|
+
responses.push({
|
|
6682
|
+
outputMessages: [],
|
|
6683
|
+
rawText: `Error: ${message}`
|
|
6684
|
+
});
|
|
6685
|
+
}
|
|
6686
|
+
}
|
|
6687
|
+
sendJson(res, 200, { responses });
|
|
6688
|
+
} catch (error) {
|
|
6689
|
+
const message = error instanceof Error ? error.message : String(error);
|
|
6690
|
+
sendJson(res, 500, { error: message });
|
|
6691
|
+
}
|
|
6692
|
+
}
|
|
6693
|
+
await new Promise((resolve, reject) => {
|
|
6694
|
+
server.once("error", reject);
|
|
6695
|
+
server.listen(0, "127.0.0.1", () => {
|
|
6696
|
+
server.removeListener("error", reject);
|
|
6697
|
+
resolve();
|
|
6698
|
+
});
|
|
6699
|
+
});
|
|
6700
|
+
const address = server.address();
|
|
6701
|
+
const url = `http://127.0.0.1:${address.port}`;
|
|
6702
|
+
return {
|
|
6703
|
+
url,
|
|
6704
|
+
token,
|
|
6705
|
+
shutdown: async () => {
|
|
6706
|
+
isShutdown = true;
|
|
6707
|
+
return new Promise((resolve, reject) => {
|
|
6708
|
+
server.close((err) => {
|
|
6709
|
+
if (err) reject(err);
|
|
6710
|
+
else resolve();
|
|
6711
|
+
});
|
|
6712
|
+
});
|
|
6713
|
+
},
|
|
6714
|
+
getUsageMetadata: () => ({
|
|
6715
|
+
callCount,
|
|
6716
|
+
maxCalls
|
|
6717
|
+
})
|
|
6718
|
+
};
|
|
6719
|
+
}
|
|
6720
|
+
function sendJson(res, statusCode, body) {
|
|
6721
|
+
res.writeHead(statusCode, { "Content-Type": "application/json" });
|
|
6722
|
+
res.end(JSON.stringify(body));
|
|
6723
|
+
}
|
|
6724
|
+
function readBody(req) {
|
|
6725
|
+
return new Promise((resolve, reject) => {
|
|
6726
|
+
const chunks = [];
|
|
6727
|
+
req.on("data", (chunk) => chunks.push(chunk));
|
|
6728
|
+
req.on("end", () => resolve(Buffer.concat(chunks).toString("utf8")));
|
|
6729
|
+
req.on("error", reject);
|
|
6730
|
+
});
|
|
6731
|
+
}
|
|
6732
|
+
function extractLastAssistantContent(messages) {
|
|
6733
|
+
for (let i = messages.length - 1; i >= 0; i--) {
|
|
6734
|
+
const msg = messages[i];
|
|
6735
|
+
if (msg.role === "assistant" && msg.content !== void 0) {
|
|
6736
|
+
if (typeof msg.content === "string") {
|
|
6737
|
+
return msg.content;
|
|
6738
|
+
}
|
|
6739
|
+
if (Array.isArray(msg.content)) {
|
|
6740
|
+
for (const part of msg.content) {
|
|
6741
|
+
if (typeof part === "object" && part !== null && "text" in part) {
|
|
6742
|
+
return String(part.text);
|
|
6743
|
+
}
|
|
6744
|
+
}
|
|
6745
|
+
}
|
|
6746
|
+
}
|
|
6747
|
+
}
|
|
6748
|
+
return void 0;
|
|
6749
|
+
}
|
|
6750
|
+
|
|
6751
|
+
// src/evaluation/case-conversion.ts
|
|
6752
|
+
function toSnakeCase(str) {
|
|
6753
|
+
if (/^[A-Z]/.test(str)) {
|
|
6754
|
+
return str;
|
|
6755
|
+
}
|
|
6756
|
+
return str.replace(/[A-Z]/g, (letter) => `_${letter.toLowerCase()}`);
|
|
6757
|
+
}
|
|
6758
|
+
function toSnakeCaseDeep(obj) {
|
|
6759
|
+
if (obj === null || obj === void 0) {
|
|
6760
|
+
return obj;
|
|
6761
|
+
}
|
|
6762
|
+
if (Array.isArray(obj)) {
|
|
6763
|
+
return obj.map((item) => toSnakeCaseDeep(item));
|
|
6764
|
+
}
|
|
6765
|
+
if (typeof obj === "object") {
|
|
6766
|
+
const result = {};
|
|
6767
|
+
for (const [key, value] of Object.entries(obj)) {
|
|
6768
|
+
const snakeKey = toSnakeCase(key);
|
|
6769
|
+
result[snakeKey] = toSnakeCaseDeep(value);
|
|
6770
|
+
}
|
|
6771
|
+
return result;
|
|
6772
|
+
}
|
|
6773
|
+
return obj;
|
|
6774
|
+
}
|
|
6775
|
+
|
|
6776
|
+
// src/evaluation/evaluators/code-evaluator.ts
|
|
6777
|
+
var CodeEvaluator = class {
|
|
6778
|
+
kind = "code";
|
|
6779
|
+
script;
|
|
6780
|
+
cwd;
|
|
6781
|
+
agentTimeoutMs;
|
|
6782
|
+
config;
|
|
6783
|
+
target;
|
|
6784
|
+
constructor(options) {
|
|
6785
|
+
this.script = options.script;
|
|
6786
|
+
this.cwd = options.cwd;
|
|
6787
|
+
this.agentTimeoutMs = options.agentTimeoutMs;
|
|
6788
|
+
this.config = options.config;
|
|
6789
|
+
this.target = options.target;
|
|
6790
|
+
}
|
|
6791
|
+
async evaluate(context) {
|
|
6792
|
+
const payload = {
|
|
6793
|
+
question: context.evalCase.question,
|
|
6794
|
+
expectedOutcome: context.evalCase.expected_outcome,
|
|
6795
|
+
expectedMessages: context.evalCase.expected_messages,
|
|
6796
|
+
referenceAnswer: context.evalCase.reference_answer,
|
|
6797
|
+
candidateAnswer: context.candidate,
|
|
6798
|
+
outputMessages: context.outputMessages ?? null,
|
|
6799
|
+
guidelineFiles: context.evalCase.guideline_paths,
|
|
6800
|
+
inputFiles: context.evalCase.file_paths.filter(
|
|
6801
|
+
(path17) => !context.evalCase.guideline_paths.includes(path17)
|
|
6802
|
+
),
|
|
6803
|
+
inputMessages: context.evalCase.input_messages,
|
|
6804
|
+
traceSummary: context.traceSummary ?? null,
|
|
6805
|
+
config: this.config ?? null
|
|
6806
|
+
};
|
|
6807
|
+
const inputPayload = JSON.stringify(toSnakeCaseDeep(payload), null, 2);
|
|
6808
|
+
let proxyEnv;
|
|
6809
|
+
let proxyShutdown;
|
|
6810
|
+
let getProxyUsage;
|
|
6811
|
+
if (this.target !== void 0 && context.judgeProvider) {
|
|
6812
|
+
const maxCalls = this.target.max_calls ?? DEFAULT_MAX_CALLS;
|
|
6813
|
+
const proxy = await createTargetProxy({
|
|
6814
|
+
defaultProvider: context.judgeProvider,
|
|
6815
|
+
targetResolver: context.targetResolver,
|
|
6816
|
+
availableTargets: context.availableTargets,
|
|
6817
|
+
maxCalls
|
|
6818
|
+
});
|
|
6819
|
+
proxyEnv = {
|
|
6820
|
+
AGENTV_TARGET_PROXY_URL: proxy.url,
|
|
6821
|
+
AGENTV_TARGET_PROXY_TOKEN: proxy.token
|
|
6822
|
+
};
|
|
6823
|
+
proxyShutdown = proxy.shutdown;
|
|
6824
|
+
getProxyUsage = proxy.getUsageMetadata;
|
|
6825
|
+
}
|
|
6826
|
+
try {
|
|
6827
|
+
const stdout = await executeScript(
|
|
6828
|
+
this.script,
|
|
6829
|
+
inputPayload,
|
|
6830
|
+
this.agentTimeoutMs,
|
|
6831
|
+
this.cwd,
|
|
6832
|
+
proxyEnv
|
|
6833
|
+
);
|
|
6834
|
+
const parsed = parseJsonSafe(stdout);
|
|
6835
|
+
const score = clampScore(typeof parsed?.score === "number" ? parsed.score : 0);
|
|
6836
|
+
const hits = Array.isArray(parsed?.hits) ? parsed.hits.filter(isNonEmptyString) : [];
|
|
6837
|
+
const misses = Array.isArray(parsed?.misses) ? parsed.misses.filter(isNonEmptyString) : [];
|
|
6838
|
+
const reasoning = typeof parsed?.reasoning === "string" ? parsed.reasoning : void 0;
|
|
6839
|
+
const details = parsed?.details && typeof parsed.details === "object" && !Array.isArray(parsed.details) ? parsed.details : void 0;
|
|
6840
|
+
const proxyUsage = getProxyUsage?.();
|
|
6841
|
+
const evaluatorRawRequest = {
|
|
6842
|
+
script: this.script,
|
|
6843
|
+
...this.cwd ? { cwd: this.cwd } : {},
|
|
6844
|
+
...proxyUsage ? {
|
|
6845
|
+
target_proxy: {
|
|
6846
|
+
call_count: proxyUsage.callCount,
|
|
6847
|
+
max_calls: proxyUsage.maxCalls
|
|
6848
|
+
}
|
|
6849
|
+
} : {}
|
|
6850
|
+
};
|
|
6851
|
+
return {
|
|
6852
|
+
score,
|
|
6853
|
+
verdict: scoreToVerdict(score),
|
|
6854
|
+
hits,
|
|
6855
|
+
misses,
|
|
6856
|
+
expectedAspectCount: hits.length + misses.length || 1,
|
|
6857
|
+
reasoning,
|
|
6858
|
+
evaluatorRawRequest,
|
|
6859
|
+
...details ? { details } : {}
|
|
6860
|
+
};
|
|
6861
|
+
} catch (error) {
|
|
6862
|
+
const message = error instanceof Error ? error.message : String(error);
|
|
6863
|
+
const proxyUsage = getProxyUsage?.();
|
|
6864
|
+
return {
|
|
6865
|
+
score: 0,
|
|
6866
|
+
verdict: "fail",
|
|
6867
|
+
hits: [],
|
|
6868
|
+
misses: [`Code evaluator failed: ${message}`],
|
|
6869
|
+
expectedAspectCount: 1,
|
|
6870
|
+
reasoning: message,
|
|
6871
|
+
evaluatorRawRequest: {
|
|
6872
|
+
script: this.script,
|
|
6873
|
+
...this.cwd ? { cwd: this.cwd } : {},
|
|
6874
|
+
...proxyUsage ? {
|
|
6875
|
+
target_proxy: {
|
|
6876
|
+
call_count: proxyUsage.callCount,
|
|
6877
|
+
max_calls: proxyUsage.maxCalls
|
|
6878
|
+
}
|
|
6879
|
+
} : {},
|
|
6880
|
+
error: message
|
|
6881
|
+
}
|
|
6882
|
+
};
|
|
6883
|
+
} finally {
|
|
6884
|
+
if (proxyShutdown) {
|
|
6885
|
+
await proxyShutdown();
|
|
6886
|
+
}
|
|
6887
|
+
}
|
|
6888
|
+
}
|
|
6889
|
+
};
|
|
6890
|
+
async function executeScript(scriptPath, input, agentTimeoutMs, cwd, env) {
|
|
6891
|
+
const { stdout, stderr, exitCode } = typeof scriptPath === "string" ? await execShellWithStdin(scriptPath, input, { cwd, timeoutMs: agentTimeoutMs, env }) : await execFileWithStdin(scriptPath, input, { cwd, timeoutMs: agentTimeoutMs, env });
|
|
6892
|
+
if (exitCode !== 0) {
|
|
6893
|
+
const trimmedErr = formatStderr(stderr);
|
|
6894
|
+
throw new Error(
|
|
6895
|
+
trimmedErr.length > 0 ? `Code evaluator exited with code ${exitCode}: ${trimmedErr}` : `Code evaluator exited with code ${exitCode}`
|
|
6896
|
+
);
|
|
6897
|
+
}
|
|
6898
|
+
return stdout.trim();
|
|
6899
|
+
}
|
|
6900
|
+
function formatStderr(stderr) {
|
|
6901
|
+
const trimmed = stderr.trim();
|
|
6902
|
+
const maxLength = 2e3;
|
|
6903
|
+
if (trimmed.length <= maxLength) {
|
|
6904
|
+
return trimmed;
|
|
6905
|
+
}
|
|
6906
|
+
const tail = trimmed.slice(-maxLength);
|
|
6907
|
+
return `...(truncated, last ${maxLength} chars)
|
|
6908
|
+
${tail}`;
|
|
6909
|
+
}
|
|
6910
|
+
|
|
6911
|
+
// src/evaluation/evaluators/composite.ts
|
|
6912
|
+
var import_ai3 = require("ai");
|
|
6913
|
+
|
|
6914
|
+
// src/evaluation/providers/types.ts
|
|
6915
|
+
var AGENT_PROVIDER_KINDS = [
|
|
6916
|
+
"codex",
|
|
6917
|
+
"pi-coding-agent",
|
|
6918
|
+
"claude-code",
|
|
6919
|
+
"vscode",
|
|
6920
|
+
"vscode-insiders"
|
|
6921
|
+
];
|
|
6922
|
+
function extractLastAssistantContent2(messages) {
|
|
6923
|
+
if (!messages || messages.length === 0) {
|
|
6924
|
+
return "";
|
|
6925
|
+
}
|
|
6926
|
+
for (let i = messages.length - 1; i >= 0; i--) {
|
|
6927
|
+
const msg = messages[i];
|
|
6928
|
+
if (msg.role === "assistant" && msg.content !== void 0) {
|
|
6929
|
+
if (typeof msg.content === "string") {
|
|
6930
|
+
return msg.content;
|
|
6931
|
+
}
|
|
6932
|
+
return JSON.stringify(msg.content);
|
|
6933
|
+
}
|
|
6934
|
+
}
|
|
6935
|
+
return "";
|
|
6936
|
+
}
|
|
6520
6937
|
function isAgentProvider(provider) {
|
|
6521
6938
|
return provider ? AGENT_PROVIDER_KINDS.includes(provider.kind) : false;
|
|
6522
6939
|
}
|
|
6523
6940
|
|
|
6524
|
-
// src/evaluation/evaluators.ts
|
|
6941
|
+
// src/evaluation/evaluators/llm-judge.ts
|
|
6942
|
+
var import_ai2 = require("ai");
|
|
6943
|
+
var import_zod3 = require("zod");
|
|
6525
6944
|
var DEFAULT_EVALUATOR_TEMPLATE = `You are an expert evaluator. Your goal is to grade the candidate_answer based on how well it achieves the expected_outcome for the original task.
|
|
6526
6945
|
|
|
6527
6946
|
Use the reference_answer as a gold standard for a high-quality response (if provided). The reference_answer may be a simple text response, or it may contain a sequence of expected agent messages including tool calls. When it contains multiple messages, the last message represents the final expected answer. The candidate_answer does not need to match it verbatim, but should capture the key points and follow the same spirit.
|
|
@@ -6601,7 +7020,7 @@ var LlmJudgeEvaluator = class {
|
|
|
6601
7020
|
target: judgeProvider.targetName
|
|
6602
7021
|
};
|
|
6603
7022
|
try {
|
|
6604
|
-
const { data
|
|
7023
|
+
const { data } = await this.runWithRetry({
|
|
6605
7024
|
context,
|
|
6606
7025
|
judgeProvider,
|
|
6607
7026
|
systemPrompt,
|
|
@@ -6714,7 +7133,7 @@ var LlmJudgeEvaluator = class {
|
|
|
6714
7133
|
temperature: this.temperature
|
|
6715
7134
|
});
|
|
6716
7135
|
const data = schema.parse(
|
|
6717
|
-
parseJsonFromText(
|
|
7136
|
+
parseJsonFromText(extractLastAssistantContent2(response.outputMessages))
|
|
6718
7137
|
);
|
|
6719
7138
|
return { data, providerResponse: response };
|
|
6720
7139
|
} catch (e) {
|
|
@@ -6750,86 +7169,160 @@ You must return a valid JSON object matching this schema:
|
|
|
6750
7169
|
"overall_reasoning": "string (summary)"
|
|
6751
7170
|
}`;
|
|
6752
7171
|
}
|
|
6753
|
-
function
|
|
6754
|
-
|
|
6755
|
-
return
|
|
6756
|
-
}
|
|
6757
|
-
if (score >= 0.6) {
|
|
6758
|
-
return "borderline";
|
|
6759
|
-
}
|
|
6760
|
-
return "fail";
|
|
7172
|
+
function substituteVariables(template, variables) {
|
|
7173
|
+
return template.replace(/\{\{\s*([a-zA-Z0-9_]+)\s*\}\}/g, (match, varName) => {
|
|
7174
|
+
return variables[varName] ?? match;
|
|
7175
|
+
});
|
|
6761
7176
|
}
|
|
6762
|
-
function
|
|
6763
|
-
|
|
6764
|
-
|
|
6765
|
-
|
|
6766
|
-
|
|
6767
|
-
|
|
6768
|
-
|
|
6769
|
-
|
|
6770
|
-
|
|
7177
|
+
function calculateRubricScore(result, rubrics) {
|
|
7178
|
+
const rubricMap = new Map(rubrics.map((rubric) => [rubric.id, rubric]));
|
|
7179
|
+
const hits = [];
|
|
7180
|
+
const misses = [];
|
|
7181
|
+
let totalWeight = 0;
|
|
7182
|
+
let earnedWeight = 0;
|
|
7183
|
+
let failedRequired = false;
|
|
7184
|
+
for (const check of result.checks) {
|
|
7185
|
+
const rubric = rubricMap.get(check.id);
|
|
7186
|
+
if (!rubric) {
|
|
7187
|
+
continue;
|
|
7188
|
+
}
|
|
7189
|
+
totalWeight += rubric.weight;
|
|
7190
|
+
if (check.satisfied) {
|
|
7191
|
+
earnedWeight += rubric.weight;
|
|
7192
|
+
hits.push(`[${rubric.id}] ${rubric.description}: ${check.reasoning}`);
|
|
7193
|
+
} else {
|
|
7194
|
+
misses.push(`[${rubric.id}] ${rubric.description}: ${check.reasoning}`);
|
|
7195
|
+
if (rubric.required) {
|
|
7196
|
+
failedRequired = true;
|
|
7197
|
+
}
|
|
7198
|
+
}
|
|
6771
7199
|
}
|
|
6772
|
-
|
|
6773
|
-
|
|
6774
|
-
|
|
6775
|
-
const match = text.match(/\{[\s\S]*\}/);
|
|
6776
|
-
return match?.[0];
|
|
6777
|
-
}
|
|
6778
|
-
function parseJsonFromText(text) {
|
|
6779
|
-
const cleaned = typeof text === "string" ? text.replace(/```json\n?|```/g, "").trim() : "";
|
|
6780
|
-
const blob = extractJsonBlob(cleaned) ?? cleaned;
|
|
6781
|
-
return JSON.parse(blob);
|
|
6782
|
-
}
|
|
6783
|
-
function isNonEmptyString(value) {
|
|
6784
|
-
return typeof value === "string" && value.trim().length > 0;
|
|
7200
|
+
const score = totalWeight > 0 ? Math.min(1, Math.max(0, earnedWeight / totalWeight)) : 0;
|
|
7201
|
+
const verdict = failedRequired ? "fail" : scoreToVerdict(score);
|
|
7202
|
+
return { score, verdict, hits, misses };
|
|
6785
7203
|
}
|
|
6786
|
-
|
|
6787
|
-
|
|
6788
|
-
|
|
6789
|
-
|
|
6790
|
-
|
|
7204
|
+
|
|
7205
|
+
// src/evaluation/evaluators/composite.ts
|
|
7206
|
+
var DEFAULT_COMPOSITE_AGGREGATOR_PROMPT = `Review the following evaluation results:
|
|
7207
|
+
{{EVALUATOR_RESULTS_JSON}}
|
|
7208
|
+
|
|
7209
|
+
Decide the final score and verdict based on all evaluator results.
|
|
7210
|
+
Return a JSON object with: score (0.0-1.0), verdict (pass/fail/borderline), and reasoning.`;
|
|
7211
|
+
var CompositeEvaluator = class {
|
|
7212
|
+
kind = "composite";
|
|
6791
7213
|
config;
|
|
7214
|
+
evaluatorFactory;
|
|
7215
|
+
cwd;
|
|
6792
7216
|
constructor(options) {
|
|
6793
|
-
this.script = options.script;
|
|
6794
|
-
this.cwd = options.cwd;
|
|
6795
|
-
this.agentTimeoutMs = options.agentTimeoutMs;
|
|
6796
7217
|
this.config = options.config;
|
|
7218
|
+
this.evaluatorFactory = options.evaluatorFactory;
|
|
7219
|
+
this.cwd = options.cwd;
|
|
6797
7220
|
}
|
|
6798
7221
|
async evaluate(context) {
|
|
6799
|
-
const
|
|
6800
|
-
|
|
6801
|
-
|
|
6802
|
-
|
|
6803
|
-
|
|
6804
|
-
|
|
6805
|
-
|
|
6806
|
-
|
|
6807
|
-
|
|
6808
|
-
|
|
6809
|
-
|
|
6810
|
-
|
|
6811
|
-
|
|
6812
|
-
|
|
7222
|
+
const memberResults = await Promise.all(
|
|
7223
|
+
this.config.evaluators.map(async (memberConfig) => {
|
|
7224
|
+
const evaluator = this.evaluatorFactory.create(memberConfig, context);
|
|
7225
|
+
return {
|
|
7226
|
+
id: memberConfig.name,
|
|
7227
|
+
type: memberConfig.type,
|
|
7228
|
+
result: await evaluator.evaluate(context)
|
|
7229
|
+
};
|
|
7230
|
+
})
|
|
7231
|
+
);
|
|
7232
|
+
return this.aggregate(memberResults, context);
|
|
7233
|
+
}
|
|
7234
|
+
async aggregate(results, context) {
|
|
7235
|
+
const aggregator = this.config.aggregator;
|
|
7236
|
+
switch (aggregator.type) {
|
|
7237
|
+
case "code_judge":
|
|
7238
|
+
return this.runCodeAggregator(results, aggregator.path, aggregator.cwd ?? this.cwd);
|
|
7239
|
+
case "llm_judge":
|
|
7240
|
+
return this.runLlmAggregator(results, context, aggregator);
|
|
7241
|
+
default:
|
|
7242
|
+
return this.runWeightedAverage(results, aggregator.weights);
|
|
7243
|
+
}
|
|
7244
|
+
}
|
|
7245
|
+
runWeightedAverage(results, weights) {
|
|
7246
|
+
let totalWeight = 0;
|
|
7247
|
+
let weightedSum = 0;
|
|
7248
|
+
const allHits = [];
|
|
7249
|
+
const allMisses = [];
|
|
7250
|
+
const reasoningParts = [];
|
|
7251
|
+
const evaluatorResults = [];
|
|
7252
|
+
for (const member of results) {
|
|
7253
|
+
const weight = weights?.[member.id] ?? 1;
|
|
7254
|
+
totalWeight += weight;
|
|
7255
|
+
weightedSum += member.result.score * weight;
|
|
7256
|
+
allHits.push(...member.result.hits.map((h) => `[${member.id}] ${h}`));
|
|
7257
|
+
allMisses.push(...member.result.misses.map((m) => `[${member.id}] ${m}`));
|
|
7258
|
+
if (member.result.reasoning) {
|
|
7259
|
+
reasoningParts.push(`${member.id}: ${member.result.reasoning}`);
|
|
7260
|
+
}
|
|
7261
|
+
evaluatorResults.push({
|
|
7262
|
+
name: member.id,
|
|
7263
|
+
type: member.type,
|
|
7264
|
+
score: member.result.score,
|
|
7265
|
+
weight,
|
|
7266
|
+
verdict: member.result.verdict,
|
|
7267
|
+
hits: [...member.result.hits],
|
|
7268
|
+
misses: [...member.result.misses],
|
|
7269
|
+
reasoning: member.result.reasoning,
|
|
7270
|
+
evaluatorRawRequest: member.result.evaluatorRawRequest,
|
|
7271
|
+
evaluatorResults: member.result.evaluatorResults,
|
|
7272
|
+
details: member.result.details
|
|
7273
|
+
});
|
|
7274
|
+
}
|
|
7275
|
+
const finalScore = totalWeight > 0 ? weightedSum / totalWeight : 0;
|
|
7276
|
+
return {
|
|
7277
|
+
score: clampScore(finalScore),
|
|
7278
|
+
verdict: scoreToVerdict(finalScore),
|
|
7279
|
+
hits: allHits,
|
|
7280
|
+
misses: allMisses,
|
|
7281
|
+
expectedAspectCount: Math.max(allHits.length + allMisses.length, 1),
|
|
7282
|
+
reasoning: reasoningParts.length > 0 ? reasoningParts.join("; ") : void 0,
|
|
7283
|
+
evaluatorRawRequest: {
|
|
7284
|
+
aggregator: "weighted_average",
|
|
7285
|
+
...weights ? { weights } : {}
|
|
7286
|
+
},
|
|
7287
|
+
evaluatorResults
|
|
6813
7288
|
};
|
|
6814
|
-
|
|
7289
|
+
}
|
|
7290
|
+
async runCodeAggregator(results, scriptPath, cwd, weights) {
|
|
7291
|
+
const resultsObject = Object.fromEntries(results.map((r) => [r.id, r.result]));
|
|
7292
|
+
const inputPayload = JSON.stringify({ results: resultsObject }, null, 2);
|
|
7293
|
+
const evaluatorResults = results.map((member) => ({
|
|
7294
|
+
name: member.id,
|
|
7295
|
+
type: member.type,
|
|
7296
|
+
score: member.result.score,
|
|
7297
|
+
weight: weights?.[member.id] ?? 1,
|
|
7298
|
+
verdict: member.result.verdict,
|
|
7299
|
+
hits: [...member.result.hits],
|
|
7300
|
+
misses: [...member.result.misses],
|
|
7301
|
+
reasoning: member.result.reasoning,
|
|
7302
|
+
evaluatorRawRequest: member.result.evaluatorRawRequest,
|
|
7303
|
+
evaluatorResults: member.result.evaluatorResults,
|
|
7304
|
+
details: member.result.details
|
|
7305
|
+
}));
|
|
6815
7306
|
try {
|
|
6816
|
-
const stdout = await executeScript(
|
|
7307
|
+
const stdout = await executeScript(scriptPath, inputPayload, void 0, cwd);
|
|
6817
7308
|
const parsed = parseJsonSafe(stdout);
|
|
6818
7309
|
const score = clampScore(typeof parsed?.score === "number" ? parsed.score : 0);
|
|
6819
7310
|
const hits = Array.isArray(parsed?.hits) ? parsed.hits.filter(isNonEmptyString) : [];
|
|
6820
7311
|
const misses = Array.isArray(parsed?.misses) ? parsed.misses.filter(isNonEmptyString) : [];
|
|
6821
7312
|
const reasoning = typeof parsed?.reasoning === "string" ? parsed.reasoning : void 0;
|
|
7313
|
+
const verdict = typeof parsed?.verdict === "string" && (parsed.verdict === "pass" || parsed.verdict === "fail" || parsed.verdict === "borderline") ? parsed.verdict : scoreToVerdict(score);
|
|
6822
7314
|
return {
|
|
6823
7315
|
score,
|
|
6824
|
-
verdict
|
|
7316
|
+
verdict,
|
|
6825
7317
|
hits,
|
|
6826
7318
|
misses,
|
|
6827
7319
|
expectedAspectCount: hits.length + misses.length || 1,
|
|
6828
7320
|
reasoning,
|
|
6829
7321
|
evaluatorRawRequest: {
|
|
6830
|
-
|
|
6831
|
-
|
|
6832
|
-
}
|
|
7322
|
+
aggregator: "code_judge",
|
|
7323
|
+
script: scriptPath
|
|
7324
|
+
},
|
|
7325
|
+
evaluatorResults
|
|
6833
7326
|
};
|
|
6834
7327
|
} catch (error) {
|
|
6835
7328
|
const message = error instanceof Error ? error.message : String(error);
|
|
@@ -6837,312 +7330,152 @@ var CodeEvaluator = class {
|
|
|
6837
7330
|
score: 0,
|
|
6838
7331
|
verdict: "fail",
|
|
6839
7332
|
hits: [],
|
|
6840
|
-
misses: [`Code
|
|
7333
|
+
misses: [`Code aggregator failed: ${message}`],
|
|
6841
7334
|
expectedAspectCount: 1,
|
|
6842
7335
|
reasoning: message,
|
|
6843
7336
|
evaluatorRawRequest: {
|
|
6844
|
-
|
|
6845
|
-
|
|
7337
|
+
aggregator: "code_judge",
|
|
7338
|
+
script: scriptPath,
|
|
6846
7339
|
error: message
|
|
6847
|
-
}
|
|
7340
|
+
},
|
|
7341
|
+
evaluatorResults
|
|
6848
7342
|
};
|
|
6849
7343
|
}
|
|
6850
7344
|
}
|
|
6851
|
-
|
|
6852
|
-
|
|
6853
|
-
|
|
6854
|
-
|
|
6855
|
-
const misses = [];
|
|
6856
|
-
let totalWeight = 0;
|
|
6857
|
-
let earnedWeight = 0;
|
|
6858
|
-
let failedRequired = false;
|
|
6859
|
-
for (const check of result.checks) {
|
|
6860
|
-
const rubric = rubricMap.get(check.id);
|
|
6861
|
-
if (!rubric) {
|
|
6862
|
-
continue;
|
|
7345
|
+
async runLlmAggregator(results, context, config) {
|
|
7346
|
+
const judgeProvider = context.judgeProvider;
|
|
7347
|
+
if (!judgeProvider) {
|
|
7348
|
+
throw new Error("No judge provider available for LLM aggregation");
|
|
6863
7349
|
}
|
|
6864
|
-
|
|
6865
|
-
|
|
6866
|
-
|
|
6867
|
-
|
|
6868
|
-
|
|
6869
|
-
|
|
6870
|
-
|
|
6871
|
-
|
|
7350
|
+
const resultsObject = Object.fromEntries(results.map((r) => [r.id, r.result]));
|
|
7351
|
+
const resultsJson = JSON.stringify(resultsObject, null, 2);
|
|
7352
|
+
const evaluatorResults = results.map((member) => ({
|
|
7353
|
+
name: member.id,
|
|
7354
|
+
type: member.type,
|
|
7355
|
+
score: member.result.score,
|
|
7356
|
+
verdict: member.result.verdict,
|
|
7357
|
+
hits: [...member.result.hits],
|
|
7358
|
+
misses: [...member.result.misses],
|
|
7359
|
+
reasoning: member.result.reasoning,
|
|
7360
|
+
evaluatorRawRequest: member.result.evaluatorRawRequest,
|
|
7361
|
+
evaluatorResults: member.result.evaluatorResults,
|
|
7362
|
+
details: member.result.details
|
|
7363
|
+
}));
|
|
7364
|
+
const promptTemplate = config.prompt ?? DEFAULT_COMPOSITE_AGGREGATOR_PROMPT;
|
|
7365
|
+
const userPrompt = promptTemplate.replace(/\{\{EVALUATOR_RESULTS_JSON\}\}/g, resultsJson);
|
|
7366
|
+
const systemPrompt = buildOutputSchema();
|
|
7367
|
+
const evaluatorRawRequest = {
|
|
7368
|
+
aggregator: "llm_judge",
|
|
7369
|
+
userPrompt,
|
|
7370
|
+
systemPrompt,
|
|
7371
|
+
target: judgeProvider.targetName
|
|
7372
|
+
};
|
|
7373
|
+
try {
|
|
7374
|
+
const model = judgeProvider.asLanguageModel?.();
|
|
7375
|
+
if (model) {
|
|
7376
|
+
const { text } = await (0, import_ai3.generateText)({
|
|
7377
|
+
model,
|
|
7378
|
+
system: systemPrompt,
|
|
7379
|
+
prompt: userPrompt
|
|
7380
|
+
});
|
|
7381
|
+
const data2 = freeformEvaluationSchema.parse(parseJsonFromText(text));
|
|
7382
|
+
const score2 = clampScore(data2.score);
|
|
7383
|
+
const hits2 = Array.isArray(data2.hits) ? data2.hits.filter(isNonEmptyString).slice(0, 4) : [];
|
|
7384
|
+
const misses2 = Array.isArray(data2.misses) ? data2.misses.filter(isNonEmptyString).slice(0, 4) : [];
|
|
7385
|
+
const reasoning2 = data2.reasoning;
|
|
7386
|
+
return {
|
|
7387
|
+
score: score2,
|
|
7388
|
+
verdict: scoreToVerdict(score2),
|
|
7389
|
+
hits: hits2,
|
|
7390
|
+
misses: misses2,
|
|
7391
|
+
expectedAspectCount: Math.max(hits2.length + misses2.length, 1),
|
|
7392
|
+
reasoning: reasoning2,
|
|
7393
|
+
evaluatorRawRequest,
|
|
7394
|
+
evaluatorResults
|
|
7395
|
+
};
|
|
6872
7396
|
}
|
|
7397
|
+
const response = await judgeProvider.invoke({
|
|
7398
|
+
question: userPrompt,
|
|
7399
|
+
systemPrompt,
|
|
7400
|
+
evalCaseId: context.evalCase.id,
|
|
7401
|
+
attempt: context.attempt
|
|
7402
|
+
});
|
|
7403
|
+
const data = freeformEvaluationSchema.parse(
|
|
7404
|
+
parseJsonFromText(extractLastAssistantContent2(response.outputMessages))
|
|
7405
|
+
);
|
|
7406
|
+
const score = clampScore(data.score);
|
|
7407
|
+
const hits = Array.isArray(data.hits) ? data.hits.filter(isNonEmptyString).slice(0, 4) : [];
|
|
7408
|
+
const misses = Array.isArray(data.misses) ? data.misses.filter(isNonEmptyString).slice(0, 4) : [];
|
|
7409
|
+
const reasoning = data.reasoning;
|
|
7410
|
+
return {
|
|
7411
|
+
score,
|
|
7412
|
+
verdict: scoreToVerdict(score),
|
|
7413
|
+
hits,
|
|
7414
|
+
misses,
|
|
7415
|
+
expectedAspectCount: Math.max(hits.length + misses.length, 1),
|
|
7416
|
+
reasoning,
|
|
7417
|
+
evaluatorRawRequest,
|
|
7418
|
+
evaluatorResults
|
|
7419
|
+
};
|
|
7420
|
+
} catch {
|
|
7421
|
+
return {
|
|
7422
|
+
score: 0,
|
|
7423
|
+
verdict: "fail",
|
|
7424
|
+
hits: [],
|
|
7425
|
+
misses: [],
|
|
7426
|
+
expectedAspectCount: 1,
|
|
7427
|
+
evaluatorRawRequest,
|
|
7428
|
+
evaluatorResults
|
|
7429
|
+
};
|
|
6873
7430
|
}
|
|
6874
7431
|
}
|
|
6875
|
-
|
|
6876
|
-
|
|
6877
|
-
|
|
6878
|
-
|
|
6879
|
-
|
|
6880
|
-
|
|
6881
|
-
|
|
6882
|
-
|
|
6883
|
-
throw new Error(
|
|
6884
|
-
trimmedErr.length > 0 ? `Code evaluator exited with code ${exitCode}: ${trimmedErr}` : `Code evaluator exited with code ${exitCode}`
|
|
6885
|
-
);
|
|
6886
|
-
}
|
|
6887
|
-
return stdout.trim();
|
|
6888
|
-
}
|
|
6889
|
-
function formatStderr(stderr) {
|
|
6890
|
-
const trimmed = stderr.trim();
|
|
6891
|
-
const maxLength = 2e3;
|
|
6892
|
-
if (trimmed.length <= maxLength) {
|
|
6893
|
-
return trimmed;
|
|
6894
|
-
}
|
|
6895
|
-
const tail = trimmed.slice(-maxLength);
|
|
6896
|
-
return `...(truncated, last ${maxLength} chars)
|
|
6897
|
-
${tail}`;
|
|
6898
|
-
}
|
|
6899
|
-
function parseJsonSafe(payload) {
|
|
6900
|
-
try {
|
|
6901
|
-
return JSON.parse(payload);
|
|
6902
|
-
} catch {
|
|
6903
|
-
return void 0;
|
|
6904
|
-
}
|
|
6905
|
-
}
|
|
6906
|
-
function substituteVariables(template, variables) {
|
|
6907
|
-
return template.replace(/\{\{\s*([a-zA-Z0-9_]+)\s*\}\}/g, (match, varName) => {
|
|
6908
|
-
return variables[varName] ?? match;
|
|
6909
|
-
});
|
|
6910
|
-
}
|
|
6911
|
-
function deepEqual(a, b) {
|
|
6912
|
-
if (a === b) return true;
|
|
6913
|
-
if (a === null || b === null) return a === b;
|
|
6914
|
-
if (typeof a !== typeof b) return false;
|
|
6915
|
-
if (typeof a !== "object") return a === b;
|
|
6916
|
-
if (Array.isArray(a) !== Array.isArray(b)) return false;
|
|
6917
|
-
if (Array.isArray(a) && Array.isArray(b)) {
|
|
6918
|
-
if (a.length !== b.length) return false;
|
|
6919
|
-
return a.every((val, i) => deepEqual(val, b[i]));
|
|
6920
|
-
}
|
|
6921
|
-
const aObj = a;
|
|
6922
|
-
const bObj = b;
|
|
6923
|
-
const aKeys = Object.keys(aObj);
|
|
6924
|
-
const bKeys = Object.keys(bObj);
|
|
6925
|
-
if (aKeys.length !== bKeys.length) return false;
|
|
6926
|
-
return aKeys.every((key) => Object.hasOwn(bObj, key) && deepEqual(aObj[key], bObj[key]));
|
|
6927
|
-
}
|
|
6928
|
-
function argsMatch(expected, actual) {
|
|
6929
|
-
if (expected === void 0) return true;
|
|
6930
|
-
if (expected === "any") return true;
|
|
6931
|
-
if (actual === void 0) return false;
|
|
6932
|
-
for (const key of Object.keys(expected)) {
|
|
6933
|
-
if (!Object.hasOwn(actual, key)) return false;
|
|
6934
|
-
if (!deepEqual(expected[key], actual[key])) return false;
|
|
6935
|
-
}
|
|
6936
|
-
return true;
|
|
6937
|
-
}
|
|
6938
|
-
var ToolTrajectoryEvaluator = class {
|
|
6939
|
-
kind = "tool_trajectory";
|
|
6940
|
-
config;
|
|
6941
|
-
constructor(options) {
|
|
6942
|
-
this.config = options.config;
|
|
7432
|
+
};
|
|
7433
|
+
|
|
7434
|
+
// src/evaluation/evaluators/cost.ts
|
|
7435
|
+
var CostEvaluator = class {
|
|
7436
|
+
kind = "cost";
|
|
7437
|
+
config;
|
|
7438
|
+
constructor(options) {
|
|
7439
|
+
this.config = options.config;
|
|
6943
7440
|
}
|
|
6944
7441
|
evaluate(context) {
|
|
6945
|
-
const {
|
|
6946
|
-
const
|
|
6947
|
-
if (
|
|
6948
|
-
return {
|
|
6949
|
-
score: 0,
|
|
6950
|
-
verdict: "fail",
|
|
6951
|
-
hits: [],
|
|
6952
|
-
misses: ["No trace available for evaluation"],
|
|
6953
|
-
expectedAspectCount: 1
|
|
6954
|
-
};
|
|
6955
|
-
}
|
|
6956
|
-
const summary = toolCalls.length > 0 ? this.buildSummary(toolCalls) : traceSummary;
|
|
6957
|
-
if (!summary) {
|
|
7442
|
+
const { budget } = this.config;
|
|
7443
|
+
const costUsd = context.traceSummary?.costUsd;
|
|
7444
|
+
if (costUsd === void 0) {
|
|
6958
7445
|
return {
|
|
6959
7446
|
score: 0,
|
|
6960
7447
|
verdict: "fail",
|
|
6961
7448
|
hits: [],
|
|
6962
|
-
misses: ["No
|
|
6963
|
-
expectedAspectCount: 1
|
|
6964
|
-
|
|
6965
|
-
|
|
6966
|
-
|
|
6967
|
-
|
|
6968
|
-
|
|
6969
|
-
case "in_order":
|
|
6970
|
-
return this.evaluateInOrder(toolCalls);
|
|
6971
|
-
case "exact":
|
|
6972
|
-
return this.evaluateExact(toolCalls);
|
|
6973
|
-
default:
|
|
6974
|
-
return {
|
|
6975
|
-
score: 0,
|
|
6976
|
-
verdict: "fail",
|
|
6977
|
-
hits: [],
|
|
6978
|
-
misses: [`Unknown mode: ${this.config.mode}`],
|
|
6979
|
-
expectedAspectCount: 1
|
|
6980
|
-
};
|
|
6981
|
-
}
|
|
6982
|
-
}
|
|
6983
|
-
/**
|
|
6984
|
-
* Extract tool calls from output messages.
|
|
6985
|
-
*/
|
|
6986
|
-
extractToolCallsFromMessages(messages) {
|
|
6987
|
-
if (!messages) {
|
|
6988
|
-
return [];
|
|
6989
|
-
}
|
|
6990
|
-
const toolCalls = [];
|
|
6991
|
-
for (const message of messages) {
|
|
6992
|
-
if (message.toolCalls) {
|
|
6993
|
-
for (const call of message.toolCalls) {
|
|
6994
|
-
toolCalls.push({
|
|
6995
|
-
name: call.tool,
|
|
6996
|
-
args: call.input
|
|
6997
|
-
});
|
|
7449
|
+
misses: ["No cost data available in trace"],
|
|
7450
|
+
expectedAspectCount: 1,
|
|
7451
|
+
reasoning: "Execution cost not reported by provider",
|
|
7452
|
+
evaluatorRawRequest: {
|
|
7453
|
+
type: "cost",
|
|
7454
|
+
budget,
|
|
7455
|
+
costUsd: null
|
|
6998
7456
|
}
|
|
6999
|
-
}
|
|
7000
|
-
}
|
|
7001
|
-
return toolCalls;
|
|
7002
|
-
}
|
|
7003
|
-
/**
|
|
7004
|
-
* Build a summary from extracted tool calls.
|
|
7005
|
-
*/
|
|
7006
|
-
buildSummary(toolCalls) {
|
|
7007
|
-
const toolCallsByName = {};
|
|
7008
|
-
for (const call of toolCalls) {
|
|
7009
|
-
toolCallsByName[call.name] = (toolCallsByName[call.name] ?? 0) + 1;
|
|
7010
|
-
}
|
|
7011
|
-
const toolNames = Object.keys(toolCallsByName).sort();
|
|
7012
|
-
return {
|
|
7013
|
-
eventCount: toolCalls.length,
|
|
7014
|
-
toolNames,
|
|
7015
|
-
toolCallsByName,
|
|
7016
|
-
errorCount: 0
|
|
7017
|
-
};
|
|
7018
|
-
}
|
|
7019
|
-
evaluateAnyOrder(summary) {
|
|
7020
|
-
const minimums = this.config.minimums ?? {};
|
|
7021
|
-
const toolNames = Object.keys(minimums);
|
|
7022
|
-
if (toolNames.length === 0) {
|
|
7023
|
-
return {
|
|
7024
|
-
score: 1,
|
|
7025
|
-
verdict: "pass",
|
|
7026
|
-
hits: ["No tool requirements specified"],
|
|
7027
|
-
misses: [],
|
|
7028
|
-
expectedAspectCount: 0
|
|
7029
|
-
};
|
|
7030
|
-
}
|
|
7031
|
-
const hits = [];
|
|
7032
|
-
const misses = [];
|
|
7033
|
-
for (const toolName of toolNames) {
|
|
7034
|
-
const required = minimums[toolName];
|
|
7035
|
-
const actual = summary.toolCallsByName[toolName] ?? 0;
|
|
7036
|
-
if (actual >= required) {
|
|
7037
|
-
hits.push(`${toolName}: called ${actual} times (required \u2265${required})`);
|
|
7038
|
-
} else {
|
|
7039
|
-
misses.push(`${toolName}: called ${actual} times (required \u2265${required})`);
|
|
7040
|
-
}
|
|
7041
|
-
}
|
|
7042
|
-
const score = hits.length / toolNames.length;
|
|
7043
|
-
return {
|
|
7044
|
-
score,
|
|
7045
|
-
verdict: scoreToVerdict(score),
|
|
7046
|
-
hits,
|
|
7047
|
-
misses,
|
|
7048
|
-
expectedAspectCount: toolNames.length
|
|
7049
|
-
};
|
|
7050
|
-
}
|
|
7051
|
-
evaluateInOrder(toolCalls) {
|
|
7052
|
-
const expected = this.config.expected ?? [];
|
|
7053
|
-
if (expected.length === 0) {
|
|
7054
|
-
return {
|
|
7055
|
-
score: 1,
|
|
7056
|
-
verdict: "pass",
|
|
7057
|
-
hits: ["No tool sequence specified"],
|
|
7058
|
-
misses: [],
|
|
7059
|
-
expectedAspectCount: 0
|
|
7060
7457
|
};
|
|
7061
7458
|
}
|
|
7062
|
-
const
|
|
7063
|
-
const
|
|
7064
|
-
|
|
7065
|
-
for (let i = 0; i < expected.length; i++) {
|
|
7066
|
-
const expectedItem = expected[i];
|
|
7067
|
-
const expectedTool = expectedItem.tool;
|
|
7068
|
-
let found = false;
|
|
7069
|
-
let argsMismatch = false;
|
|
7070
|
-
while (actualIndex < toolCalls.length) {
|
|
7071
|
-
const actualCall = toolCalls[actualIndex];
|
|
7072
|
-
if (actualCall.name === expectedTool) {
|
|
7073
|
-
if (argsMatch(expectedItem.args, actualCall.args)) {
|
|
7074
|
-
hits.push(`Found ${expectedTool} at position ${actualIndex}`);
|
|
7075
|
-
actualIndex++;
|
|
7076
|
-
found = true;
|
|
7077
|
-
break;
|
|
7078
|
-
}
|
|
7079
|
-
misses.push(
|
|
7080
|
-
`Expected ${expectedTool} at position ${i}: tool found at ${actualIndex} but args mismatch`
|
|
7081
|
-
);
|
|
7082
|
-
actualIndex++;
|
|
7083
|
-
argsMismatch = true;
|
|
7084
|
-
break;
|
|
7085
|
-
}
|
|
7086
|
-
actualIndex++;
|
|
7087
|
-
}
|
|
7088
|
-
if (!found && !argsMismatch) {
|
|
7089
|
-
misses.push(`Expected ${expectedTool} at position ${i}, not found in remaining trace`);
|
|
7090
|
-
}
|
|
7091
|
-
}
|
|
7092
|
-
const score = hits.length / expected.length;
|
|
7459
|
+
const passed = costUsd <= budget;
|
|
7460
|
+
const score = passed ? 1 : 0;
|
|
7461
|
+
const formatCost = (n) => `$${n.toFixed(4)}`;
|
|
7093
7462
|
return {
|
|
7094
7463
|
score,
|
|
7095
|
-
verdict:
|
|
7096
|
-
hits,
|
|
7097
|
-
misses,
|
|
7098
|
-
expectedAspectCount:
|
|
7099
|
-
|
|
7100
|
-
|
|
7101
|
-
|
|
7102
|
-
|
|
7103
|
-
|
|
7104
|
-
return {
|
|
7105
|
-
score: 1,
|
|
7106
|
-
verdict: "pass",
|
|
7107
|
-
hits: ["No tool sequence specified"],
|
|
7108
|
-
misses: [],
|
|
7109
|
-
expectedAspectCount: 0
|
|
7110
|
-
};
|
|
7111
|
-
}
|
|
7112
|
-
const hits = [];
|
|
7113
|
-
const misses = [];
|
|
7114
|
-
if (toolCalls.length !== expected.length) {
|
|
7115
|
-
misses.push(`Expected ${expected.length} tool calls, got ${toolCalls.length}`);
|
|
7116
|
-
}
|
|
7117
|
-
const checkLength = Math.min(expected.length, toolCalls.length);
|
|
7118
|
-
for (let i = 0; i < checkLength; i++) {
|
|
7119
|
-
const expectedItem = expected[i];
|
|
7120
|
-
const expectedTool = expectedItem.tool;
|
|
7121
|
-
const actualCall = toolCalls[i];
|
|
7122
|
-
const actualTool = actualCall.name;
|
|
7123
|
-
if (actualTool === expectedTool) {
|
|
7124
|
-
if (argsMatch(expectedItem.args, actualCall.args)) {
|
|
7125
|
-
hits.push(`Position ${i}: ${expectedTool}`);
|
|
7126
|
-
} else {
|
|
7127
|
-
misses.push(`Position ${i}: ${expectedTool} args mismatch`);
|
|
7128
|
-
}
|
|
7129
|
-
} else {
|
|
7130
|
-
misses.push(`Position ${i}: expected ${expectedTool}, got ${actualTool}`);
|
|
7464
|
+
verdict: passed ? "pass" : "fail",
|
|
7465
|
+
hits: passed ? [`Cost ${formatCost(costUsd)} <= ${formatCost(budget)} budget`] : [],
|
|
7466
|
+
misses: passed ? [] : [`Cost ${formatCost(costUsd)} > ${formatCost(budget)} budget`],
|
|
7467
|
+
expectedAspectCount: 1,
|
|
7468
|
+
reasoning: `Execution cost ${formatCost(costUsd)} (budget: ${formatCost(budget)})`,
|
|
7469
|
+
evaluatorRawRequest: {
|
|
7470
|
+
type: "cost",
|
|
7471
|
+
budget,
|
|
7472
|
+
costUsd
|
|
7131
7473
|
}
|
|
7132
|
-
}
|
|
7133
|
-
for (let i = checkLength; i < expected.length; i++) {
|
|
7134
|
-
misses.push(`Position ${i}: expected ${expected[i].tool}, got nothing`);
|
|
7135
|
-
}
|
|
7136
|
-
const score = hits.length / expected.length;
|
|
7137
|
-
return {
|
|
7138
|
-
score,
|
|
7139
|
-
verdict: scoreToVerdict(score),
|
|
7140
|
-
hits,
|
|
7141
|
-
misses,
|
|
7142
|
-
expectedAspectCount: expected.length
|
|
7143
7474
|
};
|
|
7144
7475
|
}
|
|
7145
7476
|
};
|
|
7477
|
+
|
|
7478
|
+
// src/evaluation/evaluators/field-accuracy.ts
|
|
7146
7479
|
var DEFAULT_DATE_FORMATS = [
|
|
7147
7480
|
"YYYY-MM-DDTHH:mm:ssZ",
|
|
7148
7481
|
// ISO with timezone
|
|
@@ -7353,436 +7686,211 @@ var FieldAccuracyEvaluator = class {
|
|
|
7353
7686
|
message: `${path17} (non-numeric value)`
|
|
7354
7687
|
};
|
|
7355
7688
|
}
|
|
7356
|
-
if (!Number.isFinite(candidateNum) || !Number.isFinite(expectedNum)) {
|
|
7357
|
-
return {
|
|
7358
|
-
path: path17,
|
|
7359
|
-
score: 0,
|
|
7360
|
-
weight,
|
|
7361
|
-
hit: false,
|
|
7362
|
-
message: `${path17} (invalid numeric value)`
|
|
7363
|
-
};
|
|
7364
|
-
}
|
|
7365
|
-
const diff = Math.abs(candidateNum - expectedNum);
|
|
7366
|
-
let withinTolerance;
|
|
7367
|
-
if (relative) {
|
|
7368
|
-
const relativeDiff = expectedNum === 0 ? diff : diff / Math.abs(expectedNum);
|
|
7369
|
-
withinTolerance = relativeDiff <= tolerance;
|
|
7370
|
-
} else {
|
|
7371
|
-
withinTolerance = diff <= tolerance;
|
|
7372
|
-
}
|
|
7373
|
-
if (withinTolerance) {
|
|
7374
|
-
return {
|
|
7375
|
-
path: path17,
|
|
7376
|
-
score: 1,
|
|
7377
|
-
weight,
|
|
7378
|
-
hit: true,
|
|
7379
|
-
message: `${path17} (within tolerance: diff=${diff.toFixed(2)})`
|
|
7380
|
-
};
|
|
7381
|
-
}
|
|
7382
|
-
return {
|
|
7383
|
-
path: path17,
|
|
7384
|
-
score: 0,
|
|
7385
|
-
weight,
|
|
7386
|
-
hit: false,
|
|
7387
|
-
message: `${path17} (outside tolerance: diff=${diff.toFixed(2)}, tolerance=${tolerance})`
|
|
7388
|
-
};
|
|
7389
|
-
}
|
|
7390
|
-
/**
|
|
7391
|
-
* Date comparison with format normalization.
|
|
7392
|
-
*/
|
|
7393
|
-
compareDate(path17, candidateValue, expectedValue, fieldConfig, weight) {
|
|
7394
|
-
const formats = fieldConfig.formats ?? DEFAULT_DATE_FORMATS;
|
|
7395
|
-
const candidateDate = parseDate(String(candidateValue), formats);
|
|
7396
|
-
const expectedDate = parseDate(String(expectedValue), formats);
|
|
7397
|
-
if (candidateDate === null) {
|
|
7398
|
-
return {
|
|
7399
|
-
path: path17,
|
|
7400
|
-
score: 0,
|
|
7401
|
-
weight,
|
|
7402
|
-
hit: false,
|
|
7403
|
-
message: `${path17} (unparseable candidate date)`
|
|
7404
|
-
};
|
|
7405
|
-
}
|
|
7406
|
-
if (expectedDate === null) {
|
|
7407
|
-
return {
|
|
7408
|
-
path: path17,
|
|
7409
|
-
score: 0,
|
|
7410
|
-
weight,
|
|
7411
|
-
hit: false,
|
|
7412
|
-
message: `${path17} (unparseable expected date)`
|
|
7413
|
-
};
|
|
7414
|
-
}
|
|
7415
|
-
if (candidateDate.getFullYear() === expectedDate.getFullYear() && candidateDate.getMonth() === expectedDate.getMonth() && candidateDate.getDate() === expectedDate.getDate()) {
|
|
7416
|
-
return {
|
|
7417
|
-
path: path17,
|
|
7418
|
-
score: 1,
|
|
7419
|
-
weight,
|
|
7420
|
-
hit: true,
|
|
7421
|
-
message: path17
|
|
7422
|
-
};
|
|
7423
|
-
}
|
|
7424
|
-
return {
|
|
7425
|
-
path: path17,
|
|
7426
|
-
score: 0,
|
|
7427
|
-
weight,
|
|
7428
|
-
hit: false,
|
|
7429
|
-
message: `${path17} (date mismatch: got ${formatDateISO(candidateDate)}, expected ${formatDateISO(expectedDate)})`
|
|
7430
|
-
};
|
|
7431
|
-
}
|
|
7432
|
-
/**
|
|
7433
|
-
* Aggregate field results using configured strategy.
|
|
7434
|
-
*/
|
|
7435
|
-
aggregateResults(results) {
|
|
7436
|
-
const aggregation = this.config.aggregation ?? "weighted_average";
|
|
7437
|
-
const hits = [];
|
|
7438
|
-
const misses = [];
|
|
7439
|
-
for (const result of results) {
|
|
7440
|
-
if (result.hit) {
|
|
7441
|
-
hits.push(result.message);
|
|
7442
|
-
} else {
|
|
7443
|
-
misses.push(result.message);
|
|
7444
|
-
}
|
|
7445
|
-
}
|
|
7446
|
-
let score;
|
|
7447
|
-
if (aggregation === "all_or_nothing") {
|
|
7448
|
-
score = misses.length === 0 ? 1 : 0;
|
|
7449
|
-
} else {
|
|
7450
|
-
const totalWeight = results.reduce((sum, r) => sum + r.weight, 0);
|
|
7451
|
-
if (totalWeight === 0) {
|
|
7452
|
-
score = results.length === 0 ? 1 : 0;
|
|
7453
|
-
} else {
|
|
7454
|
-
const weightedSum = results.reduce((sum, r) => sum + r.score * r.weight, 0);
|
|
7455
|
-
score = weightedSum / totalWeight;
|
|
7456
|
-
}
|
|
7457
|
-
}
|
|
7458
|
-
const reasoning = `${hits.length}/${results.length} fields matched`;
|
|
7459
|
-
return {
|
|
7460
|
-
score: clampScore(score),
|
|
7461
|
-
verdict: scoreToVerdict(score),
|
|
7462
|
-
hits: hits.slice(0, 4),
|
|
7463
|
-
misses: misses.slice(0, 4),
|
|
7464
|
-
expectedAspectCount: results.length,
|
|
7465
|
-
reasoning
|
|
7466
|
-
};
|
|
7467
|
-
}
|
|
7468
|
-
};
|
|
7469
|
-
function resolvePath(obj, path17) {
|
|
7470
|
-
if (!path17 || !obj) {
|
|
7471
|
-
return void 0;
|
|
7472
|
-
}
|
|
7473
|
-
const parts = path17.split(/\.|\[|\]/).filter((p) => p.length > 0);
|
|
7474
|
-
let current = obj;
|
|
7475
|
-
for (const part of parts) {
|
|
7476
|
-
if (current === null || current === void 0) {
|
|
7477
|
-
return void 0;
|
|
7478
|
-
}
|
|
7479
|
-
if (typeof current !== "object") {
|
|
7480
|
-
return void 0;
|
|
7481
|
-
}
|
|
7482
|
-
const isIndex = /^\d+$/.test(part);
|
|
7483
|
-
if (isIndex && Array.isArray(current)) {
|
|
7484
|
-
current = current[Number.parseInt(part, 10)];
|
|
7485
|
-
} else {
|
|
7486
|
-
current = current[part];
|
|
7487
|
-
}
|
|
7488
|
-
}
|
|
7489
|
-
return current;
|
|
7490
|
-
}
|
|
7491
|
-
function toNumber(value) {
|
|
7492
|
-
if (typeof value === "number") {
|
|
7493
|
-
return value;
|
|
7494
|
-
}
|
|
7495
|
-
if (typeof value === "string") {
|
|
7496
|
-
const num = Number.parseFloat(value);
|
|
7497
|
-
return Number.isNaN(num) ? null : num;
|
|
7498
|
-
}
|
|
7499
|
-
return null;
|
|
7500
|
-
}
|
|
7501
|
-
function parseDate(dateStr, formats) {
|
|
7502
|
-
if (!dateStr) return null;
|
|
7503
|
-
const trimmed = dateStr.trim();
|
|
7504
|
-
const isoDate = new Date(trimmed);
|
|
7505
|
-
if (!Number.isNaN(isoDate.getTime())) {
|
|
7506
|
-
return isoDate;
|
|
7507
|
-
}
|
|
7508
|
-
const localizedMatch = trimmed.match(/^(\d{1,2})-([A-Za-z]{3,9})-(\d{4})$/);
|
|
7509
|
-
if (localizedMatch) {
|
|
7510
|
-
const day = Number.parseInt(localizedMatch[1], 10);
|
|
7511
|
-
const monthName = localizedMatch[2].toLowerCase();
|
|
7512
|
-
const year = Number.parseInt(localizedMatch[3], 10);
|
|
7513
|
-
const month = MONTH_NAMES[monthName];
|
|
7514
|
-
if (month !== void 0) {
|
|
7515
|
-
return new Date(year, month, day);
|
|
7516
|
-
}
|
|
7517
|
-
}
|
|
7518
|
-
const usMatch = trimmed.match(/^(\d{1,2})[\/\-](\d{1,2})[\/\-](\d{4})$/);
|
|
7519
|
-
if (usMatch) {
|
|
7520
|
-
const hasUSFormat = formats.some((f) => f.includes("MM/DD") || f.includes("MM-DD"));
|
|
7521
|
-
const hasEUFormat = formats.some((f) => f.includes("DD/MM") || f.includes("DD-MM"));
|
|
7522
|
-
if (hasUSFormat && !hasEUFormat) {
|
|
7523
|
-
const month = Number.parseInt(usMatch[1], 10) - 1;
|
|
7524
|
-
const day = Number.parseInt(usMatch[2], 10);
|
|
7525
|
-
const year = Number.parseInt(usMatch[3], 10);
|
|
7526
|
-
if (month >= 0 && month <= 11 && day >= 1 && day <= 31) {
|
|
7527
|
-
return new Date(year, month, day);
|
|
7528
|
-
}
|
|
7529
|
-
} else if (hasEUFormat && !hasUSFormat) {
|
|
7530
|
-
const day = Number.parseInt(usMatch[1], 10);
|
|
7531
|
-
const month = Number.parseInt(usMatch[2], 10) - 1;
|
|
7532
|
-
const year = Number.parseInt(usMatch[3], 10);
|
|
7533
|
-
if (month >= 0 && month <= 11 && day >= 1 && day <= 31) {
|
|
7534
|
-
return new Date(year, month, day);
|
|
7535
|
-
}
|
|
7536
|
-
} else {
|
|
7537
|
-
const num1 = Number.parseInt(usMatch[1], 10);
|
|
7538
|
-
const num2 = Number.parseInt(usMatch[2], 10);
|
|
7539
|
-
const year = Number.parseInt(usMatch[3], 10);
|
|
7540
|
-
if (num1 > 12 && num2 <= 12) {
|
|
7541
|
-
return new Date(year, num2 - 1, num1);
|
|
7542
|
-
}
|
|
7543
|
-
if (num2 > 12 && num1 <= 12) {
|
|
7544
|
-
return new Date(year, num1 - 1, num2);
|
|
7545
|
-
}
|
|
7546
|
-
if (num1 <= 12 && num2 <= 31) {
|
|
7547
|
-
return new Date(year, num1 - 1, num2);
|
|
7548
|
-
}
|
|
7549
|
-
}
|
|
7550
|
-
}
|
|
7551
|
-
return null;
|
|
7552
|
-
}
|
|
7553
|
-
function formatDateISO(date) {
|
|
7554
|
-
return date.toISOString().split("T")[0];
|
|
7555
|
-
}
|
|
7556
|
-
function parseJsonFromTextSafe(text) {
|
|
7557
|
-
const cleaned = typeof text === "string" ? text.replace(/```json\n?|```/g, "").trim() : "";
|
|
7558
|
-
const match = cleaned.match(/\{[\s\S]*\}/);
|
|
7559
|
-
const blob = match?.[0] ?? cleaned;
|
|
7560
|
-
return JSON.parse(blob);
|
|
7561
|
-
}
|
|
7562
|
-
var DEFAULT_COMPOSITE_AGGREGATOR_PROMPT = `Review the following evaluation results:
|
|
7563
|
-
{{EVALUATOR_RESULTS_JSON}}
|
|
7564
|
-
|
|
7565
|
-
Decide the final score and verdict based on all evaluator results.
|
|
7566
|
-
Return a JSON object with: score (0.0-1.0), verdict (pass/fail/borderline), and reasoning.`;
|
|
7567
|
-
var CompositeEvaluator = class {
|
|
7568
|
-
kind = "composite";
|
|
7569
|
-
config;
|
|
7570
|
-
evaluatorFactory;
|
|
7571
|
-
cwd;
|
|
7572
|
-
constructor(options) {
|
|
7573
|
-
this.config = options.config;
|
|
7574
|
-
this.evaluatorFactory = options.evaluatorFactory;
|
|
7575
|
-
this.cwd = options.cwd;
|
|
7576
|
-
}
|
|
7577
|
-
async evaluate(context) {
|
|
7578
|
-
const memberResults = await Promise.all(
|
|
7579
|
-
this.config.evaluators.map(async (memberConfig) => {
|
|
7580
|
-
const evaluator = this.evaluatorFactory.create(memberConfig, context);
|
|
7581
|
-
return {
|
|
7582
|
-
id: memberConfig.name,
|
|
7583
|
-
type: memberConfig.type,
|
|
7584
|
-
result: await evaluator.evaluate(context)
|
|
7585
|
-
};
|
|
7586
|
-
})
|
|
7587
|
-
);
|
|
7588
|
-
return this.aggregate(memberResults, context);
|
|
7589
|
-
}
|
|
7590
|
-
async aggregate(results, context) {
|
|
7591
|
-
const aggregator = this.config.aggregator;
|
|
7592
|
-
switch (aggregator.type) {
|
|
7593
|
-
case "code_judge":
|
|
7594
|
-
return this.runCodeAggregator(results, aggregator.path, aggregator.cwd ?? this.cwd);
|
|
7595
|
-
case "llm_judge":
|
|
7596
|
-
return this.runLlmAggregator(results, context, aggregator);
|
|
7597
|
-
default:
|
|
7598
|
-
return this.runWeightedAverage(results, aggregator.weights);
|
|
7599
|
-
}
|
|
7600
|
-
}
|
|
7601
|
-
runWeightedAverage(results, weights) {
|
|
7602
|
-
let totalWeight = 0;
|
|
7603
|
-
let weightedSum = 0;
|
|
7604
|
-
const allHits = [];
|
|
7605
|
-
const allMisses = [];
|
|
7606
|
-
const reasoningParts = [];
|
|
7607
|
-
const evaluatorResults = [];
|
|
7608
|
-
for (const member of results) {
|
|
7609
|
-
const weight = weights?.[member.id] ?? 1;
|
|
7610
|
-
totalWeight += weight;
|
|
7611
|
-
weightedSum += member.result.score * weight;
|
|
7612
|
-
allHits.push(...member.result.hits.map((h) => `[${member.id}] ${h}`));
|
|
7613
|
-
allMisses.push(...member.result.misses.map((m) => `[${member.id}] ${m}`));
|
|
7614
|
-
if (member.result.reasoning) {
|
|
7615
|
-
reasoningParts.push(`${member.id}: ${member.result.reasoning}`);
|
|
7616
|
-
}
|
|
7617
|
-
evaluatorResults.push({
|
|
7618
|
-
name: member.id,
|
|
7619
|
-
type: member.type,
|
|
7620
|
-
score: member.result.score,
|
|
7621
|
-
weight,
|
|
7622
|
-
verdict: member.result.verdict,
|
|
7623
|
-
hits: [...member.result.hits],
|
|
7624
|
-
misses: [...member.result.misses],
|
|
7625
|
-
reasoning: member.result.reasoning,
|
|
7626
|
-
evaluatorRawRequest: member.result.evaluatorRawRequest,
|
|
7627
|
-
evaluatorResults: member.result.evaluatorResults
|
|
7628
|
-
});
|
|
7629
|
-
}
|
|
7630
|
-
const finalScore = totalWeight > 0 ? weightedSum / totalWeight : 0;
|
|
7631
|
-
return {
|
|
7632
|
-
score: clampScore(finalScore),
|
|
7633
|
-
verdict: scoreToVerdict(finalScore),
|
|
7634
|
-
hits: allHits,
|
|
7635
|
-
misses: allMisses,
|
|
7636
|
-
expectedAspectCount: Math.max(allHits.length + allMisses.length, 1),
|
|
7637
|
-
reasoning: reasoningParts.length > 0 ? reasoningParts.join("; ") : void 0,
|
|
7638
|
-
evaluatorRawRequest: {
|
|
7639
|
-
aggregator: "weighted_average",
|
|
7640
|
-
...weights ? { weights } : {}
|
|
7641
|
-
},
|
|
7642
|
-
evaluatorResults
|
|
7643
|
-
};
|
|
7644
|
-
}
|
|
7645
|
-
async runCodeAggregator(results, scriptPath, cwd, weights) {
|
|
7646
|
-
const resultsObject = Object.fromEntries(results.map((r) => [r.id, r.result]));
|
|
7647
|
-
const inputPayload = JSON.stringify({ results: resultsObject }, null, 2);
|
|
7648
|
-
const evaluatorResults = results.map((member) => ({
|
|
7649
|
-
name: member.id,
|
|
7650
|
-
type: member.type,
|
|
7651
|
-
score: member.result.score,
|
|
7652
|
-
weight: weights?.[member.id] ?? 1,
|
|
7653
|
-
verdict: member.result.verdict,
|
|
7654
|
-
hits: [...member.result.hits],
|
|
7655
|
-
misses: [...member.result.misses],
|
|
7656
|
-
reasoning: member.result.reasoning,
|
|
7657
|
-
evaluatorRawRequest: member.result.evaluatorRawRequest,
|
|
7658
|
-
evaluatorResults: member.result.evaluatorResults
|
|
7659
|
-
}));
|
|
7660
|
-
try {
|
|
7661
|
-
const stdout = await executeScript(scriptPath, inputPayload, void 0, cwd);
|
|
7662
|
-
const parsed = parseJsonSafe(stdout);
|
|
7663
|
-
const score = clampScore(typeof parsed?.score === "number" ? parsed.score : 0);
|
|
7664
|
-
const hits = Array.isArray(parsed?.hits) ? parsed.hits.filter(isNonEmptyString) : [];
|
|
7665
|
-
const misses = Array.isArray(parsed?.misses) ? parsed.misses.filter(isNonEmptyString) : [];
|
|
7666
|
-
const reasoning = typeof parsed?.reasoning === "string" ? parsed.reasoning : void 0;
|
|
7667
|
-
const verdict = typeof parsed?.verdict === "string" && (parsed.verdict === "pass" || parsed.verdict === "fail" || parsed.verdict === "borderline") ? parsed.verdict : scoreToVerdict(score);
|
|
7668
|
-
return {
|
|
7669
|
-
score,
|
|
7670
|
-
verdict,
|
|
7671
|
-
hits,
|
|
7672
|
-
misses,
|
|
7673
|
-
expectedAspectCount: hits.length + misses.length || 1,
|
|
7674
|
-
reasoning,
|
|
7675
|
-
evaluatorRawRequest: {
|
|
7676
|
-
aggregator: "code_judge",
|
|
7677
|
-
script: scriptPath
|
|
7678
|
-
},
|
|
7679
|
-
evaluatorResults
|
|
7680
|
-
};
|
|
7681
|
-
} catch (error) {
|
|
7682
|
-
const message = error instanceof Error ? error.message : String(error);
|
|
7683
|
-
return {
|
|
7684
|
-
score: 0,
|
|
7685
|
-
verdict: "fail",
|
|
7686
|
-
hits: [],
|
|
7687
|
-
misses: [`Code aggregator failed: ${message}`],
|
|
7688
|
-
expectedAspectCount: 1,
|
|
7689
|
-
reasoning: message,
|
|
7690
|
-
evaluatorRawRequest: {
|
|
7691
|
-
aggregator: "code_judge",
|
|
7692
|
-
script: scriptPath,
|
|
7693
|
-
error: message
|
|
7694
|
-
},
|
|
7695
|
-
evaluatorResults
|
|
7696
|
-
};
|
|
7697
|
-
}
|
|
7698
|
-
}
|
|
7699
|
-
async runLlmAggregator(results, context, config) {
|
|
7700
|
-
const judgeProvider = context.judgeProvider;
|
|
7701
|
-
if (!judgeProvider) {
|
|
7702
|
-
throw new Error("No judge provider available for LLM aggregation");
|
|
7703
|
-
}
|
|
7704
|
-
const resultsObject = Object.fromEntries(results.map((r) => [r.id, r.result]));
|
|
7705
|
-
const resultsJson = JSON.stringify(resultsObject, null, 2);
|
|
7706
|
-
const evaluatorResults = results.map((member) => ({
|
|
7707
|
-
name: member.id,
|
|
7708
|
-
type: member.type,
|
|
7709
|
-
score: member.result.score,
|
|
7710
|
-
verdict: member.result.verdict,
|
|
7711
|
-
hits: [...member.result.hits],
|
|
7712
|
-
misses: [...member.result.misses],
|
|
7713
|
-
reasoning: member.result.reasoning,
|
|
7714
|
-
evaluatorRawRequest: member.result.evaluatorRawRequest,
|
|
7715
|
-
evaluatorResults: member.result.evaluatorResults
|
|
7716
|
-
}));
|
|
7717
|
-
const promptTemplate = config.prompt ?? DEFAULT_COMPOSITE_AGGREGATOR_PROMPT;
|
|
7718
|
-
const userPrompt = promptTemplate.replace(/\{\{EVALUATOR_RESULTS_JSON\}\}/g, resultsJson);
|
|
7719
|
-
const systemPrompt = buildOutputSchema();
|
|
7720
|
-
const evaluatorRawRequest = {
|
|
7721
|
-
aggregator: "llm_judge",
|
|
7722
|
-
userPrompt,
|
|
7723
|
-
systemPrompt,
|
|
7724
|
-
target: judgeProvider.targetName
|
|
7725
|
-
};
|
|
7726
|
-
try {
|
|
7727
|
-
const model = judgeProvider.asLanguageModel?.();
|
|
7728
|
-
if (model) {
|
|
7729
|
-
const { text } = await (0, import_ai2.generateText)({
|
|
7730
|
-
model,
|
|
7731
|
-
system: systemPrompt,
|
|
7732
|
-
prompt: userPrompt
|
|
7733
|
-
});
|
|
7734
|
-
const data2 = freeformEvaluationSchema.parse(parseJsonFromText(text));
|
|
7735
|
-
const score2 = clampScore(data2.score);
|
|
7736
|
-
const hits2 = Array.isArray(data2.hits) ? data2.hits.filter(isNonEmptyString).slice(0, 4) : [];
|
|
7737
|
-
const misses2 = Array.isArray(data2.misses) ? data2.misses.filter(isNonEmptyString).slice(0, 4) : [];
|
|
7738
|
-
const reasoning2 = data2.reasoning;
|
|
7739
|
-
return {
|
|
7740
|
-
score: score2,
|
|
7741
|
-
verdict: scoreToVerdict(score2),
|
|
7742
|
-
hits: hits2,
|
|
7743
|
-
misses: misses2,
|
|
7744
|
-
expectedAspectCount: Math.max(hits2.length + misses2.length, 1),
|
|
7745
|
-
reasoning: reasoning2,
|
|
7746
|
-
evaluatorRawRequest,
|
|
7747
|
-
evaluatorResults
|
|
7748
|
-
};
|
|
7749
|
-
}
|
|
7750
|
-
const response = await judgeProvider.invoke({
|
|
7751
|
-
question: userPrompt,
|
|
7752
|
-
systemPrompt,
|
|
7753
|
-
evalCaseId: context.evalCase.id,
|
|
7754
|
-
attempt: context.attempt
|
|
7755
|
-
});
|
|
7756
|
-
const data = freeformEvaluationSchema.parse(
|
|
7757
|
-
parseJsonFromText(extractLastAssistantContent(response.outputMessages))
|
|
7758
|
-
);
|
|
7759
|
-
const score = clampScore(data.score);
|
|
7760
|
-
const hits = Array.isArray(data.hits) ? data.hits.filter(isNonEmptyString).slice(0, 4) : [];
|
|
7761
|
-
const misses = Array.isArray(data.misses) ? data.misses.filter(isNonEmptyString).slice(0, 4) : [];
|
|
7762
|
-
const reasoning = data.reasoning;
|
|
7689
|
+
if (!Number.isFinite(candidateNum) || !Number.isFinite(expectedNum)) {
|
|
7763
7690
|
return {
|
|
7764
|
-
|
|
7765
|
-
|
|
7766
|
-
|
|
7767
|
-
|
|
7768
|
-
|
|
7769
|
-
reasoning,
|
|
7770
|
-
evaluatorRawRequest,
|
|
7771
|
-
evaluatorResults
|
|
7691
|
+
path: path17,
|
|
7692
|
+
score: 0,
|
|
7693
|
+
weight,
|
|
7694
|
+
hit: false,
|
|
7695
|
+
message: `${path17} (invalid numeric value)`
|
|
7772
7696
|
};
|
|
7773
|
-
}
|
|
7697
|
+
}
|
|
7698
|
+
const diff = Math.abs(candidateNum - expectedNum);
|
|
7699
|
+
let withinTolerance;
|
|
7700
|
+
if (relative) {
|
|
7701
|
+
const relativeDiff = expectedNum === 0 ? diff : diff / Math.abs(expectedNum);
|
|
7702
|
+
withinTolerance = relativeDiff <= tolerance;
|
|
7703
|
+
} else {
|
|
7704
|
+
withinTolerance = diff <= tolerance;
|
|
7705
|
+
}
|
|
7706
|
+
if (withinTolerance) {
|
|
7707
|
+
return {
|
|
7708
|
+
path: path17,
|
|
7709
|
+
score: 1,
|
|
7710
|
+
weight,
|
|
7711
|
+
hit: true,
|
|
7712
|
+
message: `${path17} (within tolerance: diff=${diff.toFixed(2)})`
|
|
7713
|
+
};
|
|
7714
|
+
}
|
|
7715
|
+
return {
|
|
7716
|
+
path: path17,
|
|
7717
|
+
score: 0,
|
|
7718
|
+
weight,
|
|
7719
|
+
hit: false,
|
|
7720
|
+
message: `${path17} (outside tolerance: diff=${diff.toFixed(2)}, tolerance=${tolerance})`
|
|
7721
|
+
};
|
|
7722
|
+
}
|
|
7723
|
+
/**
|
|
7724
|
+
* Date comparison with format normalization.
|
|
7725
|
+
*/
|
|
7726
|
+
compareDate(path17, candidateValue, expectedValue, fieldConfig, weight) {
|
|
7727
|
+
const formats = fieldConfig.formats ?? DEFAULT_DATE_FORMATS;
|
|
7728
|
+
const candidateDate = parseDate(String(candidateValue), formats);
|
|
7729
|
+
const expectedDate = parseDate(String(expectedValue), formats);
|
|
7730
|
+
if (candidateDate === null) {
|
|
7774
7731
|
return {
|
|
7732
|
+
path: path17,
|
|
7775
7733
|
score: 0,
|
|
7776
|
-
|
|
7777
|
-
|
|
7778
|
-
|
|
7779
|
-
|
|
7780
|
-
|
|
7781
|
-
|
|
7734
|
+
weight,
|
|
7735
|
+
hit: false,
|
|
7736
|
+
message: `${path17} (unparseable candidate date)`
|
|
7737
|
+
};
|
|
7738
|
+
}
|
|
7739
|
+
if (expectedDate === null) {
|
|
7740
|
+
return {
|
|
7741
|
+
path: path17,
|
|
7742
|
+
score: 0,
|
|
7743
|
+
weight,
|
|
7744
|
+
hit: false,
|
|
7745
|
+
message: `${path17} (unparseable expected date)`
|
|
7746
|
+
};
|
|
7747
|
+
}
|
|
7748
|
+
if (candidateDate.getFullYear() === expectedDate.getFullYear() && candidateDate.getMonth() === expectedDate.getMonth() && candidateDate.getDate() === expectedDate.getDate()) {
|
|
7749
|
+
return {
|
|
7750
|
+
path: path17,
|
|
7751
|
+
score: 1,
|
|
7752
|
+
weight,
|
|
7753
|
+
hit: true,
|
|
7754
|
+
message: path17
|
|
7782
7755
|
};
|
|
7783
7756
|
}
|
|
7757
|
+
return {
|
|
7758
|
+
path: path17,
|
|
7759
|
+
score: 0,
|
|
7760
|
+
weight,
|
|
7761
|
+
hit: false,
|
|
7762
|
+
message: `${path17} (date mismatch: got ${formatDateISO(candidateDate)}, expected ${formatDateISO(expectedDate)})`
|
|
7763
|
+
};
|
|
7764
|
+
}
|
|
7765
|
+
/**
|
|
7766
|
+
* Aggregate field results using configured strategy.
|
|
7767
|
+
*/
|
|
7768
|
+
aggregateResults(results) {
|
|
7769
|
+
const aggregation = this.config.aggregation ?? "weighted_average";
|
|
7770
|
+
const hits = [];
|
|
7771
|
+
const misses = [];
|
|
7772
|
+
for (const result of results) {
|
|
7773
|
+
if (result.hit) {
|
|
7774
|
+
hits.push(result.message);
|
|
7775
|
+
} else {
|
|
7776
|
+
misses.push(result.message);
|
|
7777
|
+
}
|
|
7778
|
+
}
|
|
7779
|
+
let score;
|
|
7780
|
+
if (aggregation === "all_or_nothing") {
|
|
7781
|
+
score = misses.length === 0 ? 1 : 0;
|
|
7782
|
+
} else {
|
|
7783
|
+
const totalWeight = results.reduce((sum, r) => sum + r.weight, 0);
|
|
7784
|
+
if (totalWeight === 0) {
|
|
7785
|
+
score = results.length === 0 ? 1 : 0;
|
|
7786
|
+
} else {
|
|
7787
|
+
const weightedSum = results.reduce((sum, r) => sum + r.score * r.weight, 0);
|
|
7788
|
+
score = weightedSum / totalWeight;
|
|
7789
|
+
}
|
|
7790
|
+
}
|
|
7791
|
+
const reasoning = `${hits.length}/${results.length} fields matched`;
|
|
7792
|
+
return {
|
|
7793
|
+
score: clampScore(score),
|
|
7794
|
+
verdict: scoreToVerdict(score),
|
|
7795
|
+
hits: hits.slice(0, 4),
|
|
7796
|
+
misses: misses.slice(0, 4),
|
|
7797
|
+
expectedAspectCount: results.length,
|
|
7798
|
+
reasoning
|
|
7799
|
+
};
|
|
7784
7800
|
}
|
|
7785
7801
|
};
|
|
7802
|
+
function resolvePath(obj, path17) {
|
|
7803
|
+
if (!path17 || !obj) {
|
|
7804
|
+
return void 0;
|
|
7805
|
+
}
|
|
7806
|
+
const parts = path17.split(/\.|\[|\]/).filter((p) => p.length > 0);
|
|
7807
|
+
let current = obj;
|
|
7808
|
+
for (const part of parts) {
|
|
7809
|
+
if (current === null || current === void 0) {
|
|
7810
|
+
return void 0;
|
|
7811
|
+
}
|
|
7812
|
+
if (typeof current !== "object") {
|
|
7813
|
+
return void 0;
|
|
7814
|
+
}
|
|
7815
|
+
const isIndex = /^\d+$/.test(part);
|
|
7816
|
+
if (isIndex && Array.isArray(current)) {
|
|
7817
|
+
current = current[Number.parseInt(part, 10)];
|
|
7818
|
+
} else {
|
|
7819
|
+
current = current[part];
|
|
7820
|
+
}
|
|
7821
|
+
}
|
|
7822
|
+
return current;
|
|
7823
|
+
}
|
|
7824
|
+
function toNumber(value) {
|
|
7825
|
+
if (typeof value === "number") {
|
|
7826
|
+
return value;
|
|
7827
|
+
}
|
|
7828
|
+
if (typeof value === "string") {
|
|
7829
|
+
const num = Number.parseFloat(value);
|
|
7830
|
+
return Number.isNaN(num) ? null : num;
|
|
7831
|
+
}
|
|
7832
|
+
return null;
|
|
7833
|
+
}
|
|
7834
|
+
function parseDate(dateStr, formats) {
|
|
7835
|
+
if (!dateStr) return null;
|
|
7836
|
+
const trimmed = dateStr.trim();
|
|
7837
|
+
const isoDate = new Date(trimmed);
|
|
7838
|
+
if (!Number.isNaN(isoDate.getTime())) {
|
|
7839
|
+
return isoDate;
|
|
7840
|
+
}
|
|
7841
|
+
const localizedMatch = trimmed.match(/^(\d{1,2})-([A-Za-z]{3,9})-(\d{4})$/);
|
|
7842
|
+
if (localizedMatch) {
|
|
7843
|
+
const day = Number.parseInt(localizedMatch[1], 10);
|
|
7844
|
+
const monthName = localizedMatch[2].toLowerCase();
|
|
7845
|
+
const year = Number.parseInt(localizedMatch[3], 10);
|
|
7846
|
+
const month = MONTH_NAMES[monthName];
|
|
7847
|
+
if (month !== void 0) {
|
|
7848
|
+
return new Date(year, month, day);
|
|
7849
|
+
}
|
|
7850
|
+
}
|
|
7851
|
+
const usMatch = trimmed.match(/^(\d{1,2})[\/\-](\d{1,2})[\/\-](\d{4})$/);
|
|
7852
|
+
if (usMatch) {
|
|
7853
|
+
const hasUSFormat = formats.some((f) => f.includes("MM/DD") || f.includes("MM-DD"));
|
|
7854
|
+
const hasEUFormat = formats.some((f) => f.includes("DD/MM") || f.includes("DD-MM"));
|
|
7855
|
+
if (hasUSFormat && !hasEUFormat) {
|
|
7856
|
+
const month = Number.parseInt(usMatch[1], 10) - 1;
|
|
7857
|
+
const day = Number.parseInt(usMatch[2], 10);
|
|
7858
|
+
const year = Number.parseInt(usMatch[3], 10);
|
|
7859
|
+
if (month >= 0 && month <= 11 && day >= 1 && day <= 31) {
|
|
7860
|
+
return new Date(year, month, day);
|
|
7861
|
+
}
|
|
7862
|
+
} else if (hasEUFormat && !hasUSFormat) {
|
|
7863
|
+
const day = Number.parseInt(usMatch[1], 10);
|
|
7864
|
+
const month = Number.parseInt(usMatch[2], 10) - 1;
|
|
7865
|
+
const year = Number.parseInt(usMatch[3], 10);
|
|
7866
|
+
if (month >= 0 && month <= 11 && day >= 1 && day <= 31) {
|
|
7867
|
+
return new Date(year, month, day);
|
|
7868
|
+
}
|
|
7869
|
+
} else {
|
|
7870
|
+
const num1 = Number.parseInt(usMatch[1], 10);
|
|
7871
|
+
const num2 = Number.parseInt(usMatch[2], 10);
|
|
7872
|
+
const year = Number.parseInt(usMatch[3], 10);
|
|
7873
|
+
if (num1 > 12 && num2 <= 12) {
|
|
7874
|
+
return new Date(year, num2 - 1, num1);
|
|
7875
|
+
}
|
|
7876
|
+
if (num2 > 12 && num1 <= 12) {
|
|
7877
|
+
return new Date(year, num1 - 1, num2);
|
|
7878
|
+
}
|
|
7879
|
+
if (num1 <= 12 && num2 <= 31) {
|
|
7880
|
+
return new Date(year, num1 - 1, num2);
|
|
7881
|
+
}
|
|
7882
|
+
}
|
|
7883
|
+
}
|
|
7884
|
+
return null;
|
|
7885
|
+
}
|
|
7886
|
+
function formatDateISO(date) {
|
|
7887
|
+
return date.toISOString().split("T")[0];
|
|
7888
|
+
}
|
|
7889
|
+
function parseJsonFromTextSafe(text) {
|
|
7890
|
+
return parseJsonFromText(text);
|
|
7891
|
+
}
|
|
7892
|
+
|
|
7893
|
+
// src/evaluation/evaluators/latency.ts
|
|
7786
7894
|
var LatencyEvaluator = class {
|
|
7787
7895
|
kind = "latency";
|
|
7788
7896
|
config;
|
|
@@ -7816,56 +7924,16 @@ var LatencyEvaluator = class {
|
|
|
7816
7924
|
misses: passed ? [] : [`Duration ${durationMs}ms > ${threshold}ms threshold`],
|
|
7817
7925
|
expectedAspectCount: 1,
|
|
7818
7926
|
reasoning: `Execution took ${durationMs}ms (threshold: ${threshold}ms)`,
|
|
7819
|
-
evaluatorRawRequest: {
|
|
7820
|
-
type: "latency",
|
|
7821
|
-
threshold,
|
|
7822
|
-
durationMs
|
|
7823
|
-
}
|
|
7824
|
-
};
|
|
7825
|
-
}
|
|
7826
|
-
};
|
|
7827
|
-
var CostEvaluator = class {
|
|
7828
|
-
kind = "cost";
|
|
7829
|
-
config;
|
|
7830
|
-
constructor(options) {
|
|
7831
|
-
this.config = options.config;
|
|
7832
|
-
}
|
|
7833
|
-
evaluate(context) {
|
|
7834
|
-
const { budget } = this.config;
|
|
7835
|
-
const costUsd = context.traceSummary?.costUsd;
|
|
7836
|
-
if (costUsd === void 0) {
|
|
7837
|
-
return {
|
|
7838
|
-
score: 0,
|
|
7839
|
-
verdict: "fail",
|
|
7840
|
-
hits: [],
|
|
7841
|
-
misses: ["No cost data available in trace"],
|
|
7842
|
-
expectedAspectCount: 1,
|
|
7843
|
-
reasoning: "Execution cost not reported by provider",
|
|
7844
|
-
evaluatorRawRequest: {
|
|
7845
|
-
type: "cost",
|
|
7846
|
-
budget,
|
|
7847
|
-
costUsd: null
|
|
7848
|
-
}
|
|
7849
|
-
};
|
|
7850
|
-
}
|
|
7851
|
-
const passed = costUsd <= budget;
|
|
7852
|
-
const score = passed ? 1 : 0;
|
|
7853
|
-
const formatCost = (n) => `$${n.toFixed(4)}`;
|
|
7854
|
-
return {
|
|
7855
|
-
score,
|
|
7856
|
-
verdict: passed ? "pass" : "fail",
|
|
7857
|
-
hits: passed ? [`Cost ${formatCost(costUsd)} <= ${formatCost(budget)} budget`] : [],
|
|
7858
|
-
misses: passed ? [] : [`Cost ${formatCost(costUsd)} > ${formatCost(budget)} budget`],
|
|
7859
|
-
expectedAspectCount: 1,
|
|
7860
|
-
reasoning: `Execution cost ${formatCost(costUsd)} (budget: ${formatCost(budget)})`,
|
|
7861
|
-
evaluatorRawRequest: {
|
|
7862
|
-
type: "cost",
|
|
7863
|
-
budget,
|
|
7864
|
-
costUsd
|
|
7927
|
+
evaluatorRawRequest: {
|
|
7928
|
+
type: "latency",
|
|
7929
|
+
threshold,
|
|
7930
|
+
durationMs
|
|
7865
7931
|
}
|
|
7866
7932
|
};
|
|
7867
7933
|
}
|
|
7868
7934
|
};
|
|
7935
|
+
|
|
7936
|
+
// src/evaluation/evaluators/token-usage.ts
|
|
7869
7937
|
var TokenUsageEvaluator = class {
|
|
7870
7938
|
kind = "token_usage";
|
|
7871
7939
|
config;
|
|
@@ -7949,8 +8017,228 @@ var TokenUsageEvaluator = class {
|
|
|
7949
8017
|
}
|
|
7950
8018
|
};
|
|
7951
8019
|
|
|
8020
|
+
// src/evaluation/evaluators/tool-trajectory.ts
|
|
8021
|
+
/**
 * Decide whether the arguments of an actual tool call satisfy an expectation.
 *
 * Matching is partial: every key listed in `expected` must be an own property
 * of `actual` and compare equal via `deepEqual`; extra keys on `actual` are
 * ignored.
 *
 * @param {object | "any" | undefined} expected - Expected arguments. `undefined`
 *   or the literal string "any" means "no constraint".
 * @param {object | null | undefined} actual - Arguments recorded for the call.
 * @returns {boolean} `true` when the call satisfies the expectation.
 */
function argsMatch(expected, actual) {
  // No expectation, or an explicit wildcard, accepts any call.
  if (expected === void 0 || expected === "any") return true;
  // A call without an arguments object cannot satisfy an explicit expectation.
  // `null` is handled together with `undefined` because Object.hasOwn(null, key)
  // throws a TypeError instead of returning false.
  if (actual === void 0 || actual === null) return false;
  for (const key of Object.keys(expected)) {
    if (!Object.hasOwn(actual, key)) return false;
    if (!deepEqual(expected[key], actual[key])) return false;
  }
  return true;
}
|
|
8031
|
+
/**
 * Evaluator that scores the tool calls made during a run against the
 * expectations in its config.
 *
 * Supported `config.mode` values:
 * - "any_order": each tool in `config.minimums` must be called at least the
 *   given number of times.
 * - "in_order": the tools in `config.expected` must appear in that order
 *   (other calls may be interleaved).
 * - "exact": the calls must match `config.expected` position by position.
 */
var ToolTrajectoryEvaluator = class {
  kind = "tool_trajectory";
  config;
  constructor(options) {
    this.config = options.config;
  }
  /**
   * Score one evaluation context.
   *
   * Tool calls found in `context.outputMessages` take precedence; the
   * pre-computed `context.traceSummary` is only used as the summary when the
   * messages contain no tool calls.
   *
   * @param {object} context - Reads `outputMessages` and `traceSummary`.
   * @returns {object} Result with `score`, `verdict`, `hits`, `misses` and
   *   `expectedAspectCount`.
   */
  evaluate(context) {
    const { outputMessages, traceSummary } = context;
    const toolCalls = this.extractToolCallsFromMessages(outputMessages);
    if (toolCalls.length === 0 && !traceSummary) {
      return {
        score: 0,
        verdict: "fail",
        hits: [],
        misses: ["No trace available for evaluation"],
        expectedAspectCount: 1
      };
    }
    // After the guard above, either tool calls exist (buildSummary returns an
    // object) or traceSummary is truthy, so the summary is always defined here.
    const summary = toolCalls.length > 0 ? this.buildSummary(toolCalls) : traceSummary;
    switch (this.config.mode) {
      case "any_order":
        return this.evaluateAnyOrder(summary);
      case "in_order":
        return this.evaluateInOrder(toolCalls);
      case "exact":
        return this.evaluateExact(toolCalls);
      default:
        return {
          score: 0,
          verdict: "fail",
          hits: [],
          misses: [`Unknown mode: ${this.config.mode}`],
          expectedAspectCount: 1
        };
    }
  }
  /**
   * Extract tool calls from output messages.
   *
   * @param {Array<object> | undefined} messages - Messages whose optional
   *   `toolCalls` entries carry `tool` and `input`.
   * @returns {Array<{name: *, args: *}>} Calls in message order; empty when
   *   `messages` is missing.
   */
  extractToolCallsFromMessages(messages) {
    if (!messages) {
      return [];
    }
    const toolCalls = [];
    for (const message of messages) {
      if (message.toolCalls) {
        for (const call of message.toolCalls) {
          toolCalls.push({
            name: call.tool,
            args: call.input
          });
        }
      }
    }
    return toolCalls;
  }
  /**
   * Build a summary from extracted tool calls.
   *
   * @param {Array<{name: string}>} toolCalls - Calls to count.
   * @returns {{eventCount: number, toolNames: string[], toolCallsByName: object, errorCount: number}}
   *   Per-tool call counts plus the sorted list of distinct tool names.
   */
  buildSummary(toolCalls) {
    // Count in a Map so tool names that collide with Object.prototype members
    // ("constructor", "toString", ...) start from 0 instead of picking up the
    // inherited value.
    const counts = new Map();
    for (const call of toolCalls) {
      counts.set(call.name, (counts.get(call.name) ?? 0) + 1);
    }
    // Object.fromEntries defines own data properties, so every counted name
    // ends up as an own key of the plain result object.
    const toolCallsByName = Object.fromEntries(counts);
    const toolNames = Object.keys(toolCallsByName).sort();
    return {
      eventCount: toolCalls.length,
      toolNames,
      toolCallsByName,
      errorCount: 0
    };
  }
  /**
   * Check that every tool in `config.minimums` was called often enough.
   *
   * @param {object} summary - Reads `toolCallsByName` (tool name -> count).
   * @returns {object} Result whose score is the fraction of satisfied minimums.
   */
  evaluateAnyOrder(summary) {
    const minimums = this.config.minimums ?? {};
    const toolNames = Object.keys(minimums);
    if (toolNames.length === 0) {
      return {
        score: 1,
        verdict: "pass",
        hits: ["No tool requirements specified"],
        misses: [],
        expectedAspectCount: 0
      };
    }
    // A summary without per-tool counts is treated as "nothing was called".
    const counts = summary.toolCallsByName ?? {};
    const hits = [];
    const misses = [];
    for (const toolName of toolNames) {
      const required = minimums[toolName];
      // Only own properties count, so inherited members of a plain object are
      // never mistaken for call counts.
      const recorded = Object.hasOwn(counts, toolName) ? counts[toolName] : void 0;
      const actual = recorded ?? 0;
      if (actual >= required) {
        hits.push(`${toolName}: called ${actual} times (required \u2265${required})`);
      } else {
        misses.push(`${toolName}: called ${actual} times (required \u2265${required})`);
      }
    }
    const score = hits.length / toolNames.length;
    return {
      score,
      verdict: scoreToVerdict(score),
      hits,
      misses,
      expectedAspectCount: toolNames.length
    };
  }
  /**
   * Check that the expected tools occur in order within the actual calls.
   *
   * The actual calls are scanned once, left to right; each expected item
   * resumes scanning where the previous one stopped.
   *
   * @param {Array<{name: string, args: *}>} toolCalls - Actual calls in order.
   * @returns {object} Result whose score is hits / expected.length.
   */
  evaluateInOrder(toolCalls) {
    const expected = this.config.expected ?? [];
    if (expected.length === 0) {
      return {
        score: 1,
        verdict: "pass",
        hits: ["No tool sequence specified"],
        misses: [],
        expectedAspectCount: 0
      };
    }
    const hits = [];
    const misses = [];
    let actualIndex = 0;
    for (let i = 0; i < expected.length; i++) {
      const expectedItem = expected[i];
      const expectedTool = expectedItem.tool;
      let found = false;
      let argsMismatch = false;
      while (actualIndex < toolCalls.length) {
        const actualCall = toolCalls[actualIndex];
        if (actualCall.name === expectedTool) {
          if (argsMatch(expectedItem.args, actualCall.args)) {
            hits.push(`Found ${expectedTool} at position ${actualIndex}`);
            actualIndex++;
            found = true;
            break;
          }
          // The first same-named call decides the outcome for this expected
          // item: on an args mismatch the call is consumed, a miss is recorded
          // and the search for this item stops.
          misses.push(
            `Expected ${expectedTool} at position ${i}: tool found at ${actualIndex} but args mismatch`
          );
          actualIndex++;
          argsMismatch = true;
          break;
        }
        actualIndex++;
      }
      if (!found && !argsMismatch) {
        misses.push(`Expected ${expectedTool} at position ${i}, not found in remaining trace`);
      }
    }
    const score = hits.length / expected.length;
    return {
      score,
      verdict: scoreToVerdict(score),
      hits,
      misses,
      expectedAspectCount: expected.length
    };
  }
  /**
   * Check that the actual calls match `config.expected` position by position.
   *
   * A length difference is reported as a miss. The score is
   * hits / expected.length, so surplus actual calls add a miss without
   * lowering the score.
   *
   * @param {Array<{name: string, args: *}>} toolCalls - Actual calls in order.
   * @returns {object} Result with per-position hits and misses.
   */
  evaluateExact(toolCalls) {
    const expected = this.config.expected ?? [];
    if (expected.length === 0) {
      return {
        score: 1,
        verdict: "pass",
        hits: ["No tool sequence specified"],
        misses: [],
        expectedAspectCount: 0
      };
    }
    const hits = [];
    const misses = [];
    if (toolCalls.length !== expected.length) {
      misses.push(`Expected ${expected.length} tool calls, got ${toolCalls.length}`);
    }
    // Compare only the positions present in both lists.
    const checkLength = Math.min(expected.length, toolCalls.length);
    for (let i = 0; i < checkLength; i++) {
      const expectedItem = expected[i];
      const expectedTool = expectedItem.tool;
      const actualCall = toolCalls[i];
      const actualTool = actualCall.name;
      if (actualTool === expectedTool) {
        if (argsMatch(expectedItem.args, actualCall.args)) {
          hits.push(`Position ${i}: ${expectedTool}`);
        } else {
          misses.push(`Position ${i}: ${expectedTool} args mismatch`);
        }
      } else {
        misses.push(`Position ${i}: expected ${expectedTool}, got ${actualTool}`);
      }
    }
    // Expected positions with no corresponding actual call.
    for (let i = checkLength; i < expected.length; i++) {
      misses.push(`Position ${i}: expected ${expected[i].tool}, got nothing`);
    }
    const score = hits.length / expected.length;
    return {
      score,
      verdict: scoreToVerdict(score),
      hits,
      misses,
      expectedAspectCount: expected.length
    };
  }
};
|
|
8239
|
+
|
|
7952
8240
|
// src/evaluation/orchestrator.ts
|
|
7953
|
-
var
|
|
8241
|
+
var import_node_crypto5 = require("crypto");
|
|
7954
8242
|
var import_node_path16 = __toESM(require("path"), 1);
|
|
7955
8243
|
|
|
7956
8244
|
// ../../node_modules/.bun/yocto-queue@1.2.2/node_modules/yocto-queue/index.js
|
|
@@ -8162,6 +8450,17 @@ async function runEvaluation(options) {
|
|
|
8162
8450
|
}
|
|
8163
8451
|
return getOrCreateProvider(resolvedJudge);
|
|
8164
8452
|
};
|
|
8453
|
+
const targetResolver = (name) => {
|
|
8454
|
+
const resolved = resolveTargetByName(name);
|
|
8455
|
+
if (!resolved) {
|
|
8456
|
+
return void 0;
|
|
8457
|
+
}
|
|
8458
|
+
return getOrCreateProvider(resolved);
|
|
8459
|
+
};
|
|
8460
|
+
const availableTargets = [
|
|
8461
|
+
target.name,
|
|
8462
|
+
...Array.from(targetDefinitions.keys())
|
|
8463
|
+
];
|
|
8165
8464
|
const evaluatorRegistry = buildEvaluatorRegistry(evaluators, resolveJudgeProvider);
|
|
8166
8465
|
const primaryProvider = getOrCreateProvider(target);
|
|
8167
8466
|
const providerSupportsBatch = target.providerBatching === true && primaryProvider.supportsBatch === true && typeof primaryProvider.invokeBatch === "function";
|
|
@@ -8191,7 +8490,9 @@ async function runEvaluation(options) {
|
|
|
8191
8490
|
onResult,
|
|
8192
8491
|
verbose,
|
|
8193
8492
|
resolveJudgeProvider,
|
|
8194
|
-
agentTimeoutMs
|
|
8493
|
+
agentTimeoutMs,
|
|
8494
|
+
targetResolver,
|
|
8495
|
+
availableTargets
|
|
8195
8496
|
});
|
|
8196
8497
|
} catch (error) {
|
|
8197
8498
|
if (verbose) {
|
|
@@ -8230,7 +8531,9 @@ async function runEvaluation(options) {
|
|
|
8230
8531
|
cache,
|
|
8231
8532
|
useCache,
|
|
8232
8533
|
now,
|
|
8233
|
-
judgeProvider
|
|
8534
|
+
judgeProvider,
|
|
8535
|
+
targetResolver,
|
|
8536
|
+
availableTargets
|
|
8234
8537
|
});
|
|
8235
8538
|
if (onProgress) {
|
|
8236
8539
|
await onProgress({
|
|
@@ -8297,7 +8600,9 @@ async function runBatchEvaluation(options) {
|
|
|
8297
8600
|
onProgress,
|
|
8298
8601
|
onResult,
|
|
8299
8602
|
resolveJudgeProvider,
|
|
8300
|
-
agentTimeoutMs
|
|
8603
|
+
agentTimeoutMs,
|
|
8604
|
+
targetResolver,
|
|
8605
|
+
availableTargets
|
|
8301
8606
|
} = options;
|
|
8302
8607
|
const promptInputsList = [];
|
|
8303
8608
|
const formattingMode = usesFileReferencePrompt(provider) ? "agent" : "lm";
|
|
@@ -8356,7 +8661,7 @@ async function runBatchEvaluation(options) {
|
|
|
8356
8661
|
costUsd: providerResponse.costUsd,
|
|
8357
8662
|
durationMs: providerResponse.durationMs
|
|
8358
8663
|
}) : void 0;
|
|
8359
|
-
const candidate =
|
|
8664
|
+
const candidate = extractLastAssistantContent2(outputMessages);
|
|
8360
8665
|
const providerError = extractProviderError(providerResponse);
|
|
8361
8666
|
let result;
|
|
8362
8667
|
try {
|
|
@@ -8372,7 +8677,9 @@ async function runBatchEvaluation(options) {
|
|
|
8372
8677
|
judgeProvider: await resolveJudgeProvider(target),
|
|
8373
8678
|
agentTimeoutMs,
|
|
8374
8679
|
outputMessages,
|
|
8375
|
-
traceSummary
|
|
8680
|
+
traceSummary,
|
|
8681
|
+
targetResolver,
|
|
8682
|
+
availableTargets
|
|
8376
8683
|
});
|
|
8377
8684
|
if (providerError) {
|
|
8378
8685
|
result = { ...result, error: providerError };
|
|
@@ -8430,7 +8737,9 @@ async function runEvalCase(options) {
|
|
|
8430
8737
|
cache,
|
|
8431
8738
|
useCache,
|
|
8432
8739
|
signal,
|
|
8433
|
-
judgeProvider
|
|
8740
|
+
judgeProvider,
|
|
8741
|
+
targetResolver,
|
|
8742
|
+
availableTargets
|
|
8434
8743
|
} = options;
|
|
8435
8744
|
const formattingMode = usesFileReferencePrompt(provider) ? "agent" : "lm";
|
|
8436
8745
|
const promptInputs = await buildPromptInputs(evalCase, formattingMode);
|
|
@@ -8489,7 +8798,7 @@ async function runEvalCase(options) {
|
|
|
8489
8798
|
costUsd: providerResponse.costUsd,
|
|
8490
8799
|
durationMs: providerResponse.durationMs
|
|
8491
8800
|
}) : void 0;
|
|
8492
|
-
const candidate =
|
|
8801
|
+
const candidate = extractLastAssistantContent2(outputMessages);
|
|
8493
8802
|
const providerError = extractProviderError(providerResponse);
|
|
8494
8803
|
try {
|
|
8495
8804
|
const result = await evaluateCandidate({
|
|
@@ -8504,7 +8813,9 @@ async function runEvalCase(options) {
|
|
|
8504
8813
|
judgeProvider,
|
|
8505
8814
|
agentTimeoutMs,
|
|
8506
8815
|
outputMessages,
|
|
8507
|
-
traceSummary
|
|
8816
|
+
traceSummary,
|
|
8817
|
+
targetResolver,
|
|
8818
|
+
availableTargets
|
|
8508
8819
|
});
|
|
8509
8820
|
return providerError ? { ...result, error: providerError } : result;
|
|
8510
8821
|
} catch (error) {
|
|
@@ -8524,7 +8835,9 @@ async function evaluateCandidate(options) {
|
|
|
8524
8835
|
judgeProvider,
|
|
8525
8836
|
agentTimeoutMs,
|
|
8526
8837
|
outputMessages,
|
|
8527
|
-
traceSummary
|
|
8838
|
+
traceSummary,
|
|
8839
|
+
targetResolver,
|
|
8840
|
+
availableTargets
|
|
8528
8841
|
} = options;
|
|
8529
8842
|
const gradeTimestamp = nowFn();
|
|
8530
8843
|
const { score, evaluatorResults } = await runEvaluatorsForCase({
|
|
@@ -8539,7 +8852,9 @@ async function evaluateCandidate(options) {
|
|
|
8539
8852
|
judgeProvider,
|
|
8540
8853
|
agentTimeoutMs,
|
|
8541
8854
|
outputMessages,
|
|
8542
|
-
traceSummary
|
|
8855
|
+
traceSummary,
|
|
8856
|
+
targetResolver,
|
|
8857
|
+
availableTargets
|
|
8543
8858
|
});
|
|
8544
8859
|
const completedAt = nowFn();
|
|
8545
8860
|
let agentProviderRequest;
|
|
@@ -8592,7 +8907,9 @@ async function runEvaluatorsForCase(options) {
|
|
|
8592
8907
|
judgeProvider,
|
|
8593
8908
|
agentTimeoutMs,
|
|
8594
8909
|
outputMessages,
|
|
8595
|
-
traceSummary
|
|
8910
|
+
traceSummary,
|
|
8911
|
+
targetResolver,
|
|
8912
|
+
availableTargets
|
|
8596
8913
|
} = options;
|
|
8597
8914
|
if (evalCase.evaluators && evalCase.evaluators.length > 0) {
|
|
8598
8915
|
return runEvaluatorList({
|
|
@@ -8608,7 +8925,9 @@ async function runEvaluatorsForCase(options) {
|
|
|
8608
8925
|
judgeProvider,
|
|
8609
8926
|
agentTimeoutMs,
|
|
8610
8927
|
outputMessages,
|
|
8611
|
-
traceSummary
|
|
8928
|
+
traceSummary,
|
|
8929
|
+
targetResolver,
|
|
8930
|
+
availableTargets
|
|
8612
8931
|
});
|
|
8613
8932
|
}
|
|
8614
8933
|
const evaluatorKind = evalCase.evaluator ?? "llm_judge";
|
|
@@ -8626,7 +8945,9 @@ async function runEvaluatorsForCase(options) {
|
|
|
8626
8945
|
now,
|
|
8627
8946
|
judgeProvider,
|
|
8628
8947
|
outputMessages,
|
|
8629
|
-
traceSummary
|
|
8948
|
+
traceSummary,
|
|
8949
|
+
targetResolver,
|
|
8950
|
+
availableTargets
|
|
8630
8951
|
});
|
|
8631
8952
|
return { score };
|
|
8632
8953
|
}
|
|
@@ -8644,7 +8965,9 @@ async function runEvaluatorList(options) {
|
|
|
8644
8965
|
judgeProvider,
|
|
8645
8966
|
agentTimeoutMs,
|
|
8646
8967
|
outputMessages,
|
|
8647
|
-
traceSummary
|
|
8968
|
+
traceSummary,
|
|
8969
|
+
targetResolver,
|
|
8970
|
+
availableTargets
|
|
8648
8971
|
} = options;
|
|
8649
8972
|
const scored = [];
|
|
8650
8973
|
const evaluatorResults = [];
|
|
@@ -8682,7 +9005,8 @@ async function runEvaluatorList(options) {
|
|
|
8682
9005
|
script: evaluator.script,
|
|
8683
9006
|
cwd: evaluator.resolvedCwd ?? evaluator.cwd,
|
|
8684
9007
|
agentTimeoutMs,
|
|
8685
|
-
config: evaluator.config
|
|
9008
|
+
config: evaluator.config,
|
|
9009
|
+
target: evaluator.target
|
|
8686
9010
|
});
|
|
8687
9011
|
const score2 = await codeEvaluator.evaluate({
|
|
8688
9012
|
evalCase,
|
|
@@ -8692,8 +9016,11 @@ async function runEvaluatorList(options) {
|
|
|
8692
9016
|
attempt,
|
|
8693
9017
|
promptInputs,
|
|
8694
9018
|
now,
|
|
9019
|
+
judgeProvider,
|
|
8695
9020
|
outputMessages,
|
|
8696
|
-
traceSummary
|
|
9021
|
+
traceSummary,
|
|
9022
|
+
targetResolver,
|
|
9023
|
+
availableTargets
|
|
8697
9024
|
});
|
|
8698
9025
|
const weight = evaluator.weight ?? 1;
|
|
8699
9026
|
scored.push({ score: score2, name: evaluator.name, type: "code_judge", weight });
|
|
@@ -8706,7 +9033,8 @@ async function runEvaluatorList(options) {
|
|
|
8706
9033
|
hits: score2.hits,
|
|
8707
9034
|
misses: score2.misses,
|
|
8708
9035
|
reasoning: score2.reasoning,
|
|
8709
|
-
evaluatorProviderRequest: score2.evaluatorRawRequest
|
|
9036
|
+
evaluatorProviderRequest: score2.evaluatorRawRequest,
|
|
9037
|
+
details: score2.details
|
|
8710
9038
|
});
|
|
8711
9039
|
}
|
|
8712
9040
|
if (evaluator.type === "composite") {
|
|
@@ -8720,7 +9048,8 @@ async function runEvaluatorList(options) {
|
|
|
8720
9048
|
script: memberConfig.script,
|
|
8721
9049
|
cwd: memberConfig.resolvedCwd ?? memberConfig.cwd,
|
|
8722
9050
|
agentTimeoutMs,
|
|
8723
|
-
config: memberConfig.config
|
|
9051
|
+
config: memberConfig.config,
|
|
9052
|
+
target: memberConfig.target
|
|
8724
9053
|
});
|
|
8725
9054
|
case "composite":
|
|
8726
9055
|
return new CompositeEvaluator({
|
|
@@ -8769,7 +9098,9 @@ async function runEvaluatorList(options) {
|
|
|
8769
9098
|
now,
|
|
8770
9099
|
judgeProvider,
|
|
8771
9100
|
outputMessages,
|
|
8772
|
-
traceSummary
|
|
9101
|
+
traceSummary,
|
|
9102
|
+
targetResolver,
|
|
9103
|
+
availableTargets
|
|
8773
9104
|
});
|
|
8774
9105
|
const weight = evaluator.weight ?? 1;
|
|
8775
9106
|
scored.push({ score: score2, name: evaluator.name, type: evaluator.type, weight });
|
|
@@ -8965,11 +9296,11 @@ async function runEvaluatorList(options) {
|
|
|
8965
9296
|
(total, entry) => total + (entry.score.expectedAspectCount ?? 0),
|
|
8966
9297
|
0
|
|
8967
9298
|
);
|
|
8968
|
-
const reasoningParts = scored.map((entry) => entry.score.reasoning ? `${entry.name}: ${entry.score.reasoning}` : void 0).filter(
|
|
9299
|
+
const reasoningParts = scored.map((entry) => entry.score.reasoning ? `${entry.name}: ${entry.score.reasoning}` : void 0).filter(isNonEmptyString);
|
|
8969
9300
|
const reasoning = reasoningParts.length > 0 ? reasoningParts.join(" | ") : void 0;
|
|
8970
9301
|
const score = {
|
|
8971
9302
|
score: aggregateScore,
|
|
8972
|
-
verdict:
|
|
9303
|
+
verdict: scoreToVerdict(aggregateScore),
|
|
8973
9304
|
hits,
|
|
8974
9305
|
misses,
|
|
8975
9306
|
expectedAspectCount,
|
|
@@ -9016,18 +9347,6 @@ async function resolveCustomPrompt(config) {
|
|
|
9016
9347
|
}
|
|
9017
9348
|
return config.prompt;
|
|
9018
9349
|
}
|
|
9019
|
-
function isNonEmptyString2(value) {
|
|
9020
|
-
return typeof value === "string" && value.trim().length > 0;
|
|
9021
|
-
}
|
|
9022
|
-
function scoreToVerdict2(score) {
|
|
9023
|
-
if (score >= 0.8) {
|
|
9024
|
-
return "pass";
|
|
9025
|
-
}
|
|
9026
|
-
if (score >= 0.6) {
|
|
9027
|
-
return "borderline";
|
|
9028
|
-
}
|
|
9029
|
-
return "fail";
|
|
9030
|
-
}
|
|
9031
9350
|
function filterEvalCases(evalCases, evalId) {
|
|
9032
9351
|
if (!evalId) {
|
|
9033
9352
|
return evalCases;
|
|
@@ -9129,7 +9448,7 @@ function extractProviderError(response) {
|
|
|
9129
9448
|
return trimmed.length > 0 ? trimmed : void 0;
|
|
9130
9449
|
}
|
|
9131
9450
|
function createCacheKey(provider, target, evalCase, promptInputs) {
|
|
9132
|
-
const hash = (0,
|
|
9451
|
+
const hash = (0, import_node_crypto5.createHash)("sha256");
|
|
9133
9452
|
hash.update(provider.id);
|
|
9134
9453
|
hash.update(target.name);
|
|
9135
9454
|
hash.update(evalCase.id);
|
|
@@ -9170,7 +9489,8 @@ function mapChildResults(children) {
|
|
|
9170
9489
|
misses: child.misses,
|
|
9171
9490
|
reasoning: child.reasoning,
|
|
9172
9491
|
evaluatorProviderRequest: child.evaluatorRawRequest,
|
|
9173
|
-
evaluatorResults: mapChildResults(child.evaluatorResults)
|
|
9492
|
+
evaluatorResults: mapChildResults(child.evaluatorResults),
|
|
9493
|
+
details: child.details
|
|
9174
9494
|
}));
|
|
9175
9495
|
}
|
|
9176
9496
|
function computeWeightedMean(entries) {
|
|
@@ -9185,7 +9505,7 @@ function computeWeightedMean(entries) {
|
|
|
9185
9505
|
}
|
|
9186
9506
|
|
|
9187
9507
|
// src/evaluation/generators/rubric-generator.ts
|
|
9188
|
-
var
|
|
9508
|
+
var import_ai4 = require("ai");
|
|
9189
9509
|
var import_zod4 = require("zod");
|
|
9190
9510
|
var rubricItemSchema = import_zod4.z.object({
|
|
9191
9511
|
id: import_zod4.z.string().describe("Short identifier for this rubric (e.g., clarity, completeness)"),
|
|
@@ -9219,7 +9539,7 @@ You must return a valid JSON object matching this schema:
|
|
|
9219
9539
|
let lastError;
|
|
9220
9540
|
for (let attempt = 1; attempt <= 3; attempt++) {
|
|
9221
9541
|
try {
|
|
9222
|
-
const { text } = await (0,
|
|
9542
|
+
const { text } = await (0, import_ai4.generateText)({
|
|
9223
9543
|
model,
|
|
9224
9544
|
system,
|
|
9225
9545
|
prompt
|
|
@@ -9282,31 +9602,39 @@ function createAgentKernel() {
|
|
|
9282
9602
|
ToolTrajectoryEvaluator,
|
|
9283
9603
|
avgToolDurationMs,
|
|
9284
9604
|
buildDirectoryChain,
|
|
9605
|
+
buildOutputSchema,
|
|
9285
9606
|
buildPromptInputs,
|
|
9286
9607
|
buildSearchRoots,
|
|
9608
|
+
clampScore,
|
|
9287
9609
|
computeTraceSummary,
|
|
9288
9610
|
consumeClaudeCodeLogEntries,
|
|
9289
9611
|
consumeCodexLogEntries,
|
|
9290
9612
|
consumePiLogEntries,
|
|
9291
9613
|
createAgentKernel,
|
|
9292
9614
|
createProvider,
|
|
9615
|
+
deepEqual,
|
|
9293
9616
|
ensureVSCodeSubagents,
|
|
9617
|
+
executeScript,
|
|
9294
9618
|
explorationRatio,
|
|
9295
|
-
|
|
9619
|
+
extractJsonBlob,
|
|
9296
9620
|
fileExists,
|
|
9297
9621
|
findGitRoot,
|
|
9622
|
+
freeformEvaluationSchema,
|
|
9298
9623
|
generateRubrics,
|
|
9299
9624
|
getHitCount,
|
|
9300
9625
|
isEvaluatorKind,
|
|
9301
9626
|
isGuidelineFile,
|
|
9302
9627
|
isJsonObject,
|
|
9303
9628
|
isJsonValue,
|
|
9629
|
+
isNonEmptyString,
|
|
9304
9630
|
isTestMessage,
|
|
9305
9631
|
isTestMessageRole,
|
|
9306
9632
|
listTargetNames,
|
|
9307
9633
|
loadEvalCases,
|
|
9308
9634
|
mergeExecutionMetrics,
|
|
9309
9635
|
normalizeLineEndings,
|
|
9636
|
+
parseJsonFromText,
|
|
9637
|
+
parseJsonSafe,
|
|
9310
9638
|
readJsonFile,
|
|
9311
9639
|
readTargetDefinitions,
|
|
9312
9640
|
readTestSuiteMetadata,
|
|
@@ -9316,6 +9644,7 @@ function createAgentKernel() {
|
|
|
9316
9644
|
resolveTargetDefinition,
|
|
9317
9645
|
runEvalCase,
|
|
9318
9646
|
runEvaluation,
|
|
9647
|
+
scoreToVerdict,
|
|
9319
9648
|
subscribeToClaudeCodeLogEntries,
|
|
9320
9649
|
subscribeToCodexLogEntries,
|
|
9321
9650
|
subscribeToPiLogEntries,
|