@agentv/core 2.0.2 → 2.1.1
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/dist/evaluation/validation/index.cjs +0 -11
- package/dist/evaluation/validation/index.cjs.map +1 -1
- package/dist/evaluation/validation/index.js +0 -11
- package/dist/evaluation/validation/index.js.map +1 -1
- package/dist/index.cjs +1336 -1007
- package/dist/index.cjs.map +1 -1
- package/dist/index.d.cts +142 -71
- package/dist/index.d.ts +142 -71
- package/dist/index.js +1293 -973
- package/dist/index.js.map +1 -1
- package/package.json +2 -2
package/dist/index.js
CHANGED
|
@@ -150,85 +150,6 @@ import { readFile as readFile5 } from "node:fs/promises";
|
|
|
150
150
|
import path6 from "node:path";
|
|
151
151
|
import { parse as parse2 } from "yaml";
|
|
152
152
|
|
|
153
|
-
// src/evaluation/formatting/segment-formatter.ts
|
|
154
|
-
function extractCodeBlocks(segments) {
|
|
155
|
-
const CODE_BLOCK_PATTERN = /```[\s\S]*?```/g;
|
|
156
|
-
const codeBlocks = [];
|
|
157
|
-
for (const segment of segments) {
|
|
158
|
-
const typeValue = segment.type;
|
|
159
|
-
if (typeof typeValue !== "string" || typeValue !== "text") {
|
|
160
|
-
continue;
|
|
161
|
-
}
|
|
162
|
-
const textValue = segment.value;
|
|
163
|
-
if (typeof textValue !== "string") {
|
|
164
|
-
continue;
|
|
165
|
-
}
|
|
166
|
-
const matches = textValue.match(CODE_BLOCK_PATTERN);
|
|
167
|
-
if (matches) {
|
|
168
|
-
codeBlocks.push(...matches);
|
|
169
|
-
}
|
|
170
|
-
}
|
|
171
|
-
return codeBlocks;
|
|
172
|
-
}
|
|
173
|
-
function formatFileContents(parts) {
|
|
174
|
-
const fileCount = parts.filter((p) => p.isFile).length;
|
|
175
|
-
if (fileCount > 0) {
|
|
176
|
-
return parts.map((part) => {
|
|
177
|
-
if (part.isFile && part.displayPath) {
|
|
178
|
-
return `<file path="${part.displayPath}">
|
|
179
|
-
${part.content}
|
|
180
|
-
</file>`;
|
|
181
|
-
}
|
|
182
|
-
return part.content;
|
|
183
|
-
}).join("\n\n");
|
|
184
|
-
}
|
|
185
|
-
return parts.map((p) => p.content).join(" ");
|
|
186
|
-
}
|
|
187
|
-
function formatSegment(segment, mode = "lm") {
|
|
188
|
-
const type = asString(segment.type);
|
|
189
|
-
if (type === "text") {
|
|
190
|
-
return asString(segment.value);
|
|
191
|
-
}
|
|
192
|
-
if (type === "guideline_ref") {
|
|
193
|
-
const refPath = asString(segment.path);
|
|
194
|
-
return refPath ? `<Attached: ${refPath}>` : void 0;
|
|
195
|
-
}
|
|
196
|
-
if (type === "file") {
|
|
197
|
-
const filePath = asString(segment.path);
|
|
198
|
-
if (!filePath) {
|
|
199
|
-
return void 0;
|
|
200
|
-
}
|
|
201
|
-
if (mode === "agent") {
|
|
202
|
-
return `<file: path="${filePath}">`;
|
|
203
|
-
}
|
|
204
|
-
const text = asString(segment.text);
|
|
205
|
-
if (text && filePath) {
|
|
206
|
-
return formatFileContents([{ content: text.trim(), isFile: true, displayPath: filePath }]);
|
|
207
|
-
}
|
|
208
|
-
}
|
|
209
|
-
return void 0;
|
|
210
|
-
}
|
|
211
|
-
function hasVisibleContent(segments) {
|
|
212
|
-
return segments.some((segment) => {
|
|
213
|
-
const type = asString(segment.type);
|
|
214
|
-
if (type === "text") {
|
|
215
|
-
const value = asString(segment.value);
|
|
216
|
-
return value !== void 0 && value.trim().length > 0;
|
|
217
|
-
}
|
|
218
|
-
if (type === "guideline_ref") {
|
|
219
|
-
return false;
|
|
220
|
-
}
|
|
221
|
-
if (type === "file") {
|
|
222
|
-
const text = asString(segment.text);
|
|
223
|
-
return text !== void 0 && text.trim().length > 0;
|
|
224
|
-
}
|
|
225
|
-
return false;
|
|
226
|
-
});
|
|
227
|
-
}
|
|
228
|
-
function asString(value) {
|
|
229
|
-
return typeof value === "string" ? value : void 0;
|
|
230
|
-
}
|
|
231
|
-
|
|
232
153
|
// src/evaluation/loaders/config-loader.ts
|
|
233
154
|
import { readFile } from "node:fs/promises";
|
|
234
155
|
import path2 from "node:path";
|
|
@@ -336,7 +257,6 @@ async function resolveFileReference2(rawValue, searchRoots) {
|
|
|
336
257
|
}
|
|
337
258
|
|
|
338
259
|
// src/evaluation/loaders/config-loader.ts
|
|
339
|
-
var SCHEMA_CONFIG_V2 = "agentv-config-v2";
|
|
340
260
|
var ANSI_YELLOW = "\x1B[33m";
|
|
341
261
|
var ANSI_RESET = "\x1B[0m";
|
|
342
262
|
async function loadConfig(evalFilePath, repoRoot) {
|
|
@@ -354,13 +274,6 @@ async function loadConfig(evalFilePath, repoRoot) {
|
|
|
354
274
|
continue;
|
|
355
275
|
}
|
|
356
276
|
const config = parsed;
|
|
357
|
-
const schema = config.$schema;
|
|
358
|
-
if (schema !== SCHEMA_CONFIG_V2) {
|
|
359
|
-
const message = typeof schema === "string" ? `Invalid $schema value '${schema}' in ${configPath}. Expected '${SCHEMA_CONFIG_V2}'` : `Missing required field '$schema' in ${configPath}.
|
|
360
|
-
Please add '$schema: ${SCHEMA_CONFIG_V2}' at the top of the file.`;
|
|
361
|
-
logWarning(message);
|
|
362
|
-
continue;
|
|
363
|
-
}
|
|
364
277
|
const guidelinePatterns = config.guideline_patterns;
|
|
365
278
|
if (guidelinePatterns !== void 0 && !Array.isArray(guidelinePatterns)) {
|
|
366
279
|
logWarning(`Invalid guideline_patterns in ${configPath}, expected array`);
|
|
@@ -469,7 +382,8 @@ var ANSI_YELLOW3 = "\x1B[33m";
|
|
|
469
382
|
var ANSI_RESET3 = "\x1B[0m";
|
|
470
383
|
async function parseEvaluators(rawEvalCase, globalExecution, searchRoots, evalId) {
|
|
471
384
|
const execution = rawEvalCase.execution;
|
|
472
|
-
const
|
|
385
|
+
const executionObject = isJsonObject2(execution) ? execution : void 0;
|
|
386
|
+
const candidateEvaluators = (executionObject ? executionObject.evaluators : void 0) ?? rawEvalCase.evaluators ?? globalExecution?.evaluators;
|
|
473
387
|
if (candidateEvaluators === void 0) {
|
|
474
388
|
return void 0;
|
|
475
389
|
}
|
|
@@ -483,7 +397,7 @@ async function parseEvaluators(rawEvalCase, globalExecution, searchRoots, evalId
|
|
|
483
397
|
logWarning2(`Skipping invalid evaluator entry for '${evalId}' (expected object)`);
|
|
484
398
|
continue;
|
|
485
399
|
}
|
|
486
|
-
const name =
|
|
400
|
+
const name = asString(rawEvaluator.name);
|
|
487
401
|
const typeValue = rawEvaluator.type;
|
|
488
402
|
if (!name || !isEvaluatorKind(typeValue)) {
|
|
489
403
|
logWarning2(`Skipping evaluator with invalid name/type in '${evalId}'`);
|
|
@@ -511,7 +425,7 @@ async function parseEvaluators(rawEvalCase, globalExecution, searchRoots, evalId
|
|
|
511
425
|
continue;
|
|
512
426
|
}
|
|
513
427
|
const weight2 = validateWeight(rawEvaluator.weight, name, evalId);
|
|
514
|
-
const cwd =
|
|
428
|
+
const cwd = asString(rawEvaluator.cwd);
|
|
515
429
|
let resolvedCwd;
|
|
516
430
|
if (cwd) {
|
|
517
431
|
const resolved = await resolveFileReference2(cwd, searchRoots);
|
|
@@ -526,7 +440,29 @@ async function parseEvaluators(rawEvalCase, globalExecution, searchRoots, evalId
|
|
|
526
440
|
} else {
|
|
527
441
|
resolvedCwd = searchRoots[0];
|
|
528
442
|
}
|
|
529
|
-
const
|
|
443
|
+
const rawTarget = rawEvaluator.target;
|
|
444
|
+
let targetConfig;
|
|
445
|
+
if (rawTarget !== void 0) {
|
|
446
|
+
if (isJsonObject2(rawTarget)) {
|
|
447
|
+
const maxCalls = rawTarget.max_calls;
|
|
448
|
+
if (maxCalls !== void 0 && (typeof maxCalls !== "number" || maxCalls < 0)) {
|
|
449
|
+
logWarning2(
|
|
450
|
+
`Invalid target.max_calls for evaluator '${name}' in '${evalId}': must be a non-negative number`
|
|
451
|
+
);
|
|
452
|
+
} else {
|
|
453
|
+
targetConfig = {
|
|
454
|
+
...typeof maxCalls === "number" ? { max_calls: maxCalls } : {}
|
|
455
|
+
};
|
|
456
|
+
}
|
|
457
|
+
} else if (rawTarget === true) {
|
|
458
|
+
targetConfig = {};
|
|
459
|
+
} else {
|
|
460
|
+
logWarning2(
|
|
461
|
+
`Invalid target config for evaluator '${name}' in '${evalId}': expected object or true`
|
|
462
|
+
);
|
|
463
|
+
}
|
|
464
|
+
}
|
|
465
|
+
const knownProps = /* @__PURE__ */ new Set(["name", "type", "script", "cwd", "weight", "target"]);
|
|
530
466
|
const config = {};
|
|
531
467
|
for (const [key, value] of Object.entries(rawEvaluator)) {
|
|
532
468
|
if (!knownProps.has(key) && value !== void 0) {
|
|
@@ -540,7 +476,8 @@ async function parseEvaluators(rawEvalCase, globalExecution, searchRoots, evalId
|
|
|
540
476
|
cwd,
|
|
541
477
|
resolvedCwd,
|
|
542
478
|
...weight2 !== void 0 ? { weight: weight2 } : {},
|
|
543
|
-
...Object.keys(config).length > 0 ? { config } : {}
|
|
479
|
+
...Object.keys(config).length > 0 ? { config } : {},
|
|
480
|
+
...targetConfig !== void 0 ? { target: targetConfig } : {}
|
|
544
481
|
});
|
|
545
482
|
continue;
|
|
546
483
|
}
|
|
@@ -557,7 +494,7 @@ async function parseEvaluators(rawEvalCase, globalExecution, searchRoots, evalId
|
|
|
557
494
|
logWarning2(`Skipping composite evaluator '${name}' in '${evalId}': missing aggregator`);
|
|
558
495
|
continue;
|
|
559
496
|
}
|
|
560
|
-
const aggregatorType =
|
|
497
|
+
const aggregatorType = asString(rawAggregator.type);
|
|
561
498
|
if (aggregatorType !== "weighted_average" && aggregatorType !== "code_judge" && aggregatorType !== "llm_judge") {
|
|
562
499
|
logWarning2(
|
|
563
500
|
`Skipping composite evaluator '${name}' in '${evalId}': invalid aggregator type '${aggregatorType}'`
|
|
@@ -570,7 +507,7 @@ async function parseEvaluators(rawEvalCase, globalExecution, searchRoots, evalId
|
|
|
570
507
|
logWarning2(`Skipping invalid member evaluator in composite '${name}' (expected object)`);
|
|
571
508
|
continue;
|
|
572
509
|
}
|
|
573
|
-
const memberName =
|
|
510
|
+
const memberName = asString(rawMember.name);
|
|
574
511
|
const memberType = rawMember.type;
|
|
575
512
|
if (!memberName || !isEvaluatorKind(memberType)) {
|
|
576
513
|
logWarning2(`Skipping member evaluator with invalid name/type in composite '${name}'`);
|
|
@@ -608,7 +545,7 @@ async function parseEvaluators(rawEvalCase, globalExecution, searchRoots, evalId
|
|
|
608
545
|
...Object.keys(parsedWeights).length > 0 ? { weights: parsedWeights } : {}
|
|
609
546
|
};
|
|
610
547
|
} else if (aggregatorType === "code_judge") {
|
|
611
|
-
const aggregatorPath =
|
|
548
|
+
const aggregatorPath = asString(rawAggregator.path);
|
|
612
549
|
if (!aggregatorPath) {
|
|
613
550
|
logWarning2(
|
|
614
551
|
`Skipping composite evaluator '${name}' in '${evalId}': code_judge aggregator missing path`
|
|
@@ -621,7 +558,7 @@ async function parseEvaluators(rawEvalCase, globalExecution, searchRoots, evalId
|
|
|
621
558
|
cwd: searchRoots[0]
|
|
622
559
|
};
|
|
623
560
|
} else {
|
|
624
|
-
const aggregatorPrompt =
|
|
561
|
+
const aggregatorPrompt = asString(rawAggregator.prompt);
|
|
625
562
|
let promptPath2;
|
|
626
563
|
if (aggregatorPrompt) {
|
|
627
564
|
const resolved = await resolveFileReference2(aggregatorPrompt, searchRoots);
|
|
@@ -646,7 +583,7 @@ async function parseEvaluators(rawEvalCase, globalExecution, searchRoots, evalId
|
|
|
646
583
|
continue;
|
|
647
584
|
}
|
|
648
585
|
if (typeValue === "tool_trajectory") {
|
|
649
|
-
const mode =
|
|
586
|
+
const mode = asString(rawEvaluator.mode);
|
|
650
587
|
if (mode !== "any_order" && mode !== "in_order" && mode !== "exact") {
|
|
651
588
|
logWarning2(
|
|
652
589
|
`Skipping tool_trajectory evaluator '${name}' in '${evalId}': invalid mode '${mode}' (must be any_order, in_order, or exact)`
|
|
@@ -737,8 +674,8 @@ async function parseEvaluators(rawEvalCase, globalExecution, searchRoots, evalId
|
|
|
737
674
|
);
|
|
738
675
|
continue;
|
|
739
676
|
}
|
|
740
|
-
const fieldPath =
|
|
741
|
-
const match =
|
|
677
|
+
const fieldPath = asString(rawField.path);
|
|
678
|
+
const match = asString(rawField.match);
|
|
742
679
|
if (!fieldPath) {
|
|
743
680
|
logWarning2(
|
|
744
681
|
`Skipping field without path in field_accuracy evaluator '${name}' in '${evalId}'`
|
|
@@ -768,7 +705,7 @@ async function parseEvaluators(rawEvalCase, globalExecution, searchRoots, evalId
|
|
|
768
705
|
);
|
|
769
706
|
continue;
|
|
770
707
|
}
|
|
771
|
-
const aggregation =
|
|
708
|
+
const aggregation = asString(rawEvaluator.aggregation);
|
|
772
709
|
const validAggregation = isValidFieldAggregationType(aggregation) ? aggregation : void 0;
|
|
773
710
|
const weight2 = validateWeight(rawEvaluator.weight, name, evalId);
|
|
774
711
|
evaluators.push({
|
|
@@ -849,7 +786,7 @@ async function parseEvaluators(rawEvalCase, globalExecution, searchRoots, evalId
|
|
|
849
786
|
});
|
|
850
787
|
continue;
|
|
851
788
|
}
|
|
852
|
-
const prompt =
|
|
789
|
+
const prompt = asString(rawEvaluator.prompt);
|
|
853
790
|
let promptPath;
|
|
854
791
|
if (prompt) {
|
|
855
792
|
const resolved = await resolveFileReference2(prompt, searchRoots);
|
|
@@ -868,11 +805,11 @@ async function parseEvaluators(rawEvalCase, globalExecution, searchRoots, evalId
|
|
|
868
805
|
);
|
|
869
806
|
}
|
|
870
807
|
}
|
|
871
|
-
const _model =
|
|
808
|
+
const _model = asString(rawEvaluator.model);
|
|
872
809
|
const rawRubrics = rawEvaluator.rubrics;
|
|
873
810
|
const parsedRubrics = Array.isArray(rawRubrics) ? rawRubrics.filter((r) => isJsonObject2(r)).map((rubric, index) => ({
|
|
874
|
-
id:
|
|
875
|
-
description:
|
|
811
|
+
id: asString(rubric.id) ?? `rubric-${index + 1}`,
|
|
812
|
+
description: asString(rubric.description) ?? "",
|
|
876
813
|
weight: typeof rubric.weight === "number" ? rubric.weight : 1,
|
|
877
814
|
required: typeof rubric.required === "boolean" ? rubric.required : true
|
|
878
815
|
})).filter((r) => r.description.length > 0) : void 0;
|
|
@@ -916,7 +853,7 @@ function coerceEvaluator(candidate, contextId) {
|
|
|
916
853
|
logWarning2(`Unknown evaluator '${candidate}' in ${contextId}, falling back to default`);
|
|
917
854
|
return void 0;
|
|
918
855
|
}
|
|
919
|
-
function
|
|
856
|
+
/**
 * Narrows an unknown value to a string.
 *
 * @param {unknown} value - Any value read from loosely-typed input.
 * @returns {string | undefined} `value` itself when it is a string, otherwise `undefined`.
 */
function asString(value) {
  if (typeof value !== "string") {
    return void 0;
  }
  return value;
}
|
|
922
859
|
function asStringArray(value, description) {
|
|
@@ -992,6 +929,68 @@ function isValidFieldAggregationType(value) {
|
|
|
992
929
|
// src/evaluation/loaders/message-processor.ts
|
|
993
930
|
import { readFile as readFile3 } from "node:fs/promises";
|
|
994
931
|
import path4 from "node:path";
|
|
932
|
+
|
|
933
|
+
// src/evaluation/formatting/segment-formatter.ts
|
|
934
|
+
/**
 * Joins content parts into a single string.
 *
 * When at least one part is flagged `isFile`, every part that is a file AND has a
 * `displayPath` is wrapped in a `<file path="...">` element and all parts are
 * separated by a blank line. When no part is a file, the raw contents are joined
 * with a single space.
 *
 * @param {Array<{content: string, isFile?: boolean, displayPath?: string}>} parts
 * @returns {string} The combined text.
 */
function formatFileContents(parts) {
  const containsFile = parts.some((entry) => entry.isFile);
  if (!containsFile) {
    return parts.map((entry) => entry.content).join(" ");
  }
  const rendered = parts.map((entry) =>
    entry.isFile && entry.displayPath
      ? `<file path="${entry.displayPath}">\n${entry.content}\n</file>`
      : entry.content
  );
  return rendered.join("\n\n");
}
|
|
948
|
+
/**
 * Renders one message segment as text.
 *
 * - `text` segments yield their string `value`.
 * - `guideline_ref` segments yield an `<Attached: path>` marker when a path is present.
 * - `file` segments yield a path-only reference in `"agent"` mode, otherwise the
 *   trimmed file text wrapped by `formatFileContents`.
 *
 * @param {object} segment - Segment with a `type` plus `value` / `path` / `text` fields.
 * @param {string} [mode="lm"] - `"agent"` emits file references without contents.
 * @returns {string | undefined} The rendered text, or `undefined` when nothing can be rendered.
 */
function formatSegment(segment, mode = "lm") {
  switch (asString2(segment.type)) {
    case "text":
      return asString2(segment.value);
    case "guideline_ref": {
      const attachedPath = asString2(segment.path);
      if (!attachedPath) {
        return void 0;
      }
      return `<Attached: ${attachedPath}>`;
    }
    case "file": {
      const displayPath = asString2(segment.path);
      if (!displayPath) {
        return void 0;
      }
      if (mode === "agent") {
        return `<file: path="${displayPath}">`;
      }
      const body = asString2(segment.text);
      if (!body) {
        return void 0;
      }
      return formatFileContents([{ content: body.trim(), isFile: true, displayPath }]);
    }
    default:
      return void 0;
  }
}
|
|
972
|
+
/**
 * Reports whether any segment carries non-whitespace text.
 *
 * Only `text` segments (via `value`) and `file` segments (via `text`) can count;
 * `guideline_ref` and unknown segment types never do.
 *
 * @param {Array<object>} segments - Message segments to inspect.
 * @returns {boolean} `true` when at least one segment has visible content.
 */
function hasVisibleContent(segments) {
  const isNonBlank = (candidate) => typeof candidate === "string" && candidate.trim().length > 0;
  return segments.some((segment) => {
    switch (segment.type) {
      case "text":
        return isNonBlank(segment.value);
      case "file":
        return isNonBlank(segment.text);
      default:
        return false;
    }
  });
}
|
|
989
|
+
/**
 * Returns the argument unchanged when it is a string.
 *
 * @param {unknown} value - Candidate value.
 * @returns {string | undefined} The string, or `undefined` for every other type.
 */
function asString2(value) {
  const isText = typeof value === "string";
  return isText ? value : void 0;
}
|
|
992
|
+
|
|
993
|
+
// src/evaluation/loaders/message-processor.ts
|
|
995
994
|
var ANSI_YELLOW4 = "\x1B[33m";
|
|
996
995
|
var ANSI_RESET4 = "\x1B[0m";
|
|
997
996
|
async function processMessages(options) {
|
|
@@ -1297,9 +1296,6 @@ ${messageContent}`);
|
|
|
1297
1296
|
questionParts.push(formattedContent);
|
|
1298
1297
|
}
|
|
1299
1298
|
}
|
|
1300
|
-
if (testCase.code_snippets.length > 0) {
|
|
1301
|
-
questionParts.push(testCase.code_snippets.join("\n"));
|
|
1302
|
-
}
|
|
1303
1299
|
question = questionParts.map((part) => part.trim()).filter((part) => part.length > 0).join("\n\n");
|
|
1304
1300
|
}
|
|
1305
1301
|
const chatPrompt = useRoleMarkers ? buildChatPromptFromSegments({
|
|
@@ -1498,7 +1494,6 @@ async function loadEvalCases(evalFilePath, repoRoot, options) {
|
|
|
1498
1494
|
repoRootPath,
|
|
1499
1495
|
verbose
|
|
1500
1496
|
}) : [];
|
|
1501
|
-
const codeSnippets = extractCodeBlocks(inputSegments);
|
|
1502
1497
|
let referenceAnswer = "";
|
|
1503
1498
|
if (outputSegments.length > 0) {
|
|
1504
1499
|
const lastMessage = outputSegments[outputSegments.length - 1];
|
|
@@ -1571,7 +1566,6 @@ async function loadEvalCases(evalFilePath, repoRoot, options) {
|
|
|
1571
1566
|
guideline_paths: guidelinePaths.map((guidelinePath) => path6.resolve(guidelinePath)),
|
|
1572
1567
|
guideline_patterns: guidelinePatterns,
|
|
1573
1568
|
file_paths: allFilePaths,
|
|
1574
|
-
code_snippets: codeSnippets,
|
|
1575
1569
|
expected_outcome: outcome,
|
|
1576
1570
|
evaluator: evalCaseEvaluatorKind,
|
|
1577
1571
|
evaluators
|
|
@@ -5311,9 +5305,64 @@ function resolveAndCreateProvider(definition, env = process.env) {
|
|
|
5311
5305
|
return createProvider(resolved);
|
|
5312
5306
|
}
|
|
5313
5307
|
|
|
5314
|
-
// src/evaluation/evaluators.ts
|
|
5315
|
-
|
|
5316
|
-
|
|
5308
|
+
// src/evaluation/evaluators/scoring.ts
|
|
5309
|
+
/**
 * Maps a numeric score to a verdict label.
 *
 * @param {number} score - Score to classify.
 * @returns {"pass" | "borderline" | "fail"} `"pass"` at 0.8 and above, `"borderline"`
 *   at 0.6 and above, `"fail"` otherwise (including `NaN`).
 */
function scoreToVerdict(score) {
  // Ordered from the highest threshold down; the first band reached wins.
  const bands = [
    [0.8, "pass"],
    [0.6, "borderline"]
  ];
  for (const [minimum, verdict] of bands) {
    if (score >= minimum) {
      return verdict;
    }
  }
  return "fail";
}
|
|
5318
|
+
/**
 * Restricts a score to the inclusive range [0, 1].
 *
 * @param {number} value - Raw score.
 * @returns {number} 0 for `NaN`, infinities and non-numbers; otherwise `value`
 *   limited to at least 0 and at most 1.
 */
function clampScore(value) {
  // Number.isFinite is false for NaN, +/-Infinity and every non-number value.
  if (!Number.isFinite(value)) {
    return 0;
  }
  if (value < 0) {
    return 0;
  }
  return value > 1 ? 1 : value;
}
|
|
5330
|
+
/**
 * Finds the widest brace-delimited span in a string.
 *
 * The pattern is greedy, so the result runs from the first `{` to the last `}`.
 *
 * @param {string} text - Text that may embed a JSON object.
 * @returns {string | undefined} The matched span, or `undefined` when there is none.
 */
function extractJsonBlob(text) {
  const found = text.match(/\{[\s\S]*\}/);
  if (found === null) {
    return void 0;
  }
  return found[0];
}
|
|
5334
|
+
/**
 * Parses JSON embedded in free-form text.
 *
 * Markdown code fences are stripped first; the brace-delimited span found by
 * `extractJsonBlob` is parsed when there is one, otherwise the whole cleaned text.
 *
 * @param {unknown} text - Text to parse; non-strings are treated as empty text.
 * @returns {unknown} The parsed JSON value.
 * @throws {SyntaxError} When the selected text is not valid JSON.
 */
function parseJsonFromText(text) {
  let cleaned = "";
  if (typeof text === "string") {
    cleaned = text.replace(/```json\n?|```/g, "").trim();
  }
  const candidate = extractJsonBlob(cleaned) ?? cleaned;
  return JSON.parse(candidate);
}
|
|
5339
|
+
/**
 * Checks for a string that contains something other than whitespace.
 *
 * @param {unknown} value - Candidate value.
 * @returns {boolean} `true` only for strings that are non-empty after trimming.
 */
function isNonEmptyString(value) {
  if (typeof value !== "string") {
    return false;
  }
  return value.trim() !== "";
}
|
|
5342
|
+
/**
 * Parses JSON without throwing.
 *
 * @param {string} payload - JSON text.
 * @returns {unknown} The parsed value, or `undefined` when `JSON.parse` throws.
 */
function parseJsonSafe(payload) {
  let parsed;
  try {
    parsed = JSON.parse(payload);
  } catch {
    // Invalid input is reported as "no value" rather than as an exception.
    parsed = void 0;
  }
  return parsed;
}
|
|
5349
|
+
/**
 * Structural equality for JSON-like values.
 *
 * Primitives are compared with `===` (so `NaN` never equals `NaN`). Arrays must
 * have the same length and pairwise-equal items. Other objects must have the same
 * number of own enumerable keys, each present on both sides with equal values.
 *
 * @param {unknown} a - Left value.
 * @param {unknown} b - Right value.
 * @returns {boolean} `true` when the two values are structurally equal.
 */
function deepEqual(a, b) {
  if (a === b) {
    return true;
  }
  // Past this point the values are not identical, so only two non-null objects can still match.
  const bothObjects = typeof a === "object" && typeof b === "object" && a !== null && b !== null;
  if (!bothObjects) {
    return false;
  }
  const leftIsArray = Array.isArray(a);
  if (leftIsArray !== Array.isArray(b)) {
    return false;
  }
  if (leftIsArray) {
    return a.length === b.length && a.every((item, index) => deepEqual(item, b[index]));
  }
  const leftKeys = Object.keys(a);
  if (leftKeys.length !== Object.keys(b).length) {
    return false;
  }
  return leftKeys.every((key) => Object.hasOwn(b, key) && deepEqual(a[key], b[key]));
}
|
|
5317
5366
|
|
|
5318
5367
|
// src/runtime/exec.ts
|
|
5319
5368
|
function shellEscapePath(value) {
|
|
@@ -5338,7 +5387,9 @@ async function execFileWithStdinBun(argv, stdinPayload, options) {
|
|
|
5338
5387
|
cwd: options.cwd,
|
|
5339
5388
|
stdin: encoder.encode(stdinPayload),
|
|
5340
5389
|
stdout: "pipe",
|
|
5341
|
-
stderr: "pipe"
|
|
5390
|
+
stderr: "pipe",
|
|
5391
|
+
// Merge additional env vars with process.env
|
|
5392
|
+
env: options.env ? { ...process.env, ...options.env } : process.env
|
|
5342
5393
|
});
|
|
5343
5394
|
let timedOut = false;
|
|
5344
5395
|
const timeout = options.timeoutMs !== void 0 ? setTimeout(() => {
|
|
@@ -5373,7 +5424,9 @@ async function execFileWithStdinNode(argv, stdinPayload, options) {
|
|
|
5373
5424
|
const [cmd, ...args] = argv;
|
|
5374
5425
|
const child = spawn4(cmd, args, {
|
|
5375
5426
|
cwd: options.cwd,
|
|
5376
|
-
stdio: ["pipe", "pipe", "pipe"]
|
|
5427
|
+
stdio: ["pipe", "pipe", "pipe"],
|
|
5428
|
+
// Merge additional env vars with process.env
|
|
5429
|
+
env: options.env ? { ...process.env, ...options.env } : process.env
|
|
5377
5430
|
});
|
|
5378
5431
|
const stdoutChunks = [];
|
|
5379
5432
|
const stderrChunks = [];
|
|
@@ -5426,7 +5479,9 @@ async function execShellWithStdin(command, stdinPayload, options = {}) {
|
|
|
5426
5479
|
const child = spawn4(wrappedCommand, {
|
|
5427
5480
|
shell: true,
|
|
5428
5481
|
cwd: options.cwd,
|
|
5429
|
-
stdio: ["ignore", "ignore", "ignore"]
|
|
5482
|
+
stdio: ["ignore", "ignore", "ignore"],
|
|
5483
|
+
// Merge additional env vars with process.env
|
|
5484
|
+
env: options.env ? { ...process.env, ...options.env } : process.env
|
|
5430
5485
|
});
|
|
5431
5486
|
const timeout = options.timeoutMs ? setTimeout(() => {
|
|
5432
5487
|
child.kill();
|
|
@@ -5453,32 +5508,387 @@ async function execShellWithStdin(command, stdinPayload, options = {}) {
|
|
|
5453
5508
|
}
|
|
5454
5509
|
}
|
|
5455
5510
|
|
|
5456
|
-
// src/
|
|
5457
|
-
|
|
5458
|
-
|
|
5459
|
-
|
|
5511
|
+
// src/runtime/target-proxy.ts
|
|
5512
|
+
import { randomBytes } from "node:crypto";
|
|
5513
|
+
import { createServer } from "node:http";
|
|
5514
|
+
var DEFAULT_MAX_CALLS = 50;
|
|
5515
|
+
/**
 * Starts a token-protected HTTP proxy on 127.0.0.1 (ephemeral port) that forwards
 * requests to evaluation providers.
 *
 * Routes (all require `Authorization: Bearer <token>`):
 * - `GET /info` — target name, call budget and current call count.
 * - `POST /invoke` — body `{ question, systemPrompt?, evalCaseId?, attempt?, target? }`.
 * - `POST /invokeBatch` — body `{ requests: [...] }`, processed sequentially.
 *
 * @param {object} options
 * @param {object} options.defaultProvider - Provider used when no (or its own) target is named;
 *   must expose `targetName` and `invoke(request)`.
 * @param {(name: string) => object | undefined} [options.targetResolver] - Looks up other targets.
 * @param {string[]} [options.availableTargets] - Names reported to clients; defaults to the
 *   default provider's name.
 * @param {number} options.maxCalls - Maximum number of provider invocations.
 * @returns {Promise<{url: string, token: string, shutdown: () => Promise<void>,
 *   getUsageMetadata: () => {callCount: number, maxCalls: number}}>}
 */
async function createTargetProxy(options) {
  const { defaultProvider, targetResolver, availableTargets, maxCalls } = options;
  const token = randomBytes(32).toString("hex");
  let callCount = 0;
  let isShutdown = false;
  const targetsList = availableTargets ?? [defaultProvider.targetName];

  /** Picks the provider for `targetName`, or `undefined` when it cannot be resolved. */
  function resolveProvider(targetName) {
    if (targetName === void 0 || targetName === defaultProvider.targetName) {
      return defaultProvider;
    }
    if (targetResolver) {
      return targetResolver(targetName);
    }
    return void 0;
  }

  /**
   * Parses a request body as a JSON object. On failure it answers 400 (a client error,
   * not a server fault) and returns `undefined` so the caller can simply stop.
   */
  function parseJsonObject(raw, res) {
    let parsed;
    try {
      parsed = JSON.parse(raw);
    } catch {
      sendJson(res, 400, { error: "Invalid JSON body" });
      return void 0;
    }
    if (typeof parsed !== "object" || parsed === null) {
      sendJson(res, 400, { error: "Request body must be a JSON object" });
      return void 0;
    }
    return parsed;
  }

  /** Forwards one validated request to `provider` and shapes the proxy response. */
  async function forward(provider, request) {
    const response = await provider.invoke({
      question: request.question,
      systemPrompt: request.systemPrompt,
      evalCaseId: request.evalCaseId ?? "proxy",
      attempt: request.attempt ?? 1
    });
    const outputMessages = response.outputMessages ?? [];
    return {
      outputMessages,
      rawText: extractLastAssistantContent2(outputMessages)
    };
  }

  const server = createServer(async (req, res) => {
    res.setHeader("Access-Control-Allow-Origin", "*");
    res.setHeader("Access-Control-Allow-Methods", "GET, POST, OPTIONS");
    res.setHeader("Access-Control-Allow-Headers", "Content-Type, Authorization");
    if (req.method === "OPTIONS") {
      res.writeHead(204);
      res.end();
      return;
    }
    const authHeader = req.headers.authorization;
    if (!authHeader || authHeader !== `Bearer ${token}`) {
      sendJson(res, 401, { error: "Unauthorized" });
      return;
    }
    if (isShutdown) {
      sendJson(res, 503, { error: "Proxy is shutting down" });
      return;
    }
    const url2 = req.url ?? "";
    if (req.method === "GET" && url2 === "/info") {
      handleInfo(res);
      return;
    }
    if (req.method === "POST" && url2 === "/invoke") {
      await handleInvoke(req, res);
      return;
    }
    if (req.method === "POST" && url2 === "/invokeBatch") {
      await handleInvokeBatch(req, res);
      return;
    }
    sendJson(res, 404, { error: "Not found" });
  });

  /** Reports the proxy configuration and current usage. */
  function handleInfo(res) {
    const response = {
      targetName: defaultProvider.targetName,
      maxCalls,
      callCount,
      availableTargets: targetsList
    };
    sendJson(res, 200, response);
  }

  /** Handles `POST /invoke`: one provider call per request, bounded by `maxCalls`. */
  async function handleInvoke(req, res) {
    // Cheap early rejection so an exhausted budget does not require reading the body.
    if (callCount >= maxCalls) {
      sendJson(res, 429, { error: `Max calls exceeded (limit: ${maxCalls})` });
      return;
    }
    try {
      const request = parseJsonObject(await readBody(req), res);
      if (request === void 0) {
        return;
      }
      if (!request.question || typeof request.question !== "string") {
        sendJson(res, 400, { error: "Missing required field: question" });
        return;
      }
      const provider = resolveProvider(request.target);
      if (!provider) {
        sendJson(res, 400, {
          error: `Unknown target '${request.target}'. Available: ${targetsList.join(", ")}`
        });
        return;
      }
      // Other requests may have used up the budget while this body was being read.
      // Check and increment with no `await` in between so concurrent callers cannot
      // collectively exceed maxCalls.
      if (callCount >= maxCalls) {
        sendJson(res, 429, { error: `Max calls exceeded (limit: ${maxCalls})` });
        return;
      }
      callCount++;
      sendJson(res, 200, await forward(provider, request));
    } catch (error) {
      const message = error instanceof Error ? error.message : String(error);
      sendJson(res, 500, { error: message });
    }
  }

  /**
   * Handles `POST /invokeBatch`. Invalid entries produce an inline error response and do
   * not consume budget; valid entries are invoked one after another.
   */
  async function handleInvokeBatch(req, res) {
    try {
      const payload = parseJsonObject(await readBody(req), res);
      if (payload === void 0) {
        return;
      }
      const { requests } = payload;
      if (!Array.isArray(requests)) {
        sendJson(res, 400, { error: "Missing required field: requests (array)" });
        return;
      }
      if (callCount + requests.length > maxCalls) {
        sendJson(res, 429, {
          error: `Batch would exceed max calls (current: ${callCount}, batch: ${requests.length}, limit: ${maxCalls})`
        });
        return;
      }
      // Validate everything up front (synchronously) so the budget for the whole batch can
      // be reserved before the first `await`; otherwise concurrent requests could slip in
      // between individual invocations and push the total past maxCalls.
      const plan = requests.map((request) => {
        if (!request?.question || typeof request.question !== "string") {
          return { error: "Error: Missing required field: question" };
        }
        const provider = resolveProvider(request.target);
        if (!provider) {
          return {
            error: `Error: Unknown target '${request.target}'. Available: ${targetsList.join(", ")}`
          };
        }
        return { provider, request };
      });
      callCount += plan.filter((step) => step.provider !== void 0).length;
      const responses = [];
      for (const step of plan) {
        if (step.provider === void 0) {
          responses.push({ outputMessages: [], rawText: step.error });
          continue;
        }
        try {
          responses.push(await forward(step.provider, step.request));
        } catch (error) {
          // A failing entry is reported inline; the rest of the batch still runs.
          const message = error instanceof Error ? error.message : String(error);
          responses.push({
            outputMessages: [],
            rawText: `Error: ${message}`
          });
        }
      }
      sendJson(res, 200, { responses });
    } catch (error) {
      const message = error instanceof Error ? error.message : String(error);
      sendJson(res, 500, { error: message });
    }
  }

  // Port 0 asks the OS for any free port; startup errors reject the returned promise.
  await new Promise((resolve, reject) => {
    server.once("error", reject);
    server.listen(0, "127.0.0.1", () => {
      server.removeListener("error", reject);
      resolve();
    });
  });
  const address = server.address();
  const url = `http://127.0.0.1:${address.port}`;
  return {
    url,
    token,
    shutdown: async () => {
      isShutdown = true;
      return new Promise((resolve, reject) => {
        server.close((err) => {
          if (err) reject(err);
          else resolve();
        });
        // close() only completes once every connection has ended; drop idle keep-alive
        // sockets (where the runtime supports it) so shutdown is not held up by them.
        server.closeIdleConnections?.();
      });
    },
    getUsageMetadata: () => ({
      callCount,
      maxCalls
    })
  };
}
|
|
5695
|
+
/**
 * Writes a JSON response and ends it.
 *
 * @param {object} res - Response object exposing `writeHead` and `end`.
 * @param {number} statusCode - HTTP status code to send.
 * @param {unknown} body - Value serialized with `JSON.stringify`.
 */
function sendJson(res, statusCode, body) {
  const headers = { "Content-Type": "application/json" };
  res.writeHead(statusCode, headers);
  const serialized = JSON.stringify(body);
  res.end(serialized);
}
|
|
5699
|
+
/**
 * Collects a request stream into a UTF-8 string.
 *
 * @param {object} req - Event source emitting `data` (Buffer chunks), `end` and `error`.
 * @returns {Promise<string>} Resolves with the concatenated body on `end`; rejects with
 *   the emitted error on `error`.
 */
function readBody(req) {
  return new Promise((resolve, reject) => {
    const received = [];
    const onData = (piece) => {
      received.push(piece);
    };
    const onEnd = () => {
      resolve(Buffer.concat(received).toString("utf8"));
    };
    req.on("data", onData);
    req.on("end", onEnd);
    req.on("error", reject);
  });
}
|
|
5707
|
+
/**
 * Finds the text of the most recent assistant message.
 *
 * Messages are scanned from last to first. A string `content` is returned as-is; for
 * an array `content`, the first object part that has a `text` property is returned
 * (converted with `String`). Assistant messages with no usable text are skipped.
 *
 * @param {Array<{role?: string, content?: unknown}>} messages - Conversation messages.
 * @returns {string | undefined} The extracted text, or `undefined` when none is found.
 */
function extractLastAssistantContent2(messages) {
  let index = messages.length;
  while (index-- > 0) {
    const { role, content } = messages[index];
    if (role !== "assistant") {
      continue;
    }
    if (typeof content === "string") {
      return content;
    }
    if (!Array.isArray(content)) {
      continue;
    }
    const textPart = content.find(
      (part) => typeof part === "object" && part !== null && "text" in part
    );
    if (textPart !== void 0) {
      return String(textPart.text);
    }
  }
  return void 0;
}
|
|
5725
|
+
|
|
5726
|
+
// src/evaluation/case-conversion.ts
|
|
5727
|
+
/**
 * Converts a camelCase identifier to snake_case.
 *
 * Strings that begin with an uppercase ASCII letter are returned untouched; in all
 * other strings every uppercase ASCII letter becomes `_` plus its lowercase form.
 *
 * @param {string} str - Identifier to convert.
 * @returns {string} The converted identifier.
 */
function toSnakeCase(str) {
  const startsUppercase = /^[A-Z]/.test(str);
  return startsUppercase
    ? str
    : str.replace(/[A-Z]/g, (capital) => `_${capital.toLowerCase()}`);
}
|
|
5733
|
+
/**
 * Recursively converts object keys with `toSnakeCase`.
 *
 * Arrays are mapped element by element, objects are rebuilt with converted keys,
 * and primitives, `null`, `undefined` and `Date` instances are returned as-is.
 *
 * @param {unknown} obj - Value to convert.
 * @returns {unknown} A new structure with converted keys (inputs are not mutated).
 */
function toSnakeCaseDeep(obj) {
  if (obj === null || obj === void 0) {
    return obj;
  }
  if (Array.isArray(obj)) {
    return obj.map((item) => toSnakeCaseDeep(item));
  }
  // A Date has no own enumerable keys, so rebuilding it would produce `{}` and lose
  // the timestamp; keep it so JSON.stringify can still serialize it.
  if (obj instanceof Date) {
    return obj;
  }
  if (typeof obj === "object") {
    // Object.fromEntries defines own properties, so a key such as "__proto__" stays
    // ordinary data; assigning `result[key] = value` would rewire the result's
    // prototype and silently drop that entry.
    return Object.fromEntries(
      Object.entries(obj).map(([key, value]) => [toSnakeCase(key), toSnakeCaseDeep(value)])
    );
  }
  return obj;
}
|
|
5750
|
+
|
|
5751
|
+
// src/evaluation/evaluators/code-evaluator.ts
|
|
5752
|
+
/**
 * Evaluator that delegates scoring to an external script.
 *
 * The eval case is serialised to snake_case JSON and handed to the script
 * through `executeScript`; the script's stdout is parsed as JSON and its
 * `score`, `hits`, `misses`, `reasoning` and `details` fields are read.
 * When a `target` is configured and the context carries a judge provider, a
 * target proxy is created for the duration of the run and its URL/token are
 * passed to the script through environment variables.
 */
var CodeEvaluator = class {
  kind = "code";
  script;
  cwd;
  agentTimeoutMs;
  config;
  target;
  /**
   * @param {object} options - Supplies `script`, `cwd`, `agentTimeoutMs`,
   *   `config` and `target`; each is copied onto the instance as-is.
   */
  constructor(options) {
    const { script, cwd, agentTimeoutMs, config, target } = options;
    this.script = script;
    this.cwd = cwd;
    this.agentTimeoutMs = agentTimeoutMs;
    this.config = config;
    this.target = target;
  }
  /**
   * Runs the script for one eval case.
   *
   * @param {object} context - Evaluation context (`evalCase`, `candidate`,
   *   and optionally `outputMessages`, `traceSummary`, `judgeProvider`,
   *   `targetResolver`, `availableTargets`).
   * @returns {Promise<object>} The evaluation result. Script failures are
   *   reported as a score-0 "fail" result rather than thrown.
   */
  async evaluate(context) {
    const { evalCase } = context;
    const isNotGuideline = (filePath) => !evalCase.guideline_paths.includes(filePath);
    // Key order matters: it is the order the script sees in its JSON input.
    const inputPayload = JSON.stringify(
      toSnakeCaseDeep({
        question: evalCase.question,
        expectedOutcome: evalCase.expected_outcome,
        expectedMessages: evalCase.expected_messages,
        referenceAnswer: evalCase.reference_answer,
        candidateAnswer: context.candidate,
        outputMessages: context.outputMessages ?? null,
        guidelineFiles: evalCase.guideline_paths,
        inputFiles: evalCase.file_paths.filter(isNotGuideline),
        inputMessages: evalCase.input_messages,
        traceSummary: context.traceSummary ?? null,
        config: this.config ?? null
      }),
      null,
      2
    );
    let proxyEnv;
    let stopProxy;
    let readProxyUsage;
    if (this.target !== void 0 && context.judgeProvider) {
      const { url, token, shutdown, getUsageMetadata } = await createTargetProxy({
        defaultProvider: context.judgeProvider,
        targetResolver: context.targetResolver,
        availableTargets: context.availableTargets,
        maxCalls: this.target.max_calls ?? DEFAULT_MAX_CALLS
      });
      proxyEnv = {
        AGENTV_TARGET_PROXY_URL: url,
        AGENTV_TARGET_PROXY_TOKEN: token
      };
      stopProxy = shutdown;
      readProxyUsage = getUsageMetadata;
    }
    // Builds the request record attached to the result; proxy usage is
    // sampled at the moment of the call so it reflects the finished run.
    const describeRequest = (extra = {}) => {
      const usage = readProxyUsage?.();
      return {
        script: this.script,
        ...(this.cwd ? { cwd: this.cwd } : {}),
        ...(usage
          ? { target_proxy: { call_count: usage.callCount, max_calls: usage.maxCalls } }
          : {}),
        ...extra
      };
    };
    const pickStrings = (value) => (Array.isArray(value) ? value.filter(isNonEmptyString) : []);
    try {
      const stdout = await executeScript(
        this.script,
        inputPayload,
        this.agentTimeoutMs,
        this.cwd,
        proxyEnv
      );
      const parsed = parseJsonSafe(stdout);
      // A missing or non-numeric score counts as 0.
      const score = clampScore(typeof parsed?.score === "number" ? parsed.score : 0);
      const hits = pickStrings(parsed?.hits);
      const misses = pickStrings(parsed?.misses);
      const reasoning = typeof parsed?.reasoning === "string" ? parsed.reasoning : void 0;
      const hasPlainDetails =
        parsed?.details && typeof parsed.details === "object" && !Array.isArray(parsed.details);
      const details = hasPlainDetails ? parsed.details : void 0;
      const evaluatorRawRequest = describeRequest();
      return {
        score,
        verdict: scoreToVerdict(score),
        hits,
        misses,
        expectedAspectCount: hits.length + misses.length || 1,
        reasoning,
        evaluatorRawRequest,
        ...(details ? { details } : {})
      };
    } catch (error) {
      const message = error instanceof Error ? error.message : String(error);
      return {
        score: 0,
        verdict: "fail",
        hits: [],
        misses: [`Code evaluator failed: ${message}`],
        expectedAspectCount: 1,
        reasoning: message,
        evaluatorRawRequest: describeRequest({ error: message })
      };
    } finally {
      // Always tear the proxy down, whether the script succeeded or not.
      if (stopProxy) {
        await stopProxy();
      }
    }
  }
};
|
|
5865
|
+
/**
 * Runs an evaluator script, feeding `input` on stdin, and returns its
 * trimmed stdout.
 *
 * A string `scriptPath` is run through `execShellWithStdin`; any other value
 * is passed to `execFileWithStdin`.
 *
 * @param {string | unknown} scriptPath - Command to run.
 * @param {string} input - Text written to the script's stdin.
 * @param {number} [agentTimeoutMs] - Forwarded as `timeoutMs`.
 * @param {string} [cwd] - Working directory for the script.
 * @param {object} [env] - Environment variables forwarded to the runner.
 * @returns {Promise<string>} Trimmed stdout.
 * @throws {Error} When the exit code is non-zero; the message includes the
 *   formatted stderr when there is any.
 */
async function executeScript(scriptPath, input, agentTimeoutMs, cwd, env) {
  const run = typeof scriptPath === "string" ? execShellWithStdin : execFileWithStdin;
  const outcome = await run(scriptPath, input, { cwd, timeoutMs: agentTimeoutMs, env });
  const { stdout, stderr, exitCode } = outcome;
  if (exitCode === 0) {
    return stdout.trim();
  }
  const trimmedErr = formatStderr(stderr);
  const suffix = trimmedErr.length > 0 ? `: ${trimmedErr}` : "";
  throw new Error(`Code evaluator exited with code ${exitCode}${suffix}`);
}
|
|
5875
|
+
/**
 * Trims stderr output and caps it at 2000 characters.
 *
 * When the trimmed text is longer than the cap, only its last 2000
 * characters are kept, preceded by a one-line truncation notice.
 *
 * @param {string} stderr
 * @returns {string}
 */
function formatStderr(stderr) {
  const maxLength = 2e3;
  const text = stderr.trim();
  return text.length > maxLength
    ? `...(truncated, last ${maxLength} chars)\n${text.slice(-maxLength)}`
    : text;
}
|
|
5480
5885
|
|
|
5481
|
-
// src/evaluation/evaluators.ts
|
|
5886
|
+
// src/evaluation/evaluators/composite.ts
|
|
5887
|
+
import { generateText as generateText3 } from "ai";
|
|
5888
|
+
|
|
5889
|
+
// src/evaluation/evaluators/llm-judge.ts
|
|
5890
|
+
import { generateText as generateText2 } from "ai";
|
|
5891
|
+
import { z as z2 } from "zod";
|
|
5482
5892
|
var DEFAULT_EVALUATOR_TEMPLATE = `You are an expert evaluator. Your goal is to grade the candidate_answer based on how well it achieves the expected_outcome for the original task.
|
|
5483
5893
|
|
|
5484
5894
|
Use the reference_answer as a gold standard for a high-quality response (if provided). The reference_answer may be a simple text response, or it may contain a sequence of expected agent messages including tool calls. When it contains multiple messages, the last message represents the final expected answer. The candidate_answer does not need to match it verbatim, but should capture the key points and follow the same spirit.
|
|
@@ -5558,7 +5968,7 @@ var LlmJudgeEvaluator = class {
|
|
|
5558
5968
|
target: judgeProvider.targetName
|
|
5559
5969
|
};
|
|
5560
5970
|
try {
|
|
5561
|
-
const { data
|
|
5971
|
+
const { data } = await this.runWithRetry({
|
|
5562
5972
|
context,
|
|
5563
5973
|
judgeProvider,
|
|
5564
5974
|
systemPrompt,
|
|
@@ -5707,105 +6117,11 @@ You must return a valid JSON object matching this schema:
|
|
|
5707
6117
|
"overall_reasoning": "string (summary)"
|
|
5708
6118
|
}`;
|
|
5709
6119
|
}
|
|
5710
|
-
function
|
|
5711
|
-
|
|
5712
|
-
return
|
|
5713
|
-
}
|
|
5714
|
-
if (score >= 0.6) {
|
|
5715
|
-
return "borderline";
|
|
5716
|
-
}
|
|
5717
|
-
return "fail";
|
|
5718
|
-
}
|
|
5719
|
-
function clampScore(value) {
|
|
5720
|
-
if (Number.isNaN(value) || !Number.isFinite(value)) {
|
|
5721
|
-
return 0;
|
|
5722
|
-
}
|
|
5723
|
-
if (value < 0) {
|
|
5724
|
-
return 0;
|
|
5725
|
-
}
|
|
5726
|
-
if (value > 1) {
|
|
5727
|
-
return 1;
|
|
5728
|
-
}
|
|
5729
|
-
return value;
|
|
5730
|
-
}
|
|
5731
|
-
function extractJsonBlob(text) {
|
|
5732
|
-
const match = text.match(/\{[\s\S]*\}/);
|
|
5733
|
-
return match?.[0];
|
|
5734
|
-
}
|
|
5735
|
-
function parseJsonFromText(text) {
|
|
5736
|
-
const cleaned = typeof text === "string" ? text.replace(/```json\n?|```/g, "").trim() : "";
|
|
5737
|
-
const blob = extractJsonBlob(cleaned) ?? cleaned;
|
|
5738
|
-
return JSON.parse(blob);
|
|
5739
|
-
}
|
|
5740
|
-
function isNonEmptyString(value) {
|
|
5741
|
-
return typeof value === "string" && value.trim().length > 0;
|
|
6120
|
+
/**
 * Replaces `{{ name }}` placeholders in a template with values from
 * `variables`.
 *
 * Placeholder names may contain ASCII letters, digits and underscores, with
 * optional whitespace inside the braces. A placeholder is left untouched
 * when `variables` has no own property of that name or the value is
 * `null`/`undefined`.
 *
 * @param {string} template - Text containing zero or more placeholders.
 * @param {Record<string, unknown>} variables - Placeholder values.
 * @returns {string} The template with known placeholders substituted.
 */
function substituteVariables(template, variables) {
  return template.replace(/\{\{\s*([a-zA-Z0-9_]+)\s*\}\}/g, (placeholder, varName) => {
    // Only own properties count: without this guard `{{toString}}` or
    // `{{constructor}}` would resolve to inherited Object.prototype members
    // and be replaced with a function's source text.
    if (!Object.hasOwn(variables, varName)) {
      return placeholder;
    }
    return variables[varName] ?? placeholder;
  });
}
|
|
5743
|
-
var CodeEvaluator = class {
|
|
5744
|
-
kind = "code";
|
|
5745
|
-
script;
|
|
5746
|
-
cwd;
|
|
5747
|
-
agentTimeoutMs;
|
|
5748
|
-
config;
|
|
5749
|
-
constructor(options) {
|
|
5750
|
-
this.script = options.script;
|
|
5751
|
-
this.cwd = options.cwd;
|
|
5752
|
-
this.agentTimeoutMs = options.agentTimeoutMs;
|
|
5753
|
-
this.config = options.config;
|
|
5754
|
-
}
|
|
5755
|
-
async evaluate(context) {
|
|
5756
|
-
const payload = {
|
|
5757
|
-
question: context.evalCase.question,
|
|
5758
|
-
expectedOutcome: context.evalCase.expected_outcome,
|
|
5759
|
-
expectedMessages: context.evalCase.expected_messages,
|
|
5760
|
-
referenceAnswer: context.evalCase.reference_answer,
|
|
5761
|
-
candidateAnswer: context.candidate,
|
|
5762
|
-
outputMessages: context.outputMessages ?? null,
|
|
5763
|
-
guidelineFiles: context.evalCase.guideline_paths,
|
|
5764
|
-
inputFiles: context.evalCase.file_paths.filter(
|
|
5765
|
-
(path15) => !context.evalCase.guideline_paths.includes(path15)
|
|
5766
|
-
),
|
|
5767
|
-
inputMessages: context.evalCase.input_messages,
|
|
5768
|
-
traceSummary: context.traceSummary ?? null,
|
|
5769
|
-
config: this.config ?? null
|
|
5770
|
-
};
|
|
5771
|
-
const inputPayload = JSON.stringify(toSnakeCaseDeep(payload), null, 2);
|
|
5772
|
-
try {
|
|
5773
|
-
const stdout = await executeScript(this.script, inputPayload, this.agentTimeoutMs, this.cwd);
|
|
5774
|
-
const parsed = parseJsonSafe(stdout);
|
|
5775
|
-
const score = clampScore(typeof parsed?.score === "number" ? parsed.score : 0);
|
|
5776
|
-
const hits = Array.isArray(parsed?.hits) ? parsed.hits.filter(isNonEmptyString) : [];
|
|
5777
|
-
const misses = Array.isArray(parsed?.misses) ? parsed.misses.filter(isNonEmptyString) : [];
|
|
5778
|
-
const reasoning = typeof parsed?.reasoning === "string" ? parsed.reasoning : void 0;
|
|
5779
|
-
return {
|
|
5780
|
-
score,
|
|
5781
|
-
verdict: scoreToVerdict(score),
|
|
5782
|
-
hits,
|
|
5783
|
-
misses,
|
|
5784
|
-
expectedAspectCount: hits.length + misses.length || 1,
|
|
5785
|
-
reasoning,
|
|
5786
|
-
evaluatorRawRequest: {
|
|
5787
|
-
script: this.script,
|
|
5788
|
-
...this.cwd ? { cwd: this.cwd } : {}
|
|
5789
|
-
}
|
|
5790
|
-
};
|
|
5791
|
-
} catch (error) {
|
|
5792
|
-
const message = error instanceof Error ? error.message : String(error);
|
|
5793
|
-
return {
|
|
5794
|
-
score: 0,
|
|
5795
|
-
verdict: "fail",
|
|
5796
|
-
hits: [],
|
|
5797
|
-
misses: [`Code evaluator failed: ${message}`],
|
|
5798
|
-
expectedAspectCount: 1,
|
|
5799
|
-
reasoning: message,
|
|
5800
|
-
evaluatorRawRequest: {
|
|
5801
|
-
script: this.script,
|
|
5802
|
-
...this.cwd ? { cwd: this.cwd } : {},
|
|
5803
|
-
error: message
|
|
5804
|
-
}
|
|
5805
|
-
};
|
|
5806
|
-
}
|
|
5807
|
-
}
|
|
5808
|
-
};
|
|
5809
6125
|
function calculateRubricScore(result, rubrics) {
|
|
5810
6126
|
const rubricMap = new Map(rubrics.map((rubric) => [rubric.id, rubric]));
|
|
5811
6127
|
const hits = [];
|
|
@@ -5833,273 +6149,281 @@ function calculateRubricScore(result, rubrics) {
|
|
|
5833
6149
|
const verdict = failedRequired ? "fail" : scoreToVerdict(score);
|
|
5834
6150
|
return { score, verdict, hits, misses };
|
|
5835
6151
|
}
|
|
5836
|
-
|
|
5837
|
-
|
|
5838
|
-
|
|
5839
|
-
|
|
5840
|
-
|
|
5841
|
-
|
|
5842
|
-
|
|
5843
|
-
|
|
5844
|
-
|
|
5845
|
-
}
|
|
5846
|
-
function formatStderr(stderr) {
|
|
5847
|
-
const trimmed = stderr.trim();
|
|
5848
|
-
const maxLength = 2e3;
|
|
5849
|
-
if (trimmed.length <= maxLength) {
|
|
5850
|
-
return trimmed;
|
|
5851
|
-
}
|
|
5852
|
-
const tail = trimmed.slice(-maxLength);
|
|
5853
|
-
return `...(truncated, last ${maxLength} chars)
|
|
5854
|
-
${tail}`;
|
|
5855
|
-
}
|
|
5856
|
-
function parseJsonSafe(payload) {
|
|
5857
|
-
try {
|
|
5858
|
-
return JSON.parse(payload);
|
|
5859
|
-
} catch {
|
|
5860
|
-
return void 0;
|
|
5861
|
-
}
|
|
5862
|
-
}
|
|
5863
|
-
function substituteVariables(template, variables) {
|
|
5864
|
-
return template.replace(/\{\{\s*([a-zA-Z0-9_]+)\s*\}\}/g, (match, varName) => {
|
|
5865
|
-
return variables[varName] ?? match;
|
|
5866
|
-
});
|
|
5867
|
-
}
|
|
5868
|
-
function deepEqual(a, b) {
|
|
5869
|
-
if (a === b) return true;
|
|
5870
|
-
if (a === null || b === null) return a === b;
|
|
5871
|
-
if (typeof a !== typeof b) return false;
|
|
5872
|
-
if (typeof a !== "object") return a === b;
|
|
5873
|
-
if (Array.isArray(a) !== Array.isArray(b)) return false;
|
|
5874
|
-
if (Array.isArray(a) && Array.isArray(b)) {
|
|
5875
|
-
if (a.length !== b.length) return false;
|
|
5876
|
-
return a.every((val, i) => deepEqual(val, b[i]));
|
|
5877
|
-
}
|
|
5878
|
-
const aObj = a;
|
|
5879
|
-
const bObj = b;
|
|
5880
|
-
const aKeys = Object.keys(aObj);
|
|
5881
|
-
const bKeys = Object.keys(bObj);
|
|
5882
|
-
if (aKeys.length !== bKeys.length) return false;
|
|
5883
|
-
return aKeys.every((key) => Object.hasOwn(bObj, key) && deepEqual(aObj[key], bObj[key]));
|
|
5884
|
-
}
|
|
5885
|
-
function argsMatch(expected, actual) {
|
|
5886
|
-
if (expected === void 0) return true;
|
|
5887
|
-
if (expected === "any") return true;
|
|
5888
|
-
if (actual === void 0) return false;
|
|
5889
|
-
for (const key of Object.keys(expected)) {
|
|
5890
|
-
if (!Object.hasOwn(actual, key)) return false;
|
|
5891
|
-
if (!deepEqual(expected[key], actual[key])) return false;
|
|
5892
|
-
}
|
|
5893
|
-
return true;
|
|
5894
|
-
}
|
|
5895
|
-
var ToolTrajectoryEvaluator = class {
|
|
5896
|
-
kind = "tool_trajectory";
|
|
6152
|
+
|
|
6153
|
+
// src/evaluation/evaluators/composite.ts
|
|
6154
|
+
// Prompt used by the composite evaluator's LLM aggregator when the aggregator
// config does not supply its own. `{{EVALUATOR_RESULTS_JSON}}` is replaced
// with the serialised member results before the prompt is sent.
var DEFAULT_COMPOSITE_AGGREGATOR_PROMPT = [
  "Review the following evaluation results:",
  "{{EVALUATOR_RESULTS_JSON}}",
  "",
  "Decide the final score and verdict based on all evaluator results.",
  "Return a JSON object with: score (0.0-1.0), verdict (pass/fail/borderline), and reasoning."
].join("\n");
|
|
6159
|
+
/**
 * Evaluator that runs several member evaluators and combines their results.
 *
 * Members are created through `evaluatorFactory.create(memberConfig, context)`
 * and evaluated concurrently. The combined result is produced by the
 * configured aggregator: an external script (`code_judge`), an LLM
 * (`llm_judge`), or a weighted average of member scores for any other
 * aggregator type.
 */
var CompositeEvaluator = class {
  kind = "composite";
  config;
  evaluatorFactory;
  cwd;
  /**
   * @param {object} options - Supplies `config`, `evaluatorFactory` and `cwd`.
   */
  constructor(options) {
    this.config = options.config;
    this.evaluatorFactory = options.evaluatorFactory;
    this.cwd = options.cwd;
  }
  /**
   * Evaluates every member in parallel, then aggregates.
   *
   * @param {object} context - Evaluation context shared with all members.
   * @returns {Promise<object>} The aggregated evaluation result.
   */
  async evaluate(context) {
    const memberResults = await Promise.all(
      this.config.evaluators.map(async (memberConfig) => {
        const evaluator = this.evaluatorFactory.create(memberConfig, context);
        return {
          id: memberConfig.name,
          type: memberConfig.type,
          result: await evaluator.evaluate(context)
        };
      })
    );
    return this.aggregate(memberResults, context);
  }
  /**
   * Dispatches to the aggregation strategy named by `config.aggregator.type`.
   * A missing aggregator falls back to the weighted average rather than
   * throwing.
   *
   * @param {Array<{id: string, type: string, result: object}>} results
   * @param {object} context
   * @returns {Promise<object>}
   */
  async aggregate(results, context) {
    const aggregator = this.config.aggregator;
    switch (aggregator?.type) {
      case "code_judge":
        return this.runCodeAggregator(results, aggregator.path, aggregator.cwd ?? this.cwd);
      case "llm_judge":
        return this.runLlmAggregator(results, context, aggregator);
      default:
        return this.runWeightedAverage(results, aggregator?.weights);
    }
  }
  /**
   * Combines member scores as a weighted mean. Members without an entry in
   * `weights` get weight 1. Hits and misses are prefixed with the member id.
   *
   * @param {Array<{id: string, type: string, result: object}>} results
   * @param {Record<string, number>} [weights] - Weight per member id.
   * @returns {object} The aggregated result.
   */
  runWeightedAverage(results, weights) {
    let totalWeight = 0;
    let weightedSum = 0;
    const allHits = [];
    const allMisses = [];
    const reasoningParts = [];
    const evaluatorResults = [];
    for (const member of results) {
      const weight = weights?.[member.id] ?? 1;
      totalWeight += weight;
      weightedSum += member.result.score * weight;
      allHits.push(...member.result.hits.map((h) => `[${member.id}] ${h}`));
      allMisses.push(...member.result.misses.map((m) => `[${member.id}] ${m}`));
      if (member.result.reasoning) {
        reasoningParts.push(`${member.id}: ${member.result.reasoning}`);
      }
      evaluatorResults.push(summarizeCompositeMember(member, weight));
    }
    // Guard against division by zero when there are no members or the
    // weights cancel out; the verdict is derived from the clamped score so
    // score and verdict always agree.
    const score = clampScore(totalWeight > 0 ? weightedSum / totalWeight : 0);
    return {
      score,
      verdict: scoreToVerdict(score),
      hits: allHits,
      misses: allMisses,
      expectedAspectCount: Math.max(allHits.length + allMisses.length, 1),
      reasoning: reasoningParts.length > 0 ? reasoningParts.join("; ") : void 0,
      evaluatorRawRequest: {
        aggregator: "weighted_average",
        ...(weights ? { weights } : {})
      },
      evaluatorResults
    };
  }
  /**
   * Hands the member results (as `{ results: { [id]: result } }` JSON) to an
   * external script and reads the combined score from its stdout.
   *
   * @param {Array<{id: string, type: string, result: object}>} results
   * @param {string | unknown} scriptPath - Script passed to `executeScript`.
   * @param {string} [cwd] - Working directory for the script.
   * @param {Record<string, number>} [weights] - Only used to annotate the
   *   per-member summaries; defaults to weight 1 per member.
   * @returns {Promise<object>} The aggregated result; script failures yield a
   *   score-0 "fail" result instead of throwing.
   */
  async runCodeAggregator(results, scriptPath, cwd, weights) {
    const resultsObject = Object.fromEntries(results.map((r) => [r.id, r.result]));
    const inputPayload = JSON.stringify({ results: resultsObject }, null, 2);
    const evaluatorResults = results.map(
      (member) => summarizeCompositeMember(member, weights?.[member.id] ?? 1)
    );
    try {
      const stdout = await executeScript(scriptPath, inputPayload, void 0, cwd);
      const parsed = parseJsonSafe(stdout);
      const score = clampScore(typeof parsed?.score === "number" ? parsed.score : 0);
      const hits = Array.isArray(parsed?.hits) ? parsed.hits.filter(isNonEmptyString) : [];
      const misses = Array.isArray(parsed?.misses) ? parsed.misses.filter(isNonEmptyString) : [];
      const reasoning = typeof parsed?.reasoning === "string" ? parsed.reasoning : void 0;
      // The script may state its own verdict; otherwise derive it from the score.
      const verdict = typeof parsed?.verdict === "string" && (parsed.verdict === "pass" || parsed.verdict === "fail" || parsed.verdict === "borderline") ? parsed.verdict : scoreToVerdict(score);
      return {
        score,
        verdict,
        hits,
        misses,
        expectedAspectCount: hits.length + misses.length || 1,
        reasoning,
        evaluatorRawRequest: {
          aggregator: "code_judge",
          script: scriptPath
        },
        evaluatorResults
      };
    } catch (error) {
      const message = error instanceof Error ? error.message : String(error);
      return {
        score: 0,
        verdict: "fail",
        hits: [],
        misses: [`Code aggregator failed: ${message}`],
        expectedAspectCount: 1,
        reasoning: message,
        evaluatorRawRequest: {
          aggregator: "code_judge",
          script: scriptPath,
          error: message
        },
        evaluatorResults
      };
    }
  }
  /**
   * Asks the judge provider to combine the member results.
   *
   * The provider's language model is used directly when it exposes one via
   * `asLanguageModel()`; otherwise the request goes through `invoke`.
   *
   * @param {Array<{id: string, type: string, result: object}>} results
   * @param {object} context - Must carry `judgeProvider`.
   * @param {object} config - Aggregator config; `prompt` overrides the
   *   default prompt template.
   * @returns {Promise<object>} The aggregated result; judge or parsing
   *   failures yield a score-0 "fail" result that records the error.
   * @throws {Error} When the context has no judge provider.
   */
  async runLlmAggregator(results, context, config) {
    const judgeProvider = context.judgeProvider;
    if (!judgeProvider) {
      throw new Error("No judge provider available for LLM aggregation");
    }
    const resultsObject = Object.fromEntries(results.map((r) => [r.id, r.result]));
    const resultsJson = JSON.stringify(resultsObject, null, 2);
    const evaluatorResults = results.map((member) => summarizeCompositeMember(member));
    const promptTemplate = config.prompt ?? DEFAULT_COMPOSITE_AGGREGATOR_PROMPT;
    // A replacer function is required here: with a plain string replacement,
    // sequences such as `$&` or `$$` inside the results JSON would be treated
    // as replacement patterns and corrupt the prompt.
    const userPrompt = promptTemplate.replace(/\{\{EVALUATOR_RESULTS_JSON\}\}/g, () => resultsJson);
    const systemPrompt = buildOutputSchema();
    const evaluatorRawRequest = {
      aggregator: "llm_judge",
      userPrompt,
      systemPrompt,
      target: judgeProvider.targetName
    };
    try {
      const model = judgeProvider.asLanguageModel?.();
      let rawText;
      if (model) {
        const { text } = await generateText3({
          model,
          system: systemPrompt,
          prompt: userPrompt
        });
        rawText = text;
      } else {
        const response = await judgeProvider.invoke({
          question: userPrompt,
          systemPrompt,
          evalCaseId: context.evalCase.id,
          attempt: context.attempt
        });
        rawText = extractLastAssistantContent(response.outputMessages);
      }
      const data = freeformEvaluationSchema.parse(parseJsonFromText(rawText));
      const score = clampScore(data.score);
      // At most four hits and four misses are kept from the judge's answer.
      const hits = Array.isArray(data.hits) ? data.hits.filter(isNonEmptyString).slice(0, 4) : [];
      const misses = Array.isArray(data.misses) ? data.misses.filter(isNonEmptyString).slice(0, 4) : [];
      return {
        score,
        verdict: scoreToVerdict(score),
        hits,
        misses,
        expectedAspectCount: Math.max(hits.length + misses.length, 1),
        reasoning: data.reasoning,
        evaluatorRawRequest,
        evaluatorResults
      };
    } catch (error) {
      // Report why aggregation failed instead of returning an unexplained
      // zero, mirroring the code aggregator's failure result.
      const message = error instanceof Error ? error.message : String(error);
      return {
        score: 0,
        verdict: "fail",
        hits: [],
        misses: [`LLM aggregator failed: ${message}`],
        expectedAspectCount: 1,
        reasoning: message,
        evaluatorRawRequest,
        evaluatorResults
      };
    }
  }
};
// Builds the per-member entry stored in a composite result's
// `evaluatorResults`; `weight` is included only when one is supplied.
function summarizeCompositeMember(member, weight) {
  const { result } = member;
  return {
    name: member.id,
    type: member.type,
    score: result.score,
    ...(weight !== void 0 ? { weight } : {}),
    verdict: result.verdict,
    hits: [...result.hits],
    misses: [...result.misses],
    reasoning: result.reasoning,
    evaluatorRawRequest: result.evaluatorRawRequest,
    evaluatorResults: result.evaluatorResults,
    details: result.details
  };
}
|
|
6381
|
+
|
|
6382
|
+
// src/evaluation/evaluators/cost.ts
|
|
6383
|
+
/**
 * Evaluator that passes when the execution cost reported in the trace
 * summary does not exceed the configured budget.
 */
var CostEvaluator = class {
  kind = "cost";
  config;
  /**
   * @param {object} options - `options.config.budget` is the spending limit
   *   that `traceSummary.costUsd` is compared against.
   */
  constructor(options) {
    this.config = options.config;
  }
  /**
   * @param {object} context - Reads `context.traceSummary?.costUsd`.
   * @returns {object} Score 1 / "pass" when cost <= budget, otherwise
   *   score 0 / "fail"; also "fail" when no cost was reported.
   */
  evaluate(context) {
    const budget = this.config.budget;
    const costUsd = context.traceSummary?.costUsd;
    if (costUsd === void 0) {
      return {
        score: 0,
        verdict: "fail",
        hits: [],
        misses: ["No cost data available in trace"],
        expectedAspectCount: 1,
        reasoning: "Execution cost not reported by provider",
        evaluatorRawRequest: {
          type: "cost",
          budget,
          costUsd: null
        }
      };
    }
    const usd = (amount) => `$${amount.toFixed(4)}`;
    const spent = usd(costUsd);
    const limit = usd(budget);
    const withinBudget = costUsd <= budget;
    // The same sentence is reported as a hit or a miss depending on outcome.
    const comparison = `Cost ${spent} ${withinBudget ? "<=" : ">"} ${limit} budget`;
    return {
      score: withinBudget ? 1 : 0,
      verdict: withinBudget ? "pass" : "fail",
      hits: withinBudget ? [comparison] : [],
      misses: withinBudget ? [] : [comparison],
      expectedAspectCount: 1,
      reasoning: `Execution cost ${spent} (budget: ${limit})`,
      evaluatorRawRequest: {
        type: "cost",
        budget,
        costUsd
      }
    };
  }
};
|
|
6425
|
+
|
|
6426
|
+
// src/evaluation/evaluators/field-accuracy.ts
|
|
6103
6427
|
var DEFAULT_DATE_FORMATS = [
|
|
6104
6428
|
"YYYY-MM-DDTHH:mm:ssZ",
|
|
6105
6429
|
// ISO with timezone
|
|
@@ -6312,434 +6636,209 @@ var FieldAccuracyEvaluator = class {
|
|
|
6312
6636
|
}
|
|
6313
6637
|
if (!Number.isFinite(candidateNum) || !Number.isFinite(expectedNum)) {
|
|
6314
6638
|
return {
|
|
6315
|
-
path: path15,
|
|
6316
|
-
score: 0,
|
|
6317
|
-
weight,
|
|
6318
|
-
hit: false,
|
|
6319
|
-
message: `${path15} (invalid numeric value)`
|
|
6320
|
-
};
|
|
6321
|
-
}
|
|
6322
|
-
const diff = Math.abs(candidateNum - expectedNum);
|
|
6323
|
-
let withinTolerance;
|
|
6324
|
-
if (relative) {
|
|
6325
|
-
const relativeDiff = expectedNum === 0 ? diff : diff / Math.abs(expectedNum);
|
|
6326
|
-
withinTolerance = relativeDiff <= tolerance;
|
|
6327
|
-
} else {
|
|
6328
|
-
withinTolerance = diff <= tolerance;
|
|
6329
|
-
}
|
|
6330
|
-
if (withinTolerance) {
|
|
6331
|
-
return {
|
|
6332
|
-
path: path15,
|
|
6333
|
-
score: 1,
|
|
6334
|
-
weight,
|
|
6335
|
-
hit: true,
|
|
6336
|
-
message: `${path15} (within tolerance: diff=${diff.toFixed(2)})`
|
|
6337
|
-
};
|
|
6338
|
-
}
|
|
6339
|
-
return {
|
|
6340
|
-
path: path15,
|
|
6341
|
-
score: 0,
|
|
6342
|
-
weight,
|
|
6343
|
-
hit: false,
|
|
6344
|
-
message: `${path15} (outside tolerance: diff=${diff.toFixed(2)}, tolerance=${tolerance})`
|
|
6345
|
-
};
|
|
6346
|
-
}
|
|
6347
|
-
/**
|
|
6348
|
-
* Date comparison with format normalization.
|
|
6349
|
-
*/
|
|
6350
|
-
compareDate(path15, candidateValue, expectedValue, fieldConfig, weight) {
|
|
6351
|
-
const formats = fieldConfig.formats ?? DEFAULT_DATE_FORMATS;
|
|
6352
|
-
const candidateDate = parseDate(String(candidateValue), formats);
|
|
6353
|
-
const expectedDate = parseDate(String(expectedValue), formats);
|
|
6354
|
-
if (candidateDate === null) {
|
|
6355
|
-
return {
|
|
6356
|
-
path: path15,
|
|
6357
|
-
score: 0,
|
|
6358
|
-
weight,
|
|
6359
|
-
hit: false,
|
|
6360
|
-
message: `${path15} (unparseable candidate date)`
|
|
6361
|
-
};
|
|
6362
|
-
}
|
|
6363
|
-
if (expectedDate === null) {
|
|
6364
|
-
return {
|
|
6365
|
-
path: path15,
|
|
6366
|
-
score: 0,
|
|
6367
|
-
weight,
|
|
6368
|
-
hit: false,
|
|
6369
|
-
message: `${path15} (unparseable expected date)`
|
|
6370
|
-
};
|
|
6371
|
-
}
|
|
6372
|
-
if (candidateDate.getFullYear() === expectedDate.getFullYear() && candidateDate.getMonth() === expectedDate.getMonth() && candidateDate.getDate() === expectedDate.getDate()) {
|
|
6373
|
-
return {
|
|
6374
|
-
path: path15,
|
|
6375
|
-
score: 1,
|
|
6376
|
-
weight,
|
|
6377
|
-
hit: true,
|
|
6378
|
-
message: path15
|
|
6379
|
-
};
|
|
6380
|
-
}
|
|
6381
|
-
return {
|
|
6382
|
-
path: path15,
|
|
6383
|
-
score: 0,
|
|
6384
|
-
weight,
|
|
6385
|
-
hit: false,
|
|
6386
|
-
message: `${path15} (date mismatch: got ${formatDateISO(candidateDate)}, expected ${formatDateISO(expectedDate)})`
|
|
6387
|
-
};
|
|
6388
|
-
}
|
|
6389
|
-
/**
|
|
6390
|
-
* Aggregate field results using configured strategy.
|
|
6391
|
-
*/
|
|
6392
|
-
aggregateResults(results) {
|
|
6393
|
-
const aggregation = this.config.aggregation ?? "weighted_average";
|
|
6394
|
-
const hits = [];
|
|
6395
|
-
const misses = [];
|
|
6396
|
-
for (const result of results) {
|
|
6397
|
-
if (result.hit) {
|
|
6398
|
-
hits.push(result.message);
|
|
6399
|
-
} else {
|
|
6400
|
-
misses.push(result.message);
|
|
6401
|
-
}
|
|
6402
|
-
}
|
|
6403
|
-
let score;
|
|
6404
|
-
if (aggregation === "all_or_nothing") {
|
|
6405
|
-
score = misses.length === 0 ? 1 : 0;
|
|
6406
|
-
} else {
|
|
6407
|
-
const totalWeight = results.reduce((sum, r) => sum + r.weight, 0);
|
|
6408
|
-
if (totalWeight === 0) {
|
|
6409
|
-
score = results.length === 0 ? 1 : 0;
|
|
6410
|
-
} else {
|
|
6411
|
-
const weightedSum = results.reduce((sum, r) => sum + r.score * r.weight, 0);
|
|
6412
|
-
score = weightedSum / totalWeight;
|
|
6413
|
-
}
|
|
6414
|
-
}
|
|
6415
|
-
const reasoning = `${hits.length}/${results.length} fields matched`;
|
|
6416
|
-
return {
|
|
6417
|
-
score: clampScore(score),
|
|
6418
|
-
verdict: scoreToVerdict(score),
|
|
6419
|
-
hits: hits.slice(0, 4),
|
|
6420
|
-
misses: misses.slice(0, 4),
|
|
6421
|
-
expectedAspectCount: results.length,
|
|
6422
|
-
reasoning
|
|
6423
|
-
};
|
|
6424
|
-
}
|
|
6425
|
-
};
|
|
6426
|
-
function resolvePath(obj, path15) {
|
|
6427
|
-
if (!path15 || !obj) {
|
|
6428
|
-
return void 0;
|
|
6429
|
-
}
|
|
6430
|
-
const parts = path15.split(/\.|\[|\]/).filter((p) => p.length > 0);
|
|
6431
|
-
let current = obj;
|
|
6432
|
-
for (const part of parts) {
|
|
6433
|
-
if (current === null || current === void 0) {
|
|
6434
|
-
return void 0;
|
|
6435
|
-
}
|
|
6436
|
-
if (typeof current !== "object") {
|
|
6437
|
-
return void 0;
|
|
6438
|
-
}
|
|
6439
|
-
const isIndex = /^\d+$/.test(part);
|
|
6440
|
-
if (isIndex && Array.isArray(current)) {
|
|
6441
|
-
current = current[Number.parseInt(part, 10)];
|
|
6442
|
-
} else {
|
|
6443
|
-
current = current[part];
|
|
6444
|
-
}
|
|
6445
|
-
}
|
|
6446
|
-
return current;
|
|
6447
|
-
}
|
|
6448
|
-
function toNumber(value) {
|
|
6449
|
-
if (typeof value === "number") {
|
|
6450
|
-
return value;
|
|
6451
|
-
}
|
|
6452
|
-
if (typeof value === "string") {
|
|
6453
|
-
const num = Number.parseFloat(value);
|
|
6454
|
-
return Number.isNaN(num) ? null : num;
|
|
6455
|
-
}
|
|
6456
|
-
return null;
|
|
6457
|
-
}
|
|
6458
|
-
function parseDate(dateStr, formats) {
|
|
6459
|
-
if (!dateStr) return null;
|
|
6460
|
-
const trimmed = dateStr.trim();
|
|
6461
|
-
const isoDate = new Date(trimmed);
|
|
6462
|
-
if (!Number.isNaN(isoDate.getTime())) {
|
|
6463
|
-
return isoDate;
|
|
6464
|
-
}
|
|
6465
|
-
const localizedMatch = trimmed.match(/^(\d{1,2})-([A-Za-z]{3,9})-(\d{4})$/);
|
|
6466
|
-
if (localizedMatch) {
|
|
6467
|
-
const day = Number.parseInt(localizedMatch[1], 10);
|
|
6468
|
-
const monthName = localizedMatch[2].toLowerCase();
|
|
6469
|
-
const year = Number.parseInt(localizedMatch[3], 10);
|
|
6470
|
-
const month = MONTH_NAMES[monthName];
|
|
6471
|
-
if (month !== void 0) {
|
|
6472
|
-
return new Date(year, month, day);
|
|
6473
|
-
}
|
|
6474
|
-
}
|
|
6475
|
-
const usMatch = trimmed.match(/^(\d{1,2})[\/\-](\d{1,2})[\/\-](\d{4})$/);
|
|
6476
|
-
if (usMatch) {
|
|
6477
|
-
const hasUSFormat = formats.some((f) => f.includes("MM/DD") || f.includes("MM-DD"));
|
|
6478
|
-
const hasEUFormat = formats.some((f) => f.includes("DD/MM") || f.includes("DD-MM"));
|
|
6479
|
-
if (hasUSFormat && !hasEUFormat) {
|
|
6480
|
-
const month = Number.parseInt(usMatch[1], 10) - 1;
|
|
6481
|
-
const day = Number.parseInt(usMatch[2], 10);
|
|
6482
|
-
const year = Number.parseInt(usMatch[3], 10);
|
|
6483
|
-
if (month >= 0 && month <= 11 && day >= 1 && day <= 31) {
|
|
6484
|
-
return new Date(year, month, day);
|
|
6485
|
-
}
|
|
6486
|
-
} else if (hasEUFormat && !hasUSFormat) {
|
|
6487
|
-
const day = Number.parseInt(usMatch[1], 10);
|
|
6488
|
-
const month = Number.parseInt(usMatch[2], 10) - 1;
|
|
6489
|
-
const year = Number.parseInt(usMatch[3], 10);
|
|
6490
|
-
if (month >= 0 && month <= 11 && day >= 1 && day <= 31) {
|
|
6491
|
-
return new Date(year, month, day);
|
|
6492
|
-
}
|
|
6493
|
-
} else {
|
|
6494
|
-
const num1 = Number.parseInt(usMatch[1], 10);
|
|
6495
|
-
const num2 = Number.parseInt(usMatch[2], 10);
|
|
6496
|
-
const year = Number.parseInt(usMatch[3], 10);
|
|
6497
|
-
if (num1 > 12 && num2 <= 12) {
|
|
6498
|
-
return new Date(year, num2 - 1, num1);
|
|
6499
|
-
}
|
|
6500
|
-
if (num2 > 12 && num1 <= 12) {
|
|
6501
|
-
return new Date(year, num1 - 1, num2);
|
|
6502
|
-
}
|
|
6503
|
-
if (num1 <= 12 && num2 <= 31) {
|
|
6504
|
-
return new Date(year, num1 - 1, num2);
|
|
6505
|
-
}
|
|
6506
|
-
}
|
|
6507
|
-
}
|
|
6508
|
-
return null;
|
|
6509
|
-
}
|
|
6510
|
-
function formatDateISO(date) {
|
|
6511
|
-
return date.toISOString().split("T")[0];
|
|
6512
|
-
}
|
|
6513
|
-
function parseJsonFromTextSafe(text) {
|
|
6514
|
-
const cleaned = typeof text === "string" ? text.replace(/```json\n?|```/g, "").trim() : "";
|
|
6515
|
-
const match = cleaned.match(/\{[\s\S]*\}/);
|
|
6516
|
-
const blob = match?.[0] ?? cleaned;
|
|
6517
|
-
return JSON.parse(blob);
|
|
6518
|
-
}
|
|
6519
|
-
var DEFAULT_COMPOSITE_AGGREGATOR_PROMPT = `Review the following evaluation results:
|
|
6520
|
-
{{EVALUATOR_RESULTS_JSON}}
|
|
6521
|
-
|
|
6522
|
-
Decide the final score and verdict based on all evaluator results.
|
|
6523
|
-
Return a JSON object with: score (0.0-1.0), verdict (pass/fail/borderline), and reasoning.`;
|
|
6524
|
-
var CompositeEvaluator = class {
|
|
6525
|
-
kind = "composite";
|
|
6526
|
-
config;
|
|
6527
|
-
evaluatorFactory;
|
|
6528
|
-
cwd;
|
|
6529
|
-
constructor(options) {
|
|
6530
|
-
this.config = options.config;
|
|
6531
|
-
this.evaluatorFactory = options.evaluatorFactory;
|
|
6532
|
-
this.cwd = options.cwd;
|
|
6533
|
-
}
|
|
6534
|
-
async evaluate(context) {
|
|
6535
|
-
const memberResults = await Promise.all(
|
|
6536
|
-
this.config.evaluators.map(async (memberConfig) => {
|
|
6537
|
-
const evaluator = this.evaluatorFactory.create(memberConfig, context);
|
|
6538
|
-
return {
|
|
6539
|
-
id: memberConfig.name,
|
|
6540
|
-
type: memberConfig.type,
|
|
6541
|
-
result: await evaluator.evaluate(context)
|
|
6542
|
-
};
|
|
6543
|
-
})
|
|
6544
|
-
);
|
|
6545
|
-
return this.aggregate(memberResults, context);
|
|
6546
|
-
}
|
|
6547
|
-
async aggregate(results, context) {
|
|
6548
|
-
const aggregator = this.config.aggregator;
|
|
6549
|
-
switch (aggregator.type) {
|
|
6550
|
-
case "code_judge":
|
|
6551
|
-
return this.runCodeAggregator(results, aggregator.path, aggregator.cwd ?? this.cwd);
|
|
6552
|
-
case "llm_judge":
|
|
6553
|
-
return this.runLlmAggregator(results, context, aggregator);
|
|
6554
|
-
default:
|
|
6555
|
-
return this.runWeightedAverage(results, aggregator.weights);
|
|
6556
|
-
}
|
|
6557
|
-
}
|
|
6558
|
-
runWeightedAverage(results, weights) {
|
|
6559
|
-
let totalWeight = 0;
|
|
6560
|
-
let weightedSum = 0;
|
|
6561
|
-
const allHits = [];
|
|
6562
|
-
const allMisses = [];
|
|
6563
|
-
const reasoningParts = [];
|
|
6564
|
-
const evaluatorResults = [];
|
|
6565
|
-
for (const member of results) {
|
|
6566
|
-
const weight = weights?.[member.id] ?? 1;
|
|
6567
|
-
totalWeight += weight;
|
|
6568
|
-
weightedSum += member.result.score * weight;
|
|
6569
|
-
allHits.push(...member.result.hits.map((h) => `[${member.id}] ${h}`));
|
|
6570
|
-
allMisses.push(...member.result.misses.map((m) => `[${member.id}] ${m}`));
|
|
6571
|
-
if (member.result.reasoning) {
|
|
6572
|
-
reasoningParts.push(`${member.id}: ${member.result.reasoning}`);
|
|
6573
|
-
}
|
|
6574
|
-
evaluatorResults.push({
|
|
6575
|
-
name: member.id,
|
|
6576
|
-
type: member.type,
|
|
6577
|
-
score: member.result.score,
|
|
6578
|
-
weight,
|
|
6579
|
-
verdict: member.result.verdict,
|
|
6580
|
-
hits: [...member.result.hits],
|
|
6581
|
-
misses: [...member.result.misses],
|
|
6582
|
-
reasoning: member.result.reasoning,
|
|
6583
|
-
evaluatorRawRequest: member.result.evaluatorRawRequest,
|
|
6584
|
-
evaluatorResults: member.result.evaluatorResults
|
|
6585
|
-
});
|
|
6586
|
-
}
|
|
6587
|
-
const finalScore = totalWeight > 0 ? weightedSum / totalWeight : 0;
|
|
6588
|
-
return {
|
|
6589
|
-
score: clampScore(finalScore),
|
|
6590
|
-
verdict: scoreToVerdict(finalScore),
|
|
6591
|
-
hits: allHits,
|
|
6592
|
-
misses: allMisses,
|
|
6593
|
-
expectedAspectCount: Math.max(allHits.length + allMisses.length, 1),
|
|
6594
|
-
reasoning: reasoningParts.length > 0 ? reasoningParts.join("; ") : void 0,
|
|
6595
|
-
evaluatorRawRequest: {
|
|
6596
|
-
aggregator: "weighted_average",
|
|
6597
|
-
...weights ? { weights } : {}
|
|
6598
|
-
},
|
|
6599
|
-
evaluatorResults
|
|
6600
|
-
};
|
|
6601
|
-
}
|
|
6602
|
-
async runCodeAggregator(results, scriptPath, cwd, weights) {
|
|
6603
|
-
const resultsObject = Object.fromEntries(results.map((r) => [r.id, r.result]));
|
|
6604
|
-
const inputPayload = JSON.stringify({ results: resultsObject }, null, 2);
|
|
6605
|
-
const evaluatorResults = results.map((member) => ({
|
|
6606
|
-
name: member.id,
|
|
6607
|
-
type: member.type,
|
|
6608
|
-
score: member.result.score,
|
|
6609
|
-
weight: weights?.[member.id] ?? 1,
|
|
6610
|
-
verdict: member.result.verdict,
|
|
6611
|
-
hits: [...member.result.hits],
|
|
6612
|
-
misses: [...member.result.misses],
|
|
6613
|
-
reasoning: member.result.reasoning,
|
|
6614
|
-
evaluatorRawRequest: member.result.evaluatorRawRequest,
|
|
6615
|
-
evaluatorResults: member.result.evaluatorResults
|
|
6616
|
-
}));
|
|
6617
|
-
try {
|
|
6618
|
-
const stdout = await executeScript(scriptPath, inputPayload, void 0, cwd);
|
|
6619
|
-
const parsed = parseJsonSafe(stdout);
|
|
6620
|
-
const score = clampScore(typeof parsed?.score === "number" ? parsed.score : 0);
|
|
6621
|
-
const hits = Array.isArray(parsed?.hits) ? parsed.hits.filter(isNonEmptyString) : [];
|
|
6622
|
-
const misses = Array.isArray(parsed?.misses) ? parsed.misses.filter(isNonEmptyString) : [];
|
|
6623
|
-
const reasoning = typeof parsed?.reasoning === "string" ? parsed.reasoning : void 0;
|
|
6624
|
-
const verdict = typeof parsed?.verdict === "string" && (parsed.verdict === "pass" || parsed.verdict === "fail" || parsed.verdict === "borderline") ? parsed.verdict : scoreToVerdict(score);
|
|
6625
|
-
return {
|
|
6626
|
-
score,
|
|
6627
|
-
verdict,
|
|
6628
|
-
hits,
|
|
6629
|
-
misses,
|
|
6630
|
-
expectedAspectCount: hits.length + misses.length || 1,
|
|
6631
|
-
reasoning,
|
|
6632
|
-
evaluatorRawRequest: {
|
|
6633
|
-
aggregator: "code_judge",
|
|
6634
|
-
script: scriptPath
|
|
6635
|
-
},
|
|
6636
|
-
evaluatorResults
|
|
6637
|
-
};
|
|
6638
|
-
} catch (error) {
|
|
6639
|
-
const message = error instanceof Error ? error.message : String(error);
|
|
6640
|
-
return {
|
|
6641
|
-
score: 0,
|
|
6642
|
-
verdict: "fail",
|
|
6643
|
-
hits: [],
|
|
6644
|
-
misses: [`Code aggregator failed: ${message}`],
|
|
6645
|
-
expectedAspectCount: 1,
|
|
6646
|
-
reasoning: message,
|
|
6647
|
-
evaluatorRawRequest: {
|
|
6648
|
-
aggregator: "code_judge",
|
|
6649
|
-
script: scriptPath,
|
|
6650
|
-
error: message
|
|
6651
|
-
},
|
|
6652
|
-
evaluatorResults
|
|
6653
|
-
};
|
|
6654
|
-
}
|
|
6655
|
-
}
|
|
6656
|
-
async runLlmAggregator(results, context, config) {
|
|
6657
|
-
const judgeProvider = context.judgeProvider;
|
|
6658
|
-
if (!judgeProvider) {
|
|
6659
|
-
throw new Error("No judge provider available for LLM aggregation");
|
|
6660
|
-
}
|
|
6661
|
-
const resultsObject = Object.fromEntries(results.map((r) => [r.id, r.result]));
|
|
6662
|
-
const resultsJson = JSON.stringify(resultsObject, null, 2);
|
|
6663
|
-
const evaluatorResults = results.map((member) => ({
|
|
6664
|
-
name: member.id,
|
|
6665
|
-
type: member.type,
|
|
6666
|
-
score: member.result.score,
|
|
6667
|
-
verdict: member.result.verdict,
|
|
6668
|
-
hits: [...member.result.hits],
|
|
6669
|
-
misses: [...member.result.misses],
|
|
6670
|
-
reasoning: member.result.reasoning,
|
|
6671
|
-
evaluatorRawRequest: member.result.evaluatorRawRequest,
|
|
6672
|
-
evaluatorResults: member.result.evaluatorResults
|
|
6673
|
-
}));
|
|
6674
|
-
const promptTemplate = config.prompt ?? DEFAULT_COMPOSITE_AGGREGATOR_PROMPT;
|
|
6675
|
-
const userPrompt = promptTemplate.replace(/\{\{EVALUATOR_RESULTS_JSON\}\}/g, resultsJson);
|
|
6676
|
-
const systemPrompt = buildOutputSchema();
|
|
6677
|
-
const evaluatorRawRequest = {
|
|
6678
|
-
aggregator: "llm_judge",
|
|
6679
|
-
userPrompt,
|
|
6680
|
-
systemPrompt,
|
|
6681
|
-
target: judgeProvider.targetName
|
|
6682
|
-
};
|
|
6683
|
-
try {
|
|
6684
|
-
const model = judgeProvider.asLanguageModel?.();
|
|
6685
|
-
if (model) {
|
|
6686
|
-
const { text } = await generateText2({
|
|
6687
|
-
model,
|
|
6688
|
-
system: systemPrompt,
|
|
6689
|
-
prompt: userPrompt
|
|
6690
|
-
});
|
|
6691
|
-
const data2 = freeformEvaluationSchema.parse(parseJsonFromText(text));
|
|
6692
|
-
const score2 = clampScore(data2.score);
|
|
6693
|
-
const hits2 = Array.isArray(data2.hits) ? data2.hits.filter(isNonEmptyString).slice(0, 4) : [];
|
|
6694
|
-
const misses2 = Array.isArray(data2.misses) ? data2.misses.filter(isNonEmptyString).slice(0, 4) : [];
|
|
6695
|
-
const reasoning2 = data2.reasoning;
|
|
6696
|
-
return {
|
|
6697
|
-
score: score2,
|
|
6698
|
-
verdict: scoreToVerdict(score2),
|
|
6699
|
-
hits: hits2,
|
|
6700
|
-
misses: misses2,
|
|
6701
|
-
expectedAspectCount: Math.max(hits2.length + misses2.length, 1),
|
|
6702
|
-
reasoning: reasoning2,
|
|
6703
|
-
evaluatorRawRequest,
|
|
6704
|
-
evaluatorResults
|
|
6705
|
-
};
|
|
6706
|
-
}
|
|
6707
|
-
const response = await judgeProvider.invoke({
|
|
6708
|
-
question: userPrompt,
|
|
6709
|
-
systemPrompt,
|
|
6710
|
-
evalCaseId: context.evalCase.id,
|
|
6711
|
-
attempt: context.attempt
|
|
6712
|
-
});
|
|
6713
|
-
const data = freeformEvaluationSchema.parse(
|
|
6714
|
-
parseJsonFromText(extractLastAssistantContent(response.outputMessages))
|
|
6715
|
-
);
|
|
6716
|
-
const score = clampScore(data.score);
|
|
6717
|
-
const hits = Array.isArray(data.hits) ? data.hits.filter(isNonEmptyString).slice(0, 4) : [];
|
|
6718
|
-
const misses = Array.isArray(data.misses) ? data.misses.filter(isNonEmptyString).slice(0, 4) : [];
|
|
6719
|
-
const reasoning = data.reasoning;
|
|
6720
|
-
return {
|
|
6721
|
-
score,
|
|
6722
|
-
verdict: scoreToVerdict(score),
|
|
6723
|
-
hits,
|
|
6724
|
-
misses,
|
|
6725
|
-
expectedAspectCount: Math.max(hits.length + misses.length, 1),
|
|
6726
|
-
reasoning,
|
|
6727
|
-
evaluatorRawRequest,
|
|
6728
|
-
evaluatorResults
|
|
6639
|
+
path: path15,
|
|
6640
|
+
score: 0,
|
|
6641
|
+
weight,
|
|
6642
|
+
hit: false,
|
|
6643
|
+
message: `${path15} (invalid numeric value)`
|
|
6729
6644
|
};
|
|
6730
|
-
}
|
|
6645
|
+
}
|
|
6646
|
+
const diff = Math.abs(candidateNum - expectedNum);
|
|
6647
|
+
let withinTolerance;
|
|
6648
|
+
if (relative) {
|
|
6649
|
+
const relativeDiff = expectedNum === 0 ? diff : diff / Math.abs(expectedNum);
|
|
6650
|
+
withinTolerance = relativeDiff <= tolerance;
|
|
6651
|
+
} else {
|
|
6652
|
+
withinTolerance = diff <= tolerance;
|
|
6653
|
+
}
|
|
6654
|
+
if (withinTolerance) {
|
|
6655
|
+
return {
|
|
6656
|
+
path: path15,
|
|
6657
|
+
score: 1,
|
|
6658
|
+
weight,
|
|
6659
|
+
hit: true,
|
|
6660
|
+
message: `${path15} (within tolerance: diff=${diff.toFixed(2)})`
|
|
6661
|
+
};
|
|
6662
|
+
}
|
|
6663
|
+
return {
|
|
6664
|
+
path: path15,
|
|
6665
|
+
score: 0,
|
|
6666
|
+
weight,
|
|
6667
|
+
hit: false,
|
|
6668
|
+
message: `${path15} (outside tolerance: diff=${diff.toFixed(2)}, tolerance=${tolerance})`
|
|
6669
|
+
};
|
|
6670
|
+
}
|
|
6671
|
+
/**
|
|
6672
|
+
* Date comparison with format normalization.
|
|
6673
|
+
*/
|
|
6674
|
+
compareDate(path15, candidateValue, expectedValue, fieldConfig, weight) {
|
|
6675
|
+
const formats = fieldConfig.formats ?? DEFAULT_DATE_FORMATS;
|
|
6676
|
+
const candidateDate = parseDate(String(candidateValue), formats);
|
|
6677
|
+
const expectedDate = parseDate(String(expectedValue), formats);
|
|
6678
|
+
if (candidateDate === null) {
|
|
6731
6679
|
return {
|
|
6680
|
+
path: path15,
|
|
6732
6681
|
score: 0,
|
|
6733
|
-
|
|
6734
|
-
|
|
6735
|
-
|
|
6736
|
-
|
|
6737
|
-
|
|
6738
|
-
|
|
6682
|
+
weight,
|
|
6683
|
+
hit: false,
|
|
6684
|
+
message: `${path15} (unparseable candidate date)`
|
|
6685
|
+
};
|
|
6686
|
+
}
|
|
6687
|
+
if (expectedDate === null) {
|
|
6688
|
+
return {
|
|
6689
|
+
path: path15,
|
|
6690
|
+
score: 0,
|
|
6691
|
+
weight,
|
|
6692
|
+
hit: false,
|
|
6693
|
+
message: `${path15} (unparseable expected date)`
|
|
6694
|
+
};
|
|
6695
|
+
}
|
|
6696
|
+
if (candidateDate.getFullYear() === expectedDate.getFullYear() && candidateDate.getMonth() === expectedDate.getMonth() && candidateDate.getDate() === expectedDate.getDate()) {
|
|
6697
|
+
return {
|
|
6698
|
+
path: path15,
|
|
6699
|
+
score: 1,
|
|
6700
|
+
weight,
|
|
6701
|
+
hit: true,
|
|
6702
|
+
message: path15
|
|
6739
6703
|
};
|
|
6740
6704
|
}
|
|
6705
|
+
return {
|
|
6706
|
+
path: path15,
|
|
6707
|
+
score: 0,
|
|
6708
|
+
weight,
|
|
6709
|
+
hit: false,
|
|
6710
|
+
message: `${path15} (date mismatch: got ${formatDateISO(candidateDate)}, expected ${formatDateISO(expectedDate)})`
|
|
6711
|
+
};
|
|
6712
|
+
}
|
|
6713
|
+
/**
|
|
6714
|
+
* Aggregate field results using configured strategy.
|
|
6715
|
+
*/
|
|
6716
|
+
aggregateResults(results) {
|
|
6717
|
+
const aggregation = this.config.aggregation ?? "weighted_average";
|
|
6718
|
+
const hits = [];
|
|
6719
|
+
const misses = [];
|
|
6720
|
+
for (const result of results) {
|
|
6721
|
+
if (result.hit) {
|
|
6722
|
+
hits.push(result.message);
|
|
6723
|
+
} else {
|
|
6724
|
+
misses.push(result.message);
|
|
6725
|
+
}
|
|
6726
|
+
}
|
|
6727
|
+
let score;
|
|
6728
|
+
if (aggregation === "all_or_nothing") {
|
|
6729
|
+
score = misses.length === 0 ? 1 : 0;
|
|
6730
|
+
} else {
|
|
6731
|
+
const totalWeight = results.reduce((sum, r) => sum + r.weight, 0);
|
|
6732
|
+
if (totalWeight === 0) {
|
|
6733
|
+
score = results.length === 0 ? 1 : 0;
|
|
6734
|
+
} else {
|
|
6735
|
+
const weightedSum = results.reduce((sum, r) => sum + r.score * r.weight, 0);
|
|
6736
|
+
score = weightedSum / totalWeight;
|
|
6737
|
+
}
|
|
6738
|
+
}
|
|
6739
|
+
const reasoning = `${hits.length}/${results.length} fields matched`;
|
|
6740
|
+
return {
|
|
6741
|
+
score: clampScore(score),
|
|
6742
|
+
verdict: scoreToVerdict(score),
|
|
6743
|
+
hits: hits.slice(0, 4),
|
|
6744
|
+
misses: misses.slice(0, 4),
|
|
6745
|
+
expectedAspectCount: results.length,
|
|
6746
|
+
reasoning
|
|
6747
|
+
};
|
|
6741
6748
|
}
|
|
6742
6749
|
};
|
|
6750
|
+
function resolvePath(obj, path15) {
|
|
6751
|
+
if (!path15 || !obj) {
|
|
6752
|
+
return void 0;
|
|
6753
|
+
}
|
|
6754
|
+
const parts = path15.split(/\.|\[|\]/).filter((p) => p.length > 0);
|
|
6755
|
+
let current = obj;
|
|
6756
|
+
for (const part of parts) {
|
|
6757
|
+
if (current === null || current === void 0) {
|
|
6758
|
+
return void 0;
|
|
6759
|
+
}
|
|
6760
|
+
if (typeof current !== "object") {
|
|
6761
|
+
return void 0;
|
|
6762
|
+
}
|
|
6763
|
+
const isIndex = /^\d+$/.test(part);
|
|
6764
|
+
if (isIndex && Array.isArray(current)) {
|
|
6765
|
+
current = current[Number.parseInt(part, 10)];
|
|
6766
|
+
} else {
|
|
6767
|
+
current = current[part];
|
|
6768
|
+
}
|
|
6769
|
+
}
|
|
6770
|
+
return current;
|
|
6771
|
+
}
|
|
6772
|
+
function toNumber(value) {
|
|
6773
|
+
if (typeof value === "number") {
|
|
6774
|
+
return value;
|
|
6775
|
+
}
|
|
6776
|
+
if (typeof value === "string") {
|
|
6777
|
+
const num = Number.parseFloat(value);
|
|
6778
|
+
return Number.isNaN(num) ? null : num;
|
|
6779
|
+
}
|
|
6780
|
+
return null;
|
|
6781
|
+
}
|
|
6782
|
+
function parseDate(dateStr, formats) {
|
|
6783
|
+
if (!dateStr) return null;
|
|
6784
|
+
const trimmed = dateStr.trim();
|
|
6785
|
+
const isoDate = new Date(trimmed);
|
|
6786
|
+
if (!Number.isNaN(isoDate.getTime())) {
|
|
6787
|
+
return isoDate;
|
|
6788
|
+
}
|
|
6789
|
+
const localizedMatch = trimmed.match(/^(\d{1,2})-([A-Za-z]{3,9})-(\d{4})$/);
|
|
6790
|
+
if (localizedMatch) {
|
|
6791
|
+
const day = Number.parseInt(localizedMatch[1], 10);
|
|
6792
|
+
const monthName = localizedMatch[2].toLowerCase();
|
|
6793
|
+
const year = Number.parseInt(localizedMatch[3], 10);
|
|
6794
|
+
const month = MONTH_NAMES[monthName];
|
|
6795
|
+
if (month !== void 0) {
|
|
6796
|
+
return new Date(year, month, day);
|
|
6797
|
+
}
|
|
6798
|
+
}
|
|
6799
|
+
const usMatch = trimmed.match(/^(\d{1,2})[\/\-](\d{1,2})[\/\-](\d{4})$/);
|
|
6800
|
+
if (usMatch) {
|
|
6801
|
+
const hasUSFormat = formats.some((f) => f.includes("MM/DD") || f.includes("MM-DD"));
|
|
6802
|
+
const hasEUFormat = formats.some((f) => f.includes("DD/MM") || f.includes("DD-MM"));
|
|
6803
|
+
if (hasUSFormat && !hasEUFormat) {
|
|
6804
|
+
const month = Number.parseInt(usMatch[1], 10) - 1;
|
|
6805
|
+
const day = Number.parseInt(usMatch[2], 10);
|
|
6806
|
+
const year = Number.parseInt(usMatch[3], 10);
|
|
6807
|
+
if (month >= 0 && month <= 11 && day >= 1 && day <= 31) {
|
|
6808
|
+
return new Date(year, month, day);
|
|
6809
|
+
}
|
|
6810
|
+
} else if (hasEUFormat && !hasUSFormat) {
|
|
6811
|
+
const day = Number.parseInt(usMatch[1], 10);
|
|
6812
|
+
const month = Number.parseInt(usMatch[2], 10) - 1;
|
|
6813
|
+
const year = Number.parseInt(usMatch[3], 10);
|
|
6814
|
+
if (month >= 0 && month <= 11 && day >= 1 && day <= 31) {
|
|
6815
|
+
return new Date(year, month, day);
|
|
6816
|
+
}
|
|
6817
|
+
} else {
|
|
6818
|
+
const num1 = Number.parseInt(usMatch[1], 10);
|
|
6819
|
+
const num2 = Number.parseInt(usMatch[2], 10);
|
|
6820
|
+
const year = Number.parseInt(usMatch[3], 10);
|
|
6821
|
+
if (num1 > 12 && num2 <= 12) {
|
|
6822
|
+
return new Date(year, num2 - 1, num1);
|
|
6823
|
+
}
|
|
6824
|
+
if (num2 > 12 && num1 <= 12) {
|
|
6825
|
+
return new Date(year, num1 - 1, num2);
|
|
6826
|
+
}
|
|
6827
|
+
if (num1 <= 12 && num2 <= 31) {
|
|
6828
|
+
return new Date(year, num1 - 1, num2);
|
|
6829
|
+
}
|
|
6830
|
+
}
|
|
6831
|
+
}
|
|
6832
|
+
return null;
|
|
6833
|
+
}
|
|
6834
|
+
function formatDateISO(date) {
|
|
6835
|
+
return date.toISOString().split("T")[0];
|
|
6836
|
+
}
|
|
6837
|
+
function parseJsonFromTextSafe(text) {
|
|
6838
|
+
return parseJsonFromText(text);
|
|
6839
|
+
}
|
|
6840
|
+
|
|
6841
|
+
// src/evaluation/evaluators/latency.ts
|
|
6743
6842
|
var LatencyEvaluator = class {
|
|
6744
6843
|
kind = "latency";
|
|
6745
6844
|
config;
|
|
@@ -6772,57 +6871,17 @@ var LatencyEvaluator = class {
|
|
|
6772
6871
|
hits: passed ? [`Duration ${durationMs}ms <= ${threshold}ms threshold`] : [],
|
|
6773
6872
|
misses: passed ? [] : [`Duration ${durationMs}ms > ${threshold}ms threshold`],
|
|
6774
6873
|
expectedAspectCount: 1,
|
|
6775
|
-
reasoning: `Execution took ${durationMs}ms (threshold: ${threshold}ms)`,
|
|
6776
|
-
evaluatorRawRequest: {
|
|
6777
|
-
type: "latency",
|
|
6778
|
-
threshold,
|
|
6779
|
-
durationMs
|
|
6780
|
-
}
|
|
6781
|
-
};
|
|
6782
|
-
}
|
|
6783
|
-
};
|
|
6784
|
-
var CostEvaluator = class {
|
|
6785
|
-
kind = "cost";
|
|
6786
|
-
config;
|
|
6787
|
-
constructor(options) {
|
|
6788
|
-
this.config = options.config;
|
|
6789
|
-
}
|
|
6790
|
-
evaluate(context) {
|
|
6791
|
-
const { budget } = this.config;
|
|
6792
|
-
const costUsd = context.traceSummary?.costUsd;
|
|
6793
|
-
if (costUsd === void 0) {
|
|
6794
|
-
return {
|
|
6795
|
-
score: 0,
|
|
6796
|
-
verdict: "fail",
|
|
6797
|
-
hits: [],
|
|
6798
|
-
misses: ["No cost data available in trace"],
|
|
6799
|
-
expectedAspectCount: 1,
|
|
6800
|
-
reasoning: "Execution cost not reported by provider",
|
|
6801
|
-
evaluatorRawRequest: {
|
|
6802
|
-
type: "cost",
|
|
6803
|
-
budget,
|
|
6804
|
-
costUsd: null
|
|
6805
|
-
}
|
|
6806
|
-
};
|
|
6807
|
-
}
|
|
6808
|
-
const passed = costUsd <= budget;
|
|
6809
|
-
const score = passed ? 1 : 0;
|
|
6810
|
-
const formatCost = (n) => `$${n.toFixed(4)}`;
|
|
6811
|
-
return {
|
|
6812
|
-
score,
|
|
6813
|
-
verdict: passed ? "pass" : "fail",
|
|
6814
|
-
hits: passed ? [`Cost ${formatCost(costUsd)} <= ${formatCost(budget)} budget`] : [],
|
|
6815
|
-
misses: passed ? [] : [`Cost ${formatCost(costUsd)} > ${formatCost(budget)} budget`],
|
|
6816
|
-
expectedAspectCount: 1,
|
|
6817
|
-
reasoning: `Execution cost ${formatCost(costUsd)} (budget: ${formatCost(budget)})`,
|
|
6818
|
-
evaluatorRawRequest: {
|
|
6819
|
-
type: "cost",
|
|
6820
|
-
budget,
|
|
6821
|
-
costUsd
|
|
6874
|
+
reasoning: `Execution took ${durationMs}ms (threshold: ${threshold}ms)`,
|
|
6875
|
+
evaluatorRawRequest: {
|
|
6876
|
+
type: "latency",
|
|
6877
|
+
threshold,
|
|
6878
|
+
durationMs
|
|
6822
6879
|
}
|
|
6823
6880
|
};
|
|
6824
6881
|
}
|
|
6825
6882
|
};
|
|
6883
|
+
|
|
6884
|
+
// src/evaluation/evaluators/token-usage.ts
|
|
6826
6885
|
var TokenUsageEvaluator = class {
|
|
6827
6886
|
kind = "token_usage";
|
|
6828
6887
|
config;
|
|
@@ -6906,6 +6965,226 @@ var TokenUsageEvaluator = class {
|
|
|
6906
6965
|
}
|
|
6907
6966
|
};
|
|
6908
6967
|
|
|
6968
|
+
// src/evaluation/evaluators/tool-trajectory.ts
|
|
6969
|
+
function argsMatch(expected, actual) {
|
|
6970
|
+
if (expected === void 0) return true;
|
|
6971
|
+
if (expected === "any") return true;
|
|
6972
|
+
if (actual === void 0) return false;
|
|
6973
|
+
for (const key of Object.keys(expected)) {
|
|
6974
|
+
if (!Object.hasOwn(actual, key)) return false;
|
|
6975
|
+
if (!deepEqual(expected[key], actual[key])) return false;
|
|
6976
|
+
}
|
|
6977
|
+
return true;
|
|
6978
|
+
}
|
|
6979
|
+
var ToolTrajectoryEvaluator = class {
|
|
6980
|
+
kind = "tool_trajectory";
|
|
6981
|
+
config;
|
|
6982
|
+
constructor(options) {
|
|
6983
|
+
this.config = options.config;
|
|
6984
|
+
}
|
|
6985
|
+
evaluate(context) {
|
|
6986
|
+
const { outputMessages, traceSummary } = context;
|
|
6987
|
+
const toolCalls = this.extractToolCallsFromMessages(outputMessages);
|
|
6988
|
+
if (toolCalls.length === 0 && !traceSummary) {
|
|
6989
|
+
return {
|
|
6990
|
+
score: 0,
|
|
6991
|
+
verdict: "fail",
|
|
6992
|
+
hits: [],
|
|
6993
|
+
misses: ["No trace available for evaluation"],
|
|
6994
|
+
expectedAspectCount: 1
|
|
6995
|
+
};
|
|
6996
|
+
}
|
|
6997
|
+
const summary = toolCalls.length > 0 ? this.buildSummary(toolCalls) : traceSummary;
|
|
6998
|
+
if (!summary) {
|
|
6999
|
+
return {
|
|
7000
|
+
score: 0,
|
|
7001
|
+
verdict: "fail",
|
|
7002
|
+
hits: [],
|
|
7003
|
+
misses: ["No trace available for evaluation"],
|
|
7004
|
+
expectedAspectCount: 1
|
|
7005
|
+
};
|
|
7006
|
+
}
|
|
7007
|
+
switch (this.config.mode) {
|
|
7008
|
+
case "any_order":
|
|
7009
|
+
return this.evaluateAnyOrder(summary);
|
|
7010
|
+
case "in_order":
|
|
7011
|
+
return this.evaluateInOrder(toolCalls);
|
|
7012
|
+
case "exact":
|
|
7013
|
+
return this.evaluateExact(toolCalls);
|
|
7014
|
+
default:
|
|
7015
|
+
return {
|
|
7016
|
+
score: 0,
|
|
7017
|
+
verdict: "fail",
|
|
7018
|
+
hits: [],
|
|
7019
|
+
misses: [`Unknown mode: ${this.config.mode}`],
|
|
7020
|
+
expectedAspectCount: 1
|
|
7021
|
+
};
|
|
7022
|
+
}
|
|
7023
|
+
}
|
|
7024
|
+
/**
|
|
7025
|
+
* Extract tool calls from output messages.
|
|
7026
|
+
*/
|
|
7027
|
+
extractToolCallsFromMessages(messages) {
|
|
7028
|
+
if (!messages) {
|
|
7029
|
+
return [];
|
|
7030
|
+
}
|
|
7031
|
+
const toolCalls = [];
|
|
7032
|
+
for (const message of messages) {
|
|
7033
|
+
if (message.toolCalls) {
|
|
7034
|
+
for (const call of message.toolCalls) {
|
|
7035
|
+
toolCalls.push({
|
|
7036
|
+
name: call.tool,
|
|
7037
|
+
args: call.input
|
|
7038
|
+
});
|
|
7039
|
+
}
|
|
7040
|
+
}
|
|
7041
|
+
}
|
|
7042
|
+
return toolCalls;
|
|
7043
|
+
}
|
|
7044
|
+
/**
|
|
7045
|
+
* Build a summary from extracted tool calls.
|
|
7046
|
+
*/
|
|
7047
|
+
buildSummary(toolCalls) {
|
|
7048
|
+
const toolCallsByName = {};
|
|
7049
|
+
for (const call of toolCalls) {
|
|
7050
|
+
toolCallsByName[call.name] = (toolCallsByName[call.name] ?? 0) + 1;
|
|
7051
|
+
}
|
|
7052
|
+
const toolNames = Object.keys(toolCallsByName).sort();
|
|
7053
|
+
return {
|
|
7054
|
+
eventCount: toolCalls.length,
|
|
7055
|
+
toolNames,
|
|
7056
|
+
toolCallsByName,
|
|
7057
|
+
errorCount: 0
|
|
7058
|
+
};
|
|
7059
|
+
}
|
|
7060
|
+
evaluateAnyOrder(summary) {
|
|
7061
|
+
const minimums = this.config.minimums ?? {};
|
|
7062
|
+
const toolNames = Object.keys(minimums);
|
|
7063
|
+
if (toolNames.length === 0) {
|
|
7064
|
+
return {
|
|
7065
|
+
score: 1,
|
|
7066
|
+
verdict: "pass",
|
|
7067
|
+
hits: ["No tool requirements specified"],
|
|
7068
|
+
misses: [],
|
|
7069
|
+
expectedAspectCount: 0
|
|
7070
|
+
};
|
|
7071
|
+
}
|
|
7072
|
+
const hits = [];
|
|
7073
|
+
const misses = [];
|
|
7074
|
+
for (const toolName of toolNames) {
|
|
7075
|
+
const required = minimums[toolName];
|
|
7076
|
+
const actual = summary.toolCallsByName[toolName] ?? 0;
|
|
7077
|
+
if (actual >= required) {
|
|
7078
|
+
hits.push(`${toolName}: called ${actual} times (required \u2265${required})`);
|
|
7079
|
+
} else {
|
|
7080
|
+
misses.push(`${toolName}: called ${actual} times (required \u2265${required})`);
|
|
7081
|
+
}
|
|
7082
|
+
}
|
|
7083
|
+
const score = hits.length / toolNames.length;
|
|
7084
|
+
return {
|
|
7085
|
+
score,
|
|
7086
|
+
verdict: scoreToVerdict(score),
|
|
7087
|
+
hits,
|
|
7088
|
+
misses,
|
|
7089
|
+
expectedAspectCount: toolNames.length
|
|
7090
|
+
};
|
|
7091
|
+
}
|
|
7092
|
+
evaluateInOrder(toolCalls) {
|
|
7093
|
+
const expected = this.config.expected ?? [];
|
|
7094
|
+
if (expected.length === 0) {
|
|
7095
|
+
return {
|
|
7096
|
+
score: 1,
|
|
7097
|
+
verdict: "pass",
|
|
7098
|
+
hits: ["No tool sequence specified"],
|
|
7099
|
+
misses: [],
|
|
7100
|
+
expectedAspectCount: 0
|
|
7101
|
+
};
|
|
7102
|
+
}
|
|
7103
|
+
const hits = [];
|
|
7104
|
+
const misses = [];
|
|
7105
|
+
let actualIndex = 0;
|
|
7106
|
+
for (let i = 0; i < expected.length; i++) {
|
|
7107
|
+
const expectedItem = expected[i];
|
|
7108
|
+
const expectedTool = expectedItem.tool;
|
|
7109
|
+
let found = false;
|
|
7110
|
+
let argsMismatch = false;
|
|
7111
|
+
while (actualIndex < toolCalls.length) {
|
|
7112
|
+
const actualCall = toolCalls[actualIndex];
|
|
7113
|
+
if (actualCall.name === expectedTool) {
|
|
7114
|
+
if (argsMatch(expectedItem.args, actualCall.args)) {
|
|
7115
|
+
hits.push(`Found ${expectedTool} at position ${actualIndex}`);
|
|
7116
|
+
actualIndex++;
|
|
7117
|
+
found = true;
|
|
7118
|
+
break;
|
|
7119
|
+
}
|
|
7120
|
+
misses.push(
|
|
7121
|
+
`Expected ${expectedTool} at position ${i}: tool found at ${actualIndex} but args mismatch`
|
|
7122
|
+
);
|
|
7123
|
+
actualIndex++;
|
|
7124
|
+
argsMismatch = true;
|
|
7125
|
+
break;
|
|
7126
|
+
}
|
|
7127
|
+
actualIndex++;
|
|
7128
|
+
}
|
|
7129
|
+
if (!found && !argsMismatch) {
|
|
7130
|
+
misses.push(`Expected ${expectedTool} at position ${i}, not found in remaining trace`);
|
|
7131
|
+
}
|
|
7132
|
+
}
|
|
7133
|
+
const score = hits.length / expected.length;
|
|
7134
|
+
return {
|
|
7135
|
+
score,
|
|
7136
|
+
verdict: scoreToVerdict(score),
|
|
7137
|
+
hits,
|
|
7138
|
+
misses,
|
|
7139
|
+
expectedAspectCount: expected.length
|
|
7140
|
+
};
|
|
7141
|
+
}
|
|
7142
|
+
evaluateExact(toolCalls) {
|
|
7143
|
+
const expected = this.config.expected ?? [];
|
|
7144
|
+
if (expected.length === 0) {
|
|
7145
|
+
return {
|
|
7146
|
+
score: 1,
|
|
7147
|
+
verdict: "pass",
|
|
7148
|
+
hits: ["No tool sequence specified"],
|
|
7149
|
+
misses: [],
|
|
7150
|
+
expectedAspectCount: 0
|
|
7151
|
+
};
|
|
7152
|
+
}
|
|
7153
|
+
const hits = [];
|
|
7154
|
+
const misses = [];
|
|
7155
|
+
if (toolCalls.length !== expected.length) {
|
|
7156
|
+
misses.push(`Expected ${expected.length} tool calls, got ${toolCalls.length}`);
|
|
7157
|
+
}
|
|
7158
|
+
const checkLength = Math.min(expected.length, toolCalls.length);
|
|
7159
|
+
for (let i = 0; i < checkLength; i++) {
|
|
7160
|
+
const expectedItem = expected[i];
|
|
7161
|
+
const expectedTool = expectedItem.tool;
|
|
7162
|
+
const actualCall = toolCalls[i];
|
|
7163
|
+
const actualTool = actualCall.name;
|
|
7164
|
+
if (actualTool === expectedTool) {
|
|
7165
|
+
if (argsMatch(expectedItem.args, actualCall.args)) {
|
|
7166
|
+
hits.push(`Position ${i}: ${expectedTool}`);
|
|
7167
|
+
} else {
|
|
7168
|
+
misses.push(`Position ${i}: ${expectedTool} args mismatch`);
|
|
7169
|
+
}
|
|
7170
|
+
} else {
|
|
7171
|
+
misses.push(`Position ${i}: expected ${expectedTool}, got ${actualTool}`);
|
|
7172
|
+
}
|
|
7173
|
+
}
|
|
7174
|
+
for (let i = checkLength; i < expected.length; i++) {
|
|
7175
|
+
misses.push(`Position ${i}: expected ${expected[i].tool}, got nothing`);
|
|
7176
|
+
}
|
|
7177
|
+
const score = hits.length / expected.length;
|
|
7178
|
+
return {
|
|
7179
|
+
score,
|
|
7180
|
+
verdict: scoreToVerdict(score),
|
|
7181
|
+
hits,
|
|
7182
|
+
misses,
|
|
7183
|
+
expectedAspectCount: expected.length
|
|
7184
|
+
};
|
|
7185
|
+
}
|
|
7186
|
+
};
|
|
7187
|
+
|
|
6909
7188
|
// src/evaluation/orchestrator.ts
|
|
6910
7189
|
import { createHash } from "node:crypto";
|
|
6911
7190
|
import path14 from "node:path";
|
|
@@ -7119,6 +7398,17 @@ async function runEvaluation(options) {
|
|
|
7119
7398
|
}
|
|
7120
7399
|
return getOrCreateProvider(resolvedJudge);
|
|
7121
7400
|
};
|
|
7401
|
+
const targetResolver = (name) => {
|
|
7402
|
+
const resolved = resolveTargetByName(name);
|
|
7403
|
+
if (!resolved) {
|
|
7404
|
+
return void 0;
|
|
7405
|
+
}
|
|
7406
|
+
return getOrCreateProvider(resolved);
|
|
7407
|
+
};
|
|
7408
|
+
const availableTargets = [
|
|
7409
|
+
target.name,
|
|
7410
|
+
...Array.from(targetDefinitions.keys())
|
|
7411
|
+
];
|
|
7122
7412
|
const evaluatorRegistry = buildEvaluatorRegistry(evaluators, resolveJudgeProvider);
|
|
7123
7413
|
const primaryProvider = getOrCreateProvider(target);
|
|
7124
7414
|
const providerSupportsBatch = target.providerBatching === true && primaryProvider.supportsBatch === true && typeof primaryProvider.invokeBatch === "function";
|
|
@@ -7148,7 +7438,9 @@ async function runEvaluation(options) {
|
|
|
7148
7438
|
onResult,
|
|
7149
7439
|
verbose,
|
|
7150
7440
|
resolveJudgeProvider,
|
|
7151
|
-
agentTimeoutMs
|
|
7441
|
+
agentTimeoutMs,
|
|
7442
|
+
targetResolver,
|
|
7443
|
+
availableTargets
|
|
7152
7444
|
});
|
|
7153
7445
|
} catch (error) {
|
|
7154
7446
|
if (verbose) {
|
|
@@ -7187,7 +7479,9 @@ async function runEvaluation(options) {
|
|
|
7187
7479
|
cache,
|
|
7188
7480
|
useCache,
|
|
7189
7481
|
now,
|
|
7190
|
-
judgeProvider
|
|
7482
|
+
judgeProvider,
|
|
7483
|
+
targetResolver,
|
|
7484
|
+
availableTargets
|
|
7191
7485
|
});
|
|
7192
7486
|
if (onProgress) {
|
|
7193
7487
|
await onProgress({
|
|
@@ -7254,7 +7548,9 @@ async function runBatchEvaluation(options) {
|
|
|
7254
7548
|
onProgress,
|
|
7255
7549
|
onResult,
|
|
7256
7550
|
resolveJudgeProvider,
|
|
7257
|
-
agentTimeoutMs
|
|
7551
|
+
agentTimeoutMs,
|
|
7552
|
+
targetResolver,
|
|
7553
|
+
availableTargets
|
|
7258
7554
|
} = options;
|
|
7259
7555
|
const promptInputsList = [];
|
|
7260
7556
|
const formattingMode = usesFileReferencePrompt(provider) ? "agent" : "lm";
|
|
@@ -7329,7 +7625,9 @@ async function runBatchEvaluation(options) {
|
|
|
7329
7625
|
judgeProvider: await resolveJudgeProvider(target),
|
|
7330
7626
|
agentTimeoutMs,
|
|
7331
7627
|
outputMessages,
|
|
7332
|
-
traceSummary
|
|
7628
|
+
traceSummary,
|
|
7629
|
+
targetResolver,
|
|
7630
|
+
availableTargets
|
|
7333
7631
|
});
|
|
7334
7632
|
if (providerError) {
|
|
7335
7633
|
result = { ...result, error: providerError };
|
|
@@ -7387,7 +7685,9 @@ async function runEvalCase(options) {
|
|
|
7387
7685
|
cache,
|
|
7388
7686
|
useCache,
|
|
7389
7687
|
signal,
|
|
7390
|
-
judgeProvider
|
|
7688
|
+
judgeProvider,
|
|
7689
|
+
targetResolver,
|
|
7690
|
+
availableTargets
|
|
7391
7691
|
} = options;
|
|
7392
7692
|
const formattingMode = usesFileReferencePrompt(provider) ? "agent" : "lm";
|
|
7393
7693
|
const promptInputs = await buildPromptInputs(evalCase, formattingMode);
|
|
@@ -7461,7 +7761,9 @@ async function runEvalCase(options) {
|
|
|
7461
7761
|
judgeProvider,
|
|
7462
7762
|
agentTimeoutMs,
|
|
7463
7763
|
outputMessages,
|
|
7464
|
-
traceSummary
|
|
7764
|
+
traceSummary,
|
|
7765
|
+
targetResolver,
|
|
7766
|
+
availableTargets
|
|
7465
7767
|
});
|
|
7466
7768
|
return providerError ? { ...result, error: providerError } : result;
|
|
7467
7769
|
} catch (error) {
|
|
@@ -7481,7 +7783,9 @@ async function evaluateCandidate(options) {
|
|
|
7481
7783
|
judgeProvider,
|
|
7482
7784
|
agentTimeoutMs,
|
|
7483
7785
|
outputMessages,
|
|
7484
|
-
traceSummary
|
|
7786
|
+
traceSummary,
|
|
7787
|
+
targetResolver,
|
|
7788
|
+
availableTargets
|
|
7485
7789
|
} = options;
|
|
7486
7790
|
const gradeTimestamp = nowFn();
|
|
7487
7791
|
const { score, evaluatorResults } = await runEvaluatorsForCase({
|
|
@@ -7496,7 +7800,9 @@ async function evaluateCandidate(options) {
|
|
|
7496
7800
|
judgeProvider,
|
|
7497
7801
|
agentTimeoutMs,
|
|
7498
7802
|
outputMessages,
|
|
7499
|
-
traceSummary
|
|
7803
|
+
traceSummary,
|
|
7804
|
+
targetResolver,
|
|
7805
|
+
availableTargets
|
|
7500
7806
|
});
|
|
7501
7807
|
const completedAt = nowFn();
|
|
7502
7808
|
let agentProviderRequest;
|
|
@@ -7549,7 +7855,9 @@ async function runEvaluatorsForCase(options) {
|
|
|
7549
7855
|
judgeProvider,
|
|
7550
7856
|
agentTimeoutMs,
|
|
7551
7857
|
outputMessages,
|
|
7552
|
-
traceSummary
|
|
7858
|
+
traceSummary,
|
|
7859
|
+
targetResolver,
|
|
7860
|
+
availableTargets
|
|
7553
7861
|
} = options;
|
|
7554
7862
|
if (evalCase.evaluators && evalCase.evaluators.length > 0) {
|
|
7555
7863
|
return runEvaluatorList({
|
|
@@ -7565,7 +7873,9 @@ async function runEvaluatorsForCase(options) {
|
|
|
7565
7873
|
judgeProvider,
|
|
7566
7874
|
agentTimeoutMs,
|
|
7567
7875
|
outputMessages,
|
|
7568
|
-
traceSummary
|
|
7876
|
+
traceSummary,
|
|
7877
|
+
targetResolver,
|
|
7878
|
+
availableTargets
|
|
7569
7879
|
});
|
|
7570
7880
|
}
|
|
7571
7881
|
const evaluatorKind = evalCase.evaluator ?? "llm_judge";
|
|
@@ -7583,7 +7893,9 @@ async function runEvaluatorsForCase(options) {
|
|
|
7583
7893
|
now,
|
|
7584
7894
|
judgeProvider,
|
|
7585
7895
|
outputMessages,
|
|
7586
|
-
traceSummary
|
|
7896
|
+
traceSummary,
|
|
7897
|
+
targetResolver,
|
|
7898
|
+
availableTargets
|
|
7587
7899
|
});
|
|
7588
7900
|
return { score };
|
|
7589
7901
|
}
|
|
@@ -7601,7 +7913,9 @@ async function runEvaluatorList(options) {
|
|
|
7601
7913
|
judgeProvider,
|
|
7602
7914
|
agentTimeoutMs,
|
|
7603
7915
|
outputMessages,
|
|
7604
|
-
traceSummary
|
|
7916
|
+
traceSummary,
|
|
7917
|
+
targetResolver,
|
|
7918
|
+
availableTargets
|
|
7605
7919
|
} = options;
|
|
7606
7920
|
const scored = [];
|
|
7607
7921
|
const evaluatorResults = [];
|
|
@@ -7639,7 +7953,8 @@ async function runEvaluatorList(options) {
|
|
|
7639
7953
|
script: evaluator.script,
|
|
7640
7954
|
cwd: evaluator.resolvedCwd ?? evaluator.cwd,
|
|
7641
7955
|
agentTimeoutMs,
|
|
7642
|
-
config: evaluator.config
|
|
7956
|
+
config: evaluator.config,
|
|
7957
|
+
target: evaluator.target
|
|
7643
7958
|
});
|
|
7644
7959
|
const score2 = await codeEvaluator.evaluate({
|
|
7645
7960
|
evalCase,
|
|
@@ -7649,8 +7964,11 @@ async function runEvaluatorList(options) {
|
|
|
7649
7964
|
attempt,
|
|
7650
7965
|
promptInputs,
|
|
7651
7966
|
now,
|
|
7967
|
+
judgeProvider,
|
|
7652
7968
|
outputMessages,
|
|
7653
|
-
traceSummary
|
|
7969
|
+
traceSummary,
|
|
7970
|
+
targetResolver,
|
|
7971
|
+
availableTargets
|
|
7654
7972
|
});
|
|
7655
7973
|
const weight = evaluator.weight ?? 1;
|
|
7656
7974
|
scored.push({ score: score2, name: evaluator.name, type: "code_judge", weight });
|
|
@@ -7663,7 +7981,8 @@ async function runEvaluatorList(options) {
|
|
|
7663
7981
|
hits: score2.hits,
|
|
7664
7982
|
misses: score2.misses,
|
|
7665
7983
|
reasoning: score2.reasoning,
|
|
7666
|
-
evaluatorProviderRequest: score2.evaluatorRawRequest
|
|
7984
|
+
evaluatorProviderRequest: score2.evaluatorRawRequest,
|
|
7985
|
+
details: score2.details
|
|
7667
7986
|
});
|
|
7668
7987
|
}
|
|
7669
7988
|
if (evaluator.type === "composite") {
|
|
@@ -7677,7 +7996,8 @@ async function runEvaluatorList(options) {
|
|
|
7677
7996
|
script: memberConfig.script,
|
|
7678
7997
|
cwd: memberConfig.resolvedCwd ?? memberConfig.cwd,
|
|
7679
7998
|
agentTimeoutMs,
|
|
7680
|
-
config: memberConfig.config
|
|
7999
|
+
config: memberConfig.config,
|
|
8000
|
+
target: memberConfig.target
|
|
7681
8001
|
});
|
|
7682
8002
|
case "composite":
|
|
7683
8003
|
return new CompositeEvaluator({
|
|
@@ -7726,7 +8046,9 @@ async function runEvaluatorList(options) {
|
|
|
7726
8046
|
now,
|
|
7727
8047
|
judgeProvider,
|
|
7728
8048
|
outputMessages,
|
|
7729
|
-
traceSummary
|
|
8049
|
+
traceSummary,
|
|
8050
|
+
targetResolver,
|
|
8051
|
+
availableTargets
|
|
7730
8052
|
});
|
|
7731
8053
|
const weight = evaluator.weight ?? 1;
|
|
7732
8054
|
scored.push({ score: score2, name: evaluator.name, type: evaluator.type, weight });
|
|
@@ -7922,11 +8244,11 @@ async function runEvaluatorList(options) {
|
|
|
7922
8244
|
(total, entry) => total + (entry.score.expectedAspectCount ?? 0),
|
|
7923
8245
|
0
|
|
7924
8246
|
);
|
|
7925
|
-
const reasoningParts = scored.map((entry) => entry.score.reasoning ? `${entry.name}: ${entry.score.reasoning}` : void 0).filter(
|
|
8247
|
+
const reasoningParts = scored.map((entry) => entry.score.reasoning ? `${entry.name}: ${entry.score.reasoning}` : void 0).filter(isNonEmptyString);
|
|
7926
8248
|
const reasoning = reasoningParts.length > 0 ? reasoningParts.join(" | ") : void 0;
|
|
7927
8249
|
const score = {
|
|
7928
8250
|
score: aggregateScore,
|
|
7929
|
-
verdict:
|
|
8251
|
+
verdict: scoreToVerdict(aggregateScore),
|
|
7930
8252
|
hits,
|
|
7931
8253
|
misses,
|
|
7932
8254
|
expectedAspectCount,
|
|
@@ -7973,18 +8295,6 @@ async function resolveCustomPrompt(config) {
|
|
|
7973
8295
|
}
|
|
7974
8296
|
return config.prompt;
|
|
7975
8297
|
}
|
|
7976
|
-
function isNonEmptyString2(value) {
|
|
7977
|
-
return typeof value === "string" && value.trim().length > 0;
|
|
7978
|
-
}
|
|
7979
|
-
function scoreToVerdict2(score) {
|
|
7980
|
-
if (score >= 0.8) {
|
|
7981
|
-
return "pass";
|
|
7982
|
-
}
|
|
7983
|
-
if (score >= 0.6) {
|
|
7984
|
-
return "borderline";
|
|
7985
|
-
}
|
|
7986
|
-
return "fail";
|
|
7987
|
-
}
|
|
7988
8298
|
function filterEvalCases(evalCases, evalId) {
|
|
7989
8299
|
if (!evalId) {
|
|
7990
8300
|
return evalCases;
|
|
@@ -8127,7 +8437,8 @@ function mapChildResults(children) {
|
|
|
8127
8437
|
misses: child.misses,
|
|
8128
8438
|
reasoning: child.reasoning,
|
|
8129
8439
|
evaluatorProviderRequest: child.evaluatorRawRequest,
|
|
8130
|
-
evaluatorResults: mapChildResults(child.evaluatorResults)
|
|
8440
|
+
evaluatorResults: mapChildResults(child.evaluatorResults),
|
|
8441
|
+
details: child.details
|
|
8131
8442
|
}));
|
|
8132
8443
|
}
|
|
8133
8444
|
function computeWeightedMean(entries) {
|
|
@@ -8142,7 +8453,7 @@ function computeWeightedMean(entries) {
|
|
|
8142
8453
|
}
|
|
8143
8454
|
|
|
8144
8455
|
// src/evaluation/generators/rubric-generator.ts
|
|
8145
|
-
import { generateText as
|
|
8456
|
+
import { generateText as generateText4 } from "ai";
|
|
8146
8457
|
import { z as z3 } from "zod";
|
|
8147
8458
|
var rubricItemSchema = z3.object({
|
|
8148
8459
|
id: z3.string().describe("Short identifier for this rubric (e.g., clarity, completeness)"),
|
|
@@ -8176,7 +8487,7 @@ You must return a valid JSON object matching this schema:
|
|
|
8176
8487
|
let lastError;
|
|
8177
8488
|
for (let attempt = 1; attempt <= 3; attempt++) {
|
|
8178
8489
|
try {
|
|
8179
|
-
const { text } = await
|
|
8490
|
+
const { text } = await generateText4({
|
|
8180
8491
|
model,
|
|
8181
8492
|
system,
|
|
8182
8493
|
prompt
|
|
@@ -8238,31 +8549,39 @@ export {
|
|
|
8238
8549
|
ToolTrajectoryEvaluator,
|
|
8239
8550
|
avgToolDurationMs,
|
|
8240
8551
|
buildDirectoryChain,
|
|
8552
|
+
buildOutputSchema,
|
|
8241
8553
|
buildPromptInputs,
|
|
8242
8554
|
buildSearchRoots,
|
|
8555
|
+
clampScore,
|
|
8243
8556
|
computeTraceSummary,
|
|
8244
8557
|
consumeClaudeCodeLogEntries,
|
|
8245
8558
|
consumeCodexLogEntries,
|
|
8246
8559
|
consumePiLogEntries,
|
|
8247
8560
|
createAgentKernel,
|
|
8248
8561
|
createProvider,
|
|
8562
|
+
deepEqual,
|
|
8249
8563
|
ensureVSCodeSubagents,
|
|
8564
|
+
executeScript,
|
|
8250
8565
|
explorationRatio,
|
|
8251
|
-
|
|
8566
|
+
extractJsonBlob,
|
|
8252
8567
|
fileExists,
|
|
8253
8568
|
findGitRoot,
|
|
8569
|
+
freeformEvaluationSchema,
|
|
8254
8570
|
generateRubrics,
|
|
8255
8571
|
getHitCount,
|
|
8256
8572
|
isEvaluatorKind,
|
|
8257
8573
|
isGuidelineFile,
|
|
8258
8574
|
isJsonObject,
|
|
8259
8575
|
isJsonValue,
|
|
8576
|
+
isNonEmptyString,
|
|
8260
8577
|
isTestMessage,
|
|
8261
8578
|
isTestMessageRole,
|
|
8262
8579
|
listTargetNames,
|
|
8263
8580
|
loadEvalCases,
|
|
8264
8581
|
mergeExecutionMetrics,
|
|
8265
8582
|
normalizeLineEndings,
|
|
8583
|
+
parseJsonFromText,
|
|
8584
|
+
parseJsonSafe,
|
|
8266
8585
|
readJsonFile,
|
|
8267
8586
|
readTargetDefinitions,
|
|
8268
8587
|
readTestSuiteMetadata,
|
|
@@ -8272,6 +8591,7 @@ export {
|
|
|
8272
8591
|
resolveTargetDefinition,
|
|
8273
8592
|
runEvalCase,
|
|
8274
8593
|
runEvaluation,
|
|
8594
|
+
scoreToVerdict,
|
|
8275
8595
|
subscribeToClaudeCodeLogEntries,
|
|
8276
8596
|
subscribeToCodexLogEntries,
|
|
8277
8597
|
subscribeToPiLogEntries,
|