@agentv/core 2.0.2 → 2.1.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/dist/index.cjs +1457 -1121
- package/dist/index.cjs.map +1 -1
- package/dist/index.d.cts +142 -71
- package/dist/index.d.ts +142 -71
- package/dist/index.js +1295 -968
- package/dist/index.js.map +1 -1
- package/package.json +2 -2
package/dist/index.js
CHANGED
|
@@ -150,85 +150,6 @@ import { readFile as readFile5 } from "node:fs/promises";
|
|
|
150
150
|
import path6 from "node:path";
|
|
151
151
|
import { parse as parse2 } from "yaml";
|
|
152
152
|
|
|
153
|
-
// src/evaluation/formatting/segment-formatter.ts
|
|
154
|
-
/**
 * Collects every triple-backtick fenced block found in the text segments.
 *
 * Only segments whose `type` is exactly "text" and whose `value` is a string
 * are scanned; all other segments are ignored.
 *
 * @param {Iterable<object>} segments - segment objects with `type` / `value` fields.
 * @returns {string[]} the fenced blocks (fences included), in encounter order.
 */
function extractCodeBlocks(segments) {
  const fencePattern = /```[\s\S]*?```/g;
  const found = [];
  for (const entry of segments) {
    if (entry.type !== "text") {
      continue;
    }
    const body = entry.value;
    if (typeof body === "string") {
      // String#match with a /g pattern yields null when nothing matches.
      const fenced = body.match(fencePattern);
      if (fenced !== null) {
        found.push(...fenced);
      }
    }
  }
  return found;
}
|
|
173
|
-
/**
 * Joins content parts into one string.
 *
 * When at least one part is flagged `isFile`, parts are separated by a blank
 * line and every file part that also has a `displayPath` is wrapped in a
 * `<file path="...">` element. Otherwise the raw contents are joined with a
 * single space.
 *
 * @param {Array<{content: string, isFile?: boolean, displayPath?: string}>} parts
 * @returns {string}
 */
function formatFileContents(parts) {
  const containsFile = parts.filter((entry) => entry.isFile).length !== 0;
  if (!containsFile) {
    return parts.map((entry) => entry.content).join(" ");
  }
  const blocks = parts.map((entry) => {
    if (!entry.isFile || !entry.displayPath) {
      return entry.content;
    }
    return `<file path="${entry.displayPath}">
${entry.content}
</file>`;
  });
  return blocks.join("\n\n");
}
|
|
187
|
-
/**
 * Renders one segment as prompt text.
 *
 * - "text": the string `value` (undefined when it is not a string).
 * - "guideline_ref": `<Attached: path>` when a non-empty string path exists.
 * - "file": in "agent" mode a `<file: path="...">` reference; otherwise the
 *   trimmed `text` wrapped by formatFileContents. Needs a non-empty path, and
 *   outside agent mode a non-empty text.
 * - anything else: undefined.
 *
 * @param {object} segment
 * @param {string} [mode="lm"] - "agent" emits file references instead of file contents.
 * @returns {string | undefined}
 */
function formatSegment(segment, mode = "lm") {
  switch (asString(segment.type)) {
    case "text":
      return asString(segment.value);
    case "guideline_ref": {
      const attachedPath = asString(segment.path);
      if (!attachedPath) {
        return void 0;
      }
      return `<Attached: ${attachedPath}>`;
    }
    case "file": {
      const location = asString(segment.path);
      if (!location) {
        return void 0;
      }
      if (mode === "agent") {
        return `<file: path="${location}">`;
      }
      const body = asString(segment.text);
      if (!body) {
        return void 0;
      }
      return formatFileContents([{ content: body.trim(), isFile: true, displayPath: location }]);
    }
    default:
      return void 0;
  }
}
|
|
211
|
-
/**
 * Reports whether any segment carries non-blank text.
 *
 * "text" segments are judged by `value`, "file" segments by `text`; every
 * other segment type (including "guideline_ref") never counts as visible.
 *
 * @param {object[]} segments
 * @returns {boolean}
 */
function hasVisibleContent(segments) {
  const isNonBlank = (candidate) => candidate !== void 0 && candidate.trim().length > 0;
  return segments.some((segment) => {
    switch (asString(segment.type)) {
      case "text":
        return isNonBlank(asString(segment.value));
      case "file":
        return isNonBlank(asString(segment.text));
      default:
        return false;
    }
  });
}
|
|
228
|
-
/**
 * Narrows an unknown value to a string.
 *
 * @param {unknown} value
 * @returns {string | undefined} the value itself when it is a string, otherwise undefined.
 */
function asString(value) {
  if (typeof value === "string") {
    return value;
  }
  return void 0;
}
|
|
231
|
-
|
|
232
153
|
// src/evaluation/loaders/config-loader.ts
|
|
233
154
|
import { readFile } from "node:fs/promises";
|
|
234
155
|
import path2 from "node:path";
|
|
@@ -483,7 +404,7 @@ async function parseEvaluators(rawEvalCase, globalExecution, searchRoots, evalId
|
|
|
483
404
|
logWarning2(`Skipping invalid evaluator entry for '${evalId}' (expected object)`);
|
|
484
405
|
continue;
|
|
485
406
|
}
|
|
486
|
-
const name =
|
|
407
|
+
const name = asString(rawEvaluator.name);
|
|
487
408
|
const typeValue = rawEvaluator.type;
|
|
488
409
|
if (!name || !isEvaluatorKind(typeValue)) {
|
|
489
410
|
logWarning2(`Skipping evaluator with invalid name/type in '${evalId}'`);
|
|
@@ -511,7 +432,7 @@ async function parseEvaluators(rawEvalCase, globalExecution, searchRoots, evalId
|
|
|
511
432
|
continue;
|
|
512
433
|
}
|
|
513
434
|
const weight2 = validateWeight(rawEvaluator.weight, name, evalId);
|
|
514
|
-
const cwd =
|
|
435
|
+
const cwd = asString(rawEvaluator.cwd);
|
|
515
436
|
let resolvedCwd;
|
|
516
437
|
if (cwd) {
|
|
517
438
|
const resolved = await resolveFileReference2(cwd, searchRoots);
|
|
@@ -526,7 +447,29 @@ async function parseEvaluators(rawEvalCase, globalExecution, searchRoots, evalId
|
|
|
526
447
|
} else {
|
|
527
448
|
resolvedCwd = searchRoots[0];
|
|
528
449
|
}
|
|
529
|
-
const
|
|
450
|
+
const rawTarget = rawEvaluator.target;
|
|
451
|
+
let targetConfig;
|
|
452
|
+
if (rawTarget !== void 0) {
|
|
453
|
+
if (isJsonObject2(rawTarget)) {
|
|
454
|
+
const maxCalls = rawTarget.max_calls;
|
|
455
|
+
if (maxCalls !== void 0 && (typeof maxCalls !== "number" || maxCalls < 0)) {
|
|
456
|
+
logWarning2(
|
|
457
|
+
`Invalid target.max_calls for evaluator '${name}' in '${evalId}': must be a non-negative number`
|
|
458
|
+
);
|
|
459
|
+
} else {
|
|
460
|
+
targetConfig = {
|
|
461
|
+
...typeof maxCalls === "number" ? { max_calls: maxCalls } : {}
|
|
462
|
+
};
|
|
463
|
+
}
|
|
464
|
+
} else if (rawTarget === true) {
|
|
465
|
+
targetConfig = {};
|
|
466
|
+
} else {
|
|
467
|
+
logWarning2(
|
|
468
|
+
`Invalid target config for evaluator '${name}' in '${evalId}': expected object or true`
|
|
469
|
+
);
|
|
470
|
+
}
|
|
471
|
+
}
|
|
472
|
+
const knownProps = /* @__PURE__ */ new Set(["name", "type", "script", "cwd", "weight", "target"]);
|
|
530
473
|
const config = {};
|
|
531
474
|
for (const [key, value] of Object.entries(rawEvaluator)) {
|
|
532
475
|
if (!knownProps.has(key) && value !== void 0) {
|
|
@@ -540,7 +483,8 @@ async function parseEvaluators(rawEvalCase, globalExecution, searchRoots, evalId
|
|
|
540
483
|
cwd,
|
|
541
484
|
resolvedCwd,
|
|
542
485
|
...weight2 !== void 0 ? { weight: weight2 } : {},
|
|
543
|
-
...Object.keys(config).length > 0 ? { config } : {}
|
|
486
|
+
...Object.keys(config).length > 0 ? { config } : {},
|
|
487
|
+
...targetConfig !== void 0 ? { target: targetConfig } : {}
|
|
544
488
|
});
|
|
545
489
|
continue;
|
|
546
490
|
}
|
|
@@ -557,7 +501,7 @@ async function parseEvaluators(rawEvalCase, globalExecution, searchRoots, evalId
|
|
|
557
501
|
logWarning2(`Skipping composite evaluator '${name}' in '${evalId}': missing aggregator`);
|
|
558
502
|
continue;
|
|
559
503
|
}
|
|
560
|
-
const aggregatorType =
|
|
504
|
+
const aggregatorType = asString(rawAggregator.type);
|
|
561
505
|
if (aggregatorType !== "weighted_average" && aggregatorType !== "code_judge" && aggregatorType !== "llm_judge") {
|
|
562
506
|
logWarning2(
|
|
563
507
|
`Skipping composite evaluator '${name}' in '${evalId}': invalid aggregator type '${aggregatorType}'`
|
|
@@ -570,7 +514,7 @@ async function parseEvaluators(rawEvalCase, globalExecution, searchRoots, evalId
|
|
|
570
514
|
logWarning2(`Skipping invalid member evaluator in composite '${name}' (expected object)`);
|
|
571
515
|
continue;
|
|
572
516
|
}
|
|
573
|
-
const memberName =
|
|
517
|
+
const memberName = asString(rawMember.name);
|
|
574
518
|
const memberType = rawMember.type;
|
|
575
519
|
if (!memberName || !isEvaluatorKind(memberType)) {
|
|
576
520
|
logWarning2(`Skipping member evaluator with invalid name/type in composite '${name}'`);
|
|
@@ -608,7 +552,7 @@ async function parseEvaluators(rawEvalCase, globalExecution, searchRoots, evalId
|
|
|
608
552
|
...Object.keys(parsedWeights).length > 0 ? { weights: parsedWeights } : {}
|
|
609
553
|
};
|
|
610
554
|
} else if (aggregatorType === "code_judge") {
|
|
611
|
-
const aggregatorPath =
|
|
555
|
+
const aggregatorPath = asString(rawAggregator.path);
|
|
612
556
|
if (!aggregatorPath) {
|
|
613
557
|
logWarning2(
|
|
614
558
|
`Skipping composite evaluator '${name}' in '${evalId}': code_judge aggregator missing path`
|
|
@@ -621,7 +565,7 @@ async function parseEvaluators(rawEvalCase, globalExecution, searchRoots, evalId
|
|
|
621
565
|
cwd: searchRoots[0]
|
|
622
566
|
};
|
|
623
567
|
} else {
|
|
624
|
-
const aggregatorPrompt =
|
|
568
|
+
const aggregatorPrompt = asString(rawAggregator.prompt);
|
|
625
569
|
let promptPath2;
|
|
626
570
|
if (aggregatorPrompt) {
|
|
627
571
|
const resolved = await resolveFileReference2(aggregatorPrompt, searchRoots);
|
|
@@ -646,7 +590,7 @@ async function parseEvaluators(rawEvalCase, globalExecution, searchRoots, evalId
|
|
|
646
590
|
continue;
|
|
647
591
|
}
|
|
648
592
|
if (typeValue === "tool_trajectory") {
|
|
649
|
-
const mode =
|
|
593
|
+
const mode = asString(rawEvaluator.mode);
|
|
650
594
|
if (mode !== "any_order" && mode !== "in_order" && mode !== "exact") {
|
|
651
595
|
logWarning2(
|
|
652
596
|
`Skipping tool_trajectory evaluator '${name}' in '${evalId}': invalid mode '${mode}' (must be any_order, in_order, or exact)`
|
|
@@ -737,8 +681,8 @@ async function parseEvaluators(rawEvalCase, globalExecution, searchRoots, evalId
|
|
|
737
681
|
);
|
|
738
682
|
continue;
|
|
739
683
|
}
|
|
740
|
-
const fieldPath =
|
|
741
|
-
const match =
|
|
684
|
+
const fieldPath = asString(rawField.path);
|
|
685
|
+
const match = asString(rawField.match);
|
|
742
686
|
if (!fieldPath) {
|
|
743
687
|
logWarning2(
|
|
744
688
|
`Skipping field without path in field_accuracy evaluator '${name}' in '${evalId}'`
|
|
@@ -768,7 +712,7 @@ async function parseEvaluators(rawEvalCase, globalExecution, searchRoots, evalId
|
|
|
768
712
|
);
|
|
769
713
|
continue;
|
|
770
714
|
}
|
|
771
|
-
const aggregation =
|
|
715
|
+
const aggregation = asString(rawEvaluator.aggregation);
|
|
772
716
|
const validAggregation = isValidFieldAggregationType(aggregation) ? aggregation : void 0;
|
|
773
717
|
const weight2 = validateWeight(rawEvaluator.weight, name, evalId);
|
|
774
718
|
evaluators.push({
|
|
@@ -849,7 +793,7 @@ async function parseEvaluators(rawEvalCase, globalExecution, searchRoots, evalId
|
|
|
849
793
|
});
|
|
850
794
|
continue;
|
|
851
795
|
}
|
|
852
|
-
const prompt =
|
|
796
|
+
const prompt = asString(rawEvaluator.prompt);
|
|
853
797
|
let promptPath;
|
|
854
798
|
if (prompt) {
|
|
855
799
|
const resolved = await resolveFileReference2(prompt, searchRoots);
|
|
@@ -868,11 +812,11 @@ async function parseEvaluators(rawEvalCase, globalExecution, searchRoots, evalId
|
|
|
868
812
|
);
|
|
869
813
|
}
|
|
870
814
|
}
|
|
871
|
-
const _model =
|
|
815
|
+
const _model = asString(rawEvaluator.model);
|
|
872
816
|
const rawRubrics = rawEvaluator.rubrics;
|
|
873
817
|
const parsedRubrics = Array.isArray(rawRubrics) ? rawRubrics.filter((r) => isJsonObject2(r)).map((rubric, index) => ({
|
|
874
|
-
id:
|
|
875
|
-
description:
|
|
818
|
+
id: asString(rubric.id) ?? `rubric-${index + 1}`,
|
|
819
|
+
description: asString(rubric.description) ?? "",
|
|
876
820
|
weight: typeof rubric.weight === "number" ? rubric.weight : 1,
|
|
877
821
|
required: typeof rubric.required === "boolean" ? rubric.required : true
|
|
878
822
|
})).filter((r) => r.description.length > 0) : void 0;
|
|
@@ -916,7 +860,7 @@ function coerceEvaluator(candidate, contextId) {
|
|
|
916
860
|
logWarning2(`Unknown evaluator '${candidate}' in ${contextId}, falling back to default`);
|
|
917
861
|
return void 0;
|
|
918
862
|
}
|
|
919
|
-
function
|
|
863
|
+
/**
 * Returns `value` unchanged when it is a string and undefined for anything else.
 *
 * @param {unknown} value
 * @returns {string | undefined}
 */
function asString(value) {
  const isText = typeof value === "string";
  if (!isText) {
    return void 0;
  }
  return value;
}
|
|
922
866
|
function asStringArray(value, description) {
|
|
@@ -992,6 +936,68 @@ function isValidFieldAggregationType(value) {
|
|
|
992
936
|
// src/evaluation/loaders/message-processor.ts
|
|
993
937
|
import { readFile as readFile3 } from "node:fs/promises";
|
|
994
938
|
import path4 from "node:path";
|
|
939
|
+
|
|
940
|
+
// src/evaluation/formatting/segment-formatter.ts
|
|
941
|
+
/**
 * Concatenates content parts for inclusion in a prompt.
 *
 * If no part has a truthy `isFile`, contents are joined with one space.
 * Otherwise parts are joined with a blank line, and each part that has both
 * `isFile` and `displayPath` is wrapped as `<file path="...">…</file>`.
 *
 * @param {Array<{content: string, isFile?: boolean, displayPath?: string}>} parts
 * @returns {string}
 */
function formatFileContents(parts) {
  const anyFile = parts.filter((item) => item.isFile).length >= 1;
  if (!anyFile) {
    return parts.map((item) => item.content).join(" ");
  }
  const renderPart = (item) => {
    const shouldWrap = item.isFile && item.displayPath;
    return shouldWrap ? `<file path="${item.displayPath}">\n${item.content}\n</file>` : item.content;
  };
  return parts.map((item) => renderPart(item)).join("\n\n");
}
|
|
955
|
+
/**
 * Converts a single segment into its textual form.
 *
 * "text" segments yield their string `value`. "guideline_ref" and "file"
 * segments both require a non-empty string `path`: a guideline becomes
 * `<Attached: path>`; a file becomes `<file: path="...">` in "agent" mode, or
 * its trimmed `text` passed through formatFileContents otherwise. Every other
 * case yields undefined.
 *
 * @param {object} segment
 * @param {string} [mode="lm"]
 * @returns {string | undefined}
 */
function formatSegment(segment, mode = "lm") {
  const kind = asString2(segment.type);
  if (kind === "text") {
    return asString2(segment.value);
  }
  if (kind !== "guideline_ref" && kind !== "file") {
    return void 0;
  }
  // Both remaining kinds are unusable without a path.
  const location = asString2(segment.path);
  if (!location) {
    return void 0;
  }
  if (kind === "guideline_ref") {
    return `<Attached: ${location}>`;
  }
  if (mode === "agent") {
    return `<file: path="${location}">`;
  }
  const body = asString2(segment.text);
  return body ? formatFileContents([{ content: body.trim(), isFile: true, displayPath: location }]) : void 0;
}
|
|
979
|
+
/**
 * Tells whether at least one segment contains non-whitespace text.
 *
 * A "text" segment is inspected through `value`, a "file" segment through
 * `text`. Segments of any other type, "guideline_ref" included, are never
 * considered visible.
 *
 * @param {object[]} segments
 * @returns {boolean}
 */
function hasVisibleContent(segments) {
  return segments.some((segment) => {
    const kind = asString2(segment.type);
    // Pick the field that holds the displayable text for this kind, if any.
    const rawField = kind === "text" ? segment.value : kind === "file" ? segment.text : void 0;
    const shown = asString2(rawField);
    return shown !== void 0 && shown.trim().length > 0;
  });
}
|
|
996
|
+
/**
 * String type guard: passes strings through and maps everything else to undefined.
 *
 * @param {unknown} value
 * @returns {string | undefined}
 */
function asString2(value) {
  if (typeof value !== "string") {
    return void 0;
  }
  return value;
}
|
|
999
|
+
|
|
1000
|
+
// src/evaluation/loaders/message-processor.ts
|
|
995
1001
|
var ANSI_YELLOW4 = "\x1B[33m";
|
|
996
1002
|
var ANSI_RESET4 = "\x1B[0m";
|
|
997
1003
|
async function processMessages(options) {
|
|
@@ -1297,9 +1303,6 @@ ${messageContent}`);
|
|
|
1297
1303
|
questionParts.push(formattedContent);
|
|
1298
1304
|
}
|
|
1299
1305
|
}
|
|
1300
|
-
if (testCase.code_snippets.length > 0) {
|
|
1301
|
-
questionParts.push(testCase.code_snippets.join("\n"));
|
|
1302
|
-
}
|
|
1303
1306
|
question = questionParts.map((part) => part.trim()).filter((part) => part.length > 0).join("\n\n");
|
|
1304
1307
|
}
|
|
1305
1308
|
const chatPrompt = useRoleMarkers ? buildChatPromptFromSegments({
|
|
@@ -1498,7 +1501,6 @@ async function loadEvalCases(evalFilePath, repoRoot, options) {
|
|
|
1498
1501
|
repoRootPath,
|
|
1499
1502
|
verbose
|
|
1500
1503
|
}) : [];
|
|
1501
|
-
const codeSnippets = extractCodeBlocks(inputSegments);
|
|
1502
1504
|
let referenceAnswer = "";
|
|
1503
1505
|
if (outputSegments.length > 0) {
|
|
1504
1506
|
const lastMessage = outputSegments[outputSegments.length - 1];
|
|
@@ -1571,7 +1573,6 @@ async function loadEvalCases(evalFilePath, repoRoot, options) {
|
|
|
1571
1573
|
guideline_paths: guidelinePaths.map((guidelinePath) => path6.resolve(guidelinePath)),
|
|
1572
1574
|
guideline_patterns: guidelinePatterns,
|
|
1573
1575
|
file_paths: allFilePaths,
|
|
1574
|
-
code_snippets: codeSnippets,
|
|
1575
1576
|
expected_outcome: outcome,
|
|
1576
1577
|
evaluator: evalCaseEvaluatorKind,
|
|
1577
1578
|
evaluators
|
|
@@ -5311,9 +5312,64 @@ function resolveAndCreateProvider(definition, env = process.env) {
|
|
|
5311
5312
|
return createProvider(resolved);
|
|
5312
5313
|
}
|
|
5313
5314
|
|
|
5314
|
-
// src/evaluation/evaluators.ts
|
|
5315
|
-
|
|
5316
|
-
|
|
5315
|
+
// src/evaluation/evaluators/scoring.ts
|
|
5316
|
+
/**
 * Maps a numeric score to a verdict label.
 *
 * @param {number} score
 * @returns {"pass" | "borderline" | "fail"} "pass" at 0.8 and above,
 *   "borderline" from 0.6 up to (not including) 0.8, otherwise "fail".
 */
function scoreToVerdict(score) {
  return score >= 0.8 ? "pass" : score >= 0.6 ? "borderline" : "fail";
}
|
|
5325
|
+
/**
 * Restricts a score to the closed interval [0, 1].
 *
 * @param {number} value
 * @returns {number} 0 for NaN, infinite, or negative input; 1 for input above 1;
 *   the input itself otherwise.
 */
function clampScore(value) {
  // Number.isFinite is false for NaN and for both infinities.
  if (!Number.isFinite(value) || value < 0) {
    return 0;
  }
  return value > 1 ? 1 : value;
}
|
|
5337
|
+
/**
 * Finds the span from the first "{" to the last "}" in the text.
 *
 * The match is greedy, so nested or multiple objects are returned as one span.
 *
 * @param {string} text
 * @returns {string | undefined} the matched span, or undefined when there is none.
 */
function extractJsonBlob(text) {
  const found = text.match(/\{[\s\S]*\}/);
  if (found === null) {
    return void 0;
  }
  return found[0];
}
|
|
5341
|
+
/**
 * Parses JSON embedded in free-form text.
 *
 * Markdown code fences are stripped first; then the brace-delimited span found
 * by extractJsonBlob is parsed, falling back to the whole cleaned text when no
 * such span exists. Non-string input is treated as an empty string.
 *
 * @param {unknown} text
 * @returns {unknown} the parsed JSON value.
 * @throws {SyntaxError} when the selected text is not valid JSON.
 */
function parseJsonFromText(text) {
  let stripped = "";
  if (typeof text === "string") {
    stripped = text.replace(/```json\n?|```/g, "").trim();
  }
  const candidate = extractJsonBlob(stripped);
  const source = candidate === void 0 || candidate === null ? stripped : candidate;
  return JSON.parse(source);
}
|
|
5346
|
+
/**
 * Checks that a value is a string with at least one non-whitespace character.
 *
 * @param {unknown} value
 * @returns {boolean}
 */
function isNonEmptyString(value) {
  if (typeof value !== "string") {
    return false;
  }
  return value.trim().length > 0;
}
|
|
5349
|
+
/**
 * Parses JSON without throwing.
 *
 * @param {string} payload
 * @returns {unknown} the parsed value, or undefined when JSON.parse throws.
 */
function parseJsonSafe(payload) {
  let parsed;
  try {
    parsed = JSON.parse(payload);
  } catch {
    // Invalid JSON is an expected outcome here; callers test for undefined.
    parsed = void 0;
  }
  return parsed;
}
|
|
5356
|
+
/**
 * Structural equality for JSON-like values.
 *
 * Primitives (and functions) compare with `===`. Arrays must have equal length
 * and pairwise-equal elements. Plain objects must have the same number of own
 * enumerable keys, each present on both sides with deeply equal values. An
 * array never equals a non-array.
 *
 * @param {unknown} a
 * @param {unknown} b
 * @returns {boolean}
 */
function deepEqual(a, b) {
  if (a === b) {
    return true;
  }
  // From here on the values are not identical, so only two non-null objects can still match.
  const bothObjects = a !== null && b !== null && typeof a === "object" && typeof b === "object";
  if (!bothObjects) {
    return false;
  }
  const leftIsArray = Array.isArray(a);
  if (leftIsArray !== Array.isArray(b)) {
    return false;
  }
  if (leftIsArray) {
    return a.length === b.length && a.every((item, index) => deepEqual(item, b[index]));
  }
  const leftKeys = Object.keys(a);
  if (leftKeys.length !== Object.keys(b).length) {
    return false;
  }
  for (const key of leftKeys) {
    if (!Object.hasOwn(b, key) || !deepEqual(a[key], b[key])) {
      return false;
    }
  }
  return true;
}
|
|
5317
5373
|
|
|
5318
5374
|
// src/runtime/exec.ts
|
|
5319
5375
|
function shellEscapePath(value) {
|
|
@@ -5338,7 +5394,9 @@ async function execFileWithStdinBun(argv, stdinPayload, options) {
|
|
|
5338
5394
|
cwd: options.cwd,
|
|
5339
5395
|
stdin: encoder.encode(stdinPayload),
|
|
5340
5396
|
stdout: "pipe",
|
|
5341
|
-
stderr: "pipe"
|
|
5397
|
+
stderr: "pipe",
|
|
5398
|
+
// Merge additional env vars with process.env
|
|
5399
|
+
env: options.env ? { ...process.env, ...options.env } : process.env
|
|
5342
5400
|
});
|
|
5343
5401
|
let timedOut = false;
|
|
5344
5402
|
const timeout = options.timeoutMs !== void 0 ? setTimeout(() => {
|
|
@@ -5373,7 +5431,9 @@ async function execFileWithStdinNode(argv, stdinPayload, options) {
|
|
|
5373
5431
|
const [cmd, ...args] = argv;
|
|
5374
5432
|
const child = spawn4(cmd, args, {
|
|
5375
5433
|
cwd: options.cwd,
|
|
5376
|
-
stdio: ["pipe", "pipe", "pipe"]
|
|
5434
|
+
stdio: ["pipe", "pipe", "pipe"],
|
|
5435
|
+
// Merge additional env vars with process.env
|
|
5436
|
+
env: options.env ? { ...process.env, ...options.env } : process.env
|
|
5377
5437
|
});
|
|
5378
5438
|
const stdoutChunks = [];
|
|
5379
5439
|
const stderrChunks = [];
|
|
@@ -5426,7 +5486,9 @@ async function execShellWithStdin(command, stdinPayload, options = {}) {
|
|
|
5426
5486
|
const child = spawn4(wrappedCommand, {
|
|
5427
5487
|
shell: true,
|
|
5428
5488
|
cwd: options.cwd,
|
|
5429
|
-
stdio: ["ignore", "ignore", "ignore"]
|
|
5489
|
+
stdio: ["ignore", "ignore", "ignore"],
|
|
5490
|
+
// Merge additional env vars with process.env
|
|
5491
|
+
env: options.env ? { ...process.env, ...options.env } : process.env
|
|
5430
5492
|
});
|
|
5431
5493
|
const timeout = options.timeoutMs ? setTimeout(() => {
|
|
5432
5494
|
child.kill();
|
|
@@ -5453,32 +5515,387 @@ async function execShellWithStdin(command, stdinPayload, options = {}) {
|
|
|
5453
5515
|
}
|
|
5454
5516
|
}
|
|
5455
5517
|
|
|
5456
|
-
// src/
|
|
5457
|
-
|
|
5458
|
-
|
|
5459
|
-
|
|
5518
|
+
// src/runtime/target-proxy.ts
|
|
5519
|
+
import { randomBytes } from "node:crypto";
|
|
5520
|
+
import { createServer } from "node:http";
|
|
5521
|
+
var DEFAULT_MAX_CALLS = 50;
|
|
5522
|
+
/**
 * Starts an HTTP server on 127.0.0.1 (OS-assigned port) that forwards
 * authenticated requests to provider objects and enforces a call budget.
 *
 * Every request except the CORS preflight must send
 * `Authorization: Bearer <token>` with the random token returned here.
 *
 * Routes:
 *   GET  /info         -> { targetName, maxCalls, callCount, availableTargets }
 *   POST /invoke       -> body { question, target?, systemPrompt?, evalCaseId?, attempt? };
 *                         responds { outputMessages, rawText }
 *   POST /invokeBatch  -> body { requests: [...] }; items run sequentially and
 *                         per-item failures are reported in that item's rawText
 *
 * A call slot is claimed immediately before each provider invocation, so an
 * invocation that throws still counts toward `maxCalls`.
 *
 * @param {object} options
 * @param {object} options.defaultProvider - used when a request names no target
 *   or names `defaultProvider.targetName`; must expose `targetName` and `invoke()`.
 * @param {Function} [options.targetResolver] - maps any other target name to a
 *   provider; a falsy result is reported as an unknown target.
 * @param {string[]} [options.availableTargets] - names listed by /info and in
 *   unknown-target errors; defaults to `[defaultProvider.targetName]`.
 * @param {number} options.maxCalls - maximum number of provider invocations.
 * @returns {Promise<{url: string, token: string, shutdown: Function, getUsageMetadata: Function}>}
 *   Rejects if the server fails to start listening.
 */
async function createTargetProxy(options) {
  const { defaultProvider, targetResolver, availableTargets, maxCalls } = options;
  const token = randomBytes(32).toString("hex");
  let callCount = 0;
  let isShutdown = false;
  const targetsList = availableTargets ?? [defaultProvider.targetName];

  // Picks the provider for a requested target name; undefined when it cannot be resolved.
  function resolveProvider(targetName) {
    if (targetName === void 0 || targetName === defaultProvider.targetName) {
      return defaultProvider;
    }
    if (targetResolver) {
      return targetResolver(targetName);
    }
    return void 0;
  }

  // Claims one call slot. The check and the increment happen with no await in
  // between, so requests interleaved on the event loop cannot both take the last slot.
  function tryReserveCall() {
    if (callCount >= maxCalls) {
      return false;
    }
    callCount++;
    return true;
  }

  const server = createServer(async (req, res) => {
    res.setHeader("Access-Control-Allow-Origin", "*");
    res.setHeader("Access-Control-Allow-Methods", "GET, POST, OPTIONS");
    res.setHeader("Access-Control-Allow-Headers", "Content-Type, Authorization");
    // Preflight requests are answered before authentication.
    if (req.method === "OPTIONS") {
      res.writeHead(204);
      res.end();
      return;
    }
    const authHeader = req.headers.authorization;
    if (!authHeader || authHeader !== `Bearer ${token}`) {
      sendJson(res, 401, { error: "Unauthorized" });
      return;
    }
    if (isShutdown) {
      sendJson(res, 503, { error: "Proxy is shutting down" });
      return;
    }
    const url2 = req.url ?? "";
    if (req.method === "GET" && url2 === "/info") {
      handleInfo(res);
      return;
    }
    if (req.method === "POST" && url2 === "/invoke") {
      await handleInvoke(req, res);
      return;
    }
    if (req.method === "POST" && url2 === "/invokeBatch") {
      await handleInvokeBatch(req, res);
      return;
    }
    sendJson(res, 404, { error: "Not found" });
  });

  // GET /info: reports the current budget and the selectable targets.
  function handleInfo(res) {
    const response = {
      targetName: defaultProvider.targetName,
      maxCalls,
      callCount,
      availableTargets: targetsList
    };
    sendJson(res, 200, response);
  }

  // POST /invoke: validates the request, claims a call slot, and forwards to the provider.
  async function handleInvoke(req, res) {
    // Fast-fail before reading the body when the budget is already spent.
    if (callCount >= maxCalls) {
      sendJson(res, 429, { error: `Max calls exceeded (limit: ${maxCalls})` });
      return;
    }
    try {
      const body = await readBody(req);
      const request = JSON.parse(body);
      // `?.` keeps a JSON body of `null` on the 400 path instead of throwing.
      if (!request?.question || typeof request.question !== "string") {
        sendJson(res, 400, { error: "Missing required field: question" });
        return;
      }
      const provider = resolveProvider(request.target);
      if (!provider) {
        sendJson(res, 400, {
          error: `Unknown target '${request.target}'. Available: ${targetsList.join(", ")}`
        });
        return;
      }
      // Several requests may have passed the fast-fail check while their bodies
      // were still being read; the slot is claimed here, right before the call.
      if (!tryReserveCall()) {
        sendJson(res, 429, { error: `Max calls exceeded (limit: ${maxCalls})` });
        return;
      }
      const response = await provider.invoke({
        question: request.question,
        systemPrompt: request.systemPrompt,
        evalCaseId: request.evalCaseId ?? "proxy",
        attempt: request.attempt ?? 1
      });
      const outputMessages = response.outputMessages ?? [];
      const rawText = extractLastAssistantContent2(outputMessages);
      const result = {
        outputMessages,
        rawText
      };
      sendJson(res, 200, result);
    } catch (error) {
      const message = error instanceof Error ? error.message : String(error);
      sendJson(res, 500, { error: message });
    }
  }

  // POST /invokeBatch: runs the requests one after another; a failing item does not abort the batch.
  async function handleInvokeBatch(req, res) {
    try {
      const body = await readBody(req);
      const parsed = JSON.parse(body);
      const requests = parsed?.requests;
      if (!Array.isArray(requests)) {
        sendJson(res, 400, { error: "Missing required field: requests (array)" });
        return;
      }
      if (callCount + requests.length > maxCalls) {
        sendJson(res, 429, {
          error: `Batch would exceed max calls (current: ${callCount}, batch: ${requests.length}, limit: ${maxCalls})`
        });
        return;
      }
      const responses = [];
      for (const request of requests) {
        // `?.` turns a null entry into a per-item error rather than a 500 for the whole batch.
        if (!request?.question || typeof request.question !== "string") {
          responses.push({
            outputMessages: [],
            rawText: "Error: Missing required field: question"
          });
          continue;
        }
        const provider = resolveProvider(request.target);
        if (!provider) {
          responses.push({
            outputMessages: [],
            rawText: `Error: Unknown target '${request.target}'. Available: ${targetsList.join(", ")}`
          });
          continue;
        }
        // Other requests can consume slots while this batch awaits earlier
        // items, so the budget is re-checked for every item.
        if (!tryReserveCall()) {
          responses.push({
            outputMessages: [],
            rawText: `Error: Max calls exceeded (limit: ${maxCalls})`
          });
          continue;
        }
        try {
          const response = await provider.invoke({
            question: request.question,
            systemPrompt: request.systemPrompt,
            evalCaseId: request.evalCaseId ?? "proxy",
            attempt: request.attempt ?? 1
          });
          const outputMessages = response.outputMessages ?? [];
          responses.push({
            outputMessages,
            rawText: extractLastAssistantContent2(outputMessages)
          });
        } catch (error) {
          const message = error instanceof Error ? error.message : String(error);
          responses.push({
            outputMessages: [],
            rawText: `Error: ${message}`
          });
        }
      }
      sendJson(res, 200, { responses });
    } catch (error) {
      const message = error instanceof Error ? error.message : String(error);
      sendJson(res, 500, { error: message });
    }
  }

  // Port 0 asks the OS for a free port; startup errors reject the returned promise.
  await new Promise((resolve, reject) => {
    server.once("error", reject);
    server.listen(0, "127.0.0.1", () => {
      server.removeListener("error", reject);
      resolve();
    });
  });
  const address = server.address();
  const url = `http://127.0.0.1:${address.port}`;
  return {
    url,
    token,
    shutdown: async () => {
      // Flag first so requests arriving during close are answered with 503.
      isShutdown = true;
      return new Promise((resolve, reject) => {
        server.close((err) => {
          if (err) reject(err);
          else resolve();
        });
      });
    },
    getUsageMetadata: () => ({
      callCount,
      maxCalls
    })
  };
}
|
|
5702
|
+
/**
 * Writes a JSON response: status line with a JSON content type, then the serialized body.
 *
 * @param {object} res - response object exposing `writeHead` and `end`.
 * @param {number} statusCode
 * @param {unknown} body - value passed to JSON.stringify.
 */
function sendJson(res, statusCode, body) {
  const headers = { "Content-Type": "application/json" };
  res.writeHead(statusCode, headers);
  const serialized = JSON.stringify(body);
  res.end(serialized);
}
|
|
5706
|
+
/**
 * Buffers an entire request stream and decodes it as UTF-8.
 *
 * @param {object} req - event emitter producing "data", "end" and "error" events.
 * @returns {Promise<string>} resolves with the full body on "end"; rejects on "error".
 */
function readBody(req) {
  const received = [];
  const collect = (piece) => {
    received.push(piece);
  };
  return new Promise((resolve, reject) => {
    req.on("data", collect);
    req.on("end", () => {
      // Decode after concatenation so multi-byte characters split across chunks stay intact.
      const whole = Buffer.concat(received);
      resolve(whole.toString("utf8"));
    });
    req.on("error", reject);
  });
}
|
|
5714
|
+
/**
 * Finds the text of the most recent assistant message.
 *
 * Messages are scanned from last to first. For an assistant message with
 * string content, that string is returned. For array content, the first
 * object part that has a `text` property is returned via String(). Assistant
 * messages with no usable text are skipped and the scan continues.
 *
 * @param {Array<{role?: string, content?: unknown}>} messages
 * @returns {string | undefined} undefined when no assistant text is found.
 */
function extractLastAssistantContent2(messages) {
  let index = messages.length;
  while (index-- > 0) {
    const entry = messages[index];
    if (entry.role !== "assistant") {
      continue;
    }
    const body = entry.content;
    if (typeof body === "string") {
      return body;
    }
    if (!Array.isArray(body)) {
      continue;
    }
    // The typeof/null guards must run before `in`, which throws on primitives.
    const textPart = body.find((piece) => typeof piece === "object" && piece !== null && "text" in piece);
    if (textPart !== void 0) {
      return String(textPart.text);
    }
  }
  return void 0;
}
|
|
5732
|
+
|
|
5733
|
+
// src/evaluation/case-conversion.ts
|
|
5734
|
+
/**
 * Converts a camelCase identifier to snake_case.
 *
 * Strings that begin with an uppercase ASCII letter are returned untouched;
 * otherwise every uppercase ASCII letter becomes "_" plus its lowercase form.
 *
 * @param {string} str
 * @returns {string}
 */
function toSnakeCase(str) {
  const startsUppercase = /^[A-Z]/.test(str);
  return startsUppercase ? str : str.replace(/[A-Z]/g, (upper) => `_${upper.toLowerCase()}`);
}
|
|
5740
|
+
/**
 * Recursively renames object keys with toSnakeCase.
 *
 * Arrays are mapped element by element, objects are rebuilt with converted
 * keys and recursively converted values, and every other value (primitives,
 * null, undefined, functions) is returned as is. The input is not mutated.
 *
 * @param {unknown} obj
 * @returns {unknown}
 */
function toSnakeCaseDeep(obj) {
  if (typeof obj !== "object" || obj === null) {
    return obj;
  }
  if (Array.isArray(obj)) {
    return obj.map((element) => toSnakeCaseDeep(element));
  }
  const converted = {};
  Object.entries(obj).forEach(([name, inner]) => {
    converted[toSnakeCase(name)] = toSnakeCaseDeep(inner);
  });
  return converted;
}
|
|
5757
|
+
|
|
5758
|
+
// src/evaluation/evaluators/code-evaluator.ts
|
|
5759
|
+
var CodeEvaluator = class {
|
|
5760
|
+
kind = "code";
|
|
5761
|
+
script;
|
|
5762
|
+
cwd;
|
|
5763
|
+
agentTimeoutMs;
|
|
5764
|
+
config;
|
|
5765
|
+
target;
|
|
5766
|
+
constructor(options) {
|
|
5767
|
+
this.script = options.script;
|
|
5768
|
+
this.cwd = options.cwd;
|
|
5769
|
+
this.agentTimeoutMs = options.agentTimeoutMs;
|
|
5770
|
+
this.config = options.config;
|
|
5771
|
+
this.target = options.target;
|
|
5772
|
+
}
|
|
5773
|
+
async evaluate(context) {
|
|
5774
|
+
const payload = {
|
|
5775
|
+
question: context.evalCase.question,
|
|
5776
|
+
expectedOutcome: context.evalCase.expected_outcome,
|
|
5777
|
+
expectedMessages: context.evalCase.expected_messages,
|
|
5778
|
+
referenceAnswer: context.evalCase.reference_answer,
|
|
5779
|
+
candidateAnswer: context.candidate,
|
|
5780
|
+
outputMessages: context.outputMessages ?? null,
|
|
5781
|
+
guidelineFiles: context.evalCase.guideline_paths,
|
|
5782
|
+
inputFiles: context.evalCase.file_paths.filter(
|
|
5783
|
+
(path15) => !context.evalCase.guideline_paths.includes(path15)
|
|
5784
|
+
),
|
|
5785
|
+
inputMessages: context.evalCase.input_messages,
|
|
5786
|
+
traceSummary: context.traceSummary ?? null,
|
|
5787
|
+
config: this.config ?? null
|
|
5788
|
+
};
|
|
5789
|
+
const inputPayload = JSON.stringify(toSnakeCaseDeep(payload), null, 2);
|
|
5790
|
+
let proxyEnv;
|
|
5791
|
+
let proxyShutdown;
|
|
5792
|
+
let getProxyUsage;
|
|
5793
|
+
if (this.target !== void 0 && context.judgeProvider) {
|
|
5794
|
+
const maxCalls = this.target.max_calls ?? DEFAULT_MAX_CALLS;
|
|
5795
|
+
const proxy = await createTargetProxy({
|
|
5796
|
+
defaultProvider: context.judgeProvider,
|
|
5797
|
+
targetResolver: context.targetResolver,
|
|
5798
|
+
availableTargets: context.availableTargets,
|
|
5799
|
+
maxCalls
|
|
5800
|
+
});
|
|
5801
|
+
proxyEnv = {
|
|
5802
|
+
AGENTV_TARGET_PROXY_URL: proxy.url,
|
|
5803
|
+
AGENTV_TARGET_PROXY_TOKEN: proxy.token
|
|
5804
|
+
};
|
|
5805
|
+
proxyShutdown = proxy.shutdown;
|
|
5806
|
+
getProxyUsage = proxy.getUsageMetadata;
|
|
5807
|
+
}
|
|
5808
|
+
try {
|
|
5809
|
+
const stdout = await executeScript(
|
|
5810
|
+
this.script,
|
|
5811
|
+
inputPayload,
|
|
5812
|
+
this.agentTimeoutMs,
|
|
5813
|
+
this.cwd,
|
|
5814
|
+
proxyEnv
|
|
5815
|
+
);
|
|
5816
|
+
const parsed = parseJsonSafe(stdout);
|
|
5817
|
+
const score = clampScore(typeof parsed?.score === "number" ? parsed.score : 0);
|
|
5818
|
+
const hits = Array.isArray(parsed?.hits) ? parsed.hits.filter(isNonEmptyString) : [];
|
|
5819
|
+
const misses = Array.isArray(parsed?.misses) ? parsed.misses.filter(isNonEmptyString) : [];
|
|
5820
|
+
const reasoning = typeof parsed?.reasoning === "string" ? parsed.reasoning : void 0;
|
|
5821
|
+
const details = parsed?.details && typeof parsed.details === "object" && !Array.isArray(parsed.details) ? parsed.details : void 0;
|
|
5822
|
+
const proxyUsage = getProxyUsage?.();
|
|
5823
|
+
const evaluatorRawRequest = {
|
|
5824
|
+
script: this.script,
|
|
5825
|
+
...this.cwd ? { cwd: this.cwd } : {},
|
|
5826
|
+
...proxyUsage ? {
|
|
5827
|
+
target_proxy: {
|
|
5828
|
+
call_count: proxyUsage.callCount,
|
|
5829
|
+
max_calls: proxyUsage.maxCalls
|
|
5830
|
+
}
|
|
5831
|
+
} : {}
|
|
5832
|
+
};
|
|
5833
|
+
return {
|
|
5834
|
+
score,
|
|
5835
|
+
verdict: scoreToVerdict(score),
|
|
5836
|
+
hits,
|
|
5837
|
+
misses,
|
|
5838
|
+
expectedAspectCount: hits.length + misses.length || 1,
|
|
5839
|
+
reasoning,
|
|
5840
|
+
evaluatorRawRequest,
|
|
5841
|
+
...details ? { details } : {}
|
|
5842
|
+
};
|
|
5843
|
+
} catch (error) {
|
|
5844
|
+
const message = error instanceof Error ? error.message : String(error);
|
|
5845
|
+
const proxyUsage = getProxyUsage?.();
|
|
5846
|
+
return {
|
|
5847
|
+
score: 0,
|
|
5848
|
+
verdict: "fail",
|
|
5849
|
+
hits: [],
|
|
5850
|
+
misses: [`Code evaluator failed: ${message}`],
|
|
5851
|
+
expectedAspectCount: 1,
|
|
5852
|
+
reasoning: message,
|
|
5853
|
+
evaluatorRawRequest: {
|
|
5854
|
+
script: this.script,
|
|
5855
|
+
...this.cwd ? { cwd: this.cwd } : {},
|
|
5856
|
+
...proxyUsage ? {
|
|
5857
|
+
target_proxy: {
|
|
5858
|
+
call_count: proxyUsage.callCount,
|
|
5859
|
+
max_calls: proxyUsage.maxCalls
|
|
5860
|
+
}
|
|
5861
|
+
} : {},
|
|
5862
|
+
error: message
|
|
5863
|
+
}
|
|
5864
|
+
};
|
|
5865
|
+
} finally {
|
|
5866
|
+
if (proxyShutdown) {
|
|
5867
|
+
await proxyShutdown();
|
|
5868
|
+
}
|
|
5869
|
+
}
|
|
5870
|
+
}
|
|
5871
|
+
};
|
|
5872
|
+
async function executeScript(scriptPath, input, agentTimeoutMs, cwd, env) {
|
|
5873
|
+
const { stdout, stderr, exitCode } = typeof scriptPath === "string" ? await execShellWithStdin(scriptPath, input, { cwd, timeoutMs: agentTimeoutMs, env }) : await execFileWithStdin(scriptPath, input, { cwd, timeoutMs: agentTimeoutMs, env });
|
|
5874
|
+
if (exitCode !== 0) {
|
|
5875
|
+
const trimmedErr = formatStderr(stderr);
|
|
5876
|
+
throw new Error(
|
|
5877
|
+
trimmedErr.length > 0 ? `Code evaluator exited with code ${exitCode}: ${trimmedErr}` : `Code evaluator exited with code ${exitCode}`
|
|
5878
|
+
);
|
|
5879
|
+
}
|
|
5880
|
+
return stdout.trim();
|
|
5881
|
+
}
|
|
5882
|
+
function formatStderr(stderr) {
|
|
5883
|
+
const trimmed = stderr.trim();
|
|
5884
|
+
const maxLength = 2e3;
|
|
5885
|
+
if (trimmed.length <= maxLength) {
|
|
5886
|
+
return trimmed;
|
|
5887
|
+
}
|
|
5888
|
+
const tail = trimmed.slice(-maxLength);
|
|
5889
|
+
return `...(truncated, last ${maxLength} chars)
|
|
5890
|
+
${tail}`;
|
|
5891
|
+
}
|
|
5892
|
+
|
|
5893
|
+
// src/evaluation/evaluators/composite.ts
|
|
5894
|
+
import { generateText as generateText3 } from "ai";
|
|
5895
|
+
|
|
5896
|
+
// src/evaluation/evaluators/llm-judge.ts
|
|
5897
|
+
import { generateText as generateText2 } from "ai";
|
|
5898
|
+
import { z as z2 } from "zod";
|
|
5482
5899
|
var DEFAULT_EVALUATOR_TEMPLATE = `You are an expert evaluator. Your goal is to grade the candidate_answer based on how well it achieves the expected_outcome for the original task.
|
|
5483
5900
|
|
|
5484
5901
|
Use the reference_answer as a gold standard for a high-quality response (if provided). The reference_answer may be a simple text response, or it may contain a sequence of expected agent messages including tool calls. When it contains multiple messages, the last message represents the final expected answer. The candidate_answer does not need to match it verbatim, but should capture the key points and follow the same spirit.
|
|
@@ -5558,7 +5975,7 @@ var LlmJudgeEvaluator = class {
|
|
|
5558
5975
|
target: judgeProvider.targetName
|
|
5559
5976
|
};
|
|
5560
5977
|
try {
|
|
5561
|
-
const { data
|
|
5978
|
+
const { data } = await this.runWithRetry({
|
|
5562
5979
|
context,
|
|
5563
5980
|
judgeProvider,
|
|
5564
5981
|
systemPrompt,
|
|
@@ -5707,105 +6124,11 @@ You must return a valid JSON object matching this schema:
|
|
|
5707
6124
|
"overall_reasoning": "string (summary)"
|
|
5708
6125
|
}`;
|
|
5709
6126
|
}
|
|
5710
|
-
function
|
|
5711
|
-
|
|
5712
|
-
return
|
|
5713
|
-
}
|
|
5714
|
-
if (score >= 0.6) {
|
|
5715
|
-
return "borderline";
|
|
5716
|
-
}
|
|
5717
|
-
return "fail";
|
|
5718
|
-
}
|
|
5719
|
-
function clampScore(value) {
|
|
5720
|
-
if (Number.isNaN(value) || !Number.isFinite(value)) {
|
|
5721
|
-
return 0;
|
|
5722
|
-
}
|
|
5723
|
-
if (value < 0) {
|
|
5724
|
-
return 0;
|
|
5725
|
-
}
|
|
5726
|
-
if (value > 1) {
|
|
5727
|
-
return 1;
|
|
5728
|
-
}
|
|
5729
|
-
return value;
|
|
5730
|
-
}
|
|
5731
|
-
function extractJsonBlob(text) {
|
|
5732
|
-
const match = text.match(/\{[\s\S]*\}/);
|
|
5733
|
-
return match?.[0];
|
|
5734
|
-
}
|
|
5735
|
-
function parseJsonFromText(text) {
|
|
5736
|
-
const cleaned = typeof text === "string" ? text.replace(/```json\n?|```/g, "").trim() : "";
|
|
5737
|
-
const blob = extractJsonBlob(cleaned) ?? cleaned;
|
|
5738
|
-
return JSON.parse(blob);
|
|
5739
|
-
}
|
|
5740
|
-
function isNonEmptyString(value) {
|
|
5741
|
-
return typeof value === "string" && value.trim().length > 0;
|
|
6127
|
+
function substituteVariables(template, variables) {
|
|
6128
|
+
return template.replace(/\{\{\s*([a-zA-Z0-9_]+)\s*\}\}/g, (match, varName) => {
|
|
6129
|
+
return variables[varName] ?? match;
|
|
6130
|
+
});
|
|
5742
6131
|
}
|
|
5743
|
-
var CodeEvaluator = class {
|
|
5744
|
-
kind = "code";
|
|
5745
|
-
script;
|
|
5746
|
-
cwd;
|
|
5747
|
-
agentTimeoutMs;
|
|
5748
|
-
config;
|
|
5749
|
-
constructor(options) {
|
|
5750
|
-
this.script = options.script;
|
|
5751
|
-
this.cwd = options.cwd;
|
|
5752
|
-
this.agentTimeoutMs = options.agentTimeoutMs;
|
|
5753
|
-
this.config = options.config;
|
|
5754
|
-
}
|
|
5755
|
-
async evaluate(context) {
|
|
5756
|
-
const payload = {
|
|
5757
|
-
question: context.evalCase.question,
|
|
5758
|
-
expectedOutcome: context.evalCase.expected_outcome,
|
|
5759
|
-
expectedMessages: context.evalCase.expected_messages,
|
|
5760
|
-
referenceAnswer: context.evalCase.reference_answer,
|
|
5761
|
-
candidateAnswer: context.candidate,
|
|
5762
|
-
outputMessages: context.outputMessages ?? null,
|
|
5763
|
-
guidelineFiles: context.evalCase.guideline_paths,
|
|
5764
|
-
inputFiles: context.evalCase.file_paths.filter(
|
|
5765
|
-
(path15) => !context.evalCase.guideline_paths.includes(path15)
|
|
5766
|
-
),
|
|
5767
|
-
inputMessages: context.evalCase.input_messages,
|
|
5768
|
-
traceSummary: context.traceSummary ?? null,
|
|
5769
|
-
config: this.config ?? null
|
|
5770
|
-
};
|
|
5771
|
-
const inputPayload = JSON.stringify(toSnakeCaseDeep(payload), null, 2);
|
|
5772
|
-
try {
|
|
5773
|
-
const stdout = await executeScript(this.script, inputPayload, this.agentTimeoutMs, this.cwd);
|
|
5774
|
-
const parsed = parseJsonSafe(stdout);
|
|
5775
|
-
const score = clampScore(typeof parsed?.score === "number" ? parsed.score : 0);
|
|
5776
|
-
const hits = Array.isArray(parsed?.hits) ? parsed.hits.filter(isNonEmptyString) : [];
|
|
5777
|
-
const misses = Array.isArray(parsed?.misses) ? parsed.misses.filter(isNonEmptyString) : [];
|
|
5778
|
-
const reasoning = typeof parsed?.reasoning === "string" ? parsed.reasoning : void 0;
|
|
5779
|
-
return {
|
|
5780
|
-
score,
|
|
5781
|
-
verdict: scoreToVerdict(score),
|
|
5782
|
-
hits,
|
|
5783
|
-
misses,
|
|
5784
|
-
expectedAspectCount: hits.length + misses.length || 1,
|
|
5785
|
-
reasoning,
|
|
5786
|
-
evaluatorRawRequest: {
|
|
5787
|
-
script: this.script,
|
|
5788
|
-
...this.cwd ? { cwd: this.cwd } : {}
|
|
5789
|
-
}
|
|
5790
|
-
};
|
|
5791
|
-
} catch (error) {
|
|
5792
|
-
const message = error instanceof Error ? error.message : String(error);
|
|
5793
|
-
return {
|
|
5794
|
-
score: 0,
|
|
5795
|
-
verdict: "fail",
|
|
5796
|
-
hits: [],
|
|
5797
|
-
misses: [`Code evaluator failed: ${message}`],
|
|
5798
|
-
expectedAspectCount: 1,
|
|
5799
|
-
reasoning: message,
|
|
5800
|
-
evaluatorRawRequest: {
|
|
5801
|
-
script: this.script,
|
|
5802
|
-
...this.cwd ? { cwd: this.cwd } : {},
|
|
5803
|
-
error: message
|
|
5804
|
-
}
|
|
5805
|
-
};
|
|
5806
|
-
}
|
|
5807
|
-
}
|
|
5808
|
-
};
|
|
5809
6132
|
function calculateRubricScore(result, rubrics) {
|
|
5810
6133
|
const rubricMap = new Map(rubrics.map((rubric) => [rubric.id, rubric]));
|
|
5811
6134
|
const hits = [];
|
|
@@ -5833,273 +6156,281 @@ function calculateRubricScore(result, rubrics) {
|
|
|
5833
6156
|
const verdict = failedRequired ? "fail" : scoreToVerdict(score);
|
|
5834
6157
|
return { score, verdict, hits, misses };
|
|
5835
6158
|
}
|
|
5836
|
-
|
|
5837
|
-
|
|
5838
|
-
|
|
5839
|
-
|
|
5840
|
-
|
|
5841
|
-
|
|
5842
|
-
|
|
5843
|
-
|
|
5844
|
-
|
|
5845
|
-
}
|
|
5846
|
-
function formatStderr(stderr) {
|
|
5847
|
-
const trimmed = stderr.trim();
|
|
5848
|
-
const maxLength = 2e3;
|
|
5849
|
-
if (trimmed.length <= maxLength) {
|
|
5850
|
-
return trimmed;
|
|
5851
|
-
}
|
|
5852
|
-
const tail = trimmed.slice(-maxLength);
|
|
5853
|
-
return `...(truncated, last ${maxLength} chars)
|
|
5854
|
-
${tail}`;
|
|
5855
|
-
}
|
|
5856
|
-
function parseJsonSafe(payload) {
|
|
5857
|
-
try {
|
|
5858
|
-
return JSON.parse(payload);
|
|
5859
|
-
} catch {
|
|
5860
|
-
return void 0;
|
|
5861
|
-
}
|
|
5862
|
-
}
|
|
5863
|
-
function substituteVariables(template, variables) {
|
|
5864
|
-
return template.replace(/\{\{\s*([a-zA-Z0-9_]+)\s*\}\}/g, (match, varName) => {
|
|
5865
|
-
return variables[varName] ?? match;
|
|
5866
|
-
});
|
|
5867
|
-
}
|
|
5868
|
-
function deepEqual(a, b) {
|
|
5869
|
-
if (a === b) return true;
|
|
5870
|
-
if (a === null || b === null) return a === b;
|
|
5871
|
-
if (typeof a !== typeof b) return false;
|
|
5872
|
-
if (typeof a !== "object") return a === b;
|
|
5873
|
-
if (Array.isArray(a) !== Array.isArray(b)) return false;
|
|
5874
|
-
if (Array.isArray(a) && Array.isArray(b)) {
|
|
5875
|
-
if (a.length !== b.length) return false;
|
|
5876
|
-
return a.every((val, i) => deepEqual(val, b[i]));
|
|
5877
|
-
}
|
|
5878
|
-
const aObj = a;
|
|
5879
|
-
const bObj = b;
|
|
5880
|
-
const aKeys = Object.keys(aObj);
|
|
5881
|
-
const bKeys = Object.keys(bObj);
|
|
5882
|
-
if (aKeys.length !== bKeys.length) return false;
|
|
5883
|
-
return aKeys.every((key) => Object.hasOwn(bObj, key) && deepEqual(aObj[key], bObj[key]));
|
|
5884
|
-
}
|
|
5885
|
-
function argsMatch(expected, actual) {
|
|
5886
|
-
if (expected === void 0) return true;
|
|
5887
|
-
if (expected === "any") return true;
|
|
5888
|
-
if (actual === void 0) return false;
|
|
5889
|
-
for (const key of Object.keys(expected)) {
|
|
5890
|
-
if (!Object.hasOwn(actual, key)) return false;
|
|
5891
|
-
if (!deepEqual(expected[key], actual[key])) return false;
|
|
5892
|
-
}
|
|
5893
|
-
return true;
|
|
5894
|
-
}
|
|
5895
|
-
var ToolTrajectoryEvaluator = class {
|
|
5896
|
-
kind = "tool_trajectory";
|
|
6159
|
+
|
|
6160
|
+
// src/evaluation/evaluators/composite.ts
|
|
6161
|
+
var DEFAULT_COMPOSITE_AGGREGATOR_PROMPT = `Review the following evaluation results:
|
|
6162
|
+
{{EVALUATOR_RESULTS_JSON}}
|
|
6163
|
+
|
|
6164
|
+
Decide the final score and verdict based on all evaluator results.
|
|
6165
|
+
Return a JSON object with: score (0.0-1.0), verdict (pass/fail/borderline), and reasoning.`;
|
|
6166
|
+
var CompositeEvaluator = class {
|
|
6167
|
+
kind = "composite";
|
|
5897
6168
|
config;
|
|
6169
|
+
evaluatorFactory;
|
|
6170
|
+
cwd;
|
|
5898
6171
|
constructor(options) {
|
|
5899
6172
|
this.config = options.config;
|
|
6173
|
+
this.evaluatorFactory = options.evaluatorFactory;
|
|
6174
|
+
this.cwd = options.cwd;
|
|
5900
6175
|
}
|
|
5901
|
-
evaluate(context) {
|
|
5902
|
-
const
|
|
5903
|
-
|
|
5904
|
-
|
|
5905
|
-
return {
|
|
5906
|
-
score: 0,
|
|
5907
|
-
verdict: "fail",
|
|
5908
|
-
hits: [],
|
|
5909
|
-
misses: ["No trace available for evaluation"],
|
|
5910
|
-
expectedAspectCount: 1
|
|
5911
|
-
};
|
|
5912
|
-
}
|
|
5913
|
-
const summary = toolCalls.length > 0 ? this.buildSummary(toolCalls) : traceSummary;
|
|
5914
|
-
if (!summary) {
|
|
5915
|
-
return {
|
|
5916
|
-
score: 0,
|
|
5917
|
-
verdict: "fail",
|
|
5918
|
-
hits: [],
|
|
5919
|
-
misses: ["No trace available for evaluation"],
|
|
5920
|
-
expectedAspectCount: 1
|
|
5921
|
-
};
|
|
5922
|
-
}
|
|
5923
|
-
switch (this.config.mode) {
|
|
5924
|
-
case "any_order":
|
|
5925
|
-
return this.evaluateAnyOrder(summary);
|
|
5926
|
-
case "in_order":
|
|
5927
|
-
return this.evaluateInOrder(toolCalls);
|
|
5928
|
-
case "exact":
|
|
5929
|
-
return this.evaluateExact(toolCalls);
|
|
5930
|
-
default:
|
|
6176
|
+
async evaluate(context) {
|
|
6177
|
+
const memberResults = await Promise.all(
|
|
6178
|
+
this.config.evaluators.map(async (memberConfig) => {
|
|
6179
|
+
const evaluator = this.evaluatorFactory.create(memberConfig, context);
|
|
5931
6180
|
return {
|
|
5932
|
-
|
|
5933
|
-
|
|
5934
|
-
|
|
5935
|
-
misses: [`Unknown mode: ${this.config.mode}`],
|
|
5936
|
-
expectedAspectCount: 1
|
|
6181
|
+
id: memberConfig.name,
|
|
6182
|
+
type: memberConfig.type,
|
|
6183
|
+
result: await evaluator.evaluate(context)
|
|
5937
6184
|
};
|
|
5938
|
-
|
|
5939
|
-
|
|
5940
|
-
|
|
5941
|
-
* Extract tool calls from output messages.
|
|
5942
|
-
*/
|
|
5943
|
-
extractToolCallsFromMessages(messages) {
|
|
5944
|
-
if (!messages) {
|
|
5945
|
-
return [];
|
|
5946
|
-
}
|
|
5947
|
-
const toolCalls = [];
|
|
5948
|
-
for (const message of messages) {
|
|
5949
|
-
if (message.toolCalls) {
|
|
5950
|
-
for (const call of message.toolCalls) {
|
|
5951
|
-
toolCalls.push({
|
|
5952
|
-
name: call.tool,
|
|
5953
|
-
args: call.input
|
|
5954
|
-
});
|
|
5955
|
-
}
|
|
5956
|
-
}
|
|
5957
|
-
}
|
|
5958
|
-
return toolCalls;
|
|
6185
|
+
})
|
|
6186
|
+
);
|
|
6187
|
+
return this.aggregate(memberResults, context);
|
|
5959
6188
|
}
|
|
5960
|
-
|
|
5961
|
-
|
|
5962
|
-
|
|
5963
|
-
|
|
5964
|
-
|
|
5965
|
-
|
|
5966
|
-
|
|
6189
|
+
async aggregate(results, context) {
|
|
6190
|
+
const aggregator = this.config.aggregator;
|
|
6191
|
+
switch (aggregator.type) {
|
|
6192
|
+
case "code_judge":
|
|
6193
|
+
return this.runCodeAggregator(results, aggregator.path, aggregator.cwd ?? this.cwd);
|
|
6194
|
+
case "llm_judge":
|
|
6195
|
+
return this.runLlmAggregator(results, context, aggregator);
|
|
6196
|
+
default:
|
|
6197
|
+
return this.runWeightedAverage(results, aggregator.weights);
|
|
5967
6198
|
}
|
|
5968
|
-
const toolNames = Object.keys(toolCallsByName).sort();
|
|
5969
|
-
return {
|
|
5970
|
-
eventCount: toolCalls.length,
|
|
5971
|
-
toolNames,
|
|
5972
|
-
toolCallsByName,
|
|
5973
|
-
errorCount: 0
|
|
5974
|
-
};
|
|
5975
6199
|
}
|
|
5976
|
-
|
|
5977
|
-
|
|
5978
|
-
|
|
5979
|
-
|
|
5980
|
-
|
|
5981
|
-
|
|
5982
|
-
|
|
5983
|
-
|
|
5984
|
-
|
|
5985
|
-
|
|
5986
|
-
|
|
5987
|
-
|
|
5988
|
-
|
|
5989
|
-
|
|
5990
|
-
|
|
5991
|
-
const required = minimums[toolName];
|
|
5992
|
-
const actual = summary.toolCallsByName[toolName] ?? 0;
|
|
5993
|
-
if (actual >= required) {
|
|
5994
|
-
hits.push(`${toolName}: called ${actual} times (required \u2265${required})`);
|
|
5995
|
-
} else {
|
|
5996
|
-
misses.push(`${toolName}: called ${actual} times (required \u2265${required})`);
|
|
6200
|
+
runWeightedAverage(results, weights) {
|
|
6201
|
+
let totalWeight = 0;
|
|
6202
|
+
let weightedSum = 0;
|
|
6203
|
+
const allHits = [];
|
|
6204
|
+
const allMisses = [];
|
|
6205
|
+
const reasoningParts = [];
|
|
6206
|
+
const evaluatorResults = [];
|
|
6207
|
+
for (const member of results) {
|
|
6208
|
+
const weight = weights?.[member.id] ?? 1;
|
|
6209
|
+
totalWeight += weight;
|
|
6210
|
+
weightedSum += member.result.score * weight;
|
|
6211
|
+
allHits.push(...member.result.hits.map((h) => `[${member.id}] ${h}`));
|
|
6212
|
+
allMisses.push(...member.result.misses.map((m) => `[${member.id}] ${m}`));
|
|
6213
|
+
if (member.result.reasoning) {
|
|
6214
|
+
reasoningParts.push(`${member.id}: ${member.result.reasoning}`);
|
|
5997
6215
|
}
|
|
6216
|
+
evaluatorResults.push({
|
|
6217
|
+
name: member.id,
|
|
6218
|
+
type: member.type,
|
|
6219
|
+
score: member.result.score,
|
|
6220
|
+
weight,
|
|
6221
|
+
verdict: member.result.verdict,
|
|
6222
|
+
hits: [...member.result.hits],
|
|
6223
|
+
misses: [...member.result.misses],
|
|
6224
|
+
reasoning: member.result.reasoning,
|
|
6225
|
+
evaluatorRawRequest: member.result.evaluatorRawRequest,
|
|
6226
|
+
evaluatorResults: member.result.evaluatorResults,
|
|
6227
|
+
details: member.result.details
|
|
6228
|
+
});
|
|
5998
6229
|
}
|
|
5999
|
-
const
|
|
6230
|
+
const finalScore = totalWeight > 0 ? weightedSum / totalWeight : 0;
|
|
6000
6231
|
return {
|
|
6001
|
-
score,
|
|
6002
|
-
verdict: scoreToVerdict(
|
|
6003
|
-
hits,
|
|
6004
|
-
misses,
|
|
6005
|
-
expectedAspectCount:
|
|
6232
|
+
score: clampScore(finalScore),
|
|
6233
|
+
verdict: scoreToVerdict(finalScore),
|
|
6234
|
+
hits: allHits,
|
|
6235
|
+
misses: allMisses,
|
|
6236
|
+
expectedAspectCount: Math.max(allHits.length + allMisses.length, 1),
|
|
6237
|
+
reasoning: reasoningParts.length > 0 ? reasoningParts.join("; ") : void 0,
|
|
6238
|
+
evaluatorRawRequest: {
|
|
6239
|
+
aggregator: "weighted_average",
|
|
6240
|
+
...weights ? { weights } : {}
|
|
6241
|
+
},
|
|
6242
|
+
evaluatorResults
|
|
6006
6243
|
};
|
|
6007
6244
|
}
|
|
6008
|
-
|
|
6009
|
-
const
|
|
6010
|
-
|
|
6245
|
+
async runCodeAggregator(results, scriptPath, cwd, weights) {
|
|
6246
|
+
const resultsObject = Object.fromEntries(results.map((r) => [r.id, r.result]));
|
|
6247
|
+
const inputPayload = JSON.stringify({ results: resultsObject }, null, 2);
|
|
6248
|
+
const evaluatorResults = results.map((member) => ({
|
|
6249
|
+
name: member.id,
|
|
6250
|
+
type: member.type,
|
|
6251
|
+
score: member.result.score,
|
|
6252
|
+
weight: weights?.[member.id] ?? 1,
|
|
6253
|
+
verdict: member.result.verdict,
|
|
6254
|
+
hits: [...member.result.hits],
|
|
6255
|
+
misses: [...member.result.misses],
|
|
6256
|
+
reasoning: member.result.reasoning,
|
|
6257
|
+
evaluatorRawRequest: member.result.evaluatorRawRequest,
|
|
6258
|
+
evaluatorResults: member.result.evaluatorResults,
|
|
6259
|
+
details: member.result.details
|
|
6260
|
+
}));
|
|
6261
|
+
try {
|
|
6262
|
+
const stdout = await executeScript(scriptPath, inputPayload, void 0, cwd);
|
|
6263
|
+
const parsed = parseJsonSafe(stdout);
|
|
6264
|
+
const score = clampScore(typeof parsed?.score === "number" ? parsed.score : 0);
|
|
6265
|
+
const hits = Array.isArray(parsed?.hits) ? parsed.hits.filter(isNonEmptyString) : [];
|
|
6266
|
+
const misses = Array.isArray(parsed?.misses) ? parsed.misses.filter(isNonEmptyString) : [];
|
|
6267
|
+
const reasoning = typeof parsed?.reasoning === "string" ? parsed.reasoning : void 0;
|
|
6268
|
+
const verdict = typeof parsed?.verdict === "string" && (parsed.verdict === "pass" || parsed.verdict === "fail" || parsed.verdict === "borderline") ? parsed.verdict : scoreToVerdict(score);
|
|
6011
6269
|
return {
|
|
6012
|
-
score
|
|
6013
|
-
verdict
|
|
6014
|
-
hits
|
|
6015
|
-
misses
|
|
6016
|
-
expectedAspectCount:
|
|
6270
|
+
score,
|
|
6271
|
+
verdict,
|
|
6272
|
+
hits,
|
|
6273
|
+
misses,
|
|
6274
|
+
expectedAspectCount: hits.length + misses.length || 1,
|
|
6275
|
+
reasoning,
|
|
6276
|
+
evaluatorRawRequest: {
|
|
6277
|
+
aggregator: "code_judge",
|
|
6278
|
+
script: scriptPath
|
|
6279
|
+
},
|
|
6280
|
+
evaluatorResults
|
|
6281
|
+
};
|
|
6282
|
+
} catch (error) {
|
|
6283
|
+
const message = error instanceof Error ? error.message : String(error);
|
|
6284
|
+
return {
|
|
6285
|
+
score: 0,
|
|
6286
|
+
verdict: "fail",
|
|
6287
|
+
hits: [],
|
|
6288
|
+
misses: [`Code aggregator failed: ${message}`],
|
|
6289
|
+
expectedAspectCount: 1,
|
|
6290
|
+
reasoning: message,
|
|
6291
|
+
evaluatorRawRequest: {
|
|
6292
|
+
aggregator: "code_judge",
|
|
6293
|
+
script: scriptPath,
|
|
6294
|
+
error: message
|
|
6295
|
+
},
|
|
6296
|
+
evaluatorResults
|
|
6017
6297
|
};
|
|
6018
6298
|
}
|
|
6019
|
-
const hits = [];
|
|
6020
|
-
const misses = [];
|
|
6021
|
-
let actualIndex = 0;
|
|
6022
|
-
for (let i = 0; i < expected.length; i++) {
|
|
6023
|
-
const expectedItem = expected[i];
|
|
6024
|
-
const expectedTool = expectedItem.tool;
|
|
6025
|
-
let found = false;
|
|
6026
|
-
let argsMismatch = false;
|
|
6027
|
-
while (actualIndex < toolCalls.length) {
|
|
6028
|
-
const actualCall = toolCalls[actualIndex];
|
|
6029
|
-
if (actualCall.name === expectedTool) {
|
|
6030
|
-
if (argsMatch(expectedItem.args, actualCall.args)) {
|
|
6031
|
-
hits.push(`Found ${expectedTool} at position ${actualIndex}`);
|
|
6032
|
-
actualIndex++;
|
|
6033
|
-
found = true;
|
|
6034
|
-
break;
|
|
6035
|
-
}
|
|
6036
|
-
misses.push(
|
|
6037
|
-
`Expected ${expectedTool} at position ${i}: tool found at ${actualIndex} but args mismatch`
|
|
6038
|
-
);
|
|
6039
|
-
actualIndex++;
|
|
6040
|
-
argsMismatch = true;
|
|
6041
|
-
break;
|
|
6042
|
-
}
|
|
6043
|
-
actualIndex++;
|
|
6044
|
-
}
|
|
6045
|
-
if (!found && !argsMismatch) {
|
|
6046
|
-
misses.push(`Expected ${expectedTool} at position ${i}, not found in remaining trace`);
|
|
6047
|
-
}
|
|
6048
|
-
}
|
|
6049
|
-
const score = hits.length / expected.length;
|
|
6050
|
-
return {
|
|
6051
|
-
score,
|
|
6052
|
-
verdict: scoreToVerdict(score),
|
|
6053
|
-
hits,
|
|
6054
|
-
misses,
|
|
6055
|
-
expectedAspectCount: expected.length
|
|
6056
|
-
};
|
|
6057
6299
|
}
|
|
6058
|
-
|
|
6059
|
-
const
|
|
6060
|
-
if (
|
|
6300
|
+
async runLlmAggregator(results, context, config) {
|
|
6301
|
+
const judgeProvider = context.judgeProvider;
|
|
6302
|
+
if (!judgeProvider) {
|
|
6303
|
+
throw new Error("No judge provider available for LLM aggregation");
|
|
6304
|
+
}
|
|
6305
|
+
const resultsObject = Object.fromEntries(results.map((r) => [r.id, r.result]));
|
|
6306
|
+
const resultsJson = JSON.stringify(resultsObject, null, 2);
|
|
6307
|
+
const evaluatorResults = results.map((member) => ({
|
|
6308
|
+
name: member.id,
|
|
6309
|
+
type: member.type,
|
|
6310
|
+
score: member.result.score,
|
|
6311
|
+
verdict: member.result.verdict,
|
|
6312
|
+
hits: [...member.result.hits],
|
|
6313
|
+
misses: [...member.result.misses],
|
|
6314
|
+
reasoning: member.result.reasoning,
|
|
6315
|
+
evaluatorRawRequest: member.result.evaluatorRawRequest,
|
|
6316
|
+
evaluatorResults: member.result.evaluatorResults,
|
|
6317
|
+
details: member.result.details
|
|
6318
|
+
}));
|
|
6319
|
+
const promptTemplate = config.prompt ?? DEFAULT_COMPOSITE_AGGREGATOR_PROMPT;
|
|
6320
|
+
const userPrompt = promptTemplate.replace(/\{\{EVALUATOR_RESULTS_JSON\}\}/g, resultsJson);
|
|
6321
|
+
const systemPrompt = buildOutputSchema();
|
|
6322
|
+
const evaluatorRawRequest = {
|
|
6323
|
+
aggregator: "llm_judge",
|
|
6324
|
+
userPrompt,
|
|
6325
|
+
systemPrompt,
|
|
6326
|
+
target: judgeProvider.targetName
|
|
6327
|
+
};
|
|
6328
|
+
try {
|
|
6329
|
+
const model = judgeProvider.asLanguageModel?.();
|
|
6330
|
+
if (model) {
|
|
6331
|
+
const { text } = await generateText3({
|
|
6332
|
+
model,
|
|
6333
|
+
system: systemPrompt,
|
|
6334
|
+
prompt: userPrompt
|
|
6335
|
+
});
|
|
6336
|
+
const data2 = freeformEvaluationSchema.parse(parseJsonFromText(text));
|
|
6337
|
+
const score2 = clampScore(data2.score);
|
|
6338
|
+
const hits2 = Array.isArray(data2.hits) ? data2.hits.filter(isNonEmptyString).slice(0, 4) : [];
|
|
6339
|
+
const misses2 = Array.isArray(data2.misses) ? data2.misses.filter(isNonEmptyString).slice(0, 4) : [];
|
|
6340
|
+
const reasoning2 = data2.reasoning;
|
|
6341
|
+
return {
|
|
6342
|
+
score: score2,
|
|
6343
|
+
verdict: scoreToVerdict(score2),
|
|
6344
|
+
hits: hits2,
|
|
6345
|
+
misses: misses2,
|
|
6346
|
+
expectedAspectCount: Math.max(hits2.length + misses2.length, 1),
|
|
6347
|
+
reasoning: reasoning2,
|
|
6348
|
+
evaluatorRawRequest,
|
|
6349
|
+
evaluatorResults
|
|
6350
|
+
};
|
|
6351
|
+
}
|
|
6352
|
+
const response = await judgeProvider.invoke({
|
|
6353
|
+
question: userPrompt,
|
|
6354
|
+
systemPrompt,
|
|
6355
|
+
evalCaseId: context.evalCase.id,
|
|
6356
|
+
attempt: context.attempt
|
|
6357
|
+
});
|
|
6358
|
+
const data = freeformEvaluationSchema.parse(
|
|
6359
|
+
parseJsonFromText(extractLastAssistantContent(response.outputMessages))
|
|
6360
|
+
);
|
|
6361
|
+
const score = clampScore(data.score);
|
|
6362
|
+
const hits = Array.isArray(data.hits) ? data.hits.filter(isNonEmptyString).slice(0, 4) : [];
|
|
6363
|
+
const misses = Array.isArray(data.misses) ? data.misses.filter(isNonEmptyString).slice(0, 4) : [];
|
|
6364
|
+
const reasoning = data.reasoning;
|
|
6061
6365
|
return {
|
|
6062
|
-
score
|
|
6063
|
-
verdict:
|
|
6064
|
-
hits
|
|
6366
|
+
score,
|
|
6367
|
+
verdict: scoreToVerdict(score),
|
|
6368
|
+
hits,
|
|
6369
|
+
misses,
|
|
6370
|
+
expectedAspectCount: Math.max(hits.length + misses.length, 1),
|
|
6371
|
+
reasoning,
|
|
6372
|
+
evaluatorRawRequest,
|
|
6373
|
+
evaluatorResults
|
|
6374
|
+
};
|
|
6375
|
+
} catch {
|
|
6376
|
+
return {
|
|
6377
|
+
score: 0,
|
|
6378
|
+
verdict: "fail",
|
|
6379
|
+
hits: [],
|
|
6065
6380
|
misses: [],
|
|
6066
|
-
expectedAspectCount:
|
|
6381
|
+
expectedAspectCount: 1,
|
|
6382
|
+
evaluatorRawRequest,
|
|
6383
|
+
evaluatorResults
|
|
6067
6384
|
};
|
|
6068
6385
|
}
|
|
6069
|
-
|
|
6070
|
-
|
|
6071
|
-
|
|
6072
|
-
|
|
6073
|
-
|
|
6074
|
-
|
|
6075
|
-
|
|
6076
|
-
|
|
6077
|
-
|
|
6078
|
-
|
|
6079
|
-
|
|
6080
|
-
|
|
6081
|
-
|
|
6082
|
-
|
|
6083
|
-
|
|
6084
|
-
|
|
6386
|
+
}
|
|
6387
|
+
};
|
|
6388
|
+
|
|
6389
|
+
// src/evaluation/evaluators/cost.ts
|
|
6390
|
+
var CostEvaluator = class {
|
|
6391
|
+
kind = "cost";
|
|
6392
|
+
config;
|
|
6393
|
+
constructor(options) {
|
|
6394
|
+
this.config = options.config;
|
|
6395
|
+
}
|
|
6396
|
+
evaluate(context) {
|
|
6397
|
+
const { budget } = this.config;
|
|
6398
|
+
const costUsd = context.traceSummary?.costUsd;
|
|
6399
|
+
if (costUsd === void 0) {
|
|
6400
|
+
return {
|
|
6401
|
+
score: 0,
|
|
6402
|
+
verdict: "fail",
|
|
6403
|
+
hits: [],
|
|
6404
|
+
misses: ["No cost data available in trace"],
|
|
6405
|
+
expectedAspectCount: 1,
|
|
6406
|
+
reasoning: "Execution cost not reported by provider",
|
|
6407
|
+
evaluatorRawRequest: {
|
|
6408
|
+
type: "cost",
|
|
6409
|
+
budget,
|
|
6410
|
+
costUsd: null
|
|
6085
6411
|
}
|
|
6086
|
-
}
|
|
6087
|
-
misses.push(`Position ${i}: expected ${expectedTool}, got ${actualTool}`);
|
|
6088
|
-
}
|
|
6089
|
-
}
|
|
6090
|
-
for (let i = checkLength; i < expected.length; i++) {
|
|
6091
|
-
misses.push(`Position ${i}: expected ${expected[i].tool}, got nothing`);
|
|
6412
|
+
};
|
|
6092
6413
|
}
|
|
6093
|
-
const
|
|
6414
|
+
const passed = costUsd <= budget;
|
|
6415
|
+
const score = passed ? 1 : 0;
|
|
6416
|
+
const formatCost = (n) => `$${n.toFixed(4)}`;
|
|
6094
6417
|
return {
|
|
6095
6418
|
score,
|
|
6096
|
-
verdict:
|
|
6097
|
-
hits,
|
|
6098
|
-
misses,
|
|
6099
|
-
expectedAspectCount:
|
|
6419
|
+
verdict: passed ? "pass" : "fail",
|
|
6420
|
+
hits: passed ? [`Cost ${formatCost(costUsd)} <= ${formatCost(budget)} budget`] : [],
|
|
6421
|
+
misses: passed ? [] : [`Cost ${formatCost(costUsd)} > ${formatCost(budget)} budget`],
|
|
6422
|
+
expectedAspectCount: 1,
|
|
6423
|
+
reasoning: `Execution cost ${formatCost(costUsd)} (budget: ${formatCost(budget)})`,
|
|
6424
|
+
evaluatorRawRequest: {
|
|
6425
|
+
type: "cost",
|
|
6426
|
+
budget,
|
|
6427
|
+
costUsd
|
|
6428
|
+
}
|
|
6100
6429
|
};
|
|
6101
6430
|
}
|
|
6102
6431
|
};
|
|
6432
|
+
|
|
6433
|
+
// src/evaluation/evaluators/field-accuracy.ts
|
|
6103
6434
|
var DEFAULT_DATE_FORMATS = [
|
|
6104
6435
|
"YYYY-MM-DDTHH:mm:ssZ",
|
|
6105
6436
|
// ISO with timezone
|
|
@@ -6312,434 +6643,209 @@ var FieldAccuracyEvaluator = class {
|
|
|
6312
6643
|
}
|
|
6313
6644
|
if (!Number.isFinite(candidateNum) || !Number.isFinite(expectedNum)) {
|
|
6314
6645
|
return {
|
|
6315
|
-
path: path15,
|
|
6316
|
-
score: 0,
|
|
6317
|
-
weight,
|
|
6318
|
-
hit: false,
|
|
6319
|
-
message: `${path15} (invalid numeric value)`
|
|
6320
|
-
};
|
|
6321
|
-
}
|
|
6322
|
-
const diff = Math.abs(candidateNum - expectedNum);
|
|
6323
|
-
let withinTolerance;
|
|
6324
|
-
if (relative) {
|
|
6325
|
-
const relativeDiff = expectedNum === 0 ? diff : diff / Math.abs(expectedNum);
|
|
6326
|
-
withinTolerance = relativeDiff <= tolerance;
|
|
6327
|
-
} else {
|
|
6328
|
-
withinTolerance = diff <= tolerance;
|
|
6329
|
-
}
|
|
6330
|
-
if (withinTolerance) {
|
|
6331
|
-
return {
|
|
6332
|
-
path: path15,
|
|
6333
|
-
score: 1,
|
|
6334
|
-
weight,
|
|
6335
|
-
hit: true,
|
|
6336
|
-
message: `${path15} (within tolerance: diff=${diff.toFixed(2)})`
|
|
6337
|
-
};
|
|
6338
|
-
}
|
|
6339
|
-
return {
|
|
6340
|
-
path: path15,
|
|
6341
|
-
score: 0,
|
|
6342
|
-
weight,
|
|
6343
|
-
hit: false,
|
|
6344
|
-
message: `${path15} (outside tolerance: diff=${diff.toFixed(2)}, tolerance=${tolerance})`
|
|
6345
|
-
};
|
|
6346
|
-
}
|
|
6347
|
-
/**
|
|
6348
|
-
* Date comparison with format normalization.
|
|
6349
|
-
*/
|
|
6350
|
-
compareDate(path15, candidateValue, expectedValue, fieldConfig, weight) {
|
|
6351
|
-
const formats = fieldConfig.formats ?? DEFAULT_DATE_FORMATS;
|
|
6352
|
-
const candidateDate = parseDate(String(candidateValue), formats);
|
|
6353
|
-
const expectedDate = parseDate(String(expectedValue), formats);
|
|
6354
|
-
if (candidateDate === null) {
|
|
6355
|
-
return {
|
|
6356
|
-
path: path15,
|
|
6357
|
-
score: 0,
|
|
6358
|
-
weight,
|
|
6359
|
-
hit: false,
|
|
6360
|
-
message: `${path15} (unparseable candidate date)`
|
|
6361
|
-
};
|
|
6362
|
-
}
|
|
6363
|
-
if (expectedDate === null) {
|
|
6364
|
-
return {
|
|
6365
|
-
path: path15,
|
|
6366
|
-
score: 0,
|
|
6367
|
-
weight,
|
|
6368
|
-
hit: false,
|
|
6369
|
-
message: `${path15} (unparseable expected date)`
|
|
6370
|
-
};
|
|
6371
|
-
}
|
|
6372
|
-
if (candidateDate.getFullYear() === expectedDate.getFullYear() && candidateDate.getMonth() === expectedDate.getMonth() && candidateDate.getDate() === expectedDate.getDate()) {
|
|
6373
|
-
return {
|
|
6374
|
-
path: path15,
|
|
6375
|
-
score: 1,
|
|
6376
|
-
weight,
|
|
6377
|
-
hit: true,
|
|
6378
|
-
message: path15
|
|
6379
|
-
};
|
|
6380
|
-
}
|
|
6381
|
-
return {
|
|
6382
|
-
path: path15,
|
|
6383
|
-
score: 0,
|
|
6384
|
-
weight,
|
|
6385
|
-
hit: false,
|
|
6386
|
-
message: `${path15} (date mismatch: got ${formatDateISO(candidateDate)}, expected ${formatDateISO(expectedDate)})`
|
|
6387
|
-
};
|
|
6388
|
-
}
|
|
6389
|
-
/**
|
|
6390
|
-
* Aggregate field results using configured strategy.
|
|
6391
|
-
*/
|
|
6392
|
-
aggregateResults(results) {
|
|
6393
|
-
const aggregation = this.config.aggregation ?? "weighted_average";
|
|
6394
|
-
const hits = [];
|
|
6395
|
-
const misses = [];
|
|
6396
|
-
for (const result of results) {
|
|
6397
|
-
if (result.hit) {
|
|
6398
|
-
hits.push(result.message);
|
|
6399
|
-
} else {
|
|
6400
|
-
misses.push(result.message);
|
|
6401
|
-
}
|
|
6402
|
-
}
|
|
6403
|
-
let score;
|
|
6404
|
-
if (aggregation === "all_or_nothing") {
|
|
6405
|
-
score = misses.length === 0 ? 1 : 0;
|
|
6406
|
-
} else {
|
|
6407
|
-
const totalWeight = results.reduce((sum, r) => sum + r.weight, 0);
|
|
6408
|
-
if (totalWeight === 0) {
|
|
6409
|
-
score = results.length === 0 ? 1 : 0;
|
|
6410
|
-
} else {
|
|
6411
|
-
const weightedSum = results.reduce((sum, r) => sum + r.score * r.weight, 0);
|
|
6412
|
-
score = weightedSum / totalWeight;
|
|
6413
|
-
}
|
|
6414
|
-
}
|
|
6415
|
-
const reasoning = `${hits.length}/${results.length} fields matched`;
|
|
6416
|
-
return {
|
|
6417
|
-
score: clampScore(score),
|
|
6418
|
-
verdict: scoreToVerdict(score),
|
|
6419
|
-
hits: hits.slice(0, 4),
|
|
6420
|
-
misses: misses.slice(0, 4),
|
|
6421
|
-
expectedAspectCount: results.length,
|
|
6422
|
-
reasoning
|
|
6423
|
-
};
|
|
6424
|
-
}
|
|
6425
|
-
};
|
|
6426
|
-
/**
 * Resolve a dotted / bracketed path such as "invoice.items[0].total".
 *
 * Only own properties are followed, so inherited members such as
 * `constructor` or `toString` read as "missing" instead of leaking
 * Object.prototype values into field comparisons.
 *
 * @param {unknown} obj - Root value to walk; any falsy root yields undefined.
 * @param {string} path15 - Path using "." and "[n]" separators.
 * @returns {unknown} The value at the path, or undefined when any segment is
 *   missing or a non-object is reached before the path ends.
 */
function resolvePath(obj, path15) {
  if (!path15 || !obj) {
    return void 0;
  }
  const parts = path15.split(/\.|\[|\]/).filter((p) => p.length > 0);
  let current = obj;
  for (const part of parts) {
    if (current === null || typeof current !== "object") {
      return void 0;
    }
    // Numeric segments on arrays are normalized so "[01]" still means index 1.
    const key = /^\d+$/.test(part) && Array.isArray(current) ? Number.parseInt(part, 10) : part;
    if (!Object.hasOwn(current, key)) {
      return void 0;
    }
    current = current[key];
  }
  return current;
}
|
|
6448
|
-
/**
 * Coerce a value to a number for numeric comparison.
 *
 * Numbers pass through, strings are read with Number.parseFloat (so a numeric
 * prefix such as "12abc" still yields 12), and everything else is rejected.
 * NaN is never returned: a NaN number is reported as null exactly like an
 * unparseable string, so callers only have to check for null.
 *
 * @param {unknown} value
 * @returns {number | null}
 */
function toNumber(value) {
  if (typeof value === "number") {
    return Number.isNaN(value) ? null : value;
  }
  if (typeof value === "string") {
    const num = Number.parseFloat(value);
    return Number.isNaN(num) ? null : num;
  }
  return null;
}
|
|
6458
|
-
/**
 * Parse a date string into a Date whose local calendar fields carry the day.
 *
 * Order of attempts:
 *  1. `YYYY-MM-DD` is built as a local date. The built-in parser treats this
 *     form as UTC midnight, which shifts the local day west of UTC.
 *  2. `N/N/YYYY` or `N-N-YYYY` is resolved with `formats`: "MM/DD"/"MM-DD"
 *     entries select month-first, "DD/MM"/"DD-MM" entries select day-first.
 *     When both or neither are present, a first number above 12 means
 *     day-first, otherwise month-first. This runs before the generic parser
 *     so the configured order is not overridden by it.
 *  3. Anything else goes to `new Date(string)`.
 *  4. `D-Mon-YYYY` is looked up in MONTH_NAMES as a last resort.
 *
 * Impossible calendar days (31/02, month 0, day 0) return null instead of
 * rolling over into a neighbouring month.
 *
 * @param {string} dateStr
 * @param {string[]} [formats] - Format hints such as "DD/MM/YYYY".
 * @returns {Date | null}
 */
function parseDate(dateStr, formats = []) {
  if (!dateStr) return null;
  const trimmed = dateStr.trim();

  // Build a local-midnight date and verify that the fields round-trip.
  // setFullYear is used so years below 100 are not remapped to 19xx.
  const localDay = (year, monthIndex, day) => {
    const date = new Date(2000, 0, 1);
    date.setFullYear(year, monthIndex, day);
    const valid = date.getFullYear() === year && date.getMonth() === monthIndex && date.getDate() === day;
    return valid ? date : null;
  };

  const isoMatch = trimmed.match(/^(\d{4})-(\d{2})-(\d{2})$/);
  if (isoMatch) {
    return localDay(
      Number.parseInt(isoMatch[1], 10),
      Number.parseInt(isoMatch[2], 10) - 1,
      Number.parseInt(isoMatch[3], 10)
    );
  }

  const usMatch = trimmed.match(/^(\d{1,2})[\/\-](\d{1,2})[\/\-](\d{4})$/);
  if (usMatch) {
    const first = Number.parseInt(usMatch[1], 10);
    const second = Number.parseInt(usMatch[2], 10);
    const year = Number.parseInt(usMatch[3], 10);
    const hasUSFormat = formats.some((f) => f.includes("MM/DD") || f.includes("MM-DD"));
    const hasEUFormat = formats.some((f) => f.includes("DD/MM") || f.includes("DD-MM"));
    // An unambiguous configuration decides the order; otherwise fall back to
    // the "first number cannot be a month" heuristic with month-first default.
    const dayFirst = hasUSFormat !== hasEUFormat ? hasEUFormat : first > 12;
    return dayFirst ? localDay(year, second - 1, first) : localDay(year, first - 1, second);
  }

  const generic = new Date(trimmed);
  if (!Number.isNaN(generic.getTime())) {
    return generic;
  }

  const localizedMatch = trimmed.match(/^(\d{1,2})-([A-Za-z]{3,9})-(\d{4})$/);
  if (localizedMatch) {
    const month = MONTH_NAMES[localizedMatch[2].toLowerCase()];
    if (month !== void 0) {
      return localDay(Number.parseInt(localizedMatch[3], 10), month, Number.parseInt(localizedMatch[1], 10));
    }
  }
  return null;
}

/**
 * Format a Date as `YYYY-MM-DD` using its local calendar fields, i.e. the
 * same fields date comparison uses. toISOString() would print the UTC day,
 * which can differ by one from the day that was actually compared.
 *
 * @param {Date} date
 * @returns {string}
 */
function formatDateISO(date) {
  const year = String(date.getFullYear()).padStart(4, "0");
  const month = String(date.getMonth() + 1).padStart(2, "0");
  const day = String(date.getDate()).padStart(2, "0");
  return `${year}-${month}-${day}`;
}
|
|
6513
|
-
/**
 * Parse JSON out of free-form model output.
 *
 * Markdown code fences are stripped first. If the remaining text contains a
 * `{ ... }` span (first "{" through last "}"), that span is parsed; otherwise
 * the whole cleaned text is parsed. Non-string input is treated as empty
 * text. JSON.parse errors propagate to the caller.
 *
 * @param {unknown} text
 * @returns {unknown} The parsed JSON value.
 */
function parseJsonFromTextSafe(text) {
  let cleaned = "";
  if (typeof text === "string") {
    cleaned = text.replace(/```json\n?|```/g, "").trim();
  }
  const braceSpan = /\{[\s\S]*\}/.exec(cleaned);
  return JSON.parse(braceSpan === null ? cleaned : braceSpan[0]);
}
|
|
6519
|
-
var DEFAULT_COMPOSITE_AGGREGATOR_PROMPT = `Review the following evaluation results:
|
|
6520
|
-
{{EVALUATOR_RESULTS_JSON}}
|
|
6521
|
-
|
|
6522
|
-
Decide the final score and verdict based on all evaluator results.
|
|
6523
|
-
Return a JSON object with: score (0.0-1.0), verdict (pass/fail/borderline), and reasoning.`;
|
|
6524
|
-
var CompositeEvaluator = class {
|
|
6525
|
-
kind = "composite";
|
|
6526
|
-
config;
|
|
6527
|
-
evaluatorFactory;
|
|
6528
|
-
cwd;
|
|
6529
|
-
constructor(options) {
|
|
6530
|
-
this.config = options.config;
|
|
6531
|
-
this.evaluatorFactory = options.evaluatorFactory;
|
|
6532
|
-
this.cwd = options.cwd;
|
|
6533
|
-
}
|
|
6534
|
-
async evaluate(context) {
|
|
6535
|
-
const memberResults = await Promise.all(
|
|
6536
|
-
this.config.evaluators.map(async (memberConfig) => {
|
|
6537
|
-
const evaluator = this.evaluatorFactory.create(memberConfig, context);
|
|
6538
|
-
return {
|
|
6539
|
-
id: memberConfig.name,
|
|
6540
|
-
type: memberConfig.type,
|
|
6541
|
-
result: await evaluator.evaluate(context)
|
|
6542
|
-
};
|
|
6543
|
-
})
|
|
6544
|
-
);
|
|
6545
|
-
return this.aggregate(memberResults, context);
|
|
6546
|
-
}
|
|
6547
|
-
async aggregate(results, context) {
|
|
6548
|
-
const aggregator = this.config.aggregator;
|
|
6549
|
-
switch (aggregator.type) {
|
|
6550
|
-
case "code_judge":
|
|
6551
|
-
return this.runCodeAggregator(results, aggregator.path, aggregator.cwd ?? this.cwd);
|
|
6552
|
-
case "llm_judge":
|
|
6553
|
-
return this.runLlmAggregator(results, context, aggregator);
|
|
6554
|
-
default:
|
|
6555
|
-
return this.runWeightedAverage(results, aggregator.weights);
|
|
6556
|
-
}
|
|
6557
|
-
}
|
|
6558
|
-
runWeightedAverage(results, weights) {
|
|
6559
|
-
let totalWeight = 0;
|
|
6560
|
-
let weightedSum = 0;
|
|
6561
|
-
const allHits = [];
|
|
6562
|
-
const allMisses = [];
|
|
6563
|
-
const reasoningParts = [];
|
|
6564
|
-
const evaluatorResults = [];
|
|
6565
|
-
for (const member of results) {
|
|
6566
|
-
const weight = weights?.[member.id] ?? 1;
|
|
6567
|
-
totalWeight += weight;
|
|
6568
|
-
weightedSum += member.result.score * weight;
|
|
6569
|
-
allHits.push(...member.result.hits.map((h) => `[${member.id}] ${h}`));
|
|
6570
|
-
allMisses.push(...member.result.misses.map((m) => `[${member.id}] ${m}`));
|
|
6571
|
-
if (member.result.reasoning) {
|
|
6572
|
-
reasoningParts.push(`${member.id}: ${member.result.reasoning}`);
|
|
6573
|
-
}
|
|
6574
|
-
evaluatorResults.push({
|
|
6575
|
-
name: member.id,
|
|
6576
|
-
type: member.type,
|
|
6577
|
-
score: member.result.score,
|
|
6578
|
-
weight,
|
|
6579
|
-
verdict: member.result.verdict,
|
|
6580
|
-
hits: [...member.result.hits],
|
|
6581
|
-
misses: [...member.result.misses],
|
|
6582
|
-
reasoning: member.result.reasoning,
|
|
6583
|
-
evaluatorRawRequest: member.result.evaluatorRawRequest,
|
|
6584
|
-
evaluatorResults: member.result.evaluatorResults
|
|
6585
|
-
});
|
|
6586
|
-
}
|
|
6587
|
-
const finalScore = totalWeight > 0 ? weightedSum / totalWeight : 0;
|
|
6588
|
-
return {
|
|
6589
|
-
score: clampScore(finalScore),
|
|
6590
|
-
verdict: scoreToVerdict(finalScore),
|
|
6591
|
-
hits: allHits,
|
|
6592
|
-
misses: allMisses,
|
|
6593
|
-
expectedAspectCount: Math.max(allHits.length + allMisses.length, 1),
|
|
6594
|
-
reasoning: reasoningParts.length > 0 ? reasoningParts.join("; ") : void 0,
|
|
6595
|
-
evaluatorRawRequest: {
|
|
6596
|
-
aggregator: "weighted_average",
|
|
6597
|
-
...weights ? { weights } : {}
|
|
6598
|
-
},
|
|
6599
|
-
evaluatorResults
|
|
6600
|
-
};
|
|
6601
|
-
}
|
|
6602
|
-
async runCodeAggregator(results, scriptPath, cwd, weights) {
|
|
6603
|
-
const resultsObject = Object.fromEntries(results.map((r) => [r.id, r.result]));
|
|
6604
|
-
const inputPayload = JSON.stringify({ results: resultsObject }, null, 2);
|
|
6605
|
-
const evaluatorResults = results.map((member) => ({
|
|
6606
|
-
name: member.id,
|
|
6607
|
-
type: member.type,
|
|
6608
|
-
score: member.result.score,
|
|
6609
|
-
weight: weights?.[member.id] ?? 1,
|
|
6610
|
-
verdict: member.result.verdict,
|
|
6611
|
-
hits: [...member.result.hits],
|
|
6612
|
-
misses: [...member.result.misses],
|
|
6613
|
-
reasoning: member.result.reasoning,
|
|
6614
|
-
evaluatorRawRequest: member.result.evaluatorRawRequest,
|
|
6615
|
-
evaluatorResults: member.result.evaluatorResults
|
|
6616
|
-
}));
|
|
6617
|
-
try {
|
|
6618
|
-
const stdout = await executeScript(scriptPath, inputPayload, void 0, cwd);
|
|
6619
|
-
const parsed = parseJsonSafe(stdout);
|
|
6620
|
-
const score = clampScore(typeof parsed?.score === "number" ? parsed.score : 0);
|
|
6621
|
-
const hits = Array.isArray(parsed?.hits) ? parsed.hits.filter(isNonEmptyString) : [];
|
|
6622
|
-
const misses = Array.isArray(parsed?.misses) ? parsed.misses.filter(isNonEmptyString) : [];
|
|
6623
|
-
const reasoning = typeof parsed?.reasoning === "string" ? parsed.reasoning : void 0;
|
|
6624
|
-
const verdict = typeof parsed?.verdict === "string" && (parsed.verdict === "pass" || parsed.verdict === "fail" || parsed.verdict === "borderline") ? parsed.verdict : scoreToVerdict(score);
|
|
6625
|
-
return {
|
|
6626
|
-
score,
|
|
6627
|
-
verdict,
|
|
6628
|
-
hits,
|
|
6629
|
-
misses,
|
|
6630
|
-
expectedAspectCount: hits.length + misses.length || 1,
|
|
6631
|
-
reasoning,
|
|
6632
|
-
evaluatorRawRequest: {
|
|
6633
|
-
aggregator: "code_judge",
|
|
6634
|
-
script: scriptPath
|
|
6635
|
-
},
|
|
6636
|
-
evaluatorResults
|
|
6637
|
-
};
|
|
6638
|
-
} catch (error) {
|
|
6639
|
-
const message = error instanceof Error ? error.message : String(error);
|
|
6640
|
-
return {
|
|
6641
|
-
score: 0,
|
|
6642
|
-
verdict: "fail",
|
|
6643
|
-
hits: [],
|
|
6644
|
-
misses: [`Code aggregator failed: ${message}`],
|
|
6645
|
-
expectedAspectCount: 1,
|
|
6646
|
-
reasoning: message,
|
|
6647
|
-
evaluatorRawRequest: {
|
|
6648
|
-
aggregator: "code_judge",
|
|
6649
|
-
script: scriptPath,
|
|
6650
|
-
error: message
|
|
6651
|
-
},
|
|
6652
|
-
evaluatorResults
|
|
6653
|
-
};
|
|
6654
|
-
}
|
|
6655
|
-
}
|
|
6656
|
-
async runLlmAggregator(results, context, config) {
|
|
6657
|
-
const judgeProvider = context.judgeProvider;
|
|
6658
|
-
if (!judgeProvider) {
|
|
6659
|
-
throw new Error("No judge provider available for LLM aggregation");
|
|
6660
|
-
}
|
|
6661
|
-
const resultsObject = Object.fromEntries(results.map((r) => [r.id, r.result]));
|
|
6662
|
-
const resultsJson = JSON.stringify(resultsObject, null, 2);
|
|
6663
|
-
const evaluatorResults = results.map((member) => ({
|
|
6664
|
-
name: member.id,
|
|
6665
|
-
type: member.type,
|
|
6666
|
-
score: member.result.score,
|
|
6667
|
-
verdict: member.result.verdict,
|
|
6668
|
-
hits: [...member.result.hits],
|
|
6669
|
-
misses: [...member.result.misses],
|
|
6670
|
-
reasoning: member.result.reasoning,
|
|
6671
|
-
evaluatorRawRequest: member.result.evaluatorRawRequest,
|
|
6672
|
-
evaluatorResults: member.result.evaluatorResults
|
|
6673
|
-
}));
|
|
6674
|
-
const promptTemplate = config.prompt ?? DEFAULT_COMPOSITE_AGGREGATOR_PROMPT;
|
|
6675
|
-
const userPrompt = promptTemplate.replace(/\{\{EVALUATOR_RESULTS_JSON\}\}/g, resultsJson);
|
|
6676
|
-
const systemPrompt = buildOutputSchema();
|
|
6677
|
-
const evaluatorRawRequest = {
|
|
6678
|
-
aggregator: "llm_judge",
|
|
6679
|
-
userPrompt,
|
|
6680
|
-
systemPrompt,
|
|
6681
|
-
target: judgeProvider.targetName
|
|
6682
|
-
};
|
|
6683
|
-
try {
|
|
6684
|
-
const model = judgeProvider.asLanguageModel?.();
|
|
6685
|
-
if (model) {
|
|
6686
|
-
const { text } = await generateText2({
|
|
6687
|
-
model,
|
|
6688
|
-
system: systemPrompt,
|
|
6689
|
-
prompt: userPrompt
|
|
6690
|
-
});
|
|
6691
|
-
const data2 = freeformEvaluationSchema.parse(parseJsonFromText(text));
|
|
6692
|
-
const score2 = clampScore(data2.score);
|
|
6693
|
-
const hits2 = Array.isArray(data2.hits) ? data2.hits.filter(isNonEmptyString).slice(0, 4) : [];
|
|
6694
|
-
const misses2 = Array.isArray(data2.misses) ? data2.misses.filter(isNonEmptyString).slice(0, 4) : [];
|
|
6695
|
-
const reasoning2 = data2.reasoning;
|
|
6696
|
-
return {
|
|
6697
|
-
score: score2,
|
|
6698
|
-
verdict: scoreToVerdict(score2),
|
|
6699
|
-
hits: hits2,
|
|
6700
|
-
misses: misses2,
|
|
6701
|
-
expectedAspectCount: Math.max(hits2.length + misses2.length, 1),
|
|
6702
|
-
reasoning: reasoning2,
|
|
6703
|
-
evaluatorRawRequest,
|
|
6704
|
-
evaluatorResults
|
|
6705
|
-
};
|
|
6706
|
-
}
|
|
6707
|
-
const response = await judgeProvider.invoke({
|
|
6708
|
-
question: userPrompt,
|
|
6709
|
-
systemPrompt,
|
|
6710
|
-
evalCaseId: context.evalCase.id,
|
|
6711
|
-
attempt: context.attempt
|
|
6712
|
-
});
|
|
6713
|
-
const data = freeformEvaluationSchema.parse(
|
|
6714
|
-
parseJsonFromText(extractLastAssistantContent(response.outputMessages))
|
|
6715
|
-
);
|
|
6716
|
-
const score = clampScore(data.score);
|
|
6717
|
-
const hits = Array.isArray(data.hits) ? data.hits.filter(isNonEmptyString).slice(0, 4) : [];
|
|
6718
|
-
const misses = Array.isArray(data.misses) ? data.misses.filter(isNonEmptyString).slice(0, 4) : [];
|
|
6719
|
-
const reasoning = data.reasoning;
|
|
6720
|
-
return {
|
|
6721
|
-
score,
|
|
6722
|
-
verdict: scoreToVerdict(score),
|
|
6723
|
-
hits,
|
|
6724
|
-
misses,
|
|
6725
|
-
expectedAspectCount: Math.max(hits.length + misses.length, 1),
|
|
6726
|
-
reasoning,
|
|
6727
|
-
evaluatorRawRequest,
|
|
6728
|
-
evaluatorResults
|
|
6646
|
+
path: path15,
|
|
6647
|
+
score: 0,
|
|
6648
|
+
weight,
|
|
6649
|
+
hit: false,
|
|
6650
|
+
message: `${path15} (invalid numeric value)`
|
|
6729
6651
|
};
|
|
6730
|
-
}
|
|
6652
|
+
}
|
|
6653
|
+
const diff = Math.abs(candidateNum - expectedNum);
|
|
6654
|
+
let withinTolerance;
|
|
6655
|
+
if (relative) {
|
|
6656
|
+
const relativeDiff = expectedNum === 0 ? diff : diff / Math.abs(expectedNum);
|
|
6657
|
+
withinTolerance = relativeDiff <= tolerance;
|
|
6658
|
+
} else {
|
|
6659
|
+
withinTolerance = diff <= tolerance;
|
|
6660
|
+
}
|
|
6661
|
+
if (withinTolerance) {
|
|
6662
|
+
return {
|
|
6663
|
+
path: path15,
|
|
6664
|
+
score: 1,
|
|
6665
|
+
weight,
|
|
6666
|
+
hit: true,
|
|
6667
|
+
message: `${path15} (within tolerance: diff=${diff.toFixed(2)})`
|
|
6668
|
+
};
|
|
6669
|
+
}
|
|
6670
|
+
return {
|
|
6671
|
+
path: path15,
|
|
6672
|
+
score: 0,
|
|
6673
|
+
weight,
|
|
6674
|
+
hit: false,
|
|
6675
|
+
message: `${path15} (outside tolerance: diff=${diff.toFixed(2)}, tolerance=${tolerance})`
|
|
6676
|
+
};
|
|
6677
|
+
}
|
|
6678
|
+
/**
|
|
6679
|
+
* Date comparison with format normalization.
|
|
6680
|
+
*/
|
|
6681
|
+
compareDate(path15, candidateValue, expectedValue, fieldConfig, weight) {
|
|
6682
|
+
const formats = fieldConfig.formats ?? DEFAULT_DATE_FORMATS;
|
|
6683
|
+
const candidateDate = parseDate(String(candidateValue), formats);
|
|
6684
|
+
const expectedDate = parseDate(String(expectedValue), formats);
|
|
6685
|
+
if (candidateDate === null) {
|
|
6731
6686
|
return {
|
|
6687
|
+
path: path15,
|
|
6732
6688
|
score: 0,
|
|
6733
|
-
|
|
6734
|
-
|
|
6735
|
-
|
|
6736
|
-
|
|
6737
|
-
|
|
6738
|
-
|
|
6689
|
+
weight,
|
|
6690
|
+
hit: false,
|
|
6691
|
+
message: `${path15} (unparseable candidate date)`
|
|
6692
|
+
};
|
|
6693
|
+
}
|
|
6694
|
+
if (expectedDate === null) {
|
|
6695
|
+
return {
|
|
6696
|
+
path: path15,
|
|
6697
|
+
score: 0,
|
|
6698
|
+
weight,
|
|
6699
|
+
hit: false,
|
|
6700
|
+
message: `${path15} (unparseable expected date)`
|
|
6701
|
+
};
|
|
6702
|
+
}
|
|
6703
|
+
if (candidateDate.getFullYear() === expectedDate.getFullYear() && candidateDate.getMonth() === expectedDate.getMonth() && candidateDate.getDate() === expectedDate.getDate()) {
|
|
6704
|
+
return {
|
|
6705
|
+
path: path15,
|
|
6706
|
+
score: 1,
|
|
6707
|
+
weight,
|
|
6708
|
+
hit: true,
|
|
6709
|
+
message: path15
|
|
6739
6710
|
};
|
|
6740
6711
|
}
|
|
6712
|
+
return {
|
|
6713
|
+
path: path15,
|
|
6714
|
+
score: 0,
|
|
6715
|
+
weight,
|
|
6716
|
+
hit: false,
|
|
6717
|
+
message: `${path15} (date mismatch: got ${formatDateISO(candidateDate)}, expected ${formatDateISO(expectedDate)})`
|
|
6718
|
+
};
|
|
6719
|
+
}
|
|
6720
|
+
/**
|
|
6721
|
+
* Aggregate field results using configured strategy.
|
|
6722
|
+
*/
|
|
6723
|
+
aggregateResults(results) {
|
|
6724
|
+
const aggregation = this.config.aggregation ?? "weighted_average";
|
|
6725
|
+
const hits = [];
|
|
6726
|
+
const misses = [];
|
|
6727
|
+
for (const result of results) {
|
|
6728
|
+
if (result.hit) {
|
|
6729
|
+
hits.push(result.message);
|
|
6730
|
+
} else {
|
|
6731
|
+
misses.push(result.message);
|
|
6732
|
+
}
|
|
6733
|
+
}
|
|
6734
|
+
let score;
|
|
6735
|
+
if (aggregation === "all_or_nothing") {
|
|
6736
|
+
score = misses.length === 0 ? 1 : 0;
|
|
6737
|
+
} else {
|
|
6738
|
+
const totalWeight = results.reduce((sum, r) => sum + r.weight, 0);
|
|
6739
|
+
if (totalWeight === 0) {
|
|
6740
|
+
score = results.length === 0 ? 1 : 0;
|
|
6741
|
+
} else {
|
|
6742
|
+
const weightedSum = results.reduce((sum, r) => sum + r.score * r.weight, 0);
|
|
6743
|
+
score = weightedSum / totalWeight;
|
|
6744
|
+
}
|
|
6745
|
+
}
|
|
6746
|
+
const reasoning = `${hits.length}/${results.length} fields matched`;
|
|
6747
|
+
return {
|
|
6748
|
+
score: clampScore(score),
|
|
6749
|
+
verdict: scoreToVerdict(score),
|
|
6750
|
+
hits: hits.slice(0, 4),
|
|
6751
|
+
misses: misses.slice(0, 4),
|
|
6752
|
+
expectedAspectCount: results.length,
|
|
6753
|
+
reasoning
|
|
6754
|
+
};
|
|
6741
6755
|
}
|
|
6742
6756
|
};
|
|
6757
|
+
/**
 * Resolve a dotted / bracketed path such as "invoice.items[0].total".
 *
 * Only own properties are followed, so inherited members such as
 * `constructor` or `toString` read as "missing" instead of leaking
 * Object.prototype values into field comparisons.
 *
 * @param {unknown} obj - Root value to walk; any falsy root yields undefined.
 * @param {string} path15 - Path using "." and "[n]" separators.
 * @returns {unknown} The value at the path, or undefined when any segment is
 *   missing or a non-object is reached before the path ends.
 */
function resolvePath(obj, path15) {
  if (!path15 || !obj) {
    return void 0;
  }
  const parts = path15.split(/\.|\[|\]/).filter((p) => p.length > 0);
  let current = obj;
  for (const part of parts) {
    if (current === null || typeof current !== "object") {
      return void 0;
    }
    // Numeric segments on arrays are normalized so "[01]" still means index 1.
    const key = /^\d+$/.test(part) && Array.isArray(current) ? Number.parseInt(part, 10) : part;
    if (!Object.hasOwn(current, key)) {
      return void 0;
    }
    current = current[key];
  }
  return current;
}
|
|
6779
|
+
/**
 * Coerce a value to a number for numeric comparison.
 *
 * Numbers pass through, strings are read with Number.parseFloat (so a numeric
 * prefix such as "12abc" still yields 12), and everything else is rejected.
 * NaN is never returned: a NaN number is reported as null exactly like an
 * unparseable string, so callers only have to check for null.
 *
 * @param {unknown} value
 * @returns {number | null}
 */
function toNumber(value) {
  if (typeof value === "number") {
    return Number.isNaN(value) ? null : value;
  }
  if (typeof value === "string") {
    const num = Number.parseFloat(value);
    return Number.isNaN(num) ? null : num;
  }
  return null;
}
|
|
6789
|
+
/**
 * Parse a date string into a Date whose local calendar fields carry the day.
 *
 * Order of attempts:
 *  1. `YYYY-MM-DD` is built as a local date. The built-in parser treats this
 *     form as UTC midnight, which shifts the local day west of UTC.
 *  2. `N/N/YYYY` or `N-N-YYYY` is resolved with `formats`: "MM/DD"/"MM-DD"
 *     entries select month-first, "DD/MM"/"DD-MM" entries select day-first.
 *     When both or neither are present, a first number above 12 means
 *     day-first, otherwise month-first. This runs before the generic parser
 *     so the configured order is not overridden by it.
 *  3. Anything else goes to `new Date(string)`.
 *  4. `D-Mon-YYYY` is looked up in MONTH_NAMES as a last resort.
 *
 * Impossible calendar days (31/02, month 0, day 0) return null instead of
 * rolling over into a neighbouring month.
 *
 * @param {string} dateStr
 * @param {string[]} [formats] - Format hints such as "DD/MM/YYYY".
 * @returns {Date | null}
 */
function parseDate(dateStr, formats = []) {
  if (!dateStr) return null;
  const trimmed = dateStr.trim();

  // Build a local-midnight date and verify that the fields round-trip.
  // setFullYear is used so years below 100 are not remapped to 19xx.
  const localDay = (year, monthIndex, day) => {
    const date = new Date(2000, 0, 1);
    date.setFullYear(year, monthIndex, day);
    const valid = date.getFullYear() === year && date.getMonth() === monthIndex && date.getDate() === day;
    return valid ? date : null;
  };

  const isoMatch = trimmed.match(/^(\d{4})-(\d{2})-(\d{2})$/);
  if (isoMatch) {
    return localDay(
      Number.parseInt(isoMatch[1], 10),
      Number.parseInt(isoMatch[2], 10) - 1,
      Number.parseInt(isoMatch[3], 10)
    );
  }

  const usMatch = trimmed.match(/^(\d{1,2})[\/\-](\d{1,2})[\/\-](\d{4})$/);
  if (usMatch) {
    const first = Number.parseInt(usMatch[1], 10);
    const second = Number.parseInt(usMatch[2], 10);
    const year = Number.parseInt(usMatch[3], 10);
    const hasUSFormat = formats.some((f) => f.includes("MM/DD") || f.includes("MM-DD"));
    const hasEUFormat = formats.some((f) => f.includes("DD/MM") || f.includes("DD-MM"));
    // An unambiguous configuration decides the order; otherwise fall back to
    // the "first number cannot be a month" heuristic with month-first default.
    const dayFirst = hasUSFormat !== hasEUFormat ? hasEUFormat : first > 12;
    return dayFirst ? localDay(year, second - 1, first) : localDay(year, first - 1, second);
  }

  const generic = new Date(trimmed);
  if (!Number.isNaN(generic.getTime())) {
    return generic;
  }

  const localizedMatch = trimmed.match(/^(\d{1,2})-([A-Za-z]{3,9})-(\d{4})$/);
  if (localizedMatch) {
    const month = MONTH_NAMES[localizedMatch[2].toLowerCase()];
    if (month !== void 0) {
      return localDay(Number.parseInt(localizedMatch[3], 10), month, Number.parseInt(localizedMatch[1], 10));
    }
  }
  return null;
}

/**
 * Format a Date as `YYYY-MM-DD` using its local calendar fields, i.e. the
 * same fields date comparison uses. toISOString() would print the UTC day,
 * which can differ by one from the day that was actually compared.
 *
 * @param {Date} date
 * @returns {string}
 */
function formatDateISO(date) {
  const year = String(date.getFullYear()).padStart(4, "0");
  const month = String(date.getMonth() + 1).padStart(2, "0");
  const day = String(date.getDate()).padStart(2, "0");
  return `${year}-${month}-${day}`;
}
|
|
6844
|
+
/**
 * Compatibility alias that forwards to parseJsonFromText and returns its
 * result unchanged; anything that helper throws propagates to the caller.
 */
function parseJsonFromTextSafe(text) {
  const parsed = parseJsonFromText(text);
  return parsed;
}
|
|
6847
|
+
|
|
6848
|
+
// src/evaluation/evaluators/latency.ts
|
|
6743
6849
|
var LatencyEvaluator = class {
|
|
6744
6850
|
kind = "latency";
|
|
6745
6851
|
config;
|
|
@@ -6772,57 +6878,17 @@ var LatencyEvaluator = class {
|
|
|
6772
6878
|
hits: passed ? [`Duration ${durationMs}ms <= ${threshold}ms threshold`] : [],
|
|
6773
6879
|
misses: passed ? [] : [`Duration ${durationMs}ms > ${threshold}ms threshold`],
|
|
6774
6880
|
expectedAspectCount: 1,
|
|
6775
|
-
reasoning: `Execution took ${durationMs}ms (threshold: ${threshold}ms)`,
|
|
6776
|
-
evaluatorRawRequest: {
|
|
6777
|
-
type: "latency",
|
|
6778
|
-
threshold,
|
|
6779
|
-
durationMs
|
|
6780
|
-
}
|
|
6781
|
-
};
|
|
6782
|
-
}
|
|
6783
|
-
};
|
|
6784
|
-
var CostEvaluator = class {
|
|
6785
|
-
kind = "cost";
|
|
6786
|
-
config;
|
|
6787
|
-
constructor(options) {
|
|
6788
|
-
this.config = options.config;
|
|
6789
|
-
}
|
|
6790
|
-
evaluate(context) {
|
|
6791
|
-
const { budget } = this.config;
|
|
6792
|
-
const costUsd = context.traceSummary?.costUsd;
|
|
6793
|
-
if (costUsd === void 0) {
|
|
6794
|
-
return {
|
|
6795
|
-
score: 0,
|
|
6796
|
-
verdict: "fail",
|
|
6797
|
-
hits: [],
|
|
6798
|
-
misses: ["No cost data available in trace"],
|
|
6799
|
-
expectedAspectCount: 1,
|
|
6800
|
-
reasoning: "Execution cost not reported by provider",
|
|
6801
|
-
evaluatorRawRequest: {
|
|
6802
|
-
type: "cost",
|
|
6803
|
-
budget,
|
|
6804
|
-
costUsd: null
|
|
6805
|
-
}
|
|
6806
|
-
};
|
|
6807
|
-
}
|
|
6808
|
-
const passed = costUsd <= budget;
|
|
6809
|
-
const score = passed ? 1 : 0;
|
|
6810
|
-
const formatCost = (n) => `$${n.toFixed(4)}`;
|
|
6811
|
-
return {
|
|
6812
|
-
score,
|
|
6813
|
-
verdict: passed ? "pass" : "fail",
|
|
6814
|
-
hits: passed ? [`Cost ${formatCost(costUsd)} <= ${formatCost(budget)} budget`] : [],
|
|
6815
|
-
misses: passed ? [] : [`Cost ${formatCost(costUsd)} > ${formatCost(budget)} budget`],
|
|
6816
|
-
expectedAspectCount: 1,
|
|
6817
|
-
reasoning: `Execution cost ${formatCost(costUsd)} (budget: ${formatCost(budget)})`,
|
|
6818
|
-
evaluatorRawRequest: {
|
|
6819
|
-
type: "cost",
|
|
6820
|
-
budget,
|
|
6821
|
-
costUsd
|
|
6881
|
+
reasoning: `Execution took ${durationMs}ms (threshold: ${threshold}ms)`,
|
|
6882
|
+
evaluatorRawRequest: {
|
|
6883
|
+
type: "latency",
|
|
6884
|
+
threshold,
|
|
6885
|
+
durationMs
|
|
6822
6886
|
}
|
|
6823
6887
|
};
|
|
6824
6888
|
}
|
|
6825
6889
|
};
|
|
6890
|
+
|
|
6891
|
+
// src/evaluation/evaluators/token-usage.ts
|
|
6826
6892
|
var TokenUsageEvaluator = class {
|
|
6827
6893
|
kind = "token_usage";
|
|
6828
6894
|
config;
|
|
@@ -6906,6 +6972,226 @@ var TokenUsageEvaluator = class {
|
|
|
6906
6972
|
}
|
|
6907
6973
|
};
|
|
6908
6974
|
|
|
6975
|
+
// src/evaluation/evaluators/tool-trajectory.ts
|
|
6976
|
+
/**
 * Check whether a tool call's arguments satisfy an expectation.
 *
 * - No expectation (`undefined`, or `null` as produced by an empty config
 *   value) and the literal "any" accept every call.
 * - Otherwise each expected key must be an own property of `actual` with a
 *   deep-equal value; extra keys on `actual` are ignored.
 * - A call without arguments (`undefined` or `null`) never satisfies an
 *   explicit expectation. Previously a `null` input threw a TypeError inside
 *   Object.hasOwn.
 *
 * @param {Record<string, unknown> | "any" | null | undefined} expected
 * @param {unknown} actual
 * @returns {boolean}
 */
function argsMatch(expected, actual) {
  if (expected === void 0 || expected === null) return true;
  if (expected === "any") return true;
  if (actual === void 0 || actual === null) return false;
  for (const key of Object.keys(expected)) {
    if (!Object.hasOwn(actual, key)) return false;
    if (!deepEqual(expected[key], actual[key])) return false;
  }
  return true;
}
|
|
6986
|
+
/**
 * Evaluator that scores an agent run by the tools it invoked.
 *
 * Tool calls are read from `context.outputMessages`. When the messages carry
 * none, `context.traceSummary` is used as the summary instead; it only feeds
 * `any_order`, because the ordered modes work on the individual calls.
 *
 * Modes (`config.mode`):
 * - "any_order": every tool in `config.minimums` must be called at least the
 *   given number of times.
 * - "in_order":  the entries of `config.expected` must be found one after
 *   another while scanning the calls forward.
 * - "exact":     `config.expected` must match the calls position by position.
 */
var ToolTrajectoryEvaluator = class {
  kind = "tool_trajectory";
  config;
  constructor(options) {
    this.config = options.config;
  }
  /**
   * Run the configured mode against the context's tool calls.
   * Returns a failing result when neither tool calls nor a trace summary exist,
   * or when the mode is unknown.
   */
  evaluate(context) {
    const { outputMessages, traceSummary } = context;
    const toolCalls = this.extractToolCallsFromMessages(outputMessages);
    if (toolCalls.length === 0 && !traceSummary) {
      return {
        score: 0,
        verdict: "fail",
        hits: [],
        misses: ["No trace available for evaluation"],
        expectedAspectCount: 1
      };
    }
    // Past the guard above one of the two sources is always present.
    const summary = toolCalls.length > 0 ? this.buildSummary(toolCalls) : traceSummary;
    switch (this.config.mode) {
      case "any_order":
        return this.evaluateAnyOrder(summary);
      case "in_order":
        return this.evaluateInOrder(toolCalls);
      case "exact":
        return this.evaluateExact(toolCalls);
      default:
        return {
          score: 0,
          verdict: "fail",
          hits: [],
          misses: [`Unknown mode: ${this.config.mode}`],
          expectedAspectCount: 1
        };
    }
  }
  /**
   * Extract tool calls from output messages.
   * Each call is reduced to `{ name, args }` taken from `call.tool` and
   * `call.input`; messages without `toolCalls` are skipped.
   */
  extractToolCallsFromMessages(messages) {
    if (!messages) {
      return [];
    }
    const toolCalls = [];
    for (const message of messages) {
      if (!message.toolCalls) {
        continue;
      }
      for (const call of message.toolCalls) {
        toolCalls.push({ name: call.tool, args: call.input });
      }
    }
    return toolCalls;
  }
  /**
   * Build a summary from extracted tool calls.
   * Counts go through a Map and Object.fromEntries so tool names that collide
   * with Object.prototype members (e.g. "constructor") are counted like any
   * other name instead of starting from the inherited value.
   */
  buildSummary(toolCalls) {
    const counts = new Map();
    for (const call of toolCalls) {
      counts.set(call.name, (counts.get(call.name) ?? 0) + 1);
    }
    const toolCallsByName = Object.fromEntries(counts);
    const toolNames = Object.keys(toolCallsByName).sort();
    return {
      eventCount: toolCalls.length,
      toolNames,
      toolCallsByName,
      errorCount: 0
    };
  }
  /**
   * Score = fraction of tools in `config.minimums` whose call count reaches
   * the required minimum. No minimums configured is an automatic pass.
   */
  evaluateAnyOrder(summary) {
    const minimums = this.config.minimums ?? {};
    const toolNames = Object.keys(minimums);
    if (toolNames.length === 0) {
      return {
        score: 1,
        verdict: "pass",
        hits: ["No tool requirements specified"],
        misses: [],
        expectedAspectCount: 0
      };
    }
    const callsByName = summary.toolCallsByName ?? {};
    const hits = [];
    const misses = [];
    for (const toolName of toolNames) {
      const required = minimums[toolName];
      // Own-property lookup: an uncalled tool named like a prototype member
      // must read as 0 calls, not as the inherited function.
      const recorded = Object.hasOwn(callsByName, toolName) ? callsByName[toolName] : void 0;
      const actual = recorded ?? 0;
      if (actual >= required) {
        hits.push(`${toolName}: called ${actual} times (required \u2265${required})`);
      } else {
        misses.push(`${toolName}: called ${actual} times (required \u2265${required})`);
      }
    }
    const score = hits.length / toolNames.length;
    return {
      score,
      verdict: scoreToVerdict(score),
      hits,
      misses,
      expectedAspectCount: toolNames.length
    };
  }
  /**
   * Scan the calls forward once, matching `config.expected` in order.
   * Calls with other tool names are skipped. The first call with the expected
   * name settles that expectation: matching args are a hit, mismatching args
   * are a miss, and in both cases the call is consumed.
   */
  evaluateInOrder(toolCalls) {
    const expected = this.config.expected ?? [];
    if (expected.length === 0) {
      return {
        score: 1,
        verdict: "pass",
        hits: ["No tool sequence specified"],
        misses: [],
        expectedAspectCount: 0
      };
    }
    const hits = [];
    const misses = [];
    let actualIndex = 0;
    for (let i = 0; i < expected.length; i++) {
      const expectedItem = expected[i];
      const expectedTool = expectedItem.tool;
      let settled = false;
      while (actualIndex < toolCalls.length && !settled) {
        const actualCall = toolCalls[actualIndex];
        if (actualCall.name === expectedTool) {
          if (argsMatch(expectedItem.args, actualCall.args)) {
            hits.push(`Found ${expectedTool} at position ${actualIndex}`);
          } else {
            misses.push(
              `Expected ${expectedTool} at position ${i}: tool found at ${actualIndex} but args mismatch`
            );
          }
          settled = true;
        }
        actualIndex++;
      }
      if (!settled) {
        misses.push(`Expected ${expectedTool} at position ${i}, not found in remaining trace`);
      }
    }
    const score = hits.length / expected.length;
    return {
      score,
      verdict: scoreToVerdict(score),
      hits,
      misses,
      expectedAspectCount: expected.length
    };
  }
  /**
   * Compare calls to `config.expected` position by position.
   * The score is hits divided by the longer of the two sequences, so extra
   * calls lower the score just as missing calls do. Previously a trace with
   * extra calls scored 1.0 while also listing a length-mismatch miss.
   */
  evaluateExact(toolCalls) {
    const expected = this.config.expected ?? [];
    if (expected.length === 0) {
      return {
        score: 1,
        verdict: "pass",
        hits: ["No tool sequence specified"],
        misses: [],
        expectedAspectCount: 0
      };
    }
    const hits = [];
    const misses = [];
    if (toolCalls.length !== expected.length) {
      misses.push(`Expected ${expected.length} tool calls, got ${toolCalls.length}`);
    }
    const checkLength = Math.min(expected.length, toolCalls.length);
    for (let i = 0; i < checkLength; i++) {
      const expectedItem = expected[i];
      const expectedTool = expectedItem.tool;
      const actualCall = toolCalls[i];
      const actualTool = actualCall.name;
      if (actualTool !== expectedTool) {
        misses.push(`Position ${i}: expected ${expectedTool}, got ${actualTool}`);
      } else if (argsMatch(expectedItem.args, actualCall.args)) {
        hits.push(`Position ${i}: ${expectedTool}`);
      } else {
        misses.push(`Position ${i}: ${expectedTool} args mismatch`);
      }
    }
    for (let i = checkLength; i < expected.length; i++) {
      misses.push(`Position ${i}: expected ${expected[i].tool}, got nothing`);
    }
    const score = hits.length / Math.max(expected.length, toolCalls.length);
    return {
      score,
      verdict: scoreToVerdict(score),
      hits,
      misses,
      expectedAspectCount: expected.length
    };
  }
};
|
|
7194
|
+
|
|
6909
7195
|
// src/evaluation/orchestrator.ts
|
|
6910
7196
|
import { createHash } from "node:crypto";
|
|
6911
7197
|
import path14 from "node:path";
|
|
@@ -7119,6 +7405,17 @@ async function runEvaluation(options) {
|
|
|
7119
7405
|
}
|
|
7120
7406
|
return getOrCreateProvider(resolvedJudge);
|
|
7121
7407
|
};
|
|
7408
|
+
const targetResolver = (name) => {
|
|
7409
|
+
const resolved = resolveTargetByName(name);
|
|
7410
|
+
if (!resolved) {
|
|
7411
|
+
return void 0;
|
|
7412
|
+
}
|
|
7413
|
+
return getOrCreateProvider(resolved);
|
|
7414
|
+
};
|
|
7415
|
+
const availableTargets = [
|
|
7416
|
+
target.name,
|
|
7417
|
+
...Array.from(targetDefinitions.keys())
|
|
7418
|
+
];
|
|
7122
7419
|
const evaluatorRegistry = buildEvaluatorRegistry(evaluators, resolveJudgeProvider);
|
|
7123
7420
|
const primaryProvider = getOrCreateProvider(target);
|
|
7124
7421
|
const providerSupportsBatch = target.providerBatching === true && primaryProvider.supportsBatch === true && typeof primaryProvider.invokeBatch === "function";
|
|
@@ -7148,7 +7445,9 @@ async function runEvaluation(options) {
|
|
|
7148
7445
|
onResult,
|
|
7149
7446
|
verbose,
|
|
7150
7447
|
resolveJudgeProvider,
|
|
7151
|
-
agentTimeoutMs
|
|
7448
|
+
agentTimeoutMs,
|
|
7449
|
+
targetResolver,
|
|
7450
|
+
availableTargets
|
|
7152
7451
|
});
|
|
7153
7452
|
} catch (error) {
|
|
7154
7453
|
if (verbose) {
|
|
@@ -7187,7 +7486,9 @@ async function runEvaluation(options) {
|
|
|
7187
7486
|
cache,
|
|
7188
7487
|
useCache,
|
|
7189
7488
|
now,
|
|
7190
|
-
judgeProvider
|
|
7489
|
+
judgeProvider,
|
|
7490
|
+
targetResolver,
|
|
7491
|
+
availableTargets
|
|
7191
7492
|
});
|
|
7192
7493
|
if (onProgress) {
|
|
7193
7494
|
await onProgress({
|
|
@@ -7254,7 +7555,9 @@ async function runBatchEvaluation(options) {
|
|
|
7254
7555
|
onProgress,
|
|
7255
7556
|
onResult,
|
|
7256
7557
|
resolveJudgeProvider,
|
|
7257
|
-
agentTimeoutMs
|
|
7558
|
+
agentTimeoutMs,
|
|
7559
|
+
targetResolver,
|
|
7560
|
+
availableTargets
|
|
7258
7561
|
} = options;
|
|
7259
7562
|
const promptInputsList = [];
|
|
7260
7563
|
const formattingMode = usesFileReferencePrompt(provider) ? "agent" : "lm";
|
|
@@ -7329,7 +7632,9 @@ async function runBatchEvaluation(options) {
|
|
|
7329
7632
|
judgeProvider: await resolveJudgeProvider(target),
|
|
7330
7633
|
agentTimeoutMs,
|
|
7331
7634
|
outputMessages,
|
|
7332
|
-
traceSummary
|
|
7635
|
+
traceSummary,
|
|
7636
|
+
targetResolver,
|
|
7637
|
+
availableTargets
|
|
7333
7638
|
});
|
|
7334
7639
|
if (providerError) {
|
|
7335
7640
|
result = { ...result, error: providerError };
|
|
@@ -7387,7 +7692,9 @@ async function runEvalCase(options) {
|
|
|
7387
7692
|
cache,
|
|
7388
7693
|
useCache,
|
|
7389
7694
|
signal,
|
|
7390
|
-
judgeProvider
|
|
7695
|
+
judgeProvider,
|
|
7696
|
+
targetResolver,
|
|
7697
|
+
availableTargets
|
|
7391
7698
|
} = options;
|
|
7392
7699
|
const formattingMode = usesFileReferencePrompt(provider) ? "agent" : "lm";
|
|
7393
7700
|
const promptInputs = await buildPromptInputs(evalCase, formattingMode);
|
|
@@ -7461,7 +7768,9 @@ async function runEvalCase(options) {
|
|
|
7461
7768
|
judgeProvider,
|
|
7462
7769
|
agentTimeoutMs,
|
|
7463
7770
|
outputMessages,
|
|
7464
|
-
traceSummary
|
|
7771
|
+
traceSummary,
|
|
7772
|
+
targetResolver,
|
|
7773
|
+
availableTargets
|
|
7465
7774
|
});
|
|
7466
7775
|
return providerError ? { ...result, error: providerError } : result;
|
|
7467
7776
|
} catch (error) {
|
|
@@ -7481,7 +7790,9 @@ async function evaluateCandidate(options) {
|
|
|
7481
7790
|
judgeProvider,
|
|
7482
7791
|
agentTimeoutMs,
|
|
7483
7792
|
outputMessages,
|
|
7484
|
-
traceSummary
|
|
7793
|
+
traceSummary,
|
|
7794
|
+
targetResolver,
|
|
7795
|
+
availableTargets
|
|
7485
7796
|
} = options;
|
|
7486
7797
|
const gradeTimestamp = nowFn();
|
|
7487
7798
|
const { score, evaluatorResults } = await runEvaluatorsForCase({
|
|
@@ -7496,7 +7807,9 @@ async function evaluateCandidate(options) {
|
|
|
7496
7807
|
judgeProvider,
|
|
7497
7808
|
agentTimeoutMs,
|
|
7498
7809
|
outputMessages,
|
|
7499
|
-
traceSummary
|
|
7810
|
+
traceSummary,
|
|
7811
|
+
targetResolver,
|
|
7812
|
+
availableTargets
|
|
7500
7813
|
});
|
|
7501
7814
|
const completedAt = nowFn();
|
|
7502
7815
|
let agentProviderRequest;
|
|
@@ -7549,7 +7862,9 @@ async function runEvaluatorsForCase(options) {
|
|
|
7549
7862
|
judgeProvider,
|
|
7550
7863
|
agentTimeoutMs,
|
|
7551
7864
|
outputMessages,
|
|
7552
|
-
traceSummary
|
|
7865
|
+
traceSummary,
|
|
7866
|
+
targetResolver,
|
|
7867
|
+
availableTargets
|
|
7553
7868
|
} = options;
|
|
7554
7869
|
if (evalCase.evaluators && evalCase.evaluators.length > 0) {
|
|
7555
7870
|
return runEvaluatorList({
|
|
@@ -7565,7 +7880,9 @@ async function runEvaluatorsForCase(options) {
|
|
|
7565
7880
|
judgeProvider,
|
|
7566
7881
|
agentTimeoutMs,
|
|
7567
7882
|
outputMessages,
|
|
7568
|
-
traceSummary
|
|
7883
|
+
traceSummary,
|
|
7884
|
+
targetResolver,
|
|
7885
|
+
availableTargets
|
|
7569
7886
|
});
|
|
7570
7887
|
}
|
|
7571
7888
|
const evaluatorKind = evalCase.evaluator ?? "llm_judge";
|
|
@@ -7583,7 +7900,9 @@ async function runEvaluatorsForCase(options) {
|
|
|
7583
7900
|
now,
|
|
7584
7901
|
judgeProvider,
|
|
7585
7902
|
outputMessages,
|
|
7586
|
-
traceSummary
|
|
7903
|
+
traceSummary,
|
|
7904
|
+
targetResolver,
|
|
7905
|
+
availableTargets
|
|
7587
7906
|
});
|
|
7588
7907
|
return { score };
|
|
7589
7908
|
}
|
|
@@ -7601,7 +7920,9 @@ async function runEvaluatorList(options) {
|
|
|
7601
7920
|
judgeProvider,
|
|
7602
7921
|
agentTimeoutMs,
|
|
7603
7922
|
outputMessages,
|
|
7604
|
-
traceSummary
|
|
7923
|
+
traceSummary,
|
|
7924
|
+
targetResolver,
|
|
7925
|
+
availableTargets
|
|
7605
7926
|
} = options;
|
|
7606
7927
|
const scored = [];
|
|
7607
7928
|
const evaluatorResults = [];
|
|
@@ -7639,7 +7960,8 @@ async function runEvaluatorList(options) {
|
|
|
7639
7960
|
script: evaluator.script,
|
|
7640
7961
|
cwd: evaluator.resolvedCwd ?? evaluator.cwd,
|
|
7641
7962
|
agentTimeoutMs,
|
|
7642
|
-
config: evaluator.config
|
|
7963
|
+
config: evaluator.config,
|
|
7964
|
+
target: evaluator.target
|
|
7643
7965
|
});
|
|
7644
7966
|
const score2 = await codeEvaluator.evaluate({
|
|
7645
7967
|
evalCase,
|
|
@@ -7649,8 +7971,11 @@ async function runEvaluatorList(options) {
|
|
|
7649
7971
|
attempt,
|
|
7650
7972
|
promptInputs,
|
|
7651
7973
|
now,
|
|
7974
|
+
judgeProvider,
|
|
7652
7975
|
outputMessages,
|
|
7653
|
-
traceSummary
|
|
7976
|
+
traceSummary,
|
|
7977
|
+
targetResolver,
|
|
7978
|
+
availableTargets
|
|
7654
7979
|
});
|
|
7655
7980
|
const weight = evaluator.weight ?? 1;
|
|
7656
7981
|
scored.push({ score: score2, name: evaluator.name, type: "code_judge", weight });
|
|
@@ -7663,7 +7988,8 @@ async function runEvaluatorList(options) {
|
|
|
7663
7988
|
hits: score2.hits,
|
|
7664
7989
|
misses: score2.misses,
|
|
7665
7990
|
reasoning: score2.reasoning,
|
|
7666
|
-
evaluatorProviderRequest: score2.evaluatorRawRequest
|
|
7991
|
+
evaluatorProviderRequest: score2.evaluatorRawRequest,
|
|
7992
|
+
details: score2.details
|
|
7667
7993
|
});
|
|
7668
7994
|
}
|
|
7669
7995
|
if (evaluator.type === "composite") {
|
|
@@ -7677,7 +8003,8 @@ async function runEvaluatorList(options) {
|
|
|
7677
8003
|
script: memberConfig.script,
|
|
7678
8004
|
cwd: memberConfig.resolvedCwd ?? memberConfig.cwd,
|
|
7679
8005
|
agentTimeoutMs,
|
|
7680
|
-
config: memberConfig.config
|
|
8006
|
+
config: memberConfig.config,
|
|
8007
|
+
target: memberConfig.target
|
|
7681
8008
|
});
|
|
7682
8009
|
case "composite":
|
|
7683
8010
|
return new CompositeEvaluator({
|
|
@@ -7726,7 +8053,9 @@ async function runEvaluatorList(options) {
|
|
|
7726
8053
|
now,
|
|
7727
8054
|
judgeProvider,
|
|
7728
8055
|
outputMessages,
|
|
7729
|
-
traceSummary
|
|
8056
|
+
traceSummary,
|
|
8057
|
+
targetResolver,
|
|
8058
|
+
availableTargets
|
|
7730
8059
|
});
|
|
7731
8060
|
const weight = evaluator.weight ?? 1;
|
|
7732
8061
|
scored.push({ score: score2, name: evaluator.name, type: evaluator.type, weight });
|
|
@@ -7922,11 +8251,11 @@ async function runEvaluatorList(options) {
|
|
|
7922
8251
|
(total, entry) => total + (entry.score.expectedAspectCount ?? 0),
|
|
7923
8252
|
0
|
|
7924
8253
|
);
|
|
7925
|
-
const reasoningParts = scored.map((entry) => entry.score.reasoning ? `${entry.name}: ${entry.score.reasoning}` : void 0).filter(
|
|
8254
|
+
const reasoningParts = scored.map((entry) => entry.score.reasoning ? `${entry.name}: ${entry.score.reasoning}` : void 0).filter(isNonEmptyString);
|
|
7926
8255
|
const reasoning = reasoningParts.length > 0 ? reasoningParts.join(" | ") : void 0;
|
|
7927
8256
|
const score = {
|
|
7928
8257
|
score: aggregateScore,
|
|
7929
|
-
verdict:
|
|
8258
|
+
verdict: scoreToVerdict(aggregateScore),
|
|
7930
8259
|
hits,
|
|
7931
8260
|
misses,
|
|
7932
8261
|
expectedAspectCount,
|
|
@@ -7973,18 +8302,6 @@ async function resolveCustomPrompt(config) {
|
|
|
7973
8302
|
}
|
|
7974
8303
|
return config.prompt;
|
|
7975
8304
|
}
|
|
7976
|
-
function isNonEmptyString2(value) {
|
|
7977
|
-
return typeof value === "string" && value.trim().length > 0;
|
|
7978
|
-
}
|
|
7979
|
-
function scoreToVerdict2(score) {
|
|
7980
|
-
if (score >= 0.8) {
|
|
7981
|
-
return "pass";
|
|
7982
|
-
}
|
|
7983
|
-
if (score >= 0.6) {
|
|
7984
|
-
return "borderline";
|
|
7985
|
-
}
|
|
7986
|
-
return "fail";
|
|
7987
|
-
}
|
|
7988
8305
|
function filterEvalCases(evalCases, evalId) {
|
|
7989
8306
|
if (!evalId) {
|
|
7990
8307
|
return evalCases;
|
|
@@ -8127,7 +8444,8 @@ function mapChildResults(children) {
|
|
|
8127
8444
|
misses: child.misses,
|
|
8128
8445
|
reasoning: child.reasoning,
|
|
8129
8446
|
evaluatorProviderRequest: child.evaluatorRawRequest,
|
|
8130
|
-
evaluatorResults: mapChildResults(child.evaluatorResults)
|
|
8447
|
+
evaluatorResults: mapChildResults(child.evaluatorResults),
|
|
8448
|
+
details: child.details
|
|
8131
8449
|
}));
|
|
8132
8450
|
}
|
|
8133
8451
|
function computeWeightedMean(entries) {
|
|
@@ -8142,7 +8460,7 @@ function computeWeightedMean(entries) {
|
|
|
8142
8460
|
}
|
|
8143
8461
|
|
|
8144
8462
|
// src/evaluation/generators/rubric-generator.ts
|
|
8145
|
-
import { generateText as
|
|
8463
|
+
import { generateText as generateText4 } from "ai";
|
|
8146
8464
|
import { z as z3 } from "zod";
|
|
8147
8465
|
var rubricItemSchema = z3.object({
|
|
8148
8466
|
id: z3.string().describe("Short identifier for this rubric (e.g., clarity, completeness)"),
|
|
@@ -8176,7 +8494,7 @@ You must return a valid JSON object matching this schema:
|
|
|
8176
8494
|
let lastError;
|
|
8177
8495
|
for (let attempt = 1; attempt <= 3; attempt++) {
|
|
8178
8496
|
try {
|
|
8179
|
-
const { text } = await
|
|
8497
|
+
const { text } = await generateText4({
|
|
8180
8498
|
model,
|
|
8181
8499
|
system,
|
|
8182
8500
|
prompt
|
|
@@ -8238,31 +8556,39 @@ export {
|
|
|
8238
8556
|
ToolTrajectoryEvaluator,
|
|
8239
8557
|
avgToolDurationMs,
|
|
8240
8558
|
buildDirectoryChain,
|
|
8559
|
+
buildOutputSchema,
|
|
8241
8560
|
buildPromptInputs,
|
|
8242
8561
|
buildSearchRoots,
|
|
8562
|
+
clampScore,
|
|
8243
8563
|
computeTraceSummary,
|
|
8244
8564
|
consumeClaudeCodeLogEntries,
|
|
8245
8565
|
consumeCodexLogEntries,
|
|
8246
8566
|
consumePiLogEntries,
|
|
8247
8567
|
createAgentKernel,
|
|
8248
8568
|
createProvider,
|
|
8569
|
+
deepEqual,
|
|
8249
8570
|
ensureVSCodeSubagents,
|
|
8571
|
+
executeScript,
|
|
8250
8572
|
explorationRatio,
|
|
8251
|
-
|
|
8573
|
+
extractJsonBlob,
|
|
8252
8574
|
fileExists,
|
|
8253
8575
|
findGitRoot,
|
|
8576
|
+
freeformEvaluationSchema,
|
|
8254
8577
|
generateRubrics,
|
|
8255
8578
|
getHitCount,
|
|
8256
8579
|
isEvaluatorKind,
|
|
8257
8580
|
isGuidelineFile,
|
|
8258
8581
|
isJsonObject,
|
|
8259
8582
|
isJsonValue,
|
|
8583
|
+
isNonEmptyString,
|
|
8260
8584
|
isTestMessage,
|
|
8261
8585
|
isTestMessageRole,
|
|
8262
8586
|
listTargetNames,
|
|
8263
8587
|
loadEvalCases,
|
|
8264
8588
|
mergeExecutionMetrics,
|
|
8265
8589
|
normalizeLineEndings,
|
|
8590
|
+
parseJsonFromText,
|
|
8591
|
+
parseJsonSafe,
|
|
8266
8592
|
readJsonFile,
|
|
8267
8593
|
readTargetDefinitions,
|
|
8268
8594
|
readTestSuiteMetadata,
|
|
@@ -8272,6 +8598,7 @@ export {
|
|
|
8272
8598
|
resolveTargetDefinition,
|
|
8273
8599
|
runEvalCase,
|
|
8274
8600
|
runEvaluation,
|
|
8601
|
+
scoreToVerdict,
|
|
8275
8602
|
subscribeToClaudeCodeLogEntries,
|
|
8276
8603
|
subscribeToCodexLogEntries,
|
|
8277
8604
|
subscribeToPiLogEntries,
|