@agentv/core 2.0.1 → 2.1.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/dist/{chunk-IBTKEEOT.js → chunk-KDEP4I7G.js} +44 -1
- package/dist/chunk-KDEP4I7G.js.map +1 -0
- package/dist/evaluation/validation/index.cjs +1 -0
- package/dist/evaluation/validation/index.cjs.map +1 -1
- package/dist/evaluation/validation/index.js +1 -1
- package/dist/index.cjs +1641 -1138
- package/dist/index.cjs.map +1 -1
- package/dist/index.d.cts +157 -100
- package/dist/index.d.ts +157 -100
- package/dist/index.js +1451 -997
- package/dist/index.js.map +1 -1
- package/package.json +4 -1
- package/dist/chunk-IBTKEEOT.js.map +0 -1
package/dist/index.js
CHANGED
|
@@ -10,7 +10,7 @@ import {
|
|
|
10
10
|
readTextFile,
|
|
11
11
|
resolveFileReference,
|
|
12
12
|
resolveTargetDefinition
|
|
13
|
-
} from "./chunk-IBTKEEOT.js";
|
|
13
|
+
} from "./chunk-KDEP4I7G.js";
|
|
14
14
|
|
|
15
15
|
// src/evaluation/types.ts
|
|
16
16
|
var TEST_MESSAGE_ROLE_VALUES = ["system", "user", "assistant", "tool"];
|
|
@@ -150,85 +150,6 @@ import { readFile as readFile5 } from "node:fs/promises";
|
|
|
150
150
|
import path6 from "node:path";
|
|
151
151
|
import { parse as parse2 } from "yaml";
|
|
152
152
|
|
|
153
|
-
// src/evaluation/formatting/segment-formatter.ts
|
|
154
|
-
function extractCodeBlocks(segments) {
|
|
155
|
-
const CODE_BLOCK_PATTERN = /```[\s\S]*?```/g;
|
|
156
|
-
const codeBlocks = [];
|
|
157
|
-
for (const segment of segments) {
|
|
158
|
-
const typeValue = segment.type;
|
|
159
|
-
if (typeof typeValue !== "string" || typeValue !== "text") {
|
|
160
|
-
continue;
|
|
161
|
-
}
|
|
162
|
-
const textValue = segment.value;
|
|
163
|
-
if (typeof textValue !== "string") {
|
|
164
|
-
continue;
|
|
165
|
-
}
|
|
166
|
-
const matches = textValue.match(CODE_BLOCK_PATTERN);
|
|
167
|
-
if (matches) {
|
|
168
|
-
codeBlocks.push(...matches);
|
|
169
|
-
}
|
|
170
|
-
}
|
|
171
|
-
return codeBlocks;
|
|
172
|
-
}
|
|
173
|
-
function formatFileContents(parts) {
|
|
174
|
-
const fileCount = parts.filter((p) => p.isFile).length;
|
|
175
|
-
if (fileCount > 0) {
|
|
176
|
-
return parts.map((part) => {
|
|
177
|
-
if (part.isFile && part.displayPath) {
|
|
178
|
-
return `<file path="${part.displayPath}">
|
|
179
|
-
${part.content}
|
|
180
|
-
</file>`;
|
|
181
|
-
}
|
|
182
|
-
return part.content;
|
|
183
|
-
}).join("\n\n");
|
|
184
|
-
}
|
|
185
|
-
return parts.map((p) => p.content).join(" ");
|
|
186
|
-
}
|
|
187
|
-
function formatSegment(segment, mode = "lm") {
|
|
188
|
-
const type = asString(segment.type);
|
|
189
|
-
if (type === "text") {
|
|
190
|
-
return asString(segment.value);
|
|
191
|
-
}
|
|
192
|
-
if (type === "guideline_ref") {
|
|
193
|
-
const refPath = asString(segment.path);
|
|
194
|
-
return refPath ? `<Attached: ${refPath}>` : void 0;
|
|
195
|
-
}
|
|
196
|
-
if (type === "file") {
|
|
197
|
-
const filePath = asString(segment.path);
|
|
198
|
-
if (!filePath) {
|
|
199
|
-
return void 0;
|
|
200
|
-
}
|
|
201
|
-
if (mode === "agent") {
|
|
202
|
-
return `<file: path="${filePath}">`;
|
|
203
|
-
}
|
|
204
|
-
const text = asString(segment.text);
|
|
205
|
-
if (text && filePath) {
|
|
206
|
-
return formatFileContents([{ content: text.trim(), isFile: true, displayPath: filePath }]);
|
|
207
|
-
}
|
|
208
|
-
}
|
|
209
|
-
return void 0;
|
|
210
|
-
}
|
|
211
|
-
function hasVisibleContent(segments) {
|
|
212
|
-
return segments.some((segment) => {
|
|
213
|
-
const type = asString(segment.type);
|
|
214
|
-
if (type === "text") {
|
|
215
|
-
const value = asString(segment.value);
|
|
216
|
-
return value !== void 0 && value.trim().length > 0;
|
|
217
|
-
}
|
|
218
|
-
if (type === "guideline_ref") {
|
|
219
|
-
return false;
|
|
220
|
-
}
|
|
221
|
-
if (type === "file") {
|
|
222
|
-
const text = asString(segment.text);
|
|
223
|
-
return text !== void 0 && text.trim().length > 0;
|
|
224
|
-
}
|
|
225
|
-
return false;
|
|
226
|
-
});
|
|
227
|
-
}
|
|
228
|
-
function asString(value) {
|
|
229
|
-
return typeof value === "string" ? value : void 0;
|
|
230
|
-
}
|
|
231
|
-
|
|
232
153
|
// src/evaluation/loaders/config-loader.ts
|
|
233
154
|
import { readFile } from "node:fs/promises";
|
|
234
155
|
import path2 from "node:path";
|
|
@@ -483,7 +404,7 @@ async function parseEvaluators(rawEvalCase, globalExecution, searchRoots, evalId
|
|
|
483
404
|
logWarning2(`Skipping invalid evaluator entry for '${evalId}' (expected object)`);
|
|
484
405
|
continue;
|
|
485
406
|
}
|
|
486
|
-
const name = asString2(rawEvaluator.name);
|
|
407
|
+
const name = asString(rawEvaluator.name);
|
|
487
408
|
const typeValue = rawEvaluator.type;
|
|
488
409
|
if (!name || !isEvaluatorKind(typeValue)) {
|
|
489
410
|
logWarning2(`Skipping evaluator with invalid name/type in '${evalId}'`);
|
|
@@ -511,7 +432,7 @@ async function parseEvaluators(rawEvalCase, globalExecution, searchRoots, evalId
|
|
|
511
432
|
continue;
|
|
512
433
|
}
|
|
513
434
|
const weight2 = validateWeight(rawEvaluator.weight, name, evalId);
|
|
514
|
-
const cwd = asString2(rawEvaluator.cwd);
|
|
435
|
+
const cwd = asString(rawEvaluator.cwd);
|
|
515
436
|
let resolvedCwd;
|
|
516
437
|
if (cwd) {
|
|
517
438
|
const resolved = await resolveFileReference2(cwd, searchRoots);
|
|
@@ -526,7 +447,29 @@ async function parseEvaluators(rawEvalCase, globalExecution, searchRoots, evalId
|
|
|
526
447
|
} else {
|
|
527
448
|
resolvedCwd = searchRoots[0];
|
|
528
449
|
}
|
|
529
|
-
const knownProps = /* @__PURE__ */ new Set(["name", "type", "script", "cwd", "weight"]);
|
|
450
|
+
const rawTarget = rawEvaluator.target;
|
|
451
|
+
let targetConfig;
|
|
452
|
+
if (rawTarget !== void 0) {
|
|
453
|
+
if (isJsonObject2(rawTarget)) {
|
|
454
|
+
const maxCalls = rawTarget.max_calls;
|
|
455
|
+
if (maxCalls !== void 0 && (typeof maxCalls !== "number" || maxCalls < 0)) {
|
|
456
|
+
logWarning2(
|
|
457
|
+
`Invalid target.max_calls for evaluator '${name}' in '${evalId}': must be a non-negative number`
|
|
458
|
+
);
|
|
459
|
+
} else {
|
|
460
|
+
targetConfig = {
|
|
461
|
+
...typeof maxCalls === "number" ? { max_calls: maxCalls } : {}
|
|
462
|
+
};
|
|
463
|
+
}
|
|
464
|
+
} else if (rawTarget === true) {
|
|
465
|
+
targetConfig = {};
|
|
466
|
+
} else {
|
|
467
|
+
logWarning2(
|
|
468
|
+
`Invalid target config for evaluator '${name}' in '${evalId}': expected object or true`
|
|
469
|
+
);
|
|
470
|
+
}
|
|
471
|
+
}
|
|
472
|
+
const knownProps = /* @__PURE__ */ new Set(["name", "type", "script", "cwd", "weight", "target"]);
|
|
530
473
|
const config = {};
|
|
531
474
|
for (const [key, value] of Object.entries(rawEvaluator)) {
|
|
532
475
|
if (!knownProps.has(key) && value !== void 0) {
|
|
@@ -540,7 +483,8 @@ async function parseEvaluators(rawEvalCase, globalExecution, searchRoots, evalId
|
|
|
540
483
|
cwd,
|
|
541
484
|
resolvedCwd,
|
|
542
485
|
...weight2 !== void 0 ? { weight: weight2 } : {},
|
|
543
|
-
...Object.keys(config).length > 0 ? { config } : {}
|
|
486
|
+
...Object.keys(config).length > 0 ? { config } : {},
|
|
487
|
+
...targetConfig !== void 0 ? { target: targetConfig } : {}
|
|
544
488
|
});
|
|
545
489
|
continue;
|
|
546
490
|
}
|
|
@@ -557,7 +501,7 @@ async function parseEvaluators(rawEvalCase, globalExecution, searchRoots, evalId
|
|
|
557
501
|
logWarning2(`Skipping composite evaluator '${name}' in '${evalId}': missing aggregator`);
|
|
558
502
|
continue;
|
|
559
503
|
}
|
|
560
|
-
const aggregatorType = asString2(rawAggregator.type);
|
|
504
|
+
const aggregatorType = asString(rawAggregator.type);
|
|
561
505
|
if (aggregatorType !== "weighted_average" && aggregatorType !== "code_judge" && aggregatorType !== "llm_judge") {
|
|
562
506
|
logWarning2(
|
|
563
507
|
`Skipping composite evaluator '${name}' in '${evalId}': invalid aggregator type '${aggregatorType}'`
|
|
@@ -570,7 +514,7 @@ async function parseEvaluators(rawEvalCase, globalExecution, searchRoots, evalId
|
|
|
570
514
|
logWarning2(`Skipping invalid member evaluator in composite '${name}' (expected object)`);
|
|
571
515
|
continue;
|
|
572
516
|
}
|
|
573
|
-
const memberName = asString2(rawMember.name);
|
|
517
|
+
const memberName = asString(rawMember.name);
|
|
574
518
|
const memberType = rawMember.type;
|
|
575
519
|
if (!memberName || !isEvaluatorKind(memberType)) {
|
|
576
520
|
logWarning2(`Skipping member evaluator with invalid name/type in composite '${name}'`);
|
|
@@ -608,7 +552,7 @@ async function parseEvaluators(rawEvalCase, globalExecution, searchRoots, evalId
|
|
|
608
552
|
...Object.keys(parsedWeights).length > 0 ? { weights: parsedWeights } : {}
|
|
609
553
|
};
|
|
610
554
|
} else if (aggregatorType === "code_judge") {
|
|
611
|
-
const aggregatorPath = asString2(rawAggregator.path);
|
|
555
|
+
const aggregatorPath = asString(rawAggregator.path);
|
|
612
556
|
if (!aggregatorPath) {
|
|
613
557
|
logWarning2(
|
|
614
558
|
`Skipping composite evaluator '${name}' in '${evalId}': code_judge aggregator missing path`
|
|
@@ -621,7 +565,7 @@ async function parseEvaluators(rawEvalCase, globalExecution, searchRoots, evalId
|
|
|
621
565
|
cwd: searchRoots[0]
|
|
622
566
|
};
|
|
623
567
|
} else {
|
|
624
|
-
const aggregatorPrompt = asString2(rawAggregator.prompt);
|
|
568
|
+
const aggregatorPrompt = asString(rawAggregator.prompt);
|
|
625
569
|
let promptPath2;
|
|
626
570
|
if (aggregatorPrompt) {
|
|
627
571
|
const resolved = await resolveFileReference2(aggregatorPrompt, searchRoots);
|
|
@@ -646,7 +590,7 @@ async function parseEvaluators(rawEvalCase, globalExecution, searchRoots, evalId
|
|
|
646
590
|
continue;
|
|
647
591
|
}
|
|
648
592
|
if (typeValue === "tool_trajectory") {
|
|
649
|
-
const mode = asString2(rawEvaluator.mode);
|
|
593
|
+
const mode = asString(rawEvaluator.mode);
|
|
650
594
|
if (mode !== "any_order" && mode !== "in_order" && mode !== "exact") {
|
|
651
595
|
logWarning2(
|
|
652
596
|
`Skipping tool_trajectory evaluator '${name}' in '${evalId}': invalid mode '${mode}' (must be any_order, in_order, or exact)`
|
|
@@ -737,8 +681,8 @@ async function parseEvaluators(rawEvalCase, globalExecution, searchRoots, evalId
|
|
|
737
681
|
);
|
|
738
682
|
continue;
|
|
739
683
|
}
|
|
740
|
-
const fieldPath = asString2(rawField.path);
|
|
741
|
-
const match = asString2(rawField.match);
|
|
684
|
+
const fieldPath = asString(rawField.path);
|
|
685
|
+
const match = asString(rawField.match);
|
|
742
686
|
if (!fieldPath) {
|
|
743
687
|
logWarning2(
|
|
744
688
|
`Skipping field without path in field_accuracy evaluator '${name}' in '${evalId}'`
|
|
@@ -768,7 +712,7 @@ async function parseEvaluators(rawEvalCase, globalExecution, searchRoots, evalId
|
|
|
768
712
|
);
|
|
769
713
|
continue;
|
|
770
714
|
}
|
|
771
|
-
const aggregation = asString2(rawEvaluator.aggregation);
|
|
715
|
+
const aggregation = asString(rawEvaluator.aggregation);
|
|
772
716
|
const validAggregation = isValidFieldAggregationType(aggregation) ? aggregation : void 0;
|
|
773
717
|
const weight2 = validateWeight(rawEvaluator.weight, name, evalId);
|
|
774
718
|
evaluators.push({
|
|
@@ -849,7 +793,7 @@ async function parseEvaluators(rawEvalCase, globalExecution, searchRoots, evalId
|
|
|
849
793
|
});
|
|
850
794
|
continue;
|
|
851
795
|
}
|
|
852
|
-
const prompt = asString2(rawEvaluator.prompt);
|
|
796
|
+
const prompt = asString(rawEvaluator.prompt);
|
|
853
797
|
let promptPath;
|
|
854
798
|
if (prompt) {
|
|
855
799
|
const resolved = await resolveFileReference2(prompt, searchRoots);
|
|
@@ -868,11 +812,11 @@ async function parseEvaluators(rawEvalCase, globalExecution, searchRoots, evalId
|
|
|
868
812
|
);
|
|
869
813
|
}
|
|
870
814
|
}
|
|
871
|
-
const _model = asString2(rawEvaluator.model);
|
|
815
|
+
const _model = asString(rawEvaluator.model);
|
|
872
816
|
const rawRubrics = rawEvaluator.rubrics;
|
|
873
817
|
const parsedRubrics = Array.isArray(rawRubrics) ? rawRubrics.filter((r) => isJsonObject2(r)).map((rubric, index) => ({
|
|
874
|
-
id: asString2(rubric.id) ?? `rubric-${index + 1}`,
|
|
875
|
-
description: asString2(rubric.description) ?? "",
|
|
818
|
+
id: asString(rubric.id) ?? `rubric-${index + 1}`,
|
|
819
|
+
description: asString(rubric.description) ?? "",
|
|
876
820
|
weight: typeof rubric.weight === "number" ? rubric.weight : 1,
|
|
877
821
|
required: typeof rubric.required === "boolean" ? rubric.required : true
|
|
878
822
|
})).filter((r) => r.description.length > 0) : void 0;
|
|
@@ -916,7 +860,7 @@ function coerceEvaluator(candidate, contextId) {
|
|
|
916
860
|
logWarning2(`Unknown evaluator '${candidate}' in ${contextId}, falling back to default`);
|
|
917
861
|
return void 0;
|
|
918
862
|
}
|
|
919
|
-
function asString2(value) {
|
|
863
|
+
function asString(value) {
|
|
920
864
|
return typeof value === "string" ? value : void 0;
|
|
921
865
|
}
|
|
922
866
|
function asStringArray(value, description) {
|
|
@@ -992,6 +936,68 @@ function isValidFieldAggregationType(value) {
|
|
|
992
936
|
// src/evaluation/loaders/message-processor.ts
|
|
993
937
|
import { readFile as readFile3 } from "node:fs/promises";
|
|
994
938
|
import path4 from "node:path";
|
|
939
|
+
|
|
940
|
+
// src/evaluation/formatting/segment-formatter.ts
|
|
941
|
+
function formatFileContents(parts) {
|
|
942
|
+
const fileCount = parts.filter((p) => p.isFile).length;
|
|
943
|
+
if (fileCount > 0) {
|
|
944
|
+
return parts.map((part) => {
|
|
945
|
+
if (part.isFile && part.displayPath) {
|
|
946
|
+
return `<file path="${part.displayPath}">
|
|
947
|
+
${part.content}
|
|
948
|
+
</file>`;
|
|
949
|
+
}
|
|
950
|
+
return part.content;
|
|
951
|
+
}).join("\n\n");
|
|
952
|
+
}
|
|
953
|
+
return parts.map((p) => p.content).join(" ");
|
|
954
|
+
}
|
|
955
|
+
function formatSegment(segment, mode = "lm") {
|
|
956
|
+
const type = asString2(segment.type);
|
|
957
|
+
if (type === "text") {
|
|
958
|
+
return asString2(segment.value);
|
|
959
|
+
}
|
|
960
|
+
if (type === "guideline_ref") {
|
|
961
|
+
const refPath = asString2(segment.path);
|
|
962
|
+
return refPath ? `<Attached: ${refPath}>` : void 0;
|
|
963
|
+
}
|
|
964
|
+
if (type === "file") {
|
|
965
|
+
const filePath = asString2(segment.path);
|
|
966
|
+
if (!filePath) {
|
|
967
|
+
return void 0;
|
|
968
|
+
}
|
|
969
|
+
if (mode === "agent") {
|
|
970
|
+
return `<file: path="${filePath}">`;
|
|
971
|
+
}
|
|
972
|
+
const text = asString2(segment.text);
|
|
973
|
+
if (text && filePath) {
|
|
974
|
+
return formatFileContents([{ content: text.trim(), isFile: true, displayPath: filePath }]);
|
|
975
|
+
}
|
|
976
|
+
}
|
|
977
|
+
return void 0;
|
|
978
|
+
}
|
|
979
|
+
function hasVisibleContent(segments) {
|
|
980
|
+
return segments.some((segment) => {
|
|
981
|
+
const type = asString2(segment.type);
|
|
982
|
+
if (type === "text") {
|
|
983
|
+
const value = asString2(segment.value);
|
|
984
|
+
return value !== void 0 && value.trim().length > 0;
|
|
985
|
+
}
|
|
986
|
+
if (type === "guideline_ref") {
|
|
987
|
+
return false;
|
|
988
|
+
}
|
|
989
|
+
if (type === "file") {
|
|
990
|
+
const text = asString2(segment.text);
|
|
991
|
+
return text !== void 0 && text.trim().length > 0;
|
|
992
|
+
}
|
|
993
|
+
return false;
|
|
994
|
+
});
|
|
995
|
+
}
|
|
996
|
+
function asString2(value) {
|
|
997
|
+
return typeof value === "string" ? value : void 0;
|
|
998
|
+
}
|
|
999
|
+
|
|
1000
|
+
// src/evaluation/loaders/message-processor.ts
|
|
995
1001
|
var ANSI_YELLOW4 = "\x1B[33m";
|
|
996
1002
|
var ANSI_RESET4 = "\x1B[0m";
|
|
997
1003
|
async function processMessages(options) {
|
|
@@ -1297,9 +1303,6 @@ ${messageContent}`);
|
|
|
1297
1303
|
questionParts.push(formattedContent);
|
|
1298
1304
|
}
|
|
1299
1305
|
}
|
|
1300
|
-
if (testCase.code_snippets.length > 0) {
|
|
1301
|
-
questionParts.push(testCase.code_snippets.join("\n"));
|
|
1302
|
-
}
|
|
1303
1306
|
question = questionParts.map((part) => part.trim()).filter((part) => part.length > 0).join("\n\n");
|
|
1304
1307
|
}
|
|
1305
1308
|
const chatPrompt = useRoleMarkers ? buildChatPromptFromSegments({
|
|
@@ -1498,7 +1501,6 @@ async function loadEvalCases(evalFilePath, repoRoot, options) {
|
|
|
1498
1501
|
repoRootPath,
|
|
1499
1502
|
verbose
|
|
1500
1503
|
}) : [];
|
|
1501
|
-
const codeSnippets = extractCodeBlocks(inputSegments);
|
|
1502
1504
|
let referenceAnswer = "";
|
|
1503
1505
|
if (outputSegments.length > 0) {
|
|
1504
1506
|
const lastMessage = outputSegments[outputSegments.length - 1];
|
|
@@ -1571,7 +1573,6 @@ async function loadEvalCases(evalFilePath, repoRoot, options) {
|
|
|
1571
1573
|
guideline_paths: guidelinePaths.map((guidelinePath) => path6.resolve(guidelinePath)),
|
|
1572
1574
|
guideline_patterns: guidelinePatterns,
|
|
1573
1575
|
file_paths: allFilePaths,
|
|
1574
|
-
code_snippets: codeSnippets,
|
|
1575
1576
|
expected_outcome: outcome,
|
|
1576
1577
|
evaluator: evalCaseEvaluatorKind,
|
|
1577
1578
|
evaluators
|
|
@@ -4084,6 +4085,167 @@ var MockProvider = class {
|
|
|
4084
4085
|
}
|
|
4085
4086
|
};
|
|
4086
4087
|
|
|
4088
|
+
// src/evaluation/providers/pi-agent-sdk.ts
|
|
4089
|
+
var piAgentModule = null;
|
|
4090
|
+
var piAiModule = null;
|
|
4091
|
+
async function loadPiModules() {
|
|
4092
|
+
if (!piAgentModule || !piAiModule) {
|
|
4093
|
+
try {
|
|
4094
|
+
[piAgentModule, piAiModule] = await Promise.all([
|
|
4095
|
+
import("@mariozechner/pi-agent"),
|
|
4096
|
+
import("@mariozechner/pi-ai")
|
|
4097
|
+
]);
|
|
4098
|
+
} catch (error) {
|
|
4099
|
+
throw new Error(
|
|
4100
|
+
`Failed to load pi-agent-sdk dependencies. Please install them:
|
|
4101
|
+
npm install @mariozechner/pi-agent @mariozechner/pi-ai
|
|
4102
|
+
|
|
4103
|
+
Original error: ${error instanceof Error ? error.message : String(error)}`
|
|
4104
|
+
);
|
|
4105
|
+
}
|
|
4106
|
+
}
|
|
4107
|
+
return {
|
|
4108
|
+
Agent: piAgentModule.Agent,
|
|
4109
|
+
ProviderTransport: piAgentModule.ProviderTransport,
|
|
4110
|
+
getModel: piAiModule.getModel,
|
|
4111
|
+
getEnvApiKey: piAiModule.getEnvApiKey
|
|
4112
|
+
};
|
|
4113
|
+
}
|
|
4114
|
+
var PiAgentSdkProvider = class {
|
|
4115
|
+
id;
|
|
4116
|
+
kind = "pi-agent-sdk";
|
|
4117
|
+
targetName;
|
|
4118
|
+
supportsBatch = false;
|
|
4119
|
+
config;
|
|
4120
|
+
constructor(targetName, config) {
|
|
4121
|
+
this.id = `pi-agent-sdk:${targetName}`;
|
|
4122
|
+
this.targetName = targetName;
|
|
4123
|
+
this.config = config;
|
|
4124
|
+
}
|
|
4125
|
+
async invoke(request) {
|
|
4126
|
+
if (request.signal?.aborted) {
|
|
4127
|
+
throw new Error("Pi agent SDK request was aborted before execution");
|
|
4128
|
+
}
|
|
4129
|
+
const { Agent, ProviderTransport, getModel, getEnvApiKey } = await loadPiModules();
|
|
4130
|
+
const startTime = Date.now();
|
|
4131
|
+
const providerName = this.config.provider ?? "anthropic";
|
|
4132
|
+
const modelId = this.config.model ?? "claude-sonnet-4-20250514";
|
|
4133
|
+
const model = getModel(providerName, modelId);
|
|
4134
|
+
const systemPrompt = this.config.systemPrompt ?? "Answer directly and concisely.";
|
|
4135
|
+
const transport = new ProviderTransport({
|
|
4136
|
+
getApiKey: async (provider) => {
|
|
4137
|
+
return this.config.apiKey ?? getEnvApiKey(provider) ?? void 0;
|
|
4138
|
+
}
|
|
4139
|
+
});
|
|
4140
|
+
const agent = new Agent({
|
|
4141
|
+
initialState: {
|
|
4142
|
+
systemPrompt,
|
|
4143
|
+
model,
|
|
4144
|
+
tools: [],
|
|
4145
|
+
// No tools for simple Q&A
|
|
4146
|
+
messages: []
|
|
4147
|
+
},
|
|
4148
|
+
transport
|
|
4149
|
+
});
|
|
4150
|
+
const outputMessages = [];
|
|
4151
|
+
let finalAssistantContent = "";
|
|
4152
|
+
const unsubscribe = agent.subscribe((event) => {
|
|
4153
|
+
if (event.type === "message_end") {
|
|
4154
|
+
const msg = event.message;
|
|
4155
|
+
if (msg.role === "assistant") {
|
|
4156
|
+
const content = extractTextContent2(msg.content);
|
|
4157
|
+
if (content) {
|
|
4158
|
+
finalAssistantContent = content;
|
|
4159
|
+
}
|
|
4160
|
+
}
|
|
4161
|
+
}
|
|
4162
|
+
});
|
|
4163
|
+
try {
|
|
4164
|
+
const timeoutMs = this.config.timeoutMs ?? 12e4;
|
|
4165
|
+
const timeoutPromise = new Promise((_, reject) => {
|
|
4166
|
+
setTimeout(
|
|
4167
|
+
() => reject(new Error(`Pi agent SDK timed out after ${timeoutMs}ms`)),
|
|
4168
|
+
timeoutMs
|
|
4169
|
+
);
|
|
4170
|
+
});
|
|
4171
|
+
await Promise.race([agent.prompt(request.question), timeoutPromise]);
|
|
4172
|
+
await agent.waitForIdle();
|
|
4173
|
+
const agentMessages = agent.state.messages;
|
|
4174
|
+
for (const msg of agentMessages) {
|
|
4175
|
+
outputMessages.push(convertAgentMessage(msg));
|
|
4176
|
+
}
|
|
4177
|
+
const durationMs = Date.now() - startTime;
|
|
4178
|
+
return {
|
|
4179
|
+
raw: {
|
|
4180
|
+
messages: agentMessages,
|
|
4181
|
+
systemPrompt,
|
|
4182
|
+
model: this.config.model,
|
|
4183
|
+
provider: this.config.provider
|
|
4184
|
+
},
|
|
4185
|
+
outputMessages,
|
|
4186
|
+
durationMs
|
|
4187
|
+
};
|
|
4188
|
+
} finally {
|
|
4189
|
+
unsubscribe();
|
|
4190
|
+
}
|
|
4191
|
+
}
|
|
4192
|
+
};
|
|
4193
|
+
function extractTextContent2(content) {
|
|
4194
|
+
if (typeof content === "string") {
|
|
4195
|
+
return content;
|
|
4196
|
+
}
|
|
4197
|
+
if (!Array.isArray(content)) {
|
|
4198
|
+
return void 0;
|
|
4199
|
+
}
|
|
4200
|
+
const textParts = [];
|
|
4201
|
+
for (const part of content) {
|
|
4202
|
+
if (!part || typeof part !== "object") {
|
|
4203
|
+
continue;
|
|
4204
|
+
}
|
|
4205
|
+
const p = part;
|
|
4206
|
+
if (p.type === "text" && typeof p.text === "string") {
|
|
4207
|
+
textParts.push(p.text);
|
|
4208
|
+
}
|
|
4209
|
+
}
|
|
4210
|
+
return textParts.length > 0 ? textParts.join("\n") : void 0;
|
|
4211
|
+
}
|
|
4212
|
+
function convertAgentMessage(message) {
|
|
4213
|
+
if (!message || typeof message !== "object") {
|
|
4214
|
+
return { role: "unknown", content: String(message) };
|
|
4215
|
+
}
|
|
4216
|
+
const msg = message;
|
|
4217
|
+
const role = typeof msg.role === "string" ? msg.role : "unknown";
|
|
4218
|
+
const content = extractTextContent2(msg.content);
|
|
4219
|
+
const toolCalls = extractToolCalls2(msg.content);
|
|
4220
|
+
const timestamp = typeof msg.timestamp === "number" ? new Date(msg.timestamp).toISOString() : typeof msg.timestamp === "string" ? msg.timestamp : void 0;
|
|
4221
|
+
return {
|
|
4222
|
+
role,
|
|
4223
|
+
content,
|
|
4224
|
+
toolCalls: toolCalls.length > 0 ? toolCalls : void 0,
|
|
4225
|
+
timestamp
|
|
4226
|
+
};
|
|
4227
|
+
}
|
|
4228
|
+
function extractToolCalls2(content) {
|
|
4229
|
+
if (!Array.isArray(content)) {
|
|
4230
|
+
return [];
|
|
4231
|
+
}
|
|
4232
|
+
const toolCalls = [];
|
|
4233
|
+
for (const part of content) {
|
|
4234
|
+
if (!part || typeof part !== "object") {
|
|
4235
|
+
continue;
|
|
4236
|
+
}
|
|
4237
|
+
const p = part;
|
|
4238
|
+
if (p.type === "tool_use" && typeof p.name === "string") {
|
|
4239
|
+
toolCalls.push({
|
|
4240
|
+
tool: p.name,
|
|
4241
|
+
input: p.input,
|
|
4242
|
+
id: typeof p.id === "string" ? p.id : void 0
|
|
4243
|
+
});
|
|
4244
|
+
}
|
|
4245
|
+
}
|
|
4246
|
+
return toolCalls;
|
|
4247
|
+
}
|
|
4248
|
+
|
|
4087
4249
|
// src/evaluation/providers/pi-coding-agent.ts
|
|
4088
4250
|
import { spawn as spawn3 } from "node:child_process";
|
|
4089
4251
|
import { randomUUID as randomUUID3 } from "node:crypto";
|
|
@@ -4599,8 +4761,8 @@ function convertPiMessage(message) {
|
|
|
4599
4761
|
if (typeof role !== "string") {
|
|
4600
4762
|
return void 0;
|
|
4601
4763
|
}
|
|
4602
|
-
const content = extractTextContent2(msg.content);
|
|
4603
|
-
const toolCalls = extractToolCalls2(msg.content);
|
|
4764
|
+
const content = extractTextContent3(msg.content);
|
|
4765
|
+
const toolCalls = extractToolCalls3(msg.content);
|
|
4604
4766
|
const timestamp = typeof msg.timestamp === "number" ? new Date(msg.timestamp).toISOString() : typeof msg.timestamp === "string" ? msg.timestamp : void 0;
|
|
4605
4767
|
const metadata = {};
|
|
4606
4768
|
if (msg.api) metadata.api = msg.api;
|
|
@@ -4616,7 +4778,7 @@ function convertPiMessage(message) {
|
|
|
4616
4778
|
metadata: Object.keys(metadata).length > 0 ? metadata : void 0
|
|
4617
4779
|
};
|
|
4618
4780
|
}
|
|
4619
|
-
function extractTextContent2(content) {
|
|
4781
|
+
function extractTextContent3(content) {
|
|
4620
4782
|
if (typeof content === "string") {
|
|
4621
4783
|
return content;
|
|
4622
4784
|
}
|
|
@@ -4635,7 +4797,7 @@ function extractTextContent2(content) {
|
|
|
4635
4797
|
}
|
|
4636
4798
|
return textParts.length > 0 ? textParts.join("\n") : void 0;
|
|
4637
4799
|
}
|
|
4638
|
-
function extractToolCalls2(content) {
|
|
4800
|
+
function extractToolCalls3(content) {
|
|
4639
4801
|
if (!Array.isArray(content)) {
|
|
4640
4802
|
return [];
|
|
4641
4803
|
}
|
|
@@ -5130,6 +5292,8 @@ function createProvider(target) {
|
|
|
5130
5292
|
return new CodexProvider(target.name, target.config);
|
|
5131
5293
|
case "pi-coding-agent":
|
|
5132
5294
|
return new PiCodingAgentProvider(target.name, target.config);
|
|
5295
|
+
case "pi-agent-sdk":
|
|
5296
|
+
return new PiAgentSdkProvider(target.name, target.config);
|
|
5133
5297
|
case "claude-code":
|
|
5134
5298
|
return new ClaudeCodeProvider(target.name, target.config);
|
|
5135
5299
|
case "mock":
|
|
@@ -5148,25 +5312,80 @@ function resolveAndCreateProvider(definition, env = process.env) {
|
|
|
5148
5312
|
return createProvider(resolved);
|
|
5149
5313
|
}
|
|
5150
5314
|
|
|
5151
|
-
// src/evaluation/evaluators.ts
|
|
5152
|
-
|
|
5153
|
-
|
|
5154
|
-
|
|
5155
|
-
// src/runtime/exec.ts
|
|
5156
|
-
function shellEscapePath(value) {
|
|
5157
|
-
if (process.platform === "win32") {
|
|
5158
|
-
return `"${value.replaceAll('"', '""')}"`;
|
|
5315
|
+
// src/evaluation/evaluators/scoring.ts
|
|
5316
|
+
function scoreToVerdict(score) {
|
|
5317
|
+
if (score >= 0.8) {
|
|
5318
|
+
return "pass";
|
|
5159
5319
|
}
|
|
5160
|
-
|
|
5320
|
+
if (score >= 0.6) {
|
|
5321
|
+
return "borderline";
|
|
5322
|
+
}
|
|
5323
|
+
return "fail";
|
|
5161
5324
|
}
|
|
5162
|
-
|
|
5163
|
-
if (argv.length === 0) {
|
|
5164
|
-
|
|
5325
|
+
function clampScore(value) {
|
|
5326
|
+
if (Number.isNaN(value) || !Number.isFinite(value)) {
|
|
5327
|
+
return 0;
|
|
5165
5328
|
}
|
|
5166
|
-
if (typeof Bun !== "undefined") {
|
|
5167
|
-
return execFileWithStdinBun(argv, stdinPayload, options);
|
|
5329
|
+
if (value < 0) {
|
|
5330
|
+
return 0;
|
|
5168
5331
|
}
|
|
5169
|
-
|
|
5332
|
+
if (value > 1) {
|
|
5333
|
+
return 1;
|
|
5334
|
+
}
|
|
5335
|
+
return value;
|
|
5336
|
+
}
|
|
5337
|
+
function extractJsonBlob(text) {
|
|
5338
|
+
const match = text.match(/\{[\s\S]*\}/);
|
|
5339
|
+
return match?.[0];
|
|
5340
|
+
}
|
|
5341
|
+
function parseJsonFromText(text) {
|
|
5342
|
+
const cleaned = typeof text === "string" ? text.replace(/```json\n?|```/g, "").trim() : "";
|
|
5343
|
+
const blob = extractJsonBlob(cleaned) ?? cleaned;
|
|
5344
|
+
return JSON.parse(blob);
|
|
5345
|
+
}
|
|
5346
|
+
function isNonEmptyString(value) {
|
|
5347
|
+
return typeof value === "string" && value.trim().length > 0;
|
|
5348
|
+
}
|
|
5349
|
+
function parseJsonSafe(payload) {
|
|
5350
|
+
try {
|
|
5351
|
+
return JSON.parse(payload);
|
|
5352
|
+
} catch {
|
|
5353
|
+
return void 0;
|
|
5354
|
+
}
|
|
5355
|
+
}
|
|
5356
|
+
function deepEqual(a, b) {
|
|
5357
|
+
if (a === b) return true;
|
|
5358
|
+
if (a === null || b === null) return a === b;
|
|
5359
|
+
if (typeof a !== typeof b) return false;
|
|
5360
|
+
if (typeof a !== "object") return a === b;
|
|
5361
|
+
if (Array.isArray(a) !== Array.isArray(b)) return false;
|
|
5362
|
+
if (Array.isArray(a) && Array.isArray(b)) {
|
|
5363
|
+
if (a.length !== b.length) return false;
|
|
5364
|
+
return a.every((val, i) => deepEqual(val, b[i]));
|
|
5365
|
+
}
|
|
5366
|
+
const aObj = a;
|
|
5367
|
+
const bObj = b;
|
|
5368
|
+
const aKeys = Object.keys(aObj);
|
|
5369
|
+
const bKeys = Object.keys(bObj);
|
|
5370
|
+
if (aKeys.length !== bKeys.length) return false;
|
|
5371
|
+
return aKeys.every((key) => Object.hasOwn(bObj, key) && deepEqual(aObj[key], bObj[key]));
|
|
5372
|
+
}
|
|
5373
|
+
|
|
5374
|
+
// src/runtime/exec.ts
|
|
5375
|
+
function shellEscapePath(value) {
|
|
5376
|
+
if (process.platform === "win32") {
|
|
5377
|
+
return `"${value.replaceAll('"', '""')}"`;
|
|
5378
|
+
}
|
|
5379
|
+
return `'${value.replaceAll("'", `'"'"'`)}'`;
|
|
5380
|
+
}
|
|
5381
|
+
async function execFileWithStdin(argv, stdinPayload, options = {}) {
|
|
5382
|
+
if (argv.length === 0) {
|
|
5383
|
+
throw new Error("Executable argv must include at least one entry");
|
|
5384
|
+
}
|
|
5385
|
+
if (typeof Bun !== "undefined") {
|
|
5386
|
+
return execFileWithStdinBun(argv, stdinPayload, options);
|
|
5387
|
+
}
|
|
5388
|
+
return execFileWithStdinNode(argv, stdinPayload, options);
|
|
5170
5389
|
}
|
|
5171
5390
|
async function execFileWithStdinBun(argv, stdinPayload, options) {
|
|
5172
5391
|
const command = [...argv];
|
|
@@ -5175,7 +5394,9 @@ async function execFileWithStdinBun(argv, stdinPayload, options) {
|
|
|
5175
5394
|
cwd: options.cwd,
|
|
5176
5395
|
stdin: encoder.encode(stdinPayload),
|
|
5177
5396
|
stdout: "pipe",
|
|
5178
|
-
stderr: "pipe"
|
|
5397
|
+
stderr: "pipe",
|
|
5398
|
+
// Merge additional env vars with process.env
|
|
5399
|
+
env: options.env ? { ...process.env, ...options.env } : process.env
|
|
5179
5400
|
});
|
|
5180
5401
|
let timedOut = false;
|
|
5181
5402
|
const timeout = options.timeoutMs !== void 0 ? setTimeout(() => {
|
|
@@ -5210,7 +5431,9 @@ async function execFileWithStdinNode(argv, stdinPayload, options) {
|
|
|
5210
5431
|
const [cmd, ...args] = argv;
|
|
5211
5432
|
const child = spawn4(cmd, args, {
|
|
5212
5433
|
cwd: options.cwd,
|
|
5213
|
-
stdio: ["pipe", "pipe", "pipe"]
|
|
5434
|
+
stdio: ["pipe", "pipe", "pipe"],
|
|
5435
|
+
// Merge additional env vars with process.env
|
|
5436
|
+
env: options.env ? { ...process.env, ...options.env } : process.env
|
|
5214
5437
|
});
|
|
5215
5438
|
const stdoutChunks = [];
|
|
5216
5439
|
const stderrChunks = [];
|
|
@@ -5263,7 +5486,9 @@ async function execShellWithStdin(command, stdinPayload, options = {}) {
|
|
|
5263
5486
|
const child = spawn4(wrappedCommand, {
|
|
5264
5487
|
shell: true,
|
|
5265
5488
|
cwd: options.cwd,
|
|
5266
|
-
stdio: ["ignore", "ignore", "ignore"]
|
|
5489
|
+
stdio: ["ignore", "ignore", "ignore"],
|
|
5490
|
+
// Merge additional env vars with process.env
|
|
5491
|
+
env: options.env ? { ...process.env, ...options.env } : process.env
|
|
5267
5492
|
});
|
|
5268
5493
|
const timeout = options.timeoutMs ? setTimeout(() => {
|
|
5269
5494
|
child.kill();
|
|
@@ -5290,6 +5515,221 @@ async function execShellWithStdin(command, stdinPayload, options = {}) {
|
|
|
5290
5515
|
}
|
|
5291
5516
|
}
|
|
5292
5517
|
|
|
5518
|
+
// src/runtime/target-proxy.ts
|
|
5519
|
+
import { randomBytes } from "node:crypto";
|
|
5520
|
+
import { createServer } from "node:http";
|
|
5521
|
+
/** Default cap on provider invocations served by a single proxy instance. */
var DEFAULT_MAX_CALLS = 50;

/**
 * Starts an HTTP server bound to 127.0.0.1 on an ephemeral port that exposes
 * provider invocation to an external process. Every non-OPTIONS request must
 * carry `Authorization: Bearer <token>`, where the token is generated here.
 *
 * Routes:
 *   GET  /info         -> default target name, call budget, calls used, target names
 *   POST /invoke       -> one provider invocation (`{ question, target?, systemPrompt?, evalCaseId?, attempt? }`)
 *   POST /invokeBatch  -> `{ requests: [...] }`, invoked one after another
 *
 * @param {object} options
 * @param {object} options.defaultProvider Provider used when a request names no
 *   target or names `defaultProvider.targetName`; must expose `targetName` and `invoke()`.
 * @param {(targetName: string) => object | undefined} [options.targetResolver]
 *   Resolves any other target name to a provider.
 * @param {string[]} [options.availableTargets] Names reported by `/info` and in
 *   "unknown target" errors; defaults to the default provider's name.
 * @param {number} [options.maxCalls=DEFAULT_MAX_CALLS] Maximum number of provider
 *   invocations this proxy will perform.
 * @returns {Promise<{ url: string, token: string, shutdown: () => Promise<void>, getUsageMetadata: () => { callCount: number, maxCalls: number } }>}
 */
async function createTargetProxy(options) {
  const {
    defaultProvider,
    targetResolver,
    availableTargets,
    // An undefined limit used to disable the budget entirely because every
    // comparison against `undefined` is false.
    maxCalls = DEFAULT_MAX_CALLS
  } = options;
  const token = randomBytes(32).toString("hex");
  let callCount = 0;
  let isShutdown = false;
  const targetsList = availableTargets ?? [defaultProvider.targetName];
  const INVALID_JSON = Symbol("invalid-json");

  /** Returns the provider for `targetName`, or undefined when it is unknown. */
  function resolveProvider(targetName) {
    if (targetName === void 0 || targetName === defaultProvider.targetName) {
      return defaultProvider;
    }
    if (targetResolver) {
      return targetResolver(targetName);
    }
    return void 0;
  }

  /** Parses a JSON body, returning the INVALID_JSON sentinel instead of throwing. */
  function parseJson(raw) {
    try {
      return JSON.parse(raw);
    } catch {
      return INVALID_JSON;
    }
  }

  /** True when `value` is an object carrying a non-empty string `question`. */
  function hasQuestion(value) {
    return typeof value === "object" && value !== null && typeof value.question === "string" && value.question.length > 0;
  }

  /** Invokes `provider` for one request and shapes the proxy response. */
  async function invokeProvider(provider, request) {
    const response = await provider.invoke({
      question: request.question,
      systemPrompt: request.systemPrompt,
      evalCaseId: request.evalCaseId ?? "proxy",
      attempt: request.attempt ?? 1
    });
    const outputMessages = response.outputMessages ?? [];
    return {
      outputMessages,
      rawText: extractLastAssistantContent2(outputMessages)
    };
  }

  /** Applies CORS headers, authentication and routing for one request. */
  async function routeRequest(req, res) {
    res.setHeader("Access-Control-Allow-Origin", "*");
    res.setHeader("Access-Control-Allow-Methods", "GET, POST, OPTIONS");
    res.setHeader("Access-Control-Allow-Headers", "Content-Type, Authorization");
    // Preflight requests are answered before the auth check.
    if (req.method === "OPTIONS") {
      res.writeHead(204);
      res.end();
      return;
    }
    const authHeader = req.headers.authorization;
    if (!authHeader || authHeader !== `Bearer ${token}`) {
      sendJson(res, 401, { error: "Unauthorized" });
      return;
    }
    if (isShutdown) {
      sendJson(res, 503, { error: "Proxy is shutting down" });
      return;
    }
    const url2 = req.url ?? "";
    if (req.method === "GET" && url2 === "/info") {
      handleInfo(res);
      return;
    }
    if (req.method === "POST" && url2 === "/invoke") {
      await handleInvoke(req, res);
      return;
    }
    if (req.method === "POST" && url2 === "/invokeBatch") {
      await handleInvokeBatch(req, res);
      return;
    }
    sendJson(res, 404, { error: "Not found" });
  }

  const server = createServer((req, res) => {
    // The listener's return value is ignored by node:http, so a rejected
    // promise would become an unhandled rejection; convert it to a response.
    routeRequest(req, res).catch((error) => {
      if (res.headersSent) {
        res.end();
        return;
      }
      const message = error instanceof Error ? error.message : String(error);
      sendJson(res, 500, { error: message });
    });
  });

  function handleInfo(res) {
    const response = {
      targetName: defaultProvider.targetName,
      maxCalls,
      callCount,
      availableTargets: targetsList
    };
    sendJson(res, 200, response);
  }

  async function handleInvoke(req, res) {
    // Cheap early rejection so an exhausted proxy does not read the body.
    if (callCount >= maxCalls) {
      sendJson(res, 429, { error: `Max calls exceeded (limit: ${maxCalls})` });
      return;
    }
    try {
      const request = parseJson(await readBody(req));
      if (request === INVALID_JSON) {
        sendJson(res, 400, { error: "Invalid JSON body" });
        return;
      }
      if (!hasQuestion(request)) {
        sendJson(res, 400, { error: "Missing required field: question" });
        return;
      }
      const provider = resolveProvider(request.target);
      if (!provider) {
        sendJson(res, 400, {
          error: `Unknown target '${request.target}'. Available: ${targetsList.join(", ")}`
        });
        return;
      }
      // Other requests may have used the budget while this body was being
      // read; re-check and increment with no await in between.
      if (callCount >= maxCalls) {
        sendJson(res, 429, { error: `Max calls exceeded (limit: ${maxCalls})` });
        return;
      }
      callCount++;
      sendJson(res, 200, await invokeProvider(provider, request));
    } catch (error) {
      const message = error instanceof Error ? error.message : String(error);
      sendJson(res, 500, { error: message });
    }
  }

  async function handleInvokeBatch(req, res) {
    try {
      const parsed = parseJson(await readBody(req));
      if (parsed === INVALID_JSON) {
        sendJson(res, 400, { error: "Invalid JSON body" });
        return;
      }
      const requests = parsed?.requests;
      if (!Array.isArray(requests)) {
        sendJson(res, 400, { error: "Missing required field: requests (array)" });
        return;
      }
      if (callCount + requests.length > maxCalls) {
        sendJson(res, 429, {
          error: `Batch would exceed max calls (current: ${callCount}, batch: ${requests.length}, limit: ${maxCalls})`
        });
        return;
      }
      const responses = [];
      for (const request of requests) {
        // Per-item failures are reported in-band so one bad entry does not
        // discard the results of the invocations already performed.
        if (!hasQuestion(request)) {
          responses.push({
            outputMessages: [],
            rawText: "Error: Missing required field: question"
          });
          continue;
        }
        const provider = resolveProvider(request.target);
        if (!provider) {
          responses.push({
            outputMessages: [],
            rawText: `Error: Unknown target '${request.target}'. Available: ${targetsList.join(", ")}`
          });
          continue;
        }
        // Concurrent requests can consume budget while earlier items in this
        // batch are awaited, so the limit is enforced again per item.
        if (callCount >= maxCalls) {
          responses.push({
            outputMessages: [],
            rawText: `Error: Max calls exceeded (limit: ${maxCalls})`
          });
          continue;
        }
        callCount++;
        try {
          responses.push(await invokeProvider(provider, request));
        } catch (error) {
          const message = error instanceof Error ? error.message : String(error);
          responses.push({
            outputMessages: [],
            rawText: `Error: ${message}`
          });
        }
      }
      sendJson(res, 200, { responses });
    } catch (error) {
      const message = error instanceof Error ? error.message : String(error);
      sendJson(res, 500, { error: message });
    }
  }

  // Port 0 asks the OS for a free port; a listen failure rejects this promise.
  await new Promise((resolve, reject) => {
    server.once("error", reject);
    server.listen(0, "127.0.0.1", () => {
      server.removeListener("error", reject);
      resolve();
    });
  });
  const address = server.address();
  const url = `http://127.0.0.1:${address.port}`;
  return {
    url,
    token,
    shutdown: async () => {
      // Authenticated requests arriving from now on are answered with 503.
      isShutdown = true;
      return new Promise((resolve, reject) => {
        server.close((err) => {
          if (err) reject(err);
          else resolve();
        });
      });
    },
    getUsageMetadata: () => ({
      callCount,
      maxCalls
    })
  };
}
|
|
5702
|
+
/**
 * Writes `body` as a JSON response with the given status code.
 *
 * The payload is serialized before any header is written, so a serialization
 * failure (for example a circular structure) throws while the response is
 * still untouched and the caller can send an error response instead.
 *
 * @param {object} res Response object exposing `writeHead()` and `end()`.
 * @param {number} statusCode HTTP status code to send.
 * @param {unknown} body Value passed to `JSON.stringify`.
 */
function sendJson(res, statusCode, body) {
  const payload = JSON.stringify(body);
  res.writeHead(statusCode, { "Content-Type": "application/json" });
  res.end(payload);
}
|
|
5706
|
+
/**
 * Collects every chunk emitted by `req` and resolves with the concatenated
 * bytes decoded as UTF-8 once the stream ends. Rejects with the stream's
 * error if one is emitted.
 *
 * @param {object} req Event source emitting "data", "end" and "error".
 * @returns {Promise<string>} The full request body.
 */
function readBody(req) {
  return new Promise((resolve, reject) => {
    const received = [];
    const onData = (piece) => {
      received.push(piece);
    };
    const onEnd = () => {
      // Decode only after concatenation so multi-byte characters split
      // across chunks are reassembled correctly.
      const whole = Buffer.concat(received);
      resolve(whole.toString("utf8"));
    };
    req.on("data", onData);
    req.on("end", onEnd);
    req.on("error", reject);
  });
}
|
|
5714
|
+
/**
 * Scans `messages` from newest to oldest and returns the text of the first
 * assistant message that yields any.
 *
 * - String content is returned as-is.
 * - Array content yields the `text` of its first object part that has a
 *   `text` property, converted with `String()`.
 * - Assistant messages with no usable text are skipped and the search
 *   continues with earlier messages.
 *
 * @param {Array<{ role?: string, content?: unknown }>} messages
 * @returns {string | undefined} The extracted text, or undefined if none exists.
 */
function extractLastAssistantContent2(messages) {
  let index = messages.length;
  while (index-- > 0) {
    const entry = messages[index];
    if (entry.role !== "assistant" || entry.content === void 0) {
      continue;
    }
    const body = entry.content;
    if (typeof body === "string") {
      return body;
    }
    if (!Array.isArray(body)) {
      continue;
    }
    const textPart = body.find(
      (part) => typeof part === "object" && part !== null && "text" in part
    );
    if (textPart !== void 0) {
      return String(textPart.text);
    }
  }
  return void 0;
}
|
|
5732
|
+
|
|
5293
5733
|
// src/evaluation/case-conversion.ts
|
|
5294
5734
|
function toSnakeCase(str) {
|
|
5295
5735
|
if (/^[A-Z]/.test(str)) {
|
|
@@ -5297,12 +5737,6 @@ function toSnakeCase(str) {
|
|
|
5297
5737
|
}
|
|
5298
5738
|
return str.replace(/[A-Z]/g, (letter) => `_${letter.toLowerCase()}`);
|
|
5299
5739
|
}
|
|
5300
|
-
function toCamelCase(str) {
|
|
5301
|
-
if (/^[A-Z]/.test(str)) {
|
|
5302
|
-
return str;
|
|
5303
|
-
}
|
|
5304
|
-
return str.replace(/_([a-z0-9])/g, (_, letter) => letter.toUpperCase());
|
|
5305
|
-
}
|
|
5306
5740
|
function toSnakeCaseDeep(obj) {
|
|
5307
5741
|
if (obj === null || obj === void 0) {
|
|
5308
5742
|
return obj;
|
|
@@ -5320,25 +5754,148 @@ function toSnakeCaseDeep(obj) {
|
|
|
5320
5754
|
}
|
|
5321
5755
|
return obj;
|
|
5322
5756
|
}
|
|
5323
|
-
|
|
5324
|
-
|
|
5325
|
-
|
|
5326
|
-
|
|
5327
|
-
|
|
5328
|
-
|
|
5757
|
+
|
|
5758
|
+
// src/evaluation/evaluators/code-evaluator.ts
|
|
5759
|
+
/**
 * Evaluator that delegates scoring to an external script.
 *
 * The script receives the evaluation payload (snake_case JSON) on stdin and is
 * expected to print a JSON object with `score` and optionally `hits`,
 * `misses`, `reasoning` and `details`. When a `target` option is configured
 * and the context provides a judge provider, a local target proxy is started
 * for the duration of the run and its URL/token are passed to the script via
 * `AGENTV_TARGET_PROXY_URL` / `AGENTV_TARGET_PROXY_TOKEN`.
 */
var CodeEvaluator = class {
  kind = "code";
  script;
  cwd;
  agentTimeoutMs;
  config;
  target;
  /**
   * @param {object} options
   * @param {string | string[]} options.script Command passed to `executeScript`.
   * @param {string} [options.cwd] Working directory for the script.
   * @param {number} [options.agentTimeoutMs] Timeout forwarded to the script runner.
   * @param {object} [options.config] Free-form config forwarded in the payload.
   * @param {{ max_calls?: number }} [options.target] Enables the target proxy when set.
   */
  constructor(options) {
    this.script = options.script;
    this.cwd = options.cwd;
    this.agentTimeoutMs = options.agentTimeoutMs;
    this.config = options.config;
    this.target = options.target;
  }
  /**
   * Runs the script for one evaluation context.
   *
   * Failures while starting the proxy, running the script or reading its
   * output are reported as a zero-score "fail" result rather than thrown.
   *
   * @param {object} context Evaluation context (eval case, candidate answer, providers).
   * @returns {Promise<object>} Evaluation result with score, verdict, hits and misses.
   */
  async evaluate(context) {
    const payload = {
      question: context.evalCase.question,
      expectedOutcome: context.evalCase.expected_outcome,
      expectedMessages: context.evalCase.expected_messages,
      referenceAnswer: context.evalCase.reference_answer,
      candidateAnswer: context.candidate,
      outputMessages: context.outputMessages ?? null,
      guidelineFiles: context.evalCase.guideline_paths,
      inputFiles: context.evalCase.file_paths.filter(
        (path15) => !context.evalCase.guideline_paths.includes(path15)
      ),
      inputMessages: context.evalCase.input_messages,
      traceSummary: context.traceSummary ?? null,
      config: this.config ?? null
    };
    const inputPayload = JSON.stringify(toSnakeCaseDeep(payload), null, 2);
    let proxy;
    // Shared by the success and failure paths so both report the same shape.
    const buildRawRequest = (extra = {}) => {
      const proxyUsage = proxy?.getUsageMetadata();
      return {
        script: this.script,
        ...this.cwd ? { cwd: this.cwd } : {},
        ...proxyUsage ? {
          target_proxy: {
            call_count: proxyUsage.callCount,
            max_calls: proxyUsage.maxCalls
          }
        } : {},
        ...extra
      };
    };
    try {
      let proxyEnv;
      // `!= null` also rejects an explicit null target, which previously
      // crashed on `null.max_calls`. The proxy is created inside the try so a
      // startup failure yields a fail result like any other evaluator error.
      if (this.target != null && context.judgeProvider) {
        proxy = await createTargetProxy({
          defaultProvider: context.judgeProvider,
          targetResolver: context.targetResolver,
          availableTargets: context.availableTargets,
          maxCalls: this.target.max_calls ?? DEFAULT_MAX_CALLS
        });
        proxyEnv = {
          AGENTV_TARGET_PROXY_URL: proxy.url,
          AGENTV_TARGET_PROXY_TOKEN: proxy.token
        };
      }
      const stdout = await executeScript(
        this.script,
        inputPayload,
        this.agentTimeoutMs,
        this.cwd,
        proxyEnv
      );
      // Unparseable or incomplete script output degrades to score 0 / empty lists.
      const parsed = parseJsonSafe(stdout);
      const score = clampScore(typeof parsed?.score === "number" ? parsed.score : 0);
      const hits = Array.isArray(parsed?.hits) ? parsed.hits.filter(isNonEmptyString) : [];
      const misses = Array.isArray(parsed?.misses) ? parsed.misses.filter(isNonEmptyString) : [];
      const reasoning = typeof parsed?.reasoning === "string" ? parsed.reasoning : void 0;
      const details = parsed?.details && typeof parsed.details === "object" && !Array.isArray(parsed.details) ? parsed.details : void 0;
      return {
        score,
        verdict: scoreToVerdict(score),
        hits,
        misses,
        expectedAspectCount: hits.length + misses.length || 1,
        reasoning,
        evaluatorRawRequest: buildRawRequest(),
        ...details ? { details } : {}
      };
    } catch (error) {
      const message = error instanceof Error ? error.message : String(error);
      return {
        score: 0,
        verdict: "fail",
        hits: [],
        misses: [`Code evaluator failed: ${message}`],
        expectedAspectCount: 1,
        reasoning: message,
        evaluatorRawRequest: buildRawRequest({ error: message })
      };
    } finally {
      if (proxy) {
        await proxy.shutdown();
      }
    }
  }
};
|
|
5872
|
+
/**
 * Runs an evaluator script with `input` on stdin and returns its trimmed stdout.
 *
 * A string `scriptPath` is run through `execShellWithStdin`; any other value
 * is handed to `execFileWithStdin`.
 *
 * @param {string | string[]} scriptPath Command to execute.
 * @param {string} input Payload written to the script's stdin.
 * @param {number} [agentTimeoutMs] Forwarded as `timeoutMs`.
 * @param {string} [cwd] Working directory for the script.
 * @param {object} [env] Extra environment passed to the runner.
 * @returns {Promise<string>} Trimmed stdout of the script.
 * @throws {Error} When the script exits with a non-zero code; the message
 *   includes the formatted stderr when there is any.
 */
async function executeScript(scriptPath, input, agentTimeoutMs, cwd, env) {
  const execOptions = { cwd, timeoutMs: agentTimeoutMs, env };
  const outcome =
    typeof scriptPath === "string"
      ? await execShellWithStdin(scriptPath, input, execOptions)
      : await execFileWithStdin(scriptPath, input, execOptions);
  if (outcome.exitCode === 0) {
    return outcome.stdout.trim();
  }
  const trimmedErr = formatStderr(outcome.stderr);
  if (trimmedErr.length > 0) {
    throw new Error(`Code evaluator exited with code ${outcome.exitCode}: ${trimmedErr}`);
  }
  throw new Error(`Code evaluator exited with code ${outcome.exitCode}`);
}
|
|
5882
|
+
/**
 * Trims stderr output and, when it is longer than 2000 characters, keeps only
 * the final 2000 behind a one-line truncation notice.
 *
 * @param {string} stderr Raw stderr text.
 * @returns {string} Trimmed (and possibly truncated) text.
 */
function formatStderr(stderr) {
  const maxLength = 2e3;
  const text = stderr.trim();
  return text.length > maxLength
    ? `...(truncated, last ${maxLength} chars)\n${text.slice(-maxLength)}`
    : text;
}
|
|
5340
5892
|
|
|
5341
|
-
// src/evaluation/evaluators.ts
|
|
5893
|
+
// src/evaluation/evaluators/composite.ts
|
|
5894
|
+
import { generateText as generateText3 } from "ai";
|
|
5895
|
+
|
|
5896
|
+
// src/evaluation/evaluators/llm-judge.ts
|
|
5897
|
+
import { generateText as generateText2 } from "ai";
|
|
5898
|
+
import { z as z2 } from "zod";
|
|
5342
5899
|
var DEFAULT_EVALUATOR_TEMPLATE = `You are an expert evaluator. Your goal is to grade the candidate_answer based on how well it achieves the expected_outcome for the original task.
|
|
5343
5900
|
|
|
5344
5901
|
Use the reference_answer as a gold standard for a high-quality response (if provided). The reference_answer may be a simple text response, or it may contain a sequence of expected agent messages including tool calls. When it contains multiple messages, the last message represents the final expected answer. The candidate_answer does not need to match it verbatim, but should capture the key points and follow the same spirit.
|
|
@@ -5418,7 +5975,7 @@ var LlmJudgeEvaluator = class {
|
|
|
5418
5975
|
target: judgeProvider.targetName
|
|
5419
5976
|
};
|
|
5420
5977
|
try {
|
|
5421
|
-
const { data
|
|
5978
|
+
const { data } = await this.runWithRetry({
|
|
5422
5979
|
context,
|
|
5423
5980
|
judgeProvider,
|
|
5424
5981
|
systemPrompt,
|
|
@@ -5567,105 +6124,11 @@ You must return a valid JSON object matching this schema:
|
|
|
5567
6124
|
"overall_reasoning": "string (summary)"
|
|
5568
6125
|
}`;
|
|
5569
6126
|
}
|
|
5570
|
-
function
|
|
5571
|
-
|
|
5572
|
-
return
|
|
5573
|
-
}
|
|
5574
|
-
if (score >= 0.6) {
|
|
5575
|
-
return "borderline";
|
|
5576
|
-
}
|
|
5577
|
-
return "fail";
|
|
6127
|
+
/**
 * Replaces `{{ name }}` placeholders in `template` with values from `variables`.
 *
 * Placeholder names may contain letters, digits and underscores, with optional
 * whitespace inside the braces. A placeholder whose variable is null or
 * undefined is left in the output unchanged.
 *
 * @param {string} template Text containing placeholders.
 * @param {Record<string, unknown>} variables Values keyed by placeholder name.
 * @returns {string} The template with known placeholders substituted.
 */
function substituteVariables(template, variables) {
  const placeholder = /\{\{\s*([a-zA-Z0-9_]+)\s*\}\}/g;
  const fill = (whole, key) => {
    const value = variables[key];
    return value ?? whole;
  };
  return template.replace(placeholder, fill);
}
|
|
5579
|
-
function clampScore(value) {
|
|
5580
|
-
if (Number.isNaN(value) || !Number.isFinite(value)) {
|
|
5581
|
-
return 0;
|
|
5582
|
-
}
|
|
5583
|
-
if (value < 0) {
|
|
5584
|
-
return 0;
|
|
5585
|
-
}
|
|
5586
|
-
if (value > 1) {
|
|
5587
|
-
return 1;
|
|
5588
|
-
}
|
|
5589
|
-
return value;
|
|
5590
|
-
}
|
|
5591
|
-
function extractJsonBlob(text) {
|
|
5592
|
-
const match = text.match(/\{[\s\S]*\}/);
|
|
5593
|
-
return match?.[0];
|
|
5594
|
-
}
|
|
5595
|
-
function parseJsonFromText(text) {
|
|
5596
|
-
const cleaned = typeof text === "string" ? text.replace(/```json\n?|```/g, "").trim() : "";
|
|
5597
|
-
const blob = extractJsonBlob(cleaned) ?? cleaned;
|
|
5598
|
-
return JSON.parse(blob);
|
|
5599
|
-
}
|
|
5600
|
-
function isNonEmptyString(value) {
|
|
5601
|
-
return typeof value === "string" && value.trim().length > 0;
|
|
5602
|
-
}
|
|
5603
|
-
var CodeEvaluator = class {
|
|
5604
|
-
kind = "code";
|
|
5605
|
-
script;
|
|
5606
|
-
cwd;
|
|
5607
|
-
agentTimeoutMs;
|
|
5608
|
-
config;
|
|
5609
|
-
constructor(options) {
|
|
5610
|
-
this.script = options.script;
|
|
5611
|
-
this.cwd = options.cwd;
|
|
5612
|
-
this.agentTimeoutMs = options.agentTimeoutMs;
|
|
5613
|
-
this.config = options.config;
|
|
5614
|
-
}
|
|
5615
|
-
async evaluate(context) {
|
|
5616
|
-
const payload = {
|
|
5617
|
-
question: context.evalCase.question,
|
|
5618
|
-
expectedOutcome: context.evalCase.expected_outcome,
|
|
5619
|
-
expectedMessages: context.evalCase.expected_messages,
|
|
5620
|
-
referenceAnswer: context.evalCase.reference_answer,
|
|
5621
|
-
candidateAnswer: context.candidate,
|
|
5622
|
-
outputMessages: context.outputMessages ?? null,
|
|
5623
|
-
guidelineFiles: context.evalCase.guideline_paths,
|
|
5624
|
-
inputFiles: context.evalCase.file_paths.filter(
|
|
5625
|
-
(path15) => !context.evalCase.guideline_paths.includes(path15)
|
|
5626
|
-
),
|
|
5627
|
-
inputMessages: context.evalCase.input_messages,
|
|
5628
|
-
traceSummary: context.traceSummary ?? null,
|
|
5629
|
-
config: this.config ?? null
|
|
5630
|
-
};
|
|
5631
|
-
const inputPayload = JSON.stringify(toSnakeCaseDeep(payload), null, 2);
|
|
5632
|
-
try {
|
|
5633
|
-
const stdout = await executeScript(this.script, inputPayload, this.agentTimeoutMs, this.cwd);
|
|
5634
|
-
const parsed = parseJsonSafe(stdout);
|
|
5635
|
-
const score = clampScore(typeof parsed?.score === "number" ? parsed.score : 0);
|
|
5636
|
-
const hits = Array.isArray(parsed?.hits) ? parsed.hits.filter(isNonEmptyString) : [];
|
|
5637
|
-
const misses = Array.isArray(parsed?.misses) ? parsed.misses.filter(isNonEmptyString) : [];
|
|
5638
|
-
const reasoning = typeof parsed?.reasoning === "string" ? parsed.reasoning : void 0;
|
|
5639
|
-
return {
|
|
5640
|
-
score,
|
|
5641
|
-
verdict: scoreToVerdict(score),
|
|
5642
|
-
hits,
|
|
5643
|
-
misses,
|
|
5644
|
-
expectedAspectCount: hits.length + misses.length || 1,
|
|
5645
|
-
reasoning,
|
|
5646
|
-
evaluatorRawRequest: {
|
|
5647
|
-
script: this.script,
|
|
5648
|
-
...this.cwd ? { cwd: this.cwd } : {}
|
|
5649
|
-
}
|
|
5650
|
-
};
|
|
5651
|
-
} catch (error) {
|
|
5652
|
-
const message = error instanceof Error ? error.message : String(error);
|
|
5653
|
-
return {
|
|
5654
|
-
score: 0,
|
|
5655
|
-
verdict: "fail",
|
|
5656
|
-
hits: [],
|
|
5657
|
-
misses: [`Code evaluator failed: ${message}`],
|
|
5658
|
-
expectedAspectCount: 1,
|
|
5659
|
-
reasoning: message,
|
|
5660
|
-
evaluatorRawRequest: {
|
|
5661
|
-
script: this.script,
|
|
5662
|
-
...this.cwd ? { cwd: this.cwd } : {},
|
|
5663
|
-
error: message
|
|
5664
|
-
}
|
|
5665
|
-
};
|
|
5666
|
-
}
|
|
5667
|
-
}
|
|
5668
|
-
};
|
|
5669
6132
|
function calculateRubricScore(result, rubrics) {
|
|
5670
6133
|
const rubricMap = new Map(rubrics.map((rubric) => [rubric.id, rubric]));
|
|
5671
6134
|
const hits = [];
|
|
@@ -5693,273 +6156,281 @@ function calculateRubricScore(result, rubrics) {
|
|
|
5693
6156
|
const verdict = failedRequired ? "fail" : scoreToVerdict(score);
|
|
5694
6157
|
return { score, verdict, hits, misses };
|
|
5695
6158
|
}
|
|
5696
|
-
|
|
5697
|
-
|
|
5698
|
-
|
|
5699
|
-
|
|
5700
|
-
|
|
5701
|
-
|
|
5702
|
-
|
|
5703
|
-
|
|
5704
|
-
|
|
5705
|
-
}
|
|
5706
|
-
function formatStderr(stderr) {
|
|
5707
|
-
const trimmed = stderr.trim();
|
|
5708
|
-
const maxLength = 2e3;
|
|
5709
|
-
if (trimmed.length <= maxLength) {
|
|
5710
|
-
return trimmed;
|
|
5711
|
-
}
|
|
5712
|
-
const tail = trimmed.slice(-maxLength);
|
|
5713
|
-
return `...(truncated, last ${maxLength} chars)
|
|
5714
|
-
${tail}`;
|
|
5715
|
-
}
|
|
5716
|
-
function parseJsonSafe(payload) {
|
|
5717
|
-
try {
|
|
5718
|
-
return JSON.parse(payload);
|
|
5719
|
-
} catch {
|
|
5720
|
-
return void 0;
|
|
5721
|
-
}
|
|
5722
|
-
}
|
|
5723
|
-
function substituteVariables(template, variables) {
|
|
5724
|
-
return template.replace(/\{\{\s*([a-zA-Z0-9_]+)\s*\}\}/g, (match, varName) => {
|
|
5725
|
-
return variables[varName] ?? match;
|
|
5726
|
-
});
|
|
5727
|
-
}
|
|
5728
|
-
function deepEqual(a, b) {
|
|
5729
|
-
if (a === b) return true;
|
|
5730
|
-
if (a === null || b === null) return a === b;
|
|
5731
|
-
if (typeof a !== typeof b) return false;
|
|
5732
|
-
if (typeof a !== "object") return a === b;
|
|
5733
|
-
if (Array.isArray(a) !== Array.isArray(b)) return false;
|
|
5734
|
-
if (Array.isArray(a) && Array.isArray(b)) {
|
|
5735
|
-
if (a.length !== b.length) return false;
|
|
5736
|
-
return a.every((val, i) => deepEqual(val, b[i]));
|
|
5737
|
-
}
|
|
5738
|
-
const aObj = a;
|
|
5739
|
-
const bObj = b;
|
|
5740
|
-
const aKeys = Object.keys(aObj);
|
|
5741
|
-
const bKeys = Object.keys(bObj);
|
|
5742
|
-
if (aKeys.length !== bKeys.length) return false;
|
|
5743
|
-
return aKeys.every((key) => Object.hasOwn(bObj, key) && deepEqual(aObj[key], bObj[key]));
|
|
5744
|
-
}
|
|
5745
|
-
function argsMatch(expected, actual) {
|
|
5746
|
-
if (expected === void 0) return true;
|
|
5747
|
-
if (expected === "any") return true;
|
|
5748
|
-
if (actual === void 0) return false;
|
|
5749
|
-
for (const key of Object.keys(expected)) {
|
|
5750
|
-
if (!Object.hasOwn(actual, key)) return false;
|
|
5751
|
-
if (!deepEqual(expected[key], actual[key])) return false;
|
|
5752
|
-
}
|
|
5753
|
-
return true;
|
|
5754
|
-
}
|
|
5755
|
-
var ToolTrajectoryEvaluator = class {
|
|
5756
|
-
kind = "tool_trajectory";
|
|
6159
|
+
|
|
6160
|
+
// src/evaluation/evaluators/composite.ts
|
|
6161
|
+
var DEFAULT_COMPOSITE_AGGREGATOR_PROMPT = `Review the following evaluation results:
|
|
6162
|
+
{{EVALUATOR_RESULTS_JSON}}
|
|
6163
|
+
|
|
6164
|
+
Decide the final score and verdict based on all evaluator results.
|
|
6165
|
+
Return a JSON object with: score (0.0-1.0), verdict (pass/fail/borderline), and reasoning.`;
|
|
6166
|
+
var CompositeEvaluator = class {
|
|
6167
|
+
kind = "composite";
|
|
5757
6168
|
config;
|
|
6169
|
+
evaluatorFactory;
|
|
6170
|
+
cwd;
|
|
5758
6171
|
constructor(options) {
|
|
5759
6172
|
this.config = options.config;
|
|
6173
|
+
this.evaluatorFactory = options.evaluatorFactory;
|
|
6174
|
+
this.cwd = options.cwd;
|
|
5760
6175
|
}
|
|
5761
|
-
evaluate(context) {
|
|
5762
|
-
const
|
|
5763
|
-
|
|
5764
|
-
|
|
5765
|
-
return {
|
|
5766
|
-
score: 0,
|
|
5767
|
-
verdict: "fail",
|
|
5768
|
-
hits: [],
|
|
5769
|
-
misses: ["No trace available for evaluation"],
|
|
5770
|
-
expectedAspectCount: 1
|
|
5771
|
-
};
|
|
5772
|
-
}
|
|
5773
|
-
const summary = toolCalls.length > 0 ? this.buildSummary(toolCalls) : traceSummary;
|
|
5774
|
-
if (!summary) {
|
|
5775
|
-
return {
|
|
5776
|
-
score: 0,
|
|
5777
|
-
verdict: "fail",
|
|
5778
|
-
hits: [],
|
|
5779
|
-
misses: ["No trace available for evaluation"],
|
|
5780
|
-
expectedAspectCount: 1
|
|
5781
|
-
};
|
|
5782
|
-
}
|
|
5783
|
-
switch (this.config.mode) {
|
|
5784
|
-
case "any_order":
|
|
5785
|
-
return this.evaluateAnyOrder(summary);
|
|
5786
|
-
case "in_order":
|
|
5787
|
-
return this.evaluateInOrder(toolCalls);
|
|
5788
|
-
case "exact":
|
|
5789
|
-
return this.evaluateExact(toolCalls);
|
|
5790
|
-
default:
|
|
6176
|
+
async evaluate(context) {
|
|
6177
|
+
const memberResults = await Promise.all(
|
|
6178
|
+
this.config.evaluators.map(async (memberConfig) => {
|
|
6179
|
+
const evaluator = this.evaluatorFactory.create(memberConfig, context);
|
|
5791
6180
|
return {
|
|
5792
|
-
|
|
5793
|
-
|
|
5794
|
-
|
|
5795
|
-
misses: [`Unknown mode: ${this.config.mode}`],
|
|
5796
|
-
expectedAspectCount: 1
|
|
6181
|
+
id: memberConfig.name,
|
|
6182
|
+
type: memberConfig.type,
|
|
6183
|
+
result: await evaluator.evaluate(context)
|
|
5797
6184
|
};
|
|
5798
|
-
|
|
6185
|
+
})
|
|
6186
|
+
);
|
|
6187
|
+
return this.aggregate(memberResults, context);
|
|
5799
6188
|
}
|
|
5800
|
-
|
|
5801
|
-
|
|
5802
|
-
|
|
5803
|
-
|
|
5804
|
-
|
|
5805
|
-
|
|
6189
|
+
async aggregate(results, context) {
|
|
6190
|
+
const aggregator = this.config.aggregator;
|
|
6191
|
+
switch (aggregator.type) {
|
|
6192
|
+
case "code_judge":
|
|
6193
|
+
return this.runCodeAggregator(results, aggregator.path, aggregator.cwd ?? this.cwd);
|
|
6194
|
+
case "llm_judge":
|
|
6195
|
+
return this.runLlmAggregator(results, context, aggregator);
|
|
6196
|
+
default:
|
|
6197
|
+
return this.runWeightedAverage(results, aggregator.weights);
|
|
5806
6198
|
}
|
|
5807
|
-
|
|
5808
|
-
|
|
5809
|
-
|
|
5810
|
-
|
|
5811
|
-
|
|
5812
|
-
|
|
5813
|
-
|
|
5814
|
-
|
|
5815
|
-
|
|
6199
|
+
}
|
|
6200
|
+
runWeightedAverage(results, weights) {
|
|
6201
|
+
let totalWeight = 0;
|
|
6202
|
+
let weightedSum = 0;
|
|
6203
|
+
const allHits = [];
|
|
6204
|
+
const allMisses = [];
|
|
6205
|
+
const reasoningParts = [];
|
|
6206
|
+
const evaluatorResults = [];
|
|
6207
|
+
for (const member of results) {
|
|
6208
|
+
const weight = weights?.[member.id] ?? 1;
|
|
6209
|
+
totalWeight += weight;
|
|
6210
|
+
weightedSum += member.result.score * weight;
|
|
6211
|
+
allHits.push(...member.result.hits.map((h) => `[${member.id}] ${h}`));
|
|
6212
|
+
allMisses.push(...member.result.misses.map((m) => `[${member.id}] ${m}`));
|
|
6213
|
+
if (member.result.reasoning) {
|
|
6214
|
+
reasoningParts.push(`${member.id}: ${member.result.reasoning}`);
|
|
5816
6215
|
}
|
|
6216
|
+
evaluatorResults.push({
|
|
6217
|
+
name: member.id,
|
|
6218
|
+
type: member.type,
|
|
6219
|
+
score: member.result.score,
|
|
6220
|
+
weight,
|
|
6221
|
+
verdict: member.result.verdict,
|
|
6222
|
+
hits: [...member.result.hits],
|
|
6223
|
+
misses: [...member.result.misses],
|
|
6224
|
+
reasoning: member.result.reasoning,
|
|
6225
|
+
evaluatorRawRequest: member.result.evaluatorRawRequest,
|
|
6226
|
+
evaluatorResults: member.result.evaluatorResults,
|
|
6227
|
+
details: member.result.details
|
|
6228
|
+
});
|
|
5817
6229
|
}
|
|
5818
|
-
|
|
5819
|
-
}
|
|
5820
|
-
/**
|
|
5821
|
-
* Build a summary from extracted tool calls.
|
|
5822
|
-
*/
|
|
5823
|
-
buildSummary(toolCalls) {
|
|
5824
|
-
const toolCallsByName = {};
|
|
5825
|
-
for (const call of toolCalls) {
|
|
5826
|
-
toolCallsByName[call.name] = (toolCallsByName[call.name] ?? 0) + 1;
|
|
5827
|
-
}
|
|
5828
|
-
const toolNames = Object.keys(toolCallsByName).sort();
|
|
6230
|
+
const finalScore = totalWeight > 0 ? weightedSum / totalWeight : 0;
|
|
5829
6231
|
return {
|
|
5830
|
-
|
|
5831
|
-
|
|
5832
|
-
|
|
5833
|
-
|
|
6232
|
+
score: clampScore(finalScore),
|
|
6233
|
+
verdict: scoreToVerdict(finalScore),
|
|
6234
|
+
hits: allHits,
|
|
6235
|
+
misses: allMisses,
|
|
6236
|
+
expectedAspectCount: Math.max(allHits.length + allMisses.length, 1),
|
|
6237
|
+
reasoning: reasoningParts.length > 0 ? reasoningParts.join("; ") : void 0,
|
|
6238
|
+
evaluatorRawRequest: {
|
|
6239
|
+
aggregator: "weighted_average",
|
|
6240
|
+
...weights ? { weights } : {}
|
|
6241
|
+
},
|
|
6242
|
+
evaluatorResults
|
|
5834
6243
|
};
|
|
5835
6244
|
}
|
|
5836
|
-
|
|
5837
|
-
const
|
|
5838
|
-
const
|
|
5839
|
-
|
|
6245
|
+
async runCodeAggregator(results, scriptPath, cwd, weights) {
|
|
6246
|
+
const resultsObject = Object.fromEntries(results.map((r) => [r.id, r.result]));
|
|
6247
|
+
const inputPayload = JSON.stringify({ results: resultsObject }, null, 2);
|
|
6248
|
+
const evaluatorResults = results.map((member) => ({
|
|
6249
|
+
name: member.id,
|
|
6250
|
+
type: member.type,
|
|
6251
|
+
score: member.result.score,
|
|
6252
|
+
weight: weights?.[member.id] ?? 1,
|
|
6253
|
+
verdict: member.result.verdict,
|
|
6254
|
+
hits: [...member.result.hits],
|
|
6255
|
+
misses: [...member.result.misses],
|
|
6256
|
+
reasoning: member.result.reasoning,
|
|
6257
|
+
evaluatorRawRequest: member.result.evaluatorRawRequest,
|
|
6258
|
+
evaluatorResults: member.result.evaluatorResults,
|
|
6259
|
+
details: member.result.details
|
|
6260
|
+
}));
|
|
6261
|
+
try {
|
|
6262
|
+
const stdout = await executeScript(scriptPath, inputPayload, void 0, cwd);
|
|
6263
|
+
const parsed = parseJsonSafe(stdout);
|
|
6264
|
+
const score = clampScore(typeof parsed?.score === "number" ? parsed.score : 0);
|
|
6265
|
+
const hits = Array.isArray(parsed?.hits) ? parsed.hits.filter(isNonEmptyString) : [];
|
|
6266
|
+
const misses = Array.isArray(parsed?.misses) ? parsed.misses.filter(isNonEmptyString) : [];
|
|
6267
|
+
const reasoning = typeof parsed?.reasoning === "string" ? parsed.reasoning : void 0;
|
|
6268
|
+
const verdict = typeof parsed?.verdict === "string" && (parsed.verdict === "pass" || parsed.verdict === "fail" || parsed.verdict === "borderline") ? parsed.verdict : scoreToVerdict(score);
|
|
5840
6269
|
return {
|
|
5841
|
-
score
|
|
5842
|
-
verdict
|
|
5843
|
-
hits
|
|
5844
|
-
misses
|
|
5845
|
-
expectedAspectCount:
|
|
6270
|
+
score,
|
|
6271
|
+
verdict,
|
|
6272
|
+
hits,
|
|
6273
|
+
misses,
|
|
6274
|
+
expectedAspectCount: hits.length + misses.length || 1,
|
|
6275
|
+
reasoning,
|
|
6276
|
+
evaluatorRawRequest: {
|
|
6277
|
+
aggregator: "code_judge",
|
|
6278
|
+
script: scriptPath
|
|
6279
|
+
},
|
|
6280
|
+
evaluatorResults
|
|
6281
|
+
};
|
|
6282
|
+
} catch (error) {
|
|
6283
|
+
const message = error instanceof Error ? error.message : String(error);
|
|
6284
|
+
return {
|
|
6285
|
+
score: 0,
|
|
6286
|
+
verdict: "fail",
|
|
6287
|
+
hits: [],
|
|
6288
|
+
misses: [`Code aggregator failed: ${message}`],
|
|
6289
|
+
expectedAspectCount: 1,
|
|
6290
|
+
reasoning: message,
|
|
6291
|
+
evaluatorRawRequest: {
|
|
6292
|
+
aggregator: "code_judge",
|
|
6293
|
+
script: scriptPath,
|
|
6294
|
+
error: message
|
|
6295
|
+
},
|
|
6296
|
+
evaluatorResults
|
|
5846
6297
|
};
|
|
5847
6298
|
}
|
|
5848
|
-
|
|
5849
|
-
|
|
5850
|
-
|
|
5851
|
-
|
|
5852
|
-
|
|
5853
|
-
if (actual >= required) {
|
|
5854
|
-
hits.push(`${toolName}: called ${actual} times (required \u2265${required})`);
|
|
5855
|
-
} else {
|
|
5856
|
-
misses.push(`${toolName}: called ${actual} times (required \u2265${required})`);
|
|
5857
|
-
}
|
|
6299
|
+
}
|
|
6300
|
+
async runLlmAggregator(results, context, config) {
|
|
6301
|
+
const judgeProvider = context.judgeProvider;
|
|
6302
|
+
if (!judgeProvider) {
|
|
6303
|
+
throw new Error("No judge provider available for LLM aggregation");
|
|
5858
6304
|
}
|
|
5859
|
-
const
|
|
5860
|
-
|
|
5861
|
-
|
|
5862
|
-
|
|
5863
|
-
|
|
5864
|
-
|
|
5865
|
-
|
|
6305
|
+
const resultsObject = Object.fromEntries(results.map((r) => [r.id, r.result]));
|
|
6306
|
+
const resultsJson = JSON.stringify(resultsObject, null, 2);
|
|
6307
|
+
const evaluatorResults = results.map((member) => ({
|
|
6308
|
+
name: member.id,
|
|
6309
|
+
type: member.type,
|
|
6310
|
+
score: member.result.score,
|
|
6311
|
+
verdict: member.result.verdict,
|
|
6312
|
+
hits: [...member.result.hits],
|
|
6313
|
+
misses: [...member.result.misses],
|
|
6314
|
+
reasoning: member.result.reasoning,
|
|
6315
|
+
evaluatorRawRequest: member.result.evaluatorRawRequest,
|
|
6316
|
+
evaluatorResults: member.result.evaluatorResults,
|
|
6317
|
+
details: member.result.details
|
|
6318
|
+
}));
|
|
6319
|
+
const promptTemplate = config.prompt ?? DEFAULT_COMPOSITE_AGGREGATOR_PROMPT;
|
|
6320
|
+
const userPrompt = promptTemplate.replace(/\{\{EVALUATOR_RESULTS_JSON\}\}/g, resultsJson);
|
|
6321
|
+
const systemPrompt = buildOutputSchema();
|
|
6322
|
+
const evaluatorRawRequest = {
|
|
6323
|
+
aggregator: "llm_judge",
|
|
6324
|
+
userPrompt,
|
|
6325
|
+
systemPrompt,
|
|
6326
|
+
target: judgeProvider.targetName
|
|
5866
6327
|
};
|
|
5867
|
-
|
|
5868
|
-
|
|
5869
|
-
|
|
5870
|
-
|
|
6328
|
+
try {
|
|
6329
|
+
const model = judgeProvider.asLanguageModel?.();
|
|
6330
|
+
if (model) {
|
|
6331
|
+
const { text } = await generateText3({
|
|
6332
|
+
model,
|
|
6333
|
+
system: systemPrompt,
|
|
6334
|
+
prompt: userPrompt
|
|
6335
|
+
});
|
|
6336
|
+
const data2 = freeformEvaluationSchema.parse(parseJsonFromText(text));
|
|
6337
|
+
const score2 = clampScore(data2.score);
|
|
6338
|
+
const hits2 = Array.isArray(data2.hits) ? data2.hits.filter(isNonEmptyString).slice(0, 4) : [];
|
|
6339
|
+
const misses2 = Array.isArray(data2.misses) ? data2.misses.filter(isNonEmptyString).slice(0, 4) : [];
|
|
6340
|
+
const reasoning2 = data2.reasoning;
|
|
6341
|
+
return {
|
|
6342
|
+
score: score2,
|
|
6343
|
+
verdict: scoreToVerdict(score2),
|
|
6344
|
+
hits: hits2,
|
|
6345
|
+
misses: misses2,
|
|
6346
|
+
expectedAspectCount: Math.max(hits2.length + misses2.length, 1),
|
|
6347
|
+
reasoning: reasoning2,
|
|
6348
|
+
evaluatorRawRequest,
|
|
6349
|
+
evaluatorResults
|
|
6350
|
+
};
|
|
6351
|
+
}
|
|
6352
|
+
const response = await judgeProvider.invoke({
|
|
6353
|
+
question: userPrompt,
|
|
6354
|
+
systemPrompt,
|
|
6355
|
+
evalCaseId: context.evalCase.id,
|
|
6356
|
+
attempt: context.attempt
|
|
6357
|
+
});
|
|
6358
|
+
const data = freeformEvaluationSchema.parse(
|
|
6359
|
+
parseJsonFromText(extractLastAssistantContent(response.outputMessages))
|
|
6360
|
+
);
|
|
6361
|
+
const score = clampScore(data.score);
|
|
6362
|
+
const hits = Array.isArray(data.hits) ? data.hits.filter(isNonEmptyString).slice(0, 4) : [];
|
|
6363
|
+
const misses = Array.isArray(data.misses) ? data.misses.filter(isNonEmptyString).slice(0, 4) : [];
|
|
6364
|
+
const reasoning = data.reasoning;
|
|
5871
6365
|
return {
|
|
5872
|
-
score
|
|
5873
|
-
verdict:
|
|
5874
|
-
hits
|
|
6366
|
+
score,
|
|
6367
|
+
verdict: scoreToVerdict(score),
|
|
6368
|
+
hits,
|
|
6369
|
+
misses,
|
|
6370
|
+
expectedAspectCount: Math.max(hits.length + misses.length, 1),
|
|
6371
|
+
reasoning,
|
|
6372
|
+
evaluatorRawRequest,
|
|
6373
|
+
evaluatorResults
|
|
6374
|
+
};
|
|
6375
|
+
} catch {
|
|
6376
|
+
return {
|
|
6377
|
+
score: 0,
|
|
6378
|
+
verdict: "fail",
|
|
6379
|
+
hits: [],
|
|
5875
6380
|
misses: [],
|
|
5876
|
-
expectedAspectCount:
|
|
6381
|
+
expectedAspectCount: 1,
|
|
6382
|
+
evaluatorRawRequest,
|
|
6383
|
+
evaluatorResults
|
|
5877
6384
|
};
|
|
5878
6385
|
}
|
|
5879
|
-
|
|
5880
|
-
|
|
5881
|
-
|
|
5882
|
-
|
|
5883
|
-
|
|
5884
|
-
|
|
5885
|
-
|
|
5886
|
-
|
|
5887
|
-
|
|
5888
|
-
|
|
5889
|
-
|
|
5890
|
-
|
|
5891
|
-
|
|
5892
|
-
|
|
5893
|
-
|
|
5894
|
-
|
|
5895
|
-
|
|
5896
|
-
|
|
5897
|
-
|
|
5898
|
-
|
|
5899
|
-
|
|
5900
|
-
|
|
5901
|
-
|
|
6386
|
+
}
|
|
6387
|
+
};
|
|
6388
|
+
|
|
6389
|
+
// src/evaluation/evaluators/cost.ts
|
|
6390
|
+
var CostEvaluator = class {
|
|
6391
|
+
kind = "cost";
|
|
6392
|
+
config;
|
|
6393
|
+
constructor(options) {
|
|
6394
|
+
this.config = options.config;
|
|
6395
|
+
}
|
|
6396
|
+
evaluate(context) {
|
|
6397
|
+
const { budget } = this.config;
|
|
6398
|
+
const costUsd = context.traceSummary?.costUsd;
|
|
6399
|
+
if (costUsd === void 0) {
|
|
6400
|
+
return {
|
|
6401
|
+
score: 0,
|
|
6402
|
+
verdict: "fail",
|
|
6403
|
+
hits: [],
|
|
6404
|
+
misses: ["No cost data available in trace"],
|
|
6405
|
+
expectedAspectCount: 1,
|
|
6406
|
+
reasoning: "Execution cost not reported by provider",
|
|
6407
|
+
evaluatorRawRequest: {
|
|
6408
|
+
type: "cost",
|
|
6409
|
+
budget,
|
|
6410
|
+
costUsd: null
|
|
5902
6411
|
}
|
|
5903
|
-
|
|
5904
|
-
}
|
|
5905
|
-
if (!found && !argsMismatch) {
|
|
5906
|
-
misses.push(`Expected ${expectedTool} at position ${i}, not found in remaining trace`);
|
|
5907
|
-
}
|
|
6412
|
+
};
|
|
5908
6413
|
}
|
|
5909
|
-
const
|
|
6414
|
+
const passed = costUsd <= budget;
|
|
6415
|
+
const score = passed ? 1 : 0;
|
|
6416
|
+
const formatCost = (n) => `$${n.toFixed(4)}`;
|
|
5910
6417
|
return {
|
|
5911
6418
|
score,
|
|
5912
|
-
verdict:
|
|
5913
|
-
hits,
|
|
5914
|
-
misses,
|
|
5915
|
-
expectedAspectCount:
|
|
5916
|
-
|
|
5917
|
-
|
|
5918
|
-
|
|
5919
|
-
|
|
5920
|
-
|
|
5921
|
-
|
|
5922
|
-
score: 1,
|
|
5923
|
-
verdict: "pass",
|
|
5924
|
-
hits: ["No tool sequence specified"],
|
|
5925
|
-
misses: [],
|
|
5926
|
-
expectedAspectCount: 0
|
|
5927
|
-
};
|
|
5928
|
-
}
|
|
5929
|
-
const hits = [];
|
|
5930
|
-
const misses = [];
|
|
5931
|
-
if (toolCalls.length !== expected.length) {
|
|
5932
|
-
misses.push(`Expected ${expected.length} tool calls, got ${toolCalls.length}`);
|
|
5933
|
-
}
|
|
5934
|
-
const checkLength = Math.min(expected.length, toolCalls.length);
|
|
5935
|
-
for (let i = 0; i < checkLength; i++) {
|
|
5936
|
-
const expectedItem = expected[i];
|
|
5937
|
-
const expectedTool = expectedItem.tool;
|
|
5938
|
-
const actualCall = toolCalls[i];
|
|
5939
|
-
const actualTool = actualCall.name;
|
|
5940
|
-
if (actualTool === expectedTool) {
|
|
5941
|
-
if (argsMatch(expectedItem.args, actualCall.args)) {
|
|
5942
|
-
hits.push(`Position ${i}: ${expectedTool}`);
|
|
5943
|
-
} else {
|
|
5944
|
-
misses.push(`Position ${i}: ${expectedTool} args mismatch`);
|
|
5945
|
-
}
|
|
5946
|
-
} else {
|
|
5947
|
-
misses.push(`Position ${i}: expected ${expectedTool}, got ${actualTool}`);
|
|
5948
|
-
}
|
|
5949
|
-
}
|
|
5950
|
-
for (let i = checkLength; i < expected.length; i++) {
|
|
5951
|
-
misses.push(`Position ${i}: expected ${expected[i].tool}, got nothing`);
|
|
5952
|
-
}
|
|
5953
|
-
const score = hits.length / expected.length;
|
|
5954
|
-
return {
|
|
5955
|
-
score,
|
|
5956
|
-
verdict: scoreToVerdict(score),
|
|
5957
|
-
hits,
|
|
5958
|
-
misses,
|
|
5959
|
-
expectedAspectCount: expected.length
|
|
6419
|
+
verdict: passed ? "pass" : "fail",
|
|
6420
|
+
hits: passed ? [`Cost ${formatCost(costUsd)} <= ${formatCost(budget)} budget`] : [],
|
|
6421
|
+
misses: passed ? [] : [`Cost ${formatCost(costUsd)} > ${formatCost(budget)} budget`],
|
|
6422
|
+
expectedAspectCount: 1,
|
|
6423
|
+
reasoning: `Execution cost ${formatCost(costUsd)} (budget: ${formatCost(budget)})`,
|
|
6424
|
+
evaluatorRawRequest: {
|
|
6425
|
+
type: "cost",
|
|
6426
|
+
budget,
|
|
6427
|
+
costUsd
|
|
6428
|
+
}
|
|
5960
6429
|
};
|
|
5961
6430
|
}
|
|
5962
6431
|
};
|
|
6432
|
+
|
|
6433
|
+
// src/evaluation/evaluators/field-accuracy.ts
|
|
5963
6434
|
var DEFAULT_DATE_FORMATS = [
|
|
5964
6435
|
"YYYY-MM-DDTHH:mm:ssZ",
|
|
5965
6436
|
// ISO with timezone
|
|
@@ -6168,438 +6639,213 @@ var FieldAccuracyEvaluator = class {
|
|
|
6168
6639
|
weight,
|
|
6169
6640
|
hit: false,
|
|
6170
6641
|
message: `${path15} (non-numeric value)`
|
|
6171
|
-
};
|
|
6172
|
-
}
|
|
6173
|
-
if (!Number.isFinite(candidateNum) || !Number.isFinite(expectedNum)) {
|
|
6174
|
-
return {
|
|
6175
|
-
path: path15,
|
|
6176
|
-
score: 0,
|
|
6177
|
-
weight,
|
|
6178
|
-
hit: false,
|
|
6179
|
-
message: `${path15} (invalid numeric value)`
|
|
6180
|
-
};
|
|
6181
|
-
}
|
|
6182
|
-
const diff = Math.abs(candidateNum - expectedNum);
|
|
6183
|
-
let withinTolerance;
|
|
6184
|
-
if (relative) {
|
|
6185
|
-
const relativeDiff = expectedNum === 0 ? diff : diff / Math.abs(expectedNum);
|
|
6186
|
-
withinTolerance = relativeDiff <= tolerance;
|
|
6187
|
-
} else {
|
|
6188
|
-
withinTolerance = diff <= tolerance;
|
|
6189
|
-
}
|
|
6190
|
-
if (withinTolerance) {
|
|
6191
|
-
return {
|
|
6192
|
-
path: path15,
|
|
6193
|
-
score: 1,
|
|
6194
|
-
weight,
|
|
6195
|
-
hit: true,
|
|
6196
|
-
message: `${path15} (within tolerance: diff=${diff.toFixed(2)})`
|
|
6197
|
-
};
|
|
6198
|
-
}
|
|
6199
|
-
return {
|
|
6200
|
-
path: path15,
|
|
6201
|
-
score: 0,
|
|
6202
|
-
weight,
|
|
6203
|
-
hit: false,
|
|
6204
|
-
message: `${path15} (outside tolerance: diff=${diff.toFixed(2)}, tolerance=${tolerance})`
|
|
6205
|
-
};
|
|
6206
|
-
}
|
|
6207
|
-
/**
|
|
6208
|
-
* Date comparison with format normalization.
|
|
6209
|
-
*/
|
|
6210
|
-
compareDate(path15, candidateValue, expectedValue, fieldConfig, weight) {
|
|
6211
|
-
const formats = fieldConfig.formats ?? DEFAULT_DATE_FORMATS;
|
|
6212
|
-
const candidateDate = parseDate(String(candidateValue), formats);
|
|
6213
|
-
const expectedDate = parseDate(String(expectedValue), formats);
|
|
6214
|
-
if (candidateDate === null) {
|
|
6215
|
-
return {
|
|
6216
|
-
path: path15,
|
|
6217
|
-
score: 0,
|
|
6218
|
-
weight,
|
|
6219
|
-
hit: false,
|
|
6220
|
-
message: `${path15} (unparseable candidate date)`
|
|
6221
|
-
};
|
|
6222
|
-
}
|
|
6223
|
-
if (expectedDate === null) {
|
|
6224
|
-
return {
|
|
6225
|
-
path: path15,
|
|
6226
|
-
score: 0,
|
|
6227
|
-
weight,
|
|
6228
|
-
hit: false,
|
|
6229
|
-
message: `${path15} (unparseable expected date)`
|
|
6230
|
-
};
|
|
6231
|
-
}
|
|
6232
|
-
if (candidateDate.getFullYear() === expectedDate.getFullYear() && candidateDate.getMonth() === expectedDate.getMonth() && candidateDate.getDate() === expectedDate.getDate()) {
|
|
6233
|
-
return {
|
|
6234
|
-
path: path15,
|
|
6235
|
-
score: 1,
|
|
6236
|
-
weight,
|
|
6237
|
-
hit: true,
|
|
6238
|
-
message: path15
|
|
6239
|
-
};
|
|
6240
|
-
}
|
|
6241
|
-
return {
|
|
6242
|
-
path: path15,
|
|
6243
|
-
score: 0,
|
|
6244
|
-
weight,
|
|
6245
|
-
hit: false,
|
|
6246
|
-
message: `${path15} (date mismatch: got ${formatDateISO(candidateDate)}, expected ${formatDateISO(expectedDate)})`
|
|
6247
|
-
};
|
|
6248
|
-
}
|
|
6249
|
-
/**
|
|
6250
|
-
* Aggregate field results using configured strategy.
|
|
6251
|
-
*/
|
|
6252
|
-
aggregateResults(results) {
|
|
6253
|
-
const aggregation = this.config.aggregation ?? "weighted_average";
|
|
6254
|
-
const hits = [];
|
|
6255
|
-
const misses = [];
|
|
6256
|
-
for (const result of results) {
|
|
6257
|
-
if (result.hit) {
|
|
6258
|
-
hits.push(result.message);
|
|
6259
|
-
} else {
|
|
6260
|
-
misses.push(result.message);
|
|
6261
|
-
}
|
|
6262
|
-
}
|
|
6263
|
-
let score;
|
|
6264
|
-
if (aggregation === "all_or_nothing") {
|
|
6265
|
-
score = misses.length === 0 ? 1 : 0;
|
|
6266
|
-
} else {
|
|
6267
|
-
const totalWeight = results.reduce((sum, r) => sum + r.weight, 0);
|
|
6268
|
-
if (totalWeight === 0) {
|
|
6269
|
-
score = results.length === 0 ? 1 : 0;
|
|
6270
|
-
} else {
|
|
6271
|
-
const weightedSum = results.reduce((sum, r) => sum + r.score * r.weight, 0);
|
|
6272
|
-
score = weightedSum / totalWeight;
|
|
6273
|
-
}
|
|
6274
|
-
}
|
|
6275
|
-
const reasoning = `${hits.length}/${results.length} fields matched`;
|
|
6276
|
-
return {
|
|
6277
|
-
score: clampScore(score),
|
|
6278
|
-
verdict: scoreToVerdict(score),
|
|
6279
|
-
hits: hits.slice(0, 4),
|
|
6280
|
-
misses: misses.slice(0, 4),
|
|
6281
|
-
expectedAspectCount: results.length,
|
|
6282
|
-
reasoning
|
|
6283
|
-
};
|
|
6284
|
-
}
|
|
6285
|
-
};
|
|
6286
|
-
function resolvePath(obj, path15) {
|
|
6287
|
-
if (!path15 || !obj) {
|
|
6288
|
-
return void 0;
|
|
6289
|
-
}
|
|
6290
|
-
const parts = path15.split(/\.|\[|\]/).filter((p) => p.length > 0);
|
|
6291
|
-
let current = obj;
|
|
6292
|
-
for (const part of parts) {
|
|
6293
|
-
if (current === null || current === void 0) {
|
|
6294
|
-
return void 0;
|
|
6295
|
-
}
|
|
6296
|
-
if (typeof current !== "object") {
|
|
6297
|
-
return void 0;
|
|
6298
|
-
}
|
|
6299
|
-
const isIndex = /^\d+$/.test(part);
|
|
6300
|
-
if (isIndex && Array.isArray(current)) {
|
|
6301
|
-
current = current[Number.parseInt(part, 10)];
|
|
6302
|
-
} else {
|
|
6303
|
-
current = current[part];
|
|
6304
|
-
}
|
|
6305
|
-
}
|
|
6306
|
-
return current;
|
|
6307
|
-
}
|
|
6308
|
-
function toNumber(value) {
|
|
6309
|
-
if (typeof value === "number") {
|
|
6310
|
-
return value;
|
|
6311
|
-
}
|
|
6312
|
-
if (typeof value === "string") {
|
|
6313
|
-
const num = Number.parseFloat(value);
|
|
6314
|
-
return Number.isNaN(num) ? null : num;
|
|
6315
|
-
}
|
|
6316
|
-
return null;
|
|
6317
|
-
}
|
|
6318
|
-
function parseDate(dateStr, formats) {
|
|
6319
|
-
if (!dateStr) return null;
|
|
6320
|
-
const trimmed = dateStr.trim();
|
|
6321
|
-
const isoDate = new Date(trimmed);
|
|
6322
|
-
if (!Number.isNaN(isoDate.getTime())) {
|
|
6323
|
-
return isoDate;
|
|
6324
|
-
}
|
|
6325
|
-
const localizedMatch = trimmed.match(/^(\d{1,2})-([A-Za-z]{3,9})-(\d{4})$/);
|
|
6326
|
-
if (localizedMatch) {
|
|
6327
|
-
const day = Number.parseInt(localizedMatch[1], 10);
|
|
6328
|
-
const monthName = localizedMatch[2].toLowerCase();
|
|
6329
|
-
const year = Number.parseInt(localizedMatch[3], 10);
|
|
6330
|
-
const month = MONTH_NAMES[monthName];
|
|
6331
|
-
if (month !== void 0) {
|
|
6332
|
-
return new Date(year, month, day);
|
|
6333
|
-
}
|
|
6334
|
-
}
|
|
6335
|
-
const usMatch = trimmed.match(/^(\d{1,2})[\/\-](\d{1,2})[\/\-](\d{4})$/);
|
|
6336
|
-
if (usMatch) {
|
|
6337
|
-
const hasUSFormat = formats.some((f) => f.includes("MM/DD") || f.includes("MM-DD"));
|
|
6338
|
-
const hasEUFormat = formats.some((f) => f.includes("DD/MM") || f.includes("DD-MM"));
|
|
6339
|
-
if (hasUSFormat && !hasEUFormat) {
|
|
6340
|
-
const month = Number.parseInt(usMatch[1], 10) - 1;
|
|
6341
|
-
const day = Number.parseInt(usMatch[2], 10);
|
|
6342
|
-
const year = Number.parseInt(usMatch[3], 10);
|
|
6343
|
-
if (month >= 0 && month <= 11 && day >= 1 && day <= 31) {
|
|
6344
|
-
return new Date(year, month, day);
|
|
6345
|
-
}
|
|
6346
|
-
} else if (hasEUFormat && !hasUSFormat) {
|
|
6347
|
-
const day = Number.parseInt(usMatch[1], 10);
|
|
6348
|
-
const month = Number.parseInt(usMatch[2], 10) - 1;
|
|
6349
|
-
const year = Number.parseInt(usMatch[3], 10);
|
|
6350
|
-
if (month >= 0 && month <= 11 && day >= 1 && day <= 31) {
|
|
6351
|
-
return new Date(year, month, day);
|
|
6352
|
-
}
|
|
6353
|
-
} else {
|
|
6354
|
-
const num1 = Number.parseInt(usMatch[1], 10);
|
|
6355
|
-
const num2 = Number.parseInt(usMatch[2], 10);
|
|
6356
|
-
const year = Number.parseInt(usMatch[3], 10);
|
|
6357
|
-
if (num1 > 12 && num2 <= 12) {
|
|
6358
|
-
return new Date(year, num2 - 1, num1);
|
|
6359
|
-
}
|
|
6360
|
-
if (num2 > 12 && num1 <= 12) {
|
|
6361
|
-
return new Date(year, num1 - 1, num2);
|
|
6362
|
-
}
|
|
6363
|
-
if (num1 <= 12 && num2 <= 31) {
|
|
6364
|
-
return new Date(year, num1 - 1, num2);
|
|
6365
|
-
}
|
|
6366
|
-
}
|
|
6367
|
-
}
|
|
6368
|
-
return null;
|
|
6369
|
-
}
|
|
6370
|
-
function formatDateISO(date) {
|
|
6371
|
-
return date.toISOString().split("T")[0];
|
|
6372
|
-
}
|
|
6373
|
-
function parseJsonFromTextSafe(text) {
|
|
6374
|
-
const cleaned = typeof text === "string" ? text.replace(/```json\n?|```/g, "").trim() : "";
|
|
6375
|
-
const match = cleaned.match(/\{[\s\S]*\}/);
|
|
6376
|
-
const blob = match?.[0] ?? cleaned;
|
|
6377
|
-
return JSON.parse(blob);
|
|
6378
|
-
}
|
|
6379
|
-
var DEFAULT_COMPOSITE_AGGREGATOR_PROMPT = `Review the following evaluation results:
|
|
6380
|
-
{{EVALUATOR_RESULTS_JSON}}
|
|
6381
|
-
|
|
6382
|
-
Decide the final score and verdict based on all evaluator results.
|
|
6383
|
-
Return a JSON object with: score (0.0-1.0), verdict (pass/fail/borderline), and reasoning.`;
|
|
6384
|
-
var CompositeEvaluator = class {
|
|
6385
|
-
kind = "composite";
|
|
6386
|
-
config;
|
|
6387
|
-
evaluatorFactory;
|
|
6388
|
-
cwd;
|
|
6389
|
-
constructor(options) {
|
|
6390
|
-
this.config = options.config;
|
|
6391
|
-
this.evaluatorFactory = options.evaluatorFactory;
|
|
6392
|
-
this.cwd = options.cwd;
|
|
6393
|
-
}
|
|
6394
|
-
async evaluate(context) {
|
|
6395
|
-
const memberResults = await Promise.all(
|
|
6396
|
-
this.config.evaluators.map(async (memberConfig) => {
|
|
6397
|
-
const evaluator = this.evaluatorFactory.create(memberConfig, context);
|
|
6398
|
-
return {
|
|
6399
|
-
id: memberConfig.name,
|
|
6400
|
-
type: memberConfig.type,
|
|
6401
|
-
result: await evaluator.evaluate(context)
|
|
6402
|
-
};
|
|
6403
|
-
})
|
|
6404
|
-
);
|
|
6405
|
-
return this.aggregate(memberResults, context);
|
|
6406
|
-
}
|
|
6407
|
-
async aggregate(results, context) {
|
|
6408
|
-
const aggregator = this.config.aggregator;
|
|
6409
|
-
switch (aggregator.type) {
|
|
6410
|
-
case "code_judge":
|
|
6411
|
-
return this.runCodeAggregator(results, aggregator.path, aggregator.cwd ?? this.cwd);
|
|
6412
|
-
case "llm_judge":
|
|
6413
|
-
return this.runLlmAggregator(results, context, aggregator);
|
|
6414
|
-
default:
|
|
6415
|
-
return this.runWeightedAverage(results, aggregator.weights);
|
|
6416
|
-
}
|
|
6417
|
-
}
|
|
6418
|
-
runWeightedAverage(results, weights) {
|
|
6419
|
-
let totalWeight = 0;
|
|
6420
|
-
let weightedSum = 0;
|
|
6421
|
-
const allHits = [];
|
|
6422
|
-
const allMisses = [];
|
|
6423
|
-
const reasoningParts = [];
|
|
6424
|
-
const evaluatorResults = [];
|
|
6425
|
-
for (const member of results) {
|
|
6426
|
-
const weight = weights?.[member.id] ?? 1;
|
|
6427
|
-
totalWeight += weight;
|
|
6428
|
-
weightedSum += member.result.score * weight;
|
|
6429
|
-
allHits.push(...member.result.hits.map((h) => `[${member.id}] ${h}`));
|
|
6430
|
-
allMisses.push(...member.result.misses.map((m) => `[${member.id}] ${m}`));
|
|
6431
|
-
if (member.result.reasoning) {
|
|
6432
|
-
reasoningParts.push(`${member.id}: ${member.result.reasoning}`);
|
|
6433
|
-
}
|
|
6434
|
-
evaluatorResults.push({
|
|
6435
|
-
name: member.id,
|
|
6436
|
-
type: member.type,
|
|
6437
|
-
score: member.result.score,
|
|
6438
|
-
weight,
|
|
6439
|
-
verdict: member.result.verdict,
|
|
6440
|
-
hits: [...member.result.hits],
|
|
6441
|
-
misses: [...member.result.misses],
|
|
6442
|
-
reasoning: member.result.reasoning,
|
|
6443
|
-
evaluatorRawRequest: member.result.evaluatorRawRequest,
|
|
6444
|
-
evaluatorResults: member.result.evaluatorResults
|
|
6445
|
-
});
|
|
6446
|
-
}
|
|
6447
|
-
const finalScore = totalWeight > 0 ? weightedSum / totalWeight : 0;
|
|
6448
|
-
return {
|
|
6449
|
-
score: clampScore(finalScore),
|
|
6450
|
-
verdict: scoreToVerdict(finalScore),
|
|
6451
|
-
hits: allHits,
|
|
6452
|
-
misses: allMisses,
|
|
6453
|
-
expectedAspectCount: Math.max(allHits.length + allMisses.length, 1),
|
|
6454
|
-
reasoning: reasoningParts.length > 0 ? reasoningParts.join("; ") : void 0,
|
|
6455
|
-
evaluatorRawRequest: {
|
|
6456
|
-
aggregator: "weighted_average",
|
|
6457
|
-
...weights ? { weights } : {}
|
|
6458
|
-
},
|
|
6459
|
-
evaluatorResults
|
|
6460
|
-
};
|
|
6461
|
-
}
|
|
6462
|
-
async runCodeAggregator(results, scriptPath, cwd, weights) {
|
|
6463
|
-
const resultsObject = Object.fromEntries(results.map((r) => [r.id, r.result]));
|
|
6464
|
-
const inputPayload = JSON.stringify({ results: resultsObject }, null, 2);
|
|
6465
|
-
const evaluatorResults = results.map((member) => ({
|
|
6466
|
-
name: member.id,
|
|
6467
|
-
type: member.type,
|
|
6468
|
-
score: member.result.score,
|
|
6469
|
-
weight: weights?.[member.id] ?? 1,
|
|
6470
|
-
verdict: member.result.verdict,
|
|
6471
|
-
hits: [...member.result.hits],
|
|
6472
|
-
misses: [...member.result.misses],
|
|
6473
|
-
reasoning: member.result.reasoning,
|
|
6474
|
-
evaluatorRawRequest: member.result.evaluatorRawRequest,
|
|
6475
|
-
evaluatorResults: member.result.evaluatorResults
|
|
6476
|
-
}));
|
|
6477
|
-
try {
|
|
6478
|
-
const stdout = await executeScript(scriptPath, inputPayload, void 0, cwd);
|
|
6479
|
-
const parsed = parseJsonSafe(stdout);
|
|
6480
|
-
const score = clampScore(typeof parsed?.score === "number" ? parsed.score : 0);
|
|
6481
|
-
const hits = Array.isArray(parsed?.hits) ? parsed.hits.filter(isNonEmptyString) : [];
|
|
6482
|
-
const misses = Array.isArray(parsed?.misses) ? parsed.misses.filter(isNonEmptyString) : [];
|
|
6483
|
-
const reasoning = typeof parsed?.reasoning === "string" ? parsed.reasoning : void 0;
|
|
6484
|
-
const verdict = typeof parsed?.verdict === "string" && (parsed.verdict === "pass" || parsed.verdict === "fail" || parsed.verdict === "borderline") ? parsed.verdict : scoreToVerdict(score);
|
|
6485
|
-
return {
|
|
6486
|
-
score,
|
|
6487
|
-
verdict,
|
|
6488
|
-
hits,
|
|
6489
|
-
misses,
|
|
6490
|
-
expectedAspectCount: hits.length + misses.length || 1,
|
|
6491
|
-
reasoning,
|
|
6492
|
-
evaluatorRawRequest: {
|
|
6493
|
-
aggregator: "code_judge",
|
|
6494
|
-
script: scriptPath
|
|
6495
|
-
},
|
|
6496
|
-
evaluatorResults
|
|
6497
|
-
};
|
|
6498
|
-
} catch (error) {
|
|
6499
|
-
const message = error instanceof Error ? error.message : String(error);
|
|
6500
|
-
return {
|
|
6501
|
-
score: 0,
|
|
6502
|
-
verdict: "fail",
|
|
6503
|
-
hits: [],
|
|
6504
|
-
misses: [`Code aggregator failed: ${message}`],
|
|
6505
|
-
expectedAspectCount: 1,
|
|
6506
|
-
reasoning: message,
|
|
6507
|
-
evaluatorRawRequest: {
|
|
6508
|
-
aggregator: "code_judge",
|
|
6509
|
-
script: scriptPath,
|
|
6510
|
-
error: message
|
|
6511
|
-
},
|
|
6512
|
-
evaluatorResults
|
|
6513
|
-
};
|
|
6514
|
-
}
|
|
6515
|
-
}
|
|
6516
|
-
async runLlmAggregator(results, context, config) {
|
|
6517
|
-
const judgeProvider = context.judgeProvider;
|
|
6518
|
-
if (!judgeProvider) {
|
|
6519
|
-
throw new Error("No judge provider available for LLM aggregation");
|
|
6520
|
-
}
|
|
6521
|
-
const resultsObject = Object.fromEntries(results.map((r) => [r.id, r.result]));
|
|
6522
|
-
const resultsJson = JSON.stringify(resultsObject, null, 2);
|
|
6523
|
-
const evaluatorResults = results.map((member) => ({
|
|
6524
|
-
name: member.id,
|
|
6525
|
-
type: member.type,
|
|
6526
|
-
score: member.result.score,
|
|
6527
|
-
verdict: member.result.verdict,
|
|
6528
|
-
hits: [...member.result.hits],
|
|
6529
|
-
misses: [...member.result.misses],
|
|
6530
|
-
reasoning: member.result.reasoning,
|
|
6531
|
-
evaluatorRawRequest: member.result.evaluatorRawRequest,
|
|
6532
|
-
evaluatorResults: member.result.evaluatorResults
|
|
6533
|
-
}));
|
|
6534
|
-
const promptTemplate = config.prompt ?? DEFAULT_COMPOSITE_AGGREGATOR_PROMPT;
|
|
6535
|
-
const userPrompt = promptTemplate.replace(/\{\{EVALUATOR_RESULTS_JSON\}\}/g, resultsJson);
|
|
6536
|
-
const systemPrompt = buildOutputSchema();
|
|
6537
|
-
const evaluatorRawRequest = {
|
|
6538
|
-
aggregator: "llm_judge",
|
|
6539
|
-
userPrompt,
|
|
6540
|
-
systemPrompt,
|
|
6541
|
-
target: judgeProvider.targetName
|
|
6542
|
-
};
|
|
6543
|
-
try {
|
|
6544
|
-
const model = judgeProvider.asLanguageModel?.();
|
|
6545
|
-
if (model) {
|
|
6546
|
-
const { text } = await generateText2({
|
|
6547
|
-
model,
|
|
6548
|
-
system: systemPrompt,
|
|
6549
|
-
prompt: userPrompt
|
|
6550
|
-
});
|
|
6551
|
-
const data2 = freeformEvaluationSchema.parse(parseJsonFromText(text));
|
|
6552
|
-
const score2 = clampScore(data2.score);
|
|
6553
|
-
const hits2 = Array.isArray(data2.hits) ? data2.hits.filter(isNonEmptyString).slice(0, 4) : [];
|
|
6554
|
-
const misses2 = Array.isArray(data2.misses) ? data2.misses.filter(isNonEmptyString).slice(0, 4) : [];
|
|
6555
|
-
const reasoning2 = data2.reasoning;
|
|
6556
|
-
return {
|
|
6557
|
-
score: score2,
|
|
6558
|
-
verdict: scoreToVerdict(score2),
|
|
6559
|
-
hits: hits2,
|
|
6560
|
-
misses: misses2,
|
|
6561
|
-
expectedAspectCount: Math.max(hits2.length + misses2.length, 1),
|
|
6562
|
-
reasoning: reasoning2,
|
|
6563
|
-
evaluatorRawRequest,
|
|
6564
|
-
evaluatorResults
|
|
6565
|
-
};
|
|
6566
|
-
}
|
|
6567
|
-
const response = await judgeProvider.invoke({
|
|
6568
|
-
question: userPrompt,
|
|
6569
|
-
systemPrompt,
|
|
6570
|
-
evalCaseId: context.evalCase.id,
|
|
6571
|
-
attempt: context.attempt
|
|
6572
|
-
});
|
|
6573
|
-
const data = freeformEvaluationSchema.parse(
|
|
6574
|
-
parseJsonFromText(extractLastAssistantContent(response.outputMessages))
|
|
6575
|
-
);
|
|
6576
|
-
const score = clampScore(data.score);
|
|
6577
|
-
const hits = Array.isArray(data.hits) ? data.hits.filter(isNonEmptyString).slice(0, 4) : [];
|
|
6578
|
-
const misses = Array.isArray(data.misses) ? data.misses.filter(isNonEmptyString).slice(0, 4) : [];
|
|
6579
|
-
const reasoning = data.reasoning;
|
|
6642
|
+
};
|
|
6643
|
+
}
|
|
6644
|
+
if (!Number.isFinite(candidateNum) || !Number.isFinite(expectedNum)) {
|
|
6580
6645
|
return {
|
|
6581
|
-
|
|
6582
|
-
|
|
6583
|
-
|
|
6584
|
-
|
|
6585
|
-
|
|
6586
|
-
reasoning,
|
|
6587
|
-
evaluatorRawRequest,
|
|
6588
|
-
evaluatorResults
|
|
6646
|
+
path: path15,
|
|
6647
|
+
score: 0,
|
|
6648
|
+
weight,
|
|
6649
|
+
hit: false,
|
|
6650
|
+
message: `${path15} (invalid numeric value)`
|
|
6589
6651
|
};
|
|
6590
|
-
}
|
|
6652
|
+
}
|
|
6653
|
+
const diff = Math.abs(candidateNum - expectedNum);
|
|
6654
|
+
let withinTolerance;
|
|
6655
|
+
if (relative) {
|
|
6656
|
+
const relativeDiff = expectedNum === 0 ? diff : diff / Math.abs(expectedNum);
|
|
6657
|
+
withinTolerance = relativeDiff <= tolerance;
|
|
6658
|
+
} else {
|
|
6659
|
+
withinTolerance = diff <= tolerance;
|
|
6660
|
+
}
|
|
6661
|
+
if (withinTolerance) {
|
|
6662
|
+
return {
|
|
6663
|
+
path: path15,
|
|
6664
|
+
score: 1,
|
|
6665
|
+
weight,
|
|
6666
|
+
hit: true,
|
|
6667
|
+
message: `${path15} (within tolerance: diff=${diff.toFixed(2)})`
|
|
6668
|
+
};
|
|
6669
|
+
}
|
|
6670
|
+
return {
|
|
6671
|
+
path: path15,
|
|
6672
|
+
score: 0,
|
|
6673
|
+
weight,
|
|
6674
|
+
hit: false,
|
|
6675
|
+
message: `${path15} (outside tolerance: diff=${diff.toFixed(2)}, tolerance=${tolerance})`
|
|
6676
|
+
};
|
|
6677
|
+
}
|
|
6678
|
+
/**
|
|
6679
|
+
* Date comparison with format normalization.
|
|
6680
|
+
*/
|
|
6681
|
+
compareDate(path15, candidateValue, expectedValue, fieldConfig, weight) {
|
|
6682
|
+
const formats = fieldConfig.formats ?? DEFAULT_DATE_FORMATS;
|
|
6683
|
+
const candidateDate = parseDate(String(candidateValue), formats);
|
|
6684
|
+
const expectedDate = parseDate(String(expectedValue), formats);
|
|
6685
|
+
if (candidateDate === null) {
|
|
6591
6686
|
return {
|
|
6687
|
+
path: path15,
|
|
6592
6688
|
score: 0,
|
|
6593
|
-
|
|
6594
|
-
|
|
6595
|
-
|
|
6596
|
-
|
|
6597
|
-
|
|
6598
|
-
|
|
6689
|
+
weight,
|
|
6690
|
+
hit: false,
|
|
6691
|
+
message: `${path15} (unparseable candidate date)`
|
|
6692
|
+
};
|
|
6693
|
+
}
|
|
6694
|
+
if (expectedDate === null) {
|
|
6695
|
+
return {
|
|
6696
|
+
path: path15,
|
|
6697
|
+
score: 0,
|
|
6698
|
+
weight,
|
|
6699
|
+
hit: false,
|
|
6700
|
+
message: `${path15} (unparseable expected date)`
|
|
6701
|
+
};
|
|
6702
|
+
}
|
|
6703
|
+
if (candidateDate.getFullYear() === expectedDate.getFullYear() && candidateDate.getMonth() === expectedDate.getMonth() && candidateDate.getDate() === expectedDate.getDate()) {
|
|
6704
|
+
return {
|
|
6705
|
+
path: path15,
|
|
6706
|
+
score: 1,
|
|
6707
|
+
weight,
|
|
6708
|
+
hit: true,
|
|
6709
|
+
message: path15
|
|
6599
6710
|
};
|
|
6600
6711
|
}
|
|
6712
|
+
return {
|
|
6713
|
+
path: path15,
|
|
6714
|
+
score: 0,
|
|
6715
|
+
weight,
|
|
6716
|
+
hit: false,
|
|
6717
|
+
message: `${path15} (date mismatch: got ${formatDateISO(candidateDate)}, expected ${formatDateISO(expectedDate)})`
|
|
6718
|
+
};
|
|
6719
|
+
}
|
|
6720
|
+
/**
|
|
6721
|
+
* Aggregate field results using configured strategy.
|
|
6722
|
+
*/
|
|
6723
|
+
aggregateResults(results) {
|
|
6724
|
+
const aggregation = this.config.aggregation ?? "weighted_average";
|
|
6725
|
+
const hits = [];
|
|
6726
|
+
const misses = [];
|
|
6727
|
+
for (const result of results) {
|
|
6728
|
+
if (result.hit) {
|
|
6729
|
+
hits.push(result.message);
|
|
6730
|
+
} else {
|
|
6731
|
+
misses.push(result.message);
|
|
6732
|
+
}
|
|
6733
|
+
}
|
|
6734
|
+
let score;
|
|
6735
|
+
if (aggregation === "all_or_nothing") {
|
|
6736
|
+
score = misses.length === 0 ? 1 : 0;
|
|
6737
|
+
} else {
|
|
6738
|
+
const totalWeight = results.reduce((sum, r) => sum + r.weight, 0);
|
|
6739
|
+
if (totalWeight === 0) {
|
|
6740
|
+
score = results.length === 0 ? 1 : 0;
|
|
6741
|
+
} else {
|
|
6742
|
+
const weightedSum = results.reduce((sum, r) => sum + r.score * r.weight, 0);
|
|
6743
|
+
score = weightedSum / totalWeight;
|
|
6744
|
+
}
|
|
6745
|
+
}
|
|
6746
|
+
const reasoning = `${hits.length}/${results.length} fields matched`;
|
|
6747
|
+
return {
|
|
6748
|
+
score: clampScore(score),
|
|
6749
|
+
verdict: scoreToVerdict(score),
|
|
6750
|
+
hits: hits.slice(0, 4),
|
|
6751
|
+
misses: misses.slice(0, 4),
|
|
6752
|
+
expectedAspectCount: results.length,
|
|
6753
|
+
reasoning
|
|
6754
|
+
};
|
|
6601
6755
|
}
|
|
6602
6756
|
};
|
|
6757
|
+
function resolvePath(obj, path15) {
|
|
6758
|
+
if (!path15 || !obj) {
|
|
6759
|
+
return void 0;
|
|
6760
|
+
}
|
|
6761
|
+
const parts = path15.split(/\.|\[|\]/).filter((p) => p.length > 0);
|
|
6762
|
+
let current = obj;
|
|
6763
|
+
for (const part of parts) {
|
|
6764
|
+
if (current === null || current === void 0) {
|
|
6765
|
+
return void 0;
|
|
6766
|
+
}
|
|
6767
|
+
if (typeof current !== "object") {
|
|
6768
|
+
return void 0;
|
|
6769
|
+
}
|
|
6770
|
+
const isIndex = /^\d+$/.test(part);
|
|
6771
|
+
if (isIndex && Array.isArray(current)) {
|
|
6772
|
+
current = current[Number.parseInt(part, 10)];
|
|
6773
|
+
} else {
|
|
6774
|
+
current = current[part];
|
|
6775
|
+
}
|
|
6776
|
+
}
|
|
6777
|
+
return current;
|
|
6778
|
+
}
|
|
6779
|
+
function toNumber(value) {
|
|
6780
|
+
if (typeof value === "number") {
|
|
6781
|
+
return value;
|
|
6782
|
+
}
|
|
6783
|
+
if (typeof value === "string") {
|
|
6784
|
+
const num = Number.parseFloat(value);
|
|
6785
|
+
return Number.isNaN(num) ? null : num;
|
|
6786
|
+
}
|
|
6787
|
+
return null;
|
|
6788
|
+
}
|
|
6789
|
+
function parseDate(dateStr, formats) {
|
|
6790
|
+
if (!dateStr) return null;
|
|
6791
|
+
const trimmed = dateStr.trim();
|
|
6792
|
+
const isoDate = new Date(trimmed);
|
|
6793
|
+
if (!Number.isNaN(isoDate.getTime())) {
|
|
6794
|
+
return isoDate;
|
|
6795
|
+
}
|
|
6796
|
+
const localizedMatch = trimmed.match(/^(\d{1,2})-([A-Za-z]{3,9})-(\d{4})$/);
|
|
6797
|
+
if (localizedMatch) {
|
|
6798
|
+
const day = Number.parseInt(localizedMatch[1], 10);
|
|
6799
|
+
const monthName = localizedMatch[2].toLowerCase();
|
|
6800
|
+
const year = Number.parseInt(localizedMatch[3], 10);
|
|
6801
|
+
const month = MONTH_NAMES[monthName];
|
|
6802
|
+
if (month !== void 0) {
|
|
6803
|
+
return new Date(year, month, day);
|
|
6804
|
+
}
|
|
6805
|
+
}
|
|
6806
|
+
const usMatch = trimmed.match(/^(\d{1,2})[\/\-](\d{1,2})[\/\-](\d{4})$/);
|
|
6807
|
+
if (usMatch) {
|
|
6808
|
+
const hasUSFormat = formats.some((f) => f.includes("MM/DD") || f.includes("MM-DD"));
|
|
6809
|
+
const hasEUFormat = formats.some((f) => f.includes("DD/MM") || f.includes("DD-MM"));
|
|
6810
|
+
if (hasUSFormat && !hasEUFormat) {
|
|
6811
|
+
const month = Number.parseInt(usMatch[1], 10) - 1;
|
|
6812
|
+
const day = Number.parseInt(usMatch[2], 10);
|
|
6813
|
+
const year = Number.parseInt(usMatch[3], 10);
|
|
6814
|
+
if (month >= 0 && month <= 11 && day >= 1 && day <= 31) {
|
|
6815
|
+
return new Date(year, month, day);
|
|
6816
|
+
}
|
|
6817
|
+
} else if (hasEUFormat && !hasUSFormat) {
|
|
6818
|
+
const day = Number.parseInt(usMatch[1], 10);
|
|
6819
|
+
const month = Number.parseInt(usMatch[2], 10) - 1;
|
|
6820
|
+
const year = Number.parseInt(usMatch[3], 10);
|
|
6821
|
+
if (month >= 0 && month <= 11 && day >= 1 && day <= 31) {
|
|
6822
|
+
return new Date(year, month, day);
|
|
6823
|
+
}
|
|
6824
|
+
} else {
|
|
6825
|
+
const num1 = Number.parseInt(usMatch[1], 10);
|
|
6826
|
+
const num2 = Number.parseInt(usMatch[2], 10);
|
|
6827
|
+
const year = Number.parseInt(usMatch[3], 10);
|
|
6828
|
+
if (num1 > 12 && num2 <= 12) {
|
|
6829
|
+
return new Date(year, num2 - 1, num1);
|
|
6830
|
+
}
|
|
6831
|
+
if (num2 > 12 && num1 <= 12) {
|
|
6832
|
+
return new Date(year, num1 - 1, num2);
|
|
6833
|
+
}
|
|
6834
|
+
if (num1 <= 12 && num2 <= 31) {
|
|
6835
|
+
return new Date(year, num1 - 1, num2);
|
|
6836
|
+
}
|
|
6837
|
+
}
|
|
6838
|
+
}
|
|
6839
|
+
return null;
|
|
6840
|
+
}
|
|
6841
|
+
function formatDateISO(date) {
|
|
6842
|
+
return date.toISOString().split("T")[0];
|
|
6843
|
+
}
|
|
6844
|
+
/**
 * Pass-through wrapper around `parseJsonFromText`.
 *
 * NOTE(review): despite the "Safe" suffix this wrapper adds no error
 * handling of its own; whatever `parseJsonFromText` returns or throws is
 * passed straight to the caller - confirm callers guard accordingly.
 *
 * @param {string} text - Text handed unchanged to `parseJsonFromText`.
 * @returns {*} The result of `parseJsonFromText(text)`.
 */
function parseJsonFromTextSafe(text) {
  const parsed = parseJsonFromText(text);
  return parsed;
}
|
|
6847
|
+
|
|
6848
|
+
// src/evaluation/evaluators/latency.ts
|
|
6603
6849
|
var LatencyEvaluator = class {
|
|
6604
6850
|
kind = "latency";
|
|
6605
6851
|
config;
|
|
@@ -6635,54 +6881,14 @@ var LatencyEvaluator = class {
|
|
|
6635
6881
|
reasoning: `Execution took ${durationMs}ms (threshold: ${threshold}ms)`,
|
|
6636
6882
|
evaluatorRawRequest: {
|
|
6637
6883
|
type: "latency",
|
|
6638
|
-
threshold,
|
|
6639
|
-
durationMs
|
|
6640
|
-
}
|
|
6641
|
-
};
|
|
6642
|
-
}
|
|
6643
|
-
};
|
|
6644
|
-
var CostEvaluator = class {
|
|
6645
|
-
kind = "cost";
|
|
6646
|
-
config;
|
|
6647
|
-
constructor(options) {
|
|
6648
|
-
this.config = options.config;
|
|
6649
|
-
}
|
|
6650
|
-
evaluate(context) {
|
|
6651
|
-
const { budget } = this.config;
|
|
6652
|
-
const costUsd = context.traceSummary?.costUsd;
|
|
6653
|
-
if (costUsd === void 0) {
|
|
6654
|
-
return {
|
|
6655
|
-
score: 0,
|
|
6656
|
-
verdict: "fail",
|
|
6657
|
-
hits: [],
|
|
6658
|
-
misses: ["No cost data available in trace"],
|
|
6659
|
-
expectedAspectCount: 1,
|
|
6660
|
-
reasoning: "Execution cost not reported by provider",
|
|
6661
|
-
evaluatorRawRequest: {
|
|
6662
|
-
type: "cost",
|
|
6663
|
-
budget,
|
|
6664
|
-
costUsd: null
|
|
6665
|
-
}
|
|
6666
|
-
};
|
|
6667
|
-
}
|
|
6668
|
-
const passed = costUsd <= budget;
|
|
6669
|
-
const score = passed ? 1 : 0;
|
|
6670
|
-
const formatCost = (n) => `$${n.toFixed(4)}`;
|
|
6671
|
-
return {
|
|
6672
|
-
score,
|
|
6673
|
-
verdict: passed ? "pass" : "fail",
|
|
6674
|
-
hits: passed ? [`Cost ${formatCost(costUsd)} <= ${formatCost(budget)} budget`] : [],
|
|
6675
|
-
misses: passed ? [] : [`Cost ${formatCost(costUsd)} > ${formatCost(budget)} budget`],
|
|
6676
|
-
expectedAspectCount: 1,
|
|
6677
|
-
reasoning: `Execution cost ${formatCost(costUsd)} (budget: ${formatCost(budget)})`,
|
|
6678
|
-
evaluatorRawRequest: {
|
|
6679
|
-
type: "cost",
|
|
6680
|
-
budget,
|
|
6681
|
-
costUsd
|
|
6884
|
+
threshold,
|
|
6885
|
+
durationMs
|
|
6682
6886
|
}
|
|
6683
6887
|
};
|
|
6684
6888
|
}
|
|
6685
6889
|
};
|
|
6890
|
+
|
|
6891
|
+
// src/evaluation/evaluators/token-usage.ts
|
|
6686
6892
|
var TokenUsageEvaluator = class {
|
|
6687
6893
|
kind = "token_usage";
|
|
6688
6894
|
config;
|
|
@@ -6766,6 +6972,226 @@ var TokenUsageEvaluator = class {
|
|
|
6766
6972
|
}
|
|
6767
6973
|
};
|
|
6768
6974
|
|
|
6975
|
+
// src/evaluation/evaluators/tool-trajectory.ts
|
|
6976
|
+
/**
 * Decide whether a tool call's actual arguments satisfy an expectation.
 *
 * Matching is partial: every key listed in `expected` must be an own
 * property of `actual` with a deeply-equal value (per `deepEqual`); extra
 * keys on `actual` are ignored.
 *
 * @param {object|"any"|undefined} expected - Expected arguments. `undefined`
 *   or the literal string "any" accepts every call.
 * @param {object|null|undefined} actual - Arguments the tool was called with.
 * @returns {boolean} True when `actual` satisfies `expected`.
 */
function argsMatch(expected, actual) {
  // No expectation, or the "any" wildcard, accepts every call.
  if (expected === void 0 || expected === "any") return true;
  // Missing arguments can never satisfy a concrete expectation. `null` is
  // checked explicitly because Object.hasOwn(null, key) throws a TypeError.
  if (actual === void 0 || actual === null) return false;
  return Object.keys(expected).every(
    (key) => Object.hasOwn(actual, key) && deepEqual(expected[key], actual[key])
  );
}
|
|
6986
|
+
/**
 * Evaluator that scores a run by the tool calls it made.
 *
 * Tool calls are read from `context.outputMessages`; when none are found the
 * pre-computed `context.traceSummary` is used as the per-tool count source.
 * `config.mode` selects the check:
 *   - "any_order": each tool in `config.minimums` must be called at least the
 *     given number of times.
 *   - "in_order":  `config.expected` must appear in order within the calls
 *     (other calls may be interleaved).
 *   - "exact":     `config.expected` is compared position by position.
 * Scores are the fraction of expectations met; the verdict comes from
 * `scoreToVerdict`.
 */
var ToolTrajectoryEvaluator = class {
  kind = "tool_trajectory";
  config;
  /**
   * @param {{config: object}} options - `options.config` holds `mode` plus
   *   `minimums` (any_order) or `expected` (in_order / exact).
   */
  constructor(options) {
    this.config = options.config;
  }
  /**
   * Run the configured trajectory check.
   *
   * @param {{outputMessages?: Array, traceSummary?: object}} context
   * @returns {{score: number, verdict: string, hits: string[], misses: string[], expectedAspectCount: number}}
   */
  evaluate(context) {
    const { outputMessages, traceSummary } = context;
    const toolCalls = this.extractToolCallsFromMessages(outputMessages);
    // Prefer counts derived from the messages; fall back to the supplied summary.
    const summary = toolCalls.length > 0 ? this.buildSummary(toolCalls) : traceSummary;
    if (!summary) {
      return {
        score: 0,
        verdict: "fail",
        hits: [],
        misses: ["No trace available for evaluation"],
        expectedAspectCount: 1
      };
    }
    switch (this.config.mode) {
      case "any_order":
        return this.evaluateAnyOrder(summary);
      case "in_order":
        return this.evaluateInOrder(toolCalls);
      case "exact":
        return this.evaluateExact(toolCalls);
      default:
        return {
          score: 0,
          verdict: "fail",
          hits: [],
          misses: [`Unknown mode: ${this.config.mode}`],
          expectedAspectCount: 1
        };
    }
  }
  /**
   * Extract tool calls from output messages.
   *
   * @param {Array|undefined} messages - Messages whose optional `toolCalls`
   *   entries carry `tool` and `input`.
   * @returns {Array<{name: string, args: *}>} Calls in message order.
   */
  extractToolCallsFromMessages(messages) {
    if (!messages) {
      return [];
    }
    const toolCalls = [];
    for (const message of messages) {
      if (message.toolCalls) {
        for (const call of message.toolCalls) {
          toolCalls.push({
            name: call.tool,
            args: call.input
          });
        }
      }
    }
    return toolCalls;
  }
  /**
   * Build a summary from extracted tool calls.
   *
   * Counting happens in a Map so tool names that collide with
   * Object.prototype members (e.g. "toString", "constructor") are counted
   * as numbers instead of being concatenated onto inherited functions.
   *
   * @param {Array<{name: string}>} toolCalls
   * @returns {{eventCount: number, toolNames: string[], toolCallsByName: Object<string, number>, errorCount: number}}
   */
  buildSummary(toolCalls) {
    const counts = new Map();
    for (const call of toolCalls) {
      counts.set(call.name, (counts.get(call.name) ?? 0) + 1);
    }
    // Object.fromEntries defines own data properties, so prototype-named
    // tools end up as ordinary numeric entries on a plain object.
    const toolCallsByName = Object.fromEntries(counts);
    const toolNames = Object.keys(toolCallsByName).sort();
    return {
      eventCount: toolCalls.length,
      toolNames,
      toolCallsByName,
      errorCount: 0
    };
  }
  /**
   * Check that each tool in `config.minimums` was called often enough.
   *
   * @param {{toolCallsByName: Object<string, number>}} summary
   * @returns {object} Evaluation result; score is hits / number of required tools.
   */
  evaluateAnyOrder(summary) {
    const minimums = this.config.minimums ?? {};
    const toolNames = Object.keys(minimums);
    if (toolNames.length === 0) {
      return {
        score: 1,
        verdict: "pass",
        hits: ["No tool requirements specified"],
        misses: [],
        expectedAspectCount: 0
      };
    }
    const hits = [];
    const misses = [];
    const counts = summary.toolCallsByName;
    for (const toolName of toolNames) {
      const required = minimums[toolName];
      // Own-property lookup only: an uncalled tool named e.g. "constructor"
      // must read as 0, not as an inherited Object.prototype member.
      const actual = (Object.hasOwn(counts, toolName) ? counts[toolName] : void 0) ?? 0;
      if (actual >= required) {
        hits.push(`${toolName}: called ${actual} times (required \u2265${required})`);
      } else {
        misses.push(`${toolName}: called ${actual} times (required \u2265${required})`);
      }
    }
    const score = hits.length / toolNames.length;
    return {
      score,
      verdict: scoreToVerdict(score),
      hits,
      misses,
      expectedAspectCount: toolNames.length
    };
  }
  /**
   * Check that `config.expected` occurs in order within the tool calls.
   *
   * The scan cursor only moves forward. The first call whose name matches an
   * expected item settles that item: a hit when its args match, otherwise an
   * args-mismatch miss (the call is consumed either way).
   *
   * @param {Array<{name: string, args: *}>} toolCalls
   * @returns {object} Evaluation result; score is hits / expected.length.
   */
  evaluateInOrder(toolCalls) {
    const expected = this.config.expected ?? [];
    if (expected.length === 0) {
      return {
        score: 1,
        verdict: "pass",
        hits: ["No tool sequence specified"],
        misses: [],
        expectedAspectCount: 0
      };
    }
    const hits = [];
    const misses = [];
    let actualIndex = 0;
    for (let i = 0; i < expected.length; i++) {
      const expectedItem = expected[i];
      const expectedTool = expectedItem.tool;
      let found = false;
      let argsMismatch = false;
      while (actualIndex < toolCalls.length) {
        const actualCall = toolCalls[actualIndex];
        if (actualCall.name === expectedTool) {
          if (argsMatch(expectedItem.args, actualCall.args)) {
            hits.push(`Found ${expectedTool} at position ${actualIndex}`);
            actualIndex++;
            found = true;
            break;
          }
          misses.push(
            `Expected ${expectedTool} at position ${i}: tool found at ${actualIndex} but args mismatch`
          );
          actualIndex++;
          argsMismatch = true;
          break;
        }
        actualIndex++;
      }
      // Only report "not found" when no same-named call was seen at all.
      if (!found && !argsMismatch) {
        misses.push(`Expected ${expectedTool} at position ${i}, not found in remaining trace`);
      }
    }
    const score = hits.length / expected.length;
    return {
      score,
      verdict: scoreToVerdict(score),
      hits,
      misses,
      expectedAspectCount: expected.length
    };
  }
  /**
   * Compare the tool calls with `config.expected` position by position.
   *
   * A length difference is recorded as a miss; positions present in both
   * lists are compared by name and args, and expected positions with no
   * actual call are reported as "got nothing".
   *
   * @param {Array<{name: string, args: *}>} toolCalls
   * @returns {object} Evaluation result; score is hits / expected.length.
   */
  evaluateExact(toolCalls) {
    const expected = this.config.expected ?? [];
    if (expected.length === 0) {
      return {
        score: 1,
        verdict: "pass",
        hits: ["No tool sequence specified"],
        misses: [],
        expectedAspectCount: 0
      };
    }
    const hits = [];
    const misses = [];
    if (toolCalls.length !== expected.length) {
      misses.push(`Expected ${expected.length} tool calls, got ${toolCalls.length}`);
    }
    const checkLength = Math.min(expected.length, toolCalls.length);
    for (let i = 0; i < checkLength; i++) {
      const expectedItem = expected[i];
      const expectedTool = expectedItem.tool;
      const actualCall = toolCalls[i];
      const actualTool = actualCall.name;
      if (actualTool === expectedTool) {
        if (argsMatch(expectedItem.args, actualCall.args)) {
          hits.push(`Position ${i}: ${expectedTool}`);
        } else {
          misses.push(`Position ${i}: ${expectedTool} args mismatch`);
        }
      } else {
        misses.push(`Position ${i}: expected ${expectedTool}, got ${actualTool}`);
      }
    }
    for (let i = checkLength; i < expected.length; i++) {
      misses.push(`Position ${i}: expected ${expected[i].tool}, got nothing`);
    }
    const score = hits.length / expected.length;
    return {
      score,
      verdict: scoreToVerdict(score),
      hits,
      misses,
      expectedAspectCount: expected.length
    };
  }
};
|
|
7194
|
+
|
|
6769
7195
|
// src/evaluation/orchestrator.ts
|
|
6770
7196
|
import { createHash } from "node:crypto";
|
|
6771
7197
|
import path14 from "node:path";
|
|
@@ -6979,6 +7405,17 @@ async function runEvaluation(options) {
|
|
|
6979
7405
|
}
|
|
6980
7406
|
return getOrCreateProvider(resolvedJudge);
|
|
6981
7407
|
};
|
|
7408
|
+
const targetResolver = (name) => {
|
|
7409
|
+
const resolved = resolveTargetByName(name);
|
|
7410
|
+
if (!resolved) {
|
|
7411
|
+
return void 0;
|
|
7412
|
+
}
|
|
7413
|
+
return getOrCreateProvider(resolved);
|
|
7414
|
+
};
|
|
7415
|
+
const availableTargets = [
|
|
7416
|
+
target.name,
|
|
7417
|
+
...Array.from(targetDefinitions.keys())
|
|
7418
|
+
];
|
|
6982
7419
|
const evaluatorRegistry = buildEvaluatorRegistry(evaluators, resolveJudgeProvider);
|
|
6983
7420
|
const primaryProvider = getOrCreateProvider(target);
|
|
6984
7421
|
const providerSupportsBatch = target.providerBatching === true && primaryProvider.supportsBatch === true && typeof primaryProvider.invokeBatch === "function";
|
|
@@ -7008,7 +7445,9 @@ async function runEvaluation(options) {
|
|
|
7008
7445
|
onResult,
|
|
7009
7446
|
verbose,
|
|
7010
7447
|
resolveJudgeProvider,
|
|
7011
|
-
agentTimeoutMs
|
|
7448
|
+
agentTimeoutMs,
|
|
7449
|
+
targetResolver,
|
|
7450
|
+
availableTargets
|
|
7012
7451
|
});
|
|
7013
7452
|
} catch (error) {
|
|
7014
7453
|
if (verbose) {
|
|
@@ -7047,7 +7486,9 @@ async function runEvaluation(options) {
|
|
|
7047
7486
|
cache,
|
|
7048
7487
|
useCache,
|
|
7049
7488
|
now,
|
|
7050
|
-
judgeProvider
|
|
7489
|
+
judgeProvider,
|
|
7490
|
+
targetResolver,
|
|
7491
|
+
availableTargets
|
|
7051
7492
|
});
|
|
7052
7493
|
if (onProgress) {
|
|
7053
7494
|
await onProgress({
|
|
@@ -7114,7 +7555,9 @@ async function runBatchEvaluation(options) {
|
|
|
7114
7555
|
onProgress,
|
|
7115
7556
|
onResult,
|
|
7116
7557
|
resolveJudgeProvider,
|
|
7117
|
-
agentTimeoutMs
|
|
7558
|
+
agentTimeoutMs,
|
|
7559
|
+
targetResolver,
|
|
7560
|
+
availableTargets
|
|
7118
7561
|
} = options;
|
|
7119
7562
|
const promptInputsList = [];
|
|
7120
7563
|
const formattingMode = usesFileReferencePrompt(provider) ? "agent" : "lm";
|
|
@@ -7189,7 +7632,9 @@ async function runBatchEvaluation(options) {
|
|
|
7189
7632
|
judgeProvider: await resolveJudgeProvider(target),
|
|
7190
7633
|
agentTimeoutMs,
|
|
7191
7634
|
outputMessages,
|
|
7192
|
-
traceSummary
|
|
7635
|
+
traceSummary,
|
|
7636
|
+
targetResolver,
|
|
7637
|
+
availableTargets
|
|
7193
7638
|
});
|
|
7194
7639
|
if (providerError) {
|
|
7195
7640
|
result = { ...result, error: providerError };
|
|
@@ -7247,7 +7692,9 @@ async function runEvalCase(options) {
|
|
|
7247
7692
|
cache,
|
|
7248
7693
|
useCache,
|
|
7249
7694
|
signal,
|
|
7250
|
-
judgeProvider
|
|
7695
|
+
judgeProvider,
|
|
7696
|
+
targetResolver,
|
|
7697
|
+
availableTargets
|
|
7251
7698
|
} = options;
|
|
7252
7699
|
const formattingMode = usesFileReferencePrompt(provider) ? "agent" : "lm";
|
|
7253
7700
|
const promptInputs = await buildPromptInputs(evalCase, formattingMode);
|
|
@@ -7321,7 +7768,9 @@ async function runEvalCase(options) {
|
|
|
7321
7768
|
judgeProvider,
|
|
7322
7769
|
agentTimeoutMs,
|
|
7323
7770
|
outputMessages,
|
|
7324
|
-
traceSummary
|
|
7771
|
+
traceSummary,
|
|
7772
|
+
targetResolver,
|
|
7773
|
+
availableTargets
|
|
7325
7774
|
});
|
|
7326
7775
|
return providerError ? { ...result, error: providerError } : result;
|
|
7327
7776
|
} catch (error) {
|
|
@@ -7341,7 +7790,9 @@ async function evaluateCandidate(options) {
|
|
|
7341
7790
|
judgeProvider,
|
|
7342
7791
|
agentTimeoutMs,
|
|
7343
7792
|
outputMessages,
|
|
7344
|
-
traceSummary
|
|
7793
|
+
traceSummary,
|
|
7794
|
+
targetResolver,
|
|
7795
|
+
availableTargets
|
|
7345
7796
|
} = options;
|
|
7346
7797
|
const gradeTimestamp = nowFn();
|
|
7347
7798
|
const { score, evaluatorResults } = await runEvaluatorsForCase({
|
|
@@ -7356,7 +7807,9 @@ async function evaluateCandidate(options) {
|
|
|
7356
7807
|
judgeProvider,
|
|
7357
7808
|
agentTimeoutMs,
|
|
7358
7809
|
outputMessages,
|
|
7359
|
-
traceSummary
|
|
7810
|
+
traceSummary,
|
|
7811
|
+
targetResolver,
|
|
7812
|
+
availableTargets
|
|
7360
7813
|
});
|
|
7361
7814
|
const completedAt = nowFn();
|
|
7362
7815
|
let agentProviderRequest;
|
|
@@ -7409,7 +7862,9 @@ async function runEvaluatorsForCase(options) {
|
|
|
7409
7862
|
judgeProvider,
|
|
7410
7863
|
agentTimeoutMs,
|
|
7411
7864
|
outputMessages,
|
|
7412
|
-
traceSummary
|
|
7865
|
+
traceSummary,
|
|
7866
|
+
targetResolver,
|
|
7867
|
+
availableTargets
|
|
7413
7868
|
} = options;
|
|
7414
7869
|
if (evalCase.evaluators && evalCase.evaluators.length > 0) {
|
|
7415
7870
|
return runEvaluatorList({
|
|
@@ -7425,7 +7880,9 @@ async function runEvaluatorsForCase(options) {
|
|
|
7425
7880
|
judgeProvider,
|
|
7426
7881
|
agentTimeoutMs,
|
|
7427
7882
|
outputMessages,
|
|
7428
|
-
traceSummary
|
|
7883
|
+
traceSummary,
|
|
7884
|
+
targetResolver,
|
|
7885
|
+
availableTargets
|
|
7429
7886
|
});
|
|
7430
7887
|
}
|
|
7431
7888
|
const evaluatorKind = evalCase.evaluator ?? "llm_judge";
|
|
@@ -7443,7 +7900,9 @@ async function runEvaluatorsForCase(options) {
|
|
|
7443
7900
|
now,
|
|
7444
7901
|
judgeProvider,
|
|
7445
7902
|
outputMessages,
|
|
7446
|
-
traceSummary
|
|
7903
|
+
traceSummary,
|
|
7904
|
+
targetResolver,
|
|
7905
|
+
availableTargets
|
|
7447
7906
|
});
|
|
7448
7907
|
return { score };
|
|
7449
7908
|
}
|
|
@@ -7461,7 +7920,9 @@ async function runEvaluatorList(options) {
|
|
|
7461
7920
|
judgeProvider,
|
|
7462
7921
|
agentTimeoutMs,
|
|
7463
7922
|
outputMessages,
|
|
7464
|
-
traceSummary
|
|
7923
|
+
traceSummary,
|
|
7924
|
+
targetResolver,
|
|
7925
|
+
availableTargets
|
|
7465
7926
|
} = options;
|
|
7466
7927
|
const scored = [];
|
|
7467
7928
|
const evaluatorResults = [];
|
|
@@ -7499,7 +7960,8 @@ async function runEvaluatorList(options) {
|
|
|
7499
7960
|
script: evaluator.script,
|
|
7500
7961
|
cwd: evaluator.resolvedCwd ?? evaluator.cwd,
|
|
7501
7962
|
agentTimeoutMs,
|
|
7502
|
-
config: evaluator.config
|
|
7963
|
+
config: evaluator.config,
|
|
7964
|
+
target: evaluator.target
|
|
7503
7965
|
});
|
|
7504
7966
|
const score2 = await codeEvaluator.evaluate({
|
|
7505
7967
|
evalCase,
|
|
@@ -7509,8 +7971,11 @@ async function runEvaluatorList(options) {
|
|
|
7509
7971
|
attempt,
|
|
7510
7972
|
promptInputs,
|
|
7511
7973
|
now,
|
|
7974
|
+
judgeProvider,
|
|
7512
7975
|
outputMessages,
|
|
7513
|
-
traceSummary
|
|
7976
|
+
traceSummary,
|
|
7977
|
+
targetResolver,
|
|
7978
|
+
availableTargets
|
|
7514
7979
|
});
|
|
7515
7980
|
const weight = evaluator.weight ?? 1;
|
|
7516
7981
|
scored.push({ score: score2, name: evaluator.name, type: "code_judge", weight });
|
|
@@ -7523,7 +7988,8 @@ async function runEvaluatorList(options) {
|
|
|
7523
7988
|
hits: score2.hits,
|
|
7524
7989
|
misses: score2.misses,
|
|
7525
7990
|
reasoning: score2.reasoning,
|
|
7526
|
-
evaluatorProviderRequest: score2.evaluatorRawRequest
|
|
7991
|
+
evaluatorProviderRequest: score2.evaluatorRawRequest,
|
|
7992
|
+
details: score2.details
|
|
7527
7993
|
});
|
|
7528
7994
|
}
|
|
7529
7995
|
if (evaluator.type === "composite") {
|
|
@@ -7537,7 +8003,8 @@ async function runEvaluatorList(options) {
|
|
|
7537
8003
|
script: memberConfig.script,
|
|
7538
8004
|
cwd: memberConfig.resolvedCwd ?? memberConfig.cwd,
|
|
7539
8005
|
agentTimeoutMs,
|
|
7540
|
-
config: memberConfig.config
|
|
8006
|
+
config: memberConfig.config,
|
|
8007
|
+
target: memberConfig.target
|
|
7541
8008
|
});
|
|
7542
8009
|
case "composite":
|
|
7543
8010
|
return new CompositeEvaluator({
|
|
@@ -7586,7 +8053,9 @@ async function runEvaluatorList(options) {
|
|
|
7586
8053
|
now,
|
|
7587
8054
|
judgeProvider,
|
|
7588
8055
|
outputMessages,
|
|
7589
|
-
traceSummary
|
|
8056
|
+
traceSummary,
|
|
8057
|
+
targetResolver,
|
|
8058
|
+
availableTargets
|
|
7590
8059
|
});
|
|
7591
8060
|
const weight = evaluator.weight ?? 1;
|
|
7592
8061
|
scored.push({ score: score2, name: evaluator.name, type: evaluator.type, weight });
|
|
@@ -7782,11 +8251,11 @@ async function runEvaluatorList(options) {
|
|
|
7782
8251
|
(total, entry) => total + (entry.score.expectedAspectCount ?? 0),
|
|
7783
8252
|
0
|
|
7784
8253
|
);
|
|
7785
|
-
const reasoningParts = scored.map((entry) => entry.score.reasoning ? `${entry.name}: ${entry.score.reasoning}` : void 0).filter(
|
|
8254
|
+
const reasoningParts = scored.map((entry) => entry.score.reasoning ? `${entry.name}: ${entry.score.reasoning}` : void 0).filter(isNonEmptyString);
|
|
7786
8255
|
const reasoning = reasoningParts.length > 0 ? reasoningParts.join(" | ") : void 0;
|
|
7787
8256
|
const score = {
|
|
7788
8257
|
score: aggregateScore,
|
|
7789
|
-
verdict:
|
|
8258
|
+
verdict: scoreToVerdict(aggregateScore),
|
|
7790
8259
|
hits,
|
|
7791
8260
|
misses,
|
|
7792
8261
|
expectedAspectCount,
|
|
@@ -7833,18 +8302,6 @@ async function resolveCustomPrompt(config) {
|
|
|
7833
8302
|
}
|
|
7834
8303
|
return config.prompt;
|
|
7835
8304
|
}
|
|
7836
|
-
/**
 * Report whether a value is a string containing at least one
 * non-whitespace character.
 *
 * @param {*} value - Value to inspect.
 * @returns {boolean} True only for strings whose trimmed length is > 0.
 */
function isNonEmptyString2(value) {
  if (typeof value !== "string") {
    return false;
  }
  const trimmed = value.trim();
  return trimmed.length > 0;
}
|
|
7839
|
-
/**
 * Map a numeric score onto a verdict label.
 *
 * @param {number} score - Score to classify.
 * @returns {"pass"|"borderline"|"fail"} "pass" at 0.8 and above,
 *   "borderline" at 0.6 and above, otherwise "fail" (including NaN).
 */
function scoreToVerdict2(score) {
  return score >= 0.8 ? "pass" : score >= 0.6 ? "borderline" : "fail";
}
|
|
7848
8305
|
function filterEvalCases(evalCases, evalId) {
|
|
7849
8306
|
if (!evalId) {
|
|
7850
8307
|
return evalCases;
|
|
@@ -7987,7 +8444,8 @@ function mapChildResults(children) {
|
|
|
7987
8444
|
misses: child.misses,
|
|
7988
8445
|
reasoning: child.reasoning,
|
|
7989
8446
|
evaluatorProviderRequest: child.evaluatorRawRequest,
|
|
7990
|
-
evaluatorResults: mapChildResults(child.evaluatorResults)
|
|
8447
|
+
evaluatorResults: mapChildResults(child.evaluatorResults),
|
|
8448
|
+
details: child.details
|
|
7991
8449
|
}));
|
|
7992
8450
|
}
|
|
7993
8451
|
function computeWeightedMean(entries) {
|
|
@@ -8002,7 +8460,7 @@ function computeWeightedMean(entries) {
|
|
|
8002
8460
|
}
|
|
8003
8461
|
|
|
8004
8462
|
// src/evaluation/generators/rubric-generator.ts
|
|
8005
|
-
import { generateText as
|
|
8463
|
+
import { generateText as generateText4 } from "ai";
|
|
8006
8464
|
import { z as z3 } from "zod";
|
|
8007
8465
|
var rubricItemSchema = z3.object({
|
|
8008
8466
|
id: z3.string().describe("Short identifier for this rubric (e.g., clarity, completeness)"),
|
|
@@ -8036,7 +8494,7 @@ You must return a valid JSON object matching this schema:
|
|
|
8036
8494
|
let lastError;
|
|
8037
8495
|
for (let attempt = 1; attempt <= 3; attempt++) {
|
|
8038
8496
|
try {
|
|
8039
|
-
const { text } = await
|
|
8497
|
+
const { text } = await generateText4({
|
|
8040
8498
|
model,
|
|
8041
8499
|
system,
|
|
8042
8500
|
prompt
|
|
@@ -8081,17 +8539,6 @@ function buildPrompt(expectedOutcome, question, referenceAnswer) {
|
|
|
8081
8539
|
return parts.join("\n");
|
|
8082
8540
|
}
|
|
8083
8541
|
|
|
8084
|
-
// src/evaluation/code-judge-sdk.ts
|
|
8085
|
-
import { readFileSync } from "node:fs";
|
|
8086
|
-
/**
 * Parse a JSON payload string and pass the result through `toCamelCaseDeep`.
 *
 * @param {string} payload - JSON text; malformed input makes JSON.parse throw.
 * @returns {*} Whatever `toCamelCaseDeep` returns for the parsed value.
 */
function parseCodeJudgePayload(payload) {
  return toCamelCaseDeep(JSON.parse(payload));
}
|
|
8090
|
-
/**
 * Read file descriptor 0 as UTF-8 text and parse it with
 * `parseCodeJudgePayload`.
 *
 * @returns {*} The value produced by `parseCodeJudgePayload`.
 */
function readCodeJudgePayload() {
  return parseCodeJudgePayload(readFileSync(0, "utf8"));
}
|
|
8094
|
-
|
|
8095
8542
|
// src/index.ts
|
|
8096
8543
|
function createAgentKernel() {
|
|
8097
8544
|
return { status: "stub" };
|
|
@@ -8109,33 +8556,39 @@ export {
|
|
|
8109
8556
|
ToolTrajectoryEvaluator,
|
|
8110
8557
|
avgToolDurationMs,
|
|
8111
8558
|
buildDirectoryChain,
|
|
8559
|
+
buildOutputSchema,
|
|
8112
8560
|
buildPromptInputs,
|
|
8113
8561
|
buildSearchRoots,
|
|
8562
|
+
clampScore,
|
|
8114
8563
|
computeTraceSummary,
|
|
8115
8564
|
consumeClaudeCodeLogEntries,
|
|
8116
8565
|
consumeCodexLogEntries,
|
|
8117
8566
|
consumePiLogEntries,
|
|
8118
8567
|
createAgentKernel,
|
|
8119
8568
|
createProvider,
|
|
8569
|
+
deepEqual,
|
|
8120
8570
|
ensureVSCodeSubagents,
|
|
8571
|
+
executeScript,
|
|
8121
8572
|
explorationRatio,
|
|
8122
|
-
|
|
8573
|
+
extractJsonBlob,
|
|
8123
8574
|
fileExists,
|
|
8124
8575
|
findGitRoot,
|
|
8576
|
+
freeformEvaluationSchema,
|
|
8125
8577
|
generateRubrics,
|
|
8126
8578
|
getHitCount,
|
|
8127
8579
|
isEvaluatorKind,
|
|
8128
8580
|
isGuidelineFile,
|
|
8129
8581
|
isJsonObject,
|
|
8130
8582
|
isJsonValue,
|
|
8583
|
+
isNonEmptyString,
|
|
8131
8584
|
isTestMessage,
|
|
8132
8585
|
isTestMessageRole,
|
|
8133
8586
|
listTargetNames,
|
|
8134
8587
|
loadEvalCases,
|
|
8135
8588
|
mergeExecutionMetrics,
|
|
8136
8589
|
normalizeLineEndings,
|
|
8137
|
-
|
|
8138
|
-
|
|
8590
|
+
parseJsonFromText,
|
|
8591
|
+
parseJsonSafe,
|
|
8139
8592
|
readJsonFile,
|
|
8140
8593
|
readTargetDefinitions,
|
|
8141
8594
|
readTestSuiteMetadata,
|
|
@@ -8145,6 +8598,7 @@ export {
|
|
|
8145
8598
|
resolveTargetDefinition,
|
|
8146
8599
|
runEvalCase,
|
|
8147
8600
|
runEvaluation,
|
|
8601
|
+
scoreToVerdict,
|
|
8148
8602
|
subscribeToClaudeCodeLogEntries,
|
|
8149
8603
|
subscribeToCodexLogEntries,
|
|
8150
8604
|
subscribeToPiLogEntries,
|