@agentv/core 2.12.0 → 2.14.0-next.1
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/dist/{chunk-7HPKTRFZ.js → chunk-N55K52OO.js} +15 -15
- package/dist/chunk-N55K52OO.js.map +1 -0
- package/dist/evaluation/validation/index.cjs +25 -24
- package/dist/evaluation/validation/index.cjs.map +1 -1
- package/dist/evaluation/validation/index.js +12 -11
- package/dist/evaluation/validation/index.js.map +1 -1
- package/dist/index.cjs +248 -160
- package/dist/index.cjs.map +1 -1
- package/dist/index.d.cts +58 -41
- package/dist/index.d.ts +58 -41
- package/dist/index.js +235 -148
- package/dist/index.js.map +1 -1
- package/package.json +2 -2
- package/dist/chunk-7HPKTRFZ.js.map +0 -1
package/dist/index.js
CHANGED
|
@@ -17,7 +17,7 @@ import {
|
|
|
17
17
|
readTextFile,
|
|
18
18
|
resolveFileReference,
|
|
19
19
|
resolveTargetDefinition
|
|
20
|
-
} from "./chunk-7HPKTRFZ.js";
|
|
20
|
+
} from "./chunk-N55K52OO.js";
|
|
21
21
|
import {
|
|
22
22
|
OtlpJsonFileExporter
|
|
23
23
|
} from "./chunk-HFSYZHGF.js";
|
|
@@ -396,6 +396,11 @@ async function loadConfig(evalFilePath, repoRoot) {
|
|
|
396
396
|
continue;
|
|
397
397
|
}
|
|
398
398
|
const config = parsed;
|
|
399
|
+
const requiredVersion = parsed.required_version;
|
|
400
|
+
if (requiredVersion !== void 0 && typeof requiredVersion !== "string") {
|
|
401
|
+
logWarning(`Invalid required_version in ${configPath}, expected string`);
|
|
402
|
+
continue;
|
|
403
|
+
}
|
|
399
404
|
const guidelinePatterns = config.guideline_patterns;
|
|
400
405
|
if (guidelinePatterns !== void 0 && !Array.isArray(guidelinePatterns)) {
|
|
401
406
|
logWarning(`Invalid guideline_patterns in ${configPath}, expected array`);
|
|
@@ -419,6 +424,7 @@ async function loadConfig(evalFilePath, repoRoot) {
|
|
|
419
424
|
configPath
|
|
420
425
|
);
|
|
421
426
|
return {
|
|
427
|
+
required_version: requiredVersion,
|
|
422
428
|
guideline_patterns: guidelinePatterns,
|
|
423
429
|
eval_patterns: evalPatterns,
|
|
424
430
|
execution: executionDefaults
|
|
@@ -562,6 +568,22 @@ function extractTotalBudgetUsd(suite) {
|
|
|
562
568
|
);
|
|
563
569
|
return void 0;
|
|
564
570
|
}
|
|
571
|
+
function extractFailOnError(suite) {
|
|
572
|
+
const execution = suite.execution;
|
|
573
|
+
if (!execution || typeof execution !== "object" || Array.isArray(execution)) {
|
|
574
|
+
return void 0;
|
|
575
|
+
}
|
|
576
|
+
const executionObj = execution;
|
|
577
|
+
const raw = executionObj.fail_on_error ?? executionObj.failOnError;
|
|
578
|
+
if (raw === void 0 || raw === null) {
|
|
579
|
+
return void 0;
|
|
580
|
+
}
|
|
581
|
+
if (typeof raw === "boolean") {
|
|
582
|
+
return raw;
|
|
583
|
+
}
|
|
584
|
+
logWarning(`Invalid execution.fail_on_error: ${raw}. Must be true or false. Ignoring.`);
|
|
585
|
+
return void 0;
|
|
586
|
+
}
|
|
565
587
|
function parseExecutionDefaults(raw, configPath) {
|
|
566
588
|
if (!raw || typeof raw !== "object" || Array.isArray(raw)) {
|
|
567
589
|
return void 0;
|
|
@@ -660,6 +682,9 @@ function validateTemplateVariables(content, source) {
|
|
|
660
682
|
// src/evaluation/loaders/evaluator-parser.ts
|
|
661
683
|
var ANSI_YELLOW4 = "\x1B[33m";
|
|
662
684
|
var ANSI_RESET4 = "\x1B[0m";
|
|
685
|
+
function normalizeEvaluatorType(type) {
|
|
686
|
+
return type.replace(/_/g, "-");
|
|
687
|
+
}
|
|
663
688
|
async function parseEvaluators(rawEvalCase, globalExecution, searchRoots, evalId) {
|
|
664
689
|
const execution = rawEvalCase.execution;
|
|
665
690
|
const executionObject = isJsonObject2(execution) ? execution : void 0;
|
|
@@ -690,7 +715,8 @@ async function parseEvaluatorList(candidateEvaluators, searchRoots, evalId) {
|
|
|
690
715
|
continue;
|
|
691
716
|
}
|
|
692
717
|
const rawName = asString(rawEvaluator.name);
|
|
693
|
-
const
|
|
718
|
+
const rawType = rawEvaluator.type;
|
|
719
|
+
const typeValue = typeof rawType === "string" ? normalizeEvaluatorType(rawType) : rawType;
|
|
694
720
|
const isCustomType = typeof typeValue === "string" && !isEvaluatorKind(typeValue);
|
|
695
721
|
if (typeof typeValue !== "string") {
|
|
696
722
|
logWarning2(`Skipping evaluator with invalid type in '${evalId}'`);
|
|
@@ -723,25 +749,25 @@ async function parseEvaluatorList(candidateEvaluators, searchRoots, evalId) {
|
|
|
723
749
|
});
|
|
724
750
|
continue;
|
|
725
751
|
}
|
|
726
|
-
if (typeValue === "
|
|
752
|
+
if (typeValue === "code-judge") {
|
|
727
753
|
let command;
|
|
728
754
|
const rawCommand = rawEvaluator.command ?? rawEvaluator.script;
|
|
729
755
|
if (typeof rawCommand === "string") {
|
|
730
756
|
const trimmed = rawCommand.trim();
|
|
731
757
|
if (trimmed.length === 0) {
|
|
732
758
|
throw new Error(
|
|
733
|
-
`Invalid
|
|
759
|
+
`Invalid code-judge command for evaluator '${name}' in '${evalId}': command cannot be empty`
|
|
734
760
|
);
|
|
735
761
|
}
|
|
736
762
|
command = parseCommandToArgv(trimmed);
|
|
737
763
|
} else {
|
|
738
764
|
command = asStringArray(
|
|
739
765
|
rawCommand,
|
|
740
|
-
`
|
|
766
|
+
`code-judge command for evaluator '${name}' in '${evalId}'`
|
|
741
767
|
);
|
|
742
768
|
}
|
|
743
769
|
if (!command) {
|
|
744
|
-
logWarning2(`Skipping
|
|
770
|
+
logWarning2(`Skipping code-judge evaluator '${name}' in '${evalId}': missing command`);
|
|
745
771
|
continue;
|
|
746
772
|
}
|
|
747
773
|
const weight2 = validateWeight(rawEvaluator.weight, name, evalId);
|
|
@@ -802,7 +828,7 @@ async function parseEvaluatorList(candidateEvaluators, searchRoots, evalId) {
|
|
|
802
828
|
}
|
|
803
829
|
evaluators.push({
|
|
804
830
|
name,
|
|
805
|
-
type: "code",
|
|
831
|
+
type: "code-judge",
|
|
806
832
|
command,
|
|
807
833
|
cwd,
|
|
808
834
|
resolvedCwd,
|
|
@@ -828,7 +854,7 @@ async function parseEvaluatorList(candidateEvaluators, searchRoots, evalId) {
|
|
|
828
854
|
continue;
|
|
829
855
|
}
|
|
830
856
|
const aggregatorType = asString(rawAggregator.type);
|
|
831
|
-
if (aggregatorType !== "weighted_average" && aggregatorType !== "
|
|
857
|
+
if (aggregatorType !== "weighted_average" && aggregatorType !== "code-judge" && aggregatorType !== "llm-judge" && aggregatorType !== "threshold") {
|
|
832
858
|
logWarning2(
|
|
833
859
|
`Skipping composite evaluator '${name}' in '${evalId}': invalid aggregator type '${aggregatorType}'`
|
|
834
860
|
);
|
|
@@ -877,16 +903,16 @@ async function parseEvaluatorList(candidateEvaluators, searchRoots, evalId) {
|
|
|
877
903
|
type: "weighted_average",
|
|
878
904
|
...Object.keys(parsedWeights).length > 0 ? { weights: parsedWeights } : {}
|
|
879
905
|
};
|
|
880
|
-
} else if (aggregatorType === "
|
|
906
|
+
} else if (aggregatorType === "code-judge") {
|
|
881
907
|
const aggregatorPath = asString(rawAggregator.path);
|
|
882
908
|
if (!aggregatorPath) {
|
|
883
909
|
logWarning2(
|
|
884
|
-
`Skipping composite evaluator '${name}' in '${evalId}':
|
|
910
|
+
`Skipping composite evaluator '${name}' in '${evalId}': code-judge aggregator missing path`
|
|
885
911
|
);
|
|
886
912
|
continue;
|
|
887
913
|
}
|
|
888
914
|
aggregator = {
|
|
889
|
-
type: "
|
|
915
|
+
type: "code-judge",
|
|
890
916
|
path: aggregatorPath,
|
|
891
917
|
cwd: searchRoots[0]
|
|
892
918
|
};
|
|
@@ -912,7 +938,7 @@ async function parseEvaluatorList(candidateEvaluators, searchRoots, evalId) {
|
|
|
912
938
|
}
|
|
913
939
|
}
|
|
914
940
|
aggregator = {
|
|
915
|
-
type: "
|
|
941
|
+
type: "llm-judge",
|
|
916
942
|
...aggregatorPrompt ? { prompt: aggregatorPrompt } : {},
|
|
917
943
|
...promptPath2 ? { promptPath: promptPath2 } : {}
|
|
918
944
|
};
|
|
@@ -930,11 +956,11 @@ async function parseEvaluatorList(candidateEvaluators, searchRoots, evalId) {
|
|
|
930
956
|
});
|
|
931
957
|
continue;
|
|
932
958
|
}
|
|
933
|
-
if (typeValue === "
|
|
959
|
+
if (typeValue === "tool-trajectory") {
|
|
934
960
|
const mode = asString(rawEvaluator.mode);
|
|
935
961
|
if (mode !== "any_order" && mode !== "in_order" && mode !== "exact" && mode !== "subset" && mode !== "superset") {
|
|
936
962
|
logWarning2(
|
|
937
|
-
`Skipping
|
|
963
|
+
`Skipping tool-trajectory evaluator '${name}' in '${evalId}': invalid mode '${mode}' (must be any_order, in_order, exact, subset, or superset)`
|
|
938
964
|
);
|
|
939
965
|
continue;
|
|
940
966
|
}
|
|
@@ -943,7 +969,7 @@ async function parseEvaluatorList(candidateEvaluators, searchRoots, evalId) {
|
|
|
943
969
|
if (rawMinimums !== void 0) {
|
|
944
970
|
if (!isJsonObject2(rawMinimums)) {
|
|
945
971
|
logWarning2(
|
|
946
|
-
`Skipping
|
|
972
|
+
`Skipping tool-trajectory evaluator '${name}' in '${evalId}': minimums must be an object`
|
|
947
973
|
);
|
|
948
974
|
continue;
|
|
949
975
|
}
|
|
@@ -969,7 +995,7 @@ async function parseEvaluatorList(candidateEvaluators, searchRoots, evalId) {
|
|
|
969
995
|
argsMatch2 = rawArgsMatch;
|
|
970
996
|
} else {
|
|
971
997
|
logWarning2(
|
|
972
|
-
`Invalid args_match '${rawArgsMatch}' for
|
|
998
|
+
`Invalid args_match '${rawArgsMatch}' for tool-trajectory evaluator '${name}' in '${evalId}': must be exact, superset, subset, ignore, or a string array`
|
|
973
999
|
);
|
|
974
1000
|
}
|
|
975
1001
|
}
|
|
@@ -979,7 +1005,7 @@ async function parseEvaluatorList(candidateEvaluators, searchRoots, evalId) {
|
|
|
979
1005
|
if (rawExpected !== void 0) {
|
|
980
1006
|
if (!Array.isArray(rawExpected)) {
|
|
981
1007
|
logWarning2(
|
|
982
|
-
`Skipping
|
|
1008
|
+
`Skipping tool-trajectory evaluator '${name}' in '${evalId}': expected must be an array`
|
|
983
1009
|
);
|
|
984
1010
|
continue;
|
|
985
1011
|
}
|
|
@@ -1025,13 +1051,13 @@ async function parseEvaluatorList(candidateEvaluators, searchRoots, evalId) {
|
|
|
1025
1051
|
}
|
|
1026
1052
|
if (mode === "any_order" && !minimums) {
|
|
1027
1053
|
logWarning2(
|
|
1028
|
-
`Skipping
|
|
1054
|
+
`Skipping tool-trajectory evaluator '${name}' in '${evalId}': any_order mode requires minimums`
|
|
1029
1055
|
);
|
|
1030
1056
|
continue;
|
|
1031
1057
|
}
|
|
1032
1058
|
if ((mode === "in_order" || mode === "exact" || mode === "subset" || mode === "superset") && !expected) {
|
|
1033
1059
|
logWarning2(
|
|
1034
|
-
`Skipping
|
|
1060
|
+
`Skipping tool-trajectory evaluator '${name}' in '${evalId}': ${mode} mode requires expected`
|
|
1035
1061
|
);
|
|
1036
1062
|
continue;
|
|
1037
1063
|
}
|
|
@@ -1039,7 +1065,7 @@ async function parseEvaluatorList(candidateEvaluators, searchRoots, evalId) {
|
|
|
1039
1065
|
const required2 = parseRequired(rawEvaluator.required);
|
|
1040
1066
|
const config2 = {
|
|
1041
1067
|
name,
|
|
1042
|
-
type: "
|
|
1068
|
+
type: "tool-trajectory",
|
|
1043
1069
|
mode,
|
|
1044
1070
|
...minimums ? { minimums } : {},
|
|
1045
1071
|
...expected ? { expected } : {},
|
|
@@ -1051,17 +1077,17 @@ async function parseEvaluatorList(candidateEvaluators, searchRoots, evalId) {
|
|
|
1051
1077
|
evaluators.push(config2);
|
|
1052
1078
|
continue;
|
|
1053
1079
|
}
|
|
1054
|
-
if (typeValue === "
|
|
1080
|
+
if (typeValue === "field-accuracy") {
|
|
1055
1081
|
const rawFields = rawEvaluator.fields;
|
|
1056
1082
|
if (!Array.isArray(rawFields)) {
|
|
1057
1083
|
logWarning2(
|
|
1058
|
-
`Skipping
|
|
1084
|
+
`Skipping field-accuracy evaluator '${name}' in '${evalId}': missing fields array`
|
|
1059
1085
|
);
|
|
1060
1086
|
continue;
|
|
1061
1087
|
}
|
|
1062
1088
|
if (rawFields.length === 0) {
|
|
1063
1089
|
logWarning2(
|
|
1064
|
-
`Skipping
|
|
1090
|
+
`Skipping field-accuracy evaluator '${name}' in '${evalId}': fields array is empty`
|
|
1065
1091
|
);
|
|
1066
1092
|
continue;
|
|
1067
1093
|
}
|
|
@@ -1069,7 +1095,7 @@ async function parseEvaluatorList(candidateEvaluators, searchRoots, evalId) {
|
|
|
1069
1095
|
for (const rawField of rawFields) {
|
|
1070
1096
|
if (!isJsonObject2(rawField)) {
|
|
1071
1097
|
logWarning2(
|
|
1072
|
-
`Skipping invalid field entry in
|
|
1098
|
+
`Skipping invalid field entry in field-accuracy evaluator '${name}' (expected object)`
|
|
1073
1099
|
);
|
|
1074
1100
|
continue;
|
|
1075
1101
|
}
|
|
@@ -1077,13 +1103,13 @@ async function parseEvaluatorList(candidateEvaluators, searchRoots, evalId) {
|
|
|
1077
1103
|
const match = asString(rawField.match);
|
|
1078
1104
|
if (!fieldPath) {
|
|
1079
1105
|
logWarning2(
|
|
1080
|
-
`Skipping field without path in
|
|
1106
|
+
`Skipping field without path in field-accuracy evaluator '${name}' in '${evalId}'`
|
|
1081
1107
|
);
|
|
1082
1108
|
continue;
|
|
1083
1109
|
}
|
|
1084
1110
|
if (!match || !isValidFieldMatchType(match)) {
|
|
1085
1111
|
logWarning2(
|
|
1086
|
-
`Skipping field '${fieldPath}' with invalid match type '${match}' in evaluator '${name}' (must be exact, numeric_tolerance, or date). For fuzzy matching, use a
|
|
1112
|
+
`Skipping field '${fieldPath}' with invalid match type '${match}' in evaluator '${name}' (must be exact, numeric_tolerance, or date). For fuzzy matching, use a code-judge evaluator.`
|
|
1087
1113
|
);
|
|
1088
1114
|
continue;
|
|
1089
1115
|
}
|
|
@@ -1100,7 +1126,7 @@ async function parseEvaluatorList(candidateEvaluators, searchRoots, evalId) {
|
|
|
1100
1126
|
}
|
|
1101
1127
|
if (fields.length === 0) {
|
|
1102
1128
|
logWarning2(
|
|
1103
|
-
`Skipping
|
|
1129
|
+
`Skipping field-accuracy evaluator '${name}' in '${evalId}': no valid fields found`
|
|
1104
1130
|
);
|
|
1105
1131
|
continue;
|
|
1106
1132
|
}
|
|
@@ -1110,7 +1136,7 @@ async function parseEvaluatorList(candidateEvaluators, searchRoots, evalId) {
|
|
|
1110
1136
|
const required2 = parseRequired(rawEvaluator.required);
|
|
1111
1137
|
evaluators.push({
|
|
1112
1138
|
name,
|
|
1113
|
-
type: "
|
|
1139
|
+
type: "field-accuracy",
|
|
1114
1140
|
fields,
|
|
1115
1141
|
...validAggregation ? { aggregation: validAggregation } : {},
|
|
1116
1142
|
...weight2 !== void 0 ? { weight: weight2 } : {},
|
|
@@ -1159,7 +1185,7 @@ async function parseEvaluatorList(candidateEvaluators, searchRoots, evalId) {
|
|
|
1159
1185
|
});
|
|
1160
1186
|
continue;
|
|
1161
1187
|
}
|
|
1162
|
-
if (typeValue === "
|
|
1188
|
+
if (typeValue === "token-usage") {
|
|
1163
1189
|
const maxTotal = rawEvaluator.max_total ?? rawEvaluator.maxTotal;
|
|
1164
1190
|
const maxInput = rawEvaluator.max_input ?? rawEvaluator.maxInput;
|
|
1165
1191
|
const maxOutput = rawEvaluator.max_output ?? rawEvaluator.maxOutput;
|
|
@@ -1173,7 +1199,7 @@ async function parseEvaluatorList(candidateEvaluators, searchRoots, evalId) {
|
|
|
1173
1199
|
if (raw === void 0) continue;
|
|
1174
1200
|
if (typeof raw !== "number" || !Number.isFinite(raw) || raw < 0) {
|
|
1175
1201
|
logWarning2(
|
|
1176
|
-
`Skipping
|
|
1202
|
+
`Skipping token-usage evaluator '${name}' in '${evalId}': ${key} must be a non-negative finite number`
|
|
1177
1203
|
);
|
|
1178
1204
|
continue;
|
|
1179
1205
|
}
|
|
@@ -1181,7 +1207,7 @@ async function parseEvaluatorList(candidateEvaluators, searchRoots, evalId) {
|
|
|
1181
1207
|
}
|
|
1182
1208
|
if (validLimits.max_total === void 0 && validLimits.max_input === void 0 && validLimits.max_output === void 0) {
|
|
1183
1209
|
logWarning2(
|
|
1184
|
-
`Skipping
|
|
1210
|
+
`Skipping token-usage evaluator '${name}' in '${evalId}': must set at least one of max_total, max_input, max_output`
|
|
1185
1211
|
);
|
|
1186
1212
|
continue;
|
|
1187
1213
|
}
|
|
@@ -1189,7 +1215,7 @@ async function parseEvaluatorList(candidateEvaluators, searchRoots, evalId) {
|
|
|
1189
1215
|
const required2 = parseRequired(rawEvaluator.required);
|
|
1190
1216
|
evaluators.push({
|
|
1191
1217
|
name,
|
|
1192
|
-
type: "
|
|
1218
|
+
type: "token-usage",
|
|
1193
1219
|
...validLimits,
|
|
1194
1220
|
...weight2 !== void 0 ? { weight: weight2 } : {},
|
|
1195
1221
|
...required2 !== void 0 ? { required: required2 } : {},
|
|
@@ -1197,7 +1223,7 @@ async function parseEvaluatorList(candidateEvaluators, searchRoots, evalId) {
|
|
|
1197
1223
|
});
|
|
1198
1224
|
continue;
|
|
1199
1225
|
}
|
|
1200
|
-
if (typeValue === "
|
|
1226
|
+
if (typeValue === "execution-metrics") {
|
|
1201
1227
|
const maxToolCalls = rawEvaluator.max_tool_calls ?? rawEvaluator.maxToolCalls;
|
|
1202
1228
|
const maxLlmCalls = rawEvaluator.max_llm_calls ?? rawEvaluator.maxLlmCalls;
|
|
1203
1229
|
const maxTokens = rawEvaluator.max_tokens ?? rawEvaluator.maxTokens;
|
|
@@ -1220,7 +1246,7 @@ async function parseEvaluatorList(candidateEvaluators, searchRoots, evalId) {
|
|
|
1220
1246
|
if (raw === void 0) continue;
|
|
1221
1247
|
if (typeof raw !== "number" || !Number.isFinite(raw) || raw < 0) {
|
|
1222
1248
|
logWarning2(
|
|
1223
|
-
`Skipping
|
|
1249
|
+
`Skipping execution-metrics evaluator '${name}' in '${evalId}': ${key} must be a non-negative finite number`
|
|
1224
1250
|
);
|
|
1225
1251
|
hasError = true;
|
|
1226
1252
|
break;
|
|
@@ -1233,7 +1259,7 @@ async function parseEvaluatorList(candidateEvaluators, searchRoots, evalId) {
|
|
|
1233
1259
|
const hasThreshold = validThresholds.max_tool_calls !== void 0 || validThresholds.max_llm_calls !== void 0 || validThresholds.max_tokens !== void 0 || validThresholds.max_cost_usd !== void 0 || validThresholds.max_duration_ms !== void 0 || validThresholds.target_exploration_ratio !== void 0;
|
|
1234
1260
|
if (!hasThreshold) {
|
|
1235
1261
|
logWarning2(
|
|
1236
|
-
`Skipping
|
|
1262
|
+
`Skipping execution-metrics evaluator '${name}' in '${evalId}': must set at least one threshold (max_tool_calls, max_llm_calls, max_tokens, max_cost_usd, max_duration_ms, or target_exploration_ratio)`
|
|
1237
1263
|
);
|
|
1238
1264
|
continue;
|
|
1239
1265
|
}
|
|
@@ -1241,7 +1267,7 @@ async function parseEvaluatorList(candidateEvaluators, searchRoots, evalId) {
|
|
|
1241
1267
|
const required2 = parseRequired(rawEvaluator.required);
|
|
1242
1268
|
evaluators.push({
|
|
1243
1269
|
name,
|
|
1244
|
-
type: "
|
|
1270
|
+
type: "execution-metrics",
|
|
1245
1271
|
...validThresholds,
|
|
1246
1272
|
...weight2 !== void 0 ? { weight: weight2 } : {},
|
|
1247
1273
|
...required2 !== void 0 ? { required: required2 } : {},
|
|
@@ -1249,13 +1275,13 @@ async function parseEvaluatorList(candidateEvaluators, searchRoots, evalId) {
|
|
|
1249
1275
|
});
|
|
1250
1276
|
continue;
|
|
1251
1277
|
}
|
|
1252
|
-
if (typeValue === "
|
|
1278
|
+
if (typeValue === "agent-judge") {
|
|
1253
1279
|
const rawMaxSteps = rawEvaluator.max_steps ?? rawEvaluator.maxSteps;
|
|
1254
1280
|
let maxSteps;
|
|
1255
1281
|
if (rawMaxSteps !== void 0) {
|
|
1256
1282
|
if (typeof rawMaxSteps !== "number" || !Number.isInteger(rawMaxSteps) || rawMaxSteps < 1 || rawMaxSteps > 50) {
|
|
1257
1283
|
logWarning2(
|
|
1258
|
-
`Skipping
|
|
1284
|
+
`Skipping agent-judge evaluator '${name}' in '${evalId}': max_steps must be an integer 1-50`
|
|
1259
1285
|
);
|
|
1260
1286
|
continue;
|
|
1261
1287
|
}
|
|
@@ -1266,7 +1292,7 @@ async function parseEvaluatorList(candidateEvaluators, searchRoots, evalId) {
|
|
|
1266
1292
|
if (rawTemperature !== void 0) {
|
|
1267
1293
|
if (typeof rawTemperature !== "number" || rawTemperature < 0 || rawTemperature > 2) {
|
|
1268
1294
|
logWarning2(
|
|
1269
|
-
`Skipping
|
|
1295
|
+
`Skipping agent-judge evaluator '${name}' in '${evalId}': temperature must be a number 0-2`
|
|
1270
1296
|
);
|
|
1271
1297
|
continue;
|
|
1272
1298
|
}
|
|
@@ -1289,7 +1315,7 @@ async function parseEvaluatorList(candidateEvaluators, searchRoots, evalId) {
|
|
|
1289
1315
|
const required2 = parseRequired(rawEvaluator.required);
|
|
1290
1316
|
evaluators.push({
|
|
1291
1317
|
name,
|
|
1292
|
-
type: "
|
|
1318
|
+
type: "agent-judge",
|
|
1293
1319
|
...agentPrompt ? { prompt: agentPrompt } : {},
|
|
1294
1320
|
...agentPromptPath ? { promptPath: agentPromptPath, resolvedPromptPath: agentPromptPath } : {},
|
|
1295
1321
|
...agentParsedRubrics && agentParsedRubrics.length > 0 ? { rubrics: agentParsedRubrics } : {},
|
|
@@ -1320,7 +1346,7 @@ async function parseEvaluatorList(candidateEvaluators, searchRoots, evalId) {
|
|
|
1320
1346
|
});
|
|
1321
1347
|
continue;
|
|
1322
1348
|
}
|
|
1323
|
-
if (typeValue === "
|
|
1349
|
+
if (typeValue === "contains-any" || typeValue === "contains-all") {
|
|
1324
1350
|
const value = asStringArrayStrict(rawEvaluator.value);
|
|
1325
1351
|
if (!value || value.length === 0) {
|
|
1326
1352
|
logWarning2(
|
|
@@ -1358,7 +1384,7 @@ async function parseEvaluatorList(candidateEvaluators, searchRoots, evalId) {
|
|
|
1358
1384
|
});
|
|
1359
1385
|
continue;
|
|
1360
1386
|
}
|
|
1361
|
-
if (typeValue === "
|
|
1387
|
+
if (typeValue === "icontains-any" || typeValue === "icontains-all") {
|
|
1362
1388
|
const value = asStringArrayStrict(rawEvaluator.value);
|
|
1363
1389
|
if (!value || value.length === 0) {
|
|
1364
1390
|
logWarning2(
|
|
@@ -1378,7 +1404,7 @@ async function parseEvaluatorList(candidateEvaluators, searchRoots, evalId) {
|
|
|
1378
1404
|
});
|
|
1379
1405
|
continue;
|
|
1380
1406
|
}
|
|
1381
|
-
if (typeValue === "
|
|
1407
|
+
if (typeValue === "starts-with" || typeValue === "ends-with") {
|
|
1382
1408
|
const value = asString(rawEvaluator.value);
|
|
1383
1409
|
if (!value) {
|
|
1384
1410
|
logWarning2(`Skipping ${typeValue} evaluator '${name}' in '${evalId}': missing value`);
|
|
@@ -1416,12 +1442,12 @@ async function parseEvaluatorList(candidateEvaluators, searchRoots, evalId) {
|
|
|
1416
1442
|
});
|
|
1417
1443
|
continue;
|
|
1418
1444
|
}
|
|
1419
|
-
if (typeValue === "
|
|
1445
|
+
if (typeValue === "is-json") {
|
|
1420
1446
|
const weight2 = validateWeight(rawEvaluator.weight, name, evalId);
|
|
1421
1447
|
const required2 = parseRequired(rawEvaluator.required);
|
|
1422
1448
|
evaluators.push({
|
|
1423
1449
|
name,
|
|
1424
|
-
type: "
|
|
1450
|
+
type: "is-json",
|
|
1425
1451
|
...weight2 !== void 0 ? { weight: weight2 } : {},
|
|
1426
1452
|
...required2 !== void 0 ? { required: required2 } : {},
|
|
1427
1453
|
...negate !== void 0 ? { negate } : {}
|
|
@@ -1469,7 +1495,7 @@ async function parseEvaluatorList(candidateEvaluators, searchRoots, evalId) {
|
|
|
1469
1495
|
const required2 = parseRequired(rawEvaluator.required);
|
|
1470
1496
|
evaluators.push({
|
|
1471
1497
|
name,
|
|
1472
|
-
type: "
|
|
1498
|
+
type: "llm-judge",
|
|
1473
1499
|
rubrics: parsedCriteria,
|
|
1474
1500
|
...weight2 !== void 0 ? { weight: weight2 } : {},
|
|
1475
1501
|
...required2 !== void 0 ? { required: required2 } : {},
|
|
@@ -1536,7 +1562,7 @@ async function parseEvaluatorList(candidateEvaluators, searchRoots, evalId) {
|
|
|
1536
1562
|
const required2 = parseRequired(rawEvaluator.required);
|
|
1537
1563
|
evaluators.push({
|
|
1538
1564
|
name,
|
|
1539
|
-
type: "
|
|
1565
|
+
type: "llm-judge",
|
|
1540
1566
|
rubrics: parsedRubrics,
|
|
1541
1567
|
...weight2 !== void 0 ? { weight: weight2 } : {},
|
|
1542
1568
|
...required2 !== void 0 ? { required: required2 } : {},
|
|
@@ -1568,7 +1594,7 @@ async function parseEvaluatorList(candidateEvaluators, searchRoots, evalId) {
|
|
|
1568
1594
|
const finalConfig = promptScriptConfig ?? (Object.keys(mergedConfig).length > 0 ? mergedConfig : void 0);
|
|
1569
1595
|
evaluators.push({
|
|
1570
1596
|
name,
|
|
1571
|
-
type: "
|
|
1597
|
+
type: "llm-judge",
|
|
1572
1598
|
prompt,
|
|
1573
1599
|
promptPath,
|
|
1574
1600
|
...promptPath ? { resolvedPromptPath: promptPath } : {},
|
|
@@ -1584,15 +1610,15 @@ async function parseEvaluatorList(candidateEvaluators, searchRoots, evalId) {
|
|
|
1584
1610
|
}
|
|
1585
1611
|
var ASSERTION_TYPES = /* @__PURE__ */ new Set([
|
|
1586
1612
|
"contains",
|
|
1587
|
-
"
|
|
1588
|
-
"
|
|
1613
|
+
"contains-any",
|
|
1614
|
+
"contains-all",
|
|
1589
1615
|
"icontains",
|
|
1590
|
-
"
|
|
1591
|
-
"
|
|
1592
|
-
"
|
|
1593
|
-
"
|
|
1616
|
+
"icontains-any",
|
|
1617
|
+
"icontains-all",
|
|
1618
|
+
"starts-with",
|
|
1619
|
+
"ends-with",
|
|
1594
1620
|
"regex",
|
|
1595
|
-
"
|
|
1621
|
+
"is-json",
|
|
1596
1622
|
"equals",
|
|
1597
1623
|
"rubrics"
|
|
1598
1624
|
]);
|
|
@@ -1605,24 +1631,24 @@ function generateAssertionName(typeValue, rawEvaluator) {
|
|
|
1605
1631
|
switch (typeValue) {
|
|
1606
1632
|
case "contains":
|
|
1607
1633
|
return value ? `contains-${value}` : "contains";
|
|
1608
|
-
case "
|
|
1609
|
-
return arrayValue ? `
|
|
1610
|
-
case "
|
|
1611
|
-
return arrayValue ? `
|
|
1634
|
+
case "contains-any":
|
|
1635
|
+
return arrayValue ? `contains-any-${arrayValue.length}` : "contains-any";
|
|
1636
|
+
case "contains-all":
|
|
1637
|
+
return arrayValue ? `contains-all-${arrayValue.length}` : "contains-all";
|
|
1612
1638
|
case "icontains":
|
|
1613
1639
|
return value ? `icontains-${value}` : "icontains";
|
|
1614
|
-
case "
|
|
1615
|
-
return arrayValue ? `
|
|
1616
|
-
case "
|
|
1617
|
-
return arrayValue ? `
|
|
1618
|
-
case "
|
|
1619
|
-
return value ? `
|
|
1620
|
-
case "
|
|
1621
|
-
return value ? `
|
|
1640
|
+
case "icontains-any":
|
|
1641
|
+
return arrayValue ? `icontains-any-${arrayValue.length}` : "icontains-any";
|
|
1642
|
+
case "icontains-all":
|
|
1643
|
+
return arrayValue ? `icontains-all-${arrayValue.length}` : "icontains-all";
|
|
1644
|
+
case "starts-with":
|
|
1645
|
+
return value ? `starts-with-${value}` : "starts-with";
|
|
1646
|
+
case "ends-with":
|
|
1647
|
+
return value ? `ends-with-${value}` : "ends-with";
|
|
1622
1648
|
case "regex":
|
|
1623
1649
|
return value ? `regex-${value.length > 30 ? value.slice(0, 30) : value}` : "regex";
|
|
1624
|
-
case "
|
|
1625
|
-
return "
|
|
1650
|
+
case "is-json":
|
|
1651
|
+
return "is-json";
|
|
1626
1652
|
case "equals":
|
|
1627
1653
|
return value ? `equals-${value}` : "equals";
|
|
1628
1654
|
case "rubrics":
|
|
@@ -1635,8 +1661,9 @@ function coerceEvaluator(candidate, contextId) {
|
|
|
1635
1661
|
if (typeof candidate !== "string") {
|
|
1636
1662
|
return void 0;
|
|
1637
1663
|
}
|
|
1638
|
-
|
|
1639
|
-
|
|
1664
|
+
const normalized = normalizeEvaluatorType(candidate);
|
|
1665
|
+
if (isEvaluatorKind(normalized)) {
|
|
1666
|
+
return normalized;
|
|
1640
1667
|
}
|
|
1641
1668
|
logWarning2(`Unknown evaluator '${candidate}' in ${contextId}, falling back to default`);
|
|
1642
1669
|
return void 0;
|
|
@@ -1682,6 +1709,16 @@ function parseCommandToArgv(command) {
|
|
|
1682
1709
|
function isJsonObject2(value) {
|
|
1683
1710
|
return typeof value === "object" && value !== null && !Array.isArray(value);
|
|
1684
1711
|
}
|
|
1712
|
+
var CRITERIA_CONSUMER_TYPES = /* @__PURE__ */ new Set(["llm-judge", "agent-judge", "code-judge"]);
|
|
1713
|
+
function warnUnconsumedCriteria(criteria, evaluators, testId) {
|
|
1714
|
+
if (!criteria?.trim() || !evaluators || evaluators.length === 0) return;
|
|
1715
|
+
const hasConsumer = evaluators.some((e) => CRITERIA_CONSUMER_TYPES.has(e.type));
|
|
1716
|
+
if (!hasConsumer) {
|
|
1717
|
+
logWarning2(
|
|
1718
|
+
`Test '${testId}': criteria is defined but no evaluator in assert will evaluate it. Add 'type: llm-judge' to assert, or remove criteria if it is documentation-only.`
|
|
1719
|
+
);
|
|
1720
|
+
}
|
|
1721
|
+
}
|
|
1685
1722
|
function logWarning2(message, details) {
|
|
1686
1723
|
if (details && details.length > 0) {
|
|
1687
1724
|
const detailBlock = details.join("\n");
|
|
@@ -1931,7 +1968,7 @@ function parseInlineRubrics(rawRubrics) {
|
|
|
1931
1968
|
}
|
|
1932
1969
|
return {
|
|
1933
1970
|
name: "rubric",
|
|
1934
|
-
type: "
|
|
1971
|
+
type: "llm-judge",
|
|
1935
1972
|
rubrics: rubricItems
|
|
1936
1973
|
};
|
|
1937
1974
|
}
|
|
@@ -2316,7 +2353,7 @@ async function loadTestsFromJsonl(evalFilePath, repoRoot, options) {
|
|
|
2316
2353
|
const rawCases = parseJsonlContent(rawFile, evalFilePath);
|
|
2317
2354
|
const fallbackDataset = path6.basename(absoluteTestPath, ".jsonl") || "eval";
|
|
2318
2355
|
const datasetName = sidecar.dataset && sidecar.dataset.trim().length > 0 ? sidecar.dataset : fallbackDataset;
|
|
2319
|
-
const globalEvaluator = coerceEvaluator(sidecar.evaluator, "sidecar") ?? "
|
|
2356
|
+
const globalEvaluator = coerceEvaluator(sidecar.evaluator, "sidecar") ?? "llm-judge";
|
|
2320
2357
|
const globalExecution = sidecar.execution;
|
|
2321
2358
|
if (verbose) {
|
|
2322
2359
|
console.log(`
|
|
@@ -2404,6 +2441,7 @@ async function loadTestsFromJsonl(evalFilePath, repoRoot, options) {
|
|
|
2404
2441
|
evaluators = evaluators ? [rubricEvaluator, ...evaluators] : [rubricEvaluator];
|
|
2405
2442
|
}
|
|
2406
2443
|
}
|
|
2444
|
+
warnUnconsumedCriteria(outcome, evaluators, id ?? "unknown");
|
|
2407
2445
|
const userFilePaths = [];
|
|
2408
2446
|
for (const segment of inputSegments) {
|
|
2409
2447
|
if (segment.type === "file" && typeof segment.resolvedPath === "string") {
|
|
@@ -2757,13 +2795,15 @@ async function loadTestSuite(evalFilePath, repoRoot, options) {
|
|
|
2757
2795
|
}
|
|
2758
2796
|
const { tests, parsed } = await loadTestsFromYaml(evalFilePath, repoRoot, options);
|
|
2759
2797
|
const metadata = parseMetadata(parsed);
|
|
2798
|
+
const failOnError = extractFailOnError(parsed);
|
|
2760
2799
|
return {
|
|
2761
2800
|
tests,
|
|
2762
2801
|
trials: extractTrialsConfig(parsed),
|
|
2763
2802
|
targets: extractTargetsFromSuite(parsed),
|
|
2764
2803
|
cacheConfig: extractCacheConfig(parsed),
|
|
2765
2804
|
totalBudgetUsd: extractTotalBudgetUsd(parsed),
|
|
2766
|
-
...metadata !== void 0 && { metadata }
|
|
2805
|
+
...metadata !== void 0 && { metadata },
|
|
2806
|
+
...failOnError !== void 0 && { failOnError }
|
|
2767
2807
|
};
|
|
2768
2808
|
}
|
|
2769
2809
|
var loadEvalSuite = loadTestSuite;
|
|
@@ -2794,7 +2834,7 @@ async function loadTestsFromYaml(evalFilePath, repoRoot, options) {
|
|
|
2794
2834
|
const fallbackDataset = path8.basename(absoluteTestPath).replace(/\.ya?ml$/i, "") || "eval";
|
|
2795
2835
|
const datasetName = datasetNameFromSuite && datasetNameFromSuite.length > 0 ? datasetNameFromSuite : fallbackDataset;
|
|
2796
2836
|
const rawTestcases = resolveTests(suite);
|
|
2797
|
-
const globalEvaluator = coerceEvaluator(suite.evaluator, "global") ?? "
|
|
2837
|
+
const globalEvaluator = coerceEvaluator(suite.evaluator, "global") ?? "llm-judge";
|
|
2798
2838
|
const evalFileDir = path8.dirname(absoluteTestPath);
|
|
2799
2839
|
let expandedTestcases;
|
|
2800
2840
|
if (typeof rawTestcases === "string") {
|
|
@@ -2891,6 +2931,7 @@ async function loadTestsFromYaml(evalFilePath, repoRoot, options) {
|
|
|
2891
2931
|
evaluators = evaluators ? [rubricEvaluator, ...evaluators] : [rubricEvaluator];
|
|
2892
2932
|
}
|
|
2893
2933
|
}
|
|
2934
|
+
warnUnconsumedCriteria(outcome, evaluators, id ?? "unknown");
|
|
2894
2935
|
const userFilePaths = [];
|
|
2895
2936
|
for (const segment of inputSegments) {
|
|
2896
2937
|
if (segment.type === "file" && typeof segment.resolvedPath === "string") {
|
|
@@ -8871,7 +8912,7 @@ function toCamelCaseDeep(obj) {
|
|
|
8871
8912
|
// src/evaluation/evaluators/code-evaluator.ts
|
|
8872
8913
|
var FILE_BACKED_OUTPUT_THRESHOLD = 5e4;
|
|
8873
8914
|
var CodeEvaluator = class {
|
|
8874
|
-
kind = "code";
|
|
8915
|
+
kind = "code-judge";
|
|
8875
8916
|
command;
|
|
8876
8917
|
cwd;
|
|
8877
8918
|
agentTimeoutMs;
|
|
@@ -9079,7 +9120,7 @@ var scoreRangeEvaluationSchema = z3.object({
|
|
|
9079
9120
|
overall_reasoning: z3.string().describe("Overall assessment summary (1-2 sentences)").optional()
|
|
9080
9121
|
});
|
|
9081
9122
|
var LlmJudgeEvaluator = class {
|
|
9082
|
-
kind = "
|
|
9123
|
+
kind = "llm-judge";
|
|
9083
9124
|
resolveJudgeProvider;
|
|
9084
9125
|
maxOutputTokens;
|
|
9085
9126
|
temperature;
|
|
@@ -9096,7 +9137,7 @@ var LlmJudgeEvaluator = class {
|
|
|
9096
9137
|
throw new Error("No judge provider available for LLM grading");
|
|
9097
9138
|
}
|
|
9098
9139
|
const config = context.evaluator;
|
|
9099
|
-
if (config?.type === "
|
|
9140
|
+
if (config?.type === "llm-judge" && config.rubrics && config.rubrics.length > 0) {
|
|
9100
9141
|
return this.evaluateWithRubrics(context, judgeProvider, config.rubrics);
|
|
9101
9142
|
}
|
|
9102
9143
|
return this.evaluateFreeform(context, judgeProvider);
|
|
@@ -9170,7 +9211,7 @@ ${context.fileChanges}`;
|
|
|
9170
9211
|
async evaluateWithRubrics(context, judgeProvider, rubrics) {
|
|
9171
9212
|
if (!rubrics || rubrics.length === 0) {
|
|
9172
9213
|
throw new Error(
|
|
9173
|
-
`No rubrics found for evaluator "${context.evaluator?.name ?? "
|
|
9214
|
+
`No rubrics found for evaluator "${context.evaluator?.name ?? "llm-judge"}". Run "agentv generate rubrics" first.`
|
|
9174
9215
|
);
|
|
9175
9216
|
}
|
|
9176
9217
|
const hasScoreRanges = rubrics.some((r) => r.score_ranges && r.score_ranges.length > 0);
|
|
@@ -9506,9 +9547,9 @@ var CompositeEvaluator = class {
|
|
|
9506
9547
|
async aggregate(results, context) {
|
|
9507
9548
|
const aggregator = this.config.aggregator;
|
|
9508
9549
|
switch (aggregator.type) {
|
|
9509
|
-
case "
|
|
9550
|
+
case "code-judge":
|
|
9510
9551
|
return this.runCodeAggregator(results, aggregator.path, aggregator.cwd ?? this.cwd);
|
|
9511
|
-
case "
|
|
9552
|
+
case "llm-judge":
|
|
9512
9553
|
return this.runLlmAggregator(results, context, aggregator);
|
|
9513
9554
|
case "threshold":
|
|
9514
9555
|
return this.runThreshold(results, aggregator.threshold);
|
|
@@ -9651,7 +9692,7 @@ var CompositeEvaluator = class {
|
|
|
9651
9692
|
expectedAspectCount: hits.length + misses.length || 1,
|
|
9652
9693
|
reasoning,
|
|
9653
9694
|
evaluatorRawRequest: {
|
|
9654
|
-
aggregator: "
|
|
9695
|
+
aggregator: "code-judge",
|
|
9655
9696
|
script: scriptPath
|
|
9656
9697
|
},
|
|
9657
9698
|
scores
|
|
@@ -9666,7 +9707,7 @@ var CompositeEvaluator = class {
|
|
|
9666
9707
|
expectedAspectCount: 1,
|
|
9667
9708
|
reasoning: message,
|
|
9668
9709
|
evaluatorRawRequest: {
|
|
9669
|
-
aggregator: "
|
|
9710
|
+
aggregator: "code-judge",
|
|
9670
9711
|
script: scriptPath,
|
|
9671
9712
|
error: message
|
|
9672
9713
|
},
|
|
@@ -9697,7 +9738,7 @@ var CompositeEvaluator = class {
|
|
|
9697
9738
|
const userPrompt = promptTemplate.replace(/\{\{EVALUATOR_RESULTS_JSON\}\}/g, resultsJson);
|
|
9698
9739
|
const systemPrompt = buildOutputSchema();
|
|
9699
9740
|
const evaluatorRawRequest = {
|
|
9700
|
-
aggregator: "
|
|
9741
|
+
aggregator: "llm-judge",
|
|
9701
9742
|
userPrompt,
|
|
9702
9743
|
systemPrompt,
|
|
9703
9744
|
target: judgeProvider.targetName
|
|
@@ -9809,7 +9850,7 @@ var CostEvaluator = class {
|
|
|
9809
9850
|
|
|
9810
9851
|
// src/evaluation/evaluators/execution-metrics.ts
|
|
9811
9852
|
var ExecutionMetricsEvaluator = class {
|
|
9812
|
-
kind = "
|
|
9853
|
+
kind = "execution-metrics";
|
|
9813
9854
|
config;
|
|
9814
9855
|
constructor(options) {
|
|
9815
9856
|
this.config = options.config;
|
|
@@ -9835,7 +9876,7 @@ var ExecutionMetricsEvaluator = class {
|
|
|
9835
9876
|
expectedAspectCount: 1,
|
|
9836
9877
|
reasoning: "Execution metrics not available - no trace summary provided",
|
|
9837
9878
|
evaluatorRawRequest: {
|
|
9838
|
-
type: "
|
|
9879
|
+
type: "execution-metrics",
|
|
9839
9880
|
config: this.extractConfiguredThresholds(),
|
|
9840
9881
|
actual: null
|
|
9841
9882
|
}
|
|
@@ -9944,7 +9985,7 @@ var ExecutionMetricsEvaluator = class {
|
|
|
9944
9985
|
if (actualMetrics.exploration_ratio !== void 0) {
|
|
9945
9986
|
reasoningParts.push(`exploration_ratio=${actualMetrics.exploration_ratio.toFixed(2)}`);
|
|
9946
9987
|
}
|
|
9947
|
-
const reasoning = reasoningParts.length > 0 ? `
|
|
9988
|
+
const reasoning = reasoningParts.length > 0 ? `execution-metrics ${reasoningParts.join(", ")}` : "No metrics evaluated";
|
|
9948
9989
|
return {
|
|
9949
9990
|
score,
|
|
9950
9991
|
verdict: scoreToVerdict(score),
|
|
@@ -9953,7 +9994,7 @@ var ExecutionMetricsEvaluator = class {
|
|
|
9953
9994
|
expectedAspectCount: totalChecks || 1,
|
|
9954
9995
|
reasoning,
|
|
9955
9996
|
evaluatorRawRequest: {
|
|
9956
|
-
type: "
|
|
9997
|
+
type: "execution-metrics",
|
|
9957
9998
|
config: this.extractConfiguredThresholds(),
|
|
9958
9999
|
actual: this.filterDefinedMetrics(actualMetrics)
|
|
9959
10000
|
}
|
|
@@ -10041,7 +10082,7 @@ var MONTH_NAMES = {
|
|
|
10041
10082
|
december: 11
|
|
10042
10083
|
};
|
|
10043
10084
|
var FieldAccuracyEvaluator = class {
|
|
10044
|
-
kind = "
|
|
10085
|
+
kind = "field-accuracy";
|
|
10045
10086
|
config;
|
|
10046
10087
|
constructor(options) {
|
|
10047
10088
|
this.config = options.config;
|
|
@@ -10495,7 +10536,7 @@ var BINARY_EXTENSIONS = /* @__PURE__ */ new Set([
|
|
|
10495
10536
|
".dylib"
|
|
10496
10537
|
]);
|
|
10497
10538
|
var AgentJudgeEvaluator = class {
|
|
10498
|
-
kind = "
|
|
10539
|
+
kind = "agent-judge";
|
|
10499
10540
|
resolveJudgeProvider;
|
|
10500
10541
|
maxSteps;
|
|
10501
10542
|
temperature;
|
|
@@ -10520,24 +10561,24 @@ var AgentJudgeEvaluator = class {
|
|
|
10520
10561
|
async evaluateBuiltIn(context) {
|
|
10521
10562
|
const judgeProvider = await this.resolveJudgeProvider(context);
|
|
10522
10563
|
if (!judgeProvider) {
|
|
10523
|
-
throw new Error("No judge provider available for
|
|
10564
|
+
throw new Error("No judge provider available for agent-judge evaluation");
|
|
10524
10565
|
}
|
|
10525
10566
|
const model = judgeProvider.asLanguageModel?.();
|
|
10526
10567
|
if (!model) {
|
|
10527
10568
|
throw new Error(
|
|
10528
|
-
`Judge provider '${judgeProvider.targetName}' does not support asLanguageModel() \u2014 required for built-in
|
|
10569
|
+
`Judge provider '${judgeProvider.targetName}' does not support asLanguageModel() \u2014 required for built-in agent-judge mode`
|
|
10529
10570
|
);
|
|
10530
10571
|
}
|
|
10531
10572
|
const workspacePath = context.workspacePath;
|
|
10532
10573
|
if (!workspacePath) {
|
|
10533
10574
|
throw new Error(
|
|
10534
|
-
"
|
|
10575
|
+
"agent-judge evaluator requires a workspace_template target (workspacePath is not set)"
|
|
10535
10576
|
);
|
|
10536
10577
|
}
|
|
10537
10578
|
const systemPrompt = this.buildSystemPrompt(context);
|
|
10538
10579
|
const userPrompt = this.buildUserPrompt(context);
|
|
10539
10580
|
const config = context.evaluator;
|
|
10540
|
-
const rubrics = config?.type === "
|
|
10581
|
+
const rubrics = config?.type === "agent-judge" ? config.rubrics : void 0;
|
|
10541
10582
|
const fsTools = createFilesystemTools(workspacePath);
|
|
10542
10583
|
const evaluatorRawRequest = {
|
|
10543
10584
|
mode: "built-in",
|
|
@@ -10568,7 +10609,7 @@ var AgentJudgeEvaluator = class {
|
|
|
10568
10609
|
score: 0,
|
|
10569
10610
|
verdict: "fail",
|
|
10570
10611
|
hits: [],
|
|
10571
|
-
misses: [`
|
|
10612
|
+
misses: [`agent-judge built-in evaluation failed: ${message}`],
|
|
10572
10613
|
expectedAspectCount: 1,
|
|
10573
10614
|
evaluatorRawRequest,
|
|
10574
10615
|
details: { mode: "built-in", error: message }
|
|
@@ -10600,14 +10641,14 @@ var AgentJudgeEvaluator = class {
|
|
|
10600
10641
|
score: 0,
|
|
10601
10642
|
verdict: "fail",
|
|
10602
10643
|
hits: [],
|
|
10603
|
-
misses: ["
|
|
10644
|
+
misses: ["agent-judge judge_target returned no assistant response"],
|
|
10604
10645
|
expectedAspectCount: 1,
|
|
10605
10646
|
evaluatorRawRequest,
|
|
10606
10647
|
details: { mode: "judge_target", judge_target: provider.targetName }
|
|
10607
10648
|
};
|
|
10608
10649
|
}
|
|
10609
10650
|
const config = context.evaluator;
|
|
10610
|
-
const rubrics = config?.type === "
|
|
10651
|
+
const rubrics = config?.type === "agent-judge" ? config.rubrics : void 0;
|
|
10611
10652
|
const details = {
|
|
10612
10653
|
mode: "judge_target",
|
|
10613
10654
|
judge_target: provider.targetName
|
|
@@ -10619,7 +10660,7 @@ var AgentJudgeEvaluator = class {
|
|
|
10619
10660
|
score: 0,
|
|
10620
10661
|
verdict: "fail",
|
|
10621
10662
|
hits: [],
|
|
10622
|
-
misses: [`
|
|
10663
|
+
misses: [`agent-judge judge_target evaluation failed: ${message}`],
|
|
10623
10664
|
expectedAspectCount: 1,
|
|
10624
10665
|
evaluatorRawRequest,
|
|
10625
10666
|
details: {
|
|
@@ -10670,7 +10711,7 @@ var AgentJudgeEvaluator = class {
|
|
|
10670
10711
|
score: 0,
|
|
10671
10712
|
verdict: "fail",
|
|
10672
10713
|
hits: [],
|
|
10673
|
-
misses: ["Failed to parse
|
|
10714
|
+
misses: ["Failed to parse agent-judge response as valid evaluation JSON"],
|
|
10674
10715
|
expectedAspectCount: 1,
|
|
10675
10716
|
evaluatorRawRequest,
|
|
10676
10717
|
details
|
|
@@ -10683,7 +10724,7 @@ var AgentJudgeEvaluator = class {
|
|
|
10683
10724
|
*/
|
|
10684
10725
|
buildSystemPrompt(context) {
|
|
10685
10726
|
const config = context.evaluator;
|
|
10686
|
-
const rubrics = config?.type === "
|
|
10727
|
+
const rubrics = config?.type === "agent-judge" ? config.rubrics : void 0;
|
|
10687
10728
|
const parts = [
|
|
10688
10729
|
"You are an expert evaluator with access to the workspace filesystem.",
|
|
10689
10730
|
"Use the provided tools to investigate the workspace and verify the criteria are met.",
|
|
@@ -10714,7 +10755,7 @@ var AgentJudgeEvaluator = class {
|
|
|
10714
10755
|
return substituteVariables(this.evaluatorTemplate, variables);
|
|
10715
10756
|
}
|
|
10716
10757
|
const config = context.evaluator;
|
|
10717
|
-
const rubrics = config?.type === "
|
|
10758
|
+
const rubrics = config?.type === "agent-judge" ? config.rubrics : void 0;
|
|
10718
10759
|
const parts = [
|
|
10719
10760
|
"Evaluate the candidate answer by investigating the workspace.",
|
|
10720
10761
|
"",
|
|
@@ -10757,7 +10798,7 @@ var AgentJudgeEvaluator = class {
|
|
|
10757
10798
|
buildDelegatedPrompt(context) {
|
|
10758
10799
|
const formattedQuestion = context.promptInputs.question && context.promptInputs.question.trim().length > 0 ? context.promptInputs.question : context.evalCase.question;
|
|
10759
10800
|
const config = context.evaluator;
|
|
10760
|
-
const rubrics = config?.type === "
|
|
10801
|
+
const rubrics = config?.type === "agent-judge" ? config.rubrics : void 0;
|
|
10761
10802
|
if (this.evaluatorTemplate) {
|
|
10762
10803
|
const variables = {
|
|
10763
10804
|
[TEMPLATE_VARIABLES.ANSWER]: context.candidate.trim(),
|
|
@@ -10839,11 +10880,11 @@ function createFilesystemTools(workspacePath) {
|
|
|
10839
10880
|
execute: async (input) => {
|
|
10840
10881
|
try {
|
|
10841
10882
|
const resolved = resolveSandboxed(workspacePath, input.path);
|
|
10842
|
-
const
|
|
10843
|
-
if (
|
|
10883
|
+
const stat8 = await fs2.stat(resolved);
|
|
10884
|
+
if (stat8.isDirectory()) {
|
|
10844
10885
|
return { error: `'${input.path}' is a directory, not a file` };
|
|
10845
10886
|
}
|
|
10846
|
-
const buffer = Buffer.alloc(Math.min(
|
|
10887
|
+
const buffer = Buffer.alloc(Math.min(stat8.size, MAX_FILE_SIZE));
|
|
10847
10888
|
const fd = await fs2.open(resolved, "r");
|
|
10848
10889
|
try {
|
|
10849
10890
|
await fd.read(buffer, 0, buffer.length, 0);
|
|
@@ -10851,8 +10892,8 @@ function createFilesystemTools(workspacePath) {
|
|
|
10851
10892
|
await fd.close();
|
|
10852
10893
|
}
|
|
10853
10894
|
const content = buffer.toString("utf-8");
|
|
10854
|
-
const truncated =
|
|
10855
|
-
return { content, truncated, size:
|
|
10895
|
+
const truncated = stat8.size > MAX_FILE_SIZE;
|
|
10896
|
+
return { content, truncated, size: stat8.size };
|
|
10856
10897
|
} catch (error) {
|
|
10857
10898
|
return { error: error instanceof Error ? error.message : String(error) };
|
|
10858
10899
|
}
|
|
@@ -10896,8 +10937,8 @@ async function searchDirectory(dirPath, workspacePath, regex, matches) {
|
|
|
10896
10937
|
const ext = path30.extname(entry.name).toLowerCase();
|
|
10897
10938
|
if (BINARY_EXTENSIONS.has(ext)) continue;
|
|
10898
10939
|
try {
|
|
10899
|
-
const
|
|
10900
|
-
if (
|
|
10940
|
+
const stat8 = await fs2.stat(fullPath);
|
|
10941
|
+
if (stat8.size > MAX_FILE_SIZE) continue;
|
|
10901
10942
|
const content = await fs2.readFile(fullPath, "utf-8");
|
|
10902
10943
|
const lines = content.split("\n");
|
|
10903
10944
|
for (let i = 0; i < lines.length; i++) {
|
|
@@ -11059,7 +11100,7 @@ function assembleScoreRange(evalCase, candidate, promptInputs, rubrics, fileChan
|
|
|
11059
11100
|
|
|
11060
11101
|
// src/evaluation/evaluators/token-usage.ts
|
|
11061
11102
|
var TokenUsageEvaluator = class {
|
|
11062
|
-
kind = "
|
|
11103
|
+
kind = "token-usage";
|
|
11063
11104
|
config;
|
|
11064
11105
|
constructor(options) {
|
|
11065
11106
|
this.config = options.config;
|
|
@@ -11082,7 +11123,7 @@ var TokenUsageEvaluator = class {
|
|
|
11082
11123
|
expectedAspectCount,
|
|
11083
11124
|
reasoning: "Token usage not reported by provider",
|
|
11084
11125
|
evaluatorRawRequest: {
|
|
11085
|
-
type: "
|
|
11126
|
+
type: "token-usage",
|
|
11086
11127
|
max_total: maxTotal ?? null,
|
|
11087
11128
|
max_input: maxInput ?? null,
|
|
11088
11129
|
max_output: maxOutput ?? null,
|
|
@@ -11124,9 +11165,9 @@ var TokenUsageEvaluator = class {
|
|
|
11124
11165
|
hits,
|
|
11125
11166
|
misses,
|
|
11126
11167
|
expectedAspectCount,
|
|
11127
|
-
reasoning: `
|
|
11168
|
+
reasoning: `token-usage input=${input}, output=${output}, cached=${cached}, total=${total}`,
|
|
11128
11169
|
evaluatorRawRequest: {
|
|
11129
|
-
type: "
|
|
11170
|
+
type: "token-usage",
|
|
11130
11171
|
max_total: maxTotal ?? null,
|
|
11131
11172
|
max_input: maxInput ?? null,
|
|
11132
11173
|
max_output: maxOutput ?? null,
|
|
@@ -11211,7 +11252,7 @@ function checkLatency(toolName, maxDurationMs, actualDurationMs) {
|
|
|
11211
11252
|
};
|
|
11212
11253
|
}
|
|
11213
11254
|
var ToolTrajectoryEvaluator = class {
|
|
11214
|
-
kind = "
|
|
11255
|
+
kind = "tool-trajectory";
|
|
11215
11256
|
config;
|
|
11216
11257
|
constructor(options) {
|
|
11217
11258
|
this.config = options.config;
|
|
@@ -11399,7 +11440,7 @@ var ToolTrajectoryEvaluator = class {
|
|
|
11399
11440
|
}
|
|
11400
11441
|
}
|
|
11401
11442
|
for (const warning of warnings) {
|
|
11402
|
-
console.warn(`[
|
|
11443
|
+
console.warn(`[tool-trajectory] ${warning}`);
|
|
11403
11444
|
}
|
|
11404
11445
|
const effectiveLatencyAssertions = latencyAssertionCount - latencySkips;
|
|
11405
11446
|
const totalAssertions = expected.length + effectiveLatencyAssertions;
|
|
@@ -11475,7 +11516,7 @@ var ToolTrajectoryEvaluator = class {
|
|
|
11475
11516
|
misses.push(`Position ${i}: expected ${expected[i].tool}, got nothing`);
|
|
11476
11517
|
}
|
|
11477
11518
|
for (const warning of warnings) {
|
|
11478
|
-
console.warn(`[
|
|
11519
|
+
console.warn(`[tool-trajectory] ${warning}`);
|
|
11479
11520
|
}
|
|
11480
11521
|
const effectiveLatencyAssertions = latencyAssertionCount - latencySkips;
|
|
11481
11522
|
const totalAssertions = expected.length + effectiveLatencyAssertions;
|
|
@@ -11705,7 +11746,7 @@ function runEqualsAssertion(output, value) {
|
|
|
11705
11746
|
|
|
11706
11747
|
// src/evaluation/orchestrator.ts
|
|
11707
11748
|
import { createHash as createHash2, randomUUID as randomUUID7 } from "node:crypto";
|
|
11708
|
-
import { mkdir as mkdir12 } from "node:fs/promises";
|
|
11749
|
+
import { mkdir as mkdir12, stat as stat7 } from "node:fs/promises";
|
|
11709
11750
|
import path37 from "node:path";
|
|
11710
11751
|
import micromatch4 from "micromatch";
|
|
11711
11752
|
|
|
@@ -11965,7 +12006,7 @@ var llmJudgeFactory = (config, context) => {
|
|
|
11965
12006
|
const c = config;
|
|
11966
12007
|
const { llmJudge, agentTimeoutMs } = context;
|
|
11967
12008
|
return {
|
|
11968
|
-
kind: "
|
|
12009
|
+
kind: "llm-judge",
|
|
11969
12010
|
async evaluate(evalContext) {
|
|
11970
12011
|
const customPrompt = await resolveCustomPrompt(
|
|
11971
12012
|
c,
|
|
@@ -12054,7 +12095,7 @@ var agentJudgeFactory = (config, context) => {
|
|
|
12054
12095
|
customPrompt = readFileSync(c.resolvedPromptPath, "utf-8");
|
|
12055
12096
|
} catch (error) {
|
|
12056
12097
|
const message = error instanceof Error ? error.message : String(error);
|
|
12057
|
-
console.warn(`Could not read
|
|
12098
|
+
console.warn(`Could not read agent-judge prompt at ${c.resolvedPromptPath}: ${message}`);
|
|
12058
12099
|
}
|
|
12059
12100
|
} else if (c.prompt) {
|
|
12060
12101
|
customPrompt = c.prompt;
|
|
@@ -12064,7 +12105,7 @@ var agentJudgeFactory = (config, context) => {
|
|
|
12064
12105
|
judgeTargetProvider = targetResolver(c.target);
|
|
12065
12106
|
if (!judgeTargetProvider) {
|
|
12066
12107
|
throw new Error(
|
|
12067
|
-
`
|
|
12108
|
+
`agent-judge evaluator '${c.name}': target '${c.target}' not found in targets`
|
|
12068
12109
|
);
|
|
12069
12110
|
}
|
|
12070
12111
|
}
|
|
@@ -12108,7 +12149,7 @@ var regexFactory = (config) => {
|
|
|
12108
12149
|
});
|
|
12109
12150
|
};
|
|
12110
12151
|
var isJsonFactory = () => {
|
|
12111
|
-
return new DeterministicAssertionEvaluator("
|
|
12152
|
+
return new DeterministicAssertionEvaluator("is-json", (ctx) => {
|
|
12112
12153
|
const result = runIsJsonAssertion(ctx.candidate);
|
|
12113
12154
|
return {
|
|
12114
12155
|
score: result.score,
|
|
@@ -12136,7 +12177,7 @@ var equalsFactory = (config) => {
|
|
|
12136
12177
|
};
|
|
12137
12178
|
var containsAnyFactory = (config) => {
|
|
12138
12179
|
const c = config;
|
|
12139
|
-
return new DeterministicAssertionEvaluator("
|
|
12180
|
+
return new DeterministicAssertionEvaluator("contains-any", (ctx) => {
|
|
12140
12181
|
const result = runContainsAnyAssertion(ctx.candidate, c.value);
|
|
12141
12182
|
return {
|
|
12142
12183
|
score: result.score,
|
|
@@ -12150,7 +12191,7 @@ var containsAnyFactory = (config) => {
|
|
|
12150
12191
|
};
|
|
12151
12192
|
var containsAllFactory = (config) => {
|
|
12152
12193
|
const c = config;
|
|
12153
|
-
return new DeterministicAssertionEvaluator("
|
|
12194
|
+
return new DeterministicAssertionEvaluator("contains-all", (ctx) => {
|
|
12154
12195
|
const result = runContainsAllAssertion(ctx.candidate, c.value);
|
|
12155
12196
|
return {
|
|
12156
12197
|
score: result.score,
|
|
@@ -12178,7 +12219,7 @@ var icontainsFactory = (config) => {
|
|
|
12178
12219
|
};
|
|
12179
12220
|
var icontainsAnyFactory = (config) => {
|
|
12180
12221
|
const c = config;
|
|
12181
|
-
return new DeterministicAssertionEvaluator("
|
|
12222
|
+
return new DeterministicAssertionEvaluator("icontains-any", (ctx) => {
|
|
12182
12223
|
const result = runIcontainsAnyAssertion(ctx.candidate, c.value);
|
|
12183
12224
|
return {
|
|
12184
12225
|
score: result.score,
|
|
@@ -12192,7 +12233,7 @@ var icontainsAnyFactory = (config) => {
|
|
|
12192
12233
|
};
|
|
12193
12234
|
var icontainsAllFactory = (config) => {
|
|
12194
12235
|
const c = config;
|
|
12195
|
-
return new DeterministicAssertionEvaluator("
|
|
12236
|
+
return new DeterministicAssertionEvaluator("icontains-all", (ctx) => {
|
|
12196
12237
|
const result = runIcontainsAllAssertion(ctx.candidate, c.value);
|
|
12197
12238
|
return {
|
|
12198
12239
|
score: result.score,
|
|
@@ -12206,7 +12247,7 @@ var icontainsAllFactory = (config) => {
|
|
|
12206
12247
|
};
|
|
12207
12248
|
var startsWithFactory = (config) => {
|
|
12208
12249
|
const c = config;
|
|
12209
|
-
return new DeterministicAssertionEvaluator("
|
|
12250
|
+
return new DeterministicAssertionEvaluator("starts-with", (ctx) => {
|
|
12210
12251
|
const result = runStartsWithAssertion(ctx.candidate, c.value);
|
|
12211
12252
|
return {
|
|
12212
12253
|
score: result.score,
|
|
@@ -12220,7 +12261,7 @@ var startsWithFactory = (config) => {
|
|
|
12220
12261
|
};
|
|
12221
12262
|
var endsWithFactory = (config) => {
|
|
12222
12263
|
const c = config;
|
|
12223
|
-
return new DeterministicAssertionEvaluator("
|
|
12264
|
+
return new DeterministicAssertionEvaluator("ends-with", (ctx) => {
|
|
12224
12265
|
const result = runEndsWithAssertion(ctx.candidate, c.value);
|
|
12225
12266
|
return {
|
|
12226
12267
|
score: result.score,
|
|
@@ -12234,7 +12275,7 @@ var endsWithFactory = (config) => {
|
|
|
12234
12275
|
};
|
|
12235
12276
|
function createBuiltinRegistry() {
|
|
12236
12277
|
const registry = new EvaluatorRegistry();
|
|
12237
|
-
registry.register("
|
|
12278
|
+
registry.register("llm-judge", llmJudgeFactory).register("code-judge", codeFactory).register("composite", compositeFactory).register("tool-trajectory", toolTrajectoryFactory).register("field-accuracy", fieldAccuracyFactory).register("latency", latencyFactory).register("cost", costFactory).register("token-usage", tokenUsageFactory).register("execution-metrics", executionMetricsFactory).register("agent-judge", agentJudgeFactory).register("contains", containsFactory).register("contains-any", containsAnyFactory).register("contains-all", containsAllFactory).register("icontains", icontainsFactory).register("icontains-any", icontainsAnyFactory).register("icontains-all", icontainsAllFactory).register("starts-with", startsWithFactory).register("ends-with", endsWithFactory).register("regex", regexFactory).register("is-json", isJsonFactory).register("equals", equalsFactory);
|
|
12238
12279
|
return registry;
|
|
12239
12280
|
}
|
|
12240
12281
|
|
|
@@ -12921,7 +12962,8 @@ async function runEvaluation(options) {
|
|
|
12921
12962
|
cleanupWorkspaces,
|
|
12922
12963
|
trials,
|
|
12923
12964
|
streamCallbacks,
|
|
12924
|
-
totalBudgetUsd
|
|
12965
|
+
totalBudgetUsd,
|
|
12966
|
+
failOnError
|
|
12925
12967
|
} = options;
|
|
12926
12968
|
let useCache = options.useCache;
|
|
12927
12969
|
if (trials && trials.count > 1 && useCache) {
|
|
@@ -12979,7 +13021,7 @@ async function runEvaluation(options) {
|
|
|
12979
13021
|
};
|
|
12980
13022
|
if (isAgentProvider(getOrCreateProvider(target)) && !target.judgeTarget) {
|
|
12981
13023
|
throw new Error(
|
|
12982
|
-
`Target "${target.name}" is an agent provider ("${target.kind}") with no judge_target \u2014 agent providers cannot return structured JSON for judging. Set judge_target to an LLM provider (e.g.,
|
|
13024
|
+
`Target "${target.name}" is an agent provider ("${target.kind}") with no judge_target \u2014 agent providers cannot return structured JSON for judging. Set judge_target to an LLM provider (e.g., azure-base).`
|
|
12983
13025
|
);
|
|
12984
13026
|
}
|
|
12985
13027
|
const targetResolver = (name) => {
|
|
@@ -13050,7 +13092,7 @@ async function runEvaluation(options) {
|
|
|
13050
13092
|
const rawTemplate = suiteWorkspace?.template ?? getWorkspaceTemplate(target);
|
|
13051
13093
|
const resolvedTemplate = await resolveWorkspaceTemplate(rawTemplate);
|
|
13052
13094
|
const workspaceTemplate = resolvedTemplate?.dir;
|
|
13053
|
-
|
|
13095
|
+
let suiteWorkspaceFile = resolvedTemplate?.workspaceFile;
|
|
13054
13096
|
const isPerTestIsolation = suiteWorkspace?.isolation === "per_test";
|
|
13055
13097
|
const hasSharedWorkspace = !!(workspaceTemplate || suiteWorkspace?.before_all || suiteWorkspace?.repos?.length && !isPerTestIsolation);
|
|
13056
13098
|
const requestedWorkers = options.maxConcurrency ?? target.workers ?? 1;
|
|
@@ -13071,6 +13113,14 @@ async function runEvaluation(options) {
|
|
|
13071
13113
|
const message = error instanceof Error ? error.message : String(error);
|
|
13072
13114
|
throw new Error(`Failed to create shared workspace: ${message}`);
|
|
13073
13115
|
}
|
|
13116
|
+
if (suiteWorkspaceFile && sharedWorkspacePath) {
|
|
13117
|
+
const copiedWorkspaceFile = path37.join(sharedWorkspacePath, path37.basename(suiteWorkspaceFile));
|
|
13118
|
+
try {
|
|
13119
|
+
await stat7(copiedWorkspaceFile);
|
|
13120
|
+
suiteWorkspaceFile = copiedWorkspaceFile;
|
|
13121
|
+
} catch {
|
|
13122
|
+
}
|
|
13123
|
+
}
|
|
13074
13124
|
} else if (suiteWorkspace?.before_all || suiteWorkspace?.repos?.length && !isPerTestIsolation) {
|
|
13075
13125
|
sharedWorkspacePath = getWorkspacePath(evalRunId, "shared");
|
|
13076
13126
|
await mkdir12(sharedWorkspacePath, { recursive: true });
|
|
@@ -13117,6 +13167,7 @@ async function runEvaluation(options) {
|
|
|
13117
13167
|
let beforeAllOutputAttached = false;
|
|
13118
13168
|
let cumulativeBudgetCost = 0;
|
|
13119
13169
|
let budgetExhausted = false;
|
|
13170
|
+
let failOnErrorTriggered = false;
|
|
13120
13171
|
const promises = filteredEvalCases.map(
|
|
13121
13172
|
(evalCase) => limit(async () => {
|
|
13122
13173
|
const workerId = nextWorkerId++;
|
|
@@ -13155,6 +13206,37 @@ async function runEvaluation(options) {
|
|
|
13155
13206
|
}
|
|
13156
13207
|
return budgetResult;
|
|
13157
13208
|
}
|
|
13209
|
+
if (failOnError === true && failOnErrorTriggered) {
|
|
13210
|
+
const errorMsg = "Halted: execution error encountered with fail_on_error enabled";
|
|
13211
|
+
const haltResult = {
|
|
13212
|
+
timestamp: (now ?? (() => /* @__PURE__ */ new Date()))().toISOString(),
|
|
13213
|
+
testId: evalCase.id,
|
|
13214
|
+
dataset: evalCase.dataset,
|
|
13215
|
+
score: 0,
|
|
13216
|
+
hits: [],
|
|
13217
|
+
misses: [],
|
|
13218
|
+
answer: "",
|
|
13219
|
+
target: target.name,
|
|
13220
|
+
error: errorMsg,
|
|
13221
|
+
executionStatus: "execution_error",
|
|
13222
|
+
failureStage: "setup",
|
|
13223
|
+
failureReasonCode: "error_threshold_exceeded",
|
|
13224
|
+
executionError: { message: errorMsg, stage: "setup" }
|
|
13225
|
+
};
|
|
13226
|
+
if (onProgress) {
|
|
13227
|
+
await onProgress({
|
|
13228
|
+
workerId,
|
|
13229
|
+
testId: evalCase.id,
|
|
13230
|
+
status: "failed",
|
|
13231
|
+
completedAt: Date.now(),
|
|
13232
|
+
error: haltResult.error
|
|
13233
|
+
});
|
|
13234
|
+
}
|
|
13235
|
+
if (onResult) {
|
|
13236
|
+
await onResult(haltResult);
|
|
13237
|
+
}
|
|
13238
|
+
return haltResult;
|
|
13239
|
+
}
|
|
13158
13240
|
if (onProgress) {
|
|
13159
13241
|
await onProgress({
|
|
13160
13242
|
workerId,
|
|
@@ -13207,6 +13289,9 @@ async function runEvaluation(options) {
|
|
|
13207
13289
|
}
|
|
13208
13290
|
}
|
|
13209
13291
|
}
|
|
13292
|
+
if (failOnError === true && result.executionStatus === "execution_error") {
|
|
13293
|
+
failOnErrorTriggered = true;
|
|
13294
|
+
}
|
|
13210
13295
|
if (beforeAllOutput && !beforeAllOutputAttached) {
|
|
13211
13296
|
result = { ...result, beforeAllOutput };
|
|
13212
13297
|
beforeAllOutputAttached = true;
|
|
@@ -13514,6 +13599,14 @@ async function runEvalCase(options) {
|
|
|
13514
13599
|
"template_error"
|
|
13515
13600
|
);
|
|
13516
13601
|
}
|
|
13602
|
+
if (caseWorkspaceFile && workspacePath) {
|
|
13603
|
+
const copiedFile = path37.join(workspacePath, path37.basename(caseWorkspaceFile));
|
|
13604
|
+
try {
|
|
13605
|
+
await stat7(copiedFile);
|
|
13606
|
+
caseWorkspaceFile = copiedFile;
|
|
13607
|
+
} catch {
|
|
13608
|
+
}
|
|
13609
|
+
}
|
|
13517
13610
|
}
|
|
13518
13611
|
if (!workspacePath && (evalCase.workspace?.before_all || evalCase.workspace?.repos?.length) && evalRunId) {
|
|
13519
13612
|
workspacePath = getWorkspacePath(evalRunId, evalCase.id);
|
|
@@ -14023,8 +14116,8 @@ async function runEvaluatorsForCase(options) {
|
|
|
14023
14116
|
workspacePath
|
|
14024
14117
|
});
|
|
14025
14118
|
}
|
|
14026
|
-
const evaluatorKind = evalCase.evaluator ?? "
|
|
14027
|
-
const activeEvaluator = evaluators[evaluatorKind] ?? evaluators
|
|
14119
|
+
const evaluatorKind = evalCase.evaluator ?? "llm-judge";
|
|
14120
|
+
const activeEvaluator = evaluators[evaluatorKind] ?? evaluators["llm-judge"];
|
|
14028
14121
|
if (!activeEvaluator) {
|
|
14029
14122
|
throw new Error(`No evaluator registered for kind '${evaluatorKind}'`);
|
|
14030
14123
|
}
|
|
@@ -14107,25 +14200,24 @@ async function runEvaluatorList(options) {
|
|
|
14107
14200
|
availableTargets,
|
|
14108
14201
|
agentTimeoutMs,
|
|
14109
14202
|
evalFileDir,
|
|
14110
|
-
llmJudge: evaluatorRegistry
|
|
14203
|
+
llmJudge: evaluatorRegistry["llm-judge"],
|
|
14111
14204
|
registry: typeRegistry
|
|
14112
14205
|
};
|
|
14113
14206
|
for (const evaluatorConfig of evaluators ?? []) {
|
|
14114
14207
|
try {
|
|
14115
14208
|
const evaluatorInstance = await typeRegistry.create(evaluatorConfig, dispatchContext);
|
|
14116
14209
|
const score2 = await evaluatorInstance.evaluate(evalContext);
|
|
14117
|
-
const resultType = evaluatorConfig.type === "code" ? "code_judge" : evaluatorConfig.type;
|
|
14118
14210
|
const weight = evaluatorConfig.weight ?? 1;
|
|
14119
14211
|
scored.push({
|
|
14120
14212
|
score: score2,
|
|
14121
14213
|
name: evaluatorConfig.name,
|
|
14122
|
-
type:
|
|
14214
|
+
type: evaluatorConfig.type,
|
|
14123
14215
|
weight,
|
|
14124
14216
|
...evaluatorConfig.required !== void 0 ? { required: evaluatorConfig.required } : {}
|
|
14125
14217
|
});
|
|
14126
14218
|
scores.push({
|
|
14127
14219
|
name: evaluatorConfig.name,
|
|
14128
|
-
type:
|
|
14220
|
+
type: evaluatorConfig.type,
|
|
14129
14221
|
score: score2.score,
|
|
14130
14222
|
weight,
|
|
14131
14223
|
verdict: score2.verdict,
|
|
@@ -14147,18 +14239,17 @@ async function runEvaluatorList(options) {
|
|
|
14147
14239
|
expectedAspectCount: 1,
|
|
14148
14240
|
reasoning: message
|
|
14149
14241
|
};
|
|
14150
|
-
const resultType = evaluatorConfig.type === "code" ? "code_judge" : evaluatorConfig.type;
|
|
14151
14242
|
const weight = evaluatorConfig.weight ?? 1;
|
|
14152
14243
|
scored.push({
|
|
14153
14244
|
score: fallbackScore,
|
|
14154
14245
|
name: evaluatorConfig.name ?? "unknown",
|
|
14155
|
-
type:
|
|
14246
|
+
type: evaluatorConfig.type ?? "llm-judge",
|
|
14156
14247
|
weight,
|
|
14157
14248
|
...evaluatorConfig.required !== void 0 ? { required: evaluatorConfig.required } : {}
|
|
14158
14249
|
});
|
|
14159
14250
|
scores.push({
|
|
14160
14251
|
name: evaluatorConfig.name ?? "unknown",
|
|
14161
|
-
type:
|
|
14252
|
+
type: evaluatorConfig.type ?? "llm-judge",
|
|
14162
14253
|
score: 0,
|
|
14163
14254
|
weight,
|
|
14164
14255
|
verdict: "fail",
|
|
@@ -14219,7 +14310,7 @@ function filterEvalCases(evalCases, filter) {
|
|
|
14219
14310
|
return evalCases.filter((evalCase) => micromatch4.isMatch(evalCase.id, filter));
|
|
14220
14311
|
}
|
|
14221
14312
|
function buildEvaluatorRegistry(overrides, resolveJudgeProvider) {
|
|
14222
|
-
const llmJudge = overrides?.
|
|
14313
|
+
const llmJudge = overrides?.["llm-judge"] ?? new LlmJudgeEvaluator({
|
|
14223
14314
|
resolveJudgeProvider: async (context) => {
|
|
14224
14315
|
if (context.judgeProvider) {
|
|
14225
14316
|
return context.judgeProvider;
|
|
@@ -14229,7 +14320,7 @@ function buildEvaluatorRegistry(overrides, resolveJudgeProvider) {
|
|
|
14229
14320
|
});
|
|
14230
14321
|
return {
|
|
14231
14322
|
...overrides,
|
|
14232
|
-
|
|
14323
|
+
"llm-judge": llmJudge
|
|
14233
14324
|
};
|
|
14234
14325
|
}
|
|
14235
14326
|
async function invokeProvider(provider, options) {
|
|
@@ -14489,12 +14580,7 @@ async function evaluate(config) {
|
|
|
14489
14580
|
};
|
|
14490
14581
|
}
|
|
14491
14582
|
function mapAssertionType(type) {
|
|
14492
|
-
|
|
14493
|
-
case "code_judge":
|
|
14494
|
-
return "code";
|
|
14495
|
-
default:
|
|
14496
|
-
return type;
|
|
14497
|
-
}
|
|
14583
|
+
return type.replace(/_/g, "-");
|
|
14498
14584
|
}
|
|
14499
14585
|
function computeSummary(results, durationMs) {
|
|
14500
14586
|
const total = results.length;
|
|
@@ -15268,6 +15354,7 @@ export {
|
|
|
15268
15354
|
executeWorkspaceScript,
|
|
15269
15355
|
explorationRatio,
|
|
15270
15356
|
extractCacheConfig,
|
|
15357
|
+
extractFailOnError,
|
|
15271
15358
|
extractJsonBlob,
|
|
15272
15359
|
extractTargetFromSuite,
|
|
15273
15360
|
extractTargetsFromSuite,
|