@agentv/core 2.13.0 → 2.14.0-next.1
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/dist/{chunk-JHER2LQ5.js → chunk-N55K52OO.js} +15 -15
- package/dist/chunk-N55K52OO.js.map +1 -0
- package/dist/evaluation/validation/index.cjs +25 -24
- package/dist/evaluation/validation/index.cjs.map +1 -1
- package/dist/evaluation/validation/index.js +12 -11
- package/dist/evaluation/validation/index.js.map +1 -1
- package/dist/index.cjs +184 -158
- package/dist/index.cjs.map +1 -1
- package/dist/index.d.cts +40 -40
- package/dist/index.d.ts +40 -40
- package/dist/index.js +172 -146
- package/dist/index.js.map +1 -1
- package/package.json +1 -1
- package/dist/chunk-JHER2LQ5.js.map +0 -1
package/dist/index.js
CHANGED
|
@@ -17,7 +17,7 @@ import {
|
|
|
17
17
|
readTextFile,
|
|
18
18
|
resolveFileReference,
|
|
19
19
|
resolveTargetDefinition
|
|
20
|
-
} from "./chunk-
|
|
20
|
+
} from "./chunk-N55K52OO.js";
|
|
21
21
|
import {
|
|
22
22
|
OtlpJsonFileExporter
|
|
23
23
|
} from "./chunk-HFSYZHGF.js";
|
|
@@ -682,6 +682,9 @@ function validateTemplateVariables(content, source) {
|
|
|
682
682
|
// src/evaluation/loaders/evaluator-parser.ts
|
|
683
683
|
var ANSI_YELLOW4 = "\x1B[33m";
|
|
684
684
|
var ANSI_RESET4 = "\x1B[0m";
|
|
685
|
+
function normalizeEvaluatorType(type) {
|
|
686
|
+
return type.replace(/_/g, "-");
|
|
687
|
+
}
|
|
685
688
|
async function parseEvaluators(rawEvalCase, globalExecution, searchRoots, evalId) {
|
|
686
689
|
const execution = rawEvalCase.execution;
|
|
687
690
|
const executionObject = isJsonObject2(execution) ? execution : void 0;
|
|
@@ -712,7 +715,8 @@ async function parseEvaluatorList(candidateEvaluators, searchRoots, evalId) {
|
|
|
712
715
|
continue;
|
|
713
716
|
}
|
|
714
717
|
const rawName = asString(rawEvaluator.name);
|
|
715
|
-
const
|
|
718
|
+
const rawType = rawEvaluator.type;
|
|
719
|
+
const typeValue = typeof rawType === "string" ? normalizeEvaluatorType(rawType) : rawType;
|
|
716
720
|
const isCustomType = typeof typeValue === "string" && !isEvaluatorKind(typeValue);
|
|
717
721
|
if (typeof typeValue !== "string") {
|
|
718
722
|
logWarning2(`Skipping evaluator with invalid type in '${evalId}'`);
|
|
@@ -745,25 +749,25 @@ async function parseEvaluatorList(candidateEvaluators, searchRoots, evalId) {
|
|
|
745
749
|
});
|
|
746
750
|
continue;
|
|
747
751
|
}
|
|
748
|
-
if (typeValue === "
|
|
752
|
+
if (typeValue === "code-judge") {
|
|
749
753
|
let command;
|
|
750
754
|
const rawCommand = rawEvaluator.command ?? rawEvaluator.script;
|
|
751
755
|
if (typeof rawCommand === "string") {
|
|
752
756
|
const trimmed = rawCommand.trim();
|
|
753
757
|
if (trimmed.length === 0) {
|
|
754
758
|
throw new Error(
|
|
755
|
-
`Invalid
|
|
759
|
+
`Invalid code-judge command for evaluator '${name}' in '${evalId}': command cannot be empty`
|
|
756
760
|
);
|
|
757
761
|
}
|
|
758
762
|
command = parseCommandToArgv(trimmed);
|
|
759
763
|
} else {
|
|
760
764
|
command = asStringArray(
|
|
761
765
|
rawCommand,
|
|
762
|
-
`
|
|
766
|
+
`code-judge command for evaluator '${name}' in '${evalId}'`
|
|
763
767
|
);
|
|
764
768
|
}
|
|
765
769
|
if (!command) {
|
|
766
|
-
logWarning2(`Skipping
|
|
770
|
+
logWarning2(`Skipping code-judge evaluator '${name}' in '${evalId}': missing command`);
|
|
767
771
|
continue;
|
|
768
772
|
}
|
|
769
773
|
const weight2 = validateWeight(rawEvaluator.weight, name, evalId);
|
|
@@ -824,7 +828,7 @@ async function parseEvaluatorList(candidateEvaluators, searchRoots, evalId) {
|
|
|
824
828
|
}
|
|
825
829
|
evaluators.push({
|
|
826
830
|
name,
|
|
827
|
-
type: "code",
|
|
831
|
+
type: "code-judge",
|
|
828
832
|
command,
|
|
829
833
|
cwd,
|
|
830
834
|
resolvedCwd,
|
|
@@ -850,7 +854,7 @@ async function parseEvaluatorList(candidateEvaluators, searchRoots, evalId) {
|
|
|
850
854
|
continue;
|
|
851
855
|
}
|
|
852
856
|
const aggregatorType = asString(rawAggregator.type);
|
|
853
|
-
if (aggregatorType !== "weighted_average" && aggregatorType !== "
|
|
857
|
+
if (aggregatorType !== "weighted_average" && aggregatorType !== "code-judge" && aggregatorType !== "llm-judge" && aggregatorType !== "threshold") {
|
|
854
858
|
logWarning2(
|
|
855
859
|
`Skipping composite evaluator '${name}' in '${evalId}': invalid aggregator type '${aggregatorType}'`
|
|
856
860
|
);
|
|
@@ -899,16 +903,16 @@ async function parseEvaluatorList(candidateEvaluators, searchRoots, evalId) {
|
|
|
899
903
|
type: "weighted_average",
|
|
900
904
|
...Object.keys(parsedWeights).length > 0 ? { weights: parsedWeights } : {}
|
|
901
905
|
};
|
|
902
|
-
} else if (aggregatorType === "
|
|
906
|
+
} else if (aggregatorType === "code-judge") {
|
|
903
907
|
const aggregatorPath = asString(rawAggregator.path);
|
|
904
908
|
if (!aggregatorPath) {
|
|
905
909
|
logWarning2(
|
|
906
|
-
`Skipping composite evaluator '${name}' in '${evalId}':
|
|
910
|
+
`Skipping composite evaluator '${name}' in '${evalId}': code-judge aggregator missing path`
|
|
907
911
|
);
|
|
908
912
|
continue;
|
|
909
913
|
}
|
|
910
914
|
aggregator = {
|
|
911
|
-
type: "
|
|
915
|
+
type: "code-judge",
|
|
912
916
|
path: aggregatorPath,
|
|
913
917
|
cwd: searchRoots[0]
|
|
914
918
|
};
|
|
@@ -934,7 +938,7 @@ async function parseEvaluatorList(candidateEvaluators, searchRoots, evalId) {
|
|
|
934
938
|
}
|
|
935
939
|
}
|
|
936
940
|
aggregator = {
|
|
937
|
-
type: "
|
|
941
|
+
type: "llm-judge",
|
|
938
942
|
...aggregatorPrompt ? { prompt: aggregatorPrompt } : {},
|
|
939
943
|
...promptPath2 ? { promptPath: promptPath2 } : {}
|
|
940
944
|
};
|
|
@@ -952,11 +956,11 @@ async function parseEvaluatorList(candidateEvaluators, searchRoots, evalId) {
|
|
|
952
956
|
});
|
|
953
957
|
continue;
|
|
954
958
|
}
|
|
955
|
-
if (typeValue === "
|
|
959
|
+
if (typeValue === "tool-trajectory") {
|
|
956
960
|
const mode = asString(rawEvaluator.mode);
|
|
957
961
|
if (mode !== "any_order" && mode !== "in_order" && mode !== "exact" && mode !== "subset" && mode !== "superset") {
|
|
958
962
|
logWarning2(
|
|
959
|
-
`Skipping
|
|
963
|
+
`Skipping tool-trajectory evaluator '${name}' in '${evalId}': invalid mode '${mode}' (must be any_order, in_order, exact, subset, or superset)`
|
|
960
964
|
);
|
|
961
965
|
continue;
|
|
962
966
|
}
|
|
@@ -965,7 +969,7 @@ async function parseEvaluatorList(candidateEvaluators, searchRoots, evalId) {
|
|
|
965
969
|
if (rawMinimums !== void 0) {
|
|
966
970
|
if (!isJsonObject2(rawMinimums)) {
|
|
967
971
|
logWarning2(
|
|
968
|
-
`Skipping
|
|
972
|
+
`Skipping tool-trajectory evaluator '${name}' in '${evalId}': minimums must be an object`
|
|
969
973
|
);
|
|
970
974
|
continue;
|
|
971
975
|
}
|
|
@@ -991,7 +995,7 @@ async function parseEvaluatorList(candidateEvaluators, searchRoots, evalId) {
|
|
|
991
995
|
argsMatch2 = rawArgsMatch;
|
|
992
996
|
} else {
|
|
993
997
|
logWarning2(
|
|
994
|
-
`Invalid args_match '${rawArgsMatch}' for
|
|
998
|
+
`Invalid args_match '${rawArgsMatch}' for tool-trajectory evaluator '${name}' in '${evalId}': must be exact, superset, subset, ignore, or a string array`
|
|
995
999
|
);
|
|
996
1000
|
}
|
|
997
1001
|
}
|
|
@@ -1001,7 +1005,7 @@ async function parseEvaluatorList(candidateEvaluators, searchRoots, evalId) {
|
|
|
1001
1005
|
if (rawExpected !== void 0) {
|
|
1002
1006
|
if (!Array.isArray(rawExpected)) {
|
|
1003
1007
|
logWarning2(
|
|
1004
|
-
`Skipping
|
|
1008
|
+
`Skipping tool-trajectory evaluator '${name}' in '${evalId}': expected must be an array`
|
|
1005
1009
|
);
|
|
1006
1010
|
continue;
|
|
1007
1011
|
}
|
|
@@ -1047,13 +1051,13 @@ async function parseEvaluatorList(candidateEvaluators, searchRoots, evalId) {
|
|
|
1047
1051
|
}
|
|
1048
1052
|
if (mode === "any_order" && !minimums) {
|
|
1049
1053
|
logWarning2(
|
|
1050
|
-
`Skipping
|
|
1054
|
+
`Skipping tool-trajectory evaluator '${name}' in '${evalId}': any_order mode requires minimums`
|
|
1051
1055
|
);
|
|
1052
1056
|
continue;
|
|
1053
1057
|
}
|
|
1054
1058
|
if ((mode === "in_order" || mode === "exact" || mode === "subset" || mode === "superset") && !expected) {
|
|
1055
1059
|
logWarning2(
|
|
1056
|
-
`Skipping
|
|
1060
|
+
`Skipping tool-trajectory evaluator '${name}' in '${evalId}': ${mode} mode requires expected`
|
|
1057
1061
|
);
|
|
1058
1062
|
continue;
|
|
1059
1063
|
}
|
|
@@ -1061,7 +1065,7 @@ async function parseEvaluatorList(candidateEvaluators, searchRoots, evalId) {
|
|
|
1061
1065
|
const required2 = parseRequired(rawEvaluator.required);
|
|
1062
1066
|
const config2 = {
|
|
1063
1067
|
name,
|
|
1064
|
-
type: "
|
|
1068
|
+
type: "tool-trajectory",
|
|
1065
1069
|
mode,
|
|
1066
1070
|
...minimums ? { minimums } : {},
|
|
1067
1071
|
...expected ? { expected } : {},
|
|
@@ -1073,17 +1077,17 @@ async function parseEvaluatorList(candidateEvaluators, searchRoots, evalId) {
|
|
|
1073
1077
|
evaluators.push(config2);
|
|
1074
1078
|
continue;
|
|
1075
1079
|
}
|
|
1076
|
-
if (typeValue === "
|
|
1080
|
+
if (typeValue === "field-accuracy") {
|
|
1077
1081
|
const rawFields = rawEvaluator.fields;
|
|
1078
1082
|
if (!Array.isArray(rawFields)) {
|
|
1079
1083
|
logWarning2(
|
|
1080
|
-
`Skipping
|
|
1084
|
+
`Skipping field-accuracy evaluator '${name}' in '${evalId}': missing fields array`
|
|
1081
1085
|
);
|
|
1082
1086
|
continue;
|
|
1083
1087
|
}
|
|
1084
1088
|
if (rawFields.length === 0) {
|
|
1085
1089
|
logWarning2(
|
|
1086
|
-
`Skipping
|
|
1090
|
+
`Skipping field-accuracy evaluator '${name}' in '${evalId}': fields array is empty`
|
|
1087
1091
|
);
|
|
1088
1092
|
continue;
|
|
1089
1093
|
}
|
|
@@ -1091,7 +1095,7 @@ async function parseEvaluatorList(candidateEvaluators, searchRoots, evalId) {
|
|
|
1091
1095
|
for (const rawField of rawFields) {
|
|
1092
1096
|
if (!isJsonObject2(rawField)) {
|
|
1093
1097
|
logWarning2(
|
|
1094
|
-
`Skipping invalid field entry in
|
|
1098
|
+
`Skipping invalid field entry in field-accuracy evaluator '${name}' (expected object)`
|
|
1095
1099
|
);
|
|
1096
1100
|
continue;
|
|
1097
1101
|
}
|
|
@@ -1099,13 +1103,13 @@ async function parseEvaluatorList(candidateEvaluators, searchRoots, evalId) {
|
|
|
1099
1103
|
const match = asString(rawField.match);
|
|
1100
1104
|
if (!fieldPath) {
|
|
1101
1105
|
logWarning2(
|
|
1102
|
-
`Skipping field without path in
|
|
1106
|
+
`Skipping field without path in field-accuracy evaluator '${name}' in '${evalId}'`
|
|
1103
1107
|
);
|
|
1104
1108
|
continue;
|
|
1105
1109
|
}
|
|
1106
1110
|
if (!match || !isValidFieldMatchType(match)) {
|
|
1107
1111
|
logWarning2(
|
|
1108
|
-
`Skipping field '${fieldPath}' with invalid match type '${match}' in evaluator '${name}' (must be exact, numeric_tolerance, or date). For fuzzy matching, use a
|
|
1112
|
+
`Skipping field '${fieldPath}' with invalid match type '${match}' in evaluator '${name}' (must be exact, numeric_tolerance, or date). For fuzzy matching, use a code-judge evaluator.`
|
|
1109
1113
|
);
|
|
1110
1114
|
continue;
|
|
1111
1115
|
}
|
|
@@ -1122,7 +1126,7 @@ async function parseEvaluatorList(candidateEvaluators, searchRoots, evalId) {
|
|
|
1122
1126
|
}
|
|
1123
1127
|
if (fields.length === 0) {
|
|
1124
1128
|
logWarning2(
|
|
1125
|
-
`Skipping
|
|
1129
|
+
`Skipping field-accuracy evaluator '${name}' in '${evalId}': no valid fields found`
|
|
1126
1130
|
);
|
|
1127
1131
|
continue;
|
|
1128
1132
|
}
|
|
@@ -1132,7 +1136,7 @@ async function parseEvaluatorList(candidateEvaluators, searchRoots, evalId) {
|
|
|
1132
1136
|
const required2 = parseRequired(rawEvaluator.required);
|
|
1133
1137
|
evaluators.push({
|
|
1134
1138
|
name,
|
|
1135
|
-
type: "
|
|
1139
|
+
type: "field-accuracy",
|
|
1136
1140
|
fields,
|
|
1137
1141
|
...validAggregation ? { aggregation: validAggregation } : {},
|
|
1138
1142
|
...weight2 !== void 0 ? { weight: weight2 } : {},
|
|
@@ -1181,7 +1185,7 @@ async function parseEvaluatorList(candidateEvaluators, searchRoots, evalId) {
|
|
|
1181
1185
|
});
|
|
1182
1186
|
continue;
|
|
1183
1187
|
}
|
|
1184
|
-
if (typeValue === "
|
|
1188
|
+
if (typeValue === "token-usage") {
|
|
1185
1189
|
const maxTotal = rawEvaluator.max_total ?? rawEvaluator.maxTotal;
|
|
1186
1190
|
const maxInput = rawEvaluator.max_input ?? rawEvaluator.maxInput;
|
|
1187
1191
|
const maxOutput = rawEvaluator.max_output ?? rawEvaluator.maxOutput;
|
|
@@ -1195,7 +1199,7 @@ async function parseEvaluatorList(candidateEvaluators, searchRoots, evalId) {
|
|
|
1195
1199
|
if (raw === void 0) continue;
|
|
1196
1200
|
if (typeof raw !== "number" || !Number.isFinite(raw) || raw < 0) {
|
|
1197
1201
|
logWarning2(
|
|
1198
|
-
`Skipping
|
|
1202
|
+
`Skipping token-usage evaluator '${name}' in '${evalId}': ${key} must be a non-negative finite number`
|
|
1199
1203
|
);
|
|
1200
1204
|
continue;
|
|
1201
1205
|
}
|
|
@@ -1203,7 +1207,7 @@ async function parseEvaluatorList(candidateEvaluators, searchRoots, evalId) {
|
|
|
1203
1207
|
}
|
|
1204
1208
|
if (validLimits.max_total === void 0 && validLimits.max_input === void 0 && validLimits.max_output === void 0) {
|
|
1205
1209
|
logWarning2(
|
|
1206
|
-
`Skipping
|
|
1210
|
+
`Skipping token-usage evaluator '${name}' in '${evalId}': must set at least one of max_total, max_input, max_output`
|
|
1207
1211
|
);
|
|
1208
1212
|
continue;
|
|
1209
1213
|
}
|
|
@@ -1211,7 +1215,7 @@ async function parseEvaluatorList(candidateEvaluators, searchRoots, evalId) {
|
|
|
1211
1215
|
const required2 = parseRequired(rawEvaluator.required);
|
|
1212
1216
|
evaluators.push({
|
|
1213
1217
|
name,
|
|
1214
|
-
type: "
|
|
1218
|
+
type: "token-usage",
|
|
1215
1219
|
...validLimits,
|
|
1216
1220
|
...weight2 !== void 0 ? { weight: weight2 } : {},
|
|
1217
1221
|
...required2 !== void 0 ? { required: required2 } : {},
|
|
@@ -1219,7 +1223,7 @@ async function parseEvaluatorList(candidateEvaluators, searchRoots, evalId) {
|
|
|
1219
1223
|
});
|
|
1220
1224
|
continue;
|
|
1221
1225
|
}
|
|
1222
|
-
if (typeValue === "
|
|
1226
|
+
if (typeValue === "execution-metrics") {
|
|
1223
1227
|
const maxToolCalls = rawEvaluator.max_tool_calls ?? rawEvaluator.maxToolCalls;
|
|
1224
1228
|
const maxLlmCalls = rawEvaluator.max_llm_calls ?? rawEvaluator.maxLlmCalls;
|
|
1225
1229
|
const maxTokens = rawEvaluator.max_tokens ?? rawEvaluator.maxTokens;
|
|
@@ -1242,7 +1246,7 @@ async function parseEvaluatorList(candidateEvaluators, searchRoots, evalId) {
|
|
|
1242
1246
|
if (raw === void 0) continue;
|
|
1243
1247
|
if (typeof raw !== "number" || !Number.isFinite(raw) || raw < 0) {
|
|
1244
1248
|
logWarning2(
|
|
1245
|
-
`Skipping
|
|
1249
|
+
`Skipping execution-metrics evaluator '${name}' in '${evalId}': ${key} must be a non-negative finite number`
|
|
1246
1250
|
);
|
|
1247
1251
|
hasError = true;
|
|
1248
1252
|
break;
|
|
@@ -1255,7 +1259,7 @@ async function parseEvaluatorList(candidateEvaluators, searchRoots, evalId) {
|
|
|
1255
1259
|
const hasThreshold = validThresholds.max_tool_calls !== void 0 || validThresholds.max_llm_calls !== void 0 || validThresholds.max_tokens !== void 0 || validThresholds.max_cost_usd !== void 0 || validThresholds.max_duration_ms !== void 0 || validThresholds.target_exploration_ratio !== void 0;
|
|
1256
1260
|
if (!hasThreshold) {
|
|
1257
1261
|
logWarning2(
|
|
1258
|
-
`Skipping
|
|
1262
|
+
`Skipping execution-metrics evaluator '${name}' in '${evalId}': must set at least one threshold (max_tool_calls, max_llm_calls, max_tokens, max_cost_usd, max_duration_ms, or target_exploration_ratio)`
|
|
1259
1263
|
);
|
|
1260
1264
|
continue;
|
|
1261
1265
|
}
|
|
@@ -1263,7 +1267,7 @@ async function parseEvaluatorList(candidateEvaluators, searchRoots, evalId) {
|
|
|
1263
1267
|
const required2 = parseRequired(rawEvaluator.required);
|
|
1264
1268
|
evaluators.push({
|
|
1265
1269
|
name,
|
|
1266
|
-
type: "
|
|
1270
|
+
type: "execution-metrics",
|
|
1267
1271
|
...validThresholds,
|
|
1268
1272
|
...weight2 !== void 0 ? { weight: weight2 } : {},
|
|
1269
1273
|
...required2 !== void 0 ? { required: required2 } : {},
|
|
@@ -1271,13 +1275,13 @@ async function parseEvaluatorList(candidateEvaluators, searchRoots, evalId) {
|
|
|
1271
1275
|
});
|
|
1272
1276
|
continue;
|
|
1273
1277
|
}
|
|
1274
|
-
if (typeValue === "
|
|
1278
|
+
if (typeValue === "agent-judge") {
|
|
1275
1279
|
const rawMaxSteps = rawEvaluator.max_steps ?? rawEvaluator.maxSteps;
|
|
1276
1280
|
let maxSteps;
|
|
1277
1281
|
if (rawMaxSteps !== void 0) {
|
|
1278
1282
|
if (typeof rawMaxSteps !== "number" || !Number.isInteger(rawMaxSteps) || rawMaxSteps < 1 || rawMaxSteps > 50) {
|
|
1279
1283
|
logWarning2(
|
|
1280
|
-
`Skipping
|
|
1284
|
+
`Skipping agent-judge evaluator '${name}' in '${evalId}': max_steps must be an integer 1-50`
|
|
1281
1285
|
);
|
|
1282
1286
|
continue;
|
|
1283
1287
|
}
|
|
@@ -1288,7 +1292,7 @@ async function parseEvaluatorList(candidateEvaluators, searchRoots, evalId) {
|
|
|
1288
1292
|
if (rawTemperature !== void 0) {
|
|
1289
1293
|
if (typeof rawTemperature !== "number" || rawTemperature < 0 || rawTemperature > 2) {
|
|
1290
1294
|
logWarning2(
|
|
1291
|
-
`Skipping
|
|
1295
|
+
`Skipping agent-judge evaluator '${name}' in '${evalId}': temperature must be a number 0-2`
|
|
1292
1296
|
);
|
|
1293
1297
|
continue;
|
|
1294
1298
|
}
|
|
@@ -1311,7 +1315,7 @@ async function parseEvaluatorList(candidateEvaluators, searchRoots, evalId) {
|
|
|
1311
1315
|
const required2 = parseRequired(rawEvaluator.required);
|
|
1312
1316
|
evaluators.push({
|
|
1313
1317
|
name,
|
|
1314
|
-
type: "
|
|
1318
|
+
type: "agent-judge",
|
|
1315
1319
|
...agentPrompt ? { prompt: agentPrompt } : {},
|
|
1316
1320
|
...agentPromptPath ? { promptPath: agentPromptPath, resolvedPromptPath: agentPromptPath } : {},
|
|
1317
1321
|
...agentParsedRubrics && agentParsedRubrics.length > 0 ? { rubrics: agentParsedRubrics } : {},
|
|
@@ -1342,7 +1346,7 @@ async function parseEvaluatorList(candidateEvaluators, searchRoots, evalId) {
|
|
|
1342
1346
|
});
|
|
1343
1347
|
continue;
|
|
1344
1348
|
}
|
|
1345
|
-
if (typeValue === "
|
|
1349
|
+
if (typeValue === "contains-any" || typeValue === "contains-all") {
|
|
1346
1350
|
const value = asStringArrayStrict(rawEvaluator.value);
|
|
1347
1351
|
if (!value || value.length === 0) {
|
|
1348
1352
|
logWarning2(
|
|
@@ -1380,7 +1384,7 @@ async function parseEvaluatorList(candidateEvaluators, searchRoots, evalId) {
|
|
|
1380
1384
|
});
|
|
1381
1385
|
continue;
|
|
1382
1386
|
}
|
|
1383
|
-
if (typeValue === "
|
|
1387
|
+
if (typeValue === "icontains-any" || typeValue === "icontains-all") {
|
|
1384
1388
|
const value = asStringArrayStrict(rawEvaluator.value);
|
|
1385
1389
|
if (!value || value.length === 0) {
|
|
1386
1390
|
logWarning2(
|
|
@@ -1400,7 +1404,7 @@ async function parseEvaluatorList(candidateEvaluators, searchRoots, evalId) {
|
|
|
1400
1404
|
});
|
|
1401
1405
|
continue;
|
|
1402
1406
|
}
|
|
1403
|
-
if (typeValue === "
|
|
1407
|
+
if (typeValue === "starts-with" || typeValue === "ends-with") {
|
|
1404
1408
|
const value = asString(rawEvaluator.value);
|
|
1405
1409
|
if (!value) {
|
|
1406
1410
|
logWarning2(`Skipping ${typeValue} evaluator '${name}' in '${evalId}': missing value`);
|
|
@@ -1438,12 +1442,12 @@ async function parseEvaluatorList(candidateEvaluators, searchRoots, evalId) {
|
|
|
1438
1442
|
});
|
|
1439
1443
|
continue;
|
|
1440
1444
|
}
|
|
1441
|
-
if (typeValue === "
|
|
1445
|
+
if (typeValue === "is-json") {
|
|
1442
1446
|
const weight2 = validateWeight(rawEvaluator.weight, name, evalId);
|
|
1443
1447
|
const required2 = parseRequired(rawEvaluator.required);
|
|
1444
1448
|
evaluators.push({
|
|
1445
1449
|
name,
|
|
1446
|
-
type: "
|
|
1450
|
+
type: "is-json",
|
|
1447
1451
|
...weight2 !== void 0 ? { weight: weight2 } : {},
|
|
1448
1452
|
...required2 !== void 0 ? { required: required2 } : {},
|
|
1449
1453
|
...negate !== void 0 ? { negate } : {}
|
|
@@ -1491,7 +1495,7 @@ async function parseEvaluatorList(candidateEvaluators, searchRoots, evalId) {
|
|
|
1491
1495
|
const required2 = parseRequired(rawEvaluator.required);
|
|
1492
1496
|
evaluators.push({
|
|
1493
1497
|
name,
|
|
1494
|
-
type: "
|
|
1498
|
+
type: "llm-judge",
|
|
1495
1499
|
rubrics: parsedCriteria,
|
|
1496
1500
|
...weight2 !== void 0 ? { weight: weight2 } : {},
|
|
1497
1501
|
...required2 !== void 0 ? { required: required2 } : {},
|
|
@@ -1558,7 +1562,7 @@ async function parseEvaluatorList(candidateEvaluators, searchRoots, evalId) {
|
|
|
1558
1562
|
const required2 = parseRequired(rawEvaluator.required);
|
|
1559
1563
|
evaluators.push({
|
|
1560
1564
|
name,
|
|
1561
|
-
type: "
|
|
1565
|
+
type: "llm-judge",
|
|
1562
1566
|
rubrics: parsedRubrics,
|
|
1563
1567
|
...weight2 !== void 0 ? { weight: weight2 } : {},
|
|
1564
1568
|
...required2 !== void 0 ? { required: required2 } : {},
|
|
@@ -1590,7 +1594,7 @@ async function parseEvaluatorList(candidateEvaluators, searchRoots, evalId) {
|
|
|
1590
1594
|
const finalConfig = promptScriptConfig ?? (Object.keys(mergedConfig).length > 0 ? mergedConfig : void 0);
|
|
1591
1595
|
evaluators.push({
|
|
1592
1596
|
name,
|
|
1593
|
-
type: "
|
|
1597
|
+
type: "llm-judge",
|
|
1594
1598
|
prompt,
|
|
1595
1599
|
promptPath,
|
|
1596
1600
|
...promptPath ? { resolvedPromptPath: promptPath } : {},
|
|
@@ -1606,15 +1610,15 @@ async function parseEvaluatorList(candidateEvaluators, searchRoots, evalId) {
|
|
|
1606
1610
|
}
|
|
1607
1611
|
var ASSERTION_TYPES = /* @__PURE__ */ new Set([
|
|
1608
1612
|
"contains",
|
|
1609
|
-
"
|
|
1610
|
-
"
|
|
1613
|
+
"contains-any",
|
|
1614
|
+
"contains-all",
|
|
1611
1615
|
"icontains",
|
|
1612
|
-
"
|
|
1613
|
-
"
|
|
1614
|
-
"
|
|
1615
|
-
"
|
|
1616
|
+
"icontains-any",
|
|
1617
|
+
"icontains-all",
|
|
1618
|
+
"starts-with",
|
|
1619
|
+
"ends-with",
|
|
1616
1620
|
"regex",
|
|
1617
|
-
"
|
|
1621
|
+
"is-json",
|
|
1618
1622
|
"equals",
|
|
1619
1623
|
"rubrics"
|
|
1620
1624
|
]);
|
|
@@ -1627,24 +1631,24 @@ function generateAssertionName(typeValue, rawEvaluator) {
|
|
|
1627
1631
|
switch (typeValue) {
|
|
1628
1632
|
case "contains":
|
|
1629
1633
|
return value ? `contains-${value}` : "contains";
|
|
1630
|
-
case "
|
|
1631
|
-
return arrayValue ? `
|
|
1632
|
-
case "
|
|
1633
|
-
return arrayValue ? `
|
|
1634
|
+
case "contains-any":
|
|
1635
|
+
return arrayValue ? `contains-any-${arrayValue.length}` : "contains-any";
|
|
1636
|
+
case "contains-all":
|
|
1637
|
+
return arrayValue ? `contains-all-${arrayValue.length}` : "contains-all";
|
|
1634
1638
|
case "icontains":
|
|
1635
1639
|
return value ? `icontains-${value}` : "icontains";
|
|
1636
|
-
case "
|
|
1637
|
-
return arrayValue ? `
|
|
1638
|
-
case "
|
|
1639
|
-
return arrayValue ? `
|
|
1640
|
-
case "
|
|
1641
|
-
return value ? `
|
|
1642
|
-
case "
|
|
1643
|
-
return value ? `
|
|
1640
|
+
case "icontains-any":
|
|
1641
|
+
return arrayValue ? `icontains-any-${arrayValue.length}` : "icontains-any";
|
|
1642
|
+
case "icontains-all":
|
|
1643
|
+
return arrayValue ? `icontains-all-${arrayValue.length}` : "icontains-all";
|
|
1644
|
+
case "starts-with":
|
|
1645
|
+
return value ? `starts-with-${value}` : "starts-with";
|
|
1646
|
+
case "ends-with":
|
|
1647
|
+
return value ? `ends-with-${value}` : "ends-with";
|
|
1644
1648
|
case "regex":
|
|
1645
1649
|
return value ? `regex-${value.length > 30 ? value.slice(0, 30) : value}` : "regex";
|
|
1646
|
-
case "
|
|
1647
|
-
return "
|
|
1650
|
+
case "is-json":
|
|
1651
|
+
return "is-json";
|
|
1648
1652
|
case "equals":
|
|
1649
1653
|
return value ? `equals-${value}` : "equals";
|
|
1650
1654
|
case "rubrics":
|
|
@@ -1657,8 +1661,9 @@ function coerceEvaluator(candidate, contextId) {
|
|
|
1657
1661
|
if (typeof candidate !== "string") {
|
|
1658
1662
|
return void 0;
|
|
1659
1663
|
}
|
|
1660
|
-
|
|
1661
|
-
|
|
1664
|
+
const normalized = normalizeEvaluatorType(candidate);
|
|
1665
|
+
if (isEvaluatorKind(normalized)) {
|
|
1666
|
+
return normalized;
|
|
1662
1667
|
}
|
|
1663
1668
|
logWarning2(`Unknown evaluator '${candidate}' in ${contextId}, falling back to default`);
|
|
1664
1669
|
return void 0;
|
|
@@ -1704,6 +1709,16 @@ function parseCommandToArgv(command) {
|
|
|
1704
1709
|
function isJsonObject2(value) {
|
|
1705
1710
|
return typeof value === "object" && value !== null && !Array.isArray(value);
|
|
1706
1711
|
}
|
|
1712
|
+
var CRITERIA_CONSUMER_TYPES = /* @__PURE__ */ new Set(["llm-judge", "agent-judge", "code-judge"]);
|
|
1713
|
+
function warnUnconsumedCriteria(criteria, evaluators, testId) {
|
|
1714
|
+
if (!criteria?.trim() || !evaluators || evaluators.length === 0) return;
|
|
1715
|
+
const hasConsumer = evaluators.some((e) => CRITERIA_CONSUMER_TYPES.has(e.type));
|
|
1716
|
+
if (!hasConsumer) {
|
|
1717
|
+
logWarning2(
|
|
1718
|
+
`Test '${testId}': criteria is defined but no evaluator in assert will evaluate it. Add 'type: llm-judge' to assert, or remove criteria if it is documentation-only.`
|
|
1719
|
+
);
|
|
1720
|
+
}
|
|
1721
|
+
}
|
|
1707
1722
|
function logWarning2(message, details) {
|
|
1708
1723
|
if (details && details.length > 0) {
|
|
1709
1724
|
const detailBlock = details.join("\n");
|
|
@@ -1953,7 +1968,7 @@ function parseInlineRubrics(rawRubrics) {
|
|
|
1953
1968
|
}
|
|
1954
1969
|
return {
|
|
1955
1970
|
name: "rubric",
|
|
1956
|
-
type: "
|
|
1971
|
+
type: "llm-judge",
|
|
1957
1972
|
rubrics: rubricItems
|
|
1958
1973
|
};
|
|
1959
1974
|
}
|
|
@@ -2338,7 +2353,7 @@ async function loadTestsFromJsonl(evalFilePath, repoRoot, options) {
|
|
|
2338
2353
|
const rawCases = parseJsonlContent(rawFile, evalFilePath);
|
|
2339
2354
|
const fallbackDataset = path6.basename(absoluteTestPath, ".jsonl") || "eval";
|
|
2340
2355
|
const datasetName = sidecar.dataset && sidecar.dataset.trim().length > 0 ? sidecar.dataset : fallbackDataset;
|
|
2341
|
-
const globalEvaluator = coerceEvaluator(sidecar.evaluator, "sidecar") ?? "
|
|
2356
|
+
const globalEvaluator = coerceEvaluator(sidecar.evaluator, "sidecar") ?? "llm-judge";
|
|
2342
2357
|
const globalExecution = sidecar.execution;
|
|
2343
2358
|
if (verbose) {
|
|
2344
2359
|
console.log(`
|
|
@@ -2426,6 +2441,7 @@ async function loadTestsFromJsonl(evalFilePath, repoRoot, options) {
|
|
|
2426
2441
|
evaluators = evaluators ? [rubricEvaluator, ...evaluators] : [rubricEvaluator];
|
|
2427
2442
|
}
|
|
2428
2443
|
}
|
|
2444
|
+
warnUnconsumedCriteria(outcome, evaluators, id ?? "unknown");
|
|
2429
2445
|
const userFilePaths = [];
|
|
2430
2446
|
for (const segment of inputSegments) {
|
|
2431
2447
|
if (segment.type === "file" && typeof segment.resolvedPath === "string") {
|
|
@@ -2818,7 +2834,7 @@ async function loadTestsFromYaml(evalFilePath, repoRoot, options) {
|
|
|
2818
2834
|
const fallbackDataset = path8.basename(absoluteTestPath).replace(/\.ya?ml$/i, "") || "eval";
|
|
2819
2835
|
const datasetName = datasetNameFromSuite && datasetNameFromSuite.length > 0 ? datasetNameFromSuite : fallbackDataset;
|
|
2820
2836
|
const rawTestcases = resolveTests(suite);
|
|
2821
|
-
const globalEvaluator = coerceEvaluator(suite.evaluator, "global") ?? "
|
|
2837
|
+
const globalEvaluator = coerceEvaluator(suite.evaluator, "global") ?? "llm-judge";
|
|
2822
2838
|
const evalFileDir = path8.dirname(absoluteTestPath);
|
|
2823
2839
|
let expandedTestcases;
|
|
2824
2840
|
if (typeof rawTestcases === "string") {
|
|
@@ -2915,6 +2931,7 @@ async function loadTestsFromYaml(evalFilePath, repoRoot, options) {
|
|
|
2915
2931
|
evaluators = evaluators ? [rubricEvaluator, ...evaluators] : [rubricEvaluator];
|
|
2916
2932
|
}
|
|
2917
2933
|
}
|
|
2934
|
+
warnUnconsumedCriteria(outcome, evaluators, id ?? "unknown");
|
|
2918
2935
|
const userFilePaths = [];
|
|
2919
2936
|
for (const segment of inputSegments) {
|
|
2920
2937
|
if (segment.type === "file" && typeof segment.resolvedPath === "string") {
|
|
@@ -8895,7 +8912,7 @@ function toCamelCaseDeep(obj) {
|
|
|
8895
8912
|
// src/evaluation/evaluators/code-evaluator.ts
|
|
8896
8913
|
var FILE_BACKED_OUTPUT_THRESHOLD = 5e4;
|
|
8897
8914
|
var CodeEvaluator = class {
|
|
8898
|
-
kind = "code";
|
|
8915
|
+
kind = "code-judge";
|
|
8899
8916
|
command;
|
|
8900
8917
|
cwd;
|
|
8901
8918
|
agentTimeoutMs;
|
|
@@ -9103,7 +9120,7 @@ var scoreRangeEvaluationSchema = z3.object({
|
|
|
9103
9120
|
overall_reasoning: z3.string().describe("Overall assessment summary (1-2 sentences)").optional()
|
|
9104
9121
|
});
|
|
9105
9122
|
var LlmJudgeEvaluator = class {
|
|
9106
|
-
kind = "
|
|
9123
|
+
kind = "llm-judge";
|
|
9107
9124
|
resolveJudgeProvider;
|
|
9108
9125
|
maxOutputTokens;
|
|
9109
9126
|
temperature;
|
|
@@ -9120,7 +9137,7 @@ var LlmJudgeEvaluator = class {
|
|
|
9120
9137
|
throw new Error("No judge provider available for LLM grading");
|
|
9121
9138
|
}
|
|
9122
9139
|
const config = context.evaluator;
|
|
9123
|
-
if (config?.type === "
|
|
9140
|
+
if (config?.type === "llm-judge" && config.rubrics && config.rubrics.length > 0) {
|
|
9124
9141
|
return this.evaluateWithRubrics(context, judgeProvider, config.rubrics);
|
|
9125
9142
|
}
|
|
9126
9143
|
return this.evaluateFreeform(context, judgeProvider);
|
|
@@ -9194,7 +9211,7 @@ ${context.fileChanges}`;
|
|
|
9194
9211
|
async evaluateWithRubrics(context, judgeProvider, rubrics) {
|
|
9195
9212
|
if (!rubrics || rubrics.length === 0) {
|
|
9196
9213
|
throw new Error(
|
|
9197
|
-
`No rubrics found for evaluator "${context.evaluator?.name ?? "
|
|
9214
|
+
`No rubrics found for evaluator "${context.evaluator?.name ?? "llm-judge"}". Run "agentv generate rubrics" first.`
|
|
9198
9215
|
);
|
|
9199
9216
|
}
|
|
9200
9217
|
const hasScoreRanges = rubrics.some((r) => r.score_ranges && r.score_ranges.length > 0);
|
|
@@ -9530,9 +9547,9 @@ var CompositeEvaluator = class {
|
|
|
9530
9547
|
async aggregate(results, context) {
|
|
9531
9548
|
const aggregator = this.config.aggregator;
|
|
9532
9549
|
switch (aggregator.type) {
|
|
9533
|
-
case "
|
|
9550
|
+
case "code-judge":
|
|
9534
9551
|
return this.runCodeAggregator(results, aggregator.path, aggregator.cwd ?? this.cwd);
|
|
9535
|
-
case "
|
|
9552
|
+
case "llm-judge":
|
|
9536
9553
|
return this.runLlmAggregator(results, context, aggregator);
|
|
9537
9554
|
case "threshold":
|
|
9538
9555
|
return this.runThreshold(results, aggregator.threshold);
|
|
@@ -9675,7 +9692,7 @@ var CompositeEvaluator = class {
|
|
|
9675
9692
|
expectedAspectCount: hits.length + misses.length || 1,
|
|
9676
9693
|
reasoning,
|
|
9677
9694
|
evaluatorRawRequest: {
|
|
9678
|
-
aggregator: "
|
|
9695
|
+
aggregator: "code-judge",
|
|
9679
9696
|
script: scriptPath
|
|
9680
9697
|
},
|
|
9681
9698
|
scores
|
|
@@ -9690,7 +9707,7 @@ var CompositeEvaluator = class {
|
|
|
9690
9707
|
expectedAspectCount: 1,
|
|
9691
9708
|
reasoning: message,
|
|
9692
9709
|
evaluatorRawRequest: {
|
|
9693
|
-
aggregator: "
|
|
9710
|
+
aggregator: "code-judge",
|
|
9694
9711
|
script: scriptPath,
|
|
9695
9712
|
error: message
|
|
9696
9713
|
},
|
|
@@ -9721,7 +9738,7 @@ var CompositeEvaluator = class {
|
|
|
9721
9738
|
const userPrompt = promptTemplate.replace(/\{\{EVALUATOR_RESULTS_JSON\}\}/g, resultsJson);
|
|
9722
9739
|
const systemPrompt = buildOutputSchema();
|
|
9723
9740
|
const evaluatorRawRequest = {
|
|
9724
|
-
aggregator: "
|
|
9741
|
+
aggregator: "llm-judge",
|
|
9725
9742
|
userPrompt,
|
|
9726
9743
|
systemPrompt,
|
|
9727
9744
|
target: judgeProvider.targetName
|
|
@@ -9833,7 +9850,7 @@ var CostEvaluator = class {
|
|
|
9833
9850
|
|
|
9834
9851
|
// src/evaluation/evaluators/execution-metrics.ts
|
|
9835
9852
|
var ExecutionMetricsEvaluator = class {
|
|
9836
|
-
kind = "
|
|
9853
|
+
kind = "execution-metrics";
|
|
9837
9854
|
config;
|
|
9838
9855
|
constructor(options) {
|
|
9839
9856
|
this.config = options.config;
|
|
@@ -9859,7 +9876,7 @@ var ExecutionMetricsEvaluator = class {
|
|
|
9859
9876
|
expectedAspectCount: 1,
|
|
9860
9877
|
reasoning: "Execution metrics not available - no trace summary provided",
|
|
9861
9878
|
evaluatorRawRequest: {
|
|
9862
|
-
type: "
|
|
9879
|
+
type: "execution-metrics",
|
|
9863
9880
|
config: this.extractConfiguredThresholds(),
|
|
9864
9881
|
actual: null
|
|
9865
9882
|
}
|
|
@@ -9968,7 +9985,7 @@ var ExecutionMetricsEvaluator = class {
|
|
|
9968
9985
|
if (actualMetrics.exploration_ratio !== void 0) {
|
|
9969
9986
|
reasoningParts.push(`exploration_ratio=${actualMetrics.exploration_ratio.toFixed(2)}`);
|
|
9970
9987
|
}
|
|
9971
|
-
const reasoning = reasoningParts.length > 0 ? `
|
|
9988
|
+
const reasoning = reasoningParts.length > 0 ? `execution-metrics ${reasoningParts.join(", ")}` : "No metrics evaluated";
|
|
9972
9989
|
return {
|
|
9973
9990
|
score,
|
|
9974
9991
|
verdict: scoreToVerdict(score),
|
|
@@ -9977,7 +9994,7 @@ var ExecutionMetricsEvaluator = class {
|
|
|
9977
9994
|
expectedAspectCount: totalChecks || 1,
|
|
9978
9995
|
reasoning,
|
|
9979
9996
|
evaluatorRawRequest: {
|
|
9980
|
-
type: "
|
|
9997
|
+
type: "execution-metrics",
|
|
9981
9998
|
config: this.extractConfiguredThresholds(),
|
|
9982
9999
|
actual: this.filterDefinedMetrics(actualMetrics)
|
|
9983
10000
|
}
|
|
@@ -10065,7 +10082,7 @@ var MONTH_NAMES = {
|
|
|
10065
10082
|
december: 11
|
|
10066
10083
|
};
|
|
10067
10084
|
var FieldAccuracyEvaluator = class {
|
|
10068
|
-
kind = "
|
|
10085
|
+
kind = "field-accuracy";
|
|
10069
10086
|
config;
|
|
10070
10087
|
constructor(options) {
|
|
10071
10088
|
this.config = options.config;
|
|
@@ -10519,7 +10536,7 @@ var BINARY_EXTENSIONS = /* @__PURE__ */ new Set([
|
|
|
10519
10536
|
".dylib"
|
|
10520
10537
|
]);
|
|
10521
10538
|
var AgentJudgeEvaluator = class {
|
|
10522
|
-
kind = "
|
|
10539
|
+
kind = "agent-judge";
|
|
10523
10540
|
resolveJudgeProvider;
|
|
10524
10541
|
maxSteps;
|
|
10525
10542
|
temperature;
|
|
@@ -10544,24 +10561,24 @@ var AgentJudgeEvaluator = class {
|
|
|
10544
10561
|
async evaluateBuiltIn(context) {
|
|
10545
10562
|
const judgeProvider = await this.resolveJudgeProvider(context);
|
|
10546
10563
|
if (!judgeProvider) {
|
|
10547
|
-
throw new Error("No judge provider available for
|
|
10564
|
+
throw new Error("No judge provider available for agent-judge evaluation");
|
|
10548
10565
|
}
|
|
10549
10566
|
const model = judgeProvider.asLanguageModel?.();
|
|
10550
10567
|
if (!model) {
|
|
10551
10568
|
throw new Error(
|
|
10552
|
-
`Judge provider '${judgeProvider.targetName}' does not support asLanguageModel() \u2014 required for built-in
|
|
10569
|
+
`Judge provider '${judgeProvider.targetName}' does not support asLanguageModel() \u2014 required for built-in agent-judge mode`
|
|
10553
10570
|
);
|
|
10554
10571
|
}
|
|
10555
10572
|
const workspacePath = context.workspacePath;
|
|
10556
10573
|
if (!workspacePath) {
|
|
10557
10574
|
throw new Error(
|
|
10558
|
-
"
|
|
10575
|
+
"agent-judge evaluator requires a workspace_template target (workspacePath is not set)"
|
|
10559
10576
|
);
|
|
10560
10577
|
}
|
|
10561
10578
|
const systemPrompt = this.buildSystemPrompt(context);
|
|
10562
10579
|
const userPrompt = this.buildUserPrompt(context);
|
|
10563
10580
|
const config = context.evaluator;
|
|
10564
|
-
const rubrics = config?.type === "
|
|
10581
|
+
const rubrics = config?.type === "agent-judge" ? config.rubrics : void 0;
|
|
10565
10582
|
const fsTools = createFilesystemTools(workspacePath);
|
|
10566
10583
|
const evaluatorRawRequest = {
|
|
10567
10584
|
mode: "built-in",
|
|
@@ -10592,7 +10609,7 @@ var AgentJudgeEvaluator = class {
|
|
|
10592
10609
|
score: 0,
|
|
10593
10610
|
verdict: "fail",
|
|
10594
10611
|
hits: [],
|
|
10595
|
-
misses: [`
|
|
10612
|
+
misses: [`agent-judge built-in evaluation failed: ${message}`],
|
|
10596
10613
|
expectedAspectCount: 1,
|
|
10597
10614
|
evaluatorRawRequest,
|
|
10598
10615
|
details: { mode: "built-in", error: message }
|
|
@@ -10624,14 +10641,14 @@ var AgentJudgeEvaluator = class {
|
|
|
10624
10641
|
score: 0,
|
|
10625
10642
|
verdict: "fail",
|
|
10626
10643
|
hits: [],
|
|
10627
|
-
misses: ["
|
|
10644
|
+
misses: ["agent-judge judge_target returned no assistant response"],
|
|
10628
10645
|
expectedAspectCount: 1,
|
|
10629
10646
|
evaluatorRawRequest,
|
|
10630
10647
|
details: { mode: "judge_target", judge_target: provider.targetName }
|
|
10631
10648
|
};
|
|
10632
10649
|
}
|
|
10633
10650
|
const config = context.evaluator;
|
|
10634
|
-
const rubrics = config?.type === "
|
|
10651
|
+
const rubrics = config?.type === "agent-judge" ? config.rubrics : void 0;
|
|
10635
10652
|
const details = {
|
|
10636
10653
|
mode: "judge_target",
|
|
10637
10654
|
judge_target: provider.targetName
|
|
@@ -10643,7 +10660,7 @@ var AgentJudgeEvaluator = class {
|
|
|
10643
10660
|
score: 0,
|
|
10644
10661
|
verdict: "fail",
|
|
10645
10662
|
hits: [],
|
|
10646
|
-
misses: [`
|
|
10663
|
+
misses: [`agent-judge judge_target evaluation failed: ${message}`],
|
|
10647
10664
|
expectedAspectCount: 1,
|
|
10648
10665
|
evaluatorRawRequest,
|
|
10649
10666
|
details: {
|
|
@@ -10694,7 +10711,7 @@ var AgentJudgeEvaluator = class {
|
|
|
10694
10711
|
score: 0,
|
|
10695
10712
|
verdict: "fail",
|
|
10696
10713
|
hits: [],
|
|
10697
|
-
misses: ["Failed to parse
|
|
10714
|
+
misses: ["Failed to parse agent-judge response as valid evaluation JSON"],
|
|
10698
10715
|
expectedAspectCount: 1,
|
|
10699
10716
|
evaluatorRawRequest,
|
|
10700
10717
|
details
|
|
@@ -10707,7 +10724,7 @@ var AgentJudgeEvaluator = class {
|
|
|
10707
10724
|
*/
|
|
10708
10725
|
buildSystemPrompt(context) {
|
|
10709
10726
|
const config = context.evaluator;
|
|
10710
|
-
const rubrics = config?.type === "
|
|
10727
|
+
const rubrics = config?.type === "agent-judge" ? config.rubrics : void 0;
|
|
10711
10728
|
const parts = [
|
|
10712
10729
|
"You are an expert evaluator with access to the workspace filesystem.",
|
|
10713
10730
|
"Use the provided tools to investigate the workspace and verify the criteria are met.",
|
|
@@ -10738,7 +10755,7 @@ var AgentJudgeEvaluator = class {
|
|
|
10738
10755
|
return substituteVariables(this.evaluatorTemplate, variables);
|
|
10739
10756
|
}
|
|
10740
10757
|
const config = context.evaluator;
|
|
10741
|
-
const rubrics = config?.type === "
|
|
10758
|
+
const rubrics = config?.type === "agent-judge" ? config.rubrics : void 0;
|
|
10742
10759
|
const parts = [
|
|
10743
10760
|
"Evaluate the candidate answer by investigating the workspace.",
|
|
10744
10761
|
"",
|
|
@@ -10781,7 +10798,7 @@ var AgentJudgeEvaluator = class {
|
|
|
10781
10798
|
buildDelegatedPrompt(context) {
|
|
10782
10799
|
const formattedQuestion = context.promptInputs.question && context.promptInputs.question.trim().length > 0 ? context.promptInputs.question : context.evalCase.question;
|
|
10783
10800
|
const config = context.evaluator;
|
|
10784
|
-
const rubrics = config?.type === "
|
|
10801
|
+
const rubrics = config?.type === "agent-judge" ? config.rubrics : void 0;
|
|
10785
10802
|
if (this.evaluatorTemplate) {
|
|
10786
10803
|
const variables = {
|
|
10787
10804
|
[TEMPLATE_VARIABLES.ANSWER]: context.candidate.trim(),
|
|
@@ -10863,11 +10880,11 @@ function createFilesystemTools(workspacePath) {
|
|
|
10863
10880
|
execute: async (input) => {
|
|
10864
10881
|
try {
|
|
10865
10882
|
const resolved = resolveSandboxed(workspacePath, input.path);
|
|
10866
|
-
const
|
|
10867
|
-
if (
|
|
10883
|
+
const stat8 = await fs2.stat(resolved);
|
|
10884
|
+
if (stat8.isDirectory()) {
|
|
10868
10885
|
return { error: `'${input.path}' is a directory, not a file` };
|
|
10869
10886
|
}
|
|
10870
|
-
const buffer = Buffer.alloc(Math.min(
|
|
10887
|
+
const buffer = Buffer.alloc(Math.min(stat8.size, MAX_FILE_SIZE));
|
|
10871
10888
|
const fd = await fs2.open(resolved, "r");
|
|
10872
10889
|
try {
|
|
10873
10890
|
await fd.read(buffer, 0, buffer.length, 0);
|
|
@@ -10875,8 +10892,8 @@ function createFilesystemTools(workspacePath) {
|
|
|
10875
10892
|
await fd.close();
|
|
10876
10893
|
}
|
|
10877
10894
|
const content = buffer.toString("utf-8");
|
|
10878
|
-
const truncated =
|
|
10879
|
-
return { content, truncated, size:
|
|
10895
|
+
const truncated = stat8.size > MAX_FILE_SIZE;
|
|
10896
|
+
return { content, truncated, size: stat8.size };
|
|
10880
10897
|
} catch (error) {
|
|
10881
10898
|
return { error: error instanceof Error ? error.message : String(error) };
|
|
10882
10899
|
}
|
|
@@ -10920,8 +10937,8 @@ async function searchDirectory(dirPath, workspacePath, regex, matches) {
|
|
|
10920
10937
|
const ext = path30.extname(entry.name).toLowerCase();
|
|
10921
10938
|
if (BINARY_EXTENSIONS.has(ext)) continue;
|
|
10922
10939
|
try {
|
|
10923
|
-
const
|
|
10924
|
-
if (
|
|
10940
|
+
const stat8 = await fs2.stat(fullPath);
|
|
10941
|
+
if (stat8.size > MAX_FILE_SIZE) continue;
|
|
10925
10942
|
const content = await fs2.readFile(fullPath, "utf-8");
|
|
10926
10943
|
const lines = content.split("\n");
|
|
10927
10944
|
for (let i = 0; i < lines.length; i++) {
|
|
@@ -11083,7 +11100,7 @@ function assembleScoreRange(evalCase, candidate, promptInputs, rubrics, fileChan
|
|
|
11083
11100
|
|
|
11084
11101
|
// src/evaluation/evaluators/token-usage.ts
|
|
11085
11102
|
var TokenUsageEvaluator = class {
|
|
11086
|
-
kind = "
|
|
11103
|
+
kind = "token-usage";
|
|
11087
11104
|
config;
|
|
11088
11105
|
constructor(options) {
|
|
11089
11106
|
this.config = options.config;
|
|
@@ -11106,7 +11123,7 @@ var TokenUsageEvaluator = class {
|
|
|
11106
11123
|
expectedAspectCount,
|
|
11107
11124
|
reasoning: "Token usage not reported by provider",
|
|
11108
11125
|
evaluatorRawRequest: {
|
|
11109
|
-
type: "
|
|
11126
|
+
type: "token-usage",
|
|
11110
11127
|
max_total: maxTotal ?? null,
|
|
11111
11128
|
max_input: maxInput ?? null,
|
|
11112
11129
|
max_output: maxOutput ?? null,
|
|
@@ -11148,9 +11165,9 @@ var TokenUsageEvaluator = class {
|
|
|
11148
11165
|
hits,
|
|
11149
11166
|
misses,
|
|
11150
11167
|
expectedAspectCount,
|
|
11151
|
-
reasoning: `
|
|
11168
|
+
reasoning: `token-usage input=${input}, output=${output}, cached=${cached}, total=${total}`,
|
|
11152
11169
|
evaluatorRawRequest: {
|
|
11153
|
-
type: "
|
|
11170
|
+
type: "token-usage",
|
|
11154
11171
|
max_total: maxTotal ?? null,
|
|
11155
11172
|
max_input: maxInput ?? null,
|
|
11156
11173
|
max_output: maxOutput ?? null,
|
|
@@ -11235,7 +11252,7 @@ function checkLatency(toolName, maxDurationMs, actualDurationMs) {
|
|
|
11235
11252
|
};
|
|
11236
11253
|
}
|
|
11237
11254
|
var ToolTrajectoryEvaluator = class {
|
|
11238
|
-
kind = "
|
|
11255
|
+
kind = "tool-trajectory";
|
|
11239
11256
|
config;
|
|
11240
11257
|
constructor(options) {
|
|
11241
11258
|
this.config = options.config;
|
|
@@ -11423,7 +11440,7 @@ var ToolTrajectoryEvaluator = class {
|
|
|
11423
11440
|
}
|
|
11424
11441
|
}
|
|
11425
11442
|
for (const warning of warnings) {
|
|
11426
|
-
console.warn(`[
|
|
11443
|
+
console.warn(`[tool-trajectory] ${warning}`);
|
|
11427
11444
|
}
|
|
11428
11445
|
const effectiveLatencyAssertions = latencyAssertionCount - latencySkips;
|
|
11429
11446
|
const totalAssertions = expected.length + effectiveLatencyAssertions;
|
|
@@ -11499,7 +11516,7 @@ var ToolTrajectoryEvaluator = class {
|
|
|
11499
11516
|
misses.push(`Position ${i}: expected ${expected[i].tool}, got nothing`);
|
|
11500
11517
|
}
|
|
11501
11518
|
for (const warning of warnings) {
|
|
11502
|
-
console.warn(`[
|
|
11519
|
+
console.warn(`[tool-trajectory] ${warning}`);
|
|
11503
11520
|
}
|
|
11504
11521
|
const effectiveLatencyAssertions = latencyAssertionCount - latencySkips;
|
|
11505
11522
|
const totalAssertions = expected.length + effectiveLatencyAssertions;
|
|
@@ -11729,7 +11746,7 @@ function runEqualsAssertion(output, value) {
|
|
|
11729
11746
|
|
|
11730
11747
|
// src/evaluation/orchestrator.ts
|
|
11731
11748
|
import { createHash as createHash2, randomUUID as randomUUID7 } from "node:crypto";
|
|
11732
|
-
import { mkdir as mkdir12 } from "node:fs/promises";
|
|
11749
|
+
import { mkdir as mkdir12, stat as stat7 } from "node:fs/promises";
|
|
11733
11750
|
import path37 from "node:path";
|
|
11734
11751
|
import micromatch4 from "micromatch";
|
|
11735
11752
|
|
|
@@ -11989,7 +12006,7 @@ var llmJudgeFactory = (config, context) => {
|
|
|
11989
12006
|
const c = config;
|
|
11990
12007
|
const { llmJudge, agentTimeoutMs } = context;
|
|
11991
12008
|
return {
|
|
11992
|
-
kind: "
|
|
12009
|
+
kind: "llm-judge",
|
|
11993
12010
|
async evaluate(evalContext) {
|
|
11994
12011
|
const customPrompt = await resolveCustomPrompt(
|
|
11995
12012
|
c,
|
|
@@ -12078,7 +12095,7 @@ var agentJudgeFactory = (config, context) => {
|
|
|
12078
12095
|
customPrompt = readFileSync(c.resolvedPromptPath, "utf-8");
|
|
12079
12096
|
} catch (error) {
|
|
12080
12097
|
const message = error instanceof Error ? error.message : String(error);
|
|
12081
|
-
console.warn(`Could not read
|
|
12098
|
+
console.warn(`Could not read agent-judge prompt at ${c.resolvedPromptPath}: ${message}`);
|
|
12082
12099
|
}
|
|
12083
12100
|
} else if (c.prompt) {
|
|
12084
12101
|
customPrompt = c.prompt;
|
|
@@ -12088,7 +12105,7 @@ var agentJudgeFactory = (config, context) => {
|
|
|
12088
12105
|
judgeTargetProvider = targetResolver(c.target);
|
|
12089
12106
|
if (!judgeTargetProvider) {
|
|
12090
12107
|
throw new Error(
|
|
12091
|
-
`
|
|
12108
|
+
`agent-judge evaluator '${c.name}': target '${c.target}' not found in targets`
|
|
12092
12109
|
);
|
|
12093
12110
|
}
|
|
12094
12111
|
}
|
|
@@ -12132,7 +12149,7 @@ var regexFactory = (config) => {
|
|
|
12132
12149
|
});
|
|
12133
12150
|
};
|
|
12134
12151
|
var isJsonFactory = () => {
|
|
12135
|
-
return new DeterministicAssertionEvaluator("
|
|
12152
|
+
return new DeterministicAssertionEvaluator("is-json", (ctx) => {
|
|
12136
12153
|
const result = runIsJsonAssertion(ctx.candidate);
|
|
12137
12154
|
return {
|
|
12138
12155
|
score: result.score,
|
|
@@ -12160,7 +12177,7 @@ var equalsFactory = (config) => {
|
|
|
12160
12177
|
};
|
|
12161
12178
|
var containsAnyFactory = (config) => {
|
|
12162
12179
|
const c = config;
|
|
12163
|
-
return new DeterministicAssertionEvaluator("
|
|
12180
|
+
return new DeterministicAssertionEvaluator("contains-any", (ctx) => {
|
|
12164
12181
|
const result = runContainsAnyAssertion(ctx.candidate, c.value);
|
|
12165
12182
|
return {
|
|
12166
12183
|
score: result.score,
|
|
@@ -12174,7 +12191,7 @@ var containsAnyFactory = (config) => {
|
|
|
12174
12191
|
};
|
|
12175
12192
|
var containsAllFactory = (config) => {
|
|
12176
12193
|
const c = config;
|
|
12177
|
-
return new DeterministicAssertionEvaluator("
|
|
12194
|
+
return new DeterministicAssertionEvaluator("contains-all", (ctx) => {
|
|
12178
12195
|
const result = runContainsAllAssertion(ctx.candidate, c.value);
|
|
12179
12196
|
return {
|
|
12180
12197
|
score: result.score,
|
|
@@ -12202,7 +12219,7 @@ var icontainsFactory = (config) => {
|
|
|
12202
12219
|
};
|
|
12203
12220
|
var icontainsAnyFactory = (config) => {
|
|
12204
12221
|
const c = config;
|
|
12205
|
-
return new DeterministicAssertionEvaluator("
|
|
12222
|
+
return new DeterministicAssertionEvaluator("icontains-any", (ctx) => {
|
|
12206
12223
|
const result = runIcontainsAnyAssertion(ctx.candidate, c.value);
|
|
12207
12224
|
return {
|
|
12208
12225
|
score: result.score,
|
|
@@ -12216,7 +12233,7 @@ var icontainsAnyFactory = (config) => {
|
|
|
12216
12233
|
};
|
|
12217
12234
|
var icontainsAllFactory = (config) => {
|
|
12218
12235
|
const c = config;
|
|
12219
|
-
return new DeterministicAssertionEvaluator("
|
|
12236
|
+
return new DeterministicAssertionEvaluator("icontains-all", (ctx) => {
|
|
12220
12237
|
const result = runIcontainsAllAssertion(ctx.candidate, c.value);
|
|
12221
12238
|
return {
|
|
12222
12239
|
score: result.score,
|
|
@@ -12230,7 +12247,7 @@ var icontainsAllFactory = (config) => {
|
|
|
12230
12247
|
};
|
|
12231
12248
|
var startsWithFactory = (config) => {
|
|
12232
12249
|
const c = config;
|
|
12233
|
-
return new DeterministicAssertionEvaluator("
|
|
12250
|
+
return new DeterministicAssertionEvaluator("starts-with", (ctx) => {
|
|
12234
12251
|
const result = runStartsWithAssertion(ctx.candidate, c.value);
|
|
12235
12252
|
return {
|
|
12236
12253
|
score: result.score,
|
|
@@ -12244,7 +12261,7 @@ var startsWithFactory = (config) => {
|
|
|
12244
12261
|
};
|
|
12245
12262
|
var endsWithFactory = (config) => {
|
|
12246
12263
|
const c = config;
|
|
12247
|
-
return new DeterministicAssertionEvaluator("
|
|
12264
|
+
return new DeterministicAssertionEvaluator("ends-with", (ctx) => {
|
|
12248
12265
|
const result = runEndsWithAssertion(ctx.candidate, c.value);
|
|
12249
12266
|
return {
|
|
12250
12267
|
score: result.score,
|
|
@@ -12258,7 +12275,7 @@ var endsWithFactory = (config) => {
|
|
|
12258
12275
|
};
|
|
12259
12276
|
function createBuiltinRegistry() {
|
|
12260
12277
|
const registry = new EvaluatorRegistry();
|
|
12261
|
-
registry.register("
|
|
12278
|
+
registry.register("llm-judge", llmJudgeFactory).register("code-judge", codeFactory).register("composite", compositeFactory).register("tool-trajectory", toolTrajectoryFactory).register("field-accuracy", fieldAccuracyFactory).register("latency", latencyFactory).register("cost", costFactory).register("token-usage", tokenUsageFactory).register("execution-metrics", executionMetricsFactory).register("agent-judge", agentJudgeFactory).register("contains", containsFactory).register("contains-any", containsAnyFactory).register("contains-all", containsAllFactory).register("icontains", icontainsFactory).register("icontains-any", icontainsAnyFactory).register("icontains-all", icontainsAllFactory).register("starts-with", startsWithFactory).register("ends-with", endsWithFactory).register("regex", regexFactory).register("is-json", isJsonFactory).register("equals", equalsFactory);
|
|
12262
12279
|
return registry;
|
|
12263
12280
|
}
|
|
12264
12281
|
|
|
@@ -13004,7 +13021,7 @@ async function runEvaluation(options) {
|
|
|
13004
13021
|
};
|
|
13005
13022
|
if (isAgentProvider(getOrCreateProvider(target)) && !target.judgeTarget) {
|
|
13006
13023
|
throw new Error(
|
|
13007
|
-
`Target "${target.name}" is an agent provider ("${target.kind}") with no judge_target \u2014 agent providers cannot return structured JSON for judging. Set judge_target to an LLM provider (e.g.,
|
|
13024
|
+
`Target "${target.name}" is an agent provider ("${target.kind}") with no judge_target \u2014 agent providers cannot return structured JSON for judging. Set judge_target to an LLM provider (e.g., azure-base).`
|
|
13008
13025
|
);
|
|
13009
13026
|
}
|
|
13010
13027
|
const targetResolver = (name) => {
|
|
@@ -13075,7 +13092,7 @@ async function runEvaluation(options) {
|
|
|
13075
13092
|
const rawTemplate = suiteWorkspace?.template ?? getWorkspaceTemplate(target);
|
|
13076
13093
|
const resolvedTemplate = await resolveWorkspaceTemplate(rawTemplate);
|
|
13077
13094
|
const workspaceTemplate = resolvedTemplate?.dir;
|
|
13078
|
-
|
|
13095
|
+
let suiteWorkspaceFile = resolvedTemplate?.workspaceFile;
|
|
13079
13096
|
const isPerTestIsolation = suiteWorkspace?.isolation === "per_test";
|
|
13080
13097
|
const hasSharedWorkspace = !!(workspaceTemplate || suiteWorkspace?.before_all || suiteWorkspace?.repos?.length && !isPerTestIsolation);
|
|
13081
13098
|
const requestedWorkers = options.maxConcurrency ?? target.workers ?? 1;
|
|
@@ -13096,6 +13113,14 @@ async function runEvaluation(options) {
|
|
|
13096
13113
|
const message = error instanceof Error ? error.message : String(error);
|
|
13097
13114
|
throw new Error(`Failed to create shared workspace: ${message}`);
|
|
13098
13115
|
}
|
|
13116
|
+
if (suiteWorkspaceFile && sharedWorkspacePath) {
|
|
13117
|
+
const copiedWorkspaceFile = path37.join(sharedWorkspacePath, path37.basename(suiteWorkspaceFile));
|
|
13118
|
+
try {
|
|
13119
|
+
await stat7(copiedWorkspaceFile);
|
|
13120
|
+
suiteWorkspaceFile = copiedWorkspaceFile;
|
|
13121
|
+
} catch {
|
|
13122
|
+
}
|
|
13123
|
+
}
|
|
13099
13124
|
} else if (suiteWorkspace?.before_all || suiteWorkspace?.repos?.length && !isPerTestIsolation) {
|
|
13100
13125
|
sharedWorkspacePath = getWorkspacePath(evalRunId, "shared");
|
|
13101
13126
|
await mkdir12(sharedWorkspacePath, { recursive: true });
|
|
@@ -13574,6 +13599,14 @@ async function runEvalCase(options) {
|
|
|
13574
13599
|
"template_error"
|
|
13575
13600
|
);
|
|
13576
13601
|
}
|
|
13602
|
+
if (caseWorkspaceFile && workspacePath) {
|
|
13603
|
+
const copiedFile = path37.join(workspacePath, path37.basename(caseWorkspaceFile));
|
|
13604
|
+
try {
|
|
13605
|
+
await stat7(copiedFile);
|
|
13606
|
+
caseWorkspaceFile = copiedFile;
|
|
13607
|
+
} catch {
|
|
13608
|
+
}
|
|
13609
|
+
}
|
|
13577
13610
|
}
|
|
13578
13611
|
if (!workspacePath && (evalCase.workspace?.before_all || evalCase.workspace?.repos?.length) && evalRunId) {
|
|
13579
13612
|
workspacePath = getWorkspacePath(evalRunId, evalCase.id);
|
|
@@ -14083,8 +14116,8 @@ async function runEvaluatorsForCase(options) {
|
|
|
14083
14116
|
workspacePath
|
|
14084
14117
|
});
|
|
14085
14118
|
}
|
|
14086
|
-
const evaluatorKind = evalCase.evaluator ?? "
|
|
14087
|
-
const activeEvaluator = evaluators[evaluatorKind] ?? evaluators
|
|
14119
|
+
const evaluatorKind = evalCase.evaluator ?? "llm-judge";
|
|
14120
|
+
const activeEvaluator = evaluators[evaluatorKind] ?? evaluators["llm-judge"];
|
|
14088
14121
|
if (!activeEvaluator) {
|
|
14089
14122
|
throw new Error(`No evaluator registered for kind '${evaluatorKind}'`);
|
|
14090
14123
|
}
|
|
@@ -14167,25 +14200,24 @@ async function runEvaluatorList(options) {
|
|
|
14167
14200
|
availableTargets,
|
|
14168
14201
|
agentTimeoutMs,
|
|
14169
14202
|
evalFileDir,
|
|
14170
|
-
llmJudge: evaluatorRegistry
|
|
14203
|
+
llmJudge: evaluatorRegistry["llm-judge"],
|
|
14171
14204
|
registry: typeRegistry
|
|
14172
14205
|
};
|
|
14173
14206
|
for (const evaluatorConfig of evaluators ?? []) {
|
|
14174
14207
|
try {
|
|
14175
14208
|
const evaluatorInstance = await typeRegistry.create(evaluatorConfig, dispatchContext);
|
|
14176
14209
|
const score2 = await evaluatorInstance.evaluate(evalContext);
|
|
14177
|
-
const resultType = evaluatorConfig.type === "code" ? "code_judge" : evaluatorConfig.type;
|
|
14178
14210
|
const weight = evaluatorConfig.weight ?? 1;
|
|
14179
14211
|
scored.push({
|
|
14180
14212
|
score: score2,
|
|
14181
14213
|
name: evaluatorConfig.name,
|
|
14182
|
-
type:
|
|
14214
|
+
type: evaluatorConfig.type,
|
|
14183
14215
|
weight,
|
|
14184
14216
|
...evaluatorConfig.required !== void 0 ? { required: evaluatorConfig.required } : {}
|
|
14185
14217
|
});
|
|
14186
14218
|
scores.push({
|
|
14187
14219
|
name: evaluatorConfig.name,
|
|
14188
|
-
type:
|
|
14220
|
+
type: evaluatorConfig.type,
|
|
14189
14221
|
score: score2.score,
|
|
14190
14222
|
weight,
|
|
14191
14223
|
verdict: score2.verdict,
|
|
@@ -14207,18 +14239,17 @@ async function runEvaluatorList(options) {
|
|
|
14207
14239
|
expectedAspectCount: 1,
|
|
14208
14240
|
reasoning: message
|
|
14209
14241
|
};
|
|
14210
|
-
const resultType = evaluatorConfig.type === "code" ? "code_judge" : evaluatorConfig.type;
|
|
14211
14242
|
const weight = evaluatorConfig.weight ?? 1;
|
|
14212
14243
|
scored.push({
|
|
14213
14244
|
score: fallbackScore,
|
|
14214
14245
|
name: evaluatorConfig.name ?? "unknown",
|
|
14215
|
-
type:
|
|
14246
|
+
type: evaluatorConfig.type ?? "llm-judge",
|
|
14216
14247
|
weight,
|
|
14217
14248
|
...evaluatorConfig.required !== void 0 ? { required: evaluatorConfig.required } : {}
|
|
14218
14249
|
});
|
|
14219
14250
|
scores.push({
|
|
14220
14251
|
name: evaluatorConfig.name ?? "unknown",
|
|
14221
|
-
type:
|
|
14252
|
+
type: evaluatorConfig.type ?? "llm-judge",
|
|
14222
14253
|
score: 0,
|
|
14223
14254
|
weight,
|
|
14224
14255
|
verdict: "fail",
|
|
@@ -14279,7 +14310,7 @@ function filterEvalCases(evalCases, filter) {
|
|
|
14279
14310
|
return evalCases.filter((evalCase) => micromatch4.isMatch(evalCase.id, filter));
|
|
14280
14311
|
}
|
|
14281
14312
|
function buildEvaluatorRegistry(overrides, resolveJudgeProvider) {
|
|
14282
|
-
const llmJudge = overrides?.
|
|
14313
|
+
const llmJudge = overrides?.["llm-judge"] ?? new LlmJudgeEvaluator({
|
|
14283
14314
|
resolveJudgeProvider: async (context) => {
|
|
14284
14315
|
if (context.judgeProvider) {
|
|
14285
14316
|
return context.judgeProvider;
|
|
@@ -14289,7 +14320,7 @@ function buildEvaluatorRegistry(overrides, resolveJudgeProvider) {
|
|
|
14289
14320
|
});
|
|
14290
14321
|
return {
|
|
14291
14322
|
...overrides,
|
|
14292
|
-
|
|
14323
|
+
"llm-judge": llmJudge
|
|
14293
14324
|
};
|
|
14294
14325
|
}
|
|
14295
14326
|
async function invokeProvider(provider, options) {
|
|
@@ -14549,12 +14580,7 @@ async function evaluate(config) {
|
|
|
14549
14580
|
};
|
|
14550
14581
|
}
|
|
14551
14582
|
function mapAssertionType(type) {
|
|
14552
|
-
|
|
14553
|
-
case "code_judge":
|
|
14554
|
-
return "code";
|
|
14555
|
-
default:
|
|
14556
|
-
return type;
|
|
14557
|
-
}
|
|
14583
|
+
return type.replace(/_/g, "-");
|
|
14558
14584
|
}
|
|
14559
14585
|
function computeSummary(results, durationMs) {
|
|
14560
14586
|
const total = results.length;
|