@agentv/core 2.2.0 → 2.5.2
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/README.md +77 -77
- package/dist/{chunk-KDEP4I7G.js → chunk-RP3M7COZ.js} +1 -1
- package/dist/{chunk-KDEP4I7G.js.map → chunk-RP3M7COZ.js.map} +1 -1
- package/dist/evaluation/validation/index.cjs +38 -4
- package/dist/evaluation/validation/index.cjs.map +1 -1
- package/dist/evaluation/validation/index.js +39 -5
- package/dist/evaluation/validation/index.js.map +1 -1
- package/dist/index.cjs +654 -119
- package/dist/index.cjs.map +1 -1
- package/dist/index.d.cts +75 -6
- package/dist/index.d.ts +75 -6
- package/dist/index.js +655 -120
- package/dist/index.js.map +1 -1
- package/package.json +3 -6
package/dist/index.cjs
CHANGED
|
@@ -229,6 +229,7 @@ function mergeExecutionMetrics(summary, metrics) {
|
|
|
229
229
|
// src/evaluation/yaml-parser.ts
|
|
230
230
|
var import_promises7 = require("fs/promises");
|
|
231
231
|
var import_node_path7 = __toESM(require("path"), 1);
|
|
232
|
+
var import_micromatch3 = __toESM(require("micromatch"), 1);
|
|
232
233
|
var import_yaml3 = require("yaml");
|
|
233
234
|
|
|
234
235
|
// src/evaluation/loaders/config-loader.ts
|
|
@@ -543,11 +544,11 @@ async function parseEvaluators(rawEvalCase, globalExecution, searchRoots, evalId
|
|
|
543
544
|
);
|
|
544
545
|
}
|
|
545
546
|
}
|
|
546
|
-
const
|
|
547
|
-
const
|
|
547
|
+
const knownProps2 = /* @__PURE__ */ new Set(["name", "type", "script", "cwd", "weight", "target"]);
|
|
548
|
+
const config2 = {};
|
|
548
549
|
for (const [key, value] of Object.entries(rawEvaluator)) {
|
|
549
|
-
if (!
|
|
550
|
-
|
|
550
|
+
if (!knownProps2.has(key) && value !== void 0) {
|
|
551
|
+
config2[key] = value;
|
|
551
552
|
}
|
|
552
553
|
}
|
|
553
554
|
evaluators.push({
|
|
@@ -557,7 +558,7 @@ async function parseEvaluators(rawEvalCase, globalExecution, searchRoots, evalId
|
|
|
557
558
|
cwd,
|
|
558
559
|
resolvedCwd,
|
|
559
560
|
...weight2 !== void 0 ? { weight: weight2 } : {},
|
|
560
|
-
...Object.keys(
|
|
561
|
+
...Object.keys(config2).length > 0 ? { config: config2 } : {},
|
|
561
562
|
...targetConfig !== void 0 ? { target: targetConfig } : {}
|
|
562
563
|
});
|
|
563
564
|
continue;
|
|
@@ -722,7 +723,7 @@ async function parseEvaluators(rawEvalCase, globalExecution, searchRoots, evalId
|
|
|
722
723
|
continue;
|
|
723
724
|
}
|
|
724
725
|
const weight2 = validateWeight(rawEvaluator.weight, name, evalId);
|
|
725
|
-
const
|
|
726
|
+
const config2 = {
|
|
726
727
|
name,
|
|
727
728
|
type: "tool_trajectory",
|
|
728
729
|
mode,
|
|
@@ -730,7 +731,7 @@ async function parseEvaluators(rawEvalCase, globalExecution, searchRoots, evalId
|
|
|
730
731
|
...expected ? { expected } : {},
|
|
731
732
|
...weight2 !== void 0 ? { weight: weight2 } : {}
|
|
732
733
|
};
|
|
733
|
-
evaluators.push(
|
|
734
|
+
evaluators.push(config2);
|
|
734
735
|
continue;
|
|
735
736
|
}
|
|
736
737
|
if (typeValue === "field_accuracy") {
|
|
@@ -867,9 +868,33 @@ async function parseEvaluators(rawEvalCase, globalExecution, searchRoots, evalId
|
|
|
867
868
|
});
|
|
868
869
|
continue;
|
|
869
870
|
}
|
|
870
|
-
const
|
|
871
|
+
const rawPrompt = rawEvaluator.prompt;
|
|
872
|
+
let prompt;
|
|
871
873
|
let promptPath;
|
|
872
|
-
|
|
874
|
+
let resolvedPromptScript;
|
|
875
|
+
let promptScriptConfig;
|
|
876
|
+
if (isJsonObject2(rawPrompt)) {
|
|
877
|
+
const scriptArray = asStringArray(
|
|
878
|
+
rawPrompt.script,
|
|
879
|
+
`prompt.script for evaluator '${name}' in '${evalId}'`
|
|
880
|
+
);
|
|
881
|
+
if (!scriptArray) {
|
|
882
|
+
throw new Error(`Evaluator '${name}' in '${evalId}': prompt object requires script array`);
|
|
883
|
+
}
|
|
884
|
+
const scriptPath = scriptArray[scriptArray.length - 1];
|
|
885
|
+
const resolved = await resolveFileReference(scriptPath, searchRoots);
|
|
886
|
+
if (resolved.resolvedPath) {
|
|
887
|
+
resolvedPromptScript = [...scriptArray.slice(0, -1), import_node_path3.default.resolve(resolved.resolvedPath)];
|
|
888
|
+
} else {
|
|
889
|
+
throw new Error(
|
|
890
|
+
`Evaluator '${name}' in '${evalId}': prompt script file not found: ${resolved.displayPath}`
|
|
891
|
+
);
|
|
892
|
+
}
|
|
893
|
+
if (isJsonObject2(rawPrompt.config)) {
|
|
894
|
+
promptScriptConfig = rawPrompt.config;
|
|
895
|
+
}
|
|
896
|
+
} else if (typeof rawPrompt === "string") {
|
|
897
|
+
prompt = rawPrompt;
|
|
873
898
|
const resolved = await resolveFileReference(prompt, searchRoots);
|
|
874
899
|
if (resolved.resolvedPath) {
|
|
875
900
|
promptPath = import_node_path3.default.resolve(resolved.resolvedPath);
|
|
@@ -888,12 +913,7 @@ async function parseEvaluators(rawEvalCase, globalExecution, searchRoots, evalId
|
|
|
888
913
|
}
|
|
889
914
|
const _model = asString(rawEvaluator.model);
|
|
890
915
|
const rawRubrics = rawEvaluator.rubrics;
|
|
891
|
-
const parsedRubrics = Array.isArray(rawRubrics) ? rawRubrics
|
|
892
|
-
id: asString(rubric.id) ?? `rubric-${index + 1}`,
|
|
893
|
-
description: asString(rubric.description) ?? "",
|
|
894
|
-
weight: typeof rubric.weight === "number" ? rubric.weight : 1,
|
|
895
|
-
required: typeof rubric.required === "boolean" ? rubric.required : true
|
|
896
|
-
})).filter((r) => r.description.length > 0) : void 0;
|
|
916
|
+
const parsedRubrics = Array.isArray(rawRubrics) ? parseRubricItems(rawRubrics, name, evalId) : void 0;
|
|
897
917
|
if (typeValue === "rubric") {
|
|
898
918
|
if (!parsedRubrics) {
|
|
899
919
|
logWarning2(`Skipping rubric evaluator '${name}' in '${evalId}': missing rubrics array`);
|
|
@@ -913,13 +933,26 @@ async function parseEvaluators(rawEvalCase, globalExecution, searchRoots, evalId
|
|
|
913
933
|
continue;
|
|
914
934
|
}
|
|
915
935
|
const weight = validateWeight(rawEvaluator.weight, name, evalId);
|
|
936
|
+
const knownProps = /* @__PURE__ */ new Set(["name", "type", "prompt", "model", "rubrics", "weight", "config"]);
|
|
937
|
+
const config = {};
|
|
938
|
+
for (const [key, value] of Object.entries(rawEvaluator)) {
|
|
939
|
+
if (!knownProps.has(key) && value !== void 0) {
|
|
940
|
+
config[key] = value;
|
|
941
|
+
}
|
|
942
|
+
}
|
|
943
|
+
const topLevelConfig = isJsonObject2(rawEvaluator.config) ? rawEvaluator.config : {};
|
|
944
|
+
const mergedConfig = { ...config, ...topLevelConfig };
|
|
945
|
+
const finalConfig = promptScriptConfig ?? (Object.keys(mergedConfig).length > 0 ? mergedConfig : void 0);
|
|
916
946
|
evaluators.push({
|
|
917
947
|
name,
|
|
918
948
|
type: "llm_judge",
|
|
919
949
|
prompt,
|
|
920
950
|
promptPath,
|
|
951
|
+
...promptPath ? { resolvedPromptPath: promptPath } : {},
|
|
952
|
+
...resolvedPromptScript ? { resolvedPromptScript } : {},
|
|
921
953
|
...parsedRubrics && parsedRubrics.length > 0 ? { rubrics: parsedRubrics } : {},
|
|
922
|
-
...weight !== void 0 ? { weight } : {}
|
|
954
|
+
...weight !== void 0 ? { weight } : {},
|
|
955
|
+
...finalConfig ? { config: finalConfig } : {}
|
|
923
956
|
});
|
|
924
957
|
}
|
|
925
958
|
return evaluators.length > 0 ? evaluators : void 0;
|
|
@@ -1006,10 +1039,190 @@ var VALID_FIELD_AGGREGATION_TYPES = /* @__PURE__ */ new Set(["weighted_average",
|
|
|
1006
1039
|
function isValidFieldAggregationType(value) {
|
|
1007
1040
|
return typeof value === "string" && VALID_FIELD_AGGREGATION_TYPES.has(value);
|
|
1008
1041
|
}
|
|
1042
|
+
/**
 * Parse the `rubrics` array of an evaluator definition into rubric items.
 *
 * Two rubric shapes are produced:
 *  - score-range rubrics (entry has `score_ranges`): `expected_outcome` and
 *    `required` are only emitted when present in the raw entry;
 *  - checklist rubrics (no `score_ranges`): `expected_outcome` is mandatory and
 *    `required` defaults to `true`.
 *
 * `description` is read as a fallback for `expected_outcome`
 * (assumes `asString` yields a nullish value for non-strings — defined elsewhere).
 *
 * @param {unknown[]} rawRubrics - Raw rubric entries as read from the eval file.
 * @param {string} evaluatorName - Evaluator name, used in diagnostics only.
 * @param {string} evalId - Eval case id, used in diagnostics only.
 * @returns {object[] | undefined} Parsed rubric items, or `undefined` when none survive.
 * @throws {Error} When `required_min_score` is not an integer in 0-10, when
 *   `score_ranges` is not an array, or when `parseScoreRanges` rejects the ranges.
 */
function parseRubricItems(rawRubrics, evaluatorName, evalId) {
  const items = [];
  for (const [index, rawRubric] of rawRubrics.entries()) {
    if (!isJsonObject2(rawRubric)) {
      // Include the eval id like every other diagnostic in this function so the
      // warning can be traced back to a specific eval case.
      logWarning2(
        `Skipping invalid rubric entry at index ${index} in evaluator '${evaluatorName}' in '${evalId}' (expected object)`
      );
      continue;
    }
    // Ids default to the 1-based position in the raw list.
    const id = asString(rawRubric.id) ?? `rubric-${index + 1}`;
    const expectedOutcome = asString(rawRubric.expected_outcome) ?? asString(rawRubric.description) ?? "";
    const weight = typeof rawRubric.weight === "number" ? rawRubric.weight : 1;
    let requiredMinScore;
    let required;
    if (typeof rawRubric.required_min_score === "number") {
      const minScore = rawRubric.required_min_score;
      if (!Number.isInteger(minScore) || minScore < 0 || minScore > 10) {
        throw new Error(
          `Invalid required_min_score for rubric '${id}' in evaluator '${evaluatorName}' in '${evalId}': must be an integer 0-10 (got ${minScore})`
        );
      }
      requiredMinScore = minScore;
    }
    if (typeof rawRubric.required === "boolean") {
      required = rawRubric.required;
    }
    const rawScoreRanges = rawRubric.score_ranges;
    if (rawScoreRanges !== void 0) {
      if (!Array.isArray(rawScoreRanges)) {
        throw new Error(
          `Invalid score_ranges for rubric '${id}' in evaluator '${evaluatorName}' in '${evalId}': must be an array`
        );
      }
      const scoreRanges = parseScoreRanges(rawScoreRanges, id, evaluatorName, evalId);
      // Score-range rubric: the ranges carry the descriptions, so a top-level
      // expected_outcome is optional here.
      items.push({
        id,
        weight,
        ...expectedOutcome.length > 0 ? { expected_outcome: expectedOutcome } : {},
        ...required !== void 0 ? { required } : {},
        ...requiredMinScore !== void 0 ? { required_min_score: requiredMinScore } : {},
        score_ranges: scoreRanges
      });
    } else {
      if (expectedOutcome.length === 0) {
        logWarning2(
          `Skipping rubric '${id}' in evaluator '${evaluatorName}' in '${evalId}': missing expected_outcome`
        );
        continue;
      }
      items.push({
        id,
        expected_outcome: expectedOutcome,
        weight,
        // Default to required: true if not specified (backward compatibility)
        required: required ?? true,
        ...requiredMinScore !== void 0 ? { required_min_score: requiredMinScore } : {}
      });
    }
  }
  return items.length > 0 ? items : void 0;
}
|
|
1104
|
+
/**
 * Validate the `score_ranges` entries of one rubric and return them in
 * `{ score_range: [min, max], expected_outcome }` form, in input order.
 *
 * Each entry must be an object whose `score_range` is a two-element array of
 * integers within 0-10 with min <= max, and must carry a non-empty
 * `expected_outcome` (or legacy `description`). Across all entries the ranges
 * must not overlap and must together cover every integer from 0 to 10.
 *
 * @param {unknown[]} rawRanges - Raw `score_ranges` array.
 * @param {string} rubricId - Rubric id, used in error messages.
 * @param {string} evaluatorName - Evaluator name, used in error messages.
 * @param {string} evalId - Eval case id, used in error messages.
 * @returns {{score_range: [number, number], expected_outcome: string}[]}
 * @throws {Error} On the first validation failure.
 */
function parseScoreRanges(rawRanges, rubricId, evaluatorName, evalId) {
  const parsed = [];
  for (let index = 0; index < rawRanges.length; index += 1) {
    const entry = rawRanges[index];
    if (!isJsonObject2(entry)) {
      throw new Error(
        `Invalid score_range entry at index ${index} for rubric '${rubricId}' in evaluator '${evaluatorName}' in '${evalId}': expected object`
      );
    }
    const bounds = entry.score_range;
    const isNumberPair = Array.isArray(bounds) && bounds.length === 2 && typeof bounds[0] === "number" && typeof bounds[1] === "number";
    if (!isNumberPair) {
      throw new Error(
        `Invalid score_range at index ${index} for rubric '${rubricId}' in evaluator '${evaluatorName}' in '${evalId}': must be [min, max] array of two numbers`
      );
    }
    const min = bounds[0];
    const max = bounds[1];
    if (!(Number.isInteger(min) && Number.isInteger(max))) {
      throw new Error(
        `Invalid score_range at index ${index} for rubric '${rubricId}' in evaluator '${evaluatorName}' in '${evalId}': values must be integers (got [${min}, ${max}])`
      );
    }
    const withinScale = (score) => score >= 0 && score <= 10;
    if (!withinScale(min) || !withinScale(max)) {
      throw new Error(
        `Invalid score_range at index ${index} for rubric '${rubricId}' in evaluator '${evaluatorName}' in '${evalId}': values must be 0-10 (got [${min}, ${max}])`
      );
    }
    if (min > max) {
      throw new Error(
        `Invalid score_range at index ${index} for rubric '${rubricId}' in evaluator '${evaluatorName}' in '${evalId}': min must be <= max (got [${min}, ${max}])`
      );
    }
    const expectedOutcome = asString(entry.expected_outcome) ?? asString(entry.description) ?? "";
    if (expectedOutcome.length === 0) {
      throw new Error(
        `Missing expected_outcome for score_range [${min}, ${max}] in rubric '${rubricId}' in evaluator '${evaluatorName}' in '${evalId}'`
      );
    }
    parsed.push({ score_range: [min, max], expected_outcome: expectedOutcome });
  }

  // Overlap detection works on a copy sorted by lower bound; the returned
  // list keeps the author's original order.
  const byLowerBound = [...parsed].sort((a, b) => a.score_range[0] - b.score_range[0]);
  byLowerBound.forEach((curr, position) => {
    if (position === 0) {
      return;
    }
    const prev = byLowerBound[position - 1];
    if (curr.score_range[0] <= prev.score_range[1]) {
      throw new Error(
        `Overlapping score_ranges in rubric '${rubricId}' in evaluator '${evaluatorName}' in '${evalId}': [${prev.score_range[0]}, ${prev.score_range[1]}] overlaps with [${curr.score_range[0]}, ${curr.score_range[1]}]`
      );
    }
  });

  // Every integer score 0..10 must fall inside some range.
  const covered = new Set(
    parsed.flatMap(({ score_range: [low, high] }) => Array.from({ length: high - low + 1 }, (_, offset) => low + offset))
  );
  const missing = Array.from({ length: 11 }, (_, score) => score).filter((score) => !covered.has(score));
  if (missing.length > 0) {
    throw new Error(
      `Incomplete score_ranges coverage in rubric '${rubricId}' in evaluator '${evaluatorName}' in '${evalId}': missing coverage for scores: ${missing.join(", ")}. Ranges must cover all integers 0-10.`
    );
  }
  return parsed;
}
|
|
1174
|
+
/**
 * Build an implicit `llm_judge` evaluator named "rubric" from the inline
 * `rubrics` list of an eval case.
 *
 * Entries may be plain strings (treated as a required, weight-1 expected
 * outcome) or objects. Entries of any other type are dropped before ids are
 * assigned, so default ids (`rubric-N`) number the remaining entries only.
 * Unlike the evaluator-level parser, score ranges are passed through leniently
 * here: a missing or non-array `score_range` becomes `[0, 10]`, and ranges
 * without a description are discarded.
 *
 * @param {unknown[]} rawRubrics - Inline rubric entries from the eval case.
 * @returns {{name: string, type: string, rubrics: object[]} | undefined}
 *   The evaluator config, or `undefined` when no usable rubric remains.
 */
function parseInlineRubrics(rawRubrics) {
  const rubricItems = [];
  let position = 0;
  for (const rubric of rawRubrics) {
    const isText = typeof rubric === "string";
    if (!isJsonObject2(rubric) && !isText) {
      continue;
    }
    // Number accepted candidates, even those that are dropped further down.
    position += 1;
    const fallbackId = `rubric-${position}`;

    if (isText) {
      if (rubric.length > 0) {
        rubricItems.push({ id: fallbackId, expected_outcome: rubric, weight: 1, required: true });
      }
      continue;
    }

    const expectedOutcome = asString(rubric.expected_outcome) ?? asString(rubric.description) ?? "";
    const rawScoreRanges = rubric.score_ranges;
    let scoreRanges;
    if (Array.isArray(rawScoreRanges) && rawScoreRanges.length > 0) {
      scoreRanges = [];
      for (const range of rawScoreRanges) {
        if (!isJsonObject2(range)) {
          continue;
        }
        const described = {
          score_range: Array.isArray(range.score_range) ? range.score_range : [0, 10],
          expected_outcome: asString(range.expected_outcome) ?? asString(range.description) ?? ""
        };
        if (described.expected_outcome.length > 0) {
          scoreRanges.push(described);
        }
      }
    }

    const item = {
      id: asString(rubric.id) ?? fallbackId,
      weight: typeof rubric.weight === "number" ? rubric.weight : 1
    };
    const hasRanges = scoreRanges !== void 0 && scoreRanges.length > 0;

    if (hasRanges) {
      // Score-range rubric: top-level description and `required` stay optional.
      if (expectedOutcome.length > 0) {
        item.expected_outcome = expectedOutcome;
      }
      if (typeof rubric.required === "boolean") {
        item.required = rubric.required;
      }
      if (typeof rubric.required_min_score === "number") {
        item.required_min_score = rubric.required_min_score;
      }
      item.score_ranges = scoreRanges;
      rubricItems.push(item);
      continue;
    }

    // Checklist rubric: useless without a description, so drop it.
    if (expectedOutcome.length === 0) {
      continue;
    }
    item.expected_outcome = expectedOutcome;
    item.required = typeof rubric.required === "boolean" ? rubric.required : true;
    if (typeof rubric.required_min_score === "number") {
      item.required_min_score = rubric.required_min_score;
    }
    rubricItems.push(item);
  }

  if (rubricItems.length === 0) {
    return void 0;
  }
  return {
    name: "rubric",
    type: "llm_judge",
    rubrics: rubricItems
  };
}
|
|
1009
1221
|
|
|
1010
1222
|
// src/evaluation/loaders/jsonl-parser.ts
|
|
1011
1223
|
var import_promises5 = require("fs/promises");
|
|
1012
1224
|
var import_node_path5 = __toESM(require("path"), 1);
|
|
1225
|
+
var import_micromatch2 = __toESM(require("micromatch"), 1);
|
|
1013
1226
|
var import_yaml2 = require("yaml");
|
|
1014
1227
|
|
|
1015
1228
|
// src/evaluation/loaders/message-processor.ts
|
|
@@ -1272,6 +1485,65 @@ async function processExpectedMessages(options) {
|
|
|
1272
1485
|
return segments;
|
|
1273
1486
|
}
|
|
1274
1487
|
|
|
1488
|
+
// src/evaluation/loaders/shorthand-expansion.ts
|
|
1489
|
+
/**
 * Expand the `input` shorthand of an eval case into a message list.
 *
 * - string -> a single `user` message with that content
 * - array  -> the entries accepted by `isTestMessage`
 * - anything else (including null/undefined) -> `undefined`
 *
 * @param {unknown} value - Raw `input` value.
 * @returns {object[] | undefined} Messages, or `undefined` when nothing usable remains.
 */
function expandInputShorthand(value) {
  if (typeof value === "string") {
    return [{ role: "user", content: value }];
  }
  if (!Array.isArray(value)) {
    return void 0;
  }
  const valid = value.filter((candidate) => isTestMessage(candidate));
  return valid.length === 0 ? void 0 : valid;
}
|
|
1502
|
+
/**
 * Expand the `expected_output` shorthand of an eval case into a message list.
 *
 * - string -> one `assistant` message with that content
 * - array whose first element is an object with a `role` key -> treated as a
 *   message list and filtered through `isTestMessage`
 * - any other array -> one `assistant` message whose content is the array
 * - object with a `role` key -> that single message, if `isTestMessage` accepts it
 * - any other object -> one `assistant` message whose content is the object
 * - everything else (including null/undefined) -> `undefined`
 *
 * @param {unknown} value - Raw `expected_output` value.
 * @returns {object[] | undefined} Messages, or `undefined` when nothing usable remains.
 */
function expandExpectedOutputShorthand(value) {
  if (value === null || value === void 0) {
    return void 0;
  }
  const wrapAsAssistant = (content) => [{ role: "assistant", content }];

  if (typeof value === "string") {
    return wrapAsAssistant(value);
  }

  if (Array.isArray(value)) {
    // Only the first element is inspected to tell a message list from structured content.
    const looksLikeMessages = value.length > 0 && isJsonObject(value[0]) && "role" in value[0];
    if (!looksLikeMessages) {
      return wrapAsAssistant(value);
    }
    const valid = value.filter((candidate) => isTestMessage(candidate));
    return valid.length === 0 ? void 0 : valid;
  }

  if (!isJsonObject(value)) {
    return void 0;
  }
  if (!("role" in value)) {
    return wrapAsAssistant(value);
  }
  return isTestMessage(value) ? [value] : void 0;
}
|
|
1524
|
+
/**
 * Resolve the input messages of a raw eval case.
 *
 * An explicit `input_messages` field always wins over the `input` shorthand:
 * if it is present but not an array (or contains no entry accepted by
 * `isTestMessage`) the result is `undefined` and `input` is not consulted.
 *
 * @param {object} raw - Raw eval case.
 * @returns {object[] | undefined} Valid input messages, if any.
 */
function resolveInputMessages(raw) {
  const explicit = raw.input_messages;
  if (explicit === void 0) {
    return expandInputShorthand(raw.input);
  }
  if (!Array.isArray(explicit)) {
    return void 0;
  }
  const valid = explicit.filter((candidate) => isTestMessage(candidate));
  return valid.length === 0 ? void 0 : valid;
}
|
|
1534
|
+
/**
 * Resolve the expected messages of a raw eval case.
 *
 * An explicit `expected_messages` field always wins over the
 * `expected_output` shorthand: if it is present but not an array (or contains
 * no entry accepted by `isTestMessage`) the result is `undefined` and
 * `expected_output` is not consulted.
 *
 * @param {object} raw - Raw eval case.
 * @returns {object[] | undefined} Valid expected messages, if any.
 */
function resolveExpectedMessages(raw) {
  const explicit = raw.expected_messages;
  if (explicit === void 0) {
    return expandExpectedOutputShorthand(raw.expected_output);
  }
  if (!Array.isArray(explicit)) {
    return void 0;
  }
  const valid = explicit.filter((candidate) => isTestMessage(candidate));
  return valid.length === 0 ? void 0 : valid;
}
|
|
1546
|
+
|
|
1275
1547
|
// src/evaluation/loaders/jsonl-parser.ts
|
|
1276
1548
|
var ANSI_YELLOW5 = "\x1B[33m";
|
|
1277
1549
|
var ANSI_RED = "\x1B[31m";
|
|
@@ -1332,7 +1604,7 @@ function parseJsonlContent(content, filePath) {
|
|
|
1332
1604
|
}
|
|
1333
1605
|
async function loadEvalCasesFromJsonl(evalFilePath, repoRoot, options) {
|
|
1334
1606
|
const verbose = options?.verbose ?? false;
|
|
1335
|
-
const
|
|
1607
|
+
const filterPattern = options?.filter;
|
|
1336
1608
|
const absoluteTestPath = import_node_path5.default.resolve(evalFilePath);
|
|
1337
1609
|
const repoRootPath = resolveToAbsolutePath(repoRoot);
|
|
1338
1610
|
const searchRoots = buildSearchRoots(absoluteTestPath, repoRootPath);
|
|
@@ -1359,28 +1631,20 @@ async function loadEvalCasesFromJsonl(evalFilePath, repoRoot, options) {
|
|
|
1359
1631
|
const evalcase = rawCases[lineIndex];
|
|
1360
1632
|
const lineNumber = lineIndex + 1;
|
|
1361
1633
|
const id = asString4(evalcase.id);
|
|
1362
|
-
if (
|
|
1634
|
+
if (filterPattern && (!id || !import_micromatch2.default.isMatch(id, filterPattern))) {
|
|
1363
1635
|
continue;
|
|
1364
1636
|
}
|
|
1365
1637
|
const conversationId = asString4(evalcase.conversation_id);
|
|
1366
1638
|
const outcome = asString4(evalcase.expected_outcome) ?? asString4(evalcase.outcome);
|
|
1367
|
-
const
|
|
1368
|
-
const
|
|
1369
|
-
if (!id || !outcome || !
|
|
1639
|
+
const inputMessages = resolveInputMessages(evalcase);
|
|
1640
|
+
const expectedMessages = resolveExpectedMessages(evalcase) ?? [];
|
|
1641
|
+
if (!id || !outcome || !inputMessages || inputMessages.length === 0) {
|
|
1370
1642
|
logError(
|
|
1371
|
-
`Skipping incomplete eval case at line ${lineNumber}: ${id ?? "unknown"}. Missing required fields: id, expected_outcome, and/or input_messages`
|
|
1643
|
+
`Skipping incomplete eval case at line ${lineNumber}: ${id ?? "unknown"}. Missing required fields: id, expected_outcome, and/or input_messages (or input)`
|
|
1372
1644
|
);
|
|
1373
1645
|
continue;
|
|
1374
1646
|
}
|
|
1375
|
-
const hasExpectedMessages =
|
|
1376
|
-
const inputMessages = inputMessagesValue.filter(
|
|
1377
|
-
(msg) => isTestMessage(msg)
|
|
1378
|
-
);
|
|
1379
|
-
const expectedMessages = hasExpectedMessages ? expectedMessagesValue.filter((msg) => isTestMessage(msg)) : [];
|
|
1380
|
-
if (hasExpectedMessages && expectedMessages.length === 0) {
|
|
1381
|
-
logError(`Line ${lineNumber}: No valid expected message found for eval case: ${id}`);
|
|
1382
|
-
continue;
|
|
1383
|
-
}
|
|
1647
|
+
const hasExpectedMessages = expectedMessages.length > 0;
|
|
1384
1648
|
const guidelinePaths = [];
|
|
1385
1649
|
const inputTextParts = [];
|
|
1386
1650
|
const inputSegments = await processMessages({
|
|
@@ -1426,28 +1690,8 @@ async function loadEvalCasesFromJsonl(evalFilePath, repoRoot, options) {
|
|
|
1426
1690
|
}
|
|
1427
1691
|
const inlineRubrics = evalcase.rubrics;
|
|
1428
1692
|
if (inlineRubrics !== void 0 && Array.isArray(inlineRubrics)) {
|
|
1429
|
-
const
|
|
1430
|
-
|
|
1431
|
-
return {
|
|
1432
|
-
id: `rubric-${index + 1}`,
|
|
1433
|
-
description: rubric,
|
|
1434
|
-
weight: 1,
|
|
1435
|
-
required: true
|
|
1436
|
-
};
|
|
1437
|
-
}
|
|
1438
|
-
return {
|
|
1439
|
-
id: asString4(rubric.id) ?? `rubric-${index + 1}`,
|
|
1440
|
-
description: asString4(rubric.description) ?? "",
|
|
1441
|
-
weight: typeof rubric.weight === "number" ? rubric.weight : 1,
|
|
1442
|
-
required: typeof rubric.required === "boolean" ? rubric.required : true
|
|
1443
|
-
};
|
|
1444
|
-
}).filter((r) => r.description.length > 0);
|
|
1445
|
-
if (rubricItems.length > 0) {
|
|
1446
|
-
const rubricEvaluator = {
|
|
1447
|
-
name: "rubric",
|
|
1448
|
-
type: "llm_judge",
|
|
1449
|
-
rubrics: rubricItems
|
|
1450
|
-
};
|
|
1693
|
+
const rubricEvaluator = parseInlineRubrics(inlineRubrics);
|
|
1694
|
+
if (rubricEvaluator) {
|
|
1451
1695
|
evaluators = evaluators ? [rubricEvaluator, ...evaluators] : [rubricEvaluator];
|
|
1452
1696
|
}
|
|
1453
1697
|
}
|
|
@@ -1757,7 +2001,7 @@ async function loadEvalCases(evalFilePath, repoRoot, options) {
|
|
|
1757
2001
|
return loadEvalCasesFromJsonl(evalFilePath, repoRoot, options);
|
|
1758
2002
|
}
|
|
1759
2003
|
const verbose = options?.verbose ?? false;
|
|
1760
|
-
const
|
|
2004
|
+
const filterPattern = options?.filter;
|
|
1761
2005
|
const absoluteTestPath = import_node_path7.default.resolve(evalFilePath);
|
|
1762
2006
|
const repoRootPath = resolveToAbsolutePath(repoRoot);
|
|
1763
2007
|
const searchRoots = buildSearchRoots(absoluteTestPath, repoRootPath);
|
|
@@ -1787,28 +2031,20 @@ async function loadEvalCases(evalFilePath, repoRoot, options) {
|
|
|
1787
2031
|
}
|
|
1788
2032
|
const evalcase = rawEvalcase;
|
|
1789
2033
|
const id = asString6(evalcase.id);
|
|
1790
|
-
if (
|
|
2034
|
+
if (filterPattern && (!id || !import_micromatch3.default.isMatch(id, filterPattern))) {
|
|
1791
2035
|
continue;
|
|
1792
2036
|
}
|
|
1793
2037
|
const conversationId = asString6(evalcase.conversation_id);
|
|
1794
2038
|
const outcome = asString6(evalcase.expected_outcome) ?? asString6(evalcase.outcome);
|
|
1795
|
-
const
|
|
1796
|
-
const
|
|
1797
|
-
if (!id || !outcome || !
|
|
2039
|
+
const inputMessages = resolveInputMessages(evalcase);
|
|
2040
|
+
const expectedMessages = resolveExpectedMessages(evalcase) ?? [];
|
|
2041
|
+
if (!id || !outcome || !inputMessages || inputMessages.length === 0) {
|
|
1798
2042
|
logError2(
|
|
1799
|
-
`Skipping incomplete eval case: ${id ?? "unknown"}. Missing required fields: id, outcome, and/or input_messages`
|
|
2043
|
+
`Skipping incomplete eval case: ${id ?? "unknown"}. Missing required fields: id, outcome, and/or input_messages (or input)`
|
|
1800
2044
|
);
|
|
1801
2045
|
continue;
|
|
1802
2046
|
}
|
|
1803
|
-
const hasExpectedMessages =
|
|
1804
|
-
const inputMessages = inputMessagesValue.filter(
|
|
1805
|
-
(msg) => isTestMessage(msg)
|
|
1806
|
-
);
|
|
1807
|
-
const expectedMessages = hasExpectedMessages ? expectedMessagesValue.filter((msg) => isTestMessage(msg)) : [];
|
|
1808
|
-
if (hasExpectedMessages && expectedMessages.length === 0) {
|
|
1809
|
-
logError2(`No valid expected message found for eval case: ${id}`);
|
|
1810
|
-
continue;
|
|
1811
|
-
}
|
|
2047
|
+
const hasExpectedMessages = expectedMessages.length > 0;
|
|
1812
2048
|
const guidelinePaths = [];
|
|
1813
2049
|
const inputTextParts = [];
|
|
1814
2050
|
const inputSegments = await processMessages({
|
|
@@ -1852,28 +2088,8 @@ async function loadEvalCases(evalFilePath, repoRoot, options) {
|
|
|
1852
2088
|
}
|
|
1853
2089
|
const inlineRubrics = evalcase.rubrics;
|
|
1854
2090
|
if (inlineRubrics !== void 0 && Array.isArray(inlineRubrics)) {
|
|
1855
|
-
const
|
|
1856
|
-
|
|
1857
|
-
return {
|
|
1858
|
-
id: `rubric-${index + 1}`,
|
|
1859
|
-
description: rubric,
|
|
1860
|
-
weight: 1,
|
|
1861
|
-
required: true
|
|
1862
|
-
};
|
|
1863
|
-
}
|
|
1864
|
-
return {
|
|
1865
|
-
id: asString6(rubric.id) ?? `rubric-${index + 1}`,
|
|
1866
|
-
description: asString6(rubric.description) ?? "",
|
|
1867
|
-
weight: typeof rubric.weight === "number" ? rubric.weight : 1,
|
|
1868
|
-
required: typeof rubric.required === "boolean" ? rubric.required : true
|
|
1869
|
-
};
|
|
1870
|
-
}).filter((r) => r.description.length > 0);
|
|
1871
|
-
if (rubricItems.length > 0) {
|
|
1872
|
-
const rubricEvaluator = {
|
|
1873
|
-
name: "rubric",
|
|
1874
|
-
type: "llm_judge",
|
|
1875
|
-
rubrics: rubricItems
|
|
1876
|
-
};
|
|
2091
|
+
const rubricEvaluator = parseInlineRubrics(inlineRubrics);
|
|
2092
|
+
if (rubricEvaluator) {
|
|
1877
2093
|
evaluators = evaluators ? [rubricEvaluator, ...evaluators] : [rubricEvaluator];
|
|
1878
2094
|
}
|
|
1879
2095
|
}
|
|
@@ -3245,7 +3461,8 @@ var ToolCallSchema = import_zod.z.object({
|
|
|
3245
3461
|
input: import_zod.z.unknown().optional(),
|
|
3246
3462
|
output: import_zod.z.unknown().optional(),
|
|
3247
3463
|
id: import_zod.z.string().optional(),
|
|
3248
|
-
timestamp: import_zod.z.string().optional()
|
|
3464
|
+
timestamp: import_zod.z.string().optional(),
|
|
3465
|
+
duration_ms: import_zod.z.number().optional()
|
|
3249
3466
|
});
|
|
3250
3467
|
var OutputMessageInputSchema = import_zod.z.object({
|
|
3251
3468
|
role: import_zod.z.string(),
|
|
@@ -3253,6 +3470,7 @@ var OutputMessageInputSchema = import_zod.z.object({
|
|
|
3253
3470
|
content: import_zod.z.unknown().optional(),
|
|
3254
3471
|
tool_calls: import_zod.z.array(ToolCallSchema).optional(),
|
|
3255
3472
|
timestamp: import_zod.z.string().optional(),
|
|
3473
|
+
duration_ms: import_zod.z.number().optional(),
|
|
3256
3474
|
metadata: import_zod.z.record(import_zod.z.unknown()).optional()
|
|
3257
3475
|
});
|
|
3258
3476
|
var TokenUsageSchema = import_zod.z.object({
|
|
@@ -3291,8 +3509,16 @@ function convertOutputMessages(messages) {
|
|
|
3291
3509
|
role: msg.role,
|
|
3292
3510
|
name: msg.name,
|
|
3293
3511
|
content: msg.content,
|
|
3294
|
-
toolCalls: msg.tool_calls
|
|
3512
|
+
toolCalls: msg.tool_calls?.map((tc) => ({
|
|
3513
|
+
tool: tc.tool,
|
|
3514
|
+
input: tc.input,
|
|
3515
|
+
output: tc.output,
|
|
3516
|
+
id: tc.id,
|
|
3517
|
+
timestamp: tc.timestamp,
|
|
3518
|
+
durationMs: tc.duration_ms
|
|
3519
|
+
})),
|
|
3295
3520
|
timestamp: msg.timestamp,
|
|
3521
|
+
durationMs: msg.duration_ms,
|
|
3296
3522
|
metadata: msg.metadata
|
|
3297
3523
|
}));
|
|
3298
3524
|
}
|
|
@@ -7226,6 +7452,15 @@ var rubricEvaluationSchema = import_zod3.z.object({
|
|
|
7226
7452
|
checks: import_zod3.z.array(rubricCheckResultSchema).describe("Results for each rubric item"),
|
|
7227
7453
|
overall_reasoning: import_zod3.z.string().describe("Overall assessment summary (1-2 sentences)")
|
|
7228
7454
|
});
|
|
7455
|
+
// Schema for one per-criterion entry in a score-range judge response:
// the rubric id, an integer score in 0-10, and an optional short justification.
var scoreRangeCheckResultSchema = import_zod3.z.object({
  id: import_zod3.z
    .string()
    .describe("The ID of the rubric criterion being scored"),
  score: import_zod3.z
    .number()
    .int()
    .min(0)
    .max(10)
    .describe("Integer score 0-10 for this criterion"),
  reasoning: import_zod3.z
    .string()
    .describe("Brief explanation (1-2 sentences) for this score")
    .optional()
});

// Schema for the whole score-range judge response: one check per criterion
// plus an optional overall summary.
var scoreRangeEvaluationSchema = import_zod3.z.object({
  checks: import_zod3.z
    .array(scoreRangeCheckResultSchema)
    .describe("Scores for each rubric criterion"),
  overall_reasoning: import_zod3.z
    .string()
    .describe("Overall assessment summary (1-2 sentences)")
    .optional()
});
|
|
7229
7464
|
var LlmJudgeEvaluator = class {
|
|
7230
7465
|
kind = "llm_judge";
|
|
7231
7466
|
resolveJudgeProvider;
|
|
@@ -7311,6 +7546,10 @@ var LlmJudgeEvaluator = class {
|
|
|
7311
7546
|
`No rubrics found for evaluator "${context.evaluator?.name ?? "llm_judge"}". Run "agentv generate rubrics" first.`
|
|
7312
7547
|
);
|
|
7313
7548
|
}
|
|
7549
|
+
const hasScoreRanges = rubrics.some((r) => r.score_ranges && r.score_ranges.length > 0);
|
|
7550
|
+
if (hasScoreRanges) {
|
|
7551
|
+
return this.evaluateWithScoreRanges(context, judgeProvider, rubrics);
|
|
7552
|
+
}
|
|
7314
7553
|
const prompt = this.buildRubricPrompt(context, rubrics);
|
|
7315
7554
|
const systemPrompt = buildRubricOutputSchema();
|
|
7316
7555
|
const evaluatorRawRequest = {
|
|
@@ -7336,6 +7575,84 @@ var LlmJudgeEvaluator = class {
|
|
|
7336
7575
|
evaluatorRawRequest
|
|
7337
7576
|
};
|
|
7338
7577
|
}
|
|
7578
|
+
/**
|
|
7579
|
+
* Evaluate using score-range rubrics (analytic rubric scoring).
|
|
7580
|
+
* Each criterion is scored 0-10 and normalized to 0-1.
|
|
7581
|
+
*/
|
|
7582
|
+
async evaluateWithScoreRanges(context, judgeProvider, rubrics) {
|
|
7583
|
+
const prompt = this.buildScoreRangePrompt(context, rubrics);
|
|
7584
|
+
const systemPrompt = buildScoreRangeOutputSchema();
|
|
7585
|
+
const evaluatorRawRequest = {
|
|
7586
|
+
userPrompt: prompt,
|
|
7587
|
+
systemPrompt,
|
|
7588
|
+
target: judgeProvider.targetName
|
|
7589
|
+
};
|
|
7590
|
+
const { data } = await this.runWithRetry({
|
|
7591
|
+
context,
|
|
7592
|
+
judgeProvider,
|
|
7593
|
+
systemPrompt,
|
|
7594
|
+
userPrompt: prompt,
|
|
7595
|
+
schema: scoreRangeEvaluationSchema
|
|
7596
|
+
});
|
|
7597
|
+
const { score, verdict, hits, misses, details } = calculateScoreRangeResult(data, rubrics);
|
|
7598
|
+
return {
|
|
7599
|
+
score,
|
|
7600
|
+
verdict,
|
|
7601
|
+
hits,
|
|
7602
|
+
misses,
|
|
7603
|
+
expectedAspectCount: rubrics.length,
|
|
7604
|
+
reasoning: data.overall_reasoning,
|
|
7605
|
+
evaluatorRawRequest,
|
|
7606
|
+
details
|
|
7607
|
+
};
|
|
7608
|
+
}
|
|
7609
|
+
/**
|
|
7610
|
+
* Build prompt for score-range rubric evaluation.
|
|
7611
|
+
*/
|
|
7612
|
+
buildScoreRangePrompt(context, rubrics) {
|
|
7613
|
+
const formattedQuestion = context.promptInputs.question && context.promptInputs.question.trim().length > 0 ? context.promptInputs.question : context.evalCase.question;
|
|
7614
|
+
const parts = [
|
|
7615
|
+
"You are an expert evaluator. Score the candidate answer on each criterion below using the provided score ranges.",
|
|
7616
|
+
"For each criterion, output an integer score from 0 to 10 based on which score range best matches the answer.",
|
|
7617
|
+
"",
|
|
7618
|
+
"[[ ## question ## ]]",
|
|
7619
|
+
formattedQuestion,
|
|
7620
|
+
"",
|
|
7621
|
+
"[[ ## expected_outcome ## ]]",
|
|
7622
|
+
context.evalCase.expected_outcome,
|
|
7623
|
+
""
|
|
7624
|
+
];
|
|
7625
|
+
if (context.evalCase.reference_answer && context.evalCase.reference_answer.trim().length > 0) {
|
|
7626
|
+
parts.push("[[ ## reference_answer ## ]]", context.evalCase.reference_answer, "");
|
|
7627
|
+
}
|
|
7628
|
+
parts.push(
|
|
7629
|
+
"[[ ## candidate_answer ## ]]",
|
|
7630
|
+
context.candidate,
|
|
7631
|
+
"",
|
|
7632
|
+
"[[ ## scoring_criteria ## ]]"
|
|
7633
|
+
);
|
|
7634
|
+
for (const rubric of rubrics) {
|
|
7635
|
+
const weightLabel = rubric.weight !== 1 ? ` (weight: ${rubric.weight})` : "";
|
|
7636
|
+
const minScoreLabel = rubric.required_min_score !== void 0 ? ` [REQUIRED: min score ${rubric.required_min_score}]` : "";
|
|
7637
|
+
parts.push("", `### Criterion: ${rubric.id}${weightLabel}${minScoreLabel}`);
|
|
7638
|
+
if (rubric.expected_outcome) {
|
|
7639
|
+
parts.push(`Description: ${rubric.expected_outcome}`);
|
|
7640
|
+
}
|
|
7641
|
+
if (rubric.score_ranges && rubric.score_ranges.length > 0) {
|
|
7642
|
+
parts.push("Score ranges:");
|
|
7643
|
+
for (const range of rubric.score_ranges) {
|
|
7644
|
+
const [min, max] = range.score_range;
|
|
7645
|
+
const rangeLabel = min === max ? `${min}` : `${min}-${max}`;
|
|
7646
|
+
parts.push(` - Score ${rangeLabel}: ${range.expected_outcome}`);
|
|
7647
|
+
}
|
|
7648
|
+
}
|
|
7649
|
+
}
|
|
7650
|
+
parts.push(
|
|
7651
|
+
"",
|
|
7652
|
+
"For each criterion, provide an integer score 0-10 that matches one of its defined score ranges."
|
|
7653
|
+
);
|
|
7654
|
+
return parts.join("\n");
|
|
7655
|
+
}
|
|
7339
7656
|
buildRubricPrompt(context, rubrics) {
|
|
7340
7657
|
const formattedQuestion = context.promptInputs.question && context.promptInputs.question.trim().length > 0 ? context.promptInputs.question : context.evalCase.question;
|
|
7341
7658
|
const parts = [
|
|
@@ -7355,7 +7672,7 @@ var LlmJudgeEvaluator = class {
|
|
|
7355
7672
|
for (const rubric of rubrics) {
|
|
7356
7673
|
const requiredLabel = rubric.required ? " (REQUIRED)" : "";
|
|
7357
7674
|
const weightLabel = rubric.weight !== 1 ? ` (weight: ${rubric.weight})` : "";
|
|
7358
|
-
parts.push(`- [${rubric.id}]${requiredLabel}${weightLabel}: ${rubric.
|
|
7675
|
+
parts.push(`- [${rubric.id}]${requiredLabel}${weightLabel}: ${rubric.expected_outcome}`);
|
|
7359
7676
|
}
|
|
7360
7677
|
parts.push("", "For each rubric, determine if it is satisfied and provide brief reasoning.");
|
|
7361
7678
|
return parts.join("\n");
|
|
@@ -7442,9 +7759,9 @@ function calculateRubricScore(result, rubrics) {
|
|
|
7442
7759
|
totalWeight += rubric.weight;
|
|
7443
7760
|
if (check.satisfied) {
|
|
7444
7761
|
earnedWeight += rubric.weight;
|
|
7445
|
-
hits.push(`[${rubric.id}] ${rubric.
|
|
7762
|
+
hits.push(`[${rubric.id}] ${rubric.expected_outcome}: ${check.reasoning}`);
|
|
7446
7763
|
} else {
|
|
7447
|
-
misses.push(`[${rubric.id}] ${rubric.
|
|
7764
|
+
misses.push(`[${rubric.id}] ${rubric.expected_outcome}: ${check.reasoning}`);
|
|
7448
7765
|
if (rubric.required) {
|
|
7449
7766
|
failedRequired = true;
|
|
7450
7767
|
}
|
|
@@ -7454,6 +7771,76 @@ function calculateRubricScore(result, rubrics) {
|
|
|
7454
7771
|
const verdict = failedRequired ? "fail" : scoreToVerdict(score);
|
|
7455
7772
|
return { score, verdict, hits, misses };
|
|
7456
7773
|
}
|
|
7774
|
+
function buildScoreRangeOutputSchema() {
|
|
7775
|
+
return `You are an expert evaluator. Score the candidate answer on each criterion.
|
|
7776
|
+
You must return a valid JSON object matching this schema:
|
|
7777
|
+
{
|
|
7778
|
+
"checks": [
|
|
7779
|
+
{
|
|
7780
|
+
"id": "string (criterion id)",
|
|
7781
|
+
"score": integer (0-10),
|
|
7782
|
+
"reasoning": "string (brief explanation for score)"
|
|
7783
|
+
}
|
|
7784
|
+
],
|
|
7785
|
+
"overall_reasoning": "string (summary, optional)"
|
|
7786
|
+
}
|
|
7787
|
+
|
|
7788
|
+
Important: The "score" must be an integer from 0 to 10 that falls within one of the defined score ranges for that criterion.`;
|
|
7789
|
+
}
|
|
7790
|
+
function calculateScoreRangeResult(result, rubrics) {
|
|
7791
|
+
const rubricMap = new Map(rubrics.map((rubric) => [rubric.id, rubric]));
|
|
7792
|
+
const hits = [];
|
|
7793
|
+
const misses = [];
|
|
7794
|
+
const rawScores = {};
|
|
7795
|
+
let totalWeight = 0;
|
|
7796
|
+
let weightedScoreSum = 0;
|
|
7797
|
+
let failedRequired = false;
|
|
7798
|
+
for (const check of result.checks) {
|
|
7799
|
+
const rubric = rubricMap.get(check.id);
|
|
7800
|
+
if (!rubric) {
|
|
7801
|
+
continue;
|
|
7802
|
+
}
|
|
7803
|
+
const rawScore = Math.max(0, Math.min(10, check.score));
|
|
7804
|
+
const normalizedScore = rawScore / 10;
|
|
7805
|
+
rawScores[rubric.id] = rawScore;
|
|
7806
|
+
totalWeight += rubric.weight;
|
|
7807
|
+
weightedScoreSum += normalizedScore * rubric.weight;
|
|
7808
|
+
let requiredMinScore;
|
|
7809
|
+
if (rubric.required_min_score !== void 0) {
|
|
7810
|
+
requiredMinScore = rubric.required_min_score;
|
|
7811
|
+
} else if (rubric.required === true) {
|
|
7812
|
+
requiredMinScore = 10;
|
|
7813
|
+
}
|
|
7814
|
+
const matchingRange = rubric.score_ranges?.find(
|
|
7815
|
+
(r) => rawScore >= r.score_range[0] && rawScore <= r.score_range[1]
|
|
7816
|
+
);
|
|
7817
|
+
const rangeDescription = matchingRange?.expected_outcome ?? "";
|
|
7818
|
+
const criterionLabel = rubric.expected_outcome ?? rubric.id;
|
|
7819
|
+
const reasoningText = check.reasoning ? `: ${check.reasoning}` : "";
|
|
7820
|
+
const scoreInfo = `[${rubric.id}] ${criterionLabel} - Score: ${rawScore}/10 (${rangeDescription})${reasoningText}`;
|
|
7821
|
+
if (requiredMinScore !== void 0 && rawScore < requiredMinScore) {
|
|
7822
|
+
failedRequired = true;
|
|
7823
|
+
misses.push(scoreInfo);
|
|
7824
|
+
} else if (rawScore >= 7) {
|
|
7825
|
+
hits.push(scoreInfo);
|
|
7826
|
+
} else {
|
|
7827
|
+
misses.push(scoreInfo);
|
|
7828
|
+
}
|
|
7829
|
+
}
|
|
7830
|
+
const score = totalWeight > 0 ? Math.min(1, Math.max(0, weightedScoreSum / totalWeight)) : 0;
|
|
7831
|
+
const verdict = failedRequired ? "fail" : scoreToVerdict(score);
|
|
7832
|
+
return {
|
|
7833
|
+
score,
|
|
7834
|
+
verdict,
|
|
7835
|
+
hits,
|
|
7836
|
+
misses,
|
|
7837
|
+
details: {
|
|
7838
|
+
raw_scores: rawScores,
|
|
7839
|
+
normalization: "score / 10",
|
|
7840
|
+
aggregation: "weighted_average"
|
|
7841
|
+
}
|
|
7842
|
+
};
|
|
7843
|
+
}
|
|
7457
7844
|
|
|
7458
7845
|
// src/evaluation/evaluators/composite.ts
|
|
7459
7846
|
var DEFAULT_COMPOSITE_AGGREGATOR_PROMPT = `Review the following evaluation results:
|
|
@@ -8281,6 +8668,27 @@ function argsMatch(expected, actual) {
|
|
|
8281
8668
|
}
|
|
8282
8669
|
return true;
|
|
8283
8670
|
}
|
|
8671
|
+
function checkLatency(toolName, maxDurationMs, actualDurationMs) {
|
|
8672
|
+
if (maxDurationMs === void 0) {
|
|
8673
|
+
return { status: "skip", message: "" };
|
|
8674
|
+
}
|
|
8675
|
+
if (actualDurationMs === void 0) {
|
|
8676
|
+
return {
|
|
8677
|
+
status: "skip",
|
|
8678
|
+
message: `No duration data for ${toolName}; latency assertion skipped`
|
|
8679
|
+
};
|
|
8680
|
+
}
|
|
8681
|
+
if (actualDurationMs <= maxDurationMs) {
|
|
8682
|
+
return {
|
|
8683
|
+
status: "pass",
|
|
8684
|
+
message: `${toolName} completed in ${actualDurationMs}ms (max: ${maxDurationMs}ms)`
|
|
8685
|
+
};
|
|
8686
|
+
}
|
|
8687
|
+
return {
|
|
8688
|
+
status: "fail",
|
|
8689
|
+
message: `${toolName} took ${actualDurationMs}ms (max: ${maxDurationMs}ms)`
|
|
8690
|
+
};
|
|
8691
|
+
}
|
|
8284
8692
|
var ToolTrajectoryEvaluator = class {
|
|
8285
8693
|
kind = "tool_trajectory";
|
|
8286
8694
|
config;
|
|
@@ -8339,7 +8747,8 @@ var ToolTrajectoryEvaluator = class {
|
|
|
8339
8747
|
for (const call of message.toolCalls) {
|
|
8340
8748
|
toolCalls.push({
|
|
8341
8749
|
name: call.tool,
|
|
8342
|
-
args: call.input
|
|
8750
|
+
args: call.input,
|
|
8751
|
+
durationMs: call.durationMs
|
|
8343
8752
|
});
|
|
8344
8753
|
}
|
|
8345
8754
|
}
|
|
@@ -8407,17 +8816,27 @@ var ToolTrajectoryEvaluator = class {
|
|
|
8407
8816
|
}
|
|
8408
8817
|
const hits = [];
|
|
8409
8818
|
const misses = [];
|
|
8819
|
+
const warnings = [];
|
|
8410
8820
|
let actualIndex = 0;
|
|
8821
|
+
let sequenceHits = 0;
|
|
8822
|
+
let latencyHits = 0;
|
|
8823
|
+
let latencySkips = 0;
|
|
8824
|
+
const latencyAssertionCount = expected.filter(
|
|
8825
|
+
(item) => item.maxDurationMs !== void 0
|
|
8826
|
+
).length;
|
|
8411
8827
|
for (let i = 0; i < expected.length; i++) {
|
|
8412
8828
|
const expectedItem = expected[i];
|
|
8413
8829
|
const expectedTool = expectedItem.tool;
|
|
8414
8830
|
let found = false;
|
|
8415
8831
|
let argsMismatch = false;
|
|
8832
|
+
let matchedCall;
|
|
8416
8833
|
while (actualIndex < toolCalls.length) {
|
|
8417
8834
|
const actualCall = toolCalls[actualIndex];
|
|
8418
8835
|
if (actualCall.name === expectedTool) {
|
|
8419
8836
|
if (argsMatch(expectedItem.args, actualCall.args)) {
|
|
8420
8837
|
hits.push(`Found ${expectedTool} at position ${actualIndex}`);
|
|
8838
|
+
sequenceHits++;
|
|
8839
|
+
matchedCall = actualCall;
|
|
8421
8840
|
actualIndex++;
|
|
8422
8841
|
found = true;
|
|
8423
8842
|
break;
|
|
@@ -8434,14 +8853,35 @@ var ToolTrajectoryEvaluator = class {
|
|
|
8434
8853
|
if (!found && !argsMismatch) {
|
|
8435
8854
|
misses.push(`Expected ${expectedTool} at position ${i}, not found in remaining trace`);
|
|
8436
8855
|
}
|
|
8856
|
+
if (found && matchedCall) {
|
|
8857
|
+
const latencyResult = checkLatency(
|
|
8858
|
+
expectedTool,
|
|
8859
|
+
expectedItem.maxDurationMs,
|
|
8860
|
+
matchedCall.durationMs
|
|
8861
|
+
);
|
|
8862
|
+
if (latencyResult.status === "pass") {
|
|
8863
|
+
hits.push(latencyResult.message);
|
|
8864
|
+
latencyHits++;
|
|
8865
|
+
} else if (latencyResult.status === "fail") {
|
|
8866
|
+
misses.push(latencyResult.message);
|
|
8867
|
+
} else if (latencyResult.message) {
|
|
8868
|
+
warnings.push(latencyResult.message);
|
|
8869
|
+
latencySkips++;
|
|
8870
|
+
}
|
|
8871
|
+
}
|
|
8437
8872
|
}
|
|
8438
|
-
const
|
|
8873
|
+
for (const warning of warnings) {
|
|
8874
|
+
console.warn(`[tool_trajectory] ${warning}`);
|
|
8875
|
+
}
|
|
8876
|
+
const effectiveLatencyAssertions = latencyAssertionCount - latencySkips;
|
|
8877
|
+
const totalAssertions = expected.length + effectiveLatencyAssertions;
|
|
8878
|
+
const score = totalAssertions > 0 ? (sequenceHits + latencyHits) / totalAssertions : 1;
|
|
8439
8879
|
return {
|
|
8440
8880
|
score,
|
|
8441
8881
|
verdict: scoreToVerdict(score),
|
|
8442
8882
|
hits,
|
|
8443
8883
|
misses,
|
|
8444
|
-
expectedAspectCount:
|
|
8884
|
+
expectedAspectCount: totalAssertions
|
|
8445
8885
|
};
|
|
8446
8886
|
}
|
|
8447
8887
|
evaluateExact(toolCalls) {
|
|
@@ -8457,6 +8897,13 @@ var ToolTrajectoryEvaluator = class {
|
|
|
8457
8897
|
}
|
|
8458
8898
|
const hits = [];
|
|
8459
8899
|
const misses = [];
|
|
8900
|
+
const warnings = [];
|
|
8901
|
+
let sequenceHits = 0;
|
|
8902
|
+
let latencyHits = 0;
|
|
8903
|
+
let latencySkips = 0;
|
|
8904
|
+
const latencyAssertionCount = expected.filter(
|
|
8905
|
+
(item) => item.maxDurationMs !== void 0
|
|
8906
|
+
).length;
|
|
8460
8907
|
if (toolCalls.length !== expected.length) {
|
|
8461
8908
|
misses.push(`Expected ${expected.length} tool calls, got ${toolCalls.length}`);
|
|
8462
8909
|
}
|
|
@@ -8466,26 +8913,50 @@ var ToolTrajectoryEvaluator = class {
|
|
|
8466
8913
|
const expectedTool = expectedItem.tool;
|
|
8467
8914
|
const actualCall = toolCalls[i];
|
|
8468
8915
|
const actualTool = actualCall.name;
|
|
8916
|
+
let sequenceMatched = false;
|
|
8469
8917
|
if (actualTool === expectedTool) {
|
|
8470
8918
|
if (argsMatch(expectedItem.args, actualCall.args)) {
|
|
8471
8919
|
hits.push(`Position ${i}: ${expectedTool}`);
|
|
8920
|
+
sequenceHits++;
|
|
8921
|
+
sequenceMatched = true;
|
|
8472
8922
|
} else {
|
|
8473
8923
|
misses.push(`Position ${i}: ${expectedTool} args mismatch`);
|
|
8474
8924
|
}
|
|
8475
8925
|
} else {
|
|
8476
8926
|
misses.push(`Position ${i}: expected ${expectedTool}, got ${actualTool}`);
|
|
8477
8927
|
}
|
|
8928
|
+
if (sequenceMatched) {
|
|
8929
|
+
const latencyResult = checkLatency(
|
|
8930
|
+
expectedTool,
|
|
8931
|
+
expectedItem.maxDurationMs,
|
|
8932
|
+
actualCall.durationMs
|
|
8933
|
+
);
|
|
8934
|
+
if (latencyResult.status === "pass") {
|
|
8935
|
+
hits.push(latencyResult.message);
|
|
8936
|
+
latencyHits++;
|
|
8937
|
+
} else if (latencyResult.status === "fail") {
|
|
8938
|
+
misses.push(latencyResult.message);
|
|
8939
|
+
} else if (latencyResult.message) {
|
|
8940
|
+
warnings.push(latencyResult.message);
|
|
8941
|
+
latencySkips++;
|
|
8942
|
+
}
|
|
8943
|
+
}
|
|
8478
8944
|
}
|
|
8479
8945
|
for (let i = checkLength; i < expected.length; i++) {
|
|
8480
8946
|
misses.push(`Position ${i}: expected ${expected[i].tool}, got nothing`);
|
|
8481
8947
|
}
|
|
8482
|
-
const
|
|
8948
|
+
for (const warning of warnings) {
|
|
8949
|
+
console.warn(`[tool_trajectory] ${warning}`);
|
|
8950
|
+
}
|
|
8951
|
+
const effectiveLatencyAssertions = latencyAssertionCount - latencySkips;
|
|
8952
|
+
const totalAssertions = expected.length + effectiveLatencyAssertions;
|
|
8953
|
+
const score = totalAssertions > 0 ? (sequenceHits + latencyHits) / totalAssertions : 1;
|
|
8483
8954
|
return {
|
|
8484
8955
|
score,
|
|
8485
8956
|
verdict: scoreToVerdict(score),
|
|
8486
8957
|
hits,
|
|
8487
8958
|
misses,
|
|
8488
|
-
expectedAspectCount:
|
|
8959
|
+
expectedAspectCount: totalAssertions
|
|
8489
8960
|
};
|
|
8490
8961
|
}
|
|
8491
8962
|
};
|
|
@@ -8493,6 +8964,7 @@ var ToolTrajectoryEvaluator = class {
|
|
|
8493
8964
|
// src/evaluation/orchestrator.ts
|
|
8494
8965
|
var import_node_crypto5 = require("crypto");
|
|
8495
8966
|
var import_node_path17 = __toESM(require("path"), 1);
|
|
8967
|
+
var import_micromatch4 = __toESM(require("micromatch"), 1);
|
|
8496
8968
|
|
|
8497
8969
|
// ../../node_modules/.bun/yocto-queue@1.2.2/node_modules/yocto-queue/index.js
|
|
8498
8970
|
var Node = class {
|
|
@@ -8651,17 +9123,17 @@ async function runEvaluation(options) {
|
|
|
8651
9123
|
cache,
|
|
8652
9124
|
useCache,
|
|
8653
9125
|
now,
|
|
8654
|
-
|
|
9126
|
+
filter,
|
|
8655
9127
|
verbose,
|
|
8656
9128
|
evalCases: preloadedEvalCases,
|
|
8657
9129
|
onResult,
|
|
8658
9130
|
onProgress
|
|
8659
9131
|
} = options;
|
|
8660
|
-
const evalCases = preloadedEvalCases ?? await loadEvalCases(evalFilePath, repoRoot, { verbose,
|
|
8661
|
-
const filteredEvalCases = filterEvalCases(evalCases,
|
|
9132
|
+
const evalCases = preloadedEvalCases ?? await loadEvalCases(evalFilePath, repoRoot, { verbose, filter });
|
|
9133
|
+
const filteredEvalCases = filterEvalCases(evalCases, filter);
|
|
8662
9134
|
if (filteredEvalCases.length === 0) {
|
|
8663
|
-
if (
|
|
8664
|
-
throw new Error(`
|
|
9135
|
+
if (filter) {
|
|
9136
|
+
throw new Error(`No eval cases matched filter '${filter}' in ${evalFilePath}`);
|
|
8665
9137
|
}
|
|
8666
9138
|
return [];
|
|
8667
9139
|
}
|
|
@@ -9237,7 +9709,10 @@ async function runEvaluatorList(options) {
|
|
|
9237
9709
|
attempt,
|
|
9238
9710
|
promptInputs,
|
|
9239
9711
|
now,
|
|
9240
|
-
judgeProvider
|
|
9712
|
+
judgeProvider,
|
|
9713
|
+
outputMessages,
|
|
9714
|
+
traceSummary,
|
|
9715
|
+
agentTimeoutMs
|
|
9241
9716
|
});
|
|
9242
9717
|
const weight = evaluator.weight ?? 1;
|
|
9243
9718
|
scored.push({ score: score2, name: evaluator.name, type: evaluator.type, weight });
|
|
@@ -9572,9 +10047,22 @@ async function runLlmJudgeEvaluator(options) {
|
|
|
9572
10047
|
attempt,
|
|
9573
10048
|
promptInputs,
|
|
9574
10049
|
now,
|
|
9575
|
-
judgeProvider
|
|
10050
|
+
judgeProvider,
|
|
10051
|
+
outputMessages,
|
|
10052
|
+
traceSummary,
|
|
10053
|
+
agentTimeoutMs
|
|
9576
10054
|
} = options;
|
|
9577
|
-
const customPrompt = await resolveCustomPrompt(
|
|
10055
|
+
const customPrompt = await resolveCustomPrompt(
|
|
10056
|
+
config,
|
|
10057
|
+
{
|
|
10058
|
+
evalCase,
|
|
10059
|
+
candidate,
|
|
10060
|
+
outputMessages,
|
|
10061
|
+
traceSummary,
|
|
10062
|
+
config: config.config
|
|
10063
|
+
},
|
|
10064
|
+
agentTimeoutMs
|
|
10065
|
+
);
|
|
9578
10066
|
return evaluatorRegistry.llm_judge.evaluate({
|
|
9579
10067
|
evalCase,
|
|
9580
10068
|
candidate,
|
|
@@ -9588,23 +10076,70 @@ async function runLlmJudgeEvaluator(options) {
|
|
|
9588
10076
|
evaluator: config
|
|
9589
10077
|
});
|
|
9590
10078
|
}
|
|
9591
|
-
async function resolveCustomPrompt(
|
|
9592
|
-
if (
|
|
10079
|
+
async function resolveCustomPrompt(promptConfig, context, timeoutMs) {
|
|
10080
|
+
if (promptConfig.resolvedPromptScript && promptConfig.resolvedPromptScript.length > 0) {
|
|
10081
|
+
if (!context) {
|
|
10082
|
+
throw new Error("Context required for executable prompt templates");
|
|
10083
|
+
}
|
|
10084
|
+
return executePromptTemplate(
|
|
10085
|
+
promptConfig.resolvedPromptScript,
|
|
10086
|
+
context,
|
|
10087
|
+
promptConfig.config,
|
|
10088
|
+
timeoutMs
|
|
10089
|
+
);
|
|
10090
|
+
}
|
|
10091
|
+
const promptPath = promptConfig.resolvedPromptPath ?? promptConfig.promptPath;
|
|
10092
|
+
if (promptPath) {
|
|
9593
10093
|
try {
|
|
9594
|
-
const content = await readTextFile(
|
|
10094
|
+
const content = await readTextFile(promptPath);
|
|
9595
10095
|
return content;
|
|
9596
10096
|
} catch (error) {
|
|
9597
10097
|
const message = error instanceof Error ? error.message : String(error);
|
|
9598
|
-
console.warn(`Could not read custom prompt at ${
|
|
10098
|
+
console.warn(`Could not read custom prompt at ${promptPath}: ${message}`);
|
|
9599
10099
|
}
|
|
9600
10100
|
}
|
|
9601
|
-
|
|
10101
|
+
const promptValue = promptConfig.prompt;
|
|
10102
|
+
if (typeof promptValue === "string") {
|
|
10103
|
+
return promptValue;
|
|
10104
|
+
}
|
|
10105
|
+
return void 0;
|
|
10106
|
+
}
|
|
10107
|
+
async function executePromptTemplate(script, context, config, timeoutMs) {
|
|
10108
|
+
const payload = {
|
|
10109
|
+
question: context.evalCase.question,
|
|
10110
|
+
expectedOutcome: context.evalCase.expected_outcome,
|
|
10111
|
+
expectedMessages: context.evalCase.expected_messages,
|
|
10112
|
+
referenceAnswer: context.evalCase.reference_answer,
|
|
10113
|
+
candidateAnswer: context.candidate,
|
|
10114
|
+
outputMessages: context.outputMessages ?? null,
|
|
10115
|
+
guidelineFiles: context.evalCase.guideline_paths,
|
|
10116
|
+
inputFiles: context.evalCase.file_paths.filter(
|
|
10117
|
+
(p) => !context.evalCase.guideline_paths.includes(p)
|
|
10118
|
+
),
|
|
10119
|
+
inputMessages: context.evalCase.input_messages,
|
|
10120
|
+
traceSummary: context.traceSummary ?? null,
|
|
10121
|
+
config: config ?? context.config ?? null
|
|
10122
|
+
};
|
|
10123
|
+
const inputJson = JSON.stringify(toSnakeCaseDeep(payload), null, 2);
|
|
10124
|
+
const scriptPath = script[script.length - 1];
|
|
10125
|
+
const cwd = import_node_path17.default.dirname(scriptPath);
|
|
10126
|
+
try {
|
|
10127
|
+
const stdout = await executeScript(script, inputJson, timeoutMs, cwd);
|
|
10128
|
+
const prompt = stdout.trim();
|
|
10129
|
+
if (!prompt) {
|
|
10130
|
+
throw new Error("Prompt template produced empty output");
|
|
10131
|
+
}
|
|
10132
|
+
return prompt;
|
|
10133
|
+
} catch (error) {
|
|
10134
|
+
const message = error instanceof Error ? error.message : String(error);
|
|
10135
|
+
throw new Error(`Prompt template execution failed: ${message}`);
|
|
10136
|
+
}
|
|
9602
10137
|
}
|
|
9603
|
-
function filterEvalCases(evalCases,
|
|
9604
|
-
if (!
|
|
10138
|
+
function filterEvalCases(evalCases, filter) {
|
|
10139
|
+
if (!filter) {
|
|
9605
10140
|
return evalCases;
|
|
9606
10141
|
}
|
|
9607
|
-
return evalCases.filter((evalCase) => evalCase.id
|
|
10142
|
+
return evalCases.filter((evalCase) => import_micromatch4.default.isMatch(evalCase.id, filter));
|
|
9608
10143
|
}
|
|
9609
10144
|
function buildEvaluatorRegistry(overrides, resolveJudgeProvider) {
|
|
9610
10145
|
const llmJudge = overrides?.llm_judge ?? new LlmJudgeEvaluator({
|
|
@@ -9762,7 +10297,7 @@ var import_ai4 = require("ai");
|
|
|
9762
10297
|
var import_zod4 = require("zod");
|
|
9763
10298
|
var rubricItemSchema = import_zod4.z.object({
|
|
9764
10299
|
id: import_zod4.z.string().describe("Short identifier for this rubric (e.g., clarity, completeness)"),
|
|
9765
|
-
|
|
10300
|
+
expected_outcome: import_zod4.z.string().describe("Concrete expected outcome for this rubric item"),
|
|
9766
10301
|
weight: import_zod4.z.number().default(1).describe("Relative importance (default 1.0)"),
|
|
9767
10302
|
required: import_zod4.z.boolean().default(true).describe("Whether this is a mandatory requirement")
|
|
9768
10303
|
});
|
|
@@ -9782,7 +10317,7 @@ You must return a valid JSON object matching this schema:
|
|
|
9782
10317
|
"rubrics": [
|
|
9783
10318
|
{
|
|
9784
10319
|
"id": "string (short identifier)",
|
|
9785
|
-
"
|
|
10320
|
+
"expected_outcome": "string (concrete expected outcome for this rubric item)",
|
|
9786
10321
|
"weight": number (default 1.0),
|
|
9787
10322
|
"required": boolean (default true)
|
|
9788
10323
|
}
|
|
@@ -9818,7 +10353,7 @@ function buildPrompt(expectedOutcome, question, referenceAnswer) {
|
|
|
9818
10353
|
"Each rubric should:",
|
|
9819
10354
|
"- Be specific and testable",
|
|
9820
10355
|
"- Have a short, descriptive ID",
|
|
9821
|
-
"- Include a clear
|
|
10356
|
+
"- Include a clear expected outcome statement (what a good answer must demonstrate for this rubric)",
|
|
9822
10357
|
"- Indicate if it is required (mandatory) or optional",
|
|
9823
10358
|
"- Have an appropriate weight (default 1.0, use higher values for more important aspects)",
|
|
9824
10359
|
"",
|