@agentv/core 2.1.1 → 2.5.1
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/README.md +77 -77
- package/dist/{chunk-KDEP4I7G.js → chunk-RP3M7COZ.js} +1 -1
- package/dist/{chunk-KDEP4I7G.js.map → chunk-RP3M7COZ.js.map} +1 -1
- package/dist/evaluation/validation/index.cjs +38 -4
- package/dist/evaluation/validation/index.cjs.map +1 -1
- package/dist/evaluation/validation/index.js +39 -5
- package/dist/evaluation/validation/index.js.map +1 -1
- package/dist/index.cjs +1070 -281
- package/dist/index.cjs.map +1 -1
- package/dist/index.d.cts +82 -7
- package/dist/index.d.ts +82 -7
- package/dist/index.js +1018 -230
- package/dist/index.js.map +1 -1
- package/package.json +1 -1
package/dist/index.cjs
CHANGED
|
@@ -53,6 +53,7 @@ __export(index_exports, {
|
|
|
53
53
|
createAgentKernel: () => createAgentKernel,
|
|
54
54
|
createProvider: () => createProvider,
|
|
55
55
|
deepEqual: () => deepEqual,
|
|
56
|
+
detectFormat: () => detectFormat,
|
|
56
57
|
ensureVSCodeSubagents: () => ensureVSCodeSubagents,
|
|
57
58
|
executeScript: () => executeScript,
|
|
58
59
|
explorationRatio: () => explorationRatio,
|
|
@@ -226,9 +227,10 @@ function mergeExecutionMetrics(summary, metrics) {
|
|
|
226
227
|
}
|
|
227
228
|
|
|
228
229
|
// src/evaluation/yaml-parser.ts
|
|
229
|
-
var
|
|
230
|
-
var
|
|
231
|
-
var
|
|
230
|
+
var import_promises7 = require("fs/promises");
|
|
231
|
+
var import_node_path7 = __toESM(require("path"), 1);
|
|
232
|
+
var import_micromatch3 = __toESM(require("micromatch"), 1);
|
|
233
|
+
var import_yaml3 = require("yaml");
|
|
232
234
|
|
|
233
235
|
// src/evaluation/loaders/config-loader.ts
|
|
234
236
|
var import_promises2 = require("fs/promises");
|
|
@@ -542,11 +544,11 @@ async function parseEvaluators(rawEvalCase, globalExecution, searchRoots, evalId
|
|
|
542
544
|
);
|
|
543
545
|
}
|
|
544
546
|
}
|
|
545
|
-
const
|
|
546
|
-
const
|
|
547
|
+
const knownProps2 = /* @__PURE__ */ new Set(["name", "type", "script", "cwd", "weight", "target"]);
|
|
548
|
+
const config2 = {};
|
|
547
549
|
for (const [key, value] of Object.entries(rawEvaluator)) {
|
|
548
|
-
if (!
|
|
549
|
-
|
|
550
|
+
if (!knownProps2.has(key) && value !== void 0) {
|
|
551
|
+
config2[key] = value;
|
|
550
552
|
}
|
|
551
553
|
}
|
|
552
554
|
evaluators.push({
|
|
@@ -556,7 +558,7 @@ async function parseEvaluators(rawEvalCase, globalExecution, searchRoots, evalId
|
|
|
556
558
|
cwd,
|
|
557
559
|
resolvedCwd,
|
|
558
560
|
...weight2 !== void 0 ? { weight: weight2 } : {},
|
|
559
|
-
...Object.keys(
|
|
561
|
+
...Object.keys(config2).length > 0 ? { config: config2 } : {},
|
|
560
562
|
...targetConfig !== void 0 ? { target: targetConfig } : {}
|
|
561
563
|
});
|
|
562
564
|
continue;
|
|
@@ -721,7 +723,7 @@ async function parseEvaluators(rawEvalCase, globalExecution, searchRoots, evalId
|
|
|
721
723
|
continue;
|
|
722
724
|
}
|
|
723
725
|
const weight2 = validateWeight(rawEvaluator.weight, name, evalId);
|
|
724
|
-
const
|
|
726
|
+
const config2 = {
|
|
725
727
|
name,
|
|
726
728
|
type: "tool_trajectory",
|
|
727
729
|
mode,
|
|
@@ -729,7 +731,7 @@ async function parseEvaluators(rawEvalCase, globalExecution, searchRoots, evalId
|
|
|
729
731
|
...expected ? { expected } : {},
|
|
730
732
|
...weight2 !== void 0 ? { weight: weight2 } : {}
|
|
731
733
|
};
|
|
732
|
-
evaluators.push(
|
|
734
|
+
evaluators.push(config2);
|
|
733
735
|
continue;
|
|
734
736
|
}
|
|
735
737
|
if (typeValue === "field_accuracy") {
|
|
@@ -866,9 +868,33 @@ async function parseEvaluators(rawEvalCase, globalExecution, searchRoots, evalId
|
|
|
866
868
|
});
|
|
867
869
|
continue;
|
|
868
870
|
}
|
|
869
|
-
const
|
|
871
|
+
const rawPrompt = rawEvaluator.prompt;
|
|
872
|
+
let prompt;
|
|
870
873
|
let promptPath;
|
|
871
|
-
|
|
874
|
+
let resolvedPromptScript;
|
|
875
|
+
let promptScriptConfig;
|
|
876
|
+
if (isJsonObject2(rawPrompt)) {
|
|
877
|
+
const scriptArray = asStringArray(
|
|
878
|
+
rawPrompt.script,
|
|
879
|
+
`prompt.script for evaluator '${name}' in '${evalId}'`
|
|
880
|
+
);
|
|
881
|
+
if (!scriptArray) {
|
|
882
|
+
throw new Error(`Evaluator '${name}' in '${evalId}': prompt object requires script array`);
|
|
883
|
+
}
|
|
884
|
+
const scriptPath = scriptArray[scriptArray.length - 1];
|
|
885
|
+
const resolved = await resolveFileReference(scriptPath, searchRoots);
|
|
886
|
+
if (resolved.resolvedPath) {
|
|
887
|
+
resolvedPromptScript = [...scriptArray.slice(0, -1), import_node_path3.default.resolve(resolved.resolvedPath)];
|
|
888
|
+
} else {
|
|
889
|
+
throw new Error(
|
|
890
|
+
`Evaluator '${name}' in '${evalId}': prompt script file not found: ${resolved.displayPath}`
|
|
891
|
+
);
|
|
892
|
+
}
|
|
893
|
+
if (isJsonObject2(rawPrompt.config)) {
|
|
894
|
+
promptScriptConfig = rawPrompt.config;
|
|
895
|
+
}
|
|
896
|
+
} else if (typeof rawPrompt === "string") {
|
|
897
|
+
prompt = rawPrompt;
|
|
872
898
|
const resolved = await resolveFileReference(prompt, searchRoots);
|
|
873
899
|
if (resolved.resolvedPath) {
|
|
874
900
|
promptPath = import_node_path3.default.resolve(resolved.resolvedPath);
|
|
@@ -887,12 +913,7 @@ async function parseEvaluators(rawEvalCase, globalExecution, searchRoots, evalId
|
|
|
887
913
|
}
|
|
888
914
|
const _model = asString(rawEvaluator.model);
|
|
889
915
|
const rawRubrics = rawEvaluator.rubrics;
|
|
890
|
-
const parsedRubrics = Array.isArray(rawRubrics) ? rawRubrics
|
|
891
|
-
id: asString(rubric.id) ?? `rubric-${index + 1}`,
|
|
892
|
-
description: asString(rubric.description) ?? "",
|
|
893
|
-
weight: typeof rubric.weight === "number" ? rubric.weight : 1,
|
|
894
|
-
required: typeof rubric.required === "boolean" ? rubric.required : true
|
|
895
|
-
})).filter((r) => r.description.length > 0) : void 0;
|
|
916
|
+
const parsedRubrics = Array.isArray(rawRubrics) ? parseRubricItems(rawRubrics, name, evalId) : void 0;
|
|
896
917
|
if (typeValue === "rubric") {
|
|
897
918
|
if (!parsedRubrics) {
|
|
898
919
|
logWarning2(`Skipping rubric evaluator '${name}' in '${evalId}': missing rubrics array`);
|
|
@@ -912,13 +933,26 @@ async function parseEvaluators(rawEvalCase, globalExecution, searchRoots, evalId
|
|
|
912
933
|
continue;
|
|
913
934
|
}
|
|
914
935
|
const weight = validateWeight(rawEvaluator.weight, name, evalId);
|
|
936
|
+
const knownProps = /* @__PURE__ */ new Set(["name", "type", "prompt", "model", "rubrics", "weight", "config"]);
|
|
937
|
+
const config = {};
|
|
938
|
+
for (const [key, value] of Object.entries(rawEvaluator)) {
|
|
939
|
+
if (!knownProps.has(key) && value !== void 0) {
|
|
940
|
+
config[key] = value;
|
|
941
|
+
}
|
|
942
|
+
}
|
|
943
|
+
const topLevelConfig = isJsonObject2(rawEvaluator.config) ? rawEvaluator.config : {};
|
|
944
|
+
const mergedConfig = { ...config, ...topLevelConfig };
|
|
945
|
+
const finalConfig = promptScriptConfig ?? (Object.keys(mergedConfig).length > 0 ? mergedConfig : void 0);
|
|
915
946
|
evaluators.push({
|
|
916
947
|
name,
|
|
917
948
|
type: "llm_judge",
|
|
918
949
|
prompt,
|
|
919
950
|
promptPath,
|
|
951
|
+
...promptPath ? { resolvedPromptPath: promptPath } : {},
|
|
952
|
+
...resolvedPromptScript ? { resolvedPromptScript } : {},
|
|
920
953
|
...parsedRubrics && parsedRubrics.length > 0 ? { rubrics: parsedRubrics } : {},
|
|
921
|
-
...weight !== void 0 ? { weight } : {}
|
|
954
|
+
...weight !== void 0 ? { weight } : {},
|
|
955
|
+
...finalConfig ? { config: finalConfig } : {}
|
|
922
956
|
});
|
|
923
957
|
}
|
|
924
958
|
return evaluators.length > 0 ? evaluators : void 0;
|
|
@@ -1005,6 +1039,191 @@ var VALID_FIELD_AGGREGATION_TYPES = /* @__PURE__ */ new Set(["weighted_average",
|
|
|
1005
1039
|
function isValidFieldAggregationType(value) {
|
|
1006
1040
|
return typeof value === "string" && VALID_FIELD_AGGREGATION_TYPES.has(value);
|
|
1007
1041
|
}
|
|
1042
|
+
function parseRubricItems(rawRubrics, evaluatorName, evalId) {
|
|
1043
|
+
const items = [];
|
|
1044
|
+
for (const [index, rawRubric] of rawRubrics.entries()) {
|
|
1045
|
+
if (!isJsonObject2(rawRubric)) {
|
|
1046
|
+
logWarning2(
|
|
1047
|
+
`Skipping invalid rubric entry at index ${index} in evaluator '${evaluatorName}' (expected object)`
|
|
1048
|
+
);
|
|
1049
|
+
continue;
|
|
1050
|
+
}
|
|
1051
|
+
const id = asString(rawRubric.id) ?? `rubric-${index + 1}`;
|
|
1052
|
+
const expectedOutcome = asString(rawRubric.expected_outcome) ?? asString(rawRubric.description) ?? "";
|
|
1053
|
+
const weight = typeof rawRubric.weight === "number" ? rawRubric.weight : 1;
|
|
1054
|
+
let requiredMinScore;
|
|
1055
|
+
let required;
|
|
1056
|
+
if (typeof rawRubric.required_min_score === "number") {
|
|
1057
|
+
const minScore = rawRubric.required_min_score;
|
|
1058
|
+
if (!Number.isInteger(minScore) || minScore < 0 || minScore > 10) {
|
|
1059
|
+
throw new Error(
|
|
1060
|
+
`Invalid required_min_score for rubric '${id}' in evaluator '${evaluatorName}' in '${evalId}': must be an integer 0-10 (got ${minScore})`
|
|
1061
|
+
);
|
|
1062
|
+
}
|
|
1063
|
+
requiredMinScore = minScore;
|
|
1064
|
+
}
|
|
1065
|
+
if (typeof rawRubric.required === "boolean") {
|
|
1066
|
+
required = rawRubric.required;
|
|
1067
|
+
}
|
|
1068
|
+
let scoreRanges;
|
|
1069
|
+
const rawScoreRanges = rawRubric.score_ranges;
|
|
1070
|
+
if (rawScoreRanges !== void 0) {
|
|
1071
|
+
if (!Array.isArray(rawScoreRanges)) {
|
|
1072
|
+
throw new Error(
|
|
1073
|
+
`Invalid score_ranges for rubric '${id}' in evaluator '${evaluatorName}' in '${evalId}': must be an array`
|
|
1074
|
+
);
|
|
1075
|
+
}
|
|
1076
|
+
scoreRanges = parseScoreRanges(rawScoreRanges, id, evaluatorName, evalId);
|
|
1077
|
+
items.push({
|
|
1078
|
+
id,
|
|
1079
|
+
weight,
|
|
1080
|
+
...expectedOutcome.length > 0 ? { expected_outcome: expectedOutcome } : {},
|
|
1081
|
+
...required !== void 0 ? { required } : {},
|
|
1082
|
+
...requiredMinScore !== void 0 ? { required_min_score: requiredMinScore } : {},
|
|
1083
|
+
score_ranges: scoreRanges
|
|
1084
|
+
});
|
|
1085
|
+
} else {
|
|
1086
|
+
if (expectedOutcome.length === 0) {
|
|
1087
|
+
logWarning2(
|
|
1088
|
+
`Skipping rubric '${id}' in evaluator '${evaluatorName}' in '${evalId}': missing expected_outcome`
|
|
1089
|
+
);
|
|
1090
|
+
continue;
|
|
1091
|
+
}
|
|
1092
|
+
items.push({
|
|
1093
|
+
id,
|
|
1094
|
+
expected_outcome: expectedOutcome,
|
|
1095
|
+
weight,
|
|
1096
|
+
// Default to required: true if not specified (backward compatibility)
|
|
1097
|
+
required: required ?? true,
|
|
1098
|
+
...requiredMinScore !== void 0 ? { required_min_score: requiredMinScore } : {}
|
|
1099
|
+
});
|
|
1100
|
+
}
|
|
1101
|
+
}
|
|
1102
|
+
return items.length > 0 ? items : void 0;
|
|
1103
|
+
}
|
|
1104
|
+
function parseScoreRanges(rawRanges, rubricId, evaluatorName, evalId) {
|
|
1105
|
+
const ranges = [];
|
|
1106
|
+
for (const [index, rawRange] of rawRanges.entries()) {
|
|
1107
|
+
if (!isJsonObject2(rawRange)) {
|
|
1108
|
+
throw new Error(
|
|
1109
|
+
`Invalid score_range entry at index ${index} for rubric '${rubricId}' in evaluator '${evaluatorName}' in '${evalId}': expected object`
|
|
1110
|
+
);
|
|
1111
|
+
}
|
|
1112
|
+
const scoreRangeValue = rawRange.score_range;
|
|
1113
|
+
if (!Array.isArray(scoreRangeValue) || scoreRangeValue.length !== 2 || typeof scoreRangeValue[0] !== "number" || typeof scoreRangeValue[1] !== "number") {
|
|
1114
|
+
throw new Error(
|
|
1115
|
+
`Invalid score_range at index ${index} for rubric '${rubricId}' in evaluator '${evaluatorName}' in '${evalId}': must be [min, max] array of two numbers`
|
|
1116
|
+
);
|
|
1117
|
+
}
|
|
1118
|
+
const [min, max] = scoreRangeValue;
|
|
1119
|
+
if (!Number.isInteger(min) || !Number.isInteger(max)) {
|
|
1120
|
+
throw new Error(
|
|
1121
|
+
`Invalid score_range at index ${index} for rubric '${rubricId}' in evaluator '${evaluatorName}' in '${evalId}': values must be integers (got [${min}, ${max}])`
|
|
1122
|
+
);
|
|
1123
|
+
}
|
|
1124
|
+
if (min < 0 || min > 10 || max < 0 || max > 10) {
|
|
1125
|
+
throw new Error(
|
|
1126
|
+
`Invalid score_range at index ${index} for rubric '${rubricId}' in evaluator '${evaluatorName}' in '${evalId}': values must be 0-10 (got [${min}, ${max}])`
|
|
1127
|
+
);
|
|
1128
|
+
}
|
|
1129
|
+
if (min > max) {
|
|
1130
|
+
throw new Error(
|
|
1131
|
+
`Invalid score_range at index ${index} for rubric '${rubricId}' in evaluator '${evaluatorName}' in '${evalId}': min must be <= max (got [${min}, ${max}])`
|
|
1132
|
+
);
|
|
1133
|
+
}
|
|
1134
|
+
const expectedOutcome = asString(rawRange.expected_outcome) ?? asString(rawRange.description) ?? "";
|
|
1135
|
+
if (expectedOutcome.length === 0) {
|
|
1136
|
+
throw new Error(
|
|
1137
|
+
`Missing expected_outcome for score_range [${min}, ${max}] in rubric '${rubricId}' in evaluator '${evaluatorName}' in '${evalId}'`
|
|
1138
|
+
);
|
|
1139
|
+
}
|
|
1140
|
+
ranges.push({
|
|
1141
|
+
score_range: [min, max],
|
|
1142
|
+
expected_outcome: expectedOutcome
|
|
1143
|
+
});
|
|
1144
|
+
}
|
|
1145
|
+
const sortedRanges = [...ranges].sort((a, b) => a.score_range[0] - b.score_range[0]);
|
|
1146
|
+
for (let i = 1; i < sortedRanges.length; i++) {
|
|
1147
|
+
const prev = sortedRanges[i - 1];
|
|
1148
|
+
const curr = sortedRanges[i];
|
|
1149
|
+
if (curr.score_range[0] <= prev.score_range[1]) {
|
|
1150
|
+
throw new Error(
|
|
1151
|
+
`Overlapping score_ranges in rubric '${rubricId}' in evaluator '${evaluatorName}' in '${evalId}': [${prev.score_range[0]}, ${prev.score_range[1]}] overlaps with [${curr.score_range[0]}, ${curr.score_range[1]}]`
|
|
1152
|
+
);
|
|
1153
|
+
}
|
|
1154
|
+
}
|
|
1155
|
+
const covered = /* @__PURE__ */ new Set();
|
|
1156
|
+
for (const range of ranges) {
|
|
1157
|
+
for (let i = range.score_range[0]; i <= range.score_range[1]; i++) {
|
|
1158
|
+
covered.add(i);
|
|
1159
|
+
}
|
|
1160
|
+
}
|
|
1161
|
+
const missing = [];
|
|
1162
|
+
for (let i = 0; i <= 10; i++) {
|
|
1163
|
+
if (!covered.has(i)) {
|
|
1164
|
+
missing.push(i);
|
|
1165
|
+
}
|
|
1166
|
+
}
|
|
1167
|
+
if (missing.length > 0) {
|
|
1168
|
+
throw new Error(
|
|
1169
|
+
`Incomplete score_ranges coverage in rubric '${rubricId}' in evaluator '${evaluatorName}' in '${evalId}': missing coverage for scores: ${missing.join(", ")}. Ranges must cover all integers 0-10.`
|
|
1170
|
+
);
|
|
1171
|
+
}
|
|
1172
|
+
return ranges;
|
|
1173
|
+
}
|
|
1174
|
+
function parseInlineRubrics(rawRubrics) {
|
|
1175
|
+
const rubricItems = rawRubrics.filter((r) => isJsonObject2(r) || typeof r === "string").map((rubric, index) => {
|
|
1176
|
+
if (typeof rubric === "string") {
|
|
1177
|
+
return {
|
|
1178
|
+
id: `rubric-${index + 1}`,
|
|
1179
|
+
expected_outcome: rubric,
|
|
1180
|
+
weight: 1,
|
|
1181
|
+
required: true
|
|
1182
|
+
};
|
|
1183
|
+
}
|
|
1184
|
+
const expectedOutcome = asString(rubric.expected_outcome) ?? asString(rubric.description) ?? "";
|
|
1185
|
+
const rawScoreRanges = rubric.score_ranges;
|
|
1186
|
+
const scoreRanges = Array.isArray(rawScoreRanges) && rawScoreRanges.length > 0 ? rawScoreRanges.filter((r) => isJsonObject2(r)).map((range) => ({
|
|
1187
|
+
score_range: Array.isArray(range.score_range) ? range.score_range : [0, 10],
|
|
1188
|
+
expected_outcome: asString(range.expected_outcome) ?? asString(range.description) ?? ""
|
|
1189
|
+
})).filter((r) => r.expected_outcome.length > 0) : void 0;
|
|
1190
|
+
const baseRubric = {
|
|
1191
|
+
id: asString(rubric.id) ?? `rubric-${index + 1}`,
|
|
1192
|
+
weight: typeof rubric.weight === "number" ? rubric.weight : 1
|
|
1193
|
+
};
|
|
1194
|
+
if (scoreRanges && scoreRanges.length > 0) {
|
|
1195
|
+
return {
|
|
1196
|
+
...baseRubric,
|
|
1197
|
+
...expectedOutcome.length > 0 ? { expected_outcome: expectedOutcome } : {},
|
|
1198
|
+
...typeof rubric.required === "boolean" ? { required: rubric.required } : {},
|
|
1199
|
+
...typeof rubric.required_min_score === "number" ? { required_min_score: rubric.required_min_score } : {},
|
|
1200
|
+
score_ranges: scoreRanges
|
|
1201
|
+
};
|
|
1202
|
+
}
|
|
1203
|
+
return {
|
|
1204
|
+
...baseRubric,
|
|
1205
|
+
expected_outcome: expectedOutcome,
|
|
1206
|
+
required: typeof rubric.required === "boolean" ? rubric.required : true,
|
|
1207
|
+
...typeof rubric.required_min_score === "number" ? { required_min_score: rubric.required_min_score } : {}
|
|
1208
|
+
};
|
|
1209
|
+
}).filter(
|
|
1210
|
+
(r) => r.expected_outcome && r.expected_outcome.length > 0 || "score_ranges" in r && r.score_ranges
|
|
1211
|
+
);
|
|
1212
|
+
if (rubricItems.length === 0) {
|
|
1213
|
+
return void 0;
|
|
1214
|
+
}
|
|
1215
|
+
return {
|
|
1216
|
+
name: "rubric",
|
|
1217
|
+
type: "llm_judge",
|
|
1218
|
+
rubrics: rubricItems
|
|
1219
|
+
};
|
|
1220
|
+
}
|
|
1221
|
+
|
|
1222
|
+
// src/evaluation/loaders/jsonl-parser.ts
|
|
1223
|
+
var import_promises5 = require("fs/promises");
|
|
1224
|
+
var import_node_path5 = __toESM(require("path"), 1);
|
|
1225
|
+
var import_micromatch2 = __toESM(require("micromatch"), 1);
|
|
1226
|
+
var import_yaml2 = require("yaml");
|
|
1008
1227
|
|
|
1009
1228
|
// src/evaluation/loaders/message-processor.ts
|
|
1010
1229
|
var import_promises4 = require("fs/promises");
|
|
@@ -1266,28 +1485,302 @@ async function processExpectedMessages(options) {
|
|
|
1266
1485
|
return segments;
|
|
1267
1486
|
}
|
|
1268
1487
|
|
|
1269
|
-
// src/evaluation/
|
|
1270
|
-
|
|
1271
|
-
|
|
1488
|
+
// src/evaluation/loaders/shorthand-expansion.ts
|
|
1489
|
+
function expandInputShorthand(value) {
|
|
1490
|
+
if (value === void 0 || value === null) {
|
|
1491
|
+
return void 0;
|
|
1492
|
+
}
|
|
1493
|
+
if (typeof value === "string") {
|
|
1494
|
+
return [{ role: "user", content: value }];
|
|
1495
|
+
}
|
|
1496
|
+
if (Array.isArray(value)) {
|
|
1497
|
+
const messages = value.filter((msg) => isTestMessage(msg));
|
|
1498
|
+
return messages.length > 0 ? messages : void 0;
|
|
1499
|
+
}
|
|
1500
|
+
return void 0;
|
|
1501
|
+
}
|
|
1502
|
+
function expandExpectedOutputShorthand(value) {
|
|
1503
|
+
if (value === void 0 || value === null) {
|
|
1504
|
+
return void 0;
|
|
1505
|
+
}
|
|
1506
|
+
if (typeof value === "string") {
|
|
1507
|
+
return [{ role: "assistant", content: value }];
|
|
1508
|
+
}
|
|
1509
|
+
if (Array.isArray(value)) {
|
|
1510
|
+
if (value.length > 0 && isJsonObject(value[0]) && "role" in value[0]) {
|
|
1511
|
+
const messages = value.filter((msg) => isTestMessage(msg));
|
|
1512
|
+
return messages.length > 0 ? messages : void 0;
|
|
1513
|
+
}
|
|
1514
|
+
return [{ role: "assistant", content: value }];
|
|
1515
|
+
}
|
|
1516
|
+
if (isJsonObject(value)) {
|
|
1517
|
+
if ("role" in value) {
|
|
1518
|
+
return isTestMessage(value) ? [value] : void 0;
|
|
1519
|
+
}
|
|
1520
|
+
return [{ role: "assistant", content: value }];
|
|
1521
|
+
}
|
|
1522
|
+
return void 0;
|
|
1523
|
+
}
|
|
1524
|
+
function resolveInputMessages(raw) {
|
|
1525
|
+
if (raw.input_messages !== void 0) {
|
|
1526
|
+
if (Array.isArray(raw.input_messages)) {
|
|
1527
|
+
const messages = raw.input_messages.filter((msg) => isTestMessage(msg));
|
|
1528
|
+
return messages.length > 0 ? messages : void 0;
|
|
1529
|
+
}
|
|
1530
|
+
return void 0;
|
|
1531
|
+
}
|
|
1532
|
+
return expandInputShorthand(raw.input);
|
|
1533
|
+
}
|
|
1534
|
+
function resolveExpectedMessages(raw) {
|
|
1535
|
+
if (raw.expected_messages !== void 0) {
|
|
1536
|
+
if (Array.isArray(raw.expected_messages)) {
|
|
1537
|
+
const messages = raw.expected_messages.filter(
|
|
1538
|
+
(msg) => isTestMessage(msg)
|
|
1539
|
+
);
|
|
1540
|
+
return messages.length > 0 ? messages : void 0;
|
|
1541
|
+
}
|
|
1542
|
+
return void 0;
|
|
1543
|
+
}
|
|
1544
|
+
return expandExpectedOutputShorthand(raw.expected_output);
|
|
1545
|
+
}
|
|
1546
|
+
|
|
1547
|
+
// src/evaluation/loaders/jsonl-parser.ts
|
|
1272
1548
|
var ANSI_YELLOW5 = "\x1B[33m";
|
|
1549
|
+
var ANSI_RED = "\x1B[31m";
|
|
1273
1550
|
var ANSI_RESET5 = "\x1B[0m";
|
|
1551
|
+
function detectFormat(filePath) {
|
|
1552
|
+
const ext = import_node_path5.default.extname(filePath).toLowerCase();
|
|
1553
|
+
if (ext === ".jsonl") return "jsonl";
|
|
1554
|
+
if (ext === ".yaml" || ext === ".yml") return "yaml";
|
|
1555
|
+
throw new Error(`Unsupported file format: '${ext}'. Supported formats: .yaml, .yml, .jsonl`);
|
|
1556
|
+
}
|
|
1557
|
+
async function loadSidecarMetadata(jsonlPath, verbose) {
|
|
1558
|
+
const dir = import_node_path5.default.dirname(jsonlPath);
|
|
1559
|
+
const base = import_node_path5.default.basename(jsonlPath, ".jsonl");
|
|
1560
|
+
const sidecarPath = import_node_path5.default.join(dir, `${base}.yaml`);
|
|
1561
|
+
if (!await fileExists(sidecarPath)) {
|
|
1562
|
+
if (verbose) {
|
|
1563
|
+
logWarning4(`Sidecar metadata file not found: ${sidecarPath} (using defaults)`);
|
|
1564
|
+
}
|
|
1565
|
+
return {};
|
|
1566
|
+
}
|
|
1567
|
+
try {
|
|
1568
|
+
const content = await (0, import_promises5.readFile)(sidecarPath, "utf8");
|
|
1569
|
+
const parsed = (0, import_yaml2.parse)(content);
|
|
1570
|
+
if (!isJsonObject(parsed)) {
|
|
1571
|
+
logWarning4(`Invalid sidecar metadata format in ${sidecarPath}`);
|
|
1572
|
+
return {};
|
|
1573
|
+
}
|
|
1574
|
+
return {
|
|
1575
|
+
description: asString4(parsed.description),
|
|
1576
|
+
dataset: asString4(parsed.dataset),
|
|
1577
|
+
execution: isJsonObject(parsed.execution) ? parsed.execution : void 0,
|
|
1578
|
+
evaluator: parsed.evaluator
|
|
1579
|
+
};
|
|
1580
|
+
} catch (error) {
|
|
1581
|
+
logWarning4(`Could not read sidecar metadata from ${sidecarPath}: ${error.message}`);
|
|
1582
|
+
return {};
|
|
1583
|
+
}
|
|
1584
|
+
}
|
|
1585
|
+
function parseJsonlContent(content, filePath) {
|
|
1586
|
+
const lines = content.split("\n");
|
|
1587
|
+
const cases = [];
|
|
1588
|
+
for (let i = 0; i < lines.length; i++) {
|
|
1589
|
+
const line = lines[i].trim();
|
|
1590
|
+
if (line === "") continue;
|
|
1591
|
+
try {
|
|
1592
|
+
const parsed = JSON.parse(line);
|
|
1593
|
+
if (!isJsonObject(parsed)) {
|
|
1594
|
+
throw new Error("Expected JSON object");
|
|
1595
|
+
}
|
|
1596
|
+
cases.push(parsed);
|
|
1597
|
+
} catch (error) {
|
|
1598
|
+
const message = error instanceof Error ? error.message : String(error);
|
|
1599
|
+
throw new Error(`Line ${i + 1}: Invalid JSON - ${message}
|
|
1600
|
+
File: ${filePath}`);
|
|
1601
|
+
}
|
|
1602
|
+
}
|
|
1603
|
+
return cases;
|
|
1604
|
+
}
|
|
1605
|
+
async function loadEvalCasesFromJsonl(evalFilePath, repoRoot, options) {
|
|
1606
|
+
const verbose = options?.verbose ?? false;
|
|
1607
|
+
const filterPattern = options?.filter;
|
|
1608
|
+
const absoluteTestPath = import_node_path5.default.resolve(evalFilePath);
|
|
1609
|
+
const repoRootPath = resolveToAbsolutePath(repoRoot);
|
|
1610
|
+
const searchRoots = buildSearchRoots(absoluteTestPath, repoRootPath);
|
|
1611
|
+
const config = await loadConfig(absoluteTestPath, repoRootPath);
|
|
1612
|
+
const guidelinePatterns = config?.guideline_patterns;
|
|
1613
|
+
const sidecar = await loadSidecarMetadata(absoluteTestPath, verbose);
|
|
1614
|
+
const rawFile = await (0, import_promises5.readFile)(absoluteTestPath, "utf8");
|
|
1615
|
+
const rawCases = parseJsonlContent(rawFile, evalFilePath);
|
|
1616
|
+
const fallbackDataset = import_node_path5.default.basename(absoluteTestPath, ".jsonl") || "eval";
|
|
1617
|
+
const datasetName = sidecar.dataset && sidecar.dataset.trim().length > 0 ? sidecar.dataset : fallbackDataset;
|
|
1618
|
+
const globalEvaluator = coerceEvaluator(sidecar.evaluator, "sidecar") ?? "llm_judge";
|
|
1619
|
+
const globalExecution = sidecar.execution;
|
|
1620
|
+
if (verbose) {
|
|
1621
|
+
console.log(`
|
|
1622
|
+
[JSONL Dataset: ${evalFilePath}]`);
|
|
1623
|
+
console.log(` Cases: ${rawCases.length}`);
|
|
1624
|
+
console.log(` Dataset name: ${datasetName}`);
|
|
1625
|
+
if (sidecar.description) {
|
|
1626
|
+
console.log(` Description: ${sidecar.description}`);
|
|
1627
|
+
}
|
|
1628
|
+
}
|
|
1629
|
+
const results = [];
|
|
1630
|
+
for (let lineIndex = 0; lineIndex < rawCases.length; lineIndex++) {
|
|
1631
|
+
const evalcase = rawCases[lineIndex];
|
|
1632
|
+
const lineNumber = lineIndex + 1;
|
|
1633
|
+
const id = asString4(evalcase.id);
|
|
1634
|
+
if (filterPattern && (!id || !import_micromatch2.default.isMatch(id, filterPattern))) {
|
|
1635
|
+
continue;
|
|
1636
|
+
}
|
|
1637
|
+
const conversationId = asString4(evalcase.conversation_id);
|
|
1638
|
+
const outcome = asString4(evalcase.expected_outcome) ?? asString4(evalcase.outcome);
|
|
1639
|
+
const inputMessages = resolveInputMessages(evalcase);
|
|
1640
|
+
const expectedMessages = resolveExpectedMessages(evalcase) ?? [];
|
|
1641
|
+
if (!id || !outcome || !inputMessages || inputMessages.length === 0) {
|
|
1642
|
+
logError(
|
|
1643
|
+
`Skipping incomplete eval case at line ${lineNumber}: ${id ?? "unknown"}. Missing required fields: id, expected_outcome, and/or input_messages (or input)`
|
|
1644
|
+
);
|
|
1645
|
+
continue;
|
|
1646
|
+
}
|
|
1647
|
+
const hasExpectedMessages = expectedMessages.length > 0;
|
|
1648
|
+
const guidelinePaths = [];
|
|
1649
|
+
const inputTextParts = [];
|
|
1650
|
+
const inputSegments = await processMessages({
|
|
1651
|
+
messages: inputMessages,
|
|
1652
|
+
searchRoots,
|
|
1653
|
+
repoRootPath,
|
|
1654
|
+
guidelinePatterns,
|
|
1655
|
+
guidelinePaths,
|
|
1656
|
+
textParts: inputTextParts,
|
|
1657
|
+
messageType: "input",
|
|
1658
|
+
verbose
|
|
1659
|
+
});
|
|
1660
|
+
const outputSegments = hasExpectedMessages ? await processExpectedMessages({
|
|
1661
|
+
messages: expectedMessages,
|
|
1662
|
+
searchRoots,
|
|
1663
|
+
repoRootPath,
|
|
1664
|
+
verbose
|
|
1665
|
+
}) : [];
|
|
1666
|
+
let referenceAnswer = "";
|
|
1667
|
+
if (outputSegments.length > 0) {
|
|
1668
|
+
const lastMessage = outputSegments[outputSegments.length - 1];
|
|
1669
|
+
const content = lastMessage.content;
|
|
1670
|
+
const toolCalls = lastMessage.tool_calls;
|
|
1671
|
+
if (typeof content === "string") {
|
|
1672
|
+
referenceAnswer = content;
|
|
1673
|
+
} else if (content !== void 0 && content !== null) {
|
|
1674
|
+
referenceAnswer = JSON.stringify(content, null, 2);
|
|
1675
|
+
} else if (toolCalls !== void 0 && toolCalls !== null) {
|
|
1676
|
+
referenceAnswer = JSON.stringify(toolCalls, null, 2);
|
|
1677
|
+
}
|
|
1678
|
+
}
|
|
1679
|
+
const question = inputTextParts.map((part) => part.trim()).filter((part) => part.length > 0).join(" ");
|
|
1680
|
+
const caseExecution = isJsonObject(evalcase.execution) ? evalcase.execution : void 0;
|
|
1681
|
+
const mergedExecution = caseExecution ?? globalExecution;
|
|
1682
|
+
const evalCaseEvaluatorKind = coerceEvaluator(evalcase.evaluator, id) ?? globalEvaluator;
|
|
1683
|
+
let evaluators;
|
|
1684
|
+
try {
|
|
1685
|
+
evaluators = await parseEvaluators(evalcase, mergedExecution, searchRoots, id ?? "unknown");
|
|
1686
|
+
} catch (error) {
|
|
1687
|
+
const message = error instanceof Error ? error.message : String(error);
|
|
1688
|
+
logError(`Skipping eval case '${id}' at line ${lineNumber}: ${message}`);
|
|
1689
|
+
continue;
|
|
1690
|
+
}
|
|
1691
|
+
const inlineRubrics = evalcase.rubrics;
|
|
1692
|
+
if (inlineRubrics !== void 0 && Array.isArray(inlineRubrics)) {
|
|
1693
|
+
const rubricEvaluator = parseInlineRubrics(inlineRubrics);
|
|
1694
|
+
if (rubricEvaluator) {
|
|
1695
|
+
evaluators = evaluators ? [rubricEvaluator, ...evaluators] : [rubricEvaluator];
|
|
1696
|
+
}
|
|
1697
|
+
}
|
|
1698
|
+
const userFilePaths = [];
|
|
1699
|
+
for (const segment of inputSegments) {
|
|
1700
|
+
if (segment.type === "file" && typeof segment.resolvedPath === "string") {
|
|
1701
|
+
userFilePaths.push(segment.resolvedPath);
|
|
1702
|
+
}
|
|
1703
|
+
}
|
|
1704
|
+
const allFilePaths = [
|
|
1705
|
+
...guidelinePaths.map((guidelinePath) => import_node_path5.default.resolve(guidelinePath)),
|
|
1706
|
+
...userFilePaths
|
|
1707
|
+
];
|
|
1708
|
+
const testCase = {
|
|
1709
|
+
id,
|
|
1710
|
+
dataset: datasetName,
|
|
1711
|
+
conversation_id: conversationId,
|
|
1712
|
+
question,
|
|
1713
|
+
input_messages: inputMessages,
|
|
1714
|
+
input_segments: inputSegments,
|
|
1715
|
+
expected_messages: outputSegments,
|
|
1716
|
+
reference_answer: referenceAnswer,
|
|
1717
|
+
guideline_paths: guidelinePaths.map((guidelinePath) => import_node_path5.default.resolve(guidelinePath)),
|
|
1718
|
+
guideline_patterns: guidelinePatterns,
|
|
1719
|
+
file_paths: allFilePaths,
|
|
1720
|
+
expected_outcome: outcome,
|
|
1721
|
+
evaluator: evalCaseEvaluatorKind,
|
|
1722
|
+
evaluators
|
|
1723
|
+
};
|
|
1724
|
+
if (verbose) {
|
|
1725
|
+
console.log(`
|
|
1726
|
+
[Eval Case: ${id}]`);
|
|
1727
|
+
if (testCase.guideline_paths.length > 0) {
|
|
1728
|
+
console.log(` Guidelines used: ${testCase.guideline_paths.length}`);
|
|
1729
|
+
for (const guidelinePath of testCase.guideline_paths) {
|
|
1730
|
+
console.log(` - ${guidelinePath}`);
|
|
1731
|
+
}
|
|
1732
|
+
} else {
|
|
1733
|
+
console.log(" No guidelines found");
|
|
1734
|
+
}
|
|
1735
|
+
}
|
|
1736
|
+
results.push(testCase);
|
|
1737
|
+
}
|
|
1738
|
+
return results;
|
|
1739
|
+
}
|
|
1740
|
+
function asString4(value) {
|
|
1741
|
+
return typeof value === "string" ? value : void 0;
|
|
1742
|
+
}
|
|
1743
|
+
function logWarning4(message, details) {
|
|
1744
|
+
if (details && details.length > 0) {
|
|
1745
|
+
const detailBlock = details.join("\n");
|
|
1746
|
+
console.warn(`${ANSI_YELLOW5}Warning: ${message}
|
|
1747
|
+
${detailBlock}${ANSI_RESET5}`);
|
|
1748
|
+
} else {
|
|
1749
|
+
console.warn(`${ANSI_YELLOW5}Warning: ${message}${ANSI_RESET5}`);
|
|
1750
|
+
}
|
|
1751
|
+
}
|
|
1752
|
+
function logError(message, details) {
|
|
1753
|
+
if (details && details.length > 0) {
|
|
1754
|
+
const detailBlock = details.join("\n");
|
|
1755
|
+
console.error(`${ANSI_RED}Error: ${message}
|
|
1756
|
+
${detailBlock}${ANSI_RESET5}`);
|
|
1757
|
+
} else {
|
|
1758
|
+
console.error(`${ANSI_RED}Error: ${message}${ANSI_RESET5}`);
|
|
1759
|
+
}
|
|
1760
|
+
}
|
|
1761
|
+
|
|
1762
|
+
// src/evaluation/formatting/prompt-builder.ts
|
|
1763
|
+
var import_promises6 = require("fs/promises");
|
|
1764
|
+
var import_node_path6 = __toESM(require("path"), 1);
|
|
1765
|
+
var ANSI_YELLOW6 = "\x1B[33m";
|
|
1766
|
+
var ANSI_RESET6 = "\x1B[0m";
|
|
1274
1767
|
async function buildPromptInputs(testCase, mode = "lm") {
|
|
1275
1768
|
const guidelineParts = [];
|
|
1276
1769
|
for (const rawPath of testCase.guideline_paths) {
|
|
1277
|
-
const absolutePath =
|
|
1770
|
+
const absolutePath = import_node_path6.default.resolve(rawPath);
|
|
1278
1771
|
if (!await fileExists(absolutePath)) {
|
|
1279
|
-
|
|
1772
|
+
logWarning5(`Could not read guideline file ${absolutePath}: file does not exist`);
|
|
1280
1773
|
continue;
|
|
1281
1774
|
}
|
|
1282
1775
|
try {
|
|
1283
|
-
const content = (await (0,
|
|
1776
|
+
const content = (await (0, import_promises6.readFile)(absolutePath, "utf8")).replace(/\r\n/g, "\n").trim();
|
|
1284
1777
|
guidelineParts.push({
|
|
1285
1778
|
content,
|
|
1286
1779
|
isFile: true,
|
|
1287
|
-
displayPath:
|
|
1780
|
+
displayPath: import_node_path6.default.basename(absolutePath)
|
|
1288
1781
|
});
|
|
1289
1782
|
} catch (error) {
|
|
1290
|
-
|
|
1783
|
+
logWarning5(`Could not read guideline file ${absolutePath}: ${error.message}`);
|
|
1291
1784
|
}
|
|
1292
1785
|
}
|
|
1293
1786
|
const guidelines = formatFileContents(guidelineParts);
|
|
@@ -1311,9 +1804,9 @@ async function buildPromptInputs(testCase, mode = "lm") {
|
|
|
1311
1804
|
messageSegments.push({ type: "text", value: segment });
|
|
1312
1805
|
}
|
|
1313
1806
|
} else if (isJsonObject(segment)) {
|
|
1314
|
-
const type =
|
|
1807
|
+
const type = asString5(segment.type);
|
|
1315
1808
|
if (type === "file") {
|
|
1316
|
-
const value =
|
|
1809
|
+
const value = asString5(segment.value);
|
|
1317
1810
|
if (!value) continue;
|
|
1318
1811
|
if (testCase.guideline_patterns && isGuidelineFile(value, testCase.guideline_patterns)) {
|
|
1319
1812
|
messageSegments.push({ type: "guideline_ref", path: value });
|
|
@@ -1324,7 +1817,7 @@ async function buildPromptInputs(testCase, mode = "lm") {
|
|
|
1324
1817
|
messageSegments.push({ type: "file", text: fileText, path: value });
|
|
1325
1818
|
}
|
|
1326
1819
|
} else if (type === "text") {
|
|
1327
|
-
const textValue =
|
|
1820
|
+
const textValue = asString5(segment.value);
|
|
1328
1821
|
if (textValue && textValue.trim().length > 0) {
|
|
1329
1822
|
messageSegments.push({ type: "text", value: textValue });
|
|
1330
1823
|
}
|
|
@@ -1478,22 +1971,22 @@ ${guidelineContent.trim()}`);
|
|
|
1478
1971
|
}
|
|
1479
1972
|
return chatPrompt.length > 0 ? chatPrompt : void 0;
|
|
1480
1973
|
}
|
|
1481
|
-
function
|
|
1974
|
+
function asString5(value) {
|
|
1482
1975
|
return typeof value === "string" ? value : void 0;
|
|
1483
1976
|
}
|
|
1484
|
-
function
|
|
1485
|
-
console.warn(`${
|
|
1977
|
+
function logWarning5(message) {
|
|
1978
|
+
console.warn(`${ANSI_YELLOW6}Warning: ${message}${ANSI_RESET6}`);
|
|
1486
1979
|
}
|
|
1487
1980
|
|
|
1488
1981
|
// src/evaluation/yaml-parser.ts
|
|
1489
|
-
var
|
|
1490
|
-
var
|
|
1491
|
-
var
|
|
1982
|
+
var ANSI_YELLOW7 = "\x1B[33m";
|
|
1983
|
+
var ANSI_RED2 = "\x1B[31m";
|
|
1984
|
+
var ANSI_RESET7 = "\x1B[0m";
|
|
1492
1985
|
async function readTestSuiteMetadata(testFilePath) {
|
|
1493
1986
|
try {
|
|
1494
|
-
const absolutePath =
|
|
1495
|
-
const content = await (0,
|
|
1496
|
-
const parsed = (0,
|
|
1987
|
+
const absolutePath = import_node_path7.default.resolve(testFilePath);
|
|
1988
|
+
const content = await (0, import_promises7.readFile)(absolutePath, "utf8");
|
|
1989
|
+
const parsed = (0, import_yaml3.parse)(content);
|
|
1497
1990
|
if (!isJsonObject(parsed)) {
|
|
1498
1991
|
return {};
|
|
1499
1992
|
}
|
|
@@ -1503,21 +1996,25 @@ async function readTestSuiteMetadata(testFilePath) {
|
|
|
1503
1996
|
}
|
|
1504
1997
|
}
|
|
1505
1998
|
async function loadEvalCases(evalFilePath, repoRoot, options) {
|
|
1999
|
+
const format = detectFormat(evalFilePath);
|
|
2000
|
+
if (format === "jsonl") {
|
|
2001
|
+
return loadEvalCasesFromJsonl(evalFilePath, repoRoot, options);
|
|
2002
|
+
}
|
|
1506
2003
|
const verbose = options?.verbose ?? false;
|
|
1507
|
-
const
|
|
1508
|
-
const absoluteTestPath =
|
|
2004
|
+
const filterPattern = options?.filter;
|
|
2005
|
+
const absoluteTestPath = import_node_path7.default.resolve(evalFilePath);
|
|
1509
2006
|
const repoRootPath = resolveToAbsolutePath(repoRoot);
|
|
1510
2007
|
const searchRoots = buildSearchRoots(absoluteTestPath, repoRootPath);
|
|
1511
2008
|
const config = await loadConfig(absoluteTestPath, repoRootPath);
|
|
1512
2009
|
const guidelinePatterns = config?.guideline_patterns;
|
|
1513
|
-
const rawFile = await (0,
|
|
1514
|
-
const parsed = (0,
|
|
2010
|
+
const rawFile = await (0, import_promises7.readFile)(absoluteTestPath, "utf8");
|
|
2011
|
+
const parsed = (0, import_yaml3.parse)(rawFile);
|
|
1515
2012
|
if (!isJsonObject(parsed)) {
|
|
1516
2013
|
throw new Error(`Invalid test file format: ${evalFilePath}`);
|
|
1517
2014
|
}
|
|
1518
2015
|
const suite = parsed;
|
|
1519
|
-
const datasetNameFromSuite =
|
|
1520
|
-
const fallbackDataset =
|
|
2016
|
+
const datasetNameFromSuite = asString6(suite.dataset)?.trim();
|
|
2017
|
+
const fallbackDataset = import_node_path7.default.basename(absoluteTestPath).replace(/\.ya?ml$/i, "") || "eval";
|
|
1521
2018
|
const datasetName = datasetNameFromSuite && datasetNameFromSuite.length > 0 ? datasetNameFromSuite : fallbackDataset;
|
|
1522
2019
|
const rawTestcases = suite.evalcases;
|
|
1523
2020
|
if (!Array.isArray(rawTestcases)) {
|
|
@@ -1525,37 +2022,29 @@ async function loadEvalCases(evalFilePath, repoRoot, options) {
|
|
|
1525
2022
|
}
|
|
1526
2023
|
const globalEvaluator = coerceEvaluator(suite.evaluator, "global") ?? "llm_judge";
|
|
1527
2024
|
const globalExecution = isJsonObject(suite.execution) ? suite.execution : void 0;
|
|
1528
|
-
const _globalTarget =
|
|
2025
|
+
const _globalTarget = asString6(globalExecution?.target) ?? asString6(suite.target);
|
|
1529
2026
|
const results = [];
|
|
1530
2027
|
for (const rawEvalcase of rawTestcases) {
|
|
1531
2028
|
if (!isJsonObject(rawEvalcase)) {
|
|
1532
|
-
|
|
2029
|
+
logWarning6("Skipping invalid eval case entry (expected object)");
|
|
1533
2030
|
continue;
|
|
1534
2031
|
}
|
|
1535
2032
|
const evalcase = rawEvalcase;
|
|
1536
|
-
const id =
|
|
1537
|
-
if (
|
|
2033
|
+
const id = asString6(evalcase.id);
|
|
2034
|
+
if (filterPattern && (!id || !import_micromatch3.default.isMatch(id, filterPattern))) {
|
|
1538
2035
|
continue;
|
|
1539
2036
|
}
|
|
1540
|
-
const conversationId =
|
|
1541
|
-
const outcome =
|
|
1542
|
-
const
|
|
1543
|
-
const
|
|
1544
|
-
if (!id || !outcome || !
|
|
1545
|
-
|
|
1546
|
-
`Skipping incomplete eval case: ${id ?? "unknown"}. Missing required fields: id, outcome, and/or input_messages`
|
|
2037
|
+
const conversationId = asString6(evalcase.conversation_id);
|
|
2038
|
+
const outcome = asString6(evalcase.expected_outcome) ?? asString6(evalcase.outcome);
|
|
2039
|
+
const inputMessages = resolveInputMessages(evalcase);
|
|
2040
|
+
const expectedMessages = resolveExpectedMessages(evalcase) ?? [];
|
|
2041
|
+
if (!id || !outcome || !inputMessages || inputMessages.length === 0) {
|
|
2042
|
+
logError2(
|
|
2043
|
+
`Skipping incomplete eval case: ${id ?? "unknown"}. Missing required fields: id, outcome, and/or input_messages (or input)`
|
|
1547
2044
|
);
|
|
1548
2045
|
continue;
|
|
1549
2046
|
}
|
|
1550
|
-
const hasExpectedMessages =
|
|
1551
|
-
const inputMessages = inputMessagesValue.filter(
|
|
1552
|
-
(msg) => isTestMessage(msg)
|
|
1553
|
-
);
|
|
1554
|
-
const expectedMessages = hasExpectedMessages ? expectedMessagesValue.filter((msg) => isTestMessage(msg)) : [];
|
|
1555
|
-
if (hasExpectedMessages && expectedMessages.length === 0) {
|
|
1556
|
-
logError(`No valid expected message found for eval case: ${id}`);
|
|
1557
|
-
continue;
|
|
1558
|
-
}
|
|
2047
|
+
const hasExpectedMessages = expectedMessages.length > 0;
|
|
1559
2048
|
const guidelinePaths = [];
|
|
1560
2049
|
const inputTextParts = [];
|
|
1561
2050
|
const inputSegments = await processMessages({
|
|
@@ -1594,33 +2083,13 @@ async function loadEvalCases(evalFilePath, repoRoot, options) {
|
|
|
1594
2083
|
evaluators = await parseEvaluators(evalcase, globalExecution, searchRoots, id ?? "unknown");
|
|
1595
2084
|
} catch (error) {
|
|
1596
2085
|
const message = error instanceof Error ? error.message : String(error);
|
|
1597
|
-
|
|
2086
|
+
logError2(`Skipping eval case '${id}': ${message}`);
|
|
1598
2087
|
continue;
|
|
1599
2088
|
}
|
|
1600
2089
|
const inlineRubrics = evalcase.rubrics;
|
|
1601
2090
|
if (inlineRubrics !== void 0 && Array.isArray(inlineRubrics)) {
|
|
1602
|
-
const
|
|
1603
|
-
|
|
1604
|
-
return {
|
|
1605
|
-
id: `rubric-${index + 1}`,
|
|
1606
|
-
description: rubric,
|
|
1607
|
-
weight: 1,
|
|
1608
|
-
required: true
|
|
1609
|
-
};
|
|
1610
|
-
}
|
|
1611
|
-
return {
|
|
1612
|
-
id: asString5(rubric.id) ?? `rubric-${index + 1}`,
|
|
1613
|
-
description: asString5(rubric.description) ?? "",
|
|
1614
|
-
weight: typeof rubric.weight === "number" ? rubric.weight : 1,
|
|
1615
|
-
required: typeof rubric.required === "boolean" ? rubric.required : true
|
|
1616
|
-
};
|
|
1617
|
-
}).filter((r) => r.description.length > 0);
|
|
1618
|
-
if (rubricItems.length > 0) {
|
|
1619
|
-
const rubricEvaluator = {
|
|
1620
|
-
name: "rubric",
|
|
1621
|
-
type: "llm_judge",
|
|
1622
|
-
rubrics: rubricItems
|
|
1623
|
-
};
|
|
2091
|
+
const rubricEvaluator = parseInlineRubrics(inlineRubrics);
|
|
2092
|
+
if (rubricEvaluator) {
|
|
1624
2093
|
evaluators = evaluators ? [rubricEvaluator, ...evaluators] : [rubricEvaluator];
|
|
1625
2094
|
}
|
|
1626
2095
|
}
|
|
@@ -1631,7 +2100,7 @@ async function loadEvalCases(evalFilePath, repoRoot, options) {
|
|
|
1631
2100
|
}
|
|
1632
2101
|
}
|
|
1633
2102
|
const allFilePaths = [
|
|
1634
|
-
...guidelinePaths.map((guidelinePath) =>
|
|
2103
|
+
...guidelinePaths.map((guidelinePath) => import_node_path7.default.resolve(guidelinePath)),
|
|
1635
2104
|
...userFilePaths
|
|
1636
2105
|
];
|
|
1637
2106
|
const testCase = {
|
|
@@ -1643,7 +2112,7 @@ async function loadEvalCases(evalFilePath, repoRoot, options) {
|
|
|
1643
2112
|
input_segments: inputSegments,
|
|
1644
2113
|
expected_messages: outputSegments,
|
|
1645
2114
|
reference_answer: referenceAnswer,
|
|
1646
|
-
guideline_paths: guidelinePaths.map((guidelinePath) =>
|
|
2115
|
+
guideline_paths: guidelinePaths.map((guidelinePath) => import_node_path7.default.resolve(guidelinePath)),
|
|
1647
2116
|
guideline_patterns: guidelinePatterns,
|
|
1648
2117
|
file_paths: allFilePaths,
|
|
1649
2118
|
expected_outcome: outcome,
|
|
@@ -1666,35 +2135,35 @@ async function loadEvalCases(evalFilePath, repoRoot, options) {
|
|
|
1666
2135
|
}
|
|
1667
2136
|
return results;
|
|
1668
2137
|
}
|
|
1669
|
-
function
|
|
2138
|
+
function asString6(value) {
|
|
1670
2139
|
return typeof value === "string" ? value : void 0;
|
|
1671
2140
|
}
|
|
1672
|
-
function
|
|
2141
|
+
function logWarning6(message, details) {
|
|
1673
2142
|
if (details && details.length > 0) {
|
|
1674
2143
|
const detailBlock = details.join("\n");
|
|
1675
|
-
console.warn(`${
|
|
1676
|
-
${detailBlock}${
|
|
2144
|
+
console.warn(`${ANSI_YELLOW7}Warning: ${message}
|
|
2145
|
+
${detailBlock}${ANSI_RESET7}`);
|
|
1677
2146
|
} else {
|
|
1678
|
-
console.warn(`${
|
|
2147
|
+
console.warn(`${ANSI_YELLOW7}Warning: ${message}${ANSI_RESET7}`);
|
|
1679
2148
|
}
|
|
1680
2149
|
}
|
|
1681
|
-
function
|
|
2150
|
+
function logError2(message, details) {
|
|
1682
2151
|
if (details && details.length > 0) {
|
|
1683
2152
|
const detailBlock = details.join("\n");
|
|
1684
|
-
console.error(`${
|
|
1685
|
-
${detailBlock}${
|
|
2153
|
+
console.error(`${ANSI_RED2}Error: ${message}
|
|
2154
|
+
${detailBlock}${ANSI_RESET7}`);
|
|
1686
2155
|
} else {
|
|
1687
|
-
console.error(`${
|
|
2156
|
+
console.error(`${ANSI_RED2}Error: ${message}${ANSI_RESET7}`);
|
|
1688
2157
|
}
|
|
1689
2158
|
}
|
|
1690
2159
|
|
|
1691
2160
|
// src/evaluation/file-utils.ts
|
|
1692
2161
|
var import_node_fs2 = require("fs");
|
|
1693
|
-
var
|
|
1694
|
-
var
|
|
2162
|
+
var import_promises8 = require("fs/promises");
|
|
2163
|
+
var import_node_path8 = __toESM(require("path"), 1);
|
|
1695
2164
|
async function fileExists2(filePath) {
|
|
1696
2165
|
try {
|
|
1697
|
-
await (0,
|
|
2166
|
+
await (0, import_promises8.access)(filePath, import_node_fs2.constants.F_OK);
|
|
1698
2167
|
return true;
|
|
1699
2168
|
} catch {
|
|
1700
2169
|
return false;
|
|
@@ -1704,22 +2173,22 @@ function normalizeLineEndings(content) {
|
|
|
1704
2173
|
return content.replace(/\r\n/g, "\n");
|
|
1705
2174
|
}
|
|
1706
2175
|
async function readTextFile(filePath) {
|
|
1707
|
-
const content = await (0,
|
|
2176
|
+
const content = await (0, import_promises8.readFile)(filePath, "utf8");
|
|
1708
2177
|
return normalizeLineEndings(content);
|
|
1709
2178
|
}
|
|
1710
2179
|
async function readJsonFile(filePath) {
|
|
1711
|
-
const content = await (0,
|
|
2180
|
+
const content = await (0, import_promises8.readFile)(filePath, "utf8");
|
|
1712
2181
|
return JSON.parse(content);
|
|
1713
2182
|
}
|
|
1714
2183
|
async function findGitRoot(startPath) {
|
|
1715
|
-
let currentDir =
|
|
1716
|
-
const root =
|
|
2184
|
+
let currentDir = import_node_path8.default.dirname(import_node_path8.default.resolve(startPath));
|
|
2185
|
+
const root = import_node_path8.default.parse(currentDir).root;
|
|
1717
2186
|
while (currentDir !== root) {
|
|
1718
|
-
const gitPath =
|
|
2187
|
+
const gitPath = import_node_path8.default.join(currentDir, ".git");
|
|
1719
2188
|
if (await fileExists2(gitPath)) {
|
|
1720
2189
|
return currentDir;
|
|
1721
2190
|
}
|
|
1722
|
-
const parentDir =
|
|
2191
|
+
const parentDir = import_node_path8.default.dirname(currentDir);
|
|
1723
2192
|
if (parentDir === currentDir) {
|
|
1724
2193
|
break;
|
|
1725
2194
|
}
|
|
@@ -1730,8 +2199,8 @@ async function findGitRoot(startPath) {
|
|
|
1730
2199
|
function buildDirectoryChain2(filePath, repoRoot) {
|
|
1731
2200
|
const directories = [];
|
|
1732
2201
|
const seen = /* @__PURE__ */ new Set();
|
|
1733
|
-
const boundary =
|
|
1734
|
-
let current =
|
|
2202
|
+
const boundary = import_node_path8.default.resolve(repoRoot);
|
|
2203
|
+
let current = import_node_path8.default.resolve(import_node_path8.default.dirname(filePath));
|
|
1735
2204
|
while (current !== void 0) {
|
|
1736
2205
|
if (!seen.has(current)) {
|
|
1737
2206
|
directories.push(current);
|
|
@@ -1740,7 +2209,7 @@ function buildDirectoryChain2(filePath, repoRoot) {
|
|
|
1740
2209
|
if (current === boundary) {
|
|
1741
2210
|
break;
|
|
1742
2211
|
}
|
|
1743
|
-
const parent =
|
|
2212
|
+
const parent = import_node_path8.default.dirname(current);
|
|
1744
2213
|
if (parent === current) {
|
|
1745
2214
|
break;
|
|
1746
2215
|
}
|
|
@@ -1754,16 +2223,16 @@ function buildDirectoryChain2(filePath, repoRoot) {
|
|
|
1754
2223
|
function buildSearchRoots2(evalPath, repoRoot) {
|
|
1755
2224
|
const uniqueRoots = [];
|
|
1756
2225
|
const addRoot = (root) => {
|
|
1757
|
-
const normalized =
|
|
2226
|
+
const normalized = import_node_path8.default.resolve(root);
|
|
1758
2227
|
if (!uniqueRoots.includes(normalized)) {
|
|
1759
2228
|
uniqueRoots.push(normalized);
|
|
1760
2229
|
}
|
|
1761
2230
|
};
|
|
1762
|
-
let currentDir =
|
|
2231
|
+
let currentDir = import_node_path8.default.dirname(evalPath);
|
|
1763
2232
|
let reachedBoundary = false;
|
|
1764
2233
|
while (!reachedBoundary) {
|
|
1765
2234
|
addRoot(currentDir);
|
|
1766
|
-
const parentDir =
|
|
2235
|
+
const parentDir = import_node_path8.default.dirname(currentDir);
|
|
1767
2236
|
if (currentDir === repoRoot || parentDir === currentDir) {
|
|
1768
2237
|
reachedBoundary = true;
|
|
1769
2238
|
} else {
|
|
@@ -1781,16 +2250,16 @@ function trimLeadingSeparators2(value) {
|
|
|
1781
2250
|
async function resolveFileReference2(rawValue, searchRoots) {
|
|
1782
2251
|
const displayPath = trimLeadingSeparators2(rawValue);
|
|
1783
2252
|
const potentialPaths = [];
|
|
1784
|
-
if (
|
|
1785
|
-
potentialPaths.push(
|
|
2253
|
+
if (import_node_path8.default.isAbsolute(rawValue)) {
|
|
2254
|
+
potentialPaths.push(import_node_path8.default.normalize(rawValue));
|
|
1786
2255
|
}
|
|
1787
2256
|
for (const base of searchRoots) {
|
|
1788
|
-
potentialPaths.push(
|
|
2257
|
+
potentialPaths.push(import_node_path8.default.resolve(base, displayPath));
|
|
1789
2258
|
}
|
|
1790
2259
|
const attempted = [];
|
|
1791
2260
|
const seen = /* @__PURE__ */ new Set();
|
|
1792
2261
|
for (const candidate of potentialPaths) {
|
|
1793
|
-
const absoluteCandidate =
|
|
2262
|
+
const absoluteCandidate = import_node_path8.default.resolve(candidate);
|
|
1794
2263
|
if (seen.has(absoluteCandidate)) {
|
|
1795
2264
|
continue;
|
|
1796
2265
|
}
|
|
@@ -2140,9 +2609,9 @@ async function withRetry(fn, retryConfig, signal) {
|
|
|
2140
2609
|
var import_node_child_process = require("child_process");
|
|
2141
2610
|
var import_node_crypto = require("crypto");
|
|
2142
2611
|
var import_node_fs3 = require("fs");
|
|
2143
|
-
var
|
|
2612
|
+
var import_promises9 = require("fs/promises");
|
|
2144
2613
|
var import_node_os = require("os");
|
|
2145
|
-
var
|
|
2614
|
+
var import_node_path10 = __toESM(require("path"), 1);
|
|
2146
2615
|
|
|
2147
2616
|
// src/evaluation/providers/claude-code-log-tracker.ts
|
|
2148
2617
|
var GLOBAL_LOGS_KEY = Symbol.for("agentv.claudeCodeLogs");
|
|
@@ -2198,7 +2667,7 @@ function subscribeToClaudeCodeLogEntries(listener) {
|
|
|
2198
2667
|
}
|
|
2199
2668
|
|
|
2200
2669
|
// src/evaluation/providers/preread.ts
|
|
2201
|
-
var
|
|
2670
|
+
var import_node_path9 = __toESM(require("path"), 1);
|
|
2202
2671
|
function buildPromptDocument(request, inputFiles, options) {
|
|
2203
2672
|
const parts = [];
|
|
2204
2673
|
const guidelineFiles = collectGuidelineFiles(
|
|
@@ -2221,7 +2690,7 @@ function normalizeInputFiles(inputFiles) {
|
|
|
2221
2690
|
}
|
|
2222
2691
|
const deduped = /* @__PURE__ */ new Map();
|
|
2223
2692
|
for (const inputFile of inputFiles) {
|
|
2224
|
-
const absolutePath =
|
|
2693
|
+
const absolutePath = import_node_path9.default.resolve(inputFile);
|
|
2225
2694
|
if (!deduped.has(absolutePath)) {
|
|
2226
2695
|
deduped.set(absolutePath, absolutePath);
|
|
2227
2696
|
}
|
|
@@ -2234,14 +2703,14 @@ function collectGuidelineFiles(inputFiles, guidelinePatterns, overrides) {
|
|
|
2234
2703
|
}
|
|
2235
2704
|
const unique = /* @__PURE__ */ new Map();
|
|
2236
2705
|
for (const inputFile of inputFiles) {
|
|
2237
|
-
const absolutePath =
|
|
2706
|
+
const absolutePath = import_node_path9.default.resolve(inputFile);
|
|
2238
2707
|
if (overrides?.has(absolutePath)) {
|
|
2239
2708
|
if (!unique.has(absolutePath)) {
|
|
2240
2709
|
unique.set(absolutePath, absolutePath);
|
|
2241
2710
|
}
|
|
2242
2711
|
continue;
|
|
2243
2712
|
}
|
|
2244
|
-
const normalized = absolutePath.split(
|
|
2713
|
+
const normalized = absolutePath.split(import_node_path9.default.sep).join("/");
|
|
2245
2714
|
if (isGuidelineFile(normalized, guidelinePatterns)) {
|
|
2246
2715
|
if (!unique.has(absolutePath)) {
|
|
2247
2716
|
unique.set(absolutePath, absolutePath);
|
|
@@ -2256,7 +2725,7 @@ function collectInputFiles(inputFiles) {
|
|
|
2256
2725
|
}
|
|
2257
2726
|
const unique = /* @__PURE__ */ new Map();
|
|
2258
2727
|
for (const inputFile of inputFiles) {
|
|
2259
|
-
const absolutePath =
|
|
2728
|
+
const absolutePath = import_node_path9.default.resolve(inputFile);
|
|
2260
2729
|
if (!unique.has(absolutePath)) {
|
|
2261
2730
|
unique.set(absolutePath, absolutePath);
|
|
2262
2731
|
}
|
|
@@ -2268,7 +2737,7 @@ function buildMandatoryPrereadBlock(guidelineFiles, inputFiles) {
|
|
|
2268
2737
|
return "";
|
|
2269
2738
|
}
|
|
2270
2739
|
const buildList = (files) => files.map((absolutePath) => {
|
|
2271
|
-
const fileName =
|
|
2740
|
+
const fileName = import_node_path9.default.basename(absolutePath);
|
|
2272
2741
|
const fileUri = pathToFileUri(absolutePath);
|
|
2273
2742
|
return `* [${fileName}](${fileUri})`;
|
|
2274
2743
|
});
|
|
@@ -2288,7 +2757,7 @@ ${buildList(inputFiles).join("\n")}.`);
|
|
|
2288
2757
|
return sections.join("\n");
|
|
2289
2758
|
}
|
|
2290
2759
|
function pathToFileUri(filePath) {
|
|
2291
|
-
const absolutePath =
|
|
2760
|
+
const absolutePath = import_node_path9.default.isAbsolute(filePath) ? filePath : import_node_path9.default.resolve(filePath);
|
|
2292
2761
|
const normalizedPath = absolutePath.replace(/\\/g, "/");
|
|
2293
2762
|
if (/^[a-zA-Z]:\//.test(normalizedPath)) {
|
|
2294
2763
|
return `file:///${normalizedPath}`;
|
|
@@ -2325,8 +2794,8 @@ var ClaudeCodeProvider = class {
|
|
|
2325
2794
|
const workspaceRoot = await this.createWorkspace();
|
|
2326
2795
|
const logger = await this.createStreamLogger(request).catch(() => void 0);
|
|
2327
2796
|
try {
|
|
2328
|
-
const promptFile =
|
|
2329
|
-
await (0,
|
|
2797
|
+
const promptFile = import_node_path10.default.join(workspaceRoot, PROMPT_FILENAME);
|
|
2798
|
+
await (0, import_promises9.writeFile)(promptFile, request.question, "utf8");
|
|
2330
2799
|
const args = this.buildClaudeCodeArgs(request.question, inputFiles);
|
|
2331
2800
|
const cwd = this.resolveCwd();
|
|
2332
2801
|
const result = await this.executeClaudeCode(args, cwd, request.signal, logger);
|
|
@@ -2373,7 +2842,7 @@ var ClaudeCodeProvider = class {
|
|
|
2373
2842
|
if (!this.config.cwd) {
|
|
2374
2843
|
return process.cwd();
|
|
2375
2844
|
}
|
|
2376
|
-
return
|
|
2845
|
+
return import_node_path10.default.resolve(this.config.cwd);
|
|
2377
2846
|
}
|
|
2378
2847
|
buildClaudeCodeArgs(prompt, inputFiles) {
|
|
2379
2848
|
const args = [];
|
|
@@ -2430,11 +2899,11 @@ ${filesContext}`;
|
|
|
2430
2899
|
}
|
|
2431
2900
|
}
|
|
2432
2901
|
async createWorkspace() {
|
|
2433
|
-
return await (0,
|
|
2902
|
+
return await (0, import_promises9.mkdtemp)(import_node_path10.default.join((0, import_node_os.tmpdir)(), WORKSPACE_PREFIX));
|
|
2434
2903
|
}
|
|
2435
2904
|
async cleanupWorkspace(workspaceRoot) {
|
|
2436
2905
|
try {
|
|
2437
|
-
await (0,
|
|
2906
|
+
await (0, import_promises9.rm)(workspaceRoot, { recursive: true, force: true });
|
|
2438
2907
|
} catch {
|
|
2439
2908
|
}
|
|
2440
2909
|
}
|
|
@@ -2444,9 +2913,9 @@ ${filesContext}`;
|
|
|
2444
2913
|
return void 0;
|
|
2445
2914
|
}
|
|
2446
2915
|
if (this.config.logDir) {
|
|
2447
|
-
return
|
|
2916
|
+
return import_node_path10.default.resolve(this.config.logDir);
|
|
2448
2917
|
}
|
|
2449
|
-
return
|
|
2918
|
+
return import_node_path10.default.join(process.cwd(), ".agentv", "logs", "claude-code");
|
|
2450
2919
|
}
|
|
2451
2920
|
async createStreamLogger(request) {
|
|
2452
2921
|
const logDir = this.resolveLogDirectory();
|
|
@@ -2454,13 +2923,13 @@ ${filesContext}`;
|
|
|
2454
2923
|
return void 0;
|
|
2455
2924
|
}
|
|
2456
2925
|
try {
|
|
2457
|
-
await (0,
|
|
2926
|
+
await (0, import_promises9.mkdir)(logDir, { recursive: true });
|
|
2458
2927
|
} catch (error) {
|
|
2459
2928
|
const message = error instanceof Error ? error.message : String(error);
|
|
2460
2929
|
console.warn(`Skipping Claude Code stream logging (could not create ${logDir}): ${message}`);
|
|
2461
2930
|
return void 0;
|
|
2462
2931
|
}
|
|
2463
|
-
const filePath =
|
|
2932
|
+
const filePath = import_node_path10.default.join(logDir, buildLogFilename(request, this.targetName));
|
|
2464
2933
|
try {
|
|
2465
2934
|
const logger = await ClaudeCodeStreamLogger.create({
|
|
2466
2935
|
filePath,
|
|
@@ -2865,16 +3334,16 @@ function escapeShellArg(arg) {
|
|
|
2865
3334
|
}
|
|
2866
3335
|
async function defaultClaudeCodeRunner(options) {
|
|
2867
3336
|
const tempId = (0, import_node_crypto.randomUUID)();
|
|
2868
|
-
const stdoutFile =
|
|
2869
|
-
const stderrFile =
|
|
2870
|
-
const exitFile =
|
|
2871
|
-
const pidFile =
|
|
3337
|
+
const stdoutFile = import_node_path10.default.join((0, import_node_os.tmpdir)(), `agentv-cc-${tempId}-stdout`);
|
|
3338
|
+
const stderrFile = import_node_path10.default.join((0, import_node_os.tmpdir)(), `agentv-cc-${tempId}-stderr`);
|
|
3339
|
+
const exitFile = import_node_path10.default.join((0, import_node_os.tmpdir)(), `agentv-cc-${tempId}-exit`);
|
|
3340
|
+
const pidFile = import_node_path10.default.join((0, import_node_os.tmpdir)(), `agentv-cc-${tempId}-pid`);
|
|
2872
3341
|
try {
|
|
2873
3342
|
return await runClaudeCodeWithTempFiles(options, stdoutFile, stderrFile, exitFile, pidFile);
|
|
2874
3343
|
} finally {
|
|
2875
3344
|
for (const file of [stdoutFile, stderrFile, exitFile, pidFile]) {
|
|
2876
3345
|
try {
|
|
2877
|
-
await (0,
|
|
3346
|
+
await (0, import_promises9.rm)(file, { force: true });
|
|
2878
3347
|
} catch {
|
|
2879
3348
|
}
|
|
2880
3349
|
}
|
|
@@ -2908,8 +3377,8 @@ async function runClaudeCodeWithTempFiles(options, stdoutFile, stderrFile, exitF
|
|
|
2908
3377
|
let lastStdoutSize = 0;
|
|
2909
3378
|
const readFileIfExists = async (filePath) => {
|
|
2910
3379
|
try {
|
|
2911
|
-
const { readFile:
|
|
2912
|
-
return await
|
|
3380
|
+
const { readFile: readFile9 } = await import("fs/promises");
|
|
3381
|
+
return await readFile9(filePath, "utf8");
|
|
2913
3382
|
} catch {
|
|
2914
3383
|
return "";
|
|
2915
3384
|
}
|
|
@@ -2982,9 +3451,9 @@ async function runClaudeCodeWithTempFiles(options, stdoutFile, stderrFile, exitF
|
|
|
2982
3451
|
|
|
2983
3452
|
// src/evaluation/providers/cli.ts
|
|
2984
3453
|
var import_node_child_process2 = require("child_process");
|
|
2985
|
-
var
|
|
3454
|
+
var import_promises10 = __toESM(require("fs/promises"), 1);
|
|
2986
3455
|
var import_node_os2 = __toESM(require("os"), 1);
|
|
2987
|
-
var
|
|
3456
|
+
var import_node_path11 = __toESM(require("path"), 1);
|
|
2988
3457
|
var import_node_util = require("util");
|
|
2989
3458
|
var import_zod = require("zod");
|
|
2990
3459
|
var ToolCallSchema = import_zod.z.object({
|
|
@@ -2992,7 +3461,8 @@ var ToolCallSchema = import_zod.z.object({
|
|
|
2992
3461
|
input: import_zod.z.unknown().optional(),
|
|
2993
3462
|
output: import_zod.z.unknown().optional(),
|
|
2994
3463
|
id: import_zod.z.string().optional(),
|
|
2995
|
-
timestamp: import_zod.z.string().optional()
|
|
3464
|
+
timestamp: import_zod.z.string().optional(),
|
|
3465
|
+
duration_ms: import_zod.z.number().optional()
|
|
2996
3466
|
});
|
|
2997
3467
|
var OutputMessageInputSchema = import_zod.z.object({
|
|
2998
3468
|
role: import_zod.z.string(),
|
|
@@ -3000,6 +3470,7 @@ var OutputMessageInputSchema = import_zod.z.object({
|
|
|
3000
3470
|
content: import_zod.z.unknown().optional(),
|
|
3001
3471
|
tool_calls: import_zod.z.array(ToolCallSchema).optional(),
|
|
3002
3472
|
timestamp: import_zod.z.string().optional(),
|
|
3473
|
+
duration_ms: import_zod.z.number().optional(),
|
|
3003
3474
|
metadata: import_zod.z.record(import_zod.z.unknown()).optional()
|
|
3004
3475
|
});
|
|
3005
3476
|
var TokenUsageSchema = import_zod.z.object({
|
|
@@ -3038,8 +3509,16 @@ function convertOutputMessages(messages) {
|
|
|
3038
3509
|
role: msg.role,
|
|
3039
3510
|
name: msg.name,
|
|
3040
3511
|
content: msg.content,
|
|
3041
|
-
toolCalls: msg.tool_calls
|
|
3512
|
+
toolCalls: msg.tool_calls?.map((tc) => ({
|
|
3513
|
+
tool: tc.tool,
|
|
3514
|
+
input: tc.input,
|
|
3515
|
+
output: tc.output,
|
|
3516
|
+
id: tc.id,
|
|
3517
|
+
timestamp: tc.timestamp,
|
|
3518
|
+
durationMs: tc.duration_ms
|
|
3519
|
+
})),
|
|
3042
3520
|
timestamp: msg.timestamp,
|
|
3521
|
+
durationMs: msg.duration_ms,
|
|
3043
3522
|
metadata: msg.metadata
|
|
3044
3523
|
}));
|
|
3045
3524
|
}
|
|
@@ -3353,7 +3832,7 @@ var CliProvider = class {
|
|
|
3353
3832
|
throw new Error(`Failed to read output file '${filePath}': ${errorMsg}`);
|
|
3354
3833
|
} finally {
|
|
3355
3834
|
if (!this.keepTempFiles) {
|
|
3356
|
-
await
|
|
3835
|
+
await import_promises10.default.unlink(filePath).catch(() => {
|
|
3357
3836
|
});
|
|
3358
3837
|
}
|
|
3359
3838
|
}
|
|
@@ -3441,7 +3920,7 @@ function normalizeInputFiles2(inputFiles) {
|
|
|
3441
3920
|
}
|
|
3442
3921
|
const unique = /* @__PURE__ */ new Map();
|
|
3443
3922
|
for (const inputFile of inputFiles) {
|
|
3444
|
-
const absolutePath =
|
|
3923
|
+
const absolutePath = import_node_path11.default.resolve(inputFile);
|
|
3445
3924
|
if (!unique.has(absolutePath)) {
|
|
3446
3925
|
unique.set(absolutePath, absolutePath);
|
|
3447
3926
|
}
|
|
@@ -3455,7 +3934,7 @@ function formatFileList(files, template) {
|
|
|
3455
3934
|
const formatter = template ?? "{path}";
|
|
3456
3935
|
return files.map((filePath) => {
|
|
3457
3936
|
const escapedPath = shellEscape(filePath);
|
|
3458
|
-
const escapedName = shellEscape(
|
|
3937
|
+
const escapedName = shellEscape(import_node_path11.default.basename(filePath));
|
|
3459
3938
|
return formatter.replaceAll("{path}", escapedPath).replaceAll("{basename}", escapedName);
|
|
3460
3939
|
}).join(" ");
|
|
3461
3940
|
}
|
|
@@ -3479,7 +3958,7 @@ function generateOutputFilePath(evalCaseId, extension = ".json") {
|
|
|
3479
3958
|
const safeEvalId = evalCaseId || "unknown";
|
|
3480
3959
|
const timestamp = Date.now();
|
|
3481
3960
|
const random = Math.random().toString(36).substring(2, 9);
|
|
3482
|
-
return
|
|
3961
|
+
return import_node_path11.default.join(import_node_os2.default.tmpdir(), `agentv-${safeEvalId}-${timestamp}-${random}${extension}`);
|
|
3483
3962
|
}
|
|
3484
3963
|
function formatTimeoutSuffix2(timeoutMs) {
|
|
3485
3964
|
if (!timeoutMs || timeoutMs <= 0) {
|
|
@@ -3493,9 +3972,9 @@ function formatTimeoutSuffix2(timeoutMs) {
|
|
|
3493
3972
|
var import_node_child_process3 = require("child_process");
|
|
3494
3973
|
var import_node_crypto2 = require("crypto");
|
|
3495
3974
|
var import_node_fs4 = require("fs");
|
|
3496
|
-
var
|
|
3975
|
+
var import_promises11 = require("fs/promises");
|
|
3497
3976
|
var import_node_os3 = require("os");
|
|
3498
|
-
var
|
|
3977
|
+
var import_node_path12 = __toESM(require("path"), 1);
|
|
3499
3978
|
var import_node_util2 = require("util");
|
|
3500
3979
|
|
|
3501
3980
|
// src/evaluation/providers/codex-log-tracker.ts
|
|
@@ -3590,8 +4069,8 @@ var CodexProvider = class {
|
|
|
3590
4069
|
const promptContent = `${systemPrompt}
|
|
3591
4070
|
|
|
3592
4071
|
${basePrompt}`;
|
|
3593
|
-
const promptFile =
|
|
3594
|
-
await (0,
|
|
4072
|
+
const promptFile = import_node_path12.default.join(workspaceRoot, PROMPT_FILENAME2);
|
|
4073
|
+
await (0, import_promises11.writeFile)(promptFile, promptContent, "utf8");
|
|
3595
4074
|
const args = this.buildCodexArgs();
|
|
3596
4075
|
const cwd = this.resolveCwd(workspaceRoot);
|
|
3597
4076
|
const result = await this.executeCodex(args, cwd, promptContent, request.signal, logger);
|
|
@@ -3640,7 +4119,7 @@ ${basePrompt}`;
|
|
|
3640
4119
|
if (!this.config.cwd) {
|
|
3641
4120
|
return workspaceRoot;
|
|
3642
4121
|
}
|
|
3643
|
-
return
|
|
4122
|
+
return import_node_path12.default.resolve(this.config.cwd);
|
|
3644
4123
|
}
|
|
3645
4124
|
buildCodexArgs() {
|
|
3646
4125
|
const args = [
|
|
@@ -3682,11 +4161,11 @@ ${basePrompt}`;
|
|
|
3682
4161
|
}
|
|
3683
4162
|
}
|
|
3684
4163
|
async createWorkspace() {
|
|
3685
|
-
return await (0,
|
|
4164
|
+
return await (0, import_promises11.mkdtemp)(import_node_path12.default.join((0, import_node_os3.tmpdir)(), WORKSPACE_PREFIX2));
|
|
3686
4165
|
}
|
|
3687
4166
|
async cleanupWorkspace(workspaceRoot) {
|
|
3688
4167
|
try {
|
|
3689
|
-
await (0,
|
|
4168
|
+
await (0, import_promises11.rm)(workspaceRoot, { recursive: true, force: true });
|
|
3690
4169
|
} catch {
|
|
3691
4170
|
}
|
|
3692
4171
|
}
|
|
@@ -3696,9 +4175,9 @@ ${basePrompt}`;
|
|
|
3696
4175
|
return void 0;
|
|
3697
4176
|
}
|
|
3698
4177
|
if (this.config.logDir) {
|
|
3699
|
-
return
|
|
4178
|
+
return import_node_path12.default.resolve(this.config.logDir);
|
|
3700
4179
|
}
|
|
3701
|
-
return
|
|
4180
|
+
return import_node_path12.default.join(process.cwd(), ".agentv", "logs", "codex");
|
|
3702
4181
|
}
|
|
3703
4182
|
async createStreamLogger(request) {
|
|
3704
4183
|
const logDir = this.resolveLogDirectory();
|
|
@@ -3706,13 +4185,13 @@ ${basePrompt}`;
|
|
|
3706
4185
|
return void 0;
|
|
3707
4186
|
}
|
|
3708
4187
|
try {
|
|
3709
|
-
await (0,
|
|
4188
|
+
await (0, import_promises11.mkdir)(logDir, { recursive: true });
|
|
3710
4189
|
} catch (error) {
|
|
3711
4190
|
const message = error instanceof Error ? error.message : String(error);
|
|
3712
4191
|
console.warn(`Skipping Codex stream logging (could not create ${logDir}): ${message}`);
|
|
3713
4192
|
return void 0;
|
|
3714
4193
|
}
|
|
3715
|
-
const filePath =
|
|
4194
|
+
const filePath = import_node_path12.default.join(logDir, buildLogFilename2(request, this.targetName));
|
|
3716
4195
|
try {
|
|
3717
4196
|
const logger = await CodexStreamLogger.create({
|
|
3718
4197
|
filePath,
|
|
@@ -3927,9 +4406,9 @@ function tryParseJsonValue2(rawLine) {
|
|
|
3927
4406
|
async function locateExecutable(candidate) {
|
|
3928
4407
|
const includesPathSeparator = candidate.includes("/") || candidate.includes("\\");
|
|
3929
4408
|
if (includesPathSeparator) {
|
|
3930
|
-
const resolved =
|
|
4409
|
+
const resolved = import_node_path12.default.isAbsolute(candidate) ? candidate : import_node_path12.default.resolve(candidate);
|
|
3931
4410
|
const executablePath = await ensureWindowsExecutableVariant(resolved);
|
|
3932
|
-
await (0,
|
|
4411
|
+
await (0, import_promises11.access)(executablePath, import_node_fs4.constants.F_OK);
|
|
3933
4412
|
return executablePath;
|
|
3934
4413
|
}
|
|
3935
4414
|
const locator = process.platform === "win32" ? "where" : "which";
|
|
@@ -3939,7 +4418,7 @@ async function locateExecutable(candidate) {
|
|
|
3939
4418
|
const preferred = selectExecutableCandidate(lines);
|
|
3940
4419
|
if (preferred) {
|
|
3941
4420
|
const executablePath = await ensureWindowsExecutableVariant(preferred);
|
|
3942
|
-
await (0,
|
|
4421
|
+
await (0, import_promises11.access)(executablePath, import_node_fs4.constants.F_OK);
|
|
3943
4422
|
return executablePath;
|
|
3944
4423
|
}
|
|
3945
4424
|
} catch {
|
|
@@ -3973,7 +4452,7 @@ async function ensureWindowsExecutableVariant(candidate) {
|
|
|
3973
4452
|
for (const ext of extensions) {
|
|
3974
4453
|
const withExtension = `${candidate}${ext}`;
|
|
3975
4454
|
try {
|
|
3976
|
-
await (0,
|
|
4455
|
+
await (0, import_promises11.access)(withExtension, import_node_fs4.constants.F_OK);
|
|
3977
4456
|
return withExtension;
|
|
3978
4457
|
} catch {
|
|
3979
4458
|
}
|
|
@@ -4438,9 +4917,9 @@ function extractToolCalls2(content) {
|
|
|
4438
4917
|
var import_node_child_process4 = require("child_process");
|
|
4439
4918
|
var import_node_crypto3 = require("crypto");
|
|
4440
4919
|
var import_node_fs5 = require("fs");
|
|
4441
|
-
var
|
|
4920
|
+
var import_promises12 = require("fs/promises");
|
|
4442
4921
|
var import_node_os4 = require("os");
|
|
4443
|
-
var
|
|
4922
|
+
var import_node_path13 = __toESM(require("path"), 1);
|
|
4444
4923
|
|
|
4445
4924
|
// src/evaluation/providers/pi-log-tracker.ts
|
|
4446
4925
|
var GLOBAL_LOGS_KEY3 = Symbol.for("agentv.piLogs");
|
|
@@ -4524,8 +5003,8 @@ var PiCodingAgentProvider = class {
|
|
|
4524
5003
|
const workspaceRoot = await this.createWorkspace();
|
|
4525
5004
|
const logger = await this.createStreamLogger(request).catch(() => void 0);
|
|
4526
5005
|
try {
|
|
4527
|
-
const promptFile =
|
|
4528
|
-
await (0,
|
|
5006
|
+
const promptFile = import_node_path13.default.join(workspaceRoot, PROMPT_FILENAME3);
|
|
5007
|
+
await (0, import_promises12.writeFile)(promptFile, request.question, "utf8");
|
|
4529
5008
|
const args = this.buildPiArgs(request.question, inputFiles);
|
|
4530
5009
|
const cwd = this.resolveCwd(workspaceRoot);
|
|
4531
5010
|
const result = await this.executePi(args, cwd, request.signal, logger);
|
|
@@ -4566,7 +5045,7 @@ var PiCodingAgentProvider = class {
|
|
|
4566
5045
|
if (!this.config.cwd) {
|
|
4567
5046
|
return workspaceRoot;
|
|
4568
5047
|
}
|
|
4569
|
-
return
|
|
5048
|
+
return import_node_path13.default.resolve(this.config.cwd);
|
|
4570
5049
|
}
|
|
4571
5050
|
buildPiArgs(prompt, inputFiles) {
|
|
4572
5051
|
const args = [];
|
|
@@ -4655,19 +5134,19 @@ ${prompt}`;
|
|
|
4655
5134
|
return env;
|
|
4656
5135
|
}
|
|
4657
5136
|
async createWorkspace() {
|
|
4658
|
-
return await (0,
|
|
5137
|
+
return await (0, import_promises12.mkdtemp)(import_node_path13.default.join((0, import_node_os4.tmpdir)(), WORKSPACE_PREFIX3));
|
|
4659
5138
|
}
|
|
4660
5139
|
async cleanupWorkspace(workspaceRoot) {
|
|
4661
5140
|
try {
|
|
4662
|
-
await (0,
|
|
5141
|
+
await (0, import_promises12.rm)(workspaceRoot, { recursive: true, force: true });
|
|
4663
5142
|
} catch {
|
|
4664
5143
|
}
|
|
4665
5144
|
}
|
|
4666
5145
|
resolveLogDirectory() {
|
|
4667
5146
|
if (this.config.logDir) {
|
|
4668
|
-
return
|
|
5147
|
+
return import_node_path13.default.resolve(this.config.logDir);
|
|
4669
5148
|
}
|
|
4670
|
-
return
|
|
5149
|
+
return import_node_path13.default.join(process.cwd(), ".agentv", "logs", "pi-coding-agent");
|
|
4671
5150
|
}
|
|
4672
5151
|
async createStreamLogger(request) {
|
|
4673
5152
|
const logDir = this.resolveLogDirectory();
|
|
@@ -4675,13 +5154,13 @@ ${prompt}`;
|
|
|
4675
5154
|
return void 0;
|
|
4676
5155
|
}
|
|
4677
5156
|
try {
|
|
4678
|
-
await (0,
|
|
5157
|
+
await (0, import_promises12.mkdir)(logDir, { recursive: true });
|
|
4679
5158
|
} catch (error) {
|
|
4680
5159
|
const message = error instanceof Error ? error.message : String(error);
|
|
4681
5160
|
console.warn(`Skipping Pi stream logging (could not create ${logDir}): ${message}`);
|
|
4682
5161
|
return void 0;
|
|
4683
5162
|
}
|
|
4684
|
-
const filePath =
|
|
5163
|
+
const filePath = import_node_path13.default.join(logDir, buildLogFilename3(request, this.targetName));
|
|
4685
5164
|
try {
|
|
4686
5165
|
const logger = await PiStreamLogger.create({
|
|
4687
5166
|
filePath,
|
|
@@ -5114,7 +5593,7 @@ async function defaultPiRunner(options) {
|
|
|
5114
5593
|
}
|
|
5115
5594
|
|
|
5116
5595
|
// src/evaluation/providers/targets.ts
|
|
5117
|
-
var
|
|
5596
|
+
var import_node_path14 = __toESM(require("path"), 1);
|
|
5118
5597
|
var import_zod2 = require("zod");
|
|
5119
5598
|
var CliHealthcheckHttpInputSchema = import_zod2.z.object({
|
|
5120
5599
|
type: import_zod2.z.literal("http"),
|
|
@@ -5220,11 +5699,11 @@ function normalizeCliHealthcheck(input, env, targetName, evalFilePath) {
|
|
|
5220
5699
|
allowLiteral: true,
|
|
5221
5700
|
optionalEnv: true
|
|
5222
5701
|
});
|
|
5223
|
-
if (cwd && evalFilePath && !
|
|
5224
|
-
cwd =
|
|
5702
|
+
if (cwd && evalFilePath && !import_node_path14.default.isAbsolute(cwd)) {
|
|
5703
|
+
cwd = import_node_path14.default.resolve(import_node_path14.default.dirname(import_node_path14.default.resolve(evalFilePath)), cwd);
|
|
5225
5704
|
}
|
|
5226
5705
|
if (!cwd && evalFilePath) {
|
|
5227
|
-
cwd =
|
|
5706
|
+
cwd = import_node_path14.default.dirname(import_node_path14.default.resolve(evalFilePath));
|
|
5228
5707
|
}
|
|
5229
5708
|
return {
|
|
5230
5709
|
type: "command",
|
|
@@ -5251,11 +5730,11 @@ function normalizeCliTargetInput(input, env, evalFilePath) {
|
|
|
5251
5730
|
allowLiteral: true,
|
|
5252
5731
|
optionalEnv: true
|
|
5253
5732
|
});
|
|
5254
|
-
if (cwd && evalFilePath && !
|
|
5255
|
-
cwd =
|
|
5733
|
+
if (cwd && evalFilePath && !import_node_path14.default.isAbsolute(cwd)) {
|
|
5734
|
+
cwd = import_node_path14.default.resolve(import_node_path14.default.dirname(import_node_path14.default.resolve(evalFilePath)), cwd);
|
|
5256
5735
|
}
|
|
5257
5736
|
if (!cwd && evalFilePath) {
|
|
5258
|
-
cwd =
|
|
5737
|
+
cwd = import_node_path14.default.dirname(import_node_path14.default.resolve(evalFilePath));
|
|
5259
5738
|
}
|
|
5260
5739
|
const timeoutSeconds = input.timeout_seconds ?? input.timeoutSeconds;
|
|
5261
5740
|
const timeoutMs = timeoutSeconds !== void 0 ? Math.floor(timeoutSeconds * 1e3) : void 0;
|
|
@@ -5760,8 +6239,8 @@ function resolveCliConfig(target, env, evalFilePath) {
|
|
|
5760
6239
|
const parseResult = CliTargetInputSchema.safeParse(target, { errorMap: cliErrorMap });
|
|
5761
6240
|
if (!parseResult.success) {
|
|
5762
6241
|
const firstError = parseResult.error.errors[0];
|
|
5763
|
-
const
|
|
5764
|
-
const prefix =
|
|
6242
|
+
const path18 = firstError?.path.join(".") || "";
|
|
6243
|
+
const prefix = path18 ? `${target.name} ${path18}: ` : `${target.name}: `;
|
|
5765
6244
|
throw new Error(`${prefix}${firstError?.message}`);
|
|
5766
6245
|
}
|
|
5767
6246
|
const normalized = normalizeCliTargetInput(parseResult.data, env, evalFilePath);
|
|
@@ -5949,7 +6428,7 @@ function resolveOptionalNumberArray(source, description) {
|
|
|
5949
6428
|
}
|
|
5950
6429
|
|
|
5951
6430
|
// src/evaluation/providers/vscode.ts
|
|
5952
|
-
var
|
|
6431
|
+
var import_node_path15 = __toESM(require("path"), 1);
|
|
5953
6432
|
var import_subagent = require("subagent");
|
|
5954
6433
|
|
|
5955
6434
|
// src/evaluation/providers/vscode-templates.ts
|
|
@@ -6119,7 +6598,7 @@ function buildMandatoryPrereadBlock2(guidelineFiles, attachmentFiles) {
|
|
|
6119
6598
|
return "";
|
|
6120
6599
|
}
|
|
6121
6600
|
const buildList = (files) => files.map((absolutePath) => {
|
|
6122
|
-
const fileName =
|
|
6601
|
+
const fileName = import_node_path15.default.basename(absolutePath);
|
|
6123
6602
|
const fileUri = pathToFileUri2(absolutePath);
|
|
6124
6603
|
return `* [${fileName}](${fileUri})`;
|
|
6125
6604
|
});
|
|
@@ -6144,8 +6623,8 @@ function collectGuidelineFiles2(attachments, guidelinePatterns) {
|
|
|
6144
6623
|
}
|
|
6145
6624
|
const unique = /* @__PURE__ */ new Map();
|
|
6146
6625
|
for (const attachment of attachments) {
|
|
6147
|
-
const absolutePath =
|
|
6148
|
-
const normalized = absolutePath.split(
|
|
6626
|
+
const absolutePath = import_node_path15.default.resolve(attachment);
|
|
6627
|
+
const normalized = absolutePath.split(import_node_path15.default.sep).join("/");
|
|
6149
6628
|
if (isGuidelineFile(normalized, guidelinePatterns)) {
|
|
6150
6629
|
if (!unique.has(absolutePath)) {
|
|
6151
6630
|
unique.set(absolutePath, absolutePath);
|
|
@@ -6160,7 +6639,7 @@ function collectAttachmentFiles(attachments) {
|
|
|
6160
6639
|
}
|
|
6161
6640
|
const unique = /* @__PURE__ */ new Map();
|
|
6162
6641
|
for (const attachment of attachments) {
|
|
6163
|
-
const absolutePath =
|
|
6642
|
+
const absolutePath = import_node_path15.default.resolve(attachment);
|
|
6164
6643
|
if (!unique.has(absolutePath)) {
|
|
6165
6644
|
unique.set(absolutePath, absolutePath);
|
|
6166
6645
|
}
|
|
@@ -6168,7 +6647,7 @@ function collectAttachmentFiles(attachments) {
|
|
|
6168
6647
|
return Array.from(unique.values());
|
|
6169
6648
|
}
|
|
6170
6649
|
function pathToFileUri2(filePath) {
|
|
6171
|
-
const absolutePath =
|
|
6650
|
+
const absolutePath = import_node_path15.default.isAbsolute(filePath) ? filePath : import_node_path15.default.resolve(filePath);
|
|
6172
6651
|
const normalizedPath = absolutePath.replace(/\\/g, "/");
|
|
6173
6652
|
if (/^[a-zA-Z]:\//.test(normalizedPath)) {
|
|
6174
6653
|
return `file:///${normalizedPath}`;
|
|
@@ -6181,7 +6660,7 @@ function normalizeAttachments(attachments) {
|
|
|
6181
6660
|
}
|
|
6182
6661
|
const deduped = /* @__PURE__ */ new Set();
|
|
6183
6662
|
for (const attachment of attachments) {
|
|
6184
|
-
deduped.add(
|
|
6663
|
+
deduped.add(import_node_path15.default.resolve(attachment));
|
|
6185
6664
|
}
|
|
6186
6665
|
return Array.from(deduped);
|
|
6187
6666
|
}
|
|
@@ -6190,7 +6669,7 @@ function mergeAttachments(all) {
|
|
|
6190
6669
|
for (const list of all) {
|
|
6191
6670
|
if (!list) continue;
|
|
6192
6671
|
for (const inputFile of list) {
|
|
6193
|
-
deduped.add(
|
|
6672
|
+
deduped.add(import_node_path15.default.resolve(inputFile));
|
|
6194
6673
|
}
|
|
6195
6674
|
}
|
|
6196
6675
|
return deduped.size > 0 ? Array.from(deduped) : void 0;
|
|
@@ -6238,9 +6717,9 @@ total unlocked subagents available: ${result.created.length + result.skippedExis
|
|
|
6238
6717
|
|
|
6239
6718
|
// src/evaluation/providers/targets-file.ts
|
|
6240
6719
|
var import_node_fs6 = require("fs");
|
|
6241
|
-
var
|
|
6242
|
-
var
|
|
6243
|
-
var
|
|
6720
|
+
var import_promises13 = require("fs/promises");
|
|
6721
|
+
var import_node_path16 = __toESM(require("path"), 1);
|
|
6722
|
+
var import_yaml4 = require("yaml");
|
|
6244
6723
|
function isRecord(value) {
|
|
6245
6724
|
return typeof value === "object" && value !== null && !Array.isArray(value);
|
|
6246
6725
|
}
|
|
@@ -6269,19 +6748,19 @@ function assertTargetDefinition(value, index, filePath) {
|
|
|
6269
6748
|
}
|
|
6270
6749
|
async function fileExists3(filePath) {
|
|
6271
6750
|
try {
|
|
6272
|
-
await (0,
|
|
6751
|
+
await (0, import_promises13.access)(filePath, import_node_fs6.constants.F_OK);
|
|
6273
6752
|
return true;
|
|
6274
6753
|
} catch {
|
|
6275
6754
|
return false;
|
|
6276
6755
|
}
|
|
6277
6756
|
}
|
|
6278
6757
|
async function readTargetDefinitions(filePath) {
|
|
6279
|
-
const absolutePath =
|
|
6758
|
+
const absolutePath = import_node_path16.default.resolve(filePath);
|
|
6280
6759
|
if (!await fileExists3(absolutePath)) {
|
|
6281
6760
|
throw new Error(`targets.yaml not found at ${absolutePath}`);
|
|
6282
6761
|
}
|
|
6283
|
-
const raw = await (0,
|
|
6284
|
-
const parsed = (0,
|
|
6762
|
+
const raw = await (0, import_promises13.readFile)(absolutePath, "utf8");
|
|
6763
|
+
const parsed = (0, import_yaml4.parse)(raw);
|
|
6285
6764
|
if (!isRecord(parsed)) {
|
|
6286
6765
|
throw new Error(`targets.yaml at ${absolutePath} must be a YAML object with a 'targets' field`);
|
|
6287
6766
|
}
|
|
@@ -6487,15 +6966,15 @@ async function execFileWithStdinNode(argv, stdinPayload, options) {
|
|
|
6487
6966
|
});
|
|
6488
6967
|
}
|
|
6489
6968
|
async function execShellWithStdin(command, stdinPayload, options = {}) {
|
|
6490
|
-
const { mkdir: mkdir4, readFile:
|
|
6969
|
+
const { mkdir: mkdir4, readFile: readFile9, rm: rm4, writeFile: writeFile4 } = await import("fs/promises");
|
|
6491
6970
|
const { tmpdir: tmpdir4 } = await import("os");
|
|
6492
|
-
const
|
|
6971
|
+
const path18 = await import("path");
|
|
6493
6972
|
const { randomUUID: randomUUID4 } = await import("crypto");
|
|
6494
|
-
const dir =
|
|
6973
|
+
const dir = path18.join(tmpdir4(), `agentv-exec-${randomUUID4()}`);
|
|
6495
6974
|
await mkdir4(dir, { recursive: true });
|
|
6496
|
-
const stdinPath =
|
|
6497
|
-
const stdoutPath =
|
|
6498
|
-
const stderrPath =
|
|
6975
|
+
const stdinPath = path18.join(dir, "stdin.txt");
|
|
6976
|
+
const stdoutPath = path18.join(dir, "stdout.txt");
|
|
6977
|
+
const stderrPath = path18.join(dir, "stderr.txt");
|
|
6499
6978
|
await writeFile4(stdinPath, stdinPayload, "utf8");
|
|
6500
6979
|
const wrappedCommand = process.platform === "win32" ? `(${command}) < ${shellEscapePath(stdinPath)} > ${shellEscapePath(stdoutPath)} 2> ${shellEscapePath(stderrPath)}` : `(${command}) < ${shellEscapePath(stdinPath)} > ${shellEscapePath(stdoutPath)} 2> ${shellEscapePath(stderrPath)}`;
|
|
6501
6980
|
const { spawn: spawn4 } = await import("child_process");
|
|
@@ -6525,8 +7004,8 @@ async function execShellWithStdin(command, stdinPayload, options = {}) {
|
|
|
6525
7004
|
resolve(code ?? 0);
|
|
6526
7005
|
});
|
|
6527
7006
|
});
|
|
6528
|
-
const stdout = (await
|
|
6529
|
-
const stderr = (await
|
|
7007
|
+
const stdout = (await readFile9(stdoutPath, "utf8")).replace(/\r\n/g, "\n");
|
|
7008
|
+
const stderr = (await readFile9(stderrPath, "utf8")).replace(/\r\n/g, "\n");
|
|
6530
7009
|
return { stdout, stderr, exitCode };
|
|
6531
7010
|
} finally {
|
|
6532
7011
|
await rm4(dir, { recursive: true, force: true });
|
|
@@ -6798,7 +7277,7 @@ var CodeEvaluator = class {
|
|
|
6798
7277
|
outputMessages: context.outputMessages ?? null,
|
|
6799
7278
|
guidelineFiles: context.evalCase.guideline_paths,
|
|
6800
7279
|
inputFiles: context.evalCase.file_paths.filter(
|
|
6801
|
-
(
|
|
7280
|
+
(path18) => !context.evalCase.guideline_paths.includes(path18)
|
|
6802
7281
|
),
|
|
6803
7282
|
inputMessages: context.evalCase.input_messages,
|
|
6804
7283
|
traceSummary: context.traceSummary ?? null,
|
|
@@ -6973,6 +7452,15 @@ var rubricEvaluationSchema = import_zod3.z.object({
|
|
|
6973
7452
|
checks: import_zod3.z.array(rubricCheckResultSchema).describe("Results for each rubric item"),
|
|
6974
7453
|
overall_reasoning: import_zod3.z.string().describe("Overall assessment summary (1-2 sentences)")
|
|
6975
7454
|
});
|
|
7455
|
+
var scoreRangeCheckResultSchema = import_zod3.z.object({
|
|
7456
|
+
id: import_zod3.z.string().describe("The ID of the rubric criterion being scored"),
|
|
7457
|
+
score: import_zod3.z.number().int().min(0).max(10).describe("Integer score 0-10 for this criterion"),
|
|
7458
|
+
reasoning: import_zod3.z.string().describe("Brief explanation (1-2 sentences) for this score").optional()
|
|
7459
|
+
});
|
|
7460
|
+
var scoreRangeEvaluationSchema = import_zod3.z.object({
|
|
7461
|
+
checks: import_zod3.z.array(scoreRangeCheckResultSchema).describe("Scores for each rubric criterion"),
|
|
7462
|
+
overall_reasoning: import_zod3.z.string().describe("Overall assessment summary (1-2 sentences)").optional()
|
|
7463
|
+
});
|
|
6976
7464
|
var LlmJudgeEvaluator = class {
|
|
6977
7465
|
kind = "llm_judge";
|
|
6978
7466
|
resolveJudgeProvider;
|
|
@@ -7058,6 +7546,10 @@ var LlmJudgeEvaluator = class {
|
|
|
7058
7546
|
`No rubrics found for evaluator "${context.evaluator?.name ?? "llm_judge"}". Run "agentv generate rubrics" first.`
|
|
7059
7547
|
);
|
|
7060
7548
|
}
|
|
7549
|
+
const hasScoreRanges = rubrics.some((r) => r.score_ranges && r.score_ranges.length > 0);
|
|
7550
|
+
if (hasScoreRanges) {
|
|
7551
|
+
return this.evaluateWithScoreRanges(context, judgeProvider, rubrics);
|
|
7552
|
+
}
|
|
7061
7553
|
const prompt = this.buildRubricPrompt(context, rubrics);
|
|
7062
7554
|
const systemPrompt = buildRubricOutputSchema();
|
|
7063
7555
|
const evaluatorRawRequest = {
|
|
@@ -7083,6 +7575,84 @@ var LlmJudgeEvaluator = class {
|
|
|
7083
7575
|
evaluatorRawRequest
|
|
7084
7576
|
};
|
|
7085
7577
|
}
|
|
7578
|
+
/**
|
|
7579
|
+
* Evaluate using score-range rubrics (analytic rubric scoring).
|
|
7580
|
+
* Each criterion is scored 0-10 and normalized to 0-1.
|
|
7581
|
+
*/
|
|
7582
|
+
async evaluateWithScoreRanges(context, judgeProvider, rubrics) {
|
|
7583
|
+
const prompt = this.buildScoreRangePrompt(context, rubrics);
|
|
7584
|
+
const systemPrompt = buildScoreRangeOutputSchema();
|
|
7585
|
+
const evaluatorRawRequest = {
|
|
7586
|
+
userPrompt: prompt,
|
|
7587
|
+
systemPrompt,
|
|
7588
|
+
target: judgeProvider.targetName
|
|
7589
|
+
};
|
|
7590
|
+
const { data } = await this.runWithRetry({
|
|
7591
|
+
context,
|
|
7592
|
+
judgeProvider,
|
|
7593
|
+
systemPrompt,
|
|
7594
|
+
userPrompt: prompt,
|
|
7595
|
+
schema: scoreRangeEvaluationSchema
|
|
7596
|
+
});
|
|
7597
|
+
const { score, verdict, hits, misses, details } = calculateScoreRangeResult(data, rubrics);
|
|
7598
|
+
return {
|
|
7599
|
+
score,
|
|
7600
|
+
verdict,
|
|
7601
|
+
hits,
|
|
7602
|
+
misses,
|
|
7603
|
+
expectedAspectCount: rubrics.length,
|
|
7604
|
+
reasoning: data.overall_reasoning,
|
|
7605
|
+
evaluatorRawRequest,
|
|
7606
|
+
details
|
|
7607
|
+
};
|
|
7608
|
+
}
|
|
7609
|
+
/**
|
|
7610
|
+
* Build prompt for score-range rubric evaluation.
|
|
7611
|
+
*/
|
|
7612
|
+
buildScoreRangePrompt(context, rubrics) {
|
|
7613
|
+
const formattedQuestion = context.promptInputs.question && context.promptInputs.question.trim().length > 0 ? context.promptInputs.question : context.evalCase.question;
|
|
7614
|
+
const parts = [
|
|
7615
|
+
"You are an expert evaluator. Score the candidate answer on each criterion below using the provided score ranges.",
|
|
7616
|
+
"For each criterion, output an integer score from 0 to 10 based on which score range best matches the answer.",
|
|
7617
|
+
"",
|
|
7618
|
+
"[[ ## question ## ]]",
|
|
7619
|
+
formattedQuestion,
|
|
7620
|
+
"",
|
|
7621
|
+
"[[ ## expected_outcome ## ]]",
|
|
7622
|
+
context.evalCase.expected_outcome,
|
|
7623
|
+
""
|
|
7624
|
+
];
|
|
7625
|
+
if (context.evalCase.reference_answer && context.evalCase.reference_answer.trim().length > 0) {
|
|
7626
|
+
parts.push("[[ ## reference_answer ## ]]", context.evalCase.reference_answer, "");
|
|
7627
|
+
}
|
|
7628
|
+
parts.push(
|
|
7629
|
+
"[[ ## candidate_answer ## ]]",
|
|
7630
|
+
context.candidate,
|
|
7631
|
+
"",
|
|
7632
|
+
"[[ ## scoring_criteria ## ]]"
|
|
7633
|
+
);
|
|
7634
|
+
for (const rubric of rubrics) {
|
|
7635
|
+
const weightLabel = rubric.weight !== 1 ? ` (weight: ${rubric.weight})` : "";
|
|
7636
|
+
const minScoreLabel = rubric.required_min_score !== void 0 ? ` [REQUIRED: min score ${rubric.required_min_score}]` : "";
|
|
7637
|
+
parts.push("", `### Criterion: ${rubric.id}${weightLabel}${minScoreLabel}`);
|
|
7638
|
+
if (rubric.expected_outcome) {
|
|
7639
|
+
parts.push(`Description: ${rubric.expected_outcome}`);
|
|
7640
|
+
}
|
|
7641
|
+
if (rubric.score_ranges && rubric.score_ranges.length > 0) {
|
|
7642
|
+
parts.push("Score ranges:");
|
|
7643
|
+
for (const range of rubric.score_ranges) {
|
|
7644
|
+
const [min, max] = range.score_range;
|
|
7645
|
+
const rangeLabel = min === max ? `${min}` : `${min}-${max}`;
|
|
7646
|
+
parts.push(` - Score ${rangeLabel}: ${range.expected_outcome}`);
|
|
7647
|
+
}
|
|
7648
|
+
}
|
|
7649
|
+
}
|
|
7650
|
+
parts.push(
|
|
7651
|
+
"",
|
|
7652
|
+
"For each criterion, provide an integer score 0-10 that matches one of its defined score ranges."
|
|
7653
|
+
);
|
|
7654
|
+
return parts.join("\n");
|
|
7655
|
+
}
|
|
7086
7656
|
buildRubricPrompt(context, rubrics) {
|
|
7087
7657
|
const formattedQuestion = context.promptInputs.question && context.promptInputs.question.trim().length > 0 ? context.promptInputs.question : context.evalCase.question;
|
|
7088
7658
|
const parts = [
|
|
@@ -7102,7 +7672,7 @@ var LlmJudgeEvaluator = class {
|
|
|
7102
7672
|
for (const rubric of rubrics) {
|
|
7103
7673
|
const requiredLabel = rubric.required ? " (REQUIRED)" : "";
|
|
7104
7674
|
const weightLabel = rubric.weight !== 1 ? ` (weight: ${rubric.weight})` : "";
|
|
7105
|
-
parts.push(`- [${rubric.id}]${requiredLabel}${weightLabel}: ${rubric.
|
|
7675
|
+
parts.push(`- [${rubric.id}]${requiredLabel}${weightLabel}: ${rubric.expected_outcome}`);
|
|
7106
7676
|
}
|
|
7107
7677
|
parts.push("", "For each rubric, determine if it is satisfied and provide brief reasoning.");
|
|
7108
7678
|
return parts.join("\n");
|
|
@@ -7189,9 +7759,9 @@ function calculateRubricScore(result, rubrics) {
|
|
|
7189
7759
|
totalWeight += rubric.weight;
|
|
7190
7760
|
if (check.satisfied) {
|
|
7191
7761
|
earnedWeight += rubric.weight;
|
|
7192
|
-
hits.push(`[${rubric.id}] ${rubric.
|
|
7762
|
+
hits.push(`[${rubric.id}] ${rubric.expected_outcome}: ${check.reasoning}`);
|
|
7193
7763
|
} else {
|
|
7194
|
-
misses.push(`[${rubric.id}] ${rubric.
|
|
7764
|
+
misses.push(`[${rubric.id}] ${rubric.expected_outcome}: ${check.reasoning}`);
|
|
7195
7765
|
if (rubric.required) {
|
|
7196
7766
|
failedRequired = true;
|
|
7197
7767
|
}
|
|
@@ -7201,6 +7771,76 @@ function calculateRubricScore(result, rubrics) {
|
|
|
7201
7771
|
const verdict = failedRequired ? "fail" : scoreToVerdict(score);
|
|
7202
7772
|
return { score, verdict, hits, misses };
|
|
7203
7773
|
}
|
|
7774
|
+
function buildScoreRangeOutputSchema() {
|
|
7775
|
+
return `You are an expert evaluator. Score the candidate answer on each criterion.
|
|
7776
|
+
You must return a valid JSON object matching this schema:
|
|
7777
|
+
{
|
|
7778
|
+
"checks": [
|
|
7779
|
+
{
|
|
7780
|
+
"id": "string (criterion id)",
|
|
7781
|
+
"score": integer (0-10),
|
|
7782
|
+
"reasoning": "string (brief explanation for score)"
|
|
7783
|
+
}
|
|
7784
|
+
],
|
|
7785
|
+
"overall_reasoning": "string (summary, optional)"
|
|
7786
|
+
}
|
|
7787
|
+
|
|
7788
|
+
Important: The "score" must be an integer from 0 to 10 that falls within one of the defined score ranges for that criterion.`;
|
|
7789
|
+
}
|
|
7790
|
+
function calculateScoreRangeResult(result, rubrics) {
|
|
7791
|
+
const rubricMap = new Map(rubrics.map((rubric) => [rubric.id, rubric]));
|
|
7792
|
+
const hits = [];
|
|
7793
|
+
const misses = [];
|
|
7794
|
+
const rawScores = {};
|
|
7795
|
+
let totalWeight = 0;
|
|
7796
|
+
let weightedScoreSum = 0;
|
|
7797
|
+
let failedRequired = false;
|
|
7798
|
+
for (const check of result.checks) {
|
|
7799
|
+
const rubric = rubricMap.get(check.id);
|
|
7800
|
+
if (!rubric) {
|
|
7801
|
+
continue;
|
|
7802
|
+
}
|
|
7803
|
+
const rawScore = Math.max(0, Math.min(10, check.score));
|
|
7804
|
+
const normalizedScore = rawScore / 10;
|
|
7805
|
+
rawScores[rubric.id] = rawScore;
|
|
7806
|
+
totalWeight += rubric.weight;
|
|
7807
|
+
weightedScoreSum += normalizedScore * rubric.weight;
|
|
7808
|
+
let requiredMinScore;
|
|
7809
|
+
if (rubric.required_min_score !== void 0) {
|
|
7810
|
+
requiredMinScore = rubric.required_min_score;
|
|
7811
|
+
} else if (rubric.required === true) {
|
|
7812
|
+
requiredMinScore = 10;
|
|
7813
|
+
}
|
|
7814
|
+
const matchingRange = rubric.score_ranges?.find(
|
|
7815
|
+
(r) => rawScore >= r.score_range[0] && rawScore <= r.score_range[1]
|
|
7816
|
+
);
|
|
7817
|
+
const rangeDescription = matchingRange?.expected_outcome ?? "";
|
|
7818
|
+
const criterionLabel = rubric.expected_outcome ?? rubric.id;
|
|
7819
|
+
const reasoningText = check.reasoning ? `: ${check.reasoning}` : "";
|
|
7820
|
+
const scoreInfo = `[${rubric.id}] ${criterionLabel} - Score: ${rawScore}/10 (${rangeDescription})${reasoningText}`;
|
|
7821
|
+
if (requiredMinScore !== void 0 && rawScore < requiredMinScore) {
|
|
7822
|
+
failedRequired = true;
|
|
7823
|
+
misses.push(scoreInfo);
|
|
7824
|
+
} else if (rawScore >= 7) {
|
|
7825
|
+
hits.push(scoreInfo);
|
|
7826
|
+
} else {
|
|
7827
|
+
misses.push(scoreInfo);
|
|
7828
|
+
}
|
|
7829
|
+
}
|
|
7830
|
+
const score = totalWeight > 0 ? Math.min(1, Math.max(0, weightedScoreSum / totalWeight)) : 0;
|
|
7831
|
+
const verdict = failedRequired ? "fail" : scoreToVerdict(score);
|
|
7832
|
+
return {
|
|
7833
|
+
score,
|
|
7834
|
+
verdict,
|
|
7835
|
+
hits,
|
|
7836
|
+
misses,
|
|
7837
|
+
details: {
|
|
7838
|
+
raw_scores: rawScores,
|
|
7839
|
+
normalization: "score / 10",
|
|
7840
|
+
aggregation: "weighted_average"
|
|
7841
|
+
}
|
|
7842
|
+
};
|
|
7843
|
+
}
|
|
7204
7844
|
|
|
7205
7845
|
// src/evaluation/evaluators/composite.ts
|
|
7206
7846
|
var DEFAULT_COMPOSITE_AGGREGATOR_PROMPT = `Review the following evaluation results:
|
|
@@ -7584,115 +8224,115 @@ var FieldAccuracyEvaluator = class {
|
|
|
7584
8224
|
* Evaluate a single field against the expected value.
|
|
7585
8225
|
*/
|
|
7586
8226
|
evaluateField(fieldConfig, candidateData, expectedData) {
|
|
7587
|
-
const { path:
|
|
7588
|
-
const candidateValue = resolvePath(candidateData,
|
|
7589
|
-
const expectedValue = resolvePath(expectedData,
|
|
8227
|
+
const { path: path18, match, required = true, weight = 1 } = fieldConfig;
|
|
8228
|
+
const candidateValue = resolvePath(candidateData, path18);
|
|
8229
|
+
const expectedValue = resolvePath(expectedData, path18);
|
|
7590
8230
|
if (expectedValue === void 0) {
|
|
7591
8231
|
return {
|
|
7592
|
-
path:
|
|
8232
|
+
path: path18,
|
|
7593
8233
|
score: 1,
|
|
7594
8234
|
// No expected value means no comparison needed
|
|
7595
8235
|
weight,
|
|
7596
8236
|
hit: true,
|
|
7597
|
-
message: `${
|
|
8237
|
+
message: `${path18}: no expected value`
|
|
7598
8238
|
};
|
|
7599
8239
|
}
|
|
7600
8240
|
if (candidateValue === void 0) {
|
|
7601
8241
|
if (required) {
|
|
7602
8242
|
return {
|
|
7603
|
-
path:
|
|
8243
|
+
path: path18,
|
|
7604
8244
|
score: 0,
|
|
7605
8245
|
weight,
|
|
7606
8246
|
hit: false,
|
|
7607
|
-
message: `${
|
|
8247
|
+
message: `${path18} (required, missing)`
|
|
7608
8248
|
};
|
|
7609
8249
|
}
|
|
7610
8250
|
return {
|
|
7611
|
-
path:
|
|
8251
|
+
path: path18,
|
|
7612
8252
|
score: 1,
|
|
7613
8253
|
// Don't penalize missing optional fields
|
|
7614
8254
|
weight: 0,
|
|
7615
8255
|
// Zero weight means it won't affect the score
|
|
7616
8256
|
hit: true,
|
|
7617
|
-
message: `${
|
|
8257
|
+
message: `${path18}: optional field missing`
|
|
7618
8258
|
};
|
|
7619
8259
|
}
|
|
7620
8260
|
switch (match) {
|
|
7621
8261
|
case "exact":
|
|
7622
|
-
return this.compareExact(
|
|
8262
|
+
return this.compareExact(path18, candidateValue, expectedValue, weight);
|
|
7623
8263
|
case "numeric_tolerance":
|
|
7624
8264
|
return this.compareNumericTolerance(
|
|
7625
|
-
|
|
8265
|
+
path18,
|
|
7626
8266
|
candidateValue,
|
|
7627
8267
|
expectedValue,
|
|
7628
8268
|
fieldConfig,
|
|
7629
8269
|
weight
|
|
7630
8270
|
);
|
|
7631
8271
|
case "date":
|
|
7632
|
-
return this.compareDate(
|
|
8272
|
+
return this.compareDate(path18, candidateValue, expectedValue, fieldConfig, weight);
|
|
7633
8273
|
default:
|
|
7634
8274
|
return {
|
|
7635
|
-
path:
|
|
8275
|
+
path: path18,
|
|
7636
8276
|
score: 0,
|
|
7637
8277
|
weight,
|
|
7638
8278
|
hit: false,
|
|
7639
|
-
message: `${
|
|
8279
|
+
message: `${path18}: unknown match type "${match}"`
|
|
7640
8280
|
};
|
|
7641
8281
|
}
|
|
7642
8282
|
}
|
|
7643
8283
|
/**
|
|
7644
8284
|
* Exact equality comparison.
|
|
7645
8285
|
*/
|
|
7646
|
-
compareExact(
|
|
8286
|
+
compareExact(path18, candidateValue, expectedValue, weight) {
|
|
7647
8287
|
if (deepEqual(candidateValue, expectedValue)) {
|
|
7648
8288
|
return {
|
|
7649
|
-
path:
|
|
8289
|
+
path: path18,
|
|
7650
8290
|
score: 1,
|
|
7651
8291
|
weight,
|
|
7652
8292
|
hit: true,
|
|
7653
|
-
message:
|
|
8293
|
+
message: path18
|
|
7654
8294
|
};
|
|
7655
8295
|
}
|
|
7656
8296
|
if (typeof candidateValue !== typeof expectedValue) {
|
|
7657
8297
|
return {
|
|
7658
|
-
path:
|
|
8298
|
+
path: path18,
|
|
7659
8299
|
score: 0,
|
|
7660
8300
|
weight,
|
|
7661
8301
|
hit: false,
|
|
7662
|
-
message: `${
|
|
8302
|
+
message: `${path18} (type mismatch: got ${typeof candidateValue}, expected ${typeof expectedValue})`
|
|
7663
8303
|
};
|
|
7664
8304
|
}
|
|
7665
8305
|
return {
|
|
7666
|
-
path:
|
|
8306
|
+
path: path18,
|
|
7667
8307
|
score: 0,
|
|
7668
8308
|
weight,
|
|
7669
8309
|
hit: false,
|
|
7670
|
-
message: `${
|
|
8310
|
+
message: `${path18} (value mismatch)`
|
|
7671
8311
|
};
|
|
7672
8312
|
}
|
|
7673
8313
|
/**
|
|
7674
8314
|
* Numeric comparison with absolute or relative tolerance.
|
|
7675
8315
|
*/
|
|
7676
|
-
compareNumericTolerance(
|
|
8316
|
+
compareNumericTolerance(path18, candidateValue, expectedValue, fieldConfig, weight) {
|
|
7677
8317
|
const { tolerance = 0, relative = false } = fieldConfig;
|
|
7678
8318
|
const candidateNum = toNumber(candidateValue);
|
|
7679
8319
|
const expectedNum = toNumber(expectedValue);
|
|
7680
8320
|
if (candidateNum === null || expectedNum === null) {
|
|
7681
8321
|
return {
|
|
7682
|
-
path:
|
|
8322
|
+
path: path18,
|
|
7683
8323
|
score: 0,
|
|
7684
8324
|
weight,
|
|
7685
8325
|
hit: false,
|
|
7686
|
-
message: `${
|
|
8326
|
+
message: `${path18} (non-numeric value)`
|
|
7687
8327
|
};
|
|
7688
8328
|
}
|
|
7689
8329
|
if (!Number.isFinite(candidateNum) || !Number.isFinite(expectedNum)) {
|
|
7690
8330
|
return {
|
|
7691
|
-
path:
|
|
8331
|
+
path: path18,
|
|
7692
8332
|
score: 0,
|
|
7693
8333
|
weight,
|
|
7694
8334
|
hit: false,
|
|
7695
|
-
message: `${
|
|
8335
|
+
message: `${path18} (invalid numeric value)`
|
|
7696
8336
|
};
|
|
7697
8337
|
}
|
|
7698
8338
|
const diff = Math.abs(candidateNum - expectedNum);
|
|
@@ -7705,61 +8345,61 @@ var FieldAccuracyEvaluator = class {
|
|
|
7705
8345
|
}
|
|
7706
8346
|
if (withinTolerance) {
|
|
7707
8347
|
return {
|
|
7708
|
-
path:
|
|
8348
|
+
path: path18,
|
|
7709
8349
|
score: 1,
|
|
7710
8350
|
weight,
|
|
7711
8351
|
hit: true,
|
|
7712
|
-
message: `${
|
|
8352
|
+
message: `${path18} (within tolerance: diff=${diff.toFixed(2)})`
|
|
7713
8353
|
};
|
|
7714
8354
|
}
|
|
7715
8355
|
return {
|
|
7716
|
-
path:
|
|
8356
|
+
path: path18,
|
|
7717
8357
|
score: 0,
|
|
7718
8358
|
weight,
|
|
7719
8359
|
hit: false,
|
|
7720
|
-
message: `${
|
|
8360
|
+
message: `${path18} (outside tolerance: diff=${diff.toFixed(2)}, tolerance=${tolerance})`
|
|
7721
8361
|
};
|
|
7722
8362
|
}
|
|
7723
8363
|
/**
|
|
7724
8364
|
* Date comparison with format normalization.
|
|
7725
8365
|
*/
|
|
7726
|
-
compareDate(
|
|
8366
|
+
compareDate(path18, candidateValue, expectedValue, fieldConfig, weight) {
|
|
7727
8367
|
const formats = fieldConfig.formats ?? DEFAULT_DATE_FORMATS;
|
|
7728
8368
|
const candidateDate = parseDate(String(candidateValue), formats);
|
|
7729
8369
|
const expectedDate = parseDate(String(expectedValue), formats);
|
|
7730
8370
|
if (candidateDate === null) {
|
|
7731
8371
|
return {
|
|
7732
|
-
path:
|
|
8372
|
+
path: path18,
|
|
7733
8373
|
score: 0,
|
|
7734
8374
|
weight,
|
|
7735
8375
|
hit: false,
|
|
7736
|
-
message: `${
|
|
8376
|
+
message: `${path18} (unparseable candidate date)`
|
|
7737
8377
|
};
|
|
7738
8378
|
}
|
|
7739
8379
|
if (expectedDate === null) {
|
|
7740
8380
|
return {
|
|
7741
|
-
path:
|
|
8381
|
+
path: path18,
|
|
7742
8382
|
score: 0,
|
|
7743
8383
|
weight,
|
|
7744
8384
|
hit: false,
|
|
7745
|
-
message: `${
|
|
8385
|
+
message: `${path18} (unparseable expected date)`
|
|
7746
8386
|
};
|
|
7747
8387
|
}
|
|
7748
8388
|
if (candidateDate.getFullYear() === expectedDate.getFullYear() && candidateDate.getMonth() === expectedDate.getMonth() && candidateDate.getDate() === expectedDate.getDate()) {
|
|
7749
8389
|
return {
|
|
7750
|
-
path:
|
|
8390
|
+
path: path18,
|
|
7751
8391
|
score: 1,
|
|
7752
8392
|
weight,
|
|
7753
8393
|
hit: true,
|
|
7754
|
-
message:
|
|
8394
|
+
message: path18
|
|
7755
8395
|
};
|
|
7756
8396
|
}
|
|
7757
8397
|
return {
|
|
7758
|
-
path:
|
|
8398
|
+
path: path18,
|
|
7759
8399
|
score: 0,
|
|
7760
8400
|
weight,
|
|
7761
8401
|
hit: false,
|
|
7762
|
-
message: `${
|
|
8402
|
+
message: `${path18} (date mismatch: got ${formatDateISO(candidateDate)}, expected ${formatDateISO(expectedDate)})`
|
|
7763
8403
|
};
|
|
7764
8404
|
}
|
|
7765
8405
|
/**
|
|
@@ -7799,11 +8439,11 @@ var FieldAccuracyEvaluator = class {
|
|
|
7799
8439
|
};
|
|
7800
8440
|
}
|
|
7801
8441
|
};
|
|
7802
|
-
function resolvePath(obj,
|
|
7803
|
-
if (!
|
|
8442
|
+
function resolvePath(obj, path18) {
|
|
8443
|
+
if (!path18 || !obj) {
|
|
7804
8444
|
return void 0;
|
|
7805
8445
|
}
|
|
7806
|
-
const parts =
|
|
8446
|
+
const parts = path18.split(/\.|\[|\]/).filter((p) => p.length > 0);
|
|
7807
8447
|
let current = obj;
|
|
7808
8448
|
for (const part of parts) {
|
|
7809
8449
|
if (current === null || current === void 0) {
|
|
@@ -8028,6 +8668,27 @@ function argsMatch(expected, actual) {
|
|
|
8028
8668
|
}
|
|
8029
8669
|
return true;
|
|
8030
8670
|
}
|
|
8671
|
+
function checkLatency(toolName, maxDurationMs, actualDurationMs) {
|
|
8672
|
+
if (maxDurationMs === void 0) {
|
|
8673
|
+
return { status: "skip", message: "" };
|
|
8674
|
+
}
|
|
8675
|
+
if (actualDurationMs === void 0) {
|
|
8676
|
+
return {
|
|
8677
|
+
status: "skip",
|
|
8678
|
+
message: `No duration data for ${toolName}; latency assertion skipped`
|
|
8679
|
+
};
|
|
8680
|
+
}
|
|
8681
|
+
if (actualDurationMs <= maxDurationMs) {
|
|
8682
|
+
return {
|
|
8683
|
+
status: "pass",
|
|
8684
|
+
message: `${toolName} completed in ${actualDurationMs}ms (max: ${maxDurationMs}ms)`
|
|
8685
|
+
};
|
|
8686
|
+
}
|
|
8687
|
+
return {
|
|
8688
|
+
status: "fail",
|
|
8689
|
+
message: `${toolName} took ${actualDurationMs}ms (max: ${maxDurationMs}ms)`
|
|
8690
|
+
};
|
|
8691
|
+
}
|
|
8031
8692
|
var ToolTrajectoryEvaluator = class {
|
|
8032
8693
|
kind = "tool_trajectory";
|
|
8033
8694
|
config;
|
|
@@ -8086,7 +8747,8 @@ var ToolTrajectoryEvaluator = class {
|
|
|
8086
8747
|
for (const call of message.toolCalls) {
|
|
8087
8748
|
toolCalls.push({
|
|
8088
8749
|
name: call.tool,
|
|
8089
|
-
args: call.input
|
|
8750
|
+
args: call.input,
|
|
8751
|
+
durationMs: call.durationMs
|
|
8090
8752
|
});
|
|
8091
8753
|
}
|
|
8092
8754
|
}
|
|
@@ -8154,17 +8816,27 @@ var ToolTrajectoryEvaluator = class {
|
|
|
8154
8816
|
}
|
|
8155
8817
|
const hits = [];
|
|
8156
8818
|
const misses = [];
|
|
8819
|
+
const warnings = [];
|
|
8157
8820
|
let actualIndex = 0;
|
|
8821
|
+
let sequenceHits = 0;
|
|
8822
|
+
let latencyHits = 0;
|
|
8823
|
+
let latencySkips = 0;
|
|
8824
|
+
const latencyAssertionCount = expected.filter(
|
|
8825
|
+
(item) => item.maxDurationMs !== void 0
|
|
8826
|
+
).length;
|
|
8158
8827
|
for (let i = 0; i < expected.length; i++) {
|
|
8159
8828
|
const expectedItem = expected[i];
|
|
8160
8829
|
const expectedTool = expectedItem.tool;
|
|
8161
8830
|
let found = false;
|
|
8162
8831
|
let argsMismatch = false;
|
|
8832
|
+
let matchedCall;
|
|
8163
8833
|
while (actualIndex < toolCalls.length) {
|
|
8164
8834
|
const actualCall = toolCalls[actualIndex];
|
|
8165
8835
|
if (actualCall.name === expectedTool) {
|
|
8166
8836
|
if (argsMatch(expectedItem.args, actualCall.args)) {
|
|
8167
8837
|
hits.push(`Found ${expectedTool} at position ${actualIndex}`);
|
|
8838
|
+
sequenceHits++;
|
|
8839
|
+
matchedCall = actualCall;
|
|
8168
8840
|
actualIndex++;
|
|
8169
8841
|
found = true;
|
|
8170
8842
|
break;
|
|
@@ -8181,14 +8853,35 @@ var ToolTrajectoryEvaluator = class {
|
|
|
8181
8853
|
if (!found && !argsMismatch) {
|
|
8182
8854
|
misses.push(`Expected ${expectedTool} at position ${i}, not found in remaining trace`);
|
|
8183
8855
|
}
|
|
8856
|
+
if (found && matchedCall) {
|
|
8857
|
+
const latencyResult = checkLatency(
|
|
8858
|
+
expectedTool,
|
|
8859
|
+
expectedItem.maxDurationMs,
|
|
8860
|
+
matchedCall.durationMs
|
|
8861
|
+
);
|
|
8862
|
+
if (latencyResult.status === "pass") {
|
|
8863
|
+
hits.push(latencyResult.message);
|
|
8864
|
+
latencyHits++;
|
|
8865
|
+
} else if (latencyResult.status === "fail") {
|
|
8866
|
+
misses.push(latencyResult.message);
|
|
8867
|
+
} else if (latencyResult.message) {
|
|
8868
|
+
warnings.push(latencyResult.message);
|
|
8869
|
+
latencySkips++;
|
|
8870
|
+
}
|
|
8871
|
+
}
|
|
8184
8872
|
}
|
|
8185
|
-
const
|
|
8873
|
+
for (const warning of warnings) {
|
|
8874
|
+
console.warn(`[tool_trajectory] ${warning}`);
|
|
8875
|
+
}
|
|
8876
|
+
const effectiveLatencyAssertions = latencyAssertionCount - latencySkips;
|
|
8877
|
+
const totalAssertions = expected.length + effectiveLatencyAssertions;
|
|
8878
|
+
const score = totalAssertions > 0 ? (sequenceHits + latencyHits) / totalAssertions : 1;
|
|
8186
8879
|
return {
|
|
8187
8880
|
score,
|
|
8188
8881
|
verdict: scoreToVerdict(score),
|
|
8189
8882
|
hits,
|
|
8190
8883
|
misses,
|
|
8191
|
-
expectedAspectCount:
|
|
8884
|
+
expectedAspectCount: totalAssertions
|
|
8192
8885
|
};
|
|
8193
8886
|
}
|
|
8194
8887
|
evaluateExact(toolCalls) {
|
|
@@ -8204,6 +8897,13 @@ var ToolTrajectoryEvaluator = class {
|
|
|
8204
8897
|
}
|
|
8205
8898
|
const hits = [];
|
|
8206
8899
|
const misses = [];
|
|
8900
|
+
const warnings = [];
|
|
8901
|
+
let sequenceHits = 0;
|
|
8902
|
+
let latencyHits = 0;
|
|
8903
|
+
let latencySkips = 0;
|
|
8904
|
+
const latencyAssertionCount = expected.filter(
|
|
8905
|
+
(item) => item.maxDurationMs !== void 0
|
|
8906
|
+
).length;
|
|
8207
8907
|
if (toolCalls.length !== expected.length) {
|
|
8208
8908
|
misses.push(`Expected ${expected.length} tool calls, got ${toolCalls.length}`);
|
|
8209
8909
|
}
|
|
@@ -8213,33 +8913,58 @@ var ToolTrajectoryEvaluator = class {
|
|
|
8213
8913
|
const expectedTool = expectedItem.tool;
|
|
8214
8914
|
const actualCall = toolCalls[i];
|
|
8215
8915
|
const actualTool = actualCall.name;
|
|
8916
|
+
let sequenceMatched = false;
|
|
8216
8917
|
if (actualTool === expectedTool) {
|
|
8217
8918
|
if (argsMatch(expectedItem.args, actualCall.args)) {
|
|
8218
8919
|
hits.push(`Position ${i}: ${expectedTool}`);
|
|
8920
|
+
sequenceHits++;
|
|
8921
|
+
sequenceMatched = true;
|
|
8219
8922
|
} else {
|
|
8220
8923
|
misses.push(`Position ${i}: ${expectedTool} args mismatch`);
|
|
8221
8924
|
}
|
|
8222
8925
|
} else {
|
|
8223
8926
|
misses.push(`Position ${i}: expected ${expectedTool}, got ${actualTool}`);
|
|
8224
8927
|
}
|
|
8928
|
+
if (sequenceMatched) {
|
|
8929
|
+
const latencyResult = checkLatency(
|
|
8930
|
+
expectedTool,
|
|
8931
|
+
expectedItem.maxDurationMs,
|
|
8932
|
+
actualCall.durationMs
|
|
8933
|
+
);
|
|
8934
|
+
if (latencyResult.status === "pass") {
|
|
8935
|
+
hits.push(latencyResult.message);
|
|
8936
|
+
latencyHits++;
|
|
8937
|
+
} else if (latencyResult.status === "fail") {
|
|
8938
|
+
misses.push(latencyResult.message);
|
|
8939
|
+
} else if (latencyResult.message) {
|
|
8940
|
+
warnings.push(latencyResult.message);
|
|
8941
|
+
latencySkips++;
|
|
8942
|
+
}
|
|
8943
|
+
}
|
|
8225
8944
|
}
|
|
8226
8945
|
for (let i = checkLength; i < expected.length; i++) {
|
|
8227
8946
|
misses.push(`Position ${i}: expected ${expected[i].tool}, got nothing`);
|
|
8228
8947
|
}
|
|
8229
|
-
const
|
|
8948
|
+
for (const warning of warnings) {
|
|
8949
|
+
console.warn(`[tool_trajectory] ${warning}`);
|
|
8950
|
+
}
|
|
8951
|
+
const effectiveLatencyAssertions = latencyAssertionCount - latencySkips;
|
|
8952
|
+
const totalAssertions = expected.length + effectiveLatencyAssertions;
|
|
8953
|
+
const score = totalAssertions > 0 ? (sequenceHits + latencyHits) / totalAssertions : 1;
|
|
8230
8954
|
return {
|
|
8231
8955
|
score,
|
|
8232
8956
|
verdict: scoreToVerdict(score),
|
|
8233
8957
|
hits,
|
|
8234
8958
|
misses,
|
|
8235
|
-
expectedAspectCount:
|
|
8959
|
+
expectedAspectCount: totalAssertions
|
|
8236
8960
|
};
|
|
8237
8961
|
}
|
|
8238
8962
|
};
|
|
8239
8963
|
|
|
8240
8964
|
// src/evaluation/orchestrator.ts
|
|
8241
8965
|
var import_node_crypto5 = require("crypto");
|
|
8242
|
-
var
|
|
8966
|
+
var import_node_path17 = __toESM(require("path"), 1);
|
|
8967
|
+
var import_micromatch4 = __toESM(require("micromatch"), 1);
|
|
8243
8968
|
|
|
8244
8969
|
// ../../node_modules/.bun/yocto-queue@1.2.2/node_modules/yocto-queue/index.js
|
|
8245
8970
|
var Node = class {
|
|
@@ -8398,17 +9123,17 @@ async function runEvaluation(options) {
|
|
|
8398
9123
|
cache,
|
|
8399
9124
|
useCache,
|
|
8400
9125
|
now,
|
|
8401
|
-
|
|
9126
|
+
filter,
|
|
8402
9127
|
verbose,
|
|
8403
9128
|
evalCases: preloadedEvalCases,
|
|
8404
9129
|
onResult,
|
|
8405
9130
|
onProgress
|
|
8406
9131
|
} = options;
|
|
8407
|
-
const evalCases = preloadedEvalCases ?? await loadEvalCases(evalFilePath, repoRoot, { verbose,
|
|
8408
|
-
const filteredEvalCases = filterEvalCases(evalCases,
|
|
9132
|
+
const evalCases = preloadedEvalCases ?? await loadEvalCases(evalFilePath, repoRoot, { verbose, filter });
|
|
9133
|
+
const filteredEvalCases = filterEvalCases(evalCases, filter);
|
|
8409
9134
|
if (filteredEvalCases.length === 0) {
|
|
8410
|
-
if (
|
|
8411
|
-
throw new Error(`
|
|
9135
|
+
if (filter) {
|
|
9136
|
+
throw new Error(`No eval cases matched filter '${filter}' in ${evalFilePath}`);
|
|
8412
9137
|
}
|
|
8413
9138
|
return [];
|
|
8414
9139
|
}
|
|
@@ -8984,7 +9709,10 @@ async function runEvaluatorList(options) {
|
|
|
8984
9709
|
attempt,
|
|
8985
9710
|
promptInputs,
|
|
8986
9711
|
now,
|
|
8987
|
-
judgeProvider
|
|
9712
|
+
judgeProvider,
|
|
9713
|
+
outputMessages,
|
|
9714
|
+
traceSummary,
|
|
9715
|
+
agentTimeoutMs
|
|
8988
9716
|
});
|
|
8989
9717
|
const weight = evaluator.weight ?? 1;
|
|
8990
9718
|
scored.push({ score: score2, name: evaluator.name, type: evaluator.type, weight });
|
|
@@ -9038,7 +9766,7 @@ async function runEvaluatorList(options) {
|
|
|
9038
9766
|
});
|
|
9039
9767
|
}
|
|
9040
9768
|
if (evaluator.type === "composite") {
|
|
9041
|
-
const evalFileDir = evalCase.guideline_paths[0] ?
|
|
9769
|
+
const evalFileDir = evalCase.guideline_paths[0] ? import_node_path17.default.dirname(evalCase.guideline_paths[0]) : process.cwd();
|
|
9042
9770
|
const createEvaluator = (memberConfig) => {
|
|
9043
9771
|
switch (memberConfig.type) {
|
|
9044
9772
|
case "llm_judge":
|
|
@@ -9319,9 +10047,22 @@ async function runLlmJudgeEvaluator(options) {
|
|
|
9319
10047
|
attempt,
|
|
9320
10048
|
promptInputs,
|
|
9321
10049
|
now,
|
|
9322
|
-
judgeProvider
|
|
10050
|
+
judgeProvider,
|
|
10051
|
+
outputMessages,
|
|
10052
|
+
traceSummary,
|
|
10053
|
+
agentTimeoutMs
|
|
9323
10054
|
} = options;
|
|
9324
|
-
const customPrompt = await resolveCustomPrompt(
|
|
10055
|
+
const customPrompt = await resolveCustomPrompt(
|
|
10056
|
+
config,
|
|
10057
|
+
{
|
|
10058
|
+
evalCase,
|
|
10059
|
+
candidate,
|
|
10060
|
+
outputMessages,
|
|
10061
|
+
traceSummary,
|
|
10062
|
+
config: config.config
|
|
10063
|
+
},
|
|
10064
|
+
agentTimeoutMs
|
|
10065
|
+
);
|
|
9325
10066
|
return evaluatorRegistry.llm_judge.evaluate({
|
|
9326
10067
|
evalCase,
|
|
9327
10068
|
candidate,
|
|
@@ -9335,23 +10076,70 @@ async function runLlmJudgeEvaluator(options) {
|
|
|
9335
10076
|
evaluator: config
|
|
9336
10077
|
});
|
|
9337
10078
|
}
|
|
9338
|
-
async function resolveCustomPrompt(
|
|
9339
|
-
if (
|
|
10079
|
+
async function resolveCustomPrompt(promptConfig, context, timeoutMs) {
|
|
10080
|
+
if (promptConfig.resolvedPromptScript && promptConfig.resolvedPromptScript.length > 0) {
|
|
10081
|
+
if (!context) {
|
|
10082
|
+
throw new Error("Context required for executable prompt templates");
|
|
10083
|
+
}
|
|
10084
|
+
return executePromptTemplate(
|
|
10085
|
+
promptConfig.resolvedPromptScript,
|
|
10086
|
+
context,
|
|
10087
|
+
promptConfig.config,
|
|
10088
|
+
timeoutMs
|
|
10089
|
+
);
|
|
10090
|
+
}
|
|
10091
|
+
const promptPath = promptConfig.resolvedPromptPath ?? promptConfig.promptPath;
|
|
10092
|
+
if (promptPath) {
|
|
9340
10093
|
try {
|
|
9341
|
-
const content = await readTextFile(
|
|
10094
|
+
const content = await readTextFile(promptPath);
|
|
9342
10095
|
return content;
|
|
9343
10096
|
} catch (error) {
|
|
9344
10097
|
const message = error instanceof Error ? error.message : String(error);
|
|
9345
|
-
console.warn(`Could not read custom prompt at ${
|
|
10098
|
+
console.warn(`Could not read custom prompt at ${promptPath}: ${message}`);
|
|
10099
|
+
}
|
|
10100
|
+
}
|
|
10101
|
+
const promptValue = promptConfig.prompt;
|
|
10102
|
+
if (typeof promptValue === "string") {
|
|
10103
|
+
return promptValue;
|
|
10104
|
+
}
|
|
10105
|
+
return void 0;
|
|
10106
|
+
}
|
|
10107
|
+
async function executePromptTemplate(script, context, config, timeoutMs) {
|
|
10108
|
+
const payload = {
|
|
10109
|
+
question: context.evalCase.question,
|
|
10110
|
+
expectedOutcome: context.evalCase.expected_outcome,
|
|
10111
|
+
expectedMessages: context.evalCase.expected_messages,
|
|
10112
|
+
referenceAnswer: context.evalCase.reference_answer,
|
|
10113
|
+
candidateAnswer: context.candidate,
|
|
10114
|
+
outputMessages: context.outputMessages ?? null,
|
|
10115
|
+
guidelineFiles: context.evalCase.guideline_paths,
|
|
10116
|
+
inputFiles: context.evalCase.file_paths.filter(
|
|
10117
|
+
(p) => !context.evalCase.guideline_paths.includes(p)
|
|
10118
|
+
),
|
|
10119
|
+
inputMessages: context.evalCase.input_messages,
|
|
10120
|
+
traceSummary: context.traceSummary ?? null,
|
|
10121
|
+
config: config ?? context.config ?? null
|
|
10122
|
+
};
|
|
10123
|
+
const inputJson = JSON.stringify(toSnakeCaseDeep(payload), null, 2);
|
|
10124
|
+
const scriptPath = script[script.length - 1];
|
|
10125
|
+
const cwd = import_node_path17.default.dirname(scriptPath);
|
|
10126
|
+
try {
|
|
10127
|
+
const stdout = await executeScript(script, inputJson, timeoutMs, cwd);
|
|
10128
|
+
const prompt = stdout.trim();
|
|
10129
|
+
if (!prompt) {
|
|
10130
|
+
throw new Error("Prompt template produced empty output");
|
|
9346
10131
|
}
|
|
10132
|
+
return prompt;
|
|
10133
|
+
} catch (error) {
|
|
10134
|
+
const message = error instanceof Error ? error.message : String(error);
|
|
10135
|
+
throw new Error(`Prompt template execution failed: ${message}`);
|
|
9347
10136
|
}
|
|
9348
|
-
return config.prompt;
|
|
9349
10137
|
}
|
|
9350
|
-
function filterEvalCases(evalCases,
|
|
9351
|
-
if (!
|
|
10138
|
+
function filterEvalCases(evalCases, filter) {
|
|
10139
|
+
if (!filter) {
|
|
9352
10140
|
return evalCases;
|
|
9353
10141
|
}
|
|
9354
|
-
return evalCases.filter((evalCase) => evalCase.id
|
|
10142
|
+
return evalCases.filter((evalCase) => import_micromatch4.default.isMatch(evalCase.id, filter));
|
|
9355
10143
|
}
|
|
9356
10144
|
function buildEvaluatorRegistry(overrides, resolveJudgeProvider) {
|
|
9357
10145
|
const llmJudge = overrides?.llm_judge ?? new LlmJudgeEvaluator({
|
|
@@ -9509,7 +10297,7 @@ var import_ai4 = require("ai");
|
|
|
9509
10297
|
var import_zod4 = require("zod");
|
|
9510
10298
|
var rubricItemSchema = import_zod4.z.object({
|
|
9511
10299
|
id: import_zod4.z.string().describe("Short identifier for this rubric (e.g., clarity, completeness)"),
|
|
9512
|
-
|
|
10300
|
+
expected_outcome: import_zod4.z.string().describe("Concrete expected outcome for this rubric item"),
|
|
9513
10301
|
weight: import_zod4.z.number().default(1).describe("Relative importance (default 1.0)"),
|
|
9514
10302
|
required: import_zod4.z.boolean().default(true).describe("Whether this is a mandatory requirement")
|
|
9515
10303
|
});
|
|
@@ -9529,7 +10317,7 @@ You must return a valid JSON object matching this schema:
|
|
|
9529
10317
|
"rubrics": [
|
|
9530
10318
|
{
|
|
9531
10319
|
"id": "string (short identifier)",
|
|
9532
|
-
"
|
|
10320
|
+
"expected_outcome": "string (concrete expected outcome for this rubric item)",
|
|
9533
10321
|
"weight": number (default 1.0),
|
|
9534
10322
|
"required": boolean (default true)
|
|
9535
10323
|
}
|
|
@@ -9565,7 +10353,7 @@ function buildPrompt(expectedOutcome, question, referenceAnswer) {
|
|
|
9565
10353
|
"Each rubric should:",
|
|
9566
10354
|
"- Be specific and testable",
|
|
9567
10355
|
"- Have a short, descriptive ID",
|
|
9568
|
-
"- Include a clear
|
|
10356
|
+
"- Include a clear expected outcome statement (what a good answer must demonstrate for this rubric)",
|
|
9569
10357
|
"- Indicate if it is required (mandatory) or optional",
|
|
9570
10358
|
"- Have an appropriate weight (default 1.0, use higher values for more important aspects)",
|
|
9571
10359
|
"",
|
|
@@ -9613,6 +10401,7 @@ function createAgentKernel() {
|
|
|
9613
10401
|
createAgentKernel,
|
|
9614
10402
|
createProvider,
|
|
9615
10403
|
deepEqual,
|
|
10404
|
+
detectFormat,
|
|
9616
10405
|
ensureVSCodeSubagents,
|
|
9617
10406
|
executeScript,
|
|
9618
10407
|
explorationRatio,
|