@agentv/core 2.1.1 → 2.5.1
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/README.md +77 -77
- package/dist/{chunk-KDEP4I7G.js → chunk-RP3M7COZ.js} +1 -1
- package/dist/{chunk-KDEP4I7G.js.map → chunk-RP3M7COZ.js.map} +1 -1
- package/dist/evaluation/validation/index.cjs +38 -4
- package/dist/evaluation/validation/index.cjs.map +1 -1
- package/dist/evaluation/validation/index.js +39 -5
- package/dist/evaluation/validation/index.js.map +1 -1
- package/dist/index.cjs +1070 -281
- package/dist/index.cjs.map +1 -1
- package/dist/index.d.cts +82 -7
- package/dist/index.d.ts +82 -7
- package/dist/index.js +1018 -230
- package/dist/index.js.map +1 -1
- package/package.json +1 -1
package/dist/index.js
CHANGED
|
@@ -10,7 +10,7 @@ import {
|
|
|
10
10
|
readTextFile,
|
|
11
11
|
resolveFileReference,
|
|
12
12
|
resolveTargetDefinition
|
|
13
|
-
} from "./chunk-
|
|
13
|
+
} from "./chunk-RP3M7COZ.js";
|
|
14
14
|
|
|
15
15
|
// src/evaluation/types.ts
|
|
16
16
|
var TEST_MESSAGE_ROLE_VALUES = ["system", "user", "assistant", "tool"];
|
|
@@ -146,8 +146,9 @@ function mergeExecutionMetrics(summary, metrics) {
|
|
|
146
146
|
}
|
|
147
147
|
|
|
148
148
|
// src/evaluation/yaml-parser.ts
|
|
149
|
-
import { readFile as
|
|
150
|
-
import
|
|
149
|
+
import { readFile as readFile6 } from "node:fs/promises";
|
|
150
|
+
import path7 from "node:path";
|
|
151
|
+
import micromatch3 from "micromatch";
|
|
151
152
|
import { parse as parse2 } from "yaml";
|
|
152
153
|
|
|
153
154
|
// src/evaluation/loaders/config-loader.ts
|
|
@@ -462,11 +463,11 @@ async function parseEvaluators(rawEvalCase, globalExecution, searchRoots, evalId
|
|
|
462
463
|
);
|
|
463
464
|
}
|
|
464
465
|
}
|
|
465
|
-
const
|
|
466
|
-
const
|
|
466
|
+
const knownProps2 = /* @__PURE__ */ new Set(["name", "type", "script", "cwd", "weight", "target"]);
|
|
467
|
+
const config2 = {};
|
|
467
468
|
for (const [key, value] of Object.entries(rawEvaluator)) {
|
|
468
|
-
if (!
|
|
469
|
-
|
|
469
|
+
if (!knownProps2.has(key) && value !== void 0) {
|
|
470
|
+
config2[key] = value;
|
|
470
471
|
}
|
|
471
472
|
}
|
|
472
473
|
evaluators.push({
|
|
@@ -476,7 +477,7 @@ async function parseEvaluators(rawEvalCase, globalExecution, searchRoots, evalId
|
|
|
476
477
|
cwd,
|
|
477
478
|
resolvedCwd,
|
|
478
479
|
...weight2 !== void 0 ? { weight: weight2 } : {},
|
|
479
|
-
...Object.keys(
|
|
480
|
+
...Object.keys(config2).length > 0 ? { config: config2 } : {},
|
|
480
481
|
...targetConfig !== void 0 ? { target: targetConfig } : {}
|
|
481
482
|
});
|
|
482
483
|
continue;
|
|
@@ -641,7 +642,7 @@ async function parseEvaluators(rawEvalCase, globalExecution, searchRoots, evalId
|
|
|
641
642
|
continue;
|
|
642
643
|
}
|
|
643
644
|
const weight2 = validateWeight(rawEvaluator.weight, name, evalId);
|
|
644
|
-
const
|
|
645
|
+
const config2 = {
|
|
645
646
|
name,
|
|
646
647
|
type: "tool_trajectory",
|
|
647
648
|
mode,
|
|
@@ -649,7 +650,7 @@ async function parseEvaluators(rawEvalCase, globalExecution, searchRoots, evalId
|
|
|
649
650
|
...expected ? { expected } : {},
|
|
650
651
|
...weight2 !== void 0 ? { weight: weight2 } : {}
|
|
651
652
|
};
|
|
652
|
-
evaluators.push(
|
|
653
|
+
evaluators.push(config2);
|
|
653
654
|
continue;
|
|
654
655
|
}
|
|
655
656
|
if (typeValue === "field_accuracy") {
|
|
@@ -786,9 +787,33 @@ async function parseEvaluators(rawEvalCase, globalExecution, searchRoots, evalId
|
|
|
786
787
|
});
|
|
787
788
|
continue;
|
|
788
789
|
}
|
|
789
|
-
const
|
|
790
|
+
const rawPrompt = rawEvaluator.prompt;
|
|
791
|
+
let prompt;
|
|
790
792
|
let promptPath;
|
|
791
|
-
|
|
793
|
+
let resolvedPromptScript;
|
|
794
|
+
let promptScriptConfig;
|
|
795
|
+
if (isJsonObject2(rawPrompt)) {
|
|
796
|
+
const scriptArray = asStringArray(
|
|
797
|
+
rawPrompt.script,
|
|
798
|
+
`prompt.script for evaluator '${name}' in '${evalId}'`
|
|
799
|
+
);
|
|
800
|
+
if (!scriptArray) {
|
|
801
|
+
throw new Error(`Evaluator '${name}' in '${evalId}': prompt object requires script array`);
|
|
802
|
+
}
|
|
803
|
+
const scriptPath = scriptArray[scriptArray.length - 1];
|
|
804
|
+
const resolved = await resolveFileReference2(scriptPath, searchRoots);
|
|
805
|
+
if (resolved.resolvedPath) {
|
|
806
|
+
resolvedPromptScript = [...scriptArray.slice(0, -1), path3.resolve(resolved.resolvedPath)];
|
|
807
|
+
} else {
|
|
808
|
+
throw new Error(
|
|
809
|
+
`Evaluator '${name}' in '${evalId}': prompt script file not found: ${resolved.displayPath}`
|
|
810
|
+
);
|
|
811
|
+
}
|
|
812
|
+
if (isJsonObject2(rawPrompt.config)) {
|
|
813
|
+
promptScriptConfig = rawPrompt.config;
|
|
814
|
+
}
|
|
815
|
+
} else if (typeof rawPrompt === "string") {
|
|
816
|
+
prompt = rawPrompt;
|
|
792
817
|
const resolved = await resolveFileReference2(prompt, searchRoots);
|
|
793
818
|
if (resolved.resolvedPath) {
|
|
794
819
|
promptPath = path3.resolve(resolved.resolvedPath);
|
|
@@ -807,12 +832,7 @@ async function parseEvaluators(rawEvalCase, globalExecution, searchRoots, evalId
|
|
|
807
832
|
}
|
|
808
833
|
const _model = asString(rawEvaluator.model);
|
|
809
834
|
const rawRubrics = rawEvaluator.rubrics;
|
|
810
|
-
const parsedRubrics = Array.isArray(rawRubrics) ? rawRubrics
|
|
811
|
-
id: asString(rubric.id) ?? `rubric-${index + 1}`,
|
|
812
|
-
description: asString(rubric.description) ?? "",
|
|
813
|
-
weight: typeof rubric.weight === "number" ? rubric.weight : 1,
|
|
814
|
-
required: typeof rubric.required === "boolean" ? rubric.required : true
|
|
815
|
-
})).filter((r) => r.description.length > 0) : void 0;
|
|
835
|
+
const parsedRubrics = Array.isArray(rawRubrics) ? parseRubricItems(rawRubrics, name, evalId) : void 0;
|
|
816
836
|
if (typeValue === "rubric") {
|
|
817
837
|
if (!parsedRubrics) {
|
|
818
838
|
logWarning2(`Skipping rubric evaluator '${name}' in '${evalId}': missing rubrics array`);
|
|
@@ -832,13 +852,26 @@ async function parseEvaluators(rawEvalCase, globalExecution, searchRoots, evalId
|
|
|
832
852
|
continue;
|
|
833
853
|
}
|
|
834
854
|
const weight = validateWeight(rawEvaluator.weight, name, evalId);
|
|
855
|
+
const knownProps = /* @__PURE__ */ new Set(["name", "type", "prompt", "model", "rubrics", "weight", "config"]);
|
|
856
|
+
const config = {};
|
|
857
|
+
for (const [key, value] of Object.entries(rawEvaluator)) {
|
|
858
|
+
if (!knownProps.has(key) && value !== void 0) {
|
|
859
|
+
config[key] = value;
|
|
860
|
+
}
|
|
861
|
+
}
|
|
862
|
+
const topLevelConfig = isJsonObject2(rawEvaluator.config) ? rawEvaluator.config : {};
|
|
863
|
+
const mergedConfig = { ...config, ...topLevelConfig };
|
|
864
|
+
const finalConfig = promptScriptConfig ?? (Object.keys(mergedConfig).length > 0 ? mergedConfig : void 0);
|
|
835
865
|
evaluators.push({
|
|
836
866
|
name,
|
|
837
867
|
type: "llm_judge",
|
|
838
868
|
prompt,
|
|
839
869
|
promptPath,
|
|
870
|
+
...promptPath ? { resolvedPromptPath: promptPath } : {},
|
|
871
|
+
...resolvedPromptScript ? { resolvedPromptScript } : {},
|
|
840
872
|
...parsedRubrics && parsedRubrics.length > 0 ? { rubrics: parsedRubrics } : {},
|
|
841
|
-
...weight !== void 0 ? { weight } : {}
|
|
873
|
+
...weight !== void 0 ? { weight } : {},
|
|
874
|
+
...finalConfig ? { config: finalConfig } : {}
|
|
842
875
|
});
|
|
843
876
|
}
|
|
844
877
|
return evaluators.length > 0 ? evaluators : void 0;
|
|
@@ -925,6 +958,191 @@ var VALID_FIELD_AGGREGATION_TYPES = /* @__PURE__ */ new Set(["weighted_average",
|
|
|
925
958
|
function isValidFieldAggregationType(value) {
|
|
926
959
|
return typeof value === "string" && VALID_FIELD_AGGREGATION_TYPES.has(value);
|
|
927
960
|
}
|
|
961
|
+
/**
 * Normalizes the raw `rubrics` array of an evaluator into rubric items.
 *
 * Entries that are not JSON objects are skipped with a warning. A rubric that
 * declares `score_ranges` is kept even without an `expected_outcome`; a rubric
 * without `score_ranges` is skipped (with a warning) when it has no
 * `expected_outcome`/`description` text and defaults to `required: true`.
 *
 * @param {unknown[]} rawRubrics - Raw rubric entries as parsed from the eval file.
 * @param {string} evaluatorName - Evaluator name, used in diagnostics only.
 * @param {string} evalId - Eval case id, used in diagnostics only.
 * @returns {object[] | undefined} Parsed rubric items, or `undefined` when none survive.
 * @throws {Error} When `required_min_score` is not an integer in 0-10, when
 *   `score_ranges` is not an array, or when `parseScoreRanges` rejects the ranges.
 */
function parseRubricItems(rawRubrics, evaluatorName, evalId) {
  const items = [];
  for (const [index, rawRubric] of rawRubrics.entries()) {
    if (!isJsonObject2(rawRubric)) {
      // Include the eval id so this warning is as traceable as the other diagnostics below.
      logWarning2(
        `Skipping invalid rubric entry at index ${index} in evaluator '${evaluatorName}' in '${evalId}' (expected object)`
      );
      continue;
    }
    const id = asString(rawRubric.id) ?? `rubric-${index + 1}`;
    // `description` is accepted as a fallback key for `expected_outcome`.
    const expectedOutcome = asString(rawRubric.expected_outcome) ?? asString(rawRubric.description) ?? "";
    const weight = typeof rawRubric.weight === "number" ? rawRubric.weight : 1;
    let requiredMinScore;
    let required;
    if (typeof rawRubric.required_min_score === "number") {
      const minScore = rawRubric.required_min_score;
      if (!Number.isInteger(minScore) || minScore < 0 || minScore > 10) {
        throw new Error(
          `Invalid required_min_score for rubric '${id}' in evaluator '${evaluatorName}' in '${evalId}': must be an integer 0-10 (got ${minScore})`
        );
      }
      requiredMinScore = minScore;
    }
    if (typeof rawRubric.required === "boolean") {
      required = rawRubric.required;
    }
    const rawScoreRanges = rawRubric.score_ranges;
    if (rawScoreRanges !== void 0) {
      if (!Array.isArray(rawScoreRanges)) {
        throw new Error(
          `Invalid score_ranges for rubric '${id}' in evaluator '${evaluatorName}' in '${evalId}': must be an array`
        );
      }
      const scoreRanges = parseScoreRanges(rawScoreRanges, id, evaluatorName, evalId);
      // Range-scored rubrics keep `expected_outcome` and `required` optional.
      items.push({
        id,
        weight,
        ...expectedOutcome.length > 0 ? { expected_outcome: expectedOutcome } : {},
        ...required !== void 0 ? { required } : {},
        ...requiredMinScore !== void 0 ? { required_min_score: requiredMinScore } : {},
        score_ranges: scoreRanges
      });
    } else {
      if (expectedOutcome.length === 0) {
        logWarning2(
          `Skipping rubric '${id}' in evaluator '${evaluatorName}' in '${evalId}': missing expected_outcome`
        );
        continue;
      }
      items.push({
        id,
        expected_outcome: expectedOutcome,
        weight,
        // Default to required: true if not specified (backward compatibility)
        required: required ?? true,
        ...requiredMinScore !== void 0 ? { required_min_score: requiredMinScore } : {}
      });
    }
  }
  return items.length > 0 ? items : void 0;
}
|
|
1023
|
+
/**
 * Validates the `score_ranges` entries of one rubric and returns them in
 * their original order.
 *
 * Each entry must be an object with a `score_range` of two integers
 * `[min, max]` within 0-10 (`min <= max`) and a non-empty `expected_outcome`
 * (or `description`). Taken together the ranges must not overlap and must
 * cover every integer from 0 to 10.
 *
 * @param {unknown[]} rawRanges - Raw range entries.
 * @param {string} rubricId - Rubric id, used in error messages only.
 * @param {string} evaluatorName - Evaluator name, used in error messages only.
 * @param {string} evalId - Eval case id, used in error messages only.
 * @returns {{ score_range: number[], expected_outcome: string }[]} Validated ranges.
 * @throws {Error} On the first validation failure found.
 */
function parseScoreRanges(rawRanges, rubricId, evaluatorName, evalId) {
  // Shared tail of every diagnostic produced here.
  const scope = `rubric '${rubricId}' in evaluator '${evaluatorName}' in '${evalId}'`;
  const parsed = [];

  for (let index = 0; index < rawRanges.length; index++) {
    const entry = rawRanges[index];
    if (!isJsonObject2(entry)) {
      throw new Error(`Invalid score_range entry at index ${index} for ${scope}: expected object`);
    }

    const bounds = entry.score_range;
    const isNumberPair =
      Array.isArray(bounds) &&
      bounds.length === 2 &&
      typeof bounds[0] === "number" &&
      typeof bounds[1] === "number";
    if (!isNumberPair) {
      throw new Error(
        `Invalid score_range at index ${index} for ${scope}: must be [min, max] array of two numbers`
      );
    }

    const low = bounds[0];
    const high = bounds[1];
    if (!(Number.isInteger(low) && Number.isInteger(high))) {
      throw new Error(
        `Invalid score_range at index ${index} for ${scope}: values must be integers (got [${low}, ${high}])`
      );
    }
    if (low < 0 || low > 10 || high < 0 || high > 10) {
      throw new Error(
        `Invalid score_range at index ${index} for ${scope}: values must be 0-10 (got [${low}, ${high}])`
      );
    }
    if (low > high) {
      throw new Error(
        `Invalid score_range at index ${index} for ${scope}: min must be <= max (got [${low}, ${high}])`
      );
    }

    const outcomeText = asString(entry.expected_outcome) ?? asString(entry.description) ?? "";
    if (outcomeText.length === 0) {
      throw new Error(`Missing expected_outcome for score_range [${low}, ${high}] in ${scope}`);
    }

    parsed.push({ score_range: [low, high], expected_outcome: outcomeText });
  }

  // Overlap check: order a copy by lower bound and compare each range with its predecessor.
  const byLowerBound = parsed.slice().sort((left, right) => left.score_range[0] - right.score_range[0]);
  for (let k = 1; k < byLowerBound.length; k++) {
    const [prevLow, prevHigh] = byLowerBound[k - 1].score_range;
    const [currLow, currHigh] = byLowerBound[k].score_range;
    if (currLow <= prevHigh) {
      throw new Error(
        `Overlapping score_ranges in ${scope}: [${prevLow}, ${prevHigh}] overlaps with [${currLow}, ${currHigh}]`
      );
    }
  }

  // Coverage check: every integer score 0..10 must fall inside some range.
  const seen = new Array(11).fill(false);
  for (const { score_range: [low, high] } of parsed) {
    for (let score = low; score <= high; score++) {
      seen[score] = true;
    }
  }
  const missing = [];
  seen.forEach((isCovered, score) => {
    if (!isCovered) {
      missing.push(score);
    }
  });
  if (missing.length > 0) {
    throw new Error(
      `Incomplete score_ranges coverage in ${scope}: missing coverage for scores: ${missing.join(", ")}. Ranges must cover all integers 0-10.`
    );
  }

  return parsed;
}
|
|
1093
|
+
/**
 * Builds an `llm_judge` evaluator named "rubric" from rubrics declared inline
 * on an eval case.
 *
 * String entries become required rubrics with weight 1; object entries are
 * read leniently (no range validation is performed here). Entries of any other
 * type are dropped, as are rubrics that end up with neither an
 * `expected_outcome` nor `score_ranges`.
 *
 * @param {unknown[]} rawRubrics - Inline rubric entries (strings or objects).
 * @returns {{ name: string, type: string, rubrics: object[] } | undefined}
 *   The evaluator config, or `undefined` when no rubric survives.
 */
function parseInlineRubrics(rawRubrics) {
  // Converts one accepted entry; `position` is its index among accepted entries.
  const toRubricItem = (entry, position) => {
    const fallbackId = `rubric-${position + 1}`;
    if (typeof entry === "string") {
      return { id: fallbackId, expected_outcome: entry, weight: 1, required: true };
    }

    const outcomeText = asString(entry.expected_outcome) ?? asString(entry.description) ?? "";

    let ranges;
    const rawRanges = entry.score_ranges;
    if (Array.isArray(rawRanges) && rawRanges.length > 0) {
      ranges = rawRanges
        .filter((candidate) => isJsonObject2(candidate))
        .map((candidate) => ({
          score_range: Array.isArray(candidate.score_range) ? candidate.score_range : [0, 10],
          expected_outcome: asString(candidate.expected_outcome) ?? asString(candidate.description) ?? ""
        }))
        .filter((candidate) => candidate.expected_outcome.length > 0);
    }

    const item = {
      id: asString(entry.id) ?? fallbackId,
      weight: typeof entry.weight === "number" ? entry.weight : 1
    };
    const hasRequiredFlag = typeof entry.required === "boolean";
    const hasMinScore = typeof entry.required_min_score === "number";

    if (ranges && ranges.length > 0) {
      // Range-scored rubric: outcome text and `required` stay optional.
      if (outcomeText.length > 0) {
        item.expected_outcome = outcomeText;
      }
      if (hasRequiredFlag) {
        item.required = entry.required;
      }
      if (hasMinScore) {
        item.required_min_score = entry.required_min_score;
      }
      item.score_ranges = ranges;
      return item;
    }

    item.expected_outcome = outcomeText;
    item.required = hasRequiredFlag ? entry.required : true;
    if (hasMinScore) {
      item.required_min_score = entry.required_min_score;
    }
    return item;
  };

  const isUsable = (item) =>
    (item.expected_outcome && item.expected_outcome.length > 0) ||
    ("score_ranges" in item && item.score_ranges);

  const rubricItems = rawRubrics
    .filter((entry) => isJsonObject2(entry) || typeof entry === "string")
    .map(toRubricItem)
    .filter(isUsable);

  return rubricItems.length === 0
    ? void 0
    : { name: "rubric", type: "llm_judge", rubrics: rubricItems };
}
|
|
1140
|
+
|
|
1141
|
+
// src/evaluation/loaders/jsonl-parser.ts
|
|
1142
|
+
import { readFile as readFile4 } from "node:fs/promises";
|
|
1143
|
+
import path5 from "node:path";
|
|
1144
|
+
import micromatch2 from "micromatch";
|
|
1145
|
+
import { parse as parseYaml } from "yaml";
|
|
928
1146
|
|
|
929
1147
|
// src/evaluation/loaders/message-processor.ts
|
|
930
1148
|
import { readFile as readFile3 } from "node:fs/promises";
|
|
@@ -1186,28 +1404,302 @@ async function processExpectedMessages(options) {
|
|
|
1186
1404
|
return segments;
|
|
1187
1405
|
}
|
|
1188
1406
|
|
|
1189
|
-
// src/evaluation/
|
|
1190
|
-
|
|
1191
|
-
|
|
1407
|
+
// src/evaluation/loaders/shorthand-expansion.ts
|
|
1408
|
+
/**
 * Expands the `input` shorthand of an eval case into a message list.
 *
 * A string becomes a single user message; an array is reduced to the entries
 * accepted by `isTestMessage`. Anything else yields `undefined`.
 *
 * @param {unknown} value - Raw `input` value.
 * @returns {object[] | undefined} Messages, or `undefined` when none can be derived.
 */
function expandInputShorthand(value) {
  if (typeof value === "string") {
    return [{ role: "user", content: value }];
  }
  if (!Array.isArray(value)) {
    // Covers undefined, null and every other non-array type.
    return void 0;
  }
  const accepted = value.filter((candidate) => isTestMessage(candidate));
  return accepted.length > 0 ? accepted : void 0;
}
|
|
1421
|
+
/**
 * Expands the `expected_output` shorthand of an eval case into a message list.
 *
 * - string: one assistant message with that text
 * - array whose first element is an object with a `role` key: treated as a
 *   message list and filtered through `isTestMessage`
 * - any other array, or an object without `role`: wrapped as the content of
 *   one assistant message
 * - object with `role`: used as the only message if `isTestMessage` accepts it
 *
 * @param {unknown} value - Raw `expected_output` value.
 * @returns {object[] | undefined} Messages, or `undefined` when none can be derived.
 */
function expandExpectedOutputShorthand(value) {
  const wrapAsAssistant = (content) => [{ role: "assistant", content }];

  if (value === void 0 || value === null) {
    return void 0;
  }
  if (typeof value === "string") {
    return wrapAsAssistant(value);
  }
  if (Array.isArray(value)) {
    const looksLikeMessageList = value.length > 0 && isJsonObject(value[0]) && "role" in value[0];
    if (!looksLikeMessageList) {
      return wrapAsAssistant(value);
    }
    const accepted = value.filter((candidate) => isTestMessage(candidate));
    return accepted.length > 0 ? accepted : void 0;
  }
  if (!isJsonObject(value)) {
    return void 0;
  }
  if ("role" in value) {
    return isTestMessage(value) ? [value] : void 0;
  }
  return wrapAsAssistant(value);
}
|
|
1443
|
+
/**
 * Picks the input messages of a raw eval case.
 *
 * When `input_messages` is present it takes precedence: an array is filtered
 * through `isTestMessage`, any other value yields `undefined`. The `input`
 * shorthand is consulted only when `input_messages` is absent.
 *
 * @param {object} raw - Raw eval case.
 * @returns {object[] | undefined} Input messages, or `undefined` when none are usable.
 */
function resolveInputMessages(raw) {
  const explicit = raw.input_messages;
  if (explicit === void 0) {
    return expandInputShorthand(raw.input);
  }
  if (!Array.isArray(explicit)) {
    return void 0;
  }
  const accepted = explicit.filter((candidate) => isTestMessage(candidate));
  return accepted.length > 0 ? accepted : void 0;
}
|
|
1453
|
+
/**
 * Picks the expected messages of a raw eval case.
 *
 * When `expected_messages` is present it takes precedence: an array is
 * filtered through `isTestMessage`, any other value yields `undefined`. The
 * `expected_output` shorthand is consulted only when `expected_messages` is
 * absent.
 *
 * @param {object} raw - Raw eval case.
 * @returns {object[] | undefined} Expected messages, or `undefined` when none are usable.
 */
function resolveExpectedMessages(raw) {
  const explicit = raw.expected_messages;
  if (explicit === void 0) {
    return expandExpectedOutputShorthand(raw.expected_output);
  }
  if (!Array.isArray(explicit)) {
    return void 0;
  }
  const accepted = explicit.filter((candidate) => isTestMessage(candidate));
  return accepted.length > 0 ? accepted : void 0;
}
|
|
1465
|
+
|
|
1466
|
+
// src/evaluation/loaders/jsonl-parser.ts
|
|
1192
1467
|
var ANSI_YELLOW5 = "\x1B[33m";
|
|
1468
|
+
var ANSI_RED = "\x1B[31m";
|
|
1193
1469
|
var ANSI_RESET5 = "\x1B[0m";
|
|
1470
|
+
/**
 * Maps an eval file path to its dataset format based on the file extension
 * (compared case-insensitively).
 *
 * @param {string} filePath - Path of the eval file.
 * @returns {"jsonl" | "yaml"} The detected format.
 * @throws {Error} When the extension is not `.jsonl`, `.yaml` or `.yml`.
 */
function detectFormat(filePath) {
  const extension = path5.extname(filePath).toLowerCase();
  switch (extension) {
    case ".jsonl":
      return "jsonl";
    case ".yaml":
    case ".yml":
      return "yaml";
    default:
      throw new Error(`Unsupported file format: '${extension}'. Supported formats: .yaml, .yml, .jsonl`);
  }
}
|
|
1476
|
+
/**
 * Loads optional dataset metadata from the YAML file that sits next to a
 * JSONL dataset (`<name>.jsonl` -> `<name>.yaml`).
 *
 * This is best-effort: a missing, unreadable or malformed sidecar never
 * throws; it yields an empty object (with a warning, except that the
 * "not found" warning is only printed when `verbose` is set).
 *
 * @param {string} jsonlPath - Path of the JSONL dataset.
 * @param {boolean} verbose - Whether to warn when no sidecar file exists.
 * @returns {Promise<object>} `{ description, dataset, execution, evaluator }` or `{}`.
 */
async function loadSidecarMetadata(jsonlPath, verbose) {
  const dir = path5.dirname(jsonlPath);
  // Strip the extension exactly as spelled on disk: format detection is
  // case-insensitive, so `cases.JSONL` must also map to `cases.yaml`.
  const actualExt = path5.extname(jsonlPath);
  const strippedExt = actualExt.toLowerCase() === ".jsonl" ? actualExt : ".jsonl";
  const base = path5.basename(jsonlPath, strippedExt);
  const sidecarPath = path5.join(dir, `${base}.yaml`);
  if (!await fileExists2(sidecarPath)) {
    if (verbose) {
      logWarning4(`Sidecar metadata file not found: ${sidecarPath} (using defaults)`);
    }
    return {};
  }
  try {
    const content = await readFile4(sidecarPath, "utf8");
    const parsed = parseYaml(content);
    if (!isJsonObject(parsed)) {
      logWarning4(`Invalid sidecar metadata format in ${sidecarPath}`);
      return {};
    }
    return {
      description: asString4(parsed.description),
      dataset: asString4(parsed.dataset),
      execution: isJsonObject(parsed.execution) ? parsed.execution : void 0,
      // Passed through unvalidated; the caller coerces it.
      evaluator: parsed.evaluator
    };
  } catch (error) {
    // A thrown value is not guaranteed to be an Error instance.
    const reason = error instanceof Error ? error.message : String(error);
    logWarning4(`Could not read sidecar metadata from ${sidecarPath}: ${reason}`);
    return {};
  }
}
|
|
1504
|
+
/**
 * Parses JSONL text into an array of JSON objects, one per non-blank line.
 *
 * Lines are trimmed before parsing and blank lines are ignored.
 *
 * @param {string} content - Full text of the JSONL file.
 * @param {string} filePath - Path used in error messages only.
 * @returns {object[]} Parsed objects in file order.
 * @throws {Error} On the first line that is not valid JSON or is not a JSON
 *   object; the message carries the 1-based line number and the file path.
 */
function parseJsonlContent(content, filePath) {
  const cases = [];
  for (const [lineIndex, rawLine] of content.split("\n").entries()) {
    const text = rawLine.trim();
    if (text.length === 0) {
      continue;
    }
    try {
      const value = JSON.parse(text);
      if (!isJsonObject(value)) {
        // Caught just below so it is reported with the same line/file context.
        throw new Error("Expected JSON object");
      }
      cases.push(value);
    } catch (error) {
      const reason = error instanceof Error ? error.message : String(error);
      throw new Error(`Line ${lineIndex + 1}: Invalid JSON - ${reason}\nFile: ${filePath}`);
    }
  }
  return cases;
}
|
|
1524
|
+
/**
 * Loads eval cases from a JSONL dataset file.
 *
 * Dataset-level defaults (dataset name, evaluator, execution) come from the
 * sidecar metadata returned by `loadSidecarMetadata`. A case is skipped when
 * it does not match `options.filter`, when it lacks an id, an expected
 * outcome or input messages, or when its evaluators fail to parse.
 *
 * @param {string} evalFilePath - Path of the `.jsonl` file.
 * @param {string} repoRoot - Repository root used for resolving references.
 * @param {{ verbose?: boolean, filter?: string }} [options] - Loader options.
 * @returns {Promise<object[]>} The eval cases that were loaded.
 */
async function loadEvalCasesFromJsonl(evalFilePath, repoRoot, options) {
  const verbose = options?.verbose ?? false;
  const filterPattern = options?.filter;

  const absoluteTestPath = path5.resolve(evalFilePath);
  const repoRootPath = resolveToAbsolutePath(repoRoot);
  const searchRoots = buildSearchRoots2(absoluteTestPath, repoRootPath);

  const config = await loadConfig(absoluteTestPath, repoRootPath);
  const guidelinePatterns = config?.guideline_patterns;
  const sidecar = await loadSidecarMetadata(absoluteTestPath, verbose);
  const rawFile = await readFile4(absoluteTestPath, "utf8");
  const rawCases = parseJsonlContent(rawFile, evalFilePath);

  const fallbackDataset = path5.basename(absoluteTestPath, ".jsonl") || "eval";
  const hasSidecarDataset = sidecar.dataset && sidecar.dataset.trim().length > 0;
  const datasetName = hasSidecarDataset ? sidecar.dataset : fallbackDataset;
  const globalEvaluator = coerceEvaluator(sidecar.evaluator, "sidecar") ?? "llm_judge";
  const globalExecution = sidecar.execution;

  if (verbose) {
    console.log(`\n[JSONL Dataset: ${evalFilePath}]`);
    console.log(` Cases: ${rawCases.length}`);
    console.log(` Dataset name: ${datasetName}`);
    if (sidecar.description) {
      console.log(` Description: ${sidecar.description}`);
    }
  }

  const results = [];
  for (const [caseIndex, evalcase] of rawCases.entries()) {
    // NOTE(review): this is the 1-based position among parsed cases; it matches
    // the file line only if the file has no blank lines — confirm.
    const lineNumber = caseIndex + 1;
    const id = asString4(evalcase.id);

    if (filterPattern && (!id || !micromatch2.isMatch(id, filterPattern))) {
      continue;
    }

    const conversationId = asString4(evalcase.conversation_id);
    const outcome = asString4(evalcase.expected_outcome) ?? asString4(evalcase.outcome);
    const inputMessages = resolveInputMessages(evalcase);
    const expectedMessages = resolveExpectedMessages(evalcase) ?? [];

    const isIncomplete = !id || !outcome || !inputMessages || inputMessages.length === 0;
    if (isIncomplete) {
      logError(
        `Skipping incomplete eval case at line ${lineNumber}: ${id ?? "unknown"}. Missing required fields: id, expected_outcome, and/or input_messages (or input)`
      );
      continue;
    }

    // Both arrays are handed to processMessages, which is expected to fill them.
    const guidelinePaths = [];
    const inputTextParts = [];
    const inputSegments = await processMessages({
      messages: inputMessages,
      searchRoots,
      repoRootPath,
      guidelinePatterns,
      guidelinePaths,
      textParts: inputTextParts,
      messageType: "input",
      verbose
    });

    let outputSegments = [];
    if (expectedMessages.length > 0) {
      outputSegments = await processExpectedMessages({
        messages: expectedMessages,
        searchRoots,
        repoRootPath,
        verbose
      });
    }

    // The reference answer is taken from the last expected message: its text,
    // else its structured content, else its tool calls.
    let referenceAnswer = "";
    if (outputSegments.length > 0) {
      const { content, tool_calls: toolCalls } = outputSegments[outputSegments.length - 1];
      if (typeof content === "string") {
        referenceAnswer = content;
      } else if (content !== void 0 && content !== null) {
        referenceAnswer = JSON.stringify(content, null, 2);
      } else if (toolCalls !== void 0 && toolCalls !== null) {
        referenceAnswer = JSON.stringify(toolCalls, null, 2);
      }
    }

    const question = inputTextParts
      .map((part) => part.trim())
      .filter((part) => part.length > 0)
      .join(" ");

    // A case-level execution block replaces the sidecar one; they are not merged.
    const mergedExecution = isJsonObject(evalcase.execution) ? evalcase.execution : globalExecution;
    const evalCaseEvaluatorKind = coerceEvaluator(evalcase.evaluator, id) ?? globalEvaluator;

    let evaluators;
    try {
      evaluators = await parseEvaluators(evalcase, mergedExecution, searchRoots, id);
    } catch (error) {
      const reason = error instanceof Error ? error.message : String(error);
      logError(`Skipping eval case '${id}' at line ${lineNumber}: ${reason}`);
      continue;
    }

    // Inline rubrics become one extra evaluator placed ahead of the others.
    const inlineRubrics = evalcase.rubrics;
    if (Array.isArray(inlineRubrics)) {
      const rubricEvaluator = parseInlineRubrics(inlineRubrics);
      if (rubricEvaluator) {
        evaluators = evaluators ? [rubricEvaluator, ...evaluators] : [rubricEvaluator];
      }
    }

    const userFilePaths = [];
    for (const segment of inputSegments) {
      if (segment.type === "file" && typeof segment.resolvedPath === "string") {
        userFilePaths.push(segment.resolvedPath);
      }
    }

    const resolvedGuidelinePaths = guidelinePaths.map((guidelinePath) => path5.resolve(guidelinePath));
    const testCase = {
      id,
      dataset: datasetName,
      conversation_id: conversationId,
      question,
      input_messages: inputMessages,
      input_segments: inputSegments,
      expected_messages: outputSegments,
      reference_answer: referenceAnswer,
      guideline_paths: resolvedGuidelinePaths,
      guideline_patterns: guidelinePatterns,
      file_paths: [...resolvedGuidelinePaths, ...userFilePaths],
      expected_outcome: outcome,
      evaluator: evalCaseEvaluatorKind,
      evaluators
    };

    if (verbose) {
      console.log(`\n[Eval Case: ${id}]`);
      if (testCase.guideline_paths.length > 0) {
        console.log(` Guidelines used: ${testCase.guideline_paths.length}`);
        for (const guidelinePath of testCase.guideline_paths) {
          console.log(` - ${guidelinePath}`);
        }
      } else {
        console.log(" No guidelines found");
      }
    }

    results.push(testCase);
  }
  return results;
}
|
|
1659
|
+
/**
 * Returns `value` unchanged when it is a primitive string, otherwise `undefined`.
 *
 * @param {unknown} value - Any value.
 * @returns {string | undefined} The string, or `undefined` for every other type.
 */
function asString4(value) {
  if (typeof value !== "string") {
    return void 0;
  }
  return value;
}
|
|
1662
|
+
/**
 * Prints a "Warning: ..." message via `console.warn`, wrapped in the
 * `ANSI_YELLOW5` / `ANSI_RESET5` escape sequences.
 *
 * @param {string} message - Warning text.
 * @param {string[]} [details] - Optional extra lines, appended after the
 *   message and separated by newlines.
 */
function logWarning4(message, details) {
  const hasDetails = details && details.length > 0;
  const text = hasDetails ? `${message}\n${details.join("\n")}` : message;
  console.warn(`${ANSI_YELLOW5}Warning: ${text}${ANSI_RESET5}`);
}
|
|
1671
|
+
/**
 * Prints an "Error: ..." message via `console.error`, wrapped in the
 * `ANSI_RED` / `ANSI_RESET5` escape sequences.
 *
 * @param {string} message - Error text.
 * @param {string[]} [details] - Optional extra lines, appended after the
 *   message and separated by newlines.
 */
function logError(message, details) {
  const hasDetails = details && details.length > 0;
  const text = hasDetails ? `${message}\n${details.join("\n")}` : message;
  console.error(`${ANSI_RED}Error: ${text}${ANSI_RESET5}`);
}
|
|
1680
|
+
|
|
1681
|
+
// src/evaluation/formatting/prompt-builder.ts
|
|
1682
|
+
import { readFile as readFile5 } from "node:fs/promises";
|
|
1683
|
+
import path6 from "node:path";
|
|
1684
|
+
var ANSI_YELLOW6 = "\x1B[33m";
|
|
1685
|
+
var ANSI_RESET6 = "\x1B[0m";
|
|
1194
1686
|
async function buildPromptInputs(testCase, mode = "lm") {
|
|
1195
1687
|
const guidelineParts = [];
|
|
1196
1688
|
for (const rawPath of testCase.guideline_paths) {
|
|
1197
|
-
const absolutePath =
|
|
1689
|
+
const absolutePath = path6.resolve(rawPath);
|
|
1198
1690
|
if (!await fileExists2(absolutePath)) {
|
|
1199
|
-
|
|
1691
|
+
logWarning5(`Could not read guideline file ${absolutePath}: file does not exist`);
|
|
1200
1692
|
continue;
|
|
1201
1693
|
}
|
|
1202
1694
|
try {
|
|
1203
|
-
const content = (await
|
|
1695
|
+
const content = (await readFile5(absolutePath, "utf8")).replace(/\r\n/g, "\n").trim();
|
|
1204
1696
|
guidelineParts.push({
|
|
1205
1697
|
content,
|
|
1206
1698
|
isFile: true,
|
|
1207
|
-
displayPath:
|
|
1699
|
+
displayPath: path6.basename(absolutePath)
|
|
1208
1700
|
});
|
|
1209
1701
|
} catch (error) {
|
|
1210
|
-
|
|
1702
|
+
logWarning5(`Could not read guideline file ${absolutePath}: ${error.message}`);
|
|
1211
1703
|
}
|
|
1212
1704
|
}
|
|
1213
1705
|
const guidelines = formatFileContents(guidelineParts);
|
|
@@ -1231,9 +1723,9 @@ async function buildPromptInputs(testCase, mode = "lm") {
|
|
|
1231
1723
|
messageSegments.push({ type: "text", value: segment });
|
|
1232
1724
|
}
|
|
1233
1725
|
} else if (isJsonObject(segment)) {
|
|
1234
|
-
const type =
|
|
1726
|
+
const type = asString5(segment.type);
|
|
1235
1727
|
if (type === "file") {
|
|
1236
|
-
const value =
|
|
1728
|
+
const value = asString5(segment.value);
|
|
1237
1729
|
if (!value) continue;
|
|
1238
1730
|
if (testCase.guideline_patterns && isGuidelineFile(value, testCase.guideline_patterns)) {
|
|
1239
1731
|
messageSegments.push({ type: "guideline_ref", path: value });
|
|
@@ -1244,7 +1736,7 @@ async function buildPromptInputs(testCase, mode = "lm") {
|
|
|
1244
1736
|
messageSegments.push({ type: "file", text: fileText, path: value });
|
|
1245
1737
|
}
|
|
1246
1738
|
} else if (type === "text") {
|
|
1247
|
-
const textValue =
|
|
1739
|
+
const textValue = asString5(segment.value);
|
|
1248
1740
|
if (textValue && textValue.trim().length > 0) {
|
|
1249
1741
|
messageSegments.push({ type: "text", value: textValue });
|
|
1250
1742
|
}
|
|
@@ -1398,21 +1890,21 @@ ${guidelineContent.trim()}`);
|
|
|
1398
1890
|
}
|
|
1399
1891
|
return chatPrompt.length > 0 ? chatPrompt : void 0;
|
|
1400
1892
|
}
|
|
1401
|
-
function
|
|
1893
|
+
function asString5(value) {
|
|
1402
1894
|
return typeof value === "string" ? value : void 0;
|
|
1403
1895
|
}
|
|
1404
|
-
function
|
|
1405
|
-
console.warn(`${
|
|
1896
|
+
function logWarning5(message) {
|
|
1897
|
+
console.warn(`${ANSI_YELLOW6}Warning: ${message}${ANSI_RESET6}`);
|
|
1406
1898
|
}
|
|
1407
1899
|
|
|
1408
1900
|
// src/evaluation/yaml-parser.ts
|
|
1409
|
-
var
|
|
1410
|
-
var
|
|
1411
|
-
var
|
|
1901
|
+
var ANSI_YELLOW7 = "\x1B[33m";
|
|
1902
|
+
var ANSI_RED2 = "\x1B[31m";
|
|
1903
|
+
var ANSI_RESET7 = "\x1B[0m";
|
|
1412
1904
|
async function readTestSuiteMetadata(testFilePath) {
|
|
1413
1905
|
try {
|
|
1414
|
-
const absolutePath =
|
|
1415
|
-
const content = await
|
|
1906
|
+
const absolutePath = path7.resolve(testFilePath);
|
|
1907
|
+
const content = await readFile6(absolutePath, "utf8");
|
|
1416
1908
|
const parsed = parse2(content);
|
|
1417
1909
|
if (!isJsonObject(parsed)) {
|
|
1418
1910
|
return {};
|
|
@@ -1423,21 +1915,25 @@ async function readTestSuiteMetadata(testFilePath) {
|
|
|
1423
1915
|
}
|
|
1424
1916
|
}
|
|
1425
1917
|
async function loadEvalCases(evalFilePath, repoRoot, options) {
|
|
1918
|
+
const format = detectFormat(evalFilePath);
|
|
1919
|
+
if (format === "jsonl") {
|
|
1920
|
+
return loadEvalCasesFromJsonl(evalFilePath, repoRoot, options);
|
|
1921
|
+
}
|
|
1426
1922
|
const verbose = options?.verbose ?? false;
|
|
1427
|
-
const
|
|
1428
|
-
const absoluteTestPath =
|
|
1923
|
+
const filterPattern = options?.filter;
|
|
1924
|
+
const absoluteTestPath = path7.resolve(evalFilePath);
|
|
1429
1925
|
const repoRootPath = resolveToAbsolutePath(repoRoot);
|
|
1430
1926
|
const searchRoots = buildSearchRoots2(absoluteTestPath, repoRootPath);
|
|
1431
1927
|
const config = await loadConfig(absoluteTestPath, repoRootPath);
|
|
1432
1928
|
const guidelinePatterns = config?.guideline_patterns;
|
|
1433
|
-
const rawFile = await
|
|
1929
|
+
const rawFile = await readFile6(absoluteTestPath, "utf8");
|
|
1434
1930
|
const parsed = parse2(rawFile);
|
|
1435
1931
|
if (!isJsonObject(parsed)) {
|
|
1436
1932
|
throw new Error(`Invalid test file format: ${evalFilePath}`);
|
|
1437
1933
|
}
|
|
1438
1934
|
const suite = parsed;
|
|
1439
|
-
const datasetNameFromSuite =
|
|
1440
|
-
const fallbackDataset =
|
|
1935
|
+
const datasetNameFromSuite = asString6(suite.dataset)?.trim();
|
|
1936
|
+
const fallbackDataset = path7.basename(absoluteTestPath).replace(/\.ya?ml$/i, "") || "eval";
|
|
1441
1937
|
const datasetName = datasetNameFromSuite && datasetNameFromSuite.length > 0 ? datasetNameFromSuite : fallbackDataset;
|
|
1442
1938
|
const rawTestcases = suite.evalcases;
|
|
1443
1939
|
if (!Array.isArray(rawTestcases)) {
|
|
@@ -1445,37 +1941,29 @@ async function loadEvalCases(evalFilePath, repoRoot, options) {
|
|
|
1445
1941
|
}
|
|
1446
1942
|
const globalEvaluator = coerceEvaluator(suite.evaluator, "global") ?? "llm_judge";
|
|
1447
1943
|
const globalExecution = isJsonObject(suite.execution) ? suite.execution : void 0;
|
|
1448
|
-
const _globalTarget =
|
|
1944
|
+
const _globalTarget = asString6(globalExecution?.target) ?? asString6(suite.target);
|
|
1449
1945
|
const results = [];
|
|
1450
1946
|
for (const rawEvalcase of rawTestcases) {
|
|
1451
1947
|
if (!isJsonObject(rawEvalcase)) {
|
|
1452
|
-
|
|
1948
|
+
logWarning6("Skipping invalid eval case entry (expected object)");
|
|
1453
1949
|
continue;
|
|
1454
1950
|
}
|
|
1455
1951
|
const evalcase = rawEvalcase;
|
|
1456
|
-
const id =
|
|
1457
|
-
if (
|
|
1952
|
+
const id = asString6(evalcase.id);
|
|
1953
|
+
if (filterPattern && (!id || !micromatch3.isMatch(id, filterPattern))) {
|
|
1458
1954
|
continue;
|
|
1459
1955
|
}
|
|
1460
|
-
const conversationId =
|
|
1461
|
-
const outcome =
|
|
1462
|
-
const
|
|
1463
|
-
const
|
|
1464
|
-
if (!id || !outcome || !
|
|
1465
|
-
|
|
1466
|
-
`Skipping incomplete eval case: ${id ?? "unknown"}. Missing required fields: id, outcome, and/or input_messages`
|
|
1956
|
+
const conversationId = asString6(evalcase.conversation_id);
|
|
1957
|
+
const outcome = asString6(evalcase.expected_outcome) ?? asString6(evalcase.outcome);
|
|
1958
|
+
const inputMessages = resolveInputMessages(evalcase);
|
|
1959
|
+
const expectedMessages = resolveExpectedMessages(evalcase) ?? [];
|
|
1960
|
+
if (!id || !outcome || !inputMessages || inputMessages.length === 0) {
|
|
1961
|
+
logError2(
|
|
1962
|
+
`Skipping incomplete eval case: ${id ?? "unknown"}. Missing required fields: id, outcome, and/or input_messages (or input)`
|
|
1467
1963
|
);
|
|
1468
1964
|
continue;
|
|
1469
1965
|
}
|
|
1470
|
-
const hasExpectedMessages =
|
|
1471
|
-
const inputMessages = inputMessagesValue.filter(
|
|
1472
|
-
(msg) => isTestMessage(msg)
|
|
1473
|
-
);
|
|
1474
|
-
const expectedMessages = hasExpectedMessages ? expectedMessagesValue.filter((msg) => isTestMessage(msg)) : [];
|
|
1475
|
-
if (hasExpectedMessages && expectedMessages.length === 0) {
|
|
1476
|
-
logError(`No valid expected message found for eval case: ${id}`);
|
|
1477
|
-
continue;
|
|
1478
|
-
}
|
|
1966
|
+
const hasExpectedMessages = expectedMessages.length > 0;
|
|
1479
1967
|
const guidelinePaths = [];
|
|
1480
1968
|
const inputTextParts = [];
|
|
1481
1969
|
const inputSegments = await processMessages({
|
|
@@ -1514,33 +2002,13 @@ async function loadEvalCases(evalFilePath, repoRoot, options) {
|
|
|
1514
2002
|
evaluators = await parseEvaluators(evalcase, globalExecution, searchRoots, id ?? "unknown");
|
|
1515
2003
|
} catch (error) {
|
|
1516
2004
|
const message = error instanceof Error ? error.message : String(error);
|
|
1517
|
-
|
|
2005
|
+
logError2(`Skipping eval case '${id}': ${message}`);
|
|
1518
2006
|
continue;
|
|
1519
2007
|
}
|
|
1520
2008
|
const inlineRubrics = evalcase.rubrics;
|
|
1521
2009
|
if (inlineRubrics !== void 0 && Array.isArray(inlineRubrics)) {
|
|
1522
|
-
const
|
|
1523
|
-
|
|
1524
|
-
return {
|
|
1525
|
-
id: `rubric-${index + 1}`,
|
|
1526
|
-
description: rubric,
|
|
1527
|
-
weight: 1,
|
|
1528
|
-
required: true
|
|
1529
|
-
};
|
|
1530
|
-
}
|
|
1531
|
-
return {
|
|
1532
|
-
id: asString5(rubric.id) ?? `rubric-${index + 1}`,
|
|
1533
|
-
description: asString5(rubric.description) ?? "",
|
|
1534
|
-
weight: typeof rubric.weight === "number" ? rubric.weight : 1,
|
|
1535
|
-
required: typeof rubric.required === "boolean" ? rubric.required : true
|
|
1536
|
-
};
|
|
1537
|
-
}).filter((r) => r.description.length > 0);
|
|
1538
|
-
if (rubricItems.length > 0) {
|
|
1539
|
-
const rubricEvaluator = {
|
|
1540
|
-
name: "rubric",
|
|
1541
|
-
type: "llm_judge",
|
|
1542
|
-
rubrics: rubricItems
|
|
1543
|
-
};
|
|
2010
|
+
const rubricEvaluator = parseInlineRubrics(inlineRubrics);
|
|
2011
|
+
if (rubricEvaluator) {
|
|
1544
2012
|
evaluators = evaluators ? [rubricEvaluator, ...evaluators] : [rubricEvaluator];
|
|
1545
2013
|
}
|
|
1546
2014
|
}
|
|
@@ -1551,7 +2019,7 @@ async function loadEvalCases(evalFilePath, repoRoot, options) {
|
|
|
1551
2019
|
}
|
|
1552
2020
|
}
|
|
1553
2021
|
const allFilePaths = [
|
|
1554
|
-
...guidelinePaths.map((guidelinePath) =>
|
|
2022
|
+
...guidelinePaths.map((guidelinePath) => path7.resolve(guidelinePath)),
|
|
1555
2023
|
...userFilePaths
|
|
1556
2024
|
];
|
|
1557
2025
|
const testCase = {
|
|
@@ -1563,7 +2031,7 @@ async function loadEvalCases(evalFilePath, repoRoot, options) {
|
|
|
1563
2031
|
input_segments: inputSegments,
|
|
1564
2032
|
expected_messages: outputSegments,
|
|
1565
2033
|
reference_answer: referenceAnswer,
|
|
1566
|
-
guideline_paths: guidelinePaths.map((guidelinePath) =>
|
|
2034
|
+
guideline_paths: guidelinePaths.map((guidelinePath) => path7.resolve(guidelinePath)),
|
|
1567
2035
|
guideline_patterns: guidelinePatterns,
|
|
1568
2036
|
file_paths: allFilePaths,
|
|
1569
2037
|
expected_outcome: outcome,
|
|
@@ -1586,25 +2054,25 @@ async function loadEvalCases(evalFilePath, repoRoot, options) {
|
|
|
1586
2054
|
}
|
|
1587
2055
|
return results;
|
|
1588
2056
|
}
|
|
1589
|
-
function
|
|
2057
|
+
function asString6(value) {
|
|
1590
2058
|
return typeof value === "string" ? value : void 0;
|
|
1591
2059
|
}
|
|
1592
|
-
function
|
|
2060
|
+
function logWarning6(message, details) {
|
|
1593
2061
|
if (details && details.length > 0) {
|
|
1594
2062
|
const detailBlock = details.join("\n");
|
|
1595
|
-
console.warn(`${
|
|
1596
|
-
${detailBlock}${
|
|
2063
|
+
console.warn(`${ANSI_YELLOW7}Warning: ${message}
|
|
2064
|
+
${detailBlock}${ANSI_RESET7}`);
|
|
1597
2065
|
} else {
|
|
1598
|
-
console.warn(`${
|
|
2066
|
+
console.warn(`${ANSI_YELLOW7}Warning: ${message}${ANSI_RESET7}`);
|
|
1599
2067
|
}
|
|
1600
2068
|
}
|
|
1601
|
-
function
|
|
2069
|
+
function logError2(message, details) {
|
|
1602
2070
|
if (details && details.length > 0) {
|
|
1603
2071
|
const detailBlock = details.join("\n");
|
|
1604
|
-
console.error(`${
|
|
1605
|
-
${detailBlock}${
|
|
2072
|
+
console.error(`${ANSI_RED2}Error: ${message}
|
|
2073
|
+
${detailBlock}${ANSI_RESET7}`);
|
|
1606
2074
|
} else {
|
|
1607
|
-
console.error(`${
|
|
2075
|
+
console.error(`${ANSI_RED2}Error: ${message}${ANSI_RESET7}`);
|
|
1608
2076
|
}
|
|
1609
2077
|
}
|
|
1610
2078
|
|
|
@@ -1947,7 +2415,7 @@ import { randomUUID } from "node:crypto";
|
|
|
1947
2415
|
import { createWriteStream } from "node:fs";
|
|
1948
2416
|
import { mkdir, mkdtemp, rm, writeFile } from "node:fs/promises";
|
|
1949
2417
|
import { tmpdir } from "node:os";
|
|
1950
|
-
import
|
|
2418
|
+
import path9 from "node:path";
|
|
1951
2419
|
|
|
1952
2420
|
// src/evaluation/providers/claude-code-log-tracker.ts
|
|
1953
2421
|
var GLOBAL_LOGS_KEY = Symbol.for("agentv.claudeCodeLogs");
|
|
@@ -2003,7 +2471,7 @@ function subscribeToClaudeCodeLogEntries(listener) {
|
|
|
2003
2471
|
}
|
|
2004
2472
|
|
|
2005
2473
|
// src/evaluation/providers/preread.ts
|
|
2006
|
-
import
|
|
2474
|
+
import path8 from "node:path";
|
|
2007
2475
|
function buildPromptDocument(request, inputFiles, options) {
|
|
2008
2476
|
const parts = [];
|
|
2009
2477
|
const guidelineFiles = collectGuidelineFiles(
|
|
@@ -2026,7 +2494,7 @@ function normalizeInputFiles(inputFiles) {
|
|
|
2026
2494
|
}
|
|
2027
2495
|
const deduped = /* @__PURE__ */ new Map();
|
|
2028
2496
|
for (const inputFile of inputFiles) {
|
|
2029
|
-
const absolutePath =
|
|
2497
|
+
const absolutePath = path8.resolve(inputFile);
|
|
2030
2498
|
if (!deduped.has(absolutePath)) {
|
|
2031
2499
|
deduped.set(absolutePath, absolutePath);
|
|
2032
2500
|
}
|
|
@@ -2039,14 +2507,14 @@ function collectGuidelineFiles(inputFiles, guidelinePatterns, overrides) {
|
|
|
2039
2507
|
}
|
|
2040
2508
|
const unique = /* @__PURE__ */ new Map();
|
|
2041
2509
|
for (const inputFile of inputFiles) {
|
|
2042
|
-
const absolutePath =
|
|
2510
|
+
const absolutePath = path8.resolve(inputFile);
|
|
2043
2511
|
if (overrides?.has(absolutePath)) {
|
|
2044
2512
|
if (!unique.has(absolutePath)) {
|
|
2045
2513
|
unique.set(absolutePath, absolutePath);
|
|
2046
2514
|
}
|
|
2047
2515
|
continue;
|
|
2048
2516
|
}
|
|
2049
|
-
const normalized = absolutePath.split(
|
|
2517
|
+
const normalized = absolutePath.split(path8.sep).join("/");
|
|
2050
2518
|
if (isGuidelineFile(normalized, guidelinePatterns)) {
|
|
2051
2519
|
if (!unique.has(absolutePath)) {
|
|
2052
2520
|
unique.set(absolutePath, absolutePath);
|
|
@@ -2061,7 +2529,7 @@ function collectInputFiles(inputFiles) {
|
|
|
2061
2529
|
}
|
|
2062
2530
|
const unique = /* @__PURE__ */ new Map();
|
|
2063
2531
|
for (const inputFile of inputFiles) {
|
|
2064
|
-
const absolutePath =
|
|
2532
|
+
const absolutePath = path8.resolve(inputFile);
|
|
2065
2533
|
if (!unique.has(absolutePath)) {
|
|
2066
2534
|
unique.set(absolutePath, absolutePath);
|
|
2067
2535
|
}
|
|
@@ -2073,7 +2541,7 @@ function buildMandatoryPrereadBlock(guidelineFiles, inputFiles) {
|
|
|
2073
2541
|
return "";
|
|
2074
2542
|
}
|
|
2075
2543
|
const buildList = (files) => files.map((absolutePath) => {
|
|
2076
|
-
const fileName =
|
|
2544
|
+
const fileName = path8.basename(absolutePath);
|
|
2077
2545
|
const fileUri = pathToFileUri(absolutePath);
|
|
2078
2546
|
return `* [${fileName}](${fileUri})`;
|
|
2079
2547
|
});
|
|
@@ -2093,7 +2561,7 @@ ${buildList(inputFiles).join("\n")}.`);
|
|
|
2093
2561
|
return sections.join("\n");
|
|
2094
2562
|
}
|
|
2095
2563
|
function pathToFileUri(filePath) {
|
|
2096
|
-
const absolutePath =
|
|
2564
|
+
const absolutePath = path8.isAbsolute(filePath) ? filePath : path8.resolve(filePath);
|
|
2097
2565
|
const normalizedPath = absolutePath.replace(/\\/g, "/");
|
|
2098
2566
|
if (/^[a-zA-Z]:\//.test(normalizedPath)) {
|
|
2099
2567
|
return `file:///${normalizedPath}`;
|
|
@@ -2130,7 +2598,7 @@ var ClaudeCodeProvider = class {
|
|
|
2130
2598
|
const workspaceRoot = await this.createWorkspace();
|
|
2131
2599
|
const logger = await this.createStreamLogger(request).catch(() => void 0);
|
|
2132
2600
|
try {
|
|
2133
|
-
const promptFile =
|
|
2601
|
+
const promptFile = path9.join(workspaceRoot, PROMPT_FILENAME);
|
|
2134
2602
|
await writeFile(promptFile, request.question, "utf8");
|
|
2135
2603
|
const args = this.buildClaudeCodeArgs(request.question, inputFiles);
|
|
2136
2604
|
const cwd = this.resolveCwd();
|
|
@@ -2178,7 +2646,7 @@ var ClaudeCodeProvider = class {
|
|
|
2178
2646
|
if (!this.config.cwd) {
|
|
2179
2647
|
return process.cwd();
|
|
2180
2648
|
}
|
|
2181
|
-
return
|
|
2649
|
+
return path9.resolve(this.config.cwd);
|
|
2182
2650
|
}
|
|
2183
2651
|
buildClaudeCodeArgs(prompt, inputFiles) {
|
|
2184
2652
|
const args = [];
|
|
@@ -2235,7 +2703,7 @@ ${filesContext}`;
|
|
|
2235
2703
|
}
|
|
2236
2704
|
}
|
|
2237
2705
|
async createWorkspace() {
|
|
2238
|
-
return await mkdtemp(
|
|
2706
|
+
return await mkdtemp(path9.join(tmpdir(), WORKSPACE_PREFIX));
|
|
2239
2707
|
}
|
|
2240
2708
|
async cleanupWorkspace(workspaceRoot) {
|
|
2241
2709
|
try {
|
|
@@ -2249,9 +2717,9 @@ ${filesContext}`;
|
|
|
2249
2717
|
return void 0;
|
|
2250
2718
|
}
|
|
2251
2719
|
if (this.config.logDir) {
|
|
2252
|
-
return
|
|
2720
|
+
return path9.resolve(this.config.logDir);
|
|
2253
2721
|
}
|
|
2254
|
-
return
|
|
2722
|
+
return path9.join(process.cwd(), ".agentv", "logs", "claude-code");
|
|
2255
2723
|
}
|
|
2256
2724
|
async createStreamLogger(request) {
|
|
2257
2725
|
const logDir = this.resolveLogDirectory();
|
|
@@ -2265,7 +2733,7 @@ ${filesContext}`;
|
|
|
2265
2733
|
console.warn(`Skipping Claude Code stream logging (could not create ${logDir}): ${message}`);
|
|
2266
2734
|
return void 0;
|
|
2267
2735
|
}
|
|
2268
|
-
const filePath =
|
|
2736
|
+
const filePath = path9.join(logDir, buildLogFilename(request, this.targetName));
|
|
2269
2737
|
try {
|
|
2270
2738
|
const logger = await ClaudeCodeStreamLogger.create({
|
|
2271
2739
|
filePath,
|
|
@@ -2670,10 +3138,10 @@ function escapeShellArg(arg) {
|
|
|
2670
3138
|
}
|
|
2671
3139
|
async function defaultClaudeCodeRunner(options) {
|
|
2672
3140
|
const tempId = randomUUID();
|
|
2673
|
-
const stdoutFile =
|
|
2674
|
-
const stderrFile =
|
|
2675
|
-
const exitFile =
|
|
2676
|
-
const pidFile =
|
|
3141
|
+
const stdoutFile = path9.join(tmpdir(), `agentv-cc-${tempId}-stdout`);
|
|
3142
|
+
const stderrFile = path9.join(tmpdir(), `agentv-cc-${tempId}-stderr`);
|
|
3143
|
+
const exitFile = path9.join(tmpdir(), `agentv-cc-${tempId}-exit`);
|
|
3144
|
+
const pidFile = path9.join(tmpdir(), `agentv-cc-${tempId}-pid`);
|
|
2677
3145
|
try {
|
|
2678
3146
|
return await runClaudeCodeWithTempFiles(options, stdoutFile, stderrFile, exitFile, pidFile);
|
|
2679
3147
|
} finally {
|
|
@@ -2713,8 +3181,8 @@ async function runClaudeCodeWithTempFiles(options, stdoutFile, stderrFile, exitF
|
|
|
2713
3181
|
let lastStdoutSize = 0;
|
|
2714
3182
|
const readFileIfExists = async (filePath) => {
|
|
2715
3183
|
try {
|
|
2716
|
-
const { readFile:
|
|
2717
|
-
return await
|
|
3184
|
+
const { readFile: readFile8 } = await import("node:fs/promises");
|
|
3185
|
+
return await readFile8(filePath, "utf8");
|
|
2718
3186
|
} catch {
|
|
2719
3187
|
return "";
|
|
2720
3188
|
}
|
|
@@ -2789,7 +3257,7 @@ async function runClaudeCodeWithTempFiles(options, stdoutFile, stderrFile, exitF
|
|
|
2789
3257
|
import { exec as execWithCallback } from "node:child_process";
|
|
2790
3258
|
import fs from "node:fs/promises";
|
|
2791
3259
|
import os from "node:os";
|
|
2792
|
-
import
|
|
3260
|
+
import path10 from "node:path";
|
|
2793
3261
|
import { promisify } from "node:util";
|
|
2794
3262
|
import { z } from "zod";
|
|
2795
3263
|
var ToolCallSchema = z.object({
|
|
@@ -2797,7 +3265,8 @@ var ToolCallSchema = z.object({
|
|
|
2797
3265
|
input: z.unknown().optional(),
|
|
2798
3266
|
output: z.unknown().optional(),
|
|
2799
3267
|
id: z.string().optional(),
|
|
2800
|
-
timestamp: z.string().optional()
|
|
3268
|
+
timestamp: z.string().optional(),
|
|
3269
|
+
duration_ms: z.number().optional()
|
|
2801
3270
|
});
|
|
2802
3271
|
var OutputMessageInputSchema = z.object({
|
|
2803
3272
|
role: z.string(),
|
|
@@ -2805,6 +3274,7 @@ var OutputMessageInputSchema = z.object({
|
|
|
2805
3274
|
content: z.unknown().optional(),
|
|
2806
3275
|
tool_calls: z.array(ToolCallSchema).optional(),
|
|
2807
3276
|
timestamp: z.string().optional(),
|
|
3277
|
+
duration_ms: z.number().optional(),
|
|
2808
3278
|
metadata: z.record(z.unknown()).optional()
|
|
2809
3279
|
});
|
|
2810
3280
|
var TokenUsageSchema = z.object({
|
|
@@ -2843,8 +3313,16 @@ function convertOutputMessages(messages) {
|
|
|
2843
3313
|
role: msg.role,
|
|
2844
3314
|
name: msg.name,
|
|
2845
3315
|
content: msg.content,
|
|
2846
|
-
toolCalls: msg.tool_calls
|
|
3316
|
+
toolCalls: msg.tool_calls?.map((tc) => ({
|
|
3317
|
+
tool: tc.tool,
|
|
3318
|
+
input: tc.input,
|
|
3319
|
+
output: tc.output,
|
|
3320
|
+
id: tc.id,
|
|
3321
|
+
timestamp: tc.timestamp,
|
|
3322
|
+
durationMs: tc.duration_ms
|
|
3323
|
+
})),
|
|
2847
3324
|
timestamp: msg.timestamp,
|
|
3325
|
+
durationMs: msg.duration_ms,
|
|
2848
3326
|
metadata: msg.metadata
|
|
2849
3327
|
}));
|
|
2850
3328
|
}
|
|
@@ -3246,7 +3724,7 @@ function normalizeInputFiles2(inputFiles) {
|
|
|
3246
3724
|
}
|
|
3247
3725
|
const unique = /* @__PURE__ */ new Map();
|
|
3248
3726
|
for (const inputFile of inputFiles) {
|
|
3249
|
-
const absolutePath =
|
|
3727
|
+
const absolutePath = path10.resolve(inputFile);
|
|
3250
3728
|
if (!unique.has(absolutePath)) {
|
|
3251
3729
|
unique.set(absolutePath, absolutePath);
|
|
3252
3730
|
}
|
|
@@ -3260,7 +3738,7 @@ function formatFileList(files, template) {
|
|
|
3260
3738
|
const formatter = template ?? "{path}";
|
|
3261
3739
|
return files.map((filePath) => {
|
|
3262
3740
|
const escapedPath = shellEscape(filePath);
|
|
3263
|
-
const escapedName = shellEscape(
|
|
3741
|
+
const escapedName = shellEscape(path10.basename(filePath));
|
|
3264
3742
|
return formatter.replaceAll("{path}", escapedPath).replaceAll("{basename}", escapedName);
|
|
3265
3743
|
}).join(" ");
|
|
3266
3744
|
}
|
|
@@ -3284,7 +3762,7 @@ function generateOutputFilePath(evalCaseId, extension = ".json") {
|
|
|
3284
3762
|
const safeEvalId = evalCaseId || "unknown";
|
|
3285
3763
|
const timestamp = Date.now();
|
|
3286
3764
|
const random = Math.random().toString(36).substring(2, 9);
|
|
3287
|
-
return
|
|
3765
|
+
return path10.join(os.tmpdir(), `agentv-${safeEvalId}-${timestamp}-${random}${extension}`);
|
|
3288
3766
|
}
|
|
3289
3767
|
function formatTimeoutSuffix2(timeoutMs) {
|
|
3290
3768
|
if (!timeoutMs || timeoutMs <= 0) {
|
|
@@ -3300,7 +3778,7 @@ import { randomUUID as randomUUID2 } from "node:crypto";
|
|
|
3300
3778
|
import { constants as constants2, createWriteStream as createWriteStream2 } from "node:fs";
|
|
3301
3779
|
import { access as access2, mkdir as mkdir2, mkdtemp as mkdtemp2, rm as rm2, writeFile as writeFile2 } from "node:fs/promises";
|
|
3302
3780
|
import { tmpdir as tmpdir2 } from "node:os";
|
|
3303
|
-
import
|
|
3781
|
+
import path11 from "node:path";
|
|
3304
3782
|
import { promisify as promisify2 } from "node:util";
|
|
3305
3783
|
|
|
3306
3784
|
// src/evaluation/providers/codex-log-tracker.ts
|
|
@@ -3395,7 +3873,7 @@ var CodexProvider = class {
|
|
|
3395
3873
|
const promptContent = `${systemPrompt}
|
|
3396
3874
|
|
|
3397
3875
|
${basePrompt}`;
|
|
3398
|
-
const promptFile =
|
|
3876
|
+
const promptFile = path11.join(workspaceRoot, PROMPT_FILENAME2);
|
|
3399
3877
|
await writeFile2(promptFile, promptContent, "utf8");
|
|
3400
3878
|
const args = this.buildCodexArgs();
|
|
3401
3879
|
const cwd = this.resolveCwd(workspaceRoot);
|
|
@@ -3445,7 +3923,7 @@ ${basePrompt}`;
|
|
|
3445
3923
|
if (!this.config.cwd) {
|
|
3446
3924
|
return workspaceRoot;
|
|
3447
3925
|
}
|
|
3448
|
-
return
|
|
3926
|
+
return path11.resolve(this.config.cwd);
|
|
3449
3927
|
}
|
|
3450
3928
|
buildCodexArgs() {
|
|
3451
3929
|
const args = [
|
|
@@ -3487,7 +3965,7 @@ ${basePrompt}`;
|
|
|
3487
3965
|
}
|
|
3488
3966
|
}
|
|
3489
3967
|
async createWorkspace() {
|
|
3490
|
-
return await mkdtemp2(
|
|
3968
|
+
return await mkdtemp2(path11.join(tmpdir2(), WORKSPACE_PREFIX2));
|
|
3491
3969
|
}
|
|
3492
3970
|
async cleanupWorkspace(workspaceRoot) {
|
|
3493
3971
|
try {
|
|
@@ -3501,9 +3979,9 @@ ${basePrompt}`;
|
|
|
3501
3979
|
return void 0;
|
|
3502
3980
|
}
|
|
3503
3981
|
if (this.config.logDir) {
|
|
3504
|
-
return
|
|
3982
|
+
return path11.resolve(this.config.logDir);
|
|
3505
3983
|
}
|
|
3506
|
-
return
|
|
3984
|
+
return path11.join(process.cwd(), ".agentv", "logs", "codex");
|
|
3507
3985
|
}
|
|
3508
3986
|
async createStreamLogger(request) {
|
|
3509
3987
|
const logDir = this.resolveLogDirectory();
|
|
@@ -3517,7 +3995,7 @@ ${basePrompt}`;
|
|
|
3517
3995
|
console.warn(`Skipping Codex stream logging (could not create ${logDir}): ${message}`);
|
|
3518
3996
|
return void 0;
|
|
3519
3997
|
}
|
|
3520
|
-
const filePath =
|
|
3998
|
+
const filePath = path11.join(logDir, buildLogFilename2(request, this.targetName));
|
|
3521
3999
|
try {
|
|
3522
4000
|
const logger = await CodexStreamLogger.create({
|
|
3523
4001
|
filePath,
|
|
@@ -3732,7 +4210,7 @@ function tryParseJsonValue2(rawLine) {
|
|
|
3732
4210
|
async function locateExecutable(candidate) {
|
|
3733
4211
|
const includesPathSeparator = candidate.includes("/") || candidate.includes("\\");
|
|
3734
4212
|
if (includesPathSeparator) {
|
|
3735
|
-
const resolved =
|
|
4213
|
+
const resolved = path11.isAbsolute(candidate) ? candidate : path11.resolve(candidate);
|
|
3736
4214
|
const executablePath = await ensureWindowsExecutableVariant(resolved);
|
|
3737
4215
|
await access2(executablePath, constants2.F_OK);
|
|
3738
4216
|
return executablePath;
|
|
@@ -4245,7 +4723,7 @@ import { randomUUID as randomUUID3 } from "node:crypto";
|
|
|
4245
4723
|
import { createWriteStream as createWriteStream3 } from "node:fs";
|
|
4246
4724
|
import { mkdir as mkdir3, mkdtemp as mkdtemp3, rm as rm3, writeFile as writeFile3 } from "node:fs/promises";
|
|
4247
4725
|
import { tmpdir as tmpdir3 } from "node:os";
|
|
4248
|
-
import
|
|
4726
|
+
import path12 from "node:path";
|
|
4249
4727
|
|
|
4250
4728
|
// src/evaluation/providers/pi-log-tracker.ts
|
|
4251
4729
|
var GLOBAL_LOGS_KEY3 = Symbol.for("agentv.piLogs");
|
|
@@ -4329,7 +4807,7 @@ var PiCodingAgentProvider = class {
|
|
|
4329
4807
|
const workspaceRoot = await this.createWorkspace();
|
|
4330
4808
|
const logger = await this.createStreamLogger(request).catch(() => void 0);
|
|
4331
4809
|
try {
|
|
4332
|
-
const promptFile =
|
|
4810
|
+
const promptFile = path12.join(workspaceRoot, PROMPT_FILENAME3);
|
|
4333
4811
|
await writeFile3(promptFile, request.question, "utf8");
|
|
4334
4812
|
const args = this.buildPiArgs(request.question, inputFiles);
|
|
4335
4813
|
const cwd = this.resolveCwd(workspaceRoot);
|
|
@@ -4371,7 +4849,7 @@ var PiCodingAgentProvider = class {
|
|
|
4371
4849
|
if (!this.config.cwd) {
|
|
4372
4850
|
return workspaceRoot;
|
|
4373
4851
|
}
|
|
4374
|
-
return
|
|
4852
|
+
return path12.resolve(this.config.cwd);
|
|
4375
4853
|
}
|
|
4376
4854
|
buildPiArgs(prompt, inputFiles) {
|
|
4377
4855
|
const args = [];
|
|
@@ -4460,7 +4938,7 @@ ${prompt}`;
|
|
|
4460
4938
|
return env;
|
|
4461
4939
|
}
|
|
4462
4940
|
async createWorkspace() {
|
|
4463
|
-
return await mkdtemp3(
|
|
4941
|
+
return await mkdtemp3(path12.join(tmpdir3(), WORKSPACE_PREFIX3));
|
|
4464
4942
|
}
|
|
4465
4943
|
async cleanupWorkspace(workspaceRoot) {
|
|
4466
4944
|
try {
|
|
@@ -4470,9 +4948,9 @@ ${prompt}`;
|
|
|
4470
4948
|
}
|
|
4471
4949
|
resolveLogDirectory() {
|
|
4472
4950
|
if (this.config.logDir) {
|
|
4473
|
-
return
|
|
4951
|
+
return path12.resolve(this.config.logDir);
|
|
4474
4952
|
}
|
|
4475
|
-
return
|
|
4953
|
+
return path12.join(process.cwd(), ".agentv", "logs", "pi-coding-agent");
|
|
4476
4954
|
}
|
|
4477
4955
|
async createStreamLogger(request) {
|
|
4478
4956
|
const logDir = this.resolveLogDirectory();
|
|
@@ -4486,7 +4964,7 @@ ${prompt}`;
|
|
|
4486
4964
|
console.warn(`Skipping Pi stream logging (could not create ${logDir}): ${message}`);
|
|
4487
4965
|
return void 0;
|
|
4488
4966
|
}
|
|
4489
|
-
const filePath =
|
|
4967
|
+
const filePath = path12.join(logDir, buildLogFilename3(request, this.targetName));
|
|
4490
4968
|
try {
|
|
4491
4969
|
const logger = await PiStreamLogger.create({
|
|
4492
4970
|
filePath,
|
|
@@ -4919,7 +5397,7 @@ async function defaultPiRunner(options) {
|
|
|
4919
5397
|
}
|
|
4920
5398
|
|
|
4921
5399
|
// src/evaluation/providers/vscode.ts
|
|
4922
|
-
import
|
|
5400
|
+
import path13 from "node:path";
|
|
4923
5401
|
import {
|
|
4924
5402
|
dispatchAgentSession,
|
|
4925
5403
|
dispatchBatchAgent,
|
|
@@ -5094,7 +5572,7 @@ function buildMandatoryPrereadBlock2(guidelineFiles, attachmentFiles) {
|
|
|
5094
5572
|
return "";
|
|
5095
5573
|
}
|
|
5096
5574
|
const buildList = (files) => files.map((absolutePath) => {
|
|
5097
|
-
const fileName =
|
|
5575
|
+
const fileName = path13.basename(absolutePath);
|
|
5098
5576
|
const fileUri = pathToFileUri2(absolutePath);
|
|
5099
5577
|
return `* [${fileName}](${fileUri})`;
|
|
5100
5578
|
});
|
|
@@ -5119,8 +5597,8 @@ function collectGuidelineFiles2(attachments, guidelinePatterns) {
|
|
|
5119
5597
|
}
|
|
5120
5598
|
const unique = /* @__PURE__ */ new Map();
|
|
5121
5599
|
for (const attachment of attachments) {
|
|
5122
|
-
const absolutePath =
|
|
5123
|
-
const normalized = absolutePath.split(
|
|
5600
|
+
const absolutePath = path13.resolve(attachment);
|
|
5601
|
+
const normalized = absolutePath.split(path13.sep).join("/");
|
|
5124
5602
|
if (isGuidelineFile(normalized, guidelinePatterns)) {
|
|
5125
5603
|
if (!unique.has(absolutePath)) {
|
|
5126
5604
|
unique.set(absolutePath, absolutePath);
|
|
@@ -5135,7 +5613,7 @@ function collectAttachmentFiles(attachments) {
|
|
|
5135
5613
|
}
|
|
5136
5614
|
const unique = /* @__PURE__ */ new Map();
|
|
5137
5615
|
for (const attachment of attachments) {
|
|
5138
|
-
const absolutePath =
|
|
5616
|
+
const absolutePath = path13.resolve(attachment);
|
|
5139
5617
|
if (!unique.has(absolutePath)) {
|
|
5140
5618
|
unique.set(absolutePath, absolutePath);
|
|
5141
5619
|
}
|
|
@@ -5143,7 +5621,7 @@ function collectAttachmentFiles(attachments) {
|
|
|
5143
5621
|
return Array.from(unique.values());
|
|
5144
5622
|
}
|
|
5145
5623
|
function pathToFileUri2(filePath) {
|
|
5146
|
-
const absolutePath =
|
|
5624
|
+
const absolutePath = path13.isAbsolute(filePath) ? filePath : path13.resolve(filePath);
|
|
5147
5625
|
const normalizedPath = absolutePath.replace(/\\/g, "/");
|
|
5148
5626
|
if (/^[a-zA-Z]:\//.test(normalizedPath)) {
|
|
5149
5627
|
return `file:///${normalizedPath}`;
|
|
@@ -5156,7 +5634,7 @@ function normalizeAttachments(attachments) {
|
|
|
5156
5634
|
}
|
|
5157
5635
|
const deduped = /* @__PURE__ */ new Set();
|
|
5158
5636
|
for (const attachment of attachments) {
|
|
5159
|
-
deduped.add(
|
|
5637
|
+
deduped.add(path13.resolve(attachment));
|
|
5160
5638
|
}
|
|
5161
5639
|
return Array.from(deduped);
|
|
5162
5640
|
}
|
|
@@ -5165,7 +5643,7 @@ function mergeAttachments(all) {
|
|
|
5165
5643
|
for (const list of all) {
|
|
5166
5644
|
if (!list) continue;
|
|
5167
5645
|
for (const inputFile of list) {
|
|
5168
|
-
deduped.add(
|
|
5646
|
+
deduped.add(path13.resolve(inputFile));
|
|
5169
5647
|
}
|
|
5170
5648
|
}
|
|
5171
5649
|
return deduped.size > 0 ? Array.from(deduped) : void 0;
|
|
@@ -5213,8 +5691,8 @@ total unlocked subagents available: ${result.created.length + result.skippedExis
|
|
|
5213
5691
|
|
|
5214
5692
|
// src/evaluation/providers/targets-file.ts
|
|
5215
5693
|
import { constants as constants3 } from "node:fs";
|
|
5216
|
-
import { access as access3, readFile as
|
|
5217
|
-
import
|
|
5694
|
+
import { access as access3, readFile as readFile7 } from "node:fs/promises";
|
|
5695
|
+
import path14 from "node:path";
|
|
5218
5696
|
import { parse as parse3 } from "yaml";
|
|
5219
5697
|
function isRecord(value) {
|
|
5220
5698
|
return typeof value === "object" && value !== null && !Array.isArray(value);
|
|
@@ -5251,11 +5729,11 @@ async function fileExists3(filePath) {
|
|
|
5251
5729
|
}
|
|
5252
5730
|
}
|
|
5253
5731
|
async function readTargetDefinitions(filePath) {
|
|
5254
|
-
const absolutePath =
|
|
5732
|
+
const absolutePath = path14.resolve(filePath);
|
|
5255
5733
|
if (!await fileExists3(absolutePath)) {
|
|
5256
5734
|
throw new Error(`targets.yaml not found at ${absolutePath}`);
|
|
5257
5735
|
}
|
|
5258
|
-
const raw = await
|
|
5736
|
+
const raw = await readFile7(absolutePath, "utf8");
|
|
5259
5737
|
const parsed = parse3(raw);
|
|
5260
5738
|
if (!isRecord(parsed)) {
|
|
5261
5739
|
throw new Error(`targets.yaml at ${absolutePath} must be a YAML object with a 'targets' field`);
|
|
@@ -5462,15 +5940,15 @@ async function execFileWithStdinNode(argv, stdinPayload, options) {
|
|
|
5462
5940
|
});
|
|
5463
5941
|
}
|
|
5464
5942
|
async function execShellWithStdin(command, stdinPayload, options = {}) {
|
|
5465
|
-
const { mkdir: mkdir4, readFile:
|
|
5943
|
+
const { mkdir: mkdir4, readFile: readFile8, rm: rm4, writeFile: writeFile4 } = await import("node:fs/promises");
|
|
5466
5944
|
const { tmpdir: tmpdir4 } = await import("node:os");
|
|
5467
|
-
const
|
|
5945
|
+
const path16 = await import("node:path");
|
|
5468
5946
|
const { randomUUID: randomUUID4 } = await import("node:crypto");
|
|
5469
|
-
const dir =
|
|
5947
|
+
const dir = path16.join(tmpdir4(), `agentv-exec-${randomUUID4()}`);
|
|
5470
5948
|
await mkdir4(dir, { recursive: true });
|
|
5471
|
-
const stdinPath =
|
|
5472
|
-
const stdoutPath =
|
|
5473
|
-
const stderrPath =
|
|
5949
|
+
const stdinPath = path16.join(dir, "stdin.txt");
|
|
5950
|
+
const stdoutPath = path16.join(dir, "stdout.txt");
|
|
5951
|
+
const stderrPath = path16.join(dir, "stderr.txt");
|
|
5474
5952
|
await writeFile4(stdinPath, stdinPayload, "utf8");
|
|
5475
5953
|
const wrappedCommand = process.platform === "win32" ? `(${command}) < ${shellEscapePath(stdinPath)} > ${shellEscapePath(stdoutPath)} 2> ${shellEscapePath(stderrPath)}` : `(${command}) < ${shellEscapePath(stdinPath)} > ${shellEscapePath(stdoutPath)} 2> ${shellEscapePath(stderrPath)}`;
|
|
5476
5954
|
const { spawn: spawn4 } = await import("node:child_process");
|
|
@@ -5500,8 +5978,8 @@ async function execShellWithStdin(command, stdinPayload, options = {}) {
|
|
|
5500
5978
|
resolve(code ?? 0);
|
|
5501
5979
|
});
|
|
5502
5980
|
});
|
|
5503
|
-
const stdout = (await
|
|
5504
|
-
const stderr = (await
|
|
5981
|
+
const stdout = (await readFile8(stdoutPath, "utf8")).replace(/\r\n/g, "\n");
|
|
5982
|
+
const stderr = (await readFile8(stderrPath, "utf8")).replace(/\r\n/g, "\n");
|
|
5505
5983
|
return { stdout, stderr, exitCode };
|
|
5506
5984
|
} finally {
|
|
5507
5985
|
await rm4(dir, { recursive: true, force: true });
|
|
@@ -5773,7 +6251,7 @@ var CodeEvaluator = class {
|
|
|
5773
6251
|
outputMessages: context.outputMessages ?? null,
|
|
5774
6252
|
guidelineFiles: context.evalCase.guideline_paths,
|
|
5775
6253
|
inputFiles: context.evalCase.file_paths.filter(
|
|
5776
|
-
(
|
|
6254
|
+
(path16) => !context.evalCase.guideline_paths.includes(path16)
|
|
5777
6255
|
),
|
|
5778
6256
|
inputMessages: context.evalCase.input_messages,
|
|
5779
6257
|
traceSummary: context.traceSummary ?? null,
|
|
@@ -5921,6 +6399,15 @@ var rubricEvaluationSchema = z2.object({
|
|
|
5921
6399
|
checks: z2.array(rubricCheckResultSchema).describe("Results for each rubric item"),
|
|
5922
6400
|
overall_reasoning: z2.string().describe("Overall assessment summary (1-2 sentences)")
|
|
5923
6401
|
});
|
|
6402
|
+
var scoreRangeCheckResultSchema = z2.object({
|
|
6403
|
+
id: z2.string().describe("The ID of the rubric criterion being scored"),
|
|
6404
|
+
score: z2.number().int().min(0).max(10).describe("Integer score 0-10 for this criterion"),
|
|
6405
|
+
reasoning: z2.string().describe("Brief explanation (1-2 sentences) for this score").optional()
|
|
6406
|
+
});
|
|
6407
|
+
var scoreRangeEvaluationSchema = z2.object({
|
|
6408
|
+
checks: z2.array(scoreRangeCheckResultSchema).describe("Scores for each rubric criterion"),
|
|
6409
|
+
overall_reasoning: z2.string().describe("Overall assessment summary (1-2 sentences)").optional()
|
|
6410
|
+
});
|
|
5924
6411
|
var LlmJudgeEvaluator = class {
|
|
5925
6412
|
kind = "llm_judge";
|
|
5926
6413
|
resolveJudgeProvider;
|
|
@@ -6006,6 +6493,10 @@ var LlmJudgeEvaluator = class {
|
|
|
6006
6493
|
`No rubrics found for evaluator "${context.evaluator?.name ?? "llm_judge"}". Run "agentv generate rubrics" first.`
|
|
6007
6494
|
);
|
|
6008
6495
|
}
|
|
6496
|
+
const hasScoreRanges = rubrics.some((r) => r.score_ranges && r.score_ranges.length > 0);
|
|
6497
|
+
if (hasScoreRanges) {
|
|
6498
|
+
return this.evaluateWithScoreRanges(context, judgeProvider, rubrics);
|
|
6499
|
+
}
|
|
6009
6500
|
const prompt = this.buildRubricPrompt(context, rubrics);
|
|
6010
6501
|
const systemPrompt = buildRubricOutputSchema();
|
|
6011
6502
|
const evaluatorRawRequest = {
|
|
@@ -6031,6 +6522,84 @@ var LlmJudgeEvaluator = class {
|
|
|
6031
6522
|
evaluatorRawRequest
|
|
6032
6523
|
};
|
|
6033
6524
|
}
|
|
6525
|
+
/**
|
|
6526
|
+
* Evaluate using score-range rubrics (analytic rubric scoring).
|
|
6527
|
+
* Each criterion is scored 0-10 and normalized to 0-1.
|
|
6528
|
+
*/
|
|
6529
|
+
async evaluateWithScoreRanges(context, judgeProvider, rubrics) {
|
|
6530
|
+
const prompt = this.buildScoreRangePrompt(context, rubrics);
|
|
6531
|
+
const systemPrompt = buildScoreRangeOutputSchema();
|
|
6532
|
+
const evaluatorRawRequest = {
|
|
6533
|
+
userPrompt: prompt,
|
|
6534
|
+
systemPrompt,
|
|
6535
|
+
target: judgeProvider.targetName
|
|
6536
|
+
};
|
|
6537
|
+
const { data } = await this.runWithRetry({
|
|
6538
|
+
context,
|
|
6539
|
+
judgeProvider,
|
|
6540
|
+
systemPrompt,
|
|
6541
|
+
userPrompt: prompt,
|
|
6542
|
+
schema: scoreRangeEvaluationSchema
|
|
6543
|
+
});
|
|
6544
|
+
const { score, verdict, hits, misses, details } = calculateScoreRangeResult(data, rubrics);
|
|
6545
|
+
return {
|
|
6546
|
+
score,
|
|
6547
|
+
verdict,
|
|
6548
|
+
hits,
|
|
6549
|
+
misses,
|
|
6550
|
+
expectedAspectCount: rubrics.length,
|
|
6551
|
+
reasoning: data.overall_reasoning,
|
|
6552
|
+
evaluatorRawRequest,
|
|
6553
|
+
details
|
|
6554
|
+
};
|
|
6555
|
+
}
|
|
6556
|
+
/**
|
|
6557
|
+
* Build prompt for score-range rubric evaluation.
|
|
6558
|
+
*/
|
|
6559
|
+
buildScoreRangePrompt(context, rubrics) {
|
|
6560
|
+
const formattedQuestion = context.promptInputs.question && context.promptInputs.question.trim().length > 0 ? context.promptInputs.question : context.evalCase.question;
|
|
6561
|
+
const parts = [
|
|
6562
|
+
"You are an expert evaluator. Score the candidate answer on each criterion below using the provided score ranges.",
|
|
6563
|
+
"For each criterion, output an integer score from 0 to 10 based on which score range best matches the answer.",
|
|
6564
|
+
"",
|
|
6565
|
+
"[[ ## question ## ]]",
|
|
6566
|
+
formattedQuestion,
|
|
6567
|
+
"",
|
|
6568
|
+
"[[ ## expected_outcome ## ]]",
|
|
6569
|
+
context.evalCase.expected_outcome,
|
|
6570
|
+
""
|
|
6571
|
+
];
|
|
6572
|
+
if (context.evalCase.reference_answer && context.evalCase.reference_answer.trim().length > 0) {
|
|
6573
|
+
parts.push("[[ ## reference_answer ## ]]", context.evalCase.reference_answer, "");
|
|
6574
|
+
}
|
|
6575
|
+
parts.push(
|
|
6576
|
+
"[[ ## candidate_answer ## ]]",
|
|
6577
|
+
context.candidate,
|
|
6578
|
+
"",
|
|
6579
|
+
"[[ ## scoring_criteria ## ]]"
|
|
6580
|
+
);
|
|
6581
|
+
for (const rubric of rubrics) {
|
|
6582
|
+
const weightLabel = rubric.weight !== 1 ? ` (weight: ${rubric.weight})` : "";
|
|
6583
|
+
const minScoreLabel = rubric.required_min_score !== void 0 ? ` [REQUIRED: min score ${rubric.required_min_score}]` : "";
|
|
6584
|
+
parts.push("", `### Criterion: ${rubric.id}${weightLabel}${minScoreLabel}`);
|
|
6585
|
+
if (rubric.expected_outcome) {
|
|
6586
|
+
parts.push(`Description: ${rubric.expected_outcome}`);
|
|
6587
|
+
}
|
|
6588
|
+
if (rubric.score_ranges && rubric.score_ranges.length > 0) {
|
|
6589
|
+
parts.push("Score ranges:");
|
|
6590
|
+
for (const range of rubric.score_ranges) {
|
|
6591
|
+
const [min, max] = range.score_range;
|
|
6592
|
+
const rangeLabel = min === max ? `${min}` : `${min}-${max}`;
|
|
6593
|
+
parts.push(` - Score ${rangeLabel}: ${range.expected_outcome}`);
|
|
6594
|
+
}
|
|
6595
|
+
}
|
|
6596
|
+
}
|
|
6597
|
+
parts.push(
|
|
6598
|
+
"",
|
|
6599
|
+
"For each criterion, provide an integer score 0-10 that matches one of its defined score ranges."
|
|
6600
|
+
);
|
|
6601
|
+
return parts.join("\n");
|
|
6602
|
+
}
|
|
6034
6603
|
buildRubricPrompt(context, rubrics) {
|
|
6035
6604
|
const formattedQuestion = context.promptInputs.question && context.promptInputs.question.trim().length > 0 ? context.promptInputs.question : context.evalCase.question;
|
|
6036
6605
|
const parts = [
|
|
@@ -6050,7 +6619,7 @@ var LlmJudgeEvaluator = class {
|
|
|
6050
6619
|
for (const rubric of rubrics) {
|
|
6051
6620
|
const requiredLabel = rubric.required ? " (REQUIRED)" : "";
|
|
6052
6621
|
const weightLabel = rubric.weight !== 1 ? ` (weight: ${rubric.weight})` : "";
|
|
6053
|
-
parts.push(`- [${rubric.id}]${requiredLabel}${weightLabel}: ${rubric.
|
|
6622
|
+
parts.push(`- [${rubric.id}]${requiredLabel}${weightLabel}: ${rubric.expected_outcome}`);
|
|
6054
6623
|
}
|
|
6055
6624
|
parts.push("", "For each rubric, determine if it is satisfied and provide brief reasoning.");
|
|
6056
6625
|
return parts.join("\n");
|
|
@@ -6137,9 +6706,9 @@ function calculateRubricScore(result, rubrics) {
|
|
|
6137
6706
|
totalWeight += rubric.weight;
|
|
6138
6707
|
if (check.satisfied) {
|
|
6139
6708
|
earnedWeight += rubric.weight;
|
|
6140
|
-
hits.push(`[${rubric.id}] ${rubric.
|
|
6709
|
+
hits.push(`[${rubric.id}] ${rubric.expected_outcome}: ${check.reasoning}`);
|
|
6141
6710
|
} else {
|
|
6142
|
-
misses.push(`[${rubric.id}] ${rubric.
|
|
6711
|
+
misses.push(`[${rubric.id}] ${rubric.expected_outcome}: ${check.reasoning}`);
|
|
6143
6712
|
if (rubric.required) {
|
|
6144
6713
|
failedRequired = true;
|
|
6145
6714
|
}
|
|
@@ -6149,6 +6718,76 @@ function calculateRubricScore(result, rubrics) {
|
|
|
6149
6718
|
const verdict = failedRequired ? "fail" : scoreToVerdict(score);
|
|
6150
6719
|
return { score, verdict, hits, misses };
|
|
6151
6720
|
}
|
|
6721
|
+
function buildScoreRangeOutputSchema() {
|
|
6722
|
+
return `You are an expert evaluator. Score the candidate answer on each criterion.
|
|
6723
|
+
You must return a valid JSON object matching this schema:
|
|
6724
|
+
{
|
|
6725
|
+
"checks": [
|
|
6726
|
+
{
|
|
6727
|
+
"id": "string (criterion id)",
|
|
6728
|
+
"score": integer (0-10),
|
|
6729
|
+
"reasoning": "string (brief explanation for score)"
|
|
6730
|
+
}
|
|
6731
|
+
],
|
|
6732
|
+
"overall_reasoning": "string (summary, optional)"
|
|
6733
|
+
}
|
|
6734
|
+
|
|
6735
|
+
Important: The "score" must be an integer from 0 to 10 that falls within one of the defined score ranges for that criterion.`;
|
|
6736
|
+
}
|
|
6737
|
+
function calculateScoreRangeResult(result, rubrics) {
|
|
6738
|
+
const rubricMap = new Map(rubrics.map((rubric) => [rubric.id, rubric]));
|
|
6739
|
+
const hits = [];
|
|
6740
|
+
const misses = [];
|
|
6741
|
+
const rawScores = {};
|
|
6742
|
+
let totalWeight = 0;
|
|
6743
|
+
let weightedScoreSum = 0;
|
|
6744
|
+
let failedRequired = false;
|
|
6745
|
+
for (const check of result.checks) {
|
|
6746
|
+
const rubric = rubricMap.get(check.id);
|
|
6747
|
+
if (!rubric) {
|
|
6748
|
+
continue;
|
|
6749
|
+
}
|
|
6750
|
+
const rawScore = Math.max(0, Math.min(10, check.score));
|
|
6751
|
+
const normalizedScore = rawScore / 10;
|
|
6752
|
+
rawScores[rubric.id] = rawScore;
|
|
6753
|
+
totalWeight += rubric.weight;
|
|
6754
|
+
weightedScoreSum += normalizedScore * rubric.weight;
|
|
6755
|
+
let requiredMinScore;
|
|
6756
|
+
if (rubric.required_min_score !== void 0) {
|
|
6757
|
+
requiredMinScore = rubric.required_min_score;
|
|
6758
|
+
} else if (rubric.required === true) {
|
|
6759
|
+
requiredMinScore = 10;
|
|
6760
|
+
}
|
|
6761
|
+
const matchingRange = rubric.score_ranges?.find(
|
|
6762
|
+
(r) => rawScore >= r.score_range[0] && rawScore <= r.score_range[1]
|
|
6763
|
+
);
|
|
6764
|
+
const rangeDescription = matchingRange?.expected_outcome ?? "";
|
|
6765
|
+
const criterionLabel = rubric.expected_outcome ?? rubric.id;
|
|
6766
|
+
const reasoningText = check.reasoning ? `: ${check.reasoning}` : "";
|
|
6767
|
+
const scoreInfo = `[${rubric.id}] ${criterionLabel} - Score: ${rawScore}/10 (${rangeDescription})${reasoningText}`;
|
|
6768
|
+
if (requiredMinScore !== void 0 && rawScore < requiredMinScore) {
|
|
6769
|
+
failedRequired = true;
|
|
6770
|
+
misses.push(scoreInfo);
|
|
6771
|
+
} else if (rawScore >= 7) {
|
|
6772
|
+
hits.push(scoreInfo);
|
|
6773
|
+
} else {
|
|
6774
|
+
misses.push(scoreInfo);
|
|
6775
|
+
}
|
|
6776
|
+
}
|
|
6777
|
+
const score = totalWeight > 0 ? Math.min(1, Math.max(0, weightedScoreSum / totalWeight)) : 0;
|
|
6778
|
+
const verdict = failedRequired ? "fail" : scoreToVerdict(score);
|
|
6779
|
+
return {
|
|
6780
|
+
score,
|
|
6781
|
+
verdict,
|
|
6782
|
+
hits,
|
|
6783
|
+
misses,
|
|
6784
|
+
details: {
|
|
6785
|
+
raw_scores: rawScores,
|
|
6786
|
+
normalization: "score / 10",
|
|
6787
|
+
aggregation: "weighted_average"
|
|
6788
|
+
}
|
|
6789
|
+
};
|
|
6790
|
+
}
|
|
6152
6791
|
|
|
6153
6792
|
// src/evaluation/evaluators/composite.ts
|
|
6154
6793
|
var DEFAULT_COMPOSITE_AGGREGATOR_PROMPT = `Review the following evaluation results:
|
|
@@ -6532,115 +7171,115 @@ var FieldAccuracyEvaluator = class {
|
|
|
6532
7171
|
* Evaluate a single field against the expected value.
|
|
6533
7172
|
*/
|
|
6534
7173
|
evaluateField(fieldConfig, candidateData, expectedData) {
|
|
6535
|
-
const { path:
|
|
6536
|
-
const candidateValue = resolvePath(candidateData,
|
|
6537
|
-
const expectedValue = resolvePath(expectedData,
|
|
7174
|
+
const { path: path16, match, required = true, weight = 1 } = fieldConfig;
|
|
7175
|
+
const candidateValue = resolvePath(candidateData, path16);
|
|
7176
|
+
const expectedValue = resolvePath(expectedData, path16);
|
|
6538
7177
|
if (expectedValue === void 0) {
|
|
6539
7178
|
return {
|
|
6540
|
-
path:
|
|
7179
|
+
path: path16,
|
|
6541
7180
|
score: 1,
|
|
6542
7181
|
// No expected value means no comparison needed
|
|
6543
7182
|
weight,
|
|
6544
7183
|
hit: true,
|
|
6545
|
-
message: `${
|
|
7184
|
+
message: `${path16}: no expected value`
|
|
6546
7185
|
};
|
|
6547
7186
|
}
|
|
6548
7187
|
if (candidateValue === void 0) {
|
|
6549
7188
|
if (required) {
|
|
6550
7189
|
return {
|
|
6551
|
-
path:
|
|
7190
|
+
path: path16,
|
|
6552
7191
|
score: 0,
|
|
6553
7192
|
weight,
|
|
6554
7193
|
hit: false,
|
|
6555
|
-
message: `${
|
|
7194
|
+
message: `${path16} (required, missing)`
|
|
6556
7195
|
};
|
|
6557
7196
|
}
|
|
6558
7197
|
return {
|
|
6559
|
-
path:
|
|
7198
|
+
path: path16,
|
|
6560
7199
|
score: 1,
|
|
6561
7200
|
// Don't penalize missing optional fields
|
|
6562
7201
|
weight: 0,
|
|
6563
7202
|
// Zero weight means it won't affect the score
|
|
6564
7203
|
hit: true,
|
|
6565
|
-
message: `${
|
|
7204
|
+
message: `${path16}: optional field missing`
|
|
6566
7205
|
};
|
|
6567
7206
|
}
|
|
6568
7207
|
switch (match) {
|
|
6569
7208
|
case "exact":
|
|
6570
|
-
return this.compareExact(
|
|
7209
|
+
return this.compareExact(path16, candidateValue, expectedValue, weight);
|
|
6571
7210
|
case "numeric_tolerance":
|
|
6572
7211
|
return this.compareNumericTolerance(
|
|
6573
|
-
|
|
7212
|
+
path16,
|
|
6574
7213
|
candidateValue,
|
|
6575
7214
|
expectedValue,
|
|
6576
7215
|
fieldConfig,
|
|
6577
7216
|
weight
|
|
6578
7217
|
);
|
|
6579
7218
|
case "date":
|
|
6580
|
-
return this.compareDate(
|
|
7219
|
+
return this.compareDate(path16, candidateValue, expectedValue, fieldConfig, weight);
|
|
6581
7220
|
default:
|
|
6582
7221
|
return {
|
|
6583
|
-
path:
|
|
7222
|
+
path: path16,
|
|
6584
7223
|
score: 0,
|
|
6585
7224
|
weight,
|
|
6586
7225
|
hit: false,
|
|
6587
|
-
message: `${
|
|
7226
|
+
message: `${path16}: unknown match type "${match}"`
|
|
6588
7227
|
};
|
|
6589
7228
|
}
|
|
6590
7229
|
}
|
|
6591
7230
|
/**
|
|
6592
7231
|
* Exact equality comparison.
|
|
6593
7232
|
*/
|
|
6594
|
-
compareExact(
|
|
7233
|
+
compareExact(path16, candidateValue, expectedValue, weight) {
|
|
6595
7234
|
if (deepEqual(candidateValue, expectedValue)) {
|
|
6596
7235
|
return {
|
|
6597
|
-
path:
|
|
7236
|
+
path: path16,
|
|
6598
7237
|
score: 1,
|
|
6599
7238
|
weight,
|
|
6600
7239
|
hit: true,
|
|
6601
|
-
message:
|
|
7240
|
+
message: path16
|
|
6602
7241
|
};
|
|
6603
7242
|
}
|
|
6604
7243
|
if (typeof candidateValue !== typeof expectedValue) {
|
|
6605
7244
|
return {
|
|
6606
|
-
path:
|
|
7245
|
+
path: path16,
|
|
6607
7246
|
score: 0,
|
|
6608
7247
|
weight,
|
|
6609
7248
|
hit: false,
|
|
6610
|
-
message: `${
|
|
7249
|
+
message: `${path16} (type mismatch: got ${typeof candidateValue}, expected ${typeof expectedValue})`
|
|
6611
7250
|
};
|
|
6612
7251
|
}
|
|
6613
7252
|
return {
|
|
6614
|
-
path:
|
|
7253
|
+
path: path16,
|
|
6615
7254
|
score: 0,
|
|
6616
7255
|
weight,
|
|
6617
7256
|
hit: false,
|
|
6618
|
-
message: `${
|
|
7257
|
+
message: `${path16} (value mismatch)`
|
|
6619
7258
|
};
|
|
6620
7259
|
}
|
|
6621
7260
|
/**
|
|
6622
7261
|
* Numeric comparison with absolute or relative tolerance.
|
|
6623
7262
|
*/
|
|
6624
|
-
compareNumericTolerance(
|
|
7263
|
+
compareNumericTolerance(path16, candidateValue, expectedValue, fieldConfig, weight) {
|
|
6625
7264
|
const { tolerance = 0, relative = false } = fieldConfig;
|
|
6626
7265
|
const candidateNum = toNumber(candidateValue);
|
|
6627
7266
|
const expectedNum = toNumber(expectedValue);
|
|
6628
7267
|
if (candidateNum === null || expectedNum === null) {
|
|
6629
7268
|
return {
|
|
6630
|
-
path:
|
|
7269
|
+
path: path16,
|
|
6631
7270
|
score: 0,
|
|
6632
7271
|
weight,
|
|
6633
7272
|
hit: false,
|
|
6634
|
-
message: `${
|
|
7273
|
+
message: `${path16} (non-numeric value)`
|
|
6635
7274
|
};
|
|
6636
7275
|
}
|
|
6637
7276
|
if (!Number.isFinite(candidateNum) || !Number.isFinite(expectedNum)) {
|
|
6638
7277
|
return {
|
|
6639
|
-
path:
|
|
7278
|
+
path: path16,
|
|
6640
7279
|
score: 0,
|
|
6641
7280
|
weight,
|
|
6642
7281
|
hit: false,
|
|
6643
|
-
message: `${
|
|
7282
|
+
message: `${path16} (invalid numeric value)`
|
|
6644
7283
|
};
|
|
6645
7284
|
}
|
|
6646
7285
|
const diff = Math.abs(candidateNum - expectedNum);
|
|
@@ -6653,61 +7292,61 @@ var FieldAccuracyEvaluator = class {
|
|
|
6653
7292
|
}
|
|
6654
7293
|
if (withinTolerance) {
|
|
6655
7294
|
return {
|
|
6656
|
-
path:
|
|
7295
|
+
path: path16,
|
|
6657
7296
|
score: 1,
|
|
6658
7297
|
weight,
|
|
6659
7298
|
hit: true,
|
|
6660
|
-
message: `${
|
|
7299
|
+
message: `${path16} (within tolerance: diff=${diff.toFixed(2)})`
|
|
6661
7300
|
};
|
|
6662
7301
|
}
|
|
6663
7302
|
return {
|
|
6664
|
-
path:
|
|
7303
|
+
path: path16,
|
|
6665
7304
|
score: 0,
|
|
6666
7305
|
weight,
|
|
6667
7306
|
hit: false,
|
|
6668
|
-
message: `${
|
|
7307
|
+
message: `${path16} (outside tolerance: diff=${diff.toFixed(2)}, tolerance=${tolerance})`
|
|
6669
7308
|
};
|
|
6670
7309
|
}
|
|
6671
7310
|
/**
|
|
6672
7311
|
* Date comparison with format normalization.
|
|
6673
7312
|
*/
|
|
6674
|
-
compareDate(
|
|
7313
|
+
compareDate(path16, candidateValue, expectedValue, fieldConfig, weight) {
|
|
6675
7314
|
const formats = fieldConfig.formats ?? DEFAULT_DATE_FORMATS;
|
|
6676
7315
|
const candidateDate = parseDate(String(candidateValue), formats);
|
|
6677
7316
|
const expectedDate = parseDate(String(expectedValue), formats);
|
|
6678
7317
|
if (candidateDate === null) {
|
|
6679
7318
|
return {
|
|
6680
|
-
path:
|
|
7319
|
+
path: path16,
|
|
6681
7320
|
score: 0,
|
|
6682
7321
|
weight,
|
|
6683
7322
|
hit: false,
|
|
6684
|
-
message: `${
|
|
7323
|
+
message: `${path16} (unparseable candidate date)`
|
|
6685
7324
|
};
|
|
6686
7325
|
}
|
|
6687
7326
|
if (expectedDate === null) {
|
|
6688
7327
|
return {
|
|
6689
|
-
path:
|
|
7328
|
+
path: path16,
|
|
6690
7329
|
score: 0,
|
|
6691
7330
|
weight,
|
|
6692
7331
|
hit: false,
|
|
6693
|
-
message: `${
|
|
7332
|
+
message: `${path16} (unparseable expected date)`
|
|
6694
7333
|
};
|
|
6695
7334
|
}
|
|
6696
7335
|
if (candidateDate.getFullYear() === expectedDate.getFullYear() && candidateDate.getMonth() === expectedDate.getMonth() && candidateDate.getDate() === expectedDate.getDate()) {
|
|
6697
7336
|
return {
|
|
6698
|
-
path:
|
|
7337
|
+
path: path16,
|
|
6699
7338
|
score: 1,
|
|
6700
7339
|
weight,
|
|
6701
7340
|
hit: true,
|
|
6702
|
-
message:
|
|
7341
|
+
message: path16
|
|
6703
7342
|
};
|
|
6704
7343
|
}
|
|
6705
7344
|
return {
|
|
6706
|
-
path:
|
|
7345
|
+
path: path16,
|
|
6707
7346
|
score: 0,
|
|
6708
7347
|
weight,
|
|
6709
7348
|
hit: false,
|
|
6710
|
-
message: `${
|
|
7349
|
+
message: `${path16} (date mismatch: got ${formatDateISO(candidateDate)}, expected ${formatDateISO(expectedDate)})`
|
|
6711
7350
|
};
|
|
6712
7351
|
}
|
|
6713
7352
|
/**
|
|
@@ -6747,11 +7386,11 @@ var FieldAccuracyEvaluator = class {
|
|
|
6747
7386
|
};
|
|
6748
7387
|
}
|
|
6749
7388
|
};
|
|
6750
|
-
function resolvePath(obj,
|
|
6751
|
-
if (!
|
|
7389
|
+
function resolvePath(obj, path16) {
|
|
7390
|
+
if (!path16 || !obj) {
|
|
6752
7391
|
return void 0;
|
|
6753
7392
|
}
|
|
6754
|
-
const parts =
|
|
7393
|
+
const parts = path16.split(/\.|\[|\]/).filter((p) => p.length > 0);
|
|
6755
7394
|
let current = obj;
|
|
6756
7395
|
for (const part of parts) {
|
|
6757
7396
|
if (current === null || current === void 0) {
|
|
@@ -6976,6 +7615,27 @@ function argsMatch(expected, actual) {
|
|
|
6976
7615
|
}
|
|
6977
7616
|
return true;
|
|
6978
7617
|
}
|
|
7618
|
+
function checkLatency(toolName, maxDurationMs, actualDurationMs) {
|
|
7619
|
+
if (maxDurationMs === void 0) {
|
|
7620
|
+
return { status: "skip", message: "" };
|
|
7621
|
+
}
|
|
7622
|
+
if (actualDurationMs === void 0) {
|
|
7623
|
+
return {
|
|
7624
|
+
status: "skip",
|
|
7625
|
+
message: `No duration data for ${toolName}; latency assertion skipped`
|
|
7626
|
+
};
|
|
7627
|
+
}
|
|
7628
|
+
if (actualDurationMs <= maxDurationMs) {
|
|
7629
|
+
return {
|
|
7630
|
+
status: "pass",
|
|
7631
|
+
message: `${toolName} completed in ${actualDurationMs}ms (max: ${maxDurationMs}ms)`
|
|
7632
|
+
};
|
|
7633
|
+
}
|
|
7634
|
+
return {
|
|
7635
|
+
status: "fail",
|
|
7636
|
+
message: `${toolName} took ${actualDurationMs}ms (max: ${maxDurationMs}ms)`
|
|
7637
|
+
};
|
|
7638
|
+
}
|
|
6979
7639
|
var ToolTrajectoryEvaluator = class {
|
|
6980
7640
|
kind = "tool_trajectory";
|
|
6981
7641
|
config;
|
|
@@ -7034,7 +7694,8 @@ var ToolTrajectoryEvaluator = class {
|
|
|
7034
7694
|
for (const call of message.toolCalls) {
|
|
7035
7695
|
toolCalls.push({
|
|
7036
7696
|
name: call.tool,
|
|
7037
|
-
args: call.input
|
|
7697
|
+
args: call.input,
|
|
7698
|
+
durationMs: call.durationMs
|
|
7038
7699
|
});
|
|
7039
7700
|
}
|
|
7040
7701
|
}
|
|
@@ -7102,17 +7763,27 @@ var ToolTrajectoryEvaluator = class {
|
|
|
7102
7763
|
}
|
|
7103
7764
|
const hits = [];
|
|
7104
7765
|
const misses = [];
|
|
7766
|
+
const warnings = [];
|
|
7105
7767
|
let actualIndex = 0;
|
|
7768
|
+
let sequenceHits = 0;
|
|
7769
|
+
let latencyHits = 0;
|
|
7770
|
+
let latencySkips = 0;
|
|
7771
|
+
const latencyAssertionCount = expected.filter(
|
|
7772
|
+
(item) => item.maxDurationMs !== void 0
|
|
7773
|
+
).length;
|
|
7106
7774
|
for (let i = 0; i < expected.length; i++) {
|
|
7107
7775
|
const expectedItem = expected[i];
|
|
7108
7776
|
const expectedTool = expectedItem.tool;
|
|
7109
7777
|
let found = false;
|
|
7110
7778
|
let argsMismatch = false;
|
|
7779
|
+
let matchedCall;
|
|
7111
7780
|
while (actualIndex < toolCalls.length) {
|
|
7112
7781
|
const actualCall = toolCalls[actualIndex];
|
|
7113
7782
|
if (actualCall.name === expectedTool) {
|
|
7114
7783
|
if (argsMatch(expectedItem.args, actualCall.args)) {
|
|
7115
7784
|
hits.push(`Found ${expectedTool} at position ${actualIndex}`);
|
|
7785
|
+
sequenceHits++;
|
|
7786
|
+
matchedCall = actualCall;
|
|
7116
7787
|
actualIndex++;
|
|
7117
7788
|
found = true;
|
|
7118
7789
|
break;
|
|
@@ -7129,14 +7800,35 @@ var ToolTrajectoryEvaluator = class {
|
|
|
7129
7800
|
if (!found && !argsMismatch) {
|
|
7130
7801
|
misses.push(`Expected ${expectedTool} at position ${i}, not found in remaining trace`);
|
|
7131
7802
|
}
|
|
7803
|
+
if (found && matchedCall) {
|
|
7804
|
+
const latencyResult = checkLatency(
|
|
7805
|
+
expectedTool,
|
|
7806
|
+
expectedItem.maxDurationMs,
|
|
7807
|
+
matchedCall.durationMs
|
|
7808
|
+
);
|
|
7809
|
+
if (latencyResult.status === "pass") {
|
|
7810
|
+
hits.push(latencyResult.message);
|
|
7811
|
+
latencyHits++;
|
|
7812
|
+
} else if (latencyResult.status === "fail") {
|
|
7813
|
+
misses.push(latencyResult.message);
|
|
7814
|
+
} else if (latencyResult.message) {
|
|
7815
|
+
warnings.push(latencyResult.message);
|
|
7816
|
+
latencySkips++;
|
|
7817
|
+
}
|
|
7818
|
+
}
|
|
7132
7819
|
}
|
|
7133
|
-
const
|
|
7820
|
+
for (const warning of warnings) {
|
|
7821
|
+
console.warn(`[tool_trajectory] ${warning}`);
|
|
7822
|
+
}
|
|
7823
|
+
const effectiveLatencyAssertions = latencyAssertionCount - latencySkips;
|
|
7824
|
+
const totalAssertions = expected.length + effectiveLatencyAssertions;
|
|
7825
|
+
const score = totalAssertions > 0 ? (sequenceHits + latencyHits) / totalAssertions : 1;
|
|
7134
7826
|
return {
|
|
7135
7827
|
score,
|
|
7136
7828
|
verdict: scoreToVerdict(score),
|
|
7137
7829
|
hits,
|
|
7138
7830
|
misses,
|
|
7139
|
-
expectedAspectCount:
|
|
7831
|
+
expectedAspectCount: totalAssertions
|
|
7140
7832
|
};
|
|
7141
7833
|
}
|
|
7142
7834
|
evaluateExact(toolCalls) {
|
|
@@ -7152,6 +7844,13 @@ var ToolTrajectoryEvaluator = class {
|
|
|
7152
7844
|
}
|
|
7153
7845
|
const hits = [];
|
|
7154
7846
|
const misses = [];
|
|
7847
|
+
const warnings = [];
|
|
7848
|
+
let sequenceHits = 0;
|
|
7849
|
+
let latencyHits = 0;
|
|
7850
|
+
let latencySkips = 0;
|
|
7851
|
+
const latencyAssertionCount = expected.filter(
|
|
7852
|
+
(item) => item.maxDurationMs !== void 0
|
|
7853
|
+
).length;
|
|
7155
7854
|
if (toolCalls.length !== expected.length) {
|
|
7156
7855
|
misses.push(`Expected ${expected.length} tool calls, got ${toolCalls.length}`);
|
|
7157
7856
|
}
|
|
@@ -7161,33 +7860,58 @@ var ToolTrajectoryEvaluator = class {
|
|
|
7161
7860
|
const expectedTool = expectedItem.tool;
|
|
7162
7861
|
const actualCall = toolCalls[i];
|
|
7163
7862
|
const actualTool = actualCall.name;
|
|
7863
|
+
let sequenceMatched = false;
|
|
7164
7864
|
if (actualTool === expectedTool) {
|
|
7165
7865
|
if (argsMatch(expectedItem.args, actualCall.args)) {
|
|
7166
7866
|
hits.push(`Position ${i}: ${expectedTool}`);
|
|
7867
|
+
sequenceHits++;
|
|
7868
|
+
sequenceMatched = true;
|
|
7167
7869
|
} else {
|
|
7168
7870
|
misses.push(`Position ${i}: ${expectedTool} args mismatch`);
|
|
7169
7871
|
}
|
|
7170
7872
|
} else {
|
|
7171
7873
|
misses.push(`Position ${i}: expected ${expectedTool}, got ${actualTool}`);
|
|
7172
7874
|
}
|
|
7875
|
+
if (sequenceMatched) {
|
|
7876
|
+
const latencyResult = checkLatency(
|
|
7877
|
+
expectedTool,
|
|
7878
|
+
expectedItem.maxDurationMs,
|
|
7879
|
+
actualCall.durationMs
|
|
7880
|
+
);
|
|
7881
|
+
if (latencyResult.status === "pass") {
|
|
7882
|
+
hits.push(latencyResult.message);
|
|
7883
|
+
latencyHits++;
|
|
7884
|
+
} else if (latencyResult.status === "fail") {
|
|
7885
|
+
misses.push(latencyResult.message);
|
|
7886
|
+
} else if (latencyResult.message) {
|
|
7887
|
+
warnings.push(latencyResult.message);
|
|
7888
|
+
latencySkips++;
|
|
7889
|
+
}
|
|
7890
|
+
}
|
|
7173
7891
|
}
|
|
7174
7892
|
for (let i = checkLength; i < expected.length; i++) {
|
|
7175
7893
|
misses.push(`Position ${i}: expected ${expected[i].tool}, got nothing`);
|
|
7176
7894
|
}
|
|
7177
|
-
const
|
|
7895
|
+
for (const warning of warnings) {
|
|
7896
|
+
console.warn(`[tool_trajectory] ${warning}`);
|
|
7897
|
+
}
|
|
7898
|
+
const effectiveLatencyAssertions = latencyAssertionCount - latencySkips;
|
|
7899
|
+
const totalAssertions = expected.length + effectiveLatencyAssertions;
|
|
7900
|
+
const score = totalAssertions > 0 ? (sequenceHits + latencyHits) / totalAssertions : 1;
|
|
7178
7901
|
return {
|
|
7179
7902
|
score,
|
|
7180
7903
|
verdict: scoreToVerdict(score),
|
|
7181
7904
|
hits,
|
|
7182
7905
|
misses,
|
|
7183
|
-
expectedAspectCount:
|
|
7906
|
+
expectedAspectCount: totalAssertions
|
|
7184
7907
|
};
|
|
7185
7908
|
}
|
|
7186
7909
|
};
|
|
7187
7910
|
|
|
7188
7911
|
// src/evaluation/orchestrator.ts
|
|
7189
7912
|
import { createHash } from "node:crypto";
|
|
7190
|
-
import
|
|
7913
|
+
import path15 from "node:path";
|
|
7914
|
+
import micromatch4 from "micromatch";
|
|
7191
7915
|
|
|
7192
7916
|
// ../../node_modules/.bun/yocto-queue@1.2.2/node_modules/yocto-queue/index.js
|
|
7193
7917
|
var Node = class {
|
|
@@ -7346,17 +8070,17 @@ async function runEvaluation(options) {
|
|
|
7346
8070
|
cache,
|
|
7347
8071
|
useCache,
|
|
7348
8072
|
now,
|
|
7349
|
-
|
|
8073
|
+
filter,
|
|
7350
8074
|
verbose,
|
|
7351
8075
|
evalCases: preloadedEvalCases,
|
|
7352
8076
|
onResult,
|
|
7353
8077
|
onProgress
|
|
7354
8078
|
} = options;
|
|
7355
|
-
const evalCases = preloadedEvalCases ?? await loadEvalCases(evalFilePath, repoRoot, { verbose,
|
|
7356
|
-
const filteredEvalCases = filterEvalCases(evalCases,
|
|
8079
|
+
const evalCases = preloadedEvalCases ?? await loadEvalCases(evalFilePath, repoRoot, { verbose, filter });
|
|
8080
|
+
const filteredEvalCases = filterEvalCases(evalCases, filter);
|
|
7357
8081
|
if (filteredEvalCases.length === 0) {
|
|
7358
|
-
if (
|
|
7359
|
-
throw new Error(`
|
|
8082
|
+
if (filter) {
|
|
8083
|
+
throw new Error(`No eval cases matched filter '${filter}' in ${evalFilePath}`);
|
|
7360
8084
|
}
|
|
7361
8085
|
return [];
|
|
7362
8086
|
}
|
|
@@ -7932,7 +8656,10 @@ async function runEvaluatorList(options) {
|
|
|
7932
8656
|
attempt,
|
|
7933
8657
|
promptInputs,
|
|
7934
8658
|
now,
|
|
7935
|
-
judgeProvider
|
|
8659
|
+
judgeProvider,
|
|
8660
|
+
outputMessages,
|
|
8661
|
+
traceSummary,
|
|
8662
|
+
agentTimeoutMs
|
|
7936
8663
|
});
|
|
7937
8664
|
const weight = evaluator.weight ?? 1;
|
|
7938
8665
|
scored.push({ score: score2, name: evaluator.name, type: evaluator.type, weight });
|
|
@@ -7986,7 +8713,7 @@ async function runEvaluatorList(options) {
|
|
|
7986
8713
|
});
|
|
7987
8714
|
}
|
|
7988
8715
|
if (evaluator.type === "composite") {
|
|
7989
|
-
const evalFileDir = evalCase.guideline_paths[0] ?
|
|
8716
|
+
const evalFileDir = evalCase.guideline_paths[0] ? path15.dirname(evalCase.guideline_paths[0]) : process.cwd();
|
|
7990
8717
|
const createEvaluator = (memberConfig) => {
|
|
7991
8718
|
switch (memberConfig.type) {
|
|
7992
8719
|
case "llm_judge":
|
|
@@ -8267,9 +8994,22 @@ async function runLlmJudgeEvaluator(options) {
|
|
|
8267
8994
|
attempt,
|
|
8268
8995
|
promptInputs,
|
|
8269
8996
|
now,
|
|
8270
|
-
judgeProvider
|
|
8997
|
+
judgeProvider,
|
|
8998
|
+
outputMessages,
|
|
8999
|
+
traceSummary,
|
|
9000
|
+
agentTimeoutMs
|
|
8271
9001
|
} = options;
|
|
8272
|
-
const customPrompt = await resolveCustomPrompt(
|
|
9002
|
+
const customPrompt = await resolveCustomPrompt(
|
|
9003
|
+
config,
|
|
9004
|
+
{
|
|
9005
|
+
evalCase,
|
|
9006
|
+
candidate,
|
|
9007
|
+
outputMessages,
|
|
9008
|
+
traceSummary,
|
|
9009
|
+
config: config.config
|
|
9010
|
+
},
|
|
9011
|
+
agentTimeoutMs
|
|
9012
|
+
);
|
|
8273
9013
|
return evaluatorRegistry.llm_judge.evaluate({
|
|
8274
9014
|
evalCase,
|
|
8275
9015
|
candidate,
|
|
@@ -8283,23 +9023,70 @@ async function runLlmJudgeEvaluator(options) {
|
|
|
8283
9023
|
evaluator: config
|
|
8284
9024
|
});
|
|
8285
9025
|
}
|
|
8286
|
-
async function resolveCustomPrompt(
|
|
8287
|
-
if (
|
|
9026
|
+
async function resolveCustomPrompt(promptConfig, context, timeoutMs) {
|
|
9027
|
+
if (promptConfig.resolvedPromptScript && promptConfig.resolvedPromptScript.length > 0) {
|
|
9028
|
+
if (!context) {
|
|
9029
|
+
throw new Error("Context required for executable prompt templates");
|
|
9030
|
+
}
|
|
9031
|
+
return executePromptTemplate(
|
|
9032
|
+
promptConfig.resolvedPromptScript,
|
|
9033
|
+
context,
|
|
9034
|
+
promptConfig.config,
|
|
9035
|
+
timeoutMs
|
|
9036
|
+
);
|
|
9037
|
+
}
|
|
9038
|
+
const promptPath = promptConfig.resolvedPromptPath ?? promptConfig.promptPath;
|
|
9039
|
+
if (promptPath) {
|
|
8288
9040
|
try {
|
|
8289
|
-
const content = await readTextFile(
|
|
9041
|
+
const content = await readTextFile(promptPath);
|
|
8290
9042
|
return content;
|
|
8291
9043
|
} catch (error) {
|
|
8292
9044
|
const message = error instanceof Error ? error.message : String(error);
|
|
8293
|
-
console.warn(`Could not read custom prompt at ${
|
|
9045
|
+
console.warn(`Could not read custom prompt at ${promptPath}: ${message}`);
|
|
8294
9046
|
}
|
|
8295
9047
|
}
|
|
8296
|
-
|
|
9048
|
+
const promptValue = promptConfig.prompt;
|
|
9049
|
+
if (typeof promptValue === "string") {
|
|
9050
|
+
return promptValue;
|
|
9051
|
+
}
|
|
9052
|
+
return void 0;
|
|
9053
|
+
}
|
|
9054
|
+
async function executePromptTemplate(script, context, config, timeoutMs) {
|
|
9055
|
+
const payload = {
|
|
9056
|
+
question: context.evalCase.question,
|
|
9057
|
+
expectedOutcome: context.evalCase.expected_outcome,
|
|
9058
|
+
expectedMessages: context.evalCase.expected_messages,
|
|
9059
|
+
referenceAnswer: context.evalCase.reference_answer,
|
|
9060
|
+
candidateAnswer: context.candidate,
|
|
9061
|
+
outputMessages: context.outputMessages ?? null,
|
|
9062
|
+
guidelineFiles: context.evalCase.guideline_paths,
|
|
9063
|
+
inputFiles: context.evalCase.file_paths.filter(
|
|
9064
|
+
(p) => !context.evalCase.guideline_paths.includes(p)
|
|
9065
|
+
),
|
|
9066
|
+
inputMessages: context.evalCase.input_messages,
|
|
9067
|
+
traceSummary: context.traceSummary ?? null,
|
|
9068
|
+
config: config ?? context.config ?? null
|
|
9069
|
+
};
|
|
9070
|
+
const inputJson = JSON.stringify(toSnakeCaseDeep(payload), null, 2);
|
|
9071
|
+
const scriptPath = script[script.length - 1];
|
|
9072
|
+
const cwd = path15.dirname(scriptPath);
|
|
9073
|
+
try {
|
|
9074
|
+
const stdout = await executeScript(script, inputJson, timeoutMs, cwd);
|
|
9075
|
+
const prompt = stdout.trim();
|
|
9076
|
+
if (!prompt) {
|
|
9077
|
+
throw new Error("Prompt template produced empty output");
|
|
9078
|
+
}
|
|
9079
|
+
return prompt;
|
|
9080
|
+
} catch (error) {
|
|
9081
|
+
const message = error instanceof Error ? error.message : String(error);
|
|
9082
|
+
throw new Error(`Prompt template execution failed: ${message}`);
|
|
9083
|
+
}
|
|
8297
9084
|
}
|
|
8298
|
-
function filterEvalCases(evalCases,
|
|
8299
|
-
if (!
|
|
9085
|
+
function filterEvalCases(evalCases, filter) {
|
|
9086
|
+
if (!filter) {
|
|
8300
9087
|
return evalCases;
|
|
8301
9088
|
}
|
|
8302
|
-
return evalCases.filter((evalCase) => evalCase.id
|
|
9089
|
+
return evalCases.filter((evalCase) => micromatch4.isMatch(evalCase.id, filter));
|
|
8303
9090
|
}
|
|
8304
9091
|
function buildEvaluatorRegistry(overrides, resolveJudgeProvider) {
|
|
8305
9092
|
const llmJudge = overrides?.llm_judge ?? new LlmJudgeEvaluator({
|
|
@@ -8457,7 +9244,7 @@ import { generateText as generateText4 } from "ai";
|
|
|
8457
9244
|
import { z as z3 } from "zod";
|
|
8458
9245
|
var rubricItemSchema = z3.object({
|
|
8459
9246
|
id: z3.string().describe("Short identifier for this rubric (e.g., clarity, completeness)"),
|
|
8460
|
-
|
|
9247
|
+
expected_outcome: z3.string().describe("Concrete expected outcome for this rubric item"),
|
|
8461
9248
|
weight: z3.number().default(1).describe("Relative importance (default 1.0)"),
|
|
8462
9249
|
required: z3.boolean().default(true).describe("Whether this is a mandatory requirement")
|
|
8463
9250
|
});
|
|
@@ -8477,7 +9264,7 @@ You must return a valid JSON object matching this schema:
|
|
|
8477
9264
|
"rubrics": [
|
|
8478
9265
|
{
|
|
8479
9266
|
"id": "string (short identifier)",
|
|
8480
|
-
"
|
|
9267
|
+
"expected_outcome": "string (concrete expected outcome for this rubric item)",
|
|
8481
9268
|
"weight": number (default 1.0),
|
|
8482
9269
|
"required": boolean (default true)
|
|
8483
9270
|
}
|
|
@@ -8513,7 +9300,7 @@ function buildPrompt(expectedOutcome, question, referenceAnswer) {
|
|
|
8513
9300
|
"Each rubric should:",
|
|
8514
9301
|
"- Be specific and testable",
|
|
8515
9302
|
"- Have a short, descriptive ID",
|
|
8516
|
-
"- Include a clear
|
|
9303
|
+
"- Include a clear expected outcome statement (what a good answer must demonstrate for this rubric)",
|
|
8517
9304
|
"- Indicate if it is required (mandatory) or optional",
|
|
8518
9305
|
"- Have an appropriate weight (default 1.0, use higher values for more important aspects)",
|
|
8519
9306
|
"",
|
|
@@ -8560,6 +9347,7 @@ export {
|
|
|
8560
9347
|
createAgentKernel,
|
|
8561
9348
|
createProvider,
|
|
8562
9349
|
deepEqual,
|
|
9350
|
+
detectFormat,
|
|
8563
9351
|
ensureVSCodeSubagents,
|
|
8564
9352
|
executeScript,
|
|
8565
9353
|
explorationRatio,
|