@agentv/core 2.2.0 → 2.5.1
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/README.md +77 -77
- package/dist/{chunk-KDEP4I7G.js → chunk-RP3M7COZ.js} +1 -1
- package/dist/{chunk-KDEP4I7G.js.map → chunk-RP3M7COZ.js.map} +1 -1
- package/dist/evaluation/validation/index.cjs +38 -4
- package/dist/evaluation/validation/index.cjs.map +1 -1
- package/dist/evaluation/validation/index.js +39 -5
- package/dist/evaluation/validation/index.js.map +1 -1
- package/dist/index.cjs +654 -119
- package/dist/index.cjs.map +1 -1
- package/dist/index.d.cts +75 -6
- package/dist/index.d.ts +75 -6
- package/dist/index.js +655 -120
- package/dist/index.js.map +1 -1
- package/package.json +1 -1
package/dist/index.js
CHANGED
|
@@ -10,7 +10,7 @@ import {
|
|
|
10
10
|
readTextFile,
|
|
11
11
|
resolveFileReference,
|
|
12
12
|
resolveTargetDefinition
|
|
13
|
-
} from "./chunk-KDEP4I7G.js";
|
|
13
|
+
} from "./chunk-RP3M7COZ.js";
|
|
14
14
|
|
|
15
15
|
// src/evaluation/types.ts
|
|
16
16
|
var TEST_MESSAGE_ROLE_VALUES = ["system", "user", "assistant", "tool"];
|
|
@@ -148,6 +148,7 @@ function mergeExecutionMetrics(summary, metrics) {
|
|
|
148
148
|
// src/evaluation/yaml-parser.ts
|
|
149
149
|
import { readFile as readFile6 } from "node:fs/promises";
|
|
150
150
|
import path7 from "node:path";
|
|
151
|
+
import micromatch3 from "micromatch";
|
|
151
152
|
import { parse as parse2 } from "yaml";
|
|
152
153
|
|
|
153
154
|
// src/evaluation/loaders/config-loader.ts
|
|
@@ -462,11 +463,11 @@ async function parseEvaluators(rawEvalCase, globalExecution, searchRoots, evalId
|
|
|
462
463
|
);
|
|
463
464
|
}
|
|
464
465
|
}
|
|
465
|
-
const
|
|
466
|
-
const
|
|
466
|
+
const knownProps2 = /* @__PURE__ */ new Set(["name", "type", "script", "cwd", "weight", "target"]);
|
|
467
|
+
const config2 = {};
|
|
467
468
|
for (const [key, value] of Object.entries(rawEvaluator)) {
|
|
468
|
-
if (!
|
|
469
|
-
|
|
469
|
+
if (!knownProps2.has(key) && value !== void 0) {
|
|
470
|
+
config2[key] = value;
|
|
470
471
|
}
|
|
471
472
|
}
|
|
472
473
|
evaluators.push({
|
|
@@ -476,7 +477,7 @@ async function parseEvaluators(rawEvalCase, globalExecution, searchRoots, evalId
|
|
|
476
477
|
cwd,
|
|
477
478
|
resolvedCwd,
|
|
478
479
|
...weight2 !== void 0 ? { weight: weight2 } : {},
|
|
479
|
-
...Object.keys(
|
|
480
|
+
...Object.keys(config2).length > 0 ? { config: config2 } : {},
|
|
480
481
|
...targetConfig !== void 0 ? { target: targetConfig } : {}
|
|
481
482
|
});
|
|
482
483
|
continue;
|
|
@@ -641,7 +642,7 @@ async function parseEvaluators(rawEvalCase, globalExecution, searchRoots, evalId
|
|
|
641
642
|
continue;
|
|
642
643
|
}
|
|
643
644
|
const weight2 = validateWeight(rawEvaluator.weight, name, evalId);
|
|
644
|
-
const
|
|
645
|
+
const config2 = {
|
|
645
646
|
name,
|
|
646
647
|
type: "tool_trajectory",
|
|
647
648
|
mode,
|
|
@@ -649,7 +650,7 @@ async function parseEvaluators(rawEvalCase, globalExecution, searchRoots, evalId
|
|
|
649
650
|
...expected ? { expected } : {},
|
|
650
651
|
...weight2 !== void 0 ? { weight: weight2 } : {}
|
|
651
652
|
};
|
|
652
|
-
evaluators.push(
|
|
653
|
+
evaluators.push(config2);
|
|
653
654
|
continue;
|
|
654
655
|
}
|
|
655
656
|
if (typeValue === "field_accuracy") {
|
|
@@ -786,9 +787,33 @@ async function parseEvaluators(rawEvalCase, globalExecution, searchRoots, evalId
|
|
|
786
787
|
});
|
|
787
788
|
continue;
|
|
788
789
|
}
|
|
789
|
-
const
|
|
790
|
+
const rawPrompt = rawEvaluator.prompt;
|
|
791
|
+
let prompt;
|
|
790
792
|
let promptPath;
|
|
791
|
-
|
|
793
|
+
let resolvedPromptScript;
|
|
794
|
+
let promptScriptConfig;
|
|
795
|
+
if (isJsonObject2(rawPrompt)) {
|
|
796
|
+
const scriptArray = asStringArray(
|
|
797
|
+
rawPrompt.script,
|
|
798
|
+
`prompt.script for evaluator '${name}' in '${evalId}'`
|
|
799
|
+
);
|
|
800
|
+
if (!scriptArray) {
|
|
801
|
+
throw new Error(`Evaluator '${name}' in '${evalId}': prompt object requires script array`);
|
|
802
|
+
}
|
|
803
|
+
const scriptPath = scriptArray[scriptArray.length - 1];
|
|
804
|
+
const resolved = await resolveFileReference2(scriptPath, searchRoots);
|
|
805
|
+
if (resolved.resolvedPath) {
|
|
806
|
+
resolvedPromptScript = [...scriptArray.slice(0, -1), path3.resolve(resolved.resolvedPath)];
|
|
807
|
+
} else {
|
|
808
|
+
throw new Error(
|
|
809
|
+
`Evaluator '${name}' in '${evalId}': prompt script file not found: ${resolved.displayPath}`
|
|
810
|
+
);
|
|
811
|
+
}
|
|
812
|
+
if (isJsonObject2(rawPrompt.config)) {
|
|
813
|
+
promptScriptConfig = rawPrompt.config;
|
|
814
|
+
}
|
|
815
|
+
} else if (typeof rawPrompt === "string") {
|
|
816
|
+
prompt = rawPrompt;
|
|
792
817
|
const resolved = await resolveFileReference2(prompt, searchRoots);
|
|
793
818
|
if (resolved.resolvedPath) {
|
|
794
819
|
promptPath = path3.resolve(resolved.resolvedPath);
|
|
@@ -807,12 +832,7 @@ async function parseEvaluators(rawEvalCase, globalExecution, searchRoots, evalId
|
|
|
807
832
|
}
|
|
808
833
|
const _model = asString(rawEvaluator.model);
|
|
809
834
|
const rawRubrics = rawEvaluator.rubrics;
|
|
810
|
-
const parsedRubrics = Array.isArray(rawRubrics) ? rawRubrics
|
|
811
|
-
id: asString(rubric.id) ?? `rubric-${index + 1}`,
|
|
812
|
-
description: asString(rubric.description) ?? "",
|
|
813
|
-
weight: typeof rubric.weight === "number" ? rubric.weight : 1,
|
|
814
|
-
required: typeof rubric.required === "boolean" ? rubric.required : true
|
|
815
|
-
})).filter((r) => r.description.length > 0) : void 0;
|
|
835
|
+
const parsedRubrics = Array.isArray(rawRubrics) ? parseRubricItems(rawRubrics, name, evalId) : void 0;
|
|
816
836
|
if (typeValue === "rubric") {
|
|
817
837
|
if (!parsedRubrics) {
|
|
818
838
|
logWarning2(`Skipping rubric evaluator '${name}' in '${evalId}': missing rubrics array`);
|
|
@@ -832,13 +852,26 @@ async function parseEvaluators(rawEvalCase, globalExecution, searchRoots, evalId
|
|
|
832
852
|
continue;
|
|
833
853
|
}
|
|
834
854
|
const weight = validateWeight(rawEvaluator.weight, name, evalId);
|
|
855
|
+
const knownProps = /* @__PURE__ */ new Set(["name", "type", "prompt", "model", "rubrics", "weight", "config"]);
|
|
856
|
+
const config = {};
|
|
857
|
+
for (const [key, value] of Object.entries(rawEvaluator)) {
|
|
858
|
+
if (!knownProps.has(key) && value !== void 0) {
|
|
859
|
+
config[key] = value;
|
|
860
|
+
}
|
|
861
|
+
}
|
|
862
|
+
const topLevelConfig = isJsonObject2(rawEvaluator.config) ? rawEvaluator.config : {};
|
|
863
|
+
const mergedConfig = { ...config, ...topLevelConfig };
|
|
864
|
+
const finalConfig = promptScriptConfig ?? (Object.keys(mergedConfig).length > 0 ? mergedConfig : void 0);
|
|
835
865
|
evaluators.push({
|
|
836
866
|
name,
|
|
837
867
|
type: "llm_judge",
|
|
838
868
|
prompt,
|
|
839
869
|
promptPath,
|
|
870
|
+
...promptPath ? { resolvedPromptPath: promptPath } : {},
|
|
871
|
+
...resolvedPromptScript ? { resolvedPromptScript } : {},
|
|
840
872
|
...parsedRubrics && parsedRubrics.length > 0 ? { rubrics: parsedRubrics } : {},
|
|
841
|
-
...weight !== void 0 ? { weight } : {}
|
|
873
|
+
...weight !== void 0 ? { weight } : {},
|
|
874
|
+
...finalConfig ? { config: finalConfig } : {}
|
|
842
875
|
});
|
|
843
876
|
}
|
|
844
877
|
return evaluators.length > 0 ? evaluators : void 0;
|
|
@@ -925,10 +958,190 @@ var VALID_FIELD_AGGREGATION_TYPES = /* @__PURE__ */ new Set(["weighted_average",
|
|
|
925
958
|
function isValidFieldAggregationType(value) {
|
|
926
959
|
return typeof value === "string" && VALID_FIELD_AGGREGATION_TYPES.has(value);
|
|
927
960
|
}
|
|
961
|
+
function parseRubricItems(rawRubrics, evaluatorName, evalId) {
|
|
962
|
+
const items = [];
|
|
963
|
+
for (const [index, rawRubric] of rawRubrics.entries()) {
|
|
964
|
+
if (!isJsonObject2(rawRubric)) {
|
|
965
|
+
logWarning2(
|
|
966
|
+
`Skipping invalid rubric entry at index ${index} in evaluator '${evaluatorName}' (expected object)`
|
|
967
|
+
);
|
|
968
|
+
continue;
|
|
969
|
+
}
|
|
970
|
+
const id = asString(rawRubric.id) ?? `rubric-${index + 1}`;
|
|
971
|
+
const expectedOutcome = asString(rawRubric.expected_outcome) ?? asString(rawRubric.description) ?? "";
|
|
972
|
+
const weight = typeof rawRubric.weight === "number" ? rawRubric.weight : 1;
|
|
973
|
+
let requiredMinScore;
|
|
974
|
+
let required;
|
|
975
|
+
if (typeof rawRubric.required_min_score === "number") {
|
|
976
|
+
const minScore = rawRubric.required_min_score;
|
|
977
|
+
if (!Number.isInteger(minScore) || minScore < 0 || minScore > 10) {
|
|
978
|
+
throw new Error(
|
|
979
|
+
`Invalid required_min_score for rubric '${id}' in evaluator '${evaluatorName}' in '${evalId}': must be an integer 0-10 (got ${minScore})`
|
|
980
|
+
);
|
|
981
|
+
}
|
|
982
|
+
requiredMinScore = minScore;
|
|
983
|
+
}
|
|
984
|
+
if (typeof rawRubric.required === "boolean") {
|
|
985
|
+
required = rawRubric.required;
|
|
986
|
+
}
|
|
987
|
+
let scoreRanges;
|
|
988
|
+
const rawScoreRanges = rawRubric.score_ranges;
|
|
989
|
+
if (rawScoreRanges !== void 0) {
|
|
990
|
+
if (!Array.isArray(rawScoreRanges)) {
|
|
991
|
+
throw new Error(
|
|
992
|
+
`Invalid score_ranges for rubric '${id}' in evaluator '${evaluatorName}' in '${evalId}': must be an array`
|
|
993
|
+
);
|
|
994
|
+
}
|
|
995
|
+
scoreRanges = parseScoreRanges(rawScoreRanges, id, evaluatorName, evalId);
|
|
996
|
+
items.push({
|
|
997
|
+
id,
|
|
998
|
+
weight,
|
|
999
|
+
...expectedOutcome.length > 0 ? { expected_outcome: expectedOutcome } : {},
|
|
1000
|
+
...required !== void 0 ? { required } : {},
|
|
1001
|
+
...requiredMinScore !== void 0 ? { required_min_score: requiredMinScore } : {},
|
|
1002
|
+
score_ranges: scoreRanges
|
|
1003
|
+
});
|
|
1004
|
+
} else {
|
|
1005
|
+
if (expectedOutcome.length === 0) {
|
|
1006
|
+
logWarning2(
|
|
1007
|
+
`Skipping rubric '${id}' in evaluator '${evaluatorName}' in '${evalId}': missing expected_outcome`
|
|
1008
|
+
);
|
|
1009
|
+
continue;
|
|
1010
|
+
}
|
|
1011
|
+
items.push({
|
|
1012
|
+
id,
|
|
1013
|
+
expected_outcome: expectedOutcome,
|
|
1014
|
+
weight,
|
|
1015
|
+
// Default to required: true if not specified (backward compatibility)
|
|
1016
|
+
required: required ?? true,
|
|
1017
|
+
...requiredMinScore !== void 0 ? { required_min_score: requiredMinScore } : {}
|
|
1018
|
+
});
|
|
1019
|
+
}
|
|
1020
|
+
}
|
|
1021
|
+
return items.length > 0 ? items : void 0;
|
|
1022
|
+
}
|
|
1023
|
+
function parseScoreRanges(rawRanges, rubricId, evaluatorName, evalId) {
|
|
1024
|
+
const ranges = [];
|
|
1025
|
+
for (const [index, rawRange] of rawRanges.entries()) {
|
|
1026
|
+
if (!isJsonObject2(rawRange)) {
|
|
1027
|
+
throw new Error(
|
|
1028
|
+
`Invalid score_range entry at index ${index} for rubric '${rubricId}' in evaluator '${evaluatorName}' in '${evalId}': expected object`
|
|
1029
|
+
);
|
|
1030
|
+
}
|
|
1031
|
+
const scoreRangeValue = rawRange.score_range;
|
|
1032
|
+
if (!Array.isArray(scoreRangeValue) || scoreRangeValue.length !== 2 || typeof scoreRangeValue[0] !== "number" || typeof scoreRangeValue[1] !== "number") {
|
|
1033
|
+
throw new Error(
|
|
1034
|
+
`Invalid score_range at index ${index} for rubric '${rubricId}' in evaluator '${evaluatorName}' in '${evalId}': must be [min, max] array of two numbers`
|
|
1035
|
+
);
|
|
1036
|
+
}
|
|
1037
|
+
const [min, max] = scoreRangeValue;
|
|
1038
|
+
if (!Number.isInteger(min) || !Number.isInteger(max)) {
|
|
1039
|
+
throw new Error(
|
|
1040
|
+
`Invalid score_range at index ${index} for rubric '${rubricId}' in evaluator '${evaluatorName}' in '${evalId}': values must be integers (got [${min}, ${max}])`
|
|
1041
|
+
);
|
|
1042
|
+
}
|
|
1043
|
+
if (min < 0 || min > 10 || max < 0 || max > 10) {
|
|
1044
|
+
throw new Error(
|
|
1045
|
+
`Invalid score_range at index ${index} for rubric '${rubricId}' in evaluator '${evaluatorName}' in '${evalId}': values must be 0-10 (got [${min}, ${max}])`
|
|
1046
|
+
);
|
|
1047
|
+
}
|
|
1048
|
+
if (min > max) {
|
|
1049
|
+
throw new Error(
|
|
1050
|
+
`Invalid score_range at index ${index} for rubric '${rubricId}' in evaluator '${evaluatorName}' in '${evalId}': min must be <= max (got [${min}, ${max}])`
|
|
1051
|
+
);
|
|
1052
|
+
}
|
|
1053
|
+
const expectedOutcome = asString(rawRange.expected_outcome) ?? asString(rawRange.description) ?? "";
|
|
1054
|
+
if (expectedOutcome.length === 0) {
|
|
1055
|
+
throw new Error(
|
|
1056
|
+
`Missing expected_outcome for score_range [${min}, ${max}] in rubric '${rubricId}' in evaluator '${evaluatorName}' in '${evalId}'`
|
|
1057
|
+
);
|
|
1058
|
+
}
|
|
1059
|
+
ranges.push({
|
|
1060
|
+
score_range: [min, max],
|
|
1061
|
+
expected_outcome: expectedOutcome
|
|
1062
|
+
});
|
|
1063
|
+
}
|
|
1064
|
+
const sortedRanges = [...ranges].sort((a, b) => a.score_range[0] - b.score_range[0]);
|
|
1065
|
+
for (let i = 1; i < sortedRanges.length; i++) {
|
|
1066
|
+
const prev = sortedRanges[i - 1];
|
|
1067
|
+
const curr = sortedRanges[i];
|
|
1068
|
+
if (curr.score_range[0] <= prev.score_range[1]) {
|
|
1069
|
+
throw new Error(
|
|
1070
|
+
`Overlapping score_ranges in rubric '${rubricId}' in evaluator '${evaluatorName}' in '${evalId}': [${prev.score_range[0]}, ${prev.score_range[1]}] overlaps with [${curr.score_range[0]}, ${curr.score_range[1]}]`
|
|
1071
|
+
);
|
|
1072
|
+
}
|
|
1073
|
+
}
|
|
1074
|
+
const covered = /* @__PURE__ */ new Set();
|
|
1075
|
+
for (const range of ranges) {
|
|
1076
|
+
for (let i = range.score_range[0]; i <= range.score_range[1]; i++) {
|
|
1077
|
+
covered.add(i);
|
|
1078
|
+
}
|
|
1079
|
+
}
|
|
1080
|
+
const missing = [];
|
|
1081
|
+
for (let i = 0; i <= 10; i++) {
|
|
1082
|
+
if (!covered.has(i)) {
|
|
1083
|
+
missing.push(i);
|
|
1084
|
+
}
|
|
1085
|
+
}
|
|
1086
|
+
if (missing.length > 0) {
|
|
1087
|
+
throw new Error(
|
|
1088
|
+
`Incomplete score_ranges coverage in rubric '${rubricId}' in evaluator '${evaluatorName}' in '${evalId}': missing coverage for scores: ${missing.join(", ")}. Ranges must cover all integers 0-10.`
|
|
1089
|
+
);
|
|
1090
|
+
}
|
|
1091
|
+
return ranges;
|
|
1092
|
+
}
|
|
1093
|
+
function parseInlineRubrics(rawRubrics) {
|
|
1094
|
+
const rubricItems = rawRubrics.filter((r) => isJsonObject2(r) || typeof r === "string").map((rubric, index) => {
|
|
1095
|
+
if (typeof rubric === "string") {
|
|
1096
|
+
return {
|
|
1097
|
+
id: `rubric-${index + 1}`,
|
|
1098
|
+
expected_outcome: rubric,
|
|
1099
|
+
weight: 1,
|
|
1100
|
+
required: true
|
|
1101
|
+
};
|
|
1102
|
+
}
|
|
1103
|
+
const expectedOutcome = asString(rubric.expected_outcome) ?? asString(rubric.description) ?? "";
|
|
1104
|
+
const rawScoreRanges = rubric.score_ranges;
|
|
1105
|
+
const scoreRanges = Array.isArray(rawScoreRanges) && rawScoreRanges.length > 0 ? rawScoreRanges.filter((r) => isJsonObject2(r)).map((range) => ({
|
|
1106
|
+
score_range: Array.isArray(range.score_range) ? range.score_range : [0, 10],
|
|
1107
|
+
expected_outcome: asString(range.expected_outcome) ?? asString(range.description) ?? ""
|
|
1108
|
+
})).filter((r) => r.expected_outcome.length > 0) : void 0;
|
|
1109
|
+
const baseRubric = {
|
|
1110
|
+
id: asString(rubric.id) ?? `rubric-${index + 1}`,
|
|
1111
|
+
weight: typeof rubric.weight === "number" ? rubric.weight : 1
|
|
1112
|
+
};
|
|
1113
|
+
if (scoreRanges && scoreRanges.length > 0) {
|
|
1114
|
+
return {
|
|
1115
|
+
...baseRubric,
|
|
1116
|
+
...expectedOutcome.length > 0 ? { expected_outcome: expectedOutcome } : {},
|
|
1117
|
+
...typeof rubric.required === "boolean" ? { required: rubric.required } : {},
|
|
1118
|
+
...typeof rubric.required_min_score === "number" ? { required_min_score: rubric.required_min_score } : {},
|
|
1119
|
+
score_ranges: scoreRanges
|
|
1120
|
+
};
|
|
1121
|
+
}
|
|
1122
|
+
return {
|
|
1123
|
+
...baseRubric,
|
|
1124
|
+
expected_outcome: expectedOutcome,
|
|
1125
|
+
required: typeof rubric.required === "boolean" ? rubric.required : true,
|
|
1126
|
+
...typeof rubric.required_min_score === "number" ? { required_min_score: rubric.required_min_score } : {}
|
|
1127
|
+
};
|
|
1128
|
+
}).filter(
|
|
1129
|
+
(r) => r.expected_outcome && r.expected_outcome.length > 0 || "score_ranges" in r && r.score_ranges
|
|
1130
|
+
);
|
|
1131
|
+
if (rubricItems.length === 0) {
|
|
1132
|
+
return void 0;
|
|
1133
|
+
}
|
|
1134
|
+
return {
|
|
1135
|
+
name: "rubric",
|
|
1136
|
+
type: "llm_judge",
|
|
1137
|
+
rubrics: rubricItems
|
|
1138
|
+
};
|
|
1139
|
+
}
|
|
928
1140
|
|
|
929
1141
|
// src/evaluation/loaders/jsonl-parser.ts
|
|
930
1142
|
import { readFile as readFile4 } from "node:fs/promises";
|
|
931
1143
|
import path5 from "node:path";
|
|
1144
|
+
import micromatch2 from "micromatch";
|
|
932
1145
|
import { parse as parseYaml } from "yaml";
|
|
933
1146
|
|
|
934
1147
|
// src/evaluation/loaders/message-processor.ts
|
|
@@ -1191,6 +1404,65 @@ async function processExpectedMessages(options) {
|
|
|
1191
1404
|
return segments;
|
|
1192
1405
|
}
|
|
1193
1406
|
|
|
1407
|
+
// src/evaluation/loaders/shorthand-expansion.ts
|
|
1408
|
+
function expandInputShorthand(value) {
|
|
1409
|
+
if (value === void 0 || value === null) {
|
|
1410
|
+
return void 0;
|
|
1411
|
+
}
|
|
1412
|
+
if (typeof value === "string") {
|
|
1413
|
+
return [{ role: "user", content: value }];
|
|
1414
|
+
}
|
|
1415
|
+
if (Array.isArray(value)) {
|
|
1416
|
+
const messages = value.filter((msg) => isTestMessage(msg));
|
|
1417
|
+
return messages.length > 0 ? messages : void 0;
|
|
1418
|
+
}
|
|
1419
|
+
return void 0;
|
|
1420
|
+
}
|
|
1421
|
+
function expandExpectedOutputShorthand(value) {
|
|
1422
|
+
if (value === void 0 || value === null) {
|
|
1423
|
+
return void 0;
|
|
1424
|
+
}
|
|
1425
|
+
if (typeof value === "string") {
|
|
1426
|
+
return [{ role: "assistant", content: value }];
|
|
1427
|
+
}
|
|
1428
|
+
if (Array.isArray(value)) {
|
|
1429
|
+
if (value.length > 0 && isJsonObject(value[0]) && "role" in value[0]) {
|
|
1430
|
+
const messages = value.filter((msg) => isTestMessage(msg));
|
|
1431
|
+
return messages.length > 0 ? messages : void 0;
|
|
1432
|
+
}
|
|
1433
|
+
return [{ role: "assistant", content: value }];
|
|
1434
|
+
}
|
|
1435
|
+
if (isJsonObject(value)) {
|
|
1436
|
+
if ("role" in value) {
|
|
1437
|
+
return isTestMessage(value) ? [value] : void 0;
|
|
1438
|
+
}
|
|
1439
|
+
return [{ role: "assistant", content: value }];
|
|
1440
|
+
}
|
|
1441
|
+
return void 0;
|
|
1442
|
+
}
|
|
1443
|
+
function resolveInputMessages(raw) {
|
|
1444
|
+
if (raw.input_messages !== void 0) {
|
|
1445
|
+
if (Array.isArray(raw.input_messages)) {
|
|
1446
|
+
const messages = raw.input_messages.filter((msg) => isTestMessage(msg));
|
|
1447
|
+
return messages.length > 0 ? messages : void 0;
|
|
1448
|
+
}
|
|
1449
|
+
return void 0;
|
|
1450
|
+
}
|
|
1451
|
+
return expandInputShorthand(raw.input);
|
|
1452
|
+
}
|
|
1453
|
+
function resolveExpectedMessages(raw) {
|
|
1454
|
+
if (raw.expected_messages !== void 0) {
|
|
1455
|
+
if (Array.isArray(raw.expected_messages)) {
|
|
1456
|
+
const messages = raw.expected_messages.filter(
|
|
1457
|
+
(msg) => isTestMessage(msg)
|
|
1458
|
+
);
|
|
1459
|
+
return messages.length > 0 ? messages : void 0;
|
|
1460
|
+
}
|
|
1461
|
+
return void 0;
|
|
1462
|
+
}
|
|
1463
|
+
return expandExpectedOutputShorthand(raw.expected_output);
|
|
1464
|
+
}
|
|
1465
|
+
|
|
1194
1466
|
// src/evaluation/loaders/jsonl-parser.ts
|
|
1195
1467
|
var ANSI_YELLOW5 = "\x1B[33m";
|
|
1196
1468
|
var ANSI_RED = "\x1B[31m";
|
|
@@ -1251,7 +1523,7 @@ function parseJsonlContent(content, filePath) {
|
|
|
1251
1523
|
}
|
|
1252
1524
|
async function loadEvalCasesFromJsonl(evalFilePath, repoRoot, options) {
|
|
1253
1525
|
const verbose = options?.verbose ?? false;
|
|
1254
|
-
const
|
|
1526
|
+
const filterPattern = options?.filter;
|
|
1255
1527
|
const absoluteTestPath = path5.resolve(evalFilePath);
|
|
1256
1528
|
const repoRootPath = resolveToAbsolutePath(repoRoot);
|
|
1257
1529
|
const searchRoots = buildSearchRoots2(absoluteTestPath, repoRootPath);
|
|
@@ -1278,28 +1550,20 @@ async function loadEvalCasesFromJsonl(evalFilePath, repoRoot, options) {
|
|
|
1278
1550
|
const evalcase = rawCases[lineIndex];
|
|
1279
1551
|
const lineNumber = lineIndex + 1;
|
|
1280
1552
|
const id = asString4(evalcase.id);
|
|
1281
|
-
if (
|
|
1553
|
+
if (filterPattern && (!id || !micromatch2.isMatch(id, filterPattern))) {
|
|
1282
1554
|
continue;
|
|
1283
1555
|
}
|
|
1284
1556
|
const conversationId = asString4(evalcase.conversation_id);
|
|
1285
1557
|
const outcome = asString4(evalcase.expected_outcome) ?? asString4(evalcase.outcome);
|
|
1286
|
-
const
|
|
1287
|
-
const
|
|
1288
|
-
if (!id || !outcome || !
|
|
1558
|
+
const inputMessages = resolveInputMessages(evalcase);
|
|
1559
|
+
const expectedMessages = resolveExpectedMessages(evalcase) ?? [];
|
|
1560
|
+
if (!id || !outcome || !inputMessages || inputMessages.length === 0) {
|
|
1289
1561
|
logError(
|
|
1290
|
-
`Skipping incomplete eval case at line ${lineNumber}: ${id ?? "unknown"}. Missing required fields: id, expected_outcome, and/or input_messages`
|
|
1562
|
+
`Skipping incomplete eval case at line ${lineNumber}: ${id ?? "unknown"}. Missing required fields: id, expected_outcome, and/or input_messages (or input)`
|
|
1291
1563
|
);
|
|
1292
1564
|
continue;
|
|
1293
1565
|
}
|
|
1294
|
-
const hasExpectedMessages =
|
|
1295
|
-
const inputMessages = inputMessagesValue.filter(
|
|
1296
|
-
(msg) => isTestMessage(msg)
|
|
1297
|
-
);
|
|
1298
|
-
const expectedMessages = hasExpectedMessages ? expectedMessagesValue.filter((msg) => isTestMessage(msg)) : [];
|
|
1299
|
-
if (hasExpectedMessages && expectedMessages.length === 0) {
|
|
1300
|
-
logError(`Line ${lineNumber}: No valid expected message found for eval case: ${id}`);
|
|
1301
|
-
continue;
|
|
1302
|
-
}
|
|
1566
|
+
const hasExpectedMessages = expectedMessages.length > 0;
|
|
1303
1567
|
const guidelinePaths = [];
|
|
1304
1568
|
const inputTextParts = [];
|
|
1305
1569
|
const inputSegments = await processMessages({
|
|
@@ -1345,28 +1609,8 @@ async function loadEvalCasesFromJsonl(evalFilePath, repoRoot, options) {
|
|
|
1345
1609
|
}
|
|
1346
1610
|
const inlineRubrics = evalcase.rubrics;
|
|
1347
1611
|
if (inlineRubrics !== void 0 && Array.isArray(inlineRubrics)) {
|
|
1348
|
-
const
|
|
1349
|
-
|
|
1350
|
-
return {
|
|
1351
|
-
id: `rubric-${index + 1}`,
|
|
1352
|
-
description: rubric,
|
|
1353
|
-
weight: 1,
|
|
1354
|
-
required: true
|
|
1355
|
-
};
|
|
1356
|
-
}
|
|
1357
|
-
return {
|
|
1358
|
-
id: asString4(rubric.id) ?? `rubric-${index + 1}`,
|
|
1359
|
-
description: asString4(rubric.description) ?? "",
|
|
1360
|
-
weight: typeof rubric.weight === "number" ? rubric.weight : 1,
|
|
1361
|
-
required: typeof rubric.required === "boolean" ? rubric.required : true
|
|
1362
|
-
};
|
|
1363
|
-
}).filter((r) => r.description.length > 0);
|
|
1364
|
-
if (rubricItems.length > 0) {
|
|
1365
|
-
const rubricEvaluator = {
|
|
1366
|
-
name: "rubric",
|
|
1367
|
-
type: "llm_judge",
|
|
1368
|
-
rubrics: rubricItems
|
|
1369
|
-
};
|
|
1612
|
+
const rubricEvaluator = parseInlineRubrics(inlineRubrics);
|
|
1613
|
+
if (rubricEvaluator) {
|
|
1370
1614
|
evaluators = evaluators ? [rubricEvaluator, ...evaluators] : [rubricEvaluator];
|
|
1371
1615
|
}
|
|
1372
1616
|
}
|
|
@@ -1676,7 +1920,7 @@ async function loadEvalCases(evalFilePath, repoRoot, options) {
|
|
|
1676
1920
|
return loadEvalCasesFromJsonl(evalFilePath, repoRoot, options);
|
|
1677
1921
|
}
|
|
1678
1922
|
const verbose = options?.verbose ?? false;
|
|
1679
|
-
const
|
|
1923
|
+
const filterPattern = options?.filter;
|
|
1680
1924
|
const absoluteTestPath = path7.resolve(evalFilePath);
|
|
1681
1925
|
const repoRootPath = resolveToAbsolutePath(repoRoot);
|
|
1682
1926
|
const searchRoots = buildSearchRoots2(absoluteTestPath, repoRootPath);
|
|
@@ -1706,28 +1950,20 @@ async function loadEvalCases(evalFilePath, repoRoot, options) {
|
|
|
1706
1950
|
}
|
|
1707
1951
|
const evalcase = rawEvalcase;
|
|
1708
1952
|
const id = asString6(evalcase.id);
|
|
1709
|
-
if (
|
|
1953
|
+
if (filterPattern && (!id || !micromatch3.isMatch(id, filterPattern))) {
|
|
1710
1954
|
continue;
|
|
1711
1955
|
}
|
|
1712
1956
|
const conversationId = asString6(evalcase.conversation_id);
|
|
1713
1957
|
const outcome = asString6(evalcase.expected_outcome) ?? asString6(evalcase.outcome);
|
|
1714
|
-
const
|
|
1715
|
-
const
|
|
1716
|
-
if (!id || !outcome || !
|
|
1958
|
+
const inputMessages = resolveInputMessages(evalcase);
|
|
1959
|
+
const expectedMessages = resolveExpectedMessages(evalcase) ?? [];
|
|
1960
|
+
if (!id || !outcome || !inputMessages || inputMessages.length === 0) {
|
|
1717
1961
|
logError2(
|
|
1718
|
-
`Skipping incomplete eval case: ${id ?? "unknown"}. Missing required fields: id, outcome, and/or input_messages`
|
|
1962
|
+
`Skipping incomplete eval case: ${id ?? "unknown"}. Missing required fields: id, outcome, and/or input_messages (or input)`
|
|
1719
1963
|
);
|
|
1720
1964
|
continue;
|
|
1721
1965
|
}
|
|
1722
|
-
const hasExpectedMessages =
|
|
1723
|
-
const inputMessages = inputMessagesValue.filter(
|
|
1724
|
-
(msg) => isTestMessage(msg)
|
|
1725
|
-
);
|
|
1726
|
-
const expectedMessages = hasExpectedMessages ? expectedMessagesValue.filter((msg) => isTestMessage(msg)) : [];
|
|
1727
|
-
if (hasExpectedMessages && expectedMessages.length === 0) {
|
|
1728
|
-
logError2(`No valid expected message found for eval case: ${id}`);
|
|
1729
|
-
continue;
|
|
1730
|
-
}
|
|
1966
|
+
const hasExpectedMessages = expectedMessages.length > 0;
|
|
1731
1967
|
const guidelinePaths = [];
|
|
1732
1968
|
const inputTextParts = [];
|
|
1733
1969
|
const inputSegments = await processMessages({
|
|
@@ -1771,28 +2007,8 @@ async function loadEvalCases(evalFilePath, repoRoot, options) {
|
|
|
1771
2007
|
}
|
|
1772
2008
|
const inlineRubrics = evalcase.rubrics;
|
|
1773
2009
|
if (inlineRubrics !== void 0 && Array.isArray(inlineRubrics)) {
|
|
1774
|
-
const
|
|
1775
|
-
|
|
1776
|
-
return {
|
|
1777
|
-
id: `rubric-${index + 1}`,
|
|
1778
|
-
description: rubric,
|
|
1779
|
-
weight: 1,
|
|
1780
|
-
required: true
|
|
1781
|
-
};
|
|
1782
|
-
}
|
|
1783
|
-
return {
|
|
1784
|
-
id: asString6(rubric.id) ?? `rubric-${index + 1}`,
|
|
1785
|
-
description: asString6(rubric.description) ?? "",
|
|
1786
|
-
weight: typeof rubric.weight === "number" ? rubric.weight : 1,
|
|
1787
|
-
required: typeof rubric.required === "boolean" ? rubric.required : true
|
|
1788
|
-
};
|
|
1789
|
-
}).filter((r) => r.description.length > 0);
|
|
1790
|
-
if (rubricItems.length > 0) {
|
|
1791
|
-
const rubricEvaluator = {
|
|
1792
|
-
name: "rubric",
|
|
1793
|
-
type: "llm_judge",
|
|
1794
|
-
rubrics: rubricItems
|
|
1795
|
-
};
|
|
2010
|
+
const rubricEvaluator = parseInlineRubrics(inlineRubrics);
|
|
2011
|
+
if (rubricEvaluator) {
|
|
1796
2012
|
evaluators = evaluators ? [rubricEvaluator, ...evaluators] : [rubricEvaluator];
|
|
1797
2013
|
}
|
|
1798
2014
|
}
|
|
@@ -3049,7 +3265,8 @@ var ToolCallSchema = z.object({
|
|
|
3049
3265
|
input: z.unknown().optional(),
|
|
3050
3266
|
output: z.unknown().optional(),
|
|
3051
3267
|
id: z.string().optional(),
|
|
3052
|
-
timestamp: z.string().optional()
|
|
3268
|
+
timestamp: z.string().optional(),
|
|
3269
|
+
duration_ms: z.number().optional()
|
|
3053
3270
|
});
|
|
3054
3271
|
var OutputMessageInputSchema = z.object({
|
|
3055
3272
|
role: z.string(),
|
|
@@ -3057,6 +3274,7 @@ var OutputMessageInputSchema = z.object({
|
|
|
3057
3274
|
content: z.unknown().optional(),
|
|
3058
3275
|
tool_calls: z.array(ToolCallSchema).optional(),
|
|
3059
3276
|
timestamp: z.string().optional(),
|
|
3277
|
+
duration_ms: z.number().optional(),
|
|
3060
3278
|
metadata: z.record(z.unknown()).optional()
|
|
3061
3279
|
});
|
|
3062
3280
|
var TokenUsageSchema = z.object({
|
|
@@ -3095,8 +3313,16 @@ function convertOutputMessages(messages) {
|
|
|
3095
3313
|
role: msg.role,
|
|
3096
3314
|
name: msg.name,
|
|
3097
3315
|
content: msg.content,
|
|
3098
|
-
toolCalls: msg.tool_calls
|
|
3316
|
+
toolCalls: msg.tool_calls?.map((tc) => ({
|
|
3317
|
+
tool: tc.tool,
|
|
3318
|
+
input: tc.input,
|
|
3319
|
+
output: tc.output,
|
|
3320
|
+
id: tc.id,
|
|
3321
|
+
timestamp: tc.timestamp,
|
|
3322
|
+
durationMs: tc.duration_ms
|
|
3323
|
+
})),
|
|
3099
3324
|
timestamp: msg.timestamp,
|
|
3325
|
+
durationMs: msg.duration_ms,
|
|
3100
3326
|
metadata: msg.metadata
|
|
3101
3327
|
}));
|
|
3102
3328
|
}
|
|
@@ -6173,6 +6399,15 @@ var rubricEvaluationSchema = z2.object({
|
|
|
6173
6399
|
checks: z2.array(rubricCheckResultSchema).describe("Results for each rubric item"),
|
|
6174
6400
|
overall_reasoning: z2.string().describe("Overall assessment summary (1-2 sentences)")
|
|
6175
6401
|
});
|
|
6402
|
+
var scoreRangeCheckResultSchema = z2.object({
|
|
6403
|
+
id: z2.string().describe("The ID of the rubric criterion being scored"),
|
|
6404
|
+
score: z2.number().int().min(0).max(10).describe("Integer score 0-10 for this criterion"),
|
|
6405
|
+
reasoning: z2.string().describe("Brief explanation (1-2 sentences) for this score").optional()
|
|
6406
|
+
});
|
|
6407
|
+
var scoreRangeEvaluationSchema = z2.object({
|
|
6408
|
+
checks: z2.array(scoreRangeCheckResultSchema).describe("Scores for each rubric criterion"),
|
|
6409
|
+
overall_reasoning: z2.string().describe("Overall assessment summary (1-2 sentences)").optional()
|
|
6410
|
+
});
|
|
6176
6411
|
var LlmJudgeEvaluator = class {
|
|
6177
6412
|
kind = "llm_judge";
|
|
6178
6413
|
resolveJudgeProvider;
|
|
@@ -6258,6 +6493,10 @@ var LlmJudgeEvaluator = class {
|
|
|
6258
6493
|
`No rubrics found for evaluator "${context.evaluator?.name ?? "llm_judge"}". Run "agentv generate rubrics" first.`
|
|
6259
6494
|
);
|
|
6260
6495
|
}
|
|
6496
|
+
const hasScoreRanges = rubrics.some((r) => r.score_ranges && r.score_ranges.length > 0);
|
|
6497
|
+
if (hasScoreRanges) {
|
|
6498
|
+
return this.evaluateWithScoreRanges(context, judgeProvider, rubrics);
|
|
6499
|
+
}
|
|
6261
6500
|
const prompt = this.buildRubricPrompt(context, rubrics);
|
|
6262
6501
|
const systemPrompt = buildRubricOutputSchema();
|
|
6263
6502
|
const evaluatorRawRequest = {
|
|
@@ -6283,6 +6522,84 @@ var LlmJudgeEvaluator = class {
|
|
|
6283
6522
|
evaluatorRawRequest
|
|
6284
6523
|
};
|
|
6285
6524
|
}
|
|
6525
|
+
/**
|
|
6526
|
+
* Evaluate using score-range rubrics (analytic rubric scoring).
|
|
6527
|
+
* Each criterion is scored 0-10 and normalized to 0-1.
|
|
6528
|
+
*/
|
|
6529
|
+
async evaluateWithScoreRanges(context, judgeProvider, rubrics) {
|
|
6530
|
+
const prompt = this.buildScoreRangePrompt(context, rubrics);
|
|
6531
|
+
const systemPrompt = buildScoreRangeOutputSchema();
|
|
6532
|
+
const evaluatorRawRequest = {
|
|
6533
|
+
userPrompt: prompt,
|
|
6534
|
+
systemPrompt,
|
|
6535
|
+
target: judgeProvider.targetName
|
|
6536
|
+
};
|
|
6537
|
+
const { data } = await this.runWithRetry({
|
|
6538
|
+
context,
|
|
6539
|
+
judgeProvider,
|
|
6540
|
+
systemPrompt,
|
|
6541
|
+
userPrompt: prompt,
|
|
6542
|
+
schema: scoreRangeEvaluationSchema
|
|
6543
|
+
});
|
|
6544
|
+
const { score, verdict, hits, misses, details } = calculateScoreRangeResult(data, rubrics);
|
|
6545
|
+
return {
|
|
6546
|
+
score,
|
|
6547
|
+
verdict,
|
|
6548
|
+
hits,
|
|
6549
|
+
misses,
|
|
6550
|
+
expectedAspectCount: rubrics.length,
|
|
6551
|
+
reasoning: data.overall_reasoning,
|
|
6552
|
+
evaluatorRawRequest,
|
|
6553
|
+
details
|
|
6554
|
+
};
|
|
6555
|
+
}
|
|
6556
|
+
/**
|
|
6557
|
+
* Build prompt for score-range rubric evaluation.
|
|
6558
|
+
*/
|
|
6559
|
+
buildScoreRangePrompt(context, rubrics) {
|
|
6560
|
+
const formattedQuestion = context.promptInputs.question && context.promptInputs.question.trim().length > 0 ? context.promptInputs.question : context.evalCase.question;
|
|
6561
|
+
const parts = [
|
|
6562
|
+
"You are an expert evaluator. Score the candidate answer on each criterion below using the provided score ranges.",
|
|
6563
|
+
"For each criterion, output an integer score from 0 to 10 based on which score range best matches the answer.",
|
|
6564
|
+
"",
|
|
6565
|
+
"[[ ## question ## ]]",
|
|
6566
|
+
formattedQuestion,
|
|
6567
|
+
"",
|
|
6568
|
+
"[[ ## expected_outcome ## ]]",
|
|
6569
|
+
context.evalCase.expected_outcome,
|
|
6570
|
+
""
|
|
6571
|
+
];
|
|
6572
|
+
if (context.evalCase.reference_answer && context.evalCase.reference_answer.trim().length > 0) {
|
|
6573
|
+
parts.push("[[ ## reference_answer ## ]]", context.evalCase.reference_answer, "");
|
|
6574
|
+
}
|
|
6575
|
+
parts.push(
|
|
6576
|
+
"[[ ## candidate_answer ## ]]",
|
|
6577
|
+
context.candidate,
|
|
6578
|
+
"",
|
|
6579
|
+
"[[ ## scoring_criteria ## ]]"
|
|
6580
|
+
);
|
|
6581
|
+
for (const rubric of rubrics) {
|
|
6582
|
+
const weightLabel = rubric.weight !== 1 ? ` (weight: ${rubric.weight})` : "";
|
|
6583
|
+
const minScoreLabel = rubric.required_min_score !== void 0 ? ` [REQUIRED: min score ${rubric.required_min_score}]` : "";
|
|
6584
|
+
parts.push("", `### Criterion: ${rubric.id}${weightLabel}${minScoreLabel}`);
|
|
6585
|
+
if (rubric.expected_outcome) {
|
|
6586
|
+
parts.push(`Description: ${rubric.expected_outcome}`);
|
|
6587
|
+
}
|
|
6588
|
+
if (rubric.score_ranges && rubric.score_ranges.length > 0) {
|
|
6589
|
+
parts.push("Score ranges:");
|
|
6590
|
+
for (const range of rubric.score_ranges) {
|
|
6591
|
+
const [min, max] = range.score_range;
|
|
6592
|
+
const rangeLabel = min === max ? `${min}` : `${min}-${max}`;
|
|
6593
|
+
parts.push(` - Score ${rangeLabel}: ${range.expected_outcome}`);
|
|
6594
|
+
}
|
|
6595
|
+
}
|
|
6596
|
+
}
|
|
6597
|
+
parts.push(
|
|
6598
|
+
"",
|
|
6599
|
+
"For each criterion, provide an integer score 0-10 that matches one of its defined score ranges."
|
|
6600
|
+
);
|
|
6601
|
+
return parts.join("\n");
|
|
6602
|
+
}
|
|
6286
6603
|
buildRubricPrompt(context, rubrics) {
|
|
6287
6604
|
const formattedQuestion = context.promptInputs.question && context.promptInputs.question.trim().length > 0 ? context.promptInputs.question : context.evalCase.question;
|
|
6288
6605
|
const parts = [
|
|
@@ -6302,7 +6619,7 @@ var LlmJudgeEvaluator = class {
|
|
|
6302
6619
|
for (const rubric of rubrics) {
|
|
6303
6620
|
const requiredLabel = rubric.required ? " (REQUIRED)" : "";
|
|
6304
6621
|
const weightLabel = rubric.weight !== 1 ? ` (weight: ${rubric.weight})` : "";
|
|
6305
|
-
parts.push(`- [${rubric.id}]${requiredLabel}${weightLabel}: ${rubric.
|
|
6622
|
+
parts.push(`- [${rubric.id}]${requiredLabel}${weightLabel}: ${rubric.expected_outcome}`);
|
|
6306
6623
|
}
|
|
6307
6624
|
parts.push("", "For each rubric, determine if it is satisfied and provide brief reasoning.");
|
|
6308
6625
|
return parts.join("\n");
|
|
@@ -6389,9 +6706,9 @@ function calculateRubricScore(result, rubrics) {
|
|
|
6389
6706
|
totalWeight += rubric.weight;
|
|
6390
6707
|
if (check.satisfied) {
|
|
6391
6708
|
earnedWeight += rubric.weight;
|
|
6392
|
-
hits.push(`[${rubric.id}] ${rubric.
|
|
6709
|
+
hits.push(`[${rubric.id}] ${rubric.expected_outcome}: ${check.reasoning}`);
|
|
6393
6710
|
} else {
|
|
6394
|
-
misses.push(`[${rubric.id}] ${rubric.
|
|
6711
|
+
misses.push(`[${rubric.id}] ${rubric.expected_outcome}: ${check.reasoning}`);
|
|
6395
6712
|
if (rubric.required) {
|
|
6396
6713
|
failedRequired = true;
|
|
6397
6714
|
}
|
|
@@ -6401,6 +6718,76 @@ function calculateRubricScore(result, rubrics) {
|
|
|
6401
6718
|
const verdict = failedRequired ? "fail" : scoreToVerdict(score);
|
|
6402
6719
|
return { score, verdict, hits, misses };
|
|
6403
6720
|
}
|
|
6721
|
+
function buildScoreRangeOutputSchema() {
|
|
6722
|
+
return `You are an expert evaluator. Score the candidate answer on each criterion.
|
|
6723
|
+
You must return a valid JSON object matching this schema:
|
|
6724
|
+
{
|
|
6725
|
+
"checks": [
|
|
6726
|
+
{
|
|
6727
|
+
"id": "string (criterion id)",
|
|
6728
|
+
"score": integer (0-10),
|
|
6729
|
+
"reasoning": "string (brief explanation for score)"
|
|
6730
|
+
}
|
|
6731
|
+
],
|
|
6732
|
+
"overall_reasoning": "string (summary, optional)"
|
|
6733
|
+
}
|
|
6734
|
+
|
|
6735
|
+
Important: The "score" must be an integer from 0 to 10 that falls within one of the defined score ranges for that criterion.`;
|
|
6736
|
+
}
|
|
6737
|
+
function calculateScoreRangeResult(result, rubrics) {
|
|
6738
|
+
const rubricMap = new Map(rubrics.map((rubric) => [rubric.id, rubric]));
|
|
6739
|
+
const hits = [];
|
|
6740
|
+
const misses = [];
|
|
6741
|
+
const rawScores = {};
|
|
6742
|
+
let totalWeight = 0;
|
|
6743
|
+
let weightedScoreSum = 0;
|
|
6744
|
+
let failedRequired = false;
|
|
6745
|
+
for (const check of result.checks) {
|
|
6746
|
+
const rubric = rubricMap.get(check.id);
|
|
6747
|
+
if (!rubric) {
|
|
6748
|
+
continue;
|
|
6749
|
+
}
|
|
6750
|
+
const rawScore = Math.max(0, Math.min(10, check.score));
|
|
6751
|
+
const normalizedScore = rawScore / 10;
|
|
6752
|
+
rawScores[rubric.id] = rawScore;
|
|
6753
|
+
totalWeight += rubric.weight;
|
|
6754
|
+
weightedScoreSum += normalizedScore * rubric.weight;
|
|
6755
|
+
let requiredMinScore;
|
|
6756
|
+
if (rubric.required_min_score !== void 0) {
|
|
6757
|
+
requiredMinScore = rubric.required_min_score;
|
|
6758
|
+
} else if (rubric.required === true) {
|
|
6759
|
+
requiredMinScore = 10;
|
|
6760
|
+
}
|
|
6761
|
+
const matchingRange = rubric.score_ranges?.find(
|
|
6762
|
+
(r) => rawScore >= r.score_range[0] && rawScore <= r.score_range[1]
|
|
6763
|
+
);
|
|
6764
|
+
const rangeDescription = matchingRange?.expected_outcome ?? "";
|
|
6765
|
+
const criterionLabel = rubric.expected_outcome ?? rubric.id;
|
|
6766
|
+
const reasoningText = check.reasoning ? `: ${check.reasoning}` : "";
|
|
6767
|
+
const scoreInfo = `[${rubric.id}] ${criterionLabel} - Score: ${rawScore}/10 (${rangeDescription})${reasoningText}`;
|
|
6768
|
+
if (requiredMinScore !== void 0 && rawScore < requiredMinScore) {
|
|
6769
|
+
failedRequired = true;
|
|
6770
|
+
misses.push(scoreInfo);
|
|
6771
|
+
} else if (rawScore >= 7) {
|
|
6772
|
+
hits.push(scoreInfo);
|
|
6773
|
+
} else {
|
|
6774
|
+
misses.push(scoreInfo);
|
|
6775
|
+
}
|
|
6776
|
+
}
|
|
6777
|
+
const score = totalWeight > 0 ? Math.min(1, Math.max(0, weightedScoreSum / totalWeight)) : 0;
|
|
6778
|
+
const verdict = failedRequired ? "fail" : scoreToVerdict(score);
|
|
6779
|
+
return {
|
|
6780
|
+
score,
|
|
6781
|
+
verdict,
|
|
6782
|
+
hits,
|
|
6783
|
+
misses,
|
|
6784
|
+
details: {
|
|
6785
|
+
raw_scores: rawScores,
|
|
6786
|
+
normalization: "score / 10",
|
|
6787
|
+
aggregation: "weighted_average"
|
|
6788
|
+
}
|
|
6789
|
+
};
|
|
6790
|
+
}
|
|
6404
6791
|
|
|
6405
6792
|
// src/evaluation/evaluators/composite.ts
|
|
6406
6793
|
var DEFAULT_COMPOSITE_AGGREGATOR_PROMPT = `Review the following evaluation results:
|
|
@@ -7228,6 +7615,27 @@ function argsMatch(expected, actual) {
|
|
|
7228
7615
|
}
|
|
7229
7616
|
return true;
|
|
7230
7617
|
}
|
|
7618
|
+
function checkLatency(toolName, maxDurationMs, actualDurationMs) {
|
|
7619
|
+
if (maxDurationMs === void 0) {
|
|
7620
|
+
return { status: "skip", message: "" };
|
|
7621
|
+
}
|
|
7622
|
+
if (actualDurationMs === void 0) {
|
|
7623
|
+
return {
|
|
7624
|
+
status: "skip",
|
|
7625
|
+
message: `No duration data for ${toolName}; latency assertion skipped`
|
|
7626
|
+
};
|
|
7627
|
+
}
|
|
7628
|
+
if (actualDurationMs <= maxDurationMs) {
|
|
7629
|
+
return {
|
|
7630
|
+
status: "pass",
|
|
7631
|
+
message: `${toolName} completed in ${actualDurationMs}ms (max: ${maxDurationMs}ms)`
|
|
7632
|
+
};
|
|
7633
|
+
}
|
|
7634
|
+
return {
|
|
7635
|
+
status: "fail",
|
|
7636
|
+
message: `${toolName} took ${actualDurationMs}ms (max: ${maxDurationMs}ms)`
|
|
7637
|
+
};
|
|
7638
|
+
}
|
|
7231
7639
|
var ToolTrajectoryEvaluator = class {
|
|
7232
7640
|
kind = "tool_trajectory";
|
|
7233
7641
|
config;
|
|
@@ -7286,7 +7694,8 @@ var ToolTrajectoryEvaluator = class {
|
|
|
7286
7694
|
for (const call of message.toolCalls) {
|
|
7287
7695
|
toolCalls.push({
|
|
7288
7696
|
name: call.tool,
|
|
7289
|
-
args: call.input
|
|
7697
|
+
args: call.input,
|
|
7698
|
+
durationMs: call.durationMs
|
|
7290
7699
|
});
|
|
7291
7700
|
}
|
|
7292
7701
|
}
|
|
@@ -7354,17 +7763,27 @@ var ToolTrajectoryEvaluator = class {
|
|
|
7354
7763
|
}
|
|
7355
7764
|
const hits = [];
|
|
7356
7765
|
const misses = [];
|
|
7766
|
+
const warnings = [];
|
|
7357
7767
|
let actualIndex = 0;
|
|
7768
|
+
let sequenceHits = 0;
|
|
7769
|
+
let latencyHits = 0;
|
|
7770
|
+
let latencySkips = 0;
|
|
7771
|
+
const latencyAssertionCount = expected.filter(
|
|
7772
|
+
(item) => item.maxDurationMs !== void 0
|
|
7773
|
+
).length;
|
|
7358
7774
|
for (let i = 0; i < expected.length; i++) {
|
|
7359
7775
|
const expectedItem = expected[i];
|
|
7360
7776
|
const expectedTool = expectedItem.tool;
|
|
7361
7777
|
let found = false;
|
|
7362
7778
|
let argsMismatch = false;
|
|
7779
|
+
let matchedCall;
|
|
7363
7780
|
while (actualIndex < toolCalls.length) {
|
|
7364
7781
|
const actualCall = toolCalls[actualIndex];
|
|
7365
7782
|
if (actualCall.name === expectedTool) {
|
|
7366
7783
|
if (argsMatch(expectedItem.args, actualCall.args)) {
|
|
7367
7784
|
hits.push(`Found ${expectedTool} at position ${actualIndex}`);
|
|
7785
|
+
sequenceHits++;
|
|
7786
|
+
matchedCall = actualCall;
|
|
7368
7787
|
actualIndex++;
|
|
7369
7788
|
found = true;
|
|
7370
7789
|
break;
|
|
@@ -7381,14 +7800,35 @@ var ToolTrajectoryEvaluator = class {
|
|
|
7381
7800
|
if (!found && !argsMismatch) {
|
|
7382
7801
|
misses.push(`Expected ${expectedTool} at position ${i}, not found in remaining trace`);
|
|
7383
7802
|
}
|
|
7803
|
+
if (found && matchedCall) {
|
|
7804
|
+
const latencyResult = checkLatency(
|
|
7805
|
+
expectedTool,
|
|
7806
|
+
expectedItem.maxDurationMs,
|
|
7807
|
+
matchedCall.durationMs
|
|
7808
|
+
);
|
|
7809
|
+
if (latencyResult.status === "pass") {
|
|
7810
|
+
hits.push(latencyResult.message);
|
|
7811
|
+
latencyHits++;
|
|
7812
|
+
} else if (latencyResult.status === "fail") {
|
|
7813
|
+
misses.push(latencyResult.message);
|
|
7814
|
+
} else if (latencyResult.message) {
|
|
7815
|
+
warnings.push(latencyResult.message);
|
|
7816
|
+
latencySkips++;
|
|
7817
|
+
}
|
|
7818
|
+
}
|
|
7384
7819
|
}
|
|
7385
|
-
const
|
|
7820
|
+
for (const warning of warnings) {
|
|
7821
|
+
console.warn(`[tool_trajectory] ${warning}`);
|
|
7822
|
+
}
|
|
7823
|
+
const effectiveLatencyAssertions = latencyAssertionCount - latencySkips;
|
|
7824
|
+
const totalAssertions = expected.length + effectiveLatencyAssertions;
|
|
7825
|
+
const score = totalAssertions > 0 ? (sequenceHits + latencyHits) / totalAssertions : 1;
|
|
7386
7826
|
return {
|
|
7387
7827
|
score,
|
|
7388
7828
|
verdict: scoreToVerdict(score),
|
|
7389
7829
|
hits,
|
|
7390
7830
|
misses,
|
|
7391
|
-
expectedAspectCount:
|
|
7831
|
+
expectedAspectCount: totalAssertions
|
|
7392
7832
|
};
|
|
7393
7833
|
}
|
|
7394
7834
|
evaluateExact(toolCalls) {
|
|
@@ -7404,6 +7844,13 @@ var ToolTrajectoryEvaluator = class {
|
|
|
7404
7844
|
}
|
|
7405
7845
|
const hits = [];
|
|
7406
7846
|
const misses = [];
|
|
7847
|
+
const warnings = [];
|
|
7848
|
+
let sequenceHits = 0;
|
|
7849
|
+
let latencyHits = 0;
|
|
7850
|
+
let latencySkips = 0;
|
|
7851
|
+
const latencyAssertionCount = expected.filter(
|
|
7852
|
+
(item) => item.maxDurationMs !== void 0
|
|
7853
|
+
).length;
|
|
7407
7854
|
if (toolCalls.length !== expected.length) {
|
|
7408
7855
|
misses.push(`Expected ${expected.length} tool calls, got ${toolCalls.length}`);
|
|
7409
7856
|
}
|
|
@@ -7413,26 +7860,50 @@ var ToolTrajectoryEvaluator = class {
|
|
|
7413
7860
|
const expectedTool = expectedItem.tool;
|
|
7414
7861
|
const actualCall = toolCalls[i];
|
|
7415
7862
|
const actualTool = actualCall.name;
|
|
7863
|
+
let sequenceMatched = false;
|
|
7416
7864
|
if (actualTool === expectedTool) {
|
|
7417
7865
|
if (argsMatch(expectedItem.args, actualCall.args)) {
|
|
7418
7866
|
hits.push(`Position ${i}: ${expectedTool}`);
|
|
7867
|
+
sequenceHits++;
|
|
7868
|
+
sequenceMatched = true;
|
|
7419
7869
|
} else {
|
|
7420
7870
|
misses.push(`Position ${i}: ${expectedTool} args mismatch`);
|
|
7421
7871
|
}
|
|
7422
7872
|
} else {
|
|
7423
7873
|
misses.push(`Position ${i}: expected ${expectedTool}, got ${actualTool}`);
|
|
7424
7874
|
}
|
|
7875
|
+
if (sequenceMatched) {
|
|
7876
|
+
const latencyResult = checkLatency(
|
|
7877
|
+
expectedTool,
|
|
7878
|
+
expectedItem.maxDurationMs,
|
|
7879
|
+
actualCall.durationMs
|
|
7880
|
+
);
|
|
7881
|
+
if (latencyResult.status === "pass") {
|
|
7882
|
+
hits.push(latencyResult.message);
|
|
7883
|
+
latencyHits++;
|
|
7884
|
+
} else if (latencyResult.status === "fail") {
|
|
7885
|
+
misses.push(latencyResult.message);
|
|
7886
|
+
} else if (latencyResult.message) {
|
|
7887
|
+
warnings.push(latencyResult.message);
|
|
7888
|
+
latencySkips++;
|
|
7889
|
+
}
|
|
7890
|
+
}
|
|
7425
7891
|
}
|
|
7426
7892
|
for (let i = checkLength; i < expected.length; i++) {
|
|
7427
7893
|
misses.push(`Position ${i}: expected ${expected[i].tool}, got nothing`);
|
|
7428
7894
|
}
|
|
7429
|
-
const
|
|
7895
|
+
for (const warning of warnings) {
|
|
7896
|
+
console.warn(`[tool_trajectory] ${warning}`);
|
|
7897
|
+
}
|
|
7898
|
+
const effectiveLatencyAssertions = latencyAssertionCount - latencySkips;
|
|
7899
|
+
const totalAssertions = expected.length + effectiveLatencyAssertions;
|
|
7900
|
+
const score = totalAssertions > 0 ? (sequenceHits + latencyHits) / totalAssertions : 1;
|
|
7430
7901
|
return {
|
|
7431
7902
|
score,
|
|
7432
7903
|
verdict: scoreToVerdict(score),
|
|
7433
7904
|
hits,
|
|
7434
7905
|
misses,
|
|
7435
|
-
expectedAspectCount:
|
|
7906
|
+
expectedAspectCount: totalAssertions
|
|
7436
7907
|
};
|
|
7437
7908
|
}
|
|
7438
7909
|
};
|
|
@@ -7440,6 +7911,7 @@ var ToolTrajectoryEvaluator = class {
|
|
|
7440
7911
|
// src/evaluation/orchestrator.ts
|
|
7441
7912
|
import { createHash } from "node:crypto";
|
|
7442
7913
|
import path15 from "node:path";
|
|
7914
|
+
import micromatch4 from "micromatch";
|
|
7443
7915
|
|
|
7444
7916
|
// ../../node_modules/.bun/yocto-queue@1.2.2/node_modules/yocto-queue/index.js
|
|
7445
7917
|
var Node = class {
|
|
@@ -7598,17 +8070,17 @@ async function runEvaluation(options) {
|
|
|
7598
8070
|
cache,
|
|
7599
8071
|
useCache,
|
|
7600
8072
|
now,
|
|
7601
|
-
|
|
8073
|
+
filter,
|
|
7602
8074
|
verbose,
|
|
7603
8075
|
evalCases: preloadedEvalCases,
|
|
7604
8076
|
onResult,
|
|
7605
8077
|
onProgress
|
|
7606
8078
|
} = options;
|
|
7607
|
-
const evalCases = preloadedEvalCases ?? await loadEvalCases(evalFilePath, repoRoot, { verbose,
|
|
7608
|
-
const filteredEvalCases = filterEvalCases(evalCases,
|
|
8079
|
+
const evalCases = preloadedEvalCases ?? await loadEvalCases(evalFilePath, repoRoot, { verbose, filter });
|
|
8080
|
+
const filteredEvalCases = filterEvalCases(evalCases, filter);
|
|
7609
8081
|
if (filteredEvalCases.length === 0) {
|
|
7610
|
-
if (
|
|
7611
|
-
throw new Error(`
|
|
8082
|
+
if (filter) {
|
|
8083
|
+
throw new Error(`No eval cases matched filter '${filter}' in ${evalFilePath}`);
|
|
7612
8084
|
}
|
|
7613
8085
|
return [];
|
|
7614
8086
|
}
|
|
@@ -8184,7 +8656,10 @@ async function runEvaluatorList(options) {
|
|
|
8184
8656
|
attempt,
|
|
8185
8657
|
promptInputs,
|
|
8186
8658
|
now,
|
|
8187
|
-
judgeProvider
|
|
8659
|
+
judgeProvider,
|
|
8660
|
+
outputMessages,
|
|
8661
|
+
traceSummary,
|
|
8662
|
+
agentTimeoutMs
|
|
8188
8663
|
});
|
|
8189
8664
|
const weight = evaluator.weight ?? 1;
|
|
8190
8665
|
scored.push({ score: score2, name: evaluator.name, type: evaluator.type, weight });
|
|
@@ -8519,9 +8994,22 @@ async function runLlmJudgeEvaluator(options) {
|
|
|
8519
8994
|
attempt,
|
|
8520
8995
|
promptInputs,
|
|
8521
8996
|
now,
|
|
8522
|
-
judgeProvider
|
|
8997
|
+
judgeProvider,
|
|
8998
|
+
outputMessages,
|
|
8999
|
+
traceSummary,
|
|
9000
|
+
agentTimeoutMs
|
|
8523
9001
|
} = options;
|
|
8524
|
-
const customPrompt = await resolveCustomPrompt(
|
|
9002
|
+
const customPrompt = await resolveCustomPrompt(
|
|
9003
|
+
config,
|
|
9004
|
+
{
|
|
9005
|
+
evalCase,
|
|
9006
|
+
candidate,
|
|
9007
|
+
outputMessages,
|
|
9008
|
+
traceSummary,
|
|
9009
|
+
config: config.config
|
|
9010
|
+
},
|
|
9011
|
+
agentTimeoutMs
|
|
9012
|
+
);
|
|
8525
9013
|
return evaluatorRegistry.llm_judge.evaluate({
|
|
8526
9014
|
evalCase,
|
|
8527
9015
|
candidate,
|
|
@@ -8535,23 +9023,70 @@ async function runLlmJudgeEvaluator(options) {
|
|
|
8535
9023
|
evaluator: config
|
|
8536
9024
|
});
|
|
8537
9025
|
}
|
|
8538
|
-
async function resolveCustomPrompt(
|
|
8539
|
-
if (
|
|
9026
|
+
async function resolveCustomPrompt(promptConfig, context, timeoutMs) {
|
|
9027
|
+
if (promptConfig.resolvedPromptScript && promptConfig.resolvedPromptScript.length > 0) {
|
|
9028
|
+
if (!context) {
|
|
9029
|
+
throw new Error("Context required for executable prompt templates");
|
|
9030
|
+
}
|
|
9031
|
+
return executePromptTemplate(
|
|
9032
|
+
promptConfig.resolvedPromptScript,
|
|
9033
|
+
context,
|
|
9034
|
+
promptConfig.config,
|
|
9035
|
+
timeoutMs
|
|
9036
|
+
);
|
|
9037
|
+
}
|
|
9038
|
+
const promptPath = promptConfig.resolvedPromptPath ?? promptConfig.promptPath;
|
|
9039
|
+
if (promptPath) {
|
|
8540
9040
|
try {
|
|
8541
|
-
const content = await readTextFile(
|
|
9041
|
+
const content = await readTextFile(promptPath);
|
|
8542
9042
|
return content;
|
|
8543
9043
|
} catch (error) {
|
|
8544
9044
|
const message = error instanceof Error ? error.message : String(error);
|
|
8545
|
-
console.warn(`Could not read custom prompt at ${
|
|
9045
|
+
console.warn(`Could not read custom prompt at ${promptPath}: ${message}`);
|
|
8546
9046
|
}
|
|
8547
9047
|
}
|
|
8548
|
-
|
|
9048
|
+
const promptValue = promptConfig.prompt;
|
|
9049
|
+
if (typeof promptValue === "string") {
|
|
9050
|
+
return promptValue;
|
|
9051
|
+
}
|
|
9052
|
+
return void 0;
|
|
9053
|
+
}
|
|
9054
|
+
async function executePromptTemplate(script, context, config, timeoutMs) {
|
|
9055
|
+
const payload = {
|
|
9056
|
+
question: context.evalCase.question,
|
|
9057
|
+
expectedOutcome: context.evalCase.expected_outcome,
|
|
9058
|
+
expectedMessages: context.evalCase.expected_messages,
|
|
9059
|
+
referenceAnswer: context.evalCase.reference_answer,
|
|
9060
|
+
candidateAnswer: context.candidate,
|
|
9061
|
+
outputMessages: context.outputMessages ?? null,
|
|
9062
|
+
guidelineFiles: context.evalCase.guideline_paths,
|
|
9063
|
+
inputFiles: context.evalCase.file_paths.filter(
|
|
9064
|
+
(p) => !context.evalCase.guideline_paths.includes(p)
|
|
9065
|
+
),
|
|
9066
|
+
inputMessages: context.evalCase.input_messages,
|
|
9067
|
+
traceSummary: context.traceSummary ?? null,
|
|
9068
|
+
config: config ?? context.config ?? null
|
|
9069
|
+
};
|
|
9070
|
+
const inputJson = JSON.stringify(toSnakeCaseDeep(payload), null, 2);
|
|
9071
|
+
const scriptPath = script[script.length - 1];
|
|
9072
|
+
const cwd = path15.dirname(scriptPath);
|
|
9073
|
+
try {
|
|
9074
|
+
const stdout = await executeScript(script, inputJson, timeoutMs, cwd);
|
|
9075
|
+
const prompt = stdout.trim();
|
|
9076
|
+
if (!prompt) {
|
|
9077
|
+
throw new Error("Prompt template produced empty output");
|
|
9078
|
+
}
|
|
9079
|
+
return prompt;
|
|
9080
|
+
} catch (error) {
|
|
9081
|
+
const message = error instanceof Error ? error.message : String(error);
|
|
9082
|
+
throw new Error(`Prompt template execution failed: ${message}`);
|
|
9083
|
+
}
|
|
8549
9084
|
}
|
|
8550
|
-
function filterEvalCases(evalCases,
|
|
8551
|
-
if (!
|
|
9085
|
+
function filterEvalCases(evalCases, filter) {
|
|
9086
|
+
if (!filter) {
|
|
8552
9087
|
return evalCases;
|
|
8553
9088
|
}
|
|
8554
|
-
return evalCases.filter((evalCase) => evalCase.id
|
|
9089
|
+
return evalCases.filter((evalCase) => micromatch4.isMatch(evalCase.id, filter));
|
|
8555
9090
|
}
|
|
8556
9091
|
function buildEvaluatorRegistry(overrides, resolveJudgeProvider) {
|
|
8557
9092
|
const llmJudge = overrides?.llm_judge ?? new LlmJudgeEvaluator({
|
|
@@ -8709,7 +9244,7 @@ import { generateText as generateText4 } from "ai";
|
|
|
8709
9244
|
import { z as z3 } from "zod";
|
|
8710
9245
|
var rubricItemSchema = z3.object({
|
|
8711
9246
|
id: z3.string().describe("Short identifier for this rubric (e.g., clarity, completeness)"),
|
|
8712
|
-
|
|
9247
|
+
expected_outcome: z3.string().describe("Concrete expected outcome for this rubric item"),
|
|
8713
9248
|
weight: z3.number().default(1).describe("Relative importance (default 1.0)"),
|
|
8714
9249
|
required: z3.boolean().default(true).describe("Whether this is a mandatory requirement")
|
|
8715
9250
|
});
|
|
@@ -8729,7 +9264,7 @@ You must return a valid JSON object matching this schema:
|
|
|
8729
9264
|
"rubrics": [
|
|
8730
9265
|
{
|
|
8731
9266
|
"id": "string (short identifier)",
|
|
8732
|
-
"
|
|
9267
|
+
"expected_outcome": "string (concrete expected outcome for this rubric item)",
|
|
8733
9268
|
"weight": number (default 1.0),
|
|
8734
9269
|
"required": boolean (default true)
|
|
8735
9270
|
}
|
|
@@ -8765,7 +9300,7 @@ function buildPrompt(expectedOutcome, question, referenceAnswer) {
|
|
|
8765
9300
|
"Each rubric should:",
|
|
8766
9301
|
"- Be specific and testable",
|
|
8767
9302
|
"- Have a short, descriptive ID",
|
|
8768
|
-
"- Include a clear
|
|
9303
|
+
"- Include a clear expected outcome statement (what a good answer must demonstrate for this rubric)",
|
|
8769
9304
|
"- Indicate if it is required (mandatory) or optional",
|
|
8770
9305
|
"- Have an appropriate weight (default 1.0, use higher values for more important aspects)",
|
|
8771
9306
|
"",
|