@agentv/core 2.2.0 → 2.5.2

This diff shows the published contents of these package versions as they appear in their public registry. It is provided for informational purposes only.
package/dist/index.cjs CHANGED
@@ -229,6 +229,7 @@ function mergeExecutionMetrics(summary, metrics) {
  // src/evaluation/yaml-parser.ts
  var import_promises7 = require("fs/promises");
  var import_node_path7 = __toESM(require("path"), 1);
+ var import_micromatch3 = __toESM(require("micromatch"), 1);
  var import_yaml3 = require("yaml");

  // src/evaluation/loaders/config-loader.ts
@@ -543,11 +544,11 @@ async function parseEvaluators(rawEvalCase, globalExecution, searchRoots, evalId
  );
  }
  }
- const knownProps = /* @__PURE__ */ new Set(["name", "type", "script", "cwd", "weight", "target"]);
- const config = {};
+ const knownProps2 = /* @__PURE__ */ new Set(["name", "type", "script", "cwd", "weight", "target"]);
+ const config2 = {};
  for (const [key, value] of Object.entries(rawEvaluator)) {
- if (!knownProps.has(key) && value !== void 0) {
- config[key] = value;
+ if (!knownProps2.has(key) && value !== void 0) {
+ config2[key] = value;
  }
  }
  evaluators.push({
@@ -557,7 +558,7 @@ async function parseEvaluators(rawEvalCase, globalExecution, searchRoots, evalId
  cwd,
  resolvedCwd,
  ...weight2 !== void 0 ? { weight: weight2 } : {},
- ...Object.keys(config).length > 0 ? { config } : {},
+ ...Object.keys(config2).length > 0 ? { config: config2 } : {},
  ...targetConfig !== void 0 ? { target: targetConfig } : {}
  });
  continue;
@@ -722,7 +723,7 @@ async function parseEvaluators(rawEvalCase, globalExecution, searchRoots, evalId
  continue;
  }
  const weight2 = validateWeight(rawEvaluator.weight, name, evalId);
- const config = {
+ const config2 = {
  name,
  type: "tool_trajectory",
  mode,
@@ -730,7 +731,7 @@ async function parseEvaluators(rawEvalCase, globalExecution, searchRoots, evalId
  ...expected ? { expected } : {},
  ...weight2 !== void 0 ? { weight: weight2 } : {}
  };
- evaluators.push(config);
+ evaluators.push(config2);
  continue;
  }
  if (typeValue === "field_accuracy") {
@@ -867,9 +868,33 @@ async function parseEvaluators(rawEvalCase, globalExecution, searchRoots, evalId
  });
  continue;
  }
- const prompt = asString(rawEvaluator.prompt);
+ const rawPrompt = rawEvaluator.prompt;
+ let prompt;
  let promptPath;
- if (prompt) {
+ let resolvedPromptScript;
+ let promptScriptConfig;
+ if (isJsonObject2(rawPrompt)) {
+ const scriptArray = asStringArray(
+ rawPrompt.script,
+ `prompt.script for evaluator '${name}' in '${evalId}'`
+ );
+ if (!scriptArray) {
+ throw new Error(`Evaluator '${name}' in '${evalId}': prompt object requires script array`);
+ }
+ const scriptPath = scriptArray[scriptArray.length - 1];
+ const resolved = await resolveFileReference(scriptPath, searchRoots);
+ if (resolved.resolvedPath) {
+ resolvedPromptScript = [...scriptArray.slice(0, -1), import_node_path3.default.resolve(resolved.resolvedPath)];
+ } else {
+ throw new Error(
+ `Evaluator '${name}' in '${evalId}': prompt script file not found: ${resolved.displayPath}`
+ );
+ }
+ if (isJsonObject2(rawPrompt.config)) {
+ promptScriptConfig = rawPrompt.config;
+ }
+ } else if (typeof rawPrompt === "string") {
+ prompt = rawPrompt;
  const resolved = await resolveFileReference(prompt, searchRoots);
  if (resolved.resolvedPath) {
  promptPath = import_node_path3.default.resolve(resolved.resolvedPath);
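
Two prompt forms now reach this branch of the parser. A minimal sketch of both shapes, inferred from the parsing logic above (the surrounding eval-file syntax is not part of this diff; paths are hypothetical):

    // String form: resolved as a file path via resolveFileReference.
    const stringPromptEvaluator = {
      name: "style-judge",
      type: "llm_judge",
      prompt: "prompts/style-judge.md"
    };

    // Object form: `script` is required; its last element must resolve to a
    // file, and any leading elements are the interpreter command. `config`
    // is optional and is carried along as promptScriptConfig.
    const scriptPromptEvaluator = {
      name: "dynamic-judge",
      type: "llm_judge",
      prompt: {
        script: ["python3", "prompts/build_prompt.py"],
        config: { tone: "strict" }
      }
    };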
@@ -888,12 +913,7 @@ async function parseEvaluators(rawEvalCase, globalExecution, searchRoots, evalId
  }
  const _model = asString(rawEvaluator.model);
  const rawRubrics = rawEvaluator.rubrics;
- const parsedRubrics = Array.isArray(rawRubrics) ? rawRubrics.filter((r) => isJsonObject2(r)).map((rubric, index) => ({
- id: asString(rubric.id) ?? `rubric-${index + 1}`,
- description: asString(rubric.description) ?? "",
- weight: typeof rubric.weight === "number" ? rubric.weight : 1,
- required: typeof rubric.required === "boolean" ? rubric.required : true
- })).filter((r) => r.description.length > 0) : void 0;
+ const parsedRubrics = Array.isArray(rawRubrics) ? parseRubricItems(rawRubrics, name, evalId) : void 0;
  if (typeValue === "rubric") {
  if (!parsedRubrics) {
  logWarning2(`Skipping rubric evaluator '${name}' in '${evalId}': missing rubrics array`);
@@ -913,13 +933,26 @@ async function parseEvaluators(rawEvalCase, globalExecution, searchRoots, evalId
  continue;
  }
  const weight = validateWeight(rawEvaluator.weight, name, evalId);
+ const knownProps = /* @__PURE__ */ new Set(["name", "type", "prompt", "model", "rubrics", "weight", "config"]);
+ const config = {};
+ for (const [key, value] of Object.entries(rawEvaluator)) {
+ if (!knownProps.has(key) && value !== void 0) {
+ config[key] = value;
+ }
+ }
+ const topLevelConfig = isJsonObject2(rawEvaluator.config) ? rawEvaluator.config : {};
+ const mergedConfig = { ...config, ...topLevelConfig };
+ const finalConfig = promptScriptConfig ?? (Object.keys(mergedConfig).length > 0 ? mergedConfig : void 0);
  evaluators.push({
  name,
  type: "llm_judge",
  prompt,
  promptPath,
+ ...promptPath ? { resolvedPromptPath: promptPath } : {},
+ ...resolvedPromptScript ? { resolvedPromptScript } : {},
  ...parsedRubrics && parsedRubrics.length > 0 ? { rubrics: parsedRubrics } : {},
- ...weight !== void 0 ? { weight } : {}
+ ...weight !== void 0 ? { weight } : {},
+ ...finalConfig ? { config: finalConfig } : {}
  });
  }
  return evaluators.length > 0 ? evaluators : void 0;
@@ -1006,10 +1039,190 @@ var VALID_FIELD_AGGREGATION_TYPES = /* @__PURE__ */ new Set(["weighted_average",
  function isValidFieldAggregationType(value) {
  return typeof value === "string" && VALID_FIELD_AGGREGATION_TYPES.has(value);
  }
+ function parseRubricItems(rawRubrics, evaluatorName, evalId) {
+ const items = [];
+ for (const [index, rawRubric] of rawRubrics.entries()) {
+ if (!isJsonObject2(rawRubric)) {
+ logWarning2(
+ `Skipping invalid rubric entry at index ${index} in evaluator '${evaluatorName}' (expected object)`
+ );
+ continue;
+ }
+ const id = asString(rawRubric.id) ?? `rubric-${index + 1}`;
+ const expectedOutcome = asString(rawRubric.expected_outcome) ?? asString(rawRubric.description) ?? "";
+ const weight = typeof rawRubric.weight === "number" ? rawRubric.weight : 1;
+ let requiredMinScore;
+ let required;
+ if (typeof rawRubric.required_min_score === "number") {
+ const minScore = rawRubric.required_min_score;
+ if (!Number.isInteger(minScore) || minScore < 0 || minScore > 10) {
+ throw new Error(
+ `Invalid required_min_score for rubric '${id}' in evaluator '${evaluatorName}' in '${evalId}': must be an integer 0-10 (got ${minScore})`
+ );
+ }
+ requiredMinScore = minScore;
+ }
+ if (typeof rawRubric.required === "boolean") {
+ required = rawRubric.required;
+ }
+ let scoreRanges;
+ const rawScoreRanges = rawRubric.score_ranges;
+ if (rawScoreRanges !== void 0) {
+ if (!Array.isArray(rawScoreRanges)) {
+ throw new Error(
+ `Invalid score_ranges for rubric '${id}' in evaluator '${evaluatorName}' in '${evalId}': must be an array`
+ );
+ }
+ scoreRanges = parseScoreRanges(rawScoreRanges, id, evaluatorName, evalId);
+ items.push({
+ id,
+ weight,
+ ...expectedOutcome.length > 0 ? { expected_outcome: expectedOutcome } : {},
+ ...required !== void 0 ? { required } : {},
+ ...requiredMinScore !== void 0 ? { required_min_score: requiredMinScore } : {},
+ score_ranges: scoreRanges
+ });
+ } else {
+ if (expectedOutcome.length === 0) {
+ logWarning2(
+ `Skipping rubric '${id}' in evaluator '${evaluatorName}' in '${evalId}': missing expected_outcome`
+ );
+ continue;
+ }
+ items.push({
+ id,
+ expected_outcome: expectedOutcome,
+ weight,
+ // Default to required: true if not specified (backward compatibility)
+ required: required ?? true,
+ ...requiredMinScore !== void 0 ? { required_min_score: requiredMinScore } : {}
+ });
+ }
+ }
+ return items.length > 0 ? items : void 0;
+ }
+ function parseScoreRanges(rawRanges, rubricId, evaluatorName, evalId) {
+ const ranges = [];
+ for (const [index, rawRange] of rawRanges.entries()) {
+ if (!isJsonObject2(rawRange)) {
+ throw new Error(
+ `Invalid score_range entry at index ${index} for rubric '${rubricId}' in evaluator '${evaluatorName}' in '${evalId}': expected object`
+ );
+ }
+ const scoreRangeValue = rawRange.score_range;
+ if (!Array.isArray(scoreRangeValue) || scoreRangeValue.length !== 2 || typeof scoreRangeValue[0] !== "number" || typeof scoreRangeValue[1] !== "number") {
+ throw new Error(
+ `Invalid score_range at index ${index} for rubric '${rubricId}' in evaluator '${evaluatorName}' in '${evalId}': must be [min, max] array of two numbers`
+ );
+ }
+ const [min, max] = scoreRangeValue;
+ if (!Number.isInteger(min) || !Number.isInteger(max)) {
+ throw new Error(
+ `Invalid score_range at index ${index} for rubric '${rubricId}' in evaluator '${evaluatorName}' in '${evalId}': values must be integers (got [${min}, ${max}])`
+ );
+ }
+ if (min < 0 || min > 10 || max < 0 || max > 10) {
+ throw new Error(
+ `Invalid score_range at index ${index} for rubric '${rubricId}' in evaluator '${evaluatorName}' in '${evalId}': values must be 0-10 (got [${min}, ${max}])`
+ );
+ }
+ if (min > max) {
+ throw new Error(
+ `Invalid score_range at index ${index} for rubric '${rubricId}' in evaluator '${evaluatorName}' in '${evalId}': min must be <= max (got [${min}, ${max}])`
+ );
+ }
+ const expectedOutcome = asString(rawRange.expected_outcome) ?? asString(rawRange.description) ?? "";
+ if (expectedOutcome.length === 0) {
+ throw new Error(
+ `Missing expected_outcome for score_range [${min}, ${max}] in rubric '${rubricId}' in evaluator '${evaluatorName}' in '${evalId}'`
+ );
+ }
+ ranges.push({
+ score_range: [min, max],
+ expected_outcome: expectedOutcome
+ });
+ }
+ const sortedRanges = [...ranges].sort((a, b) => a.score_range[0] - b.score_range[0]);
+ for (let i = 1; i < sortedRanges.length; i++) {
+ const prev = sortedRanges[i - 1];
+ const curr = sortedRanges[i];
+ if (curr.score_range[0] <= prev.score_range[1]) {
+ throw new Error(
+ `Overlapping score_ranges in rubric '${rubricId}' in evaluator '${evaluatorName}' in '${evalId}': [${prev.score_range[0]}, ${prev.score_range[1]}] overlaps with [${curr.score_range[0]}, ${curr.score_range[1]}]`
+ );
+ }
+ }
+ const covered = /* @__PURE__ */ new Set();
+ for (const range of ranges) {
+ for (let i = range.score_range[0]; i <= range.score_range[1]; i++) {
+ covered.add(i);
+ }
+ }
+ const missing = [];
+ for (let i = 0; i <= 10; i++) {
+ if (!covered.has(i)) {
+ missing.push(i);
+ }
+ }
+ if (missing.length > 0) {
+ throw new Error(
+ `Incomplete score_ranges coverage in rubric '${rubricId}' in evaluator '${evaluatorName}' in '${evalId}': missing coverage for scores: ${missing.join(", ")}. Ranges must cover all integers 0-10.`
+ );
+ }
+ return ranges;
+ }
+ function parseInlineRubrics(rawRubrics) {
+ const rubricItems = rawRubrics.filter((r) => isJsonObject2(r) || typeof r === "string").map((rubric, index) => {
+ if (typeof rubric === "string") {
+ return {
+ id: `rubric-${index + 1}`,
+ expected_outcome: rubric,
+ weight: 1,
+ required: true
+ };
+ }
+ const expectedOutcome = asString(rubric.expected_outcome) ?? asString(rubric.description) ?? "";
+ const rawScoreRanges = rubric.score_ranges;
+ const scoreRanges = Array.isArray(rawScoreRanges) && rawScoreRanges.length > 0 ? rawScoreRanges.filter((r) => isJsonObject2(r)).map((range) => ({
+ score_range: Array.isArray(range.score_range) ? range.score_range : [0, 10],
+ expected_outcome: asString(range.expected_outcome) ?? asString(range.description) ?? ""
+ })).filter((r) => r.expected_outcome.length > 0) : void 0;
+ const baseRubric = {
+ id: asString(rubric.id) ?? `rubric-${index + 1}`,
+ weight: typeof rubric.weight === "number" ? rubric.weight : 1
+ };
+ if (scoreRanges && scoreRanges.length > 0) {
+ return {
+ ...baseRubric,
+ ...expectedOutcome.length > 0 ? { expected_outcome: expectedOutcome } : {},
+ ...typeof rubric.required === "boolean" ? { required: rubric.required } : {},
+ ...typeof rubric.required_min_score === "number" ? { required_min_score: rubric.required_min_score } : {},
+ score_ranges: scoreRanges
+ };
+ }
+ return {
+ ...baseRubric,
+ expected_outcome: expectedOutcome,
+ required: typeof rubric.required === "boolean" ? rubric.required : true,
+ ...typeof rubric.required_min_score === "number" ? { required_min_score: rubric.required_min_score } : {}
+ };
+ }).filter(
+ (r) => r.expected_outcome && r.expected_outcome.length > 0 || "score_ranges" in r && r.score_ranges
+ );
+ if (rubricItems.length === 0) {
+ return void 0;
+ }
+ return {
+ name: "rubric",
+ type: "llm_judge",
+ rubrics: rubricItems
+ };
+ }

  // src/evaluation/loaders/jsonl-parser.ts
  var import_promises5 = require("fs/promises");
  var import_node_path5 = __toESM(require("path"), 1);
+ var import_micromatch2 = __toESM(require("micromatch"), 1);
  var import_yaml2 = require("yaml");

  // src/evaluation/loaders/message-processor.ts
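
parseScoreRanges rejects non-integer bounds, values outside 0-10, min > max, overlapping ranges, and any gap in coverage of the integers 0 through 10. A rubric that passes all of those checks (values are illustrative):

    const rubric = {
      id: "completeness",
      weight: 2,
      required_min_score: 5, // must be an integer 0-10
      score_ranges: [
        { score_range: [0, 3], expected_outcome: "Misses most required points" },
        { score_range: [4, 6], expected_outcome: "Covers some points with gaps" },
        { score_range: [7, 10], expected_outcome: "Covers all required points" }
      ]
    };
    // Dropping [4, 6] (a coverage gap) or widening a range to [3, 6]
    // (an overlap) would throw at load time rather than at judge time.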
@@ -1272,6 +1485,65 @@ async function processExpectedMessages(options) {
  return segments;
  }

+ // src/evaluation/loaders/shorthand-expansion.ts
+ function expandInputShorthand(value) {
+ if (value === void 0 || value === null) {
+ return void 0;
+ }
+ if (typeof value === "string") {
+ return [{ role: "user", content: value }];
+ }
+ if (Array.isArray(value)) {
+ const messages = value.filter((msg) => isTestMessage(msg));
+ return messages.length > 0 ? messages : void 0;
+ }
+ return void 0;
+ }
+ function expandExpectedOutputShorthand(value) {
+ if (value === void 0 || value === null) {
+ return void 0;
+ }
+ if (typeof value === "string") {
+ return [{ role: "assistant", content: value }];
+ }
+ if (Array.isArray(value)) {
+ if (value.length > 0 && isJsonObject(value[0]) && "role" in value[0]) {
+ const messages = value.filter((msg) => isTestMessage(msg));
+ return messages.length > 0 ? messages : void 0;
+ }
+ return [{ role: "assistant", content: value }];
+ }
+ if (isJsonObject(value)) {
+ if ("role" in value) {
+ return isTestMessage(value) ? [value] : void 0;
+ }
+ return [{ role: "assistant", content: value }];
+ }
+ return void 0;
+ }
+ function resolveInputMessages(raw) {
+ if (raw.input_messages !== void 0) {
+ if (Array.isArray(raw.input_messages)) {
+ const messages = raw.input_messages.filter((msg) => isTestMessage(msg));
+ return messages.length > 0 ? messages : void 0;
+ }
+ return void 0;
+ }
+ return expandInputShorthand(raw.input);
+ }
+ function resolveExpectedMessages(raw) {
+ if (raw.expected_messages !== void 0) {
+ if (Array.isArray(raw.expected_messages)) {
+ const messages = raw.expected_messages.filter(
+ (msg) => isTestMessage(msg)
+ );
+ return messages.length > 0 ? messages : void 0;
+ }
+ return void 0;
+ }
+ return expandExpectedOutputShorthand(raw.expected_output);
+ }
+
  // src/evaluation/loaders/jsonl-parser.ts
  var ANSI_YELLOW5 = "\x1B[33m";
  var ANSI_RED = "\x1B[31m";
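
The new shorthand module lets a case declare plain input / expected_output values instead of full message arrays. The expansions, restated from the functions above with illustrative values:

    expandInputShorthand("What is 2 + 2?");
    // -> [{ role: "user", content: "What is 2 + 2?" }]

    expandExpectedOutputShorthand("4");
    // -> [{ role: "assistant", content: "4" }]

    // An object without a "role" key is wrapped as assistant content,
    // which is handy for structured expected outputs:
    expandExpectedOutputShorthand({ answer: 4 });
    // -> [{ role: "assistant", content: { answer: 4 } }]

    // Full message arrays pass through unchanged, and explicit
    // input_messages / expected_messages always win over the shorthand.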
@@ -1332,7 +1604,7 @@ function parseJsonlContent(content, filePath) {
  }
  async function loadEvalCasesFromJsonl(evalFilePath, repoRoot, options) {
  const verbose = options?.verbose ?? false;
- const evalIdFilter = options?.evalId;
+ const filterPattern = options?.filter;
  const absoluteTestPath = import_node_path5.default.resolve(evalFilePath);
  const repoRootPath = resolveToAbsolutePath(repoRoot);
  const searchRoots = buildSearchRoots(absoluteTestPath, repoRootPath);
@@ -1359,28 +1631,20 @@ async function loadEvalCasesFromJsonl(evalFilePath, repoRoot, options) {
  const evalcase = rawCases[lineIndex];
  const lineNumber = lineIndex + 1;
  const id = asString4(evalcase.id);
- if (evalIdFilter && id !== evalIdFilter) {
+ if (filterPattern && (!id || !import_micromatch2.default.isMatch(id, filterPattern))) {
  continue;
  }
  const conversationId = asString4(evalcase.conversation_id);
  const outcome = asString4(evalcase.expected_outcome) ?? asString4(evalcase.outcome);
- const inputMessagesValue = evalcase.input_messages;
- const expectedMessagesValue = evalcase.expected_messages;
- if (!id || !outcome || !Array.isArray(inputMessagesValue)) {
+ const inputMessages = resolveInputMessages(evalcase);
+ const expectedMessages = resolveExpectedMessages(evalcase) ?? [];
+ if (!id || !outcome || !inputMessages || inputMessages.length === 0) {
  logError(
- `Skipping incomplete eval case at line ${lineNumber}: ${id ?? "unknown"}. Missing required fields: id, expected_outcome, and/or input_messages`
+ `Skipping incomplete eval case at line ${lineNumber}: ${id ?? "unknown"}. Missing required fields: id, expected_outcome, and/or input_messages (or input)`
  );
  continue;
  }
- const hasExpectedMessages = Array.isArray(expectedMessagesValue) && expectedMessagesValue.length > 0;
- const inputMessages = inputMessagesValue.filter(
- (msg) => isTestMessage(msg)
- );
- const expectedMessages = hasExpectedMessages ? expectedMessagesValue.filter((msg) => isTestMessage(msg)) : [];
- if (hasExpectedMessages && expectedMessages.length === 0) {
- logError(`Line ${lineNumber}: No valid expected message found for eval case: ${id}`);
- continue;
- }
+ const hasExpectedMessages = expectedMessages.length > 0;
  const guidelinePaths = [];
  const inputTextParts = [];
  const inputSegments = await processMessages({
@@ -1426,28 +1690,8 @@ async function loadEvalCasesFromJsonl(evalFilePath, repoRoot, options) {
  }
  const inlineRubrics = evalcase.rubrics;
  if (inlineRubrics !== void 0 && Array.isArray(inlineRubrics)) {
- const rubricItems = inlineRubrics.filter((r) => isJsonObject(r) || typeof r === "string").map((rubric, index) => {
- if (typeof rubric === "string") {
- return {
- id: `rubric-${index + 1}`,
- description: rubric,
- weight: 1,
- required: true
- };
- }
- return {
- id: asString4(rubric.id) ?? `rubric-${index + 1}`,
- description: asString4(rubric.description) ?? "",
- weight: typeof rubric.weight === "number" ? rubric.weight : 1,
- required: typeof rubric.required === "boolean" ? rubric.required : true
- };
- }).filter((r) => r.description.length > 0);
- if (rubricItems.length > 0) {
- const rubricEvaluator = {
- name: "rubric",
- type: "llm_judge",
- rubrics: rubricItems
- };
+ const rubricEvaluator = parseInlineRubrics(inlineRubrics);
+ if (rubricEvaluator) {
  evaluators = evaluators ? [rubricEvaluator, ...evaluators] : [rubricEvaluator];
  }
  }
@@ -1757,7 +2001,7 @@ async function loadEvalCases(evalFilePath, repoRoot, options) {
  return loadEvalCasesFromJsonl(evalFilePath, repoRoot, options);
  }
  const verbose = options?.verbose ?? false;
- const evalIdFilter = options?.evalId;
+ const filterPattern = options?.filter;
  const absoluteTestPath = import_node_path7.default.resolve(evalFilePath);
  const repoRootPath = resolveToAbsolutePath(repoRoot);
  const searchRoots = buildSearchRoots(absoluteTestPath, repoRootPath);
@@ -1787,28 +2031,20 @@ async function loadEvalCases(evalFilePath, repoRoot, options) {
  }
  const evalcase = rawEvalcase;
  const id = asString6(evalcase.id);
- if (evalIdFilter && id !== evalIdFilter) {
+ if (filterPattern && (!id || !import_micromatch3.default.isMatch(id, filterPattern))) {
  continue;
  }
  const conversationId = asString6(evalcase.conversation_id);
  const outcome = asString6(evalcase.expected_outcome) ?? asString6(evalcase.outcome);
- const inputMessagesValue = evalcase.input_messages;
- const expectedMessagesValue = evalcase.expected_messages;
- if (!id || !outcome || !Array.isArray(inputMessagesValue)) {
+ const inputMessages = resolveInputMessages(evalcase);
+ const expectedMessages = resolveExpectedMessages(evalcase) ?? [];
+ if (!id || !outcome || !inputMessages || inputMessages.length === 0) {
  logError2(
- `Skipping incomplete eval case: ${id ?? "unknown"}. Missing required fields: id, outcome, and/or input_messages`
+ `Skipping incomplete eval case: ${id ?? "unknown"}. Missing required fields: id, outcome, and/or input_messages (or input)`
  );
  continue;
  }
- const hasExpectedMessages = Array.isArray(expectedMessagesValue) && expectedMessagesValue.length > 0;
- const inputMessages = inputMessagesValue.filter(
- (msg) => isTestMessage(msg)
- );
- const expectedMessages = hasExpectedMessages ? expectedMessagesValue.filter((msg) => isTestMessage(msg)) : [];
- if (hasExpectedMessages && expectedMessages.length === 0) {
- logError2(`No valid expected message found for eval case: ${id}`);
- continue;
- }
+ const hasExpectedMessages = expectedMessages.length > 0;
  const guidelinePaths = [];
  const inputTextParts = [];
  const inputSegments = await processMessages({
@@ -1852,28 +2088,8 @@ async function loadEvalCases(evalFilePath, repoRoot, options) {
  }
  const inlineRubrics = evalcase.rubrics;
  if (inlineRubrics !== void 0 && Array.isArray(inlineRubrics)) {
- const rubricItems = inlineRubrics.filter((r) => isJsonObject(r) || typeof r === "string").map((rubric, index) => {
- if (typeof rubric === "string") {
- return {
- id: `rubric-${index + 1}`,
- description: rubric,
- weight: 1,
- required: true
- };
- }
- return {
- id: asString6(rubric.id) ?? `rubric-${index + 1}`,
- description: asString6(rubric.description) ?? "",
- weight: typeof rubric.weight === "number" ? rubric.weight : 1,
- required: typeof rubric.required === "boolean" ? rubric.required : true
- };
- }).filter((r) => r.description.length > 0);
- if (rubricItems.length > 0) {
- const rubricEvaluator = {
- name: "rubric",
- type: "llm_judge",
- rubrics: rubricItems
- };
+ const rubricEvaluator = parseInlineRubrics(inlineRubrics);
+ if (rubricEvaluator) {
  evaluators = evaluators ? [rubricEvaluator, ...evaluators] : [rubricEvaluator];
  }
  }
@@ -3245,7 +3461,8 @@ var ToolCallSchema = import_zod.z.object({
  input: import_zod.z.unknown().optional(),
  output: import_zod.z.unknown().optional(),
  id: import_zod.z.string().optional(),
- timestamp: import_zod.z.string().optional()
+ timestamp: import_zod.z.string().optional(),
+ duration_ms: import_zod.z.number().optional()
  });
  var OutputMessageInputSchema = import_zod.z.object({
  role: import_zod.z.string(),
@@ -3253,6 +3470,7 @@ var OutputMessageInputSchema = import_zod.z.object({
  content: import_zod.z.unknown().optional(),
  tool_calls: import_zod.z.array(ToolCallSchema).optional(),
  timestamp: import_zod.z.string().optional(),
+ duration_ms: import_zod.z.number().optional(),
  metadata: import_zod.z.record(import_zod.z.unknown()).optional()
  });
  var TokenUsageSchema = import_zod.z.object({
@@ -3291,8 +3509,16 @@ function convertOutputMessages(messages) {
  role: msg.role,
  name: msg.name,
  content: msg.content,
- toolCalls: msg.tool_calls,
+ toolCalls: msg.tool_calls?.map((tc) => ({
+ tool: tc.tool,
+ input: tc.input,
+ output: tc.output,
+ id: tc.id,
+ timestamp: tc.timestamp,
+ durationMs: tc.duration_ms
+ })),
  timestamp: msg.timestamp,
+ durationMs: msg.duration_ms,
  metadata: msg.metadata
  }));
  }
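
Tool calls and output messages may now carry an optional duration_ms, which convertOutputMessages renames to camelCase durationMs on both the message and each tool call. For example (illustrative values):

    const message = {
      role: "assistant",
      tool_calls: [
        { tool: "web_search", input: { q: "weather" }, duration_ms: 412 }
      ],
      duration_ms: 1730
    };
    // convertOutputMessages([message]) yields:
    // [{ role: "assistant",
    //    toolCalls: [{ tool: "web_search", input: { q: "weather" }, durationMs: 412 }],
    //    durationMs: 1730, ... }]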
@@ -7226,6 +7452,15 @@ var rubricEvaluationSchema = import_zod3.z.object({
  checks: import_zod3.z.array(rubricCheckResultSchema).describe("Results for each rubric item"),
  overall_reasoning: import_zod3.z.string().describe("Overall assessment summary (1-2 sentences)")
  });
+ var scoreRangeCheckResultSchema = import_zod3.z.object({
+ id: import_zod3.z.string().describe("The ID of the rubric criterion being scored"),
+ score: import_zod3.z.number().int().min(0).max(10).describe("Integer score 0-10 for this criterion"),
+ reasoning: import_zod3.z.string().describe("Brief explanation (1-2 sentences) for this score").optional()
+ });
+ var scoreRangeEvaluationSchema = import_zod3.z.object({
+ checks: import_zod3.z.array(scoreRangeCheckResultSchema).describe("Scores for each rubric criterion"),
+ overall_reasoning: import_zod3.z.string().describe("Overall assessment summary (1-2 sentences)").optional()
+ });
  var LlmJudgeEvaluator = class {
  kind = "llm_judge";
  resolveJudgeProvider;
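
For reference, a judge response that would satisfy the new schema (illustrative values; reasoning and overall_reasoning are both optional):

    const judgeResponse = {
      checks: [
        { id: "completeness", score: 8, reasoning: "Covers all required points." },
        { id: "accuracy", score: 6 }
      ],
      overall_reasoning: "Strong coverage, minor factual slips."
    };
    scoreRangeEvaluationSchema.parse(judgeResponse); // passes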
@@ -7311,6 +7546,10 @@ var LlmJudgeEvaluator = class {
  `No rubrics found for evaluator "${context.evaluator?.name ?? "llm_judge"}". Run "agentv generate rubrics" first.`
  );
  }
+ const hasScoreRanges = rubrics.some((r) => r.score_ranges && r.score_ranges.length > 0);
+ if (hasScoreRanges) {
+ return this.evaluateWithScoreRanges(context, judgeProvider, rubrics);
+ }
  const prompt = this.buildRubricPrompt(context, rubrics);
  const systemPrompt = buildRubricOutputSchema();
  const evaluatorRawRequest = {
@@ -7336,6 +7575,84 @@ var LlmJudgeEvaluator = class {
  evaluatorRawRequest
  };
  }
+ /**
+ * Evaluate using score-range rubrics (analytic rubric scoring).
+ * Each criterion is scored 0-10 and normalized to 0-1.
+ */
+ async evaluateWithScoreRanges(context, judgeProvider, rubrics) {
+ const prompt = this.buildScoreRangePrompt(context, rubrics);
+ const systemPrompt = buildScoreRangeOutputSchema();
+ const evaluatorRawRequest = {
+ userPrompt: prompt,
+ systemPrompt,
+ target: judgeProvider.targetName
+ };
+ const { data } = await this.runWithRetry({
+ context,
+ judgeProvider,
+ systemPrompt,
+ userPrompt: prompt,
+ schema: scoreRangeEvaluationSchema
+ });
+ const { score, verdict, hits, misses, details } = calculateScoreRangeResult(data, rubrics);
+ return {
+ score,
+ verdict,
+ hits,
+ misses,
+ expectedAspectCount: rubrics.length,
+ reasoning: data.overall_reasoning,
+ evaluatorRawRequest,
+ details
+ };
+ }
+ /**
+ * Build prompt for score-range rubric evaluation.
+ */
+ buildScoreRangePrompt(context, rubrics) {
+ const formattedQuestion = context.promptInputs.question && context.promptInputs.question.trim().length > 0 ? context.promptInputs.question : context.evalCase.question;
+ const parts = [
+ "You are an expert evaluator. Score the candidate answer on each criterion below using the provided score ranges.",
+ "For each criterion, output an integer score from 0 to 10 based on which score range best matches the answer.",
+ "",
+ "[[ ## question ## ]]",
+ formattedQuestion,
+ "",
+ "[[ ## expected_outcome ## ]]",
+ context.evalCase.expected_outcome,
+ ""
+ ];
+ if (context.evalCase.reference_answer && context.evalCase.reference_answer.trim().length > 0) {
+ parts.push("[[ ## reference_answer ## ]]", context.evalCase.reference_answer, "");
+ }
+ parts.push(
+ "[[ ## candidate_answer ## ]]",
+ context.candidate,
+ "",
+ "[[ ## scoring_criteria ## ]]"
+ );
+ for (const rubric of rubrics) {
+ const weightLabel = rubric.weight !== 1 ? ` (weight: ${rubric.weight})` : "";
+ const minScoreLabel = rubric.required_min_score !== void 0 ? ` [REQUIRED: min score ${rubric.required_min_score}]` : "";
+ parts.push("", `### Criterion: ${rubric.id}${weightLabel}${minScoreLabel}`);
+ if (rubric.expected_outcome) {
+ parts.push(`Description: ${rubric.expected_outcome}`);
+ }
+ if (rubric.score_ranges && rubric.score_ranges.length > 0) {
+ parts.push("Score ranges:");
+ for (const range of rubric.score_ranges) {
+ const [min, max] = range.score_range;
+ const rangeLabel = min === max ? `${min}` : `${min}-${max}`;
+ parts.push(` - Score ${rangeLabel}: ${range.expected_outcome}`);
+ }
+ }
+ }
+ parts.push(
+ "",
+ "For each criterion, provide an integer score 0-10 that matches one of its defined score ranges."
+ );
+ return parts.join("\n");
+ }
  buildRubricPrompt(context, rubrics) {
  const formattedQuestion = context.promptInputs.question && context.promptInputs.question.trim().length > 0 ? context.promptInputs.question : context.evalCase.question;
  const parts = [
@@ -7355,7 +7672,7 @@ var LlmJudgeEvaluator = class {
  for (const rubric of rubrics) {
  const requiredLabel = rubric.required ? " (REQUIRED)" : "";
  const weightLabel = rubric.weight !== 1 ? ` (weight: ${rubric.weight})` : "";
- parts.push(`- [${rubric.id}]${requiredLabel}${weightLabel}: ${rubric.description}`);
+ parts.push(`- [${rubric.id}]${requiredLabel}${weightLabel}: ${rubric.expected_outcome}`);
  }
  parts.push("", "For each rubric, determine if it is satisfied and provide brief reasoning.");
  return parts.join("\n");
@@ -7442,9 +7759,9 @@ function calculateRubricScore(result, rubrics) {
  totalWeight += rubric.weight;
  if (check.satisfied) {
  earnedWeight += rubric.weight;
- hits.push(`[${rubric.id}] ${rubric.description}: ${check.reasoning}`);
+ hits.push(`[${rubric.id}] ${rubric.expected_outcome}: ${check.reasoning}`);
  } else {
- misses.push(`[${rubric.id}] ${rubric.description}: ${check.reasoning}`);
+ misses.push(`[${rubric.id}] ${rubric.expected_outcome}: ${check.reasoning}`);
  if (rubric.required) {
  failedRequired = true;
  }
@@ -7454,6 +7771,76 @@ function calculateRubricScore(result, rubrics) {
  const verdict = failedRequired ? "fail" : scoreToVerdict(score);
  return { score, verdict, hits, misses };
  }
+ function buildScoreRangeOutputSchema() {
+ return `You are an expert evaluator. Score the candidate answer on each criterion.
+ You must return a valid JSON object matching this schema:
+ {
+ "checks": [
+ {
+ "id": "string (criterion id)",
+ "score": integer (0-10),
+ "reasoning": "string (brief explanation for score)"
+ }
+ ],
+ "overall_reasoning": "string (summary, optional)"
+ }
+
+ Important: The "score" must be an integer from 0 to 10 that falls within one of the defined score ranges for that criterion.`;
+ }
+ function calculateScoreRangeResult(result, rubrics) {
+ const rubricMap = new Map(rubrics.map((rubric) => [rubric.id, rubric]));
+ const hits = [];
+ const misses = [];
+ const rawScores = {};
+ let totalWeight = 0;
+ let weightedScoreSum = 0;
+ let failedRequired = false;
+ for (const check of result.checks) {
+ const rubric = rubricMap.get(check.id);
+ if (!rubric) {
+ continue;
+ }
+ const rawScore = Math.max(0, Math.min(10, check.score));
+ const normalizedScore = rawScore / 10;
+ rawScores[rubric.id] = rawScore;
+ totalWeight += rubric.weight;
+ weightedScoreSum += normalizedScore * rubric.weight;
+ let requiredMinScore;
+ if (rubric.required_min_score !== void 0) {
+ requiredMinScore = rubric.required_min_score;
+ } else if (rubric.required === true) {
+ requiredMinScore = 10;
+ }
+ const matchingRange = rubric.score_ranges?.find(
+ (r) => rawScore >= r.score_range[0] && rawScore <= r.score_range[1]
+ );
+ const rangeDescription = matchingRange?.expected_outcome ?? "";
+ const criterionLabel = rubric.expected_outcome ?? rubric.id;
+ const reasoningText = check.reasoning ? `: ${check.reasoning}` : "";
+ const scoreInfo = `[${rubric.id}] ${criterionLabel} - Score: ${rawScore}/10 (${rangeDescription})${reasoningText}`;
+ if (requiredMinScore !== void 0 && rawScore < requiredMinScore) {
+ failedRequired = true;
+ misses.push(scoreInfo);
+ } else if (rawScore >= 7) {
+ hits.push(scoreInfo);
+ } else {
+ misses.push(scoreInfo);
+ }
+ }
+ const score = totalWeight > 0 ? Math.min(1, Math.max(0, weightedScoreSum / totalWeight)) : 0;
+ const verdict = failedRequired ? "fail" : scoreToVerdict(score);
+ return {
+ score,
+ verdict,
+ hits,
+ misses,
+ details: {
+ raw_scores: rawScores,
+ normalization: "score / 10",
+ aggregation: "weighted_average"
+ }
+ };
+ }

  // src/evaluation/evaluators/composite.ts
  var DEFAULT_COMPOSITE_AGGREGATOR_PROMPT = `Review the following evaluation results:
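
calculateScoreRangeResult divides each raw score by 10 and takes a weight-weighted mean; required alone implies a minimum score of 10, while required_min_score sets an explicit floor. A worked example with hypothetical inputs:

    const rubrics = [
      { id: "completeness", weight: 2 },
      { id: "accuracy", weight: 1, required_min_score: 7 }
    ];
    const checks = [
      { id: "completeness", score: 8 },
      { id: "accuracy", score: 6 }
    ];
    // weightedScoreSum = (8/10)*2 + (6/10)*1 = 2.2; totalWeight = 3
    // score = 2.2 / 3 ≈ 0.733, but accuracy (6 < 7) trips failedRequired,
    // so the verdict is "fail" even though the weighted score stays 0.733.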
@@ -8281,6 +8668,27 @@ function argsMatch(expected, actual) {
  }
  return true;
  }
+ function checkLatency(toolName, maxDurationMs, actualDurationMs) {
+ if (maxDurationMs === void 0) {
+ return { status: "skip", message: "" };
+ }
+ if (actualDurationMs === void 0) {
+ return {
+ status: "skip",
+ message: `No duration data for ${toolName}; latency assertion skipped`
+ };
+ }
+ if (actualDurationMs <= maxDurationMs) {
+ return {
+ status: "pass",
+ message: `${toolName} completed in ${actualDurationMs}ms (max: ${maxDurationMs}ms)`
+ };
+ }
+ return {
+ status: "fail",
+ message: `${toolName} took ${actualDurationMs}ms (max: ${maxDurationMs}ms)`
+ };
+ }
  var ToolTrajectoryEvaluator = class {
  kind = "tool_trajectory";
  config;
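
checkLatency is deliberately three-valued: an undeclared assertion or missing duration data degrades to a skip (with a warning in the latter case) rather than a failure. Its behavior, restated with sample values:

    checkLatency("web_search", undefined, 412);
    // -> { status: "skip", message: "" }  (no latency assertion declared)
    checkLatency("web_search", 500, undefined);
    // -> { status: "skip", message: "No duration data for web_search; latency assertion skipped" }
    checkLatency("web_search", 500, 412);
    // -> { status: "pass", message: "web_search completed in 412ms (max: 500ms)" }
    checkLatency("web_search", 500, 612);
    // -> { status: "fail", message: "web_search took 612ms (max: 500ms)" }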
@@ -8339,7 +8747,8 @@ var ToolTrajectoryEvaluator = class {
  for (const call of message.toolCalls) {
  toolCalls.push({
  name: call.tool,
- args: call.input
+ args: call.input,
+ durationMs: call.durationMs
  });
  }
  }
@@ -8407,17 +8816,27 @@ var ToolTrajectoryEvaluator = class {
  }
  const hits = [];
  const misses = [];
+ const warnings = [];
  let actualIndex = 0;
+ let sequenceHits = 0;
+ let latencyHits = 0;
+ let latencySkips = 0;
+ const latencyAssertionCount = expected.filter(
+ (item) => item.maxDurationMs !== void 0
+ ).length;
  for (let i = 0; i < expected.length; i++) {
  const expectedItem = expected[i];
  const expectedTool = expectedItem.tool;
  let found = false;
  let argsMismatch = false;
+ let matchedCall;
  while (actualIndex < toolCalls.length) {
  const actualCall = toolCalls[actualIndex];
  if (actualCall.name === expectedTool) {
  if (argsMatch(expectedItem.args, actualCall.args)) {
  hits.push(`Found ${expectedTool} at position ${actualIndex}`);
+ sequenceHits++;
+ matchedCall = actualCall;
  actualIndex++;
  found = true;
  break;
@@ -8434,14 +8853,35 @@ var ToolTrajectoryEvaluator = class {
  if (!found && !argsMismatch) {
  misses.push(`Expected ${expectedTool} at position ${i}, not found in remaining trace`);
  }
+ if (found && matchedCall) {
+ const latencyResult = checkLatency(
+ expectedTool,
+ expectedItem.maxDurationMs,
+ matchedCall.durationMs
+ );
+ if (latencyResult.status === "pass") {
+ hits.push(latencyResult.message);
+ latencyHits++;
+ } else if (latencyResult.status === "fail") {
+ misses.push(latencyResult.message);
+ } else if (latencyResult.message) {
+ warnings.push(latencyResult.message);
+ latencySkips++;
+ }
+ }
  }
- const score = hits.length / expected.length;
+ for (const warning of warnings) {
+ console.warn(`[tool_trajectory] ${warning}`);
+ }
+ const effectiveLatencyAssertions = latencyAssertionCount - latencySkips;
+ const totalAssertions = expected.length + effectiveLatencyAssertions;
+ const score = totalAssertions > 0 ? (sequenceHits + latencyHits) / totalAssertions : 1;
  return {
  score,
  verdict: scoreToVerdict(score),
  hits,
  misses,
- expectedAspectCount: expected.length
+ expectedAspectCount: totalAssertions
  };
  }
  evaluateExact(toolCalls) {
@@ -8457,6 +8897,13 @@ var ToolTrajectoryEvaluator = class {
  }
  const hits = [];
  const misses = [];
+ const warnings = [];
+ let sequenceHits = 0;
+ let latencyHits = 0;
+ let latencySkips = 0;
+ const latencyAssertionCount = expected.filter(
+ (item) => item.maxDurationMs !== void 0
+ ).length;
  if (toolCalls.length !== expected.length) {
  misses.push(`Expected ${expected.length} tool calls, got ${toolCalls.length}`);
  }
@@ -8466,26 +8913,50 @@ var ToolTrajectoryEvaluator = class {
  const expectedTool = expectedItem.tool;
  const actualCall = toolCalls[i];
  const actualTool = actualCall.name;
+ let sequenceMatched = false;
  if (actualTool === expectedTool) {
  if (argsMatch(expectedItem.args, actualCall.args)) {
  hits.push(`Position ${i}: ${expectedTool}`);
+ sequenceHits++;
+ sequenceMatched = true;
  } else {
  misses.push(`Position ${i}: ${expectedTool} args mismatch`);
  }
  } else {
  misses.push(`Position ${i}: expected ${expectedTool}, got ${actualTool}`);
  }
+ if (sequenceMatched) {
+ const latencyResult = checkLatency(
+ expectedTool,
+ expectedItem.maxDurationMs,
+ actualCall.durationMs
+ );
+ if (latencyResult.status === "pass") {
+ hits.push(latencyResult.message);
+ latencyHits++;
+ } else if (latencyResult.status === "fail") {
+ misses.push(latencyResult.message);
+ } else if (latencyResult.message) {
+ warnings.push(latencyResult.message);
+ latencySkips++;
+ }
+ }
  }
  for (let i = checkLength; i < expected.length; i++) {
  misses.push(`Position ${i}: expected ${expected[i].tool}, got nothing`);
  }
- const score = hits.length / expected.length;
+ for (const warning of warnings) {
+ console.warn(`[tool_trajectory] ${warning}`);
+ }
+ const effectiveLatencyAssertions = latencyAssertionCount - latencySkips;
+ const totalAssertions = expected.length + effectiveLatencyAssertions;
+ const score = totalAssertions > 0 ? (sequenceHits + latencyHits) / totalAssertions : 1;
  return {
  score,
  verdict: scoreToVerdict(score),
  hits,
  misses,
- expectedAspectCount: expected.length
+ expectedAspectCount: totalAssertions
  };
  }
  };
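
In both matching modes the score denominator is now sequence assertions plus latency assertions that actually had duration data, so skipped latency checks neither help nor hurt. A worked example with hypothetical counts:

    // 3 expected calls, 2 of which declare maxDurationMs:
    // - all 3 tools match in order           -> sequenceHits = 3
    // - one latency assertion passes         -> latencyHits = 1
    // - one has no duration data (skipped)   -> latencySkips = 1
    // effectiveLatencyAssertions = 2 - 1 = 1
    // totalAssertions = 3 + 1 = 4
    // score = (3 + 1) / 4 = 1.0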
@@ -8493,6 +8964,7 @@ var ToolTrajectoryEvaluator = class {
  // src/evaluation/orchestrator.ts
  var import_node_crypto5 = require("crypto");
  var import_node_path17 = __toESM(require("path"), 1);
+ var import_micromatch4 = __toESM(require("micromatch"), 1);

  // ../../node_modules/.bun/yocto-queue@1.2.2/node_modules/yocto-queue/index.js
  var Node = class {
@@ -8651,17 +9123,17 @@ async function runEvaluation(options) {
  cache,
  useCache,
  now,
- evalId,
+ filter,
  verbose,
  evalCases: preloadedEvalCases,
  onResult,
  onProgress
  } = options;
- const evalCases = preloadedEvalCases ?? await loadEvalCases(evalFilePath, repoRoot, { verbose, evalId });
- const filteredEvalCases = filterEvalCases(evalCases, evalId);
+ const evalCases = preloadedEvalCases ?? await loadEvalCases(evalFilePath, repoRoot, { verbose, filter });
+ const filteredEvalCases = filterEvalCases(evalCases, filter);
  if (filteredEvalCases.length === 0) {
- if (evalId) {
- throw new Error(`Eval case with id '${evalId}' not found in ${evalFilePath}`);
+ if (filter) {
+ throw new Error(`No eval cases matched filter '${filter}' in ${evalFilePath}`);
  }
  return [];
  }
@@ -9237,7 +9709,10 @@ async function runEvaluatorList(options) {
  attempt,
  promptInputs,
  now,
- judgeProvider
+ judgeProvider,
+ outputMessages,
+ traceSummary,
+ agentTimeoutMs
  });
  const weight = evaluator.weight ?? 1;
  scored.push({ score: score2, name: evaluator.name, type: evaluator.type, weight });
@@ -9572,9 +10047,22 @@ async function runLlmJudgeEvaluator(options) {
  attempt,
  promptInputs,
  now,
- judgeProvider
+ judgeProvider,
+ outputMessages,
+ traceSummary,
+ agentTimeoutMs
  } = options;
- const customPrompt = await resolveCustomPrompt(config);
+ const customPrompt = await resolveCustomPrompt(
+ config,
+ {
+ evalCase,
+ candidate,
+ outputMessages,
+ traceSummary,
+ config: config.config
+ },
+ agentTimeoutMs
+ );
  return evaluatorRegistry.llm_judge.evaluate({
  evalCase,
  candidate,
@@ -9588,23 +10076,70 @@ async function runLlmJudgeEvaluator(options) {
  evaluator: config
  });
  }
- async function resolveCustomPrompt(config) {
- if (config.promptPath) {
+ async function resolveCustomPrompt(promptConfig, context, timeoutMs) {
+ if (promptConfig.resolvedPromptScript && promptConfig.resolvedPromptScript.length > 0) {
+ if (!context) {
+ throw new Error("Context required for executable prompt templates");
+ }
+ return executePromptTemplate(
+ promptConfig.resolvedPromptScript,
+ context,
+ promptConfig.config,
+ timeoutMs
+ );
+ }
+ const promptPath = promptConfig.resolvedPromptPath ?? promptConfig.promptPath;
+ if (promptPath) {
  try {
- const content = await readTextFile(config.promptPath);
+ const content = await readTextFile(promptPath);
  return content;
  } catch (error) {
  const message = error instanceof Error ? error.message : String(error);
- console.warn(`Could not read custom prompt at ${config.promptPath}: ${message}`);
+ console.warn(`Could not read custom prompt at ${promptPath}: ${message}`);
  }
  }
- return config.prompt;
+ const promptValue = promptConfig.prompt;
+ if (typeof promptValue === "string") {
+ return promptValue;
+ }
+ return void 0;
+ }
+ async function executePromptTemplate(script, context, config, timeoutMs) {
+ const payload = {
+ question: context.evalCase.question,
+ expectedOutcome: context.evalCase.expected_outcome,
+ expectedMessages: context.evalCase.expected_messages,
+ referenceAnswer: context.evalCase.reference_answer,
+ candidateAnswer: context.candidate,
+ outputMessages: context.outputMessages ?? null,
+ guidelineFiles: context.evalCase.guideline_paths,
+ inputFiles: context.evalCase.file_paths.filter(
+ (p) => !context.evalCase.guideline_paths.includes(p)
+ ),
+ inputMessages: context.evalCase.input_messages,
+ traceSummary: context.traceSummary ?? null,
+ config: config ?? context.config ?? null
+ };
+ const inputJson = JSON.stringify(toSnakeCaseDeep(payload), null, 2);
+ const scriptPath = script[script.length - 1];
+ const cwd = import_node_path17.default.dirname(scriptPath);
+ try {
+ const stdout = await executeScript(script, inputJson, timeoutMs, cwd);
+ const prompt = stdout.trim();
+ if (!prompt) {
+ throw new Error("Prompt template produced empty output");
+ }
+ return prompt;
+ } catch (error) {
+ const message = error instanceof Error ? error.message : String(error);
+ throw new Error(`Prompt template execution failed: ${message}`);
+ }
  }
- function filterEvalCases(evalCases, evalId) {
- if (!evalId) {
+ function filterEvalCases(evalCases, filter) {
+ if (!filter) {
  return evalCases;
  }
- return evalCases.filter((evalCase) => evalCase.id === evalId);
+ return evalCases.filter((evalCase) => import_micromatch4.default.isMatch(evalCase.id, filter));
  }
  function buildEvaluatorRegistry(overrides, resolveJudgeProvider) {
  const llmJudge = overrides?.llm_judge ?? new LlmJudgeEvaluator({
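
The exact-match evalId option is gone; filter is a micromatch glob pattern, and a literal id remains a valid pattern, so existing exact-id invocations keep working. A quick sketch of the matching semantics (ids are hypothetical):

    import micromatch from "micromatch";

    micromatch.isMatch("checkout-happy-path", "checkout-*"); // true
    micromatch.isMatch("login-mfa", "checkout-*");           // false

    // An exact id still matches, since a literal string is a valid glob:
    micromatch.isMatch("login-mfa", "login-mfa");            // true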
@@ -9762,7 +10297,7 @@ var import_ai4 = require("ai");
  var import_zod4 = require("zod");
  var rubricItemSchema = import_zod4.z.object({
  id: import_zod4.z.string().describe("Short identifier for this rubric (e.g., clarity, completeness)"),
- description: import_zod4.z.string().describe("What this rubric checks for"),
+ expected_outcome: import_zod4.z.string().describe("Concrete expected outcome for this rubric item"),
  weight: import_zod4.z.number().default(1).describe("Relative importance (default 1.0)"),
  required: import_zod4.z.boolean().default(true).describe("Whether this is a mandatory requirement")
  });
@@ -9782,7 +10317,7 @@ You must return a valid JSON object matching this schema:
  "rubrics": [
  {
  "id": "string (short identifier)",
- "description": "string (what to check)",
+ "expected_outcome": "string (concrete expected outcome for this rubric item)",
  "weight": number (default 1.0),
  "required": boolean (default true)
  }
@@ -9818,7 +10353,7 @@ function buildPrompt(expectedOutcome, question, referenceAnswer) {
  "Each rubric should:",
  "- Be specific and testable",
  "- Have a short, descriptive ID",
- "- Include a clear description of what to check",
+ "- Include a clear expected outcome statement (what a good answer must demonstrate for this rubric)",
  "- Indicate if it is required (mandatory) or optional",
  "- Have an appropriate weight (default 1.0, use higher values for more important aspects)",
  "",