@agentv/core 2.2.0 → 2.5.2

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
package/dist/index.js CHANGED
@@ -10,7 +10,7 @@ import {
10
10
  readTextFile,
11
11
  resolveFileReference,
12
12
  resolveTargetDefinition
13
- } from "./chunk-KDEP4I7G.js";
13
+ } from "./chunk-RP3M7COZ.js";
14
14
 
15
15
  // src/evaluation/types.ts
16
16
  var TEST_MESSAGE_ROLE_VALUES = ["system", "user", "assistant", "tool"];
@@ -148,6 +148,7 @@ function mergeExecutionMetrics(summary, metrics) {
148
148
  // src/evaluation/yaml-parser.ts
149
149
  import { readFile as readFile6 } from "node:fs/promises";
150
150
  import path7 from "node:path";
151
+ import micromatch3 from "micromatch";
151
152
  import { parse as parse2 } from "yaml";
152
153
 
153
154
  // src/evaluation/loaders/config-loader.ts
@@ -462,11 +463,11 @@ async function parseEvaluators(rawEvalCase, globalExecution, searchRoots, evalId
462
463
  );
463
464
  }
464
465
  }
465
- const knownProps = /* @__PURE__ */ new Set(["name", "type", "script", "cwd", "weight", "target"]);
466
- const config = {};
466
+ const knownProps2 = /* @__PURE__ */ new Set(["name", "type", "script", "cwd", "weight", "target"]);
467
+ const config2 = {};
467
468
  for (const [key, value] of Object.entries(rawEvaluator)) {
468
- if (!knownProps.has(key) && value !== void 0) {
469
- config[key] = value;
469
+ if (!knownProps2.has(key) && value !== void 0) {
470
+ config2[key] = value;
470
471
  }
471
472
  }
472
473
  evaluators.push({
@@ -476,7 +477,7 @@ async function parseEvaluators(rawEvalCase, globalExecution, searchRoots, evalId
476
477
  cwd,
477
478
  resolvedCwd,
478
479
  ...weight2 !== void 0 ? { weight: weight2 } : {},
479
- ...Object.keys(config).length > 0 ? { config } : {},
480
+ ...Object.keys(config2).length > 0 ? { config: config2 } : {},
480
481
  ...targetConfig !== void 0 ? { target: targetConfig } : {}
481
482
  });
482
483
  continue;
@@ -641,7 +642,7 @@ async function parseEvaluators(rawEvalCase, globalExecution, searchRoots, evalId
641
642
  continue;
642
643
  }
643
644
  const weight2 = validateWeight(rawEvaluator.weight, name, evalId);
644
- const config = {
645
+ const config2 = {
645
646
  name,
646
647
  type: "tool_trajectory",
647
648
  mode,
@@ -649,7 +650,7 @@ async function parseEvaluators(rawEvalCase, globalExecution, searchRoots, evalId
649
650
  ...expected ? { expected } : {},
650
651
  ...weight2 !== void 0 ? { weight: weight2 } : {}
651
652
  };
652
- evaluators.push(config);
653
+ evaluators.push(config2);
653
654
  continue;
654
655
  }
655
656
  if (typeValue === "field_accuracy") {
@@ -786,9 +787,33 @@ async function parseEvaluators(rawEvalCase, globalExecution, searchRoots, evalId
786
787
  });
787
788
  continue;
788
789
  }
789
- const prompt = asString(rawEvaluator.prompt);
790
+ const rawPrompt = rawEvaluator.prompt;
791
+ let prompt;
790
792
  let promptPath;
791
- if (prompt) {
793
+ let resolvedPromptScript;
794
+ let promptScriptConfig;
795
+ if (isJsonObject2(rawPrompt)) {
796
+ const scriptArray = asStringArray(
797
+ rawPrompt.script,
798
+ `prompt.script for evaluator '${name}' in '${evalId}'`
799
+ );
800
+ if (!scriptArray) {
801
+ throw new Error(`Evaluator '${name}' in '${evalId}': prompt object requires script array`);
802
+ }
803
+ const scriptPath = scriptArray[scriptArray.length - 1];
804
+ const resolved = await resolveFileReference2(scriptPath, searchRoots);
805
+ if (resolved.resolvedPath) {
806
+ resolvedPromptScript = [...scriptArray.slice(0, -1), path3.resolve(resolved.resolvedPath)];
807
+ } else {
808
+ throw new Error(
809
+ `Evaluator '${name}' in '${evalId}': prompt script file not found: ${resolved.displayPath}`
810
+ );
811
+ }
812
+ if (isJsonObject2(rawPrompt.config)) {
813
+ promptScriptConfig = rawPrompt.config;
814
+ }
815
+ } else if (typeof rawPrompt === "string") {
816
+ prompt = rawPrompt;
792
817
  const resolved = await resolveFileReference2(prompt, searchRoots);
793
818
  if (resolved.resolvedPath) {
794
819
  promptPath = path3.resolve(resolved.resolvedPath);
@@ -807,12 +832,7 @@ async function parseEvaluators(rawEvalCase, globalExecution, searchRoots, evalId
807
832
  }
808
833
  const _model = asString(rawEvaluator.model);
809
834
  const rawRubrics = rawEvaluator.rubrics;
810
- const parsedRubrics = Array.isArray(rawRubrics) ? rawRubrics.filter((r) => isJsonObject2(r)).map((rubric, index) => ({
811
- id: asString(rubric.id) ?? `rubric-${index + 1}`,
812
- description: asString(rubric.description) ?? "",
813
- weight: typeof rubric.weight === "number" ? rubric.weight : 1,
814
- required: typeof rubric.required === "boolean" ? rubric.required : true
815
- })).filter((r) => r.description.length > 0) : void 0;
835
+ const parsedRubrics = Array.isArray(rawRubrics) ? parseRubricItems(rawRubrics, name, evalId) : void 0;
816
836
  if (typeValue === "rubric") {
817
837
  if (!parsedRubrics) {
818
838
  logWarning2(`Skipping rubric evaluator '${name}' in '${evalId}': missing rubrics array`);
@@ -832,13 +852,26 @@ async function parseEvaluators(rawEvalCase, globalExecution, searchRoots, evalId
832
852
  continue;
833
853
  }
834
854
  const weight = validateWeight(rawEvaluator.weight, name, evalId);
855
+ const knownProps = /* @__PURE__ */ new Set(["name", "type", "prompt", "model", "rubrics", "weight", "config"]);
856
+ const config = {};
857
+ for (const [key, value] of Object.entries(rawEvaluator)) {
858
+ if (!knownProps.has(key) && value !== void 0) {
859
+ config[key] = value;
860
+ }
861
+ }
862
+ const topLevelConfig = isJsonObject2(rawEvaluator.config) ? rawEvaluator.config : {};
863
+ const mergedConfig = { ...config, ...topLevelConfig };
864
+ const finalConfig = promptScriptConfig ?? (Object.keys(mergedConfig).length > 0 ? mergedConfig : void 0);
835
865
  evaluators.push({
836
866
  name,
837
867
  type: "llm_judge",
838
868
  prompt,
839
869
  promptPath,
870
+ ...promptPath ? { resolvedPromptPath: promptPath } : {},
871
+ ...resolvedPromptScript ? { resolvedPromptScript } : {},
840
872
  ...parsedRubrics && parsedRubrics.length > 0 ? { rubrics: parsedRubrics } : {},
841
- ...weight !== void 0 ? { weight } : {}
873
+ ...weight !== void 0 ? { weight } : {},
874
+ ...finalConfig ? { config: finalConfig } : {}
842
875
  });
843
876
  }
844
877
  return evaluators.length > 0 ? evaluators : void 0;
@@ -925,10 +958,190 @@ var VALID_FIELD_AGGREGATION_TYPES = /* @__PURE__ */ new Set(["weighted_average",
925
958
  function isValidFieldAggregationType(value) {
926
959
  return typeof value === "string" && VALID_FIELD_AGGREGATION_TYPES.has(value);
927
960
  }
961
+ function parseRubricItems(rawRubrics, evaluatorName, evalId) {
962
+ const items = [];
963
+ for (const [index, rawRubric] of rawRubrics.entries()) {
964
+ if (!isJsonObject2(rawRubric)) {
965
+ logWarning2(
966
+ `Skipping invalid rubric entry at index ${index} in evaluator '${evaluatorName}' (expected object)`
967
+ );
968
+ continue;
969
+ }
970
+ const id = asString(rawRubric.id) ?? `rubric-${index + 1}`;
971
+ const expectedOutcome = asString(rawRubric.expected_outcome) ?? asString(rawRubric.description) ?? "";
972
+ const weight = typeof rawRubric.weight === "number" ? rawRubric.weight : 1;
973
+ let requiredMinScore;
974
+ let required;
975
+ if (typeof rawRubric.required_min_score === "number") {
976
+ const minScore = rawRubric.required_min_score;
977
+ if (!Number.isInteger(minScore) || minScore < 0 || minScore > 10) {
978
+ throw new Error(
979
+ `Invalid required_min_score for rubric '${id}' in evaluator '${evaluatorName}' in '${evalId}': must be an integer 0-10 (got ${minScore})`
980
+ );
981
+ }
982
+ requiredMinScore = minScore;
983
+ }
984
+ if (typeof rawRubric.required === "boolean") {
985
+ required = rawRubric.required;
986
+ }
987
+ let scoreRanges;
988
+ const rawScoreRanges = rawRubric.score_ranges;
989
+ if (rawScoreRanges !== void 0) {
990
+ if (!Array.isArray(rawScoreRanges)) {
991
+ throw new Error(
992
+ `Invalid score_ranges for rubric '${id}' in evaluator '${evaluatorName}' in '${evalId}': must be an array`
993
+ );
994
+ }
995
+ scoreRanges = parseScoreRanges(rawScoreRanges, id, evaluatorName, evalId);
996
+ items.push({
997
+ id,
998
+ weight,
999
+ ...expectedOutcome.length > 0 ? { expected_outcome: expectedOutcome } : {},
1000
+ ...required !== void 0 ? { required } : {},
1001
+ ...requiredMinScore !== void 0 ? { required_min_score: requiredMinScore } : {},
1002
+ score_ranges: scoreRanges
1003
+ });
1004
+ } else {
1005
+ if (expectedOutcome.length === 0) {
1006
+ logWarning2(
1007
+ `Skipping rubric '${id}' in evaluator '${evaluatorName}' in '${evalId}': missing expected_outcome`
1008
+ );
1009
+ continue;
1010
+ }
1011
+ items.push({
1012
+ id,
1013
+ expected_outcome: expectedOutcome,
1014
+ weight,
1015
+ // Default to required: true if not specified (backward compatibility)
1016
+ required: required ?? true,
1017
+ ...requiredMinScore !== void 0 ? { required_min_score: requiredMinScore } : {}
1018
+ });
1019
+ }
1020
+ }
1021
+ return items.length > 0 ? items : void 0;
1022
+ }
1023
+ function parseScoreRanges(rawRanges, rubricId, evaluatorName, evalId) {
1024
+ const ranges = [];
1025
+ for (const [index, rawRange] of rawRanges.entries()) {
1026
+ if (!isJsonObject2(rawRange)) {
1027
+ throw new Error(
1028
+ `Invalid score_range entry at index ${index} for rubric '${rubricId}' in evaluator '${evaluatorName}' in '${evalId}': expected object`
1029
+ );
1030
+ }
1031
+ const scoreRangeValue = rawRange.score_range;
1032
+ if (!Array.isArray(scoreRangeValue) || scoreRangeValue.length !== 2 || typeof scoreRangeValue[0] !== "number" || typeof scoreRangeValue[1] !== "number") {
1033
+ throw new Error(
1034
+ `Invalid score_range at index ${index} for rubric '${rubricId}' in evaluator '${evaluatorName}' in '${evalId}': must be [min, max] array of two numbers`
1035
+ );
1036
+ }
1037
+ const [min, max] = scoreRangeValue;
1038
+ if (!Number.isInteger(min) || !Number.isInteger(max)) {
1039
+ throw new Error(
1040
+ `Invalid score_range at index ${index} for rubric '${rubricId}' in evaluator '${evaluatorName}' in '${evalId}': values must be integers (got [${min}, ${max}])`
1041
+ );
1042
+ }
1043
+ if (min < 0 || min > 10 || max < 0 || max > 10) {
1044
+ throw new Error(
1045
+ `Invalid score_range at index ${index} for rubric '${rubricId}' in evaluator '${evaluatorName}' in '${evalId}': values must be 0-10 (got [${min}, ${max}])`
1046
+ );
1047
+ }
1048
+ if (min > max) {
1049
+ throw new Error(
1050
+ `Invalid score_range at index ${index} for rubric '${rubricId}' in evaluator '${evaluatorName}' in '${evalId}': min must be <= max (got [${min}, ${max}])`
1051
+ );
1052
+ }
1053
+ const expectedOutcome = asString(rawRange.expected_outcome) ?? asString(rawRange.description) ?? "";
1054
+ if (expectedOutcome.length === 0) {
1055
+ throw new Error(
1056
+ `Missing expected_outcome for score_range [${min}, ${max}] in rubric '${rubricId}' in evaluator '${evaluatorName}' in '${evalId}'`
1057
+ );
1058
+ }
1059
+ ranges.push({
1060
+ score_range: [min, max],
1061
+ expected_outcome: expectedOutcome
1062
+ });
1063
+ }
1064
+ const sortedRanges = [...ranges].sort((a, b) => a.score_range[0] - b.score_range[0]);
1065
+ for (let i = 1; i < sortedRanges.length; i++) {
1066
+ const prev = sortedRanges[i - 1];
1067
+ const curr = sortedRanges[i];
1068
+ if (curr.score_range[0] <= prev.score_range[1]) {
1069
+ throw new Error(
1070
+ `Overlapping score_ranges in rubric '${rubricId}' in evaluator '${evaluatorName}' in '${evalId}': [${prev.score_range[0]}, ${prev.score_range[1]}] overlaps with [${curr.score_range[0]}, ${curr.score_range[1]}]`
1071
+ );
1072
+ }
1073
+ }
1074
+ const covered = /* @__PURE__ */ new Set();
1075
+ for (const range of ranges) {
1076
+ for (let i = range.score_range[0]; i <= range.score_range[1]; i++) {
1077
+ covered.add(i);
1078
+ }
1079
+ }
1080
+ const missing = [];
1081
+ for (let i = 0; i <= 10; i++) {
1082
+ if (!covered.has(i)) {
1083
+ missing.push(i);
1084
+ }
1085
+ }
1086
+ if (missing.length > 0) {
1087
+ throw new Error(
1088
+ `Incomplete score_ranges coverage in rubric '${rubricId}' in evaluator '${evaluatorName}' in '${evalId}': missing coverage for scores: ${missing.join(", ")}. Ranges must cover all integers 0-10.`
1089
+ );
1090
+ }
1091
+ return ranges;
1092
+ }
1093
+ function parseInlineRubrics(rawRubrics) {
1094
+ const rubricItems = rawRubrics.filter((r) => isJsonObject2(r) || typeof r === "string").map((rubric, index) => {
1095
+ if (typeof rubric === "string") {
1096
+ return {
1097
+ id: `rubric-${index + 1}`,
1098
+ expected_outcome: rubric,
1099
+ weight: 1,
1100
+ required: true
1101
+ };
1102
+ }
1103
+ const expectedOutcome = asString(rubric.expected_outcome) ?? asString(rubric.description) ?? "";
1104
+ const rawScoreRanges = rubric.score_ranges;
1105
+ const scoreRanges = Array.isArray(rawScoreRanges) && rawScoreRanges.length > 0 ? rawScoreRanges.filter((r) => isJsonObject2(r)).map((range) => ({
1106
+ score_range: Array.isArray(range.score_range) ? range.score_range : [0, 10],
1107
+ expected_outcome: asString(range.expected_outcome) ?? asString(range.description) ?? ""
1108
+ })).filter((r) => r.expected_outcome.length > 0) : void 0;
1109
+ const baseRubric = {
1110
+ id: asString(rubric.id) ?? `rubric-${index + 1}`,
1111
+ weight: typeof rubric.weight === "number" ? rubric.weight : 1
1112
+ };
1113
+ if (scoreRanges && scoreRanges.length > 0) {
1114
+ return {
1115
+ ...baseRubric,
1116
+ ...expectedOutcome.length > 0 ? { expected_outcome: expectedOutcome } : {},
1117
+ ...typeof rubric.required === "boolean" ? { required: rubric.required } : {},
1118
+ ...typeof rubric.required_min_score === "number" ? { required_min_score: rubric.required_min_score } : {},
1119
+ score_ranges: scoreRanges
1120
+ };
1121
+ }
1122
+ return {
1123
+ ...baseRubric,
1124
+ expected_outcome: expectedOutcome,
1125
+ required: typeof rubric.required === "boolean" ? rubric.required : true,
1126
+ ...typeof rubric.required_min_score === "number" ? { required_min_score: rubric.required_min_score } : {}
1127
+ };
1128
+ }).filter(
1129
+ (r) => r.expected_outcome && r.expected_outcome.length > 0 || "score_ranges" in r && r.score_ranges
1130
+ );
1131
+ if (rubricItems.length === 0) {
1132
+ return void 0;
1133
+ }
1134
+ return {
1135
+ name: "rubric",
1136
+ type: "llm_judge",
1137
+ rubrics: rubricItems
1138
+ };
1139
+ }
928
1140
 
929
1141
  // src/evaluation/loaders/jsonl-parser.ts
930
1142
  import { readFile as readFile4 } from "node:fs/promises";
931
1143
  import path5 from "node:path";
1144
+ import micromatch2 from "micromatch";
932
1145
  import { parse as parseYaml } from "yaml";
933
1146
 
934
1147
  // src/evaluation/loaders/message-processor.ts
@@ -1191,6 +1404,65 @@ async function processExpectedMessages(options) {
1191
1404
  return segments;
1192
1405
  }
1193
1406
 
1407
+ // src/evaluation/loaders/shorthand-expansion.ts
1408
+ function expandInputShorthand(value) {
1409
+ if (value === void 0 || value === null) {
1410
+ return void 0;
1411
+ }
1412
+ if (typeof value === "string") {
1413
+ return [{ role: "user", content: value }];
1414
+ }
1415
+ if (Array.isArray(value)) {
1416
+ const messages = value.filter((msg) => isTestMessage(msg));
1417
+ return messages.length > 0 ? messages : void 0;
1418
+ }
1419
+ return void 0;
1420
+ }
1421
+ function expandExpectedOutputShorthand(value) {
1422
+ if (value === void 0 || value === null) {
1423
+ return void 0;
1424
+ }
1425
+ if (typeof value === "string") {
1426
+ return [{ role: "assistant", content: value }];
1427
+ }
1428
+ if (Array.isArray(value)) {
1429
+ if (value.length > 0 && isJsonObject(value[0]) && "role" in value[0]) {
1430
+ const messages = value.filter((msg) => isTestMessage(msg));
1431
+ return messages.length > 0 ? messages : void 0;
1432
+ }
1433
+ return [{ role: "assistant", content: value }];
1434
+ }
1435
+ if (isJsonObject(value)) {
1436
+ if ("role" in value) {
1437
+ return isTestMessage(value) ? [value] : void 0;
1438
+ }
1439
+ return [{ role: "assistant", content: value }];
1440
+ }
1441
+ return void 0;
1442
+ }
1443
+ function resolveInputMessages(raw) {
1444
+ if (raw.input_messages !== void 0) {
1445
+ if (Array.isArray(raw.input_messages)) {
1446
+ const messages = raw.input_messages.filter((msg) => isTestMessage(msg));
1447
+ return messages.length > 0 ? messages : void 0;
1448
+ }
1449
+ return void 0;
1450
+ }
1451
+ return expandInputShorthand(raw.input);
1452
+ }
1453
+ function resolveExpectedMessages(raw) {
1454
+ if (raw.expected_messages !== void 0) {
1455
+ if (Array.isArray(raw.expected_messages)) {
1456
+ const messages = raw.expected_messages.filter(
1457
+ (msg) => isTestMessage(msg)
1458
+ );
1459
+ return messages.length > 0 ? messages : void 0;
1460
+ }
1461
+ return void 0;
1462
+ }
1463
+ return expandExpectedOutputShorthand(raw.expected_output);
1464
+ }
1465
+
1194
1466
  // src/evaluation/loaders/jsonl-parser.ts
1195
1467
  var ANSI_YELLOW5 = "\x1B[33m";
1196
1468
  var ANSI_RED = "\x1B[31m";
@@ -1251,7 +1523,7 @@ function parseJsonlContent(content, filePath) {
1251
1523
  }
1252
1524
  async function loadEvalCasesFromJsonl(evalFilePath, repoRoot, options) {
1253
1525
  const verbose = options?.verbose ?? false;
1254
- const evalIdFilter = options?.evalId;
1526
+ const filterPattern = options?.filter;
1255
1527
  const absoluteTestPath = path5.resolve(evalFilePath);
1256
1528
  const repoRootPath = resolveToAbsolutePath(repoRoot);
1257
1529
  const searchRoots = buildSearchRoots2(absoluteTestPath, repoRootPath);
@@ -1278,28 +1550,20 @@ async function loadEvalCasesFromJsonl(evalFilePath, repoRoot, options) {
1278
1550
  const evalcase = rawCases[lineIndex];
1279
1551
  const lineNumber = lineIndex + 1;
1280
1552
  const id = asString4(evalcase.id);
1281
- if (evalIdFilter && id !== evalIdFilter) {
1553
+ if (filterPattern && (!id || !micromatch2.isMatch(id, filterPattern))) {
1282
1554
  continue;
1283
1555
  }
1284
1556
  const conversationId = asString4(evalcase.conversation_id);
1285
1557
  const outcome = asString4(evalcase.expected_outcome) ?? asString4(evalcase.outcome);
1286
- const inputMessagesValue = evalcase.input_messages;
1287
- const expectedMessagesValue = evalcase.expected_messages;
1288
- if (!id || !outcome || !Array.isArray(inputMessagesValue)) {
1558
+ const inputMessages = resolveInputMessages(evalcase);
1559
+ const expectedMessages = resolveExpectedMessages(evalcase) ?? [];
1560
+ if (!id || !outcome || !inputMessages || inputMessages.length === 0) {
1289
1561
  logError(
1290
- `Skipping incomplete eval case at line ${lineNumber}: ${id ?? "unknown"}. Missing required fields: id, expected_outcome, and/or input_messages`
1562
+ `Skipping incomplete eval case at line ${lineNumber}: ${id ?? "unknown"}. Missing required fields: id, expected_outcome, and/or input_messages (or input)`
1291
1563
  );
1292
1564
  continue;
1293
1565
  }
1294
- const hasExpectedMessages = Array.isArray(expectedMessagesValue) && expectedMessagesValue.length > 0;
1295
- const inputMessages = inputMessagesValue.filter(
1296
- (msg) => isTestMessage(msg)
1297
- );
1298
- const expectedMessages = hasExpectedMessages ? expectedMessagesValue.filter((msg) => isTestMessage(msg)) : [];
1299
- if (hasExpectedMessages && expectedMessages.length === 0) {
1300
- logError(`Line ${lineNumber}: No valid expected message found for eval case: ${id}`);
1301
- continue;
1302
- }
1566
+ const hasExpectedMessages = expectedMessages.length > 0;
1303
1567
  const guidelinePaths = [];
1304
1568
  const inputTextParts = [];
1305
1569
  const inputSegments = await processMessages({
@@ -1345,28 +1609,8 @@ async function loadEvalCasesFromJsonl(evalFilePath, repoRoot, options) {
1345
1609
  }
1346
1610
  const inlineRubrics = evalcase.rubrics;
1347
1611
  if (inlineRubrics !== void 0 && Array.isArray(inlineRubrics)) {
1348
- const rubricItems = inlineRubrics.filter((r) => isJsonObject(r) || typeof r === "string").map((rubric, index) => {
1349
- if (typeof rubric === "string") {
1350
- return {
1351
- id: `rubric-${index + 1}`,
1352
- description: rubric,
1353
- weight: 1,
1354
- required: true
1355
- };
1356
- }
1357
- return {
1358
- id: asString4(rubric.id) ?? `rubric-${index + 1}`,
1359
- description: asString4(rubric.description) ?? "",
1360
- weight: typeof rubric.weight === "number" ? rubric.weight : 1,
1361
- required: typeof rubric.required === "boolean" ? rubric.required : true
1362
- };
1363
- }).filter((r) => r.description.length > 0);
1364
- if (rubricItems.length > 0) {
1365
- const rubricEvaluator = {
1366
- name: "rubric",
1367
- type: "llm_judge",
1368
- rubrics: rubricItems
1369
- };
1612
+ const rubricEvaluator = parseInlineRubrics(inlineRubrics);
1613
+ if (rubricEvaluator) {
1370
1614
  evaluators = evaluators ? [rubricEvaluator, ...evaluators] : [rubricEvaluator];
1371
1615
  }
1372
1616
  }
@@ -1676,7 +1920,7 @@ async function loadEvalCases(evalFilePath, repoRoot, options) {
1676
1920
  return loadEvalCasesFromJsonl(evalFilePath, repoRoot, options);
1677
1921
  }
1678
1922
  const verbose = options?.verbose ?? false;
1679
- const evalIdFilter = options?.evalId;
1923
+ const filterPattern = options?.filter;
1680
1924
  const absoluteTestPath = path7.resolve(evalFilePath);
1681
1925
  const repoRootPath = resolveToAbsolutePath(repoRoot);
1682
1926
  const searchRoots = buildSearchRoots2(absoluteTestPath, repoRootPath);
@@ -1706,28 +1950,20 @@ async function loadEvalCases(evalFilePath, repoRoot, options) {
1706
1950
  }
1707
1951
  const evalcase = rawEvalcase;
1708
1952
  const id = asString6(evalcase.id);
1709
- if (evalIdFilter && id !== evalIdFilter) {
1953
+ if (filterPattern && (!id || !micromatch3.isMatch(id, filterPattern))) {
1710
1954
  continue;
1711
1955
  }
1712
1956
  const conversationId = asString6(evalcase.conversation_id);
1713
1957
  const outcome = asString6(evalcase.expected_outcome) ?? asString6(evalcase.outcome);
1714
- const inputMessagesValue = evalcase.input_messages;
1715
- const expectedMessagesValue = evalcase.expected_messages;
1716
- if (!id || !outcome || !Array.isArray(inputMessagesValue)) {
1958
+ const inputMessages = resolveInputMessages(evalcase);
1959
+ const expectedMessages = resolveExpectedMessages(evalcase) ?? [];
1960
+ if (!id || !outcome || !inputMessages || inputMessages.length === 0) {
1717
1961
  logError2(
1718
- `Skipping incomplete eval case: ${id ?? "unknown"}. Missing required fields: id, outcome, and/or input_messages`
1962
+ `Skipping incomplete eval case: ${id ?? "unknown"}. Missing required fields: id, outcome, and/or input_messages (or input)`
1719
1963
  );
1720
1964
  continue;
1721
1965
  }
1722
- const hasExpectedMessages = Array.isArray(expectedMessagesValue) && expectedMessagesValue.length > 0;
1723
- const inputMessages = inputMessagesValue.filter(
1724
- (msg) => isTestMessage(msg)
1725
- );
1726
- const expectedMessages = hasExpectedMessages ? expectedMessagesValue.filter((msg) => isTestMessage(msg)) : [];
1727
- if (hasExpectedMessages && expectedMessages.length === 0) {
1728
- logError2(`No valid expected message found for eval case: ${id}`);
1729
- continue;
1730
- }
1966
+ const hasExpectedMessages = expectedMessages.length > 0;
1731
1967
  const guidelinePaths = [];
1732
1968
  const inputTextParts = [];
1733
1969
  const inputSegments = await processMessages({
@@ -1771,28 +2007,8 @@ async function loadEvalCases(evalFilePath, repoRoot, options) {
1771
2007
  }
1772
2008
  const inlineRubrics = evalcase.rubrics;
1773
2009
  if (inlineRubrics !== void 0 && Array.isArray(inlineRubrics)) {
1774
- const rubricItems = inlineRubrics.filter((r) => isJsonObject(r) || typeof r === "string").map((rubric, index) => {
1775
- if (typeof rubric === "string") {
1776
- return {
1777
- id: `rubric-${index + 1}`,
1778
- description: rubric,
1779
- weight: 1,
1780
- required: true
1781
- };
1782
- }
1783
- return {
1784
- id: asString6(rubric.id) ?? `rubric-${index + 1}`,
1785
- description: asString6(rubric.description) ?? "",
1786
- weight: typeof rubric.weight === "number" ? rubric.weight : 1,
1787
- required: typeof rubric.required === "boolean" ? rubric.required : true
1788
- };
1789
- }).filter((r) => r.description.length > 0);
1790
- if (rubricItems.length > 0) {
1791
- const rubricEvaluator = {
1792
- name: "rubric",
1793
- type: "llm_judge",
1794
- rubrics: rubricItems
1795
- };
2010
+ const rubricEvaluator = parseInlineRubrics(inlineRubrics);
2011
+ if (rubricEvaluator) {
1796
2012
  evaluators = evaluators ? [rubricEvaluator, ...evaluators] : [rubricEvaluator];
1797
2013
  }
1798
2014
  }
@@ -3049,7 +3265,8 @@ var ToolCallSchema = z.object({
3049
3265
  input: z.unknown().optional(),
3050
3266
  output: z.unknown().optional(),
3051
3267
  id: z.string().optional(),
3052
- timestamp: z.string().optional()
3268
+ timestamp: z.string().optional(),
3269
+ duration_ms: z.number().optional()
3053
3270
  });
3054
3271
  var OutputMessageInputSchema = z.object({
3055
3272
  role: z.string(),
@@ -3057,6 +3274,7 @@ var OutputMessageInputSchema = z.object({
3057
3274
  content: z.unknown().optional(),
3058
3275
  tool_calls: z.array(ToolCallSchema).optional(),
3059
3276
  timestamp: z.string().optional(),
3277
+ duration_ms: z.number().optional(),
3060
3278
  metadata: z.record(z.unknown()).optional()
3061
3279
  });
3062
3280
  var TokenUsageSchema = z.object({
@@ -3095,8 +3313,16 @@ function convertOutputMessages(messages) {
3095
3313
  role: msg.role,
3096
3314
  name: msg.name,
3097
3315
  content: msg.content,
3098
- toolCalls: msg.tool_calls,
3316
+ toolCalls: msg.tool_calls?.map((tc) => ({
3317
+ tool: tc.tool,
3318
+ input: tc.input,
3319
+ output: tc.output,
3320
+ id: tc.id,
3321
+ timestamp: tc.timestamp,
3322
+ durationMs: tc.duration_ms
3323
+ })),
3099
3324
  timestamp: msg.timestamp,
3325
+ durationMs: msg.duration_ms,
3100
3326
  metadata: msg.metadata
3101
3327
  }));
3102
3328
  }
@@ -6173,6 +6399,15 @@ var rubricEvaluationSchema = z2.object({
6173
6399
  checks: z2.array(rubricCheckResultSchema).describe("Results for each rubric item"),
6174
6400
  overall_reasoning: z2.string().describe("Overall assessment summary (1-2 sentences)")
6175
6401
  });
6402
+ var scoreRangeCheckResultSchema = z2.object({
6403
+ id: z2.string().describe("The ID of the rubric criterion being scored"),
6404
+ score: z2.number().int().min(0).max(10).describe("Integer score 0-10 for this criterion"),
6405
+ reasoning: z2.string().describe("Brief explanation (1-2 sentences) for this score").optional()
6406
+ });
6407
+ var scoreRangeEvaluationSchema = z2.object({
6408
+ checks: z2.array(scoreRangeCheckResultSchema).describe("Scores for each rubric criterion"),
6409
+ overall_reasoning: z2.string().describe("Overall assessment summary (1-2 sentences)").optional()
6410
+ });
6176
6411
  var LlmJudgeEvaluator = class {
6177
6412
  kind = "llm_judge";
6178
6413
  resolveJudgeProvider;
@@ -6258,6 +6493,10 @@ var LlmJudgeEvaluator = class {
6258
6493
  `No rubrics found for evaluator "${context.evaluator?.name ?? "llm_judge"}". Run "agentv generate rubrics" first.`
6259
6494
  );
6260
6495
  }
6496
+ const hasScoreRanges = rubrics.some((r) => r.score_ranges && r.score_ranges.length > 0);
6497
+ if (hasScoreRanges) {
6498
+ return this.evaluateWithScoreRanges(context, judgeProvider, rubrics);
6499
+ }
6261
6500
  const prompt = this.buildRubricPrompt(context, rubrics);
6262
6501
  const systemPrompt = buildRubricOutputSchema();
6263
6502
  const evaluatorRawRequest = {
@@ -6283,6 +6522,84 @@ var LlmJudgeEvaluator = class {
6283
6522
  evaluatorRawRequest
6284
6523
  };
6285
6524
  }
6525
+ /**
6526
+ * Evaluate using score-range rubrics (analytic rubric scoring).
6527
+ * Each criterion is scored 0-10 and normalized to 0-1.
6528
+ */
6529
+ async evaluateWithScoreRanges(context, judgeProvider, rubrics) {
6530
+ const prompt = this.buildScoreRangePrompt(context, rubrics);
6531
+ const systemPrompt = buildScoreRangeOutputSchema();
6532
+ const evaluatorRawRequest = {
6533
+ userPrompt: prompt,
6534
+ systemPrompt,
6535
+ target: judgeProvider.targetName
6536
+ };
6537
+ const { data } = await this.runWithRetry({
6538
+ context,
6539
+ judgeProvider,
6540
+ systemPrompt,
6541
+ userPrompt: prompt,
6542
+ schema: scoreRangeEvaluationSchema
6543
+ });
6544
+ const { score, verdict, hits, misses, details } = calculateScoreRangeResult(data, rubrics);
6545
+ return {
6546
+ score,
6547
+ verdict,
6548
+ hits,
6549
+ misses,
6550
+ expectedAspectCount: rubrics.length,
6551
+ reasoning: data.overall_reasoning,
6552
+ evaluatorRawRequest,
6553
+ details
6554
+ };
6555
+ }
6556
+ /**
6557
+ * Build prompt for score-range rubric evaluation.
6558
+ */
6559
+ buildScoreRangePrompt(context, rubrics) {
6560
+ const formattedQuestion = context.promptInputs.question && context.promptInputs.question.trim().length > 0 ? context.promptInputs.question : context.evalCase.question;
6561
+ const parts = [
6562
+ "You are an expert evaluator. Score the candidate answer on each criterion below using the provided score ranges.",
6563
+ "For each criterion, output an integer score from 0 to 10 based on which score range best matches the answer.",
6564
+ "",
6565
+ "[[ ## question ## ]]",
6566
+ formattedQuestion,
6567
+ "",
6568
+ "[[ ## expected_outcome ## ]]",
6569
+ context.evalCase.expected_outcome,
6570
+ ""
6571
+ ];
6572
+ if (context.evalCase.reference_answer && context.evalCase.reference_answer.trim().length > 0) {
6573
+ parts.push("[[ ## reference_answer ## ]]", context.evalCase.reference_answer, "");
6574
+ }
6575
+ parts.push(
6576
+ "[[ ## candidate_answer ## ]]",
6577
+ context.candidate,
6578
+ "",
6579
+ "[[ ## scoring_criteria ## ]]"
6580
+ );
6581
+ for (const rubric of rubrics) {
6582
+ const weightLabel = rubric.weight !== 1 ? ` (weight: ${rubric.weight})` : "";
6583
+ const minScoreLabel = rubric.required_min_score !== void 0 ? ` [REQUIRED: min score ${rubric.required_min_score}]` : "";
6584
+ parts.push("", `### Criterion: ${rubric.id}${weightLabel}${minScoreLabel}`);
6585
+ if (rubric.expected_outcome) {
6586
+ parts.push(`Description: ${rubric.expected_outcome}`);
6587
+ }
6588
+ if (rubric.score_ranges && rubric.score_ranges.length > 0) {
6589
+ parts.push("Score ranges:");
6590
+ for (const range of rubric.score_ranges) {
6591
+ const [min, max] = range.score_range;
6592
+ const rangeLabel = min === max ? `${min}` : `${min}-${max}`;
6593
+ parts.push(` - Score ${rangeLabel}: ${range.expected_outcome}`);
6594
+ }
6595
+ }
6596
+ }
6597
+ parts.push(
6598
+ "",
6599
+ "For each criterion, provide an integer score 0-10 that matches one of its defined score ranges."
6600
+ );
6601
+ return parts.join("\n");
6602
+ }
6286
6603
  buildRubricPrompt(context, rubrics) {
6287
6604
  const formattedQuestion = context.promptInputs.question && context.promptInputs.question.trim().length > 0 ? context.promptInputs.question : context.evalCase.question;
6288
6605
  const parts = [
@@ -6302,7 +6619,7 @@ var LlmJudgeEvaluator = class {
6302
6619
  for (const rubric of rubrics) {
6303
6620
  const requiredLabel = rubric.required ? " (REQUIRED)" : "";
6304
6621
  const weightLabel = rubric.weight !== 1 ? ` (weight: ${rubric.weight})` : "";
6305
- parts.push(`- [${rubric.id}]${requiredLabel}${weightLabel}: ${rubric.description}`);
6622
+ parts.push(`- [${rubric.id}]${requiredLabel}${weightLabel}: ${rubric.expected_outcome}`);
6306
6623
  }
6307
6624
  parts.push("", "For each rubric, determine if it is satisfied and provide brief reasoning.");
6308
6625
  return parts.join("\n");
@@ -6389,9 +6706,9 @@ function calculateRubricScore(result, rubrics) {
6389
6706
  totalWeight += rubric.weight;
6390
6707
  if (check.satisfied) {
6391
6708
  earnedWeight += rubric.weight;
6392
- hits.push(`[${rubric.id}] ${rubric.description}: ${check.reasoning}`);
6709
+ hits.push(`[${rubric.id}] ${rubric.expected_outcome}: ${check.reasoning}`);
6393
6710
  } else {
6394
- misses.push(`[${rubric.id}] ${rubric.description}: ${check.reasoning}`);
6711
+ misses.push(`[${rubric.id}] ${rubric.expected_outcome}: ${check.reasoning}`);
6395
6712
  if (rubric.required) {
6396
6713
  failedRequired = true;
6397
6714
  }
@@ -6401,6 +6718,76 @@ function calculateRubricScore(result, rubrics) {
6401
6718
  const verdict = failedRequired ? "fail" : scoreToVerdict(score);
6402
6719
  return { score, verdict, hits, misses };
6403
6720
  }
6721
// buildScoreRangeOutputSchema(): takes no arguments and returns a fixed instruction string.
// The text tells the evaluator to score each criterion and to reply with a JSON object holding
// a "checks" array (criterion id, integer score 0-10, brief reasoning) plus an optional
// "overall_reasoning" summary. NOTE(review): the diff rendering collapses leading whitespace,
// so the indentation inside the template literal shown here may differ from the shipped file.
+ function buildScoreRangeOutputSchema() {
6722
+ return `You are an expert evaluator. Score the candidate answer on each criterion.
6723
+ You must return a valid JSON object matching this schema:
6724
+ {
6725
+ "checks": [
6726
+ {
6727
+ "id": "string (criterion id)",
6728
+ "score": integer (0-10),
6729
+ "reasoning": "string (brief explanation for score)"
6730
+ }
6731
+ ],
6732
+ "overall_reasoning": "string (summary, optional)"
6733
+ }
6734
+
6735
+ Important: The "score" must be an integer from 0 to 10 that falls within one of the defined score ranges for that criterion.`;
6736
+ }
6737
/**
 * Aggregates per-criterion 0-10 judge scores into one weighted result.
 *
 * Checks whose `id` matches no rubric are ignored. Each score is clamped to
 * [0, 10], divided by 10, and combined as a weighted average over
 * `rubric.weight`. A criterion counts as a hit when its clamped score is >= 7,
 * unless it falls below its required minimum, in which case it is a miss and
 * the overall verdict is forced to "fail".
 *
 * @param {{ checks: Array<{ id: string, score: number, reasoning?: string }> }} result
 *   Judge output; one entry per scored criterion.
 * @param {Array<object>} rubrics
 *   Rubric items. Fields read here: `id`, `weight`, `expected_outcome`,
 *   `required`, `required_min_score`, `score_ranges[].score_range`,
 *   `score_ranges[].expected_outcome`.
 * @returns {{ score: number, verdict: string, hits: string[], misses: string[], details: object }}
 *   `score` is in [0, 1] (0 when no check matched a rubric); `details.raw_scores`
 *   maps rubric id to the clamped 0-10 score.
 */
function calculateScoreRangeResult(result, rubrics) {
  const rubricMap = new Map(rubrics.map((rubric) => [rubric.id, rubric]));
  const hits = [];
  const misses = [];
  const rawScores = {};
  let totalWeight = 0;
  let weightedScoreSum = 0;
  let failedRequired = false;
  for (const check of result.checks) {
    const rubric = rubricMap.get(check.id);
    if (!rubric) {
      continue;
    }
    // A missing or non-numeric score would otherwise turn the whole aggregate
    // into NaN; count it as the lowest score instead.
    const numericScore = Number(check.score);
    const rawScore = Number.isNaN(numericScore) ? 0 : Math.max(0, Math.min(10, numericScore));
    const normalizedScore = rawScore / 10;
    rawScores[rubric.id] = rawScore;
    totalWeight += rubric.weight;
    weightedScoreSum += normalizedScore * rubric.weight;
    // An explicit minimum wins; a plain `required: true` demands a perfect 10.
    let requiredMinScore;
    if (rubric.required_min_score !== void 0) {
      requiredMinScore = rubric.required_min_score;
    } else if (rubric.required === true) {
      requiredMinScore = 10;
    }
    const matchingRange = rubric.score_ranges?.find(
      (r) => rawScore >= r.score_range[0] && rawScore <= r.score_range[1]
    );
    // Only mention the range when one matched, so the line never ends in "()".
    const rangeSuffix = matchingRange?.expected_outcome ? ` (${matchingRange.expected_outcome})` : "";
    const criterionLabel = rubric.expected_outcome ?? rubric.id;
    const reasoningText = check.reasoning ? `: ${check.reasoning}` : "";
    const scoreInfo = `[${rubric.id}] ${criterionLabel} - Score: ${rawScore}/10${rangeSuffix}${reasoningText}`;
    if (requiredMinScore !== void 0 && rawScore < requiredMinScore) {
      failedRequired = true;
      misses.push(scoreInfo);
    } else if (rawScore >= 7) {
      hits.push(scoreInfo);
    } else {
      misses.push(scoreInfo);
    }
  }
  const score = totalWeight > 0 ? Math.min(1, Math.max(0, weightedScoreSum / totalWeight)) : 0;
  // scoreToVerdict is defined elsewhere in this file; it maps the 0-1 score to a verdict.
  const verdict = failedRequired ? "fail" : scoreToVerdict(score);
  return {
    score,
    verdict,
    hits,
    misses,
    details: {
      raw_scores: rawScores,
      normalization: "score / 10",
      aggregation: "weighted_average"
    }
  };
}
6404
6791
 
6405
6792
  // src/evaluation/evaluators/composite.ts
6406
6793
  var DEFAULT_COMPOSITE_AGGREGATOR_PROMPT = `Review the following evaluation results:
@@ -7228,6 +7615,27 @@ function argsMatch(expected, actual) {
7228
7615
  }
7229
7616
  return true;
7230
7617
  }
7618
/**
 * Checks one tool call's measured duration against an optional latency budget.
 *
 * @param {string} toolName - Tool name; used only to build the message.
 * @param {number | undefined} maxDurationMs - Budget in milliseconds. When
 *   `undefined`, no latency assertion exists and the result is a silent skip
 *   (empty message).
 * @param {number | null | undefined} actualDurationMs - Measured duration in
 *   milliseconds. `undefined`, `null` and `NaN` are treated as "no data".
 * @returns {{ status: "pass" | "fail" | "skip", message: string }}
 *   "pass" when the duration is within the budget (inclusive), "fail" when it
 *   exceeds it, and "skip" with an explanatory message when no usable duration
 *   was recorded.
 */
function checkLatency(toolName, maxDurationMs, actualDurationMs) {
  if (maxDurationMs === void 0) {
    return { status: "skip", message: "" };
  }
  // `null <= max` coerces to `0 <= max` and would pass as "nullms", while NaN
  // would fail as "NaNms"; neither is a real measurement, so skip both.
  if (actualDurationMs == null || Number.isNaN(actualDurationMs)) {
    return {
      status: "skip",
      message: `No duration data for ${toolName}; latency assertion skipped`
    };
  }
  if (actualDurationMs <= maxDurationMs) {
    return {
      status: "pass",
      message: `${toolName} completed in ${actualDurationMs}ms (max: ${maxDurationMs}ms)`
    };
  }
  return {
    status: "fail",
    message: `${toolName} took ${actualDurationMs}ms (max: ${maxDurationMs}ms)`
  };
}
7231
7639
  var ToolTrajectoryEvaluator = class {
7232
7640
  kind = "tool_trajectory";
7233
7641
  config;
@@ -7286,7 +7694,8 @@ var ToolTrajectoryEvaluator = class {
7286
7694
  for (const call of message.toolCalls) {
7287
7695
  toolCalls.push({
7288
7696
  name: call.tool,
7289
- args: call.input
7697
+ args: call.input,
7698
+ durationMs: call.durationMs
7290
7699
  });
7291
7700
  }
7292
7701
  }
@@ -7354,17 +7763,27 @@ var ToolTrajectoryEvaluator = class {
7354
7763
  }
7355
7764
  const hits = [];
7356
7765
  const misses = [];
7766
+ const warnings = [];
7357
7767
  let actualIndex = 0;
7768
+ let sequenceHits = 0;
7769
+ let latencyHits = 0;
7770
+ let latencySkips = 0;
7771
+ const latencyAssertionCount = expected.filter(
7772
+ (item) => item.maxDurationMs !== void 0
7773
+ ).length;
7358
7774
  for (let i = 0; i < expected.length; i++) {
7359
7775
  const expectedItem = expected[i];
7360
7776
  const expectedTool = expectedItem.tool;
7361
7777
  let found = false;
7362
7778
  let argsMismatch = false;
7779
+ let matchedCall;
7363
7780
  while (actualIndex < toolCalls.length) {
7364
7781
  const actualCall = toolCalls[actualIndex];
7365
7782
  if (actualCall.name === expectedTool) {
7366
7783
  if (argsMatch(expectedItem.args, actualCall.args)) {
7367
7784
  hits.push(`Found ${expectedTool} at position ${actualIndex}`);
7785
+ sequenceHits++;
7786
+ matchedCall = actualCall;
7368
7787
  actualIndex++;
7369
7788
  found = true;
7370
7789
  break;
@@ -7381,14 +7800,35 @@ var ToolTrajectoryEvaluator = class {
7381
7800
  if (!found && !argsMismatch) {
7382
7801
  misses.push(`Expected ${expectedTool} at position ${i}, not found in remaining trace`);
7383
7802
  }
7803
+ if (found && matchedCall) {
7804
+ const latencyResult = checkLatency(
7805
+ expectedTool,
7806
+ expectedItem.maxDurationMs,
7807
+ matchedCall.durationMs
7808
+ );
7809
+ if (latencyResult.status === "pass") {
7810
+ hits.push(latencyResult.message);
7811
+ latencyHits++;
7812
+ } else if (latencyResult.status === "fail") {
7813
+ misses.push(latencyResult.message);
7814
+ } else if (latencyResult.message) {
7815
+ warnings.push(latencyResult.message);
7816
+ latencySkips++;
7817
+ }
7818
+ }
7384
7819
  }
7385
- const score = hits.length / expected.length;
7820
+ for (const warning of warnings) {
7821
+ console.warn(`[tool_trajectory] ${warning}`);
7822
+ }
7823
+ const effectiveLatencyAssertions = latencyAssertionCount - latencySkips;
7824
+ const totalAssertions = expected.length + effectiveLatencyAssertions;
7825
+ const score = totalAssertions > 0 ? (sequenceHits + latencyHits) / totalAssertions : 1;
7386
7826
  return {
7387
7827
  score,
7388
7828
  verdict: scoreToVerdict(score),
7389
7829
  hits,
7390
7830
  misses,
7391
- expectedAspectCount: expected.length
7831
+ expectedAspectCount: totalAssertions
7392
7832
  };
7393
7833
  }
7394
7834
  evaluateExact(toolCalls) {
@@ -7404,6 +7844,13 @@ var ToolTrajectoryEvaluator = class {
7404
7844
  }
7405
7845
  const hits = [];
7406
7846
  const misses = [];
7847
+ const warnings = [];
7848
+ let sequenceHits = 0;
7849
+ let latencyHits = 0;
7850
+ let latencySkips = 0;
7851
+ const latencyAssertionCount = expected.filter(
7852
+ (item) => item.maxDurationMs !== void 0
7853
+ ).length;
7407
7854
  if (toolCalls.length !== expected.length) {
7408
7855
  misses.push(`Expected ${expected.length} tool calls, got ${toolCalls.length}`);
7409
7856
  }
@@ -7413,26 +7860,50 @@ var ToolTrajectoryEvaluator = class {
7413
7860
  const expectedTool = expectedItem.tool;
7414
7861
  const actualCall = toolCalls[i];
7415
7862
  const actualTool = actualCall.name;
7863
+ let sequenceMatched = false;
7416
7864
  if (actualTool === expectedTool) {
7417
7865
  if (argsMatch(expectedItem.args, actualCall.args)) {
7418
7866
  hits.push(`Position ${i}: ${expectedTool}`);
7867
+ sequenceHits++;
7868
+ sequenceMatched = true;
7419
7869
  } else {
7420
7870
  misses.push(`Position ${i}: ${expectedTool} args mismatch`);
7421
7871
  }
7422
7872
  } else {
7423
7873
  misses.push(`Position ${i}: expected ${expectedTool}, got ${actualTool}`);
7424
7874
  }
7875
+ if (sequenceMatched) {
7876
+ const latencyResult = checkLatency(
7877
+ expectedTool,
7878
+ expectedItem.maxDurationMs,
7879
+ actualCall.durationMs
7880
+ );
7881
+ if (latencyResult.status === "pass") {
7882
+ hits.push(latencyResult.message);
7883
+ latencyHits++;
7884
+ } else if (latencyResult.status === "fail") {
7885
+ misses.push(latencyResult.message);
7886
+ } else if (latencyResult.message) {
7887
+ warnings.push(latencyResult.message);
7888
+ latencySkips++;
7889
+ }
7890
+ }
7425
7891
  }
7426
7892
  for (let i = checkLength; i < expected.length; i++) {
7427
7893
  misses.push(`Position ${i}: expected ${expected[i].tool}, got nothing`);
7428
7894
  }
7429
- const score = hits.length / expected.length;
7895
+ for (const warning of warnings) {
7896
+ console.warn(`[tool_trajectory] ${warning}`);
7897
+ }
7898
+ const effectiveLatencyAssertions = latencyAssertionCount - latencySkips;
7899
+ const totalAssertions = expected.length + effectiveLatencyAssertions;
7900
+ const score = totalAssertions > 0 ? (sequenceHits + latencyHits) / totalAssertions : 1;
7430
7901
  return {
7431
7902
  score,
7432
7903
  verdict: scoreToVerdict(score),
7433
7904
  hits,
7434
7905
  misses,
7435
- expectedAspectCount: expected.length
7906
+ expectedAspectCount: totalAssertions
7436
7907
  };
7437
7908
  }
7438
7909
  };
@@ -7440,6 +7911,7 @@ var ToolTrajectoryEvaluator = class {
7440
7911
  // src/evaluation/orchestrator.ts
7441
7912
  import { createHash } from "node:crypto";
7442
7913
  import path15 from "node:path";
7914
+ import micromatch4 from "micromatch";
7443
7915
 
7444
7916
  // ../../node_modules/.bun/yocto-queue@1.2.2/node_modules/yocto-queue/index.js
7445
7917
  var Node = class {
@@ -7598,17 +8070,17 @@ async function runEvaluation(options) {
7598
8070
  cache,
7599
8071
  useCache,
7600
8072
  now,
7601
- evalId,
8073
+ filter,
7602
8074
  verbose,
7603
8075
  evalCases: preloadedEvalCases,
7604
8076
  onResult,
7605
8077
  onProgress
7606
8078
  } = options;
7607
- const evalCases = preloadedEvalCases ?? await loadEvalCases(evalFilePath, repoRoot, { verbose, evalId });
7608
- const filteredEvalCases = filterEvalCases(evalCases, evalId);
8079
+ const evalCases = preloadedEvalCases ?? await loadEvalCases(evalFilePath, repoRoot, { verbose, filter });
8080
+ const filteredEvalCases = filterEvalCases(evalCases, filter);
7609
8081
  if (filteredEvalCases.length === 0) {
7610
- if (evalId) {
7611
- throw new Error(`Eval case with id '${evalId}' not found in ${evalFilePath}`);
8082
+ if (filter) {
8083
+ throw new Error(`No eval cases matched filter '${filter}' in ${evalFilePath}`);
7612
8084
  }
7613
8085
  return [];
7614
8086
  }
@@ -8184,7 +8656,10 @@ async function runEvaluatorList(options) {
8184
8656
  attempt,
8185
8657
  promptInputs,
8186
8658
  now,
8187
- judgeProvider
8659
+ judgeProvider,
8660
+ outputMessages,
8661
+ traceSummary,
8662
+ agentTimeoutMs
8188
8663
  });
8189
8664
  const weight = evaluator.weight ?? 1;
8190
8665
  scored.push({ score: score2, name: evaluator.name, type: evaluator.type, weight });
@@ -8519,9 +8994,22 @@ async function runLlmJudgeEvaluator(options) {
8519
8994
  attempt,
8520
8995
  promptInputs,
8521
8996
  now,
8522
- judgeProvider
8997
+ judgeProvider,
8998
+ outputMessages,
8999
+ traceSummary,
9000
+ agentTimeoutMs
8523
9001
  } = options;
8524
- const customPrompt = await resolveCustomPrompt(config);
9002
+ const customPrompt = await resolveCustomPrompt(
9003
+ config,
9004
+ {
9005
+ evalCase,
9006
+ candidate,
9007
+ outputMessages,
9008
+ traceSummary,
9009
+ config: config.config
9010
+ },
9011
+ agentTimeoutMs
9012
+ );
8525
9013
  return evaluatorRegistry.llm_judge.evaluate({
8526
9014
  evalCase,
8527
9015
  candidate,
@@ -8535,23 +9023,70 @@ async function runLlmJudgeEvaluator(options) {
8535
9023
  evaluator: config
8536
9024
  });
8537
9025
  }
8538
// resolveCustomPrompt — diff hunk; `-` lines are the previous version, `+` lines the new one.
// New behaviour (promptConfig, context, timeoutMs):
//   1. If promptConfig.resolvedPromptScript is a non-empty array, the prompt is produced by
//      executePromptTemplate(script, context, promptConfig.config, timeoutMs); a missing
//      context throws "Context required for executable prompt templates".
//   2. Otherwise, if promptConfig.resolvedPromptPath (or, failing that, promptConfig.promptPath)
//      is set, the file content is read with readTextFile and returned. A read failure is only
//      logged with console.warn and execution falls through to step 3.
//   3. Finally promptConfig.prompt is returned when it is a string; any other value yields
//      undefined.
// The previous version only read config.promptPath and returned config.prompt unchecked.
- async function resolveCustomPrompt(config) {
8539
- if (config.promptPath) {
9026
+ async function resolveCustomPrompt(promptConfig, context, timeoutMs) {
9027
+ if (promptConfig.resolvedPromptScript && promptConfig.resolvedPromptScript.length > 0) {
9028
+ if (!context) {
9029
+ throw new Error("Context required for executable prompt templates");
9030
+ }
9031
+ return executePromptTemplate(
9032
+ promptConfig.resolvedPromptScript,
9033
+ context,
9034
+ promptConfig.config,
9035
+ timeoutMs
9036
+ );
9037
+ }
9038
+ const promptPath = promptConfig.resolvedPromptPath ?? promptConfig.promptPath;
9039
+ if (promptPath) {
8540
9040
  try {
8541
- const content = await readTextFile(config.promptPath);
9041
+ const content = await readTextFile(promptPath);
8542
9042
  return content;
8543
9043
  } catch (error) {
8544
9044
  const message = error instanceof Error ? error.message : String(error);
8545
- console.warn(`Could not read custom prompt at ${config.promptPath}: ${message}`);
9045
+ console.warn(`Could not read custom prompt at ${promptPath}: ${message}`);
8546
9046
  }
8547
9047
  }
8548
- return config.prompt;
9048
+ const promptValue = promptConfig.prompt;
9049
+ if (typeof promptValue === "string") {
9050
+ return promptValue;
9051
+ }
9052
+ return void 0;
9053
+ }
9054
/**
 * Runs an executable prompt template and returns the prompt text it prints.
 *
 * The eval-case context is serialized to pretty-printed JSON (after passing
 * through `toSnakeCaseDeep`) and handed to `executeScript` together with the
 * timeout. The working directory is the directory of the last element of
 * `script`, which is taken to be the script path.
 *
 * @param {string[]} script - Command line; the last element is the script path.
 * @param {object} context - Provides `evalCase`, `candidate`, and optionally
 *   `outputMessages`, `traceSummary` and `config`.
 * @param {object | undefined} config - Evaluator config; falls back to
 *   `context.config`, then `null`.
 * @param {number | undefined} timeoutMs - Passed through to `executeScript`.
 * @returns {Promise<string>} The script's stdout, trimmed.
 * @throws {Error} "Prompt template execution failed: …" when the script fails
 *   or prints only whitespace; the underlying error is attached as `cause`.
 */
async function executePromptTemplate(script, context, config, timeoutMs) {
  const payload = {
    question: context.evalCase.question,
    expectedOutcome: context.evalCase.expected_outcome,
    expectedMessages: context.evalCase.expected_messages,
    referenceAnswer: context.evalCase.reference_answer,
    candidateAnswer: context.candidate,
    outputMessages: context.outputMessages ?? null,
    guidelineFiles: context.evalCase.guideline_paths,
    // Guideline files are reported separately, so exclude them from the inputs.
    inputFiles: context.evalCase.file_paths.filter(
      (p) => !context.evalCase.guideline_paths.includes(p)
    ),
    inputMessages: context.evalCase.input_messages,
    traceSummary: context.traceSummary ?? null,
    config: config ?? context.config ?? null
  };
  const inputJson = JSON.stringify(toSnakeCaseDeep(payload), null, 2);
  const scriptPath = script[script.length - 1];
  const cwd = path15.dirname(scriptPath);
  try {
    const stdout = await executeScript(script, inputJson, timeoutMs, cwd);
    const prompt = stdout.trim();
    if (!prompt) {
      throw new Error("Prompt template produced empty output");
    }
    return prompt;
  } catch (error) {
    const message = error instanceof Error ? error.message : String(error);
    // Keep the original error (and its stack) reachable for debugging.
    throw new Error(`Prompt template execution failed: ${message}`, { cause: error });
  }
}
8550
// filterEvalCases — diff hunk; `-` lines are the previous version, `+` lines the new one.
// New behaviour: returns evalCases unchanged when `filter` is falsy; otherwise keeps only the
// cases for which micromatch4.isMatch(evalCase.id, filter) is true. The previous version kept
// only the case whose id was strictly equal to `evalId`.
// NOTE(review): presumably `filter` is a glob pattern (micromatch semantics) — confirm with callers.
- function filterEvalCases(evalCases, evalId) {
8551
- if (!evalId) {
9085
+ function filterEvalCases(evalCases, filter) {
9086
+ if (!filter) {
8552
9087
  return evalCases;
8553
9088
  }
8554
- return evalCases.filter((evalCase) => evalCase.id === evalId);
9089
+ return evalCases.filter((evalCase) => micromatch4.isMatch(evalCase.id, filter));
8555
9090
  }
8556
9091
  function buildEvaluatorRegistry(overrides, resolveJudgeProvider) {
8557
9092
  const llmJudge = overrides?.llm_judge ?? new LlmJudgeEvaluator({
@@ -8709,7 +9244,7 @@ import { generateText as generateText4 } from "ai";
8709
9244
  import { z as z3 } from "zod";
8710
9245
  var rubricItemSchema = z3.object({
8711
9246
  id: z3.string().describe("Short identifier for this rubric (e.g., clarity, completeness)"),
8712
- description: z3.string().describe("What this rubric checks for"),
9247
+ expected_outcome: z3.string().describe("Concrete expected outcome for this rubric item"),
8713
9248
  weight: z3.number().default(1).describe("Relative importance (default 1.0)"),
8714
9249
  required: z3.boolean().default(true).describe("Whether this is a mandatory requirement")
8715
9250
  });
@@ -8729,7 +9264,7 @@ You must return a valid JSON object matching this schema:
8729
9264
  "rubrics": [
8730
9265
  {
8731
9266
  "id": "string (short identifier)",
8732
- "description": "string (what to check)",
9267
+ "expected_outcome": "string (concrete expected outcome for this rubric item)",
8733
9268
  "weight": number (default 1.0),
8734
9269
  "required": boolean (default true)
8735
9270
  }
@@ -8765,7 +9300,7 @@ function buildPrompt(expectedOutcome, question, referenceAnswer) {
8765
9300
  "Each rubric should:",
8766
9301
  "- Be specific and testable",
8767
9302
  "- Have a short, descriptive ID",
8768
- "- Include a clear description of what to check",
9303
+ "- Include a clear expected outcome statement (what a good answer must demonstrate for this rubric)",
8769
9304
  "- Indicate if it is required (mandatory) or optional",
8770
9305
  "- Have an appropriate weight (default 1.0, use higher values for more important aspects)",
8771
9306
  "",