@agentv/core 2.1.1 → 2.5.1

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
package/dist/index.cjs CHANGED
@@ -53,6 +53,7 @@ __export(index_exports, {
  createAgentKernel: () => createAgentKernel,
  createProvider: () => createProvider,
  deepEqual: () => deepEqual,
+ detectFormat: () => detectFormat,
  ensureVSCodeSubagents: () => ensureVSCodeSubagents,
  executeScript: () => executeScript,
  explorationRatio: () => explorationRatio,
@@ -226,9 +227,10 @@ function mergeExecutionMetrics(summary, metrics) {
  }
 
  // src/evaluation/yaml-parser.ts
- var import_promises6 = require("fs/promises");
- var import_node_path6 = __toESM(require("path"), 1);
- var import_yaml2 = require("yaml");
+ var import_promises7 = require("fs/promises");
+ var import_node_path7 = __toESM(require("path"), 1);
+ var import_micromatch3 = __toESM(require("micromatch"), 1);
+ var import_yaml3 = require("yaml");
 
  // src/evaluation/loaders/config-loader.ts
  var import_promises2 = require("fs/promises");
@@ -542,11 +544,11 @@ async function parseEvaluators(rawEvalCase, globalExecution, searchRoots, evalId
  );
  }
  }
- const knownProps = /* @__PURE__ */ new Set(["name", "type", "script", "cwd", "weight", "target"]);
- const config = {};
+ const knownProps2 = /* @__PURE__ */ new Set(["name", "type", "script", "cwd", "weight", "target"]);
+ const config2 = {};
  for (const [key, value] of Object.entries(rawEvaluator)) {
- if (!knownProps.has(key) && value !== void 0) {
- config[key] = value;
+ if (!knownProps2.has(key) && value !== void 0) {
+ config2[key] = value;
  }
  }
  evaluators.push({
@@ -556,7 +558,7 @@ async function parseEvaluators(rawEvalCase, globalExecution, searchRoots, evalId
  cwd,
  resolvedCwd,
  ...weight2 !== void 0 ? { weight: weight2 } : {},
- ...Object.keys(config).length > 0 ? { config } : {},
+ ...Object.keys(config2).length > 0 ? { config: config2 } : {},
  ...targetConfig !== void 0 ? { target: targetConfig } : {}
  });
  continue;
@@ -721,7 +723,7 @@ async function parseEvaluators(rawEvalCase, globalExecution, searchRoots, evalId
  continue;
  }
  const weight2 = validateWeight(rawEvaluator.weight, name, evalId);
- const config = {
+ const config2 = {
  name,
  type: "tool_trajectory",
  mode,
@@ -729,7 +731,7 @@ async function parseEvaluators(rawEvalCase, globalExecution, searchRoots, evalId
  ...expected ? { expected } : {},
  ...weight2 !== void 0 ? { weight: weight2 } : {}
  };
- evaluators.push(config);
+ evaluators.push(config2);
  continue;
  }
  if (typeValue === "field_accuracy") {
@@ -866,9 +868,33 @@ async function parseEvaluators(rawEvalCase, globalExecution, searchRoots, evalId
  });
  continue;
  }
- const prompt = asString(rawEvaluator.prompt);
+ const rawPrompt = rawEvaluator.prompt;
+ let prompt;
  let promptPath;
- if (prompt) {
+ let resolvedPromptScript;
+ let promptScriptConfig;
+ if (isJsonObject2(rawPrompt)) {
+ const scriptArray = asStringArray(
+ rawPrompt.script,
+ `prompt.script for evaluator '${name}' in '${evalId}'`
+ );
+ if (!scriptArray) {
+ throw new Error(`Evaluator '${name}' in '${evalId}': prompt object requires script array`);
+ }
+ const scriptPath = scriptArray[scriptArray.length - 1];
+ const resolved = await resolveFileReference(scriptPath, searchRoots);
+ if (resolved.resolvedPath) {
+ resolvedPromptScript = [...scriptArray.slice(0, -1), import_node_path3.default.resolve(resolved.resolvedPath)];
+ } else {
+ throw new Error(
+ `Evaluator '${name}' in '${evalId}': prompt script file not found: ${resolved.displayPath}`
+ );
+ }
+ if (isJsonObject2(rawPrompt.config)) {
+ promptScriptConfig = rawPrompt.config;
+ }
+ } else if (typeof rawPrompt === "string") {
+ prompt = rawPrompt;
  const resolved = await resolveFileReference(prompt, searchRoots);
  if (resolved.resolvedPath) {
  promptPath = import_node_path3.default.resolve(resolved.resolvedPath);
@@ -887,12 +913,7 @@ async function parseEvaluators(rawEvalCase, globalExecution, searchRoots, evalId
  }
  const _model = asString(rawEvaluator.model);
  const rawRubrics = rawEvaluator.rubrics;
- const parsedRubrics = Array.isArray(rawRubrics) ? rawRubrics.filter((r) => isJsonObject2(r)).map((rubric, index) => ({
- id: asString(rubric.id) ?? `rubric-${index + 1}`,
- description: asString(rubric.description) ?? "",
- weight: typeof rubric.weight === "number" ? rubric.weight : 1,
- required: typeof rubric.required === "boolean" ? rubric.required : true
- })).filter((r) => r.description.length > 0) : void 0;
+ const parsedRubrics = Array.isArray(rawRubrics) ? parseRubricItems(rawRubrics, name, evalId) : void 0;
  if (typeValue === "rubric") {
  if (!parsedRubrics) {
  logWarning2(`Skipping rubric evaluator '${name}' in '${evalId}': missing rubrics array`);
@@ -912,13 +933,26 @@ async function parseEvaluators(rawEvalCase, globalExecution, searchRoots, evalId
  continue;
  }
  const weight = validateWeight(rawEvaluator.weight, name, evalId);
+ const knownProps = /* @__PURE__ */ new Set(["name", "type", "prompt", "model", "rubrics", "weight", "config"]);
+ const config = {};
+ for (const [key, value] of Object.entries(rawEvaluator)) {
+ if (!knownProps.has(key) && value !== void 0) {
+ config[key] = value;
+ }
+ }
+ const topLevelConfig = isJsonObject2(rawEvaluator.config) ? rawEvaluator.config : {};
+ const mergedConfig = { ...config, ...topLevelConfig };
+ const finalConfig = promptScriptConfig ?? (Object.keys(mergedConfig).length > 0 ? mergedConfig : void 0);
  evaluators.push({
  name,
  type: "llm_judge",
  prompt,
  promptPath,
+ ...promptPath ? { resolvedPromptPath: promptPath } : {},
+ ...resolvedPromptScript ? { resolvedPromptScript } : {},
  ...parsedRubrics && parsedRubrics.length > 0 ? { rubrics: parsedRubrics } : {},
- ...weight !== void 0 ? { weight } : {}
+ ...weight !== void 0 ? { weight } : {},
+ ...finalConfig ? { config: finalConfig } : {}
  });
  }
  return evaluators.length > 0 ? evaluators : void 0;
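Note: the hunks above change how llm_judge evaluators are parsed. `prompt` may now be either a string (resolved as a file reference) or an object whose `script` array points at a judge script, and any unrecognized evaluator keys are merged with an explicit `config` block into the emitted evaluator. A minimal sketch of an entry that would take the new script path, under the assumption that field names follow the parser above; the script path and the extra key are hypothetical:

  // Hypothetical rawEvaluator value as parseEvaluators would receive it:
  const rawEvaluator = {
    name: "style-judge",
    type: "llm_judge",
    // Object form: the last element of `script` is resolved against the search roots.
    prompt: { script: ["python", "judges/style_judge.py"], config: { strictness: "high" } },
    weight: 2,
    temperature: 0.2 // unknown key: collected into the merged config (prompt.config takes precedence when present)
  };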
@@ -1005,6 +1039,191 @@ var VALID_FIELD_AGGREGATION_TYPES = /* @__PURE__ */ new Set(["weighted_average",
  function isValidFieldAggregationType(value) {
  return typeof value === "string" && VALID_FIELD_AGGREGATION_TYPES.has(value);
  }
+ function parseRubricItems(rawRubrics, evaluatorName, evalId) {
+ const items = [];
+ for (const [index, rawRubric] of rawRubrics.entries()) {
+ if (!isJsonObject2(rawRubric)) {
+ logWarning2(
+ `Skipping invalid rubric entry at index ${index} in evaluator '${evaluatorName}' (expected object)`
+ );
+ continue;
+ }
+ const id = asString(rawRubric.id) ?? `rubric-${index + 1}`;
+ const expectedOutcome = asString(rawRubric.expected_outcome) ?? asString(rawRubric.description) ?? "";
+ const weight = typeof rawRubric.weight === "number" ? rawRubric.weight : 1;
+ let requiredMinScore;
+ let required;
+ if (typeof rawRubric.required_min_score === "number") {
+ const minScore = rawRubric.required_min_score;
+ if (!Number.isInteger(minScore) || minScore < 0 || minScore > 10) {
+ throw new Error(
+ `Invalid required_min_score for rubric '${id}' in evaluator '${evaluatorName}' in '${evalId}': must be an integer 0-10 (got ${minScore})`
+ );
+ }
+ requiredMinScore = minScore;
+ }
+ if (typeof rawRubric.required === "boolean") {
+ required = rawRubric.required;
+ }
+ let scoreRanges;
+ const rawScoreRanges = rawRubric.score_ranges;
+ if (rawScoreRanges !== void 0) {
+ if (!Array.isArray(rawScoreRanges)) {
+ throw new Error(
+ `Invalid score_ranges for rubric '${id}' in evaluator '${evaluatorName}' in '${evalId}': must be an array`
+ );
+ }
+ scoreRanges = parseScoreRanges(rawScoreRanges, id, evaluatorName, evalId);
+ items.push({
+ id,
+ weight,
+ ...expectedOutcome.length > 0 ? { expected_outcome: expectedOutcome } : {},
+ ...required !== void 0 ? { required } : {},
+ ...requiredMinScore !== void 0 ? { required_min_score: requiredMinScore } : {},
+ score_ranges: scoreRanges
+ });
+ } else {
+ if (expectedOutcome.length === 0) {
+ logWarning2(
+ `Skipping rubric '${id}' in evaluator '${evaluatorName}' in '${evalId}': missing expected_outcome`
+ );
+ continue;
+ }
+ items.push({
+ id,
+ expected_outcome: expectedOutcome,
+ weight,
+ // Default to required: true if not specified (backward compatibility)
+ required: required ?? true,
+ ...requiredMinScore !== void 0 ? { required_min_score: requiredMinScore } : {}
+ });
+ }
+ }
+ return items.length > 0 ? items : void 0;
+ }
+ function parseScoreRanges(rawRanges, rubricId, evaluatorName, evalId) {
+ const ranges = [];
+ for (const [index, rawRange] of rawRanges.entries()) {
+ if (!isJsonObject2(rawRange)) {
+ throw new Error(
+ `Invalid score_range entry at index ${index} for rubric '${rubricId}' in evaluator '${evaluatorName}' in '${evalId}': expected object`
+ );
+ }
+ const scoreRangeValue = rawRange.score_range;
+ if (!Array.isArray(scoreRangeValue) || scoreRangeValue.length !== 2 || typeof scoreRangeValue[0] !== "number" || typeof scoreRangeValue[1] !== "number") {
+ throw new Error(
+ `Invalid score_range at index ${index} for rubric '${rubricId}' in evaluator '${evaluatorName}' in '${evalId}': must be [min, max] array of two numbers`
+ );
+ }
+ const [min, max] = scoreRangeValue;
+ if (!Number.isInteger(min) || !Number.isInteger(max)) {
+ throw new Error(
+ `Invalid score_range at index ${index} for rubric '${rubricId}' in evaluator '${evaluatorName}' in '${evalId}': values must be integers (got [${min}, ${max}])`
+ );
+ }
+ if (min < 0 || min > 10 || max < 0 || max > 10) {
+ throw new Error(
+ `Invalid score_range at index ${index} for rubric '${rubricId}' in evaluator '${evaluatorName}' in '${evalId}': values must be 0-10 (got [${min}, ${max}])`
+ );
+ }
+ if (min > max) {
+ throw new Error(
+ `Invalid score_range at index ${index} for rubric '${rubricId}' in evaluator '${evaluatorName}' in '${evalId}': min must be <= max (got [${min}, ${max}])`
+ );
+ }
+ const expectedOutcome = asString(rawRange.expected_outcome) ?? asString(rawRange.description) ?? "";
+ if (expectedOutcome.length === 0) {
+ throw new Error(
+ `Missing expected_outcome for score_range [${min}, ${max}] in rubric '${rubricId}' in evaluator '${evaluatorName}' in '${evalId}'`
+ );
+ }
+ ranges.push({
+ score_range: [min, max],
+ expected_outcome: expectedOutcome
+ });
+ }
+ const sortedRanges = [...ranges].sort((a, b) => a.score_range[0] - b.score_range[0]);
+ for (let i = 1; i < sortedRanges.length; i++) {
+ const prev = sortedRanges[i - 1];
+ const curr = sortedRanges[i];
+ if (curr.score_range[0] <= prev.score_range[1]) {
+ throw new Error(
+ `Overlapping score_ranges in rubric '${rubricId}' in evaluator '${evaluatorName}' in '${evalId}': [${prev.score_range[0]}, ${prev.score_range[1]}] overlaps with [${curr.score_range[0]}, ${curr.score_range[1]}]`
+ );
+ }
+ }
+ const covered = /* @__PURE__ */ new Set();
+ for (const range of ranges) {
+ for (let i = range.score_range[0]; i <= range.score_range[1]; i++) {
+ covered.add(i);
+ }
+ }
+ const missing = [];
+ for (let i = 0; i <= 10; i++) {
+ if (!covered.has(i)) {
+ missing.push(i);
+ }
+ }
+ if (missing.length > 0) {
+ throw new Error(
+ `Incomplete score_ranges coverage in rubric '${rubricId}' in evaluator '${evaluatorName}' in '${evalId}': missing coverage for scores: ${missing.join(", ")}. Ranges must cover all integers 0-10.`
+ );
+ }
+ return ranges;
+ }
+ function parseInlineRubrics(rawRubrics) {
+ const rubricItems = rawRubrics.filter((r) => isJsonObject2(r) || typeof r === "string").map((rubric, index) => {
+ if (typeof rubric === "string") {
+ return {
+ id: `rubric-${index + 1}`,
+ expected_outcome: rubric,
+ weight: 1,
+ required: true
+ };
+ }
+ const expectedOutcome = asString(rubric.expected_outcome) ?? asString(rubric.description) ?? "";
+ const rawScoreRanges = rubric.score_ranges;
+ const scoreRanges = Array.isArray(rawScoreRanges) && rawScoreRanges.length > 0 ? rawScoreRanges.filter((r) => isJsonObject2(r)).map((range) => ({
+ score_range: Array.isArray(range.score_range) ? range.score_range : [0, 10],
+ expected_outcome: asString(range.expected_outcome) ?? asString(range.description) ?? ""
+ })).filter((r) => r.expected_outcome.length > 0) : void 0;
+ const baseRubric = {
+ id: asString(rubric.id) ?? `rubric-${index + 1}`,
+ weight: typeof rubric.weight === "number" ? rubric.weight : 1
+ };
+ if (scoreRanges && scoreRanges.length > 0) {
+ return {
+ ...baseRubric,
+ ...expectedOutcome.length > 0 ? { expected_outcome: expectedOutcome } : {},
+ ...typeof rubric.required === "boolean" ? { required: rubric.required } : {},
+ ...typeof rubric.required_min_score === "number" ? { required_min_score: rubric.required_min_score } : {},
+ score_ranges: scoreRanges
+ };
+ }
+ return {
+ ...baseRubric,
+ expected_outcome: expectedOutcome,
+ required: typeof rubric.required === "boolean" ? rubric.required : true,
+ ...typeof rubric.required_min_score === "number" ? { required_min_score: rubric.required_min_score } : {}
+ };
+ }).filter(
+ (r) => r.expected_outcome && r.expected_outcome.length > 0 || "score_ranges" in r && r.score_ranges
+ );
+ if (rubricItems.length === 0) {
+ return void 0;
+ }
+ return {
+ name: "rubric",
+ type: "llm_judge",
+ rubrics: rubricItems
+ };
+ }
+
+ // src/evaluation/loaders/jsonl-parser.ts
+ var import_promises5 = require("fs/promises");
+ var import_node_path5 = __toESM(require("path"), 1);
+ var import_micromatch2 = __toESM(require("micromatch"), 1);
+ var import_yaml2 = require("yaml");
 
  // src/evaluation/loaders/message-processor.ts
  var import_promises4 = require("fs/promises");
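The newly added parseRubricItems/parseScoreRanges enforce that score_ranges, when given, are arrays of integer [min, max] pairs within 0-10 that neither overlap nor leave any score uncovered, while plain rubrics fall back to expected_outcome with required defaulting to true. A hypothetical rubrics array that would pass that validation (ids and wording are illustrative only, not taken from the package):

  const rubrics = [
    {
      id: "accuracy",
      weight: 2,
      required_min_score: 7,
      // Three ranges: non-overlapping and together covering every integer 0-10.
      score_ranges: [
        { score_range: [0, 3], expected_outcome: "Answer is mostly incorrect" },
        { score_range: [4, 7], expected_outcome: "Answer is partially correct" },
        { score_range: [8, 10], expected_outcome: "Answer is complete and correct" }
      ]
    },
    // Plain rubric: no score_ranges, so required defaults to true.
    { id: "tone", expected_outcome: "Response stays professional", weight: 1 }
  ];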
@@ -1266,28 +1485,302 @@ async function processExpectedMessages(options) {
  return segments;
  }
 
- // src/evaluation/formatting/prompt-builder.ts
- var import_promises5 = require("fs/promises");
- var import_node_path5 = __toESM(require("path"), 1);
+ // src/evaluation/loaders/shorthand-expansion.ts
+ function expandInputShorthand(value) {
+ if (value === void 0 || value === null) {
+ return void 0;
+ }
+ if (typeof value === "string") {
+ return [{ role: "user", content: value }];
+ }
+ if (Array.isArray(value)) {
+ const messages = value.filter((msg) => isTestMessage(msg));
+ return messages.length > 0 ? messages : void 0;
+ }
+ return void 0;
+ }
+ function expandExpectedOutputShorthand(value) {
+ if (value === void 0 || value === null) {
+ return void 0;
+ }
+ if (typeof value === "string") {
+ return [{ role: "assistant", content: value }];
+ }
+ if (Array.isArray(value)) {
+ if (value.length > 0 && isJsonObject(value[0]) && "role" in value[0]) {
+ const messages = value.filter((msg) => isTestMessage(msg));
+ return messages.length > 0 ? messages : void 0;
+ }
+ return [{ role: "assistant", content: value }];
+ }
+ if (isJsonObject(value)) {
+ if ("role" in value) {
+ return isTestMessage(value) ? [value] : void 0;
+ }
+ return [{ role: "assistant", content: value }];
+ }
+ return void 0;
+ }
+ function resolveInputMessages(raw) {
+ if (raw.input_messages !== void 0) {
+ if (Array.isArray(raw.input_messages)) {
+ const messages = raw.input_messages.filter((msg) => isTestMessage(msg));
+ return messages.length > 0 ? messages : void 0;
+ }
+ return void 0;
+ }
+ return expandInputShorthand(raw.input);
+ }
+ function resolveExpectedMessages(raw) {
+ if (raw.expected_messages !== void 0) {
+ if (Array.isArray(raw.expected_messages)) {
+ const messages = raw.expected_messages.filter(
+ (msg) => isTestMessage(msg)
+ );
+ return messages.length > 0 ? messages : void 0;
+ }
+ return void 0;
+ }
+ return expandExpectedOutputShorthand(raw.expected_output);
+ }
+
+ // src/evaluation/loaders/jsonl-parser.ts
  var ANSI_YELLOW5 = "\x1B[33m";
+ var ANSI_RED = "\x1B[31m";
  var ANSI_RESET5 = "\x1B[0m";
+ function detectFormat(filePath) {
+ const ext = import_node_path5.default.extname(filePath).toLowerCase();
+ if (ext === ".jsonl") return "jsonl";
+ if (ext === ".yaml" || ext === ".yml") return "yaml";
+ throw new Error(`Unsupported file format: '${ext}'. Supported formats: .yaml, .yml, .jsonl`);
+ }
+ async function loadSidecarMetadata(jsonlPath, verbose) {
+ const dir = import_node_path5.default.dirname(jsonlPath);
+ const base = import_node_path5.default.basename(jsonlPath, ".jsonl");
+ const sidecarPath = import_node_path5.default.join(dir, `${base}.yaml`);
+ if (!await fileExists(sidecarPath)) {
+ if (verbose) {
+ logWarning4(`Sidecar metadata file not found: ${sidecarPath} (using defaults)`);
+ }
+ return {};
+ }
+ try {
+ const content = await (0, import_promises5.readFile)(sidecarPath, "utf8");
+ const parsed = (0, import_yaml2.parse)(content);
+ if (!isJsonObject(parsed)) {
+ logWarning4(`Invalid sidecar metadata format in ${sidecarPath}`);
+ return {};
+ }
+ return {
+ description: asString4(parsed.description),
+ dataset: asString4(parsed.dataset),
+ execution: isJsonObject(parsed.execution) ? parsed.execution : void 0,
+ evaluator: parsed.evaluator
+ };
+ } catch (error) {
+ logWarning4(`Could not read sidecar metadata from ${sidecarPath}: ${error.message}`);
+ return {};
+ }
+ }
+ function parseJsonlContent(content, filePath) {
+ const lines = content.split("\n");
+ const cases = [];
+ for (let i = 0; i < lines.length; i++) {
+ const line = lines[i].trim();
+ if (line === "") continue;
+ try {
+ const parsed = JSON.parse(line);
+ if (!isJsonObject(parsed)) {
+ throw new Error("Expected JSON object");
+ }
+ cases.push(parsed);
+ } catch (error) {
+ const message = error instanceof Error ? error.message : String(error);
+ throw new Error(`Line ${i + 1}: Invalid JSON - ${message}
+ File: ${filePath}`);
+ }
+ }
+ return cases;
+ }
+ async function loadEvalCasesFromJsonl(evalFilePath, repoRoot, options) {
+ const verbose = options?.verbose ?? false;
+ const filterPattern = options?.filter;
+ const absoluteTestPath = import_node_path5.default.resolve(evalFilePath);
+ const repoRootPath = resolveToAbsolutePath(repoRoot);
+ const searchRoots = buildSearchRoots(absoluteTestPath, repoRootPath);
+ const config = await loadConfig(absoluteTestPath, repoRootPath);
+ const guidelinePatterns = config?.guideline_patterns;
+ const sidecar = await loadSidecarMetadata(absoluteTestPath, verbose);
+ const rawFile = await (0, import_promises5.readFile)(absoluteTestPath, "utf8");
+ const rawCases = parseJsonlContent(rawFile, evalFilePath);
+ const fallbackDataset = import_node_path5.default.basename(absoluteTestPath, ".jsonl") || "eval";
+ const datasetName = sidecar.dataset && sidecar.dataset.trim().length > 0 ? sidecar.dataset : fallbackDataset;
+ const globalEvaluator = coerceEvaluator(sidecar.evaluator, "sidecar") ?? "llm_judge";
+ const globalExecution = sidecar.execution;
+ if (verbose) {
+ console.log(`
+ [JSONL Dataset: ${evalFilePath}]`);
+ console.log(` Cases: ${rawCases.length}`);
+ console.log(` Dataset name: ${datasetName}`);
+ if (sidecar.description) {
+ console.log(` Description: ${sidecar.description}`);
+ }
+ }
+ const results = [];
+ for (let lineIndex = 0; lineIndex < rawCases.length; lineIndex++) {
+ const evalcase = rawCases[lineIndex];
+ const lineNumber = lineIndex + 1;
+ const id = asString4(evalcase.id);
+ if (filterPattern && (!id || !import_micromatch2.default.isMatch(id, filterPattern))) {
+ continue;
+ }
+ const conversationId = asString4(evalcase.conversation_id);
+ const outcome = asString4(evalcase.expected_outcome) ?? asString4(evalcase.outcome);
+ const inputMessages = resolveInputMessages(evalcase);
+ const expectedMessages = resolveExpectedMessages(evalcase) ?? [];
+ if (!id || !outcome || !inputMessages || inputMessages.length === 0) {
+ logError(
+ `Skipping incomplete eval case at line ${lineNumber}: ${id ?? "unknown"}. Missing required fields: id, expected_outcome, and/or input_messages (or input)`
+ );
+ continue;
+ }
+ const hasExpectedMessages = expectedMessages.length > 0;
+ const guidelinePaths = [];
+ const inputTextParts = [];
+ const inputSegments = await processMessages({
+ messages: inputMessages,
+ searchRoots,
+ repoRootPath,
+ guidelinePatterns,
+ guidelinePaths,
+ textParts: inputTextParts,
+ messageType: "input",
+ verbose
+ });
+ const outputSegments = hasExpectedMessages ? await processExpectedMessages({
+ messages: expectedMessages,
+ searchRoots,
+ repoRootPath,
+ verbose
+ }) : [];
+ let referenceAnswer = "";
+ if (outputSegments.length > 0) {
+ const lastMessage = outputSegments[outputSegments.length - 1];
+ const content = lastMessage.content;
+ const toolCalls = lastMessage.tool_calls;
+ if (typeof content === "string") {
+ referenceAnswer = content;
+ } else if (content !== void 0 && content !== null) {
+ referenceAnswer = JSON.stringify(content, null, 2);
+ } else if (toolCalls !== void 0 && toolCalls !== null) {
+ referenceAnswer = JSON.stringify(toolCalls, null, 2);
+ }
+ }
+ const question = inputTextParts.map((part) => part.trim()).filter((part) => part.length > 0).join(" ");
+ const caseExecution = isJsonObject(evalcase.execution) ? evalcase.execution : void 0;
+ const mergedExecution = caseExecution ?? globalExecution;
+ const evalCaseEvaluatorKind = coerceEvaluator(evalcase.evaluator, id) ?? globalEvaluator;
+ let evaluators;
+ try {
+ evaluators = await parseEvaluators(evalcase, mergedExecution, searchRoots, id ?? "unknown");
+ } catch (error) {
+ const message = error instanceof Error ? error.message : String(error);
+ logError(`Skipping eval case '${id}' at line ${lineNumber}: ${message}`);
+ continue;
+ }
+ const inlineRubrics = evalcase.rubrics;
+ if (inlineRubrics !== void 0 && Array.isArray(inlineRubrics)) {
+ const rubricEvaluator = parseInlineRubrics(inlineRubrics);
+ if (rubricEvaluator) {
+ evaluators = evaluators ? [rubricEvaluator, ...evaluators] : [rubricEvaluator];
+ }
+ }
+ const userFilePaths = [];
+ for (const segment of inputSegments) {
+ if (segment.type === "file" && typeof segment.resolvedPath === "string") {
+ userFilePaths.push(segment.resolvedPath);
+ }
+ }
+ const allFilePaths = [
+ ...guidelinePaths.map((guidelinePath) => import_node_path5.default.resolve(guidelinePath)),
+ ...userFilePaths
+ ];
+ const testCase = {
+ id,
+ dataset: datasetName,
+ conversation_id: conversationId,
+ question,
+ input_messages: inputMessages,
+ input_segments: inputSegments,
+ expected_messages: outputSegments,
+ reference_answer: referenceAnswer,
+ guideline_paths: guidelinePaths.map((guidelinePath) => import_node_path5.default.resolve(guidelinePath)),
+ guideline_patterns: guidelinePatterns,
+ file_paths: allFilePaths,
+ expected_outcome: outcome,
+ evaluator: evalCaseEvaluatorKind,
+ evaluators
+ };
+ if (verbose) {
+ console.log(`
+ [Eval Case: ${id}]`);
+ if (testCase.guideline_paths.length > 0) {
+ console.log(` Guidelines used: ${testCase.guideline_paths.length}`);
+ for (const guidelinePath of testCase.guideline_paths) {
+ console.log(` - ${guidelinePath}`);
+ }
+ } else {
+ console.log(" No guidelines found");
+ }
+ }
+ results.push(testCase);
+ }
+ return results;
+ }
+ function asString4(value) {
+ return typeof value === "string" ? value : void 0;
+ }
+ function logWarning4(message, details) {
+ if (details && details.length > 0) {
+ const detailBlock = details.join("\n");
+ console.warn(`${ANSI_YELLOW5}Warning: ${message}
+ ${detailBlock}${ANSI_RESET5}`);
+ } else {
+ console.warn(`${ANSI_YELLOW5}Warning: ${message}${ANSI_RESET5}`);
+ }
+ }
+ function logError(message, details) {
+ if (details && details.length > 0) {
+ const detailBlock = details.join("\n");
+ console.error(`${ANSI_RED}Error: ${message}
+ ${detailBlock}${ANSI_RESET5}`);
+ } else {
+ console.error(`${ANSI_RED}Error: ${message}${ANSI_RESET5}`);
+ }
+ }
+
+ // src/evaluation/formatting/prompt-builder.ts
+ var import_promises6 = require("fs/promises");
+ var import_node_path6 = __toESM(require("path"), 1);
+ var ANSI_YELLOW6 = "\x1B[33m";
+ var ANSI_RESET6 = "\x1B[0m";
  async function buildPromptInputs(testCase, mode = "lm") {
  const guidelineParts = [];
  for (const rawPath of testCase.guideline_paths) {
- const absolutePath = import_node_path5.default.resolve(rawPath);
+ const absolutePath = import_node_path6.default.resolve(rawPath);
  if (!await fileExists(absolutePath)) {
- logWarning4(`Could not read guideline file ${absolutePath}: file does not exist`);
+ logWarning5(`Could not read guideline file ${absolutePath}: file does not exist`);
  continue;
  }
  try {
- const content = (await (0, import_promises5.readFile)(absolutePath, "utf8")).replace(/\r\n/g, "\n").trim();
+ const content = (await (0, import_promises6.readFile)(absolutePath, "utf8")).replace(/\r\n/g, "\n").trim();
  guidelineParts.push({
  content,
  isFile: true,
- displayPath: import_node_path5.default.basename(absolutePath)
+ displayPath: import_node_path6.default.basename(absolutePath)
  });
  } catch (error) {
- logWarning4(`Could not read guideline file ${absolutePath}: ${error.message}`);
+ logWarning5(`Could not read guideline file ${absolutePath}: ${error.message}`);
  }
  }
  const guidelines = formatFileContents(guidelineParts);
1311
1804
  messageSegments.push({ type: "text", value: segment });
1312
1805
  }
1313
1806
  } else if (isJsonObject(segment)) {
1314
- const type = asString4(segment.type);
1807
+ const type = asString5(segment.type);
1315
1808
  if (type === "file") {
1316
- const value = asString4(segment.value);
1809
+ const value = asString5(segment.value);
1317
1810
  if (!value) continue;
1318
1811
  if (testCase.guideline_patterns && isGuidelineFile(value, testCase.guideline_patterns)) {
1319
1812
  messageSegments.push({ type: "guideline_ref", path: value });
@@ -1324,7 +1817,7 @@ async function buildPromptInputs(testCase, mode = "lm") {
1324
1817
  messageSegments.push({ type: "file", text: fileText, path: value });
1325
1818
  }
1326
1819
  } else if (type === "text") {
1327
- const textValue = asString4(segment.value);
1820
+ const textValue = asString5(segment.value);
1328
1821
  if (textValue && textValue.trim().length > 0) {
1329
1822
  messageSegments.push({ type: "text", value: textValue });
1330
1823
  }
@@ -1478,22 +1971,22 @@ ${guidelineContent.trim()}`);
1478
1971
  }
1479
1972
  return chatPrompt.length > 0 ? chatPrompt : void 0;
1480
1973
  }
1481
- function asString4(value) {
1974
+ function asString5(value) {
1482
1975
  return typeof value === "string" ? value : void 0;
1483
1976
  }
1484
- function logWarning4(message) {
1485
- console.warn(`${ANSI_YELLOW5}Warning: ${message}${ANSI_RESET5}`);
1977
+ function logWarning5(message) {
1978
+ console.warn(`${ANSI_YELLOW6}Warning: ${message}${ANSI_RESET6}`);
1486
1979
  }
1487
1980
 
1488
1981
  // src/evaluation/yaml-parser.ts
1489
- var ANSI_YELLOW6 = "\x1B[33m";
1490
- var ANSI_RED = "\x1B[31m";
1491
- var ANSI_RESET6 = "\x1B[0m";
1982
+ var ANSI_YELLOW7 = "\x1B[33m";
1983
+ var ANSI_RED2 = "\x1B[31m";
1984
+ var ANSI_RESET7 = "\x1B[0m";
1492
1985
  async function readTestSuiteMetadata(testFilePath) {
1493
1986
  try {
1494
- const absolutePath = import_node_path6.default.resolve(testFilePath);
1495
- const content = await (0, import_promises6.readFile)(absolutePath, "utf8");
1496
- const parsed = (0, import_yaml2.parse)(content);
1987
+ const absolutePath = import_node_path7.default.resolve(testFilePath);
1988
+ const content = await (0, import_promises7.readFile)(absolutePath, "utf8");
1989
+ const parsed = (0, import_yaml3.parse)(content);
1497
1990
  if (!isJsonObject(parsed)) {
1498
1991
  return {};
1499
1992
  }
@@ -1503,21 +1996,25 @@ async function readTestSuiteMetadata(testFilePath) {
1503
1996
  }
1504
1997
  }
1505
1998
  async function loadEvalCases(evalFilePath, repoRoot, options) {
1999
+ const format = detectFormat(evalFilePath);
2000
+ if (format === "jsonl") {
2001
+ return loadEvalCasesFromJsonl(evalFilePath, repoRoot, options);
2002
+ }
1506
2003
  const verbose = options?.verbose ?? false;
1507
- const evalIdFilter = options?.evalId;
1508
- const absoluteTestPath = import_node_path6.default.resolve(evalFilePath);
2004
+ const filterPattern = options?.filter;
2005
+ const absoluteTestPath = import_node_path7.default.resolve(evalFilePath);
1509
2006
  const repoRootPath = resolveToAbsolutePath(repoRoot);
1510
2007
  const searchRoots = buildSearchRoots(absoluteTestPath, repoRootPath);
1511
2008
  const config = await loadConfig(absoluteTestPath, repoRootPath);
1512
2009
  const guidelinePatterns = config?.guideline_patterns;
1513
- const rawFile = await (0, import_promises6.readFile)(absoluteTestPath, "utf8");
1514
- const parsed = (0, import_yaml2.parse)(rawFile);
2010
+ const rawFile = await (0, import_promises7.readFile)(absoluteTestPath, "utf8");
2011
+ const parsed = (0, import_yaml3.parse)(rawFile);
1515
2012
  if (!isJsonObject(parsed)) {
1516
2013
  throw new Error(`Invalid test file format: ${evalFilePath}`);
1517
2014
  }
1518
2015
  const suite = parsed;
1519
- const datasetNameFromSuite = asString5(suite.dataset)?.trim();
1520
- const fallbackDataset = import_node_path6.default.basename(absoluteTestPath).replace(/\.ya?ml$/i, "") || "eval";
2016
+ const datasetNameFromSuite = asString6(suite.dataset)?.trim();
2017
+ const fallbackDataset = import_node_path7.default.basename(absoluteTestPath).replace(/\.ya?ml$/i, "") || "eval";
1521
2018
  const datasetName = datasetNameFromSuite && datasetNameFromSuite.length > 0 ? datasetNameFromSuite : fallbackDataset;
1522
2019
  const rawTestcases = suite.evalcases;
1523
2020
  if (!Array.isArray(rawTestcases)) {
@@ -1525,37 +2022,29 @@ async function loadEvalCases(evalFilePath, repoRoot, options) {
1525
2022
  }
1526
2023
  const globalEvaluator = coerceEvaluator(suite.evaluator, "global") ?? "llm_judge";
1527
2024
  const globalExecution = isJsonObject(suite.execution) ? suite.execution : void 0;
1528
- const _globalTarget = asString5(globalExecution?.target) ?? asString5(suite.target);
2025
+ const _globalTarget = asString6(globalExecution?.target) ?? asString6(suite.target);
1529
2026
  const results = [];
1530
2027
  for (const rawEvalcase of rawTestcases) {
1531
2028
  if (!isJsonObject(rawEvalcase)) {
1532
- logWarning5("Skipping invalid eval case entry (expected object)");
2029
+ logWarning6("Skipping invalid eval case entry (expected object)");
1533
2030
  continue;
1534
2031
  }
1535
2032
  const evalcase = rawEvalcase;
1536
- const id = asString5(evalcase.id);
1537
- if (evalIdFilter && id !== evalIdFilter) {
2033
+ const id = asString6(evalcase.id);
2034
+ if (filterPattern && (!id || !import_micromatch3.default.isMatch(id, filterPattern))) {
1538
2035
  continue;
1539
2036
  }
1540
- const conversationId = asString5(evalcase.conversation_id);
1541
- const outcome = asString5(evalcase.expected_outcome) ?? asString5(evalcase.outcome);
1542
- const inputMessagesValue = evalcase.input_messages;
1543
- const expectedMessagesValue = evalcase.expected_messages;
1544
- if (!id || !outcome || !Array.isArray(inputMessagesValue)) {
1545
- logError(
1546
- `Skipping incomplete eval case: ${id ?? "unknown"}. Missing required fields: id, outcome, and/or input_messages`
2037
+ const conversationId = asString6(evalcase.conversation_id);
2038
+ const outcome = asString6(evalcase.expected_outcome) ?? asString6(evalcase.outcome);
2039
+ const inputMessages = resolveInputMessages(evalcase);
2040
+ const expectedMessages = resolveExpectedMessages(evalcase) ?? [];
2041
+ if (!id || !outcome || !inputMessages || inputMessages.length === 0) {
2042
+ logError2(
2043
+ `Skipping incomplete eval case: ${id ?? "unknown"}. Missing required fields: id, outcome, and/or input_messages (or input)`
1547
2044
  );
1548
2045
  continue;
1549
2046
  }
1550
- const hasExpectedMessages = Array.isArray(expectedMessagesValue) && expectedMessagesValue.length > 0;
1551
- const inputMessages = inputMessagesValue.filter(
1552
- (msg) => isTestMessage(msg)
1553
- );
1554
- const expectedMessages = hasExpectedMessages ? expectedMessagesValue.filter((msg) => isTestMessage(msg)) : [];
1555
- if (hasExpectedMessages && expectedMessages.length === 0) {
1556
- logError(`No valid expected message found for eval case: ${id}`);
1557
- continue;
1558
- }
2047
+ const hasExpectedMessages = expectedMessages.length > 0;
1559
2048
  const guidelinePaths = [];
1560
2049
  const inputTextParts = [];
1561
2050
  const inputSegments = await processMessages({
@@ -1594,33 +2083,13 @@ async function loadEvalCases(evalFilePath, repoRoot, options) {
  evaluators = await parseEvaluators(evalcase, globalExecution, searchRoots, id ?? "unknown");
  } catch (error) {
  const message = error instanceof Error ? error.message : String(error);
- logError(`Skipping eval case '${id}': ${message}`);
+ logError2(`Skipping eval case '${id}': ${message}`);
  continue;
  }
  const inlineRubrics = evalcase.rubrics;
  if (inlineRubrics !== void 0 && Array.isArray(inlineRubrics)) {
- const rubricItems = inlineRubrics.filter((r) => isJsonObject(r) || typeof r === "string").map((rubric, index) => {
- if (typeof rubric === "string") {
- return {
- id: `rubric-${index + 1}`,
- description: rubric,
- weight: 1,
- required: true
- };
- }
- return {
- id: asString5(rubric.id) ?? `rubric-${index + 1}`,
- description: asString5(rubric.description) ?? "",
- weight: typeof rubric.weight === "number" ? rubric.weight : 1,
- required: typeof rubric.required === "boolean" ? rubric.required : true
- };
- }).filter((r) => r.description.length > 0);
- if (rubricItems.length > 0) {
- const rubricEvaluator = {
- name: "rubric",
- type: "llm_judge",
- rubrics: rubricItems
- };
+ const rubricEvaluator = parseInlineRubrics(inlineRubrics);
+ if (rubricEvaluator) {
  evaluators = evaluators ? [rubricEvaluator, ...evaluators] : [rubricEvaluator];
  }
  }
@@ -1631,7 +2100,7 @@ async function loadEvalCases(evalFilePath, repoRoot, options) {
  }
  }
  const allFilePaths = [
- ...guidelinePaths.map((guidelinePath) => import_node_path6.default.resolve(guidelinePath)),
+ ...guidelinePaths.map((guidelinePath) => import_node_path7.default.resolve(guidelinePath)),
  ...userFilePaths
  ];
  const testCase = {
@@ -1643,7 +2112,7 @@ async function loadEvalCases(evalFilePath, repoRoot, options) {
  input_segments: inputSegments,
  expected_messages: outputSegments,
  reference_answer: referenceAnswer,
- guideline_paths: guidelinePaths.map((guidelinePath) => import_node_path6.default.resolve(guidelinePath)),
+ guideline_paths: guidelinePaths.map((guidelinePath) => import_node_path7.default.resolve(guidelinePath)),
  guideline_patterns: guidelinePatterns,
  file_paths: allFilePaths,
  expected_outcome: outcome,
@@ -1666,35 +2135,35 @@ async function loadEvalCases(evalFilePath, repoRoot, options) {
  }
  return results;
  }
- function asString5(value) {
+ function asString6(value) {
  return typeof value === "string" ? value : void 0;
  }
- function logWarning5(message, details) {
+ function logWarning6(message, details) {
  if (details && details.length > 0) {
  const detailBlock = details.join("\n");
- console.warn(`${ANSI_YELLOW6}Warning: ${message}
- ${detailBlock}${ANSI_RESET6}`);
+ console.warn(`${ANSI_YELLOW7}Warning: ${message}
+ ${detailBlock}${ANSI_RESET7}`);
  } else {
- console.warn(`${ANSI_YELLOW6}Warning: ${message}${ANSI_RESET6}`);
+ console.warn(`${ANSI_YELLOW7}Warning: ${message}${ANSI_RESET7}`);
  }
  }
- function logError(message, details) {
+ function logError2(message, details) {
  if (details && details.length > 0) {
  const detailBlock = details.join("\n");
- console.error(`${ANSI_RED}Error: ${message}
- ${detailBlock}${ANSI_RESET6}`);
+ console.error(`${ANSI_RED2}Error: ${message}
+ ${detailBlock}${ANSI_RESET7}`);
  } else {
- console.error(`${ANSI_RED}Error: ${message}${ANSI_RESET6}`);
+ console.error(`${ANSI_RED2}Error: ${message}${ANSI_RESET7}`);
  }
  }
 
  // src/evaluation/file-utils.ts
  var import_node_fs2 = require("fs");
- var import_promises7 = require("fs/promises");
- var import_node_path7 = __toESM(require("path"), 1);
+ var import_promises8 = require("fs/promises");
+ var import_node_path8 = __toESM(require("path"), 1);
  async function fileExists2(filePath) {
  try {
- await (0, import_promises7.access)(filePath, import_node_fs2.constants.F_OK);
+ await (0, import_promises8.access)(filePath, import_node_fs2.constants.F_OK);
  return true;
  } catch {
  return false;
@@ -1704,22 +2173,22 @@ function normalizeLineEndings(content) {
  return content.replace(/\r\n/g, "\n");
  }
  async function readTextFile(filePath) {
- const content = await (0, import_promises7.readFile)(filePath, "utf8");
+ const content = await (0, import_promises8.readFile)(filePath, "utf8");
  return normalizeLineEndings(content);
  }
  async function readJsonFile(filePath) {
- const content = await (0, import_promises7.readFile)(filePath, "utf8");
+ const content = await (0, import_promises8.readFile)(filePath, "utf8");
  return JSON.parse(content);
  }
  async function findGitRoot(startPath) {
- let currentDir = import_node_path7.default.dirname(import_node_path7.default.resolve(startPath));
- const root = import_node_path7.default.parse(currentDir).root;
+ let currentDir = import_node_path8.default.dirname(import_node_path8.default.resolve(startPath));
+ const root = import_node_path8.default.parse(currentDir).root;
  while (currentDir !== root) {
- const gitPath = import_node_path7.default.join(currentDir, ".git");
+ const gitPath = import_node_path8.default.join(currentDir, ".git");
  if (await fileExists2(gitPath)) {
  return currentDir;
  }
- const parentDir = import_node_path7.default.dirname(currentDir);
+ const parentDir = import_node_path8.default.dirname(currentDir);
  if (parentDir === currentDir) {
  break;
  }
@@ -1730,8 +2199,8 @@ async function findGitRoot(startPath) {
  function buildDirectoryChain2(filePath, repoRoot) {
  const directories = [];
  const seen = /* @__PURE__ */ new Set();
- const boundary = import_node_path7.default.resolve(repoRoot);
- let current = import_node_path7.default.resolve(import_node_path7.default.dirname(filePath));
+ const boundary = import_node_path8.default.resolve(repoRoot);
+ let current = import_node_path8.default.resolve(import_node_path8.default.dirname(filePath));
  while (current !== void 0) {
  if (!seen.has(current)) {
  directories.push(current);
@@ -1740,7 +2209,7 @@ function buildDirectoryChain2(filePath, repoRoot) {
  if (current === boundary) {
  break;
  }
- const parent = import_node_path7.default.dirname(current);
+ const parent = import_node_path8.default.dirname(current);
  if (parent === current) {
  break;
  }
@@ -1754,16 +2223,16 @@ function buildDirectoryChain2(filePath, repoRoot) {
  function buildSearchRoots2(evalPath, repoRoot) {
  const uniqueRoots = [];
  const addRoot = (root) => {
- const normalized = import_node_path7.default.resolve(root);
+ const normalized = import_node_path8.default.resolve(root);
  if (!uniqueRoots.includes(normalized)) {
  uniqueRoots.push(normalized);
  }
  };
- let currentDir = import_node_path7.default.dirname(evalPath);
+ let currentDir = import_node_path8.default.dirname(evalPath);
  let reachedBoundary = false;
  while (!reachedBoundary) {
  addRoot(currentDir);
- const parentDir = import_node_path7.default.dirname(currentDir);
+ const parentDir = import_node_path8.default.dirname(currentDir);
  if (currentDir === repoRoot || parentDir === currentDir) {
  reachedBoundary = true;
  } else {
@@ -1781,16 +2250,16 @@ function trimLeadingSeparators2(value) {
  async function resolveFileReference2(rawValue, searchRoots) {
  const displayPath = trimLeadingSeparators2(rawValue);
  const potentialPaths = [];
- if (import_node_path7.default.isAbsolute(rawValue)) {
- potentialPaths.push(import_node_path7.default.normalize(rawValue));
+ if (import_node_path8.default.isAbsolute(rawValue)) {
+ potentialPaths.push(import_node_path8.default.normalize(rawValue));
  }
  for (const base of searchRoots) {
- potentialPaths.push(import_node_path7.default.resolve(base, displayPath));
+ potentialPaths.push(import_node_path8.default.resolve(base, displayPath));
  }
  const attempted = [];
  const seen = /* @__PURE__ */ new Set();
  for (const candidate of potentialPaths) {
- const absoluteCandidate = import_node_path7.default.resolve(candidate);
+ const absoluteCandidate = import_node_path8.default.resolve(candidate);
  if (seen.has(absoluteCandidate)) {
  continue;
  }
@@ -2140,9 +2609,9 @@ async function withRetry(fn, retryConfig, signal) {
  var import_node_child_process = require("child_process");
  var import_node_crypto = require("crypto");
  var import_node_fs3 = require("fs");
- var import_promises8 = require("fs/promises");
+ var import_promises9 = require("fs/promises");
  var import_node_os = require("os");
- var import_node_path9 = __toESM(require("path"), 1);
+ var import_node_path10 = __toESM(require("path"), 1);
 
  // src/evaluation/providers/claude-code-log-tracker.ts
  var GLOBAL_LOGS_KEY = Symbol.for("agentv.claudeCodeLogs");
@@ -2198,7 +2667,7 @@ function subscribeToClaudeCodeLogEntries(listener) {
  }
 
  // src/evaluation/providers/preread.ts
- var import_node_path8 = __toESM(require("path"), 1);
+ var import_node_path9 = __toESM(require("path"), 1);
  function buildPromptDocument(request, inputFiles, options) {
  const parts = [];
  const guidelineFiles = collectGuidelineFiles(
@@ -2221,7 +2690,7 @@ function normalizeInputFiles(inputFiles) {
  }
  const deduped = /* @__PURE__ */ new Map();
  for (const inputFile of inputFiles) {
- const absolutePath = import_node_path8.default.resolve(inputFile);
+ const absolutePath = import_node_path9.default.resolve(inputFile);
  if (!deduped.has(absolutePath)) {
  deduped.set(absolutePath, absolutePath);
  }
@@ -2234,14 +2703,14 @@ function collectGuidelineFiles(inputFiles, guidelinePatterns, overrides) {
  }
  const unique = /* @__PURE__ */ new Map();
  for (const inputFile of inputFiles) {
- const absolutePath = import_node_path8.default.resolve(inputFile);
+ const absolutePath = import_node_path9.default.resolve(inputFile);
  if (overrides?.has(absolutePath)) {
  if (!unique.has(absolutePath)) {
  unique.set(absolutePath, absolutePath);
  }
  continue;
  }
- const normalized = absolutePath.split(import_node_path8.default.sep).join("/");
+ const normalized = absolutePath.split(import_node_path9.default.sep).join("/");
  if (isGuidelineFile(normalized, guidelinePatterns)) {
  if (!unique.has(absolutePath)) {
  unique.set(absolutePath, absolutePath);
@@ -2256,7 +2725,7 @@ function collectInputFiles(inputFiles) {
  }
  const unique = /* @__PURE__ */ new Map();
  for (const inputFile of inputFiles) {
- const absolutePath = import_node_path8.default.resolve(inputFile);
+ const absolutePath = import_node_path9.default.resolve(inputFile);
  if (!unique.has(absolutePath)) {
  unique.set(absolutePath, absolutePath);
  }
@@ -2268,7 +2737,7 @@ function buildMandatoryPrereadBlock(guidelineFiles, inputFiles) {
  return "";
  }
  const buildList = (files) => files.map((absolutePath) => {
- const fileName = import_node_path8.default.basename(absolutePath);
+ const fileName = import_node_path9.default.basename(absolutePath);
  const fileUri = pathToFileUri(absolutePath);
  return `* [${fileName}](${fileUri})`;
  });
@@ -2288,7 +2757,7 @@ ${buildList(inputFiles).join("\n")}.`);
  return sections.join("\n");
  }
  function pathToFileUri(filePath) {
- const absolutePath = import_node_path8.default.isAbsolute(filePath) ? filePath : import_node_path8.default.resolve(filePath);
+ const absolutePath = import_node_path9.default.isAbsolute(filePath) ? filePath : import_node_path9.default.resolve(filePath);
  const normalizedPath = absolutePath.replace(/\\/g, "/");
  if (/^[a-zA-Z]:\//.test(normalizedPath)) {
  return `file:///${normalizedPath}`;
@@ -2325,8 +2794,8 @@ var ClaudeCodeProvider = class {
  const workspaceRoot = await this.createWorkspace();
  const logger = await this.createStreamLogger(request).catch(() => void 0);
  try {
- const promptFile = import_node_path9.default.join(workspaceRoot, PROMPT_FILENAME);
- await (0, import_promises8.writeFile)(promptFile, request.question, "utf8");
+ const promptFile = import_node_path10.default.join(workspaceRoot, PROMPT_FILENAME);
+ await (0, import_promises9.writeFile)(promptFile, request.question, "utf8");
  const args = this.buildClaudeCodeArgs(request.question, inputFiles);
  const cwd = this.resolveCwd();
  const result = await this.executeClaudeCode(args, cwd, request.signal, logger);
@@ -2373,7 +2842,7 @@ var ClaudeCodeProvider = class {
  if (!this.config.cwd) {
  return process.cwd();
  }
- return import_node_path9.default.resolve(this.config.cwd);
+ return import_node_path10.default.resolve(this.config.cwd);
  }
  buildClaudeCodeArgs(prompt, inputFiles) {
  const args = [];
@@ -2430,11 +2899,11 @@ ${filesContext}`;
  }
  }
  async createWorkspace() {
- return await (0, import_promises8.mkdtemp)(import_node_path9.default.join((0, import_node_os.tmpdir)(), WORKSPACE_PREFIX));
+ return await (0, import_promises9.mkdtemp)(import_node_path10.default.join((0, import_node_os.tmpdir)(), WORKSPACE_PREFIX));
  }
  async cleanupWorkspace(workspaceRoot) {
  try {
- await (0, import_promises8.rm)(workspaceRoot, { recursive: true, force: true });
+ await (0, import_promises9.rm)(workspaceRoot, { recursive: true, force: true });
  } catch {
  }
  }
@@ -2444,9 +2913,9 @@ ${filesContext}`;
  return void 0;
  }
  if (this.config.logDir) {
- return import_node_path9.default.resolve(this.config.logDir);
+ return import_node_path10.default.resolve(this.config.logDir);
  }
- return import_node_path9.default.join(process.cwd(), ".agentv", "logs", "claude-code");
+ return import_node_path10.default.join(process.cwd(), ".agentv", "logs", "claude-code");
  }
  async createStreamLogger(request) {
  const logDir = this.resolveLogDirectory();
@@ -2454,13 +2923,13 @@ ${filesContext}`;
  return void 0;
  }
  try {
- await (0, import_promises8.mkdir)(logDir, { recursive: true });
+ await (0, import_promises9.mkdir)(logDir, { recursive: true });
  } catch (error) {
  const message = error instanceof Error ? error.message : String(error);
  console.warn(`Skipping Claude Code stream logging (could not create ${logDir}): ${message}`);
  return void 0;
  }
- const filePath = import_node_path9.default.join(logDir, buildLogFilename(request, this.targetName));
+ const filePath = import_node_path10.default.join(logDir, buildLogFilename(request, this.targetName));
  try {
  const logger = await ClaudeCodeStreamLogger.create({
  filePath,
@@ -2865,16 +3334,16 @@ function escapeShellArg(arg) {
  }
  async function defaultClaudeCodeRunner(options) {
  const tempId = (0, import_node_crypto.randomUUID)();
- const stdoutFile = import_node_path9.default.join((0, import_node_os.tmpdir)(), `agentv-cc-${tempId}-stdout`);
- const stderrFile = import_node_path9.default.join((0, import_node_os.tmpdir)(), `agentv-cc-${tempId}-stderr`);
- const exitFile = import_node_path9.default.join((0, import_node_os.tmpdir)(), `agentv-cc-${tempId}-exit`);
- const pidFile = import_node_path9.default.join((0, import_node_os.tmpdir)(), `agentv-cc-${tempId}-pid`);
+ const stdoutFile = import_node_path10.default.join((0, import_node_os.tmpdir)(), `agentv-cc-${tempId}-stdout`);
+ const stderrFile = import_node_path10.default.join((0, import_node_os.tmpdir)(), `agentv-cc-${tempId}-stderr`);
+ const exitFile = import_node_path10.default.join((0, import_node_os.tmpdir)(), `agentv-cc-${tempId}-exit`);
+ const pidFile = import_node_path10.default.join((0, import_node_os.tmpdir)(), `agentv-cc-${tempId}-pid`);
  try {
  return await runClaudeCodeWithTempFiles(options, stdoutFile, stderrFile, exitFile, pidFile);
  } finally {
  for (const file of [stdoutFile, stderrFile, exitFile, pidFile]) {
  try {
- await (0, import_promises8.rm)(file, { force: true });
+ await (0, import_promises9.rm)(file, { force: true });
  } catch {
  }
  }
@@ -2908,8 +3377,8 @@ async function runClaudeCodeWithTempFiles(options, stdoutFile, stderrFile, exitF
  let lastStdoutSize = 0;
  const readFileIfExists = async (filePath) => {
  try {
- const { readFile: readFile8 } = await import("fs/promises");
- return await readFile8(filePath, "utf8");
+ const { readFile: readFile9 } = await import("fs/promises");
+ return await readFile9(filePath, "utf8");
  } catch {
  return "";
  }
@@ -2982,9 +3451,9 @@ async function runClaudeCodeWithTempFiles(options, stdoutFile, stderrFile, exitF
 
  // src/evaluation/providers/cli.ts
  var import_node_child_process2 = require("child_process");
- var import_promises9 = __toESM(require("fs/promises"), 1);
+ var import_promises10 = __toESM(require("fs/promises"), 1);
  var import_node_os2 = __toESM(require("os"), 1);
- var import_node_path10 = __toESM(require("path"), 1);
+ var import_node_path11 = __toESM(require("path"), 1);
  var import_node_util = require("util");
  var import_zod = require("zod");
  var ToolCallSchema = import_zod.z.object({
@@ -2992,7 +3461,8 @@ var ToolCallSchema = import_zod.z.object({
  input: import_zod.z.unknown().optional(),
  output: import_zod.z.unknown().optional(),
  id: import_zod.z.string().optional(),
- timestamp: import_zod.z.string().optional()
+ timestamp: import_zod.z.string().optional(),
+ duration_ms: import_zod.z.number().optional()
  });
  var OutputMessageInputSchema = import_zod.z.object({
  role: import_zod.z.string(),
@@ -3000,6 +3470,7 @@ var OutputMessageInputSchema = import_zod.z.object({
  content: import_zod.z.unknown().optional(),
  tool_calls: import_zod.z.array(ToolCallSchema).optional(),
  timestamp: import_zod.z.string().optional(),
+ duration_ms: import_zod.z.number().optional(),
  metadata: import_zod.z.record(import_zod.z.unknown()).optional()
  });
  var TokenUsageSchema = import_zod.z.object({
@@ -3038,8 +3509,16 @@ function convertOutputMessages(messages) {
  role: msg.role,
  name: msg.name,
  content: msg.content,
- toolCalls: msg.tool_calls,
+ toolCalls: msg.tool_calls?.map((tc) => ({
+ tool: tc.tool,
+ input: tc.input,
+ output: tc.output,
+ id: tc.id,
+ timestamp: tc.timestamp,
+ durationMs: tc.duration_ms
+ })),
  timestamp: msg.timestamp,
+ durationMs: msg.duration_ms,
  metadata: msg.metadata
  }));
  }
@@ -3353,7 +3832,7 @@ var CliProvider = class {
  throw new Error(`Failed to read output file '${filePath}': ${errorMsg}`);
  } finally {
  if (!this.keepTempFiles) {
- await import_promises9.default.unlink(filePath).catch(() => {
+ await import_promises10.default.unlink(filePath).catch(() => {
  });
  }
  }
@@ -3441,7 +3920,7 @@ function normalizeInputFiles2(inputFiles) {
  }
  const unique = /* @__PURE__ */ new Map();
  for (const inputFile of inputFiles) {
- const absolutePath = import_node_path10.default.resolve(inputFile);
+ const absolutePath = import_node_path11.default.resolve(inputFile);
  if (!unique.has(absolutePath)) {
  unique.set(absolutePath, absolutePath);
  }
@@ -3455,7 +3934,7 @@ function formatFileList(files, template) {
  const formatter = template ?? "{path}";
  return files.map((filePath) => {
  const escapedPath = shellEscape(filePath);
- const escapedName = shellEscape(import_node_path10.default.basename(filePath));
+ const escapedName = shellEscape(import_node_path11.default.basename(filePath));
  return formatter.replaceAll("{path}", escapedPath).replaceAll("{basename}", escapedName);
  }).join(" ");
  }
@@ -3479,7 +3958,7 @@ function generateOutputFilePath(evalCaseId, extension = ".json") {
  const safeEvalId = evalCaseId || "unknown";
  const timestamp = Date.now();
  const random = Math.random().toString(36).substring(2, 9);
- return import_node_path10.default.join(import_node_os2.default.tmpdir(), `agentv-${safeEvalId}-${timestamp}-${random}${extension}`);
+ return import_node_path11.default.join(import_node_os2.default.tmpdir(), `agentv-${safeEvalId}-${timestamp}-${random}${extension}`);
  }
  function formatTimeoutSuffix2(timeoutMs) {
  if (!timeoutMs || timeoutMs <= 0) {
@@ -3493,9 +3972,9 @@ function formatTimeoutSuffix2(timeoutMs) {
  var import_node_child_process3 = require("child_process");
  var import_node_crypto2 = require("crypto");
  var import_node_fs4 = require("fs");
- var import_promises10 = require("fs/promises");
+ var import_promises11 = require("fs/promises");
  var import_node_os3 = require("os");
- var import_node_path11 = __toESM(require("path"), 1);
+ var import_node_path12 = __toESM(require("path"), 1);
  var import_node_util2 = require("util");
 
  // src/evaluation/providers/codex-log-tracker.ts
@@ -3590,8 +4069,8 @@ var CodexProvider = class {
  const promptContent = `${systemPrompt}
 
  ${basePrompt}`;
- const promptFile = import_node_path11.default.join(workspaceRoot, PROMPT_FILENAME2);
- await (0, import_promises10.writeFile)(promptFile, promptContent, "utf8");
+ const promptFile = import_node_path12.default.join(workspaceRoot, PROMPT_FILENAME2);
+ await (0, import_promises11.writeFile)(promptFile, promptContent, "utf8");
  const args = this.buildCodexArgs();
  const cwd = this.resolveCwd(workspaceRoot);
  const result = await this.executeCodex(args, cwd, promptContent, request.signal, logger);
@@ -3640,7 +4119,7 @@ ${basePrompt}`;
  if (!this.config.cwd) {
  return workspaceRoot;
  }
- return import_node_path11.default.resolve(this.config.cwd);
+ return import_node_path12.default.resolve(this.config.cwd);
  }
  buildCodexArgs() {
  const args = [
@@ -3682,11 +4161,11 @@ ${basePrompt}`;
3682
4161
  }
3683
4162
  }
3684
4163
  async createWorkspace() {
3685
- return await (0, import_promises10.mkdtemp)(import_node_path11.default.join((0, import_node_os3.tmpdir)(), WORKSPACE_PREFIX2));
4164
+ return await (0, import_promises11.mkdtemp)(import_node_path12.default.join((0, import_node_os3.tmpdir)(), WORKSPACE_PREFIX2));
3686
4165
  }
3687
4166
  async cleanupWorkspace(workspaceRoot) {
3688
4167
  try {
3689
- await (0, import_promises10.rm)(workspaceRoot, { recursive: true, force: true });
4168
+ await (0, import_promises11.rm)(workspaceRoot, { recursive: true, force: true });
3690
4169
  } catch {
3691
4170
  }
3692
4171
  }
@@ -3696,9 +4175,9 @@ ${basePrompt}`;
3696
4175
  return void 0;
3697
4176
  }
3698
4177
  if (this.config.logDir) {
3699
- return import_node_path11.default.resolve(this.config.logDir);
4178
+ return import_node_path12.default.resolve(this.config.logDir);
3700
4179
  }
3701
- return import_node_path11.default.join(process.cwd(), ".agentv", "logs", "codex");
4180
+ return import_node_path12.default.join(process.cwd(), ".agentv", "logs", "codex");
3702
4181
  }
3703
4182
  async createStreamLogger(request) {
3704
4183
  const logDir = this.resolveLogDirectory();
@@ -3706,13 +4185,13 @@ ${basePrompt}`;
3706
4185
  return void 0;
3707
4186
  }
3708
4187
  try {
3709
- await (0, import_promises10.mkdir)(logDir, { recursive: true });
4188
+ await (0, import_promises11.mkdir)(logDir, { recursive: true });
3710
4189
  } catch (error) {
3711
4190
  const message = error instanceof Error ? error.message : String(error);
3712
4191
  console.warn(`Skipping Codex stream logging (could not create ${logDir}): ${message}`);
3713
4192
  return void 0;
3714
4193
  }
3715
- const filePath = import_node_path11.default.join(logDir, buildLogFilename2(request, this.targetName));
4194
+ const filePath = import_node_path12.default.join(logDir, buildLogFilename2(request, this.targetName));
3716
4195
  try {
3717
4196
  const logger = await CodexStreamLogger.create({
3718
4197
  filePath,
@@ -3927,9 +4406,9 @@ function tryParseJsonValue2(rawLine) {
3927
4406
  async function locateExecutable(candidate) {
3928
4407
  const includesPathSeparator = candidate.includes("/") || candidate.includes("\\");
3929
4408
  if (includesPathSeparator) {
3930
- const resolved = import_node_path11.default.isAbsolute(candidate) ? candidate : import_node_path11.default.resolve(candidate);
4409
+ const resolved = import_node_path12.default.isAbsolute(candidate) ? candidate : import_node_path12.default.resolve(candidate);
3931
4410
  const executablePath = await ensureWindowsExecutableVariant(resolved);
3932
- await (0, import_promises10.access)(executablePath, import_node_fs4.constants.F_OK);
4411
+ await (0, import_promises11.access)(executablePath, import_node_fs4.constants.F_OK);
3933
4412
  return executablePath;
3934
4413
  }
3935
4414
  const locator = process.platform === "win32" ? "where" : "which";
@@ -3939,7 +4418,7 @@ async function locateExecutable(candidate) {
3939
4418
  const preferred = selectExecutableCandidate(lines);
3940
4419
  if (preferred) {
3941
4420
  const executablePath = await ensureWindowsExecutableVariant(preferred);
3942
- await (0, import_promises10.access)(executablePath, import_node_fs4.constants.F_OK);
4421
+ await (0, import_promises11.access)(executablePath, import_node_fs4.constants.F_OK);
3943
4422
  return executablePath;
3944
4423
  }
3945
4424
  } catch {
@@ -3973,7 +4452,7 @@ async function ensureWindowsExecutableVariant(candidate) {
3973
4452
  for (const ext of extensions) {
3974
4453
  const withExtension = `${candidate}${ext}`;
3975
4454
  try {
3976
- await (0, import_promises10.access)(withExtension, import_node_fs4.constants.F_OK);
4455
+ await (0, import_promises11.access)(withExtension, import_node_fs4.constants.F_OK);
3977
4456
  return withExtension;
3978
4457
  } catch {
3979
4458
  }
@@ -4438,9 +4917,9 @@ function extractToolCalls2(content) {
4438
4917
  var import_node_child_process4 = require("child_process");
4439
4918
  var import_node_crypto3 = require("crypto");
4440
4919
  var import_node_fs5 = require("fs");
4441
- var import_promises11 = require("fs/promises");
4920
+ var import_promises12 = require("fs/promises");
4442
4921
  var import_node_os4 = require("os");
4443
- var import_node_path12 = __toESM(require("path"), 1);
4922
+ var import_node_path13 = __toESM(require("path"), 1);
4444
4923
 
4445
4924
  // src/evaluation/providers/pi-log-tracker.ts
4446
4925
  var GLOBAL_LOGS_KEY3 = Symbol.for("agentv.piLogs");
@@ -4524,8 +5003,8 @@ var PiCodingAgentProvider = class {
4524
5003
  const workspaceRoot = await this.createWorkspace();
4525
5004
  const logger = await this.createStreamLogger(request).catch(() => void 0);
4526
5005
  try {
4527
- const promptFile = import_node_path12.default.join(workspaceRoot, PROMPT_FILENAME3);
4528
- await (0, import_promises11.writeFile)(promptFile, request.question, "utf8");
5006
+ const promptFile = import_node_path13.default.join(workspaceRoot, PROMPT_FILENAME3);
5007
+ await (0, import_promises12.writeFile)(promptFile, request.question, "utf8");
4529
5008
  const args = this.buildPiArgs(request.question, inputFiles);
4530
5009
  const cwd = this.resolveCwd(workspaceRoot);
4531
5010
  const result = await this.executePi(args, cwd, request.signal, logger);
@@ -4566,7 +5045,7 @@ var PiCodingAgentProvider = class {
4566
5045
  if (!this.config.cwd) {
4567
5046
  return workspaceRoot;
4568
5047
  }
4569
- return import_node_path12.default.resolve(this.config.cwd);
5048
+ return import_node_path13.default.resolve(this.config.cwd);
4570
5049
  }
4571
5050
  buildPiArgs(prompt, inputFiles) {
4572
5051
  const args = [];
@@ -4655,19 +5134,19 @@ ${prompt}`;
4655
5134
  return env;
4656
5135
  }
4657
5136
  async createWorkspace() {
4658
- return await (0, import_promises11.mkdtemp)(import_node_path12.default.join((0, import_node_os4.tmpdir)(), WORKSPACE_PREFIX3));
5137
+ return await (0, import_promises12.mkdtemp)(import_node_path13.default.join((0, import_node_os4.tmpdir)(), WORKSPACE_PREFIX3));
4659
5138
  }
4660
5139
  async cleanupWorkspace(workspaceRoot) {
4661
5140
  try {
4662
- await (0, import_promises11.rm)(workspaceRoot, { recursive: true, force: true });
5141
+ await (0, import_promises12.rm)(workspaceRoot, { recursive: true, force: true });
4663
5142
  } catch {
4664
5143
  }
4665
5144
  }
4666
5145
  resolveLogDirectory() {
4667
5146
  if (this.config.logDir) {
4668
- return import_node_path12.default.resolve(this.config.logDir);
5147
+ return import_node_path13.default.resolve(this.config.logDir);
4669
5148
  }
4670
- return import_node_path12.default.join(process.cwd(), ".agentv", "logs", "pi-coding-agent");
5149
+ return import_node_path13.default.join(process.cwd(), ".agentv", "logs", "pi-coding-agent");
4671
5150
  }
4672
5151
  async createStreamLogger(request) {
4673
5152
  const logDir = this.resolveLogDirectory();
@@ -4675,13 +5154,13 @@ ${prompt}`;
4675
5154
  return void 0;
4676
5155
  }
4677
5156
  try {
4678
- await (0, import_promises11.mkdir)(logDir, { recursive: true });
5157
+ await (0, import_promises12.mkdir)(logDir, { recursive: true });
4679
5158
  } catch (error) {
4680
5159
  const message = error instanceof Error ? error.message : String(error);
4681
5160
  console.warn(`Skipping Pi stream logging (could not create ${logDir}): ${message}`);
4682
5161
  return void 0;
4683
5162
  }
4684
- const filePath = import_node_path12.default.join(logDir, buildLogFilename3(request, this.targetName));
5163
+ const filePath = import_node_path13.default.join(logDir, buildLogFilename3(request, this.targetName));
4685
5164
  try {
4686
5165
  const logger = await PiStreamLogger.create({
4687
5166
  filePath,
@@ -5114,7 +5593,7 @@ async function defaultPiRunner(options) {
5114
5593
  }
5115
5594
 
5116
5595
  // src/evaluation/providers/targets.ts
5117
- var import_node_path13 = __toESM(require("path"), 1);
5596
+ var import_node_path14 = __toESM(require("path"), 1);
5118
5597
  var import_zod2 = require("zod");
5119
5598
  var CliHealthcheckHttpInputSchema = import_zod2.z.object({
5120
5599
  type: import_zod2.z.literal("http"),
@@ -5220,11 +5699,11 @@ function normalizeCliHealthcheck(input, env, targetName, evalFilePath) {
5220
5699
  allowLiteral: true,
5221
5700
  optionalEnv: true
5222
5701
  });
5223
- if (cwd && evalFilePath && !import_node_path13.default.isAbsolute(cwd)) {
5224
- cwd = import_node_path13.default.resolve(import_node_path13.default.dirname(import_node_path13.default.resolve(evalFilePath)), cwd);
5702
+ if (cwd && evalFilePath && !import_node_path14.default.isAbsolute(cwd)) {
5703
+ cwd = import_node_path14.default.resolve(import_node_path14.default.dirname(import_node_path14.default.resolve(evalFilePath)), cwd);
5225
5704
  }
5226
5705
  if (!cwd && evalFilePath) {
5227
- cwd = import_node_path13.default.dirname(import_node_path13.default.resolve(evalFilePath));
5706
+ cwd = import_node_path14.default.dirname(import_node_path14.default.resolve(evalFilePath));
5228
5707
  }
5229
5708
  return {
5230
5709
  type: "command",
@@ -5251,11 +5730,11 @@ function normalizeCliTargetInput(input, env, evalFilePath) {
5251
5730
  allowLiteral: true,
5252
5731
  optionalEnv: true
5253
5732
  });
5254
- if (cwd && evalFilePath && !import_node_path13.default.isAbsolute(cwd)) {
5255
- cwd = import_node_path13.default.resolve(import_node_path13.default.dirname(import_node_path13.default.resolve(evalFilePath)), cwd);
5733
+ if (cwd && evalFilePath && !import_node_path14.default.isAbsolute(cwd)) {
5734
+ cwd = import_node_path14.default.resolve(import_node_path14.default.dirname(import_node_path14.default.resolve(evalFilePath)), cwd);
5256
5735
  }
5257
5736
  if (!cwd && evalFilePath) {
5258
- cwd = import_node_path13.default.dirname(import_node_path13.default.resolve(evalFilePath));
5737
+ cwd = import_node_path14.default.dirname(import_node_path14.default.resolve(evalFilePath));
5259
5738
  }
5260
5739
  const timeoutSeconds = input.timeout_seconds ?? input.timeoutSeconds;
5261
5740
  const timeoutMs = timeoutSeconds !== void 0 ? Math.floor(timeoutSeconds * 1e3) : void 0;
@@ -5760,8 +6239,8 @@ function resolveCliConfig(target, env, evalFilePath) {
5760
6239
  const parseResult = CliTargetInputSchema.safeParse(target, { errorMap: cliErrorMap });
5761
6240
  if (!parseResult.success) {
5762
6241
  const firstError = parseResult.error.errors[0];
5763
- const path17 = firstError?.path.join(".") || "";
5764
- const prefix = path17 ? `${target.name} ${path17}: ` : `${target.name}: `;
6242
+ const path18 = firstError?.path.join(".") || "";
6243
+ const prefix = path18 ? `${target.name} ${path18}: ` : `${target.name}: `;
5765
6244
  throw new Error(`${prefix}${firstError?.message}`);
5766
6245
  }
5767
6246
  const normalized = normalizeCliTargetInput(parseResult.data, env, evalFilePath);
@@ -5949,7 +6428,7 @@ function resolveOptionalNumberArray(source, description) {
5949
6428
  }
5950
6429
 
5951
6430
  // src/evaluation/providers/vscode.ts
5952
- var import_node_path14 = __toESM(require("path"), 1);
6431
+ var import_node_path15 = __toESM(require("path"), 1);
5953
6432
  var import_subagent = require("subagent");
5954
6433
 
5955
6434
  // src/evaluation/providers/vscode-templates.ts
@@ -6119,7 +6598,7 @@ function buildMandatoryPrereadBlock2(guidelineFiles, attachmentFiles) {
6119
6598
  return "";
6120
6599
  }
6121
6600
  const buildList = (files) => files.map((absolutePath) => {
6122
- const fileName = import_node_path14.default.basename(absolutePath);
6601
+ const fileName = import_node_path15.default.basename(absolutePath);
6123
6602
  const fileUri = pathToFileUri2(absolutePath);
6124
6603
  return `* [${fileName}](${fileUri})`;
6125
6604
  });
@@ -6144,8 +6623,8 @@ function collectGuidelineFiles2(attachments, guidelinePatterns) {
6144
6623
  }
6145
6624
  const unique = /* @__PURE__ */ new Map();
6146
6625
  for (const attachment of attachments) {
6147
- const absolutePath = import_node_path14.default.resolve(attachment);
6148
- const normalized = absolutePath.split(import_node_path14.default.sep).join("/");
6626
+ const absolutePath = import_node_path15.default.resolve(attachment);
6627
+ const normalized = absolutePath.split(import_node_path15.default.sep).join("/");
6149
6628
  if (isGuidelineFile(normalized, guidelinePatterns)) {
6150
6629
  if (!unique.has(absolutePath)) {
6151
6630
  unique.set(absolutePath, absolutePath);
@@ -6160,7 +6639,7 @@ function collectAttachmentFiles(attachments) {
6160
6639
  }
6161
6640
  const unique = /* @__PURE__ */ new Map();
6162
6641
  for (const attachment of attachments) {
6163
- const absolutePath = import_node_path14.default.resolve(attachment);
6642
+ const absolutePath = import_node_path15.default.resolve(attachment);
6164
6643
  if (!unique.has(absolutePath)) {
6165
6644
  unique.set(absolutePath, absolutePath);
6166
6645
  }
@@ -6168,7 +6647,7 @@ function collectAttachmentFiles(attachments) {
6168
6647
  return Array.from(unique.values());
6169
6648
  }
6170
6649
  function pathToFileUri2(filePath) {
6171
- const absolutePath = import_node_path14.default.isAbsolute(filePath) ? filePath : import_node_path14.default.resolve(filePath);
6650
+ const absolutePath = import_node_path15.default.isAbsolute(filePath) ? filePath : import_node_path15.default.resolve(filePath);
6172
6651
  const normalizedPath = absolutePath.replace(/\\/g, "/");
6173
6652
  if (/^[a-zA-Z]:\//.test(normalizedPath)) {
6174
6653
  return `file:///${normalizedPath}`;
@@ -6181,7 +6660,7 @@ function normalizeAttachments(attachments) {
6181
6660
  }
6182
6661
  const deduped = /* @__PURE__ */ new Set();
6183
6662
  for (const attachment of attachments) {
6184
- deduped.add(import_node_path14.default.resolve(attachment));
6663
+ deduped.add(import_node_path15.default.resolve(attachment));
6185
6664
  }
6186
6665
  return Array.from(deduped);
6187
6666
  }
@@ -6190,7 +6669,7 @@ function mergeAttachments(all) {
6190
6669
  for (const list of all) {
6191
6670
  if (!list) continue;
6192
6671
  for (const inputFile of list) {
6193
- deduped.add(import_node_path14.default.resolve(inputFile));
6672
+ deduped.add(import_node_path15.default.resolve(inputFile));
6194
6673
  }
6195
6674
  }
6196
6675
  return deduped.size > 0 ? Array.from(deduped) : void 0;
@@ -6238,9 +6717,9 @@ total unlocked subagents available: ${result.created.length + result.skippedExis
6238
6717
 
6239
6718
  // src/evaluation/providers/targets-file.ts
6240
6719
  var import_node_fs6 = require("fs");
6241
- var import_promises12 = require("fs/promises");
6242
- var import_node_path15 = __toESM(require("path"), 1);
6243
- var import_yaml3 = require("yaml");
6720
+ var import_promises13 = require("fs/promises");
6721
+ var import_node_path16 = __toESM(require("path"), 1);
6722
+ var import_yaml4 = require("yaml");
6244
6723
  function isRecord(value) {
6245
6724
  return typeof value === "object" && value !== null && !Array.isArray(value);
6246
6725
  }
@@ -6269,19 +6748,19 @@ function assertTargetDefinition(value, index, filePath) {
6269
6748
  }
6270
6749
  async function fileExists3(filePath) {
6271
6750
  try {
6272
- await (0, import_promises12.access)(filePath, import_node_fs6.constants.F_OK);
6751
+ await (0, import_promises13.access)(filePath, import_node_fs6.constants.F_OK);
6273
6752
  return true;
6274
6753
  } catch {
6275
6754
  return false;
6276
6755
  }
6277
6756
  }
6278
6757
  async function readTargetDefinitions(filePath) {
6279
- const absolutePath = import_node_path15.default.resolve(filePath);
6758
+ const absolutePath = import_node_path16.default.resolve(filePath);
6280
6759
  if (!await fileExists3(absolutePath)) {
6281
6760
  throw new Error(`targets.yaml not found at ${absolutePath}`);
6282
6761
  }
6283
- const raw = await (0, import_promises12.readFile)(absolutePath, "utf8");
6284
- const parsed = (0, import_yaml3.parse)(raw);
6762
+ const raw = await (0, import_promises13.readFile)(absolutePath, "utf8");
6763
+ const parsed = (0, import_yaml4.parse)(raw);
6285
6764
  if (!isRecord(parsed)) {
6286
6765
  throw new Error(`targets.yaml at ${absolutePath} must be a YAML object with a 'targets' field`);
6287
6766
  }
@@ -6487,15 +6966,15 @@ async function execFileWithStdinNode(argv, stdinPayload, options) {
6487
6966
  });
6488
6967
  }
6489
6968
  async function execShellWithStdin(command, stdinPayload, options = {}) {
6490
- const { mkdir: mkdir4, readFile: readFile8, rm: rm4, writeFile: writeFile4 } = await import("fs/promises");
6969
+ const { mkdir: mkdir4, readFile: readFile9, rm: rm4, writeFile: writeFile4 } = await import("fs/promises");
6491
6970
  const { tmpdir: tmpdir4 } = await import("os");
6492
- const path17 = await import("path");
6971
+ const path18 = await import("path");
6493
6972
  const { randomUUID: randomUUID4 } = await import("crypto");
6494
- const dir = path17.join(tmpdir4(), `agentv-exec-${randomUUID4()}`);
6973
+ const dir = path18.join(tmpdir4(), `agentv-exec-${randomUUID4()}`);
6495
6974
  await mkdir4(dir, { recursive: true });
6496
- const stdinPath = path17.join(dir, "stdin.txt");
6497
- const stdoutPath = path17.join(dir, "stdout.txt");
6498
- const stderrPath = path17.join(dir, "stderr.txt");
6975
+ const stdinPath = path18.join(dir, "stdin.txt");
6976
+ const stdoutPath = path18.join(dir, "stdout.txt");
6977
+ const stderrPath = path18.join(dir, "stderr.txt");
6499
6978
  await writeFile4(stdinPath, stdinPayload, "utf8");
6500
6979
  const wrappedCommand = process.platform === "win32" ? `(${command}) < ${shellEscapePath(stdinPath)} > ${shellEscapePath(stdoutPath)} 2> ${shellEscapePath(stderrPath)}` : `(${command}) < ${shellEscapePath(stdinPath)} > ${shellEscapePath(stdoutPath)} 2> ${shellEscapePath(stderrPath)}`;
6501
6980
  const { spawn: spawn4 } = await import("child_process");
@@ -6525,8 +7004,8 @@ async function execShellWithStdin(command, stdinPayload, options = {}) {
6525
7004
  resolve(code ?? 0);
6526
7005
  });
6527
7006
  });
6528
- const stdout = (await readFile8(stdoutPath, "utf8")).replace(/\r\n/g, "\n");
6529
- const stderr = (await readFile8(stderrPath, "utf8")).replace(/\r\n/g, "\n");
7007
+ const stdout = (await readFile9(stdoutPath, "utf8")).replace(/\r\n/g, "\n");
7008
+ const stderr = (await readFile9(stderrPath, "utf8")).replace(/\r\n/g, "\n");
6530
7009
  return { stdout, stderr, exitCode };
6531
7010
  } finally {
6532
7011
  await rm4(dir, { recursive: true, force: true });
@@ -6798,7 +7277,7 @@ var CodeEvaluator = class {
6798
7277
  outputMessages: context.outputMessages ?? null,
6799
7278
  guidelineFiles: context.evalCase.guideline_paths,
6800
7279
  inputFiles: context.evalCase.file_paths.filter(
6801
- (path17) => !context.evalCase.guideline_paths.includes(path17)
7280
+ (path18) => !context.evalCase.guideline_paths.includes(path18)
6802
7281
  ),
6803
7282
  inputMessages: context.evalCase.input_messages,
6804
7283
  traceSummary: context.traceSummary ?? null,
@@ -6973,6 +7452,15 @@ var rubricEvaluationSchema = import_zod3.z.object({
6973
7452
  checks: import_zod3.z.array(rubricCheckResultSchema).describe("Results for each rubric item"),
6974
7453
  overall_reasoning: import_zod3.z.string().describe("Overall assessment summary (1-2 sentences)")
6975
7454
  });
7455
+ var scoreRangeCheckResultSchema = import_zod3.z.object({
7456
+ id: import_zod3.z.string().describe("The ID of the rubric criterion being scored"),
7457
+ score: import_zod3.z.number().int().min(0).max(10).describe("Integer score 0-10 for this criterion"),
7458
+ reasoning: import_zod3.z.string().describe("Brief explanation (1-2 sentences) for this score").optional()
7459
+ });
7460
+ var scoreRangeEvaluationSchema = import_zod3.z.object({
7461
+ checks: import_zod3.z.array(scoreRangeCheckResultSchema).describe("Scores for each rubric criterion"),
7462
+ overall_reasoning: import_zod3.z.string().describe("Overall assessment summary (1-2 sentences)").optional()
7463
+ });
6976
7464
  var LlmJudgeEvaluator = class {
6977
7465
  kind = "llm_judge";
6978
7466
  resolveJudgeProvider;
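
The two schemas added above describe the structured response expected from the judge model once rubrics carry score ranges (the evaluator switches to this path whenever any rubric defines score_ranges). A sketch of a response that would validate against scoreRangeEvaluationSchema, with invented criterion ids and reasoning:

// Sketch: a judge response parsed with scoreRangeEvaluationSchema.
// Each check scores one rubric criterion as an integer from 0 to 10.
const exampleJudgeResponse = {
  checks: [
    { id: "correctness", score: 8, reasoning: "Handles the main case; misses one edge case." },
    { id: "clarity", score: 6, reasoning: "Explanation is terse but accurate." },
  ],
  overall_reasoning: "Mostly correct answer with a brief but adequate explanation.",
};
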
@@ -7058,6 +7546,10 @@ var LlmJudgeEvaluator = class {
7058
7546
  `No rubrics found for evaluator "${context.evaluator?.name ?? "llm_judge"}". Run "agentv generate rubrics" first.`
7059
7547
  );
7060
7548
  }
7549
+ const hasScoreRanges = rubrics.some((r) => r.score_ranges && r.score_ranges.length > 0);
7550
+ if (hasScoreRanges) {
7551
+ return this.evaluateWithScoreRanges(context, judgeProvider, rubrics);
7552
+ }
7061
7553
  const prompt = this.buildRubricPrompt(context, rubrics);
7062
7554
  const systemPrompt = buildRubricOutputSchema();
7063
7555
  const evaluatorRawRequest = {
@@ -7083,6 +7575,84 @@ var LlmJudgeEvaluator = class {
7083
7575
  evaluatorRawRequest
7084
7576
  };
7085
7577
  }
7578
+ /**
7579
+ * Evaluate using score-range rubrics (analytic rubric scoring).
7580
+ * Each criterion is scored 0-10 and normalized to 0-1.
7581
+ */
7582
+ async evaluateWithScoreRanges(context, judgeProvider, rubrics) {
7583
+ const prompt = this.buildScoreRangePrompt(context, rubrics);
7584
+ const systemPrompt = buildScoreRangeOutputSchema();
7585
+ const evaluatorRawRequest = {
7586
+ userPrompt: prompt,
7587
+ systemPrompt,
7588
+ target: judgeProvider.targetName
7589
+ };
7590
+ const { data } = await this.runWithRetry({
7591
+ context,
7592
+ judgeProvider,
7593
+ systemPrompt,
7594
+ userPrompt: prompt,
7595
+ schema: scoreRangeEvaluationSchema
7596
+ });
7597
+ const { score, verdict, hits, misses, details } = calculateScoreRangeResult(data, rubrics);
7598
+ return {
7599
+ score,
7600
+ verdict,
7601
+ hits,
7602
+ misses,
7603
+ expectedAspectCount: rubrics.length,
7604
+ reasoning: data.overall_reasoning,
7605
+ evaluatorRawRequest,
7606
+ details
7607
+ };
7608
+ }
7609
+ /**
7610
+ * Build prompt for score-range rubric evaluation.
7611
+ */
7612
+ buildScoreRangePrompt(context, rubrics) {
7613
+ const formattedQuestion = context.promptInputs.question && context.promptInputs.question.trim().length > 0 ? context.promptInputs.question : context.evalCase.question;
7614
+ const parts = [
7615
+ "You are an expert evaluator. Score the candidate answer on each criterion below using the provided score ranges.",
7616
+ "For each criterion, output an integer score from 0 to 10 based on which score range best matches the answer.",
7617
+ "",
7618
+ "[[ ## question ## ]]",
7619
+ formattedQuestion,
7620
+ "",
7621
+ "[[ ## expected_outcome ## ]]",
7622
+ context.evalCase.expected_outcome,
7623
+ ""
7624
+ ];
7625
+ if (context.evalCase.reference_answer && context.evalCase.reference_answer.trim().length > 0) {
7626
+ parts.push("[[ ## reference_answer ## ]]", context.evalCase.reference_answer, "");
7627
+ }
7628
+ parts.push(
7629
+ "[[ ## candidate_answer ## ]]",
7630
+ context.candidate,
7631
+ "",
7632
+ "[[ ## scoring_criteria ## ]]"
7633
+ );
7634
+ for (const rubric of rubrics) {
7635
+ const weightLabel = rubric.weight !== 1 ? ` (weight: ${rubric.weight})` : "";
7636
+ const minScoreLabel = rubric.required_min_score !== void 0 ? ` [REQUIRED: min score ${rubric.required_min_score}]` : "";
7637
+ parts.push("", `### Criterion: ${rubric.id}${weightLabel}${minScoreLabel}`);
7638
+ if (rubric.expected_outcome) {
7639
+ parts.push(`Description: ${rubric.expected_outcome}`);
7640
+ }
7641
+ if (rubric.score_ranges && rubric.score_ranges.length > 0) {
7642
+ parts.push("Score ranges:");
7643
+ for (const range of rubric.score_ranges) {
7644
+ const [min, max] = range.score_range;
7645
+ const rangeLabel = min === max ? `${min}` : `${min}-${max}`;
7646
+ parts.push(` - Score ${rangeLabel}: ${range.expected_outcome}`);
7647
+ }
7648
+ }
7649
+ }
7650
+ parts.push(
7651
+ "",
7652
+ "For each criterion, provide an integer score 0-10 that matches one of its defined score ranges."
7653
+ );
7654
+ return parts.join("\n");
7655
+ }
7086
7656
  buildRubricPrompt(context, rubrics) {
7087
7657
  const formattedQuestion = context.promptInputs.question && context.promptInputs.question.trim().length > 0 ? context.promptInputs.question : context.evalCase.question;
7088
7658
  const parts = [
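
buildScoreRangePrompt above reads id, weight, required_min_score, expected_outcome and score_ranges from each rubric, and calculateScoreRangeResult (later in this file) normalizes each 0-10 score to 0-1, takes a weighted average, and forces a "fail" verdict when a criterion scores below its required_min_score (a plain required: true is treated as a minimum of 10). A sketch of rubrics shaped the way this code consumes them, with the resulting arithmetic; the property names match the code above, while the concrete criteria are invented:

// Sketch: score-range rubrics as consumed by buildScoreRangePrompt and
// calculateScoreRangeResult.
const rubrics = [
  {
    id: "correctness",
    weight: 2,
    required_min_score: 6, // verdict becomes "fail" if the judge scores below 6
    expected_outcome: "The answer fixes the reported bug without regressions.",
    score_ranges: [
      { score_range: [0, 3], expected_outcome: "Bug not fixed or new failures introduced." },
      { score_range: [4, 7], expected_outcome: "Bug fixed but with rough edges." },
      { score_range: [8, 10], expected_outcome: "Bug fixed cleanly, with tests." },
    ],
  },
  {
    id: "clarity",
    weight: 1,
    expected_outcome: "The change is explained clearly.",
    score_ranges: [
      { score_range: [0, 5], expected_outcome: "Little or no explanation." },
      { score_range: [6, 10], expected_outcome: "Clear, complete explanation." },
    ],
  },
];
// With judge scores correctness = 8 and clarity = 6, the aggregate is
// (8/10 * 2 + 6/10 * 1) / (2 + 1) = 2.2 / 3 ≈ 0.733, no required_min_score is
// violated, and the verdict comes from scoreToVerdict(0.733).
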
@@ -7102,7 +7672,7 @@ var LlmJudgeEvaluator = class {
7102
7672
  for (const rubric of rubrics) {
7103
7673
  const requiredLabel = rubric.required ? " (REQUIRED)" : "";
7104
7674
  const weightLabel = rubric.weight !== 1 ? ` (weight: ${rubric.weight})` : "";
7105
- parts.push(`- [${rubric.id}]${requiredLabel}${weightLabel}: ${rubric.description}`);
7675
+ parts.push(`- [${rubric.id}]${requiredLabel}${weightLabel}: ${rubric.expected_outcome}`);
7106
7676
  }
7107
7677
  parts.push("", "For each rubric, determine if it is satisfied and provide brief reasoning.");
7108
7678
  return parts.join("\n");
@@ -7189,9 +7759,9 @@ function calculateRubricScore(result, rubrics) {
7189
7759
  totalWeight += rubric.weight;
7190
7760
  if (check.satisfied) {
7191
7761
  earnedWeight += rubric.weight;
7192
- hits.push(`[${rubric.id}] ${rubric.description}: ${check.reasoning}`);
7762
+ hits.push(`[${rubric.id}] ${rubric.expected_outcome}: ${check.reasoning}`);
7193
7763
  } else {
7194
- misses.push(`[${rubric.id}] ${rubric.description}: ${check.reasoning}`);
7764
+ misses.push(`[${rubric.id}] ${rubric.expected_outcome}: ${check.reasoning}`);
7195
7765
  if (rubric.required) {
7196
7766
  failedRequired = true;
7197
7767
  }
@@ -7201,6 +7771,76 @@ function calculateRubricScore(result, rubrics) {
7201
7771
  const verdict = failedRequired ? "fail" : scoreToVerdict(score);
7202
7772
  return { score, verdict, hits, misses };
7203
7773
  }
7774
+ function buildScoreRangeOutputSchema() {
7775
+ return `You are an expert evaluator. Score the candidate answer on each criterion.
7776
+ You must return a valid JSON object matching this schema:
7777
+ {
7778
+ "checks": [
7779
+ {
7780
+ "id": "string (criterion id)",
7781
+ "score": integer (0-10),
7782
+ "reasoning": "string (brief explanation for score)"
7783
+ }
7784
+ ],
7785
+ "overall_reasoning": "string (summary, optional)"
7786
+ }
7787
+
7788
+ Important: The "score" must be an integer from 0 to 10 that falls within one of the defined score ranges for that criterion.`;
7789
+ }
7790
+ function calculateScoreRangeResult(result, rubrics) {
7791
+ const rubricMap = new Map(rubrics.map((rubric) => [rubric.id, rubric]));
7792
+ const hits = [];
7793
+ const misses = [];
7794
+ const rawScores = {};
7795
+ let totalWeight = 0;
7796
+ let weightedScoreSum = 0;
7797
+ let failedRequired = false;
7798
+ for (const check of result.checks) {
7799
+ const rubric = rubricMap.get(check.id);
7800
+ if (!rubric) {
7801
+ continue;
7802
+ }
7803
+ const rawScore = Math.max(0, Math.min(10, check.score));
7804
+ const normalizedScore = rawScore / 10;
7805
+ rawScores[rubric.id] = rawScore;
7806
+ totalWeight += rubric.weight;
7807
+ weightedScoreSum += normalizedScore * rubric.weight;
7808
+ let requiredMinScore;
7809
+ if (rubric.required_min_score !== void 0) {
7810
+ requiredMinScore = rubric.required_min_score;
7811
+ } else if (rubric.required === true) {
7812
+ requiredMinScore = 10;
7813
+ }
7814
+ const matchingRange = rubric.score_ranges?.find(
7815
+ (r) => rawScore >= r.score_range[0] && rawScore <= r.score_range[1]
7816
+ );
7817
+ const rangeDescription = matchingRange?.expected_outcome ?? "";
7818
+ const criterionLabel = rubric.expected_outcome ?? rubric.id;
7819
+ const reasoningText = check.reasoning ? `: ${check.reasoning}` : "";
7820
+ const scoreInfo = `[${rubric.id}] ${criterionLabel} - Score: ${rawScore}/10 (${rangeDescription})${reasoningText}`;
7821
+ if (requiredMinScore !== void 0 && rawScore < requiredMinScore) {
7822
+ failedRequired = true;
7823
+ misses.push(scoreInfo);
7824
+ } else if (rawScore >= 7) {
7825
+ hits.push(scoreInfo);
7826
+ } else {
7827
+ misses.push(scoreInfo);
7828
+ }
7829
+ }
7830
+ const score = totalWeight > 0 ? Math.min(1, Math.max(0, weightedScoreSum / totalWeight)) : 0;
7831
+ const verdict = failedRequired ? "fail" : scoreToVerdict(score);
7832
+ return {
7833
+ score,
7834
+ verdict,
7835
+ hits,
7836
+ misses,
7837
+ details: {
7838
+ raw_scores: rawScores,
7839
+ normalization: "score / 10",
7840
+ aggregation: "weighted_average"
7841
+ }
7842
+ };
7843
+ }
7204
7844
 
7205
7845
  // src/evaluation/evaluators/composite.ts
7206
7846
  var DEFAULT_COMPOSITE_AGGREGATOR_PROMPT = `Review the following evaluation results:
@@ -7584,115 +8224,115 @@ var FieldAccuracyEvaluator = class {
7584
8224
  * Evaluate a single field against the expected value.
7585
8225
  */
7586
8226
  evaluateField(fieldConfig, candidateData, expectedData) {
7587
- const { path: path17, match, required = true, weight = 1 } = fieldConfig;
7588
- const candidateValue = resolvePath(candidateData, path17);
7589
- const expectedValue = resolvePath(expectedData, path17);
8227
+ const { path: path18, match, required = true, weight = 1 } = fieldConfig;
8228
+ const candidateValue = resolvePath(candidateData, path18);
8229
+ const expectedValue = resolvePath(expectedData, path18);
7590
8230
  if (expectedValue === void 0) {
7591
8231
  return {
7592
- path: path17,
8232
+ path: path18,
7593
8233
  score: 1,
7594
8234
  // No expected value means no comparison needed
7595
8235
  weight,
7596
8236
  hit: true,
7597
- message: `${path17}: no expected value`
8237
+ message: `${path18}: no expected value`
7598
8238
  };
7599
8239
  }
7600
8240
  if (candidateValue === void 0) {
7601
8241
  if (required) {
7602
8242
  return {
7603
- path: path17,
8243
+ path: path18,
7604
8244
  score: 0,
7605
8245
  weight,
7606
8246
  hit: false,
7607
- message: `${path17} (required, missing)`
8247
+ message: `${path18} (required, missing)`
7608
8248
  };
7609
8249
  }
7610
8250
  return {
7611
- path: path17,
8251
+ path: path18,
7612
8252
  score: 1,
7613
8253
  // Don't penalize missing optional fields
7614
8254
  weight: 0,
7615
8255
  // Zero weight means it won't affect the score
7616
8256
  hit: true,
7617
- message: `${path17}: optional field missing`
8257
+ message: `${path18}: optional field missing`
7618
8258
  };
7619
8259
  }
7620
8260
  switch (match) {
7621
8261
  case "exact":
7622
- return this.compareExact(path17, candidateValue, expectedValue, weight);
8262
+ return this.compareExact(path18, candidateValue, expectedValue, weight);
7623
8263
  case "numeric_tolerance":
7624
8264
  return this.compareNumericTolerance(
7625
- path17,
8265
+ path18,
7626
8266
  candidateValue,
7627
8267
  expectedValue,
7628
8268
  fieldConfig,
7629
8269
  weight
7630
8270
  );
7631
8271
  case "date":
7632
- return this.compareDate(path17, candidateValue, expectedValue, fieldConfig, weight);
8272
+ return this.compareDate(path18, candidateValue, expectedValue, fieldConfig, weight);
7633
8273
  default:
7634
8274
  return {
7635
- path: path17,
8275
+ path: path18,
7636
8276
  score: 0,
7637
8277
  weight,
7638
8278
  hit: false,
7639
- message: `${path17}: unknown match type "${match}"`
8279
+ message: `${path18}: unknown match type "${match}"`
7640
8280
  };
7641
8281
  }
7642
8282
  }
7643
8283
  /**
7644
8284
  * Exact equality comparison.
7645
8285
  */
7646
- compareExact(path17, candidateValue, expectedValue, weight) {
8286
+ compareExact(path18, candidateValue, expectedValue, weight) {
7647
8287
  if (deepEqual(candidateValue, expectedValue)) {
7648
8288
  return {
7649
- path: path17,
8289
+ path: path18,
7650
8290
  score: 1,
7651
8291
  weight,
7652
8292
  hit: true,
7653
- message: path17
8293
+ message: path18
7654
8294
  };
7655
8295
  }
7656
8296
  if (typeof candidateValue !== typeof expectedValue) {
7657
8297
  return {
7658
- path: path17,
8298
+ path: path18,
7659
8299
  score: 0,
7660
8300
  weight,
7661
8301
  hit: false,
7662
- message: `${path17} (type mismatch: got ${typeof candidateValue}, expected ${typeof expectedValue})`
8302
+ message: `${path18} (type mismatch: got ${typeof candidateValue}, expected ${typeof expectedValue})`
7663
8303
  };
7664
8304
  }
7665
8305
  return {
7666
- path: path17,
8306
+ path: path18,
7667
8307
  score: 0,
7668
8308
  weight,
7669
8309
  hit: false,
7670
- message: `${path17} (value mismatch)`
8310
+ message: `${path18} (value mismatch)`
7671
8311
  };
7672
8312
  }
7673
8313
  /**
7674
8314
  * Numeric comparison with absolute or relative tolerance.
7675
8315
  */
7676
- compareNumericTolerance(path17, candidateValue, expectedValue, fieldConfig, weight) {
8316
+ compareNumericTolerance(path18, candidateValue, expectedValue, fieldConfig, weight) {
7677
8317
  const { tolerance = 0, relative = false } = fieldConfig;
7678
8318
  const candidateNum = toNumber(candidateValue);
7679
8319
  const expectedNum = toNumber(expectedValue);
7680
8320
  if (candidateNum === null || expectedNum === null) {
7681
8321
  return {
7682
- path: path17,
8322
+ path: path18,
7683
8323
  score: 0,
7684
8324
  weight,
7685
8325
  hit: false,
7686
- message: `${path17} (non-numeric value)`
8326
+ message: `${path18} (non-numeric value)`
7687
8327
  };
7688
8328
  }
7689
8329
  if (!Number.isFinite(candidateNum) || !Number.isFinite(expectedNum)) {
7690
8330
  return {
7691
- path: path17,
8331
+ path: path18,
7692
8332
  score: 0,
7693
8333
  weight,
7694
8334
  hit: false,
7695
- message: `${path17} (invalid numeric value)`
8335
+ message: `${path18} (invalid numeric value)`
7696
8336
  };
7697
8337
  }
7698
8338
  const diff = Math.abs(candidateNum - expectedNum);
@@ -7705,61 +8345,61 @@ var FieldAccuracyEvaluator = class {
7705
8345
  }
7706
8346
  if (withinTolerance) {
7707
8347
  return {
7708
- path: path17,
8348
+ path: path18,
7709
8349
  score: 1,
7710
8350
  weight,
7711
8351
  hit: true,
7712
- message: `${path17} (within tolerance: diff=${diff.toFixed(2)})`
8352
+ message: `${path18} (within tolerance: diff=${diff.toFixed(2)})`
7713
8353
  };
7714
8354
  }
7715
8355
  return {
7716
- path: path17,
8356
+ path: path18,
7717
8357
  score: 0,
7718
8358
  weight,
7719
8359
  hit: false,
7720
- message: `${path17} (outside tolerance: diff=${diff.toFixed(2)}, tolerance=${tolerance})`
8360
+ message: `${path18} (outside tolerance: diff=${diff.toFixed(2)}, tolerance=${tolerance})`
7721
8361
  };
7722
8362
  }
7723
8363
  /**
7724
8364
  * Date comparison with format normalization.
7725
8365
  */
7726
- compareDate(path17, candidateValue, expectedValue, fieldConfig, weight) {
8366
+ compareDate(path18, candidateValue, expectedValue, fieldConfig, weight) {
7727
8367
  const formats = fieldConfig.formats ?? DEFAULT_DATE_FORMATS;
7728
8368
  const candidateDate = parseDate(String(candidateValue), formats);
7729
8369
  const expectedDate = parseDate(String(expectedValue), formats);
7730
8370
  if (candidateDate === null) {
7731
8371
  return {
7732
- path: path17,
8372
+ path: path18,
7733
8373
  score: 0,
7734
8374
  weight,
7735
8375
  hit: false,
7736
- message: `${path17} (unparseable candidate date)`
8376
+ message: `${path18} (unparseable candidate date)`
7737
8377
  };
7738
8378
  }
7739
8379
  if (expectedDate === null) {
7740
8380
  return {
7741
- path: path17,
8381
+ path: path18,
7742
8382
  score: 0,
7743
8383
  weight,
7744
8384
  hit: false,
7745
- message: `${path17} (unparseable expected date)`
8385
+ message: `${path18} (unparseable expected date)`
7746
8386
  };
7747
8387
  }
7748
8388
  if (candidateDate.getFullYear() === expectedDate.getFullYear() && candidateDate.getMonth() === expectedDate.getMonth() && candidateDate.getDate() === expectedDate.getDate()) {
7749
8389
  return {
7750
- path: path17,
8390
+ path: path18,
7751
8391
  score: 1,
7752
8392
  weight,
7753
8393
  hit: true,
7754
- message: path17
8394
+ message: path18
7755
8395
  };
7756
8396
  }
7757
8397
  return {
7758
- path: path17,
8398
+ path: path18,
7759
8399
  score: 0,
7760
8400
  weight,
7761
8401
  hit: false,
7762
- message: `${path17} (date mismatch: got ${formatDateISO(candidateDate)}, expected ${formatDateISO(expectedDate)})`
8402
+ message: `${path18} (date mismatch: got ${formatDateISO(candidateDate)}, expected ${formatDateISO(expectedDate)})`
7763
8403
  };
7764
8404
  }
7765
8405
  /**
@@ -7799,11 +8439,11 @@ var FieldAccuracyEvaluator = class {
7799
8439
  };
7800
8440
  }
7801
8441
  };
7802
- function resolvePath(obj, path17) {
7803
- if (!path17 || !obj) {
8442
+ function resolvePath(obj, path18) {
8443
+ if (!path18 || !obj) {
7804
8444
  return void 0;
7805
8445
  }
7806
- const parts = path17.split(/\.|\[|\]/).filter((p) => p.length > 0);
8446
+ const parts = path18.split(/\.|\[|\]/).filter((p) => p.length > 0);
7807
8447
  let current = obj;
7808
8448
  for (const part of parts) {
7809
8449
  if (current === null || current === void 0) {
@@ -8028,6 +8668,27 @@ function argsMatch(expected, actual) {
8028
8668
  }
8029
8669
  return true;
8030
8670
  }
8671
+ function checkLatency(toolName, maxDurationMs, actualDurationMs) {
8672
+ if (maxDurationMs === void 0) {
8673
+ return { status: "skip", message: "" };
8674
+ }
8675
+ if (actualDurationMs === void 0) {
8676
+ return {
8677
+ status: "skip",
8678
+ message: `No duration data for ${toolName}; latency assertion skipped`
8679
+ };
8680
+ }
8681
+ if (actualDurationMs <= maxDurationMs) {
8682
+ return {
8683
+ status: "pass",
8684
+ message: `${toolName} completed in ${actualDurationMs}ms (max: ${maxDurationMs}ms)`
8685
+ };
8686
+ }
8687
+ return {
8688
+ status: "fail",
8689
+ message: `${toolName} took ${actualDurationMs}ms (max: ${maxDurationMs}ms)`
8690
+ };
8691
+ }
8031
8692
  var ToolTrajectoryEvaluator = class {
8032
8693
  kind = "tool_trajectory";
8033
8694
  config;
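
checkLatency above adds optional latency assertions to the tool_trajectory evaluator: when an expected tool call carries a maximum duration and the matched call reports one, the assertion passes or fails; when the trace has no duration data, the assertion is skipped with a console warning instead of being counted against the score. A small sketch of the internal shapes and the combined score (the maxDurationMs / durationMs names come from the code in this section; the eval-file spelling of these keys is not shown here and may differ):

// Sketch: expected items and observed calls as the in-order/exact checks see them.
const expected = [
  { tool: "read_file", maxDurationMs: 500 },     // carries a latency assertion
  { tool: "write_file" },                        // no latency assertion
];
const observed = [
  { name: "read_file", durationMs: 320 },        // sequence hit + latency pass
  { name: "write_file", durationMs: undefined }, // sequence hit, no latency data
];
// score = (sequenceHits + latencyHits) / (expected.length + effectiveLatencyAssertions)
//       = (2 + 1) / (2 + 1) = 1.0
// Had read_file taken 800ms, the latency check would fail: (2 + 0) / 3 ≈ 0.67.
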
@@ -8086,7 +8747,8 @@ var ToolTrajectoryEvaluator = class {
8086
8747
  for (const call of message.toolCalls) {
8087
8748
  toolCalls.push({
8088
8749
  name: call.tool,
8089
- args: call.input
8750
+ args: call.input,
8751
+ durationMs: call.durationMs
8090
8752
  });
8091
8753
  }
8092
8754
  }
@@ -8154,17 +8816,27 @@ var ToolTrajectoryEvaluator = class {
8154
8816
  }
8155
8817
  const hits = [];
8156
8818
  const misses = [];
8819
+ const warnings = [];
8157
8820
  let actualIndex = 0;
8821
+ let sequenceHits = 0;
8822
+ let latencyHits = 0;
8823
+ let latencySkips = 0;
8824
+ const latencyAssertionCount = expected.filter(
8825
+ (item) => item.maxDurationMs !== void 0
8826
+ ).length;
8158
8827
  for (let i = 0; i < expected.length; i++) {
8159
8828
  const expectedItem = expected[i];
8160
8829
  const expectedTool = expectedItem.tool;
8161
8830
  let found = false;
8162
8831
  let argsMismatch = false;
8832
+ let matchedCall;
8163
8833
  while (actualIndex < toolCalls.length) {
8164
8834
  const actualCall = toolCalls[actualIndex];
8165
8835
  if (actualCall.name === expectedTool) {
8166
8836
  if (argsMatch(expectedItem.args, actualCall.args)) {
8167
8837
  hits.push(`Found ${expectedTool} at position ${actualIndex}`);
8838
+ sequenceHits++;
8839
+ matchedCall = actualCall;
8168
8840
  actualIndex++;
8169
8841
  found = true;
8170
8842
  break;
@@ -8181,14 +8853,35 @@ var ToolTrajectoryEvaluator = class {
8181
8853
  if (!found && !argsMismatch) {
8182
8854
  misses.push(`Expected ${expectedTool} at position ${i}, not found in remaining trace`);
8183
8855
  }
8856
+ if (found && matchedCall) {
8857
+ const latencyResult = checkLatency(
8858
+ expectedTool,
8859
+ expectedItem.maxDurationMs,
8860
+ matchedCall.durationMs
8861
+ );
8862
+ if (latencyResult.status === "pass") {
8863
+ hits.push(latencyResult.message);
8864
+ latencyHits++;
8865
+ } else if (latencyResult.status === "fail") {
8866
+ misses.push(latencyResult.message);
8867
+ } else if (latencyResult.message) {
8868
+ warnings.push(latencyResult.message);
8869
+ latencySkips++;
8870
+ }
8871
+ }
8184
8872
  }
8185
- const score = hits.length / expected.length;
8873
+ for (const warning of warnings) {
8874
+ console.warn(`[tool_trajectory] ${warning}`);
8875
+ }
8876
+ const effectiveLatencyAssertions = latencyAssertionCount - latencySkips;
8877
+ const totalAssertions = expected.length + effectiveLatencyAssertions;
8878
+ const score = totalAssertions > 0 ? (sequenceHits + latencyHits) / totalAssertions : 1;
8186
8879
  return {
8187
8880
  score,
8188
8881
  verdict: scoreToVerdict(score),
8189
8882
  hits,
8190
8883
  misses,
8191
- expectedAspectCount: expected.length
8884
+ expectedAspectCount: totalAssertions
8192
8885
  };
8193
8886
  }
8194
8887
  evaluateExact(toolCalls) {
@@ -8204,6 +8897,13 @@ var ToolTrajectoryEvaluator = class {
8204
8897
  }
8205
8898
  const hits = [];
8206
8899
  const misses = [];
8900
+ const warnings = [];
8901
+ let sequenceHits = 0;
8902
+ let latencyHits = 0;
8903
+ let latencySkips = 0;
8904
+ const latencyAssertionCount = expected.filter(
8905
+ (item) => item.maxDurationMs !== void 0
8906
+ ).length;
8207
8907
  if (toolCalls.length !== expected.length) {
8208
8908
  misses.push(`Expected ${expected.length} tool calls, got ${toolCalls.length}`);
8209
8909
  }
@@ -8213,33 +8913,58 @@ var ToolTrajectoryEvaluator = class {
8213
8913
  const expectedTool = expectedItem.tool;
8214
8914
  const actualCall = toolCalls[i];
8215
8915
  const actualTool = actualCall.name;
8916
+ let sequenceMatched = false;
8216
8917
  if (actualTool === expectedTool) {
8217
8918
  if (argsMatch(expectedItem.args, actualCall.args)) {
8218
8919
  hits.push(`Position ${i}: ${expectedTool}`);
8920
+ sequenceHits++;
8921
+ sequenceMatched = true;
8219
8922
  } else {
8220
8923
  misses.push(`Position ${i}: ${expectedTool} args mismatch`);
8221
8924
  }
8222
8925
  } else {
8223
8926
  misses.push(`Position ${i}: expected ${expectedTool}, got ${actualTool}`);
8224
8927
  }
8928
+ if (sequenceMatched) {
8929
+ const latencyResult = checkLatency(
8930
+ expectedTool,
8931
+ expectedItem.maxDurationMs,
8932
+ actualCall.durationMs
8933
+ );
8934
+ if (latencyResult.status === "pass") {
8935
+ hits.push(latencyResult.message);
8936
+ latencyHits++;
8937
+ } else if (latencyResult.status === "fail") {
8938
+ misses.push(latencyResult.message);
8939
+ } else if (latencyResult.message) {
8940
+ warnings.push(latencyResult.message);
8941
+ latencySkips++;
8942
+ }
8943
+ }
8225
8944
  }
8226
8945
  for (let i = checkLength; i < expected.length; i++) {
8227
8946
  misses.push(`Position ${i}: expected ${expected[i].tool}, got nothing`);
8228
8947
  }
8229
- const score = hits.length / expected.length;
8948
+ for (const warning of warnings) {
8949
+ console.warn(`[tool_trajectory] ${warning}`);
8950
+ }
8951
+ const effectiveLatencyAssertions = latencyAssertionCount - latencySkips;
8952
+ const totalAssertions = expected.length + effectiveLatencyAssertions;
8953
+ const score = totalAssertions > 0 ? (sequenceHits + latencyHits) / totalAssertions : 1;
8230
8954
  return {
8231
8955
  score,
8232
8956
  verdict: scoreToVerdict(score),
8233
8957
  hits,
8234
8958
  misses,
8235
- expectedAspectCount: expected.length
8959
+ expectedAspectCount: totalAssertions
8236
8960
  };
8237
8961
  }
8238
8962
  };
8239
8963
 
8240
8964
  // src/evaluation/orchestrator.ts
8241
8965
  var import_node_crypto5 = require("crypto");
8242
- var import_node_path16 = __toESM(require("path"), 1);
8966
+ var import_node_path17 = __toESM(require("path"), 1);
8967
+ var import_micromatch4 = __toESM(require("micromatch"), 1);
8243
8968
 
8244
8969
  // ../../node_modules/.bun/yocto-queue@1.2.2/node_modules/yocto-queue/index.js
8245
8970
  var Node = class {
@@ -8398,17 +9123,17 @@ async function runEvaluation(options) {
8398
9123
  cache,
8399
9124
  useCache,
8400
9125
  now,
8401
- evalId,
9126
+ filter,
8402
9127
  verbose,
8403
9128
  evalCases: preloadedEvalCases,
8404
9129
  onResult,
8405
9130
  onProgress
8406
9131
  } = options;
8407
- const evalCases = preloadedEvalCases ?? await loadEvalCases(evalFilePath, repoRoot, { verbose, evalId });
8408
- const filteredEvalCases = filterEvalCases(evalCases, evalId);
9132
+ const evalCases = preloadedEvalCases ?? await loadEvalCases(evalFilePath, repoRoot, { verbose, filter });
9133
+ const filteredEvalCases = filterEvalCases(evalCases, filter);
8409
9134
  if (filteredEvalCases.length === 0) {
8410
- if (evalId) {
8411
- throw new Error(`Eval case with id '${evalId}' not found in ${evalFilePath}`);
9135
+ if (filter) {
9136
+ throw new Error(`No eval cases matched filter '${filter}' in ${evalFilePath}`);
8412
9137
  }
8413
9138
  return [];
8414
9139
  }
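
runEvaluation now accepts a glob-style filter rather than a single eval id; filterEvalCases (later in this file) matches case ids with micromatch.isMatch, so an exact id still selects one case while a pattern selects several. A brief sketch of the matching behaviour, with invented case ids:

// Sketch: how the new filter option selects eval cases via micromatch.
import micromatch from "micromatch";

const caseIds = ["auth-login", "auth-logout", "billing-refund"];
const filter = "auth-*";
const selected = caseIds.filter((id) => micromatch.isMatch(id, filter));
// selected === ["auth-login", "auth-logout"]
// An exact id such as "billing-refund" continues to match only itself.
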
@@ -8984,7 +9709,10 @@ async function runEvaluatorList(options) {
8984
9709
  attempt,
8985
9710
  promptInputs,
8986
9711
  now,
8987
- judgeProvider
9712
+ judgeProvider,
9713
+ outputMessages,
9714
+ traceSummary,
9715
+ agentTimeoutMs
8988
9716
  });
8989
9717
  const weight = evaluator.weight ?? 1;
8990
9718
  scored.push({ score: score2, name: evaluator.name, type: evaluator.type, weight });
@@ -9038,7 +9766,7 @@ async function runEvaluatorList(options) {
9038
9766
  });
9039
9767
  }
9040
9768
  if (evaluator.type === "composite") {
9041
- const evalFileDir = evalCase.guideline_paths[0] ? import_node_path16.default.dirname(evalCase.guideline_paths[0]) : process.cwd();
9769
+ const evalFileDir = evalCase.guideline_paths[0] ? import_node_path17.default.dirname(evalCase.guideline_paths[0]) : process.cwd();
9042
9770
  const createEvaluator = (memberConfig) => {
9043
9771
  switch (memberConfig.type) {
9044
9772
  case "llm_judge":
@@ -9319,9 +10047,22 @@ async function runLlmJudgeEvaluator(options) {
9319
10047
  attempt,
9320
10048
  promptInputs,
9321
10049
  now,
9322
- judgeProvider
10050
+ judgeProvider,
10051
+ outputMessages,
10052
+ traceSummary,
10053
+ agentTimeoutMs
9323
10054
  } = options;
9324
- const customPrompt = await resolveCustomPrompt(config);
10055
+ const customPrompt = await resolveCustomPrompt(
10056
+ config,
10057
+ {
10058
+ evalCase,
10059
+ candidate,
10060
+ outputMessages,
10061
+ traceSummary,
10062
+ config: config.config
10063
+ },
10064
+ agentTimeoutMs
10065
+ );
9325
10066
  return evaluatorRegistry.llm_judge.evaluate({
9326
10067
  evalCase,
9327
10068
  candidate,
@@ -9335,23 +10076,70 @@ async function runLlmJudgeEvaluator(options) {
9335
10076
  evaluator: config
9336
10077
  });
9337
10078
  }
9338
- async function resolveCustomPrompt(config) {
9339
- if (config.promptPath) {
10079
+ async function resolveCustomPrompt(promptConfig, context, timeoutMs) {
10080
+ if (promptConfig.resolvedPromptScript && promptConfig.resolvedPromptScript.length > 0) {
10081
+ if (!context) {
10082
+ throw new Error("Context required for executable prompt templates");
10083
+ }
10084
+ return executePromptTemplate(
10085
+ promptConfig.resolvedPromptScript,
10086
+ context,
10087
+ promptConfig.config,
10088
+ timeoutMs
10089
+ );
10090
+ }
10091
+ const promptPath = promptConfig.resolvedPromptPath ?? promptConfig.promptPath;
10092
+ if (promptPath) {
9340
10093
  try {
9341
- const content = await readTextFile(config.promptPath);
10094
+ const content = await readTextFile(promptPath);
9342
10095
  return content;
9343
10096
  } catch (error) {
9344
10097
  const message = error instanceof Error ? error.message : String(error);
9345
- console.warn(`Could not read custom prompt at ${config.promptPath}: ${message}`);
10098
+ console.warn(`Could not read custom prompt at ${promptPath}: ${message}`);
10099
+ }
10100
+ }
10101
+ const promptValue = promptConfig.prompt;
10102
+ if (typeof promptValue === "string") {
10103
+ return promptValue;
10104
+ }
10105
+ return void 0;
10106
+ }
10107
+ async function executePromptTemplate(script, context, config, timeoutMs) {
10108
+ const payload = {
10109
+ question: context.evalCase.question,
10110
+ expectedOutcome: context.evalCase.expected_outcome,
10111
+ expectedMessages: context.evalCase.expected_messages,
10112
+ referenceAnswer: context.evalCase.reference_answer,
10113
+ candidateAnswer: context.candidate,
10114
+ outputMessages: context.outputMessages ?? null,
10115
+ guidelineFiles: context.evalCase.guideline_paths,
10116
+ inputFiles: context.evalCase.file_paths.filter(
10117
+ (p) => !context.evalCase.guideline_paths.includes(p)
10118
+ ),
10119
+ inputMessages: context.evalCase.input_messages,
10120
+ traceSummary: context.traceSummary ?? null,
10121
+ config: config ?? context.config ?? null
10122
+ };
10123
+ const inputJson = JSON.stringify(toSnakeCaseDeep(payload), null, 2);
10124
+ const scriptPath = script[script.length - 1];
10125
+ const cwd = import_node_path17.default.dirname(scriptPath);
10126
+ try {
10127
+ const stdout = await executeScript(script, inputJson, timeoutMs, cwd);
10128
+ const prompt = stdout.trim();
10129
+ if (!prompt) {
10130
+ throw new Error("Prompt template produced empty output");
9346
10131
  }
10132
+ return prompt;
10133
+ } catch (error) {
10134
+ const message = error instanceof Error ? error.message : String(error);
10135
+ throw new Error(`Prompt template execution failed: ${message}`);
9347
10136
  }
9348
- return config.prompt;
9349
10137
  }
9350
- function filterEvalCases(evalCases, evalId) {
9351
- if (!evalId) {
10138
+ function filterEvalCases(evalCases, filter) {
10139
+ if (!filter) {
9352
10140
  return evalCases;
9353
10141
  }
9354
- return evalCases.filter((evalCase) => evalCase.id === evalId);
10142
+ return evalCases.filter((evalCase) => import_micromatch4.default.isMatch(evalCase.id, filter));
9355
10143
  }
9356
10144
  function buildEvaluatorRegistry(overrides, resolveJudgeProvider) {
9357
10145
  const llmJudge = overrides?.llm_judge ?? new LlmJudgeEvaluator({
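
resolveCustomPrompt above now supports executable prompt templates: when an evaluator's prompt resolves to a script, executePromptTemplate pipes a snake_cased JSON payload (question, expected_outcome, candidate_answer, output_messages, trace_summary, config, and related fields) to the script's stdin and uses its trimmed stdout as the judge prompt, rejecting empty output. A minimal sketch of such a template, assuming it runs under Node; everything below is illustrative rather than a template shipped with the package:

// Sketch: an executable prompt template. It receives the evaluation payload as
// JSON on stdin (keys snake_cased by the caller) and prints the judge prompt.
import { stdin, stdout } from "node:process";

let raw = "";
stdin.setEncoding("utf8");
stdin.on("data", (chunk) => { raw += chunk; });
stdin.on("end", () => {
  const payload = JSON.parse(raw);
  const prompt = [
    "Judge the following answer.",
    `Question: ${payload.question}`,
    `Expected outcome: ${payload.expected_outcome}`,
    `Candidate answer: ${payload.candidate_answer}`,
  ].join("\n");
  stdout.write(prompt); // empty output is rejected by executePromptTemplate
});
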
@@ -9509,7 +10297,7 @@ var import_ai4 = require("ai");
9509
10297
  var import_zod4 = require("zod");
9510
10298
  var rubricItemSchema = import_zod4.z.object({
9511
10299
  id: import_zod4.z.string().describe("Short identifier for this rubric (e.g., clarity, completeness)"),
9512
- description: import_zod4.z.string().describe("What this rubric checks for"),
10300
+ expected_outcome: import_zod4.z.string().describe("Concrete expected outcome for this rubric item"),
9513
10301
  weight: import_zod4.z.number().default(1).describe("Relative importance (default 1.0)"),
9514
10302
  required: import_zod4.z.boolean().default(true).describe("Whether this is a mandatory requirement")
9515
10303
  });
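
The rubric generator's item schema now uses expected_outcome in place of description. A sketch of a generated rubric list in the shape rubricItemSchema accepts, with invented content:

// Sketch: rubric items matching the updated rubricItemSchema.
const generatedRubrics = [
  {
    id: "completeness",
    expected_outcome: "Every field in the ticket is addressed in the reply.",
    weight: 2,
    required: true,
  },
  {
    id: "tone",
    expected_outcome: "The reply stays polite and professional throughout.",
    weight: 1,
    required: false,
  },
];
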
@@ -9529,7 +10317,7 @@ You must return a valid JSON object matching this schema:
9529
10317
  "rubrics": [
9530
10318
  {
9531
10319
  "id": "string (short identifier)",
9532
- "description": "string (what to check)",
10320
+ "expected_outcome": "string (concrete expected outcome for this rubric item)",
9533
10321
  "weight": number (default 1.0),
9534
10322
  "required": boolean (default true)
9535
10323
  }
@@ -9565,7 +10353,7 @@ function buildPrompt(expectedOutcome, question, referenceAnswer) {
9565
10353
  "Each rubric should:",
9566
10354
  "- Be specific and testable",
9567
10355
  "- Have a short, descriptive ID",
9568
- "- Include a clear description of what to check",
10356
+ "- Include a clear expected outcome statement (what a good answer must demonstrate for this rubric)",
9569
10357
  "- Indicate if it is required (mandatory) or optional",
9570
10358
  "- Have an appropriate weight (default 1.0, use higher values for more important aspects)",
9571
10359
  "",
@@ -9613,6 +10401,7 @@ function createAgentKernel() {
9613
10401
  createAgentKernel,
9614
10402
  createProvider,
9615
10403
  deepEqual,
10404
+ detectFormat,
9616
10405
  ensureVSCodeSubagents,
9617
10406
  executeScript,
9618
10407
  explorationRatio,