@agentv/core 2.1.1 → 2.5.1

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
package/dist/index.js CHANGED
@@ -10,7 +10,7 @@ import {
10
10
  readTextFile,
11
11
  resolveFileReference,
12
12
  resolveTargetDefinition
13
- } from "./chunk-KDEP4I7G.js";
13
+ } from "./chunk-RP3M7COZ.js";
14
14
 
15
15
  // src/evaluation/types.ts
16
16
  var TEST_MESSAGE_ROLE_VALUES = ["system", "user", "assistant", "tool"];
@@ -146,8 +146,9 @@ function mergeExecutionMetrics(summary, metrics) {
146
146
  }
147
147
 
148
148
  // src/evaluation/yaml-parser.ts
149
- import { readFile as readFile5 } from "node:fs/promises";
150
- import path6 from "node:path";
149
+ import { readFile as readFile6 } from "node:fs/promises";
150
+ import path7 from "node:path";
151
+ import micromatch3 from "micromatch";
151
152
  import { parse as parse2 } from "yaml";
152
153
 
153
154
  // src/evaluation/loaders/config-loader.ts
@@ -462,11 +463,11 @@ async function parseEvaluators(rawEvalCase, globalExecution, searchRoots, evalId
462
463
  );
463
464
  }
464
465
  }
465
- const knownProps = /* @__PURE__ */ new Set(["name", "type", "script", "cwd", "weight", "target"]);
466
- const config = {};
466
+ const knownProps2 = /* @__PURE__ */ new Set(["name", "type", "script", "cwd", "weight", "target"]);
467
+ const config2 = {};
467
468
  for (const [key, value] of Object.entries(rawEvaluator)) {
468
- if (!knownProps.has(key) && value !== void 0) {
469
- config[key] = value;
469
+ if (!knownProps2.has(key) && value !== void 0) {
470
+ config2[key] = value;
470
471
  }
471
472
  }
472
473
  evaluators.push({
@@ -476,7 +477,7 @@ async function parseEvaluators(rawEvalCase, globalExecution, searchRoots, evalId
476
477
  cwd,
477
478
  resolvedCwd,
478
479
  ...weight2 !== void 0 ? { weight: weight2 } : {},
479
- ...Object.keys(config).length > 0 ? { config } : {},
480
+ ...Object.keys(config2).length > 0 ? { config: config2 } : {},
480
481
  ...targetConfig !== void 0 ? { target: targetConfig } : {}
481
482
  });
482
483
  continue;
@@ -641,7 +642,7 @@ async function parseEvaluators(rawEvalCase, globalExecution, searchRoots, evalId
641
642
  continue;
642
643
  }
643
644
  const weight2 = validateWeight(rawEvaluator.weight, name, evalId);
644
- const config = {
645
+ const config2 = {
645
646
  name,
646
647
  type: "tool_trajectory",
647
648
  mode,
@@ -649,7 +650,7 @@ async function parseEvaluators(rawEvalCase, globalExecution, searchRoots, evalId
649
650
  ...expected ? { expected } : {},
650
651
  ...weight2 !== void 0 ? { weight: weight2 } : {}
651
652
  };
652
- evaluators.push(config);
653
+ evaluators.push(config2);
653
654
  continue;
654
655
  }
655
656
  if (typeValue === "field_accuracy") {
@@ -786,9 +787,33 @@ async function parseEvaluators(rawEvalCase, globalExecution, searchRoots, evalId
786
787
  });
787
788
  continue;
788
789
  }
789
- const prompt = asString(rawEvaluator.prompt);
790
+ const rawPrompt = rawEvaluator.prompt;
791
+ let prompt;
790
792
  let promptPath;
791
- if (prompt) {
793
+ let resolvedPromptScript;
794
+ let promptScriptConfig;
795
+ if (isJsonObject2(rawPrompt)) {
796
+ const scriptArray = asStringArray(
797
+ rawPrompt.script,
798
+ `prompt.script for evaluator '${name}' in '${evalId}'`
799
+ );
800
+ if (!scriptArray) {
801
+ throw new Error(`Evaluator '${name}' in '${evalId}': prompt object requires script array`);
802
+ }
803
+ const scriptPath = scriptArray[scriptArray.length - 1];
804
+ const resolved = await resolveFileReference2(scriptPath, searchRoots);
805
+ if (resolved.resolvedPath) {
806
+ resolvedPromptScript = [...scriptArray.slice(0, -1), path3.resolve(resolved.resolvedPath)];
807
+ } else {
808
+ throw new Error(
809
+ `Evaluator '${name}' in '${evalId}': prompt script file not found: ${resolved.displayPath}`
810
+ );
811
+ }
812
+ if (isJsonObject2(rawPrompt.config)) {
813
+ promptScriptConfig = rawPrompt.config;
814
+ }
815
+ } else if (typeof rawPrompt === "string") {
816
+ prompt = rawPrompt;
792
817
  const resolved = await resolveFileReference2(prompt, searchRoots);
793
818
  if (resolved.resolvedPath) {
794
819
  promptPath = path3.resolve(resolved.resolvedPath);
@@ -807,12 +832,7 @@ async function parseEvaluators(rawEvalCase, globalExecution, searchRoots, evalId
807
832
  }
808
833
  const _model = asString(rawEvaluator.model);
809
834
  const rawRubrics = rawEvaluator.rubrics;
810
- const parsedRubrics = Array.isArray(rawRubrics) ? rawRubrics.filter((r) => isJsonObject2(r)).map((rubric, index) => ({
811
- id: asString(rubric.id) ?? `rubric-${index + 1}`,
812
- description: asString(rubric.description) ?? "",
813
- weight: typeof rubric.weight === "number" ? rubric.weight : 1,
814
- required: typeof rubric.required === "boolean" ? rubric.required : true
815
- })).filter((r) => r.description.length > 0) : void 0;
835
+ const parsedRubrics = Array.isArray(rawRubrics) ? parseRubricItems(rawRubrics, name, evalId) : void 0;
816
836
  if (typeValue === "rubric") {
817
837
  if (!parsedRubrics) {
818
838
  logWarning2(`Skipping rubric evaluator '${name}' in '${evalId}': missing rubrics array`);
@@ -832,13 +852,26 @@ async function parseEvaluators(rawEvalCase, globalExecution, searchRoots, evalId
832
852
  continue;
833
853
  }
834
854
  const weight = validateWeight(rawEvaluator.weight, name, evalId);
855
+ const knownProps = /* @__PURE__ */ new Set(["name", "type", "prompt", "model", "rubrics", "weight", "config"]);
856
+ const config = {};
857
+ for (const [key, value] of Object.entries(rawEvaluator)) {
858
+ if (!knownProps.has(key) && value !== void 0) {
859
+ config[key] = value;
860
+ }
861
+ }
862
+ const topLevelConfig = isJsonObject2(rawEvaluator.config) ? rawEvaluator.config : {};
863
+ const mergedConfig = { ...config, ...topLevelConfig };
864
+ const finalConfig = promptScriptConfig ?? (Object.keys(mergedConfig).length > 0 ? mergedConfig : void 0);
835
865
  evaluators.push({
836
866
  name,
837
867
  type: "llm_judge",
838
868
  prompt,
839
869
  promptPath,
870
+ ...promptPath ? { resolvedPromptPath: promptPath } : {},
871
+ ...resolvedPromptScript ? { resolvedPromptScript } : {},
840
872
  ...parsedRubrics && parsedRubrics.length > 0 ? { rubrics: parsedRubrics } : {},
841
- ...weight !== void 0 ? { weight } : {}
873
+ ...weight !== void 0 ? { weight } : {},
874
+ ...finalConfig ? { config: finalConfig } : {}
842
875
  });
843
876
  }
844
877
  return evaluators.length > 0 ? evaluators : void 0;
@@ -925,6 +958,191 @@ var VALID_FIELD_AGGREGATION_TYPES = /* @__PURE__ */ new Set(["weighted_average",
925
958
  function isValidFieldAggregationType(value) {
926
959
  return typeof value === "string" && VALID_FIELD_AGGREGATION_TYPES.has(value);
927
960
  }
961
/**
 * Normalize the raw `rubrics` array of an llm_judge evaluator into rubric items.
 *
 * Two rubric shapes are supported:
 *  - score_ranges rubric: graded bands validated by `parseScoreRanges`;
 *  - simple rubric: requires a non-empty expected_outcome (falls back to
 *    `description`), defaulting `required` to true.
 *
 * Non-object entries and simple rubrics without an outcome are skipped with a
 * warning; structural errors (bad required_min_score / score_ranges) throw.
 * Returns undefined when no valid items remain.
 */
function parseRubricItems(rawRubrics, evaluatorName, evalId) {
  const items = [];
  for (const [index, rawRubric] of rawRubrics.entries()) {
    if (!isJsonObject2(rawRubric)) {
      logWarning2(
        `Skipping invalid rubric entry at index ${index} in evaluator '${evaluatorName}' (expected object)`
      );
      continue;
    }
    // Auto-generate a stable id from the 1-based position when none is given.
    const id = asString(rawRubric.id) ?? `rubric-${index + 1}`;
    // `description` is accepted as a legacy alias for `expected_outcome`.
    const expectedOutcome = asString(rawRubric.expected_outcome) ?? asString(rawRubric.description) ?? "";
    const weight = typeof rawRubric.weight === "number" ? rawRubric.weight : 1;
    let requiredMinScore;
    let required;
    if (typeof rawRubric.required_min_score === "number") {
      const minScore = rawRubric.required_min_score;
      // Scores are on a fixed integer 0-10 scale.
      if (!Number.isInteger(minScore) || minScore < 0 || minScore > 10) {
        throw new Error(
          `Invalid required_min_score for rubric '${id}' in evaluator '${evaluatorName}' in '${evalId}': must be an integer 0-10 (got ${minScore})`
        );
      }
      requiredMinScore = minScore;
    }
    if (typeof rawRubric.required === "boolean") {
      required = rawRubric.required;
    }
    let scoreRanges;
    const rawScoreRanges = rawRubric.score_ranges;
    if (rawScoreRanges !== void 0) {
      // score_ranges branch: bands are fully validated (coverage, overlap) by parseScoreRanges.
      if (!Array.isArray(rawScoreRanges)) {
        throw new Error(
          `Invalid score_ranges for rubric '${id}' in evaluator '${evaluatorName}' in '${evalId}': must be an array`
        );
      }
      scoreRanges = parseScoreRanges(rawScoreRanges, id, evaluatorName, evalId);
      items.push({
        id,
        weight,
        // Optional fields are only emitted when explicitly provided.
        ...expectedOutcome.length > 0 ? { expected_outcome: expectedOutcome } : {},
        ...required !== void 0 ? { required } : {},
        ...requiredMinScore !== void 0 ? { required_min_score: requiredMinScore } : {},
        score_ranges: scoreRanges
      });
    } else {
      // Simple rubric branch: expected_outcome is mandatory.
      if (expectedOutcome.length === 0) {
        logWarning2(
          `Skipping rubric '${id}' in evaluator '${evaluatorName}' in '${evalId}': missing expected_outcome`
        );
        continue;
      }
      items.push({
        id,
        expected_outcome: expectedOutcome,
        weight,
        // Default to required: true if not specified (backward compatibility)
        required: required ?? true,
        ...requiredMinScore !== void 0 ? { required_min_score: requiredMinScore } : {}
      });
    }
  }
  return items.length > 0 ? items : void 0;
}
1023
/**
 * Validate and normalize the `score_ranges` bands of a graded rubric.
 *
 * Each entry must be an object with `score_range: [min, max]` (integers in
 * 0-10, min <= max) and a non-empty `expected_outcome` (or legacy
 * `description`). After per-entry validation the set of ranges must be
 * non-overlapping and must cover every integer score 0..10, otherwise an
 * Error is thrown. Returns the ranges in their original order.
 */
function parseScoreRanges(rawRanges, rubricId, evaluatorName, evalId) {
  const ranges = [];
  for (const [index, rawRange] of rawRanges.entries()) {
    if (!isJsonObject2(rawRange)) {
      throw new Error(
        `Invalid score_range entry at index ${index} for rubric '${rubricId}' in evaluator '${evaluatorName}' in '${evalId}': expected object`
      );
    }
    const scoreRangeValue = rawRange.score_range;
    // Must be exactly a two-number [min, max] tuple.
    if (!Array.isArray(scoreRangeValue) || scoreRangeValue.length !== 2 || typeof scoreRangeValue[0] !== "number" || typeof scoreRangeValue[1] !== "number") {
      throw new Error(
        `Invalid score_range at index ${index} for rubric '${rubricId}' in evaluator '${evaluatorName}' in '${evalId}': must be [min, max] array of two numbers`
      );
    }
    const [min, max] = scoreRangeValue;
    if (!Number.isInteger(min) || !Number.isInteger(max)) {
      throw new Error(
        `Invalid score_range at index ${index} for rubric '${rubricId}' in evaluator '${evaluatorName}' in '${evalId}': values must be integers (got [${min}, ${max}])`
      );
    }
    if (min < 0 || min > 10 || max < 0 || max > 10) {
      throw new Error(
        `Invalid score_range at index ${index} for rubric '${rubricId}' in evaluator '${evaluatorName}' in '${evalId}': values must be 0-10 (got [${min}, ${max}])`
      );
    }
    if (min > max) {
      throw new Error(
        `Invalid score_range at index ${index} for rubric '${rubricId}' in evaluator '${evaluatorName}' in '${evalId}': min must be <= max (got [${min}, ${max}])`
      );
    }
    // `description` is accepted as a legacy alias for `expected_outcome`.
    const expectedOutcome = asString(rawRange.expected_outcome) ?? asString(rawRange.description) ?? "";
    if (expectedOutcome.length === 0) {
      throw new Error(
        `Missing expected_outcome for score_range [${min}, ${max}] in rubric '${rubricId}' in evaluator '${evaluatorName}' in '${evalId}'`
      );
    }
    ranges.push({
      score_range: [min, max],
      expected_outcome: expectedOutcome
    });
  }
  // Overlap check: sort a copy by lower bound, then adjacent ranges may not intersect.
  const sortedRanges = [...ranges].sort((a, b) => a.score_range[0] - b.score_range[0]);
  for (let i = 1; i < sortedRanges.length; i++) {
    const prev = sortedRanges[i - 1];
    const curr = sortedRanges[i];
    if (curr.score_range[0] <= prev.score_range[1]) {
      throw new Error(
        `Overlapping score_ranges in rubric '${rubricId}' in evaluator '${evaluatorName}' in '${evalId}': [${prev.score_range[0]}, ${prev.score_range[1]}] overlaps with [${curr.score_range[0]}, ${curr.score_range[1]}]`
      );
    }
  }
  // Coverage check: every integer score 0..10 must belong to some range.
  const covered = /* @__PURE__ */ new Set();
  for (const range of ranges) {
    for (let i = range.score_range[0]; i <= range.score_range[1]; i++) {
      covered.add(i);
    }
  }
  const missing = [];
  for (let i = 0; i <= 10; i++) {
    if (!covered.has(i)) {
      missing.push(i);
    }
  }
  if (missing.length > 0) {
    throw new Error(
      `Incomplete score_ranges coverage in rubric '${rubricId}' in evaluator '${evaluatorName}' in '${evalId}': missing coverage for scores: ${missing.join(", ")}. Ranges must cover all integers 0-10.`
    );
  }
  return ranges;
}
1093
/**
 * Build an implicit llm_judge evaluator from a case-level inline `rubrics`
 * array (as used by JSONL cases).
 *
 * Accepts plain strings (treated as required expected_outcome rubrics) and
 * rubric objects, with lenient best-effort filtering rather than the strict
 * validation of parseRubricItems: malformed entries are dropped silently.
 * Returns undefined when nothing usable remains, otherwise an evaluator
 * object named "rubric" of type "llm_judge".
 */
function parseInlineRubrics(rawRubrics) {
  const rubricItems = rawRubrics.filter((r) => isJsonObject2(r) || typeof r === "string").map((rubric, index) => {
    if (typeof rubric === "string") {
      // Shorthand: a bare string is a required rubric with weight 1.
      return {
        id: `rubric-${index + 1}`,
        expected_outcome: rubric,
        weight: 1,
        required: true
      };
    }
    // `description` is accepted as a legacy alias for `expected_outcome`.
    const expectedOutcome = asString(rubric.expected_outcome) ?? asString(rubric.description) ?? "";
    const rawScoreRanges = rubric.score_ranges;
    // Best-effort score_ranges: non-objects dropped, missing bounds default to [0, 10],
    // entries without an outcome filtered out.
    const scoreRanges = Array.isArray(rawScoreRanges) && rawScoreRanges.length > 0 ? rawScoreRanges.filter((r) => isJsonObject2(r)).map((range) => ({
      score_range: Array.isArray(range.score_range) ? range.score_range : [0, 10],
      expected_outcome: asString(range.expected_outcome) ?? asString(range.description) ?? ""
    })).filter((r) => r.expected_outcome.length > 0) : void 0;
    const baseRubric = {
      id: asString(rubric.id) ?? `rubric-${index + 1}`,
      weight: typeof rubric.weight === "number" ? rubric.weight : 1
    };
    if (scoreRanges && scoreRanges.length > 0) {
      // Graded rubric: optional fields are only emitted when explicitly provided.
      return {
        ...baseRubric,
        ...expectedOutcome.length > 0 ? { expected_outcome: expectedOutcome } : {},
        ...typeof rubric.required === "boolean" ? { required: rubric.required } : {},
        ...typeof rubric.required_min_score === "number" ? { required_min_score: rubric.required_min_score } : {},
        score_ranges: scoreRanges
      };
    }
    // Simple rubric: required defaults to true for backward compatibility.
    return {
      ...baseRubric,
      expected_outcome: expectedOutcome,
      required: typeof rubric.required === "boolean" ? rubric.required : true,
      ...typeof rubric.required_min_score === "number" ? { required_min_score: rubric.required_min_score } : {}
    };
  }).filter(
    // Keep only rubrics that carry an outcome or graded score ranges.
    (r) => r.expected_outcome && r.expected_outcome.length > 0 || "score_ranges" in r && r.score_ranges
  );
  if (rubricItems.length === 0) {
    return void 0;
  }
  return {
    name: "rubric",
    type: "llm_judge",
    rubrics: rubricItems
  };
}
1140
+
1141
+ // src/evaluation/loaders/jsonl-parser.ts
1142
+ import { readFile as readFile4 } from "node:fs/promises";
1143
+ import path5 from "node:path";
1144
+ import micromatch2 from "micromatch";
1145
+ import { parse as parseYaml } from "yaml";
928
1146
 
929
1147
  // src/evaluation/loaders/message-processor.ts
930
1148
  import { readFile as readFile3 } from "node:fs/promises";
@@ -1186,28 +1404,302 @@ async function processExpectedMessages(options) {
1186
1404
  return segments;
1187
1405
  }
1188
1406
 
1189
- // src/evaluation/formatting/prompt-builder.ts
1190
- import { readFile as readFile4 } from "node:fs/promises";
1191
- import path5 from "node:path";
1407
+ // src/evaluation/loaders/shorthand-expansion.ts
1408
/**
 * Expand the shorthand `input` field into an array of test messages.
 * A bare string becomes a single user message; an array is filtered down to
 * valid test messages; anything else (or an empty result) yields undefined.
 */
function expandInputShorthand(value) {
  if (value == null) {
    return void 0;
  }
  if (typeof value === "string") {
    return [{ role: "user", content: value }];
  }
  if (!Array.isArray(value)) {
    return void 0;
  }
  const messages = value.filter((msg) => isTestMessage(msg));
  return messages.length === 0 ? void 0 : messages;
}
1421
/**
 * Expand the shorthand `expected_output` field into expected messages.
 * Strings and non-message arrays/objects are wrapped as a single assistant
 * message; message-shaped arrays/objects are validated via isTestMessage.
 */
function expandExpectedOutputShorthand(value) {
  if (value == null) {
    return void 0;
  }
  if (typeof value === "string") {
    return [{ role: "assistant", content: value }];
  }
  if (Array.isArray(value)) {
    // Heuristic: only the first element is inspected to decide whether the
    // array is a message list or raw assistant content.
    const looksLikeMessages = value.length > 0 && isJsonObject(value[0]) && "role" in value[0];
    if (!looksLikeMessages) {
      return [{ role: "assistant", content: value }];
    }
    const messages = value.filter((msg) => isTestMessage(msg));
    return messages.length > 0 ? messages : void 0;
  }
  if (!isJsonObject(value)) {
    return void 0;
  }
  if ("role" in value) {
    return isTestMessage(value) ? [value] : void 0;
  }
  return [{ role: "assistant", content: value }];
}
1443
/**
 * Resolve a case's input messages: an explicit `input_messages` array wins
 * (filtered to valid messages); otherwise the `input` shorthand is expanded.
 * Returns undefined when nothing valid is found.
 */
function resolveInputMessages(raw) {
  if (raw.input_messages === void 0) {
    return expandInputShorthand(raw.input);
  }
  if (!Array.isArray(raw.input_messages)) {
    // Present but malformed: do NOT fall back to the shorthand.
    return void 0;
  }
  const messages = raw.input_messages.filter((msg) => isTestMessage(msg));
  return messages.length === 0 ? void 0 : messages;
}
1453
/**
 * Resolve a case's expected messages: an explicit `expected_messages` array
 * wins (filtered to valid messages); otherwise the `expected_output`
 * shorthand is expanded. Returns undefined when nothing valid is found.
 */
function resolveExpectedMessages(raw) {
  if (raw.expected_messages === void 0) {
    return expandExpectedOutputShorthand(raw.expected_output);
  }
  if (!Array.isArray(raw.expected_messages)) {
    // Present but malformed: do NOT fall back to the shorthand.
    return void 0;
  }
  const messages = raw.expected_messages.filter((msg) => isTestMessage(msg));
  return messages.length === 0 ? void 0 : messages;
}
1465
+
1466
+ // src/evaluation/loaders/jsonl-parser.ts
1192
1467
  var ANSI_YELLOW5 = "\x1B[33m";
1468
+ var ANSI_RED = "\x1B[31m";
1193
1469
  var ANSI_RESET5 = "\x1B[0m";
1470
/**
 * Determine the eval-file format from its extension.
 * @param {string} filePath - Path whose extension decides the format.
 * @returns {"jsonl"|"yaml"} the detected format.
 * @throws {Error} for any extension other than .jsonl/.yaml/.yml.
 */
function detectFormat(filePath) {
  const extension = path5.extname(filePath).toLowerCase();
  switch (extension) {
    case ".jsonl":
      return "jsonl";
    case ".yaml":
    case ".yml":
      return "yaml";
    default:
      throw new Error(`Unsupported file format: '${extension}'. Supported formats: .yaml, .yml, .jsonl`);
  }
}
1476
/**
 * Load optional sidecar metadata for a JSONL dataset: a `<base>.yaml` file
 * sitting next to `<base>.jsonl`.
 *
 * Missing, unreadable, or non-object sidecar files degrade to `{}` (a
 * warning is logged; "not found" only warns in verbose mode) — the sidecar
 * is always best-effort and never fatal.
 *
 * @param {string} jsonlPath - Absolute path to the .jsonl dataset file.
 * @param {boolean} verbose - Whether to warn when the sidecar is absent.
 * @returns {Promise<object>} metadata with optional description, dataset,
 *   execution, evaluator fields (or `{}`).
 */
async function loadSidecarMetadata(jsonlPath, verbose) {
  const dir = path5.dirname(jsonlPath);
  const base = path5.basename(jsonlPath, ".jsonl");
  const sidecarPath = path5.join(dir, `${base}.yaml`);
  if (!await fileExists2(sidecarPath)) {
    if (verbose) {
      logWarning4(`Sidecar metadata file not found: ${sidecarPath} (using defaults)`);
    }
    return {};
  }
  try {
    const content = await readFile4(sidecarPath, "utf8");
    const parsed = parseYaml(content);
    if (!isJsonObject(parsed)) {
      logWarning4(`Invalid sidecar metadata format in ${sidecarPath}`);
      return {};
    }
    return {
      description: asString4(parsed.description),
      dataset: asString4(parsed.dataset),
      execution: isJsonObject(parsed.execution) ? parsed.execution : void 0,
      // evaluator is passed through as-is; coerceEvaluator validates it later.
      evaluator: parsed.evaluator
    };
  } catch (error) {
    // Fix: a thrown value is not guaranteed to be an Error, so reading
    // `.message` directly could crash. Mirror the safe pattern used by
    // parseJsonlContent / loadEvalCasesFromJsonl.
    const message = error instanceof Error ? error.message : String(error);
    logWarning4(`Could not read sidecar metadata from ${sidecarPath}: ${message}`);
    return {};
  }
}
/**
 * Parse JSONL text into an array of JSON objects, one per non-empty line.
 * Blank (whitespace-only) lines are skipped.
 * @param {string} content - Raw file contents.
 * @param {string} filePath - Path used only for error reporting.
 * @returns {object[]} the parsed case objects.
 * @throws {Error} when a line is not valid JSON or not a JSON object,
 *   reporting the 1-based line number and the file path.
 */
function parseJsonlContent(content, filePath) {
  const cases = [];
  for (const [index, rawLine] of content.split("\n").entries()) {
    const line = rawLine.trim();
    if (line.length === 0) {
      continue;
    }
    try {
      const parsed = JSON.parse(line);
      if (!isJsonObject(parsed)) {
        throw new Error("Expected JSON object");
      }
      cases.push(parsed);
    } catch (error) {
      const message = error instanceof Error ? error.message : String(error);
      throw new Error(`Line ${index + 1}: Invalid JSON - ${message}\nFile: ${filePath}`);
    }
  }
  return cases;
}
/**
 * Load and normalize eval cases from a JSONL dataset file.
 *
 * Pipeline per run: resolve paths/search roots → load repo config and the
 * optional YAML sidecar metadata → parse the JSONL lines → for each case:
 * apply the id glob filter, resolve input/expected messages (shorthand
 * aware), process message segments (guideline and file resolution), derive
 * the reference answer and question text, parse per-case evaluators plus
 * inline rubrics, and assemble the test case. Incomplete or failing cases
 * are logged and skipped, never fatal.
 *
 * @param {string} evalFilePath - Path to the .jsonl dataset.
 * @param {string} repoRoot - Repository root used for file resolution.
 * @param {{verbose?: boolean, filter?: string}} [options] - verbose logging
 *   and an id glob filter (micromatch pattern).
 * @returns {Promise<object[]>} the assembled test cases.
 */
async function loadEvalCasesFromJsonl(evalFilePath, repoRoot, options) {
  const verbose = options?.verbose ?? false;
  const filterPattern = options?.filter;
  const absoluteTestPath = path5.resolve(evalFilePath);
  const repoRootPath = resolveToAbsolutePath(repoRoot);
  const searchRoots = buildSearchRoots2(absoluteTestPath, repoRootPath);
  const config = await loadConfig(absoluteTestPath, repoRootPath);
  const guidelinePatterns = config?.guideline_patterns;
  // Sidecar supplies dataset-wide defaults (description, dataset name, execution, evaluator).
  const sidecar = await loadSidecarMetadata(absoluteTestPath, verbose);
  const rawFile = await readFile4(absoluteTestPath, "utf8");
  const rawCases = parseJsonlContent(rawFile, evalFilePath);
  // Dataset name: sidecar value wins; otherwise derived from the file name.
  const fallbackDataset = path5.basename(absoluteTestPath, ".jsonl") || "eval";
  const datasetName = sidecar.dataset && sidecar.dataset.trim().length > 0 ? sidecar.dataset : fallbackDataset;
  const globalEvaluator = coerceEvaluator(sidecar.evaluator, "sidecar") ?? "llm_judge";
  const globalExecution = sidecar.execution;
  if (verbose) {
    console.log(`
[JSONL Dataset: ${evalFilePath}]`);
    console.log(` Cases: ${rawCases.length}`);
    console.log(` Dataset name: ${datasetName}`);
    if (sidecar.description) {
      console.log(` Description: ${sidecar.description}`);
    }
  }
  const results = [];
  for (let lineIndex = 0; lineIndex < rawCases.length; lineIndex++) {
    const evalcase = rawCases[lineIndex];
    // 1-based for user-facing messages.
    const lineNumber = lineIndex + 1;
    const id = asString4(evalcase.id);
    // Glob filter: cases without an id never match a filter.
    if (filterPattern && (!id || !micromatch2.isMatch(id, filterPattern))) {
      continue;
    }
    const conversationId = asString4(evalcase.conversation_id);
    // `outcome` is a legacy alias for `expected_outcome`.
    const outcome = asString4(evalcase.expected_outcome) ?? asString4(evalcase.outcome);
    const inputMessages = resolveInputMessages(evalcase);
    const expectedMessages = resolveExpectedMessages(evalcase) ?? [];
    if (!id || !outcome || !inputMessages || inputMessages.length === 0) {
      logError(
        `Skipping incomplete eval case at line ${lineNumber}: ${id ?? "unknown"}. Missing required fields: id, expected_outcome, and/or input_messages (or input)`
      );
      continue;
    }
    const hasExpectedMessages = expectedMessages.length > 0;
    // processMessages fills guidelinePaths / inputTextParts as side outputs.
    const guidelinePaths = [];
    const inputTextParts = [];
    const inputSegments = await processMessages({
      messages: inputMessages,
      searchRoots,
      repoRootPath,
      guidelinePatterns,
      guidelinePaths,
      textParts: inputTextParts,
      messageType: "input",
      verbose
    });
    const outputSegments = hasExpectedMessages ? await processExpectedMessages({
      messages: expectedMessages,
      searchRoots,
      repoRootPath,
      verbose
    }) : [];
    // Reference answer = last expected message's content (stringified if
    // structured), falling back to its tool calls.
    let referenceAnswer = "";
    if (outputSegments.length > 0) {
      const lastMessage = outputSegments[outputSegments.length - 1];
      const content = lastMessage.content;
      const toolCalls = lastMessage.tool_calls;
      if (typeof content === "string") {
        referenceAnswer = content;
      } else if (content !== void 0 && content !== null) {
        referenceAnswer = JSON.stringify(content, null, 2);
      } else if (toolCalls !== void 0 && toolCalls !== null) {
        referenceAnswer = JSON.stringify(toolCalls, null, 2);
      }
    }
    // Question text: all non-empty input text parts joined by single spaces.
    const question = inputTextParts.map((part) => part.trim()).filter((part) => part.length > 0).join(" ");
    // Per-case execution overrides the sidecar-level execution wholesale.
    const caseExecution = isJsonObject(evalcase.execution) ? evalcase.execution : void 0;
    const mergedExecution = caseExecution ?? globalExecution;
    const evalCaseEvaluatorKind = coerceEvaluator(evalcase.evaluator, id) ?? globalEvaluator;
    let evaluators;
    try {
      evaluators = await parseEvaluators(evalcase, mergedExecution, searchRoots, id ?? "unknown");
    } catch (error) {
      const message = error instanceof Error ? error.message : String(error);
      logError(`Skipping eval case '${id}' at line ${lineNumber}: ${message}`);
      continue;
    }
    // Inline `rubrics` become an implicit llm_judge evaluator, prepended so it
    // runs before explicitly configured evaluators.
    const inlineRubrics = evalcase.rubrics;
    if (inlineRubrics !== void 0 && Array.isArray(inlineRubrics)) {
      const rubricEvaluator = parseInlineRubrics(inlineRubrics);
      if (rubricEvaluator) {
        evaluators = evaluators ? [rubricEvaluator, ...evaluators] : [rubricEvaluator];
      }
    }
    // Collect resolved file paths referenced by input segments.
    const userFilePaths = [];
    for (const segment of inputSegments) {
      if (segment.type === "file" && typeof segment.resolvedPath === "string") {
        userFilePaths.push(segment.resolvedPath);
      }
    }
    const allFilePaths = [
      ...guidelinePaths.map((guidelinePath) => path5.resolve(guidelinePath)),
      ...userFilePaths
    ];
    const testCase = {
      id,
      dataset: datasetName,
      conversation_id: conversationId,
      question,
      input_messages: inputMessages,
      input_segments: inputSegments,
      expected_messages: outputSegments,
      reference_answer: referenceAnswer,
      guideline_paths: guidelinePaths.map((guidelinePath) => path5.resolve(guidelinePath)),
      guideline_patterns: guidelinePatterns,
      file_paths: allFilePaths,
      expected_outcome: outcome,
      evaluator: evalCaseEvaluatorKind,
      evaluators
    };
    if (verbose) {
      console.log(`
[Eval Case: ${id}]`);
      if (testCase.guideline_paths.length > 0) {
        console.log(` Guidelines used: ${testCase.guideline_paths.length}`);
        for (const guidelinePath of testCase.guideline_paths) {
          console.log(` - ${guidelinePath}`);
        }
      } else {
        console.log(" No guidelines found");
      }
    }
    results.push(testCase);
  }
  return results;
}
1659
/** Return `value` when it is a string, otherwise undefined. */
function asString4(value) {
  if (typeof value === "string") {
    return value;
  }
  return void 0;
}
1662
/**
 * Print a yellow "Warning:" message via console.warn, optionally followed by
 * detail lines (joined with newlines below the message).
 * @param {string} message - Main warning text.
 * @param {string[]} [details] - Optional extra lines.
 */
function logWarning4(message, details) {
  let body = message;
  if (details && details.length > 0) {
    body = `${message}\n${details.join("\n")}`;
  }
  console.warn(`${ANSI_YELLOW5}Warning: ${body}${ANSI_RESET5}`);
}
1671
/**
 * Print a red "Error:" message via console.error, optionally followed by
 * detail lines (joined with newlines below the message).
 * @param {string} message - Main error text.
 * @param {string[]} [details] - Optional extra lines.
 */
function logError(message, details) {
  let body = message;
  if (details && details.length > 0) {
    body = `${message}\n${details.join("\n")}`;
  }
  console.error(`${ANSI_RED}Error: ${body}${ANSI_RESET5}`);
}
1680
+
1681
+ // src/evaluation/formatting/prompt-builder.ts
1682
+ import { readFile as readFile5 } from "node:fs/promises";
1683
+ import path6 from "node:path";
1684
+ var ANSI_YELLOW6 = "\x1B[33m";
1685
+ var ANSI_RESET6 = "\x1B[0m";
1194
1686
  async function buildPromptInputs(testCase, mode = "lm") {
1195
1687
  const guidelineParts = [];
1196
1688
  for (const rawPath of testCase.guideline_paths) {
1197
- const absolutePath = path5.resolve(rawPath);
1689
+ const absolutePath = path6.resolve(rawPath);
1198
1690
  if (!await fileExists2(absolutePath)) {
1199
- logWarning4(`Could not read guideline file ${absolutePath}: file does not exist`);
1691
+ logWarning5(`Could not read guideline file ${absolutePath}: file does not exist`);
1200
1692
  continue;
1201
1693
  }
1202
1694
  try {
1203
- const content = (await readFile4(absolutePath, "utf8")).replace(/\r\n/g, "\n").trim();
1695
+ const content = (await readFile5(absolutePath, "utf8")).replace(/\r\n/g, "\n").trim();
1204
1696
  guidelineParts.push({
1205
1697
  content,
1206
1698
  isFile: true,
1207
- displayPath: path5.basename(absolutePath)
1699
+ displayPath: path6.basename(absolutePath)
1208
1700
  });
1209
1701
  } catch (error) {
1210
- logWarning4(`Could not read guideline file ${absolutePath}: ${error.message}`);
1702
+ logWarning5(`Could not read guideline file ${absolutePath}: ${error.message}`);
1211
1703
  }
1212
1704
  }
1213
1705
  const guidelines = formatFileContents(guidelineParts);
@@ -1231,9 +1723,9 @@ async function buildPromptInputs(testCase, mode = "lm") {
1231
1723
  messageSegments.push({ type: "text", value: segment });
1232
1724
  }
1233
1725
  } else if (isJsonObject(segment)) {
1234
- const type = asString4(segment.type);
1726
+ const type = asString5(segment.type);
1235
1727
  if (type === "file") {
1236
- const value = asString4(segment.value);
1728
+ const value = asString5(segment.value);
1237
1729
  if (!value) continue;
1238
1730
  if (testCase.guideline_patterns && isGuidelineFile(value, testCase.guideline_patterns)) {
1239
1731
  messageSegments.push({ type: "guideline_ref", path: value });
@@ -1244,7 +1736,7 @@ async function buildPromptInputs(testCase, mode = "lm") {
1244
1736
  messageSegments.push({ type: "file", text: fileText, path: value });
1245
1737
  }
1246
1738
  } else if (type === "text") {
1247
- const textValue = asString4(segment.value);
1739
+ const textValue = asString5(segment.value);
1248
1740
  if (textValue && textValue.trim().length > 0) {
1249
1741
  messageSegments.push({ type: "text", value: textValue });
1250
1742
  }
@@ -1398,21 +1890,21 @@ ${guidelineContent.trim()}`);
1398
1890
  }
1399
1891
  return chatPrompt.length > 0 ? chatPrompt : void 0;
1400
1892
  }
1401
- function asString4(value) {
1893
+ function asString5(value) {
1402
1894
  return typeof value === "string" ? value : void 0;
1403
1895
  }
1404
- function logWarning4(message) {
1405
- console.warn(`${ANSI_YELLOW5}Warning: ${message}${ANSI_RESET5}`);
1896
+ function logWarning5(message) {
1897
+ console.warn(`${ANSI_YELLOW6}Warning: ${message}${ANSI_RESET6}`);
1406
1898
  }
1407
1899
 
1408
1900
  // src/evaluation/yaml-parser.ts
1409
- var ANSI_YELLOW6 = "\x1B[33m";
1410
- var ANSI_RED = "\x1B[31m";
1411
- var ANSI_RESET6 = "\x1B[0m";
1901
+ var ANSI_YELLOW7 = "\x1B[33m";
1902
+ var ANSI_RED2 = "\x1B[31m";
1903
+ var ANSI_RESET7 = "\x1B[0m";
1412
1904
  async function readTestSuiteMetadata(testFilePath) {
1413
1905
  try {
1414
- const absolutePath = path6.resolve(testFilePath);
1415
- const content = await readFile5(absolutePath, "utf8");
1906
+ const absolutePath = path7.resolve(testFilePath);
1907
+ const content = await readFile6(absolutePath, "utf8");
1416
1908
  const parsed = parse2(content);
1417
1909
  if (!isJsonObject(parsed)) {
1418
1910
  return {};
@@ -1423,21 +1915,25 @@ async function readTestSuiteMetadata(testFilePath) {
1423
1915
  }
1424
1916
  }
1425
1917
  async function loadEvalCases(evalFilePath, repoRoot, options) {
1918
+ const format = detectFormat(evalFilePath);
1919
+ if (format === "jsonl") {
1920
+ return loadEvalCasesFromJsonl(evalFilePath, repoRoot, options);
1921
+ }
1426
1922
  const verbose = options?.verbose ?? false;
1427
- const evalIdFilter = options?.evalId;
1428
- const absoluteTestPath = path6.resolve(evalFilePath);
1923
+ const filterPattern = options?.filter;
1924
+ const absoluteTestPath = path7.resolve(evalFilePath);
1429
1925
  const repoRootPath = resolveToAbsolutePath(repoRoot);
1430
1926
  const searchRoots = buildSearchRoots2(absoluteTestPath, repoRootPath);
1431
1927
  const config = await loadConfig(absoluteTestPath, repoRootPath);
1432
1928
  const guidelinePatterns = config?.guideline_patterns;
1433
- const rawFile = await readFile5(absoluteTestPath, "utf8");
1929
+ const rawFile = await readFile6(absoluteTestPath, "utf8");
1434
1930
  const parsed = parse2(rawFile);
1435
1931
  if (!isJsonObject(parsed)) {
1436
1932
  throw new Error(`Invalid test file format: ${evalFilePath}`);
1437
1933
  }
1438
1934
  const suite = parsed;
1439
- const datasetNameFromSuite = asString5(suite.dataset)?.trim();
1440
- const fallbackDataset = path6.basename(absoluteTestPath).replace(/\.ya?ml$/i, "") || "eval";
1935
+ const datasetNameFromSuite = asString6(suite.dataset)?.trim();
1936
+ const fallbackDataset = path7.basename(absoluteTestPath).replace(/\.ya?ml$/i, "") || "eval";
1441
1937
  const datasetName = datasetNameFromSuite && datasetNameFromSuite.length > 0 ? datasetNameFromSuite : fallbackDataset;
1442
1938
  const rawTestcases = suite.evalcases;
1443
1939
  if (!Array.isArray(rawTestcases)) {
@@ -1445,37 +1941,29 @@ async function loadEvalCases(evalFilePath, repoRoot, options) {
1445
1941
  }
1446
1942
  const globalEvaluator = coerceEvaluator(suite.evaluator, "global") ?? "llm_judge";
1447
1943
  const globalExecution = isJsonObject(suite.execution) ? suite.execution : void 0;
1448
- const _globalTarget = asString5(globalExecution?.target) ?? asString5(suite.target);
1944
+ const _globalTarget = asString6(globalExecution?.target) ?? asString6(suite.target);
1449
1945
  const results = [];
1450
1946
  for (const rawEvalcase of rawTestcases) {
1451
1947
  if (!isJsonObject(rawEvalcase)) {
1452
- logWarning5("Skipping invalid eval case entry (expected object)");
1948
+ logWarning6("Skipping invalid eval case entry (expected object)");
1453
1949
  continue;
1454
1950
  }
1455
1951
  const evalcase = rawEvalcase;
1456
- const id = asString5(evalcase.id);
1457
- if (evalIdFilter && id !== evalIdFilter) {
1952
+ const id = asString6(evalcase.id);
1953
+ if (filterPattern && (!id || !micromatch3.isMatch(id, filterPattern))) {
1458
1954
  continue;
1459
1955
  }
1460
- const conversationId = asString5(evalcase.conversation_id);
1461
- const outcome = asString5(evalcase.expected_outcome) ?? asString5(evalcase.outcome);
1462
- const inputMessagesValue = evalcase.input_messages;
1463
- const expectedMessagesValue = evalcase.expected_messages;
1464
- if (!id || !outcome || !Array.isArray(inputMessagesValue)) {
1465
- logError(
1466
- `Skipping incomplete eval case: ${id ?? "unknown"}. Missing required fields: id, outcome, and/or input_messages`
1956
+ const conversationId = asString6(evalcase.conversation_id);
1957
+ const outcome = asString6(evalcase.expected_outcome) ?? asString6(evalcase.outcome);
1958
+ const inputMessages = resolveInputMessages(evalcase);
1959
+ const expectedMessages = resolveExpectedMessages(evalcase) ?? [];
1960
+ if (!id || !outcome || !inputMessages || inputMessages.length === 0) {
1961
+ logError2(
1962
+ `Skipping incomplete eval case: ${id ?? "unknown"}. Missing required fields: id, outcome, and/or input_messages (or input)`
1467
1963
  );
1468
1964
  continue;
1469
1965
  }
1470
- const hasExpectedMessages = Array.isArray(expectedMessagesValue) && expectedMessagesValue.length > 0;
1471
- const inputMessages = inputMessagesValue.filter(
1472
- (msg) => isTestMessage(msg)
1473
- );
1474
- const expectedMessages = hasExpectedMessages ? expectedMessagesValue.filter((msg) => isTestMessage(msg)) : [];
1475
- if (hasExpectedMessages && expectedMessages.length === 0) {
1476
- logError(`No valid expected message found for eval case: ${id}`);
1477
- continue;
1478
- }
1966
+ const hasExpectedMessages = expectedMessages.length > 0;
1479
1967
  const guidelinePaths = [];
1480
1968
  const inputTextParts = [];
1481
1969
  const inputSegments = await processMessages({
@@ -1514,33 +2002,13 @@ async function loadEvalCases(evalFilePath, repoRoot, options) {
1514
2002
  evaluators = await parseEvaluators(evalcase, globalExecution, searchRoots, id ?? "unknown");
1515
2003
  } catch (error) {
1516
2004
  const message = error instanceof Error ? error.message : String(error);
1517
- logError(`Skipping eval case '${id}': ${message}`);
2005
+ logError2(`Skipping eval case '${id}': ${message}`);
1518
2006
  continue;
1519
2007
  }
1520
2008
  const inlineRubrics = evalcase.rubrics;
1521
2009
  if (inlineRubrics !== void 0 && Array.isArray(inlineRubrics)) {
1522
- const rubricItems = inlineRubrics.filter((r) => isJsonObject(r) || typeof r === "string").map((rubric, index) => {
1523
- if (typeof rubric === "string") {
1524
- return {
1525
- id: `rubric-${index + 1}`,
1526
- description: rubric,
1527
- weight: 1,
1528
- required: true
1529
- };
1530
- }
1531
- return {
1532
- id: asString5(rubric.id) ?? `rubric-${index + 1}`,
1533
- description: asString5(rubric.description) ?? "",
1534
- weight: typeof rubric.weight === "number" ? rubric.weight : 1,
1535
- required: typeof rubric.required === "boolean" ? rubric.required : true
1536
- };
1537
- }).filter((r) => r.description.length > 0);
1538
- if (rubricItems.length > 0) {
1539
- const rubricEvaluator = {
1540
- name: "rubric",
1541
- type: "llm_judge",
1542
- rubrics: rubricItems
1543
- };
2010
+ const rubricEvaluator = parseInlineRubrics(inlineRubrics);
2011
+ if (rubricEvaluator) {
1544
2012
  evaluators = evaluators ? [rubricEvaluator, ...evaluators] : [rubricEvaluator];
1545
2013
  }
1546
2014
  }
@@ -1551,7 +2019,7 @@ async function loadEvalCases(evalFilePath, repoRoot, options) {
1551
2019
  }
1552
2020
  }
1553
2021
  const allFilePaths = [
1554
- ...guidelinePaths.map((guidelinePath) => path6.resolve(guidelinePath)),
2022
+ ...guidelinePaths.map((guidelinePath) => path7.resolve(guidelinePath)),
1555
2023
  ...userFilePaths
1556
2024
  ];
1557
2025
  const testCase = {
@@ -1563,7 +2031,7 @@ async function loadEvalCases(evalFilePath, repoRoot, options) {
1563
2031
  input_segments: inputSegments,
1564
2032
  expected_messages: outputSegments,
1565
2033
  reference_answer: referenceAnswer,
1566
- guideline_paths: guidelinePaths.map((guidelinePath) => path6.resolve(guidelinePath)),
2034
+ guideline_paths: guidelinePaths.map((guidelinePath) => path7.resolve(guidelinePath)),
1567
2035
  guideline_patterns: guidelinePatterns,
1568
2036
  file_paths: allFilePaths,
1569
2037
  expected_outcome: outcome,
@@ -1586,25 +2054,25 @@ async function loadEvalCases(evalFilePath, repoRoot, options) {
1586
2054
  }
1587
2055
  return results;
1588
2056
  }
1589
- function asString5(value) {
2057
+ function asString6(value) {
1590
2058
  return typeof value === "string" ? value : void 0;
1591
2059
  }
1592
- function logWarning5(message, details) {
2060
+ function logWarning6(message, details) {
1593
2061
  if (details && details.length > 0) {
1594
2062
  const detailBlock = details.join("\n");
1595
- console.warn(`${ANSI_YELLOW6}Warning: ${message}
1596
- ${detailBlock}${ANSI_RESET6}`);
2063
+ console.warn(`${ANSI_YELLOW7}Warning: ${message}
2064
+ ${detailBlock}${ANSI_RESET7}`);
1597
2065
  } else {
1598
- console.warn(`${ANSI_YELLOW6}Warning: ${message}${ANSI_RESET6}`);
2066
+ console.warn(`${ANSI_YELLOW7}Warning: ${message}${ANSI_RESET7}`);
1599
2067
  }
1600
2068
  }
1601
- function logError(message, details) {
2069
+ function logError2(message, details) {
1602
2070
  if (details && details.length > 0) {
1603
2071
  const detailBlock = details.join("\n");
1604
- console.error(`${ANSI_RED}Error: ${message}
1605
- ${detailBlock}${ANSI_RESET6}`);
2072
+ console.error(`${ANSI_RED2}Error: ${message}
2073
+ ${detailBlock}${ANSI_RESET7}`);
1606
2074
  } else {
1607
- console.error(`${ANSI_RED}Error: ${message}${ANSI_RESET6}`);
2075
+ console.error(`${ANSI_RED2}Error: ${message}${ANSI_RESET7}`);
1608
2076
  }
1609
2077
  }
1610
2078
 
@@ -1947,7 +2415,7 @@ import { randomUUID } from "node:crypto";
1947
2415
  import { createWriteStream } from "node:fs";
1948
2416
  import { mkdir, mkdtemp, rm, writeFile } from "node:fs/promises";
1949
2417
  import { tmpdir } from "node:os";
1950
- import path8 from "node:path";
2418
+ import path9 from "node:path";
1951
2419
 
1952
2420
  // src/evaluation/providers/claude-code-log-tracker.ts
1953
2421
  var GLOBAL_LOGS_KEY = Symbol.for("agentv.claudeCodeLogs");
@@ -2003,7 +2471,7 @@ function subscribeToClaudeCodeLogEntries(listener) {
2003
2471
  }
2004
2472
 
2005
2473
  // src/evaluation/providers/preread.ts
2006
- import path7 from "node:path";
2474
+ import path8 from "node:path";
2007
2475
  function buildPromptDocument(request, inputFiles, options) {
2008
2476
  const parts = [];
2009
2477
  const guidelineFiles = collectGuidelineFiles(
@@ -2026,7 +2494,7 @@ function normalizeInputFiles(inputFiles) {
2026
2494
  }
2027
2495
  const deduped = /* @__PURE__ */ new Map();
2028
2496
  for (const inputFile of inputFiles) {
2029
- const absolutePath = path7.resolve(inputFile);
2497
+ const absolutePath = path8.resolve(inputFile);
2030
2498
  if (!deduped.has(absolutePath)) {
2031
2499
  deduped.set(absolutePath, absolutePath);
2032
2500
  }
@@ -2039,14 +2507,14 @@ function collectGuidelineFiles(inputFiles, guidelinePatterns, overrides) {
2039
2507
  }
2040
2508
  const unique = /* @__PURE__ */ new Map();
2041
2509
  for (const inputFile of inputFiles) {
2042
- const absolutePath = path7.resolve(inputFile);
2510
+ const absolutePath = path8.resolve(inputFile);
2043
2511
  if (overrides?.has(absolutePath)) {
2044
2512
  if (!unique.has(absolutePath)) {
2045
2513
  unique.set(absolutePath, absolutePath);
2046
2514
  }
2047
2515
  continue;
2048
2516
  }
2049
- const normalized = absolutePath.split(path7.sep).join("/");
2517
+ const normalized = absolutePath.split(path8.sep).join("/");
2050
2518
  if (isGuidelineFile(normalized, guidelinePatterns)) {
2051
2519
  if (!unique.has(absolutePath)) {
2052
2520
  unique.set(absolutePath, absolutePath);
@@ -2061,7 +2529,7 @@ function collectInputFiles(inputFiles) {
2061
2529
  }
2062
2530
  const unique = /* @__PURE__ */ new Map();
2063
2531
  for (const inputFile of inputFiles) {
2064
- const absolutePath = path7.resolve(inputFile);
2532
+ const absolutePath = path8.resolve(inputFile);
2065
2533
  if (!unique.has(absolutePath)) {
2066
2534
  unique.set(absolutePath, absolutePath);
2067
2535
  }
@@ -2073,7 +2541,7 @@ function buildMandatoryPrereadBlock(guidelineFiles, inputFiles) {
2073
2541
  return "";
2074
2542
  }
2075
2543
  const buildList = (files) => files.map((absolutePath) => {
2076
- const fileName = path7.basename(absolutePath);
2544
+ const fileName = path8.basename(absolutePath);
2077
2545
  const fileUri = pathToFileUri(absolutePath);
2078
2546
  return `* [${fileName}](${fileUri})`;
2079
2547
  });
@@ -2093,7 +2561,7 @@ ${buildList(inputFiles).join("\n")}.`);
2093
2561
  return sections.join("\n");
2094
2562
  }
2095
2563
  function pathToFileUri(filePath) {
2096
- const absolutePath = path7.isAbsolute(filePath) ? filePath : path7.resolve(filePath);
2564
+ const absolutePath = path8.isAbsolute(filePath) ? filePath : path8.resolve(filePath);
2097
2565
  const normalizedPath = absolutePath.replace(/\\/g, "/");
2098
2566
  if (/^[a-zA-Z]:\//.test(normalizedPath)) {
2099
2567
  return `file:///${normalizedPath}`;
@@ -2130,7 +2598,7 @@ var ClaudeCodeProvider = class {
2130
2598
  const workspaceRoot = await this.createWorkspace();
2131
2599
  const logger = await this.createStreamLogger(request).catch(() => void 0);
2132
2600
  try {
2133
- const promptFile = path8.join(workspaceRoot, PROMPT_FILENAME);
2601
+ const promptFile = path9.join(workspaceRoot, PROMPT_FILENAME);
2134
2602
  await writeFile(promptFile, request.question, "utf8");
2135
2603
  const args = this.buildClaudeCodeArgs(request.question, inputFiles);
2136
2604
  const cwd = this.resolveCwd();
@@ -2178,7 +2646,7 @@ var ClaudeCodeProvider = class {
2178
2646
  if (!this.config.cwd) {
2179
2647
  return process.cwd();
2180
2648
  }
2181
- return path8.resolve(this.config.cwd);
2649
+ return path9.resolve(this.config.cwd);
2182
2650
  }
2183
2651
  buildClaudeCodeArgs(prompt, inputFiles) {
2184
2652
  const args = [];
@@ -2235,7 +2703,7 @@ ${filesContext}`;
2235
2703
  }
2236
2704
  }
2237
2705
  async createWorkspace() {
2238
- return await mkdtemp(path8.join(tmpdir(), WORKSPACE_PREFIX));
2706
+ return await mkdtemp(path9.join(tmpdir(), WORKSPACE_PREFIX));
2239
2707
  }
2240
2708
  async cleanupWorkspace(workspaceRoot) {
2241
2709
  try {
@@ -2249,9 +2717,9 @@ ${filesContext}`;
2249
2717
  return void 0;
2250
2718
  }
2251
2719
  if (this.config.logDir) {
2252
- return path8.resolve(this.config.logDir);
2720
+ return path9.resolve(this.config.logDir);
2253
2721
  }
2254
- return path8.join(process.cwd(), ".agentv", "logs", "claude-code");
2722
+ return path9.join(process.cwd(), ".agentv", "logs", "claude-code");
2255
2723
  }
2256
2724
  async createStreamLogger(request) {
2257
2725
  const logDir = this.resolveLogDirectory();
@@ -2265,7 +2733,7 @@ ${filesContext}`;
2265
2733
  console.warn(`Skipping Claude Code stream logging (could not create ${logDir}): ${message}`);
2266
2734
  return void 0;
2267
2735
  }
2268
- const filePath = path8.join(logDir, buildLogFilename(request, this.targetName));
2736
+ const filePath = path9.join(logDir, buildLogFilename(request, this.targetName));
2269
2737
  try {
2270
2738
  const logger = await ClaudeCodeStreamLogger.create({
2271
2739
  filePath,
@@ -2670,10 +3138,10 @@ function escapeShellArg(arg) {
2670
3138
  }
2671
3139
  async function defaultClaudeCodeRunner(options) {
2672
3140
  const tempId = randomUUID();
2673
- const stdoutFile = path8.join(tmpdir(), `agentv-cc-${tempId}-stdout`);
2674
- const stderrFile = path8.join(tmpdir(), `agentv-cc-${tempId}-stderr`);
2675
- const exitFile = path8.join(tmpdir(), `agentv-cc-${tempId}-exit`);
2676
- const pidFile = path8.join(tmpdir(), `agentv-cc-${tempId}-pid`);
3141
+ const stdoutFile = path9.join(tmpdir(), `agentv-cc-${tempId}-stdout`);
3142
+ const stderrFile = path9.join(tmpdir(), `agentv-cc-${tempId}-stderr`);
3143
+ const exitFile = path9.join(tmpdir(), `agentv-cc-${tempId}-exit`);
3144
+ const pidFile = path9.join(tmpdir(), `agentv-cc-${tempId}-pid`);
2677
3145
  try {
2678
3146
  return await runClaudeCodeWithTempFiles(options, stdoutFile, stderrFile, exitFile, pidFile);
2679
3147
  } finally {
@@ -2713,8 +3181,8 @@ async function runClaudeCodeWithTempFiles(options, stdoutFile, stderrFile, exitF
2713
3181
  let lastStdoutSize = 0;
2714
3182
  const readFileIfExists = async (filePath) => {
2715
3183
  try {
2716
- const { readFile: readFile7 } = await import("node:fs/promises");
2717
- return await readFile7(filePath, "utf8");
3184
+ const { readFile: readFile8 } = await import("node:fs/promises");
3185
+ return await readFile8(filePath, "utf8");
2718
3186
  } catch {
2719
3187
  return "";
2720
3188
  }
@@ -2789,7 +3257,7 @@ async function runClaudeCodeWithTempFiles(options, stdoutFile, stderrFile, exitF
2789
3257
  import { exec as execWithCallback } from "node:child_process";
2790
3258
  import fs from "node:fs/promises";
2791
3259
  import os from "node:os";
2792
- import path9 from "node:path";
3260
+ import path10 from "node:path";
2793
3261
  import { promisify } from "node:util";
2794
3262
  import { z } from "zod";
2795
3263
  var ToolCallSchema = z.object({
@@ -2797,7 +3265,8 @@ var ToolCallSchema = z.object({
2797
3265
  input: z.unknown().optional(),
2798
3266
  output: z.unknown().optional(),
2799
3267
  id: z.string().optional(),
2800
- timestamp: z.string().optional()
3268
+ timestamp: z.string().optional(),
3269
+ duration_ms: z.number().optional()
2801
3270
  });
2802
3271
  var OutputMessageInputSchema = z.object({
2803
3272
  role: z.string(),
@@ -2805,6 +3274,7 @@ var OutputMessageInputSchema = z.object({
2805
3274
  content: z.unknown().optional(),
2806
3275
  tool_calls: z.array(ToolCallSchema).optional(),
2807
3276
  timestamp: z.string().optional(),
3277
+ duration_ms: z.number().optional(),
2808
3278
  metadata: z.record(z.unknown()).optional()
2809
3279
  });
2810
3280
  var TokenUsageSchema = z.object({
@@ -2843,8 +3313,16 @@ function convertOutputMessages(messages) {
2843
3313
  role: msg.role,
2844
3314
  name: msg.name,
2845
3315
  content: msg.content,
2846
- toolCalls: msg.tool_calls,
3316
+ toolCalls: msg.tool_calls?.map((tc) => ({
3317
+ tool: tc.tool,
3318
+ input: tc.input,
3319
+ output: tc.output,
3320
+ id: tc.id,
3321
+ timestamp: tc.timestamp,
3322
+ durationMs: tc.duration_ms
3323
+ })),
2847
3324
  timestamp: msg.timestamp,
3325
+ durationMs: msg.duration_ms,
2848
3326
  metadata: msg.metadata
2849
3327
  }));
2850
3328
  }
@@ -3246,7 +3724,7 @@ function normalizeInputFiles2(inputFiles) {
3246
3724
  }
3247
3725
  const unique = /* @__PURE__ */ new Map();
3248
3726
  for (const inputFile of inputFiles) {
3249
- const absolutePath = path9.resolve(inputFile);
3727
+ const absolutePath = path10.resolve(inputFile);
3250
3728
  if (!unique.has(absolutePath)) {
3251
3729
  unique.set(absolutePath, absolutePath);
3252
3730
  }
@@ -3260,7 +3738,7 @@ function formatFileList(files, template) {
3260
3738
  const formatter = template ?? "{path}";
3261
3739
  return files.map((filePath) => {
3262
3740
  const escapedPath = shellEscape(filePath);
3263
- const escapedName = shellEscape(path9.basename(filePath));
3741
+ const escapedName = shellEscape(path10.basename(filePath));
3264
3742
  return formatter.replaceAll("{path}", escapedPath).replaceAll("{basename}", escapedName);
3265
3743
  }).join(" ");
3266
3744
  }
@@ -3284,7 +3762,7 @@ function generateOutputFilePath(evalCaseId, extension = ".json") {
3284
3762
  const safeEvalId = evalCaseId || "unknown";
3285
3763
  const timestamp = Date.now();
3286
3764
  const random = Math.random().toString(36).substring(2, 9);
3287
- return path9.join(os.tmpdir(), `agentv-${safeEvalId}-${timestamp}-${random}${extension}`);
3765
+ return path10.join(os.tmpdir(), `agentv-${safeEvalId}-${timestamp}-${random}${extension}`);
3288
3766
  }
3289
3767
  function formatTimeoutSuffix2(timeoutMs) {
3290
3768
  if (!timeoutMs || timeoutMs <= 0) {
@@ -3300,7 +3778,7 @@ import { randomUUID as randomUUID2 } from "node:crypto";
3300
3778
  import { constants as constants2, createWriteStream as createWriteStream2 } from "node:fs";
3301
3779
  import { access as access2, mkdir as mkdir2, mkdtemp as mkdtemp2, rm as rm2, writeFile as writeFile2 } from "node:fs/promises";
3302
3780
  import { tmpdir as tmpdir2 } from "node:os";
3303
- import path10 from "node:path";
3781
+ import path11 from "node:path";
3304
3782
  import { promisify as promisify2 } from "node:util";
3305
3783
 
3306
3784
  // src/evaluation/providers/codex-log-tracker.ts
@@ -3395,7 +3873,7 @@ var CodexProvider = class {
3395
3873
  const promptContent = `${systemPrompt}
3396
3874
 
3397
3875
  ${basePrompt}`;
3398
- const promptFile = path10.join(workspaceRoot, PROMPT_FILENAME2);
3876
+ const promptFile = path11.join(workspaceRoot, PROMPT_FILENAME2);
3399
3877
  await writeFile2(promptFile, promptContent, "utf8");
3400
3878
  const args = this.buildCodexArgs();
3401
3879
  const cwd = this.resolveCwd(workspaceRoot);
@@ -3445,7 +3923,7 @@ ${basePrompt}`;
3445
3923
  if (!this.config.cwd) {
3446
3924
  return workspaceRoot;
3447
3925
  }
3448
- return path10.resolve(this.config.cwd);
3926
+ return path11.resolve(this.config.cwd);
3449
3927
  }
3450
3928
  buildCodexArgs() {
3451
3929
  const args = [
@@ -3487,7 +3965,7 @@ ${basePrompt}`;
3487
3965
  }
3488
3966
  }
3489
3967
  async createWorkspace() {
3490
- return await mkdtemp2(path10.join(tmpdir2(), WORKSPACE_PREFIX2));
3968
+ return await mkdtemp2(path11.join(tmpdir2(), WORKSPACE_PREFIX2));
3491
3969
  }
3492
3970
  async cleanupWorkspace(workspaceRoot) {
3493
3971
  try {
@@ -3501,9 +3979,9 @@ ${basePrompt}`;
3501
3979
  return void 0;
3502
3980
  }
3503
3981
  if (this.config.logDir) {
3504
- return path10.resolve(this.config.logDir);
3982
+ return path11.resolve(this.config.logDir);
3505
3983
  }
3506
- return path10.join(process.cwd(), ".agentv", "logs", "codex");
3984
+ return path11.join(process.cwd(), ".agentv", "logs", "codex");
3507
3985
  }
3508
3986
  async createStreamLogger(request) {
3509
3987
  const logDir = this.resolveLogDirectory();
@@ -3517,7 +3995,7 @@ ${basePrompt}`;
3517
3995
  console.warn(`Skipping Codex stream logging (could not create ${logDir}): ${message}`);
3518
3996
  return void 0;
3519
3997
  }
3520
- const filePath = path10.join(logDir, buildLogFilename2(request, this.targetName));
3998
+ const filePath = path11.join(logDir, buildLogFilename2(request, this.targetName));
3521
3999
  try {
3522
4000
  const logger = await CodexStreamLogger.create({
3523
4001
  filePath,
@@ -3732,7 +4210,7 @@ function tryParseJsonValue2(rawLine) {
3732
4210
  async function locateExecutable(candidate) {
3733
4211
  const includesPathSeparator = candidate.includes("/") || candidate.includes("\\");
3734
4212
  if (includesPathSeparator) {
3735
- const resolved = path10.isAbsolute(candidate) ? candidate : path10.resolve(candidate);
4213
+ const resolved = path11.isAbsolute(candidate) ? candidate : path11.resolve(candidate);
3736
4214
  const executablePath = await ensureWindowsExecutableVariant(resolved);
3737
4215
  await access2(executablePath, constants2.F_OK);
3738
4216
  return executablePath;
@@ -4245,7 +4723,7 @@ import { randomUUID as randomUUID3 } from "node:crypto";
4245
4723
  import { createWriteStream as createWriteStream3 } from "node:fs";
4246
4724
  import { mkdir as mkdir3, mkdtemp as mkdtemp3, rm as rm3, writeFile as writeFile3 } from "node:fs/promises";
4247
4725
  import { tmpdir as tmpdir3 } from "node:os";
4248
- import path11 from "node:path";
4726
+ import path12 from "node:path";
4249
4727
 
4250
4728
  // src/evaluation/providers/pi-log-tracker.ts
4251
4729
  var GLOBAL_LOGS_KEY3 = Symbol.for("agentv.piLogs");
@@ -4329,7 +4807,7 @@ var PiCodingAgentProvider = class {
4329
4807
  const workspaceRoot = await this.createWorkspace();
4330
4808
  const logger = await this.createStreamLogger(request).catch(() => void 0);
4331
4809
  try {
4332
- const promptFile = path11.join(workspaceRoot, PROMPT_FILENAME3);
4810
+ const promptFile = path12.join(workspaceRoot, PROMPT_FILENAME3);
4333
4811
  await writeFile3(promptFile, request.question, "utf8");
4334
4812
  const args = this.buildPiArgs(request.question, inputFiles);
4335
4813
  const cwd = this.resolveCwd(workspaceRoot);
@@ -4371,7 +4849,7 @@ var PiCodingAgentProvider = class {
4371
4849
  if (!this.config.cwd) {
4372
4850
  return workspaceRoot;
4373
4851
  }
4374
- return path11.resolve(this.config.cwd);
4852
+ return path12.resolve(this.config.cwd);
4375
4853
  }
4376
4854
  buildPiArgs(prompt, inputFiles) {
4377
4855
  const args = [];
@@ -4460,7 +4938,7 @@ ${prompt}`;
4460
4938
  return env;
4461
4939
  }
4462
4940
  async createWorkspace() {
4463
- return await mkdtemp3(path11.join(tmpdir3(), WORKSPACE_PREFIX3));
4941
+ return await mkdtemp3(path12.join(tmpdir3(), WORKSPACE_PREFIX3));
4464
4942
  }
4465
4943
  async cleanupWorkspace(workspaceRoot) {
4466
4944
  try {
@@ -4470,9 +4948,9 @@ ${prompt}`;
4470
4948
  }
4471
4949
  resolveLogDirectory() {
4472
4950
  if (this.config.logDir) {
4473
- return path11.resolve(this.config.logDir);
4951
+ return path12.resolve(this.config.logDir);
4474
4952
  }
4475
- return path11.join(process.cwd(), ".agentv", "logs", "pi-coding-agent");
4953
+ return path12.join(process.cwd(), ".agentv", "logs", "pi-coding-agent");
4476
4954
  }
4477
4955
  async createStreamLogger(request) {
4478
4956
  const logDir = this.resolveLogDirectory();
@@ -4486,7 +4964,7 @@ ${prompt}`;
4486
4964
  console.warn(`Skipping Pi stream logging (could not create ${logDir}): ${message}`);
4487
4965
  return void 0;
4488
4966
  }
4489
- const filePath = path11.join(logDir, buildLogFilename3(request, this.targetName));
4967
+ const filePath = path12.join(logDir, buildLogFilename3(request, this.targetName));
4490
4968
  try {
4491
4969
  const logger = await PiStreamLogger.create({
4492
4970
  filePath,
@@ -4919,7 +5397,7 @@ async function defaultPiRunner(options) {
4919
5397
  }
4920
5398
 
4921
5399
  // src/evaluation/providers/vscode.ts
4922
- import path12 from "node:path";
5400
+ import path13 from "node:path";
4923
5401
  import {
4924
5402
  dispatchAgentSession,
4925
5403
  dispatchBatchAgent,
@@ -5094,7 +5572,7 @@ function buildMandatoryPrereadBlock2(guidelineFiles, attachmentFiles) {
5094
5572
  return "";
5095
5573
  }
5096
5574
  const buildList = (files) => files.map((absolutePath) => {
5097
- const fileName = path12.basename(absolutePath);
5575
+ const fileName = path13.basename(absolutePath);
5098
5576
  const fileUri = pathToFileUri2(absolutePath);
5099
5577
  return `* [${fileName}](${fileUri})`;
5100
5578
  });
@@ -5119,8 +5597,8 @@ function collectGuidelineFiles2(attachments, guidelinePatterns) {
5119
5597
  }
5120
5598
  const unique = /* @__PURE__ */ new Map();
5121
5599
  for (const attachment of attachments) {
5122
- const absolutePath = path12.resolve(attachment);
5123
- const normalized = absolutePath.split(path12.sep).join("/");
5600
+ const absolutePath = path13.resolve(attachment);
5601
+ const normalized = absolutePath.split(path13.sep).join("/");
5124
5602
  if (isGuidelineFile(normalized, guidelinePatterns)) {
5125
5603
  if (!unique.has(absolutePath)) {
5126
5604
  unique.set(absolutePath, absolutePath);
@@ -5135,7 +5613,7 @@ function collectAttachmentFiles(attachments) {
5135
5613
  }
5136
5614
  const unique = /* @__PURE__ */ new Map();
5137
5615
  for (const attachment of attachments) {
5138
- const absolutePath = path12.resolve(attachment);
5616
+ const absolutePath = path13.resolve(attachment);
5139
5617
  if (!unique.has(absolutePath)) {
5140
5618
  unique.set(absolutePath, absolutePath);
5141
5619
  }
@@ -5143,7 +5621,7 @@ function collectAttachmentFiles(attachments) {
5143
5621
  return Array.from(unique.values());
5144
5622
  }
5145
5623
  function pathToFileUri2(filePath) {
5146
- const absolutePath = path12.isAbsolute(filePath) ? filePath : path12.resolve(filePath);
5624
+ const absolutePath = path13.isAbsolute(filePath) ? filePath : path13.resolve(filePath);
5147
5625
  const normalizedPath = absolutePath.replace(/\\/g, "/");
5148
5626
  if (/^[a-zA-Z]:\//.test(normalizedPath)) {
5149
5627
  return `file:///${normalizedPath}`;
@@ -5156,7 +5634,7 @@ function normalizeAttachments(attachments) {
5156
5634
  }
5157
5635
  const deduped = /* @__PURE__ */ new Set();
5158
5636
  for (const attachment of attachments) {
5159
- deduped.add(path12.resolve(attachment));
5637
+ deduped.add(path13.resolve(attachment));
5160
5638
  }
5161
5639
  return Array.from(deduped);
5162
5640
  }
@@ -5165,7 +5643,7 @@ function mergeAttachments(all) {
5165
5643
  for (const list of all) {
5166
5644
  if (!list) continue;
5167
5645
  for (const inputFile of list) {
5168
- deduped.add(path12.resolve(inputFile));
5646
+ deduped.add(path13.resolve(inputFile));
5169
5647
  }
5170
5648
  }
5171
5649
  return deduped.size > 0 ? Array.from(deduped) : void 0;
@@ -5213,8 +5691,8 @@ total unlocked subagents available: ${result.created.length + result.skippedExis
5213
5691
 
5214
5692
  // src/evaluation/providers/targets-file.ts
5215
5693
  import { constants as constants3 } from "node:fs";
5216
- import { access as access3, readFile as readFile6 } from "node:fs/promises";
5217
- import path13 from "node:path";
5694
+ import { access as access3, readFile as readFile7 } from "node:fs/promises";
5695
+ import path14 from "node:path";
5218
5696
  import { parse as parse3 } from "yaml";
5219
5697
  function isRecord(value) {
5220
5698
  return typeof value === "object" && value !== null && !Array.isArray(value);
@@ -5251,11 +5729,11 @@ async function fileExists3(filePath) {
5251
5729
  }
5252
5730
  }
5253
5731
  async function readTargetDefinitions(filePath) {
5254
- const absolutePath = path13.resolve(filePath);
5732
+ const absolutePath = path14.resolve(filePath);
5255
5733
  if (!await fileExists3(absolutePath)) {
5256
5734
  throw new Error(`targets.yaml not found at ${absolutePath}`);
5257
5735
  }
5258
- const raw = await readFile6(absolutePath, "utf8");
5736
+ const raw = await readFile7(absolutePath, "utf8");
5259
5737
  const parsed = parse3(raw);
5260
5738
  if (!isRecord(parsed)) {
5261
5739
  throw new Error(`targets.yaml at ${absolutePath} must be a YAML object with a 'targets' field`);
@@ -5462,15 +5940,15 @@ async function execFileWithStdinNode(argv, stdinPayload, options) {
5462
5940
  });
5463
5941
  }
5464
5942
  async function execShellWithStdin(command, stdinPayload, options = {}) {
5465
- const { mkdir: mkdir4, readFile: readFile7, rm: rm4, writeFile: writeFile4 } = await import("node:fs/promises");
5943
+ const { mkdir: mkdir4, readFile: readFile8, rm: rm4, writeFile: writeFile4 } = await import("node:fs/promises");
5466
5944
  const { tmpdir: tmpdir4 } = await import("node:os");
5467
- const path15 = await import("node:path");
5945
+ const path16 = await import("node:path");
5468
5946
  const { randomUUID: randomUUID4 } = await import("node:crypto");
5469
- const dir = path15.join(tmpdir4(), `agentv-exec-${randomUUID4()}`);
5947
+ const dir = path16.join(tmpdir4(), `agentv-exec-${randomUUID4()}`);
5470
5948
  await mkdir4(dir, { recursive: true });
5471
- const stdinPath = path15.join(dir, "stdin.txt");
5472
- const stdoutPath = path15.join(dir, "stdout.txt");
5473
- const stderrPath = path15.join(dir, "stderr.txt");
5949
+ const stdinPath = path16.join(dir, "stdin.txt");
5950
+ const stdoutPath = path16.join(dir, "stdout.txt");
5951
+ const stderrPath = path16.join(dir, "stderr.txt");
5474
5952
  await writeFile4(stdinPath, stdinPayload, "utf8");
5475
5953
  const wrappedCommand = process.platform === "win32" ? `(${command}) < ${shellEscapePath(stdinPath)} > ${shellEscapePath(stdoutPath)} 2> ${shellEscapePath(stderrPath)}` : `(${command}) < ${shellEscapePath(stdinPath)} > ${shellEscapePath(stdoutPath)} 2> ${shellEscapePath(stderrPath)}`;
5476
5954
  const { spawn: spawn4 } = await import("node:child_process");
@@ -5500,8 +5978,8 @@ async function execShellWithStdin(command, stdinPayload, options = {}) {
5500
5978
  resolve(code ?? 0);
5501
5979
  });
5502
5980
  });
5503
- const stdout = (await readFile7(stdoutPath, "utf8")).replace(/\r\n/g, "\n");
5504
- const stderr = (await readFile7(stderrPath, "utf8")).replace(/\r\n/g, "\n");
5981
+ const stdout = (await readFile8(stdoutPath, "utf8")).replace(/\r\n/g, "\n");
5982
+ const stderr = (await readFile8(stderrPath, "utf8")).replace(/\r\n/g, "\n");
5505
5983
  return { stdout, stderr, exitCode };
5506
5984
  } finally {
5507
5985
  await rm4(dir, { recursive: true, force: true });
@@ -5773,7 +6251,7 @@ var CodeEvaluator = class {
5773
6251
  outputMessages: context.outputMessages ?? null,
5774
6252
  guidelineFiles: context.evalCase.guideline_paths,
5775
6253
  inputFiles: context.evalCase.file_paths.filter(
5776
- (path15) => !context.evalCase.guideline_paths.includes(path15)
6254
+ (path16) => !context.evalCase.guideline_paths.includes(path16)
5777
6255
  ),
5778
6256
  inputMessages: context.evalCase.input_messages,
5779
6257
  traceSummary: context.traceSummary ?? null,
@@ -5921,6 +6399,15 @@ var rubricEvaluationSchema = z2.object({
5921
6399
  checks: z2.array(rubricCheckResultSchema).describe("Results for each rubric item"),
5922
6400
  overall_reasoning: z2.string().describe("Overall assessment summary (1-2 sentences)")
5923
6401
  });
6402
+ var scoreRangeCheckResultSchema = z2.object({
6403
+ id: z2.string().describe("The ID of the rubric criterion being scored"),
6404
+ score: z2.number().int().min(0).max(10).describe("Integer score 0-10 for this criterion"),
6405
+ reasoning: z2.string().describe("Brief explanation (1-2 sentences) for this score").optional()
6406
+ });
6407
+ var scoreRangeEvaluationSchema = z2.object({
6408
+ checks: z2.array(scoreRangeCheckResultSchema).describe("Scores for each rubric criterion"),
6409
+ overall_reasoning: z2.string().describe("Overall assessment summary (1-2 sentences)").optional()
6410
+ });
5924
6411
  var LlmJudgeEvaluator = class {
5925
6412
  kind = "llm_judge";
5926
6413
  resolveJudgeProvider;
@@ -6006,6 +6493,10 @@ var LlmJudgeEvaluator = class {
6006
6493
  `No rubrics found for evaluator "${context.evaluator?.name ?? "llm_judge"}". Run "agentv generate rubrics" first.`
6007
6494
  );
6008
6495
  }
6496
+ const hasScoreRanges = rubrics.some((r) => r.score_ranges && r.score_ranges.length > 0);
6497
+ if (hasScoreRanges) {
6498
+ return this.evaluateWithScoreRanges(context, judgeProvider, rubrics);
6499
+ }
6009
6500
  const prompt = this.buildRubricPrompt(context, rubrics);
6010
6501
  const systemPrompt = buildRubricOutputSchema();
6011
6502
  const evaluatorRawRequest = {
@@ -6031,6 +6522,84 @@ var LlmJudgeEvaluator = class {
6031
6522
  evaluatorRawRequest
6032
6523
  };
6033
6524
  }
6525
+ /**
6526
+ * Evaluate using score-range rubrics (analytic rubric scoring).
6527
+ * Each criterion is scored 0-10 and normalized to 0-1.
6528
+ */
6529
+ async evaluateWithScoreRanges(context, judgeProvider, rubrics) {
6530
+ const prompt = this.buildScoreRangePrompt(context, rubrics);
6531
+ const systemPrompt = buildScoreRangeOutputSchema();
6532
+ const evaluatorRawRequest = {
6533
+ userPrompt: prompt,
6534
+ systemPrompt,
6535
+ target: judgeProvider.targetName
6536
+ };
6537
+ const { data } = await this.runWithRetry({
6538
+ context,
6539
+ judgeProvider,
6540
+ systemPrompt,
6541
+ userPrompt: prompt,
6542
+ schema: scoreRangeEvaluationSchema
6543
+ });
6544
+ const { score, verdict, hits, misses, details } = calculateScoreRangeResult(data, rubrics);
6545
+ return {
6546
+ score,
6547
+ verdict,
6548
+ hits,
6549
+ misses,
6550
+ expectedAspectCount: rubrics.length,
6551
+ reasoning: data.overall_reasoning,
6552
+ evaluatorRawRequest,
6553
+ details
6554
+ };
6555
+ }
6556
+ /**
6557
+ * Build prompt for score-range rubric evaluation.
6558
+ */
6559
+ buildScoreRangePrompt(context, rubrics) {
6560
+ const formattedQuestion = context.promptInputs.question && context.promptInputs.question.trim().length > 0 ? context.promptInputs.question : context.evalCase.question;
6561
+ const parts = [
6562
+ "You are an expert evaluator. Score the candidate answer on each criterion below using the provided score ranges.",
6563
+ "For each criterion, output an integer score from 0 to 10 based on which score range best matches the answer.",
6564
+ "",
6565
+ "[[ ## question ## ]]",
6566
+ formattedQuestion,
6567
+ "",
6568
+ "[[ ## expected_outcome ## ]]",
6569
+ context.evalCase.expected_outcome,
6570
+ ""
6571
+ ];
6572
+ if (context.evalCase.reference_answer && context.evalCase.reference_answer.trim().length > 0) {
6573
+ parts.push("[[ ## reference_answer ## ]]", context.evalCase.reference_answer, "");
6574
+ }
6575
+ parts.push(
6576
+ "[[ ## candidate_answer ## ]]",
6577
+ context.candidate,
6578
+ "",
6579
+ "[[ ## scoring_criteria ## ]]"
6580
+ );
6581
+ for (const rubric of rubrics) {
6582
+ const weightLabel = rubric.weight !== 1 ? ` (weight: ${rubric.weight})` : "";
6583
+ const minScoreLabel = rubric.required_min_score !== void 0 ? ` [REQUIRED: min score ${rubric.required_min_score}]` : "";
6584
+ parts.push("", `### Criterion: ${rubric.id}${weightLabel}${minScoreLabel}`);
6585
+ if (rubric.expected_outcome) {
6586
+ parts.push(`Description: ${rubric.expected_outcome}`);
6587
+ }
6588
+ if (rubric.score_ranges && rubric.score_ranges.length > 0) {
6589
+ parts.push("Score ranges:");
6590
+ for (const range of rubric.score_ranges) {
6591
+ const [min, max] = range.score_range;
6592
+ const rangeLabel = min === max ? `${min}` : `${min}-${max}`;
6593
+ parts.push(` - Score ${rangeLabel}: ${range.expected_outcome}`);
6594
+ }
6595
+ }
6596
+ }
6597
+ parts.push(
6598
+ "",
6599
+ "For each criterion, provide an integer score 0-10 that matches one of its defined score ranges."
6600
+ );
6601
+ return parts.join("\n");
6602
+ }
6034
6603
  buildRubricPrompt(context, rubrics) {
6035
6604
  const formattedQuestion = context.promptInputs.question && context.promptInputs.question.trim().length > 0 ? context.promptInputs.question : context.evalCase.question;
6036
6605
  const parts = [
@@ -6050,7 +6619,7 @@ var LlmJudgeEvaluator = class {
6050
6619
  for (const rubric of rubrics) {
6051
6620
  const requiredLabel = rubric.required ? " (REQUIRED)" : "";
6052
6621
  const weightLabel = rubric.weight !== 1 ? ` (weight: ${rubric.weight})` : "";
6053
- parts.push(`- [${rubric.id}]${requiredLabel}${weightLabel}: ${rubric.description}`);
6622
+ parts.push(`- [${rubric.id}]${requiredLabel}${weightLabel}: ${rubric.expected_outcome}`);
6054
6623
  }
6055
6624
  parts.push("", "For each rubric, determine if it is satisfied and provide brief reasoning.");
6056
6625
  return parts.join("\n");
@@ -6137,9 +6706,9 @@ function calculateRubricScore(result, rubrics) {
6137
6706
  totalWeight += rubric.weight;
6138
6707
  if (check.satisfied) {
6139
6708
  earnedWeight += rubric.weight;
6140
- hits.push(`[${rubric.id}] ${rubric.description}: ${check.reasoning}`);
6709
+ hits.push(`[${rubric.id}] ${rubric.expected_outcome}: ${check.reasoning}`);
6141
6710
  } else {
6142
- misses.push(`[${rubric.id}] ${rubric.description}: ${check.reasoning}`);
6711
+ misses.push(`[${rubric.id}] ${rubric.expected_outcome}: ${check.reasoning}`);
6143
6712
  if (rubric.required) {
6144
6713
  failedRequired = true;
6145
6714
  }
@@ -6149,6 +6718,76 @@ function calculateRubricScore(result, rubrics) {
6149
6718
  const verdict = failedRequired ? "fail" : scoreToVerdict(score);
6150
6719
  return { score, verdict, hits, misses };
6151
6720
  }
6721
+ function buildScoreRangeOutputSchema() {
6722
+ return `You are an expert evaluator. Score the candidate answer on each criterion.
6723
+ You must return a valid JSON object matching this schema:
6724
+ {
6725
+ "checks": [
6726
+ {
6727
+ "id": "string (criterion id)",
6728
+ "score": integer (0-10),
6729
+ "reasoning": "string (brief explanation for score)"
6730
+ }
6731
+ ],
6732
+ "overall_reasoning": "string (summary, optional)"
6733
+ }
6734
+
6735
+ Important: The "score" must be an integer from 0 to 10 that falls within one of the defined score ranges for that criterion.`;
6736
+ }
6737
+ function calculateScoreRangeResult(result, rubrics) {
6738
+ const rubricMap = new Map(rubrics.map((rubric) => [rubric.id, rubric]));
6739
+ const hits = [];
6740
+ const misses = [];
6741
+ const rawScores = {};
6742
+ let totalWeight = 0;
6743
+ let weightedScoreSum = 0;
6744
+ let failedRequired = false;
6745
+ for (const check of result.checks) {
6746
+ const rubric = rubricMap.get(check.id);
6747
+ if (!rubric) {
6748
+ continue;
6749
+ }
6750
+ const rawScore = Math.max(0, Math.min(10, check.score));
6751
+ const normalizedScore = rawScore / 10;
6752
+ rawScores[rubric.id] = rawScore;
6753
+ totalWeight += rubric.weight;
6754
+ weightedScoreSum += normalizedScore * rubric.weight;
6755
+ let requiredMinScore;
6756
+ if (rubric.required_min_score !== void 0) {
6757
+ requiredMinScore = rubric.required_min_score;
6758
+ } else if (rubric.required === true) {
6759
+ requiredMinScore = 10;
6760
+ }
6761
+ const matchingRange = rubric.score_ranges?.find(
6762
+ (r) => rawScore >= r.score_range[0] && rawScore <= r.score_range[1]
6763
+ );
6764
+ const rangeDescription = matchingRange?.expected_outcome ?? "";
6765
+ const criterionLabel = rubric.expected_outcome ?? rubric.id;
6766
+ const reasoningText = check.reasoning ? `: ${check.reasoning}` : "";
6767
+ const scoreInfo = `[${rubric.id}] ${criterionLabel} - Score: ${rawScore}/10 (${rangeDescription})${reasoningText}`;
6768
+ if (requiredMinScore !== void 0 && rawScore < requiredMinScore) {
6769
+ failedRequired = true;
6770
+ misses.push(scoreInfo);
6771
+ } else if (rawScore >= 7) {
6772
+ hits.push(scoreInfo);
6773
+ } else {
6774
+ misses.push(scoreInfo);
6775
+ }
6776
+ }
6777
+ const score = totalWeight > 0 ? Math.min(1, Math.max(0, weightedScoreSum / totalWeight)) : 0;
6778
+ const verdict = failedRequired ? "fail" : scoreToVerdict(score);
6779
+ return {
6780
+ score,
6781
+ verdict,
6782
+ hits,
6783
+ misses,
6784
+ details: {
6785
+ raw_scores: rawScores,
6786
+ normalization: "score / 10",
6787
+ aggregation: "weighted_average"
6788
+ }
6789
+ };
6790
+ }
6152
6791
 
6153
6792
  // src/evaluation/evaluators/composite.ts
6154
6793
  var DEFAULT_COMPOSITE_AGGREGATOR_PROMPT = `Review the following evaluation results:
@@ -6532,115 +7171,115 @@ var FieldAccuracyEvaluator = class {
6532
7171
  * Evaluate a single field against the expected value.
6533
7172
  */
6534
7173
  evaluateField(fieldConfig, candidateData, expectedData) {
6535
- const { path: path15, match, required = true, weight = 1 } = fieldConfig;
6536
- const candidateValue = resolvePath(candidateData, path15);
6537
- const expectedValue = resolvePath(expectedData, path15);
7174
+ const { path: path16, match, required = true, weight = 1 } = fieldConfig;
7175
+ const candidateValue = resolvePath(candidateData, path16);
7176
+ const expectedValue = resolvePath(expectedData, path16);
6538
7177
  if (expectedValue === void 0) {
6539
7178
  return {
6540
- path: path15,
7179
+ path: path16,
6541
7180
  score: 1,
6542
7181
  // No expected value means no comparison needed
6543
7182
  weight,
6544
7183
  hit: true,
6545
- message: `${path15}: no expected value`
7184
+ message: `${path16}: no expected value`
6546
7185
  };
6547
7186
  }
6548
7187
  if (candidateValue === void 0) {
6549
7188
  if (required) {
6550
7189
  return {
6551
- path: path15,
7190
+ path: path16,
6552
7191
  score: 0,
6553
7192
  weight,
6554
7193
  hit: false,
6555
- message: `${path15} (required, missing)`
7194
+ message: `${path16} (required, missing)`
6556
7195
  };
6557
7196
  }
6558
7197
  return {
6559
- path: path15,
7198
+ path: path16,
6560
7199
  score: 1,
6561
7200
  // Don't penalize missing optional fields
6562
7201
  weight: 0,
6563
7202
  // Zero weight means it won't affect the score
6564
7203
  hit: true,
6565
- message: `${path15}: optional field missing`
7204
+ message: `${path16}: optional field missing`
6566
7205
  };
6567
7206
  }
6568
7207
  switch (match) {
6569
7208
  case "exact":
6570
- return this.compareExact(path15, candidateValue, expectedValue, weight);
7209
+ return this.compareExact(path16, candidateValue, expectedValue, weight);
6571
7210
  case "numeric_tolerance":
6572
7211
  return this.compareNumericTolerance(
6573
- path15,
7212
+ path16,
6574
7213
  candidateValue,
6575
7214
  expectedValue,
6576
7215
  fieldConfig,
6577
7216
  weight
6578
7217
  );
6579
7218
  case "date":
6580
- return this.compareDate(path15, candidateValue, expectedValue, fieldConfig, weight);
7219
+ return this.compareDate(path16, candidateValue, expectedValue, fieldConfig, weight);
6581
7220
  default:
6582
7221
  return {
6583
- path: path15,
7222
+ path: path16,
6584
7223
  score: 0,
6585
7224
  weight,
6586
7225
  hit: false,
6587
- message: `${path15}: unknown match type "${match}"`
7226
+ message: `${path16}: unknown match type "${match}"`
6588
7227
  };
6589
7228
  }
6590
7229
  }
6591
7230
  /**
6592
7231
  * Exact equality comparison.
6593
7232
  */
6594
- compareExact(path15, candidateValue, expectedValue, weight) {
7233
+ compareExact(path16, candidateValue, expectedValue, weight) {
6595
7234
  if (deepEqual(candidateValue, expectedValue)) {
6596
7235
  return {
6597
- path: path15,
7236
+ path: path16,
6598
7237
  score: 1,
6599
7238
  weight,
6600
7239
  hit: true,
6601
- message: path15
7240
+ message: path16
6602
7241
  };
6603
7242
  }
6604
7243
  if (typeof candidateValue !== typeof expectedValue) {
6605
7244
  return {
6606
- path: path15,
7245
+ path: path16,
6607
7246
  score: 0,
6608
7247
  weight,
6609
7248
  hit: false,
6610
- message: `${path15} (type mismatch: got ${typeof candidateValue}, expected ${typeof expectedValue})`
7249
+ message: `${path16} (type mismatch: got ${typeof candidateValue}, expected ${typeof expectedValue})`
6611
7250
  };
6612
7251
  }
6613
7252
  return {
6614
- path: path15,
7253
+ path: path16,
6615
7254
  score: 0,
6616
7255
  weight,
6617
7256
  hit: false,
6618
- message: `${path15} (value mismatch)`
7257
+ message: `${path16} (value mismatch)`
6619
7258
  };
6620
7259
  }
6621
7260
  /**
6622
7261
  * Numeric comparison with absolute or relative tolerance.
6623
7262
  */
6624
- compareNumericTolerance(path15, candidateValue, expectedValue, fieldConfig, weight) {
7263
+ compareNumericTolerance(path16, candidateValue, expectedValue, fieldConfig, weight) {
6625
7264
  const { tolerance = 0, relative = false } = fieldConfig;
6626
7265
  const candidateNum = toNumber(candidateValue);
6627
7266
  const expectedNum = toNumber(expectedValue);
6628
7267
  if (candidateNum === null || expectedNum === null) {
6629
7268
  return {
6630
- path: path15,
7269
+ path: path16,
6631
7270
  score: 0,
6632
7271
  weight,
6633
7272
  hit: false,
6634
- message: `${path15} (non-numeric value)`
7273
+ message: `${path16} (non-numeric value)`
6635
7274
  };
6636
7275
  }
6637
7276
  if (!Number.isFinite(candidateNum) || !Number.isFinite(expectedNum)) {
6638
7277
  return {
6639
- path: path15,
7278
+ path: path16,
6640
7279
  score: 0,
6641
7280
  weight,
6642
7281
  hit: false,
6643
- message: `${path15} (invalid numeric value)`
7282
+ message: `${path16} (invalid numeric value)`
6644
7283
  };
6645
7284
  }
6646
7285
  const diff = Math.abs(candidateNum - expectedNum);
@@ -6653,61 +7292,61 @@ var FieldAccuracyEvaluator = class {
6653
7292
  }
6654
7293
  if (withinTolerance) {
6655
7294
  return {
6656
- path: path15,
7295
+ path: path16,
6657
7296
  score: 1,
6658
7297
  weight,
6659
7298
  hit: true,
6660
- message: `${path15} (within tolerance: diff=${diff.toFixed(2)})`
7299
+ message: `${path16} (within tolerance: diff=${diff.toFixed(2)})`
6661
7300
  };
6662
7301
  }
6663
7302
  return {
6664
- path: path15,
7303
+ path: path16,
6665
7304
  score: 0,
6666
7305
  weight,
6667
7306
  hit: false,
6668
- message: `${path15} (outside tolerance: diff=${diff.toFixed(2)}, tolerance=${tolerance})`
7307
+ message: `${path16} (outside tolerance: diff=${diff.toFixed(2)}, tolerance=${tolerance})`
6669
7308
  };
6670
7309
  }
6671
7310
  /**
6672
7311
  * Date comparison with format normalization.
6673
7312
  */
6674
- compareDate(path15, candidateValue, expectedValue, fieldConfig, weight) {
7313
+ compareDate(path16, candidateValue, expectedValue, fieldConfig, weight) {
6675
7314
  const formats = fieldConfig.formats ?? DEFAULT_DATE_FORMATS;
6676
7315
  const candidateDate = parseDate(String(candidateValue), formats);
6677
7316
  const expectedDate = parseDate(String(expectedValue), formats);
6678
7317
  if (candidateDate === null) {
6679
7318
  return {
6680
- path: path15,
7319
+ path: path16,
6681
7320
  score: 0,
6682
7321
  weight,
6683
7322
  hit: false,
6684
- message: `${path15} (unparseable candidate date)`
7323
+ message: `${path16} (unparseable candidate date)`
6685
7324
  };
6686
7325
  }
6687
7326
  if (expectedDate === null) {
6688
7327
  return {
6689
- path: path15,
7328
+ path: path16,
6690
7329
  score: 0,
6691
7330
  weight,
6692
7331
  hit: false,
6693
- message: `${path15} (unparseable expected date)`
7332
+ message: `${path16} (unparseable expected date)`
6694
7333
  };
6695
7334
  }
6696
7335
  if (candidateDate.getFullYear() === expectedDate.getFullYear() && candidateDate.getMonth() === expectedDate.getMonth() && candidateDate.getDate() === expectedDate.getDate()) {
6697
7336
  return {
6698
- path: path15,
7337
+ path: path16,
6699
7338
  score: 1,
6700
7339
  weight,
6701
7340
  hit: true,
6702
- message: path15
7341
+ message: path16
6703
7342
  };
6704
7343
  }
6705
7344
  return {
6706
- path: path15,
7345
+ path: path16,
6707
7346
  score: 0,
6708
7347
  weight,
6709
7348
  hit: false,
6710
- message: `${path15} (date mismatch: got ${formatDateISO(candidateDate)}, expected ${formatDateISO(expectedDate)})`
7349
+ message: `${path16} (date mismatch: got ${formatDateISO(candidateDate)}, expected ${formatDateISO(expectedDate)})`
6711
7350
  };
6712
7351
  }
6713
7352
  /**
@@ -6747,11 +7386,11 @@ var FieldAccuracyEvaluator = class {
6747
7386
  };
6748
7387
  }
6749
7388
  };
6750
- function resolvePath(obj, path15) {
6751
- if (!path15 || !obj) {
7389
+ function resolvePath(obj, path16) {
7390
+ if (!path16 || !obj) {
6752
7391
  return void 0;
6753
7392
  }
6754
- const parts = path15.split(/\.|\[|\]/).filter((p) => p.length > 0);
7393
+ const parts = path16.split(/\.|\[|\]/).filter((p) => p.length > 0);
6755
7394
  let current = obj;
6756
7395
  for (const part of parts) {
6757
7396
  if (current === null || current === void 0) {
@@ -6976,6 +7615,27 @@ function argsMatch(expected, actual) {
6976
7615
  }
6977
7616
  return true;
6978
7617
  }
7618
+ function checkLatency(toolName, maxDurationMs, actualDurationMs) {
7619
+ if (maxDurationMs === void 0) {
7620
+ return { status: "skip", message: "" };
7621
+ }
7622
+ if (actualDurationMs === void 0) {
7623
+ return {
7624
+ status: "skip",
7625
+ message: `No duration data for ${toolName}; latency assertion skipped`
7626
+ };
7627
+ }
7628
+ if (actualDurationMs <= maxDurationMs) {
7629
+ return {
7630
+ status: "pass",
7631
+ message: `${toolName} completed in ${actualDurationMs}ms (max: ${maxDurationMs}ms)`
7632
+ };
7633
+ }
7634
+ return {
7635
+ status: "fail",
7636
+ message: `${toolName} took ${actualDurationMs}ms (max: ${maxDurationMs}ms)`
7637
+ };
7638
+ }
6979
7639
  var ToolTrajectoryEvaluator = class {
6980
7640
  kind = "tool_trajectory";
6981
7641
  config;
@@ -7034,7 +7694,8 @@ var ToolTrajectoryEvaluator = class {
7034
7694
  for (const call of message.toolCalls) {
7035
7695
  toolCalls.push({
7036
7696
  name: call.tool,
7037
- args: call.input
7697
+ args: call.input,
7698
+ durationMs: call.durationMs
7038
7699
  });
7039
7700
  }
7040
7701
  }
@@ -7102,17 +7763,27 @@ var ToolTrajectoryEvaluator = class {
7102
7763
  }
7103
7764
  const hits = [];
7104
7765
  const misses = [];
7766
+ const warnings = [];
7105
7767
  let actualIndex = 0;
7768
+ let sequenceHits = 0;
7769
+ let latencyHits = 0;
7770
+ let latencySkips = 0;
7771
+ const latencyAssertionCount = expected.filter(
7772
+ (item) => item.maxDurationMs !== void 0
7773
+ ).length;
7106
7774
  for (let i = 0; i < expected.length; i++) {
7107
7775
  const expectedItem = expected[i];
7108
7776
  const expectedTool = expectedItem.tool;
7109
7777
  let found = false;
7110
7778
  let argsMismatch = false;
7779
+ let matchedCall;
7111
7780
  while (actualIndex < toolCalls.length) {
7112
7781
  const actualCall = toolCalls[actualIndex];
7113
7782
  if (actualCall.name === expectedTool) {
7114
7783
  if (argsMatch(expectedItem.args, actualCall.args)) {
7115
7784
  hits.push(`Found ${expectedTool} at position ${actualIndex}`);
7785
+ sequenceHits++;
7786
+ matchedCall = actualCall;
7116
7787
  actualIndex++;
7117
7788
  found = true;
7118
7789
  break;
@@ -7129,14 +7800,35 @@ var ToolTrajectoryEvaluator = class {
7129
7800
  if (!found && !argsMismatch) {
7130
7801
  misses.push(`Expected ${expectedTool} at position ${i}, not found in remaining trace`);
7131
7802
  }
7803
+ if (found && matchedCall) {
7804
+ const latencyResult = checkLatency(
7805
+ expectedTool,
7806
+ expectedItem.maxDurationMs,
7807
+ matchedCall.durationMs
7808
+ );
7809
+ if (latencyResult.status === "pass") {
7810
+ hits.push(latencyResult.message);
7811
+ latencyHits++;
7812
+ } else if (latencyResult.status === "fail") {
7813
+ misses.push(latencyResult.message);
7814
+ } else if (latencyResult.message) {
7815
+ warnings.push(latencyResult.message);
7816
+ latencySkips++;
7817
+ }
7818
+ }
7132
7819
  }
7133
- const score = hits.length / expected.length;
7820
+ for (const warning of warnings) {
7821
+ console.warn(`[tool_trajectory] ${warning}`);
7822
+ }
7823
+ const effectiveLatencyAssertions = latencyAssertionCount - latencySkips;
7824
+ const totalAssertions = expected.length + effectiveLatencyAssertions;
7825
+ const score = totalAssertions > 0 ? (sequenceHits + latencyHits) / totalAssertions : 1;
7134
7826
  return {
7135
7827
  score,
7136
7828
  verdict: scoreToVerdict(score),
7137
7829
  hits,
7138
7830
  misses,
7139
- expectedAspectCount: expected.length
7831
+ expectedAspectCount: totalAssertions
7140
7832
  };
7141
7833
  }
7142
7834
  evaluateExact(toolCalls) {
@@ -7152,6 +7844,13 @@ var ToolTrajectoryEvaluator = class {
7152
7844
  }
7153
7845
  const hits = [];
7154
7846
  const misses = [];
7847
+ const warnings = [];
7848
+ let sequenceHits = 0;
7849
+ let latencyHits = 0;
7850
+ let latencySkips = 0;
7851
+ const latencyAssertionCount = expected.filter(
7852
+ (item) => item.maxDurationMs !== void 0
7853
+ ).length;
7155
7854
  if (toolCalls.length !== expected.length) {
7156
7855
  misses.push(`Expected ${expected.length} tool calls, got ${toolCalls.length}`);
7157
7856
  }
@@ -7161,33 +7860,58 @@ var ToolTrajectoryEvaluator = class {
7161
7860
  const expectedTool = expectedItem.tool;
7162
7861
  const actualCall = toolCalls[i];
7163
7862
  const actualTool = actualCall.name;
7863
+ let sequenceMatched = false;
7164
7864
  if (actualTool === expectedTool) {
7165
7865
  if (argsMatch(expectedItem.args, actualCall.args)) {
7166
7866
  hits.push(`Position ${i}: ${expectedTool}`);
7867
+ sequenceHits++;
7868
+ sequenceMatched = true;
7167
7869
  } else {
7168
7870
  misses.push(`Position ${i}: ${expectedTool} args mismatch`);
7169
7871
  }
7170
7872
  } else {
7171
7873
  misses.push(`Position ${i}: expected ${expectedTool}, got ${actualTool}`);
7172
7874
  }
7875
+ if (sequenceMatched) {
7876
+ const latencyResult = checkLatency(
7877
+ expectedTool,
7878
+ expectedItem.maxDurationMs,
7879
+ actualCall.durationMs
7880
+ );
7881
+ if (latencyResult.status === "pass") {
7882
+ hits.push(latencyResult.message);
7883
+ latencyHits++;
7884
+ } else if (latencyResult.status === "fail") {
7885
+ misses.push(latencyResult.message);
7886
+ } else if (latencyResult.message) {
7887
+ warnings.push(latencyResult.message);
7888
+ latencySkips++;
7889
+ }
7890
+ }
7173
7891
  }
7174
7892
  for (let i = checkLength; i < expected.length; i++) {
7175
7893
  misses.push(`Position ${i}: expected ${expected[i].tool}, got nothing`);
7176
7894
  }
7177
- const score = hits.length / expected.length;
7895
+ for (const warning of warnings) {
7896
+ console.warn(`[tool_trajectory] ${warning}`);
7897
+ }
7898
+ const effectiveLatencyAssertions = latencyAssertionCount - latencySkips;
7899
+ const totalAssertions = expected.length + effectiveLatencyAssertions;
7900
+ const score = totalAssertions > 0 ? (sequenceHits + latencyHits) / totalAssertions : 1;
7178
7901
  return {
7179
7902
  score,
7180
7903
  verdict: scoreToVerdict(score),
7181
7904
  hits,
7182
7905
  misses,
7183
- expectedAspectCount: expected.length
7906
+ expectedAspectCount: totalAssertions
7184
7907
  };
7185
7908
  }
7186
7909
  };
7187
7910
 
7188
7911
  // src/evaluation/orchestrator.ts
7189
7912
  import { createHash } from "node:crypto";
7190
- import path14 from "node:path";
7913
+ import path15 from "node:path";
7914
+ import micromatch4 from "micromatch";
7191
7915
 
7192
7916
  // ../../node_modules/.bun/yocto-queue@1.2.2/node_modules/yocto-queue/index.js
7193
7917
  var Node = class {
@@ -7346,17 +8070,17 @@ async function runEvaluation(options) {
7346
8070
  cache,
7347
8071
  useCache,
7348
8072
  now,
7349
- evalId,
8073
+ filter,
7350
8074
  verbose,
7351
8075
  evalCases: preloadedEvalCases,
7352
8076
  onResult,
7353
8077
  onProgress
7354
8078
  } = options;
7355
- const evalCases = preloadedEvalCases ?? await loadEvalCases(evalFilePath, repoRoot, { verbose, evalId });
7356
- const filteredEvalCases = filterEvalCases(evalCases, evalId);
8079
+ const evalCases = preloadedEvalCases ?? await loadEvalCases(evalFilePath, repoRoot, { verbose, filter });
8080
+ const filteredEvalCases = filterEvalCases(evalCases, filter);
7357
8081
  if (filteredEvalCases.length === 0) {
7358
- if (evalId) {
7359
- throw new Error(`Eval case with id '${evalId}' not found in ${evalFilePath}`);
8082
+ if (filter) {
8083
+ throw new Error(`No eval cases matched filter '${filter}' in ${evalFilePath}`);
7360
8084
  }
7361
8085
  return [];
7362
8086
  }
@@ -7932,7 +8656,10 @@ async function runEvaluatorList(options) {
7932
8656
  attempt,
7933
8657
  promptInputs,
7934
8658
  now,
7935
- judgeProvider
8659
+ judgeProvider,
8660
+ outputMessages,
8661
+ traceSummary,
8662
+ agentTimeoutMs
7936
8663
  });
7937
8664
  const weight = evaluator.weight ?? 1;
7938
8665
  scored.push({ score: score2, name: evaluator.name, type: evaluator.type, weight });
@@ -7986,7 +8713,7 @@ async function runEvaluatorList(options) {
7986
8713
  });
7987
8714
  }
7988
8715
  if (evaluator.type === "composite") {
7989
- const evalFileDir = evalCase.guideline_paths[0] ? path14.dirname(evalCase.guideline_paths[0]) : process.cwd();
8716
+ const evalFileDir = evalCase.guideline_paths[0] ? path15.dirname(evalCase.guideline_paths[0]) : process.cwd();
7990
8717
  const createEvaluator = (memberConfig) => {
7991
8718
  switch (memberConfig.type) {
7992
8719
  case "llm_judge":
@@ -8267,9 +8994,22 @@ async function runLlmJudgeEvaluator(options) {
8267
8994
  attempt,
8268
8995
  promptInputs,
8269
8996
  now,
8270
- judgeProvider
8997
+ judgeProvider,
8998
+ outputMessages,
8999
+ traceSummary,
9000
+ agentTimeoutMs
8271
9001
  } = options;
8272
- const customPrompt = await resolveCustomPrompt(config);
9002
+ const customPrompt = await resolveCustomPrompt(
9003
+ config,
9004
+ {
9005
+ evalCase,
9006
+ candidate,
9007
+ outputMessages,
9008
+ traceSummary,
9009
+ config: config.config
9010
+ },
9011
+ agentTimeoutMs
9012
+ );
8273
9013
  return evaluatorRegistry.llm_judge.evaluate({
8274
9014
  evalCase,
8275
9015
  candidate,
@@ -8283,23 +9023,70 @@ async function runLlmJudgeEvaluator(options) {
8283
9023
  evaluator: config
8284
9024
  });
8285
9025
  }
8286
- async function resolveCustomPrompt(config) {
8287
- if (config.promptPath) {
9026
+ async function resolveCustomPrompt(promptConfig, context, timeoutMs) {
9027
+ if (promptConfig.resolvedPromptScript && promptConfig.resolvedPromptScript.length > 0) {
9028
+ if (!context) {
9029
+ throw new Error("Context required for executable prompt templates");
9030
+ }
9031
+ return executePromptTemplate(
9032
+ promptConfig.resolvedPromptScript,
9033
+ context,
9034
+ promptConfig.config,
9035
+ timeoutMs
9036
+ );
9037
+ }
9038
+ const promptPath = promptConfig.resolvedPromptPath ?? promptConfig.promptPath;
9039
+ if (promptPath) {
8288
9040
  try {
8289
- const content = await readTextFile(config.promptPath);
9041
+ const content = await readTextFile(promptPath);
8290
9042
  return content;
8291
9043
  } catch (error) {
8292
9044
  const message = error instanceof Error ? error.message : String(error);
8293
- console.warn(`Could not read custom prompt at ${config.promptPath}: ${message}`);
9045
+ console.warn(`Could not read custom prompt at ${promptPath}: ${message}`);
8294
9046
  }
8295
9047
  }
8296
- return config.prompt;
9048
+ const promptValue = promptConfig.prompt;
9049
+ if (typeof promptValue === "string") {
9050
+ return promptValue;
9051
+ }
9052
+ return void 0;
9053
+ }
9054
+ async function executePromptTemplate(script, context, config, timeoutMs) {
9055
+ const payload = {
9056
+ question: context.evalCase.question,
9057
+ expectedOutcome: context.evalCase.expected_outcome,
9058
+ expectedMessages: context.evalCase.expected_messages,
9059
+ referenceAnswer: context.evalCase.reference_answer,
9060
+ candidateAnswer: context.candidate,
9061
+ outputMessages: context.outputMessages ?? null,
9062
+ guidelineFiles: context.evalCase.guideline_paths,
9063
+ inputFiles: context.evalCase.file_paths.filter(
9064
+ (p) => !context.evalCase.guideline_paths.includes(p)
9065
+ ),
9066
+ inputMessages: context.evalCase.input_messages,
9067
+ traceSummary: context.traceSummary ?? null,
9068
+ config: config ?? context.config ?? null
9069
+ };
9070
+ const inputJson = JSON.stringify(toSnakeCaseDeep(payload), null, 2);
9071
+ const scriptPath = script[script.length - 1];
9072
+ const cwd = path15.dirname(scriptPath);
9073
+ try {
9074
+ const stdout = await executeScript(script, inputJson, timeoutMs, cwd);
9075
+ const prompt = stdout.trim();
9076
+ if (!prompt) {
9077
+ throw new Error("Prompt template produced empty output");
9078
+ }
9079
+ return prompt;
9080
+ } catch (error) {
9081
+ const message = error instanceof Error ? error.message : String(error);
9082
+ throw new Error(`Prompt template execution failed: ${message}`);
9083
+ }
8297
9084
  }
8298
- function filterEvalCases(evalCases, evalId) {
8299
- if (!evalId) {
9085
+ function filterEvalCases(evalCases, filter) {
9086
+ if (!filter) {
8300
9087
  return evalCases;
8301
9088
  }
8302
- return evalCases.filter((evalCase) => evalCase.id === evalId);
9089
+ return evalCases.filter((evalCase) => micromatch4.isMatch(evalCase.id, filter));
8303
9090
  }
8304
9091
  function buildEvaluatorRegistry(overrides, resolveJudgeProvider) {
8305
9092
  const llmJudge = overrides?.llm_judge ?? new LlmJudgeEvaluator({
@@ -8457,7 +9244,7 @@ import { generateText as generateText4 } from "ai";
8457
9244
  import { z as z3 } from "zod";
8458
9245
  var rubricItemSchema = z3.object({
8459
9246
  id: z3.string().describe("Short identifier for this rubric (e.g., clarity, completeness)"),
8460
- description: z3.string().describe("What this rubric checks for"),
9247
+ expected_outcome: z3.string().describe("Concrete expected outcome for this rubric item"),
8461
9248
  weight: z3.number().default(1).describe("Relative importance (default 1.0)"),
8462
9249
  required: z3.boolean().default(true).describe("Whether this is a mandatory requirement")
8463
9250
  });
@@ -8477,7 +9264,7 @@ You must return a valid JSON object matching this schema:
8477
9264
  "rubrics": [
8478
9265
  {
8479
9266
  "id": "string (short identifier)",
8480
- "description": "string (what to check)",
9267
+ "expected_outcome": "string (concrete expected outcome for this rubric item)",
8481
9268
  "weight": number (default 1.0),
8482
9269
  "required": boolean (default true)
8483
9270
  }
@@ -8513,7 +9300,7 @@ function buildPrompt(expectedOutcome, question, referenceAnswer) {
8513
9300
  "Each rubric should:",
8514
9301
  "- Be specific and testable",
8515
9302
  "- Have a short, descriptive ID",
8516
- "- Include a clear description of what to check",
9303
+ "- Include a clear expected outcome statement (what a good answer must demonstrate for this rubric)",
8517
9304
  "- Indicate if it is required (mandatory) or optional",
8518
9305
  "- Have an appropriate weight (default 1.0, use higher values for more important aspects)",
8519
9306
  "",
@@ -8560,6 +9347,7 @@ export {
8560
9347
  createAgentKernel,
8561
9348
  createProvider,
8562
9349
  deepEqual,
9350
+ detectFormat,
8563
9351
  ensureVSCodeSubagents,
8564
9352
  executeScript,
8565
9353
  explorationRatio,