agentv 4.17.1 → 4.18.0-next.1

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -1,4 +1,21 @@
1
1
  import { createRequire } from 'node:module'; const require = createRequire(import.meta.url);
2
+ import {
3
+ RESULT_INDEX_FILENAME,
4
+ RESULT_RUNS_DIRNAME,
5
+ aggregateRunDir,
6
+ buildDefaultRunDir,
7
+ buildTestTargetKey,
8
+ deduplicateByTestIdTarget,
9
+ isDirectoryPath,
10
+ normalizeExperimentName,
11
+ parseJsonlResults,
12
+ resolveExistingRunPrimaryPath,
13
+ resolveRunIndexPath,
14
+ resolveRunManifestPath,
15
+ resolveWorkspaceOrFilePath,
16
+ toSnakeCaseDeep,
17
+ writeArtifactsFromResults
18
+ } from "./chunk-HBDOJJFY.js";
2
19
  import {
3
20
  CLI_PLACEHOLDERS,
4
21
  COMMON_TARGET_SETTINGS,
@@ -18,7 +35,7 @@ import {
18
35
  findGitRoot,
19
36
  getResultsRepoStatus,
20
37
  interpolateEnv,
21
- isEvaluatorKind,
38
+ isGraderKind,
22
39
  listTargetNames,
23
40
  loadCasesFromFile,
24
41
  loadConfig,
@@ -40,14 +57,13 @@ import {
40
57
  subscribeToCopilotSdkLogEntries,
41
58
  subscribeToPiLogEntries,
42
59
  syncResultsRepo,
43
- toCamelCaseDeep,
44
- toTranscriptJsonLines
45
- } from "./chunk-IRU2UOWN.js";
60
+ toCamelCaseDeep
61
+ } from "./chunk-RCOAXXHP.js";
46
62
 
47
63
  // package.json
48
64
  var package_default = {
49
65
  name: "agentv",
50
- version: "4.17.1",
66
+ version: "4.18.0-next.1",
51
67
  description: "CLI entry point for AgentV",
52
68
  type: "module",
53
69
  repository: {
@@ -261,9 +277,9 @@ async function discoverTargetsFile(options) {
261
277
  }
262
278
 
263
279
  // src/commands/eval/run-eval.ts
264
- import { constants as constants4, mkdirSync } from "node:fs";
265
- import { access as access5 } from "node:fs/promises";
266
- import path17 from "node:path";
280
+ import { constants as constants4, existsSync as existsSync2, mkdirSync } from "node:fs";
281
+ import { access as access5, readFile as readFile8 } from "node:fs/promises";
282
+ import path15 from "node:path";
267
283
  import { pathToFileURL } from "node:url";
268
284
 
269
285
  // src/version-check.ts
@@ -321,88 +337,15 @@ async function promptContinue() {
321
337
  }
322
338
 
323
339
  // src/commands/results/remote.ts
324
- import path6 from "node:path";
325
-
326
- // src/commands/inspect/utils.ts
327
- import { readFileSync as readFileSync2, readdirSync, statSync as statSync2 } from "node:fs";
328
340
  import path5 from "node:path";
329
341
 
330
- // src/commands/eval/result-layout.ts
331
- import { existsSync, statSync } from "node:fs";
332
- import path3 from "node:path";
333
- var RESULT_INDEX_FILENAME = "index.jsonl";
334
- var RESULT_RUNS_DIRNAME = "runs";
335
- var DEFAULT_EXPERIMENT_NAME = "default";
336
- function normalizeExperimentName(experiment) {
337
- const trimmed = experiment?.trim();
338
- if (!trimmed) {
339
- return DEFAULT_EXPERIMENT_NAME;
340
- }
341
- if (!/^[A-Za-z0-9._-]+$/.test(trimmed)) {
342
- throw new Error(
343
- `Invalid experiment name "${trimmed}". Use only letters, numbers, ".", "_" and "-".`
344
- );
345
- }
346
- return trimmed;
347
- }
348
- function createRunDirName(timestamp = /* @__PURE__ */ new Date()) {
349
- return timestamp.toISOString().replace(/[:.]/g, "-");
350
- }
351
- function buildDefaultRunDir(cwd, experiment, timestamp = /* @__PURE__ */ new Date()) {
352
- return path3.join(
353
- cwd,
354
- ".agentv",
355
- "results",
356
- RESULT_RUNS_DIRNAME,
357
- normalizeExperimentName(experiment),
358
- createRunDirName(timestamp)
359
- );
360
- }
361
- function resolveRunIndexPath(runDir) {
362
- return path3.join(runDir, RESULT_INDEX_FILENAME);
363
- }
364
- function isRunManifestPath(filePath) {
365
- return path3.basename(filePath) === RESULT_INDEX_FILENAME;
366
- }
367
- function resolveExistingRunPrimaryPath(runDir) {
368
- const indexPath = resolveRunIndexPath(runDir);
369
- if (existsSync(indexPath)) {
370
- return indexPath;
371
- }
372
- return void 0;
373
- }
374
- function isDirectoryPath(filePath) {
375
- try {
376
- return statSync(filePath).isDirectory();
377
- } catch {
378
- return false;
379
- }
380
- }
381
- function resolveWorkspaceOrFilePath(filePath) {
382
- if (!isDirectoryPath(filePath)) {
383
- return filePath;
384
- }
385
- const existing = resolveExistingRunPrimaryPath(filePath);
386
- if (!existing) {
387
- throw new Error(`Result workspace is missing ${RESULT_INDEX_FILENAME}: ${filePath}`);
388
- }
389
- return existing;
390
- }
391
- function resolveRunManifestPath(filePath) {
392
- if (isDirectoryPath(filePath)) {
393
- return resolveWorkspaceOrFilePath(filePath);
394
- }
395
- if (!isRunManifestPath(filePath)) {
396
- throw new Error(
397
- `Expected a run workspace directory or ${RESULT_INDEX_FILENAME} manifest: ${filePath}`
398
- );
399
- }
400
- return filePath;
401
- }
342
+ // src/commands/inspect/utils.ts
343
+ import { readFileSync as readFileSync2, readdirSync, statSync } from "node:fs";
344
+ import path4 from "node:path";
402
345
 
403
346
  // src/commands/results/manifest.ts
404
- import { existsSync as existsSync2, readFileSync } from "node:fs";
405
- import path4 from "node:path";
347
+ import { existsSync, readFileSync } from "node:fs";
348
+ import path3 from "node:path";
406
349
  function parseJsonlLines(content) {
407
350
  return content.split(/\r?\n/).map((line) => line.trim()).filter((line) => line.length > 0).map((line) => JSON.parse(line));
408
351
  }
@@ -421,8 +364,8 @@ function readOptionalText(baseDir, relativePath) {
421
364
  if (!relativePath) {
422
365
  return void 0;
423
366
  }
424
- const absolutePath = path4.join(baseDir, relativePath);
425
- if (!existsSync2(absolutePath)) {
367
+ const absolutePath = path3.join(baseDir, relativePath);
368
+ if (!existsSync(absolutePath)) {
426
369
  return void 0;
427
370
  }
428
371
  return readFileSync(absolutePath, "utf8");
@@ -510,8 +453,8 @@ function parseResultManifest(content) {
510
453
  return parseJsonlLines(content);
511
454
  }
512
455
  function resolveResultSourcePath(source, cwd) {
513
- const resolved = path4.isAbsolute(source) ? source : path4.resolve(cwd ?? process.cwd(), source);
514
- if (isDirectoryPath(resolved) || path4.basename(resolved) === RESULT_INDEX_FILENAME) {
456
+ const resolved = path3.isAbsolute(source) ? source : path3.resolve(cwd ?? process.cwd(), source);
457
+ if (isDirectoryPath(resolved) || path3.basename(resolved) === RESULT_INDEX_FILENAME) {
515
458
  return resolveRunManifestPath(resolved);
516
459
  }
517
460
  return resolved;
@@ -520,7 +463,7 @@ function loadManifestResults(sourceFile) {
520
463
  const resolvedSourceFile = resolveRunManifestPath(sourceFile);
521
464
  const content = readFileSync(resolvedSourceFile, "utf8");
522
465
  const records = parseResultManifest(content);
523
- const baseDir = path4.dirname(resolvedSourceFile);
466
+ const baseDir = path3.dirname(resolvedSourceFile);
524
467
  return records.map((record) => hydrateManifestRecord(baseDir, record));
525
468
  }
526
469
  function loadLightweightResults(sourceFile) {
@@ -566,10 +509,10 @@ function padLeft(str, len) {
566
509
  }
567
510
  function loadResultFile(filePath) {
568
511
  const resolvedFilePath = resolveTraceResultPath(filePath);
569
- if (path5.extname(resolvedFilePath) === ".json") {
512
+ if (path4.extname(resolvedFilePath) === ".json") {
570
513
  return loadOtlpTraceFile(resolvedFilePath);
571
514
  }
572
- if (path5.basename(resolvedFilePath) === RESULT_INDEX_FILENAME) {
515
+ if (path4.basename(resolvedFilePath) === RESULT_INDEX_FILENAME) {
573
516
  return loadManifestAsRawResults(resolvedFilePath);
574
517
  }
575
518
  return loadJsonlRecords(resolvedFilePath);
@@ -848,7 +791,7 @@ function toTraceSummary(result) {
848
791
  return toCamelCaseDeep(rawTrace);
849
792
  }
850
793
  function buildRunId(relativeRunPath) {
851
- const normalized = relativeRunPath.split(path5.sep).join("/");
794
+ const normalized = relativeRunPath.split(path4.sep).join("/");
852
795
  const segments = normalized.split("/").filter(Boolean);
853
796
  if (segments.length >= 2) {
854
797
  const experiment = segments.slice(0, -1).join("/");
@@ -863,10 +806,10 @@ function buildRunId(relativeRunPath) {
863
806
  function collectRunManifestPaths(runsDir, currentDir, files) {
864
807
  const primaryPath = resolveExistingRunPrimaryPath(currentDir);
865
808
  if (primaryPath) {
866
- const relativeRunPath = path5.relative(runsDir, currentDir);
809
+ const relativeRunPath = path4.relative(runsDir, currentDir);
867
810
  files.push({
868
811
  filePath: primaryPath,
869
- displayName: path5.basename(currentDir),
812
+ displayName: path4.basename(currentDir),
870
813
  runId: buildRunId(relativeRunPath)
871
814
  });
872
815
  return;
@@ -874,7 +817,7 @@ function collectRunManifestPaths(runsDir, currentDir, files) {
874
817
  const entries = readdirSync(currentDir, { withFileTypes: true });
875
818
  for (const entry of entries) {
876
819
  if (entry.isDirectory()) {
877
- collectRunManifestPaths(runsDir, path5.join(currentDir, entry.name), files);
820
+ collectRunManifestPaths(runsDir, path4.join(currentDir, entry.name), files);
878
821
  }
879
822
  }
880
823
  }
@@ -884,7 +827,7 @@ function listResultFilesFromRunsDir(runsDir, limit) {
884
827
  const entries = readdirSync(runsDir, { withFileTypes: true });
885
828
  for (const entry of entries) {
886
829
  if (entry.isDirectory()) {
887
- collectRunManifestPaths(runsDir, path5.join(runsDir, entry.name), files);
830
+ collectRunManifestPaths(runsDir, path4.join(runsDir, entry.name), files);
888
831
  }
889
832
  }
890
833
  } catch {
@@ -894,7 +837,7 @@ function listResultFilesFromRunsDir(runsDir, limit) {
894
837
  const metas = [];
895
838
  for (const { filePath, displayName, runId } of limited) {
896
839
  try {
897
- const fileStat = statSync2(filePath);
840
+ const fileStat = statSync(filePath);
898
841
  const results = loadResultFile(filePath);
899
842
  const testCount = results.length;
900
843
  const passCount = results.filter((r) => r.score >= DEFAULT_THRESHOLD).length;
@@ -919,7 +862,7 @@ function listResultFilesFromRunsDir(runsDir, limit) {
919
862
  }
920
863
  function listResultFiles(cwd, limit) {
921
864
  return listResultFilesFromRunsDir(
922
- path5.join(cwd, ".agentv", "results", RESULT_RUNS_DIRNAME),
865
+ path4.join(cwd, ".agentv", "results", RESULT_RUNS_DIRNAME),
923
866
  limit
924
867
  );
925
868
  }
@@ -971,17 +914,17 @@ function slugify(value) {
971
914
  return value.trim().replace(/[^A-Za-z0-9._/-]+/g, "-").replace(/\/+/g, "/").replace(/^-+|-+$/g, "").slice(0, 120);
972
915
  }
973
916
  function getRelativeRunPath(cwd, runDir) {
974
- const relative = path6.relative(path6.join(cwd, ".agentv", "results", "runs"), runDir);
975
- if (!relative.startsWith("..") && !path6.isAbsolute(relative)) {
917
+ const relative = path5.relative(path5.join(cwd, ".agentv", "results", "runs"), runDir);
918
+ if (!relative.startsWith("..") && !path5.isAbsolute(relative)) {
976
919
  return relative;
977
920
  }
978
- const experiment = path6.basename(path6.dirname(runDir));
979
- const runName = path6.basename(runDir);
980
- return experiment && experiment !== runName ? path6.join(experiment, runName) : runName;
921
+ const experiment = path5.basename(path5.dirname(runDir));
922
+ const runName = path5.basename(runDir);
923
+ return experiment && experiment !== runName ? path5.join(experiment, runName) : runName;
981
924
  }
982
925
  function buildBranchName(config, payload) {
983
- const timestamp = path6.basename(payload.run_dir);
984
- const evalStem = payload.test_files.length === 1 ? path6.basename(payload.test_files[0]).replace(/\.eval\.ya?ml$/i, "").replace(/\.[^.]+$/i, "") : `${payload.test_files.length}-evals`;
926
+ const timestamp = path5.basename(payload.run_dir);
927
+ const evalStem = payload.test_files.length === 1 ? path5.basename(payload.test_files[0]).replace(/\.eval\.ya?ml$/i, "").replace(/\.[^.]+$/i, "") : `${payload.test_files.length}-evals`;
985
928
  const experiment = slugify(payload.experiment ?? "default");
986
929
  const branchLeaf = slugify(`${experiment}-${evalStem}-${timestamp}`) || timestamp;
987
930
  return `${config.branch_prefix}/${branchLeaf}`;
@@ -1010,7 +953,7 @@ function buildPrBody(payload) {
1010
953
  "",
1011
954
  sections,
1012
955
  "",
1013
- `Run: ${path6.basename(payload.run_dir)}`,
956
+ `Run: ${path5.basename(payload.run_dir)}`,
1014
957
  `Experiment: ${payload.experiment ?? "default"}`,
1015
958
  `Eval Files: ${payload.test_files.join(", ")}`
1016
959
  ].join("\n");
@@ -1025,7 +968,7 @@ async function maybeWarnLargeArtifact(runDir) {
1025
968
  }
1026
969
  async function loadNormalizedResultsConfig(cwd) {
1027
970
  const repoRoot = await findRepoRoot(cwd) ?? cwd;
1028
- const config = await loadConfig(path6.join(cwd, "_"), repoRoot);
971
+ const config = await loadConfig(path5.join(cwd, "_"), repoRoot);
1029
972
  if (!config?.results?.export) {
1030
973
  return void 0;
1031
974
  }
@@ -1109,7 +1052,7 @@ async function maybeAutoExportRunArtifacts(payload) {
1109
1052
  const prepared = await prepareResultsRepoBranch(config, branchName);
1110
1053
  try {
1111
1054
  const relativeRunPath = getRelativeRunPath(payload.cwd, payload.run_dir);
1112
- const destinationDir = path6.join(prepared.repoDir, config.path, relativeRunPath);
1055
+ const destinationDir = path5.join(prepared.repoDir, config.path, relativeRunPath);
1113
1056
  await stageResultsArtifacts({
1114
1057
  repoDir: prepared.repoDir,
1115
1058
  sourceDir: payload.run_dir,
@@ -1143,36 +1086,8 @@ async function maybeAutoExportRunArtifacts(payload) {
1143
1086
  }
1144
1087
  }
1145
1088
 
1146
- // src/commands/eval/artifact-writer.ts
1147
- import { mkdir, readFile, writeFile } from "node:fs/promises";
1148
- import path7 from "node:path";
1149
-
1150
- // src/utils/case-conversion.ts
1151
- function toSnakeCase(str) {
1152
- if (/^[A-Z]/.test(str)) {
1153
- return str;
1154
- }
1155
- return str.replace(/[A-Z]/g, (letter) => `_${letter.toLowerCase()}`);
1156
- }
1157
- function toSnakeCaseDeep(obj) {
1158
- if (obj === null || obj === void 0) {
1159
- return obj;
1160
- }
1161
- if (Array.isArray(obj)) {
1162
- return obj.map((item) => toSnakeCaseDeep(item));
1163
- }
1164
- if (typeof obj === "object") {
1165
- const result = {};
1166
- for (const [key, value] of Object.entries(obj)) {
1167
- const snakeKey = toSnakeCase(key);
1168
- result[snakeKey] = toSnakeCaseDeep(value);
1169
- }
1170
- return result;
1171
- }
1172
- return obj;
1173
- }
1174
-
1175
- // src/commands/eval/artifact-writer.ts
1089
+ // src/commands/eval/benchmark-writer.ts
1090
+ import { writeFile } from "node:fs/promises";
1176
1091
  function computeStats(values) {
1177
1092
  if (values.length === 0) {
1178
1093
  return { mean: 0, stddev: 0 };
@@ -1185,361 +1100,6 @@ function computeStats(values) {
1185
1100
  };
1186
1101
  }
1187
1102
  function computePassRate(result) {
1188
- const scores = result.scores;
1189
- if (scores && scores.length > 0) {
1190
- const passed = scores.filter((s) => s.score >= DEFAULT_THRESHOLD).length;
1191
- return passed / scores.length;
1192
- }
1193
- return (result.score ?? 0) >= DEFAULT_THRESHOLD ? 1 : 0;
1194
- }
1195
- function countToolCalls(result) {
1196
- const toolCalls = {};
1197
- let total = 0;
1198
- const trace = result.trace;
1199
- if (trace?.steps) {
1200
- for (const step of trace.steps) {
1201
- if (step.toolName || step.type === "tool") {
1202
- const name = step.toolName ?? "unknown";
1203
- toolCalls[name] = (toolCalls[name] ?? 0) + 1;
1204
- total += 1;
1205
- }
1206
- }
1207
- }
1208
- return { toolCalls, total };
1209
- }
1210
- function parseWorkspaceChanges(fileChanges) {
1211
- if (!fileChanges) {
1212
- return void 0;
1213
- }
1214
- let filesModified = 0;
1215
- let filesCreated = 0;
1216
- const lines = fileChanges.split("\n");
1217
- for (const line of lines) {
1218
- if (line.startsWith("--- /dev/null")) {
1219
- filesCreated += 1;
1220
- } else if (line.startsWith("--- a/")) {
1221
- filesModified += 1;
1222
- }
1223
- }
1224
- const summaryLines = lines.slice(0, 20);
1225
- const diffSummary = lines.length > 20 ? `${summaryLines.join("\n")}
1226
- ... (${lines.length - 20} more lines)` : fileChanges;
1227
- return {
1228
- files_modified: filesModified,
1229
- files_created: filesCreated,
1230
- diff_summary: diffSummary
1231
- };
1232
- }
1233
- function buildAssertions(result) {
1234
- if (!result.assertions) return [];
1235
- return result.assertions.map((a) => ({
1236
- text: a.text,
1237
- passed: a.passed,
1238
- evidence: a.evidence ?? ""
1239
- }));
1240
- }
1241
- function buildEvaluators(scores) {
1242
- if (!scores || scores.length === 0) {
1243
- return void 0;
1244
- }
1245
- return scores.map((s) => ({
1246
- name: s.name,
1247
- type: s.type,
1248
- score: s.score,
1249
- reasoning: "",
1250
- weight: s.weight,
1251
- verdict: s.verdict,
1252
- assertions: s.assertions,
1253
- details: s.details
1254
- }));
1255
- }
1256
- function buildGradingArtifact(result) {
1257
- const assertions = buildAssertions(result);
1258
- const passed = assertions.filter((e) => e.passed).length;
1259
- const failed = assertions.filter((e) => !e.passed).length;
1260
- const total = assertions.length;
1261
- const { toolCalls, total: totalToolCalls } = countToolCalls(result);
1262
- const errorsEncountered = result.error ? 1 : 0;
1263
- return {
1264
- assertions,
1265
- summary: {
1266
- passed,
1267
- failed,
1268
- total,
1269
- pass_rate: total > 0 ? Math.round(passed / total * 1e3) / 1e3 : 0
1270
- },
1271
- execution_metrics: {
1272
- tool_calls: toolCalls,
1273
- total_tool_calls: totalToolCalls,
1274
- errors_encountered: errorsEncountered
1275
- },
1276
- graders: buildEvaluators(result.scores),
1277
- workspace_changes: parseWorkspaceChanges(result.fileChanges),
1278
- conversation: result.conversationId ? {
1279
- turns: result.trace ? result.trace.steps?.length ?? 0 : 0,
1280
- conversation_id: result.conversationId
1281
- } : void 0
1282
- };
1283
- }
1284
- function buildTimingArtifact(results) {
1285
- let totalInput = 0;
1286
- let totalOutput = 0;
1287
- let totalReasoning = 0;
1288
- let totalDurationMs = 0;
1289
- for (const result of results) {
1290
- const usage = result.tokenUsage;
1291
- if (usage) {
1292
- totalInput += usage.input ?? 0;
1293
- totalOutput += usage.output ?? 0;
1294
- totalReasoning += usage.reasoning ?? 0;
1295
- }
1296
- if (result.durationMs != null) {
1297
- totalDurationMs += result.durationMs;
1298
- }
1299
- }
1300
- return {
1301
- total_tokens: totalInput + totalOutput,
1302
- duration_ms: totalDurationMs,
1303
- total_duration_seconds: Math.round(totalDurationMs / 1e3 * 1e3) / 1e3,
1304
- token_usage: {
1305
- input: totalInput,
1306
- output: totalOutput,
1307
- reasoning: totalReasoning
1308
- }
1309
- };
1310
- }
1311
- function buildBenchmarkArtifact(results, evalFile = "", experiment) {
1312
- const targetSet = /* @__PURE__ */ new Set();
1313
- const testIdSet = /* @__PURE__ */ new Set();
1314
- for (const result of results) {
1315
- targetSet.add(result.target ?? "unknown");
1316
- testIdSet.add(result.testId ?? "unknown");
1317
- }
1318
- const targets = [...targetSet].sort();
1319
- const testIds = [...testIdSet].sort();
1320
- const runSummary = {};
1321
- const notes = [];
1322
- for (const target of targets) {
1323
- const targetResults = results.filter((r) => r.target === target);
1324
- const passRates = targetResults.map(computePassRate);
1325
- const timings = targetResults.filter((r) => r.durationMs != null).map((r) => r.durationMs / 1e3);
1326
- const tokens = targetResults.filter((r) => r.tokenUsage != null).map((r) => {
1327
- const usage = r.tokenUsage;
1328
- return (usage.input ?? 0) + (usage.output ?? 0);
1329
- });
1330
- const entry = {
1331
- pass_rate: computeStats(passRates),
1332
- time_seconds: computeStats(timings),
1333
- tokens: computeStats(tokens)
1334
- };
1335
- const toolCallCounts = targetResults.map((r) => countToolCalls(r).total);
1336
- if (toolCallCounts.some((c2) => c2 > 0)) {
1337
- entry.tool_calls = computeStats(toolCallCounts);
1338
- }
1339
- const costs = targetResults.filter((r) => r.costUsd != null).map((r) => r.costUsd);
1340
- if (costs.length > 0) {
1341
- entry.cost_usd = computeStats(costs);
1342
- }
1343
- runSummary[target] = entry;
1344
- }
1345
- const evaluatorScores = /* @__PURE__ */ new Map();
1346
- for (const result of results) {
1347
- if (result.scores) {
1348
- for (const score of result.scores) {
1349
- const key = `${score.name}:${score.type}`;
1350
- if (!evaluatorScores.has(key)) {
1351
- evaluatorScores.set(key, []);
1352
- }
1353
- evaluatorScores.get(key)?.push(score.score);
1354
- }
1355
- }
1356
- }
1357
- let perEvaluatorSummary;
1358
- if (evaluatorScores.size > 0) {
1359
- perEvaluatorSummary = {};
1360
- for (const [key, scores] of evaluatorScores) {
1361
- perEvaluatorSummary[key] = computeStats(scores);
1362
- }
1363
- }
1364
- const errorCount = results.filter(
1365
- (r) => r.executionStatus != null && r.executionStatus === "execution_error"
1366
- ).length;
1367
- if (errorCount > 0) {
1368
- notes.push(
1369
- `${errorCount} test(s) had execution errors and are included in pass_rate as failures`
1370
- );
1371
- }
1372
- if (results.length === 0) {
1373
- notes.push("No results to summarize");
1374
- }
1375
- const firstResult = results[0];
1376
- const timestamp = firstResult?.timestamp ?? (/* @__PURE__ */ new Date()).toISOString();
1377
- return {
1378
- metadata: {
1379
- eval_file: evalFile,
1380
- timestamp,
1381
- targets,
1382
- tests_run: testIds,
1383
- experiment
1384
- },
1385
- run_summary: runSummary,
1386
- per_grader_summary: perEvaluatorSummary,
1387
- notes
1388
- };
1389
- }
1390
- function safeArtifactPathSegment(value, fallback) {
1391
- const trimmed = value?.trim();
1392
- if (!trimmed) {
1393
- return fallback;
1394
- }
1395
- return trimmed.replace(/[/\\:*?"<>|]/g, "_");
1396
- }
1397
- function safeTestId(testId) {
1398
- return safeArtifactPathSegment(testId, "unknown");
1399
- }
1400
- function getSuite(result) {
1401
- return result.suite;
1402
- }
1403
- function buildArtifactSubdir(result) {
1404
- const segments = [];
1405
- const evalSet = getSuite(result);
1406
- if (evalSet) {
1407
- segments.push(safeArtifactPathSegment(evalSet, "default"));
1408
- }
1409
- segments.push(safeTestId(result.testId));
1410
- return path7.posix.join(...segments);
1411
- }
1412
- function formatOutputMarkdown(output) {
1413
- return output.map((msg) => `@[${msg.role}]:
1414
- ${String(msg.content ?? "")}`).join("\n\n");
1415
- }
1416
- function extractInput(result) {
1417
- const input = result.input;
1418
- if (!input) return null;
1419
- if (typeof input === "string") return input;
1420
- if (Array.isArray(input) && input.length > 0) {
1421
- return formatOutputMarkdown(input);
1422
- }
1423
- return null;
1424
- }
1425
- function buildResultIndexArtifact(result) {
1426
- const artifactSubdir = buildArtifactSubdir(result);
1427
- const input = extractInput(result);
1428
- const hasResponse = Array.isArray(result.output) && result.output.length > 0;
1429
- return {
1430
- timestamp: result.timestamp,
1431
- test_id: result.testId ?? "unknown",
1432
- suite: getSuite(result),
1433
- category: result.category,
1434
- conversation_id: result.conversationId,
1435
- score: result.score,
1436
- target: result.target ?? "unknown",
1437
- scores: result.scores ? toSnakeCaseDeep(result.scores) : void 0,
1438
- execution_status: result.executionStatus,
1439
- error: result.error,
1440
- failure_stage: result.failureStage,
1441
- failure_reason_code: result.failureReasonCode,
1442
- workspace_path: result.workspacePath,
1443
- grading_path: path7.posix.join(artifactSubdir, "grading.json"),
1444
- timing_path: path7.posix.join(artifactSubdir, "timing.json"),
1445
- input_path: input ? path7.posix.join(artifactSubdir, "input.md") : void 0,
1446
- output_path: hasResponse ? path7.posix.join(artifactSubdir, "outputs", "response.md") : void 0,
1447
- response_path: hasResponse ? path7.posix.join(artifactSubdir, "outputs", "response.md") : void 0
1448
- };
1449
- }
1450
- async function writeJsonlFile(filePath, records) {
1451
- const content = records.length === 0 ? "" : `${records.map((record) => JSON.stringify(toSnakeCaseDeep(record))).join("\n")}
1452
- `;
1453
- await writeFile(filePath, content, "utf8");
1454
- }
1455
- function buildTranscriptMessageLines(results) {
1456
- const lines = [];
1457
- for (const result of results) {
1458
- const transcriptLines = toTranscriptJsonLines(
1459
- {
1460
- messages: [...result.input ?? [], ...result.output],
1461
- source: {
1462
- provider: result.target,
1463
- sessionId: result.conversationId ?? result.testId,
1464
- startedAt: result.timestamp
1465
- },
1466
- tokenUsage: result.tokenUsage,
1467
- durationMs: result.durationMs,
1468
- costUsd: result.costUsd
1469
- },
1470
- {
1471
- testId: result.testId,
1472
- target: result.target
1473
- }
1474
- );
1475
- lines.push(...transcriptLines.map((line) => JSON.stringify(line)));
1476
- }
1477
- return lines.length > 0 ? `${lines.join("\n")}
1478
- ` : "";
1479
- }
1480
- async function writeArtifactsFromResults(results, outputDir, options) {
1481
- const testArtifactDir = outputDir;
1482
- const timingPath = path7.join(outputDir, "timing.json");
1483
- const benchmarkPath = path7.join(outputDir, "benchmark.json");
1484
- const indexPath = path7.join(outputDir, RESULT_INDEX_FILENAME);
1485
- await mkdir(outputDir, { recursive: true });
1486
- const indexRecords = [];
1487
- for (const result of results) {
1488
- const grading = buildGradingArtifact(result);
1489
- const timing2 = buildTimingArtifact([result]);
1490
- const artifactSubdir = buildArtifactSubdir(result);
1491
- const testDir = path7.join(outputDir, artifactSubdir);
1492
- const gradingPath = path7.join(testDir, "grading.json");
1493
- const perTestTimingPath = path7.join(testDir, "timing.json");
1494
- await mkdir(testDir, { recursive: true });
1495
- await writeFile(gradingPath, `${JSON.stringify(grading, null, 2)}
1496
- `, "utf8");
1497
- await writeFile(perTestTimingPath, `${JSON.stringify(timing2, null, 2)}
1498
- `, "utf8");
1499
- const input = extractInput(result);
1500
- if (input) {
1501
- await writeFile(path7.join(testDir, "input.md"), input, "utf8");
1502
- }
1503
- if (result.output && result.output.length > 0) {
1504
- const outputsDir = path7.join(testDir, "outputs");
1505
- await mkdir(outputsDir, { recursive: true });
1506
- await writeFile(
1507
- path7.join(outputsDir, "response.md"),
1508
- formatOutputMarkdown(result.output),
1509
- "utf8"
1510
- );
1511
- }
1512
- indexRecords.push({
1513
- ...buildResultIndexArtifact(result),
1514
- experiment: options?.experiment
1515
- });
1516
- }
1517
- const timing = buildTimingArtifact(results);
1518
- await writeFile(timingPath, `${JSON.stringify(timing, null, 2)}
1519
- `, "utf8");
1520
- const benchmark = buildBenchmarkArtifact(results, options?.evalFile, options?.experiment);
1521
- await writeFile(benchmarkPath, `${JSON.stringify(benchmark, null, 2)}
1522
- `, "utf8");
1523
- await writeJsonlFile(indexPath, indexRecords);
1524
- const transcriptPath = path7.join(outputDir, "transcript.jsonl");
1525
- await writeFile(transcriptPath, buildTranscriptMessageLines(results), "utf8");
1526
- return { testArtifactDir, timingPath, benchmarkPath, indexPath };
1527
- }
1528
-
1529
- // src/commands/eval/benchmark-writer.ts
1530
- import { writeFile as writeFile2 } from "node:fs/promises";
1531
- function computeStats2(values) {
1532
- if (values.length === 0) {
1533
- return { mean: 0, stddev: 0 };
1534
- }
1535
- const mean = values.reduce((sum, v) => sum + v, 0) / values.length;
1536
- const variance = values.reduce((sum, v) => sum + (v - mean) ** 2, 0) / values.length;
1537
- return {
1538
- mean: Math.round(mean * 1e3) / 1e3,
1539
- stddev: Math.round(Math.sqrt(variance) * 1e3) / 1e3
1540
- };
1541
- }
1542
- function computePassRate2(result) {
1543
1103
  const scores = result.scores;
1544
1104
  if (scores && scores.length > 0) {
1545
1105
  const passed = scores.filter((s) => s.score >= DEFAULT_THRESHOLD).length;
@@ -1548,7 +1108,7 @@ function computePassRate2(result) {
1548
1108
  return result.score >= DEFAULT_THRESHOLD ? 1 : 0;
1549
1109
  }
1550
1110
  function buildBenchmarkJson(results) {
1551
- const passRates = results.map(computePassRate2);
1111
+ const passRates = results.map(computePassRate);
1552
1112
  const timings = results.filter((r) => r.durationMs != null).map((r) => r.durationMs / 1e3);
1553
1113
  const tokens = results.filter((r) => r.tokenUsage != null).map((r) => {
1554
1114
  const usage = r.tokenUsage;
@@ -1557,29 +1117,29 @@ function buildBenchmarkJson(results) {
1557
1117
  return {
1558
1118
  run_summary: {
1559
1119
  with_skill: {
1560
- pass_rate: computeStats2(passRates),
1561
- time_seconds: computeStats2(timings),
1562
- tokens: computeStats2(tokens)
1120
+ pass_rate: computeStats(passRates),
1121
+ time_seconds: computeStats(timings),
1122
+ tokens: computeStats(tokens)
1563
1123
  }
1564
1124
  }
1565
1125
  };
1566
1126
  }
1567
1127
  async function writeBenchmarkJson(outputPath, results) {
1568
1128
  const benchmark = buildBenchmarkJson(results);
1569
- await writeFile2(outputPath, `${JSON.stringify(benchmark, null, 2)}
1129
+ await writeFile(outputPath, `${JSON.stringify(benchmark, null, 2)}
1570
1130
  `, "utf8");
1571
1131
  }
1572
1132
 
1573
1133
  // src/commands/eval/env.ts
1574
1134
  import { constants as constants3 } from "node:fs";
1575
1135
  import { access as access3 } from "node:fs/promises";
1576
- import path8 from "node:path";
1136
+ import path6 from "node:path";
1577
1137
  import { config as loadDotenv } from "dotenv";
1578
1138
  function uniqueDirs(directories) {
1579
1139
  const seen = /* @__PURE__ */ new Set();
1580
1140
  const result = [];
1581
1141
  for (const dir of directories) {
1582
- const absolute = path8.resolve(dir);
1142
+ const absolute = path6.resolve(dir);
1583
1143
  if (seen.has(absolute)) {
1584
1144
  continue;
1585
1145
  }
@@ -1598,14 +1158,14 @@ async function fileExists2(filePath) {
1598
1158
  }
1599
1159
  function collectAncestorDirectories(start, boundary) {
1600
1160
  const directories = [];
1601
- const boundaryDir = path8.resolve(boundary);
1602
- let current = path8.resolve(start);
1161
+ const boundaryDir = path6.resolve(boundary);
1162
+ let current = path6.resolve(start);
1603
1163
  while (current !== void 0) {
1604
1164
  directories.push(current);
1605
1165
  if (current === boundaryDir) {
1606
1166
  break;
1607
1167
  }
1608
- const parent = path8.dirname(current);
1168
+ const parent = path6.dirname(current);
1609
1169
  if (parent === current) {
1610
1170
  break;
1611
1171
  }
@@ -1615,12 +1175,12 @@ function collectAncestorDirectories(start, boundary) {
1615
1175
  }
1616
1176
  async function loadEnvFromHierarchy(options) {
1617
1177
  const { testFilePath, repoRoot, verbose } = options;
1618
- const testDir = path8.dirname(path8.resolve(testFilePath));
1178
+ const testDir = path6.dirname(path6.resolve(testFilePath));
1619
1179
  const cwd = process.cwd();
1620
1180
  const searchDirs = uniqueDirs([...collectAncestorDirectories(testDir, repoRoot), repoRoot, cwd]);
1621
1181
  const envFiles = [];
1622
1182
  for (const dir of searchDirs) {
1623
- const candidate = path8.join(dir, ".env");
1183
+ const candidate = path6.join(dir, ".env");
1624
1184
  if (await fileExists2(candidate)) {
1625
1185
  envFiles.push(candidate);
1626
1186
  }
@@ -1642,11 +1202,11 @@ async function loadEnvFromHierarchy(options) {
1642
1202
  }
1643
1203
 
1644
1204
  // src/commands/eval/output-writer.ts
1645
- import path14 from "node:path";
1205
+ import path12 from "node:path";
1646
1206
 
1647
1207
  // src/commands/eval/html-writer.ts
1648
- import { mkdir as mkdir2, writeFile as writeFile3 } from "node:fs/promises";
1649
- import path9 from "node:path";
1208
+ import { mkdir, writeFile as writeFile2 } from "node:fs/promises";
1209
+ import path7 from "node:path";
1650
1210
 
1651
1211
  // ../../node_modules/.bun/async-mutex@0.5.0/node_modules/async-mutex/index.mjs
1652
1212
  var E_TIMEOUT = new Error("timeout while waiting for mutex to become available");
@@ -1865,7 +1425,7 @@ var HtmlWriter = class _HtmlWriter {
1865
1425
  this.filePath = filePath;
1866
1426
  }
1867
1427
  static async open(filePath) {
1868
- await mkdir2(path9.dirname(filePath), { recursive: true });
1428
+ await mkdir(path7.dirname(filePath), { recursive: true });
1869
1429
  const writer = new _HtmlWriter(filePath);
1870
1430
  await writer.writeHtml();
1871
1431
  return writer;
@@ -1891,7 +1451,7 @@ var HtmlWriter = class _HtmlWriter {
1891
1451
  }
1892
1452
  async writeHtml() {
1893
1453
  const html = generateHtml(this.results, this.isLive);
1894
- await writeFile3(this.filePath, html, "utf8");
1454
+ await writeFile2(this.filePath, html, "utf8");
1895
1455
  }
1896
1456
  };
1897
1457
  function generateHtml(results, isLive) {
@@ -2326,10 +1886,10 @@ var SCRIPT = `
2326
1886
  h+='<div class="detail-block"><h4>Output</h4><pre class="detail-pre">'+esc(r.output?JSON.stringify(r.output,null,2):"")+"</pre></div>";
2327
1887
  h+="</div>";
2328
1888
 
2329
- /* evaluator results */
1889
+ /* grader results */
2330
1890
  if(r.scores&&r.scores.length>0){
2331
- h+="<h4>Evaluator Results</h4>";
2332
- h+='<table class="eval-table"><thead><tr><th>Evaluator</th><th>Score</th><th>Status</th><th>Assertions</th></tr></thead><tbody>';
1891
+ h+="<h4>Grader Results</h4>";
1892
+ h+='<table class="eval-table"><thead><tr><th>Grader</th><th>Score</th><th>Status</th><th>Assertions</th></tr></thead><tbody>';
2333
1893
  for(var i=0;i<r.scores.length;i++){
2334
1894
  var ev=r.scores[i],evS=ev.score>=0.5?"pass":"fail";
2335
1895
  var evAssertions=ev.assertions||[];
@@ -2375,8 +1935,8 @@ var SCRIPT = `
2375
1935
  `;
2376
1936
 
2377
1937
  // src/commands/eval/json-writer.ts
2378
- import { mkdir as mkdir3, writeFile as writeFile4 } from "node:fs/promises";
2379
- import path10 from "node:path";
1938
+ import { mkdir as mkdir2, writeFile as writeFile3 } from "node:fs/promises";
1939
+ import path8 from "node:path";
2380
1940
  var JsonWriter = class _JsonWriter {
2381
1941
  filePath;
2382
1942
  results = [];
@@ -2385,7 +1945,7 @@ var JsonWriter = class _JsonWriter {
2385
1945
  this.filePath = filePath;
2386
1946
  }
2387
1947
  static async open(filePath) {
2388
- await mkdir3(path10.dirname(filePath), { recursive: true });
1948
+ await mkdir2(path8.dirname(filePath), { recursive: true });
2389
1949
  return new _JsonWriter(filePath);
2390
1950
  }
2391
1951
  async append(result) {
@@ -2412,15 +1972,15 @@ var JsonWriter = class _JsonWriter {
2412
1972
  results: this.results
2413
1973
  };
2414
1974
  const snakeCaseOutput = toSnakeCaseDeep(output);
2415
- await writeFile4(this.filePath, `${JSON.stringify(snakeCaseOutput, null, 2)}
1975
+ await writeFile3(this.filePath, `${JSON.stringify(snakeCaseOutput, null, 2)}
2416
1976
  `, "utf8");
2417
1977
  }
2418
1978
  };
2419
1979
 
2420
1980
  // src/commands/eval/jsonl-writer.ts
2421
1981
  import { createWriteStream } from "node:fs";
2422
- import { mkdir as mkdir4 } from "node:fs/promises";
2423
- import path11 from "node:path";
1982
+ import { mkdir as mkdir3 } from "node:fs/promises";
1983
+ import path9 from "node:path";
2424
1984
  import { finished } from "node:stream/promises";
2425
1985
  var JsonlWriter = class _JsonlWriter {
2426
1986
  stream;
@@ -2429,9 +1989,10 @@ var JsonlWriter = class _JsonlWriter {
2429
1989
  constructor(stream) {
2430
1990
  this.stream = stream;
2431
1991
  }
2432
- static async open(filePath) {
2433
- await mkdir4(path11.dirname(filePath), { recursive: true });
2434
- const stream = createWriteStream(filePath, { flags: "w", encoding: "utf8" });
1992
+ static async open(filePath, options) {
1993
+ await mkdir3(path9.dirname(filePath), { recursive: true });
1994
+ const flags = options?.append ? "a" : "w";
1995
+ const stream = createWriteStream(filePath, { flags, encoding: "utf8" });
2435
1996
  return new _JsonlWriter(stream);
2436
1997
  }
2437
1998
  async append(record) {
@@ -2461,8 +2022,8 @@ var JsonlWriter = class _JsonlWriter {
2461
2022
  };
2462
2023
 
2463
2024
  // src/commands/eval/junit-writer.ts
2464
- import { mkdir as mkdir5, writeFile as writeFile5 } from "node:fs/promises";
2465
- import path12 from "node:path";
2025
+ import { mkdir as mkdir4, writeFile as writeFile4 } from "node:fs/promises";
2026
+ import path10 from "node:path";
2466
2027
  function escapeXml(str) {
2467
2028
  return str.replace(/&/g, "&amp;").replace(/</g, "&lt;").replace(/>/g, "&gt;").replace(/"/g, "&quot;").replace(/'/g, "&apos;");
2468
2029
  }
@@ -2476,7 +2037,7 @@ var JunitWriter = class _JunitWriter {
2476
2037
  this.threshold = options?.threshold ?? 0.5;
2477
2038
  }
2478
2039
  static async open(filePath, options) {
2479
- await mkdir5(path12.dirname(filePath), { recursive: true });
2040
+ await mkdir4(path10.dirname(filePath), { recursive: true });
2480
2041
  return new _JunitWriter(filePath, options);
2481
2042
  }
2482
2043
  async append(result) {
@@ -2545,14 +2106,14 @@ ${testCases.join("\n")}
2545
2106
  ${suiteXmls.join("\n")}
2546
2107
  </testsuites>
2547
2108
  `;
2548
- await writeFile5(this.filePath, xml, "utf8");
2109
+ await writeFile4(this.filePath, xml, "utf8");
2549
2110
  }
2550
2111
  };
2551
2112
 
2552
2113
  // src/commands/eval/yaml-writer.ts
2553
2114
  import { createWriteStream as createWriteStream2 } from "node:fs";
2554
- import { mkdir as mkdir6 } from "node:fs/promises";
2555
- import path13 from "node:path";
2115
+ import { mkdir as mkdir5 } from "node:fs/promises";
2116
+ import path11 from "node:path";
2556
2117
  import { finished as finished2 } from "node:stream/promises";
2557
2118
  import { stringify as stringifyYaml } from "yaml";
2558
2119
  var YamlWriter = class _YamlWriter {
@@ -2564,7 +2125,7 @@ var YamlWriter = class _YamlWriter {
2564
2125
  this.stream = stream;
2565
2126
  }
2566
2127
  static async open(filePath) {
2567
- await mkdir6(path13.dirname(filePath), { recursive: true });
2128
+ await mkdir5(path11.dirname(filePath), { recursive: true });
2568
2129
  const stream = createWriteStream2(filePath, { flags: "w", encoding: "utf8" });
2569
2130
  return new _YamlWriter(stream);
2570
2131
  }
@@ -2604,10 +2165,10 @@ var YamlWriter = class _YamlWriter {
2604
2165
  };
2605
2166
 
2606
2167
  // src/commands/eval/output-writer.ts
2607
- async function createOutputWriter(filePath, format) {
2168
+ async function createOutputWriter(filePath, format, options) {
2608
2169
  switch (format) {
2609
2170
  case "jsonl":
2610
- return JsonlWriter.open(filePath);
2171
+ return JsonlWriter.open(filePath, { append: options?.append });
2611
2172
  case "yaml":
2612
2173
  return YamlWriter.open(filePath);
2613
2174
  case "html":
@@ -2620,7 +2181,7 @@ async function createOutputWriter(filePath, format) {
2620
2181
  }
2621
2182
  var SUPPORTED_EXTENSIONS = /* @__PURE__ */ new Set([".jsonl", ".json", ".xml", ".yaml", ".yml", ".html", ".htm"]);
2622
2183
  function createWriterFromPath(filePath, options) {
2623
- const ext = path14.extname(filePath).toLowerCase();
2184
+ const ext = path12.extname(filePath).toLowerCase();
2624
2185
  switch (ext) {
2625
2186
  case ".jsonl":
2626
2187
  return JsonlWriter.open(filePath);
@@ -2719,12 +2280,12 @@ var ProgressDisplay = class {
2719
2280
  }
2720
2281
  addLogPaths(paths) {
2721
2282
  const newPaths = [];
2722
- for (const path19 of paths) {
2723
- if (this.logPathSet.has(path19)) {
2283
+ for (const path17 of paths) {
2284
+ if (this.logPathSet.has(path17)) {
2724
2285
  continue;
2725
2286
  }
2726
- this.logPathSet.add(path19);
2727
- newPaths.push(path19);
2287
+ this.logPathSet.add(path17);
2288
+ newPaths.push(path17);
2728
2289
  }
2729
2290
  if (newPaths.length === 0) {
2730
2291
  return;
@@ -2777,8 +2338,8 @@ async function loadNonErrorResults(jsonlPath) {
2777
2338
  }
2778
2339
 
2779
2340
  // src/commands/eval/run-cache.ts
2780
- import { mkdir as mkdir7, readFile as readFile2, writeFile as writeFile6 } from "node:fs/promises";
2781
- import path15 from "node:path";
2341
+ import { mkdir as mkdir6, readFile, writeFile as writeFile5 } from "node:fs/promises";
2342
+ import path13 from "node:path";
2782
2343
  var CACHE_FILENAME = "cache.json";
2783
2344
  function resolveRunCacheFile(cache) {
2784
2345
  if (cache.lastRunDir) {
@@ -2787,27 +2348,27 @@ function resolveRunCacheFile(cache) {
2787
2348
  return "";
2788
2349
  }
2789
2350
  function cachePath(cwd) {
2790
- return path15.join(cwd, ".agentv", CACHE_FILENAME);
2351
+ return path13.join(cwd, ".agentv", CACHE_FILENAME);
2791
2352
  }
2792
2353
  async function loadRunCache(cwd) {
2793
2354
  try {
2794
- const content = await readFile2(cachePath(cwd), "utf-8");
2355
+ const content = await readFile(cachePath(cwd), "utf-8");
2795
2356
  return JSON.parse(content);
2796
2357
  } catch {
2797
2358
  return void 0;
2798
2359
  }
2799
2360
  }
2800
2361
  async function saveRunCache(cwd, resultPath) {
2801
- if (path15.basename(resultPath) !== RESULT_INDEX_FILENAME) {
2362
+ if (path13.basename(resultPath) !== RESULT_INDEX_FILENAME) {
2802
2363
  return;
2803
2364
  }
2804
- const dir = path15.join(cwd, ".agentv");
2805
- await mkdir7(dir, { recursive: true });
2365
+ const dir = path13.join(cwd, ".agentv");
2366
+ await mkdir6(dir, { recursive: true });
2806
2367
  const cache = {
2807
- lastRunDir: path15.dirname(resultPath),
2368
+ lastRunDir: path13.dirname(resultPath),
2808
2369
  timestamp: (/* @__PURE__ */ new Date()).toISOString()
2809
2370
  };
2810
- await writeFile6(cachePath(cwd), `${JSON.stringify(cache, null, 2)}
2371
+ await writeFile5(cachePath(cwd), `${JSON.stringify(cache, null, 2)}
2811
2372
  `, "utf-8");
2812
2373
  }
2813
2374
 
@@ -3072,13 +2633,13 @@ function formatMatrixSummary(results) {
3072
2633
  }
3073
2634
 
3074
2635
  // ../../packages/core/dist/evaluation/validation/index.js
3075
- import { readFile as readFile3 } from "node:fs/promises";
3076
- import path16 from "node:path";
2636
+ import { readFile as readFile2 } from "node:fs/promises";
2637
+ import path14 from "node:path";
3077
2638
  import { parse } from "yaml";
3078
2639
  import { readFile as readFile22, readdir } from "node:fs/promises";
3079
2640
  import path22 from "node:path";
3080
2641
  import { parse as parse2 } from "yaml";
3081
- import { readFile as readFile32 } from "node:fs/promises";
2642
+ import { readFile as readFile3 } from "node:fs/promises";
3082
2643
  import path32 from "node:path";
3083
2644
  import { parse as parse3 } from "yaml";
3084
2645
  import { readFile as readFile4 } from "node:fs/promises";
@@ -3097,7 +2658,7 @@ var SCHEMA_TARGETS_V2 = "agentv-targets-v2.2";
3097
2658
  var SCHEMA_CONFIG_V2 = "agentv-config-v2";
3098
2659
  async function detectFileType(filePath) {
3099
2660
  try {
3100
- const content = await readFile3(filePath, "utf8");
2661
+ const content = await readFile2(filePath, "utf8");
3101
2662
  const parsed = parse(content);
3102
2663
  if (Array.isArray(parsed)) {
3103
2664
  return "cases";
@@ -3125,8 +2686,8 @@ async function detectFileType(filePath) {
3125
2686
  }
3126
2687
  }
3127
2688
  function inferFileTypeFromPath(filePath) {
3128
- const normalized = path16.normalize(filePath).replace(/\\/g, "/");
3129
- const basename = path16.basename(filePath);
2689
+ const normalized = path14.normalize(filePath).replace(/\\/g, "/");
2690
+ const basename = path14.basename(filePath);
3130
2691
  if (normalized.includes("/.agentv/")) {
3131
2692
  if (basename === "config.yaml" || basename === "config.yml") {
3132
2693
  return "config";
@@ -3723,7 +3284,7 @@ function validateAssertArray(assertField, parentLocation, filePath, errors, cust
3723
3284
  continue;
3724
3285
  }
3725
3286
  const typeValue = rawTypeValue.replace(/_/g, "-");
3726
- if (!isEvaluatorKind(typeValue) && !customAssertionTypes.has(typeValue)) {
3287
+ if (!isGraderKind(typeValue) && !customAssertionTypes.has(typeValue)) {
3727
3288
  errors.push({
3728
3289
  severity: "warning",
3729
3290
  filePath,
@@ -3897,7 +3458,7 @@ async function validateCasesFile(filePath) {
3897
3458
  const absolutePath = path32.resolve(filePath);
3898
3459
  let parsed;
3899
3460
  try {
3900
- const content = await readFile32(absolutePath, "utf8");
3461
+ const content = await readFile3(absolutePath, "utf8");
3901
3462
  parsed = parse3(content);
3902
3463
  } catch (error) {
3903
3464
  errors.push({
@@ -5005,6 +4566,12 @@ Errors in ${targetsFilePath}:`);
5005
4566
 
5006
4567
  // src/commands/eval/run-eval.ts
5007
4568
  var DEFAULT_WORKERS = 3;
4569
+ function shouldSkipExistingResultForResume(result, rerunFailed) {
4570
+ if (rerunFailed) {
4571
+ return result.executionStatus === "ok";
4572
+ }
4573
+ return result.executionStatus !== "execution_error";
4574
+ }
5008
4575
  function normalizeBoolean(value) {
5009
4576
  return value === true;
5010
4577
  }
@@ -5175,6 +4742,8 @@ function normalizeOptions(rawOptions, config, yamlExecution) {
5175
4742
  otelCaptureContent: normalizeBoolean(rawOptions.otelCaptureContent) || yamlExecution?.otel_capture_content === true,
5176
4743
  otelGroupTurns: normalizeBoolean(rawOptions.otelGroupTurns) || yamlExecution?.otel_group_turns === true,
5177
4744
  retryErrors: normalizeString(rawOptions.retryErrors),
4745
+ resume: normalizeBoolean(rawOptions.resume) || normalizeBoolean(rawOptions.rerunFailed),
4746
+ rerunFailed: normalizeBoolean(rawOptions.rerunFailed),
5178
4747
  workspaceMode,
5179
4748
  workspacePath,
5180
4749
  // Precedence: CLI > YAML config > TS config
@@ -5201,7 +4770,7 @@ async function ensureFileExists(filePath, description) {
5201
4770
  function buildDefaultOutputPathForExperiment(cwd, experiment) {
5202
4771
  const runDir = buildDefaultRunDir(cwd, experiment);
5203
4772
  mkdirSync(runDir, { recursive: true });
5204
- return path17.join(runDir, "index.jsonl");
4773
+ return path15.join(runDir, "index.jsonl");
5205
4774
  }
5206
4775
  function createProgressReporter(maxWorkers, options) {
5207
4776
  const display = new ProgressDisplay(maxWorkers, options);
@@ -5215,7 +4784,7 @@ function createProgressReporter(maxWorkers, options) {
5215
4784
  };
5216
4785
  }
5217
4786
  function makeTestCaseKey(testFilePath, testId) {
5218
- return `${path17.resolve(testFilePath)}::${testId}`;
4787
+ return `${path15.resolve(testFilePath)}::${testId}`;
5219
4788
  }
5220
4789
  function resolveTargetLabel(requestedName, resolvedName) {
5221
4790
  if (resolvedName !== requestedName) {
@@ -5277,7 +4846,7 @@ async function prepareFileMetadata(params) {
5277
4846
  repoRoot,
5278
4847
  verbose: options.verbose
5279
4848
  });
5280
- const relativePath = path17.relative(cwd, testFilePath);
4849
+ const relativePath = path15.relative(cwd, testFilePath);
5281
4850
  const category = deriveCategory(relativePath);
5282
4851
  const suite = await loadTestSuite(testFilePath, repoRoot, {
5283
4852
  verbose: options.verbose,
@@ -5302,7 +4871,7 @@ async function prepareFileMetadata(params) {
5302
4871
  selections = [
5303
4872
  {
5304
4873
  selection: transcriptSelection,
5305
- inlineTargetLabel: `transcript (${path17.basename(options.transcript)})`
4874
+ inlineTargetLabel: `transcript (${path15.basename(options.transcript)})`
5306
4875
  }
5307
4876
  ];
5308
4877
  } else {
@@ -5372,7 +4941,7 @@ async function prepareFileMetadata(params) {
5372
4941
  yamlWorkers: suite.workers,
5373
4942
  yamlCache: suite.cacheConfig?.enabled,
5374
4943
  yamlCachePath: suite.cacheConfig?.cachePath,
5375
- totalBudgetUsd: suite.totalBudgetUsd,
4944
+ budgetUsd: suite.budgetUsd,
5376
4945
  failOnError: suite.failOnError,
5377
4946
  threshold: suite.threshold,
5378
4947
  tags: suite.metadata?.tags
@@ -5398,7 +4967,7 @@ async function runSingleEvalFile(params) {
5398
4967
  testCases,
5399
4968
  trialsConfig,
5400
4969
  matrixMode,
5401
- totalBudgetUsd,
4970
+ budgetUsd,
5402
4971
  failOnError,
5403
4972
  providerFactory
5404
4973
  } = params;
@@ -5464,7 +5033,7 @@ async function runSingleEvalFile(params) {
5464
5033
  workspacePath: options.workspacePath,
5465
5034
  keepWorkspaces: options.keepWorkspaces,
5466
5035
  trials: trialsConfig,
5467
- totalBudgetUsd,
5036
+ budgetUsd,
5468
5037
  failOnError,
5469
5038
  graderTarget: options.graderTarget,
5470
5039
  model: options.model,
@@ -5537,7 +5106,7 @@ async function runEvalCommand(input) {
5537
5106
  );
5538
5107
  }
5539
5108
  const repoRoot = await findRepoRoot(cwd);
5540
- const yamlConfig = await loadConfig(path17.join(cwd, "_"), repoRoot);
5109
+ const yamlConfig = await loadConfig(path15.join(cwd, "_"), repoRoot);
5541
5110
  if (yamlConfig?.required_version) {
5542
5111
  await enforceRequiredVersion(yamlConfig.required_version, {
5543
5112
  strict: normalizeBoolean(input.rawOptions.strict)
@@ -5552,7 +5121,7 @@ async function runEvalCommand(input) {
5552
5121
  }
5553
5122
  let retryNonErrorResults;
5554
5123
  if (options.retryErrors) {
5555
- const retryPath = path17.resolve(options.retryErrors);
5124
+ const retryPath = path15.resolve(options.retryErrors);
5556
5125
  await ensureFileExists(retryPath, "Retry-errors JSONL file");
5557
5126
  const completedIds = await loadFullyCompletedTestIds(retryPath);
5558
5127
  const errorIds = await loadErrorTestIds(retryPath);
@@ -5565,8 +5134,37 @@ async function runEvalCommand(input) {
5565
5134
  console.log(`Skipping ${completedIds.length} already-completed test(s).`);
5566
5135
  }
5567
5136
  }
5137
+ let resumeSkipKeys;
5138
+ let isResumeAppend = false;
5139
+ if (options.resume && !options.retryErrors) {
5140
+ const explicitResumeDir = options.outputDir ?? options.artifacts;
5141
+ if (explicitResumeDir) {
5142
+ const resumeIndexPath = path15.join(path15.resolve(explicitResumeDir), "index.jsonl");
5143
+ if (existsSync2(resumeIndexPath)) {
5144
+ const content = await readFile8(resumeIndexPath, "utf8");
5145
+ const existingResults = parseJsonlResults(content);
5146
+ resumeSkipKeys = /* @__PURE__ */ new Set();
5147
+ for (const r of existingResults) {
5148
+ if (shouldSkipExistingResultForResume(r, options.rerunFailed)) {
5149
+ resumeSkipKeys.add(buildTestTargetKey(r.testId, r.target));
5150
+ }
5151
+ }
5152
+ isResumeAppend = true;
5153
+ const modeLabel = options.rerunFailed ? "Rerun-failed" : "Resume";
5154
+ console.log(
5155
+ `${modeLabel}: found ${existingResults.length} existing result(s), skipping ${resumeSkipKeys.size} completed.`
5156
+ );
5157
+ } else {
5158
+ console.log("Resume: no existing index.jsonl found, starting fresh run.");
5159
+ }
5160
+ } else {
5161
+ console.warn(
5162
+ "Warning: --resume requires --output <dir> to identify the run directory. Ignoring --resume."
5163
+ );
5164
+ }
5165
+ }
5568
5166
  if (options.workspacePath) {
5569
- const resolvedWorkspace = path17.resolve(options.workspacePath);
5167
+ const resolvedWorkspace = path15.resolve(options.workspacePath);
5570
5168
  try {
5571
5169
  const { stat: stat2 } = await import("node:fs/promises");
5572
5170
  const stats = await stat2(resolvedWorkspace);
@@ -5607,25 +5205,25 @@ async function runEvalCommand(input) {
5607
5205
  let outputPath;
5608
5206
  let usesDefaultArtifactWorkspace;
5609
5207
  if (explicitDir) {
5610
- runDir = path17.resolve(explicitDir);
5208
+ runDir = path15.resolve(explicitDir);
5611
5209
  mkdirSync(runDir, { recursive: true });
5612
- outputPath = path17.join(runDir, "index.jsonl");
5210
+ outputPath = path15.join(runDir, "index.jsonl");
5613
5211
  usesDefaultArtifactWorkspace = true;
5614
5212
  } else if (options.outPath) {
5615
- outputPath = path17.resolve(options.outPath);
5616
- runDir = path17.dirname(outputPath);
5213
+ outputPath = path15.resolve(options.outPath);
5214
+ runDir = path15.dirname(outputPath);
5617
5215
  mkdirSync(runDir, { recursive: true });
5618
5216
  usesDefaultArtifactWorkspace = false;
5619
5217
  } else {
5620
5218
  outputPath = buildDefaultOutputPathForExperiment(cwd, options.experiment);
5621
- runDir = path17.dirname(outputPath);
5219
+ runDir = path15.dirname(outputPath);
5622
5220
  usesDefaultArtifactWorkspace = true;
5623
5221
  }
5624
5222
  let otelExporter = null;
5625
5223
  const useFileExport = !!options.otelFile;
5626
5224
  if (options.exportOtel || useFileExport) {
5627
5225
  try {
5628
- const { OtelTraceExporter, OTEL_BACKEND_PRESETS } = await import("./dist-U5EXNMON.js");
5226
+ const { OtelTraceExporter, OTEL_BACKEND_PRESETS } = await import("./dist-7W4OI3X2.js");
5629
5227
  let endpoint = process.env.OTEL_EXPORTER_OTLP_ENDPOINT;
5630
5228
  let headers = {};
5631
5229
  if (options.otelBackend) {
@@ -5649,7 +5247,7 @@ async function runEvalCommand(input) {
5649
5247
  headers,
5650
5248
  captureContent,
5651
5249
  groupTurns: options.otelGroupTurns,
5652
- otlpFilePath: options.otelFile ? path17.resolve(options.otelFile) : void 0
5250
+ otlpFilePath: options.otelFile ? path15.resolve(options.otelFile) : void 0
5653
5251
  });
5654
5252
  const initialized = await otelExporter.init();
5655
5253
  if (!initialized) {
@@ -5666,7 +5264,7 @@ async function runEvalCommand(input) {
5666
5264
  }
5667
5265
  }
5668
5266
  const primaryWritePath = outputPath;
5669
- const resolvedExportPaths = options.exportPaths.map((p) => path17.resolve(p));
5267
+ const resolvedExportPaths = options.exportPaths.map((p) => path15.resolve(p));
5670
5268
  console.log(`Artifact directory: ${runDir}`);
5671
5269
  if (resolvedExportPaths.length > 0) {
5672
5270
  console.log("Export files:");
@@ -5674,9 +5272,9 @@ async function runEvalCommand(input) {
5674
5272
  console.log(` ${p}`);
5675
5273
  }
5676
5274
  }
5677
- const resolvedTestFiles = input.testFiles.map((file) => path17.resolve(file));
5275
+ const resolvedTestFiles = input.testFiles.map((file) => path15.resolve(file));
5678
5276
  if (options.otelFile) {
5679
- console.log(`OTLP JSON file: ${path17.resolve(options.otelFile)}`);
5277
+ console.log(`OTLP JSON file: ${path15.resolve(options.otelFile)}`);
5680
5278
  }
5681
5279
  const evaluationRunner = await resolveEvaluationRunner();
5682
5280
  const allResults = [];
@@ -5716,7 +5314,7 @@ async function runEvalCommand(input) {
5716
5314
  for (const [testFilePath, meta] of fileMetadata.entries()) {
5717
5315
  if (!matchesTagFilters(meta.tags, options.tags, options.excludeTags)) {
5718
5316
  fileMetadata.delete(testFilePath);
5719
- skippedFiles.push(path17.relative(cwd, testFilePath));
5317
+ skippedFiles.push(path15.relative(cwd, testFilePath));
5720
5318
  }
5721
5319
  }
5722
5320
  if (skippedFiles.length > 0 && options.verbose) {
@@ -5737,7 +5335,7 @@ async function runEvalCommand(input) {
5737
5335
  cliNoCache: options.noCache,
5738
5336
  yamlCache: yamlCacheEnabled
5739
5337
  });
5740
- const cache = cacheEnabled ? new ResponseCache(yamlCachePath ? path17.resolve(yamlCachePath) : void 0) : void 0;
5338
+ const cache = cacheEnabled ? new ResponseCache(yamlCachePath ? path15.resolve(yamlCachePath) : void 0) : void 0;
5741
5339
  if (cacheEnabled) {
5742
5340
  console.log(`Response cache: enabled${yamlCachePath ? ` (${yamlCachePath})` : ""}`);
5743
5341
  }
@@ -5747,14 +5345,25 @@ async function runEvalCommand(input) {
5747
5345
  throw new Error("--threshold must be between 0 and 1");
5748
5346
  }
5749
5347
  const writerOptions = resolvedThreshold !== void 0 ? { threshold: resolvedThreshold } : void 0;
5750
- const outputWriter = await createOutputWriter(primaryWritePath, "jsonl");
5348
+ const outputWriter = await createOutputWriter(primaryWritePath, "jsonl", {
5349
+ append: isResumeAppend
5350
+ });
5751
5351
  const isMatrixMode = Array.from(fileMetadata.values()).some((meta) => meta.selections.length > 1);
5752
5352
  let totalEvalCount = 0;
5353
+ let resumeSkippedCount = 0;
5753
5354
  for (const meta of fileMetadata.values()) {
5754
5355
  const suiteTargetNames = meta.selections.map((s) => s.selection.targetName);
5755
5356
  for (const test of meta.testCases) {
5756
5357
  const testTargetNames = test.targets && test.targets.length > 0 ? test.targets.filter((t) => suiteTargetNames.includes(t)) : suiteTargetNames;
5757
- totalEvalCount += testTargetNames.length > 0 ? testTargetNames.length : 1;
5358
+ const effectiveTargets = testTargetNames.length > 0 ? testTargetNames : ["unknown"];
5359
+ for (const tn of effectiveTargets) {
5360
+ const key = `${test.id}::${tn}`;
5361
+ if (resumeSkipKeys?.has(key)) {
5362
+ resumeSkippedCount++;
5363
+ } else {
5364
+ totalEvalCount++;
5365
+ }
5366
+ }
5758
5367
  }
5759
5368
  }
5760
5369
  if (totalEvalCount === 0) {
@@ -5762,6 +5371,10 @@ async function runEvalCommand(input) {
5762
5371
  console.log("No execution errors or missing cases in the previous run. Nothing to retry.");
5763
5372
  return;
5764
5373
  }
5374
+ if (resumeSkipKeys && resumeSkippedCount > 0) {
5375
+ console.log(`Nothing to resume \u2014 all ${resumeSkippedCount} test(s) already completed.`);
5376
+ return;
5377
+ }
5765
5378
  throw new Error("No tests matched the provided filters.");
5766
5379
  }
5767
5380
  const progressReporter = createProgressReporter(options.workers ?? DEFAULT_WORKERS, {
@@ -5821,7 +5434,7 @@ async function runEvalCommand(input) {
5821
5434
  const activeTestFiles = resolvedTestFiles.filter((f) => fileMetadata.has(f));
5822
5435
  let transcriptProviderFactory;
5823
5436
  if (options.transcript) {
5824
- const { TranscriptProvider } = await import("./dist-U5EXNMON.js");
5437
+ const { TranscriptProvider } = await import("./dist-7W4OI3X2.js");
5825
5438
  const transcriptProvider = await TranscriptProvider.fromFile(options.transcript);
5826
5439
  const totalTests = [...fileMetadata.values()].reduce(
5827
5440
  (sum, meta) => sum + meta.testCases.length,
@@ -5852,7 +5465,10 @@ async function runEvalCommand(input) {
5852
5465
  }
5853
5466
  return true;
5854
5467
  }) : targetPrep.testCases;
5855
- if (applicableTestCases.length === 0) {
5468
+ const filteredTestCases = resumeSkipKeys ? applicableTestCases.filter(
5469
+ (test) => !resumeSkipKeys.has(buildTestTargetKey(test.id, targetName))
5470
+ ) : applicableTestCases;
5471
+ if (filteredTestCases.length === 0) {
5856
5472
  return [];
5857
5473
  }
5858
5474
  try {
@@ -5872,15 +5488,15 @@ async function runEvalCommand(input) {
5872
5488
  displayIdTracker,
5873
5489
  selection,
5874
5490
  inlineTargetLabel,
5875
- testCases: applicableTestCases,
5491
+ testCases: filteredTestCases,
5876
5492
  trialsConfig: options.transcript ? void 0 : targetPrep.trialsConfig,
5877
5493
  matrixMode: targetPrep.selections.length > 1,
5878
- totalBudgetUsd: targetPrep.totalBudgetUsd,
5494
+ budgetUsd: targetPrep.budgetUsd,
5879
5495
  failOnError: targetPrep.failOnError,
5880
5496
  threshold: resolvedThreshold,
5881
5497
  providerFactory: transcriptProviderFactory
5882
5498
  });
5883
- const evalFile = path17.relative(cwd, testFilePath);
5499
+ const evalFile = path15.relative(cwd, testFilePath);
5884
5500
  const existingSummary = remoteEvalSummaries.find(
5885
5501
  (summary2) => summary2.evalFile === evalFile
5886
5502
  );
@@ -5897,10 +5513,10 @@ async function runEvalCommand(input) {
5897
5513
  const message = fileError instanceof Error ? fileError.message : String(fileError);
5898
5514
  console.error(
5899
5515
  `
5900
- [ERROR] \u26A0 Eval file failed: ${path17.basename(testFilePath)} \u2014 ${message}
5516
+ [ERROR] \u26A0 Eval file failed: ${path15.basename(testFilePath)} \u2014 ${message}
5901
5517
  `
5902
5518
  );
5903
- const errorResults = applicableTestCases.map((testCase) => ({
5519
+ const errorResults = filteredTestCases.map((testCase) => ({
5904
5520
  timestamp: (/* @__PURE__ */ new Date()).toISOString(),
5905
5521
  testId: testCase.id,
5906
5522
  score: 0,
@@ -5936,37 +5552,60 @@ async function runEvalCommand(input) {
5936
5552
  `Merged ${retryNonErrorResults.length} non-error result(s) from previous output.`
5937
5553
  );
5938
5554
  }
5555
+ await outputWriter.close().catch(() => void 0);
5556
+ let summaryResults = allResults;
5557
+ if (isResumeAppend && usesDefaultArtifactWorkspace) {
5558
+ const content = await readFile8(outputPath, "utf8");
5559
+ summaryResults = deduplicateByTestIdTarget(parseJsonlResults(content));
5560
+ }
5939
5561
  const thresholdOpts = resolvedThreshold !== void 0 ? { threshold: resolvedThreshold } : void 0;
5940
- const summary = calculateEvaluationSummary(allResults, thresholdOpts);
5562
+ const summary = calculateEvaluationSummary(summaryResults, thresholdOpts);
5941
5563
  console.log(formatEvaluationSummary(summary, thresholdOpts));
5942
5564
  const allExecutionErrors = summary.total > 0 && summary.executionErrorCount === summary.total;
5943
5565
  const thresholdFailed = resolvedThreshold !== void 0 && summary.qualityFailureCount > 0;
5944
- if (isMatrixMode && allResults.length > 0) {
5945
- console.log(formatMatrixSummary(allResults));
5566
+ if (isMatrixMode && summaryResults.length > 0) {
5567
+ console.log(formatMatrixSummary(summaryResults));
5946
5568
  }
5947
5569
  if (options.benchmarkJson && allResults.length > 0) {
5948
- const benchmarkPath = path17.resolve(options.benchmarkJson);
5570
+ const benchmarkPath = path15.resolve(options.benchmarkJson);
5949
5571
  await writeBenchmarkJson(benchmarkPath, allResults);
5950
5572
  console.log(`Benchmark written to: ${benchmarkPath}`);
5951
5573
  }
5952
5574
  if (usesDefaultArtifactWorkspace && allResults.length > 0) {
5953
5575
  const evalFile = activeTestFiles.length === 1 ? activeTestFiles[0] : "";
5954
- const {
5955
- testArtifactDir,
5956
- timingPath,
5957
- benchmarkPath: workspaceBenchmarkPath,
5958
- indexPath
5959
- } = await writeArtifactsFromResults(allResults, runDir, {
5960
- evalFile,
5961
- experiment: normalizeExperimentName(options.experiment)
5962
- });
5963
- console.log(`Artifact workspace written to: ${runDir}`);
5964
- console.log(` Index: ${indexPath}`);
5965
- console.log(
5966
- ` Per-test artifacts: ${testArtifactDir} (${allResults.length} test directories)`
5967
- );
5968
- console.log(` Timing: ${timingPath}`);
5969
- console.log(` Benchmark: ${workspaceBenchmarkPath}`);
5576
+ if (isResumeAppend) {
5577
+ const { writePerTestArtifacts } = await import("./artifact-writer-WH3OE42V.js");
5578
+ await writePerTestArtifacts(allResults, runDir, {
5579
+ experiment: normalizeExperimentName(options.experiment)
5580
+ });
5581
+ const { benchmarkPath: workspaceBenchmarkPath, timingPath } = await aggregateRunDir(
5582
+ runDir,
5583
+ { evalFile, experiment: normalizeExperimentName(options.experiment) }
5584
+ );
5585
+ const indexPath = path15.join(runDir, "index.jsonl");
5586
+ console.log(`Artifact workspace updated: ${runDir}`);
5587
+ console.log(` Index: ${indexPath}`);
5588
+ console.log(` Per-test artifacts: ${runDir} (${allResults.length} new test directories)`);
5589
+ console.log(` Timing: ${timingPath}`);
5590
+ console.log(` Benchmark: ${workspaceBenchmarkPath}`);
5591
+ } else {
5592
+ const {
5593
+ testArtifactDir,
5594
+ timingPath,
5595
+ benchmarkPath: workspaceBenchmarkPath,
5596
+ indexPath
5597
+ } = await writeArtifactsFromResults(allResults, runDir, {
5598
+ evalFile,
5599
+ experiment: normalizeExperimentName(options.experiment)
5600
+ });
5601
+ console.log(`Artifact workspace written to: ${runDir}`);
5602
+ console.log(` Index: ${indexPath}`);
5603
+ console.log(
5604
+ ` Per-test artifacts: ${testArtifactDir} (${allResults.length} test directories)`
5605
+ );
5606
+ console.log(` Timing: ${timingPath}`);
5607
+ console.log(` Benchmark: ${workspaceBenchmarkPath}`);
5608
+ }
5970
5609
  }
5971
5610
  if (resolvedExportPaths.length > 0 && allResults.length > 0) {
5972
5611
  for (const exportPath of resolvedExportPaths) {
@@ -5977,7 +5616,7 @@ async function runEvalCommand(input) {
5977
5616
  await writer.close();
5978
5617
  }
5979
5618
  console.log(
5980
- `Export file(s) written: ${resolvedExportPaths.map((p) => path17.relative(cwd, p)).join(", ")}`
5619
+ `Export file(s) written: ${resolvedExportPaths.map((p) => path15.relative(cwd, p)).join(", ")}`
5981
5620
  );
5982
5621
  }
5983
5622
  const resultsWithWorkspaces = allResults.filter((r) => r.workspacePath);
@@ -6015,14 +5654,14 @@ Results written to: ${outputPath}`);
6015
5654
  experiment: normalizeExperimentName(options.experiment)
6016
5655
  });
6017
5656
  }
6018
- if (summary.executionErrorCount > 0 && !options.retryErrors) {
6019
- const evalFileArgs = activeTestFiles.map((f) => path17.relative(cwd, f)).join(" ");
5657
+ if (summary.executionErrorCount > 0 && !options.retryErrors && !options.resume) {
5658
+ const evalFileArgs = activeTestFiles.map((f) => path15.relative(cwd, f)).join(" ");
6020
5659
  const targetFlag = options.target ? ` --target ${options.target}` : "";
6021
- const relativeOutputPath = path17.relative(cwd, outputPath);
5660
+ const relativeRunDir = path15.relative(cwd, runDir);
6022
5661
  console.log(
6023
5662
  `
6024
5663
  Tip: ${summary.executionErrorCount} execution error(s) detected. Re-run failed tests with:
6025
- agentv eval run ${evalFileArgs}${targetFlag} --retry-errors ${relativeOutputPath}`
5664
+ agentv eval run ${evalFileArgs}${targetFlag} --output ${relativeRunDir} --rerun-failed`
6026
5665
  );
6027
5666
  }
6028
5667
  return {
@@ -6052,7 +5691,7 @@ async function resolveEvaluationRunner() {
6052
5691
  if (!overridePath) {
6053
5692
  return runEvaluation;
6054
5693
  }
6055
- const resolved = path17.isAbsolute(overridePath) ? overridePath : path17.resolve(process.cwd(), overridePath);
5694
+ const resolved = path15.isAbsolute(overridePath) ? overridePath : path15.resolve(process.cwd(), overridePath);
6056
5695
  const moduleUrl = pathToFileURL(resolved).href;
6057
5696
  const mod = await import(moduleUrl);
6058
5697
  const candidate = mod.runEvaluation;
@@ -6065,11 +5704,11 @@ async function resolveEvaluationRunner() {
6065
5704
  }
6066
5705
 
6067
5706
  // src/commands/eval/discover.ts
6068
- import path18 from "node:path";
5707
+ import path16 from "node:path";
6069
5708
  import fg2 from "fast-glob";
6070
5709
  async function discoverEvalFiles(cwd) {
6071
5710
  const repoRoot = await findRepoRoot(cwd);
6072
- const config = await loadConfig(path18.join(cwd, "_"), repoRoot);
5711
+ const config = await loadConfig(path16.join(cwd, "_"), repoRoot);
6073
5712
  const patterns = config?.eval_patterns && config.eval_patterns.length > 0 ? config.eval_patterns : DEFAULT_EVAL_PATTERNS;
6074
5713
  const ignore = ["**/node_modules/**", "**/dist/**"];
6075
5714
  const matches = await fg2(patterns, {
@@ -6081,7 +5720,7 @@ async function discoverEvalFiles(cwd) {
6081
5720
  caseSensitiveMatch: false
6082
5721
  });
6083
5722
  const evalFiles = matches.map((absPath) => {
6084
- const relativePath = path18.relative(cwd, absPath);
5723
+ const relativePath = path16.relative(cwd, absPath);
6085
5724
  const category = deriveCategory(relativePath);
6086
5725
  return { path: absPath, relativePath, category };
6087
5726
  });
@@ -6103,10 +5742,6 @@ function filterByCategory(files, category) {
6103
5742
 
6104
5743
  export {
6105
5744
  package_default,
6106
- toSnakeCaseDeep,
6107
- RESULT_INDEX_FILENAME,
6108
- buildDefaultRunDir,
6109
- resolveRunManifestPath,
6110
5745
  parseResultManifest,
6111
5746
  resolveResultSourcePath,
6112
5747
  loadManifestResults,
@@ -6132,7 +5767,6 @@ export {
6132
5767
  listMergedResultFiles,
6133
5768
  findRunById,
6134
5769
  maybeAutoExportRunArtifacts,
6135
- writeArtifactsFromResults,
6136
5770
  resolveRunCacheFile,
6137
5771
  loadRunCache,
6138
5772
  detectFileType,
@@ -6150,4 +5784,4 @@ export {
6150
5784
  getCategories,
6151
5785
  filterByCategory
6152
5786
  };
6153
- //# sourceMappingURL=chunk-ZUNYOUFO.js.map
5787
+ //# sourceMappingURL=chunk-VRPCMCLQ.js.map