agentv 4.17.1 → 4.19.0-next.1

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -1,4 +1,36 @@
1
1
  import { createRequire } from 'node:module'; const require = createRequire(import.meta.url);
2
+ import {
3
+ RESULT_INDEX_FILENAME,
4
+ RESULT_RUNS_DIRNAME,
5
+ aggregateRunDir,
6
+ buildDefaultRunDir,
7
+ buildTestTargetKey,
8
+ deduplicateByTestIdTarget,
9
+ isDirectoryPath,
10
+ normalizeExperimentName,
11
+ parseJsonlResults,
12
+ resolveExistingRunPrimaryPath,
13
+ resolveRunIndexPath,
14
+ resolveRunManifestPath,
15
+ resolveWorkspaceOrFilePath,
16
+ toSnakeCaseDeep,
17
+ writeArtifactsFromResults
18
+ } from "./chunk-NL6P5MUH.js";
19
+ import {
20
+ ResponseCache,
21
+ commitAndPushResultsBranch,
22
+ createDraftResultsPr,
23
+ deriveCategory,
24
+ directorySizeBytes,
25
+ getResultsRepoStatus,
26
+ loadTsConfig,
27
+ prepareResultsRepoBranch,
28
+ resolveResultsRepoRunsDir,
29
+ shouldEnableCache,
30
+ shouldSkipCacheForTemperature,
31
+ stageResultsArtifacts,
32
+ syncResultsRepo
33
+ } from "./chunk-R2QDYORI.js";
2
34
  import {
3
35
  CLI_PLACEHOLDERS,
4
36
  COMMON_TARGET_SETTINGS,
@@ -6,48 +38,34 @@ import {
6
38
  DEFAULT_THRESHOLD,
7
39
  KNOWN_PROVIDERS,
8
40
  PROVIDER_ALIASES,
9
- ResponseCache,
10
41
  buildDirectoryChain,
11
42
  buildSearchRoots,
12
- commitAndPushResultsBranch,
13
- createDraftResultsPr,
14
- deriveCategory,
15
- directorySizeBytes,
16
43
  ensureVSCodeSubagents,
17
44
  findDeprecatedCamelCaseTargetWarnings,
18
45
  findGitRoot,
19
- getResultsRepoStatus,
20
46
  interpolateEnv,
21
- isEvaluatorKind,
47
+ isGraderKind,
22
48
  listTargetNames,
23
49
  loadCasesFromFile,
24
50
  loadConfig,
25
51
  loadTestSuite,
26
- loadTsConfig,
27
52
  normalizeLineEndings,
28
- prepareResultsRepoBranch,
29
53
  readTargetDefinitions,
30
54
  readTestSuiteMetadata,
31
55
  resolveFileReference,
32
- resolveResultsRepoRunsDir,
33
56
  resolveTargetDefinition,
34
57
  runEvaluation,
35
- shouldEnableCache,
36
- shouldSkipCacheForTemperature,
37
- stageResultsArtifacts,
38
58
  subscribeToCodexLogEntries,
39
59
  subscribeToCopilotCliLogEntries,
40
60
  subscribeToCopilotSdkLogEntries,
41
61
  subscribeToPiLogEntries,
42
- syncResultsRepo,
43
- toCamelCaseDeep,
44
- toTranscriptJsonLines
45
- } from "./chunk-IRU2UOWN.js";
62
+ toCamelCaseDeep
63
+ } from "./chunk-PTYQS37Y.js";
46
64
 
47
65
  // package.json
48
66
  var package_default = {
49
67
  name: "agentv",
50
- version: "4.17.1",
68
+ version: "4.19.0-next.1",
51
69
  description: "CLI entry point for AgentV",
52
70
  type: "module",
53
71
  repository: {
@@ -132,12 +150,15 @@ async function resolveEvalPaths(evalPaths, cwd) {
132
150
  const candidatePath = path.isAbsolute(pattern) ? path.normalize(pattern) : path.resolve(cwd, pattern);
133
151
  try {
134
152
  const stats = await stat(candidatePath);
135
- if (stats.isFile() && /\.(ya?ml|jsonl|json)$/i.test(candidatePath)) {
153
+ if (stats.isFile() && /\.(ya?ml|jsonl|json|[cm]?ts)$/i.test(candidatePath)) {
136
154
  results.add(candidatePath);
137
155
  continue;
138
156
  }
139
157
  if (stats.isDirectory()) {
140
- const dirGlob = path.posix.join(candidatePath.replace(/\\/g, "/"), "**/*.eval.{yaml,yml}");
158
+ const dirGlob = path.posix.join(
159
+ candidatePath.replace(/\\/g, "/"),
160
+ "**/{*.eval.yaml,*.eval.yml,eval.yaml,eval.yml,*.eval.ts,*.eval.mts}"
161
+ );
141
162
  const dirMatches = await fg(dirGlob, {
142
163
  absolute: true,
143
164
  onlyFiles: true,
@@ -163,7 +184,9 @@ async function resolveEvalPaths(evalPaths, cwd) {
163
184
  followSymbolicLinks: true,
164
185
  ignore: ignorePatterns
165
186
  });
166
- const yamlMatches = matches.filter((filePath) => /\.(ya?ml|jsonl|json)$/i.test(filePath));
187
+ const yamlMatches = matches.filter(
188
+ (filePath) => /\.(ya?ml|jsonl|json|[cm]?ts)$/i.test(filePath)
189
+ );
167
190
  for (const filePath of yamlMatches) {
168
191
  results.add(path.normalize(filePath));
169
192
  }
@@ -185,7 +208,7 @@ async function resolveEvalPaths(evalPaths, cwd) {
185
208
  throw new Error(
186
209
  `No eval files matched any provided paths or globs: ${includePatterns.join(
187
210
  ", "
188
- )}. Provide YAML, JSONL, or JSON paths or globs (e.g., "evals/**/*.yaml", "evals/**/*.jsonl", "evals.json").`
211
+ )}. Provide YAML, JSONL, JSON, or TypeScript paths or globs (e.g., "evals/**/eval.yaml", "evals/**/*.eval.ts").`
189
212
  );
190
213
  }
191
214
  const sorted = Array.from(results);
@@ -261,9 +284,9 @@ async function discoverTargetsFile(options) {
261
284
  }
262
285
 
263
286
  // src/commands/eval/run-eval.ts
264
- import { constants as constants4, mkdirSync } from "node:fs";
265
- import { access as access5 } from "node:fs/promises";
266
- import path17 from "node:path";
287
+ import { constants as constants4, existsSync as existsSync2, mkdirSync } from "node:fs";
288
+ import { access as access5, readFile as readFile8 } from "node:fs/promises";
289
+ import path15 from "node:path";
267
290
  import { pathToFileURL } from "node:url";
268
291
 
269
292
  // src/version-check.ts
@@ -321,88 +344,15 @@ async function promptContinue() {
321
344
  }
322
345
 
323
346
  // src/commands/results/remote.ts
324
- import path6 from "node:path";
325
-
326
- // src/commands/inspect/utils.ts
327
- import { readFileSync as readFileSync2, readdirSync, statSync as statSync2 } from "node:fs";
328
347
  import path5 from "node:path";
329
348
 
330
- // src/commands/eval/result-layout.ts
331
- import { existsSync, statSync } from "node:fs";
332
- import path3 from "node:path";
333
- var RESULT_INDEX_FILENAME = "index.jsonl";
334
- var RESULT_RUNS_DIRNAME = "runs";
335
- var DEFAULT_EXPERIMENT_NAME = "default";
336
- function normalizeExperimentName(experiment) {
337
- const trimmed = experiment?.trim();
338
- if (!trimmed) {
339
- return DEFAULT_EXPERIMENT_NAME;
340
- }
341
- if (!/^[A-Za-z0-9._-]+$/.test(trimmed)) {
342
- throw new Error(
343
- `Invalid experiment name "${trimmed}". Use only letters, numbers, ".", "_" and "-".`
344
- );
345
- }
346
- return trimmed;
347
- }
348
- function createRunDirName(timestamp = /* @__PURE__ */ new Date()) {
349
- return timestamp.toISOString().replace(/[:.]/g, "-");
350
- }
351
- function buildDefaultRunDir(cwd, experiment, timestamp = /* @__PURE__ */ new Date()) {
352
- return path3.join(
353
- cwd,
354
- ".agentv",
355
- "results",
356
- RESULT_RUNS_DIRNAME,
357
- normalizeExperimentName(experiment),
358
- createRunDirName(timestamp)
359
- );
360
- }
361
- function resolveRunIndexPath(runDir) {
362
- return path3.join(runDir, RESULT_INDEX_FILENAME);
363
- }
364
- function isRunManifestPath(filePath) {
365
- return path3.basename(filePath) === RESULT_INDEX_FILENAME;
366
- }
367
- function resolveExistingRunPrimaryPath(runDir) {
368
- const indexPath = resolveRunIndexPath(runDir);
369
- if (existsSync(indexPath)) {
370
- return indexPath;
371
- }
372
- return void 0;
373
- }
374
- function isDirectoryPath(filePath) {
375
- try {
376
- return statSync(filePath).isDirectory();
377
- } catch {
378
- return false;
379
- }
380
- }
381
- function resolveWorkspaceOrFilePath(filePath) {
382
- if (!isDirectoryPath(filePath)) {
383
- return filePath;
384
- }
385
- const existing = resolveExistingRunPrimaryPath(filePath);
386
- if (!existing) {
387
- throw new Error(`Result workspace is missing ${RESULT_INDEX_FILENAME}: ${filePath}`);
388
- }
389
- return existing;
390
- }
391
- function resolveRunManifestPath(filePath) {
392
- if (isDirectoryPath(filePath)) {
393
- return resolveWorkspaceOrFilePath(filePath);
394
- }
395
- if (!isRunManifestPath(filePath)) {
396
- throw new Error(
397
- `Expected a run workspace directory or ${RESULT_INDEX_FILENAME} manifest: ${filePath}`
398
- );
399
- }
400
- return filePath;
401
- }
349
+ // src/commands/inspect/utils.ts
350
+ import { readFileSync as readFileSync2, readdirSync, statSync } from "node:fs";
351
+ import path4 from "node:path";
402
352
 
403
353
  // src/commands/results/manifest.ts
404
- import { existsSync as existsSync2, readFileSync } from "node:fs";
405
- import path4 from "node:path";
354
+ import { existsSync, readFileSync } from "node:fs";
355
+ import path3 from "node:path";
406
356
  function parseJsonlLines(content) {
407
357
  return content.split(/\r?\n/).map((line) => line.trim()).filter((line) => line.length > 0).map((line) => JSON.parse(line));
408
358
  }
@@ -421,8 +371,8 @@ function readOptionalText(baseDir, relativePath) {
421
371
  if (!relativePath) {
422
372
  return void 0;
423
373
  }
424
- const absolutePath = path4.join(baseDir, relativePath);
425
- if (!existsSync2(absolutePath)) {
374
+ const absolutePath = path3.join(baseDir, relativePath);
375
+ if (!existsSync(absolutePath)) {
426
376
  return void 0;
427
377
  }
428
378
  return readFileSync(absolutePath, "utf8");
@@ -510,8 +460,8 @@ function parseResultManifest(content) {
510
460
  return parseJsonlLines(content);
511
461
  }
512
462
  function resolveResultSourcePath(source, cwd) {
513
- const resolved = path4.isAbsolute(source) ? source : path4.resolve(cwd ?? process.cwd(), source);
514
- if (isDirectoryPath(resolved) || path4.basename(resolved) === RESULT_INDEX_FILENAME) {
463
+ const resolved = path3.isAbsolute(source) ? source : path3.resolve(cwd ?? process.cwd(), source);
464
+ if (isDirectoryPath(resolved) || path3.basename(resolved) === RESULT_INDEX_FILENAME) {
515
465
  return resolveRunManifestPath(resolved);
516
466
  }
517
467
  return resolved;
@@ -520,7 +470,7 @@ function loadManifestResults(sourceFile) {
520
470
  const resolvedSourceFile = resolveRunManifestPath(sourceFile);
521
471
  const content = readFileSync(resolvedSourceFile, "utf8");
522
472
  const records = parseResultManifest(content);
523
- const baseDir = path4.dirname(resolvedSourceFile);
473
+ const baseDir = path3.dirname(resolvedSourceFile);
524
474
  return records.map((record) => hydrateManifestRecord(baseDir, record));
525
475
  }
526
476
  function loadLightweightResults(sourceFile) {
@@ -566,10 +516,10 @@ function padLeft(str, len) {
566
516
  }
567
517
  function loadResultFile(filePath) {
568
518
  const resolvedFilePath = resolveTraceResultPath(filePath);
569
- if (path5.extname(resolvedFilePath) === ".json") {
519
+ if (path4.extname(resolvedFilePath) === ".json") {
570
520
  return loadOtlpTraceFile(resolvedFilePath);
571
521
  }
572
- if (path5.basename(resolvedFilePath) === RESULT_INDEX_FILENAME) {
522
+ if (path4.basename(resolvedFilePath) === RESULT_INDEX_FILENAME) {
573
523
  return loadManifestAsRawResults(resolvedFilePath);
574
524
  }
575
525
  return loadJsonlRecords(resolvedFilePath);
@@ -848,7 +798,7 @@ function toTraceSummary(result) {
848
798
  return toCamelCaseDeep(rawTrace);
849
799
  }
850
800
  function buildRunId(relativeRunPath) {
851
- const normalized = relativeRunPath.split(path5.sep).join("/");
801
+ const normalized = relativeRunPath.split(path4.sep).join("/");
852
802
  const segments = normalized.split("/").filter(Boolean);
853
803
  if (segments.length >= 2) {
854
804
  const experiment = segments.slice(0, -1).join("/");
@@ -863,10 +813,10 @@ function buildRunId(relativeRunPath) {
863
813
  function collectRunManifestPaths(runsDir, currentDir, files) {
864
814
  const primaryPath = resolveExistingRunPrimaryPath(currentDir);
865
815
  if (primaryPath) {
866
- const relativeRunPath = path5.relative(runsDir, currentDir);
816
+ const relativeRunPath = path4.relative(runsDir, currentDir);
867
817
  files.push({
868
818
  filePath: primaryPath,
869
- displayName: path5.basename(currentDir),
819
+ displayName: path4.basename(currentDir),
870
820
  runId: buildRunId(relativeRunPath)
871
821
  });
872
822
  return;
@@ -874,7 +824,7 @@ function collectRunManifestPaths(runsDir, currentDir, files) {
874
824
  const entries = readdirSync(currentDir, { withFileTypes: true });
875
825
  for (const entry of entries) {
876
826
  if (entry.isDirectory()) {
877
- collectRunManifestPaths(runsDir, path5.join(currentDir, entry.name), files);
827
+ collectRunManifestPaths(runsDir, path4.join(currentDir, entry.name), files);
878
828
  }
879
829
  }
880
830
  }
@@ -884,7 +834,7 @@ function listResultFilesFromRunsDir(runsDir, limit) {
884
834
  const entries = readdirSync(runsDir, { withFileTypes: true });
885
835
  for (const entry of entries) {
886
836
  if (entry.isDirectory()) {
887
- collectRunManifestPaths(runsDir, path5.join(runsDir, entry.name), files);
837
+ collectRunManifestPaths(runsDir, path4.join(runsDir, entry.name), files);
888
838
  }
889
839
  }
890
840
  } catch {
@@ -894,7 +844,7 @@ function listResultFilesFromRunsDir(runsDir, limit) {
894
844
  const metas = [];
895
845
  for (const { filePath, displayName, runId } of limited) {
896
846
  try {
897
- const fileStat = statSync2(filePath);
847
+ const fileStat = statSync(filePath);
898
848
  const results = loadResultFile(filePath);
899
849
  const testCount = results.length;
900
850
  const passCount = results.filter((r) => r.score >= DEFAULT_THRESHOLD).length;
@@ -919,7 +869,7 @@ function listResultFilesFromRunsDir(runsDir, limit) {
919
869
  }
920
870
  function listResultFiles(cwd, limit) {
921
871
  return listResultFilesFromRunsDir(
922
- path5.join(cwd, ".agentv", "results", RESULT_RUNS_DIRNAME),
872
+ path4.join(cwd, ".agentv", "results", RESULT_RUNS_DIRNAME),
923
873
  limit
924
874
  );
925
875
  }
@@ -971,17 +921,17 @@ function slugify(value) {
971
921
  return value.trim().replace(/[^A-Za-z0-9._/-]+/g, "-").replace(/\/+/g, "/").replace(/^-+|-+$/g, "").slice(0, 120);
972
922
  }
973
923
  function getRelativeRunPath(cwd, runDir) {
974
- const relative = path6.relative(path6.join(cwd, ".agentv", "results", "runs"), runDir);
975
- if (!relative.startsWith("..") && !path6.isAbsolute(relative)) {
924
+ const relative = path5.relative(path5.join(cwd, ".agentv", "results", "runs"), runDir);
925
+ if (!relative.startsWith("..") && !path5.isAbsolute(relative)) {
976
926
  return relative;
977
927
  }
978
- const experiment = path6.basename(path6.dirname(runDir));
979
- const runName = path6.basename(runDir);
980
- return experiment && experiment !== runName ? path6.join(experiment, runName) : runName;
928
+ const experiment = path5.basename(path5.dirname(runDir));
929
+ const runName = path5.basename(runDir);
930
+ return experiment && experiment !== runName ? path5.join(experiment, runName) : runName;
981
931
  }
982
932
  function buildBranchName(config, payload) {
983
- const timestamp = path6.basename(payload.run_dir);
984
- const evalStem = payload.test_files.length === 1 ? path6.basename(payload.test_files[0]).replace(/\.eval\.ya?ml$/i, "").replace(/\.[^.]+$/i, "") : `${payload.test_files.length}-evals`;
933
+ const timestamp = path5.basename(payload.run_dir);
934
+ const evalStem = payload.test_files.length === 1 ? path5.basename(payload.test_files[0]).replace(/\.eval\.ya?ml$/i, "").replace(/\.[^.]+$/i, "") : `${payload.test_files.length}-evals`;
985
935
  const experiment = slugify(payload.experiment ?? "default");
986
936
  const branchLeaf = slugify(`${experiment}-${evalStem}-${timestamp}`) || timestamp;
987
937
  return `${config.branch_prefix}/${branchLeaf}`;
@@ -1010,7 +960,7 @@ function buildPrBody(payload) {
1010
960
  "",
1011
961
  sections,
1012
962
  "",
1013
- `Run: ${path6.basename(payload.run_dir)}`,
963
+ `Run: ${path5.basename(payload.run_dir)}`,
1014
964
  `Experiment: ${payload.experiment ?? "default"}`,
1015
965
  `Eval Files: ${payload.test_files.join(", ")}`
1016
966
  ].join("\n");
@@ -1025,7 +975,7 @@ async function maybeWarnLargeArtifact(runDir) {
1025
975
  }
1026
976
  async function loadNormalizedResultsConfig(cwd) {
1027
977
  const repoRoot = await findRepoRoot(cwd) ?? cwd;
1028
- const config = await loadConfig(path6.join(cwd, "_"), repoRoot);
978
+ const config = await loadConfig(path5.join(cwd, "_"), repoRoot);
1029
979
  if (!config?.results?.export) {
1030
980
  return void 0;
1031
981
  }
@@ -1109,7 +1059,7 @@ async function maybeAutoExportRunArtifacts(payload) {
1109
1059
  const prepared = await prepareResultsRepoBranch(config, branchName);
1110
1060
  try {
1111
1061
  const relativeRunPath = getRelativeRunPath(payload.cwd, payload.run_dir);
1112
- const destinationDir = path6.join(prepared.repoDir, config.path, relativeRunPath);
1062
+ const destinationDir = path5.join(prepared.repoDir, config.path, relativeRunPath);
1113
1063
  await stageResultsArtifacts({
1114
1064
  repoDir: prepared.repoDir,
1115
1065
  sourceDir: payload.run_dir,
@@ -1143,36 +1093,8 @@ async function maybeAutoExportRunArtifacts(payload) {
1143
1093
  }
1144
1094
  }
1145
1095
 
1146
- // src/commands/eval/artifact-writer.ts
1147
- import { mkdir, readFile, writeFile } from "node:fs/promises";
1148
- import path7 from "node:path";
1149
-
1150
- // src/utils/case-conversion.ts
1151
- function toSnakeCase(str) {
1152
- if (/^[A-Z]/.test(str)) {
1153
- return str;
1154
- }
1155
- return str.replace(/[A-Z]/g, (letter) => `_${letter.toLowerCase()}`);
1156
- }
1157
- function toSnakeCaseDeep(obj) {
1158
- if (obj === null || obj === void 0) {
1159
- return obj;
1160
- }
1161
- if (Array.isArray(obj)) {
1162
- return obj.map((item) => toSnakeCaseDeep(item));
1163
- }
1164
- if (typeof obj === "object") {
1165
- const result = {};
1166
- for (const [key, value] of Object.entries(obj)) {
1167
- const snakeKey = toSnakeCase(key);
1168
- result[snakeKey] = toSnakeCaseDeep(value);
1169
- }
1170
- return result;
1171
- }
1172
- return obj;
1173
- }
1174
-
1175
- // src/commands/eval/artifact-writer.ts
1096
+ // src/commands/eval/benchmark-writer.ts
1097
+ import { writeFile } from "node:fs/promises";
1176
1098
  function computeStats(values) {
1177
1099
  if (values.length === 0) {
1178
1100
  return { mean: 0, stddev: 0 };
@@ -1185,361 +1107,6 @@ function computeStats(values) {
1185
1107
  };
1186
1108
  }
1187
1109
  function computePassRate(result) {
1188
- const scores = result.scores;
1189
- if (scores && scores.length > 0) {
1190
- const passed = scores.filter((s) => s.score >= DEFAULT_THRESHOLD).length;
1191
- return passed / scores.length;
1192
- }
1193
- return (result.score ?? 0) >= DEFAULT_THRESHOLD ? 1 : 0;
1194
- }
1195
- function countToolCalls(result) {
1196
- const toolCalls = {};
1197
- let total = 0;
1198
- const trace = result.trace;
1199
- if (trace?.steps) {
1200
- for (const step of trace.steps) {
1201
- if (step.toolName || step.type === "tool") {
1202
- const name = step.toolName ?? "unknown";
1203
- toolCalls[name] = (toolCalls[name] ?? 0) + 1;
1204
- total += 1;
1205
- }
1206
- }
1207
- }
1208
- return { toolCalls, total };
1209
- }
1210
- function parseWorkspaceChanges(fileChanges) {
1211
- if (!fileChanges) {
1212
- return void 0;
1213
- }
1214
- let filesModified = 0;
1215
- let filesCreated = 0;
1216
- const lines = fileChanges.split("\n");
1217
- for (const line of lines) {
1218
- if (line.startsWith("--- /dev/null")) {
1219
- filesCreated += 1;
1220
- } else if (line.startsWith("--- a/")) {
1221
- filesModified += 1;
1222
- }
1223
- }
1224
- const summaryLines = lines.slice(0, 20);
1225
- const diffSummary = lines.length > 20 ? `${summaryLines.join("\n")}
1226
- ... (${lines.length - 20} more lines)` : fileChanges;
1227
- return {
1228
- files_modified: filesModified,
1229
- files_created: filesCreated,
1230
- diff_summary: diffSummary
1231
- };
1232
- }
1233
- function buildAssertions(result) {
1234
- if (!result.assertions) return [];
1235
- return result.assertions.map((a) => ({
1236
- text: a.text,
1237
- passed: a.passed,
1238
- evidence: a.evidence ?? ""
1239
- }));
1240
- }
1241
- function buildEvaluators(scores) {
1242
- if (!scores || scores.length === 0) {
1243
- return void 0;
1244
- }
1245
- return scores.map((s) => ({
1246
- name: s.name,
1247
- type: s.type,
1248
- score: s.score,
1249
- reasoning: "",
1250
- weight: s.weight,
1251
- verdict: s.verdict,
1252
- assertions: s.assertions,
1253
- details: s.details
1254
- }));
1255
- }
1256
- function buildGradingArtifact(result) {
1257
- const assertions = buildAssertions(result);
1258
- const passed = assertions.filter((e) => e.passed).length;
1259
- const failed = assertions.filter((e) => !e.passed).length;
1260
- const total = assertions.length;
1261
- const { toolCalls, total: totalToolCalls } = countToolCalls(result);
1262
- const errorsEncountered = result.error ? 1 : 0;
1263
- return {
1264
- assertions,
1265
- summary: {
1266
- passed,
1267
- failed,
1268
- total,
1269
- pass_rate: total > 0 ? Math.round(passed / total * 1e3) / 1e3 : 0
1270
- },
1271
- execution_metrics: {
1272
- tool_calls: toolCalls,
1273
- total_tool_calls: totalToolCalls,
1274
- errors_encountered: errorsEncountered
1275
- },
1276
- graders: buildEvaluators(result.scores),
1277
- workspace_changes: parseWorkspaceChanges(result.fileChanges),
1278
- conversation: result.conversationId ? {
1279
- turns: result.trace ? result.trace.steps?.length ?? 0 : 0,
1280
- conversation_id: result.conversationId
1281
- } : void 0
1282
- };
1283
- }
1284
- function buildTimingArtifact(results) {
1285
- let totalInput = 0;
1286
- let totalOutput = 0;
1287
- let totalReasoning = 0;
1288
- let totalDurationMs = 0;
1289
- for (const result of results) {
1290
- const usage = result.tokenUsage;
1291
- if (usage) {
1292
- totalInput += usage.input ?? 0;
1293
- totalOutput += usage.output ?? 0;
1294
- totalReasoning += usage.reasoning ?? 0;
1295
- }
1296
- if (result.durationMs != null) {
1297
- totalDurationMs += result.durationMs;
1298
- }
1299
- }
1300
- return {
1301
- total_tokens: totalInput + totalOutput,
1302
- duration_ms: totalDurationMs,
1303
- total_duration_seconds: Math.round(totalDurationMs / 1e3 * 1e3) / 1e3,
1304
- token_usage: {
1305
- input: totalInput,
1306
- output: totalOutput,
1307
- reasoning: totalReasoning
1308
- }
1309
- };
1310
- }
1311
- function buildBenchmarkArtifact(results, evalFile = "", experiment) {
1312
- const targetSet = /* @__PURE__ */ new Set();
1313
- const testIdSet = /* @__PURE__ */ new Set();
1314
- for (const result of results) {
1315
- targetSet.add(result.target ?? "unknown");
1316
- testIdSet.add(result.testId ?? "unknown");
1317
- }
1318
- const targets = [...targetSet].sort();
1319
- const testIds = [...testIdSet].sort();
1320
- const runSummary = {};
1321
- const notes = [];
1322
- for (const target of targets) {
1323
- const targetResults = results.filter((r) => r.target === target);
1324
- const passRates = targetResults.map(computePassRate);
1325
- const timings = targetResults.filter((r) => r.durationMs != null).map((r) => r.durationMs / 1e3);
1326
- const tokens = targetResults.filter((r) => r.tokenUsage != null).map((r) => {
1327
- const usage = r.tokenUsage;
1328
- return (usage.input ?? 0) + (usage.output ?? 0);
1329
- });
1330
- const entry = {
1331
- pass_rate: computeStats(passRates),
1332
- time_seconds: computeStats(timings),
1333
- tokens: computeStats(tokens)
1334
- };
1335
- const toolCallCounts = targetResults.map((r) => countToolCalls(r).total);
1336
- if (toolCallCounts.some((c2) => c2 > 0)) {
1337
- entry.tool_calls = computeStats(toolCallCounts);
1338
- }
1339
- const costs = targetResults.filter((r) => r.costUsd != null).map((r) => r.costUsd);
1340
- if (costs.length > 0) {
1341
- entry.cost_usd = computeStats(costs);
1342
- }
1343
- runSummary[target] = entry;
1344
- }
1345
- const evaluatorScores = /* @__PURE__ */ new Map();
1346
- for (const result of results) {
1347
- if (result.scores) {
1348
- for (const score of result.scores) {
1349
- const key = `${score.name}:${score.type}`;
1350
- if (!evaluatorScores.has(key)) {
1351
- evaluatorScores.set(key, []);
1352
- }
1353
- evaluatorScores.get(key)?.push(score.score);
1354
- }
1355
- }
1356
- }
1357
- let perEvaluatorSummary;
1358
- if (evaluatorScores.size > 0) {
1359
- perEvaluatorSummary = {};
1360
- for (const [key, scores] of evaluatorScores) {
1361
- perEvaluatorSummary[key] = computeStats(scores);
1362
- }
1363
- }
1364
- const errorCount = results.filter(
1365
- (r) => r.executionStatus != null && r.executionStatus === "execution_error"
1366
- ).length;
1367
- if (errorCount > 0) {
1368
- notes.push(
1369
- `${errorCount} test(s) had execution errors and are included in pass_rate as failures`
1370
- );
1371
- }
1372
- if (results.length === 0) {
1373
- notes.push("No results to summarize");
1374
- }
1375
- const firstResult = results[0];
1376
- const timestamp = firstResult?.timestamp ?? (/* @__PURE__ */ new Date()).toISOString();
1377
- return {
1378
- metadata: {
1379
- eval_file: evalFile,
1380
- timestamp,
1381
- targets,
1382
- tests_run: testIds,
1383
- experiment
1384
- },
1385
- run_summary: runSummary,
1386
- per_grader_summary: perEvaluatorSummary,
1387
- notes
1388
- };
1389
- }
1390
- function safeArtifactPathSegment(value, fallback) {
1391
- const trimmed = value?.trim();
1392
- if (!trimmed) {
1393
- return fallback;
1394
- }
1395
- return trimmed.replace(/[/\\:*?"<>|]/g, "_");
1396
- }
1397
- function safeTestId(testId) {
1398
- return safeArtifactPathSegment(testId, "unknown");
1399
- }
1400
- function getSuite(result) {
1401
- return result.suite;
1402
- }
1403
- function buildArtifactSubdir(result) {
1404
- const segments = [];
1405
- const evalSet = getSuite(result);
1406
- if (evalSet) {
1407
- segments.push(safeArtifactPathSegment(evalSet, "default"));
1408
- }
1409
- segments.push(safeTestId(result.testId));
1410
- return path7.posix.join(...segments);
1411
- }
1412
- function formatOutputMarkdown(output) {
1413
- return output.map((msg) => `@[${msg.role}]:
1414
- ${String(msg.content ?? "")}`).join("\n\n");
1415
- }
1416
- function extractInput(result) {
1417
- const input = result.input;
1418
- if (!input) return null;
1419
- if (typeof input === "string") return input;
1420
- if (Array.isArray(input) && input.length > 0) {
1421
- return formatOutputMarkdown(input);
1422
- }
1423
- return null;
1424
- }
1425
- function buildResultIndexArtifact(result) {
1426
- const artifactSubdir = buildArtifactSubdir(result);
1427
- const input = extractInput(result);
1428
- const hasResponse = Array.isArray(result.output) && result.output.length > 0;
1429
- return {
1430
- timestamp: result.timestamp,
1431
- test_id: result.testId ?? "unknown",
1432
- suite: getSuite(result),
1433
- category: result.category,
1434
- conversation_id: result.conversationId,
1435
- score: result.score,
1436
- target: result.target ?? "unknown",
1437
- scores: result.scores ? toSnakeCaseDeep(result.scores) : void 0,
1438
- execution_status: result.executionStatus,
1439
- error: result.error,
1440
- failure_stage: result.failureStage,
1441
- failure_reason_code: result.failureReasonCode,
1442
- workspace_path: result.workspacePath,
1443
- grading_path: path7.posix.join(artifactSubdir, "grading.json"),
1444
- timing_path: path7.posix.join(artifactSubdir, "timing.json"),
1445
- input_path: input ? path7.posix.join(artifactSubdir, "input.md") : void 0,
1446
- output_path: hasResponse ? path7.posix.join(artifactSubdir, "outputs", "response.md") : void 0,
1447
- response_path: hasResponse ? path7.posix.join(artifactSubdir, "outputs", "response.md") : void 0
1448
- };
1449
- }
1450
- async function writeJsonlFile(filePath, records) {
1451
- const content = records.length === 0 ? "" : `${records.map((record) => JSON.stringify(toSnakeCaseDeep(record))).join("\n")}
1452
- `;
1453
- await writeFile(filePath, content, "utf8");
1454
- }
1455
- function buildTranscriptMessageLines(results) {
1456
- const lines = [];
1457
- for (const result of results) {
1458
- const transcriptLines = toTranscriptJsonLines(
1459
- {
1460
- messages: [...result.input ?? [], ...result.output],
1461
- source: {
1462
- provider: result.target,
1463
- sessionId: result.conversationId ?? result.testId,
1464
- startedAt: result.timestamp
1465
- },
1466
- tokenUsage: result.tokenUsage,
1467
- durationMs: result.durationMs,
1468
- costUsd: result.costUsd
1469
- },
1470
- {
1471
- testId: result.testId,
1472
- target: result.target
1473
- }
1474
- );
1475
- lines.push(...transcriptLines.map((line) => JSON.stringify(line)));
1476
- }
1477
- return lines.length > 0 ? `${lines.join("\n")}
1478
- ` : "";
1479
- }
1480
- async function writeArtifactsFromResults(results, outputDir, options) {
1481
- const testArtifactDir = outputDir;
1482
- const timingPath = path7.join(outputDir, "timing.json");
1483
- const benchmarkPath = path7.join(outputDir, "benchmark.json");
1484
- const indexPath = path7.join(outputDir, RESULT_INDEX_FILENAME);
1485
- await mkdir(outputDir, { recursive: true });
1486
- const indexRecords = [];
1487
- for (const result of results) {
1488
- const grading = buildGradingArtifact(result);
1489
- const timing2 = buildTimingArtifact([result]);
1490
- const artifactSubdir = buildArtifactSubdir(result);
1491
- const testDir = path7.join(outputDir, artifactSubdir);
1492
- const gradingPath = path7.join(testDir, "grading.json");
1493
- const perTestTimingPath = path7.join(testDir, "timing.json");
1494
- await mkdir(testDir, { recursive: true });
1495
- await writeFile(gradingPath, `${JSON.stringify(grading, null, 2)}
1496
- `, "utf8");
1497
- await writeFile(perTestTimingPath, `${JSON.stringify(timing2, null, 2)}
1498
- `, "utf8");
1499
- const input = extractInput(result);
1500
- if (input) {
1501
- await writeFile(path7.join(testDir, "input.md"), input, "utf8");
1502
- }
1503
- if (result.output && result.output.length > 0) {
1504
- const outputsDir = path7.join(testDir, "outputs");
1505
- await mkdir(outputsDir, { recursive: true });
1506
- await writeFile(
1507
- path7.join(outputsDir, "response.md"),
1508
- formatOutputMarkdown(result.output),
1509
- "utf8"
1510
- );
1511
- }
1512
- indexRecords.push({
1513
- ...buildResultIndexArtifact(result),
1514
- experiment: options?.experiment
1515
- });
1516
- }
1517
- const timing = buildTimingArtifact(results);
1518
- await writeFile(timingPath, `${JSON.stringify(timing, null, 2)}
1519
- `, "utf8");
1520
- const benchmark = buildBenchmarkArtifact(results, options?.evalFile, options?.experiment);
1521
- await writeFile(benchmarkPath, `${JSON.stringify(benchmark, null, 2)}
1522
- `, "utf8");
1523
- await writeJsonlFile(indexPath, indexRecords);
1524
- const transcriptPath = path7.join(outputDir, "transcript.jsonl");
1525
- await writeFile(transcriptPath, buildTranscriptMessageLines(results), "utf8");
1526
- return { testArtifactDir, timingPath, benchmarkPath, indexPath };
1527
- }
1528
-
1529
- // src/commands/eval/benchmark-writer.ts
1530
- import { writeFile as writeFile2 } from "node:fs/promises";
1531
- function computeStats2(values) {
1532
- if (values.length === 0) {
1533
- return { mean: 0, stddev: 0 };
1534
- }
1535
- const mean = values.reduce((sum, v) => sum + v, 0) / values.length;
1536
- const variance = values.reduce((sum, v) => sum + (v - mean) ** 2, 0) / values.length;
1537
- return {
1538
- mean: Math.round(mean * 1e3) / 1e3,
1539
- stddev: Math.round(Math.sqrt(variance) * 1e3) / 1e3
1540
- };
1541
- }
1542
- function computePassRate2(result) {
1543
1110
  const scores = result.scores;
1544
1111
  if (scores && scores.length > 0) {
1545
1112
  const passed = scores.filter((s) => s.score >= DEFAULT_THRESHOLD).length;
@@ -1548,7 +1115,7 @@ function computePassRate2(result) {
1548
1115
  return result.score >= DEFAULT_THRESHOLD ? 1 : 0;
1549
1116
  }
1550
1117
  function buildBenchmarkJson(results) {
1551
- const passRates = results.map(computePassRate2);
1118
+ const passRates = results.map(computePassRate);
1552
1119
  const timings = results.filter((r) => r.durationMs != null).map((r) => r.durationMs / 1e3);
1553
1120
  const tokens = results.filter((r) => r.tokenUsage != null).map((r) => {
1554
1121
  const usage = r.tokenUsage;
@@ -1557,29 +1124,29 @@ function buildBenchmarkJson(results) {
1557
1124
  return {
1558
1125
  run_summary: {
1559
1126
  with_skill: {
1560
- pass_rate: computeStats2(passRates),
1561
- time_seconds: computeStats2(timings),
1562
- tokens: computeStats2(tokens)
1127
+ pass_rate: computeStats(passRates),
1128
+ time_seconds: computeStats(timings),
1129
+ tokens: computeStats(tokens)
1563
1130
  }
1564
1131
  }
1565
1132
  };
1566
1133
  }
1567
1134
  async function writeBenchmarkJson(outputPath, results) {
1568
1135
  const benchmark = buildBenchmarkJson(results);
1569
- await writeFile2(outputPath, `${JSON.stringify(benchmark, null, 2)}
1136
+ await writeFile(outputPath, `${JSON.stringify(benchmark, null, 2)}
1570
1137
  `, "utf8");
1571
1138
  }
1572
1139
 
1573
1140
  // src/commands/eval/env.ts
1574
1141
  import { constants as constants3 } from "node:fs";
1575
1142
  import { access as access3 } from "node:fs/promises";
1576
- import path8 from "node:path";
1143
+ import path6 from "node:path";
1577
1144
  import { config as loadDotenv } from "dotenv";
1578
1145
  function uniqueDirs(directories) {
1579
1146
  const seen = /* @__PURE__ */ new Set();
1580
1147
  const result = [];
1581
1148
  for (const dir of directories) {
1582
- const absolute = path8.resolve(dir);
1149
+ const absolute = path6.resolve(dir);
1583
1150
  if (seen.has(absolute)) {
1584
1151
  continue;
1585
1152
  }
@@ -1598,14 +1165,14 @@ async function fileExists2(filePath) {
1598
1165
  }
1599
1166
  function collectAncestorDirectories(start, boundary) {
1600
1167
  const directories = [];
1601
- const boundaryDir = path8.resolve(boundary);
1602
- let current = path8.resolve(start);
1168
+ const boundaryDir = path6.resolve(boundary);
1169
+ let current = path6.resolve(start);
1603
1170
  while (current !== void 0) {
1604
1171
  directories.push(current);
1605
1172
  if (current === boundaryDir) {
1606
1173
  break;
1607
1174
  }
1608
- const parent = path8.dirname(current);
1175
+ const parent = path6.dirname(current);
1609
1176
  if (parent === current) {
1610
1177
  break;
1611
1178
  }
@@ -1615,12 +1182,12 @@ function collectAncestorDirectories(start, boundary) {
1615
1182
  }
1616
1183
  async function loadEnvFromHierarchy(options) {
1617
1184
  const { testFilePath, repoRoot, verbose } = options;
1618
- const testDir = path8.dirname(path8.resolve(testFilePath));
1185
+ const testDir = path6.dirname(path6.resolve(testFilePath));
1619
1186
  const cwd = process.cwd();
1620
1187
  const searchDirs = uniqueDirs([...collectAncestorDirectories(testDir, repoRoot), repoRoot, cwd]);
1621
1188
  const envFiles = [];
1622
1189
  for (const dir of searchDirs) {
1623
- const candidate = path8.join(dir, ".env");
1190
+ const candidate = path6.join(dir, ".env");
1624
1191
  if (await fileExists2(candidate)) {
1625
1192
  envFiles.push(candidate);
1626
1193
  }
@@ -1642,11 +1209,11 @@ async function loadEnvFromHierarchy(options) {
1642
1209
  }
1643
1210
 
1644
1211
  // src/commands/eval/output-writer.ts
1645
- import path14 from "node:path";
1212
+ import path12 from "node:path";
1646
1213
 
1647
1214
  // src/commands/eval/html-writer.ts
1648
- import { mkdir as mkdir2, writeFile as writeFile3 } from "node:fs/promises";
1649
- import path9 from "node:path";
1215
+ import { mkdir, writeFile as writeFile2 } from "node:fs/promises";
1216
+ import path7 from "node:path";
1650
1217
 
1651
1218
  // ../../node_modules/.bun/async-mutex@0.5.0/node_modules/async-mutex/index.mjs
1652
1219
  var E_TIMEOUT = new Error("timeout while waiting for mutex to become available");
@@ -1865,7 +1432,7 @@ var HtmlWriter = class _HtmlWriter {
1865
1432
  this.filePath = filePath;
1866
1433
  }
1867
1434
  static async open(filePath) {
1868
- await mkdir2(path9.dirname(filePath), { recursive: true });
1435
+ await mkdir(path7.dirname(filePath), { recursive: true });
1869
1436
  const writer = new _HtmlWriter(filePath);
1870
1437
  await writer.writeHtml();
1871
1438
  return writer;
@@ -1891,7 +1458,7 @@ var HtmlWriter = class _HtmlWriter {
1891
1458
  }
1892
1459
  async writeHtml() {
1893
1460
  const html = generateHtml(this.results, this.isLive);
1894
- await writeFile3(this.filePath, html, "utf8");
1461
+ await writeFile2(this.filePath, html, "utf8");
1895
1462
  }
1896
1463
  };
1897
1464
  function generateHtml(results, isLive) {
@@ -2326,10 +1893,10 @@ var SCRIPT = `
2326
1893
  h+='<div class="detail-block"><h4>Output</h4><pre class="detail-pre">'+esc(r.output?JSON.stringify(r.output,null,2):"")+"</pre></div>";
2327
1894
  h+="</div>";
2328
1895
 
2329
- /* evaluator results */
1896
+ /* grader results */
2330
1897
  if(r.scores&&r.scores.length>0){
2331
- h+="<h4>Evaluator Results</h4>";
2332
- h+='<table class="eval-table"><thead><tr><th>Evaluator</th><th>Score</th><th>Status</th><th>Assertions</th></tr></thead><tbody>';
1898
+ h+="<h4>Grader Results</h4>";
1899
+ h+='<table class="eval-table"><thead><tr><th>Grader</th><th>Score</th><th>Status</th><th>Assertions</th></tr></thead><tbody>';
2333
1900
  for(var i=0;i<r.scores.length;i++){
2334
1901
  var ev=r.scores[i],evS=ev.score>=0.5?"pass":"fail";
2335
1902
  var evAssertions=ev.assertions||[];
@@ -2375,8 +1942,8 @@ var SCRIPT = `
2375
1942
  `;
2376
1943
 
2377
1944
  // src/commands/eval/json-writer.ts
2378
- import { mkdir as mkdir3, writeFile as writeFile4 } from "node:fs/promises";
2379
- import path10 from "node:path";
1945
+ import { mkdir as mkdir2, writeFile as writeFile3 } from "node:fs/promises";
1946
+ import path8 from "node:path";
2380
1947
  var JsonWriter = class _JsonWriter {
2381
1948
  filePath;
2382
1949
  results = [];
@@ -2385,7 +1952,7 @@ var JsonWriter = class _JsonWriter {
2385
1952
  this.filePath = filePath;
2386
1953
  }
2387
1954
  static async open(filePath) {
2388
- await mkdir3(path10.dirname(filePath), { recursive: true });
1955
+ await mkdir2(path8.dirname(filePath), { recursive: true });
2389
1956
  return new _JsonWriter(filePath);
2390
1957
  }
2391
1958
  async append(result) {
@@ -2412,15 +1979,15 @@ var JsonWriter = class _JsonWriter {
2412
1979
  results: this.results
2413
1980
  };
2414
1981
  const snakeCaseOutput = toSnakeCaseDeep(output);
2415
- await writeFile4(this.filePath, `${JSON.stringify(snakeCaseOutput, null, 2)}
1982
+ await writeFile3(this.filePath, `${JSON.stringify(snakeCaseOutput, null, 2)}
2416
1983
  `, "utf8");
2417
1984
  }
2418
1985
  };
2419
1986
 
2420
1987
  // src/commands/eval/jsonl-writer.ts
2421
1988
  import { createWriteStream } from "node:fs";
2422
- import { mkdir as mkdir4 } from "node:fs/promises";
2423
- import path11 from "node:path";
1989
+ import { mkdir as mkdir3 } from "node:fs/promises";
1990
+ import path9 from "node:path";
2424
1991
  import { finished } from "node:stream/promises";
2425
1992
  var JsonlWriter = class _JsonlWriter {
2426
1993
  stream;
@@ -2429,9 +1996,10 @@ var JsonlWriter = class _JsonlWriter {
2429
1996
  constructor(stream) {
2430
1997
  this.stream = stream;
2431
1998
  }
2432
- static async open(filePath) {
2433
- await mkdir4(path11.dirname(filePath), { recursive: true });
2434
- const stream = createWriteStream(filePath, { flags: "w", encoding: "utf8" });
1999
+ static async open(filePath, options) {
2000
+ await mkdir3(path9.dirname(filePath), { recursive: true });
2001
+ const flags = options?.append ? "a" : "w";
2002
+ const stream = createWriteStream(filePath, { flags, encoding: "utf8" });
2435
2003
  return new _JsonlWriter(stream);
2436
2004
  }
2437
2005
  async append(record) {
@@ -2461,8 +2029,8 @@ var JsonlWriter = class _JsonlWriter {
2461
2029
  };
2462
2030
 
2463
2031
  // src/commands/eval/junit-writer.ts
2464
- import { mkdir as mkdir5, writeFile as writeFile5 } from "node:fs/promises";
2465
- import path12 from "node:path";
2032
+ import { mkdir as mkdir4, writeFile as writeFile4 } from "node:fs/promises";
2033
+ import path10 from "node:path";
2466
2034
  function escapeXml(str) {
2467
2035
  return str.replace(/&/g, "&amp;").replace(/</g, "&lt;").replace(/>/g, "&gt;").replace(/"/g, "&quot;").replace(/'/g, "&apos;");
2468
2036
  }
@@ -2476,7 +2044,7 @@ var JunitWriter = class _JunitWriter {
2476
2044
  this.threshold = options?.threshold ?? 0.5;
2477
2045
  }
2478
2046
  static async open(filePath, options) {
2479
- await mkdir5(path12.dirname(filePath), { recursive: true });
2047
+ await mkdir4(path10.dirname(filePath), { recursive: true });
2480
2048
  return new _JunitWriter(filePath, options);
2481
2049
  }
2482
2050
  async append(result) {
@@ -2545,14 +2113,14 @@ ${testCases.join("\n")}
2545
2113
  ${suiteXmls.join("\n")}
2546
2114
  </testsuites>
2547
2115
  `;
2548
- await writeFile5(this.filePath, xml, "utf8");
2116
+ await writeFile4(this.filePath, xml, "utf8");
2549
2117
  }
2550
2118
  };
2551
2119
 
2552
2120
  // src/commands/eval/yaml-writer.ts
2553
2121
  import { createWriteStream as createWriteStream2 } from "node:fs";
2554
- import { mkdir as mkdir6 } from "node:fs/promises";
2555
- import path13 from "node:path";
2122
+ import { mkdir as mkdir5 } from "node:fs/promises";
2123
+ import path11 from "node:path";
2556
2124
  import { finished as finished2 } from "node:stream/promises";
2557
2125
  import { stringify as stringifyYaml } from "yaml";
2558
2126
  var YamlWriter = class _YamlWriter {
@@ -2564,7 +2132,7 @@ var YamlWriter = class _YamlWriter {
2564
2132
  this.stream = stream;
2565
2133
  }
2566
2134
  static async open(filePath) {
2567
- await mkdir6(path13.dirname(filePath), { recursive: true });
2135
+ await mkdir5(path11.dirname(filePath), { recursive: true });
2568
2136
  const stream = createWriteStream2(filePath, { flags: "w", encoding: "utf8" });
2569
2137
  return new _YamlWriter(stream);
2570
2138
  }
@@ -2604,10 +2172,10 @@ var YamlWriter = class _YamlWriter {
2604
2172
  };
2605
2173
 
2606
2174
  // src/commands/eval/output-writer.ts
2607
- async function createOutputWriter(filePath, format) {
2175
+ async function createOutputWriter(filePath, format, options) {
2608
2176
  switch (format) {
2609
2177
  case "jsonl":
2610
- return JsonlWriter.open(filePath);
2178
+ return JsonlWriter.open(filePath, { append: options?.append });
2611
2179
  case "yaml":
2612
2180
  return YamlWriter.open(filePath);
2613
2181
  case "html":
@@ -2620,7 +2188,7 @@ async function createOutputWriter(filePath, format) {
2620
2188
  }
2621
2189
  var SUPPORTED_EXTENSIONS = /* @__PURE__ */ new Set([".jsonl", ".json", ".xml", ".yaml", ".yml", ".html", ".htm"]);
2622
2190
  function createWriterFromPath(filePath, options) {
2623
- const ext = path14.extname(filePath).toLowerCase();
2191
+ const ext = path12.extname(filePath).toLowerCase();
2624
2192
  switch (ext) {
2625
2193
  case ".jsonl":
2626
2194
  return JsonlWriter.open(filePath);
@@ -2719,12 +2287,12 @@ var ProgressDisplay = class {
2719
2287
  }
2720
2288
  addLogPaths(paths) {
2721
2289
  const newPaths = [];
2722
- for (const path19 of paths) {
2723
- if (this.logPathSet.has(path19)) {
2290
+ for (const path17 of paths) {
2291
+ if (this.logPathSet.has(path17)) {
2724
2292
  continue;
2725
2293
  }
2726
- this.logPathSet.add(path19);
2727
- newPaths.push(path19);
2294
+ this.logPathSet.add(path17);
2295
+ newPaths.push(path17);
2728
2296
  }
2729
2297
  if (newPaths.length === 0) {
2730
2298
  return;
@@ -2777,8 +2345,8 @@ async function loadNonErrorResults(jsonlPath) {
2777
2345
  }
2778
2346
 
2779
2347
  // src/commands/eval/run-cache.ts
2780
- import { mkdir as mkdir7, readFile as readFile2, writeFile as writeFile6 } from "node:fs/promises";
2781
- import path15 from "node:path";
2348
+ import { mkdir as mkdir6, readFile, writeFile as writeFile5 } from "node:fs/promises";
2349
+ import path13 from "node:path";
2782
2350
  var CACHE_FILENAME = "cache.json";
2783
2351
  function resolveRunCacheFile(cache) {
2784
2352
  if (cache.lastRunDir) {
@@ -2787,27 +2355,27 @@ function resolveRunCacheFile(cache) {
2787
2355
  return "";
2788
2356
  }
2789
2357
  function cachePath(cwd) {
2790
- return path15.join(cwd, ".agentv", CACHE_FILENAME);
2358
+ return path13.join(cwd, ".agentv", CACHE_FILENAME);
2791
2359
  }
2792
2360
  async function loadRunCache(cwd) {
2793
2361
  try {
2794
- const content = await readFile2(cachePath(cwd), "utf-8");
2362
+ const content = await readFile(cachePath(cwd), "utf-8");
2795
2363
  return JSON.parse(content);
2796
2364
  } catch {
2797
2365
  return void 0;
2798
2366
  }
2799
2367
  }
2800
2368
  async function saveRunCache(cwd, resultPath) {
2801
- if (path15.basename(resultPath) !== RESULT_INDEX_FILENAME) {
2369
+ if (path13.basename(resultPath) !== RESULT_INDEX_FILENAME) {
2802
2370
  return;
2803
2371
  }
2804
- const dir = path15.join(cwd, ".agentv");
2805
- await mkdir7(dir, { recursive: true });
2372
+ const dir = path13.join(cwd, ".agentv");
2373
+ await mkdir6(dir, { recursive: true });
2806
2374
  const cache = {
2807
- lastRunDir: path15.dirname(resultPath),
2375
+ lastRunDir: path13.dirname(resultPath),
2808
2376
  timestamp: (/* @__PURE__ */ new Date()).toISOString()
2809
2377
  };
2810
- await writeFile6(cachePath(cwd), `${JSON.stringify(cache, null, 2)}
2378
+ await writeFile5(cachePath(cwd), `${JSON.stringify(cache, null, 2)}
2811
2379
  `, "utf-8");
2812
2380
  }
2813
2381
 
@@ -3072,13 +2640,13 @@ function formatMatrixSummary(results) {
3072
2640
  }
3073
2641
 
3074
2642
  // ../../packages/core/dist/evaluation/validation/index.js
3075
- import { readFile as readFile3 } from "node:fs/promises";
3076
- import path16 from "node:path";
2643
+ import { readFile as readFile2 } from "node:fs/promises";
2644
+ import path14 from "node:path";
3077
2645
  import { parse } from "yaml";
3078
2646
  import { readFile as readFile22, readdir } from "node:fs/promises";
3079
2647
  import path22 from "node:path";
3080
2648
  import { parse as parse2 } from "yaml";
3081
- import { readFile as readFile32 } from "node:fs/promises";
2649
+ import { readFile as readFile3 } from "node:fs/promises";
3082
2650
  import path32 from "node:path";
3083
2651
  import { parse as parse3 } from "yaml";
3084
2652
  import { readFile as readFile4 } from "node:fs/promises";
@@ -3097,7 +2665,7 @@ var SCHEMA_TARGETS_V2 = "agentv-targets-v2.2";
3097
2665
  var SCHEMA_CONFIG_V2 = "agentv-config-v2";
3098
2666
  async function detectFileType(filePath) {
3099
2667
  try {
3100
- const content = await readFile3(filePath, "utf8");
2668
+ const content = await readFile2(filePath, "utf8");
3101
2669
  const parsed = parse(content);
3102
2670
  if (Array.isArray(parsed)) {
3103
2671
  return "cases";
@@ -3125,8 +2693,8 @@ async function detectFileType(filePath) {
3125
2693
  }
3126
2694
  }
3127
2695
  function inferFileTypeFromPath(filePath) {
3128
- const normalized = path16.normalize(filePath).replace(/\\/g, "/");
3129
- const basename = path16.basename(filePath);
2696
+ const normalized = path14.normalize(filePath).replace(/\\/g, "/");
2697
+ const basename = path14.basename(filePath);
3130
2698
  if (normalized.includes("/.agentv/")) {
3131
2699
  if (basename === "config.yaml" || basename === "config.yml") {
3132
2700
  return "config";
@@ -3723,7 +3291,7 @@ function validateAssertArray(assertField, parentLocation, filePath, errors, cust
3723
3291
  continue;
3724
3292
  }
3725
3293
  const typeValue = rawTypeValue.replace(/_/g, "-");
3726
- if (!isEvaluatorKind(typeValue) && !customAssertionTypes.has(typeValue)) {
3294
+ if (!isGraderKind(typeValue) && !customAssertionTypes.has(typeValue)) {
3727
3295
  errors.push({
3728
3296
  severity: "warning",
3729
3297
  filePath,
@@ -3897,7 +3465,7 @@ async function validateCasesFile(filePath) {
3897
3465
  const absolutePath = path32.resolve(filePath);
3898
3466
  let parsed;
3899
3467
  try {
3900
- const content = await readFile32(absolutePath, "utf8");
3468
+ const content = await readFile3(absolutePath, "utf8");
3901
3469
  parsed = parse3(content);
3902
3470
  } catch (error) {
3903
3471
  errors.push({
@@ -5005,6 +4573,12 @@ Errors in ${targetsFilePath}:`);
5005
4573
 
5006
4574
  // src/commands/eval/run-eval.ts
5007
4575
  var DEFAULT_WORKERS = 3;
4576
+ function shouldSkipExistingResultForResume(result, rerunFailed) {
4577
+ if (rerunFailed) {
4578
+ return result.executionStatus === "ok";
4579
+ }
4580
+ return result.executionStatus !== "execution_error";
4581
+ }
5008
4582
  function normalizeBoolean(value) {
5009
4583
  return value === true;
5010
4584
  }
@@ -5175,6 +4749,8 @@ function normalizeOptions(rawOptions, config, yamlExecution) {
5175
4749
  otelCaptureContent: normalizeBoolean(rawOptions.otelCaptureContent) || yamlExecution?.otel_capture_content === true,
5176
4750
  otelGroupTurns: normalizeBoolean(rawOptions.otelGroupTurns) || yamlExecution?.otel_group_turns === true,
5177
4751
  retryErrors: normalizeString(rawOptions.retryErrors),
4752
+ resume: normalizeBoolean(rawOptions.resume) || normalizeBoolean(rawOptions.rerunFailed),
4753
+ rerunFailed: normalizeBoolean(rawOptions.rerunFailed),
5178
4754
  workspaceMode,
5179
4755
  workspacePath,
5180
4756
  // Precedence: CLI > YAML config > TS config
@@ -5201,7 +4777,7 @@ async function ensureFileExists(filePath, description) {
5201
4777
  function buildDefaultOutputPathForExperiment(cwd, experiment) {
5202
4778
  const runDir = buildDefaultRunDir(cwd, experiment);
5203
4779
  mkdirSync(runDir, { recursive: true });
5204
- return path17.join(runDir, "index.jsonl");
4780
+ return path15.join(runDir, "index.jsonl");
5205
4781
  }
5206
4782
  function createProgressReporter(maxWorkers, options) {
5207
4783
  const display = new ProgressDisplay(maxWorkers, options);
@@ -5215,7 +4791,7 @@ function createProgressReporter(maxWorkers, options) {
5215
4791
  };
5216
4792
  }
5217
4793
  function makeTestCaseKey(testFilePath, testId) {
5218
- return `${path17.resolve(testFilePath)}::${testId}`;
4794
+ return `${path15.resolve(testFilePath)}::${testId}`;
5219
4795
  }
5220
4796
  function resolveTargetLabel(requestedName, resolvedName) {
5221
4797
  if (resolvedName !== requestedName) {
@@ -5277,7 +4853,7 @@ async function prepareFileMetadata(params) {
5277
4853
  repoRoot,
5278
4854
  verbose: options.verbose
5279
4855
  });
5280
- const relativePath = path17.relative(cwd, testFilePath);
4856
+ const relativePath = path15.relative(cwd, testFilePath);
5281
4857
  const category = deriveCategory(relativePath);
5282
4858
  const suite = await loadTestSuite(testFilePath, repoRoot, {
5283
4859
  verbose: options.verbose,
@@ -5302,7 +4878,53 @@ async function prepareFileMetadata(params) {
5302
4878
  selections = [
5303
4879
  {
5304
4880
  selection: transcriptSelection,
5305
- inlineTargetLabel: `transcript (${path17.basename(options.transcript)})`
4881
+ inlineTargetLabel: `transcript (${path15.basename(options.transcript)})`
4882
+ }
4883
+ ];
4884
+ } else if (suite.inlineTarget && options.cliTargets.length === 0) {
4885
+ const targetDefinition = suite.inlineTarget;
4886
+ const resolvedTarget = options.dryRun ? {
4887
+ kind: "mock",
4888
+ name: `${targetDefinition.name}-dry-run`,
4889
+ graderTarget: void 0,
4890
+ config: {
4891
+ response: '{"answer":"Mock dry-run response"}',
4892
+ delayMs: options.dryRunDelay,
4893
+ delayMinMs: options.dryRunDelayMin,
4894
+ delayMaxMs: options.dryRunDelayMax
4895
+ }
4896
+ } : resolveTargetDefinition(targetDefinition, process.env, testFilePath, {
4897
+ emitDeprecationWarnings: false
4898
+ });
4899
+ selections = [
4900
+ {
4901
+ selection: {
4902
+ definitions: [targetDefinition],
4903
+ resolvedTarget,
4904
+ targetName: targetDefinition.name,
4905
+ targetSource: "test-file",
4906
+ targetsFilePath: testFilePath
4907
+ },
4908
+ inlineTargetLabel: resolveTargetLabel(targetDefinition.name, resolvedTarget.name)
4909
+ }
4910
+ ];
4911
+ } else if (suite.providerFactory && options.cliTargets.length === 0) {
4912
+ const taskTarget = {
4913
+ kind: "mock",
4914
+ name: "custom-task",
4915
+ graderTarget: void 0,
4916
+ config: {}
4917
+ };
4918
+ selections = [
4919
+ {
4920
+ selection: {
4921
+ definitions: [],
4922
+ resolvedTarget: taskTarget,
4923
+ targetName: "custom-task",
4924
+ targetSource: "test-file",
4925
+ targetsFilePath: testFilePath
4926
+ },
4927
+ inlineTargetLabel: "custom-task"
5306
4928
  }
5307
4929
  ];
5308
4930
  } else {
@@ -5372,10 +4994,11 @@ async function prepareFileMetadata(params) {
5372
4994
  yamlWorkers: suite.workers,
5373
4995
  yamlCache: suite.cacheConfig?.enabled,
5374
4996
  yamlCachePath: suite.cacheConfig?.cachePath,
5375
- totalBudgetUsd: suite.totalBudgetUsd,
4997
+ budgetUsd: suite.budgetUsd,
5376
4998
  failOnError: suite.failOnError,
5377
4999
  threshold: suite.threshold,
5378
- tags: suite.metadata?.tags
5000
+ tags: suite.metadata?.tags,
5001
+ providerFactory: suite.providerFactory
5379
5002
  };
5380
5003
  }
5381
5004
  async function runSingleEvalFile(params) {
@@ -5398,7 +5021,7 @@ async function runSingleEvalFile(params) {
5398
5021
  testCases,
5399
5022
  trialsConfig,
5400
5023
  matrixMode,
5401
- totalBudgetUsd,
5024
+ budgetUsd,
5402
5025
  failOnError,
5403
5026
  providerFactory
5404
5027
  } = params;
@@ -5464,7 +5087,7 @@ async function runSingleEvalFile(params) {
5464
5087
  workspacePath: options.workspacePath,
5465
5088
  keepWorkspaces: options.keepWorkspaces,
5466
5089
  trials: trialsConfig,
5467
- totalBudgetUsd,
5090
+ budgetUsd,
5468
5091
  failOnError,
5469
5092
  graderTarget: options.graderTarget,
5470
5093
  model: options.model,
@@ -5537,7 +5160,7 @@ async function runEvalCommand(input) {
5537
5160
  );
5538
5161
  }
5539
5162
  const repoRoot = await findRepoRoot(cwd);
5540
- const yamlConfig = await loadConfig(path17.join(cwd, "_"), repoRoot);
5163
+ const yamlConfig = await loadConfig(path15.join(cwd, "_"), repoRoot);
5541
5164
  if (yamlConfig?.required_version) {
5542
5165
  await enforceRequiredVersion(yamlConfig.required_version, {
5543
5166
  strict: normalizeBoolean(input.rawOptions.strict)
@@ -5552,7 +5175,7 @@ async function runEvalCommand(input) {
5552
5175
  }
5553
5176
  let retryNonErrorResults;
5554
5177
  if (options.retryErrors) {
5555
- const retryPath = path17.resolve(options.retryErrors);
5178
+ const retryPath = path15.resolve(options.retryErrors);
5556
5179
  await ensureFileExists(retryPath, "Retry-errors JSONL file");
5557
5180
  const completedIds = await loadFullyCompletedTestIds(retryPath);
5558
5181
  const errorIds = await loadErrorTestIds(retryPath);
@@ -5565,8 +5188,37 @@ async function runEvalCommand(input) {
5565
5188
  console.log(`Skipping ${completedIds.length} already-completed test(s).`);
5566
5189
  }
5567
5190
  }
5191
+ let resumeSkipKeys;
5192
+ let isResumeAppend = false;
5193
+ if (options.resume && !options.retryErrors) {
5194
+ const explicitResumeDir = options.outputDir ?? options.artifacts;
5195
+ if (explicitResumeDir) {
5196
+ const resumeIndexPath = path15.join(path15.resolve(explicitResumeDir), "index.jsonl");
5197
+ if (existsSync2(resumeIndexPath)) {
5198
+ const content = await readFile8(resumeIndexPath, "utf8");
5199
+ const existingResults = parseJsonlResults(content);
5200
+ resumeSkipKeys = /* @__PURE__ */ new Set();
5201
+ for (const r of existingResults) {
5202
+ if (shouldSkipExistingResultForResume(r, options.rerunFailed)) {
5203
+ resumeSkipKeys.add(buildTestTargetKey(r.testId, r.target));
5204
+ }
5205
+ }
5206
+ isResumeAppend = true;
5207
+ const modeLabel = options.rerunFailed ? "Rerun-failed" : "Resume";
5208
+ console.log(
5209
+ `${modeLabel}: found ${existingResults.length} existing result(s), skipping ${resumeSkipKeys.size} completed.`
5210
+ );
5211
+ } else {
5212
+ console.log("Resume: no existing index.jsonl found, starting fresh run.");
5213
+ }
5214
+ } else {
5215
+ console.warn(
5216
+ "Warning: --resume requires --output <dir> to identify the run directory. Ignoring --resume."
5217
+ );
5218
+ }
5219
+ }
5568
5220
  if (options.workspacePath) {
5569
- const resolvedWorkspace = path17.resolve(options.workspacePath);
5221
+ const resolvedWorkspace = path15.resolve(options.workspacePath);
5570
5222
  try {
5571
5223
  const { stat: stat2 } = await import("node:fs/promises");
5572
5224
  const stats = await stat2(resolvedWorkspace);
@@ -5607,25 +5259,25 @@ async function runEvalCommand(input) {
5607
5259
  let outputPath;
5608
5260
  let usesDefaultArtifactWorkspace;
5609
5261
  if (explicitDir) {
5610
- runDir = path17.resolve(explicitDir);
5262
+ runDir = path15.resolve(explicitDir);
5611
5263
  mkdirSync(runDir, { recursive: true });
5612
- outputPath = path17.join(runDir, "index.jsonl");
5264
+ outputPath = path15.join(runDir, "index.jsonl");
5613
5265
  usesDefaultArtifactWorkspace = true;
5614
5266
  } else if (options.outPath) {
5615
- outputPath = path17.resolve(options.outPath);
5616
- runDir = path17.dirname(outputPath);
5267
+ outputPath = path15.resolve(options.outPath);
5268
+ runDir = path15.dirname(outputPath);
5617
5269
  mkdirSync(runDir, { recursive: true });
5618
5270
  usesDefaultArtifactWorkspace = false;
5619
5271
  } else {
5620
5272
  outputPath = buildDefaultOutputPathForExperiment(cwd, options.experiment);
5621
- runDir = path17.dirname(outputPath);
5273
+ runDir = path15.dirname(outputPath);
5622
5274
  usesDefaultArtifactWorkspace = true;
5623
5275
  }
5624
5276
  let otelExporter = null;
5625
5277
  const useFileExport = !!options.otelFile;
5626
5278
  if (options.exportOtel || useFileExport) {
5627
5279
  try {
5628
- const { OtelTraceExporter, OTEL_BACKEND_PRESETS } = await import("./dist-U5EXNMON.js");
5280
+ const { OtelTraceExporter, OTEL_BACKEND_PRESETS } = await import("./dist-RTIUSC6L.js");
5629
5281
  let endpoint = process.env.OTEL_EXPORTER_OTLP_ENDPOINT;
5630
5282
  let headers = {};
5631
5283
  if (options.otelBackend) {
@@ -5649,7 +5301,7 @@ async function runEvalCommand(input) {
5649
5301
  headers,
5650
5302
  captureContent,
5651
5303
  groupTurns: options.otelGroupTurns,
5652
- otlpFilePath: options.otelFile ? path17.resolve(options.otelFile) : void 0
5304
+ otlpFilePath: options.otelFile ? path15.resolve(options.otelFile) : void 0
5653
5305
  });
5654
5306
  const initialized = await otelExporter.init();
5655
5307
  if (!initialized) {
@@ -5666,7 +5318,7 @@ async function runEvalCommand(input) {
5666
5318
  }
5667
5319
  }
5668
5320
  const primaryWritePath = outputPath;
5669
- const resolvedExportPaths = options.exportPaths.map((p) => path17.resolve(p));
5321
+ const resolvedExportPaths = options.exportPaths.map((p) => path15.resolve(p));
5670
5322
  console.log(`Artifact directory: ${runDir}`);
5671
5323
  if (resolvedExportPaths.length > 0) {
5672
5324
  console.log("Export files:");
@@ -5674,9 +5326,9 @@ async function runEvalCommand(input) {
5674
5326
  console.log(` ${p}`);
5675
5327
  }
5676
5328
  }
5677
- const resolvedTestFiles = input.testFiles.map((file) => path17.resolve(file));
5329
+ const resolvedTestFiles = input.testFiles.map((file) => path15.resolve(file));
5678
5330
  if (options.otelFile) {
5679
- console.log(`OTLP JSON file: ${path17.resolve(options.otelFile)}`);
5331
+ console.log(`OTLP JSON file: ${path15.resolve(options.otelFile)}`);
5680
5332
  }
5681
5333
  const evaluationRunner = await resolveEvaluationRunner();
5682
5334
  const allResults = [];
@@ -5685,23 +5337,7 @@ async function runEvalCommand(input) {
5685
5337
  const displayIdTracker = createDisplayIdTracker();
5686
5338
  const perFileWorkers = options.workers;
5687
5339
  const fileMetadata = /* @__PURE__ */ new Map();
5688
- const tsFiles = [];
5689
- const yamlFiles = [];
5690
5340
  for (const testFilePath of resolvedTestFiles) {
5691
- if (/\.(ts|js|mts|mjs)$/.test(testFilePath)) {
5692
- tsFiles.push(testFilePath);
5693
- } else {
5694
- yamlFiles.push(testFilePath);
5695
- }
5696
- }
5697
- for (const tsFile of tsFiles) {
5698
- await ensureFileExists(tsFile, "TypeScript eval file");
5699
- await import(pathToFileURL(tsFile).href);
5700
- }
5701
- if (yamlFiles.length === 0 && tsFiles.length > 0) {
5702
- return;
5703
- }
5704
- for (const testFilePath of yamlFiles) {
5705
5341
  const meta = await prepareFileMetadata({
5706
5342
  testFilePath,
5707
5343
  repoRoot,
@@ -5716,7 +5352,7 @@ async function runEvalCommand(input) {
5716
5352
  for (const [testFilePath, meta] of fileMetadata.entries()) {
5717
5353
  if (!matchesTagFilters(meta.tags, options.tags, options.excludeTags)) {
5718
5354
  fileMetadata.delete(testFilePath);
5719
- skippedFiles.push(path17.relative(cwd, testFilePath));
5355
+ skippedFiles.push(path15.relative(cwd, testFilePath));
5720
5356
  }
5721
5357
  }
5722
5358
  if (skippedFiles.length > 0 && options.verbose) {
@@ -5737,7 +5373,7 @@ async function runEvalCommand(input) {
5737
5373
  cliNoCache: options.noCache,
5738
5374
  yamlCache: yamlCacheEnabled
5739
5375
  });
5740
- const cache = cacheEnabled ? new ResponseCache(yamlCachePath ? path17.resolve(yamlCachePath) : void 0) : void 0;
5376
+ const cache = cacheEnabled ? new ResponseCache(yamlCachePath ? path15.resolve(yamlCachePath) : void 0) : void 0;
5741
5377
  if (cacheEnabled) {
5742
5378
  console.log(`Response cache: enabled${yamlCachePath ? ` (${yamlCachePath})` : ""}`);
5743
5379
  }
@@ -5747,14 +5383,25 @@ async function runEvalCommand(input) {
5747
5383
  throw new Error("--threshold must be between 0 and 1");
5748
5384
  }
5749
5385
  const writerOptions = resolvedThreshold !== void 0 ? { threshold: resolvedThreshold } : void 0;
5750
- const outputWriter = await createOutputWriter(primaryWritePath, "jsonl");
5386
+ const outputWriter = await createOutputWriter(primaryWritePath, "jsonl", {
5387
+ append: isResumeAppend
5388
+ });
5751
5389
  const isMatrixMode = Array.from(fileMetadata.values()).some((meta) => meta.selections.length > 1);
5752
5390
  let totalEvalCount = 0;
5391
+ let resumeSkippedCount = 0;
5753
5392
  for (const meta of fileMetadata.values()) {
5754
5393
  const suiteTargetNames = meta.selections.map((s) => s.selection.targetName);
5755
5394
  for (const test of meta.testCases) {
5756
5395
  const testTargetNames = test.targets && test.targets.length > 0 ? test.targets.filter((t) => suiteTargetNames.includes(t)) : suiteTargetNames;
5757
- totalEvalCount += testTargetNames.length > 0 ? testTargetNames.length : 1;
5396
+ const effectiveTargets = testTargetNames.length > 0 ? testTargetNames : ["unknown"];
5397
+ for (const tn of effectiveTargets) {
5398
+ const key = `${test.id}::${tn}`;
5399
+ if (resumeSkipKeys?.has(key)) {
5400
+ resumeSkippedCount++;
5401
+ } else {
5402
+ totalEvalCount++;
5403
+ }
5404
+ }
5758
5405
  }
5759
5406
  }
5760
5407
  if (totalEvalCount === 0) {
@@ -5762,6 +5409,10 @@ async function runEvalCommand(input) {
5762
5409
  console.log("No execution errors or missing cases in the previous run. Nothing to retry.");
5763
5410
  return;
5764
5411
  }
5412
+ if (resumeSkipKeys && resumeSkippedCount > 0) {
5413
+ console.log(`Nothing to resume \u2014 all ${resumeSkippedCount} test(s) already completed.`);
5414
+ return;
5415
+ }
5765
5416
  throw new Error("No tests matched the provided filters.");
5766
5417
  }
5767
5418
  const progressReporter = createProgressReporter(options.workers ?? DEFAULT_WORKERS, {
@@ -5821,7 +5472,7 @@ async function runEvalCommand(input) {
5821
5472
  const activeTestFiles = resolvedTestFiles.filter((f) => fileMetadata.has(f));
5822
5473
  let transcriptProviderFactory;
5823
5474
  if (options.transcript) {
5824
- const { TranscriptProvider } = await import("./dist-U5EXNMON.js");
5475
+ const { TranscriptProvider } = await import("./dist-RTIUSC6L.js");
5825
5476
  const transcriptProvider = await TranscriptProvider.fromFile(options.transcript);
5826
5477
  const totalTests = [...fileMetadata.values()].reduce(
5827
5478
  (sum, meta) => sum + meta.testCases.length,
@@ -5852,7 +5503,10 @@ async function runEvalCommand(input) {
5852
5503
  }
5853
5504
  return true;
5854
5505
  }) : targetPrep.testCases;
5855
- if (applicableTestCases.length === 0) {
5506
+ const filteredTestCases = resumeSkipKeys ? applicableTestCases.filter(
5507
+ (test) => !resumeSkipKeys.has(buildTestTargetKey(test.id, targetName))
5508
+ ) : applicableTestCases;
5509
+ if (filteredTestCases.length === 0) {
5856
5510
  return [];
5857
5511
  }
5858
5512
  try {
@@ -5872,15 +5526,15 @@ async function runEvalCommand(input) {
5872
5526
  displayIdTracker,
5873
5527
  selection,
5874
5528
  inlineTargetLabel,
5875
- testCases: applicableTestCases,
5529
+ testCases: filteredTestCases,
5876
5530
  trialsConfig: options.transcript ? void 0 : targetPrep.trialsConfig,
5877
5531
  matrixMode: targetPrep.selections.length > 1,
5878
- totalBudgetUsd: targetPrep.totalBudgetUsd,
5532
+ budgetUsd: targetPrep.budgetUsd,
5879
5533
  failOnError: targetPrep.failOnError,
5880
5534
  threshold: resolvedThreshold,
5881
- providerFactory: transcriptProviderFactory
5535
+ providerFactory: transcriptProviderFactory ?? targetPrep.providerFactory
5882
5536
  });
5883
- const evalFile = path17.relative(cwd, testFilePath);
5537
+ const evalFile = path15.relative(cwd, testFilePath);
5884
5538
  const existingSummary = remoteEvalSummaries.find(
5885
5539
  (summary2) => summary2.evalFile === evalFile
5886
5540
  );
@@ -5897,10 +5551,10 @@ async function runEvalCommand(input) {
5897
5551
  const message = fileError instanceof Error ? fileError.message : String(fileError);
5898
5552
  console.error(
5899
5553
  `
5900
- [ERROR] \u26A0 Eval file failed: ${path17.basename(testFilePath)} \u2014 ${message}
5554
+ [ERROR] \u26A0 Eval file failed: ${path15.basename(testFilePath)} \u2014 ${message}
5901
5555
  `
5902
5556
  );
5903
- const errorResults = applicableTestCases.map((testCase) => ({
5557
+ const errorResults = filteredTestCases.map((testCase) => ({
5904
5558
  timestamp: (/* @__PURE__ */ new Date()).toISOString(),
5905
5559
  testId: testCase.id,
5906
5560
  score: 0,
@@ -5936,37 +5590,60 @@ async function runEvalCommand(input) {
5936
5590
  `Merged ${retryNonErrorResults.length} non-error result(s) from previous output.`
5937
5591
  );
5938
5592
  }
5593
+ await outputWriter.close().catch(() => void 0);
5594
+ let summaryResults = allResults;
5595
+ if (isResumeAppend && usesDefaultArtifactWorkspace) {
5596
+ const content = await readFile8(outputPath, "utf8");
5597
+ summaryResults = deduplicateByTestIdTarget(parseJsonlResults(content));
5598
+ }
5939
5599
  const thresholdOpts = resolvedThreshold !== void 0 ? { threshold: resolvedThreshold } : void 0;
5940
- const summary = calculateEvaluationSummary(allResults, thresholdOpts);
5600
+ const summary = calculateEvaluationSummary(summaryResults, thresholdOpts);
5941
5601
  console.log(formatEvaluationSummary(summary, thresholdOpts));
5942
5602
  const allExecutionErrors = summary.total > 0 && summary.executionErrorCount === summary.total;
5943
5603
  const thresholdFailed = resolvedThreshold !== void 0 && summary.qualityFailureCount > 0;
5944
- if (isMatrixMode && allResults.length > 0) {
5945
- console.log(formatMatrixSummary(allResults));
5604
+ if (isMatrixMode && summaryResults.length > 0) {
5605
+ console.log(formatMatrixSummary(summaryResults));
5946
5606
  }
5947
5607
  if (options.benchmarkJson && allResults.length > 0) {
5948
- const benchmarkPath = path17.resolve(options.benchmarkJson);
5608
+ const benchmarkPath = path15.resolve(options.benchmarkJson);
5949
5609
  await writeBenchmarkJson(benchmarkPath, allResults);
5950
5610
  console.log(`Benchmark written to: ${benchmarkPath}`);
5951
5611
  }
5952
5612
  if (usesDefaultArtifactWorkspace && allResults.length > 0) {
5953
5613
  const evalFile = activeTestFiles.length === 1 ? activeTestFiles[0] : "";
5954
- const {
5955
- testArtifactDir,
5956
- timingPath,
5957
- benchmarkPath: workspaceBenchmarkPath,
5958
- indexPath
5959
- } = await writeArtifactsFromResults(allResults, runDir, {
5960
- evalFile,
5961
- experiment: normalizeExperimentName(options.experiment)
5962
- });
5963
- console.log(`Artifact workspace written to: ${runDir}`);
5964
- console.log(` Index: ${indexPath}`);
5965
- console.log(
5966
- ` Per-test artifacts: ${testArtifactDir} (${allResults.length} test directories)`
5967
- );
5968
- console.log(` Timing: ${timingPath}`);
5969
- console.log(` Benchmark: ${workspaceBenchmarkPath}`);
5614
+ if (isResumeAppend) {
5615
+ const { writePerTestArtifacts } = await import("./artifact-writer-YATMDPWI.js");
5616
+ await writePerTestArtifacts(allResults, runDir, {
5617
+ experiment: normalizeExperimentName(options.experiment)
5618
+ });
5619
+ const { benchmarkPath: workspaceBenchmarkPath, timingPath } = await aggregateRunDir(
5620
+ runDir,
5621
+ { evalFile, experiment: normalizeExperimentName(options.experiment) }
5622
+ );
5623
+ const indexPath = path15.join(runDir, "index.jsonl");
5624
+ console.log(`Artifact workspace updated: ${runDir}`);
5625
+ console.log(` Index: ${indexPath}`);
5626
+ console.log(` Per-test artifacts: ${runDir} (${allResults.length} new test directories)`);
5627
+ console.log(` Timing: ${timingPath}`);
5628
+ console.log(` Benchmark: ${workspaceBenchmarkPath}`);
5629
+ } else {
5630
+ const {
5631
+ testArtifactDir,
5632
+ timingPath,
5633
+ benchmarkPath: workspaceBenchmarkPath,
5634
+ indexPath
5635
+ } = await writeArtifactsFromResults(allResults, runDir, {
5636
+ evalFile,
5637
+ experiment: normalizeExperimentName(options.experiment)
5638
+ });
5639
+ console.log(`Artifact workspace written to: ${runDir}`);
5640
+ console.log(` Index: ${indexPath}`);
5641
+ console.log(
5642
+ ` Per-test artifacts: ${testArtifactDir} (${allResults.length} test directories)`
5643
+ );
5644
+ console.log(` Timing: ${timingPath}`);
5645
+ console.log(` Benchmark: ${workspaceBenchmarkPath}`);
5646
+ }
5970
5647
  }
5971
5648
  if (resolvedExportPaths.length > 0 && allResults.length > 0) {
5972
5649
  for (const exportPath of resolvedExportPaths) {
@@ -5977,7 +5654,7 @@ async function runEvalCommand(input) {
5977
5654
  await writer.close();
5978
5655
  }
5979
5656
  console.log(
5980
- `Export file(s) written: ${resolvedExportPaths.map((p) => path17.relative(cwd, p)).join(", ")}`
5657
+ `Export file(s) written: ${resolvedExportPaths.map((p) => path15.relative(cwd, p)).join(", ")}`
5981
5658
  );
5982
5659
  }
5983
5660
  const resultsWithWorkspaces = allResults.filter((r) => r.workspacePath);
@@ -6015,14 +5692,14 @@ Results written to: ${outputPath}`);
6015
5692
  experiment: normalizeExperimentName(options.experiment)
6016
5693
  });
6017
5694
  }
6018
- if (summary.executionErrorCount > 0 && !options.retryErrors) {
6019
- const evalFileArgs = activeTestFiles.map((f) => path17.relative(cwd, f)).join(" ");
5695
+ if (summary.executionErrorCount > 0 && !options.retryErrors && !options.resume) {
5696
+ const evalFileArgs = activeTestFiles.map((f) => path15.relative(cwd, f)).join(" ");
6020
5697
  const targetFlag = options.target ? ` --target ${options.target}` : "";
6021
- const relativeOutputPath = path17.relative(cwd, outputPath);
5698
+ const relativeRunDir = path15.relative(cwd, runDir);
6022
5699
  console.log(
6023
5700
  `
6024
5701
  Tip: ${summary.executionErrorCount} execution error(s) detected. Re-run failed tests with:
6025
- agentv eval run ${evalFileArgs}${targetFlag} --retry-errors ${relativeOutputPath}`
5702
+ agentv eval run ${evalFileArgs}${targetFlag} --output ${relativeRunDir} --rerun-failed`
6026
5703
  );
6027
5704
  }
6028
5705
  return {
@@ -6052,7 +5729,7 @@ async function resolveEvaluationRunner() {
6052
5729
  if (!overridePath) {
6053
5730
  return runEvaluation;
6054
5731
  }
6055
- const resolved = path17.isAbsolute(overridePath) ? overridePath : path17.resolve(process.cwd(), overridePath);
5732
+ const resolved = path15.isAbsolute(overridePath) ? overridePath : path15.resolve(process.cwd(), overridePath);
6056
5733
  const moduleUrl = pathToFileURL(resolved).href;
6057
5734
  const mod = await import(moduleUrl);
6058
5735
  const candidate = mod.runEvaluation;
@@ -6065,11 +5742,11 @@ async function resolveEvaluationRunner() {
6065
5742
  }
6066
5743
 
6067
5744
  // src/commands/eval/discover.ts
6068
- import path18 from "node:path";
5745
+ import path16 from "node:path";
6069
5746
  import fg2 from "fast-glob";
6070
5747
  async function discoverEvalFiles(cwd) {
6071
5748
  const repoRoot = await findRepoRoot(cwd);
6072
- const config = await loadConfig(path18.join(cwd, "_"), repoRoot);
5749
+ const config = await loadConfig(path16.join(cwd, "_"), repoRoot);
6073
5750
  const patterns = config?.eval_patterns && config.eval_patterns.length > 0 ? config.eval_patterns : DEFAULT_EVAL_PATTERNS;
6074
5751
  const ignore = ["**/node_modules/**", "**/dist/**"];
6075
5752
  const matches = await fg2(patterns, {
@@ -6081,7 +5758,7 @@ async function discoverEvalFiles(cwd) {
6081
5758
  caseSensitiveMatch: false
6082
5759
  });
6083
5760
  const evalFiles = matches.map((absPath) => {
6084
- const relativePath = path18.relative(cwd, absPath);
5761
+ const relativePath = path16.relative(cwd, absPath);
6085
5762
  const category = deriveCategory(relativePath);
6086
5763
  return { path: absPath, relativePath, category };
6087
5764
  });
@@ -6103,10 +5780,6 @@ function filterByCategory(files, category) {
6103
5780
 
6104
5781
  export {
6105
5782
  package_default,
6106
- toSnakeCaseDeep,
6107
- RESULT_INDEX_FILENAME,
6108
- buildDefaultRunDir,
6109
- resolveRunManifestPath,
6110
5783
  parseResultManifest,
6111
5784
  resolveResultSourcePath,
6112
5785
  loadManifestResults,
@@ -6132,7 +5805,6 @@ export {
6132
5805
  listMergedResultFiles,
6133
5806
  findRunById,
6134
5807
  maybeAutoExportRunArtifacts,
6135
- writeArtifactsFromResults,
6136
5808
  resolveRunCacheFile,
6137
5809
  loadRunCache,
6138
5810
  detectFileType,
@@ -6150,4 +5822,4 @@ export {
6150
5822
  getCategories,
6151
5823
  filterByCategory
6152
5824
  };
6153
- //# sourceMappingURL=chunk-ZUNYOUFO.js.map
5825
+ //# sourceMappingURL=chunk-YGNBTBXW.js.map