agentv 4.10.0 → 4.11.2-next.1

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (37) hide show
  1. package/dist/{chunk-XOSNETAV.js → chunk-BAUNAXHT.js} +1 -1
  2. package/dist/chunk-BPGJ4HBU.js +183 -0
  3. package/dist/chunk-BPGJ4HBU.js.map +1 -0
  4. package/dist/{chunk-KF6BABQ5.js → chunk-DHWFLK6T.js} +1090 -303
  5. package/dist/chunk-DHWFLK6T.js.map +1 -0
  6. package/dist/{chunk-SE73HJZG.js → chunk-FQGY6QXQ.js} +780 -346
  7. package/dist/chunk-FQGY6QXQ.js.map +1 -0
  8. package/dist/chunk-NPVGBFF6.js +151 -0
  9. package/dist/chunk-NPVGBFF6.js.map +1 -0
  10. package/dist/{chunk-VA64NETD.js → chunk-YLVQNF23.js} +1120 -731
  11. package/dist/chunk-YLVQNF23.js.map +1 -0
  12. package/dist/cli.js +6 -4
  13. package/dist/cli.js.map +1 -1
  14. package/dist/{dist-XDNB4WDT.js → dist-HNSXNRVK.js} +36 -3
  15. package/dist/docker-workspace-RPPXBT27-B4AQHVWA.js +11 -0
  16. package/dist/{esm-CZAWIY6F.js → esm-UYZ3HJBU.js} +2 -2
  17. package/dist/esm-UYZ3HJBU.js.map +1 -0
  18. package/dist/exec-AR6JUUN5-6MBPURPR.js +11 -0
  19. package/dist/exec-AR6JUUN5-6MBPURPR.js.map +1 -0
  20. package/dist/index.js +6 -4
  21. package/dist/{interactive-SNKK6VCV.js → interactive-OPHUF2UP.js} +6 -4
  22. package/dist/{interactive-SNKK6VCV.js.map → interactive-OPHUF2UP.js.map} +1 -1
  23. package/dist/{src-ML4D2MC2.js → src-PXDA7QIS.js} +2 -2
  24. package/dist/studio/assets/index-Bi-KHfNm.js +65 -0
  25. package/dist/studio/assets/index-D_j-w4UO.css +1 -0
  26. package/dist/studio/assets/{index-DcwjOyrk.js → index-VyDFrnoK.js} +1 -1
  27. package/dist/studio/index.html +2 -2
  28. package/package.json +1 -1
  29. package/dist/chunk-KF6BABQ5.js.map +0 -1
  30. package/dist/chunk-SE73HJZG.js.map +0 -1
  31. package/dist/chunk-VA64NETD.js.map +0 -1
  32. package/dist/studio/assets/index-DHxVz6M9.css +0 -1
  33. package/dist/studio/assets/index-Y5InSvcS.js +0 -65
  34. /package/dist/{chunk-XOSNETAV.js.map → chunk-BAUNAXHT.js.map} +0 -0
  35. /package/dist/{dist-XDNB4WDT.js.map → dist-HNSXNRVK.js.map} +0 -0
  36. /package/dist/{esm-CZAWIY6F.js.map → docker-workspace-RPPXBT27-B4AQHVWA.js.map} +0 -0
  37. /package/dist/{src-ML4D2MC2.js.map → src-PXDA7QIS.js.map} +0 -0
@@ -2,32 +2,47 @@ import { createRequire } from 'node:module'; const require = createRequire(impor
2
2
  import {
3
3
  HtmlWriter,
4
4
  RESULT_INDEX_FILENAME,
5
- RESULT_RUNS_DIRNAME,
6
5
  TARGET_FILE_CANDIDATES,
7
6
  buildDefaultRunDir,
7
+ c,
8
8
  detectFileType,
9
9
  discoverEvalFiles,
10
10
  findRepoRoot,
11
+ findRunById,
12
+ formatCost,
13
+ formatDuration,
14
+ formatNumber,
15
+ formatScore,
16
+ formatSize,
17
+ getRemoteResultsStatus,
18
+ getTraceSpans,
19
+ getTraceSummary,
20
+ listMergedResultFiles,
21
+ listResultFiles,
11
22
  loadLightweightResults,
12
23
  loadManifestResults,
24
+ loadResultFile,
13
25
  loadRunCache,
26
+ maybeAutoExportRunArtifacts,
14
27
  package_default,
28
+ padLeft,
29
+ padRight,
15
30
  parseResultManifest,
16
31
  resolveEvalPaths,
17
- resolveExistingRunPrimaryPath,
18
32
  resolveResultSourcePath,
19
33
  resolveRunCacheFile,
20
34
  resolveRunManifestPath,
21
- resolveWorkspaceOrFilePath,
22
35
  runEvalCommand,
23
36
  selectTarget,
37
+ syncRemoteResults,
24
38
  toSnakeCaseDeep,
39
+ toTraceSummary,
25
40
  validateConfigFile,
26
41
  validateEvalFile,
27
42
  validateFileReferences,
28
43
  validateTargetsFile,
29
44
  writeArtifactsFromResults
30
- } from "./chunk-KF6BABQ5.js";
45
+ } from "./chunk-DHWFLK6T.js";
31
46
  import {
32
47
  DEFAULT_CATEGORY,
33
48
  DEFAULT_THRESHOLD,
@@ -61,7 +76,7 @@ import {
61
76
  toTranscriptJsonLine,
62
77
  transpileEvalYamlFile,
63
78
  trimBaselineResult
64
- } from "./chunk-SE73HJZG.js";
79
+ } from "./chunk-FQGY6QXQ.js";
65
80
  import {
66
81
  __commonJS,
67
82
  __require,
@@ -2770,7 +2785,7 @@ var colors = {
2770
2785
  gray: "\x1B[90m"
2771
2786
  };
2772
2787
  var noColor = process.env.NO_COLOR !== void 0 || !process.stdout.isTTY;
2773
- var c = noColor ? Object.fromEntries(Object.keys(colors).map((k) => [k, ""])) : colors;
2788
+ var c2 = noColor ? Object.fromEntries(Object.keys(colors).map((k) => [k, ""])) : colors;
2774
2789
  function loadCompareResults(filePath) {
2775
2790
  return loadLightweightResults(resolveResultSourcePath(filePath)).map((record) => {
2776
2791
  if (!record.testId || record.testId === "unknown") {
@@ -2904,29 +2919,29 @@ function determineMatrixExitCode(matrixOutput, baselineTarget) {
2904
2919
  function formatDelta(delta) {
2905
2920
  const sign = delta >= 0 ? "+" : "";
2906
2921
  const formatted = `${sign}${delta.toFixed(2)}`;
2907
- if (delta > 0) return `${c.green}${formatted}${c.reset}`;
2908
- if (delta < 0) return `${c.red}${formatted}${c.reset}`;
2909
- return `${c.gray}${formatted}${c.reset}`;
2922
+ if (delta > 0) return `${c2.green}${formatted}${c2.reset}`;
2923
+ if (delta < 0) return `${c2.red}${formatted}${c2.reset}`;
2924
+ return `${c2.gray}${formatted}${c2.reset}`;
2910
2925
  }
2911
2926
  function formatOutcome(outcome) {
2912
2927
  switch (outcome) {
2913
2928
  case "win":
2914
- return `${c.green}\u2713 win${c.reset}`;
2929
+ return `${c2.green}\u2713 win${c2.reset}`;
2915
2930
  case "loss":
2916
- return `${c.red}\u2717 loss${c.reset}`;
2931
+ return `${c2.red}\u2717 loss${c2.reset}`;
2917
2932
  case "tie":
2918
- return `${c.gray}= tie${c.reset}`;
2933
+ return `${c2.gray}= tie${c2.reset}`;
2919
2934
  }
2920
2935
  }
2921
2936
  var ansiPattern = new RegExp(`${String.fromCharCode(27)}\\[[0-9;]*m`, "g");
2922
2937
  function stripAnsi2(str) {
2923
2938
  return str.replace(ansiPattern, "");
2924
2939
  }
2925
- function padRight(str, len) {
2940
+ function padRight2(str, len) {
2926
2941
  const plainLen = stripAnsi2(str).length;
2927
2942
  return str + " ".repeat(Math.max(0, len - plainLen));
2928
2943
  }
2929
- function padLeft(str, len) {
2944
+ function padLeft2(str, len) {
2930
2945
  const plainLen = stripAnsi2(str).length;
2931
2946
  return " ".repeat(Math.max(0, len - plainLen)) + str;
2932
2947
  }
@@ -2934,42 +2949,42 @@ function formatTable(comparison, file1, file2) {
2934
2949
  const lines = [];
2935
2950
  lines.push("");
2936
2951
  lines.push(
2937
- `${c.bold}Comparing:${c.reset} ${c.cyan}${file1}${c.reset} \u2192 ${c.cyan}${file2}${c.reset}`
2952
+ `${c2.bold}Comparing:${c2.reset} ${c2.cyan}${file1}${c2.reset} \u2192 ${c2.cyan}${file2}${c2.reset}`
2938
2953
  );
2939
2954
  lines.push("");
2940
2955
  if (comparison.matched.length === 0) {
2941
- lines.push(`${c.yellow}No matching test IDs found between files.${c.reset}`);
2956
+ lines.push(`${c2.yellow}No matching test IDs found between files.${c2.reset}`);
2942
2957
  } else {
2943
2958
  const maxIdLen = Math.max(
2944
2959
  7,
2945
2960
  ...comparison.matched.map((m) => m.testId.length)
2946
2961
  );
2947
- const header = ` ${padRight("Test ID", maxIdLen)} ${padLeft("Baseline", 8)} ${padLeft("Candidate", 9)} ${padLeft("Delta", 8)} Result`;
2948
- lines.push(`${c.dim}${header}${c.reset}`);
2962
+ const header = ` ${padRight2("Test ID", maxIdLen)} ${padLeft2("Baseline", 8)} ${padLeft2("Candidate", 9)} ${padLeft2("Delta", 8)} Result`;
2963
+ lines.push(`${c2.dim}${header}${c2.reset}`);
2949
2964
  lines.push(
2950
- `${c.dim} ${"\u2500".repeat(maxIdLen)} ${"\u2500".repeat(8)} ${"\u2500".repeat(9)} ${"\u2500".repeat(8)} ${"\u2500".repeat(8)}${c.reset}`
2965
+ `${c2.dim} ${"\u2500".repeat(maxIdLen)} ${"\u2500".repeat(8)} ${"\u2500".repeat(9)} ${"\u2500".repeat(8)} ${"\u2500".repeat(8)}${c2.reset}`
2951
2966
  );
2952
2967
  for (const m of comparison.matched) {
2953
- const row = ` ${padRight(m.testId, maxIdLen)} ${padLeft(m.score1.toFixed(2), 8)} ${padLeft(m.score2.toFixed(2), 9)} ${padLeft(formatDelta(m.delta), 8)} ${formatOutcome(m.outcome)}`;
2968
+ const row = ` ${padRight2(m.testId, maxIdLen)} ${padLeft2(m.score1.toFixed(2), 8)} ${padLeft2(m.score2.toFixed(2), 9)} ${padLeft2(formatDelta(m.delta), 8)} ${formatOutcome(m.outcome)}`;
2954
2969
  lines.push(row);
2955
2970
  }
2956
2971
  }
2957
2972
  if (comparison.unmatched.file1 > 0 || comparison.unmatched.file2 > 0) {
2958
2973
  lines.push("");
2959
2974
  lines.push(
2960
- `${c.yellow}\u26A0 Unmatched:${c.reset} ${comparison.unmatched.file1} in baseline, ${comparison.unmatched.file2} in candidate`
2975
+ `${c2.yellow}\u26A0 Unmatched:${c2.reset} ${comparison.unmatched.file1} in baseline, ${comparison.unmatched.file2} in candidate`
2961
2976
  );
2962
2977
  }
2963
2978
  lines.push("");
2964
2979
  const { wins, losses, ties, meanDelta } = comparison.summary;
2965
- const winStr = wins > 0 ? `${c.green}${wins} win${wins !== 1 ? "s" : ""}${c.reset}` : `${wins} wins`;
2966
- const lossStr = losses > 0 ? `${c.red}${losses} loss${losses !== 1 ? "es" : ""}${c.reset}` : `${losses} losses`;
2980
+ const winStr = wins > 0 ? `${c2.green}${wins} win${wins !== 1 ? "s" : ""}${c2.reset}` : `${wins} wins`;
2981
+ const lossStr = losses > 0 ? `${c2.red}${losses} loss${losses !== 1 ? "es" : ""}${c2.reset}` : `${losses} losses`;
2967
2982
  const tieStr = `${ties} tie${ties !== 1 ? "s" : ""}`;
2968
- const deltaColor = meanDelta > 0 ? c.green : meanDelta < 0 ? c.red : c.gray;
2983
+ const deltaColor = meanDelta > 0 ? c2.green : meanDelta < 0 ? c2.red : c2.gray;
2969
2984
  const deltaSign = meanDelta >= 0 ? "+" : "";
2970
- const status = meanDelta > 0 ? `${c.green}improved${c.reset}` : meanDelta < 0 ? `${c.red}regressed${c.reset}` : `${c.gray}neutral${c.reset}`;
2985
+ const status = meanDelta > 0 ? `${c2.green}improved${c2.reset}` : meanDelta < 0 ? `${c2.red}regressed${c2.reset}` : `${c2.gray}neutral${c2.reset}`;
2971
2986
  lines.push(
2972
- `${c.bold}Summary:${c.reset} ${winStr}, ${lossStr}, ${tieStr} | Mean \u0394: ${deltaColor}${deltaSign}${meanDelta.toFixed(3)}${c.reset} | Status: ${status}`
2987
+ `${c2.bold}Summary:${c2.reset} ${winStr}, ${lossStr}, ${tieStr} | Mean \u0394: ${deltaColor}${deltaSign}${meanDelta.toFixed(3)}${c2.reset} | Status: ${status}`
2973
2988
  );
2974
2989
  lines.push("");
2975
2990
  return lines.join("\n");
@@ -2978,10 +2993,10 @@ function formatMatrix(matrixOutput, baselineTarget) {
2978
2993
  const { matrix, pairwise, targets } = matrixOutput;
2979
2994
  const lines = [];
2980
2995
  lines.push("");
2981
- lines.push(`${c.bold}Score Matrix${c.reset}`);
2996
+ lines.push(`${c2.bold}Score Matrix${c2.reset}`);
2982
2997
  lines.push("");
2983
2998
  if (matrix.length === 0) {
2984
- lines.push(`${c.yellow}No results found.${c.reset}`);
2999
+ lines.push(`${c2.yellow}No results found.${c2.reset}`);
2985
3000
  return lines.join("\n");
2986
3001
  }
2987
3002
  const testIdWidth = Math.max(
@@ -2989,49 +3004,49 @@ function formatMatrix(matrixOutput, baselineTarget) {
2989
3004
  ...matrix.map((r) => r.testId.length)
2990
3005
  );
2991
3006
  const targetWidths = targets.map((t) => Math.max(t.length, 6));
2992
- let header = ` ${padRight("Test ID", testIdWidth)}`;
3007
+ let header = ` ${padRight2("Test ID", testIdWidth)}`;
2993
3008
  for (let i = 0; i < targets.length; i++) {
2994
- header += ` ${padLeft(targets[i], targetWidths[i])}`;
3009
+ header += ` ${padLeft2(targets[i], targetWidths[i])}`;
2995
3010
  }
2996
- lines.push(`${c.dim}${header}${c.reset}`);
3011
+ lines.push(`${c2.dim}${header}${c2.reset}`);
2997
3012
  let sep = ` ${"\u2500".repeat(testIdWidth)}`;
2998
3013
  for (let i = 0; i < targets.length; i++) {
2999
3014
  sep += ` ${"\u2500".repeat(targetWidths[i])}`;
3000
3015
  }
3001
- lines.push(`${c.dim}${sep}${c.reset}`);
3016
+ lines.push(`${c2.dim}${sep}${c2.reset}`);
3002
3017
  for (const row of matrix) {
3003
- let line = ` ${padRight(row.testId, testIdWidth)}`;
3018
+ let line = ` ${padRight2(row.testId, testIdWidth)}`;
3004
3019
  for (let i = 0; i < targets.length; i++) {
3005
3020
  const score = row.scores[targets[i]];
3006
3021
  const scoreStr = score !== void 0 ? score.toFixed(2) : " --";
3007
3022
  if (baselineTarget && targets[i] !== baselineTarget && score !== void 0) {
3008
3023
  const baselineScore = row.scores[baselineTarget];
3009
3024
  if (baselineScore !== void 0 && score < baselineScore) {
3010
- line += ` ${padLeft(`${c.red}${scoreStr}${c.reset}`, targetWidths[i])}`;
3025
+ line += ` ${padLeft2(`${c2.red}${scoreStr}${c2.reset}`, targetWidths[i])}`;
3011
3026
  } else if (baselineScore !== void 0 && score > baselineScore) {
3012
- line += ` ${padLeft(`${c.green}${scoreStr}${c.reset}`, targetWidths[i])}`;
3027
+ line += ` ${padLeft2(`${c2.green}${scoreStr}${c2.reset}`, targetWidths[i])}`;
3013
3028
  } else {
3014
- line += ` ${padLeft(scoreStr, targetWidths[i])}`;
3029
+ line += ` ${padLeft2(scoreStr, targetWidths[i])}`;
3015
3030
  }
3016
3031
  } else {
3017
- line += ` ${padLeft(scoreStr, targetWidths[i])}`;
3032
+ line += ` ${padLeft2(scoreStr, targetWidths[i])}`;
3018
3033
  }
3019
3034
  }
3020
3035
  lines.push(line);
3021
3036
  }
3022
3037
  if (pairwise.length > 0) {
3023
3038
  lines.push("");
3024
- lines.push(`${c.bold}Pairwise Summary:${c.reset}`);
3039
+ lines.push(`${c2.bold}Pairwise Summary:${c2.reset}`);
3025
3040
  const maxLabelLen = Math.max(
3026
3041
  ...pairwise.map((pw) => ` ${pw.baseline} \u2192 ${pw.candidate}:`.length)
3027
3042
  );
3028
3043
  for (const p of pairwise) {
3029
3044
  const { wins, losses, ties, meanDelta } = p.summary;
3030
3045
  const sign = meanDelta >= 0 ? "+" : "";
3031
- const deltaColor = meanDelta > 0 ? c.green : meanDelta < 0 ? c.red : c.gray;
3046
+ const deltaColor = meanDelta > 0 ? c2.green : meanDelta < 0 ? c2.red : c2.gray;
3032
3047
  const label = ` ${p.baseline} \u2192 ${p.candidate}:`;
3033
3048
  lines.push(
3034
- `${padRight(label, maxLabelLen)} ${wins} win${wins !== 1 ? "s" : ""}, ${losses} loss${losses !== 1 ? "es" : ""}, ${ties} tie${ties !== 1 ? "s" : ""} (${c.bold}\u0394${c.reset} ${deltaColor}${sign}${meanDelta.toFixed(3)}${c.reset})`
3049
+ `${padRight2(label, maxLabelLen)} ${wins} win${wins !== 1 ? "s" : ""}, ${losses} loss${losses !== 1 ? "es" : ""}, ${ties} tie${ties !== 1 ? "s" : ""} (${c2.bold}\u0394${c2.reset} ${deltaColor}${sign}${meanDelta.toFixed(3)}${c2.reset})`
3035
3050
  );
3036
3051
  }
3037
3052
  }
@@ -3754,6 +3769,11 @@ var evalRunCommand = command({
3754
3769
  long: "output-format",
3755
3770
  description: "[Deprecated] Output format: 'jsonl', 'yaml', or 'html' (default: jsonl)"
3756
3771
  }),
3772
+ experiment: option({
3773
+ type: optional(string),
3774
+ long: "experiment",
3775
+ description: "Experiment label for canonical run output (default: default)"
3776
+ }),
3757
3777
  export: multioption({
3758
3778
  type: array(string),
3759
3779
  long: "export",
@@ -3892,7 +3912,7 @@ var evalRunCommand = command({
3892
3912
  },
3893
3913
  handler: async (args) => {
3894
3914
  if (args.evalPaths.length === 0 && process.stdin.isTTY) {
3895
- const { launchInteractiveWizard } = await import("./interactive-SNKK6VCV.js");
3915
+ const { launchInteractiveWizard } = await import("./interactive-OPHUF2UP.js");
3896
3916
  await launchInteractiveWizard();
3897
3917
  return;
3898
3918
  }
@@ -3905,6 +3925,7 @@ var evalRunCommand = command({
3905
3925
  out: args.out,
3906
3926
  output: args.output,
3907
3927
  outputFormat: args.outputFormat,
3928
+ experiment: args.experiment,
3908
3929
  export: args.export,
3909
3930
  dryRun: args.dryRun,
3910
3931
  dryRunDelay: args.dryRunDelay,
@@ -4299,42 +4320,149 @@ function formatDurationMs3(ms) {
4299
4320
  return `${minutes}m ${remainingSeconds}s`;
4300
4321
  }
4301
4322
 
4323
+ // src/commands/import/huggingface.ts
4324
+ import { execFile } from "node:child_process";
4325
+ import { existsSync } from "node:fs";
4326
+ import path7 from "node:path";
4327
+ function findScript() {
4328
+ const candidates = [
4329
+ path7.resolve(__dirname, "..", "..", "..", "..", "..", "scripts", "import-huggingface.py"),
4330
+ path7.resolve(__dirname, "..", "..", "..", "..", "scripts", "import-huggingface.py"),
4331
+ path7.resolve(process.cwd(), "scripts", "import-huggingface.py")
4332
+ ];
4333
+ for (const candidate of candidates) {
4334
+ if (existsSync(candidate)) return candidate;
4335
+ }
4336
+ return candidates[candidates.length - 1];
4337
+ }
4338
+ var importHuggingFaceCommand = command({
4339
+ name: "huggingface",
4340
+ description: "Import a HuggingFace dataset into AgentV EVAL.yaml format",
4341
+ args: {
4342
+ repo: option({
4343
+ type: string,
4344
+ long: "repo",
4345
+ description: "HuggingFace dataset repository (e.g. SWE-bench/SWE-bench_Verified)"
4346
+ }),
4347
+ split: option({
4348
+ type: optional(string),
4349
+ long: "split",
4350
+ description: "Dataset split to load (default: test)"
4351
+ }),
4352
+ limit: option({
4353
+ type: optional(number),
4354
+ long: "limit",
4355
+ description: "Maximum number of instances to import"
4356
+ }),
4357
+ output: option({
4358
+ type: optional(string),
4359
+ long: "output",
4360
+ short: "o",
4361
+ description: "Output directory for EVAL.yaml files (default: evals/)"
4362
+ })
4363
+ },
4364
+ handler: async ({ repo, split, limit, output }) => {
4365
+ const scriptPath = findScript();
4366
+ if (!existsSync(scriptPath)) {
4367
+ console.error(`Error: Python script not found at ${scriptPath}`);
4368
+ console.error(
4369
+ "Make sure you are running from the agentv repository root, or install agentv from source."
4370
+ );
4371
+ process.exit(1);
4372
+ }
4373
+ const args = [scriptPath, "--repo", repo];
4374
+ if (split) args.push("--split", split);
4375
+ if (limit !== void 0) args.push("--limit", String(limit));
4376
+ if (output) args.push("--output", output);
4377
+ console.log(`Importing from HuggingFace: ${repo} (split=${split ?? "test"})...`);
4378
+ try {
4379
+ await new Promise((resolve3, reject) => {
4380
+ const child = execFile("uv", ["run", ...args], { maxBuffer: 50 * 1024 * 1024 }, (error) => {
4381
+ if (error) {
4382
+ reject(error);
4383
+ } else {
4384
+ resolve3();
4385
+ }
4386
+ });
4387
+ let stderrBuf = "";
4388
+ child.stderr?.on("data", (data) => {
4389
+ const chunk = data.toString();
4390
+ stderrBuf += chunk;
4391
+ process.stderr.write(data);
4392
+ });
4393
+ let stdout = "";
4394
+ child.stdout?.on("data", (data) => {
4395
+ stdout += data.toString();
4396
+ });
4397
+ child.on("close", (code) => {
4398
+ if (code === 0 && stdout.trim()) {
4399
+ try {
4400
+ const summary = JSON.parse(stdout.trim());
4401
+ console.log(
4402
+ `
4403
+ Imported ${summary.files_created} eval(s) from ${summary.dataset} \u2192 ${summary.output_dir}/`
4404
+ );
4405
+ } catch {
4406
+ if (stdout.trim()) console.log(stdout.trim());
4407
+ }
4408
+ } else if (code !== 0) {
4409
+ const tail = stderrBuf.trim().slice(-2e3);
4410
+ if (tail) {
4411
+ console.error("\n--- import-huggingface.py stderr (last 2 000 chars) ---");
4412
+ console.error(tail);
4413
+ }
4414
+ }
4415
+ });
4416
+ });
4417
+ } catch (err2) {
4418
+ if (err2 instanceof Error && err2.code === "ENOENT") {
4419
+ console.error(
4420
+ "Error: `uv` is not installed or not found on PATH.\nInstall it with: curl -LsSf https://astral.sh/uv/install.sh | sh\nSee https://docs.astral.sh/uv/ for details."
4421
+ );
4422
+ process.exit(1);
4423
+ }
4424
+ throw err2;
4425
+ }
4426
+ }
4427
+ });
4428
+
4302
4429
  // src/commands/import/index.ts
4303
4430
  var importCommand = subcommands({
4304
4431
  name: "import",
4305
- description: "Import agent session transcripts for offline grading",
4432
+ description: "Import agent session transcripts and datasets for offline grading",
4306
4433
  cmds: {
4307
4434
  claude: importClaudeCommand,
4308
4435
  codex: importCodexCommand,
4309
- copilot: importCopilotCommand
4436
+ copilot: importCopilotCommand,
4437
+ huggingface: importHuggingFaceCommand
4310
4438
  }
4311
4439
  });
4312
4440
 
4313
4441
  // src/commands/init/index.ts
4314
- import { existsSync, mkdirSync, writeFileSync as writeFileSync2 } from "node:fs";
4315
- import path8 from "node:path";
4442
+ import { existsSync as existsSync2, mkdirSync, writeFileSync as writeFileSync2 } from "node:fs";
4443
+ import path9 from "node:path";
4316
4444
  import * as readline from "node:readline/promises";
4317
4445
 
4318
4446
  // src/templates/index.ts
4319
4447
  import { readFileSync as readFileSync3, readdirSync, statSync } from "node:fs";
4320
- import path7 from "node:path";
4448
+ import path8 from "node:path";
4321
4449
  import { fileURLToPath } from "node:url";
4322
4450
  function getAgentvTemplates() {
4323
4451
  return getTemplatesFromDir(".agentv");
4324
4452
  }
4325
4453
  function getEnvExampleTemplate() {
4326
- const currentDir = path7.dirname(fileURLToPath(import.meta.url));
4327
- const templatesBase = currentDir.includes(`${path7.sep}dist`) ? path7.join(currentDir, "templates") : currentDir;
4328
- const content = readFileSync3(path7.join(templatesBase, ".env.example"), "utf-8");
4454
+ const currentDir = path8.dirname(fileURLToPath(import.meta.url));
4455
+ const templatesBase = currentDir.includes(`${path8.sep}dist`) ? path8.join(currentDir, "templates") : currentDir;
4456
+ const content = readFileSync3(path8.join(templatesBase, ".env.example"), "utf-8");
4329
4457
  return { path: ".env.example", content };
4330
4458
  }
4331
4459
  function getTemplatesFromDir(subdir) {
4332
- const currentDir = path7.dirname(fileURLToPath(import.meta.url));
4460
+ const currentDir = path8.dirname(fileURLToPath(import.meta.url));
4333
4461
  let templatesDir;
4334
- if (currentDir.includes(`${path7.sep}dist`)) {
4335
- templatesDir = path7.join(currentDir, "templates", subdir);
4462
+ if (currentDir.includes(`${path8.sep}dist`)) {
4463
+ templatesDir = path8.join(currentDir, "templates", subdir);
4336
4464
  } else {
4337
- templatesDir = path7.join(currentDir, subdir);
4465
+ templatesDir = path8.join(currentDir, subdir);
4338
4466
  }
4339
4467
  return readTemplatesRecursively(templatesDir, "");
4340
4468
  }
@@ -4342,15 +4470,15 @@ function readTemplatesRecursively(dir, relativePath) {
4342
4470
  const templates = [];
4343
4471
  const entries2 = readdirSync(dir);
4344
4472
  for (const entry of entries2) {
4345
- const fullPath = path7.join(dir, entry);
4473
+ const fullPath = path8.join(dir, entry);
4346
4474
  const stat3 = statSync(fullPath);
4347
- const entryRelativePath = relativePath ? path7.join(relativePath, entry) : entry;
4475
+ const entryRelativePath = relativePath ? path8.join(relativePath, entry) : entry;
4348
4476
  if (stat3.isDirectory()) {
4349
4477
  templates.push(...readTemplatesRecursively(fullPath, entryRelativePath));
4350
4478
  } else {
4351
4479
  const content = readFileSync3(fullPath, "utf-8");
4352
4480
  templates.push({
4353
- path: entryRelativePath.split(path7.sep).join("/"),
4481
+ path: entryRelativePath.split(path8.sep).join("/"),
4354
4482
  // Normalize to forward slashes
4355
4483
  content
4356
4484
  });
@@ -4379,22 +4507,22 @@ async function promptYesNo(message) {
4379
4507
  }
4380
4508
  }
4381
4509
  async function initCommand(options = {}) {
4382
- const targetPath = path8.resolve(options.targetPath ?? ".");
4383
- const agentvDir = path8.join(targetPath, ".agentv");
4510
+ const targetPath = path9.resolve(options.targetPath ?? ".");
4511
+ const agentvDir = path9.join(targetPath, ".agentv");
4384
4512
  const otherAgentvTemplates = getAgentvTemplates();
4385
4513
  const envTemplate = getEnvExampleTemplate();
4386
4514
  const existingFiles = [];
4387
4515
  if (envTemplate) {
4388
- const envFilePath = path8.join(targetPath, ".env.example");
4389
- if (existsSync(envFilePath)) {
4516
+ const envFilePath = path9.join(targetPath, ".env.example");
4517
+ if (existsSync2(envFilePath)) {
4390
4518
  existingFiles.push(".env.example");
4391
4519
  }
4392
4520
  }
4393
- if (existsSync(agentvDir)) {
4521
+ if (existsSync2(agentvDir)) {
4394
4522
  for (const template of otherAgentvTemplates) {
4395
- const targetFilePath = path8.join(agentvDir, template.path);
4396
- if (existsSync(targetFilePath)) {
4397
- existingFiles.push(path8.relative(targetPath, targetFilePath));
4523
+ const targetFilePath = path9.join(agentvDir, template.path);
4524
+ if (existsSync2(targetFilePath)) {
4525
+ existingFiles.push(path9.relative(targetPath, targetFilePath));
4398
4526
  }
4399
4527
  }
4400
4528
  }
@@ -4412,22 +4540,22 @@ async function initCommand(options = {}) {
4412
4540
  }
4413
4541
  console.log();
4414
4542
  }
4415
- if (!existsSync(agentvDir)) {
4543
+ if (!existsSync2(agentvDir)) {
4416
4544
  mkdirSync(agentvDir, { recursive: true });
4417
4545
  }
4418
4546
  if (envTemplate) {
4419
- const envFilePath = path8.join(targetPath, ".env.example");
4547
+ const envFilePath = path9.join(targetPath, ".env.example");
4420
4548
  writeFileSync2(envFilePath, envTemplate.content, "utf-8");
4421
4549
  console.log("Created .env.example");
4422
4550
  }
4423
4551
  for (const template of otherAgentvTemplates) {
4424
- const targetFilePath = path8.join(agentvDir, template.path);
4425
- const targetDirPath = path8.dirname(targetFilePath);
4426
- if (!existsSync(targetDirPath)) {
4552
+ const targetFilePath = path9.join(agentvDir, template.path);
4553
+ const targetDirPath = path9.dirname(targetFilePath);
4554
+ if (!existsSync2(targetDirPath)) {
4427
4555
  mkdirSync(targetDirPath, { recursive: true });
4428
4556
  }
4429
4557
  writeFileSync2(targetFilePath, template.content, "utf-8");
4430
- console.log(`Created ${path8.relative(targetPath, targetFilePath)}`);
4558
+ console.log(`Created ${path9.relative(targetPath, targetFilePath)}`);
4431
4559
  }
4432
4560
  console.log("\nAgentV initialized successfully!");
4433
4561
  console.log("\nFiles installed to root:");
@@ -4435,7 +4563,7 @@ async function initCommand(options = {}) {
4435
4563
  console.log(" - .env.example");
4436
4564
  }
4437
4565
  console.log(`
4438
- Files installed to ${path8.relative(targetPath, agentvDir)}:`);
4566
+ Files installed to ${path9.relative(targetPath, agentvDir)}:`);
4439
4567
  for (const t of otherAgentvTemplates) {
4440
4568
  console.log(` - ${t.path}`);
4441
4569
  }
@@ -4465,415 +4593,291 @@ var initCmdTsCommand = command({
4465
4593
  }
4466
4594
  });
4467
4595
 
4468
- // src/commands/inspect/utils.ts
4469
- import { readFileSync as readFileSync4, readdirSync as readdirSync2, statSync as statSync2 } from "node:fs";
4470
- import path9 from "node:path";
4471
- var colors2 = {
4472
- reset: "\x1B[0m",
4473
- bold: "\x1B[1m",
4474
- dim: "\x1B[2m",
4475
- green: "\x1B[32m",
4476
- red: "\x1B[31m",
4477
- yellow: "\x1B[33m",
4478
- cyan: "\x1B[36m",
4479
- gray: "\x1B[90m"
4480
- };
4481
- var noColor2 = process.env.NO_COLOR !== void 0 || !process.stdout.isTTY;
4482
- var c2 = noColor2 ? Object.fromEntries(Object.keys(colors2).map((k) => [k, ""])) : colors2;
4483
- var ansiPattern2 = new RegExp(`${String.fromCharCode(27)}\\[[0-9;]*m`, "g");
4484
- function stripAnsi3(str) {
4485
- return str.replace(ansiPattern2, "");
4486
- }
4487
- function padRight2(str, len) {
4488
- const plainLen = stripAnsi3(str).length;
4489
- return str + " ".repeat(Math.max(0, len - plainLen));
4490
- }
4491
- function padLeft2(str, len) {
4492
- const plainLen = stripAnsi3(str).length;
4493
- return " ".repeat(Math.max(0, len - plainLen)) + str;
4494
- }
4495
- function loadResultFile(filePath) {
4496
- const resolvedFilePath = resolveTraceResultPath(filePath);
4497
- if (path9.extname(resolvedFilePath) === ".json") {
4498
- return loadOtlpTraceFile(resolvedFilePath);
4499
- }
4500
- if (path9.basename(resolvedFilePath) === RESULT_INDEX_FILENAME) {
4501
- return loadManifestAsRawResults(resolvedFilePath);
4596
+ // src/commands/inspect/filter.ts
4597
+ import { existsSync as existsSync3, readFileSync as readFileSync4, readdirSync as readdirSync2, statSync as statSync2 } from "node:fs";
4598
+ import path10 from "node:path";
4599
+ function collectIndexFiles(dir) {
4600
+ const files = [];
4601
+ try {
4602
+ const entries2 = readdirSync2(dir, { withFileTypes: true });
4603
+ for (const entry of entries2) {
4604
+ const fullPath = path10.join(dir, entry.name);
4605
+ if (entry.isDirectory()) {
4606
+ files.push(...collectIndexFiles(fullPath));
4607
+ } else if (entry.name === "index.jsonl") {
4608
+ files.push(fullPath);
4609
+ }
4610
+ }
4611
+ } catch {
4502
4612
  }
4503
- return loadJsonlRecords(resolvedFilePath);
4504
- }
4505
- function resolveTraceResultPath(filePath) {
4506
- return resolveWorkspaceOrFilePath(filePath);
4613
+ return files;
4507
4614
  }
4508
- function loadJsonlRecords(filePath) {
4509
- const content = readFileSync4(filePath, "utf8");
4510
- const lines = content.trim().split("\n").filter((line) => line.trim());
4511
- return lines.map((line, i) => {
4512
- const record = JSON.parse(line);
4513
- if (typeof record.score !== "number") {
4514
- throw new Error(`Missing or invalid score in result at line ${i + 1}: ${line.slice(0, 100)}`);
4615
+ function extractToolNames(record) {
4616
+ const tools = /* @__PURE__ */ new Set();
4617
+ const trace = record.trace;
4618
+ if (trace?.tool_calls && typeof trace.tool_calls === "object") {
4619
+ for (const name of Object.keys(trace.tool_calls)) {
4620
+ tools.add(name);
4515
4621
  }
4516
- return record;
4517
- });
4518
- }
4519
- function loadManifestAsRawResults(filePath) {
4520
- return loadManifestResults(filePath).map(toRawResult);
4521
- }
4522
- function toRawResult(result) {
4523
- return {
4524
- timestamp: result.timestamp,
4525
- test_id: result.testId,
4526
- suite: result.suite,
4527
- conversation_id: result.conversationId,
4528
- score: result.score,
4529
- assertions: result.assertions?.map((assertion) => ({
4530
- text: assertion.text,
4531
- passed: assertion.passed,
4532
- evidence: assertion.evidence
4533
- })),
4534
- target: result.target,
4535
- error: result.error,
4536
- scores: result.scores?.map((score) => ({
4537
- name: score.name,
4538
- type: score.type,
4539
- score: score.score,
4540
- assertions: score.assertions?.map((assertion) => ({
4541
- text: assertion.text,
4542
- passed: assertion.passed,
4543
- evidence: assertion.evidence
4544
- })),
4545
- weight: score.weight
4546
- })),
4547
- token_usage: result.tokenUsage ? {
4548
- input: result.tokenUsage.input,
4549
- output: result.tokenUsage.output,
4550
- cached: result.tokenUsage.cached
4551
- } : void 0,
4552
- cost_usd: result.costUsd,
4553
- duration_ms: result.durationMs,
4554
- start_time: result.startTime,
4555
- end_time: result.endTime,
4556
- input: result.input,
4557
- output: result.output,
4558
- file_changes: result.fileChanges
4559
- };
4560
- }
4561
- function loadOtlpTraceFile(filePath) {
4562
- const parsed = JSON.parse(readFileSync4(filePath, "utf8"));
4563
- const spans = parsed.resourceSpans?.flatMap((resource) => resource.scopeSpans ?? []).flatMap((scope) => scope.spans ?? []);
4564
- if (!spans || spans.length === 0) {
4565
- return [];
4566
4622
  }
4567
- const spanMap = /* @__PURE__ */ new Map();
4568
- const childMap = /* @__PURE__ */ new Map();
4569
- for (const span of spans) {
4570
- if (!span.spanId) continue;
4571
- spanMap.set(span.spanId, span);
4572
- if (span.parentSpanId) {
4573
- const siblings = childMap.get(span.parentSpanId) ?? [];
4574
- siblings.push(span);
4575
- childMap.set(span.parentSpanId, siblings);
4576
- }
4577
- }
4578
- const roots = spans.filter((span) => !span.parentSpanId || !spanMap.has(span.parentSpanId));
4579
- const supportedRoots = roots.filter(isAgentvEvalRoot);
4580
- const candidateRoots = supportedRoots.length > 0 ? supportedRoots : roots;
4581
- return candidateRoots.map((root, index) => {
4582
- const descendants = collectChildSpans(root.spanId, childMap);
4583
- const rootAttrs = parseOtlpAttributes(root.attributes);
4584
- const parsedDescendants = descendants.map((span) => ({
4585
- ...span,
4586
- parsedAttributes: parseOtlpAttributes(span.attributes)
4587
- }));
4588
- const toolSpans = parsedDescendants.filter(
4589
- (span) => typeof span.parsedAttributes.gen_ai_tool_name === "string"
4590
- );
4591
- const llmSpans = parsedDescendants.filter(
4592
- (span) => span.parsedAttributes.gen_ai_operation_name === "chat" || typeof span.name === "string" && span.name.startsWith("chat ")
4593
- );
4594
- const tokenUsage = descendants.reduce(
4595
- (acc, span) => {
4596
- const attrs = parseOtlpAttributes(span.attributes);
4597
- acc.input += numberAttr(attrs.gen_ai_usage_input_tokens) ?? 0;
4598
- acc.output += numberAttr(attrs.gen_ai_usage_output_tokens) ?? 0;
4599
- const cached = numberAttr(attrs.gen_ai_usage_cache_read_input_tokens);
4600
- if (cached !== void 0 && cached > 0) {
4601
- acc.cached = (acc.cached ?? 0) + cached;
4623
+ const output = record.output;
4624
+ if (Array.isArray(output)) {
4625
+ for (const msg of output) {
4626
+ if (typeof msg === "object" && msg !== null && Array.isArray(msg.tool_calls)) {
4627
+ for (const tc of msg.tool_calls) {
4628
+ if (typeof tc.tool === "string") {
4629
+ tools.add(tc.tool);
4630
+ }
4602
4631
  }
4603
- return acc;
4604
- },
4605
- { input: 0, output: 0, cached: void 0 }
4606
- );
4607
- const traceSummary = buildDerivedTraceSummary({
4608
- trace: {
4609
- event_count: numberAttr(rootAttrs.agentv_trace_event_count) ?? (toolSpans.length > 0 ? toolSpans.length : void 0),
4610
- tool_calls: countRawSpanNames(
4611
- toolSpans.map((span) => ({
4612
- type: "tool",
4613
- name: String(span.parsedAttributes.gen_ai_tool_name)
4614
- }))
4615
- ),
4616
- error_count: descendants.filter((span) => span.status?.code === 2).length || void 0,
4617
- llm_call_count: numberAttr(rootAttrs.agentv_trace_llm_call_count) ?? (llmSpans.length > 0 ? llmSpans.length : void 0)
4618
- },
4619
- spans: [
4620
- ...llmSpans.map((span) => ({
4621
- type: "llm",
4622
- name: span.name ?? "chat",
4623
- duration_ms: durationFromSpan(span)
4624
- })),
4625
- ...toolSpans.map((span) => ({
4626
- type: "tool",
4627
- name: String(span.parsedAttributes.gen_ai_tool_name),
4628
- duration_ms: durationFromSpan(span)
4629
- }))
4630
- ],
4631
- duration_ms: numberAttr(rootAttrs.agentv_trace_duration_ms) ?? durationFromSpan(root),
4632
- cost_usd: numberAttr(rootAttrs.agentv_trace_cost_usd),
4633
- token_usage: tokenUsage.input || tokenUsage.output || tokenUsage.cached || numberAttr(rootAttrs.agentv_trace_token_input) || numberAttr(rootAttrs.agentv_trace_token_output) || numberAttr(rootAttrs.agentv_trace_token_cached) ? {
4634
- input: tokenUsage.input || numberAttr(rootAttrs.agentv_trace_token_input) || 0,
4635
- output: tokenUsage.output || numberAttr(rootAttrs.agentv_trace_token_output) || 0,
4636
- ...tokenUsage.cached || numberAttr(rootAttrs.agentv_trace_token_cached) ? {
4637
- cached: tokenUsage.cached || numberAttr(rootAttrs.agentv_trace_token_cached) || 0
4638
- } : {}
4639
- } : void 0
4640
- });
4641
- const score = numberAttr(rootAttrs.agentv_score);
4642
- if (score === void 0) {
4643
- throw new Error(
4644
- `Unsupported OTLP trace root span at index ${index + 1}: missing agentv.score attribute`
4645
- );
4632
+ }
4646
4633
  }
4647
- return {
4648
- test_id: stringAttr(rootAttrs.agentv_test_id) ?? stringAttr(rootAttrs.agentv_eval_id) ?? `trace-${index + 1}`,
4649
- suite: stringAttr(rootAttrs.agentv_suite),
4650
- target: stringAttr(rootAttrs.agentv_target),
4651
- score,
4652
- error: root.status?.code === 2 ? root.status.message : void 0,
4653
- cost_usd: traceSummary?.cost_usd,
4654
- duration_ms: traceSummary?.duration_ms,
4655
- token_usage: traceSummary?.token_usage,
4656
- trace: traceSummary ? {
4657
- event_count: traceSummary.event_count,
4658
- tool_calls: traceSummary.tool_calls,
4659
- error_count: traceSummary.error_count,
4660
- tool_durations: traceSummary.tool_durations,
4661
- llm_call_count: traceSummary.llm_call_count,
4662
- token_usage: traceSummary.token_usage,
4663
- cost_usd: traceSummary.cost_usd,
4664
- duration_ms: traceSummary.duration_ms
4665
- } : void 0,
4666
- spans: traceSummary?.spans,
4667
- output: stringAttr(rootAttrs.agentv_output_text),
4668
- scores: root.events?.filter(
4669
- (event) => event.name?.startsWith("agentv.grader.") || event.name?.startsWith("agentv.evaluator.")
4670
- ).map((event) => {
4671
- const attrs = parseOtlpAttributes(event.attributes);
4672
- const name = event.name?.replace(/^agentv\.grader\./, "").replace(/^agentv\.evaluator\./, "") ?? "unknown";
4673
- return {
4674
- name,
4675
- type: stringAttr(attrs.agentv_grader_type) ?? stringAttr(attrs.agentv_evaluator_type) ?? "unknown",
4676
- score: numberAttr(attrs.agentv_grader_score) ?? numberAttr(attrs.agentv_evaluator_score) ?? 0
4677
- };
4678
- })
4679
- };
4680
- });
4681
- }
4682
- function isAgentvEvalRoot(span) {
4683
- const attrs = parseOtlpAttributes(span.attributes);
4684
- return span.name === "agentv.eval" || numberAttr(attrs.agentv_score) !== void 0 || typeof stringAttr(attrs.agentv_test_id) === "string";
4685
- }
4686
- function collectChildSpans(spanId, childMap) {
4687
- if (!spanId) return [];
4688
- const direct = childMap.get(spanId) ?? [];
4689
- const all = [...direct];
4690
- for (const child of direct) {
4691
- all.push(...collectChildSpans(child.spanId, childMap));
4692
- }
4693
- return all;
4694
- }
4695
- function parseOtlpAttributes(attributes) {
4696
- const parsed = {};
4697
- for (const attribute of attributes ?? []) {
4698
- parsed[attribute.key.replace(/\./g, "_")] = parseOtlpValue(attribute.value);
4699
- }
4700
- return parsed;
4701
- }
4702
- function parseOtlpValue(value) {
4703
- if (!value) return void 0;
4704
- if ("stringValue" in value && value.stringValue !== void 0) return value.stringValue;
4705
- if ("intValue" in value && value.intValue !== void 0) return Number(value.intValue);
4706
- if ("doubleValue" in value && value.doubleValue !== void 0) return value.doubleValue;
4707
- if ("boolValue" in value && value.boolValue !== void 0) return value.boolValue;
4708
- if ("arrayValue" in value)
4709
- return (value.arrayValue?.values ?? []).map((entry) => parseOtlpValue(entry));
4710
- return void 0;
4711
- }
4712
- function durationFromSpan(span) {
4713
- const start = Number(span.startTimeUnixNano);
4714
- const end = Number(span.endTimeUnixNano);
4715
- if (!Number.isFinite(start) || !Number.isFinite(end)) return void 0;
4716
- return Math.round((end - start) / 1e6);
4717
- }
4718
- function stringAttr(value) {
4719
- return typeof value === "string" ? value : void 0;
4720
- }
4721
- function numberAttr(value) {
4722
- return typeof value === "number" && Number.isFinite(value) ? value : void 0;
4723
- }
4724
- function buildDerivedTraceSummary(result) {
4725
- const toolSpans = (result.spans ?? []).filter((span) => span.type === "tool");
4726
- const llmSpans = (result.spans ?? []).filter((span) => span.type === "llm");
4727
- const toolCalls = result.trace?.tool_calls ?? countRawSpanNames(toolSpans);
4728
- const toolDurations = result.trace?.tool_durations ?? groupRawSpanDurations(toolSpans);
4729
- const hasSpanData = (result.spans?.length ?? 0) > 0;
4730
- const eventCount = result.trace?.event_count ?? (hasSpanData ? toolSpans.length : void 0);
4731
- const llmCallCount = result.trace?.llm_call_count ?? (hasSpanData ? llmSpans.length : void 0);
4732
- if (!result.trace && !result.spans?.length && result.token_usage === void 0 && result.cost_usd === void 0 && result.duration_ms === void 0) {
4733
- return void 0;
4734
- }
4735
- return {
4736
- event_count: eventCount,
4737
- tool_calls: toolCalls,
4738
- error_count: result.trace?.error_count,
4739
- tool_durations: toolDurations,
4740
- llm_call_count: llmCallCount,
4741
- token_usage: result.trace?.token_usage ?? result.token_usage,
4742
- cost_usd: result.trace?.cost_usd ?? result.cost_usd,
4743
- duration_ms: result.trace?.duration_ms ?? result.duration_ms,
4744
- spans: result.spans
4745
- };
4746
- }
4747
- function countRawSpanNames(spans) {
4748
- const counts = {};
4749
- for (const span of spans) {
4750
- counts[span.name] = (counts[span.name] ?? 0) + 1;
4751
4634
  }
4752
- return Object.keys(counts).length > 0 ? counts : void 0;
4753
- }
4754
- function groupRawSpanDurations(spans) {
4755
- const grouped = {};
4756
- for (const span of spans) {
4757
- if (span.duration_ms === void 0) continue;
4758
- const existing = grouped[span.name] ?? [];
4759
- existing.push(span.duration_ms);
4760
- grouped[span.name] = existing;
4761
- }
4762
- return Object.keys(grouped).length > 0 ? grouped : void 0;
4763
- }
4764
- function getTraceSummary(result) {
4765
- const derived = buildDerivedTraceSummary(result);
4766
- if (!derived) return void 0;
4767
- const { spans: _spans, ...trace } = derived;
4768
- return trace;
4769
- }
4770
- function getTraceSpans(result) {
4771
- return buildDerivedTraceSummary(result)?.spans ?? [];
4635
+ return [...tools];
4772
4636
  }
4773
- function toTraceSummary(result) {
4774
- const rawTrace = getTraceSummary(result);
4775
- if (!rawTrace) return void 0;
4776
- return toCamelCaseDeep(rawTrace);
4777
- }
4778
- function listResultFiles(cwd, limit) {
4779
- const runsDir = path9.join(cwd, ".agentv", "results", RESULT_RUNS_DIRNAME);
4780
- const files = [];
4637
+ function parseFilterableRecords(filePath) {
4638
+ let content;
4781
4639
  try {
4782
- const entries2 = readdirSync2(runsDir, { withFileTypes: true });
4783
- for (const entry of entries2) {
4784
- if (!entry.isDirectory()) {
4785
- continue;
4786
- }
4787
- const primaryPath = resolveExistingRunPrimaryPath(path9.join(runsDir, entry.name));
4788
- if (primaryPath) {
4789
- files.push({ filePath: primaryPath, displayName: entry.name });
4640
+ content = readFileSync4(filePath, "utf8");
4641
+ } catch {
4642
+ return [];
4643
+ }
4644
+ const lines = content.split("\n").filter((line) => line.trim());
4645
+ const records = [];
4646
+ for (const line of lines) {
4647
+ let raw;
4648
+ try {
4649
+ raw = JSON.parse(line);
4650
+ } catch {
4651
+ continue;
4652
+ }
4653
+ let experiment = typeof raw.experiment === "string" ? raw.experiment : void 0;
4654
+ if (!experiment) {
4655
+ const parts = filePath.split(path10.sep);
4656
+ const runsIdx = parts.indexOf("runs");
4657
+ if (runsIdx !== -1 && parts.length - runsIdx >= 3) {
4658
+ const candidate = parts[runsIdx + 1];
4659
+ if (candidate && !/^\d{4}-\d{2}-\d{2}T/.test(candidate)) {
4660
+ experiment = candidate;
4661
+ }
4790
4662
  }
4791
4663
  }
4792
- } catch {
4664
+ records.push({
4665
+ file: filePath,
4666
+ test_id: typeof raw.test_id === "string" ? raw.test_id : "unknown",
4667
+ suite: typeof raw.suite === "string" ? raw.suite : void 0,
4668
+ target: typeof raw.target === "string" ? raw.target : void 0,
4669
+ experiment,
4670
+ score: typeof raw.score === "number" ? raw.score : 0,
4671
+ execution_status: typeof raw.execution_status === "string" ? raw.execution_status : void 0,
4672
+ error: typeof raw.error === "string" ? raw.error : void 0,
4673
+ timestamp: typeof raw.timestamp === "string" ? raw.timestamp : void 0,
4674
+ tool_names: extractToolNames(raw)
4675
+ });
4793
4676
  }
4794
- files.sort((a, b) => b.displayName.localeCompare(a.displayName));
4795
- const limited = limit !== void 0 && limit > 0 ? files.slice(0, limit) : files;
4796
- const metas = [];
4797
- for (const { filePath, displayName } of limited) {
4677
+ return records;
4678
+ }
4679
+ function buildFilterPredicate(opts) {
4680
+ return (record) => {
4681
+ if (opts.target && record.target !== opts.target) return false;
4682
+ if (opts.experiment && record.experiment !== opts.experiment) return false;
4683
+ if (opts.scoreBelow !== void 0 && record.score >= opts.scoreBelow) return false;
4684
+ if (opts.scoreAbove !== void 0 && record.score <= opts.scoreAbove) return false;
4685
+ if (opts.status) {
4686
+ const statusMap = {
4687
+ pass: ["ok"],
4688
+ fail: ["quality_failure"],
4689
+ error: ["error", "timeout", "provider_error"]
4690
+ };
4691
+ const allowedStatuses = statusMap[opts.status] ?? [opts.status];
4692
+ if (record.execution_status && !allowedStatuses.includes(record.execution_status))
4693
+ return false;
4694
+ if (!record.execution_status) {
4695
+ if (opts.status === "pass" && record.score < 1) return false;
4696
+ if (opts.status === "fail" && record.score >= 1) return false;
4697
+ if (opts.status === "error" && !record.error) return false;
4698
+ }
4699
+ }
4700
+ if (opts.hasTool) {
4701
+ const toolPattern = opts.hasTool.toLowerCase();
4702
+ const hasMatch = record.tool_names.some((t) => t.toLowerCase().includes(toolPattern));
4703
+ if (!hasMatch) return false;
4704
+ }
4705
+ return true;
4706
+ };
4707
+ }
4708
+ function discoverFilterSources(searchPath, cwd) {
4709
+ if (searchPath) {
4710
+ const resolved = path10.isAbsolute(searchPath) ? searchPath : path10.resolve(cwd, searchPath);
4711
+ if (!existsSync3(resolved)) {
4712
+ console.error(`${c.red}Error:${c.reset} Path does not exist: ${resolved}`);
4713
+ process.exit(1);
4714
+ }
4798
4715
  try {
4799
- const fileStat = statSync2(filePath);
4800
- const results = loadResultFile(filePath);
4801
- const testCount = results.length;
4802
- const passCount = results.filter((r) => r.score >= DEFAULT_THRESHOLD).length;
4803
- const passRate = testCount > 0 ? passCount / testCount : 0;
4804
- const avgScore = testCount > 0 ? results.reduce((sum, r) => sum + r.score, 0) / testCount : 0;
4805
- const filenameTimestamp = extractTimestampFromFilename(displayName);
4806
- const timestamp = filenameTimestamp ?? results[0]?.timestamp ?? "unknown";
4807
- metas.push({
4808
- path: filePath,
4809
- filename: displayName,
4810
- timestamp,
4811
- testCount,
4812
- passRate,
4813
- avgScore,
4814
- sizeBytes: fileStat.size
4815
- });
4716
+ if (statSync2(resolved).isDirectory()) {
4717
+ return collectIndexFiles(resolved);
4718
+ }
4816
4719
  } catch {
4817
4720
  }
4721
+ return [resolved];
4818
4722
  }
4819
- return metas;
4723
+ return collectIndexFiles(path10.join(cwd, ".agentv", "results", "runs"));
4820
4724
  }
4821
- function extractTimestampFromFilename(filename) {
4822
- const match = filename.match(
4823
- /(?:^|eval_)(\d{4}-\d{2}-\d{2}T\d{2}-\d{2}-\d{2}-\d{3}Z)(?:\.jsonl)?$/
4725
+ function formatFilterTable(records) {
4726
+ const lines = [];
4727
+ if (records.length === 0) {
4728
+ lines.push(`${c.yellow}No matching results found.${c.reset}`);
4729
+ return lines.join("\n");
4730
+ }
4731
+ lines.push("");
4732
+ lines.push(
4733
+ `${c.bold}Filtered Results${c.reset} ${c.dim}(${records.length} match${records.length !== 1 ? "es" : ""})${c.reset}`
4824
4734
  );
4825
- if (!match) return void 0;
4826
- return match[1].replace(/-(\d{2})-(\d{2})-(\d{3})Z$/, ":$1:$2.$3Z");
4827
- }
4828
- function formatNumber(n) {
4829
- return n.toLocaleString();
4830
- }
4831
- function formatDuration(ms) {
4832
- if (ms < 1e3) return `${Math.round(ms)}ms`;
4833
- if (ms < 6e4) return `${(ms / 1e3).toFixed(1)}s`;
4834
- const minutes = Math.floor(ms / 6e4);
4835
- const seconds = (ms % 6e4 / 1e3).toFixed(0);
4836
- return `${minutes}m${seconds}s`;
4837
- }
4838
- function formatCost(usd) {
4839
- if (usd < 0.01) return `$${usd.toFixed(4)}`;
4840
- return `$${usd.toFixed(3)}`;
4841
- }
4842
- function formatSize(bytes) {
4843
- if (bytes < 1024) return `${bytes}B`;
4844
- if (bytes < 1024 * 1024) return `${(bytes / 1024).toFixed(1)}KB`;
4845
- return `${(bytes / (1024 * 1024)).toFixed(1)}MB`;
4846
- }
4847
- function formatScore(score) {
4848
- return `${(score * 100).toFixed(0)}%`;
4735
+ lines.push("");
4736
+ const maxIdLen = Math.min(32, Math.max(7, ...records.map((r) => r.test_id.length)));
4737
+ const maxTargetLen = Math.min(16, Math.max(6, ...records.map((r) => (r.target ?? "").length)));
4738
+ const maxExpLen = Math.min(20, Math.max(10, ...records.map((r) => (r.experiment ?? "").length)));
4739
+ const header = ` ${padRight("Test ID", maxIdLen)} ${padRight("Target", maxTargetLen)} ${padRight("Experiment", maxExpLen)} ${padLeft("Score", 6)} Status`;
4740
+ lines.push(`${c.dim}${header}${c.reset}`);
4741
+ lines.push(
4742
+ `${c.dim} ${"\u2500".repeat(maxIdLen)} ${"\u2500".repeat(maxTargetLen)} ${"\u2500".repeat(maxExpLen)} ${"\u2500".repeat(6)} ${"\u2500".repeat(16)}${c.reset}`
4743
+ );
4744
+ for (const record of records) {
4745
+ const scoreColor = record.score >= 1 ? c.green : record.score >= 0.5 ? c.yellow : c.red;
4746
+ const status = record.execution_status ?? (record.error ? "error" : record.score >= 1 ? "ok" : "quality_failure");
4747
+ const statusColor = status === "ok" ? c.green : status === "error" ? c.red : c.yellow;
4748
+ const row = ` ${padRight(record.test_id.slice(0, maxIdLen), maxIdLen)} ${padRight((record.target ?? "-").slice(0, maxTargetLen), maxTargetLen)} ${padRight((record.experiment ?? "-").slice(0, maxExpLen), maxExpLen)} ${padLeft(`${scoreColor}${formatScore(record.score)}${c.reset}`, 6)} ${statusColor}${status}${c.reset}`;
4749
+ lines.push(row);
4750
+ }
4751
+ lines.push("");
4752
+ const passCount = records.filter((r) => r.score >= 1).length;
4753
+ const avgScore = records.length > 0 ? records.reduce((sum, r) => sum + r.score, 0) / records.length : 0;
4754
+ lines.push(
4755
+ `${c.dim}${records.length} result${records.length !== 1 ? "s" : ""} | ${passCount} passed | avg score: ${formatScore(avgScore)}${c.reset}`
4756
+ );
4757
+ lines.push("");
4758
+ return lines.join("\n");
4849
4759
  }
4760
+ var inspectFilterCommand = command({
4761
+ name: "filter",
4762
+ description: "Filter evaluation results by target, experiment, score, status, or tool usage",
4763
+ args: {
4764
+ path: positional({
4765
+ type: optional(string),
4766
+ displayName: "path",
4767
+ description: "Directory or file to filter (default: .agentv/results/runs/)"
4768
+ }),
4769
+ target: option({
4770
+ type: optional(string),
4771
+ long: "target",
4772
+ description: "Filter by target name"
4773
+ }),
4774
+ experiment: option({
4775
+ type: optional(string),
4776
+ long: "experiment",
4777
+ description: "Filter by experiment name"
4778
+ }),
4779
+ scoreBelow: option({
4780
+ type: optional(number),
4781
+ long: "score-below",
4782
+ description: "Filter to results with score below this value"
4783
+ }),
4784
+ scoreAbove: option({
4785
+ type: optional(number),
4786
+ long: "score-above",
4787
+ description: "Filter to results with score above this value"
4788
+ }),
4789
+ status: option({
4790
+ type: optional(string),
4791
+ long: "status",
4792
+ description: "Filter by execution status: pass, fail, error (or raw value like ok, quality_failure)"
4793
+ }),
4794
+ hasTool: option({
4795
+ type: optional(string),
4796
+ long: "has-tool",
4797
+ description: "Filter to results that used a specific tool (substring match)"
4798
+ }),
4799
+ dir: option({
4800
+ type: optional(string),
4801
+ long: "dir",
4802
+ short: "d",
4803
+ description: "Working directory (default: current directory)"
4804
+ }),
4805
+ format: option({
4806
+ type: optional(oneOf(["table", "json"])),
4807
+ long: "format",
4808
+ short: "f",
4809
+ description: "Output format: table (default) or json"
4810
+ })
4811
+ },
4812
+ handler: async ({
4813
+ path: searchPath,
4814
+ target,
4815
+ experiment,
4816
+ scoreBelow,
4817
+ scoreAbove,
4818
+ status,
4819
+ hasTool,
4820
+ dir,
4821
+ format
4822
+ }) => {
4823
+ const cwd = dir ?? process.cwd();
4824
+ const sources = discoverFilterSources(searchPath, cwd);
4825
+ if (sources.length === 0) {
4826
+ console.error(`${c.yellow}No result files found.${c.reset}`);
4827
+ console.error(`${c.dim}Run an evaluation first, or specify a path.${c.reset}`);
4828
+ process.exit(0);
4829
+ }
4830
+ const allRecords = [];
4831
+ for (const source of sources) {
4832
+ allRecords.push(...parseFilterableRecords(source));
4833
+ }
4834
+ if (allRecords.length === 0) {
4835
+ console.error(`${c.yellow}No results found in the specified path.${c.reset}`);
4836
+ process.exit(0);
4837
+ }
4838
+ const predicate = buildFilterPredicate({
4839
+ target,
4840
+ experiment,
4841
+ scoreBelow,
4842
+ scoreAbove,
4843
+ status,
4844
+ hasTool
4845
+ });
4846
+ const filtered = allRecords.filter(predicate);
4847
+ if (format === "json") {
4848
+ console.log(JSON.stringify(filtered, null, 2));
4849
+ } else {
4850
+ console.log(formatFilterTable(filtered));
4851
+ }
4852
+ }
4853
+ });
4850
4854
 
4851
4855
  // src/commands/inspect/list.ts
4852
4856
  function formatListTable(metas) {
4853
4857
  const lines = [];
4854
4858
  if (metas.length === 0) {
4855
- lines.push(`${c2.yellow}No run workspaces found in .agentv/results/runs/${c2.reset}`);
4856
- lines.push(`${c2.dim}Run an evaluation first: agentv run <eval-file>${c2.reset}`);
4859
+ lines.push(`${c.yellow}No run workspaces found in .agentv/results/runs/${c.reset}`);
4860
+ lines.push(`${c.dim}Run an evaluation first: agentv run <eval-file>${c.reset}`);
4857
4861
  return lines.join("\n");
4858
4862
  }
4859
4863
  lines.push("");
4860
- lines.push(`${c2.bold}Evaluation Runs${c2.reset} ${c2.dim}(.agentv/results/runs/)${c2.reset}`);
4864
+ lines.push(`${c.bold}Evaluation Runs${c.reset} ${c.dim}(.agentv/results/runs/)${c.reset}`);
4861
4865
  lines.push("");
4862
4866
  const maxFileLen = Math.max(4, ...metas.map((m) => m.filename.length));
4863
- const header = ` ${padRight2("File", maxFileLen)} ${padLeft2("Tests", 5)} ${padLeft2("Pass", 5)} ${padLeft2("Score", 6)} ${padLeft2("Size", 7)} Timestamp`;
4864
- lines.push(`${c2.dim}${header}${c2.reset}`);
4867
+ const header = ` ${padRight("File", maxFileLen)} ${padLeft("Tests", 5)} ${padLeft("Pass", 5)} ${padLeft("Score", 6)} ${padLeft("Size", 7)} Timestamp`;
4868
+ lines.push(`${c.dim}${header}${c.reset}`);
4865
4869
  lines.push(
4866
- `${c2.dim} ${"\u2500".repeat(maxFileLen)} ${"\u2500".repeat(5)} ${"\u2500".repeat(5)} ${"\u2500".repeat(6)} ${"\u2500".repeat(7)} ${"\u2500".repeat(24)}${c2.reset}`
4870
+ `${c.dim} ${"\u2500".repeat(maxFileLen)} ${"\u2500".repeat(5)} ${"\u2500".repeat(5)} ${"\u2500".repeat(6)} ${"\u2500".repeat(7)} ${"\u2500".repeat(24)}${c.reset}`
4867
4871
  );
4868
4872
  for (const meta of metas) {
4869
- const passColor = meta.passRate >= 1 ? c2.green : meta.passRate >= 0.5 ? c2.yellow : c2.red;
4870
- const scoreColor = meta.avgScore >= 0.9 ? c2.green : meta.avgScore >= 0.5 ? c2.yellow : c2.red;
4871
- const row = ` ${padRight2(meta.filename, maxFileLen)} ${padLeft2(String(meta.testCount), 5)} ${padLeft2(`${passColor}${formatScore(meta.passRate)}${c2.reset}`, 5)} ${padLeft2(`${scoreColor}${formatScore(meta.avgScore)}${c2.reset}`, 6)} ${padLeft2(formatSize(meta.sizeBytes), 7)} ${c2.dim}${meta.timestamp}${c2.reset}`;
4873
+ const passColor = meta.passRate >= 1 ? c.green : meta.passRate >= 0.5 ? c.yellow : c.red;
4874
+ const scoreColor = meta.avgScore >= 0.9 ? c.green : meta.avgScore >= 0.5 ? c.yellow : c.red;
4875
+ const row = ` ${padRight(meta.filename, maxFileLen)} ${padLeft(String(meta.testCount), 5)} ${padLeft(`${passColor}${formatScore(meta.passRate)}${c.reset}`, 5)} ${padLeft(`${scoreColor}${formatScore(meta.avgScore)}${c.reset}`, 6)} ${padLeft(formatSize(meta.sizeBytes), 7)} ${c.dim}${meta.timestamp}${c.reset}`;
4872
4876
  lines.push(row);
4873
4877
  }
4874
4878
  lines.push("");
4875
4879
  lines.push(
4876
- `${c2.dim}${metas.length} run workspace${metas.length !== 1 ? "s" : ""} found${c2.reset}`
4880
+ `${c.dim}${metas.length} run workspace${metas.length !== 1 ? "s" : ""} found${c.reset}`
4877
4881
  );
4878
4882
  lines.push("");
4879
4883
  return lines.join("\n");
@@ -5073,19 +5077,19 @@ function renderTable(scored, assertSpec) {
5073
5077
  { header: "Verdict", width: 8 },
5074
5078
  { header: "Detail", width: 50 }
5075
5079
  ];
5076
- const headerLine = cols.map((col) => padRight2(`${c2.bold}${col.header}${c2.reset}`, col.width)).join(" ");
5080
+ const headerLine = cols.map((col) => padRight(`${c.bold}${col.header}${c.reset}`, col.width)).join(" ");
5077
5081
  lines.push(headerLine);
5078
5082
  lines.push(cols.map((col) => "\u2500".repeat(col.width)).join("\u2500\u2500"));
5079
5083
  for (const r of scored) {
5080
- const verdictColor = r.verdict === "pass" ? c2.green : c2.red;
5084
+ const verdictColor = r.verdict === "pass" ? c.green : c.red;
5081
5085
  const failed = r.assertions.filter((a) => !a.passed);
5082
5086
  const passed = r.assertions.filter((a) => a.passed);
5083
5087
  const detail = failed.length > 0 ? failed[0].text.slice(0, 48) : passed.length > 0 ? passed[0].text.slice(0, 48) : "";
5084
5088
  const row = [
5085
- padRight2(r.testId.slice(0, 24), cols[0].width),
5086
- padLeft2(formatScore(r.originalScore), cols[1].width),
5087
- padLeft2(`${verdictColor}${formatScore(r.newScore)}${c2.reset}`, cols[2].width),
5088
- padRight2(`${verdictColor}${r.verdict.toUpperCase()}${c2.reset}`, cols[3].width),
5089
+ padRight(r.testId.slice(0, 24), cols[0].width),
5090
+ padLeft(formatScore(r.originalScore), cols[1].width),
5091
+ padLeft(`${verdictColor}${formatScore(r.newScore)}${c.reset}`, cols[2].width),
5092
+ padRight(`${verdictColor}${r.verdict.toUpperCase()}${c.reset}`, cols[3].width),
5089
5093
  detail.slice(0, cols[4].width)
5090
5094
  ].join(" ");
5091
5095
  lines.push(row);
@@ -5095,7 +5099,7 @@ function renderTable(scored, assertSpec) {
5095
5099
  const meanScore = total > 0 ? scored.reduce((sum, r) => sum + r.newScore, 0) / total : 0;
5096
5100
  lines.push("");
5097
5101
  lines.push(
5098
- `${c2.bold}Assert:${c2.reset} ${assertSpec} ${c2.bold}Results:${c2.reset} ${passCount}/${total} passed (${formatScore(passCount / (total || 1))}) ${c2.bold}Mean:${c2.reset} ${formatScore(meanScore)}`
5102
+ `${c.bold}Assert:${c.reset} ${assertSpec} ${c.bold}Results:${c.reset} ${passCount}/${total} passed (${formatScore(passCount / (total || 1))}) ${c.bold}Mean:${c.reset} ${formatScore(meanScore)}`
5099
5103
  );
5100
5104
  return lines.join("\n");
5101
5105
  }
@@ -5132,7 +5136,7 @@ var traceScoreCommand = command({
5132
5136
  evaluatorConfig = parseAssertSpec(assertSpec);
5133
5137
  } catch (err2) {
5134
5138
  const msg = err2 instanceof Error ? err2.message : String(err2);
5135
- console.error(`${c2.red}Error:${c2.reset} ${msg}`);
5139
+ console.error(`${c.red}Error:${c.reset} ${msg}`);
5136
5140
  process.exit(1);
5137
5141
  }
5138
5142
  let results;
@@ -5140,11 +5144,11 @@ var traceScoreCommand = command({
5140
5144
  results = loadResultFile(file);
5141
5145
  } catch (err2) {
5142
5146
  const msg = err2 instanceof Error ? err2.message : String(err2);
5143
- console.error(`${c2.red}Error:${c2.reset} Could not load result file: ${msg}`);
5147
+ console.error(`${c.red}Error:${c.reset} Could not load result file: ${msg}`);
5144
5148
  process.exit(1);
5145
5149
  }
5146
5150
  if (results.length === 0) {
5147
- console.error(`${c2.yellow}Warning:${c2.reset} No results found in ${file}`);
5151
+ console.error(`${c.yellow}Warning:${c.reset} No results found in ${file}`);
5148
5152
  process.exit(0);
5149
5153
  }
5150
5154
  const traceRequired = ["latency", "cost", "token-usage", "execution-metrics"].includes(
@@ -5156,7 +5160,7 @@ var traceScoreCommand = command({
5156
5160
  );
5157
5161
  if (!hasTrace) {
5158
5162
  console.error(
5159
- `${c2.red}Error:${c2.reset} Source lacks trace metrics. Use an OTLP trace export via ${c2.bold}--otel-file${c2.reset} or a run manifest with summary metrics in ${c2.bold}index.jsonl${c2.reset}.`
5163
+ `${c.red}Error:${c.reset} Source lacks trace metrics. Use an OTLP trace export via ${c.bold}--otel-file${c.reset} or a run manifest with summary metrics in ${c.bold}index.jsonl${c.reset}.`
5160
5164
  );
5161
5165
  process.exit(1);
5162
5166
  }
@@ -5166,12 +5170,12 @@ var traceScoreCommand = command({
5166
5170
  scored = await runScore(results, evaluatorConfig, testId);
5167
5171
  } catch (err2) {
5168
5172
  const msg = err2 instanceof Error ? err2.message : String(err2);
5169
- console.error(`${c2.red}Error:${c2.reset} Scoring failed: ${msg}`);
5173
+ console.error(`${c.red}Error:${c.reset} Scoring failed: ${msg}`);
5170
5174
  process.exit(1);
5171
5175
  }
5172
5176
  if (scored.length === 0) {
5173
5177
  console.error(
5174
- `${c2.yellow}Warning:${c2.reset} No results matched${testId ? ` test ID "${testId}"` : ""}`
5178
+ `${c.yellow}Warning:${c.reset} No results matched${testId ? ` test ID "${testId}"` : ""}`
5175
5179
  );
5176
5180
  process.exit(0);
5177
5181
  }
@@ -5187,6 +5191,192 @@ var traceScoreCommand = command({
5187
5191
  }
5188
5192
  });
5189
5193
 
5194
+ // src/commands/inspect/search.ts
5195
+ import { existsSync as existsSync4, readFileSync as readFileSync5, readdirSync as readdirSync3, statSync as statSync3 } from "node:fs";
5196
+ import path11 from "node:path";
5197
/**
 * Recursively gathers the paths of every file whose name ends in ".jsonl"
 * beneath `dir`.
 *
 * @param {string} dir - Directory to walk.
 * @returns {string[]} Paths built by joining `dir` with each entry name, in
 *   the order the directory listing yields them. Empty when `dir` cannot be
 *   read.
 */
function collectJsonlFiles(dir) {
  let dirents;
  try {
    dirents = readdirSync3(dir, { withFileTypes: true });
  } catch {
    // Best effort: an unreadable or missing directory contributes nothing.
    return [];
  }
  return dirents.flatMap((dirent) => {
    const childPath = path11.join(dir, dirent.name);
    if (dirent.isDirectory()) {
      return collectJsonlFiles(childPath);
    }
    return dirent.name.endsWith(".jsonl") ? [childPath] : [];
  });
}
5213
/**
 * Cuts a window of text around a match and makes it printable on one line.
 *
 * @param {string} text - Text the match was found in.
 * @param {number} matchIndex - Offset of the match within `text`.
 * @param {number} matchLength - Length of the matched substring.
 * @param {number} [contextChars=60] - Characters of context kept on each side.
 * @returns {string} The window, wrapped in "..." on any side that was
 *   truncated, with newlines shown as a literal "\n" and carriage returns
 *   removed.
 */
function extractSnippet(text, matchIndex, matchLength, contextChars = 60) {
  const from = Math.max(0, matchIndex - contextChars);
  const to = Math.min(text.length, matchIndex + matchLength + contextChars);
  const leading = from > 0 ? "..." : "";
  const trailing = to < text.length ? "..." : "";
  const windowed = `${leading}${text.slice(from, to)}${trailing}`;
  return windowed.replace(/\n/g, "\\n").replace(/\r/g, "");
}
5221
/**
 * Searches one JSONL file for records whose raw line matches `regex2`.
 *
 * Lines that are blank or fail to parse as JSON are skipped. When a filter is
 * given, only records whose string `target` / `experiment` field equals it
 * are considered.
 *
 * @param {string} filePath - File to read.
 * @param {RegExp} regex2 - Pattern tested against each raw line.
 * @param {string} [targetFilter] - Required value of the record's `target`.
 * @param {string} [experimentFilter] - Required value of the record's `experiment`.
 * @returns {Array<object>} One entry per matching line with `file`, `id`,
 *   `lineNumber` (1-based position in the file), `snippet`, `target`,
 *   `experiment` and `score`. Empty when the file cannot be read.
 */
function searchJsonlFile(filePath, regex2, targetFilter, experimentFilter) {
  const matches = [];
  let content;
  try {
    content = readFileSync5(filePath, "utf8");
  } catch {
    // Best effort: an unreadable file simply yields no matches.
    return matches;
  }
  // Keep blank lines in the array so that the index still corresponds to the
  // real line number in the file; they are skipped inside the loop instead.
  const lines = content.split("\n");
  for (let i = 0; i < lines.length; i++) {
    const line = lines[i];
    if (!line.trim()) continue;
    let parsed;
    try {
      parsed = JSON.parse(line);
    } catch {
      continue;
    }
    // A line may legally parse to `null` or a primitive; treat it as a record
    // without metadata instead of throwing on property access.
    const record = typeof parsed === "object" && parsed !== null ? parsed : {};
    const target = typeof record.target === "string" ? record.target : void 0;
    const experiment = typeof record.experiment === "string" ? record.experiment : void 0;
    const score = typeof record.score === "number" ? record.score : void 0;
    const testId = typeof record.test_id === "string" ? record.test_id : typeof record.source === "object" && record.source !== null ? record.source.session_id : void 0;
    if (targetFilter && target !== targetFilter) continue;
    if (experimentFilter && experiment !== experimentFilter) continue;
    // Global/sticky regexes remember where the previous exec() stopped;
    // restart at 0 so every line is searched from its beginning.
    regex2.lastIndex = 0;
    const match = regex2.exec(line);
    if (match) {
      matches.push({
        file: filePath,
        id: testId ?? `line-${i + 1}`,
        lineNumber: i + 1,
        snippet: extractSnippet(line, match.index, match[0].length),
        target,
        experiment,
        score
      });
    }
  }
  return matches;
}
5259
/**
 * Works out which files a search should read.
 *
 * Without `basePath`, the JSONL files under `<cwd>/.agentv/results/runs` and
 * `<cwd>/.agentv/transcripts` are collected. With `basePath`, a directory is
 * walked for JSONL files and anything else is returned as a single source.
 * A `basePath` that does not exist is reported on stderr and the process
 * exits with status 1.
 *
 * @param {string | undefined} basePath - Explicit file or directory, if any.
 * @param {string} cwd - Directory that relative paths and defaults hang off.
 * @returns {string[]} Paths of the files to search.
 */
function discoverSources(basePath, cwd) {
  if (!basePath) {
    const defaultRoots = [
      path11.join(cwd, ".agentv", "results", "runs"),
      path11.join(cwd, ".agentv", "transcripts")
    ];
    return defaultRoots.flatMap((root) => collectJsonlFiles(root));
  }

  let resolved = basePath;
  if (!path11.isAbsolute(basePath)) {
    resolved = path11.resolve(cwd, basePath);
  }
  if (!existsSync4(resolved)) {
    console.error(`${c.red}Error:${c.reset} Path does not exist: ${resolved}`);
    process.exit(1);
  }

  try {
    const info = statSync3(resolved);
    if (info.isDirectory()) {
      return collectJsonlFiles(resolved);
    }
  } catch {
    // Fall through and hand the path back as a single source.
  }
  return [resolved];
}
5279
/**
 * Renders search matches as coloured terminal text, grouped by file.
 *
 * @param {Array<object>} matches - Entries with `file`, `id`, `lineNumber`,
 *   `snippet` and optional `target`, `experiment`, `score`.
 * @param {string} pattern - The pattern text, echoed in the header.
 * @returns {string} Newline-joined report, or a single "no matches" line when
 *   `matches` is empty.
 */
function formatSearchResults(matches, pattern) {
  if (matches.length === 0) {
    return `${c.yellow}No matches found for pattern: ${pattern}${c.reset}`;
  }

  const plural = matches.length !== 1 ? "s" : "";
  const out = [
    "",
    `${c.bold}Search Results${c.reset} ${c.dim}pattern: /${pattern}/${c.reset}`,
    `${c.dim}${matches.length} record${plural} matched${c.reset}`,
    ""
  ];

  // Bucket hits per file; a Map keeps files in first-seen order.
  const grouped = /* @__PURE__ */ new Map();
  for (const hit of matches) {
    const bucket = grouped.get(hit.file) ?? [];
    bucket.push(hit);
    grouped.set(hit.file, bucket);
  }

  grouped.forEach((hits, file) => {
    out.push(`${c.cyan}${file}${c.reset}`);
    for (const hit of hits) {
      const tags = [];
      if (hit.target) tags.push(`target:${hit.target}`);
      if (hit.experiment) tags.push(`exp:${hit.experiment}`);
      if (hit.score !== void 0) tags.push(`score:${hit.score}`);
      const suffix = tags.length > 0 ? ` ${c.dim}[${tags.join(", ")}]${c.reset}` : "";
      out.push(
        ` ${c.bold}${hit.id}${c.reset} ${c.dim}(line ${hit.lineNumber})${c.reset}${suffix}`,
        ` ${hit.snippet}`
      );
    }
    out.push("");
  });

  return out.join("\n");
}
5314
/**
 * `inspect search` subcommand: runs a case-insensitive regex over the JSONL
 * sources chosen by `discoverSources` and prints the matches either as a
 * grouped text report or as JSON.
 *
 * Exits with status 1 when the pattern is not a valid regular expression and
 * with status 0 (after a notice on stderr) when there is nothing to search.
 */
var inspectSearchCommand = command({
  name: "search",
  description: "Search across evaluation results and transcripts for a regex pattern",
  args: {
    pattern: option({
      type: string,
      long: "pattern",
      short: "p",
      description: "Regex pattern to search for in result/transcript content"
    }),
    path: positional({
      type: optional(string),
      displayName: "path",
      description: "Directory or file to search (default: .agentv/results/runs/ and .agentv/transcripts/)"
    }),
    target: option({
      type: optional(string),
      long: "target",
      description: "Filter results to a specific target name"
    }),
    experiment: option({
      type: optional(string),
      long: "experiment",
      description: "Filter results to a specific experiment name"
    }),
    dir: option({
      type: optional(string),
      long: "dir",
      short: "d",
      description: "Working directory (default: current directory)"
    }),
    format: option({
      type: optional(oneOf(["table", "json"])),
      long: "format",
      short: "f",
      description: "Output format: table (default) or json"
    })
  },
  handler: async ({ pattern, path: searchPath, target, experiment, dir, format }) => {
    const cwd = dir ?? process.cwd();
    let regex2;
    try {
      regex2 = new RegExp(pattern, "i");
    } catch (err2) {
      // Same defensive message extraction the other command handlers use.
      const msg = err2 instanceof Error ? err2.message : String(err2);
      console.error(`${c.red}Error:${c.reset} Invalid regex pattern: ${msg}`);
      process.exit(1);
    }
    const sources = discoverSources(searchPath, cwd);
    if (sources.length === 0) {
      console.error(`${c.yellow}No JSONL files found to search.${c.reset}`);
      console.error(`${c.dim}Run an evaluation first, or specify a path to search.${c.reset}`);
      process.exit(0);
    }
    // flatMap rather than push(...spread): spreading a very large per-file
    // result into call arguments can exceed the engine's argument limit.
    const allMatches = sources.flatMap(
      (source) => searchJsonlFile(source, regex2, target, experiment)
    );
    if (format === "json") {
      console.log(JSON.stringify(toSnakeCaseDeep(allMatches), null, 2));
    } else {
      console.log(formatSearchResults(allMatches, pattern));
    }
  }
});
5379
+
5190
5380
  // src/commands/inspect/show.ts
5191
5381
  function renderFlatTrace(result) {
5192
5382
  const trace = getTraceSummary(result);
@@ -5214,8 +5404,8 @@ function renderFlatTrace(result) {
5214
5404
  }
5215
5405
  function renderScores(scores) {
5216
5406
  return scores.map((s) => {
5217
- const scoreColor = s.score >= 0.9 ? c2.green : s.score >= 0.5 ? c2.yellow : c2.red;
5218
- return `${s.name} ${scoreColor}${formatScore(s.score)}${c2.reset}`;
5407
+ const scoreColor = s.score >= 0.9 ? c.green : s.score >= 0.5 ? c.yellow : c.red;
5408
+ return `${s.name} ${scoreColor}${formatScore(s.score)}${c.reset}`;
5219
5409
  }).join(" | ");
5220
5410
  }
5221
5411
  function renderTree(result) {
@@ -5228,7 +5418,7 @@ function renderTree(result) {
5228
5418
  if (getTraceSummary(result) || result.duration_ms !== void 0 || result.cost_usd !== void 0) {
5229
5419
  return renderFlatTrace(result);
5230
5420
  }
5231
- return `${c2.dim}No trace data available${c2.reset}`;
5421
+ return `${c.dim}No trace data available${c.reset}`;
5232
5422
  }
5233
5423
  const lines = [];
5234
5424
  const testId = result.test_id ?? result.eval_id ?? "unknown";
@@ -5238,7 +5428,7 @@ function renderTree(result) {
5238
5428
  if (totalDuration !== void 0) rootParts.push(formatDuration(totalDuration));
5239
5429
  if (totalTokens !== void 0) rootParts.push(`${formatNumber(totalTokens)} tok`);
5240
5430
  if (result.cost_usd !== void 0) rootParts.push(formatCost(result.cost_usd));
5241
- lines.push(`${c2.bold}${rootParts.join(", ")}${c2.reset}`);
5431
+ lines.push(`${c.bold}${rootParts.join(", ")}${c.reset}`);
5242
5432
  const steps = [];
5243
5433
  for (let i = 0; i < messages.length; i++) {
5244
5434
  const msg = messages[i];
@@ -5256,7 +5446,7 @@ function renderTree(result) {
5256
5446
  const connector = isLast ? "\u2514\u2500" : "\u251C\u2500";
5257
5447
  const childPrefix = isLast ? " " : "\u2502 ";
5258
5448
  if (step.type === "llm") {
5259
- const parts = [`${c2.cyan}model${c2.reset}`];
5449
+ const parts = [`${c.cyan}model${c.reset}`];
5260
5450
  if (step.msg.duration_ms !== void 0) parts.push(formatDuration(step.msg.duration_ms));
5261
5451
  if (step.msg.token_usage) {
5262
5452
  const tok = step.msg.token_usage.input + step.msg.token_usage.output;
@@ -5267,18 +5457,18 @@ function renderTree(result) {
5267
5457
  const toolCalls = step.msg.tool_calls ?? [];
5268
5458
  if (toolCalls.length === 1) {
5269
5459
  const tc = toolCalls[0];
5270
- const parts = [`${c2.yellow}${tc.tool}${c2.reset}`];
5460
+ const parts = [`${c.yellow}${tc.tool}${c.reset}`];
5271
5461
  if (tc.duration_ms !== void 0) parts.push(formatDuration(tc.duration_ms));
5272
5462
  lines.push(`${connector} ${parts.join(", ")}`);
5273
5463
  } else {
5274
- const parts = [`${c2.dim}tools${c2.reset}`];
5464
+ const parts = [`${c.dim}tools${c.reset}`];
5275
5465
  if (step.msg.duration_ms !== void 0) parts.push(formatDuration(step.msg.duration_ms));
5276
5466
  lines.push(`${connector} ${parts.join(", ")}`);
5277
5467
  for (let ti = 0; ti < toolCalls.length; ti++) {
5278
5468
  const tc = toolCalls[ti];
5279
5469
  const isLastTool = ti === toolCalls.length - 1;
5280
5470
  const toolConnector = isLastTool ? "\u2514\u2500" : "\u251C\u2500";
5281
- const tcParts = [`${c2.yellow}${tc.tool}${c2.reset}`];
5471
+ const tcParts = [`${c.yellow}${tc.tool}${c.reset}`];
5282
5472
  if (tc.duration_ms !== void 0) tcParts.push(formatDuration(tc.duration_ms));
5283
5473
  lines.push(`${childPrefix}${toolConnector} ${tcParts.join(", ")}`);
5284
5474
  }
@@ -5287,7 +5477,7 @@ function renderTree(result) {
5287
5477
  }
5288
5478
  if (result.scores && result.scores.length > 0) {
5289
5479
  lines.push("");
5290
- lines.push(`${c2.dim}Scores:${c2.reset} ${renderScores(result.scores)}`);
5480
+ lines.push(`${c.dim}Scores:${c.reset} ${renderScores(result.scores)}`);
5291
5481
  }
5292
5482
  return lines.join("\n");
5293
5483
  }
@@ -5299,11 +5489,11 @@ function renderSpanTree(result, spans) {
5299
5489
  if (result.duration_ms !== void 0) rootParts.push(formatDuration(result.duration_ms));
5300
5490
  if (totalTokens !== void 0) rootParts.push(`${formatNumber(totalTokens)} tok`);
5301
5491
  if (result.cost_usd !== void 0) rootParts.push(formatCost(result.cost_usd));
5302
- lines.push(`${c2.bold}${rootParts.join(", ")}${c2.reset}`);
5492
+ lines.push(`${c.bold}${rootParts.join(", ")}${c.reset}`);
5303
5493
  spans.forEach((span, index) => {
5304
5494
  const connector = index === spans.length - 1 ? "\u2514\u2500" : "\u251C\u2500";
5305
- const color = span.type === "llm" ? c2.cyan : c2.yellow;
5306
- const parts = [`${color}${span.name}${c2.reset}`];
5495
+ const color = span.type === "llm" ? c.cyan : c.yellow;
5496
+ const parts = [`${color}${span.name}${c.reset}`];
5307
5497
  if (span.duration_ms !== void 0) {
5308
5498
  parts.push(formatDuration(span.duration_ms));
5309
5499
  }
@@ -5311,7 +5501,7 @@ function renderSpanTree(result, spans) {
5311
5501
  });
5312
5502
  if (result.scores && result.scores.length > 0) {
5313
5503
  lines.push("");
5314
- lines.push(`${c2.dim}Scores:${c2.reset} ${renderScores(result.scores)}`);
5504
+ lines.push(`${c.dim}Scores:${c.reset} ${renderScores(result.scores)}`);
5315
5505
  }
5316
5506
  return lines.join("\n");
5317
5507
  }
@@ -5322,30 +5512,30 @@ function formatResultDetail(result, index, tree) {
5322
5512
  lines.push(renderTree(result));
5323
5513
  return lines.join("\n");
5324
5514
  }
5325
- const scoreColor = result.score >= 0.9 ? c2.green : result.score >= 0.5 ? c2.yellow : c2.red;
5515
+ const scoreColor = result.score >= 0.9 ? c.green : result.score >= 0.5 ? c.yellow : c.red;
5326
5516
  lines.push(
5327
- `${c2.bold}${testId}${c2.reset} ${scoreColor}${formatScore(result.score)}${c2.reset}${result.target ? ` ${c2.dim}target: ${result.target}${c2.reset}` : ""}${result.suite ? ` ${c2.dim}suite: ${result.suite}${c2.reset}` : ""}`
5517
+ `${c.bold}${testId}${c.reset} ${scoreColor}${formatScore(result.score)}${c.reset}${result.target ? ` ${c.dim}target: ${result.target}${c.reset}` : ""}${result.suite ? ` ${c.dim}suite: ${result.suite}${c.reset}` : ""}`
5328
5518
  );
5329
5519
  if (result.error) {
5330
- lines.push(` ${c2.red}Error: ${result.error}${c2.reset}`);
5520
+ lines.push(` ${c.red}Error: ${result.error}${c.reset}`);
5331
5521
  }
5332
5522
  if (result.assertions && result.assertions.length > 0) {
5333
5523
  const passed = result.assertions.filter((a) => a.passed);
5334
5524
  const failed = result.assertions.filter((a) => !a.passed);
5335
5525
  if (passed.length > 0)
5336
5526
  lines.push(
5337
- ` ${c2.green}\u2713 Passed:${c2.reset} ${passed.map((a) => a.text).join(", ")}`
5527
+ ` ${c.green}\u2713 Passed:${c.reset} ${passed.map((a) => a.text).join(", ")}`
5338
5528
  );
5339
5529
  if (failed.length > 0)
5340
5530
  lines.push(
5341
- ` ${c2.red}\u2717 Failed:${c2.reset} ${failed.map((a) => a.text).join(", ")}`
5531
+ ` ${c.red}\u2717 Failed:${c.reset} ${failed.map((a) => a.text).join(", ")}`
5342
5532
  );
5343
5533
  }
5344
5534
  if (result.scores && result.scores.length > 0) {
5345
- lines.push(` ${c2.dim}Scores:${c2.reset} ${renderScores(result.scores)}`);
5535
+ lines.push(` ${c.dim}Scores:${c.reset} ${renderScores(result.scores)}`);
5346
5536
  }
5347
5537
  if (result.trace || result.duration_ms !== void 0 || result.cost_usd !== void 0) {
5348
- lines.push(` ${c2.dim}Trace:${c2.reset} ${renderFlatTrace(result)}`);
5538
+ lines.push(` ${c.dim}Trace:${c.reset} ${renderFlatTrace(result)}`);
5349
5539
  }
5350
5540
  if (result.assertions && result.assertions.length > 0) {
5351
5541
  const withEvidence = result.assertions.filter((a) => a.evidence);
@@ -5353,7 +5543,7 @@ function formatResultDetail(result, index, tree) {
5353
5543
  const maxLen = 200;
5354
5544
  const evidence = withEvidence[0].evidence;
5355
5545
  const truncated = evidence.length > maxLen ? `${evidence.slice(0, maxLen)}...` : evidence;
5356
- lines.push(` ${c2.dim}Evidence: ${truncated}${c2.reset}`);
5546
+ lines.push(` ${c.dim}Evidence: ${truncated}${c.reset}`);
5357
5547
  }
5358
5548
  }
5359
5549
  return lines.join("\n");
@@ -5364,9 +5554,9 @@ function formatShowTable(results, filePath, testIdFilter, tree) {
5364
5554
  if (testIdFilter) {
5365
5555
  filtered = results.filter((r) => (r.test_id ?? r.eval_id) === testIdFilter);
5366
5556
  if (filtered.length === 0) {
5367
- lines.push(`${c2.yellow}No results found with test ID "${testIdFilter}"${c2.reset}`);
5557
+ lines.push(`${c.yellow}No results found with test ID "${testIdFilter}"${c.reset}`);
5368
5558
  lines.push("");
5369
- lines.push(`${c2.dim}Available test IDs:${c2.reset}`);
5559
+ lines.push(`${c.dim}Available test IDs:${c.reset}`);
5370
5560
  for (const r of results) {
5371
5561
  lines.push(` ${r.test_id ?? r.eval_id ?? "(unnamed)"}`);
5372
5562
  }
@@ -5374,19 +5564,19 @@ function formatShowTable(results, filePath, testIdFilter, tree) {
5374
5564
  }
5375
5565
  }
5376
5566
  lines.push("");
5377
- lines.push(`${c2.bold}Results:${c2.reset} ${c2.cyan}${filePath}${c2.reset}`);
5567
+ lines.push(`${c.bold}Results:${c.reset} ${c.cyan}${filePath}${c.reset}`);
5378
5568
  const totalTests = filtered.length;
5379
5569
  const passCount = filtered.filter((r) => r.score >= 1).length;
5380
5570
  const failCount = totalTests - passCount;
5381
5571
  const avgScore = totalTests > 0 ? filtered.reduce((sum, r) => sum + r.score, 0) / totalTests : 0;
5382
5572
  lines.push(
5383
- `${c2.dim}${totalTests} test${totalTests !== 1 ? "s" : ""} | ${c2.green}${passCount} passed${c2.reset}${c2.dim}${failCount > 0 ? ` | ${c2.red}${failCount} failed${c2.reset}${c2.dim}` : ""} | avg score: ${formatScore(avgScore)}${c2.reset}`
5573
+ `${c.dim}${totalTests} test${totalTests !== 1 ? "s" : ""} | ${c.green}${passCount} passed${c.reset}${c.dim}${failCount > 0 ? ` | ${c.red}${failCount} failed${c.reset}${c.dim}` : ""} | avg score: ${formatScore(avgScore)}${c.reset}`
5384
5574
  );
5385
5575
  lines.push("");
5386
5576
  for (let i = 0; i < filtered.length; i++) {
5387
5577
  lines.push(formatResultDetail(filtered[i], i, tree ?? false));
5388
5578
  if (i < filtered.length - 1) {
5389
- lines.push(`${c2.dim}${"\u2500".repeat(60)}${c2.reset}`);
5579
+ lines.push(`${c.dim}${"\u2500".repeat(60)}${c.reset}`);
5390
5580
  }
5391
5581
  }
5392
5582
  lines.push("");
@@ -5515,32 +5705,32 @@ function groupResults(results, groupBy2) {
5515
5705
  function formatStatsTable(groups, filePath) {
5516
5706
  const lines = [];
5517
5707
  lines.push("");
5518
- lines.push(`${c2.bold}Statistics:${c2.reset} ${c2.cyan}${filePath}${c2.reset}`);
5708
+ lines.push(`${c.bold}Statistics:${c.reset} ${c.cyan}${filePath}${c.reset}`);
5519
5709
  for (const group of groups) {
5520
5710
  if (groups.length > 1 || group.label !== "all") {
5521
5711
  lines.push("");
5522
5712
  lines.push(
5523
- `${c2.bold}Group: ${group.label}${c2.reset} ${c2.dim}(${group.results.length} tests)${c2.reset}`
5713
+ `${c.bold}Group: ${group.label}${c.reset} ${c.dim}(${group.results.length} tests)${c.reset}`
5524
5714
  );
5525
5715
  } else {
5526
- lines.push(`${c2.dim}${group.results.length} tests${c2.reset}`);
5716
+ lines.push(`${c.dim}${group.results.length} tests${c.reset}`);
5527
5717
  }
5528
5718
  lines.push("");
5529
5719
  const metrics = collectMetrics(group.results);
5530
5720
  if (metrics.length === 0) {
5531
- lines.push(`${c2.yellow}No trace metrics available${c2.reset}`);
5721
+ lines.push(`${c.yellow}No trace metrics available${c.reset}`);
5532
5722
  continue;
5533
5723
  }
5534
5724
  const nameWidth = Math.max(12, ...metrics.map((m) => m.name.length));
5535
5725
  const colWidth = 10;
5536
- const header = ` ${padRight2("Metric", nameWidth)} ${padLeft2("Mean", colWidth)} ${padLeft2("P50", colWidth)} ${padLeft2("P90", colWidth)} ${padLeft2("P95", colWidth)} ${padLeft2("P99", colWidth)}`;
5537
- lines.push(`${c2.dim}${header}${c2.reset}`);
5726
+ const header = ` ${padRight("Metric", nameWidth)} ${padLeft("Mean", colWidth)} ${padLeft("P50", colWidth)} ${padLeft("P90", colWidth)} ${padLeft("P95", colWidth)} ${padLeft("P99", colWidth)}`;
5727
+ lines.push(`${c.dim}${header}${c.reset}`);
5538
5728
  lines.push(
5539
- `${c2.dim} ${"\u2500".repeat(nameWidth)} ${"\u2500".repeat(colWidth)} ${"\u2500".repeat(colWidth)} ${"\u2500".repeat(colWidth)} ${"\u2500".repeat(colWidth)} ${"\u2500".repeat(colWidth)}${c2.reset}`
5729
+ `${c.dim} ${"\u2500".repeat(nameWidth)} ${"\u2500".repeat(colWidth)} ${"\u2500".repeat(colWidth)} ${"\u2500".repeat(colWidth)} ${"\u2500".repeat(colWidth)} ${"\u2500".repeat(colWidth)}${c.reset}`
5540
5730
  );
5541
5731
  for (const metric of metrics) {
5542
5732
  const sorted = [...metric.values].sort((a, b) => a - b);
5543
- const row = ` ${padRight2(metric.name, nameWidth)} ${padLeft2(metric.formatter(mean(sorted)), colWidth)} ${padLeft2(metric.formatter(percentile(sorted, 50)), colWidth)} ${padLeft2(metric.formatter(percentile(sorted, 90)), colWidth)} ${padLeft2(metric.formatter(percentile(sorted, 95)), colWidth)} ${padLeft2(metric.formatter(percentile(sorted, 99)), colWidth)}`;
5733
+ const row = ` ${padRight(metric.name, nameWidth)} ${padLeft(metric.formatter(mean(sorted)), colWidth)} ${padLeft(metric.formatter(percentile(sorted, 50)), colWidth)} ${padLeft(metric.formatter(percentile(sorted, 90)), colWidth)} ${padLeft(metric.formatter(percentile(sorted, 95)), colWidth)} ${padLeft(metric.formatter(percentile(sorted, 99)), colWidth)}`;
5544
5734
  lines.push(row);
5545
5735
  }
5546
5736
  }
@@ -5620,15 +5810,17 @@ var inspectCommand = subcommands({
5620
5810
  name: "inspect",
5621
5811
  description: "Inspect and analyze evaluation results",
5622
5812
  cmds: {
5813
+ filter: inspectFilterCommand,
5623
5814
  list: traceListCommand,
5624
5815
  score: traceScoreCommand,
5816
+ search: inspectSearchCommand,
5625
5817
  show: traceShowCommand,
5626
5818
  stats: traceStatsCommand
5627
5819
  }
5628
5820
  });
5629
5821
 
5630
5822
  // src/commands/pipeline/bench.ts
5631
- import { existsSync as existsSync2 } from "node:fs";
5823
+ import { existsSync as existsSync5 } from "node:fs";
5632
5824
  import { readFile as readFile2, readdir, writeFile as writeFile5 } from "node:fs/promises";
5633
5825
  import { join } from "node:path";
5634
5826
  var evalBenchCommand = command({
@@ -5739,7 +5931,7 @@ var evalBenchCommand = command({
5739
5931
  }));
5740
5932
  let executionStatus = "ok";
5741
5933
  const timingPath = join(testDir, "timing.json");
5742
- if (existsSync2(timingPath)) {
5934
+ if (existsSync5(timingPath)) {
5743
5935
  try {
5744
5936
  const timing = JSON.parse(await readFile2(timingPath, "utf8"));
5745
5937
  if (typeof timing.execution_status === "string") {
@@ -5748,7 +5940,7 @@ var evalBenchCommand = command({
5748
5940
  } catch {
5749
5941
  }
5750
5942
  }
5751
- const hasResponse = existsSync2(join(testDir, "response.md"));
5943
+ const hasResponse = existsSync5(join(testDir, "response.md"));
5752
5944
  indexLines.push(
5753
5945
  JSON.stringify({
5754
5946
  timestamp: manifest.timestamp,
@@ -5796,6 +5988,33 @@ var evalBenchCommand = command({
5796
5988
  "utf8"
5797
5989
  );
5798
5990
  console.log(`Benchmark: ${testIds.length} test(s), pass_rate=${passRateStats.mean}`);
5991
+ const results = indexLines.map((line) => JSON.parse(line));
5992
+ await maybeAutoExportRunArtifacts({
5993
+ cwd: process.cwd(),
5994
+ run_dir: exportDir,
5995
+ experiment,
5996
+ test_files: manifest.eval_file ? [manifest.eval_file] : [],
5997
+ results: results.map((result) => ({
5998
+ testId: result.test_id,
5999
+ score: result.score,
6000
+ executionStatus: result.execution_status,
6001
+ target: result.target,
6002
+ timestamp: result.timestamp
6003
+ })),
6004
+ eval_summaries: [
6005
+ {
6006
+ eval_file: manifest.eval_file ?? "pipeline",
6007
+ total: results.length,
6008
+ passed: results.filter((result) => result.score >= DEFAULT_THRESHOLD).length,
6009
+ avg_score: results.length > 0 ? results.reduce((sum, result) => sum + result.score, 0) / results.length : 0,
6010
+ results: results.map((result) => ({
6011
+ test_id: result.test_id,
6012
+ score: result.score,
6013
+ status: result.execution_status === "execution_error" ? "ERROR" : result.score >= DEFAULT_THRESHOLD ? "PASS" : "FAIL"
6014
+ }))
6015
+ }
6016
+ ]
6017
+ });
5799
6018
  }
5800
6019
  });
5801
6020
  function computeStats(values) {
@@ -5987,7 +6206,7 @@ var evalInputCommand = command({
5987
6206
  out: option({
5988
6207
  type: optional(string),
5989
6208
  long: "out",
5990
- description: "Output directory for extracted inputs (default: .agentv/results/runs/<timestamp>)"
6209
+ description: "Output directory for extracted inputs (default: .agentv/results/runs/<experiment>/<timestamp>)"
5991
6210
  }),
5992
6211
  experiment: option({
5993
6212
  type: optional(string),
@@ -5997,7 +6216,7 @@ var evalInputCommand = command({
5997
6216
  },
5998
6217
  handler: async ({ evalPath, out, experiment }) => {
5999
6218
  const resolvedEvalPath = resolve(evalPath);
6000
- const outDir = resolve(out ?? buildDefaultRunDir(process.cwd()));
6219
+ const outDir = resolve(out ?? buildDefaultRunDir(process.cwd(), experiment));
6001
6220
  const repoRoot = await findRepoRoot(dirname(resolvedEvalPath));
6002
6221
  const evalDir = dirname(resolvedEvalPath);
6003
6222
  const category = deriveCategory(relative(process.cwd(), resolvedEvalPath));
@@ -6147,7 +6366,7 @@ async function writeJson(filePath, data) {
6147
6366
 
6148
6367
  // src/commands/pipeline/run.ts
6149
6368
  import { exec } from "node:child_process";
6150
- import { existsSync as existsSync3, readFileSync as readFileSync5, unlinkSync } from "node:fs";
6369
+ import { existsSync as existsSync6, readFileSync as readFileSync6, unlinkSync } from "node:fs";
6151
6370
  import { mkdir as mkdir7, readFile as readFile5, readdir as readdir3, writeFile as writeFile8 } from "node:fs/promises";
6152
6371
  import { tmpdir } from "node:os";
6153
6372
  import { dirname as dirname2, join as join4, relative as relative2, resolve as resolve2 } from "node:path";
@@ -6161,9 +6380,9 @@ function loadEnvFile(dir) {
6161
6380
  let current = resolve2(dir);
6162
6381
  while (true) {
6163
6382
  const candidate = join4(current, ".env");
6164
- if (existsSync3(candidate)) {
6383
+ if (existsSync6(candidate)) {
6165
6384
  const env2 = {};
6166
- for (const line of readFileSync5(candidate, "utf8").split("\n")) {
6385
+ for (const line of readFileSync6(candidate, "utf8").split("\n")) {
6167
6386
  const trimmed = line.trim();
6168
6387
  if (!trimmed || trimmed.startsWith("#")) continue;
6169
6388
  const eqIdx = trimmed.indexOf("=");
@@ -6190,7 +6409,7 @@ var evalRunCommand2 = command({
6190
6409
  out: option({
6191
6410
  type: optional(string),
6192
6411
  long: "out",
6193
- description: "Output directory for results (default: .agentv/results/runs/<timestamp>)"
6412
+ description: "Output directory for results (default: .agentv/results/runs/<experiment>/<timestamp>)"
6194
6413
  }),
6195
6414
  workers: option({
6196
6415
  type: optional(number),
@@ -6210,7 +6429,7 @@ var evalRunCommand2 = command({
6210
6429
  },
6211
6430
  handler: async ({ evalPath, out, workers, experiment, graderType }) => {
6212
6431
  const resolvedEvalPath = resolve2(evalPath);
6213
- const outDir = resolve2(out ?? buildDefaultRunDir(process.cwd()));
6432
+ const outDir = resolve2(out ?? buildDefaultRunDir(process.cwd(), experiment));
6214
6433
  const repoRoot = await findRepoRoot(dirname2(resolvedEvalPath));
6215
6434
  const evalDir = dirname2(resolvedEvalPath);
6216
6435
  const category = deriveCategory(relative2(process.cwd(), resolvedEvalPath));
@@ -6348,8 +6567,8 @@ var evalRunCommand2 = command({
6348
6567
  });
6349
6568
  const durationMs = Math.round(performance.now() - start);
6350
6569
  let response;
6351
- if (existsSync3(outputFile)) {
6352
- response = readFileSync5(outputFile, "utf8");
6570
+ if (existsSync6(outputFile)) {
6571
+ response = readFileSync6(outputFile, "utf8");
6353
6572
  } else {
6354
6573
  response = "ERROR: No output file generated";
6355
6574
  }
@@ -6381,8 +6600,8 @@ var evalRunCommand2 = command({
6381
6600
  invCompleted++;
6382
6601
  writeInvProgress();
6383
6602
  try {
6384
- if (existsSync3(promptFile)) unlinkSync(promptFile);
6385
- if (existsSync3(outputFile)) unlinkSync(outputFile);
6603
+ if (existsSync6(promptFile)) unlinkSync(promptFile);
6604
+ if (existsSync6(outputFile)) unlinkSync(outputFile);
6386
6605
  } catch {
6387
6606
  }
6388
6607
  }
@@ -6469,7 +6688,7 @@ async function writeGraderConfigs2(testDir, assertions, evalDir) {
6469
6688
  let promptContent = "";
6470
6689
  if (config.resolvedPromptPath) {
6471
6690
  try {
6472
- promptContent = readFileSync5(config.resolvedPromptPath, "utf8");
6691
+ promptContent = readFileSync6(config.resolvedPromptPath, "utf8");
6473
6692
  } catch {
6474
6693
  promptContent = typeof config.prompt === "string" ? config.prompt : "";
6475
6694
  }
@@ -6500,10 +6719,10 @@ var pipelineCommand = subcommands({
6500
6719
  });
6501
6720
 
6502
6721
  // src/commands/results/export.ts
6503
- import path10 from "node:path";
6722
+ import path12 from "node:path";
6504
6723
 
6505
6724
  // src/commands/results/shared.ts
6506
- import { existsSync as existsSync4 } from "node:fs";
6725
+ import { existsSync as existsSync7 } from "node:fs";
6507
6726
  var sourceArg = positional({
6508
6727
  type: optional(string),
6509
6728
  displayName: "source",
@@ -6513,7 +6732,7 @@ async function resolveSourceFile(source, cwd) {
6513
6732
  let sourceFile;
6514
6733
  if (source) {
6515
6734
  sourceFile = resolveResultSourcePath(source, cwd);
6516
- if (!existsSync4(sourceFile)) {
6735
+ if (!existsSync7(sourceFile)) {
6517
6736
  console.error(`Error: File not found: ${sourceFile}`);
6518
6737
  process.exit(1);
6519
6738
  }
@@ -6521,7 +6740,7 @@ async function resolveSourceFile(source, cwd) {
6521
6740
  } else {
6522
6741
  const cache = await loadRunCache(cwd);
6523
6742
  const cachedFile = cache ? resolveRunCacheFile(cache) : "";
6524
- if (cachedFile && existsSync4(cachedFile)) {
6743
+ if (cachedFile && existsSync7(cachedFile)) {
6525
6744
  sourceFile = cachedFile;
6526
6745
  } else {
6527
6746
  const metas = listResultFiles(cwd, 1);
@@ -6547,14 +6766,20 @@ async function loadResults(source, cwd) {
6547
6766
 
6548
6767
  // src/commands/results/export.ts
6549
6768
  function deriveOutputDir(cwd, sourceFile) {
6550
- if (path10.basename(sourceFile) !== RESULT_INDEX_FILENAME) {
6769
+ if (path12.basename(sourceFile) !== RESULT_INDEX_FILENAME) {
6551
6770
  throw new Error(`Expected a run manifest named ${RESULT_INDEX_FILENAME}: ${sourceFile}`);
6552
6771
  }
6553
- const parentDir = path10.basename(path10.dirname(sourceFile));
6772
+ const runDir = path12.dirname(sourceFile);
6773
+ const segments = path12.normalize(runDir).split(path12.sep).filter(Boolean);
6774
+ const runsIndex = segments.lastIndexOf("runs");
6775
+ if (runsIndex >= 0 && runsIndex < segments.length - 1) {
6776
+ return path12.join(cwd, ".agentv", "results", "export", ...segments.slice(runsIndex + 1));
6777
+ }
6778
+ const parentDir = path12.basename(runDir);
6554
6779
  if (parentDir.startsWith("eval_")) {
6555
- return path10.join(cwd, ".agentv", "results", "export", parentDir.slice(5));
6780
+ return path12.join(cwd, ".agentv", "results", "export", parentDir.slice(5));
6556
6781
  }
6557
- return path10.join(cwd, ".agentv", "results", "export", parentDir);
6782
+ return path12.join(cwd, ".agentv", "results", "export", parentDir);
6558
6783
  }
6559
6784
  async function loadExportSource(source, cwd) {
6560
6785
  const { sourceFile } = await resolveSourceFile(source, cwd);
@@ -6587,7 +6812,7 @@ var resultsExportCommand = command({
6587
6812
  const cwd = dir ?? process.cwd();
6588
6813
  try {
6589
6814
  const { sourceFile, results } = await loadExportSource(source, cwd);
6590
- const outputDir = out ? path10.isAbsolute(out) ? out : path10.resolve(cwd, out) : deriveOutputDir(cwd, sourceFile);
6815
+ const outputDir = out ? path12.isAbsolute(out) ? out : path12.resolve(cwd, out) : deriveOutputDir(cwd, sourceFile);
6591
6816
  await writeArtifactsFromResults(results, outputDir, {
6592
6817
  evalFile: sourceFile
6593
6818
  });
@@ -6728,7 +6953,7 @@ var resultsShowCommand = command({
6728
6953
  });
6729
6954
 
6730
6955
  // src/commands/results/summary.ts
6731
- import { existsSync as existsSync5, readFileSync as readFileSync6 } from "node:fs";
6956
+ import { existsSync as existsSync8, readFileSync as readFileSync7 } from "node:fs";
6732
6957
  function formatSummary(results, grading) {
6733
6958
  const total = results.length;
6734
6959
  let passed;
@@ -6779,9 +7004,9 @@ var resultsSummaryCommand = command({
6779
7004
  const { results, sourceFile } = await loadResults(source, cwd);
6780
7005
  let grading;
6781
7006
  const gradingPath = sourceFile.replace(/\.jsonl$/, ".grading.json");
6782
- if (existsSync5(gradingPath)) {
7007
+ if (existsSync8(gradingPath)) {
6783
7008
  try {
6784
- grading = JSON.parse(readFileSync6(gradingPath, "utf8"));
7009
+ grading = JSON.parse(readFileSync7(gradingPath, "utf8"));
6785
7010
  } catch {
6786
7011
  }
6787
7012
  }
@@ -6794,16 +7019,17 @@ var resultsSummaryCommand = command({
6794
7019
  });
6795
7020
 
6796
7021
  // src/commands/results/validate.ts
6797
- import { existsSync as existsSync6, readFileSync as readFileSync7, statSync as statSync3 } from "node:fs";
6798
- import path11 from "node:path";
7022
+ import { existsSync as existsSync9, readFileSync as readFileSync8, statSync as statSync4 } from "node:fs";
7023
+ import path13 from "node:path";
6799
7024
  function checkDirectoryNaming(runDir) {
6800
- const dirName = path11.basename(runDir);
6801
- const parentName = path11.basename(path11.dirname(runDir));
7025
+ const dirName = path13.basename(runDir);
7026
+ const pathSegments = path13.normalize(runDir).split(path13.sep).filter(Boolean);
7027
+ const runsIndex = pathSegments.lastIndexOf("runs");
6802
7028
  const diagnostics = [];
6803
- if (parentName !== "runs") {
7029
+ if (runsIndex < 0 || runsIndex >= pathSegments.length - 1) {
6804
7030
  diagnostics.push({
6805
7031
  severity: "warning",
6806
- message: `Directory is not under a 'runs/' parent (found '${parentName}/'). Expected: .agentv/results/runs/<run-dir>`
7032
+ message: "Directory is not under a 'runs/' tree. Expected: .agentv/results/runs/<experiment>/<run-dir>"
6807
7033
  });
6808
7034
  }
6809
7035
  const isNewFormat = /^\d{4}-\d{2}-\d{2}T\d{2}-\d{2}-\d{2}-\d{3}Z$/.test(dirName);
@@ -6816,15 +7042,25 @@ function checkDirectoryNaming(runDir) {
6816
7042
  }
6817
7043
  return diagnostics;
6818
7044
  }
7045
+ function validateRunDirectory(runDir) {
7046
+ const diagnostics = [];
7047
+ diagnostics.push(...checkDirectoryNaming(runDir));
7048
+ const { diagnostics: indexDiags, entries: entries2 } = checkIndexJsonl(runDir);
7049
+ diagnostics.push(...indexDiags);
7050
+ if (entries2.length > 0) {
7051
+ diagnostics.push(...checkArtifactFiles(runDir, entries2));
7052
+ }
7053
+ return { diagnostics, entries: entries2 };
7054
+ }
6819
7055
  function checkIndexJsonl(runDir) {
6820
- const indexPath = path11.join(runDir, "index.jsonl");
7056
+ const indexPath = path13.join(runDir, "index.jsonl");
6821
7057
  const diagnostics = [];
6822
7058
  const entries2 = [];
6823
- if (!existsSync6(indexPath)) {
7059
+ if (!existsSync9(indexPath)) {
6824
7060
  diagnostics.push({ severity: "error", message: "index.jsonl is missing" });
6825
7061
  return { diagnostics, entries: entries2 };
6826
7062
  }
6827
- const content = readFileSync7(indexPath, "utf8");
7063
+ const content = readFileSync8(indexPath, "utf8");
6828
7064
  const lines = content.split("\n").filter((l) => l.trim().length > 0);
6829
7065
  if (lines.length === 0) {
6830
7066
  diagnostics.push({ severity: "error", message: "index.jsonl is empty" });
@@ -6916,15 +7152,15 @@ function checkArtifactFiles(runDir, entries2) {
6916
7152
  for (const entry of entries2) {
6917
7153
  const testId = entry.test_id ?? "?";
6918
7154
  if (entry.grading_path) {
6919
- const gradingPath = path11.join(runDir, entry.grading_path);
6920
- if (!existsSync6(gradingPath)) {
7155
+ const gradingPath = path13.join(runDir, entry.grading_path);
7156
+ if (!existsSync9(gradingPath)) {
6921
7157
  diagnostics.push({
6922
7158
  severity: "error",
6923
7159
  message: `${testId}: grading.json not found at '${entry.grading_path}'`
6924
7160
  });
6925
7161
  } else {
6926
7162
  try {
6927
- const grading = JSON.parse(readFileSync7(gradingPath, "utf8"));
7163
+ const grading = JSON.parse(readFileSync8(gradingPath, "utf8"));
6928
7164
  if (!grading.assertions || !Array.isArray(grading.assertions)) {
6929
7165
  diagnostics.push({
6930
7166
  severity: "error",
@@ -6946,8 +7182,8 @@ function checkArtifactFiles(runDir, entries2) {
6946
7182
  }
6947
7183
  }
6948
7184
  if (entry.timing_path) {
6949
- const timingPath = path11.join(runDir, entry.timing_path);
6950
- if (!existsSync6(timingPath)) {
7185
+ const timingPath = path13.join(runDir, entry.timing_path);
7186
+ if (!existsSync9(timingPath)) {
6951
7187
  diagnostics.push({
6952
7188
  severity: "warning",
6953
7189
  message: `${testId}: timing.json not found at '${entry.timing_path}'`
@@ -6955,8 +7191,8 @@ function checkArtifactFiles(runDir, entries2) {
6955
7191
  }
6956
7192
  }
6957
7193
  }
6958
- const benchmarkPath = path11.join(runDir, "benchmark.json");
6959
- if (!existsSync6(benchmarkPath)) {
7194
+ const benchmarkPath = path13.join(runDir, "benchmark.json");
7195
+ if (!existsSync9(benchmarkPath)) {
6960
7196
  diagnostics.push({ severity: "warning", message: "benchmark.json is missing" });
6961
7197
  }
6962
7198
  return diagnostics;
@@ -6972,18 +7208,12 @@ var resultsValidateCommand = command({
6972
7208
  })
6973
7209
  },
6974
7210
  handler: async ({ runDir }) => {
6975
- const resolvedDir = path11.resolve(runDir);
6976
- if (!existsSync6(resolvedDir) || !statSync3(resolvedDir).isDirectory()) {
7211
+ const resolvedDir = path13.resolve(runDir);
7212
+ if (!existsSync9(resolvedDir) || !statSync4(resolvedDir).isDirectory()) {
6977
7213
  console.error(`Error: '${runDir}' is not a directory`);
6978
7214
  process.exit(1);
6979
7215
  }
6980
- const allDiagnostics = [];
6981
- allDiagnostics.push(...checkDirectoryNaming(resolvedDir));
6982
- const { diagnostics: indexDiags, entries: entries2 } = checkIndexJsonl(resolvedDir);
6983
- allDiagnostics.push(...indexDiags);
6984
- if (entries2.length > 0) {
6985
- allDiagnostics.push(...checkArtifactFiles(resolvedDir, entries2));
6986
- }
7216
+ const { diagnostics: allDiagnostics, entries: entries2 } = validateRunDirectory(resolvedDir);
6987
7217
  const errors = allDiagnostics.filter((d) => d.severity === "error");
6988
7218
  const warnings = allDiagnostics.filter((d) => d.severity === "warning");
6989
7219
  if (allDiagnostics.length === 0) {
@@ -7020,15 +7250,15 @@ var resultsCommand = subcommands({
7020
7250
  });
7021
7251
 
7022
7252
  // src/commands/results/serve.ts
7023
- import { existsSync as existsSync9, readFileSync as readFileSync9, readdirSync as readdirSync3, statSync as statSync4, writeFileSync as writeFileSync4 } from "node:fs";
7024
- import path14 from "node:path";
7253
+ import { existsSync as existsSync12, readFileSync as readFileSync10, readdirSync as readdirSync4, statSync as statSync5, writeFileSync as writeFileSync4 } from "node:fs";
7254
+ import path16 from "node:path";
7025
7255
  import { fileURLToPath as fileURLToPath2 } from "node:url";
7026
7256
  import { Hono } from "hono";
7027
7257
 
7028
7258
  // src/commands/results/eval-runner.ts
7029
7259
  import { spawn } from "node:child_process";
7030
- import { existsSync as existsSync7 } from "node:fs";
7031
- import path12 from "node:path";
7260
+ import { existsSync as existsSync10 } from "node:fs";
7261
+ import path14 from "node:path";
7032
7262
  var activeRuns = /* @__PURE__ */ new Map();
7033
7263
  function generateRunId() {
7034
7264
  const now = /* @__PURE__ */ new Date();
@@ -7049,16 +7279,16 @@ async function discoverTargetsInProject(cwd) {
7049
7279
  const repoRoot = await findRepoRoot(cwd) ?? cwd;
7050
7280
  let targetsFilePath;
7051
7281
  for (const candidate of TARGET_FILE_CANDIDATES) {
7052
- const fullPath = path12.join(cwd, candidate);
7053
- if (existsSync7(fullPath)) {
7282
+ const fullPath = path14.join(cwd, candidate);
7283
+ if (existsSync10(fullPath)) {
7054
7284
  targetsFilePath = fullPath;
7055
7285
  break;
7056
7286
  }
7057
7287
  }
7058
7288
  if (!targetsFilePath) {
7059
7289
  for (const candidate of TARGET_FILE_CANDIDATES) {
7060
- const fullPath = path12.join(repoRoot, candidate);
7061
- if (existsSync7(fullPath)) {
7290
+ const fullPath = path14.join(repoRoot, candidate);
7291
+ if (existsSync10(fullPath)) {
7062
7292
  targetsFilePath = fullPath;
7063
7293
  break;
7064
7294
  }
@@ -7107,22 +7337,23 @@ function buildCliPreview(args) {
7107
7337
  }
7108
7338
  function resolveCliPath(cwd) {
7109
7339
  const candidates = [
7110
- path12.join(cwd, "apps/cli/src/cli.ts"),
7111
- path12.join(cwd, "apps/cli/dist/cli.js")
7340
+ path14.join(cwd, "apps/cli/src/cli.ts"),
7341
+ path14.join(cwd, "apps/cli/dist/cli.js")
7112
7342
  ];
7113
7343
  for (const c4 of candidates) {
7114
- if (existsSync7(c4)) {
7344
+ if (existsSync10(c4)) {
7115
7345
  return { bunPath: "bun", cliPath: c4 };
7116
7346
  }
7117
7347
  }
7118
- const currentDir = typeof __dirname !== "undefined" ? __dirname : path12.dirname(new URL(import.meta.url).pathname);
7119
- const fromSrc = path12.resolve(currentDir, "../../../cli.ts");
7120
- const fromDist = path12.resolve(currentDir, "../../cli.js");
7121
- if (existsSync7(fromSrc)) return { bunPath: "bun", cliPath: fromSrc };
7122
- if (existsSync7(fromDist)) return { bunPath: "bun", cliPath: fromDist };
7348
+ const currentDir = typeof __dirname !== "undefined" ? __dirname : path14.dirname(new URL(import.meta.url).pathname);
7349
+ const fromSrc = path14.resolve(currentDir, "../../../cli.ts");
7350
+ const fromDist = path14.resolve(currentDir, "../../cli.js");
7351
+ if (existsSync10(fromSrc)) return { bunPath: "bun", cliPath: fromSrc };
7352
+ if (existsSync10(fromDist)) return { bunPath: "bun", cliPath: fromDist };
7123
7353
  return void 0;
7124
7354
  }
7125
- function registerEvalRoutes(app2, getCwd) {
7355
+ function registerEvalRoutes(app2, getCwd, options) {
7356
+ const readOnly = options?.readOnly === true;
7126
7357
  app2.get("/api/eval/discover", async (c4) => {
7127
7358
  const cwd = getCwd(c4);
7128
7359
  try {
@@ -7148,6 +7379,9 @@ function registerEvalRoutes(app2, getCwd) {
7148
7379
  }
7149
7380
  });
7150
7381
  app2.post("/api/eval/run", async (c4) => {
7382
+ if (readOnly) {
7383
+ return c4.json({ error: "Studio is running in read-only mode" }, 403);
7384
+ }
7151
7385
  const cwd = getCwd(c4);
7152
7386
  let body;
7153
7387
  try {
@@ -7389,18 +7623,18 @@ Process error: ${err2.message}`;
7389
7623
  }
7390
7624
 
7391
7625
  // src/commands/results/studio-config.ts
7392
- import { existsSync as existsSync8, mkdirSync as mkdirSync2, readFileSync as readFileSync8, writeFileSync as writeFileSync3 } from "node:fs";
7393
- import path13 from "node:path";
7626
+ import { existsSync as existsSync11, mkdirSync as mkdirSync2, readFileSync as readFileSync9, writeFileSync as writeFileSync3 } from "node:fs";
7627
+ import path15 from "node:path";
7394
7628
  import { parse as parseYaml, stringify as stringifyYaml2 } from "yaml";
7395
7629
  var DEFAULTS = {
7396
7630
  threshold: DEFAULT_THRESHOLD
7397
7631
  };
7398
7632
  function loadStudioConfig(agentvDir) {
7399
- const configPath = path13.join(agentvDir, "config.yaml");
7400
- if (!existsSync8(configPath)) {
7633
+ const configPath = path15.join(agentvDir, "config.yaml");
7634
+ if (!existsSync11(configPath)) {
7401
7635
  return { ...DEFAULTS };
7402
7636
  }
7403
- const raw = readFileSync8(configPath, "utf-8");
7637
+ const raw = readFileSync9(configPath, "utf-8");
7404
7638
  const parsed = parseYaml(raw);
7405
7639
  if (!parsed || typeof parsed !== "object") {
7406
7640
  return { ...DEFAULTS };
@@ -7422,13 +7656,13 @@ function loadStudioConfig(agentvDir) {
7422
7656
  };
7423
7657
  }
7424
7658
  function saveStudioConfig(agentvDir, config) {
7425
- if (!existsSync8(agentvDir)) {
7659
+ if (!existsSync11(agentvDir)) {
7426
7660
  mkdirSync2(agentvDir, { recursive: true });
7427
7661
  }
7428
- const configPath = path13.join(agentvDir, "config.yaml");
7662
+ const configPath = path15.join(agentvDir, "config.yaml");
7429
7663
  let existing = {};
7430
- if (existsSync8(configPath)) {
7431
- const raw = readFileSync8(configPath, "utf-8");
7664
+ if (existsSync11(configPath)) {
7665
+ const raw = readFileSync9(configPath, "utf-8");
7432
7666
  const parsed = parseYaml(raw);
7433
7667
  if (parsed && typeof parsed === "object" && !Array.isArray(parsed)) {
7434
7668
  existing = parsed;
@@ -7451,7 +7685,7 @@ function saveStudioConfig(agentvDir, config) {
7451
7685
  async function resolveSourceFile2(source, cwd) {
7452
7686
  if (source) {
7453
7687
  let resolved = resolveResultSourcePath(source, cwd);
7454
- if (!existsSync9(resolved)) {
7688
+ if (!existsSync12(resolved)) {
7455
7689
  throw new Error(`Source file not found: ${resolved}`);
7456
7690
  }
7457
7691
  resolved = resolveRunManifestPath(resolved);
@@ -7459,7 +7693,7 @@ async function resolveSourceFile2(source, cwd) {
7459
7693
  }
7460
7694
  const cache = await loadRunCache(cwd);
7461
7695
  const cachedFile = cache ? resolveRunCacheFile(cache) : "";
7462
- if (cachedFile && existsSync9(cachedFile)) {
7696
+ if (cachedFile && existsSync12(cachedFile)) {
7463
7697
  return cachedFile;
7464
7698
  }
7465
7699
  const metas = listResultFiles(cwd, 10);
@@ -7479,16 +7713,25 @@ Serving most recent: ${metas[0].path}
7479
7713
  }
7480
7714
  return metas[0].path;
7481
7715
  }
7716
+ function resolveDashboardMode(projectCount, options) {
7717
+ if (options.single === true) {
7718
+ return { isMultiProject: false, showMultiWarning: options.multi === true };
7719
+ }
7720
+ if (options.multi === true) {
7721
+ return { isMultiProject: true, showMultiWarning: true };
7722
+ }
7723
+ return { isMultiProject: projectCount > 1, showMultiWarning: false };
7724
+ }
7482
7725
  function feedbackPath(resultDir) {
7483
- return path14.join(resultDir, "feedback.json");
7726
+ return path16.join(resultDir, "feedback.json");
7484
7727
  }
7485
7728
  function readFeedback(cwd) {
7486
7729
  const fp = feedbackPath(cwd);
7487
- if (!existsSync9(fp)) {
7730
+ if (!existsSync12(fp)) {
7488
7731
  return { reviews: [] };
7489
7732
  }
7490
7733
  try {
7491
- return JSON.parse(readFileSync9(fp, "utf8"));
7734
+ return JSON.parse(readFileSync10(fp, "utf8"));
7492
7735
  } catch (err2) {
7493
7736
  console.error(`Warning: could not parse ${fp}, starting fresh: ${err2.message}`);
7494
7737
  return { reviews: [] };
@@ -7499,16 +7742,16 @@ function writeFeedback(cwd, data) {
7499
7742
  `, "utf8");
7500
7743
  }
7501
7744
  function buildFileTree(dirPath, relativeTo) {
7502
- if (!existsSync9(dirPath) || !statSync4(dirPath).isDirectory()) {
7745
+ if (!existsSync12(dirPath) || !statSync5(dirPath).isDirectory()) {
7503
7746
  return [];
7504
7747
  }
7505
- const entries2 = readdirSync3(dirPath, { withFileTypes: true });
7748
+ const entries2 = readdirSync4(dirPath, { withFileTypes: true });
7506
7749
  return entries2.sort((a, b) => {
7507
7750
  if (a.isDirectory() !== b.isDirectory()) return a.isDirectory() ? -1 : 1;
7508
7751
  return a.name.localeCompare(b.name);
7509
7752
  }).map((entry) => {
7510
- const fullPath = path14.join(dirPath, entry.name);
7511
- const relPath = path14.relative(relativeTo, fullPath);
7753
+ const fullPath = path16.join(dirPath, entry.name);
7754
+ const relPath = path16.relative(relativeTo, fullPath);
7512
7755
  if (entry.isDirectory()) {
7513
7756
  return {
7514
7757
  name: entry.name,
@@ -7521,7 +7764,7 @@ function buildFileTree(dirPath, relativeTo) {
7521
7764
  });
7522
7765
  }
7523
7766
  function inferLanguage(filePath) {
7524
- const ext = path14.extname(filePath).toLowerCase();
7767
+ const ext = path16.extname(filePath).toLowerCase();
7525
7768
  const langMap = {
7526
7769
  ".json": "json",
7527
7770
  ".jsonl": "json",
@@ -7559,8 +7802,8 @@ function stripHeavyFields(results) {
7559
7802
  };
7560
7803
  });
7561
7804
  }
7562
- function handleRuns(c4, { searchDir }) {
7563
- const metas = listResultFiles(searchDir);
7805
+ async function handleRuns(c4, { searchDir }) {
7806
+ const { runs: metas } = await listMergedResultFiles(searchDir);
7564
7807
  return c4.json({
7565
7808
  runs: metas.map((m) => {
7566
7809
  let target;
@@ -7575,32 +7818,38 @@ function handleRuns(c4, { searchDir }) {
7575
7818
  }
7576
7819
  return {
7577
7820
  filename: m.filename,
7821
+ display_name: m.displayName,
7578
7822
  path: m.path,
7579
7823
  timestamp: m.timestamp,
7580
7824
  test_count: m.testCount,
7581
7825
  pass_rate: m.passRate,
7582
7826
  avg_score: m.avgScore,
7583
7827
  size_bytes: m.sizeBytes,
7828
+ source: m.source,
7584
7829
  ...target && { target },
7585
7830
  ...experiment && { experiment }
7586
7831
  };
7587
7832
  })
7588
7833
  });
7589
7834
  }
7590
- function handleRunDetail(c4, { searchDir }) {
7591
- const filename = c4.req.param("filename");
7592
- const meta = listResultFiles(searchDir).find((m) => m.filename === filename);
7835
+ async function handleRunDetail(c4, { searchDir }) {
7836
+ const filename = c4.req.param("filename") ?? "";
7837
+ const meta = await findRunById(searchDir, filename);
7593
7838
  if (!meta) return c4.json({ error: "Run not found" }, 404);
7594
7839
  try {
7595
7840
  const loaded = loadManifestResults(meta.path);
7596
- return c4.json({ results: stripHeavyFields(loaded), source: meta.filename });
7841
+ return c4.json({
7842
+ results: stripHeavyFields(loaded),
7843
+ source: meta.source,
7844
+ source_label: meta.displayName
7845
+ });
7597
7846
  } catch {
7598
7847
  return c4.json({ error: "Failed to load run" }, 500);
7599
7848
  }
7600
7849
  }
7601
- function handleRunSuites(c4, { searchDir, agentvDir }) {
7602
- const filename = c4.req.param("filename");
7603
- const meta = listResultFiles(searchDir).find((m) => m.filename === filename);
7850
+ async function handleRunSuites(c4, { searchDir, agentvDir }) {
7851
+ const filename = c4.req.param("filename") ?? "";
7852
+ const meta = await findRunById(searchDir, filename);
7604
7853
  if (!meta) return c4.json({ error: "Run not found" }, 404);
7605
7854
  try {
7606
7855
  const loaded = loadManifestResults(meta.path);
@@ -7626,9 +7875,9 @@ function handleRunSuites(c4, { searchDir, agentvDir }) {
7626
7875
  return c4.json({ error: "Failed to load suites" }, 500);
7627
7876
  }
7628
7877
  }
7629
- function handleRunCategories(c4, { searchDir, agentvDir }) {
7630
- const filename = c4.req.param("filename");
7631
- const meta = listResultFiles(searchDir).find((m) => m.filename === filename);
7878
+ async function handleRunCategories(c4, { searchDir, agentvDir }) {
7879
+ const filename = c4.req.param("filename") ?? "";
7880
+ const meta = await findRunById(searchDir, filename);
7632
7881
  if (!meta) return c4.json({ error: "Run not found" }, 404);
7633
7882
  try {
7634
7883
  const loaded = loadManifestResults(meta.path);
@@ -7661,10 +7910,10 @@ function handleRunCategories(c4, { searchDir, agentvDir }) {
7661
7910
  return c4.json({ error: "Failed to load categories" }, 500);
7662
7911
  }
7663
7912
  }
7664
- function handleCategorySuites(c4, { searchDir, agentvDir }) {
7665
- const filename = c4.req.param("filename");
7913
+ async function handleCategorySuites(c4, { searchDir, agentvDir }) {
7914
+ const filename = c4.req.param("filename") ?? "";
7666
7915
  const category = decodeURIComponent(c4.req.param("category") ?? "");
7667
- const meta = listResultFiles(searchDir).find((m) => m.filename === filename);
7916
+ const meta = await findRunById(searchDir, filename);
7668
7917
  if (!meta) return c4.json({ error: "Run not found" }, 404);
7669
7918
  try {
7670
7919
  const loaded = loadManifestResults(meta.path);
@@ -7691,10 +7940,10 @@ function handleCategorySuites(c4, { searchDir, agentvDir }) {
7691
7940
  return c4.json({ error: "Failed to load suites" }, 500);
7692
7941
  }
7693
7942
  }
7694
- function handleEvalDetail(c4, { searchDir }) {
7695
- const filename = c4.req.param("filename");
7943
+ async function handleEvalDetail(c4, { searchDir }) {
7944
+ const filename = c4.req.param("filename") ?? "";
7696
7945
  const evalId = c4.req.param("evalId");
7697
- const meta = listResultFiles(searchDir).find((m) => m.filename === filename);
7946
+ const meta = await findRunById(searchDir, filename);
7698
7947
  if (!meta) return c4.json({ error: "Run not found" }, 404);
7699
7948
  try {
7700
7949
  const loaded = loadManifestResults(meta.path);
@@ -7705,17 +7954,17 @@ function handleEvalDetail(c4, { searchDir }) {
7705
7954
  return c4.json({ error: "Failed to load eval" }, 500);
7706
7955
  }
7707
7956
  }
7708
- function handleEvalFiles(c4, { searchDir }) {
7709
- const filename = c4.req.param("filename");
7957
+ async function handleEvalFiles(c4, { searchDir }) {
7958
+ const filename = c4.req.param("filename") ?? "";
7710
7959
  const evalId = c4.req.param("evalId");
7711
- const meta = listResultFiles(searchDir).find((m) => m.filename === filename);
7960
+ const meta = await findRunById(searchDir, filename);
7712
7961
  if (!meta) return c4.json({ error: "Run not found" }, 404);
7713
7962
  try {
7714
- const content = readFileSync9(meta.path, "utf8");
7963
+ const content = readFileSync10(meta.path, "utf8");
7715
7964
  const records = parseResultManifest(content);
7716
7965
  const record = records.find((r) => r.test_id === evalId);
7717
7966
  if (!record) return c4.json({ error: "Eval not found" }, 404);
7718
- const baseDir = path14.dirname(meta.path);
7967
+ const baseDir = path16.dirname(meta.path);
7719
7968
  const knownPaths = [
7720
7969
  record.grading_path,
7721
7970
  record.timing_path,
@@ -7724,47 +7973,46 @@ function handleEvalFiles(c4, { searchDir }) {
7724
7973
  record.response_path
7725
7974
  ].filter((p) => !!p);
7726
7975
  if (knownPaths.length === 0) return c4.json({ files: [] });
7727
- const artifactDirs = knownPaths.map((p) => path14.dirname(p));
7976
+ const artifactDirs = knownPaths.map((p) => path16.dirname(p));
7728
7977
  let commonDir = artifactDirs[0];
7729
7978
  for (const dir of artifactDirs) {
7730
7979
  while (!dir.startsWith(commonDir)) {
7731
- commonDir = path14.dirname(commonDir);
7980
+ commonDir = path16.dirname(commonDir);
7732
7981
  }
7733
7982
  }
7734
- const artifactAbsDir = path14.join(baseDir, commonDir);
7983
+ const artifactAbsDir = path16.join(baseDir, commonDir);
7735
7984
  const files = buildFileTree(artifactAbsDir, baseDir);
7736
7985
  return c4.json({ files });
7737
7986
  } catch {
7738
7987
  return c4.json({ error: "Failed to load file tree" }, 500);
7739
7988
  }
7740
7989
  }
7741
- function handleEvalFileContent(c4, { searchDir }) {
7742
- const filename = c4.req.param("filename");
7743
- const evalId = c4.req.param("evalId");
7744
- const meta = listResultFiles(searchDir).find((m) => m.filename === filename);
7990
+ async function handleEvalFileContent(c4, { searchDir }) {
7991
+ const filename = c4.req.param("filename") ?? "";
7992
+ const meta = await findRunById(searchDir, filename);
7745
7993
  if (!meta) return c4.json({ error: "Run not found" }, 404);
7746
- const marker = `/runs/${filename}/evals/${evalId}/files/`;
7994
+ const marker = "/files/";
7747
7995
  const markerIdx = c4.req.path.indexOf(marker);
7748
7996
  const filePath = markerIdx >= 0 ? c4.req.path.slice(markerIdx + marker.length) : "";
7749
7997
  if (!filePath) return c4.json({ error: "No file path specified" }, 400);
7750
- const baseDir = path14.dirname(meta.path);
7751
- const absolutePath = path14.resolve(baseDir, filePath);
7752
- if (!absolutePath.startsWith(path14.resolve(baseDir) + path14.sep) && absolutePath !== path14.resolve(baseDir)) {
7998
+ const baseDir = path16.dirname(meta.path);
7999
+ const absolutePath = path16.resolve(baseDir, filePath);
8000
+ if (!absolutePath.startsWith(path16.resolve(baseDir) + path16.sep) && absolutePath !== path16.resolve(baseDir)) {
7753
8001
  return c4.json({ error: "Path traversal not allowed" }, 403);
7754
8002
  }
7755
- if (!existsSync9(absolutePath) || !statSync4(absolutePath).isFile()) {
8003
+ if (!existsSync12(absolutePath) || !statSync5(absolutePath).isFile()) {
7756
8004
  return c4.json({ error: "File not found" }, 404);
7757
8005
  }
7758
8006
  try {
7759
- const fileContent = readFileSync9(absolutePath, "utf8");
8007
+ const fileContent = readFileSync10(absolutePath, "utf8");
7760
8008
  const language = inferLanguage(absolutePath);
7761
8009
  return c4.json({ content: fileContent, language });
7762
8010
  } catch {
7763
8011
  return c4.json({ error: "Failed to read file" }, 500);
7764
8012
  }
7765
8013
  }
7766
- function handleExperiments(c4, { searchDir, agentvDir }) {
7767
- const metas = listResultFiles(searchDir);
8014
+ async function handleExperiments(c4, { searchDir, agentvDir }) {
8015
+ const { runs: metas } = await listMergedResultFiles(searchDir);
7768
8016
  const { threshold: pass_threshold } = loadStudioConfig(agentvDir);
7769
8017
  const experimentMap = /* @__PURE__ */ new Map();
7770
8018
  for (const m of metas) {
@@ -7802,8 +8050,70 @@ function handleExperiments(c4, { searchDir, agentvDir }) {
7802
8050
  }));
7803
8051
  return c4.json({ experiments });
7804
8052
  }
7805
- function handleTargets(c4, { searchDir, agentvDir }) {
7806
- const metas = listResultFiles(searchDir);
8053
+ async function handleCompare(c4, { searchDir, agentvDir }) {
8054
+ const { runs: metas } = await listMergedResultFiles(searchDir);
8055
+ const { threshold: pass_threshold } = loadStudioConfig(agentvDir);
8056
+ const cellMap = /* @__PURE__ */ new Map();
8057
+ const experimentsSet = /* @__PURE__ */ new Set();
8058
+ const targetsSet = /* @__PURE__ */ new Set();
8059
+ for (const m of metas) {
8060
+ try {
8061
+ const records = loadLightweightResults(m.path);
8062
+ for (const r of records) {
8063
+ const experiment = r.experiment ?? "default";
8064
+ const target = r.target ?? "default";
8065
+ experimentsSet.add(experiment);
8066
+ targetsSet.add(target);
8067
+ const key = JSON.stringify([experiment, target]);
8068
+ const entry = cellMap.get(key) ?? {
8069
+ experiment,
8070
+ target,
8071
+ evalCount: 0,
8072
+ passedCount: 0,
8073
+ scoreSum: 0,
8074
+ tests: []
8075
+ };
8076
+ const passed = r.score >= pass_threshold;
8077
+ entry.evalCount++;
8078
+ if (passed) entry.passedCount++;
8079
+ entry.scoreSum += r.score;
8080
+ entry.tests.push({
8081
+ test_id: r.testId,
8082
+ score: r.score,
8083
+ passed,
8084
+ execution_status: r.executionStatus
8085
+ });
8086
+ cellMap.set(key, entry);
8087
+ }
8088
+ } catch {
8089
+ }
8090
+ }
8091
+ const MAX_TESTS_PER_CELL = 100;
8092
+ const cells = [...cellMap.values()].map((entry) => {
8093
+ const dedupMap = /* @__PURE__ */ new Map();
8094
+ for (const t of entry.tests) {
8095
+ dedupMap.set(t.test_id, t);
8096
+ }
8097
+ const dedupedTests = [...dedupMap.values()];
8098
+ const cappedTests = dedupedTests.slice(-MAX_TESTS_PER_CELL);
8099
+ return {
8100
+ experiment: entry.experiment,
8101
+ target: entry.target,
8102
+ eval_count: entry.evalCount,
8103
+ passed_count: entry.passedCount,
8104
+ pass_rate: entry.evalCount > 0 ? entry.passedCount / entry.evalCount : 0,
8105
+ avg_score: entry.evalCount > 0 ? entry.scoreSum / entry.evalCount : 0,
8106
+ tests: cappedTests
8107
+ };
8108
+ });
8109
+ return c4.json({
8110
+ experiments: [...experimentsSet].sort(),
8111
+ targets: [...targetsSet].sort(),
8112
+ cells
8113
+ });
8114
+ }
8115
+ async function handleTargets(c4, { searchDir, agentvDir }) {
8116
+ const { runs: metas } = await listMergedResultFiles(searchDir);
7807
8117
  const { threshold: pass_threshold } = loadStudioConfig(agentvDir);
7808
8118
  const targetMap = /* @__PURE__ */ new Map();
7809
8119
  for (const m of metas) {
@@ -7836,29 +8146,38 @@ function handleTargets(c4, { searchDir, agentvDir }) {
7836
8146
  }));
7837
8147
  return c4.json({ targets });
7838
8148
  }
7839
- function handleConfig(c4, { agentvDir }) {
7840
- return c4.json(loadStudioConfig(agentvDir));
8149
+ function handleConfig(c4, { agentvDir, searchDir }, options) {
8150
+ return c4.json({
8151
+ ...loadStudioConfig(agentvDir),
8152
+ read_only: options?.readOnly === true,
8153
+ project_name: path16.basename(searchDir),
8154
+ multi_project_dashboard: options?.multiProjectDashboard === true
8155
+ });
7841
8156
  }
7842
8157
  function handleFeedbackRead(c4, { searchDir }) {
7843
- const resultsDir = path14.join(searchDir, ".agentv", "results");
7844
- return c4.json(readFeedback(existsSync9(resultsDir) ? resultsDir : searchDir));
8158
+ const resultsDir = path16.join(searchDir, ".agentv", "results");
8159
+ return c4.json(readFeedback(existsSync12(resultsDir) ? resultsDir : searchDir));
7845
8160
  }
7846
8161
  function createApp(results, resultDir, cwd, sourceFile, options) {
7847
8162
  const searchDir = cwd ?? resultDir;
7848
- const agentvDir = path14.join(searchDir, ".agentv");
8163
+ const agentvDir = path16.join(searchDir, ".agentv");
7849
8164
  const defaultCtx = { searchDir, agentvDir };
8165
+ const readOnly = options?.readOnly === true;
7850
8166
  const app2 = new Hono();
7851
8167
  function withProject(c4, handler) {
7852
8168
  const project = getProject(c4.req.param("projectId") ?? "");
7853
- if (!project || !existsSync9(project.path)) {
8169
+ if (!project || !existsSync12(project.path)) {
7854
8170
  return c4.json({ error: "Project not found" }, 404);
7855
8171
  }
7856
8172
  return handler(c4, {
7857
8173
  searchDir: project.path,
7858
- agentvDir: path14.join(project.path, ".agentv")
8174
+ agentvDir: path16.join(project.path, ".agentv")
7859
8175
  });
7860
8176
  }
7861
8177
  app2.post("/api/config", async (c4) => {
8178
+ if (readOnly) {
8179
+ return c4.json({ error: "Studio is running in read-only mode" }, 403);
8180
+ }
7862
8181
  try {
7863
8182
  const body = await c4.req.json();
7864
8183
  const current = loadStudioConfig(agentvDir);
@@ -7881,32 +8200,37 @@ function createApp(results, resultDir, cwd, sourceFile, options) {
7881
8200
  last_opened_at: entry.lastOpenedAt
7882
8201
  };
7883
8202
  }
7884
- app2.get("/api/projects", (c4) => {
8203
+ app2.get("/api/projects", async (c4) => {
7885
8204
  const registry = loadProjectRegistry();
7886
- const projects = registry.projects.map((p) => {
7887
- let runCount = 0;
7888
- let passRate = 0;
7889
- let lastRun = null;
7890
- try {
7891
- const metas = listResultFiles(p.path);
7892
- runCount = metas.length;
7893
- if (metas.length > 0) {
7894
- const totalPassRate = metas.reduce((sum, m) => sum + m.passRate, 0);
7895
- passRate = totalPassRate / metas.length;
7896
- lastRun = metas[0].timestamp;
8205
+ const projects = await Promise.all(
8206
+ registry.projects.map(async (p) => {
8207
+ let runCount = 0;
8208
+ let passRate = 0;
8209
+ let lastRun = null;
8210
+ try {
8211
+ const { runs: metas } = await listMergedResultFiles(p.path);
8212
+ runCount = metas.length;
8213
+ if (metas.length > 0) {
8214
+ const totalPassRate = metas.reduce((sum, m) => sum + m.passRate, 0);
8215
+ passRate = totalPassRate / metas.length;
8216
+ lastRun = metas[0].timestamp;
8217
+ }
8218
+ } catch {
7897
8219
  }
7898
- } catch {
7899
- }
7900
- return {
7901
- ...projectEntryToWire(p),
7902
- run_count: runCount,
7903
- pass_rate: passRate,
7904
- last_run: lastRun
7905
- };
7906
- });
8220
+ return {
8221
+ ...projectEntryToWire(p),
8222
+ run_count: runCount,
8223
+ pass_rate: passRate,
8224
+ last_run: lastRun
8225
+ };
8226
+ })
8227
+ );
7907
8228
  return c4.json({ projects });
7908
8229
  });
7909
8230
  app2.post("/api/projects", async (c4) => {
8231
+ if (readOnly) {
8232
+ return c4.json({ error: "Studio is running in read-only mode" }, 403);
8233
+ }
7910
8234
  try {
7911
8235
  const body = await c4.req.json();
7912
8236
  if (!body.path) return c4.json({ error: "Missing path" }, 400);
@@ -7917,15 +8241,18 @@ function createApp(results, resultDir, cwd, sourceFile, options) {
7917
8241
  }
7918
8242
  });
7919
8243
  app2.delete("/api/projects/:projectId", (c4) => {
8244
+ if (readOnly) {
8245
+ return c4.json({ error: "Studio is running in read-only mode" }, 403);
8246
+ }
7920
8247
  const removed = removeProject(c4.req.param("projectId") ?? "");
7921
8248
  if (!removed) return c4.json({ error: "Project not found" }, 404);
7922
8249
  return c4.json({ ok: true });
7923
8250
  });
7924
- app2.get("/api/projects/:projectId/summary", (c4) => {
8251
+ app2.get("/api/projects/:projectId/summary", async (c4) => {
7925
8252
  const project = getProject(c4.req.param("projectId") ?? "");
7926
8253
  if (!project) return c4.json({ error: "Project not found" }, 404);
7927
8254
  try {
7928
- const metas = listResultFiles(project.path);
8255
+ const { runs: metas } = await listMergedResultFiles(project.path);
7929
8256
  const runCount = metas.length;
7930
8257
  const passRate = runCount > 0 ? metas.reduce((s, m) => s + m.passRate, 0) / runCount : 0;
7931
8258
  const lastRun = metas.length > 0 ? metas[0].timestamp : null;
@@ -7942,6 +8269,9 @@ function createApp(results, resultDir, cwd, sourceFile, options) {
7942
8269
  }
7943
8270
  });
7944
8271
  app2.post("/api/projects/discover", async (c4) => {
8272
+ if (readOnly) {
8273
+ return c4.json({ error: "Studio is running in read-only mode" }, 403);
8274
+ }
7945
8275
  try {
7946
8276
  const body = await c4.req.json();
7947
8277
  if (!body.path) return c4.json({ error: "Missing path" }, 400);
@@ -7952,12 +8282,12 @@ function createApp(results, resultDir, cwd, sourceFile, options) {
7952
8282
  return c4.json({ error: err2.message }, 400);
7953
8283
  }
7954
8284
  });
7955
- app2.get("/api/projects/all-runs", (c4) => {
8285
+ app2.get("/api/projects/all-runs", async (c4) => {
7956
8286
  const registry = loadProjectRegistry();
7957
8287
  const allRuns = [];
7958
8288
  for (const p of registry.projects) {
7959
8289
  try {
7960
- const metas = listResultFiles(p.path);
8290
+ const { runs: metas } = await listMergedResultFiles(p.path);
7961
8291
  for (const m of metas) {
7962
8292
  let target;
7963
8293
  let experiment;
@@ -7971,12 +8301,14 @@ function createApp(results, resultDir, cwd, sourceFile, options) {
7971
8301
  }
7972
8302
  allRuns.push({
7973
8303
  filename: m.filename,
8304
+ display_name: m.displayName,
7974
8305
  path: m.path,
7975
8306
  timestamp: m.timestamp,
7976
8307
  test_count: m.testCount,
7977
8308
  pass_rate: m.passRate,
7978
8309
  avg_score: m.avgScore,
7979
8310
  size_bytes: m.sizeBytes,
8311
+ source: m.source,
7980
8312
  ...target && { target },
7981
8313
  ...experiment && { experiment },
7982
8314
  project_id: p.id,
@@ -7989,7 +8321,15 @@ function createApp(results, resultDir, cwd, sourceFile, options) {
7989
8321
  allRuns.sort((a, b) => b.timestamp.localeCompare(a.timestamp));
7990
8322
  return c4.json({ runs: allRuns });
7991
8323
  });
7992
- app2.get("/api/config", (c4) => handleConfig(c4, defaultCtx));
8324
+ app2.get(
8325
+ "/api/config",
8326
+ (c4) => handleConfig(c4, defaultCtx, {
8327
+ readOnly,
8328
+ multiProjectDashboard: options?.multiProjectDashboard
8329
+ })
8330
+ );
8331
+ app2.get("/api/remote/status", async (c4) => c4.json(await getRemoteResultsStatus(searchDir)));
8332
+ app2.post("/api/remote/sync", async (c4) => c4.json(await syncRemoteResults(searchDir)));
7993
8333
  app2.get("/api/runs", (c4) => handleRuns(c4, defaultCtx));
7994
8334
  app2.get("/api/runs/:filename", (c4) => handleRunDetail(c4, defaultCtx));
7995
8335
  app2.get("/api/runs/:filename/suites", (c4) => handleRunSuites(c4, defaultCtx));
@@ -8002,12 +8342,16 @@ function createApp(results, resultDir, cwd, sourceFile, options) {
8002
8342
  app2.get("/api/runs/:filename/evals/:evalId/files", (c4) => handleEvalFiles(c4, defaultCtx));
8003
8343
  app2.get("/api/runs/:filename/evals/:evalId/files/*", (c4) => handleEvalFileContent(c4, defaultCtx));
8004
8344
  app2.get("/api/experiments", (c4) => handleExperiments(c4, defaultCtx));
8345
+ app2.get("/api/compare", (c4) => handleCompare(c4, defaultCtx));
8005
8346
  app2.get("/api/targets", (c4) => handleTargets(c4, defaultCtx));
8006
8347
  app2.get("/api/feedback", (c4) => {
8007
8348
  const data = readFeedback(resultDir);
8008
8349
  return c4.json(data);
8009
8350
  });
8010
8351
  app2.post("/api/feedback", async (c4) => {
8352
+ if (readOnly) {
8353
+ return c4.json({ error: "Studio is running in read-only mode" }, 403);
8354
+ }
8011
8355
  let body;
8012
8356
  try {
8013
8357
  body = await c4.req.json();
@@ -8045,8 +8389,8 @@ function createApp(results, resultDir, cwd, sourceFile, options) {
8045
8389
  writeFeedback(resultDir, existing);
8046
8390
  return c4.json(existing);
8047
8391
  });
8048
- app2.get("/api/index", (c4) => {
8049
- const metas = listResultFiles(searchDir);
8392
+ app2.get("/api/index", async (c4) => {
8393
+ const { runs: metas } = await listMergedResultFiles(searchDir);
8050
8394
  const entries2 = metas.map((m) => {
8051
8395
  let totalCostUsd = 0;
8052
8396
  try {
@@ -8056,6 +8400,7 @@ function createApp(results, resultDir, cwd, sourceFile, options) {
8056
8400
  }
8057
8401
  return {
8058
8402
  run_filename: m.filename,
8403
+ display_name: m.displayName,
8059
8404
  test_count: m.testCount,
8060
8405
  pass_rate: m.passRate,
8061
8406
  avg_score: m.avgScore,
@@ -8065,7 +8410,27 @@ function createApp(results, resultDir, cwd, sourceFile, options) {
8065
8410
  });
8066
8411
  return c4.json({ entries: entries2 });
8067
8412
  });
8068
- app2.get("/api/projects/:projectId/config", (c4) => withProject(c4, handleConfig));
8413
+ app2.get(
8414
+ "/api/projects/:projectId/config",
8415
+ (c4) => withProject(
8416
+ c4,
8417
+ (ctx, dataCtx) => handleConfig(ctx, dataCtx, {
8418
+ readOnly,
8419
+ multiProjectDashboard: options?.multiProjectDashboard
8420
+ })
8421
+ )
8422
+ );
8423
+ app2.get(
8424
+ "/api/projects/:projectId/remote/status",
8425
+ (c4) => withProject(
8426
+ c4,
8427
+ async (ctx, dataCtx) => ctx.json(await getRemoteResultsStatus(dataCtx.searchDir))
8428
+ )
8429
+ );
8430
+ app2.post(
8431
+ "/api/projects/:projectId/remote/sync",
8432
+ (c4) => withProject(c4, async (ctx, dataCtx) => ctx.json(await syncRemoteResults(dataCtx.searchDir)))
8433
+ );
8069
8434
  app2.get("/api/projects/:projectId/runs", (c4) => withProject(c4, handleRuns));
8070
8435
  app2.get("/api/projects/:projectId/runs/:filename", (c4) => withProject(c4, handleRunDetail));
8071
8436
  app2.get("/api/projects/:projectId/runs/:filename/suites", (c4) => withProject(c4, handleRunSuites));
@@ -8090,31 +8455,36 @@ function createApp(results, resultDir, cwd, sourceFile, options) {
8090
8455
  (c4) => withProject(c4, handleEvalFileContent)
8091
8456
  );
8092
8457
  app2.get("/api/projects/:projectId/experiments", (c4) => withProject(c4, handleExperiments));
8458
+ app2.get("/api/projects/:projectId/compare", (c4) => withProject(c4, handleCompare));
8093
8459
  app2.get("/api/projects/:projectId/targets", (c4) => withProject(c4, handleTargets));
8094
8460
  app2.get("/api/projects/:projectId/feedback", (c4) => withProject(c4, handleFeedbackRead));
8095
- registerEvalRoutes(app2, (c4) => {
8096
- const projectId = c4.req.param("projectId");
8097
- if (projectId) {
8098
- const project = getProject(projectId);
8099
- if (project) return project.path;
8100
- }
8101
- return searchDir;
8102
- });
8461
+ registerEvalRoutes(
8462
+ app2,
8463
+ (c4) => {
8464
+ const projectId = c4.req.param("projectId");
8465
+ if (projectId) {
8466
+ const project = getProject(projectId);
8467
+ if (project) return project.path;
8468
+ }
8469
+ return searchDir;
8470
+ },
8471
+ { readOnly }
8472
+ );
8103
8473
  const studioDistPath = options?.studioDir ?? resolveStudioDistDir();
8104
- if (!studioDistPath || !existsSync9(path14.join(studioDistPath, "index.html"))) {
8474
+ if (!studioDistPath || !existsSync12(path16.join(studioDistPath, "index.html"))) {
8105
8475
  throw new Error('Studio dist not found. Run "bun run build" in apps/studio/ to build the SPA.');
8106
8476
  }
8107
8477
  app2.get("/", (c4) => {
8108
- const indexPath = path14.join(studioDistPath, "index.html");
8109
- if (existsSync9(indexPath)) return c4.html(readFileSync9(indexPath, "utf8"));
8478
+ const indexPath = path16.join(studioDistPath, "index.html");
8479
+ if (existsSync12(indexPath)) return c4.html(readFileSync10(indexPath, "utf8"));
8110
8480
  return c4.notFound();
8111
8481
  });
8112
8482
  app2.get("/assets/*", (c4) => {
8113
8483
  const assetPath = c4.req.path;
8114
- const filePath = path14.join(studioDistPath, assetPath);
8115
- if (!existsSync9(filePath)) return c4.notFound();
8116
- const content = readFileSync9(filePath);
8117
- const ext = path14.extname(filePath);
8484
+ const filePath = path16.join(studioDistPath, assetPath);
8485
+ if (!existsSync12(filePath)) return c4.notFound();
8486
+ const content = readFileSync10(filePath);
8487
+ const ext = path16.extname(filePath);
8118
8488
  const mimeTypes = {
8119
8489
  ".js": "application/javascript",
8120
8490
  ".css": "text/css",
@@ -8135,26 +8505,26 @@ function createApp(results, resultDir, cwd, sourceFile, options) {
8135
8505
  });
8136
8506
  app2.get("*", (c4) => {
8137
8507
  if (c4.req.path.startsWith("/api/")) return c4.json({ error: "Not found" }, 404);
8138
- const indexPath = path14.join(studioDistPath, "index.html");
8139
- if (existsSync9(indexPath)) return c4.html(readFileSync9(indexPath, "utf8"));
8508
+ const indexPath = path16.join(studioDistPath, "index.html");
8509
+ if (existsSync12(indexPath)) return c4.html(readFileSync10(indexPath, "utf8"));
8140
8510
  return c4.notFound();
8141
8511
  });
8142
8512
  return app2;
8143
8513
  }
8144
8514
  function resolveStudioDistDir() {
8145
- const currentDir = typeof __dirname !== "undefined" ? __dirname : path14.dirname(fileURLToPath2(import.meta.url));
8515
+ const currentDir = typeof __dirname !== "undefined" ? __dirname : path16.dirname(fileURLToPath2(import.meta.url));
8146
8516
  const candidates = [
8147
8517
  // From src/commands/results/ → sibling apps/studio/dist
8148
- path14.resolve(currentDir, "../../../../studio/dist"),
8518
+ path16.resolve(currentDir, "../../../../studio/dist"),
8149
8519
  // From dist/ → sibling apps/studio/dist (monorepo dev)
8150
- path14.resolve(currentDir, "../../studio/dist"),
8520
+ path16.resolve(currentDir, "../../studio/dist"),
8151
8521
  // Bundled inside CLI dist (published package: dist/studio/)
8152
- path14.resolve(currentDir, "studio"),
8522
+ path16.resolve(currentDir, "studio"),
8153
8523
  // From dist/ in monorepo root context
8154
- path14.resolve(currentDir, "../../../apps/studio/dist")
8524
+ path16.resolve(currentDir, "../../../apps/studio/dist")
8155
8525
  ];
8156
8526
  for (const candidate of candidates) {
8157
- if (existsSync9(candidate) && existsSync9(path14.join(candidate, "index.html"))) {
8527
+ if (existsSync12(candidate) && existsSync12(path16.join(candidate, "index.html"))) {
8158
8528
  return candidate;
8159
8529
  }
8160
8530
  }
@@ -8183,7 +8553,11 @@ var resultsServeCommand = command({
8183
8553
  }),
8184
8554
  multi: flag({
8185
8555
  long: "multi",
8186
- description: "Launch in multi-project dashboard mode"
8556
+ description: "Launch in multi-project dashboard mode (deprecated; use auto-detect or --single)"
8557
+ }),
8558
+ single: flag({
8559
+ long: "single",
8560
+ description: "Force single-project dashboard mode"
8187
8561
  }),
8188
8562
  add: option({
8189
8563
  type: optional(string),
@@ -8199,9 +8573,13 @@ var resultsServeCommand = command({
8199
8573
  type: optional(string),
8200
8574
  long: "discover",
8201
8575
  description: "Scan a directory tree for repos with .agentv/"
8576
+ }),
8577
+ readOnly: flag({
8578
+ long: "read-only",
8579
+ description: "Disable write operations and launch Studio in read-only leaderboard mode"
8202
8580
  })
8203
8581
  },
8204
- handler: async ({ source, port, dir, multi, add, remove, discover }) => {
8582
+ handler: async ({ source, port, dir, multi, single, add, remove, discover, readOnly }) => {
8205
8583
  const cwd = dir ?? process.cwd();
8206
8584
  const listenPort = port ?? (process.env.PORT ? Number(process.env.PORT) : 3117);
8207
8585
  if (add) {
@@ -8239,7 +8617,10 @@ Discovered ${discovered.length} project(s).`);
8239
8617
  return;
8240
8618
  }
8241
8619
  const registry = loadProjectRegistry();
8242
- const isMultiProject = multi || registry.projects.length > 0;
8620
+ const { isMultiProject, showMultiWarning } = resolveDashboardMode(registry.projects.length, {
8621
+ multi,
8622
+ single
8623
+ });
8243
8624
  try {
8244
8625
  let results = [];
8245
8626
  let sourceFile;
@@ -8249,7 +8630,7 @@ Discovered ${discovered.length} project(s).`);
8249
8630
  } else {
8250
8631
  const cache = await loadRunCache(cwd);
8251
8632
  const cachedFile = cache ? resolveRunCacheFile(cache) : "";
8252
- if (cachedFile && existsSync9(cachedFile)) {
8633
+ if (cachedFile && existsSync12(cachedFile)) {
8253
8634
  sourceFile = cachedFile;
8254
8635
  results = loadManifestResults(cachedFile);
8255
8636
  } else {
@@ -8260,8 +8641,16 @@ Discovered ${discovered.length} project(s).`);
8260
8641
  }
8261
8642
  }
8262
8643
  }
8263
- const resultDir = sourceFile ? path14.dirname(path14.resolve(sourceFile)) : cwd;
8264
- const app2 = createApp(results, resultDir, cwd, sourceFile);
8644
+ const resultDir = sourceFile ? path16.dirname(path16.resolve(sourceFile)) : cwd;
8645
+ const app2 = createApp(results, resultDir, cwd, sourceFile, {
8646
+ readOnly,
8647
+ multiProjectDashboard: isMultiProject
8648
+ });
8649
+ if (showMultiWarning) {
8650
+ console.warn(
8651
+ "Warning: --multi is deprecated. Studio now auto-detects multi-project mode when multiple projects are registered. Use --single to force the single-project view."
8652
+ );
8653
+ }
8265
8654
  if (isMultiProject) {
8266
8655
  console.log(`Multi-project mode: ${registry.projects.length} project(s) registered`);
8267
8656
  } else if (results.length > 0 && sourceFile) {
@@ -8377,7 +8766,7 @@ var selfCommand = subcommands({
8377
8766
 
8378
8767
  // src/commands/transpile/index.ts
8379
8768
  import { writeFileSync as writeFileSync5 } from "node:fs";
8380
- import path15 from "node:path";
8769
+ import path17 from "node:path";
8381
8770
  var transpileCommand = command({
8382
8771
  name: "transpile",
8383
8772
  description: "Convert an EVAL.yaml file to Agent Skills evals.json format",
@@ -8401,7 +8790,7 @@ var transpileCommand = command({
8401
8790
  handler: async ({ input, outDir, stdout }) => {
8402
8791
  let result;
8403
8792
  try {
8404
- result = transpileEvalYamlFile(path15.resolve(input));
8793
+ result = transpileEvalYamlFile(path17.resolve(input));
8405
8794
  } catch (error) {
8406
8795
  console.error(`Error: ${error.message}`);
8407
8796
  process.exit(1);
@@ -8425,11 +8814,11 @@ var transpileCommand = command({
8425
8814
  process.stdout.write("\n");
8426
8815
  return;
8427
8816
  }
8428
- const outputDir = outDir ? path15.resolve(outDir) : path15.dirname(path15.resolve(input));
8817
+ const outputDir = outDir ? path17.resolve(outDir) : path17.dirname(path17.resolve(input));
8429
8818
  const fileNames = getOutputFilenames(result);
8430
8819
  for (const [skill, evalsJson] of result.files) {
8431
8820
  const fileName = fileNames.get(skill) ?? "evals.json";
8432
- const outputPath = path15.join(outputDir, fileName);
8821
+ const outputPath = path17.join(outputDir, fileName);
8433
8822
  writeFileSync5(outputPath, `${JSON.stringify(evalsJson, null, 2)}
8434
8823
  `);
8435
8824
  console.log(`Transpiled to ${outputPath}`);
@@ -8438,8 +8827,8 @@ var transpileCommand = command({
8438
8827
  });
8439
8828
 
8440
8829
  // src/commands/trend/index.ts
8441
- import path16 from "node:path";
8442
- var colors3 = {
8830
+ import path18 from "node:path";
8831
+ var colors2 = {
8443
8832
  reset: "\x1B[0m",
8444
8833
  bold: "\x1B[1m",
8445
8834
  dim: "\x1B[2m",
@@ -8449,18 +8838,18 @@ var colors3 = {
8449
8838
  cyan: "\x1B[36m",
8450
8839
  gray: "\x1B[90m"
8451
8840
  };
8452
- var noColor3 = process.env.NO_COLOR !== void 0 || !process.stdout.isTTY;
8453
- var c3 = noColor3 ? Object.fromEntries(Object.keys(colors3).map((k) => [k, ""])) : colors3;
8454
- var ansiPattern3 = new RegExp(`${String.fromCharCode(27)}\\[[0-9;]*m`, "g");
8455
- function stripAnsi4(str) {
8456
- return str.replace(ansiPattern3, "");
8457
- }
8458
- function padRight3(str, len) {
8459
- const plainLen = stripAnsi4(str).length;
8841
+ var noColor2 = process.env.NO_COLOR !== void 0 || !process.stdout.isTTY;
8842
+ var c3 = noColor2 ? Object.fromEntries(Object.keys(colors2).map((k) => [k, ""])) : colors2;
8843
+ var ansiPattern2 = new RegExp(`${String.fromCharCode(27)}\\[[0-9;]*m`, "g");
8844
+ function stripAnsi3(str) {
8845
+ return str.replace(ansiPattern2, "");
8846
+ }
8847
+ function padRight4(str, len) {
8848
+ const plainLen = stripAnsi3(str).length;
8460
8849
  return str + " ".repeat(Math.max(0, len - plainLen));
8461
8850
  }
8462
8851
  function padLeft3(str, len) {
8463
- const plainLen = stripAnsi4(str).length;
8852
+ const plainLen = stripAnsi3(str).length;
8464
8853
  return " ".repeat(Math.max(0, len - plainLen)) + str;
8465
8854
  }
8466
8855
  function formatSignedNumber(value, digits = 3) {
@@ -8488,7 +8877,7 @@ function colorizeSlope(value) {
8488
8877
  }
8489
8878
  function ensureTrendIndexPath(source, cwd) {
8490
8879
  const resolved = resolveResultSourcePath(source, cwd);
8491
- if (path16.basename(resolved) !== RESULT_INDEX_FILENAME) {
8880
+ if (path18.basename(resolved) !== RESULT_INDEX_FILENAME) {
8492
8881
  throw new Error(
8493
8882
  `Unsupported result source for trend: ${source}. Use a run workspace directory or ${RESULT_INDEX_FILENAME} manifest.`
8494
8883
  );
@@ -8508,7 +8897,7 @@ function resolveTrendSources(cwd, sources, last) {
8508
8897
  if (last < 2) {
8509
8898
  throw new Error("--last must be at least 2");
8510
8899
  }
8511
- const metas = listResultFiles(cwd).filter((meta) => path16.basename(meta.path) === RESULT_INDEX_FILENAME).slice(0, last);
8900
+ const metas = listResultFiles(cwd).filter((meta) => path18.basename(meta.path) === RESULT_INDEX_FILENAME).slice(0, last);
8512
8901
  if (metas.length < 2) {
8513
8902
  throw new Error(
8514
8903
  "Trend analysis requires at least 2 canonical run workspaces in .agentv/results/runs/"
@@ -8523,10 +8912,10 @@ function getRunLabel(sourcePath, timestamp) {
8523
8912
  if (timestamp) {
8524
8913
  return timestamp;
8525
8914
  }
8526
- return path16.basename(path16.dirname(sourcePath));
8915
+ return path18.basename(path18.dirname(sourcePath));
8527
8916
  }
8528
8917
  function getRunSortKey(sourcePath, timestamp) {
8529
- return timestamp ?? path16.basename(path16.dirname(sourcePath));
8918
+ return timestamp ?? path18.basename(path18.dirname(sourcePath));
8530
8919
  }
8531
8920
  function mean2(values) {
8532
8921
  return values.reduce((sum, value) => sum + value, 0) / values.length;
@@ -8673,14 +9062,14 @@ function formatTrendTable(output) {
8673
9062
  `${c3.bold}Matched Tests:${c3.reset} ${output.summary.matchedTestCount} | ${c3.bold}Verdict:${c3.reset} ${colorizeDirection(output.summary.direction)}`
8674
9063
  );
8675
9064
  lines.push("");
8676
- const header = ` ${padRight3("Run", runLabelWidth)} ${padLeft3("Tests", matchWidth)} ${padLeft3("Mean Score", scoreWidth)}`;
9065
+ const header = ` ${padRight4("Run", runLabelWidth)} ${padLeft3("Tests", matchWidth)} ${padLeft3("Mean Score", scoreWidth)}`;
8677
9066
  lines.push(`${c3.dim}${header}${c3.reset}`);
8678
9067
  lines.push(
8679
9068
  `${c3.dim} ${"\u2500".repeat(runLabelWidth)} ${"\u2500".repeat(matchWidth)} ${"\u2500".repeat(scoreWidth)}${c3.reset}`
8680
9069
  );
8681
9070
  for (const run2 of output.runs) {
8682
9071
  lines.push(
8683
- ` ${padRight3(run2.label, runLabelWidth)} ${padLeft3(String(run2.matchedTestCount), matchWidth)} ${padLeft3(run2.meanScore.toFixed(3), scoreWidth)}`
9072
+ ` ${padRight4(run2.label, runLabelWidth)} ${padLeft3(String(run2.matchedTestCount), matchWidth)} ${padLeft3(run2.meanScore.toFixed(3), scoreWidth)}`
8684
9073
  );
8685
9074
  }
8686
9075
  lines.push("");
@@ -8781,7 +9170,7 @@ var trendCommand = command({
8781
9170
  });
8782
9171
 
8783
9172
  // src/commands/trim/index.ts
8784
- import { readFileSync as readFileSync10, writeFileSync as writeFileSync6 } from "node:fs";
9173
+ import { readFileSync as readFileSync11, writeFileSync as writeFileSync6 } from "node:fs";
8785
9174
  var trimCommand = command({
8786
9175
  name: "trim",
8787
9176
  description: "Trim evaluation results for baseline storage (strips debug/audit fields)",
@@ -8800,7 +9189,7 @@ var trimCommand = command({
8800
9189
  },
8801
9190
  handler: async ({ input, out }) => {
8802
9191
  try {
8803
- const content = readFileSync10(input, "utf8");
9192
+ const content = readFileSync11(input, "utf8");
8804
9193
  const lines = content.trim().split("\n").filter((line) => line.trim());
8805
9194
  const trimmedLines = lines.map((line) => {
8806
9195
  const record = JSON.parse(line);
@@ -8907,7 +9296,7 @@ function isTTY() {
8907
9296
  // src/commands/validate/validate-files.ts
8908
9297
  import { constants } from "node:fs";
8909
9298
  import { access, readdir as readdir4, stat } from "node:fs/promises";
8910
- import path17 from "node:path";
9299
+ import path19 from "node:path";
8911
9300
  import fg2 from "fast-glob";
8912
9301
  async function validateFiles(paths) {
8913
9302
  const filePaths = await expandPaths(paths);
@@ -8926,7 +9315,7 @@ async function validateFiles(paths) {
8926
9315
  };
8927
9316
  }
8928
9317
  async function validateSingleFile(filePath) {
8929
- const absolutePath = path17.resolve(filePath);
9318
+ const absolutePath = path19.resolve(filePath);
8930
9319
  const fileType = await detectFileType(absolutePath);
8931
9320
  let result;
8932
9321
  if (fileType === "eval") {
@@ -8951,7 +9340,7 @@ async function validateSingleFile(filePath) {
8951
9340
  async function expandPaths(paths) {
8952
9341
  const expanded = /* @__PURE__ */ new Set();
8953
9342
  for (const inputPath of paths) {
8954
- const absolutePath = path17.resolve(inputPath);
9343
+ const absolutePath = path19.resolve(inputPath);
8955
9344
  try {
8956
9345
  await access(absolutePath, constants.F_OK);
8957
9346
  const stats = await stat(absolutePath);
@@ -8979,7 +9368,7 @@ async function expandPaths(paths) {
8979
9368
  if (yamlMatches.length === 0) {
8980
9369
  console.warn(`Warning: No YAML files matched pattern: ${inputPath}`);
8981
9370
  }
8982
- for (const f of yamlMatches) expanded.add(path17.normalize(f));
9371
+ for (const f of yamlMatches) expanded.add(path19.normalize(f));
8983
9372
  }
8984
9373
  const sorted = Array.from(expanded);
8985
9374
  sorted.sort();
@@ -8990,7 +9379,7 @@ async function findYamlFiles(dirPath) {
8990
9379
  try {
8991
9380
  const entries2 = await readdir4(dirPath, { withFileTypes: true });
8992
9381
  for (const entry of entries2) {
8993
- const fullPath = path17.join(dirPath, entry.name);
9382
+ const fullPath = path19.join(dirPath, entry.name);
8994
9383
  if (entry.isDirectory()) {
8995
9384
  if (entry.name === "node_modules" || entry.name.startsWith(".")) {
8996
9385
  continue;
@@ -9007,7 +9396,7 @@ async function findYamlFiles(dirPath) {
9007
9396
  return results;
9008
9397
  }
9009
9398
  function isYamlFile(filePath) {
9010
- const ext = path17.extname(filePath).toLowerCase();
9399
+ const ext = path19.extname(filePath).toLowerCase();
9011
9400
  return ext === ".yaml" || ext === ".yml";
9012
9401
  }
9013
9402
 
@@ -9062,9 +9451,9 @@ var validateCommand = command({
9062
9451
  });
9063
9452
 
9064
9453
  // src/commands/workspace/clean.ts
9065
- import { existsSync as existsSync10 } from "node:fs";
9454
+ import { existsSync as existsSync13 } from "node:fs";
9066
9455
  import { readFile as readFile6, readdir as readdir5, rm } from "node:fs/promises";
9067
- import path18 from "node:path";
9456
+ import path20 from "node:path";
9068
9457
  async function confirm(message) {
9069
9458
  const readline2 = await import("node:readline");
9070
9459
  const rl = readline2.createInterface({ input: process.stdin, output: process.stdout });
@@ -9091,7 +9480,7 @@ var cleanCommand = command({
9091
9480
  },
9092
9481
  handler: async ({ repo, force }) => {
9093
9482
  const poolRoot = getWorkspacePoolRoot();
9094
- if (!existsSync10(poolRoot)) {
9483
+ if (!existsSync13(poolRoot)) {
9095
9484
  console.log("No workspace pool entries found.");
9096
9485
  return;
9097
9486
  }
@@ -9100,8 +9489,8 @@ var cleanCommand = command({
9100
9489
  const poolDirs = entries2.filter((e) => e.isDirectory());
9101
9490
  const matchingDirs = [];
9102
9491
  for (const dir of poolDirs) {
9103
- const poolDir = path18.join(poolRoot, dir.name);
9104
- const metadataPath = path18.join(poolDir, "metadata.json");
9492
+ const poolDir = path20.join(poolRoot, dir.name);
9493
+ const metadataPath = path20.join(poolDir, "metadata.json");
9105
9494
  try {
9106
9495
  const raw = await readFile6(metadataPath, "utf-8");
9107
9496
  const metadata = JSON.parse(raw);
@@ -9132,7 +9521,7 @@ var cleanCommand = command({
9132
9521
  }
9133
9522
  for (const dir of matchingDirs) {
9134
9523
  await rm(dir, { recursive: true, force: true });
9135
- console.log(`Removed: ${path18.basename(dir).slice(0, 12)}...`);
9524
+ console.log(`Removed: ${path20.basename(dir).slice(0, 12)}...`);
9136
9525
  }
9137
9526
  console.log("Done.");
9138
9527
  } else {
@@ -9150,7 +9539,7 @@ var cleanCommand = command({
9150
9539
  });
9151
9540
 
9152
9541
  // src/commands/workspace/deps.ts
9153
- import path19 from "node:path";
9542
+ import path21 from "node:path";
9154
9543
  var depsCommand = command({
9155
9544
  name: "deps",
9156
9545
  description: "Scan eval files and list git repo dependencies needed by workspaces",
@@ -9174,7 +9563,7 @@ var depsCommand = command({
9174
9563
  const resolvedPaths = await resolveEvalPaths(evalPaths, cwd);
9175
9564
  const result = await scanRepoDeps(resolvedPaths);
9176
9565
  for (const err2 of result.errors) {
9177
- console.error(`warning: ${path19.relative(cwd, err2.file)}: ${err2.message}`);
9566
+ console.error(`warning: ${path21.relative(cwd, err2.file)}: ${err2.message}`);
9178
9567
  }
9179
9568
  const output = {
9180
9569
  repos: result.repos.map((r) => ({
@@ -9182,7 +9571,7 @@ var depsCommand = command({
9182
9571
  ...r.ref !== void 0 && { ref: r.ref },
9183
9572
  ...r.clone !== void 0 && { clone: r.clone },
9184
9573
  ...r.checkout !== void 0 && { checkout: r.checkout },
9185
- ...usedBy && { used_by: r.usedBy.map((p) => path19.relative(cwd, p)) }
9574
+ ...usedBy && { used_by: r.usedBy.map((p) => path21.relative(cwd, p)) }
9186
9575
  }))
9187
9576
  };
9188
9577
  console.log(JSON.stringify(output, null, 2));
@@ -9190,15 +9579,15 @@ var depsCommand = command({
9190
9579
  });
9191
9580
 
9192
9581
  // src/commands/workspace/list.ts
9193
- import { existsSync as existsSync11 } from "node:fs";
9582
+ import { existsSync as existsSync14 } from "node:fs";
9194
9583
  import { readFile as readFile7, readdir as readdir6, stat as stat2 } from "node:fs/promises";
9195
- import path20 from "node:path";
9584
+ import path22 from "node:path";
9196
9585
  async function getDirectorySize(dirPath) {
9197
9586
  let totalSize = 0;
9198
9587
  try {
9199
9588
  const entries2 = await readdir6(dirPath, { withFileTypes: true });
9200
9589
  for (const entry of entries2) {
9201
- const fullPath = path20.join(dirPath, entry.name);
9590
+ const fullPath = path22.join(dirPath, entry.name);
9202
9591
  if (entry.isDirectory()) {
9203
9592
  totalSize += await getDirectorySize(fullPath);
9204
9593
  } else {
@@ -9222,7 +9611,7 @@ var listCommand = command({
9222
9611
  args: {},
9223
9612
  handler: async () => {
9224
9613
  const poolRoot = getWorkspacePoolRoot();
9225
- if (!existsSync11(poolRoot)) {
9614
+ if (!existsSync14(poolRoot)) {
9226
9615
  console.log("No workspace pool entries found.");
9227
9616
  return;
9228
9617
  }
@@ -9233,11 +9622,11 @@ var listCommand = command({
9233
9622
  return;
9234
9623
  }
9235
9624
  for (const dir of poolDirs) {
9236
- const poolDir = path20.join(poolRoot, dir.name);
9625
+ const poolDir = path22.join(poolRoot, dir.name);
9237
9626
  const fingerprint = dir.name;
9238
9627
  const poolEntries = await readdir6(poolDir, { withFileTypes: true });
9239
9628
  const slots = poolEntries.filter((e) => e.isDirectory() && e.name.startsWith("slot-"));
9240
- const metadataPath = path20.join(poolDir, "metadata.json");
9629
+ const metadataPath = path22.join(poolDir, "metadata.json");
9241
9630
  let metadata = null;
9242
9631
  try {
9243
9632
  const raw = await readFile7(metadataPath, "utf-8");
@@ -9284,8 +9673,8 @@ var CHECK_INTERVAL_MS = 24 * 60 * 60 * 1e3;
9284
9673
  var AGENTV_DIR = getAgentvHome();
9285
9674
  var CACHE_FILE = "version-check.json";
9286
9675
  var NPM_REGISTRY_URL = "https://registry.npmjs.org/agentv/latest";
9287
- async function getCachedUpdateInfo(path21) {
9288
- const filePath = path21 ?? join5(AGENTV_DIR, CACHE_FILE);
9676
+ async function getCachedUpdateInfo(path23) {
9677
+ const filePath = path23 ?? join5(AGENTV_DIR, CACHE_FILE);
9289
9678
  try {
9290
9679
  const raw = await readFile8(filePath, "utf-8");
9291
9680
  const data = JSON.parse(raw);
@@ -9446,4 +9835,4 @@ export {
9446
9835
  preprocessArgv,
9447
9836
  runCli
9448
9837
  };
9449
- //# sourceMappingURL=chunk-VA64NETD.js.map
9838
+ //# sourceMappingURL=chunk-YLVQNF23.js.map