agentv 4.20.0 → 4.21.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (27)
  1. package/dist/{artifact-writer-RFXWXUOV.js → artifact-writer-E775664W.js} +4 -4
  2. package/dist/{chunk-36HXBYUY.js → chunk-27VT2KU2.js} +32 -27
  3. package/dist/chunk-27VT2KU2.js.map +1 -0
  4. package/dist/{chunk-LP4Y5D2Z.js → chunk-ERSBQAGK.js} +95 -27
  5. package/dist/chunk-ERSBQAGK.js.map +1 -0
  6. package/dist/{chunk-ZNS74WKH.js → chunk-FZUNMXBT.js} +3 -3
  7. package/dist/{chunk-PHGEGHKR.js → chunk-GPGX24OI.js} +186 -30
  8. package/dist/chunk-GPGX24OI.js.map +1 -0
  9. package/dist/{chunk-KJZ7PZCE.js → chunk-LPSUVXHV.js} +78 -132
  10. package/dist/{chunk-KJZ7PZCE.js.map → chunk-LPSUVXHV.js.map} +1 -1
  11. package/dist/cli.js +5 -5
  12. package/dist/{dist-GURCO6IS.js → dist-RSAA3T6F.js} +3 -3
  13. package/dist/index.js +5 -5
  14. package/dist/{interactive-GLRASSKM.js → interactive-RLMRNXXD.js} +5 -5
  15. package/dist/studio/assets/{index-KfPHd-QM.js → index-BVSHI8Eo.js} +1 -1
  16. package/dist/studio/assets/{index-BTsTcivx.js → index-SZVrc1UE.js} +20 -20
  17. package/dist/studio/index.html +1 -1
  18. package/dist/{ts-eval-loader-32COE32J-TCT4RIRT.js → ts-eval-loader-HPIPE72C-GDYGJVIA.js} +2 -2
  19. package/package.json +1 -1
  20. package/dist/chunk-36HXBYUY.js.map +0 -1
  21. package/dist/chunk-LP4Y5D2Z.js.map +0 -1
  22. package/dist/chunk-PHGEGHKR.js.map +0 -1
  23. /package/dist/{artifact-writer-RFXWXUOV.js.map → artifact-writer-E775664W.js.map} +0 -0
  24. /package/dist/{chunk-ZNS74WKH.js.map → chunk-FZUNMXBT.js.map} +0 -0
  25. /package/dist/{dist-GURCO6IS.js.map → dist-RSAA3T6F.js.map} +0 -0
  26. /package/dist/{interactive-GLRASSKM.js.map → interactive-RLMRNXXD.js.map} +0 -0
  27. /package/dist/{ts-eval-loader-32COE32J-TCT4RIRT.js.map → ts-eval-loader-HPIPE72C-GDYGJVIA.js.map} +0 -0
@@ -4,7 +4,11 @@ import {
4
4
  TARGET_FILE_CANDIDATES,
5
5
  c,
6
6
  detectFileType,
7
+ detectInstallScope,
8
+ detectPackageManager,
7
9
  discoverEvalFiles,
10
+ enforceRequiredVersion,
11
+ fetchLatestVersion,
8
12
  findRepoRoot,
9
13
  findRunById,
10
14
  formatCost,
@@ -26,6 +30,7 @@ import {
26
30
  padLeft,
27
31
  padRight,
28
32
  parseResultManifest,
33
+ performSelfUpdate,
29
34
  resolveEvalPaths,
30
35
  resolveResultSourcePath,
31
36
  resolveRunCacheFile,
@@ -39,7 +44,7 @@ import {
39
44
  validateFileReferences,
40
45
  validateTargetsFile,
41
46
  validateWorkspacePaths
42
- } from "./chunk-PHGEGHKR.js";
47
+ } from "./chunk-GPGX24OI.js";
43
48
  import {
44
49
  RESULT_INDEX_FILENAME,
45
50
  aggregateRunDir,
@@ -47,12 +52,11 @@ import {
47
52
  resolveRunManifestPath,
48
53
  toSnakeCaseDeep,
49
54
  writeArtifactsFromResults
50
- } from "./chunk-ZNS74WKH.js";
55
+ } from "./chunk-FZUNMXBT.js";
51
56
  import {
52
57
  DEFAULT_CATEGORY,
53
58
  addBenchmark,
54
59
  deriveCategory,
55
- discoverBenchmarks,
56
60
  discoverClaudeSessions,
57
61
  discoverCodexSessions,
58
62
  getBenchmark,
@@ -66,7 +70,7 @@ import {
66
70
  toTranscriptJsonLines,
67
71
  transpileEvalYamlFile,
68
72
  trimBaselineResult
69
- } from "./chunk-36HXBYUY.js";
73
+ } from "./chunk-27VT2KU2.js";
70
74
  import {
71
75
  DEFAULT_THRESHOLD,
72
76
  createBuiltinRegistry,
@@ -76,6 +80,7 @@ import {
76
80
  getWorkspacePoolRoot,
77
81
  isAgentSkillsFormat,
78
82
  listTargetNames,
83
+ loadConfig,
79
84
  loadTestSuite,
80
85
  normalizeLineEndings,
81
86
  parseAgentSkillsEvals,
@@ -94,7 +99,7 @@ import {
94
99
  runStartsWithAssertion,
95
100
  toCamelCaseDeep,
96
101
  toSnakeCaseDeep as toSnakeCaseDeep2
97
- } from "./chunk-LP4Y5D2Z.js";
102
+ } from "./chunk-ERSBQAGK.js";
98
103
  import {
99
104
  __commonJS,
100
105
  __require,
@@ -3986,7 +3991,7 @@ var evalRunCommand = command({
3986
3991
  },
3987
3992
  handler: async (args) => {
3988
3993
  if (args.evalPaths.length === 0 && process.stdin.isTTY) {
3989
- const { launchInteractiveWizard } = await import("./interactive-GLRASSKM.js");
3994
+ const { launchInteractiveWizard } = await import("./interactive-RLMRNXXD.js");
3990
3995
  await launchInteractiveWizard();
3991
3996
  return;
3992
3997
  }
@@ -9732,14 +9737,14 @@ Serving most recent: ${metas[0].path}
9732
9737
  }
9733
9738
  return metas[0].path;
9734
9739
  }
9735
- function resolveDashboardMode(projectCount, options) {
9740
+ function resolveDashboardMode(benchmarkCount, options) {
9736
9741
  if (options.single === true) {
9737
- return { isMultiProject: false, showMultiWarning: options.multi === true };
9742
+ return { isMultiBenchmark: false, showMultiWarning: options.multi === true };
9738
9743
  }
9739
9744
  if (options.multi === true) {
9740
- return { isMultiProject: true, showMultiWarning: true };
9745
+ return { isMultiBenchmark: true, showMultiWarning: true };
9741
9746
  }
9742
- return { isMultiProject: projectCount > 1, showMultiWarning: false };
9747
+ return { isMultiBenchmark: benchmarkCount > 1, showMultiWarning: false };
9743
9748
  }
9744
9749
  function feedbackPath(resultDir) {
9745
9750
  return path19.join(resultDir, "feedback.json");
@@ -10241,8 +10246,8 @@ function handleConfig(c4, { agentvDir, searchDir }, options) {
10241
10246
  return c4.json({
10242
10247
  ...loadStudioConfig(agentvDir),
10243
10248
  read_only: options?.readOnly === true,
10244
- project_name: path19.basename(searchDir),
10245
- multi_project_dashboard: options?.multiProjectDashboard === true
10249
+ benchmark_name: path19.basename(searchDir),
10250
+ multi_benchmark_dashboard: options?.multiBenchmarkDashboard === true
10246
10251
  });
10247
10252
  }
10248
10253
  function handleFeedbackRead(c4, { searchDir }) {
@@ -10302,7 +10307,7 @@ function createApp(results, resultDir, cwd, sourceFile, options) {
10302
10307
  function withBenchmark(c4, handler) {
10303
10308
  const benchmark = getBenchmark(c4.req.param("benchmarkId") ?? "");
10304
10309
  if (!benchmark || !existsSync14(benchmark.path)) {
10305
- return c4.json({ error: "Project not found" }, 404);
10310
+ return c4.json({ error: "Benchmark not found" }, 404);
10306
10311
  }
10307
10312
  return handler(c4, {
10308
10313
  searchDir: benchmark.path,
@@ -10360,7 +10365,7 @@ function createApp(results, resultDir, cwd, sourceFile, options) {
10360
10365
  };
10361
10366
  })
10362
10367
  );
10363
- return c4.json({ projects: benchmarks });
10368
+ return c4.json({ benchmarks });
10364
10369
  });
10365
10370
  app2.post("/api/benchmarks", async (c4) => {
10366
10371
  if (readOnly) {
@@ -10375,17 +10380,9 @@ function createApp(results, resultDir, cwd, sourceFile, options) {
10375
10380
  return c4.json({ error: err2.message }, 400);
10376
10381
  }
10377
10382
  });
10378
- app2.delete("/api/benchmarks/:benchmarkId", (c4) => {
10379
- if (readOnly) {
10380
- return c4.json({ error: "Studio is running in read-only mode" }, 403);
10381
- }
10382
- const removed = removeBenchmark(c4.req.param("benchmarkId") ?? "");
10383
- if (!removed) return c4.json({ error: "Project not found" }, 404);
10384
- return c4.json({ ok: true });
10385
- });
10386
10383
  app2.get("/api/benchmarks/:benchmarkId/summary", async (c4) => {
10387
10384
  const benchmark = getBenchmark(c4.req.param("benchmarkId") ?? "");
10388
- if (!benchmark) return c4.json({ error: "Project not found" }, 404);
10385
+ if (!benchmark) return c4.json({ error: "Benchmark not found" }, 404);
10389
10386
  try {
10390
10387
  const { runs: metas } = await listMergedResultFiles(benchmark.path);
10391
10388
  const runCount = metas.length;
@@ -10400,21 +10397,7 @@ function createApp(results, resultDir, cwd, sourceFile, options) {
10400
10397
  last_run: lastRun
10401
10398
  });
10402
10399
  } catch {
10403
- return c4.json({ error: "Failed to read project" }, 500);
10404
- }
10405
- });
10406
- app2.post("/api/benchmarks/discover", async (c4) => {
10407
- if (readOnly) {
10408
- return c4.json({ error: "Studio is running in read-only mode" }, 403);
10409
- }
10410
- try {
10411
- const body = await c4.req.json();
10412
- if (!body.path) return c4.json({ error: "Missing path" }, 400);
10413
- const discovered = discoverBenchmarks(body.path);
10414
- const registered = discovered.map((p) => benchmarkEntryToWire(addBenchmark(p)));
10415
- return c4.json({ discovered: registered });
10416
- } catch (err2) {
10417
- return c4.json({ error: err2.message }, 400);
10400
+ return c4.json({ error: "Failed to read benchmark" }, 500);
10418
10401
  }
10419
10402
  });
10420
10403
  app2.get("/api/benchmarks/all-runs", async (c4) => {
@@ -10446,8 +10429,8 @@ function createApp(results, resultDir, cwd, sourceFile, options) {
10446
10429
  source: m.source,
10447
10430
  ...target && { target },
10448
10431
  ...experiment && { experiment },
10449
- project_id: p.id,
10450
- project_name: p.name
10432
+ benchmark_id: p.id,
10433
+ benchmark_name: p.name
10451
10434
  });
10452
10435
  }
10453
10436
  } catch {
@@ -10456,11 +10439,19 @@ function createApp(results, resultDir, cwd, sourceFile, options) {
10456
10439
  allRuns.sort((a, b) => b.timestamp.localeCompare(a.timestamp));
10457
10440
  return c4.json({ runs: allRuns });
10458
10441
  });
10442
+ app2.delete("/api/benchmarks/:benchmarkId", (c4) => {
10443
+ if (readOnly) {
10444
+ return c4.json({ error: "Studio is running in read-only mode" }, 403);
10445
+ }
10446
+ const removed = removeBenchmark(c4.req.param("benchmarkId") ?? "");
10447
+ if (!removed) return c4.json({ error: "Benchmark not found" }, 404);
10448
+ return c4.json({ ok: true });
10449
+ });
10459
10450
  app2.get(
10460
10451
  "/api/config",
10461
10452
  (c4) => handleConfig(c4, defaultCtx, {
10462
10453
  readOnly,
10463
- multiProjectDashboard: options?.multiProjectDashboard
10454
+ multiBenchmarkDashboard: options?.multiBenchmarkDashboard
10464
10455
  })
10465
10456
  );
10466
10457
  app2.get("/api/remote/status", async (c4) => c4.json(await getRemoteResultsStatus(searchDir)));
@@ -10563,7 +10554,7 @@ function createApp(results, resultDir, cwd, sourceFile, options) {
10563
10554
  c4,
10564
10555
  (ctx, dataCtx) => handleConfig(ctx, dataCtx, {
10565
10556
  readOnly,
10566
- multiProjectDashboard: options?.multiProjectDashboard
10557
+ multiBenchmarkDashboard: options?.multiBenchmarkDashboard
10567
10558
  })
10568
10559
  )
10569
10560
  );
@@ -10715,39 +10706,34 @@ var resultsServeCommand = command({
10715
10706
  }),
10716
10707
  multi: flag({
10717
10708
  long: "multi",
10718
- description: "Launch in multi-project dashboard mode (deprecated; use auto-detect or --single)"
10709
+ description: "Launch in multi-benchmark dashboard mode (deprecated; use auto-detect or --single)"
10719
10710
  }),
10720
10711
  single: flag({
10721
10712
  long: "single",
10722
- description: "Force single-project dashboard mode"
10713
+ description: "Force single-benchmark dashboard mode"
10723
10714
  }),
10724
10715
  add: option({
10725
10716
  type: optional(string),
10726
10717
  long: "add",
10727
- description: "Register a project by path"
10718
+ description: "Register a benchmark by path"
10728
10719
  }),
10729
10720
  remove: option({
10730
10721
  type: optional(string),
10731
10722
  long: "remove",
10732
- description: "Unregister a project by ID"
10733
- }),
10734
- discover: option({
10735
- type: optional(string),
10736
- long: "discover",
10737
- description: "Scan a directory tree for repos with .agentv/"
10723
+ description: "Unregister a benchmark by ID"
10738
10724
  }),
10739
10725
  readOnly: flag({
10740
10726
  long: "read-only",
10741
10727
  description: "Disable write operations and launch Studio in read-only leaderboard mode"
10742
10728
  })
10743
10729
  },
10744
- handler: async ({ source, port, dir, multi, single, add, remove, discover, readOnly }) => {
10730
+ handler: async ({ source, port, dir, multi, single, add, remove, readOnly }) => {
10745
10731
  const cwd = dir ?? process.cwd();
10746
10732
  const listenPort = port ?? (process.env.PORT ? Number(process.env.PORT) : 3117);
10747
10733
  if (add) {
10748
10734
  try {
10749
10735
  const entry = addBenchmark(add);
10750
- console.log(`Registered project: ${entry.name} (${entry.id}) at ${entry.path}`);
10736
+ console.log(`Registered benchmark: ${entry.name} (${entry.id}) at ${entry.path}`);
10751
10737
  } catch (err2) {
10752
10738
  console.error(`Error: ${err2.message}`);
10753
10739
  process.exit(1);
@@ -10757,32 +10743,23 @@ var resultsServeCommand = command({
10757
10743
  if (remove) {
10758
10744
  const removed = removeBenchmark(remove);
10759
10745
  if (removed) {
10760
- console.log(`Unregistered project: ${remove}`);
10746
+ console.log(`Unregistered benchmark: ${remove}`);
10761
10747
  } else {
10762
- console.error(`Project not found: ${remove}`);
10748
+ console.error(`Benchmark not found: ${remove}`);
10763
10749
  process.exit(1);
10764
10750
  }
10765
10751
  return;
10766
10752
  }
10767
- if (discover) {
10768
- const discovered = discoverBenchmarks(discover);
10769
- if (discovered.length === 0) {
10770
- console.log(`No projects with .agentv/ found under ${discover}`);
10771
- return;
10772
- }
10773
- for (const p of discovered) {
10774
- const entry = addBenchmark(p);
10775
- console.log(`Registered: ${entry.name} (${entry.id}) at ${entry.path}`);
10776
- }
10777
- console.log(`
10778
- Discovered ${discovered.length} project(s).`);
10779
- return;
10753
+ const repoRoot = await findRepoRoot(cwd);
10754
+ const yamlConfig = await loadConfig(path19.join(cwd, "_"), repoRoot);
10755
+ if (yamlConfig?.required_version) {
10756
+ await enforceRequiredVersion(yamlConfig.required_version);
10780
10757
  }
10781
10758
  const registry = loadBenchmarkRegistry();
10782
- const { isMultiProject, showMultiWarning } = resolveDashboardMode(registry.benchmarks.length, {
10783
- multi,
10784
- single
10785
- });
10759
+ const { isMultiBenchmark, showMultiWarning } = resolveDashboardMode(
10760
+ registry.benchmarks.length,
10761
+ { multi, single }
10762
+ );
10786
10763
  try {
10787
10764
  let results = [];
10788
10765
  let sourceFile;
@@ -10806,15 +10783,15 @@ Discovered ${discovered.length} project(s).`);
10806
10783
  const resultDir = sourceFile ? path19.dirname(path19.resolve(sourceFile)) : cwd;
10807
10784
  const app2 = createApp(results, resultDir, cwd, sourceFile, {
10808
10785
  readOnly,
10809
- multiProjectDashboard: isMultiProject
10786
+ multiBenchmarkDashboard: isMultiBenchmark
10810
10787
  });
10811
10788
  if (showMultiWarning) {
10812
10789
  console.warn(
10813
- "Warning: --multi is deprecated. Studio now auto-detects multi-project mode when multiple projects are registered. Use --single to force the single-project view."
10790
+ "Warning: --multi is deprecated. Studio now auto-detects multi-benchmark mode when multiple benchmarks are registered. Use --single to force the single-benchmark view."
10814
10791
  );
10815
10792
  }
10816
- if (isMultiProject) {
10817
- console.log(`Multi-project mode: ${registry.benchmarks.length} project(s) registered`);
10793
+ if (isMultiBenchmark) {
10794
+ console.log(`Multi-benchmark mode: ${registry.benchmarks.length} benchmark(s) registered`);
10818
10795
  } else if (results.length > 0 && sourceFile) {
10819
10796
  console.log(`Serving ${results.length} result(s) from ${sourceFile}`);
10820
10797
  } else {
@@ -10839,28 +10816,6 @@ Discovered ${discovered.length} project(s).`);
10839
10816
  });
10840
10817
 
10841
10818
  // src/commands/self/index.ts
10842
- import { spawn as spawn2 } from "node:child_process";
10843
- function detectPackageManagerFromPath(scriptPath) {
10844
- if (scriptPath.includes(".bun")) {
10845
- return "bun";
10846
- }
10847
- return "npm";
10848
- }
10849
- function detectPackageManager() {
10850
- return detectPackageManagerFromPath(process.argv[1] ?? "");
10851
- }
10852
- function runCommand(cmd, args) {
10853
- return new Promise((resolve3, reject) => {
10854
- const child = spawn2(cmd, args, { stdio: ["inherit", "pipe", "inherit"], shell: true });
10855
- let stdout = "";
10856
- child.stdout?.on("data", (data) => {
10857
- process.stdout.write(data);
10858
- stdout += data.toString();
10859
- });
10860
- child.on("error", reject);
10861
- child.on("close", (code) => resolve3({ exitCode: code ?? 1, stdout }));
10862
- });
10863
- }
10864
10819
  var updateCommand = command({
10865
10820
  name: "update",
10866
10821
  description: "Update agentv to the latest version",
@@ -10883,38 +10838,29 @@ var updateCommand = command({
10883
10838
  }
10884
10839
  const currentVersion = package_default.version;
10885
10840
  console.log(`Current version: ${currentVersion}`);
10886
- console.log(`Updating agentv using ${pm}...
10841
+ console.log("Checking for updates...");
10842
+ const latestVersion = await fetchLatestVersion();
10843
+ if (latestVersion && latestVersion === currentVersion) {
10844
+ console.log(`Already up to date (${currentVersion}).`);
10845
+ return;
10846
+ }
10847
+ if (latestVersion) {
10848
+ console.log(`Update available: ${currentVersion} \u2192 ${latestVersion}`);
10849
+ }
10850
+ const scope = detectInstallScope();
10851
+ const scopeLabel = scope === "local" ? "local project install" : "global install";
10852
+ console.log(`Updating agentv using ${pm} (${scopeLabel})...
10887
10853
  `);
10888
- const args = pm === "npm" ? ["install", "-g", "agentv@latest"] : ["add", "-g", "agentv@latest"];
10889
- try {
10890
- const result = await runCommand(pm, args);
10891
- if (result.exitCode !== 0) {
10892
- console.error("\nUpdate failed.");
10893
- process.exit(1);
10894
- }
10895
- let newVersion;
10896
- try {
10897
- const versionResult = await runCommand("agentv", ["--version"]);
10898
- newVersion = versionResult.stdout.trim();
10899
- } catch {
10900
- }
10901
- if (newVersion) {
10902
- console.log(`
10903
- Update complete: ${currentVersion} \u2192 ${newVersion}`);
10904
- } else {
10905
- console.log("\nUpdate complete.");
10906
- }
10907
- } catch (error) {
10908
- if (error instanceof Error) {
10909
- if (error.message.includes("ENOENT") || error.message.includes("not found")) {
10910
- const alternative = pm === "npm" ? "bun" : "npm";
10911
- console.error(`Error: ${pm} not found. Try using --${alternative} flag.`);
10912
- } else {
10913
- console.error(`Error: ${error.message}`);
10914
- }
10915
- process.exit(1);
10916
- }
10917
- throw error;
10854
+ const result = await performSelfUpdate({ pm, currentVersion, scope });
10855
+ if (!result.success) {
10856
+ console.error("\nUpdate failed.");
10857
+ process.exit(1);
10858
+ }
10859
+ if (result.newVersion) {
10860
+ console.log(`
10861
+ Update complete: ${currentVersion} \u2192 ${result.newVersion}`);
10862
+ } else {
10863
+ console.log("\nUpdate complete.");
10918
10864
  }
10919
10865
  }
10920
10866
  });
@@ -11847,7 +11793,7 @@ var workspaceCommand = subcommands({
11847
11793
  });
11848
11794
 
11849
11795
  // src/update-check.ts
11850
- import { spawn as spawn3 } from "node:child_process";
11796
+ import { spawn as spawn2 } from "node:child_process";
11851
11797
  import { readFile as readFile8 } from "node:fs/promises";
11852
11798
  import { join as join5 } from "node:path";
11853
11799
  var CHECK_INTERVAL_MS = 24 * 60 * 60 * 1e3;
@@ -11912,7 +11858,7 @@ function backgroundUpdateCheck() {
11912
11858
  }).on('error', () => process.exit()).on('timeout', function() { this.destroy(); process.exit(); });
11913
11859
  `;
11914
11860
  try {
11915
- const child = spawn3(process.execPath, ["-e", script], {
11861
+ const child = spawn2(process.execPath, ["-e", script], {
11916
11862
  detached: true,
11917
11863
  stdio: "ignore",
11918
11864
  windowsHide: true
@@ -12016,4 +11962,4 @@ export {
12016
11962
  preprocessArgv,
12017
11963
  runCli
12018
11964
  };
12019
- //# sourceMappingURL=chunk-KJZ7PZCE.js.map
11965
+ //# sourceMappingURL=chunk-LPSUVXHV.js.map