agentv 4.28.0 → 4.29.0-next.1

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -45,7 +45,7 @@ import {
45
45
  validateFileReferences,
46
46
  validateTargetsFile,
47
47
  validateWorkspacePaths
48
- } from "./chunk-NL5H3TIC.js";
48
+ } from "./chunk-FEDIWLKK.js";
49
49
  import {
50
50
  RESULT_INDEX_FILENAME,
51
51
  aggregateRunDir,
@@ -53,27 +53,27 @@ import {
53
53
  resolveRunManifestPath,
54
54
  toSnakeCaseDeep as toSnakeCaseDeep2,
55
55
  writeArtifactsFromResults
56
- } from "./chunk-7T6AF75O.js";
56
+ } from "./chunk-OS67VZUO.js";
57
57
  import {
58
58
  DEFAULT_CATEGORY,
59
- addBenchmark,
59
+ addProject,
60
60
  deriveCategory,
61
61
  discoverClaudeSessions,
62
62
  discoverCodexSessions,
63
- getBenchmark,
64
63
  getOutputFilenames,
65
- loadBenchmarkRegistry,
64
+ getProject,
65
+ loadProjectRegistry,
66
66
  parseClaudeSession,
67
67
  parseCodexSession,
68
68
  readTranscriptFile,
69
- removeBenchmark,
69
+ removeProject,
70
70
  runBeforeSessionHook,
71
71
  scanRepoDeps,
72
- syncBenchmarks,
72
+ syncProjects,
73
73
  toTranscriptJsonLines,
74
74
  transpileEvalYamlFile,
75
75
  trimBaselineResult
76
- } from "./chunk-WZVOY2W2.js";
76
+ } from "./chunk-VZMGBDJD.js";
77
77
  import {
78
78
  DEFAULT_THRESHOLD,
79
79
  createBuiltinRegistry,
@@ -4053,7 +4053,7 @@ var evalRunCommand = command({
4053
4053
  },
4054
4054
  handler: async (args) => {
4055
4055
  if (args.evalPaths.length === 0 && process.stdin.isTTY) {
4056
- const { launchInteractiveWizard } = await import("./interactive-7R2K3CBY.js");
4056
+ const { launchInteractiveWizard } = await import("./interactive-5LEM6ITD.js");
4057
4057
  await launchInteractiveWizard();
4058
4058
  return;
4059
4059
  }
@@ -4121,7 +4121,7 @@ var evalRunCommand = command({
4121
4121
  // src/commands/eval/index.ts
4122
4122
  var evalCommand = subcommands({
4123
4123
  name: "eval",
4124
- description: "Evaluation commands",
4124
+ description: "Evaluation commands. Shorthand: `agentv eval <eval-paths...>` aliases `agentv eval run <eval-paths...>`.",
4125
4125
  cmds: {
4126
4126
  run: evalRunCommand,
4127
4127
  assert: evalAssertCommand,
@@ -9338,7 +9338,7 @@ import { Hono } from "hono";
9338
9338
 
9339
9339
  // src/commands/results/eval-runner.ts
9340
9340
  import { execFileSync as execFileSync2, spawn } from "node:child_process";
9341
- import { existsSync as existsSync12 } from "node:fs";
9341
+ import { createWriteStream, existsSync as existsSync12, mkdirSync as mkdirSync3 } from "node:fs";
9342
9342
  import path17 from "node:path";
9343
9343
  import { fileURLToPath as fileURLToPath2 } from "node:url";
9344
9344
  var activeRuns = /* @__PURE__ */ new Map();
@@ -9365,6 +9365,14 @@ function getActiveRunTarget(indexJsonlPath) {
9365
9365
  }
9366
9366
  return void 0;
9367
9367
  }
9368
+ function getActiveRunStatus(indexJsonlPath) {
9369
+ for (const run2 of activeRuns.values()) {
9370
+ if (run2.outputDir && path17.join(run2.outputDir, "index.jsonl") === indexJsonlPath) {
9371
+ return run2.status;
9372
+ }
9373
+ }
9374
+ return void 0;
9375
+ }
9368
9376
  async function discoverTargetsInProject(cwd) {
9369
9377
  const repoRoot = await findRepoRoot(cwd) ?? cwd;
9370
9378
  let targetsFilePath;
@@ -9477,6 +9485,17 @@ function isCommandAvailable(cmd) {
9477
9485
  return false;
9478
9486
  }
9479
9487
  }
9488
+ function openConsoleLogStream(outputDir) {
9489
+ try {
9490
+ mkdirSync3(outputDir, { recursive: true });
9491
+ const stream = createWriteStream(path17.join(outputDir, "console.log"), { flags: "w" });
9492
+ stream.on("error", () => {
9493
+ });
9494
+ return stream;
9495
+ } catch {
9496
+ return void 0;
9497
+ }
9498
+ }
9480
9499
  function registerEvalRoutes(app2, getCwd, options) {
9481
9500
  const readOnly = options?.readOnly === true;
9482
9501
  app2.get("/api/eval/discover", async (c4) => {
@@ -9553,13 +9572,16 @@ function registerEvalRoutes(app2, getCwd, options) {
9553
9572
  });
9554
9573
  run2.process = child;
9555
9574
  run2.status = "running";
9575
+ const logStream = openConsoleLogStream(outputDir);
9556
9576
  child.stdout?.on("data", (chunk) => {
9577
+ logStream?.write(chunk);
9557
9578
  run2.stdout += chunk.toString();
9558
9579
  if (run2.stdout.length > 1e5) {
9559
9580
  run2.stdout = run2.stdout.slice(-8e4);
9560
9581
  }
9561
9582
  });
9562
9583
  child.stderr?.on("data", (chunk) => {
9584
+ logStream?.write(chunk);
9563
9585
  run2.stderr += chunk.toString();
9564
9586
  if (run2.stderr.length > 1e5) {
9565
9587
  run2.stderr = run2.stderr.slice(-8e4);
@@ -9570,6 +9592,7 @@ function registerEvalRoutes(app2, getCwd, options) {
9570
9592
  run2.status = code === 0 ? "finished" : "failed";
9571
9593
  run2.finishedAt = (/* @__PURE__ */ new Date()).toISOString();
9572
9594
  run2.process = void 0;
9595
+ logStream?.end();
9573
9596
  pruneFinishedRuns();
9574
9597
  });
9575
9598
  child.on("error", (err2) => {
@@ -9578,6 +9601,10 @@ function registerEvalRoutes(app2, getCwd, options) {
9578
9601
  Process error: ${err2.message}`;
9579
9602
  run2.finishedAt = (/* @__PURE__ */ new Date()).toISOString();
9580
9603
  run2.process = void 0;
9604
+ logStream?.write(`
9605
+ Process error: ${err2.message}
9606
+ `);
9607
+ logStream?.end();
9581
9608
  });
9582
9609
  return c4.json(
9583
9610
  {
@@ -9649,7 +9676,7 @@ Process error: ${err2.message}`;
9649
9676
  const args = buildCliArgs(body);
9650
9677
  return c4.json({ command: buildCliPreview(args) });
9651
9678
  });
9652
- app2.get("/api/benchmarks/:benchmarkId/eval/discover", async (c4) => {
9679
+ app2.get("/api/projects/:projectId/eval/discover", async (c4) => {
9653
9680
  const cwd = getCwd(c4);
9654
9681
  try {
9655
9682
  const files = await discoverEvalFiles(cwd);
@@ -9664,7 +9691,7 @@ Process error: ${err2.message}`;
9664
9691
  return c4.json({ error: err2.message, eval_files: [] }, 500);
9665
9692
  }
9666
9693
  });
9667
- app2.get("/api/benchmarks/:benchmarkId/eval/targets", async (c4) => {
9694
+ app2.get("/api/projects/:projectId/eval/targets", async (c4) => {
9668
9695
  const cwd = getCwd(c4);
9669
9696
  try {
9670
9697
  const names = await discoverTargetsInProject(cwd);
@@ -9673,7 +9700,7 @@ Process error: ${err2.message}`;
9673
9700
  return c4.json({ error: err2.message, targets: [] }, 500);
9674
9701
  }
9675
9702
  });
9676
- app2.post("/api/benchmarks/:benchmarkId/eval/run", async (c4) => {
9703
+ app2.post("/api/projects/:projectId/eval/run", async (c4) => {
9677
9704
  if (readOnly) {
9678
9705
  return c4.json({ error: "Studio is running in read-only mode" }, 403);
9679
9706
  }
@@ -9722,11 +9749,14 @@ Process error: ${err2.message}`;
9722
9749
  });
9723
9750
  run2.process = child;
9724
9751
  run2.status = "running";
9752
+ const logStream = openConsoleLogStream(outputDir);
9725
9753
  child.stdout?.on("data", (chunk) => {
9754
+ logStream?.write(chunk);
9726
9755
  run2.stdout += chunk.toString();
9727
9756
  if (run2.stdout.length > 1e5) run2.stdout = run2.stdout.slice(-8e4);
9728
9757
  });
9729
9758
  child.stderr?.on("data", (chunk) => {
9759
+ logStream?.write(chunk);
9730
9760
  run2.stderr += chunk.toString();
9731
9761
  if (run2.stderr.length > 1e5) run2.stderr = run2.stderr.slice(-8e4);
9732
9762
  });
@@ -9735,6 +9765,7 @@ Process error: ${err2.message}`;
9735
9765
  run2.status = code === 0 ? "finished" : "failed";
9736
9766
  run2.finishedAt = (/* @__PURE__ */ new Date()).toISOString();
9737
9767
  run2.process = void 0;
9768
+ logStream?.end();
9738
9769
  pruneFinishedRuns();
9739
9770
  });
9740
9771
  child.on("error", (err2) => {
@@ -9743,6 +9774,10 @@ Process error: ${err2.message}`;
9743
9774
  Process error: ${err2.message}`;
9744
9775
  run2.finishedAt = (/* @__PURE__ */ new Date()).toISOString();
9745
9776
  run2.process = void 0;
9777
+ logStream?.write(`
9778
+ Process error: ${err2.message}
9779
+ `);
9780
+ logStream?.end();
9746
9781
  });
9747
9782
  return c4.json({ id: runId, status: run2.status, command: command2 }, 202);
9748
9783
  } catch (err2) {
@@ -9752,7 +9787,7 @@ Process error: ${err2.message}`;
9752
9787
  return c4.json({ error: err2.message }, 500);
9753
9788
  }
9754
9789
  });
9755
- app2.post("/api/benchmarks/:benchmarkId/eval/run/:id/stop", (c4) => {
9790
+ app2.post("/api/projects/:projectId/eval/run/:id/stop", (c4) => {
9756
9791
  if (readOnly) {
9757
9792
  return c4.json({ error: "Studio is running in read-only mode" }, 403);
9758
9793
  }
@@ -9769,7 +9804,7 @@ Process error: ${err2.message}`;
9769
9804
  }
9770
9805
  return c4.json({ stopped: true, status: run2.status });
9771
9806
  });
9772
- app2.get("/api/benchmarks/:benchmarkId/eval/status/:id", (c4) => {
9807
+ app2.get("/api/projects/:projectId/eval/status/:id", (c4) => {
9773
9808
  const id = c4.req.param("id");
9774
9809
  const run2 = activeRuns.get(id ?? "");
9775
9810
  if (!run2) return c4.json({ error: "Run not found" }, 404);
@@ -9784,7 +9819,7 @@ Process error: ${err2.message}`;
9784
9819
  stderr: run2.stderr.slice(-5e3)
9785
9820
  });
9786
9821
  });
9787
- app2.get("/api/benchmarks/:benchmarkId/eval/runs", (c4) => {
9822
+ app2.get("/api/projects/:projectId/eval/runs", (c4) => {
9788
9823
  const runs = [...activeRuns.values()].map((r) => ({
9789
9824
  id: r.id,
9790
9825
  status: r.status,
@@ -9797,7 +9832,7 @@ Process error: ${err2.message}`;
9797
9832
  runs.sort((a, b) => b.started_at.localeCompare(a.started_at));
9798
9833
  return c4.json({ runs });
9799
9834
  });
9800
- app2.post("/api/benchmarks/:benchmarkId/eval/preview", async (c4) => {
9835
+ app2.post("/api/projects/:projectId/eval/preview", async (c4) => {
9801
9836
  let body;
9802
9837
  try {
9803
9838
  body = await c4.req.json();
@@ -9887,7 +9922,7 @@ function normalizeTags(tags) {
9887
9922
  }
9888
9923
 
9889
9924
  // src/commands/results/studio-config.ts
9890
- import { existsSync as existsSync14, mkdirSync as mkdirSync3, readFileSync as readFileSync11, writeFileSync as writeFileSync5 } from "node:fs";
9925
+ import { existsSync as existsSync14, mkdirSync as mkdirSync4, readFileSync as readFileSync11, writeFileSync as writeFileSync5 } from "node:fs";
9891
9926
  import path19 from "node:path";
9892
9927
  import { stringify as stringifyYaml2 } from "yaml";
9893
9928
  var DEFAULTS = {
@@ -9921,7 +9956,7 @@ function loadStudioConfig(agentvDir) {
9921
9956
  }
9922
9957
  function saveStudioConfig(agentvDir, config) {
9923
9958
  if (!existsSync14(agentvDir)) {
9924
- mkdirSync3(agentvDir, { recursive: true });
9959
+ mkdirSync4(agentvDir, { recursive: true });
9925
9960
  }
9926
9961
  const configPath = path19.join(agentvDir, "config.yaml");
9927
9962
  let existing = {};
@@ -9977,14 +10012,11 @@ Serving most recent: ${metas[0].path}
9977
10012
  }
9978
10013
  return metas[0].path;
9979
10014
  }
9980
- function resolveDashboardMode(benchmarkCount, options) {
10015
+ function resolveDashboardMode(projectCount, options) {
9981
10016
  if (options.single === true) {
9982
- return { isMultiBenchmark: false, showMultiWarning: options.multi === true };
10017
+ return { projectDashboard: false };
9983
10018
  }
9984
- if (options.multi === true) {
9985
- return { isMultiBenchmark: true, showMultiWarning: true };
9986
- }
9987
- return { isMultiBenchmark: benchmarkCount > 1, showMultiWarning: false };
10019
+ return { projectDashboard: projectCount > 1 };
9988
10020
  }
9989
10021
  function feedbackPath(resultDir) {
9990
10022
  return path20.join(resultDir, "feedback.json");
@@ -10096,6 +10128,7 @@ async function handleRuns(c4, { searchDir, agentvDir }) {
10096
10128
  }
10097
10129
  } catch {
10098
10130
  }
10131
+ const liveStatus = getActiveRunStatus(m.path);
10099
10132
  const tagsEntry = readRunTags(m.path);
10100
10133
  return {
10101
10134
  filename: m.filename,
@@ -10109,11 +10142,30 @@ async function handleRuns(c4, { searchDir, agentvDir }) {
10109
10142
  source: m.source,
10110
10143
  ...target && { target },
10111
10144
  ...experiment && { experiment },
10112
- ...tagsEntry && { tags: tagsEntry.tags }
10145
+ ...tagsEntry && { tags: tagsEntry.tags },
10146
+ ...liveStatus && { status: liveStatus }
10113
10147
  };
10114
10148
  })
10115
10149
  });
10116
10150
  }
10151
+ async function handleRunLog(c4, { searchDir }) {
10152
+ const filename = c4.req.param("filename") ?? "";
10153
+ const meta = await findRunById(searchDir, filename);
10154
+ if (!meta) return c4.json({ error: "Run not found" }, 404);
10155
+ if (meta.source === "remote") {
10156
+ return c4.json({ error: "Console log is not available for remote runs" }, 404);
10157
+ }
10158
+ const logPath = path20.join(path20.dirname(meta.path), "console.log");
10159
+ if (!existsSync15(logPath)) {
10160
+ return c4.json({ error: "Console log not found for this run" }, 404);
10161
+ }
10162
+ try {
10163
+ const content = readFileSync12(logPath, "utf8");
10164
+ return c4.text(content);
10165
+ } catch {
10166
+ return c4.json({ error: "Failed to read console log" }, 500);
10167
+ }
10168
+ }
10117
10169
  async function handleRunDetail(c4, { searchDir }) {
10118
10170
  const filename = c4.req.param("filename") ?? "";
10119
10171
  const meta = await findRunById(searchDir, filename);
@@ -10523,8 +10575,8 @@ function handleConfig(c4, { agentvDir, searchDir }, options) {
10523
10575
  return c4.json({
10524
10576
  ...loadStudioConfig(agentvDir),
10525
10577
  read_only: options?.readOnly === true,
10526
- benchmark_name: path20.basename(searchDir),
10527
- multi_benchmark_dashboard: options?.multiBenchmarkDashboard === true
10578
+ project_name: path20.basename(searchDir),
10579
+ project_dashboard: options?.projectDashboard === true
10528
10580
  });
10529
10581
  }
10530
10582
  function handleFeedbackRead(c4, { searchDir }) {
@@ -10581,14 +10633,14 @@ function createApp(results, resultDir, cwd, sourceFile, options) {
10581
10633
  const defaultCtx = { searchDir, agentvDir };
10582
10634
  const readOnly = options?.readOnly === true;
10583
10635
  const app2 = new Hono();
10584
- function withBenchmark(c4, handler) {
10585
- const benchmark = getBenchmark(c4.req.param("benchmarkId") ?? "");
10586
- if (!benchmark || !existsSync15(benchmark.path)) {
10587
- return c4.json({ error: "Benchmark not found" }, 404);
10636
+ function withProject(c4, handler) {
10637
+ const project = getProject(c4.req.param("projectId") ?? "");
10638
+ if (!project || !existsSync15(project.path)) {
10639
+ return c4.json({ error: "Project not found" }, 404);
10588
10640
  }
10589
10641
  return handler(c4, {
10590
- searchDir: benchmark.path,
10591
- agentvDir: path20.join(benchmark.path, ".agentv")
10642
+ searchDir: project.path,
10643
+ agentvDir: path20.join(project.path, ".agentv")
10592
10644
  });
10593
10645
  }
10594
10646
  app2.post("/api/config", async (c4) => {
@@ -10608,7 +10660,7 @@ function createApp(results, resultDir, cwd, sourceFile, options) {
10608
10660
  return c4.json({ error: "Failed to save config" }, 500);
10609
10661
  }
10610
10662
  });
10611
- function benchmarkEntryToWire(entry) {
10663
+ function projectEntryToWire(entry) {
10612
10664
  return {
10613
10665
  id: entry.id,
10614
10666
  name: entry.name,
@@ -10617,10 +10669,10 @@ function createApp(results, resultDir, cwd, sourceFile, options) {
10617
10669
  last_opened_at: entry.lastOpenedAt
10618
10670
  };
10619
10671
  }
10620
- app2.get("/api/benchmarks", async (c4) => {
10621
- const registry = loadBenchmarkRegistry();
10622
- const benchmarks = await Promise.all(
10623
- registry.benchmarks.map(async (p) => {
10672
+ app2.get("/api/projects", async (c4) => {
10673
+ const registry = loadProjectRegistry();
10674
+ const projects = await Promise.all(
10675
+ registry.projects.map(async (p) => {
10624
10676
  let runCount = 0;
10625
10677
  let passRate = 0;
10626
10678
  let lastRun = null;
@@ -10635,52 +10687,52 @@ function createApp(results, resultDir, cwd, sourceFile, options) {
10635
10687
  } catch {
10636
10688
  }
10637
10689
  return {
10638
- ...benchmarkEntryToWire(p),
10690
+ ...projectEntryToWire(p),
10639
10691
  run_count: runCount,
10640
10692
  pass_rate: passRate,
10641
10693
  last_run: lastRun
10642
10694
  };
10643
10695
  })
10644
10696
  );
10645
- return c4.json({ benchmarks });
10697
+ return c4.json({ projects });
10646
10698
  });
10647
- app2.post("/api/benchmarks", async (c4) => {
10699
+ app2.post("/api/projects", async (c4) => {
10648
10700
  if (readOnly) {
10649
10701
  return c4.json({ error: "Studio is running in read-only mode" }, 403);
10650
10702
  }
10651
10703
  try {
10652
10704
  const body = await c4.req.json();
10653
10705
  if (!body.path) return c4.json({ error: "Missing path" }, 400);
10654
- const entry = addBenchmark(body.path);
10655
- return c4.json(benchmarkEntryToWire(entry), 201);
10706
+ const entry = addProject(body.path);
10707
+ return c4.json(projectEntryToWire(entry), 201);
10656
10708
  } catch (err2) {
10657
10709
  return c4.json({ error: err2.message }, 400);
10658
10710
  }
10659
10711
  });
10660
- app2.get("/api/benchmarks/:benchmarkId/summary", async (c4) => {
10661
- const benchmark = getBenchmark(c4.req.param("benchmarkId") ?? "");
10662
- if (!benchmark) return c4.json({ error: "Benchmark not found" }, 404);
10712
+ app2.get("/api/projects/:projectId/summary", async (c4) => {
10713
+ const project = getProject(c4.req.param("projectId") ?? "");
10714
+ if (!project) return c4.json({ error: "Project not found" }, 404);
10663
10715
  try {
10664
- const { runs: metas } = await listMergedResultFiles(benchmark.path);
10716
+ const { runs: metas } = await listMergedResultFiles(project.path);
10665
10717
  const runCount = metas.length;
10666
10718
  const passRate = runCount > 0 ? metas.reduce((s, m) => s + m.passRate, 0) / runCount : 0;
10667
10719
  const lastRun = metas.length > 0 ? metas[0].timestamp : null;
10668
10720
  return c4.json({
10669
- id: benchmark.id,
10670
- name: benchmark.name,
10671
- path: benchmark.path,
10721
+ id: project.id,
10722
+ name: project.name,
10723
+ path: project.path,
10672
10724
  run_count: runCount,
10673
10725
  pass_rate: passRate,
10674
10726
  last_run: lastRun
10675
10727
  });
10676
10728
  } catch {
10677
- return c4.json({ error: "Failed to read benchmark" }, 500);
10729
+ return c4.json({ error: "Failed to read project" }, 500);
10678
10730
  }
10679
10731
  });
10680
- app2.get("/api/benchmarks/all-runs", async (c4) => {
10681
- const registry = loadBenchmarkRegistry();
10732
+ app2.get("/api/projects/all-runs", async (c4) => {
10733
+ const registry = loadProjectRegistry();
10682
10734
  const allRuns = [];
10683
- for (const p of registry.benchmarks) {
10735
+ for (const p of registry.projects) {
10684
10736
  try {
10685
10737
  const { runs: metas } = await listMergedResultFiles(p.path);
10686
10738
  for (const m of metas) {
@@ -10706,8 +10758,8 @@ function createApp(results, resultDir, cwd, sourceFile, options) {
10706
10758
  source: m.source,
10707
10759
  ...target && { target },
10708
10760
  ...experiment && { experiment },
10709
- benchmark_id: p.id,
10710
- benchmark_name: p.name
10761
+ project_id: p.id,
10762
+ project_name: p.name
10711
10763
  });
10712
10764
  }
10713
10765
  } catch {
@@ -10716,19 +10768,19 @@ function createApp(results, resultDir, cwd, sourceFile, options) {
10716
10768
  allRuns.sort((a, b) => b.timestamp.localeCompare(a.timestamp));
10717
10769
  return c4.json({ runs: allRuns });
10718
10770
  });
10719
- app2.delete("/api/benchmarks/:benchmarkId", (c4) => {
10771
+ app2.delete("/api/projects/:projectId", (c4) => {
10720
10772
  if (readOnly) {
10721
10773
  return c4.json({ error: "Studio is running in read-only mode" }, 403);
10722
10774
  }
10723
- const removed = removeBenchmark(c4.req.param("benchmarkId") ?? "");
10724
- if (!removed) return c4.json({ error: "Benchmark not found" }, 404);
10775
+ const removed = removeProject(c4.req.param("projectId") ?? "");
10776
+ if (!removed) return c4.json({ error: "Project not found" }, 404);
10725
10777
  return c4.json({ ok: true });
10726
10778
  });
10727
10779
  app2.get(
10728
10780
  "/api/config",
10729
10781
  (c4) => handleConfig(c4, defaultCtx, {
10730
10782
  readOnly,
10731
- multiBenchmarkDashboard: options?.multiBenchmarkDashboard
10783
+ projectDashboard: options?.projectDashboard
10732
10784
  })
10733
10785
  );
10734
10786
  app2.get("/api/remote/status", async (c4) => c4.json(await getRemoteResultsStatus(searchDir)));
@@ -10747,6 +10799,7 @@ function createApp(results, resultDir, cwd, sourceFile, options) {
10747
10799
  return handleRunTagsDelete(c4, defaultCtx);
10748
10800
  });
10749
10801
  app2.get("/api/runs/:filename", (c4) => handleRunDetail(c4, defaultCtx));
10802
+ app2.get("/api/runs/:filename/log", (c4) => handleRunLog(c4, defaultCtx));
10750
10803
  app2.get("/api/runs/:filename/suites", (c4) => handleRunSuites(c4, defaultCtx));
10751
10804
  app2.get("/api/runs/:filename/categories", (c4) => handleRunCategories(c4, defaultCtx));
10752
10805
  app2.get(
@@ -10826,75 +10879,73 @@ function createApp(results, resultDir, cwd, sourceFile, options) {
10826
10879
  return c4.json({ entries: entries2 });
10827
10880
  });
10828
10881
  app2.get(
10829
- "/api/benchmarks/:benchmarkId/config",
10830
- (c4) => withBenchmark(
10882
+ "/api/projects/:projectId/config",
10883
+ (c4) => withProject(
10831
10884
  c4,
10832
10885
  (ctx, dataCtx) => handleConfig(ctx, dataCtx, {
10833
10886
  readOnly,
10834
- multiBenchmarkDashboard: options?.multiBenchmarkDashboard
10887
+ projectDashboard: options?.projectDashboard
10835
10888
  })
10836
10889
  )
10837
10890
  );
10838
10891
  app2.get(
10839
- "/api/benchmarks/:benchmarkId/remote/status",
10840
- (c4) => withBenchmark(
10892
+ "/api/projects/:projectId/remote/status",
10893
+ (c4) => withProject(
10841
10894
  c4,
10842
10895
  async (ctx, dataCtx) => ctx.json(await getRemoteResultsStatus(dataCtx.searchDir))
10843
10896
  )
10844
10897
  );
10845
10898
  app2.post(
10846
- "/api/benchmarks/:benchmarkId/remote/sync",
10847
- (c4) => withBenchmark(c4, async (ctx, dataCtx) => ctx.json(await syncRemoteResults(dataCtx.searchDir)))
10899
+ "/api/projects/:projectId/remote/sync",
10900
+ (c4) => withProject(c4, async (ctx, dataCtx) => ctx.json(await syncRemoteResults(dataCtx.searchDir)))
10848
10901
  );
10849
- app2.get("/api/benchmarks/:benchmarkId/runs", (c4) => withBenchmark(c4, handleRuns));
10850
- app2.put("/api/benchmarks/:benchmarkId/runs/:filename/tags", (c4) => {
10902
+ app2.get("/api/projects/:projectId/runs", (c4) => withProject(c4, handleRuns));
10903
+ app2.put("/api/projects/:projectId/runs/:filename/tags", (c4) => {
10851
10904
  if (readOnly) {
10852
10905
  return c4.json({ error: "Studio is running in read-only mode" }, 403);
10853
10906
  }
10854
- return withBenchmark(c4, handleRunTagsPut);
10907
+ return withProject(c4, handleRunTagsPut);
10855
10908
  });
10856
- app2.delete("/api/benchmarks/:benchmarkId/runs/:filename/tags", (c4) => {
10909
+ app2.delete("/api/projects/:projectId/runs/:filename/tags", (c4) => {
10857
10910
  if (readOnly) {
10858
10911
  return c4.json({ error: "Studio is running in read-only mode" }, 403);
10859
10912
  }
10860
- return withBenchmark(c4, handleRunTagsDelete);
10913
+ return withProject(c4, handleRunTagsDelete);
10861
10914
  });
10862
- app2.get("/api/benchmarks/:benchmarkId/runs/:filename", (c4) => withBenchmark(c4, handleRunDetail));
10863
- app2.get(
10864
- "/api/benchmarks/:benchmarkId/runs/:filename/suites",
10865
- (c4) => withBenchmark(c4, handleRunSuites)
10866
- );
10915
+ app2.get("/api/projects/:projectId/runs/:filename", (c4) => withProject(c4, handleRunDetail));
10916
+ app2.get("/api/projects/:projectId/runs/:filename/log", (c4) => withProject(c4, handleRunLog));
10917
+ app2.get("/api/projects/:projectId/runs/:filename/suites", (c4) => withProject(c4, handleRunSuites));
10867
10918
  app2.get(
10868
- "/api/benchmarks/:benchmarkId/runs/:filename/categories",
10869
- (c4) => withBenchmark(c4, handleRunCategories)
10919
+ "/api/projects/:projectId/runs/:filename/categories",
10920
+ (c4) => withProject(c4, handleRunCategories)
10870
10921
  );
10871
10922
  app2.get(
10872
- "/api/benchmarks/:benchmarkId/runs/:filename/categories/:category/suites",
10873
- (c4) => withBenchmark(c4, handleCategorySuites)
10923
+ "/api/projects/:projectId/runs/:filename/categories/:category/suites",
10924
+ (c4) => withProject(c4, handleCategorySuites)
10874
10925
  );
10875
10926
  app2.get(
10876
- "/api/benchmarks/:benchmarkId/runs/:filename/evals/:evalId",
10877
- (c4) => withBenchmark(c4, handleEvalDetail)
10927
+ "/api/projects/:projectId/runs/:filename/evals/:evalId",
10928
+ (c4) => withProject(c4, handleEvalDetail)
10878
10929
  );
10879
10930
  app2.get(
10880
- "/api/benchmarks/:benchmarkId/runs/:filename/evals/:evalId/files",
10881
- (c4) => withBenchmark(c4, handleEvalFiles)
10931
+ "/api/projects/:projectId/runs/:filename/evals/:evalId/files",
10932
+ (c4) => withProject(c4, handleEvalFiles)
10882
10933
  );
10883
10934
  app2.get(
10884
- "/api/benchmarks/:benchmarkId/runs/:filename/evals/:evalId/files/*",
10885
- (c4) => withBenchmark(c4, handleEvalFileContent)
10935
+ "/api/projects/:projectId/runs/:filename/evals/:evalId/files/*",
10936
+ (c4) => withProject(c4, handleEvalFileContent)
10886
10937
  );
10887
- app2.get("/api/benchmarks/:benchmarkId/experiments", (c4) => withBenchmark(c4, handleExperiments));
10888
- app2.get("/api/benchmarks/:benchmarkId/compare", (c4) => withBenchmark(c4, handleCompare));
10889
- app2.get("/api/benchmarks/:benchmarkId/targets", (c4) => withBenchmark(c4, handleTargets));
10890
- app2.get("/api/benchmarks/:benchmarkId/feedback", (c4) => withBenchmark(c4, handleFeedbackRead));
10938
+ app2.get("/api/projects/:projectId/experiments", (c4) => withProject(c4, handleExperiments));
10939
+ app2.get("/api/projects/:projectId/compare", (c4) => withProject(c4, handleCompare));
10940
+ app2.get("/api/projects/:projectId/targets", (c4) => withProject(c4, handleTargets));
10941
+ app2.get("/api/projects/:projectId/feedback", (c4) => withProject(c4, handleFeedbackRead));
10891
10942
  registerEvalRoutes(
10892
10943
  app2,
10893
10944
  (c4) => {
10894
- const benchmarkId = c4.req.param("benchmarkId");
10895
- if (benchmarkId) {
10896
- const benchmark = getBenchmark(benchmarkId);
10897
- if (benchmark) return benchmark.path;
10945
+ const projectId = c4.req.param("projectId");
10946
+ if (projectId) {
10947
+ const project = getProject(projectId);
10948
+ if (project) return project.path;
10898
10949
  }
10899
10950
  return searchDir;
10900
10951
  },
@@ -10981,36 +11032,32 @@ var resultsServeCommand = command({
10981
11032
  short: "d",
10982
11033
  description: "Working directory (default: current directory)"
10983
11034
  }),
10984
- multi: flag({
10985
- long: "multi",
10986
- description: "Launch in multi-benchmark dashboard mode (deprecated; use auto-detect or --single)"
10987
- }),
10988
11035
  single: flag({
10989
11036
  long: "single",
10990
- description: "Force single-benchmark dashboard mode"
11037
+ description: "Force single-project dashboard mode"
10991
11038
  }),
10992
11039
  add: option({
10993
11040
  type: optional(string),
10994
11041
  long: "add",
10995
- description: "Register a benchmark by path"
11042
+ description: "Register a project by path"
10996
11043
  }),
10997
11044
  remove: option({
10998
11045
  type: optional(string),
10999
11046
  long: "remove",
11000
- description: "Unregister a benchmark by ID"
11047
+ description: "Unregister a project by ID"
11001
11048
  }),
11002
11049
  readOnly: flag({
11003
11050
  long: "read-only",
11004
11051
  description: "Disable write operations and launch Studio in read-only leaderboard mode"
11005
11052
  })
11006
11053
  },
11007
- handler: async ({ source, port, dir, multi, single, add, remove, readOnly }) => {
11054
+ handler: async ({ source, port, dir, single, add, remove, readOnly }) => {
11008
11055
  const cwd = dir ?? process.cwd();
11009
11056
  const listenPort = port ?? (process.env.PORT ? Number(process.env.PORT) : 3117);
11010
11057
  if (add) {
11011
11058
  try {
11012
- const entry = addBenchmark(add);
11013
- console.log(`Registered benchmark: ${entry.name} (${entry.id}) at ${entry.path}`);
11059
+ const entry = addProject(add);
11060
+ console.log(`Registered project: ${entry.name} (${entry.id}) at ${entry.path}`);
11014
11061
  } catch (err2) {
11015
11062
  console.error(`Error: ${err2.message}`);
11016
11063
  process.exit(1);
@@ -11018,11 +11065,11 @@ var resultsServeCommand = command({
11018
11065
  return;
11019
11066
  }
11020
11067
  if (remove) {
11021
- const removed = removeBenchmark(remove);
11068
+ const removed = removeProject(remove);
11022
11069
  if (removed) {
11023
- console.log(`Unregistered benchmark: ${remove}`);
11070
+ console.log(`Unregistered project: ${remove}`);
11024
11071
  } else {
11025
- console.error(`Benchmark not found: ${remove}`);
11072
+ console.error(`Project not found: ${remove}`);
11026
11073
  process.exit(1);
11027
11074
  }
11028
11075
  return;
@@ -11032,12 +11079,9 @@ var resultsServeCommand = command({
11032
11079
  if (yamlConfig?.required_version) {
11033
11080
  await enforceRequiredVersion(yamlConfig.required_version);
11034
11081
  }
11035
- const registry = loadBenchmarkRegistry();
11036
- const { isMultiBenchmark, showMultiWarning } = resolveDashboardMode(
11037
- registry.benchmarks.length,
11038
- { multi, single }
11039
- );
11040
- await syncBenchmarks(registry.benchmarks);
11082
+ const registry = loadProjectRegistry();
11083
+ const { projectDashboard } = resolveDashboardMode(registry.projects.length, { single });
11084
+ await syncProjects(registry.projects);
11041
11085
  try {
11042
11086
  let results = [];
11043
11087
  let sourceFile;
@@ -11061,15 +11105,10 @@ var resultsServeCommand = command({
11061
11105
  const resultDir = sourceFile ? path20.dirname(path20.resolve(sourceFile)) : cwd;
11062
11106
  const app2 = createApp(results, resultDir, cwd, sourceFile, {
11063
11107
  readOnly,
11064
- multiBenchmarkDashboard: isMultiBenchmark
11108
+ projectDashboard
11065
11109
  });
11066
- if (showMultiWarning) {
11067
- console.warn(
11068
- "Warning: --multi is deprecated. Studio now auto-detects multi-benchmark mode when multiple benchmarks are registered. Use --single to force the single-benchmark view."
11069
- );
11070
- }
11071
- if (isMultiBenchmark) {
11072
- console.log(`Multi-benchmark mode: ${registry.benchmarks.length} benchmark(s) registered`);
11110
+ if (projectDashboard) {
11111
+ console.log(`Project dashboard: ${registry.projects.length} project(s) registered`);
11073
11112
  } else if (results.length > 0 && sourceFile) {
11074
11113
  console.log(`Serving ${results.length} result(s) from ${sourceFile}`);
11075
11114
  } else {
@@ -11077,7 +11116,7 @@ var resultsServeCommand = command({
11077
11116
  console.log("Run an evaluation to see results: agentv eval <eval-file>");
11078
11117
  }
11079
11118
  console.log(`Dashboard: http://localhost:${listenPort}`);
11080
- console.log(`Benchmarks API: http://localhost:${listenPort}/api/benchmarks`);
11119
+ console.log(`Projects API: http://localhost:${listenPort}/api/projects`);
11081
11120
  console.log("Press Ctrl+C to stop");
11082
11121
  const { serve: startServer } = await import("@hono/node-server");
11083
11122
  startServer({
@@ -12549,4 +12588,4 @@ export {
12549
12588
  preprocessArgv,
12550
12589
  runCli
12551
12590
  };
12552
- //# sourceMappingURL=chunk-2WS3BEPV.js.map
12591
+ //# sourceMappingURL=chunk-GLJVO5PK.js.map