@agentv/core 4.20.0 → 4.21.0-next.1

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
package/dist/index.d.cts CHANGED
@@ -4082,7 +4082,7 @@ declare function createDraftResultsPr(params: {
4082
4082
  /**
4083
4083
  * The default config directory (~/.agentv). Always resolves to the user's home
4084
4084
  * directory regardless of AGENTV_HOME. Used for lightweight, machine-local files
4085
- * like version-check.json, last-config.json, and projects.yaml.
4085
+ * like version-check.json, last-config.json, and benchmarks.yaml.
4086
4086
  */
4087
4087
  declare function getAgentvConfigDir(): string;
4088
4088
  /**
@@ -4100,18 +4100,31 @@ declare function getWorkspacePoolRoot(): string;
4100
4100
  * Benchmark registry for AgentV Studio multi-benchmark support.
4101
4101
  *
4102
4102
  * A Benchmark = any directory containing a `.agentv/` folder.
4103
- * The registry lives at `~/.agentv/projects.yaml` and tracks registered benchmarks.
4103
+ * The registry lives at `~/.agentv/benchmarks.yaml` and is the single source of
4104
+ * truth for which benchmarks Studio shows. Studio re-reads the file on every
4105
+ * `/api/benchmarks` request, so edits (direct, via POST /api/benchmarks, via
4106
+ * the CLI's --add/--remove, or via a Kubernetes ConfigMap mount) are reflected
4107
+ * without restarting `agentv serve`.
4104
4108
  *
4105
- * YAML format:
4109
+ * YAML format (all keys snake_case per AGENTS.md §"Wire Format Convention"):
4106
4110
  * benchmarks:
4107
4111
  * - id: my-app
4108
4112
  * name: My App
4109
4113
  * path: /home/user/projects/my-app
4110
- * addedAt: "2026-03-20T10:00:00Z"
4111
- * lastOpenedAt: "2026-03-30T14:00:00Z"
4114
+ * added_at: "2026-03-20T10:00:00Z"
4115
+ * last_opened_at: "2026-03-30T14:00:00Z"
4112
4116
  *
4113
- * To extend: use loadBenchmarkRegistry() / saveBenchmarkRegistry() for CRUD,
4114
- * discoverBenchmarks() to scan a directory tree for `.agentv/` directories.
4117
+ * Concurrency: the registry assumes a single writer. All mutating calls
4118
+ * (add/remove/touchBenchmark) do read-modify-write on benchmarks.yaml
4119
+ * without a lock. Studio's HTTP handlers are serialized by Node's
4120
+ * single-threaded event loop, which satisfies the 24/7 deployment case.
4121
+ * Run only one `agentv` process against a given home at a time.
4122
+ *
4123
+ * To extend:
4124
+ * - CRUD: loadBenchmarkRegistry() / saveBenchmarkRegistry() + the
4125
+ * add/remove/touch helpers.
4126
+ * - discoverBenchmarks() is a one-shot filesystem utility for bulk
4127
+ * registration; it does not run in the request path.
4115
4128
  */
4116
4129
  interface BenchmarkEntry {
4117
4130
  id: string;
@@ -4151,7 +4164,9 @@ declare function getBenchmark(benchmarkId: string): BenchmarkEntry | undefined;
4151
4164
  declare function touchBenchmark(benchmarkId: string): void;
4152
4165
  /**
4153
4166
  * Scan a directory tree (up to maxDepth levels) for directories containing `.agentv/`.
4154
- * Returns absolute paths of discovered benchmark directories.
4167
+ * Returns absolute paths of discovered benchmark directories, sorted for
4168
+ * deterministic iteration. This is a one-shot helper for bulk registration;
4169
+ * Studio does not scan at request time.
4155
4170
  */
4156
4171
  declare function discoverBenchmarks(rootDir: string, maxDepth?: number): string[];
4157
4172
 
package/dist/index.d.ts CHANGED
@@ -4082,7 +4082,7 @@ declare function createDraftResultsPr(params: {
4082
4082
  /**
4083
4083
  * The default config directory (~/.agentv). Always resolves to the user's home
4084
4084
  * directory regardless of AGENTV_HOME. Used for lightweight, machine-local files
4085
- * like version-check.json, last-config.json, and projects.yaml.
4085
+ * like version-check.json, last-config.json, and benchmarks.yaml.
4086
4086
  */
4087
4087
  declare function getAgentvConfigDir(): string;
4088
4088
  /**
@@ -4100,18 +4100,31 @@ declare function getWorkspacePoolRoot(): string;
4100
4100
  * Benchmark registry for AgentV Studio multi-benchmark support.
4101
4101
  *
4102
4102
  * A Benchmark = any directory containing a `.agentv/` folder.
4103
- * The registry lives at `~/.agentv/projects.yaml` and tracks registered benchmarks.
4103
+ * The registry lives at `~/.agentv/benchmarks.yaml` and is the single source of
4104
+ * truth for which benchmarks Studio shows. Studio re-reads the file on every
4105
+ * `/api/benchmarks` request, so edits (direct, via POST /api/benchmarks, via
4106
+ * the CLI's --add/--remove, or via a Kubernetes ConfigMap mount) are reflected
4107
+ * without restarting `agentv serve`.
4104
4108
  *
4105
- * YAML format:
4109
+ * YAML format (all keys snake_case per AGENTS.md §"Wire Format Convention"):
4106
4110
  * benchmarks:
4107
4111
  * - id: my-app
4108
4112
  * name: My App
4109
4113
  * path: /home/user/projects/my-app
4110
- * addedAt: "2026-03-20T10:00:00Z"
4111
- * lastOpenedAt: "2026-03-30T14:00:00Z"
4114
+ * added_at: "2026-03-20T10:00:00Z"
4115
+ * last_opened_at: "2026-03-30T14:00:00Z"
4112
4116
  *
4113
- * To extend: use loadBenchmarkRegistry() / saveBenchmarkRegistry() for CRUD,
4114
- * discoverBenchmarks() to scan a directory tree for `.agentv/` directories.
4117
+ * Concurrency: the registry assumes a single writer. All mutating calls
4118
+ * (add/remove/touchBenchmark) do read-modify-write on benchmarks.yaml
4119
+ * without a lock. Studio's HTTP handlers are serialized by Node's
4120
+ * single-threaded event loop, which satisfies the 24/7 deployment case.
4121
+ * Run only one `agentv` process against a given home at a time.
4122
+ *
4123
+ * To extend:
4124
+ * - CRUD: loadBenchmarkRegistry() / saveBenchmarkRegistry() + the
4125
+ * add/remove/touch helpers.
4126
+ * - discoverBenchmarks() is a one-shot filesystem utility for bulk
4127
+ * registration; it does not run in the request path.
4115
4128
  */
4116
4129
  interface BenchmarkEntry {
4117
4130
  id: string;
@@ -4151,7 +4164,9 @@ declare function getBenchmark(benchmarkId: string): BenchmarkEntry | undefined;
4151
4164
  declare function touchBenchmark(benchmarkId: string): void;
4152
4165
  /**
4153
4166
  * Scan a directory tree (up to maxDepth levels) for directories containing `.agentv/`.
4154
- * Returns absolute paths of discovered benchmark directories.
4167
+ * Returns absolute paths of discovered benchmark directories, sorted for
4168
+ * deterministic iteration. This is a one-shot helper for bulk registration;
4169
+ * Studio does not scan at request time.
4155
4170
  */
4156
4171
  declare function discoverBenchmarks(rootDir: string, maxDepth?: number): string[];
4157
4172
 
package/dist/index.js CHANGED
@@ -128,7 +128,7 @@ import {
128
128
  toCamelCaseDeep,
129
129
  toSnakeCaseDeep,
130
130
  tokensPerTool
131
- } from "./chunk-ELF6SQAK.js";
131
+ } from "./chunk-WCW3V6QJ.js";
132
132
  import {
133
133
  COMMON_TARGET_SETTINGS,
134
134
  TEST_MESSAGE_ROLES,
@@ -152,7 +152,7 @@ import {
152
152
  resolveDelegatedTargetDefinition,
153
153
  resolveFileReference,
154
154
  resolveTargetDefinition
155
- } from "./chunk-24ND5HZC.js";
155
+ } from "./chunk-LKX4QW3G.js";
156
156
  import "./chunk-3WGHC7LC.js";
157
157
  import "./chunk-PRNXHNLF.js";
158
158
  import {
@@ -992,44 +992,48 @@ async function createDraftResultsPr(params) {
992
992
  }
993
993
 
994
994
  // src/benchmarks.ts
995
- import {
996
- copyFileSync,
997
- existsSync as existsSync2,
998
- mkdirSync as mkdirSync2,
999
- readFileSync as readFileSync3,
1000
- readdirSync,
1001
- statSync,
1002
- writeFileSync as writeFileSync2
1003
- } from "node:fs";
995
+ import { existsSync as existsSync2, mkdirSync as mkdirSync2, readFileSync as readFileSync3, readdirSync, statSync, writeFileSync as writeFileSync2 } from "node:fs";
1004
996
  import path5 from "node:path";
1005
997
  import { parse as parseYaml, stringify as stringifyYaml } from "yaml";
1006
998
  function getBenchmarksRegistryPath() {
1007
- return path5.join(getAgentvConfigDir(), "projects.yaml");
999
+ return path5.join(getAgentvConfigDir(), "benchmarks.yaml");
1008
1000
  }
1009
- function migrateProjectsYaml(targetPath) {
1010
- const dataHome = getAgentvHome();
1011
- const configDir = getAgentvConfigDir();
1012
- if (dataHome === configDir) return;
1013
- const legacyPath = path5.join(dataHome, "projects.yaml");
1014
- if (!existsSync2(legacyPath)) return;
1015
- mkdirSync2(path5.dirname(targetPath), { recursive: true });
1016
- copyFileSync(legacyPath, targetPath);
1001
+ function fromYaml(raw) {
1002
+ if (!raw || typeof raw !== "object") return null;
1003
+ const e = raw;
1004
+ if (typeof e.id !== "string" || typeof e.name !== "string" || typeof e.path !== "string") {
1005
+ return null;
1006
+ }
1007
+ return {
1008
+ id: e.id,
1009
+ name: e.name,
1010
+ path: e.path,
1011
+ addedAt: typeof e.added_at === "string" ? e.added_at : "",
1012
+ lastOpenedAt: typeof e.last_opened_at === "string" ? e.last_opened_at : ""
1013
+ };
1014
+ }
1015
+ function toYaml(entry) {
1016
+ return {
1017
+ id: entry.id,
1018
+ name: entry.name,
1019
+ path: entry.path,
1020
+ added_at: entry.addedAt,
1021
+ last_opened_at: entry.lastOpenedAt
1022
+ };
1017
1023
  }
1018
1024
  function loadBenchmarkRegistry() {
1019
1025
  const registryPath = getBenchmarksRegistryPath();
1020
- if (!existsSync2(registryPath)) {
1021
- migrateProjectsYaml(registryPath);
1022
- }
1023
1026
  if (!existsSync2(registryPath)) {
1024
1027
  return { benchmarks: [] };
1025
1028
  }
1026
1029
  try {
1027
1030
  const raw = readFileSync3(registryPath, "utf-8");
1028
1031
  const parsed = parseYaml(raw);
1029
- if (!parsed || !Array.isArray(parsed.benchmarks)) {
1032
+ if (!parsed || typeof parsed !== "object") {
1030
1033
  return { benchmarks: [] };
1031
1034
  }
1032
- return { benchmarks: parsed.benchmarks };
1035
+ const benchmarks = Array.isArray(parsed.benchmarks) ? parsed.benchmarks.map(fromYaml).filter((e) => e !== null) : [];
1036
+ return { benchmarks };
1033
1037
  } catch {
1034
1038
  return { benchmarks: [] };
1035
1039
  }
@@ -1040,7 +1044,8 @@ function saveBenchmarkRegistry(registry) {
1040
1044
  if (!existsSync2(dir)) {
1041
1045
  mkdirSync2(dir, { recursive: true });
1042
1046
  }
1043
- writeFileSync2(registryPath, stringifyYaml({ benchmarks: registry.benchmarks }), "utf-8");
1047
+ const payload = { benchmarks: registry.benchmarks.map(toYaml) };
1048
+ writeFileSync2(registryPath, stringifyYaml(payload), "utf-8");
1044
1049
  }
1045
1050
  function deriveBenchmarkId(dirPath, existingIds) {
1046
1051
  const base = path5.basename(dirPath).toLowerCase().replace(/[^a-z0-9-]/g, "-").replace(/-+/g, "-").replace(/^-|-$/g, "");
@@ -1123,7 +1128,7 @@ function discoverBenchmarks(rootDir, maxDepth = 2) {
1123
1128
  }
1124
1129
  }
1125
1130
  scan(absRoot, 0);
1126
- return results;
1131
+ return results.sort();
1127
1132
  }
1128
1133
 
1129
1134
  // src/evaluation/baseline.ts