@agentv/core 4.20.0-next.1 → 4.21.0-next.1
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/dist/{chunk-24ND5HZC.js → chunk-LKX4QW3G.js} +60 -2
- package/dist/{chunk-24ND5HZC.js.map → chunk-LKX4QW3G.js.map} +1 -1
- package/dist/{chunk-ELF6SQAK.js → chunk-WCW3V6QJ.js} +28 -17
- package/dist/chunk-WCW3V6QJ.js.map +1 -0
- package/dist/evaluation/validation/index.cjs +94 -8
- package/dist/evaluation/validation/index.cjs.map +1 -1
- package/dist/evaluation/validation/index.js +40 -10
- package/dist/evaluation/validation/index.js.map +1 -1
- package/dist/index.cjs +104 -24
- package/dist/index.cjs.map +1 -1
- package/dist/index.d.cts +23 -8
- package/dist/index.d.ts +23 -8
- package/dist/index.js +32 -27
- package/dist/index.js.map +1 -1
- package/dist/{ts-eval-loader-32COE32J.js → ts-eval-loader-HPIPE72C.js} +3 -3
- package/package.json +1 -1
- package/dist/chunk-ELF6SQAK.js.map +0 -1
- /package/dist/{ts-eval-loader-32COE32J.js.map → ts-eval-loader-HPIPE72C.js.map} +0 -0
package/dist/index.d.cts
CHANGED
|
@@ -4082,7 +4082,7 @@ declare function createDraftResultsPr(params: {
|
|
|
4082
4082
|
/**
|
|
4083
4083
|
* The default config directory (~/.agentv). Always resolves to the user's home
|
|
4084
4084
|
* directory regardless of AGENTV_HOME. Used for lightweight, machine-local files
|
|
4085
|
-
* like version-check.json, last-config.json, and
|
|
4085
|
+
* like version-check.json, last-config.json, and benchmarks.yaml.
|
|
4086
4086
|
*/
|
|
4087
4087
|
declare function getAgentvConfigDir(): string;
|
|
4088
4088
|
/**
|
|
@@ -4100,18 +4100,31 @@ declare function getWorkspacePoolRoot(): string;
|
|
|
4100
4100
|
* Benchmark registry for AgentV Studio multi-benchmark support.
|
|
4101
4101
|
*
|
|
4102
4102
|
* A Benchmark = any directory containing a `.agentv/` folder.
|
|
4103
|
-
* The registry lives at `~/.agentv/
|
|
4103
|
+
* The registry lives at `~/.agentv/benchmarks.yaml` and is the single source of
|
|
4104
|
+
* truth for which benchmarks Studio shows. Studio re-reads the file on every
|
|
4105
|
+
* `/api/benchmarks` request, so edits (direct, via POST /api/benchmarks, via
|
|
4106
|
+
* the CLI's --add/--remove, or via a Kubernetes ConfigMap mount) are reflected
|
|
4107
|
+
* without restarting `agentv serve`.
|
|
4104
4108
|
*
|
|
4105
|
-
* YAML format:
|
|
4109
|
+
* YAML format (all keys snake_case per AGENTS.md §"Wire Format Convention"):
|
|
4106
4110
|
* benchmarks:
|
|
4107
4111
|
* - id: my-app
|
|
4108
4112
|
* name: My App
|
|
4109
4113
|
* path: /home/user/projects/my-app
|
|
4110
|
-
*
|
|
4111
|
-
*
|
|
4114
|
+
* added_at: "2026-03-20T10:00:00Z"
|
|
4115
|
+
* last_opened_at: "2026-03-30T14:00:00Z"
|
|
4112
4116
|
*
|
|
4113
|
-
*
|
|
4114
|
-
*
|
|
4117
|
+
* Concurrency: the registry assumes a single writer. All mutating calls
|
|
4118
|
+
* (add/remove/touchBenchmark) do read-modify-write on benchmarks.yaml
|
|
4119
|
+
* without a lock. Studio's HTTP handlers are serialized by Node's
|
|
4120
|
+
* single-threaded event loop, which satisfies the 24/7 deployment case.
|
|
4121
|
+
* Run only one `agentv` process against a given home at a time.
|
|
4122
|
+
*
|
|
4123
|
+
* To extend:
|
|
4124
|
+
* - CRUD: loadBenchmarkRegistry() / saveBenchmarkRegistry() + the
|
|
4125
|
+
* add/remove/touch helpers.
|
|
4126
|
+
* - discoverBenchmarks() is a one-shot filesystem utility for bulk
|
|
4127
|
+
* registration; it does not run in the request path.
|
|
4115
4128
|
*/
|
|
4116
4129
|
interface BenchmarkEntry {
|
|
4117
4130
|
id: string;
|
|
@@ -4151,7 +4164,9 @@ declare function getBenchmark(benchmarkId: string): BenchmarkEntry | undefined;
|
|
|
4151
4164
|
declare function touchBenchmark(benchmarkId: string): void;
|
|
4152
4165
|
/**
|
|
4153
4166
|
* Scan a directory tree (up to maxDepth levels) for directories containing `.agentv/`.
|
|
4154
|
-
* Returns absolute paths of discovered benchmark directories
|
|
4167
|
+
* Returns absolute paths of discovered benchmark directories, sorted for
|
|
4168
|
+
* deterministic iteration. This is a one-shot helper for bulk registration;
|
|
4169
|
+
* Studio does not scan at request time.
|
|
4155
4170
|
*/
|
|
4156
4171
|
declare function discoverBenchmarks(rootDir: string, maxDepth?: number): string[];
|
|
4157
4172
|
|
package/dist/index.d.ts
CHANGED
|
@@ -4082,7 +4082,7 @@ declare function createDraftResultsPr(params: {
|
|
|
4082
4082
|
/**
|
|
4083
4083
|
* The default config directory (~/.agentv). Always resolves to the user's home
|
|
4084
4084
|
* directory regardless of AGENTV_HOME. Used for lightweight, machine-local files
|
|
4085
|
-
* like version-check.json, last-config.json, and
|
|
4085
|
+
* like version-check.json, last-config.json, and benchmarks.yaml.
|
|
4086
4086
|
*/
|
|
4087
4087
|
declare function getAgentvConfigDir(): string;
|
|
4088
4088
|
/**
|
|
@@ -4100,18 +4100,31 @@ declare function getWorkspacePoolRoot(): string;
|
|
|
4100
4100
|
* Benchmark registry for AgentV Studio multi-benchmark support.
|
|
4101
4101
|
*
|
|
4102
4102
|
* A Benchmark = any directory containing a `.agentv/` folder.
|
|
4103
|
-
* The registry lives at `~/.agentv/
|
|
4103
|
+
* The registry lives at `~/.agentv/benchmarks.yaml` and is the single source of
|
|
4104
|
+
* truth for which benchmarks Studio shows. Studio re-reads the file on every
|
|
4105
|
+
* `/api/benchmarks` request, so edits (direct, via POST /api/benchmarks, via
|
|
4106
|
+
* the CLI's --add/--remove, or via a Kubernetes ConfigMap mount) are reflected
|
|
4107
|
+
* without restarting `agentv serve`.
|
|
4104
4108
|
*
|
|
4105
|
-
* YAML format:
|
|
4109
|
+
* YAML format (all keys snake_case per AGENTS.md §"Wire Format Convention"):
|
|
4106
4110
|
* benchmarks:
|
|
4107
4111
|
* - id: my-app
|
|
4108
4112
|
* name: My App
|
|
4109
4113
|
* path: /home/user/projects/my-app
|
|
4110
|
-
*
|
|
4111
|
-
*
|
|
4114
|
+
* added_at: "2026-03-20T10:00:00Z"
|
|
4115
|
+
* last_opened_at: "2026-03-30T14:00:00Z"
|
|
4112
4116
|
*
|
|
4113
|
-
*
|
|
4114
|
-
*
|
|
4117
|
+
* Concurrency: the registry assumes a single writer. All mutating calls
|
|
4118
|
+
* (add/remove/touchBenchmark) do read-modify-write on benchmarks.yaml
|
|
4119
|
+
* without a lock. Studio's HTTP handlers are serialized by Node's
|
|
4120
|
+
* single-threaded event loop, which satisfies the 24/7 deployment case.
|
|
4121
|
+
* Run only one `agentv` process against a given home at a time.
|
|
4122
|
+
*
|
|
4123
|
+
* To extend:
|
|
4124
|
+
* - CRUD: loadBenchmarkRegistry() / saveBenchmarkRegistry() + the
|
|
4125
|
+
* add/remove/touch helpers.
|
|
4126
|
+
* - discoverBenchmarks() is a one-shot filesystem utility for bulk
|
|
4127
|
+
* registration; it does not run in the request path.
|
|
4115
4128
|
*/
|
|
4116
4129
|
interface BenchmarkEntry {
|
|
4117
4130
|
id: string;
|
|
@@ -4151,7 +4164,9 @@ declare function getBenchmark(benchmarkId: string): BenchmarkEntry | undefined;
|
|
|
4151
4164
|
declare function touchBenchmark(benchmarkId: string): void;
|
|
4152
4165
|
/**
|
|
4153
4166
|
* Scan a directory tree (up to maxDepth levels) for directories containing `.agentv/`.
|
|
4154
|
-
* Returns absolute paths of discovered benchmark directories
|
|
4167
|
+
* Returns absolute paths of discovered benchmark directories, sorted for
|
|
4168
|
+
* deterministic iteration. This is a one-shot helper for bulk registration;
|
|
4169
|
+
* Studio does not scan at request time.
|
|
4155
4170
|
*/
|
|
4156
4171
|
declare function discoverBenchmarks(rootDir: string, maxDepth?: number): string[];
|
|
4157
4172
|
|
package/dist/index.js
CHANGED
|
@@ -128,7 +128,7 @@ import {
|
|
|
128
128
|
toCamelCaseDeep,
|
|
129
129
|
toSnakeCaseDeep,
|
|
130
130
|
tokensPerTool
|
|
131
|
-
} from "./chunk-
|
|
131
|
+
} from "./chunk-WCW3V6QJ.js";
|
|
132
132
|
import {
|
|
133
133
|
COMMON_TARGET_SETTINGS,
|
|
134
134
|
TEST_MESSAGE_ROLES,
|
|
@@ -152,7 +152,7 @@ import {
|
|
|
152
152
|
resolveDelegatedTargetDefinition,
|
|
153
153
|
resolveFileReference,
|
|
154
154
|
resolveTargetDefinition
|
|
155
|
-
} from "./chunk-
|
|
155
|
+
} from "./chunk-LKX4QW3G.js";
|
|
156
156
|
import "./chunk-3WGHC7LC.js";
|
|
157
157
|
import "./chunk-PRNXHNLF.js";
|
|
158
158
|
import {
|
|
@@ -992,44 +992,48 @@ async function createDraftResultsPr(params) {
|
|
|
992
992
|
}
|
|
993
993
|
|
|
994
994
|
// src/benchmarks.ts
|
|
995
|
-
import {
|
|
996
|
-
copyFileSync,
|
|
997
|
-
existsSync as existsSync2,
|
|
998
|
-
mkdirSync as mkdirSync2,
|
|
999
|
-
readFileSync as readFileSync3,
|
|
1000
|
-
readdirSync,
|
|
1001
|
-
statSync,
|
|
1002
|
-
writeFileSync as writeFileSync2
|
|
1003
|
-
} from "node:fs";
|
|
995
|
+
import { existsSync as existsSync2, mkdirSync as mkdirSync2, readFileSync as readFileSync3, readdirSync, statSync, writeFileSync as writeFileSync2 } from "node:fs";
|
|
1004
996
|
import path5 from "node:path";
|
|
1005
997
|
import { parse as parseYaml, stringify as stringifyYaml } from "yaml";
|
|
1006
998
|
function getBenchmarksRegistryPath() {
|
|
1007
|
-
return path5.join(getAgentvConfigDir(), "
|
|
999
|
+
return path5.join(getAgentvConfigDir(), "benchmarks.yaml");
|
|
1008
1000
|
}
|
|
1009
|
-
function
|
|
1010
|
-
|
|
1011
|
-
const
|
|
1012
|
-
if (
|
|
1013
|
-
|
|
1014
|
-
|
|
1015
|
-
|
|
1016
|
-
|
|
1001
|
+
function fromYaml(raw) {
|
|
1002
|
+
if (!raw || typeof raw !== "object") return null;
|
|
1003
|
+
const e = raw;
|
|
1004
|
+
if (typeof e.id !== "string" || typeof e.name !== "string" || typeof e.path !== "string") {
|
|
1005
|
+
return null;
|
|
1006
|
+
}
|
|
1007
|
+
return {
|
|
1008
|
+
id: e.id,
|
|
1009
|
+
name: e.name,
|
|
1010
|
+
path: e.path,
|
|
1011
|
+
addedAt: typeof e.added_at === "string" ? e.added_at : "",
|
|
1012
|
+
lastOpenedAt: typeof e.last_opened_at === "string" ? e.last_opened_at : ""
|
|
1013
|
+
};
|
|
1014
|
+
}
|
|
1015
|
+
function toYaml(entry) {
|
|
1016
|
+
return {
|
|
1017
|
+
id: entry.id,
|
|
1018
|
+
name: entry.name,
|
|
1019
|
+
path: entry.path,
|
|
1020
|
+
added_at: entry.addedAt,
|
|
1021
|
+
last_opened_at: entry.lastOpenedAt
|
|
1022
|
+
};
|
|
1017
1023
|
}
|
|
1018
1024
|
function loadBenchmarkRegistry() {
|
|
1019
1025
|
const registryPath = getBenchmarksRegistryPath();
|
|
1020
|
-
if (!existsSync2(registryPath)) {
|
|
1021
|
-
migrateProjectsYaml(registryPath);
|
|
1022
|
-
}
|
|
1023
1026
|
if (!existsSync2(registryPath)) {
|
|
1024
1027
|
return { benchmarks: [] };
|
|
1025
1028
|
}
|
|
1026
1029
|
try {
|
|
1027
1030
|
const raw = readFileSync3(registryPath, "utf-8");
|
|
1028
1031
|
const parsed = parseYaml(raw);
|
|
1029
|
-
if (!parsed ||
|
|
1032
|
+
if (!parsed || typeof parsed !== "object") {
|
|
1030
1033
|
return { benchmarks: [] };
|
|
1031
1034
|
}
|
|
1032
|
-
|
|
1035
|
+
const benchmarks = Array.isArray(parsed.benchmarks) ? parsed.benchmarks.map(fromYaml).filter((e) => e !== null) : [];
|
|
1036
|
+
return { benchmarks };
|
|
1033
1037
|
} catch {
|
|
1034
1038
|
return { benchmarks: [] };
|
|
1035
1039
|
}
|
|
@@ -1040,7 +1044,8 @@ function saveBenchmarkRegistry(registry) {
|
|
|
1040
1044
|
if (!existsSync2(dir)) {
|
|
1041
1045
|
mkdirSync2(dir, { recursive: true });
|
|
1042
1046
|
}
|
|
1043
|
-
|
|
1047
|
+
const payload = { benchmarks: registry.benchmarks.map(toYaml) };
|
|
1048
|
+
writeFileSync2(registryPath, stringifyYaml(payload), "utf-8");
|
|
1044
1049
|
}
|
|
1045
1050
|
function deriveBenchmarkId(dirPath, existingIds) {
|
|
1046
1051
|
const base = path5.basename(dirPath).toLowerCase().replace(/[^a-z0-9-]/g, "-").replace(/-+/g, "-").replace(/^-|-$/g, "");
|
|
@@ -1123,7 +1128,7 @@ function discoverBenchmarks(rootDir, maxDepth = 2) {
|
|
|
1123
1128
|
}
|
|
1124
1129
|
}
|
|
1125
1130
|
scan(absRoot, 0);
|
|
1126
|
-
return results;
|
|
1131
|
+
return results.sort();
|
|
1127
1132
|
}
|
|
1128
1133
|
|
|
1129
1134
|
// src/evaluation/baseline.ts
|