@agentv/core 4.11.2-next.1 → 4.12.0-next.1
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/dist/index.cjs +52 -52
- package/dist/index.cjs.map +1 -1
- package/dist/index.d.cts +25 -25
- package/dist/index.d.ts +25 -25
- package/dist/index.js +43 -43
- package/dist/index.js.map +1 -1
- package/package.json +1 -1
package/dist/index.cjs
CHANGED
|
@@ -1801,7 +1801,7 @@ __export(index_exports, {
|
|
|
1801
1801
|
TranscriptProvider: () => TranscriptProvider,
|
|
1802
1802
|
WorkspaceCreationError: () => WorkspaceCreationError,
|
|
1803
1803
|
WorkspacePoolManager: () => WorkspacePoolManager,
|
|
1804
|
-
|
|
1804
|
+
addBenchmark: () => addBenchmark,
|
|
1805
1805
|
assembleLlmGraderPrompt: () => assembleLlmGraderPrompt,
|
|
1806
1806
|
assembleLlmJudgePrompt: () => assembleLlmGraderPrompt,
|
|
1807
1807
|
avgToolDurationMs: () => avgToolDurationMs,
|
|
@@ -1833,17 +1833,17 @@ __export(index_exports, {
|
|
|
1833
1833
|
createTempWorkspace: () => createTempWorkspace,
|
|
1834
1834
|
deepEqual: () => deepEqual,
|
|
1835
1835
|
defineConfig: () => defineConfig,
|
|
1836
|
+
deriveBenchmarkId: () => deriveBenchmarkId,
|
|
1836
1837
|
deriveCategory: () => deriveCategory,
|
|
1837
|
-
deriveProjectId: () => deriveProjectId,
|
|
1838
1838
|
detectFormat: () => detectFormat,
|
|
1839
1839
|
directorySizeBytes: () => directorySizeBytes,
|
|
1840
1840
|
discoverAssertions: () => discoverAssertions,
|
|
1841
|
+
discoverBenchmarks: () => discoverBenchmarks,
|
|
1841
1842
|
discoverClaudeSessions: () => discoverClaudeSessions,
|
|
1842
1843
|
discoverCodexSessions: () => discoverCodexSessions,
|
|
1843
1844
|
discoverCopilotSessions: () => discoverCopilotSessions,
|
|
1844
1845
|
discoverGraders: () => discoverGraders,
|
|
1845
1846
|
discoverJudges: () => discoverGraders,
|
|
1846
|
-
discoverProjects: () => discoverProjects,
|
|
1847
1847
|
discoverProviders: () => discoverProviders,
|
|
1848
1848
|
ensureResultsRepoClone: () => ensureResultsRepoClone,
|
|
1849
1849
|
ensureVSCodeSubagents: () => ensureVSCodeSubagents,
|
|
@@ -1867,9 +1867,9 @@ __export(index_exports, {
|
|
|
1867
1867
|
freeformEvaluationSchema: () => freeformEvaluationSchema,
|
|
1868
1868
|
generateRubrics: () => generateRubrics,
|
|
1869
1869
|
getAgentvHome: () => getAgentvHome,
|
|
1870
|
+
getBenchmark: () => getBenchmark,
|
|
1871
|
+
getBenchmarksRegistryPath: () => getBenchmarksRegistryPath,
|
|
1870
1872
|
getOutputFilenames: () => getOutputFilenames,
|
|
1871
|
-
getProject: () => getProject,
|
|
1872
|
-
getProjectsRegistryPath: () => getProjectsRegistryPath,
|
|
1873
1873
|
getResultsRepoCachePaths: () => getResultsRepoCachePaths,
|
|
1874
1874
|
getResultsRepoStatus: () => getResultsRepoStatus,
|
|
1875
1875
|
getSubagentsRoot: () => getSubagentsRoot,
|
|
@@ -1889,11 +1889,11 @@ __export(index_exports, {
|
|
|
1889
1889
|
isTestMessage: () => isTestMessage,
|
|
1890
1890
|
isTestMessageRole: () => isTestMessageRole,
|
|
1891
1891
|
listTargetNames: () => listTargetNames,
|
|
1892
|
+
loadBenchmarkRegistry: () => loadBenchmarkRegistry,
|
|
1892
1893
|
loadConfig: () => loadConfig,
|
|
1893
1894
|
loadEvalCaseById: () => loadEvalCaseById,
|
|
1894
1895
|
loadEvalCases: () => loadEvalCases,
|
|
1895
1896
|
loadEvalSuite: () => loadEvalSuite,
|
|
1896
|
-
loadProjectRegistry: () => loadProjectRegistry,
|
|
1897
1897
|
loadTestById: () => loadTestById,
|
|
1898
1898
|
loadTestSuite: () => loadTestSuite,
|
|
1899
1899
|
loadTests: () => loadTests,
|
|
@@ -1916,7 +1916,7 @@ __export(index_exports, {
|
|
|
1916
1916
|
readTextFile: () => readTextFile,
|
|
1917
1917
|
readTranscriptFile: () => readTranscriptFile,
|
|
1918
1918
|
readTranscriptJsonl: () => readTranscriptJsonl,
|
|
1919
|
-
|
|
1919
|
+
removeBenchmark: () => removeBenchmark,
|
|
1920
1920
|
resolveAndCreateProvider: () => resolveAndCreateProvider,
|
|
1921
1921
|
resolveDelegatedTargetDefinition: () => resolveDelegatedTargetDefinition,
|
|
1922
1922
|
resolveFileReference: () => resolveFileReference3,
|
|
@@ -1938,7 +1938,7 @@ __export(index_exports, {
|
|
|
1938
1938
|
runIsJsonAssertion: () => runIsJsonAssertion,
|
|
1939
1939
|
runRegexAssertion: () => runRegexAssertion,
|
|
1940
1940
|
runStartsWithAssertion: () => runStartsWithAssertion,
|
|
1941
|
-
|
|
1941
|
+
saveBenchmarkRegistry: () => saveBenchmarkRegistry,
|
|
1942
1942
|
scanRepoDeps: () => scanRepoDeps,
|
|
1943
1943
|
scoreToVerdict: () => scoreToVerdict,
|
|
1944
1944
|
shouldEnableCache: () => shouldEnableCache,
|
|
@@ -1955,7 +1955,7 @@ __export(index_exports, {
|
|
|
1955
1955
|
toSnakeCaseDeep: () => toSnakeCaseDeep,
|
|
1956
1956
|
toTranscriptJsonLine: () => toTranscriptJsonLine,
|
|
1957
1957
|
tokensPerTool: () => tokensPerTool,
|
|
1958
|
-
|
|
1958
|
+
touchBenchmark: () => touchBenchmark,
|
|
1959
1959
|
transpileEvalYaml: () => transpileEvalYaml,
|
|
1960
1960
|
transpileEvalYamlFile: () => transpileEvalYamlFile,
|
|
1961
1961
|
trimBaselineResult: () => trimBaselineResult
|
|
@@ -23102,41 +23102,41 @@ async function createDraftResultsPr(params) {
|
|
|
23102
23102
|
return stdout.trim();
|
|
23103
23103
|
}
|
|
23104
23104
|
|
|
23105
|
-
// src/
|
|
23105
|
+
// src/benchmarks.ts
|
|
23106
23106
|
init_cjs_shims();
|
|
23107
23107
|
var import_node_fs19 = require("fs");
|
|
23108
23108
|
var import_node_path54 = __toESM(require("path"), 1);
|
|
23109
23109
|
var import_yaml10 = require("yaml");
|
|
23110
|
-
function
|
|
23110
|
+
function getBenchmarksRegistryPath() {
|
|
23111
23111
|
return import_node_path54.default.join(getAgentvHome(), "projects.yaml");
|
|
23112
23112
|
}
|
|
23113
|
-
function
|
|
23114
|
-
const registryPath =
|
|
23113
|
+
function loadBenchmarkRegistry() {
|
|
23114
|
+
const registryPath = getBenchmarksRegistryPath();
|
|
23115
23115
|
if (!(0, import_node_fs19.existsSync)(registryPath)) {
|
|
23116
|
-
return {
|
|
23116
|
+
return { benchmarks: [] };
|
|
23117
23117
|
}
|
|
23118
23118
|
try {
|
|
23119
23119
|
const raw = (0, import_node_fs19.readFileSync)(registryPath, "utf-8");
|
|
23120
23120
|
const parsed = (0, import_yaml10.parse)(raw);
|
|
23121
|
-
if (!parsed || !Array.isArray(parsed.
|
|
23122
|
-
return {
|
|
23121
|
+
if (!parsed || !Array.isArray(parsed.benchmarks)) {
|
|
23122
|
+
return { benchmarks: [] };
|
|
23123
23123
|
}
|
|
23124
|
-
return {
|
|
23124
|
+
return { benchmarks: parsed.benchmarks };
|
|
23125
23125
|
} catch {
|
|
23126
|
-
return {
|
|
23126
|
+
return { benchmarks: [] };
|
|
23127
23127
|
}
|
|
23128
23128
|
}
|
|
23129
|
-
function
|
|
23130
|
-
const registryPath =
|
|
23129
|
+
function saveBenchmarkRegistry(registry) {
|
|
23130
|
+
const registryPath = getBenchmarksRegistryPath();
|
|
23131
23131
|
const dir = import_node_path54.default.dirname(registryPath);
|
|
23132
23132
|
if (!(0, import_node_fs19.existsSync)(dir)) {
|
|
23133
23133
|
(0, import_node_fs19.mkdirSync)(dir, { recursive: true });
|
|
23134
23134
|
}
|
|
23135
|
-
(0, import_node_fs19.writeFileSync)(registryPath, (0, import_yaml10.stringify)(registry), "utf-8");
|
|
23135
|
+
(0, import_node_fs19.writeFileSync)(registryPath, (0, import_yaml10.stringify)({ benchmarks: registry.benchmarks }), "utf-8");
|
|
23136
23136
|
}
|
|
23137
|
-
function
|
|
23137
|
+
function deriveBenchmarkId(dirPath, existingIds) {
|
|
23138
23138
|
const base = import_node_path54.default.basename(dirPath).toLowerCase().replace(/[^a-z0-9-]/g, "-").replace(/-+/g, "-").replace(/^-|-$/g, "");
|
|
23139
|
-
let candidate = base || "
|
|
23139
|
+
let candidate = base || "benchmark";
|
|
23140
23140
|
let suffix = 2;
|
|
23141
23141
|
while (existingIds.includes(candidate)) {
|
|
23142
23142
|
candidate = `${base}-${suffix}`;
|
|
@@ -23144,54 +23144,54 @@ function deriveProjectId(dirPath, existingIds) {
|
|
|
23144
23144
|
}
|
|
23145
23145
|
return candidate;
|
|
23146
23146
|
}
|
|
23147
|
-
function
|
|
23148
|
-
const absPath = import_node_path54.default.resolve(
|
|
23147
|
+
function addBenchmark(benchmarkPath) {
|
|
23148
|
+
const absPath = import_node_path54.default.resolve(benchmarkPath);
|
|
23149
23149
|
if (!(0, import_node_fs19.existsSync)(absPath)) {
|
|
23150
23150
|
throw new Error(`Directory not found: ${absPath}`);
|
|
23151
23151
|
}
|
|
23152
23152
|
if (!(0, import_node_fs19.existsSync)(import_node_path54.default.join(absPath, ".agentv"))) {
|
|
23153
23153
|
throw new Error(`No .agentv/ directory found in ${absPath}. Run an evaluation first.`);
|
|
23154
23154
|
}
|
|
23155
|
-
const registry =
|
|
23156
|
-
const existing = registry.
|
|
23155
|
+
const registry = loadBenchmarkRegistry();
|
|
23156
|
+
const existing = registry.benchmarks.find((p) => p.path === absPath);
|
|
23157
23157
|
if (existing) {
|
|
23158
23158
|
return existing;
|
|
23159
23159
|
}
|
|
23160
23160
|
const now = (/* @__PURE__ */ new Date()).toISOString();
|
|
23161
23161
|
const entry = {
|
|
23162
|
-
id:
|
|
23162
|
+
id: deriveBenchmarkId(
|
|
23163
23163
|
absPath,
|
|
23164
|
-
registry.
|
|
23164
|
+
registry.benchmarks.map((p) => p.id)
|
|
23165
23165
|
),
|
|
23166
23166
|
name: import_node_path54.default.basename(absPath),
|
|
23167
23167
|
path: absPath,
|
|
23168
23168
|
addedAt: now,
|
|
23169
23169
|
lastOpenedAt: now
|
|
23170
23170
|
};
|
|
23171
|
-
registry.
|
|
23172
|
-
|
|
23171
|
+
registry.benchmarks.push(entry);
|
|
23172
|
+
saveBenchmarkRegistry(registry);
|
|
23173
23173
|
return entry;
|
|
23174
23174
|
}
|
|
23175
|
-
function
|
|
23176
|
-
const registry =
|
|
23177
|
-
const idx = registry.
|
|
23175
|
+
function removeBenchmark(benchmarkId) {
|
|
23176
|
+
const registry = loadBenchmarkRegistry();
|
|
23177
|
+
const idx = registry.benchmarks.findIndex((p) => p.id === benchmarkId);
|
|
23178
23178
|
if (idx < 0) return false;
|
|
23179
|
-
registry.
|
|
23180
|
-
|
|
23179
|
+
registry.benchmarks.splice(idx, 1);
|
|
23180
|
+
saveBenchmarkRegistry(registry);
|
|
23181
23181
|
return true;
|
|
23182
23182
|
}
|
|
23183
|
-
function
|
|
23184
|
-
return
|
|
23183
|
+
function getBenchmark(benchmarkId) {
|
|
23184
|
+
return loadBenchmarkRegistry().benchmarks.find((p) => p.id === benchmarkId);
|
|
23185
23185
|
}
|
|
23186
|
-
function
|
|
23187
|
-
const registry =
|
|
23188
|
-
const entry = registry.
|
|
23186
|
+
function touchBenchmark(benchmarkId) {
|
|
23187
|
+
const registry = loadBenchmarkRegistry();
|
|
23188
|
+
const entry = registry.benchmarks.find((p) => p.id === benchmarkId);
|
|
23189
23189
|
if (entry) {
|
|
23190
23190
|
entry.lastOpenedAt = (/* @__PURE__ */ new Date()).toISOString();
|
|
23191
|
-
|
|
23191
|
+
saveBenchmarkRegistry(registry);
|
|
23192
23192
|
}
|
|
23193
23193
|
}
|
|
23194
|
-
function
|
|
23194
|
+
function discoverBenchmarks(rootDir, maxDepth = 2) {
|
|
23195
23195
|
const absRoot = import_node_path54.default.resolve(rootDir);
|
|
23196
23196
|
if (!(0, import_node_fs19.existsSync)(absRoot) || !(0, import_node_fs19.statSync)(absRoot).isDirectory()) {
|
|
23197
23197
|
return [];
|
|
@@ -24379,7 +24379,7 @@ function createAgentKernel() {
|
|
|
24379
24379
|
TranscriptProvider,
|
|
24380
24380
|
WorkspaceCreationError,
|
|
24381
24381
|
WorkspacePoolManager,
|
|
24382
|
-
|
|
24382
|
+
addBenchmark,
|
|
24383
24383
|
assembleLlmGraderPrompt,
|
|
24384
24384
|
assembleLlmJudgePrompt,
|
|
24385
24385
|
avgToolDurationMs,
|
|
@@ -24411,17 +24411,17 @@ function createAgentKernel() {
|
|
|
24411
24411
|
createTempWorkspace,
|
|
24412
24412
|
deepEqual,
|
|
24413
24413
|
defineConfig,
|
|
24414
|
+
deriveBenchmarkId,
|
|
24414
24415
|
deriveCategory,
|
|
24415
|
-
deriveProjectId,
|
|
24416
24416
|
detectFormat,
|
|
24417
24417
|
directorySizeBytes,
|
|
24418
24418
|
discoverAssertions,
|
|
24419
|
+
discoverBenchmarks,
|
|
24419
24420
|
discoverClaudeSessions,
|
|
24420
24421
|
discoverCodexSessions,
|
|
24421
24422
|
discoverCopilotSessions,
|
|
24422
24423
|
discoverGraders,
|
|
24423
24424
|
discoverJudges,
|
|
24424
|
-
discoverProjects,
|
|
24425
24425
|
discoverProviders,
|
|
24426
24426
|
ensureResultsRepoClone,
|
|
24427
24427
|
ensureVSCodeSubagents,
|
|
@@ -24445,9 +24445,9 @@ function createAgentKernel() {
|
|
|
24445
24445
|
freeformEvaluationSchema,
|
|
24446
24446
|
generateRubrics,
|
|
24447
24447
|
getAgentvHome,
|
|
24448
|
+
getBenchmark,
|
|
24449
|
+
getBenchmarksRegistryPath,
|
|
24448
24450
|
getOutputFilenames,
|
|
24449
|
-
getProject,
|
|
24450
|
-
getProjectsRegistryPath,
|
|
24451
24451
|
getResultsRepoCachePaths,
|
|
24452
24452
|
getResultsRepoStatus,
|
|
24453
24453
|
getSubagentsRoot,
|
|
@@ -24467,11 +24467,11 @@ function createAgentKernel() {
|
|
|
24467
24467
|
isTestMessage,
|
|
24468
24468
|
isTestMessageRole,
|
|
24469
24469
|
listTargetNames,
|
|
24470
|
+
loadBenchmarkRegistry,
|
|
24470
24471
|
loadConfig,
|
|
24471
24472
|
loadEvalCaseById,
|
|
24472
24473
|
loadEvalCases,
|
|
24473
24474
|
loadEvalSuite,
|
|
24474
|
-
loadProjectRegistry,
|
|
24475
24475
|
loadTestById,
|
|
24476
24476
|
loadTestSuite,
|
|
24477
24477
|
loadTests,
|
|
@@ -24494,7 +24494,7 @@ function createAgentKernel() {
|
|
|
24494
24494
|
readTextFile,
|
|
24495
24495
|
readTranscriptFile,
|
|
24496
24496
|
readTranscriptJsonl,
|
|
24497
|
-
|
|
24497
|
+
removeBenchmark,
|
|
24498
24498
|
resolveAndCreateProvider,
|
|
24499
24499
|
resolveDelegatedTargetDefinition,
|
|
24500
24500
|
resolveFileReference,
|
|
@@ -24516,7 +24516,7 @@ function createAgentKernel() {
|
|
|
24516
24516
|
runIsJsonAssertion,
|
|
24517
24517
|
runRegexAssertion,
|
|
24518
24518
|
runStartsWithAssertion,
|
|
24519
|
-
|
|
24519
|
+
saveBenchmarkRegistry,
|
|
24520
24520
|
scanRepoDeps,
|
|
24521
24521
|
scoreToVerdict,
|
|
24522
24522
|
shouldEnableCache,
|
|
@@ -24533,7 +24533,7 @@ function createAgentKernel() {
|
|
|
24533
24533
|
toSnakeCaseDeep,
|
|
24534
24534
|
toTranscriptJsonLine,
|
|
24535
24535
|
tokensPerTool,
|
|
24536
|
-
|
|
24536
|
+
touchBenchmark,
|
|
24537
24537
|
transpileEvalYaml,
|
|
24538
24538
|
transpileEvalYamlFile,
|
|
24539
24539
|
trimBaselineResult
|