@agentv/core 4.11.2-next.1 → 4.12.0-next.1

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
package/dist/index.cjs CHANGED
@@ -1801,7 +1801,7 @@ __export(index_exports, {
1801
1801
  TranscriptProvider: () => TranscriptProvider,
1802
1802
  WorkspaceCreationError: () => WorkspaceCreationError,
1803
1803
  WorkspacePoolManager: () => WorkspacePoolManager,
1804
- addProject: () => addProject,
1804
+ addBenchmark: () => addBenchmark,
1805
1805
  assembleLlmGraderPrompt: () => assembleLlmGraderPrompt,
1806
1806
  assembleLlmJudgePrompt: () => assembleLlmGraderPrompt,
1807
1807
  avgToolDurationMs: () => avgToolDurationMs,
@@ -1833,17 +1833,17 @@ __export(index_exports, {
1833
1833
  createTempWorkspace: () => createTempWorkspace,
1834
1834
  deepEqual: () => deepEqual,
1835
1835
  defineConfig: () => defineConfig,
1836
+ deriveBenchmarkId: () => deriveBenchmarkId,
1836
1837
  deriveCategory: () => deriveCategory,
1837
- deriveProjectId: () => deriveProjectId,
1838
1838
  detectFormat: () => detectFormat,
1839
1839
  directorySizeBytes: () => directorySizeBytes,
1840
1840
  discoverAssertions: () => discoverAssertions,
1841
+ discoverBenchmarks: () => discoverBenchmarks,
1841
1842
  discoverClaudeSessions: () => discoverClaudeSessions,
1842
1843
  discoverCodexSessions: () => discoverCodexSessions,
1843
1844
  discoverCopilotSessions: () => discoverCopilotSessions,
1844
1845
  discoverGraders: () => discoverGraders,
1845
1846
  discoverJudges: () => discoverGraders,
1846
- discoverProjects: () => discoverProjects,
1847
1847
  discoverProviders: () => discoverProviders,
1848
1848
  ensureResultsRepoClone: () => ensureResultsRepoClone,
1849
1849
  ensureVSCodeSubagents: () => ensureVSCodeSubagents,
@@ -1867,9 +1867,9 @@ __export(index_exports, {
1867
1867
  freeformEvaluationSchema: () => freeformEvaluationSchema,
1868
1868
  generateRubrics: () => generateRubrics,
1869
1869
  getAgentvHome: () => getAgentvHome,
1870
+ getBenchmark: () => getBenchmark,
1871
+ getBenchmarksRegistryPath: () => getBenchmarksRegistryPath,
1870
1872
  getOutputFilenames: () => getOutputFilenames,
1871
- getProject: () => getProject,
1872
- getProjectsRegistryPath: () => getProjectsRegistryPath,
1873
1873
  getResultsRepoCachePaths: () => getResultsRepoCachePaths,
1874
1874
  getResultsRepoStatus: () => getResultsRepoStatus,
1875
1875
  getSubagentsRoot: () => getSubagentsRoot,
@@ -1889,11 +1889,11 @@ __export(index_exports, {
1889
1889
  isTestMessage: () => isTestMessage,
1890
1890
  isTestMessageRole: () => isTestMessageRole,
1891
1891
  listTargetNames: () => listTargetNames,
1892
+ loadBenchmarkRegistry: () => loadBenchmarkRegistry,
1892
1893
  loadConfig: () => loadConfig,
1893
1894
  loadEvalCaseById: () => loadEvalCaseById,
1894
1895
  loadEvalCases: () => loadEvalCases,
1895
1896
  loadEvalSuite: () => loadEvalSuite,
1896
- loadProjectRegistry: () => loadProjectRegistry,
1897
1897
  loadTestById: () => loadTestById,
1898
1898
  loadTestSuite: () => loadTestSuite,
1899
1899
  loadTests: () => loadTests,
@@ -1916,7 +1916,7 @@ __export(index_exports, {
1916
1916
  readTextFile: () => readTextFile,
1917
1917
  readTranscriptFile: () => readTranscriptFile,
1918
1918
  readTranscriptJsonl: () => readTranscriptJsonl,
1919
- removeProject: () => removeProject,
1919
+ removeBenchmark: () => removeBenchmark,
1920
1920
  resolveAndCreateProvider: () => resolveAndCreateProvider,
1921
1921
  resolveDelegatedTargetDefinition: () => resolveDelegatedTargetDefinition,
1922
1922
  resolveFileReference: () => resolveFileReference3,
@@ -1938,7 +1938,7 @@ __export(index_exports, {
1938
1938
  runIsJsonAssertion: () => runIsJsonAssertion,
1939
1939
  runRegexAssertion: () => runRegexAssertion,
1940
1940
  runStartsWithAssertion: () => runStartsWithAssertion,
1941
- saveProjectRegistry: () => saveProjectRegistry,
1941
+ saveBenchmarkRegistry: () => saveBenchmarkRegistry,
1942
1942
  scanRepoDeps: () => scanRepoDeps,
1943
1943
  scoreToVerdict: () => scoreToVerdict,
1944
1944
  shouldEnableCache: () => shouldEnableCache,
@@ -1955,7 +1955,7 @@ __export(index_exports, {
1955
1955
  toSnakeCaseDeep: () => toSnakeCaseDeep,
1956
1956
  toTranscriptJsonLine: () => toTranscriptJsonLine,
1957
1957
  tokensPerTool: () => tokensPerTool,
1958
- touchProject: () => touchProject,
1958
+ touchBenchmark: () => touchBenchmark,
1959
1959
  transpileEvalYaml: () => transpileEvalYaml,
1960
1960
  transpileEvalYamlFile: () => transpileEvalYamlFile,
1961
1961
  trimBaselineResult: () => trimBaselineResult
@@ -23102,41 +23102,41 @@ async function createDraftResultsPr(params) {
23102
23102
  return stdout.trim();
23103
23103
  }
23104
23104
 
23105
- // src/projects.ts
23105
+ // src/benchmarks.ts
23106
23106
  init_cjs_shims();
23107
23107
  var import_node_fs19 = require("fs");
23108
23108
  var import_node_path54 = __toESM(require("path"), 1);
23109
23109
  var import_yaml10 = require("yaml");
23110
- function getProjectsRegistryPath() {
23110
+ function getBenchmarksRegistryPath() {
23111
23111
  return import_node_path54.default.join(getAgentvHome(), "projects.yaml");
23112
23112
  }
23113
- function loadProjectRegistry() {
23114
- const registryPath = getProjectsRegistryPath();
23113
+ function loadBenchmarkRegistry() {
23114
+ const registryPath = getBenchmarksRegistryPath();
23115
23115
  if (!(0, import_node_fs19.existsSync)(registryPath)) {
23116
- return { projects: [] };
23116
+ return { benchmarks: [] };
23117
23117
  }
23118
23118
  try {
23119
23119
  const raw = (0, import_node_fs19.readFileSync)(registryPath, "utf-8");
23120
23120
  const parsed = (0, import_yaml10.parse)(raw);
23121
- if (!parsed || !Array.isArray(parsed.projects)) {
23122
- return { projects: [] };
23121
+ if (!parsed || !Array.isArray(parsed.benchmarks)) {
23122
+ return { benchmarks: [] };
23123
23123
  }
23124
- return { projects: parsed.projects };
23124
+ return { benchmarks: parsed.benchmarks };
23125
23125
  } catch {
23126
- return { projects: [] };
23126
+ return { benchmarks: [] };
23127
23127
  }
23128
23128
  }
23129
- function saveProjectRegistry(registry) {
23130
- const registryPath = getProjectsRegistryPath();
23129
+ function saveBenchmarkRegistry(registry) {
23130
+ const registryPath = getBenchmarksRegistryPath();
23131
23131
  const dir = import_node_path54.default.dirname(registryPath);
23132
23132
  if (!(0, import_node_fs19.existsSync)(dir)) {
23133
23133
  (0, import_node_fs19.mkdirSync)(dir, { recursive: true });
23134
23134
  }
23135
- (0, import_node_fs19.writeFileSync)(registryPath, (0, import_yaml10.stringify)(registry), "utf-8");
23135
+ (0, import_node_fs19.writeFileSync)(registryPath, (0, import_yaml10.stringify)({ benchmarks: registry.benchmarks }), "utf-8");
23136
23136
  }
23137
- function deriveProjectId(dirPath, existingIds) {
23137
+ function deriveBenchmarkId(dirPath, existingIds) {
23138
23138
  const base = import_node_path54.default.basename(dirPath).toLowerCase().replace(/[^a-z0-9-]/g, "-").replace(/-+/g, "-").replace(/^-|-$/g, "");
23139
- let candidate = base || "project";
23139
+ let candidate = base || "benchmark";
23140
23140
  let suffix = 2;
23141
23141
  while (existingIds.includes(candidate)) {
23142
23142
  candidate = `${base}-${suffix}`;
@@ -23144,54 +23144,54 @@ function deriveProjectId(dirPath, existingIds) {
23144
23144
  }
23145
23145
  return candidate;
23146
23146
  }
23147
- function addProject(projectPath) {
23148
- const absPath = import_node_path54.default.resolve(projectPath);
23147
+ function addBenchmark(benchmarkPath) {
23148
+ const absPath = import_node_path54.default.resolve(benchmarkPath);
23149
23149
  if (!(0, import_node_fs19.existsSync)(absPath)) {
23150
23150
  throw new Error(`Directory not found: ${absPath}`);
23151
23151
  }
23152
23152
  if (!(0, import_node_fs19.existsSync)(import_node_path54.default.join(absPath, ".agentv"))) {
23153
23153
  throw new Error(`No .agentv/ directory found in ${absPath}. Run an evaluation first.`);
23154
23154
  }
23155
- const registry = loadProjectRegistry();
23156
- const existing = registry.projects.find((p) => p.path === absPath);
23155
+ const registry = loadBenchmarkRegistry();
23156
+ const existing = registry.benchmarks.find((p) => p.path === absPath);
23157
23157
  if (existing) {
23158
23158
  return existing;
23159
23159
  }
23160
23160
  const now = (/* @__PURE__ */ new Date()).toISOString();
23161
23161
  const entry = {
23162
- id: deriveProjectId(
23162
+ id: deriveBenchmarkId(
23163
23163
  absPath,
23164
- registry.projects.map((p) => p.id)
23164
+ registry.benchmarks.map((p) => p.id)
23165
23165
  ),
23166
23166
  name: import_node_path54.default.basename(absPath),
23167
23167
  path: absPath,
23168
23168
  addedAt: now,
23169
23169
  lastOpenedAt: now
23170
23170
  };
23171
- registry.projects.push(entry);
23172
- saveProjectRegistry(registry);
23171
+ registry.benchmarks.push(entry);
23172
+ saveBenchmarkRegistry(registry);
23173
23173
  return entry;
23174
23174
  }
23175
- function removeProject(projectId) {
23176
- const registry = loadProjectRegistry();
23177
- const idx = registry.projects.findIndex((p) => p.id === projectId);
23175
+ function removeBenchmark(benchmarkId) {
23176
+ const registry = loadBenchmarkRegistry();
23177
+ const idx = registry.benchmarks.findIndex((p) => p.id === benchmarkId);
23178
23178
  if (idx < 0) return false;
23179
- registry.projects.splice(idx, 1);
23180
- saveProjectRegistry(registry);
23179
+ registry.benchmarks.splice(idx, 1);
23180
+ saveBenchmarkRegistry(registry);
23181
23181
  return true;
23182
23182
  }
23183
- function getProject(projectId) {
23184
- return loadProjectRegistry().projects.find((p) => p.id === projectId);
23183
+ function getBenchmark(benchmarkId) {
23184
+ return loadBenchmarkRegistry().benchmarks.find((p) => p.id === benchmarkId);
23185
23185
  }
23186
- function touchProject(projectId) {
23187
- const registry = loadProjectRegistry();
23188
- const entry = registry.projects.find((p) => p.id === projectId);
23186
+ function touchBenchmark(benchmarkId) {
23187
+ const registry = loadBenchmarkRegistry();
23188
+ const entry = registry.benchmarks.find((p) => p.id === benchmarkId);
23189
23189
  if (entry) {
23190
23190
  entry.lastOpenedAt = (/* @__PURE__ */ new Date()).toISOString();
23191
- saveProjectRegistry(registry);
23191
+ saveBenchmarkRegistry(registry);
23192
23192
  }
23193
23193
  }
23194
- function discoverProjects(rootDir, maxDepth = 2) {
23194
+ function discoverBenchmarks(rootDir, maxDepth = 2) {
23195
23195
  const absRoot = import_node_path54.default.resolve(rootDir);
23196
23196
  if (!(0, import_node_fs19.existsSync)(absRoot) || !(0, import_node_fs19.statSync)(absRoot).isDirectory()) {
23197
23197
  return [];
@@ -24379,7 +24379,7 @@ function createAgentKernel() {
24379
24379
  TranscriptProvider,
24380
24380
  WorkspaceCreationError,
24381
24381
  WorkspacePoolManager,
24382
- addProject,
24382
+ addBenchmark,
24383
24383
  assembleLlmGraderPrompt,
24384
24384
  assembleLlmJudgePrompt,
24385
24385
  avgToolDurationMs,
@@ -24411,17 +24411,17 @@ function createAgentKernel() {
24411
24411
  createTempWorkspace,
24412
24412
  deepEqual,
24413
24413
  defineConfig,
24414
+ deriveBenchmarkId,
24414
24415
  deriveCategory,
24415
- deriveProjectId,
24416
24416
  detectFormat,
24417
24417
  directorySizeBytes,
24418
24418
  discoverAssertions,
24419
+ discoverBenchmarks,
24419
24420
  discoverClaudeSessions,
24420
24421
  discoverCodexSessions,
24421
24422
  discoverCopilotSessions,
24422
24423
  discoverGraders,
24423
24424
  discoverJudges,
24424
- discoverProjects,
24425
24425
  discoverProviders,
24426
24426
  ensureResultsRepoClone,
24427
24427
  ensureVSCodeSubagents,
@@ -24445,9 +24445,9 @@ function createAgentKernel() {
24445
24445
  freeformEvaluationSchema,
24446
24446
  generateRubrics,
24447
24447
  getAgentvHome,
24448
+ getBenchmark,
24449
+ getBenchmarksRegistryPath,
24448
24450
  getOutputFilenames,
24449
- getProject,
24450
- getProjectsRegistryPath,
24451
24451
  getResultsRepoCachePaths,
24452
24452
  getResultsRepoStatus,
24453
24453
  getSubagentsRoot,
@@ -24467,11 +24467,11 @@ function createAgentKernel() {
24467
24467
  isTestMessage,
24468
24468
  isTestMessageRole,
24469
24469
  listTargetNames,
24470
+ loadBenchmarkRegistry,
24470
24471
  loadConfig,
24471
24472
  loadEvalCaseById,
24472
24473
  loadEvalCases,
24473
24474
  loadEvalSuite,
24474
- loadProjectRegistry,
24475
24475
  loadTestById,
24476
24476
  loadTestSuite,
24477
24477
  loadTests,
@@ -24494,7 +24494,7 @@ function createAgentKernel() {
24494
24494
  readTextFile,
24495
24495
  readTranscriptFile,
24496
24496
  readTranscriptJsonl,
24497
- removeProject,
24497
+ removeBenchmark,
24498
24498
  resolveAndCreateProvider,
24499
24499
  resolveDelegatedTargetDefinition,
24500
24500
  resolveFileReference,
@@ -24516,7 +24516,7 @@ function createAgentKernel() {
24516
24516
  runIsJsonAssertion,
24517
24517
  runRegexAssertion,
24518
24518
  runStartsWithAssertion,
24519
- saveProjectRegistry,
24519
+ saveBenchmarkRegistry,
24520
24520
  scanRepoDeps,
24521
24521
  scoreToVerdict,
24522
24522
  shouldEnableCache,
@@ -24533,7 +24533,7 @@ function createAgentKernel() {
24533
24533
  toSnakeCaseDeep,
24534
24534
  toTranscriptJsonLine,
24535
24535
  tokensPerTool,
24536
- touchProject,
24536
+ touchBenchmark,
24537
24537
  transpileEvalYaml,
24538
24538
  transpileEvalYamlFile,
24539
24539
  trimBaselineResult