@agentv/core 4.15.9-next.1 → 4.16.0-next.1

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
package/dist/index.js CHANGED
@@ -25,7 +25,7 @@ import {
25
25
  resolveDelegatedTargetDefinition,
26
26
  resolveFileReference,
27
27
  resolveTargetDefinition
28
- } from "./chunk-HVEQNYTC.js";
28
+ } from "./chunk-6VZY3B6M.js";
29
29
  import {
30
30
  execFileWithStdin,
31
31
  execShellWithStdin
@@ -483,17 +483,76 @@ function extractTargetFromSuite(suite) {
483
483
  }
484
484
  return void 0;
485
485
  }
486
- function extractTargetsFromSuite(suite) {
486
/**
 * Collects the target references declared under `suite.execution.targets`.
 *
 * Each entry may be either a non-blank string (the target name) or an object
 * with a string `name` and optional `use_target` / `hooks` fields. Entries
 * that do not match either shape, or whose name is blank, are ignored.
 *
 * @param {object} suite - Parsed suite document.
 * @returns {Array<{name: string, use_target?: string, hooks?: object}>|undefined}
 *   The normalized references, or `undefined` when `execution` is not a plain
 *   object, `targets` is not an array, or no usable entry was found.
 */
function extractTargetRefsFromSuite(suite) {
  const execution = suite.execution;
  const isPlainObject = Boolean(execution) && typeof execution === "object" && !Array.isArray(execution);
  if (!isPlainObject) {
    return void 0;
  }
  const targets = execution.targets;
  if (!Array.isArray(targets)) {
    return void 0;
  }
  const refs = targets.flatMap((entry) => {
    // Shorthand form: the entry is just the target name.
    if (typeof entry === "string") {
      const trimmed = entry.trim();
      return trimmed.length > 0 ? [{ name: trimmed }] : [];
    }
    // Object form: must be a non-array object carrying a `name` key.
    if (!entry || typeof entry !== "object" || Array.isArray(entry) || !("name" in entry)) {
      return [];
    }
    const name = typeof entry.name === "string" ? entry.name.trim() : "";
    if (name.length === 0) {
      return [];
    }
    const ref = { name };
    const useTarget = typeof entry.use_target === "string" ? entry.use_target.trim() : void 0;
    if (useTarget) {
      ref.use_target = useTarget;
    }
    const hooks = parseTargetHooks(entry.hooks);
    if (hooks) {
      ref.hooks = hooks;
    }
    return [ref];
  });
  return refs.length === 0 ? void 0 : refs;
}
514
/**
 * Returns just the target names declared under `suite.execution.targets`.
 *
 * Thin projection over `extractTargetRefsFromSuite`: both string and object
 * entries contribute their name.
 *
 * @param {object} suite - Parsed suite document.
 * @returns {string[]|undefined} Target names, or `undefined` when none exist.
 */
function extractTargetsFromSuite(suite) {
  const refs = extractTargetRefsFromSuite(suite);
  if (!refs) {
    return void 0;
  }
  const names = [];
  for (const { name } of refs) {
    names.push(name);
  }
  return names.length === 0 ? void 0 : names;
}
520
/**
 * Normalizes one raw hook entry into `{ command, timeout_ms?, cwd? }`.
 *
 * The command is taken from `command` first and `script` second. A string is
 * wrapped as `["sh", "-c", <string>]`; an array keeps only its string items.
 * A candidate that yields nothing runnable (blank string, array without any
 * string entries) is treated as absent, so a valid `script` is still used
 * when `command` is empty instead of being shadowed by it.
 *
 * The timeout is read from `timeout_ms`, then `timeoutMs`; only finite,
 * non-negative numbers are accepted, so NaN / Infinity / negative values fall
 * through to the next candidate or are dropped.
 *
 * @param {unknown} raw - Raw hook configuration value.
 * @returns {{command: string[], timeout_ms?: number, cwd?: string}|undefined}
 *   The normalized hook, or `undefined` when `raw` is not a plain object or
 *   defines no runnable command.
 */
function parseHookConfig(raw) {
  if (!raw || typeof raw !== "object" || Array.isArray(raw)) return void 0;

  // Converts a `command` / `script` value to argv, or undefined if unusable.
  const toArgv = (value) => {
    if (typeof value === "string") {
      return value.trim().length > 0 ? ["sh", "-c", value] : void 0;
    }
    if (Array.isArray(value)) {
      const argv = value.filter((part) => typeof part === "string");
      return argv.length > 0 ? argv : void 0;
    }
    return void 0;
  };

  const command = toArgv(raw.command) ?? toArgv(raw.script);
  if (!command) return void 0;

  // First usable timeout wins; invalid numbers never reach the script runner.
  const timeoutMs = [raw.timeout_ms, raw.timeoutMs].find(
    (value) => typeof value === "number" && Number.isFinite(value) && value >= 0
  );
  const cwd = typeof raw.cwd === "string" ? raw.cwd : void 0;

  return {
    command,
    ...(timeoutMs !== void 0 && { timeout_ms: timeoutMs }),
    ...(cwd && { cwd })
  };
}
542
/**
 * Normalizes the `hooks` object of a target reference.
 *
 * Each of `before_all`, `before_each`, `after_each` and `after_all` is passed
 * through `parseHookConfig`; phases that do not produce a hook are omitted.
 *
 * @param {unknown} raw - Raw `hooks` value from a target entry.
 * @returns {object|undefined} Object keyed by phase name, or `undefined` when
 *   `raw` is not an object or no phase yields a hook.
 */
function parseTargetHooks(raw) {
  if (!raw || typeof raw !== "object") {
    return void 0;
  }
  const phases = ["before_all", "before_each", "after_each", "after_all"];
  const parsed = {};
  for (const phase of phases) {
    const hook = parseHookConfig(raw[phase]);
    if (hook) {
      parsed[phase] = hook;
    }
  }
  return Object.keys(parsed).length > 0 ? parsed : void 0;
}
498
557
  function extractWorkersFromSuite(suite) {
499
558
  const execution = suite.execution;
@@ -1160,7 +1219,11 @@ async function parseEvaluatorList(candidateEvaluators, searchRoots, evalId, defa
1160
1219
  }
1161
1220
  const placeholderIndex = result.indexOf(PLACEHOLDER);
1162
1221
  if (strings.length > 0 && placeholderIndex !== -1) {
1163
- result[placeholderIndex] = { type: "rubrics", criteria: strings };
1222
+ result[placeholderIndex] = {
1223
+ type: "rubrics",
1224
+ criteria: strings,
1225
+ weight: strings.length
1226
+ };
1164
1227
  } else if (placeholderIndex !== -1) {
1165
1228
  result.splice(placeholderIndex, 1);
1166
1229
  }
@@ -3555,6 +3618,7 @@ async function readTestSuiteMetadata(testFilePath) {
3555
3618
  return {
3556
3619
  target: extractTargetFromSuite(parsed),
3557
3620
  targets: extractTargetsFromSuite(parsed),
3621
+ targetRefs: extractTargetRefsFromSuite(parsed),
3558
3622
  trials: extractTrialsConfig(parsed)
3559
3623
  };
3560
3624
  } catch {
@@ -3581,6 +3645,7 @@ async function loadTestSuite(evalFilePath, repoRoot, options) {
3581
3645
  tests,
3582
3646
  trials: extractTrialsConfig(parsed),
3583
3647
  targets: extractTargetsFromSuite(parsed),
3648
+ targetRefs: extractTargetRefsFromSuite(parsed),
3584
3649
  workers: extractWorkersFromSuite(parsed),
3585
3650
  cacheConfig: extractCacheConfig(parsed),
3586
3651
  totalBudgetUsd: extractTotalBudgetUsd(parsed),
@@ -5208,7 +5273,7 @@ var ClaudeCliProvider = class {
5208
5273
  if (options.cwd) {
5209
5274
  spawnOptions.cwd = options.cwd;
5210
5275
  }
5211
- const child = spawn("claude", options.args, spawnOptions);
5276
+ const child = spawn(this.config.executable, options.args, spawnOptions);
5212
5277
  let stdout = "";
5213
5278
  let stderr = "";
5214
5279
  let timedOut = false;
@@ -5267,7 +5332,7 @@ var ClaudeCliProvider = class {
5267
5332
  if (err.code === "ENOENT") {
5268
5333
  reject(
5269
5334
  new Error(
5270
- `Claude CLI executable 'claude' was not found on PATH. Install claude-code or ensure it is in PATH.`
5335
+ `Claude CLI executable '${this.config.executable}' was not found on PATH. Install claude-code or ensure it is in PATH.`
5271
5336
  )
5272
5337
  );
5273
5338
  } else {
@@ -9339,6 +9404,9 @@ import { fileURLToPath as fileURLToPath4, pathToFileURL } from "node:url";
9339
9404
  import os2 from "node:os";
9340
9405
  import path22 from "node:path";
9341
9406
  var logged = false;
9407
/**
 * Returns the `.agentv` directory inside the current user's home directory.
 *
 * @returns {string} Path built from `os.homedir()` and `".agentv"`.
 */
function getAgentvConfigDir() {
  const userHome = os2.homedir();
  return path22.join(userHome, ".agentv");
}
9342
9410
  function getAgentvHome() {
9343
9411
  const envHome = process.env.AGENTV_HOME;
9344
9412
  if (envHome && envHome !== "undefined") {
@@ -10010,7 +10078,7 @@ var ProviderRegistry = class {
10010
10078
 
10011
10079
  // src/evaluation/providers/vscode-provider.ts
10012
10080
  import { exec as exec2 } from "node:child_process";
10013
- import { constants as constants3, access as access3, stat as stat6 } from "node:fs/promises";
10081
+ import { constants as constants3, access as access3 } from "node:fs/promises";
10014
10082
  import path34 from "node:path";
10015
10083
  import { promisify as promisify4 } from "node:util";
10016
10084
 
@@ -11151,7 +11219,7 @@ var VSCodeProvider = class {
11151
11219
  await this.ensureEnvironmentReady();
11152
11220
  const inputFiles = normalizeAttachments(request.inputFiles);
11153
11221
  const promptContent = buildPromptDocument2(request, inputFiles);
11154
- const workspaceTemplate = request.workspaceFile ?? await resolveWorkspaceTemplateFile(this.config.workspaceTemplate);
11222
+ const workspaceTemplate = request.workspaceFile;
11155
11223
  const startTime = Date.now();
11156
11224
  const session = await dispatchAgentSession({
11157
11225
  userQuery: promptContent,
@@ -11207,9 +11275,6 @@ var VSCodeProvider = class {
11207
11275
  const userQueries = normalizedRequests.map(
11208
11276
  ({ request, inputFiles }) => buildPromptDocument2(request, inputFiles)
11209
11277
  );
11210
- const batchWorkspaceTemplate = await resolveWorkspaceTemplateFile(
11211
- this.config.workspaceTemplate
11212
- );
11213
11278
  const startTime = Date.now();
11214
11279
  const session = await dispatchBatchAgent({
11215
11280
  userQueries,
@@ -11219,7 +11284,7 @@ var VSCodeProvider = class {
11219
11284
  dryRun: this.config.dryRun,
11220
11285
  vscodeCmd: this.config.executable,
11221
11286
  subagentRoot: this.config.subagentRoot,
11222
- workspaceTemplate: batchWorkspaceTemplate,
11287
+ workspaceTemplate: void 0,
11223
11288
  silent: true,
11224
11289
  timeoutMs: this.config.timeoutMs
11225
11290
  });
@@ -11299,17 +11364,6 @@ async function locateVSCodeExecutable(candidate) {
11299
11364
  `VS Code executable '${candidate}' was not found on PATH. Check the 'executable' setting in your target configuration.`
11300
11365
  );
11301
11366
  }
11302
- async function resolveWorkspaceTemplateFile(template) {
11303
- if (!template) {
11304
- return void 0;
11305
- }
11306
- try {
11307
- const stats = await stat6(path34.resolve(template));
11308
- return stats.isFile() ? template : void 0;
11309
- } catch {
11310
- return template;
11311
- }
11312
- }
11313
11367
  function buildPromptDocument2(request, attachments) {
11314
11368
  const parts = [];
11315
11369
  if (request.systemPrompt && request.systemPrompt.trim().length > 0) {
@@ -12507,7 +12561,7 @@ ${context.fileChanges}`;
12507
12561
  const workspacePath = context.workspacePath;
12508
12562
  if (!workspacePath) {
12509
12563
  throw new Error(
12510
- "llm-grader built-in agent mode requires a workspace_template target (workspacePath is not set)"
12564
+ "llm-grader built-in agent mode requires a workspace (workspacePath is not set)"
12511
12565
  );
12512
12566
  }
12513
12567
  const systemPrompt = this.buildAgentSystemPrompt(context);
@@ -13246,11 +13300,11 @@ function createFilesystemTools(workspacePath) {
13246
13300
  execute: async (input) => {
13247
13301
  try {
13248
13302
  const resolved = resolveSandboxed(workspacePath, input.path);
13249
- const stat13 = await fs2.stat(resolved);
13250
- if (stat13.isDirectory()) {
13303
+ const stat12 = await fs2.stat(resolved);
13304
+ if (stat12.isDirectory()) {
13251
13305
  return { error: `'${input.path}' is a directory, not a file` };
13252
13306
  }
13253
- const buffer = Buffer.alloc(Math.min(stat13.size, MAX_FILE_SIZE));
13307
+ const buffer = Buffer.alloc(Math.min(stat12.size, MAX_FILE_SIZE));
13254
13308
  const fd = await fs2.open(resolved, "r");
13255
13309
  try {
13256
13310
  await fd.read(buffer, 0, buffer.length, 0);
@@ -13258,8 +13312,8 @@ function createFilesystemTools(workspacePath) {
13258
13312
  await fd.close();
13259
13313
  }
13260
13314
  const content = buffer.toString("utf-8");
13261
- const truncated = stat13.size > MAX_FILE_SIZE;
13262
- return { content, truncated, size: stat13.size };
13315
+ const truncated = stat12.size > MAX_FILE_SIZE;
13316
+ return { content, truncated, size: stat12.size };
13263
13317
  } catch (error) {
13264
13318
  return { error: error instanceof Error ? error.message : String(error) };
13265
13319
  }
@@ -13310,8 +13364,8 @@ async function searchDirectory(dirPath, workspacePath, regex, matches) {
13310
13364
  const ext = path37.extname(entry.name).toLowerCase();
13311
13365
  if (BINARY_EXTENSIONS.has(ext)) continue;
13312
13366
  try {
13313
- const stat13 = await fs2.stat(fullPath);
13314
- if (stat13.size > MAX_FILE_SIZE) continue;
13367
+ const stat12 = await fs2.stat(fullPath);
13368
+ if (stat12.size > MAX_FILE_SIZE) continue;
13315
13369
  const content = await fs2.readFile(fullPath, "utf-8");
13316
13370
  const lines = content.split("\n");
13317
13371
  for (let i = 0; i < lines.length; i++) {
@@ -15205,7 +15259,7 @@ function runEqualsAssertion(output, value) {
15205
15259
  import { execFile as execFile3 } from "node:child_process";
15206
15260
  import { createHash as createHash2, randomUUID as randomUUID9 } from "node:crypto";
15207
15261
  import { existsSync as existsSync5 } from "node:fs";
15208
- import { copyFile as copyFile2, mkdir as mkdir14, readdir as readdir8, stat as stat9 } from "node:fs/promises";
15262
+ import { copyFile as copyFile2, mkdir as mkdir14, readdir as readdir8, stat as stat8 } from "node:fs/promises";
15209
15263
  import path45 from "node:path";
15210
15264
  import { promisify as promisify7 } from "node:util";
15211
15265
  import micromatch3 from "micromatch";
@@ -15983,7 +16037,7 @@ function getTCritical(df) {
15983
16037
  }
15984
16038
 
15985
16039
  // src/evaluation/workspace/manager.ts
15986
- import { cp, mkdir as mkdir12, readdir as readdir5, rm as rm4, stat as stat7 } from "node:fs/promises";
16040
+ import { cp, mkdir as mkdir12, readdir as readdir5, rm as rm4, stat as stat6 } from "node:fs/promises";
15987
16041
  import path41 from "node:path";
15988
16042
  var TemplateNotFoundError = class extends Error {
15989
16043
  constructor(templatePath) {
@@ -16006,7 +16060,7 @@ var WorkspaceCreationError = class extends Error {
16006
16060
  };
16007
16061
  async function isDirectory(filePath) {
16008
16062
  try {
16009
- const stats = await stat7(filePath);
16063
+ const stats = await stat6(filePath);
16010
16064
  return stats.isDirectory();
16011
16065
  } catch {
16012
16066
  return false;
@@ -16582,14 +16636,14 @@ ${lines.join("\n")}`;
16582
16636
  };
16583
16637
 
16584
16638
  // src/evaluation/workspace/resolve.ts
16585
- import { readdir as readdir7, stat as stat8 } from "node:fs/promises";
16639
+ import { readdir as readdir7, stat as stat7 } from "node:fs/promises";
16586
16640
  import path44 from "node:path";
16587
16641
  async function resolveWorkspaceTemplate(templatePath) {
16588
16642
  if (!templatePath) {
16589
16643
  return void 0;
16590
16644
  }
16591
16645
  const resolved = path44.resolve(templatePath);
16592
- const stats = await stat8(resolved);
16646
+ const stats = await stat7(resolved);
16593
16647
  if (stats.isFile()) {
16594
16648
  return {
16595
16649
  dir: path44.dirname(resolved),
@@ -16729,13 +16783,6 @@ async function resetWorkspaceRoot(workspacePath, resetMode, baselineRef) {
16729
16783
  await execFileAsync3("git", ["clean", cleanFlag], opts);
16730
16784
  return true;
16731
16785
  }
16732
- function getWorkspaceTemplate(target) {
16733
- const config = target.config;
16734
- if ("workspaceTemplate" in config && typeof config.workspaceTemplate === "string") {
16735
- return config.workspaceTemplate;
16736
- }
16737
- return void 0;
16738
- }
16739
16786
  function validateDependencyGraph(tests) {
16740
16787
  const ids = /* @__PURE__ */ new Set();
16741
16788
  for (const test of tests) {
@@ -17009,7 +17056,7 @@ async function runEvaluation(options) {
17009
17056
  }
17010
17057
  }
17011
17058
  const suiteWorkspace = filteredEvalCases[0]?.workspace;
17012
- const rawTemplate = suiteWorkspace?.template ?? getWorkspaceTemplate(target);
17059
+ const rawTemplate = suiteWorkspace?.template;
17013
17060
  const resolvedTemplate = await resolveWorkspaceTemplate(rawTemplate);
17014
17061
  const workspaceTemplate = resolvedTemplate?.dir;
17015
17062
  let suiteWorkspaceFile = resolvedTemplate?.workspaceFile;
@@ -17102,7 +17149,7 @@ async function runEvaluation(options) {
17102
17149
  let staticMaterialised = false;
17103
17150
  const isYamlConfiguredPath = !cliWorkspacePath && !!yamlWorkspacePath;
17104
17151
  if (useStaticWorkspace && configuredStaticPath) {
17105
- const dirExists = await stat9(configuredStaticPath).then(
17152
+ const dirExists = await stat8(configuredStaticPath).then(
17106
17153
  (s) => s.isDirectory(),
17107
17154
  () => false
17108
17155
  );
@@ -17192,7 +17239,7 @@ async function runEvaluation(options) {
17192
17239
  if (suiteWorkspaceFile && sharedWorkspacePath) {
17193
17240
  const copiedWorkspaceFile = path45.join(sharedWorkspacePath, path45.basename(suiteWorkspaceFile));
17194
17241
  try {
17195
- await stat9(copiedWorkspaceFile);
17242
+ await stat8(copiedWorkspaceFile);
17196
17243
  suiteWorkspaceFile = copiedWorkspaceFile;
17197
17244
  } catch {
17198
17245
  }
@@ -17297,6 +17344,54 @@ async function runEvaluation(options) {
17297
17344
  }
17298
17345
  }
17299
17346
  }
17347
+ const targetHooks = options.targetHooks;
17348
+ const targetBeforeAllHook = targetHooks?.before_all;
17349
+ if (sharedWorkspacePath && hasHookCommand(targetBeforeAllHook)) {
17350
+ const beforeAllCommand = (targetBeforeAllHook.command ?? []).join(" ");
17351
+ setupLog(`running target before_all command=${beforeAllCommand}`);
17352
+ const scriptContext = {
17353
+ workspacePath: sharedWorkspacePath,
17354
+ testId: "__target_before_all__",
17355
+ evalRunId,
17356
+ evalDir,
17357
+ workspaceFileDir: suiteWorkspace?.workspaceFileDir
17358
+ };
17359
+ try {
17360
+ await executeWorkspaceScript(
17361
+ toScriptConfig(targetBeforeAllHook, "before_all", "target hooks"),
17362
+ scriptContext
17363
+ );
17364
+ setupLog("target before_all completed");
17365
+ } catch (error) {
17366
+ const message = error instanceof Error ? error.message : String(error);
17367
+ if (sharedWorkspacePath && !useStaticWorkspace) {
17368
+ await cleanupWorkspace(sharedWorkspacePath).catch(() => {
17369
+ });
17370
+ }
17371
+ throw new Error(`target before_all hook failed: ${message}`);
17372
+ }
17373
+ }
17374
+ if (availablePoolSlots.length > 0 && hasHookCommand(targetBeforeAllHook)) {
17375
+ for (const slot of availablePoolSlots) {
17376
+ setupLog(`running target before_all on pool slot ${slot.index}`);
17377
+ const scriptContext = {
17378
+ workspacePath: slot.path,
17379
+ testId: "__target_before_all__",
17380
+ evalRunId,
17381
+ evalDir,
17382
+ workspaceFileDir: suiteWorkspace?.workspaceFileDir
17383
+ };
17384
+ try {
17385
+ await executeWorkspaceScript(
17386
+ toScriptConfig(targetBeforeAllHook, "before_all", "target hooks"),
17387
+ scriptContext
17388
+ );
17389
+ } catch (error) {
17390
+ const message = error instanceof Error ? error.message : String(error);
17391
+ throw new Error(`target before_all hook failed on pool slot ${slot.index}: ${message}`);
17392
+ }
17393
+ }
17394
+ }
17300
17395
  if (sharedWorkspacePath) {
17301
17396
  try {
17302
17397
  sharedBaselineCommit = await initializeBaseline(sharedWorkspacePath);
@@ -17442,6 +17537,7 @@ async function runEvaluation(options) {
17442
17537
  evalDir,
17443
17538
  verbose,
17444
17539
  threshold: scoreThreshold,
17540
+ targetHooks: options.targetHooks,
17445
17541
  ...depResults && Object.keys(depResults).length > 0 ? { dependencyResults: depResults } : {}
17446
17542
  };
17447
17543
  let result = trials && trials.count > 1 ? await runEvalCaseWithTrials(runCaseOptions, trials) : await runEvalCase(runCaseOptions);
@@ -17583,6 +17679,26 @@ async function runEvaluation(options) {
17583
17679
  }
17584
17680
  }
17585
17681
  const afterAllWorkspaces = poolSlots.length > 1 ? poolSlots.map((s) => s.path) : sharedWorkspacePath ? [sharedWorkspacePath] : [];
17682
+ const targetAfterAllHook = targetHooks?.after_all;
17683
+ if (afterAllWorkspaces.length > 0 && hasHookCommand(targetAfterAllHook)) {
17684
+ for (const wsPath of afterAllWorkspaces) {
17685
+ const scriptContext = {
17686
+ workspacePath: wsPath,
17687
+ testId: "__target_after_all__",
17688
+ evalRunId,
17689
+ evalDir,
17690
+ workspaceFileDir: suiteWorkspace?.workspaceFileDir
17691
+ };
17692
+ try {
17693
+ await executeWorkspaceScript(
17694
+ toScriptConfig(targetAfterAllHook, "after_all", "target hooks"),
17695
+ scriptContext,
17696
+ "warn"
17697
+ );
17698
+ } catch {
17699
+ }
17700
+ }
17701
+ }
17586
17702
  const suiteAfterAllHook = suiteWorkspace?.hooks?.after_all;
17587
17703
  if (afterAllWorkspaces.length > 0 && suiteHooksEnabled && hasHookCommand(suiteAfterAllHook)) {
17588
17704
  const afterAllHook = suiteAfterAllHook;
@@ -17845,7 +17961,7 @@ async function runEvalCase(options) {
17845
17961
  let caseWorkspaceFile;
17846
17962
  const caseHooksEnabled = hooksEnabled(evalCase.workspace);
17847
17963
  if (!workspacePath) {
17848
- const rawCaseTemplate = evalCase.workspace?.template ?? getWorkspaceTemplate(target);
17964
+ const rawCaseTemplate = evalCase.workspace?.template;
17849
17965
  const resolvedCaseTemplate = await resolveWorkspaceTemplate(rawCaseTemplate);
17850
17966
  const caseWorkspaceTemplate = resolvedCaseTemplate?.dir;
17851
17967
  caseWorkspaceFile = resolvedCaseTemplate?.workspaceFile;
@@ -17869,7 +17985,7 @@ async function runEvalCase(options) {
17869
17985
  if (caseWorkspaceFile && workspacePath) {
17870
17986
  const copiedFile = path45.join(workspacePath, path45.basename(caseWorkspaceFile));
17871
17987
  try {
17872
- await stat9(copiedFile);
17988
+ await stat8(copiedFile);
17873
17989
  caseWorkspaceFile = copiedFile;
17874
17990
  } catch {
17875
17991
  }
@@ -18063,6 +18179,38 @@ async function runEvalCase(options) {
18063
18179
  );
18064
18180
  }
18065
18181
  }
18182
+ const targetBeforeEachHook = options.targetHooks?.before_each;
18183
+ if (workspacePath && hasHookCommand(targetBeforeEachHook)) {
18184
+ const scriptContext = {
18185
+ workspacePath,
18186
+ testId: evalCase.id,
18187
+ evalRunId: evalRunId ?? "",
18188
+ caseInput: evalCase.question,
18189
+ caseMetadata: evalCase.metadata,
18190
+ evalDir,
18191
+ workspaceFileDir: evalCase.workspace?.workspaceFileDir
18192
+ };
18193
+ try {
18194
+ await executeWorkspaceScript(
18195
+ toScriptConfig(targetBeforeEachHook, "before_each", `target hook for '${evalCase.id}'`),
18196
+ scriptContext
18197
+ );
18198
+ beforeEachNeedsFreshBaseline = true;
18199
+ } catch (error) {
18200
+ const message = error instanceof Error ? error.message : String(error);
18201
+ return buildErrorResult(
18202
+ evalCase,
18203
+ target.name,
18204
+ nowFn(),
18205
+ new Error(`target before_each hook failed: ${message}`),
18206
+ promptInputs,
18207
+ provider,
18208
+ "setup",
18209
+ "script_error",
18210
+ verbose
18211
+ );
18212
+ }
18213
+ }
18066
18214
  let baselineCommit = beforeEachNeedsFreshBaseline ? void 0 : sharedBaselineCommit;
18067
18215
  if (!baselineCommit && workspacePath) {
18068
18216
  try {
@@ -18217,6 +18365,26 @@ async function runEvalCase(options) {
18217
18365
  ${providerFileChanges}` : providerFileChanges;
18218
18366
  }
18219
18367
  const providerError = extractProviderError(providerResponse);
18368
+ const targetAfterEachHook = options.targetHooks?.after_each;
18369
+ if (workspacePath && hasHookCommand(targetAfterEachHook)) {
18370
+ const scriptContext = {
18371
+ workspacePath,
18372
+ testId: evalCase.id,
18373
+ evalRunId: evalRunId ?? "",
18374
+ caseInput: evalCase.question,
18375
+ caseMetadata: evalCase.metadata,
18376
+ evalDir,
18377
+ workspaceFileDir: evalCase.workspace?.workspaceFileDir
18378
+ };
18379
+ try {
18380
+ await executeWorkspaceScript(
18381
+ toScriptConfig(targetAfterEachHook, "after_each", `target hook for '${evalCase.id}'`),
18382
+ scriptContext,
18383
+ "warn"
18384
+ );
18385
+ } catch {
18386
+ }
18387
+ }
18220
18388
  if (caseHooksEnabled && workspacePath && evalCase.workspace?.hooks?.after_each?.reset && evalCase.workspace.hooks.after_each.reset !== "none") {
18221
18389
  try {
18222
18390
  if (repoManager && evalCase.workspace.repos?.length) {
@@ -19838,7 +20006,7 @@ function shouldSkipCacheForTemperature(targetConfig) {
19838
20006
  // src/evaluation/results-repo.ts
19839
20007
  import { execFile as execFile4 } from "node:child_process";
19840
20008
  import { existsSync as existsSync7, mkdirSync as mkdirSync2, readFileSync as readFileSync3, rmSync, writeFileSync } from "node:fs";
19841
- import { cp as cp3, mkdtemp as mkdtemp3, readdir as readdir9, rm as rm6, stat as stat10 } from "node:fs/promises";
20009
+ import { cp as cp3, mkdtemp as mkdtemp3, readdir as readdir9, rm as rm6, stat as stat9 } from "node:fs/promises";
19842
20010
  import os3 from "node:os";
19843
20011
  import path49 from "node:path";
19844
20012
  import { promisify as promisify8 } from "node:util";
@@ -20061,7 +20229,7 @@ function resolveResultsRepoRunsDir(config) {
20061
20229
  );
20062
20230
  }
20063
20231
  async function directorySizeBytes(targetPath) {
20064
- const entry = await stat10(targetPath);
20232
+ const entry = await stat9(targetPath);
20065
20233
  if (entry.isFile()) {
20066
20234
  return entry.size;
20067
20235
  }
@@ -20117,14 +20285,34 @@ async function createDraftResultsPr(params) {
20117
20285
  }
20118
20286
 
20119
20287
  // src/benchmarks.ts
20120
- import { existsSync as existsSync8, mkdirSync as mkdirSync3, readFileSync as readFileSync4, readdirSync as readdirSync3, statSync as statSync2, writeFileSync as writeFileSync2 } from "node:fs";
20288
+ import {
20289
+ copyFileSync,
20290
+ existsSync as existsSync8,
20291
+ mkdirSync as mkdirSync3,
20292
+ readFileSync as readFileSync4,
20293
+ readdirSync as readdirSync3,
20294
+ statSync as statSync2,
20295
+ writeFileSync as writeFileSync2
20296
+ } from "node:fs";
20121
20297
  import path50 from "node:path";
20122
20298
  import { parse as parseYaml3, stringify as stringifyYaml } from "yaml";
20123
20299
  function getBenchmarksRegistryPath() {
20124
- return path50.join(getAgentvHome(), "projects.yaml");
20300
+ return path50.join(getAgentvConfigDir(), "projects.yaml");
20301
+ }
20302
/**
 * Copies a legacy `projects.yaml` from the directory returned by
 * `getAgentvHome()` to `targetPath`.
 *
 * Does nothing when that directory equals `getAgentvConfigDir()` or when no
 * legacy file exists. The parent directory of `targetPath` is created if
 * needed. The legacy file is copied, not moved, so it is left in place.
 *
 * @param {string} targetPath - Destination path for the registry file.
 * @returns {void}
 */
function migrateProjectsYaml(targetPath) {
  const legacyHome = getAgentvHome();
  const sameLocation = legacyHome === getAgentvConfigDir();
  const legacyRegistry = path50.join(legacyHome, "projects.yaml");
  if (sameLocation || !existsSync8(legacyRegistry)) {
    return;
  }
  const targetDir = path50.dirname(targetPath);
  mkdirSync3(targetDir, { recursive: true });
  copyFileSync(legacyRegistry, targetPath);
}
20126
20311
  function loadBenchmarkRegistry() {
20127
20312
  const registryPath = getBenchmarksRegistryPath();
20313
+ if (!existsSync8(registryPath)) {
20314
+ migrateProjectsYaml(registryPath);
20315
+ }
20128
20316
  if (!existsSync8(registryPath)) {
20129
20317
  return { benchmarks: [] };
20130
20318
  }
@@ -21143,7 +21331,7 @@ function extractResponseItemContent(content) {
21143
21331
  }
21144
21332
 
21145
21333
  // src/import/codex-session-discovery.ts
21146
- import { readdir as readdir10, stat as stat11 } from "node:fs/promises";
21334
+ import { readdir as readdir10, stat as stat10 } from "node:fs/promises";
21147
21335
  import { homedir as homedir5 } from "node:os";
21148
21336
  import path51 from "node:path";
21149
21337
  var DEFAULT_SESSIONS_DIR = () => path51.join(homedir5(), ".codex", "sessions");
@@ -21193,7 +21381,7 @@ async function discoverCodexSessions(opts) {
21193
21381
  const sessionId = parts.length >= 6 ? parts.slice(-5).join("-") : nameWithoutExt;
21194
21382
  let updatedAt;
21195
21383
  try {
21196
- const fileStat = await stat11(filePath);
21384
+ const fileStat = await stat10(filePath);
21197
21385
  updatedAt = fileStat.mtime;
21198
21386
  } catch {
21199
21387
  updatedAt = /* @__PURE__ */ new Date(0);
@@ -21208,7 +21396,7 @@ async function discoverCodexSessions(opts) {
21208
21396
  }
21209
21397
 
21210
21398
  // src/import/session-discovery.ts
21211
- import { readdir as readdir11, stat as stat12 } from "node:fs/promises";
21399
+ import { readdir as readdir11, stat as stat11 } from "node:fs/promises";
21212
21400
  import { homedir as homedir6 } from "node:os";
21213
21401
  import path52 from "node:path";
21214
21402
  var DEFAULT_PROJECTS_DIR = () => path52.join(homedir6(), ".claude", "projects");
@@ -21244,7 +21432,7 @@ async function discoverClaudeSessions(opts) {
21244
21432
  const filePath = path52.join(dirPath, entry);
21245
21433
  let updatedAt;
21246
21434
  try {
21247
- const fileStat = await stat12(filePath);
21435
+ const fileStat = await stat11(filePath);
21248
21436
  updatedAt = fileStat.mtime;
21249
21437
  } catch {
21250
21438
  updatedAt = /* @__PURE__ */ new Date(0);
@@ -21500,6 +21688,7 @@ export {
21500
21688
  extractJsonBlob,
21501
21689
  extractLastAssistantContent,
21502
21690
  extractTargetFromSuite,
21691
+ extractTargetRefsFromSuite,
21503
21692
  extractTargetsFromSuite,
21504
21693
  extractTargetsFromTestCase,
21505
21694
  extractThreshold,
@@ -21509,6 +21698,7 @@ export {
21509
21698
  findGitRoot,
21510
21699
  freeformEvaluationSchema,
21511
21700
  generateRubrics,
21701
+ getAgentvConfigDir,
21512
21702
  getAgentvHome,
21513
21703
  getBenchmark,
21514
21704
  getBenchmarksRegistryPath,