@agentv/core 2.16.0 → 2.17.1-next.1

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
package/dist/index.cjs CHANGED
@@ -1777,6 +1777,25 @@ var import_node_path8 = __toESM(require("path"), 1);
1777
1777
  var import_micromatch3 = __toESM(require("micromatch"), 1);
1778
1778
  var import_yaml4 = require("yaml");
1779
1779
 
1780
+ // src/evaluation/interpolation.ts
1781
// Matches `${{ NAME }}` placeholders; NAME must be a valid identifier-style
// environment variable name (letters, digits, underscore; no leading digit).
var ENV_VAR_PATTERN = /\$\{\{\s*([A-Za-z_][A-Za-z0-9_]*)\s*\}\}/g;

/**
 * Recursively replaces `${{ NAME }}` placeholders in every string found in
 * `value` with the matching entry of `env`.
 *
 * - Strings: each placeholder is replaced by `env[NAME]`, or by the empty
 *   string when `env` has no own entry for NAME (or the entry is nullish).
 * - Arrays: a new array with every element interpolated.
 * - Objects: a new plain object with the same own enumerable keys and
 *   interpolated values. Keys themselves are never interpolated.
 * - Anything else (numbers, booleans, null, undefined) is returned as-is.
 *
 * The input is never mutated.
 *
 * @param {unknown} value - Parsed YAML/JSON data to interpolate.
 * @param {Record<string, string | undefined>} env - Variable lookup table,
 *   typically `process.env`.
 * @returns {unknown} A structurally identical copy with placeholders resolved.
 */
function interpolateEnv(value, env) {
  if (typeof value === "string") {
    return value.replace(ENV_VAR_PATTERN, (_match, varName) => {
      // Only own keys count: an inherited member such as `toString` must not
      // be stringified into the output.
      const isDefined = Object.prototype.hasOwnProperty.call(env, varName);
      return isDefined ? env[varName] ?? "" : "";
    });
  }
  if (Array.isArray(value)) {
    return value.map((item) => interpolateEnv(item, env));
  }
  if (value !== null && typeof value === "object") {
    // Object.fromEntries defines every key as an own data property, so a
    // parsed "__proto__" key is preserved instead of replacing the result's
    // prototype (which plain `result[key] = ...` assignment would do).
    return Object.fromEntries(
      Object.entries(value).map(([key, val]) => [key, interpolateEnv(val, env)])
    );
  }
  return value;
}
1798
+
1780
1799
  // src/evaluation/loaders/case-file-loader.ts
1781
1800
  var import_promises = require("fs/promises");
1782
1801
  var import_node_path = __toESM(require("path"), 1);
@@ -1795,7 +1814,8 @@ function isGlobPattern(filePath) {
1795
1814
  return filePath.includes("*") || filePath.includes("?") || filePath.includes("{");
1796
1815
  }
1797
1816
  function parseYamlCases(content, filePath) {
1798
- const parsed = (0, import_yaml.parse)(content);
1817
+ const raw = (0, import_yaml.parse)(content);
1818
+ const parsed = interpolateEnv(raw, process.env);
1799
1819
  if (!Array.isArray(parsed)) {
1800
1820
  throw new Error(
1801
1821
  `External test file must contain a YAML array, got ${typeof parsed}: ${filePath}`
@@ -1817,7 +1837,8 @@ function parseJsonlCases(content, filePath) {
1817
1837
  const line = lines[i].trim();
1818
1838
  if (line === "") continue;
1819
1839
  try {
1820
- const parsed = JSON.parse(line);
1840
+ const raw = JSON.parse(line);
1841
+ const parsed = interpolateEnv(raw, process.env);
1821
1842
  if (!isJsonObject(parsed)) {
1822
1843
  throw new Error("Expected JSON object");
1823
1844
  }
@@ -3966,7 +3987,7 @@ async function loadSidecarMetadata(jsonlPath, verbose) {
3966
3987
  }
3967
3988
  try {
3968
3989
  const content = await (0, import_promises6.readFile)(sidecarPath, "utf8");
3969
- const parsed = (0, import_yaml3.parse)(content);
3990
+ const parsed = interpolateEnv((0, import_yaml3.parse)(content), process.env);
3970
3991
  if (!isJsonObject(parsed)) {
3971
3992
  logWarning4(`Invalid sidecar metadata format in ${sidecarPath}`);
3972
3993
  return {};
@@ -3989,7 +4010,8 @@ function parseJsonlContent(content, filePath) {
3989
4010
  const line = lines[i].trim();
3990
4011
  if (line === "") continue;
3991
4012
  try {
3992
- const parsed = JSON.parse(line);
4013
+ const raw = JSON.parse(line);
4014
+ const parsed = interpolateEnv(raw, process.env);
3993
4015
  if (!isJsonObject(parsed)) {
3994
4016
  throw new Error("Expected JSON object");
3995
4017
  }
@@ -4046,9 +4068,10 @@ async function loadTestsFromJsonl(evalFilePath, repoRoot, options) {
4046
4068
  }
4047
4069
  const inputMessages = resolveInputMessages(evalcase);
4048
4070
  const expectedMessages = resolveExpectedMessages(evalcase) ?? [];
4049
- if (!id || !outcome || !inputMessages || inputMessages.length === 0) {
4071
+ const hasEvaluationSpec = !!outcome || expectedMessages.length > 0 || evalcase.assert !== void 0;
4072
+ if (!id || !hasEvaluationSpec || !inputMessages || inputMessages.length === 0) {
4050
4073
  logError(
4051
- `Skipping incomplete test at line ${lineNumber}: ${id ?? "unknown"}. Missing required fields: id, criteria, and/or input`
4074
+ `Skipping incomplete test at line ${lineNumber}: ${id ?? "unknown"}. Missing required fields: id, input, and at least one of criteria/expected_output/assert`
4052
4075
  );
4053
4076
  continue;
4054
4077
  }
@@ -4126,7 +4149,7 @@ async function loadTestsFromJsonl(evalFilePath, repoRoot, options) {
4126
4149
  guideline_paths: guidelinePaths.map((guidelinePath) => import_node_path6.default.resolve(guidelinePath)),
4127
4150
  guideline_patterns: guidelinePatterns,
4128
4151
  file_paths: allFilePaths,
4129
- criteria: outcome,
4152
+ criteria: outcome ?? "",
4130
4153
  evaluator: evalCaseEvaluatorKind,
4131
4154
  evaluators
4132
4155
  };
@@ -4439,7 +4462,7 @@ async function readTestSuiteMetadata(testFilePath) {
4439
4462
  try {
4440
4463
  const absolutePath = import_node_path8.default.resolve(testFilePath);
4441
4464
  const content = await (0, import_promises8.readFile)(absolutePath, "utf8");
4442
- const parsed = (0, import_yaml4.parse)(content);
4465
+ const parsed = interpolateEnv((0, import_yaml4.parse)(content), process.env);
4443
4466
  if (!isJsonObject(parsed)) {
4444
4467
  return {};
4445
4468
  }
@@ -4489,11 +4512,11 @@ async function loadTestsFromYaml(evalFilePath, repoRoot, options) {
4489
4512
  const config = await loadConfig(absoluteTestPath, repoRootPath);
4490
4513
  const guidelinePatterns = config?.guideline_patterns;
4491
4514
  const rawFile = await (0, import_promises8.readFile)(absoluteTestPath, "utf8");
4492
- const parsed = (0, import_yaml4.parse)(rawFile);
4493
- if (!isJsonObject(parsed)) {
4515
+ const interpolated = interpolateEnv((0, import_yaml4.parse)(rawFile), process.env);
4516
+ if (!isJsonObject(interpolated)) {
4494
4517
  throw new Error(`Invalid test file format: ${evalFilePath}`);
4495
4518
  }
4496
- const suite = parsed;
4519
+ const suite = interpolated;
4497
4520
  const datasetNameFromSuite = asString6(suite.dataset)?.trim();
4498
4521
  const fallbackDataset = import_node_path8.default.basename(absoluteTestPath).replace(/\.ya?ml$/i, "") || "eval";
4499
4522
  const datasetName = datasetNameFromSuite && datasetNameFromSuite.length > 0 ? datasetNameFromSuite : fallbackDataset;
@@ -4537,9 +4560,10 @@ async function loadTestsFromYaml(evalFilePath, repoRoot, options) {
4537
4560
  }
4538
4561
  const testInputMessages = resolveInputMessages(evalcase);
4539
4562
  const expectedMessages = resolveExpectedMessages(evalcase) ?? [];
4540
- if (!id || !outcome || !testInputMessages || testInputMessages.length === 0) {
4563
+ const hasEvaluationSpec = !!outcome || expectedMessages.length > 0 || evalcase.assert !== void 0;
4564
+ if (!id || !hasEvaluationSpec || !testInputMessages || testInputMessages.length === 0) {
4541
4565
  logError2(
4542
- `Skipping incomplete test: ${id ?? "unknown"}. Missing required fields: id, criteria, and/or input`
4566
+ `Skipping incomplete test: ${id ?? "unknown"}. Missing required fields: id, input, and at least one of criteria/expected_output/assert`
4543
4567
  );
4544
4568
  continue;
4545
4569
  }
@@ -4635,7 +4659,7 @@ async function loadTestsFromYaml(evalFilePath, repoRoot, options) {
4635
4659
  guideline_paths: guidelinePaths.map((guidelinePath) => import_node_path8.default.resolve(guidelinePath)),
4636
4660
  guideline_patterns: guidelinePatterns,
4637
4661
  file_paths: allFilePaths,
4638
- criteria: outcome,
4662
+ criteria: outcome ?? "",
4639
4663
  evaluator: evalCaseEvaluatorKind,
4640
4664
  evaluators,
4641
4665
  workspace: mergedWorkspace,
@@ -4745,30 +4769,24 @@ function parseWorkspaceHookConfig(raw, evalFileDir) {
4745
4769
  const script = parseWorkspaceScriptConfig(raw, evalFileDir);
4746
4770
  const obj = raw;
4747
4771
  const reset = obj.reset === "none" || obj.reset === "fast" || obj.reset === "strict" ? obj.reset : void 0;
4748
- const clean = obj.clean === "always" || obj.clean === "on_success" || obj.clean === "on_failure" || obj.clean === "never" ? obj.clean : void 0;
4749
- if (!script && !reset && !clean) return void 0;
4772
+ if (!script && !reset) return void 0;
4750
4773
  return {
4751
4774
  ...script ?? {},
4752
- ...reset !== void 0 && { reset },
4753
- ...clean !== void 0 && { clean }
4775
+ ...reset !== void 0 && { reset }
4754
4776
  };
4755
4777
  }
4756
4778
  // Parses the hooks mapping of a workspace config. Returns undefined when
  // `raw` is not a JSON object or when none of the recognized hook keys yields
  // a config from parseWorkspaceHookConfig. In this diff the added lines
  // recognize before_all / before_each / after_each / after_all; the removed
  // lines show the previous keys (before_all_tests, before_each_test,
  // after_each_test, after_all_tests, on_reuse, on_finish).
  function parseWorkspaceHooksConfig(raw, evalFileDir) {
4757
4779
  if (!isJsonObject(raw)) return void 0;
4758
4780
  const obj = raw;
4759
- const beforeAllTests = parseWorkspaceHookConfig(obj.before_all_tests, evalFileDir);
4760
- const beforeEachTest = parseWorkspaceHookConfig(obj.before_each_test, evalFileDir);
4761
- const afterEachTest = parseWorkspaceHookConfig(obj.after_each_test, evalFileDir);
4762
- const afterAllTests = parseWorkspaceHookConfig(obj.after_all_tests, evalFileDir);
4763
- const onReuse = parseWorkspaceHookConfig(obj.on_reuse, evalFileDir);
4764
- const onFinish = parseWorkspaceHookConfig(obj.on_finish, evalFileDir);
4781
+ const beforeAll = parseWorkspaceHookConfig(obj.before_all, evalFileDir);
4782
+ const beforeEach = parseWorkspaceHookConfig(obj.before_each, evalFileDir);
4783
+ const afterEach = parseWorkspaceHookConfig(obj.after_each, evalFileDir);
4784
+ const afterAll = parseWorkspaceHookConfig(obj.after_all, evalFileDir);
4765
4785
  const hooks = {
4766
- ...beforeAllTests !== void 0 && { before_all_tests: beforeAllTests },
4767
- ...beforeEachTest !== void 0 && { before_each_test: beforeEachTest },
4768
- ...afterEachTest !== void 0 && { after_each_test: afterEachTest },
4769
- ...afterAllTests !== void 0 && { after_all_tests: afterAllTests },
4770
- ...onReuse !== void 0 && { on_reuse: onReuse },
4771
- ...onFinish !== void 0 && { on_finish: onFinish }
4786
+ ...beforeAll !== void 0 && { before_all: beforeAll },
4787
+ ...beforeEach !== void 0 && { before_each: beforeEach },
4788
+ ...afterEach !== void 0 && { after_each: afterEach },
4789
+ ...afterAll !== void 0 && { after_all: afterAll }
4772
4790
  };
4773
4791
  return Object.keys(hooks).length > 0 ? hooks : void 0;
4774
4792
  }
@@ -4781,7 +4799,7 @@ async function resolveWorkspaceConfig(raw, evalFileDir) {
4781
4799
  } catch {
4782
4800
  throw new Error(`Workspace file not found: ${raw} (resolved to ${workspaceFilePath})`);
4783
4801
  }
4784
- const parsed = (0, import_yaml4.parse)(content);
4802
+ const parsed = interpolateEnv((0, import_yaml4.parse)(content), process.env);
4785
4803
  if (!isJsonObject(parsed)) {
4786
4804
  throw new Error(
4787
4805
  `Invalid workspace file format: ${workspaceFilePath} (expected a YAML object)`
@@ -4829,18 +4847,10 @@ function mergeWorkspaceConfigs(suiteLevel, caseLevel) {
4829
4847
  };
4830
4848
  };
4831
4849
  const mergedHooks = {
4832
- before_all_tests: mergeHook(
4833
- suiteLevel.hooks?.before_all_tests,
4834
- caseLevel.hooks?.before_all_tests
4835
- ),
4836
- before_each_test: mergeHook(
4837
- suiteLevel.hooks?.before_each_test,
4838
- caseLevel.hooks?.before_each_test
4839
- ),
4840
- after_each_test: mergeHook(suiteLevel.hooks?.after_each_test, caseLevel.hooks?.after_each_test),
4841
- after_all_tests: mergeHook(suiteLevel.hooks?.after_all_tests, caseLevel.hooks?.after_all_tests),
4842
- on_reuse: mergeHook(suiteLevel.hooks?.on_reuse, caseLevel.hooks?.on_reuse),
4843
- on_finish: mergeHook(suiteLevel.hooks?.on_finish, caseLevel.hooks?.on_finish)
4850
+ before_all: mergeHook(suiteLevel.hooks?.before_all, caseLevel.hooks?.before_all),
4851
+ before_each: mergeHook(suiteLevel.hooks?.before_each, caseLevel.hooks?.before_each),
4852
+ after_each: mergeHook(suiteLevel.hooks?.after_each, caseLevel.hooks?.after_each),
4853
+ after_all: mergeHook(suiteLevel.hooks?.after_all, caseLevel.hooks?.after_all)
4844
4854
  };
4845
4855
  const hasHooks = Object.values(mergedHooks).some((hook) => hook !== void 0);
4846
4856
  return {
@@ -7086,6 +7096,7 @@ var CopilotCliProvider = class {
7086
7096
  const agentProcess = (0, import_node_child_process2.spawn)(executable, args, {
7087
7097
  stdio: ["pipe", "pipe", "inherit"]
7088
7098
  });
7099
+ await waitForProcessSpawn(agentProcess, executable, this.targetName);
7089
7100
  const toolCallsInProgress = /* @__PURE__ */ new Map();
7090
7101
  const completedToolCalls = [];
7091
7102
  let finalContent = "";
@@ -7365,6 +7376,47 @@ var CopilotCliProvider = class {
7365
7376
  }
7366
7377
  }
7367
7378
  };
7379
/**
 * Waits until a freshly created child process has actually started.
 *
 * Resolves immediately when `proc.pid` is already set. Otherwise it waits for
 * the first of the process's "spawn" or "error" events and detaches both
 * listeners once either one fires.
 *
 * @param {import("node:child_process").ChildProcess} proc - Process returned by `spawn()`.
 * @param {string} executable - Executable that was launched (used in the error text).
 * @param {string} targetName - Target the process belongs to (used in the error text).
 * @returns {Promise<void>} Resolves once the process has spawned.
 * @throws {Error} When the process emits "error" before "spawn"; the message
 *   comes from formatCopilotSpawnError and the original error is kept as `cause`.
 */
async function waitForProcessSpawn(proc, executable, targetName) {
  if (proc.pid) {
    return;
  }
  await new Promise((resolve, reject) => {
    function detach() {
      proc.off("spawn", handleSpawn);
      proc.off("error", handleError);
    }
    function handleSpawn() {
      detach();
      resolve();
    }
    function handleError(error) {
      detach();
      // Keep the underlying spawn error (code, errno, stack) reachable for
      // callers and logs instead of flattening it into a message string.
      reject(
        new Error(formatCopilotSpawnError(error, executable, targetName), { cause: error })
      );
    }
    proc.once("spawn", handleSpawn);
    proc.once("error", handleError);
  });
}

/**
 * Builds the user-facing message for a failed Copilot CLI launch.
 *
 * On Windows, ENOENT/EINVAL failures get an extra hint explaining that shell
 * shims are not directly spawnable and how to point AgentV at a real
 * executable. Every other case returns just the base message.
 *
 * @param {Error & { code?: string }} error - Error emitted by the child process.
 * @param {string} executable - Executable that was launched.
 * @param {string} targetName - Target the process belongs to.
 * @returns {string} The formatted error message.
 */
function formatCopilotSpawnError(error, executable, targetName) {
  const base = `Failed to start Copilot CLI executable '${executable}' for target '${targetName}'. ${error.message}`;
  const isShimLikeFailure = error.code === "ENOENT" || error.code === "EINVAL";
  if (process.platform !== "win32" || !isShimLikeFailure) {
    return base;
  }
  return [
    base,
    "",
    "On Windows, shell commands like 'copilot -h' can work via .ps1/.bat shims, but AgentV launches a subprocess that needs a directly spawnable executable path.",
    "",
    "Fix options:",
    "1) Install native Copilot binary package:",
    "npm install -g @github/copilot-win32-x64",
    "2) Set explicit executable for Copilot targets:",
    "- In .env: COPILOT_EXE=C:\\Users\\<you>\\AppData\\Roaming\\npm\\node_modules\\@github\\copilot-win32-x64\\copilot.exe",
    "- In .agentv/targets.yaml: executable: ${{ COPILOT_EXE }}"
  ].join("\n");
}
7368
7420
  function summarizeAcpEvent(eventType, data) {
7369
7421
  if (!data || typeof data !== "object") {
7370
7422
  return eventType;
@@ -16424,9 +16476,8 @@ async function runEvaluation(options) {
16424
16476
  const hasSharedWorkspace = !!(useStaticWorkspace || workspaceTemplate || suiteWorkspace?.hooks || suiteWorkspace?.repos?.length && !isPerTestIsolation);
16425
16477
  const poolEnabled = configuredMode === "pooled" ? true : configuredMode === "ephemeral" || useStaticWorkspace ? false : suiteWorkspace?.pool ?? poolWorkspaces ?? true;
16426
16478
  const usePool = poolEnabled !== false && !!suiteWorkspace?.repos?.length && !isPerTestIsolation && !useStaticWorkspace;
16427
- const finishCleanPolicy = suiteWorkspace?.hooks?.on_finish?.clean;
16428
- const resolvedRetainOnSuccess = (finishCleanPolicy === "always" || finishCleanPolicy === "on_success" ? "cleanup" : finishCleanPolicy === "on_failure" || finishCleanPolicy === "never" ? "keep" : void 0) ?? retainOnSuccess ?? (keepWorkspaces ? "keep" : "cleanup");
16429
- const resolvedRetainOnFailure = (finishCleanPolicy === "always" || finishCleanPolicy === "on_failure" ? "cleanup" : finishCleanPolicy === "on_success" || finishCleanPolicy === "never" ? "keep" : void 0) ?? retainOnFailure ?? (cleanupWorkspaces ? "cleanup" : "keep");
16479
+ const resolvedRetainOnSuccess = retainOnSuccess ?? (keepWorkspaces ? "keep" : "cleanup");
16480
+ const resolvedRetainOnFailure = retainOnFailure ?? (cleanupWorkspaces ? "cleanup" : "keep");
16430
16481
  const requestedWorkers = options.maxConcurrency ?? target.workers ?? 1;
16431
16482
  const workers = hasSharedWorkspace && !usePool ? 1 : requestedWorkers;
16432
16483
  setupLog(
@@ -16461,7 +16512,7 @@ async function runEvaluation(options) {
16461
16512
  repos: suiteWorkspace.repos,
16462
16513
  maxSlots: poolMaxSlots,
16463
16514
  repoManager: poolRepoManager,
16464
- poolReset: (workspaceClean === "full" ? "strict" : workspaceClean === "standard" ? "fast" : null) ?? suiteWorkspace.hooks?.on_reuse?.reset ?? "fast"
16515
+ poolReset: (workspaceClean === "full" ? "strict" : workspaceClean === "standard" ? "fast" : null) ?? "fast"
16465
16516
  });
16466
16517
  poolSlots.push(slot);
16467
16518
  setupLog(`pool slot ${i} acquired at: ${slot.path} (existing=${slot.isExisting})`);
@@ -16512,7 +16563,7 @@ async function runEvaluation(options) {
16512
16563
  throw new Error(`Failed to materialize repos: ${message}`);
16513
16564
  }
16514
16565
  }
16515
- const suiteBeforeAllHook = suiteWorkspace?.hooks?.before_all_tests;
16566
+ const suiteBeforeAllHook = suiteWorkspace?.hooks?.before_all;
16516
16567
  if (sharedWorkspacePath && hasHookCommand(suiteBeforeAllHook)) {
16517
16568
  const beforeAllHook = suiteBeforeAllHook;
16518
16569
  const beforeAllCommand = (beforeAllHook.command ?? beforeAllHook.script ?? []).join(" ");
@@ -16527,7 +16578,7 @@ async function runEvaluation(options) {
16527
16578
  };
16528
16579
  try {
16529
16580
  beforeAllOutput = await executeWorkspaceScript(
16530
- toScriptConfig(beforeAllHook, "before_all_tests", "suite workspace"),
16581
+ toScriptConfig(beforeAllHook, "before_all", "suite workspace"),
16531
16582
  scriptContext
16532
16583
  );
16533
16584
  setupLog("shared before_all completed");
@@ -16552,7 +16603,7 @@ async function runEvaluation(options) {
16552
16603
  };
16553
16604
  try {
16554
16605
  const output = await executeWorkspaceScript(
16555
- toScriptConfig(beforeAllHook, "before_all_tests", "suite workspace"),
16606
+ toScriptConfig(beforeAllHook, "before_all", "suite workspace"),
16556
16607
  scriptContext
16557
16608
  );
16558
16609
  if (!beforeAllOutput) beforeAllOutput = output;
@@ -16781,7 +16832,7 @@ async function runEvaluation(options) {
16781
16832
  }
16782
16833
  }
16783
16834
  const afterAllWorkspaces = poolSlots.length > 1 ? poolSlots.map((s) => s.path) : sharedWorkspacePath ? [sharedWorkspacePath] : [];
16784
- const suiteAfterAllHook = suiteWorkspace?.hooks?.after_all_tests;
16835
+ const suiteAfterAllHook = suiteWorkspace?.hooks?.after_all;
16785
16836
  if (afterAllWorkspaces.length > 0 && hasHookCommand(suiteAfterAllHook)) {
16786
16837
  const afterAllHook = suiteAfterAllHook;
16787
16838
  for (const wsPath of afterAllWorkspaces) {
@@ -16793,7 +16844,7 @@ async function runEvaluation(options) {
16793
16844
  };
16794
16845
  try {
16795
16846
  const afterAllOutput = await executeWorkspaceScript(
16796
- toScriptConfig(afterAllHook, "after_all_tests", "suite workspace"),
16847
+ toScriptConfig(afterAllHook, "after_all", "suite workspace"),
16797
16848
  scriptContext,
16798
16849
  "warn"
16799
16850
  );
@@ -17090,7 +17141,7 @@ async function runEvalCase(options) {
17090
17141
  );
17091
17142
  }
17092
17143
  }
17093
- const caseBeforeAllHook = evalCase.workspace?.hooks?.before_all_tests;
17144
+ const caseBeforeAllHook = evalCase.workspace?.hooks?.before_all;
17094
17145
  if (workspacePath && hasHookCommand(caseBeforeAllHook)) {
17095
17146
  const beforeAllHook = caseBeforeAllHook;
17096
17147
  const beforeAllCommand = (beforeAllHook.command ?? beforeAllHook.script ?? []).join(" ");
@@ -17109,7 +17160,7 @@ async function runEvalCase(options) {
17109
17160
  };
17110
17161
  try {
17111
17162
  beforeAllOutput = await executeWorkspaceScript(
17112
- toScriptConfig(beforeAllHook, "before_all_tests", `test '${evalCase.id}'`),
17163
+ toScriptConfig(beforeAllHook, "before_all", `test '${evalCase.id}'`),
17113
17164
  scriptContext
17114
17165
  );
17115
17166
  if (setupDebug) {
@@ -17134,7 +17185,7 @@ async function runEvalCase(options) {
17134
17185
  }
17135
17186
  }
17136
17187
  }
17137
- const caseBeforeEachHook = evalCase.workspace?.hooks?.before_each_test;
17188
+ const caseBeforeEachHook = evalCase.workspace?.hooks?.before_each;
17138
17189
  if (workspacePath && hasHookCommand(caseBeforeEachHook)) {
17139
17190
  const beforeEachHook = caseBeforeEachHook;
17140
17191
  const scriptContext = {
@@ -17147,7 +17198,7 @@ async function runEvalCase(options) {
17147
17198
  };
17148
17199
  try {
17149
17200
  beforeEachOutput = await executeWorkspaceScript(
17150
- toScriptConfig(beforeEachHook, "before_each_test", `test '${evalCase.id}'`),
17201
+ toScriptConfig(beforeEachHook, "before_each", `test '${evalCase.id}'`),
17151
17202
  scriptContext
17152
17203
  );
17153
17204
  } catch (error) {
@@ -17264,17 +17315,17 @@ async function runEvalCase(options) {
17264
17315
  }
17265
17316
  }
17266
17317
  const providerError = extractProviderError(providerResponse);
17267
- if (repoManager && workspacePath && evalCase.workspace?.hooks?.after_each_test?.reset && evalCase.workspace.hooks.after_each_test.reset !== "none" && evalCase.workspace.repos) {
17318
+ if (repoManager && workspacePath && evalCase.workspace?.hooks?.after_each?.reset && evalCase.workspace.hooks.after_each.reset !== "none" && evalCase.workspace.repos) {
17268
17319
  try {
17269
17320
  await repoManager.reset(
17270
17321
  evalCase.workspace.repos,
17271
17322
  workspacePath,
17272
- evalCase.workspace.hooks.after_each_test.reset
17323
+ evalCase.workspace.hooks.after_each.reset
17273
17324
  );
17274
17325
  } catch {
17275
17326
  }
17276
17327
  }
17277
- const caseAfterEachHook = evalCase.workspace?.hooks?.after_each_test;
17328
+ const caseAfterEachHook = evalCase.workspace?.hooks?.after_each;
17278
17329
  if (workspacePath && hasHookCommand(caseAfterEachHook)) {
17279
17330
  const afterEachHook = caseAfterEachHook;
17280
17331
  const scriptContext = {
@@ -17287,7 +17338,7 @@ async function runEvalCase(options) {
17287
17338
  };
17288
17339
  try {
17289
17340
  afterEachOutput = await executeWorkspaceScript(
17290
- toScriptConfig(afterEachHook, "after_each_test", `test '${evalCase.id}'`),
17341
+ toScriptConfig(afterEachHook, "after_each", `test '${evalCase.id}'`),
17291
17342
  scriptContext,
17292
17343
  "warn"
17293
17344
  );
@@ -17687,9 +17738,11 @@ async function runEvaluatorList(options) {
17687
17738
  registry: typeRegistry
17688
17739
  };
17689
17740
  for (const evaluatorConfig of evaluators ?? []) {
17741
+ const startedAt = /* @__PURE__ */ new Date();
17690
17742
  try {
17691
17743
  const evaluatorInstance = await typeRegistry.create(evaluatorConfig, dispatchContext);
17692
17744
  const score2 = await evaluatorInstance.evaluate(evalContext);
17745
+ const endedAt = /* @__PURE__ */ new Date();
17693
17746
  const weight = evaluatorConfig.weight ?? 1;
17694
17747
  scored.push({
17695
17748
  score: score2,
@@ -17710,9 +17763,13 @@ async function runEvaluatorList(options) {
17710
17763
  evaluatorProviderRequest: score2.evaluatorRawRequest,
17711
17764
  details: score2.details,
17712
17765
  scores: mapChildResults(score2.scores),
17713
- tokenUsage: score2.tokenUsage
17766
+ tokenUsage: score2.tokenUsage,
17767
+ durationMs: endedAt.getTime() - startedAt.getTime(),
17768
+ startedAt: startedAt.toISOString(),
17769
+ endedAt: endedAt.toISOString()
17714
17770
  });
17715
17771
  } catch (error) {
17772
+ const endedAt = /* @__PURE__ */ new Date();
17716
17773
  const message = error instanceof Error ? error.message : String(error);
17717
17774
  const fallbackScore = {
17718
17775
  score: 0,
@@ -17738,7 +17795,10 @@ async function runEvaluatorList(options) {
17738
17795
  verdict: "fail",
17739
17796
  hits: [],
17740
17797
  misses: [`Evaluator '${evaluatorConfig.name ?? "unknown"}' failed: ${message}`],
17741
- reasoning: message
17798
+ reasoning: message,
17799
+ durationMs: endedAt.getTime() - startedAt.getTime(),
17800
+ startedAt: startedAt.toISOString(),
17801
+ endedAt: endedAt.toISOString()
17742
17802
  });
17743
17803
  }
17744
17804
  if (evaluatorConfig.negate === true && scored.length > 0) {