@agentv/core 2.16.0 → 2.17.1-next.1

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
package/dist/index.d.cts CHANGED
@@ -526,22 +526,16 @@ type WorkspaceHookConfig = {
526
526
  readonly cwd?: string;
527
527
  /** Optional reset policy for this hook */
528
528
  readonly reset?: 'none' | 'fast' | 'strict';
529
- /** Optional cleanup policy for this hook */
530
- readonly clean?: 'always' | 'on_success' | 'on_failure' | 'never';
531
529
  };
532
530
  type WorkspaceHooksConfig = {
533
531
  /** Runs once before first test in the workspace lifecycle */
534
- readonly before_all_tests?: WorkspaceHookConfig;
532
+ readonly before_all?: WorkspaceHookConfig;
535
533
  /** Runs before each test case */
536
- readonly before_each_test?: WorkspaceHookConfig;
534
+ readonly before_each?: WorkspaceHookConfig;
537
535
  /** Runs after each test case */
538
- readonly after_each_test?: WorkspaceHookConfig;
536
+ readonly after_each?: WorkspaceHookConfig;
539
537
  /** Runs once after final test in the workspace lifecycle */
540
- readonly after_all_tests?: WorkspaceHookConfig;
541
- /** Runs when reusing a pooled workspace slot */
542
- readonly on_reuse?: WorkspaceHookConfig;
543
- /** Runs/controls behavior when workspace lifecycle finishes */
544
- readonly on_finish?: WorkspaceHookConfig;
538
+ readonly after_all?: WorkspaceHookConfig;
545
539
  };
546
540
  type WorkspaceConfig = {
547
541
  /** Template directory or .code-workspace file. Directories are copied to temp workspace.
@@ -1172,6 +1166,12 @@ interface EvaluatorResult {
1172
1166
  readonly details?: JsonObject;
1173
1167
  /** Token usage from LLM calls made by this evaluator (optional). */
1174
1168
  readonly tokenUsage?: TokenUsage;
1169
+ /** Wall-clock duration of this judge execution in milliseconds. */
1170
+ readonly durationMs?: number;
1171
+ /** ISO 8601 UTC timestamp when this judge started executing. */
1172
+ readonly startedAt?: string;
1173
+ /** ISO 8601 UTC timestamp when this judge finished executing. */
1174
+ readonly endedAt?: string;
1175
1175
  }
1176
1176
  /**
1177
1177
  * Convenience accessor matching the Python hit_count property.
package/dist/index.d.ts CHANGED
@@ -526,22 +526,16 @@ type WorkspaceHookConfig = {
526
526
  readonly cwd?: string;
527
527
  /** Optional reset policy for this hook */
528
528
  readonly reset?: 'none' | 'fast' | 'strict';
529
- /** Optional cleanup policy for this hook */
530
- readonly clean?: 'always' | 'on_success' | 'on_failure' | 'never';
531
529
  };
532
530
  type WorkspaceHooksConfig = {
533
531
  /** Runs once before first test in the workspace lifecycle */
534
- readonly before_all_tests?: WorkspaceHookConfig;
532
+ readonly before_all?: WorkspaceHookConfig;
535
533
  /** Runs before each test case */
536
- readonly before_each_test?: WorkspaceHookConfig;
534
+ readonly before_each?: WorkspaceHookConfig;
537
535
  /** Runs after each test case */
538
- readonly after_each_test?: WorkspaceHookConfig;
536
+ readonly after_each?: WorkspaceHookConfig;
539
537
  /** Runs once after final test in the workspace lifecycle */
540
- readonly after_all_tests?: WorkspaceHookConfig;
541
- /** Runs when reusing a pooled workspace slot */
542
- readonly on_reuse?: WorkspaceHookConfig;
543
- /** Runs/controls behavior when workspace lifecycle finishes */
544
- readonly on_finish?: WorkspaceHookConfig;
538
+ readonly after_all?: WorkspaceHookConfig;
545
539
  };
546
540
  type WorkspaceConfig = {
547
541
  /** Template directory or .code-workspace file. Directories are copied to temp workspace.
@@ -1172,6 +1166,12 @@ interface EvaluatorResult {
1172
1166
  readonly details?: JsonObject;
1173
1167
  /** Token usage from LLM calls made by this evaluator (optional). */
1174
1168
  readonly tokenUsage?: TokenUsage;
1169
+ /** Wall-clock duration of this judge execution in milliseconds. */
1170
+ readonly durationMs?: number;
1171
+ /** ISO 8601 UTC timestamp when this judge started executing. */
1172
+ readonly startedAt?: string;
1173
+ /** ISO 8601 UTC timestamp when this judge finished executing. */
1174
+ readonly endedAt?: string;
1175
1175
  }
1176
1176
  /**
1177
1177
  * Convenience accessor matching the Python hit_count property.
package/dist/index.js CHANGED
@@ -17,7 +17,7 @@ import {
17
17
  readTextFile,
18
18
  resolveFileReference,
19
19
  resolveTargetDefinition
20
- } from "./chunk-E6AJPAXM.js";
20
+ } from "./chunk-PSYFRPNT.js";
21
21
  import {
22
22
  OtlpJsonFileExporter
23
23
  } from "./chunk-HFSYZHGF.js";
@@ -151,6 +151,25 @@ import path8 from "node:path";
151
151
  import micromatch3 from "micromatch";
152
152
  import { parse as parse2 } from "yaml";
153
153
 
154
+ // src/evaluation/interpolation.ts
155
+ var ENV_VAR_PATTERN = /\$\{\{\s*([A-Za-z_][A-Za-z0-9_]*)\s*\}\}/g;
156
+ function interpolateEnv(value, env) {
157
+ if (typeof value === "string") {
158
+ return value.replace(ENV_VAR_PATTERN, (_, varName) => env[varName] ?? "");
159
+ }
160
+ if (Array.isArray(value)) {
161
+ return value.map((item) => interpolateEnv(item, env));
162
+ }
163
+ if (value !== null && typeof value === "object") {
164
+ const result = {};
165
+ for (const [key, val] of Object.entries(value)) {
166
+ result[key] = interpolateEnv(val, env);
167
+ }
168
+ return result;
169
+ }
170
+ return value;
171
+ }
172
+
154
173
  // src/evaluation/loaders/case-file-loader.ts
155
174
  import { readFile } from "node:fs/promises";
156
175
  import path from "node:path";
@@ -169,7 +188,8 @@ function isGlobPattern(filePath) {
169
188
  return filePath.includes("*") || filePath.includes("?") || filePath.includes("{");
170
189
  }
171
190
  function parseYamlCases(content, filePath) {
172
- const parsed = parseYaml(content);
191
+ const raw = parseYaml(content);
192
+ const parsed = interpolateEnv(raw, process.env);
173
193
  if (!Array.isArray(parsed)) {
174
194
  throw new Error(
175
195
  `External test file must contain a YAML array, got ${typeof parsed}: ${filePath}`
@@ -191,7 +211,8 @@ function parseJsonlCases(content, filePath) {
191
211
  const line = lines[i].trim();
192
212
  if (line === "") continue;
193
213
  try {
194
- const parsed = JSON.parse(line);
214
+ const raw = JSON.parse(line);
215
+ const parsed = interpolateEnv(raw, process.env);
195
216
  if (!isJsonObject(parsed)) {
196
217
  throw new Error("Expected JSON object");
197
218
  }
@@ -2340,7 +2361,7 @@ async function loadSidecarMetadata(jsonlPath, verbose) {
2340
2361
  }
2341
2362
  try {
2342
2363
  const content = await readFile5(sidecarPath, "utf8");
2343
- const parsed = parseYaml2(content);
2364
+ const parsed = interpolateEnv(parseYaml2(content), process.env);
2344
2365
  if (!isJsonObject(parsed)) {
2345
2366
  logWarning4(`Invalid sidecar metadata format in ${sidecarPath}`);
2346
2367
  return {};
@@ -2363,7 +2384,8 @@ function parseJsonlContent(content, filePath) {
2363
2384
  const line = lines[i].trim();
2364
2385
  if (line === "") continue;
2365
2386
  try {
2366
- const parsed = JSON.parse(line);
2387
+ const raw = JSON.parse(line);
2388
+ const parsed = interpolateEnv(raw, process.env);
2367
2389
  if (!isJsonObject(parsed)) {
2368
2390
  throw new Error("Expected JSON object");
2369
2391
  }
@@ -2420,9 +2442,10 @@ async function loadTestsFromJsonl(evalFilePath, repoRoot, options) {
2420
2442
  }
2421
2443
  const inputMessages = resolveInputMessages(evalcase);
2422
2444
  const expectedMessages = resolveExpectedMessages(evalcase) ?? [];
2423
- if (!id || !outcome || !inputMessages || inputMessages.length === 0) {
2445
+ const hasEvaluationSpec = !!outcome || expectedMessages.length > 0 || evalcase.assert !== void 0;
2446
+ if (!id || !hasEvaluationSpec || !inputMessages || inputMessages.length === 0) {
2424
2447
  logError(
2425
- `Skipping incomplete test at line ${lineNumber}: ${id ?? "unknown"}. Missing required fields: id, criteria, and/or input`
2448
+ `Skipping incomplete test at line ${lineNumber}: ${id ?? "unknown"}. Missing required fields: id, input, and at least one of criteria/expected_output/assert`
2426
2449
  );
2427
2450
  continue;
2428
2451
  }
@@ -2500,7 +2523,7 @@ async function loadTestsFromJsonl(evalFilePath, repoRoot, options) {
2500
2523
  guideline_paths: guidelinePaths.map((guidelinePath) => path6.resolve(guidelinePath)),
2501
2524
  guideline_patterns: guidelinePatterns,
2502
2525
  file_paths: allFilePaths,
2503
- criteria: outcome,
2526
+ criteria: outcome ?? "",
2504
2527
  evaluator: evalCaseEvaluatorKind,
2505
2528
  evaluators
2506
2529
  };
@@ -2813,7 +2836,7 @@ async function readTestSuiteMetadata(testFilePath) {
2813
2836
  try {
2814
2837
  const absolutePath = path8.resolve(testFilePath);
2815
2838
  const content = await readFile7(absolutePath, "utf8");
2816
- const parsed = parse2(content);
2839
+ const parsed = interpolateEnv(parse2(content), process.env);
2817
2840
  if (!isJsonObject(parsed)) {
2818
2841
  return {};
2819
2842
  }
@@ -2863,11 +2886,11 @@ async function loadTestsFromYaml(evalFilePath, repoRoot, options) {
2863
2886
  const config = await loadConfig(absoluteTestPath, repoRootPath);
2864
2887
  const guidelinePatterns = config?.guideline_patterns;
2865
2888
  const rawFile = await readFile7(absoluteTestPath, "utf8");
2866
- const parsed = parse2(rawFile);
2867
- if (!isJsonObject(parsed)) {
2889
+ const interpolated = interpolateEnv(parse2(rawFile), process.env);
2890
+ if (!isJsonObject(interpolated)) {
2868
2891
  throw new Error(`Invalid test file format: ${evalFilePath}`);
2869
2892
  }
2870
- const suite = parsed;
2893
+ const suite = interpolated;
2871
2894
  const datasetNameFromSuite = asString6(suite.dataset)?.trim();
2872
2895
  const fallbackDataset = path8.basename(absoluteTestPath).replace(/\.ya?ml$/i, "") || "eval";
2873
2896
  const datasetName = datasetNameFromSuite && datasetNameFromSuite.length > 0 ? datasetNameFromSuite : fallbackDataset;
@@ -2911,9 +2934,10 @@ async function loadTestsFromYaml(evalFilePath, repoRoot, options) {
2911
2934
  }
2912
2935
  const testInputMessages = resolveInputMessages(evalcase);
2913
2936
  const expectedMessages = resolveExpectedMessages(evalcase) ?? [];
2914
- if (!id || !outcome || !testInputMessages || testInputMessages.length === 0) {
2937
+ const hasEvaluationSpec = !!outcome || expectedMessages.length > 0 || evalcase.assert !== void 0;
2938
+ if (!id || !hasEvaluationSpec || !testInputMessages || testInputMessages.length === 0) {
2915
2939
  logError2(
2916
- `Skipping incomplete test: ${id ?? "unknown"}. Missing required fields: id, criteria, and/or input`
2940
+ `Skipping incomplete test: ${id ?? "unknown"}. Missing required fields: id, input, and at least one of criteria/expected_output/assert`
2917
2941
  );
2918
2942
  continue;
2919
2943
  }
@@ -3009,7 +3033,7 @@ async function loadTestsFromYaml(evalFilePath, repoRoot, options) {
3009
3033
  guideline_paths: guidelinePaths.map((guidelinePath) => path8.resolve(guidelinePath)),
3010
3034
  guideline_patterns: guidelinePatterns,
3011
3035
  file_paths: allFilePaths,
3012
- criteria: outcome,
3036
+ criteria: outcome ?? "",
3013
3037
  evaluator: evalCaseEvaluatorKind,
3014
3038
  evaluators,
3015
3039
  workspace: mergedWorkspace,
@@ -3119,30 +3143,24 @@ function parseWorkspaceHookConfig(raw, evalFileDir) {
3119
3143
  const script = parseWorkspaceScriptConfig(raw, evalFileDir);
3120
3144
  const obj = raw;
3121
3145
  const reset = obj.reset === "none" || obj.reset === "fast" || obj.reset === "strict" ? obj.reset : void 0;
3122
- const clean = obj.clean === "always" || obj.clean === "on_success" || obj.clean === "on_failure" || obj.clean === "never" ? obj.clean : void 0;
3123
- if (!script && !reset && !clean) return void 0;
3146
+ if (!script && !reset) return void 0;
3124
3147
  return {
3125
3148
  ...script ?? {},
3126
- ...reset !== void 0 && { reset },
3127
- ...clean !== void 0 && { clean }
3149
+ ...reset !== void 0 && { reset }
3128
3150
  };
3129
3151
  }
3130
3152
  function parseWorkspaceHooksConfig(raw, evalFileDir) {
3131
3153
  if (!isJsonObject(raw)) return void 0;
3132
3154
  const obj = raw;
3133
- const beforeAllTests = parseWorkspaceHookConfig(obj.before_all_tests, evalFileDir);
3134
- const beforeEachTest = parseWorkspaceHookConfig(obj.before_each_test, evalFileDir);
3135
- const afterEachTest = parseWorkspaceHookConfig(obj.after_each_test, evalFileDir);
3136
- const afterAllTests = parseWorkspaceHookConfig(obj.after_all_tests, evalFileDir);
3137
- const onReuse = parseWorkspaceHookConfig(obj.on_reuse, evalFileDir);
3138
- const onFinish = parseWorkspaceHookConfig(obj.on_finish, evalFileDir);
3155
+ const beforeAll = parseWorkspaceHookConfig(obj.before_all, evalFileDir);
3156
+ const beforeEach = parseWorkspaceHookConfig(obj.before_each, evalFileDir);
3157
+ const afterEach = parseWorkspaceHookConfig(obj.after_each, evalFileDir);
3158
+ const afterAll = parseWorkspaceHookConfig(obj.after_all, evalFileDir);
3139
3159
  const hooks = {
3140
- ...beforeAllTests !== void 0 && { before_all_tests: beforeAllTests },
3141
- ...beforeEachTest !== void 0 && { before_each_test: beforeEachTest },
3142
- ...afterEachTest !== void 0 && { after_each_test: afterEachTest },
3143
- ...afterAllTests !== void 0 && { after_all_tests: afterAllTests },
3144
- ...onReuse !== void 0 && { on_reuse: onReuse },
3145
- ...onFinish !== void 0 && { on_finish: onFinish }
3160
+ ...beforeAll !== void 0 && { before_all: beforeAll },
3161
+ ...beforeEach !== void 0 && { before_each: beforeEach },
3162
+ ...afterEach !== void 0 && { after_each: afterEach },
3163
+ ...afterAll !== void 0 && { after_all: afterAll }
3146
3164
  };
3147
3165
  return Object.keys(hooks).length > 0 ? hooks : void 0;
3148
3166
  }
@@ -3155,7 +3173,7 @@ async function resolveWorkspaceConfig(raw, evalFileDir) {
3155
3173
  } catch {
3156
3174
  throw new Error(`Workspace file not found: ${raw} (resolved to ${workspaceFilePath})`);
3157
3175
  }
3158
- const parsed = parse2(content);
3176
+ const parsed = interpolateEnv(parse2(content), process.env);
3159
3177
  if (!isJsonObject(parsed)) {
3160
3178
  throw new Error(
3161
3179
  `Invalid workspace file format: ${workspaceFilePath} (expected a YAML object)`
@@ -3203,18 +3221,10 @@ function mergeWorkspaceConfigs(suiteLevel, caseLevel) {
3203
3221
  };
3204
3222
  };
3205
3223
  const mergedHooks = {
3206
- before_all_tests: mergeHook(
3207
- suiteLevel.hooks?.before_all_tests,
3208
- caseLevel.hooks?.before_all_tests
3209
- ),
3210
- before_each_test: mergeHook(
3211
- suiteLevel.hooks?.before_each_test,
3212
- caseLevel.hooks?.before_each_test
3213
- ),
3214
- after_each_test: mergeHook(suiteLevel.hooks?.after_each_test, caseLevel.hooks?.after_each_test),
3215
- after_all_tests: mergeHook(suiteLevel.hooks?.after_all_tests, caseLevel.hooks?.after_all_tests),
3216
- on_reuse: mergeHook(suiteLevel.hooks?.on_reuse, caseLevel.hooks?.on_reuse),
3217
- on_finish: mergeHook(suiteLevel.hooks?.on_finish, caseLevel.hooks?.on_finish)
3224
+ before_all: mergeHook(suiteLevel.hooks?.before_all, caseLevel.hooks?.before_all),
3225
+ before_each: mergeHook(suiteLevel.hooks?.before_each, caseLevel.hooks?.before_each),
3226
+ after_each: mergeHook(suiteLevel.hooks?.after_each, caseLevel.hooks?.after_each),
3227
+ after_all: mergeHook(suiteLevel.hooks?.after_all, caseLevel.hooks?.after_all)
3218
3228
  };
3219
3229
  const hasHooks = Object.values(mergedHooks).some((hook) => hook !== void 0);
3220
3230
  return {
@@ -5344,6 +5354,7 @@ var CopilotCliProvider = class {
5344
5354
  const agentProcess = spawn(executable, args, {
5345
5355
  stdio: ["pipe", "pipe", "inherit"]
5346
5356
  });
5357
+ await waitForProcessSpawn(agentProcess, executable, this.targetName);
5347
5358
  const toolCallsInProgress = /* @__PURE__ */ new Map();
5348
5359
  const completedToolCalls = [];
5349
5360
  let finalContent = "";
@@ -5623,6 +5634,47 @@ var CopilotCliProvider = class {
5623
5634
  }
5624
5635
  }
5625
5636
  };
5637
+ async function waitForProcessSpawn(proc, executable, targetName) {
5638
+ if (proc.pid) {
5639
+ return;
5640
+ }
5641
+ await new Promise((resolve, reject) => {
5642
+ const onSpawn = () => {
5643
+ cleanup();
5644
+ resolve();
5645
+ };
5646
+ const onError = (error) => {
5647
+ cleanup();
5648
+ reject(new Error(formatCopilotSpawnError(error, executable, targetName)));
5649
+ };
5650
+ const cleanup = () => {
5651
+ proc.off("spawn", onSpawn);
5652
+ proc.off("error", onError);
5653
+ };
5654
+ proc.once("spawn", onSpawn);
5655
+ proc.once("error", onError);
5656
+ });
5657
+ }
5658
+ function formatCopilotSpawnError(error, executable, targetName) {
5659
+ const code = error.code;
5660
+ const base = `Failed to start Copilot CLI executable '${executable}' for target '${targetName}'. ${error.message}`;
5661
+ if (process.platform !== "win32") {
5662
+ return base;
5663
+ }
5664
+ if (code !== "ENOENT" && code !== "EINVAL") {
5665
+ return base;
5666
+ }
5667
+ return `${base}
5668
+
5669
+ On Windows, shell commands like 'copilot -h' can work via .ps1/.bat shims, but AgentV launches a subprocess that needs a directly spawnable executable path.
5670
+
5671
+ Fix options:
5672
+ 1) Install native Copilot binary package:
5673
+ npm install -g @github/copilot-win32-x64
5674
+ 2) Set explicit executable for Copilot targets:
5675
+ - In .env: COPILOT_EXE=C:\\Users\\<you>\\AppData\\Roaming\\npm\\node_modules\\@github\\copilot-win32-x64\\copilot.exe
5676
+ - In .agentv/targets.yaml: executable: \${{ COPILOT_EXE }}`;
5677
+ }
5626
5678
  function summarizeAcpEvent(eventType, data) {
5627
5679
  if (!data || typeof data !== "object") {
5628
5680
  return eventType;
@@ -13557,9 +13609,8 @@ async function runEvaluation(options) {
13557
13609
  const hasSharedWorkspace = !!(useStaticWorkspace || workspaceTemplate || suiteWorkspace?.hooks || suiteWorkspace?.repos?.length && !isPerTestIsolation);
13558
13610
  const poolEnabled = configuredMode === "pooled" ? true : configuredMode === "ephemeral" || useStaticWorkspace ? false : suiteWorkspace?.pool ?? poolWorkspaces ?? true;
13559
13611
  const usePool = poolEnabled !== false && !!suiteWorkspace?.repos?.length && !isPerTestIsolation && !useStaticWorkspace;
13560
- const finishCleanPolicy = suiteWorkspace?.hooks?.on_finish?.clean;
13561
- const resolvedRetainOnSuccess = (finishCleanPolicy === "always" || finishCleanPolicy === "on_success" ? "cleanup" : finishCleanPolicy === "on_failure" || finishCleanPolicy === "never" ? "keep" : void 0) ?? retainOnSuccess ?? (keepWorkspaces ? "keep" : "cleanup");
13562
- const resolvedRetainOnFailure = (finishCleanPolicy === "always" || finishCleanPolicy === "on_failure" ? "cleanup" : finishCleanPolicy === "on_success" || finishCleanPolicy === "never" ? "keep" : void 0) ?? retainOnFailure ?? (cleanupWorkspaces ? "cleanup" : "keep");
13612
+ const resolvedRetainOnSuccess = retainOnSuccess ?? (keepWorkspaces ? "keep" : "cleanup");
13613
+ const resolvedRetainOnFailure = retainOnFailure ?? (cleanupWorkspaces ? "cleanup" : "keep");
13563
13614
  const requestedWorkers = options.maxConcurrency ?? target.workers ?? 1;
13564
13615
  const workers = hasSharedWorkspace && !usePool ? 1 : requestedWorkers;
13565
13616
  setupLog(
@@ -13594,7 +13645,7 @@ async function runEvaluation(options) {
13594
13645
  repos: suiteWorkspace.repos,
13595
13646
  maxSlots: poolMaxSlots,
13596
13647
  repoManager: poolRepoManager,
13597
- poolReset: (workspaceClean === "full" ? "strict" : workspaceClean === "standard" ? "fast" : null) ?? suiteWorkspace.hooks?.on_reuse?.reset ?? "fast"
13648
+ poolReset: (workspaceClean === "full" ? "strict" : workspaceClean === "standard" ? "fast" : null) ?? "fast"
13598
13649
  });
13599
13650
  poolSlots.push(slot);
13600
13651
  setupLog(`pool slot ${i} acquired at: ${slot.path} (existing=${slot.isExisting})`);
@@ -13645,7 +13696,7 @@ async function runEvaluation(options) {
13645
13696
  throw new Error(`Failed to materialize repos: ${message}`);
13646
13697
  }
13647
13698
  }
13648
- const suiteBeforeAllHook = suiteWorkspace?.hooks?.before_all_tests;
13699
+ const suiteBeforeAllHook = suiteWorkspace?.hooks?.before_all;
13649
13700
  if (sharedWorkspacePath && hasHookCommand(suiteBeforeAllHook)) {
13650
13701
  const beforeAllHook = suiteBeforeAllHook;
13651
13702
  const beforeAllCommand = (beforeAllHook.command ?? beforeAllHook.script ?? []).join(" ");
@@ -13660,7 +13711,7 @@ async function runEvaluation(options) {
13660
13711
  };
13661
13712
  try {
13662
13713
  beforeAllOutput = await executeWorkspaceScript(
13663
- toScriptConfig(beforeAllHook, "before_all_tests", "suite workspace"),
13714
+ toScriptConfig(beforeAllHook, "before_all", "suite workspace"),
13664
13715
  scriptContext
13665
13716
  );
13666
13717
  setupLog("shared before_all completed");
@@ -13685,7 +13736,7 @@ async function runEvaluation(options) {
13685
13736
  };
13686
13737
  try {
13687
13738
  const output = await executeWorkspaceScript(
13688
- toScriptConfig(beforeAllHook, "before_all_tests", "suite workspace"),
13739
+ toScriptConfig(beforeAllHook, "before_all", "suite workspace"),
13689
13740
  scriptContext
13690
13741
  );
13691
13742
  if (!beforeAllOutput) beforeAllOutput = output;
@@ -13914,7 +13965,7 @@ async function runEvaluation(options) {
13914
13965
  }
13915
13966
  }
13916
13967
  const afterAllWorkspaces = poolSlots.length > 1 ? poolSlots.map((s) => s.path) : sharedWorkspacePath ? [sharedWorkspacePath] : [];
13917
- const suiteAfterAllHook = suiteWorkspace?.hooks?.after_all_tests;
13968
+ const suiteAfterAllHook = suiteWorkspace?.hooks?.after_all;
13918
13969
  if (afterAllWorkspaces.length > 0 && hasHookCommand(suiteAfterAllHook)) {
13919
13970
  const afterAllHook = suiteAfterAllHook;
13920
13971
  for (const wsPath of afterAllWorkspaces) {
@@ -13926,7 +13977,7 @@ async function runEvaluation(options) {
13926
13977
  };
13927
13978
  try {
13928
13979
  const afterAllOutput = await executeWorkspaceScript(
13929
- toScriptConfig(afterAllHook, "after_all_tests", "suite workspace"),
13980
+ toScriptConfig(afterAllHook, "after_all", "suite workspace"),
13930
13981
  scriptContext,
13931
13982
  "warn"
13932
13983
  );
@@ -14223,7 +14274,7 @@ async function runEvalCase(options) {
14223
14274
  );
14224
14275
  }
14225
14276
  }
14226
- const caseBeforeAllHook = evalCase.workspace?.hooks?.before_all_tests;
14277
+ const caseBeforeAllHook = evalCase.workspace?.hooks?.before_all;
14227
14278
  if (workspacePath && hasHookCommand(caseBeforeAllHook)) {
14228
14279
  const beforeAllHook = caseBeforeAllHook;
14229
14280
  const beforeAllCommand = (beforeAllHook.command ?? beforeAllHook.script ?? []).join(" ");
@@ -14242,7 +14293,7 @@ async function runEvalCase(options) {
14242
14293
  };
14243
14294
  try {
14244
14295
  beforeAllOutput = await executeWorkspaceScript(
14245
- toScriptConfig(beforeAllHook, "before_all_tests", `test '${evalCase.id}'`),
14296
+ toScriptConfig(beforeAllHook, "before_all", `test '${evalCase.id}'`),
14246
14297
  scriptContext
14247
14298
  );
14248
14299
  if (setupDebug) {
@@ -14267,7 +14318,7 @@ async function runEvalCase(options) {
14267
14318
  }
14268
14319
  }
14269
14320
  }
14270
- const caseBeforeEachHook = evalCase.workspace?.hooks?.before_each_test;
14321
+ const caseBeforeEachHook = evalCase.workspace?.hooks?.before_each;
14271
14322
  if (workspacePath && hasHookCommand(caseBeforeEachHook)) {
14272
14323
  const beforeEachHook = caseBeforeEachHook;
14273
14324
  const scriptContext = {
@@ -14280,7 +14331,7 @@ async function runEvalCase(options) {
14280
14331
  };
14281
14332
  try {
14282
14333
  beforeEachOutput = await executeWorkspaceScript(
14283
- toScriptConfig(beforeEachHook, "before_each_test", `test '${evalCase.id}'`),
14334
+ toScriptConfig(beforeEachHook, "before_each", `test '${evalCase.id}'`),
14284
14335
  scriptContext
14285
14336
  );
14286
14337
  } catch (error) {
@@ -14397,17 +14448,17 @@ async function runEvalCase(options) {
14397
14448
  }
14398
14449
  }
14399
14450
  const providerError = extractProviderError(providerResponse);
14400
- if (repoManager && workspacePath && evalCase.workspace?.hooks?.after_each_test?.reset && evalCase.workspace.hooks.after_each_test.reset !== "none" && evalCase.workspace.repos) {
14451
+ if (repoManager && workspacePath && evalCase.workspace?.hooks?.after_each?.reset && evalCase.workspace.hooks.after_each.reset !== "none" && evalCase.workspace.repos) {
14401
14452
  try {
14402
14453
  await repoManager.reset(
14403
14454
  evalCase.workspace.repos,
14404
14455
  workspacePath,
14405
- evalCase.workspace.hooks.after_each_test.reset
14456
+ evalCase.workspace.hooks.after_each.reset
14406
14457
  );
14407
14458
  } catch {
14408
14459
  }
14409
14460
  }
14410
- const caseAfterEachHook = evalCase.workspace?.hooks?.after_each_test;
14461
+ const caseAfterEachHook = evalCase.workspace?.hooks?.after_each;
14411
14462
  if (workspacePath && hasHookCommand(caseAfterEachHook)) {
14412
14463
  const afterEachHook = caseAfterEachHook;
14413
14464
  const scriptContext = {
@@ -14420,7 +14471,7 @@ async function runEvalCase(options) {
14420
14471
  };
14421
14472
  try {
14422
14473
  afterEachOutput = await executeWorkspaceScript(
14423
- toScriptConfig(afterEachHook, "after_each_test", `test '${evalCase.id}'`),
14474
+ toScriptConfig(afterEachHook, "after_each", `test '${evalCase.id}'`),
14424
14475
  scriptContext,
14425
14476
  "warn"
14426
14477
  );
@@ -14820,9 +14871,11 @@ async function runEvaluatorList(options) {
14820
14871
  registry: typeRegistry
14821
14872
  };
14822
14873
  for (const evaluatorConfig of evaluators ?? []) {
14874
+ const startedAt = /* @__PURE__ */ new Date();
14823
14875
  try {
14824
14876
  const evaluatorInstance = await typeRegistry.create(evaluatorConfig, dispatchContext);
14825
14877
  const score2 = await evaluatorInstance.evaluate(evalContext);
14878
+ const endedAt = /* @__PURE__ */ new Date();
14826
14879
  const weight = evaluatorConfig.weight ?? 1;
14827
14880
  scored.push({
14828
14881
  score: score2,
@@ -14843,9 +14896,13 @@ async function runEvaluatorList(options) {
14843
14896
  evaluatorProviderRequest: score2.evaluatorRawRequest,
14844
14897
  details: score2.details,
14845
14898
  scores: mapChildResults(score2.scores),
14846
- tokenUsage: score2.tokenUsage
14899
+ tokenUsage: score2.tokenUsage,
14900
+ durationMs: endedAt.getTime() - startedAt.getTime(),
14901
+ startedAt: startedAt.toISOString(),
14902
+ endedAt: endedAt.toISOString()
14847
14903
  });
14848
14904
  } catch (error) {
14905
+ const endedAt = /* @__PURE__ */ new Date();
14849
14906
  const message = error instanceof Error ? error.message : String(error);
14850
14907
  const fallbackScore = {
14851
14908
  score: 0,
@@ -14871,7 +14928,10 @@ async function runEvaluatorList(options) {
14871
14928
  verdict: "fail",
14872
14929
  hits: [],
14873
14930
  misses: [`Evaluator '${evaluatorConfig.name ?? "unknown"}' failed: ${message}`],
14874
- reasoning: message
14931
+ reasoning: message,
14932
+ durationMs: endedAt.getTime() - startedAt.getTime(),
14933
+ startedAt: startedAt.toISOString(),
14934
+ endedAt: endedAt.toISOString()
14875
14935
  });
14876
14936
  }
14877
14937
  if (evaluatorConfig.negate === true && scored.length > 0) {