@agentv/core 2.14.3 → 2.16.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
package/dist/index.js CHANGED
@@ -17,7 +17,7 @@ import {
17
17
  readTextFile,
18
18
  resolveFileReference,
19
19
  resolveTargetDefinition
20
- } from "./chunk-N55K52OO.js";
20
+ } from "./chunk-E6AJPAXM.js";
21
21
  import {
22
22
  OtlpJsonFileExporter
23
23
  } from "./chunk-HFSYZHGF.js";
@@ -612,6 +612,17 @@ function parseExecutionDefaults(raw, configPath) {
612
612
  } else if (otelFile !== void 0) {
613
613
  logWarning(`Invalid execution.otel_file in ${configPath}, expected non-empty string`);
614
614
  }
615
+ if (typeof obj.pool_workspaces === "boolean") {
616
+ result.pool_workspaces = obj.pool_workspaces;
617
+ } else if (obj.pool_workspaces !== void 0) {
618
+ logWarning(`Invalid execution.pool_workspaces in ${configPath}, expected boolean`);
619
+ }
620
+ const poolSlots = obj.pool_slots;
621
+ if (typeof poolSlots === "number" && Number.isInteger(poolSlots) && poolSlots >= 1 && poolSlots <= 50) {
622
+ result.pool_slots = poolSlots;
623
+ } else if (poolSlots !== void 0) {
624
+ logWarning(`Invalid execution.pool_slots in ${configPath}, expected integer 1-50`);
625
+ }
615
626
  return Object.keys(result).length > 0 ? result : void 0;
616
627
  }
617
628
  function logWarning(message) {
@@ -2053,6 +2064,7 @@ async function processMessages(options) {
2053
2064
  repoRootPath,
2054
2065
  guidelinePatterns,
2055
2066
  guidelinePaths,
2067
+ treatFileSegmentsAsGuidelines,
2056
2068
  textParts,
2057
2069
  messageType,
2058
2070
  verbose
@@ -2100,16 +2112,20 @@ async function processMessages(options) {
2100
2112
  }
2101
2113
  try {
2102
2114
  const fileContent = (await readFile4(resolvedPath, "utf8")).replace(/\r\n/g, "\n");
2103
- if (messageType === "input" && guidelinePatterns && guidelinePaths) {
2104
- const relativeToRepo = path5.relative(repoRootPath, resolvedPath);
2105
- if (isGuidelineFile(relativeToRepo, guidelinePatterns)) {
2106
- guidelinePaths.push(path5.resolve(resolvedPath));
2107
- if (verbose) {
2108
- console.log(` [Guideline] Found: ${displayPath}`);
2109
- console.log(` Resolved to: ${resolvedPath}`);
2110
- }
2111
- continue;
2115
+ const classifyAsGuideline = shouldTreatAsGuideline({
2116
+ messageType,
2117
+ resolvedPath,
2118
+ repoRootPath,
2119
+ guidelinePatterns,
2120
+ treatFileSegmentsAsGuidelines
2121
+ });
2122
+ if (classifyAsGuideline && guidelinePaths) {
2123
+ guidelinePaths.push(path5.resolve(resolvedPath));
2124
+ if (verbose) {
2125
+ console.log(` [Guideline] Found: ${displayPath}`);
2126
+ console.log(` Resolved to: ${resolvedPath}`);
2112
2127
  }
2128
+ continue;
2113
2129
  }
2114
2130
  segments.push({
2115
2131
  type: "file",
@@ -2138,6 +2154,26 @@ async function processMessages(options) {
2138
2154
  }
2139
2155
  return segments;
2140
2156
  }
2157
+ function shouldTreatAsGuideline(options) {
2158
+ const {
2159
+ messageType,
2160
+ resolvedPath,
2161
+ repoRootPath,
2162
+ guidelinePatterns,
2163
+ treatFileSegmentsAsGuidelines
2164
+ } = options;
2165
+ if (messageType !== "input") {
2166
+ return false;
2167
+ }
2168
+ if (treatFileSegmentsAsGuidelines) {
2169
+ return true;
2170
+ }
2171
+ if (!guidelinePatterns || guidelinePatterns.length === 0) {
2172
+ return false;
2173
+ }
2174
+ const relativeToRepo = path5.relative(repoRootPath, resolvedPath);
2175
+ return isGuidelineFile(relativeToRepo, guidelinePatterns);
2176
+ }
2141
2177
  function asString3(value) {
2142
2178
  return typeof value === "string" ? value : void 0;
2143
2179
  }
@@ -2476,6 +2512,8 @@ async function loadTestsFromJsonl(evalFilePath, repoRoot, options) {
2476
2512
  for (const guidelinePath of testCase.guideline_paths) {
2477
2513
  console.log(` - ${guidelinePath}`);
2478
2514
  }
2515
+ } else if (!guidelinePatterns || guidelinePatterns.length === 0) {
2516
+ console.log(" No guidelines found (guideline_patterns not configured)");
2479
2517
  } else {
2480
2518
  console.log(" No guidelines found");
2481
2519
  }
@@ -2845,7 +2883,7 @@ async function loadTestsFromYaml(evalFilePath, repoRoot, options) {
2845
2883
  } else {
2846
2884
  throw new Error(`Invalid test file format: ${evalFilePath} - missing 'tests' field`);
2847
2885
  }
2848
- const suiteWorkspace = parseWorkspaceConfig(suite.workspace, evalFileDir);
2886
+ const suiteWorkspace = await resolveWorkspaceConfig(suite.workspace, evalFileDir);
2849
2887
  const suiteInputMessages = expandInputShorthand(suite.input);
2850
2888
  const rawGlobalExecution = isJsonObject(suite.execution) ? suite.execution : void 0;
2851
2889
  const _globalTarget = asString6(rawGlobalExecution?.target) ?? asString6(suite.target);
@@ -2881,12 +2919,24 @@ async function loadTestsFromYaml(evalFilePath, repoRoot, options) {
2881
2919
  }
2882
2920
  const caseExecution = isJsonObject(evalcase.execution) ? evalcase.execution : void 0;
2883
2921
  const skipDefaults = caseExecution?.skip_defaults === true;
2884
- const inputMessages = suiteInputMessages && !skipDefaults ? [...suiteInputMessages, ...testInputMessages] : testInputMessages;
2922
+ const effectiveSuiteInputMessages = suiteInputMessages && !skipDefaults ? suiteInputMessages : void 0;
2923
+ const inputMessages = effectiveSuiteInputMessages ? [...effectiveSuiteInputMessages, ...testInputMessages] : testInputMessages;
2885
2924
  const hasExpectedMessages = expectedMessages.length > 0;
2886
2925
  const guidelinePaths = [];
2887
2926
  const inputTextParts = [];
2888
- const inputSegments = await processMessages({
2889
- messages: inputMessages,
2927
+ const suiteInputSegments = effectiveSuiteInputMessages ? await processMessages({
2928
+ messages: effectiveSuiteInputMessages,
2929
+ searchRoots,
2930
+ repoRootPath,
2931
+ guidelinePatterns,
2932
+ guidelinePaths,
2933
+ treatFileSegmentsAsGuidelines: true,
2934
+ textParts: inputTextParts,
2935
+ messageType: "input",
2936
+ verbose
2937
+ }) : [];
2938
+ const testInputSegments = await processMessages({
2939
+ messages: testInputMessages,
2890
2940
  searchRoots,
2891
2941
  repoRootPath,
2892
2942
  guidelinePatterns,
@@ -2895,6 +2945,7 @@ async function loadTestsFromYaml(evalFilePath, repoRoot, options) {
2895
2945
  messageType: "input",
2896
2946
  verbose
2897
2947
  });
2948
+ const inputSegments = [...suiteInputSegments, ...testInputSegments];
2898
2949
  const outputSegments = hasExpectedMessages ? await processExpectedMessages({
2899
2950
  messages: expectedMessages,
2900
2951
  searchRoots,
@@ -2942,7 +2993,7 @@ async function loadTestsFromYaml(evalFilePath, repoRoot, options) {
2942
2993
  ...guidelinePaths.map((guidelinePath) => path8.resolve(guidelinePath)),
2943
2994
  ...userFilePaths
2944
2995
  ];
2945
- const caseWorkspace = parseWorkspaceConfig(evalcase.workspace, evalFileDir);
2996
+ const caseWorkspace = await resolveWorkspaceConfig(evalcase.workspace, evalFileDir);
2946
2997
  const mergedWorkspace = mergeWorkspaceConfigs(suiteWorkspace, caseWorkspace);
2947
2998
  const metadata = isJsonObject(evalcase.metadata) ? evalcase.metadata : void 0;
2948
2999
  const caseTargets = extractTargetsFromTestCase(evalcase);
@@ -2973,6 +3024,8 @@ async function loadTestsFromYaml(evalFilePath, repoRoot, options) {
2973
3024
  for (const guidelinePath of testCase.guideline_paths) {
2974
3025
  console.log(` - ${guidelinePath}`);
2975
3026
  }
3027
+ } else if (!guidelinePatterns || guidelinePatterns.length === 0) {
3028
+ console.log(" No guidelines found (guideline_patterns not configured)");
2976
3029
  } else {
2977
3030
  console.log(" No guidelines found");
2978
3031
  }
@@ -3061,16 +3114,57 @@ function parseRepoConfig(raw) {
3061
3114
  ...clone !== void 0 && { clone }
3062
3115
  };
3063
3116
  }
3064
- function parseResetConfig(raw) {
3117
+ function parseWorkspaceHookConfig(raw, evalFileDir) {
3065
3118
  if (!isJsonObject(raw)) return void 0;
3119
+ const script = parseWorkspaceScriptConfig(raw, evalFileDir);
3066
3120
  const obj = raw;
3067
- const strategy = obj.strategy === "none" || obj.strategy === "hard" || obj.strategy === "recreate" ? obj.strategy : void 0;
3068
- const afterEach = typeof obj.after_each === "boolean" ? obj.after_each : void 0;
3069
- if (!strategy && afterEach === void 0) return void 0;
3121
+ const reset = obj.reset === "none" || obj.reset === "fast" || obj.reset === "strict" ? obj.reset : void 0;
3122
+ const clean = obj.clean === "always" || obj.clean === "on_success" || obj.clean === "on_failure" || obj.clean === "never" ? obj.clean : void 0;
3123
+ if (!script && !reset && !clean) return void 0;
3070
3124
  return {
3071
- ...strategy !== void 0 && { strategy },
3072
- ...afterEach !== void 0 && { after_each: afterEach }
3125
+ ...script ?? {},
3126
+ ...reset !== void 0 && { reset },
3127
+ ...clean !== void 0 && { clean }
3128
+ };
3129
+ }
3130
+ function parseWorkspaceHooksConfig(raw, evalFileDir) {
3131
+ if (!isJsonObject(raw)) return void 0;
3132
+ const obj = raw;
3133
+ const beforeAllTests = parseWorkspaceHookConfig(obj.before_all_tests, evalFileDir);
3134
+ const beforeEachTest = parseWorkspaceHookConfig(obj.before_each_test, evalFileDir);
3135
+ const afterEachTest = parseWorkspaceHookConfig(obj.after_each_test, evalFileDir);
3136
+ const afterAllTests = parseWorkspaceHookConfig(obj.after_all_tests, evalFileDir);
3137
+ const onReuse = parseWorkspaceHookConfig(obj.on_reuse, evalFileDir);
3138
+ const onFinish = parseWorkspaceHookConfig(obj.on_finish, evalFileDir);
3139
+ const hooks = {
3140
+ ...beforeAllTests !== void 0 && { before_all_tests: beforeAllTests },
3141
+ ...beforeEachTest !== void 0 && { before_each_test: beforeEachTest },
3142
+ ...afterEachTest !== void 0 && { after_each_test: afterEachTest },
3143
+ ...afterAllTests !== void 0 && { after_all_tests: afterAllTests },
3144
+ ...onReuse !== void 0 && { on_reuse: onReuse },
3145
+ ...onFinish !== void 0 && { on_finish: onFinish }
3073
3146
  };
3147
+ return Object.keys(hooks).length > 0 ? hooks : void 0;
3148
+ }
3149
+ async function resolveWorkspaceConfig(raw, evalFileDir) {
3150
+ if (typeof raw === "string") {
3151
+ const workspaceFilePath = path8.resolve(evalFileDir, raw);
3152
+ let content;
3153
+ try {
3154
+ content = await readFile7(workspaceFilePath, "utf8");
3155
+ } catch {
3156
+ throw new Error(`Workspace file not found: ${raw} (resolved to ${workspaceFilePath})`);
3157
+ }
3158
+ const parsed = parse2(content);
3159
+ if (!isJsonObject(parsed)) {
3160
+ throw new Error(
3161
+ `Invalid workspace file format: ${workspaceFilePath} (expected a YAML object)`
3162
+ );
3163
+ }
3164
+ const workspaceFileDir = path8.dirname(workspaceFilePath);
3165
+ return parseWorkspaceConfig(parsed, workspaceFileDir);
3166
+ }
3167
+ return parseWorkspaceConfig(raw, evalFileDir);
3074
3168
  }
3075
3169
  function parseWorkspaceConfig(raw, evalFileDir) {
3076
3170
  if (!isJsonObject(raw)) return void 0;
@@ -3081,37 +3175,56 @@ function parseWorkspaceConfig(raw, evalFileDir) {
3081
3175
  }
3082
3176
  const isolation = obj.isolation === "shared" || obj.isolation === "per_test" ? obj.isolation : void 0;
3083
3177
  const repos = Array.isArray(obj.repos) ? obj.repos.map(parseRepoConfig).filter(Boolean) : void 0;
3084
- const reset = parseResetConfig(obj.reset);
3085
- const beforeAll = parseWorkspaceScriptConfig(obj.before_all, evalFileDir);
3086
- const afterAll = parseWorkspaceScriptConfig(obj.after_all, evalFileDir);
3087
- const beforeEach = parseWorkspaceScriptConfig(obj.before_each, evalFileDir);
3088
- const afterEach = parseWorkspaceScriptConfig(obj.after_each, evalFileDir);
3089
- if (!template && !isolation && !repos && !reset && !beforeAll && !afterAll && !beforeEach && !afterEach)
3178
+ const hooks = parseWorkspaceHooksConfig(obj.hooks, evalFileDir);
3179
+ const mode = obj.mode === "pooled" || obj.mode === "ephemeral" || obj.mode === "static" ? obj.mode : void 0;
3180
+ const staticPath = typeof obj.static_path === "string" ? obj.static_path : void 0;
3181
+ const pool = typeof obj.pool === "boolean" ? obj.pool : void 0;
3182
+ if (!template && !isolation && !repos && !hooks && !mode && !staticPath && pool === void 0)
3090
3183
  return void 0;
3091
3184
  return {
3092
3185
  ...template !== void 0 && { template },
3093
3186
  ...isolation !== void 0 && { isolation },
3094
3187
  ...repos !== void 0 && { repos },
3095
- ...reset !== void 0 && { reset },
3096
- ...beforeAll !== void 0 && { before_all: beforeAll },
3097
- ...afterAll !== void 0 && { after_all: afterAll },
3098
- ...beforeEach !== void 0 && { before_each: beforeEach },
3099
- ...afterEach !== void 0 && { after_each: afterEach }
3188
+ ...hooks !== void 0 && { hooks },
3189
+ ...mode !== void 0 && { mode },
3190
+ ...staticPath !== void 0 && { static_path: staticPath },
3191
+ ...pool !== void 0 && { pool }
3100
3192
  };
3101
3193
  }
3102
3194
  function mergeWorkspaceConfigs(suiteLevel, caseLevel) {
3103
3195
  if (!suiteLevel && !caseLevel) return void 0;
3104
3196
  if (!suiteLevel) return caseLevel;
3105
3197
  if (!caseLevel) return suiteLevel;
3198
+ const mergeHook = (suiteHook, caseHook) => {
3199
+ if (!suiteHook && !caseHook) return void 0;
3200
+ return {
3201
+ ...suiteHook ?? {},
3202
+ ...caseHook ?? {}
3203
+ };
3204
+ };
3205
+ const mergedHooks = {
3206
+ before_all_tests: mergeHook(
3207
+ suiteLevel.hooks?.before_all_tests,
3208
+ caseLevel.hooks?.before_all_tests
3209
+ ),
3210
+ before_each_test: mergeHook(
3211
+ suiteLevel.hooks?.before_each_test,
3212
+ caseLevel.hooks?.before_each_test
3213
+ ),
3214
+ after_each_test: mergeHook(suiteLevel.hooks?.after_each_test, caseLevel.hooks?.after_each_test),
3215
+ after_all_tests: mergeHook(suiteLevel.hooks?.after_all_tests, caseLevel.hooks?.after_all_tests),
3216
+ on_reuse: mergeHook(suiteLevel.hooks?.on_reuse, caseLevel.hooks?.on_reuse),
3217
+ on_finish: mergeHook(suiteLevel.hooks?.on_finish, caseLevel.hooks?.on_finish)
3218
+ };
3219
+ const hasHooks = Object.values(mergedHooks).some((hook) => hook !== void 0);
3106
3220
  return {
3107
3221
  template: caseLevel.template ?? suiteLevel.template,
3108
3222
  isolation: caseLevel.isolation ?? suiteLevel.isolation,
3109
3223
  repos: caseLevel.repos ?? suiteLevel.repos,
3110
- reset: caseLevel.reset ?? suiteLevel.reset,
3111
- before_all: caseLevel.before_all ?? suiteLevel.before_all,
3112
- after_all: caseLevel.after_all ?? suiteLevel.after_all,
3113
- before_each: caseLevel.before_each ?? suiteLevel.before_each,
3114
- after_each: caseLevel.after_each ?? suiteLevel.after_each
3224
+ ...hasHooks && { hooks: mergedHooks },
3225
+ mode: caseLevel.mode ?? suiteLevel.mode,
3226
+ static_path: caseLevel.static_path ?? suiteLevel.static_path,
3227
+ pool: caseLevel.pool ?? suiteLevel.pool
3115
3228
  };
3116
3229
  }
3117
3230
  function asString6(value) {
@@ -7165,15 +7278,15 @@ function getAgentvHome() {
7165
7278
  function getWorkspacesRoot() {
7166
7279
  return path21.join(getAgentvHome(), "workspaces");
7167
7280
  }
7168
- function getGitCacheRoot() {
7169
- return path21.join(getAgentvHome(), "git-cache");
7170
- }
7171
7281
  function getSubagentsRoot() {
7172
7282
  return path21.join(getAgentvHome(), "subagents");
7173
7283
  }
7174
7284
  function getTraceStateRoot() {
7175
7285
  return path21.join(getAgentvHome(), "trace-state");
7176
7286
  }
7287
+ function getWorkspacePoolRoot() {
7288
+ return path21.join(getAgentvHome(), "workspace-pool");
7289
+ }
7177
7290
 
7178
7291
  // src/evaluation/providers/vscode/dispatch/constants.ts
7179
7292
  var DEFAULT_LOCK_NAME = "subagent.lock";
@@ -7996,8 +8109,6 @@ var AGENTV_REQUEST_TEMPLATE = `[[ ## task ## ]]
7996
8109
 
7997
8110
  **IMPORTANT**: Follow these exact steps:
7998
8111
  1. Create and write your complete response to: {{responseFileTmp}}
7999
- - All intended file outputs/changes MUST be written in your response file.
8000
- - For each intended file, include the repo name, relative path and unified git diff following the convention \`diff --git ...\`.
8001
8112
  2. When completely finished, run these PowerShell commands to signal completion:
8002
8113
  \`\`\`
8003
8114
  Move-Item -LiteralPath '{{responseFileTmp}}' -Destination '{{responseFileFinal}}'
@@ -8014,8 +8125,6 @@ var AGENTV_BATCH_REQUEST_TEMPLATE = `[[ ## task ## ]]
8014
8125
 
8015
8126
  **IMPORTANT**: Follow these exact steps:
8016
8127
  1. Create and write your complete response to: {{responseFileTmp}}
8017
- - All intended file outputs/changes MUST be written in your response file.
8018
- - For each intended file, include the repo name, relative path and unified git diff following the convention \`diff --git ...\`.
8019
8128
  2. When completely finished and the response is stable, rename it to: {{responseFileFinal}}
8020
8129
  3. Do not unlock the workspace from this request; batch orchestration will handle unlocking after all responses are ready.
8021
8130
  `;
@@ -8628,15 +8737,15 @@ async function execFileWithStdinNode(argv, stdinPayload, options) {
8628
8737
  });
8629
8738
  }
8630
8739
  async function execShellWithStdin(command, stdinPayload, options = {}) {
8631
- const { mkdir: mkdir14, readFile: readFile12, rm: rm6, writeFile: writeFile9 } = await import("node:fs/promises");
8740
+ const { mkdir: mkdir14, readFile: readFile13, rm: rm6, writeFile: writeFile9 } = await import("node:fs/promises");
8632
8741
  const { tmpdir: tmpdir3 } = await import("node:os");
8633
- const path41 = await import("node:path");
8742
+ const path42 = await import("node:path");
8634
8743
  const { randomUUID: randomUUID8 } = await import("node:crypto");
8635
- const dir = path41.join(tmpdir3(), `agentv-exec-${randomUUID8()}`);
8744
+ const dir = path42.join(tmpdir3(), `agentv-exec-${randomUUID8()}`);
8636
8745
  await mkdir14(dir, { recursive: true });
8637
- const stdinPath = path41.join(dir, "stdin.txt");
8638
- const stdoutPath = path41.join(dir, "stdout.txt");
8639
- const stderrPath = path41.join(dir, "stderr.txt");
8746
+ const stdinPath = path42.join(dir, "stdin.txt");
8747
+ const stdoutPath = path42.join(dir, "stdout.txt");
8748
+ const stderrPath = path42.join(dir, "stderr.txt");
8640
8749
  await writeFile9(stdinPath, stdinPayload, "utf8");
8641
8750
  const wrappedCommand = process.platform === "win32" ? `(${command}) < ${shellEscapePath(stdinPath)} > ${shellEscapePath(stdoutPath)} 2> ${shellEscapePath(stderrPath)}` : `(${command}) < ${shellEscapePath(stdinPath)} > ${shellEscapePath(stdoutPath)} 2> ${shellEscapePath(stderrPath)}`;
8642
8751
  const { spawn: spawn4 } = await import("node:child_process");
@@ -8666,8 +8775,8 @@ async function execShellWithStdin(command, stdinPayload, options = {}) {
8666
8775
  resolve(code ?? 0);
8667
8776
  });
8668
8777
  });
8669
- const stdout = (await readFile12(stdoutPath, "utf8")).replace(/\r\n/g, "\n");
8670
- const stderr = (await readFile12(stderrPath, "utf8")).replace(/\r\n/g, "\n");
8778
+ const stdout = (await readFile13(stdoutPath, "utf8")).replace(/\r\n/g, "\n");
8779
+ const stderr = (await readFile13(stderrPath, "utf8")).replace(/\r\n/g, "\n");
8671
8780
  return { stdout, stderr, exitCode };
8672
8781
  } finally {
8673
8782
  await rm6(dir, { recursive: true, force: true });
@@ -8988,7 +9097,7 @@ var CodeEvaluator = class {
8988
9097
  outputPath,
8989
9098
  guidelineFiles: context.evalCase.guideline_paths,
8990
9099
  inputFiles: context.evalCase.file_paths.filter(
8991
- (path41) => !context.evalCase.guideline_paths.includes(path41)
9100
+ (path42) => !context.evalCase.guideline_paths.includes(path42)
8992
9101
  ),
8993
9102
  input: context.evalCase.input,
8994
9103
  trace: context.trace ?? null,
@@ -9238,6 +9347,8 @@ ${context.fileChanges}`;
9238
9347
  };
9239
9348
  } catch (e) {
9240
9349
  const message = e instanceof Error ? e.message : String(e);
9350
+ const evalName = context.evaluator?.name ?? "llm-judge";
9351
+ console.warn(`\u26A0 LLM judge "${evalName}" failed after 3 attempts (${message}) \u2014 skipped`);
9241
9352
  return {
9242
9353
  score: 0,
9243
9354
  verdict: "skip",
@@ -9266,24 +9377,39 @@ ${context.fileChanges}`;
9266
9377
  systemPrompt,
9267
9378
  target: judgeProvider.targetName
9268
9379
  };
9269
- const { data, tokenUsage } = await this.runWithRetry({
9270
- context,
9271
- judgeProvider,
9272
- systemPrompt,
9273
- userPrompt: prompt,
9274
- schema: rubricEvaluationSchema
9275
- });
9276
- const { score, verdict, hits, misses } = calculateRubricScore(data, rubrics);
9277
- return {
9278
- score,
9279
- verdict,
9280
- hits,
9281
- misses,
9282
- expectedAspectCount: rubrics.length,
9283
- reasoning: data.overall_reasoning,
9284
- evaluatorRawRequest,
9285
- tokenUsage
9286
- };
9380
+ try {
9381
+ const { data, tokenUsage } = await this.runWithRetry({
9382
+ context,
9383
+ judgeProvider,
9384
+ systemPrompt,
9385
+ userPrompt: prompt,
9386
+ schema: rubricEvaluationSchema
9387
+ });
9388
+ const { score, verdict, hits, misses } = calculateRubricScore(data, rubrics);
9389
+ return {
9390
+ score,
9391
+ verdict,
9392
+ hits,
9393
+ misses,
9394
+ expectedAspectCount: rubrics.length,
9395
+ reasoning: data.overall_reasoning,
9396
+ evaluatorRawRequest,
9397
+ tokenUsage
9398
+ };
9399
+ } catch (e) {
9400
+ const message = e instanceof Error ? e.message : String(e);
9401
+ const evalName = context.evaluator?.name ?? "llm-judge";
9402
+ console.warn(`\u26A0 LLM judge "${evalName}" failed after 3 attempts (${message}) \u2014 skipped`);
9403
+ return {
9404
+ score: 0,
9405
+ verdict: "skip",
9406
+ hits: [],
9407
+ misses: [`Judge parse failure after 3 attempts: ${message}`],
9408
+ expectedAspectCount: rubrics.length,
9409
+ reasoning: `Judge parse failure after 3 attempts: ${message}`,
9410
+ evaluatorRawRequest
9411
+ };
9412
+ }
9287
9413
  }
9288
9414
  /**
9289
9415
  * Evaluate using score-range rubrics (analytic rubric scoring).
@@ -9297,25 +9423,40 @@ ${context.fileChanges}`;
9297
9423
  systemPrompt,
9298
9424
  target: judgeProvider.targetName
9299
9425
  };
9300
- const { data, tokenUsage } = await this.runWithRetry({
9301
- context,
9302
- judgeProvider,
9303
- systemPrompt,
9304
- userPrompt: prompt,
9305
- schema: scoreRangeEvaluationSchema
9306
- });
9307
- const { score, verdict, hits, misses, details } = calculateScoreRangeResult(data, rubrics);
9308
- return {
9309
- score,
9310
- verdict,
9311
- hits,
9312
- misses,
9313
- expectedAspectCount: rubrics.length,
9314
- reasoning: data.overall_reasoning,
9315
- evaluatorRawRequest,
9316
- details,
9317
- tokenUsage
9318
- };
9426
+ try {
9427
+ const { data, tokenUsage } = await this.runWithRetry({
9428
+ context,
9429
+ judgeProvider,
9430
+ systemPrompt,
9431
+ userPrompt: prompt,
9432
+ schema: scoreRangeEvaluationSchema
9433
+ });
9434
+ const { score, verdict, hits, misses, details } = calculateScoreRangeResult(data, rubrics);
9435
+ return {
9436
+ score,
9437
+ verdict,
9438
+ hits,
9439
+ misses,
9440
+ expectedAspectCount: rubrics.length,
9441
+ reasoning: data.overall_reasoning,
9442
+ evaluatorRawRequest,
9443
+ details,
9444
+ tokenUsage
9445
+ };
9446
+ } catch (e) {
9447
+ const message = e instanceof Error ? e.message : String(e);
9448
+ const evalName = context.evaluator?.name ?? "llm-judge";
9449
+ console.warn(`\u26A0 LLM judge "${evalName}" failed after 3 attempts (${message}) \u2014 skipped`);
9450
+ return {
9451
+ score: 0,
9452
+ verdict: "skip",
9453
+ hits: [],
9454
+ misses: [`Judge parse failure after 3 attempts: ${message}`],
9455
+ expectedAspectCount: rubrics.length,
9456
+ reasoning: `Judge parse failure after 3 attempts: ${message}`,
9457
+ evaluatorRawRequest
9458
+ };
9459
+ }
9319
9460
  }
9320
9461
  /**
9321
9462
  * Build prompt for score-range rubric evaluation.
@@ -9601,19 +9742,13 @@ var CompositeEvaluator = class {
9601
9742
  runWeightedAverage(results, weights) {
9602
9743
  let totalWeight = 0;
9603
9744
  let weightedSum = 0;
9745
+ let evaluatedCount = 0;
9604
9746
  const allHits = [];
9605
9747
  const allMisses = [];
9606
9748
  const reasoningParts = [];
9607
9749
  const scores = [];
9608
9750
  for (const member of results) {
9609
9751
  const weight = weights?.[member.id] ?? 1;
9610
- totalWeight += weight;
9611
- weightedSum += member.result.score * weight;
9612
- allHits.push(...member.result.hits.map((h) => `[${member.id}] ${h}`));
9613
- allMisses.push(...member.result.misses.map((m) => `[${member.id}] ${m}`));
9614
- if (member.result.reasoning) {
9615
- reasoningParts.push(`${member.id}: ${member.result.reasoning}`);
9616
- }
9617
9752
  scores.push({
9618
9753
  name: member.id,
9619
9754
  type: member.type,
@@ -9628,6 +9763,32 @@ var CompositeEvaluator = class {
9628
9763
  details: member.result.details,
9629
9764
  tokenUsage: member.result.tokenUsage
9630
9765
  });
9766
+ if (member.result.verdict === "skip") {
9767
+ continue;
9768
+ }
9769
+ evaluatedCount++;
9770
+ totalWeight += weight;
9771
+ weightedSum += member.result.score * weight;
9772
+ allHits.push(...member.result.hits.map((h) => `[${member.id}] ${h}`));
9773
+ allMisses.push(...member.result.misses.map((m) => `[${member.id}] ${m}`));
9774
+ if (member.result.reasoning) {
9775
+ reasoningParts.push(`${member.id}: ${member.result.reasoning}`);
9776
+ }
9777
+ }
9778
+ if (evaluatedCount === 0 && results.length > 0) {
9779
+ return {
9780
+ score: 0,
9781
+ verdict: "skip",
9782
+ hits: [],
9783
+ misses: [],
9784
+ expectedAspectCount: 1,
9785
+ reasoning: "All evaluators skipped (infrastructure failure)",
9786
+ evaluatorRawRequest: {
9787
+ aggregator: "weighted_average",
9788
+ ...weights ? { weights } : {}
9789
+ },
9790
+ scores
9791
+ };
9631
9792
  }
9632
9793
  const finalScore = totalWeight > 0 ? weightedSum / totalWeight : 0;
9633
9794
  return {
@@ -9651,19 +9812,8 @@ var CompositeEvaluator = class {
9651
9812
  const reasoningParts = [];
9652
9813
  let passingCount = 0;
9653
9814
  let borderlineCount = 0;
9815
+ let evaluatedCount = 0;
9654
9816
  for (const member of results) {
9655
- const isPassing = member.result.verdict === "pass" || member.result.verdict === "borderline";
9656
- if (isPassing) {
9657
- passingCount++;
9658
- if (member.result.verdict === "borderline") {
9659
- borderlineCount++;
9660
- }
9661
- }
9662
- allHits.push(...member.result.hits.map((h) => `[${member.id}] ${h}`));
9663
- allMisses.push(...member.result.misses.map((m) => `[${member.id}] ${m}`));
9664
- if (member.result.reasoning) {
9665
- reasoningParts.push(`${member.id}: ${member.result.reasoning}`);
9666
- }
9667
9817
  scores.push({
9668
9818
  name: member.id,
9669
9819
  type: member.type,
@@ -9677,8 +9827,39 @@ var CompositeEvaluator = class {
9677
9827
  details: member.result.details,
9678
9828
  tokenUsage: member.result.tokenUsage
9679
9829
  });
9830
+ if (member.result.verdict === "skip") {
9831
+ continue;
9832
+ }
9833
+ evaluatedCount++;
9834
+ const isPassing = member.result.verdict === "pass" || member.result.verdict === "borderline";
9835
+ if (isPassing) {
9836
+ passingCount++;
9837
+ if (member.result.verdict === "borderline") {
9838
+ borderlineCount++;
9839
+ }
9840
+ }
9841
+ allHits.push(...member.result.hits.map((h) => `[${member.id}] ${h}`));
9842
+ allMisses.push(...member.result.misses.map((m) => `[${member.id}] ${m}`));
9843
+ if (member.result.reasoning) {
9844
+ reasoningParts.push(`${member.id}: ${member.result.reasoning}`);
9845
+ }
9846
+ }
9847
+ if (evaluatedCount === 0 && results.length > 0) {
9848
+ return {
9849
+ score: 0,
9850
+ verdict: "skip",
9851
+ hits: [],
9852
+ misses: [],
9853
+ expectedAspectCount: 1,
9854
+ reasoning: "All evaluators skipped (infrastructure failure)",
9855
+ evaluatorRawRequest: {
9856
+ aggregator: "threshold",
9857
+ threshold
9858
+ },
9859
+ scores
9860
+ };
9680
9861
  }
9681
- const totalCount = results.length;
9862
+ const totalCount = evaluatedCount;
9682
9863
  const score = totalCount > 0 ? passingCount / totalCount : 0;
9683
9864
  const pass = score >= threshold;
9684
9865
  if (pass && borderlineCount > 0) {
@@ -10186,115 +10367,115 @@ var FieldAccuracyEvaluator = class {
10186
10367
  * Evaluate a single field against the expected value.
10187
10368
  */
10188
10369
  evaluateField(fieldConfig, candidateData, expectedData) {
10189
- const { path: path41, match, required = true, weight = 1 } = fieldConfig;
10190
- const candidateValue = resolvePath(candidateData, path41);
10191
- const expectedValue = resolvePath(expectedData, path41);
10370
+ const { path: path42, match, required = true, weight = 1 } = fieldConfig;
10371
+ const candidateValue = resolvePath(candidateData, path42);
10372
+ const expectedValue = resolvePath(expectedData, path42);
10192
10373
  if (expectedValue === void 0) {
10193
10374
  return {
10194
- path: path41,
10375
+ path: path42,
10195
10376
  score: 1,
10196
10377
  // No expected value means no comparison needed
10197
10378
  weight,
10198
10379
  hit: true,
10199
- message: `${path41}: no expected value`
10380
+ message: `${path42}: no expected value`
10200
10381
  };
10201
10382
  }
10202
10383
  if (candidateValue === void 0) {
10203
10384
  if (required) {
10204
10385
  return {
10205
- path: path41,
10386
+ path: path42,
10206
10387
  score: 0,
10207
10388
  weight,
10208
10389
  hit: false,
10209
- message: `${path41} (required, missing)`
10390
+ message: `${path42} (required, missing)`
10210
10391
  };
10211
10392
  }
10212
10393
  return {
10213
- path: path41,
10394
+ path: path42,
10214
10395
  score: 1,
10215
10396
  // Don't penalize missing optional fields
10216
10397
  weight: 0,
10217
10398
  // Zero weight means it won't affect the score
10218
10399
  hit: true,
10219
- message: `${path41}: optional field missing`
10400
+ message: `${path42}: optional field missing`
10220
10401
  };
10221
10402
  }
10222
10403
  switch (match) {
10223
10404
  case "exact":
10224
- return this.compareExact(path41, candidateValue, expectedValue, weight);
10405
+ return this.compareExact(path42, candidateValue, expectedValue, weight);
10225
10406
  case "numeric_tolerance":
10226
10407
  return this.compareNumericTolerance(
10227
- path41,
10408
+ path42,
10228
10409
  candidateValue,
10229
10410
  expectedValue,
10230
10411
  fieldConfig,
10231
10412
  weight
10232
10413
  );
10233
10414
  case "date":
10234
- return this.compareDate(path41, candidateValue, expectedValue, fieldConfig, weight);
10415
+ return this.compareDate(path42, candidateValue, expectedValue, fieldConfig, weight);
10235
10416
  default:
10236
10417
  return {
10237
- path: path41,
10418
+ path: path42,
10238
10419
  score: 0,
10239
10420
  weight,
10240
10421
  hit: false,
10241
- message: `${path41}: unknown match type "${match}"`
10422
+ message: `${path42}: unknown match type "${match}"`
10242
10423
  };
10243
10424
  }
10244
10425
  }
10245
10426
  /**
10246
10427
  * Exact equality comparison.
10247
10428
  */
10248
- compareExact(path41, candidateValue, expectedValue, weight) {
10429
+ compareExact(path42, candidateValue, expectedValue, weight) {
10249
10430
  if (deepEqual(candidateValue, expectedValue)) {
10250
10431
  return {
10251
- path: path41,
10432
+ path: path42,
10252
10433
  score: 1,
10253
10434
  weight,
10254
10435
  hit: true,
10255
- message: path41
10436
+ message: path42
10256
10437
  };
10257
10438
  }
10258
10439
  if (typeof candidateValue !== typeof expectedValue) {
10259
10440
  return {
10260
- path: path41,
10441
+ path: path42,
10261
10442
  score: 0,
10262
10443
  weight,
10263
10444
  hit: false,
10264
- message: `${path41} (type mismatch: got ${typeof candidateValue}, expected ${typeof expectedValue})`
10445
+ message: `${path42} (type mismatch: got ${typeof candidateValue}, expected ${typeof expectedValue})`
10265
10446
  };
10266
10447
  }
10267
10448
  return {
10268
- path: path41,
10449
+ path: path42,
10269
10450
  score: 0,
10270
10451
  weight,
10271
10452
  hit: false,
10272
- message: `${path41} (value mismatch)`
10453
+ message: `${path42} (value mismatch)`
10273
10454
  };
10274
10455
  }
10275
10456
  /**
10276
10457
  * Numeric comparison with absolute or relative tolerance.
10277
10458
  */
10278
- compareNumericTolerance(path41, candidateValue, expectedValue, fieldConfig, weight) {
10459
+ compareNumericTolerance(path42, candidateValue, expectedValue, fieldConfig, weight) {
10279
10460
  const { tolerance = 0, relative = false } = fieldConfig;
10280
10461
  const candidateNum = toNumber2(candidateValue);
10281
10462
  const expectedNum = toNumber2(expectedValue);
10282
10463
  if (candidateNum === null || expectedNum === null) {
10283
10464
  return {
10284
- path: path41,
10465
+ path: path42,
10285
10466
  score: 0,
10286
10467
  weight,
10287
10468
  hit: false,
10288
- message: `${path41} (non-numeric value)`
10469
+ message: `${path42} (non-numeric value)`
10289
10470
  };
10290
10471
  }
10291
10472
  if (!Number.isFinite(candidateNum) || !Number.isFinite(expectedNum)) {
10292
10473
  return {
10293
- path: path41,
10474
+ path: path42,
10294
10475
  score: 0,
10295
10476
  weight,
10296
10477
  hit: false,
10297
- message: `${path41} (invalid numeric value)`
10478
+ message: `${path42} (invalid numeric value)`
10298
10479
  };
10299
10480
  }
10300
10481
  const diff = Math.abs(candidateNum - expectedNum);
@@ -10307,61 +10488,61 @@ var FieldAccuracyEvaluator = class {
10307
10488
  }
10308
10489
  if (withinTolerance) {
10309
10490
  return {
10310
- path: path41,
10491
+ path: path42,
10311
10492
  score: 1,
10312
10493
  weight,
10313
10494
  hit: true,
10314
- message: `${path41} (within tolerance: diff=${diff.toFixed(2)})`
10495
+ message: `${path42} (within tolerance: diff=${diff.toFixed(2)})`
10315
10496
  };
10316
10497
  }
10317
10498
  return {
10318
- path: path41,
10499
+ path: path42,
10319
10500
  score: 0,
10320
10501
  weight,
10321
10502
  hit: false,
10322
- message: `${path41} (outside tolerance: diff=${diff.toFixed(2)}, tolerance=${tolerance})`
10503
+ message: `${path42} (outside tolerance: diff=${diff.toFixed(2)}, tolerance=${tolerance})`
10323
10504
  };
10324
10505
  }
10325
10506
  /**
10326
10507
  * Date comparison with format normalization.
10327
10508
  */
10328
- compareDate(path41, candidateValue, expectedValue, fieldConfig, weight) {
10509
+ compareDate(path42, candidateValue, expectedValue, fieldConfig, weight) {
10329
10510
  const formats = fieldConfig.formats ?? DEFAULT_DATE_FORMATS;
10330
10511
  const candidateDate = parseDate(String(candidateValue), formats);
10331
10512
  const expectedDate = parseDate(String(expectedValue), formats);
10332
10513
  if (candidateDate === null) {
10333
10514
  return {
10334
- path: path41,
10515
+ path: path42,
10335
10516
  score: 0,
10336
10517
  weight,
10337
10518
  hit: false,
10338
- message: `${path41} (unparseable candidate date)`
10519
+ message: `${path42} (unparseable candidate date)`
10339
10520
  };
10340
10521
  }
10341
10522
  if (expectedDate === null) {
10342
10523
  return {
10343
- path: path41,
10524
+ path: path42,
10344
10525
  score: 0,
10345
10526
  weight,
10346
10527
  hit: false,
10347
- message: `${path41} (unparseable expected date)`
10528
+ message: `${path42} (unparseable expected date)`
10348
10529
  };
10349
10530
  }
10350
10531
  if (candidateDate.getFullYear() === expectedDate.getFullYear() && candidateDate.getMonth() === expectedDate.getMonth() && candidateDate.getDate() === expectedDate.getDate()) {
10351
10532
  return {
10352
- path: path41,
10533
+ path: path42,
10353
10534
  score: 1,
10354
10535
  weight,
10355
10536
  hit: true,
10356
- message: path41
10537
+ message: path42
10357
10538
  };
10358
10539
  }
10359
10540
  return {
10360
- path: path41,
10541
+ path: path42,
10361
10542
  score: 0,
10362
10543
  weight,
10363
10544
  hit: false,
10364
- message: `${path41} (date mismatch: got ${formatDateISO(candidateDate)}, expected ${formatDateISO(expectedDate)})`
10545
+ message: `${path42} (date mismatch: got ${formatDateISO(candidateDate)}, expected ${formatDateISO(expectedDate)})`
10365
10546
  };
10366
10547
  }
10367
10548
  /**
@@ -10402,11 +10583,11 @@ var FieldAccuracyEvaluator = class {
10402
10583
  };
10403
10584
  }
10404
10585
  };
10405
- function resolvePath(obj, path41) {
10406
- if (!path41 || !obj) {
10586
+ function resolvePath(obj, path42) {
10587
+ if (!path42 || !obj) {
10407
10588
  return void 0;
10408
10589
  }
10409
- const parts = path41.split(/\.|\[|\]/).filter((p) => p.length > 0);
10590
+ const parts = path42.split(/\.|\[|\]/).filter((p) => p.length > 0);
10410
10591
  let current = obj;
10411
10592
  for (const part of parts) {
10412
10593
  if (current === null || current === void 0) {
@@ -11224,8 +11405,8 @@ var TokenUsageEvaluator = class {
11224
11405
  };
11225
11406
 
11226
11407
  // src/evaluation/evaluators/tool-trajectory.ts
11227
- function getNestedValue(obj, path41) {
11228
- const parts = path41.split(".");
11408
+ function getNestedValue(obj, path42) {
11409
+ const parts = path42.split(".");
11229
11410
  let current = obj;
11230
11411
  for (const part of parts) {
11231
11412
  if (current === null || current === void 0 || typeof current !== "object") {
@@ -11788,7 +11969,7 @@ function runEqualsAssertion(output, value) {
11788
11969
  // src/evaluation/orchestrator.ts
11789
11970
  import { createHash as createHash2, randomUUID as randomUUID7 } from "node:crypto";
11790
11971
  import { mkdir as mkdir12, stat as stat7 } from "node:fs/promises";
11791
- import path38 from "node:path";
11972
+ import path39 from "node:path";
11792
11973
  import micromatch4 from "micromatch";
11793
11974
 
11794
11975
  // ../../node_modules/.bun/yocto-queue@1.2.2/node_modules/yocto-queue/index.js
@@ -12658,16 +12839,14 @@ async function cleanupEvalWorkspaces(evalRunId, workspaceRoot) {
12658
12839
  }
12659
12840
  }
12660
12841
 
12661
- // src/evaluation/workspace/repo-manager.ts
12842
+ // src/evaluation/workspace/pool-manager.ts
12662
12843
  import { execFile } from "node:child_process";
12663
12844
  import { createHash } from "node:crypto";
12664
12845
  import { existsSync as existsSync2 } from "node:fs";
12665
- import { mkdir as mkdir11, rm as rm5, unlink, writeFile as writeFile7 } from "node:fs/promises";
12846
+ import { cp as cp2, mkdir as mkdir11, readFile as readFile11, readdir as readdir4, rm as rm5, unlink, writeFile as writeFile7 } from "node:fs/promises";
12666
12847
  import path36 from "node:path";
12667
12848
  import { promisify as promisify5 } from "node:util";
12668
12849
  var execFileAsync = promisify5(execFile);
12669
- var DEFAULT_TIMEOUT_MS2 = 3e5;
12670
- var LOCK_TIMEOUT_MS = 6e4;
12671
12850
  function gitEnv() {
12672
12851
  const env = { ...process.env };
12673
12852
  for (const key of Object.keys(env)) {
@@ -12682,160 +12861,326 @@ function gitEnv() {
12682
12861
  GIT_SSH_COMMAND: "ssh -o BatchMode=yes"
12683
12862
  };
12684
12863
  }
12685
- function cacheKey(source) {
12686
- const raw = source.type === "git" ? source.url.toLowerCase().replace(/\.git$/, "") : source.path;
12687
- return createHash("sha256").update(raw).digest("hex");
12688
- }
12689
- function getSourceUrl(source) {
12690
- return source.type === "git" ? source.url : source.path;
12691
- }
12692
12864
  async function git(args, opts) {
12693
12865
  const { stdout } = await execFileAsync("git", args, {
12694
12866
  cwd: opts?.cwd,
12695
- timeout: opts?.timeout ?? DEFAULT_TIMEOUT_MS2,
12867
+ timeout: opts?.timeout ?? 3e5,
12696
12868
  env: gitEnv(),
12697
12869
  maxBuffer: 50 * 1024 * 1024
12698
- // 50MB
12699
12870
  });
12700
12871
  return stdout.trim();
12701
12872
  }
12702
- async function acquireLock(lockPath) {
12703
- const start = Date.now();
12704
- while (Date.now() - start < LOCK_TIMEOUT_MS) {
12705
- try {
12706
- await writeFile7(lockPath, String(process.pid), { flag: "wx" });
12707
- return;
12708
- } catch (err) {
12709
- if (err.code === "EEXIST") {
12710
- await new Promise((r) => setTimeout(r, 200));
12873
+ function normalizeRepoForFingerprint(repo) {
12874
+ const source = repo.source.type === "git" ? { type: "git", url: repo.source.url.toLowerCase().replace(/\.git$/, "") } : { type: "local", path: repo.source.path };
12875
+ const result = {
12876
+ path: repo.path,
12877
+ source,
12878
+ ref: repo.checkout?.ref ?? "HEAD"
12879
+ };
12880
+ if (repo.clone?.depth !== void 0) {
12881
+ result.depth = repo.clone.depth;
12882
+ }
12883
+ if (repo.clone?.filter !== void 0) {
12884
+ result.filter = repo.clone.filter;
12885
+ }
12886
+ if (repo.clone?.sparse?.length) {
12887
+ result.sparse = [...repo.clone.sparse].sort();
12888
+ }
12889
+ return result;
12890
+ }
12891
+ function computeWorkspaceFingerprint(templatePath, repos) {
12892
+ const canonical = {
12893
+ templatePath: templatePath ?? null,
12894
+ repos: [...repos].sort((a, b) => a.path.localeCompare(b.path)).map(normalizeRepoForFingerprint)
12895
+ };
12896
+ return createHash("sha256").update(JSON.stringify(canonical)).digest("hex");
12897
+ }
12898
+ async function copyDirectoryRecursive2(src, dest, skipDirs) {
12899
+ await mkdir11(dest, { recursive: true });
12900
+ const entries = await readdir4(src, { withFileTypes: true });
12901
+ for (const entry of entries) {
12902
+ const srcPath = path36.join(src, entry.name);
12903
+ const destPath = path36.join(dest, entry.name);
12904
+ if (entry.name === ".git") {
12905
+ continue;
12906
+ }
12907
+ if (entry.isDirectory()) {
12908
+ if (skipDirs?.has(entry.name)) {
12711
12909
  continue;
12712
12910
  }
12713
- throw err;
12911
+ await copyDirectoryRecursive2(srcPath, destPath, skipDirs);
12912
+ } else {
12913
+ await cp2(srcPath, destPath, { preserveTimestamps: true, force: true });
12714
12914
  }
12715
12915
  }
12716
- throw new Error(`Timed out waiting for lock: ${lockPath}`);
12717
12916
  }
12718
- async function releaseLock(lockPath) {
12719
- try {
12720
- await unlink(lockPath);
12721
- } catch {
12722
- }
12723
- }
12724
- var RepoManager = class {
12725
- cacheDir;
12726
- verbose;
12727
- constructor(cacheDir, verbose = false) {
12728
- this.cacheDir = cacheDir ?? getGitCacheRoot();
12729
- this.verbose = verbose;
12917
+ var WorkspacePoolManager = class {
12918
+ poolRoot;
12919
+ constructor(poolRoot) {
12920
+ this.poolRoot = poolRoot ?? getWorkspacePoolRoot();
12730
12921
  }
12731
- async runGit(args, opts) {
12732
- const startedAt = Date.now();
12733
- if (this.verbose) {
12734
- console.log(
12735
- `[repo] git start cwd=${opts?.cwd ?? process.cwd()} args=${args.join(" ")}`
12922
+ /**
12923
+ * Acquire a workspace slot from the pool.
12924
+ *
12925
+ * 1. Compute fingerprint from template + repos
12926
+ * 2. Check drift (compare stored metadata.json fingerprint vs computed)
12927
+ * 3. If drift: warn, remove all slots, rematerialize
12928
+ * 4. Acquire a slot (try-lock slot-0, slot-1, ..., up to maxSlots)
12929
+ * 5. If slot exists: reset repos, re-copy template files (skip repo directories)
12930
+ * 6. If new slot: copy template, materialize all repos, write metadata.json
12931
+ * 7. Return the slot (with path, index, isExisting)
12932
+ */
12933
+ async acquireWorkspace(options) {
12934
+ const { templatePath, repos, maxSlots, repoManager, poolReset } = options;
12935
+ const fingerprint = computeWorkspaceFingerprint(templatePath, repos);
12936
+ const poolDir = path36.join(this.poolRoot, fingerprint);
12937
+ await mkdir11(poolDir, { recursive: true });
12938
+ const drifted = await this.checkDrift(poolDir, fingerprint);
12939
+ if (drifted) {
12940
+ console.warn(
12941
+ `[workspace-pool] Drift detected for fingerprint ${fingerprint.slice(0, 12)}... Removing stale slots.`
12736
12942
  );
12943
+ await this.removeAllSlots(poolDir);
12737
12944
  }
12738
- try {
12739
- const output = await git(args, opts);
12740
- if (this.verbose) {
12741
- console.log(
12742
- `[repo] git ok durationMs=${Date.now() - startedAt} args=${args.join(" ")}`
12743
- );
12945
+ for (let i = 0; i < maxSlots; i++) {
12946
+ const slotPath = path36.join(poolDir, `slot-${i}`);
12947
+ const lockPath = `${slotPath}.lock`;
12948
+ const locked = await this.tryLock(lockPath);
12949
+ if (!locked) {
12950
+ continue;
12744
12951
  }
12745
- return output;
12746
- } catch (error) {
12747
- if (this.verbose) {
12748
- const message = error instanceof Error ? error.message : String(error);
12749
- console.log(
12750
- `[repo] git fail durationMs=${Date.now() - startedAt} args=${args.join(" ")} error=${message}`
12751
- );
12952
+ const slotExists = existsSync2(slotPath);
12953
+ if (slotExists) {
12954
+ await this.resetSlot(slotPath, templatePath, repos, poolReset);
12955
+ return {
12956
+ index: i,
12957
+ path: slotPath,
12958
+ isExisting: true,
12959
+ lockPath,
12960
+ fingerprint,
12961
+ poolDir
12962
+ };
12752
12963
  }
12753
- throw error;
12964
+ await mkdir11(slotPath, { recursive: true });
12965
+ if (templatePath) {
12966
+ await copyDirectoryRecursive2(templatePath, slotPath);
12967
+ }
12968
+ if (repos.length > 0) {
12969
+ await repoManager.materializeAll(repos, slotPath);
12970
+ }
12971
+ await this.writeMetadata(poolDir, fingerprint, templatePath ?? null, repos);
12972
+ return {
12973
+ index: i,
12974
+ path: slotPath,
12975
+ isExisting: false,
12976
+ lockPath,
12977
+ fingerprint,
12978
+ poolDir
12979
+ };
12980
+ }
12981
+ throw new Error(
12982
+ `All ${maxSlots} pool slots are locked for fingerprint ${fingerprint.slice(0, 12)}...`
12983
+ );
12984
+ }
12985
+ /** Remove lock file to release a slot. */
12986
+ async releaseSlot(slot) {
12987
+ try {
12988
+ await unlink(slot.lockPath);
12989
+ } catch {
12754
12990
  }
12755
12991
  }
12756
12992
  /**
12757
- * Ensure a bare mirror cache exists for the given source.
12758
- * Creates on first access, fetches updates on subsequent calls.
12759
- * Returns the absolute path to the cache directory.
12993
+ * Try to acquire a PID-based lock file.
12994
+ * On EEXIST, read the PID from the lock file and check whether that process is alive. If it is dead, remove the stale lock and retry.
12995
+ * Returns true if lock acquired, false if slot is actively locked.
12996
+ * Uses a bounded loop (max 3 attempts) to avoid unbounded recursion.
12760
12997
  */
12761
- async ensureCache(source, depth, resolve) {
12762
- const key = cacheKey(source);
12763
- const cachePath = path36.join(this.cacheDir, key);
12764
- const lockPath = `${cachePath}.lock`;
12765
- const cacheExists = existsSync2(path36.join(cachePath, "HEAD"));
12766
- if (this.verbose) {
12767
- console.log(
12768
- `[repo] ensureCache source=${getSourceUrl(source)} resolve=${resolve ?? "remote"} cache=${cachePath}`
12769
- );
12998
+ async tryLock(lockPath) {
12999
+ for (let attempt = 0; attempt < 3; attempt++) {
13000
+ try {
13001
+ await writeFile7(lockPath, String(process.pid), { flag: "wx" });
13002
+ return true;
13003
+ } catch (err) {
13004
+ if (err.code !== "EEXIST") {
13005
+ throw err;
13006
+ }
13007
+ try {
13008
+ const pidStr = await readFile11(lockPath, "utf-8");
13009
+ const pid = Number.parseInt(pidStr.trim(), 10);
13010
+ if (!Number.isNaN(pid)) {
13011
+ try {
13012
+ process.kill(pid, 0);
13013
+ return false;
13014
+ } catch {
13015
+ await unlink(lockPath).catch(() => {
13016
+ });
13017
+ continue;
13018
+ }
13019
+ }
13020
+ } catch {
13021
+ }
13022
+ return false;
13023
+ }
12770
13024
  }
12771
- if (resolve === "local") {
12772
- if (cacheExists) {
12773
- if (this.verbose) {
12774
- console.log(`[repo] using existing local cache ${cachePath}`);
13025
+ return false;
13026
+ }
13027
+ /**
13028
+ * Check if the stored fingerprint in metadata.json differs from the computed one.
13029
+ * Returns true if drifted, false otherwise.
13030
+ * Returns false (no drift) if metadata.json doesn't exist (first use).
13031
+ */
13032
+ async checkDrift(poolDir, fingerprint) {
13033
+ const metadataPath = path36.join(poolDir, "metadata.json");
13034
+ try {
13035
+ const raw = await readFile11(metadataPath, "utf-8");
13036
+ const metadata = JSON.parse(raw);
13037
+ return metadata.fingerprint !== fingerprint;
13038
+ } catch {
13039
+ return false;
13040
+ }
13041
+ }
13042
+ /** Write metadata.json with fingerprint, inputs, and timestamp. */
13043
+ async writeMetadata(poolDir, fingerprint, templatePath, repos) {
13044
+ const metadata = {
13045
+ fingerprint,
13046
+ templatePath,
13047
+ repos,
13048
+ createdAt: (/* @__PURE__ */ new Date()).toISOString()
13049
+ };
13050
+ await writeFile7(path36.join(poolDir, "metadata.json"), JSON.stringify(metadata, null, 2));
13051
+ }
13052
+ /** Remove all slot directories and their lock files from a pool directory. */
13053
+ async removeAllSlots(poolDir) {
13054
+ const entries = await readdir4(poolDir);
13055
+ for (const entry of entries) {
13056
+ if (entry.startsWith("slot-") && !entry.endsWith(".lock")) {
13057
+ const lockPath = path36.join(poolDir, `${entry}.lock`);
13058
+ if (existsSync2(lockPath)) {
13059
+ try {
13060
+ const pidStr = await readFile11(lockPath, "utf-8");
13061
+ const pid = Number.parseInt(pidStr.trim(), 10);
13062
+ if (!Number.isNaN(pid)) {
13063
+ try {
13064
+ process.kill(pid, 0);
13065
+ console.warn(`[workspace-pool] Skipping slot ${entry}: locked by PID ${pid}`);
13066
+ continue;
13067
+ } catch {
13068
+ }
13069
+ }
13070
+ } catch {
13071
+ }
12775
13072
  }
12776
- return cachePath;
13073
+ await rm5(path36.join(poolDir, entry), { recursive: true, force: true });
13074
+ await rm5(lockPath, { force: true }).catch(() => {
13075
+ });
12777
13076
  }
12778
- const url = getSourceUrl(source);
12779
- throw new Error(
12780
- `No cache found for \`${url}\`. Run \`agentv cache add --url ${url} --from <local-path>\` to seed it.`
13077
+ }
13078
+ await rm5(path36.join(poolDir, "metadata.json"), { force: true }).catch(() => {
13079
+ });
13080
+ }
13081
+ /**
13082
+ * Reset an existing slot for reuse:
13083
+ * 1. Reset repos (git reset --hard {ref} && git clean -fd per repo; -fdx when poolReset is "strict", skipped when poolReset is "none")
13084
+ * 2. Re-copy template files (skip repo directories)
13085
+ */
13086
+ async resetSlot(slotPath, templatePath, repos, poolReset = "fast") {
13087
+ for (const repo of repos) {
13088
+ const repoDir = path36.join(slotPath, repo.path);
13089
+ if (!existsSync2(repoDir)) {
13090
+ continue;
13091
+ }
13092
+ if (poolReset === "none") {
13093
+ continue;
13094
+ }
13095
+ const ref = repo.checkout?.ref ?? "HEAD";
13096
+ await git(["reset", "--hard", ref], { cwd: repoDir });
13097
+ const cleanFlag = poolReset === "strict" ? "-fdx" : "-fd";
13098
+ await git(["clean", cleanFlag], { cwd: repoDir });
13099
+ }
13100
+ if (templatePath) {
13101
+ const repoDirNames = new Set(
13102
+ repos.map((r) => {
13103
+ const normalized = r.path.replace(/^\.\//, "");
13104
+ return normalized.split("/")[0];
13105
+ })
12781
13106
  );
13107
+ await copyDirectoryRecursive2(templatePath, slotPath, repoDirNames);
12782
13108
  }
12783
- await mkdir11(this.cacheDir, { recursive: true });
12784
- const lockStartedAt = Date.now();
12785
- await acquireLock(lockPath);
13109
+ }
13110
+ };
13111
+
13112
+ // src/evaluation/workspace/repo-manager.ts
13113
+ import { execFile as execFile2 } from "node:child_process";
13114
+ import path37 from "node:path";
13115
+ import { promisify as promisify6 } from "node:util";
13116
+ var execFileAsync2 = promisify6(execFile2);
13117
+ var DEFAULT_TIMEOUT_MS2 = 3e5;
13118
+ function gitEnv2() {
13119
+ const env = { ...process.env };
13120
+ for (const key of Object.keys(env)) {
13121
+ if (key.startsWith("GIT_") && key !== "GIT_SSH_COMMAND") {
13122
+ delete env[key];
13123
+ }
13124
+ }
13125
+ return {
13126
+ ...env,
13127
+ GIT_TERMINAL_PROMPT: "0",
13128
+ GIT_ASKPASS: "",
13129
+ GIT_SSH_COMMAND: "ssh -o BatchMode=yes"
13130
+ };
13131
+ }
13132
+ function getSourceUrl(source) {
13133
+ return source.type === "git" ? source.url : source.path;
13134
+ }
13135
+ async function git2(args, opts) {
13136
+ const { stdout } = await execFileAsync2("git", args, {
13137
+ cwd: opts?.cwd,
13138
+ timeout: opts?.timeout ?? DEFAULT_TIMEOUT_MS2,
13139
+ env: gitEnv2(),
13140
+ maxBuffer: 50 * 1024 * 1024
13141
+ // 50MB
13142
+ });
13143
+ return stdout.trim();
13144
+ }
13145
+ var RepoManager = class {
13146
+ verbose;
13147
+ constructor(verbose = false) {
13148
+ this.verbose = verbose;
13149
+ }
13150
+ async runGit(args, opts) {
13151
+ const startedAt = Date.now();
12786
13152
  if (this.verbose) {
12787
- console.log(
12788
- `[repo] lock acquired path=${lockPath} waitedMs=${Date.now() - lockStartedAt}`
12789
- );
13153
+ console.log(`[repo] git start cwd=${opts?.cwd ?? process.cwd()} args=${args.join(" ")}`);
12790
13154
  }
12791
13155
  try {
12792
- if (cacheExists) {
12793
- if (this.verbose) {
12794
- console.log(`[repo] refreshing existing cache ${cachePath}`);
12795
- }
12796
- const fetchArgs = ["fetch", "--prune"];
12797
- if (depth) {
12798
- fetchArgs.push("--depth", String(depth));
12799
- }
12800
- await this.runGit(fetchArgs, { cwd: cachePath });
12801
- } else {
12802
- if (this.verbose) {
12803
- console.log(`[repo] creating new cache ${cachePath}`);
12804
- }
12805
- const cloneArgs = ["clone", "--mirror", "--bare"];
12806
- if (depth) {
12807
- cloneArgs.push("--depth", String(depth));
12808
- }
12809
- const sourceUrl = getSourceUrl(source);
12810
- const cloneUrl = depth && source.type === "local" ? `file://${sourceUrl}` : sourceUrl;
12811
- cloneArgs.push(cloneUrl, cachePath);
12812
- await this.runGit(cloneArgs);
13156
+ const output = await git2(args, opts);
13157
+ if (this.verbose) {
13158
+ console.log(`[repo] git ok durationMs=${Date.now() - startedAt} args=${args.join(" ")}`);
12813
13159
  }
12814
- } finally {
12815
- await releaseLock(lockPath);
13160
+ return output;
13161
+ } catch (error) {
12816
13162
  if (this.verbose) {
12817
- console.log(`[repo] lock released path=${lockPath}`);
13163
+ const message = error instanceof Error ? error.message : String(error);
13164
+ console.log(
13165
+ `[repo] git fail durationMs=${Date.now() - startedAt} args=${args.join(" ")} error=${message}`
13166
+ );
12818
13167
  }
13168
+ throw error;
12819
13169
  }
12820
- return cachePath;
12821
13170
  }
12822
13171
  /**
12823
- * Clone a repo from cache into the workspace at the configured path.
13172
+ * Clone a repo directly from source into the workspace at the configured path.
12824
13173
  * Handles checkout, ref resolution, ancestor walking, shallow clone, sparse checkout.
12825
13174
  */
12826
13175
  async materialize(repo, workspacePath) {
12827
- const targetDir = path36.join(workspacePath, repo.path);
13176
+ const targetDir = path37.join(workspacePath, repo.path);
13177
+ const sourceUrl = getSourceUrl(repo.source);
12828
13178
  const startedAt = Date.now();
12829
13179
  if (this.verbose) {
12830
13180
  console.log(
12831
- `[repo] materialize start path=${repo.path} source=${getSourceUrl(repo.source)} workspace=${workspacePath}`
13181
+ `[repo] materialize start path=${repo.path} source=${sourceUrl} workspace=${workspacePath}`
12832
13182
  );
12833
13183
  }
12834
- const cachePath = await this.ensureCache(
12835
- repo.source,
12836
- repo.clone?.depth,
12837
- repo.checkout?.resolve
12838
- );
12839
13184
  const cloneArgs = ["clone"];
12840
13185
  if (repo.clone?.depth) {
12841
13186
  cloneArgs.push("--depth", String(repo.clone.depth));
@@ -12844,7 +13189,7 @@ var RepoManager = class {
12844
13189
  cloneArgs.push("--filter", repo.clone.filter);
12845
13190
  }
12846
13191
  cloneArgs.push("--no-checkout");
12847
- const cloneUrl = repo.clone?.depth || repo.clone?.filter ? `file://${cachePath}` : cachePath;
13192
+ const cloneUrl = (repo.clone?.depth || repo.clone?.filter) && repo.source.type === "local" ? `file://${sourceUrl}` : sourceUrl;
12848
13193
  cloneArgs.push(cloneUrl, targetDir);
12849
13194
  await this.runGit(cloneArgs);
12850
13195
  if (repo.clone?.sparse?.length) {
@@ -12916,85 +13261,47 @@ var RepoManager = class {
12916
13261
  }
12917
13262
  }
12918
13263
  /** Reset repos in workspace to their checkout state. */
12919
- async reset(repos, workspacePath, strategy) {
12920
- if (strategy === "recreate") {
12921
- for (const repo of repos) {
12922
- const targetDir = path36.join(workspacePath, repo.path);
12923
- await rm5(targetDir, { recursive: true, force: true });
12924
- }
12925
- await this.materializeAll(repos, workspacePath);
12926
- return;
12927
- }
13264
+ async reset(repos, workspacePath, reset) {
13265
+ const cleanFlag = reset === "strict" ? "-fdx" : "-fd";
12928
13266
  for (const repo of repos) {
12929
- const targetDir = path36.join(workspacePath, repo.path);
13267
+ const targetDir = path37.join(workspacePath, repo.path);
12930
13268
  await this.runGit(["reset", "--hard", "HEAD"], { cwd: targetDir });
12931
- await this.runGit(["clean", "-fd"], { cwd: targetDir });
13269
+ await this.runGit(["clean", cleanFlag], { cwd: targetDir });
12932
13270
  }
12933
13271
  }
12934
- /**
12935
- * Seed the cache from a local repository, setting the remote to a given URL.
12936
- * Useful for avoiding slow network clones when a local clone already exists.
12937
- */
12938
- async seedCache(localPath, remoteUrl, opts) {
12939
- const source = { type: "git", url: remoteUrl };
12940
- const key = cacheKey(source);
12941
- const cachePath = path36.join(this.cacheDir, key);
12942
- const lockPath = `${cachePath}.lock`;
12943
- await mkdir11(this.cacheDir, { recursive: true });
12944
- await acquireLock(lockPath);
12945
- try {
12946
- if (existsSync2(path36.join(cachePath, "HEAD"))) {
12947
- if (!opts?.force) {
12948
- throw new Error(
12949
- `Cache already exists for ${remoteUrl} at ${cachePath}. Use force to overwrite.`
12950
- );
12951
- }
12952
- await rm5(cachePath, { recursive: true, force: true });
12953
- }
12954
- await git(["clone", "--mirror", "--bare", localPath, cachePath]);
12955
- await git(["remote", "set-url", "origin", remoteUrl], { cwd: cachePath });
12956
- } finally {
12957
- await releaseLock(lockPath);
12958
- }
12959
- return cachePath;
12960
- }
12961
- /** Remove the entire cache directory. */
12962
- async cleanCache() {
12963
- await rm5(this.cacheDir, { recursive: true, force: true });
12964
- }
12965
13272
  };
12966
13273
 
12967
13274
  // src/evaluation/workspace/resolve.ts
12968
- import { readdir as readdir4, stat as stat6 } from "node:fs/promises";
12969
- import path37 from "node:path";
13275
+ import { readdir as readdir5, stat as stat6 } from "node:fs/promises";
13276
+ import path38 from "node:path";
12970
13277
  async function resolveWorkspaceTemplate(templatePath) {
12971
13278
  if (!templatePath) {
12972
13279
  return void 0;
12973
13280
  }
12974
- const resolved = path37.resolve(templatePath);
13281
+ const resolved = path38.resolve(templatePath);
12975
13282
  const stats = await stat6(resolved);
12976
13283
  if (stats.isFile()) {
12977
13284
  return {
12978
- dir: path37.dirname(resolved),
13285
+ dir: path38.dirname(resolved),
12979
13286
  workspaceFile: resolved
12980
13287
  };
12981
13288
  }
12982
13289
  if (!stats.isDirectory()) {
12983
13290
  throw new Error(`workspace template is neither a file nor a directory: ${resolved}`);
12984
13291
  }
12985
- const entries = await readdir4(resolved);
13292
+ const entries = await readdir5(resolved);
12986
13293
  const workspaceFiles = entries.filter((e) => e.endsWith(".code-workspace"));
12987
13294
  if (workspaceFiles.length === 1) {
12988
13295
  return {
12989
13296
  dir: resolved,
12990
- workspaceFile: path37.join(resolved, workspaceFiles[0])
13297
+ workspaceFile: path38.join(resolved, workspaceFiles[0])
12991
13298
  };
12992
13299
  }
12993
13300
  if (workspaceFiles.length > 1) {
12994
13301
  const conventionFile = workspaceFiles.find((f) => f === "template.code-workspace");
12995
13302
  return {
12996
13303
  dir: resolved,
12997
- workspaceFile: conventionFile ? path37.join(resolved, conventionFile) : void 0
13304
+ workspaceFile: conventionFile ? path38.join(resolved, conventionFile) : void 0
12998
13305
  };
12999
13306
  }
13000
13307
  return { dir: resolved };
@@ -13046,6 +13353,22 @@ function classifyQualityStatus(score) {
13046
13353
  function usesFileReferencePrompt(provider) {
13047
13354
  return isAgentProvider(provider) || provider.kind === "cli";
13048
13355
  }
13356
+ function toScriptConfig(hook, hookName, context) {
13357
+ const command = hook.command ?? hook.script;
13358
+ if (!command || command.length === 0) {
13359
+ throw new Error(`${hookName} hook in ${context} requires command or script`);
13360
+ }
13361
+ return {
13362
+ command,
13363
+ ...hook.timeout_ms !== void 0 && { timeout_ms: hook.timeout_ms },
13364
+ ...hook.timeoutMs !== void 0 && { timeoutMs: hook.timeoutMs },
13365
+ ...hook.cwd !== void 0 && { cwd: hook.cwd },
13366
+ ...hook.script !== void 0 && { script: hook.script }
13367
+ };
13368
+ }
13369
+ function hasHookCommand(hook) {
13370
+ return !!(hook?.command && hook.command.length > 0 || hook?.script && hook.script.length > 0);
13371
+ }
13049
13372
  function getWorkspaceTemplate(target) {
13050
13373
  const config = target.config;
13051
13374
  if ("workspaceTemplate" in config && typeof config.workspaceTemplate === "string") {
@@ -13076,7 +13399,15 @@ async function runEvaluation(options) {
13076
13399
  trials,
13077
13400
  streamCallbacks,
13078
13401
  totalBudgetUsd,
13079
- failOnError
13402
+ failOnError,
13403
+ poolWorkspaces,
13404
+ poolMaxSlots: configPoolMaxSlots,
13405
+ workspace: legacyWorkspacePath,
13406
+ workspaceMode,
13407
+ workspacePath,
13408
+ workspaceClean,
13409
+ retainOnSuccess,
13410
+ retainOnFailure
13080
13411
  } = options;
13081
13412
  let useCache = options.useCache;
13082
13413
  if (trials && trials.count > 1 && useCache) {
@@ -13150,7 +13481,7 @@ async function runEvaluation(options) {
13150
13481
  ];
13151
13482
  const evaluatorRegistry = buildEvaluatorRegistry(evaluators, resolveJudgeProvider);
13152
13483
  const typeRegistry = createBuiltinRegistry();
13153
- const discoveryBaseDir = evalFilePath ? path38.dirname(path38.resolve(evalFilePath)) : process.cwd();
13484
+ const discoveryBaseDir = evalFilePath ? path39.dirname(path39.resolve(evalFilePath)) : process.cwd();
13154
13485
  const evalDir = discoveryBaseDir;
13155
13486
  await discoverAssertions(typeRegistry, discoveryBaseDir);
13156
13487
  const providerRegistry = createBuiltinProviderRegistry();
@@ -13212,13 +13543,29 @@ async function runEvaluation(options) {
13212
13543
  }
13213
13544
  };
13214
13545
  const isPerTestIsolation = suiteWorkspace?.isolation === "per_test";
13215
- const hasSharedWorkspace = !!(workspaceTemplate || suiteWorkspace?.before_all || suiteWorkspace?.repos?.length && !isPerTestIsolation);
13546
+ const configuredMode = suiteWorkspace?.mode ?? workspaceMode;
13547
+ const configuredStaticPath = suiteWorkspace?.static_path ?? workspacePath ?? legacyWorkspacePath;
13548
+ const useStaticWorkspace = configuredMode === "static" || !!configuredStaticPath && !configuredMode;
13549
+ if (useStaticWorkspace && isPerTestIsolation) {
13550
+ throw new Error(
13551
+ "static workspace mode is incompatible with isolation: per_test. Use isolation: shared (default)."
13552
+ );
13553
+ }
13554
+ if (configuredMode === "static" && !configuredStaticPath) {
13555
+ throw new Error("workspace.mode=static requires workspace.static_path or --workspace-path");
13556
+ }
13557
+ const hasSharedWorkspace = !!(useStaticWorkspace || workspaceTemplate || suiteWorkspace?.hooks || suiteWorkspace?.repos?.length && !isPerTestIsolation);
13558
+ const poolEnabled = configuredMode === "pooled" ? true : configuredMode === "ephemeral" || useStaticWorkspace ? false : suiteWorkspace?.pool ?? poolWorkspaces ?? true;
13559
+ const usePool = poolEnabled !== false && !!suiteWorkspace?.repos?.length && !isPerTestIsolation && !useStaticWorkspace;
13560
+ const finishCleanPolicy = suiteWorkspace?.hooks?.on_finish?.clean;
13561
+ const resolvedRetainOnSuccess = (finishCleanPolicy === "always" || finishCleanPolicy === "on_success" ? "cleanup" : finishCleanPolicy === "on_failure" || finishCleanPolicy === "never" ? "keep" : void 0) ?? retainOnSuccess ?? (keepWorkspaces ? "keep" : "cleanup");
13562
+ const resolvedRetainOnFailure = (finishCleanPolicy === "always" || finishCleanPolicy === "on_failure" ? "cleanup" : finishCleanPolicy === "on_success" || finishCleanPolicy === "never" ? "keep" : void 0) ?? retainOnFailure ?? (cleanupWorkspaces ? "cleanup" : "keep");
13216
13563
  const requestedWorkers = options.maxConcurrency ?? target.workers ?? 1;
13217
- const workers = hasSharedWorkspace ? 1 : requestedWorkers;
13564
+ const workers = hasSharedWorkspace && !usePool ? 1 : requestedWorkers;
13218
13565
  setupLog(
13219
- `sharedWorkspace=${hasSharedWorkspace} perTestIsolation=${isPerTestIsolation} requestedWorkers=${requestedWorkers} effectiveWorkers=${workers}`
13566
+ `sharedWorkspace=${hasSharedWorkspace} perTestIsolation=${isPerTestIsolation} usePool=${usePool} requestedWorkers=${requestedWorkers} effectiveWorkers=${workers}`
13220
13567
  );
13221
- if (hasSharedWorkspace && requestedWorkers > 1) {
13568
+ if (hasSharedWorkspace && !usePool && requestedWorkers > 1) {
13222
13569
  console.warn(
13223
13570
  `Warning: Shared workspace requires sequential execution. Overriding workers from ${requestedWorkers} to 1.`
13224
13571
  );
@@ -13227,7 +13574,38 @@ async function runEvaluation(options) {
13227
13574
  let sharedWorkspacePath;
13228
13575
  let sharedBaselineCommit;
13229
13576
  let beforeAllOutput;
13230
- if (workspaceTemplate) {
13577
+ let poolManager;
13578
+ let poolSlot;
13579
+ const poolSlots = [];
13580
+ const availablePoolSlots = [];
13581
+ const poolSlotBaselines = /* @__PURE__ */ new Map();
13582
+ const poolMaxSlots = Math.min(configPoolMaxSlots ?? 10, 50);
13583
+ if (useStaticWorkspace && configuredStaticPath) {
13584
+ sharedWorkspacePath = configuredStaticPath;
13585
+ setupLog(`using static workspace: ${configuredStaticPath}`);
13586
+ } else if (usePool && suiteWorkspace?.repos) {
13587
+ const slotsNeeded = workers;
13588
+ setupLog(`acquiring ${slotsNeeded} workspace pool slot(s) (pool capacity: ${poolMaxSlots})`);
13589
+ poolManager = new WorkspacePoolManager(getWorkspacePoolRoot());
13590
+ const poolRepoManager = new RepoManager(verbose);
13591
+ for (let i = 0; i < slotsNeeded; i++) {
13592
+ const slot = await poolManager.acquireWorkspace({
13593
+ templatePath: workspaceTemplate,
13594
+ repos: suiteWorkspace.repos,
13595
+ maxSlots: poolMaxSlots,
13596
+ repoManager: poolRepoManager,
13597
+ poolReset: (workspaceClean === "full" ? "strict" : workspaceClean === "standard" ? "fast" : null) ?? suiteWorkspace.hooks?.on_reuse?.reset ?? "fast"
13598
+ });
13599
+ poolSlots.push(slot);
13600
+ setupLog(`pool slot ${i} acquired at: ${slot.path} (existing=${slot.isExisting})`);
13601
+ }
13602
+ if (slotsNeeded === 1) {
13603
+ poolSlot = poolSlots[0];
13604
+ sharedWorkspacePath = poolSlot.path;
13605
+ } else {
13606
+ availablePoolSlots.push(...poolSlots);
13607
+ }
13608
+ } else if (workspaceTemplate) {
13231
13609
  setupLog(`creating shared workspace from template: ${workspaceTemplate}`);
13232
13610
  try {
13233
13611
  sharedWorkspacePath = await createTempWorkspace(workspaceTemplate, evalRunId, "shared");
@@ -13236,288 +13614,359 @@ async function runEvaluation(options) {
13236
13614
  const message = error instanceof Error ? error.message : String(error);
13237
13615
  throw new Error(`Failed to create shared workspace: ${message}`);
13238
13616
  }
13617
+ } else if (suiteWorkspace?.hooks || suiteWorkspace?.repos?.length && !isPerTestIsolation) {
13618
+ sharedWorkspacePath = getWorkspacePath(evalRunId, "shared");
13619
+ await mkdir12(sharedWorkspacePath, { recursive: true });
13620
+ setupLog(`created empty shared workspace at: ${sharedWorkspacePath}`);
13621
+ }
13622
+ try {
13239
13623
  if (suiteWorkspaceFile && sharedWorkspacePath) {
13240
- const copiedWorkspaceFile = path38.join(sharedWorkspacePath, path38.basename(suiteWorkspaceFile));
13624
+ const copiedWorkspaceFile = path39.join(sharedWorkspacePath, path39.basename(suiteWorkspaceFile));
13241
13625
  try {
13242
13626
  await stat7(copiedWorkspaceFile);
13243
13627
  suiteWorkspaceFile = copiedWorkspaceFile;
13244
13628
  } catch {
13245
13629
  }
13246
13630
  }
13247
- } else if (suiteWorkspace?.before_all || suiteWorkspace?.repos?.length && !isPerTestIsolation) {
13248
- sharedWorkspacePath = getWorkspacePath(evalRunId, "shared");
13249
- await mkdir12(sharedWorkspacePath, { recursive: true });
13250
- setupLog(`created empty shared workspace at: ${sharedWorkspacePath}`);
13251
- }
13252
- const repoManager = suiteWorkspace?.repos?.length ? new RepoManager(void 0, verbose) : void 0;
13253
- if (repoManager && sharedWorkspacePath && suiteWorkspace?.repos && !isPerTestIsolation) {
13254
- setupLog(`materializing ${suiteWorkspace.repos.length} shared repo(s) into ${sharedWorkspacePath}`);
13255
- try {
13256
- await repoManager.materializeAll(suiteWorkspace.repos, sharedWorkspacePath);
13257
- setupLog("shared repo materialization complete");
13258
- } catch (error) {
13259
- const message = error instanceof Error ? error.message : String(error);
13260
- if (sharedWorkspacePath) {
13261
- await cleanupWorkspace(sharedWorkspacePath).catch(() => {
13262
- });
13263
- }
13264
- throw new Error(`Failed to materialize repos: ${message}`);
13265
- }
13266
- }
13267
- if (sharedWorkspacePath && suiteWorkspace?.before_all) {
13268
- const beforeAllCommand = (suiteWorkspace.before_all.command ?? suiteWorkspace.before_all.script ?? []).join(" ");
13269
- setupLog(
13270
- `running shared before_all in cwd=${suiteWorkspace.before_all.cwd ?? evalDir} command=${beforeAllCommand}`
13271
- );
13272
- const scriptContext = {
13273
- workspacePath: sharedWorkspacePath,
13274
- testId: "__before_all__",
13275
- evalRunId,
13276
- evalDir
13277
- };
13278
- try {
13279
- beforeAllOutput = await executeWorkspaceScript(suiteWorkspace.before_all, scriptContext);
13280
- setupLog("shared before_all completed");
13281
- } catch (error) {
13282
- const message = error instanceof Error ? error.message : String(error);
13283
- if (sharedWorkspacePath) {
13284
- await cleanupWorkspace(sharedWorkspacePath).catch(() => {
13285
- });
13286
- }
13287
- throw new Error(`before_all script failed: ${message}`);
13288
- }
13289
- }
13290
- if (sharedWorkspacePath) {
13291
- try {
13292
- sharedBaselineCommit = await initializeBaseline(sharedWorkspacePath);
13293
- setupLog(`shared baseline initialized: ${sharedBaselineCommit}`);
13294
- } catch {
13295
- setupLog("shared baseline initialization skipped (non-fatal)");
13296
- }
13297
- }
13298
- let nextWorkerId = 1;
13299
- const workerIdByEvalId = /* @__PURE__ */ new Map();
13300
- let beforeAllOutputAttached = false;
13301
- let cumulativeBudgetCost = 0;
13302
- let budgetExhausted = false;
13303
- let failOnErrorTriggered = false;
13304
- const promises = filteredEvalCases.map(
13305
- (evalCase) => limit(async () => {
13306
- const workerId = nextWorkerId++;
13307
- workerIdByEvalId.set(evalCase.id, workerId);
13308
- if (totalBudgetUsd !== void 0 && budgetExhausted) {
13309
- const budgetResult = {
13310
- timestamp: (now ?? (() => /* @__PURE__ */ new Date()))().toISOString(),
13311
- testId: evalCase.id,
13312
- dataset: evalCase.dataset,
13313
- score: 0,
13314
- hits: [],
13315
- misses: [],
13316
- answer: "",
13317
- target: target.name,
13318
- error: `Suite budget exceeded ($${cumulativeBudgetCost.toFixed(4)} / $${totalBudgetUsd.toFixed(4)})`,
13319
- budgetExceeded: true,
13320
- executionStatus: "execution_error",
13321
- failureStage: "setup",
13322
- failureReasonCode: "budget_exceeded",
13323
- executionError: {
13324
- message: `Suite budget exceeded ($${cumulativeBudgetCost.toFixed(4)} / $${totalBudgetUsd.toFixed(4)})`,
13325
- stage: "setup"
13326
- }
13327
- };
13328
- if (onProgress) {
13329
- await onProgress({
13330
- workerId,
13331
- testId: evalCase.id,
13332
- status: "failed",
13333
- completedAt: Date.now(),
13334
- error: budgetResult.error
13631
+ const repoManager = suiteWorkspace?.repos?.length && !usePool && !useStaticWorkspace ? new RepoManager(verbose) : void 0;
13632
+ if (repoManager && sharedWorkspacePath && suiteWorkspace?.repos && !isPerTestIsolation) {
13633
+ setupLog(
13634
+ `materializing ${suiteWorkspace.repos.length} shared repo(s) into ${sharedWorkspacePath}`
13635
+ );
13636
+ try {
13637
+ await repoManager.materializeAll(suiteWorkspace.repos, sharedWorkspacePath);
13638
+ setupLog("shared repo materialization complete");
13639
+ } catch (error) {
13640
+ const message = error instanceof Error ? error.message : String(error);
13641
+ if (sharedWorkspacePath && !useStaticWorkspace) {
13642
+ await cleanupWorkspace(sharedWorkspacePath).catch(() => {
13335
13643
  });
13336
13644
  }
13337
- if (onResult) {
13338
- await onResult(budgetResult);
13339
- }
13340
- return budgetResult;
13645
+ throw new Error(`Failed to materialize repos: ${message}`);
13341
13646
  }
13342
- if (failOnError === true && failOnErrorTriggered) {
13343
- const errorMsg = "Halted: execution error encountered with fail_on_error enabled";
13344
- const haltResult = {
13345
- timestamp: (now ?? (() => /* @__PURE__ */ new Date()))().toISOString(),
13346
- testId: evalCase.id,
13347
- dataset: evalCase.dataset,
13348
- score: 0,
13349
- hits: [],
13350
- misses: [],
13351
- answer: "",
13352
- target: target.name,
13353
- error: errorMsg,
13354
- executionStatus: "execution_error",
13355
- failureStage: "setup",
13356
- failureReasonCode: "error_threshold_exceeded",
13357
- executionError: { message: errorMsg, stage: "setup" }
13358
- };
13359
- if (onProgress) {
13360
- await onProgress({
13361
- workerId,
13362
- testId: evalCase.id,
13363
- status: "failed",
13364
- completedAt: Date.now(),
13365
- error: haltResult.error
13647
+ }
13648
+ const suiteBeforeAllHook = suiteWorkspace?.hooks?.before_all_tests;
13649
+ if (sharedWorkspacePath && hasHookCommand(suiteBeforeAllHook)) {
13650
+ const beforeAllHook = suiteBeforeAllHook;
13651
+ const beforeAllCommand = (beforeAllHook.command ?? beforeAllHook.script ?? []).join(" ");
13652
+ setupLog(
13653
+ `running shared before_all in cwd=${beforeAllHook.cwd ?? evalDir} command=${beforeAllCommand}`
13654
+ );
13655
+ const scriptContext = {
13656
+ workspacePath: sharedWorkspacePath,
13657
+ testId: "__before_all__",
13658
+ evalRunId,
13659
+ evalDir
13660
+ };
13661
+ try {
13662
+ beforeAllOutput = await executeWorkspaceScript(
13663
+ toScriptConfig(beforeAllHook, "before_all_tests", "suite workspace"),
13664
+ scriptContext
13665
+ );
13666
+ setupLog("shared before_all completed");
13667
+ } catch (error) {
13668
+ const message = error instanceof Error ? error.message : String(error);
13669
+ if (sharedWorkspacePath && !useStaticWorkspace) {
13670
+ await cleanupWorkspace(sharedWorkspacePath).catch(() => {
13366
13671
  });
13367
13672
  }
13368
- if (onResult) {
13369
- await onResult(haltResult);
13370
- }
13371
- return haltResult;
13372
- }
13373
- if (onProgress) {
13374
- await onProgress({
13375
- workerId,
13376
- testId: evalCase.id,
13377
- status: "running",
13378
- startedAt: Date.now()
13379
- });
13673
+ throw new Error(`before_all script failed: ${message}`);
13380
13674
  }
13381
- try {
13382
- const judgeProvider = await resolveJudgeProvider(target);
13383
- const runCaseOptions = {
13384
- evalCase,
13385
- provider: primaryProvider,
13386
- target,
13387
- evaluators: evaluatorRegistry,
13388
- maxRetries,
13389
- agentTimeoutMs,
13390
- cache,
13391
- useCache,
13392
- now,
13393
- judgeProvider,
13394
- targetResolver,
13395
- availableTargets,
13675
+ }
13676
+ if (availablePoolSlots.length > 0 && hasHookCommand(suiteBeforeAllHook)) {
13677
+ const beforeAllHook = suiteBeforeAllHook;
13678
+ for (const slot of availablePoolSlots) {
13679
+ setupLog(`running before_all on pool slot ${slot.index}`);
13680
+ const scriptContext = {
13681
+ workspacePath: slot.path,
13682
+ testId: "__before_all__",
13396
13683
  evalRunId,
13397
- keepWorkspaces,
13398
- cleanupWorkspaces,
13399
- sharedWorkspacePath,
13400
- sharedBaselineCommit,
13401
- suiteWorkspaceFile,
13402
- streamCallbacks,
13403
- typeRegistry,
13404
- repoManager,
13405
13684
  evalDir
13406
13685
  };
13407
- let result = trials && trials.count > 1 ? await runEvalCaseWithTrials(runCaseOptions, trials) : await runEvalCase(runCaseOptions);
13408
- if (totalBudgetUsd !== void 0) {
13409
- let caseCost;
13410
- if (result.trials && result.trials.length > 0) {
13411
- const trialCostSum = result.trials.reduce((sum, t) => sum + (t.costUsd ?? 0), 0);
13412
- if (trialCostSum > 0) {
13413
- caseCost = trialCostSum;
13686
+ try {
13687
+ const output = await executeWorkspaceScript(
13688
+ toScriptConfig(beforeAllHook, "before_all_tests", "suite workspace"),
13689
+ scriptContext
13690
+ );
13691
+ if (!beforeAllOutput) beforeAllOutput = output;
13692
+ setupLog(`before_all completed on pool slot ${slot.index}`);
13693
+ } catch (error) {
13694
+ const message = error instanceof Error ? error.message : String(error);
13695
+ throw new Error(`before_all script failed on pool slot ${slot.index}: ${message}`);
13696
+ }
13697
+ }
13698
+ }
13699
+ if (sharedWorkspacePath) {
13700
+ try {
13701
+ sharedBaselineCommit = await initializeBaseline(sharedWorkspacePath);
13702
+ setupLog(`shared baseline initialized: ${sharedBaselineCommit}`);
13703
+ } catch {
13704
+ setupLog("shared baseline initialization skipped (non-fatal)");
13705
+ }
13706
+ }
13707
+ if (availablePoolSlots.length > 0) {
13708
+ for (const slot of availablePoolSlots) {
13709
+ try {
13710
+ const baseline = await initializeBaseline(slot.path);
13711
+ poolSlotBaselines.set(slot.path, baseline);
13712
+ setupLog(`pool slot ${slot.index} baseline initialized: ${baseline}`);
13713
+ } catch {
13714
+ setupLog(`pool slot ${slot.index} baseline initialization skipped (non-fatal)`);
13715
+ }
13716
+ }
13717
+ }
13718
+ let nextWorkerId = 1;
13719
+ const workerIdByEvalId = /* @__PURE__ */ new Map();
13720
+ let beforeAllOutputAttached = false;
13721
+ let cumulativeBudgetCost = 0;
13722
+ let budgetExhausted = false;
13723
+ let failOnErrorTriggered = false;
13724
+ const promises = filteredEvalCases.map(
13725
+ (evalCase) => limit(async () => {
13726
+ const workerId = nextWorkerId++;
13727
+ workerIdByEvalId.set(evalCase.id, workerId);
13728
+ if (totalBudgetUsd !== void 0 && budgetExhausted) {
13729
+ const budgetResult = {
13730
+ timestamp: (now ?? (() => /* @__PURE__ */ new Date()))().toISOString(),
13731
+ testId: evalCase.id,
13732
+ dataset: evalCase.dataset,
13733
+ score: 0,
13734
+ hits: [],
13735
+ misses: [],
13736
+ answer: "",
13737
+ target: target.name,
13738
+ error: `Suite budget exceeded ($${cumulativeBudgetCost.toFixed(4)} / $${totalBudgetUsd.toFixed(4)})`,
13739
+ budgetExceeded: true,
13740
+ executionStatus: "execution_error",
13741
+ failureStage: "setup",
13742
+ failureReasonCode: "budget_exceeded",
13743
+ executionError: {
13744
+ message: `Suite budget exceeded ($${cumulativeBudgetCost.toFixed(4)} / $${totalBudgetUsd.toFixed(4)})`,
13745
+ stage: "setup"
13414
13746
  }
13415
- } else {
13416
- caseCost = result.costUsd;
13747
+ };
13748
+ if (onProgress) {
13749
+ await onProgress({
13750
+ workerId,
13751
+ testId: evalCase.id,
13752
+ status: "failed",
13753
+ completedAt: Date.now(),
13754
+ error: budgetResult.error
13755
+ });
13417
13756
  }
13418
- if (caseCost !== void 0) {
13419
- cumulativeBudgetCost += caseCost;
13420
- if (cumulativeBudgetCost >= totalBudgetUsd) {
13421
- budgetExhausted = true;
13422
- }
13757
+ if (onResult) {
13758
+ await onResult(budgetResult);
13423
13759
  }
13760
+ return budgetResult;
13424
13761
  }
13425
- if (failOnError === true && result.executionStatus === "execution_error") {
13426
- failOnErrorTriggered = true;
13427
- }
13428
- if (beforeAllOutput && !beforeAllOutputAttached) {
13429
- result = { ...result, beforeAllOutput };
13430
- beforeAllOutputAttached = true;
13762
+ if (failOnError === true && failOnErrorTriggered) {
13763
+ const errorMsg = "Halted: execution error encountered with fail_on_error enabled";
13764
+ const haltResult = {
13765
+ timestamp: (now ?? (() => /* @__PURE__ */ new Date()))().toISOString(),
13766
+ testId: evalCase.id,
13767
+ dataset: evalCase.dataset,
13768
+ score: 0,
13769
+ hits: [],
13770
+ misses: [],
13771
+ answer: "",
13772
+ target: target.name,
13773
+ error: errorMsg,
13774
+ executionStatus: "execution_error",
13775
+ failureStage: "setup",
13776
+ failureReasonCode: "error_threshold_exceeded",
13777
+ executionError: { message: errorMsg, stage: "setup" }
13778
+ };
13779
+ if (onProgress) {
13780
+ await onProgress({
13781
+ workerId,
13782
+ testId: evalCase.id,
13783
+ status: "failed",
13784
+ completedAt: Date.now(),
13785
+ error: haltResult.error
13786
+ });
13787
+ }
13788
+ if (onResult) {
13789
+ await onResult(haltResult);
13790
+ }
13791
+ return haltResult;
13431
13792
  }
13432
13793
  if (onProgress) {
13433
13794
  await onProgress({
13434
13795
  workerId,
13435
13796
  testId: evalCase.id,
13436
- status: result.error ? "failed" : "completed",
13437
- startedAt: 0,
13438
- // Not used for completed status
13439
- completedAt: Date.now(),
13440
- error: result.error
13797
+ status: "running",
13798
+ startedAt: Date.now()
13441
13799
  });
13442
13800
  }
13443
- if (onResult) {
13444
- await onResult(result);
13801
+ const testPoolSlot = availablePoolSlots.length > 0 ? availablePoolSlots.pop() : void 0;
13802
+ const testWorkspacePath = testPoolSlot?.path ?? sharedWorkspacePath;
13803
+ const testBaselineCommit = testPoolSlot ? poolSlotBaselines.get(testPoolSlot.path) : sharedBaselineCommit;
13804
+ try {
13805
+ const judgeProvider = await resolveJudgeProvider(target);
13806
+ const runCaseOptions = {
13807
+ evalCase,
13808
+ provider: primaryProvider,
13809
+ target,
13810
+ evaluators: evaluatorRegistry,
13811
+ maxRetries,
13812
+ agentTimeoutMs,
13813
+ cache,
13814
+ useCache,
13815
+ now,
13816
+ judgeProvider,
13817
+ targetResolver,
13818
+ availableTargets,
13819
+ evalRunId,
13820
+ keepWorkspaces,
13821
+ cleanupWorkspaces,
13822
+ retainOnSuccess: resolvedRetainOnSuccess,
13823
+ retainOnFailure: resolvedRetainOnFailure,
13824
+ sharedWorkspacePath: testWorkspacePath,
13825
+ sharedBaselineCommit: testBaselineCommit,
13826
+ suiteWorkspaceFile,
13827
+ streamCallbacks,
13828
+ typeRegistry,
13829
+ repoManager,
13830
+ evalDir
13831
+ };
13832
+ let result = trials && trials.count > 1 ? await runEvalCaseWithTrials(runCaseOptions, trials) : await runEvalCase(runCaseOptions);
13833
+ if (totalBudgetUsd !== void 0) {
13834
+ let caseCost;
13835
+ if (result.trials && result.trials.length > 0) {
13836
+ const trialCostSum = result.trials.reduce((sum, t) => sum + (t.costUsd ?? 0), 0);
13837
+ if (trialCostSum > 0) {
13838
+ caseCost = trialCostSum;
13839
+ }
13840
+ } else {
13841
+ caseCost = result.costUsd;
13842
+ }
13843
+ if (caseCost !== void 0) {
13844
+ cumulativeBudgetCost += caseCost;
13845
+ if (cumulativeBudgetCost >= totalBudgetUsd) {
13846
+ budgetExhausted = true;
13847
+ }
13848
+ }
13849
+ }
13850
+ if (failOnError === true && result.executionStatus === "execution_error") {
13851
+ failOnErrorTriggered = true;
13852
+ }
13853
+ if (beforeAllOutput && !beforeAllOutputAttached) {
13854
+ result = { ...result, beforeAllOutput };
13855
+ beforeAllOutputAttached = true;
13856
+ }
13857
+ if (onProgress) {
13858
+ await onProgress({
13859
+ workerId,
13860
+ testId: evalCase.id,
13861
+ status: result.error ? "failed" : "completed",
13862
+ startedAt: 0,
13863
+ // Not used for completed status
13864
+ completedAt: Date.now(),
13865
+ error: result.error
13866
+ });
13867
+ }
13868
+ if (onResult) {
13869
+ await onResult(result);
13870
+ }
13871
+ return result;
13872
+ } catch (error) {
13873
+ if (onProgress) {
13874
+ await onProgress({
13875
+ workerId,
13876
+ testId: evalCase.id,
13877
+ status: "failed",
13878
+ completedAt: Date.now(),
13879
+ error: error instanceof Error ? error.message : String(error)
13880
+ });
13881
+ }
13882
+ throw error;
13883
+ } finally {
13884
+ if (testPoolSlot) {
13885
+ availablePoolSlots.push(testPoolSlot);
13886
+ }
13445
13887
  }
13446
- return result;
13447
- } catch (error) {
13448
- if (onProgress) {
13449
- await onProgress({
13450
- workerId,
13451
- testId: evalCase.id,
13452
- status: "failed",
13453
- completedAt: Date.now(),
13454
- error: error instanceof Error ? error.message : String(error)
13455
- });
13888
+ })
13889
+ );
13890
+ const settled = await Promise.allSettled(promises);
13891
+ const results = [];
13892
+ for (let i = 0; i < settled.length; i++) {
13893
+ const outcome = settled[i];
13894
+ if (outcome.status === "fulfilled") {
13895
+ results.push(outcome.value);
13896
+ } else {
13897
+ const evalCase = filteredEvalCases[i];
13898
+ const formattingMode = usesFileReferencePrompt(primaryProvider) ? "agent" : "lm";
13899
+ const promptInputs = await buildPromptInputs(evalCase, formattingMode);
13900
+ const errorResult = buildErrorResult(
13901
+ evalCase,
13902
+ target.name,
13903
+ (now ?? (() => /* @__PURE__ */ new Date()))(),
13904
+ outcome.reason,
13905
+ promptInputs,
13906
+ primaryProvider,
13907
+ "agent",
13908
+ "provider_error"
13909
+ );
13910
+ results.push(errorResult);
13911
+ if (onResult) {
13912
+ await onResult(errorResult);
13456
13913
  }
13457
- throw error;
13458
13914
  }
13459
- })
13460
- );
13461
- const settled = await Promise.allSettled(promises);
13462
- const results = [];
13463
- for (let i = 0; i < settled.length; i++) {
13464
- const outcome = settled[i];
13465
- if (outcome.status === "fulfilled") {
13466
- results.push(outcome.value);
13467
- } else {
13468
- const evalCase = filteredEvalCases[i];
13469
- const formattingMode = usesFileReferencePrompt(primaryProvider) ? "agent" : "lm";
13470
- const promptInputs = await buildPromptInputs(evalCase, formattingMode);
13471
- const errorResult = buildErrorResult(
13472
- evalCase,
13473
- target.name,
13474
- (now ?? (() => /* @__PURE__ */ new Date()))(),
13475
- outcome.reason,
13476
- promptInputs,
13477
- primaryProvider,
13478
- "agent",
13479
- "provider_error"
13480
- );
13481
- results.push(errorResult);
13482
- if (onResult) {
13483
- await onResult(errorResult);
13915
+ }
13916
+ const afterAllWorkspaces = poolSlots.length > 1 ? poolSlots.map((s) => s.path) : sharedWorkspacePath ? [sharedWorkspacePath] : [];
13917
+ const suiteAfterAllHook = suiteWorkspace?.hooks?.after_all_tests;
13918
+ if (afterAllWorkspaces.length > 0 && hasHookCommand(suiteAfterAllHook)) {
13919
+ const afterAllHook = suiteAfterAllHook;
13920
+ for (const wsPath of afterAllWorkspaces) {
13921
+ const scriptContext = {
13922
+ workspacePath: wsPath,
13923
+ testId: "__after_all__",
13924
+ evalRunId,
13925
+ evalDir
13926
+ };
13927
+ try {
13928
+ const afterAllOutput = await executeWorkspaceScript(
13929
+ toScriptConfig(afterAllHook, "after_all_tests", "suite workspace"),
13930
+ scriptContext,
13931
+ "warn"
13932
+ );
13933
+ if (afterAllOutput && results.length > 0 && wsPath === afterAllWorkspaces[0]) {
13934
+ results[results.length - 1] = { ...results[results.length - 1], afterAllOutput };
13935
+ }
13936
+ } catch {
13937
+ }
13484
13938
  }
13485
13939
  }
13486
- }
13487
- if (sharedWorkspacePath && suiteWorkspace?.after_all) {
13488
- const scriptContext = {
13489
- workspacePath: sharedWorkspacePath,
13490
- testId: "__after_all__",
13491
- evalRunId,
13492
- evalDir
13493
- };
13494
- try {
13495
- const afterAllOutput = await executeWorkspaceScript(
13496
- suiteWorkspace.after_all,
13497
- scriptContext,
13498
- "warn"
13499
- );
13500
- if (afterAllOutput && results.length > 0) {
13501
- results[results.length - 1] = { ...results[results.length - 1], afterAllOutput };
13940
+ if (sharedWorkspacePath && !poolSlot && poolSlots.length === 0 && !useStaticWorkspace) {
13941
+ const hasFailure = results.some((r) => !!r.error || r.score < 0.5);
13942
+ if (hasFailure) {
13943
+ if (resolvedRetainOnFailure === "cleanup") {
13944
+ await cleanupWorkspace(sharedWorkspacePath).catch(() => {
13945
+ });
13946
+ }
13947
+ } else if (resolvedRetainOnSuccess === "cleanup") {
13948
+ await cleanupWorkspace(sharedWorkspacePath).catch(() => {
13949
+ });
13502
13950
  }
13503
- } catch {
13504
13951
  }
13505
- }
13506
- if (sharedWorkspacePath) {
13507
- const hasFailure = results.some((r) => !!r.error || r.score < 0.5);
13508
13952
  if (cleanupWorkspaces) {
13509
- await cleanupWorkspace(sharedWorkspacePath).catch(() => {
13510
- });
13511
- } else if (!hasFailure && !keepWorkspaces) {
13512
- await cleanupWorkspace(sharedWorkspacePath).catch(() => {
13953
+ await cleanupEvalWorkspaces(evalRunId).catch(() => {
13513
13954
  });
13514
13955
  }
13956
+ return results;
13957
+ } finally {
13958
+ if (poolManager) {
13959
+ if (poolSlot) {
13960
+ await poolManager.releaseSlot(poolSlot);
13961
+ }
13962
+ for (const slot of poolSlots) {
13963
+ if (slot !== poolSlot) {
13964
+ await poolManager.releaseSlot(slot).catch(() => {
13965
+ });
13966
+ }
13967
+ }
13968
+ }
13515
13969
  }
13516
- if (cleanupWorkspaces) {
13517
- await cleanupEvalWorkspaces(evalRunId).catch(() => {
13518
- });
13519
- }
13520
- return results;
13521
13970
  }
13522
13971
  async function runBatchEvaluation(options) {
13523
13972
  const {
@@ -13689,6 +14138,8 @@ async function runEvalCase(options) {
13689
14138
  evalRunId,
13690
14139
  keepWorkspaces,
13691
14140
  cleanupWorkspaces: forceCleanup,
14141
+ retainOnSuccess,
14142
+ retainOnFailure,
13692
14143
  sharedWorkspacePath,
13693
14144
  sharedBaselineCommit,
13694
14145
  suiteWorkspaceFile,
@@ -13700,10 +14151,10 @@ async function runEvalCase(options) {
13700
14151
  const formattingMode = usesFileReferencePrompt(provider) ? "agent" : "lm";
13701
14152
  const promptInputs = await buildPromptInputs(evalCase, formattingMode);
13702
14153
  const typeRegistry = providedTypeRegistry ?? createBuiltinRegistry();
13703
- const cacheKey2 = useCache ? createCacheKey(provider, target, evalCase, promptInputs) : void 0;
14154
+ const cacheKey = useCache ? createCacheKey(provider, target, evalCase, promptInputs) : void 0;
13704
14155
  let cachedResponse;
13705
- if (cacheKey2 && cache) {
13706
- cachedResponse = await cache.get(cacheKey2);
14156
+ if (cacheKey && cache) {
14157
+ cachedResponse = await cache.get(cacheKey);
13707
14158
  }
13708
14159
  const nowFn = now ?? (() => /* @__PURE__ */ new Date());
13709
14160
  let workspacePath = sharedWorkspacePath;
@@ -13734,7 +14185,7 @@ async function runEvalCase(options) {
13734
14185
  );
13735
14186
  }
13736
14187
  if (caseWorkspaceFile && workspacePath) {
13737
- const copiedFile = path38.join(workspacePath, path38.basename(caseWorkspaceFile));
14188
+ const copiedFile = path39.join(workspacePath, path39.basename(caseWorkspaceFile));
13738
14189
  try {
13739
14190
  await stat7(copiedFile);
13740
14191
  caseWorkspaceFile = copiedFile;
@@ -13742,12 +14193,12 @@ async function runEvalCase(options) {
13742
14193
  }
13743
14194
  }
13744
14195
  }
13745
- if (!workspacePath && (evalCase.workspace?.before_all || evalCase.workspace?.repos?.length) && evalRunId) {
14196
+ if (!workspacePath && (evalCase.workspace?.hooks || evalCase.workspace?.repos?.length) && evalRunId) {
13746
14197
  workspacePath = getWorkspacePath(evalRunId, evalCase.id);
13747
14198
  await mkdir12(workspacePath, { recursive: true });
13748
14199
  }
13749
14200
  if (evalCase.workspace?.repos?.length && workspacePath) {
13750
- const perCaseRepoManager = new RepoManager(void 0, setupDebug);
14201
+ const perCaseRepoManager = new RepoManager(setupDebug);
13751
14202
  try {
13752
14203
  if (setupDebug) {
13753
14204
  console.log(
@@ -13772,11 +14223,13 @@ async function runEvalCase(options) {
13772
14223
  );
13773
14224
  }
13774
14225
  }
13775
- if (workspacePath && evalCase.workspace?.before_all) {
13776
- const beforeAllCommand = (evalCase.workspace.before_all.command ?? evalCase.workspace.before_all.script ?? []).join(" ");
14226
+ const caseBeforeAllHook = evalCase.workspace?.hooks?.before_all_tests;
14227
+ if (workspacePath && hasHookCommand(caseBeforeAllHook)) {
14228
+ const beforeAllHook = caseBeforeAllHook;
14229
+ const beforeAllCommand = (beforeAllHook.command ?? beforeAllHook.script ?? []).join(" ");
13777
14230
  if (setupDebug) {
13778
14231
  console.log(
13779
- `[setup] test=${evalCase.id} running before_all in cwd=${evalCase.workspace.before_all.cwd ?? evalDir} command=${beforeAllCommand}`
14232
+ `[setup] test=${evalCase.id} running before_all in cwd=${beforeAllHook.cwd ?? evalDir} command=${beforeAllCommand}`
13780
14233
  );
13781
14234
  }
13782
14235
  const scriptContext = {
@@ -13789,7 +14242,7 @@ async function runEvalCase(options) {
13789
14242
  };
13790
14243
  try {
13791
14244
  beforeAllOutput = await executeWorkspaceScript(
13792
- evalCase.workspace.before_all,
14245
+ toScriptConfig(beforeAllHook, "before_all_tests", `test '${evalCase.id}'`),
13793
14246
  scriptContext
13794
14247
  );
13795
14248
  if (setupDebug) {
@@ -13814,7 +14267,9 @@ async function runEvalCase(options) {
13814
14267
  }
13815
14268
  }
13816
14269
  }
13817
- if (workspacePath && evalCase.workspace?.before_each) {
14270
+ const caseBeforeEachHook = evalCase.workspace?.hooks?.before_each_test;
14271
+ if (workspacePath && hasHookCommand(caseBeforeEachHook)) {
14272
+ const beforeEachHook = caseBeforeEachHook;
13818
14273
  const scriptContext = {
13819
14274
  workspacePath,
13820
14275
  testId: evalCase.id,
@@ -13825,7 +14280,7 @@ async function runEvalCase(options) {
13825
14280
  };
13826
14281
  try {
13827
14282
  beforeEachOutput = await executeWorkspaceScript(
13828
- evalCase.workspace.before_each,
14283
+ toScriptConfig(beforeEachHook, "before_each_test", `test '${evalCase.id}'`),
13829
14284
  scriptContext
13830
14285
  );
13831
14286
  } catch (error) {
@@ -13913,8 +14368,8 @@ async function runEvalCase(options) {
13913
14368
  }
13914
14369
  return errorResult;
13915
14370
  }
13916
- if (cacheKey2 && cache && !cachedResponse) {
13917
- await cache.set(cacheKey2, providerResponse);
14371
+ if (cacheKey && cache && !cachedResponse) {
14372
+ await cache.set(cacheKey, providerResponse);
13918
14373
  }
13919
14374
  const output = providerResponse.output;
13920
14375
  const hasExecutionMetrics = providerResponse.tokenUsage !== void 0 || providerResponse.costUsd !== void 0 || providerResponse.durationMs !== void 0;
@@ -13942,17 +14397,19 @@ async function runEvalCase(options) {
13942
14397
  }
13943
14398
  }
13944
14399
  const providerError = extractProviderError(providerResponse);
13945
- if (repoManager && workspacePath && evalCase.workspace?.reset?.after_each && evalCase.workspace.reset.strategy && evalCase.workspace.reset.strategy !== "none" && evalCase.workspace.repos) {
14400
+ if (repoManager && workspacePath && evalCase.workspace?.hooks?.after_each_test?.reset && evalCase.workspace.hooks.after_each_test.reset !== "none" && evalCase.workspace.repos) {
13946
14401
  try {
13947
14402
  await repoManager.reset(
13948
14403
  evalCase.workspace.repos,
13949
14404
  workspacePath,
13950
- evalCase.workspace.reset.strategy
14405
+ evalCase.workspace.hooks.after_each_test.reset
13951
14406
  );
13952
14407
  } catch {
13953
14408
  }
13954
14409
  }
13955
- if (workspacePath && evalCase.workspace?.after_each) {
14410
+ const caseAfterEachHook = evalCase.workspace?.hooks?.after_each_test;
14411
+ if (workspacePath && hasHookCommand(caseAfterEachHook)) {
14412
+ const afterEachHook = caseAfterEachHook;
13956
14413
  const scriptContext = {
13957
14414
  workspacePath,
13958
14415
  testId: evalCase.id,
@@ -13963,7 +14420,7 @@ async function runEvalCase(options) {
13963
14420
  };
13964
14421
  try {
13965
14422
  afterEachOutput = await executeWorkspaceScript(
13966
- evalCase.workspace.after_each,
14423
+ toScriptConfig(afterEachHook, "after_each_test", `test '${evalCase.id}'`),
13967
14424
  scriptContext,
13968
14425
  "warn"
13969
14426
  );
@@ -14013,8 +14470,13 @@ async function runEvalCase(options) {
14013
14470
  await cleanupWorkspace(workspacePath).catch(() => {
14014
14471
  });
14015
14472
  } else if (isFailure) {
14016
- return { ...finalResult, workspacePath };
14017
- } else if (!keepWorkspaces) {
14473
+ if ((retainOnFailure ?? "keep") === "cleanup") {
14474
+ await cleanupWorkspace(workspacePath).catch(() => {
14475
+ });
14476
+ } else {
14477
+ return { ...finalResult, workspacePath };
14478
+ }
14479
+ } else if ((retainOnSuccess ?? (keepWorkspaces ? "keep" : "cleanup")) !== "keep") {
14018
14480
  await cleanupWorkspace(workspacePath).catch(() => {
14019
14481
  });
14020
14482
  }
@@ -14032,11 +14494,12 @@ async function runEvalCase(options) {
14032
14494
  "evaluator_error"
14033
14495
  );
14034
14496
  if (workspacePath && !isSharedWorkspace) {
14035
- if (forceCleanup) {
14497
+ if (forceCleanup || (retainOnFailure ?? "keep") === "cleanup") {
14036
14498
  await cleanupWorkspace(workspacePath).catch(() => {
14037
14499
  });
14500
+ } else {
14501
+ return { ...errorResult, workspacePath, beforeEachOutput, afterEachOutput };
14038
14502
  }
14039
- return { ...errorResult, workspacePath, beforeEachOutput, afterEachOutput };
14040
14503
  }
14041
14504
  return { ...errorResult, beforeEachOutput, afterEachOutput };
14042
14505
  }
@@ -14055,7 +14518,9 @@ async function runEvalCaseWithTrials(options, trialsConfig) {
14055
14518
  useCache: false,
14056
14519
  // Force cleanup for intermediate trials
14057
14520
  cleanupWorkspaces: isLastDeclaredTrial ? options.cleanupWorkspaces : true,
14058
- keepWorkspaces: isLastDeclaredTrial ? options.keepWorkspaces : false
14521
+ keepWorkspaces: isLastDeclaredTrial ? options.keepWorkspaces : false,
14522
+ retainOnSuccess: isLastDeclaredTrial ? options.retainOnSuccess : "cleanup",
14523
+ retainOnFailure: isLastDeclaredTrial ? options.retainOnFailure : "cleanup"
14059
14524
  };
14060
14525
  const result = await runEvalCase(trialOptions);
14061
14526
  allResults.push(result);
@@ -14344,7 +14809,7 @@ async function runEvaluatorList(options) {
14344
14809
  fileChanges,
14345
14810
  workspacePath
14346
14811
  };
14347
- const evalFileDir = evalCase.guideline_paths[0] ? path38.dirname(evalCase.guideline_paths[0]) : process.cwd();
14812
+ const evalFileDir = evalCase.guideline_paths[0] ? path39.dirname(evalCase.guideline_paths[0]) : process.cwd();
14348
14813
  const dispatchContext = {
14349
14814
  judgeProvider,
14350
14815
  targetResolver,
@@ -14647,7 +15112,7 @@ function computeWeightedMean(entries) {
14647
15112
 
14648
15113
  // src/evaluation/evaluate.ts
14649
15114
  import { existsSync as existsSync3 } from "node:fs";
14650
- import path39 from "node:path";
15115
+ import path40 from "node:path";
14651
15116
  async function evaluate(config) {
14652
15117
  const startTime = Date.now();
14653
15118
  if (config.tests && config.specFile) {
@@ -14669,13 +15134,13 @@ async function evaluate(config) {
14669
15134
  let evalCases;
14670
15135
  let testFilePath;
14671
15136
  if (config.specFile) {
14672
- testFilePath = path39.resolve(config.specFile);
15137
+ testFilePath = path40.resolve(config.specFile);
14673
15138
  evalCases = await loadTests(testFilePath, repoRoot, {
14674
15139
  verbose: config.verbose,
14675
15140
  filter: config.filter
14676
15141
  });
14677
15142
  } else {
14678
- testFilePath = path39.join(process.cwd(), "__programmatic__.yaml");
15143
+ testFilePath = path40.join(process.cwd(), "__programmatic__.yaml");
14679
15144
  evalCases = (config.tests ?? []).map((test) => {
14680
15145
  const input = typeof test.input === "string" ? [{ role: "user", content: test.input }] : test.input;
14681
15146
  const question = typeof test.input === "string" ? test.input : test.input.find((m) => m.role === "user")?.content ?? "";
@@ -14761,10 +15226,10 @@ function computeSummary(results, durationMs) {
14761
15226
  var TARGET_FILE_CANDIDATES = [".agentv/targets.yaml", ".agentv/targets.yml"];
14762
15227
  async function discoverDefaultTarget(repoRoot) {
14763
15228
  const cwd = process.cwd();
14764
- const chain = buildDirectoryChain(path39.join(cwd, "_placeholder"), repoRoot);
15229
+ const chain = buildDirectoryChain(path40.join(cwd, "_placeholder"), repoRoot);
14765
15230
  for (const dir of chain) {
14766
15231
  for (const candidate of TARGET_FILE_CANDIDATES) {
14767
- const targetsPath = path39.join(dir, candidate);
15232
+ const targetsPath = path40.join(dir, candidate);
14768
15233
  if (!existsSync3(targetsPath)) continue;
14769
15234
  try {
14770
15235
  const definitions = await readTargetDefinitions(targetsPath);
@@ -14779,10 +15244,10 @@ async function discoverDefaultTarget(repoRoot) {
14779
15244
  async function loadEnvHierarchy(repoRoot) {
14780
15245
  const { readFileSync: readFileSync2 } = await import("node:fs");
14781
15246
  const cwd = process.cwd();
14782
- const chain = buildDirectoryChain(path39.join(cwd, "_placeholder"), repoRoot);
15247
+ const chain = buildDirectoryChain(path40.join(cwd, "_placeholder"), repoRoot);
14783
15248
  const envFiles = [];
14784
15249
  for (const dir of chain) {
14785
- const envPath = path39.join(dir, ".env");
15250
+ const envPath = path40.join(dir, ".env");
14786
15251
  if (existsSync3(envPath)) envFiles.push(envPath);
14787
15252
  }
14788
15253
  for (let i = envFiles.length - 1; i >= 0; i--) {
@@ -14963,8 +15428,8 @@ function buildPrompt(criteria, question, referenceAnswer) {
14963
15428
  }
14964
15429
 
14965
15430
  // src/evaluation/cache/response-cache.ts
14966
- import { mkdir as mkdir13, readFile as readFile11, writeFile as writeFile8 } from "node:fs/promises";
14967
- import path40 from "node:path";
15431
+ import { mkdir as mkdir13, readFile as readFile12, writeFile as writeFile8 } from "node:fs/promises";
15432
+ import path41 from "node:path";
14968
15433
  var DEFAULT_CACHE_PATH = ".agentv/cache";
14969
15434
  var ResponseCache = class {
14970
15435
  cachePath;
@@ -14974,7 +15439,7 @@ var ResponseCache = class {
14974
15439
  async get(key) {
14975
15440
  const filePath = this.keyToPath(key);
14976
15441
  try {
14977
- const data = await readFile11(filePath, "utf8");
15442
+ const data = await readFile12(filePath, "utf8");
14978
15443
  return JSON.parse(data);
14979
15444
  } catch {
14980
15445
  return void 0;
@@ -14982,13 +15447,13 @@ var ResponseCache = class {
14982
15447
  }
14983
15448
  async set(key, value) {
14984
15449
  const filePath = this.keyToPath(key);
14985
- const dir = path40.dirname(filePath);
15450
+ const dir = path41.dirname(filePath);
14986
15451
  await mkdir13(dir, { recursive: true });
14987
15452
  await writeFile8(filePath, JSON.stringify(value, null, 2), "utf8");
14988
15453
  }
14989
15454
  keyToPath(key) {
14990
15455
  const prefix = key.slice(0, 2);
14991
- return path40.join(this.cachePath, prefix, `${key}.json`);
15456
+ return path41.join(this.cachePath, prefix, `${key}.json`);
14992
15457
  }
14993
15458
  };
14994
15459
  function shouldEnableCache(params) {
@@ -15470,6 +15935,7 @@ export {
15470
15935
  TokenUsageEvaluator,
15471
15936
  ToolTrajectoryEvaluator,
15472
15937
  WorkspaceCreationError,
15938
+ WorkspacePoolManager,
15473
15939
  assembleLlmJudgePrompt,
15474
15940
  avgToolDurationMs,
15475
15941
  buildDirectoryChain,
@@ -15484,6 +15950,7 @@ export {
15484
15950
  cleanupEvalWorkspaces,
15485
15951
  cleanupWorkspace,
15486
15952
  computeTraceSummary,
15953
+ computeWorkspaceFingerprint,
15487
15954
  consumeClaudeLogEntries,
15488
15955
  consumeCodexLogEntries,
15489
15956
  consumeCopilotCliLogEntries,
@@ -15516,11 +15983,11 @@ export {
15516
15983
  freeformEvaluationSchema,
15517
15984
  generateRubrics,
15518
15985
  getAgentvHome,
15519
- getGitCacheRoot,
15520
15986
  getHitCount,
15521
15987
  getSubagentsRoot,
15522
15988
  getTraceStateRoot,
15523
15989
  getWorkspacePath,
15990
+ getWorkspacePoolRoot,
15524
15991
  getWorkspacesRoot,
15525
15992
  initializeBaseline,
15526
15993
  isEvaluatorKind,