@agentv/core 2.14.3 → 2.15.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
package/dist/index.cjs CHANGED
@@ -1244,12 +1244,12 @@ function serializeAttributeValue(value) {
1244
1244
  if (Array.isArray(value)) return { arrayValue: { values: value.map(serializeAttributeValue) } };
1245
1245
  return { stringValue: String(value) };
1246
1246
  }
1247
- var import_promises31, import_node_path44, OtlpJsonFileExporter;
1247
+ var import_promises32, import_node_path45, OtlpJsonFileExporter;
1248
1248
  var init_otlp_json_file_exporter = __esm({
1249
1249
  "src/observability/otlp-json-file-exporter.ts"() {
1250
1250
  "use strict";
1251
- import_promises31 = require("fs/promises");
1252
- import_node_path44 = require("path");
1251
+ import_promises32 = require("fs/promises");
1252
+ import_node_path45 = require("path");
1253
1253
  OtlpJsonFileExporter = class {
1254
1254
  // biome-ignore lint/suspicious/noExplicitAny: serialized span data
1255
1255
  spans = [];
@@ -1288,7 +1288,7 @@ var init_otlp_json_file_exporter = __esm({
1288
1288
  }
1289
1289
  async flush() {
1290
1290
  if (this.spans.length === 0) return;
1291
- await (0, import_promises31.mkdir)((0, import_node_path44.dirname)(this.filePath), { recursive: true });
1291
+ await (0, import_promises32.mkdir)((0, import_node_path45.dirname)(this.filePath), { recursive: true });
1292
1292
  const otlpJson = {
1293
1293
  resourceSpans: [
1294
1294
  {
@@ -1302,8 +1302,8 @@ var init_otlp_json_file_exporter = __esm({
1302
1302
  }
1303
1303
  ]
1304
1304
  };
1305
- const { writeFile: writeFile9 } = await import("fs/promises");
1306
- await writeFile9(this.filePath, JSON.stringify(otlpJson, null, 2));
1305
+ const { writeFile: writeFile10 } = await import("fs/promises");
1306
+ await writeFile10(this.filePath, JSON.stringify(otlpJson, null, 2));
1307
1307
  }
1308
1308
  };
1309
1309
  }
@@ -1319,13 +1319,13 @@ function hrTimeDiffMs(start, end) {
1319
1319
  const diffNano = end[1] - start[1];
1320
1320
  return Math.round(diffSec * 1e3 + diffNano / 1e6);
1321
1321
  }
1322
- var import_node_fs13, import_promises32, import_node_path45, SimpleTraceFileExporter;
1322
+ var import_node_fs14, import_promises33, import_node_path46, SimpleTraceFileExporter;
1323
1323
  var init_simple_trace_file_exporter = __esm({
1324
1324
  "src/observability/simple-trace-file-exporter.ts"() {
1325
1325
  "use strict";
1326
- import_node_fs13 = require("fs");
1327
- import_promises32 = require("fs/promises");
1328
- import_node_path45 = require("path");
1326
+ import_node_fs14 = require("fs");
1327
+ import_promises33 = require("fs/promises");
1328
+ import_node_path46 = require("path");
1329
1329
  SimpleTraceFileExporter = class {
1330
1330
  stream = null;
1331
1331
  filePath;
@@ -1338,8 +1338,8 @@ var init_simple_trace_file_exporter = __esm({
1338
1338
  async ensureStream() {
1339
1339
  if (!this.streamReady) {
1340
1340
  this.streamReady = (async () => {
1341
- await (0, import_promises32.mkdir)((0, import_node_path45.dirname)(this.filePath), { recursive: true });
1342
- this.stream = (0, import_node_fs13.createWriteStream)(this.filePath, { flags: "w" });
1341
+ await (0, import_promises33.mkdir)((0, import_node_path46.dirname)(this.filePath), { recursive: true });
1342
+ this.stream = (0, import_node_fs14.createWriteStream)(this.filePath, { flags: "w" });
1343
1343
  return this.stream;
1344
1344
  })();
1345
1345
  }
@@ -1457,6 +1457,7 @@ __export(index_exports, {
1457
1457
  TokenUsageEvaluator: () => TokenUsageEvaluator,
1458
1458
  ToolTrajectoryEvaluator: () => ToolTrajectoryEvaluator,
1459
1459
  WorkspaceCreationError: () => WorkspaceCreationError,
1460
+ WorkspacePoolManager: () => WorkspacePoolManager,
1460
1461
  assembleLlmJudgePrompt: () => assembleLlmJudgePrompt,
1461
1462
  avgToolDurationMs: () => avgToolDurationMs,
1462
1463
  buildDirectoryChain: () => buildDirectoryChain2,
@@ -1471,6 +1472,7 @@ __export(index_exports, {
1471
1472
  cleanupEvalWorkspaces: () => cleanupEvalWorkspaces,
1472
1473
  cleanupWorkspace: () => cleanupWorkspace,
1473
1474
  computeTraceSummary: () => computeTraceSummary,
1475
+ computeWorkspaceFingerprint: () => computeWorkspaceFingerprint,
1474
1476
  consumeClaudeLogEntries: () => consumeClaudeLogEntries,
1475
1477
  consumeCodexLogEntries: () => consumeCodexLogEntries,
1476
1478
  consumeCopilotCliLogEntries: () => consumeCopilotCliLogEntries,
@@ -1508,6 +1510,7 @@ __export(index_exports, {
1508
1510
  getSubagentsRoot: () => getSubagentsRoot,
1509
1511
  getTraceStateRoot: () => getTraceStateRoot,
1510
1512
  getWorkspacePath: () => getWorkspacePath,
1513
+ getWorkspacePoolRoot: () => getWorkspacePoolRoot,
1511
1514
  getWorkspacesRoot: () => getWorkspacesRoot,
1512
1515
  initializeBaseline: () => initializeBaseline,
1513
1516
  isEvaluatorKind: () => isEvaluatorKind,
@@ -2236,6 +2239,17 @@ function parseExecutionDefaults(raw, configPath) {
2236
2239
  } else if (otelFile !== void 0) {
2237
2240
  logWarning(`Invalid execution.otel_file in ${configPath}, expected non-empty string`);
2238
2241
  }
2242
+ if (typeof obj.pool_workspaces === "boolean") {
2243
+ result.pool_workspaces = obj.pool_workspaces;
2244
+ } else if (obj.pool_workspaces !== void 0) {
2245
+ logWarning(`Invalid execution.pool_workspaces in ${configPath}, expected boolean`);
2246
+ }
2247
+ const poolSlots = obj.pool_slots;
2248
+ if (typeof poolSlots === "number" && Number.isInteger(poolSlots) && poolSlots >= 1 && poolSlots <= 50) {
2249
+ result.pool_slots = poolSlots;
2250
+ } else if (poolSlots !== void 0) {
2251
+ logWarning(`Invalid execution.pool_slots in ${configPath}, expected integer 1-50`);
2252
+ }
2239
2253
  return Object.keys(result).length > 0 ? result : void 0;
2240
2254
  }
2241
2255
  function logWarning(message) {
@@ -3677,6 +3691,7 @@ async function processMessages(options) {
3677
3691
  repoRootPath,
3678
3692
  guidelinePatterns,
3679
3693
  guidelinePaths,
3694
+ treatFileSegmentsAsGuidelines,
3680
3695
  textParts,
3681
3696
  messageType,
3682
3697
  verbose
@@ -3724,16 +3739,20 @@ async function processMessages(options) {
3724
3739
  }
3725
3740
  try {
3726
3741
  const fileContent = (await (0, import_promises5.readFile)(resolvedPath, "utf8")).replace(/\r\n/g, "\n");
3727
- if (messageType === "input" && guidelinePatterns && guidelinePaths) {
3728
- const relativeToRepo = import_node_path5.default.relative(repoRootPath, resolvedPath);
3729
- if (isGuidelineFile(relativeToRepo, guidelinePatterns)) {
3730
- guidelinePaths.push(import_node_path5.default.resolve(resolvedPath));
3731
- if (verbose) {
3732
- console.log(` [Guideline] Found: ${displayPath}`);
3733
- console.log(` Resolved to: ${resolvedPath}`);
3734
- }
3735
- continue;
3742
+ const classifyAsGuideline = shouldTreatAsGuideline({
3743
+ messageType,
3744
+ resolvedPath,
3745
+ repoRootPath,
3746
+ guidelinePatterns,
3747
+ treatFileSegmentsAsGuidelines
3748
+ });
3749
+ if (classifyAsGuideline && guidelinePaths) {
3750
+ guidelinePaths.push(import_node_path5.default.resolve(resolvedPath));
3751
+ if (verbose) {
3752
+ console.log(` [Guideline] Found: ${displayPath}`);
3753
+ console.log(` Resolved to: ${resolvedPath}`);
3736
3754
  }
3755
+ continue;
3737
3756
  }
3738
3757
  segments.push({
3739
3758
  type: "file",
@@ -3762,6 +3781,26 @@ async function processMessages(options) {
3762
3781
  }
3763
3782
  return segments;
3764
3783
  }
3784
+ function shouldTreatAsGuideline(options) {
3785
+ const {
3786
+ messageType,
3787
+ resolvedPath,
3788
+ repoRootPath,
3789
+ guidelinePatterns,
3790
+ treatFileSegmentsAsGuidelines
3791
+ } = options;
3792
+ if (messageType !== "input") {
3793
+ return false;
3794
+ }
3795
+ if (treatFileSegmentsAsGuidelines) {
3796
+ return true;
3797
+ }
3798
+ if (!guidelinePatterns || guidelinePatterns.length === 0) {
3799
+ return false;
3800
+ }
3801
+ const relativeToRepo = import_node_path5.default.relative(repoRootPath, resolvedPath);
3802
+ return isGuidelineFile(relativeToRepo, guidelinePatterns);
3803
+ }
3765
3804
  function asString3(value) {
3766
3805
  return typeof value === "string" ? value : void 0;
3767
3806
  }
@@ -4100,6 +4139,8 @@ async function loadTestsFromJsonl(evalFilePath, repoRoot, options) {
4100
4139
  for (const guidelinePath of testCase.guideline_paths) {
4101
4140
  console.log(` - ${guidelinePath}`);
4102
4141
  }
4142
+ } else if (!guidelinePatterns || guidelinePatterns.length === 0) {
4143
+ console.log(" No guidelines found (guideline_patterns not configured)");
4103
4144
  } else {
4104
4145
  console.log(" No guidelines found");
4105
4146
  }
@@ -4469,7 +4510,7 @@ async function loadTestsFromYaml(evalFilePath, repoRoot, options) {
4469
4510
  } else {
4470
4511
  throw new Error(`Invalid test file format: ${evalFilePath} - missing 'tests' field`);
4471
4512
  }
4472
- const suiteWorkspace = parseWorkspaceConfig(suite.workspace, evalFileDir);
4513
+ const suiteWorkspace = await resolveWorkspaceConfig(suite.workspace, evalFileDir);
4473
4514
  const suiteInputMessages = expandInputShorthand(suite.input);
4474
4515
  const rawGlobalExecution = isJsonObject(suite.execution) ? suite.execution : void 0;
4475
4516
  const _globalTarget = asString6(rawGlobalExecution?.target) ?? asString6(suite.target);
@@ -4505,12 +4546,24 @@ async function loadTestsFromYaml(evalFilePath, repoRoot, options) {
4505
4546
  }
4506
4547
  const caseExecution = isJsonObject(evalcase.execution) ? evalcase.execution : void 0;
4507
4548
  const skipDefaults = caseExecution?.skip_defaults === true;
4508
- const inputMessages = suiteInputMessages && !skipDefaults ? [...suiteInputMessages, ...testInputMessages] : testInputMessages;
4549
+ const effectiveSuiteInputMessages = suiteInputMessages && !skipDefaults ? suiteInputMessages : void 0;
4550
+ const inputMessages = effectiveSuiteInputMessages ? [...effectiveSuiteInputMessages, ...testInputMessages] : testInputMessages;
4509
4551
  const hasExpectedMessages = expectedMessages.length > 0;
4510
4552
  const guidelinePaths = [];
4511
4553
  const inputTextParts = [];
4512
- const inputSegments = await processMessages({
4513
- messages: inputMessages,
4554
+ const suiteInputSegments = effectiveSuiteInputMessages ? await processMessages({
4555
+ messages: effectiveSuiteInputMessages,
4556
+ searchRoots,
4557
+ repoRootPath,
4558
+ guidelinePatterns,
4559
+ guidelinePaths,
4560
+ treatFileSegmentsAsGuidelines: true,
4561
+ textParts: inputTextParts,
4562
+ messageType: "input",
4563
+ verbose
4564
+ }) : [];
4565
+ const testInputSegments = await processMessages({
4566
+ messages: testInputMessages,
4514
4567
  searchRoots,
4515
4568
  repoRootPath,
4516
4569
  guidelinePatterns,
@@ -4519,6 +4572,7 @@ async function loadTestsFromYaml(evalFilePath, repoRoot, options) {
4519
4572
  messageType: "input",
4520
4573
  verbose
4521
4574
  });
4575
+ const inputSegments = [...suiteInputSegments, ...testInputSegments];
4522
4576
  const outputSegments = hasExpectedMessages ? await processExpectedMessages({
4523
4577
  messages: expectedMessages,
4524
4578
  searchRoots,
@@ -4566,7 +4620,7 @@ async function loadTestsFromYaml(evalFilePath, repoRoot, options) {
4566
4620
  ...guidelinePaths.map((guidelinePath) => import_node_path8.default.resolve(guidelinePath)),
4567
4621
  ...userFilePaths
4568
4622
  ];
4569
- const caseWorkspace = parseWorkspaceConfig(evalcase.workspace, evalFileDir);
4623
+ const caseWorkspace = await resolveWorkspaceConfig(evalcase.workspace, evalFileDir);
4570
4624
  const mergedWorkspace = mergeWorkspaceConfigs(suiteWorkspace, caseWorkspace);
4571
4625
  const metadata = isJsonObject(evalcase.metadata) ? evalcase.metadata : void 0;
4572
4626
  const caseTargets = extractTargetsFromTestCase(evalcase);
@@ -4597,6 +4651,8 @@ async function loadTestsFromYaml(evalFilePath, repoRoot, options) {
4597
4651
  for (const guidelinePath of testCase.guideline_paths) {
4598
4652
  console.log(` - ${guidelinePath}`);
4599
4653
  }
4654
+ } else if (!guidelinePatterns || guidelinePatterns.length === 0) {
4655
+ console.log(" No guidelines found (guideline_patterns not configured)");
4600
4656
  } else {
4601
4657
  console.log(" No guidelines found");
4602
4658
  }
@@ -4696,6 +4752,26 @@ function parseResetConfig(raw) {
4696
4752
  ...afterEach !== void 0 && { after_each: afterEach }
4697
4753
  };
4698
4754
  }
4755
+ async function resolveWorkspaceConfig(raw, evalFileDir) {
4756
+ if (typeof raw === "string") {
4757
+ const workspaceFilePath = import_node_path8.default.resolve(evalFileDir, raw);
4758
+ let content;
4759
+ try {
4760
+ content = await (0, import_promises8.readFile)(workspaceFilePath, "utf8");
4761
+ } catch {
4762
+ throw new Error(`Workspace file not found: ${raw} (resolved to ${workspaceFilePath})`);
4763
+ }
4764
+ const parsed = (0, import_yaml4.parse)(content);
4765
+ if (!isJsonObject(parsed)) {
4766
+ throw new Error(
4767
+ `Invalid workspace file format: ${workspaceFilePath} (expected a YAML object)`
4768
+ );
4769
+ }
4770
+ const workspaceFileDir = import_node_path8.default.dirname(workspaceFilePath);
4771
+ return parseWorkspaceConfig(parsed, workspaceFileDir);
4772
+ }
4773
+ return parseWorkspaceConfig(raw, evalFileDir);
4774
+ }
4699
4775
  function parseWorkspaceConfig(raw, evalFileDir) {
4700
4776
  if (!isJsonObject(raw)) return void 0;
4701
4777
  const obj = raw;
@@ -9493,8 +9569,8 @@ function resolveCliConfig(target, env, evalFilePath) {
9493
9569
  const parseResult = CliTargetInputSchema.safeParse(target, { errorMap: cliErrorMap });
9494
9570
  if (!parseResult.success) {
9495
9571
  const firstError = parseResult.error.errors[0];
9496
- const path43 = firstError?.path.join(".") || "";
9497
- const prefix = path43 ? `${target.name} ${path43}: ` : `${target.name}: `;
9572
+ const path44 = firstError?.path.join(".") || "";
9573
+ const prefix = path44 ? `${target.name} ${path44}: ` : `${target.name}: `;
9498
9574
  throw new Error(`${prefix}${firstError?.message}`);
9499
9575
  }
9500
9576
  const normalized = normalizeCliTargetInput(parseResult.data, env, evalFilePath);
@@ -10010,6 +10086,9 @@ function getSubagentsRoot() {
10010
10086
  function getTraceStateRoot() {
10011
10087
  return import_node_path23.default.join(getAgentvHome(), "trace-state");
10012
10088
  }
10089
+ function getWorkspacePoolRoot() {
10090
+ return import_node_path23.default.join(getAgentvHome(), "workspace-pool");
10091
+ }
10013
10092
 
10014
10093
  // src/evaluation/providers/vscode/dispatch/constants.ts
10015
10094
  var DEFAULT_LOCK_NAME = "subagent.lock";
@@ -10832,8 +10911,6 @@ var AGENTV_REQUEST_TEMPLATE = `[[ ## task ## ]]
10832
10911
 
10833
10912
  **IMPORTANT**: Follow these exact steps:
10834
10913
  1. Create and write your complete response to: {{responseFileTmp}}
10835
- - All intended file outputs/changes MUST be written in your response file.
10836
- - For each intended file, include the repo name, relative path and unified git diff following the convention \`diff --git ...\`.
10837
10914
  2. When completely finished, run these PowerShell commands to signal completion:
10838
10915
  \`\`\`
10839
10916
  Move-Item -LiteralPath '{{responseFileTmp}}' -Destination '{{responseFileFinal}}'
@@ -10850,8 +10927,6 @@ var AGENTV_BATCH_REQUEST_TEMPLATE = `[[ ## task ## ]]
10850
10927
 
10851
10928
  **IMPORTANT**: Follow these exact steps:
10852
10929
  1. Create and write your complete response to: {{responseFileTmp}}
10853
- - All intended file outputs/changes MUST be written in your response file.
10854
- - For each intended file, include the repo name, relative path and unified git diff following the convention \`diff --git ...\`.
10855
10930
  2. When completely finished and the response is stable, rename it to: {{responseFileFinal}}
10856
10931
  3. Do not unlock the workspace from this request; batch orchestration will handle unlocking after all responses are ready.
10857
10932
  `;
@@ -11464,16 +11539,16 @@ async function execFileWithStdinNode(argv, stdinPayload, options) {
11464
11539
  });
11465
11540
  }
11466
11541
  async function execShellWithStdin(command, stdinPayload, options = {}) {
11467
- const { mkdir: mkdir16, readFile: readFile13, rm: rm6, writeFile: writeFile9 } = await import("fs/promises");
11542
+ const { mkdir: mkdir17, readFile: readFile14, rm: rm7, writeFile: writeFile10 } = await import("fs/promises");
11468
11543
  const { tmpdir: tmpdir3 } = await import("os");
11469
- const path43 = await import("path");
11544
+ const path44 = await import("path");
11470
11545
  const { randomUUID: randomUUID8 } = await import("crypto");
11471
- const dir = path43.join(tmpdir3(), `agentv-exec-${randomUUID8()}`);
11472
- await mkdir16(dir, { recursive: true });
11473
- const stdinPath = path43.join(dir, "stdin.txt");
11474
- const stdoutPath = path43.join(dir, "stdout.txt");
11475
- const stderrPath = path43.join(dir, "stderr.txt");
11476
- await writeFile9(stdinPath, stdinPayload, "utf8");
11546
+ const dir = path44.join(tmpdir3(), `agentv-exec-${randomUUID8()}`);
11547
+ await mkdir17(dir, { recursive: true });
11548
+ const stdinPath = path44.join(dir, "stdin.txt");
11549
+ const stdoutPath = path44.join(dir, "stdout.txt");
11550
+ const stderrPath = path44.join(dir, "stderr.txt");
11551
+ await writeFile10(stdinPath, stdinPayload, "utf8");
11477
11552
  const wrappedCommand = process.platform === "win32" ? `(${command}) < ${shellEscapePath(stdinPath)} > ${shellEscapePath(stdoutPath)} 2> ${shellEscapePath(stderrPath)}` : `(${command}) < ${shellEscapePath(stdinPath)} > ${shellEscapePath(stdoutPath)} 2> ${shellEscapePath(stderrPath)}`;
11478
11553
  const { spawn: spawn4 } = await import("child_process");
11479
11554
  try {
@@ -11502,11 +11577,11 @@ async function execShellWithStdin(command, stdinPayload, options = {}) {
11502
11577
  resolve(code ?? 0);
11503
11578
  });
11504
11579
  });
11505
- const stdout = (await readFile13(stdoutPath, "utf8")).replace(/\r\n/g, "\n");
11506
- const stderr = (await readFile13(stderrPath, "utf8")).replace(/\r\n/g, "\n");
11580
+ const stdout = (await readFile14(stdoutPath, "utf8")).replace(/\r\n/g, "\n");
11581
+ const stderr = (await readFile14(stderrPath, "utf8")).replace(/\r\n/g, "\n");
11507
11582
  return { stdout, stderr, exitCode };
11508
11583
  } finally {
11509
- await rm6(dir, { recursive: true, force: true });
11584
+ await rm7(dir, { recursive: true, force: true });
11510
11585
  }
11511
11586
  }
11512
11587
 
@@ -11824,7 +11899,7 @@ var CodeEvaluator = class {
11824
11899
  outputPath,
11825
11900
  guidelineFiles: context2.evalCase.guideline_paths,
11826
11901
  inputFiles: context2.evalCase.file_paths.filter(
11827
- (path43) => !context2.evalCase.guideline_paths.includes(path43)
11902
+ (path44) => !context2.evalCase.guideline_paths.includes(path44)
11828
11903
  ),
11829
11904
  input: context2.evalCase.input,
11830
11905
  trace: context2.trace ?? null,
@@ -12103,6 +12178,8 @@ ${context2.fileChanges}`;
12103
12178
  };
12104
12179
  } catch (e) {
12105
12180
  const message = e instanceof Error ? e.message : String(e);
12181
+ const evalName = context2.evaluator?.name ?? "llm-judge";
12182
+ console.warn(`\u26A0 LLM judge "${evalName}" failed after 3 attempts (${message}) \u2014 skipped`);
12106
12183
  return {
12107
12184
  score: 0,
12108
12185
  verdict: "skip",
@@ -12131,24 +12208,39 @@ ${context2.fileChanges}`;
12131
12208
  systemPrompt,
12132
12209
  target: judgeProvider.targetName
12133
12210
  };
12134
- const { data, tokenUsage } = await this.runWithRetry({
12135
- context: context2,
12136
- judgeProvider,
12137
- systemPrompt,
12138
- userPrompt: prompt,
12139
- schema: rubricEvaluationSchema
12140
- });
12141
- const { score, verdict, hits, misses } = calculateRubricScore(data, rubrics);
12142
- return {
12143
- score,
12144
- verdict,
12145
- hits,
12146
- misses,
12147
- expectedAspectCount: rubrics.length,
12148
- reasoning: data.overall_reasoning,
12149
- evaluatorRawRequest,
12150
- tokenUsage
12151
- };
12211
+ try {
12212
+ const { data, tokenUsage } = await this.runWithRetry({
12213
+ context: context2,
12214
+ judgeProvider,
12215
+ systemPrompt,
12216
+ userPrompt: prompt,
12217
+ schema: rubricEvaluationSchema
12218
+ });
12219
+ const { score, verdict, hits, misses } = calculateRubricScore(data, rubrics);
12220
+ return {
12221
+ score,
12222
+ verdict,
12223
+ hits,
12224
+ misses,
12225
+ expectedAspectCount: rubrics.length,
12226
+ reasoning: data.overall_reasoning,
12227
+ evaluatorRawRequest,
12228
+ tokenUsage
12229
+ };
12230
+ } catch (e) {
12231
+ const message = e instanceof Error ? e.message : String(e);
12232
+ const evalName = context2.evaluator?.name ?? "llm-judge";
12233
+ console.warn(`\u26A0 LLM judge "${evalName}" failed after 3 attempts (${message}) \u2014 skipped`);
12234
+ return {
12235
+ score: 0,
12236
+ verdict: "skip",
12237
+ hits: [],
12238
+ misses: [`Judge parse failure after 3 attempts: ${message}`],
12239
+ expectedAspectCount: rubrics.length,
12240
+ reasoning: `Judge parse failure after 3 attempts: ${message}`,
12241
+ evaluatorRawRequest
12242
+ };
12243
+ }
12152
12244
  }
12153
12245
  /**
12154
12246
  * Evaluate using score-range rubrics (analytic rubric scoring).
@@ -12162,25 +12254,40 @@ ${context2.fileChanges}`;
12162
12254
  systemPrompt,
12163
12255
  target: judgeProvider.targetName
12164
12256
  };
12165
- const { data, tokenUsage } = await this.runWithRetry({
12166
- context: context2,
12167
- judgeProvider,
12168
- systemPrompt,
12169
- userPrompt: prompt,
12170
- schema: scoreRangeEvaluationSchema
12171
- });
12172
- const { score, verdict, hits, misses, details } = calculateScoreRangeResult(data, rubrics);
12173
- return {
12174
- score,
12175
- verdict,
12176
- hits,
12177
- misses,
12178
- expectedAspectCount: rubrics.length,
12179
- reasoning: data.overall_reasoning,
12180
- evaluatorRawRequest,
12181
- details,
12182
- tokenUsage
12183
- };
12257
+ try {
12258
+ const { data, tokenUsage } = await this.runWithRetry({
12259
+ context: context2,
12260
+ judgeProvider,
12261
+ systemPrompt,
12262
+ userPrompt: prompt,
12263
+ schema: scoreRangeEvaluationSchema
12264
+ });
12265
+ const { score, verdict, hits, misses, details } = calculateScoreRangeResult(data, rubrics);
12266
+ return {
12267
+ score,
12268
+ verdict,
12269
+ hits,
12270
+ misses,
12271
+ expectedAspectCount: rubrics.length,
12272
+ reasoning: data.overall_reasoning,
12273
+ evaluatorRawRequest,
12274
+ details,
12275
+ tokenUsage
12276
+ };
12277
+ } catch (e) {
12278
+ const message = e instanceof Error ? e.message : String(e);
12279
+ const evalName = context2.evaluator?.name ?? "llm-judge";
12280
+ console.warn(`\u26A0 LLM judge "${evalName}" failed after 3 attempts (${message}) \u2014 skipped`);
12281
+ return {
12282
+ score: 0,
12283
+ verdict: "skip",
12284
+ hits: [],
12285
+ misses: [`Judge parse failure after 3 attempts: ${message}`],
12286
+ expectedAspectCount: rubrics.length,
12287
+ reasoning: `Judge parse failure after 3 attempts: ${message}`,
12288
+ evaluatorRawRequest
12289
+ };
12290
+ }
12184
12291
  }
12185
12292
  /**
12186
12293
  * Build prompt for score-range rubric evaluation.
@@ -12466,19 +12573,13 @@ var CompositeEvaluator = class {
12466
12573
  runWeightedAverage(results, weights) {
12467
12574
  let totalWeight = 0;
12468
12575
  let weightedSum = 0;
12576
+ let evaluatedCount = 0;
12469
12577
  const allHits = [];
12470
12578
  const allMisses = [];
12471
12579
  const reasoningParts = [];
12472
12580
  const scores = [];
12473
12581
  for (const member of results) {
12474
12582
  const weight = weights?.[member.id] ?? 1;
12475
- totalWeight += weight;
12476
- weightedSum += member.result.score * weight;
12477
- allHits.push(...member.result.hits.map((h) => `[${member.id}] ${h}`));
12478
- allMisses.push(...member.result.misses.map((m) => `[${member.id}] ${m}`));
12479
- if (member.result.reasoning) {
12480
- reasoningParts.push(`${member.id}: ${member.result.reasoning}`);
12481
- }
12482
12583
  scores.push({
12483
12584
  name: member.id,
12484
12585
  type: member.type,
@@ -12493,6 +12594,32 @@ var CompositeEvaluator = class {
12493
12594
  details: member.result.details,
12494
12595
  tokenUsage: member.result.tokenUsage
12495
12596
  });
12597
+ if (member.result.verdict === "skip") {
12598
+ continue;
12599
+ }
12600
+ evaluatedCount++;
12601
+ totalWeight += weight;
12602
+ weightedSum += member.result.score * weight;
12603
+ allHits.push(...member.result.hits.map((h) => `[${member.id}] ${h}`));
12604
+ allMisses.push(...member.result.misses.map((m) => `[${member.id}] ${m}`));
12605
+ if (member.result.reasoning) {
12606
+ reasoningParts.push(`${member.id}: ${member.result.reasoning}`);
12607
+ }
12608
+ }
12609
+ if (evaluatedCount === 0 && results.length > 0) {
12610
+ return {
12611
+ score: 0,
12612
+ verdict: "skip",
12613
+ hits: [],
12614
+ misses: [],
12615
+ expectedAspectCount: 1,
12616
+ reasoning: "All evaluators skipped (infrastructure failure)",
12617
+ evaluatorRawRequest: {
12618
+ aggregator: "weighted_average",
12619
+ ...weights ? { weights } : {}
12620
+ },
12621
+ scores
12622
+ };
12496
12623
  }
12497
12624
  const finalScore = totalWeight > 0 ? weightedSum / totalWeight : 0;
12498
12625
  return {
@@ -12516,19 +12643,8 @@ var CompositeEvaluator = class {
12516
12643
  const reasoningParts = [];
12517
12644
  let passingCount = 0;
12518
12645
  let borderlineCount = 0;
12646
+ let evaluatedCount = 0;
12519
12647
  for (const member of results) {
12520
- const isPassing = member.result.verdict === "pass" || member.result.verdict === "borderline";
12521
- if (isPassing) {
12522
- passingCount++;
12523
- if (member.result.verdict === "borderline") {
12524
- borderlineCount++;
12525
- }
12526
- }
12527
- allHits.push(...member.result.hits.map((h) => `[${member.id}] ${h}`));
12528
- allMisses.push(...member.result.misses.map((m) => `[${member.id}] ${m}`));
12529
- if (member.result.reasoning) {
12530
- reasoningParts.push(`${member.id}: ${member.result.reasoning}`);
12531
- }
12532
12648
  scores.push({
12533
12649
  name: member.id,
12534
12650
  type: member.type,
@@ -12542,8 +12658,39 @@ var CompositeEvaluator = class {
12542
12658
  details: member.result.details,
12543
12659
  tokenUsage: member.result.tokenUsage
12544
12660
  });
12661
+ if (member.result.verdict === "skip") {
12662
+ continue;
12663
+ }
12664
+ evaluatedCount++;
12665
+ const isPassing = member.result.verdict === "pass" || member.result.verdict === "borderline";
12666
+ if (isPassing) {
12667
+ passingCount++;
12668
+ if (member.result.verdict === "borderline") {
12669
+ borderlineCount++;
12670
+ }
12671
+ }
12672
+ allHits.push(...member.result.hits.map((h) => `[${member.id}] ${h}`));
12673
+ allMisses.push(...member.result.misses.map((m) => `[${member.id}] ${m}`));
12674
+ if (member.result.reasoning) {
12675
+ reasoningParts.push(`${member.id}: ${member.result.reasoning}`);
12676
+ }
12677
+ }
12678
+ if (evaluatedCount === 0 && results.length > 0) {
12679
+ return {
12680
+ score: 0,
12681
+ verdict: "skip",
12682
+ hits: [],
12683
+ misses: [],
12684
+ expectedAspectCount: 1,
12685
+ reasoning: "All evaluators skipped (infrastructure failure)",
12686
+ evaluatorRawRequest: {
12687
+ aggregator: "threshold",
12688
+ threshold
12689
+ },
12690
+ scores
12691
+ };
12545
12692
  }
12546
- const totalCount = results.length;
12693
+ const totalCount = evaluatedCount;
12547
12694
  const score = totalCount > 0 ? passingCount / totalCount : 0;
12548
12695
  const pass = score >= threshold;
12549
12696
  if (pass && borderlineCount > 0) {
@@ -13051,115 +13198,115 @@ var FieldAccuracyEvaluator = class {
13051
13198
  * Evaluate a single field against the expected value.
13052
13199
  */
13053
13200
  evaluateField(fieldConfig, candidateData, expectedData) {
13054
- const { path: path43, match, required = true, weight = 1 } = fieldConfig;
13055
- const candidateValue = resolvePath(candidateData, path43);
13056
- const expectedValue = resolvePath(expectedData, path43);
13201
+ const { path: path44, match, required = true, weight = 1 } = fieldConfig;
13202
+ const candidateValue = resolvePath(candidateData, path44);
13203
+ const expectedValue = resolvePath(expectedData, path44);
13057
13204
  if (expectedValue === void 0) {
13058
13205
  return {
13059
- path: path43,
13206
+ path: path44,
13060
13207
  score: 1,
13061
13208
  // No expected value means no comparison needed
13062
13209
  weight,
13063
13210
  hit: true,
13064
- message: `${path43}: no expected value`
13211
+ message: `${path44}: no expected value`
13065
13212
  };
13066
13213
  }
13067
13214
  if (candidateValue === void 0) {
13068
13215
  if (required) {
13069
13216
  return {
13070
- path: path43,
13217
+ path: path44,
13071
13218
  score: 0,
13072
13219
  weight,
13073
13220
  hit: false,
13074
- message: `${path43} (required, missing)`
13221
+ message: `${path44} (required, missing)`
13075
13222
  };
13076
13223
  }
13077
13224
  return {
13078
- path: path43,
13225
+ path: path44,
13079
13226
  score: 1,
13080
13227
  // Don't penalize missing optional fields
13081
13228
  weight: 0,
13082
13229
  // Zero weight means it won't affect the score
13083
13230
  hit: true,
13084
- message: `${path43}: optional field missing`
13231
+ message: `${path44}: optional field missing`
13085
13232
  };
13086
13233
  }
13087
13234
  switch (match) {
13088
13235
  case "exact":
13089
- return this.compareExact(path43, candidateValue, expectedValue, weight);
13236
+ return this.compareExact(path44, candidateValue, expectedValue, weight);
13090
13237
  case "numeric_tolerance":
13091
13238
  return this.compareNumericTolerance(
13092
- path43,
13239
+ path44,
13093
13240
  candidateValue,
13094
13241
  expectedValue,
13095
13242
  fieldConfig,
13096
13243
  weight
13097
13244
  );
13098
13245
  case "date":
13099
- return this.compareDate(path43, candidateValue, expectedValue, fieldConfig, weight);
13246
+ return this.compareDate(path44, candidateValue, expectedValue, fieldConfig, weight);
13100
13247
  default:
13101
13248
  return {
13102
- path: path43,
13249
+ path: path44,
13103
13250
  score: 0,
13104
13251
  weight,
13105
13252
  hit: false,
13106
- message: `${path43}: unknown match type "${match}"`
13253
+ message: `${path44}: unknown match type "${match}"`
13107
13254
  };
13108
13255
  }
13109
13256
  }
13110
13257
  /**
13111
13258
  * Exact equality comparison.
13112
13259
  */
13113
- compareExact(path43, candidateValue, expectedValue, weight) {
13260
+ compareExact(path44, candidateValue, expectedValue, weight) {
13114
13261
  if (deepEqual(candidateValue, expectedValue)) {
13115
13262
  return {
13116
- path: path43,
13263
+ path: path44,
13117
13264
  score: 1,
13118
13265
  weight,
13119
13266
  hit: true,
13120
- message: path43
13267
+ message: path44
13121
13268
  };
13122
13269
  }
13123
13270
  if (typeof candidateValue !== typeof expectedValue) {
13124
13271
  return {
13125
- path: path43,
13272
+ path: path44,
13126
13273
  score: 0,
13127
13274
  weight,
13128
13275
  hit: false,
13129
- message: `${path43} (type mismatch: got ${typeof candidateValue}, expected ${typeof expectedValue})`
13276
+ message: `${path44} (type mismatch: got ${typeof candidateValue}, expected ${typeof expectedValue})`
13130
13277
  };
13131
13278
  }
13132
13279
  return {
13133
- path: path43,
13280
+ path: path44,
13134
13281
  score: 0,
13135
13282
  weight,
13136
13283
  hit: false,
13137
- message: `${path43} (value mismatch)`
13284
+ message: `${path44} (value mismatch)`
13138
13285
  };
13139
13286
  }
13140
13287
  /**
13141
13288
  * Numeric comparison with absolute or relative tolerance.
13142
13289
  */
13143
- compareNumericTolerance(path43, candidateValue, expectedValue, fieldConfig, weight) {
13290
+ compareNumericTolerance(path44, candidateValue, expectedValue, fieldConfig, weight) {
13144
13291
  const { tolerance = 0, relative = false } = fieldConfig;
13145
13292
  const candidateNum = toNumber2(candidateValue);
13146
13293
  const expectedNum = toNumber2(expectedValue);
13147
13294
  if (candidateNum === null || expectedNum === null) {
13148
13295
  return {
13149
- path: path43,
13296
+ path: path44,
13150
13297
  score: 0,
13151
13298
  weight,
13152
13299
  hit: false,
13153
- message: `${path43} (non-numeric value)`
13300
+ message: `${path44} (non-numeric value)`
13154
13301
  };
13155
13302
  }
13156
13303
  if (!Number.isFinite(candidateNum) || !Number.isFinite(expectedNum)) {
13157
13304
  return {
13158
- path: path43,
13305
+ path: path44,
13159
13306
  score: 0,
13160
13307
  weight,
13161
13308
  hit: false,
13162
- message: `${path43} (invalid numeric value)`
13309
+ message: `${path44} (invalid numeric value)`
13163
13310
  };
13164
13311
  }
13165
13312
  const diff = Math.abs(candidateNum - expectedNum);
@@ -13172,61 +13319,61 @@ var FieldAccuracyEvaluator = class {
13172
13319
  }
13173
13320
  if (withinTolerance) {
13174
13321
  return {
13175
- path: path43,
13322
+ path: path44,
13176
13323
  score: 1,
13177
13324
  weight,
13178
13325
  hit: true,
13179
- message: `${path43} (within tolerance: diff=${diff.toFixed(2)})`
13326
+ message: `${path44} (within tolerance: diff=${diff.toFixed(2)})`
13180
13327
  };
13181
13328
  }
13182
13329
  return {
13183
- path: path43,
13330
+ path: path44,
13184
13331
  score: 0,
13185
13332
  weight,
13186
13333
  hit: false,
13187
- message: `${path43} (outside tolerance: diff=${diff.toFixed(2)}, tolerance=${tolerance})`
13334
+ message: `${path44} (outside tolerance: diff=${diff.toFixed(2)}, tolerance=${tolerance})`
13188
13335
  };
13189
13336
  }
13190
13337
  /**
13191
13338
  * Date comparison with format normalization.
13192
13339
  */
13193
- compareDate(path43, candidateValue, expectedValue, fieldConfig, weight) {
13340
+ compareDate(path44, candidateValue, expectedValue, fieldConfig, weight) {
13194
13341
  const formats = fieldConfig.formats ?? DEFAULT_DATE_FORMATS;
13195
13342
  const candidateDate = parseDate(String(candidateValue), formats);
13196
13343
  const expectedDate = parseDate(String(expectedValue), formats);
13197
13344
  if (candidateDate === null) {
13198
13345
  return {
13199
- path: path43,
13346
+ path: path44,
13200
13347
  score: 0,
13201
13348
  weight,
13202
13349
  hit: false,
13203
- message: `${path43} (unparseable candidate date)`
13350
+ message: `${path44} (unparseable candidate date)`
13204
13351
  };
13205
13352
  }
13206
13353
  if (expectedDate === null) {
13207
13354
  return {
13208
- path: path43,
13355
+ path: path44,
13209
13356
  score: 0,
13210
13357
  weight,
13211
13358
  hit: false,
13212
- message: `${path43} (unparseable expected date)`
13359
+ message: `${path44} (unparseable expected date)`
13213
13360
  };
13214
13361
  }
13215
13362
  if (candidateDate.getFullYear() === expectedDate.getFullYear() && candidateDate.getMonth() === expectedDate.getMonth() && candidateDate.getDate() === expectedDate.getDate()) {
13216
13363
  return {
13217
- path: path43,
13364
+ path: path44,
13218
13365
  score: 1,
13219
13366
  weight,
13220
13367
  hit: true,
13221
- message: path43
13368
+ message: path44
13222
13369
  };
13223
13370
  }
13224
13371
  return {
13225
- path: path43,
13372
+ path: path44,
13226
13373
  score: 0,
13227
13374
  weight,
13228
13375
  hit: false,
13229
- message: `${path43} (date mismatch: got ${formatDateISO(candidateDate)}, expected ${formatDateISO(expectedDate)})`
13376
+ message: `${path44} (date mismatch: got ${formatDateISO(candidateDate)}, expected ${formatDateISO(expectedDate)})`
13230
13377
  };
13231
13378
  }
13232
13379
  /**
@@ -13267,11 +13414,11 @@ var FieldAccuracyEvaluator = class {
13267
13414
  };
13268
13415
  }
13269
13416
  };
13270
- function resolvePath(obj, path43) {
13271
- if (!path43 || !obj) {
13417
+ function resolvePath(obj, path44) {
13418
+ if (!path44 || !obj) {
13272
13419
  return void 0;
13273
13420
  }
13274
- const parts = path43.split(/\.|\[|\]/).filter((p) => p.length > 0);
13421
+ const parts = path44.split(/\.|\[|\]/).filter((p) => p.length > 0);
13275
13422
  let current = obj;
13276
13423
  for (const part of parts) {
13277
13424
  if (current === null || current === void 0) {
@@ -14089,8 +14236,8 @@ var TokenUsageEvaluator = class {
14089
14236
  };
14090
14237
 
14091
14238
  // src/evaluation/evaluators/tool-trajectory.ts
14092
- function getNestedValue(obj, path43) {
14093
- const parts = path43.split(".");
14239
+ function getNestedValue(obj, path44) {
14240
+ const parts = path44.split(".");
14094
14241
  let current = obj;
14095
14242
  for (const part of parts) {
14096
14243
  if (current === null || current === void 0 || typeof current !== "object") {
@@ -14651,9 +14798,9 @@ function runEqualsAssertion(output, value) {
14651
14798
  }
14652
14799
 
14653
14800
  // src/evaluation/orchestrator.ts
14654
- var import_node_crypto9 = require("crypto");
14655
- var import_promises29 = require("fs/promises");
14656
- var import_node_path41 = __toESM(require("path"), 1);
14801
+ var import_node_crypto10 = require("crypto");
14802
+ var import_promises30 = require("fs/promises");
14803
+ var import_node_path42 = __toESM(require("path"), 1);
14657
14804
  var import_micromatch4 = __toESM(require("micromatch"), 1);
14658
14805
 
14659
14806
  // ../../node_modules/.bun/yocto-queue@1.2.2/node_modules/yocto-queue/index.js
@@ -15523,7 +15670,7 @@ async function cleanupEvalWorkspaces(evalRunId, workspaceRoot) {
15523
15670
  }
15524
15671
  }
15525
15672
 
15526
- // src/evaluation/workspace/repo-manager.ts
15673
+ // src/evaluation/workspace/pool-manager.ts
15527
15674
  var import_node_child_process7 = require("child_process");
15528
15675
  var import_node_crypto8 = require("crypto");
15529
15676
  var import_node_fs11 = require("fs");
@@ -15531,8 +15678,6 @@ var import_promises27 = require("fs/promises");
15531
15678
  var import_node_path39 = __toESM(require("path"), 1);
15532
15679
  var import_node_util5 = require("util");
15533
15680
  var execFileAsync = (0, import_node_util5.promisify)(import_node_child_process7.execFile);
15534
- var DEFAULT_TIMEOUT_MS2 = 3e5;
15535
- var LOCK_TIMEOUT_MS = 6e4;
15536
15681
  function gitEnv() {
15537
15682
  const env = { ...process.env };
15538
15683
  for (const key of Object.keys(env)) {
@@ -15547,75 +15692,339 @@ function gitEnv() {
15547
15692
  GIT_SSH_COMMAND: "ssh -o BatchMode=yes"
15548
15693
  };
15549
15694
  }
15550
- function cacheKey(source) {
15551
- const raw = source.type === "git" ? source.url.toLowerCase().replace(/\.git$/, "") : source.path;
15552
- return (0, import_node_crypto8.createHash)("sha256").update(raw).digest("hex");
15553
- }
15554
- function getSourceUrl(source) {
15555
- return source.type === "git" ? source.url : source.path;
15556
- }
15557
15695
  async function git(args, opts) {
15558
15696
  const { stdout } = await execFileAsync("git", args, {
15559
15697
  cwd: opts?.cwd,
15560
- timeout: opts?.timeout ?? DEFAULT_TIMEOUT_MS2,
15698
+ timeout: opts?.timeout ?? 3e5,
15561
15699
  env: gitEnv(),
15562
15700
  maxBuffer: 50 * 1024 * 1024
15563
- // 50MB
15564
15701
  });
15565
15702
  return stdout.trim();
15566
15703
  }
15567
- async function acquireLock(lockPath) {
15568
- const start = Date.now();
15569
- while (Date.now() - start < LOCK_TIMEOUT_MS) {
15570
- try {
15571
- await (0, import_promises27.writeFile)(lockPath, String(process.pid), { flag: "wx" });
15572
- return;
15573
- } catch (err) {
15574
- if (err.code === "EEXIST") {
15575
- await new Promise((r) => setTimeout(r, 200));
15704
+ function normalizeRepoForFingerprint(repo) {
15705
+ const source = repo.source.type === "git" ? { type: "git", url: repo.source.url.toLowerCase().replace(/\.git$/, "") } : { type: "local", path: repo.source.path };
15706
+ const result = {
15707
+ path: repo.path,
15708
+ source,
15709
+ ref: repo.checkout?.ref ?? "HEAD"
15710
+ };
15711
+ if (repo.clone?.depth !== void 0) {
15712
+ result.depth = repo.clone.depth;
15713
+ }
15714
+ if (repo.clone?.filter !== void 0) {
15715
+ result.filter = repo.clone.filter;
15716
+ }
15717
+ if (repo.clone?.sparse?.length) {
15718
+ result.sparse = [...repo.clone.sparse].sort();
15719
+ }
15720
+ return result;
15721
+ }
15722
+ function computeWorkspaceFingerprint(templatePath, repos) {
15723
+ const canonical = {
15724
+ templatePath: templatePath ?? null,
15725
+ repos: [...repos].sort((a, b) => a.path.localeCompare(b.path)).map(normalizeRepoForFingerprint)
15726
+ };
15727
+ return (0, import_node_crypto8.createHash)("sha256").update(JSON.stringify(canonical)).digest("hex");
15728
+ }
15729
+ async function copyDirectoryRecursive2(src, dest, skipDirs) {
15730
+ await (0, import_promises27.mkdir)(dest, { recursive: true });
15731
+ const entries = await (0, import_promises27.readdir)(src, { withFileTypes: true });
15732
+ for (const entry of entries) {
15733
+ const srcPath = import_node_path39.default.join(src, entry.name);
15734
+ const destPath = import_node_path39.default.join(dest, entry.name);
15735
+ if (entry.name === ".git") {
15736
+ continue;
15737
+ }
15738
+ if (entry.isDirectory()) {
15739
+ if (skipDirs?.has(entry.name)) {
15576
15740
  continue;
15577
15741
  }
15578
- throw err;
15742
+ await copyDirectoryRecursive2(srcPath, destPath, skipDirs);
15743
+ } else {
15744
+ await (0, import_promises27.cp)(srcPath, destPath, { preserveTimestamps: true, force: true });
15579
15745
  }
15580
15746
  }
15581
- throw new Error(`Timed out waiting for lock: ${lockPath}`);
15582
- }
15583
- async function releaseLock(lockPath) {
15584
- try {
15585
- await (0, import_promises27.unlink)(lockPath);
15586
- } catch {
15587
- }
15588
15747
  }
15589
- var RepoManager = class {
15590
- cacheDir;
15591
- verbose;
15592
- constructor(cacheDir, verbose = false) {
15593
- this.cacheDir = cacheDir ?? getGitCacheRoot();
15594
- this.verbose = verbose;
15748
+ var WorkspacePoolManager = class {
15749
+ poolRoot;
15750
+ constructor(poolRoot) {
15751
+ this.poolRoot = poolRoot ?? getWorkspacePoolRoot();
15595
15752
  }
15596
- async runGit(args, opts) {
15597
- const startedAt = Date.now();
15598
- if (this.verbose) {
15599
- console.log(
15600
- `[repo] git start cwd=${opts?.cwd ?? process.cwd()} args=${args.join(" ")}`
15753
+ /**
15754
+ * Acquire a workspace slot from the pool.
15755
+ *
15756
+ * 1. Compute fingerprint from template + repos
15757
+ * 2. Check drift (compare stored metadata.json fingerprint vs computed)
15758
+ * 3. If drift: warn, remove all slots, rematerialize
15759
+ * 4. Acquire a slot (try-lock slot-0, slot-1, ..., up to maxSlots)
15760
+ * 5. If slot exists: reset repos, re-copy template files (skip repo directories)
15761
+ * 6. If new slot: copy template, materialize all repos, write metadata.json
15762
+ * 7. Return the slot (with path, index, isExisting)
15763
+ */
15764
+ async acquireWorkspace(options) {
15765
+ const { templatePath, repos, maxSlots, repoManager } = options;
15766
+ const fingerprint = computeWorkspaceFingerprint(templatePath, repos);
15767
+ const poolDir = import_node_path39.default.join(this.poolRoot, fingerprint);
15768
+ await (0, import_promises27.mkdir)(poolDir, { recursive: true });
15769
+ const drifted = await this.checkDrift(poolDir, fingerprint);
15770
+ if (drifted) {
15771
+ console.warn(
15772
+ `[workspace-pool] Drift detected for fingerprint ${fingerprint.slice(0, 12)}... Removing stale slots.`
15601
15773
  );
15774
+ await this.removeAllSlots(poolDir);
15602
15775
  }
15603
- try {
15604
- const output = await git(args, opts);
15605
- if (this.verbose) {
15606
- console.log(
15607
- `[repo] git ok durationMs=${Date.now() - startedAt} args=${args.join(" ")}`
15608
- );
15776
+ for (let i = 0; i < maxSlots; i++) {
15777
+ const slotPath = import_node_path39.default.join(poolDir, `slot-${i}`);
15778
+ const lockPath = `${slotPath}.lock`;
15779
+ const locked = await this.tryLock(lockPath);
15780
+ if (!locked) {
15781
+ continue;
15609
15782
  }
15610
- return output;
15611
- } catch (error) {
15612
- if (this.verbose) {
15613
- const message = error instanceof Error ? error.message : String(error);
15614
- console.log(
15615
- `[repo] git fail durationMs=${Date.now() - startedAt} args=${args.join(" ")} error=${message}`
15616
- );
15783
+ const slotExists = (0, import_node_fs11.existsSync)(slotPath);
15784
+ if (slotExists) {
15785
+ await this.resetSlot(slotPath, templatePath, repos);
15786
+ return {
15787
+ index: i,
15788
+ path: slotPath,
15789
+ isExisting: true,
15790
+ lockPath,
15791
+ fingerprint,
15792
+ poolDir
15793
+ };
15617
15794
  }
15618
- throw error;
15795
+ await (0, import_promises27.mkdir)(slotPath, { recursive: true });
15796
+ if (templatePath) {
15797
+ await copyDirectoryRecursive2(templatePath, slotPath);
15798
+ }
15799
+ if (repos.length > 0) {
15800
+ await repoManager.materializeAll(repos, slotPath);
15801
+ }
15802
+ await this.writeMetadata(poolDir, fingerprint, templatePath ?? null, repos);
15803
+ return {
15804
+ index: i,
15805
+ path: slotPath,
15806
+ isExisting: false,
15807
+ lockPath,
15808
+ fingerprint,
15809
+ poolDir
15810
+ };
15811
+ }
15812
+ throw new Error(
15813
+ `All ${maxSlots} pool slots are locked for fingerprint ${fingerprint.slice(0, 12)}...`
15814
+ );
15815
+ }
15816
+ /** Remove lock file to release a slot. */
15817
+ async releaseSlot(slot) {
15818
+ try {
15819
+ await (0, import_promises27.unlink)(slot.lockPath);
15820
+ } catch {
15821
+ }
15822
+ }
15823
+ /**
15824
+ * Try to acquire a PID-based lock file.
15825
+ * On EEXIST, read PID and check if process is alive. If dead, stale lock — remove and retry.
15826
+ * Returns true if lock acquired, false if slot is actively locked.
15827
+ * Uses a bounded loop (max 3 attempts) to avoid unbounded recursion.
15828
+ */
15829
+ async tryLock(lockPath) {
15830
+ for (let attempt = 0; attempt < 3; attempt++) {
15831
+ try {
15832
+ await (0, import_promises27.writeFile)(lockPath, String(process.pid), { flag: "wx" });
15833
+ return true;
15834
+ } catch (err) {
15835
+ if (err.code !== "EEXIST") {
15836
+ throw err;
15837
+ }
15838
+ try {
15839
+ const pidStr = await (0, import_promises27.readFile)(lockPath, "utf-8");
15840
+ const pid = Number.parseInt(pidStr.trim(), 10);
15841
+ if (!Number.isNaN(pid)) {
15842
+ try {
15843
+ process.kill(pid, 0);
15844
+ return false;
15845
+ } catch {
15846
+ await (0, import_promises27.unlink)(lockPath).catch(() => {
15847
+ });
15848
+ continue;
15849
+ }
15850
+ }
15851
+ } catch {
15852
+ }
15853
+ return false;
15854
+ }
15855
+ }
15856
+ return false;
15857
+ }
15858
+ /**
15859
+ * Check if the stored fingerprint in metadata.json differs from the computed one.
15860
+ * Returns true if drifted, false otherwise.
15861
+ * Returns false (no drift) if metadata.json doesn't exist (first use).
15862
+ */
15863
+ async checkDrift(poolDir, fingerprint) {
15864
+ const metadataPath = import_node_path39.default.join(poolDir, "metadata.json");
15865
+ try {
15866
+ const raw = await (0, import_promises27.readFile)(metadataPath, "utf-8");
15867
+ const metadata = JSON.parse(raw);
15868
+ return metadata.fingerprint !== fingerprint;
15869
+ } catch {
15870
+ return false;
15871
+ }
15872
+ }
15873
+ /** Write metadata.json with fingerprint, inputs, and timestamp. */
15874
+ async writeMetadata(poolDir, fingerprint, templatePath, repos) {
15875
+ const metadata = {
15876
+ fingerprint,
15877
+ templatePath,
15878
+ repos,
15879
+ createdAt: (/* @__PURE__ */ new Date()).toISOString()
15880
+ };
15881
+ await (0, import_promises27.writeFile)(import_node_path39.default.join(poolDir, "metadata.json"), JSON.stringify(metadata, null, 2));
15882
+ }
15883
+ /** Remove all slot directories and their lock files from a pool directory. */
15884
+ async removeAllSlots(poolDir) {
15885
+ const entries = await (0, import_promises27.readdir)(poolDir);
15886
+ for (const entry of entries) {
15887
+ if (entry.startsWith("slot-") && !entry.endsWith(".lock")) {
15888
+ const lockPath = import_node_path39.default.join(poolDir, `${entry}.lock`);
15889
+ if ((0, import_node_fs11.existsSync)(lockPath)) {
15890
+ try {
15891
+ const pidStr = await (0, import_promises27.readFile)(lockPath, "utf-8");
15892
+ const pid = Number.parseInt(pidStr.trim(), 10);
15893
+ if (!Number.isNaN(pid)) {
15894
+ try {
15895
+ process.kill(pid, 0);
15896
+ console.warn(`[workspace-pool] Skipping slot ${entry}: locked by PID ${pid}`);
15897
+ continue;
15898
+ } catch {
15899
+ }
15900
+ }
15901
+ } catch {
15902
+ }
15903
+ }
15904
+ await (0, import_promises27.rm)(import_node_path39.default.join(poolDir, entry), { recursive: true, force: true });
15905
+ await (0, import_promises27.rm)(lockPath, { force: true }).catch(() => {
15906
+ });
15907
+ }
15908
+ }
15909
+ await (0, import_promises27.rm)(import_node_path39.default.join(poolDir, "metadata.json"), { force: true }).catch(() => {
15910
+ });
15911
+ }
15912
+ /**
15913
+ * Reset an existing slot for reuse:
15914
+ * 1. Reset repos (git reset --hard {ref} && git clean -fd per repo)
15915
+ * 2. Re-copy template files (skip repo directories)
15916
+ */
15917
+ async resetSlot(slotPath, templatePath, repos) {
15918
+ for (const repo of repos) {
15919
+ const repoDir = import_node_path39.default.join(slotPath, repo.path);
15920
+ if (!(0, import_node_fs11.existsSync)(repoDir)) {
15921
+ continue;
15922
+ }
15923
+ const ref = repo.checkout?.ref ?? "HEAD";
15924
+ await git(["reset", "--hard", ref], { cwd: repoDir });
15925
+ await git(["clean", "-fd"], { cwd: repoDir });
15926
+ }
15927
+ if (templatePath) {
15928
+ const repoDirNames = new Set(
15929
+ repos.map((r) => {
15930
+ const normalized = r.path.replace(/^\.\//, "");
15931
+ return normalized.split("/")[0];
15932
+ })
15933
+ );
15934
+ await copyDirectoryRecursive2(templatePath, slotPath, repoDirNames);
15935
+ }
15936
+ }
15937
+ };
15938
+
15939
+ // src/evaluation/workspace/repo-manager.ts
15940
+ var import_node_child_process8 = require("child_process");
15941
+ var import_node_crypto9 = require("crypto");
15942
+ var import_node_fs12 = require("fs");
15943
+ var import_promises28 = require("fs/promises");
15944
+ var import_node_path40 = __toESM(require("path"), 1);
15945
+ var import_node_util6 = require("util");
15946
+ var execFileAsync2 = (0, import_node_util6.promisify)(import_node_child_process8.execFile);
15947
+ var DEFAULT_TIMEOUT_MS2 = 3e5;
15948
+ var LOCK_TIMEOUT_MS = 6e4;
15949
+ function gitEnv2() {
15950
+ const env = { ...process.env };
15951
+ for (const key of Object.keys(env)) {
15952
+ if (key.startsWith("GIT_") && key !== "GIT_SSH_COMMAND") {
15953
+ delete env[key];
15954
+ }
15955
+ }
15956
+ return {
15957
+ ...env,
15958
+ GIT_TERMINAL_PROMPT: "0",
15959
+ GIT_ASKPASS: "",
15960
+ GIT_SSH_COMMAND: "ssh -o BatchMode=yes"
15961
+ };
15962
+ }
15963
+ function cacheKey(source) {
15964
+ const raw = source.type === "git" ? source.url.toLowerCase().replace(/\.git$/, "") : source.path;
15965
+ return (0, import_node_crypto9.createHash)("sha256").update(raw).digest("hex");
15966
+ }
15967
+ function getSourceUrl(source) {
15968
+ return source.type === "git" ? source.url : source.path;
15969
+ }
15970
+ async function git2(args, opts) {
15971
+ const { stdout } = await execFileAsync2("git", args, {
15972
+ cwd: opts?.cwd,
15973
+ timeout: opts?.timeout ?? DEFAULT_TIMEOUT_MS2,
15974
+ env: gitEnv2(),
15975
+ maxBuffer: 50 * 1024 * 1024
15976
+ // 50MB
15977
+ });
15978
+ return stdout.trim();
15979
+ }
15980
+ async function acquireLock(lockPath) {
15981
+ const start = Date.now();
15982
+ while (Date.now() - start < LOCK_TIMEOUT_MS) {
15983
+ try {
15984
+ await (0, import_promises28.writeFile)(lockPath, String(process.pid), { flag: "wx" });
15985
+ return;
15986
+ } catch (err) {
15987
+ if (err.code === "EEXIST") {
15988
+ await new Promise((r) => setTimeout(r, 200));
15989
+ continue;
15990
+ }
15991
+ throw err;
15992
+ }
15993
+ }
15994
+ throw new Error(`Timed out waiting for lock: ${lockPath}`);
15995
+ }
15996
+ async function releaseLock(lockPath) {
15997
+ try {
15998
+ await (0, import_promises28.unlink)(lockPath);
15999
+ } catch {
16000
+ }
16001
+ }
16002
+ var RepoManager = class {
16003
+ cacheDir;
16004
+ verbose;
16005
+ constructor(cacheDir, verbose = false) {
16006
+ this.cacheDir = cacheDir ?? getGitCacheRoot();
16007
+ this.verbose = verbose;
16008
+ }
16009
+ async runGit(args, opts) {
16010
+ const startedAt = Date.now();
16011
+ if (this.verbose) {
16012
+ console.log(`[repo] git start cwd=${opts?.cwd ?? process.cwd()} args=${args.join(" ")}`);
16013
+ }
16014
+ try {
16015
+ const output = await git2(args, opts);
16016
+ if (this.verbose) {
16017
+ console.log(`[repo] git ok durationMs=${Date.now() - startedAt} args=${args.join(" ")}`);
16018
+ }
16019
+ return output;
16020
+ } catch (error) {
16021
+ if (this.verbose) {
16022
+ const message = error instanceof Error ? error.message : String(error);
16023
+ console.log(
16024
+ `[repo] git fail durationMs=${Date.now() - startedAt} args=${args.join(" ")} error=${message}`
16025
+ );
16026
+ }
16027
+ throw error;
15619
16028
  }
15620
16029
  }
15621
16030
  /**
@@ -15625,9 +16034,9 @@ var RepoManager = class {
15625
16034
  */
15626
16035
  async ensureCache(source, depth, resolve) {
15627
16036
  const key = cacheKey(source);
15628
- const cachePath = import_node_path39.default.join(this.cacheDir, key);
16037
+ const cachePath = import_node_path40.default.join(this.cacheDir, key);
15629
16038
  const lockPath = `${cachePath}.lock`;
15630
- const cacheExists = (0, import_node_fs11.existsSync)(import_node_path39.default.join(cachePath, "HEAD"));
16039
+ const cacheExists = (0, import_node_fs12.existsSync)(import_node_path40.default.join(cachePath, "HEAD"));
15631
16040
  if (this.verbose) {
15632
16041
  console.log(
15633
16042
  `[repo] ensureCache source=${getSourceUrl(source)} resolve=${resolve ?? "remote"} cache=${cachePath}`
@@ -15645,13 +16054,11 @@ var RepoManager = class {
15645
16054
  `No cache found for \`${url}\`. Run \`agentv cache add --url ${url} --from <local-path>\` to seed it.`
15646
16055
  );
15647
16056
  }
15648
- await (0, import_promises27.mkdir)(this.cacheDir, { recursive: true });
16057
+ await (0, import_promises28.mkdir)(this.cacheDir, { recursive: true });
15649
16058
  const lockStartedAt = Date.now();
15650
16059
  await acquireLock(lockPath);
15651
16060
  if (this.verbose) {
15652
- console.log(
15653
- `[repo] lock acquired path=${lockPath} waitedMs=${Date.now() - lockStartedAt}`
15654
- );
16061
+ console.log(`[repo] lock acquired path=${lockPath} waitedMs=${Date.now() - lockStartedAt}`);
15655
16062
  }
15656
16063
  try {
15657
16064
  if (cacheExists) {
@@ -15689,7 +16096,7 @@ var RepoManager = class {
15689
16096
  * Handles checkout, ref resolution, ancestor walking, shallow clone, sparse checkout.
15690
16097
  */
15691
16098
  async materialize(repo, workspacePath) {
15692
- const targetDir = import_node_path39.default.join(workspacePath, repo.path);
16099
+ const targetDir = import_node_path40.default.join(workspacePath, repo.path);
15693
16100
  const startedAt = Date.now();
15694
16101
  if (this.verbose) {
15695
16102
  console.log(
@@ -15784,14 +16191,14 @@ var RepoManager = class {
15784
16191
  async reset(repos, workspacePath, strategy) {
15785
16192
  if (strategy === "recreate") {
15786
16193
  for (const repo of repos) {
15787
- const targetDir = import_node_path39.default.join(workspacePath, repo.path);
15788
- await (0, import_promises27.rm)(targetDir, { recursive: true, force: true });
16194
+ const targetDir = import_node_path40.default.join(workspacePath, repo.path);
16195
+ await (0, import_promises28.rm)(targetDir, { recursive: true, force: true });
15789
16196
  }
15790
16197
  await this.materializeAll(repos, workspacePath);
15791
16198
  return;
15792
16199
  }
15793
16200
  for (const repo of repos) {
15794
- const targetDir = import_node_path39.default.join(workspacePath, repo.path);
16201
+ const targetDir = import_node_path40.default.join(workspacePath, repo.path);
15795
16202
  await this.runGit(["reset", "--hard", "HEAD"], { cwd: targetDir });
15796
16203
  await this.runGit(["clean", "-fd"], { cwd: targetDir });
15797
16204
  }
@@ -15803,21 +16210,21 @@ var RepoManager = class {
15803
16210
  async seedCache(localPath, remoteUrl, opts) {
15804
16211
  const source = { type: "git", url: remoteUrl };
15805
16212
  const key = cacheKey(source);
15806
- const cachePath = import_node_path39.default.join(this.cacheDir, key);
16213
+ const cachePath = import_node_path40.default.join(this.cacheDir, key);
15807
16214
  const lockPath = `${cachePath}.lock`;
15808
- await (0, import_promises27.mkdir)(this.cacheDir, { recursive: true });
16215
+ await (0, import_promises28.mkdir)(this.cacheDir, { recursive: true });
15809
16216
  await acquireLock(lockPath);
15810
16217
  try {
15811
- if ((0, import_node_fs11.existsSync)(import_node_path39.default.join(cachePath, "HEAD"))) {
16218
+ if ((0, import_node_fs12.existsSync)(import_node_path40.default.join(cachePath, "HEAD"))) {
15812
16219
  if (!opts?.force) {
15813
16220
  throw new Error(
15814
16221
  `Cache already exists for ${remoteUrl} at ${cachePath}. Use force to overwrite.`
15815
16222
  );
15816
16223
  }
15817
- await (0, import_promises27.rm)(cachePath, { recursive: true, force: true });
16224
+ await (0, import_promises28.rm)(cachePath, { recursive: true, force: true });
15818
16225
  }
15819
- await git(["clone", "--mirror", "--bare", localPath, cachePath]);
15820
- await git(["remote", "set-url", "origin", remoteUrl], { cwd: cachePath });
16226
+ await git2(["clone", "--mirror", "--bare", localPath, cachePath]);
16227
+ await git2(["remote", "set-url", "origin", remoteUrl], { cwd: cachePath });
15821
16228
  } finally {
15822
16229
  await releaseLock(lockPath);
15823
16230
  }
@@ -15825,41 +16232,41 @@ var RepoManager = class {
15825
16232
  }
15826
16233
  /** Remove the entire cache directory. */
15827
16234
  async cleanCache() {
15828
- await (0, import_promises27.rm)(this.cacheDir, { recursive: true, force: true });
16235
+ await (0, import_promises28.rm)(this.cacheDir, { recursive: true, force: true });
15829
16236
  }
15830
16237
  };
15831
16238
 
15832
16239
  // src/evaluation/workspace/resolve.ts
15833
- var import_promises28 = require("fs/promises");
15834
- var import_node_path40 = __toESM(require("path"), 1);
16240
+ var import_promises29 = require("fs/promises");
16241
+ var import_node_path41 = __toESM(require("path"), 1);
15835
16242
  async function resolveWorkspaceTemplate(templatePath) {
15836
16243
  if (!templatePath) {
15837
16244
  return void 0;
15838
16245
  }
15839
- const resolved = import_node_path40.default.resolve(templatePath);
15840
- const stats = await (0, import_promises28.stat)(resolved);
16246
+ const resolved = import_node_path41.default.resolve(templatePath);
16247
+ const stats = await (0, import_promises29.stat)(resolved);
15841
16248
  if (stats.isFile()) {
15842
16249
  return {
15843
- dir: import_node_path40.default.dirname(resolved),
16250
+ dir: import_node_path41.default.dirname(resolved),
15844
16251
  workspaceFile: resolved
15845
16252
  };
15846
16253
  }
15847
16254
  if (!stats.isDirectory()) {
15848
16255
  throw new Error(`workspace template is neither a file nor a directory: ${resolved}`);
15849
16256
  }
15850
- const entries = await (0, import_promises28.readdir)(resolved);
16257
+ const entries = await (0, import_promises29.readdir)(resolved);
15851
16258
  const workspaceFiles = entries.filter((e) => e.endsWith(".code-workspace"));
15852
16259
  if (workspaceFiles.length === 1) {
15853
16260
  return {
15854
16261
  dir: resolved,
15855
- workspaceFile: import_node_path40.default.join(resolved, workspaceFiles[0])
16262
+ workspaceFile: import_node_path41.default.join(resolved, workspaceFiles[0])
15856
16263
  };
15857
16264
  }
15858
16265
  if (workspaceFiles.length > 1) {
15859
16266
  const conventionFile = workspaceFiles.find((f) => f === "template.code-workspace");
15860
16267
  return {
15861
16268
  dir: resolved,
15862
- workspaceFile: conventionFile ? import_node_path40.default.join(resolved, conventionFile) : void 0
16269
+ workspaceFile: conventionFile ? import_node_path41.default.join(resolved, conventionFile) : void 0
15863
16270
  };
15864
16271
  }
15865
16272
  return { dir: resolved };
@@ -15941,7 +16348,10 @@ async function runEvaluation(options) {
15941
16348
  trials,
15942
16349
  streamCallbacks,
15943
16350
  totalBudgetUsd,
15944
- failOnError
16351
+ failOnError,
16352
+ poolWorkspaces,
16353
+ poolMaxSlots: configPoolMaxSlots,
16354
+ workspace: userWorkspacePath
15945
16355
  } = options;
15946
16356
  let useCache = options.useCache;
15947
16357
  if (trials && trials.count > 1 && useCache) {
@@ -15950,7 +16360,7 @@ async function runEvaluation(options) {
15950
16360
  );
15951
16361
  useCache = false;
15952
16362
  }
15953
- const evalRunId = (0, import_node_crypto9.randomUUID)();
16363
+ const evalRunId = (0, import_node_crypto10.randomUUID)();
15954
16364
  const evalCases = preloadedEvalCases ?? await loadTests(evalFilePath, repoRoot, { verbose, filter });
15955
16365
  const filteredEvalCases = filterEvalCases(evalCases, filter);
15956
16366
  if (filteredEvalCases.length === 0) {
@@ -16015,7 +16425,7 @@ async function runEvaluation(options) {
16015
16425
  ];
16016
16426
  const evaluatorRegistry = buildEvaluatorRegistry(evaluators, resolveJudgeProvider);
16017
16427
  const typeRegistry = createBuiltinRegistry();
16018
- const discoveryBaseDir = evalFilePath ? import_node_path41.default.dirname(import_node_path41.default.resolve(evalFilePath)) : process.cwd();
16428
+ const discoveryBaseDir = evalFilePath ? import_node_path42.default.dirname(import_node_path42.default.resolve(evalFilePath)) : process.cwd();
16019
16429
  const evalDir = discoveryBaseDir;
16020
16430
  await discoverAssertions(typeRegistry, discoveryBaseDir);
16021
16431
  const providerRegistry = createBuiltinProviderRegistry();
@@ -16077,13 +16487,19 @@ async function runEvaluation(options) {
16077
16487
  }
16078
16488
  };
16079
16489
  const isPerTestIsolation = suiteWorkspace?.isolation === "per_test";
16080
- const hasSharedWorkspace = !!(workspaceTemplate || suiteWorkspace?.before_all || suiteWorkspace?.repos?.length && !isPerTestIsolation);
16490
+ if (userWorkspacePath && isPerTestIsolation) {
16491
+ throw new Error(
16492
+ "--workspace is incompatible with isolation: per_test. Use isolation: shared (default)."
16493
+ );
16494
+ }
16495
+ const hasSharedWorkspace = !!(userWorkspacePath || workspaceTemplate || suiteWorkspace?.before_all || suiteWorkspace?.repos?.length && !isPerTestIsolation);
16496
+ const usePool = poolWorkspaces === true && !!suiteWorkspace?.repos?.length && !isPerTestIsolation && !userWorkspacePath;
16081
16497
  const requestedWorkers = options.maxConcurrency ?? target.workers ?? 1;
16082
- const workers = hasSharedWorkspace ? 1 : requestedWorkers;
16498
+ const workers = hasSharedWorkspace && !usePool ? 1 : requestedWorkers;
16083
16499
  setupLog(
16084
- `sharedWorkspace=${hasSharedWorkspace} perTestIsolation=${isPerTestIsolation} requestedWorkers=${requestedWorkers} effectiveWorkers=${workers}`
16500
+ `sharedWorkspace=${hasSharedWorkspace} perTestIsolation=${isPerTestIsolation} usePool=${usePool} requestedWorkers=${requestedWorkers} effectiveWorkers=${workers}`
16085
16501
  );
16086
- if (hasSharedWorkspace && requestedWorkers > 1) {
16502
+ if (hasSharedWorkspace && !usePool && requestedWorkers > 1) {
16087
16503
  console.warn(
16088
16504
  `Warning: Shared workspace requires sequential execution. Overriding workers from ${requestedWorkers} to 1.`
16089
16505
  );
@@ -16092,7 +16508,37 @@ async function runEvaluation(options) {
16092
16508
  let sharedWorkspacePath;
16093
16509
  let sharedBaselineCommit;
16094
16510
  let beforeAllOutput;
16095
- if (workspaceTemplate) {
16511
+ let poolManager;
16512
+ let poolSlot;
16513
+ const poolSlots = [];
16514
+ const availablePoolSlots = [];
16515
+ const poolSlotBaselines = /* @__PURE__ */ new Map();
16516
+ const poolMaxSlots = Math.min(configPoolMaxSlots ?? 10, 50);
16517
+ if (userWorkspacePath) {
16518
+ sharedWorkspacePath = userWorkspacePath;
16519
+ setupLog(`using user-provided workspace: ${userWorkspacePath}`);
16520
+ } else if (usePool && suiteWorkspace?.repos) {
16521
+ const slotsNeeded = workers;
16522
+ setupLog(`acquiring ${slotsNeeded} workspace pool slot(s) (pool capacity: ${poolMaxSlots})`);
16523
+ poolManager = new WorkspacePoolManager(getWorkspacePoolRoot());
16524
+ const poolRepoManager = new RepoManager(void 0, verbose);
16525
+ for (let i = 0; i < slotsNeeded; i++) {
16526
+ const slot = await poolManager.acquireWorkspace({
16527
+ templatePath: workspaceTemplate,
16528
+ repos: suiteWorkspace.repos,
16529
+ maxSlots: poolMaxSlots,
16530
+ repoManager: poolRepoManager
16531
+ });
16532
+ poolSlots.push(slot);
16533
+ setupLog(`pool slot ${i} acquired at: ${slot.path} (existing=${slot.isExisting})`);
16534
+ }
16535
+ if (slotsNeeded === 1) {
16536
+ poolSlot = poolSlots[0];
16537
+ sharedWorkspacePath = poolSlot.path;
16538
+ } else {
16539
+ availablePoolSlots.push(...poolSlots);
16540
+ }
16541
+ } else if (workspaceTemplate) {
16096
16542
  setupLog(`creating shared workspace from template: ${workspaceTemplate}`);
16097
16543
  try {
16098
16544
  sharedWorkspacePath = await createTempWorkspace(workspaceTemplate, evalRunId, "shared");
@@ -16101,288 +16547,344 @@ async function runEvaluation(options) {
16101
16547
  const message = error instanceof Error ? error.message : String(error);
16102
16548
  throw new Error(`Failed to create shared workspace: ${message}`);
16103
16549
  }
16104
- if (suiteWorkspaceFile && sharedWorkspacePath) {
16105
- const copiedWorkspaceFile = import_node_path41.default.join(sharedWorkspacePath, import_node_path41.default.basename(suiteWorkspaceFile));
16106
- try {
16107
- await (0, import_promises29.stat)(copiedWorkspaceFile);
16108
- suiteWorkspaceFile = copiedWorkspaceFile;
16109
- } catch {
16110
- }
16111
- }
16112
16550
  } else if (suiteWorkspace?.before_all || suiteWorkspace?.repos?.length && !isPerTestIsolation) {
16113
16551
  sharedWorkspacePath = getWorkspacePath(evalRunId, "shared");
16114
- await (0, import_promises29.mkdir)(sharedWorkspacePath, { recursive: true });
16552
+ await (0, import_promises30.mkdir)(sharedWorkspacePath, { recursive: true });
16115
16553
  setupLog(`created empty shared workspace at: ${sharedWorkspacePath}`);
16116
16554
  }
16117
- const repoManager = suiteWorkspace?.repos?.length ? new RepoManager(void 0, verbose) : void 0;
16118
- if (repoManager && sharedWorkspacePath && suiteWorkspace?.repos && !isPerTestIsolation) {
16119
- setupLog(`materializing ${suiteWorkspace.repos.length} shared repo(s) into ${sharedWorkspacePath}`);
16120
- try {
16121
- await repoManager.materializeAll(suiteWorkspace.repos, sharedWorkspacePath);
16122
- setupLog("shared repo materialization complete");
16123
- } catch (error) {
16124
- const message = error instanceof Error ? error.message : String(error);
16125
- if (sharedWorkspacePath) {
16126
- await cleanupWorkspace(sharedWorkspacePath).catch(() => {
16127
- });
16128
- }
16129
- throw new Error(`Failed to materialize repos: ${message}`);
16130
- }
16131
- }
16132
- if (sharedWorkspacePath && suiteWorkspace?.before_all) {
16133
- const beforeAllCommand = (suiteWorkspace.before_all.command ?? suiteWorkspace.before_all.script ?? []).join(" ");
16134
- setupLog(
16135
- `running shared before_all in cwd=${suiteWorkspace.before_all.cwd ?? evalDir} command=${beforeAllCommand}`
16136
- );
16137
- const scriptContext = {
16138
- workspacePath: sharedWorkspacePath,
16139
- testId: "__before_all__",
16140
- evalRunId,
16141
- evalDir
16142
- };
16143
- try {
16144
- beforeAllOutput = await executeWorkspaceScript(suiteWorkspace.before_all, scriptContext);
16145
- setupLog("shared before_all completed");
16146
- } catch (error) {
16147
- const message = error instanceof Error ? error.message : String(error);
16148
- if (sharedWorkspacePath) {
16149
- await cleanupWorkspace(sharedWorkspacePath).catch(() => {
16150
- });
16555
+ try {
16556
+ if (suiteWorkspaceFile && sharedWorkspacePath) {
16557
+ const copiedWorkspaceFile = import_node_path42.default.join(sharedWorkspacePath, import_node_path42.default.basename(suiteWorkspaceFile));
16558
+ try {
16559
+ await (0, import_promises30.stat)(copiedWorkspaceFile);
16560
+ suiteWorkspaceFile = copiedWorkspaceFile;
16561
+ } catch {
16151
16562
  }
16152
- throw new Error(`before_all script failed: ${message}`);
16153
16563
  }
16154
- }
16155
- if (sharedWorkspacePath) {
16156
- try {
16157
- sharedBaselineCommit = await initializeBaseline(sharedWorkspacePath);
16158
- setupLog(`shared baseline initialized: ${sharedBaselineCommit}`);
16159
- } catch {
16160
- setupLog("shared baseline initialization skipped (non-fatal)");
16161
- }
16162
- }
16163
- let nextWorkerId = 1;
16164
- const workerIdByEvalId = /* @__PURE__ */ new Map();
16165
- let beforeAllOutputAttached = false;
16166
- let cumulativeBudgetCost = 0;
16167
- let budgetExhausted = false;
16168
- let failOnErrorTriggered = false;
16169
- const promises = filteredEvalCases.map(
16170
- (evalCase) => limit(async () => {
16171
- const workerId = nextWorkerId++;
16172
- workerIdByEvalId.set(evalCase.id, workerId);
16173
- if (totalBudgetUsd !== void 0 && budgetExhausted) {
16174
- const budgetResult = {
16175
- timestamp: (now ?? (() => /* @__PURE__ */ new Date()))().toISOString(),
16176
- testId: evalCase.id,
16177
- dataset: evalCase.dataset,
16178
- score: 0,
16179
- hits: [],
16180
- misses: [],
16181
- answer: "",
16182
- target: target.name,
16183
- error: `Suite budget exceeded ($${cumulativeBudgetCost.toFixed(4)} / $${totalBudgetUsd.toFixed(4)})`,
16184
- budgetExceeded: true,
16185
- executionStatus: "execution_error",
16186
- failureStage: "setup",
16187
- failureReasonCode: "budget_exceeded",
16188
- executionError: {
16189
- message: `Suite budget exceeded ($${cumulativeBudgetCost.toFixed(4)} / $${totalBudgetUsd.toFixed(4)})`,
16190
- stage: "setup"
16191
- }
16192
- };
16193
- if (onProgress) {
16194
- await onProgress({
16195
- workerId,
16196
- testId: evalCase.id,
16197
- status: "failed",
16198
- completedAt: Date.now(),
16199
- error: budgetResult.error
16564
+ const repoManager = suiteWorkspace?.repos?.length && !usePool && !userWorkspacePath ? new RepoManager(void 0, verbose) : void 0;
16565
+ if (repoManager && sharedWorkspacePath && suiteWorkspace?.repos && !isPerTestIsolation) {
16566
+ setupLog(
16567
+ `materializing ${suiteWorkspace.repos.length} shared repo(s) into ${sharedWorkspacePath}`
16568
+ );
16569
+ try {
16570
+ await repoManager.materializeAll(suiteWorkspace.repos, sharedWorkspacePath);
16571
+ setupLog("shared repo materialization complete");
16572
+ } catch (error) {
16573
+ const message = error instanceof Error ? error.message : String(error);
16574
+ if (sharedWorkspacePath && !userWorkspacePath) {
16575
+ await cleanupWorkspace(sharedWorkspacePath).catch(() => {
16200
16576
  });
16201
16577
  }
16202
- if (onResult) {
16203
- await onResult(budgetResult);
16204
- }
16205
- return budgetResult;
16578
+ throw new Error(`Failed to materialize repos: ${message}`);
16206
16579
  }
16207
- if (failOnError === true && failOnErrorTriggered) {
16208
- const errorMsg = "Halted: execution error encountered with fail_on_error enabled";
16209
- const haltResult = {
16210
- timestamp: (now ?? (() => /* @__PURE__ */ new Date()))().toISOString(),
16211
- testId: evalCase.id,
16212
- dataset: evalCase.dataset,
16213
- score: 0,
16214
- hits: [],
16215
- misses: [],
16216
- answer: "",
16217
- target: target.name,
16218
- error: errorMsg,
16219
- executionStatus: "execution_error",
16220
- failureStage: "setup",
16221
- failureReasonCode: "error_threshold_exceeded",
16222
- executionError: { message: errorMsg, stage: "setup" }
16223
- };
16224
- if (onProgress) {
16225
- await onProgress({
16226
- workerId,
16227
- testId: evalCase.id,
16228
- status: "failed",
16229
- completedAt: Date.now(),
16230
- error: haltResult.error
16580
+ }
16581
+ if (sharedWorkspacePath && suiteWorkspace?.before_all) {
16582
+ const beforeAllCommand = (suiteWorkspace.before_all.command ?? suiteWorkspace.before_all.script ?? []).join(" ");
16583
+ setupLog(
16584
+ `running shared before_all in cwd=${suiteWorkspace.before_all.cwd ?? evalDir} command=${beforeAllCommand}`
16585
+ );
16586
+ const scriptContext = {
16587
+ workspacePath: sharedWorkspacePath,
16588
+ testId: "__before_all__",
16589
+ evalRunId,
16590
+ evalDir
16591
+ };
16592
+ try {
16593
+ beforeAllOutput = await executeWorkspaceScript(suiteWorkspace.before_all, scriptContext);
16594
+ setupLog("shared before_all completed");
16595
+ } catch (error) {
16596
+ const message = error instanceof Error ? error.message : String(error);
16597
+ if (sharedWorkspacePath && !userWorkspacePath) {
16598
+ await cleanupWorkspace(sharedWorkspacePath).catch(() => {
16231
16599
  });
16232
16600
  }
16233
- if (onResult) {
16234
- await onResult(haltResult);
16235
- }
16236
- return haltResult;
16601
+ throw new Error(`before_all script failed: ${message}`);
16237
16602
  }
16238
- if (onProgress) {
16239
- await onProgress({
16240
- workerId,
16241
- testId: evalCase.id,
16242
- status: "running",
16243
- startedAt: Date.now()
16244
- });
16245
- }
16246
- try {
16247
- const judgeProvider = await resolveJudgeProvider(target);
16248
- const runCaseOptions = {
16249
- evalCase,
16250
- provider: primaryProvider,
16251
- target,
16252
- evaluators: evaluatorRegistry,
16253
- maxRetries,
16254
- agentTimeoutMs,
16255
- cache,
16256
- useCache,
16257
- now,
16258
- judgeProvider,
16259
- targetResolver,
16260
- availableTargets,
16603
+ }
16604
+ if (availablePoolSlots.length > 0 && suiteWorkspace?.before_all) {
16605
+ for (const slot of availablePoolSlots) {
16606
+ setupLog(`running before_all on pool slot ${slot.index}`);
16607
+ const scriptContext = {
16608
+ workspacePath: slot.path,
16609
+ testId: "__before_all__",
16261
16610
  evalRunId,
16262
- keepWorkspaces,
16263
- cleanupWorkspaces,
16264
- sharedWorkspacePath,
16265
- sharedBaselineCommit,
16266
- suiteWorkspaceFile,
16267
- streamCallbacks,
16268
- typeRegistry,
16269
- repoManager,
16270
16611
  evalDir
16271
16612
  };
16272
- let result = trials && trials.count > 1 ? await runEvalCaseWithTrials(runCaseOptions, trials) : await runEvalCase(runCaseOptions);
16273
- if (totalBudgetUsd !== void 0) {
16274
- let caseCost;
16275
- if (result.trials && result.trials.length > 0) {
16276
- const trialCostSum = result.trials.reduce((sum, t) => sum + (t.costUsd ?? 0), 0);
16277
- if (trialCostSum > 0) {
16278
- caseCost = trialCostSum;
16613
+ try {
16614
+ const output = await executeWorkspaceScript(suiteWorkspace.before_all, scriptContext);
16615
+ if (!beforeAllOutput) beforeAllOutput = output;
16616
+ setupLog(`before_all completed on pool slot ${slot.index}`);
16617
+ } catch (error) {
16618
+ const message = error instanceof Error ? error.message : String(error);
16619
+ throw new Error(`before_all script failed on pool slot ${slot.index}: ${message}`);
16620
+ }
16621
+ }
16622
+ }
16623
+ if (sharedWorkspacePath) {
16624
+ try {
16625
+ sharedBaselineCommit = await initializeBaseline(sharedWorkspacePath);
16626
+ setupLog(`shared baseline initialized: ${sharedBaselineCommit}`);
16627
+ } catch {
16628
+ setupLog("shared baseline initialization skipped (non-fatal)");
16629
+ }
16630
+ }
16631
+ if (availablePoolSlots.length > 0) {
16632
+ for (const slot of availablePoolSlots) {
16633
+ try {
16634
+ const baseline = await initializeBaseline(slot.path);
16635
+ poolSlotBaselines.set(slot.path, baseline);
16636
+ setupLog(`pool slot ${slot.index} baseline initialized: ${baseline}`);
16637
+ } catch {
16638
+ setupLog(`pool slot ${slot.index} baseline initialization skipped (non-fatal)`);
16639
+ }
16640
+ }
16641
+ }
16642
+ let nextWorkerId = 1;
16643
+ const workerIdByEvalId = /* @__PURE__ */ new Map();
16644
+ let beforeAllOutputAttached = false;
16645
+ let cumulativeBudgetCost = 0;
16646
+ let budgetExhausted = false;
16647
+ let failOnErrorTriggered = false;
16648
+ const promises = filteredEvalCases.map(
16649
+ (evalCase) => limit(async () => {
16650
+ const workerId = nextWorkerId++;
16651
+ workerIdByEvalId.set(evalCase.id, workerId);
16652
+ if (totalBudgetUsd !== void 0 && budgetExhausted) {
16653
+ const budgetResult = {
16654
+ timestamp: (now ?? (() => /* @__PURE__ */ new Date()))().toISOString(),
16655
+ testId: evalCase.id,
16656
+ dataset: evalCase.dataset,
16657
+ score: 0,
16658
+ hits: [],
16659
+ misses: [],
16660
+ answer: "",
16661
+ target: target.name,
16662
+ error: `Suite budget exceeded ($${cumulativeBudgetCost.toFixed(4)} / $${totalBudgetUsd.toFixed(4)})`,
16663
+ budgetExceeded: true,
16664
+ executionStatus: "execution_error",
16665
+ failureStage: "setup",
16666
+ failureReasonCode: "budget_exceeded",
16667
+ executionError: {
16668
+ message: `Suite budget exceeded ($${cumulativeBudgetCost.toFixed(4)} / $${totalBudgetUsd.toFixed(4)})`,
16669
+ stage: "setup"
16279
16670
  }
16280
- } else {
16281
- caseCost = result.costUsd;
16671
+ };
16672
+ if (onProgress) {
16673
+ await onProgress({
16674
+ workerId,
16675
+ testId: evalCase.id,
16676
+ status: "failed",
16677
+ completedAt: Date.now(),
16678
+ error: budgetResult.error
16679
+ });
16282
16680
  }
16283
- if (caseCost !== void 0) {
16284
- cumulativeBudgetCost += caseCost;
16285
- if (cumulativeBudgetCost >= totalBudgetUsd) {
16286
- budgetExhausted = true;
16287
- }
16681
+ if (onResult) {
16682
+ await onResult(budgetResult);
16288
16683
  }
16684
+ return budgetResult;
16289
16685
  }
16290
- if (failOnError === true && result.executionStatus === "execution_error") {
16291
- failOnErrorTriggered = true;
16292
- }
16293
- if (beforeAllOutput && !beforeAllOutputAttached) {
16294
- result = { ...result, beforeAllOutput };
16295
- beforeAllOutputAttached = true;
16686
+ if (failOnError === true && failOnErrorTriggered) {
16687
+ const errorMsg = "Halted: execution error encountered with fail_on_error enabled";
16688
+ const haltResult = {
16689
+ timestamp: (now ?? (() => /* @__PURE__ */ new Date()))().toISOString(),
16690
+ testId: evalCase.id,
16691
+ dataset: evalCase.dataset,
16692
+ score: 0,
16693
+ hits: [],
16694
+ misses: [],
16695
+ answer: "",
16696
+ target: target.name,
16697
+ error: errorMsg,
16698
+ executionStatus: "execution_error",
16699
+ failureStage: "setup",
16700
+ failureReasonCode: "error_threshold_exceeded",
16701
+ executionError: { message: errorMsg, stage: "setup" }
16702
+ };
16703
+ if (onProgress) {
16704
+ await onProgress({
16705
+ workerId,
16706
+ testId: evalCase.id,
16707
+ status: "failed",
16708
+ completedAt: Date.now(),
16709
+ error: haltResult.error
16710
+ });
16711
+ }
16712
+ if (onResult) {
16713
+ await onResult(haltResult);
16714
+ }
16715
+ return haltResult;
16296
16716
  }
16297
16717
  if (onProgress) {
16298
16718
  await onProgress({
16299
16719
  workerId,
16300
16720
  testId: evalCase.id,
16301
- status: result.error ? "failed" : "completed",
16302
- startedAt: 0,
16303
- // Not used for completed status
16304
- completedAt: Date.now(),
16305
- error: result.error
16721
+ status: "running",
16722
+ startedAt: Date.now()
16306
16723
  });
16307
16724
  }
16308
- if (onResult) {
16309
- await onResult(result);
16725
+ const testPoolSlot = availablePoolSlots.length > 0 ? availablePoolSlots.pop() : void 0;
16726
+ const testWorkspacePath = testPoolSlot?.path ?? sharedWorkspacePath;
16727
+ const testBaselineCommit = testPoolSlot ? poolSlotBaselines.get(testPoolSlot.path) : sharedBaselineCommit;
16728
+ try {
16729
+ const judgeProvider = await resolveJudgeProvider(target);
16730
+ const runCaseOptions = {
16731
+ evalCase,
16732
+ provider: primaryProvider,
16733
+ target,
16734
+ evaluators: evaluatorRegistry,
16735
+ maxRetries,
16736
+ agentTimeoutMs,
16737
+ cache,
16738
+ useCache,
16739
+ now,
16740
+ judgeProvider,
16741
+ targetResolver,
16742
+ availableTargets,
16743
+ evalRunId,
16744
+ keepWorkspaces,
16745
+ cleanupWorkspaces,
16746
+ sharedWorkspacePath: testWorkspacePath,
16747
+ sharedBaselineCommit: testBaselineCommit,
16748
+ suiteWorkspaceFile,
16749
+ streamCallbacks,
16750
+ typeRegistry,
16751
+ repoManager,
16752
+ evalDir
16753
+ };
16754
+ let result = trials && trials.count > 1 ? await runEvalCaseWithTrials(runCaseOptions, trials) : await runEvalCase(runCaseOptions);
16755
+ if (totalBudgetUsd !== void 0) {
16756
+ let caseCost;
16757
+ if (result.trials && result.trials.length > 0) {
16758
+ const trialCostSum = result.trials.reduce((sum, t) => sum + (t.costUsd ?? 0), 0);
16759
+ if (trialCostSum > 0) {
16760
+ caseCost = trialCostSum;
16761
+ }
16762
+ } else {
16763
+ caseCost = result.costUsd;
16764
+ }
16765
+ if (caseCost !== void 0) {
16766
+ cumulativeBudgetCost += caseCost;
16767
+ if (cumulativeBudgetCost >= totalBudgetUsd) {
16768
+ budgetExhausted = true;
16769
+ }
16770
+ }
16771
+ }
16772
+ if (failOnError === true && result.executionStatus === "execution_error") {
16773
+ failOnErrorTriggered = true;
16774
+ }
16775
+ if (beforeAllOutput && !beforeAllOutputAttached) {
16776
+ result = { ...result, beforeAllOutput };
16777
+ beforeAllOutputAttached = true;
16778
+ }
16779
+ if (onProgress) {
16780
+ await onProgress({
16781
+ workerId,
16782
+ testId: evalCase.id,
16783
+ status: result.error ? "failed" : "completed",
16784
+ startedAt: 0,
16785
+ // Not used for completed status
16786
+ completedAt: Date.now(),
16787
+ error: result.error
16788
+ });
16789
+ }
16790
+ if (onResult) {
16791
+ await onResult(result);
16792
+ }
16793
+ return result;
16794
+ } catch (error) {
16795
+ if (onProgress) {
16796
+ await onProgress({
16797
+ workerId,
16798
+ testId: evalCase.id,
16799
+ status: "failed",
16800
+ completedAt: Date.now(),
16801
+ error: error instanceof Error ? error.message : String(error)
16802
+ });
16803
+ }
16804
+ throw error;
16805
+ } finally {
16806
+ if (testPoolSlot) {
16807
+ availablePoolSlots.push(testPoolSlot);
16808
+ }
16310
16809
  }
16311
- return result;
16312
- } catch (error) {
16313
- if (onProgress) {
16314
- await onProgress({
16315
- workerId,
16316
- testId: evalCase.id,
16317
- status: "failed",
16318
- completedAt: Date.now(),
16319
- error: error instanceof Error ? error.message : String(error)
16320
- });
16810
+ })
16811
+ );
16812
+ const settled = await Promise.allSettled(promises);
16813
+ const results = [];
16814
+ for (let i = 0; i < settled.length; i++) {
16815
+ const outcome = settled[i];
16816
+ if (outcome.status === "fulfilled") {
16817
+ results.push(outcome.value);
16818
+ } else {
16819
+ const evalCase = filteredEvalCases[i];
16820
+ const formattingMode = usesFileReferencePrompt(primaryProvider) ? "agent" : "lm";
16821
+ const promptInputs = await buildPromptInputs(evalCase, formattingMode);
16822
+ const errorResult = buildErrorResult(
16823
+ evalCase,
16824
+ target.name,
16825
+ (now ?? (() => /* @__PURE__ */ new Date()))(),
16826
+ outcome.reason,
16827
+ promptInputs,
16828
+ primaryProvider,
16829
+ "agent",
16830
+ "provider_error"
16831
+ );
16832
+ results.push(errorResult);
16833
+ if (onResult) {
16834
+ await onResult(errorResult);
16321
16835
  }
16322
- throw error;
16323
16836
  }
16324
- })
16325
- );
16326
- const settled = await Promise.allSettled(promises);
16327
- const results = [];
16328
- for (let i = 0; i < settled.length; i++) {
16329
- const outcome = settled[i];
16330
- if (outcome.status === "fulfilled") {
16331
- results.push(outcome.value);
16332
- } else {
16333
- const evalCase = filteredEvalCases[i];
16334
- const formattingMode = usesFileReferencePrompt(primaryProvider) ? "agent" : "lm";
16335
- const promptInputs = await buildPromptInputs(evalCase, formattingMode);
16336
- const errorResult = buildErrorResult(
16337
- evalCase,
16338
- target.name,
16339
- (now ?? (() => /* @__PURE__ */ new Date()))(),
16340
- outcome.reason,
16341
- promptInputs,
16342
- primaryProvider,
16343
- "agent",
16344
- "provider_error"
16345
- );
16346
- results.push(errorResult);
16347
- if (onResult) {
16348
- await onResult(errorResult);
16837
+ }
16838
+ const afterAllWorkspaces = poolSlots.length > 1 ? poolSlots.map((s) => s.path) : sharedWorkspacePath ? [sharedWorkspacePath] : [];
16839
+ if (afterAllWorkspaces.length > 0 && suiteWorkspace?.after_all) {
16840
+ for (const wsPath of afterAllWorkspaces) {
16841
+ const scriptContext = {
16842
+ workspacePath: wsPath,
16843
+ testId: "__after_all__",
16844
+ evalRunId,
16845
+ evalDir
16846
+ };
16847
+ try {
16848
+ const afterAllOutput = await executeWorkspaceScript(
16849
+ suiteWorkspace.after_all,
16850
+ scriptContext,
16851
+ "warn"
16852
+ );
16853
+ if (afterAllOutput && results.length > 0 && wsPath === afterAllWorkspaces[0]) {
16854
+ results[results.length - 1] = { ...results[results.length - 1], afterAllOutput };
16855
+ }
16856
+ } catch {
16857
+ }
16349
16858
  }
16350
16859
  }
16351
- }
16352
- if (sharedWorkspacePath && suiteWorkspace?.after_all) {
16353
- const scriptContext = {
16354
- workspacePath: sharedWorkspacePath,
16355
- testId: "__after_all__",
16356
- evalRunId,
16357
- evalDir
16358
- };
16359
- try {
16360
- const afterAllOutput = await executeWorkspaceScript(
16361
- suiteWorkspace.after_all,
16362
- scriptContext,
16363
- "warn"
16364
- );
16365
- if (afterAllOutput && results.length > 0) {
16366
- results[results.length - 1] = { ...results[results.length - 1], afterAllOutput };
16860
+ if (sharedWorkspacePath && !poolSlot && poolSlots.length === 0 && !userWorkspacePath) {
16861
+ const hasFailure = results.some((r) => !!r.error || r.score < 0.5);
16862
+ if (cleanupWorkspaces) {
16863
+ await cleanupWorkspace(sharedWorkspacePath).catch(() => {
16864
+ });
16865
+ } else if (!hasFailure && !keepWorkspaces) {
16866
+ await cleanupWorkspace(sharedWorkspacePath).catch(() => {
16867
+ });
16367
16868
  }
16368
- } catch {
16369
16869
  }
16370
- }
16371
- if (sharedWorkspacePath) {
16372
- const hasFailure = results.some((r) => !!r.error || r.score < 0.5);
16373
16870
  if (cleanupWorkspaces) {
16374
- await cleanupWorkspace(sharedWorkspacePath).catch(() => {
16375
- });
16376
- } else if (!hasFailure && !keepWorkspaces) {
16377
- await cleanupWorkspace(sharedWorkspacePath).catch(() => {
16871
+ await cleanupEvalWorkspaces(evalRunId).catch(() => {
16378
16872
  });
16379
16873
  }
16874
+ return results;
16875
+ } finally {
16876
+ if (poolManager) {
16877
+ if (poolSlot) {
16878
+ await poolManager.releaseSlot(poolSlot);
16879
+ }
16880
+ for (const slot of poolSlots) {
16881
+ if (slot !== poolSlot) {
16882
+ await poolManager.releaseSlot(slot).catch(() => {
16883
+ });
16884
+ }
16885
+ }
16886
+ }
16380
16887
  }
16381
- if (cleanupWorkspaces) {
16382
- await cleanupEvalWorkspaces(evalRunId).catch(() => {
16383
- });
16384
- }
16385
- return results;
16386
16888
  }
16387
16889
  async function runBatchEvaluation(options) {
16388
16890
  const {
@@ -16599,9 +17101,9 @@ async function runEvalCase(options) {
16599
17101
  );
16600
17102
  }
16601
17103
  if (caseWorkspaceFile && workspacePath) {
16602
- const copiedFile = import_node_path41.default.join(workspacePath, import_node_path41.default.basename(caseWorkspaceFile));
17104
+ const copiedFile = import_node_path42.default.join(workspacePath, import_node_path42.default.basename(caseWorkspaceFile));
16603
17105
  try {
16604
- await (0, import_promises29.stat)(copiedFile);
17106
+ await (0, import_promises30.stat)(copiedFile);
16605
17107
  caseWorkspaceFile = copiedFile;
16606
17108
  } catch {
16607
17109
  }
@@ -16609,7 +17111,7 @@ async function runEvalCase(options) {
16609
17111
  }
16610
17112
  if (!workspacePath && (evalCase.workspace?.before_all || evalCase.workspace?.repos?.length) && evalRunId) {
16611
17113
  workspacePath = getWorkspacePath(evalRunId, evalCase.id);
16612
- await (0, import_promises29.mkdir)(workspacePath, { recursive: true });
17114
+ await (0, import_promises30.mkdir)(workspacePath, { recursive: true });
16613
17115
  }
16614
17116
  if (evalCase.workspace?.repos?.length && workspacePath) {
16615
17117
  const perCaseRepoManager = new RepoManager(void 0, setupDebug);
@@ -17209,7 +17711,7 @@ async function runEvaluatorList(options) {
17209
17711
  fileChanges,
17210
17712
  workspacePath
17211
17713
  };
17212
- const evalFileDir = evalCase.guideline_paths[0] ? import_node_path41.default.dirname(evalCase.guideline_paths[0]) : process.cwd();
17714
+ const evalFileDir = evalCase.guideline_paths[0] ? import_node_path42.default.dirname(evalCase.guideline_paths[0]) : process.cwd();
17213
17715
  const dispatchContext = {
17214
17716
  judgeProvider,
17215
17717
  targetResolver,
@@ -17443,7 +17945,7 @@ function extractProviderError(response) {
17443
17945
  return trimmed.length > 0 ? trimmed : void 0;
17444
17946
  }
17445
17947
  function createCacheKey(provider, target, evalCase, promptInputs) {
17446
- const hash = (0, import_node_crypto9.createHash)("sha256");
17948
+ const hash = (0, import_node_crypto10.createHash)("sha256");
17447
17949
  hash.update(provider.id);
17448
17950
  hash.update(target.name);
17449
17951
  hash.update(evalCase.id);
@@ -17511,8 +18013,8 @@ function computeWeightedMean(entries) {
17511
18013
  }
17512
18014
 
17513
18015
  // src/evaluation/evaluate.ts
17514
- var import_node_fs12 = require("fs");
17515
- var import_node_path42 = __toESM(require("path"), 1);
18016
+ var import_node_fs13 = require("fs");
18017
+ var import_node_path43 = __toESM(require("path"), 1);
17516
18018
  async function evaluate(config) {
17517
18019
  const startTime = Date.now();
17518
18020
  if (config.tests && config.specFile) {
@@ -17534,13 +18036,13 @@ async function evaluate(config) {
17534
18036
  let evalCases;
17535
18037
  let testFilePath;
17536
18038
  if (config.specFile) {
17537
- testFilePath = import_node_path42.default.resolve(config.specFile);
18039
+ testFilePath = import_node_path43.default.resolve(config.specFile);
17538
18040
  evalCases = await loadTests(testFilePath, repoRoot, {
17539
18041
  verbose: config.verbose,
17540
18042
  filter: config.filter
17541
18043
  });
17542
18044
  } else {
17543
- testFilePath = import_node_path42.default.join(process.cwd(), "__programmatic__.yaml");
18045
+ testFilePath = import_node_path43.default.join(process.cwd(), "__programmatic__.yaml");
17544
18046
  evalCases = (config.tests ?? []).map((test) => {
17545
18047
  const input = typeof test.input === "string" ? [{ role: "user", content: test.input }] : test.input;
17546
18048
  const question = typeof test.input === "string" ? test.input : test.input.find((m) => m.role === "user")?.content ?? "";
@@ -17626,11 +18128,11 @@ function computeSummary(results, durationMs) {
17626
18128
  var TARGET_FILE_CANDIDATES = [".agentv/targets.yaml", ".agentv/targets.yml"];
17627
18129
  async function discoverDefaultTarget(repoRoot) {
17628
18130
  const cwd = process.cwd();
17629
- const chain = buildDirectoryChain2(import_node_path42.default.join(cwd, "_placeholder"), repoRoot);
18131
+ const chain = buildDirectoryChain2(import_node_path43.default.join(cwd, "_placeholder"), repoRoot);
17630
18132
  for (const dir of chain) {
17631
18133
  for (const candidate of TARGET_FILE_CANDIDATES) {
17632
- const targetsPath = import_node_path42.default.join(dir, candidate);
17633
- if (!(0, import_node_fs12.existsSync)(targetsPath)) continue;
18134
+ const targetsPath = import_node_path43.default.join(dir, candidate);
18135
+ if (!(0, import_node_fs13.existsSync)(targetsPath)) continue;
17634
18136
  try {
17635
18137
  const definitions = await readTargetDefinitions(targetsPath);
17636
18138
  const defaultTarget = definitions.find((d) => d.name === "default");
@@ -17644,11 +18146,11 @@ async function discoverDefaultTarget(repoRoot) {
17644
18146
  async function loadEnvHierarchy(repoRoot) {
17645
18147
  const { readFileSync: readFileSync2 } = await import("fs");
17646
18148
  const cwd = process.cwd();
17647
- const chain = buildDirectoryChain2(import_node_path42.default.join(cwd, "_placeholder"), repoRoot);
18149
+ const chain = buildDirectoryChain2(import_node_path43.default.join(cwd, "_placeholder"), repoRoot);
17648
18150
  const envFiles = [];
17649
18151
  for (const dir of chain) {
17650
- const envPath = import_node_path42.default.join(dir, ".env");
17651
- if ((0, import_node_fs12.existsSync)(envPath)) envFiles.push(envPath);
18152
+ const envPath = import_node_path43.default.join(dir, ".env");
18153
+ if ((0, import_node_fs13.existsSync)(envPath)) envFiles.push(envPath);
17652
18154
  }
17653
18155
  for (let i = envFiles.length - 1; i >= 0; i--) {
17654
18156
  try {
@@ -17726,12 +18228,12 @@ var CONFIG_FILE_NAMES = [
17726
18228
  ".agentv/config.js"
17727
18229
  ];
17728
18230
  async function loadTsConfig(projectRoot) {
17729
- const { existsSync: existsSync4 } = await import("fs");
18231
+ const { existsSync: existsSync5 } = await import("fs");
17730
18232
  const { pathToFileURL } = await import("url");
17731
18233
  const { join: join2 } = await import("path");
17732
18234
  for (const fileName of CONFIG_FILE_NAMES) {
17733
18235
  const filePath = join2(projectRoot, fileName);
17734
- if (!existsSync4(filePath)) {
18236
+ if (!existsSync5(filePath)) {
17735
18237
  continue;
17736
18238
  }
17737
18239
  try {
@@ -17828,8 +18330,8 @@ function buildPrompt(criteria, question, referenceAnswer) {
17828
18330
  }
17829
18331
 
17830
18332
  // src/evaluation/cache/response-cache.ts
17831
- var import_promises30 = require("fs/promises");
17832
- var import_node_path43 = __toESM(require("path"), 1);
18333
+ var import_promises31 = require("fs/promises");
18334
+ var import_node_path44 = __toESM(require("path"), 1);
17833
18335
  var DEFAULT_CACHE_PATH = ".agentv/cache";
17834
18336
  var ResponseCache = class {
17835
18337
  cachePath;
@@ -17839,7 +18341,7 @@ var ResponseCache = class {
17839
18341
  async get(key) {
17840
18342
  const filePath = this.keyToPath(key);
17841
18343
  try {
17842
- const data = await (0, import_promises30.readFile)(filePath, "utf8");
18344
+ const data = await (0, import_promises31.readFile)(filePath, "utf8");
17843
18345
  return JSON.parse(data);
17844
18346
  } catch {
17845
18347
  return void 0;
@@ -17847,13 +18349,13 @@ var ResponseCache = class {
17847
18349
  }
17848
18350
  async set(key, value) {
17849
18351
  const filePath = this.keyToPath(key);
17850
- const dir = import_node_path43.default.dirname(filePath);
17851
- await (0, import_promises30.mkdir)(dir, { recursive: true });
17852
- await (0, import_promises30.writeFile)(filePath, JSON.stringify(value, null, 2), "utf8");
18352
+ const dir = import_node_path44.default.dirname(filePath);
18353
+ await (0, import_promises31.mkdir)(dir, { recursive: true });
18354
+ await (0, import_promises31.writeFile)(filePath, JSON.stringify(value, null, 2), "utf8");
17853
18355
  }
17854
18356
  keyToPath(key) {
17855
18357
  const prefix = key.slice(0, 2);
17856
- return import_node_path43.default.join(this.cachePath, prefix, `${key}.json`);
18358
+ return import_node_path44.default.join(this.cachePath, prefix, `${key}.json`);
17857
18359
  }
17858
18360
  };
17859
18361
  function shouldEnableCache(params) {
@@ -18340,6 +18842,7 @@ function createAgentKernel() {
18340
18842
  TokenUsageEvaluator,
18341
18843
  ToolTrajectoryEvaluator,
18342
18844
  WorkspaceCreationError,
18845
+ WorkspacePoolManager,
18343
18846
  assembleLlmJudgePrompt,
18344
18847
  avgToolDurationMs,
18345
18848
  buildDirectoryChain,
@@ -18354,6 +18857,7 @@ function createAgentKernel() {
18354
18857
  cleanupEvalWorkspaces,
18355
18858
  cleanupWorkspace,
18356
18859
  computeTraceSummary,
18860
+ computeWorkspaceFingerprint,
18357
18861
  consumeClaudeLogEntries,
18358
18862
  consumeCodexLogEntries,
18359
18863
  consumeCopilotCliLogEntries,
@@ -18391,6 +18895,7 @@ function createAgentKernel() {
18391
18895
  getSubagentsRoot,
18392
18896
  getTraceStateRoot,
18393
18897
  getWorkspacePath,
18898
+ getWorkspacePoolRoot,
18394
18899
  getWorkspacesRoot,
18395
18900
  initializeBaseline,
18396
18901
  isEvaluatorKind,