agentv 4.40.1 → 4.41.0-next.1

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (28) hide show
  1. package/dist/{artifact-writer-GIAIMGPQ.js → artifact-writer-AMV64TWV.js} +4 -4
  2. package/dist/{chunk-B7CT3J2W.js → chunk-6FXICR66.js} +899 -300
  3. package/dist/chunk-6FXICR66.js.map +1 -0
  4. package/dist/{chunk-TWQP7JYQ.js → chunk-A4J456KS.js} +2 -2
  5. package/dist/{chunk-A36XLUI5.js → chunk-CF5RCUWH.js} +12 -10
  6. package/dist/chunk-CF5RCUWH.js.map +1 -0
  7. package/dist/{chunk-BLXYBUU4.js → chunk-ENHX2CCS.js} +1485 -943
  8. package/dist/chunk-ENHX2CCS.js.map +1 -0
  9. package/dist/{chunk-I3SC4FOT.js → chunk-Z45FKRMJ.js} +212 -58
  10. package/dist/chunk-Z45FKRMJ.js.map +1 -0
  11. package/dist/cli.js +5 -5
  12. package/dist/{dist-6Z4OSITR.js → dist-X5P5IR65.js} +7 -3
  13. package/dist/index.js +5 -5
  14. package/dist/{interactive-Q575M3A7.js → interactive-4JKJTY3G.js} +5 -5
  15. package/dist/skills/agentv-bench/references/eval-yaml-spec.md +4 -4
  16. package/dist/skills/agentv-eval-writer/references/custom-evaluators.md +14 -14
  17. package/dist/skills/agentv-eval-writer/references/python-helpers.md +47 -0
  18. package/dist/{ts-eval-loader-NWH3B4HG-UXXCZKLP.js → ts-eval-loader-ZVL6CGTE-TZYZX3QS.js} +2 -2
  19. package/package.json +1 -1
  20. package/dist/chunk-A36XLUI5.js.map +0 -1
  21. package/dist/chunk-B7CT3J2W.js.map +0 -1
  22. package/dist/chunk-BLXYBUU4.js.map +0 -1
  23. package/dist/chunk-I3SC4FOT.js.map +0 -1
  24. /package/dist/{artifact-writer-GIAIMGPQ.js.map → artifact-writer-AMV64TWV.js.map} +0 -0
  25. /package/dist/{chunk-TWQP7JYQ.js.map → chunk-A4J456KS.js.map} +0 -0
  26. /package/dist/{dist-6Z4OSITR.js.map → dist-X5P5IR65.js.map} +0 -0
  27. /package/dist/{interactive-Q575M3A7.js.map → interactive-4JKJTY3G.js.map} +0 -0
  28. /package/dist/{ts-eval-loader-NWH3B4HG-UXXCZKLP.js.map → ts-eval-loader-ZVL6CGTE-TZYZX3QS.js.map} +0 -0
@@ -493,8 +493,8 @@ function getErrorMap() {
493
493
 
494
494
  // ../../node_modules/.bun/zod@3.25.76/node_modules/zod/v3/helpers/parseUtil.js
495
495
  var makeIssue = (params) => {
496
- const { data, path: path50, errorMaps, issueData } = params;
497
- const fullPath = [...path50, ...issueData.path || []];
496
+ const { data, path: path51, errorMaps, issueData } = params;
497
+ const fullPath = [...path51, ...issueData.path || []];
498
498
  const fullIssue = {
499
499
  ...issueData,
500
500
  path: fullPath
@@ -610,11 +610,11 @@ var errorUtil;
610
610
 
611
611
  // ../../node_modules/.bun/zod@3.25.76/node_modules/zod/v3/types.js
612
612
  var ParseInputLazyPath = class {
613
- constructor(parent, value, path50, key) {
613
+ constructor(parent, value, path51, key) {
614
614
  this._cachedPath = [];
615
615
  this.parent = parent;
616
616
  this.data = value;
617
- this._path = path50;
617
+ this._path = path51;
618
618
  this._key = key;
619
619
  }
620
620
  get path() {
@@ -4056,7 +4056,7 @@ var coerce = {
4056
4056
  };
4057
4057
  var NEVER = INVALID;
4058
4058
 
4059
- // ../../packages/core/dist/chunk-5JNFEE7J.js
4059
+ // ../../packages/core/dist/chunk-3EAL7M5J.js
4060
4060
  import { parse } from "yaml";
4061
4061
  import os from "node:os";
4062
4062
  import path from "node:path";
@@ -5252,17 +5252,22 @@ function resolveCopilotFlatProviderConfig(target, env) {
5252
5252
  optionalEnv: true
5253
5253
  }
5254
5254
  );
5255
- const wireApi = resolveOptionalString(target.wire_api, env, `${target.name} copilot wire API`, {
5256
- allowLiteral: true,
5257
- optionalEnv: true
5258
- });
5255
+ const apiFormat = resolveOptionalString(
5256
+ target.api_format,
5257
+ env,
5258
+ `${target.name} copilot API format`,
5259
+ {
5260
+ allowLiteral: true,
5261
+ optionalEnv: true
5262
+ }
5263
+ );
5259
5264
  return {
5260
5265
  ...type ? { type } : {},
5261
5266
  baseUrl,
5262
5267
  ...apiKey ? { apiKey } : {},
5263
5268
  ...bearerToken ? { bearerToken } : {},
5264
5269
  ...apiVersion ? { apiVersion } : {},
5265
- ...wireApi ? { wireApi } : {}
5270
+ ...apiFormat ? { wireApi: apiFormat } : {}
5266
5271
  };
5267
5272
  }
5268
5273
  function resolveCopilotCliConfig(target, env, _evalFilePath) {
@@ -6151,21 +6156,17 @@ async function expandFileReferences(tests, evalFileDir) {
6151
6156
  return expanded;
6152
6157
  }
6153
6158
 
6154
- // ../../packages/core/dist/chunk-M6LF2BEU.js
6155
- import path49 from "node:path";
6159
+ // ../../packages/core/dist/chunk-REU6TJT4.js
6160
+ import path50 from "node:path";
6156
6161
  import { pathToFileURL as pathToFileURL2 } from "node:url";
6157
6162
  import { existsSync as existsSync7 } from "node:fs";
6158
- import path48 from "node:path";
6163
+ import path49 from "node:path";
6159
6164
  import micromatch4 from "micromatch";
6160
6165
  import { mkdir, readFile as readFile3, writeFile } from "node:fs/promises";
6161
6166
  import path5 from "node:path";
6162
- import { execFile as execFile2 } from "node:child_process";
6163
6167
  import { createHash as createHash5, randomUUID as randomUUID10 } from "node:crypto";
6164
- import { existsSync as existsSync6 } from "node:fs";
6165
- import { copyFile as copyFile2, mkdir as mkdir17, readdir as readdir8, stat as stat9 } from "node:fs/promises";
6166
- import path46 from "node:path";
6168
+ import path47 from "node:path";
6167
6169
  import { fileURLToPath as fileURLToPath5 } from "node:url";
6168
- import { promisify as promisify6 } from "node:util";
6169
6170
  import micromatch3 from "micromatch";
6170
6171
  import { mkdtemp, rm, writeFile as writeFile2 } from "node:fs/promises";
6171
6172
  import { tmpdir } from "node:os";
@@ -6883,10 +6884,10 @@ function assignProp(target, prop, value) {
6883
6884
  configurable: true
6884
6885
  });
6885
6886
  }
6886
- function getElementAtPath(obj, path50) {
6887
- if (!path50)
6887
+ function getElementAtPath(obj, path51) {
6888
+ if (!path51)
6888
6889
  return obj;
6889
- return path50.reduce((acc, key) => acc?.[key], obj);
6890
+ return path51.reduce((acc, key) => acc?.[key], obj);
6890
6891
  }
6891
6892
  function promiseAllObject(promisesObj) {
6892
6893
  const keys = Object.keys(promisesObj);
@@ -7206,11 +7207,11 @@ function aborted(x, startIndex = 0) {
7206
7207
  }
7207
7208
  return false;
7208
7209
  }
7209
- function prefixIssues(path50, issues) {
7210
+ function prefixIssues(path51, issues) {
7210
7211
  return issues.map((iss) => {
7211
7212
  var _a;
7212
7213
  (_a = iss).path ?? (_a.path = []);
7213
- iss.path.unshift(path50);
7214
+ iss.path.unshift(path51);
7214
7215
  return iss;
7215
7216
  });
7216
7217
  }
@@ -7347,7 +7348,7 @@ function treeifyError(error40, _mapper) {
7347
7348
  return issue2.message;
7348
7349
  };
7349
7350
  const result = { errors: [] };
7350
- const processError = (error41, path50 = []) => {
7351
+ const processError = (error41, path51 = []) => {
7351
7352
  var _a, _b;
7352
7353
  for (const issue2 of error41.issues) {
7353
7354
  if (issue2.code === "invalid_union" && issue2.errors.length) {
@@ -7357,7 +7358,7 @@ function treeifyError(error40, _mapper) {
7357
7358
  } else if (issue2.code === "invalid_element") {
7358
7359
  processError({ issues: issue2.issues }, issue2.path);
7359
7360
  } else {
7360
- const fullpath = [...path50, ...issue2.path];
7361
+ const fullpath = [...path51, ...issue2.path];
7361
7362
  if (fullpath.length === 0) {
7362
7363
  result.errors.push(mapper(issue2));
7363
7364
  continue;
@@ -7387,9 +7388,9 @@ function treeifyError(error40, _mapper) {
7387
7388
  processError(error40);
7388
7389
  return result;
7389
7390
  }
7390
- function toDotPath(path50) {
7391
+ function toDotPath(path51) {
7391
7392
  const segs = [];
7392
- for (const seg of path50) {
7393
+ for (const seg of path51) {
7393
7394
  if (typeof seg === "number")
7394
7395
  segs.push(`[${seg}]`);
7395
7396
  else if (typeof seg === "symbol")
@@ -18819,7 +18820,7 @@ var RequestError = class _RequestError extends Error {
18819
18820
  }
18820
18821
  };
18821
18822
 
18822
- // ../../packages/core/dist/chunk-M6LF2BEU.js
18823
+ // ../../packages/core/dist/chunk-REU6TJT4.js
18823
18824
  import { exec as execCallback } from "node:child_process";
18824
18825
  import { readdirSync, statSync } from "node:fs";
18825
18826
  import { readFile as readFile32, readdir as readdir2, stat as stat2 } from "node:fs/promises";
@@ -18894,6 +18895,11 @@ import path33 from "node:path";
18894
18895
  import fg3 from "fast-glob";
18895
18896
  import { cp, mkdir as mkdir14, readdir as readdir5, rm as rm4, stat as stat6 } from "node:fs/promises";
18896
18897
  import path34 from "node:path";
18898
+ import { execFile as execFile2 } from "node:child_process";
18899
+ import { existsSync as existsSync6 } from "node:fs";
18900
+ import { copyFile as copyFile2, mkdir as mkdir17, readdir as readdir8, stat as stat8 } from "node:fs/promises";
18901
+ import path39 from "node:path";
18902
+ import { promisify as promisify6 } from "node:util";
18897
18903
  import { createHash as createHash3 } from "node:crypto";
18898
18904
  import { existsSync as existsSync3 } from "node:fs";
18899
18905
  import { cp as cp2, mkdir as mkdir15, readFile as readFile11, readdir as readdir6, rm as rm5, unlink, writeFile as writeFile9 } from "node:fs/promises";
@@ -18909,28 +18915,28 @@ import path36 from "node:path";
18909
18915
  import { stringify as stringifyYaml } from "yaml";
18910
18916
  import { readdir as readdir7, stat as stat7 } from "node:fs/promises";
18911
18917
  import path38 from "node:path";
18912
- import { readFile as readFile18, stat as stat8 } from "node:fs/promises";
18913
- import path45 from "node:path";
18918
+ import { readFile as readFile18, stat as stat9 } from "node:fs/promises";
18919
+ import path46 from "node:path";
18914
18920
  import micromatch2 from "micromatch";
18915
18921
  import { stringify as stringifyYaml2 } from "yaml";
18916
18922
  import { readFile as readFile12 } from "node:fs/promises";
18917
- import path39 from "node:path";
18923
+ import path40 from "node:path";
18918
18924
  import { readFile as readFile13 } from "node:fs/promises";
18919
- import path41 from "node:path";
18925
+ import path422 from "node:path";
18920
18926
  import { constants as constants4 } from "node:fs";
18921
18927
  import { access as access4 } from "node:fs/promises";
18922
- import path40 from "node:path";
18928
+ import path41 from "node:path";
18923
18929
  import { fileURLToPath as fileURLToPath4 } from "node:url";
18924
18930
  import { readFile as readFile15 } from "node:fs/promises";
18925
- import path422 from "node:path";
18931
+ import path43 from "node:path";
18926
18932
  import { readFile as readFile14 } from "node:fs/promises";
18927
18933
  import { readFile as readFile17 } from "node:fs/promises";
18928
- import path44 from "node:path";
18934
+ import path45 from "node:path";
18929
18935
  import micromatch from "micromatch";
18930
18936
  import { readFile as readFile16 } from "node:fs/promises";
18931
- import path43 from "node:path";
18937
+ import path44 from "node:path";
18932
18938
  import { mkdir as mkdir18, readFile as readFile20, writeFile as writeFile10 } from "node:fs/promises";
18933
- import path47 from "node:path";
18939
+ import path48 from "node:path";
18934
18940
  import { readFile as readFile19 } from "node:fs/promises";
18935
18941
  var DEFAULT_CACHE_PATH = ".agentv/cache";
18936
18942
  var ResponseCache = class {
@@ -22462,115 +22468,115 @@ var FieldAccuracyGrader = class {
22462
22468
  * Evaluate a single field against the expected value.
22463
22469
  */
22464
22470
  evaluateField(fieldConfig, candidateData, expectedData) {
22465
- const { path: path50, match, required: required2 = true, weight = 1 } = fieldConfig;
22466
- const candidateValue = resolvePath(candidateData, path50);
22467
- const expectedValue = resolvePath(expectedData, path50);
22471
+ const { path: path51, match, required: required2 = true, weight = 1 } = fieldConfig;
22472
+ const candidateValue = resolvePath(candidateData, path51);
22473
+ const expectedValue = resolvePath(expectedData, path51);
22468
22474
  if (expectedValue === void 0) {
22469
22475
  return {
22470
- path: path50,
22476
+ path: path51,
22471
22477
  score: 1,
22472
22478
  // No expected value means no comparison needed
22473
22479
  weight,
22474
22480
  hit: true,
22475
- message: `${path50}: no expected value`
22481
+ message: `${path51}: no expected value`
22476
22482
  };
22477
22483
  }
22478
22484
  if (candidateValue === void 0) {
22479
22485
  if (required2) {
22480
22486
  return {
22481
- path: path50,
22487
+ path: path51,
22482
22488
  score: 0,
22483
22489
  weight,
22484
22490
  hit: false,
22485
- message: `${path50} (required, missing)`
22491
+ message: `${path51} (required, missing)`
22486
22492
  };
22487
22493
  }
22488
22494
  return {
22489
- path: path50,
22495
+ path: path51,
22490
22496
  score: 1,
22491
22497
  // Don't penalize missing optional fields
22492
22498
  weight: 0,
22493
22499
  // Zero weight means it won't affect the score
22494
22500
  hit: true,
22495
- message: `${path50}: optional field missing`
22501
+ message: `${path51}: optional field missing`
22496
22502
  };
22497
22503
  }
22498
22504
  switch (match) {
22499
22505
  case "exact":
22500
- return this.compareExact(path50, candidateValue, expectedValue, weight);
22506
+ return this.compareExact(path51, candidateValue, expectedValue, weight);
22501
22507
  case "numeric_tolerance":
22502
22508
  return this.compareNumericTolerance(
22503
- path50,
22509
+ path51,
22504
22510
  candidateValue,
22505
22511
  expectedValue,
22506
22512
  fieldConfig,
22507
22513
  weight
22508
22514
  );
22509
22515
  case "date":
22510
- return this.compareDate(path50, candidateValue, expectedValue, fieldConfig, weight);
22516
+ return this.compareDate(path51, candidateValue, expectedValue, fieldConfig, weight);
22511
22517
  default:
22512
22518
  return {
22513
- path: path50,
22519
+ path: path51,
22514
22520
  score: 0,
22515
22521
  weight,
22516
22522
  hit: false,
22517
- message: `${path50}: unknown match type "${match}"`
22523
+ message: `${path51}: unknown match type "${match}"`
22518
22524
  };
22519
22525
  }
22520
22526
  }
22521
22527
  /**
22522
22528
  * Exact equality comparison.
22523
22529
  */
22524
- compareExact(path50, candidateValue, expectedValue, weight) {
22530
+ compareExact(path51, candidateValue, expectedValue, weight) {
22525
22531
  if (deepEqual(candidateValue, expectedValue)) {
22526
22532
  return {
22527
- path: path50,
22533
+ path: path51,
22528
22534
  score: 1,
22529
22535
  weight,
22530
22536
  hit: true,
22531
- message: path50
22537
+ message: path51
22532
22538
  };
22533
22539
  }
22534
22540
  if (typeof candidateValue !== typeof expectedValue) {
22535
22541
  return {
22536
- path: path50,
22542
+ path: path51,
22537
22543
  score: 0,
22538
22544
  weight,
22539
22545
  hit: false,
22540
- message: `${path50} (type mismatch: got ${typeof candidateValue}, expected ${typeof expectedValue})`
22546
+ message: `${path51} (type mismatch: got ${typeof candidateValue}, expected ${typeof expectedValue})`
22541
22547
  };
22542
22548
  }
22543
22549
  return {
22544
- path: path50,
22550
+ path: path51,
22545
22551
  score: 0,
22546
22552
  weight,
22547
22553
  hit: false,
22548
- message: `${path50} (value mismatch)`
22554
+ message: `${path51} (value mismatch)`
22549
22555
  };
22550
22556
  }
22551
22557
  /**
22552
22558
  * Numeric comparison with absolute or relative tolerance.
22553
22559
  */
22554
- compareNumericTolerance(path50, candidateValue, expectedValue, fieldConfig, weight) {
22560
+ compareNumericTolerance(path51, candidateValue, expectedValue, fieldConfig, weight) {
22555
22561
  const { tolerance = 0, relative = false } = fieldConfig;
22556
22562
  const candidateNum = toNumber(candidateValue);
22557
22563
  const expectedNum = toNumber(expectedValue);
22558
22564
  if (candidateNum === null || expectedNum === null) {
22559
22565
  return {
22560
- path: path50,
22566
+ path: path51,
22561
22567
  score: 0,
22562
22568
  weight,
22563
22569
  hit: false,
22564
- message: `${path50} (non-numeric value)`
22570
+ message: `${path51} (non-numeric value)`
22565
22571
  };
22566
22572
  }
22567
22573
  if (!Number.isFinite(candidateNum) || !Number.isFinite(expectedNum)) {
22568
22574
  return {
22569
- path: path50,
22575
+ path: path51,
22570
22576
  score: 0,
22571
22577
  weight,
22572
22578
  hit: false,
22573
- message: `${path50} (invalid numeric value)`
22579
+ message: `${path51} (invalid numeric value)`
22574
22580
  };
22575
22581
  }
22576
22582
  const diff = Math.abs(candidateNum - expectedNum);
@@ -22583,61 +22589,61 @@ var FieldAccuracyGrader = class {
22583
22589
  }
22584
22590
  if (withinTolerance) {
22585
22591
  return {
22586
- path: path50,
22592
+ path: path51,
22587
22593
  score: 1,
22588
22594
  weight,
22589
22595
  hit: true,
22590
- message: `${path50} (within tolerance: diff=${diff.toFixed(2)})`
22596
+ message: `${path51} (within tolerance: diff=${diff.toFixed(2)})`
22591
22597
  };
22592
22598
  }
22593
22599
  return {
22594
- path: path50,
22600
+ path: path51,
22595
22601
  score: 0,
22596
22602
  weight,
22597
22603
  hit: false,
22598
- message: `${path50} (outside tolerance: diff=${diff.toFixed(2)}, tolerance=${tolerance})`
22604
+ message: `${path51} (outside tolerance: diff=${diff.toFixed(2)}, tolerance=${tolerance})`
22599
22605
  };
22600
22606
  }
22601
22607
  /**
22602
22608
  * Date comparison with format normalization.
22603
22609
  */
22604
- compareDate(path50, candidateValue, expectedValue, fieldConfig, weight) {
22610
+ compareDate(path51, candidateValue, expectedValue, fieldConfig, weight) {
22605
22611
  const formats = fieldConfig.formats ?? DEFAULT_DATE_FORMATS;
22606
22612
  const candidateDate = parseDate(String(candidateValue), formats);
22607
22613
  const expectedDate = parseDate(String(expectedValue), formats);
22608
22614
  if (candidateDate === null) {
22609
22615
  return {
22610
- path: path50,
22616
+ path: path51,
22611
22617
  score: 0,
22612
22618
  weight,
22613
22619
  hit: false,
22614
- message: `${path50} (unparseable candidate date)`
22620
+ message: `${path51} (unparseable candidate date)`
22615
22621
  };
22616
22622
  }
22617
22623
  if (expectedDate === null) {
22618
22624
  return {
22619
- path: path50,
22625
+ path: path51,
22620
22626
  score: 0,
22621
22627
  weight,
22622
22628
  hit: false,
22623
- message: `${path50} (unparseable expected date)`
22629
+ message: `${path51} (unparseable expected date)`
22624
22630
  };
22625
22631
  }
22626
22632
  if (candidateDate.getFullYear() === expectedDate.getFullYear() && candidateDate.getMonth() === expectedDate.getMonth() && candidateDate.getDate() === expectedDate.getDate()) {
22627
22633
  return {
22628
- path: path50,
22634
+ path: path51,
22629
22635
  score: 1,
22630
22636
  weight,
22631
22637
  hit: true,
22632
- message: path50
22638
+ message: path51
22633
22639
  };
22634
22640
  }
22635
22641
  return {
22636
- path: path50,
22642
+ path: path51,
22637
22643
  score: 0,
22638
22644
  weight,
22639
22645
  hit: false,
22640
- message: `${path50} (date mismatch: got ${formatDateISO(candidateDate)}, expected ${formatDateISO(expectedDate)})`
22646
+ message: `${path51} (date mismatch: got ${formatDateISO(candidateDate)}, expected ${formatDateISO(expectedDate)})`
22641
22647
  };
22642
22648
  }
22643
22649
  /**
@@ -22670,11 +22676,11 @@ var FieldAccuracyGrader = class {
22670
22676
  };
22671
22677
  }
22672
22678
  };
22673
- function resolvePath(obj, path50) {
22674
- if (!path50 || !obj) {
22679
+ function resolvePath(obj, path51) {
22680
+ if (!path51 || !obj) {
22675
22681
  return void 0;
22676
22682
  }
22677
- const parts = path50.split(/\.|\[|\]/).filter((p) => p.length > 0);
22683
+ const parts = path51.split(/\.|\[|\]/).filter((p) => p.length > 0);
22678
22684
  let current = obj;
22679
22685
  for (const part of parts) {
22680
22686
  if (current === null || current === void 0) {
@@ -23205,8 +23211,8 @@ var TokenUsageGrader = class {
23205
23211
  };
23206
23212
  }
23207
23213
  };
23208
- function getNestedValue(obj, path50) {
23209
- const parts = path50.split(".");
23214
+ function getNestedValue(obj, path51) {
23215
+ const parts = path51.split(".");
23210
23216
  let current = obj;
23211
23217
  for (const part of parts) {
23212
23218
  if (current === null || current === void 0 || typeof current !== "object") {
@@ -33200,6 +33206,47 @@ async function cleanupEvalWorkspaces(evalRunId, workspaceRoot) {
33200
33206
  await rm4(evalDir, { recursive: true, force: true });
33201
33207
  }
33202
33208
  }
33209
+ function interpolateArgs(args, context) {
33210
+ const vars = {
33211
+ workspace_path: context.workspacePath,
33212
+ test_id: context.testId,
33213
+ eval_run_id: context.evalRunId,
33214
+ case_input: context.caseInput ?? "",
33215
+ case_metadata: context.caseMetadata ? JSON.stringify(context.caseMetadata) : ""
33216
+ };
33217
+ return args.map((arg) => arg.replace(/\{\{(\w+)\}\}/g, (match, name) => vars[name] ?? match));
33218
+ }
33219
+ async function executeWorkspaceScript(config2, context, failureMode = "fatal") {
33220
+ const stdin = JSON.stringify({
33221
+ workspace_path: context.workspacePath,
33222
+ test_id: context.testId,
33223
+ eval_run_id: context.evalRunId,
33224
+ case_input: context.caseInput ?? null,
33225
+ case_metadata: context.caseMetadata ?? null
33226
+ });
33227
+ const timeoutMs = config2.timeout_ms ?? (failureMode === "fatal" ? 6e4 : 3e4);
33228
+ const cwd = config2.cwd ?? context.workspaceFileDir ?? context.evalDir;
33229
+ if (config2.script !== void 0 && config2.command === void 0) {
33230
+ console.warn(
33231
+ "\x1B[33mWarning: 'script' is deprecated in workspace config. Use 'command' instead.\x1B[0m"
33232
+ );
33233
+ }
33234
+ const rawCommand = config2.command ?? config2.script ?? [];
33235
+ const commandArray = interpolateArgs(rawCommand, context);
33236
+ const result = await execFileWithStdin(commandArray, stdin, {
33237
+ timeoutMs,
33238
+ cwd
33239
+ });
33240
+ if (result.exitCode !== 0) {
33241
+ const stderr = result.stderr.trim();
33242
+ const message = stderr ? `${stderr}` : `Process exited with code ${result.exitCode}`;
33243
+ if (failureMode === "fatal") {
33244
+ throw new Error(`Script failed: ${message}`);
33245
+ }
33246
+ console.warn(`Script warning: ${message}`);
33247
+ }
33248
+ return result.stdout;
33249
+ }
33203
33250
  var GITHUB_SHORTHAND_RE = /^[A-Za-z0-9_.-]+\/[A-Za-z0-9_.-]+$/;
33204
33251
  function resolveRepoCloneUrl(repo) {
33205
33252
  const trimmed = repo.trim();
@@ -34184,46 +34231,919 @@ async function resolveWorkspaceTemplate(templatePath) {
34184
34231
  }
34185
34232
  return { dir: resolved };
34186
34233
  }
34187
- function interpolateArgs(args, context) {
34188
- const vars = {
34189
- workspace_path: context.workspacePath,
34190
- test_id: context.testId,
34191
- eval_run_id: context.evalRunId,
34192
- case_input: context.caseInput ?? "",
34193
- case_metadata: context.caseMetadata ? JSON.stringify(context.caseMetadata) : ""
34234
+ var execFileAsync2 = promisify6(execFile2);
34235
+ var WORKSPACE_GIT_TIMEOUT_MS = 3e5;
34236
+ var WorkspaceSetupError = class extends Error {
34237
+ failureStage;
34238
+ failureReasonCode;
34239
+ hookExecutions;
34240
+ constructor(message, options) {
34241
+ super(message);
34242
+ this.name = "WorkspaceSetupError";
34243
+ this.failureStage = options.failureStage;
34244
+ this.failureReasonCode = options.failureReasonCode;
34245
+ this.hookExecutions = options.hookExecutions ?? [];
34246
+ if (options.cause !== void 0) {
34247
+ this.cause = options.cause;
34248
+ }
34249
+ }
34250
+ };
34251
+ function toScriptConfig(hook, hookName, context) {
34252
+ const command = hook.command ?? hook.script;
34253
+ if (!command || command.length === 0) {
34254
+ throw new Error(`${hookName} hook in ${context} requires command or script`);
34255
+ }
34256
+ return {
34257
+ command,
34258
+ ...hook.timeout_ms !== void 0 && { timeout_ms: hook.timeout_ms },
34259
+ ...hook.timeoutMs !== void 0 && { timeoutMs: hook.timeoutMs },
34260
+ ...hook.cwd !== void 0 && { cwd: hook.cwd },
34261
+ ...hook.script !== void 0 && { script: hook.script }
34194
34262
  };
34195
- return args.map((arg) => arg.replace(/\{\{(\w+)\}\}/g, (match, name) => vars[name] ?? match));
34196
34263
  }
34197
- async function executeWorkspaceScript(config2, context, failureMode = "fatal") {
34198
- const stdin = JSON.stringify({
34199
- workspace_path: context.workspacePath,
34200
- test_id: context.testId,
34201
- eval_run_id: context.evalRunId,
34202
- case_input: context.caseInput ?? null,
34203
- case_metadata: context.caseMetadata ?? null
34204
- });
34205
- const timeoutMs = config2.timeout_ms ?? (failureMode === "fatal" ? 6e4 : 3e4);
34206
- const cwd = config2.cwd ?? context.workspaceFileDir ?? context.evalDir;
34207
- if (config2.script !== void 0 && config2.command === void 0) {
34264
+ function hasHookCommand(hook) {
34265
+ return !!(hook?.command && hook.command.length > 0 || hook?.script && hook.script.length > 0);
34266
+ }
34267
+ function hooksEnabled(workspace) {
34268
+ return workspace?.hooks?.enabled !== false;
34269
+ }
34270
+ function workspaceGitEnv() {
34271
+ const env = { ...process.env };
34272
+ for (const key of Object.keys(env)) {
34273
+ if (key.startsWith("GIT_") && key !== "GIT_SSH_COMMAND") {
34274
+ delete env[key];
34275
+ }
34276
+ }
34277
+ return {
34278
+ ...env,
34279
+ GIT_TERMINAL_PROMPT: "0",
34280
+ GIT_ASKPASS: "",
34281
+ GIT_SSH_COMMAND: "ssh -o BatchMode=yes"
34282
+ };
34283
+ }
34284
+ async function resetWorkspaceRoot(workspacePath, resetMode, baselineRef) {
34285
+ if (!existsSync6(path39.join(workspacePath, ".git"))) {
34286
+ return false;
34287
+ }
34288
+ const cleanFlag = resetMode === "strict" ? "-fdx" : "-fd";
34289
+ const opts = {
34290
+ cwd: workspacePath,
34291
+ timeout: WORKSPACE_GIT_TIMEOUT_MS,
34292
+ env: workspaceGitEnv(),
34293
+ maxBuffer: 50 * 1024 * 1024
34294
+ };
34295
+ await execFileAsync2("git", ["reset", "--hard", baselineRef ?? "HEAD"], opts);
34296
+ await execFileAsync2("git", ["clean", cleanFlag], opts);
34297
+ return true;
34298
+ }
34299
+ function commandForHook(hook) {
34300
+ return hook?.command ?? hook?.script;
34301
+ }
34302
+ function hookExecution(options) {
34303
+ const command = commandForHook(options.hook);
34304
+ return {
34305
+ scope: options.scope,
34306
+ name: options.name,
34307
+ status: options.status,
34308
+ testId: options.testId,
34309
+ ...options.workspacePath !== void 0 && { workspacePath: options.workspacePath },
34310
+ ...command !== void 0 && { command },
34311
+ ...options.hook?.cwd !== void 0 && { cwd: options.hook.cwd },
34312
+ ...options.output !== void 0 && { output: options.output },
34313
+ ...options.error !== void 0 && { error: options.error }
34314
+ };
34315
+ }
34316
+ async function releasePoolSlots(setup) {
34317
+ if (!setup.poolManager) {
34318
+ return;
34319
+ }
34320
+ if (setup.poolSlot) {
34321
+ await setup.poolManager.releaseSlot(setup.poolSlot);
34322
+ }
34323
+ for (const slot of setup.poolSlots) {
34324
+ if (slot !== setup.poolSlot) {
34325
+ await setup.poolManager.releaseSlot(slot).catch(() => {
34326
+ });
34327
+ }
34328
+ }
34329
+ }
34330
+ async function releaseSharedWorkspaceSetup(setup) {
34331
+ await releasePoolSlots(setup);
34332
+ }
34333
+ async function prepareSharedWorkspaceSetup(options) {
34334
+ const {
34335
+ evalRunId,
34336
+ evalCases,
34337
+ targetHooks,
34338
+ evalDir,
34339
+ verbose,
34340
+ workers,
34341
+ poolMaxSlots: configPoolMaxSlots,
34342
+ workspacePath,
34343
+ legacyWorkspacePath,
34344
+ workspaceMode,
34345
+ workspaceClean
34346
+ } = options;
34347
+ const suiteWorkspace = evalCases[0]?.workspace;
34348
+ const rawTemplate = suiteWorkspace?.template;
34349
+ const resolvedTemplate = await resolveWorkspaceTemplate(rawTemplate);
34350
+ const workspaceTemplate = resolvedTemplate?.dir;
34351
+ let suiteWorkspaceFile = resolvedTemplate?.workspaceFile;
34352
+ const setupLog = (message) => {
34353
+ if (verbose) {
34354
+ console.log(`[setup] ${message}`);
34355
+ }
34356
+ };
34357
+ const isPerTestIsolation = suiteWorkspace?.isolation === "per_test";
34358
+ const cliWorkspacePath = workspacePath ?? legacyWorkspacePath;
34359
+ const yamlWorkspacePath = suiteWorkspace?.path;
34360
+ if (cliWorkspacePath && workspaceMode && workspaceMode !== "static") {
34361
+ throw new Error("--workspace-path requires --workspace-mode static when both are provided");
34362
+ }
34363
+ let configuredMode = cliWorkspacePath ? "static" : workspaceMode ?? suiteWorkspace?.mode ?? (yamlWorkspacePath ? "static" : "pooled");
34364
+ const configuredStaticPath = cliWorkspacePath ?? yamlWorkspacePath;
34365
+ if (configuredMode === "static" && !configuredStaticPath) {
34366
+ if (!suiteWorkspace?.repos?.length) {
34367
+ setupLog("workspace.mode=static with no path and no repos \u2014 falling back to temp mode");
34368
+ configuredMode = "temp";
34369
+ } else {
34370
+ throw new Error("workspace.mode=static requires workspace.path or --workspace-path");
34371
+ }
34372
+ }
34373
+ const useStaticWorkspace = configuredMode === "static";
34374
+ if (useStaticWorkspace && isPerTestIsolation) {
34375
+ throw new Error(
34376
+ "static workspace mode is incompatible with isolation: per_test. Use isolation: shared (default)."
34377
+ );
34378
+ }
34379
+ if (configuredMode !== "static" && configuredStaticPath) {
34380
+ throw new Error("workspace.path requires workspace.mode=static");
34381
+ }
34382
+ const hasSharedWorkspace = !!(useStaticWorkspace || !isPerTestIsolation && (workspaceTemplate || suiteWorkspace?.hooks || suiteWorkspace?.repos?.length));
34383
+ const poolEnabled = configuredMode === "pooled";
34384
+ const usePool = poolEnabled !== false && !!suiteWorkspace?.repos?.length && !isPerTestIsolation && !useStaticWorkspace;
34385
+ setupLog(
34386
+ `sharedWorkspace=${hasSharedWorkspace} perTestIsolation=${isPerTestIsolation} usePool=${usePool} workers=${workers}`
34387
+ );
34388
+ if (hasSharedWorkspace && !usePool && workers > 1 && evalCases.length > 1) {
34208
34389
  console.warn(
34209
- "\x1B[33mWarning: 'script' is deprecated in workspace config. Use 'command' instead.\x1B[0m"
34390
+ [
34391
+ `Warning: This eval uses a shared workspace with ${workers} workers.`,
34392
+ "If the agent under test makes file edits, concurrent runs may corrupt each other.",
34393
+ "To limit concurrency, add this to your eval YAML:",
34394
+ "",
34395
+ " execution:",
34396
+ " workers: 1",
34397
+ "",
34398
+ "Or pass --workers 1 on the command line."
34399
+ ].join("\n")
34210
34400
  );
34211
34401
  }
34212
- const rawCommand = config2.command ?? config2.script ?? [];
34213
- const commandArray = interpolateArgs(rawCommand, context);
34214
- const result = await execFileWithStdin(commandArray, stdin, {
34215
- timeoutMs,
34216
- cwd
34217
- });
34218
- if (result.exitCode !== 0) {
34219
- const stderr = result.stderr.trim();
34220
- const message = stderr ? `${stderr}` : `Process exited with code ${result.exitCode}`;
34221
- if (failureMode === "fatal") {
34222
- throw new Error(`Script failed: ${message}`);
34402
+ let sharedWorkspacePath;
34403
+ let sharedBaselineCommit;
34404
+ let beforeAllOutput;
34405
+ let poolManager;
34406
+ let poolSlot;
34407
+ const poolSlots = [];
34408
+ const availablePoolSlots = [];
34409
+ const poolSlotBaselines = /* @__PURE__ */ new Map();
34410
+ const hookExecutions = [];
34411
+ const poolMaxSlots = Math.min(configPoolMaxSlots ?? 10, 50);
34412
+ let staticMaterialised = false;
34413
+ const isYamlConfiguredPath = !cliWorkspacePath && !!yamlWorkspacePath;
34414
+ let repoManager;
34415
+ try {
34416
+ if (useStaticWorkspace && configuredStaticPath) {
34417
+ const dirExists = await stat8(configuredStaticPath).then(
34418
+ (s) => s.isDirectory(),
34419
+ () => false
34420
+ );
34421
+ const isEmpty = dirExists ? (await readdir8(configuredStaticPath)).length === 0 : false;
34422
+ if (isYamlConfiguredPath && (!dirExists || isEmpty)) {
34423
+ if (!dirExists) {
34424
+ await mkdir17(configuredStaticPath, { recursive: true });
34425
+ }
34426
+ if (workspaceTemplate) {
34427
+ await copyDirectoryRecursive(workspaceTemplate, configuredStaticPath);
34428
+ setupLog(`copied template into static workspace: ${configuredStaticPath}`);
34429
+ }
34430
+ staticMaterialised = true;
34431
+ setupLog(`materialised static workspace at: ${configuredStaticPath}`);
34432
+ } else {
34433
+ setupLog(`reusing existing static workspace: ${configuredStaticPath}`);
34434
+ }
34435
+ sharedWorkspacePath = configuredStaticPath;
34436
+ } else if (!isPerTestIsolation && usePool && suiteWorkspace?.repos) {
34437
+ const slotsNeeded = workers;
34438
+ setupLog(`acquiring ${slotsNeeded} workspace pool slot(s) (pool capacity: ${poolMaxSlots})`);
34439
+ poolManager = new WorkspacePoolManager(getWorkspacePoolRoot());
34440
+ const poolRepoManager = new RepoManager(verbose);
34441
+ for (let i = 0; i < slotsNeeded; i++) {
34442
+ const slot = await poolManager.acquireWorkspace({
34443
+ templatePath: workspaceTemplate,
34444
+ repos: suiteWorkspace.repos,
34445
+ maxSlots: poolMaxSlots,
34446
+ repoManager: poolRepoManager,
34447
+ poolReset: (workspaceClean === "full" ? "strict" : workspaceClean === "standard" ? "fast" : null) ?? "fast"
34448
+ });
34449
+ poolSlots.push(slot);
34450
+ setupLog(`pool slot ${i} acquired at: ${slot.path} (existing=${slot.isExisting})`);
34451
+ }
34452
+ if (slotsNeeded === 1) {
34453
+ poolSlot = poolSlots[0];
34454
+ sharedWorkspacePath = poolSlot.path;
34455
+ } else {
34456
+ availablePoolSlots.push(...poolSlots);
34457
+ }
34458
+ } else if (!isPerTestIsolation && workspaceTemplate) {
34459
+ setupLog(`creating shared workspace from template: ${workspaceTemplate}`);
34460
+ try {
34461
+ sharedWorkspacePath = await createTempWorkspace(workspaceTemplate, evalRunId, "shared");
34462
+ setupLog(`shared workspace created at: ${sharedWorkspacePath}`);
34463
+ } catch (error40) {
34464
+ const message = error40 instanceof Error ? error40.message : String(error40);
34465
+ throw new WorkspaceSetupError(`Failed to create shared workspace: ${message}`, {
34466
+ failureStage: "setup",
34467
+ failureReasonCode: "template_error",
34468
+ hookExecutions,
34469
+ cause: error40
34470
+ });
34471
+ }
34472
+ } else if (!isPerTestIsolation && (suiteWorkspace?.hooks || suiteWorkspace?.repos?.length)) {
34473
+ sharedWorkspacePath = getWorkspacePath(evalRunId, "shared");
34474
+ await mkdir17(sharedWorkspacePath, { recursive: true });
34475
+ setupLog(`created empty shared workspace at: ${sharedWorkspacePath}`);
34223
34476
  }
34224
- console.warn(`Script warning: ${message}`);
34477
+ if (suiteWorkspaceFile && sharedWorkspacePath) {
34478
+ const copiedWorkspaceFile = path39.join(sharedWorkspacePath, path39.basename(suiteWorkspaceFile));
34479
+ try {
34480
+ await stat8(copiedWorkspaceFile);
34481
+ suiteWorkspaceFile = copiedWorkspaceFile;
34482
+ } catch {
34483
+ }
34484
+ }
34485
+ const hasReposToMaterialize = !!suiteWorkspace?.repos?.length && !usePool && !isPerTestIsolation;
34486
+ const needsRepoMaterialisation = hasReposToMaterialize && (!useStaticWorkspace || staticMaterialised);
34487
+ const needsPerRepoCheck = hasReposToMaterialize && useStaticWorkspace && !staticMaterialised && isYamlConfiguredPath;
34488
+ repoManager = needsRepoMaterialisation || needsPerRepoCheck ? new RepoManager(verbose) : void 0;
34489
+ if (repoManager && sharedWorkspacePath && suiteWorkspace?.repos) {
34490
+ try {
34491
+ if (needsPerRepoCheck) {
34492
+ for (const repo of suiteWorkspace.repos) {
34493
+ if (!repo.path || !repo.repo) continue;
34494
+ const targetDir = path39.join(sharedWorkspacePath, repo.path);
34495
+ if (existsSync6(targetDir)) {
34496
+ setupLog(`reusing existing repo at: ${targetDir}`);
34497
+ continue;
34498
+ }
34499
+ setupLog(`materializing missing repo: ${repo.path}`);
34500
+ await repoManager.materialize(repo, sharedWorkspacePath);
34501
+ }
34502
+ } else {
34503
+ setupLog(
34504
+ `materializing ${suiteWorkspace.repos.length} shared repo(s) into ${sharedWorkspacePath}`
34505
+ );
34506
+ await repoManager.materializeAll(suiteWorkspace.repos, sharedWorkspacePath);
34507
+ }
34508
+ setupLog("shared repo materialization complete");
34509
+ } catch (error40) {
34510
+ const message = error40 instanceof Error ? error40.message : String(error40);
34511
+ if (sharedWorkspacePath && !useStaticWorkspace) {
34512
+ await cleanupWorkspace(sharedWorkspacePath).catch(() => {
34513
+ });
34514
+ }
34515
+ throw new WorkspaceSetupError(`Failed to materialize repos: ${message}`, {
34516
+ failureStage: "repo_setup",
34517
+ failureReasonCode: "clone_error",
34518
+ hookExecutions,
34519
+ cause: error40
34520
+ });
34521
+ }
34522
+ }
34523
+ const suiteDockerConfig = suiteWorkspace?.docker;
34524
+ if (suiteDockerConfig) {
34525
+ setupLog(`pulling Docker image: ${suiteDockerConfig.image}`);
34526
+ const { DockerWorkspaceProvider } = await import("./docker-workspace-RPPXBT27-B4AQHVWA.js");
34527
+ const dockerSetup = new DockerWorkspaceProvider(suiteDockerConfig);
34528
+ if (!await dockerSetup.isDockerAvailable()) {
34529
+ throw new Error(
34530
+ "Docker workspace configured but Docker CLI is not available. Install Docker and ensure it is running."
34531
+ );
34532
+ }
34533
+ await dockerSetup.pullImage();
34534
+ setupLog("Docker image pull complete");
34535
+ }
34536
+ if (suiteWorkspace?.env) {
34537
+ try {
34538
+ await runPreflightChecks(suiteWorkspace.env, sharedWorkspacePath ?? void 0, setupLog);
34539
+ setupLog("preflight checks passed");
34540
+ } catch (error40) {
34541
+ const message = error40 instanceof Error ? error40.message : String(error40);
34542
+ if (sharedWorkspacePath && !useStaticWorkspace) {
34543
+ await cleanupWorkspace(sharedWorkspacePath).catch(() => {
34544
+ });
34545
+ }
34546
+ throw new WorkspaceSetupError(message, {
34547
+ failureStage: "setup",
34548
+ failureReasonCode: "preflight_error",
34549
+ hookExecutions,
34550
+ cause: error40
34551
+ });
34552
+ }
34553
+ }
34554
+ const suiteHooksEnabled = hooksEnabled(suiteWorkspace);
34555
+ const suiteBeforeAllHook = suiteWorkspace?.hooks?.before_all;
34556
+ if (sharedWorkspacePath && suiteHooksEnabled && hasHookCommand(suiteBeforeAllHook)) {
34557
+ const beforeAllHook = suiteBeforeAllHook;
34558
+ const beforeAllCommand = (beforeAllHook.command ?? beforeAllHook.script ?? []).join(" ");
34559
+ setupLog(
34560
+ `running shared before_all in cwd=${beforeAllHook.cwd ?? evalDir} command=${beforeAllCommand}`
34561
+ );
34562
+ const scriptContext = {
34563
+ workspacePath: sharedWorkspacePath,
34564
+ testId: "__before_all__",
34565
+ evalRunId,
34566
+ evalDir,
34567
+ workspaceFileDir: suiteWorkspace?.workspaceFileDir
34568
+ };
34569
+ try {
34570
+ beforeAllOutput = await executeWorkspaceScript(
34571
+ toScriptConfig(beforeAllHook, "before_all", "suite workspace"),
34572
+ scriptContext
34573
+ );
34574
+ hookExecutions.push(
34575
+ hookExecution({
34576
+ scope: "workspace",
34577
+ name: "before_all",
34578
+ status: "success",
34579
+ testId: "__before_all__",
34580
+ workspacePath: sharedWorkspacePath,
34581
+ hook: beforeAllHook,
34582
+ output: beforeAllOutput
34583
+ })
34584
+ );
34585
+ setupLog("shared before_all completed");
34586
+ } catch (error40) {
34587
+ const message = error40 instanceof Error ? error40.message : String(error40);
34588
+ hookExecutions.push(
34589
+ hookExecution({
34590
+ scope: "workspace",
34591
+ name: "before_all",
34592
+ status: "failed",
34593
+ testId: "__before_all__",
34594
+ workspacePath: sharedWorkspacePath,
34595
+ hook: beforeAllHook,
34596
+ error: message
34597
+ })
34598
+ );
34599
+ if (sharedWorkspacePath && !useStaticWorkspace) {
34600
+ await cleanupWorkspace(sharedWorkspacePath).catch(() => {
34601
+ });
34602
+ }
34603
+ throw new WorkspaceSetupError(`before_all script failed: ${message}`, {
34604
+ failureStage: "setup",
34605
+ failureReasonCode: "script_error",
34606
+ hookExecutions,
34607
+ cause: error40
34608
+ });
34609
+ }
34610
+ }
34611
+ if (availablePoolSlots.length > 0 && suiteHooksEnabled && hasHookCommand(suiteBeforeAllHook)) {
34612
+ const beforeAllHook = suiteBeforeAllHook;
34613
+ for (const slot of availablePoolSlots) {
34614
+ setupLog(`running before_all on pool slot ${slot.index}`);
34615
+ const scriptContext = {
34616
+ workspacePath: slot.path,
34617
+ testId: "__before_all__",
34618
+ evalRunId,
34619
+ evalDir,
34620
+ workspaceFileDir: suiteWorkspace?.workspaceFileDir
34621
+ };
34622
+ try {
34623
+ const output = await executeWorkspaceScript(
34624
+ toScriptConfig(beforeAllHook, "before_all", "suite workspace"),
34625
+ scriptContext
34626
+ );
34627
+ if (!beforeAllOutput) beforeAllOutput = output;
34628
+ hookExecutions.push(
34629
+ hookExecution({
34630
+ scope: "workspace",
34631
+ name: "before_all",
34632
+ status: "success",
34633
+ testId: "__before_all__",
34634
+ workspacePath: slot.path,
34635
+ hook: beforeAllHook,
34636
+ output
34637
+ })
34638
+ );
34639
+ setupLog(`before_all completed on pool slot ${slot.index}`);
34640
+ } catch (error40) {
34641
+ const message = error40 instanceof Error ? error40.message : String(error40);
34642
+ hookExecutions.push(
34643
+ hookExecution({
34644
+ scope: "workspace",
34645
+ name: "before_all",
34646
+ status: "failed",
34647
+ testId: "__before_all__",
34648
+ workspacePath: slot.path,
34649
+ hook: beforeAllHook,
34650
+ error: message
34651
+ })
34652
+ );
34653
+ throw new WorkspaceSetupError(
34654
+ `before_all script failed on pool slot ${slot.index}: ${message}`,
34655
+ {
34656
+ failureStage: "setup",
34657
+ failureReasonCode: "script_error",
34658
+ hookExecutions,
34659
+ cause: error40
34660
+ }
34661
+ );
34662
+ }
34663
+ }
34664
+ }
34665
+ const targetBeforeAllHook = targetHooks?.before_all;
34666
+ if (sharedWorkspacePath && hasHookCommand(targetBeforeAllHook)) {
34667
+ const beforeAllCommand = (targetBeforeAllHook.command ?? []).join(" ");
34668
+ setupLog(`running target before_all command=${beforeAllCommand}`);
34669
+ const scriptContext = {
34670
+ workspacePath: sharedWorkspacePath,
34671
+ testId: "__target_before_all__",
34672
+ evalRunId,
34673
+ evalDir,
34674
+ workspaceFileDir: suiteWorkspace?.workspaceFileDir
34675
+ };
34676
+ try {
34677
+ await executeWorkspaceScript(
34678
+ toScriptConfig(targetBeforeAllHook, "before_all", "target hooks"),
34679
+ scriptContext
34680
+ );
34681
+ hookExecutions.push(
34682
+ hookExecution({
34683
+ scope: "target",
34684
+ name: "before_all",
34685
+ status: "success",
34686
+ testId: "__target_before_all__",
34687
+ workspacePath: sharedWorkspacePath,
34688
+ hook: targetBeforeAllHook
34689
+ })
34690
+ );
34691
+ setupLog("target before_all completed");
34692
+ } catch (error40) {
34693
+ const message = error40 instanceof Error ? error40.message : String(error40);
34694
+ hookExecutions.push(
34695
+ hookExecution({
34696
+ scope: "target",
34697
+ name: "before_all",
34698
+ status: "failed",
34699
+ testId: "__target_before_all__",
34700
+ workspacePath: sharedWorkspacePath,
34701
+ hook: targetBeforeAllHook,
34702
+ error: message
34703
+ })
34704
+ );
34705
+ if (sharedWorkspacePath && !useStaticWorkspace) {
34706
+ await cleanupWorkspace(sharedWorkspacePath).catch(() => {
34707
+ });
34708
+ }
34709
+ throw new WorkspaceSetupError(`target before_all hook failed: ${message}`, {
34710
+ failureStage: "setup",
34711
+ failureReasonCode: "script_error",
34712
+ hookExecutions,
34713
+ cause: error40
34714
+ });
34715
+ }
34716
+ }
34717
+ if (availablePoolSlots.length > 0 && hasHookCommand(targetBeforeAllHook)) {
34718
+ for (const slot of availablePoolSlots) {
34719
+ setupLog(`running target before_all on pool slot ${slot.index}`);
34720
+ const scriptContext = {
34721
+ workspacePath: slot.path,
34722
+ testId: "__target_before_all__",
34723
+ evalRunId,
34724
+ evalDir,
34725
+ workspaceFileDir: suiteWorkspace?.workspaceFileDir
34726
+ };
34727
+ try {
34728
+ await executeWorkspaceScript(
34729
+ toScriptConfig(targetBeforeAllHook, "before_all", "target hooks"),
34730
+ scriptContext
34731
+ );
34732
+ hookExecutions.push(
34733
+ hookExecution({
34734
+ scope: "target",
34735
+ name: "before_all",
34736
+ status: "success",
34737
+ testId: "__target_before_all__",
34738
+ workspacePath: slot.path,
34739
+ hook: targetBeforeAllHook
34740
+ })
34741
+ );
34742
+ } catch (error40) {
34743
+ const message = error40 instanceof Error ? error40.message : String(error40);
34744
+ hookExecutions.push(
34745
+ hookExecution({
34746
+ scope: "target",
34747
+ name: "before_all",
34748
+ status: "failed",
34749
+ testId: "__target_before_all__",
34750
+ workspacePath: slot.path,
34751
+ hook: targetBeforeAllHook,
34752
+ error: message
34753
+ })
34754
+ );
34755
+ throw new WorkspaceSetupError(
34756
+ `target before_all hook failed on pool slot ${slot.index}: ${message}`,
34757
+ {
34758
+ failureStage: "setup",
34759
+ failureReasonCode: "script_error",
34760
+ hookExecutions,
34761
+ cause: error40
34762
+ }
34763
+ );
34764
+ }
34765
+ }
34766
+ }
34767
+ if (sharedWorkspacePath) {
34768
+ try {
34769
+ sharedBaselineCommit = await initializeBaseline(sharedWorkspacePath);
34770
+ setupLog(`shared baseline initialized: ${sharedBaselineCommit}`);
34771
+ } catch (error40) {
34772
+ const message = error40 instanceof Error ? error40.message : String(error40);
34773
+ setupLog(`shared baseline initialization failed (file_changes unavailable): ${message}`);
34774
+ }
34775
+ }
34776
+ if (availablePoolSlots.length > 0) {
34777
+ for (const slot of availablePoolSlots) {
34778
+ try {
34779
+ const baseline = await initializeBaseline(slot.path);
34780
+ poolSlotBaselines.set(slot.path, baseline);
34781
+ setupLog(`pool slot ${slot.index} baseline initialized: ${baseline}`);
34782
+ } catch (error40) {
34783
+ const message = error40 instanceof Error ? error40.message : String(error40);
34784
+ setupLog(
34785
+ `pool slot ${slot.index} baseline initialization failed (file_changes unavailable): ${message}`
34786
+ );
34787
+ }
34788
+ }
34789
+ }
34790
+ return {
34791
+ ...suiteWorkspace !== void 0 && { suiteWorkspace },
34792
+ ...sharedWorkspacePath !== void 0 && { sharedWorkspacePath },
34793
+ ...sharedBaselineCommit !== void 0 && { sharedBaselineCommit },
34794
+ ...suiteWorkspaceFile !== void 0 && { suiteWorkspaceFile },
34795
+ ...beforeAllOutput !== void 0 && { beforeAllOutput },
34796
+ ...repoManager !== void 0 && { repoManager },
34797
+ ...poolManager !== void 0 && { poolManager },
34798
+ ...poolSlot !== void 0 && { poolSlot },
34799
+ poolSlots,
34800
+ availablePoolSlots,
34801
+ poolSlotBaselines,
34802
+ useStaticWorkspace,
34803
+ configuredMode,
34804
+ hookExecutions
34805
+ };
34806
+ } catch (error40) {
34807
+ await releasePoolSlots({ poolManager, poolSlot, poolSlots }).catch(() => {
34808
+ });
34809
+ throw error40;
34810
+ }
34811
+ }
34812
+ async function prepareEvalCaseWorkspace(options) {
34813
+ const {
34814
+ evalCase,
34815
+ evalRunId,
34816
+ sharedWorkspacePath,
34817
+ sharedBaselineCommit,
34818
+ suiteWorkspaceFile,
34819
+ repoManager,
34820
+ evalDir,
34821
+ cleanupWorkspaces: forceCleanup,
34822
+ targetHooks,
34823
+ setupDebug
34824
+ } = options;
34825
+ let workspacePath = sharedWorkspacePath;
34826
+ let beforeAllOutput;
34827
+ let beforeEachOutput;
34828
+ const isSharedWorkspace = !!sharedWorkspacePath;
34829
+ let caseWorkspaceFile;
34830
+ const caseHooksEnabled = hooksEnabled(evalCase.workspace);
34831
+ const hookExecutions = [];
34832
+ if (!workspacePath) {
34833
+ const rawCaseTemplate = evalCase.workspace?.template;
34834
+ const resolvedCaseTemplate = await resolveWorkspaceTemplate(rawCaseTemplate);
34835
+ const caseWorkspaceTemplate = resolvedCaseTemplate?.dir;
34836
+ caseWorkspaceFile = resolvedCaseTemplate?.workspaceFile;
34837
+ if (caseWorkspaceTemplate && evalRunId) {
34838
+ try {
34839
+ workspacePath = await createTempWorkspace(caseWorkspaceTemplate, evalRunId, evalCase.id);
34840
+ } catch (error40) {
34841
+ const message = error40 instanceof Error ? error40.message : String(error40);
34842
+ throw new WorkspaceSetupError(`Failed to create workspace: ${message}`, {
34843
+ failureStage: "setup",
34844
+ failureReasonCode: "template_error",
34845
+ hookExecutions,
34846
+ cause: error40
34847
+ });
34848
+ }
34849
+ if (caseWorkspaceFile && workspacePath) {
34850
+ const copiedFile = path39.join(workspacePath, path39.basename(caseWorkspaceFile));
34851
+ try {
34852
+ await stat8(copiedFile);
34853
+ caseWorkspaceFile = copiedFile;
34854
+ } catch {
34855
+ }
34856
+ }
34857
+ }
34858
+ if (!workspacePath && (evalCase.workspace?.hooks || evalCase.workspace?.repos?.length) && evalRunId) {
34859
+ workspacePath = getWorkspacePath(evalRunId, evalCase.id);
34860
+ await mkdir17(workspacePath, { recursive: true });
34861
+ }
34862
+ if (evalCase.workspace?.repos?.length && workspacePath) {
34863
+ const perCaseRepoManager = new RepoManager(setupDebug);
34864
+ try {
34865
+ if (setupDebug) {
34866
+ console.log(
34867
+ `[setup] test=${evalCase.id} materializing ${evalCase.workspace.repos.length} per-test repo(s) into ${workspacePath}`
34868
+ );
34869
+ }
34870
+ await perCaseRepoManager.materializeAll(evalCase.workspace.repos, workspacePath);
34871
+ if (setupDebug) {
34872
+ console.log(`[setup] test=${evalCase.id} per-test repo materialization complete`);
34873
+ }
34874
+ } catch (error40) {
34875
+ const message = error40 instanceof Error ? error40.message : String(error40);
34876
+ throw new WorkspaceSetupError(`Failed to materialize repos: ${message}`, {
34877
+ failureStage: "repo_setup",
34878
+ failureReasonCode: "clone_error",
34879
+ hookExecutions,
34880
+ cause: error40
34881
+ });
34882
+ }
34883
+ }
34884
+ if (workspacePath && evalCase.metadata?.agent_skills_files) {
34885
+ const baseDir = evalCase.metadata.agent_skills_base_dir;
34886
+ const files = evalCase.metadata.agent_skills_files;
34887
+ if (baseDir && files.length > 0) {
34888
+ for (const relPath of files) {
34889
+ const srcPath = path39.resolve(baseDir, relPath);
34890
+ const destPath = path39.resolve(workspacePath, relPath);
34891
+ try {
34892
+ await mkdir17(path39.dirname(destPath), { recursive: true });
34893
+ await copyFile2(srcPath, destPath);
34894
+ } catch (error40) {
34895
+ const message = error40 instanceof Error ? error40.message : String(error40);
34896
+ throw new WorkspaceSetupError(
34897
+ `Agent Skills eval file not found: ${relPath} (resolved from ${baseDir}): ${message}`,
34898
+ {
34899
+ failureStage: "setup",
34900
+ failureReasonCode: "file_copy_error",
34901
+ hookExecutions,
34902
+ cause: error40
34903
+ }
34904
+ );
34905
+ }
34906
+ }
34907
+ }
34908
+ }
34909
+ const caseBeforeAllHook = evalCase.workspace?.hooks?.before_all;
34910
+ if (workspacePath && caseHooksEnabled && hasHookCommand(caseBeforeAllHook)) {
34911
+ const beforeAllHook = caseBeforeAllHook;
34912
+ const beforeAllCommand = (beforeAllHook.command ?? beforeAllHook.script ?? []).join(" ");
34913
+ if (setupDebug) {
34914
+ console.log(
34915
+ `[setup] test=${evalCase.id} running before_all in cwd=${beforeAllHook.cwd ?? evalDir} command=${beforeAllCommand}`
34916
+ );
34917
+ }
34918
+ const scriptContext = {
34919
+ workspacePath,
34920
+ testId: evalCase.id,
34921
+ evalRunId: evalRunId ?? "",
34922
+ caseInput: evalCase.question,
34923
+ caseMetadata: evalCase.metadata,
34924
+ evalDir,
34925
+ workspaceFileDir: evalCase.workspace?.workspaceFileDir
34926
+ };
34927
+ try {
34928
+ beforeAllOutput = await executeWorkspaceScript(
34929
+ toScriptConfig(beforeAllHook, "before_all", `test '${evalCase.id}'`),
34930
+ scriptContext
34931
+ );
34932
+ hookExecutions.push(
34933
+ hookExecution({
34934
+ scope: "workspace",
34935
+ name: "before_all",
34936
+ status: "success",
34937
+ testId: evalCase.id,
34938
+ workspacePath,
34939
+ hook: beforeAllHook,
34940
+ output: beforeAllOutput
34941
+ })
34942
+ );
34943
+ if (setupDebug) {
34944
+ console.log(`[setup] test=${evalCase.id} before_all completed`);
34945
+ }
34946
+ } catch (error40) {
34947
+ const message = error40 instanceof Error ? error40.message : String(error40);
34948
+ hookExecutions.push(
34949
+ hookExecution({
34950
+ scope: "workspace",
34951
+ name: "before_all",
34952
+ status: "failed",
34953
+ testId: evalCase.id,
34954
+ workspacePath,
34955
+ hook: beforeAllHook,
34956
+ error: message
34957
+ })
34958
+ );
34959
+ if (forceCleanup && workspacePath) {
34960
+ await cleanupWorkspace(workspacePath).catch(() => {
34961
+ });
34962
+ }
34963
+ throw new WorkspaceSetupError(`before_all script failed: ${message}`, {
34964
+ failureStage: "setup",
34965
+ failureReasonCode: "script_error",
34966
+ hookExecutions,
34967
+ cause: error40
34968
+ });
34969
+ }
34970
+ }
34971
+ }
34972
+ let beforeEachNeedsFreshBaseline = false;
34973
+ if (caseHooksEnabled && workspacePath && evalCase.workspace?.hooks?.before_each?.reset && evalCase.workspace.hooks.before_each.reset !== "none") {
34974
+ try {
34975
+ if (repoManager && evalCase.workspace.repos?.length) {
34976
+ await repoManager.reset(
34977
+ evalCase.workspace.repos,
34978
+ workspacePath,
34979
+ evalCase.workspace.hooks.before_each.reset
34980
+ );
34981
+ } else {
34982
+ await resetWorkspaceRoot(
34983
+ workspacePath,
34984
+ evalCase.workspace.hooks.before_each.reset,
34985
+ sharedBaselineCommit
34986
+ );
34987
+ }
34988
+ } catch (error40) {
34989
+ const message = error40 instanceof Error ? error40.message : String(error40);
34990
+ throw new WorkspaceSetupError(`before_each reset failed: ${message}`, {
34991
+ failureStage: "setup",
34992
+ failureReasonCode: "script_error",
34993
+ hookExecutions,
34994
+ cause: error40
34995
+ });
34996
+ }
34997
+ }
34998
+ const caseBeforeEachHook = evalCase.workspace?.hooks?.before_each;
34999
+ if (workspacePath && caseHooksEnabled && hasHookCommand(caseBeforeEachHook)) {
35000
+ const beforeEachHook = caseBeforeEachHook;
35001
+ const scriptContext = {
35002
+ workspacePath,
35003
+ testId: evalCase.id,
35004
+ evalRunId: evalRunId ?? "",
35005
+ caseInput: evalCase.question,
35006
+ caseMetadata: evalCase.metadata,
35007
+ evalDir,
35008
+ workspaceFileDir: evalCase.workspace?.workspaceFileDir
35009
+ };
35010
+ try {
35011
+ beforeEachOutput = await executeWorkspaceScript(
35012
+ toScriptConfig(beforeEachHook, "before_each", `test '${evalCase.id}'`),
35013
+ scriptContext
35014
+ );
35015
+ hookExecutions.push(
35016
+ hookExecution({
35017
+ scope: "workspace",
35018
+ name: "before_each",
35019
+ status: "success",
35020
+ testId: evalCase.id,
35021
+ workspacePath,
35022
+ hook: beforeEachHook,
35023
+ output: beforeEachOutput
35024
+ })
35025
+ );
35026
+ beforeEachNeedsFreshBaseline = true;
35027
+ } catch (error40) {
35028
+ const message = error40 instanceof Error ? error40.message : String(error40);
35029
+ hookExecutions.push(
35030
+ hookExecution({
35031
+ scope: "workspace",
35032
+ name: "before_each",
35033
+ status: "failed",
35034
+ testId: evalCase.id,
35035
+ workspacePath,
35036
+ hook: beforeEachHook,
35037
+ error: message
35038
+ })
35039
+ );
35040
+ throw new WorkspaceSetupError(`before_each script failed: ${message}`, {
35041
+ failureStage: "setup",
35042
+ failureReasonCode: "script_error",
35043
+ hookExecutions,
35044
+ cause: error40
35045
+ });
35046
+ }
35047
+ }
35048
+ const targetBeforeEachHook = targetHooks?.before_each;
35049
+ if (workspacePath && hasHookCommand(targetBeforeEachHook)) {
35050
+ const scriptContext = {
35051
+ workspacePath,
35052
+ testId: evalCase.id,
35053
+ evalRunId: evalRunId ?? "",
35054
+ caseInput: evalCase.question,
35055
+ caseMetadata: evalCase.metadata,
35056
+ evalDir,
35057
+ workspaceFileDir: evalCase.workspace?.workspaceFileDir
35058
+ };
35059
+ try {
35060
+ await executeWorkspaceScript(
35061
+ toScriptConfig(targetBeforeEachHook, "before_each", `target hook for '${evalCase.id}'`),
35062
+ scriptContext
35063
+ );
35064
+ hookExecutions.push(
35065
+ hookExecution({
35066
+ scope: "target",
35067
+ name: "before_each",
35068
+ status: "success",
35069
+ testId: evalCase.id,
35070
+ workspacePath,
35071
+ hook: targetBeforeEachHook
35072
+ })
35073
+ );
35074
+ beforeEachNeedsFreshBaseline = true;
35075
+ } catch (error40) {
35076
+ const message = error40 instanceof Error ? error40.message : String(error40);
35077
+ hookExecutions.push(
35078
+ hookExecution({
35079
+ scope: "target",
35080
+ name: "before_each",
35081
+ status: "failed",
35082
+ testId: evalCase.id,
35083
+ workspacePath,
35084
+ hook: targetBeforeEachHook,
35085
+ error: message
35086
+ })
35087
+ );
35088
+ throw new WorkspaceSetupError(`target before_each hook failed: ${message}`, {
35089
+ failureStage: "setup",
35090
+ failureReasonCode: "script_error",
35091
+ hookExecutions,
35092
+ cause: error40
35093
+ });
35094
+ }
35095
+ }
35096
+ let baselineCommit = beforeEachNeedsFreshBaseline ? void 0 : sharedBaselineCommit;
35097
+ if (!baselineCommit && workspacePath) {
35098
+ try {
35099
+ baselineCommit = await initializeBaseline(workspacePath);
35100
+ } catch (error40) {
35101
+ const message = error40 instanceof Error ? error40.message : String(error40);
35102
+ if (setupDebug) {
35103
+ console.warn(`[setup] test=${evalCase.id} baseline initialization failed: ${message}`);
35104
+ }
35105
+ }
35106
+ }
35107
+ return {
35108
+ ...workspacePath !== void 0 && { workspacePath },
35109
+ caseWorkspaceFile: caseWorkspaceFile ?? suiteWorkspaceFile,
35110
+ ...beforeAllOutput !== void 0 && { beforeAllOutput },
35111
+ ...beforeEachOutput !== void 0 && { beforeEachOutput },
35112
+ ...baselineCommit !== void 0 && { baselineCommit },
35113
+ isSharedWorkspace,
35114
+ hookExecutions
35115
+ };
35116
+ }
35117
+ async function runPreflightChecks(env, cwd, log) {
35118
+ const missing = [];
35119
+ for (const cmd of env.required_commands ?? []) {
35120
+ log(`preflight: checking command "${cmd}"`);
35121
+ try {
35122
+ if (process.platform === "win32") {
35123
+ await execFileAsync2("where", [cmd], { cwd });
35124
+ } else {
35125
+ await execFileAsync2("sh", ["-c", `command -v ${cmd}`], { cwd });
35126
+ }
35127
+ } catch {
35128
+ missing.push(`command: ${cmd}`);
35129
+ }
35130
+ }
35131
+ for (const mod of env.required_python_modules ?? []) {
35132
+ log(`preflight: checking Python module "${mod}"`);
35133
+ try {
35134
+ await execFileAsync2("python3", ["-c", `import ${mod}`], { cwd });
35135
+ } catch {
35136
+ missing.push(`python module: ${mod}`);
35137
+ }
35138
+ }
35139
+ if (missing.length > 0) {
35140
+ throw new Error(
35141
+ `Preflight checks failed \u2014 missing dependencies:
35142
+ ${missing.map((m) => ` \u2022 ${m}`).join("\n")}
35143
+
35144
+ Install the missing dependencies before running this eval.`
35145
+ );
34225
35146
  }
34226
- return result.stdout;
34227
35147
  }
34228
35148
  function flattenInputMessages(messages) {
34229
35149
  return messages.flatMap((message) => extractContentSegments(message.content));
@@ -34305,7 +35225,7 @@ async function loadTestsFromAgentSkills(filePath) {
34305
35225
  } catch {
34306
35226
  throw new Error(`Invalid Agent Skills evals.json: failed to parse JSON in '${filePath}'`);
34307
35227
  }
34308
- return parseAgentSkillsEvals(parsed, filePath, path39.dirname(path39.resolve(filePath)));
35228
+ return parseAgentSkillsEvals(parsed, filePath, path40.dirname(path40.resolve(filePath)));
34309
35229
  }
34310
35230
  function parseAgentSkillsEvals(parsed, source = "evals.json", baseDir) {
34311
35231
  if (!isAgentSkillsFormat(parsed)) {
@@ -34343,7 +35263,7 @@ function parseAgentSkillsEvals(parsed, source = "evals.json", baseDir) {
34343
35263
  if (baseDir) {
34344
35264
  metadata.agent_skills_base_dir = baseDir;
34345
35265
  for (const file2 of evalCase.files) {
34346
- filePaths.push(path39.resolve(baseDir, file2));
35266
+ filePaths.push(path40.resolve(baseDir, file2));
34347
35267
  }
34348
35268
  }
34349
35269
  }
@@ -34379,15 +35299,15 @@ function resolveToAbsolutePath(candidate) {
34379
35299
  if (candidate.startsWith("file:")) {
34380
35300
  return fileURLToPath4(candidate);
34381
35301
  }
34382
- return path40.resolve(candidate);
35302
+ return path41.resolve(candidate);
34383
35303
  }
34384
35304
  throw new TypeError("Unsupported repoRoot value. Expected string or URL.");
34385
35305
  }
34386
35306
  function buildDirectoryChain2(filePath, repoRoot) {
34387
35307
  const directories = [];
34388
35308
  const seen = /* @__PURE__ */ new Set();
34389
- const boundary = path40.resolve(repoRoot);
34390
- let current = path40.resolve(path40.dirname(filePath));
35309
+ const boundary = path41.resolve(repoRoot);
35310
+ let current = path41.resolve(path41.dirname(filePath));
34391
35311
  while (current !== void 0) {
34392
35312
  if (!seen.has(current)) {
34393
35313
  directories.push(current);
@@ -34396,7 +35316,7 @@ function buildDirectoryChain2(filePath, repoRoot) {
34396
35316
  if (current === boundary) {
34397
35317
  break;
34398
35318
  }
34399
- const parent = path40.dirname(current);
35319
+ const parent = path41.dirname(current);
34400
35320
  if (parent === current) {
34401
35321
  break;
34402
35322
  }
@@ -34410,16 +35330,16 @@ function buildDirectoryChain2(filePath, repoRoot) {
34410
35330
  function buildSearchRoots2(evalPath, repoRoot) {
34411
35331
  const uniqueRoots = [];
34412
35332
  const addRoot = (root) => {
34413
- const normalized = path40.resolve(root);
35333
+ const normalized = path41.resolve(root);
34414
35334
  if (!uniqueRoots.includes(normalized)) {
34415
35335
  uniqueRoots.push(normalized);
34416
35336
  }
34417
35337
  };
34418
- let currentDir = path40.dirname(evalPath);
35338
+ let currentDir = path41.dirname(evalPath);
34419
35339
  let reachedBoundary = false;
34420
35340
  while (!reachedBoundary) {
34421
35341
  addRoot(currentDir);
34422
- const parentDir = path40.dirname(currentDir);
35342
+ const parentDir = path41.dirname(currentDir);
34423
35343
  if (currentDir === repoRoot || parentDir === currentDir) {
34424
35344
  reachedBoundary = true;
34425
35345
  } else {
@@ -34437,16 +35357,16 @@ function trimLeadingSeparators2(value) {
34437
35357
  async function resolveFileReference3(rawValue, searchRoots) {
34438
35358
  const displayPath = trimLeadingSeparators2(rawValue);
34439
35359
  const potentialPaths = [];
34440
- if (path40.isAbsolute(rawValue)) {
34441
- potentialPaths.push(path40.normalize(rawValue));
35360
+ if (path41.isAbsolute(rawValue)) {
35361
+ potentialPaths.push(path41.normalize(rawValue));
34442
35362
  }
34443
35363
  for (const base of searchRoots) {
34444
- potentialPaths.push(path40.resolve(base, displayPath));
35364
+ potentialPaths.push(path41.resolve(base, displayPath));
34445
35365
  }
34446
35366
  const attempted = [];
34447
35367
  const seen = /* @__PURE__ */ new Set();
34448
35368
  for (const candidate of potentialPaths) {
34449
- const absoluteCandidate = path40.resolve(candidate);
35369
+ const absoluteCandidate = path41.resolve(candidate);
34450
35370
  if (seen.has(absoluteCandidate)) {
34451
35371
  continue;
34452
35372
  }
@@ -34467,9 +35387,9 @@ var DEFAULT_EVAL_PATTERNS = [
34467
35387
  ];
34468
35388
  async function loadConfig(evalFilePath, repoRoot) {
34469
35389
  const directories = buildDirectoryChain2(evalFilePath, repoRoot);
34470
- const globalConfigPath = path41.join(getAgentvConfigDir(), "config.yaml");
35390
+ const globalConfigPath = path422.join(getAgentvConfigDir(), "config.yaml");
34471
35391
  for (const directory of directories) {
34472
- const configPath2 = path41.join(directory, ".agentv", "config.yaml");
35392
+ const configPath2 = path422.join(directory, ".agentv", "config.yaml");
34473
35393
  if (!await fileExists3(configPath2)) {
34474
35394
  continue;
34475
35395
  }
@@ -35020,8 +35940,8 @@ function isTemplateReference(value) {
35020
35940
  }
35021
35941
  async function resolveAssertionTemplateReference(include, searchRoots) {
35022
35942
  const templateCandidates = isTemplateReference(include) ? [
35023
- path422.join(".agentv", "templates", `${include}.yaml`),
35024
- path422.join(".agentv", "templates", `${include}.yml`)
35943
+ path43.join(".agentv", "templates", `${include}.yaml`),
35944
+ path43.join(".agentv", "templates", `${include}.yml`)
35025
35945
  ] : [include];
35026
35946
  const attempted = [];
35027
35947
  for (const candidate of templateCandidates) {
@@ -35074,10 +35994,10 @@ ${resolved.attempted.map((attempt) => ` Tried: ${attempt}`).join("\n")}` : "";
35074
35994
  `Invalid assertion template file in '${evalId}': ${resolved.resolvedPath} is missing a top-level assertions array`
35075
35995
  );
35076
35996
  }
35077
- const templateDir = path422.dirname(resolved.resolvedPath);
35997
+ const templateDir = path43.dirname(resolved.resolvedPath);
35078
35998
  const nestedSearchRoots = [
35079
35999
  templateDir,
35080
- ...searchRoots.filter((root) => path422.resolve(root) !== templateDir)
36000
+ ...searchRoots.filter((root) => path43.resolve(root) !== templateDir)
35081
36001
  ];
35082
36002
  return await expandGraderEntries(assertions, nestedSearchRoots, evalId, {
35083
36003
  depth: nextDepth,
@@ -35138,7 +36058,7 @@ async function collectAssertionTemplateReferencesFromValue(value, searchRoots, e
35138
36058
  references.push({
35139
36059
  kind: "assertion_template",
35140
36060
  displayPath: resolved.displayPath,
35141
- ...resolved.resolvedPath ? { resolvedPath: path422.resolve(resolved.resolvedPath) } : {}
36061
+ ...resolved.resolvedPath ? { resolvedPath: path43.resolve(resolved.resolvedPath) } : {}
35142
36062
  });
35143
36063
  if (resolved.resolvedPath) {
35144
36064
  if (includeContext.chain.includes(resolved.resolvedPath)) {
@@ -35148,10 +36068,10 @@ async function collectAssertionTemplateReferencesFromValue(value, searchRoots, e
35148
36068
  const content = await readFile15(resolved.resolvedPath, "utf8");
35149
36069
  const parsed = interpolateEnv(parseYamlValue(content), process.env);
35150
36070
  if (isJsonObject2(parsed) && Array.isArray(parsed.assertions)) {
35151
- const templateDir = path422.dirname(resolved.resolvedPath);
36071
+ const templateDir = path43.dirname(resolved.resolvedPath);
35152
36072
  const nestedSearchRoots = [
35153
36073
  templateDir,
35154
- ...searchRoots.filter((root) => path422.resolve(root) !== templateDir)
36074
+ ...searchRoots.filter((root) => path43.resolve(root) !== templateDir)
35155
36075
  ];
35156
36076
  references.push(
35157
36077
  ...await collectAssertionTemplateReferencesFromValue(
@@ -35337,7 +36257,7 @@ async function parseGraderList(candidateEvaluators, searchRoots, evalId, default
35337
36257
  if (cwd) {
35338
36258
  const resolved = await resolveFileReference3(cwd, searchRoots);
35339
36259
  if (resolved.resolvedPath) {
35340
- resolvedCwd = path422.resolve(resolved.resolvedPath);
36260
+ resolvedCwd = path43.resolve(resolved.resolvedPath);
35341
36261
  } else {
35342
36262
  logWarning2(
35343
36263
  `Code-grader evaluator '${name}' in '${evalId}': cwd not found (${resolved.displayPath})`,
@@ -35523,7 +36443,7 @@ async function parseGraderList(candidateEvaluators, searchRoots, evalId, default
35523
36443
  aggregatorPrompt = fileRef;
35524
36444
  const resolved = await resolveFileReference3(fileRef, searchRoots);
35525
36445
  if (resolved.resolvedPath) {
35526
- promptPath2 = path422.resolve(resolved.resolvedPath);
36446
+ promptPath2 = path43.resolve(resolved.resolvedPath);
35527
36447
  } else {
35528
36448
  throw new Error(
35529
36449
  `Composite aggregator in '${evalId}': prompt file not found: ${resolved.displayPath}`
@@ -36203,7 +37123,7 @@ async function parseGraderList(candidateEvaluators, searchRoots, evalId, default
36203
37123
  const commandPath = commandArray[commandArray.length - 1];
36204
37124
  const resolved = await resolveFileReference3(commandPath, searchRoots);
36205
37125
  if (resolved.resolvedPath) {
36206
- resolvedPromptScript = [...commandArray.slice(0, -1), path422.resolve(resolved.resolvedPath)];
37126
+ resolvedPromptScript = [...commandArray.slice(0, -1), path43.resolve(resolved.resolvedPath)];
36207
37127
  } else {
36208
37128
  throw new Error(
36209
37129
  `Grader '${name}' in '${evalId}': prompt command file not found: ${resolved.displayPath}`
@@ -36218,7 +37138,7 @@ async function parseGraderList(candidateEvaluators, searchRoots, evalId, default
36218
37138
  prompt = fileRef;
36219
37139
  const resolved = await resolveFileReference3(fileRef, searchRoots);
36220
37140
  if (resolved.resolvedPath) {
36221
- promptPath = path422.resolve(resolved.resolvedPath);
37141
+ promptPath = path43.resolve(resolved.resolvedPath);
36222
37142
  try {
36223
37143
  await validateCustomPromptContent(promptPath);
36224
37144
  } catch (error40) {
@@ -36376,7 +37296,7 @@ async function parsePreprocessors(rawValue, searchRoots, evaluatorName, evalId)
36376
37296
  preprocessors.push({
36377
37297
  type,
36378
37298
  command,
36379
- resolvedCommand: [...command.slice(0, -1), path422.resolve(resolved.resolvedPath)]
37299
+ resolvedCommand: [...command.slice(0, -1), path43.resolve(resolved.resolvedPath)]
36380
37300
  });
36381
37301
  }
36382
37302
  return preprocessors;
@@ -36471,10 +37391,10 @@ async function resolveOptionalCommandSource(command, searchRoots) {
36471
37391
  return void 0;
36472
37392
  }
36473
37393
  const resolved = await resolveFileReference3(candidate, searchRoots);
36474
- return resolved.resolvedPath ? path422.resolve(resolved.resolvedPath) : void 0;
37394
+ return resolved.resolvedPath ? path43.resolve(resolved.resolvedPath) : void 0;
36475
37395
  }
36476
37396
  function looksLikeFilePath(value) {
36477
- return path422.isAbsolute(value) || value.startsWith(".") || value.includes("/") || value.includes("\\") || /\.[cm]?[jt]sx?$|\.py$|\.sh$|\.bash$|\.rb$|\.go$|\.rs$/i.test(value);
37397
+ return path43.isAbsolute(value) || value.startsWith(".") || value.includes("/") || value.includes("\\") || /\.[cm]?[jt]sx?$|\.py$|\.sh$|\.bash$|\.rb$|\.go$|\.rs$/i.test(value);
36478
37398
  }
36479
37399
  function parseCommandToArgv(command) {
36480
37400
  if (process.platform === "win32") {
@@ -36862,7 +37782,7 @@ var IMAGE_MEDIA_TYPES = {
36862
37782
  ".bmp": "image/bmp"
36863
37783
  };
36864
37784
  function detectImageMediaType(filePath) {
36865
- const ext = path43.extname(filePath).toLowerCase();
37785
+ const ext = path44.extname(filePath).toLowerCase();
36866
37786
  return IMAGE_MEDIA_TYPES[ext];
36867
37787
  }
36868
37788
  var ANSI_YELLOW3 = "\x1B[33m";
@@ -36926,7 +37846,7 @@ async function processMessages(options) {
36926
37846
  ...cloneJsonObject(rawSegment),
36927
37847
  path: displayPath,
36928
37848
  text: fileContent,
36929
- resolvedPath: path43.resolve(resolvedPath)
37849
+ resolvedPath: path44.resolve(resolvedPath)
36930
37850
  });
36931
37851
  if (verbose) {
36932
37852
  const label = messageType === "input" ? "[File]" : "[Expected Output File]";
@@ -37050,7 +37970,7 @@ async function processExpectedMessages(options) {
37050
37970
  type: "file",
37051
37971
  path: displayPath,
37052
37972
  text: fileContent,
37053
- resolvedPath: path43.resolve(resolvedPath)
37973
+ resolvedPath: path44.resolve(resolvedPath)
37054
37974
  });
37055
37975
  if (verbose) {
37056
37976
  console.log(` [Expected Output File] Found: ${displayPath}`);
@@ -37196,7 +38116,7 @@ function matchesFilter(id, filter) {
37196
38116
  return typeof filter === "string" ? micromatch.isMatch(id, filter) : filter.some((pattern) => micromatch.isMatch(id, pattern));
37197
38117
  }
37198
38118
  function detectFormat(filePath) {
37199
- const ext = path44.extname(filePath).toLowerCase();
38119
+ const ext = path45.extname(filePath).toLowerCase();
37200
38120
  if (ext === ".jsonl") return "jsonl";
37201
38121
  if (ext === ".yaml" || ext === ".yml") return "yaml";
37202
38122
  if (ext === ".json") return "agent-skills-json";
@@ -37206,9 +38126,9 @@ function detectFormat(filePath) {
37206
38126
  );
37207
38127
  }
37208
38128
  async function loadSidecarMetadata(jsonlPath, verbose) {
37209
- const dir = path44.dirname(jsonlPath);
37210
- const base = path44.basename(jsonlPath, ".jsonl");
37211
- const sidecarPath = path44.join(dir, `${base}.yaml`);
38129
+ const dir = path45.dirname(jsonlPath);
38130
+ const base = path45.basename(jsonlPath, ".jsonl");
38131
+ const sidecarPath = path45.join(dir, `${base}.yaml`);
37212
38132
  if (!await fileExists3(sidecarPath)) {
37213
38133
  if (verbose) {
37214
38134
  logWarning4(`Sidecar metadata file not found: ${sidecarPath} (using defaults)`);
@@ -37257,13 +38177,13 @@ function parseJsonlContent(content, filePath) {
37257
38177
  async function loadTestsFromJsonl(evalFilePath, repoRoot, options) {
37258
38178
  const verbose = options?.verbose ?? false;
37259
38179
  const filterPattern = options?.filter;
37260
- const absoluteTestPath = path44.resolve(evalFilePath);
38180
+ const absoluteTestPath = path45.resolve(evalFilePath);
37261
38181
  const repoRootPath = resolveToAbsolutePath(repoRoot);
37262
38182
  const searchRoots = buildSearchRoots2(absoluteTestPath, repoRootPath);
37263
38183
  const sidecar = await loadSidecarMetadata(absoluteTestPath, verbose);
37264
38184
  const rawFile = await readFile17(absoluteTestPath, "utf8");
37265
38185
  const rawCases = parseJsonlContent(rawFile, evalFilePath);
37266
- const fallbackSuiteName = path44.basename(absoluteTestPath, ".jsonl") || "eval";
38186
+ const fallbackSuiteName = path45.basename(absoluteTestPath, ".jsonl") || "eval";
37267
38187
  const suiteName = sidecar.name && sidecar.name.trim().length > 0 ? sidecar.name : fallbackSuiteName;
37268
38188
  const globalEvaluator = coerceEvaluator(sidecar.evaluator, "sidecar") ?? "llm-grader";
37269
38189
  const globalExecution = sidecar.execution;
@@ -37660,7 +38580,7 @@ function interpolateRawEvalCase(raw, vars) {
37660
38580
  }
37661
38581
  async function readTestSuiteMetadata(testFilePath) {
37662
38582
  try {
37663
- const absolutePath = path45.resolve(testFilePath);
38583
+ const absolutePath = path46.resolve(testFilePath);
37664
38584
  const content = await readFile18(absolutePath, "utf8");
37665
38585
  const parsed = interpolateEnv(parseYamlValue(content), process.env);
37666
38586
  if (!isJsonObject(parsed)) {
@@ -37685,7 +38605,7 @@ async function loadTestSuite(evalFilePath, repoRoot, options) {
37685
38605
  return { tests: await loadTestsFromAgentSkills(evalFilePath) };
37686
38606
  }
37687
38607
  if (format === "typescript") {
37688
- const { loadTsEvalSuite: loadTsEvalSuite2 } = await import("./ts-eval-loader-NWH3B4HG-UXXCZKLP.js");
38608
+ const { loadTsEvalSuite: loadTsEvalSuite2 } = await import("./ts-eval-loader-ZVL6CGTE-TZYZX3QS.js");
37689
38609
  return loadTsEvalSuite2(evalFilePath, resolveToAbsolutePath(repoRoot), options);
37690
38610
  }
37691
38611
  const { tests, parsed, suiteWorkspacePath } = await loadTestsFromYaml(
@@ -37720,7 +38640,7 @@ async function loadTests(evalFilePath, repoRoot, options) {
37720
38640
  return loadTestsFromAgentSkills(evalFilePath);
37721
38641
  }
37722
38642
  if (format === "typescript") {
37723
- const { loadTsEvalSuite: loadTsEvalSuite2 } = await import("./ts-eval-loader-NWH3B4HG-UXXCZKLP.js");
38643
+ const { loadTsEvalSuite: loadTsEvalSuite2 } = await import("./ts-eval-loader-ZVL6CGTE-TZYZX3QS.js");
37724
38644
  const suite = await loadTsEvalSuite2(evalFilePath, resolveToAbsolutePath(repoRoot), options);
37725
38645
  return suite.tests;
37726
38646
  }
@@ -37731,7 +38651,7 @@ var loadEvalCases = loadTests;
37731
38651
  async function loadTestsFromYaml(evalFilePath, repoRoot, options) {
37732
38652
  const verbose = options?.verbose ?? false;
37733
38653
  const filterPattern = options?.filter;
37734
- const absoluteTestPath = path45.resolve(evalFilePath);
38654
+ const absoluteTestPath = path46.resolve(evalFilePath);
37735
38655
  const repoRootPath = resolveToAbsolutePath(repoRoot);
37736
38656
  const searchRoots = buildSearchRoots2(absoluteTestPath, repoRootPath);
37737
38657
  const config2 = await loadConfig(absoluteTestPath, repoRootPath);
@@ -37744,7 +38664,7 @@ async function loadTestsFromYaml(evalFilePath, repoRoot, options) {
37744
38664
  }
37745
38665
  const suite = interpolated;
37746
38666
  const suiteNameFromFile = asString5(suite.name)?.trim();
37747
- const fallbackSuiteName = path45.basename(absoluteTestPath).replace(/\.eval\.ya?ml$/i, "").replace(/\.ya?ml$/i, "") || "eval";
38667
+ const fallbackSuiteName = path46.basename(absoluteTestPath).replace(/\.eval\.ya?ml$/i, "").replace(/\.ya?ml$/i, "") || "eval";
37748
38668
  const suiteName = suiteNameFromFile && suiteNameFromFile.length > 0 ? suiteNameFromFile : fallbackSuiteName;
37749
38669
  const rawTestCases = resolveTests(suite);
37750
38670
  const globalEvaluator = coerceEvaluator(suite.evaluator, "global") ?? "llm-grader";
@@ -37754,13 +38674,13 @@ async function loadTestsFromYaml(evalFilePath, repoRoot, options) {
37754
38674
  "<suite>",
37755
38675
  absoluteTestPath
37756
38676
  );
37757
- const evalFileDir = path45.dirname(absoluteTestPath);
38677
+ const evalFileDir = path46.dirname(absoluteTestPath);
37758
38678
  let expandedTestCases;
37759
38679
  if (typeof rawTestCases === "string") {
37760
- const externalPath = path45.resolve(evalFileDir, rawTestCases);
38680
+ const externalPath = path46.resolve(evalFileDir, rawTestCases);
37761
38681
  let isDir = false;
37762
38682
  try {
37763
- const pathStat = await stat8(externalPath);
38683
+ const pathStat = await stat9(externalPath);
37764
38684
  isDir = pathStat.isDirectory();
37765
38685
  } catch {
37766
38686
  }
@@ -38061,7 +38981,7 @@ function collectInputSourceReferences(inputMessages) {
38061
38981
  references.push({
38062
38982
  kind: "input_file",
38063
38983
  displayPath,
38064
- ...typeof segment.resolvedPath === "string" ? { resolvedPath: path45.resolve(segment.resolvedPath) } : {}
38984
+ ...typeof segment.resolvedPath === "string" ? { resolvedPath: path46.resolve(segment.resolvedPath) } : {}
38065
38985
  });
38066
38986
  }
38067
38987
  }
@@ -38134,7 +39054,7 @@ function collectSingleGraderSourceReferences(evaluator) {
38134
39054
  references.push({
38135
39055
  kind: "code_grader_command",
38136
39056
  displayPath: evaluator.aggregator.path,
38137
- resolvedPath: path45.resolve(evaluator.aggregator.cwd ?? "", evaluator.aggregator.path),
39057
+ resolvedPath: path46.resolve(evaluator.aggregator.cwd ?? "", evaluator.aggregator.path),
38138
39058
  graderName: evaluator.name
38139
39059
  });
38140
39060
  } else if (evaluator.aggregator.type === "llm-grader" && evaluator.aggregator.promptPath) {
@@ -38167,9 +39087,9 @@ function dedupeSourceReferences(references) {
38167
39087
  return deduped;
38168
39088
  }
38169
39089
  function toPortableRelativePath(root, candidate) {
38170
- const relative = path45.relative(root, candidate);
38171
- if (relative && !relative.startsWith("..") && !path45.isAbsolute(relative)) {
38172
- return relative.split(path45.sep).join("/");
39090
+ const relative = path46.relative(root, candidate);
39091
+ if (relative && !relative.startsWith("..") && !path46.isAbsolute(relative)) {
39092
+ return relative.split(path46.sep).join("/");
38173
39093
  }
38174
39094
  return void 0;
38175
39095
  }
@@ -38223,8 +39143,8 @@ function parseWorkspaceScriptConfig(raw, evalFileDir) {
38223
39143
  if (!command) return void 0;
38224
39144
  const timeoutMs = typeof obj.timeout_ms === "number" ? obj.timeout_ms : void 0;
38225
39145
  let cwd = typeof obj.cwd === "string" ? obj.cwd : void 0;
38226
- if (cwd && !path45.isAbsolute(cwd)) {
38227
- cwd = path45.resolve(evalFileDir, cwd);
39146
+ if (cwd && !path46.isAbsolute(cwd)) {
39147
+ cwd = path46.resolve(evalFileDir, cwd);
38228
39148
  }
38229
39149
  const config2 = { command };
38230
39150
  if (timeoutMs !== void 0) {
@@ -38262,7 +39182,7 @@ function parseWorkspaceHooksConfig(raw, evalFileDir) {
38262
39182
  }
38263
39183
  async function resolveWorkspaceConfig(raw, evalFileDir) {
38264
39184
  if (typeof raw === "string") {
38265
- const workspaceFilePath = path45.resolve(evalFileDir, raw);
39185
+ const workspaceFilePath = path46.resolve(evalFileDir, raw);
38266
39186
  let content;
38267
39187
  try {
38268
39188
  content = await readFile18(workspaceFilePath, "utf8");
@@ -38275,7 +39195,7 @@ async function resolveWorkspaceConfig(raw, evalFileDir) {
38275
39195
  `Invalid workspace file format: ${workspaceFilePath} (expected a YAML object)`
38276
39196
  );
38277
39197
  }
38278
- const workspaceFileDir = path45.dirname(workspaceFilePath);
39198
+ const workspaceFileDir = path46.dirname(workspaceFilePath);
38279
39199
  const resolvedWorkspace = parseWorkspaceConfig(parsed, workspaceFileDir);
38280
39200
  if (resolvedWorkspace) {
38281
39201
  return { ...resolvedWorkspace, workspaceFileDir };
@@ -38309,8 +39229,8 @@ function parseWorkspaceConfig(raw, evalFileDir) {
38309
39229
  throw new Error("workspace.static has been removed. Use workspace.mode='static'.");
38310
39230
  }
38311
39231
  let template = typeof obj.template === "string" ? obj.template : void 0;
38312
- if (template && !path45.isAbsolute(template)) {
38313
- template = path45.resolve(evalFileDir, template);
39232
+ if (template && !path46.isAbsolute(template)) {
39233
+ template = path46.resolve(evalFileDir, template);
38314
39234
  }
38315
39235
  const isolation = obj.isolation === "shared" || obj.isolation === "per_test" ? obj.isolation : void 0;
38316
39236
  const repos = Array.isArray(obj.repos) ? obj.repos.map(parseRepoConfig).filter(Boolean) : void 0;
@@ -38447,8 +39367,6 @@ ${detailBlock}${ANSI_RESET6}`);
38447
39367
  console.error(`${ANSI_RED3}Error: ${message}${ANSI_RESET6}`);
38448
39368
  }
38449
39369
  }
38450
- var execFileAsync2 = promisify6(execFile2);
38451
- var WORKSPACE_GIT_TIMEOUT_MS = 3e5;
38452
39370
  function pathFromRoot(root) {
38453
39371
  return root instanceof URL ? fileURLToPath5(root) : String(root);
38454
39372
  }
@@ -38470,53 +39388,84 @@ function buildSkippedEvaluatorError(scores) {
38470
39388
  function usesFileReferencePrompt(provider) {
38471
39389
  return isAgentProvider(provider) || provider.kind === "cli";
38472
39390
  }
38473
- function toScriptConfig(hook, hookName, context) {
38474
- const command = hook.command ?? hook.script;
38475
- if (!command || command.length === 0) {
38476
- throw new Error(`${hookName} hook in ${context} requires command or script`);
39391
+ function createEvaluationRuntime(options) {
39392
+ const {
39393
+ target,
39394
+ targets,
39395
+ env,
39396
+ providerFactory,
39397
+ evalFilePath,
39398
+ graderTarget: cliGraderTarget,
39399
+ model: cliModel
39400
+ } = options;
39401
+ const resolvedTargetsByName = /* @__PURE__ */ new Map();
39402
+ resolvedTargetsByName.set(target.name, target);
39403
+ const targetDefinitions = /* @__PURE__ */ new Map();
39404
+ for (const definition of targets ?? []) {
39405
+ targetDefinitions.set(definition.name, definition);
38477
39406
  }
38478
- return {
38479
- command,
38480
- ...hook.timeout_ms !== void 0 && { timeout_ms: hook.timeout_ms },
38481
- ...hook.timeoutMs !== void 0 && { timeoutMs: hook.timeoutMs },
38482
- ...hook.cwd !== void 0 && { cwd: hook.cwd },
38483
- ...hook.script !== void 0 && { script: hook.script }
39407
+ const envLookup = env ?? process.env;
39408
+ const providerCache = /* @__PURE__ */ new Map();
39409
+ const getOrCreateProvider = (resolved) => {
39410
+ const existing = providerCache.get(resolved.name);
39411
+ if (existing) {
39412
+ return existing;
39413
+ }
39414
+ const factory = providerFactory ?? createProvider;
39415
+ const instance = factory(resolved);
39416
+ providerCache.set(resolved.name, instance);
39417
+ return instance;
38484
39418
  };
38485
- }
38486
- function hasHookCommand(hook) {
38487
- return !!(hook?.command && hook.command.length > 0 || hook?.script && hook.script.length > 0);
38488
- }
38489
- function hooksEnabled(workspace) {
38490
- return workspace?.hooks?.enabled !== false;
38491
- }
38492
- function workspaceGitEnv() {
38493
- const env = { ...process.env };
38494
- for (const key of Object.keys(env)) {
38495
- if (key.startsWith("GIT_") && key !== "GIT_SSH_COMMAND") {
38496
- delete env[key];
39419
+ const resolveTargetByName = (name) => {
39420
+ if (resolvedTargetsByName.has(name)) {
39421
+ return resolvedTargetsByName.get(name);
38497
39422
  }
38498
- }
38499
- return {
38500
- ...env,
38501
- GIT_TERMINAL_PROMPT: "0",
38502
- GIT_ASKPASS: "",
38503
- GIT_SSH_COMMAND: "ssh -o BatchMode=yes"
39423
+ const definition = resolveDelegatedTargetDefinition(name, targetDefinitions, envLookup);
39424
+ if (!definition) {
39425
+ return void 0;
39426
+ }
39427
+ const resolved = resolveTargetDefinition(definition, envLookup, evalFilePath ?? "");
39428
+ resolvedTargetsByName.set(name, resolved);
39429
+ return resolved;
38504
39430
  };
38505
- }
38506
- async function resetWorkspaceRoot(workspacePath, resetMode, baselineRef) {
38507
- if (!existsSync6(path46.join(workspacePath, ".git"))) {
38508
- return false;
38509
- }
38510
- const cleanFlag = resetMode === "strict" ? "-fdx" : "-fd";
38511
- const opts = {
38512
- cwd: workspacePath,
38513
- timeout: WORKSPACE_GIT_TIMEOUT_MS,
38514
- env: workspaceGitEnv(),
38515
- maxBuffer: 50 * 1024 * 1024
39431
+ const resolveGraderProvider = async (targetContext) => {
39432
+ if (cliGraderTarget) {
39433
+ if (cliGraderTarget === "agentv") {
39434
+ if (!cliModel) {
39435
+ throw new Error('--grader-target "agentv" requires --model (e.g., "openai:gpt-5-mini")');
39436
+ }
39437
+ const { AgentvProvider: AgentvProvider2 } = await import("./agentv-provider-AYXH7WLW-NJRC6UQX.js");
39438
+ return new AgentvProvider2("agentv", { model: cliModel, temperature: 0 });
39439
+ }
39440
+ const overrideTarget = resolveTargetByName(cliGraderTarget);
39441
+ if (!overrideTarget) {
39442
+ throw new Error(`--grader-target "${cliGraderTarget}" not found in targets`);
39443
+ }
39444
+ return getOrCreateProvider(overrideTarget);
39445
+ }
39446
+ const graderName = targetContext.graderTarget ?? targetContext.name;
39447
+ const resolvedGrader = resolveTargetByName(graderName);
39448
+ if (!resolvedGrader) {
39449
+ if (!LLM_GRADER_CAPABLE_KINDS.includes(targetContext.kind)) {
39450
+ return void 0;
39451
+ }
39452
+ return getOrCreateProvider(targetContext);
39453
+ }
39454
+ return getOrCreateProvider(resolvedGrader);
39455
+ };
39456
+ const targetResolver = (name) => {
39457
+ const resolved = resolveTargetByName(name);
39458
+ if (!resolved) {
39459
+ return void 0;
39460
+ }
39461
+ return getOrCreateProvider(resolved);
39462
+ };
39463
+ return {
39464
+ getOrCreateProvider,
39465
+ resolveGraderProvider,
39466
+ targetResolver,
39467
+ availableTargets: [target.name, ...Array.from(targetDefinitions.keys())]
38516
39468
  };
38517
- await execFileAsync2("git", ["reset", "--hard", baselineRef ?? "HEAD"], opts);
38518
- await execFileAsync2("git", ["clean", cleanFlag], opts);
38519
- return true;
38520
39469
  }
38521
39470
  function validateDependencyGraph(tests) {
38522
39471
  const ids = /* @__PURE__ */ new Set();
@@ -38547,18 +39496,18 @@ function validateDependencyGraph(tests) {
38547
39496
  }
38548
39497
  const visited = /* @__PURE__ */ new Set();
38549
39498
  const visiting = /* @__PURE__ */ new Set();
38550
- function visit(id, path50) {
39499
+ function visit(id, path51) {
38551
39500
  if (visiting.has(id)) {
38552
- const cycle = [...path50.slice(path50.indexOf(id)), id];
39501
+ const cycle = [...path51.slice(path51.indexOf(id)), id];
38553
39502
  throw new Error(`Circular dependency detected: ${cycle.join(" \u2192 ")}`);
38554
39503
  }
38555
39504
  if (visited.has(id)) return;
38556
39505
  visiting.add(id);
38557
- path50.push(id);
39506
+ path51.push(id);
38558
39507
  for (const dep of depMap.get(id) ?? []) {
38559
- visit(dep, path50);
39508
+ visit(dep, path51);
38560
39509
  }
38561
- path50.pop();
39510
+ path51.pop();
38562
39511
  visiting.delete(id);
38563
39512
  visited.add(id);
38564
39513
  }
@@ -38612,6 +39561,170 @@ function computeWaves(tests) {
38612
39561
  }
38613
39562
  return waves;
38614
39563
  }
39564
+ function createPreparedProvider(target) {
39565
+ return {
39566
+ id: `prepared:${target.name}`,
39567
+ kind: target.kind,
39568
+ targetName: target.name,
39569
+ async invoke() {
39570
+ throw new Error("Prepared grading does not invoke the target provider");
39571
+ }
39572
+ };
39573
+ }
39574
+ function withPreparedMetadata(evalCase, preparedAttempt) {
39575
+ return {
39576
+ ...evalCase.metadata,
39577
+ preparedAttempt
39578
+ };
39579
+ }
39580
+ async function gradePreparedEvalCase(options) {
39581
+ const {
39582
+ evalCase,
39583
+ target,
39584
+ targets,
39585
+ env,
39586
+ evaluators,
39587
+ providerFactory,
39588
+ agentTimeoutMs,
39589
+ graderTarget,
39590
+ model,
39591
+ evalFilePath,
39592
+ workspacePath,
39593
+ baselineCommit,
39594
+ response,
39595
+ verbose,
39596
+ threshold: caseThreshold,
39597
+ preparedAttempt
39598
+ } = options;
39599
+ const nowFn = options.now ?? (() => /* @__PURE__ */ new Date());
39600
+ const caseStartMs = Date.now();
39601
+ const provider = createPreparedProvider(target);
39602
+ const formattingMode = usesFileReferencePrompt(provider) ? "agent" : "lm";
39603
+ const promptInputs = await buildPromptInputs(evalCase, formattingMode);
39604
+ const typeRegistry = createBuiltinRegistry();
39605
+ const runtime = createEvaluationRuntime({
39606
+ target,
39607
+ targets,
39608
+ env,
39609
+ providerFactory,
39610
+ evalFilePath,
39611
+ graderTarget,
39612
+ model
39613
+ });
39614
+ const evaluatorRegistry = buildEvaluatorRegistry(evaluators, runtime.resolveGraderProvider);
39615
+ const discoveryBaseDir = evalFilePath ? path47.dirname(path47.resolve(evalFilePath)) : process.cwd();
39616
+ await discoverAssertions(typeRegistry, discoveryBaseDir);
39617
+ await discoverGraders(typeRegistry, discoveryBaseDir);
39618
+ let fileChanges;
39619
+ if (baselineCommit) {
39620
+ try {
39621
+ const diff = await captureFileChanges(workspacePath, baselineCommit);
39622
+ if (diff.length > 0) {
39623
+ fileChanges = diff;
39624
+ }
39625
+ } catch (error40) {
39626
+ if (verbose) {
39627
+ const message = error40 instanceof Error ? error40.message : String(error40);
39628
+ console.warn(`Warning: failed to capture prepared workspace diff: ${message}`);
39629
+ }
39630
+ }
39631
+ }
39632
+ const candidate = response ?? "";
39633
+ const input = buildResultInput(promptInputs);
39634
+ const outputMessages = candidate.length > 0 ? [{ role: "assistant", content: candidate }] : [];
39635
+ const resultTrace = buildTraceFromMessages({
39636
+ input,
39637
+ output: outputMessages,
39638
+ finalOutput: candidate,
39639
+ provider: provider.kind,
39640
+ target: target.name,
39641
+ testId: evalCase.id,
39642
+ conversationId: evalCase.conversation_id
39643
+ });
39644
+ try {
39645
+ const gradeStartedAt = nowFn();
39646
+ const { score, scores } = await runEvaluatorsForCase({
39647
+ evalCase,
39648
+ candidate,
39649
+ target,
39650
+ provider,
39651
+ evaluators: evaluatorRegistry,
39652
+ typeRegistry,
39653
+ attempt: 0,
39654
+ promptInputs,
39655
+ now: gradeStartedAt,
39656
+ agentTimeoutMs,
39657
+ targetResolver: runtime.targetResolver,
39658
+ availableTargets: runtime.availableTargets,
39659
+ fileChanges,
39660
+ workspacePath,
39661
+ dockerConfig: evalCase.workspace?.docker,
39662
+ threshold: evalCase.threshold ?? caseThreshold
39663
+ });
39664
+ const timestamp = nowFn();
39665
+ const effectiveThreshold = evalCase.threshold ?? caseThreshold;
39666
+ const graderTokens = aggregateEvaluatorTokenUsage(scores);
39667
+ const evalRun = {
39668
+ durationMs: Date.now() - caseStartMs,
39669
+ ...graderTokens ? { tokenUsage: graderTokens } : {}
39670
+ };
39671
+ const skippedEvaluatorError = buildSkippedEvaluatorError(scores);
39672
+ const executionStatus = skippedEvaluatorError ? "execution_error" : classifyQualityStatus(score.score, effectiveThreshold);
39673
+ const baseResult = {
39674
+ timestamp: timestamp.toISOString(),
39675
+ testId: evalCase.id,
39676
+ suite: evalCase.suite,
39677
+ category: evalCase.category,
39678
+ conversationId: evalCase.conversation_id,
39679
+ score: skippedEvaluatorError ? 0 : score.score,
39680
+ assertions: score.assertions,
39681
+ target: target.name,
39682
+ input,
39683
+ output: candidate,
39684
+ scores,
39685
+ trace: resultTrace,
39686
+ fileChanges,
39687
+ workspacePath,
39688
+ evalRun,
39689
+ metadata: withPreparedMetadata(evalCase, preparedAttempt),
39690
+ executionStatus
39691
+ };
39692
+ if (!skippedEvaluatorError) {
39693
+ return baseResult;
39694
+ }
39695
+ return {
39696
+ ...baseResult,
39697
+ trace: appendErrorEventToTrace(baseResult.trace, skippedEvaluatorError, {
39698
+ failure_stage: "evaluator",
39699
+ failure_reason_code: "evaluator_error"
39700
+ }),
39701
+ error: skippedEvaluatorError,
39702
+ failureStage: "evaluator",
39703
+ failureReasonCode: "evaluator_error",
39704
+ executionError: { message: skippedEvaluatorError, stage: "evaluator" }
39705
+ };
39706
+ } catch (error40) {
39707
+ const evalRun = { durationMs: Date.now() - caseStartMs };
39708
+ const errorResult = buildErrorResult(
39709
+ evalCase,
39710
+ target.name,
39711
+ nowFn(),
39712
+ error40,
39713
+ promptInputs,
39714
+ provider,
39715
+ "evaluator",
39716
+ "evaluator_error",
39717
+ verbose
39718
+ );
39719
+ return {
39720
+ ...errorResult,
39721
+ evalRun,
39722
+ fileChanges,
39723
+ workspacePath,
39724
+ metadata: withPreparedMetadata(evalCase, preparedAttempt)
39725
+ };
39726
+ }
39727
+ }
38615
39728
  async function runEvaluation(options) {
38616
39729
  const {
38617
39730
  testFilePath: evalFilePath,
@@ -38667,80 +39780,24 @@ async function runEvaluation(options) {
38667
39780
  }
38668
39781
  return [];
38669
39782
  }
38670
- const resolvedTargetsByName = /* @__PURE__ */ new Map();
38671
- resolvedTargetsByName.set(target.name, target);
38672
- const targetDefinitions = /* @__PURE__ */ new Map();
38673
- for (const definition of targets ?? []) {
38674
- targetDefinitions.set(definition.name, definition);
38675
- }
38676
- const envLookup = env ?? process.env;
38677
- const providerCache = /* @__PURE__ */ new Map();
38678
- const getOrCreateProvider = (resolved) => {
38679
- const existing = providerCache.get(resolved.name);
38680
- if (existing) {
38681
- return existing;
38682
- }
38683
- const factory = providerFactory ?? createProvider;
38684
- const instance = factory(resolved);
38685
- providerCache.set(resolved.name, instance);
38686
- return instance;
38687
- };
38688
- const resolveTargetByName = (name) => {
38689
- if (resolvedTargetsByName.has(name)) {
38690
- return resolvedTargetsByName.get(name);
38691
- }
38692
- const definition = resolveDelegatedTargetDefinition(name, targetDefinitions, envLookup);
38693
- if (!definition) {
38694
- return void 0;
38695
- }
38696
- const resolved = resolveTargetDefinition(definition, envLookup, evalFilePath);
38697
- resolvedTargetsByName.set(name, resolved);
38698
- return resolved;
38699
- };
38700
- const resolveGraderProvider = async (targetContext) => {
38701
- if (cliGraderTarget) {
38702
- if (cliGraderTarget === "agentv") {
38703
- if (!cliModel) {
38704
- throw new Error('--grader-target "agentv" requires --model (e.g., "openai:gpt-5-mini")');
38705
- }
38706
- const { AgentvProvider: AgentvProvider2 } = await import("./agentv-provider-AYXH7WLW-NJRC6UQX.js");
38707
- return new AgentvProvider2("agentv", { model: cliModel, temperature: 0 });
38708
- }
38709
- const overrideTarget = resolveTargetByName(cliGraderTarget);
38710
- if (!overrideTarget) {
38711
- throw new Error(`--grader-target "${cliGraderTarget}" not found in targets`);
38712
- }
38713
- return getOrCreateProvider(overrideTarget);
38714
- }
38715
- const graderName = targetContext.graderTarget ?? targetContext.name;
38716
- const resolvedGrader = resolveTargetByName(graderName);
38717
- if (!resolvedGrader) {
38718
- if (!LLM_GRADER_CAPABLE_KINDS.includes(targetContext.kind)) {
38719
- return void 0;
38720
- }
38721
- return getOrCreateProvider(targetContext);
38722
- }
38723
- return getOrCreateProvider(resolvedGrader);
38724
- };
39783
+ const runtime = createEvaluationRuntime({
39784
+ target,
39785
+ targets,
39786
+ env,
39787
+ providerFactory,
39788
+ evalFilePath,
39789
+ graderTarget: cliGraderTarget,
39790
+ model: cliModel
39791
+ });
39792
+ const { getOrCreateProvider, resolveGraderProvider, targetResolver, availableTargets } = runtime;
38725
39793
  if (isAgentProvider(getOrCreateProvider(target)) && !target.graderTarget && !cliGraderTarget) {
38726
39794
  throw new Error(
38727
39795
  `Target "${target.name}" is an agent provider ("${target.kind}") with no grader_target \u2014 agent providers cannot return structured JSON for grading. Set grader_target to an LLM provider (e.g., azure-llm).`
38728
39796
  );
38729
39797
  }
38730
- const targetResolver = (name) => {
38731
- const resolved = resolveTargetByName(name);
38732
- if (!resolved) {
38733
- return void 0;
38734
- }
38735
- return getOrCreateProvider(resolved);
38736
- };
38737
- const availableTargets = [
38738
- target.name,
38739
- ...Array.from(targetDefinitions.keys())
38740
- ];
38741
39798
  const evaluatorRegistry = buildEvaluatorRegistry(evaluators, resolveGraderProvider);
38742
39799
  const typeRegistry = createBuiltinRegistry();
38743
- const discoveryBaseDir = evalFilePath ? path46.dirname(path46.resolve(evalFilePath)) : process.cwd();
39800
+ const discoveryBaseDir = evalFilePath ? path47.dirname(path47.resolve(evalFilePath)) : process.cwd();
38744
39801
  const evalDir = discoveryBaseDir;
38745
39802
  await discoverAssertions(typeRegistry, discoveryBaseDir);
38746
39803
  await discoverGraders(typeRegistry, discoveryBaseDir);
@@ -38796,132 +39853,38 @@ async function runEvaluation(options) {
38796
39853
  }
38797
39854
  }
38798
39855
  }
38799
- const suiteWorkspace = filteredEvalCases[0]?.workspace;
38800
- const rawTemplate = suiteWorkspace?.template;
38801
- const resolvedTemplate = await resolveWorkspaceTemplate(rawTemplate);
38802
- const workspaceTemplate = resolvedTemplate?.dir;
38803
- let suiteWorkspaceFile = resolvedTemplate?.workspaceFile;
38804
- const setupLog = (message) => {
38805
- if (verbose) {
38806
- console.log(`[setup] ${message}`);
38807
- }
38808
- };
38809
- const isPerTestIsolation = suiteWorkspace?.isolation === "per_test";
38810
- const cliWorkspacePath = workspacePath ?? legacyWorkspacePath;
38811
- const yamlWorkspacePath = suiteWorkspace?.path;
38812
- if (cliWorkspacePath && workspaceMode && workspaceMode !== "static") {
38813
- throw new Error("--workspace-path requires --workspace-mode static when both are provided");
38814
- }
38815
- let configuredMode = cliWorkspacePath ? "static" : workspaceMode ?? suiteWorkspace?.mode ?? (yamlWorkspacePath ? "static" : "pooled");
38816
- const configuredStaticPath = cliWorkspacePath ?? yamlWorkspacePath;
38817
- if (configuredMode === "static" && !configuredStaticPath) {
38818
- if (!suiteWorkspace?.repos?.length) {
38819
- setupLog("workspace.mode=static with no path and no repos \u2014 falling back to temp mode");
38820
- configuredMode = "temp";
38821
- } else {
38822
- throw new Error("workspace.mode=static requires workspace.path or --workspace-path");
38823
- }
38824
- }
38825
- const useStaticWorkspace = configuredMode === "static";
38826
- if (useStaticWorkspace && isPerTestIsolation) {
38827
- throw new Error(
38828
- "static workspace mode is incompatible with isolation: per_test. Use isolation: shared (default)."
38829
- );
38830
- }
38831
- if (configuredMode !== "static" && configuredStaticPath) {
38832
- throw new Error("workspace.path requires workspace.mode=static");
38833
- }
38834
- const hasSharedWorkspace = !!(useStaticWorkspace || !isPerTestIsolation && (workspaceTemplate || suiteWorkspace?.hooks || suiteWorkspace?.repos?.length));
38835
- const poolEnabled = configuredMode === "pooled";
38836
- const usePool = poolEnabled !== false && !!suiteWorkspace?.repos?.length && !isPerTestIsolation && !useStaticWorkspace;
38837
39856
  const resolvedRetainOnSuccess = retainOnSuccess ?? (keepWorkspaces ? "keep" : "cleanup");
38838
39857
  const resolvedRetainOnFailure = retainOnFailure ?? (cleanupWorkspaces ? "cleanup" : "keep");
38839
39858
  const workers = options.maxConcurrency ?? target.workers ?? 1;
38840
- setupLog(
38841
- `sharedWorkspace=${hasSharedWorkspace} perTestIsolation=${isPerTestIsolation} usePool=${usePool} workers=${workers}`
38842
- );
38843
- if (hasSharedWorkspace && !usePool && workers > 1 && filteredEvalCases.length > 1) {
38844
- console.warn(
38845
- [
38846
- `Warning: This eval uses a shared workspace with ${workers} workers.`,
38847
- "If the agent under test makes file edits, concurrent runs may corrupt each other.",
38848
- "To limit concurrency, add this to your eval YAML:",
38849
- "",
38850
- " execution:",
38851
- " workers: 1",
38852
- "",
38853
- "Or pass --workers 1 on the command line."
38854
- ].join("\n")
38855
- );
38856
- }
38857
39859
  const limit = pLimit(workers);
38858
- let sharedWorkspacePath;
38859
- let sharedBaselineCommit;
38860
- let beforeAllOutput;
38861
- let poolManager;
38862
- let poolSlot;
38863
- const poolSlots = [];
38864
- const availablePoolSlots = [];
38865
- const poolSlotBaselines = /* @__PURE__ */ new Map();
38866
- const poolMaxSlots = Math.min(configPoolMaxSlots ?? 10, 50);
38867
- let staticMaterialised = false;
38868
- const isYamlConfiguredPath = !cliWorkspacePath && !!yamlWorkspacePath;
38869
- if (useStaticWorkspace && configuredStaticPath) {
38870
- const dirExists = await stat9(configuredStaticPath).then(
38871
- (s) => s.isDirectory(),
38872
- () => false
38873
- );
38874
- const isEmpty = dirExists ? (await readdir8(configuredStaticPath)).length === 0 : false;
38875
- if (isYamlConfiguredPath && (!dirExists || isEmpty)) {
38876
- if (!dirExists) {
38877
- await mkdir17(configuredStaticPath, { recursive: true });
38878
- }
38879
- if (workspaceTemplate) {
38880
- await copyDirectoryRecursive(workspaceTemplate, configuredStaticPath);
38881
- setupLog(`copied template into static workspace: ${configuredStaticPath}`);
38882
- }
38883
- staticMaterialised = true;
38884
- setupLog(`materialised static workspace at: ${configuredStaticPath}`);
38885
- } else {
38886
- setupLog(`reusing existing static workspace: ${configuredStaticPath}`);
38887
- }
38888
- sharedWorkspacePath = configuredStaticPath;
38889
- } else if (!isPerTestIsolation && usePool && suiteWorkspace?.repos) {
38890
- const slotsNeeded = workers;
38891
- setupLog(`acquiring ${slotsNeeded} workspace pool slot(s) (pool capacity: ${poolMaxSlots})`);
38892
- poolManager = new WorkspacePoolManager(getWorkspacePoolRoot());
38893
- const poolRepoManager = new RepoManager(verbose);
38894
- for (let i = 0; i < slotsNeeded; i++) {
38895
- const slot = await poolManager.acquireWorkspace({
38896
- templatePath: workspaceTemplate,
38897
- repos: suiteWorkspace.repos,
38898
- maxSlots: poolMaxSlots,
38899
- repoManager: poolRepoManager,
38900
- poolReset: (workspaceClean === "full" ? "strict" : workspaceClean === "standard" ? "fast" : null) ?? "fast"
38901
- });
38902
- poolSlots.push(slot);
38903
- setupLog(`pool slot ${i} acquired at: ${slot.path} (existing=${slot.isExisting})`);
38904
- }
38905
- if (slotsNeeded === 1) {
38906
- poolSlot = poolSlots[0];
38907
- sharedWorkspacePath = poolSlot.path;
38908
- } else {
38909
- availablePoolSlots.push(...poolSlots);
38910
- }
38911
- } else if (!isPerTestIsolation && workspaceTemplate) {
38912
- setupLog(`creating shared workspace from template: ${workspaceTemplate}`);
38913
- try {
38914
- sharedWorkspacePath = await createTempWorkspace(workspaceTemplate, evalRunId, "shared");
38915
- setupLog(`shared workspace created at: ${sharedWorkspacePath}`);
38916
- } catch (error40) {
38917
- const message = error40 instanceof Error ? error40.message : String(error40);
38918
- throw new Error(`Failed to create shared workspace: ${message}`);
38919
- }
38920
- } else if (!isPerTestIsolation && (suiteWorkspace?.hooks || suiteWorkspace?.repos?.length)) {
38921
- sharedWorkspacePath = getWorkspacePath(evalRunId, "shared");
38922
- await mkdir17(sharedWorkspacePath, { recursive: true });
38923
- setupLog(`created empty shared workspace at: ${sharedWorkspacePath}`);
38924
- }
39860
+ const sharedSetup = await prepareSharedWorkspaceSetup({
39861
+ evalRunId,
39862
+ evalCases: filteredEvalCases,
39863
+ targetHooks: options.targetHooks,
39864
+ evalDir,
39865
+ verbose,
39866
+ workers,
39867
+ poolMaxSlots: configPoolMaxSlots,
39868
+ workspacePath,
39869
+ legacyWorkspacePath,
39870
+ workspaceMode,
39871
+ workspaceClean
39872
+ });
39873
+ const {
39874
+ suiteWorkspace,
39875
+ sharedWorkspacePath,
39876
+ sharedBaselineCommit,
39877
+ suiteWorkspaceFile,
39878
+ beforeAllOutput,
39879
+ repoManager,
39880
+ poolSlot,
39881
+ poolSlots,
39882
+ availablePoolSlots,
39883
+ poolSlotBaselines,
39884
+ useStaticWorkspace
39885
+ } = sharedSetup;
39886
+ const targetHooks = options.targetHooks;
39887
+ const suiteHooksEnabled = hooksEnabled(suiteWorkspace);
38925
39888
  try {
38926
39889
  let toDependencyResult2 = function(r) {
38927
39890
  return {
@@ -38959,198 +39922,6 @@ async function runEvaluation(options) {
38959
39922
  return result.costUsd;
38960
39923
  };
38961
39924
  var toDependencyResult = toDependencyResult2, checkDependencies = checkDependencies2, extractEvaluationCostUsd = extractEvaluationCostUsd2;
38962
- if (suiteWorkspaceFile && sharedWorkspacePath) {
38963
- const copiedWorkspaceFile = path46.join(sharedWorkspacePath, path46.basename(suiteWorkspaceFile));
38964
- try {
38965
- await stat9(copiedWorkspaceFile);
38966
- suiteWorkspaceFile = copiedWorkspaceFile;
38967
- } catch {
38968
- }
38969
- }
38970
- const hasReposToMaterialize = !!suiteWorkspace?.repos?.length && !usePool && !isPerTestIsolation;
38971
- const needsRepoMaterialisation = hasReposToMaterialize && (!useStaticWorkspace || staticMaterialised);
38972
- const needsPerRepoCheck = hasReposToMaterialize && useStaticWorkspace && !staticMaterialised && isYamlConfiguredPath;
38973
- const repoManager = needsRepoMaterialisation || needsPerRepoCheck ? new RepoManager(verbose) : void 0;
38974
- if (repoManager && sharedWorkspacePath && suiteWorkspace?.repos) {
38975
- try {
38976
- if (needsPerRepoCheck) {
38977
- for (const repo of suiteWorkspace.repos) {
38978
- if (!repo.path || !repo.repo) continue;
38979
- const targetDir = path46.join(sharedWorkspacePath, repo.path);
38980
- if (existsSync6(targetDir)) {
38981
- setupLog(`reusing existing repo at: ${targetDir}`);
38982
- continue;
38983
- }
38984
- setupLog(`materializing missing repo: ${repo.path}`);
38985
- await repoManager.materialize(repo, sharedWorkspacePath);
38986
- }
38987
- } else {
38988
- setupLog(
38989
- `materializing ${suiteWorkspace.repos.length} shared repo(s) into ${sharedWorkspacePath}`
38990
- );
38991
- await repoManager.materializeAll(suiteWorkspace.repos, sharedWorkspacePath);
38992
- }
38993
- setupLog("shared repo materialization complete");
38994
- } catch (error40) {
38995
- const message = error40 instanceof Error ? error40.message : String(error40);
38996
- if (sharedWorkspacePath && !useStaticWorkspace) {
38997
- await cleanupWorkspace(sharedWorkspacePath).catch(() => {
38998
- });
38999
- }
39000
- throw new Error(`Failed to materialize repos: ${message}`);
39001
- }
39002
- }
39003
- const suiteDockerConfig = suiteWorkspace?.docker;
39004
- if (suiteDockerConfig) {
39005
- setupLog(`pulling Docker image: ${suiteDockerConfig.image}`);
39006
- const { DockerWorkspaceProvider } = await import("./docker-workspace-RPPXBT27-B4AQHVWA.js");
39007
- const dockerSetup = new DockerWorkspaceProvider(suiteDockerConfig);
39008
- if (!await dockerSetup.isDockerAvailable()) {
39009
- throw new Error(
39010
- "Docker workspace configured but Docker CLI is not available. Install Docker and ensure it is running."
39011
- );
39012
- }
39013
- await dockerSetup.pullImage();
39014
- setupLog("Docker image pull complete");
39015
- }
39016
- if (suiteWorkspace?.env) {
39017
- try {
39018
- await runPreflightChecks(suiteWorkspace.env, sharedWorkspacePath ?? void 0, setupLog);
39019
- setupLog("preflight checks passed");
39020
- } catch (error40) {
39021
- const message = error40 instanceof Error ? error40.message : String(error40);
39022
- if (sharedWorkspacePath && !useStaticWorkspace) {
39023
- await cleanupWorkspace(sharedWorkspacePath).catch(() => {
39024
- });
39025
- }
39026
- throw new Error(message);
39027
- }
39028
- }
39029
- const suiteHooksEnabled = hooksEnabled(suiteWorkspace);
39030
- const suiteBeforeAllHook = suiteWorkspace?.hooks?.before_all;
39031
- if (sharedWorkspacePath && suiteHooksEnabled && hasHookCommand(suiteBeforeAllHook)) {
39032
- const beforeAllHook = suiteBeforeAllHook;
39033
- const beforeAllCommand = (beforeAllHook.command ?? beforeAllHook.script ?? []).join(" ");
39034
- setupLog(
39035
- `running shared before_all in cwd=${beforeAllHook.cwd ?? evalDir} command=${beforeAllCommand}`
39036
- );
39037
- const scriptContext = {
39038
- workspacePath: sharedWorkspacePath,
39039
- testId: "__before_all__",
39040
- evalRunId,
39041
- evalDir,
39042
- workspaceFileDir: suiteWorkspace?.workspaceFileDir
39043
- };
39044
- try {
39045
- beforeAllOutput = await executeWorkspaceScript(
39046
- toScriptConfig(beforeAllHook, "before_all", "suite workspace"),
39047
- scriptContext
39048
- );
39049
- setupLog("shared before_all completed");
39050
- } catch (error40) {
39051
- const message = error40 instanceof Error ? error40.message : String(error40);
39052
- if (sharedWorkspacePath && !useStaticWorkspace) {
39053
- await cleanupWorkspace(sharedWorkspacePath).catch(() => {
39054
- });
39055
- }
39056
- throw new Error(`before_all script failed: ${message}`);
39057
- }
39058
- }
39059
- if (availablePoolSlots.length > 0 && suiteHooksEnabled && hasHookCommand(suiteBeforeAllHook)) {
39060
- const beforeAllHook = suiteBeforeAllHook;
39061
- for (const slot of availablePoolSlots) {
39062
- setupLog(`running before_all on pool slot ${slot.index}`);
39063
- const scriptContext = {
39064
- workspacePath: slot.path,
39065
- testId: "__before_all__",
39066
- evalRunId,
39067
- evalDir,
39068
- workspaceFileDir: suiteWorkspace?.workspaceFileDir
39069
- };
39070
- try {
39071
- const output = await executeWorkspaceScript(
39072
- toScriptConfig(beforeAllHook, "before_all", "suite workspace"),
39073
- scriptContext
39074
- );
39075
- if (!beforeAllOutput) beforeAllOutput = output;
39076
- setupLog(`before_all completed on pool slot ${slot.index}`);
39077
- } catch (error40) {
39078
- const message = error40 instanceof Error ? error40.message : String(error40);
39079
- throw new Error(`before_all script failed on pool slot ${slot.index}: ${message}`);
39080
- }
39081
- }
39082
- }
39083
- const targetHooks = options.targetHooks;
39084
- const targetBeforeAllHook = targetHooks?.before_all;
39085
- if (sharedWorkspacePath && hasHookCommand(targetBeforeAllHook)) {
39086
- const beforeAllCommand = (targetBeforeAllHook.command ?? []).join(" ");
39087
- setupLog(`running target before_all command=${beforeAllCommand}`);
39088
- const scriptContext = {
39089
- workspacePath: sharedWorkspacePath,
39090
- testId: "__target_before_all__",
39091
- evalRunId,
39092
- evalDir,
39093
- workspaceFileDir: suiteWorkspace?.workspaceFileDir
39094
- };
39095
- try {
39096
- await executeWorkspaceScript(
39097
- toScriptConfig(targetBeforeAllHook, "before_all", "target hooks"),
39098
- scriptContext
39099
- );
39100
- setupLog("target before_all completed");
39101
- } catch (error40) {
39102
- const message = error40 instanceof Error ? error40.message : String(error40);
39103
- if (sharedWorkspacePath && !useStaticWorkspace) {
39104
- await cleanupWorkspace(sharedWorkspacePath).catch(() => {
39105
- });
39106
- }
39107
- throw new Error(`target before_all hook failed: ${message}`);
39108
- }
39109
- }
39110
- if (availablePoolSlots.length > 0 && hasHookCommand(targetBeforeAllHook)) {
39111
- for (const slot of availablePoolSlots) {
39112
- setupLog(`running target before_all on pool slot ${slot.index}`);
39113
- const scriptContext = {
39114
- workspacePath: slot.path,
39115
- testId: "__target_before_all__",
39116
- evalRunId,
39117
- evalDir,
39118
- workspaceFileDir: suiteWorkspace?.workspaceFileDir
39119
- };
39120
- try {
39121
- await executeWorkspaceScript(
39122
- toScriptConfig(targetBeforeAllHook, "before_all", "target hooks"),
39123
- scriptContext
39124
- );
39125
- } catch (error40) {
39126
- const message = error40 instanceof Error ? error40.message : String(error40);
39127
- throw new Error(`target before_all hook failed on pool slot ${slot.index}: ${message}`);
39128
- }
39129
- }
39130
- }
39131
- if (sharedWorkspacePath) {
39132
- try {
39133
- sharedBaselineCommit = await initializeBaseline(sharedWorkspacePath);
39134
- setupLog(`shared baseline initialized: ${sharedBaselineCommit}`);
39135
- } catch (error40) {
39136
- const message = error40 instanceof Error ? error40.message : String(error40);
39137
- setupLog(`shared baseline initialization failed (file_changes unavailable): ${message}`);
39138
- }
39139
- }
39140
- if (availablePoolSlots.length > 0) {
39141
- for (const slot of availablePoolSlots) {
39142
- try {
39143
- const baseline = await initializeBaseline(slot.path);
39144
- poolSlotBaselines.set(slot.path, baseline);
39145
- setupLog(`pool slot ${slot.index} baseline initialized: ${baseline}`);
39146
- } catch (error40) {
39147
- const message = error40 instanceof Error ? error40.message : String(error40);
39148
- setupLog(
39149
- `pool slot ${slot.index} baseline initialization failed (file_changes unavailable): ${message}`
39150
- );
39151
- }
39152
- }
39153
- }
39154
39925
  let nextWorkerId = 1;
39155
39926
  const workerIdByEvalId = /* @__PURE__ */ new Map();
39156
39927
  let beforeAllOutputAttached = false;
@@ -39554,17 +40325,7 @@ async function runEvaluation(options) {
39554
40325
  }
39555
40326
  return results;
39556
40327
  } finally {
39557
- if (poolManager) {
39558
- if (poolSlot) {
39559
- await poolManager.releaseSlot(poolSlot);
39560
- }
39561
- for (const slot of poolSlots) {
39562
- if (slot !== poolSlot) {
39563
- await poolManager.releaseSlot(slot).catch(() => {
39564
- });
39565
- }
39566
- }
39567
- }
40328
+ await releaseSharedWorkspaceSetup(sharedSetup);
39568
40329
  }
39569
40330
  }
39570
40331
  async function runBatchEvaluation(options) {
@@ -39812,257 +40573,45 @@ async function runEvalCase(options) {
39812
40573
  cachedResponse = await cache.get(cacheKey);
39813
40574
  }
39814
40575
  const nowFn = now ?? (() => /* @__PURE__ */ new Date());
39815
- let workspacePath = sharedWorkspacePath;
39816
- let beforeAllOutput;
39817
- let beforeEachOutput;
39818
40576
  let afterEachOutput;
39819
- const isSharedWorkspace = !!sharedWorkspacePath;
39820
- let caseWorkspaceFile;
39821
40577
  const caseHooksEnabled = hooksEnabled(evalCase.workspace);
39822
- if (!workspacePath) {
39823
- const rawCaseTemplate = evalCase.workspace?.template;
39824
- const resolvedCaseTemplate = await resolveWorkspaceTemplate(rawCaseTemplate);
39825
- const caseWorkspaceTemplate = resolvedCaseTemplate?.dir;
39826
- caseWorkspaceFile = resolvedCaseTemplate?.workspaceFile;
39827
- if (caseWorkspaceTemplate && evalRunId) {
39828
- try {
39829
- workspacePath = await createTempWorkspace(caseWorkspaceTemplate, evalRunId, evalCase.id);
39830
- } catch (error40) {
39831
- const message = error40 instanceof Error ? error40.message : String(error40);
39832
- return buildErrorResult(
39833
- evalCase,
39834
- target.name,
39835
- nowFn(),
39836
- new Error(`Failed to create workspace: ${message}`),
39837
- promptInputs,
39838
- provider,
39839
- "setup",
39840
- "template_error",
39841
- verbose
39842
- );
39843
- }
39844
- if (caseWorkspaceFile && workspacePath) {
39845
- const copiedFile = path46.join(workspacePath, path46.basename(caseWorkspaceFile));
39846
- try {
39847
- await stat9(copiedFile);
39848
- caseWorkspaceFile = copiedFile;
39849
- } catch {
39850
- }
39851
- }
39852
- }
39853
- if (!workspacePath && (evalCase.workspace?.hooks || evalCase.workspace?.repos?.length) && evalRunId) {
39854
- workspacePath = getWorkspacePath(evalRunId, evalCase.id);
39855
- await mkdir17(workspacePath, { recursive: true });
39856
- }
39857
- if (evalCase.workspace?.repos?.length && workspacePath) {
39858
- const perCaseRepoManager = new RepoManager(setupDebug);
39859
- try {
39860
- if (setupDebug) {
39861
- console.log(
39862
- `[setup] test=${evalCase.id} materializing ${evalCase.workspace.repos.length} per-test repo(s) into ${workspacePath}`
39863
- );
39864
- }
39865
- await perCaseRepoManager.materializeAll(evalCase.workspace.repos, workspacePath);
39866
- if (setupDebug) {
39867
- console.log(`[setup] test=${evalCase.id} per-test repo materialization complete`);
39868
- }
39869
- } catch (error40) {
39870
- const message = error40 instanceof Error ? error40.message : String(error40);
39871
- return buildErrorResult(
39872
- evalCase,
39873
- target.name,
39874
- nowFn(),
39875
- new Error(`Failed to materialize repos: ${message}`),
39876
- promptInputs,
39877
- provider,
39878
- "repo_setup",
39879
- "clone_error",
39880
- verbose
39881
- );
39882
- }
39883
- }
39884
- if (workspacePath && evalCase.metadata?.agent_skills_files) {
39885
- const baseDir = evalCase.metadata.agent_skills_base_dir;
39886
- const files = evalCase.metadata.agent_skills_files;
39887
- if (baseDir && files.length > 0) {
39888
- for (const relPath of files) {
39889
- const srcPath = path46.resolve(baseDir, relPath);
39890
- const destPath = path46.resolve(workspacePath, relPath);
39891
- try {
39892
- await mkdir17(path46.dirname(destPath), { recursive: true });
39893
- await copyFile2(srcPath, destPath);
39894
- } catch (error40) {
39895
- const message = error40 instanceof Error ? error40.message : String(error40);
39896
- return buildErrorResult(
39897
- evalCase,
39898
- target.name,
39899
- nowFn(),
39900
- new Error(
39901
- `Agent Skills eval file not found: ${relPath} (resolved from ${baseDir}): ${message}`
39902
- ),
39903
- promptInputs,
39904
- provider,
39905
- "setup",
39906
- "file_copy_error",
39907
- verbose
39908
- );
39909
- }
39910
- }
39911
- }
39912
- }
39913
- const caseBeforeAllHook = evalCase.workspace?.hooks?.before_all;
39914
- if (workspacePath && caseHooksEnabled && hasHookCommand(caseBeforeAllHook)) {
39915
- const beforeAllHook = caseBeforeAllHook;
39916
- const beforeAllCommand = (beforeAllHook.command ?? beforeAllHook.script ?? []).join(" ");
39917
- if (setupDebug) {
39918
- console.log(
39919
- `[setup] test=${evalCase.id} running before_all in cwd=${beforeAllHook.cwd ?? evalDir} command=${beforeAllCommand}`
39920
- );
39921
- }
39922
- const scriptContext = {
39923
- workspacePath,
39924
- testId: evalCase.id,
39925
- evalRunId: evalRunId ?? "",
39926
- caseInput: evalCase.question,
39927
- caseMetadata: evalCase.metadata,
39928
- evalDir,
39929
- workspaceFileDir: evalCase.workspace?.workspaceFileDir
39930
- };
39931
- try {
39932
- beforeAllOutput = await executeWorkspaceScript(
39933
- toScriptConfig(beforeAllHook, "before_all", `test '${evalCase.id}'`),
39934
- scriptContext
39935
- );
39936
- if (setupDebug) {
39937
- console.log(`[setup] test=${evalCase.id} before_all completed`);
39938
- }
39939
- } catch (error40) {
39940
- const message = error40 instanceof Error ? error40.message : String(error40);
39941
- if (forceCleanup && workspacePath) {
39942
- await cleanupWorkspace(workspacePath).catch(() => {
39943
- });
39944
- }
39945
- return buildErrorResult(
39946
- evalCase,
39947
- target.name,
39948
- nowFn(),
39949
- new Error(`before_all script failed: ${message}`),
39950
- promptInputs,
39951
- provider,
39952
- "setup",
39953
- "script_error",
39954
- verbose
39955
- );
39956
- }
39957
- }
39958
- }
39959
- let beforeEachNeedsFreshBaseline = false;
39960
- if (caseHooksEnabled && workspacePath && evalCase.workspace?.hooks?.before_each?.reset && evalCase.workspace.hooks.before_each.reset !== "none") {
39961
- try {
39962
- if (repoManager && evalCase.workspace.repos?.length) {
39963
- await repoManager.reset(
39964
- evalCase.workspace.repos,
39965
- workspacePath,
39966
- evalCase.workspace.hooks.before_each.reset
39967
- );
39968
- } else {
39969
- await resetWorkspaceRoot(
39970
- workspacePath,
39971
- evalCase.workspace.hooks.before_each.reset,
39972
- sharedBaselineCommit
39973
- );
39974
- }
39975
- } catch (error40) {
39976
- const message = error40 instanceof Error ? error40.message : String(error40);
39977
- return buildErrorResult(
39978
- evalCase,
39979
- target.name,
39980
- nowFn(),
39981
- new Error(`before_each reset failed: ${message}`),
39982
- promptInputs,
39983
- provider,
39984
- "setup",
39985
- "script_error",
39986
- verbose
39987
- );
39988
- }
39989
- }
39990
- const caseBeforeEachHook = evalCase.workspace?.hooks?.before_each;
39991
- if (workspacePath && caseHooksEnabled && hasHookCommand(caseBeforeEachHook)) {
39992
- const beforeEachHook = caseBeforeEachHook;
39993
- const scriptContext = {
39994
- workspacePath,
39995
- testId: evalCase.id,
39996
- evalRunId: evalRunId ?? "",
39997
- caseInput: evalCase.question,
39998
- caseMetadata: evalCase.metadata,
39999
- evalDir,
40000
- workspaceFileDir: evalCase.workspace?.workspaceFileDir
40001
- };
40002
- try {
40003
- beforeEachOutput = await executeWorkspaceScript(
40004
- toScriptConfig(beforeEachHook, "before_each", `test '${evalCase.id}'`),
40005
- scriptContext
40006
- );
40007
- beforeEachNeedsFreshBaseline = true;
40008
- } catch (error40) {
40009
- const message = error40 instanceof Error ? error40.message : String(error40);
40010
- return buildErrorResult(
40011
- evalCase,
40012
- target.name,
40013
- nowFn(),
40014
- new Error(`before_each script failed: ${message}`),
40015
- promptInputs,
40016
- provider,
40017
- "setup",
40018
- "script_error",
40019
- verbose
40020
- );
40021
- }
40022
- }
40023
- const targetBeforeEachHook = options.targetHooks?.before_each;
40024
- if (workspacePath && hasHookCommand(targetBeforeEachHook)) {
40025
- const scriptContext = {
40026
- workspacePath,
40027
- testId: evalCase.id,
40028
- evalRunId: evalRunId ?? "",
40029
- caseInput: evalCase.question,
40030
- caseMetadata: evalCase.metadata,
40578
+ let workspaceSetup;
40579
+ try {
40580
+ workspaceSetup = await prepareEvalCaseWorkspace({
40581
+ evalCase,
40582
+ targetName: target.name,
40583
+ evalRunId,
40584
+ sharedWorkspacePath,
40585
+ sharedBaselineCommit,
40586
+ suiteWorkspaceFile,
40587
+ repoManager,
40031
40588
  evalDir,
40032
- workspaceFileDir: evalCase.workspace?.workspaceFileDir
40033
- };
40034
- try {
40035
- await executeWorkspaceScript(
40036
- toScriptConfig(targetBeforeEachHook, "before_each", `target hook for '${evalCase.id}'`),
40037
- scriptContext
40038
- );
40039
- beforeEachNeedsFreshBaseline = true;
40040
- } catch (error40) {
40041
- const message = error40 instanceof Error ? error40.message : String(error40);
40042
- return buildErrorResult(
40043
- evalCase,
40044
- target.name,
40045
- nowFn(),
40046
- new Error(`target before_each hook failed: ${message}`),
40047
- promptInputs,
40048
- provider,
40049
- "setup",
40050
- "script_error",
40051
- verbose
40052
- );
40053
- }
40054
- }
40055
- let baselineCommit = beforeEachNeedsFreshBaseline ? void 0 : sharedBaselineCommit;
40056
- if (!baselineCommit && workspacePath) {
40057
- try {
40058
- baselineCommit = await initializeBaseline(workspacePath);
40059
- } catch (error40) {
40060
- const message = error40 instanceof Error ? error40.message : String(error40);
40061
- if (verbose) {
40062
- console.warn(`[setup] test=${evalCase.id} baseline initialization failed: ${message}`);
40063
- }
40064
- }
40589
+ cleanupWorkspaces: forceCleanup,
40590
+ targetHooks: options.targetHooks,
40591
+ setupDebug
40592
+ });
40593
+ } catch (error40) {
40594
+ const setupError = error40 instanceof WorkspaceSetupError ? error40 : void 0;
40595
+ return buildErrorResult(
40596
+ evalCase,
40597
+ target.name,
40598
+ nowFn(),
40599
+ error40,
40600
+ promptInputs,
40601
+ provider,
40602
+ setupError?.failureStage ?? "setup",
40603
+ setupError?.failureReasonCode ?? "script_error",
40604
+ verbose
40605
+ );
40065
40606
  }
40607
+ const {
40608
+ workspacePath,
40609
+ beforeAllOutput,
40610
+ beforeEachOutput,
40611
+ baselineCommit,
40612
+ isSharedWorkspace,
40613
+ caseWorkspaceFile
40614
+ } = workspaceSetup;
40066
40615
  if (evalCase.mode === "conversation" && evalCase.turns?.length) {
40067
40616
  const conversationResult = await runConversationMode({
40068
40617
  evalCase,
@@ -40769,7 +41318,7 @@ async function runEvaluatorList(options) {
40769
41318
  dockerConfig,
40770
41319
  dependencyResults
40771
41320
  };
40772
- const evalFileDir = evalCase.file_paths[0] ? path46.dirname(evalCase.file_paths[0]) : process.cwd();
41321
+ const evalFileDir = evalCase.file_paths[0] ? path47.dirname(evalCase.file_paths[0]) : process.cwd();
40773
41322
  const dispatchContext = {
40774
41323
  graderProvider,
40775
41324
  targetResolver,
@@ -41431,38 +41980,6 @@ function computeWeightedMean(entries) {
41431
41980
  }
41432
41981
  return totalWeight > 0 ? weightedSum / totalWeight : 0;
41433
41982
  }
41434
- async function runPreflightChecks(env, cwd, log) {
41435
- const execFileAsync3 = promisify6(execFile2);
41436
- const missing = [];
41437
- for (const cmd of env.required_commands ?? []) {
41438
- log(`preflight: checking command "${cmd}"`);
41439
- try {
41440
- if (process.platform === "win32") {
41441
- await execFileAsync3("where", [cmd], { cwd });
41442
- } else {
41443
- await execFileAsync3("sh", ["-c", `command -v ${cmd}`], { cwd });
41444
- }
41445
- } catch {
41446
- missing.push(`command: ${cmd}`);
41447
- }
41448
- }
41449
- for (const mod of env.required_python_modules ?? []) {
41450
- log(`preflight: checking Python module "${mod}"`);
41451
- try {
41452
- await execFileAsync3("python3", ["-c", `import ${mod}`], { cwd });
41453
- } catch {
41454
- missing.push(`python module: ${mod}`);
41455
- }
41456
- }
41457
- if (missing.length > 0) {
41458
- throw new Error(
41459
- `Preflight checks failed \u2014 missing dependencies:
41460
- ${missing.map((m) => ` \u2022 ${m}`).join("\n")}
41461
-
41462
- Install the missing dependencies before running this eval.`
41463
- );
41464
- }
41465
- }
41466
41983
  function createFunctionProvider(taskFn) {
41467
41984
  return {
41468
41985
  id: "function-provider",
@@ -41837,22 +42354,22 @@ function deduplicateByTestIdTarget(results) {
41837
42354
  return deduped;
41838
42355
  }
41839
42356
  async function aggregateRunDir(runDir, options) {
41840
- const indexPath = path47.join(runDir, RESULT_INDEX_FILENAME);
42357
+ const indexPath = path48.join(runDir, RESULT_INDEX_FILENAME);
41841
42358
  const content = await readFile20(indexPath, "utf8");
41842
42359
  const allResults = parseJsonlResults(content);
41843
42360
  const results = deduplicateByTestIdTarget(allResults);
41844
42361
  const timing = buildTimingArtifact(results);
41845
- const timingPath = path47.join(runDir, "timing.json");
42362
+ const timingPath = path48.join(runDir, "timing.json");
41846
42363
  await writeFile10(timingPath, `${JSON.stringify(timing, null, 2)}
41847
42364
  `, "utf8");
41848
- const plannedTestCount = options?.plannedTestCount ?? await readPlannedTestCount(path47.join(runDir, "benchmark.json"));
42365
+ const plannedTestCount = options?.plannedTestCount ?? await readPlannedTestCount(path48.join(runDir, "benchmark.json"));
41849
42366
  const benchmark = buildBenchmarkArtifact(
41850
42367
  results,
41851
42368
  options?.evalFile,
41852
42369
  options?.experiment,
41853
42370
  plannedTestCount
41854
42371
  );
41855
- const benchmarkPath = path47.join(runDir, "benchmark.json");
42372
+ const benchmarkPath = path48.join(runDir, "benchmark.json");
41856
42373
  await writeFile10(benchmarkPath, `${JSON.stringify(benchmark, null, 2)}
41857
42374
  `, "utf8");
41858
42375
  const targetSet = new Set(results.map((r) => r.target ?? "unknown"));
@@ -41991,17 +42508,37 @@ function toIndexRerunSource(value) {
41991
42508
  source_timestamp: value.sourceTimestamp
41992
42509
  });
41993
42510
  }
42511
+ function toIndexPreparedAttempt(value) {
42512
+ if (!isRecord4(value)) {
42513
+ return void 0;
42514
+ }
42515
+ return dropUndefined5({
42516
+ source: value.source,
42517
+ manifest_path: value.manifestPath,
42518
+ prepared_dir: value.preparedDir,
42519
+ workspace_path: value.workspacePath,
42520
+ prompt_path: value.promptPath,
42521
+ target: value.target,
42522
+ prepared_at: value.preparedAt,
42523
+ setup_status: value.setupStatus,
42524
+ baseline_status: value.baselineStatus,
42525
+ baseline_commit: value.baselineCommit
42526
+ });
42527
+ }
41994
42528
  function toIndexMetadata(metadata) {
41995
42529
  if (!metadata) {
41996
42530
  return void 0;
41997
42531
  }
41998
42532
  const rerunSource = toIndexRerunSource(metadata.rerunSource);
41999
- if (!rerunSource) {
42533
+ const preparedAttempt = toIndexPreparedAttempt(metadata.preparedAttempt);
42534
+ if (!rerunSource && !preparedAttempt) {
42000
42535
  return { ...metadata };
42001
42536
  }
42537
+ const reservedKeys = /* @__PURE__ */ new Set(["rerunSource", "preparedAttempt"]);
42002
42538
  return {
42003
- ...Object.fromEntries(Object.entries(metadata).filter(([key]) => key !== "rerunSource")),
42004
- rerun_source: rerunSource
42539
+ ...Object.fromEntries(Object.entries(metadata).filter(([key]) => !reservedKeys.has(key))),
42540
+ ...rerunSource ? { rerun_source: rerunSource } : {},
42541
+ ...preparedAttempt ? { prepared_attempt: preparedAttempt } : {}
42005
42542
  };
42006
42543
  }
42007
42544
  function buildGradingArtifact(result) {
@@ -42147,7 +42684,7 @@ async function writeInitialBenchmarkArtifact(runDir, options) {
42147
42684
  options.experiment,
42148
42685
  options.plannedTestCount
42149
42686
  );
42150
- const benchmarkPath = path47.join(runDir, "benchmark.json");
42687
+ const benchmarkPath = path48.join(runDir, "benchmark.json");
42151
42688
  await writeFile10(benchmarkPath, `${JSON.stringify(stub, null, 2)}
42152
42689
  `, "utf8");
42153
42690
  }
@@ -42197,7 +42734,7 @@ function buildArtifactSubdir(result) {
42197
42734
  segments.push(safeArtifactPathSegment(evalSet, "default"));
42198
42735
  }
42199
42736
  segments.push(safeTestId(result.testId));
42200
- return path47.posix.join(...segments);
42737
+ return path48.posix.join(...segments);
42201
42738
  }
42202
42739
  function formatOutputMarkdown(output) {
42203
42740
  return output.map((msg) => `@[${msg.role}]:
@@ -42213,7 +42750,7 @@ function extractInput(result) {
42213
42750
  return null;
42214
42751
  }
42215
42752
  function toRelativeArtifactPath(outputDir, filePath) {
42216
- return path47.relative(outputDir, filePath).split(path47.sep).join("/");
42753
+ return path48.relative(outputDir, filePath).split(path48.sep).join("/");
42217
42754
  }
42218
42755
  function findResultSourceTest(result, testByTestId) {
42219
42756
  return testByTestId.get(result.testId ?? "unknown");
@@ -42229,7 +42766,7 @@ async function writeTraceEnvelopeSidecar(params) {
42229
42766
  const hasTranscript = resultHasExecutionTraceTranscript(params.result);
42230
42767
  const envelope = buildTraceEnvelopeFromEvaluationResult(params.result, {
42231
42768
  evalPath: params.evalPath,
42232
- runId: path47.basename(params.outputDir),
42769
+ runId: path48.basename(params.outputDir),
42233
42770
  experiment: params.experiment,
42234
42771
  source: { path: RESULT_INDEX_FILENAME },
42235
42772
  capture: { content: "full", redactionLevel: "none", redactedFields: [] },
@@ -42241,7 +42778,7 @@ async function writeTraceEnvelopeSidecar(params) {
42241
42778
  }
42242
42779
  });
42243
42780
  await writeFile10(
42244
- path47.join(params.outputsDir, "execution-trace.json"),
42781
+ path48.join(params.outputsDir, "execution-trace.json"),
42245
42782
  `${JSON.stringify(toTraceEnvelopeWire(envelope), null, 2)}
42246
42783
  `,
42247
42784
  "utf8"
@@ -42305,13 +42842,13 @@ function buildResultIndexArtifact(result, extraIndexFields) {
42305
42842
  failure_reason_code: result.failureReasonCode,
42306
42843
  workspace_path: result.workspacePath,
42307
42844
  artifact_dir: artifactSubdir,
42308
- grading_path: path47.posix.join(artifactSubdir, "grading.json"),
42309
- timing_path: path47.posix.join(artifactSubdir, "timing.json"),
42310
- input_path: input ? path47.posix.join(artifactSubdir, "input.md") : void 0,
42311
- output_path: hasAnswer ? path47.posix.join(artifactSubdir, "outputs", "answer.md") : void 0,
42312
- answer_path: hasAnswer ? path47.posix.join(artifactSubdir, "outputs", "answer.md") : void 0,
42313
- transcript_path: hasTranscript ? path47.posix.join(artifactSubdir, "outputs", "transcript.jsonl") : void 0,
42314
- response_path: hasAnswer ? path47.posix.join(artifactSubdir, "outputs", "response.md") : void 0,
42845
+ grading_path: path48.posix.join(artifactSubdir, "grading.json"),
42846
+ timing_path: path48.posix.join(artifactSubdir, "timing.json"),
42847
+ input_path: input ? path48.posix.join(artifactSubdir, "input.md") : void 0,
42848
+ output_path: hasAnswer ? path48.posix.join(artifactSubdir, "outputs", "answer.md") : void 0,
42849
+ answer_path: hasAnswer ? path48.posix.join(artifactSubdir, "outputs", "answer.md") : void 0,
42850
+ transcript_path: hasTranscript ? path48.posix.join(artifactSubdir, "outputs", "transcript.jsonl") : void 0,
42851
+ response_path: hasAnswer ? path48.posix.join(artifactSubdir, "outputs", "response.md") : void 0,
42315
42852
  ...extraIndexFields,
42316
42853
  metadata: toIndexMetadata(result.metadata)
42317
42854
  };
@@ -42351,7 +42888,7 @@ async function rewriteExistingIndexRecords(outputDir, replacements) {
42351
42888
  if (replacements.length === 0) {
42352
42889
  return;
42353
42890
  }
42354
- const indexPath = path47.join(outputDir, RESULT_INDEX_FILENAME);
42891
+ const indexPath = path48.join(outputDir, RESULT_INDEX_FILENAME);
42355
42892
  const content = await readFile20(indexPath, "utf8").catch(() => void 0);
42356
42893
  if (content === void 0) {
42357
42894
  return;
@@ -42520,29 +43057,29 @@ async function writePerTestArtifacts(results, outputDir, options) {
42520
43057
  const grading = buildGradingArtifact(result);
42521
43058
  const timing = buildTimingArtifact([result]);
42522
43059
  const artifactSubdir = buildArtifactSubdir(result);
42523
- const testDir = path47.join(outputDir, artifactSubdir);
43060
+ const testDir = path48.join(outputDir, artifactSubdir);
42524
43061
  await mkdir18(testDir, { recursive: true });
42525
43062
  await writeFile10(
42526
- path47.join(testDir, "grading.json"),
43063
+ path48.join(testDir, "grading.json"),
42527
43064
  `${JSON.stringify(grading, null, 2)}
42528
43065
  `,
42529
43066
  "utf8"
42530
43067
  );
42531
43068
  await writeFile10(
42532
- path47.join(testDir, "timing.json"),
43069
+ path48.join(testDir, "timing.json"),
42533
43070
  `${JSON.stringify(timing, null, 2)}
42534
43071
  `,
42535
43072
  "utf8"
42536
43073
  );
42537
43074
  const input = extractInput(result);
42538
43075
  if (input) {
42539
- await writeFile10(path47.join(testDir, "input.md"), input, "utf8");
43076
+ await writeFile10(path48.join(testDir, "input.md"), input, "utf8");
42540
43077
  }
42541
- const outputsDir = path47.join(testDir, "outputs");
43078
+ const outputsDir = path48.join(testDir, "outputs");
42542
43079
  await mkdir18(outputsDir, { recursive: true });
42543
43080
  if (result.output.length > 0) {
42544
- await writeFile10(path47.join(outputsDir, "answer.md"), result.output, "utf8");
42545
- await writeFile10(path47.join(outputsDir, "response.md"), result.output, "utf8");
43081
+ await writeFile10(path48.join(outputsDir, "answer.md"), result.output, "utf8");
43082
+ await writeFile10(path48.join(outputsDir, "response.md"), result.output, "utf8");
42546
43083
  }
42547
43084
  const envelope = await writeTraceEnvelopeSidecar({
42548
43085
  result,
@@ -42552,7 +43089,7 @@ async function writePerTestArtifacts(results, outputDir, options) {
42552
43089
  experiment: options?.experiment
42553
43090
  });
42554
43091
  if (hasTranscriptProjection(result, envelope)) {
42555
- await writeTranscriptJsonl(path47.join(outputsDir, "transcript.jsonl"), result, envelope);
43092
+ await writeTranscriptJsonl(path48.join(outputsDir, "transcript.jsonl"), result, envelope);
42556
43093
  }
42557
43094
  const extraIndexFields = await collectAdditionalIndexFields(
42558
43095
  result,
@@ -42570,9 +43107,9 @@ async function writePerTestArtifacts(results, outputDir, options) {
42570
43107
  }
42571
43108
  async function writeArtifactsFromResults(results, outputDir, options) {
42572
43109
  const testArtifactDir = outputDir;
42573
- const timingPath = path47.join(outputDir, "timing.json");
42574
- const benchmarkPath = path47.join(outputDir, "benchmark.json");
42575
- const indexPath = path47.join(outputDir, RESULT_INDEX_FILENAME);
43110
+ const timingPath = path48.join(outputDir, "timing.json");
43111
+ const benchmarkPath = path48.join(outputDir, "benchmark.json");
43112
+ const indexPath = path48.join(outputDir, RESULT_INDEX_FILENAME);
42576
43113
  await mkdir18(outputDir, { recursive: true });
42577
43114
  const indexRecords = [];
42578
43115
  const testByTestId = new Map((options?.sourceTests ?? []).map((test) => [test.id, test]));
@@ -42580,23 +43117,23 @@ async function writeArtifactsFromResults(results, outputDir, options) {
42580
43117
  const grading = buildGradingArtifact(result);
42581
43118
  const timing2 = buildTimingArtifact([result]);
42582
43119
  const artifactSubdir = buildArtifactSubdir(result);
42583
- const testDir = path47.join(outputDir, artifactSubdir);
42584
- const gradingPath = path47.join(testDir, "grading.json");
42585
- const perTestTimingPath = path47.join(testDir, "timing.json");
43120
+ const testDir = path48.join(outputDir, artifactSubdir);
43121
+ const gradingPath = path48.join(testDir, "grading.json");
43122
+ const perTestTimingPath = path48.join(testDir, "timing.json");
42586
43123
  await mkdir18(testDir, { recursive: true });
42587
43124
  await writeFile10(gradingPath, `${JSON.stringify(grading, null, 2)}
42588
43125
  `, "utf8");
42589
43126
  await writeFile10(perTestTimingPath, `${JSON.stringify(timing2, null, 2)}
42590
43127
  `, "utf8");
42591
43128
  const input = extractInput(result);
42592
- const inputPath = input ? path47.join(testDir, "input.md") : void 0;
43129
+ const inputPath = input ? path48.join(testDir, "input.md") : void 0;
42593
43130
  if (inputPath && input) {
42594
43131
  await writeFile10(inputPath, input, "utf8");
42595
43132
  }
42596
- const outputsDir = path47.join(testDir, "outputs");
43133
+ const outputsDir = path48.join(testDir, "outputs");
42597
43134
  await mkdir18(outputsDir, { recursive: true });
42598
- const answerPath = result.output.length > 0 ? path47.join(outputsDir, "answer.md") : void 0;
42599
- const responsePath = result.output.length > 0 ? path47.join(outputsDir, "response.md") : void 0;
43135
+ const answerPath = result.output.length > 0 ? path48.join(outputsDir, "answer.md") : void 0;
43136
+ const responsePath = result.output.length > 0 ? path48.join(outputsDir, "response.md") : void 0;
42600
43137
  if (answerPath && responsePath) {
42601
43138
  await writeFile10(answerPath, result.output, "utf8");
42602
43139
  await writeFile10(responsePath, result.output, "utf8");
@@ -42608,7 +43145,7 @@ async function writeArtifactsFromResults(results, outputDir, options) {
42608
43145
  evalPath: resolveEnvelopeEvalPath(result, testByTestId, options?.evalFile),
42609
43146
  experiment: options?.experiment
42610
43147
  });
42611
- const transcriptPath = hasTranscriptProjection(result, envelope) ? path47.join(outputsDir, "transcript.jsonl") : void 0;
43148
+ const transcriptPath = hasTranscriptProjection(result, envelope) ? path48.join(outputsDir, "transcript.jsonl") : void 0;
42612
43149
  if (transcriptPath) {
42613
43150
  await writeTranscriptJsonl(transcriptPath, result, envelope);
42614
43151
  }
@@ -42649,7 +43186,7 @@ async function writeArtifactsFromResults(results, outputDir, options) {
42649
43186
  `, "utf8");
42650
43187
  await writeJsonlFile(indexPath, indexRecords);
42651
43188
  await writeFile10(
42652
- path47.join(outputDir, "transcript.jsonl"),
43189
+ path48.join(outputDir, "transcript.jsonl"),
42653
43190
  buildTranscriptMessageLines(results),
42654
43191
  "utf8"
42655
43192
  );
@@ -42700,7 +43237,7 @@ async function evaluate(config2) {
42700
43237
  cliNoCache: false,
42701
43238
  yamlCache: config2.cache === void 0 ? materialized.cache : void 0
42702
43239
  });
42703
- const cache = cacheEnabled ? new ResponseCache(materialized.cachePath ? path48.resolve(materialized.cachePath) : void 0) : void 0;
43240
+ const cache = cacheEnabled ? new ResponseCache(materialized.cachePath ? path49.resolve(materialized.cachePath) : void 0) : void 0;
42704
43241
  const results = await runEvaluation({
42705
43242
  testFilePath,
42706
43243
  repoRoot,
@@ -42723,7 +43260,7 @@ async function evaluate(config2) {
42723
43260
  });
42724
43261
  const allResults = collectedResults.length > 0 ? collectedResults : [...results];
42725
43262
  const durationMs = Date.now() - startTime;
42726
- const outputDir = config2.outputDir ? path48.resolve(config2.outputDir) : void 0;
43263
+ const outputDir = config2.outputDir ? path49.resolve(config2.outputDir) : void 0;
42727
43264
  const artifacts = outputDir ? await writeArtifactsFromResults(allResults, outputDir, {
42728
43265
  evalFile: config2.specFile ? testFilePath : "",
42729
43266
  experiment: config2.experiment,
@@ -42743,7 +43280,7 @@ async function evaluate(config2) {
42743
43280
  async function materializeEvalConfig(config2, options) {
42744
43281
  const baseDir = options?.baseDir ?? process.cwd();
42745
43282
  const repoRoot = options?.repoRoot ?? await findGitRoot(baseDir) ?? baseDir;
42746
- const testFilePath = config2.specFile ? path48.resolve(baseDir, config2.specFile) : path48.join(baseDir, "__programmatic__.yaml");
43283
+ const testFilePath = config2.specFile ? path49.resolve(baseDir, config2.specFile) : path49.join(baseDir, "__programmatic__.yaml");
42747
43284
  const effectiveFilter = options?.filter ?? config2.filter;
42748
43285
  if (config2.specFile) {
42749
43286
  const suite = await loadTestSuite(testFilePath, repoRoot, {
@@ -42820,7 +43357,7 @@ function convertAssertions(entries) {
42820
43357
  }
42821
43358
  function buildInlineEvalTests(config2, options) {
42822
43359
  const suiteWorkspace = config2.beforeAll ? { hooks: { before_all: toBeforeAllHook(config2.beforeAll) } } : void 0;
42823
- const derivedSuiteName = path48.basename(options.testFilePath).replace(/\.eval\.[cm]?ts$/i, "").replace(/\.[cm]?ts$/i, "");
43360
+ const derivedSuiteName = path49.basename(options.testFilePath).replace(/\.eval\.[cm]?ts$/i, "").replace(/\.[cm]?ts$/i, "");
42824
43361
  const suiteName = config2.metadata?.name ?? (derivedSuiteName || "eval");
42825
43362
  return (config2.tests ?? []).filter((test) => !options.filter || matchesFilter4(test.id, options.filter)).map((test) => {
42826
43363
  const isConversation = test.mode === "conversation" || test.turns && test.turns.length > 0;
@@ -42916,10 +43453,10 @@ function computeSummary(results, durationMs, threshold = DEFAULT_THRESHOLD) {
42916
43453
  var TARGET_FILE_CANDIDATES = [".agentv/targets.yaml", ".agentv/targets.yml"];
42917
43454
  async function discoverDefaultTarget(repoRoot) {
42918
43455
  const cwd = process.cwd();
42919
- const chain = buildDirectoryChain(path48.join(cwd, "_placeholder"), repoRoot);
43456
+ const chain = buildDirectoryChain(path49.join(cwd, "_placeholder"), repoRoot);
42920
43457
  for (const dir of chain) {
42921
43458
  for (const candidate of TARGET_FILE_CANDIDATES) {
42922
- const targetsPath = path48.join(dir, candidate);
43459
+ const targetsPath = path49.join(dir, candidate);
42923
43460
  if (!existsSync7(targetsPath)) continue;
42924
43461
  try {
42925
43462
  const definitions = await readTargetDefinitions(targetsPath);
@@ -42936,7 +43473,7 @@ async function loadEnvHierarchy(repoRoot, startPath) {
42936
43473
  const chain = buildDirectoryChain(startPath, repoRoot);
42937
43474
  const envFiles = [];
42938
43475
  for (const dir of chain) {
42939
- const envPath = path48.join(dir, ".env");
43476
+ const envPath = path49.join(dir, ".env");
42940
43477
  if (existsSync7(envPath)) envFiles.push(envPath);
42941
43478
  }
42942
43479
  for (let i = 0; i < envFiles.length; i++) {
@@ -42962,7 +43499,7 @@ async function loadEnvHierarchy(repoRoot, startPath) {
42962
43499
  }
42963
43500
  var EXPORT_NAMES = ["default", "config", "evalConfig"];
42964
43501
  async function loadTsEvalFile(filePath) {
42965
- const absolutePath = path49.resolve(filePath);
43502
+ const absolutePath = path50.resolve(filePath);
42966
43503
  const moduleUrl = pathToFileURL2(absolutePath).href;
42967
43504
  const module = await import(moduleUrl);
42968
43505
  let config2;
@@ -42984,7 +43521,7 @@ async function loadTsEvalSuite(filePath, repoRoot, options) {
42984
43521
  const { config: config2, filePath: absolutePath } = await loadTsEvalFile(filePath);
42985
43522
  const materialized = await materializeEvalConfig(config2, {
42986
43523
  repoRoot,
42987
- baseDir: path49.dirname(absolutePath),
43524
+ baseDir: path50.dirname(absolutePath),
42988
43525
  filter: options?.filter,
42989
43526
  category: options?.category
42990
43527
  });
@@ -43046,6 +43583,7 @@ export {
43046
43583
  buildDirectoryChain,
43047
43584
  buildSearchRoots,
43048
43585
  resolveFileReference,
43586
+ AGENT_PROVIDER_KINDS,
43049
43587
  KNOWN_PROVIDERS,
43050
43588
  PROVIDER_ALIASES,
43051
43589
  extractLastAssistantContent,
@@ -43230,6 +43768,7 @@ export {
43230
43768
  createTempWorkspace,
43231
43769
  cleanupWorkspace,
43232
43770
  cleanupEvalWorkspaces,
43771
+ executeWorkspaceScript,
43233
43772
  resolveRepoCloneUrl,
43234
43773
  normalizeRepoIdentity,
43235
43774
  computeWorkspaceFingerprint,
@@ -43246,7 +43785,9 @@ export {
43246
43785
  discoverProjects,
43247
43786
  RepoManager,
43248
43787
  resolveWorkspaceTemplate,
43249
- executeWorkspaceScript,
43788
+ releaseSharedWorkspaceSetup,
43789
+ prepareSharedWorkspaceSetup,
43790
+ prepareEvalCaseWorkspace,
43250
43791
  isAgentSkillsFormat,
43251
43792
  parseAgentSkillsEvals,
43252
43793
  DEFAULT_EVAL_PATTERNS,
@@ -43271,6 +43812,7 @@ export {
43271
43812
  loadEvalCases,
43272
43813
  loadTestById,
43273
43814
  loadEvalCaseById,
43815
+ gradePreparedEvalCase,
43274
43816
  runEvaluation,
43275
43817
  runEvalCase,
43276
43818
  toTranscriptJsonLines,
@@ -43300,4 +43842,4 @@ export {
43300
43842
  loadTsEvalFile,
43301
43843
  loadTsEvalSuite
43302
43844
  };
43303
- //# sourceMappingURL=chunk-BLXYBUU4.js.map
43845
+ //# sourceMappingURL=chunk-ENHX2CCS.js.map