agentv 4.40.1 → 4.41.1-next.1
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/dist/{artifact-writer-GIAIMGPQ.js → artifact-writer-AMV64TWV.js} +4 -4
- package/dist/{chunk-TWQP7JYQ.js → chunk-A4J456KS.js} +2 -2
- package/dist/{chunk-BLXYBUU4.js → chunk-ENHX2CCS.js} +1485 -943
- package/dist/chunk-ENHX2CCS.js.map +1 -0
- package/dist/{chunk-B7CT3J2W.js → chunk-NRCVKN7X.js} +899 -300
- package/dist/chunk-NRCVKN7X.js.map +1 -0
- package/dist/{chunk-A36XLUI5.js → chunk-UMPZ64HO.js} +12 -10
- package/dist/chunk-UMPZ64HO.js.map +1 -0
- package/dist/{chunk-I3SC4FOT.js → chunk-Z45FKRMJ.js} +212 -58
- package/dist/chunk-Z45FKRMJ.js.map +1 -0
- package/dist/cli.js +5 -5
- package/dist/{dist-6Z4OSITR.js → dist-X5P5IR65.js} +7 -3
- package/dist/index.js +5 -5
- package/dist/{interactive-Q575M3A7.js → interactive-KU2RGBJJ.js} +5 -5
- package/dist/skills/agentv-bench/references/eval-yaml-spec.md +4 -4
- package/dist/skills/agentv-eval-writer/references/custom-evaluators.md +14 -14
- package/dist/skills/agentv-eval-writer/references/python-helpers.md +47 -0
- package/dist/{ts-eval-loader-NWH3B4HG-UXXCZKLP.js → ts-eval-loader-ZVL6CGTE-TZYZX3QS.js} +2 -2
- package/package.json +1 -1
- package/dist/chunk-A36XLUI5.js.map +0 -1
- package/dist/chunk-B7CT3J2W.js.map +0 -1
- package/dist/chunk-BLXYBUU4.js.map +0 -1
- package/dist/chunk-I3SC4FOT.js.map +0 -1
- /package/dist/{artifact-writer-GIAIMGPQ.js.map → artifact-writer-AMV64TWV.js.map} +0 -0
- /package/dist/{chunk-TWQP7JYQ.js.map → chunk-A4J456KS.js.map} +0 -0
- /package/dist/{dist-6Z4OSITR.js.map → dist-X5P5IR65.js.map} +0 -0
- /package/dist/{interactive-Q575M3A7.js.map → interactive-KU2RGBJJ.js.map} +0 -0
- /package/dist/{ts-eval-loader-NWH3B4HG-UXXCZKLP.js.map → ts-eval-loader-ZVL6CGTE-TZYZX3QS.js.map} +0 -0
|
@@ -493,8 +493,8 @@ function getErrorMap() {
|
|
|
493
493
|
|
|
494
494
|
// ../../node_modules/.bun/zod@3.25.76/node_modules/zod/v3/helpers/parseUtil.js
|
|
495
495
|
var makeIssue = (params) => {
|
|
496
|
-
const { data, path:
|
|
497
|
-
const fullPath = [...
|
|
496
|
+
const { data, path: path51, errorMaps, issueData } = params;
|
|
497
|
+
const fullPath = [...path51, ...issueData.path || []];
|
|
498
498
|
const fullIssue = {
|
|
499
499
|
...issueData,
|
|
500
500
|
path: fullPath
|
|
@@ -610,11 +610,11 @@ var errorUtil;
|
|
|
610
610
|
|
|
611
611
|
// ../../node_modules/.bun/zod@3.25.76/node_modules/zod/v3/types.js
|
|
612
612
|
var ParseInputLazyPath = class {
|
|
613
|
-
constructor(parent, value,
|
|
613
|
+
constructor(parent, value, path51, key) {
|
|
614
614
|
this._cachedPath = [];
|
|
615
615
|
this.parent = parent;
|
|
616
616
|
this.data = value;
|
|
617
|
-
this._path =
|
|
617
|
+
this._path = path51;
|
|
618
618
|
this._key = key;
|
|
619
619
|
}
|
|
620
620
|
get path() {
|
|
@@ -4056,7 +4056,7 @@ var coerce = {
|
|
|
4056
4056
|
};
|
|
4057
4057
|
var NEVER = INVALID;
|
|
4058
4058
|
|
|
4059
|
-
// ../../packages/core/dist/chunk-
|
|
4059
|
+
// ../../packages/core/dist/chunk-3EAL7M5J.js
|
|
4060
4060
|
import { parse } from "yaml";
|
|
4061
4061
|
import os from "node:os";
|
|
4062
4062
|
import path from "node:path";
|
|
@@ -5252,17 +5252,22 @@ function resolveCopilotFlatProviderConfig(target, env) {
|
|
|
5252
5252
|
optionalEnv: true
|
|
5253
5253
|
}
|
|
5254
5254
|
);
|
|
5255
|
-
const
|
|
5256
|
-
|
|
5257
|
-
|
|
5258
|
-
|
|
5255
|
+
const apiFormat = resolveOptionalString(
|
|
5256
|
+
target.api_format,
|
|
5257
|
+
env,
|
|
5258
|
+
`${target.name} copilot API format`,
|
|
5259
|
+
{
|
|
5260
|
+
allowLiteral: true,
|
|
5261
|
+
optionalEnv: true
|
|
5262
|
+
}
|
|
5263
|
+
);
|
|
5259
5264
|
return {
|
|
5260
5265
|
...type ? { type } : {},
|
|
5261
5266
|
baseUrl,
|
|
5262
5267
|
...apiKey ? { apiKey } : {},
|
|
5263
5268
|
...bearerToken ? { bearerToken } : {},
|
|
5264
5269
|
...apiVersion ? { apiVersion } : {},
|
|
5265
|
-
...
|
|
5270
|
+
...apiFormat ? { wireApi: apiFormat } : {}
|
|
5266
5271
|
};
|
|
5267
5272
|
}
|
|
5268
5273
|
function resolveCopilotCliConfig(target, env, _evalFilePath) {
|
|
@@ -6151,21 +6156,17 @@ async function expandFileReferences(tests, evalFileDir) {
|
|
|
6151
6156
|
return expanded;
|
|
6152
6157
|
}
|
|
6153
6158
|
|
|
6154
|
-
// ../../packages/core/dist/chunk-
|
|
6155
|
-
import
|
|
6159
|
+
// ../../packages/core/dist/chunk-REU6TJT4.js
|
|
6160
|
+
import path50 from "node:path";
|
|
6156
6161
|
import { pathToFileURL as pathToFileURL2 } from "node:url";
|
|
6157
6162
|
import { existsSync as existsSync7 } from "node:fs";
|
|
6158
|
-
import
|
|
6163
|
+
import path49 from "node:path";
|
|
6159
6164
|
import micromatch4 from "micromatch";
|
|
6160
6165
|
import { mkdir, readFile as readFile3, writeFile } from "node:fs/promises";
|
|
6161
6166
|
import path5 from "node:path";
|
|
6162
|
-
import { execFile as execFile2 } from "node:child_process";
|
|
6163
6167
|
import { createHash as createHash5, randomUUID as randomUUID10 } from "node:crypto";
|
|
6164
|
-
import
|
|
6165
|
-
import { copyFile as copyFile2, mkdir as mkdir17, readdir as readdir8, stat as stat9 } from "node:fs/promises";
|
|
6166
|
-
import path46 from "node:path";
|
|
6168
|
+
import path47 from "node:path";
|
|
6167
6169
|
import { fileURLToPath as fileURLToPath5 } from "node:url";
|
|
6168
|
-
import { promisify as promisify6 } from "node:util";
|
|
6169
6170
|
import micromatch3 from "micromatch";
|
|
6170
6171
|
import { mkdtemp, rm, writeFile as writeFile2 } from "node:fs/promises";
|
|
6171
6172
|
import { tmpdir } from "node:os";
|
|
@@ -6883,10 +6884,10 @@ function assignProp(target, prop, value) {
|
|
|
6883
6884
|
configurable: true
|
|
6884
6885
|
});
|
|
6885
6886
|
}
|
|
6886
|
-
function getElementAtPath(obj,
|
|
6887
|
-
if (!
|
|
6887
|
+
function getElementAtPath(obj, path51) {
|
|
6888
|
+
if (!path51)
|
|
6888
6889
|
return obj;
|
|
6889
|
-
return
|
|
6890
|
+
return path51.reduce((acc, key) => acc?.[key], obj);
|
|
6890
6891
|
}
|
|
6891
6892
|
function promiseAllObject(promisesObj) {
|
|
6892
6893
|
const keys = Object.keys(promisesObj);
|
|
@@ -7206,11 +7207,11 @@ function aborted(x, startIndex = 0) {
|
|
|
7206
7207
|
}
|
|
7207
7208
|
return false;
|
|
7208
7209
|
}
|
|
7209
|
-
function prefixIssues(
|
|
7210
|
+
function prefixIssues(path51, issues) {
|
|
7210
7211
|
return issues.map((iss) => {
|
|
7211
7212
|
var _a;
|
|
7212
7213
|
(_a = iss).path ?? (_a.path = []);
|
|
7213
|
-
iss.path.unshift(
|
|
7214
|
+
iss.path.unshift(path51);
|
|
7214
7215
|
return iss;
|
|
7215
7216
|
});
|
|
7216
7217
|
}
|
|
@@ -7347,7 +7348,7 @@ function treeifyError(error40, _mapper) {
|
|
|
7347
7348
|
return issue2.message;
|
|
7348
7349
|
};
|
|
7349
7350
|
const result = { errors: [] };
|
|
7350
|
-
const processError = (error41,
|
|
7351
|
+
const processError = (error41, path51 = []) => {
|
|
7351
7352
|
var _a, _b;
|
|
7352
7353
|
for (const issue2 of error41.issues) {
|
|
7353
7354
|
if (issue2.code === "invalid_union" && issue2.errors.length) {
|
|
@@ -7357,7 +7358,7 @@ function treeifyError(error40, _mapper) {
|
|
|
7357
7358
|
} else if (issue2.code === "invalid_element") {
|
|
7358
7359
|
processError({ issues: issue2.issues }, issue2.path);
|
|
7359
7360
|
} else {
|
|
7360
|
-
const fullpath = [...
|
|
7361
|
+
const fullpath = [...path51, ...issue2.path];
|
|
7361
7362
|
if (fullpath.length === 0) {
|
|
7362
7363
|
result.errors.push(mapper(issue2));
|
|
7363
7364
|
continue;
|
|
@@ -7387,9 +7388,9 @@ function treeifyError(error40, _mapper) {
|
|
|
7387
7388
|
processError(error40);
|
|
7388
7389
|
return result;
|
|
7389
7390
|
}
|
|
7390
|
-
function toDotPath(
|
|
7391
|
+
function toDotPath(path51) {
|
|
7391
7392
|
const segs = [];
|
|
7392
|
-
for (const seg of
|
|
7393
|
+
for (const seg of path51) {
|
|
7393
7394
|
if (typeof seg === "number")
|
|
7394
7395
|
segs.push(`[${seg}]`);
|
|
7395
7396
|
else if (typeof seg === "symbol")
|
|
@@ -18819,7 +18820,7 @@ var RequestError = class _RequestError extends Error {
|
|
|
18819
18820
|
}
|
|
18820
18821
|
};
|
|
18821
18822
|
|
|
18822
|
-
// ../../packages/core/dist/chunk-
|
|
18823
|
+
// ../../packages/core/dist/chunk-REU6TJT4.js
|
|
18823
18824
|
import { exec as execCallback } from "node:child_process";
|
|
18824
18825
|
import { readdirSync, statSync } from "node:fs";
|
|
18825
18826
|
import { readFile as readFile32, readdir as readdir2, stat as stat2 } from "node:fs/promises";
|
|
@@ -18894,6 +18895,11 @@ import path33 from "node:path";
|
|
|
18894
18895
|
import fg3 from "fast-glob";
|
|
18895
18896
|
import { cp, mkdir as mkdir14, readdir as readdir5, rm as rm4, stat as stat6 } from "node:fs/promises";
|
|
18896
18897
|
import path34 from "node:path";
|
|
18898
|
+
import { execFile as execFile2 } from "node:child_process";
|
|
18899
|
+
import { existsSync as existsSync6 } from "node:fs";
|
|
18900
|
+
import { copyFile as copyFile2, mkdir as mkdir17, readdir as readdir8, stat as stat8 } from "node:fs/promises";
|
|
18901
|
+
import path39 from "node:path";
|
|
18902
|
+
import { promisify as promisify6 } from "node:util";
|
|
18897
18903
|
import { createHash as createHash3 } from "node:crypto";
|
|
18898
18904
|
import { existsSync as existsSync3 } from "node:fs";
|
|
18899
18905
|
import { cp as cp2, mkdir as mkdir15, readFile as readFile11, readdir as readdir6, rm as rm5, unlink, writeFile as writeFile9 } from "node:fs/promises";
|
|
@@ -18909,28 +18915,28 @@ import path36 from "node:path";
|
|
|
18909
18915
|
import { stringify as stringifyYaml } from "yaml";
|
|
18910
18916
|
import { readdir as readdir7, stat as stat7 } from "node:fs/promises";
|
|
18911
18917
|
import path38 from "node:path";
|
|
18912
|
-
import { readFile as readFile18, stat as
|
|
18913
|
-
import
|
|
18918
|
+
import { readFile as readFile18, stat as stat9 } from "node:fs/promises";
|
|
18919
|
+
import path46 from "node:path";
|
|
18914
18920
|
import micromatch2 from "micromatch";
|
|
18915
18921
|
import { stringify as stringifyYaml2 } from "yaml";
|
|
18916
18922
|
import { readFile as readFile12 } from "node:fs/promises";
|
|
18917
|
-
import
|
|
18923
|
+
import path40 from "node:path";
|
|
18918
18924
|
import { readFile as readFile13 } from "node:fs/promises";
|
|
18919
|
-
import
|
|
18925
|
+
import path422 from "node:path";
|
|
18920
18926
|
import { constants as constants4 } from "node:fs";
|
|
18921
18927
|
import { access as access4 } from "node:fs/promises";
|
|
18922
|
-
import
|
|
18928
|
+
import path41 from "node:path";
|
|
18923
18929
|
import { fileURLToPath as fileURLToPath4 } from "node:url";
|
|
18924
18930
|
import { readFile as readFile15 } from "node:fs/promises";
|
|
18925
|
-
import
|
|
18931
|
+
import path43 from "node:path";
|
|
18926
18932
|
import { readFile as readFile14 } from "node:fs/promises";
|
|
18927
18933
|
import { readFile as readFile17 } from "node:fs/promises";
|
|
18928
|
-
import
|
|
18934
|
+
import path45 from "node:path";
|
|
18929
18935
|
import micromatch from "micromatch";
|
|
18930
18936
|
import { readFile as readFile16 } from "node:fs/promises";
|
|
18931
|
-
import
|
|
18937
|
+
import path44 from "node:path";
|
|
18932
18938
|
import { mkdir as mkdir18, readFile as readFile20, writeFile as writeFile10 } from "node:fs/promises";
|
|
18933
|
-
import
|
|
18939
|
+
import path48 from "node:path";
|
|
18934
18940
|
import { readFile as readFile19 } from "node:fs/promises";
|
|
18935
18941
|
var DEFAULT_CACHE_PATH = ".agentv/cache";
|
|
18936
18942
|
var ResponseCache = class {
|
|
@@ -22462,115 +22468,115 @@ var FieldAccuracyGrader = class {
|
|
|
22462
22468
|
* Evaluate a single field against the expected value.
|
|
22463
22469
|
*/
|
|
22464
22470
|
evaluateField(fieldConfig, candidateData, expectedData) {
|
|
22465
|
-
const { path:
|
|
22466
|
-
const candidateValue = resolvePath(candidateData,
|
|
22467
|
-
const expectedValue = resolvePath(expectedData,
|
|
22471
|
+
const { path: path51, match, required: required2 = true, weight = 1 } = fieldConfig;
|
|
22472
|
+
const candidateValue = resolvePath(candidateData, path51);
|
|
22473
|
+
const expectedValue = resolvePath(expectedData, path51);
|
|
22468
22474
|
if (expectedValue === void 0) {
|
|
22469
22475
|
return {
|
|
22470
|
-
path:
|
|
22476
|
+
path: path51,
|
|
22471
22477
|
score: 1,
|
|
22472
22478
|
// No expected value means no comparison needed
|
|
22473
22479
|
weight,
|
|
22474
22480
|
hit: true,
|
|
22475
|
-
message: `${
|
|
22481
|
+
message: `${path51}: no expected value`
|
|
22476
22482
|
};
|
|
22477
22483
|
}
|
|
22478
22484
|
if (candidateValue === void 0) {
|
|
22479
22485
|
if (required2) {
|
|
22480
22486
|
return {
|
|
22481
|
-
path:
|
|
22487
|
+
path: path51,
|
|
22482
22488
|
score: 0,
|
|
22483
22489
|
weight,
|
|
22484
22490
|
hit: false,
|
|
22485
|
-
message: `${
|
|
22491
|
+
message: `${path51} (required, missing)`
|
|
22486
22492
|
};
|
|
22487
22493
|
}
|
|
22488
22494
|
return {
|
|
22489
|
-
path:
|
|
22495
|
+
path: path51,
|
|
22490
22496
|
score: 1,
|
|
22491
22497
|
// Don't penalize missing optional fields
|
|
22492
22498
|
weight: 0,
|
|
22493
22499
|
// Zero weight means it won't affect the score
|
|
22494
22500
|
hit: true,
|
|
22495
|
-
message: `${
|
|
22501
|
+
message: `${path51}: optional field missing`
|
|
22496
22502
|
};
|
|
22497
22503
|
}
|
|
22498
22504
|
switch (match) {
|
|
22499
22505
|
case "exact":
|
|
22500
|
-
return this.compareExact(
|
|
22506
|
+
return this.compareExact(path51, candidateValue, expectedValue, weight);
|
|
22501
22507
|
case "numeric_tolerance":
|
|
22502
22508
|
return this.compareNumericTolerance(
|
|
22503
|
-
|
|
22509
|
+
path51,
|
|
22504
22510
|
candidateValue,
|
|
22505
22511
|
expectedValue,
|
|
22506
22512
|
fieldConfig,
|
|
22507
22513
|
weight
|
|
22508
22514
|
);
|
|
22509
22515
|
case "date":
|
|
22510
|
-
return this.compareDate(
|
|
22516
|
+
return this.compareDate(path51, candidateValue, expectedValue, fieldConfig, weight);
|
|
22511
22517
|
default:
|
|
22512
22518
|
return {
|
|
22513
|
-
path:
|
|
22519
|
+
path: path51,
|
|
22514
22520
|
score: 0,
|
|
22515
22521
|
weight,
|
|
22516
22522
|
hit: false,
|
|
22517
|
-
message: `${
|
|
22523
|
+
message: `${path51}: unknown match type "${match}"`
|
|
22518
22524
|
};
|
|
22519
22525
|
}
|
|
22520
22526
|
}
|
|
22521
22527
|
/**
|
|
22522
22528
|
* Exact equality comparison.
|
|
22523
22529
|
*/
|
|
22524
|
-
compareExact(
|
|
22530
|
+
compareExact(path51, candidateValue, expectedValue, weight) {
|
|
22525
22531
|
if (deepEqual(candidateValue, expectedValue)) {
|
|
22526
22532
|
return {
|
|
22527
|
-
path:
|
|
22533
|
+
path: path51,
|
|
22528
22534
|
score: 1,
|
|
22529
22535
|
weight,
|
|
22530
22536
|
hit: true,
|
|
22531
|
-
message:
|
|
22537
|
+
message: path51
|
|
22532
22538
|
};
|
|
22533
22539
|
}
|
|
22534
22540
|
if (typeof candidateValue !== typeof expectedValue) {
|
|
22535
22541
|
return {
|
|
22536
|
-
path:
|
|
22542
|
+
path: path51,
|
|
22537
22543
|
score: 0,
|
|
22538
22544
|
weight,
|
|
22539
22545
|
hit: false,
|
|
22540
|
-
message: `${
|
|
22546
|
+
message: `${path51} (type mismatch: got ${typeof candidateValue}, expected ${typeof expectedValue})`
|
|
22541
22547
|
};
|
|
22542
22548
|
}
|
|
22543
22549
|
return {
|
|
22544
|
-
path:
|
|
22550
|
+
path: path51,
|
|
22545
22551
|
score: 0,
|
|
22546
22552
|
weight,
|
|
22547
22553
|
hit: false,
|
|
22548
|
-
message: `${
|
|
22554
|
+
message: `${path51} (value mismatch)`
|
|
22549
22555
|
};
|
|
22550
22556
|
}
|
|
22551
22557
|
/**
|
|
22552
22558
|
* Numeric comparison with absolute or relative tolerance.
|
|
22553
22559
|
*/
|
|
22554
|
-
compareNumericTolerance(
|
|
22560
|
+
compareNumericTolerance(path51, candidateValue, expectedValue, fieldConfig, weight) {
|
|
22555
22561
|
const { tolerance = 0, relative = false } = fieldConfig;
|
|
22556
22562
|
const candidateNum = toNumber(candidateValue);
|
|
22557
22563
|
const expectedNum = toNumber(expectedValue);
|
|
22558
22564
|
if (candidateNum === null || expectedNum === null) {
|
|
22559
22565
|
return {
|
|
22560
|
-
path:
|
|
22566
|
+
path: path51,
|
|
22561
22567
|
score: 0,
|
|
22562
22568
|
weight,
|
|
22563
22569
|
hit: false,
|
|
22564
|
-
message: `${
|
|
22570
|
+
message: `${path51} (non-numeric value)`
|
|
22565
22571
|
};
|
|
22566
22572
|
}
|
|
22567
22573
|
if (!Number.isFinite(candidateNum) || !Number.isFinite(expectedNum)) {
|
|
22568
22574
|
return {
|
|
22569
|
-
path:
|
|
22575
|
+
path: path51,
|
|
22570
22576
|
score: 0,
|
|
22571
22577
|
weight,
|
|
22572
22578
|
hit: false,
|
|
22573
|
-
message: `${
|
|
22579
|
+
message: `${path51} (invalid numeric value)`
|
|
22574
22580
|
};
|
|
22575
22581
|
}
|
|
22576
22582
|
const diff = Math.abs(candidateNum - expectedNum);
|
|
@@ -22583,61 +22589,61 @@ var FieldAccuracyGrader = class {
|
|
|
22583
22589
|
}
|
|
22584
22590
|
if (withinTolerance) {
|
|
22585
22591
|
return {
|
|
22586
|
-
path:
|
|
22592
|
+
path: path51,
|
|
22587
22593
|
score: 1,
|
|
22588
22594
|
weight,
|
|
22589
22595
|
hit: true,
|
|
22590
|
-
message: `${
|
|
22596
|
+
message: `${path51} (within tolerance: diff=${diff.toFixed(2)})`
|
|
22591
22597
|
};
|
|
22592
22598
|
}
|
|
22593
22599
|
return {
|
|
22594
|
-
path:
|
|
22600
|
+
path: path51,
|
|
22595
22601
|
score: 0,
|
|
22596
22602
|
weight,
|
|
22597
22603
|
hit: false,
|
|
22598
|
-
message: `${
|
|
22604
|
+
message: `${path51} (outside tolerance: diff=${diff.toFixed(2)}, tolerance=${tolerance})`
|
|
22599
22605
|
};
|
|
22600
22606
|
}
|
|
22601
22607
|
/**
|
|
22602
22608
|
* Date comparison with format normalization.
|
|
22603
22609
|
*/
|
|
22604
|
-
compareDate(
|
|
22610
|
+
compareDate(path51, candidateValue, expectedValue, fieldConfig, weight) {
|
|
22605
22611
|
const formats = fieldConfig.formats ?? DEFAULT_DATE_FORMATS;
|
|
22606
22612
|
const candidateDate = parseDate(String(candidateValue), formats);
|
|
22607
22613
|
const expectedDate = parseDate(String(expectedValue), formats);
|
|
22608
22614
|
if (candidateDate === null) {
|
|
22609
22615
|
return {
|
|
22610
|
-
path:
|
|
22616
|
+
path: path51,
|
|
22611
22617
|
score: 0,
|
|
22612
22618
|
weight,
|
|
22613
22619
|
hit: false,
|
|
22614
|
-
message: `${
|
|
22620
|
+
message: `${path51} (unparseable candidate date)`
|
|
22615
22621
|
};
|
|
22616
22622
|
}
|
|
22617
22623
|
if (expectedDate === null) {
|
|
22618
22624
|
return {
|
|
22619
|
-
path:
|
|
22625
|
+
path: path51,
|
|
22620
22626
|
score: 0,
|
|
22621
22627
|
weight,
|
|
22622
22628
|
hit: false,
|
|
22623
|
-
message: `${
|
|
22629
|
+
message: `${path51} (unparseable expected date)`
|
|
22624
22630
|
};
|
|
22625
22631
|
}
|
|
22626
22632
|
if (candidateDate.getFullYear() === expectedDate.getFullYear() && candidateDate.getMonth() === expectedDate.getMonth() && candidateDate.getDate() === expectedDate.getDate()) {
|
|
22627
22633
|
return {
|
|
22628
|
-
path:
|
|
22634
|
+
path: path51,
|
|
22629
22635
|
score: 1,
|
|
22630
22636
|
weight,
|
|
22631
22637
|
hit: true,
|
|
22632
|
-
message:
|
|
22638
|
+
message: path51
|
|
22633
22639
|
};
|
|
22634
22640
|
}
|
|
22635
22641
|
return {
|
|
22636
|
-
path:
|
|
22642
|
+
path: path51,
|
|
22637
22643
|
score: 0,
|
|
22638
22644
|
weight,
|
|
22639
22645
|
hit: false,
|
|
22640
|
-
message: `${
|
|
22646
|
+
message: `${path51} (date mismatch: got ${formatDateISO(candidateDate)}, expected ${formatDateISO(expectedDate)})`
|
|
22641
22647
|
};
|
|
22642
22648
|
}
|
|
22643
22649
|
/**
|
|
@@ -22670,11 +22676,11 @@ var FieldAccuracyGrader = class {
|
|
|
22670
22676
|
};
|
|
22671
22677
|
}
|
|
22672
22678
|
};
|
|
22673
|
-
function resolvePath(obj,
|
|
22674
|
-
if (!
|
|
22679
|
+
function resolvePath(obj, path51) {
|
|
22680
|
+
if (!path51 || !obj) {
|
|
22675
22681
|
return void 0;
|
|
22676
22682
|
}
|
|
22677
|
-
const parts =
|
|
22683
|
+
const parts = path51.split(/\.|\[|\]/).filter((p) => p.length > 0);
|
|
22678
22684
|
let current = obj;
|
|
22679
22685
|
for (const part of parts) {
|
|
22680
22686
|
if (current === null || current === void 0) {
|
|
@@ -23205,8 +23211,8 @@ var TokenUsageGrader = class {
|
|
|
23205
23211
|
};
|
|
23206
23212
|
}
|
|
23207
23213
|
};
|
|
23208
|
-
function getNestedValue(obj,
|
|
23209
|
-
const parts =
|
|
23214
|
+
function getNestedValue(obj, path51) {
|
|
23215
|
+
const parts = path51.split(".");
|
|
23210
23216
|
let current = obj;
|
|
23211
23217
|
for (const part of parts) {
|
|
23212
23218
|
if (current === null || current === void 0 || typeof current !== "object") {
|
|
@@ -33200,6 +33206,47 @@ async function cleanupEvalWorkspaces(evalRunId, workspaceRoot) {
|
|
|
33200
33206
|
await rm4(evalDir, { recursive: true, force: true });
|
|
33201
33207
|
}
|
|
33202
33208
|
}
|
|
33209
|
+
function interpolateArgs(args, context) {
|
|
33210
|
+
const vars = {
|
|
33211
|
+
workspace_path: context.workspacePath,
|
|
33212
|
+
test_id: context.testId,
|
|
33213
|
+
eval_run_id: context.evalRunId,
|
|
33214
|
+
case_input: context.caseInput ?? "",
|
|
33215
|
+
case_metadata: context.caseMetadata ? JSON.stringify(context.caseMetadata) : ""
|
|
33216
|
+
};
|
|
33217
|
+
return args.map((arg) => arg.replace(/\{\{(\w+)\}\}/g, (match, name) => vars[name] ?? match));
|
|
33218
|
+
}
|
|
33219
|
+
async function executeWorkspaceScript(config2, context, failureMode = "fatal") {
|
|
33220
|
+
const stdin = JSON.stringify({
|
|
33221
|
+
workspace_path: context.workspacePath,
|
|
33222
|
+
test_id: context.testId,
|
|
33223
|
+
eval_run_id: context.evalRunId,
|
|
33224
|
+
case_input: context.caseInput ?? null,
|
|
33225
|
+
case_metadata: context.caseMetadata ?? null
|
|
33226
|
+
});
|
|
33227
|
+
const timeoutMs = config2.timeout_ms ?? (failureMode === "fatal" ? 6e4 : 3e4);
|
|
33228
|
+
const cwd = config2.cwd ?? context.workspaceFileDir ?? context.evalDir;
|
|
33229
|
+
if (config2.script !== void 0 && config2.command === void 0) {
|
|
33230
|
+
console.warn(
|
|
33231
|
+
"\x1B[33mWarning: 'script' is deprecated in workspace config. Use 'command' instead.\x1B[0m"
|
|
33232
|
+
);
|
|
33233
|
+
}
|
|
33234
|
+
const rawCommand = config2.command ?? config2.script ?? [];
|
|
33235
|
+
const commandArray = interpolateArgs(rawCommand, context);
|
|
33236
|
+
const result = await execFileWithStdin(commandArray, stdin, {
|
|
33237
|
+
timeoutMs,
|
|
33238
|
+
cwd
|
|
33239
|
+
});
|
|
33240
|
+
if (result.exitCode !== 0) {
|
|
33241
|
+
const stderr = result.stderr.trim();
|
|
33242
|
+
const message = stderr ? `${stderr}` : `Process exited with code ${result.exitCode}`;
|
|
33243
|
+
if (failureMode === "fatal") {
|
|
33244
|
+
throw new Error(`Script failed: ${message}`);
|
|
33245
|
+
}
|
|
33246
|
+
console.warn(`Script warning: ${message}`);
|
|
33247
|
+
}
|
|
33248
|
+
return result.stdout;
|
|
33249
|
+
}
|
|
33203
33250
|
var GITHUB_SHORTHAND_RE = /^[A-Za-z0-9_.-]+\/[A-Za-z0-9_.-]+$/;
|
|
33204
33251
|
function resolveRepoCloneUrl(repo) {
|
|
33205
33252
|
const trimmed = repo.trim();
|
|
@@ -34184,46 +34231,919 @@ async function resolveWorkspaceTemplate(templatePath) {
|
|
|
34184
34231
|
}
|
|
34185
34232
|
return { dir: resolved };
|
|
34186
34233
|
}
|
|
34187
|
-
|
|
34188
|
-
|
|
34189
|
-
|
|
34190
|
-
|
|
34191
|
-
|
|
34192
|
-
|
|
34193
|
-
|
|
34234
|
+
var execFileAsync2 = promisify6(execFile2);
|
|
34235
|
+
var WORKSPACE_GIT_TIMEOUT_MS = 3e5;
|
|
34236
|
+
var WorkspaceSetupError = class extends Error {
|
|
34237
|
+
failureStage;
|
|
34238
|
+
failureReasonCode;
|
|
34239
|
+
hookExecutions;
|
|
34240
|
+
constructor(message, options) {
|
|
34241
|
+
super(message);
|
|
34242
|
+
this.name = "WorkspaceSetupError";
|
|
34243
|
+
this.failureStage = options.failureStage;
|
|
34244
|
+
this.failureReasonCode = options.failureReasonCode;
|
|
34245
|
+
this.hookExecutions = options.hookExecutions ?? [];
|
|
34246
|
+
if (options.cause !== void 0) {
|
|
34247
|
+
this.cause = options.cause;
|
|
34248
|
+
}
|
|
34249
|
+
}
|
|
34250
|
+
};
|
|
34251
|
+
function toScriptConfig(hook, hookName, context) {
|
|
34252
|
+
const command = hook.command ?? hook.script;
|
|
34253
|
+
if (!command || command.length === 0) {
|
|
34254
|
+
throw new Error(`${hookName} hook in ${context} requires command or script`);
|
|
34255
|
+
}
|
|
34256
|
+
return {
|
|
34257
|
+
command,
|
|
34258
|
+
...hook.timeout_ms !== void 0 && { timeout_ms: hook.timeout_ms },
|
|
34259
|
+
...hook.timeoutMs !== void 0 && { timeoutMs: hook.timeoutMs },
|
|
34260
|
+
...hook.cwd !== void 0 && { cwd: hook.cwd },
|
|
34261
|
+
...hook.script !== void 0 && { script: hook.script }
|
|
34194
34262
|
};
|
|
34195
|
-
return args.map((arg) => arg.replace(/\{\{(\w+)\}\}/g, (match, name) => vars[name] ?? match));
|
|
34196
34263
|
}
|
|
34197
|
-
|
|
34198
|
-
|
|
34199
|
-
|
|
34200
|
-
|
|
34201
|
-
|
|
34202
|
-
|
|
34203
|
-
|
|
34204
|
-
}
|
|
34205
|
-
const
|
|
34206
|
-
|
|
34207
|
-
|
|
34264
|
+
function hasHookCommand(hook) {
|
|
34265
|
+
return !!(hook?.command && hook.command.length > 0 || hook?.script && hook.script.length > 0);
|
|
34266
|
+
}
|
|
34267
|
+
function hooksEnabled(workspace) {
|
|
34268
|
+
return workspace?.hooks?.enabled !== false;
|
|
34269
|
+
}
|
|
34270
|
+
function workspaceGitEnv() {
|
|
34271
|
+
const env = { ...process.env };
|
|
34272
|
+
for (const key of Object.keys(env)) {
|
|
34273
|
+
if (key.startsWith("GIT_") && key !== "GIT_SSH_COMMAND") {
|
|
34274
|
+
delete env[key];
|
|
34275
|
+
}
|
|
34276
|
+
}
|
|
34277
|
+
return {
|
|
34278
|
+
...env,
|
|
34279
|
+
GIT_TERMINAL_PROMPT: "0",
|
|
34280
|
+
GIT_ASKPASS: "",
|
|
34281
|
+
GIT_SSH_COMMAND: "ssh -o BatchMode=yes"
|
|
34282
|
+
};
|
|
34283
|
+
}
|
|
34284
|
+
async function resetWorkspaceRoot(workspacePath, resetMode, baselineRef) {
|
|
34285
|
+
if (!existsSync6(path39.join(workspacePath, ".git"))) {
|
|
34286
|
+
return false;
|
|
34287
|
+
}
|
|
34288
|
+
const cleanFlag = resetMode === "strict" ? "-fdx" : "-fd";
|
|
34289
|
+
const opts = {
|
|
34290
|
+
cwd: workspacePath,
|
|
34291
|
+
timeout: WORKSPACE_GIT_TIMEOUT_MS,
|
|
34292
|
+
env: workspaceGitEnv(),
|
|
34293
|
+
maxBuffer: 50 * 1024 * 1024
|
|
34294
|
+
};
|
|
34295
|
+
await execFileAsync2("git", ["reset", "--hard", baselineRef ?? "HEAD"], opts);
|
|
34296
|
+
await execFileAsync2("git", ["clean", cleanFlag], opts);
|
|
34297
|
+
return true;
|
|
34298
|
+
}
|
|
34299
|
+
function commandForHook(hook) {
|
|
34300
|
+
return hook?.command ?? hook?.script;
|
|
34301
|
+
}
|
|
34302
|
+
function hookExecution(options) {
|
|
34303
|
+
const command = commandForHook(options.hook);
|
|
34304
|
+
return {
|
|
34305
|
+
scope: options.scope,
|
|
34306
|
+
name: options.name,
|
|
34307
|
+
status: options.status,
|
|
34308
|
+
testId: options.testId,
|
|
34309
|
+
...options.workspacePath !== void 0 && { workspacePath: options.workspacePath },
|
|
34310
|
+
...command !== void 0 && { command },
|
|
34311
|
+
...options.hook?.cwd !== void 0 && { cwd: options.hook.cwd },
|
|
34312
|
+
...options.output !== void 0 && { output: options.output },
|
|
34313
|
+
...options.error !== void 0 && { error: options.error }
|
|
34314
|
+
};
|
|
34315
|
+
}
|
|
34316
|
+
async function releasePoolSlots(setup) {
|
|
34317
|
+
if (!setup.poolManager) {
|
|
34318
|
+
return;
|
|
34319
|
+
}
|
|
34320
|
+
if (setup.poolSlot) {
|
|
34321
|
+
await setup.poolManager.releaseSlot(setup.poolSlot);
|
|
34322
|
+
}
|
|
34323
|
+
for (const slot of setup.poolSlots) {
|
|
34324
|
+
if (slot !== setup.poolSlot) {
|
|
34325
|
+
await setup.poolManager.releaseSlot(slot).catch(() => {
|
|
34326
|
+
});
|
|
34327
|
+
}
|
|
34328
|
+
}
|
|
34329
|
+
}
|
|
34330
|
+
async function releaseSharedWorkspaceSetup(setup) {
|
|
34331
|
+
await releasePoolSlots(setup);
|
|
34332
|
+
}
|
|
34333
|
+
async function prepareSharedWorkspaceSetup(options) {
|
|
34334
|
+
const {
|
|
34335
|
+
evalRunId,
|
|
34336
|
+
evalCases,
|
|
34337
|
+
targetHooks,
|
|
34338
|
+
evalDir,
|
|
34339
|
+
verbose,
|
|
34340
|
+
workers,
|
|
34341
|
+
poolMaxSlots: configPoolMaxSlots,
|
|
34342
|
+
workspacePath,
|
|
34343
|
+
legacyWorkspacePath,
|
|
34344
|
+
workspaceMode,
|
|
34345
|
+
workspaceClean
|
|
34346
|
+
} = options;
|
|
34347
|
+
const suiteWorkspace = evalCases[0]?.workspace;
|
|
34348
|
+
const rawTemplate = suiteWorkspace?.template;
|
|
34349
|
+
const resolvedTemplate = await resolveWorkspaceTemplate(rawTemplate);
|
|
34350
|
+
const workspaceTemplate = resolvedTemplate?.dir;
|
|
34351
|
+
let suiteWorkspaceFile = resolvedTemplate?.workspaceFile;
|
|
34352
|
+
const setupLog = (message) => {
|
|
34353
|
+
if (verbose) {
|
|
34354
|
+
console.log(`[setup] ${message}`);
|
|
34355
|
+
}
|
|
34356
|
+
};
|
|
34357
|
+
const isPerTestIsolation = suiteWorkspace?.isolation === "per_test";
|
|
34358
|
+
const cliWorkspacePath = workspacePath ?? legacyWorkspacePath;
|
|
34359
|
+
const yamlWorkspacePath = suiteWorkspace?.path;
|
|
34360
|
+
if (cliWorkspacePath && workspaceMode && workspaceMode !== "static") {
|
|
34361
|
+
throw new Error("--workspace-path requires --workspace-mode static when both are provided");
|
|
34362
|
+
}
|
|
34363
|
+
let configuredMode = cliWorkspacePath ? "static" : workspaceMode ?? suiteWorkspace?.mode ?? (yamlWorkspacePath ? "static" : "pooled");
|
|
34364
|
+
const configuredStaticPath = cliWorkspacePath ?? yamlWorkspacePath;
|
|
34365
|
+
if (configuredMode === "static" && !configuredStaticPath) {
|
|
34366
|
+
if (!suiteWorkspace?.repos?.length) {
|
|
34367
|
+
setupLog("workspace.mode=static with no path and no repos \u2014 falling back to temp mode");
|
|
34368
|
+
configuredMode = "temp";
|
|
34369
|
+
} else {
|
|
34370
|
+
throw new Error("workspace.mode=static requires workspace.path or --workspace-path");
|
|
34371
|
+
}
|
|
34372
|
+
}
|
|
34373
|
+
const useStaticWorkspace = configuredMode === "static";
|
|
34374
|
+
if (useStaticWorkspace && isPerTestIsolation) {
|
|
34375
|
+
throw new Error(
|
|
34376
|
+
"static workspace mode is incompatible with isolation: per_test. Use isolation: shared (default)."
|
|
34377
|
+
);
|
|
34378
|
+
}
|
|
34379
|
+
if (configuredMode !== "static" && configuredStaticPath) {
|
|
34380
|
+
throw new Error("workspace.path requires workspace.mode=static");
|
|
34381
|
+
}
|
|
34382
|
+
const hasSharedWorkspace = !!(useStaticWorkspace || !isPerTestIsolation && (workspaceTemplate || suiteWorkspace?.hooks || suiteWorkspace?.repos?.length));
|
|
34383
|
+
const poolEnabled = configuredMode === "pooled";
|
|
34384
|
+
const usePool = poolEnabled !== false && !!suiteWorkspace?.repos?.length && !isPerTestIsolation && !useStaticWorkspace;
|
|
34385
|
+
setupLog(
|
|
34386
|
+
`sharedWorkspace=${hasSharedWorkspace} perTestIsolation=${isPerTestIsolation} usePool=${usePool} workers=${workers}`
|
|
34387
|
+
);
|
|
34388
|
+
if (hasSharedWorkspace && !usePool && workers > 1 && evalCases.length > 1) {
|
|
34208
34389
|
console.warn(
|
|
34209
|
-
|
|
34390
|
+
[
|
|
34391
|
+
`Warning: This eval uses a shared workspace with ${workers} workers.`,
|
|
34392
|
+
"If the agent under test makes file edits, concurrent runs may corrupt each other.",
|
|
34393
|
+
"To limit concurrency, add this to your eval YAML:",
|
|
34394
|
+
"",
|
|
34395
|
+
" execution:",
|
|
34396
|
+
" workers: 1",
|
|
34397
|
+
"",
|
|
34398
|
+
"Or pass --workers 1 on the command line."
|
|
34399
|
+
].join("\n")
|
|
34210
34400
|
);
|
|
34211
34401
|
}
|
|
34212
|
-
|
|
34213
|
-
|
|
34214
|
-
|
|
34215
|
-
|
|
34216
|
-
|
|
34217
|
-
|
|
34218
|
-
|
|
34219
|
-
|
|
34220
|
-
|
|
34221
|
-
|
|
34222
|
-
|
|
34402
|
+
let sharedWorkspacePath;
|
|
34403
|
+
let sharedBaselineCommit;
|
|
34404
|
+
let beforeAllOutput;
|
|
34405
|
+
let poolManager;
|
|
34406
|
+
let poolSlot;
|
|
34407
|
+
const poolSlots = [];
|
|
34408
|
+
const availablePoolSlots = [];
|
|
34409
|
+
const poolSlotBaselines = /* @__PURE__ */ new Map();
|
|
34410
|
+
const hookExecutions = [];
|
|
34411
|
+
const poolMaxSlots = Math.min(configPoolMaxSlots ?? 10, 50);
|
|
34412
|
+
let staticMaterialised = false;
|
|
34413
|
+
const isYamlConfiguredPath = !cliWorkspacePath && !!yamlWorkspacePath;
|
|
34414
|
+
let repoManager;
|
|
34415
|
+
try {
|
|
34416
|
+
if (useStaticWorkspace && configuredStaticPath) {
|
|
34417
|
+
const dirExists = await stat8(configuredStaticPath).then(
|
|
34418
|
+
(s) => s.isDirectory(),
|
|
34419
|
+
() => false
|
|
34420
|
+
);
|
|
34421
|
+
const isEmpty = dirExists ? (await readdir8(configuredStaticPath)).length === 0 : false;
|
|
34422
|
+
if (isYamlConfiguredPath && (!dirExists || isEmpty)) {
|
|
34423
|
+
if (!dirExists) {
|
|
34424
|
+
await mkdir17(configuredStaticPath, { recursive: true });
|
|
34425
|
+
}
|
|
34426
|
+
if (workspaceTemplate) {
|
|
34427
|
+
await copyDirectoryRecursive(workspaceTemplate, configuredStaticPath);
|
|
34428
|
+
setupLog(`copied template into static workspace: ${configuredStaticPath}`);
|
|
34429
|
+
}
|
|
34430
|
+
staticMaterialised = true;
|
|
34431
|
+
setupLog(`materialised static workspace at: ${configuredStaticPath}`);
|
|
34432
|
+
} else {
|
|
34433
|
+
setupLog(`reusing existing static workspace: ${configuredStaticPath}`);
|
|
34434
|
+
}
|
|
34435
|
+
sharedWorkspacePath = configuredStaticPath;
|
|
34436
|
+
} else if (!isPerTestIsolation && usePool && suiteWorkspace?.repos) {
|
|
34437
|
+
const slotsNeeded = workers;
|
|
34438
|
+
setupLog(`acquiring ${slotsNeeded} workspace pool slot(s) (pool capacity: ${poolMaxSlots})`);
|
|
34439
|
+
poolManager = new WorkspacePoolManager(getWorkspacePoolRoot());
|
|
34440
|
+
const poolRepoManager = new RepoManager(verbose);
|
|
34441
|
+
for (let i = 0; i < slotsNeeded; i++) {
|
|
34442
|
+
const slot = await poolManager.acquireWorkspace({
|
|
34443
|
+
templatePath: workspaceTemplate,
|
|
34444
|
+
repos: suiteWorkspace.repos,
|
|
34445
|
+
maxSlots: poolMaxSlots,
|
|
34446
|
+
repoManager: poolRepoManager,
|
|
34447
|
+
poolReset: (workspaceClean === "full" ? "strict" : workspaceClean === "standard" ? "fast" : null) ?? "fast"
|
|
34448
|
+
});
|
|
34449
|
+
poolSlots.push(slot);
|
|
34450
|
+
setupLog(`pool slot ${i} acquired at: ${slot.path} (existing=${slot.isExisting})`);
|
|
34451
|
+
}
|
|
34452
|
+
if (slotsNeeded === 1) {
|
|
34453
|
+
poolSlot = poolSlots[0];
|
|
34454
|
+
sharedWorkspacePath = poolSlot.path;
|
|
34455
|
+
} else {
|
|
34456
|
+
availablePoolSlots.push(...poolSlots);
|
|
34457
|
+
}
|
|
34458
|
+
} else if (!isPerTestIsolation && workspaceTemplate) {
|
|
34459
|
+
setupLog(`creating shared workspace from template: ${workspaceTemplate}`);
|
|
34460
|
+
try {
|
|
34461
|
+
sharedWorkspacePath = await createTempWorkspace(workspaceTemplate, evalRunId, "shared");
|
|
34462
|
+
setupLog(`shared workspace created at: ${sharedWorkspacePath}`);
|
|
34463
|
+
} catch (error40) {
|
|
34464
|
+
const message = error40 instanceof Error ? error40.message : String(error40);
|
|
34465
|
+
throw new WorkspaceSetupError(`Failed to create shared workspace: ${message}`, {
|
|
34466
|
+
failureStage: "setup",
|
|
34467
|
+
failureReasonCode: "template_error",
|
|
34468
|
+
hookExecutions,
|
|
34469
|
+
cause: error40
|
|
34470
|
+
});
|
|
34471
|
+
}
|
|
34472
|
+
} else if (!isPerTestIsolation && (suiteWorkspace?.hooks || suiteWorkspace?.repos?.length)) {
|
|
34473
|
+
sharedWorkspacePath = getWorkspacePath(evalRunId, "shared");
|
|
34474
|
+
await mkdir17(sharedWorkspacePath, { recursive: true });
|
|
34475
|
+
setupLog(`created empty shared workspace at: ${sharedWorkspacePath}`);
|
|
34223
34476
|
}
|
|
34224
|
-
|
|
34477
|
+
if (suiteWorkspaceFile && sharedWorkspacePath) {
|
|
34478
|
+
const copiedWorkspaceFile = path39.join(sharedWorkspacePath, path39.basename(suiteWorkspaceFile));
|
|
34479
|
+
try {
|
|
34480
|
+
await stat8(copiedWorkspaceFile);
|
|
34481
|
+
suiteWorkspaceFile = copiedWorkspaceFile;
|
|
34482
|
+
} catch {
|
|
34483
|
+
}
|
|
34484
|
+
}
|
|
34485
|
+
const hasReposToMaterialize = !!suiteWorkspace?.repos?.length && !usePool && !isPerTestIsolation;
|
|
34486
|
+
const needsRepoMaterialisation = hasReposToMaterialize && (!useStaticWorkspace || staticMaterialised);
|
|
34487
|
+
const needsPerRepoCheck = hasReposToMaterialize && useStaticWorkspace && !staticMaterialised && isYamlConfiguredPath;
|
|
34488
|
+
repoManager = needsRepoMaterialisation || needsPerRepoCheck ? new RepoManager(verbose) : void 0;
|
|
34489
|
+
if (repoManager && sharedWorkspacePath && suiteWorkspace?.repos) {
|
|
34490
|
+
try {
|
|
34491
|
+
if (needsPerRepoCheck) {
|
|
34492
|
+
for (const repo of suiteWorkspace.repos) {
|
|
34493
|
+
if (!repo.path || !repo.repo) continue;
|
|
34494
|
+
const targetDir = path39.join(sharedWorkspacePath, repo.path);
|
|
34495
|
+
if (existsSync6(targetDir)) {
|
|
34496
|
+
setupLog(`reusing existing repo at: ${targetDir}`);
|
|
34497
|
+
continue;
|
|
34498
|
+
}
|
|
34499
|
+
setupLog(`materializing missing repo: ${repo.path}`);
|
|
34500
|
+
await repoManager.materialize(repo, sharedWorkspacePath);
|
|
34501
|
+
}
|
|
34502
|
+
} else {
|
|
34503
|
+
setupLog(
|
|
34504
|
+
`materializing ${suiteWorkspace.repos.length} shared repo(s) into ${sharedWorkspacePath}`
|
|
34505
|
+
);
|
|
34506
|
+
await repoManager.materializeAll(suiteWorkspace.repos, sharedWorkspacePath);
|
|
34507
|
+
}
|
|
34508
|
+
setupLog("shared repo materialization complete");
|
|
34509
|
+
} catch (error40) {
|
|
34510
|
+
const message = error40 instanceof Error ? error40.message : String(error40);
|
|
34511
|
+
if (sharedWorkspacePath && !useStaticWorkspace) {
|
|
34512
|
+
await cleanupWorkspace(sharedWorkspacePath).catch(() => {
|
|
34513
|
+
});
|
|
34514
|
+
}
|
|
34515
|
+
throw new WorkspaceSetupError(`Failed to materialize repos: ${message}`, {
|
|
34516
|
+
failureStage: "repo_setup",
|
|
34517
|
+
failureReasonCode: "clone_error",
|
|
34518
|
+
hookExecutions,
|
|
34519
|
+
cause: error40
|
|
34520
|
+
});
|
|
34521
|
+
}
|
|
34522
|
+
}
|
|
34523
|
+
const suiteDockerConfig = suiteWorkspace?.docker;
|
|
34524
|
+
if (suiteDockerConfig) {
|
|
34525
|
+
setupLog(`pulling Docker image: ${suiteDockerConfig.image}`);
|
|
34526
|
+
const { DockerWorkspaceProvider } = await import("./docker-workspace-RPPXBT27-B4AQHVWA.js");
|
|
34527
|
+
const dockerSetup = new DockerWorkspaceProvider(suiteDockerConfig);
|
|
34528
|
+
if (!await dockerSetup.isDockerAvailable()) {
|
|
34529
|
+
throw new Error(
|
|
34530
|
+
"Docker workspace configured but Docker CLI is not available. Install Docker and ensure it is running."
|
|
34531
|
+
);
|
|
34532
|
+
}
|
|
34533
|
+
await dockerSetup.pullImage();
|
|
34534
|
+
setupLog("Docker image pull complete");
|
|
34535
|
+
}
|
|
34536
|
+
if (suiteWorkspace?.env) {
|
|
34537
|
+
try {
|
|
34538
|
+
await runPreflightChecks(suiteWorkspace.env, sharedWorkspacePath ?? void 0, setupLog);
|
|
34539
|
+
setupLog("preflight checks passed");
|
|
34540
|
+
} catch (error40) {
|
|
34541
|
+
const message = error40 instanceof Error ? error40.message : String(error40);
|
|
34542
|
+
if (sharedWorkspacePath && !useStaticWorkspace) {
|
|
34543
|
+
await cleanupWorkspace(sharedWorkspacePath).catch(() => {
|
|
34544
|
+
});
|
|
34545
|
+
}
|
|
34546
|
+
throw new WorkspaceSetupError(message, {
|
|
34547
|
+
failureStage: "setup",
|
|
34548
|
+
failureReasonCode: "preflight_error",
|
|
34549
|
+
hookExecutions,
|
|
34550
|
+
cause: error40
|
|
34551
|
+
});
|
|
34552
|
+
}
|
|
34553
|
+
}
|
|
34554
|
+
const suiteHooksEnabled = hooksEnabled(suiteWorkspace);
|
|
34555
|
+
const suiteBeforeAllHook = suiteWorkspace?.hooks?.before_all;
|
|
34556
|
+
if (sharedWorkspacePath && suiteHooksEnabled && hasHookCommand(suiteBeforeAllHook)) {
|
|
34557
|
+
const beforeAllHook = suiteBeforeAllHook;
|
|
34558
|
+
const beforeAllCommand = (beforeAllHook.command ?? beforeAllHook.script ?? []).join(" ");
|
|
34559
|
+
setupLog(
|
|
34560
|
+
`running shared before_all in cwd=${beforeAllHook.cwd ?? evalDir} command=${beforeAllCommand}`
|
|
34561
|
+
);
|
|
34562
|
+
const scriptContext = {
|
|
34563
|
+
workspacePath: sharedWorkspacePath,
|
|
34564
|
+
testId: "__before_all__",
|
|
34565
|
+
evalRunId,
|
|
34566
|
+
evalDir,
|
|
34567
|
+
workspaceFileDir: suiteWorkspace?.workspaceFileDir
|
|
34568
|
+
};
|
|
34569
|
+
try {
|
|
34570
|
+
beforeAllOutput = await executeWorkspaceScript(
|
|
34571
|
+
toScriptConfig(beforeAllHook, "before_all", "suite workspace"),
|
|
34572
|
+
scriptContext
|
|
34573
|
+
);
|
|
34574
|
+
hookExecutions.push(
|
|
34575
|
+
hookExecution({
|
|
34576
|
+
scope: "workspace",
|
|
34577
|
+
name: "before_all",
|
|
34578
|
+
status: "success",
|
|
34579
|
+
testId: "__before_all__",
|
|
34580
|
+
workspacePath: sharedWorkspacePath,
|
|
34581
|
+
hook: beforeAllHook,
|
|
34582
|
+
output: beforeAllOutput
|
|
34583
|
+
})
|
|
34584
|
+
);
|
|
34585
|
+
setupLog("shared before_all completed");
|
|
34586
|
+
} catch (error40) {
|
|
34587
|
+
const message = error40 instanceof Error ? error40.message : String(error40);
|
|
34588
|
+
hookExecutions.push(
|
|
34589
|
+
hookExecution({
|
|
34590
|
+
scope: "workspace",
|
|
34591
|
+
name: "before_all",
|
|
34592
|
+
status: "failed",
|
|
34593
|
+
testId: "__before_all__",
|
|
34594
|
+
workspacePath: sharedWorkspacePath,
|
|
34595
|
+
hook: beforeAllHook,
|
|
34596
|
+
error: message
|
|
34597
|
+
})
|
|
34598
|
+
);
|
|
34599
|
+
if (sharedWorkspacePath && !useStaticWorkspace) {
|
|
34600
|
+
await cleanupWorkspace(sharedWorkspacePath).catch(() => {
|
|
34601
|
+
});
|
|
34602
|
+
}
|
|
34603
|
+
throw new WorkspaceSetupError(`before_all script failed: ${message}`, {
|
|
34604
|
+
failureStage: "setup",
|
|
34605
|
+
failureReasonCode: "script_error",
|
|
34606
|
+
hookExecutions,
|
|
34607
|
+
cause: error40
|
|
34608
|
+
});
|
|
34609
|
+
}
|
|
34610
|
+
}
|
|
34611
|
+
if (availablePoolSlots.length > 0 && suiteHooksEnabled && hasHookCommand(suiteBeforeAllHook)) {
|
|
34612
|
+
const beforeAllHook = suiteBeforeAllHook;
|
|
34613
|
+
for (const slot of availablePoolSlots) {
|
|
34614
|
+
setupLog(`running before_all on pool slot ${slot.index}`);
|
|
34615
|
+
const scriptContext = {
|
|
34616
|
+
workspacePath: slot.path,
|
|
34617
|
+
testId: "__before_all__",
|
|
34618
|
+
evalRunId,
|
|
34619
|
+
evalDir,
|
|
34620
|
+
workspaceFileDir: suiteWorkspace?.workspaceFileDir
|
|
34621
|
+
};
|
|
34622
|
+
try {
|
|
34623
|
+
const output = await executeWorkspaceScript(
|
|
34624
|
+
toScriptConfig(beforeAllHook, "before_all", "suite workspace"),
|
|
34625
|
+
scriptContext
|
|
34626
|
+
);
|
|
34627
|
+
if (!beforeAllOutput) beforeAllOutput = output;
|
|
34628
|
+
hookExecutions.push(
|
|
34629
|
+
hookExecution({
|
|
34630
|
+
scope: "workspace",
|
|
34631
|
+
name: "before_all",
|
|
34632
|
+
status: "success",
|
|
34633
|
+
testId: "__before_all__",
|
|
34634
|
+
workspacePath: slot.path,
|
|
34635
|
+
hook: beforeAllHook,
|
|
34636
|
+
output
|
|
34637
|
+
})
|
|
34638
|
+
);
|
|
34639
|
+
setupLog(`before_all completed on pool slot ${slot.index}`);
|
|
34640
|
+
} catch (error40) {
|
|
34641
|
+
const message = error40 instanceof Error ? error40.message : String(error40);
|
|
34642
|
+
hookExecutions.push(
|
|
34643
|
+
hookExecution({
|
|
34644
|
+
scope: "workspace",
|
|
34645
|
+
name: "before_all",
|
|
34646
|
+
status: "failed",
|
|
34647
|
+
testId: "__before_all__",
|
|
34648
|
+
workspacePath: slot.path,
|
|
34649
|
+
hook: beforeAllHook,
|
|
34650
|
+
error: message
|
|
34651
|
+
})
|
|
34652
|
+
);
|
|
34653
|
+
throw new WorkspaceSetupError(
|
|
34654
|
+
`before_all script failed on pool slot ${slot.index}: ${message}`,
|
|
34655
|
+
{
|
|
34656
|
+
failureStage: "setup",
|
|
34657
|
+
failureReasonCode: "script_error",
|
|
34658
|
+
hookExecutions,
|
|
34659
|
+
cause: error40
|
|
34660
|
+
}
|
|
34661
|
+
);
|
|
34662
|
+
}
|
|
34663
|
+
}
|
|
34664
|
+
}
|
|
34665
|
+
const targetBeforeAllHook = targetHooks?.before_all;
|
|
34666
|
+
if (sharedWorkspacePath && hasHookCommand(targetBeforeAllHook)) {
|
|
34667
|
+
const beforeAllCommand = (targetBeforeAllHook.command ?? []).join(" ");
|
|
34668
|
+
setupLog(`running target before_all command=${beforeAllCommand}`);
|
|
34669
|
+
const scriptContext = {
|
|
34670
|
+
workspacePath: sharedWorkspacePath,
|
|
34671
|
+
testId: "__target_before_all__",
|
|
34672
|
+
evalRunId,
|
|
34673
|
+
evalDir,
|
|
34674
|
+
workspaceFileDir: suiteWorkspace?.workspaceFileDir
|
|
34675
|
+
};
|
|
34676
|
+
try {
|
|
34677
|
+
await executeWorkspaceScript(
|
|
34678
|
+
toScriptConfig(targetBeforeAllHook, "before_all", "target hooks"),
|
|
34679
|
+
scriptContext
|
|
34680
|
+
);
|
|
34681
|
+
hookExecutions.push(
|
|
34682
|
+
hookExecution({
|
|
34683
|
+
scope: "target",
|
|
34684
|
+
name: "before_all",
|
|
34685
|
+
status: "success",
|
|
34686
|
+
testId: "__target_before_all__",
|
|
34687
|
+
workspacePath: sharedWorkspacePath,
|
|
34688
|
+
hook: targetBeforeAllHook
|
|
34689
|
+
})
|
|
34690
|
+
);
|
|
34691
|
+
setupLog("target before_all completed");
|
|
34692
|
+
} catch (error40) {
|
|
34693
|
+
const message = error40 instanceof Error ? error40.message : String(error40);
|
|
34694
|
+
hookExecutions.push(
|
|
34695
|
+
hookExecution({
|
|
34696
|
+
scope: "target",
|
|
34697
|
+
name: "before_all",
|
|
34698
|
+
status: "failed",
|
|
34699
|
+
testId: "__target_before_all__",
|
|
34700
|
+
workspacePath: sharedWorkspacePath,
|
|
34701
|
+
hook: targetBeforeAllHook,
|
|
34702
|
+
error: message
|
|
34703
|
+
})
|
|
34704
|
+
);
|
|
34705
|
+
if (sharedWorkspacePath && !useStaticWorkspace) {
|
|
34706
|
+
await cleanupWorkspace(sharedWorkspacePath).catch(() => {
|
|
34707
|
+
});
|
|
34708
|
+
}
|
|
34709
|
+
throw new WorkspaceSetupError(`target before_all hook failed: ${message}`, {
|
|
34710
|
+
failureStage: "setup",
|
|
34711
|
+
failureReasonCode: "script_error",
|
|
34712
|
+
hookExecutions,
|
|
34713
|
+
cause: error40
|
|
34714
|
+
});
|
|
34715
|
+
}
|
|
34716
|
+
}
|
|
34717
|
+
if (availablePoolSlots.length > 0 && hasHookCommand(targetBeforeAllHook)) {
|
|
34718
|
+
for (const slot of availablePoolSlots) {
|
|
34719
|
+
setupLog(`running target before_all on pool slot ${slot.index}`);
|
|
34720
|
+
const scriptContext = {
|
|
34721
|
+
workspacePath: slot.path,
|
|
34722
|
+
testId: "__target_before_all__",
|
|
34723
|
+
evalRunId,
|
|
34724
|
+
evalDir,
|
|
34725
|
+
workspaceFileDir: suiteWorkspace?.workspaceFileDir
|
|
34726
|
+
};
|
|
34727
|
+
try {
|
|
34728
|
+
await executeWorkspaceScript(
|
|
34729
|
+
toScriptConfig(targetBeforeAllHook, "before_all", "target hooks"),
|
|
34730
|
+
scriptContext
|
|
34731
|
+
);
|
|
34732
|
+
hookExecutions.push(
|
|
34733
|
+
hookExecution({
|
|
34734
|
+
scope: "target",
|
|
34735
|
+
name: "before_all",
|
|
34736
|
+
status: "success",
|
|
34737
|
+
testId: "__target_before_all__",
|
|
34738
|
+
workspacePath: slot.path,
|
|
34739
|
+
hook: targetBeforeAllHook
|
|
34740
|
+
})
|
|
34741
|
+
);
|
|
34742
|
+
} catch (error40) {
|
|
34743
|
+
const message = error40 instanceof Error ? error40.message : String(error40);
|
|
34744
|
+
hookExecutions.push(
|
|
34745
|
+
hookExecution({
|
|
34746
|
+
scope: "target",
|
|
34747
|
+
name: "before_all",
|
|
34748
|
+
status: "failed",
|
|
34749
|
+
testId: "__target_before_all__",
|
|
34750
|
+
workspacePath: slot.path,
|
|
34751
|
+
hook: targetBeforeAllHook,
|
|
34752
|
+
error: message
|
|
34753
|
+
})
|
|
34754
|
+
);
|
|
34755
|
+
throw new WorkspaceSetupError(
|
|
34756
|
+
`target before_all hook failed on pool slot ${slot.index}: ${message}`,
|
|
34757
|
+
{
|
|
34758
|
+
failureStage: "setup",
|
|
34759
|
+
failureReasonCode: "script_error",
|
|
34760
|
+
hookExecutions,
|
|
34761
|
+
cause: error40
|
|
34762
|
+
}
|
|
34763
|
+
);
|
|
34764
|
+
}
|
|
34765
|
+
}
|
|
34766
|
+
}
|
|
34767
|
+
if (sharedWorkspacePath) {
|
|
34768
|
+
try {
|
|
34769
|
+
sharedBaselineCommit = await initializeBaseline(sharedWorkspacePath);
|
|
34770
|
+
setupLog(`shared baseline initialized: ${sharedBaselineCommit}`);
|
|
34771
|
+
} catch (error40) {
|
|
34772
|
+
const message = error40 instanceof Error ? error40.message : String(error40);
|
|
34773
|
+
setupLog(`shared baseline initialization failed (file_changes unavailable): ${message}`);
|
|
34774
|
+
}
|
|
34775
|
+
}
|
|
34776
|
+
if (availablePoolSlots.length > 0) {
|
|
34777
|
+
for (const slot of availablePoolSlots) {
|
|
34778
|
+
try {
|
|
34779
|
+
const baseline = await initializeBaseline(slot.path);
|
|
34780
|
+
poolSlotBaselines.set(slot.path, baseline);
|
|
34781
|
+
setupLog(`pool slot ${slot.index} baseline initialized: ${baseline}`);
|
|
34782
|
+
} catch (error40) {
|
|
34783
|
+
const message = error40 instanceof Error ? error40.message : String(error40);
|
|
34784
|
+
setupLog(
|
|
34785
|
+
`pool slot ${slot.index} baseline initialization failed (file_changes unavailable): ${message}`
|
|
34786
|
+
);
|
|
34787
|
+
}
|
|
34788
|
+
}
|
|
34789
|
+
}
|
|
34790
|
+
return {
|
|
34791
|
+
...suiteWorkspace !== void 0 && { suiteWorkspace },
|
|
34792
|
+
...sharedWorkspacePath !== void 0 && { sharedWorkspacePath },
|
|
34793
|
+
...sharedBaselineCommit !== void 0 && { sharedBaselineCommit },
|
|
34794
|
+
...suiteWorkspaceFile !== void 0 && { suiteWorkspaceFile },
|
|
34795
|
+
...beforeAllOutput !== void 0 && { beforeAllOutput },
|
|
34796
|
+
...repoManager !== void 0 && { repoManager },
|
|
34797
|
+
...poolManager !== void 0 && { poolManager },
|
|
34798
|
+
...poolSlot !== void 0 && { poolSlot },
|
|
34799
|
+
poolSlots,
|
|
34800
|
+
availablePoolSlots,
|
|
34801
|
+
poolSlotBaselines,
|
|
34802
|
+
useStaticWorkspace,
|
|
34803
|
+
configuredMode,
|
|
34804
|
+
hookExecutions
|
|
34805
|
+
};
|
|
34806
|
+
} catch (error40) {
|
|
34807
|
+
await releasePoolSlots({ poolManager, poolSlot, poolSlots }).catch(() => {
|
|
34808
|
+
});
|
|
34809
|
+
throw error40;
|
|
34810
|
+
}
|
|
34811
|
+
}
|
|
34812
|
+
/**
 * Prepares the workspace for one eval case and runs the hooks that precede the test.
 *
 * When no shared workspace path is supplied, a per-case workspace is set up: it is
 * created from the case's resolved template, or as an empty directory when the case
 * declares hooks or repos; the case's repos are materialized into it; Agent Skills
 * files listed in the case metadata are copied in; and the case-level `before_all`
 * hook is run. In every mode the function then applies the `before_each` reset
 * (if one other than "none" is configured), runs the case `before_each` hook and the
 * target `before_each` hook, and finally settles on a baseline commit.
 *
 * @param {object} options
 * @param {object} options.evalCase  The eval case; `id`, `question`, `metadata` and `workspace` are read.
 * @param {string} [options.evalRunId]  Run id used to derive per-case workspace paths.
 * @param {string} [options.sharedWorkspacePath]  Existing shared workspace; when set, no per-case workspace is created.
 * @param {string} [options.sharedBaselineCommit]  Baseline of the shared workspace, reused unless a `before_each` hook ran.
 * @param {string} [options.suiteWorkspaceFile]  Fallback for the returned `caseWorkspaceFile`.
 * @param {object} [options.repoManager]  Used for the `before_each` reset when the case declares repos.
 * @param {string} [options.evalDir]  Passed to hook scripts through the script context.
 * @param {boolean} [options.cleanupWorkspaces]  When truthy, the workspace is cleaned up if the case `before_all` hook fails.
 * @param {object} [options.targetHooks]  Target-level hooks; only `before_each` is used here.
 * @param {boolean} [options.setupDebug]  Enables `[setup]` console output.
 * @returns {Promise<object>} Object with `caseWorkspaceFile`, `isSharedWorkspace`, `hookExecutions`, and
 *   `workspacePath`, `beforeAllOutput`, `beforeEachOutput`, `baselineCommit` when they are defined.
 * @throws {WorkspaceSetupError} When workspace creation, repo materialization, file copying,
 *   the reset, or any hook fails.
 */
async function prepareEvalCaseWorkspace(options) {
  const {
    evalCase,
    evalRunId,
    sharedWorkspacePath,
    sharedBaselineCommit,
    suiteWorkspaceFile,
    repoManager,
    evalDir,
    cleanupWorkspaces: forceCleanup,
    targetHooks,
    setupDebug
  } = options;

  const hookExecutions = [];
  const isSharedWorkspace = !!sharedWorkspacePath;
  const caseHooksEnabled = hooksEnabled(evalCase.workspace);
  let workspacePath = sharedWorkspacePath;
  let caseWorkspaceFile;
  let beforeAllOutput;
  let beforeEachOutput;

  // Normalises anything that was thrown into a printable message.
  const errorText = (thrown) => {
    if (thrown instanceof Error) {
      return thrown.message;
    }
    return String(thrown);
  };

  // Every failure raised here carries the hook executions recorded so far.
  const setupFailure = (text, failureStage, failureReasonCode, cause) =>
    new WorkspaceSetupError(text, {
      failureStage,
      failureReasonCode,
      hookExecutions,
      cause
    });

  // Context handed to each per-case hook script; reads the current workspacePath.
  const caseScriptContext = () => ({
    workspacePath,
    testId: evalCase.id,
    evalRunId: evalRunId ?? "",
    caseInput: evalCase.question,
    caseMetadata: evalCase.metadata,
    evalDir,
    workspaceFileDir: evalCase.workspace?.workspaceFileDir
  });

  // The text is built lazily so nothing is formatted unless debugging is on.
  const debug = (buildText) => {
    if (setupDebug) {
      console.log(buildText());
    }
  };

  if (!isSharedWorkspace) {
    // 1. Create the workspace from the case template, if the case has one.
    const resolvedTemplate = await resolveWorkspaceTemplate(evalCase.workspace?.template);
    const templateDir = resolvedTemplate?.dir;
    caseWorkspaceFile = resolvedTemplate?.workspaceFile;
    if (templateDir && evalRunId) {
      try {
        workspacePath = await createTempWorkspace(templateDir, evalRunId, evalCase.id);
      } catch (err) {
        throw setupFailure(
          `Failed to create workspace: ${errorText(err)}`,
          "setup",
          "template_error",
          err
        );
      }
      if (caseWorkspaceFile && workspacePath) {
        // Point at the copy inside the new workspace when one exists there.
        const copiedFile = path39.join(workspacePath, path39.basename(caseWorkspaceFile));
        const copyExists = await stat8(copiedFile).then(
          () => true,
          () => false
        );
        if (copyExists) {
          caseWorkspaceFile = copiedFile;
        }
      }
    }

    // 2. No template: hooks or repos still need a directory to work in.
    const wantsWorkspace = evalCase.workspace?.hooks || evalCase.workspace?.repos?.length;
    if (!workspacePath && wantsWorkspace && evalRunId) {
      workspacePath = getWorkspacePath(evalRunId, evalCase.id);
      await mkdir17(workspacePath, { recursive: true });
    }

    // 3. Materialize the repos declared on the case.
    if (evalCase.workspace?.repos?.length && workspacePath) {
      const perCaseRepoManager = new RepoManager(setupDebug);
      try {
        debug(
          () => `[setup] test=${evalCase.id} materializing ${evalCase.workspace.repos.length} per-test repo(s) into ${workspacePath}`
        );
        await perCaseRepoManager.materializeAll(evalCase.workspace.repos, workspacePath);
        debug(() => `[setup] test=${evalCase.id} per-test repo materialization complete`);
      } catch (err) {
        throw setupFailure(
          `Failed to materialize repos: ${errorText(err)}`,
          "repo_setup",
          "clone_error",
          err
        );
      }
    }

    // 4. Copy the Agent Skills files listed in the case metadata.
    const skillFiles = evalCase.metadata?.agent_skills_files;
    if (workspacePath && skillFiles) {
      const baseDir = evalCase.metadata.agent_skills_base_dir;
      if (baseDir && skillFiles.length > 0) {
        for (const relPath of skillFiles) {
          const srcPath = path39.resolve(baseDir, relPath);
          const destPath = path39.resolve(workspacePath, relPath);
          try {
            await mkdir17(path39.dirname(destPath), { recursive: true });
            await copyFile2(srcPath, destPath);
          } catch (err) {
            throw setupFailure(
              `Agent Skills eval file not found: ${relPath} (resolved from ${baseDir}): ${errorText(err)}`,
              "setup",
              "file_copy_error",
              err
            );
          }
        }
      }
    }

    // 5. Case-level before_all hook (only runs for per-case workspaces).
    const beforeAllHook = evalCase.workspace?.hooks?.before_all;
    if (workspacePath && caseHooksEnabled && hasHookCommand(beforeAllHook)) {
      const beforeAllCommand = (beforeAllHook.command ?? beforeAllHook.script ?? []).join(" ");
      debug(
        () => `[setup] test=${evalCase.id} running before_all in cwd=${beforeAllHook.cwd ?? evalDir} command=${beforeAllCommand}`
      );
      const scriptContext = caseScriptContext();
      try {
        beforeAllOutput = await executeWorkspaceScript(
          toScriptConfig(beforeAllHook, "before_all", `test '${evalCase.id}'`),
          scriptContext
        );
        hookExecutions.push(
          hookExecution({
            scope: "workspace",
            name: "before_all",
            status: "success",
            testId: evalCase.id,
            workspacePath,
            hook: beforeAllHook,
            output: beforeAllOutput
          })
        );
        debug(() => `[setup] test=${evalCase.id} before_all completed`);
      } catch (err) {
        const message = errorText(err);
        hookExecutions.push(
          hookExecution({
            scope: "workspace",
            name: "before_all",
            status: "failed",
            testId: evalCase.id,
            workspacePath,
            hook: beforeAllHook,
            error: message
          })
        );
        if (forceCleanup && workspacePath) {
          // Best effort: a cleanup failure must not hide the hook failure.
          await cleanupWorkspace(workspacePath).catch(() => {
          });
        }
        throw setupFailure(`before_all script failed: ${message}`, "setup", "script_error", err);
      }
    }
  }

  // Reset the workspace before the test when a reset mode other than "none" is set.
  const resetMode = evalCase.workspace?.hooks?.before_each?.reset;
  if (caseHooksEnabled && workspacePath && resetMode && resetMode !== "none") {
    try {
      const caseRepos = evalCase.workspace.repos;
      if (repoManager && caseRepos?.length) {
        await repoManager.reset(caseRepos, workspacePath, resetMode);
      } else {
        await resetWorkspaceRoot(workspacePath, resetMode, sharedBaselineCommit);
      }
    } catch (err) {
      throw setupFailure(`before_each reset failed: ${errorText(err)}`, "setup", "script_error", err);
    }
  }

  // A before_each hook that ran successfully means the shared baseline is not reused.
  let needsFreshBaseline = false;

  const beforeEachHook = evalCase.workspace?.hooks?.before_each;
  if (workspacePath && caseHooksEnabled && hasHookCommand(beforeEachHook)) {
    const scriptContext = caseScriptContext();
    try {
      beforeEachOutput = await executeWorkspaceScript(
        toScriptConfig(beforeEachHook, "before_each", `test '${evalCase.id}'`),
        scriptContext
      );
      hookExecutions.push(
        hookExecution({
          scope: "workspace",
          name: "before_each",
          status: "success",
          testId: evalCase.id,
          workspacePath,
          hook: beforeEachHook,
          output: beforeEachOutput
        })
      );
      needsFreshBaseline = true;
    } catch (err) {
      const message = errorText(err);
      hookExecutions.push(
        hookExecution({
          scope: "workspace",
          name: "before_each",
          status: "failed",
          testId: evalCase.id,
          workspacePath,
          hook: beforeEachHook,
          error: message
        })
      );
      throw setupFailure(`before_each script failed: ${message}`, "setup", "script_error", err);
    }
  }

  const targetBeforeEachHook = targetHooks?.before_each;
  if (workspacePath && hasHookCommand(targetBeforeEachHook)) {
    const scriptContext = caseScriptContext();
    try {
      await executeWorkspaceScript(
        toScriptConfig(targetBeforeEachHook, "before_each", `target hook for '${evalCase.id}'`),
        scriptContext
      );
      hookExecutions.push(
        hookExecution({
          scope: "target",
          name: "before_each",
          status: "success",
          testId: evalCase.id,
          workspacePath,
          hook: targetBeforeEachHook
        })
      );
      needsFreshBaseline = true;
    } catch (err) {
      const message = errorText(err);
      hookExecutions.push(
        hookExecution({
          scope: "target",
          name: "before_each",
          status: "failed",
          testId: evalCase.id,
          workspacePath,
          hook: targetBeforeEachHook,
          error: message
        })
      );
      throw setupFailure(`target before_each hook failed: ${message}`, "setup", "script_error", err);
    }
  }

  let baselineCommit;
  if (!needsFreshBaseline) {
    baselineCommit = sharedBaselineCommit;
  }
  if (!baselineCommit && workspacePath) {
    try {
      baselineCommit = await initializeBaseline(workspacePath);
    } catch (err) {
      // A missing baseline is tolerated; it is only reported when debugging.
      const message = errorText(err);
      if (setupDebug) {
        console.warn(`[setup] test=${evalCase.id} baseline initialization failed: ${message}`);
      }
    }
  }

  // Optional fields are added only when defined; key order matches the original literal.
  const prepared = {};
  if (workspacePath !== void 0) {
    prepared.workspacePath = workspacePath;
  }
  prepared.caseWorkspaceFile = caseWorkspaceFile ?? suiteWorkspaceFile;
  if (beforeAllOutput !== void 0) {
    prepared.beforeAllOutput = beforeAllOutput;
  }
  if (beforeEachOutput !== void 0) {
    prepared.beforeEachOutput = beforeEachOutput;
  }
  if (baselineCommit !== void 0) {
    prepared.baselineCommit = baselineCommit;
  }
  prepared.isSharedWorkspace = isSharedWorkspace;
  prepared.hookExecutions = hookExecutions;
  return prepared;
}
|
|
35117
|
+
/**
 * Checks that the external dependencies an eval declares are available.
 *
 * Each entry of `env.required_commands` is probed with `where` on Windows and with
 * `command -v` (through `sh`) elsewhere; each entry of `env.required_python_modules`
 * is imported by a `python3` child process. Every probe runs even after an earlier
 * one has failed, so the thrown error lists all missing dependencies at once.
 *
 * The configured names are handed to the child processes as separate arguments
 * and are never spliced into the shell or Python source text, so a name that
 * contains metacharacters is looked up literally instead of being executed.
 *
 * @param {{ required_commands?: string[], required_python_modules?: string[] }} env
 *   Declared requirements; either list may be absent.
 * @param {string | undefined} cwd  Working directory for the probe processes.
 * @param {(message: string) => void} log  Called once per probe with a progress line.
 * @returns {Promise<void>} Resolves when every dependency was found.
 * @throws {Error} When at least one command or Python module is missing.
 */
async function runPreflightChecks(env, cwd, log) {
  const missing = [];

  for (const cmd of env.required_commands ?? []) {
    log(`preflight: checking command "${cmd}"`);
    try {
      if (process.platform === "win32") {
        await execFileAsync2("where", [cmd], { cwd });
      } else {
        // The name arrives as $1 (the "sh" entry is $0), so the shell never parses it.
        await execFileAsync2("sh", ["-c", 'command -v "$1"', "sh", cmd], { cwd });
      }
    } catch {
      // Any probe failure is reported as a missing command.
      missing.push(`command: ${cmd}`);
    }
  }

  for (const mod of env.required_python_modules ?? []) {
    log(`preflight: checking Python module "${mod}"`);
    try {
      // The module name is read from sys.argv[1] rather than embedded in the code.
      await execFileAsync2(
        "python3",
        ["-c", "import importlib, sys; importlib.import_module(sys.argv[1])", mod],
        { cwd }
      );
    } catch {
      // Any probe failure is reported as a missing module.
      missing.push(`python module: ${mod}`);
    }
  }

  if (missing.length > 0) {
    throw new Error(
      [
        "Preflight checks failed \u2014 missing dependencies:",
        ...missing.map((m) => ` \u2022 ${m}`),
        "",
        "Install the missing dependencies before running this eval."
      ].join("\n")
    );
  }
}
|
|
34228
35148
|
function flattenInputMessages(messages) {
|
|
34229
35149
|
return messages.flatMap((message) => extractContentSegments(message.content));
|
|
@@ -34305,7 +35225,7 @@ async function loadTestsFromAgentSkills(filePath) {
|
|
|
34305
35225
|
} catch {
|
|
34306
35226
|
throw new Error(`Invalid Agent Skills evals.json: failed to parse JSON in '${filePath}'`);
|
|
34307
35227
|
}
|
|
34308
|
-
return parseAgentSkillsEvals(parsed, filePath,
|
|
35228
|
+
return parseAgentSkillsEvals(parsed, filePath, path40.dirname(path40.resolve(filePath)));
|
|
34309
35229
|
}
|
|
34310
35230
|
function parseAgentSkillsEvals(parsed, source = "evals.json", baseDir) {
|
|
34311
35231
|
if (!isAgentSkillsFormat(parsed)) {
|
|
@@ -34343,7 +35263,7 @@ function parseAgentSkillsEvals(parsed, source = "evals.json", baseDir) {
|
|
|
34343
35263
|
if (baseDir) {
|
|
34344
35264
|
metadata.agent_skills_base_dir = baseDir;
|
|
34345
35265
|
for (const file2 of evalCase.files) {
|
|
34346
|
-
filePaths.push(
|
|
35266
|
+
filePaths.push(path40.resolve(baseDir, file2));
|
|
34347
35267
|
}
|
|
34348
35268
|
}
|
|
34349
35269
|
}
|
|
@@ -34379,15 +35299,15 @@ function resolveToAbsolutePath(candidate) {
|
|
|
34379
35299
|
if (candidate.startsWith("file:")) {
|
|
34380
35300
|
return fileURLToPath4(candidate);
|
|
34381
35301
|
}
|
|
34382
|
-
return
|
|
35302
|
+
return path41.resolve(candidate);
|
|
34383
35303
|
}
|
|
34384
35304
|
throw new TypeError("Unsupported repoRoot value. Expected string or URL.");
|
|
34385
35305
|
}
|
|
34386
35306
|
function buildDirectoryChain2(filePath, repoRoot) {
|
|
34387
35307
|
const directories = [];
|
|
34388
35308
|
const seen = /* @__PURE__ */ new Set();
|
|
34389
|
-
const boundary =
|
|
34390
|
-
let current =
|
|
35309
|
+
const boundary = path41.resolve(repoRoot);
|
|
35310
|
+
let current = path41.resolve(path41.dirname(filePath));
|
|
34391
35311
|
while (current !== void 0) {
|
|
34392
35312
|
if (!seen.has(current)) {
|
|
34393
35313
|
directories.push(current);
|
|
@@ -34396,7 +35316,7 @@ function buildDirectoryChain2(filePath, repoRoot) {
|
|
|
34396
35316
|
if (current === boundary) {
|
|
34397
35317
|
break;
|
|
34398
35318
|
}
|
|
34399
|
-
const parent =
|
|
35319
|
+
const parent = path41.dirname(current);
|
|
34400
35320
|
if (parent === current) {
|
|
34401
35321
|
break;
|
|
34402
35322
|
}
|
|
@@ -34410,16 +35330,16 @@ function buildDirectoryChain2(filePath, repoRoot) {
|
|
|
34410
35330
|
function buildSearchRoots2(evalPath, repoRoot) {
|
|
34411
35331
|
const uniqueRoots = [];
|
|
34412
35332
|
const addRoot = (root) => {
|
|
34413
|
-
const normalized =
|
|
35333
|
+
const normalized = path41.resolve(root);
|
|
34414
35334
|
if (!uniqueRoots.includes(normalized)) {
|
|
34415
35335
|
uniqueRoots.push(normalized);
|
|
34416
35336
|
}
|
|
34417
35337
|
};
|
|
34418
|
-
let currentDir =
|
|
35338
|
+
let currentDir = path41.dirname(evalPath);
|
|
34419
35339
|
let reachedBoundary = false;
|
|
34420
35340
|
while (!reachedBoundary) {
|
|
34421
35341
|
addRoot(currentDir);
|
|
34422
|
-
const parentDir =
|
|
35342
|
+
const parentDir = path41.dirname(currentDir);
|
|
34423
35343
|
if (currentDir === repoRoot || parentDir === currentDir) {
|
|
34424
35344
|
reachedBoundary = true;
|
|
34425
35345
|
} else {
|
|
@@ -34437,16 +35357,16 @@ function trimLeadingSeparators2(value) {
|
|
|
34437
35357
|
async function resolveFileReference3(rawValue, searchRoots) {
|
|
34438
35358
|
const displayPath = trimLeadingSeparators2(rawValue);
|
|
34439
35359
|
const potentialPaths = [];
|
|
34440
|
-
if (
|
|
34441
|
-
potentialPaths.push(
|
|
35360
|
+
if (path41.isAbsolute(rawValue)) {
|
|
35361
|
+
potentialPaths.push(path41.normalize(rawValue));
|
|
34442
35362
|
}
|
|
34443
35363
|
for (const base of searchRoots) {
|
|
34444
|
-
potentialPaths.push(
|
|
35364
|
+
potentialPaths.push(path41.resolve(base, displayPath));
|
|
34445
35365
|
}
|
|
34446
35366
|
const attempted = [];
|
|
34447
35367
|
const seen = /* @__PURE__ */ new Set();
|
|
34448
35368
|
for (const candidate of potentialPaths) {
|
|
34449
|
-
const absoluteCandidate =
|
|
35369
|
+
const absoluteCandidate = path41.resolve(candidate);
|
|
34450
35370
|
if (seen.has(absoluteCandidate)) {
|
|
34451
35371
|
continue;
|
|
34452
35372
|
}
|
|
@@ -34467,9 +35387,9 @@ var DEFAULT_EVAL_PATTERNS = [
|
|
|
34467
35387
|
];
|
|
34468
35388
|
async function loadConfig(evalFilePath, repoRoot) {
|
|
34469
35389
|
const directories = buildDirectoryChain2(evalFilePath, repoRoot);
|
|
34470
|
-
const globalConfigPath =
|
|
35390
|
+
const globalConfigPath = path422.join(getAgentvConfigDir(), "config.yaml");
|
|
34471
35391
|
for (const directory of directories) {
|
|
34472
|
-
const configPath2 =
|
|
35392
|
+
const configPath2 = path422.join(directory, ".agentv", "config.yaml");
|
|
34473
35393
|
if (!await fileExists3(configPath2)) {
|
|
34474
35394
|
continue;
|
|
34475
35395
|
}
|
|
@@ -35020,8 +35940,8 @@ function isTemplateReference(value) {
|
|
|
35020
35940
|
}
|
|
35021
35941
|
async function resolveAssertionTemplateReference(include, searchRoots) {
|
|
35022
35942
|
const templateCandidates = isTemplateReference(include) ? [
|
|
35023
|
-
|
|
35024
|
-
|
|
35943
|
+
path43.join(".agentv", "templates", `${include}.yaml`),
|
|
35944
|
+
path43.join(".agentv", "templates", `${include}.yml`)
|
|
35025
35945
|
] : [include];
|
|
35026
35946
|
const attempted = [];
|
|
35027
35947
|
for (const candidate of templateCandidates) {
|
|
@@ -35074,10 +35994,10 @@ ${resolved.attempted.map((attempt) => ` Tried: ${attempt}`).join("\n")}` : "";
|
|
|
35074
35994
|
`Invalid assertion template file in '${evalId}': ${resolved.resolvedPath} is missing a top-level assertions array`
|
|
35075
35995
|
);
|
|
35076
35996
|
}
|
|
35077
|
-
const templateDir =
|
|
35997
|
+
const templateDir = path43.dirname(resolved.resolvedPath);
|
|
35078
35998
|
const nestedSearchRoots = [
|
|
35079
35999
|
templateDir,
|
|
35080
|
-
...searchRoots.filter((root) =>
|
|
36000
|
+
...searchRoots.filter((root) => path43.resolve(root) !== templateDir)
|
|
35081
36001
|
];
|
|
35082
36002
|
return await expandGraderEntries(assertions, nestedSearchRoots, evalId, {
|
|
35083
36003
|
depth: nextDepth,
|
|
@@ -35138,7 +36058,7 @@ async function collectAssertionTemplateReferencesFromValue(value, searchRoots, e
|
|
|
35138
36058
|
references.push({
|
|
35139
36059
|
kind: "assertion_template",
|
|
35140
36060
|
displayPath: resolved.displayPath,
|
|
35141
|
-
...resolved.resolvedPath ? { resolvedPath:
|
|
36061
|
+
...resolved.resolvedPath ? { resolvedPath: path43.resolve(resolved.resolvedPath) } : {}
|
|
35142
36062
|
});
|
|
35143
36063
|
if (resolved.resolvedPath) {
|
|
35144
36064
|
if (includeContext.chain.includes(resolved.resolvedPath)) {
|
|
@@ -35148,10 +36068,10 @@ async function collectAssertionTemplateReferencesFromValue(value, searchRoots, e
|
|
|
35148
36068
|
const content = await readFile15(resolved.resolvedPath, "utf8");
|
|
35149
36069
|
const parsed = interpolateEnv(parseYamlValue(content), process.env);
|
|
35150
36070
|
if (isJsonObject2(parsed) && Array.isArray(parsed.assertions)) {
|
|
35151
|
-
const templateDir =
|
|
36071
|
+
const templateDir = path43.dirname(resolved.resolvedPath);
|
|
35152
36072
|
const nestedSearchRoots = [
|
|
35153
36073
|
templateDir,
|
|
35154
|
-
...searchRoots.filter((root) =>
|
|
36074
|
+
...searchRoots.filter((root) => path43.resolve(root) !== templateDir)
|
|
35155
36075
|
];
|
|
35156
36076
|
references.push(
|
|
35157
36077
|
...await collectAssertionTemplateReferencesFromValue(
|
|
@@ -35337,7 +36257,7 @@ async function parseGraderList(candidateEvaluators, searchRoots, evalId, default
|
|
|
35337
36257
|
if (cwd) {
|
|
35338
36258
|
const resolved = await resolveFileReference3(cwd, searchRoots);
|
|
35339
36259
|
if (resolved.resolvedPath) {
|
|
35340
|
-
resolvedCwd =
|
|
36260
|
+
resolvedCwd = path43.resolve(resolved.resolvedPath);
|
|
35341
36261
|
} else {
|
|
35342
36262
|
logWarning2(
|
|
35343
36263
|
`Code-grader evaluator '${name}' in '${evalId}': cwd not found (${resolved.displayPath})`,
|
|
@@ -35523,7 +36443,7 @@ async function parseGraderList(candidateEvaluators, searchRoots, evalId, default
|
|
|
35523
36443
|
aggregatorPrompt = fileRef;
|
|
35524
36444
|
const resolved = await resolveFileReference3(fileRef, searchRoots);
|
|
35525
36445
|
if (resolved.resolvedPath) {
|
|
35526
|
-
promptPath2 =
|
|
36446
|
+
promptPath2 = path43.resolve(resolved.resolvedPath);
|
|
35527
36447
|
} else {
|
|
35528
36448
|
throw new Error(
|
|
35529
36449
|
`Composite aggregator in '${evalId}': prompt file not found: ${resolved.displayPath}`
|
|
@@ -36203,7 +37123,7 @@ async function parseGraderList(candidateEvaluators, searchRoots, evalId, default
|
|
|
36203
37123
|
const commandPath = commandArray[commandArray.length - 1];
|
|
36204
37124
|
const resolved = await resolveFileReference3(commandPath, searchRoots);
|
|
36205
37125
|
if (resolved.resolvedPath) {
|
|
36206
|
-
resolvedPromptScript = [...commandArray.slice(0, -1),
|
|
37126
|
+
resolvedPromptScript = [...commandArray.slice(0, -1), path43.resolve(resolved.resolvedPath)];
|
|
36207
37127
|
} else {
|
|
36208
37128
|
throw new Error(
|
|
36209
37129
|
`Grader '${name}' in '${evalId}': prompt command file not found: ${resolved.displayPath}`
|
|
@@ -36218,7 +37138,7 @@ async function parseGraderList(candidateEvaluators, searchRoots, evalId, default
|
|
|
36218
37138
|
prompt = fileRef;
|
|
36219
37139
|
const resolved = await resolveFileReference3(fileRef, searchRoots);
|
|
36220
37140
|
if (resolved.resolvedPath) {
|
|
36221
|
-
promptPath =
|
|
37141
|
+
promptPath = path43.resolve(resolved.resolvedPath);
|
|
36222
37142
|
try {
|
|
36223
37143
|
await validateCustomPromptContent(promptPath);
|
|
36224
37144
|
} catch (error40) {
|
|
@@ -36376,7 +37296,7 @@ async function parsePreprocessors(rawValue, searchRoots, evaluatorName, evalId)
|
|
|
36376
37296
|
preprocessors.push({
|
|
36377
37297
|
type,
|
|
36378
37298
|
command,
|
|
36379
|
-
resolvedCommand: [...command.slice(0, -1),
|
|
37299
|
+
resolvedCommand: [...command.slice(0, -1), path43.resolve(resolved.resolvedPath)]
|
|
36380
37300
|
});
|
|
36381
37301
|
}
|
|
36382
37302
|
return preprocessors;
|
|
@@ -36471,10 +37391,10 @@ async function resolveOptionalCommandSource(command, searchRoots) {
|
|
|
36471
37391
|
return void 0;
|
|
36472
37392
|
}
|
|
36473
37393
|
const resolved = await resolveFileReference3(candidate, searchRoots);
|
|
36474
|
-
return resolved.resolvedPath ?
|
|
37394
|
+
return resolved.resolvedPath ? path43.resolve(resolved.resolvedPath) : void 0;
|
|
36475
37395
|
}
|
|
36476
37396
|
function looksLikeFilePath(value) {
|
|
36477
|
-
return
|
|
37397
|
+
return path43.isAbsolute(value) || value.startsWith(".") || value.includes("/") || value.includes("\\") || /\.[cm]?[jt]sx?$|\.py$|\.sh$|\.bash$|\.rb$|\.go$|\.rs$/i.test(value);
|
|
36478
37398
|
}
|
|
36479
37399
|
function parseCommandToArgv(command) {
|
|
36480
37400
|
if (process.platform === "win32") {
|
|
@@ -36862,7 +37782,7 @@ var IMAGE_MEDIA_TYPES = {
|
|
|
36862
37782
|
".bmp": "image/bmp"
|
|
36863
37783
|
};
|
|
36864
37784
|
function detectImageMediaType(filePath) {
|
|
36865
|
-
const ext =
|
|
37785
|
+
const ext = path44.extname(filePath).toLowerCase();
|
|
36866
37786
|
return IMAGE_MEDIA_TYPES[ext];
|
|
36867
37787
|
}
|
|
36868
37788
|
var ANSI_YELLOW3 = "\x1B[33m";
|
|
@@ -36926,7 +37846,7 @@ async function processMessages(options) {
|
|
|
36926
37846
|
...cloneJsonObject(rawSegment),
|
|
36927
37847
|
path: displayPath,
|
|
36928
37848
|
text: fileContent,
|
|
36929
|
-
resolvedPath:
|
|
37849
|
+
resolvedPath: path44.resolve(resolvedPath)
|
|
36930
37850
|
});
|
|
36931
37851
|
if (verbose) {
|
|
36932
37852
|
const label = messageType === "input" ? "[File]" : "[Expected Output File]";
|
|
@@ -37050,7 +37970,7 @@ async function processExpectedMessages(options) {
|
|
|
37050
37970
|
type: "file",
|
|
37051
37971
|
path: displayPath,
|
|
37052
37972
|
text: fileContent,
|
|
37053
|
-
resolvedPath:
|
|
37973
|
+
resolvedPath: path44.resolve(resolvedPath)
|
|
37054
37974
|
});
|
|
37055
37975
|
if (verbose) {
|
|
37056
37976
|
console.log(` [Expected Output File] Found: ${displayPath}`);
|
|
@@ -37196,7 +38116,7 @@ function matchesFilter(id, filter) {
|
|
|
37196
38116
|
return typeof filter === "string" ? micromatch.isMatch(id, filter) : filter.some((pattern) => micromatch.isMatch(id, pattern));
|
|
37197
38117
|
}
|
|
37198
38118
|
function detectFormat(filePath) {
|
|
37199
|
-
const ext =
|
|
38119
|
+
const ext = path45.extname(filePath).toLowerCase();
|
|
37200
38120
|
if (ext === ".jsonl") return "jsonl";
|
|
37201
38121
|
if (ext === ".yaml" || ext === ".yml") return "yaml";
|
|
37202
38122
|
if (ext === ".json") return "agent-skills-json";
|
|
@@ -37206,9 +38126,9 @@ function detectFormat(filePath) {
|
|
|
37206
38126
|
);
|
|
37207
38127
|
}
|
|
37208
38128
|
async function loadSidecarMetadata(jsonlPath, verbose) {
|
|
37209
|
-
const dir =
|
|
37210
|
-
const base =
|
|
37211
|
-
const sidecarPath =
|
|
38129
|
+
const dir = path45.dirname(jsonlPath);
|
|
38130
|
+
const base = path45.basename(jsonlPath, ".jsonl");
|
|
38131
|
+
const sidecarPath = path45.join(dir, `${base}.yaml`);
|
|
37212
38132
|
if (!await fileExists3(sidecarPath)) {
|
|
37213
38133
|
if (verbose) {
|
|
37214
38134
|
logWarning4(`Sidecar metadata file not found: ${sidecarPath} (using defaults)`);
|
|
@@ -37257,13 +38177,13 @@ function parseJsonlContent(content, filePath) {
|
|
|
37257
38177
|
async function loadTestsFromJsonl(evalFilePath, repoRoot, options) {
|
|
37258
38178
|
const verbose = options?.verbose ?? false;
|
|
37259
38179
|
const filterPattern = options?.filter;
|
|
37260
|
-
const absoluteTestPath =
|
|
38180
|
+
const absoluteTestPath = path45.resolve(evalFilePath);
|
|
37261
38181
|
const repoRootPath = resolveToAbsolutePath(repoRoot);
|
|
37262
38182
|
const searchRoots = buildSearchRoots2(absoluteTestPath, repoRootPath);
|
|
37263
38183
|
const sidecar = await loadSidecarMetadata(absoluteTestPath, verbose);
|
|
37264
38184
|
const rawFile = await readFile17(absoluteTestPath, "utf8");
|
|
37265
38185
|
const rawCases = parseJsonlContent(rawFile, evalFilePath);
|
|
37266
|
-
const fallbackSuiteName =
|
|
38186
|
+
const fallbackSuiteName = path45.basename(absoluteTestPath, ".jsonl") || "eval";
|
|
37267
38187
|
const suiteName = sidecar.name && sidecar.name.trim().length > 0 ? sidecar.name : fallbackSuiteName;
|
|
37268
38188
|
const globalEvaluator = coerceEvaluator(sidecar.evaluator, "sidecar") ?? "llm-grader";
|
|
37269
38189
|
const globalExecution = sidecar.execution;
|
|
@@ -37660,7 +38580,7 @@ function interpolateRawEvalCase(raw, vars) {
|
|
|
37660
38580
|
}
|
|
37661
38581
|
async function readTestSuiteMetadata(testFilePath) {
|
|
37662
38582
|
try {
|
|
37663
|
-
const absolutePath =
|
|
38583
|
+
const absolutePath = path46.resolve(testFilePath);
|
|
37664
38584
|
const content = await readFile18(absolutePath, "utf8");
|
|
37665
38585
|
const parsed = interpolateEnv(parseYamlValue(content), process.env);
|
|
37666
38586
|
if (!isJsonObject(parsed)) {
|
|
@@ -37685,7 +38605,7 @@ async function loadTestSuite(evalFilePath, repoRoot, options) {
|
|
|
37685
38605
|
return { tests: await loadTestsFromAgentSkills(evalFilePath) };
|
|
37686
38606
|
}
|
|
37687
38607
|
if (format === "typescript") {
|
|
37688
|
-
const { loadTsEvalSuite: loadTsEvalSuite2 } = await import("./ts-eval-loader-
|
|
38608
|
+
const { loadTsEvalSuite: loadTsEvalSuite2 } = await import("./ts-eval-loader-ZVL6CGTE-TZYZX3QS.js");
|
|
37689
38609
|
return loadTsEvalSuite2(evalFilePath, resolveToAbsolutePath(repoRoot), options);
|
|
37690
38610
|
}
|
|
37691
38611
|
const { tests, parsed, suiteWorkspacePath } = await loadTestsFromYaml(
|
|
@@ -37720,7 +38640,7 @@ async function loadTests(evalFilePath, repoRoot, options) {
|
|
|
37720
38640
|
return loadTestsFromAgentSkills(evalFilePath);
|
|
37721
38641
|
}
|
|
37722
38642
|
if (format === "typescript") {
|
|
37723
|
-
const { loadTsEvalSuite: loadTsEvalSuite2 } = await import("./ts-eval-loader-
|
|
38643
|
+
const { loadTsEvalSuite: loadTsEvalSuite2 } = await import("./ts-eval-loader-ZVL6CGTE-TZYZX3QS.js");
|
|
37724
38644
|
const suite = await loadTsEvalSuite2(evalFilePath, resolveToAbsolutePath(repoRoot), options);
|
|
37725
38645
|
return suite.tests;
|
|
37726
38646
|
}
|
|
@@ -37731,7 +38651,7 @@ var loadEvalCases = loadTests;
|
|
|
37731
38651
|
async function loadTestsFromYaml(evalFilePath, repoRoot, options) {
|
|
37732
38652
|
const verbose = options?.verbose ?? false;
|
|
37733
38653
|
const filterPattern = options?.filter;
|
|
37734
|
-
const absoluteTestPath =
|
|
38654
|
+
const absoluteTestPath = path46.resolve(evalFilePath);
|
|
37735
38655
|
const repoRootPath = resolveToAbsolutePath(repoRoot);
|
|
37736
38656
|
const searchRoots = buildSearchRoots2(absoluteTestPath, repoRootPath);
|
|
37737
38657
|
const config2 = await loadConfig(absoluteTestPath, repoRootPath);
|
|
@@ -37744,7 +38664,7 @@ async function loadTestsFromYaml(evalFilePath, repoRoot, options) {
|
|
|
37744
38664
|
}
|
|
37745
38665
|
const suite = interpolated;
|
|
37746
38666
|
const suiteNameFromFile = asString5(suite.name)?.trim();
|
|
37747
|
-
const fallbackSuiteName =
|
|
38667
|
+
const fallbackSuiteName = path46.basename(absoluteTestPath).replace(/\.eval\.ya?ml$/i, "").replace(/\.ya?ml$/i, "") || "eval";
|
|
37748
38668
|
const suiteName = suiteNameFromFile && suiteNameFromFile.length > 0 ? suiteNameFromFile : fallbackSuiteName;
|
|
37749
38669
|
const rawTestCases = resolveTests(suite);
|
|
37750
38670
|
const globalEvaluator = coerceEvaluator(suite.evaluator, "global") ?? "llm-grader";
|
|
@@ -37754,13 +38674,13 @@ async function loadTestsFromYaml(evalFilePath, repoRoot, options) {
|
|
|
37754
38674
|
"<suite>",
|
|
37755
38675
|
absoluteTestPath
|
|
37756
38676
|
);
|
|
37757
|
-
const evalFileDir =
|
|
38677
|
+
const evalFileDir = path46.dirname(absoluteTestPath);
|
|
37758
38678
|
let expandedTestCases;
|
|
37759
38679
|
if (typeof rawTestCases === "string") {
|
|
37760
|
-
const externalPath =
|
|
38680
|
+
const externalPath = path46.resolve(evalFileDir, rawTestCases);
|
|
37761
38681
|
let isDir = false;
|
|
37762
38682
|
try {
|
|
37763
|
-
const pathStat = await
|
|
38683
|
+
const pathStat = await stat9(externalPath);
|
|
37764
38684
|
isDir = pathStat.isDirectory();
|
|
37765
38685
|
} catch {
|
|
37766
38686
|
}
|
|
@@ -38061,7 +38981,7 @@ function collectInputSourceReferences(inputMessages) {
|
|
|
38061
38981
|
references.push({
|
|
38062
38982
|
kind: "input_file",
|
|
38063
38983
|
displayPath,
|
|
38064
|
-
...typeof segment.resolvedPath === "string" ? { resolvedPath:
|
|
38984
|
+
...typeof segment.resolvedPath === "string" ? { resolvedPath: path46.resolve(segment.resolvedPath) } : {}
|
|
38065
38985
|
});
|
|
38066
38986
|
}
|
|
38067
38987
|
}
|
|
@@ -38134,7 +39054,7 @@ function collectSingleGraderSourceReferences(evaluator) {
|
|
|
38134
39054
|
references.push({
|
|
38135
39055
|
kind: "code_grader_command",
|
|
38136
39056
|
displayPath: evaluator.aggregator.path,
|
|
38137
|
-
resolvedPath:
|
|
39057
|
+
resolvedPath: path46.resolve(evaluator.aggregator.cwd ?? "", evaluator.aggregator.path),
|
|
38138
39058
|
graderName: evaluator.name
|
|
38139
39059
|
});
|
|
38140
39060
|
} else if (evaluator.aggregator.type === "llm-grader" && evaluator.aggregator.promptPath) {
|
|
@@ -38167,9 +39087,9 @@ function dedupeSourceReferences(references) {
|
|
|
38167
39087
|
return deduped;
|
|
38168
39088
|
}
|
|
38169
39089
|
function toPortableRelativePath(root, candidate) {
|
|
38170
|
-
const relative =
|
|
38171
|
-
if (relative && !relative.startsWith("..") && !
|
|
38172
|
-
return relative.split(
|
|
39090
|
+
const relative = path46.relative(root, candidate);
|
|
39091
|
+
if (relative && !relative.startsWith("..") && !path46.isAbsolute(relative)) {
|
|
39092
|
+
return relative.split(path46.sep).join("/");
|
|
38173
39093
|
}
|
|
38174
39094
|
return void 0;
|
|
38175
39095
|
}
|
|
@@ -38223,8 +39143,8 @@ function parseWorkspaceScriptConfig(raw, evalFileDir) {
|
|
|
38223
39143
|
if (!command) return void 0;
|
|
38224
39144
|
const timeoutMs = typeof obj.timeout_ms === "number" ? obj.timeout_ms : void 0;
|
|
38225
39145
|
let cwd = typeof obj.cwd === "string" ? obj.cwd : void 0;
|
|
38226
|
-
if (cwd && !
|
|
38227
|
-
cwd =
|
|
39146
|
+
if (cwd && !path46.isAbsolute(cwd)) {
|
|
39147
|
+
cwd = path46.resolve(evalFileDir, cwd);
|
|
38228
39148
|
}
|
|
38229
39149
|
const config2 = { command };
|
|
38230
39150
|
if (timeoutMs !== void 0) {
|
|
@@ -38262,7 +39182,7 @@ function parseWorkspaceHooksConfig(raw, evalFileDir) {
|
|
|
38262
39182
|
}
|
|
38263
39183
|
async function resolveWorkspaceConfig(raw, evalFileDir) {
|
|
38264
39184
|
if (typeof raw === "string") {
|
|
38265
|
-
const workspaceFilePath =
|
|
39185
|
+
const workspaceFilePath = path46.resolve(evalFileDir, raw);
|
|
38266
39186
|
let content;
|
|
38267
39187
|
try {
|
|
38268
39188
|
content = await readFile18(workspaceFilePath, "utf8");
|
|
@@ -38275,7 +39195,7 @@ async function resolveWorkspaceConfig(raw, evalFileDir) {
|
|
|
38275
39195
|
`Invalid workspace file format: ${workspaceFilePath} (expected a YAML object)`
|
|
38276
39196
|
);
|
|
38277
39197
|
}
|
|
38278
|
-
const workspaceFileDir =
|
|
39198
|
+
const workspaceFileDir = path46.dirname(workspaceFilePath);
|
|
38279
39199
|
const resolvedWorkspace = parseWorkspaceConfig(parsed, workspaceFileDir);
|
|
38280
39200
|
if (resolvedWorkspace) {
|
|
38281
39201
|
return { ...resolvedWorkspace, workspaceFileDir };
|
|
@@ -38309,8 +39229,8 @@ function parseWorkspaceConfig(raw, evalFileDir) {
|
|
|
38309
39229
|
throw new Error("workspace.static has been removed. Use workspace.mode='static'.");
|
|
38310
39230
|
}
|
|
38311
39231
|
let template = typeof obj.template === "string" ? obj.template : void 0;
|
|
38312
|
-
if (template && !
|
|
38313
|
-
template =
|
|
39232
|
+
if (template && !path46.isAbsolute(template)) {
|
|
39233
|
+
template = path46.resolve(evalFileDir, template);
|
|
38314
39234
|
}
|
|
38315
39235
|
const isolation = obj.isolation === "shared" || obj.isolation === "per_test" ? obj.isolation : void 0;
|
|
38316
39236
|
const repos = Array.isArray(obj.repos) ? obj.repos.map(parseRepoConfig).filter(Boolean) : void 0;
|
|
@@ -38447,8 +39367,6 @@ ${detailBlock}${ANSI_RESET6}`);
|
|
|
38447
39367
|
console.error(`${ANSI_RED3}Error: ${message}${ANSI_RESET6}`);
|
|
38448
39368
|
}
|
|
38449
39369
|
}
|
|
38450
|
-
var execFileAsync2 = promisify6(execFile2);
|
|
38451
|
-
var WORKSPACE_GIT_TIMEOUT_MS = 3e5;
|
|
38452
39370
|
function pathFromRoot(root) {
|
|
38453
39371
|
return root instanceof URL ? fileURLToPath5(root) : String(root);
|
|
38454
39372
|
}
|
|
@@ -38470,53 +39388,84 @@ function buildSkippedEvaluatorError(scores) {
|
|
|
38470
39388
|
function usesFileReferencePrompt(provider) {
|
|
38471
39389
|
return isAgentProvider(provider) || provider.kind === "cli";
|
|
38472
39390
|
}
|
|
38473
|
-
function
|
|
38474
|
-
const
|
|
38475
|
-
|
|
38476
|
-
|
|
39391
|
+
function createEvaluationRuntime(options) {
|
|
39392
|
+
const {
|
|
39393
|
+
target,
|
|
39394
|
+
targets,
|
|
39395
|
+
env,
|
|
39396
|
+
providerFactory,
|
|
39397
|
+
evalFilePath,
|
|
39398
|
+
graderTarget: cliGraderTarget,
|
|
39399
|
+
model: cliModel
|
|
39400
|
+
} = options;
|
|
39401
|
+
const resolvedTargetsByName = /* @__PURE__ */ new Map();
|
|
39402
|
+
resolvedTargetsByName.set(target.name, target);
|
|
39403
|
+
const targetDefinitions = /* @__PURE__ */ new Map();
|
|
39404
|
+
for (const definition of targets ?? []) {
|
|
39405
|
+
targetDefinitions.set(definition.name, definition);
|
|
38477
39406
|
}
|
|
38478
|
-
|
|
38479
|
-
|
|
38480
|
-
|
|
38481
|
-
|
|
38482
|
-
|
|
38483
|
-
|
|
39407
|
+
const envLookup = env ?? process.env;
|
|
39408
|
+
const providerCache = /* @__PURE__ */ new Map();
|
|
39409
|
+
const getOrCreateProvider = (resolved) => {
|
|
39410
|
+
const existing = providerCache.get(resolved.name);
|
|
39411
|
+
if (existing) {
|
|
39412
|
+
return existing;
|
|
39413
|
+
}
|
|
39414
|
+
const factory = providerFactory ?? createProvider;
|
|
39415
|
+
const instance = factory(resolved);
|
|
39416
|
+
providerCache.set(resolved.name, instance);
|
|
39417
|
+
return instance;
|
|
38484
39418
|
};
|
|
38485
|
-
|
|
38486
|
-
|
|
38487
|
-
|
|
38488
|
-
}
|
|
38489
|
-
function hooksEnabled(workspace) {
|
|
38490
|
-
return workspace?.hooks?.enabled !== false;
|
|
38491
|
-
}
|
|
38492
|
-
function workspaceGitEnv() {
|
|
38493
|
-
const env = { ...process.env };
|
|
38494
|
-
for (const key of Object.keys(env)) {
|
|
38495
|
-
if (key.startsWith("GIT_") && key !== "GIT_SSH_COMMAND") {
|
|
38496
|
-
delete env[key];
|
|
39419
|
+
const resolveTargetByName = (name) => {
|
|
39420
|
+
if (resolvedTargetsByName.has(name)) {
|
|
39421
|
+
return resolvedTargetsByName.get(name);
|
|
38497
39422
|
}
|
|
38498
|
-
|
|
38499
|
-
|
|
38500
|
-
|
|
38501
|
-
|
|
38502
|
-
|
|
38503
|
-
|
|
39423
|
+
const definition = resolveDelegatedTargetDefinition(name, targetDefinitions, envLookup);
|
|
39424
|
+
if (!definition) {
|
|
39425
|
+
return void 0;
|
|
39426
|
+
}
|
|
39427
|
+
const resolved = resolveTargetDefinition(definition, envLookup, evalFilePath ?? "");
|
|
39428
|
+
resolvedTargetsByName.set(name, resolved);
|
|
39429
|
+
return resolved;
|
|
38504
39430
|
};
|
|
38505
|
-
|
|
38506
|
-
|
|
38507
|
-
|
|
38508
|
-
|
|
38509
|
-
|
|
38510
|
-
|
|
38511
|
-
|
|
38512
|
-
|
|
38513
|
-
|
|
38514
|
-
|
|
38515
|
-
|
|
39431
|
+
const resolveGraderProvider = async (targetContext) => {
|
|
39432
|
+
if (cliGraderTarget) {
|
|
39433
|
+
if (cliGraderTarget === "agentv") {
|
|
39434
|
+
if (!cliModel) {
|
|
39435
|
+
throw new Error('--grader-target "agentv" requires --model (e.g., "openai:gpt-5-mini")');
|
|
39436
|
+
}
|
|
39437
|
+
const { AgentvProvider: AgentvProvider2 } = await import("./agentv-provider-AYXH7WLW-NJRC6UQX.js");
|
|
39438
|
+
return new AgentvProvider2("agentv", { model: cliModel, temperature: 0 });
|
|
39439
|
+
}
|
|
39440
|
+
const overrideTarget = resolveTargetByName(cliGraderTarget);
|
|
39441
|
+
if (!overrideTarget) {
|
|
39442
|
+
throw new Error(`--grader-target "${cliGraderTarget}" not found in targets`);
|
|
39443
|
+
}
|
|
39444
|
+
return getOrCreateProvider(overrideTarget);
|
|
39445
|
+
}
|
|
39446
|
+
const graderName = targetContext.graderTarget ?? targetContext.name;
|
|
39447
|
+
const resolvedGrader = resolveTargetByName(graderName);
|
|
39448
|
+
if (!resolvedGrader) {
|
|
39449
|
+
if (!LLM_GRADER_CAPABLE_KINDS.includes(targetContext.kind)) {
|
|
39450
|
+
return void 0;
|
|
39451
|
+
}
|
|
39452
|
+
return getOrCreateProvider(targetContext);
|
|
39453
|
+
}
|
|
39454
|
+
return getOrCreateProvider(resolvedGrader);
|
|
39455
|
+
};
|
|
39456
|
+
const targetResolver = (name) => {
|
|
39457
|
+
const resolved = resolveTargetByName(name);
|
|
39458
|
+
if (!resolved) {
|
|
39459
|
+
return void 0;
|
|
39460
|
+
}
|
|
39461
|
+
return getOrCreateProvider(resolved);
|
|
39462
|
+
};
|
|
39463
|
+
return {
|
|
39464
|
+
getOrCreateProvider,
|
|
39465
|
+
resolveGraderProvider,
|
|
39466
|
+
targetResolver,
|
|
39467
|
+
availableTargets: [target.name, ...Array.from(targetDefinitions.keys())]
|
|
38516
39468
|
};
|
|
38517
|
-
await execFileAsync2("git", ["reset", "--hard", baselineRef ?? "HEAD"], opts);
|
|
38518
|
-
await execFileAsync2("git", ["clean", cleanFlag], opts);
|
|
38519
|
-
return true;
|
|
38520
39469
|
}
|
|
38521
39470
|
function validateDependencyGraph(tests) {
|
|
38522
39471
|
const ids = /* @__PURE__ */ new Set();
|
|
@@ -38547,18 +39496,18 @@ function validateDependencyGraph(tests) {
|
|
|
38547
39496
|
}
|
|
38548
39497
|
const visited = /* @__PURE__ */ new Set();
|
|
38549
39498
|
const visiting = /* @__PURE__ */ new Set();
|
|
38550
|
-
function visit(id,
|
|
39499
|
+
function visit(id, path51) {
|
|
38551
39500
|
if (visiting.has(id)) {
|
|
38552
|
-
const cycle = [...
|
|
39501
|
+
const cycle = [...path51.slice(path51.indexOf(id)), id];
|
|
38553
39502
|
throw new Error(`Circular dependency detected: ${cycle.join(" \u2192 ")}`);
|
|
38554
39503
|
}
|
|
38555
39504
|
if (visited.has(id)) return;
|
|
38556
39505
|
visiting.add(id);
|
|
38557
|
-
|
|
39506
|
+
path51.push(id);
|
|
38558
39507
|
for (const dep of depMap.get(id) ?? []) {
|
|
38559
|
-
visit(dep,
|
|
39508
|
+
visit(dep, path51);
|
|
38560
39509
|
}
|
|
38561
|
-
|
|
39510
|
+
path51.pop();
|
|
38562
39511
|
visiting.delete(id);
|
|
38563
39512
|
visited.add(id);
|
|
38564
39513
|
}
|
|
@@ -38612,6 +39561,170 @@ function computeWaves(tests) {
|
|
|
38612
39561
|
}
|
|
38613
39562
|
return waves;
|
|
38614
39563
|
}
|
|
39564
|
+
function createPreparedProvider(target) {
|
|
39565
|
+
return {
|
|
39566
|
+
id: `prepared:${target.name}`,
|
|
39567
|
+
kind: target.kind,
|
|
39568
|
+
targetName: target.name,
|
|
39569
|
+
async invoke() {
|
|
39570
|
+
throw new Error("Prepared grading does not invoke the target provider");
|
|
39571
|
+
}
|
|
39572
|
+
};
|
|
39573
|
+
}
|
|
39574
|
+
function withPreparedMetadata(evalCase, preparedAttempt) {
|
|
39575
|
+
return {
|
|
39576
|
+
...evalCase.metadata,
|
|
39577
|
+
preparedAttempt
|
|
39578
|
+
};
|
|
39579
|
+
}
|
|
39580
|
+
async function gradePreparedEvalCase(options) {
|
|
39581
|
+
const {
|
|
39582
|
+
evalCase,
|
|
39583
|
+
target,
|
|
39584
|
+
targets,
|
|
39585
|
+
env,
|
|
39586
|
+
evaluators,
|
|
39587
|
+
providerFactory,
|
|
39588
|
+
agentTimeoutMs,
|
|
39589
|
+
graderTarget,
|
|
39590
|
+
model,
|
|
39591
|
+
evalFilePath,
|
|
39592
|
+
workspacePath,
|
|
39593
|
+
baselineCommit,
|
|
39594
|
+
response,
|
|
39595
|
+
verbose,
|
|
39596
|
+
threshold: caseThreshold,
|
|
39597
|
+
preparedAttempt
|
|
39598
|
+
} = options;
|
|
39599
|
+
const nowFn = options.now ?? (() => /* @__PURE__ */ new Date());
|
|
39600
|
+
const caseStartMs = Date.now();
|
|
39601
|
+
const provider = createPreparedProvider(target);
|
|
39602
|
+
const formattingMode = usesFileReferencePrompt(provider) ? "agent" : "lm";
|
|
39603
|
+
const promptInputs = await buildPromptInputs(evalCase, formattingMode);
|
|
39604
|
+
const typeRegistry = createBuiltinRegistry();
|
|
39605
|
+
const runtime = createEvaluationRuntime({
|
|
39606
|
+
target,
|
|
39607
|
+
targets,
|
|
39608
|
+
env,
|
|
39609
|
+
providerFactory,
|
|
39610
|
+
evalFilePath,
|
|
39611
|
+
graderTarget,
|
|
39612
|
+
model
|
|
39613
|
+
});
|
|
39614
|
+
const evaluatorRegistry = buildEvaluatorRegistry(evaluators, runtime.resolveGraderProvider);
|
|
39615
|
+
const discoveryBaseDir = evalFilePath ? path47.dirname(path47.resolve(evalFilePath)) : process.cwd();
|
|
39616
|
+
await discoverAssertions(typeRegistry, discoveryBaseDir);
|
|
39617
|
+
await discoverGraders(typeRegistry, discoveryBaseDir);
|
|
39618
|
+
let fileChanges;
|
|
39619
|
+
if (baselineCommit) {
|
|
39620
|
+
try {
|
|
39621
|
+
const diff = await captureFileChanges(workspacePath, baselineCommit);
|
|
39622
|
+
if (diff.length > 0) {
|
|
39623
|
+
fileChanges = diff;
|
|
39624
|
+
}
|
|
39625
|
+
} catch (error40) {
|
|
39626
|
+
if (verbose) {
|
|
39627
|
+
const message = error40 instanceof Error ? error40.message : String(error40);
|
|
39628
|
+
console.warn(`Warning: failed to capture prepared workspace diff: ${message}`);
|
|
39629
|
+
}
|
|
39630
|
+
}
|
|
39631
|
+
}
|
|
39632
|
+
const candidate = response ?? "";
|
|
39633
|
+
const input = buildResultInput(promptInputs);
|
|
39634
|
+
const outputMessages = candidate.length > 0 ? [{ role: "assistant", content: candidate }] : [];
|
|
39635
|
+
const resultTrace = buildTraceFromMessages({
|
|
39636
|
+
input,
|
|
39637
|
+
output: outputMessages,
|
|
39638
|
+
finalOutput: candidate,
|
|
39639
|
+
provider: provider.kind,
|
|
39640
|
+
target: target.name,
|
|
39641
|
+
testId: evalCase.id,
|
|
39642
|
+
conversationId: evalCase.conversation_id
|
|
39643
|
+
});
|
|
39644
|
+
try {
|
|
39645
|
+
const gradeStartedAt = nowFn();
|
|
39646
|
+
const { score, scores } = await runEvaluatorsForCase({
|
|
39647
|
+
evalCase,
|
|
39648
|
+
candidate,
|
|
39649
|
+
target,
|
|
39650
|
+
provider,
|
|
39651
|
+
evaluators: evaluatorRegistry,
|
|
39652
|
+
typeRegistry,
|
|
39653
|
+
attempt: 0,
|
|
39654
|
+
promptInputs,
|
|
39655
|
+
now: gradeStartedAt,
|
|
39656
|
+
agentTimeoutMs,
|
|
39657
|
+
targetResolver: runtime.targetResolver,
|
|
39658
|
+
availableTargets: runtime.availableTargets,
|
|
39659
|
+
fileChanges,
|
|
39660
|
+
workspacePath,
|
|
39661
|
+
dockerConfig: evalCase.workspace?.docker,
|
|
39662
|
+
threshold: evalCase.threshold ?? caseThreshold
|
|
39663
|
+
});
|
|
39664
|
+
const timestamp = nowFn();
|
|
39665
|
+
const effectiveThreshold = evalCase.threshold ?? caseThreshold;
|
|
39666
|
+
const graderTokens = aggregateEvaluatorTokenUsage(scores);
|
|
39667
|
+
const evalRun = {
|
|
39668
|
+
durationMs: Date.now() - caseStartMs,
|
|
39669
|
+
...graderTokens ? { tokenUsage: graderTokens } : {}
|
|
39670
|
+
};
|
|
39671
|
+
const skippedEvaluatorError = buildSkippedEvaluatorError(scores);
|
|
39672
|
+
const executionStatus = skippedEvaluatorError ? "execution_error" : classifyQualityStatus(score.score, effectiveThreshold);
|
|
39673
|
+
const baseResult = {
|
|
39674
|
+
timestamp: timestamp.toISOString(),
|
|
39675
|
+
testId: evalCase.id,
|
|
39676
|
+
suite: evalCase.suite,
|
|
39677
|
+
category: evalCase.category,
|
|
39678
|
+
conversationId: evalCase.conversation_id,
|
|
39679
|
+
score: skippedEvaluatorError ? 0 : score.score,
|
|
39680
|
+
assertions: score.assertions,
|
|
39681
|
+
target: target.name,
|
|
39682
|
+
input,
|
|
39683
|
+
output: candidate,
|
|
39684
|
+
scores,
|
|
39685
|
+
trace: resultTrace,
|
|
39686
|
+
fileChanges,
|
|
39687
|
+
workspacePath,
|
|
39688
|
+
evalRun,
|
|
39689
|
+
metadata: withPreparedMetadata(evalCase, preparedAttempt),
|
|
39690
|
+
executionStatus
|
|
39691
|
+
};
|
|
39692
|
+
if (!skippedEvaluatorError) {
|
|
39693
|
+
return baseResult;
|
|
39694
|
+
}
|
|
39695
|
+
return {
|
|
39696
|
+
...baseResult,
|
|
39697
|
+
trace: appendErrorEventToTrace(baseResult.trace, skippedEvaluatorError, {
|
|
39698
|
+
failure_stage: "evaluator",
|
|
39699
|
+
failure_reason_code: "evaluator_error"
|
|
39700
|
+
}),
|
|
39701
|
+
error: skippedEvaluatorError,
|
|
39702
|
+
failureStage: "evaluator",
|
|
39703
|
+
failureReasonCode: "evaluator_error",
|
|
39704
|
+
executionError: { message: skippedEvaluatorError, stage: "evaluator" }
|
|
39705
|
+
};
|
|
39706
|
+
} catch (error40) {
|
|
39707
|
+
const evalRun = { durationMs: Date.now() - caseStartMs };
|
|
39708
|
+
const errorResult = buildErrorResult(
|
|
39709
|
+
evalCase,
|
|
39710
|
+
target.name,
|
|
39711
|
+
nowFn(),
|
|
39712
|
+
error40,
|
|
39713
|
+
promptInputs,
|
|
39714
|
+
provider,
|
|
39715
|
+
"evaluator",
|
|
39716
|
+
"evaluator_error",
|
|
39717
|
+
verbose
|
|
39718
|
+
);
|
|
39719
|
+
return {
|
|
39720
|
+
...errorResult,
|
|
39721
|
+
evalRun,
|
|
39722
|
+
fileChanges,
|
|
39723
|
+
workspacePath,
|
|
39724
|
+
metadata: withPreparedMetadata(evalCase, preparedAttempt)
|
|
39725
|
+
};
|
|
39726
|
+
}
|
|
39727
|
+
}
|
|
38615
39728
|
async function runEvaluation(options) {
|
|
38616
39729
|
const {
|
|
38617
39730
|
testFilePath: evalFilePath,
|
|
@@ -38667,80 +39780,24 @@ async function runEvaluation(options) {
|
|
|
38667
39780
|
}
|
|
38668
39781
|
return [];
|
|
38669
39782
|
}
|
|
38670
|
-
const
|
|
38671
|
-
|
|
38672
|
-
|
|
38673
|
-
|
|
38674
|
-
|
|
38675
|
-
|
|
38676
|
-
|
|
38677
|
-
|
|
38678
|
-
|
|
38679
|
-
|
|
38680
|
-
if (existing) {
|
|
38681
|
-
return existing;
|
|
38682
|
-
}
|
|
38683
|
-
const factory = providerFactory ?? createProvider;
|
|
38684
|
-
const instance = factory(resolved);
|
|
38685
|
-
providerCache.set(resolved.name, instance);
|
|
38686
|
-
return instance;
|
|
38687
|
-
};
|
|
38688
|
-
const resolveTargetByName = (name) => {
|
|
38689
|
-
if (resolvedTargetsByName.has(name)) {
|
|
38690
|
-
return resolvedTargetsByName.get(name);
|
|
38691
|
-
}
|
|
38692
|
-
const definition = resolveDelegatedTargetDefinition(name, targetDefinitions, envLookup);
|
|
38693
|
-
if (!definition) {
|
|
38694
|
-
return void 0;
|
|
38695
|
-
}
|
|
38696
|
-
const resolved = resolveTargetDefinition(definition, envLookup, evalFilePath);
|
|
38697
|
-
resolvedTargetsByName.set(name, resolved);
|
|
38698
|
-
return resolved;
|
|
38699
|
-
};
|
|
38700
|
-
const resolveGraderProvider = async (targetContext) => {
|
|
38701
|
-
if (cliGraderTarget) {
|
|
38702
|
-
if (cliGraderTarget === "agentv") {
|
|
38703
|
-
if (!cliModel) {
|
|
38704
|
-
throw new Error('--grader-target "agentv" requires --model (e.g., "openai:gpt-5-mini")');
|
|
38705
|
-
}
|
|
38706
|
-
const { AgentvProvider: AgentvProvider2 } = await import("./agentv-provider-AYXH7WLW-NJRC6UQX.js");
|
|
38707
|
-
return new AgentvProvider2("agentv", { model: cliModel, temperature: 0 });
|
|
38708
|
-
}
|
|
38709
|
-
const overrideTarget = resolveTargetByName(cliGraderTarget);
|
|
38710
|
-
if (!overrideTarget) {
|
|
38711
|
-
throw new Error(`--grader-target "${cliGraderTarget}" not found in targets`);
|
|
38712
|
-
}
|
|
38713
|
-
return getOrCreateProvider(overrideTarget);
|
|
38714
|
-
}
|
|
38715
|
-
const graderName = targetContext.graderTarget ?? targetContext.name;
|
|
38716
|
-
const resolvedGrader = resolveTargetByName(graderName);
|
|
38717
|
-
if (!resolvedGrader) {
|
|
38718
|
-
if (!LLM_GRADER_CAPABLE_KINDS.includes(targetContext.kind)) {
|
|
38719
|
-
return void 0;
|
|
38720
|
-
}
|
|
38721
|
-
return getOrCreateProvider(targetContext);
|
|
38722
|
-
}
|
|
38723
|
-
return getOrCreateProvider(resolvedGrader);
|
|
38724
|
-
};
|
|
39783
|
+
const runtime = createEvaluationRuntime({
|
|
39784
|
+
target,
|
|
39785
|
+
targets,
|
|
39786
|
+
env,
|
|
39787
|
+
providerFactory,
|
|
39788
|
+
evalFilePath,
|
|
39789
|
+
graderTarget: cliGraderTarget,
|
|
39790
|
+
model: cliModel
|
|
39791
|
+
});
|
|
39792
|
+
const { getOrCreateProvider, resolveGraderProvider, targetResolver, availableTargets } = runtime;
|
|
38725
39793
|
if (isAgentProvider(getOrCreateProvider(target)) && !target.graderTarget && !cliGraderTarget) {
|
|
38726
39794
|
throw new Error(
|
|
38727
39795
|
`Target "${target.name}" is an agent provider ("${target.kind}") with no grader_target \u2014 agent providers cannot return structured JSON for grading. Set grader_target to an LLM provider (e.g., azure-llm).`
|
|
38728
39796
|
);
|
|
38729
39797
|
}
|
|
38730
|
-
const targetResolver = (name) => {
|
|
38731
|
-
const resolved = resolveTargetByName(name);
|
|
38732
|
-
if (!resolved) {
|
|
38733
|
-
return void 0;
|
|
38734
|
-
}
|
|
38735
|
-
return getOrCreateProvider(resolved);
|
|
38736
|
-
};
|
|
38737
|
-
const availableTargets = [
|
|
38738
|
-
target.name,
|
|
38739
|
-
...Array.from(targetDefinitions.keys())
|
|
38740
|
-
];
|
|
38741
39798
|
const evaluatorRegistry = buildEvaluatorRegistry(evaluators, resolveGraderProvider);
|
|
38742
39799
|
const typeRegistry = createBuiltinRegistry();
|
|
38743
|
-
const discoveryBaseDir = evalFilePath ?
|
|
39800
|
+
const discoveryBaseDir = evalFilePath ? path47.dirname(path47.resolve(evalFilePath)) : process.cwd();
|
|
38744
39801
|
const evalDir = discoveryBaseDir;
|
|
38745
39802
|
await discoverAssertions(typeRegistry, discoveryBaseDir);
|
|
38746
39803
|
await discoverGraders(typeRegistry, discoveryBaseDir);
|
|
@@ -38796,132 +39853,38 @@ async function runEvaluation(options) {
|
|
|
38796
39853
|
}
|
|
38797
39854
|
}
|
|
38798
39855
|
}
|
|
38799
|
-
const suiteWorkspace = filteredEvalCases[0]?.workspace;
|
|
38800
|
-
const rawTemplate = suiteWorkspace?.template;
|
|
38801
|
-
const resolvedTemplate = await resolveWorkspaceTemplate(rawTemplate);
|
|
38802
|
-
const workspaceTemplate = resolvedTemplate?.dir;
|
|
38803
|
-
let suiteWorkspaceFile = resolvedTemplate?.workspaceFile;
|
|
38804
|
-
const setupLog = (message) => {
|
|
38805
|
-
if (verbose) {
|
|
38806
|
-
console.log(`[setup] ${message}`);
|
|
38807
|
-
}
|
|
38808
|
-
};
|
|
38809
|
-
const isPerTestIsolation = suiteWorkspace?.isolation === "per_test";
|
|
38810
|
-
const cliWorkspacePath = workspacePath ?? legacyWorkspacePath;
|
|
38811
|
-
const yamlWorkspacePath = suiteWorkspace?.path;
|
|
38812
|
-
if (cliWorkspacePath && workspaceMode && workspaceMode !== "static") {
|
|
38813
|
-
throw new Error("--workspace-path requires --workspace-mode static when both are provided");
|
|
38814
|
-
}
|
|
38815
|
-
let configuredMode = cliWorkspacePath ? "static" : workspaceMode ?? suiteWorkspace?.mode ?? (yamlWorkspacePath ? "static" : "pooled");
|
|
38816
|
-
const configuredStaticPath = cliWorkspacePath ?? yamlWorkspacePath;
|
|
38817
|
-
if (configuredMode === "static" && !configuredStaticPath) {
|
|
38818
|
-
if (!suiteWorkspace?.repos?.length) {
|
|
38819
|
-
setupLog("workspace.mode=static with no path and no repos \u2014 falling back to temp mode");
|
|
38820
|
-
configuredMode = "temp";
|
|
38821
|
-
} else {
|
|
38822
|
-
throw new Error("workspace.mode=static requires workspace.path or --workspace-path");
|
|
38823
|
-
}
|
|
38824
|
-
}
|
|
38825
|
-
const useStaticWorkspace = configuredMode === "static";
|
|
38826
|
-
if (useStaticWorkspace && isPerTestIsolation) {
|
|
38827
|
-
throw new Error(
|
|
38828
|
-
"static workspace mode is incompatible with isolation: per_test. Use isolation: shared (default)."
|
|
38829
|
-
);
|
|
38830
|
-
}
|
|
38831
|
-
if (configuredMode !== "static" && configuredStaticPath) {
|
|
38832
|
-
throw new Error("workspace.path requires workspace.mode=static");
|
|
38833
|
-
}
|
|
38834
|
-
const hasSharedWorkspace = !!(useStaticWorkspace || !isPerTestIsolation && (workspaceTemplate || suiteWorkspace?.hooks || suiteWorkspace?.repos?.length));
|
|
38835
|
-
const poolEnabled = configuredMode === "pooled";
|
|
38836
|
-
const usePool = poolEnabled !== false && !!suiteWorkspace?.repos?.length && !isPerTestIsolation && !useStaticWorkspace;
|
|
38837
39856
|
const resolvedRetainOnSuccess = retainOnSuccess ?? (keepWorkspaces ? "keep" : "cleanup");
|
|
38838
39857
|
const resolvedRetainOnFailure = retainOnFailure ?? (cleanupWorkspaces ? "cleanup" : "keep");
|
|
38839
39858
|
const workers = options.maxConcurrency ?? target.workers ?? 1;
|
|
38840
|
-
setupLog(
|
|
38841
|
-
`sharedWorkspace=${hasSharedWorkspace} perTestIsolation=${isPerTestIsolation} usePool=${usePool} workers=${workers}`
|
|
38842
|
-
);
|
|
38843
|
-
if (hasSharedWorkspace && !usePool && workers > 1 && filteredEvalCases.length > 1) {
|
|
38844
|
-
console.warn(
|
|
38845
|
-
[
|
|
38846
|
-
`Warning: This eval uses a shared workspace with ${workers} workers.`,
|
|
38847
|
-
"If the agent under test makes file edits, concurrent runs may corrupt each other.",
|
|
38848
|
-
"To limit concurrency, add this to your eval YAML:",
|
|
38849
|
-
"",
|
|
38850
|
-
" execution:",
|
|
38851
|
-
" workers: 1",
|
|
38852
|
-
"",
|
|
38853
|
-
"Or pass --workers 1 on the command line."
|
|
38854
|
-
].join("\n")
|
|
38855
|
-
);
|
|
38856
|
-
}
|
|
38857
39859
|
const limit = pLimit(workers);
|
|
38858
|
-
|
|
38859
|
-
|
|
38860
|
-
|
|
38861
|
-
|
|
38862
|
-
|
|
38863
|
-
|
|
38864
|
-
|
|
38865
|
-
|
|
38866
|
-
|
|
38867
|
-
|
|
38868
|
-
|
|
38869
|
-
|
|
38870
|
-
|
|
38871
|
-
|
|
38872
|
-
|
|
38873
|
-
|
|
38874
|
-
|
|
38875
|
-
|
|
38876
|
-
|
|
38877
|
-
|
|
38878
|
-
|
|
38879
|
-
|
|
38880
|
-
|
|
38881
|
-
|
|
38882
|
-
|
|
38883
|
-
|
|
38884
|
-
|
|
38885
|
-
|
|
38886
|
-
setupLog(`reusing existing static workspace: ${configuredStaticPath}`);
|
|
38887
|
-
}
|
|
38888
|
-
sharedWorkspacePath = configuredStaticPath;
|
|
38889
|
-
} else if (!isPerTestIsolation && usePool && suiteWorkspace?.repos) {
|
|
38890
|
-
const slotsNeeded = workers;
|
|
38891
|
-
setupLog(`acquiring ${slotsNeeded} workspace pool slot(s) (pool capacity: ${poolMaxSlots})`);
|
|
38892
|
-
poolManager = new WorkspacePoolManager(getWorkspacePoolRoot());
|
|
38893
|
-
const poolRepoManager = new RepoManager(verbose);
|
|
38894
|
-
for (let i = 0; i < slotsNeeded; i++) {
|
|
38895
|
-
const slot = await poolManager.acquireWorkspace({
|
|
38896
|
-
templatePath: workspaceTemplate,
|
|
38897
|
-
repos: suiteWorkspace.repos,
|
|
38898
|
-
maxSlots: poolMaxSlots,
|
|
38899
|
-
repoManager: poolRepoManager,
|
|
38900
|
-
poolReset: (workspaceClean === "full" ? "strict" : workspaceClean === "standard" ? "fast" : null) ?? "fast"
|
|
38901
|
-
});
|
|
38902
|
-
poolSlots.push(slot);
|
|
38903
|
-
setupLog(`pool slot ${i} acquired at: ${slot.path} (existing=${slot.isExisting})`);
|
|
38904
|
-
}
|
|
38905
|
-
if (slotsNeeded === 1) {
|
|
38906
|
-
poolSlot = poolSlots[0];
|
|
38907
|
-
sharedWorkspacePath = poolSlot.path;
|
|
38908
|
-
} else {
|
|
38909
|
-
availablePoolSlots.push(...poolSlots);
|
|
38910
|
-
}
|
|
38911
|
-
} else if (!isPerTestIsolation && workspaceTemplate) {
|
|
38912
|
-
setupLog(`creating shared workspace from template: ${workspaceTemplate}`);
|
|
38913
|
-
try {
|
|
38914
|
-
sharedWorkspacePath = await createTempWorkspace(workspaceTemplate, evalRunId, "shared");
|
|
38915
|
-
setupLog(`shared workspace created at: ${sharedWorkspacePath}`);
|
|
38916
|
-
} catch (error40) {
|
|
38917
|
-
const message = error40 instanceof Error ? error40.message : String(error40);
|
|
38918
|
-
throw new Error(`Failed to create shared workspace: ${message}`);
|
|
38919
|
-
}
|
|
38920
|
-
} else if (!isPerTestIsolation && (suiteWorkspace?.hooks || suiteWorkspace?.repos?.length)) {
|
|
38921
|
-
sharedWorkspacePath = getWorkspacePath(evalRunId, "shared");
|
|
38922
|
-
await mkdir17(sharedWorkspacePath, { recursive: true });
|
|
38923
|
-
setupLog(`created empty shared workspace at: ${sharedWorkspacePath}`);
|
|
38924
|
-
}
|
|
39860
|
+
const sharedSetup = await prepareSharedWorkspaceSetup({
|
|
39861
|
+
evalRunId,
|
|
39862
|
+
evalCases: filteredEvalCases,
|
|
39863
|
+
targetHooks: options.targetHooks,
|
|
39864
|
+
evalDir,
|
|
39865
|
+
verbose,
|
|
39866
|
+
workers,
|
|
39867
|
+
poolMaxSlots: configPoolMaxSlots,
|
|
39868
|
+
workspacePath,
|
|
39869
|
+
legacyWorkspacePath,
|
|
39870
|
+
workspaceMode,
|
|
39871
|
+
workspaceClean
|
|
39872
|
+
});
|
|
39873
|
+
const {
|
|
39874
|
+
suiteWorkspace,
|
|
39875
|
+
sharedWorkspacePath,
|
|
39876
|
+
sharedBaselineCommit,
|
|
39877
|
+
suiteWorkspaceFile,
|
|
39878
|
+
beforeAllOutput,
|
|
39879
|
+
repoManager,
|
|
39880
|
+
poolSlot,
|
|
39881
|
+
poolSlots,
|
|
39882
|
+
availablePoolSlots,
|
|
39883
|
+
poolSlotBaselines,
|
|
39884
|
+
useStaticWorkspace
|
|
39885
|
+
} = sharedSetup;
|
|
39886
|
+
const targetHooks = options.targetHooks;
|
|
39887
|
+
const suiteHooksEnabled = hooksEnabled(suiteWorkspace);
|
|
38925
39888
|
try {
|
|
38926
39889
|
let toDependencyResult2 = function(r) {
|
|
38927
39890
|
return {
|
|
@@ -38959,198 +39922,6 @@ async function runEvaluation(options) {
|
|
|
38959
39922
|
return result.costUsd;
|
|
38960
39923
|
};
|
|
38961
39924
|
var toDependencyResult = toDependencyResult2, checkDependencies = checkDependencies2, extractEvaluationCostUsd = extractEvaluationCostUsd2;
|
|
38962
|
-
if (suiteWorkspaceFile && sharedWorkspacePath) {
|
|
38963
|
-
const copiedWorkspaceFile = path46.join(sharedWorkspacePath, path46.basename(suiteWorkspaceFile));
|
|
38964
|
-
try {
|
|
38965
|
-
await stat9(copiedWorkspaceFile);
|
|
38966
|
-
suiteWorkspaceFile = copiedWorkspaceFile;
|
|
38967
|
-
} catch {
|
|
38968
|
-
}
|
|
38969
|
-
}
|
|
38970
|
-
const hasReposToMaterialize = !!suiteWorkspace?.repos?.length && !usePool && !isPerTestIsolation;
|
|
38971
|
-
const needsRepoMaterialisation = hasReposToMaterialize && (!useStaticWorkspace || staticMaterialised);
|
|
38972
|
-
const needsPerRepoCheck = hasReposToMaterialize && useStaticWorkspace && !staticMaterialised && isYamlConfiguredPath;
|
|
38973
|
-
const repoManager = needsRepoMaterialisation || needsPerRepoCheck ? new RepoManager(verbose) : void 0;
|
|
38974
|
-
if (repoManager && sharedWorkspacePath && suiteWorkspace?.repos) {
|
|
38975
|
-
try {
|
|
38976
|
-
if (needsPerRepoCheck) {
|
|
38977
|
-
for (const repo of suiteWorkspace.repos) {
|
|
38978
|
-
if (!repo.path || !repo.repo) continue;
|
|
38979
|
-
const targetDir = path46.join(sharedWorkspacePath, repo.path);
|
|
38980
|
-
if (existsSync6(targetDir)) {
|
|
38981
|
-
setupLog(`reusing existing repo at: ${targetDir}`);
|
|
38982
|
-
continue;
|
|
38983
|
-
}
|
|
38984
|
-
setupLog(`materializing missing repo: ${repo.path}`);
|
|
38985
|
-
await repoManager.materialize(repo, sharedWorkspacePath);
|
|
38986
|
-
}
|
|
38987
|
-
} else {
|
|
38988
|
-
setupLog(
|
|
38989
|
-
`materializing ${suiteWorkspace.repos.length} shared repo(s) into ${sharedWorkspacePath}`
|
|
38990
|
-
);
|
|
38991
|
-
await repoManager.materializeAll(suiteWorkspace.repos, sharedWorkspacePath);
|
|
38992
|
-
}
|
|
38993
|
-
setupLog("shared repo materialization complete");
|
|
38994
|
-
} catch (error40) {
|
|
38995
|
-
const message = error40 instanceof Error ? error40.message : String(error40);
|
|
38996
|
-
if (sharedWorkspacePath && !useStaticWorkspace) {
|
|
38997
|
-
await cleanupWorkspace(sharedWorkspacePath).catch(() => {
|
|
38998
|
-
});
|
|
38999
|
-
}
|
|
39000
|
-
throw new Error(`Failed to materialize repos: ${message}`);
|
|
39001
|
-
}
|
|
39002
|
-
}
|
|
39003
|
-
const suiteDockerConfig = suiteWorkspace?.docker;
|
|
39004
|
-
if (suiteDockerConfig) {
|
|
39005
|
-
setupLog(`pulling Docker image: ${suiteDockerConfig.image}`);
|
|
39006
|
-
const { DockerWorkspaceProvider } = await import("./docker-workspace-RPPXBT27-B4AQHVWA.js");
|
|
39007
|
-
const dockerSetup = new DockerWorkspaceProvider(suiteDockerConfig);
|
|
39008
|
-
if (!await dockerSetup.isDockerAvailable()) {
|
|
39009
|
-
throw new Error(
|
|
39010
|
-
"Docker workspace configured but Docker CLI is not available. Install Docker and ensure it is running."
|
|
39011
|
-
);
|
|
39012
|
-
}
|
|
39013
|
-
await dockerSetup.pullImage();
|
|
39014
|
-
setupLog("Docker image pull complete");
|
|
39015
|
-
}
|
|
39016
|
-
if (suiteWorkspace?.env) {
|
|
39017
|
-
try {
|
|
39018
|
-
await runPreflightChecks(suiteWorkspace.env, sharedWorkspacePath ?? void 0, setupLog);
|
|
39019
|
-
setupLog("preflight checks passed");
|
|
39020
|
-
} catch (error40) {
|
|
39021
|
-
const message = error40 instanceof Error ? error40.message : String(error40);
|
|
39022
|
-
if (sharedWorkspacePath && !useStaticWorkspace) {
|
|
39023
|
-
await cleanupWorkspace(sharedWorkspacePath).catch(() => {
|
|
39024
|
-
});
|
|
39025
|
-
}
|
|
39026
|
-
throw new Error(message);
|
|
39027
|
-
}
|
|
39028
|
-
}
|
|
39029
|
-
const suiteHooksEnabled = hooksEnabled(suiteWorkspace);
|
|
39030
|
-
const suiteBeforeAllHook = suiteWorkspace?.hooks?.before_all;
|
|
39031
|
-
if (sharedWorkspacePath && suiteHooksEnabled && hasHookCommand(suiteBeforeAllHook)) {
|
|
39032
|
-
const beforeAllHook = suiteBeforeAllHook;
|
|
39033
|
-
const beforeAllCommand = (beforeAllHook.command ?? beforeAllHook.script ?? []).join(" ");
|
|
39034
|
-
setupLog(
|
|
39035
|
-
`running shared before_all in cwd=${beforeAllHook.cwd ?? evalDir} command=${beforeAllCommand}`
|
|
39036
|
-
);
|
|
39037
|
-
const scriptContext = {
|
|
39038
|
-
workspacePath: sharedWorkspacePath,
|
|
39039
|
-
testId: "__before_all__",
|
|
39040
|
-
evalRunId,
|
|
39041
|
-
evalDir,
|
|
39042
|
-
workspaceFileDir: suiteWorkspace?.workspaceFileDir
|
|
39043
|
-
};
|
|
39044
|
-
try {
|
|
39045
|
-
beforeAllOutput = await executeWorkspaceScript(
|
|
39046
|
-
toScriptConfig(beforeAllHook, "before_all", "suite workspace"),
|
|
39047
|
-
scriptContext
|
|
39048
|
-
);
|
|
39049
|
-
setupLog("shared before_all completed");
|
|
39050
|
-
} catch (error40) {
|
|
39051
|
-
const message = error40 instanceof Error ? error40.message : String(error40);
|
|
39052
|
-
if (sharedWorkspacePath && !useStaticWorkspace) {
|
|
39053
|
-
await cleanupWorkspace(sharedWorkspacePath).catch(() => {
|
|
39054
|
-
});
|
|
39055
|
-
}
|
|
39056
|
-
throw new Error(`before_all script failed: ${message}`);
|
|
39057
|
-
}
|
|
39058
|
-
}
|
|
39059
|
-
if (availablePoolSlots.length > 0 && suiteHooksEnabled && hasHookCommand(suiteBeforeAllHook)) {
|
|
39060
|
-
const beforeAllHook = suiteBeforeAllHook;
|
|
39061
|
-
for (const slot of availablePoolSlots) {
|
|
39062
|
-
setupLog(`running before_all on pool slot ${slot.index}`);
|
|
39063
|
-
const scriptContext = {
|
|
39064
|
-
workspacePath: slot.path,
|
|
39065
|
-
testId: "__before_all__",
|
|
39066
|
-
evalRunId,
|
|
39067
|
-
evalDir,
|
|
39068
|
-
workspaceFileDir: suiteWorkspace?.workspaceFileDir
|
|
39069
|
-
};
|
|
39070
|
-
try {
|
|
39071
|
-
const output = await executeWorkspaceScript(
|
|
39072
|
-
toScriptConfig(beforeAllHook, "before_all", "suite workspace"),
|
|
39073
|
-
scriptContext
|
|
39074
|
-
);
|
|
39075
|
-
if (!beforeAllOutput) beforeAllOutput = output;
|
|
39076
|
-
setupLog(`before_all completed on pool slot ${slot.index}`);
|
|
39077
|
-
} catch (error40) {
|
|
39078
|
-
const message = error40 instanceof Error ? error40.message : String(error40);
|
|
39079
|
-
throw new Error(`before_all script failed on pool slot ${slot.index}: ${message}`);
|
|
39080
|
-
}
|
|
39081
|
-
}
|
|
39082
|
-
}
|
|
39083
|
-
const targetHooks = options.targetHooks;
|
|
39084
|
-
const targetBeforeAllHook = targetHooks?.before_all;
|
|
39085
|
-
if (sharedWorkspacePath && hasHookCommand(targetBeforeAllHook)) {
|
|
39086
|
-
const beforeAllCommand = (targetBeforeAllHook.command ?? []).join(" ");
|
|
39087
|
-
setupLog(`running target before_all command=${beforeAllCommand}`);
|
|
39088
|
-
const scriptContext = {
|
|
39089
|
-
workspacePath: sharedWorkspacePath,
|
|
39090
|
-
testId: "__target_before_all__",
|
|
39091
|
-
evalRunId,
|
|
39092
|
-
evalDir,
|
|
39093
|
-
workspaceFileDir: suiteWorkspace?.workspaceFileDir
|
|
39094
|
-
};
|
|
39095
|
-
try {
|
|
39096
|
-
await executeWorkspaceScript(
|
|
39097
|
-
toScriptConfig(targetBeforeAllHook, "before_all", "target hooks"),
|
|
39098
|
-
scriptContext
|
|
39099
|
-
);
|
|
39100
|
-
setupLog("target before_all completed");
|
|
39101
|
-
} catch (error40) {
|
|
39102
|
-
const message = error40 instanceof Error ? error40.message : String(error40);
|
|
39103
|
-
if (sharedWorkspacePath && !useStaticWorkspace) {
|
|
39104
|
-
await cleanupWorkspace(sharedWorkspacePath).catch(() => {
|
|
39105
|
-
});
|
|
39106
|
-
}
|
|
39107
|
-
throw new Error(`target before_all hook failed: ${message}`);
|
|
39108
|
-
}
|
|
39109
|
-
}
|
|
39110
|
-
if (availablePoolSlots.length > 0 && hasHookCommand(targetBeforeAllHook)) {
|
|
39111
|
-
for (const slot of availablePoolSlots) {
|
|
39112
|
-
setupLog(`running target before_all on pool slot ${slot.index}`);
|
|
39113
|
-
const scriptContext = {
|
|
39114
|
-
workspacePath: slot.path,
|
|
39115
|
-
testId: "__target_before_all__",
|
|
39116
|
-
evalRunId,
|
|
39117
|
-
evalDir,
|
|
39118
|
-
workspaceFileDir: suiteWorkspace?.workspaceFileDir
|
|
39119
|
-
};
|
|
39120
|
-
try {
|
|
39121
|
-
await executeWorkspaceScript(
|
|
39122
|
-
toScriptConfig(targetBeforeAllHook, "before_all", "target hooks"),
|
|
39123
|
-
scriptContext
|
|
39124
|
-
);
|
|
39125
|
-
} catch (error40) {
|
|
39126
|
-
const message = error40 instanceof Error ? error40.message : String(error40);
|
|
39127
|
-
throw new Error(`target before_all hook failed on pool slot ${slot.index}: ${message}`);
|
|
39128
|
-
}
|
|
39129
|
-
}
|
|
39130
|
-
}
|
|
39131
|
-
if (sharedWorkspacePath) {
|
|
39132
|
-
try {
|
|
39133
|
-
sharedBaselineCommit = await initializeBaseline(sharedWorkspacePath);
|
|
39134
|
-
setupLog(`shared baseline initialized: ${sharedBaselineCommit}`);
|
|
39135
|
-
} catch (error40) {
|
|
39136
|
-
const message = error40 instanceof Error ? error40.message : String(error40);
|
|
39137
|
-
setupLog(`shared baseline initialization failed (file_changes unavailable): ${message}`);
|
|
39138
|
-
}
|
|
39139
|
-
}
|
|
39140
|
-
if (availablePoolSlots.length > 0) {
|
|
39141
|
-
for (const slot of availablePoolSlots) {
|
|
39142
|
-
try {
|
|
39143
|
-
const baseline = await initializeBaseline(slot.path);
|
|
39144
|
-
poolSlotBaselines.set(slot.path, baseline);
|
|
39145
|
-
setupLog(`pool slot ${slot.index} baseline initialized: ${baseline}`);
|
|
39146
|
-
} catch (error40) {
|
|
39147
|
-
const message = error40 instanceof Error ? error40.message : String(error40);
|
|
39148
|
-
setupLog(
|
|
39149
|
-
`pool slot ${slot.index} baseline initialization failed (file_changes unavailable): ${message}`
|
|
39150
|
-
);
|
|
39151
|
-
}
|
|
39152
|
-
}
|
|
39153
|
-
}
|
|
39154
39925
|
let nextWorkerId = 1;
|
|
39155
39926
|
const workerIdByEvalId = /* @__PURE__ */ new Map();
|
|
39156
39927
|
let beforeAllOutputAttached = false;
|
|
@@ -39554,17 +40325,7 @@ async function runEvaluation(options) {
|
|
|
39554
40325
|
}
|
|
39555
40326
|
return results;
|
|
39556
40327
|
} finally {
|
|
39557
|
-
|
|
39558
|
-
if (poolSlot) {
|
|
39559
|
-
await poolManager.releaseSlot(poolSlot);
|
|
39560
|
-
}
|
|
39561
|
-
for (const slot of poolSlots) {
|
|
39562
|
-
if (slot !== poolSlot) {
|
|
39563
|
-
await poolManager.releaseSlot(slot).catch(() => {
|
|
39564
|
-
});
|
|
39565
|
-
}
|
|
39566
|
-
}
|
|
39567
|
-
}
|
|
40328
|
+
await releaseSharedWorkspaceSetup(sharedSetup);
|
|
39568
40329
|
}
|
|
39569
40330
|
}
|
|
39570
40331
|
async function runBatchEvaluation(options) {
|
|
@@ -39812,257 +40573,45 @@ async function runEvalCase(options) {
|
|
|
39812
40573
|
cachedResponse = await cache.get(cacheKey);
|
|
39813
40574
|
}
|
|
39814
40575
|
const nowFn = now ?? (() => /* @__PURE__ */ new Date());
|
|
39815
|
-
let workspacePath = sharedWorkspacePath;
|
|
39816
|
-
let beforeAllOutput;
|
|
39817
|
-
let beforeEachOutput;
|
|
39818
40576
|
let afterEachOutput;
|
|
39819
|
-
const isSharedWorkspace = !!sharedWorkspacePath;
|
|
39820
|
-
let caseWorkspaceFile;
|
|
39821
40577
|
const caseHooksEnabled = hooksEnabled(evalCase.workspace);
|
|
39822
|
-
|
|
39823
|
-
|
|
39824
|
-
|
|
39825
|
-
|
|
39826
|
-
|
|
39827
|
-
|
|
39828
|
-
|
|
39829
|
-
|
|
39830
|
-
|
|
39831
|
-
|
|
39832
|
-
return buildErrorResult(
|
|
39833
|
-
evalCase,
|
|
39834
|
-
target.name,
|
|
39835
|
-
nowFn(),
|
|
39836
|
-
new Error(`Failed to create workspace: ${message}`),
|
|
39837
|
-
promptInputs,
|
|
39838
|
-
provider,
|
|
39839
|
-
"setup",
|
|
39840
|
-
"template_error",
|
|
39841
|
-
verbose
|
|
39842
|
-
);
|
|
39843
|
-
}
|
|
39844
|
-
if (caseWorkspaceFile && workspacePath) {
|
|
39845
|
-
const copiedFile = path46.join(workspacePath, path46.basename(caseWorkspaceFile));
|
|
39846
|
-
try {
|
|
39847
|
-
await stat9(copiedFile);
|
|
39848
|
-
caseWorkspaceFile = copiedFile;
|
|
39849
|
-
} catch {
|
|
39850
|
-
}
|
|
39851
|
-
}
|
|
39852
|
-
}
|
|
39853
|
-
if (!workspacePath && (evalCase.workspace?.hooks || evalCase.workspace?.repos?.length) && evalRunId) {
|
|
39854
|
-
workspacePath = getWorkspacePath(evalRunId, evalCase.id);
|
|
39855
|
-
await mkdir17(workspacePath, { recursive: true });
|
|
39856
|
-
}
|
|
39857
|
-
if (evalCase.workspace?.repos?.length && workspacePath) {
|
|
39858
|
-
const perCaseRepoManager = new RepoManager(setupDebug);
|
|
39859
|
-
try {
|
|
39860
|
-
if (setupDebug) {
|
|
39861
|
-
console.log(
|
|
39862
|
-
`[setup] test=${evalCase.id} materializing ${evalCase.workspace.repos.length} per-test repo(s) into ${workspacePath}`
|
|
39863
|
-
);
|
|
39864
|
-
}
|
|
39865
|
-
await perCaseRepoManager.materializeAll(evalCase.workspace.repos, workspacePath);
|
|
39866
|
-
if (setupDebug) {
|
|
39867
|
-
console.log(`[setup] test=${evalCase.id} per-test repo materialization complete`);
|
|
39868
|
-
}
|
|
39869
|
-
} catch (error40) {
|
|
39870
|
-
const message = error40 instanceof Error ? error40.message : String(error40);
|
|
39871
|
-
return buildErrorResult(
|
|
39872
|
-
evalCase,
|
|
39873
|
-
target.name,
|
|
39874
|
-
nowFn(),
|
|
39875
|
-
new Error(`Failed to materialize repos: ${message}`),
|
|
39876
|
-
promptInputs,
|
|
39877
|
-
provider,
|
|
39878
|
-
"repo_setup",
|
|
39879
|
-
"clone_error",
|
|
39880
|
-
verbose
|
|
39881
|
-
);
|
|
39882
|
-
}
|
|
39883
|
-
}
|
|
39884
|
-
if (workspacePath && evalCase.metadata?.agent_skills_files) {
|
|
39885
|
-
const baseDir = evalCase.metadata.agent_skills_base_dir;
|
|
39886
|
-
const files = evalCase.metadata.agent_skills_files;
|
|
39887
|
-
if (baseDir && files.length > 0) {
|
|
39888
|
-
for (const relPath of files) {
|
|
39889
|
-
const srcPath = path46.resolve(baseDir, relPath);
|
|
39890
|
-
const destPath = path46.resolve(workspacePath, relPath);
|
|
39891
|
-
try {
|
|
39892
|
-
await mkdir17(path46.dirname(destPath), { recursive: true });
|
|
39893
|
-
await copyFile2(srcPath, destPath);
|
|
39894
|
-
} catch (error40) {
|
|
39895
|
-
const message = error40 instanceof Error ? error40.message : String(error40);
|
|
39896
|
-
return buildErrorResult(
|
|
39897
|
-
evalCase,
|
|
39898
|
-
target.name,
|
|
39899
|
-
nowFn(),
|
|
39900
|
-
new Error(
|
|
39901
|
-
`Agent Skills eval file not found: ${relPath} (resolved from ${baseDir}): ${message}`
|
|
39902
|
-
),
|
|
39903
|
-
promptInputs,
|
|
39904
|
-
provider,
|
|
39905
|
-
"setup",
|
|
39906
|
-
"file_copy_error",
|
|
39907
|
-
verbose
|
|
39908
|
-
);
|
|
39909
|
-
}
|
|
39910
|
-
}
|
|
39911
|
-
}
|
|
39912
|
-
}
|
|
39913
|
-
const caseBeforeAllHook = evalCase.workspace?.hooks?.before_all;
|
|
39914
|
-
if (workspacePath && caseHooksEnabled && hasHookCommand(caseBeforeAllHook)) {
|
|
39915
|
-
const beforeAllHook = caseBeforeAllHook;
|
|
39916
|
-
const beforeAllCommand = (beforeAllHook.command ?? beforeAllHook.script ?? []).join(" ");
|
|
39917
|
-
if (setupDebug) {
|
|
39918
|
-
console.log(
|
|
39919
|
-
`[setup] test=${evalCase.id} running before_all in cwd=${beforeAllHook.cwd ?? evalDir} command=${beforeAllCommand}`
|
|
39920
|
-
);
|
|
39921
|
-
}
|
|
39922
|
-
const scriptContext = {
|
|
39923
|
-
workspacePath,
|
|
39924
|
-
testId: evalCase.id,
|
|
39925
|
-
evalRunId: evalRunId ?? "",
|
|
39926
|
-
caseInput: evalCase.question,
|
|
39927
|
-
caseMetadata: evalCase.metadata,
|
|
39928
|
-
evalDir,
|
|
39929
|
-
workspaceFileDir: evalCase.workspace?.workspaceFileDir
|
|
39930
|
-
};
|
|
39931
|
-
try {
|
|
39932
|
-
beforeAllOutput = await executeWorkspaceScript(
|
|
39933
|
-
toScriptConfig(beforeAllHook, "before_all", `test '${evalCase.id}'`),
|
|
39934
|
-
scriptContext
|
|
39935
|
-
);
|
|
39936
|
-
if (setupDebug) {
|
|
39937
|
-
console.log(`[setup] test=${evalCase.id} before_all completed`);
|
|
39938
|
-
}
|
|
39939
|
-
} catch (error40) {
|
|
39940
|
-
const message = error40 instanceof Error ? error40.message : String(error40);
|
|
39941
|
-
if (forceCleanup && workspacePath) {
|
|
39942
|
-
await cleanupWorkspace(workspacePath).catch(() => {
|
|
39943
|
-
});
|
|
39944
|
-
}
|
|
39945
|
-
return buildErrorResult(
|
|
39946
|
-
evalCase,
|
|
39947
|
-
target.name,
|
|
39948
|
-
nowFn(),
|
|
39949
|
-
new Error(`before_all script failed: ${message}`),
|
|
39950
|
-
promptInputs,
|
|
39951
|
-
provider,
|
|
39952
|
-
"setup",
|
|
39953
|
-
"script_error",
|
|
39954
|
-
verbose
|
|
39955
|
-
);
|
|
39956
|
-
}
|
|
39957
|
-
}
|
|
39958
|
-
}
|
|
39959
|
-
let beforeEachNeedsFreshBaseline = false;
|
|
39960
|
-
if (caseHooksEnabled && workspacePath && evalCase.workspace?.hooks?.before_each?.reset && evalCase.workspace.hooks.before_each.reset !== "none") {
|
|
39961
|
-
try {
|
|
39962
|
-
if (repoManager && evalCase.workspace.repos?.length) {
|
|
39963
|
-
await repoManager.reset(
|
|
39964
|
-
evalCase.workspace.repos,
|
|
39965
|
-
workspacePath,
|
|
39966
|
-
evalCase.workspace.hooks.before_each.reset
|
|
39967
|
-
);
|
|
39968
|
-
} else {
|
|
39969
|
-
await resetWorkspaceRoot(
|
|
39970
|
-
workspacePath,
|
|
39971
|
-
evalCase.workspace.hooks.before_each.reset,
|
|
39972
|
-
sharedBaselineCommit
|
|
39973
|
-
);
|
|
39974
|
-
}
|
|
39975
|
-
} catch (error40) {
|
|
39976
|
-
const message = error40 instanceof Error ? error40.message : String(error40);
|
|
39977
|
-
return buildErrorResult(
|
|
39978
|
-
evalCase,
|
|
39979
|
-
target.name,
|
|
39980
|
-
nowFn(),
|
|
39981
|
-
new Error(`before_each reset failed: ${message}`),
|
|
39982
|
-
promptInputs,
|
|
39983
|
-
provider,
|
|
39984
|
-
"setup",
|
|
39985
|
-
"script_error",
|
|
39986
|
-
verbose
|
|
39987
|
-
);
|
|
39988
|
-
}
|
|
39989
|
-
}
|
|
39990
|
-
const caseBeforeEachHook = evalCase.workspace?.hooks?.before_each;
|
|
39991
|
-
if (workspacePath && caseHooksEnabled && hasHookCommand(caseBeforeEachHook)) {
|
|
39992
|
-
const beforeEachHook = caseBeforeEachHook;
|
|
39993
|
-
const scriptContext = {
|
|
39994
|
-
workspacePath,
|
|
39995
|
-
testId: evalCase.id,
|
|
39996
|
-
evalRunId: evalRunId ?? "",
|
|
39997
|
-
caseInput: evalCase.question,
|
|
39998
|
-
caseMetadata: evalCase.metadata,
|
|
39999
|
-
evalDir,
|
|
40000
|
-
workspaceFileDir: evalCase.workspace?.workspaceFileDir
|
|
40001
|
-
};
|
|
40002
|
-
try {
|
|
40003
|
-
beforeEachOutput = await executeWorkspaceScript(
|
|
40004
|
-
toScriptConfig(beforeEachHook, "before_each", `test '${evalCase.id}'`),
|
|
40005
|
-
scriptContext
|
|
40006
|
-
);
|
|
40007
|
-
beforeEachNeedsFreshBaseline = true;
|
|
40008
|
-
} catch (error40) {
|
|
40009
|
-
const message = error40 instanceof Error ? error40.message : String(error40);
|
|
40010
|
-
return buildErrorResult(
|
|
40011
|
-
evalCase,
|
|
40012
|
-
target.name,
|
|
40013
|
-
nowFn(),
|
|
40014
|
-
new Error(`before_each script failed: ${message}`),
|
|
40015
|
-
promptInputs,
|
|
40016
|
-
provider,
|
|
40017
|
-
"setup",
|
|
40018
|
-
"script_error",
|
|
40019
|
-
verbose
|
|
40020
|
-
);
|
|
40021
|
-
}
|
|
40022
|
-
}
|
|
40023
|
-
const targetBeforeEachHook = options.targetHooks?.before_each;
|
|
40024
|
-
if (workspacePath && hasHookCommand(targetBeforeEachHook)) {
|
|
40025
|
-
const scriptContext = {
|
|
40026
|
-
workspacePath,
|
|
40027
|
-
testId: evalCase.id,
|
|
40028
|
-
evalRunId: evalRunId ?? "",
|
|
40029
|
-
caseInput: evalCase.question,
|
|
40030
|
-
caseMetadata: evalCase.metadata,
|
|
40578
|
+
let workspaceSetup;
|
|
40579
|
+
try {
|
|
40580
|
+
workspaceSetup = await prepareEvalCaseWorkspace({
|
|
40581
|
+
evalCase,
|
|
40582
|
+
targetName: target.name,
|
|
40583
|
+
evalRunId,
|
|
40584
|
+
sharedWorkspacePath,
|
|
40585
|
+
sharedBaselineCommit,
|
|
40586
|
+
suiteWorkspaceFile,
|
|
40587
|
+
repoManager,
|
|
40031
40588
|
evalDir,
|
|
40032
|
-
|
|
40033
|
-
|
|
40034
|
-
|
|
40035
|
-
|
|
40036
|
-
|
|
40037
|
-
|
|
40038
|
-
|
|
40039
|
-
|
|
40040
|
-
|
|
40041
|
-
|
|
40042
|
-
|
|
40043
|
-
|
|
40044
|
-
|
|
40045
|
-
|
|
40046
|
-
|
|
40047
|
-
|
|
40048
|
-
|
|
40049
|
-
"setup",
|
|
40050
|
-
"script_error",
|
|
40051
|
-
verbose
|
|
40052
|
-
);
|
|
40053
|
-
}
|
|
40054
|
-
}
|
|
40055
|
-
let baselineCommit = beforeEachNeedsFreshBaseline ? void 0 : sharedBaselineCommit;
|
|
40056
|
-
if (!baselineCommit && workspacePath) {
|
|
40057
|
-
try {
|
|
40058
|
-
baselineCommit = await initializeBaseline(workspacePath);
|
|
40059
|
-
} catch (error40) {
|
|
40060
|
-
const message = error40 instanceof Error ? error40.message : String(error40);
|
|
40061
|
-
if (verbose) {
|
|
40062
|
-
console.warn(`[setup] test=${evalCase.id} baseline initialization failed: ${message}`);
|
|
40063
|
-
}
|
|
40064
|
-
}
|
|
40589
|
+
cleanupWorkspaces: forceCleanup,
|
|
40590
|
+
targetHooks: options.targetHooks,
|
|
40591
|
+
setupDebug
|
|
40592
|
+
});
|
|
40593
|
+
} catch (error40) {
|
|
40594
|
+
const setupError = error40 instanceof WorkspaceSetupError ? error40 : void 0;
|
|
40595
|
+
return buildErrorResult(
|
|
40596
|
+
evalCase,
|
|
40597
|
+
target.name,
|
|
40598
|
+
nowFn(),
|
|
40599
|
+
error40,
|
|
40600
|
+
promptInputs,
|
|
40601
|
+
provider,
|
|
40602
|
+
setupError?.failureStage ?? "setup",
|
|
40603
|
+
setupError?.failureReasonCode ?? "script_error",
|
|
40604
|
+
verbose
|
|
40605
|
+
);
|
|
40065
40606
|
}
|
|
40607
|
+
const {
|
|
40608
|
+
workspacePath,
|
|
40609
|
+
beforeAllOutput,
|
|
40610
|
+
beforeEachOutput,
|
|
40611
|
+
baselineCommit,
|
|
40612
|
+
isSharedWorkspace,
|
|
40613
|
+
caseWorkspaceFile
|
|
40614
|
+
} = workspaceSetup;
|
|
40066
40615
|
if (evalCase.mode === "conversation" && evalCase.turns?.length) {
|
|
40067
40616
|
const conversationResult = await runConversationMode({
|
|
40068
40617
|
evalCase,
|
|
@@ -40769,7 +41318,7 @@ async function runEvaluatorList(options) {
|
|
|
40769
41318
|
dockerConfig,
|
|
40770
41319
|
dependencyResults
|
|
40771
41320
|
};
|
|
40772
|
-
const evalFileDir = evalCase.file_paths[0] ?
|
|
41321
|
+
const evalFileDir = evalCase.file_paths[0] ? path47.dirname(evalCase.file_paths[0]) : process.cwd();
|
|
40773
41322
|
const dispatchContext = {
|
|
40774
41323
|
graderProvider,
|
|
40775
41324
|
targetResolver,
|
|
@@ -41431,38 +41980,6 @@ function computeWeightedMean(entries) {
|
|
|
41431
41980
|
}
|
|
41432
41981
|
return totalWeight > 0 ? weightedSum / totalWeight : 0;
|
|
41433
41982
|
}
|
|
41434
|
-
async function runPreflightChecks(env, cwd, log) {
|
|
41435
|
-
const execFileAsync3 = promisify6(execFile2);
|
|
41436
|
-
const missing = [];
|
|
41437
|
-
for (const cmd of env.required_commands ?? []) {
|
|
41438
|
-
log(`preflight: checking command "${cmd}"`);
|
|
41439
|
-
try {
|
|
41440
|
-
if (process.platform === "win32") {
|
|
41441
|
-
await execFileAsync3("where", [cmd], { cwd });
|
|
41442
|
-
} else {
|
|
41443
|
-
await execFileAsync3("sh", ["-c", `command -v ${cmd}`], { cwd });
|
|
41444
|
-
}
|
|
41445
|
-
} catch {
|
|
41446
|
-
missing.push(`command: ${cmd}`);
|
|
41447
|
-
}
|
|
41448
|
-
}
|
|
41449
|
-
for (const mod of env.required_python_modules ?? []) {
|
|
41450
|
-
log(`preflight: checking Python module "${mod}"`);
|
|
41451
|
-
try {
|
|
41452
|
-
await execFileAsync3("python3", ["-c", `import ${mod}`], { cwd });
|
|
41453
|
-
} catch {
|
|
41454
|
-
missing.push(`python module: ${mod}`);
|
|
41455
|
-
}
|
|
41456
|
-
}
|
|
41457
|
-
if (missing.length > 0) {
|
|
41458
|
-
throw new Error(
|
|
41459
|
-
`Preflight checks failed \u2014 missing dependencies:
|
|
41460
|
-
${missing.map((m) => ` \u2022 ${m}`).join("\n")}
|
|
41461
|
-
|
|
41462
|
-
Install the missing dependencies before running this eval.`
|
|
41463
|
-
);
|
|
41464
|
-
}
|
|
41465
|
-
}
|
|
41466
41983
|
function createFunctionProvider(taskFn) {
|
|
41467
41984
|
return {
|
|
41468
41985
|
id: "function-provider",
|
|
@@ -41837,22 +42354,22 @@ function deduplicateByTestIdTarget(results) {
|
|
|
41837
42354
|
return deduped;
|
|
41838
42355
|
}
|
|
41839
42356
|
async function aggregateRunDir(runDir, options) {
|
|
41840
|
-
const indexPath =
|
|
42357
|
+
const indexPath = path48.join(runDir, RESULT_INDEX_FILENAME);
|
|
41841
42358
|
const content = await readFile20(indexPath, "utf8");
|
|
41842
42359
|
const allResults = parseJsonlResults(content);
|
|
41843
42360
|
const results = deduplicateByTestIdTarget(allResults);
|
|
41844
42361
|
const timing = buildTimingArtifact(results);
|
|
41845
|
-
const timingPath =
|
|
42362
|
+
const timingPath = path48.join(runDir, "timing.json");
|
|
41846
42363
|
await writeFile10(timingPath, `${JSON.stringify(timing, null, 2)}
|
|
41847
42364
|
`, "utf8");
|
|
41848
|
-
const plannedTestCount = options?.plannedTestCount ?? await readPlannedTestCount(
|
|
42365
|
+
const plannedTestCount = options?.plannedTestCount ?? await readPlannedTestCount(path48.join(runDir, "benchmark.json"));
|
|
41849
42366
|
const benchmark = buildBenchmarkArtifact(
|
|
41850
42367
|
results,
|
|
41851
42368
|
options?.evalFile,
|
|
41852
42369
|
options?.experiment,
|
|
41853
42370
|
plannedTestCount
|
|
41854
42371
|
);
|
|
41855
|
-
const benchmarkPath =
|
|
42372
|
+
const benchmarkPath = path48.join(runDir, "benchmark.json");
|
|
41856
42373
|
await writeFile10(benchmarkPath, `${JSON.stringify(benchmark, null, 2)}
|
|
41857
42374
|
`, "utf8");
|
|
41858
42375
|
const targetSet = new Set(results.map((r) => r.target ?? "unknown"));
|
|
@@ -41991,17 +42508,37 @@ function toIndexRerunSource(value) {
|
|
|
41991
42508
|
source_timestamp: value.sourceTimestamp
|
|
41992
42509
|
});
|
|
41993
42510
|
}
|
|
42511
|
+
function toIndexPreparedAttempt(value) {
|
|
42512
|
+
if (!isRecord4(value)) {
|
|
42513
|
+
return void 0;
|
|
42514
|
+
}
|
|
42515
|
+
return dropUndefined5({
|
|
42516
|
+
source: value.source,
|
|
42517
|
+
manifest_path: value.manifestPath,
|
|
42518
|
+
prepared_dir: value.preparedDir,
|
|
42519
|
+
workspace_path: value.workspacePath,
|
|
42520
|
+
prompt_path: value.promptPath,
|
|
42521
|
+
target: value.target,
|
|
42522
|
+
prepared_at: value.preparedAt,
|
|
42523
|
+
setup_status: value.setupStatus,
|
|
42524
|
+
baseline_status: value.baselineStatus,
|
|
42525
|
+
baseline_commit: value.baselineCommit
|
|
42526
|
+
});
|
|
42527
|
+
}
|
|
41994
42528
|
function toIndexMetadata(metadata) {
|
|
41995
42529
|
if (!metadata) {
|
|
41996
42530
|
return void 0;
|
|
41997
42531
|
}
|
|
41998
42532
|
const rerunSource = toIndexRerunSource(metadata.rerunSource);
|
|
41999
|
-
|
|
42533
|
+
const preparedAttempt = toIndexPreparedAttempt(metadata.preparedAttempt);
|
|
42534
|
+
if (!rerunSource && !preparedAttempt) {
|
|
42000
42535
|
return { ...metadata };
|
|
42001
42536
|
}
|
|
42537
|
+
const reservedKeys = /* @__PURE__ */ new Set(["rerunSource", "preparedAttempt"]);
|
|
42002
42538
|
return {
|
|
42003
|
-
...Object.fromEntries(Object.entries(metadata).filter(([key]) => key
|
|
42004
|
-
rerun_source: rerunSource
|
|
42539
|
+
...Object.fromEntries(Object.entries(metadata).filter(([key]) => !reservedKeys.has(key))),
|
|
42540
|
+
...rerunSource ? { rerun_source: rerunSource } : {},
|
|
42541
|
+
...preparedAttempt ? { prepared_attempt: preparedAttempt } : {}
|
|
42005
42542
|
};
|
|
42006
42543
|
}
|
|
42007
42544
|
function buildGradingArtifact(result) {
|
|
@@ -42147,7 +42684,7 @@ async function writeInitialBenchmarkArtifact(runDir, options) {
|
|
|
42147
42684
|
options.experiment,
|
|
42148
42685
|
options.plannedTestCount
|
|
42149
42686
|
);
|
|
42150
|
-
const benchmarkPath =
|
|
42687
|
+
const benchmarkPath = path48.join(runDir, "benchmark.json");
|
|
42151
42688
|
await writeFile10(benchmarkPath, `${JSON.stringify(stub, null, 2)}
|
|
42152
42689
|
`, "utf8");
|
|
42153
42690
|
}
|
|
@@ -42197,7 +42734,7 @@ function buildArtifactSubdir(result) {
|
|
|
42197
42734
|
segments.push(safeArtifactPathSegment(evalSet, "default"));
|
|
42198
42735
|
}
|
|
42199
42736
|
segments.push(safeTestId(result.testId));
|
|
42200
|
-
return
|
|
42737
|
+
return path48.posix.join(...segments);
|
|
42201
42738
|
}
|
|
42202
42739
|
function formatOutputMarkdown(output) {
|
|
42203
42740
|
return output.map((msg) => `@[${msg.role}]:
|
|
@@ -42213,7 +42750,7 @@ function extractInput(result) {
|
|
|
42213
42750
|
return null;
|
|
42214
42751
|
}
|
|
42215
42752
|
function toRelativeArtifactPath(outputDir, filePath) {
|
|
42216
|
-
return
|
|
42753
|
+
return path48.relative(outputDir, filePath).split(path48.sep).join("/");
|
|
42217
42754
|
}
|
|
42218
42755
|
function findResultSourceTest(result, testByTestId) {
|
|
42219
42756
|
return testByTestId.get(result.testId ?? "unknown");
|
|
@@ -42229,7 +42766,7 @@ async function writeTraceEnvelopeSidecar(params) {
|
|
|
42229
42766
|
const hasTranscript = resultHasExecutionTraceTranscript(params.result);
|
|
42230
42767
|
const envelope = buildTraceEnvelopeFromEvaluationResult(params.result, {
|
|
42231
42768
|
evalPath: params.evalPath,
|
|
42232
|
-
runId:
|
|
42769
|
+
runId: path48.basename(params.outputDir),
|
|
42233
42770
|
experiment: params.experiment,
|
|
42234
42771
|
source: { path: RESULT_INDEX_FILENAME },
|
|
42235
42772
|
capture: { content: "full", redactionLevel: "none", redactedFields: [] },
|
|
@@ -42241,7 +42778,7 @@ async function writeTraceEnvelopeSidecar(params) {
|
|
|
42241
42778
|
}
|
|
42242
42779
|
});
|
|
42243
42780
|
await writeFile10(
|
|
42244
|
-
|
|
42781
|
+
path48.join(params.outputsDir, "execution-trace.json"),
|
|
42245
42782
|
`${JSON.stringify(toTraceEnvelopeWire(envelope), null, 2)}
|
|
42246
42783
|
`,
|
|
42247
42784
|
"utf8"
|
|
@@ -42305,13 +42842,13 @@ function buildResultIndexArtifact(result, extraIndexFields) {
|
|
|
42305
42842
|
failure_reason_code: result.failureReasonCode,
|
|
42306
42843
|
workspace_path: result.workspacePath,
|
|
42307
42844
|
artifact_dir: artifactSubdir,
|
|
42308
|
-
grading_path:
|
|
42309
|
-
timing_path:
|
|
42310
|
-
input_path: input ?
|
|
42311
|
-
output_path: hasAnswer ?
|
|
42312
|
-
answer_path: hasAnswer ?
|
|
42313
|
-
transcript_path: hasTranscript ?
|
|
42314
|
-
response_path: hasAnswer ?
|
|
42845
|
+
grading_path: path48.posix.join(artifactSubdir, "grading.json"),
|
|
42846
|
+
timing_path: path48.posix.join(artifactSubdir, "timing.json"),
|
|
42847
|
+
input_path: input ? path48.posix.join(artifactSubdir, "input.md") : void 0,
|
|
42848
|
+
output_path: hasAnswer ? path48.posix.join(artifactSubdir, "outputs", "answer.md") : void 0,
|
|
42849
|
+
answer_path: hasAnswer ? path48.posix.join(artifactSubdir, "outputs", "answer.md") : void 0,
|
|
42850
|
+
transcript_path: hasTranscript ? path48.posix.join(artifactSubdir, "outputs", "transcript.jsonl") : void 0,
|
|
42851
|
+
response_path: hasAnswer ? path48.posix.join(artifactSubdir, "outputs", "response.md") : void 0,
|
|
42315
42852
|
...extraIndexFields,
|
|
42316
42853
|
metadata: toIndexMetadata(result.metadata)
|
|
42317
42854
|
};
|
|
@@ -42351,7 +42888,7 @@ async function rewriteExistingIndexRecords(outputDir, replacements) {
|
|
|
42351
42888
|
if (replacements.length === 0) {
|
|
42352
42889
|
return;
|
|
42353
42890
|
}
|
|
42354
|
-
const indexPath =
|
|
42891
|
+
const indexPath = path48.join(outputDir, RESULT_INDEX_FILENAME);
|
|
42355
42892
|
const content = await readFile20(indexPath, "utf8").catch(() => void 0);
|
|
42356
42893
|
if (content === void 0) {
|
|
42357
42894
|
return;
|
|
@@ -42520,29 +43057,29 @@ async function writePerTestArtifacts(results, outputDir, options) {
|
|
|
42520
43057
|
const grading = buildGradingArtifact(result);
|
|
42521
43058
|
const timing = buildTimingArtifact([result]);
|
|
42522
43059
|
const artifactSubdir = buildArtifactSubdir(result);
|
|
42523
|
-
const testDir =
|
|
43060
|
+
const testDir = path48.join(outputDir, artifactSubdir);
|
|
42524
43061
|
await mkdir18(testDir, { recursive: true });
|
|
42525
43062
|
await writeFile10(
|
|
42526
|
-
|
|
43063
|
+
path48.join(testDir, "grading.json"),
|
|
42527
43064
|
`${JSON.stringify(grading, null, 2)}
|
|
42528
43065
|
`,
|
|
42529
43066
|
"utf8"
|
|
42530
43067
|
);
|
|
42531
43068
|
await writeFile10(
|
|
42532
|
-
|
|
43069
|
+
path48.join(testDir, "timing.json"),
|
|
42533
43070
|
`${JSON.stringify(timing, null, 2)}
|
|
42534
43071
|
`,
|
|
42535
43072
|
"utf8"
|
|
42536
43073
|
);
|
|
42537
43074
|
const input = extractInput(result);
|
|
42538
43075
|
if (input) {
|
|
42539
|
-
await writeFile10(
|
|
43076
|
+
await writeFile10(path48.join(testDir, "input.md"), input, "utf8");
|
|
42540
43077
|
}
|
|
42541
|
-
const outputsDir =
|
|
43078
|
+
const outputsDir = path48.join(testDir, "outputs");
|
|
42542
43079
|
await mkdir18(outputsDir, { recursive: true });
|
|
42543
43080
|
if (result.output.length > 0) {
|
|
42544
|
-
await writeFile10(
|
|
42545
|
-
await writeFile10(
|
|
43081
|
+
await writeFile10(path48.join(outputsDir, "answer.md"), result.output, "utf8");
|
|
43082
|
+
await writeFile10(path48.join(outputsDir, "response.md"), result.output, "utf8");
|
|
42546
43083
|
}
|
|
42547
43084
|
const envelope = await writeTraceEnvelopeSidecar({
|
|
42548
43085
|
result,
|
|
@@ -42552,7 +43089,7 @@ async function writePerTestArtifacts(results, outputDir, options) {
|
|
|
42552
43089
|
experiment: options?.experiment
|
|
42553
43090
|
});
|
|
42554
43091
|
if (hasTranscriptProjection(result, envelope)) {
|
|
42555
|
-
await writeTranscriptJsonl(
|
|
43092
|
+
await writeTranscriptJsonl(path48.join(outputsDir, "transcript.jsonl"), result, envelope);
|
|
42556
43093
|
}
|
|
42557
43094
|
const extraIndexFields = await collectAdditionalIndexFields(
|
|
42558
43095
|
result,
|
|
@@ -42570,9 +43107,9 @@ async function writePerTestArtifacts(results, outputDir, options) {
|
|
|
42570
43107
|
}
|
|
42571
43108
|
async function writeArtifactsFromResults(results, outputDir, options) {
|
|
42572
43109
|
const testArtifactDir = outputDir;
|
|
42573
|
-
const timingPath =
|
|
42574
|
-
const benchmarkPath =
|
|
42575
|
-
const indexPath =
|
|
43110
|
+
const timingPath = path48.join(outputDir, "timing.json");
|
|
43111
|
+
const benchmarkPath = path48.join(outputDir, "benchmark.json");
|
|
43112
|
+
const indexPath = path48.join(outputDir, RESULT_INDEX_FILENAME);
|
|
42576
43113
|
await mkdir18(outputDir, { recursive: true });
|
|
42577
43114
|
const indexRecords = [];
|
|
42578
43115
|
const testByTestId = new Map((options?.sourceTests ?? []).map((test) => [test.id, test]));
|
|
@@ -42580,23 +43117,23 @@ async function writeArtifactsFromResults(results, outputDir, options) {
|
|
|
42580
43117
|
const grading = buildGradingArtifact(result);
|
|
42581
43118
|
const timing2 = buildTimingArtifact([result]);
|
|
42582
43119
|
const artifactSubdir = buildArtifactSubdir(result);
|
|
42583
|
-
const testDir =
|
|
42584
|
-
const gradingPath =
|
|
42585
|
-
const perTestTimingPath =
|
|
43120
|
+
const testDir = path48.join(outputDir, artifactSubdir);
|
|
43121
|
+
const gradingPath = path48.join(testDir, "grading.json");
|
|
43122
|
+
const perTestTimingPath = path48.join(testDir, "timing.json");
|
|
42586
43123
|
await mkdir18(testDir, { recursive: true });
|
|
42587
43124
|
await writeFile10(gradingPath, `${JSON.stringify(grading, null, 2)}
|
|
42588
43125
|
`, "utf8");
|
|
42589
43126
|
await writeFile10(perTestTimingPath, `${JSON.stringify(timing2, null, 2)}
|
|
42590
43127
|
`, "utf8");
|
|
42591
43128
|
const input = extractInput(result);
|
|
42592
|
-
const inputPath = input ?
|
|
43129
|
+
const inputPath = input ? path48.join(testDir, "input.md") : void 0;
|
|
42593
43130
|
if (inputPath && input) {
|
|
42594
43131
|
await writeFile10(inputPath, input, "utf8");
|
|
42595
43132
|
}
|
|
42596
|
-
const outputsDir =
|
|
43133
|
+
const outputsDir = path48.join(testDir, "outputs");
|
|
42597
43134
|
await mkdir18(outputsDir, { recursive: true });
|
|
42598
|
-
const answerPath = result.output.length > 0 ?
|
|
42599
|
-
const responsePath = result.output.length > 0 ?
|
|
43135
|
+
const answerPath = result.output.length > 0 ? path48.join(outputsDir, "answer.md") : void 0;
|
|
43136
|
+
const responsePath = result.output.length > 0 ? path48.join(outputsDir, "response.md") : void 0;
|
|
42600
43137
|
if (answerPath && responsePath) {
|
|
42601
43138
|
await writeFile10(answerPath, result.output, "utf8");
|
|
42602
43139
|
await writeFile10(responsePath, result.output, "utf8");
|
|
@@ -42608,7 +43145,7 @@ async function writeArtifactsFromResults(results, outputDir, options) {
|
|
|
42608
43145
|
evalPath: resolveEnvelopeEvalPath(result, testByTestId, options?.evalFile),
|
|
42609
43146
|
experiment: options?.experiment
|
|
42610
43147
|
});
|
|
42611
|
-
const transcriptPath = hasTranscriptProjection(result, envelope) ?
|
|
43148
|
+
const transcriptPath = hasTranscriptProjection(result, envelope) ? path48.join(outputsDir, "transcript.jsonl") : void 0;
|
|
42612
43149
|
if (transcriptPath) {
|
|
42613
43150
|
await writeTranscriptJsonl(transcriptPath, result, envelope);
|
|
42614
43151
|
}
|
|
@@ -42649,7 +43186,7 @@ async function writeArtifactsFromResults(results, outputDir, options) {
|
|
|
42649
43186
|
`, "utf8");
|
|
42650
43187
|
await writeJsonlFile(indexPath, indexRecords);
|
|
42651
43188
|
await writeFile10(
|
|
42652
|
-
|
|
43189
|
+
path48.join(outputDir, "transcript.jsonl"),
|
|
42653
43190
|
buildTranscriptMessageLines(results),
|
|
42654
43191
|
"utf8"
|
|
42655
43192
|
);
|
|
@@ -42700,7 +43237,7 @@ async function evaluate(config2) {
|
|
|
42700
43237
|
cliNoCache: false,
|
|
42701
43238
|
yamlCache: config2.cache === void 0 ? materialized.cache : void 0
|
|
42702
43239
|
});
|
|
42703
|
-
const cache = cacheEnabled ? new ResponseCache(materialized.cachePath ?
|
|
43240
|
+
const cache = cacheEnabled ? new ResponseCache(materialized.cachePath ? path49.resolve(materialized.cachePath) : void 0) : void 0;
|
|
42704
43241
|
const results = await runEvaluation({
|
|
42705
43242
|
testFilePath,
|
|
42706
43243
|
repoRoot,
|
|
@@ -42723,7 +43260,7 @@ async function evaluate(config2) {
|
|
|
42723
43260
|
});
|
|
42724
43261
|
const allResults = collectedResults.length > 0 ? collectedResults : [...results];
|
|
42725
43262
|
const durationMs = Date.now() - startTime;
|
|
42726
|
-
const outputDir = config2.outputDir ?
|
|
43263
|
+
const outputDir = config2.outputDir ? path49.resolve(config2.outputDir) : void 0;
|
|
42727
43264
|
const artifacts = outputDir ? await writeArtifactsFromResults(allResults, outputDir, {
|
|
42728
43265
|
evalFile: config2.specFile ? testFilePath : "",
|
|
42729
43266
|
experiment: config2.experiment,
|
|
@@ -42743,7 +43280,7 @@ async function evaluate(config2) {
|
|
|
42743
43280
|
async function materializeEvalConfig(config2, options) {
|
|
42744
43281
|
const baseDir = options?.baseDir ?? process.cwd();
|
|
42745
43282
|
const repoRoot = options?.repoRoot ?? await findGitRoot(baseDir) ?? baseDir;
|
|
42746
|
-
const testFilePath = config2.specFile ?
|
|
43283
|
+
const testFilePath = config2.specFile ? path49.resolve(baseDir, config2.specFile) : path49.join(baseDir, "__programmatic__.yaml");
|
|
42747
43284
|
const effectiveFilter = options?.filter ?? config2.filter;
|
|
42748
43285
|
if (config2.specFile) {
|
|
42749
43286
|
const suite = await loadTestSuite(testFilePath, repoRoot, {
|
|
@@ -42820,7 +43357,7 @@ function convertAssertions(entries) {
|
|
|
42820
43357
|
}
|
|
42821
43358
|
function buildInlineEvalTests(config2, options) {
|
|
42822
43359
|
const suiteWorkspace = config2.beforeAll ? { hooks: { before_all: toBeforeAllHook(config2.beforeAll) } } : void 0;
|
|
42823
|
-
const derivedSuiteName =
|
|
43360
|
+
const derivedSuiteName = path49.basename(options.testFilePath).replace(/\.eval\.[cm]?ts$/i, "").replace(/\.[cm]?ts$/i, "");
|
|
42824
43361
|
const suiteName = config2.metadata?.name ?? (derivedSuiteName || "eval");
|
|
42825
43362
|
return (config2.tests ?? []).filter((test) => !options.filter || matchesFilter4(test.id, options.filter)).map((test) => {
|
|
42826
43363
|
const isConversation = test.mode === "conversation" || test.turns && test.turns.length > 0;
|
|
@@ -42916,10 +43453,10 @@ function computeSummary(results, durationMs, threshold = DEFAULT_THRESHOLD) {
|
|
|
42916
43453
|
var TARGET_FILE_CANDIDATES = [".agentv/targets.yaml", ".agentv/targets.yml"];
|
|
42917
43454
|
async function discoverDefaultTarget(repoRoot) {
|
|
42918
43455
|
const cwd = process.cwd();
|
|
42919
|
-
const chain = buildDirectoryChain(
|
|
43456
|
+
const chain = buildDirectoryChain(path49.join(cwd, "_placeholder"), repoRoot);
|
|
42920
43457
|
for (const dir of chain) {
|
|
42921
43458
|
for (const candidate of TARGET_FILE_CANDIDATES) {
|
|
42922
|
-
const targetsPath =
|
|
43459
|
+
const targetsPath = path49.join(dir, candidate);
|
|
42923
43460
|
if (!existsSync7(targetsPath)) continue;
|
|
42924
43461
|
try {
|
|
42925
43462
|
const definitions = await readTargetDefinitions(targetsPath);
|
|
@@ -42936,7 +43473,7 @@ async function loadEnvHierarchy(repoRoot, startPath) {
|
|
|
42936
43473
|
const chain = buildDirectoryChain(startPath, repoRoot);
|
|
42937
43474
|
const envFiles = [];
|
|
42938
43475
|
for (const dir of chain) {
|
|
42939
|
-
const envPath =
|
|
43476
|
+
const envPath = path49.join(dir, ".env");
|
|
42940
43477
|
if (existsSync7(envPath)) envFiles.push(envPath);
|
|
42941
43478
|
}
|
|
42942
43479
|
for (let i = 0; i < envFiles.length; i++) {
|
|
@@ -42962,7 +43499,7 @@ async function loadEnvHierarchy(repoRoot, startPath) {
|
|
|
42962
43499
|
}
|
|
42963
43500
|
var EXPORT_NAMES = ["default", "config", "evalConfig"];
|
|
42964
43501
|
async function loadTsEvalFile(filePath) {
|
|
42965
|
-
const absolutePath =
|
|
43502
|
+
const absolutePath = path50.resolve(filePath);
|
|
42966
43503
|
const moduleUrl = pathToFileURL2(absolutePath).href;
|
|
42967
43504
|
const module = await import(moduleUrl);
|
|
42968
43505
|
let config2;
|
|
@@ -42984,7 +43521,7 @@ async function loadTsEvalSuite(filePath, repoRoot, options) {
|
|
|
42984
43521
|
const { config: config2, filePath: absolutePath } = await loadTsEvalFile(filePath);
|
|
42985
43522
|
const materialized = await materializeEvalConfig(config2, {
|
|
42986
43523
|
repoRoot,
|
|
42987
|
-
baseDir:
|
|
43524
|
+
baseDir: path50.dirname(absolutePath),
|
|
42988
43525
|
filter: options?.filter,
|
|
42989
43526
|
category: options?.category
|
|
42990
43527
|
});
|
|
@@ -43046,6 +43583,7 @@ export {
|
|
|
43046
43583
|
buildDirectoryChain,
|
|
43047
43584
|
buildSearchRoots,
|
|
43048
43585
|
resolveFileReference,
|
|
43586
|
+
AGENT_PROVIDER_KINDS,
|
|
43049
43587
|
KNOWN_PROVIDERS,
|
|
43050
43588
|
PROVIDER_ALIASES,
|
|
43051
43589
|
extractLastAssistantContent,
|
|
@@ -43230,6 +43768,7 @@ export {
|
|
|
43230
43768
|
createTempWorkspace,
|
|
43231
43769
|
cleanupWorkspace,
|
|
43232
43770
|
cleanupEvalWorkspaces,
|
|
43771
|
+
executeWorkspaceScript,
|
|
43233
43772
|
resolveRepoCloneUrl,
|
|
43234
43773
|
normalizeRepoIdentity,
|
|
43235
43774
|
computeWorkspaceFingerprint,
|
|
@@ -43246,7 +43785,9 @@ export {
|
|
|
43246
43785
|
discoverProjects,
|
|
43247
43786
|
RepoManager,
|
|
43248
43787
|
resolveWorkspaceTemplate,
|
|
43249
|
-
|
|
43788
|
+
releaseSharedWorkspaceSetup,
|
|
43789
|
+
prepareSharedWorkspaceSetup,
|
|
43790
|
+
prepareEvalCaseWorkspace,
|
|
43250
43791
|
isAgentSkillsFormat,
|
|
43251
43792
|
parseAgentSkillsEvals,
|
|
43252
43793
|
DEFAULT_EVAL_PATTERNS,
|
|
@@ -43271,6 +43812,7 @@ export {
|
|
|
43271
43812
|
loadEvalCases,
|
|
43272
43813
|
loadTestById,
|
|
43273
43814
|
loadEvalCaseById,
|
|
43815
|
+
gradePreparedEvalCase,
|
|
43274
43816
|
runEvaluation,
|
|
43275
43817
|
runEvalCase,
|
|
43276
43818
|
toTranscriptJsonLines,
|
|
@@ -43300,4 +43842,4 @@ export {
|
|
|
43300
43842
|
loadTsEvalFile,
|
|
43301
43843
|
loadTsEvalSuite
|
|
43302
43844
|
};
|
|
43303
|
-
//# sourceMappingURL=chunk-
|
|
43845
|
+
//# sourceMappingURL=chunk-ENHX2CCS.js.map
|