@agentv/core 2.9.0-next.2 → 2.11.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/dist/{chunk-7Q4PH265.js → chunk-REN5PS7B.js} +15 -8
- package/dist/chunk-REN5PS7B.js.map +1 -0
- package/dist/evaluation/validation/index.cjs +106 -8
- package/dist/evaluation/validation/index.cjs.map +1 -1
- package/dist/evaluation/validation/index.js +96 -5
- package/dist/evaluation/validation/index.js.map +1 -1
- package/dist/index.cjs +745 -170
- package/dist/index.cjs.map +1 -1
- package/dist/index.d.cts +173 -9
- package/dist/index.d.ts +173 -9
- package/dist/index.js +710 -150
- package/dist/index.js.map +1 -1
- package/package.json +1 -1
- package/dist/chunk-7Q4PH265.js.map +0 -1
package/dist/index.js
CHANGED
|
@@ -17,7 +17,7 @@ import {
|
|
|
17
17
|
readTextFile,
|
|
18
18
|
resolveFileReference,
|
|
19
19
|
resolveTargetDefinition
|
|
20
|
-
} from "./chunk-
|
|
20
|
+
} from "./chunk-REN5PS7B.js";
|
|
21
21
|
import {
|
|
22
22
|
OtlpJsonFileExporter
|
|
23
23
|
} from "./chunk-HFSYZHGF.js";
|
|
@@ -1285,18 +1285,96 @@ async function parseEvaluatorList(candidateEvaluators, searchRoots, evalId) {
|
|
|
1285
1285
|
});
|
|
1286
1286
|
continue;
|
|
1287
1287
|
}
|
|
1288
|
+
if (typeValue === "contains_any" || typeValue === "contains_all") {
|
|
1289
|
+
const value = asStringArrayStrict(rawEvaluator.value);
|
|
1290
|
+
if (!value || value.length === 0) {
|
|
1291
|
+
logWarning2(
|
|
1292
|
+
`Skipping ${typeValue} evaluator '${name}' in '${evalId}': value must be a non-empty string array`
|
|
1293
|
+
);
|
|
1294
|
+
continue;
|
|
1295
|
+
}
|
|
1296
|
+
const weight2 = validateWeight(rawEvaluator.weight, name, evalId);
|
|
1297
|
+
const required2 = parseRequired(rawEvaluator.required);
|
|
1298
|
+
evaluators.push({
|
|
1299
|
+
name,
|
|
1300
|
+
type: typeValue,
|
|
1301
|
+
value,
|
|
1302
|
+
...weight2 !== void 0 ? { weight: weight2 } : {},
|
|
1303
|
+
...required2 !== void 0 ? { required: required2 } : {},
|
|
1304
|
+
...negate !== void 0 ? { negate } : {}
|
|
1305
|
+
});
|
|
1306
|
+
continue;
|
|
1307
|
+
}
|
|
1308
|
+
if (typeValue === "icontains") {
|
|
1309
|
+
const value = asString(rawEvaluator.value);
|
|
1310
|
+
if (!value) {
|
|
1311
|
+
logWarning2(`Skipping icontains evaluator '${name}' in '${evalId}': missing value`);
|
|
1312
|
+
continue;
|
|
1313
|
+
}
|
|
1314
|
+
const weight2 = validateWeight(rawEvaluator.weight, name, evalId);
|
|
1315
|
+
const required2 = parseRequired(rawEvaluator.required);
|
|
1316
|
+
evaluators.push({
|
|
1317
|
+
name,
|
|
1318
|
+
type: "icontains",
|
|
1319
|
+
value,
|
|
1320
|
+
...weight2 !== void 0 ? { weight: weight2 } : {},
|
|
1321
|
+
...required2 !== void 0 ? { required: required2 } : {},
|
|
1322
|
+
...negate !== void 0 ? { negate } : {}
|
|
1323
|
+
});
|
|
1324
|
+
continue;
|
|
1325
|
+
}
|
|
1326
|
+
if (typeValue === "icontains_any" || typeValue === "icontains_all") {
|
|
1327
|
+
const value = asStringArrayStrict(rawEvaluator.value);
|
|
1328
|
+
if (!value || value.length === 0) {
|
|
1329
|
+
logWarning2(
|
|
1330
|
+
`Skipping ${typeValue} evaluator '${name}' in '${evalId}': value must be a non-empty string array`
|
|
1331
|
+
);
|
|
1332
|
+
continue;
|
|
1333
|
+
}
|
|
1334
|
+
const weight2 = validateWeight(rawEvaluator.weight, name, evalId);
|
|
1335
|
+
const required2 = parseRequired(rawEvaluator.required);
|
|
1336
|
+
evaluators.push({
|
|
1337
|
+
name,
|
|
1338
|
+
type: typeValue,
|
|
1339
|
+
value,
|
|
1340
|
+
...weight2 !== void 0 ? { weight: weight2 } : {},
|
|
1341
|
+
...required2 !== void 0 ? { required: required2 } : {},
|
|
1342
|
+
...negate !== void 0 ? { negate } : {}
|
|
1343
|
+
});
|
|
1344
|
+
continue;
|
|
1345
|
+
}
|
|
1346
|
+
if (typeValue === "starts_with" || typeValue === "ends_with") {
|
|
1347
|
+
const value = asString(rawEvaluator.value);
|
|
1348
|
+
if (!value) {
|
|
1349
|
+
logWarning2(`Skipping ${typeValue} evaluator '${name}' in '${evalId}': missing value`);
|
|
1350
|
+
continue;
|
|
1351
|
+
}
|
|
1352
|
+
const weight2 = validateWeight(rawEvaluator.weight, name, evalId);
|
|
1353
|
+
const required2 = parseRequired(rawEvaluator.required);
|
|
1354
|
+
evaluators.push({
|
|
1355
|
+
name,
|
|
1356
|
+
type: typeValue,
|
|
1357
|
+
value,
|
|
1358
|
+
...weight2 !== void 0 ? { weight: weight2 } : {},
|
|
1359
|
+
...required2 !== void 0 ? { required: required2 } : {},
|
|
1360
|
+
...negate !== void 0 ? { negate } : {}
|
|
1361
|
+
});
|
|
1362
|
+
continue;
|
|
1363
|
+
}
|
|
1288
1364
|
if (typeValue === "regex") {
|
|
1289
1365
|
const value = asString(rawEvaluator.value);
|
|
1290
1366
|
if (!value) {
|
|
1291
1367
|
logWarning2(`Skipping regex evaluator '${name}' in '${evalId}': missing value`);
|
|
1292
1368
|
continue;
|
|
1293
1369
|
}
|
|
1370
|
+
const flags = asString(rawEvaluator.flags);
|
|
1294
1371
|
const weight2 = validateWeight(rawEvaluator.weight, name, evalId);
|
|
1295
1372
|
const required2 = parseRequired(rawEvaluator.required);
|
|
1296
1373
|
evaluators.push({
|
|
1297
1374
|
name,
|
|
1298
1375
|
type: "regex",
|
|
1299
1376
|
value,
|
|
1377
|
+
...flags !== void 0 ? { flags } : {},
|
|
1300
1378
|
...weight2 !== void 0 ? { weight: weight2 } : {},
|
|
1301
1379
|
...required2 !== void 0 ? { required: required2 } : {},
|
|
1302
1380
|
...negate !== void 0 ? { negate } : {}
|
|
@@ -1469,15 +1547,43 @@ async function parseEvaluatorList(candidateEvaluators, searchRoots, evalId) {
|
|
|
1469
1547
|
}
|
|
1470
1548
|
return evaluators.length > 0 ? evaluators : void 0;
|
|
1471
1549
|
}
|
|
1472
|
-
var ASSERTION_TYPES = /* @__PURE__ */ new Set([
|
|
1550
|
+
var ASSERTION_TYPES = /* @__PURE__ */ new Set([
|
|
1551
|
+
"contains",
|
|
1552
|
+
"contains_any",
|
|
1553
|
+
"contains_all",
|
|
1554
|
+
"icontains",
|
|
1555
|
+
"icontains_any",
|
|
1556
|
+
"icontains_all",
|
|
1557
|
+
"starts_with",
|
|
1558
|
+
"ends_with",
|
|
1559
|
+
"regex",
|
|
1560
|
+
"is_json",
|
|
1561
|
+
"equals",
|
|
1562
|
+
"rubrics"
|
|
1563
|
+
]);
|
|
1473
1564
|
function generateAssertionName(typeValue, rawEvaluator) {
|
|
1474
1565
|
if (!ASSERTION_TYPES.has(typeValue)) {
|
|
1475
1566
|
return void 0;
|
|
1476
1567
|
}
|
|
1477
1568
|
const value = asString(rawEvaluator.value);
|
|
1569
|
+
const arrayValue = Array.isArray(rawEvaluator.value) ? rawEvaluator.value : void 0;
|
|
1478
1570
|
switch (typeValue) {
|
|
1479
1571
|
case "contains":
|
|
1480
1572
|
return value ? `contains-${value}` : "contains";
|
|
1573
|
+
case "contains_any":
|
|
1574
|
+
return arrayValue ? `contains_any-${arrayValue.length}` : "contains_any";
|
|
1575
|
+
case "contains_all":
|
|
1576
|
+
return arrayValue ? `contains_all-${arrayValue.length}` : "contains_all";
|
|
1577
|
+
case "icontains":
|
|
1578
|
+
return value ? `icontains-${value}` : "icontains";
|
|
1579
|
+
case "icontains_any":
|
|
1580
|
+
return arrayValue ? `icontains_any-${arrayValue.length}` : "icontains_any";
|
|
1581
|
+
case "icontains_all":
|
|
1582
|
+
return arrayValue ? `icontains_all-${arrayValue.length}` : "icontains_all";
|
|
1583
|
+
case "starts_with":
|
|
1584
|
+
return value ? `starts_with-${value}` : "starts_with";
|
|
1585
|
+
case "ends_with":
|
|
1586
|
+
return value ? `ends_with-${value}` : "ends_with";
|
|
1481
1587
|
case "regex":
|
|
1482
1588
|
return value ? `regex-${value.length > 30 ? value.slice(0, 30) : value}` : "regex";
|
|
1483
1589
|
case "is_json":
|
|
@@ -1503,6 +1609,13 @@ function coerceEvaluator(candidate, contextId) {
|
|
|
1503
1609
|
function asString(value) {
|
|
1504
1610
|
return typeof value === "string" ? value : void 0;
|
|
1505
1611
|
}
|
|
1612
|
+
function asStringArrayStrict(value) {
|
|
1613
|
+
if (!Array.isArray(value)) {
|
|
1614
|
+
return void 0;
|
|
1615
|
+
}
|
|
1616
|
+
const result = value.filter((v) => typeof v === "string");
|
|
1617
|
+
return result.length > 0 ? result : void 0;
|
|
1618
|
+
}
|
|
1506
1619
|
function asStringArray(value, description) {
|
|
1507
1620
|
if (value === void 0) {
|
|
1508
1621
|
return void 0;
|
|
@@ -2820,6 +2933,69 @@ function parseWorkspaceScriptConfig(raw, evalFileDir) {
|
|
|
2820
2933
|
}
|
|
2821
2934
|
return cwd ? { ...config, cwd } : config;
|
|
2822
2935
|
}
|
|
2936
|
+
function parseRepoSource(raw) {
|
|
2937
|
+
if (!isJsonObject(raw)) return void 0;
|
|
2938
|
+
const obj = raw;
|
|
2939
|
+
if (obj.type === "git" && typeof obj.url === "string") {
|
|
2940
|
+
return { type: "git", url: obj.url };
|
|
2941
|
+
}
|
|
2942
|
+
if (obj.type === "local" && typeof obj.path === "string") {
|
|
2943
|
+
return { type: "local", path: obj.path };
|
|
2944
|
+
}
|
|
2945
|
+
return void 0;
|
|
2946
|
+
}
|
|
2947
|
+
function parseRepoCheckout(raw) {
|
|
2948
|
+
if (!isJsonObject(raw)) return void 0;
|
|
2949
|
+
const obj = raw;
|
|
2950
|
+
const ref = typeof obj.ref === "string" ? obj.ref : void 0;
|
|
2951
|
+
const resolve = obj.resolve === "remote" || obj.resolve === "local" ? obj.resolve : void 0;
|
|
2952
|
+
const ancestor = typeof obj.ancestor === "number" ? obj.ancestor : void 0;
|
|
2953
|
+
if (!ref && !resolve && ancestor === void 0) return void 0;
|
|
2954
|
+
return {
|
|
2955
|
+
...ref !== void 0 && { ref },
|
|
2956
|
+
...resolve !== void 0 && { resolve },
|
|
2957
|
+
...ancestor !== void 0 && { ancestor }
|
|
2958
|
+
};
|
|
2959
|
+
}
|
|
2960
|
+
function parseRepoClone(raw) {
|
|
2961
|
+
if (!isJsonObject(raw)) return void 0;
|
|
2962
|
+
const obj = raw;
|
|
2963
|
+
const depth = typeof obj.depth === "number" ? obj.depth : void 0;
|
|
2964
|
+
const filter = typeof obj.filter === "string" ? obj.filter : void 0;
|
|
2965
|
+
const sparse = Array.isArray(obj.sparse) ? obj.sparse.filter((s) => typeof s === "string") : void 0;
|
|
2966
|
+
if (depth === void 0 && !filter && !sparse) return void 0;
|
|
2967
|
+
return {
|
|
2968
|
+
...depth !== void 0 && { depth },
|
|
2969
|
+
...filter !== void 0 && { filter },
|
|
2970
|
+
...sparse !== void 0 && { sparse }
|
|
2971
|
+
};
|
|
2972
|
+
}
|
|
2973
|
+
function parseRepoConfig(raw) {
|
|
2974
|
+
if (!isJsonObject(raw)) return void 0;
|
|
2975
|
+
const obj = raw;
|
|
2976
|
+
const repoPath = typeof obj.path === "string" ? obj.path : void 0;
|
|
2977
|
+
const source = parseRepoSource(obj.source);
|
|
2978
|
+
if (!repoPath || !source) return void 0;
|
|
2979
|
+
const checkout = parseRepoCheckout(obj.checkout);
|
|
2980
|
+
const clone = parseRepoClone(obj.clone);
|
|
2981
|
+
return {
|
|
2982
|
+
path: repoPath,
|
|
2983
|
+
source,
|
|
2984
|
+
...checkout !== void 0 && { checkout },
|
|
2985
|
+
...clone !== void 0 && { clone }
|
|
2986
|
+
};
|
|
2987
|
+
}
|
|
2988
|
+
function parseResetConfig(raw) {
|
|
2989
|
+
if (!isJsonObject(raw)) return void 0;
|
|
2990
|
+
const obj = raw;
|
|
2991
|
+
const strategy = obj.strategy === "none" || obj.strategy === "hard" || obj.strategy === "recreate" ? obj.strategy : void 0;
|
|
2992
|
+
const afterEach = typeof obj.after_each === "boolean" ? obj.after_each : void 0;
|
|
2993
|
+
if (!strategy && afterEach === void 0) return void 0;
|
|
2994
|
+
return {
|
|
2995
|
+
...strategy !== void 0 && { strategy },
|
|
2996
|
+
...afterEach !== void 0 && { after_each: afterEach }
|
|
2997
|
+
};
|
|
2998
|
+
}
|
|
2823
2999
|
function parseWorkspaceConfig(raw, evalFileDir) {
|
|
2824
3000
|
if (!isJsonObject(raw)) return void 0;
|
|
2825
3001
|
const obj = raw;
|
|
@@ -2827,13 +3003,20 @@ function parseWorkspaceConfig(raw, evalFileDir) {
|
|
|
2827
3003
|
if (template && !path8.isAbsolute(template)) {
|
|
2828
3004
|
template = path8.resolve(evalFileDir, template);
|
|
2829
3005
|
}
|
|
3006
|
+
const isolation = obj.isolation === "shared" || obj.isolation === "per_test" ? obj.isolation : void 0;
|
|
3007
|
+
const repos = Array.isArray(obj.repos) ? obj.repos.map(parseRepoConfig).filter(Boolean) : void 0;
|
|
3008
|
+
const reset = parseResetConfig(obj.reset);
|
|
2830
3009
|
const beforeAll = parseWorkspaceScriptConfig(obj.before_all, evalFileDir);
|
|
2831
3010
|
const afterAll = parseWorkspaceScriptConfig(obj.after_all, evalFileDir);
|
|
2832
3011
|
const beforeEach = parseWorkspaceScriptConfig(obj.before_each, evalFileDir);
|
|
2833
3012
|
const afterEach = parseWorkspaceScriptConfig(obj.after_each, evalFileDir);
|
|
2834
|
-
if (!template && !beforeAll && !afterAll && !beforeEach && !afterEach)
|
|
3013
|
+
if (!template && !isolation && !repos && !reset && !beforeAll && !afterAll && !beforeEach && !afterEach)
|
|
3014
|
+
return void 0;
|
|
2835
3015
|
return {
|
|
2836
3016
|
...template !== void 0 && { template },
|
|
3017
|
+
...isolation !== void 0 && { isolation },
|
|
3018
|
+
...repos !== void 0 && { repos },
|
|
3019
|
+
...reset !== void 0 && { reset },
|
|
2837
3020
|
...beforeAll !== void 0 && { before_all: beforeAll },
|
|
2838
3021
|
...afterAll !== void 0 && { after_all: afterAll },
|
|
2839
3022
|
...beforeEach !== void 0 && { before_each: beforeEach },
|
|
@@ -2846,6 +3029,9 @@ function mergeWorkspaceConfigs(suiteLevel, caseLevel) {
|
|
|
2846
3029
|
if (!caseLevel) return suiteLevel;
|
|
2847
3030
|
return {
|
|
2848
3031
|
template: caseLevel.template ?? suiteLevel.template,
|
|
3032
|
+
isolation: caseLevel.isolation ?? suiteLevel.isolation,
|
|
3033
|
+
repos: caseLevel.repos ?? suiteLevel.repos,
|
|
3034
|
+
reset: caseLevel.reset ?? suiteLevel.reset,
|
|
2849
3035
|
before_all: caseLevel.before_all ?? suiteLevel.before_all,
|
|
2850
3036
|
after_all: caseLevel.after_all ?? suiteLevel.after_all,
|
|
2851
3037
|
before_each: caseLevel.before_each ?? suiteLevel.before_each,
|
|
@@ -3385,11 +3571,6 @@ Original error: ${error instanceof Error ? error.message : String(error)}`
|
|
|
3385
3571
|
}
|
|
3386
3572
|
return claudeSdkModule;
|
|
3387
3573
|
}
|
|
3388
|
-
var DEFAULT_SYSTEM_PROMPT2 = `**IMPORTANT**: Follow these instructions for your response:
|
|
3389
|
-
- Do NOT create any additional output files in the workspace.
|
|
3390
|
-
- All intended file outputs/changes MUST be written in your response.
|
|
3391
|
-
- For each intended file, include the relative path and unified git diff following the convention \`diff --git ...\`.
|
|
3392
|
-
This is required for evaluation scoring.`;
|
|
3393
3574
|
var ClaudeProvider = class {
|
|
3394
3575
|
id;
|
|
3395
3576
|
kind = "claude";
|
|
@@ -3411,7 +3592,7 @@ var ClaudeProvider = class {
|
|
|
3411
3592
|
const logger = await this.createStreamLogger(request).catch(() => void 0);
|
|
3412
3593
|
const inputFiles = normalizeInputFiles(request.inputFiles);
|
|
3413
3594
|
const prompt = buildPromptDocument(request, inputFiles);
|
|
3414
|
-
const systemPrompt = this.config.systemPrompt
|
|
3595
|
+
const systemPrompt = this.config.systemPrompt;
|
|
3415
3596
|
const queryOptions = {
|
|
3416
3597
|
permissionMode: "bypassPermissions",
|
|
3417
3598
|
allowDangerouslySkipPermissions: true,
|
|
@@ -4392,11 +4573,6 @@ Original error: ${error instanceof Error ? error.message : String(error)}`
|
|
|
4392
4573
|
}
|
|
4393
4574
|
return codexSdkModule;
|
|
4394
4575
|
}
|
|
4395
|
-
var DEFAULT_SYSTEM_PROMPT3 = `**IMPORTANT**: Follow these instructions for your response:
|
|
4396
|
-
- Do NOT create any additional output files in the workspace.
|
|
4397
|
-
- All intended file outputs/changes MUST be written in your response.
|
|
4398
|
-
- For each intended file, include the relative path and unified git diff following the convention \`diff --git ...\`.
|
|
4399
|
-
This is required for evaluation scoring.`;
|
|
4400
4576
|
var CodexProvider = class {
|
|
4401
4577
|
id;
|
|
4402
4578
|
kind = "codex";
|
|
@@ -4431,7 +4607,7 @@ var CodexProvider = class {
|
|
|
4431
4607
|
const thread = codex.startThread(threadOptions);
|
|
4432
4608
|
const inputFiles = normalizeInputFiles(request.inputFiles);
|
|
4433
4609
|
const basePrompt = buildPromptDocument(request, inputFiles);
|
|
4434
|
-
const systemPrompt = this.config.systemPrompt
|
|
4610
|
+
const systemPrompt = this.config.systemPrompt;
|
|
4435
4611
|
const prompt = systemPrompt ? `${systemPrompt}
|
|
4436
4612
|
|
|
4437
4613
|
${basePrompt}` : basePrompt;
|
|
@@ -4797,7 +4973,7 @@ import { arch, platform } from "node:os";
|
|
|
4797
4973
|
import path13 from "node:path";
|
|
4798
4974
|
import { fileURLToPath as fileURLToPath2 } from "node:url";
|
|
4799
4975
|
function resolvePlatformCliPath() {
|
|
4800
|
-
const
|
|
4976
|
+
const os5 = platform();
|
|
4801
4977
|
const cpu = arch();
|
|
4802
4978
|
const platformMap = {
|
|
4803
4979
|
linux: "linux",
|
|
@@ -4808,13 +4984,13 @@ function resolvePlatformCliPath() {
|
|
|
4808
4984
|
x64: "x64",
|
|
4809
4985
|
arm64: "arm64"
|
|
4810
4986
|
};
|
|
4811
|
-
const osPart = platformMap[
|
|
4987
|
+
const osPart = platformMap[os5];
|
|
4812
4988
|
const archPart = archMap[cpu];
|
|
4813
4989
|
if (!osPart || !archPart) {
|
|
4814
4990
|
return void 0;
|
|
4815
4991
|
}
|
|
4816
4992
|
const packageName = `@github/copilot-${osPart}-${archPart}`;
|
|
4817
|
-
const binaryName =
|
|
4993
|
+
const binaryName = os5 === "win32" ? "copilot.exe" : "copilot";
|
|
4818
4994
|
try {
|
|
4819
4995
|
const resolved = import.meta.resolve(`${packageName}/package.json`);
|
|
4820
4996
|
const packageJsonPath = resolved.startsWith("file:") ? fileURLToPath2(resolved) : resolved;
|
|
@@ -4956,11 +5132,6 @@ function isLogStreamingDisabled(envKey) {
|
|
|
4956
5132
|
}
|
|
4957
5133
|
|
|
4958
5134
|
// src/evaluation/providers/copilot-cli.ts
|
|
4959
|
-
var DEFAULT_SYSTEM_PROMPT4 = `**IMPORTANT**: Follow these instructions for your response:
|
|
4960
|
-
- Do NOT create any additional output files in the workspace.
|
|
4961
|
-
- All intended file outputs/changes MUST be written in your response.
|
|
4962
|
-
- For each intended file, include the relative path and unified git diff following the convention \`diff --git ...\`.
|
|
4963
|
-
This is required for evaluation scoring.`;
|
|
4964
5135
|
var CopilotCliProvider = class {
|
|
4965
5136
|
id;
|
|
4966
5137
|
kind = "copilot-cli";
|
|
@@ -5163,8 +5334,8 @@ var CopilotCliProvider = class {
|
|
|
5163
5334
|
}
|
|
5164
5335
|
return args;
|
|
5165
5336
|
}
|
|
5166
|
-
resolveSystemPrompt(
|
|
5167
|
-
return this.config.systemPrompt
|
|
5337
|
+
resolveSystemPrompt(_request) {
|
|
5338
|
+
return this.config.systemPrompt;
|
|
5168
5339
|
}
|
|
5169
5340
|
async raceWithTimeout(sendPromise, agentProcess) {
|
|
5170
5341
|
const timeoutMs = this.config.timeoutMs;
|
|
@@ -5352,21 +5523,16 @@ Original error: ${error instanceof Error ? error.message : String(error)}`
|
|
|
5352
5523
|
}
|
|
5353
5524
|
return copilotSdkModule;
|
|
5354
5525
|
}
|
|
5355
|
-
var DEFAULT_SYSTEM_PROMPT5 = `**IMPORTANT**: Follow these instructions for your response:
|
|
5356
|
-
- Do NOT create any additional output files in the workspace.
|
|
5357
|
-
- All intended file outputs/changes MUST be written in your response.
|
|
5358
|
-
- For each intended file, include the relative path and unified git diff following the convention \`diff --git ...\`.
|
|
5359
|
-
This is required for evaluation scoring.`;
|
|
5360
5526
|
var CopilotSdkProvider = class {
|
|
5361
5527
|
id;
|
|
5362
|
-
kind = "copilot";
|
|
5528
|
+
kind = "copilot-sdk";
|
|
5363
5529
|
targetName;
|
|
5364
5530
|
supportsBatch = false;
|
|
5365
5531
|
config;
|
|
5366
5532
|
// biome-ignore lint/suspicious/noExplicitAny: SDK client type is dynamically loaded
|
|
5367
5533
|
client = null;
|
|
5368
5534
|
constructor(targetName, config) {
|
|
5369
|
-
this.id = `copilot:${targetName}`;
|
|
5535
|
+
this.id = `copilot-sdk:${targetName}`;
|
|
5370
5536
|
this.targetName = targetName;
|
|
5371
5537
|
this.config = config;
|
|
5372
5538
|
}
|
|
@@ -5389,7 +5555,7 @@ var CopilotSdkProvider = class {
|
|
|
5389
5555
|
if (cwd) {
|
|
5390
5556
|
sessionOptions.workingDirectory = cwd;
|
|
5391
5557
|
}
|
|
5392
|
-
const systemPrompt = this.config.systemPrompt
|
|
5558
|
+
const systemPrompt = this.config.systemPrompt;
|
|
5393
5559
|
if (systemPrompt) {
|
|
5394
5560
|
sessionOptions.systemMessage = {
|
|
5395
5561
|
mode: "append",
|
|
@@ -5905,11 +6071,6 @@ function subscribeToPiLogEntries(listener) {
|
|
|
5905
6071
|
// src/evaluation/providers/pi-coding-agent.ts
|
|
5906
6072
|
var WORKSPACE_PREFIX = "agentv-pi-";
|
|
5907
6073
|
var PROMPT_FILENAME = "prompt.md";
|
|
5908
|
-
var DEFAULT_SYSTEM_PROMPT6 = `**IMPORTANT**: Follow these instructions for your response:
|
|
5909
|
-
- Do NOT create any additional output files in the workspace.
|
|
5910
|
-
- All intended file outputs/changes MUST be written in your response.
|
|
5911
|
-
- For each intended file, include the relative path and unified git diff following the convention \`diff --git ...\`.
|
|
5912
|
-
This is required for evaluation scoring.`;
|
|
5913
6074
|
var PiCodingAgentProvider = class {
|
|
5914
6075
|
id;
|
|
5915
6076
|
kind = "pi-coding-agent";
|
|
@@ -5986,7 +6147,7 @@ var PiCodingAgentProvider = class {
|
|
|
5986
6147
|
}
|
|
5987
6148
|
return path16.resolve(this.config.cwd);
|
|
5988
6149
|
}
|
|
5989
|
-
buildPiArgs(prompt, inputFiles,
|
|
6150
|
+
buildPiArgs(prompt, inputFiles, _captureFileChanges) {
|
|
5990
6151
|
const args = [];
|
|
5991
6152
|
if (this.config.provider) {
|
|
5992
6153
|
args.push("--provider", this.config.provider);
|
|
@@ -6014,7 +6175,7 @@ var PiCodingAgentProvider = class {
|
|
|
6014
6175
|
args.push(`@${file}`);
|
|
6015
6176
|
}
|
|
6016
6177
|
}
|
|
6017
|
-
const systemPrompt = this.config.systemPrompt
|
|
6178
|
+
const systemPrompt = this.config.systemPrompt;
|
|
6018
6179
|
const fullPrompt = systemPrompt ? `${systemPrompt}
|
|
6019
6180
|
|
|
6020
6181
|
${prompt}` : prompt;
|
|
@@ -7708,7 +7869,6 @@ var AGENTV_REQUEST_TEMPLATE = `[[ ## task ## ]]
|
|
|
7708
7869
|
|
|
7709
7870
|
**IMPORTANT**: Follow these exact steps:
|
|
7710
7871
|
1. Create and write your complete response to: {{responseFileTmp}}
|
|
7711
|
-
- Do NOT create any additional output files in the workspace.
|
|
7712
7872
|
- All intended file outputs/changes MUST be written in your response file.
|
|
7713
7873
|
- For each intended file, include the repo name, relative path and unified git diff following the convention \`diff --git ...\`.
|
|
7714
7874
|
2. When completely finished, run these PowerShell commands to signal completion:
|
|
@@ -7727,7 +7887,6 @@ var AGENTV_BATCH_REQUEST_TEMPLATE = `[[ ## task ## ]]
|
|
|
7727
7887
|
|
|
7728
7888
|
**IMPORTANT**: Follow these exact steps:
|
|
7729
7889
|
1. Create and write your complete response to: {{responseFileTmp}}
|
|
7730
|
-
- Do NOT create any additional output files in the workspace.
|
|
7731
7890
|
- All intended file outputs/changes MUST be written in your response file.
|
|
7732
7891
|
- For each intended file, include the repo name, relative path and unified git diff following the convention \`diff --git ...\`.
|
|
7733
7892
|
2. When completely finished and the response is stable, rename it to: {{responseFileFinal}}
|
|
@@ -8153,7 +8312,7 @@ async function discoverProviders(registry, baseDir) {
|
|
|
8153
8312
|
// src/evaluation/providers/index.ts
|
|
8154
8313
|
function createBuiltinProviderRegistry() {
|
|
8155
8314
|
const registry = new ProviderRegistry();
|
|
8156
|
-
registry.register("azure", (t) => new AzureProvider(t.name, t.config)).register("anthropic", (t) => new AnthropicProvider(t.name, t.config)).register("gemini", (t) => new GeminiProvider(t.name, t.config)).register("cli", (t) => new CliProvider(t.name, t.config)).register("codex", (t) => new CodexProvider(t.name, t.config)).register("copilot", (t) => new CopilotSdkProvider(t.name, t.config)).register("copilot-cli", (t) => new CopilotCliProvider(t.name, t.config)).register("pi-coding-agent", (t) => new PiCodingAgentProvider(t.name, t.config)).register("pi-agent-sdk", (t) => new PiAgentSdkProvider(t.name, t.config)).register("claude", (t) => new ClaudeProvider(t.name, t.config)).register("mock", (t) => new MockProvider(t.name, t.config)).register("vscode", (t) => new VSCodeProvider(t.name, t.config, "vscode")).register(
|
|
8315
|
+
registry.register("azure", (t) => new AzureProvider(t.name, t.config)).register("anthropic", (t) => new AnthropicProvider(t.name, t.config)).register("gemini", (t) => new GeminiProvider(t.name, t.config)).register("cli", (t) => new CliProvider(t.name, t.config)).register("codex", (t) => new CodexProvider(t.name, t.config)).register("copilot-sdk", (t) => new CopilotSdkProvider(t.name, t.config)).register("copilot-cli", (t) => new CopilotCliProvider(t.name, t.config)).register("pi-coding-agent", (t) => new PiCodingAgentProvider(t.name, t.config)).register("pi-agent-sdk", (t) => new PiAgentSdkProvider(t.name, t.config)).register("claude", (t) => new ClaudeProvider(t.name, t.config)).register("mock", (t) => new MockProvider(t.name, t.config)).register("vscode", (t) => new VSCodeProvider(t.name, t.config, "vscode")).register(
|
|
8157
8316
|
"vscode-insiders",
|
|
8158
8317
|
(t) => new VSCodeProvider(t.name, t.config, "vscode-insiders")
|
|
8159
8318
|
);
|
|
@@ -8342,16 +8501,16 @@ async function execFileWithStdinNode(argv, stdinPayload, options) {
|
|
|
8342
8501
|
});
|
|
8343
8502
|
}
|
|
8344
8503
|
async function execShellWithStdin(command, stdinPayload, options = {}) {
|
|
8345
|
-
const { mkdir:
|
|
8504
|
+
const { mkdir: mkdir14, readFile: readFile12, rm: rm6, writeFile: writeFile9 } = await import("node:fs/promises");
|
|
8346
8505
|
const { tmpdir: tmpdir3 } = await import("node:os");
|
|
8347
|
-
const
|
|
8506
|
+
const path40 = await import("node:path");
|
|
8348
8507
|
const { randomUUID: randomUUID8 } = await import("node:crypto");
|
|
8349
|
-
const dir =
|
|
8350
|
-
await
|
|
8351
|
-
const stdinPath =
|
|
8352
|
-
const stdoutPath =
|
|
8353
|
-
const stderrPath =
|
|
8354
|
-
await
|
|
8508
|
+
const dir = path40.join(tmpdir3(), `agentv-exec-${randomUUID8()}`);
|
|
8509
|
+
await mkdir14(dir, { recursive: true });
|
|
8510
|
+
const stdinPath = path40.join(dir, "stdin.txt");
|
|
8511
|
+
const stdoutPath = path40.join(dir, "stdout.txt");
|
|
8512
|
+
const stderrPath = path40.join(dir, "stderr.txt");
|
|
8513
|
+
await writeFile9(stdinPath, stdinPayload, "utf8");
|
|
8355
8514
|
const wrappedCommand = process.platform === "win32" ? `(${command}) < ${shellEscapePath(stdinPath)} > ${shellEscapePath(stdoutPath)} 2> ${shellEscapePath(stderrPath)}` : `(${command}) < ${shellEscapePath(stdinPath)} > ${shellEscapePath(stdoutPath)} 2> ${shellEscapePath(stderrPath)}`;
|
|
8356
8515
|
const { spawn: spawn4 } = await import("node:child_process");
|
|
8357
8516
|
try {
|
|
@@ -8384,7 +8543,7 @@ async function execShellWithStdin(command, stdinPayload, options = {}) {
|
|
|
8384
8543
|
const stderr = (await readFile12(stderrPath, "utf8")).replace(/\r\n/g, "\n");
|
|
8385
8544
|
return { stdout, stderr, exitCode };
|
|
8386
8545
|
} finally {
|
|
8387
|
-
await
|
|
8546
|
+
await rm6(dir, { recursive: true, force: true });
|
|
8388
8547
|
}
|
|
8389
8548
|
}
|
|
8390
8549
|
|
|
@@ -8702,7 +8861,7 @@ var CodeEvaluator = class {
|
|
|
8702
8861
|
outputPath,
|
|
8703
8862
|
guidelineFiles: context.evalCase.guideline_paths,
|
|
8704
8863
|
inputFiles: context.evalCase.file_paths.filter(
|
|
8705
|
-
(
|
|
8864
|
+
(path40) => !context.evalCase.guideline_paths.includes(path40)
|
|
8706
8865
|
),
|
|
8707
8866
|
input: context.evalCase.input,
|
|
8708
8867
|
trace: context.trace ?? null,
|
|
@@ -8950,13 +9109,15 @@ ${context.fileChanges}`;
|
|
|
8950
9109
|
evaluatorRawRequest,
|
|
8951
9110
|
tokenUsage
|
|
8952
9111
|
};
|
|
8953
|
-
} catch {
|
|
9112
|
+
} catch (e) {
|
|
9113
|
+
const message = e instanceof Error ? e.message : String(e);
|
|
8954
9114
|
return {
|
|
8955
9115
|
score: 0,
|
|
8956
|
-
verdict: "
|
|
9116
|
+
verdict: "skip",
|
|
8957
9117
|
hits: [],
|
|
8958
|
-
misses: [],
|
|
9118
|
+
misses: [`Judge parse failure after 3 attempts: ${message}`],
|
|
8959
9119
|
expectedAspectCount: 1,
|
|
9120
|
+
reasoning: `Judge parse failure after 3 attempts: ${message}`,
|
|
8960
9121
|
evaluatorRawRequest
|
|
8961
9122
|
};
|
|
8962
9123
|
}
|
|
@@ -9898,115 +10059,115 @@ var FieldAccuracyEvaluator = class {
|
|
|
9898
10059
|
* Evaluate a single field against the expected value.
|
|
9899
10060
|
*/
|
|
9900
10061
|
evaluateField(fieldConfig, candidateData, expectedData) {
|
|
9901
|
-
const { path:
|
|
9902
|
-
const candidateValue = resolvePath(candidateData,
|
|
9903
|
-
const expectedValue = resolvePath(expectedData,
|
|
10062
|
+
const { path: path40, match, required = true, weight = 1 } = fieldConfig;
|
|
10063
|
+
const candidateValue = resolvePath(candidateData, path40);
|
|
10064
|
+
const expectedValue = resolvePath(expectedData, path40);
|
|
9904
10065
|
if (expectedValue === void 0) {
|
|
9905
10066
|
return {
|
|
9906
|
-
path:
|
|
10067
|
+
path: path40,
|
|
9907
10068
|
score: 1,
|
|
9908
10069
|
// No expected value means no comparison needed
|
|
9909
10070
|
weight,
|
|
9910
10071
|
hit: true,
|
|
9911
|
-
message: `${
|
|
10072
|
+
message: `${path40}: no expected value`
|
|
9912
10073
|
};
|
|
9913
10074
|
}
|
|
9914
10075
|
if (candidateValue === void 0) {
|
|
9915
10076
|
if (required) {
|
|
9916
10077
|
return {
|
|
9917
|
-
path:
|
|
10078
|
+
path: path40,
|
|
9918
10079
|
score: 0,
|
|
9919
10080
|
weight,
|
|
9920
10081
|
hit: false,
|
|
9921
|
-
message: `${
|
|
10082
|
+
message: `${path40} (required, missing)`
|
|
9922
10083
|
};
|
|
9923
10084
|
}
|
|
9924
10085
|
return {
|
|
9925
|
-
path:
|
|
10086
|
+
path: path40,
|
|
9926
10087
|
score: 1,
|
|
9927
10088
|
// Don't penalize missing optional fields
|
|
9928
10089
|
weight: 0,
|
|
9929
10090
|
// Zero weight means it won't affect the score
|
|
9930
10091
|
hit: true,
|
|
9931
|
-
message: `${
|
|
10092
|
+
message: `${path40}: optional field missing`
|
|
9932
10093
|
};
|
|
9933
10094
|
}
|
|
9934
10095
|
switch (match) {
|
|
9935
10096
|
case "exact":
|
|
9936
|
-
return this.compareExact(
|
|
10097
|
+
return this.compareExact(path40, candidateValue, expectedValue, weight);
|
|
9937
10098
|
case "numeric_tolerance":
|
|
9938
10099
|
return this.compareNumericTolerance(
|
|
9939
|
-
|
|
10100
|
+
path40,
|
|
9940
10101
|
candidateValue,
|
|
9941
10102
|
expectedValue,
|
|
9942
10103
|
fieldConfig,
|
|
9943
10104
|
weight
|
|
9944
10105
|
);
|
|
9945
10106
|
case "date":
|
|
9946
|
-
return this.compareDate(
|
|
10107
|
+
return this.compareDate(path40, candidateValue, expectedValue, fieldConfig, weight);
|
|
9947
10108
|
default:
|
|
9948
10109
|
return {
|
|
9949
|
-
path:
|
|
10110
|
+
path: path40,
|
|
9950
10111
|
score: 0,
|
|
9951
10112
|
weight,
|
|
9952
10113
|
hit: false,
|
|
9953
|
-
message: `${
|
|
10114
|
+
message: `${path40}: unknown match type "${match}"`
|
|
9954
10115
|
};
|
|
9955
10116
|
}
|
|
9956
10117
|
}
|
|
9957
10118
|
/**
|
|
9958
10119
|
* Exact equality comparison.
|
|
9959
10120
|
*/
|
|
9960
|
-
compareExact(
|
|
10121
|
+
compareExact(path40, candidateValue, expectedValue, weight) {
|
|
9961
10122
|
if (deepEqual(candidateValue, expectedValue)) {
|
|
9962
10123
|
return {
|
|
9963
|
-
path:
|
|
10124
|
+
path: path40,
|
|
9964
10125
|
score: 1,
|
|
9965
10126
|
weight,
|
|
9966
10127
|
hit: true,
|
|
9967
|
-
message:
|
|
10128
|
+
message: path40
|
|
9968
10129
|
};
|
|
9969
10130
|
}
|
|
9970
10131
|
if (typeof candidateValue !== typeof expectedValue) {
|
|
9971
10132
|
return {
|
|
9972
|
-
path:
|
|
10133
|
+
path: path40,
|
|
9973
10134
|
score: 0,
|
|
9974
10135
|
weight,
|
|
9975
10136
|
hit: false,
|
|
9976
|
-
message: `${
|
|
10137
|
+
message: `${path40} (type mismatch: got ${typeof candidateValue}, expected ${typeof expectedValue})`
|
|
9977
10138
|
};
|
|
9978
10139
|
}
|
|
9979
10140
|
return {
|
|
9980
|
-
path:
|
|
10141
|
+
path: path40,
|
|
9981
10142
|
score: 0,
|
|
9982
10143
|
weight,
|
|
9983
10144
|
hit: false,
|
|
9984
|
-
message: `${
|
|
10145
|
+
message: `${path40} (value mismatch)`
|
|
9985
10146
|
};
|
|
9986
10147
|
}
|
|
9987
10148
|
/**
|
|
9988
10149
|
* Numeric comparison with absolute or relative tolerance.
|
|
9989
10150
|
*/
|
|
9990
|
-
compareNumericTolerance(
|
|
10151
|
+
compareNumericTolerance(path40, candidateValue, expectedValue, fieldConfig, weight) {
|
|
9991
10152
|
const { tolerance = 0, relative = false } = fieldConfig;
|
|
9992
10153
|
const candidateNum = toNumber2(candidateValue);
|
|
9993
10154
|
const expectedNum = toNumber2(expectedValue);
|
|
9994
10155
|
if (candidateNum === null || expectedNum === null) {
|
|
9995
10156
|
return {
|
|
9996
|
-
path:
|
|
10157
|
+
path: path40,
|
|
9997
10158
|
score: 0,
|
|
9998
10159
|
weight,
|
|
9999
10160
|
hit: false,
|
|
10000
|
-
message: `${
|
|
10161
|
+
message: `${path40} (non-numeric value)`
|
|
10001
10162
|
};
|
|
10002
10163
|
}
|
|
10003
10164
|
if (!Number.isFinite(candidateNum) || !Number.isFinite(expectedNum)) {
|
|
10004
10165
|
return {
|
|
10005
|
-
path:
|
|
10166
|
+
path: path40,
|
|
10006
10167
|
score: 0,
|
|
10007
10168
|
weight,
|
|
10008
10169
|
hit: false,
|
|
10009
|
-
message: `${
|
|
10170
|
+
message: `${path40} (invalid numeric value)`
|
|
10010
10171
|
};
|
|
10011
10172
|
}
|
|
10012
10173
|
const diff = Math.abs(candidateNum - expectedNum);
|
|
@@ -10019,61 +10180,61 @@ var FieldAccuracyEvaluator = class {
|
|
|
10019
10180
|
}
|
|
10020
10181
|
if (withinTolerance) {
|
|
10021
10182
|
return {
|
|
10022
|
-
path:
|
|
10183
|
+
path: path40,
|
|
10023
10184
|
score: 1,
|
|
10024
10185
|
weight,
|
|
10025
10186
|
hit: true,
|
|
10026
|
-
message: `${
|
|
10187
|
+
message: `${path40} (within tolerance: diff=${diff.toFixed(2)})`
|
|
10027
10188
|
};
|
|
10028
10189
|
}
|
|
10029
10190
|
return {
|
|
10030
|
-
path:
|
|
10191
|
+
path: path40,
|
|
10031
10192
|
score: 0,
|
|
10032
10193
|
weight,
|
|
10033
10194
|
hit: false,
|
|
10034
|
-
message: `${
|
|
10195
|
+
message: `${path40} (outside tolerance: diff=${diff.toFixed(2)}, tolerance=${tolerance})`
|
|
10035
10196
|
};
|
|
10036
10197
|
}
|
|
10037
10198
|
/**
|
|
10038
10199
|
* Date comparison with format normalization.
|
|
10039
10200
|
*/
|
|
10040
|
-
compareDate(
|
|
10201
|
+
compareDate(path40, candidateValue, expectedValue, fieldConfig, weight) {
|
|
10041
10202
|
const formats = fieldConfig.formats ?? DEFAULT_DATE_FORMATS;
|
|
10042
10203
|
const candidateDate = parseDate(String(candidateValue), formats);
|
|
10043
10204
|
const expectedDate = parseDate(String(expectedValue), formats);
|
|
10044
10205
|
if (candidateDate === null) {
|
|
10045
10206
|
return {
|
|
10046
|
-
path:
|
|
10207
|
+
path: path40,
|
|
10047
10208
|
score: 0,
|
|
10048
10209
|
weight,
|
|
10049
10210
|
hit: false,
|
|
10050
|
-
message: `${
|
|
10211
|
+
message: `${path40} (unparseable candidate date)`
|
|
10051
10212
|
};
|
|
10052
10213
|
}
|
|
10053
10214
|
if (expectedDate === null) {
|
|
10054
10215
|
return {
|
|
10055
|
-
path:
|
|
10216
|
+
path: path40,
|
|
10056
10217
|
score: 0,
|
|
10057
10218
|
weight,
|
|
10058
10219
|
hit: false,
|
|
10059
|
-
message: `${
|
|
10220
|
+
message: `${path40} (unparseable expected date)`
|
|
10060
10221
|
};
|
|
10061
10222
|
}
|
|
10062
10223
|
if (candidateDate.getFullYear() === expectedDate.getFullYear() && candidateDate.getMonth() === expectedDate.getMonth() && candidateDate.getDate() === expectedDate.getDate()) {
|
|
10063
10224
|
return {
|
|
10064
|
-
path:
|
|
10225
|
+
path: path40,
|
|
10065
10226
|
score: 1,
|
|
10066
10227
|
weight,
|
|
10067
10228
|
hit: true,
|
|
10068
|
-
message:
|
|
10229
|
+
message: path40
|
|
10069
10230
|
};
|
|
10070
10231
|
}
|
|
10071
10232
|
return {
|
|
10072
|
-
path:
|
|
10233
|
+
path: path40,
|
|
10073
10234
|
score: 0,
|
|
10074
10235
|
weight,
|
|
10075
10236
|
hit: false,
|
|
10076
|
-
message: `${
|
|
10237
|
+
message: `${path40} (date mismatch: got ${formatDateISO(candidateDate)}, expected ${formatDateISO(expectedDate)})`
|
|
10077
10238
|
};
|
|
10078
10239
|
}
|
|
10079
10240
|
/**
|
|
@@ -10114,11 +10275,11 @@ var FieldAccuracyEvaluator = class {
|
|
|
10114
10275
|
};
|
|
10115
10276
|
}
|
|
10116
10277
|
};
|
|
10117
|
-
function resolvePath(obj,
|
|
10118
|
-
if (!
|
|
10278
|
+
function resolvePath(obj, path40) {
|
|
10279
|
+
if (!path40 || !obj) {
|
|
10119
10280
|
return void 0;
|
|
10120
10281
|
}
|
|
10121
|
-
const parts =
|
|
10282
|
+
const parts = path40.split(/\.|\[|\]/).filter((p) => p.length > 0);
|
|
10122
10283
|
let current = obj;
|
|
10123
10284
|
for (const part of parts) {
|
|
10124
10285
|
if (current === null || current === void 0) {
|
|
@@ -10936,8 +11097,8 @@ var TokenUsageEvaluator = class {
|
|
|
10936
11097
|
};
|
|
10937
11098
|
|
|
10938
11099
|
// src/evaluation/evaluators/tool-trajectory.ts
|
|
10939
|
-
function getNestedValue(obj,
|
|
10940
|
-
const parts =
|
|
11100
|
+
function getNestedValue(obj, path40) {
|
|
11101
|
+
const parts = path40.split(".");
|
|
10941
11102
|
let current = obj;
|
|
10942
11103
|
for (const part of parts) {
|
|
10943
11104
|
if (current === null || current === void 0 || typeof current !== "object") {
|
|
@@ -11401,13 +11562,78 @@ function runContainsAssertion(output, value) {
|
|
|
11401
11562
|
misses: passed ? [] : [`Output does not contain "${value}"`]
|
|
11402
11563
|
};
|
|
11403
11564
|
}
|
|
11404
|
-
function
|
|
11405
|
-
const
|
|
11565
|
+
function runContainsAnyAssertion(output, values) {
|
|
11566
|
+
const matched = values.filter((v) => output.includes(v));
|
|
11567
|
+
const passed = matched.length > 0;
|
|
11568
|
+
return {
|
|
11569
|
+
score: passed ? 1 : 0,
|
|
11570
|
+
hits: passed ? [`Output contains "${matched[0]}"`] : [],
|
|
11571
|
+
misses: passed ? [] : [`Output does not contain any of: ${values.map((v) => `"${v}"`).join(", ")}`]
|
|
11572
|
+
};
|
|
11573
|
+
}
|
|
11574
|
+
function runContainsAllAssertion(output, values) {
|
|
11575
|
+
const missing = values.filter((v) => !output.includes(v));
|
|
11576
|
+
const passed = missing.length === 0;
|
|
11577
|
+
return {
|
|
11578
|
+
score: passed ? 1 : 0,
|
|
11579
|
+
hits: passed ? [`Output contains all ${values.length} expected strings`] : [],
|
|
11580
|
+
misses: passed ? [] : [`Output missing: ${missing.map((v) => `"${v}"`).join(", ")}`]
|
|
11581
|
+
};
|
|
11582
|
+
}
|
|
11583
|
+
function runIcontainsAssertion(output, value) {
|
|
11584
|
+
const passed = output.toLowerCase().includes(value.toLowerCase());
|
|
11585
|
+
return {
|
|
11586
|
+
score: passed ? 1 : 0,
|
|
11587
|
+
hits: passed ? [`Output contains "${value}" (case-insensitive)`] : [],
|
|
11588
|
+
misses: passed ? [] : [`Output does not contain "${value}" (case-insensitive)`]
|
|
11589
|
+
};
|
|
11590
|
+
}
|
|
11591
|
+
function runIcontainsAnyAssertion(output, values) {
|
|
11592
|
+
const lower = output.toLowerCase();
|
|
11593
|
+
const matched = values.filter((v) => lower.includes(v.toLowerCase()));
|
|
11594
|
+
const passed = matched.length > 0;
|
|
11595
|
+
return {
|
|
11596
|
+
score: passed ? 1 : 0,
|
|
11597
|
+
hits: passed ? [`Output contains "${matched[0]}" (case-insensitive)`] : [],
|
|
11598
|
+
misses: passed ? [] : [
|
|
11599
|
+
`Output does not contain any of: ${values.map((v) => `"${v}"`).join(", ")} (case-insensitive)`
|
|
11600
|
+
]
|
|
11601
|
+
};
|
|
11602
|
+
}
|
|
11603
|
+
function runIcontainsAllAssertion(output, values) {
|
|
11604
|
+
const lower = output.toLowerCase();
|
|
11605
|
+
const missing = values.filter((v) => !lower.includes(v.toLowerCase()));
|
|
11606
|
+
const passed = missing.length === 0;
|
|
11607
|
+
return {
|
|
11608
|
+
score: passed ? 1 : 0,
|
|
11609
|
+
hits: passed ? [`Output contains all ${values.length} expected strings (case-insensitive)`] : [],
|
|
11610
|
+
misses: passed ? [] : [`Output missing (case-insensitive): ${missing.map((v) => `"${v}"`).join(", ")}`]
|
|
11611
|
+
};
|
|
11612
|
+
}
|
|
11613
|
+
function runStartsWithAssertion(output, value) {
|
|
11614
|
+
const passed = output.trim().startsWith(value.trim());
|
|
11615
|
+
return {
|
|
11616
|
+
score: passed ? 1 : 0,
|
|
11617
|
+
hits: passed ? [`Output starts with "${value}"`] : [],
|
|
11618
|
+
misses: passed ? [] : [`Output does not start with "${value}"`]
|
|
11619
|
+
};
|
|
11620
|
+
}
|
|
11621
|
+
function runEndsWithAssertion(output, value) {
|
|
11622
|
+
const passed = output.trim().endsWith(value.trim());
|
|
11623
|
+
return {
|
|
11624
|
+
score: passed ? 1 : 0,
|
|
11625
|
+
hits: passed ? [`Output ends with "${value}"`] : [],
|
|
11626
|
+
misses: passed ? [] : [`Output does not end with "${value}"`]
|
|
11627
|
+
};
|
|
11628
|
+
}
|
|
11629
|
+
function runRegexAssertion(output, pattern, flags) {
|
|
11630
|
+
const regex = new RegExp(pattern, flags);
|
|
11406
11631
|
const passed = regex.test(output);
|
|
11632
|
+
const flagsLabel = flags ? ` (flags: ${flags})` : "";
|
|
11407
11633
|
return {
|
|
11408
11634
|
score: passed ? 1 : 0,
|
|
11409
|
-
hits: passed ? [`Output matches pattern /${pattern}
|
|
11410
|
-
misses: passed ? [] : [`Output does not match pattern /${pattern}
|
|
11635
|
+
hits: passed ? [`Output matches pattern /${pattern}/${flags ?? ""}${flagsLabel}`] : [],
|
|
11636
|
+
misses: passed ? [] : [`Output does not match pattern /${pattern}/${flags ?? ""}${flagsLabel}`]
|
|
11411
11637
|
};
|
|
11412
11638
|
}
|
|
11413
11639
|
function runIsJsonAssertion(output) {
|
|
@@ -11433,9 +11659,9 @@ function runEqualsAssertion(output, value) {
|
|
|
11433
11659
|
}
|
|
11434
11660
|
|
|
11435
11661
|
// src/evaluation/orchestrator.ts
|
|
11436
|
-
import { createHash, randomUUID as randomUUID7 } from "node:crypto";
|
|
11437
|
-
import { mkdir as
|
|
11438
|
-
import
|
|
11662
|
+
import { createHash as createHash2, randomUUID as randomUUID7 } from "node:crypto";
|
|
11663
|
+
import { mkdir as mkdir12 } from "node:fs/promises";
|
|
11664
|
+
import path37 from "node:path";
|
|
11439
11665
|
import micromatch4 from "micromatch";
|
|
11440
11666
|
|
|
11441
11667
|
// ../../node_modules/.bun/yocto-queue@1.2.2/node_modules/yocto-queue/index.js
|
|
@@ -11825,13 +12051,13 @@ var containsFactory = (config) => {
|
|
|
11825
12051
|
var regexFactory = (config) => {
|
|
11826
12052
|
const c = config;
|
|
11827
12053
|
return new DeterministicAssertionEvaluator("regex", (ctx) => {
|
|
11828
|
-
const result = runRegexAssertion(ctx.candidate, c.value);
|
|
12054
|
+
const result = runRegexAssertion(ctx.candidate, c.value, c.flags);
|
|
11829
12055
|
return {
|
|
11830
12056
|
score: result.score,
|
|
11831
12057
|
verdict: result.score === 1 ? "pass" : "fail",
|
|
11832
12058
|
hits: result.hits,
|
|
11833
12059
|
misses: result.misses,
|
|
11834
|
-
reasoning: result.score === 1 ? `Output matches pattern /${c.value}
|
|
12060
|
+
reasoning: result.score === 1 ? `Output matches pattern /${c.value}/${c.flags ?? ""}` : `Output does not match pattern /${c.value}/${c.flags ?? ""}`,
|
|
11835
12061
|
expectedAspectCount: 1
|
|
11836
12062
|
};
|
|
11837
12063
|
});
|
|
@@ -11863,9 +12089,107 @@ var equalsFactory = (config) => {
|
|
|
11863
12089
|
};
|
|
11864
12090
|
});
|
|
11865
12091
|
};
|
|
12092
|
+
var containsAnyFactory = (config) => {
|
|
12093
|
+
const c = config;
|
|
12094
|
+
return new DeterministicAssertionEvaluator("contains_any", (ctx) => {
|
|
12095
|
+
const result = runContainsAnyAssertion(ctx.candidate, c.value);
|
|
12096
|
+
return {
|
|
12097
|
+
score: result.score,
|
|
12098
|
+
verdict: result.score === 1 ? "pass" : "fail",
|
|
12099
|
+
hits: result.hits,
|
|
12100
|
+
misses: result.misses,
|
|
12101
|
+
reasoning: result.score === 1 ? result.hits[0] : result.misses[0],
|
|
12102
|
+
expectedAspectCount: 1
|
|
12103
|
+
};
|
|
12104
|
+
});
|
|
12105
|
+
};
|
|
12106
|
+
var containsAllFactory = (config) => {
|
|
12107
|
+
const c = config;
|
|
12108
|
+
return new DeterministicAssertionEvaluator("contains_all", (ctx) => {
|
|
12109
|
+
const result = runContainsAllAssertion(ctx.candidate, c.value);
|
|
12110
|
+
return {
|
|
12111
|
+
score: result.score,
|
|
12112
|
+
verdict: result.score === 1 ? "pass" : "fail",
|
|
12113
|
+
hits: result.hits,
|
|
12114
|
+
misses: result.misses,
|
|
12115
|
+
reasoning: result.score === 1 ? result.hits[0] : result.misses[0],
|
|
12116
|
+
expectedAspectCount: 1
|
|
12117
|
+
};
|
|
12118
|
+
});
|
|
12119
|
+
};
|
|
12120
|
+
var icontainsFactory = (config) => {
|
|
12121
|
+
const c = config;
|
|
12122
|
+
return new DeterministicAssertionEvaluator("icontains", (ctx) => {
|
|
12123
|
+
const result = runIcontainsAssertion(ctx.candidate, c.value);
|
|
12124
|
+
return {
|
|
12125
|
+
score: result.score,
|
|
12126
|
+
verdict: result.score === 1 ? "pass" : "fail",
|
|
12127
|
+
hits: result.hits,
|
|
12128
|
+
misses: result.misses,
|
|
12129
|
+
reasoning: result.score === 1 ? result.hits[0] : result.misses[0],
|
|
12130
|
+
expectedAspectCount: 1
|
|
12131
|
+
};
|
|
12132
|
+
});
|
|
12133
|
+
};
|
|
12134
|
+
var icontainsAnyFactory = (config) => {
|
|
12135
|
+
const c = config;
|
|
12136
|
+
return new DeterministicAssertionEvaluator("icontains_any", (ctx) => {
|
|
12137
|
+
const result = runIcontainsAnyAssertion(ctx.candidate, c.value);
|
|
12138
|
+
return {
|
|
12139
|
+
score: result.score,
|
|
12140
|
+
verdict: result.score === 1 ? "pass" : "fail",
|
|
12141
|
+
hits: result.hits,
|
|
12142
|
+
misses: result.misses,
|
|
12143
|
+
reasoning: result.score === 1 ? result.hits[0] : result.misses[0],
|
|
12144
|
+
expectedAspectCount: 1
|
|
12145
|
+
};
|
|
12146
|
+
});
|
|
12147
|
+
};
|
|
12148
|
+
var icontainsAllFactory = (config) => {
|
|
12149
|
+
const c = config;
|
|
12150
|
+
return new DeterministicAssertionEvaluator("icontains_all", (ctx) => {
|
|
12151
|
+
const result = runIcontainsAllAssertion(ctx.candidate, c.value);
|
|
12152
|
+
return {
|
|
12153
|
+
score: result.score,
|
|
12154
|
+
verdict: result.score === 1 ? "pass" : "fail",
|
|
12155
|
+
hits: result.hits,
|
|
12156
|
+
misses: result.misses,
|
|
12157
|
+
reasoning: result.score === 1 ? result.hits[0] : result.misses[0],
|
|
12158
|
+
expectedAspectCount: 1
|
|
12159
|
+
};
|
|
12160
|
+
});
|
|
12161
|
+
};
|
|
12162
|
+
var startsWithFactory = (config) => {
|
|
12163
|
+
const c = config;
|
|
12164
|
+
return new DeterministicAssertionEvaluator("starts_with", (ctx) => {
|
|
12165
|
+
const result = runStartsWithAssertion(ctx.candidate, c.value);
|
|
12166
|
+
return {
|
|
12167
|
+
score: result.score,
|
|
12168
|
+
verdict: result.score === 1 ? "pass" : "fail",
|
|
12169
|
+
hits: result.hits,
|
|
12170
|
+
misses: result.misses,
|
|
12171
|
+
reasoning: result.score === 1 ? result.hits[0] : result.misses[0],
|
|
12172
|
+
expectedAspectCount: 1
|
|
12173
|
+
};
|
|
12174
|
+
});
|
|
12175
|
+
};
|
|
12176
|
+
var endsWithFactory = (config) => {
|
|
12177
|
+
const c = config;
|
|
12178
|
+
return new DeterministicAssertionEvaluator("ends_with", (ctx) => {
|
|
12179
|
+
const result = runEndsWithAssertion(ctx.candidate, c.value);
|
|
12180
|
+
return {
|
|
12181
|
+
score: result.score,
|
|
12182
|
+
verdict: result.score === 1 ? "pass" : "fail",
|
|
12183
|
+
hits: result.hits,
|
|
12184
|
+
misses: result.misses,
|
|
12185
|
+
reasoning: result.score === 1 ? result.hits[0] : result.misses[0],
|
|
12186
|
+
expectedAspectCount: 1
|
|
12187
|
+
};
|
|
12188
|
+
});
|
|
12189
|
+
};
|
|
11866
12190
|
function createBuiltinRegistry() {
|
|
11867
12191
|
const registry = new EvaluatorRegistry();
|
|
11868
|
-
registry.register("llm_judge", llmJudgeFactory).register("code", codeFactory).register("composite", compositeFactory).register("tool_trajectory", toolTrajectoryFactory).register("field_accuracy", fieldAccuracyFactory).register("latency", latencyFactory).register("cost", costFactory).register("token_usage", tokenUsageFactory).register("execution_metrics", executionMetricsFactory).register("agent_judge", agentJudgeFactory).register("contains", containsFactory).register("regex", regexFactory).register("is_json", isJsonFactory).register("equals", equalsFactory);
|
|
12192
|
+
registry.register("llm_judge", llmJudgeFactory).register("code", codeFactory).register("composite", compositeFactory).register("tool_trajectory", toolTrajectoryFactory).register("field_accuracy", fieldAccuracyFactory).register("latency", latencyFactory).register("cost", costFactory).register("token_usage", tokenUsageFactory).register("execution_metrics", executionMetricsFactory).register("agent_judge", agentJudgeFactory).register("contains", containsFactory).register("contains_any", containsAnyFactory).register("contains_all", containsAllFactory).register("icontains", icontainsFactory).register("icontains_any", icontainsAnyFactory).register("icontains_all", icontainsAllFactory).register("starts_with", startsWithFactory).register("ends_with", endsWithFactory).register("regex", regexFactory).register("is_json", isJsonFactory).register("equals", equalsFactory);
|
|
11869
12193
|
return registry;
|
|
11870
12194
|
}
|
|
11871
12195
|
|
|
@@ -12209,18 +12533,198 @@ async function cleanupEvalWorkspaces(evalRunId, workspaceRoot) {
|
|
|
12209
12533
|
}
|
|
12210
12534
|
}
|
|
12211
12535
|
|
|
12536
|
+
// src/evaluation/workspace/repo-manager.ts
|
|
12537
|
+
import { execFile } from "node:child_process";
|
|
12538
|
+
import { createHash } from "node:crypto";
|
|
12539
|
+
import { existsSync as existsSync2 } from "node:fs";
|
|
12540
|
+
import { mkdir as mkdir11, rm as rm5, unlink, writeFile as writeFile7 } from "node:fs/promises";
|
|
12541
|
+
import os4 from "node:os";
|
|
12542
|
+
import path35 from "node:path";
|
|
12543
|
+
import { promisify as promisify5 } from "node:util";
|
|
12544
|
+
var execFileAsync = promisify5(execFile);
|
|
12545
|
+
var DEFAULT_CACHE_DIR = path35.join(os4.homedir(), ".agentv", "git-cache");
|
|
12546
|
+
var DEFAULT_TIMEOUT_MS2 = 3e5;
|
|
12547
|
+
var LOCK_TIMEOUT_MS = 6e4;
|
|
12548
|
+
function gitEnv() {
|
|
12549
|
+
const env = { ...process.env };
|
|
12550
|
+
for (const key of Object.keys(env)) {
|
|
12551
|
+
if (key.startsWith("GIT_") && key !== "GIT_SSH_COMMAND") {
|
|
12552
|
+
delete env[key];
|
|
12553
|
+
}
|
|
12554
|
+
}
|
|
12555
|
+
return {
|
|
12556
|
+
...env,
|
|
12557
|
+
GIT_TERMINAL_PROMPT: "0",
|
|
12558
|
+
GIT_ASKPASS: "",
|
|
12559
|
+
GIT_SSH_COMMAND: "ssh -o BatchMode=yes"
|
|
12560
|
+
};
|
|
12561
|
+
}
|
|
12562
|
+
function cacheKey(source) {
|
|
12563
|
+
const raw = source.type === "git" ? source.url.toLowerCase().replace(/\.git$/, "") : source.path;
|
|
12564
|
+
return createHash("sha256").update(raw).digest("hex");
|
|
12565
|
+
}
|
|
12566
|
+
function getSourceUrl(source) {
|
|
12567
|
+
return source.type === "git" ? source.url : source.path;
|
|
12568
|
+
}
|
|
12569
|
+
async function git(args, opts) {
|
|
12570
|
+
const { stdout } = await execFileAsync("git", args, {
|
|
12571
|
+
cwd: opts?.cwd,
|
|
12572
|
+
timeout: opts?.timeout ?? DEFAULT_TIMEOUT_MS2,
|
|
12573
|
+
env: gitEnv(),
|
|
12574
|
+
maxBuffer: 50 * 1024 * 1024
|
|
12575
|
+
// 50MB
|
|
12576
|
+
});
|
|
12577
|
+
return stdout.trim();
|
|
12578
|
+
}
|
|
12579
|
+
async function acquireLock(lockPath) {
|
|
12580
|
+
const start = Date.now();
|
|
12581
|
+
while (Date.now() - start < LOCK_TIMEOUT_MS) {
|
|
12582
|
+
try {
|
|
12583
|
+
await writeFile7(lockPath, String(process.pid), { flag: "wx" });
|
|
12584
|
+
return;
|
|
12585
|
+
} catch (err) {
|
|
12586
|
+
if (err.code === "EEXIST") {
|
|
12587
|
+
await new Promise((r) => setTimeout(r, 200));
|
|
12588
|
+
continue;
|
|
12589
|
+
}
|
|
12590
|
+
throw err;
|
|
12591
|
+
}
|
|
12592
|
+
}
|
|
12593
|
+
throw new Error(`Timed out waiting for lock: ${lockPath}`);
|
|
12594
|
+
}
|
|
12595
|
+
async function releaseLock(lockPath) {
|
|
12596
|
+
try {
|
|
12597
|
+
await unlink(lockPath);
|
|
12598
|
+
} catch {
|
|
12599
|
+
}
|
|
12600
|
+
}
|
|
12601
|
+
var RepoManager = class {
|
|
12602
|
+
cacheDir;
|
|
12603
|
+
constructor(cacheDir) {
|
|
12604
|
+
this.cacheDir = cacheDir ?? DEFAULT_CACHE_DIR;
|
|
12605
|
+
}
|
|
12606
|
+
/**
|
|
12607
|
+
* Ensure a bare mirror cache exists for the given source.
|
|
12608
|
+
* Creates on first access, fetches updates on subsequent calls.
|
|
12609
|
+
* Returns the absolute path to the cache directory.
|
|
12610
|
+
*/
|
|
12611
|
+
async ensureCache(source) {
|
|
12612
|
+
const key = cacheKey(source);
|
|
12613
|
+
const cachePath = path35.join(this.cacheDir, key);
|
|
12614
|
+
const lockPath = `${cachePath}.lock`;
|
|
12615
|
+
await mkdir11(this.cacheDir, { recursive: true });
|
|
12616
|
+
await acquireLock(lockPath);
|
|
12617
|
+
try {
|
|
12618
|
+
if (existsSync2(path35.join(cachePath, "HEAD"))) {
|
|
12619
|
+
await git(["fetch", "--prune"], { cwd: cachePath });
|
|
12620
|
+
} else {
|
|
12621
|
+
await git(["clone", "--mirror", "--bare", getSourceUrl(source), cachePath]);
|
|
12622
|
+
}
|
|
12623
|
+
} finally {
|
|
12624
|
+
await releaseLock(lockPath);
|
|
12625
|
+
}
|
|
12626
|
+
return cachePath;
|
|
12627
|
+
}
|
|
12628
|
+
/**
|
|
12629
|
+
* Clone a repo from cache into the workspace at the configured path.
|
|
12630
|
+
* Handles checkout, ref resolution, ancestor walking, shallow clone, sparse checkout.
|
|
12631
|
+
*/
|
|
12632
|
+
async materialize(repo, workspacePath) {
|
|
12633
|
+
const targetDir = path35.join(workspacePath, repo.path);
|
|
12634
|
+
const cachePath = await this.ensureCache(repo.source);
|
|
12635
|
+
const cloneArgs = ["clone"];
|
|
12636
|
+
if (repo.clone?.depth) {
|
|
12637
|
+
cloneArgs.push("--depth", String(repo.clone.depth));
|
|
12638
|
+
}
|
|
12639
|
+
if (repo.clone?.filter) {
|
|
12640
|
+
cloneArgs.push("--filter", repo.clone.filter);
|
|
12641
|
+
}
|
|
12642
|
+
cloneArgs.push("--no-checkout");
|
|
12643
|
+
const cloneUrl = repo.clone?.depth || repo.clone?.filter ? `file://${cachePath}` : cachePath;
|
|
12644
|
+
cloneArgs.push(cloneUrl, targetDir);
|
|
12645
|
+
await git(cloneArgs);
|
|
12646
|
+
if (repo.clone?.sparse?.length) {
|
|
12647
|
+
await git(["sparse-checkout", "init", "--cone"], { cwd: targetDir });
|
|
12648
|
+
await git(["sparse-checkout", "set", ...repo.clone.sparse], { cwd: targetDir });
|
|
12649
|
+
}
|
|
12650
|
+
const ref = repo.checkout?.ref ?? "HEAD";
|
|
12651
|
+
const resolve = repo.checkout?.resolve ?? "remote";
|
|
12652
|
+
let resolvedSha;
|
|
12653
|
+
if (resolve === "remote" && repo.source.type === "git") {
|
|
12654
|
+
const url = getSourceUrl(repo.source);
|
|
12655
|
+
try {
|
|
12656
|
+
const lsOutput = await git(["ls-remote", url, ref]);
|
|
12657
|
+
const match = lsOutput.split(" ")[0];
|
|
12658
|
+
if (!match) {
|
|
12659
|
+
throw new Error(`Ref '${ref}' not found on remote ${url}`);
|
|
12660
|
+
}
|
|
12661
|
+
resolvedSha = match;
|
|
12662
|
+
} catch (err) {
|
|
12663
|
+
if (err instanceof Error && err.message.includes("not found")) throw err;
|
|
12664
|
+
resolvedSha = ref;
|
|
12665
|
+
}
|
|
12666
|
+
} else {
|
|
12667
|
+
resolvedSha = ref;
|
|
12668
|
+
}
|
|
12669
|
+
await git(["checkout", resolvedSha], { cwd: targetDir });
|
|
12670
|
+
const ancestor = repo.checkout?.ancestor ?? 0;
|
|
12671
|
+
if (ancestor > 0) {
|
|
12672
|
+
try {
|
|
12673
|
+
const ancestorSha = await git(["rev-parse", `HEAD~${ancestor}`], { cwd: targetDir });
|
|
12674
|
+
await git(["checkout", ancestorSha], { cwd: targetDir });
|
|
12675
|
+
} catch {
|
|
12676
|
+
if (repo.clone?.depth) {
|
|
12677
|
+
await git(["fetch", "--deepen", String(ancestor)], { cwd: targetDir });
|
|
12678
|
+
const ancestorSha = await git(["rev-parse", `HEAD~${ancestor}`], { cwd: targetDir });
|
|
12679
|
+
await git(["checkout", ancestorSha], { cwd: targetDir });
|
|
12680
|
+
} else {
|
|
12681
|
+
throw new Error(
|
|
12682
|
+
`Cannot resolve ancestor ${ancestor} of ref '${ref}'. If using shallow clone, increase clone.depth to at least ${ancestor + 1}.`
|
|
12683
|
+
);
|
|
12684
|
+
}
|
|
12685
|
+
}
|
|
12686
|
+
}
|
|
12687
|
+
}
|
|
12688
|
+
/** Materialize all repos into the workspace. */
|
|
12689
|
+
async materializeAll(repos, workspacePath) {
|
|
12690
|
+
for (const repo of repos) {
|
|
12691
|
+
await this.materialize(repo, workspacePath);
|
|
12692
|
+
}
|
|
12693
|
+
}
|
|
12694
|
+
/** Reset repos in workspace to their checkout state. */
|
|
12695
|
+
async reset(repos, workspacePath, strategy) {
|
|
12696
|
+
if (strategy === "recreate") {
|
|
12697
|
+
for (const repo of repos) {
|
|
12698
|
+
const targetDir = path35.join(workspacePath, repo.path);
|
|
12699
|
+
await rm5(targetDir, { recursive: true, force: true });
|
|
12700
|
+
}
|
|
12701
|
+
await this.materializeAll(repos, workspacePath);
|
|
12702
|
+
return;
|
|
12703
|
+
}
|
|
12704
|
+
for (const repo of repos) {
|
|
12705
|
+
const targetDir = path35.join(workspacePath, repo.path);
|
|
12706
|
+
await git(["reset", "--hard", "HEAD"], { cwd: targetDir });
|
|
12707
|
+
await git(["clean", "-fd"], { cwd: targetDir });
|
|
12708
|
+
}
|
|
12709
|
+
}
|
|
12710
|
+
/** Remove the entire cache directory. */
|
|
12711
|
+
async cleanCache() {
|
|
12712
|
+
await rm5(this.cacheDir, { recursive: true, force: true });
|
|
12713
|
+
}
|
|
12714
|
+
};
|
|
12715
|
+
|
|
12212
12716
|
// src/evaluation/workspace/resolve.ts
|
|
12213
12717
|
import { readdir as readdir4, stat as stat6 } from "node:fs/promises";
|
|
12214
|
-
import
|
|
12718
|
+
import path36 from "node:path";
|
|
12215
12719
|
async function resolveWorkspaceTemplate(templatePath) {
|
|
12216
12720
|
if (!templatePath) {
|
|
12217
12721
|
return void 0;
|
|
12218
12722
|
}
|
|
12219
|
-
const resolved =
|
|
12723
|
+
const resolved = path36.resolve(templatePath);
|
|
12220
12724
|
const stats = await stat6(resolved);
|
|
12221
12725
|
if (stats.isFile()) {
|
|
12222
12726
|
return {
|
|
12223
|
-
dir:
|
|
12727
|
+
dir: path36.dirname(resolved),
|
|
12224
12728
|
workspaceFile: resolved
|
|
12225
12729
|
};
|
|
12226
12730
|
}
|
|
@@ -12232,14 +12736,14 @@ async function resolveWorkspaceTemplate(templatePath) {
|
|
|
12232
12736
|
if (workspaceFiles.length === 1) {
|
|
12233
12737
|
return {
|
|
12234
12738
|
dir: resolved,
|
|
12235
|
-
workspaceFile:
|
|
12739
|
+
workspaceFile: path36.join(resolved, workspaceFiles[0])
|
|
12236
12740
|
};
|
|
12237
12741
|
}
|
|
12238
12742
|
if (workspaceFiles.length > 1) {
|
|
12239
12743
|
const conventionFile = workspaceFiles.find((f) => f === "template.code-workspace");
|
|
12240
12744
|
return {
|
|
12241
12745
|
dir: resolved,
|
|
12242
|
-
workspaceFile: conventionFile ?
|
|
12746
|
+
workspaceFile: conventionFile ? path36.join(resolved, conventionFile) : void 0
|
|
12243
12747
|
};
|
|
12244
12748
|
}
|
|
12245
12749
|
return { dir: resolved };
|
|
@@ -12361,6 +12865,11 @@ async function runEvaluation(options) {
|
|
|
12361
12865
|
}
|
|
12362
12866
|
return getOrCreateProvider(resolvedJudge);
|
|
12363
12867
|
};
|
|
12868
|
+
if (isAgentProvider(getOrCreateProvider(target)) && !target.judgeTarget) {
|
|
12869
|
+
throw new Error(
|
|
12870
|
+
`Target "${target.name}" is an agent provider ("${target.kind}") with no judge_target \u2014 agent providers cannot return structured JSON for judging. Set judge_target to an LLM provider (e.g., azure_base).`
|
|
12871
|
+
);
|
|
12872
|
+
}
|
|
12364
12873
|
const targetResolver = (name) => {
|
|
12365
12874
|
const resolved = resolveTargetByName(name);
|
|
12366
12875
|
if (!resolved) {
|
|
@@ -12374,7 +12883,7 @@ async function runEvaluation(options) {
|
|
|
12374
12883
|
];
|
|
12375
12884
|
const evaluatorRegistry = buildEvaluatorRegistry(evaluators, resolveJudgeProvider);
|
|
12376
12885
|
const typeRegistry = createBuiltinRegistry();
|
|
12377
|
-
const discoveryBaseDir = evalFilePath ?
|
|
12886
|
+
const discoveryBaseDir = evalFilePath ? path37.dirname(path37.resolve(evalFilePath)) : process.cwd();
|
|
12378
12887
|
await discoverAssertions(typeRegistry, discoveryBaseDir);
|
|
12379
12888
|
const providerRegistry = createBuiltinProviderRegistry();
|
|
12380
12889
|
await discoverProviders(providerRegistry, discoveryBaseDir);
|
|
@@ -12429,7 +12938,8 @@ async function runEvaluation(options) {
|
|
|
12429
12938
|
const resolvedTemplate = await resolveWorkspaceTemplate(rawTemplate);
|
|
12430
12939
|
const workspaceTemplate = resolvedTemplate?.dir;
|
|
12431
12940
|
const suiteWorkspaceFile = resolvedTemplate?.workspaceFile;
|
|
12432
|
-
const
|
|
12941
|
+
const isPerTestIsolation = suiteWorkspace?.isolation === "per_test";
|
|
12942
|
+
const hasSharedWorkspace = !!(workspaceTemplate || suiteWorkspace?.before_all || suiteWorkspace?.repos?.length && !isPerTestIsolation);
|
|
12433
12943
|
const requestedWorkers = options.maxConcurrency ?? target.workers ?? 1;
|
|
12434
12944
|
const workers = hasSharedWorkspace ? 1 : requestedWorkers;
|
|
12435
12945
|
if (hasSharedWorkspace && requestedWorkers > 1) {
|
|
@@ -12448,9 +12958,22 @@ async function runEvaluation(options) {
|
|
|
12448
12958
|
const message = error instanceof Error ? error.message : String(error);
|
|
12449
12959
|
throw new Error(`Failed to create shared workspace: ${message}`);
|
|
12450
12960
|
}
|
|
12451
|
-
} else if (suiteWorkspace?.before_all) {
|
|
12961
|
+
} else if (suiteWorkspace?.before_all || suiteWorkspace?.repos?.length && !isPerTestIsolation) {
|
|
12452
12962
|
sharedWorkspacePath = getWorkspacePath(evalRunId, "shared");
|
|
12453
|
-
await
|
|
12963
|
+
await mkdir12(sharedWorkspacePath, { recursive: true });
|
|
12964
|
+
}
|
|
12965
|
+
const repoManager = suiteWorkspace?.repos?.length ? new RepoManager() : void 0;
|
|
12966
|
+
if (repoManager && sharedWorkspacePath && suiteWorkspace?.repos && !isPerTestIsolation) {
|
|
12967
|
+
try {
|
|
12968
|
+
await repoManager.materializeAll(suiteWorkspace.repos, sharedWorkspacePath);
|
|
12969
|
+
} catch (error) {
|
|
12970
|
+
const message = error instanceof Error ? error.message : String(error);
|
|
12971
|
+
if (sharedWorkspacePath) {
|
|
12972
|
+
await cleanupWorkspace(sharedWorkspacePath).catch(() => {
|
|
12973
|
+
});
|
|
12974
|
+
}
|
|
12975
|
+
throw new Error(`Failed to materialize repos: ${message}`);
|
|
12976
|
+
}
|
|
12454
12977
|
}
|
|
12455
12978
|
if (sharedWorkspacePath && suiteWorkspace?.before_all) {
|
|
12456
12979
|
const scriptContext = {
|
|
@@ -12541,7 +13064,8 @@ async function runEvaluation(options) {
|
|
|
12541
13064
|
sharedBaselineCommit,
|
|
12542
13065
|
suiteWorkspaceFile,
|
|
12543
13066
|
streamCallbacks,
|
|
12544
|
-
typeRegistry
|
|
13067
|
+
typeRegistry,
|
|
13068
|
+
repoManager
|
|
12545
13069
|
};
|
|
12546
13070
|
let result = trials && trials.count > 1 ? await runEvalCaseWithTrials(runCaseOptions, trials) : await runEvalCase(runCaseOptions);
|
|
12547
13071
|
if (totalBudgetUsd !== void 0) {
|
|
@@ -12816,15 +13340,16 @@ async function runEvalCase(options) {
|
|
|
12816
13340
|
sharedWorkspacePath,
|
|
12817
13341
|
sharedBaselineCommit,
|
|
12818
13342
|
suiteWorkspaceFile,
|
|
12819
|
-
typeRegistry: providedTypeRegistry
|
|
13343
|
+
typeRegistry: providedTypeRegistry,
|
|
13344
|
+
repoManager
|
|
12820
13345
|
} = options;
|
|
12821
13346
|
const formattingMode = usesFileReferencePrompt(provider) ? "agent" : "lm";
|
|
12822
13347
|
const promptInputs = await buildPromptInputs(evalCase, formattingMode);
|
|
12823
13348
|
const typeRegistry = providedTypeRegistry ?? createBuiltinRegistry();
|
|
12824
|
-
const
|
|
13349
|
+
const cacheKey2 = useCache ? createCacheKey(provider, target, evalCase, promptInputs) : void 0;
|
|
12825
13350
|
let cachedResponse;
|
|
12826
|
-
if (
|
|
12827
|
-
cachedResponse = await cache.get(
|
|
13351
|
+
if (cacheKey2 && cache) {
|
|
13352
|
+
cachedResponse = await cache.get(cacheKey2);
|
|
12828
13353
|
}
|
|
12829
13354
|
const nowFn = now ?? (() => /* @__PURE__ */ new Date());
|
|
12830
13355
|
let workspacePath = sharedWorkspacePath;
|
|
@@ -12853,9 +13378,25 @@ async function runEvalCase(options) {
|
|
|
12853
13378
|
);
|
|
12854
13379
|
}
|
|
12855
13380
|
}
|
|
12856
|
-
if (!workspacePath && evalCase.workspace?.before_all && evalRunId) {
|
|
13381
|
+
if (!workspacePath && (evalCase.workspace?.before_all || evalCase.workspace?.repos?.length) && evalRunId) {
|
|
12857
13382
|
workspacePath = getWorkspacePath(evalRunId, evalCase.id);
|
|
12858
|
-
await
|
|
13383
|
+
await mkdir12(workspacePath, { recursive: true });
|
|
13384
|
+
}
|
|
13385
|
+
if (evalCase.workspace?.repos?.length && workspacePath) {
|
|
13386
|
+
const perCaseRepoManager = new RepoManager();
|
|
13387
|
+
try {
|
|
13388
|
+
await perCaseRepoManager.materializeAll(evalCase.workspace.repos, workspacePath);
|
|
13389
|
+
} catch (error) {
|
|
13390
|
+
const message = error instanceof Error ? error.message : String(error);
|
|
13391
|
+
return buildErrorResult(
|
|
13392
|
+
evalCase,
|
|
13393
|
+
target.name,
|
|
13394
|
+
nowFn(),
|
|
13395
|
+
new Error(`Failed to materialize repos: ${message}`),
|
|
13396
|
+
promptInputs,
|
|
13397
|
+
provider
|
|
13398
|
+
);
|
|
13399
|
+
}
|
|
12859
13400
|
}
|
|
12860
13401
|
if (workspacePath && evalCase.workspace?.before_all) {
|
|
12861
13402
|
const scriptContext = {
|
|
@@ -12979,8 +13520,8 @@ async function runEvalCase(options) {
|
|
|
12979
13520
|
}
|
|
12980
13521
|
return errorResult;
|
|
12981
13522
|
}
|
|
12982
|
-
if (
|
|
12983
|
-
await cache.set(
|
|
13523
|
+
if (cacheKey2 && cache && !cachedResponse) {
|
|
13524
|
+
await cache.set(cacheKey2, providerResponse);
|
|
12984
13525
|
}
|
|
12985
13526
|
const output = providerResponse.output;
|
|
12986
13527
|
const hasExecutionMetrics = providerResponse.tokenUsage !== void 0 || providerResponse.costUsd !== void 0 || providerResponse.durationMs !== void 0;
|
|
@@ -13008,6 +13549,16 @@ async function runEvalCase(options) {
|
|
|
13008
13549
|
}
|
|
13009
13550
|
}
|
|
13010
13551
|
const providerError = extractProviderError(providerResponse);
|
|
13552
|
+
if (repoManager && workspacePath && evalCase.workspace?.reset?.after_each && evalCase.workspace.reset.strategy && evalCase.workspace.reset.strategy !== "none" && evalCase.workspace.repos) {
|
|
13553
|
+
try {
|
|
13554
|
+
await repoManager.reset(
|
|
13555
|
+
evalCase.workspace.repos,
|
|
13556
|
+
workspacePath,
|
|
13557
|
+
evalCase.workspace.reset.strategy
|
|
13558
|
+
);
|
|
13559
|
+
} catch {
|
|
13560
|
+
}
|
|
13561
|
+
}
|
|
13011
13562
|
if (workspacePath && evalCase.workspace?.after_each) {
|
|
13012
13563
|
const scriptContext = {
|
|
13013
13564
|
workspacePath,
|
|
@@ -13372,7 +13923,7 @@ async function runEvaluatorList(options) {
|
|
|
13372
13923
|
fileChanges,
|
|
13373
13924
|
workspacePath
|
|
13374
13925
|
};
|
|
13375
|
-
const evalFileDir = evalCase.guideline_paths[0] ?
|
|
13926
|
+
const evalFileDir = evalCase.guideline_paths[0] ? path37.dirname(evalCase.guideline_paths[0]) : process.cwd();
|
|
13376
13927
|
const dispatchContext = {
|
|
13377
13928
|
judgeProvider,
|
|
13378
13929
|
targetResolver,
|
|
@@ -13462,8 +14013,9 @@ async function runEvaluatorList(options) {
|
|
|
13462
14013
|
const minScore = typeof entry.required === "number" ? entry.required : PASS_THRESHOLD;
|
|
13463
14014
|
return entry.score.score < minScore;
|
|
13464
14015
|
});
|
|
13465
|
-
const
|
|
13466
|
-
|
|
14016
|
+
const scorable = scored.filter((entry) => entry.score.verdict !== "skip");
|
|
14017
|
+
const aggregateScore = hasRequiredFailure ? 0 : scorable.length > 0 ? computeWeightedMean(
|
|
14018
|
+
scorable.map((entry) => ({ score: entry.score.score, weight: entry.weight }))
|
|
13467
14019
|
) : 0;
|
|
13468
14020
|
const hits = scored.flatMap((entry) => entry.score.hits);
|
|
13469
14021
|
const misses = scored.flatMap((entry) => entry.score.misses);
|
|
@@ -13603,7 +14155,7 @@ function extractProviderError(response) {
|
|
|
13603
14155
|
return trimmed.length > 0 ? trimmed : void 0;
|
|
13604
14156
|
}
|
|
13605
14157
|
function createCacheKey(provider, target, evalCase, promptInputs) {
|
|
13606
|
-
const hash =
|
|
14158
|
+
const hash = createHash2("sha256");
|
|
13607
14159
|
hash.update(provider.id);
|
|
13608
14160
|
hash.update(target.name);
|
|
13609
14161
|
hash.update(evalCase.id);
|
|
@@ -13671,8 +14223,8 @@ function computeWeightedMean(entries) {
|
|
|
13671
14223
|
}
|
|
13672
14224
|
|
|
13673
14225
|
// src/evaluation/evaluate.ts
|
|
13674
|
-
import { existsSync as
|
|
13675
|
-
import
|
|
14226
|
+
import { existsSync as existsSync3 } from "node:fs";
|
|
14227
|
+
import path38 from "node:path";
|
|
13676
14228
|
async function evaluate(config) {
|
|
13677
14229
|
const startTime = Date.now();
|
|
13678
14230
|
if (config.tests && config.specFile) {
|
|
@@ -13694,13 +14246,13 @@ async function evaluate(config) {
|
|
|
13694
14246
|
let evalCases;
|
|
13695
14247
|
let testFilePath;
|
|
13696
14248
|
if (config.specFile) {
|
|
13697
|
-
testFilePath =
|
|
14249
|
+
testFilePath = path38.resolve(config.specFile);
|
|
13698
14250
|
evalCases = await loadTests(testFilePath, repoRoot, {
|
|
13699
14251
|
verbose: config.verbose,
|
|
13700
14252
|
filter: config.filter
|
|
13701
14253
|
});
|
|
13702
14254
|
} else {
|
|
13703
|
-
testFilePath =
|
|
14255
|
+
testFilePath = path38.join(process.cwd(), "__programmatic__.yaml");
|
|
13704
14256
|
evalCases = (config.tests ?? []).map((test) => {
|
|
13705
14257
|
const input = typeof test.input === "string" ? [{ role: "user", content: test.input }] : test.input;
|
|
13706
14258
|
const question = typeof test.input === "string" ? test.input : test.input.find((m) => m.role === "user")?.content ?? "";
|
|
@@ -13791,11 +14343,11 @@ function computeSummary(results, durationMs) {
|
|
|
13791
14343
|
var TARGET_FILE_CANDIDATES = [".agentv/targets.yaml", ".agentv/targets.yml"];
|
|
13792
14344
|
async function discoverDefaultTarget(repoRoot) {
|
|
13793
14345
|
const cwd = process.cwd();
|
|
13794
|
-
const chain = buildDirectoryChain(
|
|
14346
|
+
const chain = buildDirectoryChain(path38.join(cwd, "_placeholder"), repoRoot);
|
|
13795
14347
|
for (const dir of chain) {
|
|
13796
14348
|
for (const candidate of TARGET_FILE_CANDIDATES) {
|
|
13797
|
-
const targetsPath =
|
|
13798
|
-
if (!
|
|
14349
|
+
const targetsPath = path38.join(dir, candidate);
|
|
14350
|
+
if (!existsSync3(targetsPath)) continue;
|
|
13799
14351
|
try {
|
|
13800
14352
|
const definitions = await readTargetDefinitions(targetsPath);
|
|
13801
14353
|
const defaultTarget = definitions.find((d) => d.name === "default");
|
|
@@ -13809,11 +14361,11 @@ async function discoverDefaultTarget(repoRoot) {
|
|
|
13809
14361
|
async function loadEnvHierarchy(repoRoot) {
|
|
13810
14362
|
const { readFileSync: readFileSync2 } = await import("node:fs");
|
|
13811
14363
|
const cwd = process.cwd();
|
|
13812
|
-
const chain = buildDirectoryChain(
|
|
14364
|
+
const chain = buildDirectoryChain(path38.join(cwd, "_placeholder"), repoRoot);
|
|
13813
14365
|
const envFiles = [];
|
|
13814
14366
|
for (const dir of chain) {
|
|
13815
|
-
const envPath =
|
|
13816
|
-
if (
|
|
14367
|
+
const envPath = path38.join(dir, ".env");
|
|
14368
|
+
if (existsSync3(envPath)) envFiles.push(envPath);
|
|
13817
14369
|
}
|
|
13818
14370
|
for (let i = envFiles.length - 1; i >= 0; i--) {
|
|
13819
14371
|
try {
|
|
@@ -13883,12 +14435,12 @@ var CONFIG_FILE_NAMES = [
|
|
|
13883
14435
|
".agentv/config.js"
|
|
13884
14436
|
];
|
|
13885
14437
|
async function loadTsConfig(projectRoot) {
|
|
13886
|
-
const { existsSync:
|
|
14438
|
+
const { existsSync: existsSync4 } = await import("node:fs");
|
|
13887
14439
|
const { pathToFileURL } = await import("node:url");
|
|
13888
14440
|
const { join: join2 } = await import("node:path");
|
|
13889
14441
|
for (const fileName of CONFIG_FILE_NAMES) {
|
|
13890
14442
|
const filePath = join2(projectRoot, fileName);
|
|
13891
|
-
if (!
|
|
14443
|
+
if (!existsSync4(filePath)) {
|
|
13892
14444
|
continue;
|
|
13893
14445
|
}
|
|
13894
14446
|
try {
|
|
@@ -13985,8 +14537,8 @@ function buildPrompt(criteria, question, referenceAnswer) {
|
|
|
13985
14537
|
}
|
|
13986
14538
|
|
|
13987
14539
|
// src/evaluation/cache/response-cache.ts
|
|
13988
|
-
import { mkdir as
|
|
13989
|
-
import
|
|
14540
|
+
import { mkdir as mkdir13, readFile as readFile11, writeFile as writeFile8 } from "node:fs/promises";
|
|
14541
|
+
import path39 from "node:path";
|
|
13990
14542
|
var DEFAULT_CACHE_PATH = ".agentv/cache";
|
|
13991
14543
|
var ResponseCache = class {
|
|
13992
14544
|
cachePath;
|
|
@@ -14004,13 +14556,13 @@ var ResponseCache = class {
|
|
|
14004
14556
|
}
|
|
14005
14557
|
async set(key, value) {
|
|
14006
14558
|
const filePath = this.keyToPath(key);
|
|
14007
|
-
const dir =
|
|
14008
|
-
await
|
|
14009
|
-
await
|
|
14559
|
+
const dir = path39.dirname(filePath);
|
|
14560
|
+
await mkdir13(dir, { recursive: true });
|
|
14561
|
+
await writeFile8(filePath, JSON.stringify(value, null, 2), "utf8");
|
|
14010
14562
|
}
|
|
14011
14563
|
keyToPath(key) {
|
|
14012
14564
|
const prefix = key.slice(0, 2);
|
|
14013
|
-
return
|
|
14565
|
+
return path39.join(this.cachePath, prefix, `${key}.json`);
|
|
14014
14566
|
}
|
|
14015
14567
|
};
|
|
14016
14568
|
function shouldEnableCache(params) {
|
|
@@ -14483,6 +15035,7 @@ export {
|
|
|
14483
15035
|
OtelTraceExporter,
|
|
14484
15036
|
OtlpJsonFileExporter,
|
|
14485
15037
|
ProviderRegistry,
|
|
15038
|
+
RepoManager,
|
|
14486
15039
|
ResponseCache,
|
|
14487
15040
|
SimpleTraceFileExporter,
|
|
14488
15041
|
TEST_MESSAGE_ROLES,
|
|
@@ -14568,12 +15121,19 @@ export {
|
|
|
14568
15121
|
resolveTargetDefinition,
|
|
14569
15122
|
resolveWorkspaceTemplate,
|
|
14570
15123
|
rubricEvaluationSchema,
|
|
15124
|
+
runContainsAllAssertion,
|
|
15125
|
+
runContainsAnyAssertion,
|
|
14571
15126
|
runContainsAssertion,
|
|
15127
|
+
runEndsWithAssertion,
|
|
14572
15128
|
runEqualsAssertion,
|
|
14573
15129
|
runEvalCase,
|
|
14574
15130
|
runEvaluation,
|
|
15131
|
+
runIcontainsAllAssertion,
|
|
15132
|
+
runIcontainsAnyAssertion,
|
|
15133
|
+
runIcontainsAssertion,
|
|
14575
15134
|
runIsJsonAssertion,
|
|
14576
15135
|
runRegexAssertion,
|
|
15136
|
+
runStartsWithAssertion,
|
|
14577
15137
|
scoreToVerdict,
|
|
14578
15138
|
shouldEnableCache,
|
|
14579
15139
|
shouldSkipCacheForTemperature,
|