@agentv/core 2.10.0 → 2.11.1
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/dist/{chunk-7Q4PH265.js → chunk-REN5PS7B.js} +15 -8
- package/dist/chunk-REN5PS7B.js.map +1 -0
- package/dist/evaluation/validation/index.cjs +106 -8
- package/dist/evaluation/validation/index.cjs.map +1 -1
- package/dist/evaluation/validation/index.js +96 -5
- package/dist/evaluation/validation/index.js.map +1 -1
- package/dist/index.cjs +830 -172
- package/dist/index.cjs.map +1 -1
- package/dist/index.d.cts +189 -11
- package/dist/index.d.ts +189 -11
- package/dist/index.js +795 -152
- package/dist/index.js.map +1 -1
- package/package.json +1 -1
- package/dist/chunk-7Q4PH265.js.map +0 -1
package/dist/index.js
CHANGED
|
@@ -17,7 +17,7 @@ import {
|
|
|
17
17
|
readTextFile,
|
|
18
18
|
resolveFileReference,
|
|
19
19
|
resolveTargetDefinition
|
|
20
|
-
} from "./chunk-7Q4PH265.js";
|
|
20
|
+
} from "./chunk-REN5PS7B.js";
|
|
21
21
|
import {
|
|
22
22
|
OtlpJsonFileExporter
|
|
23
23
|
} from "./chunk-HFSYZHGF.js";
|
|
@@ -414,9 +414,14 @@ async function loadConfig(evalFilePath, repoRoot) {
|
|
|
414
414
|
logWarning(`Invalid eval_patterns in ${configPath}, all entries must be strings`);
|
|
415
415
|
continue;
|
|
416
416
|
}
|
|
417
|
+
const executionDefaults = parseExecutionDefaults(
|
|
418
|
+
parsed.execution,
|
|
419
|
+
configPath
|
|
420
|
+
);
|
|
417
421
|
return {
|
|
418
422
|
guideline_patterns: guidelinePatterns,
|
|
419
|
-
eval_patterns: evalPatterns
|
|
423
|
+
eval_patterns: evalPatterns,
|
|
424
|
+
execution: executionDefaults
|
|
420
425
|
};
|
|
421
426
|
} catch (error) {
|
|
422
427
|
logWarning(
|
|
@@ -557,6 +562,36 @@ function extractTotalBudgetUsd(suite) {
|
|
|
557
562
|
);
|
|
558
563
|
return void 0;
|
|
559
564
|
}
|
|
565
|
+
/**
 * Parses the optional `execution` section of a config file into execution defaults.
 *
 * Recognised keys are `verbose` and `keep_workspaces` (booleans) and
 * `trace_file` and `otel_file` (non-empty strings, stored trimmed). A key that
 * is present but has the wrong type is left out and reported via `logWarning`.
 *
 * @param {unknown} raw - Value of the `execution` key as read from the config.
 * @param {string} configPath - Path of the config file; only used in warning text.
 * @returns {object|undefined} The valid keys, or `undefined` when `raw` is not a
 *   plain (non-array) object or none of its keys were valid.
 */
function parseExecutionDefaults(raw, configPath) {
  if (!raw || typeof raw !== "object" || Array.isArray(raw)) {
    return void 0;
  }
  // Ordered table: keys are validated, warned about and emitted in this order.
  const fields = [
    ["verbose", "boolean"],
    ["trace_file", "string"],
    ["keep_workspaces", "boolean"],
    ["otel_file", "string"]
  ];
  const result = {};
  for (const [key, kind] of fields) {
    const value = raw[key];
    if (kind === "boolean" && typeof value === "boolean") {
      result[key] = value;
    } else if (kind === "string" && typeof value === "string" && value.trim().length > 0) {
      // Whitespace-only strings count as invalid; valid ones are stored trimmed.
      result[key] = value.trim();
    } else if (value !== void 0) {
      const expected = kind === "boolean" ? "boolean" : "non-empty string";
      logWarning(`Invalid execution.${key} in ${configPath}, expected ${expected}`);
    }
  }
  return Object.keys(result).length > 0 ? result : void 0;
}
|
|
560
595
|
function logWarning(message) {
|
|
561
596
|
console.warn(`${ANSI_YELLOW2}Warning: ${message}${ANSI_RESET2}`);
|
|
562
597
|
}
|
|
@@ -1285,18 +1320,96 @@ async function parseEvaluatorList(candidateEvaluators, searchRoots, evalId) {
|
|
|
1285
1320
|
});
|
|
1286
1321
|
continue;
|
|
1287
1322
|
}
|
|
1323
|
+
if (typeValue === "contains_any" || typeValue === "contains_all") {
|
|
1324
|
+
const value = asStringArrayStrict(rawEvaluator.value);
|
|
1325
|
+
if (!value || value.length === 0) {
|
|
1326
|
+
logWarning2(
|
|
1327
|
+
`Skipping ${typeValue} evaluator '${name}' in '${evalId}': value must be a non-empty string array`
|
|
1328
|
+
);
|
|
1329
|
+
continue;
|
|
1330
|
+
}
|
|
1331
|
+
const weight2 = validateWeight(rawEvaluator.weight, name, evalId);
|
|
1332
|
+
const required2 = parseRequired(rawEvaluator.required);
|
|
1333
|
+
evaluators.push({
|
|
1334
|
+
name,
|
|
1335
|
+
type: typeValue,
|
|
1336
|
+
value,
|
|
1337
|
+
...weight2 !== void 0 ? { weight: weight2 } : {},
|
|
1338
|
+
...required2 !== void 0 ? { required: required2 } : {},
|
|
1339
|
+
...negate !== void 0 ? { negate } : {}
|
|
1340
|
+
});
|
|
1341
|
+
continue;
|
|
1342
|
+
}
|
|
1343
|
+
if (typeValue === "icontains") {
|
|
1344
|
+
const value = asString(rawEvaluator.value);
|
|
1345
|
+
if (!value) {
|
|
1346
|
+
logWarning2(`Skipping icontains evaluator '${name}' in '${evalId}': missing value`);
|
|
1347
|
+
continue;
|
|
1348
|
+
}
|
|
1349
|
+
const weight2 = validateWeight(rawEvaluator.weight, name, evalId);
|
|
1350
|
+
const required2 = parseRequired(rawEvaluator.required);
|
|
1351
|
+
evaluators.push({
|
|
1352
|
+
name,
|
|
1353
|
+
type: "icontains",
|
|
1354
|
+
value,
|
|
1355
|
+
...weight2 !== void 0 ? { weight: weight2 } : {},
|
|
1356
|
+
...required2 !== void 0 ? { required: required2 } : {},
|
|
1357
|
+
...negate !== void 0 ? { negate } : {}
|
|
1358
|
+
});
|
|
1359
|
+
continue;
|
|
1360
|
+
}
|
|
1361
|
+
if (typeValue === "icontains_any" || typeValue === "icontains_all") {
|
|
1362
|
+
const value = asStringArrayStrict(rawEvaluator.value);
|
|
1363
|
+
if (!value || value.length === 0) {
|
|
1364
|
+
logWarning2(
|
|
1365
|
+
`Skipping ${typeValue} evaluator '${name}' in '${evalId}': value must be a non-empty string array`
|
|
1366
|
+
);
|
|
1367
|
+
continue;
|
|
1368
|
+
}
|
|
1369
|
+
const weight2 = validateWeight(rawEvaluator.weight, name, evalId);
|
|
1370
|
+
const required2 = parseRequired(rawEvaluator.required);
|
|
1371
|
+
evaluators.push({
|
|
1372
|
+
name,
|
|
1373
|
+
type: typeValue,
|
|
1374
|
+
value,
|
|
1375
|
+
...weight2 !== void 0 ? { weight: weight2 } : {},
|
|
1376
|
+
...required2 !== void 0 ? { required: required2 } : {},
|
|
1377
|
+
...negate !== void 0 ? { negate } : {}
|
|
1378
|
+
});
|
|
1379
|
+
continue;
|
|
1380
|
+
}
|
|
1381
|
+
if (typeValue === "starts_with" || typeValue === "ends_with") {
|
|
1382
|
+
const value = asString(rawEvaluator.value);
|
|
1383
|
+
if (!value) {
|
|
1384
|
+
logWarning2(`Skipping ${typeValue} evaluator '${name}' in '${evalId}': missing value`);
|
|
1385
|
+
continue;
|
|
1386
|
+
}
|
|
1387
|
+
const weight2 = validateWeight(rawEvaluator.weight, name, evalId);
|
|
1388
|
+
const required2 = parseRequired(rawEvaluator.required);
|
|
1389
|
+
evaluators.push({
|
|
1390
|
+
name,
|
|
1391
|
+
type: typeValue,
|
|
1392
|
+
value,
|
|
1393
|
+
...weight2 !== void 0 ? { weight: weight2 } : {},
|
|
1394
|
+
...required2 !== void 0 ? { required: required2 } : {},
|
|
1395
|
+
...negate !== void 0 ? { negate } : {}
|
|
1396
|
+
});
|
|
1397
|
+
continue;
|
|
1398
|
+
}
|
|
1288
1399
|
if (typeValue === "regex") {
|
|
1289
1400
|
const value = asString(rawEvaluator.value);
|
|
1290
1401
|
if (!value) {
|
|
1291
1402
|
logWarning2(`Skipping regex evaluator '${name}' in '${evalId}': missing value`);
|
|
1292
1403
|
continue;
|
|
1293
1404
|
}
|
|
1405
|
+
const flags = asString(rawEvaluator.flags);
|
|
1294
1406
|
const weight2 = validateWeight(rawEvaluator.weight, name, evalId);
|
|
1295
1407
|
const required2 = parseRequired(rawEvaluator.required);
|
|
1296
1408
|
evaluators.push({
|
|
1297
1409
|
name,
|
|
1298
1410
|
type: "regex",
|
|
1299
1411
|
value,
|
|
1412
|
+
...flags !== void 0 ? { flags } : {},
|
|
1300
1413
|
...weight2 !== void 0 ? { weight: weight2 } : {},
|
|
1301
1414
|
...required2 !== void 0 ? { required: required2 } : {},
|
|
1302
1415
|
...negate !== void 0 ? { negate } : {}
|
|
@@ -1469,15 +1582,43 @@ async function parseEvaluatorList(candidateEvaluators, searchRoots, evalId) {
|
|
|
1469
1582
|
}
|
|
1470
1583
|
return evaluators.length > 0 ? evaluators : void 0;
|
|
1471
1584
|
}
|
|
1472
|
-
var ASSERTION_TYPES = /* @__PURE__ */ new Set([
|
|
1585
|
+
var ASSERTION_TYPES = /* @__PURE__ */ new Set([
|
|
1586
|
+
"contains",
|
|
1587
|
+
"contains_any",
|
|
1588
|
+
"contains_all",
|
|
1589
|
+
"icontains",
|
|
1590
|
+
"icontains_any",
|
|
1591
|
+
"icontains_all",
|
|
1592
|
+
"starts_with",
|
|
1593
|
+
"ends_with",
|
|
1594
|
+
"regex",
|
|
1595
|
+
"is_json",
|
|
1596
|
+
"equals",
|
|
1597
|
+
"rubrics"
|
|
1598
|
+
]);
|
|
1473
1599
|
function generateAssertionName(typeValue, rawEvaluator) {
|
|
1474
1600
|
if (!ASSERTION_TYPES.has(typeValue)) {
|
|
1475
1601
|
return void 0;
|
|
1476
1602
|
}
|
|
1477
1603
|
const value = asString(rawEvaluator.value);
|
|
1604
|
+
const arrayValue = Array.isArray(rawEvaluator.value) ? rawEvaluator.value : void 0;
|
|
1478
1605
|
switch (typeValue) {
|
|
1479
1606
|
case "contains":
|
|
1480
1607
|
return value ? `contains-${value}` : "contains";
|
|
1608
|
+
case "contains_any":
|
|
1609
|
+
return arrayValue ? `contains_any-${arrayValue.length}` : "contains_any";
|
|
1610
|
+
case "contains_all":
|
|
1611
|
+
return arrayValue ? `contains_all-${arrayValue.length}` : "contains_all";
|
|
1612
|
+
case "icontains":
|
|
1613
|
+
return value ? `icontains-${value}` : "icontains";
|
|
1614
|
+
case "icontains_any":
|
|
1615
|
+
return arrayValue ? `icontains_any-${arrayValue.length}` : "icontains_any";
|
|
1616
|
+
case "icontains_all":
|
|
1617
|
+
return arrayValue ? `icontains_all-${arrayValue.length}` : "icontains_all";
|
|
1618
|
+
case "starts_with":
|
|
1619
|
+
return value ? `starts_with-${value}` : "starts_with";
|
|
1620
|
+
case "ends_with":
|
|
1621
|
+
return value ? `ends_with-${value}` : "ends_with";
|
|
1481
1622
|
case "regex":
|
|
1482
1623
|
return value ? `regex-${value.length > 30 ? value.slice(0, 30) : value}` : "regex";
|
|
1483
1624
|
case "is_json":
|
|
@@ -1503,6 +1644,13 @@ function coerceEvaluator(candidate, contextId) {
|
|
|
1503
1644
|
function asString(value) {
|
|
1504
1645
|
return typeof value === "string" ? value : void 0;
|
|
1505
1646
|
}
|
|
1647
|
+
/**
 * Returns a copy of `value` when it is a non-empty array made up only of strings.
 *
 * Arrays that contain any non-string entry (or holes) are rejected as a whole
 * instead of being silently trimmed down to their string entries, so a caller
 * never ends up checking fewer values than the user wrote.
 *
 * @param {unknown} value - Candidate value.
 * @returns {string[]|undefined} A new array of the strings, or `undefined` when
 *   `value` is not an array, is empty, or has a non-string entry.
 */
function asStringArrayStrict(value) {
  if (!Array.isArray(value) || value.length === 0) {
    return void 0;
  }
  const strings = value.filter((entry) => typeof entry === "string");
  // filter() skips holes and non-strings, so a length mismatch means the input was not purely strings.
  return strings.length === value.length ? strings : void 0;
}
|
|
1506
1654
|
function asStringArray(value, description) {
|
|
1507
1655
|
if (value === void 0) {
|
|
1508
1656
|
return void 0;
|
|
@@ -2820,6 +2968,69 @@ function parseWorkspaceScriptConfig(raw, evalFileDir) {
|
|
|
2820
2968
|
}
|
|
2821
2969
|
return cwd ? { ...config, cwd } : config;
|
|
2822
2970
|
}
|
|
2971
|
+
/**
 * Parses the `source` entry of a repo config.
 *
 * Two shapes are accepted: `{ type: "git", url }` and `{ type: "local", path }`.
 * The locator must be a string with at least one non-whitespace character; it
 * is returned unmodified. Any other keys on `raw` are dropped.
 *
 * @param {unknown} raw - Candidate source object.
 * @returns {{type: "git", url: string}|{type: "local", path: string}|undefined}
 *   The normalised source, or `undefined` when `raw` does not match either shape.
 */
function parseRepoSource(raw) {
  if (!isJsonObject(raw)) return void 0;
  const isNonBlank = (value) => typeof value === "string" && value.trim().length > 0;
  if (raw.type === "git" && isNonBlank(raw.url)) {
    return { type: "git", url: raw.url };
  }
  if (raw.type === "local" && isNonBlank(raw.path)) {
    return { type: "local", path: raw.path };
  }
  return void 0;
}
|
|
2982
|
+
/**
 * Parses the `checkout` entry of a repo config.
 *
 * Keeps only the keys that validate:
 * - `ref`: non-empty string.
 * - `resolve`: either `"remote"` or `"local"`.
 * - `ancestor`: non-negative integer (NaN, negative and fractional values are
 *   dropped; presumably a count of commits to step back from `ref` — confirm
 *   against the code that consumes it).
 *
 * @param {unknown} raw - Candidate checkout object.
 * @returns {{ref?: string, resolve?: "remote"|"local", ancestor?: number}|undefined}
 *   The valid keys, or `undefined` when `raw` is not an object or no key validated.
 */
function parseRepoCheckout(raw) {
  if (!isJsonObject(raw)) return void 0;
  // An empty ref is treated as absent everywhere, not only in the "nothing set" guard.
  const ref = typeof raw.ref === "string" && raw.ref.length > 0 ? raw.ref : void 0;
  const resolve = raw.resolve === "remote" || raw.resolve === "local" ? raw.resolve : void 0;
  const ancestor = Number.isInteger(raw.ancestor) && raw.ancestor >= 0 ? raw.ancestor : void 0;
  if (ref === void 0 && resolve === void 0 && ancestor === void 0) return void 0;
  return {
    ...(ref !== void 0 && { ref }),
    ...(resolve !== void 0 && { resolve }),
    ...(ancestor !== void 0 && { ancestor })
  };
}
|
|
2995
|
+
/**
 * Parses the `clone` entry of a repo config.
 *
 * Keeps only the keys that validate:
 * - `depth`: positive integer (NaN, zero, negative and fractional values are dropped).
 * - `filter`: non-empty string.
 * - `sparse`: array reduced to its string entries; dropped when none remain.
 *
 * NOTE(review): the names suggest these map to `git clone --depth/--filter` and
 * sparse-checkout paths — confirm against the code that consumes this object.
 *
 * @param {unknown} raw - Candidate clone-options object.
 * @returns {{depth?: number, filter?: string, sparse?: string[]}|undefined}
 *   The valid keys, or `undefined` when `raw` is not an object or no key validated.
 */
function parseRepoClone(raw) {
  if (!isJsonObject(raw)) return void 0;
  const depth = Number.isInteger(raw.depth) && raw.depth > 0 ? raw.depth : void 0;
  // An empty filter is treated as absent everywhere, not only in the "nothing set" guard.
  const filter = typeof raw.filter === "string" && raw.filter.length > 0 ? raw.filter : void 0;
  const sparseEntries = Array.isArray(raw.sparse)
    ? raw.sparse.filter((entry) => typeof entry === "string")
    : [];
  // An empty list carries no information, so it is not emitted as a setting.
  const sparse = sparseEntries.length > 0 ? sparseEntries : void 0;
  if (depth === void 0 && filter === void 0 && sparse === void 0) return void 0;
  return {
    ...(depth !== void 0 && { depth }),
    ...(filter !== void 0 && { filter }),
    ...(sparse !== void 0 && { sparse })
  };
}
|
|
3008
|
+
/**
 * Parses one repo entry of a workspace config.
 *
 * `path` (non-empty string) and a `source` accepted by `parseRepoSource` are
 * both mandatory. `checkout` and `clone` are included only when their parsers
 * return a value.
 *
 * @param {unknown} raw - Candidate repo entry.
 * @returns {object|undefined} `{ path, source, checkout?, clone? }`, or
 *   `undefined` when `raw` is not an object or a mandatory part is missing.
 */
function parseRepoConfig(raw) {
  if (!isJsonObject(raw)) return void 0;
  let repoPath;
  if (typeof raw.path === "string") {
    repoPath = raw.path;
  }
  const source = parseRepoSource(raw.source);
  const hasMandatoryParts = Boolean(repoPath) && Boolean(source);
  if (!hasMandatoryParts) return void 0;
  const checkout = parseRepoCheckout(raw.checkout);
  const clone = parseRepoClone(raw.clone);
  const config = { path: repoPath, source };
  if (checkout !== void 0) {
    config.checkout = checkout;
  }
  if (clone !== void 0) {
    config.clone = clone;
  }
  return config;
}
|
|
3023
|
+
/**
 * Parses the `reset` entry of a workspace config.
 *
 * `strategy` is kept when it is one of `"none"`, `"hard"` or `"recreate"`;
 * `after_each` is kept when it is a boolean.
 *
 * @param {unknown} raw - Candidate reset object.
 * @returns {{strategy?: string, after_each?: boolean}|undefined} The valid
 *   keys, or `undefined` when `raw` is not an object or neither key validated.
 */
function parseResetConfig(raw) {
  if (!isJsonObject(raw)) return void 0;
  const knownStrategies = ["none", "hard", "recreate"];
  const config = {};
  if (knownStrategies.includes(raw.strategy)) {
    config.strategy = raw.strategy;
  }
  if (typeof raw.after_each === "boolean") {
    config.after_each = raw.after_each;
  }
  const nothingParsed = config.strategy === void 0 && config.after_each === void 0;
  return nothingParsed ? void 0 : config;
}
|
|
2823
3034
|
function parseWorkspaceConfig(raw, evalFileDir) {
|
|
2824
3035
|
if (!isJsonObject(raw)) return void 0;
|
|
2825
3036
|
const obj = raw;
|
|
@@ -2827,13 +3038,20 @@ function parseWorkspaceConfig(raw, evalFileDir) {
|
|
|
2827
3038
|
if (template && !path8.isAbsolute(template)) {
|
|
2828
3039
|
template = path8.resolve(evalFileDir, template);
|
|
2829
3040
|
}
|
|
3041
|
+
const isolation = obj.isolation === "shared" || obj.isolation === "per_test" ? obj.isolation : void 0;
|
|
3042
|
+
const repos = Array.isArray(obj.repos) ? obj.repos.map(parseRepoConfig).filter(Boolean) : void 0;
|
|
3043
|
+
const reset = parseResetConfig(obj.reset);
|
|
2830
3044
|
const beforeAll = parseWorkspaceScriptConfig(obj.before_all, evalFileDir);
|
|
2831
3045
|
const afterAll = parseWorkspaceScriptConfig(obj.after_all, evalFileDir);
|
|
2832
3046
|
const beforeEach = parseWorkspaceScriptConfig(obj.before_each, evalFileDir);
|
|
2833
3047
|
const afterEach = parseWorkspaceScriptConfig(obj.after_each, evalFileDir);
|
|
2834
|
-
if (!template && !beforeAll && !afterAll && !beforeEach && !afterEach)
|
|
3048
|
+
if (!template && !isolation && !repos && !reset && !beforeAll && !afterAll && !beforeEach && !afterEach)
|
|
3049
|
+
return void 0;
|
|
2835
3050
|
return {
|
|
2836
3051
|
...template !== void 0 && { template },
|
|
3052
|
+
...isolation !== void 0 && { isolation },
|
|
3053
|
+
...repos !== void 0 && { repos },
|
|
3054
|
+
...reset !== void 0 && { reset },
|
|
2837
3055
|
...beforeAll !== void 0 && { before_all: beforeAll },
|
|
2838
3056
|
...afterAll !== void 0 && { after_all: afterAll },
|
|
2839
3057
|
...beforeEach !== void 0 && { before_each: beforeEach },
|
|
@@ -2846,6 +3064,9 @@ function mergeWorkspaceConfigs(suiteLevel, caseLevel) {
|
|
|
2846
3064
|
if (!caseLevel) return suiteLevel;
|
|
2847
3065
|
return {
|
|
2848
3066
|
template: caseLevel.template ?? suiteLevel.template,
|
|
3067
|
+
isolation: caseLevel.isolation ?? suiteLevel.isolation,
|
|
3068
|
+
repos: caseLevel.repos ?? suiteLevel.repos,
|
|
3069
|
+
reset: caseLevel.reset ?? suiteLevel.reset,
|
|
2849
3070
|
before_all: caseLevel.before_all ?? suiteLevel.before_all,
|
|
2850
3071
|
after_all: caseLevel.after_all ?? suiteLevel.after_all,
|
|
2851
3072
|
before_each: caseLevel.before_each ?? suiteLevel.before_each,
|
|
@@ -3385,11 +3606,6 @@ Original error: ${error instanceof Error ? error.message : String(error)}`
|
|
|
3385
3606
|
}
|
|
3386
3607
|
return claudeSdkModule;
|
|
3387
3608
|
}
|
|
3388
|
-
var DEFAULT_SYSTEM_PROMPT2 = `**IMPORTANT**: Follow these instructions for your response:
|
|
3389
|
-
- Do NOT create any additional output files in the workspace.
|
|
3390
|
-
- All intended file outputs/changes MUST be written in your response.
|
|
3391
|
-
- For each intended file, include the relative path and unified git diff following the convention \`diff --git ...\`.
|
|
3392
|
-
This is required for evaluation scoring.`;
|
|
3393
3609
|
var ClaudeProvider = class {
|
|
3394
3610
|
id;
|
|
3395
3611
|
kind = "claude";
|
|
@@ -3411,7 +3627,7 @@ var ClaudeProvider = class {
|
|
|
3411
3627
|
const logger = await this.createStreamLogger(request).catch(() => void 0);
|
|
3412
3628
|
const inputFiles = normalizeInputFiles(request.inputFiles);
|
|
3413
3629
|
const prompt = buildPromptDocument(request, inputFiles);
|
|
3414
|
-
const systemPrompt = this.config.systemPrompt
|
|
3630
|
+
const systemPrompt = this.config.systemPrompt;
|
|
3415
3631
|
const queryOptions = {
|
|
3416
3632
|
permissionMode: "bypassPermissions",
|
|
3417
3633
|
allowDangerouslySkipPermissions: true,
|
|
@@ -4392,11 +4608,6 @@ Original error: ${error instanceof Error ? error.message : String(error)}`
|
|
|
4392
4608
|
}
|
|
4393
4609
|
return codexSdkModule;
|
|
4394
4610
|
}
|
|
4395
|
-
var DEFAULT_SYSTEM_PROMPT3 = `**IMPORTANT**: Follow these instructions for your response:
|
|
4396
|
-
- Do NOT create any additional output files in the workspace.
|
|
4397
|
-
- All intended file outputs/changes MUST be written in your response.
|
|
4398
|
-
- For each intended file, include the relative path and unified git diff following the convention \`diff --git ...\`.
|
|
4399
|
-
This is required for evaluation scoring.`;
|
|
4400
4611
|
var CodexProvider = class {
|
|
4401
4612
|
id;
|
|
4402
4613
|
kind = "codex";
|
|
@@ -4431,7 +4642,7 @@ var CodexProvider = class {
|
|
|
4431
4642
|
const thread = codex.startThread(threadOptions);
|
|
4432
4643
|
const inputFiles = normalizeInputFiles(request.inputFiles);
|
|
4433
4644
|
const basePrompt = buildPromptDocument(request, inputFiles);
|
|
4434
|
-
const systemPrompt = this.config.systemPrompt
|
|
4645
|
+
const systemPrompt = this.config.systemPrompt;
|
|
4435
4646
|
const prompt = systemPrompt ? `${systemPrompt}
|
|
4436
4647
|
|
|
4437
4648
|
${basePrompt}` : basePrompt;
|
|
@@ -4797,7 +5008,7 @@ import { arch, platform } from "node:os";
|
|
|
4797
5008
|
import path13 from "node:path";
|
|
4798
5009
|
import { fileURLToPath as fileURLToPath2 } from "node:url";
|
|
4799
5010
|
function resolvePlatformCliPath() {
|
|
4800
|
-
const
|
|
5011
|
+
const os5 = platform();
|
|
4801
5012
|
const cpu = arch();
|
|
4802
5013
|
const platformMap = {
|
|
4803
5014
|
linux: "linux",
|
|
@@ -4808,13 +5019,13 @@ function resolvePlatformCliPath() {
|
|
|
4808
5019
|
x64: "x64",
|
|
4809
5020
|
arm64: "arm64"
|
|
4810
5021
|
};
|
|
4811
|
-
const osPart = platformMap[
|
|
5022
|
+
const osPart = platformMap[os5];
|
|
4812
5023
|
const archPart = archMap[cpu];
|
|
4813
5024
|
if (!osPart || !archPart) {
|
|
4814
5025
|
return void 0;
|
|
4815
5026
|
}
|
|
4816
5027
|
const packageName = `@github/copilot-${osPart}-${archPart}`;
|
|
4817
|
-
const binaryName =
|
|
5028
|
+
const binaryName = os5 === "win32" ? "copilot.exe" : "copilot";
|
|
4818
5029
|
try {
|
|
4819
5030
|
const resolved = import.meta.resolve(`${packageName}/package.json`);
|
|
4820
5031
|
const packageJsonPath = resolved.startsWith("file:") ? fileURLToPath2(resolved) : resolved;
|
|
@@ -4956,11 +5167,6 @@ function isLogStreamingDisabled(envKey) {
|
|
|
4956
5167
|
}
|
|
4957
5168
|
|
|
4958
5169
|
// src/evaluation/providers/copilot-cli.ts
|
|
4959
|
-
var DEFAULT_SYSTEM_PROMPT4 = `**IMPORTANT**: Follow these instructions for your response:
|
|
4960
|
-
- Do NOT create any additional output files in the workspace.
|
|
4961
|
-
- All intended file outputs/changes MUST be written in your response.
|
|
4962
|
-
- For each intended file, include the relative path and unified git diff following the convention \`diff --git ...\`.
|
|
4963
|
-
This is required for evaluation scoring.`;
|
|
4964
5170
|
var CopilotCliProvider = class {
|
|
4965
5171
|
id;
|
|
4966
5172
|
kind = "copilot-cli";
|
|
@@ -5122,6 +5328,16 @@ var CopilotCliProvider = class {
|
|
|
5122
5328
|
}
|
|
5123
5329
|
const endTime = (/* @__PURE__ */ new Date()).toISOString();
|
|
5124
5330
|
const durationMs = Date.now() - startMs;
|
|
5331
|
+
const rejectedCalls = completedToolCalls.filter((tc) => {
|
|
5332
|
+
const out = tc.output;
|
|
5333
|
+
return out && (out.code === "rejected" || out.code === "denied");
|
|
5334
|
+
});
|
|
5335
|
+
if (rejectedCalls.length > 0) {
|
|
5336
|
+
const tools = rejectedCalls.map((tc) => tc.tool).join(", ");
|
|
5337
|
+
throw new Error(
|
|
5338
|
+
`Copilot rejected ${rejectedCalls.length} tool call(s): ${tools}. Add args: ["--yolo"] to your target config or re-run with --yolo to bypass permission checks.`
|
|
5339
|
+
);
|
|
5340
|
+
}
|
|
5125
5341
|
const outputMessages = [];
|
|
5126
5342
|
if (completedToolCalls.length > 0) {
|
|
5127
5343
|
outputMessages.push({
|
|
@@ -5154,7 +5370,7 @@ var CopilotCliProvider = class {
|
|
|
5154
5370
|
}
|
|
5155
5371
|
}
|
|
5156
5372
|
buildCliArgs() {
|
|
5157
|
-
const args = ["--acp", "--stdio", "--allow-all-tools"];
|
|
5373
|
+
const args = ["--acp", "--stdio", "--allow-all-tools", "--yolo"];
|
|
5158
5374
|
if (this.config.model) {
|
|
5159
5375
|
args.push("--model", this.config.model);
|
|
5160
5376
|
}
|
|
@@ -5163,8 +5379,8 @@ var CopilotCliProvider = class {
|
|
|
5163
5379
|
}
|
|
5164
5380
|
return args;
|
|
5165
5381
|
}
|
|
5166
|
-
resolveSystemPrompt(
|
|
5167
|
-
return this.config.systemPrompt
|
|
5382
|
+
resolveSystemPrompt(_request) {
|
|
5383
|
+
return this.config.systemPrompt;
|
|
5168
5384
|
}
|
|
5169
5385
|
async raceWithTimeout(sendPromise, agentProcess) {
|
|
5170
5386
|
const timeoutMs = this.config.timeoutMs;
|
|
@@ -5352,21 +5568,16 @@ Original error: ${error instanceof Error ? error.message : String(error)}`
|
|
|
5352
5568
|
}
|
|
5353
5569
|
return copilotSdkModule;
|
|
5354
5570
|
}
|
|
5355
|
-
var DEFAULT_SYSTEM_PROMPT5 = `**IMPORTANT**: Follow these instructions for your response:
|
|
5356
|
-
- Do NOT create any additional output files in the workspace.
|
|
5357
|
-
- All intended file outputs/changes MUST be written in your response.
|
|
5358
|
-
- For each intended file, include the relative path and unified git diff following the convention \`diff --git ...\`.
|
|
5359
|
-
This is required for evaluation scoring.`;
|
|
5360
5571
|
var CopilotSdkProvider = class {
|
|
5361
5572
|
id;
|
|
5362
|
-
kind = "copilot";
|
|
5573
|
+
kind = "copilot-sdk";
|
|
5363
5574
|
targetName;
|
|
5364
5575
|
supportsBatch = false;
|
|
5365
5576
|
config;
|
|
5366
5577
|
// biome-ignore lint/suspicious/noExplicitAny: SDK client type is dynamically loaded
|
|
5367
5578
|
client = null;
|
|
5368
5579
|
constructor(targetName, config) {
|
|
5369
|
-
this.id = `copilot:${targetName}`;
|
|
5580
|
+
this.id = `copilot-sdk:${targetName}`;
|
|
5370
5581
|
this.targetName = targetName;
|
|
5371
5582
|
this.config = config;
|
|
5372
5583
|
}
|
|
@@ -5389,7 +5600,7 @@ var CopilotSdkProvider = class {
|
|
|
5389
5600
|
if (cwd) {
|
|
5390
5601
|
sessionOptions.workingDirectory = cwd;
|
|
5391
5602
|
}
|
|
5392
|
-
const systemPrompt = this.config.systemPrompt
|
|
5603
|
+
const systemPrompt = this.config.systemPrompt;
|
|
5393
5604
|
if (systemPrompt) {
|
|
5394
5605
|
sessionOptions.systemMessage = {
|
|
5395
5606
|
mode: "append",
|
|
@@ -5905,11 +6116,6 @@ function subscribeToPiLogEntries(listener) {
|
|
|
5905
6116
|
// src/evaluation/providers/pi-coding-agent.ts
|
|
5906
6117
|
var WORKSPACE_PREFIX = "agentv-pi-";
|
|
5907
6118
|
var PROMPT_FILENAME = "prompt.md";
|
|
5908
|
-
var DEFAULT_SYSTEM_PROMPT6 = `**IMPORTANT**: Follow these instructions for your response:
|
|
5909
|
-
- Do NOT create any additional output files in the workspace.
|
|
5910
|
-
- All intended file outputs/changes MUST be written in your response.
|
|
5911
|
-
- For each intended file, include the relative path and unified git diff following the convention \`diff --git ...\`.
|
|
5912
|
-
This is required for evaluation scoring.`;
|
|
5913
6119
|
var PiCodingAgentProvider = class {
|
|
5914
6120
|
id;
|
|
5915
6121
|
kind = "pi-coding-agent";
|
|
@@ -5986,7 +6192,7 @@ var PiCodingAgentProvider = class {
|
|
|
5986
6192
|
}
|
|
5987
6193
|
return path16.resolve(this.config.cwd);
|
|
5988
6194
|
}
|
|
5989
|
-
buildPiArgs(prompt, inputFiles,
|
|
6195
|
+
buildPiArgs(prompt, inputFiles, _captureFileChanges) {
|
|
5990
6196
|
const args = [];
|
|
5991
6197
|
if (this.config.provider) {
|
|
5992
6198
|
args.push("--provider", this.config.provider);
|
|
@@ -6014,7 +6220,7 @@ var PiCodingAgentProvider = class {
|
|
|
6014
6220
|
args.push(`@${file}`);
|
|
6015
6221
|
}
|
|
6016
6222
|
}
|
|
6017
|
-
const systemPrompt = this.config.systemPrompt
|
|
6223
|
+
const systemPrompt = this.config.systemPrompt;
|
|
6018
6224
|
const fullPrompt = systemPrompt ? `${systemPrompt}
|
|
6019
6225
|
|
|
6020
6226
|
${prompt}` : prompt;
|
|
@@ -7708,7 +7914,6 @@ var AGENTV_REQUEST_TEMPLATE = `[[ ## task ## ]]
|
|
|
7708
7914
|
|
|
7709
7915
|
**IMPORTANT**: Follow these exact steps:
|
|
7710
7916
|
1. Create and write your complete response to: {{responseFileTmp}}
|
|
7711
|
-
- Do NOT create any additional output files in the workspace.
|
|
7712
7917
|
- All intended file outputs/changes MUST be written in your response file.
|
|
7713
7918
|
- For each intended file, include the repo name, relative path and unified git diff following the convention \`diff --git ...\`.
|
|
7714
7919
|
2. When completely finished, run these PowerShell commands to signal completion:
|
|
@@ -7727,7 +7932,6 @@ var AGENTV_BATCH_REQUEST_TEMPLATE = `[[ ## task ## ]]
|
|
|
7727
7932
|
|
|
7728
7933
|
**IMPORTANT**: Follow these exact steps:
|
|
7729
7934
|
1. Create and write your complete response to: {{responseFileTmp}}
|
|
7730
|
-
- Do NOT create any additional output files in the workspace.
|
|
7731
7935
|
- All intended file outputs/changes MUST be written in your response file.
|
|
7732
7936
|
- For each intended file, include the repo name, relative path and unified git diff following the convention \`diff --git ...\`.
|
|
7733
7937
|
2. When completely finished and the response is stable, rename it to: {{responseFileFinal}}
|
|
@@ -8153,7 +8357,7 @@ async function discoverProviders(registry, baseDir) {
|
|
|
8153
8357
|
// src/evaluation/providers/index.ts
|
|
8154
8358
|
function createBuiltinProviderRegistry() {
|
|
8155
8359
|
const registry = new ProviderRegistry();
|
|
8156
|
-
registry.register("azure", (t) => new AzureProvider(t.name, t.config)).register("anthropic", (t) => new AnthropicProvider(t.name, t.config)).register("gemini", (t) => new GeminiProvider(t.name, t.config)).register("cli", (t) => new CliProvider(t.name, t.config)).register("codex", (t) => new CodexProvider(t.name, t.config)).register("copilot", (t) => new CopilotSdkProvider(t.name, t.config)).register("copilot-cli", (t) => new CopilotCliProvider(t.name, t.config)).register("pi-coding-agent", (t) => new PiCodingAgentProvider(t.name, t.config)).register("pi-agent-sdk", (t) => new PiAgentSdkProvider(t.name, t.config)).register("claude", (t) => new ClaudeProvider(t.name, t.config)).register("mock", (t) => new MockProvider(t.name, t.config)).register("vscode", (t) => new VSCodeProvider(t.name, t.config, "vscode")).register(
|
|
8360
|
+
registry.register("azure", (t) => new AzureProvider(t.name, t.config)).register("anthropic", (t) => new AnthropicProvider(t.name, t.config)).register("gemini", (t) => new GeminiProvider(t.name, t.config)).register("cli", (t) => new CliProvider(t.name, t.config)).register("codex", (t) => new CodexProvider(t.name, t.config)).register("copilot-sdk", (t) => new CopilotSdkProvider(t.name, t.config)).register("copilot-cli", (t) => new CopilotCliProvider(t.name, t.config)).register("pi-coding-agent", (t) => new PiCodingAgentProvider(t.name, t.config)).register("pi-agent-sdk", (t) => new PiAgentSdkProvider(t.name, t.config)).register("claude", (t) => new ClaudeProvider(t.name, t.config)).register("mock", (t) => new MockProvider(t.name, t.config)).register("vscode", (t) => new VSCodeProvider(t.name, t.config, "vscode")).register(
|
|
8157
8361
|
"vscode-insiders",
|
|
8158
8362
|
(t) => new VSCodeProvider(t.name, t.config, "vscode-insiders")
|
|
8159
8363
|
);
|
|
@@ -8342,16 +8546,16 @@ async function execFileWithStdinNode(argv, stdinPayload, options) {
|
|
|
8342
8546
|
});
|
|
8343
8547
|
}
|
|
8344
8548
|
async function execShellWithStdin(command, stdinPayload, options = {}) {
|
|
8345
|
-
const { mkdir:
|
|
8549
|
+
const { mkdir: mkdir14, readFile: readFile12, rm: rm6, writeFile: writeFile9 } = await import("node:fs/promises");
|
|
8346
8550
|
const { tmpdir: tmpdir3 } = await import("node:os");
|
|
8347
|
-
const
|
|
8551
|
+
const path40 = await import("node:path");
|
|
8348
8552
|
const { randomUUID: randomUUID8 } = await import("node:crypto");
|
|
8349
|
-
const dir =
|
|
8350
|
-
await
|
|
8351
|
-
const stdinPath =
|
|
8352
|
-
const stdoutPath =
|
|
8353
|
-
const stderrPath =
|
|
8354
|
-
await
|
|
8553
|
+
const dir = path40.join(tmpdir3(), `agentv-exec-${randomUUID8()}`);
|
|
8554
|
+
await mkdir14(dir, { recursive: true });
|
|
8555
|
+
const stdinPath = path40.join(dir, "stdin.txt");
|
|
8556
|
+
const stdoutPath = path40.join(dir, "stdout.txt");
|
|
8557
|
+
const stderrPath = path40.join(dir, "stderr.txt");
|
|
8558
|
+
await writeFile9(stdinPath, stdinPayload, "utf8");
|
|
8355
8559
|
const wrappedCommand = process.platform === "win32" ? `(${command}) < ${shellEscapePath(stdinPath)} > ${shellEscapePath(stdoutPath)} 2> ${shellEscapePath(stderrPath)}` : `(${command}) < ${shellEscapePath(stdinPath)} > ${shellEscapePath(stdoutPath)} 2> ${shellEscapePath(stderrPath)}`;
|
|
8356
8560
|
const { spawn: spawn4 } = await import("node:child_process");
|
|
8357
8561
|
try {
|
|
@@ -8384,7 +8588,7 @@ async function execShellWithStdin(command, stdinPayload, options = {}) {
|
|
|
8384
8588
|
const stderr = (await readFile12(stderrPath, "utf8")).replace(/\r\n/g, "\n");
|
|
8385
8589
|
return { stdout, stderr, exitCode };
|
|
8386
8590
|
} finally {
|
|
8387
|
-
await
|
|
8591
|
+
await rm6(dir, { recursive: true, force: true });
|
|
8388
8592
|
}
|
|
8389
8593
|
}
|
|
8390
8594
|
|
|
@@ -8702,7 +8906,7 @@ var CodeEvaluator = class {
|
|
|
8702
8906
|
outputPath,
|
|
8703
8907
|
guidelineFiles: context.evalCase.guideline_paths,
|
|
8704
8908
|
inputFiles: context.evalCase.file_paths.filter(
|
|
8705
|
-
(
|
|
8909
|
+
(path40) => !context.evalCase.guideline_paths.includes(path40)
|
|
8706
8910
|
),
|
|
8707
8911
|
input: context.evalCase.input,
|
|
8708
8912
|
trace: context.trace ?? null,
|
|
@@ -8950,13 +9154,15 @@ ${context.fileChanges}`;
|
|
|
8950
9154
|
evaluatorRawRequest,
|
|
8951
9155
|
tokenUsage
|
|
8952
9156
|
};
|
|
8953
|
-
} catch {
|
|
9157
|
+
} catch (e) {
|
|
9158
|
+
const message = e instanceof Error ? e.message : String(e);
|
|
8954
9159
|
return {
|
|
8955
9160
|
score: 0,
|
|
8956
|
-
verdict: "
|
|
9161
|
+
verdict: "skip",
|
|
8957
9162
|
hits: [],
|
|
8958
|
-
misses: [],
|
|
9163
|
+
misses: [`Judge parse failure after 3 attempts: ${message}`],
|
|
8959
9164
|
expectedAspectCount: 1,
|
|
9165
|
+
reasoning: `Judge parse failure after 3 attempts: ${message}`,
|
|
8960
9166
|
evaluatorRawRequest
|
|
8961
9167
|
};
|
|
8962
9168
|
}
|
|
@@ -9898,115 +10104,115 @@ var FieldAccuracyEvaluator = class {
|
|
|
9898
10104
|
* Evaluate a single field against the expected value.
|
|
9899
10105
|
*/
|
|
9900
10106
|
evaluateField(fieldConfig, candidateData, expectedData) {
|
|
9901
|
-
const { path:
|
|
9902
|
-
const candidateValue = resolvePath(candidateData,
|
|
9903
|
-
const expectedValue = resolvePath(expectedData,
|
|
10107
|
+
const { path: path40, match, required = true, weight = 1 } = fieldConfig;
|
|
10108
|
+
const candidateValue = resolvePath(candidateData, path40);
|
|
10109
|
+
const expectedValue = resolvePath(expectedData, path40);
|
|
9904
10110
|
if (expectedValue === void 0) {
|
|
9905
10111
|
return {
|
|
9906
|
-
path:
|
|
10112
|
+
path: path40,
|
|
9907
10113
|
score: 1,
|
|
9908
10114
|
// No expected value means no comparison needed
|
|
9909
10115
|
weight,
|
|
9910
10116
|
hit: true,
|
|
9911
|
-
message: `${
|
|
10117
|
+
message: `${path40}: no expected value`
|
|
9912
10118
|
};
|
|
9913
10119
|
}
|
|
9914
10120
|
if (candidateValue === void 0) {
|
|
9915
10121
|
if (required) {
|
|
9916
10122
|
return {
|
|
9917
|
-
path:
|
|
10123
|
+
path: path40,
|
|
9918
10124
|
score: 0,
|
|
9919
10125
|
weight,
|
|
9920
10126
|
hit: false,
|
|
9921
|
-
message: `${
|
|
10127
|
+
message: `${path40} (required, missing)`
|
|
9922
10128
|
};
|
|
9923
10129
|
}
|
|
9924
10130
|
return {
|
|
9925
|
-
path:
|
|
10131
|
+
path: path40,
|
|
9926
10132
|
score: 1,
|
|
9927
10133
|
// Don't penalize missing optional fields
|
|
9928
10134
|
weight: 0,
|
|
9929
10135
|
// Zero weight means it won't affect the score
|
|
9930
10136
|
hit: true,
|
|
9931
|
-
message: `${
|
|
10137
|
+
message: `${path40}: optional field missing`
|
|
9932
10138
|
};
|
|
9933
10139
|
}
|
|
9934
10140
|
switch (match) {
|
|
9935
10141
|
case "exact":
|
|
9936
|
-
return this.compareExact(
|
|
10142
|
+
return this.compareExact(path40, candidateValue, expectedValue, weight);
|
|
9937
10143
|
case "numeric_tolerance":
|
|
9938
10144
|
return this.compareNumericTolerance(
|
|
9939
|
-
|
|
10145
|
+
path40,
|
|
9940
10146
|
candidateValue,
|
|
9941
10147
|
expectedValue,
|
|
9942
10148
|
fieldConfig,
|
|
9943
10149
|
weight
|
|
9944
10150
|
);
|
|
9945
10151
|
case "date":
|
|
9946
|
-
return this.compareDate(
|
|
10152
|
+
return this.compareDate(path40, candidateValue, expectedValue, fieldConfig, weight);
|
|
9947
10153
|
default:
|
|
9948
10154
|
return {
|
|
9949
|
-
path:
|
|
10155
|
+
path: path40,
|
|
9950
10156
|
score: 0,
|
|
9951
10157
|
weight,
|
|
9952
10158
|
hit: false,
|
|
9953
|
-
message: `${
|
|
10159
|
+
message: `${path40}: unknown match type "${match}"`
|
|
9954
10160
|
};
|
|
9955
10161
|
}
|
|
9956
10162
|
}
|
|
9957
10163
|
/**
|
|
9958
10164
|
* Exact equality comparison.
|
|
9959
10165
|
*/
|
|
9960
|
-
compareExact(
|
|
10166
|
+
compareExact(path40, candidateValue, expectedValue, weight) {
|
|
9961
10167
|
if (deepEqual(candidateValue, expectedValue)) {
|
|
9962
10168
|
return {
|
|
9963
|
-
path:
|
|
10169
|
+
path: path40,
|
|
9964
10170
|
score: 1,
|
|
9965
10171
|
weight,
|
|
9966
10172
|
hit: true,
|
|
9967
|
-
message:
|
|
10173
|
+
message: path40
|
|
9968
10174
|
};
|
|
9969
10175
|
}
|
|
9970
10176
|
if (typeof candidateValue !== typeof expectedValue) {
|
|
9971
10177
|
return {
|
|
9972
|
-
path:
|
|
10178
|
+
path: path40,
|
|
9973
10179
|
score: 0,
|
|
9974
10180
|
weight,
|
|
9975
10181
|
hit: false,
|
|
9976
|
-
message: `${
|
|
10182
|
+
message: `${path40} (type mismatch: got ${typeof candidateValue}, expected ${typeof expectedValue})`
|
|
9977
10183
|
};
|
|
9978
10184
|
}
|
|
9979
10185
|
return {
|
|
9980
|
-
path:
|
|
10186
|
+
path: path40,
|
|
9981
10187
|
score: 0,
|
|
9982
10188
|
weight,
|
|
9983
10189
|
hit: false,
|
|
9984
|
-
message: `${
|
|
10190
|
+
message: `${path40} (value mismatch)`
|
|
9985
10191
|
};
|
|
9986
10192
|
}
|
|
9987
10193
|
/**
|
|
9988
10194
|
* Numeric comparison with absolute or relative tolerance.
|
|
9989
10195
|
*/
|
|
9990
|
-
compareNumericTolerance(
|
|
10196
|
+
compareNumericTolerance(path40, candidateValue, expectedValue, fieldConfig, weight) {
|
|
9991
10197
|
const { tolerance = 0, relative = false } = fieldConfig;
|
|
9992
10198
|
const candidateNum = toNumber2(candidateValue);
|
|
9993
10199
|
const expectedNum = toNumber2(expectedValue);
|
|
9994
10200
|
if (candidateNum === null || expectedNum === null) {
|
|
9995
10201
|
return {
|
|
9996
|
-
path:
|
|
10202
|
+
path: path40,
|
|
9997
10203
|
score: 0,
|
|
9998
10204
|
weight,
|
|
9999
10205
|
hit: false,
|
|
10000
|
-
message: `${
|
|
10206
|
+
message: `${path40} (non-numeric value)`
|
|
10001
10207
|
};
|
|
10002
10208
|
}
|
|
10003
10209
|
if (!Number.isFinite(candidateNum) || !Number.isFinite(expectedNum)) {
|
|
10004
10210
|
return {
|
|
10005
|
-
path:
|
|
10211
|
+
path: path40,
|
|
10006
10212
|
score: 0,
|
|
10007
10213
|
weight,
|
|
10008
10214
|
hit: false,
|
|
10009
|
-
message: `${
|
|
10215
|
+
message: `${path40} (invalid numeric value)`
|
|
10010
10216
|
};
|
|
10011
10217
|
}
|
|
10012
10218
|
const diff = Math.abs(candidateNum - expectedNum);
|
|
@@ -10019,61 +10225,61 @@ var FieldAccuracyEvaluator = class {
|
|
|
10019
10225
|
}
|
|
10020
10226
|
if (withinTolerance) {
|
|
10021
10227
|
return {
|
|
10022
|
-
path:
|
|
10228
|
+
path: path40,
|
|
10023
10229
|
score: 1,
|
|
10024
10230
|
weight,
|
|
10025
10231
|
hit: true,
|
|
10026
|
-
message: `${
|
|
10232
|
+
message: `${path40} (within tolerance: diff=${diff.toFixed(2)})`
|
|
10027
10233
|
};
|
|
10028
10234
|
}
|
|
10029
10235
|
return {
|
|
10030
|
-
path:
|
|
10236
|
+
path: path40,
|
|
10031
10237
|
score: 0,
|
|
10032
10238
|
weight,
|
|
10033
10239
|
hit: false,
|
|
10034
|
-
message: `${
|
|
10240
|
+
message: `${path40} (outside tolerance: diff=${diff.toFixed(2)}, tolerance=${tolerance})`
|
|
10035
10241
|
};
|
|
10036
10242
|
}
|
|
10037
10243
|
/**
|
|
10038
10244
|
* Date comparison with format normalization.
|
|
10039
10245
|
*/
|
|
10040
|
-
compareDate(
|
|
10246
|
+
compareDate(path40, candidateValue, expectedValue, fieldConfig, weight) {
|
|
10041
10247
|
const formats = fieldConfig.formats ?? DEFAULT_DATE_FORMATS;
|
|
10042
10248
|
const candidateDate = parseDate(String(candidateValue), formats);
|
|
10043
10249
|
const expectedDate = parseDate(String(expectedValue), formats);
|
|
10044
10250
|
if (candidateDate === null) {
|
|
10045
10251
|
return {
|
|
10046
|
-
path:
|
|
10252
|
+
path: path40,
|
|
10047
10253
|
score: 0,
|
|
10048
10254
|
weight,
|
|
10049
10255
|
hit: false,
|
|
10050
|
-
message: `${
|
|
10256
|
+
message: `${path40} (unparseable candidate date)`
|
|
10051
10257
|
};
|
|
10052
10258
|
}
|
|
10053
10259
|
if (expectedDate === null) {
|
|
10054
10260
|
return {
|
|
10055
|
-
path:
|
|
10261
|
+
path: path40,
|
|
10056
10262
|
score: 0,
|
|
10057
10263
|
weight,
|
|
10058
10264
|
hit: false,
|
|
10059
|
-
message: `${
|
|
10265
|
+
message: `${path40} (unparseable expected date)`
|
|
10060
10266
|
};
|
|
10061
10267
|
}
|
|
10062
10268
|
if (candidateDate.getFullYear() === expectedDate.getFullYear() && candidateDate.getMonth() === expectedDate.getMonth() && candidateDate.getDate() === expectedDate.getDate()) {
|
|
10063
10269
|
return {
|
|
10064
|
-
path:
|
|
10270
|
+
path: path40,
|
|
10065
10271
|
score: 1,
|
|
10066
10272
|
weight,
|
|
10067
10273
|
hit: true,
|
|
10068
|
-
message:
|
|
10274
|
+
message: path40
|
|
10069
10275
|
};
|
|
10070
10276
|
}
|
|
10071
10277
|
return {
|
|
10072
|
-
path:
|
|
10278
|
+
path: path40,
|
|
10073
10279
|
score: 0,
|
|
10074
10280
|
weight,
|
|
10075
10281
|
hit: false,
|
|
10076
|
-
message: `${
|
|
10282
|
+
message: `${path40} (date mismatch: got ${formatDateISO(candidateDate)}, expected ${formatDateISO(expectedDate)})`
|
|
10077
10283
|
};
|
|
10078
10284
|
}
|
|
10079
10285
|
/**
|
|
@@ -10114,11 +10320,11 @@ var FieldAccuracyEvaluator = class {
|
|
|
10114
10320
|
};
|
|
10115
10321
|
}
|
|
10116
10322
|
};
|
|
10117
|
-
function resolvePath(obj,
|
|
10118
|
-
if (!
|
|
10323
|
+
function resolvePath(obj, path40) {
|
|
10324
|
+
if (!path40 || !obj) {
|
|
10119
10325
|
return void 0;
|
|
10120
10326
|
}
|
|
10121
|
-
const parts =
|
|
10327
|
+
const parts = path40.split(/\.|\[|\]/).filter((p) => p.length > 0);
|
|
10122
10328
|
let current = obj;
|
|
10123
10329
|
for (const part of parts) {
|
|
10124
10330
|
if (current === null || current === void 0) {
|
|
@@ -10936,8 +11142,8 @@ var TokenUsageEvaluator = class {
|
|
|
10936
11142
|
};
|
|
10937
11143
|
|
|
10938
11144
|
// src/evaluation/evaluators/tool-trajectory.ts
|
|
10939
|
-
function getNestedValue(obj,
|
|
10940
|
-
const parts =
|
|
11145
|
+
function getNestedValue(obj, path40) {
|
|
11146
|
+
const parts = path40.split(".");
|
|
10941
11147
|
let current = obj;
|
|
10942
11148
|
for (const part of parts) {
|
|
10943
11149
|
if (current === null || current === void 0 || typeof current !== "object") {
|
|
@@ -11401,13 +11607,78 @@ function runContainsAssertion(output, value) {
|
|
|
11401
11607
|
misses: passed ? [] : [`Output does not contain "${value}"`]
|
|
11402
11608
|
};
|
|
11403
11609
|
}
|
|
11404
|
-
function
|
|
11405
|
-
const
|
|
11610
|
+
function runContainsAnyAssertion(output, values) {
|
|
11611
|
+
const matched = values.filter((v) => output.includes(v));
|
|
11612
|
+
const passed = matched.length > 0;
|
|
11613
|
+
return {
|
|
11614
|
+
score: passed ? 1 : 0,
|
|
11615
|
+
hits: passed ? [`Output contains "${matched[0]}"`] : [],
|
|
11616
|
+
misses: passed ? [] : [`Output does not contain any of: ${values.map((v) => `"${v}"`).join(", ")}`]
|
|
11617
|
+
};
|
|
11618
|
+
}
|
|
11619
|
+
function runContainsAllAssertion(output, values) {
|
|
11620
|
+
const missing = values.filter((v) => !output.includes(v));
|
|
11621
|
+
const passed = missing.length === 0;
|
|
11622
|
+
return {
|
|
11623
|
+
score: passed ? 1 : 0,
|
|
11624
|
+
hits: passed ? [`Output contains all ${values.length} expected strings`] : [],
|
|
11625
|
+
misses: passed ? [] : [`Output missing: ${missing.map((v) => `"${v}"`).join(", ")}`]
|
|
11626
|
+
};
|
|
11627
|
+
}
|
|
11628
|
+
function runIcontainsAssertion(output, value) {
|
|
11629
|
+
const passed = output.toLowerCase().includes(value.toLowerCase());
|
|
11630
|
+
return {
|
|
11631
|
+
score: passed ? 1 : 0,
|
|
11632
|
+
hits: passed ? [`Output contains "${value}" (case-insensitive)`] : [],
|
|
11633
|
+
misses: passed ? [] : [`Output does not contain "${value}" (case-insensitive)`]
|
|
11634
|
+
};
|
|
11635
|
+
}
|
|
11636
|
+
function runIcontainsAnyAssertion(output, values) {
|
|
11637
|
+
const lower = output.toLowerCase();
|
|
11638
|
+
const matched = values.filter((v) => lower.includes(v.toLowerCase()));
|
|
11639
|
+
const passed = matched.length > 0;
|
|
11640
|
+
return {
|
|
11641
|
+
score: passed ? 1 : 0,
|
|
11642
|
+
hits: passed ? [`Output contains "${matched[0]}" (case-insensitive)`] : [],
|
|
11643
|
+
misses: passed ? [] : [
|
|
11644
|
+
`Output does not contain any of: ${values.map((v) => `"${v}"`).join(", ")} (case-insensitive)`
|
|
11645
|
+
]
|
|
11646
|
+
};
|
|
11647
|
+
}
|
|
11648
|
+
function runIcontainsAllAssertion(output, values) {
|
|
11649
|
+
const lower = output.toLowerCase();
|
|
11650
|
+
const missing = values.filter((v) => !lower.includes(v.toLowerCase()));
|
|
11651
|
+
const passed = missing.length === 0;
|
|
11652
|
+
return {
|
|
11653
|
+
score: passed ? 1 : 0,
|
|
11654
|
+
hits: passed ? [`Output contains all ${values.length} expected strings (case-insensitive)`] : [],
|
|
11655
|
+
misses: passed ? [] : [`Output missing (case-insensitive): ${missing.map((v) => `"${v}"`).join(", ")}`]
|
|
11656
|
+
};
|
|
11657
|
+
}
|
|
11658
|
+
function runStartsWithAssertion(output, value) {
|
|
11659
|
+
const passed = output.trim().startsWith(value.trim());
|
|
11660
|
+
return {
|
|
11661
|
+
score: passed ? 1 : 0,
|
|
11662
|
+
hits: passed ? [`Output starts with "${value}"`] : [],
|
|
11663
|
+
misses: passed ? [] : [`Output does not start with "${value}"`]
|
|
11664
|
+
};
|
|
11665
|
+
}
|
|
11666
|
+
function runEndsWithAssertion(output, value) {
|
|
11667
|
+
const passed = output.trim().endsWith(value.trim());
|
|
11668
|
+
return {
|
|
11669
|
+
score: passed ? 1 : 0,
|
|
11670
|
+
hits: passed ? [`Output ends with "${value}"`] : [],
|
|
11671
|
+
misses: passed ? [] : [`Output does not end with "${value}"`]
|
|
11672
|
+
};
|
|
11673
|
+
}
|
|
11674
|
+
function runRegexAssertion(output, pattern, flags) {
|
|
11675
|
+
const regex = new RegExp(pattern, flags);
|
|
11406
11676
|
const passed = regex.test(output);
|
|
11677
|
+
const flagsLabel = flags ? ` (flags: ${flags})` : "";
|
|
11407
11678
|
return {
|
|
11408
11679
|
score: passed ? 1 : 0,
|
|
11409
|
-
hits: passed ? [`Output matches pattern /${pattern}
|
|
11410
|
-
misses: passed ? [] : [`Output does not match pattern /${pattern}
|
|
11680
|
+
hits: passed ? [`Output matches pattern /${pattern}/${flags ?? ""}${flagsLabel}`] : [],
|
|
11681
|
+
misses: passed ? [] : [`Output does not match pattern /${pattern}/${flags ?? ""}${flagsLabel}`]
|
|
11411
11682
|
};
|
|
11412
11683
|
}
|
|
11413
11684
|
function runIsJsonAssertion(output) {
|
|
@@ -11433,9 +11704,9 @@ function runEqualsAssertion(output, value) {
|
|
|
11433
11704
|
}
|
|
11434
11705
|
|
|
11435
11706
|
// src/evaluation/orchestrator.ts
|
|
11436
|
-
import { createHash, randomUUID as randomUUID7 } from "node:crypto";
|
|
11437
|
-
import { mkdir as
|
|
11438
|
-
import
|
|
11707
|
+
import { createHash as createHash2, randomUUID as randomUUID7 } from "node:crypto";
|
|
11708
|
+
import { mkdir as mkdir12 } from "node:fs/promises";
|
|
11709
|
+
import path37 from "node:path";
|
|
11439
11710
|
import micromatch4 from "micromatch";
|
|
11440
11711
|
|
|
11441
11712
|
// ../../node_modules/.bun/yocto-queue@1.2.2/node_modules/yocto-queue/index.js
|
|
@@ -11825,13 +12096,13 @@ var containsFactory = (config) => {
|
|
|
11825
12096
|
var regexFactory = (config) => {
|
|
11826
12097
|
const c = config;
|
|
11827
12098
|
return new DeterministicAssertionEvaluator("regex", (ctx) => {
|
|
11828
|
-
const result = runRegexAssertion(ctx.candidate, c.value);
|
|
12099
|
+
const result = runRegexAssertion(ctx.candidate, c.value, c.flags);
|
|
11829
12100
|
return {
|
|
11830
12101
|
score: result.score,
|
|
11831
12102
|
verdict: result.score === 1 ? "pass" : "fail",
|
|
11832
12103
|
hits: result.hits,
|
|
11833
12104
|
misses: result.misses,
|
|
11834
|
-
reasoning: result.score === 1 ? `Output matches pattern /${c.value}
|
|
12105
|
+
reasoning: result.score === 1 ? `Output matches pattern /${c.value}/${c.flags ?? ""}` : `Output does not match pattern /${c.value}/${c.flags ?? ""}`,
|
|
11835
12106
|
expectedAspectCount: 1
|
|
11836
12107
|
};
|
|
11837
12108
|
});
|
|
@@ -11863,9 +12134,107 @@ var equalsFactory = (config) => {
|
|
|
11863
12134
|
};
|
|
11864
12135
|
});
|
|
11865
12136
|
};
|
|
12137
|
+
var containsAnyFactory = (config) => {
|
|
12138
|
+
const c = config;
|
|
12139
|
+
return new DeterministicAssertionEvaluator("contains_any", (ctx) => {
|
|
12140
|
+
const result = runContainsAnyAssertion(ctx.candidate, c.value);
|
|
12141
|
+
return {
|
|
12142
|
+
score: result.score,
|
|
12143
|
+
verdict: result.score === 1 ? "pass" : "fail",
|
|
12144
|
+
hits: result.hits,
|
|
12145
|
+
misses: result.misses,
|
|
12146
|
+
reasoning: result.score === 1 ? result.hits[0] : result.misses[0],
|
|
12147
|
+
expectedAspectCount: 1
|
|
12148
|
+
};
|
|
12149
|
+
});
|
|
12150
|
+
};
|
|
12151
|
+
var containsAllFactory = (config) => {
|
|
12152
|
+
const c = config;
|
|
12153
|
+
return new DeterministicAssertionEvaluator("contains_all", (ctx) => {
|
|
12154
|
+
const result = runContainsAllAssertion(ctx.candidate, c.value);
|
|
12155
|
+
return {
|
|
12156
|
+
score: result.score,
|
|
12157
|
+
verdict: result.score === 1 ? "pass" : "fail",
|
|
12158
|
+
hits: result.hits,
|
|
12159
|
+
misses: result.misses,
|
|
12160
|
+
reasoning: result.score === 1 ? result.hits[0] : result.misses[0],
|
|
12161
|
+
expectedAspectCount: 1
|
|
12162
|
+
};
|
|
12163
|
+
});
|
|
12164
|
+
};
|
|
12165
|
+
var icontainsFactory = (config) => {
|
|
12166
|
+
const c = config;
|
|
12167
|
+
return new DeterministicAssertionEvaluator("icontains", (ctx) => {
|
|
12168
|
+
const result = runIcontainsAssertion(ctx.candidate, c.value);
|
|
12169
|
+
return {
|
|
12170
|
+
score: result.score,
|
|
12171
|
+
verdict: result.score === 1 ? "pass" : "fail",
|
|
12172
|
+
hits: result.hits,
|
|
12173
|
+
misses: result.misses,
|
|
12174
|
+
reasoning: result.score === 1 ? result.hits[0] : result.misses[0],
|
|
12175
|
+
expectedAspectCount: 1
|
|
12176
|
+
};
|
|
12177
|
+
});
|
|
12178
|
+
};
|
|
12179
|
+
var icontainsAnyFactory = (config) => {
|
|
12180
|
+
const c = config;
|
|
12181
|
+
return new DeterministicAssertionEvaluator("icontains_any", (ctx) => {
|
|
12182
|
+
const result = runIcontainsAnyAssertion(ctx.candidate, c.value);
|
|
12183
|
+
return {
|
|
12184
|
+
score: result.score,
|
|
12185
|
+
verdict: result.score === 1 ? "pass" : "fail",
|
|
12186
|
+
hits: result.hits,
|
|
12187
|
+
misses: result.misses,
|
|
12188
|
+
reasoning: result.score === 1 ? result.hits[0] : result.misses[0],
|
|
12189
|
+
expectedAspectCount: 1
|
|
12190
|
+
};
|
|
12191
|
+
});
|
|
12192
|
+
};
|
|
12193
|
+
var icontainsAllFactory = (config) => {
|
|
12194
|
+
const c = config;
|
|
12195
|
+
return new DeterministicAssertionEvaluator("icontains_all", (ctx) => {
|
|
12196
|
+
const result = runIcontainsAllAssertion(ctx.candidate, c.value);
|
|
12197
|
+
return {
|
|
12198
|
+
score: result.score,
|
|
12199
|
+
verdict: result.score === 1 ? "pass" : "fail",
|
|
12200
|
+
hits: result.hits,
|
|
12201
|
+
misses: result.misses,
|
|
12202
|
+
reasoning: result.score === 1 ? result.hits[0] : result.misses[0],
|
|
12203
|
+
expectedAspectCount: 1
|
|
12204
|
+
};
|
|
12205
|
+
});
|
|
12206
|
+
};
|
|
12207
|
+
var startsWithFactory = (config) => {
|
|
12208
|
+
const c = config;
|
|
12209
|
+
return new DeterministicAssertionEvaluator("starts_with", (ctx) => {
|
|
12210
|
+
const result = runStartsWithAssertion(ctx.candidate, c.value);
|
|
12211
|
+
return {
|
|
12212
|
+
score: result.score,
|
|
12213
|
+
verdict: result.score === 1 ? "pass" : "fail",
|
|
12214
|
+
hits: result.hits,
|
|
12215
|
+
misses: result.misses,
|
|
12216
|
+
reasoning: result.score === 1 ? result.hits[0] : result.misses[0],
|
|
12217
|
+
expectedAspectCount: 1
|
|
12218
|
+
};
|
|
12219
|
+
});
|
|
12220
|
+
};
|
|
12221
|
+
var endsWithFactory = (config) => {
|
|
12222
|
+
const c = config;
|
|
12223
|
+
return new DeterministicAssertionEvaluator("ends_with", (ctx) => {
|
|
12224
|
+
const result = runEndsWithAssertion(ctx.candidate, c.value);
|
|
12225
|
+
return {
|
|
12226
|
+
score: result.score,
|
|
12227
|
+
verdict: result.score === 1 ? "pass" : "fail",
|
|
12228
|
+
hits: result.hits,
|
|
12229
|
+
misses: result.misses,
|
|
12230
|
+
reasoning: result.score === 1 ? result.hits[0] : result.misses[0],
|
|
12231
|
+
expectedAspectCount: 1
|
|
12232
|
+
};
|
|
12233
|
+
});
|
|
12234
|
+
};
|
|
11866
12235
|
function createBuiltinRegistry() {
|
|
11867
12236
|
const registry = new EvaluatorRegistry();
|
|
11868
|
-
registry.register("llm_judge", llmJudgeFactory).register("code", codeFactory).register("composite", compositeFactory).register("tool_trajectory", toolTrajectoryFactory).register("field_accuracy", fieldAccuracyFactory).register("latency", latencyFactory).register("cost", costFactory).register("token_usage", tokenUsageFactory).register("execution_metrics", executionMetricsFactory).register("agent_judge", agentJudgeFactory).register("contains", containsFactory).register("regex", regexFactory).register("is_json", isJsonFactory).register("equals", equalsFactory);
|
|
12237
|
+
registry.register("llm_judge", llmJudgeFactory).register("code", codeFactory).register("composite", compositeFactory).register("tool_trajectory", toolTrajectoryFactory).register("field_accuracy", fieldAccuracyFactory).register("latency", latencyFactory).register("cost", costFactory).register("token_usage", tokenUsageFactory).register("execution_metrics", executionMetricsFactory).register("agent_judge", agentJudgeFactory).register("contains", containsFactory).register("contains_any", containsAnyFactory).register("contains_all", containsAllFactory).register("icontains", icontainsFactory).register("icontains_any", icontainsAnyFactory).register("icontains_all", icontainsAllFactory).register("starts_with", startsWithFactory).register("ends_with", endsWithFactory).register("regex", regexFactory).register("is_json", isJsonFactory).register("equals", equalsFactory);
|
|
11869
12238
|
return registry;
|
|
11870
12239
|
}
|
|
11871
12240
|
|
|
@@ -12209,18 +12578,236 @@ async function cleanupEvalWorkspaces(evalRunId, workspaceRoot) {
|
|
|
12209
12578
|
}
|
|
12210
12579
|
}
|
|
12211
12580
|
|
|
12581
|
+
// src/evaluation/workspace/repo-manager.ts
|
|
12582
|
+
import { execFile } from "node:child_process";
|
|
12583
|
+
import { createHash } from "node:crypto";
|
|
12584
|
+
import { existsSync as existsSync2 } from "node:fs";
|
|
12585
|
+
import { mkdir as mkdir11, rm as rm5, unlink, writeFile as writeFile7 } from "node:fs/promises";
|
|
12586
|
+
import os4 from "node:os";
|
|
12587
|
+
import path35 from "node:path";
|
|
12588
|
+
import { promisify as promisify5 } from "node:util";
|
|
12589
|
+
var execFileAsync = promisify5(execFile);
|
|
12590
|
+
var DEFAULT_CACHE_DIR = path35.join(os4.homedir(), ".agentv", "git-cache");
|
|
12591
|
+
var DEFAULT_TIMEOUT_MS2 = 3e5;
|
|
12592
|
+
var LOCK_TIMEOUT_MS = 6e4;
|
|
12593
|
+
function gitEnv() {
|
|
12594
|
+
const env = { ...process.env };
|
|
12595
|
+
for (const key of Object.keys(env)) {
|
|
12596
|
+
if (key.startsWith("GIT_") && key !== "GIT_SSH_COMMAND") {
|
|
12597
|
+
delete env[key];
|
|
12598
|
+
}
|
|
12599
|
+
}
|
|
12600
|
+
return {
|
|
12601
|
+
...env,
|
|
12602
|
+
GIT_TERMINAL_PROMPT: "0",
|
|
12603
|
+
GIT_ASKPASS: "",
|
|
12604
|
+
GIT_SSH_COMMAND: "ssh -o BatchMode=yes"
|
|
12605
|
+
};
|
|
12606
|
+
}
|
|
12607
|
+
function cacheKey(source) {
|
|
12608
|
+
const raw = source.type === "git" ? source.url.toLowerCase().replace(/\.git$/, "") : source.path;
|
|
12609
|
+
return createHash("sha256").update(raw).digest("hex");
|
|
12610
|
+
}
|
|
12611
|
+
function getSourceUrl(source) {
|
|
12612
|
+
return source.type === "git" ? source.url : source.path;
|
|
12613
|
+
}
|
|
12614
|
+
async function git(args, opts) {
|
|
12615
|
+
const { stdout } = await execFileAsync("git", args, {
|
|
12616
|
+
cwd: opts?.cwd,
|
|
12617
|
+
timeout: opts?.timeout ?? DEFAULT_TIMEOUT_MS2,
|
|
12618
|
+
env: gitEnv(),
|
|
12619
|
+
maxBuffer: 50 * 1024 * 1024
|
|
12620
|
+
// 50MB
|
|
12621
|
+
});
|
|
12622
|
+
return stdout.trim();
|
|
12623
|
+
}
|
|
12624
|
+
async function acquireLock(lockPath) {
|
|
12625
|
+
const start = Date.now();
|
|
12626
|
+
while (Date.now() - start < LOCK_TIMEOUT_MS) {
|
|
12627
|
+
try {
|
|
12628
|
+
await writeFile7(lockPath, String(process.pid), { flag: "wx" });
|
|
12629
|
+
return;
|
|
12630
|
+
} catch (err) {
|
|
12631
|
+
if (err.code === "EEXIST") {
|
|
12632
|
+
await new Promise((r) => setTimeout(r, 200));
|
|
12633
|
+
continue;
|
|
12634
|
+
}
|
|
12635
|
+
throw err;
|
|
12636
|
+
}
|
|
12637
|
+
}
|
|
12638
|
+
throw new Error(`Timed out waiting for lock: ${lockPath}`);
|
|
12639
|
+
}
|
|
12640
|
+
async function releaseLock(lockPath) {
|
|
12641
|
+
try {
|
|
12642
|
+
await unlink(lockPath);
|
|
12643
|
+
} catch {
|
|
12644
|
+
}
|
|
12645
|
+
}
|
|
12646
|
+
var RepoManager = class {
|
|
12647
|
+
cacheDir;
|
|
12648
|
+
constructor(cacheDir) {
|
|
12649
|
+
this.cacheDir = cacheDir ?? DEFAULT_CACHE_DIR;
|
|
12650
|
+
}
|
|
12651
|
+
/**
|
|
12652
|
+
* Ensure a bare mirror cache exists for the given source.
|
|
12653
|
+
* Creates on first access, fetches updates on subsequent calls.
|
|
12654
|
+
* Returns the absolute path to the cache directory.
|
|
12655
|
+
*/
|
|
12656
|
+
async ensureCache(source, depth) {
|
|
12657
|
+
const key = cacheKey(source);
|
|
12658
|
+
const cachePath = path35.join(this.cacheDir, key);
|
|
12659
|
+
const lockPath = `${cachePath}.lock`;
|
|
12660
|
+
await mkdir11(this.cacheDir, { recursive: true });
|
|
12661
|
+
await acquireLock(lockPath);
|
|
12662
|
+
try {
|
|
12663
|
+
if (existsSync2(path35.join(cachePath, "HEAD"))) {
|
|
12664
|
+
const fetchArgs = ["fetch", "--prune"];
|
|
12665
|
+
if (depth) {
|
|
12666
|
+
fetchArgs.push("--depth", String(depth));
|
|
12667
|
+
}
|
|
12668
|
+
await git(fetchArgs, { cwd: cachePath });
|
|
12669
|
+
} else {
|
|
12670
|
+
const cloneArgs = ["clone", "--mirror", "--bare"];
|
|
12671
|
+
if (depth) {
|
|
12672
|
+
cloneArgs.push("--depth", String(depth));
|
|
12673
|
+
}
|
|
12674
|
+
const sourceUrl = getSourceUrl(source);
|
|
12675
|
+
const cloneUrl = depth && source.type === "local" ? `file://${sourceUrl}` : sourceUrl;
|
|
12676
|
+
cloneArgs.push(cloneUrl, cachePath);
|
|
12677
|
+
await git(cloneArgs);
|
|
12678
|
+
}
|
|
12679
|
+
} finally {
|
|
12680
|
+
await releaseLock(lockPath);
|
|
12681
|
+
}
|
|
12682
|
+
return cachePath;
|
|
12683
|
+
}
|
|
12684
|
+
/**
|
|
12685
|
+
* Clone a repo from cache into the workspace at the configured path.
|
|
12686
|
+
* Handles checkout, ref resolution, ancestor walking, shallow clone, sparse checkout.
|
|
12687
|
+
*/
|
|
12688
|
+
async materialize(repo, workspacePath) {
|
|
12689
|
+
const targetDir = path35.join(workspacePath, repo.path);
|
|
12690
|
+
const cachePath = await this.ensureCache(repo.source, repo.clone?.depth);
|
|
12691
|
+
const cloneArgs = ["clone"];
|
|
12692
|
+
if (repo.clone?.depth) {
|
|
12693
|
+
cloneArgs.push("--depth", String(repo.clone.depth));
|
|
12694
|
+
}
|
|
12695
|
+
if (repo.clone?.filter) {
|
|
12696
|
+
cloneArgs.push("--filter", repo.clone.filter);
|
|
12697
|
+
}
|
|
12698
|
+
cloneArgs.push("--no-checkout");
|
|
12699
|
+
const cloneUrl = repo.clone?.depth || repo.clone?.filter ? `file://${cachePath}` : cachePath;
|
|
12700
|
+
cloneArgs.push(cloneUrl, targetDir);
|
|
12701
|
+
await git(cloneArgs);
|
|
12702
|
+
if (repo.clone?.sparse?.length) {
|
|
12703
|
+
await git(["sparse-checkout", "init", "--cone"], { cwd: targetDir });
|
|
12704
|
+
await git(["sparse-checkout", "set", ...repo.clone.sparse], { cwd: targetDir });
|
|
12705
|
+
}
|
|
12706
|
+
const ref = repo.checkout?.ref ?? "HEAD";
|
|
12707
|
+
const resolve = repo.checkout?.resolve ?? "remote";
|
|
12708
|
+
let resolvedSha;
|
|
12709
|
+
if (resolve === "remote" && repo.source.type === "git") {
|
|
12710
|
+
const url = getSourceUrl(repo.source);
|
|
12711
|
+
try {
|
|
12712
|
+
const lsOutput = await git(["ls-remote", url, ref]);
|
|
12713
|
+
const match = lsOutput.split(" ")[0];
|
|
12714
|
+
if (!match) {
|
|
12715
|
+
throw new Error(`Ref '${ref}' not found on remote ${url}`);
|
|
12716
|
+
}
|
|
12717
|
+
resolvedSha = match;
|
|
12718
|
+
} catch (err) {
|
|
12719
|
+
if (err instanceof Error && err.message.includes("not found")) throw err;
|
|
12720
|
+
resolvedSha = ref;
|
|
12721
|
+
}
|
|
12722
|
+
} else {
|
|
12723
|
+
resolvedSha = ref;
|
|
12724
|
+
}
|
|
12725
|
+
await git(["checkout", resolvedSha], { cwd: targetDir });
|
|
12726
|
+
const ancestor = repo.checkout?.ancestor ?? 0;
|
|
12727
|
+
if (ancestor > 0) {
|
|
12728
|
+
try {
|
|
12729
|
+
const ancestorSha = await git(["rev-parse", `HEAD~${ancestor}`], { cwd: targetDir });
|
|
12730
|
+
await git(["checkout", ancestorSha], { cwd: targetDir });
|
|
12731
|
+
} catch {
|
|
12732
|
+
if (repo.clone?.depth) {
|
|
12733
|
+
await git(["fetch", "--deepen", String(ancestor)], { cwd: targetDir });
|
|
12734
|
+
const ancestorSha = await git(["rev-parse", `HEAD~${ancestor}`], { cwd: targetDir });
|
|
12735
|
+
await git(["checkout", ancestorSha], { cwd: targetDir });
|
|
12736
|
+
} else {
|
|
12737
|
+
throw new Error(
|
|
12738
|
+
`Cannot resolve ancestor ${ancestor} of ref '${ref}'. If using shallow clone, increase clone.depth to at least ${ancestor + 1}.`
|
|
12739
|
+
);
|
|
12740
|
+
}
|
|
12741
|
+
}
|
|
12742
|
+
}
|
|
12743
|
+
}
|
|
12744
|
+
/** Materialize all repos into the workspace. */
|
|
12745
|
+
async materializeAll(repos, workspacePath) {
|
|
12746
|
+
for (const repo of repos) {
|
|
12747
|
+
await this.materialize(repo, workspacePath);
|
|
12748
|
+
}
|
|
12749
|
+
}
|
|
12750
|
+
/** Reset repos in workspace to their checkout state. */
|
|
12751
|
+
async reset(repos, workspacePath, strategy) {
|
|
12752
|
+
if (strategy === "recreate") {
|
|
12753
|
+
for (const repo of repos) {
|
|
12754
|
+
const targetDir = path35.join(workspacePath, repo.path);
|
|
12755
|
+
await rm5(targetDir, { recursive: true, force: true });
|
|
12756
|
+
}
|
|
12757
|
+
await this.materializeAll(repos, workspacePath);
|
|
12758
|
+
return;
|
|
12759
|
+
}
|
|
12760
|
+
for (const repo of repos) {
|
|
12761
|
+
const targetDir = path35.join(workspacePath, repo.path);
|
|
12762
|
+
await git(["reset", "--hard", "HEAD"], { cwd: targetDir });
|
|
12763
|
+
await git(["clean", "-fd"], { cwd: targetDir });
|
|
12764
|
+
}
|
|
12765
|
+
}
|
|
12766
|
+
/**
|
|
12767
|
+
* Seed the cache from a local repository, setting the remote to a given URL.
|
|
12768
|
+
* Useful for avoiding slow network clones when a local clone already exists.
|
|
12769
|
+
*/
|
|
12770
|
+
async seedCache(localPath, remoteUrl, opts) {
|
|
12771
|
+
const source = { type: "git", url: remoteUrl };
|
|
12772
|
+
const key = cacheKey(source);
|
|
12773
|
+
const cachePath = path35.join(this.cacheDir, key);
|
|
12774
|
+
const lockPath = `${cachePath}.lock`;
|
|
12775
|
+
await mkdir11(this.cacheDir, { recursive: true });
|
|
12776
|
+
await acquireLock(lockPath);
|
|
12777
|
+
try {
|
|
12778
|
+
if (existsSync2(path35.join(cachePath, "HEAD"))) {
|
|
12779
|
+
if (!opts?.force) {
|
|
12780
|
+
throw new Error(
|
|
12781
|
+
`Cache already exists for ${remoteUrl} at ${cachePath}. Use force to overwrite.`
|
|
12782
|
+
);
|
|
12783
|
+
}
|
|
12784
|
+
await rm5(cachePath, { recursive: true, force: true });
|
|
12785
|
+
}
|
|
12786
|
+
await git(["clone", "--mirror", "--bare", localPath, cachePath]);
|
|
12787
|
+
await git(["remote", "set-url", "origin", remoteUrl], { cwd: cachePath });
|
|
12788
|
+
} finally {
|
|
12789
|
+
await releaseLock(lockPath);
|
|
12790
|
+
}
|
|
12791
|
+
return cachePath;
|
|
12792
|
+
}
|
|
12793
|
+
/** Remove the entire cache directory. */
|
|
12794
|
+
async cleanCache() {
|
|
12795
|
+
await rm5(this.cacheDir, { recursive: true, force: true });
|
|
12796
|
+
}
|
|
12797
|
+
};
|
|
12798
|
+
|
|
12212
12799
|
// src/evaluation/workspace/resolve.ts
|
|
12213
12800
|
import { readdir as readdir4, stat as stat6 } from "node:fs/promises";
|
|
12214
|
-
import
|
|
12801
|
+
import path36 from "node:path";
|
|
12215
12802
|
async function resolveWorkspaceTemplate(templatePath) {
|
|
12216
12803
|
if (!templatePath) {
|
|
12217
12804
|
return void 0;
|
|
12218
12805
|
}
|
|
12219
|
-
const resolved =
|
|
12806
|
+
const resolved = path36.resolve(templatePath);
|
|
12220
12807
|
const stats = await stat6(resolved);
|
|
12221
12808
|
if (stats.isFile()) {
|
|
12222
12809
|
return {
|
|
12223
|
-
dir:
|
|
12810
|
+
dir: path36.dirname(resolved),
|
|
12224
12811
|
workspaceFile: resolved
|
|
12225
12812
|
};
|
|
12226
12813
|
}
|
|
@@ -12232,14 +12819,14 @@ async function resolveWorkspaceTemplate(templatePath) {
|
|
|
12232
12819
|
if (workspaceFiles.length === 1) {
|
|
12233
12820
|
return {
|
|
12234
12821
|
dir: resolved,
|
|
12235
|
-
workspaceFile:
|
|
12822
|
+
workspaceFile: path36.join(resolved, workspaceFiles[0])
|
|
12236
12823
|
};
|
|
12237
12824
|
}
|
|
12238
12825
|
if (workspaceFiles.length > 1) {
|
|
12239
12826
|
const conventionFile = workspaceFiles.find((f) => f === "template.code-workspace");
|
|
12240
12827
|
return {
|
|
12241
12828
|
dir: resolved,
|
|
12242
|
-
workspaceFile: conventionFile ?
|
|
12829
|
+
workspaceFile: conventionFile ? path36.join(resolved, conventionFile) : void 0
|
|
12243
12830
|
};
|
|
12244
12831
|
}
|
|
12245
12832
|
return { dir: resolved };
|
|
@@ -12361,6 +12948,11 @@ async function runEvaluation(options) {
|
|
|
12361
12948
|
}
|
|
12362
12949
|
return getOrCreateProvider(resolvedJudge);
|
|
12363
12950
|
};
|
|
12951
|
+
if (isAgentProvider(getOrCreateProvider(target)) && !target.judgeTarget) {
|
|
12952
|
+
throw new Error(
|
|
12953
|
+
`Target "${target.name}" is an agent provider ("${target.kind}") with no judge_target \u2014 agent providers cannot return structured JSON for judging. Set judge_target to an LLM provider (e.g., azure_base).`
|
|
12954
|
+
);
|
|
12955
|
+
}
|
|
12364
12956
|
const targetResolver = (name) => {
|
|
12365
12957
|
const resolved = resolveTargetByName(name);
|
|
12366
12958
|
if (!resolved) {
|
|
@@ -12374,7 +12966,7 @@ async function runEvaluation(options) {
|
|
|
12374
12966
|
];
|
|
12375
12967
|
const evaluatorRegistry = buildEvaluatorRegistry(evaluators, resolveJudgeProvider);
|
|
12376
12968
|
const typeRegistry = createBuiltinRegistry();
|
|
12377
|
-
const discoveryBaseDir = evalFilePath ?
|
|
12969
|
+
const discoveryBaseDir = evalFilePath ? path37.dirname(path37.resolve(evalFilePath)) : process.cwd();
|
|
12378
12970
|
await discoverAssertions(typeRegistry, discoveryBaseDir);
|
|
12379
12971
|
const providerRegistry = createBuiltinProviderRegistry();
|
|
12380
12972
|
await discoverProviders(providerRegistry, discoveryBaseDir);
|
|
@@ -12429,7 +13021,8 @@ async function runEvaluation(options) {
|
|
|
12429
13021
|
const resolvedTemplate = await resolveWorkspaceTemplate(rawTemplate);
|
|
12430
13022
|
const workspaceTemplate = resolvedTemplate?.dir;
|
|
12431
13023
|
const suiteWorkspaceFile = resolvedTemplate?.workspaceFile;
|
|
12432
|
-
const
|
|
13024
|
+
const isPerTestIsolation = suiteWorkspace?.isolation === "per_test";
|
|
13025
|
+
const hasSharedWorkspace = !!(workspaceTemplate || suiteWorkspace?.before_all || suiteWorkspace?.repos?.length && !isPerTestIsolation);
|
|
12433
13026
|
const requestedWorkers = options.maxConcurrency ?? target.workers ?? 1;
|
|
12434
13027
|
const workers = hasSharedWorkspace ? 1 : requestedWorkers;
|
|
12435
13028
|
if (hasSharedWorkspace && requestedWorkers > 1) {
|
|
@@ -12448,9 +13041,22 @@ async function runEvaluation(options) {
|
|
|
12448
13041
|
const message = error instanceof Error ? error.message : String(error);
|
|
12449
13042
|
throw new Error(`Failed to create shared workspace: ${message}`);
|
|
12450
13043
|
}
|
|
12451
|
-
} else if (suiteWorkspace?.before_all) {
|
|
13044
|
+
} else if (suiteWorkspace?.before_all || suiteWorkspace?.repos?.length && !isPerTestIsolation) {
|
|
12452
13045
|
sharedWorkspacePath = getWorkspacePath(evalRunId, "shared");
|
|
12453
|
-
await
|
|
13046
|
+
await mkdir12(sharedWorkspacePath, { recursive: true });
|
|
13047
|
+
}
|
|
13048
|
+
const repoManager = suiteWorkspace?.repos?.length ? new RepoManager() : void 0;
|
|
13049
|
+
if (repoManager && sharedWorkspacePath && suiteWorkspace?.repos && !isPerTestIsolation) {
|
|
13050
|
+
try {
|
|
13051
|
+
await repoManager.materializeAll(suiteWorkspace.repos, sharedWorkspacePath);
|
|
13052
|
+
} catch (error) {
|
|
13053
|
+
const message = error instanceof Error ? error.message : String(error);
|
|
13054
|
+
if (sharedWorkspacePath) {
|
|
13055
|
+
await cleanupWorkspace(sharedWorkspacePath).catch(() => {
|
|
13056
|
+
});
|
|
13057
|
+
}
|
|
13058
|
+
throw new Error(`Failed to materialize repos: ${message}`);
|
|
13059
|
+
}
|
|
12454
13060
|
}
|
|
12455
13061
|
if (sharedWorkspacePath && suiteWorkspace?.before_all) {
|
|
12456
13062
|
const scriptContext = {
|
|
@@ -12541,7 +13147,8 @@ async function runEvaluation(options) {
|
|
|
12541
13147
|
sharedBaselineCommit,
|
|
12542
13148
|
suiteWorkspaceFile,
|
|
12543
13149
|
streamCallbacks,
|
|
12544
|
-
typeRegistry
|
|
13150
|
+
typeRegistry,
|
|
13151
|
+
repoManager
|
|
12545
13152
|
};
|
|
12546
13153
|
let result = trials && trials.count > 1 ? await runEvalCaseWithTrials(runCaseOptions, trials) : await runEvalCase(runCaseOptions);
|
|
12547
13154
|
if (totalBudgetUsd !== void 0) {
|
|
@@ -12816,15 +13423,16 @@ async function runEvalCase(options) {
|
|
|
12816
13423
|
sharedWorkspacePath,
|
|
12817
13424
|
sharedBaselineCommit,
|
|
12818
13425
|
suiteWorkspaceFile,
|
|
12819
|
-
typeRegistry: providedTypeRegistry
|
|
13426
|
+
typeRegistry: providedTypeRegistry,
|
|
13427
|
+
repoManager
|
|
12820
13428
|
} = options;
|
|
12821
13429
|
const formattingMode = usesFileReferencePrompt(provider) ? "agent" : "lm";
|
|
12822
13430
|
const promptInputs = await buildPromptInputs(evalCase, formattingMode);
|
|
12823
13431
|
const typeRegistry = providedTypeRegistry ?? createBuiltinRegistry();
|
|
12824
|
-
const
|
|
13432
|
+
const cacheKey2 = useCache ? createCacheKey(provider, target, evalCase, promptInputs) : void 0;
|
|
12825
13433
|
let cachedResponse;
|
|
12826
|
-
if (
|
|
12827
|
-
cachedResponse = await cache.get(
|
|
13434
|
+
if (cacheKey2 && cache) {
|
|
13435
|
+
cachedResponse = await cache.get(cacheKey2);
|
|
12828
13436
|
}
|
|
12829
13437
|
const nowFn = now ?? (() => /* @__PURE__ */ new Date());
|
|
12830
13438
|
let workspacePath = sharedWorkspacePath;
|
|
@@ -12853,9 +13461,25 @@ async function runEvalCase(options) {
|
|
|
12853
13461
|
);
|
|
12854
13462
|
}
|
|
12855
13463
|
}
|
|
12856
|
-
if (!workspacePath && evalCase.workspace?.before_all && evalRunId) {
|
|
13464
|
+
if (!workspacePath && (evalCase.workspace?.before_all || evalCase.workspace?.repos?.length) && evalRunId) {
|
|
12857
13465
|
workspacePath = getWorkspacePath(evalRunId, evalCase.id);
|
|
12858
|
-
await
|
|
13466
|
+
await mkdir12(workspacePath, { recursive: true });
|
|
13467
|
+
}
|
|
13468
|
+
if (evalCase.workspace?.repos?.length && workspacePath) {
|
|
13469
|
+
const perCaseRepoManager = new RepoManager();
|
|
13470
|
+
try {
|
|
13471
|
+
await perCaseRepoManager.materializeAll(evalCase.workspace.repos, workspacePath);
|
|
13472
|
+
} catch (error) {
|
|
13473
|
+
const message = error instanceof Error ? error.message : String(error);
|
|
13474
|
+
return buildErrorResult(
|
|
13475
|
+
evalCase,
|
|
13476
|
+
target.name,
|
|
13477
|
+
nowFn(),
|
|
13478
|
+
new Error(`Failed to materialize repos: ${message}`),
|
|
13479
|
+
promptInputs,
|
|
13480
|
+
provider
|
|
13481
|
+
);
|
|
13482
|
+
}
|
|
12859
13483
|
}
|
|
12860
13484
|
if (workspacePath && evalCase.workspace?.before_all) {
|
|
12861
13485
|
const scriptContext = {
|
|
@@ -12979,8 +13603,8 @@ async function runEvalCase(options) {
|
|
|
12979
13603
|
}
|
|
12980
13604
|
return errorResult;
|
|
12981
13605
|
}
|
|
12982
|
-
if (
|
|
12983
|
-
await cache.set(
|
|
13606
|
+
if (cacheKey2 && cache && !cachedResponse) {
|
|
13607
|
+
await cache.set(cacheKey2, providerResponse);
|
|
12984
13608
|
}
|
|
12985
13609
|
const output = providerResponse.output;
|
|
12986
13610
|
const hasExecutionMetrics = providerResponse.tokenUsage !== void 0 || providerResponse.costUsd !== void 0 || providerResponse.durationMs !== void 0;
|
|
@@ -13008,6 +13632,16 @@ async function runEvalCase(options) {
|
|
|
13008
13632
|
}
|
|
13009
13633
|
}
|
|
13010
13634
|
const providerError = extractProviderError(providerResponse);
|
|
13635
|
+
if (repoManager && workspacePath && evalCase.workspace?.reset?.after_each && evalCase.workspace.reset.strategy && evalCase.workspace.reset.strategy !== "none" && evalCase.workspace.repos) {
|
|
13636
|
+
try {
|
|
13637
|
+
await repoManager.reset(
|
|
13638
|
+
evalCase.workspace.repos,
|
|
13639
|
+
workspacePath,
|
|
13640
|
+
evalCase.workspace.reset.strategy
|
|
13641
|
+
);
|
|
13642
|
+
} catch {
|
|
13643
|
+
}
|
|
13644
|
+
}
|
|
13011
13645
|
if (workspacePath && evalCase.workspace?.after_each) {
|
|
13012
13646
|
const scriptContext = {
|
|
13013
13647
|
workspacePath,
|
|
@@ -13372,7 +14006,7 @@ async function runEvaluatorList(options) {
|
|
|
13372
14006
|
fileChanges,
|
|
13373
14007
|
workspacePath
|
|
13374
14008
|
};
|
|
13375
|
-
const evalFileDir = evalCase.guideline_paths[0] ?
|
|
14009
|
+
const evalFileDir = evalCase.guideline_paths[0] ? path37.dirname(evalCase.guideline_paths[0]) : process.cwd();
|
|
13376
14010
|
const dispatchContext = {
|
|
13377
14011
|
judgeProvider,
|
|
13378
14012
|
targetResolver,
|
|
@@ -13462,8 +14096,9 @@ async function runEvaluatorList(options) {
|
|
|
13462
14096
|
const minScore = typeof entry.required === "number" ? entry.required : PASS_THRESHOLD;
|
|
13463
14097
|
return entry.score.score < minScore;
|
|
13464
14098
|
});
|
|
13465
|
-
const
|
|
13466
|
-
|
|
14099
|
+
const scorable = scored.filter((entry) => entry.score.verdict !== "skip");
|
|
14100
|
+
const aggregateScore = hasRequiredFailure ? 0 : scorable.length > 0 ? computeWeightedMean(
|
|
14101
|
+
scorable.map((entry) => ({ score: entry.score.score, weight: entry.weight }))
|
|
13467
14102
|
) : 0;
|
|
13468
14103
|
const hits = scored.flatMap((entry) => entry.score.hits);
|
|
13469
14104
|
const misses = scored.flatMap((entry) => entry.score.misses);
|
|
@@ -13603,7 +14238,7 @@ function extractProviderError(response) {
|
|
|
13603
14238
|
return trimmed.length > 0 ? trimmed : void 0;
|
|
13604
14239
|
}
|
|
13605
14240
|
function createCacheKey(provider, target, evalCase, promptInputs) {
|
|
13606
|
-
const hash =
|
|
14241
|
+
const hash = createHash2("sha256");
|
|
13607
14242
|
hash.update(provider.id);
|
|
13608
14243
|
hash.update(target.name);
|
|
13609
14244
|
hash.update(evalCase.id);
|
|
@@ -13671,8 +14306,8 @@ function computeWeightedMean(entries) {
|
|
|
13671
14306
|
}
|
|
13672
14307
|
|
|
13673
14308
|
// src/evaluation/evaluate.ts
|
|
13674
|
-
import { existsSync as
|
|
13675
|
-
import
|
|
14309
|
+
import { existsSync as existsSync3 } from "node:fs";
|
|
14310
|
+
import path38 from "node:path";
|
|
13676
14311
|
async function evaluate(config) {
|
|
13677
14312
|
const startTime = Date.now();
|
|
13678
14313
|
if (config.tests && config.specFile) {
|
|
@@ -13694,13 +14329,13 @@ async function evaluate(config) {
|
|
|
13694
14329
|
let evalCases;
|
|
13695
14330
|
let testFilePath;
|
|
13696
14331
|
if (config.specFile) {
|
|
13697
|
-
testFilePath =
|
|
14332
|
+
testFilePath = path38.resolve(config.specFile);
|
|
13698
14333
|
evalCases = await loadTests(testFilePath, repoRoot, {
|
|
13699
14334
|
verbose: config.verbose,
|
|
13700
14335
|
filter: config.filter
|
|
13701
14336
|
});
|
|
13702
14337
|
} else {
|
|
13703
|
-
testFilePath =
|
|
14338
|
+
testFilePath = path38.join(process.cwd(), "__programmatic__.yaml");
|
|
13704
14339
|
evalCases = (config.tests ?? []).map((test) => {
|
|
13705
14340
|
const input = typeof test.input === "string" ? [{ role: "user", content: test.input }] : test.input;
|
|
13706
14341
|
const question = typeof test.input === "string" ? test.input : test.input.find((m) => m.role === "user")?.content ?? "";
|
|
@@ -13791,11 +14426,11 @@ function computeSummary(results, durationMs) {
|
|
|
13791
14426
|
var TARGET_FILE_CANDIDATES = [".agentv/targets.yaml", ".agentv/targets.yml"];
|
|
13792
14427
|
async function discoverDefaultTarget(repoRoot) {
|
|
13793
14428
|
const cwd = process.cwd();
|
|
13794
|
-
const chain = buildDirectoryChain(
|
|
14429
|
+
const chain = buildDirectoryChain(path38.join(cwd, "_placeholder"), repoRoot);
|
|
13795
14430
|
for (const dir of chain) {
|
|
13796
14431
|
for (const candidate of TARGET_FILE_CANDIDATES) {
|
|
13797
|
-
const targetsPath =
|
|
13798
|
-
if (!
|
|
14432
|
+
const targetsPath = path38.join(dir, candidate);
|
|
14433
|
+
if (!existsSync3(targetsPath)) continue;
|
|
13799
14434
|
try {
|
|
13800
14435
|
const definitions = await readTargetDefinitions(targetsPath);
|
|
13801
14436
|
const defaultTarget = definitions.find((d) => d.name === "default");
|
|
@@ -13809,11 +14444,11 @@ async function discoverDefaultTarget(repoRoot) {
|
|
|
13809
14444
|
async function loadEnvHierarchy(repoRoot) {
|
|
13810
14445
|
const { readFileSync: readFileSync2 } = await import("node:fs");
|
|
13811
14446
|
const cwd = process.cwd();
|
|
13812
|
-
const chain = buildDirectoryChain(
|
|
14447
|
+
const chain = buildDirectoryChain(path38.join(cwd, "_placeholder"), repoRoot);
|
|
13813
14448
|
const envFiles = [];
|
|
13814
14449
|
for (const dir of chain) {
|
|
13815
|
-
const envPath =
|
|
13816
|
-
if (
|
|
14450
|
+
const envPath = path38.join(dir, ".env");
|
|
14451
|
+
if (existsSync3(envPath)) envFiles.push(envPath);
|
|
13817
14452
|
}
|
|
13818
14453
|
for (let i = envFiles.length - 1; i >= 0; i--) {
|
|
13819
14454
|
try {
|
|
@@ -13883,12 +14518,12 @@ var CONFIG_FILE_NAMES = [
|
|
|
13883
14518
|
".agentv/config.js"
|
|
13884
14519
|
];
|
|
13885
14520
|
async function loadTsConfig(projectRoot) {
|
|
13886
|
-
const { existsSync:
|
|
14521
|
+
const { existsSync: existsSync4 } = await import("node:fs");
|
|
13887
14522
|
const { pathToFileURL } = await import("node:url");
|
|
13888
14523
|
const { join: join2 } = await import("node:path");
|
|
13889
14524
|
for (const fileName of CONFIG_FILE_NAMES) {
|
|
13890
14525
|
const filePath = join2(projectRoot, fileName);
|
|
13891
|
-
if (!
|
|
14526
|
+
if (!existsSync4(filePath)) {
|
|
13892
14527
|
continue;
|
|
13893
14528
|
}
|
|
13894
14529
|
try {
|
|
@@ -13985,8 +14620,8 @@ function buildPrompt(criteria, question, referenceAnswer) {
|
|
|
13985
14620
|
}
|
|
13986
14621
|
|
|
13987
14622
|
// src/evaluation/cache/response-cache.ts
|
|
13988
|
-
import { mkdir as
|
|
13989
|
-
import
|
|
14623
|
+
import { mkdir as mkdir13, readFile as readFile11, writeFile as writeFile8 } from "node:fs/promises";
|
|
14624
|
+
import path39 from "node:path";
|
|
13990
14625
|
var DEFAULT_CACHE_PATH = ".agentv/cache";
|
|
13991
14626
|
var ResponseCache = class {
|
|
13992
14627
|
cachePath;
|
|
@@ -14004,13 +14639,13 @@ var ResponseCache = class {
|
|
|
14004
14639
|
}
|
|
14005
14640
|
async set(key, value) {
|
|
14006
14641
|
const filePath = this.keyToPath(key);
|
|
14007
|
-
const dir =
|
|
14008
|
-
await
|
|
14009
|
-
await
|
|
14642
|
+
const dir = path39.dirname(filePath);
|
|
14643
|
+
await mkdir13(dir, { recursive: true });
|
|
14644
|
+
await writeFile8(filePath, JSON.stringify(value, null, 2), "utf8");
|
|
14010
14645
|
}
|
|
14011
14646
|
keyToPath(key) {
|
|
14012
14647
|
const prefix = key.slice(0, 2);
|
|
14013
|
-
return
|
|
14648
|
+
return path39.join(this.cachePath, prefix, `${key}.json`);
|
|
14014
14649
|
}
|
|
14015
14650
|
};
|
|
14016
14651
|
function shouldEnableCache(params) {
|
|
@@ -14483,6 +15118,7 @@ export {
|
|
|
14483
15118
|
OtelTraceExporter,
|
|
14484
15119
|
OtlpJsonFileExporter,
|
|
14485
15120
|
ProviderRegistry,
|
|
15121
|
+
RepoManager,
|
|
14486
15122
|
ResponseCache,
|
|
14487
15123
|
SimpleTraceFileExporter,
|
|
14488
15124
|
TEST_MESSAGE_ROLES,
|
|
@@ -14568,12 +15204,19 @@ export {
|
|
|
14568
15204
|
resolveTargetDefinition,
|
|
14569
15205
|
resolveWorkspaceTemplate,
|
|
14570
15206
|
rubricEvaluationSchema,
|
|
15207
|
+
runContainsAllAssertion,
|
|
15208
|
+
runContainsAnyAssertion,
|
|
14571
15209
|
runContainsAssertion,
|
|
15210
|
+
runEndsWithAssertion,
|
|
14572
15211
|
runEqualsAssertion,
|
|
14573
15212
|
runEvalCase,
|
|
14574
15213
|
runEvaluation,
|
|
15214
|
+
runIcontainsAllAssertion,
|
|
15215
|
+
runIcontainsAnyAssertion,
|
|
15216
|
+
runIcontainsAssertion,
|
|
14575
15217
|
runIsJsonAssertion,
|
|
14576
15218
|
runRegexAssertion,
|
|
15219
|
+
runStartsWithAssertion,
|
|
14577
15220
|
scoreToVerdict,
|
|
14578
15221
|
shouldEnableCache,
|
|
14579
15222
|
shouldSkipCacheForTemperature,
|