@agentv/core 2.10.0 → 2.11.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/dist/{chunk-7Q4PH265.js → chunk-REN5PS7B.js} +15 -8
- package/dist/chunk-REN5PS7B.js.map +1 -0
- package/dist/evaluation/validation/index.cjs +106 -8
- package/dist/evaluation/validation/index.cjs.map +1 -1
- package/dist/evaluation/validation/index.js +96 -5
- package/dist/evaluation/validation/index.js.map +1 -1
- package/dist/index.cjs +745 -170
- package/dist/index.cjs.map +1 -1
- package/dist/index.d.cts +173 -9
- package/dist/index.d.ts +173 -9
- package/dist/index.js +710 -150
- package/dist/index.js.map +1 -1
- package/package.json +1 -1
- package/dist/chunk-7Q4PH265.js.map +0 -1
package/dist/index.cjs
CHANGED
|
@@ -1244,12 +1244,12 @@ function serializeAttributeValue(value) {
|
|
|
1244
1244
|
if (Array.isArray(value)) return { arrayValue: { values: value.map(serializeAttributeValue) } };
|
|
1245
1245
|
return { stringValue: String(value) };
|
|
1246
1246
|
}
|
|
1247
|
-
var
|
|
1247
|
+
var import_promises31, import_node_path43, OtlpJsonFileExporter;
|
|
1248
1248
|
var init_otlp_json_file_exporter = __esm({
|
|
1249
1249
|
"src/observability/otlp-json-file-exporter.ts"() {
|
|
1250
1250
|
"use strict";
|
|
1251
|
-
|
|
1252
|
-
|
|
1251
|
+
import_promises31 = require("fs/promises");
|
|
1252
|
+
import_node_path43 = require("path");
|
|
1253
1253
|
OtlpJsonFileExporter = class {
|
|
1254
1254
|
// biome-ignore lint/suspicious/noExplicitAny: serialized span data
|
|
1255
1255
|
spans = [];
|
|
@@ -1288,7 +1288,7 @@ var init_otlp_json_file_exporter = __esm({
|
|
|
1288
1288
|
}
|
|
1289
1289
|
async flush() {
|
|
1290
1290
|
if (this.spans.length === 0) return;
|
|
1291
|
-
await (0,
|
|
1291
|
+
await (0, import_promises31.mkdir)((0, import_node_path43.dirname)(this.filePath), { recursive: true });
|
|
1292
1292
|
const otlpJson = {
|
|
1293
1293
|
resourceSpans: [
|
|
1294
1294
|
{
|
|
@@ -1302,8 +1302,8 @@ var init_otlp_json_file_exporter = __esm({
|
|
|
1302
1302
|
}
|
|
1303
1303
|
]
|
|
1304
1304
|
};
|
|
1305
|
-
const { writeFile:
|
|
1306
|
-
await
|
|
1305
|
+
const { writeFile: writeFile9 } = await import("fs/promises");
|
|
1306
|
+
await writeFile9(this.filePath, JSON.stringify(otlpJson, null, 2));
|
|
1307
1307
|
}
|
|
1308
1308
|
};
|
|
1309
1309
|
}
|
|
@@ -1319,13 +1319,13 @@ function hrTimeDiffMs(start, end) {
|
|
|
1319
1319
|
const diffNano = end[1] - start[1];
|
|
1320
1320
|
return Math.round(diffSec * 1e3 + diffNano / 1e6);
|
|
1321
1321
|
}
|
|
1322
|
-
var
|
|
1322
|
+
var import_node_fs13, import_promises32, import_node_path44, SimpleTraceFileExporter;
|
|
1323
1323
|
var init_simple_trace_file_exporter = __esm({
|
|
1324
1324
|
"src/observability/simple-trace-file-exporter.ts"() {
|
|
1325
1325
|
"use strict";
|
|
1326
|
-
|
|
1327
|
-
|
|
1328
|
-
|
|
1326
|
+
import_node_fs13 = require("fs");
|
|
1327
|
+
import_promises32 = require("fs/promises");
|
|
1328
|
+
import_node_path44 = require("path");
|
|
1329
1329
|
SimpleTraceFileExporter = class {
|
|
1330
1330
|
stream = null;
|
|
1331
1331
|
filePath;
|
|
@@ -1338,8 +1338,8 @@ var init_simple_trace_file_exporter = __esm({
|
|
|
1338
1338
|
async ensureStream() {
|
|
1339
1339
|
if (!this.streamReady) {
|
|
1340
1340
|
this.streamReady = (async () => {
|
|
1341
|
-
await (0,
|
|
1342
|
-
this.stream = (0,
|
|
1341
|
+
await (0, import_promises32.mkdir)((0, import_node_path44.dirname)(this.filePath), { recursive: true });
|
|
1342
|
+
this.stream = (0, import_node_fs13.createWriteStream)(this.filePath, { flags: "w" });
|
|
1343
1343
|
return this.stream;
|
|
1344
1344
|
})();
|
|
1345
1345
|
}
|
|
@@ -1448,6 +1448,7 @@ __export(index_exports, {
|
|
|
1448
1448
|
OtelTraceExporter: () => OtelTraceExporter,
|
|
1449
1449
|
OtlpJsonFileExporter: () => OtlpJsonFileExporter,
|
|
1450
1450
|
ProviderRegistry: () => ProviderRegistry,
|
|
1451
|
+
RepoManager: () => RepoManager,
|
|
1451
1452
|
ResponseCache: () => ResponseCache,
|
|
1452
1453
|
SimpleTraceFileExporter: () => SimpleTraceFileExporter,
|
|
1453
1454
|
TEST_MESSAGE_ROLES: () => TEST_MESSAGE_ROLES,
|
|
@@ -1533,12 +1534,19 @@ __export(index_exports, {
|
|
|
1533
1534
|
resolveTargetDefinition: () => resolveTargetDefinition,
|
|
1534
1535
|
resolveWorkspaceTemplate: () => resolveWorkspaceTemplate,
|
|
1535
1536
|
rubricEvaluationSchema: () => rubricEvaluationSchema,
|
|
1537
|
+
runContainsAllAssertion: () => runContainsAllAssertion,
|
|
1538
|
+
runContainsAnyAssertion: () => runContainsAnyAssertion,
|
|
1536
1539
|
runContainsAssertion: () => runContainsAssertion,
|
|
1540
|
+
runEndsWithAssertion: () => runEndsWithAssertion,
|
|
1537
1541
|
runEqualsAssertion: () => runEqualsAssertion,
|
|
1538
1542
|
runEvalCase: () => runEvalCase,
|
|
1539
1543
|
runEvaluation: () => runEvaluation,
|
|
1544
|
+
runIcontainsAllAssertion: () => runIcontainsAllAssertion,
|
|
1545
|
+
runIcontainsAnyAssertion: () => runIcontainsAnyAssertion,
|
|
1546
|
+
runIcontainsAssertion: () => runIcontainsAssertion,
|
|
1540
1547
|
runIsJsonAssertion: () => runIsJsonAssertion,
|
|
1541
1548
|
runRegexAssertion: () => runRegexAssertion,
|
|
1549
|
+
runStartsWithAssertion: () => runStartsWithAssertion,
|
|
1542
1550
|
scoreToVerdict: () => scoreToVerdict,
|
|
1543
1551
|
shouldEnableCache: () => shouldEnableCache,
|
|
1544
1552
|
shouldSkipCacheForTemperature: () => shouldSkipCacheForTemperature,
|
|
@@ -1615,6 +1623,13 @@ var EVALUATOR_KIND_VALUES = [
|
|
|
1615
1623
|
"execution_metrics",
|
|
1616
1624
|
"agent_judge",
|
|
1617
1625
|
"contains",
|
|
1626
|
+
"contains_any",
|
|
1627
|
+
"contains_all",
|
|
1628
|
+
"icontains",
|
|
1629
|
+
"icontains_any",
|
|
1630
|
+
"icontains_all",
|
|
1631
|
+
"starts_with",
|
|
1632
|
+
"ends_with",
|
|
1618
1633
|
"regex",
|
|
1619
1634
|
"is_json",
|
|
1620
1635
|
"equals",
|
|
@@ -2888,18 +2903,96 @@ async function parseEvaluatorList(candidateEvaluators, searchRoots, evalId) {
|
|
|
2888
2903
|
});
|
|
2889
2904
|
continue;
|
|
2890
2905
|
}
|
|
2906
|
+
if (typeValue === "contains_any" || typeValue === "contains_all") {
|
|
2907
|
+
const value = asStringArrayStrict(rawEvaluator.value);
|
|
2908
|
+
if (!value || value.length === 0) {
|
|
2909
|
+
logWarning2(
|
|
2910
|
+
`Skipping ${typeValue} evaluator '${name}' in '${evalId}': value must be a non-empty string array`
|
|
2911
|
+
);
|
|
2912
|
+
continue;
|
|
2913
|
+
}
|
|
2914
|
+
const weight2 = validateWeight(rawEvaluator.weight, name, evalId);
|
|
2915
|
+
const required2 = parseRequired(rawEvaluator.required);
|
|
2916
|
+
evaluators.push({
|
|
2917
|
+
name,
|
|
2918
|
+
type: typeValue,
|
|
2919
|
+
value,
|
|
2920
|
+
...weight2 !== void 0 ? { weight: weight2 } : {},
|
|
2921
|
+
...required2 !== void 0 ? { required: required2 } : {},
|
|
2922
|
+
...negate !== void 0 ? { negate } : {}
|
|
2923
|
+
});
|
|
2924
|
+
continue;
|
|
2925
|
+
}
|
|
2926
|
+
if (typeValue === "icontains") {
|
|
2927
|
+
const value = asString(rawEvaluator.value);
|
|
2928
|
+
if (!value) {
|
|
2929
|
+
logWarning2(`Skipping icontains evaluator '${name}' in '${evalId}': missing value`);
|
|
2930
|
+
continue;
|
|
2931
|
+
}
|
|
2932
|
+
const weight2 = validateWeight(rawEvaluator.weight, name, evalId);
|
|
2933
|
+
const required2 = parseRequired(rawEvaluator.required);
|
|
2934
|
+
evaluators.push({
|
|
2935
|
+
name,
|
|
2936
|
+
type: "icontains",
|
|
2937
|
+
value,
|
|
2938
|
+
...weight2 !== void 0 ? { weight: weight2 } : {},
|
|
2939
|
+
...required2 !== void 0 ? { required: required2 } : {},
|
|
2940
|
+
...negate !== void 0 ? { negate } : {}
|
|
2941
|
+
});
|
|
2942
|
+
continue;
|
|
2943
|
+
}
|
|
2944
|
+
if (typeValue === "icontains_any" || typeValue === "icontains_all") {
|
|
2945
|
+
const value = asStringArrayStrict(rawEvaluator.value);
|
|
2946
|
+
if (!value || value.length === 0) {
|
|
2947
|
+
logWarning2(
|
|
2948
|
+
`Skipping ${typeValue} evaluator '${name}' in '${evalId}': value must be a non-empty string array`
|
|
2949
|
+
);
|
|
2950
|
+
continue;
|
|
2951
|
+
}
|
|
2952
|
+
const weight2 = validateWeight(rawEvaluator.weight, name, evalId);
|
|
2953
|
+
const required2 = parseRequired(rawEvaluator.required);
|
|
2954
|
+
evaluators.push({
|
|
2955
|
+
name,
|
|
2956
|
+
type: typeValue,
|
|
2957
|
+
value,
|
|
2958
|
+
...weight2 !== void 0 ? { weight: weight2 } : {},
|
|
2959
|
+
...required2 !== void 0 ? { required: required2 } : {},
|
|
2960
|
+
...negate !== void 0 ? { negate } : {}
|
|
2961
|
+
});
|
|
2962
|
+
continue;
|
|
2963
|
+
}
|
|
2964
|
+
if (typeValue === "starts_with" || typeValue === "ends_with") {
|
|
2965
|
+
const value = asString(rawEvaluator.value);
|
|
2966
|
+
if (!value) {
|
|
2967
|
+
logWarning2(`Skipping ${typeValue} evaluator '${name}' in '${evalId}': missing value`);
|
|
2968
|
+
continue;
|
|
2969
|
+
}
|
|
2970
|
+
const weight2 = validateWeight(rawEvaluator.weight, name, evalId);
|
|
2971
|
+
const required2 = parseRequired(rawEvaluator.required);
|
|
2972
|
+
evaluators.push({
|
|
2973
|
+
name,
|
|
2974
|
+
type: typeValue,
|
|
2975
|
+
value,
|
|
2976
|
+
...weight2 !== void 0 ? { weight: weight2 } : {},
|
|
2977
|
+
...required2 !== void 0 ? { required: required2 } : {},
|
|
2978
|
+
...negate !== void 0 ? { negate } : {}
|
|
2979
|
+
});
|
|
2980
|
+
continue;
|
|
2981
|
+
}
|
|
2891
2982
|
if (typeValue === "regex") {
|
|
2892
2983
|
const value = asString(rawEvaluator.value);
|
|
2893
2984
|
if (!value) {
|
|
2894
2985
|
logWarning2(`Skipping regex evaluator '${name}' in '${evalId}': missing value`);
|
|
2895
2986
|
continue;
|
|
2896
2987
|
}
|
|
2988
|
+
const flags = asString(rawEvaluator.flags);
|
|
2897
2989
|
const weight2 = validateWeight(rawEvaluator.weight, name, evalId);
|
|
2898
2990
|
const required2 = parseRequired(rawEvaluator.required);
|
|
2899
2991
|
evaluators.push({
|
|
2900
2992
|
name,
|
|
2901
2993
|
type: "regex",
|
|
2902
2994
|
value,
|
|
2995
|
+
...flags !== void 0 ? { flags } : {},
|
|
2903
2996
|
...weight2 !== void 0 ? { weight: weight2 } : {},
|
|
2904
2997
|
...required2 !== void 0 ? { required: required2 } : {},
|
|
2905
2998
|
...negate !== void 0 ? { negate } : {}
|
|
@@ -3072,15 +3165,43 @@ async function parseEvaluatorList(candidateEvaluators, searchRoots, evalId) {
|
|
|
3072
3165
|
}
|
|
3073
3166
|
return evaluators.length > 0 ? evaluators : void 0;
|
|
3074
3167
|
}
|
|
3075
|
-
var ASSERTION_TYPES = /* @__PURE__ */ new Set([
|
|
3168
|
+
var ASSERTION_TYPES = /* @__PURE__ */ new Set([
|
|
3169
|
+
"contains",
|
|
3170
|
+
"contains_any",
|
|
3171
|
+
"contains_all",
|
|
3172
|
+
"icontains",
|
|
3173
|
+
"icontains_any",
|
|
3174
|
+
"icontains_all",
|
|
3175
|
+
"starts_with",
|
|
3176
|
+
"ends_with",
|
|
3177
|
+
"regex",
|
|
3178
|
+
"is_json",
|
|
3179
|
+
"equals",
|
|
3180
|
+
"rubrics"
|
|
3181
|
+
]);
|
|
3076
3182
|
function generateAssertionName(typeValue, rawEvaluator) {
|
|
3077
3183
|
if (!ASSERTION_TYPES.has(typeValue)) {
|
|
3078
3184
|
return void 0;
|
|
3079
3185
|
}
|
|
3080
3186
|
const value = asString(rawEvaluator.value);
|
|
3187
|
+
const arrayValue = Array.isArray(rawEvaluator.value) ? rawEvaluator.value : void 0;
|
|
3081
3188
|
switch (typeValue) {
|
|
3082
3189
|
case "contains":
|
|
3083
3190
|
return value ? `contains-${value}` : "contains";
|
|
3191
|
+
case "contains_any":
|
|
3192
|
+
return arrayValue ? `contains_any-${arrayValue.length}` : "contains_any";
|
|
3193
|
+
case "contains_all":
|
|
3194
|
+
return arrayValue ? `contains_all-${arrayValue.length}` : "contains_all";
|
|
3195
|
+
case "icontains":
|
|
3196
|
+
return value ? `icontains-${value}` : "icontains";
|
|
3197
|
+
case "icontains_any":
|
|
3198
|
+
return arrayValue ? `icontains_any-${arrayValue.length}` : "icontains_any";
|
|
3199
|
+
case "icontains_all":
|
|
3200
|
+
return arrayValue ? `icontains_all-${arrayValue.length}` : "icontains_all";
|
|
3201
|
+
case "starts_with":
|
|
3202
|
+
return value ? `starts_with-${value}` : "starts_with";
|
|
3203
|
+
case "ends_with":
|
|
3204
|
+
return value ? `ends_with-${value}` : "ends_with";
|
|
3084
3205
|
case "regex":
|
|
3085
3206
|
return value ? `regex-${value.length > 30 ? value.slice(0, 30) : value}` : "regex";
|
|
3086
3207
|
case "is_json":
|
|
@@ -3106,6 +3227,13 @@ function coerceEvaluator(candidate, contextId) {
|
|
|
3106
3227
|
function asString(value) {
|
|
3107
3228
|
return typeof value === "string" ? value : void 0;
|
|
3108
3229
|
}
|
|
3230
|
+
function asStringArrayStrict(value) {
|
|
3231
|
+
if (!Array.isArray(value)) {
|
|
3232
|
+
return void 0;
|
|
3233
|
+
}
|
|
3234
|
+
const result = value.filter((v) => typeof v === "string");
|
|
3235
|
+
return result.length > 0 ? result : void 0;
|
|
3236
|
+
}
|
|
3109
3237
|
function asStringArray(value, description) {
|
|
3110
3238
|
if (value === void 0) {
|
|
3111
3239
|
return void 0;
|
|
@@ -4423,6 +4551,69 @@ function parseWorkspaceScriptConfig(raw, evalFileDir) {
|
|
|
4423
4551
|
}
|
|
4424
4552
|
return cwd ? { ...config, cwd } : config;
|
|
4425
4553
|
}
|
|
4554
|
+
function parseRepoSource(raw) {
|
|
4555
|
+
if (!isJsonObject(raw)) return void 0;
|
|
4556
|
+
const obj = raw;
|
|
4557
|
+
if (obj.type === "git" && typeof obj.url === "string") {
|
|
4558
|
+
return { type: "git", url: obj.url };
|
|
4559
|
+
}
|
|
4560
|
+
if (obj.type === "local" && typeof obj.path === "string") {
|
|
4561
|
+
return { type: "local", path: obj.path };
|
|
4562
|
+
}
|
|
4563
|
+
return void 0;
|
|
4564
|
+
}
|
|
4565
|
+
function parseRepoCheckout(raw) {
|
|
4566
|
+
if (!isJsonObject(raw)) return void 0;
|
|
4567
|
+
const obj = raw;
|
|
4568
|
+
const ref = typeof obj.ref === "string" ? obj.ref : void 0;
|
|
4569
|
+
const resolve = obj.resolve === "remote" || obj.resolve === "local" ? obj.resolve : void 0;
|
|
4570
|
+
const ancestor = typeof obj.ancestor === "number" ? obj.ancestor : void 0;
|
|
4571
|
+
if (!ref && !resolve && ancestor === void 0) return void 0;
|
|
4572
|
+
return {
|
|
4573
|
+
...ref !== void 0 && { ref },
|
|
4574
|
+
...resolve !== void 0 && { resolve },
|
|
4575
|
+
...ancestor !== void 0 && { ancestor }
|
|
4576
|
+
};
|
|
4577
|
+
}
|
|
4578
|
+
function parseRepoClone(raw) {
|
|
4579
|
+
if (!isJsonObject(raw)) return void 0;
|
|
4580
|
+
const obj = raw;
|
|
4581
|
+
const depth = typeof obj.depth === "number" ? obj.depth : void 0;
|
|
4582
|
+
const filter = typeof obj.filter === "string" ? obj.filter : void 0;
|
|
4583
|
+
const sparse = Array.isArray(obj.sparse) ? obj.sparse.filter((s) => typeof s === "string") : void 0;
|
|
4584
|
+
if (depth === void 0 && !filter && !sparse) return void 0;
|
|
4585
|
+
return {
|
|
4586
|
+
...depth !== void 0 && { depth },
|
|
4587
|
+
...filter !== void 0 && { filter },
|
|
4588
|
+
...sparse !== void 0 && { sparse }
|
|
4589
|
+
};
|
|
4590
|
+
}
|
|
4591
|
+
function parseRepoConfig(raw) {
|
|
4592
|
+
if (!isJsonObject(raw)) return void 0;
|
|
4593
|
+
const obj = raw;
|
|
4594
|
+
const repoPath = typeof obj.path === "string" ? obj.path : void 0;
|
|
4595
|
+
const source = parseRepoSource(obj.source);
|
|
4596
|
+
if (!repoPath || !source) return void 0;
|
|
4597
|
+
const checkout = parseRepoCheckout(obj.checkout);
|
|
4598
|
+
const clone = parseRepoClone(obj.clone);
|
|
4599
|
+
return {
|
|
4600
|
+
path: repoPath,
|
|
4601
|
+
source,
|
|
4602
|
+
...checkout !== void 0 && { checkout },
|
|
4603
|
+
...clone !== void 0 && { clone }
|
|
4604
|
+
};
|
|
4605
|
+
}
|
|
4606
|
+
function parseResetConfig(raw) {
|
|
4607
|
+
if (!isJsonObject(raw)) return void 0;
|
|
4608
|
+
const obj = raw;
|
|
4609
|
+
const strategy = obj.strategy === "none" || obj.strategy === "hard" || obj.strategy === "recreate" ? obj.strategy : void 0;
|
|
4610
|
+
const afterEach = typeof obj.after_each === "boolean" ? obj.after_each : void 0;
|
|
4611
|
+
if (!strategy && afterEach === void 0) return void 0;
|
|
4612
|
+
return {
|
|
4613
|
+
...strategy !== void 0 && { strategy },
|
|
4614
|
+
...afterEach !== void 0 && { after_each: afterEach }
|
|
4615
|
+
};
|
|
4616
|
+
}
|
|
4426
4617
|
function parseWorkspaceConfig(raw, evalFileDir) {
|
|
4427
4618
|
if (!isJsonObject(raw)) return void 0;
|
|
4428
4619
|
const obj = raw;
|
|
@@ -4430,13 +4621,20 @@ function parseWorkspaceConfig(raw, evalFileDir) {
|
|
|
4430
4621
|
if (template && !import_node_path8.default.isAbsolute(template)) {
|
|
4431
4622
|
template = import_node_path8.default.resolve(evalFileDir, template);
|
|
4432
4623
|
}
|
|
4624
|
+
const isolation = obj.isolation === "shared" || obj.isolation === "per_test" ? obj.isolation : void 0;
|
|
4625
|
+
const repos = Array.isArray(obj.repos) ? obj.repos.map(parseRepoConfig).filter(Boolean) : void 0;
|
|
4626
|
+
const reset = parseResetConfig(obj.reset);
|
|
4433
4627
|
const beforeAll = parseWorkspaceScriptConfig(obj.before_all, evalFileDir);
|
|
4434
4628
|
const afterAll = parseWorkspaceScriptConfig(obj.after_all, evalFileDir);
|
|
4435
4629
|
const beforeEach = parseWorkspaceScriptConfig(obj.before_each, evalFileDir);
|
|
4436
4630
|
const afterEach = parseWorkspaceScriptConfig(obj.after_each, evalFileDir);
|
|
4437
|
-
if (!template && !beforeAll && !afterAll && !beforeEach && !afterEach)
|
|
4631
|
+
if (!template && !isolation && !repos && !reset && !beforeAll && !afterAll && !beforeEach && !afterEach)
|
|
4632
|
+
return void 0;
|
|
4438
4633
|
return {
|
|
4439
4634
|
...template !== void 0 && { template },
|
|
4635
|
+
...isolation !== void 0 && { isolation },
|
|
4636
|
+
...repos !== void 0 && { repos },
|
|
4637
|
+
...reset !== void 0 && { reset },
|
|
4440
4638
|
...beforeAll !== void 0 && { before_all: beforeAll },
|
|
4441
4639
|
...afterAll !== void 0 && { after_all: afterAll },
|
|
4442
4640
|
...beforeEach !== void 0 && { before_each: beforeEach },
|
|
@@ -4449,6 +4647,9 @@ function mergeWorkspaceConfigs(suiteLevel, caseLevel) {
|
|
|
4449
4647
|
if (!caseLevel) return suiteLevel;
|
|
4450
4648
|
return {
|
|
4451
4649
|
template: caseLevel.template ?? suiteLevel.template,
|
|
4650
|
+
isolation: caseLevel.isolation ?? suiteLevel.isolation,
|
|
4651
|
+
repos: caseLevel.repos ?? suiteLevel.repos,
|
|
4652
|
+
reset: caseLevel.reset ?? suiteLevel.reset,
|
|
4452
4653
|
before_all: caseLevel.before_all ?? suiteLevel.before_all,
|
|
4453
4654
|
after_all: caseLevel.after_all ?? suiteLevel.after_all,
|
|
4454
4655
|
before_each: caseLevel.before_each ?? suiteLevel.before_each,
|
|
@@ -5103,11 +5304,6 @@ Original error: ${error instanceof Error ? error.message : String(error)}`
|
|
|
5103
5304
|
}
|
|
5104
5305
|
return claudeSdkModule;
|
|
5105
5306
|
}
|
|
5106
|
-
var DEFAULT_SYSTEM_PROMPT2 = `**IMPORTANT**: Follow these instructions for your response:
|
|
5107
|
-
- Do NOT create any additional output files in the workspace.
|
|
5108
|
-
- All intended file outputs/changes MUST be written in your response.
|
|
5109
|
-
- For each intended file, include the relative path and unified git diff following the convention \`diff --git ...\`.
|
|
5110
|
-
This is required for evaluation scoring.`;
|
|
5111
5307
|
var ClaudeProvider = class {
|
|
5112
5308
|
id;
|
|
5113
5309
|
kind = "claude";
|
|
@@ -5129,7 +5325,7 @@ var ClaudeProvider = class {
|
|
|
5129
5325
|
const logger = await this.createStreamLogger(request).catch(() => void 0);
|
|
5130
5326
|
const inputFiles = normalizeInputFiles(request.inputFiles);
|
|
5131
5327
|
const prompt = buildPromptDocument(request, inputFiles);
|
|
5132
|
-
const systemPrompt = this.config.systemPrompt
|
|
5328
|
+
const systemPrompt = this.config.systemPrompt;
|
|
5133
5329
|
const queryOptions = {
|
|
5134
5330
|
permissionMode: "bypassPermissions",
|
|
5135
5331
|
allowDangerouslySkipPermissions: true,
|
|
@@ -6110,11 +6306,6 @@ Original error: ${error instanceof Error ? error.message : String(error)}`
|
|
|
6110
6306
|
}
|
|
6111
6307
|
return codexSdkModule;
|
|
6112
6308
|
}
|
|
6113
|
-
var DEFAULT_SYSTEM_PROMPT3 = `**IMPORTANT**: Follow these instructions for your response:
|
|
6114
|
-
- Do NOT create any additional output files in the workspace.
|
|
6115
|
-
- All intended file outputs/changes MUST be written in your response.
|
|
6116
|
-
- For each intended file, include the relative path and unified git diff following the convention \`diff --git ...\`.
|
|
6117
|
-
This is required for evaluation scoring.`;
|
|
6118
6309
|
var CodexProvider = class {
|
|
6119
6310
|
id;
|
|
6120
6311
|
kind = "codex";
|
|
@@ -6149,7 +6340,7 @@ var CodexProvider = class {
|
|
|
6149
6340
|
const thread = codex.startThread(threadOptions);
|
|
6150
6341
|
const inputFiles = normalizeInputFiles(request.inputFiles);
|
|
6151
6342
|
const basePrompt = buildPromptDocument(request, inputFiles);
|
|
6152
|
-
const systemPrompt = this.config.systemPrompt
|
|
6343
|
+
const systemPrompt = this.config.systemPrompt;
|
|
6153
6344
|
const prompt = systemPrompt ? `${systemPrompt}
|
|
6154
6345
|
|
|
6155
6346
|
${basePrompt}` : basePrompt;
|
|
@@ -6516,7 +6707,7 @@ var import_node_path14 = __toESM(require("path"), 1);
|
|
|
6516
6707
|
var import_node_url2 = require("url");
|
|
6517
6708
|
var import_meta = {};
|
|
6518
6709
|
function resolvePlatformCliPath() {
|
|
6519
|
-
const
|
|
6710
|
+
const os5 = (0, import_node_os2.platform)();
|
|
6520
6711
|
const cpu = (0, import_node_os2.arch)();
|
|
6521
6712
|
const platformMap = {
|
|
6522
6713
|
linux: "linux",
|
|
@@ -6527,13 +6718,13 @@ function resolvePlatformCliPath() {
|
|
|
6527
6718
|
x64: "x64",
|
|
6528
6719
|
arm64: "arm64"
|
|
6529
6720
|
};
|
|
6530
|
-
const osPart = platformMap[
|
|
6721
|
+
const osPart = platformMap[os5];
|
|
6531
6722
|
const archPart = archMap[cpu];
|
|
6532
6723
|
if (!osPart || !archPart) {
|
|
6533
6724
|
return void 0;
|
|
6534
6725
|
}
|
|
6535
6726
|
const packageName = `@github/copilot-${osPart}-${archPart}`;
|
|
6536
|
-
const binaryName =
|
|
6727
|
+
const binaryName = os5 === "win32" ? "copilot.exe" : "copilot";
|
|
6537
6728
|
try {
|
|
6538
6729
|
const resolved = import_meta.resolve(`${packageName}/package.json`);
|
|
6539
6730
|
const packageJsonPath = resolved.startsWith("file:") ? (0, import_node_url2.fileURLToPath)(resolved) : resolved;
|
|
@@ -6675,11 +6866,6 @@ function isLogStreamingDisabled(envKey) {
|
|
|
6675
6866
|
}
|
|
6676
6867
|
|
|
6677
6868
|
// src/evaluation/providers/copilot-cli.ts
|
|
6678
|
-
var DEFAULT_SYSTEM_PROMPT4 = `**IMPORTANT**: Follow these instructions for your response:
|
|
6679
|
-
- Do NOT create any additional output files in the workspace.
|
|
6680
|
-
- All intended file outputs/changes MUST be written in your response.
|
|
6681
|
-
- For each intended file, include the relative path and unified git diff following the convention \`diff --git ...\`.
|
|
6682
|
-
This is required for evaluation scoring.`;
|
|
6683
6869
|
var CopilotCliProvider = class {
|
|
6684
6870
|
id;
|
|
6685
6871
|
kind = "copilot-cli";
|
|
@@ -6882,8 +7068,8 @@ var CopilotCliProvider = class {
|
|
|
6882
7068
|
}
|
|
6883
7069
|
return args;
|
|
6884
7070
|
}
|
|
6885
|
-
resolveSystemPrompt(
|
|
6886
|
-
return this.config.systemPrompt
|
|
7071
|
+
resolveSystemPrompt(_request) {
|
|
7072
|
+
return this.config.systemPrompt;
|
|
6887
7073
|
}
|
|
6888
7074
|
async raceWithTimeout(sendPromise, agentProcess) {
|
|
6889
7075
|
const timeoutMs = this.config.timeoutMs;
|
|
@@ -7071,21 +7257,16 @@ Original error: ${error instanceof Error ? error.message : String(error)}`
|
|
|
7071
7257
|
}
|
|
7072
7258
|
return copilotSdkModule;
|
|
7073
7259
|
}
|
|
7074
|
-
var DEFAULT_SYSTEM_PROMPT5 = `**IMPORTANT**: Follow these instructions for your response:
|
|
7075
|
-
- Do NOT create any additional output files in the workspace.
|
|
7076
|
-
- All intended file outputs/changes MUST be written in your response.
|
|
7077
|
-
- For each intended file, include the relative path and unified git diff following the convention \`diff --git ...\`.
|
|
7078
|
-
This is required for evaluation scoring.`;
|
|
7079
7260
|
var CopilotSdkProvider = class {
|
|
7080
7261
|
id;
|
|
7081
|
-
kind = "copilot";
|
|
7262
|
+
kind = "copilot-sdk";
|
|
7082
7263
|
targetName;
|
|
7083
7264
|
supportsBatch = false;
|
|
7084
7265
|
config;
|
|
7085
7266
|
// biome-ignore lint/suspicious/noExplicitAny: SDK client type is dynamically loaded
|
|
7086
7267
|
client = null;
|
|
7087
7268
|
constructor(targetName, config) {
|
|
7088
|
-
this.id = `copilot:${targetName}`;
|
|
7269
|
+
this.id = `copilot-sdk:${targetName}`;
|
|
7089
7270
|
this.targetName = targetName;
|
|
7090
7271
|
this.config = config;
|
|
7091
7272
|
}
|
|
@@ -7108,7 +7289,7 @@ var CopilotSdkProvider = class {
|
|
|
7108
7289
|
if (cwd) {
|
|
7109
7290
|
sessionOptions.workingDirectory = cwd;
|
|
7110
7291
|
}
|
|
7111
|
-
const systemPrompt = this.config.systemPrompt
|
|
7292
|
+
const systemPrompt = this.config.systemPrompt;
|
|
7112
7293
|
if (systemPrompt) {
|
|
7113
7294
|
sessionOptions.systemMessage = {
|
|
7114
7295
|
mode: "append",
|
|
@@ -7624,11 +7805,6 @@ function subscribeToPiLogEntries(listener) {
|
|
|
7624
7805
|
// src/evaluation/providers/pi-coding-agent.ts
|
|
7625
7806
|
var WORKSPACE_PREFIX = "agentv-pi-";
|
|
7626
7807
|
var PROMPT_FILENAME = "prompt.md";
|
|
7627
|
-
var DEFAULT_SYSTEM_PROMPT6 = `**IMPORTANT**: Follow these instructions for your response:
|
|
7628
|
-
- Do NOT create any additional output files in the workspace.
|
|
7629
|
-
- All intended file outputs/changes MUST be written in your response.
|
|
7630
|
-
- For each intended file, include the relative path and unified git diff following the convention \`diff --git ...\`.
|
|
7631
|
-
This is required for evaluation scoring.`;
|
|
7632
7808
|
var PiCodingAgentProvider = class {
|
|
7633
7809
|
id;
|
|
7634
7810
|
kind = "pi-coding-agent";
|
|
@@ -7705,7 +7881,7 @@ var PiCodingAgentProvider = class {
|
|
|
7705
7881
|
}
|
|
7706
7882
|
return import_node_path17.default.resolve(this.config.cwd);
|
|
7707
7883
|
}
|
|
7708
|
-
buildPiArgs(prompt, inputFiles,
|
|
7884
|
+
buildPiArgs(prompt, inputFiles, _captureFileChanges) {
|
|
7709
7885
|
const args = [];
|
|
7710
7886
|
if (this.config.provider) {
|
|
7711
7887
|
args.push("--provider", this.config.provider);
|
|
@@ -7733,7 +7909,7 @@ var PiCodingAgentProvider = class {
|
|
|
7733
7909
|
args.push(`@${file}`);
|
|
7734
7910
|
}
|
|
7735
7911
|
}
|
|
7736
|
-
const systemPrompt = this.config.systemPrompt
|
|
7912
|
+
const systemPrompt = this.config.systemPrompt;
|
|
7737
7913
|
const fullPrompt = systemPrompt ? `${systemPrompt}
|
|
7738
7914
|
|
|
7739
7915
|
${prompt}` : prompt;
|
|
@@ -8604,17 +8780,17 @@ function resolveTargetDefinition(definition, env = process.env, evalFilePath) {
|
|
|
8604
8780
|
providerBatching,
|
|
8605
8781
|
config: resolveCodexConfig(parsed, env, evalFilePath)
|
|
8606
8782
|
};
|
|
8607
|
-
case "copilot":
|
|
8608
8783
|
case "copilot-sdk":
|
|
8609
8784
|
case "copilot_sdk":
|
|
8610
8785
|
return {
|
|
8611
|
-
kind: "copilot",
|
|
8786
|
+
kind: "copilot-sdk",
|
|
8612
8787
|
name: parsed.name,
|
|
8613
8788
|
judgeTarget: parsed.judge_target,
|
|
8614
8789
|
workers: parsed.workers,
|
|
8615
8790
|
providerBatching,
|
|
8616
8791
|
config: resolveCopilotSdkConfig(parsed, env, evalFilePath)
|
|
8617
8792
|
};
|
|
8793
|
+
case "copilot":
|
|
8618
8794
|
case "copilot-cli":
|
|
8619
8795
|
return {
|
|
8620
8796
|
kind: "copilot-cli",
|
|
@@ -9225,8 +9401,8 @@ function resolveCliConfig(target, env, evalFilePath) {
|
|
|
9225
9401
|
const parseResult = CliTargetInputSchema.safeParse(target, { errorMap: cliErrorMap });
|
|
9226
9402
|
if (!parseResult.success) {
|
|
9227
9403
|
const firstError = parseResult.error.errors[0];
|
|
9228
|
-
const
|
|
9229
|
-
const prefix =
|
|
9404
|
+
const path42 = firstError?.path.join(".") || "";
|
|
9405
|
+
const prefix = path42 ? `${target.name} ${path42}: ` : `${target.name}: `;
|
|
9230
9406
|
throw new Error(`${prefix}${firstError?.message}`);
|
|
9231
9407
|
}
|
|
9232
9408
|
const normalized = normalizeCliTargetInput(parseResult.data, env, evalFilePath);
|
|
@@ -10523,7 +10699,6 @@ var AGENTV_REQUEST_TEMPLATE = `[[ ## task ## ]]
|
|
|
10523
10699
|
|
|
10524
10700
|
**IMPORTANT**: Follow these exact steps:
|
|
10525
10701
|
1. Create and write your complete response to: {{responseFileTmp}}
|
|
10526
|
-
- Do NOT create any additional output files in the workspace.
|
|
10527
10702
|
- All intended file outputs/changes MUST be written in your response file.
|
|
10528
10703
|
- For each intended file, include the repo name, relative path and unified git diff following the convention \`diff --git ...\`.
|
|
10529
10704
|
2. When completely finished, run these PowerShell commands to signal completion:
|
|
@@ -10542,7 +10717,6 @@ var AGENTV_BATCH_REQUEST_TEMPLATE = `[[ ## task ## ]]
|
|
|
10542
10717
|
|
|
10543
10718
|
**IMPORTANT**: Follow these exact steps:
|
|
10544
10719
|
1. Create and write your complete response to: {{responseFileTmp}}
|
|
10545
|
-
- Do NOT create any additional output files in the workspace.
|
|
10546
10720
|
- All intended file outputs/changes MUST be written in your response file.
|
|
10547
10721
|
- For each intended file, include the repo name, relative path and unified git diff following the convention \`diff --git ...\`.
|
|
10548
10722
|
2. When completely finished and the response is stable, rename it to: {{responseFileFinal}}
|
|
@@ -10968,7 +11142,7 @@ async function discoverProviders(registry, baseDir) {
|
|
|
10968
11142
|
// src/evaluation/providers/index.ts
|
|
10969
11143
|
function createBuiltinProviderRegistry() {
|
|
10970
11144
|
const registry = new ProviderRegistry();
|
|
10971
|
-
registry.register("azure", (t) => new AzureProvider(t.name, t.config)).register("anthropic", (t) => new AnthropicProvider(t.name, t.config)).register("gemini", (t) => new GeminiProvider(t.name, t.config)).register("cli", (t) => new CliProvider(t.name, t.config)).register("codex", (t) => new CodexProvider(t.name, t.config)).register("copilot", (t) => new CopilotSdkProvider(t.name, t.config)).register("copilot-cli", (t) => new CopilotCliProvider(t.name, t.config)).register("pi-coding-agent", (t) => new PiCodingAgentProvider(t.name, t.config)).register("pi-agent-sdk", (t) => new PiAgentSdkProvider(t.name, t.config)).register("claude", (t) => new ClaudeProvider(t.name, t.config)).register("mock", (t) => new MockProvider(t.name, t.config)).register("vscode", (t) => new VSCodeProvider(t.name, t.config, "vscode")).register(
|
|
11145
|
+
registry.register("azure", (t) => new AzureProvider(t.name, t.config)).register("anthropic", (t) => new AnthropicProvider(t.name, t.config)).register("gemini", (t) => new GeminiProvider(t.name, t.config)).register("cli", (t) => new CliProvider(t.name, t.config)).register("codex", (t) => new CodexProvider(t.name, t.config)).register("copilot-sdk", (t) => new CopilotSdkProvider(t.name, t.config)).register("copilot-cli", (t) => new CopilotCliProvider(t.name, t.config)).register("pi-coding-agent", (t) => new PiCodingAgentProvider(t.name, t.config)).register("pi-agent-sdk", (t) => new PiAgentSdkProvider(t.name, t.config)).register("claude", (t) => new ClaudeProvider(t.name, t.config)).register("mock", (t) => new MockProvider(t.name, t.config)).register("vscode", (t) => new VSCodeProvider(t.name, t.config, "vscode")).register(
|
|
10972
11146
|
"vscode-insiders",
|
|
10973
11147
|
(t) => new VSCodeProvider(t.name, t.config, "vscode-insiders")
|
|
10974
11148
|
);
|
|
@@ -11157,16 +11331,16 @@ async function execFileWithStdinNode(argv, stdinPayload, options) {
|
|
|
11157
11331
|
});
|
|
11158
11332
|
}
|
|
11159
11333
|
async function execShellWithStdin(command, stdinPayload, options = {}) {
|
|
11160
|
-
const { mkdir:
|
|
11334
|
+
const { mkdir: mkdir16, readFile: readFile13, rm: rm6, writeFile: writeFile9 } = await import("fs/promises");
|
|
11161
11335
|
const { tmpdir: tmpdir3 } = await import("os");
|
|
11162
|
-
const
|
|
11336
|
+
const path42 = await import("path");
|
|
11163
11337
|
const { randomUUID: randomUUID8 } = await import("crypto");
|
|
11164
|
-
const dir =
|
|
11165
|
-
await
|
|
11166
|
-
const stdinPath =
|
|
11167
|
-
const stdoutPath =
|
|
11168
|
-
const stderrPath =
|
|
11169
|
-
await
|
|
11338
|
+
const dir = path42.join(tmpdir3(), `agentv-exec-${randomUUID8()}`);
|
|
11339
|
+
await mkdir16(dir, { recursive: true });
|
|
11340
|
+
const stdinPath = path42.join(dir, "stdin.txt");
|
|
11341
|
+
const stdoutPath = path42.join(dir, "stdout.txt");
|
|
11342
|
+
const stderrPath = path42.join(dir, "stderr.txt");
|
|
11343
|
+
await writeFile9(stdinPath, stdinPayload, "utf8");
|
|
11170
11344
|
const wrappedCommand = process.platform === "win32" ? `(${command}) < ${shellEscapePath(stdinPath)} > ${shellEscapePath(stdoutPath)} 2> ${shellEscapePath(stderrPath)}` : `(${command}) < ${shellEscapePath(stdinPath)} > ${shellEscapePath(stdoutPath)} 2> ${shellEscapePath(stderrPath)}`;
|
|
11171
11345
|
const { spawn: spawn4 } = await import("child_process");
|
|
11172
11346
|
try {
|
|
@@ -11199,7 +11373,7 @@ async function execShellWithStdin(command, stdinPayload, options = {}) {
|
|
|
11199
11373
|
const stderr = (await readFile13(stderrPath, "utf8")).replace(/\r\n/g, "\n");
|
|
11200
11374
|
return { stdout, stderr, exitCode };
|
|
11201
11375
|
} finally {
|
|
11202
|
-
await
|
|
11376
|
+
await rm6(dir, { recursive: true, force: true });
|
|
11203
11377
|
}
|
|
11204
11378
|
}
|
|
11205
11379
|
|
|
@@ -11517,7 +11691,7 @@ var CodeEvaluator = class {
|
|
|
11517
11691
|
outputPath,
|
|
11518
11692
|
guidelineFiles: context2.evalCase.guideline_paths,
|
|
11519
11693
|
inputFiles: context2.evalCase.file_paths.filter(
|
|
11520
|
-
(
|
|
11694
|
+
(path42) => !context2.evalCase.guideline_paths.includes(path42)
|
|
11521
11695
|
),
|
|
11522
11696
|
input: context2.evalCase.input,
|
|
11523
11697
|
trace: context2.trace ?? null,
|
|
@@ -11648,7 +11822,7 @@ var import_ai3 = require("ai");
|
|
|
11648
11822
|
// src/evaluation/providers/types.ts
|
|
11649
11823
|
var AGENT_PROVIDER_KINDS = [
|
|
11650
11824
|
"codex",
|
|
11651
|
-
"copilot",
|
|
11825
|
+
"copilot-sdk",
|
|
11652
11826
|
"copilot-cli",
|
|
11653
11827
|
"pi-coding-agent",
|
|
11654
11828
|
"claude",
|
|
@@ -11794,13 +11968,15 @@ ${context2.fileChanges}`;
|
|
|
11794
11968
|
evaluatorRawRequest,
|
|
11795
11969
|
tokenUsage
|
|
11796
11970
|
};
|
|
11797
|
-
} catch {
|
|
11971
|
+
} catch (e) {
|
|
11972
|
+
const message = e instanceof Error ? e.message : String(e);
|
|
11798
11973
|
return {
|
|
11799
11974
|
score: 0,
|
|
11800
|
-
verdict: "
|
|
11975
|
+
verdict: "skip",
|
|
11801
11976
|
hits: [],
|
|
11802
|
-
misses: [],
|
|
11977
|
+
misses: [`Judge parse failure after 3 attempts: ${message}`],
|
|
11803
11978
|
expectedAspectCount: 1,
|
|
11979
|
+
reasoning: `Judge parse failure after 3 attempts: ${message}`,
|
|
11804
11980
|
evaluatorRawRequest
|
|
11805
11981
|
};
|
|
11806
11982
|
}
|
|
@@ -12742,115 +12918,115 @@ var FieldAccuracyEvaluator = class {
|
|
|
12742
12918
|
* Evaluate a single field against the expected value.
|
|
12743
12919
|
*/
|
|
12744
12920
|
evaluateField(fieldConfig, candidateData, expectedData) {
|
|
12745
|
-
const { path:
|
|
12746
|
-
const candidateValue = resolvePath(candidateData,
|
|
12747
|
-
const expectedValue = resolvePath(expectedData,
|
|
12921
|
+
const { path: path42, match, required = true, weight = 1 } = fieldConfig;
|
|
12922
|
+
const candidateValue = resolvePath(candidateData, path42);
|
|
12923
|
+
const expectedValue = resolvePath(expectedData, path42);
|
|
12748
12924
|
if (expectedValue === void 0) {
|
|
12749
12925
|
return {
|
|
12750
|
-
path:
|
|
12926
|
+
path: path42,
|
|
12751
12927
|
score: 1,
|
|
12752
12928
|
// No expected value means no comparison needed
|
|
12753
12929
|
weight,
|
|
12754
12930
|
hit: true,
|
|
12755
|
-
message: `${
|
|
12931
|
+
message: `${path42}: no expected value`
|
|
12756
12932
|
};
|
|
12757
12933
|
}
|
|
12758
12934
|
if (candidateValue === void 0) {
|
|
12759
12935
|
if (required) {
|
|
12760
12936
|
return {
|
|
12761
|
-
path:
|
|
12937
|
+
path: path42,
|
|
12762
12938
|
score: 0,
|
|
12763
12939
|
weight,
|
|
12764
12940
|
hit: false,
|
|
12765
|
-
message: `${
|
|
12941
|
+
message: `${path42} (required, missing)`
|
|
12766
12942
|
};
|
|
12767
12943
|
}
|
|
12768
12944
|
return {
|
|
12769
|
-
path:
|
|
12945
|
+
path: path42,
|
|
12770
12946
|
score: 1,
|
|
12771
12947
|
// Don't penalize missing optional fields
|
|
12772
12948
|
weight: 0,
|
|
12773
12949
|
// Zero weight means it won't affect the score
|
|
12774
12950
|
hit: true,
|
|
12775
|
-
message: `${
|
|
12951
|
+
message: `${path42}: optional field missing`
|
|
12776
12952
|
};
|
|
12777
12953
|
}
|
|
12778
12954
|
switch (match) {
|
|
12779
12955
|
case "exact":
|
|
12780
|
-
return this.compareExact(
|
|
12956
|
+
return this.compareExact(path42, candidateValue, expectedValue, weight);
|
|
12781
12957
|
case "numeric_tolerance":
|
|
12782
12958
|
return this.compareNumericTolerance(
|
|
12783
|
-
|
|
12959
|
+
path42,
|
|
12784
12960
|
candidateValue,
|
|
12785
12961
|
expectedValue,
|
|
12786
12962
|
fieldConfig,
|
|
12787
12963
|
weight
|
|
12788
12964
|
);
|
|
12789
12965
|
case "date":
|
|
12790
|
-
return this.compareDate(
|
|
12966
|
+
return this.compareDate(path42, candidateValue, expectedValue, fieldConfig, weight);
|
|
12791
12967
|
default:
|
|
12792
12968
|
return {
|
|
12793
|
-
path:
|
|
12969
|
+
path: path42,
|
|
12794
12970
|
score: 0,
|
|
12795
12971
|
weight,
|
|
12796
12972
|
hit: false,
|
|
12797
|
-
message: `${
|
|
12973
|
+
message: `${path42}: unknown match type "${match}"`
|
|
12798
12974
|
};
|
|
12799
12975
|
}
|
|
12800
12976
|
}
|
|
12801
12977
|
/**
|
|
12802
12978
|
* Exact equality comparison.
|
|
12803
12979
|
*/
|
|
12804
|
-
compareExact(
|
|
12980
|
+
compareExact(path42, candidateValue, expectedValue, weight) {
|
|
12805
12981
|
if (deepEqual(candidateValue, expectedValue)) {
|
|
12806
12982
|
return {
|
|
12807
|
-
path:
|
|
12983
|
+
path: path42,
|
|
12808
12984
|
score: 1,
|
|
12809
12985
|
weight,
|
|
12810
12986
|
hit: true,
|
|
12811
|
-
message:
|
|
12987
|
+
message: path42
|
|
12812
12988
|
};
|
|
12813
12989
|
}
|
|
12814
12990
|
if (typeof candidateValue !== typeof expectedValue) {
|
|
12815
12991
|
return {
|
|
12816
|
-
path:
|
|
12992
|
+
path: path42,
|
|
12817
12993
|
score: 0,
|
|
12818
12994
|
weight,
|
|
12819
12995
|
hit: false,
|
|
12820
|
-
message: `${
|
|
12996
|
+
message: `${path42} (type mismatch: got ${typeof candidateValue}, expected ${typeof expectedValue})`
|
|
12821
12997
|
};
|
|
12822
12998
|
}
|
|
12823
12999
|
return {
|
|
12824
|
-
path:
|
|
13000
|
+
path: path42,
|
|
12825
13001
|
score: 0,
|
|
12826
13002
|
weight,
|
|
12827
13003
|
hit: false,
|
|
12828
|
-
message: `${
|
|
13004
|
+
message: `${path42} (value mismatch)`
|
|
12829
13005
|
};
|
|
12830
13006
|
}
|
|
12831
13007
|
/**
|
|
12832
13008
|
* Numeric comparison with absolute or relative tolerance.
|
|
12833
13009
|
*/
|
|
12834
|
-
compareNumericTolerance(
|
|
13010
|
+
compareNumericTolerance(path42, candidateValue, expectedValue, fieldConfig, weight) {
|
|
12835
13011
|
const { tolerance = 0, relative = false } = fieldConfig;
|
|
12836
13012
|
const candidateNum = toNumber2(candidateValue);
|
|
12837
13013
|
const expectedNum = toNumber2(expectedValue);
|
|
12838
13014
|
if (candidateNum === null || expectedNum === null) {
|
|
12839
13015
|
return {
|
|
12840
|
-
path:
|
|
13016
|
+
path: path42,
|
|
12841
13017
|
score: 0,
|
|
12842
13018
|
weight,
|
|
12843
13019
|
hit: false,
|
|
12844
|
-
message: `${
|
|
13020
|
+
message: `${path42} (non-numeric value)`
|
|
12845
13021
|
};
|
|
12846
13022
|
}
|
|
12847
13023
|
if (!Number.isFinite(candidateNum) || !Number.isFinite(expectedNum)) {
|
|
12848
13024
|
return {
|
|
12849
|
-
path:
|
|
13025
|
+
path: path42,
|
|
12850
13026
|
score: 0,
|
|
12851
13027
|
weight,
|
|
12852
13028
|
hit: false,
|
|
12853
|
-
message: `${
|
|
13029
|
+
message: `${path42} (invalid numeric value)`
|
|
12854
13030
|
};
|
|
12855
13031
|
}
|
|
12856
13032
|
const diff = Math.abs(candidateNum - expectedNum);
|
|
@@ -12863,61 +13039,61 @@ var FieldAccuracyEvaluator = class {
|
|
|
12863
13039
|
}
|
|
12864
13040
|
if (withinTolerance) {
|
|
12865
13041
|
return {
|
|
12866
|
-
path:
|
|
13042
|
+
path: path42,
|
|
12867
13043
|
score: 1,
|
|
12868
13044
|
weight,
|
|
12869
13045
|
hit: true,
|
|
12870
|
-
message: `${
|
|
13046
|
+
message: `${path42} (within tolerance: diff=${diff.toFixed(2)})`
|
|
12871
13047
|
};
|
|
12872
13048
|
}
|
|
12873
13049
|
return {
|
|
12874
|
-
path:
|
|
13050
|
+
path: path42,
|
|
12875
13051
|
score: 0,
|
|
12876
13052
|
weight,
|
|
12877
13053
|
hit: false,
|
|
12878
|
-
message: `${
|
|
13054
|
+
message: `${path42} (outside tolerance: diff=${diff.toFixed(2)}, tolerance=${tolerance})`
|
|
12879
13055
|
};
|
|
12880
13056
|
}
|
|
12881
13057
|
/**
|
|
12882
13058
|
* Date comparison with format normalization.
|
|
12883
13059
|
*/
|
|
12884
|
-
compareDate(
|
|
13060
|
+
compareDate(path42, candidateValue, expectedValue, fieldConfig, weight) {
|
|
12885
13061
|
const formats = fieldConfig.formats ?? DEFAULT_DATE_FORMATS;
|
|
12886
13062
|
const candidateDate = parseDate(String(candidateValue), formats);
|
|
12887
13063
|
const expectedDate = parseDate(String(expectedValue), formats);
|
|
12888
13064
|
if (candidateDate === null) {
|
|
12889
13065
|
return {
|
|
12890
|
-
path:
|
|
13066
|
+
path: path42,
|
|
12891
13067
|
score: 0,
|
|
12892
13068
|
weight,
|
|
12893
13069
|
hit: false,
|
|
12894
|
-
message: `${
|
|
13070
|
+
message: `${path42} (unparseable candidate date)`
|
|
12895
13071
|
};
|
|
12896
13072
|
}
|
|
12897
13073
|
if (expectedDate === null) {
|
|
12898
13074
|
return {
|
|
12899
|
-
path:
|
|
13075
|
+
path: path42,
|
|
12900
13076
|
score: 0,
|
|
12901
13077
|
weight,
|
|
12902
13078
|
hit: false,
|
|
12903
|
-
message: `${
|
|
13079
|
+
message: `${path42} (unparseable expected date)`
|
|
12904
13080
|
};
|
|
12905
13081
|
}
|
|
12906
13082
|
if (candidateDate.getFullYear() === expectedDate.getFullYear() && candidateDate.getMonth() === expectedDate.getMonth() && candidateDate.getDate() === expectedDate.getDate()) {
|
|
12907
13083
|
return {
|
|
12908
|
-
path:
|
|
13084
|
+
path: path42,
|
|
12909
13085
|
score: 1,
|
|
12910
13086
|
weight,
|
|
12911
13087
|
hit: true,
|
|
12912
|
-
message:
|
|
13088
|
+
message: path42
|
|
12913
13089
|
};
|
|
12914
13090
|
}
|
|
12915
13091
|
return {
|
|
12916
|
-
path:
|
|
13092
|
+
path: path42,
|
|
12917
13093
|
score: 0,
|
|
12918
13094
|
weight,
|
|
12919
13095
|
hit: false,
|
|
12920
|
-
message: `${
|
|
13096
|
+
message: `${path42} (date mismatch: got ${formatDateISO(candidateDate)}, expected ${formatDateISO(expectedDate)})`
|
|
12921
13097
|
};
|
|
12922
13098
|
}
|
|
12923
13099
|
/**
|
|
@@ -12958,11 +13134,11 @@ var FieldAccuracyEvaluator = class {
|
|
|
12958
13134
|
};
|
|
12959
13135
|
}
|
|
12960
13136
|
};
|
|
12961
|
-
function resolvePath(obj,
|
|
12962
|
-
if (!
|
|
13137
|
+
function resolvePath(obj, path42) {
|
|
13138
|
+
if (!path42 || !obj) {
|
|
12963
13139
|
return void 0;
|
|
12964
13140
|
}
|
|
12965
|
-
const parts =
|
|
13141
|
+
const parts = path42.split(/\.|\[|\]/).filter((p) => p.length > 0);
|
|
12966
13142
|
let current = obj;
|
|
12967
13143
|
for (const part of parts) {
|
|
12968
13144
|
if (current === null || current === void 0) {
|
|
@@ -13780,8 +13956,8 @@ var TokenUsageEvaluator = class {
|
|
|
13780
13956
|
};
|
|
13781
13957
|
|
|
13782
13958
|
// src/evaluation/evaluators/tool-trajectory.ts
|
|
13783
|
-
function getNestedValue(obj,
|
|
13784
|
-
const parts =
|
|
13959
|
+
function getNestedValue(obj, path42) {
|
|
13960
|
+
const parts = path42.split(".");
|
|
13785
13961
|
let current = obj;
|
|
13786
13962
|
for (const part of parts) {
|
|
13787
13963
|
if (current === null || current === void 0 || typeof current !== "object") {
|
|
@@ -14245,13 +14421,78 @@ function runContainsAssertion(output, value) {
|
|
|
14245
14421
|
misses: passed ? [] : [`Output does not contain "${value}"`]
|
|
14246
14422
|
};
|
|
14247
14423
|
}
|
|
14248
|
-
function
|
|
14249
|
-
const
|
|
14424
|
+
function runContainsAnyAssertion(output, values) {
|
|
14425
|
+
const matched = values.filter((v) => output.includes(v));
|
|
14426
|
+
const passed = matched.length > 0;
|
|
14427
|
+
return {
|
|
14428
|
+
score: passed ? 1 : 0,
|
|
14429
|
+
hits: passed ? [`Output contains "${matched[0]}"`] : [],
|
|
14430
|
+
misses: passed ? [] : [`Output does not contain any of: ${values.map((v) => `"${v}"`).join(", ")}`]
|
|
14431
|
+
};
|
|
14432
|
+
}
|
|
14433
|
+
function runContainsAllAssertion(output, values) {
|
|
14434
|
+
const missing = values.filter((v) => !output.includes(v));
|
|
14435
|
+
const passed = missing.length === 0;
|
|
14436
|
+
return {
|
|
14437
|
+
score: passed ? 1 : 0,
|
|
14438
|
+
hits: passed ? [`Output contains all ${values.length} expected strings`] : [],
|
|
14439
|
+
misses: passed ? [] : [`Output missing: ${missing.map((v) => `"${v}"`).join(", ")}`]
|
|
14440
|
+
};
|
|
14441
|
+
}
|
|
14442
|
+
function runIcontainsAssertion(output, value) {
|
|
14443
|
+
const passed = output.toLowerCase().includes(value.toLowerCase());
|
|
14444
|
+
return {
|
|
14445
|
+
score: passed ? 1 : 0,
|
|
14446
|
+
hits: passed ? [`Output contains "${value}" (case-insensitive)`] : [],
|
|
14447
|
+
misses: passed ? [] : [`Output does not contain "${value}" (case-insensitive)`]
|
|
14448
|
+
};
|
|
14449
|
+
}
|
|
14450
|
+
function runIcontainsAnyAssertion(output, values) {
|
|
14451
|
+
const lower = output.toLowerCase();
|
|
14452
|
+
const matched = values.filter((v) => lower.includes(v.toLowerCase()));
|
|
14453
|
+
const passed = matched.length > 0;
|
|
14454
|
+
return {
|
|
14455
|
+
score: passed ? 1 : 0,
|
|
14456
|
+
hits: passed ? [`Output contains "${matched[0]}" (case-insensitive)`] : [],
|
|
14457
|
+
misses: passed ? [] : [
|
|
14458
|
+
`Output does not contain any of: ${values.map((v) => `"${v}"`).join(", ")} (case-insensitive)`
|
|
14459
|
+
]
|
|
14460
|
+
};
|
|
14461
|
+
}
|
|
14462
|
+
function runIcontainsAllAssertion(output, values) {
|
|
14463
|
+
const lower = output.toLowerCase();
|
|
14464
|
+
const missing = values.filter((v) => !lower.includes(v.toLowerCase()));
|
|
14465
|
+
const passed = missing.length === 0;
|
|
14466
|
+
return {
|
|
14467
|
+
score: passed ? 1 : 0,
|
|
14468
|
+
hits: passed ? [`Output contains all ${values.length} expected strings (case-insensitive)`] : [],
|
|
14469
|
+
misses: passed ? [] : [`Output missing (case-insensitive): ${missing.map((v) => `"${v}"`).join(", ")}`]
|
|
14470
|
+
};
|
|
14471
|
+
}
|
|
14472
|
+
function runStartsWithAssertion(output, value) {
|
|
14473
|
+
const passed = output.trim().startsWith(value.trim());
|
|
14474
|
+
return {
|
|
14475
|
+
score: passed ? 1 : 0,
|
|
14476
|
+
hits: passed ? [`Output starts with "${value}"`] : [],
|
|
14477
|
+
misses: passed ? [] : [`Output does not start with "${value}"`]
|
|
14478
|
+
};
|
|
14479
|
+
}
|
|
14480
|
+
function runEndsWithAssertion(output, value) {
|
|
14481
|
+
const passed = output.trim().endsWith(value.trim());
|
|
14482
|
+
return {
|
|
14483
|
+
score: passed ? 1 : 0,
|
|
14484
|
+
hits: passed ? [`Output ends with "${value}"`] : [],
|
|
14485
|
+
misses: passed ? [] : [`Output does not end with "${value}"`]
|
|
14486
|
+
};
|
|
14487
|
+
}
|
|
14488
|
+
function runRegexAssertion(output, pattern, flags) {
|
|
14489
|
+
const regex = new RegExp(pattern, flags);
|
|
14250
14490
|
const passed = regex.test(output);
|
|
14491
|
+
const flagsLabel = flags ? ` (flags: ${flags})` : "";
|
|
14251
14492
|
return {
|
|
14252
14493
|
score: passed ? 1 : 0,
|
|
14253
|
-
hits: passed ? [`Output matches pattern /${pattern}
|
|
14254
|
-
misses: passed ? [] : [`Output does not match pattern /${pattern}
|
|
14494
|
+
hits: passed ? [`Output matches pattern /${pattern}/${flags ?? ""}${flagsLabel}`] : [],
|
|
14495
|
+
misses: passed ? [] : [`Output does not match pattern /${pattern}/${flags ?? ""}${flagsLabel}`]
|
|
14255
14496
|
};
|
|
14256
14497
|
}
|
|
14257
14498
|
function runIsJsonAssertion(output) {
|
|
@@ -14277,9 +14518,9 @@ function runEqualsAssertion(output, value) {
|
|
|
14277
14518
|
}
|
|
14278
14519
|
|
|
14279
14520
|
// src/evaluation/orchestrator.ts
|
|
14280
|
-
var
|
|
14281
|
-
var
|
|
14282
|
-
var
|
|
14521
|
+
var import_node_crypto9 = require("crypto");
|
|
14522
|
+
var import_promises29 = require("fs/promises");
|
|
14523
|
+
var import_node_path40 = __toESM(require("path"), 1);
|
|
14283
14524
|
var import_micromatch4 = __toESM(require("micromatch"), 1);
|
|
14284
14525
|
|
|
14285
14526
|
// ../../node_modules/.bun/yocto-queue@1.2.2/node_modules/yocto-queue/index.js
|
|
@@ -14669,13 +14910,13 @@ var containsFactory = (config) => {
|
|
|
14669
14910
|
var regexFactory = (config) => {
|
|
14670
14911
|
const c = config;
|
|
14671
14912
|
return new DeterministicAssertionEvaluator("regex", (ctx) => {
|
|
14672
|
-
const result = runRegexAssertion(ctx.candidate, c.value);
|
|
14913
|
+
const result = runRegexAssertion(ctx.candidate, c.value, c.flags);
|
|
14673
14914
|
return {
|
|
14674
14915
|
score: result.score,
|
|
14675
14916
|
verdict: result.score === 1 ? "pass" : "fail",
|
|
14676
14917
|
hits: result.hits,
|
|
14677
14918
|
misses: result.misses,
|
|
14678
|
-
reasoning: result.score === 1 ? `Output matches pattern /${c.value}
|
|
14919
|
+
reasoning: result.score === 1 ? `Output matches pattern /${c.value}/${c.flags ?? ""}` : `Output does not match pattern /${c.value}/${c.flags ?? ""}`,
|
|
14679
14920
|
expectedAspectCount: 1
|
|
14680
14921
|
};
|
|
14681
14922
|
});
|
|
@@ -14707,9 +14948,107 @@ var equalsFactory = (config) => {
|
|
|
14707
14948
|
};
|
|
14708
14949
|
});
|
|
14709
14950
|
};
|
|
14951
|
+
var containsAnyFactory = (config) => {
|
|
14952
|
+
const c = config;
|
|
14953
|
+
return new DeterministicAssertionEvaluator("contains_any", (ctx) => {
|
|
14954
|
+
const result = runContainsAnyAssertion(ctx.candidate, c.value);
|
|
14955
|
+
return {
|
|
14956
|
+
score: result.score,
|
|
14957
|
+
verdict: result.score === 1 ? "pass" : "fail",
|
|
14958
|
+
hits: result.hits,
|
|
14959
|
+
misses: result.misses,
|
|
14960
|
+
reasoning: result.score === 1 ? result.hits[0] : result.misses[0],
|
|
14961
|
+
expectedAspectCount: 1
|
|
14962
|
+
};
|
|
14963
|
+
});
|
|
14964
|
+
};
|
|
14965
|
+
var containsAllFactory = (config) => {
|
|
14966
|
+
const c = config;
|
|
14967
|
+
return new DeterministicAssertionEvaluator("contains_all", (ctx) => {
|
|
14968
|
+
const result = runContainsAllAssertion(ctx.candidate, c.value);
|
|
14969
|
+
return {
|
|
14970
|
+
score: result.score,
|
|
14971
|
+
verdict: result.score === 1 ? "pass" : "fail",
|
|
14972
|
+
hits: result.hits,
|
|
14973
|
+
misses: result.misses,
|
|
14974
|
+
reasoning: result.score === 1 ? result.hits[0] : result.misses[0],
|
|
14975
|
+
expectedAspectCount: 1
|
|
14976
|
+
};
|
|
14977
|
+
});
|
|
14978
|
+
};
|
|
14979
|
+
var icontainsFactory = (config) => {
|
|
14980
|
+
const c = config;
|
|
14981
|
+
return new DeterministicAssertionEvaluator("icontains", (ctx) => {
|
|
14982
|
+
const result = runIcontainsAssertion(ctx.candidate, c.value);
|
|
14983
|
+
return {
|
|
14984
|
+
score: result.score,
|
|
14985
|
+
verdict: result.score === 1 ? "pass" : "fail",
|
|
14986
|
+
hits: result.hits,
|
|
14987
|
+
misses: result.misses,
|
|
14988
|
+
reasoning: result.score === 1 ? result.hits[0] : result.misses[0],
|
|
14989
|
+
expectedAspectCount: 1
|
|
14990
|
+
};
|
|
14991
|
+
});
|
|
14992
|
+
};
|
|
14993
|
+
var icontainsAnyFactory = (config) => {
|
|
14994
|
+
const c = config;
|
|
14995
|
+
return new DeterministicAssertionEvaluator("icontains_any", (ctx) => {
|
|
14996
|
+
const result = runIcontainsAnyAssertion(ctx.candidate, c.value);
|
|
14997
|
+
return {
|
|
14998
|
+
score: result.score,
|
|
14999
|
+
verdict: result.score === 1 ? "pass" : "fail",
|
|
15000
|
+
hits: result.hits,
|
|
15001
|
+
misses: result.misses,
|
|
15002
|
+
reasoning: result.score === 1 ? result.hits[0] : result.misses[0],
|
|
15003
|
+
expectedAspectCount: 1
|
|
15004
|
+
};
|
|
15005
|
+
});
|
|
15006
|
+
};
|
|
15007
|
+
var icontainsAllFactory = (config) => {
|
|
15008
|
+
const c = config;
|
|
15009
|
+
return new DeterministicAssertionEvaluator("icontains_all", (ctx) => {
|
|
15010
|
+
const result = runIcontainsAllAssertion(ctx.candidate, c.value);
|
|
15011
|
+
return {
|
|
15012
|
+
score: result.score,
|
|
15013
|
+
verdict: result.score === 1 ? "pass" : "fail",
|
|
15014
|
+
hits: result.hits,
|
|
15015
|
+
misses: result.misses,
|
|
15016
|
+
reasoning: result.score === 1 ? result.hits[0] : result.misses[0],
|
|
15017
|
+
expectedAspectCount: 1
|
|
15018
|
+
};
|
|
15019
|
+
});
|
|
15020
|
+
};
|
|
15021
|
+
var startsWithFactory = (config) => {
|
|
15022
|
+
const c = config;
|
|
15023
|
+
return new DeterministicAssertionEvaluator("starts_with", (ctx) => {
|
|
15024
|
+
const result = runStartsWithAssertion(ctx.candidate, c.value);
|
|
15025
|
+
return {
|
|
15026
|
+
score: result.score,
|
|
15027
|
+
verdict: result.score === 1 ? "pass" : "fail",
|
|
15028
|
+
hits: result.hits,
|
|
15029
|
+
misses: result.misses,
|
|
15030
|
+
reasoning: result.score === 1 ? result.hits[0] : result.misses[0],
|
|
15031
|
+
expectedAspectCount: 1
|
|
15032
|
+
};
|
|
15033
|
+
});
|
|
15034
|
+
};
|
|
15035
|
+
var endsWithFactory = (config) => {
|
|
15036
|
+
const c = config;
|
|
15037
|
+
return new DeterministicAssertionEvaluator("ends_with", (ctx) => {
|
|
15038
|
+
const result = runEndsWithAssertion(ctx.candidate, c.value);
|
|
15039
|
+
return {
|
|
15040
|
+
score: result.score,
|
|
15041
|
+
verdict: result.score === 1 ? "pass" : "fail",
|
|
15042
|
+
hits: result.hits,
|
|
15043
|
+
misses: result.misses,
|
|
15044
|
+
reasoning: result.score === 1 ? result.hits[0] : result.misses[0],
|
|
15045
|
+
expectedAspectCount: 1
|
|
15046
|
+
};
|
|
15047
|
+
});
|
|
15048
|
+
};
|
|
14710
15049
|
function createBuiltinRegistry() {
|
|
14711
15050
|
const registry = new EvaluatorRegistry();
|
|
14712
|
-
registry.register("llm_judge", llmJudgeFactory).register("code", codeFactory).register("composite", compositeFactory).register("tool_trajectory", toolTrajectoryFactory).register("field_accuracy", fieldAccuracyFactory).register("latency", latencyFactory).register("cost", costFactory).register("token_usage", tokenUsageFactory).register("execution_metrics", executionMetricsFactory).register("agent_judge", agentJudgeFactory).register("contains", containsFactory).register("regex", regexFactory).register("is_json", isJsonFactory).register("equals", equalsFactory);
|
|
15051
|
+
registry.register("llm_judge", llmJudgeFactory).register("code", codeFactory).register("composite", compositeFactory).register("tool_trajectory", toolTrajectoryFactory).register("field_accuracy", fieldAccuracyFactory).register("latency", latencyFactory).register("cost", costFactory).register("token_usage", tokenUsageFactory).register("execution_metrics", executionMetricsFactory).register("agent_judge", agentJudgeFactory).register("contains", containsFactory).register("contains_any", containsAnyFactory).register("contains_all", containsAllFactory).register("icontains", icontainsFactory).register("icontains_any", icontainsAnyFactory).register("icontains_all", icontainsAllFactory).register("starts_with", startsWithFactory).register("ends_with", endsWithFactory).register("regex", regexFactory).register("is_json", isJsonFactory).register("equals", equalsFactory);
|
|
14713
15052
|
return registry;
|
|
14714
15053
|
}
|
|
14715
15054
|
|
|
@@ -15053,37 +15392,217 @@ async function cleanupEvalWorkspaces(evalRunId, workspaceRoot) {
|
|
|
15053
15392
|
}
|
|
15054
15393
|
}
|
|
15055
15394
|
|
|
15056
|
-
// src/evaluation/workspace/
|
|
15395
|
+
// src/evaluation/workspace/repo-manager.ts
|
|
15396
|
+
var import_node_child_process7 = require("child_process");
|
|
15397
|
+
var import_node_crypto8 = require("crypto");
|
|
15398
|
+
var import_node_fs11 = require("fs");
|
|
15057
15399
|
var import_promises27 = require("fs/promises");
|
|
15400
|
+
var import_node_os7 = __toESM(require("os"), 1);
|
|
15058
15401
|
var import_node_path38 = __toESM(require("path"), 1);
|
|
15402
|
+
var import_node_util5 = require("util");
|
|
15403
|
+
var execFileAsync = (0, import_node_util5.promisify)(import_node_child_process7.execFile);
|
|
15404
|
+
var DEFAULT_CACHE_DIR = import_node_path38.default.join(import_node_os7.default.homedir(), ".agentv", "git-cache");
|
|
15405
|
+
var DEFAULT_TIMEOUT_MS2 = 3e5;
|
|
15406
|
+
var LOCK_TIMEOUT_MS = 6e4;
|
|
15407
|
+
function gitEnv() {
|
|
15408
|
+
const env = { ...process.env };
|
|
15409
|
+
for (const key of Object.keys(env)) {
|
|
15410
|
+
if (key.startsWith("GIT_") && key !== "GIT_SSH_COMMAND") {
|
|
15411
|
+
delete env[key];
|
|
15412
|
+
}
|
|
15413
|
+
}
|
|
15414
|
+
return {
|
|
15415
|
+
...env,
|
|
15416
|
+
GIT_TERMINAL_PROMPT: "0",
|
|
15417
|
+
GIT_ASKPASS: "",
|
|
15418
|
+
GIT_SSH_COMMAND: "ssh -o BatchMode=yes"
|
|
15419
|
+
};
|
|
15420
|
+
}
|
|
15421
|
+
function cacheKey(source) {
|
|
15422
|
+
const raw = source.type === "git" ? source.url.toLowerCase().replace(/\.git$/, "") : source.path;
|
|
15423
|
+
return (0, import_node_crypto8.createHash)("sha256").update(raw).digest("hex");
|
|
15424
|
+
}
|
|
15425
|
+
function getSourceUrl(source) {
|
|
15426
|
+
return source.type === "git" ? source.url : source.path;
|
|
15427
|
+
}
|
|
15428
|
+
async function git(args, opts) {
|
|
15429
|
+
const { stdout } = await execFileAsync("git", args, {
|
|
15430
|
+
cwd: opts?.cwd,
|
|
15431
|
+
timeout: opts?.timeout ?? DEFAULT_TIMEOUT_MS2,
|
|
15432
|
+
env: gitEnv(),
|
|
15433
|
+
maxBuffer: 50 * 1024 * 1024
|
|
15434
|
+
// 50MB
|
|
15435
|
+
});
|
|
15436
|
+
return stdout.trim();
|
|
15437
|
+
}
|
|
15438
|
+
async function acquireLock(lockPath) {
|
|
15439
|
+
const start = Date.now();
|
|
15440
|
+
while (Date.now() - start < LOCK_TIMEOUT_MS) {
|
|
15441
|
+
try {
|
|
15442
|
+
await (0, import_promises27.writeFile)(lockPath, String(process.pid), { flag: "wx" });
|
|
15443
|
+
return;
|
|
15444
|
+
} catch (err) {
|
|
15445
|
+
if (err.code === "EEXIST") {
|
|
15446
|
+
await new Promise((r) => setTimeout(r, 200));
|
|
15447
|
+
continue;
|
|
15448
|
+
}
|
|
15449
|
+
throw err;
|
|
15450
|
+
}
|
|
15451
|
+
}
|
|
15452
|
+
throw new Error(`Timed out waiting for lock: ${lockPath}`);
|
|
15453
|
+
}
|
|
15454
|
+
async function releaseLock(lockPath) {
|
|
15455
|
+
try {
|
|
15456
|
+
await (0, import_promises27.unlink)(lockPath);
|
|
15457
|
+
} catch {
|
|
15458
|
+
}
|
|
15459
|
+
}
|
|
15460
|
+
var RepoManager = class {
|
|
15461
|
+
cacheDir;
|
|
15462
|
+
constructor(cacheDir) {
|
|
15463
|
+
this.cacheDir = cacheDir ?? DEFAULT_CACHE_DIR;
|
|
15464
|
+
}
|
|
15465
|
+
/**
|
|
15466
|
+
* Ensure a bare mirror cache exists for the given source.
|
|
15467
|
+
* Creates on first access, fetches updates on subsequent calls.
|
|
15468
|
+
* Returns the absolute path to the cache directory.
|
|
15469
|
+
*/
|
|
15470
|
+
async ensureCache(source) {
|
|
15471
|
+
const key = cacheKey(source);
|
|
15472
|
+
const cachePath = import_node_path38.default.join(this.cacheDir, key);
|
|
15473
|
+
const lockPath = `${cachePath}.lock`;
|
|
15474
|
+
await (0, import_promises27.mkdir)(this.cacheDir, { recursive: true });
|
|
15475
|
+
await acquireLock(lockPath);
|
|
15476
|
+
try {
|
|
15477
|
+
if ((0, import_node_fs11.existsSync)(import_node_path38.default.join(cachePath, "HEAD"))) {
|
|
15478
|
+
await git(["fetch", "--prune"], { cwd: cachePath });
|
|
15479
|
+
} else {
|
|
15480
|
+
await git(["clone", "--mirror", "--bare", getSourceUrl(source), cachePath]);
|
|
15481
|
+
}
|
|
15482
|
+
} finally {
|
|
15483
|
+
await releaseLock(lockPath);
|
|
15484
|
+
}
|
|
15485
|
+
return cachePath;
|
|
15486
|
+
}
|
|
15487
|
+
/**
|
|
15488
|
+
* Clone a repo from cache into the workspace at the configured path.
|
|
15489
|
+
* Handles checkout, ref resolution, ancestor walking, shallow clone, sparse checkout.
|
|
15490
|
+
*/
|
|
15491
|
+
async materialize(repo, workspacePath) {
|
|
15492
|
+
const targetDir = import_node_path38.default.join(workspacePath, repo.path);
|
|
15493
|
+
const cachePath = await this.ensureCache(repo.source);
|
|
15494
|
+
const cloneArgs = ["clone"];
|
|
15495
|
+
if (repo.clone?.depth) {
|
|
15496
|
+
cloneArgs.push("--depth", String(repo.clone.depth));
|
|
15497
|
+
}
|
|
15498
|
+
if (repo.clone?.filter) {
|
|
15499
|
+
cloneArgs.push("--filter", repo.clone.filter);
|
|
15500
|
+
}
|
|
15501
|
+
cloneArgs.push("--no-checkout");
|
|
15502
|
+
const cloneUrl = repo.clone?.depth || repo.clone?.filter ? `file://${cachePath}` : cachePath;
|
|
15503
|
+
cloneArgs.push(cloneUrl, targetDir);
|
|
15504
|
+
await git(cloneArgs);
|
|
15505
|
+
if (repo.clone?.sparse?.length) {
|
|
15506
|
+
await git(["sparse-checkout", "init", "--cone"], { cwd: targetDir });
|
|
15507
|
+
await git(["sparse-checkout", "set", ...repo.clone.sparse], { cwd: targetDir });
|
|
15508
|
+
}
|
|
15509
|
+
const ref = repo.checkout?.ref ?? "HEAD";
|
|
15510
|
+
const resolve = repo.checkout?.resolve ?? "remote";
|
|
15511
|
+
let resolvedSha;
|
|
15512
|
+
if (resolve === "remote" && repo.source.type === "git") {
|
|
15513
|
+
const url = getSourceUrl(repo.source);
|
|
15514
|
+
try {
|
|
15515
|
+
const lsOutput = await git(["ls-remote", url, ref]);
|
|
15516
|
+
const match = lsOutput.split(" ")[0];
|
|
15517
|
+
if (!match) {
|
|
15518
|
+
throw new Error(`Ref '${ref}' not found on remote ${url}`);
|
|
15519
|
+
}
|
|
15520
|
+
resolvedSha = match;
|
|
15521
|
+
} catch (err) {
|
|
15522
|
+
if (err instanceof Error && err.message.includes("not found")) throw err;
|
|
15523
|
+
resolvedSha = ref;
|
|
15524
|
+
}
|
|
15525
|
+
} else {
|
|
15526
|
+
resolvedSha = ref;
|
|
15527
|
+
}
|
|
15528
|
+
await git(["checkout", resolvedSha], { cwd: targetDir });
|
|
15529
|
+
const ancestor = repo.checkout?.ancestor ?? 0;
|
|
15530
|
+
if (ancestor > 0) {
|
|
15531
|
+
try {
|
|
15532
|
+
const ancestorSha = await git(["rev-parse", `HEAD~${ancestor}`], { cwd: targetDir });
|
|
15533
|
+
await git(["checkout", ancestorSha], { cwd: targetDir });
|
|
15534
|
+
} catch {
|
|
15535
|
+
if (repo.clone?.depth) {
|
|
15536
|
+
await git(["fetch", "--deepen", String(ancestor)], { cwd: targetDir });
|
|
15537
|
+
const ancestorSha = await git(["rev-parse", `HEAD~${ancestor}`], { cwd: targetDir });
|
|
15538
|
+
await git(["checkout", ancestorSha], { cwd: targetDir });
|
|
15539
|
+
} else {
|
|
15540
|
+
throw new Error(
|
|
15541
|
+
`Cannot resolve ancestor ${ancestor} of ref '${ref}'. If using shallow clone, increase clone.depth to at least ${ancestor + 1}.`
|
|
15542
|
+
);
|
|
15543
|
+
}
|
|
15544
|
+
}
|
|
15545
|
+
}
|
|
15546
|
+
}
|
|
15547
|
+
/** Materialize all repos into the workspace. */
|
|
15548
|
+
async materializeAll(repos, workspacePath) {
|
|
15549
|
+
for (const repo of repos) {
|
|
15550
|
+
await this.materialize(repo, workspacePath);
|
|
15551
|
+
}
|
|
15552
|
+
}
|
|
15553
|
+
/** Reset repos in workspace to their checkout state. */
|
|
15554
|
+
async reset(repos, workspacePath, strategy) {
|
|
15555
|
+
if (strategy === "recreate") {
|
|
15556
|
+
for (const repo of repos) {
|
|
15557
|
+
const targetDir = import_node_path38.default.join(workspacePath, repo.path);
|
|
15558
|
+
await (0, import_promises27.rm)(targetDir, { recursive: true, force: true });
|
|
15559
|
+
}
|
|
15560
|
+
await this.materializeAll(repos, workspacePath);
|
|
15561
|
+
return;
|
|
15562
|
+
}
|
|
15563
|
+
for (const repo of repos) {
|
|
15564
|
+
const targetDir = import_node_path38.default.join(workspacePath, repo.path);
|
|
15565
|
+
await git(["reset", "--hard", "HEAD"], { cwd: targetDir });
|
|
15566
|
+
await git(["clean", "-fd"], { cwd: targetDir });
|
|
15567
|
+
}
|
|
15568
|
+
}
|
|
15569
|
+
/** Remove the entire cache directory. */
|
|
15570
|
+
async cleanCache() {
|
|
15571
|
+
await (0, import_promises27.rm)(this.cacheDir, { recursive: true, force: true });
|
|
15572
|
+
}
|
|
15573
|
+
};
|
|
15574
|
+
|
|
15575
|
+
// src/evaluation/workspace/resolve.ts
|
|
15576
|
+
var import_promises28 = require("fs/promises");
|
|
15577
|
+
var import_node_path39 = __toESM(require("path"), 1);
|
|
15059
15578
|
async function resolveWorkspaceTemplate(templatePath) {
|
|
15060
15579
|
if (!templatePath) {
|
|
15061
15580
|
return void 0;
|
|
15062
15581
|
}
|
|
15063
|
-
const resolved =
|
|
15064
|
-
const stats = await (0,
|
|
15582
|
+
const resolved = import_node_path39.default.resolve(templatePath);
|
|
15583
|
+
const stats = await (0, import_promises28.stat)(resolved);
|
|
15065
15584
|
if (stats.isFile()) {
|
|
15066
15585
|
return {
|
|
15067
|
-
dir:
|
|
15586
|
+
dir: import_node_path39.default.dirname(resolved),
|
|
15068
15587
|
workspaceFile: resolved
|
|
15069
15588
|
};
|
|
15070
15589
|
}
|
|
15071
15590
|
if (!stats.isDirectory()) {
|
|
15072
15591
|
throw new Error(`workspace template is neither a file nor a directory: ${resolved}`);
|
|
15073
15592
|
}
|
|
15074
|
-
const entries = await (0,
|
|
15593
|
+
const entries = await (0, import_promises28.readdir)(resolved);
|
|
15075
15594
|
const workspaceFiles = entries.filter((e) => e.endsWith(".code-workspace"));
|
|
15076
15595
|
if (workspaceFiles.length === 1) {
|
|
15077
15596
|
return {
|
|
15078
15597
|
dir: resolved,
|
|
15079
|
-
workspaceFile:
|
|
15598
|
+
workspaceFile: import_node_path39.default.join(resolved, workspaceFiles[0])
|
|
15080
15599
|
};
|
|
15081
15600
|
}
|
|
15082
15601
|
if (workspaceFiles.length > 1) {
|
|
15083
15602
|
const conventionFile = workspaceFiles.find((f) => f === "template.code-workspace");
|
|
15084
15603
|
return {
|
|
15085
15604
|
dir: resolved,
|
|
15086
|
-
workspaceFile: conventionFile ?
|
|
15605
|
+
workspaceFile: conventionFile ? import_node_path39.default.join(resolved, conventionFile) : void 0
|
|
15087
15606
|
};
|
|
15088
15607
|
}
|
|
15089
15608
|
return { dir: resolved };
|
|
@@ -15158,7 +15677,7 @@ async function runEvaluation(options) {
|
|
|
15158
15677
|
);
|
|
15159
15678
|
useCache = false;
|
|
15160
15679
|
}
|
|
15161
|
-
const evalRunId = (0,
|
|
15680
|
+
const evalRunId = (0, import_node_crypto9.randomUUID)();
|
|
15162
15681
|
const evalCases = preloadedEvalCases ?? await loadTests(evalFilePath, repoRoot, { verbose, filter });
|
|
15163
15682
|
const filteredEvalCases = filterEvalCases(evalCases, filter);
|
|
15164
15683
|
if (filteredEvalCases.length === 0) {
|
|
@@ -15205,6 +15724,11 @@ async function runEvaluation(options) {
|
|
|
15205
15724
|
}
|
|
15206
15725
|
return getOrCreateProvider(resolvedJudge);
|
|
15207
15726
|
};
|
|
15727
|
+
if (isAgentProvider(getOrCreateProvider(target)) && !target.judgeTarget) {
|
|
15728
|
+
throw new Error(
|
|
15729
|
+
`Target "${target.name}" is an agent provider ("${target.kind}") with no judge_target \u2014 agent providers cannot return structured JSON for judging. Set judge_target to an LLM provider (e.g., azure_base).`
|
|
15730
|
+
);
|
|
15731
|
+
}
|
|
15208
15732
|
const targetResolver = (name) => {
|
|
15209
15733
|
const resolved = resolveTargetByName(name);
|
|
15210
15734
|
if (!resolved) {
|
|
@@ -15218,7 +15742,7 @@ async function runEvaluation(options) {
|
|
|
15218
15742
|
];
|
|
15219
15743
|
const evaluatorRegistry = buildEvaluatorRegistry(evaluators, resolveJudgeProvider);
|
|
15220
15744
|
const typeRegistry = createBuiltinRegistry();
|
|
15221
|
-
const discoveryBaseDir = evalFilePath ?
|
|
15745
|
+
const discoveryBaseDir = evalFilePath ? import_node_path40.default.dirname(import_node_path40.default.resolve(evalFilePath)) : process.cwd();
|
|
15222
15746
|
await discoverAssertions(typeRegistry, discoveryBaseDir);
|
|
15223
15747
|
const providerRegistry = createBuiltinProviderRegistry();
|
|
15224
15748
|
await discoverProviders(providerRegistry, discoveryBaseDir);
|
|
@@ -15273,7 +15797,8 @@ async function runEvaluation(options) {
|
|
|
15273
15797
|
const resolvedTemplate = await resolveWorkspaceTemplate(rawTemplate);
|
|
15274
15798
|
const workspaceTemplate = resolvedTemplate?.dir;
|
|
15275
15799
|
const suiteWorkspaceFile = resolvedTemplate?.workspaceFile;
|
|
15276
|
-
const
|
|
15800
|
+
const isPerTestIsolation = suiteWorkspace?.isolation === "per_test";
|
|
15801
|
+
const hasSharedWorkspace = !!(workspaceTemplate || suiteWorkspace?.before_all || suiteWorkspace?.repos?.length && !isPerTestIsolation);
|
|
15277
15802
|
const requestedWorkers = options.maxConcurrency ?? target.workers ?? 1;
|
|
15278
15803
|
const workers = hasSharedWorkspace ? 1 : requestedWorkers;
|
|
15279
15804
|
if (hasSharedWorkspace && requestedWorkers > 1) {
|
|
@@ -15292,9 +15817,22 @@ async function runEvaluation(options) {
|
|
|
15292
15817
|
const message = error instanceof Error ? error.message : String(error);
|
|
15293
15818
|
throw new Error(`Failed to create shared workspace: ${message}`);
|
|
15294
15819
|
}
|
|
15295
|
-
} else if (suiteWorkspace?.before_all) {
|
|
15820
|
+
} else if (suiteWorkspace?.before_all || suiteWorkspace?.repos?.length && !isPerTestIsolation) {
|
|
15296
15821
|
sharedWorkspacePath = getWorkspacePath(evalRunId, "shared");
|
|
15297
|
-
await (0,
|
|
15822
|
+
await (0, import_promises29.mkdir)(sharedWorkspacePath, { recursive: true });
|
|
15823
|
+
}
|
|
15824
|
+
const repoManager = suiteWorkspace?.repos?.length ? new RepoManager() : void 0;
|
|
15825
|
+
if (repoManager && sharedWorkspacePath && suiteWorkspace?.repos && !isPerTestIsolation) {
|
|
15826
|
+
try {
|
|
15827
|
+
await repoManager.materializeAll(suiteWorkspace.repos, sharedWorkspacePath);
|
|
15828
|
+
} catch (error) {
|
|
15829
|
+
const message = error instanceof Error ? error.message : String(error);
|
|
15830
|
+
if (sharedWorkspacePath) {
|
|
15831
|
+
await cleanupWorkspace(sharedWorkspacePath).catch(() => {
|
|
15832
|
+
});
|
|
15833
|
+
}
|
|
15834
|
+
throw new Error(`Failed to materialize repos: ${message}`);
|
|
15835
|
+
}
|
|
15298
15836
|
}
|
|
15299
15837
|
if (sharedWorkspacePath && suiteWorkspace?.before_all) {
|
|
15300
15838
|
const scriptContext = {
|
|
@@ -15385,7 +15923,8 @@ async function runEvaluation(options) {
|
|
|
15385
15923
|
sharedBaselineCommit,
|
|
15386
15924
|
suiteWorkspaceFile,
|
|
15387
15925
|
streamCallbacks,
|
|
15388
|
-
typeRegistry
|
|
15926
|
+
typeRegistry,
|
|
15927
|
+
repoManager
|
|
15389
15928
|
};
|
|
15390
15929
|
let result = trials && trials.count > 1 ? await runEvalCaseWithTrials(runCaseOptions, trials) : await runEvalCase(runCaseOptions);
|
|
15391
15930
|
if (totalBudgetUsd !== void 0) {
|
|
@@ -15660,15 +16199,16 @@ async function runEvalCase(options) {
|
|
|
15660
16199
|
sharedWorkspacePath,
|
|
15661
16200
|
sharedBaselineCommit,
|
|
15662
16201
|
suiteWorkspaceFile,
|
|
15663
|
-
typeRegistry: providedTypeRegistry
|
|
16202
|
+
typeRegistry: providedTypeRegistry,
|
|
16203
|
+
repoManager
|
|
15664
16204
|
} = options;
|
|
15665
16205
|
const formattingMode = usesFileReferencePrompt(provider) ? "agent" : "lm";
|
|
15666
16206
|
const promptInputs = await buildPromptInputs(evalCase, formattingMode);
|
|
15667
16207
|
const typeRegistry = providedTypeRegistry ?? createBuiltinRegistry();
|
|
15668
|
-
const
|
|
16208
|
+
const cacheKey2 = useCache ? createCacheKey(provider, target, evalCase, promptInputs) : void 0;
|
|
15669
16209
|
let cachedResponse;
|
|
15670
|
-
if (
|
|
15671
|
-
cachedResponse = await cache.get(
|
|
16210
|
+
if (cacheKey2 && cache) {
|
|
16211
|
+
cachedResponse = await cache.get(cacheKey2);
|
|
15672
16212
|
}
|
|
15673
16213
|
const nowFn = now ?? (() => /* @__PURE__ */ new Date());
|
|
15674
16214
|
let workspacePath = sharedWorkspacePath;
|
|
@@ -15697,9 +16237,25 @@ async function runEvalCase(options) {
|
|
|
15697
16237
|
);
|
|
15698
16238
|
}
|
|
15699
16239
|
}
|
|
15700
|
-
if (!workspacePath && evalCase.workspace?.before_all && evalRunId) {
|
|
16240
|
+
if (!workspacePath && (evalCase.workspace?.before_all || evalCase.workspace?.repos?.length) && evalRunId) {
|
|
15701
16241
|
workspacePath = getWorkspacePath(evalRunId, evalCase.id);
|
|
15702
|
-
await (0,
|
|
16242
|
+
await (0, import_promises29.mkdir)(workspacePath, { recursive: true });
|
|
16243
|
+
}
|
|
16244
|
+
if (evalCase.workspace?.repos?.length && workspacePath) {
|
|
16245
|
+
const perCaseRepoManager = new RepoManager();
|
|
16246
|
+
try {
|
|
16247
|
+
await perCaseRepoManager.materializeAll(evalCase.workspace.repos, workspacePath);
|
|
16248
|
+
} catch (error) {
|
|
16249
|
+
const message = error instanceof Error ? error.message : String(error);
|
|
16250
|
+
return buildErrorResult(
|
|
16251
|
+
evalCase,
|
|
16252
|
+
target.name,
|
|
16253
|
+
nowFn(),
|
|
16254
|
+
new Error(`Failed to materialize repos: ${message}`),
|
|
16255
|
+
promptInputs,
|
|
16256
|
+
provider
|
|
16257
|
+
);
|
|
16258
|
+
}
|
|
15703
16259
|
}
|
|
15704
16260
|
if (workspacePath && evalCase.workspace?.before_all) {
|
|
15705
16261
|
const scriptContext = {
|
|
@@ -15823,8 +16379,8 @@ async function runEvalCase(options) {
|
|
|
15823
16379
|
}
|
|
15824
16380
|
return errorResult;
|
|
15825
16381
|
}
|
|
15826
|
-
if (
|
|
15827
|
-
await cache.set(
|
|
16382
|
+
if (cacheKey2 && cache && !cachedResponse) {
|
|
16383
|
+
await cache.set(cacheKey2, providerResponse);
|
|
15828
16384
|
}
|
|
15829
16385
|
const output = providerResponse.output;
|
|
15830
16386
|
const hasExecutionMetrics = providerResponse.tokenUsage !== void 0 || providerResponse.costUsd !== void 0 || providerResponse.durationMs !== void 0;
|
|
@@ -15852,6 +16408,16 @@ async function runEvalCase(options) {
|
|
|
15852
16408
|
}
|
|
15853
16409
|
}
|
|
15854
16410
|
const providerError = extractProviderError(providerResponse);
|
|
16411
|
+
if (repoManager && workspacePath && evalCase.workspace?.reset?.after_each && evalCase.workspace.reset.strategy && evalCase.workspace.reset.strategy !== "none" && evalCase.workspace.repos) {
|
|
16412
|
+
try {
|
|
16413
|
+
await repoManager.reset(
|
|
16414
|
+
evalCase.workspace.repos,
|
|
16415
|
+
workspacePath,
|
|
16416
|
+
evalCase.workspace.reset.strategy
|
|
16417
|
+
);
|
|
16418
|
+
} catch {
|
|
16419
|
+
}
|
|
16420
|
+
}
|
|
15855
16421
|
if (workspacePath && evalCase.workspace?.after_each) {
|
|
15856
16422
|
const scriptContext = {
|
|
15857
16423
|
workspacePath,
|
|
@@ -16216,7 +16782,7 @@ async function runEvaluatorList(options) {
|
|
|
16216
16782
|
fileChanges,
|
|
16217
16783
|
workspacePath
|
|
16218
16784
|
};
|
|
16219
|
-
const evalFileDir = evalCase.guideline_paths[0] ?
|
|
16785
|
+
const evalFileDir = evalCase.guideline_paths[0] ? import_node_path40.default.dirname(evalCase.guideline_paths[0]) : process.cwd();
|
|
16220
16786
|
const dispatchContext = {
|
|
16221
16787
|
judgeProvider,
|
|
16222
16788
|
targetResolver,
|
|
@@ -16306,8 +16872,9 @@ async function runEvaluatorList(options) {
|
|
|
16306
16872
|
const minScore = typeof entry.required === "number" ? entry.required : PASS_THRESHOLD;
|
|
16307
16873
|
return entry.score.score < minScore;
|
|
16308
16874
|
});
|
|
16309
|
-
const
|
|
16310
|
-
|
|
16875
|
+
const scorable = scored.filter((entry) => entry.score.verdict !== "skip");
|
|
16876
|
+
const aggregateScore = hasRequiredFailure ? 0 : scorable.length > 0 ? computeWeightedMean(
|
|
16877
|
+
scorable.map((entry) => ({ score: entry.score.score, weight: entry.weight }))
|
|
16311
16878
|
) : 0;
|
|
16312
16879
|
const hits = scored.flatMap((entry) => entry.score.hits);
|
|
16313
16880
|
const misses = scored.flatMap((entry) => entry.score.misses);
|
|
@@ -16447,7 +17014,7 @@ function extractProviderError(response) {
|
|
|
16447
17014
|
return trimmed.length > 0 ? trimmed : void 0;
|
|
16448
17015
|
}
|
|
16449
17016
|
function createCacheKey(provider, target, evalCase, promptInputs) {
|
|
16450
|
-
const hash = (0,
|
|
17017
|
+
const hash = (0, import_node_crypto9.createHash)("sha256");
|
|
16451
17018
|
hash.update(provider.id);
|
|
16452
17019
|
hash.update(target.name);
|
|
16453
17020
|
hash.update(evalCase.id);
|
|
@@ -16515,8 +17082,8 @@ function computeWeightedMean(entries) {
|
|
|
16515
17082
|
}
|
|
16516
17083
|
|
|
16517
17084
|
// src/evaluation/evaluate.ts
|
|
16518
|
-
var
|
|
16519
|
-
var
|
|
17085
|
+
var import_node_fs12 = require("fs");
|
|
17086
|
+
var import_node_path41 = __toESM(require("path"), 1);
|
|
16520
17087
|
async function evaluate(config) {
|
|
16521
17088
|
const startTime = Date.now();
|
|
16522
17089
|
if (config.tests && config.specFile) {
|
|
@@ -16538,13 +17105,13 @@ async function evaluate(config) {
|
|
|
16538
17105
|
let evalCases;
|
|
16539
17106
|
let testFilePath;
|
|
16540
17107
|
if (config.specFile) {
|
|
16541
|
-
testFilePath =
|
|
17108
|
+
testFilePath = import_node_path41.default.resolve(config.specFile);
|
|
16542
17109
|
evalCases = await loadTests(testFilePath, repoRoot, {
|
|
16543
17110
|
verbose: config.verbose,
|
|
16544
17111
|
filter: config.filter
|
|
16545
17112
|
});
|
|
16546
17113
|
} else {
|
|
16547
|
-
testFilePath =
|
|
17114
|
+
testFilePath = import_node_path41.default.join(process.cwd(), "__programmatic__.yaml");
|
|
16548
17115
|
evalCases = (config.tests ?? []).map((test) => {
|
|
16549
17116
|
const input = typeof test.input === "string" ? [{ role: "user", content: test.input }] : test.input;
|
|
16550
17117
|
const question = typeof test.input === "string" ? test.input : test.input.find((m) => m.role === "user")?.content ?? "";
|
|
@@ -16635,11 +17202,11 @@ function computeSummary(results, durationMs) {
|
|
|
16635
17202
|
var TARGET_FILE_CANDIDATES = [".agentv/targets.yaml", ".agentv/targets.yml"];
|
|
16636
17203
|
async function discoverDefaultTarget(repoRoot) {
|
|
16637
17204
|
const cwd = process.cwd();
|
|
16638
|
-
const chain = buildDirectoryChain2(
|
|
17205
|
+
const chain = buildDirectoryChain2(import_node_path41.default.join(cwd, "_placeholder"), repoRoot);
|
|
16639
17206
|
for (const dir of chain) {
|
|
16640
17207
|
for (const candidate of TARGET_FILE_CANDIDATES) {
|
|
16641
|
-
const targetsPath =
|
|
16642
|
-
if (!(0,
|
|
17208
|
+
const targetsPath = import_node_path41.default.join(dir, candidate);
|
|
17209
|
+
if (!(0, import_node_fs12.existsSync)(targetsPath)) continue;
|
|
16643
17210
|
try {
|
|
16644
17211
|
const definitions = await readTargetDefinitions(targetsPath);
|
|
16645
17212
|
const defaultTarget = definitions.find((d) => d.name === "default");
|
|
@@ -16653,11 +17220,11 @@ async function discoverDefaultTarget(repoRoot) {
|
|
|
16653
17220
|
async function loadEnvHierarchy(repoRoot) {
|
|
16654
17221
|
const { readFileSync: readFileSync2 } = await import("fs");
|
|
16655
17222
|
const cwd = process.cwd();
|
|
16656
|
-
const chain = buildDirectoryChain2(
|
|
17223
|
+
const chain = buildDirectoryChain2(import_node_path41.default.join(cwd, "_placeholder"), repoRoot);
|
|
16657
17224
|
const envFiles = [];
|
|
16658
17225
|
for (const dir of chain) {
|
|
16659
|
-
const envPath =
|
|
16660
|
-
if ((0,
|
|
17226
|
+
const envPath = import_node_path41.default.join(dir, ".env");
|
|
17227
|
+
if ((0, import_node_fs12.existsSync)(envPath)) envFiles.push(envPath);
|
|
16661
17228
|
}
|
|
16662
17229
|
for (let i = envFiles.length - 1; i >= 0; i--) {
|
|
16663
17230
|
try {
|
|
@@ -16727,12 +17294,12 @@ var CONFIG_FILE_NAMES = [
|
|
|
16727
17294
|
".agentv/config.js"
|
|
16728
17295
|
];
|
|
16729
17296
|
async function loadTsConfig(projectRoot) {
|
|
16730
|
-
const { existsSync:
|
|
17297
|
+
const { existsSync: existsSync4 } = await import("fs");
|
|
16731
17298
|
const { pathToFileURL } = await import("url");
|
|
16732
17299
|
const { join: join2 } = await import("path");
|
|
16733
17300
|
for (const fileName of CONFIG_FILE_NAMES) {
|
|
16734
17301
|
const filePath = join2(projectRoot, fileName);
|
|
16735
|
-
if (!
|
|
17302
|
+
if (!existsSync4(filePath)) {
|
|
16736
17303
|
continue;
|
|
16737
17304
|
}
|
|
16738
17305
|
try {
|
|
@@ -16829,8 +17396,8 @@ function buildPrompt(criteria, question, referenceAnswer) {
|
|
|
16829
17396
|
}
|
|
16830
17397
|
|
|
16831
17398
|
// src/evaluation/cache/response-cache.ts
|
|
16832
|
-
var
|
|
16833
|
-
var
|
|
17399
|
+
var import_promises30 = require("fs/promises");
|
|
17400
|
+
var import_node_path42 = __toESM(require("path"), 1);
|
|
16834
17401
|
var DEFAULT_CACHE_PATH = ".agentv/cache";
|
|
16835
17402
|
var ResponseCache = class {
|
|
16836
17403
|
cachePath;
|
|
@@ -16840,7 +17407,7 @@ var ResponseCache = class {
|
|
|
16840
17407
|
async get(key) {
|
|
16841
17408
|
const filePath = this.keyToPath(key);
|
|
16842
17409
|
try {
|
|
16843
|
-
const data = await (0,
|
|
17410
|
+
const data = await (0, import_promises30.readFile)(filePath, "utf8");
|
|
16844
17411
|
return JSON.parse(data);
|
|
16845
17412
|
} catch {
|
|
16846
17413
|
return void 0;
|
|
@@ -16848,13 +17415,13 @@ var ResponseCache = class {
|
|
|
16848
17415
|
}
|
|
16849
17416
|
async set(key, value) {
|
|
16850
17417
|
const filePath = this.keyToPath(key);
|
|
16851
|
-
const dir =
|
|
16852
|
-
await (0,
|
|
16853
|
-
await (0,
|
|
17418
|
+
const dir = import_node_path42.default.dirname(filePath);
|
|
17419
|
+
await (0, import_promises30.mkdir)(dir, { recursive: true });
|
|
17420
|
+
await (0, import_promises30.writeFile)(filePath, JSON.stringify(value, null, 2), "utf8");
|
|
16854
17421
|
}
|
|
16855
17422
|
keyToPath(key) {
|
|
16856
17423
|
const prefix = key.slice(0, 2);
|
|
16857
|
-
return
|
|
17424
|
+
return import_node_path42.default.join(this.cachePath, prefix, `${key}.json`);
|
|
16858
17425
|
}
|
|
16859
17426
|
};
|
|
16860
17427
|
function shouldEnableCache(params) {
|
|
@@ -17332,6 +17899,7 @@ function createAgentKernel() {
|
|
|
17332
17899
|
OtelTraceExporter,
|
|
17333
17900
|
OtlpJsonFileExporter,
|
|
17334
17901
|
ProviderRegistry,
|
|
17902
|
+
RepoManager,
|
|
17335
17903
|
ResponseCache,
|
|
17336
17904
|
SimpleTraceFileExporter,
|
|
17337
17905
|
TEST_MESSAGE_ROLES,
|
|
@@ -17417,12 +17985,19 @@ function createAgentKernel() {
|
|
|
17417
17985
|
resolveTargetDefinition,
|
|
17418
17986
|
resolveWorkspaceTemplate,
|
|
17419
17987
|
rubricEvaluationSchema,
|
|
17988
|
+
runContainsAllAssertion,
|
|
17989
|
+
runContainsAnyAssertion,
|
|
17420
17990
|
runContainsAssertion,
|
|
17991
|
+
runEndsWithAssertion,
|
|
17421
17992
|
runEqualsAssertion,
|
|
17422
17993
|
runEvalCase,
|
|
17423
17994
|
runEvaluation,
|
|
17995
|
+
runIcontainsAllAssertion,
|
|
17996
|
+
runIcontainsAnyAssertion,
|
|
17997
|
+
runIcontainsAssertion,
|
|
17424
17998
|
runIsJsonAssertion,
|
|
17425
17999
|
runRegexAssertion,
|
|
18000
|
+
runStartsWithAssertion,
|
|
17426
18001
|
scoreToVerdict,
|
|
17427
18002
|
shouldEnableCache,
|
|
17428
18003
|
shouldSkipCacheForTemperature,
|