@agentv/core 2.10.0 → 2.11.1
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/dist/{chunk-7Q4PH265.js → chunk-REN5PS7B.js} +15 -8
- package/dist/chunk-REN5PS7B.js.map +1 -0
- package/dist/evaluation/validation/index.cjs +106 -8
- package/dist/evaluation/validation/index.cjs.map +1 -1
- package/dist/evaluation/validation/index.js +96 -5
- package/dist/evaluation/validation/index.js.map +1 -1
- package/dist/index.cjs +830 -172
- package/dist/index.cjs.map +1 -1
- package/dist/index.d.cts +189 -11
- package/dist/index.d.ts +189 -11
- package/dist/index.js +795 -152
- package/dist/index.js.map +1 -1
- package/package.json +1 -1
- package/dist/chunk-7Q4PH265.js.map +0 -1
package/dist/index.cjs
CHANGED
|
@@ -1244,12 +1244,12 @@ function serializeAttributeValue(value) {
|
|
|
1244
1244
|
if (Array.isArray(value)) return { arrayValue: { values: value.map(serializeAttributeValue) } };
|
|
1245
1245
|
return { stringValue: String(value) };
|
|
1246
1246
|
}
|
|
1247
|
-
var
|
|
1247
|
+
var import_promises31, import_node_path43, OtlpJsonFileExporter;
|
|
1248
1248
|
var init_otlp_json_file_exporter = __esm({
|
|
1249
1249
|
"src/observability/otlp-json-file-exporter.ts"() {
|
|
1250
1250
|
"use strict";
|
|
1251
|
-
|
|
1252
|
-
|
|
1251
|
+
import_promises31 = require("fs/promises");
|
|
1252
|
+
import_node_path43 = require("path");
|
|
1253
1253
|
OtlpJsonFileExporter = class {
|
|
1254
1254
|
// biome-ignore lint/suspicious/noExplicitAny: serialized span data
|
|
1255
1255
|
spans = [];
|
|
@@ -1288,7 +1288,7 @@ var init_otlp_json_file_exporter = __esm({
|
|
|
1288
1288
|
}
|
|
1289
1289
|
async flush() {
|
|
1290
1290
|
if (this.spans.length === 0) return;
|
|
1291
|
-
await (0,
|
|
1291
|
+
await (0, import_promises31.mkdir)((0, import_node_path43.dirname)(this.filePath), { recursive: true });
|
|
1292
1292
|
const otlpJson = {
|
|
1293
1293
|
resourceSpans: [
|
|
1294
1294
|
{
|
|
@@ -1302,8 +1302,8 @@ var init_otlp_json_file_exporter = __esm({
|
|
|
1302
1302
|
}
|
|
1303
1303
|
]
|
|
1304
1304
|
};
|
|
1305
|
-
const { writeFile:
|
|
1306
|
-
await
|
|
1305
|
+
const { writeFile: writeFile9 } = await import("fs/promises");
|
|
1306
|
+
await writeFile9(this.filePath, JSON.stringify(otlpJson, null, 2));
|
|
1307
1307
|
}
|
|
1308
1308
|
};
|
|
1309
1309
|
}
|
|
@@ -1319,13 +1319,13 @@ function hrTimeDiffMs(start, end) {
|
|
|
1319
1319
|
const diffNano = end[1] - start[1];
|
|
1320
1320
|
return Math.round(diffSec * 1e3 + diffNano / 1e6);
|
|
1321
1321
|
}
|
|
1322
|
-
var
|
|
1322
|
+
var import_node_fs13, import_promises32, import_node_path44, SimpleTraceFileExporter;
|
|
1323
1323
|
var init_simple_trace_file_exporter = __esm({
|
|
1324
1324
|
"src/observability/simple-trace-file-exporter.ts"() {
|
|
1325
1325
|
"use strict";
|
|
1326
|
-
|
|
1327
|
-
|
|
1328
|
-
|
|
1326
|
+
import_node_fs13 = require("fs");
|
|
1327
|
+
import_promises32 = require("fs/promises");
|
|
1328
|
+
import_node_path44 = require("path");
|
|
1329
1329
|
SimpleTraceFileExporter = class {
|
|
1330
1330
|
stream = null;
|
|
1331
1331
|
filePath;
|
|
@@ -1338,8 +1338,8 @@ var init_simple_trace_file_exporter = __esm({
|
|
|
1338
1338
|
async ensureStream() {
|
|
1339
1339
|
if (!this.streamReady) {
|
|
1340
1340
|
this.streamReady = (async () => {
|
|
1341
|
-
await (0,
|
|
1342
|
-
this.stream = (0,
|
|
1341
|
+
await (0, import_promises32.mkdir)((0, import_node_path44.dirname)(this.filePath), { recursive: true });
|
|
1342
|
+
this.stream = (0, import_node_fs13.createWriteStream)(this.filePath, { flags: "w" });
|
|
1343
1343
|
return this.stream;
|
|
1344
1344
|
})();
|
|
1345
1345
|
}
|
|
@@ -1448,6 +1448,7 @@ __export(index_exports, {
|
|
|
1448
1448
|
OtelTraceExporter: () => OtelTraceExporter,
|
|
1449
1449
|
OtlpJsonFileExporter: () => OtlpJsonFileExporter,
|
|
1450
1450
|
ProviderRegistry: () => ProviderRegistry,
|
|
1451
|
+
RepoManager: () => RepoManager,
|
|
1451
1452
|
ResponseCache: () => ResponseCache,
|
|
1452
1453
|
SimpleTraceFileExporter: () => SimpleTraceFileExporter,
|
|
1453
1454
|
TEST_MESSAGE_ROLES: () => TEST_MESSAGE_ROLES,
|
|
@@ -1533,12 +1534,19 @@ __export(index_exports, {
|
|
|
1533
1534
|
resolveTargetDefinition: () => resolveTargetDefinition,
|
|
1534
1535
|
resolveWorkspaceTemplate: () => resolveWorkspaceTemplate,
|
|
1535
1536
|
rubricEvaluationSchema: () => rubricEvaluationSchema,
|
|
1537
|
+
runContainsAllAssertion: () => runContainsAllAssertion,
|
|
1538
|
+
runContainsAnyAssertion: () => runContainsAnyAssertion,
|
|
1536
1539
|
runContainsAssertion: () => runContainsAssertion,
|
|
1540
|
+
runEndsWithAssertion: () => runEndsWithAssertion,
|
|
1537
1541
|
runEqualsAssertion: () => runEqualsAssertion,
|
|
1538
1542
|
runEvalCase: () => runEvalCase,
|
|
1539
1543
|
runEvaluation: () => runEvaluation,
|
|
1544
|
+
runIcontainsAllAssertion: () => runIcontainsAllAssertion,
|
|
1545
|
+
runIcontainsAnyAssertion: () => runIcontainsAnyAssertion,
|
|
1546
|
+
runIcontainsAssertion: () => runIcontainsAssertion,
|
|
1540
1547
|
runIsJsonAssertion: () => runIsJsonAssertion,
|
|
1541
1548
|
runRegexAssertion: () => runRegexAssertion,
|
|
1549
|
+
runStartsWithAssertion: () => runStartsWithAssertion,
|
|
1542
1550
|
scoreToVerdict: () => scoreToVerdict,
|
|
1543
1551
|
shouldEnableCache: () => shouldEnableCache,
|
|
1544
1552
|
shouldSkipCacheForTemperature: () => shouldSkipCacheForTemperature,
|
|
@@ -1615,6 +1623,13 @@ var EVALUATOR_KIND_VALUES = [
|
|
|
1615
1623
|
"execution_metrics",
|
|
1616
1624
|
"agent_judge",
|
|
1617
1625
|
"contains",
|
|
1626
|
+
"contains_any",
|
|
1627
|
+
"contains_all",
|
|
1628
|
+
"icontains",
|
|
1629
|
+
"icontains_any",
|
|
1630
|
+
"icontains_all",
|
|
1631
|
+
"starts_with",
|
|
1632
|
+
"ends_with",
|
|
1618
1633
|
"regex",
|
|
1619
1634
|
"is_json",
|
|
1620
1635
|
"equals",
|
|
@@ -2017,9 +2032,14 @@ async function loadConfig(evalFilePath, repoRoot) {
|
|
|
2017
2032
|
logWarning(`Invalid eval_patterns in ${configPath}, all entries must be strings`);
|
|
2018
2033
|
continue;
|
|
2019
2034
|
}
|
|
2035
|
+
const executionDefaults = parseExecutionDefaults(
|
|
2036
|
+
parsed.execution,
|
|
2037
|
+
configPath
|
|
2038
|
+
);
|
|
2020
2039
|
return {
|
|
2021
2040
|
guideline_patterns: guidelinePatterns,
|
|
2022
|
-
eval_patterns: evalPatterns
|
|
2041
|
+
eval_patterns: evalPatterns,
|
|
2042
|
+
execution: executionDefaults
|
|
2023
2043
|
};
|
|
2024
2044
|
} catch (error) {
|
|
2025
2045
|
logWarning(
|
|
@@ -2160,6 +2180,36 @@ function extractTotalBudgetUsd(suite) {
|
|
|
2160
2180
|
);
|
|
2161
2181
|
return void 0;
|
|
2162
2182
|
}
|
|
2183
|
+
function parseExecutionDefaults(raw, configPath) {
|
|
2184
|
+
if (!raw || typeof raw !== "object" || Array.isArray(raw)) {
|
|
2185
|
+
return void 0;
|
|
2186
|
+
}
|
|
2187
|
+
const obj = raw;
|
|
2188
|
+
const result = {};
|
|
2189
|
+
if (typeof obj.verbose === "boolean") {
|
|
2190
|
+
result.verbose = obj.verbose;
|
|
2191
|
+
} else if (obj.verbose !== void 0) {
|
|
2192
|
+
logWarning(`Invalid execution.verbose in ${configPath}, expected boolean`);
|
|
2193
|
+
}
|
|
2194
|
+
const traceFile = obj.trace_file;
|
|
2195
|
+
if (typeof traceFile === "string" && traceFile.trim().length > 0) {
|
|
2196
|
+
result.trace_file = traceFile.trim();
|
|
2197
|
+
} else if (traceFile !== void 0) {
|
|
2198
|
+
logWarning(`Invalid execution.trace_file in ${configPath}, expected non-empty string`);
|
|
2199
|
+
}
|
|
2200
|
+
if (typeof obj.keep_workspaces === "boolean") {
|
|
2201
|
+
result.keep_workspaces = obj.keep_workspaces;
|
|
2202
|
+
} else if (obj.keep_workspaces !== void 0) {
|
|
2203
|
+
logWarning(`Invalid execution.keep_workspaces in ${configPath}, expected boolean`);
|
|
2204
|
+
}
|
|
2205
|
+
const otelFile = obj.otel_file;
|
|
2206
|
+
if (typeof otelFile === "string" && otelFile.trim().length > 0) {
|
|
2207
|
+
result.otel_file = otelFile.trim();
|
|
2208
|
+
} else if (otelFile !== void 0) {
|
|
2209
|
+
logWarning(`Invalid execution.otel_file in ${configPath}, expected non-empty string`);
|
|
2210
|
+
}
|
|
2211
|
+
return Object.keys(result).length > 0 ? result : void 0;
|
|
2212
|
+
}
|
|
2163
2213
|
function logWarning(message) {
|
|
2164
2214
|
console.warn(`${ANSI_YELLOW2}Warning: ${message}${ANSI_RESET2}`);
|
|
2165
2215
|
}
|
|
@@ -2888,18 +2938,96 @@ async function parseEvaluatorList(candidateEvaluators, searchRoots, evalId) {
|
|
|
2888
2938
|
});
|
|
2889
2939
|
continue;
|
|
2890
2940
|
}
|
|
2941
|
+
if (typeValue === "contains_any" || typeValue === "contains_all") {
|
|
2942
|
+
const value = asStringArrayStrict(rawEvaluator.value);
|
|
2943
|
+
if (!value || value.length === 0) {
|
|
2944
|
+
logWarning2(
|
|
2945
|
+
`Skipping ${typeValue} evaluator '${name}' in '${evalId}': value must be a non-empty string array`
|
|
2946
|
+
);
|
|
2947
|
+
continue;
|
|
2948
|
+
}
|
|
2949
|
+
const weight2 = validateWeight(rawEvaluator.weight, name, evalId);
|
|
2950
|
+
const required2 = parseRequired(rawEvaluator.required);
|
|
2951
|
+
evaluators.push({
|
|
2952
|
+
name,
|
|
2953
|
+
type: typeValue,
|
|
2954
|
+
value,
|
|
2955
|
+
...weight2 !== void 0 ? { weight: weight2 } : {},
|
|
2956
|
+
...required2 !== void 0 ? { required: required2 } : {},
|
|
2957
|
+
...negate !== void 0 ? { negate } : {}
|
|
2958
|
+
});
|
|
2959
|
+
continue;
|
|
2960
|
+
}
|
|
2961
|
+
if (typeValue === "icontains") {
|
|
2962
|
+
const value = asString(rawEvaluator.value);
|
|
2963
|
+
if (!value) {
|
|
2964
|
+
logWarning2(`Skipping icontains evaluator '${name}' in '${evalId}': missing value`);
|
|
2965
|
+
continue;
|
|
2966
|
+
}
|
|
2967
|
+
const weight2 = validateWeight(rawEvaluator.weight, name, evalId);
|
|
2968
|
+
const required2 = parseRequired(rawEvaluator.required);
|
|
2969
|
+
evaluators.push({
|
|
2970
|
+
name,
|
|
2971
|
+
type: "icontains",
|
|
2972
|
+
value,
|
|
2973
|
+
...weight2 !== void 0 ? { weight: weight2 } : {},
|
|
2974
|
+
...required2 !== void 0 ? { required: required2 } : {},
|
|
2975
|
+
...negate !== void 0 ? { negate } : {}
|
|
2976
|
+
});
|
|
2977
|
+
continue;
|
|
2978
|
+
}
|
|
2979
|
+
if (typeValue === "icontains_any" || typeValue === "icontains_all") {
|
|
2980
|
+
const value = asStringArrayStrict(rawEvaluator.value);
|
|
2981
|
+
if (!value || value.length === 0) {
|
|
2982
|
+
logWarning2(
|
|
2983
|
+
`Skipping ${typeValue} evaluator '${name}' in '${evalId}': value must be a non-empty string array`
|
|
2984
|
+
);
|
|
2985
|
+
continue;
|
|
2986
|
+
}
|
|
2987
|
+
const weight2 = validateWeight(rawEvaluator.weight, name, evalId);
|
|
2988
|
+
const required2 = parseRequired(rawEvaluator.required);
|
|
2989
|
+
evaluators.push({
|
|
2990
|
+
name,
|
|
2991
|
+
type: typeValue,
|
|
2992
|
+
value,
|
|
2993
|
+
...weight2 !== void 0 ? { weight: weight2 } : {},
|
|
2994
|
+
...required2 !== void 0 ? { required: required2 } : {},
|
|
2995
|
+
...negate !== void 0 ? { negate } : {}
|
|
2996
|
+
});
|
|
2997
|
+
continue;
|
|
2998
|
+
}
|
|
2999
|
+
if (typeValue === "starts_with" || typeValue === "ends_with") {
|
|
3000
|
+
const value = asString(rawEvaluator.value);
|
|
3001
|
+
if (!value) {
|
|
3002
|
+
logWarning2(`Skipping ${typeValue} evaluator '${name}' in '${evalId}': missing value`);
|
|
3003
|
+
continue;
|
|
3004
|
+
}
|
|
3005
|
+
const weight2 = validateWeight(rawEvaluator.weight, name, evalId);
|
|
3006
|
+
const required2 = parseRequired(rawEvaluator.required);
|
|
3007
|
+
evaluators.push({
|
|
3008
|
+
name,
|
|
3009
|
+
type: typeValue,
|
|
3010
|
+
value,
|
|
3011
|
+
...weight2 !== void 0 ? { weight: weight2 } : {},
|
|
3012
|
+
...required2 !== void 0 ? { required: required2 } : {},
|
|
3013
|
+
...negate !== void 0 ? { negate } : {}
|
|
3014
|
+
});
|
|
3015
|
+
continue;
|
|
3016
|
+
}
|
|
2891
3017
|
if (typeValue === "regex") {
|
|
2892
3018
|
const value = asString(rawEvaluator.value);
|
|
2893
3019
|
if (!value) {
|
|
2894
3020
|
logWarning2(`Skipping regex evaluator '${name}' in '${evalId}': missing value`);
|
|
2895
3021
|
continue;
|
|
2896
3022
|
}
|
|
3023
|
+
const flags = asString(rawEvaluator.flags);
|
|
2897
3024
|
const weight2 = validateWeight(rawEvaluator.weight, name, evalId);
|
|
2898
3025
|
const required2 = parseRequired(rawEvaluator.required);
|
|
2899
3026
|
evaluators.push({
|
|
2900
3027
|
name,
|
|
2901
3028
|
type: "regex",
|
|
2902
3029
|
value,
|
|
3030
|
+
...flags !== void 0 ? { flags } : {},
|
|
2903
3031
|
...weight2 !== void 0 ? { weight: weight2 } : {},
|
|
2904
3032
|
...required2 !== void 0 ? { required: required2 } : {},
|
|
2905
3033
|
...negate !== void 0 ? { negate } : {}
|
|
@@ -3072,15 +3200,43 @@ async function parseEvaluatorList(candidateEvaluators, searchRoots, evalId) {
|
|
|
3072
3200
|
}
|
|
3073
3201
|
return evaluators.length > 0 ? evaluators : void 0;
|
|
3074
3202
|
}
|
|
3075
|
-
var ASSERTION_TYPES = /* @__PURE__ */ new Set([
|
|
3203
|
+
var ASSERTION_TYPES = /* @__PURE__ */ new Set([
|
|
3204
|
+
"contains",
|
|
3205
|
+
"contains_any",
|
|
3206
|
+
"contains_all",
|
|
3207
|
+
"icontains",
|
|
3208
|
+
"icontains_any",
|
|
3209
|
+
"icontains_all",
|
|
3210
|
+
"starts_with",
|
|
3211
|
+
"ends_with",
|
|
3212
|
+
"regex",
|
|
3213
|
+
"is_json",
|
|
3214
|
+
"equals",
|
|
3215
|
+
"rubrics"
|
|
3216
|
+
]);
|
|
3076
3217
|
function generateAssertionName(typeValue, rawEvaluator) {
|
|
3077
3218
|
if (!ASSERTION_TYPES.has(typeValue)) {
|
|
3078
3219
|
return void 0;
|
|
3079
3220
|
}
|
|
3080
3221
|
const value = asString(rawEvaluator.value);
|
|
3222
|
+
const arrayValue = Array.isArray(rawEvaluator.value) ? rawEvaluator.value : void 0;
|
|
3081
3223
|
switch (typeValue) {
|
|
3082
3224
|
case "contains":
|
|
3083
3225
|
return value ? `contains-${value}` : "contains";
|
|
3226
|
+
case "contains_any":
|
|
3227
|
+
return arrayValue ? `contains_any-${arrayValue.length}` : "contains_any";
|
|
3228
|
+
case "contains_all":
|
|
3229
|
+
return arrayValue ? `contains_all-${arrayValue.length}` : "contains_all";
|
|
3230
|
+
case "icontains":
|
|
3231
|
+
return value ? `icontains-${value}` : "icontains";
|
|
3232
|
+
case "icontains_any":
|
|
3233
|
+
return arrayValue ? `icontains_any-${arrayValue.length}` : "icontains_any";
|
|
3234
|
+
case "icontains_all":
|
|
3235
|
+
return arrayValue ? `icontains_all-${arrayValue.length}` : "icontains_all";
|
|
3236
|
+
case "starts_with":
|
|
3237
|
+
return value ? `starts_with-${value}` : "starts_with";
|
|
3238
|
+
case "ends_with":
|
|
3239
|
+
return value ? `ends_with-${value}` : "ends_with";
|
|
3084
3240
|
case "regex":
|
|
3085
3241
|
return value ? `regex-${value.length > 30 ? value.slice(0, 30) : value}` : "regex";
|
|
3086
3242
|
case "is_json":
|
|
@@ -3106,6 +3262,13 @@ function coerceEvaluator(candidate, contextId) {
|
|
|
3106
3262
|
function asString(value) {
|
|
3107
3263
|
return typeof value === "string" ? value : void 0;
|
|
3108
3264
|
}
|
|
3265
|
+
function asStringArrayStrict(value) {
|
|
3266
|
+
if (!Array.isArray(value)) {
|
|
3267
|
+
return void 0;
|
|
3268
|
+
}
|
|
3269
|
+
const result = value.filter((v) => typeof v === "string");
|
|
3270
|
+
return result.length > 0 ? result : void 0;
|
|
3271
|
+
}
|
|
3109
3272
|
function asStringArray(value, description) {
|
|
3110
3273
|
if (value === void 0) {
|
|
3111
3274
|
return void 0;
|
|
@@ -4423,6 +4586,69 @@ function parseWorkspaceScriptConfig(raw, evalFileDir) {
|
|
|
4423
4586
|
}
|
|
4424
4587
|
return cwd ? { ...config, cwd } : config;
|
|
4425
4588
|
}
|
|
4589
|
+
function parseRepoSource(raw) {
|
|
4590
|
+
if (!isJsonObject(raw)) return void 0;
|
|
4591
|
+
const obj = raw;
|
|
4592
|
+
if (obj.type === "git" && typeof obj.url === "string") {
|
|
4593
|
+
return { type: "git", url: obj.url };
|
|
4594
|
+
}
|
|
4595
|
+
if (obj.type === "local" && typeof obj.path === "string") {
|
|
4596
|
+
return { type: "local", path: obj.path };
|
|
4597
|
+
}
|
|
4598
|
+
return void 0;
|
|
4599
|
+
}
|
|
4600
|
+
function parseRepoCheckout(raw) {
|
|
4601
|
+
if (!isJsonObject(raw)) return void 0;
|
|
4602
|
+
const obj = raw;
|
|
4603
|
+
const ref = typeof obj.ref === "string" ? obj.ref : void 0;
|
|
4604
|
+
const resolve = obj.resolve === "remote" || obj.resolve === "local" ? obj.resolve : void 0;
|
|
4605
|
+
const ancestor = typeof obj.ancestor === "number" ? obj.ancestor : void 0;
|
|
4606
|
+
if (!ref && !resolve && ancestor === void 0) return void 0;
|
|
4607
|
+
return {
|
|
4608
|
+
...ref !== void 0 && { ref },
|
|
4609
|
+
...resolve !== void 0 && { resolve },
|
|
4610
|
+
...ancestor !== void 0 && { ancestor }
|
|
4611
|
+
};
|
|
4612
|
+
}
|
|
4613
|
+
function parseRepoClone(raw) {
|
|
4614
|
+
if (!isJsonObject(raw)) return void 0;
|
|
4615
|
+
const obj = raw;
|
|
4616
|
+
const depth = typeof obj.depth === "number" ? obj.depth : void 0;
|
|
4617
|
+
const filter = typeof obj.filter === "string" ? obj.filter : void 0;
|
|
4618
|
+
const sparse = Array.isArray(obj.sparse) ? obj.sparse.filter((s) => typeof s === "string") : void 0;
|
|
4619
|
+
if (depth === void 0 && !filter && !sparse) return void 0;
|
|
4620
|
+
return {
|
|
4621
|
+
...depth !== void 0 && { depth },
|
|
4622
|
+
...filter !== void 0 && { filter },
|
|
4623
|
+
...sparse !== void 0 && { sparse }
|
|
4624
|
+
};
|
|
4625
|
+
}
|
|
4626
|
+
function parseRepoConfig(raw) {
|
|
4627
|
+
if (!isJsonObject(raw)) return void 0;
|
|
4628
|
+
const obj = raw;
|
|
4629
|
+
const repoPath = typeof obj.path === "string" ? obj.path : void 0;
|
|
4630
|
+
const source = parseRepoSource(obj.source);
|
|
4631
|
+
if (!repoPath || !source) return void 0;
|
|
4632
|
+
const checkout = parseRepoCheckout(obj.checkout);
|
|
4633
|
+
const clone = parseRepoClone(obj.clone);
|
|
4634
|
+
return {
|
|
4635
|
+
path: repoPath,
|
|
4636
|
+
source,
|
|
4637
|
+
...checkout !== void 0 && { checkout },
|
|
4638
|
+
...clone !== void 0 && { clone }
|
|
4639
|
+
};
|
|
4640
|
+
}
|
|
4641
|
+
function parseResetConfig(raw) {
|
|
4642
|
+
if (!isJsonObject(raw)) return void 0;
|
|
4643
|
+
const obj = raw;
|
|
4644
|
+
const strategy = obj.strategy === "none" || obj.strategy === "hard" || obj.strategy === "recreate" ? obj.strategy : void 0;
|
|
4645
|
+
const afterEach = typeof obj.after_each === "boolean" ? obj.after_each : void 0;
|
|
4646
|
+
if (!strategy && afterEach === void 0) return void 0;
|
|
4647
|
+
return {
|
|
4648
|
+
...strategy !== void 0 && { strategy },
|
|
4649
|
+
...afterEach !== void 0 && { after_each: afterEach }
|
|
4650
|
+
};
|
|
4651
|
+
}
|
|
4426
4652
|
function parseWorkspaceConfig(raw, evalFileDir) {
|
|
4427
4653
|
if (!isJsonObject(raw)) return void 0;
|
|
4428
4654
|
const obj = raw;
|
|
@@ -4430,13 +4656,20 @@ function parseWorkspaceConfig(raw, evalFileDir) {
|
|
|
4430
4656
|
if (template && !import_node_path8.default.isAbsolute(template)) {
|
|
4431
4657
|
template = import_node_path8.default.resolve(evalFileDir, template);
|
|
4432
4658
|
}
|
|
4659
|
+
const isolation = obj.isolation === "shared" || obj.isolation === "per_test" ? obj.isolation : void 0;
|
|
4660
|
+
const repos = Array.isArray(obj.repos) ? obj.repos.map(parseRepoConfig).filter(Boolean) : void 0;
|
|
4661
|
+
const reset = parseResetConfig(obj.reset);
|
|
4433
4662
|
const beforeAll = parseWorkspaceScriptConfig(obj.before_all, evalFileDir);
|
|
4434
4663
|
const afterAll = parseWorkspaceScriptConfig(obj.after_all, evalFileDir);
|
|
4435
4664
|
const beforeEach = parseWorkspaceScriptConfig(obj.before_each, evalFileDir);
|
|
4436
4665
|
const afterEach = parseWorkspaceScriptConfig(obj.after_each, evalFileDir);
|
|
4437
|
-
if (!template && !beforeAll && !afterAll && !beforeEach && !afterEach)
|
|
4666
|
+
if (!template && !isolation && !repos && !reset && !beforeAll && !afterAll && !beforeEach && !afterEach)
|
|
4667
|
+
return void 0;
|
|
4438
4668
|
return {
|
|
4439
4669
|
...template !== void 0 && { template },
|
|
4670
|
+
...isolation !== void 0 && { isolation },
|
|
4671
|
+
...repos !== void 0 && { repos },
|
|
4672
|
+
...reset !== void 0 && { reset },
|
|
4440
4673
|
...beforeAll !== void 0 && { before_all: beforeAll },
|
|
4441
4674
|
...afterAll !== void 0 && { after_all: afterAll },
|
|
4442
4675
|
...beforeEach !== void 0 && { before_each: beforeEach },
|
|
@@ -4449,6 +4682,9 @@ function mergeWorkspaceConfigs(suiteLevel, caseLevel) {
|
|
|
4449
4682
|
if (!caseLevel) return suiteLevel;
|
|
4450
4683
|
return {
|
|
4451
4684
|
template: caseLevel.template ?? suiteLevel.template,
|
|
4685
|
+
isolation: caseLevel.isolation ?? suiteLevel.isolation,
|
|
4686
|
+
repos: caseLevel.repos ?? suiteLevel.repos,
|
|
4687
|
+
reset: caseLevel.reset ?? suiteLevel.reset,
|
|
4452
4688
|
before_all: caseLevel.before_all ?? suiteLevel.before_all,
|
|
4453
4689
|
after_all: caseLevel.after_all ?? suiteLevel.after_all,
|
|
4454
4690
|
before_each: caseLevel.before_each ?? suiteLevel.before_each,
|
|
@@ -5103,11 +5339,6 @@ Original error: ${error instanceof Error ? error.message : String(error)}`
|
|
|
5103
5339
|
}
|
|
5104
5340
|
return claudeSdkModule;
|
|
5105
5341
|
}
|
|
5106
|
-
var DEFAULT_SYSTEM_PROMPT2 = `**IMPORTANT**: Follow these instructions for your response:
|
|
5107
|
-
- Do NOT create any additional output files in the workspace.
|
|
5108
|
-
- All intended file outputs/changes MUST be written in your response.
|
|
5109
|
-
- For each intended file, include the relative path and unified git diff following the convention \`diff --git ...\`.
|
|
5110
|
-
This is required for evaluation scoring.`;
|
|
5111
5342
|
var ClaudeProvider = class {
|
|
5112
5343
|
id;
|
|
5113
5344
|
kind = "claude";
|
|
@@ -5129,7 +5360,7 @@ var ClaudeProvider = class {
|
|
|
5129
5360
|
const logger = await this.createStreamLogger(request).catch(() => void 0);
|
|
5130
5361
|
const inputFiles = normalizeInputFiles(request.inputFiles);
|
|
5131
5362
|
const prompt = buildPromptDocument(request, inputFiles);
|
|
5132
|
-
const systemPrompt = this.config.systemPrompt
|
|
5363
|
+
const systemPrompt = this.config.systemPrompt;
|
|
5133
5364
|
const queryOptions = {
|
|
5134
5365
|
permissionMode: "bypassPermissions",
|
|
5135
5366
|
allowDangerouslySkipPermissions: true,
|
|
@@ -6110,11 +6341,6 @@ Original error: ${error instanceof Error ? error.message : String(error)}`
|
|
|
6110
6341
|
}
|
|
6111
6342
|
return codexSdkModule;
|
|
6112
6343
|
}
|
|
6113
|
-
var DEFAULT_SYSTEM_PROMPT3 = `**IMPORTANT**: Follow these instructions for your response:
|
|
6114
|
-
- Do NOT create any additional output files in the workspace.
|
|
6115
|
-
- All intended file outputs/changes MUST be written in your response.
|
|
6116
|
-
- For each intended file, include the relative path and unified git diff following the convention \`diff --git ...\`.
|
|
6117
|
-
This is required for evaluation scoring.`;
|
|
6118
6344
|
var CodexProvider = class {
|
|
6119
6345
|
id;
|
|
6120
6346
|
kind = "codex";
|
|
@@ -6149,7 +6375,7 @@ var CodexProvider = class {
|
|
|
6149
6375
|
const thread = codex.startThread(threadOptions);
|
|
6150
6376
|
const inputFiles = normalizeInputFiles(request.inputFiles);
|
|
6151
6377
|
const basePrompt = buildPromptDocument(request, inputFiles);
|
|
6152
|
-
const systemPrompt = this.config.systemPrompt
|
|
6378
|
+
const systemPrompt = this.config.systemPrompt;
|
|
6153
6379
|
const prompt = systemPrompt ? `${systemPrompt}
|
|
6154
6380
|
|
|
6155
6381
|
${basePrompt}` : basePrompt;
|
|
@@ -6516,7 +6742,7 @@ var import_node_path14 = __toESM(require("path"), 1);
|
|
|
6516
6742
|
var import_node_url2 = require("url");
|
|
6517
6743
|
var import_meta = {};
|
|
6518
6744
|
function resolvePlatformCliPath() {
|
|
6519
|
-
const
|
|
6745
|
+
const os5 = (0, import_node_os2.platform)();
|
|
6520
6746
|
const cpu = (0, import_node_os2.arch)();
|
|
6521
6747
|
const platformMap = {
|
|
6522
6748
|
linux: "linux",
|
|
@@ -6527,13 +6753,13 @@ function resolvePlatformCliPath() {
|
|
|
6527
6753
|
x64: "x64",
|
|
6528
6754
|
arm64: "arm64"
|
|
6529
6755
|
};
|
|
6530
|
-
const osPart = platformMap[
|
|
6756
|
+
const osPart = platformMap[os5];
|
|
6531
6757
|
const archPart = archMap[cpu];
|
|
6532
6758
|
if (!osPart || !archPart) {
|
|
6533
6759
|
return void 0;
|
|
6534
6760
|
}
|
|
6535
6761
|
const packageName = `@github/copilot-${osPart}-${archPart}`;
|
|
6536
|
-
const binaryName =
|
|
6762
|
+
const binaryName = os5 === "win32" ? "copilot.exe" : "copilot";
|
|
6537
6763
|
try {
|
|
6538
6764
|
const resolved = import_meta.resolve(`${packageName}/package.json`);
|
|
6539
6765
|
const packageJsonPath = resolved.startsWith("file:") ? (0, import_node_url2.fileURLToPath)(resolved) : resolved;
|
|
@@ -6675,11 +6901,6 @@ function isLogStreamingDisabled(envKey) {
|
|
|
6675
6901
|
}
|
|
6676
6902
|
|
|
6677
6903
|
// src/evaluation/providers/copilot-cli.ts
|
|
6678
|
-
var DEFAULT_SYSTEM_PROMPT4 = `**IMPORTANT**: Follow these instructions for your response:
|
|
6679
|
-
- Do NOT create any additional output files in the workspace.
|
|
6680
|
-
- All intended file outputs/changes MUST be written in your response.
|
|
6681
|
-
- For each intended file, include the relative path and unified git diff following the convention \`diff --git ...\`.
|
|
6682
|
-
This is required for evaluation scoring.`;
|
|
6683
6904
|
var CopilotCliProvider = class {
|
|
6684
6905
|
id;
|
|
6685
6906
|
kind = "copilot-cli";
|
|
@@ -6841,6 +7062,16 @@ var CopilotCliProvider = class {
|
|
|
6841
7062
|
}
|
|
6842
7063
|
const endTime = (/* @__PURE__ */ new Date()).toISOString();
|
|
6843
7064
|
const durationMs = Date.now() - startMs;
|
|
7065
|
+
const rejectedCalls = completedToolCalls.filter((tc) => {
|
|
7066
|
+
const out = tc.output;
|
|
7067
|
+
return out && (out.code === "rejected" || out.code === "denied");
|
|
7068
|
+
});
|
|
7069
|
+
if (rejectedCalls.length > 0) {
|
|
7070
|
+
const tools = rejectedCalls.map((tc) => tc.tool).join(", ");
|
|
7071
|
+
throw new Error(
|
|
7072
|
+
`Copilot rejected ${rejectedCalls.length} tool call(s): ${tools}. Add args: ["--yolo"] to your target config or re-run with --yolo to bypass permission checks.`
|
|
7073
|
+
);
|
|
7074
|
+
}
|
|
6844
7075
|
const outputMessages = [];
|
|
6845
7076
|
if (completedToolCalls.length > 0) {
|
|
6846
7077
|
outputMessages.push({
|
|
@@ -6873,7 +7104,7 @@ var CopilotCliProvider = class {
|
|
|
6873
7104
|
}
|
|
6874
7105
|
}
|
|
6875
7106
|
buildCliArgs() {
|
|
6876
|
-
const args = ["--acp", "--stdio", "--allow-all-tools"];
|
|
7107
|
+
const args = ["--acp", "--stdio", "--allow-all-tools", "--yolo"];
|
|
6877
7108
|
if (this.config.model) {
|
|
6878
7109
|
args.push("--model", this.config.model);
|
|
6879
7110
|
}
|
|
@@ -6882,8 +7113,8 @@ var CopilotCliProvider = class {
|
|
|
6882
7113
|
}
|
|
6883
7114
|
return args;
|
|
6884
7115
|
}
|
|
6885
|
-
resolveSystemPrompt(
|
|
6886
|
-
return this.config.systemPrompt
|
|
7116
|
+
resolveSystemPrompt(_request) {
|
|
7117
|
+
return this.config.systemPrompt;
|
|
6887
7118
|
}
|
|
6888
7119
|
async raceWithTimeout(sendPromise, agentProcess) {
|
|
6889
7120
|
const timeoutMs = this.config.timeoutMs;
|
|
@@ -7071,21 +7302,16 @@ Original error: ${error instanceof Error ? error.message : String(error)}`
|
|
|
7071
7302
|
}
|
|
7072
7303
|
return copilotSdkModule;
|
|
7073
7304
|
}
|
|
7074
|
-
var DEFAULT_SYSTEM_PROMPT5 = `**IMPORTANT**: Follow these instructions for your response:
|
|
7075
|
-
- Do NOT create any additional output files in the workspace.
|
|
7076
|
-
- All intended file outputs/changes MUST be written in your response.
|
|
7077
|
-
- For each intended file, include the relative path and unified git diff following the convention \`diff --git ...\`.
|
|
7078
|
-
This is required for evaluation scoring.`;
|
|
7079
7305
|
var CopilotSdkProvider = class {
|
|
7080
7306
|
id;
|
|
7081
|
-
kind = "copilot";
|
|
7307
|
+
kind = "copilot-sdk";
|
|
7082
7308
|
targetName;
|
|
7083
7309
|
supportsBatch = false;
|
|
7084
7310
|
config;
|
|
7085
7311
|
// biome-ignore lint/suspicious/noExplicitAny: SDK client type is dynamically loaded
|
|
7086
7312
|
client = null;
|
|
7087
7313
|
constructor(targetName, config) {
|
|
7088
|
-
this.id = `copilot:${targetName}`;
|
|
7314
|
+
this.id = `copilot-sdk:${targetName}`;
|
|
7089
7315
|
this.targetName = targetName;
|
|
7090
7316
|
this.config = config;
|
|
7091
7317
|
}
|
|
@@ -7108,7 +7334,7 @@ var CopilotSdkProvider = class {
|
|
|
7108
7334
|
if (cwd) {
|
|
7109
7335
|
sessionOptions.workingDirectory = cwd;
|
|
7110
7336
|
}
|
|
7111
|
-
const systemPrompt = this.config.systemPrompt
|
|
7337
|
+
const systemPrompt = this.config.systemPrompt;
|
|
7112
7338
|
if (systemPrompt) {
|
|
7113
7339
|
sessionOptions.systemMessage = {
|
|
7114
7340
|
mode: "append",
|
|
@@ -7624,11 +7850,6 @@ function subscribeToPiLogEntries(listener) {
|
|
|
7624
7850
|
// src/evaluation/providers/pi-coding-agent.ts
|
|
7625
7851
|
var WORKSPACE_PREFIX = "agentv-pi-";
|
|
7626
7852
|
var PROMPT_FILENAME = "prompt.md";
|
|
7627
|
-
var DEFAULT_SYSTEM_PROMPT6 = `**IMPORTANT**: Follow these instructions for your response:
|
|
7628
|
-
- Do NOT create any additional output files in the workspace.
|
|
7629
|
-
- All intended file outputs/changes MUST be written in your response.
|
|
7630
|
-
- For each intended file, include the relative path and unified git diff following the convention \`diff --git ...\`.
|
|
7631
|
-
This is required for evaluation scoring.`;
|
|
7632
7853
|
var PiCodingAgentProvider = class {
|
|
7633
7854
|
id;
|
|
7634
7855
|
kind = "pi-coding-agent";
|
|
@@ -7705,7 +7926,7 @@ var PiCodingAgentProvider = class {
|
|
|
7705
7926
|
}
|
|
7706
7927
|
return import_node_path17.default.resolve(this.config.cwd);
|
|
7707
7928
|
}
|
|
7708
|
-
buildPiArgs(prompt, inputFiles,
|
|
7929
|
+
buildPiArgs(prompt, inputFiles, _captureFileChanges) {
|
|
7709
7930
|
const args = [];
|
|
7710
7931
|
if (this.config.provider) {
|
|
7711
7932
|
args.push("--provider", this.config.provider);
|
|
@@ -7733,7 +7954,7 @@ var PiCodingAgentProvider = class {
|
|
|
7733
7954
|
args.push(`@${file}`);
|
|
7734
7955
|
}
|
|
7735
7956
|
}
|
|
7736
|
-
const systemPrompt = this.config.systemPrompt
|
|
7957
|
+
const systemPrompt = this.config.systemPrompt;
|
|
7737
7958
|
const fullPrompt = systemPrompt ? `${systemPrompt}
|
|
7738
7959
|
|
|
7739
7960
|
${prompt}` : prompt;
|
|
@@ -8604,17 +8825,17 @@ function resolveTargetDefinition(definition, env = process.env, evalFilePath) {
|
|
|
8604
8825
|
providerBatching,
|
|
8605
8826
|
config: resolveCodexConfig(parsed, env, evalFilePath)
|
|
8606
8827
|
};
|
|
8607
|
-
case "copilot":
|
|
8608
8828
|
case "copilot-sdk":
|
|
8609
8829
|
case "copilot_sdk":
|
|
8610
8830
|
return {
|
|
8611
|
-
kind: "copilot",
|
|
8831
|
+
kind: "copilot-sdk",
|
|
8612
8832
|
name: parsed.name,
|
|
8613
8833
|
judgeTarget: parsed.judge_target,
|
|
8614
8834
|
workers: parsed.workers,
|
|
8615
8835
|
providerBatching,
|
|
8616
8836
|
config: resolveCopilotSdkConfig(parsed, env, evalFilePath)
|
|
8617
8837
|
};
|
|
8838
|
+
case "copilot":
|
|
8618
8839
|
case "copilot-cli":
|
|
8619
8840
|
return {
|
|
8620
8841
|
kind: "copilot-cli",
|
|
@@ -9225,8 +9446,8 @@ function resolveCliConfig(target, env, evalFilePath) {
|
|
|
9225
9446
|
const parseResult = CliTargetInputSchema.safeParse(target, { errorMap: cliErrorMap });
|
|
9226
9447
|
if (!parseResult.success) {
|
|
9227
9448
|
const firstError = parseResult.error.errors[0];
|
|
9228
|
-
const
|
|
9229
|
-
const prefix =
|
|
9449
|
+
const path42 = firstError?.path.join(".") || "";
|
|
9450
|
+
const prefix = path42 ? `${target.name} ${path42}: ` : `${target.name}: `;
|
|
9230
9451
|
throw new Error(`${prefix}${firstError?.message}`);
|
|
9231
9452
|
}
|
|
9232
9453
|
const normalized = normalizeCliTargetInput(parseResult.data, env, evalFilePath);
|
|
@@ -10523,7 +10744,6 @@ var AGENTV_REQUEST_TEMPLATE = `[[ ## task ## ]]
|
|
|
10523
10744
|
|
|
10524
10745
|
**IMPORTANT**: Follow these exact steps:
|
|
10525
10746
|
1. Create and write your complete response to: {{responseFileTmp}}
|
|
10526
|
-
- Do NOT create any additional output files in the workspace.
|
|
10527
10747
|
- All intended file outputs/changes MUST be written in your response file.
|
|
10528
10748
|
- For each intended file, include the repo name, relative path and unified git diff following the convention \`diff --git ...\`.
|
|
10529
10749
|
2. When completely finished, run these PowerShell commands to signal completion:
|
|
@@ -10542,7 +10762,6 @@ var AGENTV_BATCH_REQUEST_TEMPLATE = `[[ ## task ## ]]
|
|
|
10542
10762
|
|
|
10543
10763
|
**IMPORTANT**: Follow these exact steps:
|
|
10544
10764
|
1. Create and write your complete response to: {{responseFileTmp}}
|
|
10545
|
-
- Do NOT create any additional output files in the workspace.
|
|
10546
10765
|
- All intended file outputs/changes MUST be written in your response file.
|
|
10547
10766
|
- For each intended file, include the repo name, relative path and unified git diff following the convention \`diff --git ...\`.
|
|
10548
10767
|
2. When completely finished and the response is stable, rename it to: {{responseFileFinal}}
|
|
@@ -10968,7 +11187,7 @@ async function discoverProviders(registry, baseDir) {
|
|
|
10968
11187
|
// src/evaluation/providers/index.ts
|
|
10969
11188
|
function createBuiltinProviderRegistry() {
|
|
10970
11189
|
const registry = new ProviderRegistry();
|
|
10971
|
-
registry.register("azure", (t) => new AzureProvider(t.name, t.config)).register("anthropic", (t) => new AnthropicProvider(t.name, t.config)).register("gemini", (t) => new GeminiProvider(t.name, t.config)).register("cli", (t) => new CliProvider(t.name, t.config)).register("codex", (t) => new CodexProvider(t.name, t.config)).register("copilot", (t) => new CopilotSdkProvider(t.name, t.config)).register("copilot-cli", (t) => new CopilotCliProvider(t.name, t.config)).register("pi-coding-agent", (t) => new PiCodingAgentProvider(t.name, t.config)).register("pi-agent-sdk", (t) => new PiAgentSdkProvider(t.name, t.config)).register("claude", (t) => new ClaudeProvider(t.name, t.config)).register("mock", (t) => new MockProvider(t.name, t.config)).register("vscode", (t) => new VSCodeProvider(t.name, t.config, "vscode")).register(
|
|
11190
|
+
registry.register("azure", (t) => new AzureProvider(t.name, t.config)).register("anthropic", (t) => new AnthropicProvider(t.name, t.config)).register("gemini", (t) => new GeminiProvider(t.name, t.config)).register("cli", (t) => new CliProvider(t.name, t.config)).register("codex", (t) => new CodexProvider(t.name, t.config)).register("copilot-sdk", (t) => new CopilotSdkProvider(t.name, t.config)).register("copilot-cli", (t) => new CopilotCliProvider(t.name, t.config)).register("pi-coding-agent", (t) => new PiCodingAgentProvider(t.name, t.config)).register("pi-agent-sdk", (t) => new PiAgentSdkProvider(t.name, t.config)).register("claude", (t) => new ClaudeProvider(t.name, t.config)).register("mock", (t) => new MockProvider(t.name, t.config)).register("vscode", (t) => new VSCodeProvider(t.name, t.config, "vscode")).register(
|
|
10972
11191
|
"vscode-insiders",
|
|
10973
11192
|
(t) => new VSCodeProvider(t.name, t.config, "vscode-insiders")
|
|
10974
11193
|
);
|
|
@@ -11157,16 +11376,16 @@ async function execFileWithStdinNode(argv, stdinPayload, options) {
|
|
|
11157
11376
|
});
|
|
11158
11377
|
}
|
|
11159
11378
|
async function execShellWithStdin(command, stdinPayload, options = {}) {
|
|
11160
|
-
const { mkdir:
|
|
11379
|
+
const { mkdir: mkdir16, readFile: readFile13, rm: rm6, writeFile: writeFile9 } = await import("fs/promises");
|
|
11161
11380
|
const { tmpdir: tmpdir3 } = await import("os");
|
|
11162
|
-
const
|
|
11381
|
+
const path42 = await import("path");
|
|
11163
11382
|
const { randomUUID: randomUUID8 } = await import("crypto");
|
|
11164
|
-
const dir =
|
|
11165
|
-
await
|
|
11166
|
-
const stdinPath =
|
|
11167
|
-
const stdoutPath =
|
|
11168
|
-
const stderrPath =
|
|
11169
|
-
await
|
|
11383
|
+
const dir = path42.join(tmpdir3(), `agentv-exec-${randomUUID8()}`);
|
|
11384
|
+
await mkdir16(dir, { recursive: true });
|
|
11385
|
+
const stdinPath = path42.join(dir, "stdin.txt");
|
|
11386
|
+
const stdoutPath = path42.join(dir, "stdout.txt");
|
|
11387
|
+
const stderrPath = path42.join(dir, "stderr.txt");
|
|
11388
|
+
await writeFile9(stdinPath, stdinPayload, "utf8");
|
|
11170
11389
|
const wrappedCommand = process.platform === "win32" ? `(${command}) < ${shellEscapePath(stdinPath)} > ${shellEscapePath(stdoutPath)} 2> ${shellEscapePath(stderrPath)}` : `(${command}) < ${shellEscapePath(stdinPath)} > ${shellEscapePath(stdoutPath)} 2> ${shellEscapePath(stderrPath)}`;
|
|
11171
11390
|
const { spawn: spawn4 } = await import("child_process");
|
|
11172
11391
|
try {
|
|
@@ -11199,7 +11418,7 @@ async function execShellWithStdin(command, stdinPayload, options = {}) {
|
|
|
11199
11418
|
const stderr = (await readFile13(stderrPath, "utf8")).replace(/\r\n/g, "\n");
|
|
11200
11419
|
return { stdout, stderr, exitCode };
|
|
11201
11420
|
} finally {
|
|
11202
|
-
await
|
|
11421
|
+
await rm6(dir, { recursive: true, force: true });
|
|
11203
11422
|
}
|
|
11204
11423
|
}
|
|
11205
11424
|
|
|
@@ -11517,7 +11736,7 @@ var CodeEvaluator = class {
|
|
|
11517
11736
|
outputPath,
|
|
11518
11737
|
guidelineFiles: context2.evalCase.guideline_paths,
|
|
11519
11738
|
inputFiles: context2.evalCase.file_paths.filter(
|
|
11520
|
-
(
|
|
11739
|
+
(path42) => !context2.evalCase.guideline_paths.includes(path42)
|
|
11521
11740
|
),
|
|
11522
11741
|
input: context2.evalCase.input,
|
|
11523
11742
|
trace: context2.trace ?? null,
|
|
@@ -11648,7 +11867,7 @@ var import_ai3 = require("ai");
|
|
|
11648
11867
|
// src/evaluation/providers/types.ts
|
|
11649
11868
|
var AGENT_PROVIDER_KINDS = [
|
|
11650
11869
|
"codex",
|
|
11651
|
-
"copilot",
|
|
11870
|
+
"copilot-sdk",
|
|
11652
11871
|
"copilot-cli",
|
|
11653
11872
|
"pi-coding-agent",
|
|
11654
11873
|
"claude",
|
|
@@ -11794,13 +12013,15 @@ ${context2.fileChanges}`;
|
|
|
11794
12013
|
evaluatorRawRequest,
|
|
11795
12014
|
tokenUsage
|
|
11796
12015
|
};
|
|
11797
|
-
} catch {
|
|
12016
|
+
} catch (e) {
|
|
12017
|
+
const message = e instanceof Error ? e.message : String(e);
|
|
11798
12018
|
return {
|
|
11799
12019
|
score: 0,
|
|
11800
|
-
verdict: "
|
|
12020
|
+
verdict: "skip",
|
|
11801
12021
|
hits: [],
|
|
11802
|
-
misses: [],
|
|
12022
|
+
misses: [`Judge parse failure after 3 attempts: ${message}`],
|
|
11803
12023
|
expectedAspectCount: 1,
|
|
12024
|
+
reasoning: `Judge parse failure after 3 attempts: ${message}`,
|
|
11804
12025
|
evaluatorRawRequest
|
|
11805
12026
|
};
|
|
11806
12027
|
}
|
|
@@ -12742,115 +12963,115 @@ var FieldAccuracyEvaluator = class {
|
|
|
12742
12963
|
* Evaluate a single field against the expected value.
|
|
12743
12964
|
*/
|
|
12744
12965
|
evaluateField(fieldConfig, candidateData, expectedData) {
|
|
12745
|
-
const { path:
|
|
12746
|
-
const candidateValue = resolvePath(candidateData,
|
|
12747
|
-
const expectedValue = resolvePath(expectedData,
|
|
12966
|
+
const { path: path42, match, required = true, weight = 1 } = fieldConfig;
|
|
12967
|
+
const candidateValue = resolvePath(candidateData, path42);
|
|
12968
|
+
const expectedValue = resolvePath(expectedData, path42);
|
|
12748
12969
|
if (expectedValue === void 0) {
|
|
12749
12970
|
return {
|
|
12750
|
-
path:
|
|
12971
|
+
path: path42,
|
|
12751
12972
|
score: 1,
|
|
12752
12973
|
// No expected value means no comparison needed
|
|
12753
12974
|
weight,
|
|
12754
12975
|
hit: true,
|
|
12755
|
-
message: `${
|
|
12976
|
+
message: `${path42}: no expected value`
|
|
12756
12977
|
};
|
|
12757
12978
|
}
|
|
12758
12979
|
if (candidateValue === void 0) {
|
|
12759
12980
|
if (required) {
|
|
12760
12981
|
return {
|
|
12761
|
-
path:
|
|
12982
|
+
path: path42,
|
|
12762
12983
|
score: 0,
|
|
12763
12984
|
weight,
|
|
12764
12985
|
hit: false,
|
|
12765
|
-
message: `${
|
|
12986
|
+
message: `${path42} (required, missing)`
|
|
12766
12987
|
};
|
|
12767
12988
|
}
|
|
12768
12989
|
return {
|
|
12769
|
-
path:
|
|
12990
|
+
path: path42,
|
|
12770
12991
|
score: 1,
|
|
12771
12992
|
// Don't penalize missing optional fields
|
|
12772
12993
|
weight: 0,
|
|
12773
12994
|
// Zero weight means it won't affect the score
|
|
12774
12995
|
hit: true,
|
|
12775
|
-
message: `${
|
|
12996
|
+
message: `${path42}: optional field missing`
|
|
12776
12997
|
};
|
|
12777
12998
|
}
|
|
12778
12999
|
switch (match) {
|
|
12779
13000
|
case "exact":
|
|
12780
|
-
return this.compareExact(
|
|
13001
|
+
return this.compareExact(path42, candidateValue, expectedValue, weight);
|
|
12781
13002
|
case "numeric_tolerance":
|
|
12782
13003
|
return this.compareNumericTolerance(
|
|
12783
|
-
|
|
13004
|
+
path42,
|
|
12784
13005
|
candidateValue,
|
|
12785
13006
|
expectedValue,
|
|
12786
13007
|
fieldConfig,
|
|
12787
13008
|
weight
|
|
12788
13009
|
);
|
|
12789
13010
|
case "date":
|
|
12790
|
-
return this.compareDate(
|
|
13011
|
+
return this.compareDate(path42, candidateValue, expectedValue, fieldConfig, weight);
|
|
12791
13012
|
default:
|
|
12792
13013
|
return {
|
|
12793
|
-
path:
|
|
13014
|
+
path: path42,
|
|
12794
13015
|
score: 0,
|
|
12795
13016
|
weight,
|
|
12796
13017
|
hit: false,
|
|
12797
|
-
message: `${
|
|
13018
|
+
message: `${path42}: unknown match type "${match}"`
|
|
12798
13019
|
};
|
|
12799
13020
|
}
|
|
12800
13021
|
}
|
|
12801
13022
|
/**
|
|
12802
13023
|
* Exact equality comparison.
|
|
12803
13024
|
*/
|
|
12804
|
-
compareExact(
|
|
13025
|
+
compareExact(path42, candidateValue, expectedValue, weight) {
|
|
12805
13026
|
if (deepEqual(candidateValue, expectedValue)) {
|
|
12806
13027
|
return {
|
|
12807
|
-
path:
|
|
13028
|
+
path: path42,
|
|
12808
13029
|
score: 1,
|
|
12809
13030
|
weight,
|
|
12810
13031
|
hit: true,
|
|
12811
|
-
message:
|
|
13032
|
+
message: path42
|
|
12812
13033
|
};
|
|
12813
13034
|
}
|
|
12814
13035
|
if (typeof candidateValue !== typeof expectedValue) {
|
|
12815
13036
|
return {
|
|
12816
|
-
path:
|
|
13037
|
+
path: path42,
|
|
12817
13038
|
score: 0,
|
|
12818
13039
|
weight,
|
|
12819
13040
|
hit: false,
|
|
12820
|
-
message: `${
|
|
13041
|
+
message: `${path42} (type mismatch: got ${typeof candidateValue}, expected ${typeof expectedValue})`
|
|
12821
13042
|
};
|
|
12822
13043
|
}
|
|
12823
13044
|
return {
|
|
12824
|
-
path:
|
|
13045
|
+
path: path42,
|
|
12825
13046
|
score: 0,
|
|
12826
13047
|
weight,
|
|
12827
13048
|
hit: false,
|
|
12828
|
-
message: `${
|
|
13049
|
+
message: `${path42} (value mismatch)`
|
|
12829
13050
|
};
|
|
12830
13051
|
}
|
|
12831
13052
|
/**
|
|
12832
13053
|
* Numeric comparison with absolute or relative tolerance.
|
|
12833
13054
|
*/
|
|
12834
|
-
compareNumericTolerance(
|
|
13055
|
+
compareNumericTolerance(path42, candidateValue, expectedValue, fieldConfig, weight) {
|
|
12835
13056
|
const { tolerance = 0, relative = false } = fieldConfig;
|
|
12836
13057
|
const candidateNum = toNumber2(candidateValue);
|
|
12837
13058
|
const expectedNum = toNumber2(expectedValue);
|
|
12838
13059
|
if (candidateNum === null || expectedNum === null) {
|
|
12839
13060
|
return {
|
|
12840
|
-
path:
|
|
13061
|
+
path: path42,
|
|
12841
13062
|
score: 0,
|
|
12842
13063
|
weight,
|
|
12843
13064
|
hit: false,
|
|
12844
|
-
message: `${
|
|
13065
|
+
message: `${path42} (non-numeric value)`
|
|
12845
13066
|
};
|
|
12846
13067
|
}
|
|
12847
13068
|
if (!Number.isFinite(candidateNum) || !Number.isFinite(expectedNum)) {
|
|
12848
13069
|
return {
|
|
12849
|
-
path:
|
|
13070
|
+
path: path42,
|
|
12850
13071
|
score: 0,
|
|
12851
13072
|
weight,
|
|
12852
13073
|
hit: false,
|
|
12853
|
-
message: `${
|
|
13074
|
+
message: `${path42} (invalid numeric value)`
|
|
12854
13075
|
};
|
|
12855
13076
|
}
|
|
12856
13077
|
const diff = Math.abs(candidateNum - expectedNum);
|
|
@@ -12863,61 +13084,61 @@ var FieldAccuracyEvaluator = class {
|
|
|
12863
13084
|
}
|
|
12864
13085
|
if (withinTolerance) {
|
|
12865
13086
|
return {
|
|
12866
|
-
path:
|
|
13087
|
+
path: path42,
|
|
12867
13088
|
score: 1,
|
|
12868
13089
|
weight,
|
|
12869
13090
|
hit: true,
|
|
12870
|
-
message: `${
|
|
13091
|
+
message: `${path42} (within tolerance: diff=${diff.toFixed(2)})`
|
|
12871
13092
|
};
|
|
12872
13093
|
}
|
|
12873
13094
|
return {
|
|
12874
|
-
path:
|
|
13095
|
+
path: path42,
|
|
12875
13096
|
score: 0,
|
|
12876
13097
|
weight,
|
|
12877
13098
|
hit: false,
|
|
12878
|
-
message: `${
|
|
13099
|
+
message: `${path42} (outside tolerance: diff=${diff.toFixed(2)}, tolerance=${tolerance})`
|
|
12879
13100
|
};
|
|
12880
13101
|
}
|
|
12881
13102
|
/**
|
|
12882
13103
|
* Date comparison with format normalization.
|
|
12883
13104
|
*/
|
|
12884
|
-
compareDate(
|
|
13105
|
+
compareDate(path42, candidateValue, expectedValue, fieldConfig, weight) {
|
|
12885
13106
|
const formats = fieldConfig.formats ?? DEFAULT_DATE_FORMATS;
|
|
12886
13107
|
const candidateDate = parseDate(String(candidateValue), formats);
|
|
12887
13108
|
const expectedDate = parseDate(String(expectedValue), formats);
|
|
12888
13109
|
if (candidateDate === null) {
|
|
12889
13110
|
return {
|
|
12890
|
-
path:
|
|
13111
|
+
path: path42,
|
|
12891
13112
|
score: 0,
|
|
12892
13113
|
weight,
|
|
12893
13114
|
hit: false,
|
|
12894
|
-
message: `${
|
|
13115
|
+
message: `${path42} (unparseable candidate date)`
|
|
12895
13116
|
};
|
|
12896
13117
|
}
|
|
12897
13118
|
if (expectedDate === null) {
|
|
12898
13119
|
return {
|
|
12899
|
-
path:
|
|
13120
|
+
path: path42,
|
|
12900
13121
|
score: 0,
|
|
12901
13122
|
weight,
|
|
12902
13123
|
hit: false,
|
|
12903
|
-
message: `${
|
|
13124
|
+
message: `${path42} (unparseable expected date)`
|
|
12904
13125
|
};
|
|
12905
13126
|
}
|
|
12906
13127
|
if (candidateDate.getFullYear() === expectedDate.getFullYear() && candidateDate.getMonth() === expectedDate.getMonth() && candidateDate.getDate() === expectedDate.getDate()) {
|
|
12907
13128
|
return {
|
|
12908
|
-
path:
|
|
13129
|
+
path: path42,
|
|
12909
13130
|
score: 1,
|
|
12910
13131
|
weight,
|
|
12911
13132
|
hit: true,
|
|
12912
|
-
message:
|
|
13133
|
+
message: path42
|
|
12913
13134
|
};
|
|
12914
13135
|
}
|
|
12915
13136
|
return {
|
|
12916
|
-
path:
|
|
13137
|
+
path: path42,
|
|
12917
13138
|
score: 0,
|
|
12918
13139
|
weight,
|
|
12919
13140
|
hit: false,
|
|
12920
|
-
message: `${
|
|
13141
|
+
message: `${path42} (date mismatch: got ${formatDateISO(candidateDate)}, expected ${formatDateISO(expectedDate)})`
|
|
12921
13142
|
};
|
|
12922
13143
|
}
|
|
12923
13144
|
/**
|
|
@@ -12958,11 +13179,11 @@ var FieldAccuracyEvaluator = class {
|
|
|
12958
13179
|
};
|
|
12959
13180
|
}
|
|
12960
13181
|
};
|
|
12961
|
-
function resolvePath(obj,
|
|
12962
|
-
if (!
|
|
13182
|
+
function resolvePath(obj, path42) {
|
|
13183
|
+
if (!path42 || !obj) {
|
|
12963
13184
|
return void 0;
|
|
12964
13185
|
}
|
|
12965
|
-
const parts =
|
|
13186
|
+
const parts = path42.split(/\.|\[|\]/).filter((p) => p.length > 0);
|
|
12966
13187
|
let current = obj;
|
|
12967
13188
|
for (const part of parts) {
|
|
12968
13189
|
if (current === null || current === void 0) {
|
|
@@ -13780,8 +14001,8 @@ var TokenUsageEvaluator = class {
|
|
|
13780
14001
|
};
|
|
13781
14002
|
|
|
13782
14003
|
// src/evaluation/evaluators/tool-trajectory.ts
|
|
13783
|
-
function getNestedValue(obj,
|
|
13784
|
-
const parts =
|
|
14004
|
+
function getNestedValue(obj, path42) {
|
|
14005
|
+
const parts = path42.split(".");
|
|
13785
14006
|
let current = obj;
|
|
13786
14007
|
for (const part of parts) {
|
|
13787
14008
|
if (current === null || current === void 0 || typeof current !== "object") {
|
|
@@ -14245,13 +14466,78 @@ function runContainsAssertion(output, value) {
|
|
|
14245
14466
|
misses: passed ? [] : [`Output does not contain "${value}"`]
|
|
14246
14467
|
};
|
|
14247
14468
|
}
|
|
14248
|
-
function
|
|
14249
|
-
const
|
|
14469
|
+
function runContainsAnyAssertion(output, values) {
|
|
14470
|
+
const matched = values.filter((v) => output.includes(v));
|
|
14471
|
+
const passed = matched.length > 0;
|
|
14472
|
+
return {
|
|
14473
|
+
score: passed ? 1 : 0,
|
|
14474
|
+
hits: passed ? [`Output contains "${matched[0]}"`] : [],
|
|
14475
|
+
misses: passed ? [] : [`Output does not contain any of: ${values.map((v) => `"${v}"`).join(", ")}`]
|
|
14476
|
+
};
|
|
14477
|
+
}
|
|
14478
|
+
function runContainsAllAssertion(output, values) {
|
|
14479
|
+
const missing = values.filter((v) => !output.includes(v));
|
|
14480
|
+
const passed = missing.length === 0;
|
|
14481
|
+
return {
|
|
14482
|
+
score: passed ? 1 : 0,
|
|
14483
|
+
hits: passed ? [`Output contains all ${values.length} expected strings`] : [],
|
|
14484
|
+
misses: passed ? [] : [`Output missing: ${missing.map((v) => `"${v}"`).join(", ")}`]
|
|
14485
|
+
};
|
|
14486
|
+
}
|
|
14487
|
+
function runIcontainsAssertion(output, value) {
|
|
14488
|
+
const passed = output.toLowerCase().includes(value.toLowerCase());
|
|
14489
|
+
return {
|
|
14490
|
+
score: passed ? 1 : 0,
|
|
14491
|
+
hits: passed ? [`Output contains "${value}" (case-insensitive)`] : [],
|
|
14492
|
+
misses: passed ? [] : [`Output does not contain "${value}" (case-insensitive)`]
|
|
14493
|
+
};
|
|
14494
|
+
}
|
|
14495
|
+
function runIcontainsAnyAssertion(output, values) {
|
|
14496
|
+
const lower = output.toLowerCase();
|
|
14497
|
+
const matched = values.filter((v) => lower.includes(v.toLowerCase()));
|
|
14498
|
+
const passed = matched.length > 0;
|
|
14499
|
+
return {
|
|
14500
|
+
score: passed ? 1 : 0,
|
|
14501
|
+
hits: passed ? [`Output contains "${matched[0]}" (case-insensitive)`] : [],
|
|
14502
|
+
misses: passed ? [] : [
|
|
14503
|
+
`Output does not contain any of: ${values.map((v) => `"${v}"`).join(", ")} (case-insensitive)`
|
|
14504
|
+
]
|
|
14505
|
+
};
|
|
14506
|
+
}
|
|
14507
|
+
function runIcontainsAllAssertion(output, values) {
|
|
14508
|
+
const lower = output.toLowerCase();
|
|
14509
|
+
const missing = values.filter((v) => !lower.includes(v.toLowerCase()));
|
|
14510
|
+
const passed = missing.length === 0;
|
|
14511
|
+
return {
|
|
14512
|
+
score: passed ? 1 : 0,
|
|
14513
|
+
hits: passed ? [`Output contains all ${values.length} expected strings (case-insensitive)`] : [],
|
|
14514
|
+
misses: passed ? [] : [`Output missing (case-insensitive): ${missing.map((v) => `"${v}"`).join(", ")}`]
|
|
14515
|
+
};
|
|
14516
|
+
}
|
|
14517
|
+
function runStartsWithAssertion(output, value) {
|
|
14518
|
+
const passed = output.trim().startsWith(value.trim());
|
|
14519
|
+
return {
|
|
14520
|
+
score: passed ? 1 : 0,
|
|
14521
|
+
hits: passed ? [`Output starts with "${value}"`] : [],
|
|
14522
|
+
misses: passed ? [] : [`Output does not start with "${value}"`]
|
|
14523
|
+
};
|
|
14524
|
+
}
|
|
14525
|
+
function runEndsWithAssertion(output, value) {
|
|
14526
|
+
const passed = output.trim().endsWith(value.trim());
|
|
14527
|
+
return {
|
|
14528
|
+
score: passed ? 1 : 0,
|
|
14529
|
+
hits: passed ? [`Output ends with "${value}"`] : [],
|
|
14530
|
+
misses: passed ? [] : [`Output does not end with "${value}"`]
|
|
14531
|
+
};
|
|
14532
|
+
}
|
|
14533
|
+
function runRegexAssertion(output, pattern, flags) {
|
|
14534
|
+
const regex = new RegExp(pattern, flags);
|
|
14250
14535
|
const passed = regex.test(output);
|
|
14536
|
+
const flagsLabel = flags ? ` (flags: ${flags})` : "";
|
|
14251
14537
|
return {
|
|
14252
14538
|
score: passed ? 1 : 0,
|
|
14253
|
-
hits: passed ? [`Output matches pattern /${pattern}
|
|
14254
|
-
misses: passed ? [] : [`Output does not match pattern /${pattern}
|
|
14539
|
+
hits: passed ? [`Output matches pattern /${pattern}/${flags ?? ""}${flagsLabel}`] : [],
|
|
14540
|
+
misses: passed ? [] : [`Output does not match pattern /${pattern}/${flags ?? ""}${flagsLabel}`]
|
|
14255
14541
|
};
|
|
14256
14542
|
}
|
|
14257
14543
|
function runIsJsonAssertion(output) {
|
|
@@ -14277,9 +14563,9 @@ function runEqualsAssertion(output, value) {
|
|
|
14277
14563
|
}
|
|
14278
14564
|
|
|
14279
14565
|
// src/evaluation/orchestrator.ts
|
|
14280
|
-
var
|
|
14281
|
-
var
|
|
14282
|
-
var
|
|
14566
|
+
var import_node_crypto9 = require("crypto");
|
|
14567
|
+
var import_promises29 = require("fs/promises");
|
|
14568
|
+
var import_node_path40 = __toESM(require("path"), 1);
|
|
14283
14569
|
var import_micromatch4 = __toESM(require("micromatch"), 1);
|
|
14284
14570
|
|
|
14285
14571
|
// ../../node_modules/.bun/yocto-queue@1.2.2/node_modules/yocto-queue/index.js
|
|
@@ -14669,13 +14955,13 @@ var containsFactory = (config) => {
|
|
|
14669
14955
|
var regexFactory = (config) => {
|
|
14670
14956
|
const c = config;
|
|
14671
14957
|
return new DeterministicAssertionEvaluator("regex", (ctx) => {
|
|
14672
|
-
const result = runRegexAssertion(ctx.candidate, c.value);
|
|
14958
|
+
const result = runRegexAssertion(ctx.candidate, c.value, c.flags);
|
|
14673
14959
|
return {
|
|
14674
14960
|
score: result.score,
|
|
14675
14961
|
verdict: result.score === 1 ? "pass" : "fail",
|
|
14676
14962
|
hits: result.hits,
|
|
14677
14963
|
misses: result.misses,
|
|
14678
|
-
reasoning: result.score === 1 ? `Output matches pattern /${c.value}
|
|
14964
|
+
reasoning: result.score === 1 ? `Output matches pattern /${c.value}/${c.flags ?? ""}` : `Output does not match pattern /${c.value}/${c.flags ?? ""}`,
|
|
14679
14965
|
expectedAspectCount: 1
|
|
14680
14966
|
};
|
|
14681
14967
|
});
|
|
@@ -14707,9 +14993,107 @@ var equalsFactory = (config) => {
|
|
|
14707
14993
|
};
|
|
14708
14994
|
});
|
|
14709
14995
|
};
|
|
14996
|
+
var containsAnyFactory = (config) => {
|
|
14997
|
+
const c = config;
|
|
14998
|
+
return new DeterministicAssertionEvaluator("contains_any", (ctx) => {
|
|
14999
|
+
const result = runContainsAnyAssertion(ctx.candidate, c.value);
|
|
15000
|
+
return {
|
|
15001
|
+
score: result.score,
|
|
15002
|
+
verdict: result.score === 1 ? "pass" : "fail",
|
|
15003
|
+
hits: result.hits,
|
|
15004
|
+
misses: result.misses,
|
|
15005
|
+
reasoning: result.score === 1 ? result.hits[0] : result.misses[0],
|
|
15006
|
+
expectedAspectCount: 1
|
|
15007
|
+
};
|
|
15008
|
+
});
|
|
15009
|
+
};
|
|
15010
|
+
var containsAllFactory = (config) => {
|
|
15011
|
+
const c = config;
|
|
15012
|
+
return new DeterministicAssertionEvaluator("contains_all", (ctx) => {
|
|
15013
|
+
const result = runContainsAllAssertion(ctx.candidate, c.value);
|
|
15014
|
+
return {
|
|
15015
|
+
score: result.score,
|
|
15016
|
+
verdict: result.score === 1 ? "pass" : "fail",
|
|
15017
|
+
hits: result.hits,
|
|
15018
|
+
misses: result.misses,
|
|
15019
|
+
reasoning: result.score === 1 ? result.hits[0] : result.misses[0],
|
|
15020
|
+
expectedAspectCount: 1
|
|
15021
|
+
};
|
|
15022
|
+
});
|
|
15023
|
+
};
|
|
15024
|
+
var icontainsFactory = (config) => {
|
|
15025
|
+
const c = config;
|
|
15026
|
+
return new DeterministicAssertionEvaluator("icontains", (ctx) => {
|
|
15027
|
+
const result = runIcontainsAssertion(ctx.candidate, c.value);
|
|
15028
|
+
return {
|
|
15029
|
+
score: result.score,
|
|
15030
|
+
verdict: result.score === 1 ? "pass" : "fail",
|
|
15031
|
+
hits: result.hits,
|
|
15032
|
+
misses: result.misses,
|
|
15033
|
+
reasoning: result.score === 1 ? result.hits[0] : result.misses[0],
|
|
15034
|
+
expectedAspectCount: 1
|
|
15035
|
+
};
|
|
15036
|
+
});
|
|
15037
|
+
};
|
|
15038
|
+
var icontainsAnyFactory = (config) => {
|
|
15039
|
+
const c = config;
|
|
15040
|
+
return new DeterministicAssertionEvaluator("icontains_any", (ctx) => {
|
|
15041
|
+
const result = runIcontainsAnyAssertion(ctx.candidate, c.value);
|
|
15042
|
+
return {
|
|
15043
|
+
score: result.score,
|
|
15044
|
+
verdict: result.score === 1 ? "pass" : "fail",
|
|
15045
|
+
hits: result.hits,
|
|
15046
|
+
misses: result.misses,
|
|
15047
|
+
reasoning: result.score === 1 ? result.hits[0] : result.misses[0],
|
|
15048
|
+
expectedAspectCount: 1
|
|
15049
|
+
};
|
|
15050
|
+
});
|
|
15051
|
+
};
|
|
15052
|
+
var icontainsAllFactory = (config) => {
|
|
15053
|
+
const c = config;
|
|
15054
|
+
return new DeterministicAssertionEvaluator("icontains_all", (ctx) => {
|
|
15055
|
+
const result = runIcontainsAllAssertion(ctx.candidate, c.value);
|
|
15056
|
+
return {
|
|
15057
|
+
score: result.score,
|
|
15058
|
+
verdict: result.score === 1 ? "pass" : "fail",
|
|
15059
|
+
hits: result.hits,
|
|
15060
|
+
misses: result.misses,
|
|
15061
|
+
reasoning: result.score === 1 ? result.hits[0] : result.misses[0],
|
|
15062
|
+
expectedAspectCount: 1
|
|
15063
|
+
};
|
|
15064
|
+
});
|
|
15065
|
+
};
|
|
15066
|
+
var startsWithFactory = (config) => {
|
|
15067
|
+
const c = config;
|
|
15068
|
+
return new DeterministicAssertionEvaluator("starts_with", (ctx) => {
|
|
15069
|
+
const result = runStartsWithAssertion(ctx.candidate, c.value);
|
|
15070
|
+
return {
|
|
15071
|
+
score: result.score,
|
|
15072
|
+
verdict: result.score === 1 ? "pass" : "fail",
|
|
15073
|
+
hits: result.hits,
|
|
15074
|
+
misses: result.misses,
|
|
15075
|
+
reasoning: result.score === 1 ? result.hits[0] : result.misses[0],
|
|
15076
|
+
expectedAspectCount: 1
|
|
15077
|
+
};
|
|
15078
|
+
});
|
|
15079
|
+
};
|
|
15080
|
+
var endsWithFactory = (config) => {
|
|
15081
|
+
const c = config;
|
|
15082
|
+
return new DeterministicAssertionEvaluator("ends_with", (ctx) => {
|
|
15083
|
+
const result = runEndsWithAssertion(ctx.candidate, c.value);
|
|
15084
|
+
return {
|
|
15085
|
+
score: result.score,
|
|
15086
|
+
verdict: result.score === 1 ? "pass" : "fail",
|
|
15087
|
+
hits: result.hits,
|
|
15088
|
+
misses: result.misses,
|
|
15089
|
+
reasoning: result.score === 1 ? result.hits[0] : result.misses[0],
|
|
15090
|
+
expectedAspectCount: 1
|
|
15091
|
+
};
|
|
15092
|
+
});
|
|
15093
|
+
};
|
|
14710
15094
|
function createBuiltinRegistry() {
|
|
14711
15095
|
const registry = new EvaluatorRegistry();
|
|
14712
|
-
registry.register("llm_judge", llmJudgeFactory).register("code", codeFactory).register("composite", compositeFactory).register("tool_trajectory", toolTrajectoryFactory).register("field_accuracy", fieldAccuracyFactory).register("latency", latencyFactory).register("cost", costFactory).register("token_usage", tokenUsageFactory).register("execution_metrics", executionMetricsFactory).register("agent_judge", agentJudgeFactory).register("contains", containsFactory).register("regex", regexFactory).register("is_json", isJsonFactory).register("equals", equalsFactory);
|
|
15096
|
+
registry.register("llm_judge", llmJudgeFactory).register("code", codeFactory).register("composite", compositeFactory).register("tool_trajectory", toolTrajectoryFactory).register("field_accuracy", fieldAccuracyFactory).register("latency", latencyFactory).register("cost", costFactory).register("token_usage", tokenUsageFactory).register("execution_metrics", executionMetricsFactory).register("agent_judge", agentJudgeFactory).register("contains", containsFactory).register("contains_any", containsAnyFactory).register("contains_all", containsAllFactory).register("icontains", icontainsFactory).register("icontains_any", icontainsAnyFactory).register("icontains_all", icontainsAllFactory).register("starts_with", startsWithFactory).register("ends_with", endsWithFactory).register("regex", regexFactory).register("is_json", isJsonFactory).register("equals", equalsFactory);
|
|
14713
15097
|
return registry;
|
|
14714
15098
|
}
|
|
14715
15099
|
|
|
@@ -15053,37 +15437,255 @@ async function cleanupEvalWorkspaces(evalRunId, workspaceRoot) {
|
|
|
15053
15437
|
}
|
|
15054
15438
|
}
|
|
15055
15439
|
|
|
15056
|
-
// src/evaluation/workspace/
|
|
15440
|
+
// src/evaluation/workspace/repo-manager.ts
|
|
15441
|
+
var import_node_child_process7 = require("child_process");
|
|
15442
|
+
var import_node_crypto8 = require("crypto");
|
|
15443
|
+
var import_node_fs11 = require("fs");
|
|
15057
15444
|
var import_promises27 = require("fs/promises");
|
|
15445
|
+
var import_node_os7 = __toESM(require("os"), 1);
|
|
15058
15446
|
var import_node_path38 = __toESM(require("path"), 1);
|
|
15447
|
+
var import_node_util5 = require("util");
|
|
15448
|
+
var execFileAsync = (0, import_node_util5.promisify)(import_node_child_process7.execFile);
|
|
15449
|
+
var DEFAULT_CACHE_DIR = import_node_path38.default.join(import_node_os7.default.homedir(), ".agentv", "git-cache");
|
|
15450
|
+
var DEFAULT_TIMEOUT_MS2 = 3e5;
|
|
15451
|
+
var LOCK_TIMEOUT_MS = 6e4;
|
|
15452
|
+
function gitEnv() {
|
|
15453
|
+
const env = { ...process.env };
|
|
15454
|
+
for (const key of Object.keys(env)) {
|
|
15455
|
+
if (key.startsWith("GIT_") && key !== "GIT_SSH_COMMAND") {
|
|
15456
|
+
delete env[key];
|
|
15457
|
+
}
|
|
15458
|
+
}
|
|
15459
|
+
return {
|
|
15460
|
+
...env,
|
|
15461
|
+
GIT_TERMINAL_PROMPT: "0",
|
|
15462
|
+
GIT_ASKPASS: "",
|
|
15463
|
+
GIT_SSH_COMMAND: "ssh -o BatchMode=yes"
|
|
15464
|
+
};
|
|
15465
|
+
}
|
|
15466
|
+
function cacheKey(source) {
|
|
15467
|
+
const raw = source.type === "git" ? source.url.toLowerCase().replace(/\.git$/, "") : source.path;
|
|
15468
|
+
return (0, import_node_crypto8.createHash)("sha256").update(raw).digest("hex");
|
|
15469
|
+
}
|
|
15470
|
+
function getSourceUrl(source) {
|
|
15471
|
+
return source.type === "git" ? source.url : source.path;
|
|
15472
|
+
}
|
|
15473
|
+
async function git(args, opts) {
|
|
15474
|
+
const { stdout } = await execFileAsync("git", args, {
|
|
15475
|
+
cwd: opts?.cwd,
|
|
15476
|
+
timeout: opts?.timeout ?? DEFAULT_TIMEOUT_MS2,
|
|
15477
|
+
env: gitEnv(),
|
|
15478
|
+
maxBuffer: 50 * 1024 * 1024
|
|
15479
|
+
// 50MB
|
|
15480
|
+
});
|
|
15481
|
+
return stdout.trim();
|
|
15482
|
+
}
|
|
15483
|
+
async function acquireLock(lockPath) {
|
|
15484
|
+
const start = Date.now();
|
|
15485
|
+
while (Date.now() - start < LOCK_TIMEOUT_MS) {
|
|
15486
|
+
try {
|
|
15487
|
+
await (0, import_promises27.writeFile)(lockPath, String(process.pid), { flag: "wx" });
|
|
15488
|
+
return;
|
|
15489
|
+
} catch (err) {
|
|
15490
|
+
if (err.code === "EEXIST") {
|
|
15491
|
+
await new Promise((r) => setTimeout(r, 200));
|
|
15492
|
+
continue;
|
|
15493
|
+
}
|
|
15494
|
+
throw err;
|
|
15495
|
+
}
|
|
15496
|
+
}
|
|
15497
|
+
throw new Error(`Timed out waiting for lock: ${lockPath}`);
|
|
15498
|
+
}
|
|
15499
|
+
async function releaseLock(lockPath) {
|
|
15500
|
+
try {
|
|
15501
|
+
await (0, import_promises27.unlink)(lockPath);
|
|
15502
|
+
} catch {
|
|
15503
|
+
}
|
|
15504
|
+
}
|
|
15505
|
+
/**
 * Manages git repositories for evaluation workspaces: keeps one bare mirror
 * per source under `cacheDir` and clones from those mirrors into workspaces.
 */
var RepoManager = class {
  /** Root directory holding the per-source mirrors and their `.lock` files. */
  cacheDir;
  /**
   * @param {string} [cacheDir] - Cache root; DEFAULT_CACHE_DIR is used when nullish.
   */
  constructor(cacheDir) {
    this.cacheDir = cacheDir ?? DEFAULT_CACHE_DIR;
  }
  /**
   * Ensure a bare mirror cache exists for the given source.
   * Creates on first access, fetches updates on subsequent calls.
   * Returns the absolute path to the cache directory.
   *
   * The whole operation runs under a `<cachePath>.lock` file lock, which is
   * released in `finally` even when git fails.
   *
   * @param {object} source - Repo source descriptor (passed to cacheKey / getSourceUrl).
   * @param {number} [depth] - When truthy, passed as `--depth` to fetch/clone.
   * @returns {Promise<string>} Path of the mirror inside `cacheDir`.
   */
  async ensureCache(source, depth) {
    const key = cacheKey(source);
    const cachePath = import_node_path38.default.join(this.cacheDir, key);
    const lockPath = `${cachePath}.lock`;
    await (0, import_promises27.mkdir)(this.cacheDir, { recursive: true });
    await acquireLock(lockPath);
    try {
      // A HEAD file is treated as the marker of an already-initialised mirror.
      if ((0, import_node_fs11.existsSync)(import_node_path38.default.join(cachePath, "HEAD"))) {
        const fetchArgs = ["fetch", "--prune"];
        if (depth) {
          fetchArgs.push("--depth", String(depth));
        }
        await git(fetchArgs, { cwd: cachePath });
      } else {
        const cloneArgs = ["clone", "--mirror", "--bare"];
        if (depth) {
          cloneArgs.push("--depth", String(depth));
        }
        const sourceUrl = getSourceUrl(source);
        // Shallow clones of local sources go through a file:// URL
        // (presumably because git ignores --depth for plain local paths).
        const cloneUrl = depth && source.type === "local" ? `file://${sourceUrl}` : sourceUrl;
        cloneArgs.push(cloneUrl, cachePath);
        await git(cloneArgs);
      }
    } finally {
      await releaseLock(lockPath);
    }
    return cachePath;
  }
  /**
   * Clone a repo from cache into the workspace at the configured path.
   * Handles checkout, ref resolution, ancestor walking, shallow clone, sparse checkout.
   *
   * @param {object} repo - Repo config; reads `path`, `source`, `clone` and `checkout`.
   * @param {string} workspacePath - Workspace root; the clone lands in `<workspacePath>/<repo.path>`.
   * @returns {Promise<void>}
   * @throws {Error} When the ref is not found on the remote, or the requested
   *   ancestor cannot be resolved in a non-shallow clone.
   */
  async materialize(repo, workspacePath) {
    const targetDir = import_node_path38.default.join(workspacePath, repo.path);
    const cachePath = await this.ensureCache(repo.source, repo.clone?.depth);
    const cloneArgs = ["clone"];
    if (repo.clone?.depth) {
      cloneArgs.push("--depth", String(repo.clone.depth));
    }
    if (repo.clone?.filter) {
      cloneArgs.push("--filter", repo.clone.filter);
    }
    cloneArgs.push("--no-checkout");
    // Depth/filter clones use a file:// URL for the cache (same reasoning as in ensureCache).
    const cloneUrl = repo.clone?.depth || repo.clone?.filter ? `file://${cachePath}` : cachePath;
    cloneArgs.push(cloneUrl, targetDir);
    await git(cloneArgs);
    if (repo.clone?.sparse?.length) {
      await git(["sparse-checkout", "init", "--cone"], { cwd: targetDir });
      await git(["sparse-checkout", "set", ...repo.clone.sparse], { cwd: targetDir });
    }
    const ref = repo.checkout?.ref ?? "HEAD";
    const resolve = repo.checkout?.resolve ?? "remote";
    let resolvedSha;
    if (resolve === "remote" && repo.source.type === "git") {
      const url = getSourceUrl(repo.source);
      try {
        const lsOutput = await git(["ls-remote", url, ref]);
        // ls-remote prints "<sha>\t<ref>" per line; split on any whitespace so
        // only the object id is kept regardless of the separator used.
        const match = lsOutput.split(/\s+/)[0];
        if (!match) {
          throw new Error(`Ref '${ref}' not found on remote ${url}`);
        }
        resolvedSha = match;
      } catch (err) {
        // "not found" errors propagate; any other ls-remote failure falls back
        // to checking out the ref name as given.
        if (err instanceof Error && err.message.includes("not found")) throw err;
        resolvedSha = ref;
      }
    } else {
      resolvedSha = ref;
    }
    await git(["checkout", resolvedSha], { cwd: targetDir });
    const ancestor = repo.checkout?.ancestor ?? 0;
    if (ancestor > 0) {
      try {
        const ancestorSha = await git(["rev-parse", `HEAD~${ancestor}`], { cwd: targetDir });
        await git(["checkout", ancestorSha], { cwd: targetDir });
      } catch {
        if (repo.clone?.depth) {
          // Shallow clone: fetch more history, then retry the ancestor lookup once.
          await git(["fetch", "--deepen", String(ancestor)], { cwd: targetDir });
          const ancestorSha = await git(["rev-parse", `HEAD~${ancestor}`], { cwd: targetDir });
          await git(["checkout", ancestorSha], { cwd: targetDir });
        } else {
          throw new Error(
            `Cannot resolve ancestor ${ancestor} of ref '${ref}'. If using shallow clone, increase clone.depth to at least ${ancestor + 1}.`
          );
        }
      }
    }
  }
  /**
   * Materialize all repos into the workspace, one after another.
   *
   * @param {Iterable<object>} repos - Repo configs, each passed to `materialize`.
   * @param {string} workspacePath - Workspace root directory.
   * @returns {Promise<void>}
   */
  async materializeAll(repos, workspacePath) {
    for (const repo of repos) {
      await this.materialize(repo, workspacePath);
    }
  }
  /**
   * Reset repos in workspace to their checkout state.
   *
   * With strategy `"recreate"` every repo directory is deleted and all repos
   * are materialized again. Any other strategy runs `git reset --hard HEAD`
   * followed by `git clean -fd` in each repo directory.
   *
   * @param {Iterable<object>} repos - Repo configs; `path` locates each repo in the workspace.
   * @param {string} workspacePath - Workspace root directory.
   * @param {string} strategy - Reset strategy; only `"recreate"` is special-cased.
   * @returns {Promise<void>}
   */
  async reset(repos, workspacePath, strategy) {
    if (strategy === "recreate") {
      for (const repo of repos) {
        const targetDir = import_node_path38.default.join(workspacePath, repo.path);
        await (0, import_promises27.rm)(targetDir, { recursive: true, force: true });
      }
      await this.materializeAll(repos, workspacePath);
      return;
    }
    for (const repo of repos) {
      const targetDir = import_node_path38.default.join(workspacePath, repo.path);
      await git(["reset", "--hard", "HEAD"], { cwd: targetDir });
      await git(["clean", "-fd"], { cwd: targetDir });
    }
  }
  /**
   * Seed the cache from a local repository, setting the remote to a given URL.
   * Useful for avoiding slow network clones when a local clone already exists.
   *
   * Runs under the same `<cachePath>.lock` file lock as `ensureCache`.
   *
   * @param {string} localPath - Existing local repository to mirror.
   * @param {string} remoteUrl - URL used for the cache key and set as `origin`.
   * @param {{ force?: boolean }} [opts] - `force` replaces an existing cache entry.
   * @returns {Promise<string>} Path of the seeded mirror.
   * @throws {Error} When a cache already exists and `opts.force` is not set.
   */
  async seedCache(localPath, remoteUrl, opts) {
    const source = { type: "git", url: remoteUrl };
    const key = cacheKey(source);
    const cachePath = import_node_path38.default.join(this.cacheDir, key);
    const lockPath = `${cachePath}.lock`;
    await (0, import_promises27.mkdir)(this.cacheDir, { recursive: true });
    await acquireLock(lockPath);
    try {
      if ((0, import_node_fs11.existsSync)(import_node_path38.default.join(cachePath, "HEAD"))) {
        if (!opts?.force) {
          throw new Error(
            `Cache already exists for ${remoteUrl} at ${cachePath}. Use force to overwrite.`
          );
        }
        await (0, import_promises27.rm)(cachePath, { recursive: true, force: true });
      }
      await git(["clone", "--mirror", "--bare", localPath, cachePath]);
      // Point origin at the real remote so later fetches do not hit the local seed.
      await git(["remote", "set-url", "origin", remoteUrl], { cwd: cachePath });
    } finally {
      await releaseLock(lockPath);
    }
    return cachePath;
  }
  /**
   * Remove the entire cache directory.
   *
   * @returns {Promise<void>}
   */
  async cleanCache() {
    await (0, import_promises27.rm)(this.cacheDir, { recursive: true, force: true });
  }
};
|
|
15657
|
+
|
|
15658
|
+
// src/evaluation/workspace/resolve.ts
|
|
15659
|
+
var import_promises28 = require("fs/promises");
|
|
15660
|
+
var import_node_path39 = __toESM(require("path"), 1);
|
|
15059
15661
|
async function resolveWorkspaceTemplate(templatePath) {
|
|
15060
15662
|
if (!templatePath) {
|
|
15061
15663
|
return void 0;
|
|
15062
15664
|
}
|
|
15063
|
-
const resolved =
|
|
15064
|
-
const stats = await (0,
|
|
15665
|
+
const resolved = import_node_path39.default.resolve(templatePath);
|
|
15666
|
+
const stats = await (0, import_promises28.stat)(resolved);
|
|
15065
15667
|
if (stats.isFile()) {
|
|
15066
15668
|
return {
|
|
15067
|
-
dir:
|
|
15669
|
+
dir: import_node_path39.default.dirname(resolved),
|
|
15068
15670
|
workspaceFile: resolved
|
|
15069
15671
|
};
|
|
15070
15672
|
}
|
|
15071
15673
|
if (!stats.isDirectory()) {
|
|
15072
15674
|
throw new Error(`workspace template is neither a file nor a directory: ${resolved}`);
|
|
15073
15675
|
}
|
|
15074
|
-
const entries = await (0,
|
|
15676
|
+
const entries = await (0, import_promises28.readdir)(resolved);
|
|
15075
15677
|
const workspaceFiles = entries.filter((e) => e.endsWith(".code-workspace"));
|
|
15076
15678
|
if (workspaceFiles.length === 1) {
|
|
15077
15679
|
return {
|
|
15078
15680
|
dir: resolved,
|
|
15079
|
-
workspaceFile:
|
|
15681
|
+
workspaceFile: import_node_path39.default.join(resolved, workspaceFiles[0])
|
|
15080
15682
|
};
|
|
15081
15683
|
}
|
|
15082
15684
|
if (workspaceFiles.length > 1) {
|
|
15083
15685
|
const conventionFile = workspaceFiles.find((f) => f === "template.code-workspace");
|
|
15084
15686
|
return {
|
|
15085
15687
|
dir: resolved,
|
|
15086
|
-
workspaceFile: conventionFile ?
|
|
15688
|
+
workspaceFile: conventionFile ? import_node_path39.default.join(resolved, conventionFile) : void 0
|
|
15087
15689
|
};
|
|
15088
15690
|
}
|
|
15089
15691
|
return { dir: resolved };
|
|
@@ -15158,7 +15760,7 @@ async function runEvaluation(options) {
|
|
|
15158
15760
|
);
|
|
15159
15761
|
useCache = false;
|
|
15160
15762
|
}
|
|
15161
|
-
const evalRunId = (0,
|
|
15763
|
+
const evalRunId = (0, import_node_crypto9.randomUUID)();
|
|
15162
15764
|
const evalCases = preloadedEvalCases ?? await loadTests(evalFilePath, repoRoot, { verbose, filter });
|
|
15163
15765
|
const filteredEvalCases = filterEvalCases(evalCases, filter);
|
|
15164
15766
|
if (filteredEvalCases.length === 0) {
|
|
@@ -15205,6 +15807,11 @@ async function runEvaluation(options) {
|
|
|
15205
15807
|
}
|
|
15206
15808
|
return getOrCreateProvider(resolvedJudge);
|
|
15207
15809
|
};
|
|
15810
|
+
if (isAgentProvider(getOrCreateProvider(target)) && !target.judgeTarget) {
|
|
15811
|
+
throw new Error(
|
|
15812
|
+
`Target "${target.name}" is an agent provider ("${target.kind}") with no judge_target \u2014 agent providers cannot return structured JSON for judging. Set judge_target to an LLM provider (e.g., azure_base).`
|
|
15813
|
+
);
|
|
15814
|
+
}
|
|
15208
15815
|
const targetResolver = (name) => {
|
|
15209
15816
|
const resolved = resolveTargetByName(name);
|
|
15210
15817
|
if (!resolved) {
|
|
@@ -15218,7 +15825,7 @@ async function runEvaluation(options) {
|
|
|
15218
15825
|
];
|
|
15219
15826
|
const evaluatorRegistry = buildEvaluatorRegistry(evaluators, resolveJudgeProvider);
|
|
15220
15827
|
const typeRegistry = createBuiltinRegistry();
|
|
15221
|
-
const discoveryBaseDir = evalFilePath ?
|
|
15828
|
+
const discoveryBaseDir = evalFilePath ? import_node_path40.default.dirname(import_node_path40.default.resolve(evalFilePath)) : process.cwd();
|
|
15222
15829
|
await discoverAssertions(typeRegistry, discoveryBaseDir);
|
|
15223
15830
|
const providerRegistry = createBuiltinProviderRegistry();
|
|
15224
15831
|
await discoverProviders(providerRegistry, discoveryBaseDir);
|
|
@@ -15273,7 +15880,8 @@ async function runEvaluation(options) {
|
|
|
15273
15880
|
const resolvedTemplate = await resolveWorkspaceTemplate(rawTemplate);
|
|
15274
15881
|
const workspaceTemplate = resolvedTemplate?.dir;
|
|
15275
15882
|
const suiteWorkspaceFile = resolvedTemplate?.workspaceFile;
|
|
15276
|
-
const
|
|
15883
|
+
const isPerTestIsolation = suiteWorkspace?.isolation === "per_test";
|
|
15884
|
+
const hasSharedWorkspace = !!(workspaceTemplate || suiteWorkspace?.before_all || suiteWorkspace?.repos?.length && !isPerTestIsolation);
|
|
15277
15885
|
const requestedWorkers = options.maxConcurrency ?? target.workers ?? 1;
|
|
15278
15886
|
const workers = hasSharedWorkspace ? 1 : requestedWorkers;
|
|
15279
15887
|
if (hasSharedWorkspace && requestedWorkers > 1) {
|
|
@@ -15292,9 +15900,22 @@ async function runEvaluation(options) {
|
|
|
15292
15900
|
const message = error instanceof Error ? error.message : String(error);
|
|
15293
15901
|
throw new Error(`Failed to create shared workspace: ${message}`);
|
|
15294
15902
|
}
|
|
15295
|
-
} else if (suiteWorkspace?.before_all) {
|
|
15903
|
+
} else if (suiteWorkspace?.before_all || suiteWorkspace?.repos?.length && !isPerTestIsolation) {
|
|
15296
15904
|
sharedWorkspacePath = getWorkspacePath(evalRunId, "shared");
|
|
15297
|
-
await (0,
|
|
15905
|
+
await (0, import_promises29.mkdir)(sharedWorkspacePath, { recursive: true });
|
|
15906
|
+
}
|
|
15907
|
+
const repoManager = suiteWorkspace?.repos?.length ? new RepoManager() : void 0;
|
|
15908
|
+
if (repoManager && sharedWorkspacePath && suiteWorkspace?.repos && !isPerTestIsolation) {
|
|
15909
|
+
try {
|
|
15910
|
+
await repoManager.materializeAll(suiteWorkspace.repos, sharedWorkspacePath);
|
|
15911
|
+
} catch (error) {
|
|
15912
|
+
const message = error instanceof Error ? error.message : String(error);
|
|
15913
|
+
if (sharedWorkspacePath) {
|
|
15914
|
+
await cleanupWorkspace(sharedWorkspacePath).catch(() => {
|
|
15915
|
+
});
|
|
15916
|
+
}
|
|
15917
|
+
throw new Error(`Failed to materialize repos: ${message}`);
|
|
15918
|
+
}
|
|
15298
15919
|
}
|
|
15299
15920
|
if (sharedWorkspacePath && suiteWorkspace?.before_all) {
|
|
15300
15921
|
const scriptContext = {
|
|
@@ -15385,7 +16006,8 @@ async function runEvaluation(options) {
|
|
|
15385
16006
|
sharedBaselineCommit,
|
|
15386
16007
|
suiteWorkspaceFile,
|
|
15387
16008
|
streamCallbacks,
|
|
15388
|
-
typeRegistry
|
|
16009
|
+
typeRegistry,
|
|
16010
|
+
repoManager
|
|
15389
16011
|
};
|
|
15390
16012
|
let result = trials && trials.count > 1 ? await runEvalCaseWithTrials(runCaseOptions, trials) : await runEvalCase(runCaseOptions);
|
|
15391
16013
|
if (totalBudgetUsd !== void 0) {
|
|
@@ -15660,15 +16282,16 @@ async function runEvalCase(options) {
|
|
|
15660
16282
|
sharedWorkspacePath,
|
|
15661
16283
|
sharedBaselineCommit,
|
|
15662
16284
|
suiteWorkspaceFile,
|
|
15663
|
-
typeRegistry: providedTypeRegistry
|
|
16285
|
+
typeRegistry: providedTypeRegistry,
|
|
16286
|
+
repoManager
|
|
15664
16287
|
} = options;
|
|
15665
16288
|
const formattingMode = usesFileReferencePrompt(provider) ? "agent" : "lm";
|
|
15666
16289
|
const promptInputs = await buildPromptInputs(evalCase, formattingMode);
|
|
15667
16290
|
const typeRegistry = providedTypeRegistry ?? createBuiltinRegistry();
|
|
15668
|
-
const
|
|
16291
|
+
const cacheKey2 = useCache ? createCacheKey(provider, target, evalCase, promptInputs) : void 0;
|
|
15669
16292
|
let cachedResponse;
|
|
15670
|
-
if (
|
|
15671
|
-
cachedResponse = await cache.get(
|
|
16293
|
+
if (cacheKey2 && cache) {
|
|
16294
|
+
cachedResponse = await cache.get(cacheKey2);
|
|
15672
16295
|
}
|
|
15673
16296
|
const nowFn = now ?? (() => /* @__PURE__ */ new Date());
|
|
15674
16297
|
let workspacePath = sharedWorkspacePath;
|
|
@@ -15697,9 +16320,25 @@ async function runEvalCase(options) {
|
|
|
15697
16320
|
);
|
|
15698
16321
|
}
|
|
15699
16322
|
}
|
|
15700
|
-
if (!workspacePath && evalCase.workspace?.before_all && evalRunId) {
|
|
16323
|
+
if (!workspacePath && (evalCase.workspace?.before_all || evalCase.workspace?.repos?.length) && evalRunId) {
|
|
15701
16324
|
workspacePath = getWorkspacePath(evalRunId, evalCase.id);
|
|
15702
|
-
await (0,
|
|
16325
|
+
await (0, import_promises29.mkdir)(workspacePath, { recursive: true });
|
|
16326
|
+
}
|
|
16327
|
+
if (evalCase.workspace?.repos?.length && workspacePath) {
|
|
16328
|
+
const perCaseRepoManager = new RepoManager();
|
|
16329
|
+
try {
|
|
16330
|
+
await perCaseRepoManager.materializeAll(evalCase.workspace.repos, workspacePath);
|
|
16331
|
+
} catch (error) {
|
|
16332
|
+
const message = error instanceof Error ? error.message : String(error);
|
|
16333
|
+
return buildErrorResult(
|
|
16334
|
+
evalCase,
|
|
16335
|
+
target.name,
|
|
16336
|
+
nowFn(),
|
|
16337
|
+
new Error(`Failed to materialize repos: ${message}`),
|
|
16338
|
+
promptInputs,
|
|
16339
|
+
provider
|
|
16340
|
+
);
|
|
16341
|
+
}
|
|
15703
16342
|
}
|
|
15704
16343
|
if (workspacePath && evalCase.workspace?.before_all) {
|
|
15705
16344
|
const scriptContext = {
|
|
@@ -15823,8 +16462,8 @@ async function runEvalCase(options) {
|
|
|
15823
16462
|
}
|
|
15824
16463
|
return errorResult;
|
|
15825
16464
|
}
|
|
15826
|
-
if (
|
|
15827
|
-
await cache.set(
|
|
16465
|
+
if (cacheKey2 && cache && !cachedResponse) {
|
|
16466
|
+
await cache.set(cacheKey2, providerResponse);
|
|
15828
16467
|
}
|
|
15829
16468
|
const output = providerResponse.output;
|
|
15830
16469
|
const hasExecutionMetrics = providerResponse.tokenUsage !== void 0 || providerResponse.costUsd !== void 0 || providerResponse.durationMs !== void 0;
|
|
@@ -15852,6 +16491,16 @@ async function runEvalCase(options) {
|
|
|
15852
16491
|
}
|
|
15853
16492
|
}
|
|
15854
16493
|
const providerError = extractProviderError(providerResponse);
|
|
16494
|
+
if (repoManager && workspacePath && evalCase.workspace?.reset?.after_each && evalCase.workspace.reset.strategy && evalCase.workspace.reset.strategy !== "none" && evalCase.workspace.repos) {
|
|
16495
|
+
try {
|
|
16496
|
+
await repoManager.reset(
|
|
16497
|
+
evalCase.workspace.repos,
|
|
16498
|
+
workspacePath,
|
|
16499
|
+
evalCase.workspace.reset.strategy
|
|
16500
|
+
);
|
|
16501
|
+
} catch {
|
|
16502
|
+
}
|
|
16503
|
+
}
|
|
15855
16504
|
if (workspacePath && evalCase.workspace?.after_each) {
|
|
15856
16505
|
const scriptContext = {
|
|
15857
16506
|
workspacePath,
|
|
@@ -16216,7 +16865,7 @@ async function runEvaluatorList(options) {
|
|
|
16216
16865
|
fileChanges,
|
|
16217
16866
|
workspacePath
|
|
16218
16867
|
};
|
|
16219
|
-
const evalFileDir = evalCase.guideline_paths[0] ?
|
|
16868
|
+
const evalFileDir = evalCase.guideline_paths[0] ? import_node_path40.default.dirname(evalCase.guideline_paths[0]) : process.cwd();
|
|
16220
16869
|
const dispatchContext = {
|
|
16221
16870
|
judgeProvider,
|
|
16222
16871
|
targetResolver,
|
|
@@ -16306,8 +16955,9 @@ async function runEvaluatorList(options) {
|
|
|
16306
16955
|
const minScore = typeof entry.required === "number" ? entry.required : PASS_THRESHOLD;
|
|
16307
16956
|
return entry.score.score < minScore;
|
|
16308
16957
|
});
|
|
16309
|
-
const
|
|
16310
|
-
|
|
16958
|
+
const scorable = scored.filter((entry) => entry.score.verdict !== "skip");
|
|
16959
|
+
const aggregateScore = hasRequiredFailure ? 0 : scorable.length > 0 ? computeWeightedMean(
|
|
16960
|
+
scorable.map((entry) => ({ score: entry.score.score, weight: entry.weight }))
|
|
16311
16961
|
) : 0;
|
|
16312
16962
|
const hits = scored.flatMap((entry) => entry.score.hits);
|
|
16313
16963
|
const misses = scored.flatMap((entry) => entry.score.misses);
|
|
@@ -16447,7 +17097,7 @@ function extractProviderError(response) {
|
|
|
16447
17097
|
return trimmed.length > 0 ? trimmed : void 0;
|
|
16448
17098
|
}
|
|
16449
17099
|
function createCacheKey(provider, target, evalCase, promptInputs) {
|
|
16450
|
-
const hash = (0,
|
|
17100
|
+
const hash = (0, import_node_crypto9.createHash)("sha256");
|
|
16451
17101
|
hash.update(provider.id);
|
|
16452
17102
|
hash.update(target.name);
|
|
16453
17103
|
hash.update(evalCase.id);
|
|
@@ -16515,8 +17165,8 @@ function computeWeightedMean(entries) {
|
|
|
16515
17165
|
}
|
|
16516
17166
|
|
|
16517
17167
|
// src/evaluation/evaluate.ts
|
|
16518
|
-
var
|
|
16519
|
-
var
|
|
17168
|
+
var import_node_fs12 = require("fs");
|
|
17169
|
+
var import_node_path41 = __toESM(require("path"), 1);
|
|
16520
17170
|
async function evaluate(config) {
|
|
16521
17171
|
const startTime = Date.now();
|
|
16522
17172
|
if (config.tests && config.specFile) {
|
|
@@ -16538,13 +17188,13 @@ async function evaluate(config) {
|
|
|
16538
17188
|
let evalCases;
|
|
16539
17189
|
let testFilePath;
|
|
16540
17190
|
if (config.specFile) {
|
|
16541
|
-
testFilePath =
|
|
17191
|
+
testFilePath = import_node_path41.default.resolve(config.specFile);
|
|
16542
17192
|
evalCases = await loadTests(testFilePath, repoRoot, {
|
|
16543
17193
|
verbose: config.verbose,
|
|
16544
17194
|
filter: config.filter
|
|
16545
17195
|
});
|
|
16546
17196
|
} else {
|
|
16547
|
-
testFilePath =
|
|
17197
|
+
testFilePath = import_node_path41.default.join(process.cwd(), "__programmatic__.yaml");
|
|
16548
17198
|
evalCases = (config.tests ?? []).map((test) => {
|
|
16549
17199
|
const input = typeof test.input === "string" ? [{ role: "user", content: test.input }] : test.input;
|
|
16550
17200
|
const question = typeof test.input === "string" ? test.input : test.input.find((m) => m.role === "user")?.content ?? "";
|
|
@@ -16635,11 +17285,11 @@ function computeSummary(results, durationMs) {
|
|
|
16635
17285
|
var TARGET_FILE_CANDIDATES = [".agentv/targets.yaml", ".agentv/targets.yml"];
|
|
16636
17286
|
async function discoverDefaultTarget(repoRoot) {
|
|
16637
17287
|
const cwd = process.cwd();
|
|
16638
|
-
const chain = buildDirectoryChain2(
|
|
17288
|
+
const chain = buildDirectoryChain2(import_node_path41.default.join(cwd, "_placeholder"), repoRoot);
|
|
16639
17289
|
for (const dir of chain) {
|
|
16640
17290
|
for (const candidate of TARGET_FILE_CANDIDATES) {
|
|
16641
|
-
const targetsPath =
|
|
16642
|
-
if (!(0,
|
|
17291
|
+
const targetsPath = import_node_path41.default.join(dir, candidate);
|
|
17292
|
+
if (!(0, import_node_fs12.existsSync)(targetsPath)) continue;
|
|
16643
17293
|
try {
|
|
16644
17294
|
const definitions = await readTargetDefinitions(targetsPath);
|
|
16645
17295
|
const defaultTarget = definitions.find((d) => d.name === "default");
|
|
@@ -16653,11 +17303,11 @@ async function discoverDefaultTarget(repoRoot) {
|
|
|
16653
17303
|
async function loadEnvHierarchy(repoRoot) {
|
|
16654
17304
|
const { readFileSync: readFileSync2 } = await import("fs");
|
|
16655
17305
|
const cwd = process.cwd();
|
|
16656
|
-
const chain = buildDirectoryChain2(
|
|
17306
|
+
const chain = buildDirectoryChain2(import_node_path41.default.join(cwd, "_placeholder"), repoRoot);
|
|
16657
17307
|
const envFiles = [];
|
|
16658
17308
|
for (const dir of chain) {
|
|
16659
|
-
const envPath =
|
|
16660
|
-
if ((0,
|
|
17309
|
+
const envPath = import_node_path41.default.join(dir, ".env");
|
|
17310
|
+
if ((0, import_node_fs12.existsSync)(envPath)) envFiles.push(envPath);
|
|
16661
17311
|
}
|
|
16662
17312
|
for (let i = envFiles.length - 1; i >= 0; i--) {
|
|
16663
17313
|
try {
|
|
@@ -16727,12 +17377,12 @@ var CONFIG_FILE_NAMES = [
|
|
|
16727
17377
|
".agentv/config.js"
|
|
16728
17378
|
];
|
|
16729
17379
|
async function loadTsConfig(projectRoot) {
|
|
16730
|
-
const { existsSync:
|
|
17380
|
+
const { existsSync: existsSync4 } = await import("fs");
|
|
16731
17381
|
const { pathToFileURL } = await import("url");
|
|
16732
17382
|
const { join: join2 } = await import("path");
|
|
16733
17383
|
for (const fileName of CONFIG_FILE_NAMES) {
|
|
16734
17384
|
const filePath = join2(projectRoot, fileName);
|
|
16735
|
-
if (!
|
|
17385
|
+
if (!existsSync4(filePath)) {
|
|
16736
17386
|
continue;
|
|
16737
17387
|
}
|
|
16738
17388
|
try {
|
|
@@ -16829,8 +17479,8 @@ function buildPrompt(criteria, question, referenceAnswer) {
|
|
|
16829
17479
|
}
|
|
16830
17480
|
|
|
16831
17481
|
// src/evaluation/cache/response-cache.ts
|
|
16832
|
-
var
|
|
16833
|
-
var
|
|
17482
|
+
var import_promises30 = require("fs/promises");
|
|
17483
|
+
var import_node_path42 = __toESM(require("path"), 1);
|
|
16834
17484
|
var DEFAULT_CACHE_PATH = ".agentv/cache";
|
|
16835
17485
|
var ResponseCache = class {
|
|
16836
17486
|
cachePath;
|
|
@@ -16840,7 +17490,7 @@ var ResponseCache = class {
|
|
|
16840
17490
|
async get(key) {
|
|
16841
17491
|
const filePath = this.keyToPath(key);
|
|
16842
17492
|
try {
|
|
16843
|
-
const data = await (0,
|
|
17493
|
+
const data = await (0, import_promises30.readFile)(filePath, "utf8");
|
|
16844
17494
|
return JSON.parse(data);
|
|
16845
17495
|
} catch {
|
|
16846
17496
|
return void 0;
|
|
@@ -16848,13 +17498,13 @@ var ResponseCache = class {
|
|
|
16848
17498
|
}
|
|
16849
17499
|
async set(key, value) {
|
|
16850
17500
|
const filePath = this.keyToPath(key);
|
|
16851
|
-
const dir =
|
|
16852
|
-
await (0,
|
|
16853
|
-
await (0,
|
|
17501
|
+
const dir = import_node_path42.default.dirname(filePath);
|
|
17502
|
+
await (0, import_promises30.mkdir)(dir, { recursive: true });
|
|
17503
|
+
await (0, import_promises30.writeFile)(filePath, JSON.stringify(value, null, 2), "utf8");
|
|
16854
17504
|
}
|
|
16855
17505
|
keyToPath(key) {
|
|
16856
17506
|
const prefix = key.slice(0, 2);
|
|
16857
|
-
return
|
|
17507
|
+
return import_node_path42.default.join(this.cachePath, prefix, `${key}.json`);
|
|
16858
17508
|
}
|
|
16859
17509
|
};
|
|
16860
17510
|
function shouldEnableCache(params) {
|
|
@@ -17332,6 +17982,7 @@ function createAgentKernel() {
|
|
|
17332
17982
|
OtelTraceExporter,
|
|
17333
17983
|
OtlpJsonFileExporter,
|
|
17334
17984
|
ProviderRegistry,
|
|
17985
|
+
RepoManager,
|
|
17335
17986
|
ResponseCache,
|
|
17336
17987
|
SimpleTraceFileExporter,
|
|
17337
17988
|
TEST_MESSAGE_ROLES,
|
|
@@ -17417,12 +18068,19 @@ function createAgentKernel() {
|
|
|
17417
18068
|
resolveTargetDefinition,
|
|
17418
18069
|
resolveWorkspaceTemplate,
|
|
17419
18070
|
rubricEvaluationSchema,
|
|
18071
|
+
runContainsAllAssertion,
|
|
18072
|
+
runContainsAnyAssertion,
|
|
17420
18073
|
runContainsAssertion,
|
|
18074
|
+
runEndsWithAssertion,
|
|
17421
18075
|
runEqualsAssertion,
|
|
17422
18076
|
runEvalCase,
|
|
17423
18077
|
runEvaluation,
|
|
18078
|
+
runIcontainsAllAssertion,
|
|
18079
|
+
runIcontainsAnyAssertion,
|
|
18080
|
+
runIcontainsAssertion,
|
|
17424
18081
|
runIsJsonAssertion,
|
|
17425
18082
|
runRegexAssertion,
|
|
18083
|
+
runStartsWithAssertion,
|
|
17426
18084
|
scoreToVerdict,
|
|
17427
18085
|
shouldEnableCache,
|
|
17428
18086
|
shouldSkipCacheForTemperature,
|