agentv 2.11.4 → 2.13.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/dist/{chunk-KWUTY5XR.js → chunk-FSBZM3HT.js} +176 -31
- package/dist/chunk-FSBZM3HT.js.map +1 -0
- package/dist/{chunk-FBGAD3CQ.js → chunk-M6JYP6A6.js} +17 -55
- package/dist/chunk-M6JYP6A6.js.map +1 -0
- package/dist/{chunk-APGYGAVM.js → chunk-UWDI4UVN.js} +266 -34
- package/dist/chunk-UWDI4UVN.js.map +1 -0
- package/dist/cli.js +3 -3
- package/dist/{dist-RVGCGRG4.js → dist-CCUHG3SN.js} +4 -2
- package/dist/index.js +3 -3
- package/dist/{interactive-O2KBWGEI.js → interactive-P3D5O673.js} +3 -3
- package/package.json +4 -2
- package/dist/chunk-APGYGAVM.js.map +0 -1
- package/dist/chunk-FBGAD3CQ.js.map +0 -1
- package/dist/chunk-KWUTY5XR.js.map +0 -1
- /package/dist/{dist-RVGCGRG4.js.map → dist-CCUHG3SN.js.map} +0 -0
- /package/dist/{interactive-O2KBWGEI.js.map → interactive-P3D5O673.js.map} +0 -0
|
@@ -25,7 +25,59 @@ import {
|
|
|
25
25
|
subscribeToCopilotCliLogEntries,
|
|
26
26
|
subscribeToCopilotSdkLogEntries,
|
|
27
27
|
subscribeToPiLogEntries
|
|
28
|
-
} from "./chunk-
|
|
28
|
+
} from "./chunk-FSBZM3HT.js";
|
|
29
|
+
|
|
30
|
+
// package.json
|
|
31
|
+
var package_default = {
|
|
32
|
+
name: "agentv",
|
|
33
|
+
version: "2.13.0",
|
|
34
|
+
description: "CLI entry point for AgentV",
|
|
35
|
+
type: "module",
|
|
36
|
+
repository: {
|
|
37
|
+
type: "git",
|
|
38
|
+
url: "https://github.com/EntityProcess/agentv.git"
|
|
39
|
+
},
|
|
40
|
+
homepage: "https://github.com/EntityProcess/agentv#readme",
|
|
41
|
+
bugs: {
|
|
42
|
+
url: "https://github.com/EntityProcess/agentv/issues"
|
|
43
|
+
},
|
|
44
|
+
bin: {
|
|
45
|
+
agentv: "./dist/cli.js"
|
|
46
|
+
},
|
|
47
|
+
files: ["dist", "README.md"],
|
|
48
|
+
scripts: {
|
|
49
|
+
dev: "bun src/cli.ts",
|
|
50
|
+
build: "tsup && bun run copy-readme",
|
|
51
|
+
"copy-readme": `bun -e "import { cpSync } from 'fs'; cpSync('../../README.md', 'README.md')"`,
|
|
52
|
+
prepublishOnly: "bun run copy-readme",
|
|
53
|
+
typecheck: "tsc --noEmit",
|
|
54
|
+
lint: "biome check .",
|
|
55
|
+
format: "biome format --write .",
|
|
56
|
+
fix: "biome check --write .",
|
|
57
|
+
test: "bun test",
|
|
58
|
+
"test:watch": "bun test --watch"
|
|
59
|
+
},
|
|
60
|
+
dependencies: {
|
|
61
|
+
"@anthropic-ai/claude-agent-sdk": "^0.2.49",
|
|
62
|
+
"@github/copilot-sdk": "^0.1.25",
|
|
63
|
+
"@inquirer/prompts": "^8.2.1",
|
|
64
|
+
"@mariozechner/pi-agent-core": "^0.54.2",
|
|
65
|
+
"@mariozechner/pi-ai": "^0.54.2",
|
|
66
|
+
"@openai/codex-sdk": "^0.104.0",
|
|
67
|
+
"cmd-ts": "^0.14.3",
|
|
68
|
+
dotenv: "^16.4.5",
|
|
69
|
+
"fast-glob": "^3.3.3",
|
|
70
|
+
json5: "^2.2.3",
|
|
71
|
+
micromatch: "^4.0.8",
|
|
72
|
+
semver: "^7.7.4",
|
|
73
|
+
yaml: "^2.6.1"
|
|
74
|
+
},
|
|
75
|
+
devDependencies: {
|
|
76
|
+
"@agentv/core": "workspace:*",
|
|
77
|
+
"@types/semver": "^7.7.1",
|
|
78
|
+
execa: "^9.3.0"
|
|
79
|
+
}
|
|
80
|
+
};
|
|
29
81
|
|
|
30
82
|
// src/commands/eval/shared.ts
|
|
31
83
|
import { constants } from "node:fs";
|
|
@@ -152,6 +204,60 @@ import { access as access4 } from "node:fs/promises";
|
|
|
152
204
|
import path10 from "node:path";
|
|
153
205
|
import { pathToFileURL } from "node:url";
|
|
154
206
|
|
|
207
|
+
// src/version-check.ts
|
|
208
|
+
import { satisfies, validRange } from "semver";
|
|
209
|
+
var ANSI_YELLOW = "\x1B[33m";
|
|
210
|
+
var ANSI_RED = "\x1B[31m";
|
|
211
|
+
var ANSI_RESET = "\x1B[0m";
|
|
212
|
+
/**
 * Compare the installed agentv version with a required semver range.
 *
 * @param {string} requiredVersion - Range expression such as ">=2.11.0" or
 *   "^2.11.0" (the error text names `.agentv/config.yaml` as its origin).
 * @returns {{ satisfied: boolean, currentVersion: string, requiredRange: string }}
 *   `satisfied` is the result of semver's `satisfies(currentVersion, range)`;
 *   `currentVersion` comes from the bundled package.json; `requiredRange`
 *   echoes the input unchanged.
 * @throws {Error} If `requiredVersion` is not a string, is blank, or is
 *   rejected by semver's `validRange`.
 */
function checkVersion(requiredVersion) {
  const currentVersion = package_default.version;
  // Guard the type first: a non-string value (presumably possible when the
  // config value is written as an unquoted number - loader not visible here)
  // would otherwise make `.trim()` throw an unrelated TypeError and hide the
  // actionable message below.
  const isValidRange =
    typeof requiredVersion === "string" &&
    requiredVersion.trim() !== "" &&
    Boolean(validRange(requiredVersion));
  if (!isValidRange) {
    throw new Error(
      `Invalid required_version "${requiredVersion}" in .agentv/config.yaml. Must be a valid semver range (e.g., ">=2.11.0", "^2.11.0").`
    );
  }
  return {
    satisfied: satisfies(currentVersion, requiredVersion),
    currentVersion,
    requiredRange: requiredVersion
  };
}
|
|
225
|
+
/**
 * Apply the project's required-version policy to the running CLI.
 *
 * Outcomes, in order:
 *  - `checkVersion` throws: the message is printed in red and the process
 *    exits with code 1.
 *  - The range is satisfied: returns silently.
 *  - `options.strict` is truthy: the warning and an abort notice go to
 *    stderr, then the process exits with code 1.
 *  - stdin and stdout are both TTYs: the warning is shown and the user is
 *    asked whether to continue; declining exits with code 1.
 *  - Otherwise: the warning is written to stderr and execution continues.
 *
 * @param {string} requiredVersion - Range passed through to `checkVersion`.
 * @param {{ strict?: boolean }} [options] - `strict` turns a mismatch into a hard failure.
 * @returns {Promise<void>}
 */
async function enforceRequiredVersion(requiredVersion, options) {
  let verdict;
  try {
    verdict = checkVersion(requiredVersion);
  } catch (err) {
    console.error(`${ANSI_RED}Error: ${err.message}${ANSI_RESET}`);
    process.exit(1);
  }
  if (verdict.satisfied) {
    return;
  }

  const { requiredRange, currentVersion } = verdict;
  const warning = `${ANSI_YELLOW}Warning: This project requires agentv ${requiredRange} but you have ${currentVersion}.${ANSI_RESET}\nRun \`agentv self update\` to upgrade.`;

  if (options?.strict) {
    console.error(warning);
    console.error(
      `${ANSI_RED}Aborting: --strict mode requires the installed version to satisfy the required range.${ANSI_RESET}`
    );
    process.exit(1);
  }

  const isInteractive = process.stdin.isTTY && process.stdout.isTTY;
  if (!isInteractive) {
    // Nobody to ask: emit the warning on stderr and carry on.
    process.stderr.write(`${warning}\n`);
    return;
  }

  console.warn(warning);
  const shouldContinue = await promptContinue();
  if (!shouldContinue) {
    process.exit(1);
  }
}
|
|
256
|
+
/**
 * Ask the user a yes/no "Continue anyway?" question, defaulting to "no".
 *
 * The prompt library is imported lazily, at the point the question is asked.
 *
 * @returns {Promise<boolean>} Whatever the `confirm` prompt resolves to.
 */
async function promptContinue() {
  const question = { message: "Continue anyway?", default: false };
  const { confirm } = await import("@inquirer/prompts");
  return confirm(question);
}
|
|
260
|
+
|
|
155
261
|
// src/commands/eval/env.ts
|
|
156
262
|
import { constants as constants3 } from "node:fs";
|
|
157
263
|
import { access as access3 } from "node:fs/promises";
|
|
@@ -822,6 +928,49 @@ var ProgressDisplay = class {
|
|
|
822
928
|
}
|
|
823
929
|
};
|
|
824
930
|
|
|
931
|
+
// src/commands/eval/retry-errors.ts
|
|
932
|
+
import { createReadStream } from "node:fs";
|
|
933
|
+
import { createInterface } from "node:readline";
|
|
934
|
+
/**
 * Scan a JSONL results file and collect the ids of tests that ended in an
 * execution error.
 *
 * A line contributes its `testId` when it parses as JSON, has
 * `executionStatus === "execution_error"`, and has a truthy `testId`.
 * Blank lines and lines that fail to parse (or parse to `null`) are ignored.
 *
 * @param {string} jsonlPath - Path of the JSONL file to read.
 * @returns {Promise<Array>} Distinct test ids in order of first appearance.
 */
async function loadErrorTestIds(jsonlPath) {
  const uniqueIds = new Set();
  const reader = createInterface({
    input: createReadStream(jsonlPath),
    crlfDelay: Number.POSITIVE_INFINITY
  });
  for await (const rawLine of reader) {
    const text = rawLine.trim();
    if (text.length === 0) continue;
    try {
      const record = JSON.parse(text);
      const isExecutionError = record.executionStatus === "execution_error";
      if (isExecutionError && record.testId) {
        uniqueIds.add(record.testId);
      }
    } catch {
      // Best effort: a malformed line is skipped, not treated as fatal.
    }
  }
  return Array.from(uniqueIds);
}
|
|
953
|
+
/**
 * Read a JSONL results file and return every record that is not an
 * execution error.
 *
 * A record is kept when it has a truthy `testId`, a `score` that is not
 * `undefined`, and an `executionStatus` other than "execution_error".
 * Blank lines and lines that fail to parse (or parse to `null`) are ignored.
 *
 * @param {string} jsonlPath - Path of the JSONL file to read.
 * @returns {Promise<Array<object>>} Parsed records in file order.
 */
async function loadNonErrorResults(jsonlPath) {
  const kept = [];
  const reader = createInterface({
    input: createReadStream(jsonlPath),
    crlfDelay: Number.POSITIVE_INFINITY
  });
  for await (const rawLine of reader) {
    const text = rawLine.trim();
    if (text.length === 0) continue;
    try {
      const record = JSON.parse(text);
      const isScored = Boolean(record.testId) && record.score !== void 0;
      if (isScored && record.executionStatus !== "execution_error") {
        kept.push(record);
      }
    } catch {
      // Best effort: a malformed line is skipped, not treated as fatal.
    }
  }
  return kept;
}
|
|
973
|
+
|
|
825
974
|
// src/commands/eval/statistics.ts
|
|
826
975
|
var HISTOGRAM_BREAKPOINTS = [0, 0.2, 0.4, 0.6, 0.8, 1];
|
|
827
976
|
function computeMean(values) {
|
|
@@ -872,7 +1021,6 @@ function buildHistogram(values) {
|
|
|
872
1021
|
return bins;
|
|
873
1022
|
}
|
|
874
1023
|
function calculateEvaluationSummary(results) {
|
|
875
|
-
const scores = results.map((result) => result.score);
|
|
876
1024
|
const total = results.length;
|
|
877
1025
|
const errors = results.filter((result) => result.error !== void 0).map((result) => ({ testId: result.testId, error: result.error }));
|
|
878
1026
|
const errorCount = errors.length;
|
|
@@ -888,18 +1036,39 @@ function calculateEvaluationSummary(results) {
|
|
|
888
1036
|
topResults: [],
|
|
889
1037
|
bottomResults: [],
|
|
890
1038
|
errorCount: 0,
|
|
891
|
-
errors: []
|
|
1039
|
+
errors: [],
|
|
1040
|
+
executionErrorCount: 0,
|
|
1041
|
+
qualityFailureCount: 0,
|
|
1042
|
+
passedCount: 0,
|
|
1043
|
+
byFailureStage: {},
|
|
1044
|
+
byFailureReason: {}
|
|
892
1045
|
};
|
|
893
1046
|
}
|
|
894
|
-
const
|
|
895
|
-
const
|
|
896
|
-
const
|
|
897
|
-
const
|
|
898
|
-
const
|
|
899
|
-
const
|
|
900
|
-
const
|
|
1047
|
+
const executionErrors = results.filter((r) => r.executionStatus === "execution_error");
|
|
1048
|
+
const qualityResults = results.filter((r) => r.executionStatus !== "execution_error");
|
|
1049
|
+
const qualityScores = qualityResults.map((r) => r.score);
|
|
1050
|
+
const mean = computeMean(qualityScores);
|
|
1051
|
+
const median = computeMedian(qualityScores);
|
|
1052
|
+
const min = qualityScores.length > 0 ? Math.min(...qualityScores) : 0;
|
|
1053
|
+
const max = qualityScores.length > 0 ? Math.max(...qualityScores) : 0;
|
|
1054
|
+
const standardDeviation = computeStandardDeviation(qualityScores);
|
|
1055
|
+
const histogram = buildHistogram(qualityScores);
|
|
1056
|
+
const sortedResults = [...qualityResults].sort((a, b) => b.score - a.score);
|
|
901
1057
|
const topResults = sortedResults.slice(0, Math.min(3, sortedResults.length));
|
|
902
1058
|
const bottomResults = sortedResults.slice(-Math.min(3, sortedResults.length));
|
|
1059
|
+
const executionErrorCount = executionErrors.length;
|
|
1060
|
+
const qualityFailureCount = results.filter((r) => r.executionStatus === "quality_failure").length;
|
|
1061
|
+
const passedCount = results.filter((r) => r.executionStatus === "ok").length;
|
|
1062
|
+
const byFailureStage = {};
|
|
1063
|
+
const byFailureReason = {};
|
|
1064
|
+
for (const result of executionErrors) {
|
|
1065
|
+
if (result.failureStage) {
|
|
1066
|
+
byFailureStage[result.failureStage] = (byFailureStage[result.failureStage] ?? 0) + 1;
|
|
1067
|
+
}
|
|
1068
|
+
if (result.failureReasonCode) {
|
|
1069
|
+
byFailureReason[result.failureReasonCode] = (byFailureReason[result.failureReasonCode] ?? 0) + 1;
|
|
1070
|
+
}
|
|
1071
|
+
}
|
|
903
1072
|
return {
|
|
904
1073
|
total,
|
|
905
1074
|
mean,
|
|
@@ -911,7 +1080,12 @@ function calculateEvaluationSummary(results) {
|
|
|
911
1080
|
topResults,
|
|
912
1081
|
bottomResults,
|
|
913
1082
|
errorCount,
|
|
914
|
-
errors
|
|
1083
|
+
errors,
|
|
1084
|
+
executionErrorCount,
|
|
1085
|
+
qualityFailureCount,
|
|
1086
|
+
passedCount,
|
|
1087
|
+
byFailureStage,
|
|
1088
|
+
byFailureReason
|
|
915
1089
|
};
|
|
916
1090
|
}
|
|
917
1091
|
function formatScore(value) {
|
|
@@ -924,7 +1098,7 @@ function formatEvaluationSummary(summary) {
|
|
|
924
1098
|
const lines = [];
|
|
925
1099
|
if (summary.errorCount > 0) {
|
|
926
1100
|
lines.push("\n==================================================");
|
|
927
|
-
lines.push("ERRORS");
|
|
1101
|
+
lines.push("EXECUTION ERRORS");
|
|
928
1102
|
lines.push("==================================================");
|
|
929
1103
|
for (const error of summary.errors) {
|
|
930
1104
|
lines.push(`
|
|
@@ -937,11 +1111,21 @@ function formatEvaluationSummary(summary) {
|
|
|
937
1111
|
lines.push("EVALUATION SUMMARY");
|
|
938
1112
|
lines.push("==================================================");
|
|
939
1113
|
lines.push(`Total tests: ${summary.total}`);
|
|
940
|
-
|
|
941
|
-
|
|
942
|
-
lines.push(`
|
|
1114
|
+
lines.push(`Passed: ${summary.passedCount}`);
|
|
1115
|
+
if (summary.qualityFailureCount > 0) {
|
|
1116
|
+
lines.push(`Quality failures: ${summary.qualityFailureCount}`);
|
|
1117
|
+
}
|
|
1118
|
+
if (summary.executionErrorCount > 0) {
|
|
1119
|
+
lines.push(`Execution errors: ${summary.executionErrorCount}`);
|
|
1120
|
+
}
|
|
1121
|
+
if (summary.executionErrorCount > 0) {
|
|
1122
|
+
const qualityCount = summary.total - summary.executionErrorCount;
|
|
1123
|
+
lines.push(
|
|
1124
|
+
`Mean score: ${formatScore(summary.mean)} (${qualityCount} quality tests, ${summary.executionErrorCount} execution errors excluded)`
|
|
1125
|
+
);
|
|
1126
|
+
} else {
|
|
1127
|
+
lines.push(`Mean score: ${formatScore(summary.mean)}`);
|
|
943
1128
|
}
|
|
944
|
-
lines.push(`Mean score: ${formatScore(summary.mean)}`);
|
|
945
1129
|
lines.push(`Median score: ${formatScore(summary.median)}`);
|
|
946
1130
|
lines.push(`Min score: ${formatScore(summary.min)}`);
|
|
947
1131
|
lines.push(`Max score: ${formatScore(summary.max)}`);
|
|
@@ -961,6 +1145,20 @@ function formatEvaluationSummary(summary) {
|
|
|
961
1145
|
summary.bottomResults.forEach((result, index) => {
|
|
962
1146
|
lines.push(` ${index + 1}. ${result.testId}: ${formatScore(result.score)}`);
|
|
963
1147
|
});
|
|
1148
|
+
const failureStageEntries = Object.entries(summary.byFailureStage);
|
|
1149
|
+
if (failureStageEntries.length > 0) {
|
|
1150
|
+
lines.push("\nExecution errors by stage:");
|
|
1151
|
+
for (const [stage, count] of failureStageEntries) {
|
|
1152
|
+
lines.push(` ${stage}: ${count}`);
|
|
1153
|
+
}
|
|
1154
|
+
}
|
|
1155
|
+
const failureReasonEntries = Object.entries(summary.byFailureReason);
|
|
1156
|
+
if (failureReasonEntries.length > 0) {
|
|
1157
|
+
lines.push("\nExecution errors by reason:");
|
|
1158
|
+
for (const [reason, count] of failureReasonEntries) {
|
|
1159
|
+
lines.push(` ${reason}: ${count}`);
|
|
1160
|
+
}
|
|
1161
|
+
}
|
|
964
1162
|
return lines.join("\n");
|
|
965
1163
|
}
|
|
966
1164
|
function formatMatrixSummary(results) {
|
|
@@ -2181,9 +2379,9 @@ async function validateMessagesFileRefs(messages, location, searchRoots, filePat
|
|
|
2181
2379
|
}
|
|
2182
2380
|
|
|
2183
2381
|
// src/commands/eval/targets.ts
|
|
2184
|
-
var
|
|
2185
|
-
var
|
|
2186
|
-
var
|
|
2382
|
+
var ANSI_YELLOW2 = "\x1B[33m";
|
|
2383
|
+
var ANSI_RED2 = "\x1B[31m";
|
|
2384
|
+
var ANSI_RESET2 = "\x1B[0m";
|
|
2187
2385
|
function isTTY() {
|
|
2188
2386
|
return process.stdout.isTTY ?? false;
|
|
2189
2387
|
}
|
|
@@ -2229,8 +2427,8 @@ async function selectTarget(options) {
|
|
|
2229
2427
|
Warnings in ${targetsFilePath}:`);
|
|
2230
2428
|
for (const warning of warnings) {
|
|
2231
2429
|
const location = warning.location ? ` [${warning.location}]` : "";
|
|
2232
|
-
const prefix = useColors ? `${
|
|
2233
|
-
const message = useColors ? `${
|
|
2430
|
+
const prefix = useColors ? `${ANSI_YELLOW2} \u26A0${ANSI_RESET2}` : " \u26A0";
|
|
2431
|
+
const message = useColors ? `${ANSI_YELLOW2}${warning.message}${ANSI_RESET2}` : warning.message;
|
|
2234
2432
|
console.warn(`${prefix}${location} ${message}`);
|
|
2235
2433
|
}
|
|
2236
2434
|
console.warn("");
|
|
@@ -2241,8 +2439,8 @@ Warnings in ${targetsFilePath}:`);
|
|
|
2241
2439
|
Errors in ${targetsFilePath}:`);
|
|
2242
2440
|
for (const error of errors) {
|
|
2243
2441
|
const location = error.location ? ` [${error.location}]` : "";
|
|
2244
|
-
const prefix = useColors ? `${
|
|
2245
|
-
const message = useColors ? `${
|
|
2442
|
+
const prefix = useColors ? `${ANSI_RED2} \u2717${ANSI_RESET2}` : " \u2717";
|
|
2443
|
+
const message = useColors ? `${ANSI_RED2}${error.message}${ANSI_RESET2}` : error.message;
|
|
2246
2444
|
console.error(`${prefix}${location} ${message}`);
|
|
2247
2445
|
}
|
|
2248
2446
|
throw new Error(`Targets file validation failed with ${errors.length} error(s)`);
|
|
@@ -2320,8 +2518,8 @@ async function selectMultipleTargets(options) {
|
|
|
2320
2518
|
Warnings in ${targetsFilePath}:`);
|
|
2321
2519
|
for (const warning of warnings) {
|
|
2322
2520
|
const location = warning.location ? ` [${warning.location}]` : "";
|
|
2323
|
-
const prefix = useColors ? `${
|
|
2324
|
-
const message = useColors ? `${
|
|
2521
|
+
const prefix = useColors ? `${ANSI_YELLOW2} \u26A0${ANSI_RESET2}` : " \u26A0";
|
|
2522
|
+
const message = useColors ? `${ANSI_YELLOW2}${warning.message}${ANSI_RESET2}` : warning.message;
|
|
2325
2523
|
console.warn(`${prefix}${location} ${message}`);
|
|
2326
2524
|
}
|
|
2327
2525
|
console.warn("");
|
|
@@ -2332,8 +2530,8 @@ Warnings in ${targetsFilePath}:`);
|
|
|
2332
2530
|
Errors in ${targetsFilePath}:`);
|
|
2333
2531
|
for (const error of errors) {
|
|
2334
2532
|
const location = error.location ? ` [${error.location}]` : "";
|
|
2335
|
-
const prefix = useColors ? `${
|
|
2336
|
-
const message = useColors ? `${
|
|
2533
|
+
const prefix = useColors ? `${ANSI_RED2} \u2717${ANSI_RESET2}` : " \u2717";
|
|
2534
|
+
const message = useColors ? `${ANSI_RED2}${error.message}${ANSI_RESET2}` : error.message;
|
|
2337
2535
|
console.error(`${prefix}${location} ${message}`);
|
|
2338
2536
|
}
|
|
2339
2537
|
throw new Error(`Targets file validation failed with ${errors.length} error(s)`);
|
|
@@ -2494,7 +2692,8 @@ function normalizeOptions(rawOptions, config, yamlExecution) {
|
|
|
2494
2692
|
exportOtel: normalizeBoolean(rawOptions.exportOtel),
|
|
2495
2693
|
otelBackend: normalizeString(rawOptions.otelBackend),
|
|
2496
2694
|
otelCaptureContent: normalizeBoolean(rawOptions.otelCaptureContent),
|
|
2497
|
-
otelGroupTurns: normalizeBoolean(rawOptions.otelGroupTurns)
|
|
2695
|
+
otelGroupTurns: normalizeBoolean(rawOptions.otelGroupTurns),
|
|
2696
|
+
retryErrors: normalizeString(rawOptions.retryErrors)
|
|
2498
2697
|
};
|
|
2499
2698
|
}
|
|
2500
2699
|
async function ensureFileExists(filePath, description) {
|
|
@@ -2628,7 +2827,8 @@ async function prepareFileMetadata(params) {
|
|
|
2628
2827
|
suiteTargets,
|
|
2629
2828
|
yamlCache: suite.cacheConfig?.enabled,
|
|
2630
2829
|
yamlCachePath: suite.cacheConfig?.cachePath,
|
|
2631
|
-
totalBudgetUsd: suite.totalBudgetUsd
|
|
2830
|
+
totalBudgetUsd: suite.totalBudgetUsd,
|
|
2831
|
+
failOnError: suite.failOnError
|
|
2632
2832
|
};
|
|
2633
2833
|
}
|
|
2634
2834
|
async function runWithLimit(items, limit, task) {
|
|
@@ -2662,7 +2862,8 @@ async function runSingleEvalFile(params) {
|
|
|
2662
2862
|
evalCases,
|
|
2663
2863
|
trialsConfig,
|
|
2664
2864
|
matrixMode,
|
|
2665
|
-
totalBudgetUsd
|
|
2865
|
+
totalBudgetUsd,
|
|
2866
|
+
failOnError
|
|
2666
2867
|
} = params;
|
|
2667
2868
|
const targetName = selection.targetName;
|
|
2668
2869
|
await ensureFileExists(testFilePath, "Test file");
|
|
@@ -2724,6 +2925,7 @@ async function runSingleEvalFile(params) {
|
|
|
2724
2925
|
cleanupWorkspaces: options.cleanupWorkspaces,
|
|
2725
2926
|
trials: trialsConfig,
|
|
2726
2927
|
totalBudgetUsd,
|
|
2928
|
+
failOnError,
|
|
2727
2929
|
streamCallbacks: streamingObserver?.getStreamCallbacks(),
|
|
2728
2930
|
onResult: async (result) => {
|
|
2729
2931
|
streamingObserver?.finalizeEvalCase(result.score, result.error);
|
|
@@ -2777,7 +2979,26 @@ async function runEvalCommand(input) {
|
|
|
2777
2979
|
}
|
|
2778
2980
|
const repoRoot = await findRepoRoot(cwd);
|
|
2779
2981
|
const yamlConfig = await loadConfig(path10.join(cwd, "_"), repoRoot);
|
|
2780
|
-
|
|
2982
|
+
if (yamlConfig?.required_version) {
|
|
2983
|
+
await enforceRequiredVersion(yamlConfig.required_version, {
|
|
2984
|
+
strict: normalizeBoolean(input.rawOptions.strict)
|
|
2985
|
+
});
|
|
2986
|
+
}
|
|
2987
|
+
let options = normalizeOptions(input.rawOptions, config, yamlConfig?.execution);
|
|
2988
|
+
let retryNonErrorResults;
|
|
2989
|
+
if (options.retryErrors) {
|
|
2990
|
+
const retryPath = path10.resolve(options.retryErrors);
|
|
2991
|
+
await ensureFileExists(retryPath, "Retry-errors JSONL file");
|
|
2992
|
+
const errorIds = await loadErrorTestIds(retryPath);
|
|
2993
|
+
if (errorIds.length === 0) {
|
|
2994
|
+
console.log("No execution errors found in the previous output. Nothing to retry.");
|
|
2995
|
+
return;
|
|
2996
|
+
}
|
|
2997
|
+
console.log(`Retrying ${errorIds.length} execution-error test(s): ${errorIds.join(", ")}`);
|
|
2998
|
+
const filterPattern = errorIds.length === 1 ? errorIds[0] : `{${errorIds.join(",")}}`;
|
|
2999
|
+
options = { ...options, filter: filterPattern };
|
|
3000
|
+
retryNonErrorResults = await loadNonErrorResults(retryPath);
|
|
3001
|
+
}
|
|
2781
3002
|
if (options.keepWorkspaces && options.cleanupWorkspaces) {
|
|
2782
3003
|
console.warn(
|
|
2783
3004
|
"Warning: Both --keep-workspaces and --cleanup-workspaces specified. --cleanup-workspaces takes precedence."
|
|
@@ -2790,7 +3011,7 @@ async function runEvalCommand(input) {
|
|
|
2790
3011
|
const useFileExport = !!(options.otelFile || options.traceFile);
|
|
2791
3012
|
if (options.exportOtel || useFileExport) {
|
|
2792
3013
|
try {
|
|
2793
|
-
const { OtelTraceExporter, OTEL_BACKEND_PRESETS } = await import("./dist-
|
|
3014
|
+
const { OtelTraceExporter, OTEL_BACKEND_PRESETS } = await import("./dist-CCUHG3SN.js");
|
|
2794
3015
|
let endpoint = process.env.OTEL_EXPORTER_OTLP_ENDPOINT;
|
|
2795
3016
|
let headers = {};
|
|
2796
3017
|
if (options.otelBackend) {
|
|
@@ -2985,12 +3206,22 @@ async function runEvalCommand(input) {
|
|
|
2985
3206
|
evalCases: applicableEvalCases,
|
|
2986
3207
|
trialsConfig: targetPrep.trialsConfig,
|
|
2987
3208
|
matrixMode: targetPrep.selections.length > 1,
|
|
2988
|
-
totalBudgetUsd: targetPrep.totalBudgetUsd
|
|
3209
|
+
totalBudgetUsd: targetPrep.totalBudgetUsd,
|
|
3210
|
+
failOnError: targetPrep.failOnError
|
|
2989
3211
|
});
|
|
2990
3212
|
allResults.push(...result.results);
|
|
2991
3213
|
}
|
|
2992
3214
|
});
|
|
2993
3215
|
progressReporter.finish();
|
|
3216
|
+
if (retryNonErrorResults && retryNonErrorResults.length > 0) {
|
|
3217
|
+
for (const preserved of retryNonErrorResults) {
|
|
3218
|
+
await outputWriter.append(preserved);
|
|
3219
|
+
}
|
|
3220
|
+
allResults.push(...retryNonErrorResults);
|
|
3221
|
+
console.log(
|
|
3222
|
+
`Merged ${retryNonErrorResults.length} non-error result(s) from previous output.`
|
|
3223
|
+
);
|
|
3224
|
+
}
|
|
2994
3225
|
const summary = calculateEvaluationSummary(allResults);
|
|
2995
3226
|
console.log(formatEvaluationSummary(summary));
|
|
2996
3227
|
if (isMatrixMode && allResults.length > 0) {
|
|
@@ -3048,6 +3279,7 @@ async function resolveEvaluationRunner() {
|
|
|
3048
3279
|
}
|
|
3049
3280
|
|
|
3050
3281
|
export {
|
|
3282
|
+
package_default,
|
|
3051
3283
|
toSnakeCaseDeep,
|
|
3052
3284
|
resolveEvalPaths,
|
|
3053
3285
|
findRepoRoot,
|
|
@@ -3061,4 +3293,4 @@ export {
|
|
|
3061
3293
|
selectTarget,
|
|
3062
3294
|
runEvalCommand
|
|
3063
3295
|
};
|
|
3064
|
-
//# sourceMappingURL=chunk-
|
|
3296
|
+
//# sourceMappingURL=chunk-UWDI4UVN.js.map
|