agentv 2.12.0 → 2.13.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/dist/{chunk-LUHCYBMD.js → chunk-FSBZM3HT.js} +66 -5
- package/dist/chunk-FSBZM3HT.js.map +1 -0
- package/dist/{chunk-6KU2ZUFJ.js → chunk-M6JYP6A6.js} +17 -55
- package/dist/chunk-M6JYP6A6.js.map +1 -0
- package/dist/{chunk-YBJX5CP6.js → chunk-UWDI4UVN.js} +202 -19
- package/dist/chunk-UWDI4UVN.js.map +1 -0
- package/dist/cli.js +3 -3
- package/dist/{dist-OPPA4P5R.js → dist-CCUHG3SN.js} +4 -2
- package/dist/index.js +3 -3
- package/dist/{interactive-TOUKPSHP.js → interactive-P3D5O673.js} +3 -3
- package/package.json +4 -2
- package/dist/chunk-6KU2ZUFJ.js.map +0 -1
- package/dist/chunk-LUHCYBMD.js.map +0 -1
- package/dist/chunk-YBJX5CP6.js.map +0 -1
- /package/dist/{dist-OPPA4P5R.js.map → dist-CCUHG3SN.js.map} +0 -0
- /package/dist/{interactive-TOUKPSHP.js.map → interactive-P3D5O673.js.map} +0 -0
|
@@ -25,7 +25,59 @@ import {
|
|
|
25
25
|
subscribeToCopilotCliLogEntries,
|
|
26
26
|
subscribeToCopilotSdkLogEntries,
|
|
27
27
|
subscribeToPiLogEntries
|
|
28
|
-
} from "./chunk-LUHCYBMD.js";
|
|
28
|
+
} from "./chunk-FSBZM3HT.js";
|
|
29
|
+
|
|
30
|
+
// package.json
|
|
31
|
+
var package_default = {
|
|
32
|
+
name: "agentv",
|
|
33
|
+
version: "2.13.0",
|
|
34
|
+
description: "CLI entry point for AgentV",
|
|
35
|
+
type: "module",
|
|
36
|
+
repository: {
|
|
37
|
+
type: "git",
|
|
38
|
+
url: "https://github.com/EntityProcess/agentv.git"
|
|
39
|
+
},
|
|
40
|
+
homepage: "https://github.com/EntityProcess/agentv#readme",
|
|
41
|
+
bugs: {
|
|
42
|
+
url: "https://github.com/EntityProcess/agentv/issues"
|
|
43
|
+
},
|
|
44
|
+
bin: {
|
|
45
|
+
agentv: "./dist/cli.js"
|
|
46
|
+
},
|
|
47
|
+
files: ["dist", "README.md"],
|
|
48
|
+
scripts: {
|
|
49
|
+
dev: "bun src/cli.ts",
|
|
50
|
+
build: "tsup && bun run copy-readme",
|
|
51
|
+
"copy-readme": `bun -e "import { cpSync } from 'fs'; cpSync('../../README.md', 'README.md')"`,
|
|
52
|
+
prepublishOnly: "bun run copy-readme",
|
|
53
|
+
typecheck: "tsc --noEmit",
|
|
54
|
+
lint: "biome check .",
|
|
55
|
+
format: "biome format --write .",
|
|
56
|
+
fix: "biome check --write .",
|
|
57
|
+
test: "bun test",
|
|
58
|
+
"test:watch": "bun test --watch"
|
|
59
|
+
},
|
|
60
|
+
dependencies: {
|
|
61
|
+
"@anthropic-ai/claude-agent-sdk": "^0.2.49",
|
|
62
|
+
"@github/copilot-sdk": "^0.1.25",
|
|
63
|
+
"@inquirer/prompts": "^8.2.1",
|
|
64
|
+
"@mariozechner/pi-agent-core": "^0.54.2",
|
|
65
|
+
"@mariozechner/pi-ai": "^0.54.2",
|
|
66
|
+
"@openai/codex-sdk": "^0.104.0",
|
|
67
|
+
"cmd-ts": "^0.14.3",
|
|
68
|
+
dotenv: "^16.4.5",
|
|
69
|
+
"fast-glob": "^3.3.3",
|
|
70
|
+
json5: "^2.2.3",
|
|
71
|
+
micromatch: "^4.0.8",
|
|
72
|
+
semver: "^7.7.4",
|
|
73
|
+
yaml: "^2.6.1"
|
|
74
|
+
},
|
|
75
|
+
devDependencies: {
|
|
76
|
+
"@agentv/core": "workspace:*",
|
|
77
|
+
"@types/semver": "^7.7.1",
|
|
78
|
+
execa: "^9.3.0"
|
|
79
|
+
}
|
|
80
|
+
};
|
|
29
81
|
|
|
30
82
|
// src/commands/eval/shared.ts
|
|
31
83
|
import { constants } from "node:fs";
|
|
@@ -152,6 +204,60 @@ import { access as access4 } from "node:fs/promises";
|
|
|
152
204
|
import path10 from "node:path";
|
|
153
205
|
import { pathToFileURL } from "node:url";
|
|
154
206
|
|
|
207
|
+
// src/version-check.ts
|
|
208
|
+
import { satisfies, validRange } from "semver";
|
|
209
|
+
var ANSI_YELLOW = "\x1B[33m";
|
|
210
|
+
var ANSI_RED = "\x1B[31m";
|
|
211
|
+
var ANSI_RESET = "\x1B[0m";
|
|
212
|
+
/**
 * Compares the installed agentv version with the semver range a project requires.
 *
 * @param {string} requiredVersion - Semver range taken from `required_version`
 *   in .agentv/config.yaml (for example ">=2.11.0").
 * @returns {{ satisfied: boolean, currentVersion: string, requiredRange: string }}
 *   Whether the installed version satisfies the range, plus both values for reporting.
 * @throws {Error} If `requiredVersion` is not a string, is blank, or is not a
 *   valid semver range.
 */
function checkVersion(requiredVersion) {
  // A config loader may hand back a non-string (e.g. an unquoted `2` becomes a
  // number). Calling .trim() on that would raise an opaque TypeError, so the
  // type is checked first and reported through the same user-facing message.
  // The blank check stays explicit because it preceded validRange() originally.
  const isUsableRange =
    typeof requiredVersion === "string" &&
    requiredVersion.trim() !== "" &&
    Boolean(validRange(requiredVersion));
  if (!isUsableRange) {
    throw new Error(
      `Invalid required_version "${requiredVersion}" in .agentv/config.yaml. Must be a valid semver range (e.g., ">=2.11.0", "^2.11.0").`
    );
  }
  const currentVersion = package_default.version;
  return {
    satisfied: satisfies(currentVersion, requiredVersion),
    currentVersion,
    requiredRange: requiredVersion
  };
}
|
|
225
|
+
/**
 * Enforces the project's required agentv version before a command proceeds.
 *
 * - An invalid range prints the error in red and exits with code 1.
 * - A satisfied range returns silently.
 * - An unsatisfied range exits with code 1 when `options.strict` is set;
 *   otherwise it asks for confirmation when both stdin and stdout are TTYs,
 *   or writes the warning to stderr and continues when they are not.
 *
 * @param {string} requiredVersion - Semver range passed to `checkVersion`.
 * @param {{ strict?: boolean }} [options] - `strict` turns a mismatch into a hard failure.
 * @returns {Promise<void>}
 */
async function enforceRequiredVersion(requiredVersion, options) {
  let check;
  try {
    check = checkVersion(requiredVersion);
  } catch (err) {
    console.error(`${ANSI_RED}Error: ${err.message}${ANSI_RESET}`);
    process.exit(1);
  }
  if (check.satisfied) {
    return;
  }

  const headline = `${ANSI_YELLOW}Warning: This project requires agentv ${check.requiredRange} but you have ${check.currentVersion}.${ANSI_RESET}`;
  const warning = `${headline}\nRun \`agentv self update\` to upgrade.`;

  if (options?.strict) {
    console.error(warning);
    console.error(
      `${ANSI_RED}Aborting: --strict mode requires the installed version to satisfy the required range.${ANSI_RESET}`
    );
    process.exit(1);
  }

  const isInteractive = process.stdin.isTTY && process.stdout.isTTY;
  if (!isInteractive) {
    // Nobody can answer a prompt here: surface the warning and carry on.
    process.stderr.write(`${warning}\n`);
    return;
  }

  console.warn(warning);
  const shouldContinue = await promptContinue();
  if (!shouldContinue) {
    process.exit(1);
  }
}
|
|
256
|
+
/**
 * Asks the user whether to continue despite the version mismatch.
 * The prompt library is imported lazily, only when the question is asked.
 *
 * @returns {Promise<boolean>} The user's answer; the prompt defaults to "no".
 */
async function promptContinue() {
  const { confirm: askYesNo } = await import("@inquirer/prompts");
  const question = { message: "Continue anyway?", default: false };
  return askYesNo(question);
}
|
|
260
|
+
|
|
155
261
|
// src/commands/eval/env.ts
|
|
156
262
|
import { constants as constants3 } from "node:fs";
|
|
157
263
|
import { access as access3 } from "node:fs/promises";
|
|
@@ -822,6 +928,49 @@ var ProgressDisplay = class {
|
|
|
822
928
|
}
|
|
823
929
|
};
|
|
824
930
|
|
|
931
|
+
// src/commands/eval/retry-errors.ts
|
|
932
|
+
import { createReadStream } from "node:fs";
|
|
933
|
+
import { createInterface } from "node:readline";
|
|
934
|
+
/**
 * Streams a JSONL file line by line and yields each line that parses to a
 * JSON object. Blank lines, lines that are not valid JSON, and JSON values
 * that are null or primitives are skipped (best-effort reading).
 *
 * @param {string} jsonlPath - Path of the JSONL file to read.
 * @returns {AsyncGenerator<object>} Parsed records in file order.
 */
async function* readRetryJsonlRecords(jsonlPath) {
  const rl = createInterface({
    input: createReadStream(jsonlPath),
    // Treat "\r\n" as a single line break regardless of chunk timing.
    crlfDelay: Number.POSITIVE_INFINITY
  });
  try {
    for await (const line of rl) {
      const trimmed = line.trim();
      if (!trimmed) continue;
      let parsed;
      try {
        parsed = JSON.parse(trimmed);
      } catch {
        // Malformed line: deliberately ignored so one bad row does not
        // invalidate the rest of the file.
        continue;
      }
      // `null` and primitives carry no fields; skip them explicitly instead
      // of depending on a property access throwing.
      if (parsed === null || typeof parsed !== "object") continue;
      yield parsed;
    }
  } finally {
    rl.close();
  }
}

/**
 * Collects the ids of tests whose recorded result is an execution error.
 *
 * @param {string} jsonlPath - Path of a previous run's JSONL output.
 * @returns {Promise<Array>} Unique `testId` values, in first-seen order, of
 *   records with `executionStatus === "execution_error"` and a truthy `testId`.
 */
async function loadErrorTestIds(jsonlPath) {
  const ids = new Set();
  for await (const record of readRetryJsonlRecords(jsonlPath)) {
    if (record.executionStatus === "execution_error" && record.testId) {
      ids.add(record.testId);
    }
  }
  return [...ids];
}

/**
 * Collects the previous results that are not execution errors.
 *
 * @param {string} jsonlPath - Path of a previous run's JSONL output.
 * @returns {Promise<object[]>} Records, in file order, that have a truthy
 *   `testId`, a defined `score`, and an `executionStatus` other than
 *   "execution_error".
 */
async function loadNonErrorResults(jsonlPath) {
  const results = [];
  for await (const record of readRetryJsonlRecords(jsonlPath)) {
    if (!record.testId || record.score === void 0) continue;
    if (record.executionStatus !== "execution_error") {
      results.push(record);
    }
  }
  return results;
}
|
|
973
|
+
|
|
825
974
|
// src/commands/eval/statistics.ts
|
|
826
975
|
var HISTOGRAM_BREAKPOINTS = [0, 0.2, 0.4, 0.6, 0.8, 1];
|
|
827
976
|
function computeMean(values) {
|
|
@@ -2230,9 +2379,9 @@ async function validateMessagesFileRefs(messages, location, searchRoots, filePat
|
|
|
2230
2379
|
}
|
|
2231
2380
|
|
|
2232
2381
|
// src/commands/eval/targets.ts
|
|
2233
|
-
var
|
|
2234
|
-
var
|
|
2235
|
-
var
|
|
2382
|
+
var ANSI_YELLOW2 = "\x1B[33m";
|
|
2383
|
+
var ANSI_RED2 = "\x1B[31m";
|
|
2384
|
+
var ANSI_RESET2 = "\x1B[0m";
|
|
2236
2385
|
function isTTY() {
|
|
2237
2386
|
return process.stdout.isTTY ?? false;
|
|
2238
2387
|
}
|
|
@@ -2278,8 +2427,8 @@ async function selectTarget(options) {
|
|
|
2278
2427
|
Warnings in ${targetsFilePath}:`);
|
|
2279
2428
|
for (const warning of warnings) {
|
|
2280
2429
|
const location = warning.location ? ` [${warning.location}]` : "";
|
|
2281
|
-
const prefix = useColors ? `${
|
|
2282
|
-
const message = useColors ? `${
|
|
2430
|
+
const prefix = useColors ? `${ANSI_YELLOW2} \u26A0${ANSI_RESET2}` : " \u26A0";
|
|
2431
|
+
const message = useColors ? `${ANSI_YELLOW2}${warning.message}${ANSI_RESET2}` : warning.message;
|
|
2283
2432
|
console.warn(`${prefix}${location} ${message}`);
|
|
2284
2433
|
}
|
|
2285
2434
|
console.warn("");
|
|
@@ -2290,8 +2439,8 @@ Warnings in ${targetsFilePath}:`);
|
|
|
2290
2439
|
Errors in ${targetsFilePath}:`);
|
|
2291
2440
|
for (const error of errors) {
|
|
2292
2441
|
const location = error.location ? ` [${error.location}]` : "";
|
|
2293
|
-
const prefix = useColors ? `${
|
|
2294
|
-
const message = useColors ? `${
|
|
2442
|
+
const prefix = useColors ? `${ANSI_RED2} \u2717${ANSI_RESET2}` : " \u2717";
|
|
2443
|
+
const message = useColors ? `${ANSI_RED2}${error.message}${ANSI_RESET2}` : error.message;
|
|
2295
2444
|
console.error(`${prefix}${location} ${message}`);
|
|
2296
2445
|
}
|
|
2297
2446
|
throw new Error(`Targets file validation failed with ${errors.length} error(s)`);
|
|
@@ -2369,8 +2518,8 @@ async function selectMultipleTargets(options) {
|
|
|
2369
2518
|
Warnings in ${targetsFilePath}:`);
|
|
2370
2519
|
for (const warning of warnings) {
|
|
2371
2520
|
const location = warning.location ? ` [${warning.location}]` : "";
|
|
2372
|
-
const prefix = useColors ? `${
|
|
2373
|
-
const message = useColors ? `${
|
|
2521
|
+
const prefix = useColors ? `${ANSI_YELLOW2} \u26A0${ANSI_RESET2}` : " \u26A0";
|
|
2522
|
+
const message = useColors ? `${ANSI_YELLOW2}${warning.message}${ANSI_RESET2}` : warning.message;
|
|
2374
2523
|
console.warn(`${prefix}${location} ${message}`);
|
|
2375
2524
|
}
|
|
2376
2525
|
console.warn("");
|
|
@@ -2381,8 +2530,8 @@ Warnings in ${targetsFilePath}:`);
|
|
|
2381
2530
|
Errors in ${targetsFilePath}:`);
|
|
2382
2531
|
for (const error of errors) {
|
|
2383
2532
|
const location = error.location ? ` [${error.location}]` : "";
|
|
2384
|
-
const prefix = useColors ? `${
|
|
2385
|
-
const message = useColors ? `${
|
|
2533
|
+
const prefix = useColors ? `${ANSI_RED2} \u2717${ANSI_RESET2}` : " \u2717";
|
|
2534
|
+
const message = useColors ? `${ANSI_RED2}${error.message}${ANSI_RESET2}` : error.message;
|
|
2386
2535
|
console.error(`${prefix}${location} ${message}`);
|
|
2387
2536
|
}
|
|
2388
2537
|
throw new Error(`Targets file validation failed with ${errors.length} error(s)`);
|
|
@@ -2543,7 +2692,8 @@ function normalizeOptions(rawOptions, config, yamlExecution) {
|
|
|
2543
2692
|
exportOtel: normalizeBoolean(rawOptions.exportOtel),
|
|
2544
2693
|
otelBackend: normalizeString(rawOptions.otelBackend),
|
|
2545
2694
|
otelCaptureContent: normalizeBoolean(rawOptions.otelCaptureContent),
|
|
2546
|
-
otelGroupTurns: normalizeBoolean(rawOptions.otelGroupTurns)
|
|
2695
|
+
otelGroupTurns: normalizeBoolean(rawOptions.otelGroupTurns),
|
|
2696
|
+
retryErrors: normalizeString(rawOptions.retryErrors)
|
|
2547
2697
|
};
|
|
2548
2698
|
}
|
|
2549
2699
|
async function ensureFileExists(filePath, description) {
|
|
@@ -2677,7 +2827,8 @@ async function prepareFileMetadata(params) {
|
|
|
2677
2827
|
suiteTargets,
|
|
2678
2828
|
yamlCache: suite.cacheConfig?.enabled,
|
|
2679
2829
|
yamlCachePath: suite.cacheConfig?.cachePath,
|
|
2680
|
-
totalBudgetUsd: suite.totalBudgetUsd
|
|
2830
|
+
totalBudgetUsd: suite.totalBudgetUsd,
|
|
2831
|
+
failOnError: suite.failOnError
|
|
2681
2832
|
};
|
|
2682
2833
|
}
|
|
2683
2834
|
async function runWithLimit(items, limit, task) {
|
|
@@ -2711,7 +2862,8 @@ async function runSingleEvalFile(params) {
|
|
|
2711
2862
|
evalCases,
|
|
2712
2863
|
trialsConfig,
|
|
2713
2864
|
matrixMode,
|
|
2714
|
-
totalBudgetUsd
|
|
2865
|
+
totalBudgetUsd,
|
|
2866
|
+
failOnError
|
|
2715
2867
|
} = params;
|
|
2716
2868
|
const targetName = selection.targetName;
|
|
2717
2869
|
await ensureFileExists(testFilePath, "Test file");
|
|
@@ -2773,6 +2925,7 @@ async function runSingleEvalFile(params) {
|
|
|
2773
2925
|
cleanupWorkspaces: options.cleanupWorkspaces,
|
|
2774
2926
|
trials: trialsConfig,
|
|
2775
2927
|
totalBudgetUsd,
|
|
2928
|
+
failOnError,
|
|
2776
2929
|
streamCallbacks: streamingObserver?.getStreamCallbacks(),
|
|
2777
2930
|
onResult: async (result) => {
|
|
2778
2931
|
streamingObserver?.finalizeEvalCase(result.score, result.error);
|
|
@@ -2826,7 +2979,26 @@ async function runEvalCommand(input) {
|
|
|
2826
2979
|
}
|
|
2827
2980
|
const repoRoot = await findRepoRoot(cwd);
|
|
2828
2981
|
const yamlConfig = await loadConfig(path10.join(cwd, "_"), repoRoot);
|
|
2829
|
-
|
|
2982
|
+
if (yamlConfig?.required_version) {
|
|
2983
|
+
await enforceRequiredVersion(yamlConfig.required_version, {
|
|
2984
|
+
strict: normalizeBoolean(input.rawOptions.strict)
|
|
2985
|
+
});
|
|
2986
|
+
}
|
|
2987
|
+
let options = normalizeOptions(input.rawOptions, config, yamlConfig?.execution);
|
|
2988
|
+
let retryNonErrorResults;
|
|
2989
|
+
if (options.retryErrors) {
|
|
2990
|
+
const retryPath = path10.resolve(options.retryErrors);
|
|
2991
|
+
await ensureFileExists(retryPath, "Retry-errors JSONL file");
|
|
2992
|
+
const errorIds = await loadErrorTestIds(retryPath);
|
|
2993
|
+
if (errorIds.length === 0) {
|
|
2994
|
+
console.log("No execution errors found in the previous output. Nothing to retry.");
|
|
2995
|
+
return;
|
|
2996
|
+
}
|
|
2997
|
+
console.log(`Retrying ${errorIds.length} execution-error test(s): ${errorIds.join(", ")}`);
|
|
2998
|
+
const filterPattern = errorIds.length === 1 ? errorIds[0] : `{${errorIds.join(",")}}`;
|
|
2999
|
+
options = { ...options, filter: filterPattern };
|
|
3000
|
+
retryNonErrorResults = await loadNonErrorResults(retryPath);
|
|
3001
|
+
}
|
|
2830
3002
|
if (options.keepWorkspaces && options.cleanupWorkspaces) {
|
|
2831
3003
|
console.warn(
|
|
2832
3004
|
"Warning: Both --keep-workspaces and --cleanup-workspaces specified. --cleanup-workspaces takes precedence."
|
|
@@ -2839,7 +3011,7 @@ async function runEvalCommand(input) {
|
|
|
2839
3011
|
const useFileExport = !!(options.otelFile || options.traceFile);
|
|
2840
3012
|
if (options.exportOtel || useFileExport) {
|
|
2841
3013
|
try {
|
|
2842
|
-
const { OtelTraceExporter, OTEL_BACKEND_PRESETS } = await import("./dist-OPPA4P5R.js");
|
|
3014
|
+
const { OtelTraceExporter, OTEL_BACKEND_PRESETS } = await import("./dist-CCUHG3SN.js");
|
|
2843
3015
|
let endpoint = process.env.OTEL_EXPORTER_OTLP_ENDPOINT;
|
|
2844
3016
|
let headers = {};
|
|
2845
3017
|
if (options.otelBackend) {
|
|
@@ -3034,12 +3206,22 @@ async function runEvalCommand(input) {
|
|
|
3034
3206
|
evalCases: applicableEvalCases,
|
|
3035
3207
|
trialsConfig: targetPrep.trialsConfig,
|
|
3036
3208
|
matrixMode: targetPrep.selections.length > 1,
|
|
3037
|
-
totalBudgetUsd: targetPrep.totalBudgetUsd
|
|
3209
|
+
totalBudgetUsd: targetPrep.totalBudgetUsd,
|
|
3210
|
+
failOnError: targetPrep.failOnError
|
|
3038
3211
|
});
|
|
3039
3212
|
allResults.push(...result.results);
|
|
3040
3213
|
}
|
|
3041
3214
|
});
|
|
3042
3215
|
progressReporter.finish();
|
|
3216
|
+
if (retryNonErrorResults && retryNonErrorResults.length > 0) {
|
|
3217
|
+
for (const preserved of retryNonErrorResults) {
|
|
3218
|
+
await outputWriter.append(preserved);
|
|
3219
|
+
}
|
|
3220
|
+
allResults.push(...retryNonErrorResults);
|
|
3221
|
+
console.log(
|
|
3222
|
+
`Merged ${retryNonErrorResults.length} non-error result(s) from previous output.`
|
|
3223
|
+
);
|
|
3224
|
+
}
|
|
3043
3225
|
const summary = calculateEvaluationSummary(allResults);
|
|
3044
3226
|
console.log(formatEvaluationSummary(summary));
|
|
3045
3227
|
if (isMatrixMode && allResults.length > 0) {
|
|
@@ -3097,6 +3279,7 @@ async function resolveEvaluationRunner() {
|
|
|
3097
3279
|
}
|
|
3098
3280
|
|
|
3099
3281
|
export {
|
|
3282
|
+
package_default,
|
|
3100
3283
|
toSnakeCaseDeep,
|
|
3101
3284
|
resolveEvalPaths,
|
|
3102
3285
|
findRepoRoot,
|
|
@@ -3110,4 +3293,4 @@ export {
|
|
|
3110
3293
|
selectTarget,
|
|
3111
3294
|
runEvalCommand
|
|
3112
3295
|
};
|
|
3113
|
-
//# sourceMappingURL=chunk-YBJX5CP6.js.map
|
|
3296
|
+
//# sourceMappingURL=chunk-UWDI4UVN.js.map
|