agentv 2.11.4 → 2.13.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -25,7 +25,59 @@ import {
25
25
  subscribeToCopilotCliLogEntries,
26
26
  subscribeToCopilotSdkLogEntries,
27
27
  subscribeToPiLogEntries
28
- } from "./chunk-KWUTY5XR.js";
28
+ } from "./chunk-FSBZM3HT.js";
29
+
30
+ // package.json
31
+ var package_default = {
32
+ name: "agentv",
33
+ version: "2.13.0",
34
+ description: "CLI entry point for AgentV",
35
+ type: "module",
36
+ repository: {
37
+ type: "git",
38
+ url: "https://github.com/EntityProcess/agentv.git"
39
+ },
40
+ homepage: "https://github.com/EntityProcess/agentv#readme",
41
+ bugs: {
42
+ url: "https://github.com/EntityProcess/agentv/issues"
43
+ },
44
+ bin: {
45
+ agentv: "./dist/cli.js"
46
+ },
47
+ files: ["dist", "README.md"],
48
+ scripts: {
49
+ dev: "bun src/cli.ts",
50
+ build: "tsup && bun run copy-readme",
51
+ "copy-readme": `bun -e "import { cpSync } from 'fs'; cpSync('../../README.md', 'README.md')"`,
52
+ prepublishOnly: "bun run copy-readme",
53
+ typecheck: "tsc --noEmit",
54
+ lint: "biome check .",
55
+ format: "biome format --write .",
56
+ fix: "biome check --write .",
57
+ test: "bun test",
58
+ "test:watch": "bun test --watch"
59
+ },
60
+ dependencies: {
61
+ "@anthropic-ai/claude-agent-sdk": "^0.2.49",
62
+ "@github/copilot-sdk": "^0.1.25",
63
+ "@inquirer/prompts": "^8.2.1",
64
+ "@mariozechner/pi-agent-core": "^0.54.2",
65
+ "@mariozechner/pi-ai": "^0.54.2",
66
+ "@openai/codex-sdk": "^0.104.0",
67
+ "cmd-ts": "^0.14.3",
68
+ dotenv: "^16.4.5",
69
+ "fast-glob": "^3.3.3",
70
+ json5: "^2.2.3",
71
+ micromatch: "^4.0.8",
72
+ semver: "^7.7.4",
73
+ yaml: "^2.6.1"
74
+ },
75
+ devDependencies: {
76
+ "@agentv/core": "workspace:*",
77
+ "@types/semver": "^7.7.1",
78
+ execa: "^9.3.0"
79
+ }
80
+ };
29
81
 
30
82
  // src/commands/eval/shared.ts
31
83
  import { constants } from "node:fs";
@@ -152,6 +204,60 @@ import { access as access4 } from "node:fs/promises";
152
204
  import path10 from "node:path";
153
205
  import { pathToFileURL } from "node:url";
154
206
 
207
+ // src/version-check.ts
208
+ import { satisfies, validRange } from "semver";
209
+ var ANSI_YELLOW = "\x1B[33m";
210
+ var ANSI_RED = "\x1B[31m";
211
+ var ANSI_RESET = "\x1B[0m";
212
+ function checkVersion(requiredVersion) {
213
+ const currentVersion = package_default.version;
214
+ if (!requiredVersion.trim() || !validRange(requiredVersion)) {
215
+ throw new Error(
216
+ `Invalid required_version "${requiredVersion}" in .agentv/config.yaml. Must be a valid semver range (e.g., ">=2.11.0", "^2.11.0").`
217
+ );
218
+ }
219
+ return {
220
+ satisfied: satisfies(currentVersion, requiredVersion),
221
+ currentVersion,
222
+ requiredRange: requiredVersion
223
+ };
224
+ }
225
+ async function enforceRequiredVersion(requiredVersion, options) {
226
+ let result;
227
+ try {
228
+ result = checkVersion(requiredVersion);
229
+ } catch (err) {
230
+ console.error(`${ANSI_RED}Error: ${err.message}${ANSI_RESET}`);
231
+ process.exit(1);
232
+ }
233
+ if (result.satisfied) {
234
+ return;
235
+ }
236
+ const warning = `${ANSI_YELLOW}Warning: This project requires agentv ${result.requiredRange} but you have ${result.currentVersion}.${ANSI_RESET}
237
+ Run \`agentv self update\` to upgrade.`;
238
+ if (options?.strict) {
239
+ console.error(warning);
240
+ console.error(
241
+ `${ANSI_RED}Aborting: --strict mode requires the installed version to satisfy the required range.${ANSI_RESET}`
242
+ );
243
+ process.exit(1);
244
+ }
245
+ if (process.stdin.isTTY && process.stdout.isTTY) {
246
+ console.warn(warning);
247
+ const shouldContinue = await promptContinue();
248
+ if (!shouldContinue) {
249
+ process.exit(1);
250
+ }
251
+ } else {
252
+ process.stderr.write(`${warning}
253
+ `);
254
+ }
255
+ }
256
+ async function promptContinue() {
257
+ const { confirm } = await import("@inquirer/prompts");
258
+ return confirm({ message: "Continue anyway?", default: false });
259
+ }
260
+
155
261
  // src/commands/eval/env.ts
156
262
  import { constants as constants3 } from "node:fs";
157
263
  import { access as access3 } from "node:fs/promises";
@@ -822,6 +928,49 @@ var ProgressDisplay = class {
822
928
  }
823
929
  };
824
930
 
931
+ // src/commands/eval/retry-errors.ts
932
+ import { createReadStream } from "node:fs";
933
+ import { createInterface } from "node:readline";
934
+ async function loadErrorTestIds(jsonlPath) {
935
+ const ids = [];
936
+ const rl = createInterface({
937
+ input: createReadStream(jsonlPath),
938
+ crlfDelay: Number.POSITIVE_INFINITY
939
+ });
940
+ for await (const line of rl) {
941
+ const trimmed = line.trim();
942
+ if (!trimmed) continue;
943
+ try {
944
+ const parsed = JSON.parse(trimmed);
945
+ if (parsed.executionStatus === "execution_error" && parsed.testId) {
946
+ ids.push(parsed.testId);
947
+ }
948
+ } catch {
949
+ }
950
+ }
951
+ return [...new Set(ids)];
952
+ }
953
+ async function loadNonErrorResults(jsonlPath) {
954
+ const results = [];
955
+ const rl = createInterface({
956
+ input: createReadStream(jsonlPath),
957
+ crlfDelay: Number.POSITIVE_INFINITY
958
+ });
959
+ for await (const line of rl) {
960
+ const trimmed = line.trim();
961
+ if (!trimmed) continue;
962
+ try {
963
+ const parsed = JSON.parse(trimmed);
964
+ if (!parsed.testId || parsed.score === void 0) continue;
965
+ if (parsed.executionStatus !== "execution_error") {
966
+ results.push(parsed);
967
+ }
968
+ } catch {
969
+ }
970
+ }
971
+ return results;
972
+ }
973
+
825
974
  // src/commands/eval/statistics.ts
826
975
  var HISTOGRAM_BREAKPOINTS = [0, 0.2, 0.4, 0.6, 0.8, 1];
827
976
  function computeMean(values) {
@@ -872,7 +1021,6 @@ function buildHistogram(values) {
872
1021
  return bins;
873
1022
  }
874
1023
  function calculateEvaluationSummary(results) {
875
- const scores = results.map((result) => result.score);
876
1024
  const total = results.length;
877
1025
  const errors = results.filter((result) => result.error !== void 0).map((result) => ({ testId: result.testId, error: result.error }));
878
1026
  const errorCount = errors.length;
@@ -888,18 +1036,39 @@ function calculateEvaluationSummary(results) {
888
1036
  topResults: [],
889
1037
  bottomResults: [],
890
1038
  errorCount: 0,
891
- errors: []
1039
+ errors: [],
1040
+ executionErrorCount: 0,
1041
+ qualityFailureCount: 0,
1042
+ passedCount: 0,
1043
+ byFailureStage: {},
1044
+ byFailureReason: {}
892
1045
  };
893
1046
  }
894
- const mean = computeMean(scores);
895
- const median = computeMedian(scores);
896
- const min = Math.min(...scores);
897
- const max = Math.max(...scores);
898
- const standardDeviation = computeStandardDeviation(scores);
899
- const histogram = buildHistogram(scores);
900
- const sortedResults = [...results].sort((a, b) => b.score - a.score);
1047
+ const executionErrors = results.filter((r) => r.executionStatus === "execution_error");
1048
+ const qualityResults = results.filter((r) => r.executionStatus !== "execution_error");
1049
+ const qualityScores = qualityResults.map((r) => r.score);
1050
+ const mean = computeMean(qualityScores);
1051
+ const median = computeMedian(qualityScores);
1052
+ const min = qualityScores.length > 0 ? Math.min(...qualityScores) : 0;
1053
+ const max = qualityScores.length > 0 ? Math.max(...qualityScores) : 0;
1054
+ const standardDeviation = computeStandardDeviation(qualityScores);
1055
+ const histogram = buildHistogram(qualityScores);
1056
+ const sortedResults = [...qualityResults].sort((a, b) => b.score - a.score);
901
1057
  const topResults = sortedResults.slice(0, Math.min(3, sortedResults.length));
902
1058
  const bottomResults = sortedResults.slice(-Math.min(3, sortedResults.length));
1059
+ const executionErrorCount = executionErrors.length;
1060
+ const qualityFailureCount = results.filter((r) => r.executionStatus === "quality_failure").length;
1061
+ const passedCount = results.filter((r) => r.executionStatus === "ok").length;
1062
+ const byFailureStage = {};
1063
+ const byFailureReason = {};
1064
+ for (const result of executionErrors) {
1065
+ if (result.failureStage) {
1066
+ byFailureStage[result.failureStage] = (byFailureStage[result.failureStage] ?? 0) + 1;
1067
+ }
1068
+ if (result.failureReasonCode) {
1069
+ byFailureReason[result.failureReasonCode] = (byFailureReason[result.failureReasonCode] ?? 0) + 1;
1070
+ }
1071
+ }
903
1072
  return {
904
1073
  total,
905
1074
  mean,
@@ -911,7 +1080,12 @@ function calculateEvaluationSummary(results) {
911
1080
  topResults,
912
1081
  bottomResults,
913
1082
  errorCount,
914
- errors
1083
+ errors,
1084
+ executionErrorCount,
1085
+ qualityFailureCount,
1086
+ passedCount,
1087
+ byFailureStage,
1088
+ byFailureReason
915
1089
  };
916
1090
  }
917
1091
  function formatScore(value) {
@@ -924,7 +1098,7 @@ function formatEvaluationSummary(summary) {
924
1098
  const lines = [];
925
1099
  if (summary.errorCount > 0) {
926
1100
  lines.push("\n==================================================");
927
- lines.push("ERRORS");
1101
+ lines.push("EXECUTION ERRORS");
928
1102
  lines.push("==================================================");
929
1103
  for (const error of summary.errors) {
930
1104
  lines.push(`
@@ -937,11 +1111,21 @@ function formatEvaluationSummary(summary) {
937
1111
  lines.push("EVALUATION SUMMARY");
938
1112
  lines.push("==================================================");
939
1113
  lines.push(`Total tests: ${summary.total}`);
940
- if (summary.errorCount > 0) {
941
- lines.push(`Failed: ${summary.errorCount}`);
942
- lines.push(`Passed: ${summary.total - summary.errorCount}`);
1114
+ lines.push(`Passed: ${summary.passedCount}`);
1115
+ if (summary.qualityFailureCount > 0) {
1116
+ lines.push(`Quality failures: ${summary.qualityFailureCount}`);
1117
+ }
1118
+ if (summary.executionErrorCount > 0) {
1119
+ lines.push(`Execution errors: ${summary.executionErrorCount}`);
1120
+ }
1121
+ if (summary.executionErrorCount > 0) {
1122
+ const qualityCount = summary.total - summary.executionErrorCount;
1123
+ lines.push(
1124
+ `Mean score: ${formatScore(summary.mean)} (${qualityCount} quality tests, ${summary.executionErrorCount} execution errors excluded)`
1125
+ );
1126
+ } else {
1127
+ lines.push(`Mean score: ${formatScore(summary.mean)}`);
943
1128
  }
944
- lines.push(`Mean score: ${formatScore(summary.mean)}`);
945
1129
  lines.push(`Median score: ${formatScore(summary.median)}`);
946
1130
  lines.push(`Min score: ${formatScore(summary.min)}`);
947
1131
  lines.push(`Max score: ${formatScore(summary.max)}`);
@@ -961,6 +1145,20 @@ function formatEvaluationSummary(summary) {
961
1145
  summary.bottomResults.forEach((result, index) => {
962
1146
  lines.push(` ${index + 1}. ${result.testId}: ${formatScore(result.score)}`);
963
1147
  });
1148
+ const failureStageEntries = Object.entries(summary.byFailureStage);
1149
+ if (failureStageEntries.length > 0) {
1150
+ lines.push("\nExecution errors by stage:");
1151
+ for (const [stage, count] of failureStageEntries) {
1152
+ lines.push(` ${stage}: ${count}`);
1153
+ }
1154
+ }
1155
+ const failureReasonEntries = Object.entries(summary.byFailureReason);
1156
+ if (failureReasonEntries.length > 0) {
1157
+ lines.push("\nExecution errors by reason:");
1158
+ for (const [reason, count] of failureReasonEntries) {
1159
+ lines.push(` ${reason}: ${count}`);
1160
+ }
1161
+ }
964
1162
  return lines.join("\n");
965
1163
  }
966
1164
  function formatMatrixSummary(results) {
@@ -2181,9 +2379,9 @@ async function validateMessagesFileRefs(messages, location, searchRoots, filePat
2181
2379
  }
2182
2380
 
2183
2381
  // src/commands/eval/targets.ts
2184
- var ANSI_YELLOW = "\x1B[33m";
2185
- var ANSI_RED = "\x1B[31m";
2186
- var ANSI_RESET = "\x1B[0m";
2382
+ var ANSI_YELLOW2 = "\x1B[33m";
2383
+ var ANSI_RED2 = "\x1B[31m";
2384
+ var ANSI_RESET2 = "\x1B[0m";
2187
2385
  function isTTY() {
2188
2386
  return process.stdout.isTTY ?? false;
2189
2387
  }
@@ -2229,8 +2427,8 @@ async function selectTarget(options) {
2229
2427
  Warnings in ${targetsFilePath}:`);
2230
2428
  for (const warning of warnings) {
2231
2429
  const location = warning.location ? ` [${warning.location}]` : "";
2232
- const prefix = useColors ? `${ANSI_YELLOW} \u26A0${ANSI_RESET}` : " \u26A0";
2233
- const message = useColors ? `${ANSI_YELLOW}${warning.message}${ANSI_RESET}` : warning.message;
2430
+ const prefix = useColors ? `${ANSI_YELLOW2} \u26A0${ANSI_RESET2}` : " \u26A0";
2431
+ const message = useColors ? `${ANSI_YELLOW2}${warning.message}${ANSI_RESET2}` : warning.message;
2234
2432
  console.warn(`${prefix}${location} ${message}`);
2235
2433
  }
2236
2434
  console.warn("");
@@ -2241,8 +2439,8 @@ Warnings in ${targetsFilePath}:`);
2241
2439
  Errors in ${targetsFilePath}:`);
2242
2440
  for (const error of errors) {
2243
2441
  const location = error.location ? ` [${error.location}]` : "";
2244
- const prefix = useColors ? `${ANSI_RED} \u2717${ANSI_RESET}` : " \u2717";
2245
- const message = useColors ? `${ANSI_RED}${error.message}${ANSI_RESET}` : error.message;
2442
+ const prefix = useColors ? `${ANSI_RED2} \u2717${ANSI_RESET2}` : " \u2717";
2443
+ const message = useColors ? `${ANSI_RED2}${error.message}${ANSI_RESET2}` : error.message;
2246
2444
  console.error(`${prefix}${location} ${message}`);
2247
2445
  }
2248
2446
  throw new Error(`Targets file validation failed with ${errors.length} error(s)`);
@@ -2320,8 +2518,8 @@ async function selectMultipleTargets(options) {
2320
2518
  Warnings in ${targetsFilePath}:`);
2321
2519
  for (const warning of warnings) {
2322
2520
  const location = warning.location ? ` [${warning.location}]` : "";
2323
- const prefix = useColors ? `${ANSI_YELLOW} \u26A0${ANSI_RESET}` : " \u26A0";
2324
- const message = useColors ? `${ANSI_YELLOW}${warning.message}${ANSI_RESET}` : warning.message;
2521
+ const prefix = useColors ? `${ANSI_YELLOW2} \u26A0${ANSI_RESET2}` : " \u26A0";
2522
+ const message = useColors ? `${ANSI_YELLOW2}${warning.message}${ANSI_RESET2}` : warning.message;
2325
2523
  console.warn(`${prefix}${location} ${message}`);
2326
2524
  }
2327
2525
  console.warn("");
@@ -2332,8 +2530,8 @@ Warnings in ${targetsFilePath}:`);
2332
2530
  Errors in ${targetsFilePath}:`);
2333
2531
  for (const error of errors) {
2334
2532
  const location = error.location ? ` [${error.location}]` : "";
2335
- const prefix = useColors ? `${ANSI_RED} \u2717${ANSI_RESET}` : " \u2717";
2336
- const message = useColors ? `${ANSI_RED}${error.message}${ANSI_RESET}` : error.message;
2533
+ const prefix = useColors ? `${ANSI_RED2} \u2717${ANSI_RESET2}` : " \u2717";
2534
+ const message = useColors ? `${ANSI_RED2}${error.message}${ANSI_RESET2}` : error.message;
2337
2535
  console.error(`${prefix}${location} ${message}`);
2338
2536
  }
2339
2537
  throw new Error(`Targets file validation failed with ${errors.length} error(s)`);
@@ -2494,7 +2692,8 @@ function normalizeOptions(rawOptions, config, yamlExecution) {
2494
2692
  exportOtel: normalizeBoolean(rawOptions.exportOtel),
2495
2693
  otelBackend: normalizeString(rawOptions.otelBackend),
2496
2694
  otelCaptureContent: normalizeBoolean(rawOptions.otelCaptureContent),
2497
- otelGroupTurns: normalizeBoolean(rawOptions.otelGroupTurns)
2695
+ otelGroupTurns: normalizeBoolean(rawOptions.otelGroupTurns),
2696
+ retryErrors: normalizeString(rawOptions.retryErrors)
2498
2697
  };
2499
2698
  }
2500
2699
  async function ensureFileExists(filePath, description) {
@@ -2628,7 +2827,8 @@ async function prepareFileMetadata(params) {
2628
2827
  suiteTargets,
2629
2828
  yamlCache: suite.cacheConfig?.enabled,
2630
2829
  yamlCachePath: suite.cacheConfig?.cachePath,
2631
- totalBudgetUsd: suite.totalBudgetUsd
2830
+ totalBudgetUsd: suite.totalBudgetUsd,
2831
+ failOnError: suite.failOnError
2632
2832
  };
2633
2833
  }
2634
2834
  async function runWithLimit(items, limit, task) {
@@ -2662,7 +2862,8 @@ async function runSingleEvalFile(params) {
2662
2862
  evalCases,
2663
2863
  trialsConfig,
2664
2864
  matrixMode,
2665
- totalBudgetUsd
2865
+ totalBudgetUsd,
2866
+ failOnError
2666
2867
  } = params;
2667
2868
  const targetName = selection.targetName;
2668
2869
  await ensureFileExists(testFilePath, "Test file");
@@ -2724,6 +2925,7 @@ async function runSingleEvalFile(params) {
2724
2925
  cleanupWorkspaces: options.cleanupWorkspaces,
2725
2926
  trials: trialsConfig,
2726
2927
  totalBudgetUsd,
2928
+ failOnError,
2727
2929
  streamCallbacks: streamingObserver?.getStreamCallbacks(),
2728
2930
  onResult: async (result) => {
2729
2931
  streamingObserver?.finalizeEvalCase(result.score, result.error);
@@ -2777,7 +2979,26 @@ async function runEvalCommand(input) {
2777
2979
  }
2778
2980
  const repoRoot = await findRepoRoot(cwd);
2779
2981
  const yamlConfig = await loadConfig(path10.join(cwd, "_"), repoRoot);
2780
- const options = normalizeOptions(input.rawOptions, config, yamlConfig?.execution);
2982
+ if (yamlConfig?.required_version) {
2983
+ await enforceRequiredVersion(yamlConfig.required_version, {
2984
+ strict: normalizeBoolean(input.rawOptions.strict)
2985
+ });
2986
+ }
2987
+ let options = normalizeOptions(input.rawOptions, config, yamlConfig?.execution);
2988
+ let retryNonErrorResults;
2989
+ if (options.retryErrors) {
2990
+ const retryPath = path10.resolve(options.retryErrors);
2991
+ await ensureFileExists(retryPath, "Retry-errors JSONL file");
2992
+ const errorIds = await loadErrorTestIds(retryPath);
2993
+ if (errorIds.length === 0) {
2994
+ console.log("No execution errors found in the previous output. Nothing to retry.");
2995
+ return;
2996
+ }
2997
+ console.log(`Retrying ${errorIds.length} execution-error test(s): ${errorIds.join(", ")}`);
2998
+ const filterPattern = errorIds.length === 1 ? errorIds[0] : `{${errorIds.join(",")}}`;
2999
+ options = { ...options, filter: filterPattern };
3000
+ retryNonErrorResults = await loadNonErrorResults(retryPath);
3001
+ }
2781
3002
  if (options.keepWorkspaces && options.cleanupWorkspaces) {
2782
3003
  console.warn(
2783
3004
  "Warning: Both --keep-workspaces and --cleanup-workspaces specified. --cleanup-workspaces takes precedence."
@@ -2790,7 +3011,7 @@ async function runEvalCommand(input) {
2790
3011
  const useFileExport = !!(options.otelFile || options.traceFile);
2791
3012
  if (options.exportOtel || useFileExport) {
2792
3013
  try {
2793
- const { OtelTraceExporter, OTEL_BACKEND_PRESETS } = await import("./dist-RVGCGRG4.js");
3014
+ const { OtelTraceExporter, OTEL_BACKEND_PRESETS } = await import("./dist-CCUHG3SN.js");
2794
3015
  let endpoint = process.env.OTEL_EXPORTER_OTLP_ENDPOINT;
2795
3016
  let headers = {};
2796
3017
  if (options.otelBackend) {
@@ -2985,12 +3206,22 @@ async function runEvalCommand(input) {
2985
3206
  evalCases: applicableEvalCases,
2986
3207
  trialsConfig: targetPrep.trialsConfig,
2987
3208
  matrixMode: targetPrep.selections.length > 1,
2988
- totalBudgetUsd: targetPrep.totalBudgetUsd
3209
+ totalBudgetUsd: targetPrep.totalBudgetUsd,
3210
+ failOnError: targetPrep.failOnError
2989
3211
  });
2990
3212
  allResults.push(...result.results);
2991
3213
  }
2992
3214
  });
2993
3215
  progressReporter.finish();
3216
+ if (retryNonErrorResults && retryNonErrorResults.length > 0) {
3217
+ for (const preserved of retryNonErrorResults) {
3218
+ await outputWriter.append(preserved);
3219
+ }
3220
+ allResults.push(...retryNonErrorResults);
3221
+ console.log(
3222
+ `Merged ${retryNonErrorResults.length} non-error result(s) from previous output.`
3223
+ );
3224
+ }
2994
3225
  const summary = calculateEvaluationSummary(allResults);
2995
3226
  console.log(formatEvaluationSummary(summary));
2996
3227
  if (isMatrixMode && allResults.length > 0) {
@@ -3048,6 +3279,7 @@ async function resolveEvaluationRunner() {
3048
3279
  }
3049
3280
 
3050
3281
  export {
3282
+ package_default,
3051
3283
  toSnakeCaseDeep,
3052
3284
  resolveEvalPaths,
3053
3285
  findRepoRoot,
@@ -3061,4 +3293,4 @@ export {
3061
3293
  selectTarget,
3062
3294
  runEvalCommand
3063
3295
  };
3064
- //# sourceMappingURL=chunk-APGYGAVM.js.map
3296
+ //# sourceMappingURL=chunk-UWDI4UVN.js.map