npm - agentv - Versions diffs - 2.11.4 → 2.13.0 - Mend

agentv 2.11.4 → 2.13.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (16)

package/dist/{chunk-KWUTY5XR.js → chunk-FSBZM3HT.js} +176 -31
package/dist/chunk-FSBZM3HT.js.map +1 -0
package/dist/{chunk-FBGAD3CQ.js → chunk-M6JYP6A6.js} +17 -55
package/dist/chunk-M6JYP6A6.js.map +1 -0
package/dist/{chunk-APGYGAVM.js → chunk-UWDI4UVN.js} +266 -34
package/dist/chunk-UWDI4UVN.js.map +1 -0
package/dist/cli.js +3 -3
package/dist/{dist-RVGCGRG4.js → dist-CCUHG3SN.js} +4 -2
package/dist/index.js +3 -3
package/dist/{interactive-O2KBWGEI.js → interactive-P3D5O673.js} +3 -3
package/package.json +4 -2
package/dist/chunk-APGYGAVM.js.map +0 -1
package/dist/chunk-FBGAD3CQ.js.map +0 -1
package/dist/chunk-KWUTY5XR.js.map +0 -1
package/dist/{dist-RVGCGRG4.js.map → dist-CCUHG3SN.js.map} +0 -0
package/dist/{interactive-O2KBWGEI.js.map → interactive-P3D5O673.js.map} +0 -0

package/dist/{chunk-APGYGAVM.js → chunk-UWDI4UVN.js} (renamed)

@@ -25,7 +25,59 @@ import {
   subscribeToCopilotCliLogEntries,
   subscribeToCopilotSdkLogEntries,
   subscribeToPiLogEntries
-} from "./chunk-KWUTY5XR.js";
+} from "./chunk-FSBZM3HT.js";
+// package.json
+var package_default = {
+  name: "agentv",
+  version: "2.13.0",
+  description: "CLI entry point for AgentV",
+  type: "module",
+  repository: {
+    type: "git",
+    url: "https://github.com/EntityProcess/agentv.git"
+  },
+  homepage: "https://github.com/EntityProcess/agentv#readme",
+  bugs: {
+    url: "https://github.com/EntityProcess/agentv/issues"
+  },
+  bin: {
+    agentv: "./dist/cli.js"
+  },
+  files: ["dist", "README.md"],
+  scripts: {
+    dev: "bun src/cli.ts",
+    build: "tsup && bun run copy-readme",
+    "copy-readme": `bun -e "import { cpSync } from 'fs'; cpSync('../../README.md', 'README.md')"`,
+    prepublishOnly: "bun run copy-readme",
+    typecheck: "tsc --noEmit",
+    lint: "biome check .",
+    format: "biome format --write .",
+    fix: "biome check --write .",
+    test: "bun test",
+    "test:watch": "bun test --watch"
+  },
+  dependencies: {
+    "@anthropic-ai/claude-agent-sdk": "^0.2.49",
+    "@github/copilot-sdk": "^0.1.25",
+    "@inquirer/prompts": "^8.2.1",
+    "@mariozechner/pi-agent-core": "^0.54.2",
+    "@mariozechner/pi-ai": "^0.54.2",
+    "@openai/codex-sdk": "^0.104.0",
+    "cmd-ts": "^0.14.3",
+    dotenv: "^16.4.5",
+    "fast-glob": "^3.3.3",
+    json5: "^2.2.3",
+    micromatch: "^4.0.8",
+    semver: "^7.7.4",
+    yaml: "^2.6.1"
+  },
+  devDependencies: {
+    "@agentv/core": "workspace:*",
+    "@types/semver": "^7.7.1",
+    execa: "^9.3.0"
+  }
+};
 // src/commands/eval/shared.ts
 import { constants } from "node:fs";
@@ -152,6 +204,60 @@ import { access as access4 } from "node:fs/promises";
 import path10 from "node:path";
 import { pathToFileURL } from "node:url";
+// src/version-check.ts
+import { satisfies, validRange } from "semver";
+var ANSI_YELLOW = "\x1B[33m";
+var ANSI_RED = "\x1B[31m";
+var ANSI_RESET = "\x1B[0m";
+function checkVersion(requiredVersion) {
+  const currentVersion = package_default.version;
+  if (!requiredVersion.trim() || !validRange(requiredVersion)) {
+    throw new Error(
+      `Invalid required_version "${requiredVersion}" in .agentv/config.yaml. Must be a valid semver range (e.g., ">=2.11.0", "^2.11.0").`
+    );
+  }
+  return {
+    satisfied: satisfies(currentVersion, requiredVersion),
+    currentVersion,
+    requiredRange: requiredVersion
+  };
+}
+async function enforceRequiredVersion(requiredVersion, options) {
+  let result;
+  try {
+    result = checkVersion(requiredVersion);
+  } catch (err) {
+    console.error(`${ANSI_RED}Error: ${err.message}${ANSI_RESET}`);
+    process.exit(1);
+  }
+  if (result.satisfied) {
+    return;
+  }
+  const warning = `${ANSI_YELLOW}Warning: This project requires agentv ${result.requiredRange} but you have ${result.currentVersion}.${ANSI_RESET}
+  Run \`agentv self update\` to upgrade.`;
+  if (options?.strict) {
+    console.error(warning);
+    console.error(
+      `${ANSI_RED}Aborting: --strict mode requires the installed version to satisfy the required range.${ANSI_RESET}`
+    );
+    process.exit(1);
+  }
+  if (process.stdin.isTTY && process.stdout.isTTY) {
+    console.warn(warning);
+    const shouldContinue = await promptContinue();
+    if (!shouldContinue) {
+      process.exit(1);
+    }
+  } else {
+    process.stderr.write(`${warning}
+`);
+  }
+}
+async function promptContinue() {
+  const { confirm } = await import("@inquirer/prompts");
+  return confirm({ message: "Continue anyway?", default: false });
+}
 // src/commands/eval/env.ts
 import { constants as constants3 } from "node:fs";
 import { access as access3 } from "node:fs/promises";
@@ -822,6 +928,49 @@ var ProgressDisplay = class {
   }
 };
+// src/commands/eval/retry-errors.ts
+import { createReadStream } from "node:fs";
+import { createInterface } from "node:readline";
+async function loadErrorTestIds(jsonlPath) {
+  const ids = [];
+  const rl = createInterface({
+    input: createReadStream(jsonlPath),
+    crlfDelay: Number.POSITIVE_INFINITY
+  });
+  for await (const line of rl) {
+    const trimmed = line.trim();
+    if (!trimmed) continue;
+    try {
+      const parsed = JSON.parse(trimmed);
+      if (parsed.executionStatus === "execution_error" && parsed.testId) {
+        ids.push(parsed.testId);
+      }
+    } catch {
+    }
+  }
+  return [...new Set(ids)];
+}
+async function loadNonErrorResults(jsonlPath) {
+  const results = [];
+  const rl = createInterface({
+    input: createReadStream(jsonlPath),
+    crlfDelay: Number.POSITIVE_INFINITY
+  });
+  for await (const line of rl) {
+    const trimmed = line.trim();
+    if (!trimmed) continue;
+    try {
+      const parsed = JSON.parse(trimmed);
+      if (!parsed.testId || parsed.score === void 0) continue;
+      if (parsed.executionStatus !== "execution_error") {
+        results.push(parsed);
+      }
+    } catch {
+    }
+  }
+  return results;
+}
 // src/commands/eval/statistics.ts
 var HISTOGRAM_BREAKPOINTS = [0, 0.2, 0.4, 0.6, 0.8, 1];
 function computeMean(values) {
@@ -872,7 +1021,6 @@ function buildHistogram(values) {
   return bins;
 }
 function calculateEvaluationSummary(results) {
-  const scores = results.map((result) => result.score);
   const total = results.length;
   const errors = results.filter((result) => result.error !== void 0).map((result) => ({ testId: result.testId, error: result.error }));
   const errorCount = errors.length;
@@ -888,18 +1036,39 @@ function calculateEvaluationSummary(results) {
       topResults: [],
       bottomResults: [],
       errorCount: 0,
-      errors: []
+      errors: [],
+      executionErrorCount: 0,
+      qualityFailureCount: 0,
+      passedCount: 0,
+      byFailureStage: {},
+      byFailureReason: {}
     };
   }
-  const mean = computeMean(scores);
-  const median = computeMedian(scores);
-  const min = Math.min(...scores);
-  const max = Math.max(...scores);
-  const standardDeviation = computeStandardDeviation(scores);
-  const histogram = buildHistogram(scores);
-  const sortedResults = [...results].sort((a, b) => b.score - a.score);
+  const executionErrors = results.filter((r) => r.executionStatus === "execution_error");
+  const qualityResults = results.filter((r) => r.executionStatus !== "execution_error");
+  const qualityScores = qualityResults.map((r) => r.score);
+  const mean = computeMean(qualityScores);
+  const median = computeMedian(qualityScores);
+  const min = qualityScores.length > 0 ? Math.min(...qualityScores) : 0;
+  const max = qualityScores.length > 0 ? Math.max(...qualityScores) : 0;
+  const standardDeviation = computeStandardDeviation(qualityScores);
+  const histogram = buildHistogram(qualityScores);
+  const sortedResults = [...qualityResults].sort((a, b) => b.score - a.score);
   const topResults = sortedResults.slice(0, Math.min(3, sortedResults.length));
   const bottomResults = sortedResults.slice(-Math.min(3, sortedResults.length));
+  const executionErrorCount = executionErrors.length;
+  const qualityFailureCount = results.filter((r) => r.executionStatus === "quality_failure").length;
+  const passedCount = results.filter((r) => r.executionStatus === "ok").length;
+  const byFailureStage = {};
+  const byFailureReason = {};
+  for (const result of executionErrors) {
+    if (result.failureStage) {
+      byFailureStage[result.failureStage] = (byFailureStage[result.failureStage] ?? 0) + 1;
+    }
+    if (result.failureReasonCode) {
+      byFailureReason[result.failureReasonCode] = (byFailureReason[result.failureReasonCode] ?? 0) + 1;
+    }
+  }
   return {
     total,
     mean,
@@ -911,7 +1080,12 @@ function calculateEvaluationSummary(results) {
     topResults,
     bottomResults,
     errorCount,
-    errors
+    errors,
+    executionErrorCount,
+    qualityFailureCount,
+    passedCount,
+    byFailureStage,
+    byFailureReason
   };
 }
 function formatScore(value) {
@@ -924,7 +1098,7 @@ function formatEvaluationSummary(summary) {
   const lines = [];
   if (summary.errorCount > 0) {
     lines.push("\n==================================================");
-    lines.push("ERRORS");
+    lines.push("EXECUTION ERRORS");
     lines.push("==================================================");
     for (const error of summary.errors) {
       lines.push(`
@@ -937,11 +1111,21 @@ function formatEvaluationSummary(summary) {
   lines.push("EVALUATION SUMMARY");
   lines.push("==================================================");
   lines.push(`Total tests: ${summary.total}`);
-  if (summary.errorCount > 0) {
-    lines.push(`Failed: ${summary.errorCount}`);
-    lines.push(`Passed: ${summary.total - summary.errorCount}`);
+  lines.push(`Passed: ${summary.passedCount}`);
+  if (summary.qualityFailureCount > 0) {
+    lines.push(`Quality failures: ${summary.qualityFailureCount}`);
+  }
+  if (summary.executionErrorCount > 0) {
+    lines.push(`Execution errors: ${summary.executionErrorCount}`);
+  }
+  if (summary.executionErrorCount > 0) {
+    const qualityCount = summary.total - summary.executionErrorCount;
+    lines.push(
+      `Mean score: ${formatScore(summary.mean)} (${qualityCount} quality tests, ${summary.executionErrorCount} execution errors excluded)`
+    );
+  } else {
+    lines.push(`Mean score: ${formatScore(summary.mean)}`);
   }
-  lines.push(`Mean score: ${formatScore(summary.mean)}`);
   lines.push(`Median score: ${formatScore(summary.median)}`);
   lines.push(`Min score: ${formatScore(summary.min)}`);
   lines.push(`Max score: ${formatScore(summary.max)}`);
@@ -961,6 +1145,20 @@ function formatEvaluationSummary(summary) {
   summary.bottomResults.forEach((result, index) => {
     lines.push(`  ${index + 1}. ${result.testId}: ${formatScore(result.score)}`);
   });
+  const failureStageEntries = Object.entries(summary.byFailureStage);
+  if (failureStageEntries.length > 0) {
+    lines.push("\nExecution errors by stage:");
+    for (const [stage, count] of failureStageEntries) {
+      lines.push(`  ${stage}: ${count}`);
+    }
+  }
+  const failureReasonEntries = Object.entries(summary.byFailureReason);
+  if (failureReasonEntries.length > 0) {
+    lines.push("\nExecution errors by reason:");
+    for (const [reason, count] of failureReasonEntries) {
+      lines.push(`  ${reason}: ${count}`);
+    }
+  }
   return lines.join("\n");
 }
 function formatMatrixSummary(results) {
@@ -2181,9 +2379,9 @@ async function validateMessagesFileRefs(messages, location, searchRoots, filePat
 }
 // src/commands/eval/targets.ts
-var ANSI_YELLOW = "\x1B[33m";
-var ANSI_RED = "\x1B[31m";
-var ANSI_RESET = "\x1B[0m";
+var ANSI_YELLOW2 = "\x1B[33m";
+var ANSI_RED2 = "\x1B[31m";
+var ANSI_RESET2 = "\x1B[0m";
 function isTTY() {
   return process.stdout.isTTY ?? false;
 }
@@ -2229,8 +2427,8 @@ async function selectTarget(options) {
 Warnings in ${targetsFilePath}:`);
     for (const warning of warnings) {
       const location = warning.location ? ` [${warning.location}]` : "";
-      const prefix = useColors ? `${ANSI_YELLOW}  \u26A0${ANSI_RESET}` : "  \u26A0";
-      const message = useColors ? `${ANSI_YELLOW}${warning.message}${ANSI_RESET}` : warning.message;
+      const prefix = useColors ? `${ANSI_YELLOW2}  \u26A0${ANSI_RESET2}` : "  \u26A0";
+      const message = useColors ? `${ANSI_YELLOW2}${warning.message}${ANSI_RESET2}` : warning.message;
       console.warn(`${prefix}${location} ${message}`);
     }
     console.warn("");
@@ -2241,8 +2439,8 @@ Warnings in ${targetsFilePath}:`);
 Errors in ${targetsFilePath}:`);
     for (const error of errors) {
       const location = error.location ? ` [${error.location}]` : "";
-      const prefix = useColors ? `${ANSI_RED}  \u2717${ANSI_RESET}` : "  \u2717";
-      const message = useColors ? `${ANSI_RED}${error.message}${ANSI_RESET}` : error.message;
+      const prefix = useColors ? `${ANSI_RED2}  \u2717${ANSI_RESET2}` : "  \u2717";
+      const message = useColors ? `${ANSI_RED2}${error.message}${ANSI_RESET2}` : error.message;
       console.error(`${prefix}${location} ${message}`);
     }
     throw new Error(`Targets file validation failed with ${errors.length} error(s)`);
@@ -2320,8 +2518,8 @@ async function selectMultipleTargets(options) {
 Warnings in ${targetsFilePath}:`);
     for (const warning of warnings) {
       const location = warning.location ? ` [${warning.location}]` : "";
-      const prefix = useColors ? `${ANSI_YELLOW}  \u26A0${ANSI_RESET}` : "  \u26A0";
-      const message = useColors ? `${ANSI_YELLOW}${warning.message}${ANSI_RESET}` : warning.message;
+      const prefix = useColors ? `${ANSI_YELLOW2}  \u26A0${ANSI_RESET2}` : "  \u26A0";
+      const message = useColors ? `${ANSI_YELLOW2}${warning.message}${ANSI_RESET2}` : warning.message;
       console.warn(`${prefix}${location} ${message}`);
     }
     console.warn("");
@@ -2332,8 +2530,8 @@ Warnings in ${targetsFilePath}:`);
 Errors in ${targetsFilePath}:`);
     for (const error of errors) {
       const location = error.location ? ` [${error.location}]` : "";
-      const prefix = useColors ? `${ANSI_RED}  \u2717${ANSI_RESET}` : "  \u2717";
-      const message = useColors ? `${ANSI_RED}${error.message}${ANSI_RESET}` : error.message;
+      const prefix = useColors ? `${ANSI_RED2}  \u2717${ANSI_RESET2}` : "  \u2717";
+      const message = useColors ? `${ANSI_RED2}${error.message}${ANSI_RESET2}` : error.message;
       console.error(`${prefix}${location} ${message}`);
     }
     throw new Error(`Targets file validation failed with ${errors.length} error(s)`);
@@ -2494,7 +2692,8 @@ function normalizeOptions(rawOptions, config, yamlExecution) {
     exportOtel: normalizeBoolean(rawOptions.exportOtel),
     otelBackend: normalizeString(rawOptions.otelBackend),
     otelCaptureContent: normalizeBoolean(rawOptions.otelCaptureContent),
-    otelGroupTurns: normalizeBoolean(rawOptions.otelGroupTurns)
+    otelGroupTurns: normalizeBoolean(rawOptions.otelGroupTurns),
+    retryErrors: normalizeString(rawOptions.retryErrors)
   };
 }
 async function ensureFileExists(filePath, description) {
@@ -2628,7 +2827,8 @@ async function prepareFileMetadata(params) {
     suiteTargets,
     yamlCache: suite.cacheConfig?.enabled,
     yamlCachePath: suite.cacheConfig?.cachePath,
-    totalBudgetUsd: suite.totalBudgetUsd
+    totalBudgetUsd: suite.totalBudgetUsd,
+    failOnError: suite.failOnError
   };
 }
 async function runWithLimit(items, limit, task) {
@@ -2662,7 +2862,8 @@ async function runSingleEvalFile(params) {
     evalCases,
     trialsConfig,
     matrixMode,
-    totalBudgetUsd
+    totalBudgetUsd,
+    failOnError
   } = params;
   const targetName = selection.targetName;
   await ensureFileExists(testFilePath, "Test file");
@@ -2724,6 +2925,7 @@ async function runSingleEvalFile(params) {
     cleanupWorkspaces: options.cleanupWorkspaces,
     trials: trialsConfig,
     totalBudgetUsd,
+    failOnError,
     streamCallbacks: streamingObserver?.getStreamCallbacks(),
     onResult: async (result) => {
       streamingObserver?.finalizeEvalCase(result.score, result.error);
@@ -2777,7 +2979,26 @@ async function runEvalCommand(input) {
   }
   const repoRoot = await findRepoRoot(cwd);
   const yamlConfig = await loadConfig(path10.join(cwd, "_"), repoRoot);
-  const options = normalizeOptions(input.rawOptions, config, yamlConfig?.execution);
+  if (yamlConfig?.required_version) {
+    await enforceRequiredVersion(yamlConfig.required_version, {
+      strict: normalizeBoolean(input.rawOptions.strict)
+    });
+  }
+  let options = normalizeOptions(input.rawOptions, config, yamlConfig?.execution);
+  let retryNonErrorResults;
+  if (options.retryErrors) {
+    const retryPath = path10.resolve(options.retryErrors);
+    await ensureFileExists(retryPath, "Retry-errors JSONL file");
+    const errorIds = await loadErrorTestIds(retryPath);
+    if (errorIds.length === 0) {
+      console.log("No execution errors found in the previous output. Nothing to retry.");
+      return;
+    }
+    console.log(`Retrying ${errorIds.length} execution-error test(s): ${errorIds.join(", ")}`);
+    const filterPattern = errorIds.length === 1 ? errorIds[0] : `{${errorIds.join(",")}}`;
+    options = { ...options, filter: filterPattern };
+    retryNonErrorResults = await loadNonErrorResults(retryPath);
+  }
   if (options.keepWorkspaces && options.cleanupWorkspaces) {
     console.warn(
       "Warning: Both --keep-workspaces and --cleanup-workspaces specified. --cleanup-workspaces takes precedence."
@@ -2790,7 +3011,7 @@ async function runEvalCommand(input) {
   const useFileExport = !!(options.otelFile || options.traceFile);
   if (options.exportOtel || useFileExport) {
     try {
-      const { OtelTraceExporter, OTEL_BACKEND_PRESETS } = await import("./dist-RVGCGRG4.js");
+      const { OtelTraceExporter, OTEL_BACKEND_PRESETS } = await import("./dist-CCUHG3SN.js");
       let endpoint = process.env.OTEL_EXPORTER_OTLP_ENDPOINT;
       let headers = {};
       if (options.otelBackend) {
@@ -2985,12 +3206,22 @@ async function runEvalCommand(input) {
           evalCases: applicableEvalCases,
           trialsConfig: targetPrep.trialsConfig,
           matrixMode: targetPrep.selections.length > 1,
-          totalBudgetUsd: targetPrep.totalBudgetUsd
+          totalBudgetUsd: targetPrep.totalBudgetUsd,
+          failOnError: targetPrep.failOnError
         });
         allResults.push(...result.results);
       }
     });
     progressReporter.finish();
+    if (retryNonErrorResults && retryNonErrorResults.length > 0) {
+      for (const preserved of retryNonErrorResults) {
+        await outputWriter.append(preserved);
+      }
+      allResults.push(...retryNonErrorResults);
+      console.log(
+        `Merged ${retryNonErrorResults.length} non-error result(s) from previous output.`
+      );
+    }
     const summary = calculateEvaluationSummary(allResults);
     console.log(formatEvaluationSummary(summary));
     if (isMatrixMode && allResults.length > 0) {
@@ -3048,6 +3279,7 @@ async function resolveEvaluationRunner() {
 }
 export {
+  package_default,
   toSnakeCaseDeep,
   resolveEvalPaths,
   findRepoRoot,
@@ -3061,4 +3293,4 @@ export {
   selectTarget,
   runEvalCommand
 };
-//# sourceMappingURL=chunk-APGYGAVM.js.map
+//# sourceMappingURL=chunk-UWDI4UVN.js.map