npm - agentv - Versions diffs - 2.12.0 → 2.14.0-next.1 - Mend

agentv 2.12.0 → 2.14.0-next.1

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (17)

package/README.md +14 -14
package/dist/{chunk-YBJX5CP6.js → chunk-K2APOWTE.js} +213 -29
package/dist/chunk-K2APOWTE.js.map +1 -0
package/dist/{chunk-LUHCYBMD.js → chunk-OQN2GDEU.js} +251 -164
package/dist/chunk-OQN2GDEU.js.map +1 -0
package/dist/{chunk-6KU2ZUFJ.js → chunk-ZSSGXZX6.js} +39 -77
package/dist/chunk-ZSSGXZX6.js.map +1 -0
package/dist/cli.js +3 -3
package/dist/{dist-OPPA4P5R.js → dist-QR5OZ4DH.js} +4 -2
package/dist/index.js +3 -3
package/dist/{interactive-TOUKPSHP.js → interactive-WF6UO63B.js} +3 -3
package/package.json +4 -2
package/dist/chunk-6KU2ZUFJ.js.map +0 -1
package/dist/chunk-LUHCYBMD.js.map +0 -1
package/dist/chunk-YBJX5CP6.js.map +0 -1
package/dist/{dist-OPPA4P5R.js.map → dist-QR5OZ4DH.js.map} +0 -0
package/dist/{interactive-TOUKPSHP.js.map → interactive-WF6UO63B.js.map} +0 -0

package/README.md CHANGED Viewed

@@ -60,7 +60,7 @@ tests:
     assert:
       - name: math_check
-        type: code_judge
+        type: code-judge
         script: ./validators/check_math.py
 ```
@@ -154,10 +154,10 @@ Optional sidecar YAML metadata file (`dataset.eval.yaml` alongside `dataset.json
 description: Math evaluation dataset
 dataset: math-tests
 execution:
-  target: azure_base
+  target: azure-base
 assert:
   - name: correctness
-    type: llm_judge
+    type: llm-judge
     prompt: ./judges/correctness.md
 ```
@@ -175,7 +175,7 @@ agentv validate evals/my-eval.yaml
 agentv eval evals/my-eval.yaml
 # Override target
-agentv eval --target azure_base evals/**/*.yaml
+agentv eval --target azure-base evals/**/*.yaml
 # Run specific test
 agentv eval --test-id case-123 evals/my-eval.yaml
@@ -219,7 +219,7 @@ Reference evaluators in your eval file:
 ```yaml
 assert:
   - name: my_validator
-    type: code_judge
+    type: code-judge
     script: ./validators/check_answer.py
 ```
@@ -339,7 +339,7 @@ Define execution targets in `.agentv/targets.yaml` to decouple evals from provid
 ```yaml
 targets:
-  - name: azure_base
+  - name: azure-base
     provider: azure
     endpoint: ${{ AZURE_OPENAI_ENDPOINT }}
     api_key: ${{ AZURE_OPENAI_API_KEY }}
@@ -348,12 +348,12 @@ targets:
   - name: vscode_dev
     provider: vscode
     workspace_template: ${{ WORKSPACE_PATH }}
-    judge_target: azure_base
+    judge_target: azure-base
   - name: local_agent
     provider: cli
     command: 'python agent.py --prompt-file {PROMPT_FILE} --output {OUTPUT_FILE}'
-    judge_target: azure_base
+    judge_target: azure-base
 ```
 Supports: `azure`, `anthropic`, `gemini`, `codex`, `copilot`, `pi-coding-agent`, `claude`, `vscode`, `vscode-insiders`, `cli`, and `mock`.
@@ -398,12 +398,12 @@ All assertions support `weight`, `required`, and `negate` flags. Use `negate: tr
 ```yaml
 assert:
   # Case-insensitive matching for natural language variation
-  - type: icontains_any
+  - type: icontains-any
     value: ["missing rule code", "need rule code", "provide rule code"]
     required: true
   # Multiple required terms
-  - type: icontains_all
+  - type: icontains-all
     value: ["country code", "rule codes"]
   # Case-insensitive regex
@@ -423,10 +423,10 @@ targets:
   # Agent target — requires judge_target for LLM-based evaluation
   - name: codex_local
     provider: codex
-    judge_target: azure_base  # Required: LLM provider for judging
+    judge_target: azure-base  # Required: LLM provider for judging
   # LLM target — no judge_target needed (judges itself)
-  - name: azure_base
+  - name: azure-base
     provider: azure
 ```
@@ -445,7 +445,7 @@ Create markdown judge files with evaluation criteria and scoring guidelines:
 ```yaml
 assert:
   - name: semantic_check
-    type: llm_judge
+    type: llm-judge
     prompt: ./judges/correctness.md
 ```
@@ -487,7 +487,7 @@ Configure automatic retry with exponential backoff:
 ```yaml
 targets:
-  - name: azure_base
+  - name: azure-base
     provider: azure
     max_retries: 5
     retry_initial_delay_ms: 2000

package/dist/{chunk-YBJX5CP6.js → chunk-K2APOWTE.js} RENAMED Viewed

@@ -25,7 +25,59 @@ import {
   subscribeToCopilotCliLogEntries,
   subscribeToCopilotSdkLogEntries,
   subscribeToPiLogEntries
-} from "./chunk-LUHCYBMD.js";
+} from "./chunk-OQN2GDEU.js";
+// package.json
+var package_default = {
+  name: "agentv",
+  version: "2.14.0-next.1",
+  description: "CLI entry point for AgentV",
+  type: "module",
+  repository: {
+    type: "git",
+    url: "https://github.com/EntityProcess/agentv.git"
+  },
+  homepage: "https://github.com/EntityProcess/agentv#readme",
+  bugs: {
+    url: "https://github.com/EntityProcess/agentv/issues"
+  },
+  bin: {
+    agentv: "./dist/cli.js"
+  },
+  files: ["dist", "README.md"],
+  scripts: {
+    dev: "bun src/cli.ts",
+    build: "tsup && bun run copy-readme",
+    "copy-readme": `bun -e "import { cpSync } from 'fs'; cpSync('../../README.md', 'README.md')"`,
+    prepublishOnly: "bun run copy-readme",
+    typecheck: "tsc --noEmit",
+    lint: "biome check .",
+    format: "biome format --write .",
+    fix: "biome check --write .",
+    test: "bun test",
+    "test:watch": "bun test --watch"
+  },
+  dependencies: {
+    "@anthropic-ai/claude-agent-sdk": "^0.2.49",
+    "@github/copilot-sdk": "^0.1.25",
+    "@inquirer/prompts": "^8.2.1",
+    "@mariozechner/pi-agent-core": "^0.54.2",
+    "@mariozechner/pi-ai": "^0.54.2",
+    "@openai/codex-sdk": "^0.104.0",
+    "cmd-ts": "^0.14.3",
+    dotenv: "^16.4.5",
+    "fast-glob": "^3.3.3",
+    json5: "^2.2.3",
+    micromatch: "^4.0.8",
+    semver: "^7.7.4",
+    yaml: "^2.6.1"
+  },
+  devDependencies: {
+    "@agentv/core": "workspace:*",
+    "@types/semver": "^7.7.1",
+    execa: "^9.3.0"
+  }
+};
 // src/commands/eval/shared.ts
 import { constants } from "node:fs";
@@ -152,6 +204,60 @@ import { access as access4 } from "node:fs/promises";
 import path10 from "node:path";
 import { pathToFileURL } from "node:url";
+// src/version-check.ts
+import { satisfies, validRange } from "semver";
+var ANSI_YELLOW = "\x1B[33m";
+var ANSI_RED = "\x1B[31m";
+var ANSI_RESET = "\x1B[0m";
+function checkVersion(requiredVersion) {
+  const currentVersion = package_default.version;
+  if (!requiredVersion.trim() || !validRange(requiredVersion)) {
+    throw new Error(
+      `Invalid required_version "${requiredVersion}" in .agentv/config.yaml. Must be a valid semver range (e.g., ">=2.11.0", "^2.11.0").`
+    );
+  }
+  return {
+    satisfied: satisfies(currentVersion, requiredVersion),
+    currentVersion,
+    requiredRange: requiredVersion
+  };
+}
+async function enforceRequiredVersion(requiredVersion, options) {
+  let result;
+  try {
+    result = checkVersion(requiredVersion);
+  } catch (err) {
+    console.error(`${ANSI_RED}Error: ${err.message}${ANSI_RESET}`);
+    process.exit(1);
+  }
+  if (result.satisfied) {
+    return;
+  }
+  const warning = `${ANSI_YELLOW}Warning: This project requires agentv ${result.requiredRange} but you have ${result.currentVersion}.${ANSI_RESET}
+  Run \`agentv self update\` to upgrade.`;
+  if (options?.strict) {
+    console.error(warning);
+    console.error(
+      `${ANSI_RED}Aborting: --strict mode requires the installed version to satisfy the required range.${ANSI_RESET}`
+    );
+    process.exit(1);
+  }
+  if (process.stdin.isTTY && process.stdout.isTTY) {
+    console.warn(warning);
+    const shouldContinue = await promptContinue();
+    if (!shouldContinue) {
+      process.exit(1);
+    }
+  } else {
+    process.stderr.write(`${warning}
+`);
+  }
+}
+async function promptContinue() {
+  const { confirm } = await import("@inquirer/prompts");
+  return confirm({ message: "Continue anyway?", default: false });
+}
 // src/commands/eval/env.ts
 import { constants as constants3 } from "node:fs";
 import { access as access3 } from "node:fs/promises";
@@ -822,6 +928,49 @@ var ProgressDisplay = class {
   }
 };
+// src/commands/eval/retry-errors.ts
+import { createReadStream } from "node:fs";
+import { createInterface } from "node:readline";
+async function loadErrorTestIds(jsonlPath) {
+  const ids = [];
+  const rl = createInterface({
+    input: createReadStream(jsonlPath),
+    crlfDelay: Number.POSITIVE_INFINITY
+  });
+  for await (const line of rl) {
+    const trimmed = line.trim();
+    if (!trimmed) continue;
+    try {
+      const parsed = JSON.parse(trimmed);
+      if (parsed.executionStatus === "execution_error" && parsed.testId) {
+        ids.push(parsed.testId);
+      }
+    } catch {
+    }
+  }
+  return [...new Set(ids)];
+}
+async function loadNonErrorResults(jsonlPath) {
+  const results = [];
+  const rl = createInterface({
+    input: createReadStream(jsonlPath),
+    crlfDelay: Number.POSITIVE_INFINITY
+  });
+  for await (const line of rl) {
+    const trimmed = line.trim();
+    if (!trimmed) continue;
+    try {
+      const parsed = JSON.parse(trimmed);
+      if (!parsed.testId || parsed.score === void 0) continue;
+      if (parsed.executionStatus !== "execution_error") {
+        results.push(parsed);
+      }
+    } catch {
+    }
+  }
+  return results;
+}
 // src/commands/eval/statistics.ts
 var HISTOGRAM_BREAKPOINTS = [0, 0.2, 0.4, 0.6, 0.8, 1];
 function computeMean(values) {
@@ -1117,16 +1266,16 @@ function inferFileTypeFromPath(filePath) {
 var ASSERTION_TYPES_WITH_STRING_VALUE = /* @__PURE__ */ new Set([
   "contains",
   "icontains",
-  "starts_with",
-  "ends_with",
+  "starts-with",
+  "ends-with",
   "equals",
   "regex"
 ]);
 var ASSERTION_TYPES_WITH_ARRAY_VALUE = /* @__PURE__ */ new Set([
-  "contains_any",
-  "contains_all",
-  "icontains_any",
-  "icontains_all"
+  "contains-any",
+  "contains-all",
+  "icontains-any",
+  "icontains-all"
 ]);
 var VALID_TEST_FILE_EXTENSIONS = /* @__PURE__ */ new Set([".yaml", ".yml", ".jsonl"]);
 var NAME_PATTERN = /^[a-z0-9-]+$/;
@@ -1492,8 +1641,8 @@ function validateAssertArray(assertField, parentLocation, filePath, errors) {
       });
       continue;
     }
-    const typeValue = item.type;
-    if (typeValue === void 0 || typeof typeValue !== "string") {
+    const rawTypeValue = item.type;
+    if (rawTypeValue === void 0 || typeof rawTypeValue !== "string") {
       errors.push({
         severity: "warning",
         filePath,
@@ -1502,12 +1651,13 @@ function validateAssertArray(assertField, parentLocation, filePath, errors) {
       });
       continue;
     }
+    const typeValue = rawTypeValue.replace(/_/g, "-");
     if (!isEvaluatorKind(typeValue)) {
       errors.push({
         severity: "warning",
         filePath,
         location: `${location}.type`,
-        message: `Unknown assertion type '${typeValue}'.`
+        message: `Unknown assertion type '${rawTypeValue}'.`
       });
       continue;
     }
@@ -1732,7 +1882,7 @@ var MOCK_SETTINGS = /* @__PURE__ */ new Set([
   "delayMinMs",
   "delayMaxMs",
   "trace"
-  // For testing tool_trajectory evaluator
+  // For testing tool-trajectory evaluator
 ]);
 var CLAUDE_SETTINGS = /* @__PURE__ */ new Set([
   ...COMMON_SETTINGS,
@@ -2230,9 +2380,9 @@ async function validateMessagesFileRefs(messages, location, searchRoots, filePat
 }
 // src/commands/eval/targets.ts
-var ANSI_YELLOW = "\x1B[33m";
-var ANSI_RED = "\x1B[31m";
-var ANSI_RESET = "\x1B[0m";
+var ANSI_YELLOW2 = "\x1B[33m";
+var ANSI_RED2 = "\x1B[31m";
+var ANSI_RESET2 = "\x1B[0m";
 function isTTY() {
   return process.stdout.isTTY ?? false;
 }
@@ -2278,8 +2428,8 @@ async function selectTarget(options) {
 Warnings in ${targetsFilePath}:`);
     for (const warning of warnings) {
       const location = warning.location ? ` [${warning.location}]` : "";
-      const prefix = useColors ? `${ANSI_YELLOW}  \u26A0${ANSI_RESET}` : "  \u26A0";
-      const message = useColors ? `${ANSI_YELLOW}${warning.message}${ANSI_RESET}` : warning.message;
+      const prefix = useColors ? `${ANSI_YELLOW2}  \u26A0${ANSI_RESET2}` : "  \u26A0";
+      const message = useColors ? `${ANSI_YELLOW2}${warning.message}${ANSI_RESET2}` : warning.message;
       console.warn(`${prefix}${location} ${message}`);
     }
     console.warn("");
@@ -2290,8 +2440,8 @@ Warnings in ${targetsFilePath}:`);
 Errors in ${targetsFilePath}:`);
     for (const error of errors) {
       const location = error.location ? ` [${error.location}]` : "";
-      const prefix = useColors ? `${ANSI_RED}  \u2717${ANSI_RESET}` : "  \u2717";
-      const message = useColors ? `${ANSI_RED}${error.message}${ANSI_RESET}` : error.message;
+      const prefix = useColors ? `${ANSI_RED2}  \u2717${ANSI_RESET2}` : "  \u2717";
+      const message = useColors ? `${ANSI_RED2}${error.message}${ANSI_RESET2}` : error.message;
       console.error(`${prefix}${location} ${message}`);
     }
     throw new Error(`Targets file validation failed with ${errors.length} error(s)`);
@@ -2369,8 +2519,8 @@ async function selectMultipleTargets(options) {
 Warnings in ${targetsFilePath}:`);
     for (const warning of warnings) {
       const location = warning.location ? ` [${warning.location}]` : "";
-      const prefix = useColors ? `${ANSI_YELLOW}  \u26A0${ANSI_RESET}` : "  \u26A0";
-      const message = useColors ? `${ANSI_YELLOW}${warning.message}${ANSI_RESET}` : warning.message;
+      const prefix = useColors ? `${ANSI_YELLOW2}  \u26A0${ANSI_RESET2}` : "  \u26A0";
+      const message = useColors ? `${ANSI_YELLOW2}${warning.message}${ANSI_RESET2}` : warning.message;
       console.warn(`${prefix}${location} ${message}`);
     }
     console.warn("");
@@ -2381,8 +2531,8 @@ Warnings in ${targetsFilePath}:`);
 Errors in ${targetsFilePath}:`);
     for (const error of errors) {
       const location = error.location ? ` [${error.location}]` : "";
-      const prefix = useColors ? `${ANSI_RED}  \u2717${ANSI_RESET}` : "  \u2717";
-      const message = useColors ? `${ANSI_RED}${error.message}${ANSI_RESET}` : error.message;
+      const prefix = useColors ? `${ANSI_RED2}  \u2717${ANSI_RESET2}` : "  \u2717";
+      const message = useColors ? `${ANSI_RED2}${error.message}${ANSI_RESET2}` : error.message;
       console.error(`${prefix}${location} ${message}`);
     }
     throw new Error(`Targets file validation failed with ${errors.length} error(s)`);
@@ -2543,7 +2693,8 @@ function normalizeOptions(rawOptions, config, yamlExecution) {
     exportOtel: normalizeBoolean(rawOptions.exportOtel),
     otelBackend: normalizeString(rawOptions.otelBackend),
     otelCaptureContent: normalizeBoolean(rawOptions.otelCaptureContent),
-    otelGroupTurns: normalizeBoolean(rawOptions.otelGroupTurns)
+    otelGroupTurns: normalizeBoolean(rawOptions.otelGroupTurns),
+    retryErrors: normalizeString(rawOptions.retryErrors)
   };
 }
 async function ensureFileExists(filePath, description) {
@@ -2677,7 +2828,8 @@ async function prepareFileMetadata(params) {
     suiteTargets,
     yamlCache: suite.cacheConfig?.enabled,
     yamlCachePath: suite.cacheConfig?.cachePath,
-    totalBudgetUsd: suite.totalBudgetUsd
+    totalBudgetUsd: suite.totalBudgetUsd,
+    failOnError: suite.failOnError
   };
 }
 async function runWithLimit(items, limit, task) {
@@ -2711,7 +2863,8 @@ async function runSingleEvalFile(params) {
     evalCases,
     trialsConfig,
     matrixMode,
-    totalBudgetUsd
+    totalBudgetUsd,
+    failOnError
   } = params;
   const targetName = selection.targetName;
   await ensureFileExists(testFilePath, "Test file");
@@ -2773,6 +2926,7 @@ async function runSingleEvalFile(params) {
     cleanupWorkspaces: options.cleanupWorkspaces,
     trials: trialsConfig,
     totalBudgetUsd,
+    failOnError,
     streamCallbacks: streamingObserver?.getStreamCallbacks(),
     onResult: async (result) => {
       streamingObserver?.finalizeEvalCase(result.score, result.error);
@@ -2826,7 +2980,26 @@ async function runEvalCommand(input) {
   }
   const repoRoot = await findRepoRoot(cwd);
   const yamlConfig = await loadConfig(path10.join(cwd, "_"), repoRoot);
-  const options = normalizeOptions(input.rawOptions, config, yamlConfig?.execution);
+  if (yamlConfig?.required_version) {
+    await enforceRequiredVersion(yamlConfig.required_version, {
+      strict: normalizeBoolean(input.rawOptions.strict)
+    });
+  }
+  let options = normalizeOptions(input.rawOptions, config, yamlConfig?.execution);
+  let retryNonErrorResults;
+  if (options.retryErrors) {
+    const retryPath = path10.resolve(options.retryErrors);
+    await ensureFileExists(retryPath, "Retry-errors JSONL file");
+    const errorIds = await loadErrorTestIds(retryPath);
+    if (errorIds.length === 0) {
+      console.log("No execution errors found in the previous output. Nothing to retry.");
+      return;
+    }
+    console.log(`Retrying ${errorIds.length} execution-error test(s): ${errorIds.join(", ")}`);
+    const filterPattern = errorIds.length === 1 ? errorIds[0] : `{${errorIds.join(",")}}`;
+    options = { ...options, filter: filterPattern };
+    retryNonErrorResults = await loadNonErrorResults(retryPath);
+  }
   if (options.keepWorkspaces && options.cleanupWorkspaces) {
     console.warn(
       "Warning: Both --keep-workspaces and --cleanup-workspaces specified. --cleanup-workspaces takes precedence."
@@ -2839,7 +3012,7 @@ async function runEvalCommand(input) {
   const useFileExport = !!(options.otelFile || options.traceFile);
   if (options.exportOtel || useFileExport) {
     try {
-      const { OtelTraceExporter, OTEL_BACKEND_PRESETS } = await import("./dist-OPPA4P5R.js");
+      const { OtelTraceExporter, OTEL_BACKEND_PRESETS } = await import("./dist-QR5OZ4DH.js");
       let endpoint = process.env.OTEL_EXPORTER_OTLP_ENDPOINT;
       let headers = {};
       if (options.otelBackend) {
@@ -3034,12 +3207,22 @@ async function runEvalCommand(input) {
           evalCases: applicableEvalCases,
           trialsConfig: targetPrep.trialsConfig,
           matrixMode: targetPrep.selections.length > 1,
-          totalBudgetUsd: targetPrep.totalBudgetUsd
+          totalBudgetUsd: targetPrep.totalBudgetUsd,
+          failOnError: targetPrep.failOnError
         });
         allResults.push(...result.results);
       }
     });
     progressReporter.finish();
+    if (retryNonErrorResults && retryNonErrorResults.length > 0) {
+      for (const preserved of retryNonErrorResults) {
+        await outputWriter.append(preserved);
+      }
+      allResults.push(...retryNonErrorResults);
+      console.log(
+        `Merged ${retryNonErrorResults.length} non-error result(s) from previous output.`
+      );
+    }
     const summary = calculateEvaluationSummary(allResults);
     console.log(formatEvaluationSummary(summary));
     if (isMatrixMode && allResults.length > 0) {
@@ -3097,6 +3280,7 @@ async function resolveEvaluationRunner() {
 }
 export {
+  package_default,
   toSnakeCaseDeep,
   resolveEvalPaths,
   findRepoRoot,
@@ -3110,4 +3294,4 @@ export {
   selectTarget,
   runEvalCommand
 };
-//# sourceMappingURL=chunk-YBJX5CP6.js.map
+//# sourceMappingURL=chunk-K2APOWTE.js.map