npm - agentv - Versions diffs - 3.13.0 → 3.13.2 - Mend

agentv 3.13.0 → 3.13.2

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (20)

package/README.md +5 -5
package/dist/{chunk-6H4IAXQH.js → chunk-4Z5E5CYT.js} +54 -22
package/dist/chunk-4Z5E5CYT.js.map +1 -0
package/dist/{chunk-7OHZAFND.js → chunk-D3LNJUUB.js} +67 -35
package/dist/chunk-D3LNJUUB.js.map +1 -0
package/dist/{chunk-DJU4C6NS.js → chunk-X2343WOK.js} +31 -19
package/dist/chunk-X2343WOK.js.map +1 -0
package/dist/cli.js +3 -3
package/dist/{dist-SMKOBBFB.js → dist-KPMR7RBT.js} +4 -2
package/dist/index.js +3 -3
package/dist/{interactive-RV664PCR.js → interactive-HVKLYGRX.js} +3 -3
package/dist/templates/.agentv/.env.example +23 -0
package/dist/templates/.agentv/config.yaml +13 -4
package/dist/templates/.agentv/targets.yaml +16 -0
package/package.json +1 -1
package/dist/chunk-6H4IAXQH.js.map +0 -1
package/dist/chunk-7OHZAFND.js.map +0 -1
package/dist/chunk-DJU4C6NS.js.map +0 -1
package/dist/{dist-SMKOBBFB.js.map → dist-KPMR7RBT.js.map} +0 -0
package/dist/{interactive-RV664PCR.js.map → interactive-HVKLYGRX.js.map} +0 -0

package/README.md CHANGED Viewed

@@ -221,13 +221,13 @@ agentv eval evals/my-eval.yaml -o results.xml
 The HTML report auto-refreshes every 2 seconds during a live run, then locks once the run completes.
-By default, `agentv eval` creates a run workspace under `.agentv/results/raw/<run>/`
+By default, `agentv eval` creates a run workspace under `.agentv/results/runs/<run>/`
 with `index.jsonl` as the machine-facing manifest.
 You can also convert an existing manifest to HTML after the fact:
 ```bash
-agentv convert .agentv/results/raw/eval_<timestamp>/index.jsonl -o report.html
+agentv convert .agentv/results/runs/eval_<timestamp>/index.jsonl -o report.html
 ```
 #### Timeouts
@@ -358,7 +358,7 @@ agentv create eval my-eval          # → evals/my-eval.eval.yaml + .cases.jsonl
 Compare a combined results file across all targets (N-way matrix):
 ```bash
-agentv compare .agentv/results/raw/eval_<timestamp>/index.jsonl
+agentv compare .agentv/results/runs/eval_<timestamp>/index.jsonl
 ```
 ```
@@ -379,8 +379,8 @@ Pairwise Summary:
 Designate a baseline for CI regression gating, or compare two specific targets:
 ```bash
-agentv compare .agentv/results/raw/eval_<timestamp>/index.jsonl --baseline gpt-4.1
-agentv compare .agentv/results/raw/eval_<timestamp>/index.jsonl --baseline gpt-4.1 --candidate gpt-5-mini
+agentv compare .agentv/results/runs/eval_<timestamp>/index.jsonl --baseline gpt-4.1
+agentv compare .agentv/results/runs/eval_<timestamp>/index.jsonl --baseline gpt-4.1 --candidate gpt-5-mini
 agentv compare before.jsonl after.jsonl                                  # two-file pairwise
 ```

package/dist/{chunk-6H4IAXQH.js → chunk-4Z5E5CYT.js} RENAMED Viewed

@@ -27,12 +27,12 @@ import {
   subscribeToCopilotCliLogEntries,
   subscribeToCopilotSdkLogEntries,
   subscribeToPiLogEntries
-} from "./chunk-7OHZAFND.js";
+} from "./chunk-D3LNJUUB.js";
 // package.json
 var package_default = {
   name: "agentv",
-  version: "3.13.0",
+  version: "3.13.2",
   description: "CLI entry point for AgentV",
   type: "module",
   repository: {
@@ -302,11 +302,12 @@ function toSnakeCaseDeep(obj) {
 import { existsSync, statSync } from "node:fs";
 import path3 from "node:path";
 var RESULT_INDEX_FILENAME = "index.jsonl";
+var RESULT_RUNS_DIRNAME = "runs";
 function createRunDirName(timestamp = /* @__PURE__ */ new Date()) {
   return `eval_${timestamp.toISOString().replace(/[:.]/g, "-")}`;
 }
 function buildDefaultRunDir(cwd) {
-  return path3.join(cwd, ".agentv", "results", "raw", createRunDirName());
+  return path3.join(cwd, ".agentv", "results", RESULT_RUNS_DIRNAME, createRunDirName());
 }
 function resolveRunIndexPath(runDir) {
   return path3.join(runDir, RESULT_INDEX_FILENAME);
@@ -547,7 +548,7 @@ function buildBenchmarkArtifact(results, evalFile = "") {
       tests_run: testIds
     },
     run_summary: runSummary,
-    per_evaluator_summary: perEvaluatorSummary,
+    per_grader_summary: perEvaluatorSummary,
     notes
   };
 }
@@ -1643,13 +1644,15 @@ function escapeXml(str) {
 var JunitWriter = class _JunitWriter {
   filePath;
   results = [];
+  threshold;
   closed = false;
-  constructor(filePath) {
+  constructor(filePath, options) {
     this.filePath = filePath;
+    this.threshold = options?.threshold ?? 0.5;
   }
-  static async open(filePath) {
+  static async open(filePath, options) {
     await mkdir5(path9.dirname(filePath), { recursive: true });
-    return new _JunitWriter(filePath);
+    return new _JunitWriter(filePath, options);
   }
   async append(result) {
     if (this.closed) {
@@ -1674,7 +1677,7 @@ var JunitWriter = class _JunitWriter {
     }
     const suiteXmls = [];
     for (const [suiteName, results] of grouped) {
-      const failures = results.filter((r) => r.score < 0.5).length;
+      const failures = results.filter((r) => r.score < this.threshold).length;
       const errors = results.filter((r) => r.error !== void 0).length;
       const testCases = results.map((r) => {
         const time = r.durationMs ? (r.durationMs / 1e3).toFixed(3) : "0.000";
@@ -1683,7 +1686,7 @@ var JunitWriter = class _JunitWriter {
           inner = `
       <error message="${escapeXml(r.error)}">${escapeXml(r.error)}</error>
     `;
-        } else if (r.score < 0.5) {
+        } else if (r.score < this.threshold) {
           const message = `score=${r.score.toFixed(3)}`;
           const failedAssertions = r.assertions.filter((a) => !a.passed);
           const detail = [
@@ -1703,7 +1706,7 @@ ${testCases.join("\n")}
       );
     }
     const totalTests = this.results.length;
-    const totalFailures = this.results.filter((r) => r.score < 0.5).length;
+    const totalFailures = this.results.filter((r) => r.score < this.threshold).length;
     const totalErrors = this.results.filter((r) => r.error !== void 0).length;
     const xml = `<?xml version="1.0" encoding="UTF-8"?>
 <testsuites tests="${totalTests}" failures="${totalFailures}" errors="${totalErrors}">
@@ -1784,7 +1787,7 @@ async function createOutputWriter(filePath, format) {
   }
 }
 var SUPPORTED_EXTENSIONS = /* @__PURE__ */ new Set([".jsonl", ".json", ".xml", ".yaml", ".yml", ".html", ".htm"]);
-function createWriterFromPath(filePath) {
+function createWriterFromPath(filePath, options) {
   const ext = path11.extname(filePath).toLowerCase();
   switch (ext) {
     case ".jsonl":
@@ -1792,7 +1795,7 @@ function createWriterFromPath(filePath) {
     case ".json":
       return JsonWriter.open(filePath);
     case ".xml":
-      return JunitWriter.open(filePath);
+      return JunitWriter.open(filePath, { threshold: options?.threshold });
     case ".yaml":
     case ".yml":
       return YamlWriter.open(filePath);
@@ -1805,8 +1808,8 @@ function createWriterFromPath(filePath) {
       );
   }
 }
-async function createMultiWriter(filePaths) {
-  const writers = await Promise.all(filePaths.map((fp) => createWriterFromPath(fp)));
+async function createMultiWriter(filePaths, options) {
+  const writers = await Promise.all(filePaths.map((fp) => createWriterFromPath(fp, options)));
   return {
     async append(result) {
       await Promise.all(writers.map((w) => w.append(result)));
@@ -2384,6 +2387,12 @@ function formatMatrixSummary(results) {
   lines.push(`${"Average".padEnd(testIdColWidth)}  ${avgCells.join("  ")}`);
   return lines.join("\n");
 }
+function formatThresholdSummary(meanScore, threshold) {
+  const passed = meanScore >= threshold;
+  const verdict = passed ? "PASS" : "FAIL";
+  const message = `Suite score: ${meanScore.toFixed(2)} (threshold: ${threshold.toFixed(2)}) \u2014 ${verdict}`;
+  return { passed, message };
+}
 // ../../packages/core/dist/evaluation/validation/index.js
 import { readFile as readFile3 } from "node:fs/promises";
@@ -4047,7 +4056,8 @@ function normalizeOptions(rawOptions, config, yamlExecution) {
     artifacts: normalizeString(rawOptions.artifacts),
     graderTarget: normalizeString(rawOptions.graderTarget),
     model: normalizeString(rawOptions.model),
-    outputMessages: normalizeOutputMessages(normalizeString(rawOptions.outputMessages))
+    outputMessages: normalizeOutputMessages(normalizeString(rawOptions.outputMessages)),
+    threshold: normalizeOptionalNumber(rawOptions.threshold)
   };
 }
 async function ensureFileExists(filePath, description) {
@@ -4193,7 +4203,8 @@ async function prepareFileMetadata(params) {
     yamlCache: suite.cacheConfig?.enabled,
     yamlCachePath: suite.cacheConfig?.cachePath,
     totalBudgetUsd: suite.totalBudgetUsd,
-    failOnError: suite.failOnError
+    failOnError: suite.failOnError,
+    threshold: suite.threshold
   };
 }
 async function runWithLimit(items, limit, task) {
@@ -4349,6 +4360,9 @@ async function runSingleEvalFile(params) {
 }
 async function runEvalCommand(input) {
   const cwd = process.cwd();
+  if (!process.env.AGENTV_RUN_TIMESTAMP) {
+    process.env.AGENTV_RUN_TIMESTAMP = (/* @__PURE__ */ new Date()).toISOString().replace(/:/g, "-").replace(/\./g, "-");
+  }
   let config = null;
   try {
     config = await loadTsConfig(cwd);
@@ -4407,7 +4421,7 @@ async function runEvalCommand(input) {
   const useFileExport = !!options.otelFile;
   if (options.exportOtel || useFileExport) {
     try {
-      const { OtelTraceExporter, OTEL_BACKEND_PRESETS } = await import("./dist-SMKOBBFB.js");
+      const { OtelTraceExporter, OTEL_BACKEND_PRESETS } = await import("./dist-KPMR7RBT.js");
       let endpoint = process.env.OTEL_EXPORTER_OTLP_ENDPOINT;
       let headers = {};
       if (options.otelBackend) {
@@ -4453,12 +4467,9 @@ async function runEvalCommand(input) {
   const uniqueOutputPaths = [...new Set(allOutputPaths)];
   const reportedOutputPaths = extraOutputPaths.length > 0 ? [outputPath, ...extraOutputPaths] : [outputPath];
   const uniqueReportedOutputPaths = [...new Set(reportedOutputPaths)];
-  let outputWriter;
   if (uniqueOutputPaths.length === 1) {
-    outputWriter = await createOutputWriter(primaryWritePath, options.format);
     console.log(`Output path: ${outputPath}`);
   } else {
-    outputWriter = await createMultiWriter(uniqueOutputPaths);
     console.log("Output paths:");
     for (const p of uniqueReportedOutputPaths) {
       console.log(`  ${p}`);
@@ -4517,6 +4528,18 @@ async function runEvalCommand(input) {
   if (cacheEnabled) {
     console.log(`Response cache: enabled${yamlCachePath ? ` (${yamlCachePath})` : ""}`);
   }
+  const yamlThreshold = firstMeta?.threshold;
+  const resolvedThreshold = options.threshold ?? yamlThreshold;
+  if (resolvedThreshold !== void 0 && (resolvedThreshold < 0 || resolvedThreshold > 1)) {
+    throw new Error("--threshold must be between 0 and 1");
+  }
+  const writerOptions = resolvedThreshold !== void 0 ? { threshold: resolvedThreshold } : void 0;
+  let outputWriter;
+  if (uniqueOutputPaths.length === 1) {
+    outputWriter = await createOutputWriter(primaryWritePath, options.format);
+  } else {
+    outputWriter = await createMultiWriter(uniqueOutputPaths, writerOptions);
+  }
   const isMatrixMode = Array.from(fileMetadata.values()).some((meta) => meta.selections.length > 1);
   let totalEvalCount = 0;
   for (const meta of fileMetadata.values()) {
@@ -4640,6 +4663,13 @@ async function runEvalCommand(input) {
     }
     const summary = calculateEvaluationSummary(allResults);
     console.log(formatEvaluationSummary(summary));
+    let thresholdFailed = false;
+    if (resolvedThreshold !== void 0) {
+      const thresholdResult = formatThresholdSummary(summary.mean, resolvedThreshold);
+      console.log(`
+${thresholdResult.message}`);
+      thresholdFailed = !thresholdResult.passed;
+    }
     if (isMatrixMode && allResults.length > 0) {
       console.log(formatMatrixSummary(allResults));
     }
@@ -4721,7 +4751,8 @@ Tip: ${summary.executionErrorCount} execution error(s) detected. Re-run failed t
       executionErrorCount: summary.executionErrorCount,
       outputPath,
       testFiles: resolvedTestFiles,
-      target: options.target
+      target: options.target,
+      thresholdFailed
     };
   } finally {
     unsubscribeCodexLogs();
@@ -4758,6 +4789,7 @@ export {
   package_default,
   toSnakeCaseDeep,
   RESULT_INDEX_FILENAME,
+  RESULT_RUNS_DIRNAME,
   resolveExistingRunPrimaryPath,
   resolveWorkspaceOrFilePath,
   writeArtifactsFromResults,
@@ -4779,4 +4811,4 @@ export {
   selectTarget,
   runEvalCommand
 };
-//# sourceMappingURL=chunk-6H4IAXQH.js.map
+//# sourceMappingURL=chunk-4Z5E5CYT.js.map