npm - agentv - Versions diffs - 4.4.1 → 4.5.1 - Mend

agentv 4.4.1 → 4.5.1

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (16)

package/dist/{chunk-VYZQMN57.js → chunk-5DEZ72J3.js} +22 -28
package/dist/chunk-5DEZ72J3.js.map +1 -0
package/dist/{chunk-63NDZ6UC.js → chunk-7DRAXDVC.js} +416 -110
package/dist/chunk-7DRAXDVC.js.map +1 -0
package/dist/{chunk-4WMLJHW5.js → chunk-BQC2CDLN.js} +384 -506
package/dist/chunk-BQC2CDLN.js.map +1 -0
package/dist/cli.js +3 -3
package/dist/{dist-P74O2P2I.js → dist-VWMHFUXR.js} +8 -2
package/dist/index.js +3 -3
package/dist/{interactive-VJP2AEPT.js → interactive-OG7ZJIHG.js} +3 -3
package/package.json +1 -1
package/dist/chunk-4WMLJHW5.js.map +0 -1
package/dist/chunk-63NDZ6UC.js.map +0 -1
package/dist/chunk-VYZQMN57.js.map +0 -1
package/dist/{dist-P74O2P2I.js.map → dist-VWMHFUXR.js.map} +0 -0
package/dist/{interactive-VJP2AEPT.js.map → interactive-OG7ZJIHG.js.map} +0 -0

package/dist/{chunk-VYZQMN57.js → chunk-5DEZ72J3.js} RENAMED

@@ -29,12 +29,12 @@ import {
   subscribeToCopilotCliLogEntries,
   subscribeToCopilotSdkLogEntries,
   subscribeToPiLogEntries
-} from "./chunk-63NDZ6UC.js";
+} from "./chunk-7DRAXDVC.js";
 // package.json
 var package_default = {
   name: "agentv",
-  version: "4.4.1",
+  version: "4.5.1",
   description: "CLI entry point for AgentV",
   type: "module",
   repository: {
@@ -2209,7 +2209,7 @@ function buildHistogram(values) {
   }
   return bins;
 }
-function calculateEvaluationSummary(results) {
+function calculateEvaluationSummary(results, options) {
   const total = results.length;
   const errors = results.filter((result) => result.error !== void 0).map((result) => ({ testId: result.testId, error: result.error }));
   const errorCount = errors.length;
@@ -2246,8 +2246,9 @@ function calculateEvaluationSummary(results) {
   const topResults = sortedResults.slice(0, Math.min(3, sortedResults.length));
   const bottomResults = sortedResults.slice(-Math.min(3, sortedResults.length));
   const executionErrorCount = executionErrors.length;
-  const qualityFailureCount = results.filter((r) => r.executionStatus === "quality_failure").length;
-  const passedCount = results.filter((r) => r.executionStatus === "ok").length;
+  const scoreThreshold = options?.threshold;
+  const passedCount = scoreThreshold !== void 0 ? qualityResults.filter((r) => r.score >= scoreThreshold).length : results.filter((r) => r.executionStatus === "ok").length;
+  const qualityFailureCount = scoreThreshold !== void 0 ? qualityResults.filter((r) => r.score < scoreThreshold).length : results.filter((r) => r.executionStatus === "quality_failure").length;
   const byFailureStage = {};
   const byFailureReason = {};
   for (const result of executionErrors) {
@@ -2280,7 +2281,7 @@ function calculateEvaluationSummary(results) {
 function formatScore(value) {
   return value.toFixed(3);
 }
-function formatEvaluationSummary(summary) {
+function formatEvaluationSummary(summary, options) {
   if (summary.total === 0) {
     return "\nNo results to summarize";
   }
@@ -2296,11 +2297,13 @@ function formatEvaluationSummary(summary) {
     }
     lines.push("");
   }
-  const overallPassed = summary.passedCount === summary.total - summary.executionErrorCount || summary.qualityFailureCount === 0 && summary.executionErrorCount === 0;
+  const gradedCount = summary.total - summary.executionErrorCount;
+  const threshold = options?.threshold ?? 0.8;
+  const overallPassed = summary.passedCount === gradedCount || summary.qualityFailureCount === 0 && summary.executionErrorCount === 0;
   const overallVerdict = overallPassed ? "PASS" : "FAIL";
   const useColor = !(process.env.NO_COLOR !== void 0) && (process.stdout.isTTY ?? false);
   const verdictColor = overallPassed ? "\x1B[32m" : "\x1B[31m";
-  const verdictText = `RESULT: ${overallVerdict}  (${summary.passedCount}/${summary.total} passed, mean score: ${formatScore(summary.mean)})`;
+  const verdictText = `RESULT: ${overallVerdict}  (${summary.passedCount}/${gradedCount} scored >= ${threshold}, mean: ${formatScore(summary.mean)})`;
   lines.push("\n==================================================");
   if (useColor) {
     lines.push(`\x1B[1m${verdictColor}${verdictText}\x1B[0m`);
@@ -2406,12 +2409,6 @@ function formatMatrixSummary(results) {
   lines.push(`${"Average".padEnd(testIdColWidth)}  ${avgCells.join("  ")}`);
   return lines.join("\n");
 }
-function formatThresholdSummary(meanScore, threshold) {
-  const passed = meanScore >= threshold;
-  const verdict = passed ? "PASS" : "FAIL";
-  const message = `Suite score: ${meanScore.toFixed(2)} (threshold: ${threshold.toFixed(2)}) \u2014 ${verdict}`;
-  return { passed, message };
-}
 // ../../packages/core/dist/evaluation/validation/index.js
 import { readFile as readFile3 } from "node:fs/promises";
@@ -4371,6 +4368,7 @@ async function runSingleEvalFile(params) {
     failOnError,
     graderTarget: options.graderTarget,
     model: options.model,
+    threshold: options.threshold,
     streamCallbacks: streamingObserver?.getStreamCallbacks(),
     onResult: async (result) => {
       streamingObserver?.completeFromResult?.(result);
@@ -4486,7 +4484,7 @@ async function runEvalCommand(input) {
   const useFileExport = !!options.otelFile;
   if (options.exportOtel || useFileExport) {
     try {
-      const { OtelTraceExporter, OTEL_BACKEND_PRESETS } = await import("./dist-P74O2P2I.js");
+      const { OtelTraceExporter, OTEL_BACKEND_PRESETS } = await import("./dist-VWMHFUXR.js");
       let endpoint = process.env.OTEL_EXPORTER_OTLP_ENDPOINT;
       let headers = {};
       if (options.otelBackend) {
@@ -4707,7 +4705,8 @@ async function runEvalCommand(input) {
             trialsConfig: targetPrep.trialsConfig,
             matrixMode: targetPrep.selections.length > 1,
             totalBudgetUsd: targetPrep.totalBudgetUsd,
-            failOnError: targetPrep.failOnError
+            failOnError: targetPrep.failOnError,
+            threshold: resolvedThreshold
           });
           return result.results;
         })
@@ -4726,15 +4725,10 @@ async function runEvalCommand(input) {
         `Merged ${retryNonErrorResults.length} non-error result(s) from previous output.`
       );
     }
-    const summary = calculateEvaluationSummary(allResults);
-    console.log(formatEvaluationSummary(summary));
-    let thresholdFailed = false;
-    if (resolvedThreshold !== void 0) {
-      const thresholdResult = formatThresholdSummary(summary.mean, resolvedThreshold);
-      console.log(`
-${thresholdResult.message}`);
-      thresholdFailed = !thresholdResult.passed;
-    }
+    const thresholdOpts = resolvedThreshold !== void 0 ? { threshold: resolvedThreshold } : void 0;
+    const summary = calculateEvaluationSummary(allResults, thresholdOpts);
+    console.log(formatEvaluationSummary(summary, thresholdOpts));
+    const thresholdFailed = resolvedThreshold !== void 0 && summary.qualityFailureCount > 0;
     if (isMatrixMode && allResults.length > 0) {
       console.log(formatMatrixSummary(allResults));
     }
@@ -4864,10 +4858,10 @@ export {
   loadManifestResults,
   loadLightweightResults,
   HtmlWriter,
-  resolveEvalPaths,
-  findRepoRoot,
   resolveRunCacheFile,
   loadRunCache,
+  resolveEvalPaths,
+  findRepoRoot,
   detectFileType,
   validateEvalFile,
   validateTargetsFile,
@@ -4878,4 +4872,4 @@ export {
   selectTarget,
   runEvalCommand
 };
-//# sourceMappingURL=chunk-VYZQMN57.js.map
+//# sourceMappingURL=chunk-5DEZ72J3.js.map