agentv 4.4.1 → 4.5.1

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -29,12 +29,12 @@ import {
29
29
  subscribeToCopilotCliLogEntries,
30
30
  subscribeToCopilotSdkLogEntries,
31
31
  subscribeToPiLogEntries
32
- } from "./chunk-63NDZ6UC.js";
32
+ } from "./chunk-7DRAXDVC.js";
33
33
 
34
34
  // package.json
35
35
  var package_default = {
36
36
  name: "agentv",
37
- version: "4.4.1",
37
+ version: "4.5.1",
38
38
  description: "CLI entry point for AgentV",
39
39
  type: "module",
40
40
  repository: {
@@ -2209,7 +2209,7 @@ function buildHistogram(values) {
2209
2209
  }
2210
2210
  return bins;
2211
2211
  }
2212
- function calculateEvaluationSummary(results) {
2212
+ function calculateEvaluationSummary(results, options) {
2213
2213
  const total = results.length;
2214
2214
  const errors = results.filter((result) => result.error !== void 0).map((result) => ({ testId: result.testId, error: result.error }));
2215
2215
  const errorCount = errors.length;
@@ -2246,8 +2246,9 @@ function calculateEvaluationSummary(results) {
2246
2246
  const topResults = sortedResults.slice(0, Math.min(3, sortedResults.length));
2247
2247
  const bottomResults = sortedResults.slice(-Math.min(3, sortedResults.length));
2248
2248
  const executionErrorCount = executionErrors.length;
2249
- const qualityFailureCount = results.filter((r) => r.executionStatus === "quality_failure").length;
2250
- const passedCount = results.filter((r) => r.executionStatus === "ok").length;
2249
+ const scoreThreshold = options?.threshold;
2250
+ const passedCount = scoreThreshold !== void 0 ? qualityResults.filter((r) => r.score >= scoreThreshold).length : results.filter((r) => r.executionStatus === "ok").length;
2251
+ const qualityFailureCount = scoreThreshold !== void 0 ? qualityResults.filter((r) => r.score < scoreThreshold).length : results.filter((r) => r.executionStatus === "quality_failure").length;
2251
2252
  const byFailureStage = {};
2252
2253
  const byFailureReason = {};
2253
2254
  for (const result of executionErrors) {
@@ -2280,7 +2281,7 @@ function calculateEvaluationSummary(results) {
2280
2281
  function formatScore(value) {
2281
2282
  return value.toFixed(3);
2282
2283
  }
2283
- function formatEvaluationSummary(summary) {
2284
+ function formatEvaluationSummary(summary, options) {
2284
2285
  if (summary.total === 0) {
2285
2286
  return "\nNo results to summarize";
2286
2287
  }
@@ -2296,11 +2297,13 @@ function formatEvaluationSummary(summary) {
2296
2297
  }
2297
2298
  lines.push("");
2298
2299
  }
2299
- const overallPassed = summary.passedCount === summary.total - summary.executionErrorCount || summary.qualityFailureCount === 0 && summary.executionErrorCount === 0;
2300
+ const gradedCount = summary.total - summary.executionErrorCount;
2301
+ const threshold = options?.threshold ?? 0.8;
2302
+ const overallPassed = summary.passedCount === gradedCount || summary.qualityFailureCount === 0 && summary.executionErrorCount === 0;
2300
2303
  const overallVerdict = overallPassed ? "PASS" : "FAIL";
2301
2304
  const useColor = !(process.env.NO_COLOR !== void 0) && (process.stdout.isTTY ?? false);
2302
2305
  const verdictColor = overallPassed ? "\x1B[32m" : "\x1B[31m";
2303
- const verdictText = `RESULT: ${overallVerdict} (${summary.passedCount}/${summary.total} passed, mean score: ${formatScore(summary.mean)})`;
2306
+ const verdictText = `RESULT: ${overallVerdict} (${summary.passedCount}/${gradedCount} scored >= ${threshold}, mean: ${formatScore(summary.mean)})`;
2304
2307
  lines.push("\n==================================================");
2305
2308
  if (useColor) {
2306
2309
  lines.push(`\x1B[1m${verdictColor}${verdictText}\x1B[0m`);
@@ -2406,12 +2409,6 @@ function formatMatrixSummary(results) {
2406
2409
  lines.push(`${"Average".padEnd(testIdColWidth)} ${avgCells.join(" ")}`);
2407
2410
  return lines.join("\n");
2408
2411
  }
2409
- function formatThresholdSummary(meanScore, threshold) {
2410
- const passed = meanScore >= threshold;
2411
- const verdict = passed ? "PASS" : "FAIL";
2412
- const message = `Suite score: ${meanScore.toFixed(2)} (threshold: ${threshold.toFixed(2)}) \u2014 ${verdict}`;
2413
- return { passed, message };
2414
- }
2415
2412
 
2416
2413
  // ../../packages/core/dist/evaluation/validation/index.js
2417
2414
  import { readFile as readFile3 } from "node:fs/promises";
@@ -4371,6 +4368,7 @@ async function runSingleEvalFile(params) {
4371
4368
  failOnError,
4372
4369
  graderTarget: options.graderTarget,
4373
4370
  model: options.model,
4371
+ threshold: options.threshold,
4374
4372
  streamCallbacks: streamingObserver?.getStreamCallbacks(),
4375
4373
  onResult: async (result) => {
4376
4374
  streamingObserver?.completeFromResult?.(result);
@@ -4486,7 +4484,7 @@ async function runEvalCommand(input) {
4486
4484
  const useFileExport = !!options.otelFile;
4487
4485
  if (options.exportOtel || useFileExport) {
4488
4486
  try {
4489
- const { OtelTraceExporter, OTEL_BACKEND_PRESETS } = await import("./dist-P74O2P2I.js");
4487
+ const { OtelTraceExporter, OTEL_BACKEND_PRESETS } = await import("./dist-VWMHFUXR.js");
4490
4488
  let endpoint = process.env.OTEL_EXPORTER_OTLP_ENDPOINT;
4491
4489
  let headers = {};
4492
4490
  if (options.otelBackend) {
@@ -4707,7 +4705,8 @@ async function runEvalCommand(input) {
4707
4705
  trialsConfig: targetPrep.trialsConfig,
4708
4706
  matrixMode: targetPrep.selections.length > 1,
4709
4707
  totalBudgetUsd: targetPrep.totalBudgetUsd,
4710
- failOnError: targetPrep.failOnError
4708
+ failOnError: targetPrep.failOnError,
4709
+ threshold: resolvedThreshold
4711
4710
  });
4712
4711
  return result.results;
4713
4712
  })
@@ -4726,15 +4725,10 @@ async function runEvalCommand(input) {
4726
4725
  `Merged ${retryNonErrorResults.length} non-error result(s) from previous output.`
4727
4726
  );
4728
4727
  }
4729
- const summary = calculateEvaluationSummary(allResults);
4730
- console.log(formatEvaluationSummary(summary));
4731
- let thresholdFailed = false;
4732
- if (resolvedThreshold !== void 0) {
4733
- const thresholdResult = formatThresholdSummary(summary.mean, resolvedThreshold);
4734
- console.log(`
4735
- ${thresholdResult.message}`);
4736
- thresholdFailed = !thresholdResult.passed;
4737
- }
4728
+ const thresholdOpts = resolvedThreshold !== void 0 ? { threshold: resolvedThreshold } : void 0;
4729
+ const summary = calculateEvaluationSummary(allResults, thresholdOpts);
4730
+ console.log(formatEvaluationSummary(summary, thresholdOpts));
4731
+ const thresholdFailed = resolvedThreshold !== void 0 && summary.qualityFailureCount > 0;
4738
4732
  if (isMatrixMode && allResults.length > 0) {
4739
4733
  console.log(formatMatrixSummary(allResults));
4740
4734
  }
@@ -4864,10 +4858,10 @@ export {
4864
4858
  loadManifestResults,
4865
4859
  loadLightweightResults,
4866
4860
  HtmlWriter,
4867
- resolveEvalPaths,
4868
- findRepoRoot,
4869
4861
  resolveRunCacheFile,
4870
4862
  loadRunCache,
4863
+ resolveEvalPaths,
4864
+ findRepoRoot,
4871
4865
  detectFileType,
4872
4866
  validateEvalFile,
4873
4867
  validateTargetsFile,
@@ -4878,4 +4872,4 @@ export {
4878
4872
  selectTarget,
4879
4873
  runEvalCommand
4880
4874
  };
4881
- //# sourceMappingURL=chunk-VYZQMN57.js.map
4875
+ //# sourceMappingURL=chunk-5DEZ72J3.js.map