agentv 2.11.2 → 2.12.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -25,7 +25,7 @@ import {
25
25
  subscribeToCopilotCliLogEntries,
26
26
  subscribeToCopilotSdkLogEntries,
27
27
  subscribeToPiLogEntries
28
- } from "./chunk-MQIQH5LB.js";
28
+ } from "./chunk-LUHCYBMD.js";
29
29
 
30
30
  // src/commands/eval/shared.ts
31
31
  import { constants } from "node:fs";
@@ -872,7 +872,6 @@ function buildHistogram(values) {
872
872
  return bins;
873
873
  }
874
874
  function calculateEvaluationSummary(results) {
875
- const scores = results.map((result) => result.score);
876
875
  const total = results.length;
877
876
  const errors = results.filter((result) => result.error !== void 0).map((result) => ({ testId: result.testId, error: result.error }));
878
877
  const errorCount = errors.length;
@@ -888,18 +887,39 @@ function calculateEvaluationSummary(results) {
888
887
  topResults: [],
889
888
  bottomResults: [],
890
889
  errorCount: 0,
891
- errors: []
890
+ errors: [],
891
+ executionErrorCount: 0,
892
+ qualityFailureCount: 0,
893
+ passedCount: 0,
894
+ byFailureStage: {},
895
+ byFailureReason: {}
892
896
  };
893
897
  }
894
- const mean = computeMean(scores);
895
- const median = computeMedian(scores);
896
- const min = Math.min(...scores);
897
- const max = Math.max(...scores);
898
- const standardDeviation = computeStandardDeviation(scores);
899
- const histogram = buildHistogram(scores);
900
- const sortedResults = [...results].sort((a, b) => b.score - a.score);
898
+ const executionErrors = results.filter((r) => r.executionStatus === "execution_error");
899
+ const qualityResults = results.filter((r) => r.executionStatus !== "execution_error");
900
+ const qualityScores = qualityResults.map((r) => r.score);
901
+ const mean = computeMean(qualityScores);
902
+ const median = computeMedian(qualityScores);
903
+ const min = qualityScores.length > 0 ? Math.min(...qualityScores) : 0;
904
+ const max = qualityScores.length > 0 ? Math.max(...qualityScores) : 0;
905
+ const standardDeviation = computeStandardDeviation(qualityScores);
906
+ const histogram = buildHistogram(qualityScores);
907
+ const sortedResults = [...qualityResults].sort((a, b) => b.score - a.score);
901
908
  const topResults = sortedResults.slice(0, Math.min(3, sortedResults.length));
902
909
  const bottomResults = sortedResults.slice(-Math.min(3, sortedResults.length));
910
+ const executionErrorCount = executionErrors.length;
911
+ const qualityFailureCount = results.filter((r) => r.executionStatus === "quality_failure").length;
912
+ const passedCount = results.filter((r) => r.executionStatus === "ok").length;
913
+ const byFailureStage = {};
914
+ const byFailureReason = {};
915
+ for (const result of executionErrors) {
916
+ if (result.failureStage) {
917
+ byFailureStage[result.failureStage] = (byFailureStage[result.failureStage] ?? 0) + 1;
918
+ }
919
+ if (result.failureReasonCode) {
920
+ byFailureReason[result.failureReasonCode] = (byFailureReason[result.failureReasonCode] ?? 0) + 1;
921
+ }
922
+ }
903
923
  return {
904
924
  total,
905
925
  mean,
@@ -911,7 +931,12 @@ function calculateEvaluationSummary(results) {
911
931
  topResults,
912
932
  bottomResults,
913
933
  errorCount,
914
- errors
934
+ errors,
935
+ executionErrorCount,
936
+ qualityFailureCount,
937
+ passedCount,
938
+ byFailureStage,
939
+ byFailureReason
915
940
  };
916
941
  }
917
942
  function formatScore(value) {
@@ -924,7 +949,7 @@ function formatEvaluationSummary(summary) {
924
949
  const lines = [];
925
950
  if (summary.errorCount > 0) {
926
951
  lines.push("\n==================================================");
927
- lines.push("ERRORS");
952
+ lines.push("EXECUTION ERRORS");
928
953
  lines.push("==================================================");
929
954
  for (const error of summary.errors) {
930
955
  lines.push(`
@@ -937,11 +962,21 @@ function formatEvaluationSummary(summary) {
937
962
  lines.push("EVALUATION SUMMARY");
938
963
  lines.push("==================================================");
939
964
  lines.push(`Total tests: ${summary.total}`);
940
- if (summary.errorCount > 0) {
941
- lines.push(`Failed: ${summary.errorCount}`);
942
- lines.push(`Passed: ${summary.total - summary.errorCount}`);
965
+ lines.push(`Passed: ${summary.passedCount}`);
966
+ if (summary.qualityFailureCount > 0) {
967
+ lines.push(`Quality failures: ${summary.qualityFailureCount}`);
968
+ }
969
+ if (summary.executionErrorCount > 0) {
970
+ lines.push(`Execution errors: ${summary.executionErrorCount}`);
971
+ }
972
+ if (summary.executionErrorCount > 0) {
973
+ const qualityCount = summary.total - summary.executionErrorCount;
974
+ lines.push(
975
+ `Mean score: ${formatScore(summary.mean)} (${qualityCount} quality tests, ${summary.executionErrorCount} execution errors excluded)`
976
+ );
977
+ } else {
978
+ lines.push(`Mean score: ${formatScore(summary.mean)}`);
943
979
  }
944
- lines.push(`Mean score: ${formatScore(summary.mean)}`);
945
980
  lines.push(`Median score: ${formatScore(summary.median)}`);
946
981
  lines.push(`Min score: ${formatScore(summary.min)}`);
947
982
  lines.push(`Max score: ${formatScore(summary.max)}`);
@@ -961,6 +996,20 @@ function formatEvaluationSummary(summary) {
961
996
  summary.bottomResults.forEach((result, index) => {
962
997
  lines.push(` ${index + 1}. ${result.testId}: ${formatScore(result.score)}`);
963
998
  });
999
+ const failureStageEntries = Object.entries(summary.byFailureStage);
1000
+ if (failureStageEntries.length > 0) {
1001
+ lines.push("\nExecution errors by stage:");
1002
+ for (const [stage, count] of failureStageEntries) {
1003
+ lines.push(` ${stage}: ${count}`);
1004
+ }
1005
+ }
1006
+ const failureReasonEntries = Object.entries(summary.byFailureReason);
1007
+ if (failureReasonEntries.length > 0) {
1008
+ lines.push("\nExecution errors by reason:");
1009
+ for (const [reason, count] of failureReasonEntries) {
1010
+ lines.push(` ${reason}: ${count}`);
1011
+ }
1012
+ }
964
1013
  return lines.join("\n");
965
1014
  }
966
1015
  function formatMatrixSummary(results) {
@@ -2484,12 +2533,13 @@ function normalizeOptions(rawOptions, config, yamlExecution) {
2484
2533
  noCache: resolvedNoCache,
2485
2534
  // Boolean OR: config `true` cannot be overridden to `false` from CLI.
2486
2535
  // Intentional — there are no --no-verbose / --no-keep-workspaces flags.
2487
- verbose: normalizeBoolean(rawOptions.verbose) || yamlExecution?.verbose === true,
2488
- keepWorkspaces: normalizeBoolean(rawOptions.keepWorkspaces) || yamlExecution?.keep_workspaces === true,
2536
+ // Sources are OR-ed (CLI flag, YAML config, TS config): no precedence — `true` from any one of them enables the option.
2537
+ verbose: normalizeBoolean(rawOptions.verbose) || yamlExecution?.verbose === true || config?.execution?.verbose === true,
2538
+ keepWorkspaces: normalizeBoolean(rawOptions.keepWorkspaces) || yamlExecution?.keep_workspaces === true || config?.execution?.keepWorkspaces === true,
2489
2539
  cleanupWorkspaces: normalizeBoolean(rawOptions.cleanupWorkspaces),
2490
- trace: normalizeBoolean(rawOptions.trace),
2491
- otelFile: normalizeString(rawOptions.otelFile) ?? (yamlExecution?.otel_file ? resolveTimestampPlaceholder(yamlExecution.otel_file) : void 0),
2492
- traceFile: normalizeString(rawOptions.traceFile) ?? (yamlExecution?.trace_file ? resolveTimestampPlaceholder(yamlExecution.trace_file) : void 0),
2540
+ // Precedence: CLI > YAML config > TS config
2541
+ otelFile: normalizeString(rawOptions.otelFile) ?? (yamlExecution?.otel_file ? resolveTimestampPlaceholder(yamlExecution.otel_file) : void 0) ?? (config?.execution?.otelFile ? resolveTimestampPlaceholder(config.execution.otelFile) : void 0),
2542
+ traceFile: normalizeString(rawOptions.traceFile) ?? (yamlExecution?.trace_file ? resolveTimestampPlaceholder(yamlExecution.trace_file) : void 0) ?? (config?.execution?.traceFile ? resolveTimestampPlaceholder(config.execution.traceFile) : void 0),
2493
2543
  exportOtel: normalizeBoolean(rawOptions.exportOtel),
2494
2544
  otelBackend: normalizeString(rawOptions.otelBackend),
2495
2545
  otelCaptureContent: normalizeBoolean(rawOptions.otelCaptureContent),
@@ -2789,7 +2839,7 @@ async function runEvalCommand(input) {
2789
2839
  const useFileExport = !!(options.otelFile || options.traceFile);
2790
2840
  if (options.exportOtel || useFileExport) {
2791
2841
  try {
2792
- const { OtelTraceExporter, OTEL_BACKEND_PRESETS } = await import("./dist-OVEHXEXC.js");
2842
+ const { OtelTraceExporter, OTEL_BACKEND_PRESETS } = await import("./dist-OPPA4P5R.js");
2793
2843
  let endpoint = process.env.OTEL_EXPORTER_OTLP_ENDPOINT;
2794
2844
  let headers = {};
2795
2845
  if (options.otelBackend) {
@@ -3060,4 +3110,4 @@ export {
3060
3110
  selectTarget,
3061
3111
  runEvalCommand
3062
3112
  };
3063
- //# sourceMappingURL=chunk-IL7CRMY6.js.map
3113
+ //# sourceMappingURL=chunk-YBJX5CP6.js.map