agentv 2.11.2 → 2.12.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/dist/{chunk-SNABHVUB.js → chunk-6KU2ZUFJ.js} +5 -5
- package/dist/{chunk-SNABHVUB.js.map → chunk-6KU2ZUFJ.js.map} +1 -1
- package/dist/{chunk-MQIQH5LB.js → chunk-LUHCYBMD.js} +139 -33
- package/dist/chunk-LUHCYBMD.js.map +1 -0
- package/dist/{chunk-IL7CRMY6.js → chunk-YBJX5CP6.js} +73 -23
- package/dist/chunk-YBJX5CP6.js.map +1 -0
- package/dist/cli.js +3 -3
- package/dist/{dist-OVEHXEXC.js → dist-OPPA4P5R.js} +2 -2
- package/dist/index.js +3 -3
- package/dist/{interactive-7NQRG7GK.js → interactive-TOUKPSHP.js} +3 -3
- package/package.json +1 -1
- package/dist/chunk-IL7CRMY6.js.map +0 -1
- package/dist/chunk-MQIQH5LB.js.map +0 -1
- /package/dist/{dist-OVEHXEXC.js.map → dist-OPPA4P5R.js.map} +0 -0
- /package/dist/{interactive-7NQRG7GK.js.map → interactive-TOUKPSHP.js.map} +0 -0
|
@@ -25,7 +25,7 @@ import {
|
|
|
25
25
|
subscribeToCopilotCliLogEntries,
|
|
26
26
|
subscribeToCopilotSdkLogEntries,
|
|
27
27
|
subscribeToPiLogEntries
|
|
28
|
-
} from "./chunk-MQIQH5LB.js";
|
|
28
|
+
} from "./chunk-LUHCYBMD.js";
|
|
29
29
|
|
|
30
30
|
// src/commands/eval/shared.ts
|
|
31
31
|
import { constants } from "node:fs";
|
|
@@ -872,7 +872,6 @@ function buildHistogram(values) {
|
|
|
872
872
|
return bins;
|
|
873
873
|
}
|
|
874
874
|
function calculateEvaluationSummary(results) {
|
|
875
|
-
const scores = results.map((result) => result.score);
|
|
876
875
|
const total = results.length;
|
|
877
876
|
const errors = results.filter((result) => result.error !== void 0).map((result) => ({ testId: result.testId, error: result.error }));
|
|
878
877
|
const errorCount = errors.length;
|
|
@@ -888,18 +887,39 @@ function calculateEvaluationSummary(results) {
|
|
|
888
887
|
topResults: [],
|
|
889
888
|
bottomResults: [],
|
|
890
889
|
errorCount: 0,
|
|
891
|
-
errors: []
|
|
890
|
+
errors: [],
|
|
891
|
+
executionErrorCount: 0,
|
|
892
|
+
qualityFailureCount: 0,
|
|
893
|
+
passedCount: 0,
|
|
894
|
+
byFailureStage: {},
|
|
895
|
+
byFailureReason: {}
|
|
892
896
|
};
|
|
893
897
|
}
|
|
894
|
-
const
|
|
895
|
-
const
|
|
896
|
-
const
|
|
897
|
-
const
|
|
898
|
-
const
|
|
899
|
-
const
|
|
900
|
-
const
|
|
898
|
+
const executionErrors = results.filter((r) => r.executionStatus === "execution_error");
|
|
899
|
+
const qualityResults = results.filter((r) => r.executionStatus !== "execution_error");
|
|
900
|
+
const qualityScores = qualityResults.map((r) => r.score);
|
|
901
|
+
const mean = computeMean(qualityScores);
|
|
902
|
+
const median = computeMedian(qualityScores);
|
|
903
|
+
const min = qualityScores.length > 0 ? Math.min(...qualityScores) : 0;
|
|
904
|
+
const max = qualityScores.length > 0 ? Math.max(...qualityScores) : 0;
|
|
905
|
+
const standardDeviation = computeStandardDeviation(qualityScores);
|
|
906
|
+
const histogram = buildHistogram(qualityScores);
|
|
907
|
+
const sortedResults = [...qualityResults].sort((a, b) => b.score - a.score);
|
|
901
908
|
const topResults = sortedResults.slice(0, Math.min(3, sortedResults.length));
|
|
902
909
|
const bottomResults = sortedResults.slice(-Math.min(3, sortedResults.length));
|
|
910
|
+
const executionErrorCount = executionErrors.length;
|
|
911
|
+
const qualityFailureCount = results.filter((r) => r.executionStatus === "quality_failure").length;
|
|
912
|
+
const passedCount = results.filter((r) => r.executionStatus === "ok").length;
|
|
913
|
+
const byFailureStage = {};
|
|
914
|
+
const byFailureReason = {};
|
|
915
|
+
for (const result of executionErrors) {
|
|
916
|
+
if (result.failureStage) {
|
|
917
|
+
byFailureStage[result.failureStage] = (byFailureStage[result.failureStage] ?? 0) + 1;
|
|
918
|
+
}
|
|
919
|
+
if (result.failureReasonCode) {
|
|
920
|
+
byFailureReason[result.failureReasonCode] = (byFailureReason[result.failureReasonCode] ?? 0) + 1;
|
|
921
|
+
}
|
|
922
|
+
}
|
|
903
923
|
return {
|
|
904
924
|
total,
|
|
905
925
|
mean,
|
|
@@ -911,7 +931,12 @@ function calculateEvaluationSummary(results) {
|
|
|
911
931
|
topResults,
|
|
912
932
|
bottomResults,
|
|
913
933
|
errorCount,
|
|
914
|
-
errors
|
|
934
|
+
errors,
|
|
935
|
+
executionErrorCount,
|
|
936
|
+
qualityFailureCount,
|
|
937
|
+
passedCount,
|
|
938
|
+
byFailureStage,
|
|
939
|
+
byFailureReason
|
|
915
940
|
};
|
|
916
941
|
}
|
|
917
942
|
function formatScore(value) {
|
|
@@ -924,7 +949,7 @@ function formatEvaluationSummary(summary) {
|
|
|
924
949
|
const lines = [];
|
|
925
950
|
if (summary.errorCount > 0) {
|
|
926
951
|
lines.push("\n==================================================");
|
|
927
|
-
lines.push("ERRORS");
|
|
952
|
+
lines.push("EXECUTION ERRORS");
|
|
928
953
|
lines.push("==================================================");
|
|
929
954
|
for (const error of summary.errors) {
|
|
930
955
|
lines.push(`
|
|
@@ -937,11 +962,21 @@ function formatEvaluationSummary(summary) {
|
|
|
937
962
|
lines.push("EVALUATION SUMMARY");
|
|
938
963
|
lines.push("==================================================");
|
|
939
964
|
lines.push(`Total tests: ${summary.total}`);
|
|
940
|
-
|
|
941
|
-
|
|
942
|
-
lines.push(`
|
|
965
|
+
lines.push(`Passed: ${summary.passedCount}`);
|
|
966
|
+
if (summary.qualityFailureCount > 0) {
|
|
967
|
+
lines.push(`Quality failures: ${summary.qualityFailureCount}`);
|
|
968
|
+
}
|
|
969
|
+
if (summary.executionErrorCount > 0) {
|
|
970
|
+
lines.push(`Execution errors: ${summary.executionErrorCount}`);
|
|
971
|
+
}
|
|
972
|
+
if (summary.executionErrorCount > 0) {
|
|
973
|
+
const qualityCount = summary.total - summary.executionErrorCount;
|
|
974
|
+
lines.push(
|
|
975
|
+
`Mean score: ${formatScore(summary.mean)} (${qualityCount} quality tests, ${summary.executionErrorCount} execution errors excluded)`
|
|
976
|
+
);
|
|
977
|
+
} else {
|
|
978
|
+
lines.push(`Mean score: ${formatScore(summary.mean)}`);
|
|
943
979
|
}
|
|
944
|
-
lines.push(`Mean score: ${formatScore(summary.mean)}`);
|
|
945
980
|
lines.push(`Median score: ${formatScore(summary.median)}`);
|
|
946
981
|
lines.push(`Min score: ${formatScore(summary.min)}`);
|
|
947
982
|
lines.push(`Max score: ${formatScore(summary.max)}`);
|
|
@@ -961,6 +996,20 @@ function formatEvaluationSummary(summary) {
|
|
|
961
996
|
summary.bottomResults.forEach((result, index) => {
|
|
962
997
|
lines.push(` ${index + 1}. ${result.testId}: ${formatScore(result.score)}`);
|
|
963
998
|
});
|
|
999
|
+
const failureStageEntries = Object.entries(summary.byFailureStage);
|
|
1000
|
+
if (failureStageEntries.length > 0) {
|
|
1001
|
+
lines.push("\nExecution errors by stage:");
|
|
1002
|
+
for (const [stage, count] of failureStageEntries) {
|
|
1003
|
+
lines.push(` ${stage}: ${count}`);
|
|
1004
|
+
}
|
|
1005
|
+
}
|
|
1006
|
+
const failureReasonEntries = Object.entries(summary.byFailureReason);
|
|
1007
|
+
if (failureReasonEntries.length > 0) {
|
|
1008
|
+
lines.push("\nExecution errors by reason:");
|
|
1009
|
+
for (const [reason, count] of failureReasonEntries) {
|
|
1010
|
+
lines.push(` ${reason}: ${count}`);
|
|
1011
|
+
}
|
|
1012
|
+
}
|
|
964
1013
|
return lines.join("\n");
|
|
965
1014
|
}
|
|
966
1015
|
function formatMatrixSummary(results) {
|
|
@@ -2484,12 +2533,13 @@ function normalizeOptions(rawOptions, config, yamlExecution) {
|
|
|
2484
2533
|
noCache: resolvedNoCache,
|
|
2485
2534
|
// Boolean OR: config `true` cannot be overridden to `false` from CLI.
|
|
2486
2535
|
// Intentional — there are no --no-verbose / --no-keep-workspaces flags.
|
|
2487
|
-
|
|
2488
|
-
|
|
2536
|
+
// Precedence: CLI > YAML config > TS config
|
|
2537
|
+
verbose: normalizeBoolean(rawOptions.verbose) || yamlExecution?.verbose === true || config?.execution?.verbose === true,
|
|
2538
|
+
keepWorkspaces: normalizeBoolean(rawOptions.keepWorkspaces) || yamlExecution?.keep_workspaces === true || config?.execution?.keepWorkspaces === true,
|
|
2489
2539
|
cleanupWorkspaces: normalizeBoolean(rawOptions.cleanupWorkspaces),
|
|
2490
|
-
|
|
2491
|
-
otelFile: normalizeString(rawOptions.otelFile) ?? (yamlExecution?.otel_file ? resolveTimestampPlaceholder(yamlExecution.otel_file) : void 0),
|
|
2492
|
-
traceFile: normalizeString(rawOptions.traceFile) ?? (yamlExecution?.trace_file ? resolveTimestampPlaceholder(yamlExecution.trace_file) : void 0),
|
|
2540
|
+
// Precedence: CLI > YAML config > TS config
|
|
2541
|
+
otelFile: normalizeString(rawOptions.otelFile) ?? (yamlExecution?.otel_file ? resolveTimestampPlaceholder(yamlExecution.otel_file) : void 0) ?? (config?.execution?.otelFile ? resolveTimestampPlaceholder(config.execution.otelFile) : void 0),
|
|
2542
|
+
traceFile: normalizeString(rawOptions.traceFile) ?? (yamlExecution?.trace_file ? resolveTimestampPlaceholder(yamlExecution.trace_file) : void 0) ?? (config?.execution?.traceFile ? resolveTimestampPlaceholder(config.execution.traceFile) : void 0),
|
|
2493
2543
|
exportOtel: normalizeBoolean(rawOptions.exportOtel),
|
|
2494
2544
|
otelBackend: normalizeString(rawOptions.otelBackend),
|
|
2495
2545
|
otelCaptureContent: normalizeBoolean(rawOptions.otelCaptureContent),
|
|
@@ -2789,7 +2839,7 @@ async function runEvalCommand(input) {
|
|
|
2789
2839
|
const useFileExport = !!(options.otelFile || options.traceFile);
|
|
2790
2840
|
if (options.exportOtel || useFileExport) {
|
|
2791
2841
|
try {
|
|
2792
|
-
const { OtelTraceExporter, OTEL_BACKEND_PRESETS } = await import("./dist-OVEHXEXC.js");
|
|
2842
|
+
const { OtelTraceExporter, OTEL_BACKEND_PRESETS } = await import("./dist-OPPA4P5R.js");
|
|
2793
2843
|
let endpoint = process.env.OTEL_EXPORTER_OTLP_ENDPOINT;
|
|
2794
2844
|
let headers = {};
|
|
2795
2845
|
if (options.otelBackend) {
|
|
@@ -3060,4 +3110,4 @@ export {
|
|
|
3060
3110
|
selectTarget,
|
|
3061
3111
|
runEvalCommand
|
|
3062
3112
|
};
|
|
3063
|
-
//# sourceMappingURL=chunk-IL7CRMY6.js.map
|
|
3113
|
+
//# sourceMappingURL=chunk-YBJX5CP6.js.map
|