agentv 4.4.1 → 4.5.1
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/dist/{chunk-VYZQMN57.js → chunk-5DEZ72J3.js} +22 -28
- package/dist/chunk-5DEZ72J3.js.map +1 -0
- package/dist/{chunk-63NDZ6UC.js → chunk-7DRAXDVC.js} +416 -110
- package/dist/chunk-7DRAXDVC.js.map +1 -0
- package/dist/{chunk-4WMLJHW5.js → chunk-BQC2CDLN.js} +384 -506
- package/dist/chunk-BQC2CDLN.js.map +1 -0
- package/dist/cli.js +3 -3
- package/dist/{dist-P74O2P2I.js → dist-VWMHFUXR.js} +8 -2
- package/dist/index.js +3 -3
- package/dist/{interactive-VJP2AEPT.js → interactive-OG7ZJIHG.js} +3 -3
- package/package.json +1 -1
- package/dist/chunk-4WMLJHW5.js.map +0 -1
- package/dist/chunk-63NDZ6UC.js.map +0 -1
- package/dist/chunk-VYZQMN57.js.map +0 -1
- package/dist/{dist-P74O2P2I.js.map → dist-VWMHFUXR.js.map} +0 -0
- package/dist/{interactive-VJP2AEPT.js.map → interactive-OG7ZJIHG.js.map} +0 -0
|
@@ -29,12 +29,12 @@ import {
|
|
|
29
29
|
subscribeToCopilotCliLogEntries,
|
|
30
30
|
subscribeToCopilotSdkLogEntries,
|
|
31
31
|
subscribeToPiLogEntries
|
|
32
|
-
} from "./chunk-63NDZ6UC.js";
|
|
32
|
+
} from "./chunk-7DRAXDVC.js";
|
|
33
33
|
|
|
34
34
|
// package.json
|
|
35
35
|
var package_default = {
|
|
36
36
|
name: "agentv",
|
|
37
|
-
version: "4.4.1",
|
|
37
|
+
version: "4.5.1",
|
|
38
38
|
description: "CLI entry point for AgentV",
|
|
39
39
|
type: "module",
|
|
40
40
|
repository: {
|
|
@@ -2209,7 +2209,7 @@ function buildHistogram(values) {
|
|
|
2209
2209
|
}
|
|
2210
2210
|
return bins;
|
|
2211
2211
|
}
|
|
2212
|
-
function calculateEvaluationSummary(results) {
|
|
2212
|
+
function calculateEvaluationSummary(results, options) {
|
|
2213
2213
|
const total = results.length;
|
|
2214
2214
|
const errors = results.filter((result) => result.error !== void 0).map((result) => ({ testId: result.testId, error: result.error }));
|
|
2215
2215
|
const errorCount = errors.length;
|
|
@@ -2246,8 +2246,9 @@ function calculateEvaluationSummary(results) {
|
|
|
2246
2246
|
const topResults = sortedResults.slice(0, Math.min(3, sortedResults.length));
|
|
2247
2247
|
const bottomResults = sortedResults.slice(-Math.min(3, sortedResults.length));
|
|
2248
2248
|
const executionErrorCount = executionErrors.length;
|
|
2249
|
-
const
|
|
2250
|
-
const passedCount = results.filter((r) => r.executionStatus === "ok").length;
|
|
2249
|
+
const scoreThreshold = options?.threshold;
|
|
2250
|
+
const passedCount = scoreThreshold !== void 0 ? qualityResults.filter((r) => r.score >= scoreThreshold).length : results.filter((r) => r.executionStatus === "ok").length;
|
|
2251
|
+
const qualityFailureCount = scoreThreshold !== void 0 ? qualityResults.filter((r) => r.score < scoreThreshold).length : results.filter((r) => r.executionStatus === "quality_failure").length;
|
|
2251
2252
|
const byFailureStage = {};
|
|
2252
2253
|
const byFailureReason = {};
|
|
2253
2254
|
for (const result of executionErrors) {
|
|
@@ -2280,7 +2281,7 @@ function calculateEvaluationSummary(results) {
|
|
|
2280
2281
|
function formatScore(value) {
|
|
2281
2282
|
return value.toFixed(3);
|
|
2282
2283
|
}
|
|
2283
|
-
function formatEvaluationSummary(summary) {
|
|
2284
|
+
function formatEvaluationSummary(summary, options) {
|
|
2284
2285
|
if (summary.total === 0) {
|
|
2285
2286
|
return "\nNo results to summarize";
|
|
2286
2287
|
}
|
|
@@ -2296,11 +2297,13 @@ function formatEvaluationSummary(summary) {
|
|
|
2296
2297
|
}
|
|
2297
2298
|
lines.push("");
|
|
2298
2299
|
}
|
|
2299
|
-
const
|
|
2300
|
+
const gradedCount = summary.total - summary.executionErrorCount;
|
|
2301
|
+
const threshold = options?.threshold ?? 0.8;
|
|
2302
|
+
const overallPassed = summary.passedCount === gradedCount || summary.qualityFailureCount === 0 && summary.executionErrorCount === 0;
|
|
2300
2303
|
const overallVerdict = overallPassed ? "PASS" : "FAIL";
|
|
2301
2304
|
const useColor = !(process.env.NO_COLOR !== void 0) && (process.stdout.isTTY ?? false);
|
|
2302
2305
|
const verdictColor = overallPassed ? "\x1B[32m" : "\x1B[31m";
|
|
2303
|
-
const verdictText = `RESULT: ${overallVerdict} (${summary.passedCount}/${
|
|
2306
|
+
const verdictText = `RESULT: ${overallVerdict} (${summary.passedCount}/${gradedCount} scored >= ${threshold}, mean: ${formatScore(summary.mean)})`;
|
|
2304
2307
|
lines.push("\n==================================================");
|
|
2305
2308
|
if (useColor) {
|
|
2306
2309
|
lines.push(`\x1B[1m${verdictColor}${verdictText}\x1B[0m`);
|
|
@@ -2406,12 +2409,6 @@ function formatMatrixSummary(results) {
|
|
|
2406
2409
|
lines.push(`${"Average".padEnd(testIdColWidth)} ${avgCells.join(" ")}`);
|
|
2407
2410
|
return lines.join("\n");
|
|
2408
2411
|
}
|
|
2409
|
-
function formatThresholdSummary(meanScore, threshold) {
|
|
2410
|
-
const passed = meanScore >= threshold;
|
|
2411
|
-
const verdict = passed ? "PASS" : "FAIL";
|
|
2412
|
-
const message = `Suite score: ${meanScore.toFixed(2)} (threshold: ${threshold.toFixed(2)}) \u2014 ${verdict}`;
|
|
2413
|
-
return { passed, message };
|
|
2414
|
-
}
|
|
2415
2412
|
|
|
2416
2413
|
// ../../packages/core/dist/evaluation/validation/index.js
|
|
2417
2414
|
import { readFile as readFile3 } from "node:fs/promises";
|
|
@@ -4371,6 +4368,7 @@ async function runSingleEvalFile(params) {
|
|
|
4371
4368
|
failOnError,
|
|
4372
4369
|
graderTarget: options.graderTarget,
|
|
4373
4370
|
model: options.model,
|
|
4371
|
+
threshold: options.threshold,
|
|
4374
4372
|
streamCallbacks: streamingObserver?.getStreamCallbacks(),
|
|
4375
4373
|
onResult: async (result) => {
|
|
4376
4374
|
streamingObserver?.completeFromResult?.(result);
|
|
@@ -4486,7 +4484,7 @@ async function runEvalCommand(input) {
|
|
|
4486
4484
|
const useFileExport = !!options.otelFile;
|
|
4487
4485
|
if (options.exportOtel || useFileExport) {
|
|
4488
4486
|
try {
|
|
4489
|
-
const { OtelTraceExporter, OTEL_BACKEND_PRESETS } = await import("./dist-P74O2P2I.js");
|
|
4487
|
+
const { OtelTraceExporter, OTEL_BACKEND_PRESETS } = await import("./dist-VWMHFUXR.js");
|
|
4490
4488
|
let endpoint = process.env.OTEL_EXPORTER_OTLP_ENDPOINT;
|
|
4491
4489
|
let headers = {};
|
|
4492
4490
|
if (options.otelBackend) {
|
|
@@ -4707,7 +4705,8 @@ async function runEvalCommand(input) {
|
|
|
4707
4705
|
trialsConfig: targetPrep.trialsConfig,
|
|
4708
4706
|
matrixMode: targetPrep.selections.length > 1,
|
|
4709
4707
|
totalBudgetUsd: targetPrep.totalBudgetUsd,
|
|
4710
|
-
failOnError: targetPrep.failOnError
|
|
4708
|
+
failOnError: targetPrep.failOnError,
|
|
4709
|
+
threshold: resolvedThreshold
|
|
4711
4710
|
});
|
|
4712
4711
|
return result.results;
|
|
4713
4712
|
})
|
|
@@ -4726,15 +4725,10 @@ async function runEvalCommand(input) {
|
|
|
4726
4725
|
`Merged ${retryNonErrorResults.length} non-error result(s) from previous output.`
|
|
4727
4726
|
);
|
|
4728
4727
|
}
|
|
4729
|
-
const
|
|
4730
|
-
|
|
4731
|
-
|
|
4732
|
-
|
|
4733
|
-
const thresholdResult = formatThresholdSummary(summary.mean, resolvedThreshold);
|
|
4734
|
-
console.log(`
|
|
4735
|
-
${thresholdResult.message}`);
|
|
4736
|
-
thresholdFailed = !thresholdResult.passed;
|
|
4737
|
-
}
|
|
4728
|
+
const thresholdOpts = resolvedThreshold !== void 0 ? { threshold: resolvedThreshold } : void 0;
|
|
4729
|
+
const summary = calculateEvaluationSummary(allResults, thresholdOpts);
|
|
4730
|
+
console.log(formatEvaluationSummary(summary, thresholdOpts));
|
|
4731
|
+
const thresholdFailed = resolvedThreshold !== void 0 && summary.qualityFailureCount > 0;
|
|
4738
4732
|
if (isMatrixMode && allResults.length > 0) {
|
|
4739
4733
|
console.log(formatMatrixSummary(allResults));
|
|
4740
4734
|
}
|
|
@@ -4864,10 +4858,10 @@ export {
|
|
|
4864
4858
|
loadManifestResults,
|
|
4865
4859
|
loadLightweightResults,
|
|
4866
4860
|
HtmlWriter,
|
|
4867
|
-
resolveEvalPaths,
|
|
4868
|
-
findRepoRoot,
|
|
4869
4861
|
resolveRunCacheFile,
|
|
4870
4862
|
loadRunCache,
|
|
4863
|
+
resolveEvalPaths,
|
|
4864
|
+
findRepoRoot,
|
|
4871
4865
|
detectFileType,
|
|
4872
4866
|
validateEvalFile,
|
|
4873
4867
|
validateTargetsFile,
|
|
@@ -4878,4 +4872,4 @@ export {
|
|
|
4878
4872
|
selectTarget,
|
|
4879
4873
|
runEvalCommand
|
|
4880
4874
|
};
|
|
4881
|
-
//# sourceMappingURL=chunk-VYZQMN57.js.map
|
|
4875
|
+
//# sourceMappingURL=chunk-5DEZ72J3.js.map
|