agentv 3.13.1 → 3.13.2
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/dist/{chunk-LSXO22CF.js → chunk-4Z5E5CYT.js} +50 -20
- package/dist/chunk-4Z5E5CYT.js.map +1 -0
- package/dist/{chunk-K747KGDP.js → chunk-D3LNJUUB.js} +21 -2
- package/dist/chunk-D3LNJUUB.js.map +1 -0
- package/dist/{chunk-UK7UMQOX.js → chunk-X2343WOK.js} +15 -6
- package/dist/chunk-X2343WOK.js.map +1 -0
- package/dist/cli.js +3 -3
- package/dist/{dist-LCZDS36N.js → dist-KPMR7RBT.js} +4 -2
- package/dist/index.js +3 -3
- package/dist/{interactive-76ZJVPI7.js → interactive-HVKLYGRX.js} +3 -3
- package/package.json +1 -1
- package/dist/chunk-K747KGDP.js.map +0 -1
- package/dist/chunk-LSXO22CF.js.map +0 -1
- package/dist/chunk-UK7UMQOX.js.map +0 -1
- package/dist/{dist-LCZDS36N.js.map → dist-KPMR7RBT.js.map} +0 -0
- package/dist/{interactive-76ZJVPI7.js.map → interactive-HVKLYGRX.js.map} +0 -0
|
@@ -27,12 +27,12 @@ import {
|
|
|
27
27
|
subscribeToCopilotCliLogEntries,
|
|
28
28
|
subscribeToCopilotSdkLogEntries,
|
|
29
29
|
subscribeToPiLogEntries
|
|
30
|
-
} from "./chunk-K747KGDP.js";
|
|
30
|
+
} from "./chunk-D3LNJUUB.js";
|
|
31
31
|
|
|
32
32
|
// package.json
|
|
33
33
|
var package_default = {
|
|
34
34
|
name: "agentv",
|
|
35
|
-
version: "3.13.1",
|
|
35
|
+
version: "3.13.2",
|
|
36
36
|
description: "CLI entry point for AgentV",
|
|
37
37
|
type: "module",
|
|
38
38
|
repository: {
|
|
@@ -1644,13 +1644,15 @@ function escapeXml(str) {
|
|
|
1644
1644
|
var JunitWriter = class _JunitWriter {
|
|
1645
1645
|
filePath;
|
|
1646
1646
|
results = [];
|
|
1647
|
+
threshold;
|
|
1647
1648
|
closed = false;
|
|
1648
|
-
constructor(filePath) {
|
|
1649
|
+
constructor(filePath, options) {
|
|
1649
1650
|
this.filePath = filePath;
|
|
1651
|
+
this.threshold = options?.threshold ?? 0.5;
|
|
1650
1652
|
}
|
|
1651
|
-
static async open(filePath) {
|
|
1653
|
+
static async open(filePath, options) {
|
|
1652
1654
|
await mkdir5(path9.dirname(filePath), { recursive: true });
|
|
1653
|
-
return new _JunitWriter(filePath);
|
|
1655
|
+
return new _JunitWriter(filePath, options);
|
|
1654
1656
|
}
|
|
1655
1657
|
async append(result) {
|
|
1656
1658
|
if (this.closed) {
|
|
@@ -1675,7 +1677,7 @@ var JunitWriter = class _JunitWriter {
|
|
|
1675
1677
|
}
|
|
1676
1678
|
const suiteXmls = [];
|
|
1677
1679
|
for (const [suiteName, results] of grouped) {
|
|
1678
|
-
const failures = results.filter((r) => r.score < 0.5).length;
|
|
1680
|
+
const failures = results.filter((r) => r.score < this.threshold).length;
|
|
1679
1681
|
const errors = results.filter((r) => r.error !== void 0).length;
|
|
1680
1682
|
const testCases = results.map((r) => {
|
|
1681
1683
|
const time = r.durationMs ? (r.durationMs / 1e3).toFixed(3) : "0.000";
|
|
@@ -1684,7 +1686,7 @@ var JunitWriter = class _JunitWriter {
|
|
|
1684
1686
|
inner = `
|
|
1685
1687
|
<error message="${escapeXml(r.error)}">${escapeXml(r.error)}</error>
|
|
1686
1688
|
`;
|
|
1687
|
-
} else if (r.score < 0.5) {
|
|
1689
|
+
} else if (r.score < this.threshold) {
|
|
1688
1690
|
const message = `score=${r.score.toFixed(3)}`;
|
|
1689
1691
|
const failedAssertions = r.assertions.filter((a) => !a.passed);
|
|
1690
1692
|
const detail = [
|
|
@@ -1704,7 +1706,7 @@ ${testCases.join("\n")}
|
|
|
1704
1706
|
);
|
|
1705
1707
|
}
|
|
1706
1708
|
const totalTests = this.results.length;
|
|
1707
|
-
const totalFailures = this.results.filter((r) => r.score < 0.5).length;
|
|
1709
|
+
const totalFailures = this.results.filter((r) => r.score < this.threshold).length;
|
|
1708
1710
|
const totalErrors = this.results.filter((r) => r.error !== void 0).length;
|
|
1709
1711
|
const xml = `<?xml version="1.0" encoding="UTF-8"?>
|
|
1710
1712
|
<testsuites tests="${totalTests}" failures="${totalFailures}" errors="${totalErrors}">
|
|
@@ -1785,7 +1787,7 @@ async function createOutputWriter(filePath, format) {
|
|
|
1785
1787
|
}
|
|
1786
1788
|
}
|
|
1787
1789
|
var SUPPORTED_EXTENSIONS = /* @__PURE__ */ new Set([".jsonl", ".json", ".xml", ".yaml", ".yml", ".html", ".htm"]);
|
|
1788
|
-
function createWriterFromPath(filePath) {
|
|
1790
|
+
function createWriterFromPath(filePath, options) {
|
|
1789
1791
|
const ext = path11.extname(filePath).toLowerCase();
|
|
1790
1792
|
switch (ext) {
|
|
1791
1793
|
case ".jsonl":
|
|
@@ -1793,7 +1795,7 @@ function createWriterFromPath(filePath) {
|
|
|
1793
1795
|
case ".json":
|
|
1794
1796
|
return JsonWriter.open(filePath);
|
|
1795
1797
|
case ".xml":
|
|
1796
|
-
return JunitWriter.open(filePath);
|
|
1798
|
+
return JunitWriter.open(filePath, { threshold: options?.threshold });
|
|
1797
1799
|
case ".yaml":
|
|
1798
1800
|
case ".yml":
|
|
1799
1801
|
return YamlWriter.open(filePath);
|
|
@@ -1806,8 +1808,8 @@ function createWriterFromPath(filePath) {
|
|
|
1806
1808
|
);
|
|
1807
1809
|
}
|
|
1808
1810
|
}
|
|
1809
|
-
async function createMultiWriter(filePaths) {
|
|
1810
|
-
const writers = await Promise.all(filePaths.map((fp) => createWriterFromPath(fp)));
|
|
1811
|
+
async function createMultiWriter(filePaths, options) {
|
|
1812
|
+
const writers = await Promise.all(filePaths.map((fp) => createWriterFromPath(fp, options)));
|
|
1811
1813
|
return {
|
|
1812
1814
|
async append(result) {
|
|
1813
1815
|
await Promise.all(writers.map((w) => w.append(result)));
|
|
@@ -2385,6 +2387,12 @@ function formatMatrixSummary(results) {
|
|
|
2385
2387
|
lines.push(`${"Average".padEnd(testIdColWidth)} ${avgCells.join(" ")}`);
|
|
2386
2388
|
return lines.join("\n");
|
|
2387
2389
|
}
|
|
2390
|
+
function formatThresholdSummary(meanScore, threshold) {
|
|
2391
|
+
const passed = meanScore >= threshold;
|
|
2392
|
+
const verdict = passed ? "PASS" : "FAIL";
|
|
2393
|
+
const message = `Suite score: ${meanScore.toFixed(2)} (threshold: ${threshold.toFixed(2)}) \u2014 ${verdict}`;
|
|
2394
|
+
return { passed, message };
|
|
2395
|
+
}
|
|
2388
2396
|
|
|
2389
2397
|
// ../../packages/core/dist/evaluation/validation/index.js
|
|
2390
2398
|
import { readFile as readFile3 } from "node:fs/promises";
|
|
@@ -4048,7 +4056,8 @@ function normalizeOptions(rawOptions, config, yamlExecution) {
|
|
|
4048
4056
|
artifacts: normalizeString(rawOptions.artifacts),
|
|
4049
4057
|
graderTarget: normalizeString(rawOptions.graderTarget),
|
|
4050
4058
|
model: normalizeString(rawOptions.model),
|
|
4051
|
-
outputMessages: normalizeOutputMessages(normalizeString(rawOptions.outputMessages))
|
|
4059
|
+
outputMessages: normalizeOutputMessages(normalizeString(rawOptions.outputMessages)),
|
|
4060
|
+
threshold: normalizeOptionalNumber(rawOptions.threshold)
|
|
4052
4061
|
};
|
|
4053
4062
|
}
|
|
4054
4063
|
async function ensureFileExists(filePath, description) {
|
|
@@ -4194,7 +4203,8 @@ async function prepareFileMetadata(params) {
|
|
|
4194
4203
|
yamlCache: suite.cacheConfig?.enabled,
|
|
4195
4204
|
yamlCachePath: suite.cacheConfig?.cachePath,
|
|
4196
4205
|
totalBudgetUsd: suite.totalBudgetUsd,
|
|
4197
|
-
failOnError: suite.failOnError
|
|
4206
|
+
failOnError: suite.failOnError,
|
|
4207
|
+
threshold: suite.threshold
|
|
4198
4208
|
};
|
|
4199
4209
|
}
|
|
4200
4210
|
async function runWithLimit(items, limit, task) {
|
|
@@ -4350,6 +4360,9 @@ async function runSingleEvalFile(params) {
|
|
|
4350
4360
|
}
|
|
4351
4361
|
async function runEvalCommand(input) {
|
|
4352
4362
|
const cwd = process.cwd();
|
|
4363
|
+
if (!process.env.AGENTV_RUN_TIMESTAMP) {
|
|
4364
|
+
process.env.AGENTV_RUN_TIMESTAMP = (/* @__PURE__ */ new Date()).toISOString().replace(/:/g, "-").replace(/\./g, "-");
|
|
4365
|
+
}
|
|
4353
4366
|
let config = null;
|
|
4354
4367
|
try {
|
|
4355
4368
|
config = await loadTsConfig(cwd);
|
|
@@ -4408,7 +4421,7 @@ async function runEvalCommand(input) {
|
|
|
4408
4421
|
const useFileExport = !!options.otelFile;
|
|
4409
4422
|
if (options.exportOtel || useFileExport) {
|
|
4410
4423
|
try {
|
|
4411
|
-
const { OtelTraceExporter, OTEL_BACKEND_PRESETS } = await import("./dist-LCZDS36N.js");
|
|
4424
|
+
const { OtelTraceExporter, OTEL_BACKEND_PRESETS } = await import("./dist-KPMR7RBT.js");
|
|
4412
4425
|
let endpoint = process.env.OTEL_EXPORTER_OTLP_ENDPOINT;
|
|
4413
4426
|
let headers = {};
|
|
4414
4427
|
if (options.otelBackend) {
|
|
@@ -4454,12 +4467,9 @@ async function runEvalCommand(input) {
|
|
|
4454
4467
|
const uniqueOutputPaths = [...new Set(allOutputPaths)];
|
|
4455
4468
|
const reportedOutputPaths = extraOutputPaths.length > 0 ? [outputPath, ...extraOutputPaths] : [outputPath];
|
|
4456
4469
|
const uniqueReportedOutputPaths = [...new Set(reportedOutputPaths)];
|
|
4457
|
-
let outputWriter;
|
|
4458
4470
|
if (uniqueOutputPaths.length === 1) {
|
|
4459
|
-
outputWriter = await createOutputWriter(primaryWritePath, options.format);
|
|
4460
4471
|
console.log(`Output path: ${outputPath}`);
|
|
4461
4472
|
} else {
|
|
4462
|
-
outputWriter = await createMultiWriter(uniqueOutputPaths);
|
|
4463
4473
|
console.log("Output paths:");
|
|
4464
4474
|
for (const p of uniqueReportedOutputPaths) {
|
|
4465
4475
|
console.log(` ${p}`);
|
|
@@ -4518,6 +4528,18 @@ async function runEvalCommand(input) {
|
|
|
4518
4528
|
if (cacheEnabled) {
|
|
4519
4529
|
console.log(`Response cache: enabled${yamlCachePath ? ` (${yamlCachePath})` : ""}`);
|
|
4520
4530
|
}
|
|
4531
|
+
const yamlThreshold = firstMeta?.threshold;
|
|
4532
|
+
const resolvedThreshold = options.threshold ?? yamlThreshold;
|
|
4533
|
+
if (resolvedThreshold !== void 0 && (resolvedThreshold < 0 || resolvedThreshold > 1)) {
|
|
4534
|
+
throw new Error("--threshold must be between 0 and 1");
|
|
4535
|
+
}
|
|
4536
|
+
const writerOptions = resolvedThreshold !== void 0 ? { threshold: resolvedThreshold } : void 0;
|
|
4537
|
+
let outputWriter;
|
|
4538
|
+
if (uniqueOutputPaths.length === 1) {
|
|
4539
|
+
outputWriter = await createOutputWriter(primaryWritePath, options.format);
|
|
4540
|
+
} else {
|
|
4541
|
+
outputWriter = await createMultiWriter(uniqueOutputPaths, writerOptions);
|
|
4542
|
+
}
|
|
4521
4543
|
const isMatrixMode = Array.from(fileMetadata.values()).some((meta) => meta.selections.length > 1);
|
|
4522
4544
|
let totalEvalCount = 0;
|
|
4523
4545
|
for (const meta of fileMetadata.values()) {
|
|
@@ -4641,6 +4663,13 @@ async function runEvalCommand(input) {
|
|
|
4641
4663
|
}
|
|
4642
4664
|
const summary = calculateEvaluationSummary(allResults);
|
|
4643
4665
|
console.log(formatEvaluationSummary(summary));
|
|
4666
|
+
let thresholdFailed = false;
|
|
4667
|
+
if (resolvedThreshold !== void 0) {
|
|
4668
|
+
const thresholdResult = formatThresholdSummary(summary.mean, resolvedThreshold);
|
|
4669
|
+
console.log(`
|
|
4670
|
+
${thresholdResult.message}`);
|
|
4671
|
+
thresholdFailed = !thresholdResult.passed;
|
|
4672
|
+
}
|
|
4644
4673
|
if (isMatrixMode && allResults.length > 0) {
|
|
4645
4674
|
console.log(formatMatrixSummary(allResults));
|
|
4646
4675
|
}
|
|
@@ -4722,7 +4751,8 @@ Tip: ${summary.executionErrorCount} execution error(s) detected. Re-run failed t
|
|
|
4722
4751
|
executionErrorCount: summary.executionErrorCount,
|
|
4723
4752
|
outputPath,
|
|
4724
4753
|
testFiles: resolvedTestFiles,
|
|
4725
|
-
target: options.target
|
|
4754
|
+
target: options.target,
|
|
4755
|
+
thresholdFailed
|
|
4726
4756
|
};
|
|
4727
4757
|
} finally {
|
|
4728
4758
|
unsubscribeCodexLogs();
|
|
@@ -4781,4 +4811,4 @@ export {
|
|
|
4781
4811
|
selectTarget,
|
|
4782
4812
|
runEvalCommand
|
|
4783
4813
|
};
|
|
4784
|
-
//# sourceMappingURL=chunk-LSXO22CF.js.map
|
|
4814
|
+
//# sourceMappingURL=chunk-4Z5E5CYT.js.map
|