agentv 3.13.0 → 3.13.2
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/README.md +5 -5
- package/dist/{chunk-6H4IAXQH.js → chunk-4Z5E5CYT.js} +54 -22
- package/dist/chunk-4Z5E5CYT.js.map +1 -0
- package/dist/{chunk-7OHZAFND.js → chunk-D3LNJUUB.js} +67 -35
- package/dist/chunk-D3LNJUUB.js.map +1 -0
- package/dist/{chunk-DJU4C6NS.js → chunk-X2343WOK.js} +31 -19
- package/dist/chunk-X2343WOK.js.map +1 -0
- package/dist/cli.js +3 -3
- package/dist/{dist-SMKOBBFB.js → dist-KPMR7RBT.js} +4 -2
- package/dist/index.js +3 -3
- package/dist/{interactive-RV664PCR.js → interactive-HVKLYGRX.js} +3 -3
- package/dist/templates/.agentv/.env.example +23 -0
- package/dist/templates/.agentv/config.yaml +13 -4
- package/dist/templates/.agentv/targets.yaml +16 -0
- package/package.json +1 -1
- package/dist/chunk-6H4IAXQH.js.map +0 -1
- package/dist/chunk-7OHZAFND.js.map +0 -1
- package/dist/chunk-DJU4C6NS.js.map +0 -1
- package/dist/{dist-SMKOBBFB.js.map → dist-KPMR7RBT.js.map} +0 -0
- package/dist/{interactive-RV664PCR.js.map → interactive-HVKLYGRX.js.map} +0 -0
package/README.md
CHANGED
|
@@ -221,13 +221,13 @@ agentv eval evals/my-eval.yaml -o results.xml
|
|
|
221
221
|
|
|
222
222
|
The HTML report auto-refreshes every 2 seconds during a live run, then locks once the run completes.
|
|
223
223
|
|
|
224
|
-
By default, `agentv eval` creates a run workspace under `.agentv/results/
|
|
224
|
+
By default, `agentv eval` creates a run workspace under `.agentv/results/runs/<run>/`
|
|
225
225
|
with `index.jsonl` as the machine-facing manifest.
|
|
226
226
|
|
|
227
227
|
You can also convert an existing manifest to HTML after the fact:
|
|
228
228
|
|
|
229
229
|
```bash
|
|
230
|
-
agentv convert .agentv/results/
|
|
230
|
+
agentv convert .agentv/results/runs/eval_<timestamp>/index.jsonl -o report.html
|
|
231
231
|
```
|
|
232
232
|
|
|
233
233
|
#### Timeouts
|
|
@@ -358,7 +358,7 @@ agentv create eval my-eval # → evals/my-eval.eval.yaml + .cases.jsonl
|
|
|
358
358
|
Compare a combined results file across all targets (N-way matrix):
|
|
359
359
|
|
|
360
360
|
```bash
|
|
361
|
-
agentv compare .agentv/results/
|
|
361
|
+
agentv compare .agentv/results/runs/eval_<timestamp>/index.jsonl
|
|
362
362
|
```
|
|
363
363
|
|
|
364
364
|
```
|
|
@@ -379,8 +379,8 @@ Pairwise Summary:
|
|
|
379
379
|
Designate a baseline for CI regression gating, or compare two specific targets:
|
|
380
380
|
|
|
381
381
|
```bash
|
|
382
|
-
agentv compare .agentv/results/
|
|
383
|
-
agentv compare .agentv/results/
|
|
382
|
+
agentv compare .agentv/results/runs/eval_<timestamp>/index.jsonl --baseline gpt-4.1
|
|
383
|
+
agentv compare .agentv/results/runs/eval_<timestamp>/index.jsonl --baseline gpt-4.1 --candidate gpt-5-mini
|
|
384
384
|
agentv compare before.jsonl after.jsonl # two-file pairwise
|
|
385
385
|
```
|
|
386
386
|
|
|
package/dist/{chunk-6H4IAXQH.js → chunk-4Z5E5CYT.js}
CHANGED
|
@@ -27,12 +27,12 @@ import {
|
|
|
27
27
|
subscribeToCopilotCliLogEntries,
|
|
28
28
|
subscribeToCopilotSdkLogEntries,
|
|
29
29
|
subscribeToPiLogEntries
|
|
30
|
-
} from "./chunk-7OHZAFND.js";
|
|
30
|
+
} from "./chunk-D3LNJUUB.js";
|
|
31
31
|
|
|
32
32
|
// package.json
|
|
33
33
|
var package_default = {
|
|
34
34
|
name: "agentv",
|
|
35
|
-
version: "3.13.0",
|
|
35
|
+
version: "3.13.2",
|
|
36
36
|
description: "CLI entry point for AgentV",
|
|
37
37
|
type: "module",
|
|
38
38
|
repository: {
|
|
@@ -302,11 +302,12 @@ function toSnakeCaseDeep(obj) {
|
|
|
302
302
|
import { existsSync, statSync } from "node:fs";
|
|
303
303
|
import path3 from "node:path";
|
|
304
304
|
var RESULT_INDEX_FILENAME = "index.jsonl";
|
|
305
|
+
var RESULT_RUNS_DIRNAME = "runs";
|
|
305
306
|
function createRunDirName(timestamp = /* @__PURE__ */ new Date()) {
|
|
306
307
|
return `eval_${timestamp.toISOString().replace(/[:.]/g, "-")}`;
|
|
307
308
|
}
|
|
308
309
|
function buildDefaultRunDir(cwd) {
|
|
309
|
-
return path3.join(cwd, ".agentv", "results",
|
|
310
|
+
return path3.join(cwd, ".agentv", "results", RESULT_RUNS_DIRNAME, createRunDirName());
|
|
310
311
|
}
|
|
311
312
|
function resolveRunIndexPath(runDir) {
|
|
312
313
|
return path3.join(runDir, RESULT_INDEX_FILENAME);
|
|
@@ -547,7 +548,7 @@ function buildBenchmarkArtifact(results, evalFile = "") {
|
|
|
547
548
|
tests_run: testIds
|
|
548
549
|
},
|
|
549
550
|
run_summary: runSummary,
|
|
550
|
-
|
|
551
|
+
per_grader_summary: perEvaluatorSummary,
|
|
551
552
|
notes
|
|
552
553
|
};
|
|
553
554
|
}
|
|
@@ -1643,13 +1644,15 @@ function escapeXml(str) {
|
|
|
1643
1644
|
var JunitWriter = class _JunitWriter {
|
|
1644
1645
|
filePath;
|
|
1645
1646
|
results = [];
|
|
1647
|
+
threshold;
|
|
1646
1648
|
closed = false;
|
|
1647
|
-
constructor(filePath) {
|
|
1649
|
+
constructor(filePath, options) {
|
|
1648
1650
|
this.filePath = filePath;
|
|
1651
|
+
this.threshold = options?.threshold ?? 0.5;
|
|
1649
1652
|
}
|
|
1650
|
-
static async open(filePath) {
|
|
1653
|
+
static async open(filePath, options) {
|
|
1651
1654
|
await mkdir5(path9.dirname(filePath), { recursive: true });
|
|
1652
|
-
return new _JunitWriter(filePath);
|
|
1655
|
+
return new _JunitWriter(filePath, options);
|
|
1653
1656
|
}
|
|
1654
1657
|
async append(result) {
|
|
1655
1658
|
if (this.closed) {
|
|
@@ -1674,7 +1677,7 @@ var JunitWriter = class _JunitWriter {
|
|
|
1674
1677
|
}
|
|
1675
1678
|
const suiteXmls = [];
|
|
1676
1679
|
for (const [suiteName, results] of grouped) {
|
|
1677
|
-
const failures = results.filter((r) => r.score <
|
|
1680
|
+
const failures = results.filter((r) => r.score < this.threshold).length;
|
|
1678
1681
|
const errors = results.filter((r) => r.error !== void 0).length;
|
|
1679
1682
|
const testCases = results.map((r) => {
|
|
1680
1683
|
const time = r.durationMs ? (r.durationMs / 1e3).toFixed(3) : "0.000";
|
|
@@ -1683,7 +1686,7 @@ var JunitWriter = class _JunitWriter {
|
|
|
1683
1686
|
inner = `
|
|
1684
1687
|
<error message="${escapeXml(r.error)}">${escapeXml(r.error)}</error>
|
|
1685
1688
|
`;
|
|
1686
|
-
} else if (r.score <
|
|
1689
|
+
} else if (r.score < this.threshold) {
|
|
1687
1690
|
const message = `score=${r.score.toFixed(3)}`;
|
|
1688
1691
|
const failedAssertions = r.assertions.filter((a) => !a.passed);
|
|
1689
1692
|
const detail = [
|
|
@@ -1703,7 +1706,7 @@ ${testCases.join("\n")}
|
|
|
1703
1706
|
);
|
|
1704
1707
|
}
|
|
1705
1708
|
const totalTests = this.results.length;
|
|
1706
|
-
const totalFailures = this.results.filter((r) => r.score <
|
|
1709
|
+
const totalFailures = this.results.filter((r) => r.score < this.threshold).length;
|
|
1707
1710
|
const totalErrors = this.results.filter((r) => r.error !== void 0).length;
|
|
1708
1711
|
const xml = `<?xml version="1.0" encoding="UTF-8"?>
|
|
1709
1712
|
<testsuites tests="${totalTests}" failures="${totalFailures}" errors="${totalErrors}">
|
|
@@ -1784,7 +1787,7 @@ async function createOutputWriter(filePath, format) {
|
|
|
1784
1787
|
}
|
|
1785
1788
|
}
|
|
1786
1789
|
var SUPPORTED_EXTENSIONS = /* @__PURE__ */ new Set([".jsonl", ".json", ".xml", ".yaml", ".yml", ".html", ".htm"]);
|
|
1787
|
-
function createWriterFromPath(filePath) {
|
|
1790
|
+
function createWriterFromPath(filePath, options) {
|
|
1788
1791
|
const ext = path11.extname(filePath).toLowerCase();
|
|
1789
1792
|
switch (ext) {
|
|
1790
1793
|
case ".jsonl":
|
|
@@ -1792,7 +1795,7 @@ function createWriterFromPath(filePath) {
|
|
|
1792
1795
|
case ".json":
|
|
1793
1796
|
return JsonWriter.open(filePath);
|
|
1794
1797
|
case ".xml":
|
|
1795
|
-
return JunitWriter.open(filePath);
|
|
1798
|
+
return JunitWriter.open(filePath, { threshold: options?.threshold });
|
|
1796
1799
|
case ".yaml":
|
|
1797
1800
|
case ".yml":
|
|
1798
1801
|
return YamlWriter.open(filePath);
|
|
@@ -1805,8 +1808,8 @@ function createWriterFromPath(filePath) {
|
|
|
1805
1808
|
);
|
|
1806
1809
|
}
|
|
1807
1810
|
}
|
|
1808
|
-
async function createMultiWriter(filePaths) {
|
|
1809
|
-
const writers = await Promise.all(filePaths.map((fp) => createWriterFromPath(fp)));
|
|
1811
|
+
async function createMultiWriter(filePaths, options) {
|
|
1812
|
+
const writers = await Promise.all(filePaths.map((fp) => createWriterFromPath(fp, options)));
|
|
1810
1813
|
return {
|
|
1811
1814
|
async append(result) {
|
|
1812
1815
|
await Promise.all(writers.map((w) => w.append(result)));
|
|
@@ -2384,6 +2387,12 @@ function formatMatrixSummary(results) {
|
|
|
2384
2387
|
lines.push(`${"Average".padEnd(testIdColWidth)} ${avgCells.join(" ")}`);
|
|
2385
2388
|
return lines.join("\n");
|
|
2386
2389
|
}
|
|
2390
|
+
function formatThresholdSummary(meanScore, threshold) {
|
|
2391
|
+
const passed = meanScore >= threshold;
|
|
2392
|
+
const verdict = passed ? "PASS" : "FAIL";
|
|
2393
|
+
const message = `Suite score: ${meanScore.toFixed(2)} (threshold: ${threshold.toFixed(2)}) \u2014 ${verdict}`;
|
|
2394
|
+
return { passed, message };
|
|
2395
|
+
}
|
|
2387
2396
|
|
|
2388
2397
|
// ../../packages/core/dist/evaluation/validation/index.js
|
|
2389
2398
|
import { readFile as readFile3 } from "node:fs/promises";
|
|
@@ -4047,7 +4056,8 @@ function normalizeOptions(rawOptions, config, yamlExecution) {
|
|
|
4047
4056
|
artifacts: normalizeString(rawOptions.artifacts),
|
|
4048
4057
|
graderTarget: normalizeString(rawOptions.graderTarget),
|
|
4049
4058
|
model: normalizeString(rawOptions.model),
|
|
4050
|
-
outputMessages: normalizeOutputMessages(normalizeString(rawOptions.outputMessages))
|
|
4059
|
+
outputMessages: normalizeOutputMessages(normalizeString(rawOptions.outputMessages)),
|
|
4060
|
+
threshold: normalizeOptionalNumber(rawOptions.threshold)
|
|
4051
4061
|
};
|
|
4052
4062
|
}
|
|
4053
4063
|
async function ensureFileExists(filePath, description) {
|
|
@@ -4193,7 +4203,8 @@ async function prepareFileMetadata(params) {
|
|
|
4193
4203
|
yamlCache: suite.cacheConfig?.enabled,
|
|
4194
4204
|
yamlCachePath: suite.cacheConfig?.cachePath,
|
|
4195
4205
|
totalBudgetUsd: suite.totalBudgetUsd,
|
|
4196
|
-
failOnError: suite.failOnError
|
|
4206
|
+
failOnError: suite.failOnError,
|
|
4207
|
+
threshold: suite.threshold
|
|
4197
4208
|
};
|
|
4198
4209
|
}
|
|
4199
4210
|
async function runWithLimit(items, limit, task) {
|
|
@@ -4349,6 +4360,9 @@ async function runSingleEvalFile(params) {
|
|
|
4349
4360
|
}
|
|
4350
4361
|
async function runEvalCommand(input) {
|
|
4351
4362
|
const cwd = process.cwd();
|
|
4363
|
+
if (!process.env.AGENTV_RUN_TIMESTAMP) {
|
|
4364
|
+
process.env.AGENTV_RUN_TIMESTAMP = (/* @__PURE__ */ new Date()).toISOString().replace(/:/g, "-").replace(/\./g, "-");
|
|
4365
|
+
}
|
|
4352
4366
|
let config = null;
|
|
4353
4367
|
try {
|
|
4354
4368
|
config = await loadTsConfig(cwd);
|
|
@@ -4407,7 +4421,7 @@ async function runEvalCommand(input) {
|
|
|
4407
4421
|
const useFileExport = !!options.otelFile;
|
|
4408
4422
|
if (options.exportOtel || useFileExport) {
|
|
4409
4423
|
try {
|
|
4410
|
-
const { OtelTraceExporter, OTEL_BACKEND_PRESETS } = await import("./dist-SMKOBBFB.js");
|
|
4424
|
+
const { OtelTraceExporter, OTEL_BACKEND_PRESETS } = await import("./dist-KPMR7RBT.js");
|
|
4411
4425
|
let endpoint = process.env.OTEL_EXPORTER_OTLP_ENDPOINT;
|
|
4412
4426
|
let headers = {};
|
|
4413
4427
|
if (options.otelBackend) {
|
|
@@ -4453,12 +4467,9 @@ async function runEvalCommand(input) {
|
|
|
4453
4467
|
const uniqueOutputPaths = [...new Set(allOutputPaths)];
|
|
4454
4468
|
const reportedOutputPaths = extraOutputPaths.length > 0 ? [outputPath, ...extraOutputPaths] : [outputPath];
|
|
4455
4469
|
const uniqueReportedOutputPaths = [...new Set(reportedOutputPaths)];
|
|
4456
|
-
let outputWriter;
|
|
4457
4470
|
if (uniqueOutputPaths.length === 1) {
|
|
4458
|
-
outputWriter = await createOutputWriter(primaryWritePath, options.format);
|
|
4459
4471
|
console.log(`Output path: ${outputPath}`);
|
|
4460
4472
|
} else {
|
|
4461
|
-
outputWriter = await createMultiWriter(uniqueOutputPaths);
|
|
4462
4473
|
console.log("Output paths:");
|
|
4463
4474
|
for (const p of uniqueReportedOutputPaths) {
|
|
4464
4475
|
console.log(` ${p}`);
|
|
@@ -4517,6 +4528,18 @@ async function runEvalCommand(input) {
|
|
|
4517
4528
|
if (cacheEnabled) {
|
|
4518
4529
|
console.log(`Response cache: enabled${yamlCachePath ? ` (${yamlCachePath})` : ""}`);
|
|
4519
4530
|
}
|
|
4531
|
+
const yamlThreshold = firstMeta?.threshold;
|
|
4532
|
+
const resolvedThreshold = options.threshold ?? yamlThreshold;
|
|
4533
|
+
if (resolvedThreshold !== void 0 && (resolvedThreshold < 0 || resolvedThreshold > 1)) {
|
|
4534
|
+
throw new Error("--threshold must be between 0 and 1");
|
|
4535
|
+
}
|
|
4536
|
+
const writerOptions = resolvedThreshold !== void 0 ? { threshold: resolvedThreshold } : void 0;
|
|
4537
|
+
let outputWriter;
|
|
4538
|
+
if (uniqueOutputPaths.length === 1) {
|
|
4539
|
+
outputWriter = await createOutputWriter(primaryWritePath, options.format);
|
|
4540
|
+
} else {
|
|
4541
|
+
outputWriter = await createMultiWriter(uniqueOutputPaths, writerOptions);
|
|
4542
|
+
}
|
|
4520
4543
|
const isMatrixMode = Array.from(fileMetadata.values()).some((meta) => meta.selections.length > 1);
|
|
4521
4544
|
let totalEvalCount = 0;
|
|
4522
4545
|
for (const meta of fileMetadata.values()) {
|
|
@@ -4640,6 +4663,13 @@ async function runEvalCommand(input) {
|
|
|
4640
4663
|
}
|
|
4641
4664
|
const summary = calculateEvaluationSummary(allResults);
|
|
4642
4665
|
console.log(formatEvaluationSummary(summary));
|
|
4666
|
+
let thresholdFailed = false;
|
|
4667
|
+
if (resolvedThreshold !== void 0) {
|
|
4668
|
+
const thresholdResult = formatThresholdSummary(summary.mean, resolvedThreshold);
|
|
4669
|
+
console.log(`
|
|
4670
|
+
${thresholdResult.message}`);
|
|
4671
|
+
thresholdFailed = !thresholdResult.passed;
|
|
4672
|
+
}
|
|
4643
4673
|
if (isMatrixMode && allResults.length > 0) {
|
|
4644
4674
|
console.log(formatMatrixSummary(allResults));
|
|
4645
4675
|
}
|
|
@@ -4721,7 +4751,8 @@ Tip: ${summary.executionErrorCount} execution error(s) detected. Re-run failed t
|
|
|
4721
4751
|
executionErrorCount: summary.executionErrorCount,
|
|
4722
4752
|
outputPath,
|
|
4723
4753
|
testFiles: resolvedTestFiles,
|
|
4724
|
-
target: options.target
|
|
4754
|
+
target: options.target,
|
|
4755
|
+
thresholdFailed
|
|
4725
4756
|
};
|
|
4726
4757
|
} finally {
|
|
4727
4758
|
unsubscribeCodexLogs();
|
|
@@ -4758,6 +4789,7 @@ export {
|
|
|
4758
4789
|
package_default,
|
|
4759
4790
|
toSnakeCaseDeep,
|
|
4760
4791
|
RESULT_INDEX_FILENAME,
|
|
4792
|
+
RESULT_RUNS_DIRNAME,
|
|
4761
4793
|
resolveExistingRunPrimaryPath,
|
|
4762
4794
|
resolveWorkspaceOrFilePath,
|
|
4763
4795
|
writeArtifactsFromResults,
|
|
@@ -4779,4 +4811,4 @@ export {
|
|
|
4779
4811
|
selectTarget,
|
|
4780
4812
|
runEvalCommand
|
|
4781
4813
|
};
|
|
4782
|
-
//# sourceMappingURL=chunk-6H4IAXQH.js.map
|
|
4814
|
+
//# sourceMappingURL=chunk-4Z5E5CYT.js.map
|