agentv 3.13.0 → 3.13.2

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
package/README.md CHANGED
@@ -221,13 +221,13 @@ agentv eval evals/my-eval.yaml -o results.xml
221
221
 
222
222
  The HTML report auto-refreshes every 2 seconds during a live run, then locks once the run completes.
223
223
 
224
- By default, `agentv eval` creates a run workspace under `.agentv/results/raw/<run>/`
224
+ By default, `agentv eval` creates a run workspace under `.agentv/results/runs/<run>/`
225
225
  with `index.jsonl` as the machine-facing manifest.
226
226
 
227
227
  You can also convert an existing manifest to HTML after the fact:
228
228
 
229
229
  ```bash
230
- agentv convert .agentv/results/raw/eval_<timestamp>/index.jsonl -o report.html
230
+ agentv convert .agentv/results/runs/eval_<timestamp>/index.jsonl -o report.html
231
231
  ```
232
232
 
233
233
  #### Timeouts
@@ -358,7 +358,7 @@ agentv create eval my-eval # → evals/my-eval.eval.yaml + .cases.jsonl
358
358
  Compare a combined results file across all targets (N-way matrix):
359
359
 
360
360
  ```bash
361
- agentv compare .agentv/results/raw/eval_<timestamp>/index.jsonl
361
+ agentv compare .agentv/results/runs/eval_<timestamp>/index.jsonl
362
362
  ```
363
363
 
364
364
  ```
@@ -379,8 +379,8 @@ Pairwise Summary:
379
379
  Designate a baseline for CI regression gating, or compare two specific targets:
380
380
 
381
381
  ```bash
382
- agentv compare .agentv/results/raw/eval_<timestamp>/index.jsonl --baseline gpt-4.1
383
- agentv compare .agentv/results/raw/eval_<timestamp>/index.jsonl --baseline gpt-4.1 --candidate gpt-5-mini
382
+ agentv compare .agentv/results/runs/eval_<timestamp>/index.jsonl --baseline gpt-4.1
383
+ agentv compare .agentv/results/runs/eval_<timestamp>/index.jsonl --baseline gpt-4.1 --candidate gpt-5-mini
384
384
  agentv compare before.jsonl after.jsonl # two-file pairwise
385
385
  ```
386
386
 
@@ -27,12 +27,12 @@ import {
27
27
  subscribeToCopilotCliLogEntries,
28
28
  subscribeToCopilotSdkLogEntries,
29
29
  subscribeToPiLogEntries
30
- } from "./chunk-7OHZAFND.js";
30
+ } from "./chunk-D3LNJUUB.js";
31
31
 
32
32
  // package.json
33
33
  var package_default = {
34
34
  name: "agentv",
35
- version: "3.13.0",
35
+ version: "3.13.2",
36
36
  description: "CLI entry point for AgentV",
37
37
  type: "module",
38
38
  repository: {
@@ -302,11 +302,12 @@ function toSnakeCaseDeep(obj) {
302
302
  import { existsSync, statSync } from "node:fs";
303
303
  import path3 from "node:path";
304
304
  var RESULT_INDEX_FILENAME = "index.jsonl";
305
+ var RESULT_RUNS_DIRNAME = "runs";
305
306
  function createRunDirName(timestamp = /* @__PURE__ */ new Date()) {
306
307
  return `eval_${timestamp.toISOString().replace(/[:.]/g, "-")}`;
307
308
  }
308
309
  function buildDefaultRunDir(cwd) {
309
- return path3.join(cwd, ".agentv", "results", "raw", createRunDirName());
310
+ return path3.join(cwd, ".agentv", "results", RESULT_RUNS_DIRNAME, createRunDirName());
310
311
  }
311
312
  function resolveRunIndexPath(runDir) {
312
313
  return path3.join(runDir, RESULT_INDEX_FILENAME);
@@ -547,7 +548,7 @@ function buildBenchmarkArtifact(results, evalFile = "") {
547
548
  tests_run: testIds
548
549
  },
549
550
  run_summary: runSummary,
550
- per_evaluator_summary: perEvaluatorSummary,
551
+ per_grader_summary: perEvaluatorSummary,
551
552
  notes
552
553
  };
553
554
  }
@@ -1643,13 +1644,15 @@ function escapeXml(str) {
1643
1644
  var JunitWriter = class _JunitWriter {
1644
1645
  filePath;
1645
1646
  results = [];
1647
+ threshold;
1646
1648
  closed = false;
1647
- constructor(filePath) {
1649
+ constructor(filePath, options) {
1648
1650
  this.filePath = filePath;
1651
+ this.threshold = options?.threshold ?? 0.5;
1649
1652
  }
1650
- static async open(filePath) {
1653
+ static async open(filePath, options) {
1651
1654
  await mkdir5(path9.dirname(filePath), { recursive: true });
1652
- return new _JunitWriter(filePath);
1655
+ return new _JunitWriter(filePath, options);
1653
1656
  }
1654
1657
  async append(result) {
1655
1658
  if (this.closed) {
@@ -1674,7 +1677,7 @@ var JunitWriter = class _JunitWriter {
1674
1677
  }
1675
1678
  const suiteXmls = [];
1676
1679
  for (const [suiteName, results] of grouped) {
1677
- const failures = results.filter((r) => r.score < 0.5).length;
1680
+ const failures = results.filter((r) => r.score < this.threshold).length;
1678
1681
  const errors = results.filter((r) => r.error !== void 0).length;
1679
1682
  const testCases = results.map((r) => {
1680
1683
  const time = r.durationMs ? (r.durationMs / 1e3).toFixed(3) : "0.000";
@@ -1683,7 +1686,7 @@ var JunitWriter = class _JunitWriter {
1683
1686
  inner = `
1684
1687
  <error message="${escapeXml(r.error)}">${escapeXml(r.error)}</error>
1685
1688
  `;
1686
- } else if (r.score < 0.5) {
1689
+ } else if (r.score < this.threshold) {
1687
1690
  const message = `score=${r.score.toFixed(3)}`;
1688
1691
  const failedAssertions = r.assertions.filter((a) => !a.passed);
1689
1692
  const detail = [
@@ -1703,7 +1706,7 @@ ${testCases.join("\n")}
1703
1706
  );
1704
1707
  }
1705
1708
  const totalTests = this.results.length;
1706
- const totalFailures = this.results.filter((r) => r.score < 0.5).length;
1709
+ const totalFailures = this.results.filter((r) => r.score < this.threshold).length;
1707
1710
  const totalErrors = this.results.filter((r) => r.error !== void 0).length;
1708
1711
  const xml = `<?xml version="1.0" encoding="UTF-8"?>
1709
1712
  <testsuites tests="${totalTests}" failures="${totalFailures}" errors="${totalErrors}">
@@ -1784,7 +1787,7 @@ async function createOutputWriter(filePath, format) {
1784
1787
  }
1785
1788
  }
1786
1789
  var SUPPORTED_EXTENSIONS = /* @__PURE__ */ new Set([".jsonl", ".json", ".xml", ".yaml", ".yml", ".html", ".htm"]);
1787
- function createWriterFromPath(filePath) {
1790
+ function createWriterFromPath(filePath, options) {
1788
1791
  const ext = path11.extname(filePath).toLowerCase();
1789
1792
  switch (ext) {
1790
1793
  case ".jsonl":
@@ -1792,7 +1795,7 @@ function createWriterFromPath(filePath) {
1792
1795
  case ".json":
1793
1796
  return JsonWriter.open(filePath);
1794
1797
  case ".xml":
1795
- return JunitWriter.open(filePath);
1798
+ return JunitWriter.open(filePath, { threshold: options?.threshold });
1796
1799
  case ".yaml":
1797
1800
  case ".yml":
1798
1801
  return YamlWriter.open(filePath);
@@ -1805,8 +1808,8 @@ function createWriterFromPath(filePath) {
1805
1808
  );
1806
1809
  }
1807
1810
  }
1808
- async function createMultiWriter(filePaths) {
1809
- const writers = await Promise.all(filePaths.map((fp) => createWriterFromPath(fp)));
1811
+ async function createMultiWriter(filePaths, options) {
1812
+ const writers = await Promise.all(filePaths.map((fp) => createWriterFromPath(fp, options)));
1810
1813
  return {
1811
1814
  async append(result) {
1812
1815
  await Promise.all(writers.map((w) => w.append(result)));
@@ -2384,6 +2387,12 @@ function formatMatrixSummary(results) {
2384
2387
  lines.push(`${"Average".padEnd(testIdColWidth)} ${avgCells.join(" ")}`);
2385
2388
  return lines.join("\n");
2386
2389
  }
2390
+ function formatThresholdSummary(meanScore, threshold) {
2391
+ const passed = meanScore >= threshold;
2392
+ const verdict = passed ? "PASS" : "FAIL";
2393
+ const message = `Suite score: ${meanScore.toFixed(2)} (threshold: ${threshold.toFixed(2)}) \u2014 ${verdict}`;
2394
+ return { passed, message };
2395
+ }
2387
2396
 
2388
2397
  // ../../packages/core/dist/evaluation/validation/index.js
2389
2398
  import { readFile as readFile3 } from "node:fs/promises";
@@ -4047,7 +4056,8 @@ function normalizeOptions(rawOptions, config, yamlExecution) {
4047
4056
  artifacts: normalizeString(rawOptions.artifacts),
4048
4057
  graderTarget: normalizeString(rawOptions.graderTarget),
4049
4058
  model: normalizeString(rawOptions.model),
4050
- outputMessages: normalizeOutputMessages(normalizeString(rawOptions.outputMessages))
4059
+ outputMessages: normalizeOutputMessages(normalizeString(rawOptions.outputMessages)),
4060
+ threshold: normalizeOptionalNumber(rawOptions.threshold)
4051
4061
  };
4052
4062
  }
4053
4063
  async function ensureFileExists(filePath, description) {
@@ -4193,7 +4203,8 @@ async function prepareFileMetadata(params) {
4193
4203
  yamlCache: suite.cacheConfig?.enabled,
4194
4204
  yamlCachePath: suite.cacheConfig?.cachePath,
4195
4205
  totalBudgetUsd: suite.totalBudgetUsd,
4196
- failOnError: suite.failOnError
4206
+ failOnError: suite.failOnError,
4207
+ threshold: suite.threshold
4197
4208
  };
4198
4209
  }
4199
4210
  async function runWithLimit(items, limit, task) {
@@ -4349,6 +4360,9 @@ async function runSingleEvalFile(params) {
4349
4360
  }
4350
4361
  async function runEvalCommand(input) {
4351
4362
  const cwd = process.cwd();
4363
+ if (!process.env.AGENTV_RUN_TIMESTAMP) {
4364
+ process.env.AGENTV_RUN_TIMESTAMP = (/* @__PURE__ */ new Date()).toISOString().replace(/:/g, "-").replace(/\./g, "-");
4365
+ }
4352
4366
  let config = null;
4353
4367
  try {
4354
4368
  config = await loadTsConfig(cwd);
@@ -4407,7 +4421,7 @@ async function runEvalCommand(input) {
4407
4421
  const useFileExport = !!options.otelFile;
4408
4422
  if (options.exportOtel || useFileExport) {
4409
4423
  try {
4410
- const { OtelTraceExporter, OTEL_BACKEND_PRESETS } = await import("./dist-SMKOBBFB.js");
4424
+ const { OtelTraceExporter, OTEL_BACKEND_PRESETS } = await import("./dist-KPMR7RBT.js");
4411
4425
  let endpoint = process.env.OTEL_EXPORTER_OTLP_ENDPOINT;
4412
4426
  let headers = {};
4413
4427
  if (options.otelBackend) {
@@ -4453,12 +4467,9 @@ async function runEvalCommand(input) {
4453
4467
  const uniqueOutputPaths = [...new Set(allOutputPaths)];
4454
4468
  const reportedOutputPaths = extraOutputPaths.length > 0 ? [outputPath, ...extraOutputPaths] : [outputPath];
4455
4469
  const uniqueReportedOutputPaths = [...new Set(reportedOutputPaths)];
4456
- let outputWriter;
4457
4470
  if (uniqueOutputPaths.length === 1) {
4458
- outputWriter = await createOutputWriter(primaryWritePath, options.format);
4459
4471
  console.log(`Output path: ${outputPath}`);
4460
4472
  } else {
4461
- outputWriter = await createMultiWriter(uniqueOutputPaths);
4462
4473
  console.log("Output paths:");
4463
4474
  for (const p of uniqueReportedOutputPaths) {
4464
4475
  console.log(` ${p}`);
@@ -4517,6 +4528,18 @@ async function runEvalCommand(input) {
4517
4528
  if (cacheEnabled) {
4518
4529
  console.log(`Response cache: enabled${yamlCachePath ? ` (${yamlCachePath})` : ""}`);
4519
4530
  }
4531
+ const yamlThreshold = firstMeta?.threshold;
4532
+ const resolvedThreshold = options.threshold ?? yamlThreshold;
4533
+ if (resolvedThreshold !== void 0 && (resolvedThreshold < 0 || resolvedThreshold > 1)) {
4534
+ throw new Error("--threshold must be between 0 and 1");
4535
+ }
4536
+ const writerOptions = resolvedThreshold !== void 0 ? { threshold: resolvedThreshold } : void 0;
4537
+ let outputWriter;
4538
+ if (uniqueOutputPaths.length === 1) {
4539
+ outputWriter = await createOutputWriter(primaryWritePath, options.format);
4540
+ } else {
4541
+ outputWriter = await createMultiWriter(uniqueOutputPaths, writerOptions);
4542
+ }
4520
4543
  const isMatrixMode = Array.from(fileMetadata.values()).some((meta) => meta.selections.length > 1);
4521
4544
  let totalEvalCount = 0;
4522
4545
  for (const meta of fileMetadata.values()) {
@@ -4640,6 +4663,13 @@ async function runEvalCommand(input) {
4640
4663
  }
4641
4664
  const summary = calculateEvaluationSummary(allResults);
4642
4665
  console.log(formatEvaluationSummary(summary));
4666
+ let thresholdFailed = false;
4667
+ if (resolvedThreshold !== void 0) {
4668
+ const thresholdResult = formatThresholdSummary(summary.mean, resolvedThreshold);
4669
+ console.log(`
4670
+ ${thresholdResult.message}`);
4671
+ thresholdFailed = !thresholdResult.passed;
4672
+ }
4643
4673
  if (isMatrixMode && allResults.length > 0) {
4644
4674
  console.log(formatMatrixSummary(allResults));
4645
4675
  }
@@ -4721,7 +4751,8 @@ Tip: ${summary.executionErrorCount} execution error(s) detected. Re-run failed t
4721
4751
  executionErrorCount: summary.executionErrorCount,
4722
4752
  outputPath,
4723
4753
  testFiles: resolvedTestFiles,
4724
- target: options.target
4754
+ target: options.target,
4755
+ thresholdFailed
4725
4756
  };
4726
4757
  } finally {
4727
4758
  unsubscribeCodexLogs();
@@ -4758,6 +4789,7 @@ export {
4758
4789
  package_default,
4759
4790
  toSnakeCaseDeep,
4760
4791
  RESULT_INDEX_FILENAME,
4792
+ RESULT_RUNS_DIRNAME,
4761
4793
  resolveExistingRunPrimaryPath,
4762
4794
  resolveWorkspaceOrFilePath,
4763
4795
  writeArtifactsFromResults,
@@ -4779,4 +4811,4 @@ export {
4779
4811
  selectTarget,
4780
4812
  runEvalCommand
4781
4813
  };
4782
- //# sourceMappingURL=chunk-6H4IAXQH.js.map
4814
+ //# sourceMappingURL=chunk-4Z5E5CYT.js.map