agentv 3.11.0 → 3.12.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -27,12 +27,12 @@ import {
27
27
  subscribeToCopilotCliLogEntries,
28
28
  subscribeToCopilotSdkLogEntries,
29
29
  subscribeToPiLogEntries
30
- } from "./chunk-EZGWZVVK.js";
30
+ } from "./chunk-UYBLUYHN.js";
31
31
 
32
32
  // package.json
33
33
  var package_default = {
34
34
  name: "agentv",
35
- version: "3.11.0",
35
+ version: "3.12.0",
36
36
  description: "CLI entry point for AgentV",
37
37
  type: "module",
38
38
  repository: {
@@ -65,8 +65,6 @@ var package_default = {
65
65
  "@github/copilot-sdk": "^0.1.25",
66
66
  "@hono/node-server": "^1.19.11",
67
67
  "@inquirer/prompts": "^8.2.1",
68
- "@mariozechner/pi-agent-core": "^0.54.2",
69
- "@mariozechner/pi-ai": "^0.54.2",
70
68
  "@openai/codex-sdk": "^0.104.0",
71
69
  "cmd-ts": "^0.14.3",
72
70
  dotenv: "^16.4.5",
@@ -77,6 +75,14 @@ var package_default = {
77
75
  semver: "^7.7.4",
78
76
  yaml: "^2.6.1"
79
77
  },
78
+ peerDependencies: {
79
+ "@mariozechner/pi-coding-agent": "^0.62.0"
80
+ },
81
+ peerDependenciesMeta: {
82
+ "@mariozechner/pi-coding-agent": {
83
+ optional: true
84
+ }
85
+ },
80
86
  devDependencies: {
81
87
  "@agentv/core": "workspace:*",
82
88
  "@types/semver": "^7.7.1",
@@ -206,7 +212,7 @@ async function discoverTargetsFile(options) {
206
212
  // src/commands/eval/run-eval.ts
207
213
  import { constants as constants4, mkdirSync } from "node:fs";
208
214
  import { access as access4 } from "node:fs/promises";
209
- import path13 from "node:path";
215
+ import path15 from "node:path";
210
216
  import { pathToFileURL } from "node:url";
211
217
 
212
218
  // src/version-check.ts
@@ -265,7 +271,82 @@ async function promptContinue() {
265
271
 
266
272
  // src/commands/eval/artifact-writer.ts
267
273
  import { mkdir, readFile, writeFile } from "node:fs/promises";
274
+ import path4 from "node:path";
275
+
276
+ // src/utils/case-conversion.ts
277
+ function toSnakeCase(str) {
278
+ if (/^[A-Z]/.test(str)) {
279
+ return str;
280
+ }
281
+ return str.replace(/[A-Z]/g, (letter) => `_${letter.toLowerCase()}`);
282
+ }
283
+ function toSnakeCaseDeep(obj) {
284
+ if (obj === null || obj === void 0) {
285
+ return obj;
286
+ }
287
+ if (Array.isArray(obj)) {
288
+ return obj.map((item) => toSnakeCaseDeep(item));
289
+ }
290
+ if (typeof obj === "object") {
291
+ const result = {};
292
+ for (const [key, value] of Object.entries(obj)) {
293
+ const snakeKey = toSnakeCase(key);
294
+ result[snakeKey] = toSnakeCaseDeep(value);
295
+ }
296
+ return result;
297
+ }
298
+ return obj;
299
+ }
300
+
301
+ // src/commands/eval/result-layout.ts
302
+ import { existsSync, statSync } from "node:fs";
268
303
  import path3 from "node:path";
304
+ var RESULT_INDEX_FILENAME = "index.jsonl";
305
+ var LEGACY_RESULTS_FILENAME = "results.jsonl";
306
+ function createRunDirName(timestamp = /* @__PURE__ */ new Date()) {
307
+ return `eval_${timestamp.toISOString().replace(/[:.]/g, "-")}`;
308
+ }
309
+ function buildDefaultRunDir(cwd) {
310
+ return path3.join(cwd, ".agentv", "results", "raw", createRunDirName());
311
+ }
312
+ function resolveRunIndexPath(runDir) {
313
+ return path3.join(runDir, RESULT_INDEX_FILENAME);
314
+ }
315
+ function resolveRunLegacyResultsPath(runDir) {
316
+ return path3.join(runDir, LEGACY_RESULTS_FILENAME);
317
+ }
318
+ function resolveExistingRunPrimaryPath(runDir) {
319
+ const indexPath = resolveRunIndexPath(runDir);
320
+ if (existsSync(indexPath)) {
321
+ return indexPath;
322
+ }
323
+ const legacyPath = resolveRunLegacyResultsPath(runDir);
324
+ if (existsSync(legacyPath)) {
325
+ return legacyPath;
326
+ }
327
+ return void 0;
328
+ }
329
+ function isDirectoryPath(filePath) {
330
+ try {
331
+ return statSync(filePath).isDirectory();
332
+ } catch {
333
+ return false;
334
+ }
335
+ }
336
+ function resolveWorkspaceOrFilePath(filePath) {
337
+ if (!isDirectoryPath(filePath)) {
338
+ return filePath;
339
+ }
340
+ const existing = resolveExistingRunPrimaryPath(filePath);
341
+ if (!existing) {
342
+ throw new Error(
343
+ `Result workspace is missing ${RESULT_INDEX_FILENAME} and ${LEGACY_RESULTS_FILENAME}: ${filePath}`
344
+ );
345
+ }
346
+ return existing;
347
+ }
348
+
349
+ // src/commands/eval/artifact-writer.ts
269
350
  var PASS_THRESHOLD = 0.8;
270
351
  function computeStats(values) {
271
352
  if (values.length === 0) {
@@ -480,33 +561,74 @@ function buildBenchmarkArtifact(results, evalFile = "") {
480
561
  notes
481
562
  };
482
563
  }
483
- function buildAggregateGradingArtifact(results) {
484
- const assertions = [];
485
- for (const result of results) {
486
- if (!result.assertions) continue;
487
- const testId = result.testId ?? "unknown";
488
- for (const a of result.assertions) {
489
- assertions.push({
490
- test_id: testId,
491
- text: a.text,
492
- passed: a.passed,
493
- evidence: a.evidence ?? ""
494
- });
495
- }
564
+ function safeArtifactPathSegment(value, fallback) {
565
+ const trimmed = value?.trim();
566
+ if (!trimmed) {
567
+ return fallback;
496
568
  }
497
- const passed = assertions.filter((a) => a.passed).length;
498
- const failed = assertions.filter((a) => !a.passed).length;
499
- const total = assertions.length;
569
+ return trimmed.replace(/[/\\:*?"<>|]/g, "_");
570
+ }
571
+ function safeTestId(testId) {
572
+ return safeArtifactPathSegment(testId, "unknown");
573
+ }
574
+ function safeTargetId(target) {
575
+ return safeArtifactPathSegment(target, "default");
576
+ }
577
+ function getEvalSet(result) {
578
+ const record = result;
579
+ return result.eval_set ?? record.evalSet;
580
+ }
581
+ function buildArtifactSubdir(result) {
582
+ const segments = [];
583
+ const evalSet = getEvalSet(result);
584
+ if (evalSet) {
585
+ segments.push(safeArtifactPathSegment(evalSet, "default"));
586
+ }
587
+ segments.push(safeTestId(result.testId), safeTargetId(result.target));
588
+ return path4.posix.join(...segments);
589
+ }
590
+ function formatOutputMarkdown(output) {
591
+ return output.map((msg) => `@[${msg.role}]:
592
+ ${String(msg.content ?? "")}`).join("\n\n");
593
+ }
594
+ function extractInput(result) {
595
+ const input = result.input;
596
+ if (!input) return null;
597
+ if (typeof input === "string") return input;
598
+ if (Array.isArray(input) && input.length > 0) {
599
+ return formatOutputMarkdown(input);
600
+ }
601
+ return null;
602
+ }
603
+ function buildResultIndexArtifact(result) {
604
+ const artifactSubdir = buildArtifactSubdir(result);
605
+ const input = extractInput(result);
606
+ const hasResponse = Array.isArray(result.output) && result.output.length > 0;
500
607
  return {
501
- assertions,
502
- summary: {
503
- passed,
504
- failed,
505
- total,
506
- pass_rate: total > 0 ? Math.round(passed / total * 1e3) / 1e3 : 0
507
- }
608
+ timestamp: result.timestamp,
609
+ test_id: result.testId ?? "unknown",
610
+ eval_set: getEvalSet(result),
611
+ conversation_id: result.conversationId,
612
+ score: result.score,
613
+ target: result.target ?? "unknown",
614
+ scores: result.scores ? toSnakeCaseDeep(result.scores) : void 0,
615
+ execution_status: result.executionStatus,
616
+ error: result.error,
617
+ failure_stage: result.failureStage,
618
+ failure_reason_code: result.failureReasonCode,
619
+ workspace_path: result.workspacePath,
620
+ grading_path: path4.posix.join(artifactSubdir, "grading.json"),
621
+ timing_path: path4.posix.join(artifactSubdir, "timing.json"),
622
+ input_path: input ? path4.posix.join(artifactSubdir, "input.md") : void 0,
623
+ output_path: hasResponse ? path4.posix.join(artifactSubdir, "outputs", "response.md") : void 0,
624
+ response_path: hasResponse ? path4.posix.join(artifactSubdir, "outputs", "response.md") : void 0
508
625
  };
509
626
  }
627
+ async function writeJsonlFile(filePath, records) {
628
+ const content = records.length === 0 ? "" : `${records.map((record) => JSON.stringify(toSnakeCaseDeep(record))).join("\n")}
629
+ `;
630
+ await writeFile(filePath, content, "utf8");
631
+ }
510
632
  function toCamelCase(str) {
511
633
  return str.replace(/_([a-z])/g, (_, letter) => letter.toUpperCase());
512
634
  }
@@ -544,17 +666,39 @@ function parseJsonlResults(content) {
544
666
  return results;
545
667
  }
546
668
  async function writeArtifactsFromResults(results, outputDir, options) {
547
- const gradingDir = path3.join(outputDir, "grading");
548
- const timingPath = path3.join(outputDir, "timing.json");
549
- const benchmarkPath = path3.join(outputDir, "benchmark.json");
550
- const aggregateGradingPath = path3.join(outputDir, "grading.json");
551
- await mkdir(gradingDir, { recursive: true });
669
+ const testArtifactDir = outputDir;
670
+ const timingPath = path4.join(outputDir, "timing.json");
671
+ const benchmarkPath = path4.join(outputDir, "benchmark.json");
672
+ const indexPath = path4.join(outputDir, RESULT_INDEX_FILENAME);
673
+ const legacyResultsPath = options?.writeLegacyResults ? path4.join(outputDir, LEGACY_RESULTS_FILENAME) : void 0;
674
+ await mkdir(outputDir, { recursive: true });
675
+ const indexRecords = [];
552
676
  for (const result of results) {
553
677
  const grading = buildGradingArtifact(result);
554
- const safeTestId = (result.testId ?? "unknown").replace(/[/\\:*?"<>|]/g, "_");
555
- const gradingPath = path3.join(gradingDir, `${safeTestId}.json`);
678
+ const timing2 = buildTimingArtifact([result]);
679
+ const artifactSubdir = buildArtifactSubdir(result);
680
+ const testDir = path4.join(outputDir, artifactSubdir);
681
+ const gradingPath = path4.join(testDir, "grading.json");
682
+ const perTestTimingPath = path4.join(testDir, "timing.json");
683
+ await mkdir(testDir, { recursive: true });
556
684
  await writeFile(gradingPath, `${JSON.stringify(grading, null, 2)}
557
685
  `, "utf8");
686
+ await writeFile(perTestTimingPath, `${JSON.stringify(timing2, null, 2)}
687
+ `, "utf8");
688
+ const input = extractInput(result);
689
+ if (input) {
690
+ await writeFile(path4.join(testDir, "input.md"), input, "utf8");
691
+ }
692
+ if (result.output && result.output.length > 0) {
693
+ const outputsDir = path4.join(testDir, "outputs");
694
+ await mkdir(outputsDir, { recursive: true });
695
+ await writeFile(
696
+ path4.join(outputsDir, "response.md"),
697
+ formatOutputMarkdown(result.output),
698
+ "utf8"
699
+ );
700
+ }
701
+ indexRecords.push(buildResultIndexArtifact(result));
558
702
  }
559
703
  const timing = buildTimingArtifact(results);
560
704
  await writeFile(timingPath, `${JSON.stringify(timing, null, 2)}
@@ -562,10 +706,11 @@ async function writeArtifactsFromResults(results, outputDir, options) {
562
706
  const benchmark = buildBenchmarkArtifact(results, options?.evalFile);
563
707
  await writeFile(benchmarkPath, `${JSON.stringify(benchmark, null, 2)}
564
708
  `, "utf8");
565
- const aggregateGrading = buildAggregateGradingArtifact(results);
566
- await writeFile(aggregateGradingPath, `${JSON.stringify(aggregateGrading, null, 2)}
567
- `, "utf8");
568
- return { gradingDir, timingPath, benchmarkPath, aggregateGradingPath };
709
+ await writeJsonlFile(indexPath, indexRecords);
710
+ if (legacyResultsPath) {
711
+ await writeJsonlFile(legacyResultsPath, results);
712
+ }
713
+ return { testArtifactDir, timingPath, benchmarkPath, indexPath, legacyResultsPath };
569
714
  }
570
715
 
571
716
  // src/commands/eval/benchmark-writer.ts
@@ -616,13 +761,13 @@ async function writeBenchmarkJson(outputPath, results) {
616
761
  // src/commands/eval/env.ts
617
762
  import { constants as constants3 } from "node:fs";
618
763
  import { access as access3 } from "node:fs/promises";
619
- import path4 from "node:path";
764
+ import path5 from "node:path";
620
765
  import { config as loadDotenv } from "dotenv";
621
766
  function uniqueDirs(directories) {
622
767
  const seen = /* @__PURE__ */ new Set();
623
768
  const result = [];
624
769
  for (const dir of directories) {
625
- const absolute = path4.resolve(dir);
770
+ const absolute = path5.resolve(dir);
626
771
  if (seen.has(absolute)) {
627
772
  continue;
628
773
  }
@@ -641,14 +786,14 @@ async function fileExists2(filePath) {
641
786
  }
642
787
  function collectAncestorDirectories(start, boundary) {
643
788
  const directories = [];
644
- const boundaryDir = path4.resolve(boundary);
645
- let current = path4.resolve(start);
789
+ const boundaryDir = path5.resolve(boundary);
790
+ let current = path5.resolve(start);
646
791
  while (current !== void 0) {
647
792
  directories.push(current);
648
793
  if (current === boundaryDir) {
649
794
  break;
650
795
  }
651
- const parent = path4.dirname(current);
796
+ const parent = path5.dirname(current);
652
797
  if (parent === current) {
653
798
  break;
654
799
  }
@@ -658,12 +803,12 @@ function collectAncestorDirectories(start, boundary) {
658
803
  }
659
804
  async function loadEnvFromHierarchy(options) {
660
805
  const { testFilePath, repoRoot, verbose } = options;
661
- const testDir = path4.dirname(path4.resolve(testFilePath));
806
+ const testDir = path5.dirname(path5.resolve(testFilePath));
662
807
  const cwd = process.cwd();
663
808
  const searchDirs = uniqueDirs([...collectAncestorDirectories(testDir, repoRoot), repoRoot, cwd]);
664
809
  const envFiles = [];
665
810
  for (const dir of searchDirs) {
666
- const candidate = path4.join(dir, ".env");
811
+ const candidate = path5.join(dir, ".env");
667
812
  if (await fileExists2(candidate)) {
668
813
  envFiles.push(candidate);
669
814
  }
@@ -685,11 +830,11 @@ async function loadEnvFromHierarchy(options) {
685
830
  }
686
831
 
687
832
  // src/commands/eval/output-writer.ts
688
- import path10 from "node:path";
833
+ import path11 from "node:path";
689
834
 
690
835
  // src/commands/eval/html-writer.ts
691
836
  import { mkdir as mkdir2, writeFile as writeFile3 } from "node:fs/promises";
692
- import path5 from "node:path";
837
+ import path6 from "node:path";
693
838
 
694
839
  // ../../node_modules/.bun/async-mutex@0.5.0/node_modules/async-mutex/index.mjs
695
840
  var E_TIMEOUT = new Error("timeout while waiting for mutex to become available");
@@ -908,7 +1053,7 @@ var HtmlWriter = class _HtmlWriter {
908
1053
  this.filePath = filePath;
909
1054
  }
910
1055
  static async open(filePath) {
911
- await mkdir2(path5.dirname(filePath), { recursive: true });
1056
+ await mkdir2(path6.dirname(filePath), { recursive: true });
912
1057
  const writer = new _HtmlWriter(filePath);
913
1058
  await writer.writeHtml();
914
1059
  return writer;
@@ -1419,34 +1564,7 @@ var SCRIPT = `
1419
1564
 
1420
1565
  // src/commands/eval/json-writer.ts
1421
1566
  import { mkdir as mkdir3, writeFile as writeFile4 } from "node:fs/promises";
1422
- import path6 from "node:path";
1423
-
1424
- // src/utils/case-conversion.ts
1425
- function toSnakeCase(str) {
1426
- if (/^[A-Z]/.test(str)) {
1427
- return str;
1428
- }
1429
- return str.replace(/[A-Z]/g, (letter) => `_${letter.toLowerCase()}`);
1430
- }
1431
- function toSnakeCaseDeep(obj) {
1432
- if (obj === null || obj === void 0) {
1433
- return obj;
1434
- }
1435
- if (Array.isArray(obj)) {
1436
- return obj.map((item) => toSnakeCaseDeep(item));
1437
- }
1438
- if (typeof obj === "object") {
1439
- const result = {};
1440
- for (const [key, value] of Object.entries(obj)) {
1441
- const snakeKey = toSnakeCase(key);
1442
- result[snakeKey] = toSnakeCaseDeep(value);
1443
- }
1444
- return result;
1445
- }
1446
- return obj;
1447
- }
1448
-
1449
- // src/commands/eval/json-writer.ts
1567
+ import path7 from "node:path";
1450
1568
  var JsonWriter = class _JsonWriter {
1451
1569
  filePath;
1452
1570
  results = [];
@@ -1455,7 +1573,7 @@ var JsonWriter = class _JsonWriter {
1455
1573
  this.filePath = filePath;
1456
1574
  }
1457
1575
  static async open(filePath) {
1458
- await mkdir3(path6.dirname(filePath), { recursive: true });
1576
+ await mkdir3(path7.dirname(filePath), { recursive: true });
1459
1577
  return new _JsonWriter(filePath);
1460
1578
  }
1461
1579
  async append(result) {
@@ -1490,7 +1608,7 @@ var JsonWriter = class _JsonWriter {
1490
1608
  // src/commands/eval/jsonl-writer.ts
1491
1609
  import { createWriteStream } from "node:fs";
1492
1610
  import { mkdir as mkdir4 } from "node:fs/promises";
1493
- import path7 from "node:path";
1611
+ import path8 from "node:path";
1494
1612
  import { finished } from "node:stream/promises";
1495
1613
  var JsonlWriter = class _JsonlWriter {
1496
1614
  stream;
@@ -1500,7 +1618,7 @@ var JsonlWriter = class _JsonlWriter {
1500
1618
  this.stream = stream;
1501
1619
  }
1502
1620
  static async open(filePath) {
1503
- await mkdir4(path7.dirname(filePath), { recursive: true });
1621
+ await mkdir4(path8.dirname(filePath), { recursive: true });
1504
1622
  const stream = createWriteStream(filePath, { flags: "w", encoding: "utf8" });
1505
1623
  return new _JsonlWriter(stream);
1506
1624
  }
@@ -1532,7 +1650,7 @@ var JsonlWriter = class _JsonlWriter {
1532
1650
 
1533
1651
  // src/commands/eval/junit-writer.ts
1534
1652
  import { mkdir as mkdir5, writeFile as writeFile5 } from "node:fs/promises";
1535
- import path8 from "node:path";
1653
+ import path9 from "node:path";
1536
1654
  function escapeXml(str) {
1537
1655
  return str.replace(/&/g, "&amp;").replace(/</g, "&lt;").replace(/>/g, "&gt;").replace(/"/g, "&quot;").replace(/'/g, "&apos;");
1538
1656
  }
@@ -1544,7 +1662,7 @@ var JunitWriter = class _JunitWriter {
1544
1662
  this.filePath = filePath;
1545
1663
  }
1546
1664
  static async open(filePath) {
1547
- await mkdir5(path8.dirname(filePath), { recursive: true });
1665
+ await mkdir5(path9.dirname(filePath), { recursive: true });
1548
1666
  return new _JunitWriter(filePath);
1549
1667
  }
1550
1668
  async append(result) {
@@ -1613,7 +1731,7 @@ ${suiteXmls.join("\n")}
1613
1731
  // src/commands/eval/yaml-writer.ts
1614
1732
  import { createWriteStream as createWriteStream2 } from "node:fs";
1615
1733
  import { mkdir as mkdir6 } from "node:fs/promises";
1616
- import path9 from "node:path";
1734
+ import path10 from "node:path";
1617
1735
  import { finished as finished2 } from "node:stream/promises";
1618
1736
  import { stringify as stringifyYaml } from "yaml";
1619
1737
  var YamlWriter = class _YamlWriter {
@@ -1625,7 +1743,7 @@ var YamlWriter = class _YamlWriter {
1625
1743
  this.stream = stream;
1626
1744
  }
1627
1745
  static async open(filePath) {
1628
- await mkdir6(path9.dirname(filePath), { recursive: true });
1746
+ await mkdir6(path10.dirname(filePath), { recursive: true });
1629
1747
  const stream = createWriteStream2(filePath, { flags: "w", encoding: "utf8" });
1630
1748
  return new _YamlWriter(stream);
1631
1749
  }
@@ -1681,7 +1799,7 @@ async function createOutputWriter(filePath, format) {
1681
1799
  }
1682
1800
  var SUPPORTED_EXTENSIONS = /* @__PURE__ */ new Set([".jsonl", ".json", ".xml", ".yaml", ".yml", ".html", ".htm"]);
1683
1801
  function createWriterFromPath(filePath) {
1684
- const ext = path10.extname(filePath).toLowerCase();
1802
+ const ext = path11.extname(filePath).toLowerCase();
1685
1803
  switch (ext) {
1686
1804
  case ".jsonl":
1687
1805
  return JsonlWriter.open(filePath);
@@ -1788,12 +1906,12 @@ var ProgressDisplay = class {
1788
1906
  }
1789
1907
  addLogPaths(paths, provider) {
1790
1908
  const newPaths = [];
1791
- for (const path14 of paths) {
1792
- if (this.logPathSet.has(path14)) {
1909
+ for (const path16 of paths) {
1910
+ if (this.logPathSet.has(path16)) {
1793
1911
  continue;
1794
1912
  }
1795
- this.logPathSet.add(path14);
1796
- newPaths.push(path14);
1913
+ this.logPathSet.add(path16);
1914
+ newPaths.push(path16);
1797
1915
  }
1798
1916
  if (newPaths.length === 0) {
1799
1917
  return;
@@ -1806,8 +1924,8 @@ var ProgressDisplay = class {
1806
1924
  this.hasPrintedLogHeader = true;
1807
1925
  }
1808
1926
  const startIndex = this.logPaths.length - newPaths.length;
1809
- newPaths.forEach((path14, offset) => {
1810
- console.log(`${startIndex + offset + 1}. ${path14}`);
1927
+ newPaths.forEach((path16, offset) => {
1928
+ console.log(`${startIndex + offset + 1}. ${path16}`);
1811
1929
  });
1812
1930
  }
1813
1931
  finish() {
@@ -1818,81 +1936,198 @@ var ProgressDisplay = class {
1818
1936
  }
1819
1937
  };
1820
1938
 
1821
- // src/commands/eval/retry-errors.ts
1822
- import { createReadStream } from "node:fs";
1823
- import { createInterface } from "node:readline";
1824
- function getTestId(result) {
1825
- return result.testId ?? result.test_id;
1939
+ // src/commands/results/manifest.ts
1940
+ import { existsSync as existsSync2, readFileSync } from "node:fs";
1941
+ import path12 from "node:path";
1942
+ function parseJsonlLines(content) {
1943
+ return content.split(/\r?\n/).map((line) => line.trim()).filter((line) => line.length > 0).map((line) => JSON.parse(line));
1826
1944
  }
1827
- function getExecutionStatus(result) {
1828
- return result.executionStatus ?? result.execution_status;
1945
+ function isIndexManifestPath(sourceFile) {
1946
+ return path12.basename(sourceFile) === RESULT_INDEX_FILENAME;
1829
1947
  }
1830
- function toEvaluationResult(result) {
1831
- if (result.testId !== void 0 && result.executionStatus !== void 0) {
1832
- return result;
1948
+ function parseMarkdownMessages(content) {
1949
+ const trimmed = content.trim();
1950
+ if (!trimmed.startsWith("@[")) {
1951
+ return [];
1952
+ }
1953
+ const matches = [...trimmed.matchAll(/^@\[(.+?)\]:\n([\s\S]*?)(?=^@\[(.+?)\]:\n|\s*$)/gm)];
1954
+ return matches.map((match) => ({
1955
+ role: match[1],
1956
+ content: match[2].trimEnd()
1957
+ }));
1958
+ }
1959
+ function readOptionalText(baseDir, relativePath) {
1960
+ if (!relativePath) {
1961
+ return void 0;
1962
+ }
1963
+ const absolutePath = path12.join(baseDir, relativePath);
1964
+ if (!existsSync2(absolutePath)) {
1965
+ return void 0;
1966
+ }
1967
+ return readFileSync(absolutePath, "utf8");
1968
+ }
1969
+ function readOptionalJson(baseDir, relativePath) {
1970
+ const text = readOptionalText(baseDir, relativePath);
1971
+ if (!text) {
1972
+ return void 0;
1833
1973
  }
1974
+ try {
1975
+ return JSON.parse(text);
1976
+ } catch {
1977
+ return void 0;
1978
+ }
1979
+ }
1980
+ function hydrateInput(baseDir, record) {
1981
+ const inputText = readOptionalText(baseDir, record.input_path);
1982
+ if (!inputText) {
1983
+ return void 0;
1984
+ }
1985
+ const messages = parseMarkdownMessages(inputText);
1986
+ return messages.length > 0 ? messages : [{ role: "user", content: inputText.trimEnd() }];
1987
+ }
1988
+ function hydrateOutput(baseDir, record) {
1989
+ const responseText = readOptionalText(baseDir, record.output_path ?? record.response_path);
1990
+ if (!responseText) {
1991
+ return void 0;
1992
+ }
1993
+ const messages = parseMarkdownMessages(responseText);
1994
+ if (messages.length > 0) {
1995
+ return messages.map((message) => ({
1996
+ role: message.role,
1997
+ content: message.content
1998
+ }));
1999
+ }
2000
+ return [{ role: "assistant", content: responseText.trimEnd() }];
2001
+ }
2002
+ function hydrateManifestRecord(baseDir, record) {
2003
+ const grading = readOptionalJson(baseDir, record.grading_path);
2004
+ const timing = readOptionalJson(baseDir, record.timing_path);
2005
+ const testId = record.test_id ?? record.eval_id ?? "unknown";
1834
2006
  return {
1835
- ...result,
1836
- testId: getTestId(result) ?? "",
1837
- executionStatus: getExecutionStatus(result)
2007
+ timestamp: record.timestamp,
2008
+ testId,
2009
+ eval_set: record.eval_set,
2010
+ target: record.target,
2011
+ score: record.score,
2012
+ executionStatus: record.execution_status,
2013
+ error: record.error,
2014
+ assertions: grading?.assertions.map((assertion) => ({
2015
+ text: assertion.text,
2016
+ passed: assertion.passed,
2017
+ evidence: assertion.evidence
2018
+ })),
2019
+ scores: grading?.evaluators?.map((evaluator) => ({
2020
+ name: evaluator.name,
2021
+ type: evaluator.type,
2022
+ score: evaluator.score,
2023
+ assertions: Array.isArray(evaluator.assertions) ? evaluator.assertions.map((assertion) => ({
2024
+ text: String(assertion.text ?? ""),
2025
+ passed: Boolean(assertion.passed),
2026
+ evidence: typeof assertion.evidence === "string" ? String(assertion.evidence) : void 0
2027
+ })) : void 0,
2028
+ weight: typeof evaluator.weight === "number" ? evaluator.weight : void 0,
2029
+ verdict: typeof evaluator.verdict === "string" ? evaluator.verdict : void 0,
2030
+ details: evaluator.details
2031
+ })) ?? record.scores,
2032
+ tokenUsage: timing?.token_usage ? {
2033
+ input: timing.token_usage.input,
2034
+ output: timing.token_usage.output,
2035
+ reasoning: timing.token_usage.reasoning
2036
+ } : record.token_usage,
2037
+ durationMs: timing?.duration_ms ?? record.duration_ms,
2038
+ costUsd: record.cost_usd,
2039
+ input: hydrateInput(baseDir, record),
2040
+ output: hydrateOutput(baseDir, record)
1838
2041
  };
1839
2042
  }
1840
- async function loadErrorTestIds(jsonlPath) {
1841
- const ids = [];
1842
- const rl = createInterface({
1843
- input: createReadStream(jsonlPath),
1844
- crlfDelay: Number.POSITIVE_INFINITY
1845
- });
1846
- for await (const line of rl) {
2043
+ function parseResultManifest(content) {
2044
+ return parseJsonlLines(content);
2045
+ }
2046
+ function resolveResultSourcePath(source, cwd) {
2047
+ const resolved = path12.isAbsolute(source) ? source : path12.resolve(cwd ?? process.cwd(), source);
2048
+ return resolveWorkspaceOrFilePath(resolved);
2049
+ }
2050
+ function loadManifestResults(sourceFile) {
2051
+ const resolvedSourceFile = resolveWorkspaceOrFilePath(sourceFile);
2052
+ if (!isIndexManifestPath(resolvedSourceFile)) {
2053
+ return parseJsonlResults(readFileSync(resolvedSourceFile, "utf8"));
2054
+ }
2055
+ const content = readFileSync(resolvedSourceFile, "utf8");
2056
+ const records = parseResultManifest(content);
2057
+ const baseDir = path12.dirname(resolvedSourceFile);
2058
+ return records.map((record) => hydrateManifestRecord(baseDir, record));
2059
+ }
2060
+ function loadLightweightResults(sourceFile) {
2061
+ const resolvedSourceFile = resolveWorkspaceOrFilePath(sourceFile);
2062
+ const content = readFileSync(resolvedSourceFile, "utf8");
2063
+ if (isIndexManifestPath(resolvedSourceFile)) {
2064
+ return parseResultManifest(content).map((record) => ({
2065
+ testId: record.test_id ?? record.eval_id ?? "unknown",
2066
+ target: record.target,
2067
+ score: record.score,
2068
+ scores: record.scores,
2069
+ executionStatus: record.execution_status,
2070
+ error: record.error,
2071
+ timestamp: record.timestamp
2072
+ }));
2073
+ }
2074
+ const records = [];
2075
+ for (const line of content.split(/\r?\n/)) {
1847
2076
  const trimmed = line.trim();
1848
- if (!trimmed) continue;
2077
+ if (!trimmed) {
2078
+ continue;
2079
+ }
2080
+ let record;
1849
2081
  try {
1850
- const parsed = JSON.parse(trimmed);
1851
- const executionStatus = getExecutionStatus(parsed);
1852
- const testId = getTestId(parsed);
1853
- if (executionStatus === "execution_error" && testId) {
1854
- ids.push(testId);
1855
- }
2082
+ record = JSON.parse(trimmed);
1856
2083
  } catch {
2084
+ continue;
1857
2085
  }
2086
+ const rawTestId = record.test_id ?? record.eval_id ?? record.testId ?? record.evalId;
2087
+ if (typeof rawTestId !== "string") {
2088
+ throw new Error(`Missing test_id in result: ${trimmed}`);
2089
+ }
2090
+ if (typeof record.score !== "number") {
2091
+ throw new Error(`Missing or invalid score in result: ${trimmed}`);
2092
+ }
2093
+ records.push({
2094
+ testId: rawTestId,
2095
+ target: typeof record.target === "string" ? record.target : void 0,
2096
+ score: record.score,
2097
+ scores: Array.isArray(record.scores) ? record.scores : void 0,
2098
+ executionStatus: typeof record.execution_status === "string" ? record.execution_status : typeof record.executionStatus === "string" ? record.executionStatus : void 0,
2099
+ error: typeof record.error === "string" ? record.error : void 0,
2100
+ timestamp: typeof record.timestamp === "string" ? record.timestamp : void 0
2101
+ });
1858
2102
  }
2103
+ return records;
2104
+ }
2105
+
2106
+ // src/commands/eval/retry-errors.ts
2107
+ async function loadErrorTestIds(jsonlPath) {
2108
+ const resolvedPath = resolveResultSourcePath(jsonlPath);
2109
+ const ids = loadLightweightResults(resolvedPath).filter((result) => result.executionStatus === "execution_error").map((result) => result.testId);
1859
2110
  return [...new Set(ids)];
1860
2111
  }
1861
2112
  async function loadNonErrorResults(jsonlPath) {
1862
- const results = [];
1863
- const rl = createInterface({
1864
- input: createReadStream(jsonlPath),
1865
- crlfDelay: Number.POSITIVE_INFINITY
1866
- });
1867
- for await (const line of rl) {
1868
- const trimmed = line.trim();
1869
- if (!trimmed) continue;
1870
- try {
1871
- const parsed = JSON.parse(trimmed);
1872
- const testId = getTestId(parsed);
1873
- const executionStatus = getExecutionStatus(parsed);
1874
- if (!testId || parsed.score === void 0) continue;
1875
- if (executionStatus !== "execution_error") {
1876
- results.push(toEvaluationResult(parsed));
1877
- }
1878
- } catch {
1879
- }
1880
- }
1881
- return results;
2113
+ const resolvedPath = resolveResultSourcePath(jsonlPath);
2114
+ return loadManifestResults(resolvedPath).filter(
2115
+ (result) => result.testId && result.executionStatus !== "execution_error"
2116
+ );
1882
2117
  }
1883
2118
 
1884
2119
  // src/commands/eval/run-cache.ts
1885
2120
  import { mkdir as mkdir7, readFile as readFile2, writeFile as writeFile6 } from "node:fs/promises";
1886
- import path11 from "node:path";
2121
+ import path13 from "node:path";
1887
2122
  var CACHE_FILENAME = "cache.json";
1888
2123
  function resolveRunCacheFile(cache) {
1889
2124
  if (cache.lastRunDir) {
1890
- return path11.join(cache.lastRunDir, "results.jsonl");
2125
+ return resolveExistingRunPrimaryPath(cache.lastRunDir) ?? resolveRunIndexPath(cache.lastRunDir);
1891
2126
  }
1892
2127
  return cache.lastResultFile ?? "";
1893
2128
  }
1894
2129
  function cachePath(cwd) {
1895
- return path11.join(cwd, ".agentv", CACHE_FILENAME);
2130
+ return path13.join(cwd, ".agentv", CACHE_FILENAME);
1896
2131
  }
1897
2132
  async function loadRunCache(cwd) {
1898
2133
  try {
@@ -1902,11 +2137,15 @@ async function loadRunCache(cwd) {
1902
2137
  return void 0;
1903
2138
  }
1904
2139
  }
1905
- async function saveRunCache(cwd, runDir) {
1906
- const dir = path11.join(cwd, ".agentv");
2140
+ async function saveRunCache(cwd, resultPath) {
2141
+ const dir = path13.join(cwd, ".agentv");
1907
2142
  await mkdir7(dir, { recursive: true });
1908
- const cache = {
1909
- lastRunDir: runDir,
2143
+ const basename = path13.basename(resultPath);
2144
+ const cache = basename === RESULT_INDEX_FILENAME || basename === LEGACY_RESULTS_FILENAME ? {
2145
+ lastRunDir: path13.dirname(resultPath),
2146
+ timestamp: (/* @__PURE__ */ new Date()).toISOString()
2147
+ } : {
2148
+ lastResultFile: resultPath,
1910
2149
  timestamp: (/* @__PURE__ */ new Date()).toISOString()
1911
2150
  };
1912
2151
  await writeFile6(cachePath(cwd), `${JSON.stringify(cache, null, 2)}
@@ -2162,7 +2401,7 @@ function formatMatrixSummary(results) {
2162
2401
 
2163
2402
  // ../../packages/core/dist/evaluation/validation/index.js
2164
2403
  import { readFile as readFile3 } from "node:fs/promises";
2165
- import path12 from "node:path";
2404
+ import path14 from "node:path";
2166
2405
  import { parse } from "yaml";
2167
2406
  import { readFile as readFile22 } from "node:fs/promises";
2168
2407
  import path22 from "node:path";
@@ -2205,8 +2444,8 @@ async function detectFileType(filePath) {
2205
2444
  }
2206
2445
  }
2207
2446
  function inferFileTypeFromPath(filePath) {
2208
- const normalized = path12.normalize(filePath).replace(/\\/g, "/");
2209
- const basename = path12.basename(filePath);
2447
+ const normalized = path14.normalize(filePath).replace(/\\/g, "/");
2448
+ const basename = path14.basename(filePath);
2210
2449
  if (normalized.includes("/.agentv/")) {
2211
2450
  if (basename === "config.yaml" || basename === "config.yml") {
2212
2451
  return "config";
@@ -3834,11 +4073,9 @@ async function ensureFileExists(filePath, description) {
3834
4073
  }
3835
4074
  }
3836
4075
  function buildDefaultOutputPath(cwd) {
3837
- const timestamp = (/* @__PURE__ */ new Date()).toISOString().replace(/[:.]/g, "-");
3838
- const dirName = `eval_${timestamp}`;
3839
- const runDir = path13.join(cwd, ".agentv", "results", "raw", dirName);
4076
+ const runDir = buildDefaultRunDir(cwd);
3840
4077
  mkdirSync(runDir, { recursive: true });
3841
- return path13.join(runDir, "results.jsonl");
4078
+ return path15.join(runDir, "index.jsonl");
3842
4079
  }
3843
4080
  function createProgressReporter(maxWorkers, options) {
3844
4081
  const display = new ProgressDisplay(maxWorkers, options);
@@ -3852,7 +4089,7 @@ function createProgressReporter(maxWorkers, options) {
3852
4089
  };
3853
4090
  }
3854
4091
  function makeEvalKey(testFilePath, evalId) {
3855
- return `${path13.resolve(testFilePath)}::${evalId}`;
4092
+ return `${path15.resolve(testFilePath)}::${evalId}`;
3856
4093
  }
3857
4094
  function createDisplayIdTracker() {
3858
4095
  const map = /* @__PURE__ */ new Map();
@@ -4041,7 +4278,8 @@ async function runSingleEvalFile(params) {
4041
4278
  vscodeCmd: vsConfig.executable
4042
4279
  });
4043
4280
  }
4044
- const streamingObserver = otelExporter?.createStreamingObserver() ?? null;
4281
+ const useStreamingObserver = !!(otelExporter && options.exportOtel);
4282
+ const streamingObserver = useStreamingObserver ? otelExporter?.createStreamingObserver() ?? null : null;
4045
4283
  const results = await evaluationRunner({
4046
4284
  testFilePath,
4047
4285
  repoRoot,
@@ -4074,6 +4312,7 @@ async function runSingleEvalFile(params) {
4074
4312
  model: options.model,
4075
4313
  streamCallbacks: streamingObserver?.getStreamCallbacks(),
4076
4314
  onResult: async (result) => {
4315
+ streamingObserver?.completeFromResult?.(result);
4077
4316
  streamingObserver?.finalizeEvalCase(result.score, result.error);
4078
4317
  const trimmedOutput = trimOutputMessages(result.output, options.outputMessages);
4079
4318
  const trimmedResult = {
@@ -4134,7 +4373,7 @@ async function runEvalCommand(input) {
4134
4373
  );
4135
4374
  }
4136
4375
  const repoRoot = await findRepoRoot(cwd);
4137
- const yamlConfig = await loadConfig(path13.join(cwd, "_"), repoRoot);
4376
+ const yamlConfig = await loadConfig(path15.join(cwd, "_"), repoRoot);
4138
4377
  if (yamlConfig?.required_version) {
4139
4378
  await enforceRequiredVersion(yamlConfig.required_version, {
4140
4379
  strict: normalizeBoolean(input.rawOptions.strict)
@@ -4146,7 +4385,7 @@ async function runEvalCommand(input) {
4146
4385
  }
4147
4386
  let retryNonErrorResults;
4148
4387
  if (options.retryErrors) {
4149
- const retryPath = path13.resolve(options.retryErrors);
4388
+ const retryPath = path15.resolve(options.retryErrors);
4150
4389
  await ensureFileExists(retryPath, "Retry-errors JSONL file");
4151
4390
  const errorIds = await loadErrorTestIds(retryPath);
4152
4391
  if (errorIds.length === 0) {
@@ -4159,7 +4398,7 @@ async function runEvalCommand(input) {
4159
4398
  retryNonErrorResults = await loadNonErrorResults(retryPath);
4160
4399
  }
4161
4400
  if (options.workspacePath) {
4162
- const resolvedWorkspace = path13.resolve(options.workspacePath);
4401
+ const resolvedWorkspace = path15.resolve(options.workspacePath);
4163
4402
  try {
4164
4403
  const { stat: stat2 } = await import("node:fs/promises");
4165
4404
  const stats = await stat2(resolvedWorkspace);
@@ -4177,11 +4416,15 @@ async function runEvalCommand(input) {
4177
4416
  if (options.verbose) {
4178
4417
  console.log(`Repository root: ${repoRoot}`);
4179
4418
  }
4419
+ const usesDefaultArtifactWorkspace = !options.outPath;
4420
+ const outputPath = options.outPath ? path15.resolve(options.outPath) : buildDefaultOutputPath(cwd);
4421
+ const defaultTraceFile = usesDefaultArtifactWorkspace && !options.traceFile ? path15.join(path15.dirname(outputPath), "trace.jsonl") : void 0;
4422
+ const traceFilePath = options.traceFile ? path15.resolve(options.traceFile) : defaultTraceFile;
4180
4423
  let otelExporter = null;
4181
- const useFileExport = !!(options.otelFile || options.traceFile);
4424
+ const useFileExport = !!(options.otelFile || traceFilePath);
4182
4425
  if (options.exportOtel || useFileExport) {
4183
4426
  try {
4184
- const { OtelTraceExporter, OTEL_BACKEND_PRESETS } = await import("./dist-QERRYDSC.js");
4427
+ const { OtelTraceExporter, OTEL_BACKEND_PRESETS } = await import("./dist-L6R5HJ72.js");
4185
4428
  let endpoint = process.env.OTEL_EXPORTER_OTLP_ENDPOINT;
4186
4429
  let headers = {};
4187
4430
  if (options.otelBackend) {
@@ -4205,8 +4448,8 @@ async function runEvalCommand(input) {
4205
4448
  headers,
4206
4449
  captureContent,
4207
4450
  groupTurns: options.otelGroupTurns,
4208
- otlpFilePath: options.otelFile ? path13.resolve(options.otelFile) : void 0,
4209
- traceFilePath: options.traceFile ? path13.resolve(options.traceFile) : void 0
4451
+ otlpFilePath: options.otelFile ? path15.resolve(options.otelFile) : void 0,
4452
+ traceFilePath
4210
4453
  });
4211
4454
  const initialized = await otelExporter.init();
4212
4455
  if (!initialized) {
@@ -4222,27 +4465,29 @@ async function runEvalCommand(input) {
4222
4465
  otelExporter = null;
4223
4466
  }
4224
4467
  }
4225
- const outputPath = options.outPath ? path13.resolve(options.outPath) : buildDefaultOutputPath(cwd);
4226
- const extraOutputPaths = options.outputPaths.map((p) => path13.resolve(p));
4227
- const allOutputPaths = extraOutputPaths.length > 0 ? [outputPath, ...extraOutputPaths] : [outputPath];
4468
+ const primaryWritePath = usesDefaultArtifactWorkspace ? path15.join(path15.dirname(outputPath), LEGACY_RESULTS_FILENAME) : outputPath;
4469
+ const extraOutputPaths = options.outputPaths.map((p) => path15.resolve(p));
4470
+ const allOutputPaths = extraOutputPaths.length > 0 ? [primaryWritePath, ...extraOutputPaths] : [primaryWritePath];
4228
4471
  const uniqueOutputPaths = [...new Set(allOutputPaths)];
4472
+ const reportedOutputPaths = extraOutputPaths.length > 0 ? [outputPath, ...extraOutputPaths] : [outputPath];
4473
+ const uniqueReportedOutputPaths = [...new Set(reportedOutputPaths)];
4229
4474
  let outputWriter;
4230
4475
  if (uniqueOutputPaths.length === 1) {
4231
- outputWriter = await createOutputWriter(outputPath, options.format);
4476
+ outputWriter = await createOutputWriter(primaryWritePath, options.format);
4232
4477
  console.log(`Output path: ${outputPath}`);
4233
4478
  } else {
4234
4479
  outputWriter = await createMultiWriter(uniqueOutputPaths);
4235
4480
  console.log("Output paths:");
4236
- for (const p of uniqueOutputPaths) {
4481
+ for (const p of uniqueReportedOutputPaths) {
4237
4482
  console.log(` ${p}`);
4238
4483
  }
4239
4484
  }
4240
- const resolvedTestFiles = input.testFiles.map((file) => path13.resolve(file));
4485
+ const resolvedTestFiles = input.testFiles.map((file) => path15.resolve(file));
4241
4486
  if (options.otelFile) {
4242
- console.log(`OTLP JSON file: ${path13.resolve(options.otelFile)}`);
4487
+ console.log(`OTLP JSON file: ${path15.resolve(options.otelFile)}`);
4243
4488
  }
4244
- if (options.traceFile) {
4245
- console.log(`Trace file: ${path13.resolve(options.traceFile)}`);
4489
+ if (traceFilePath) {
4490
+ console.log(`Trace file: ${traceFilePath}`);
4246
4491
  }
4247
4492
  const evaluationRunner = await resolveEvaluationRunner();
4248
4493
  const allResults = [];
@@ -4288,7 +4533,7 @@ async function runEvalCommand(input) {
4288
4533
  cliNoCache: options.noCache,
4289
4534
  yamlCache: yamlCacheEnabled
4290
4535
  });
4291
- const cache = cacheEnabled ? new ResponseCache(yamlCachePath ? path13.resolve(yamlCachePath) : void 0) : void 0;
4536
+ const cache = cacheEnabled ? new ResponseCache(yamlCachePath ? path15.resolve(yamlCachePath) : void 0) : void 0;
4292
4537
  const useCache = cacheEnabled;
4293
4538
  if (cacheEnabled) {
4294
4539
  console.log(`Response cache: enabled${yamlCachePath ? ` (${yamlCachePath})` : ""}`);
@@ -4420,20 +4665,51 @@ async function runEvalCommand(input) {
4420
4665
  console.log(formatMatrixSummary(allResults));
4421
4666
  }
4422
4667
  if (options.benchmarkJson && allResults.length > 0) {
4423
- const benchmarkPath = path13.resolve(options.benchmarkJson);
4668
+ const benchmarkPath = path15.resolve(options.benchmarkJson);
4424
4669
  await writeBenchmarkJson(benchmarkPath, allResults);
4425
4670
  console.log(`Benchmark written to: ${benchmarkPath}`);
4426
4671
  }
4427
- if (options.artifacts && allResults.length > 0) {
4428
- const artifactsDir = path13.resolve(options.artifacts);
4672
+ if (usesDefaultArtifactWorkspace) {
4429
4673
  const evalFile = resolvedTestFiles.length === 1 ? resolvedTestFiles[0] : "";
4674
+ const workspaceDir = path15.dirname(outputPath);
4430
4675
  const {
4431
- gradingDir,
4676
+ testArtifactDir,
4677
+ timingPath,
4678
+ benchmarkPath: workspaceBenchmarkPath,
4679
+ indexPath,
4680
+ legacyResultsPath
4681
+ } = await writeArtifactsFromResults(allResults, workspaceDir, {
4682
+ evalFile,
4683
+ writeLegacyResults: true
4684
+ });
4685
+ console.log(`Artifact workspace written to: ${workspaceDir}`);
4686
+ console.log(` Index: ${indexPath}`);
4687
+ console.log(
4688
+ ` Per-test artifacts: ${testArtifactDir} (${allResults.length} test directories)`
4689
+ );
4690
+ console.log(` Timing: ${timingPath}`);
4691
+ console.log(` Benchmark: ${workspaceBenchmarkPath}`);
4692
+ if (legacyResultsPath) {
4693
+ console.log(` Compatibility output: ${legacyResultsPath} (deprecated)`);
4694
+ }
4695
+ }
4696
+ if (options.artifacts) {
4697
+ const artifactsDir = path15.resolve(options.artifacts);
4698
+ const evalFile = resolvedTestFiles.length === 1 ? resolvedTestFiles[0] : "";
4699
+ const {
4700
+ testArtifactDir,
4701
+ indexPath,
4432
4702
  timingPath,
4433
4703
  benchmarkPath: abp
4434
- } = await writeArtifactsFromResults(allResults, artifactsDir, { evalFile });
4704
+ } = await writeArtifactsFromResults(allResults, artifactsDir, {
4705
+ evalFile,
4706
+ writeLegacyResults: false
4707
+ });
4435
4708
  console.log(`Artifacts written to: ${artifactsDir}`);
4436
- console.log(` Grading: ${gradingDir} (${allResults.length} files)`);
4709
+ console.log(` Index: ${indexPath}`);
4710
+ console.log(
4711
+ ` Per-test artifacts: ${testArtifactDir} (${allResults.length} test directories)`
4712
+ );
4437
4713
  console.log(` Timing: ${timingPath}`);
4438
4714
  console.log(` Benchmark: ${abp}`);
4439
4715
  }
@@ -4447,33 +4723,25 @@ async function runEvalCommand(input) {
4447
4723
  }
4448
4724
  }
4449
4725
  if (allResults.length > 0) {
4450
- if (uniqueOutputPaths.length === 1) {
4726
+ if (uniqueReportedOutputPaths.length === 1) {
4451
4727
  console.log(`
4452
4728
  Results written to: ${outputPath}`);
4453
4729
  } else {
4454
4730
  console.log("\nResults written to:");
4455
- for (const p of uniqueOutputPaths) {
4731
+ for (const p of uniqueReportedOutputPaths) {
4456
4732
  console.log(` ${p}`);
4457
4733
  }
4458
4734
  }
4459
- const runDir = path13.dirname(outputPath);
4460
- await saveRunCache(cwd, runDir).catch(() => void 0);
4461
- if (outputPath.endsWith(".jsonl")) {
4462
- const { writeFile: writeFile7 } = await import("node:fs/promises");
4463
- const gradingPath = path13.join(path13.dirname(outputPath), "grading.json");
4464
- const aggregateGrading = buildAggregateGradingArtifact(allResults);
4465
- await writeFile7(gradingPath, `${JSON.stringify(aggregateGrading, null, 2)}
4466
- `, "utf8");
4467
- }
4735
+ await saveRunCache(cwd, outputPath).catch(() => void 0);
4468
4736
  }
4469
4737
  if (summary.executionErrorCount > 0 && !options.retryErrors) {
4470
- const evalFileArgs = resolvedTestFiles.map((f) => path13.relative(cwd, f)).join(" ");
4738
+ const evalFileArgs = resolvedTestFiles.map((f) => path15.relative(cwd, f)).join(" ");
4471
4739
  const targetFlag = options.target ? ` --target ${options.target}` : "";
4472
- const relativeOutputPath = path13.relative(cwd, outputPath);
4740
+ const relativeOutputPath = path15.relative(cwd, outputPath);
4473
4741
  console.log(
4474
4742
  `
4475
4743
  Tip: ${summary.executionErrorCount} execution error(s) detected. Re-run failed tests with:
4476
- agentv eval run ${evalFileArgs}${targetFlag} --retry-errors ${relativeOutputPath} -o ${relativeOutputPath}`
4744
+ agentv eval run ${evalFileArgs}${targetFlag} --retry-errors ${relativeOutputPath}`
4477
4745
  );
4478
4746
  }
4479
4747
  return {
@@ -4501,7 +4769,7 @@ async function resolveEvaluationRunner() {
4501
4769
  if (!overridePath) {
4502
4770
  return runEvaluation;
4503
4771
  }
4504
- const resolved = path13.isAbsolute(overridePath) ? overridePath : path13.resolve(process.cwd(), overridePath);
4772
+ const resolved = path15.isAbsolute(overridePath) ? overridePath : path15.resolve(process.cwd(), overridePath);
4505
4773
  const moduleUrl = pathToFileURL(resolved).href;
4506
4774
  const mod = await import(moduleUrl);
4507
4775
  const candidate = mod.runEvaluation;
@@ -4516,14 +4784,17 @@ async function resolveEvaluationRunner() {
4516
4784
  export {
4517
4785
  package_default,
4518
4786
  toSnakeCaseDeep,
4787
+ RESULT_INDEX_FILENAME,
4788
+ LEGACY_RESULTS_FILENAME,
4789
+ resolveExistingRunPrimaryPath,
4790
+ resolveWorkspaceOrFilePath,
4791
+ writeArtifactsFromResults,
4792
+ resolveResultSourcePath,
4793
+ loadManifestResults,
4794
+ loadLightweightResults,
4519
4795
  HtmlWriter,
4520
4796
  resolveEvalPaths,
4521
4797
  findRepoRoot,
4522
- buildGradingArtifact,
4523
- buildTimingArtifact,
4524
- buildBenchmarkArtifact,
4525
- buildAggregateGradingArtifact,
4526
- parseJsonlResults,
4527
4798
  resolveRunCacheFile,
4528
4799
  loadRunCache,
4529
4800
  detectFileType,
@@ -4536,4 +4807,4 @@ export {
4536
4807
  selectTarget,
4537
4808
  runEvalCommand
4538
4809
  };
4539
- //# sourceMappingURL=chunk-JEW3FEO7.js.map
4810
+ //# sourceMappingURL=chunk-VLOFRXH4.js.map