agentv 3.11.0 → 3.12.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/README.md +16 -12
- package/dist/{chunk-ETMDLQ72.js → chunk-2ELQ6F3C.js} +916 -523
- package/dist/chunk-2ELQ6F3C.js.map +1 -0
- package/dist/{chunk-JK6V4KVD.js → chunk-NR7QVL75.js} +32 -24
- package/dist/chunk-NR7QVL75.js.map +1 -0
- package/dist/{chunk-EZGWZVVK.js → chunk-UYBLUYHN.js} +927 -615
- package/dist/chunk-UYBLUYHN.js.map +1 -0
- package/dist/{chunk-JEW3FEO7.js → chunk-VLOFRXH4.js} +469 -198
- package/dist/chunk-VLOFRXH4.js.map +1 -0
- package/dist/cli.js +4 -4
- package/dist/{dist-QERRYDSC.js → dist-L6R5HJ72.js} +3 -3
- package/dist/index.js +4 -4
- package/dist/{interactive-AD4PRYDN.js → interactive-5X62YEEX.js} +4 -4
- package/dist/{simple-trace-file-exporter-S76DMABU-5FCJESD2.js → simple-trace-file-exporter-CRIO5HDZ-QYYT2QQT.js} +2 -2
- package/dist/templates/.agentv/.env.example +23 -0
- package/dist/templates/.agentv/config.yaml +13 -4
- package/dist/templates/.agentv/targets.yaml +16 -0
- package/package.json +9 -3
- package/dist/chunk-ETMDLQ72.js.map +0 -1
- package/dist/chunk-EZGWZVVK.js.map +0 -1
- package/dist/chunk-JEW3FEO7.js.map +0 -1
- package/dist/chunk-JK6V4KVD.js.map +0 -1
- /package/dist/{dist-QERRYDSC.js.map → dist-L6R5HJ72.js.map} +0 -0
- /package/dist/{interactive-AD4PRYDN.js.map → interactive-5X62YEEX.js.map} +0 -0
- /package/dist/{simple-trace-file-exporter-S76DMABU-5FCJESD2.js.map → simple-trace-file-exporter-CRIO5HDZ-QYYT2QQT.js.map} +0 -0
|
@@ -27,12 +27,12 @@ import {
|
|
|
27
27
|
subscribeToCopilotCliLogEntries,
|
|
28
28
|
subscribeToCopilotSdkLogEntries,
|
|
29
29
|
subscribeToPiLogEntries
|
|
30
|
-
} from "./chunk-EZGWZVVK.js";
|
|
30
|
+
} from "./chunk-UYBLUYHN.js";
|
|
31
31
|
|
|
32
32
|
// package.json
|
|
33
33
|
var package_default = {
|
|
34
34
|
name: "agentv",
|
|
35
|
-
version: "3.11.0",
|
|
35
|
+
version: "3.12.0",
|
|
36
36
|
description: "CLI entry point for AgentV",
|
|
37
37
|
type: "module",
|
|
38
38
|
repository: {
|
|
@@ -65,8 +65,6 @@ var package_default = {
|
|
|
65
65
|
"@github/copilot-sdk": "^0.1.25",
|
|
66
66
|
"@hono/node-server": "^1.19.11",
|
|
67
67
|
"@inquirer/prompts": "^8.2.1",
|
|
68
|
-
"@mariozechner/pi-agent-core": "^0.54.2",
|
|
69
|
-
"@mariozechner/pi-ai": "^0.54.2",
|
|
70
68
|
"@openai/codex-sdk": "^0.104.0",
|
|
71
69
|
"cmd-ts": "^0.14.3",
|
|
72
70
|
dotenv: "^16.4.5",
|
|
@@ -77,6 +75,14 @@ var package_default = {
|
|
|
77
75
|
semver: "^7.7.4",
|
|
78
76
|
yaml: "^2.6.1"
|
|
79
77
|
},
|
|
78
|
+
peerDependencies: {
|
|
79
|
+
"@mariozechner/pi-coding-agent": "^0.62.0"
|
|
80
|
+
},
|
|
81
|
+
peerDependenciesMeta: {
|
|
82
|
+
"@mariozechner/pi-coding-agent": {
|
|
83
|
+
optional: true
|
|
84
|
+
}
|
|
85
|
+
},
|
|
80
86
|
devDependencies: {
|
|
81
87
|
"@agentv/core": "workspace:*",
|
|
82
88
|
"@types/semver": "^7.7.1",
|
|
@@ -206,7 +212,7 @@ async function discoverTargetsFile(options) {
|
|
|
206
212
|
// src/commands/eval/run-eval.ts
|
|
207
213
|
import { constants as constants4, mkdirSync } from "node:fs";
|
|
208
214
|
import { access as access4 } from "node:fs/promises";
|
|
209
|
-
import
|
|
215
|
+
import path15 from "node:path";
|
|
210
216
|
import { pathToFileURL } from "node:url";
|
|
211
217
|
|
|
212
218
|
// src/version-check.ts
|
|
@@ -265,7 +271,82 @@ async function promptContinue() {
|
|
|
265
271
|
|
|
266
272
|
// src/commands/eval/artifact-writer.ts
|
|
267
273
|
import { mkdir, readFile, writeFile } from "node:fs/promises";
|
|
274
|
+
import path4 from "node:path";
|
|
275
|
+
|
|
276
|
+
// src/utils/case-conversion.ts
|
|
277
|
+
function toSnakeCase(str) {
|
|
278
|
+
if (/^[A-Z]/.test(str)) {
|
|
279
|
+
return str;
|
|
280
|
+
}
|
|
281
|
+
return str.replace(/[A-Z]/g, (letter) => `_${letter.toLowerCase()}`);
|
|
282
|
+
}
|
|
283
|
+
function toSnakeCaseDeep(obj) {
|
|
284
|
+
if (obj === null || obj === void 0) {
|
|
285
|
+
return obj;
|
|
286
|
+
}
|
|
287
|
+
if (Array.isArray(obj)) {
|
|
288
|
+
return obj.map((item) => toSnakeCaseDeep(item));
|
|
289
|
+
}
|
|
290
|
+
if (typeof obj === "object") {
|
|
291
|
+
const result = {};
|
|
292
|
+
for (const [key, value] of Object.entries(obj)) {
|
|
293
|
+
const snakeKey = toSnakeCase(key);
|
|
294
|
+
result[snakeKey] = toSnakeCaseDeep(value);
|
|
295
|
+
}
|
|
296
|
+
return result;
|
|
297
|
+
}
|
|
298
|
+
return obj;
|
|
299
|
+
}
|
|
300
|
+
|
|
301
|
+
// src/commands/eval/result-layout.ts
|
|
302
|
+
import { existsSync, statSync } from "node:fs";
|
|
268
303
|
import path3 from "node:path";
|
|
304
|
+
var RESULT_INDEX_FILENAME = "index.jsonl";
|
|
305
|
+
var LEGACY_RESULTS_FILENAME = "results.jsonl";
|
|
306
|
+
function createRunDirName(timestamp = /* @__PURE__ */ new Date()) {
|
|
307
|
+
return `eval_${timestamp.toISOString().replace(/[:.]/g, "-")}`;
|
|
308
|
+
}
|
|
309
|
+
function buildDefaultRunDir(cwd) {
|
|
310
|
+
return path3.join(cwd, ".agentv", "results", "raw", createRunDirName());
|
|
311
|
+
}
|
|
312
|
+
function resolveRunIndexPath(runDir) {
|
|
313
|
+
return path3.join(runDir, RESULT_INDEX_FILENAME);
|
|
314
|
+
}
|
|
315
|
+
function resolveRunLegacyResultsPath(runDir) {
|
|
316
|
+
return path3.join(runDir, LEGACY_RESULTS_FILENAME);
|
|
317
|
+
}
|
|
318
|
+
function resolveExistingRunPrimaryPath(runDir) {
|
|
319
|
+
const indexPath = resolveRunIndexPath(runDir);
|
|
320
|
+
if (existsSync(indexPath)) {
|
|
321
|
+
return indexPath;
|
|
322
|
+
}
|
|
323
|
+
const legacyPath = resolveRunLegacyResultsPath(runDir);
|
|
324
|
+
if (existsSync(legacyPath)) {
|
|
325
|
+
return legacyPath;
|
|
326
|
+
}
|
|
327
|
+
return void 0;
|
|
328
|
+
}
|
|
329
|
+
function isDirectoryPath(filePath) {
|
|
330
|
+
try {
|
|
331
|
+
return statSync(filePath).isDirectory();
|
|
332
|
+
} catch {
|
|
333
|
+
return false;
|
|
334
|
+
}
|
|
335
|
+
}
|
|
336
|
+
function resolveWorkspaceOrFilePath(filePath) {
|
|
337
|
+
if (!isDirectoryPath(filePath)) {
|
|
338
|
+
return filePath;
|
|
339
|
+
}
|
|
340
|
+
const existing = resolveExistingRunPrimaryPath(filePath);
|
|
341
|
+
if (!existing) {
|
|
342
|
+
throw new Error(
|
|
343
|
+
`Result workspace is missing ${RESULT_INDEX_FILENAME} and ${LEGACY_RESULTS_FILENAME}: ${filePath}`
|
|
344
|
+
);
|
|
345
|
+
}
|
|
346
|
+
return existing;
|
|
347
|
+
}
|
|
348
|
+
|
|
349
|
+
// src/commands/eval/artifact-writer.ts
|
|
269
350
|
var PASS_THRESHOLD = 0.8;
|
|
270
351
|
function computeStats(values) {
|
|
271
352
|
if (values.length === 0) {
|
|
@@ -480,33 +561,74 @@ function buildBenchmarkArtifact(results, evalFile = "") {
|
|
|
480
561
|
notes
|
|
481
562
|
};
|
|
482
563
|
}
|
|
483
|
-
function
|
|
484
|
-
const
|
|
485
|
-
|
|
486
|
-
|
|
487
|
-
const testId = result.testId ?? "unknown";
|
|
488
|
-
for (const a of result.assertions) {
|
|
489
|
-
assertions.push({
|
|
490
|
-
test_id: testId,
|
|
491
|
-
text: a.text,
|
|
492
|
-
passed: a.passed,
|
|
493
|
-
evidence: a.evidence ?? ""
|
|
494
|
-
});
|
|
495
|
-
}
|
|
564
|
+
function safeArtifactPathSegment(value, fallback) {
|
|
565
|
+
const trimmed = value?.trim();
|
|
566
|
+
if (!trimmed) {
|
|
567
|
+
return fallback;
|
|
496
568
|
}
|
|
497
|
-
|
|
498
|
-
|
|
499
|
-
|
|
569
|
+
return trimmed.replace(/[/\\:*?"<>|]/g, "_");
|
|
570
|
+
}
|
|
571
|
+
function safeTestId(testId) {
|
|
572
|
+
return safeArtifactPathSegment(testId, "unknown");
|
|
573
|
+
}
|
|
574
|
+
function safeTargetId(target) {
|
|
575
|
+
return safeArtifactPathSegment(target, "default");
|
|
576
|
+
}
|
|
577
|
+
function getEvalSet(result) {
|
|
578
|
+
const record = result;
|
|
579
|
+
return result.eval_set ?? record.evalSet;
|
|
580
|
+
}
|
|
581
|
+
function buildArtifactSubdir(result) {
|
|
582
|
+
const segments = [];
|
|
583
|
+
const evalSet = getEvalSet(result);
|
|
584
|
+
if (evalSet) {
|
|
585
|
+
segments.push(safeArtifactPathSegment(evalSet, "default"));
|
|
586
|
+
}
|
|
587
|
+
segments.push(safeTestId(result.testId), safeTargetId(result.target));
|
|
588
|
+
return path4.posix.join(...segments);
|
|
589
|
+
}
|
|
590
|
+
function formatOutputMarkdown(output) {
|
|
591
|
+
return output.map((msg) => `@[${msg.role}]:
|
|
592
|
+
${String(msg.content ?? "")}`).join("\n\n");
|
|
593
|
+
}
|
|
594
|
+
function extractInput(result) {
|
|
595
|
+
const input = result.input;
|
|
596
|
+
if (!input) return null;
|
|
597
|
+
if (typeof input === "string") return input;
|
|
598
|
+
if (Array.isArray(input) && input.length > 0) {
|
|
599
|
+
return formatOutputMarkdown(input);
|
|
600
|
+
}
|
|
601
|
+
return null;
|
|
602
|
+
}
|
|
603
|
+
function buildResultIndexArtifact(result) {
|
|
604
|
+
const artifactSubdir = buildArtifactSubdir(result);
|
|
605
|
+
const input = extractInput(result);
|
|
606
|
+
const hasResponse = Array.isArray(result.output) && result.output.length > 0;
|
|
500
607
|
return {
|
|
501
|
-
|
|
502
|
-
|
|
503
|
-
|
|
504
|
-
|
|
505
|
-
|
|
506
|
-
|
|
507
|
-
|
|
608
|
+
timestamp: result.timestamp,
|
|
609
|
+
test_id: result.testId ?? "unknown",
|
|
610
|
+
eval_set: getEvalSet(result),
|
|
611
|
+
conversation_id: result.conversationId,
|
|
612
|
+
score: result.score,
|
|
613
|
+
target: result.target ?? "unknown",
|
|
614
|
+
scores: result.scores ? toSnakeCaseDeep(result.scores) : void 0,
|
|
615
|
+
execution_status: result.executionStatus,
|
|
616
|
+
error: result.error,
|
|
617
|
+
failure_stage: result.failureStage,
|
|
618
|
+
failure_reason_code: result.failureReasonCode,
|
|
619
|
+
workspace_path: result.workspacePath,
|
|
620
|
+
grading_path: path4.posix.join(artifactSubdir, "grading.json"),
|
|
621
|
+
timing_path: path4.posix.join(artifactSubdir, "timing.json"),
|
|
622
|
+
input_path: input ? path4.posix.join(artifactSubdir, "input.md") : void 0,
|
|
623
|
+
output_path: hasResponse ? path4.posix.join(artifactSubdir, "outputs", "response.md") : void 0,
|
|
624
|
+
response_path: hasResponse ? path4.posix.join(artifactSubdir, "outputs", "response.md") : void 0
|
|
508
625
|
};
|
|
509
626
|
}
|
|
627
|
+
async function writeJsonlFile(filePath, records) {
|
|
628
|
+
const content = records.length === 0 ? "" : `${records.map((record) => JSON.stringify(toSnakeCaseDeep(record))).join("\n")}
|
|
629
|
+
`;
|
|
630
|
+
await writeFile(filePath, content, "utf8");
|
|
631
|
+
}
|
|
510
632
|
function toCamelCase(str) {
|
|
511
633
|
return str.replace(/_([a-z])/g, (_, letter) => letter.toUpperCase());
|
|
512
634
|
}
|
|
@@ -544,17 +666,39 @@ function parseJsonlResults(content) {
|
|
|
544
666
|
return results;
|
|
545
667
|
}
|
|
546
668
|
async function writeArtifactsFromResults(results, outputDir, options) {
|
|
547
|
-
const
|
|
548
|
-
const timingPath =
|
|
549
|
-
const benchmarkPath =
|
|
550
|
-
const
|
|
551
|
-
|
|
669
|
+
const testArtifactDir = outputDir;
|
|
670
|
+
const timingPath = path4.join(outputDir, "timing.json");
|
|
671
|
+
const benchmarkPath = path4.join(outputDir, "benchmark.json");
|
|
672
|
+
const indexPath = path4.join(outputDir, RESULT_INDEX_FILENAME);
|
|
673
|
+
const legacyResultsPath = options?.writeLegacyResults ? path4.join(outputDir, LEGACY_RESULTS_FILENAME) : void 0;
|
|
674
|
+
await mkdir(outputDir, { recursive: true });
|
|
675
|
+
const indexRecords = [];
|
|
552
676
|
for (const result of results) {
|
|
553
677
|
const grading = buildGradingArtifact(result);
|
|
554
|
-
const
|
|
555
|
-
const
|
|
678
|
+
const timing2 = buildTimingArtifact([result]);
|
|
679
|
+
const artifactSubdir = buildArtifactSubdir(result);
|
|
680
|
+
const testDir = path4.join(outputDir, artifactSubdir);
|
|
681
|
+
const gradingPath = path4.join(testDir, "grading.json");
|
|
682
|
+
const perTestTimingPath = path4.join(testDir, "timing.json");
|
|
683
|
+
await mkdir(testDir, { recursive: true });
|
|
556
684
|
await writeFile(gradingPath, `${JSON.stringify(grading, null, 2)}
|
|
557
685
|
`, "utf8");
|
|
686
|
+
await writeFile(perTestTimingPath, `${JSON.stringify(timing2, null, 2)}
|
|
687
|
+
`, "utf8");
|
|
688
|
+
const input = extractInput(result);
|
|
689
|
+
if (input) {
|
|
690
|
+
await writeFile(path4.join(testDir, "input.md"), input, "utf8");
|
|
691
|
+
}
|
|
692
|
+
if (result.output && result.output.length > 0) {
|
|
693
|
+
const outputsDir = path4.join(testDir, "outputs");
|
|
694
|
+
await mkdir(outputsDir, { recursive: true });
|
|
695
|
+
await writeFile(
|
|
696
|
+
path4.join(outputsDir, "response.md"),
|
|
697
|
+
formatOutputMarkdown(result.output),
|
|
698
|
+
"utf8"
|
|
699
|
+
);
|
|
700
|
+
}
|
|
701
|
+
indexRecords.push(buildResultIndexArtifact(result));
|
|
558
702
|
}
|
|
559
703
|
const timing = buildTimingArtifact(results);
|
|
560
704
|
await writeFile(timingPath, `${JSON.stringify(timing, null, 2)}
|
|
@@ -562,10 +706,11 @@ async function writeArtifactsFromResults(results, outputDir, options) {
|
|
|
562
706
|
const benchmark = buildBenchmarkArtifact(results, options?.evalFile);
|
|
563
707
|
await writeFile(benchmarkPath, `${JSON.stringify(benchmark, null, 2)}
|
|
564
708
|
`, "utf8");
|
|
565
|
-
|
|
566
|
-
|
|
567
|
-
|
|
568
|
-
|
|
709
|
+
await writeJsonlFile(indexPath, indexRecords);
|
|
710
|
+
if (legacyResultsPath) {
|
|
711
|
+
await writeJsonlFile(legacyResultsPath, results);
|
|
712
|
+
}
|
|
713
|
+
return { testArtifactDir, timingPath, benchmarkPath, indexPath, legacyResultsPath };
|
|
569
714
|
}
|
|
570
715
|
|
|
571
716
|
// src/commands/eval/benchmark-writer.ts
|
|
@@ -616,13 +761,13 @@ async function writeBenchmarkJson(outputPath, results) {
|
|
|
616
761
|
// src/commands/eval/env.ts
|
|
617
762
|
import { constants as constants3 } from "node:fs";
|
|
618
763
|
import { access as access3 } from "node:fs/promises";
|
|
619
|
-
import
|
|
764
|
+
import path5 from "node:path";
|
|
620
765
|
import { config as loadDotenv } from "dotenv";
|
|
621
766
|
function uniqueDirs(directories) {
|
|
622
767
|
const seen = /* @__PURE__ */ new Set();
|
|
623
768
|
const result = [];
|
|
624
769
|
for (const dir of directories) {
|
|
625
|
-
const absolute =
|
|
770
|
+
const absolute = path5.resolve(dir);
|
|
626
771
|
if (seen.has(absolute)) {
|
|
627
772
|
continue;
|
|
628
773
|
}
|
|
@@ -641,14 +786,14 @@ async function fileExists2(filePath) {
|
|
|
641
786
|
}
|
|
642
787
|
function collectAncestorDirectories(start, boundary) {
|
|
643
788
|
const directories = [];
|
|
644
|
-
const boundaryDir =
|
|
645
|
-
let current =
|
|
789
|
+
const boundaryDir = path5.resolve(boundary);
|
|
790
|
+
let current = path5.resolve(start);
|
|
646
791
|
while (current !== void 0) {
|
|
647
792
|
directories.push(current);
|
|
648
793
|
if (current === boundaryDir) {
|
|
649
794
|
break;
|
|
650
795
|
}
|
|
651
|
-
const parent =
|
|
796
|
+
const parent = path5.dirname(current);
|
|
652
797
|
if (parent === current) {
|
|
653
798
|
break;
|
|
654
799
|
}
|
|
@@ -658,12 +803,12 @@ function collectAncestorDirectories(start, boundary) {
|
|
|
658
803
|
}
|
|
659
804
|
async function loadEnvFromHierarchy(options) {
|
|
660
805
|
const { testFilePath, repoRoot, verbose } = options;
|
|
661
|
-
const testDir =
|
|
806
|
+
const testDir = path5.dirname(path5.resolve(testFilePath));
|
|
662
807
|
const cwd = process.cwd();
|
|
663
808
|
const searchDirs = uniqueDirs([...collectAncestorDirectories(testDir, repoRoot), repoRoot, cwd]);
|
|
664
809
|
const envFiles = [];
|
|
665
810
|
for (const dir of searchDirs) {
|
|
666
|
-
const candidate =
|
|
811
|
+
const candidate = path5.join(dir, ".env");
|
|
667
812
|
if (await fileExists2(candidate)) {
|
|
668
813
|
envFiles.push(candidate);
|
|
669
814
|
}
|
|
@@ -685,11 +830,11 @@ async function loadEnvFromHierarchy(options) {
|
|
|
685
830
|
}
|
|
686
831
|
|
|
687
832
|
// src/commands/eval/output-writer.ts
|
|
688
|
-
import
|
|
833
|
+
import path11 from "node:path";
|
|
689
834
|
|
|
690
835
|
// src/commands/eval/html-writer.ts
|
|
691
836
|
import { mkdir as mkdir2, writeFile as writeFile3 } from "node:fs/promises";
|
|
692
|
-
import
|
|
837
|
+
import path6 from "node:path";
|
|
693
838
|
|
|
694
839
|
// ../../node_modules/.bun/async-mutex@0.5.0/node_modules/async-mutex/index.mjs
|
|
695
840
|
var E_TIMEOUT = new Error("timeout while waiting for mutex to become available");
|
|
@@ -908,7 +1053,7 @@ var HtmlWriter = class _HtmlWriter {
|
|
|
908
1053
|
this.filePath = filePath;
|
|
909
1054
|
}
|
|
910
1055
|
static async open(filePath) {
|
|
911
|
-
await mkdir2(
|
|
1056
|
+
await mkdir2(path6.dirname(filePath), { recursive: true });
|
|
912
1057
|
const writer = new _HtmlWriter(filePath);
|
|
913
1058
|
await writer.writeHtml();
|
|
914
1059
|
return writer;
|
|
@@ -1419,34 +1564,7 @@ var SCRIPT = `
|
|
|
1419
1564
|
|
|
1420
1565
|
// src/commands/eval/json-writer.ts
|
|
1421
1566
|
import { mkdir as mkdir3, writeFile as writeFile4 } from "node:fs/promises";
|
|
1422
|
-
import
|
|
1423
|
-
|
|
1424
|
-
// src/utils/case-conversion.ts
|
|
1425
|
-
function toSnakeCase(str) {
|
|
1426
|
-
if (/^[A-Z]/.test(str)) {
|
|
1427
|
-
return str;
|
|
1428
|
-
}
|
|
1429
|
-
return str.replace(/[A-Z]/g, (letter) => `_${letter.toLowerCase()}`);
|
|
1430
|
-
}
|
|
1431
|
-
function toSnakeCaseDeep(obj) {
|
|
1432
|
-
if (obj === null || obj === void 0) {
|
|
1433
|
-
return obj;
|
|
1434
|
-
}
|
|
1435
|
-
if (Array.isArray(obj)) {
|
|
1436
|
-
return obj.map((item) => toSnakeCaseDeep(item));
|
|
1437
|
-
}
|
|
1438
|
-
if (typeof obj === "object") {
|
|
1439
|
-
const result = {};
|
|
1440
|
-
for (const [key, value] of Object.entries(obj)) {
|
|
1441
|
-
const snakeKey = toSnakeCase(key);
|
|
1442
|
-
result[snakeKey] = toSnakeCaseDeep(value);
|
|
1443
|
-
}
|
|
1444
|
-
return result;
|
|
1445
|
-
}
|
|
1446
|
-
return obj;
|
|
1447
|
-
}
|
|
1448
|
-
|
|
1449
|
-
// src/commands/eval/json-writer.ts
|
|
1567
|
+
import path7 from "node:path";
|
|
1450
1568
|
var JsonWriter = class _JsonWriter {
|
|
1451
1569
|
filePath;
|
|
1452
1570
|
results = [];
|
|
@@ -1455,7 +1573,7 @@ var JsonWriter = class _JsonWriter {
|
|
|
1455
1573
|
this.filePath = filePath;
|
|
1456
1574
|
}
|
|
1457
1575
|
static async open(filePath) {
|
|
1458
|
-
await mkdir3(
|
|
1576
|
+
await mkdir3(path7.dirname(filePath), { recursive: true });
|
|
1459
1577
|
return new _JsonWriter(filePath);
|
|
1460
1578
|
}
|
|
1461
1579
|
async append(result) {
|
|
@@ -1490,7 +1608,7 @@ var JsonWriter = class _JsonWriter {
|
|
|
1490
1608
|
// src/commands/eval/jsonl-writer.ts
|
|
1491
1609
|
import { createWriteStream } from "node:fs";
|
|
1492
1610
|
import { mkdir as mkdir4 } from "node:fs/promises";
|
|
1493
|
-
import
|
|
1611
|
+
import path8 from "node:path";
|
|
1494
1612
|
import { finished } from "node:stream/promises";
|
|
1495
1613
|
var JsonlWriter = class _JsonlWriter {
|
|
1496
1614
|
stream;
|
|
@@ -1500,7 +1618,7 @@ var JsonlWriter = class _JsonlWriter {
|
|
|
1500
1618
|
this.stream = stream;
|
|
1501
1619
|
}
|
|
1502
1620
|
static async open(filePath) {
|
|
1503
|
-
await mkdir4(
|
|
1621
|
+
await mkdir4(path8.dirname(filePath), { recursive: true });
|
|
1504
1622
|
const stream = createWriteStream(filePath, { flags: "w", encoding: "utf8" });
|
|
1505
1623
|
return new _JsonlWriter(stream);
|
|
1506
1624
|
}
|
|
@@ -1532,7 +1650,7 @@ var JsonlWriter = class _JsonlWriter {
|
|
|
1532
1650
|
|
|
1533
1651
|
// src/commands/eval/junit-writer.ts
|
|
1534
1652
|
import { mkdir as mkdir5, writeFile as writeFile5 } from "node:fs/promises";
|
|
1535
|
-
import
|
|
1653
|
+
import path9 from "node:path";
|
|
1536
1654
|
function escapeXml(str) {
|
|
1537
1655
|
return str.replace(/&/g, "&amp;").replace(/</g, "&lt;").replace(/>/g, "&gt;").replace(/"/g, "&quot;").replace(/'/g, "&apos;");
|
|
1538
1656
|
}
|
|
@@ -1544,7 +1662,7 @@ var JunitWriter = class _JunitWriter {
|
|
|
1544
1662
|
this.filePath = filePath;
|
|
1545
1663
|
}
|
|
1546
1664
|
static async open(filePath) {
|
|
1547
|
-
await mkdir5(
|
|
1665
|
+
await mkdir5(path9.dirname(filePath), { recursive: true });
|
|
1548
1666
|
return new _JunitWriter(filePath);
|
|
1549
1667
|
}
|
|
1550
1668
|
async append(result) {
|
|
@@ -1613,7 +1731,7 @@ ${suiteXmls.join("\n")}
|
|
|
1613
1731
|
// src/commands/eval/yaml-writer.ts
|
|
1614
1732
|
import { createWriteStream as createWriteStream2 } from "node:fs";
|
|
1615
1733
|
import { mkdir as mkdir6 } from "node:fs/promises";
|
|
1616
|
-
import
|
|
1734
|
+
import path10 from "node:path";
|
|
1617
1735
|
import { finished as finished2 } from "node:stream/promises";
|
|
1618
1736
|
import { stringify as stringifyYaml } from "yaml";
|
|
1619
1737
|
var YamlWriter = class _YamlWriter {
|
|
@@ -1625,7 +1743,7 @@ var YamlWriter = class _YamlWriter {
|
|
|
1625
1743
|
this.stream = stream;
|
|
1626
1744
|
}
|
|
1627
1745
|
static async open(filePath) {
|
|
1628
|
-
await mkdir6(
|
|
1746
|
+
await mkdir6(path10.dirname(filePath), { recursive: true });
|
|
1629
1747
|
const stream = createWriteStream2(filePath, { flags: "w", encoding: "utf8" });
|
|
1630
1748
|
return new _YamlWriter(stream);
|
|
1631
1749
|
}
|
|
@@ -1681,7 +1799,7 @@ async function createOutputWriter(filePath, format) {
|
|
|
1681
1799
|
}
|
|
1682
1800
|
var SUPPORTED_EXTENSIONS = /* @__PURE__ */ new Set([".jsonl", ".json", ".xml", ".yaml", ".yml", ".html", ".htm"]);
|
|
1683
1801
|
function createWriterFromPath(filePath) {
|
|
1684
|
-
const ext =
|
|
1802
|
+
const ext = path11.extname(filePath).toLowerCase();
|
|
1685
1803
|
switch (ext) {
|
|
1686
1804
|
case ".jsonl":
|
|
1687
1805
|
return JsonlWriter.open(filePath);
|
|
@@ -1788,12 +1906,12 @@ var ProgressDisplay = class {
|
|
|
1788
1906
|
}
|
|
1789
1907
|
addLogPaths(paths, provider) {
|
|
1790
1908
|
const newPaths = [];
|
|
1791
|
-
for (const
|
|
1792
|
-
if (this.logPathSet.has(
|
|
1909
|
+
for (const path16 of paths) {
|
|
1910
|
+
if (this.logPathSet.has(path16)) {
|
|
1793
1911
|
continue;
|
|
1794
1912
|
}
|
|
1795
|
-
this.logPathSet.add(
|
|
1796
|
-
newPaths.push(
|
|
1913
|
+
this.logPathSet.add(path16);
|
|
1914
|
+
newPaths.push(path16);
|
|
1797
1915
|
}
|
|
1798
1916
|
if (newPaths.length === 0) {
|
|
1799
1917
|
return;
|
|
@@ -1806,8 +1924,8 @@ var ProgressDisplay = class {
|
|
|
1806
1924
|
this.hasPrintedLogHeader = true;
|
|
1807
1925
|
}
|
|
1808
1926
|
const startIndex = this.logPaths.length - newPaths.length;
|
|
1809
|
-
newPaths.forEach((
|
|
1810
|
-
console.log(`${startIndex + offset + 1}. ${
|
|
1927
|
+
newPaths.forEach((path16, offset) => {
|
|
1928
|
+
console.log(`${startIndex + offset + 1}. ${path16}`);
|
|
1811
1929
|
});
|
|
1812
1930
|
}
|
|
1813
1931
|
finish() {
|
|
@@ -1818,81 +1936,198 @@ var ProgressDisplay = class {
|
|
|
1818
1936
|
}
|
|
1819
1937
|
};
|
|
1820
1938
|
|
|
1821
|
-
// src/commands/
|
|
1822
|
-
import {
|
|
1823
|
-
import
|
|
1824
|
-
function
|
|
1825
|
-
return
|
|
1939
|
+
// src/commands/results/manifest.ts
|
|
1940
|
+
import { existsSync as existsSync2, readFileSync } from "node:fs";
|
|
1941
|
+
import path12 from "node:path";
|
|
1942
|
+
function parseJsonlLines(content) {
|
|
1943
|
+
return content.split(/\r?\n/).map((line) => line.trim()).filter((line) => line.length > 0).map((line) => JSON.parse(line));
|
|
1826
1944
|
}
|
|
1827
|
-
function
|
|
1828
|
-
return
|
|
1945
|
+
function isIndexManifestPath(sourceFile) {
|
|
1946
|
+
return path12.basename(sourceFile) === RESULT_INDEX_FILENAME;
|
|
1829
1947
|
}
|
|
1830
|
-
function
|
|
1831
|
-
|
|
1832
|
-
|
|
1948
|
+
function parseMarkdownMessages(content) {
|
|
1949
|
+
const trimmed = content.trim();
|
|
1950
|
+
if (!trimmed.startsWith("@[")) {
|
|
1951
|
+
return [];
|
|
1952
|
+
}
|
|
1953
|
+
const matches = [...trimmed.matchAll(/^@\[(.+?)\]:\n([\s\S]*?)(?=^@\[(.+?)\]:\n|\s*$)/gm)];
|
|
1954
|
+
return matches.map((match) => ({
|
|
1955
|
+
role: match[1],
|
|
1956
|
+
content: match[2].trimEnd()
|
|
1957
|
+
}));
|
|
1958
|
+
}
|
|
1959
|
+
function readOptionalText(baseDir, relativePath) {
|
|
1960
|
+
if (!relativePath) {
|
|
1961
|
+
return void 0;
|
|
1962
|
+
}
|
|
1963
|
+
const absolutePath = path12.join(baseDir, relativePath);
|
|
1964
|
+
if (!existsSync2(absolutePath)) {
|
|
1965
|
+
return void 0;
|
|
1966
|
+
}
|
|
1967
|
+
return readFileSync(absolutePath, "utf8");
|
|
1968
|
+
}
|
|
1969
|
+
function readOptionalJson(baseDir, relativePath) {
|
|
1970
|
+
const text = readOptionalText(baseDir, relativePath);
|
|
1971
|
+
if (!text) {
|
|
1972
|
+
return void 0;
|
|
1833
1973
|
}
|
|
1974
|
+
try {
|
|
1975
|
+
return JSON.parse(text);
|
|
1976
|
+
} catch {
|
|
1977
|
+
return void 0;
|
|
1978
|
+
}
|
|
1979
|
+
}
|
|
1980
|
+
function hydrateInput(baseDir, record) {
|
|
1981
|
+
const inputText = readOptionalText(baseDir, record.input_path);
|
|
1982
|
+
if (!inputText) {
|
|
1983
|
+
return void 0;
|
|
1984
|
+
}
|
|
1985
|
+
const messages = parseMarkdownMessages(inputText);
|
|
1986
|
+
return messages.length > 0 ? messages : [{ role: "user", content: inputText.trimEnd() }];
|
|
1987
|
+
}
|
|
1988
|
+
function hydrateOutput(baseDir, record) {
|
|
1989
|
+
const responseText = readOptionalText(baseDir, record.output_path ?? record.response_path);
|
|
1990
|
+
if (!responseText) {
|
|
1991
|
+
return void 0;
|
|
1992
|
+
}
|
|
1993
|
+
const messages = parseMarkdownMessages(responseText);
|
|
1994
|
+
if (messages.length > 0) {
|
|
1995
|
+
return messages.map((message) => ({
|
|
1996
|
+
role: message.role,
|
|
1997
|
+
content: message.content
|
|
1998
|
+
}));
|
|
1999
|
+
}
|
|
2000
|
+
return [{ role: "assistant", content: responseText.trimEnd() }];
|
|
2001
|
+
}
|
|
2002
|
+
function hydrateManifestRecord(baseDir, record) {
|
|
2003
|
+
const grading = readOptionalJson(baseDir, record.grading_path);
|
|
2004
|
+
const timing = readOptionalJson(baseDir, record.timing_path);
|
|
2005
|
+
const testId = record.test_id ?? record.eval_id ?? "unknown";
|
|
1834
2006
|
return {
|
|
1835
|
-
|
|
1836
|
-
testId
|
|
1837
|
-
|
|
2007
|
+
timestamp: record.timestamp,
|
|
2008
|
+
testId,
|
|
2009
|
+
eval_set: record.eval_set,
|
|
2010
|
+
target: record.target,
|
|
2011
|
+
score: record.score,
|
|
2012
|
+
executionStatus: record.execution_status,
|
|
2013
|
+
error: record.error,
|
|
2014
|
+
assertions: grading?.assertions.map((assertion) => ({
|
|
2015
|
+
text: assertion.text,
|
|
2016
|
+
passed: assertion.passed,
|
|
2017
|
+
evidence: assertion.evidence
|
|
2018
|
+
})),
|
|
2019
|
+
scores: grading?.evaluators?.map((evaluator) => ({
|
|
2020
|
+
name: evaluator.name,
|
|
2021
|
+
type: evaluator.type,
|
|
2022
|
+
score: evaluator.score,
|
|
2023
|
+
assertions: Array.isArray(evaluator.assertions) ? evaluator.assertions.map((assertion) => ({
|
|
2024
|
+
text: String(assertion.text ?? ""),
|
|
2025
|
+
passed: Boolean(assertion.passed),
|
|
2026
|
+
evidence: typeof assertion.evidence === "string" ? String(assertion.evidence) : void 0
|
|
2027
|
+
})) : void 0,
|
|
2028
|
+
weight: typeof evaluator.weight === "number" ? evaluator.weight : void 0,
|
|
2029
|
+
verdict: typeof evaluator.verdict === "string" ? evaluator.verdict : void 0,
|
|
2030
|
+
details: evaluator.details
|
|
2031
|
+
})) ?? record.scores,
|
|
2032
|
+
tokenUsage: timing?.token_usage ? {
|
|
2033
|
+
input: timing.token_usage.input,
|
|
2034
|
+
output: timing.token_usage.output,
|
|
2035
|
+
reasoning: timing.token_usage.reasoning
|
|
2036
|
+
} : record.token_usage,
|
|
2037
|
+
durationMs: timing?.duration_ms ?? record.duration_ms,
|
|
2038
|
+
costUsd: record.cost_usd,
|
|
2039
|
+
input: hydrateInput(baseDir, record),
|
|
2040
|
+
output: hydrateOutput(baseDir, record)
|
|
1838
2041
|
};
|
|
1839
2042
|
}
|
|
1840
|
-
|
|
1841
|
-
|
|
1842
|
-
|
|
1843
|
-
|
|
1844
|
-
|
|
1845
|
-
|
|
1846
|
-
|
|
2043
|
+
function parseResultManifest(content) {
|
|
2044
|
+
return parseJsonlLines(content);
|
|
2045
|
+
}
|
|
2046
|
+
function resolveResultSourcePath(source, cwd) {
|
|
2047
|
+
const resolved = path12.isAbsolute(source) ? source : path12.resolve(cwd ?? process.cwd(), source);
|
|
2048
|
+
return resolveWorkspaceOrFilePath(resolved);
|
|
2049
|
+
}
|
|
2050
|
+
function loadManifestResults(sourceFile) {
|
|
2051
|
+
const resolvedSourceFile = resolveWorkspaceOrFilePath(sourceFile);
|
|
2052
|
+
if (!isIndexManifestPath(resolvedSourceFile)) {
|
|
2053
|
+
return parseJsonlResults(readFileSync(resolvedSourceFile, "utf8"));
|
|
2054
|
+
}
|
|
2055
|
+
const content = readFileSync(resolvedSourceFile, "utf8");
|
|
2056
|
+
const records = parseResultManifest(content);
|
|
2057
|
+
const baseDir = path12.dirname(resolvedSourceFile);
|
|
2058
|
+
return records.map((record) => hydrateManifestRecord(baseDir, record));
|
|
2059
|
+
}
|
|
2060
|
+
function loadLightweightResults(sourceFile) {
|
|
2061
|
+
const resolvedSourceFile = resolveWorkspaceOrFilePath(sourceFile);
|
|
2062
|
+
const content = readFileSync(resolvedSourceFile, "utf8");
|
|
2063
|
+
if (isIndexManifestPath(resolvedSourceFile)) {
|
|
2064
|
+
return parseResultManifest(content).map((record) => ({
|
|
2065
|
+
testId: record.test_id ?? record.eval_id ?? "unknown",
|
|
2066
|
+
target: record.target,
|
|
2067
|
+
score: record.score,
|
|
2068
|
+
scores: record.scores,
|
|
2069
|
+
executionStatus: record.execution_status,
|
|
2070
|
+
error: record.error,
|
|
2071
|
+
timestamp: record.timestamp
|
|
2072
|
+
}));
|
|
2073
|
+
}
|
|
2074
|
+
const records = [];
|
|
2075
|
+
for (const line of content.split(/\r?\n/)) {
|
|
1847
2076
|
const trimmed = line.trim();
|
|
1848
|
-
if (!trimmed)
|
|
2077
|
+
if (!trimmed) {
|
|
2078
|
+
continue;
|
|
2079
|
+
}
|
|
2080
|
+
let record;
|
|
1849
2081
|
try {
|
|
1850
|
-
|
|
1851
|
-
const executionStatus = getExecutionStatus(parsed);
|
|
1852
|
-
const testId = getTestId(parsed);
|
|
1853
|
-
if (executionStatus === "execution_error" && testId) {
|
|
1854
|
-
ids.push(testId);
|
|
1855
|
-
}
|
|
2082
|
+
record = JSON.parse(trimmed);
|
|
1856
2083
|
} catch {
|
|
2084
|
+
continue;
|
|
1857
2085
|
}
|
|
2086
|
+
const rawTestId = record.test_id ?? record.eval_id ?? record.testId ?? record.evalId;
|
|
2087
|
+
if (typeof rawTestId !== "string") {
|
|
2088
|
+
throw new Error(`Missing test_id in result: ${trimmed}`);
|
|
2089
|
+
}
|
|
2090
|
+
if (typeof record.score !== "number") {
|
|
2091
|
+
throw new Error(`Missing or invalid score in result: ${trimmed}`);
|
|
2092
|
+
}
|
|
2093
|
+
records.push({
|
|
2094
|
+
testId: rawTestId,
|
|
2095
|
+
target: typeof record.target === "string" ? record.target : void 0,
|
|
2096
|
+
score: record.score,
|
|
2097
|
+
scores: Array.isArray(record.scores) ? record.scores : void 0,
|
|
2098
|
+
executionStatus: typeof record.execution_status === "string" ? record.execution_status : typeof record.executionStatus === "string" ? record.executionStatus : void 0,
|
|
2099
|
+
error: typeof record.error === "string" ? record.error : void 0,
|
|
2100
|
+
timestamp: typeof record.timestamp === "string" ? record.timestamp : void 0
|
|
2101
|
+
});
|
|
1858
2102
|
}
|
|
2103
|
+
return records;
|
|
2104
|
+
}
|
|
2105
|
+
|
|
2106
|
+
// src/commands/eval/retry-errors.ts
|
|
2107
|
+
async function loadErrorTestIds(jsonlPath) {
|
|
2108
|
+
const resolvedPath = resolveResultSourcePath(jsonlPath);
|
|
2109
|
+
const ids = loadLightweightResults(resolvedPath).filter((result) => result.executionStatus === "execution_error").map((result) => result.testId);
|
|
1859
2110
|
return [...new Set(ids)];
|
|
1860
2111
|
}
|
|
1861
2112
|
async function loadNonErrorResults(jsonlPath) {
|
|
1862
|
-
const
|
|
1863
|
-
|
|
1864
|
-
|
|
1865
|
-
|
|
1866
|
-
});
|
|
1867
|
-
for await (const line of rl) {
|
|
1868
|
-
const trimmed = line.trim();
|
|
1869
|
-
if (!trimmed) continue;
|
|
1870
|
-
try {
|
|
1871
|
-
const parsed = JSON.parse(trimmed);
|
|
1872
|
-
const testId = getTestId(parsed);
|
|
1873
|
-
const executionStatus = getExecutionStatus(parsed);
|
|
1874
|
-
if (!testId || parsed.score === void 0) continue;
|
|
1875
|
-
if (executionStatus !== "execution_error") {
|
|
1876
|
-
results.push(toEvaluationResult(parsed));
|
|
1877
|
-
}
|
|
1878
|
-
} catch {
|
|
1879
|
-
}
|
|
1880
|
-
}
|
|
1881
|
-
return results;
|
|
2113
|
+
const resolvedPath = resolveResultSourcePath(jsonlPath);
|
|
2114
|
+
return loadManifestResults(resolvedPath).filter(
|
|
2115
|
+
(result) => result.testId && result.executionStatus !== "execution_error"
|
|
2116
|
+
);
|
|
1882
2117
|
}
|
|
1883
2118
|
|
|
1884
2119
|
// src/commands/eval/run-cache.ts
|
|
1885
2120
|
import { mkdir as mkdir7, readFile as readFile2, writeFile as writeFile6 } from "node:fs/promises";
|
|
1886
|
-
import
|
|
2121
|
+
import path13 from "node:path";
|
|
1887
2122
|
var CACHE_FILENAME = "cache.json";
|
|
1888
2123
|
function resolveRunCacheFile(cache) {
|
|
1889
2124
|
if (cache.lastRunDir) {
|
|
1890
|
-
return
|
|
2125
|
+
return resolveExistingRunPrimaryPath(cache.lastRunDir) ?? resolveRunIndexPath(cache.lastRunDir);
|
|
1891
2126
|
}
|
|
1892
2127
|
return cache.lastResultFile ?? "";
|
|
1893
2128
|
}
|
|
1894
2129
|
function cachePath(cwd) {
|
|
1895
|
-
return
|
|
2130
|
+
return path13.join(cwd, ".agentv", CACHE_FILENAME);
|
|
1896
2131
|
}
|
|
1897
2132
|
async function loadRunCache(cwd) {
|
|
1898
2133
|
try {
|
|
@@ -1902,11 +2137,15 @@ async function loadRunCache(cwd) {
|
|
|
1902
2137
|
return void 0;
|
|
1903
2138
|
}
|
|
1904
2139
|
}
|
|
1905
|
-
async function saveRunCache(cwd,
|
|
1906
|
-
const dir =
|
|
2140
|
+
async function saveRunCache(cwd, resultPath) {
|
|
2141
|
+
const dir = path13.join(cwd, ".agentv");
|
|
1907
2142
|
await mkdir7(dir, { recursive: true });
|
|
1908
|
-
const
|
|
1909
|
-
|
|
2143
|
+
const basename = path13.basename(resultPath);
|
|
2144
|
+
const cache = basename === RESULT_INDEX_FILENAME || basename === LEGACY_RESULTS_FILENAME ? {
|
|
2145
|
+
lastRunDir: path13.dirname(resultPath),
|
|
2146
|
+
timestamp: (/* @__PURE__ */ new Date()).toISOString()
|
|
2147
|
+
} : {
|
|
2148
|
+
lastResultFile: resultPath,
|
|
1910
2149
|
timestamp: (/* @__PURE__ */ new Date()).toISOString()
|
|
1911
2150
|
};
|
|
1912
2151
|
await writeFile6(cachePath(cwd), `${JSON.stringify(cache, null, 2)}
|
|
@@ -2162,7 +2401,7 @@ function formatMatrixSummary(results) {
|
|
|
2162
2401
|
|
|
2163
2402
|
// ../../packages/core/dist/evaluation/validation/index.js
|
|
2164
2403
|
import { readFile as readFile3 } from "node:fs/promises";
|
|
2165
|
-
import
|
|
2404
|
+
import path14 from "node:path";
|
|
2166
2405
|
import { parse } from "yaml";
|
|
2167
2406
|
import { readFile as readFile22 } from "node:fs/promises";
|
|
2168
2407
|
import path22 from "node:path";
|
|
@@ -2205,8 +2444,8 @@ async function detectFileType(filePath) {
|
|
|
2205
2444
|
}
|
|
2206
2445
|
}
|
|
2207
2446
|
function inferFileTypeFromPath(filePath) {
|
|
2208
|
-
const normalized =
|
|
2209
|
-
const basename =
|
|
2447
|
+
const normalized = path14.normalize(filePath).replace(/\\/g, "/");
|
|
2448
|
+
const basename = path14.basename(filePath);
|
|
2210
2449
|
if (normalized.includes("/.agentv/")) {
|
|
2211
2450
|
if (basename === "config.yaml" || basename === "config.yml") {
|
|
2212
2451
|
return "config";
|
|
@@ -3834,11 +4073,9 @@ async function ensureFileExists(filePath, description) {
|
|
|
3834
4073
|
}
|
|
3835
4074
|
}
|
|
3836
4075
|
function buildDefaultOutputPath(cwd) {
|
|
3837
|
-
const
|
|
3838
|
-
const dirName = `eval_${timestamp}`;
|
|
3839
|
-
const runDir = path13.join(cwd, ".agentv", "results", "raw", dirName);
|
|
4076
|
+
const runDir = buildDefaultRunDir(cwd);
|
|
3840
4077
|
mkdirSync(runDir, { recursive: true });
|
|
3841
|
-
return
|
|
4078
|
+
return path15.join(runDir, "index.jsonl");
|
|
3842
4079
|
}
|
|
3843
4080
|
function createProgressReporter(maxWorkers, options) {
|
|
3844
4081
|
const display = new ProgressDisplay(maxWorkers, options);
|
|
@@ -3852,7 +4089,7 @@ function createProgressReporter(maxWorkers, options) {
|
|
|
3852
4089
|
};
|
|
3853
4090
|
}
|
|
3854
4091
|
function makeEvalKey(testFilePath, evalId) {
|
|
3855
|
-
return `${
|
|
4092
|
+
return `${path15.resolve(testFilePath)}::${evalId}`;
|
|
3856
4093
|
}
|
|
3857
4094
|
function createDisplayIdTracker() {
|
|
3858
4095
|
const map = /* @__PURE__ */ new Map();
|
|
@@ -4041,7 +4278,8 @@ async function runSingleEvalFile(params) {
|
|
|
4041
4278
|
vscodeCmd: vsConfig.executable
|
|
4042
4279
|
});
|
|
4043
4280
|
}
|
|
4044
|
-
const
|
|
4281
|
+
const useStreamingObserver = !!(otelExporter && options.exportOtel);
|
|
4282
|
+
const streamingObserver = useStreamingObserver ? otelExporter?.createStreamingObserver() ?? null : null;
|
|
4045
4283
|
const results = await evaluationRunner({
|
|
4046
4284
|
testFilePath,
|
|
4047
4285
|
repoRoot,
|
|
@@ -4074,6 +4312,7 @@ async function runSingleEvalFile(params) {
|
|
|
4074
4312
|
model: options.model,
|
|
4075
4313
|
streamCallbacks: streamingObserver?.getStreamCallbacks(),
|
|
4076
4314
|
onResult: async (result) => {
|
|
4315
|
+
streamingObserver?.completeFromResult?.(result);
|
|
4077
4316
|
streamingObserver?.finalizeEvalCase(result.score, result.error);
|
|
4078
4317
|
const trimmedOutput = trimOutputMessages(result.output, options.outputMessages);
|
|
4079
4318
|
const trimmedResult = {
|
|
@@ -4134,7 +4373,7 @@ async function runEvalCommand(input) {
|
|
|
4134
4373
|
);
|
|
4135
4374
|
}
|
|
4136
4375
|
const repoRoot = await findRepoRoot(cwd);
|
|
4137
|
-
const yamlConfig = await loadConfig(
|
|
4376
|
+
const yamlConfig = await loadConfig(path15.join(cwd, "_"), repoRoot);
|
|
4138
4377
|
if (yamlConfig?.required_version) {
|
|
4139
4378
|
await enforceRequiredVersion(yamlConfig.required_version, {
|
|
4140
4379
|
strict: normalizeBoolean(input.rawOptions.strict)
|
|
@@ -4146,7 +4385,7 @@ async function runEvalCommand(input) {
|
|
|
4146
4385
|
}
|
|
4147
4386
|
let retryNonErrorResults;
|
|
4148
4387
|
if (options.retryErrors) {
|
|
4149
|
-
const retryPath =
|
|
4388
|
+
const retryPath = path15.resolve(options.retryErrors);
|
|
4150
4389
|
await ensureFileExists(retryPath, "Retry-errors JSONL file");
|
|
4151
4390
|
const errorIds = await loadErrorTestIds(retryPath);
|
|
4152
4391
|
if (errorIds.length === 0) {
|
|
@@ -4159,7 +4398,7 @@ async function runEvalCommand(input) {
|
|
|
4159
4398
|
retryNonErrorResults = await loadNonErrorResults(retryPath);
|
|
4160
4399
|
}
|
|
4161
4400
|
if (options.workspacePath) {
|
|
4162
|
-
const resolvedWorkspace =
|
|
4401
|
+
const resolvedWorkspace = path15.resolve(options.workspacePath);
|
|
4163
4402
|
try {
|
|
4164
4403
|
const { stat: stat2 } = await import("node:fs/promises");
|
|
4165
4404
|
const stats = await stat2(resolvedWorkspace);
|
|
@@ -4177,11 +4416,15 @@ async function runEvalCommand(input) {
|
|
|
4177
4416
|
if (options.verbose) {
|
|
4178
4417
|
console.log(`Repository root: ${repoRoot}`);
|
|
4179
4418
|
}
|
|
4419
|
+
const usesDefaultArtifactWorkspace = !options.outPath;
|
|
4420
|
+
const outputPath = options.outPath ? path15.resolve(options.outPath) : buildDefaultOutputPath(cwd);
|
|
4421
|
+
const defaultTraceFile = usesDefaultArtifactWorkspace && !options.traceFile ? path15.join(path15.dirname(outputPath), "trace.jsonl") : void 0;
|
|
4422
|
+
const traceFilePath = options.traceFile ? path15.resolve(options.traceFile) : defaultTraceFile;
|
|
4180
4423
|
let otelExporter = null;
|
|
4181
|
-
const useFileExport = !!(options.otelFile ||
|
|
4424
|
+
const useFileExport = !!(options.otelFile || traceFilePath);
|
|
4182
4425
|
if (options.exportOtel || useFileExport) {
|
|
4183
4426
|
try {
|
|
4184
|
-
const { OtelTraceExporter, OTEL_BACKEND_PRESETS } = await import("./dist-
|
|
4427
|
+
const { OtelTraceExporter, OTEL_BACKEND_PRESETS } = await import("./dist-L6R5HJ72.js");
|
|
4185
4428
|
let endpoint = process.env.OTEL_EXPORTER_OTLP_ENDPOINT;
|
|
4186
4429
|
let headers = {};
|
|
4187
4430
|
if (options.otelBackend) {
|
|
@@ -4205,8 +4448,8 @@ async function runEvalCommand(input) {
|
|
|
4205
4448
|
headers,
|
|
4206
4449
|
captureContent,
|
|
4207
4450
|
groupTurns: options.otelGroupTurns,
|
|
4208
|
-
otlpFilePath: options.otelFile ?
|
|
4209
|
-
traceFilePath
|
|
4451
|
+
otlpFilePath: options.otelFile ? path15.resolve(options.otelFile) : void 0,
|
|
4452
|
+
traceFilePath
|
|
4210
4453
|
});
|
|
4211
4454
|
const initialized = await otelExporter.init();
|
|
4212
4455
|
if (!initialized) {
|
|
@@ -4222,27 +4465,29 @@ async function runEvalCommand(input) {
|
|
|
4222
4465
|
otelExporter = null;
|
|
4223
4466
|
}
|
|
4224
4467
|
}
|
|
4225
|
-
const
|
|
4226
|
-
const extraOutputPaths = options.outputPaths.map((p) =>
|
|
4227
|
-
const allOutputPaths = extraOutputPaths.length > 0 ? [
|
|
4468
|
+
const primaryWritePath = usesDefaultArtifactWorkspace ? path15.join(path15.dirname(outputPath), LEGACY_RESULTS_FILENAME) : outputPath;
|
|
4469
|
+
const extraOutputPaths = options.outputPaths.map((p) => path15.resolve(p));
|
|
4470
|
+
const allOutputPaths = extraOutputPaths.length > 0 ? [primaryWritePath, ...extraOutputPaths] : [primaryWritePath];
|
|
4228
4471
|
const uniqueOutputPaths = [...new Set(allOutputPaths)];
|
|
4472
|
+
const reportedOutputPaths = extraOutputPaths.length > 0 ? [outputPath, ...extraOutputPaths] : [outputPath];
|
|
4473
|
+
const uniqueReportedOutputPaths = [...new Set(reportedOutputPaths)];
|
|
4229
4474
|
let outputWriter;
|
|
4230
4475
|
if (uniqueOutputPaths.length === 1) {
|
|
4231
|
-
outputWriter = await createOutputWriter(
|
|
4476
|
+
outputWriter = await createOutputWriter(primaryWritePath, options.format);
|
|
4232
4477
|
console.log(`Output path: ${outputPath}`);
|
|
4233
4478
|
} else {
|
|
4234
4479
|
outputWriter = await createMultiWriter(uniqueOutputPaths);
|
|
4235
4480
|
console.log("Output paths:");
|
|
4236
|
-
for (const p of
|
|
4481
|
+
for (const p of uniqueReportedOutputPaths) {
|
|
4237
4482
|
console.log(` ${p}`);
|
|
4238
4483
|
}
|
|
4239
4484
|
}
|
|
4240
|
-
const resolvedTestFiles = input.testFiles.map((file) =>
|
|
4485
|
+
const resolvedTestFiles = input.testFiles.map((file) => path15.resolve(file));
|
|
4241
4486
|
if (options.otelFile) {
|
|
4242
|
-
console.log(`OTLP JSON file: ${
|
|
4487
|
+
console.log(`OTLP JSON file: ${path15.resolve(options.otelFile)}`);
|
|
4243
4488
|
}
|
|
4244
|
-
if (
|
|
4245
|
-
console.log(`Trace file: ${
|
|
4489
|
+
if (traceFilePath) {
|
|
4490
|
+
console.log(`Trace file: ${traceFilePath}`);
|
|
4246
4491
|
}
|
|
4247
4492
|
const evaluationRunner = await resolveEvaluationRunner();
|
|
4248
4493
|
const allResults = [];
|
|
@@ -4288,7 +4533,7 @@ async function runEvalCommand(input) {
|
|
|
4288
4533
|
cliNoCache: options.noCache,
|
|
4289
4534
|
yamlCache: yamlCacheEnabled
|
|
4290
4535
|
});
|
|
4291
|
-
const cache = cacheEnabled ? new ResponseCache(yamlCachePath ?
|
|
4536
|
+
const cache = cacheEnabled ? new ResponseCache(yamlCachePath ? path15.resolve(yamlCachePath) : void 0) : void 0;
|
|
4292
4537
|
const useCache = cacheEnabled;
|
|
4293
4538
|
if (cacheEnabled) {
|
|
4294
4539
|
console.log(`Response cache: enabled${yamlCachePath ? ` (${yamlCachePath})` : ""}`);
|
|
@@ -4420,20 +4665,51 @@ async function runEvalCommand(input) {
|
|
|
4420
4665
|
console.log(formatMatrixSummary(allResults));
|
|
4421
4666
|
}
|
|
4422
4667
|
if (options.benchmarkJson && allResults.length > 0) {
|
|
4423
|
-
const benchmarkPath =
|
|
4668
|
+
const benchmarkPath = path15.resolve(options.benchmarkJson);
|
|
4424
4669
|
await writeBenchmarkJson(benchmarkPath, allResults);
|
|
4425
4670
|
console.log(`Benchmark written to: ${benchmarkPath}`);
|
|
4426
4671
|
}
|
|
4427
|
-
if (
|
|
4428
|
-
const artifactsDir = path13.resolve(options.artifacts);
|
|
4672
|
+
if (usesDefaultArtifactWorkspace) {
|
|
4429
4673
|
const evalFile = resolvedTestFiles.length === 1 ? resolvedTestFiles[0] : "";
|
|
4674
|
+
const workspaceDir = path15.dirname(outputPath);
|
|
4430
4675
|
const {
|
|
4431
|
-
|
|
4676
|
+
testArtifactDir,
|
|
4677
|
+
timingPath,
|
|
4678
|
+
benchmarkPath: workspaceBenchmarkPath,
|
|
4679
|
+
indexPath,
|
|
4680
|
+
legacyResultsPath
|
|
4681
|
+
} = await writeArtifactsFromResults(allResults, workspaceDir, {
|
|
4682
|
+
evalFile,
|
|
4683
|
+
writeLegacyResults: true
|
|
4684
|
+
});
|
|
4685
|
+
console.log(`Artifact workspace written to: ${workspaceDir}`);
|
|
4686
|
+
console.log(` Index: ${indexPath}`);
|
|
4687
|
+
console.log(
|
|
4688
|
+
` Per-test artifacts: ${testArtifactDir} (${allResults.length} test directories)`
|
|
4689
|
+
);
|
|
4690
|
+
console.log(` Timing: ${timingPath}`);
|
|
4691
|
+
console.log(` Benchmark: ${workspaceBenchmarkPath}`);
|
|
4692
|
+
if (legacyResultsPath) {
|
|
4693
|
+
console.log(` Compatibility output: ${legacyResultsPath} (deprecated)`);
|
|
4694
|
+
}
|
|
4695
|
+
}
|
|
4696
|
+
if (options.artifacts) {
|
|
4697
|
+
const artifactsDir = path15.resolve(options.artifacts);
|
|
4698
|
+
const evalFile = resolvedTestFiles.length === 1 ? resolvedTestFiles[0] : "";
|
|
4699
|
+
const {
|
|
4700
|
+
testArtifactDir,
|
|
4701
|
+
indexPath,
|
|
4432
4702
|
timingPath,
|
|
4433
4703
|
benchmarkPath: abp
|
|
4434
|
-
} = await writeArtifactsFromResults(allResults, artifactsDir, {
|
|
4704
|
+
} = await writeArtifactsFromResults(allResults, artifactsDir, {
|
|
4705
|
+
evalFile,
|
|
4706
|
+
writeLegacyResults: false
|
|
4707
|
+
});
|
|
4435
4708
|
console.log(`Artifacts written to: ${artifactsDir}`);
|
|
4436
|
-
console.log(`
|
|
4709
|
+
console.log(` Index: ${indexPath}`);
|
|
4710
|
+
console.log(
|
|
4711
|
+
` Per-test artifacts: ${testArtifactDir} (${allResults.length} test directories)`
|
|
4712
|
+
);
|
|
4437
4713
|
console.log(` Timing: ${timingPath}`);
|
|
4438
4714
|
console.log(` Benchmark: ${abp}`);
|
|
4439
4715
|
}
|
|
@@ -4447,33 +4723,25 @@ async function runEvalCommand(input) {
|
|
|
4447
4723
|
}
|
|
4448
4724
|
}
|
|
4449
4725
|
if (allResults.length > 0) {
|
|
4450
|
-
if (
|
|
4726
|
+
if (uniqueReportedOutputPaths.length === 1) {
|
|
4451
4727
|
console.log(`
|
|
4452
4728
|
Results written to: ${outputPath}`);
|
|
4453
4729
|
} else {
|
|
4454
4730
|
console.log("\nResults written to:");
|
|
4455
|
-
for (const p of
|
|
4731
|
+
for (const p of uniqueReportedOutputPaths) {
|
|
4456
4732
|
console.log(` ${p}`);
|
|
4457
4733
|
}
|
|
4458
4734
|
}
|
|
4459
|
-
|
|
4460
|
-
await saveRunCache(cwd, runDir).catch(() => void 0);
|
|
4461
|
-
if (outputPath.endsWith(".jsonl")) {
|
|
4462
|
-
const { writeFile: writeFile7 } = await import("node:fs/promises");
|
|
4463
|
-
const gradingPath = path13.join(path13.dirname(outputPath), "grading.json");
|
|
4464
|
-
const aggregateGrading = buildAggregateGradingArtifact(allResults);
|
|
4465
|
-
await writeFile7(gradingPath, `${JSON.stringify(aggregateGrading, null, 2)}
|
|
4466
|
-
`, "utf8");
|
|
4467
|
-
}
|
|
4735
|
+
await saveRunCache(cwd, outputPath).catch(() => void 0);
|
|
4468
4736
|
}
|
|
4469
4737
|
if (summary.executionErrorCount > 0 && !options.retryErrors) {
|
|
4470
|
-
const evalFileArgs = resolvedTestFiles.map((f) =>
|
|
4738
|
+
const evalFileArgs = resolvedTestFiles.map((f) => path15.relative(cwd, f)).join(" ");
|
|
4471
4739
|
const targetFlag = options.target ? ` --target ${options.target}` : "";
|
|
4472
|
-
const relativeOutputPath =
|
|
4740
|
+
const relativeOutputPath = path15.relative(cwd, outputPath);
|
|
4473
4741
|
console.log(
|
|
4474
4742
|
`
|
|
4475
4743
|
Tip: ${summary.executionErrorCount} execution error(s) detected. Re-run failed tests with:
|
|
4476
|
-
agentv eval run ${evalFileArgs}${targetFlag} --retry-errors ${relativeOutputPath}
|
|
4744
|
+
agentv eval run ${evalFileArgs}${targetFlag} --retry-errors ${relativeOutputPath}`
|
|
4477
4745
|
);
|
|
4478
4746
|
}
|
|
4479
4747
|
return {
|
|
@@ -4501,7 +4769,7 @@ async function resolveEvaluationRunner() {
|
|
|
4501
4769
|
if (!overridePath) {
|
|
4502
4770
|
return runEvaluation;
|
|
4503
4771
|
}
|
|
4504
|
-
const resolved =
|
|
4772
|
+
const resolved = path15.isAbsolute(overridePath) ? overridePath : path15.resolve(process.cwd(), overridePath);
|
|
4505
4773
|
const moduleUrl = pathToFileURL(resolved).href;
|
|
4506
4774
|
const mod = await import(moduleUrl);
|
|
4507
4775
|
const candidate = mod.runEvaluation;
|
|
@@ -4516,14 +4784,17 @@ async function resolveEvaluationRunner() {
|
|
|
4516
4784
|
export {
|
|
4517
4785
|
package_default,
|
|
4518
4786
|
toSnakeCaseDeep,
|
|
4787
|
+
RESULT_INDEX_FILENAME,
|
|
4788
|
+
LEGACY_RESULTS_FILENAME,
|
|
4789
|
+
resolveExistingRunPrimaryPath,
|
|
4790
|
+
resolveWorkspaceOrFilePath,
|
|
4791
|
+
writeArtifactsFromResults,
|
|
4792
|
+
resolveResultSourcePath,
|
|
4793
|
+
loadManifestResults,
|
|
4794
|
+
loadLightweightResults,
|
|
4519
4795
|
HtmlWriter,
|
|
4520
4796
|
resolveEvalPaths,
|
|
4521
4797
|
findRepoRoot,
|
|
4522
|
-
buildGradingArtifact,
|
|
4523
|
-
buildTimingArtifact,
|
|
4524
|
-
buildBenchmarkArtifact,
|
|
4525
|
-
buildAggregateGradingArtifact,
|
|
4526
|
-
parseJsonlResults,
|
|
4527
4798
|
resolveRunCacheFile,
|
|
4528
4799
|
loadRunCache,
|
|
4529
4800
|
detectFileType,
|
|
@@ -4536,4 +4807,4 @@ export {
|
|
|
4536
4807
|
selectTarget,
|
|
4537
4808
|
runEvalCommand
|
|
4538
4809
|
};
|
|
4539
|
-
//# sourceMappingURL=chunk-
|
|
4810
|
+
//# sourceMappingURL=chunk-VLOFRXH4.js.map
|